我们在采集网页的时候,有的网页数据不能直接在源代码中看到,需要对请求地址进行抓包处理。一般抓包得到的数据是json格式,那么对于json格式的数据我们要怎么采集呢?下面使用腾讯新闻-滚动地址来实际举例说明一下。
举例网址:https://new.qq.com/rolls/?ext=ent 实际获取数据的地址:https://pacaio.match.qq.com/openapi/json?key=ent:20200721&num=15&page=0&expIds=&callback=__jp0 这里要注意一下,数据地址的key参数中有一个日期形式的动态参数(示例中的20200721),要设置成yyyyMMdd格式的时间变量。这样才能在日期更新之后,抓取到每天最新的数据;如果不修改,只会一直获取到这一天的数据。下面是实际获取到的返回数据(最外层的 __jp0( … ) 是由callback参数指定的JSONP包装),如果返回内容的前后特征经常变动,抓取标签就一定要匹配正确。
__jp0( { "biz": 17002, "code": 0, "data": [ { "article_type": 0, "comment_id": "5593168460", "comment_num": 0, "id": "20200721A077OB", "irs_imgs": { "294X195": [ "https://inews.gtimg.com/newsapp_ls/0/12124690803_294195/0" ] }, "publish_time": "2020-07-21 09:25:14", "source": "师姐说娱乐", "source_id": "5415250", "tag_label": [ [ "旧时光", "355962" ], [ "陈德容", "101520" ], [ "天心", "112554" ] ], "title": "旧时光里的湾湾女星,陈德容天心岳翎,美的各有千秋!", "url": "https://new.qq.com/omn/20200721/20200721A077OB00.html" }, { "article_type": 0, "comment_id": "5593157982", "comment_num": 0, "id": "20200721A07561", "irs_imgs": { "294X195": [ "https://inews.gtimg.com/newsapp_ls/0/12124680281_294195/0" ] }, "publish_time": "2020-07-21 09:22:58", "source": "清池", "source_id": "5129013", "tag_label": [ [ "谁说我结不了婚", "154338639" ], [ "三十而已", "148299941" ], [ "童瑶", "84856" ] ], "title": "从《谁说我结不了婚》到《三十而已》,说童瑶选剧本的眼光", "url": "https://new.qq.com/omn/20200721/20200721A0756100.html" }, { "article_type": 0, "comment_id": "5593149812", "comment_num": 0, "id": "20200721A073GM", "irs_imgs": { "294X195": [ "https://inews.gtimg.com/newsapp_ls/0/12124673756_294195/0" ] }, "publish_time": "2020-07-21 09:21:35", "source": "娱见现实", "source_id": "5561154", "tag_label": [ [ "欧阳娜娜", "226223" ], [ "何炅", "84664" ], [ "向往的生活", "299321" ] ], "title": "欧阳娜娜痛哭,何炅反应很暖心,背后原因暴露娱乐圈现状", "url": "https://new.qq.com/omn/20200721/20200721A073GM00.html" }, { "article_type": 0, "comment_id": "5593137865", "comment_num": 0, "id": "20200721A071FI", "irs_imgs": { "294X195": [ "https://inews.gtimg.com/newsapp_ls/0/12124668566_294195/0" ] }, "publish_time": "2020-07-21 09:19:44", "source": "80后马里奥", "source_id": "7027734", "tag_label": [ [ "trot", "10160686" ], [ "卢志勋", "3720590" ], [ "爱豆", "92548" ] ], "title": "因为算命先生的劝说,他从爱豆变成了Trot歌手,最终获得了成功", "url": "https://new.qq.com/omn/20200721/20200721A071FI00.html" }, { "article_type": 0, "comment_id": "5593106329", "comment_num": 0, "id": "20200721A06VJU", "irs_imgs": { "294X195": [ 
"https://inews.gtimg.com/newsapp_ls/0/12124648118_294195/0" ] }, "publish_time": "2020-07-21 09:14:49", "source": "新咖八卦", "source_id": "5921265", "tag_label": [ [ "蒋劲夫", "85312" ], [ "中浦悠花", "105076466" ], [ "小鲜肉", "204379" ] ], "title": "如果不是2年前的“中日恋情”,如今的他应该能比肩贾乃亮了吧!", "url": "https://new.qq.com/omn/20200721/20200721A06VJU00.html" }, { "article_type": 0, "comment_id": "5593098907", "comment_num": 0, "id": "20200721A06U5H", "irs_imgs": { "294X195": [ "https://inews.gtimg.com/newsapp_ls/0/12124640637_294195/0" ] }, "publish_time": "2020-07-21 09:13:41", "source": "环球娱乐资讯", "source_id": "5021291", "tag_label": [ [ "不说谎恋人", "104848432" ], [ "刘海宽", "2801582" ], [ "李哲", "216335" ] ], "title": "《不说谎恋人》甜蜜落幕 刘海宽演技自然细腻获好评", "url": "https://new.qq.com/omn/20200721/20200721A06U5H00.html" }, { "article_type": 0, "comment_id": "5593095563", "comment_num": 0, "id": "20200721A06TFZ", "irs_imgs": { "294X195": [ "https://inews.gtimg.com/newsapp_ls/0/12124637610_294195/0" ] }, "publish_time": "2020-07-21 09:13:09", "source": "一平看世界", "source_id": "16708619", "tag_label": [ [ "艾梅柏·希尔德", "565698" ], [ "约翰尼·德普", "3745311" ], [ "莱昂纳多·迪卡普里奥", "310052" ] ], "title": "希尔德称德普指控我与小李子和钱老板偷情,还给他们起侮辱绰号", "url": "https://new.qq.com/omn/20200721/20200721A06TFZ00.html" }, { "article_type": 0, "comment_id": "5593084597", "comment_num": 0, "id": "20200721A06R99", "irs_imgs": { "294X195": [ "https://inews.gtimg.com/newsapp_ls/0/12124630466_294195/0" ] }, "publish_time": "2020-07-21 09:11:22", "source": "今日影视头条", "source_id": "5174463", "tag_label": [ [ "欧阳娜娜", "226223" ], [ "何炅", "84664" ], [ "浙江卫视", "213178" ] ], "title": "欧阳娜娜哭倒在何炅的怀里,怪不得何炅为了她曾暗讽浙江卫视", "url": "https://new.qq.com/omn/20200721/20200721A06R9900.html" }, { "article_type": 0, "comment_id": "5586683241", "comment_num": 0, "id": "20200719A0PAZ6", "irs_imgs": { "294X195": [ "https://inews.gtimg.com/newsapp_ls/0/12124619714_294195/0" ] }, "publish_time": "2020-07-21 09:09:05", "source": "MU影讯", "source_id": "5098371", "tag_label": [ [ 
"陈百祥", "102933" ], [ "黄杏秀", "554571" ] ], "title": "陈百祥结婚41年,妻子不孕依然恩爱如初,看到照片全懂了", "url": "https://new.qq.com/omn/20200719/20200719A0PAZ600.html" }, { "article_type": 0, "comment_id": "5593059904", "comment_num": 0, "id": "20200721A06MCS", "irs_imgs": { "294X195": [ "https://inews.gtimg.com/newsapp_ls/0/12124614170_294195/0" ] }, "publish_time": "2020-07-21 09:07:36", "source": "新咖八卦", "source_id": "5921265", "tag_label": [ [ "马伊琍", "94921" ], [ "文章", "81480" ], [ "离婚", "88971" ] ], "title": "文章和马伊琍早就各玩各的?知情人:都各自有对象了", "url": "https://new.qq.com/omn/20200721/20200721A06MCS00.html" }, { "article_type": 0, "comment_id": "5586570468", "comment_num": 0, "id": "20200721A06LZF", "irs_imgs": { "294X195": [ "https://inews.gtimg.com/newsapp_ls/0/12124613317_294195/0" ] }, "publish_time": "2020-07-21 09:07:22", "source": "伍脊六兽", "source_id": "5136273", "tag_label": [ [ "罗志祥", "139249" ], [ "周扬青", "167824" ], [ "恋情", "82789" ] ], "title": "罗志祥晒与妈妈打球日常,恋情事业皆失意,唯有母亲一直不离不弃", "url": "https://new.qq.com/omn/20200721/20200721A06LZF00.html" }, { "article_type": 56, "comment_id": "5593045351", "comment_num": 0, "id": "20200721V06J3N", "irs_imgs": { "294X195": [ "https://inews.gtimg.com/newsapp_ls/0/12124592859_294195/0" ] }, "publish_time": "2020-07-21 09:06:40", "source": "娱乐圈哔哔King", "source_id": "5792019", "tag_label": [], "title": "佟丽娅即兴跳顶碗舞“大型翻车”,下一秒表演《萨日朗》救场弥补尴尬场面", "url": "https://new.qq.com/omn/20200721/20200721V06J3N00.html" }, { "article_type": 0, "comment_id": "5593044386", "comment_num": 0, "id": "20200721A06IUX", "irs_imgs": { "294X195": [ "https://inews.gtimg.com/newsapp_ls/0/12124603853_294195/0" ] }, "publish_time": "2020-07-21 09:05:04", "source": "明星粉丝团", "source_id": "8104", "tag_label": [ [ "钟晓芹", "153461968" ], [ "杨玏", "3187230" ] ], "title": "自私冷漠太直男?这个角色要不是他演的,估计得被网暴", "url": "https://new.qq.com/omn/20200721/20200721A06IUX00.html" }, { "article_type": 0, "comment_id": "5592371602", "comment_num": 0, "id": "20200721A02CVJ", "irs_imgs": { "294X195": [ 
"https://inews.gtimg.com/newsapp_ls/0/12124606039_294195/0" ] }, "publish_time": "2020-07-21 09:05:00", "source": "音乐汪", "source_id": "16710428", "tag_label": [ [ "综艺节目", "102602" ] ], "title": "《浪姐》的“百万路人盘”遭质疑,终究是综艺节目,认真你就输了", "url": "https://new.qq.com/omn/20200721/20200721A02CVJ00.html" }, { "article_type": 0, "comment_id": "5593020054", "comment_num": 0, "id": "20200721A06DIM", "irs_imgs": { "294X195": [ "https://inews.gtimg.com/newsapp_ls/0/12124586706_294195/0" ] }, "publish_time": "2020-07-21 09:01:14", "source": "柳林后", "source_id": "16323570", "tag_label": [ [ "流浪地球", "281863" ], [ "柳林", "273272" ] ], "title": "复工首日票价低至4元,《流浪地球》还是加长版", "url": "https://new.qq.com/omn/20200721/20200721A06DIM00.html" } ], "msg": "ok", "seq": "20200721103741-uYHdlzb7lWmORSky" }
火车头对于返回的json内容,怎么将整个页面的内容全部提取?因为返回的内容经常有变化(前后的特征标签不固定),所以前后截取和xpath都不能用,常规的正则也没办法,有大神指导怎么采集全页内容的吗?
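这个时候如果想要获取整个页面的内容,可以选择使用正则表达式来实现:(?<content>[\s\S]*?) 即可。注意:这种非贪婪写法如果后面没有其它限定内容,匹配结果可能为空,此时可以改用贪婪写法 (?<content>[\s\S]*)。如果要匹配title和url,可以选择使用前后截取的方式来获取数据,例如title的前标记为 "title": " ,后标记为 ", "url" 。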
如果你还有其它疑问可以来本站搜索相关问题,这里会有你想要的答案:大海资源库
暂无评论内容