Python 3: a Qiushibaike spider with XPath

Reposted from blog.csdn.net/zbrj12345/article/details/80308909

The script below fetches one listing page from qiushibaike.com with urllib, extracts each post (author, body text, image, vote and comment counts) with lxml's XPath support, and writes the results to a JSON file.

```python
from urllib import request
from lxml import etree
import ssl
import json

# disable HTTPS certificate verification globally so urlopen accepts the site's certificate
ssl._create_default_https_context = ssl._create_unverified_context


def spider(page):
    base_url = 'https://www.qiushibaike.com/8hr/page/%s/' % page
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/66.0.3359.139 Safari/537.36'}
    req = request.Request(base_url, headers=headers)
    response = request.urlopen(req)
    html = etree.HTML(response.read())
    parse(html)


def parse(html):
    div = html.xpath('//div[@id="content-left"]')  # xpath() returns a list
    sub_div = div[0].xpath('./div')  # 25 posts per page
    items = []  # one dict per post, collected for the whole page
    for sub_sub_div in sub_div:
        item = {}
        # author
        h2 = sub_sub_div.xpath('./div[@class="author clearfix"]//h2')[0].text
        # body text; .text may be incomplete because it stops at the first <br>
        # ("contentHerf" is how the class is actually spelled in the site's markup)
        span_text = sub_sub_div.xpath('./a[@class="contentHerf"]//span')[0].text
        print(h2)
        print(span_text)
        # body image, if the post has one
        img_src_list = sub_sub_div.xpath('./div[@class="thumb"]/a/img/@src')
        imgsrc = ''
        if img_src_list:
            imgsrc = 'https:' + img_src_list[0]
            print(imgsrc)
        # "funny" vote count
        vote = sub_sub_div.xpath('.//span[@class="stats-vote"]/i')[0].text
        print(vote)
        # comment count
        comments = sub_sub_div.xpath('.//span[@class="stats-comments"]//i')[0].text
        print(comments)
        item['author'] = h2
        item['text'] = span_text
        item['img'] = imgsrc
        item['vote'] = vote
        item['comments'] = comments
        items.append(item)
        print('~~~~~~~~~~~~~~~~~~~~~')
    # write the page to a JSON file
    data = json.dumps(items, ensure_ascii=False)
    with open('qiushi.json', 'w', encoding='utf-8') as f:
        f.write(data)


if __name__ == '__main__':
    spider(1)
```
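The comment in the loop flags that `.text` returns only the text before the first child node, so posts whose body contains `<br>` tags come back truncated. One way to collect every fragment is lxml's `itertext()`; a minimal sketch (the helper name `full_text` is my own, not from the original post):

```python
def full_text(element):
    """Join every text fragment under element, including pieces after <br> tags."""
    return '\n'.join(t.strip() for t in element.itertext() if t.strip())

# inside the loop, instead of taking .text of the span:
# span = sub_sub_div.xpath('./a[@class="contentHerf"]//span')[0]
# item['text'] = full_text(span)
```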
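Several of the lookups index `[0]` without checking, so a post that lacks one of those elements (or a markup change on the site) raises an IndexError; the image branch already guards against exactly this case. A small defensive helper along the same lines (an assumption of mine, not in the original code):

```python
def first(element, path, default=None):
    """Return the first XPath match under element, or default when nothing matches."""
    matches = element.xpath(path)
    return matches[0] if matches else default

# example: the vote count with a fallback instead of a crash
# vote_node = first(sub_sub_div, './/span[@class="stats-vote"]/i')
# vote = vote_node.text if vote_node is not None else '0'
```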
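Note that `parse()` rewrites qiushi.json on every call, so crawling several pages as written would keep only the last page's posts. A sketch of one possible restructuring, assuming `spider()` and `parse()` are changed to return their `items` list instead of writing the file themselves:

```python
def crawl(last_page):
    all_items = []
    for page in range(1, last_page + 1):
        all_items.extend(spider(page))  # assumes spider() now returns parse(html)
    # one combined JSON file covering every crawled page
    with open('qiushi.json', 'w', encoding='utf-8') as f:
        json.dump(all_items, f, ensure_ascii=False, indent=2)

if __name__ == '__main__':
    crawl(3)
```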