实例1:爬取《斗破苍穹》全文小说
观察链接发现小说第一章至第三章的链接为:
第1章:http://www.doupoxs.com/doupocangqiong/2.html
第2章:http://www.doupoxs.com/doupocangqiong/5.html
第3章:http://www.doupoxs.com/doupocangqiong/6.html
章节页面的编号没有规律(并不连续),无法直接推算出有效链接,因此需要逐个尝试编号,并通过HTTP状态码(是否为200)判定该页面是否存在。直接上代码:
import re
import time

import requests

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
}

# Output file shared with get_info(), opened in append mode.
# The encoding is set explicitly so that writing Chinese text does not depend
# on the platform's default codec.
f = open('F:/cangqiang.txt', 'a+', encoding='utf-8')


def get_info(url):
    """Download one chapter page and append its paragraphs to the output file.

    Every ``<p>...</p>`` fragment of the page is written to the module-level
    file ``f``, one fragment per line.

    Args:
        url: Address of the chapter page to fetch.

    Returns:
        None. Pages that do not answer with HTTP 200, and requests that fail
        or time out, are skipped.
    """
    try:
        # A timeout keeps one unresponsive page from stalling the whole crawl.
        res = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as err:
        print('skip {}: {}'.format(url, err))
        return

    # Page numbers are not contiguous; a non-200 answer means "no such page".
    if res.status_code != 200:
        return

    contents = re.findall(r'<p>(.*?)</p>', res.content.decode('utf-8'), re.S)
    f.writelines(content + '\n' for content in contents)


if __name__ == '__main__':
    # ``with f`` guarantees the file is closed even if a request or write raises.
    with f:
        for i in range(2, 1665):
            get_info('http://www.doupoxs.com/doupocangqiong/{}.html'.format(i))
            time.sleep(1)  # pause between requests
运行结果:
实例2:爬取糗事百科文本段子
# Scrape text jokes from qiushibaike.com using the Requests and re libraries.
import re

import requests

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
}

# Accumulates one dict per scraped joke across all pages.
info_lists = []


def judgement_sex(class_name):
    """Map the gender CSS class name to a label.

    Args:
        class_name: Class name captured from the ``articleGender`` div.

    Returns:
        '女' when ``class_name`` is 'womenIcon', otherwise '男'.
    """
    return '女' if class_name == 'womenIcon' else '男'


def get_info(url):
    """Fetch one listing page and append one dict per joke to ``info_lists``.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        None. Requests that fail or time out are reported and skipped.
    """
    try:
        # A timeout keeps one unresponsive page from stalling the whole crawl.
        res = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as err:
        print('skip {}: {}'.format(url, err))
        return

    html = res.text
    # Raw strings keep \D and \d from being treated as (invalid) string escapes.
    ids = re.findall(r'<h2>(.*?)</h2>', html, re.S)
    levels = re.findall(r'<div class="articleGender \D+Icon">(.*?)</div>', html, re.S)
    sexes = re.findall(r'<div class="articleGender (.*?)">', html, re.S)
    contents = re.findall(r'<div class="content"><span>(.*?)</span></div>', html, re.S)
    laughts = re.findall(r'<span class="stats-vote"><i class="number">(\d+)</i>', html, re.S)
    comments = re.findall(r'<i class="number">(\d+)</i>评论', html, re.S)

    # The lists are paired positionally; zip stops at the shortest one.
    # NOTE(review): if some entries lack one of the fields, the remaining
    # fields shift relative to each other - confirm against the page markup.
    for user_id, level, sex, content, laught, comment in zip(
            ids, levels, sexes, contents, laughts, comments):
        info_lists.append({
            'id': user_id,
            'level': level,
            'sex': judgement_sex(sex),
            'content': content,
            'laught': laught,
            'comment': comment,
        })


if __name__ == '__main__':
    for page in range(1, 36):
        get_info('https://www.qiushibaike.com/text/page/{}/'.format(page))

    # Open the output once, with an explicit encoding, and let ``with`` close
    # it even when a write fails.
    with open('F:/qishibaike.txt', 'a+', encoding='utf-8') as f:
        for info in info_lists:
            # NOTE(review): 'sex' is collected but not written - confirm
            # whether it should be part of the output.
            record = (
                info['id'] + '\n'
                + info['level'] + '\n'
                + info['content'] + '\n'
                + info['laught'] + '\n'
                + info['comment'] + '\n\n'
            )
            try:
                f.write(record)
            except UnicodeError as err:
                # Best effort: report and skip a record that cannot be encoded.
                print('skip record: {}'.format(err))