(单线程)使用正则的内涵段子爬虫
代码如下
1 from urllib.request import *
2 import re
3 import time
4
5
class Spider(object):
    """(Single-threaded) regex-based scraper for neihanba.com joke posts.

    Downloads a user-chosen inclusive range of list pages, extracts every
    post body with a regular expression, and appends the numbered, cleaned
    text to a local file, one section per page.
    """

    def __init__(self):
        # Inclusive page range to crawl, read interactively from the user.
        self.__start_page = int(input("请输入要爬取的开始页面:"))
        self.__end_page = int(input("请输入要爬取的结束页面:"))

        # Browser-like User-Agent so the site serves the normal pages
        # instead of rejecting the script.
        self.__header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3514.0 Safari/537.36"}

        # Running counter used to number the posts in the output file.
        self.num = 1

    def __load_page(self, start_page, end_page):
        """Download pages start_page..end_page (inclusive) and pass each
        page's raw matches to __deal_info().

        FIX: the original ignored both parameters and re-read the instance
        attributes; they are honoured now. run() passes the same values,
        so behaviour for the existing caller is unchanged.
        """
        print("正在爬取中....")
        for page in range(start_page, end_page + 1):
            # The first page has a special URL and must be built separately.
            if page == 1:
                url = "https://www.neihanba.com/dz/index.html"
            else:
                # URL of every subsequent list page.
                url = "https://www.neihanba.com/dz/list_" + str(page) + ".html"

            # Issue the request; the context manager closes the connection
            # deterministically (FIX: the original leaked the response).
            request = Request(url, headers=self.__header)
            with urlopen(request) as response:
                # The site serves GBK-encoded HTML.
                html = response.read().decode("gbk")

            # FIX: re.S makes '.' match newlines — post bodies span several
            # lines, so without the flag most posts were silently skipped.
            content_list = re.findall(
                r'<div class="f18 mb20">.*?</div>', html, re.S)

            # Hand the raw matches over for cleanup and numbering.
            self.__deal_info(content_list, page)

    def __deal_info(self, content_list, page):
        """Strip the wrapping <div> tags from each match, prefix every post
        with a running number, and forward the assembled page text to
        __write_page()."""
        # Page header line; posts for this page are appended below it.
        content = " =============================第%d页=========================\n" % page

        for info in content_list:
            # Drop the opening and closing div tags. NOTE: no spaces may be
            # added around '|' — they would become part of the alternatives.
            info = re.sub('(<div class="f18 mb20">)|(</div>)', "", info)

            content = content + " %d、" % self.num + info + "\n"
            self.num += 1

        # Persist the whole page in one write.
        self.__write_page(content)

    def __write_page(self, content):
        """Append one page's worth of posts to the output file.

        FIX: write UTF-8 explicitly — the platform default encoding may be
        unable to represent the Chinese text decoded from GBK.
        """
        with open("内涵段子.txt", "a", encoding="utf-8") as f:
            f.write(content)

    def run(self):
        """Crawl the configured page range and report the elapsed time."""
        start_time = time.time()

        # Start crawling the range chosen in __init__.
        self.__load_page(self.__start_page, self.__end_page)
        print("爬取完成...")
        end_time = time.time()
        print('用时:%.2f秒.' % (end_time - start_time))
81
if __name__ == '__main__':
    # Entry point: build the spider (prompts for the page range) and run it.
    Spider().run()
结果预览如下:
如果你和我有共同爱好,我们可以加个好友一起交流哈!