# Known issue: in the multi-threaded crawl, the same links are fetched many times
# (each BaiduSpider thread keeps its own "seen" set); still to be solved.
# A possible fix is sketched after the BaiduSpider class below.
# -*- coding: utf-8 -*-
# Plan: ① fetch the page source for a URL  ② extract the title and summary (11:30~12:30)
# ③ insert into the db database (17:00~18:00)  ④ loop over all links and insert them into the database (before 19:30)
# ⑤ handle the image problem (finish at home and upload to CSDN)
from threading import Thread
from queue import Queue
import re
import requests
import time
from lxml import etree
from pymongo import MongoClient
from html import unescape
class BaiduSpider(Thread):
    def __init__(self, queue):
        super().__init__()
        self.query = queue
        self.headers = {
            "User-Agent": "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
        }
        self.seen = set()  # per-thread set of URLs this thread has already processed
    def run(self) -> None:
        while True:
            try:
                site = self.query.get()
                if site in self.seen:
                    continue
                self.seen.add(site)
                res = self.down_source(site)
                if res is None:
                    continue
                print(f'Downloading link {site}')
                self.get_url(res)
                article = self.parsing(res)
                # *article unpacks the (title, content) tuple into positional
                # arguments, i.e. this call is equivalent to con_mongo(title, content).
                self.con_mongo(*article)
            finally:
                self.query.task_done()
    def down_source(self, site, retries=3):
        try:
            # Without an explicit timeout requests.Timeout is never raised, so set one.
            r = requests.get(site, headers=self.headers, timeout=10)
        except requests.Timeout as err:
            html = None
            if retries > 0:
                # retry and keep the result of the recursive call
                html = self.down_source(site, retries=retries - 1)
            else:
                print(f"{site} timed out: {err}")
        except requests.RequestException as err:
            html = None
            print(f'{site} page download failed: {err}')
        else:
            r.encoding = "utf-8"
            html = r.text
        return html
    @staticmethod
    def parsing(res):
        doc = etree.HTML(res)  # parse the HTML into a document tree, e.g. <Element html at 0x1fd33b68400>
        title = ''.join(doc.xpath('//h1[@class="article-title"]/a/text()'))
        content = doc.xpath('//article[@class="article-content"]')
        s1 = etree.tostring(content[0], method='html', encoding='utf-8')
        s1 = s1.decode('utf-8')
        # convert HTML entities back into their literal characters
        article_content = unescape(s1)
        # res_content = re.sub(r'</?article[^>]*>|</?div[^>]*>', '', article_content)  # strip div tags
        # strip the wrapping <article> tags and the "article-social" div block
        res_content = re.sub(r'</?article[^>]*>|<div class="article-social">[\s\S]*</div>', '', article_content).strip()
        return title, res_content
    def get_url(self, res):
        doc = etree.HTML(res)
        r_url = doc.xpath('//a/attribute::href')
        for s_url in r_url:
            url1 = ''.join(re.findall(r'http://www\.wui5\.com/\d+\.html', s_url))
            if url1:  # skip hrefs that do not match the article-URL pattern
                self.query.put(url1)
    @staticmethod
    def con_mongo(title, content):
        collecting.insert_one({
            "title": title,
            "content": content,
        })
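
# Sketch for the duplicate-link issue noted at the top of this file: every
# BaiduSpider instance has its own self.seen, so 30 threads can each fetch the
# same URL once. A minimal fix (hypothetical, not wired into the class above)
# is a single set shared by all threads and guarded by a Lock. The names SEEN,
# SEEN_LOCK and mark_unseen below are illustrative only.
from threading import Lock

SEEN = set()
SEEN_LOCK = Lock()

def mark_unseen(url):
    """Return True the first time a URL is seen across all threads, else False."""
    with SEEN_LOCK:
        if url in SEEN:
            return False
        SEEN.add(url)
        return True

# Inside run(), the per-thread check
#     if site in self.seen: continue
#     self.seen.add(site)
# would then become:
#     if not mark_unseen(site): continue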
if __name__ == "__main__":
    query = Queue()
    url = "http://www.wui5.com/7723.html"
    query.put(url)
    client = MongoClient()
    db = client['bdspride']   # create/connect to the database
    collecting = db['wui6']   # create the collection
    for x in range(30):
        bd = BaiduSpider(query)
        bd.daemon = True
        bd.start()
    query.join()
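
# Quick way to check what the crawl wrote (hypothetical snippet, run it in a
# separate shell or script while or after the spider runs; the database and
# collection names match the ones created above):
#
#     from pymongo import MongoClient
#     coll = MongoClient()['bdspride']['wui6']
#     print(coll.count_documents({}))          # number of articles stored so far
#     print(coll.find_one({}, {'title': 1}))   # sample stored title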