import random
import requests
from fake_useragent import UserAgent
from retrying import retry # decorator: retry a download when it fails
import hashlib # message digest (md5)
import queue # queue
import re # regular expressions
from urllib import robotparser # parse the site's robots.txt file
from urllib.parse import urlparse, urljoin, urldefrag # parse URLs
from threading import Thread # multithreading
from datetime import datetime # get the current time
import time
import mongo_cache
MAX_DEP = 2 # maximum crawl depth
def get_robots(url):
    """
    Parse the site's robots.txt file.
    :param url:
    :return:
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))  # robots.txt lives at the site root
    rp.read()
    return rp
def save_url(html_content, url_str):
    """
    Save the downloaded content to disk.
    :param html_content:
    :param url_str:
    :return:
    """
    md5 = hashlib.md5()
    md5.update(html_content)
    # file_path = "./download/" + md5.hexdigest() + ".html"
    file_path = r"D:\crawler\download\crawler-" + gen_html_name(url_str)  # raw string, so the backslashes are kept literally
    with open(file_path, 'wb') as f:
        f.write(html_content)
def gen_html_name(url_str):
    path = urlparse(url_str).path
    path_array = path.split('/')
    return path_array[len(path_array) - 1]  # take the last segment of the path
def extractor_url_lists(html_content):
    """
    Extract the other links contained in a page.
    :param html_content:
    :return:
    """
    url_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return url_regex.findall(html_content)
class CrawlerCommon(Thread):
    """
    A general-purpose crawler covering the basic crawling features
    plus a few techniques for dealing with anti-crawling measures.
    """
    def __init__(self, init_url):
        super(CrawlerCommon, self).__init__()
        __ua = UserAgent()  # random User-Agent generator
        self.seed_url = init_url  # seed URL the crawl starts from
        self.crawler_queue = queue.Queue()  # the choice of queue decides BFS vs. DFS behaviour
        self.crawler_queue.put(init_url)  # put the seed URL into the queue
        self.visited = {init_url: 0}  # the seed starts at crawl depth 0
        self.rp = get_robots(init_url)  # initialise the robots.txt parser
        self.headers = {'User-Agent': __ua.random}  # pick a random user-agent
        self.link_regex = '(index|view)'  # filter pattern for extracted URLs
        self.throttle = Throttle(5.0)  # download throttle with a 5-second interval
        self.mcache = mongo_cache.MongoCache()  # initialise the mongo_cache store
        self.time_sleep = 5
    @retry(stop_max_attempt_number=3)  # retry failed downloads (3 attempts is an assumed setting)
    def retry_download(self, url_str, data, method, proxies):
        """
        Download with retries via the retrying decorator.
        :param url_str:
        :param data:
        :param method:
        :param proxies:
        :return:
        """
        if method == 'POST':
            result = requests.post(url_str, data=data, headers=self.headers, proxies=proxies)
        else:
            result = requests.get(url_str, headers=self.headers, timeout=3, proxies=proxies)
        assert result.status_code == 200  # assert the status code is 200; anything else raises and triggers a retry
        return result.content
    def download(self, url_str, data=None, method="get", proxies={}):
        """
        The actual download method.
        :param url_str:
        :param data:
        :param method:
        :param proxies:
        :return:
        """
        print("download url is :::::", url_str)
        try:
            result = self.retry_download(url_str, data, method, proxies)
        except Exception as e:  # Python 3 uses "as e"
            print(e)  # exceptions have no .message attribute in Python 3
            result = None
        return result
    def nomalize(self, url_str):
        """
        Complete a relative link into an absolute URL.
        :param url_str:
        :return:
        """
        real_url, _ = urldefrag(url_str)  # strip the #fragment part
        return urljoin(self.seed_url, real_url)
    def save_result(self, html_content, url_str):
        """
        Store the result in the database, checking first whether the content already exists.
        :param html_content: the downloaded binary content
        :param url_str: the URL of the downloaded page
        :return:
        """
        if url_str not in self.mcache:
            self.mcache[url_str] = html_content
        else:
            data_from_mongo = self.mcache[url_str]
            md5_func_mongo = hashlib.md5()
            md5_func_mongo.update(data_from_mongo)
            mongo_md5_str = md5_func_mongo.hexdigest()
            md5_func_download = hashlib.md5()  # fresh md5 object; reusing the first one would hash the concatenation of both contents
            md5_func_download.update(html_content)
            download_md5_str = md5_func_download.hexdigest()
            if download_md5_str != mongo_md5_str:
                self.mcache[url_str] = html_content
    def run(self):
        """
        Main method that performs the crawl.
        :return:
        """
        while not self.crawler_queue.empty():
            url_str = self.crawler_queue.get()
            # check the URL against the robots.txt rules
            if self.rp.can_fetch(self.headers["User-Agent"], url_str):
                self.throttle.wait_url(url_str)
                # random_oper = random.randint(0, 1)
                # if random_oper == 1:
                #     time.sleep(self.time_sleep + random.random() * random.randint(1, 5))
                # else:
                #     time.sleep(self.time_sleep - random.random())
                depth = self.visited[url_str]
                if depth < MAX_DEP:
                    # download the page
                    html_content = self.download(url_str)
                    # store the page when the download succeeded
                    if html_content is not None:
                        # self.mcache[url_str] = html_content
                        self.save_result(html_content, url_str)
                        save_url(html_content, url_str)
                        # extract all links contained in the page
                        url_list = extractor_url_lists(html_content.decode('utf-8'))
                        # keep only the links we actually want to crawl
                        filter_urls = [link for link in url_list if re.search('/(mongodb)', link)]
                        for url in filter_urls:
                            # complete the relative link into an absolute URL
                            real_url = self.nomalize(url)
                            # only queue links that have not been visited yet
                            if real_url not in self.visited:
                                self.visited[real_url] = depth + 1
                                self.crawler_queue.put(real_url)
            else:
                print("robots.txt forbids downloading:", url_str)
class Throttle(object):
    """
    Download throttle: enforces a minimum delay between requests to the same domain.
    """
    def __init__(self, delay):
        self.domains = {}  # maps domain -> time of the last download
        self.delay = delay

    def wait_url(self, url_str):
        domain_url = urlparse(url_str).netloc  # take the domain (netloc) of the URL
        last_accessed = self.domains.get(domain_url)  # time of the last download from this domain
        if self.delay > 0 and last_accessed is not None:
            # subtract the last download time from the current time to get the interval between the two downloads
            sleep_interval = self.delay - (datetime.now() - last_accessed).seconds
            # if there is time left in the delay window, sleep it off
            if sleep_interval > 0:
                time.sleep(sleep_interval)
        self.domains[domain_url] = datetime.now()  # store the current time in the dict, keyed by domain
if __name__ == '__main__':
    crawler = CrawlerCommon('http://www.runoob.com/mongodb/mongodb-tutorial.html')
    crawler.run()
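The script imports a local mongo_cache module that is not included in the post. A minimal sketch of a compatible cache is given below, assuming pymongo and a local MongoDB instance on the default port; the database and collection names (cache, webpage) are made up for illustration. The crawler only needs membership tests (in), item reads and item writes keyed by URL, which map to __contains__, __getitem__ and __setitem__ here.

# Hypothetical sketch of the missing mongo_cache module (not from the original post).
# Assumes pymongo and a MongoDB server at localhost:27017; names below are assumptions.
import pickle
import zlib
from datetime import datetime
from pymongo import MongoClient

class MongoCache(object):
    def __init__(self, client=None):
        self.client = MongoClient('localhost', 27017) if client is None else client
        self.db = self.client.cache          # assumed database name
        self.collection = self.db.webpage    # assumed collection name

    def __contains__(self, url):
        # used by "url_str in self.mcache"
        return self.collection.find_one({'_id': url}) is not None

    def __getitem__(self, url):
        record = self.collection.find_one({'_id': url})
        if record is None:
            raise KeyError(url + ' does not exist')
        # decompress and unpickle back to the original bytes object
        return pickle.loads(zlib.decompress(record['result']))

    def __setitem__(self, url, result):
        # compress the page body before storing and upsert by URL
        record = {'result': zlib.compress(pickle.dumps(result)),
                  'timestamp': datetime.utcnow()}
        self.collection.update_one({'_id': url}, {'$set': record}, upsert=True)

Compressing with zlib keeps page bodies well under MongoDB's 16 MB document limit, and pickling round-trips the stored value back to the same bytes object that save_result later feeds into hashlib.md5().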
Reprinted from blog.csdn.net/fengbansha/article/details/85115796