from urllib.request import urlopen
from urllib.parse import urlparse
from urllib.error import URLError, HTTPError
import re
import time
from datetime import datetime
#url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E8%A5%BF%E5%AE%89&kw=python&sm=0&p=1'
#url = 'http://httpstat.us/500'
class Slimit(object):
    '''
    Download throttle: enforce a minimum delay between requests to the same domain.
    '''
    def __init__(self, time_sleep):
        self.time_sleep = time_sleep      # minimum interval (seconds) between requests per domain
        self.url_time = {}                # domain -> timestamp of the last request
    def wait(self, url):
        domain = urlparse(url).netloc
        last_time = self.url_time.get(domain)
        if self.time_sleep > 0 and last_time is not None:
            # sleep only for the part of the interval that has not elapsed yet
            st = self.time_sleep - (datetime.now() - last_time).total_seconds()
            if st > 0:
                print('[+] throttling: %.2f s' % st)
                time.sleep(st)
            else:
                print('[+] no throttling needed')
        self.url_time[domain] = datetime.now()
def download(url, retries_num=3):
    '''Download a page; retry up to retries_num times on 5xx server errors.'''
    try:
        print('download... %s' % url)
        res = urlopen(url)
        html = res.read().decode('utf-8')
    except HTTPError as e:
        print(e.code)
        html = None
        if retries_num > 0:
            print('[E] HTTPError! retry attempt %d' % (4 - retries_num))
            if hasattr(e, 'code') and 500 <= e.code <= 600:
                # retry only on server-side errors
                html = download(url, retries_num - 1)
        else:
            print('[E] Failed!')
    except URLError as e:
        html = None
        print('[E] Unreachable URL!', url)
    return html
def get_links(html):
    '''Return every href found in the page's <a> tags.'''
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)
def link_crawler(home_url, link_regex, depth_regex=None, max_depth=-1, time_sleep=0):
    '''Breadth-first crawl from home_url, following links that match link_regex.'''
    crawl_queue = [home_url]
    seen = {home_url: 0}                  # url -> depth at which it was discovered
    st = Slimit(time_sleep)               # throttle with the caller-supplied delay
    while crawl_queue:
        url = crawl_queue.pop(0)
        depth = seen[url]
        if depth != max_depth:            # max_depth == -1 means no depth limit
            st.wait(url)
            html = download(url)
            if html is None:
                continue
            # pages matched by depth_regex are downloaded but not expanded further
            if depth_regex and re.match(re.compile(depth_regex, re.IGNORECASE), url):
                continue
            for link in get_links(html):
                if re.match(re.compile(link_regex, re.IGNORECASE), link):
                    if link not in seen:
                        crawl_queue.append(link)
                        seen[link] = depth + 1
def main():
    home_url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E8%A5%BF%E5%AE%89&kw=python&p=1&isadv=0'
    link_regex = r'http://jobs.zhaopin.com/[\d]{15}.htm|http://sou.zhaopin.com/jobs/searchresult.ashx\?jl=%e8%a5%bf%e5%ae%89&kw=python'
    depth_regex = r'http://jobs.zhaopin.com/[\d]{15}.htm'
    link_crawler(home_url=home_url, link_regex=link_regex, depth_regex=depth_regex, max_depth=-1, time_sleep=2)

if __name__ == '__main__':
    main()
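
To see the throttle working on its own, separately from the crawler, a minimal sketch like the one below can be used. The example.com URLs and the 2-second interval are purely illustrative; it only assumes the Slimit class defined above.

# Minimal sketch: exercise Slimit by itself.
throttle = Slimit(2)
for url in ['http://example.com/a', 'http://example.com/b', 'http://example.com/c']:
    throttle.wait(url)    # the second and third calls sleep about 2 s (same domain)
    print('would download', url)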
Reposted from blog.csdn.net/ywf331/article/details/79689701