from urllib.request import urlopen
from urllib.parse import urlparse
from urllib.error import URLError, HTTPError
import re
import time
from datetime import datetime
#url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E8%A5%BF%E5%AE%89&kw=python&sm=0&p=1'
#url = 'http://httpstat.us/500'
class Slimit(object):
    '''
    Download throttle: enforce a minimum delay between requests to the same domain.
    '''
    def __init__(self, time_sleep):
        self.time_sleep = time_sleep      # minimum interval (seconds) between requests per domain
        self.url_time = {}                # domain -> timestamp of the last request
    def wait(self, url):
        domain = urlparse(url).netloc
        last_time = self.url_time.get(domain)
        if self.time_sleep > 0 and last_time is not None:
            # sleep only for the part of the interval that has not elapsed yet
            st = self.time_sleep - (datetime.now() - last_time).total_seconds()
            if st > 0:
                print('[+] throttling: %.2f s' % st)
                time.sleep(st)
            else:
                print('[+] no throttling needed')
        self.url_time[domain] = datetime.now()
def download(url, retries_num=3):
    '''Download a page; retry up to retries_num times on 5xx server errors.'''
    try:
        print('download... %s' % url)
        res = urlopen(url)
        html = res.read().decode('utf-8')
    except HTTPError as e:
        print(e.code)
        html = None
        if retries_num > 0:
            print('[E] HTTPError! retry attempt %d' % (4 - retries_num))
            if hasattr(e, 'code') and 500 <= e.code <= 600:
                # retry only on server-side errors
                html = download(url, retries_num - 1)
        else:
            print('[E] Failed!')
    except URLError as e:
        html = None
        print('[E] Unreachable URL!', url)
    return html
def get_links(html):
    '''Return every href found in the page's <a> tags.'''
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)
def link_crawler(home_url, link_regex, depth_regex=None, max_depth=-1, time_sleep=0):
    '''Breadth-first crawl from home_url, following links that match link_regex.'''
    crawl_queue = [home_url]
    seen = {home_url: 0}                  # url -> depth at which it was discovered
    st = Slimit(time_sleep)               # throttle with the caller-supplied delay
    while crawl_queue:
        url = crawl_queue.pop(0)
        depth = seen[url]
        if depth != max_depth:            # max_depth == -1 means no depth limit
            st.wait(url)
            html = download(url)
            if html is None:
                continue
            # pages matched by depth_regex are downloaded but not expanded further
            if depth_regex and re.match(re.compile(depth_regex, re.IGNORECASE), url):
                continue
            for link in get_links(html):
                if re.match(re.compile(link_regex, re.IGNORECASE), link):
                    if link not in seen:
                        crawl_queue.append(link)
                        seen[link] = depth + 1
def main():
    home_url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E8%A5%BF%E5%AE%89&kw=python&p=1&isadv=0'
    link_regex = r'http://jobs.zhaopin.com/[\d]{15}.htm|http://sou.zhaopin.com/jobs/searchresult.ashx\?jl=%e8%a5%bf%e5%ae%89&kw=python'
    depth_regex = r'http://jobs.zhaopin.com/[\d]{15}.htm'
    link_crawler(home_url=home_url, link_regex=link_regex, depth_regex=depth_regex, max_depth=-1, time_sleep=2)

if __name__ == '__main__':
    main()
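
To see the throttle working on its own, separately from the crawler, a minimal sketch like the one below can be used. The example.com URLs and the 2-second interval are purely illustrative; it only assumes the Slimit class defined above.

# Minimal sketch: exercise Slimit by itself.
throttle = Slimit(2)
for url in ['http://example.com/a', 'http://example.com/b', 'http://example.com/c']:
    throttle.wait(url)    # the second and third calls sleep about 2 s (same domain)
    print('would download', url)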
Reposted from blog.csdn.net/ywf331/article/details/79689701