【多线程待解决】爬取糗事百科

代码：

# -*- coding: utf-8 -*-
'''
import urllib.request
import re
import ssl
import urllib.error

headers = ("User-Agent","Mozilla/5.0(Windows NT 6.1;WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
for i in range(1, 3):
    url = "https://www.qiushibaike.com/8hr/page/" + str(i)+"/"
    # print(url)
    context = ssl._create_unverified_context()
    pageData = urllib.request.urlopen(url,context=context).read().decode("utf-8","ignore")
    # pageData = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
    datalist = re.compile(pat, re.S).findall(pageData)
    for j in range(0, len(datalist)):
        print("第" + str(i) + "页第" + str(j) + "个段子的内容是:")
        print(datalist[j])

import threading
class A(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self) #初始化线程
    def run(self):
        for i in range(0,10):
            print("我是线程A")

class B(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self) #初始化线程
    def run(self):
        for i in range(0,10):
            print("我是线程B")
thread1 = A()
thread1.start()
thread2 = B()
thread2.start()
'''
import urllib.request
import re
import ssl
import threading
import urllib.error

headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
# headers = ("User-Agent","Mozilla/5.0(Windows NT 6.1;WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
class One(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self) #初始化线程
    def run(self):
        for i in range(1,36,2):
            url="https://www.qiushibaike.com/8hr/page/"+str(i)+"/"
            context = ssl._create_unverified_context()
            pageData = urllib.request.urlopen(url, context=context).read().decode("utf-8", "ignore")
            # pageData=urllib.request.urlopen(url).read().decode("utf-8", "ignore")
            pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
            datalist = re.compile(pat, re.S).findall(pageData)
            for j in range(0, len(datalist)):
                try:
                    print("第" + str(i) + "页第" + str(j) + "个段子的内容是:")
                    print(datalist[j])
                except Exception as e:
                    print("exception:"+str(e))
                    time.sleep(1)

class Two(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self) #初始化线程
    def run(self):
        for i in range(2, 36,2):
            url = "https://www.qiushibaike.com/8hr/page/" + str(i)+"/"
            # print(url)
            context = ssl._create_unverified_context()
            pageData = urllib.request.urlopen(url,context=context).read().decode("utf-8","ignore")
            # pageData = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
            pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
            datalist = re.compile(pat, re.S).findall(pageData)
            for j in range(0, len(datalist)):
                try:
                    print("第" + str(i) + "页第" + str(j) + "个段子的内容是:")
                    print(datalist[j])
                except Exception as e:
                    print("exception:"+str(e))
                    time.sleep(1)

one = One()
one.start()
two = Two()
two.start()

报错如下：

D:\python.exe F:/pycodes/webCrawl/qiuShiBaiKe.py
Exception in thread Thread-2:
Traceback (most recent call last):
  File "D:\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "F:/pycodes/webCrawl/qiuShiBaiKe.py", line 81, in run
    pageData = urllib.request.urlopen(url,context=context).read().decode("utf-8","ignore")
  File "D:\lib\urllib\request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "D:\lib\urllib\request.py", line 532, in open
    response = meth(req, response)
  File "D:\lib\urllib\request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "D:\lib\urllib\request.py", line 570, in error
    return self._call_chain(*args)
  File "D:\lib\urllib\request.py", line 504, in _call_chain
    result = func(*args)
  File "D:\lib\urllib\request.py", line 650, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 504: Fiddler - Receive Failure

Exception in thread Thread-1:
Traceback (most recent call last):
  File "D:\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "F:/pycodes/webCrawl/qiuShiBaiKe.py", line 61, in run
    pageData = urllib.request.urlopen(url, context=context).read().decode("utf-8", "ignore")
  File "D:\lib\urllib\request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "D:\lib\urllib\request.py", line 532, in open
    response = meth(req, response)
  File "D:\lib\urllib\request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "D:\lib\urllib\request.py", line 570, in error
    return self._call_chain(*args)
  File "D:\lib\urllib\request.py", line 504, in _call_chain
    result = func(*args)
  File "D:\lib\urllib\request.py", line 650, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 504: Fiddler - Receive Failure


Process finished with exit code 0

【多线程 待解决】爬取糗事百科

猜你喜欢

【多线程待解决】爬取糗事百科