代码:
# -*- coding: utf-8 -*-
'''
import urllib.request
import re
import ssl
import urllib.error
headers = ("User-Agent","Mozilla/5.0(Windows NT 6.1;WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
for i in range(1, 3):
url = "https://www.qiushibaike.com/8hr/page/" + str(i)+"/"
# print(url)
context = ssl._create_unverified_context()
pageData = urllib.request.urlopen(url,context=context).read().decode("utf-8","ignore")
# pageData = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
datalist = re.compile(pat, re.S).findall(pageData)
for j in range(0, len(datalist)):
print("第" + str(i) + "页第" + str(j) + "个段子的内容是:")
print(datalist[j])
import threading
class A(threading.Thread):
def __init__(self):
threading.Thread.__init__(self) #初始化线程
def run(self):
for i in range(0,10):
print("我是线程A")
class B(threading.Thread):
def __init__(self):
threading.Thread.__init__(self) #初始化线程
def run(self):
for i in range(0,10):
print("我是线程B")
thread1 = A()
thread1.start()
thread2 = B()
thread2.start()
'''
import urllib.request
import re
import ssl
import threading
import urllib.error
headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
# headers = ("User-Agent","Mozilla/5.0(Windows NT 6.1;WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
class One(threading.Thread):
def __init__(self):
threading.Thread.__init__(self) #初始化线程
def run(self):
for i in range(1,36,2):
url="https://www.qiushibaike.com/8hr/page/"+str(i)+"/"
context = ssl._create_unverified_context()
pageData = urllib.request.urlopen(url, context=context).read().decode("utf-8", "ignore")
# pageData=urllib.request.urlopen(url).read().decode("utf-8", "ignore")
pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
datalist = re.compile(pat, re.S).findall(pageData)
for j in range(0, len(datalist)):
try:
print("第" + str(i) + "页第" + str(j) + "个段子的内容是:")
print(datalist[j])
except Exception as e:
print("exception:"+str(e))
time.sleep(1)
class Two(threading.Thread):
def __init__(self):
threading.Thread.__init__(self) #初始化线程
def run(self):
for i in range(2, 36,2):
url = "https://www.qiushibaike.com/8hr/page/" + str(i)+"/"
# print(url)
context = ssl._create_unverified_context()
pageData = urllib.request.urlopen(url,context=context).read().decode("utf-8","ignore")
# pageData = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
datalist = re.compile(pat, re.S).findall(pageData)
for j in range(0, len(datalist)):
try:
print("第" + str(i) + "页第" + str(j) + "个段子的内容是:")
print(datalist[j])
except Exception as e:
print("exception:"+str(e))
time.sleep(1)
one = One()
one.start()
two = Two()
two.start()
报错如下:
D:\python.exe F:/pycodes/webCrawl/qiuShiBaiKe.py
Exception in thread Thread-2:
Traceback (most recent call last):
File "D:\lib\threading.py", line 916, in _bootstrap_inner
self.run()
File "F:/pycodes/webCrawl/qiuShiBaiKe.py", line 81, in run
pageData = urllib.request.urlopen(url,context=context).read().decode("utf-8","ignore")
File "D:\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "D:\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "D:\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "D:\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "D:\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "D:\lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 504: Fiddler - Receive Failure
Exception in thread Thread-1:
Traceback (most recent call last):
File "D:\lib\threading.py", line 916, in _bootstrap_inner
self.run()
File "F:/pycodes/webCrawl/qiuShiBaiKe.py", line 61, in run
pageData = urllib.request.urlopen(url, context=context).read().decode("utf-8", "ignore")
File "D:\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "D:\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "D:\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "D:\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "D:\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "D:\lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 504: Fiddler - Receive Failure
Process finished with exit code 0