Python Crawler (6): Multithreading

import threading as td
import queue as qu
import re
import urllib.request as ur
import urllib.error as ue
import time
import os

# Queue instance
urlqueue = qu.Queue()

# Pose as a browser (addheaders expects a list of (name, value) tuples)
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2767.400")

opener = ur.build_opener()
opener.addheaders = [headers]

# Install the opener globally so plain urlopen() calls use it
ur.install_opener(opener)
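
With the opener installed globally, every later ur.urlopen() call carries the custom User-Agent automatically. A quick way to confirm this (httpbin.org is an assumed echo service here, not part of the crawler itself):

# Optional sanity check: httpbin echoes the request headers back,
# so the QQBrowser User-Agent should appear in the output.
try:
    resp = ur.urlopen("http://httpbin.org/headers", timeout=10)
    print(resp.read().decode("utf-8"))
except ue.URLError as e:
    print("header check failed:", e.reason)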

listurl = []

# Fetch a URL through the given proxy server
def use_proxy(proxy_addr, url):
    try:
        proxy = ur.ProxyHandler({'http': proxy_addr})
        opener = ur.build_opener(proxy, ur.HTTPHandler)
        ur.install_opener(opener)
        data = ur.urlopen(url).read().decode('utf-8')
        return data
    except ue.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(10)
    except Exception as e:
        print("exception: " + str(e))
        time.sleep(1)
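
use_proxy can be tried on its own before wiring it into the threads; a minimal sketch, assuming a placeholder proxy address (substitute a live one):

# Stand-alone check of use_proxy; "127.0.0.1:8888" is a placeholder proxy.
sample = use_proxy("127.0.0.1:8888", "http://www.example.com")
if sample:
    print(sample[:200])  # first 200 characters of the fetched page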

# Thread 1: fetch the listing pages and turn the collected links into real article URLs
class geturl(td.Thread):
    def __init__(self, key, pagestart, pageend, proxy, urlqueue):
        td.Thread.__init__(self)
        self.key = key
        self.pagestart = pagestart
        self.pageend = pageend
        self.proxy = proxy
        self.urlqueue = urlqueue

    def run(self):
        page = self.pagestart
        # Percent-encode the keyword
        keycode = ur.quote(self.key)
        # Percent-encode "&page"
        pagecode = ur.quote("&page")

        for page in range(self.pagestart, self.pageend + 1):
            url = "http://weixin.sogou.com/weixin?type=2&query=" + keycode + pagecode + str(page)
            # Crawl through the proxy server so the real IP does not get banned
            data1 = use_proxy(self.proxy, url)
            if data1 is None:
                # use_proxy failed for this listing page; skip it
                continue

            print("data1:" + data1)

            # Regex for the article links on the listing page
            listurlpat = '<div class="txt-box">.*?(http://.*?)"'
            listurl.append(re.compile(listurlpat, re.S).findall(data1))

        # Debugging aid
        print("Collected links from " + str(len(listurl)) + " pages")
        for i in range(0, len(listurl)):
            # Give thread 2 a moment to keep up
            time.sleep(7)
            for j in range(0, len(listurl[i])):
                try:
                    url = listurl[i][j]
                    # Turn the collected link into the real URL: the collected ones
                    # carry an extra "amp;" (compare the two forms to verify)
                    url = url.replace("amp;", "")
                    print("Enqueueing item i=" + str(i) + ", j=" + str(j))
                    self.urlqueue.put(url)
                except ue.URLError as e:
                    if hasattr(e, "code"):
                        print(e.code)
                    if hasattr(e, "reason"):
                        print(e.reason)
                    time.sleep(10)
                except Exception as e:
                    print("exception: " + str(e))
                    time.sleep(1)
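
The listing URL that geturl.run() assembles hinges on percent-encoding; the pieces can be checked in isolation:

# How the search URL is assembled: ur.quote percent-encodes the keyword and "&page".
demo_url = ("http://weixin.sogou.com/weixin?type=2&query="
            + ur.quote("人工智能") + ur.quote("&page") + str(1))
print(demo_url)
# http://weixin.sogou.com/weixin?type=2&query=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%26page1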

# Thread 2: runs in parallel with thread 1, fetching and processing each article
# from the URLs that thread 1 puts on the queue
class getcontent(td.Thread):
    def __init__(self, urlqueue, proxy):
        td.Thread.__init__(self)
        self.urlqueue = urlqueue
        self.proxy = proxy
    def run(self):
        html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>WeChat articles</title>
</head>
<body>'''
        fh = open("G:/Pcode/2.html", "wb")
        fh.write(html1.encode("utf-8"))
        fh.close()
        fh = open("G:/Pcode/2.html", "ab")
        i = 1
        while True:
            try:
                url = self.urlqueue.get()
                data = use_proxy(self.proxy, url)
                if data is None:
                    # use_proxy failed; move on to the next URL
                    continue
                titlepat = "<title>(.*?)</title>"
                contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
                title = re.compile(titlepat).findall(data)
                content = re.compile(contentpat, re.S).findall(data)
                thistitle = "Not captured this time"
                thiscontent = "Not captured this time"
                if title:
                    thistitle = title[0]
                if content:
                    thiscontent = content[0]
                dataall = "<p>Title: " + thistitle + "</p><p>Content: " + thiscontent + "</p><br/>"
                fh.write(dataall.encode("utf-8"))
                print("Processed page " + str(i))  # debugging aid
                i += 1
            except ue.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                time.sleep(10)
            except Exception as e:
                print("exception: " + str(e))
                time.sleep(1)
        fh.close()
        html2 = '''</body>
</html>
'''
        fh = open("G:/Pcode/2.html", "ab")
        fh.write(html2.encode("utf-8"))
        fh.close()
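
The title/content extraction above is plain re.findall over the page source; here is the same pair of regexes on a fabricated fragment (the HTML below is made up for illustration):

# The two extraction regexes applied to a fabricated page fragment.
sample_html = ('<title>demo title</title>'
               '<div id="js_content">demo body</div><div id="js_sg_bar">')
print(re.compile("<title>(.*?)</title>").findall(sample_html))  # ['demo title']
print(re.compile('id="js_content">(.*?)id="js_sg_bar"', re.S).findall(sample_html))
# ['demo body</div><div ']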

# Thread 3: control thread; ends the program once the queue has drained
class conrl(td.Thread):
    def __init__(self, urlqueue):
        td.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        while True:
            print("Program running")
            time.sleep(60)
            if self.urlqueue.empty():
                print("Program finished!")
                os._exit(0)  # plain exit() would only stop this control thread
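
conrl simply polls urlqueue.empty() once a minute. The queue module also offers a join()/task_done() protocol that blocks until every queued item has been marked done; a minimal sketch of that alternative (not what this script uses):

# Alternative shutdown idea: Queue.join() blocks until each put() item
# has been matched by a task_done() call from the consumer.
def consume(q):
    while True:
        item = q.get()
        print("handled", item)
        q.task_done()  # tell join() this item is finished

demo_q = qu.Queue()
td.Thread(target=consume, args=(demo_q,), daemon=True).start()
for n in range(3):
    demo_q.put(n)
demo_q.join()  # returns once all three items have been handled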


key = "人工智能";
proxy = "60.191.201.38:45461";
proxy2 = "";

pagestart = 1#起始页
pageend = 2#抓取到哪页

# Create and start thread 1
t1 = geturl(key, pagestart, pageend, proxy, urlqueue)
t1.start()

# Create and start thread 2
t2 = getcontent(urlqueue, proxy)
t2.start()

# Create and start thread 3
t3 = conrl(urlqueue)
t3.start()
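
If the main thread should wait for the crawl instead of returning right away, it can join on the workers; an optional sketch (the control thread above already ends the process on its own):

# Optional: block until the URL-producing thread finishes.
t1.join()
print("geturl thread finished; items still queued:", urlqueue.qsize())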


Reposted from www.cnblogs.com/q3619940/p/10672097.html