WeChat shipped an update in April, and the old Official Account crawlers no longer work as well. WeChat now targets individual accounts: frequent access gets an account blocked, and opening an account's article history shows a "page cannot be opened" message, though access usually comes back after about two days. The current approach is therefore to throttle the request rate and spread the crawling across multiple WeChat accounts. The Official Account collection site I built earlier had gone unmaintained for a while, so here is the cleaned-up code.
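To make the rate-control idea concrete, here is a minimal throttling sketch (the Throttle class and the 15-45 second bounds are my own illustration, not part of the project code); the idea is to call wait() before every request to a history page:
# -*- coding: UTF-8 -*-
import random
import time

class Throttle:
    # Illustrative sketch only: randomized delays so requests do not hit
    # WeChat at a fixed rhythm. The bounds are guesses, not tested values.
    def __init__(self, min_delay=15, max_delay=45):
        self.min_delay = min_delay   # lower bound between requests, seconds
        self.max_delay = max_delay   # upper bound between requests, seconds
        self.last_request = 0
    def wait(self):
        # sleep whatever remains of a random interval since the last request
        delay = random.uniform(self.min_delay, self.max_delay)
        elapsed = time.time() - self.last_request
        if elapsed < delay:
            time.sleep(delay - elapsed)
        self.last_request = time.time()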
Fetching the article links: Get_list.py
# -*- coding: UTF-8 -*-
import re
import urllib2
import cookielib
import json
import time
import sys
from Unique import Redis
import base64
import redis
sys.setrecursionlimit(999999999)
REDIS=Redis()
TASK_SCHEDUL = 'task::mweb'
REDIS_URL = 'redis://xxxx:6379'
REDIS_HOST = 'xxxx'
REDIS_PORT = 6379
def from_settings():
    return redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password='xxxx', db=4)
rediscli = from_settings()
def from_settings1():  # settings
    return redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password='', db=10)
rediscli1 = from_settings1()
## Pop the next history-page URL from the Redis task queue ##
def get_start_url():
    tps = rediscli.rpop('task::getmes')
    print 'task url----%s' % tps
    # strip &f=json; it is re-appended once the cookie has been set
    Url = re.sub(r'&f=json', '', tps)
    return Url
## Build the cookie jar and opener for the history page ##
def create_cookie(Url):
    global opener
    cookie = cookielib.CookieJar()
    handler = urllib2.HTTPCookieProcessor(cookie)
    opener = urllib2.build_opener(handler)
    # this first request primes the cookie jar
    response = opener.open(Url, timeout=5)
    C_url = Url + '&f=json'
    return C_url
## Fetch the most recent batch (roughly the last 10 days) of data ##
def get_recent_ten_list(url):
    headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Mobile/10B329 MicroMessenger/5.0.1'}
    request = urllib2.Request(url, headers=headers)
    response2 = opener.open(request, timeout=5)
    result = response2.read()
    # raw payload; format_data() later extracts the structured fields
    return result
def format_data(datas):
    data = datas
    print 'format'
    is_continue = 1
    is_friend = 0
    temp_b = 0
    url_list = []
    content_list = []
    ########### one batch covers roughly the last 10 days of posts ###########
    items = {'info:create_time': '%s' % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))}
    msg_list = re.findall(r'comm_msg_info(.*?)}},', data)
    print len(msg_list)
    for x in range(0, len(msg_list)):
        # cover (headline) article of each push
        ## posts published directly by the account owner can use a different format
        try:
            temp_content_url_1 = re.search(r'content_url:(.*?),', str(msg_list[x]))
            temp_index_cover_img = re.search(r'cover:(.*?),', str(msg_list[x]))
            temp_cover_title = re.search(r'title:(.*?),', str(msg_list[x]))
            temp_cover_digest = re.search(r'digest:(.*?),', str(msg_list[x]))
            temp_is_multi = re.search(r'is_multi:(.*?),', str(msg_list[x]))
            try:
                temp_source_url = re.search(r'source_url:(.*?),', str(msg_list[x]))
            except:
                temp_source_url = ''
            temp_content_url = temp_content_url_1.group(1)
            index_cover_img = temp_index_cover_img.group(1)
            cover_title = temp_cover_title.group(1)
            cover_digest = temp_cover_digest.group(1)
            is_multi = temp_is_multi.group(1)
            try:
                c_source_url = temp_source_url.group(1)
                #print temp_source_url.group(1)
            except:
                c_source_url = ''
        except Exception, e:
            print 'json err___.%s' % e
            continue
        temp_comm_msg_info = re.search(r'id:(.*?),', str(msg_list[x]))
        comm_msg_info = temp_comm_msg_info.group(1)
        content_url = re.sub(r'\\|amp;', '', temp_content_url)  # the raw cover URL carries escapes; this is the cleaned final URL
        temp_pub_time = re.search(r'datetime:(.*?),', str(msg_list[x]))
        pub_time = temp_pub_time.group(1)
        Key_cover = base64.encodestring(content_url)
        if not REDIS.getkey(Key_cover):
            print '%s_____' % (x + 1)
            print 'cover title--->%s' % cover_title.decode('utf8', 'ignore').encode('utf8')
            print 'cover url---->%s' % content_url
            content_list.append(cover_digest.decode('utf8', 'ignore').encode('utf8'))
            url_list.append(content_url)
            print u"current cover ID------->>%s" % comm_msg_info
            if content_url:
                rediscli.lpush('task::mweb', '{"url":"%s","time":"%s","cover_img":"%s","title":"%s","source_url":"%s","flag":"1"}' % (content_url, pub_time, index_cover_img, cover_title, c_source_url))
        else:
            print 'cover already collected---:%s' % cover_title.decode('utf8', 'ignore').encode('utf8')
        # ids shrink as we walk back through the history, so the last id of
        # the batch is the smallest; it becomes frommsgid for the next page
        temp_b = comm_msg_info
        if is_multi == '1':
            temp_more_content = ''.join(re.findall(r'multi_app_msg_item_list(.*?)],', str(msg_list[x])))
            temp_multi_app_msg_item_list = re.findall(r'{(.*?)}', temp_more_content)
            for z in range(0, len(temp_multi_app_msg_item_list)):
                temp_multi_url = ''.join(re.findall(r'content_url:(.*?),', temp_multi_app_msg_item_list[z]))
                list_title = ''.join(re.findall(r'title:(.*?),', ''.join(temp_multi_app_msg_item_list[z])))
                multi_url = re.sub(r'\\|amp;', '', temp_multi_url)  # cleaned final URL of this list item
                list_cover_img = ''.join(re.findall(r'cover:(.*?),', temp_multi_app_msg_item_list[z]))
                Key_cover2 = base64.encodestring(multi_url)
                l_source_url = ''.join(re.findall(r'source_url:(.*?),', temp_multi_app_msg_item_list[z]))
                if not REDIS.getkey(Key_cover2):
                    #print multi_url
                    print 'list title----->%s' % list_title.decode('utf8', 'ignore').encode('utf8')
                    content_list.append(list_title.decode('utf8', 'ignore').encode('utf8'))
                    url_list.append(multi_url)
                    #hbase.put(Key_cover2,'t_cr_duplicate',items)
                    if multi_url:
                        rediscli.lpush('task::mweb', '{"url":"%s","time":"%s","cover_img":"%s","title":"%s","source_url":"%s","flag":"%s"}' % (multi_url, pub_time, list_cover_img, list_title, l_source_url, (z + 2)))
                    #REDIS.setkey(Key_cover2,multi_url)
                else:
                    print 'list item already collected----:%s' % list_title
                    continue
        print '----------divider----------'
    return [is_continue, is_friend, temp_b, content_list, url_list]
######## Data older than ~10 days: "more messages" (requires following the account) ########
#BIZS=Window()
def retry_history(count):
    url = get_start_url()
    Json_url = create_cookie(url)
    data = get_recent_ten_list(Json_url)
    return data
def run():
    ddd = 0
    url = get_start_url()
    #print url
    url_biz = re.findall(r'biz=(.*?)&', url)
    re_biz = ''.join(url_biz)
    Json_url = create_cookie(url)
    print Json_url
    data = get_recent_ten_list(Json_url)
    msglist = re.findall(r"var msgList = '(.*?)';", data)
    # the embedded JSON is HTML-entity-escaped; strip the entities and escapes
    data = re.sub(r'&quot;|amp;|\\|\s+', '', ''.join(msglist))
    data = re.sub(r'&nbsp;', ' ', data)
    history_length = 0  # running total of articles published by this account
    ddd += 1
    try:
        History_args = format_data(data)
        rediscli.set('flag', '1')
    except Exception, e:
        print "data formatting error--->%s" % e
        print 'current-------%s' % ddd
        time.sleep(44444)  # most likely blocked; back off for roughly half a day
        rediscli.set('flag', '0')
        rediscli.rpush('task::getmes', url)  # put the task back for a retry
        data = retry_history(0)
        History_args = format_data(data)
    history_length += len(History_args[4])
    curtime_msgid = History_args[2]  # initialised here so the except branch below never hits an undefined name
    while True:
        if History_args[0] == 1 and History_args[1] == 1:
            uls = url + "&f=json&frommsgid=%s&count=10" % (History_args[2])
            print uls
            data1 = get_recent_ten_list(uls)
            try:
                History_args2 = format_data(data1)  # very long histories can outlive the key, hence the exception handling
            except Exception, e:
                print u'history fetch raised an exception--->%s' % e
                print 'cid--->%s' % curtime_msgid
                History_args[2] = curtime_msgid
                url = get_start_url()
                continue
            uptime_msgid = History_args[2]    # smallest id of the previous batch
            curtime_msgid = History_args2[2]  # smallest id of this batch
            if uptime_msgid == curtime_msgid:
                print 'full history fetched'
                break
            else:
                history_length += len(History_args2[4])
                History_args[2] = History_args2[2]
                print 'previous ID---->:%s current ID---->:%s' % (uptime_msgid, curtime_msgid)
                print '^^^^^^^^^^^^^^^^^ running article count: %s ^^^^^^^^^^^^^^^^^' % history_length
                if curtime_msgid == 0:
                    print u'history fetched'
                    break
                else:
                    continue
        elif History_args[0] == 1 and History_args[1] == 0:
            print 'account not followed; follow it to fetch the full history'
            break
        elif History_args[0] == 0 and History_args[1] == 1:
            print 'this followed account has very few posts'
            break
        else:
            print 'account not followed, and no more history available'
            break
    print '**************** total articles: %s ***********************' % history_length

while True:
    try:
        run()
    except Exception, e:
        print e
        time.sleep(3)
        continue
Crawling the article pages: crawl_detail.py
# -*- coding: UTF-8 -*-
import urllib2
import cookielib
import random
import MySQLdb
import re
import redis
from bs4 import BeautifulSoup
import time
import sys
from Unique import Redis
import base64
sys.setrecursionlimit(999999999)
reload(sys)
sys.setdefaultencoding('utf-8')
TASK_SCHEDUL = 'task::mweb'
REDIS_URL = 'redis://xxxx:6379'
REDIS_HOST = 'xxxx'
REDIS_PORT = 6379
REDIS_PASS='xxxx'
REDIS=Redis()
def from_settings():  # settings
    return redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASS, db=4)
rediscli = from_settings()
try:
    con = MySQLdb.connect(host='xxxx', user='xxxx', passwd='xxxx', port=3306, charset='utf8')
    con.ping(True)
    cur = con.cursor()
except Exception, e:
    print e
with con:
    con.select_db('wechat')
    # flag stores publish times; its max/min seed the bookkeeping below
    cur.execute('select max(flag),min(flag) from articles')
    data = cur.fetchall()
    row = data[0]
    print row[0]
    print row[1]
    max_time = int(row[0])
    min_time = int(row[1])
my_wx_id = ''
temp_my_time = 0
cmt = 0
while True:
    cmt += 1
    leng = rediscli.llen('task::mweb')
    if leng > 0:
        items = rediscli.lpop('task::mweb')
        try:
            infos = eval(items)
        except Exception, e:
            print e
            continue
        url = infos['url']
        pubtime = infos['time']
        # If the newest article time already in the database is newer than this
        # one (i.e. this article is not the latest), flag stores this article's publish time
        if max_time > int(pubtime):
            insert_flag = int(pubtime)
        # If this article is the newest, flag stores its publish time and max_time is advanced
        else:
            insert_flag = int(pubtime)
            max_time = int(pubtime)
        cover_img = infos['cover_img']
        title = infos['title']
        source_url = infos['source_url']
        flag = infos['flag']
    else:
        print u'task queue empty'
        time.sleep(5)
        continue
    cv = base64.encodestring(url)
    if not REDIS.getkey(cv):
        try:
            print url
            # a proxy turned out to be unnecessary
            #proxy = {'http':'60.169.78.218:808'}
            #proxy_support = urllib2.ProxyHandler(proxy)
            #opener = urllib2.build_opener(proxy_support)
            #urllib2.install_opener(opener)
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'}
            request = urllib2.Request(url, headers=headers)
            result = urllib2.urlopen(request, timeout=15)
            content = result.read()
            soup = BeautifulSoup(content)
            try:
                wx_id = soup.find('span', attrs={'class': 'profile_meta_value'}).get_text()
                if not wx_id:
                    temp_wx_id = re.findall(r'var user_name = "(.*?)"', content)
                    wx_id = ''.join(temp_wx_id)
                # same account publishing at the same second: decrement min_time so the flag value stays unique
                if str(wx_id) == str(my_wx_id) and int(pubtime) == int(temp_my_time):
                    min_time -= 1
                    insert_flag = min_time
                    print '%s__%s duplicate timestamp__%s' % (wx_id, my_wx_id, insert_flag)
                my_wx_id = wx_id
                temp_my_time = pubtime
                print 'insert_flag---%s__max_time_%s' % (insert_flag, max_time)
            except Exception, e:
                '''
                if str(e).find('NoneType') != -1:
                    print 'the article was reported and taken down'
                    REDIS.setkey(cv, url)
                else:
                    print u'failed to parse the WeChat id from the page----->%s' % e
                '''
                print 'parse error---%s' % e
                #print content
                #time.sleep(3)
                continue
            wx_name = soup.find('strong', attrs={'class': 'profile_nickname'}).get_text()
            wx_pubtime = soup.find('em', attrs={'id': 'post-date'}).get_text()
            print u'account name---:%s' % wx_name
            print u'WeChat id---:%s' % wx_id
            print u'item ----:%s of %s' % (cmt, leng)
            tex = soup.find('div', attrs={'id': 'js_article'})
            tex2 = soup.find('div', attrs={'id': 'js_content'})
            rand_str = random.randint(360, 500)
            digest = str(tex2.get_text().strip())[:rand_str]
            # articles with hardly any text are not shown; flag them with -999 so they can be deleted later
            if len(digest) < 250:
                insert_flag = -999
            print 'digest----%s' % digest
            try:
                cur.execute("insert into articles(wx_id,wx_name,content,url,title,publish_time,cover_img,source_url,digest,flag) values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (str(wx_id), str(wx_name.encode('utf8', 'ignore')), MySQLdb.escape_string(str(tex).encode('utf8', 'ignore')), url, MySQLdb.escape_string(soup.title.string.encode('utf8', 'ignore')), pubtime, cover_img, source_url, MySQLdb.escape_string(digest), insert_flag))
            except Exception, e:
                print 'mysql error ---:%s' % e
            con.commit()
            REDIS.setkey(cv, url)
        except Exception, e:
            print u'exception at---%s-----%s' % (cmt, e)
            time.sleep(3)
            continue
    else:
        print u'already collected___:%s' % url
con.close()
Redis dedup module: Unique.py
# -*- coding: UTF-8 -*-
import redis
import base64

class Redis:
    def __init__(self):
        self.red = redis.Redis(host='xxxx', port=6379, password='xxxx', db=4)
    def setkey(self, key, value):
        self.red.set(key, value)
    def getkey(self, key):
        return self.red.get(key)
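For reference, both scripts above use this module the same way: the base64 of the article URL is the dedup key, checked before a crawl and written only after a successful store. A condensed sketch of that flow (crawl_and_store is a hypothetical stand-in for the fetch-and-insert code above):
import base64
from Unique import Redis

REDIS = Redis()

def dedup_crawl(url):
    key = base64.encodestring(url)  # dedup key: base64 of the article URL
    if not REDIS.getkey(key):
        crawl_and_store(url)        # hypothetical stand-in for the fetch + MySQL insert
        REDIS.setkey(key, url)      # mark as collected only after success
    else:
        print 'already collected: %s' % url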
That is the entire Official Account crawler. The approach worked very well before April; since WeChat updated its anti-crawling measures it is still usable, provided the request rate is kept under control.
Feedback is welcome, as are better approaches. If there is demand, I will push the project to git later.
2017.6.8