WeChat shipped an update in April, and the old Official Account crawlers no longer work as well. WeChat now targets individual accounts: frequent access gets an account blocked, and opening an account's article history shows a "page cannot be opened" message, though access usually comes back after about two days. The current approach is therefore to throttle the request rate and spread the crawling across multiple WeChat accounts. The Official Account collection site I built earlier had gone unmaintained for a while, so here is the cleaned-up code.
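To make the rate-control idea concrete, here is a minimal throttling sketch (the Throttle class and the 15-45 second bounds are my own illustration, not part of the project code); the idea is to call wait() before every request to a history page:
# -*- coding: UTF-8 -*-
import random
import time

class Throttle:
    # Illustrative sketch only: randomized delays so requests do not hit
    # WeChat at a fixed rhythm. The bounds are guesses, not tested values.
    def __init__(self, min_delay=15, max_delay=45):
        self.min_delay = min_delay   # lower bound between requests, seconds
        self.max_delay = max_delay   # upper bound between requests, seconds
        self.last_request = 0
    def wait(self):
        # sleep whatever remains of a random interval since the last request
        delay = random.uniform(self.min_delay, self.max_delay)
        elapsed = time.time() - self.last_request
        if elapsed < delay:
            time.sleep(delay - elapsed)
        self.last_request = time.time()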
Fetching the article links: Get_list.py
# -*- coding: UTF-8 -*-
import re
import urllib2
import cookielib
import json
import time
import sys
from Unique import Redis
import base64
import redis
sys.setrecursionlimit(999999999)
REDIS=Redis()
TASK_SCHEDUL = 'task::mweb'
REDIS_URL = 'redis://xxxx:6379'
REDIS_HOST = 'xxxx'
REDIS_PORT = 6379
def from_settings():
    return redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password='xxxx', db=4)
rediscli = from_settings()
def from_settings1():  # settings
    return redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password='', db=10)
rediscli1 = from_settings1()
## Pop the next history-page URL from the Redis task queue ##
def get_start_url():
    tps = rediscli.rpop('task::getmes')
    print 'task url----%s' % tps
    # strip &f=json; it is re-appended once the cookie has been set
    Url = re.sub(r'&f=json', '', tps)
    return Url
## Build the cookie jar and opener for the history page ##
def create_cookie(Url):
    global opener
    cookie = cookielib.CookieJar()
    handler = urllib2.HTTPCookieProcessor(cookie)
    opener = urllib2.build_opener(handler)
    # this first request primes the cookie jar
    response = opener.open(Url, timeout=5)
    C_url = Url + '&f=json'
    return C_url
## Fetch the most recent batch (roughly the last 10 days) of data ##
def get_recent_ten_list(url):
    headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Mobile/10B329 MicroMessenger/5.0.1'}
    request = urllib2.Request(url, headers=headers)
    response2 = opener.open(request, timeout=5)
    result = response2.read()
    # raw payload; format_data() later extracts the structured fields
    return result
def format_data(datas):
    data = datas
    print 'format'
    is_continue = 1
    is_friend = 0
    temp_b = 0
    url_list = []
    content_list = []
    ########### one batch covers roughly the last 10 days of posts ###########
    items = {'info:create_time': '%s' % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))}
    msg_list = re.findall(r'comm_msg_info(.*?)}},', data)
    print len(msg_list)
    for x in range(0, len(msg_list)):
        # cover (headline) article of each push
        ## posts published directly by the account owner can use a different format
        try:
            temp_content_url_1 = re.search(r'content_url:(.*?),', str(msg_list[x]))
            temp_index_cover_img = re.search(r'cover:(.*?),', str(msg_list[x]))
            temp_cover_title = re.search(r'title:(.*?),', str(msg_list[x]))
            temp_cover_digest = re.search(r'digest:(.*?),', str(msg_list[x]))
            temp_is_multi = re.search(r'is_multi:(.*?),', str(msg_list[x]))
            try:
                temp_source_url = re.search(r'source_url:(.*?),', str(msg_list[x]))
            except:
                temp_source_url = ''
            temp_content_url = temp_content_url_1.group(1)
            index_cover_img = temp_index_cover_img.group(1)
            cover_title = temp_cover_title.group(1)
            cover_digest = temp_cover_digest.group(1)
            is_multi = temp_is_multi.group(1)
            try:
                c_source_url = temp_source_url.group(1)
                #print temp_source_url.group(1)
            except:
                c_source_url = ''
        except Exception, e:
            print 'json err___.%s' % e
            continue
        temp_comm_msg_info = re.search(r'id:(.*?),', str(msg_list[x]))
        comm_msg_info = temp_comm_msg_info.group(1)
        content_url = re.sub(r'\\|amp;', '', temp_content_url)  # the raw cover URL carries escapes; this is the cleaned final URL
        temp_pub_time = re.search(r'datetime:(.*?),', str(msg_list[x]))
        pub_time = temp_pub_time.group(1)
        Key_cover = base64.encodestring(content_url)
        if not REDIS.getkey(Key_cover):
            print '%s_____' % (x + 1)
            print 'cover title--->%s' % cover_title.decode('utf8', 'ignore').encode('utf8')
            print 'cover url---->%s' % content_url
            content_list.append(cover_digest.decode('utf8', 'ignore').encode('utf8'))
            url_list.append(content_url)
            print u"current cover ID------->>%s" % comm_msg_info
            if content_url:
                rediscli.lpush('task::mweb', '{"url":"%s","time":"%s","cover_img":"%s","title":"%s","source_url":"%s","flag":"1"}' % (content_url, pub_time, index_cover_img, cover_title, c_source_url))
        else:
            print 'cover already collected---:%s' % cover_title.decode('utf8', 'ignore').encode('utf8')
        # ids shrink as we walk back through the history, so the last id of
        # the batch is the smallest; it becomes frommsgid for the next page
        temp_b = comm_msg_info
        if is_multi == '1':
            temp_more_content = ''.join(re.findall(r'multi_app_msg_item_list(.*?)],', str(msg_list[x])))
            temp_multi_app_msg_item_list = re.findall(r'{(.*?)}', temp_more_content)
            for z in range(0, len(temp_multi_app_msg_item_list)):
                temp_multi_url = ''.join(re.findall(r'content_url:(.*?),', temp_multi_app_msg_item_list[z]))
                list_title = ''.join(re.findall(r'title:(.*?),', ''.join(temp_multi_app_msg_item_list[z])))
                multi_url = re.sub(r'\\|amp;', '', temp_multi_url)  # cleaned final URL of this list item
                list_cover_img = ''.join(re.findall(r'cover:(.*?),', temp_multi_app_msg_item_list[z]))
                Key_cover2 = base64.encodestring(multi_url)
                l_source_url = ''.join(re.findall(r'source_url:(.*?),', temp_multi_app_msg_item_list[z]))
                if not REDIS.getkey(Key_cover2):
                    #print multi_url
                    print 'list title----->%s' % list_title.decode('utf8', 'ignore').encode('utf8')
                    content_list.append(list_title.decode('utf8', 'ignore').encode('utf8'))
                    url_list.append(multi_url)
                    #hbase.put(Key_cover2,'t_cr_duplicate',items)
                    if multi_url:
                        rediscli.lpush('task::mweb', '{"url":"%s","time":"%s","cover_img":"%s","title":"%s","source_url":"%s","flag":"%s"}' % (multi_url, pub_time, list_cover_img, list_title, l_source_url, (z + 2)))
                    #REDIS.setkey(Key_cover2,multi_url)
                else:
                    print 'list item already collected----:%s' % list_title
                    continue
        print '----------divider----------'
    return [is_continue, is_friend, temp_b, content_list, url_list]
######## Data older than ~10 days: "more messages" (requires following the account) ########
#BIZS=Window()
def retry_history(count):
    url = get_start_url()
    Json_url = create_cookie(url)
    data = get_recent_ten_list(Json_url)
    return data
def run():
    ddd = 0
    url = get_start_url()
    #print url
    url_biz = re.findall(r'biz=(.*?)&', url)
    re_biz = ''.join(url_biz)
    Json_url = create_cookie(url)
    print Json_url
    data = get_recent_ten_list(Json_url)
    msglist = re.findall(r"var msgList = '(.*?)';", data)
    # the embedded JSON is HTML-entity-escaped; strip the entities and escapes
    data = re.sub(r'&quot;|amp;|\\|\s+', '', ''.join(msglist))
    data = re.sub(r'&nbsp;', ' ', data)
    history_length = 0  # running total of articles published by this account
    ddd += 1
    try:
        History_args = format_data(data)
        rediscli.set('flag', '1')
    except Exception, e:
        print "data formatting error--->%s" % e
        print 'current-------%s' % ddd
        time.sleep(44444)  # most likely blocked; back off for roughly half a day
        rediscli.set('flag', '0')
        rediscli.rpush('task::getmes', url)  # put the task back for a retry
        data = retry_history(0)
        History_args = format_data(data)
    history_length += len(History_args[4])
    curtime_msgid = History_args[2]  # initialised here so the except branch below never hits an undefined name
    while True:
        if History_args[0] == 1 and History_args[1] == 1:
            uls = url + "&f=json&frommsgid=%s&count=10" % (History_args[2])
            print uls
            data1 = get_recent_ten_list(uls)
            try:
                History_args2 = format_data(data1)  # very long histories can outlive the key, hence the exception handling
            except Exception, e:
                print u'history fetch raised an exception--->%s' % e
                print 'cid--->%s' % curtime_msgid
                History_args[2] = curtime_msgid
                url = get_start_url()
                continue
            uptime_msgid = History_args[2]    # smallest id of the previous batch
            curtime_msgid = History_args2[2]  # smallest id of this batch
            if uptime_msgid == curtime_msgid:
                print 'full history fetched'
                break
            else:
                history_length += len(History_args2[4])
                History_args[2] = History_args2[2]
                print 'previous ID---->:%s current ID---->:%s' % (uptime_msgid, curtime_msgid)
                print '^^^^^^^^^^^^^^^^^ running article count: %s ^^^^^^^^^^^^^^^^^' % history_length
                if curtime_msgid == 0:
                    print u'history fetched'
                    break
                else:
                    continue
        elif History_args[0] == 1 and History_args[1] == 0:
            print 'account not followed; follow it to fetch the full history'
            break
        elif History_args[0] == 0 and History_args[1] == 1:
            print 'this followed account has very few posts'
            break
        else:
            print 'account not followed, and no more history available'
            break
    print '**************** total articles: %s ***********************' % history_length

while True:
    try:
        run()
    except Exception, e:
        print e
        time.sleep(3)
        continue
Crawling the article pages: crawl_detail.py
# -*- coding: UTF-8 -*-
import urllib2
import cookielib
import random
import MySQLdb
import re
import redis
from bs4 import BeautifulSoup
import time
import sys
from Unique import Redis
import base64
sys.setrecursionlimit(999999999)
reload(sys)
sys.setdefaultencoding('utf-8')
TASK_SCHEDUL = 'task::mweb'
REDIS_URL = 'redis://xxxx:6379'
REDIS_HOST = 'xxxx'
REDIS_PORT = 6379
REDIS_PASS='xxxx'
REDIS=Redis()
def from_settings():  # settings
    return redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASS, db=4)
rediscli = from_settings()
try:
    con = MySQLdb.connect(host='xxxx', user='xxxx', passwd='xxxx', port=3306, charset='utf8')
    con.ping(True)
    cur = con.cursor()
except Exception, e:
    print e
with con:
    con.select_db('wechat')
    # flag stores publish times; its max/min seed the bookkeeping below
    cur.execute('select max(flag),min(flag) from articles')
    data = cur.fetchall()
    row = data[0]
    print row[0]
    print row[1]
    max_time = int(row[0])
    min_time = int(row[1])
my_wx_id = ''
temp_my_time = 0
cmt = 0
while True:
    cmt += 1
    leng = rediscli.llen('task::mweb')
    if leng > 0:
        items = rediscli.lpop('task::mweb')
        try:
            infos = eval(items)
        except Exception, e:
            print e
            continue
        url = infos['url']
        pubtime = infos['time']
        # If the newest article time already in the database is newer than this
        # one (i.e. this article is not the latest), flag stores this article's publish time
        if max_time > int(pubtime):
            insert_flag = int(pubtime)
        # If this article is the newest, flag stores its publish time and max_time is advanced
        else:
            insert_flag = int(pubtime)
            max_time = int(pubtime)
        cover_img = infos['cover_img']
        title = infos['title']
        source_url = infos['source_url']
        flag = infos['flag']
    else:
        print u'task queue empty'
        time.sleep(5)
        continue
    cv = base64.encodestring(url)
    if not REDIS.getkey(cv):
        try:
            print url
            # a proxy turned out to be unnecessary
            #proxy = {'http':'60.169.78.218:808'}
            #proxy_support = urllib2.ProxyHandler(proxy)
            #opener = urllib2.build_opener(proxy_support)
            #urllib2.install_opener(opener)
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'}
            request = urllib2.Request(url, headers=headers)
            result = urllib2.urlopen(request, timeout=15)
            content = result.read()
            soup = BeautifulSoup(content)
            try:
                wx_id = soup.find('span', attrs={'class': 'profile_meta_value'}).get_text()
                if not wx_id:
                    temp_wx_id = re.findall(r'var user_name = "(.*?)"', content)
                    wx_id = ''.join(temp_wx_id)
                # same account publishing at the same second: decrement min_time so the flag value stays unique
                if str(wx_id) == str(my_wx_id) and int(pubtime) == int(temp_my_time):
                    min_time -= 1
                    insert_flag = min_time
                    print '%s__%s duplicate timestamp__%s' % (wx_id, my_wx_id, insert_flag)
                my_wx_id = wx_id
                temp_my_time = pubtime
                print 'insert_flag---%s__max_time_%s' % (insert_flag, max_time)
            except Exception, e:
                '''
                if str(e).find('NoneType') != -1:
                    print 'the article was reported and taken down'
                    REDIS.setkey(cv, url)
                else:
                    print u'failed to parse the WeChat id from the page----->%s' % e
                '''
                print 'parse error---%s' % e
                #print content
                #time.sleep(3)
                continue
            wx_name = soup.find('strong', attrs={'class': 'profile_nickname'}).get_text()
            wx_pubtime = soup.find('em', attrs={'id': 'post-date'}).get_text()
            print u'account name---:%s' % wx_name
            print u'WeChat id---:%s' % wx_id
            print u'item ----:%s of %s' % (cmt, leng)
            tex = soup.find('div', attrs={'id': 'js_article'})
            tex2 = soup.find('div', attrs={'id': 'js_content'})
            rand_str = random.randint(360, 500)
            digest = str(tex2.get_text().strip())[:rand_str]
            # articles with hardly any text are not shown; flag them with -999 so they can be deleted later
            if len(digest) < 250:
                insert_flag = -999
            print 'digest----%s' % digest
            try:
                cur.execute("insert into articles(wx_id,wx_name,content,url,title,publish_time,cover_img,source_url,digest,flag) values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (str(wx_id), str(wx_name.encode('utf8', 'ignore')), MySQLdb.escape_string(str(tex).encode('utf8', 'ignore')), url, MySQLdb.escape_string(soup.title.string.encode('utf8', 'ignore')), pubtime, cover_img, source_url, MySQLdb.escape_string(digest), insert_flag))
            except Exception, e:
                print 'mysql error ---:%s' % e
            con.commit()
            REDIS.setkey(cv, url)
        except Exception, e:
            print u'exception at---%s-----%s' % (cmt, e)
            time.sleep(3)
            continue
    else:
        print u'already collected___:%s' % url
con.close()
Redis dedup module: Unique.py
# -*- coding: UTF-8 -*-
import redis
import base64

class Redis:
    def __init__(self):
        self.red = redis.Redis(host='xxxx', port=6379, password='xxxx', db=4)
    def setkey(self, key, value):
        self.red.set(key, value)
    def getkey(self, key):
        return self.red.get(key)
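For reference, both scripts above use this module the same way: the base64 of the article URL is the dedup key, checked before a crawl and written only after a successful store. A condensed sketch of that flow (crawl_and_store is a hypothetical stand-in for the fetch-and-insert code above):
import base64
from Unique import Redis

REDIS = Redis()

def dedup_crawl(url):
    key = base64.encodestring(url)  # dedup key: base64 of the article URL
    if not REDIS.getkey(key):
        crawl_and_store(url)        # hypothetical stand-in for the fetch + MySQL insert
        REDIS.setkey(key, url)      # mark as collected only after success
    else:
        print 'already collected: %s' % url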
That is the entire Official Account crawler. The approach worked very well before April; since WeChat updated its anti-crawling measures it is still usable, provided the request rate is kept under control.
Feedback is welcome, as are better approaches. If there is demand, I will push the project to git later.
2017.6.8