这两天从早上写代码到半夜,终于找回一点做程序员的感觉,人闲太久了真没劲,所以没事可以多定定计划,找一找奋斗的感觉挺好。
闲话不多说,今天的笔记主要记录爬取微信网页版的整个过程。
爬取分为几个步骤:
1.实现登录
2.实现用户初始化,获取最近联系人以及所有用户信息
3.实现对用户发送消息
4.实现对消息的接收
本文主要使用 Flask 框架、requests 库以及 bs4(BeautifulSoup)来实现爬虫。
代码结构:
步骤:
1.创建flask框架Wechat,到manage.py里面定义登录login函数:
# -*- coding: utf-8 -*-
"""Flask application that logs in to the WeChat web client via QR code."""
import json
import re
import time

import requests
from bs4 import BeautifulSoup
from flask import Flask, jsonify, render_template, request, session

app = Flask(__name__)
app.debug = True
# NOTE(review): hard-coded secret key kept as-is so existing sessions stay
# valid; load it from configuration before deploying anywhere real.
app.secret_key = 'abcdefghigklmn'


@app.route('/login', methods=['GET', 'POST'])
def login():
    """Fetch a fresh login UUID from WeChat and render the QR-code page.

    The UUID is stored in the session under ``qcode`` so that the
    ``/check_login`` view can poll the scan status of the same QR code.

    Returns:
        The rendered ``login.html`` page on GET; a 405 response on POST
        (no POST handling is implemented); a 502 response when no UUID can
        be found in WeChat's reply.
    """
    if request.method != 'GET':
        # The POST branch used to fall through and return None, which Flask
        # rejects with a 500 error; answer explicitly instead.
        return 'Method Not Allowed', 405

    ctime = str(int(time.time() * 1000))  # millisecond timestamp
    qcode_url = (
        'https://login.wx.qq.com/jslogin?appid=wx782c26e4c19acffb'
        '&redirect_uri=https%3A%2F%2Fwx.qq.com%2Fcgi-bin%2Fmmwebwx-bin%2Fwebwxnewloginpage'
        '&fun=new&lang=zh_CN&_={0}'
    ).format(ctime)
    res = requests.get(qcode_url)
    # Debug output; the parenthesised single-argument form prints the same
    # text on Python 2 and is valid syntax on Python 3.
    print(res.text)

    # re.search + a None check replaces findall(...)[0], which raised
    # IndexError whenever the reply did not contain a uuid.
    match = re.search(r'uuid = "(.*?)";', res.text)
    if match is None:
        return 'Failed to obtain a login UUID from WeChat', 502

    qcode = match.group(1)
    session['qcode'] = qcode
    return render_template('login.html', qcode=qcode)
@app.route('/check_login')
def check_login():
    """Poll WeChat once for the scan/confirm state of the current QR code.

    Sends a GET request such as
    https://login.wx.qq.com/cgi-bin/mmwebwx-bin/login?loginicon=true&uuid=gbG3TQrkaA==&tip=0&r=-925318273&_=1529933650035

    Returns:
        A JSON object whose ``code`` is 408 by default, 201 once the reply
        contains ``code=201`` (with the avatar in ``src`` when present), or
        200 once the reply contains ``code=200`` and the ticket data and
        cookies have been stored in the session.
    """
    response = {'code': 408}
    qcode = session.get('qcode')
    ctime = str(int(time.time() * 1000))
    check_url = 'https://login.wx.qq.com/cgi-bin/mmwebwx-bin/login?loginicon=true&uuid={0}&tip=0&r=-925318273&_={1}'.format(qcode, ctime)
    req = requests.get(check_url)

    if 'code=201' in req.text:
        # The user has scanned the code: hand the avatar back to the page.
        response['code'] = 201
        # re.search + None check instead of findall(...)[0], which raised
        # IndexError when the avatar was missing from the reply.
        avatar_match = re.search(r"userAvatar = '(.*?)';", req.text)
        if avatar_match is not None:
            src = avatar_match.group(1)
            # Same output as the old Python 2 print statement, but also
            # valid syntax on Python 3.
            print('src== %s' % src)
            response['src'] = src
    elif 'code=200' in req.text:
        # Login confirmed: follow the redirect URI to collect the ticket.
        redirect_match = re.search(r'redirect_uri="(.*?)";', req.text)
        if redirect_match is not None:
            redirect_uri = redirect_match.group(1) + '&fun=new&version=v2'
            ticket_ret = requests.get(redirect_uri)
            ticket_dict = xml_parser(ticket_ret.text)
            session['ticket_dict'] = ticket_dict
            session['ticket_cookie'] = ticket_ret.cookies.get_dict()
            response['code'] = 200
        # Without a redirect URI nothing was stored, so the response keeps
        # code 408 and the page polls again.
    return jsonify(response)
def xml_parser(text):
    """Flatten the direct children of the ``<error>`` element into a dict.

    Args:
        text: Markup to parse; ``check_login`` passes the body of the
            response fetched from the login redirect URI.

    Returns:
        A dict mapping each direct child's tag name to its text, or an
        empty dict when *text* contains no ``<error>`` element.
    """
    soup = BeautifulSoup(text, 'html.parser')
    root = soup.find(name='error')
    if root is None:
        # find() returns None when nothing matches; the old code then
        # failed with AttributeError on ``None.find_all``.
        return {}
    return {child.name: child.text for child in root.find_all(recursive=False)}
创建login.html,代码如下:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div style="width:200px;margin:0 auto">
        <h1 style="text-align: center">登陆</h1>
        <!-- QR code for the login UUID; swapped for the user's avatar once scanned.
             Uses the standard <img> element instead of the non-standard <image>. -->
        <img id="img" style="height:200px;width:200px;" src="https://login.wx.qq.com/qrcode/{{qcode}}" alt="">
    </div>
    <script src="/static/jquery-1.12.4.min.js"></script>
    <script>
        $(function () {
            checkLogin();
        });

        // Poll /check_login until the server reports a confirmed login.
        function checkLogin() {
            $.ajax({
                url: '/check_login',
                type: 'GET',
                dataType: 'JSON',
                success: function (arg) {
                    if (arg.code === 201) {
                        // QR code scanned: show the avatar and keep polling.
                        console.log('src:', arg.src);
                        if (arg.src) {
                            $('#img').attr('src', arg.src);
                        }
                        checkLogin();
                    } else if (arg.code === 200) {
                        // Login confirmed: go to the user page.
                        location.href = '/index';
                    } else {
                        checkLogin();
                    }
                },
                error: function () {
                    // A failed request used to end the polling loop silently;
                    // retry after a short pause instead.
                    setTimeout(checkLogin, 1000);
                }
            });
        }
    </script>
</body>
</html>
2.用户初始化并获取用户头像代码实现:
# User initialisation step
@app.route('/index')
def index():
    """Initialise the WeChat session and render the logged-in user's page.

    Posts the credentials stored in the session to the ``webwxinit``
    endpoint (e.g. https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxinit?r=-2115319983),
    then saves the reply's ``User`` and ``SyncKey`` entries in the session
    under ``current_user`` and ``synckey``.

    Returns:
        The rendered ``index.html`` page, given the decoded init reply as
        ``user_dict``.
    """
    ticket_dict = session.get('ticket_dict')
    ticket_cookie = session.get('ticket_cookie')
    init_url = 'https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxinit?r=-2132117709&pass_ticket={0}'.format(ticket_dict.get('pass_ticket'))
    data_dict = {
        'BaseRequest': {
            'DeviceID': "e292711087499063",
            'Sid': ticket_dict.get('wxsid'),
            # The ticket key is 'wxuin' (the key get_msg reads); the former
            # 'wxuid' lookup always produced None.
            'Uin': ticket_dict.get('wxuin'),
            'Skey': ticket_dict.get('skey'),
        },
    }
    init_ret = requests.post(
        url=init_url,
        # json= serialises the dict as a JSON body, roughly equivalent to
        # data=json.dumps(data_dict) plus a JSON Content-Type header.
        json=data_dict,
        cookies=ticket_cookie,
    )
    init_ret.encoding = 'utf-8'
    user_dict = init_ret.json()
    session['current_user'] = user_dict['User']
    session['synckey'] = user_dict['SyncKey']
    return render_template('index.html', user_dict=user_dict)


@app.route('/get_img')
def get_img():
    """Fetch the current user's avatar from wx.qq.com and return its bytes.

    Example avatar URL:
    https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxgeticon?seq=1182160498&username=@f04bb7e4d7821f504a4992ca85be95aa3e9957c7e3dfb224dc467af8639450e7&skey=@crypt_a1d89414_e0cf3503fac08d5ac1bf9fadcae86c0d

    Returns:
        A response carrying the image bytes, labelled with the upstream
        Content-Type (``image/jpeg`` when the upstream reply has none).
    """
    current_user = session['current_user']
    ticket_cookie = session.get('ticket_cookie')
    head_url = "https://wx.qq.com" + current_user["HeadImgUrl"]
    img_ret = requests.get(head_url, cookies=ticket_cookie, headers={'Content-Type': 'image/jpg'})
    # Returning the raw bytes let Flask label them text/html; send an
    # explicit image content type instead.
    return app.response_class(
        img_ret.content,
        content_type=img_ret.headers.get('Content-Type', 'image/jpeg'),
    )
前端代码index.html如下:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <h1>欢迎登陆{{user_dict.User.NickName}}</h1>

    <!-- Profile of the logged-in user; the avatar is served by the /get_img view. -->
    <div>
        <img src="/get_img" alt="">
        <h2>{{user_dict.User.NickName}}</h2>
        <h2>{{user_dict.User.UserName}}</h2>
    </div>

    <!-- Contacts listed in the init reply's ContactList. -->
    <h3>最近登录联系人</h3>
    <ul>
        {% for user in user_dict.ContactList %}
        <li>{{ user.NickName}}</li>
        {% endfor %}
    </ul>

    <a href="/user_list">查看所有联系人</a>
</body>
</html>
3.到这里就能够实现自动登录并获取到最近联系人,接着我们获取所有联系人及信息
@app.route('/user_list')
def user_list():
    """Download the contact list from ``webwxgetcontact`` and render it.

    Builds the request URL from the ``pass_ticket`` and ``skey`` stored in
    the session plus a millisecond timestamp, sends it with the saved
    cookies, and passes the decoded JSON reply to ``user_list.html`` as
    ``wx_user_dict``.
    """
    credentials = session.get('ticket_dict')
    saved_cookies = session.get('ticket_cookie')
    now_ms = int(time.time() * 1000)

    contacts_url = "https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxgetcontact?lang=zh_CN&pass_ticket={0}&r={1}&seq=0&skey={2}".format(
        credentials.get('pass_ticket'),
        now_ms,
        credentials.get('skey'),
    )

    reply = requests.get(contacts_url, cookies=saved_cookies)
    reply.encoding = 'utf-8'  # decode the body as UTF-8 before parsing it
    return render_template('user_list.html', wx_user_dict=reply.json())
前端代码user_list.html如下:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div>
        <!-- Left column: member count followed by every contact's nickname and user name. -->
        <div style="width:30%;float:left;">
            <h3>{{wx_user_dict.MemberCount}}</h3>
            <ul>
                {% for item in wx_user_dict.MemberList %}
                <li>{{ item.NickName }} ===== {{item.UserName}}</li>
                {% endfor %}
            </ul>
        </div>
        <!-- Right column (empty). The style attribute was missing its closing quote,
             which swallowed the markup that followed it. -->
        <div style="width:7%;float:right;">
        </div>
    </div>
</body>
</html>
4.接下来可以实现发送消息的功能
首先创建前端send.html页面
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <!-- Posts back to the current URL; the send view reads the "to" and "content" fields. -->
    <form action="" method="post">
        <input type="text" name="to">
        <input type="text" name="content">
        <input type="submit" value="发送">
    </form>
</body>
</html>
后台实现逻辑如下:
@app.route('/send', methods=['GET', 'POST'])
def send():
    """Show the send form on GET; post a text message to WeChat on POST.

    The form supplies ``to`` (recipient user name) and ``content`` (message
    text). The sender and credentials are read from the session.

    Returns:
        The rendered ``send.html`` form on GET, otherwise the body of the
        ``webwxsendmsg`` reply as text.
    """
    if request.method == "GET":
        return render_template('send.html')

    current_user = session['current_user']
    ticket_dict = session.get('ticket_dict')
    ticket_cookie = session.get('ticket_cookie')
    pass_ticket = ticket_dict.get('pass_ticket')
    from_user = current_user["UserName"]
    to = request.form.get('to')
    content = request.form.get('content')
    # Integer millisecond timestamp used as the message id. str() of the raw
    # float gave a fractional value, and on Python 2 an exponent form such as
    # '1.52993365004e+12' that repeats for messages sent close together.
    ctime = str(int(time.time() * 1000))
    msg_url = 'https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxsendmsg?pass_ticket={0}'.format(pass_ticket)
    data_dict = {
        'BaseRequest': {
            'DeviceID': "e956888515941054",
            'Sid': ticket_dict.get('wxsid'),
            # The ticket key is 'wxuin' (the key get_msg reads); the former
            # 'wxuid' lookup always produced None.
            'Uin': ticket_dict.get('wxuin'),
            'Skey': ticket_dict.get('skey'),
        },
        'Msg': {
            'ClientMsgId': ctime,
            'LocalID': ctime,
            'FromUserName': from_user,
            'ToUserName': to,
            'Content': content,
            'Type': 1
        },
        'scene': 0
    }
    ret = requests.post(
        url=msg_url,
        # ensure_ascii=False keeps non-ASCII text unescaped, so the body must
        # be encoded to UTF-8 bytes explicitly; passing it as text fails for
        # characters outside Latin-1.
        data=json.dumps(data_dict, ensure_ascii=False).encode('utf-8'),
        headers={'Content-Type': 'application/json; charset=UTF-8'},
        cookies=ticket_cookie,
    )
    return ret.text
5.实现获取消息代码:
首先定义get_msg.html文件
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <ul>
        {% for item in content.AddMsgList %}
        <li>
            {# Debug dump kept as a template comment so it is no longer rendered into the page source: {{ item }} #}
            {{ item['Content']}}
            From--> {{item['FromUserName']}}
            To--> {{item['ToUserName']}}
        </li>
        {% endfor %}
    </ul>
    <!-- The tag was misspelled "scrip", so jQuery never loaded and "$" was undefined.
         The file name now matches the one login.html loads. -->
    <script src="/static/jquery-1.12.4.min.js"></script>
    <script>
        $(function () {
            fetchMessage();
        });

        // Request /get_msg again as soon as the previous request succeeds.
        function fetchMessage() {
            $.ajax({
                url: '/get_msg',
                type: 'GET',
                success: function (arg) {
                    fetchMessage();
                }
            });
        }
    </script>
</body>
</html>
后台实现逻辑如下:
@app.route('/get_msg')
def get_msg():
    """Check WeChat for new messages and render whatever was fetched.

    Calls ``synccheck`` with the sync key stored in the session. When the
    reply contains ``selector:"2"``, the messages are fetched from
    ``webwxsync``.

    Returns:
        The rendered ``get_msg.html`` page. ``content`` is the decoded
        ``webwxsync`` reply, or a placeholder with an empty ``AddMsgList``
        when nothing was fetched.
    """
    # Check whether new messages have arrived.
    SyncKey_1 = session['synckey']
    sync_url = "https://webpush.wx.qq.com/cgi-bin/mmwebwx-bin/synccheck"
    sync_data_str = "|".join(
        "%s_%s" % (item['Key'], item['Val']) for item in SyncKey_1['List']
    )
    nid = int(time.time())
    ticket_dict = session.get('ticket_dict')
    sync_dict = {
        "r": nid,
        "skey": ticket_dict['skey'],
        "sid": ticket_dict['wxsid'],
        "uin": ticket_dict['wxuin'],
        "deviceid": "e590082815481369",
        "synckey": sync_data_str,
    }
    ticket_cookie = session.get('ticket_cookie')
    response_sync = requests.get(sync_url, params=sync_dict, cookies=ticket_cookie)
    pass_ticket = ticket_dict.get('pass_ticket')

    # Bound up front so the view always has something to render; it used to
    # be assigned only inside the branch below.
    content = {'AddMsgList': []}

    # Fetch the message contents.
    if 'selector:"2"' in response_sync.text:
        fetch_msg_url = "https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxsync?sid={0}&skey={1}&lang=zh_CN&pass_ticket={2}".format(ticket_dict['wxsid'], ticket_dict['skey'], pass_ticket)
        form_data = {
            'BaseRequest': {
                'DeviceID': "e616487029833324",
                'Sid': ticket_dict['wxsid'],
                'Skey': ticket_dict['skey'],
                'Uin': ticket_dict['wxuin'],
            },
            'SyncKey': SyncKey_1,
            'rr': str(time.time())
        }
        # Send the session cookies, as every other WeChat request here does.
        response_fetch_msg = requests.post(fetch_msg_url, json=form_data, cookies=ticket_cookie)
        response_fetch_msg.encoding = 'utf-8'
        content = response_fetch_msg.json()
        # NOTE(review): the sync reply is expected to carry a refreshed
        # SyncKey; storing it should stop the same messages being fetched
        # again on the next poll. Confirm against the live reply.
        new_sync_key = content.get('SyncKey')
        if new_sync_key and new_sync_key.get('List'):
            session['synckey'] = new_sync_key
    return render_template('get_msg.html', content=content)
最后:
# Start the Flask server only when this file is run as a script.
if __name__ == "__main__":
    app.run()
执行上述代码,即可实现微信网页版的自动登录、获取联系人信息,以及消息的发送与接收。