目标:爬取知乎
代码:
# -*- coding: utf-8 -*-
"""Log in to Zhihu with an account/password pair and persist the cookies.

The script fetches the ``_xsrf`` token from the Zhihu home page, downloads
the login captcha so it can be typed in by hand, posts the login form and
finally writes the session cookies to ``cookies.txt``.

Original author: liuzhijun (WeChat: lzjun567; WeChat official account:
Python之禅, id: VTtalk).
"""
__author__ = 'beauty'

import sys
import time
from http import cookiejar

import requests
from BeautifulSoup import BeautifulSoup

# Previously bound to the name ``type``, which shadowed the builtin.
fs_encoding = sys.getfilesystemencoding()

# ``raw_input`` only exists on Python 2 and ``input`` is its Python 3
# equivalent; pick whichever the running interpreter provides.
try:
    read_line = raw_input
except NameError:
    read_line = input

headers = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87'
}

# Reuse the cookies saved by a previous login, if there are any.
session = requests.session()
session.cookies = cookiejar.LWPCookieJar(filename='cookies.txt')
try:
    session.cookies.load(ignore_discard=True)
except IOError:
    # A missing or unreadable cookie file is expected on the first run.
    # ``IOError`` (an alias of ``OSError`` on Python 3) also covers
    # ``cookiejar.LoadError``; anything else is a real error and propagates.
    print(u"还没有cookie信息")


def get_xsrf():
    """Fetch the Zhihu home page and return the value of its ``_xsrf`` input.

    :return: the ``value`` attribute of the ``<input name="_xsrf">`` element
    :raises RuntimeError: if the page contains no such element
    """
    response = session.get("https://www.zhihu.com", headers=headers)
    soup = BeautifulSoup(response.content, smartQuotesTo="html")
    token_input = soup.find('input', attrs={"name": "_xsrf"})
    if token_input is None:
        # Without this check the failure surfaces as an opaque
        # "'NoneType' object has no attribute 'get'".
        raise RuntimeError("no _xsrf input found on https://www.zhihu.com")
    return token_input.get("value")


def get_captcha():
    """Save the captcha image to the current directory and ask for its text.

    The image is written to ``captcha.jpg``; the user reads it manually and
    types the characters at the prompt.

    :return: the captcha text entered by the user
    """
    # The ``r`` query parameter is the current time in milliseconds.
    t = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
    r = session.get(captcha_url, headers=headers)
    with open('captcha.jpg', 'wb') as f:
        f.write(r.content)
    return read_line(u"验证码:")


def login(account, password):
    """Post the login form, print the server's message and save the cookies.

    :param account: account identifier sent as the ``account`` form field
    :param password: password sent as the ``password`` form field
    """
    login_url = 'https://www.zhihu.com/login/email'
    data = {
        'account': account,
        'password': password,
        '_xsrf': get_xsrf(),
        "captcha": get_captcha(),
        'remember_me': 'true',
    }
    response = session.post(login_url, data=data, headers=headers)
    login_code = response.json()
    print(login_code['msg'])
    for cookie in session.cookies:
        print(cookie)
    # NOTE(review): cookies are loaded with ignore_discard=True but saved
    # with the default (False), so session-only cookies are not written.
    # Confirm whether that asymmetry is intended.
    session.cookies.save()


if __name__ == '__main__':
    account = "youraccout"
    password = "yourpassword"
    login(account, password)

# Observed result when the script was run: the server answered
# "登录过于频繁,请稍后重试" (too many login attempts; try again later).
PS:不知道是哪里出了问题,日后再试