最近折腾了一阵网络爬虫,用到了 requests、re、BeautifulSoup 这几个包。接下来要暂时搁置一段时间,怕忘了,就先记下来。
按照拇指医生(muzhi.baidu.com)的网站布局,只要拿到一位医生的 ID,就可以把属于这位医生的问答对全部爬下来。所以思路是:先把所有医生的 ID 抓下来保存到一个文件里,之后再按这个文件逐个爬取。需要注意的是,问答列表是用 JavaScript 动态加载的,所以要直接请求它背后返回 JSON 的接口。代码如下。
import requests
import re
import time
from bs4 import BeautifulSoup
# IDs of the doctors to crawl; replaced below by the contents of SaveDoc.txt.
doc_num = []

# One-off bootstrap, disabled by keeping it inside a bare string literal.
# When enabled on the first run it pages through the online-doctor listing and
# writes every doctor uid, one per line, to SaveDoc.txt.
# NOTE: the snippet has no indentation; the bodies of its two ``for`` loops
# must be indented before it can be enabled.
"""
Save_docs = open('SaveDoc.txt','w')
for i in range(222):
url = 'http://muzhi.baidu.com/doctor/list/doctoronline?pn={}&rn=5&cid1=127'.format(i)
request = requests.get(url).json()
for item in request['data']['list']:
Save_docs.write(item['uid']+'\n')
doc_num.append(item['uid'])
print('Get doc:',item['realname'],' Company:',item['company'],' uid:',item['uid'])
Save_docs.close()
"""

# Normal path: reuse the uid list saved by the bootstrap step.
# The context manager closes the file even if reading fails, and blank lines
# are dropped so that no request is later built with an empty doctor ID.
with open('SaveDoc.txt', 'r') as Save_docs:
    doc_num = [uid for uid in (line.rstrip() for line in Save_docs) if uid]
# Compiled once instead of per doctor.  A raw string keeps "\d" from being
# treated as an (invalid) string escape sequence.
_UID_PATTERN = re.compile(r"'id':'(\d*)'")


def _extract_uid(html):
    """Return the uid found in the third inline script of a doctor page.

    Returns ``None`` when the page has fewer than three
    ``<script type="text/javascript">`` tags or the script carries no
    ``'id':'<digits>'`` entry.
    """
    scripts = BeautifulSoup(html, 'html.parser').find_all('script', type="text/javascript")
    if len(scripts) < 3:
        return None
    match = _UID_PATTERN.search(scripts[2].text)
    return match.group(1) if match else None


def _extract_text(soup, css_class):
    """Return the stripped text taken from the first ``div`` with *css_class*.

    Returns ``None`` when no such ``div`` exists or its children are not laid
    out as expected.
    """
    try:
        return soup.find_all('div', css_class)[0].contents[1].contents[2].strip()
    except (IndexError, AttributeError, TypeError):
        # IndexError: missing div/child.  AttributeError/TypeError: the child
        # at that position is a different node type than expected.
        return None


def _fetch_question_and_answer(que_url):
    """Download one question page and return the texts found on it.

    The list holds the question text followed by the answer text, each only
    if it could be extracted; an empty list means neither was found.
    """
    response = requests.get(que_url)
    # Decode the page as GB2312 so the Chinese text is not garbled.
    response.encoding = 'GB2312'
    soup = BeautifulSoup(response.text, 'html.parser')
    candidates = (
        _extract_text(soup, 'ask-txt'),
        _extract_text(soup, 'pgc-rich line q-content'),
    )
    return [text for text in candidates if text is not None]


# Both output files are closed (and therefore flushed) even if the crawl
# aborts with an exception.
with open('docsSeen.txt', 'w') as docs_file, \
        open('quesSeen.txt', 'w', encoding='utf-8') as ques_file:
    for docNum in doc_num:
        seed_doc_url = 'http://muzhi.baidu.com/home/{}'.format(docNum)
        print('Downloading from doc:', docNum)
        # Record the doctor ID in the "seen" file as soon as it is picked up.
        docs_file.write(docNum + '\n')

        # Visit the doctor's home page and pull the uid out of its inline script.
        uid = _extract_uid(requests.get(seed_doc_url).text)
        if uid is None:
            print('No uid found for doc:', docNum, ' skipping')
            continue

        for i in range(76):
            questions_page = 'http://muzhi.baidu.com/doctor/list/answer?pn={0}&rn=10&uid={1}'.format(i*10, uid)
            # Throttle requests so the site does not block our IP.
            time.sleep(3)
            # The answer list comes from a JSON endpoint; the entries live
            # under data -> list.
            questions = requests.get(questions_page).json()['data']['list']
            for item in questions:
                que_url = 'http://muzhi.baidu.com/question/{}'.format(item['qid'])
                print('Downloading from:', que_url, ' uid:', uid, ' page', i+1)
                time.sleep(1)
                QandA = _fetch_question_and_answer(que_url)
                # Known open problem: after a number of requests the site
                # answers with a CAPTCHA page, which yields no text.  Retry
                # every 5 seconds until real content comes back.
                # NOTE: this is unbounded -- a page that never yields a
                # question or an answer keeps the crawler here forever.
                while not QandA:
                    print('Under control! Waiting...Waiting....')
                    time.sleep(5)
                    QandA = _fetch_question_and_answer(que_url)
                print(QandA)
                # One line per question: "<question>|<answer>".
                ques_file.write('|'.join(QandA) + '\n')