import requests
import re
import json
from bs4 import BeautifulSoup
def get_one_page(url):
    """Download a page while presenting a desktop-browser User-Agent.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text (``response.text``).
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.100 Safari/537.36'
    )
    headers = {'User-Agent': user_agent}
    # The second positional parameter of requests.get() is ``params``; the
    # headers have to be passed by keyword, otherwise they are sent as
    # query-string arguments and the User-Agent is never applied.
    response = requests.get(url, headers=headers)
    return response.text
# get_one_page: fetches the content of the web page.
def get_information(html_text):
    """Extract ranking entries from the page HTML.

    The pattern captures three groups per entry: the link text that follows
    ``shtml">`` (1-16 characters), the content of the ``rank__price`` element
    (1-8 characters) and the content of the next ``<span>``.

    Args:
        html_text: HTML source of the ranking page.

    Yields:
        One dict per match with the keys ``'index'``, ``'price'`` and
        ``'score'`` holding the three captured strings, in that order.
    """
    # re.S lets "." cross line breaks, so the lazy ".*?" gaps can span the
    # markup between the fields of a single entry.
    pattern = re.compile(
        r'shtml">(.{1,16})</a></div>.*?"rank__price">(.{1,8})</div>.*?<span>(.*?)</span>',
        re.S,
    )
    for item in pattern.findall(html_text):
        yield {
            'index': item[0],
            'price': item[1],
            'score': item[2],
        }
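# get_information: matches the page against a regular expression.
# yield assembles each match into the output data structure.
# findall returns a list of the matches; each element is a tuple.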
def recording(information, path='豆瓣Top250.txt'):
    """Append one record to a text file as a single JSON line.

    Args:
        information: JSON-serialisable object to store.
        path: Output file; defaults to the file name the script has always
            used. The file is opened in append mode, so earlier records are
            kept.
    """
    # ensure_ascii=False keeps non-ASCII text readable in the file instead
    # of escaping it as \uXXXX sequences.
    line = json.dumps(information, ensure_ascii=False) + '\n'
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
# recording: writes the scraped information to the file.
def main():
    """Fetch the ranking page, parse it and store every entry.

    Each parsed entry is handed to ``recording``; a progress message is
    printed after each page and a completion message at the end.
    """
    # The range covers a single iteration, and the same fixed URL is
    # requested on every pass.
    for i in range(0, 1):
        response = get_one_page('https://top.zol.com.cn/compositor/15/manu_167.html')
        html_text = get_information(response)
        for m in html_text:
            recording(m)
        print('正在爬取第' + str(i) + '页')
    print('爬取完毕!')
# Script entry point: the scraper starts as soon as this module is executed
# (there is no __main__ guard, so importing the module runs it as well).
main()