import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
import time
import lxml
def get_page(url):
    """Fetch *url* and return the response body parsed as a BeautifulSoup tree.

    A browser-like User-Agent header is sent (presumably because douban.com
    blocks the default ``requests`` UA — TODO confirm).

    :param url: absolute URL of the page to fetch
    :return: ``BeautifulSoup`` document tree parsed with the lxml parser
    :raises requests.RequestException: on network failure or timeout
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    }
    # Timeout so a stalled connection cannot hang the whole crawl forever
    # (the original call had no timeout, which blocks indefinitely).
    response = requests.get(url, headers=headers, timeout=10)
    data = response.text
    # Parse into a navigable document tree using the lxml backend.
    soup = BeautifulSoup(data, 'lxml')
    return soup
def get_page_detail(url):
    """Scrape one Top-250 listing page and append one ``(rank, name, score)``
    tuple per movie to the module-level accumulator named ``list``.

    NOTE(review): the accumulator is a global that shadows the builtin
    ``list``; it is created in the ``__main__`` block before this runs and
    the name is kept here so existing callers keep working.

    :param url: URL of a single 25-movie listing page
    :return: ``None`` — results are accumulated via the global side effect
    """
    soup = get_page(url)
    # Each movie sits in its own <div class="item"> element
    # (find_all is the modern spelling of the deprecated findAll).
    all_film = soup.find_all('div', {'class': 'item'})
    for item in all_film:
        rank = item.find('em').text
        # find() returns the first match, i.e. the primary title span.
        name = item.find('span', {'class': 'title'}).text
        score = item.find('span', {'class': 'rating_num'}).text
        list.append((rank, name, score))
def main(start=0, file=None):
    """Scrape the page of 25 movies beginning at offset *start*.

    The site paginates 25 movies per page, selected with the ``start``
    query parameter, so callers step ``start`` by 25.

    :param start: zero-based offset of the first movie on the page
    :param file: unused; kept only for backward interface compatibility
    :return: ``None`` — results accumulate via :func:`get_page_detail`
    """
    # ``base_url`` is a module-level constant defined in the __main__ block.
    url = f'{base_url}?start={start}'
    get_page_detail(url)
if __name__ == "__main__":
print('开始执行')
start = time.perf_counter()
base_url = "https://movie.douban.com/top250"
#建立excel
wb = Workbook()
ws = wb.active
title = ['排名', '名字', '豆瓣评分']
ws.append(title)
list = []
for i in range(0, 250, 25):
main(start=i)
#将电影信息写入excel
for item in list:
ws.append(item)
#记得保存
wb.save(filename='豆瓣前250的电影.xlsx')
end = time.perf_counter()
print('执行结束')
print(f'耗时{
end-start}')
# Douban Top-250 movie scraper, writing results to an Excel file.
# Source: blog.csdn.net/weixin_45486992/article/details/122773494