from bs4 import BeautifulSoup
import requests
import re
### Fetch the details of a single movie
def getMovieDetails(link, timeout=10):
    """Download one movie detail page and extract its main fields.

    Args:
        link: URL of the movie detail page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys ``title``, ``year``, ``rating_num``, ``votes``,
        ``types`` and ``summary``. ``types`` is a list of genre strings
        (empty when no genre node is found); every other value is the text
        of the matching node, or ``None`` when the page has no such node.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with an HTTP error status.
    """
    response = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    # Decode as UTF-8 rather than relying on header-based guessing.
    response.encoding = 'utf-8'
    details = BeautifulSoup(response.text, "lxml")

    def node_text(tag, attrs):
        """Return the text of the first matching node, or None if absent."""
        node = details.find(tag, attrs=attrs)
        return node.text if node is not None else None

    result = {}
    # Movie title
    result['title'] = node_text('span', {'property': 'v:itemreviewed'})
    # Release year: the page shows it wrapped in parentheses, strip them.
    year = node_text('span', {'class': 'year'})
    result['year'] = re.sub(r"[()]", "", year) if year is not None else None
    # Rating (kept as the page text, not converted to a number)
    result['rating_num'] = node_text('strong', {'class': 'll rating_num'})
    # Number of votes
    result['votes'] = node_text('span', {'property': 'v:votes'})
    # Genres
    genre_nodes = details.find_all('span', attrs={'property': 'v:genre'})
    result['types'] = [node.text for node in genre_nodes]
    # Summary: split()/join() removes ALL whitespace, not just the leading
    # and trailing runs, so the text collapses into one unbroken string.
    summary = node_text('span', {'property': 'v:summary'})
    result['summary'] = "".join(summary.split()) if summary is not None else None
    return result
### Fetch one page of the Top 250 movie list and extract the contents of each movie's detail page
def getMovies(url, min_rating_num, timeout=10):
    """Scrape one list page and fetch details for sufficiently rated movies.

    Args:
        url: URL of the list page to scrape.
        min_rating_num: Minimum rating (inclusive) a movie must have for its
            detail page to be fetched.
        timeout: Seconds to wait for the list-page response before giving up.

    Returns:
        A list of the dicts returned by ``getMovieDetails``, one per movie on
        the page whose rating is at least ``min_rating_num``.

    Raises:
        requests.RequestException: If the list-page request times out, the
            connection fails, or the server answers with an HTTP error
            status.
    """
    res = requests.get(url, timeout=timeout)
    # Without this check an error page simply contains no '.item' nodes and
    # the function would silently return an empty list.
    res.raise_for_status()
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'lxml')

    resultList = []
    for item in soup.select('.item'):
        rating_node = item.select_one('.rating_num')
        title_node = item.select_one('.info .title')
        link_node = item.select_one('.info a')
        # Skip entries that lack any of the pieces we need rather than
        # aborting the whole page with an IndexError.
        if rating_node is None or title_node is None or link_node is None:
            continue
        movie_href = link_node.get('href')
        if not movie_href:
            continue
        try:
            rating_num = float(rating_node.text)
        except ValueError:
            # Rating text is not numeric; the entry cannot be compared.
            continue
        movie_name = title_node.text

        if rating_num >= min_rating_num:
            resultList.append(getMovieDetails(movie_href))
            # NOTE(review): the source's indentation was lost; this progress
            # print is kept next to the append, inside the rating filter --
            # confirm that matches the intended layout.
            print(movie_name)
    return resultList
# Crawl every page of the list: ten pages, 25 entries per page.
url = 'https://movie.douban.com/top250?start={}'
# Only movies rated at or above this value are kept (inclusive threshold).
min_rating_num = 8
movies_total = []
for i in range(10):
    # The 'start' query parameter is the offset of the page's first entry.
    newUrl = url.format(i * 25)
    print('正在抓取第', i + 1, '页,请稍后...')
    newsary = getMovies(newUrl, min_rating_num)
    movies_total += newsary

# Arrange the scraped records into a table, one row per movie.
import pandas
df = pandas.DataFrame(movies_total)
# Bare expression: displays the table when run in an interactive/notebook
# session, and has no effect when run as a plain script.
df