import requests
import openpyxl
import csv
from bs4 import BeautifulSoup
import random
from urllib.request import quote
# Create an Excel workbook to store the Douban Top-250 movie info.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = 'movie_info'
# Header row (row 1): index / title / link / one-line blurb.
sheet['A1'] = '序号'
sheet['B1'] = '电影名'
sheet['C1'] = '电影连接'
sheet['D1'] = '文字简介'
a = 0            # paging offset for the Douban list URL (0, 25, ..., 225)
b = 0            # running movie counter, doubles as the data-row index
movie_list = []  # titles collected by the scraping loop below
# Walk all 10 pages (25 movies each) of the Top-250 list.
while a <= 225:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    url = 'https://movie.douban.com/top250?start={}&filter='.format(a)
    a = a + 25
    res = requests.get(url, headers=headers)
    bs = BeautifulSoup(res.text, 'html.parser')
    items = bs.find_all(class_='item')
    for item in items:
        movie_name = item.find(class_='title').text
        movie_url = item.find('a')['href']
        # Some Top-250 entries have no one-line blurb (.inq element);
        # guard against find() returning None so the scrape doesn't
        # crash with AttributeError partway through.
        inq_tag = item.find(class_='inq')
        movie_inq = inq_tag.text if inq_tag is not None else ''
        movie_photo = item.find('img')['src']
        b = b + 1
        movie_list.append(movie_name)
        sheet.append([b, movie_name, movie_url, movie_inq])  # one row per movie
# Persist the workbook once, after all pages are scraped.
wb.save('D:\\pythontest\\douban_movie_list.xlsx')
print('ok')
# Re-open the workbook that was just written, to read the data back.
wb = openpyxl.load_workbook('D:\\pythontest\\douban_movie_list.xlsx')
# Select the worksheet by name.
sheet = wb['movie_info']
# List of all worksheet names in the workbook — handy when the
# sheet names are not known in advance.
sheetname = wb.sheetnames
# Randomly pick 3 movies and look up a download link for each on ygdy8.
for i in range(3):
    # Data rows are 2..max_row (row 1 is the header). The original
    # randint(1, 250) could select the header row and could never
    # select the 250th movie (row 251).
    num = random.randint(2, sheet.max_row)
    B2 = sheet['B' + str(num)]   # cell holding the movie title
    B2_value = B2.value
    print(B2_value)
    # Search the title on ygdy8; the site expects GBK-encoded queries.
    movie = B2_value
    gbkmovie = movie.encode('gbk')
    urlsearch = 'http://s.ygdy8.com/plus/s0.php?typeid=1&keyword=' + quote(gbkmovie)
    res = requests.get(urlsearch)
    res.encoding = 'gbk'
    soup_movie = BeautifulSoup(res.text, 'html.parser')
    # Each search hit is rendered as a <table> inside .co_content8.
    urlpart = soup_movie.find(class_="co_content8").find_all('table')
    if urlpart:
        # Follow the first hit to its detail page and pull the
        # download link out of the #Zoom section.
        urlpart = urlpart[0].find('a')['href']
        urlmovie = 'https://www.ygdy8.com/' + urlpart
        res1 = requests.get(urlmovie)
        res1.encoding = 'gbk'
        soup_movie1 = BeautifulSoup(res1.text, 'html.parser')
        urldownload = soup_movie1.find('div', id="Zoom").find('span').find('table').find('a')['href']
        print(urldownload)
    else:
        print('没有' + movie + '的链接')