# 版权声明:本文为博主原创文章,转载请标明出处。 https://blog.csdn.net/chuan403082010/article/details/84314150
import re
import requests
from lxml import etree
import time
import urllib.request
import csv
import os
# Fetch one movie's detail page and persist what was scraped
def getMoviesDetail(id, score, timeout=30):
    """Scrape one maoyan.com film detail page, log a CSV row and save the poster.

    Args:
        id: Film link or id as taken from the listing page (e.g.
            ``/films/12345``); every ``/films/`` occurrence is removed to get
            the bare id. The name shadows the builtin ``id`` but is kept so
            existing callers keep working.
        score: Value written unchanged into the score column.
        timeout: Seconds to wait for the HTTP response before requests gives
            up. Defaults to 30.

    Raises:
        IndexError: If an expected element (title, region/date, duration or
            poster) is not found in the returned page.
    """
    movies_id = re.sub(r'/films/', '', id)
    details_url = 'http://maoyan.com/films/' + movies_id
    print(details_url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    # Proxy used for plain-http requests
    proxy_addr = {
        'http': '88.146.227.253:8080'
    }
    # Without an explicit timeout requests waits indefinitely, so a stalled
    # proxy or server would freeze the whole crawl.
    response = requests.get(
        details_url, headers=headers, proxies=proxy_addr, timeout=timeout
    )
    html = etree.HTML(response.text)

    brief = '//div[@class="movie-brief-container"]'
    # Movie title
    name = html.xpath(brief + '/h3/text()')[0]

    # Third list item mixes CJK text (taken as the region) with the release
    # date; one pattern serves both the extraction and the removal.
    region_show_time = html.xpath(brief + '/ul/li[3]/text()')[0]
    cjk_run = re.compile(u"[\u4e00-\u9fa5]+")
    region = cjk_run.findall(region_show_time)[0]
    # Drop any surrounding whitespace/newlines the text node may carry so the
    # CSV cell holds only the remaining date text.
    show_time = cjk_run.sub("", region_show_time).strip()

    # Running time: first run of digits in the second list item
    duration_text = html.xpath(brief + '/ul/li[2]/text()')[0]
    duration = re.findall(r"\d+", duration_text)[0]

    # Poster image link
    image_url = html.xpath('//div[@class="avatar-shadow"]/img/@src')[0]

    # Append the movie's row to the CSV file
    data = [name, score, region, show_time, duration]
    writerDataTocsv(data)
    # Download the poster to local disk
    dowloadImage(image_url, name)
# Collect film links from the listing page and scrape each one
def getMoviesId(timeout=30):
    """Fetch the maoyan.com film listing and scrape every film it links to.

    Writes the CSV header row, then calls ``getMoviesDetail`` for each link
    found on the listing page, pausing 3 seconds between films.

    Args:
        timeout: Seconds to wait for the listing page response before
            requests gives up. Defaults to 30.
    """
    url = "http://maoyan.com/films"
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    # Proxy used for plain-http requests
    proxy_addr = {
        'http': '88.146.227.253:8080'
    }
    # Without an explicit timeout requests waits indefinitely, so a stalled
    # proxy or server would hang the script before any film is processed.
    response = requests.get(
        url, headers=headers, proxies=proxy_addr, timeout=timeout
    )
    content = response.text.encode('utf-8')
    html = etree.HTML(content)
    # Extract the href of every film card on the listing page
    films_list = html.xpath('//div[@class = "movies-list"]/dl//div[@class="movie-item"]/a/@href')
    # Write the column titles first.
    # NOTE(review): writerDataTocsv appends to the file, so every run of this
    # function adds another header row to an existing CSV.
    data = ['电影名称', '电影评分', '上映地区', '上映时间', '电影时长']
    writerDataTocsv(data)
    # Scrape the detail page of each film in turn
    for film_href in films_list:
        # NOTE(review): the score is the constant 90 for every film; the real
        # rating is not scraped anywhere in this function.
        getMoviesDetail(film_href, 90)
        # Pause between detail requests
        time.sleep(3)
# Append one row to the CSV output file
def writerDataTocsv(data):
    """Append *data* as a single row to ``movie_info.csv`` in the working directory.

    Args:
        data: Iterable of cell values making up one CSV row.

    Failures while opening or writing the file are reported on stdout and
    swallowed, so one bad row does not abort the caller.
    """
    try:
        # newline='' hands line-ending control to the csv module (otherwise
        # blank rows appear between records on Windows); utf-8 keeps the
        # Chinese text writable regardless of the platform default encoding.
        # The with-block closes the file, so no explicit close() is needed.
        with open('movie_info.csv', 'a', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerow(data)
    except (OSError, csv.Error, UnicodeError) as e:
        # Include the cause so the failure can actually be diagnosed.
        print("写入文件错误", e)
# Download a poster image to local disk
def dowloadImage(image_url, name):
    """Save the image at *image_url* as ``moviesImage/<name>.jpg``.

    The ``moviesImage`` directory is created in the working directory when it
    does not exist yet.

    Args:
        image_url: URL of the image to fetch.
        name: Base name for the saved file (the movie title); path separators
            in it are replaced with ``_``.

    I/O failures (directory creation, download, file write) are reported on
    stdout and swallowed.
    """
    file_path = 'moviesImage'
    # The name comes from scraped page text: neutralise path separators so it
    # cannot point into a missing sub-directory or escape the image folder.
    safe_name = name
    for separator in (os.sep, os.altsep):
        if separator:
            safe_name = safe_name.replace(separator, '_')
    try:
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(file_path, exist_ok=True)
        filename = os.path.join(file_path, safe_name + '.jpg')
        urllib.request.urlretrieve(image_url, filename=filename)
    except IOError as e:
        print('文件操作失败', e)
# Script entry point: this top-level call starts the crawl whenever the module
# is executed (or imported).
getMoviesId()