python爬虫 电影页面信息 xpath csv写入 图片存储到本地

import re
import requests
from  lxml import etree
import time
import urllib.request
import csv
import os

# Fetch one film's detail page, then record its info and poster
def getMoviesDetail(id,score):
    """Scrape a single maoyan.com film detail page and persist the result.

    Downloads the detail page, extracts the film name, region, release
    date, running time and poster URL via XPath, appends one row to the CSV
    file through ``writerDataTocsv`` and saves the poster through
    ``dowloadImage``.

    Args:
        id: Film link or id; any ``/films/`` substring is removed before the
            detail URL is built. (The name shadows the builtin but is kept
            so existing callers keep working.)
        score: Rating value written unchanged into the CSV row.

    Returns:
        None. When the page lacks one of the expected elements a message is
        printed and nothing is written for this film.

    Raises:
        requests.exceptions.RequestException: on connection failure or when
            the request exceeds the timeout.
    """
    movies_id = re.sub(r'/films/', '', id)
    details_url = 'http://maoyan.com/films/' + movies_id
    print(details_url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    # Proxy used for plain-HTTP requests
    proxy_addr = {
        'http': '88.146.227.253:8080'
    }
    # Without a timeout a dead proxy or stalled server blocks the crawl forever.
    details_content = requests.get(
        details_url, headers=headers, proxies=proxy_addr, timeout=10
    ).text
    html = etree.HTML(details_content)

    # Collect every raw XPath result first so a missing element is reported
    # clearly instead of surfacing as a bare IndexError.
    found = {
        'name': html.xpath('//div[@class="movie-brief-container"]/h3/text()'),
        'region_show_time': html.xpath('//div[@class="movie-brief-container"]/ul/li[3]/text()'),
        'duration': html.xpath('//div[@class="movie-brief-container"]/ul/li[2]/text()'),
        'image_url': html.xpath('//div[@class="avatar-shadow"]/img/@src'),
    }
    missing = [field for field, nodes in found.items() if not nodes]
    if missing:
        print('页面解析失败,缺少字段:', missing, details_url)
        return

    name = found['name'][0]

    # The third list item mixes region (Chinese characters) with the
    # release date; split the two with a single compiled pattern.
    region_show_time = found['region_show_time'][0]
    chinese_run = re.compile(u"[\u4e00-\u9fa5]+")
    region_matches = chinese_run.findall(region_show_time)
    region = region_matches[0] if region_matches else ''
    # Strip the whitespace that surrounds the text node in the HTML.
    show_time = chinese_run.sub("", region_show_time).strip()

    # Running time: first run of digits in the second list item.
    duration_digits = re.findall(r"\d+", found['duration'][0])
    duration = duration_digits[0] if duration_digits else ''

    image_url = found['image_url'][0]

    # Append the film info to the CSV file
    data = [name, score, region, show_time, duration]
    writerDataTocsv(data)
    # Save the poster image locally
    dowloadImage(image_url,name)





# Collect the film links from the list page and process each one
def getMoviesId():
    """Crawl the maoyan.com film list page and scrape every film on it.

    Fetches the list page, extracts the ``href`` of each film entry via
    XPath, writes the CSV header row through ``writerDataTocsv`` and then
    calls ``getMoviesDetail`` for each link, pausing three seconds between
    films. The score passed for every film is the constant ``90``; it is
    not read from the page.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: if the list page itself
            cannot be fetched (including a timeout). Failures on individual
            detail pages are reported and skipped instead.
    """
    url = "http://maoyan.com/films"
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    # Proxy used for plain-HTTP requests
    proxy_addr = {
        'http': '88.146.227.253:8080'
    }
    # Without a timeout a dead proxy or stalled server blocks the crawl forever.
    response = requests.get(url, headers=headers, proxies=proxy_addr, timeout=10)
    content = response.text.encode('utf-8')
    html = etree.HTML(content)
    # Select the link of every film entry in the list
    films_list = html.xpath('//div[@class = "movies-list"]/dl//div[@class="movie-item"]/a/@href')
    # Write the header row first
    data = ['电影名称', '电影评分', '上映地区', '上映时间', '电影时长']
    writerDataTocsv(data)
    # Scrape the detail page of each film in turn
    for film_href in films_list:
        try:
            getMoviesDetail(film_href, 90)
        except requests.RequestException as e:
            # One unreachable detail page should not abort the whole crawl.
            print('获取电影详情失败', film_href, e)
        time.sleep(3)

# Append one row to the CSV output file
def writerDataTocsv(data):
    """Append ``data`` as a single row to ``movie_info.csv``.

    The file is created in the current working directory if it does not
    exist yet. Errors while opening or writing are reported on stdout and
    not re-raised, so a failed write never stops the caller.

    Args:
        data: Iterable of field values forming one CSV row.

    Returns:
        None.
    """
    try:
        # newline='' lets the csv module control line endings itself (the
        # documented way to open csv files; avoids blank rows on Windows).
        # Explicit UTF-8 so non-ASCII titles do not depend on the locale.
        # The with-statement closes the file; no explicit close() is needed.
        with open('movie_info.csv', 'a', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerow(data)
    except (OSError, csv.Error) as e:
        # Catch only I/O and csv failures and include the cause in the message.
        print("写入文件错误", e)

# Download an image into the local image directory
def dowloadImage(image_url,name):
    """Download ``image_url`` and save it as ``moviesImage/<name>.jpg``.

    The ``moviesImage`` directory is created in the current working
    directory when missing. I/O and download errors are reported on stdout
    and not re-raised.

    Args:
        image_url: URL of the image to fetch.
        name: Title used as the file name. Path separators, control
            characters and characters that are not allowed in file names
            are replaced by ``_`` and surrounding whitespace is removed, so
            a scraped title cannot point outside the image directory or
            make the write fail.

    Returns:
        None.
    """
    file_path = 'moviesImage'
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', name).strip()
    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(file_path, exist_ok=True)
        filename = os.path.join(file_path, safe_name + '.jpg')
        urllib.request.urlretrieve(image_url,filename=filename)
    except IOError as e:
        print('文件操作失败',e)

# Entry point: runs the crawl at module level, i.e. whenever this file is
# executed or imported.
getMoviesId()
python爬虫 电影页面信息 xpath csv写入 图片存储到本地

猜你喜欢

python爬虫 电影页面信息 xpath csv写入 图片存储到本地