Scraping basic information for the IMDb Top 250 movies
Main code
items:
import scrapy


class ImdbItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    rank = scrapy.Field()
    movie_name = scrapy.Field()
    movie_type = scrapy.Field()
    director = scrapy.Field()
    writer = scrapy.Field()
    stars = scrapy.Field()
    score = scrapy.Field()
    country = scrapy.Field()
    metascore = scrapy.Field()
    movie_length = scrapy.Field()
    year = scrapy.Field()
    comment_num = scrapy.Field()
    critic_num = scrapy.Field()
    CWG = scrapy.Field()
    # budget = scrapy.Field()
    # budget_type = scrapy.Field()
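As a quick aside on how scrapy.Item behaves: it works like a dict, but only declared fields may be set. A small illustrative check (assuming the project package is named imdb, matching the import in the spider below):

from imdb.items import ImdbItem

item = ImdbItem()
item['rank'] = '1'       # declared fields are set like dict keys
print(item['rank'])      # -> 1
print(dict(item))        # -> {'rank': '1'}
# item['foo'] = 'bar'    # raises KeyError: field 'foo' is not declared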
spiders:
# -*- coding: utf-8 -*-
import copy
import re

import scrapy
from imdb.items import ImdbItem

# run with: scrapy crawl rank -o rank.csv


class RankSpider(scrapy.Spider):
    name = 'rank'
    allowed_domains = ['imdb.com']
    start_urls = ['https://www.imdb.com/chart/top/?ref_=nv_mv_250']

    # request the Top 250 chart page and follow each movie's detail URL
    def parse(self, response):
        item = ImdbItem()
        rank_list = response.xpath('//td[@class="titleColumn"]/text()').re(r'\d+')
        detail_urls = response.xpath('//td[@class="titleColumn"]/a[1]/@href').extract()
        for rank, detail_url in zip(rank_list, detail_urls):
            item['rank'] = rank
            # response.follow is not used here because the href is not
            # relative to the start URL; build the absolute URL by hand.
            # The single item object is reused across iterations, so it must
            # be deep-copied before being handed to the asynchronous callback.
            yield scrapy.Request('https://www.imdb.com' + detail_url,
                                 callback=self.parse_detail,
                                 meta={'key': copy.deepcopy(item)})

    # extract all fields from a movie's detail page and yield the item
    def parse_detail(self, response):
        item = response.meta['key']
        item['movie_name'] = response.xpath(
            '//div[@class="title_wrapper"]/h1/text()').re(r'(.+)\xa0')[0]
        # everything below comes from this page; no deeper pages are followed
        genres = response.xpath(
            '//*[@id="title-overview-widget"]/div[1]/div[2]/div/div[2]/div[2]/div/a/text()').extract()
        item['movie_type'] = ','.join(genres)
        item['director'] = response.xpath(
            '//div[@class="credit_summary_item"][1]/a/text()').extract_first()
        item['writer'] = response.xpath(
            '//div[@class="credit_summary_item"][2]/a/text()').extract_first()
        stars = response.xpath('//div[@class="credit_summary_item"][3]/a/text()').extract()
        # the last <a> in this block is the "See full cast & crew" link, not a star
        item['stars'] = ','.join(s for s in stars if 'full cast' not in s)
        item['score'] = response.xpath(
            '//*[@id="title-overview-widget"]/div[1]/div[2]/div/div[1]/div[1]/div[1]/strong/span/text()'
        ).extract_first()
        item['country'] = response.xpath(
            '//a[@title="See more release dates"]/text()').re(r'[(](.*)[)]')[0]
        # the score class varies (score_favorable / score_mixed / score_unfavorable)
        metascore = re.findall(
            r'<div class="metacriticScore score_\w+ titleReviewBarSubItem">\n<span>(.*)</span>',
            response.text)
        item['metascore'] = metascore[0] if metascore else ''
        item['movie_length'] = response.xpath(
            '//div[@class="title_wrapper"]/div/time/@datetime').re(r'\d+')[0]
        item['year'] = response.xpath(
            '//div[@class="title_wrapper"]/h1/span/a[1]/text()').extract_first()
        # the review counts look like "1,234 user" and "567 critic"
        review_counts = response.xpath('//span[@itemprop="reviewCount"]/text()').extract()
        item['comment_num'] = review_counts[0].split(' ')[0].replace(',', '')
        item['critic_num'] = review_counts[1].split(' ')[0].replace(',', '')
        cwg = re.findall(r'Cumulative Worldwide Gross:</h4> .(.*\d) ', response.text)
        item['CWG'] = cwg[0] if cwg else 'unknown'
        yield item
Notes
Analyze the page in advance
Use the rendered page, the Elements panel of the browser's developer tools, and the raw page source together. Analyze the page thoroughly before writing selectors, working out both the patterns in the target information and the places where it varies. Never settle on a selector after a single glance.
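The Scrapy shell is a convenient place for this kind of analysis: it lets you try selectors against the live page before hard-coding them in the spider. A quick sketch, reusing the same XPath expressions as parse() above:

$ scrapy shell 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'
>>> # verify the selectors interactively before committing to them
>>> response.xpath('//td[@class="titleColumn"]/a[1]/@href').extract_first()
>>> response.xpath('//td[@class="titleColumn"]/text()').re(r'\d+')[:5]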
Regular expression for extracting the string inside parentheses
import re

p1 = re.compile(r'[(](.*?)[)]', re.S)   # non-greedy: match as little as possible
freezer_kind = re.findall(p1, file_name)
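For example (file_name here is a made-up input, purely to show the effect of the non-greedy quantifier):

import re

p1 = re.compile(r'[(](.*?)[)]', re.S)           # non-greedy match
file_name = 'The Godfather (1972) (USA).txt'    # hypothetical input
print(re.findall(p1, file_name))                # -> ['1972', 'USA']
# a greedy pattern swallows everything between the outermost parentheses:
print(re.findall(r'[(](.*)[)]', file_name))     # -> ['1972) (USA']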
error: 'FeedExporter' object has no attribute 'slot'
This happens when the CSV file being written to is already open (for example in Excel), so Scrapy cannot write to it. Close the file and run scrapy again to fix the problem.
copy.deepcopy(item)
scrapy.Request passes data between callbacks through meta. Because the spider reuses a single item object across loop iterations and the requests are processed asynchronously, the item must be passed as copy.deepcopy(item); otherwise every callback would receive a reference to the same object, containing whatever the last iteration wrote into it.
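A minimal sketch of the failure mode (DemoSpider and its URLs are hypothetical, trimmed to the essentials):

import copy
import scrapy

class DemoSpider(scrapy.Spider):
    # hypothetical spider, only to illustrate the shared-item pitfall
    name = 'demo'
    start_urls = ['https://www.imdb.com/chart/top/']

    def parse(self, response):
        item = {}  # one object, reused for every request
        for rank, url in enumerate(['/title/tt0111161/', '/title/tt0068646/'], 1):
            item['rank'] = rank
            # without deepcopy both callbacks would see rank == 2, because they
            # receive references to the same dict, mutated after each yield
            yield scrapy.Request(response.urljoin(url),
                                 callback=self.parse_detail,
                                 meta={'key': copy.deepcopy(item)})

    def parse_detail(self, response):
        self.logger.info('rank=%s', response.meta['key']['rank'])

The generic difference between assignment, shallow copy, and deep copy is shown below: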
import copy

a = [1, 2, 3, 4, ['a', 'b']]   # original object
b = a                          # assignment: b is a reference to the same object
c = copy.copy(a)               # shallow copy: new outer list, shared inner objects
d = copy.deepcopy(a)           # deep copy: fully independent copy

a.append(5)                    # mutate a itself
a[4].append('c')               # mutate the nested list ['a', 'b'] inside a

print('a =', a)
print('b =', b)
print('c =', c)
print('d =', d)

# Output:
# a = [1, 2, 3, 4, ['a', 'b', 'c'], 5]
# b = [1, 2, 3, 4, ['a', 'b', 'c'], 5]
# c = [1, 2, 3, 4, ['a', 'b', 'c']]
# d = [1, 2, 3, 4, ['a', 'b']]