增量式爬虫:顾名思义就是以前爬取过的不再爬取,未爬取过的才进行爬取。
需求:爬取https://www.4567kan.com/中的动作电影的标题和简介
分析:指定url, 创建工程,cd进入工程,创建爬虫文件,链接提取器提取页码链接,规则解析器进行规则制定,数据解析,持久化存储,其实都是一样的,重点在于怎么只爬取更新的电影。
核心:检测电影详情页的url之前是否爬取过
将爬取过的电影详情页url存储
存储到redis的set数据结构中(可以自动去重)
(redis数据库表的清空命令:登录客户端后:flushall)
查看存储的urls:smembers urls
查看存储的movieDdata:lrange movieDdata 0 -1
# Main spider file (spiders/movie.py)
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from moviepro.items import MovieproItem
class MovieSpider(CrawlSpider):
    """Incremental crawler for https://www.4567kan.com action movies.

    Only detail pages whose URL has never been seen before are scraped;
    seen URLs are recorded in the Redis set 'urls' (sets deduplicate
    automatically, so SADD doubles as the "have we crawled this?" check).
    """
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567kan.com/index.php/vod/show/id/5/page/1.html']
    rules = (
        # Pagination links. The dot is escaped: the original r'\d+.html'
        # let '.' match any character (e.g. '5xhtml'), which was not intended.
        Rule(LinkExtractor(allow=r'\d+\.html'), callback='parse_item', follow=True),
    )
    # Redis connection object; the pipeline reuses it via spider.conn
    conn = Redis(host='127.0.0.1', port=6379)

    def parse_item(self, response):
        """Extract movie detail-page URLs from a listing page and schedule
        only the ones not yet recorded in Redis."""
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            href = li.xpath('./div/a/@href').extract_first()
            if href is None:
                # Malformed <li> without a link — skip instead of raising
                # TypeError on string concatenation below.
                continue
            detail_url = 'https://www.4567kan.com' + href
            # SADD returns 1 when the member is new, 0 when already present.
            ex = self.conn.sadd('urls', detail_url)
            if ex == 1:
                print('该url未被爬取过可进行数据的爬取')
                yield scrapy.Request(url=detail_url, callback=self.detail_parse)
            else:
                print('该url被爬取过 ,无数据可以爬取')

    def detail_parse(self, response):
        """Parse one movie detail page into a MovieproItem (title + synopsis)."""
        movie_name = response.xpath('/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        # extract() + join tolerates text split across several nodes AND an
        # empty match; the original extract_first() could return None, and
        # ''.join(None) raises TypeError.
        jianjie = ''.join(response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract())
        item = MovieproItem()
        item['movie_name'] = movie_name
        item['jianjie'] = jianjie
        yield item
# items.py
import scrapy
class MovieproItem(scrapy.Item):
    """Container for one scraped movie record."""
    movie_name = scrapy.Field()  # movie title from the detail page <h1>
    jianjie = scrapy.Field()     # synopsis / brief introduction text
# pipelines.py
class MovieproPipeline(object):
    """Persist each scraped movie item into the Redis list 'movieDdata'."""
    conn = None

    def open_spider(self, spider):
        # The Redis connection was already created on the spider class;
        # reuse it instead of opening a second connection.
        self.conn = spider.conn

    def process_item(self, item, spider):
        """Serialize the item and LPUSH it onto the 'movieDdata' list.

        redis-py >= 3.0 raises DataError when given a raw dict, so the
        payload is JSON-encoded first; ensure_ascii=False keeps the
        Chinese text human-readable inside Redis.
        """
        import json  # local import: keeps this pasted snippet self-contained
        dic = {
            'movie_name': item['movie_name'],
            'jianjie': item['jianjie']
        }
        print(dic)
        self.conn.lpush('movieDdata', json.dumps(dic, ensure_ascii=False))
        return item