因为需要用到scrapy图片爬取的中间件（ImagesPipeline），故应先安装PIL（Pillow）:
pip install pillow
settings.py:
# settings.py — enable Scrapy's built-in image pipeline.
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}

# Name of the item field that holds the list of image URLs.
IMAGES_URLS_FIELD = 'url'

# Directory under which downloaded images are stored.
IMAGES_STORE = r'.'
items.py:
import scrapy


class SinaTripItem(scrapy.Item):
    """Item carrying the list of image URLs scraped from a page."""

    # List of image URLs; name must match IMAGES_URLS_FIELD in settings.py.
    url = scrapy.Field()
主爬虫文件:
import scrapy
from scrapy.spiders import Spider
from scrapy.selector import Selector

from sina_trip.items import SinaTripItem


class sinaTripSpider(Spider):
    """Spider that collects all <img src> URLs from the Sina travel page."""

    name = "sinaTripSpider"                     # name of Spider
    start_urls = ["http://travel.sina.com.cn/"]  # start url

    def parse(self, response):
        """Extract every image src and yield one item with the URL list.

        Fix: the original test `if 'http:' not in site` prepended 'http:'
        to https:// URLs too (yielding 'http:https://...') and left
        relative paths unusable. Handle protocol-relative URLs explicitly
        and resolve everything else against the response URL.
        """
        item = SinaTripItem()
        sel = Selector(response)
        sites = sel.xpath("//img/@src").extract()  # raw src attributes
        urls = []
        for site in sites:
            if site.startswith('//'):
                # Protocol-relative URL: only the scheme is missing.
                site = 'http:' + site
            elif not site.startswith(('http://', 'https://')):
                # Relative path: resolve against the page's own URL.
                site = response.urljoin(site)
            urls.append(site)
        # Must be a list: a bare string would be iterated char-by-char by
        # the images pipeline ("Missing scheme in request url: h").
        item['url'] = urls
        yield item
过程中所遇问题:
ValueError: Missing scheme in request url: h
解决办法：报错中的 "h" 是 URL 字符串的第一个字符——说明该字段存的是单个字符串，管道把它当作列表逐字符迭代了。所以存放图片链接的字段必须是一个 list，即使只有一个链接也要写成 ['http://...']。
运用Pipelines做后期处理(数据清洗、验证、过滤等):
# -*- coding: utf-8 -*-
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request


class DoubanmoviePipeline(object):
    """Pass-through pipeline: returns every item unchanged."""

    def process_item(self, item, spider):
        return item


class MyImagesPipeline(ImagesPipeline):
    """Image pipeline that renames each downloaded file via request.meta."""

    def get_media_requests(self, item, info):
        # Carry the item and each URL's position to file_path() via meta.
        # Fix: the original used item['url'].index(url), which returns the
        # FIRST occurrence — duplicate URLs all collapsed onto one index
        # (and hence one filename). enumerate() gives the true position.
        for index, url in enumerate(item['url']):
            yield Request(url, meta={'item': item, 'index': index})

    def file_path(self, request, response=None, info=None):
        item = request.meta['item']
        index = request.meta['index']
        # NOTE(review): this reads an 'img_name' field, but the items.py
        # shown earlier defines only 'url' — confirm the real item class
        # declares img_name, otherwise this raises KeyError.
        image_name = item['img_name'][index]
        return 'full/%s.jpg' % image_name