items.py
import scrapy
class MyItem(scrapy.Item):
    """Scraped item that carries image URLs and the paths of downloaded images."""

    # ... other item fields ...

    # URLs of the images to download for this item.
    img_urls = scrapy.Field()
    # Storage paths of the images that were downloaded successfully.
    img_paths = scrapy.Field()
pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
class ZhihuImagesPipeline(ImagesPipeline):
    """Images pipeline that downloads every URL listed in ``item['img_urls']``.

    The paths of the successfully downloaded images are written to
    ``item['img_paths']``; an item for which no image was downloaded is
    dropped.
    """

    def get_media_requests(self, item, info):
        """Yield one download request per image URL of ``item``.

        A missing or empty ``img_urls`` field yields no requests instead of
        raising ``KeyError``; such an item then reaches ``item_completed``
        with empty results and is dropped there.
        """
        for img_url in item.get('img_urls') or ():
            yield scrapy.Request(img_url)

    def item_completed(self, results, item, info):
        """Record the downloaded image paths on ``item`` and return it.

        Args:
            results: Iterable of ``(ok, value)`` pairs; when ``ok`` is true,
                ``value`` is a mapping with a ``'path'`` entry.
            item: The item whose images were requested.
            info: Unused here.

        Returns:
            ``item`` with ``img_paths`` set to the list of downloaded paths.

        Raises:
            DropItem: If no image was downloaded successfully.
        """
        # Keep only the successful downloads; failed entries carry no path.
        img_paths = [x['path'] for ok, x in results if ok]
        if not img_paths:
            raise DropItem("Item contains no images")
        item['img_paths'] = img_paths
        return item
注释
results 是 item_completed 的参数,是一个由二元组 (success, file_info_or_error) 组成的列表,典型值如下:
[(True,
{'checksum': '2b00042f7481c7b056c4b410d28f33cf',
'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',
'url': 'http://www.example.com/files/product1.pdf'}),
(False,
Failure(...))]
settings.py
# Enabled item pipelines, keyed by import path. Lower numbers run earlier
# (higher priority). The path must name the pipeline class defined in
# pipelines.py, which is ZhihuImagesPipeline.
ITEM_PIPELINES = {'myProject.pipelines.ZhihuImagesPipeline': 1}
# Directory where the downloaded images are stored.
IMAGES_STORE = 'D:\\path\\...'