最近和同事、朋友斗图斗得厉害,心想哪里来的这么多表情包,于是乎想着去表情包网站爬一波图片下来,便有了今天这篇文章。话不多说,上图上代码!
将这3540页表情包全部抓取下来,存到我的本地磁盘。
spider.py
# -*- coding: utf-8 -*-
import scrapy
from biaoqing.items import BiaoqingItem
class BiaoqingbaoSpider(scrapy.Spider):
    """Crawl the "hot" emoji-pack listing on fabiaoqing.com, yielding one
    item (image URL + title) per emoji image, following pagination."""
    name = 'biaoqingbao'
    # allowed_domains must contain bare domains only; a URL path here makes
    # the offsite middleware unable to match any request's host.
    allowed_domains = ['fabiaoqing.com']
    start_urls = ['https://www.fabiaoqing.com/biaoqing/']  # hot emoji-pack category

    def parse(self, response):
        """Yield a BiaoqingItem per image card, then follow the next page.

        The "next page" anchor is the second-to-last link in the
        pagination bar (the last one is "末页"/last-page).
        """
        divs = response.xpath('//*[@id="bqb"]/div[1]/div')
        next_url = response.xpath(
            '//div[contains(@class,"pagination")]/a[last()-1]/@href'
        ).extract_first()
        for div in divs:
            items = BiaoqingItem()
            # Images are lazy-loaded: the real URL is in data-original, not src.
            items['url'] = div.xpath('a/img/@data-original').extract_first()
            items['title'] = div.xpath('a/@title').extract_first()
            yield items
        if next_url:  # follow pagination while a next page exists
            # urljoin resolves the href against the current page, keeping the
            # correct host (www.fabiaoqing.com) — naive concatenation against
            # 'https://fabiaoqing.com' forces a redirect on every page.
            yield scrapy.Request(response.urljoin(next_url), self.parse,
                                 dont_filter=True)
pipelines.py
import re

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class BiaoqingbaoPipeline(ImagesPipeline):
    """ImagesPipeline subclass that downloads each item's image and stores
    it under a filename derived from the item's title."""

    def get_media_requests(self, item, info):
        # Carry the title in meta so file_path can name the file after it.
        yield scrapy.Request(url=item['url'], meta={'title': item['title']})

    def item_completed(self, results, item, info):
        # results is a list of (success, info_or_failure) tuples — one per
        # request from get_media_requests. Drop items whose single download
        # failed instead of silently passing them down the pipeline.
        if not results or not results[0][0]:
            raise DropItem('下载失败')
        return item

    def file_path(self, request, response=None, info=None):
        # Build "<title>.<ext>": extension taken from the image URL.
        # Title may be None, and scraped titles can contain characters that
        # are illegal in Windows filenames — replace them with '_'.
        ext = request.url.split('.')[-1]
        raw_title = request.meta['title'] or 'untitled'
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', raw_title)
        return '%s.%s' % (safe_title, ext)
settings.py
# Present as a desktop Chrome browser so the site serves normal pages.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' \
             ' Chrome/68.0.3423.2 Safari/537.36'

# Deliberately ignore the site's robots.txt for this scrape.
ROBOTSTXT_OBEY = False

# Local directory where ImagesPipeline writes downloaded files.
# Raw string: a backslash in a Windows path must not be parsed as an escape
# sequence (plain 'G:\表情包' only works because \表 happens to be invalid).
IMAGES_STORE = r'G:\表情包'

ITEM_PIPELINES = {
    # 'biaoqing.pipelines.BiaoqingPipeline': 300,
    'biaoqing.pipelines.BiaoqingbaoPipeline': 1,
}
scrapy crawl biaoqingbao 跑起来,这样就大功告成了。网速稍微快一点;我这里用100兆的电信宽带
(约11MB/s)大概跑了3个小时。