Goal: crawl every popular meme on fabiaoqing (发表情). At 45 images per page across 3,961 pages, that is 45 × 3,961 ≈ 178,000, or nearly 200,000 memes in total.
Each page is saved as its own folder on disk (after 3+ hours the crawl had passed page 2,000 and was still going...).
Exciting, right? From now on you can battle memes with anyone and never worry about running out of ammunition, 23333333~
I won't walk through creating the Scrapy project step by step here; if you want those details, see my other post, Python3 scrapy爬取智联招聘存MongoDB.
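For reference, the file layout below assumes the standard scaffolding commands (biaoqing is the project name, biaoqingbao the spider name):

scrapy startproject biaoqing
cd biaoqing
scrapy genspider biaoqingbao fabiaoqing.com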
Now, straight to the core code:
# items.py: structured fields for the scraped data
import scrapy

class BiaoqingItem(scrapy.Item):
    url = scrapy.Field()    # image URL
    title = scrapy.Field()  # image title, used as the file name
    page = scrapy.Field()   # number of the current listing page, used as the folder name
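A scrapy.Item is populated like a dict, and any field not declared above raises a KeyError; a trivial usage sketch (the URL is made up):

item = BiaoqingItem()
item['url'] = 'https://example.com/doge.jpg'
item['tag'] = 'funny'  # KeyError: 'tag' is not a declared field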
# spiders/biaoqingbao.py: the spider itself (start URL, page parsing, pagination logic)
import scrapy
from biaoqing.items import BiaoqingItem

class BiaoqingbaoSpider(scrapy.Spider):
    name = 'biaoqingbao'
    allowed_domains = ['fabiaoqing.com']  # allowed_domains takes bare domains, not paths
    start_urls = ['https://www.fabiaoqing.com/biaoqing/']

    def parse(self, response):
        divs = response.xpath('//*[@id="bqb"]/div[1]/div')  # every meme on the current page
        # relative URL of the next page (None once the last page is reached)
        next_url = response.xpath('//div[contains(@class,"pagination")]/a[last()-1]/@href').extract_first()
        base_url = 'https://fabiaoqing.com'
        for div in divs:
            items = BiaoqingItem()
            items['url'] = div.xpath('a/img/@data-original').extract_first()
            items['title'] = div.xpath('a/@title').extract_first()
            # folder label taken from the next-page link; guard against None on the last page
            items['page'] = next_url.split('/')[-1] if next_url else 'last'
            yield items
        if next_url:  # follow the next page if there is one
            url = base_url + next_url  # join the base and relative URLs
            yield scrapy.Request(url, self.parse, dont_filter=True)
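Before letting this run for hours, it's worth checking both XPaths interactively in scrapy shell (a quick sanity check; the exact values returned depend on the live page):

$ scrapy shell 'https://www.fabiaoqing.com/biaoqing/'
>>> response.xpath('//*[@id="bqb"]/div[1]/div/a/img/@data-original').extract_first()
>>> response.xpath('//div[contains(@class,"pagination")]/a[last()-1]/@href').extract_first()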
# pipelines.py: downloads the images and defines the storage layout
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class BiaoqingPipeline(ImagesPipeline):
    # request each image, passing title and page along in the request meta
    def get_media_requests(self, item, info):
        yield scrapy.Request(url=item['url'], meta={'title': item['title'], 'page': item['page']})

    # results is a list of (success, detail) tuples, one per image request
    def item_completed(self, results, item, info):
        if not results or not results[0][0]:
            raise DropItem('image download failed')
        return item

    # storage rule: IMAGES_STORE/<page>/<title>.jpg
    def file_path(self, request, response=None, info=None):
        title = request.meta['title'] + '.jpg'  # extension hard-coded to .jpg
        page = request.meta['page']
        return u'{0}/{1}'.format(page, title)
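One caveat with file_path: title comes straight from the page, so a title containing / or other characters that are illegal in file names would break the folder layout. A minimal guard (sanitize is a hypothetical helper, not part of the original code):

import re

def sanitize(name):
    # replace path separators and other unsafe characters with underscores
    return re.sub(r'[\\/:*?"<>|]', '_', name)

# then, inside file_path:
# title = sanitize(request.meta['title']) + '.jpg'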
# middlewares.py: one anti-bot countermeasure, rotating the User-Agent header
import random

class UserAgentMiddlewares(object):
    # pool of desktop browser User-Agent strings to rotate through
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    # downloader middleware hook: attach a random User-Agent to every outgoing request
    def process_request(self, request, spider):
        agent = random.choice(self.user_agent_list)
        request.headers['User-Agent'] = agent
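The middleware can be exercised outside a full crawl (a standalone sketch; process_request never touches the spider argument, so None is fine here):

import scrapy

mw = UserAgentMiddlewares()
req = scrapy.Request('https://fabiaoqing.com')
mw.process_request(req, spider=None)
print(req.headers['User-Agent'])  # one of the strings above, chosen at random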
# settings.py: project configuration
BOT_NAME = 'biaoqing'
SPIDER_MODULES = ['biaoqing.spiders']
NEWSPIDER_MODULE = 'biaoqing.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'

# do not obey robots.txt rules
ROBOTSTXT_OBEY = False

IMAGES_STORE = '/home/lewis/test/biaoqing2'  # where downloaded images are stored
DOWNLOAD_DELAY = 0.5  # wait 0.5 s between requests

# register the custom User-Agent middleware; it implements process_request,
# so it is a downloader middleware and belongs in DOWNLOADER_MIDDLEWARES
DOWNLOADER_MIDDLEWARES = {
    'biaoqing.middlewares.UserAgentMiddlewares': 100,
}

# enable the image pipeline
ITEM_PIPELINES = {
    'biaoqing.pipelines.BiaoqingPipeline': 1,
}

# retry on these HTTP status codes
RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 403, 404, 408]
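A back-of-envelope check on DOWNLOAD_DELAY: each listing page means one page request plus 45 image requests, so the full crawl issues roughly 3,961 × 46 ≈ 182,000 requests. If the 0.5 s delay gated every single request, that would bound the crawl at about 25 hours; in practice Scrapy applies the delay per download slot (per domain) and runs requests concurrently, which fits the 2,000+ pages in roughly 3 hours reported above. The worst-case, single-slot bound:

pages, per_page, delay = 3961, 45, 0.5
requests = pages * (1 + per_page)  # one listing fetch + 45 images per page
print(requests)                    # 182206
print(requests * delay / 3600)     # ~25.3 hours if one slot handled everything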
# Run_biaoqing.py: runner script that starts the whole crawl
from scrapy import cmdline

cmdline.execute('scrapy crawl biaoqingbao'.split())
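Equivalently, the crawl can be launched programmatically with CrawlerProcess (a sketch; get_project_settings picks up the same settings.py):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('biaoqingbao')  # the spider name defined above
process.start()               # blocks until the crawl finishes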