本文基于scrapy 2.6版本说明
前言
经历了半个月的调参,整理了一些Scrapy中常用的配置和实践方式(都是血和泪的教训TAT)
配置说明
settings.py 常用配置
建议将全局配置放入该位置, 例如数据库连接, 第三方密钥, 邮件配置/webhook等信息, 与 Spider 相关的配置不建议放在该文件中
# Do not honour robots.txt — make sure this is acceptable for your targets.
ROBOTSTXT_OBEY = False
# Global cap on concurrent requests performed by the downloader.
CONCURRENT_REQUESTS = 16
# Concurrency cap per target domain.
CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Concurrency cap per remote IP; when non-zero it takes precedence over the per-domain cap.
CONCURRENT_REQUESTS_PER_IP = 16
# Seconds to wait between requests to the same site (0 = no throttling).
DOWNLOAD_DELAY = 0
# Requests whose URL is longer than this are dropped.
URLLENGTH_LIMIT = 2083
# Retry failed requests (tune via RETRY_TIMES / RETRY_HTTP_CODES).
RETRY_ENABLED = True
# Enable the cookies middleware.
COOKIES_ENABLED = True
# Follow HTTP 3xx redirects.
REDIRECT_ENABLED = True
# Default User-Agent header; None falls back to Scrapy's built-in UA.
USER_AGENT = None
# Logging verbosity; consider 'INFO' in production to cut noise.
LOG_LEVEL = 'DEBUG'
# Format string handed to the stdlib logging module.
LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
# Log file path; None logs to stderr.
LOG_FILE = None
# Interval (seconds) between crawl-stats log lines.
LOGSTATS_INTERVAL = 60.0
# Enabled extensions: {dotted.path.Class: order}.
EXTENSIONS = {
}
# Enabled item pipelines: {dotted.path.Class: priority 0-1000}.
ITEM_PIPELINES = {
}
# Headers added to every request unless overridden per-request.
DEFAULT_REQUEST_HEADERS = {
}
# Enabled spider middlewares: {dotted.path.Class: order}.
SPIDER_MIDDLEWARES = {
}
spiders/
所有 Spider 继承 BaseSpider , 重写构造方法即可从配置中初始化 Spider
建议将与单个 Spider 相关的配置放入 Spider 的 custom_settings 中,例如最大并发数, IP最大并发数, 是否开启 cookie 以及额外配置等
import scrapy
class BaseSpider(scrapy.Spider):
    """Base spider whose constructor receives the crawler settings.

    Subclasses override ``__init__`` (keeping the ``settings`` first
    parameter) to initialise themselves from configuration.
    """

    def __init__(self, settings=None, *args, **kwargs):
        # Forward spider arguments (e.g. ``scrapy crawl -a key=value``) to
        # scrapy.Spider — the original dropped them and would raise a
        # TypeError, since from_crawler passes *args/**kwargs through.
        super().__init__(*args, **kwargs)
        # Keep a reference so subclasses can read settings during __init__.
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Scrapy factory hook: build the spider with ``crawler.settings``."""
        spider = cls(crawler.settings, *args, **kwargs)
        spider._set_crawler(crawler)
        return spider
class DemoSpider(BaseSpider):
    """Example spider showing the recommended per-spider configuration."""

    # Name used by ``scrapy crawl DemoSpider``.
    name = 'DemoSpider'
    # HTTP status codes this spider handles itself (empty = Scrapy defaults).
    handle_httpstatus_list = []
    # Per-spider overrides of the project-wide settings.py values.
    custom_settings = {
        'RETRY_ENABLED': False,
    }

    def __init__(self, settings):
        # Delegate to BaseSpider so settings-based initialisation applies.
        super().__init__(settings)
middlewares.py
所有 SpiderMiddleware 继承 BaseSpiderMiddleware, DownloaderMiddleware 继承 BaseDownloaderMiddleware, 重写构造方法即可从配置中初始化 Middlewares
import signals
from utils import get_single_name
class BaseSpiderMiddleware:
    """Base spider middleware whose constructor receives the crawler settings.

    Every ``process_*`` hook is a transparent pass-through by default;
    subclasses override only what they need.
    """

    def __init__(self, settings=None):
        # Crawler settings, available to subclasses.
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: build from the crawler and wire spider_opened."""
        s = cls(crawler.settings)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # None tells Scrapy to keep processing this response.
        return None

    def process_spider_output(self, response, result, spider):
        # Yield every item/request through unchanged.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # logger.warn is a deprecated alias of warning; use lazy %-args so the
        # message is only formatted when the record is actually emitted.
        spider.logger.warning(
            'SpiderMiddleware %s, Spider %s, process exception: %s',
            get_single_name(self), spider.name, exception)

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info(
            'SpiderMiddleware %s, Spider opened: %s',
            get_single_name(self), spider.name)
class BaseDownloaderMiddleware:
    """Base downloader middleware whose constructor receives the crawler settings.

    Default hooks are pass-throughs; subclasses override what they need.
    """

    def __init__(self, settings=None):
        # Crawler settings, available to subclasses.
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: build from the crawler and wire spider_opened."""
        s = cls(crawler.settings)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # None tells Scrapy to continue handling this request normally.
        return None

    def process_response(self, request, response, spider):
        # Pass the response through unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # logger.warn is a deprecated alias of warning; use lazy %-args so the
        # message is only formatted when the record is actually emitted.
        spider.logger.warning(
            'DownloadMiddleware %s, Spider %s, process exception: %s',
            get_single_name(self), spider.name, exception)

    def spider_opened(self, spider):
        spider.logger.info(
            'DownloadMiddleware: %s, Spider opened: %s',
            get_single_name(self), spider.name)
pipelines.py
所有 Pipeline 继承 BasePipeline, 重写构造方法即可从配置中初始化 Pipeline
通过配置加载 Pipeline 的初始化更易于管理与维护
import signals
from utils import get_single_name
class BasePipeline:
    """Base item pipeline whose constructor receives the crawler settings.

    ``process_item`` is a pass-through by default; subclasses override it.
    """

    def __init__(self, settings=None):
        # Crawler settings, available to subclasses.
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: build from the crawler and wire spider_opened."""
        s = cls(crawler.settings)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_item(self, item, spider):
        # Return the item unchanged so later pipelines receive it.
        return item

    def spider_opened(self, spider):
        # Lazy %-args: the message is only formatted if the record is emitted.
        spider.logger.info(
            'Pipeline: %s, Spider opened: %s',
            get_single_name(self), spider.name)
utils/
存放一些额外处理工具
__init__.py
def get_single_name(tp):
    """Return the short class name of *tp*, for use in log messages.

    The original stub returned None, which rendered as the literal ``None``
    in every middleware/pipeline log line that interpolates it.
    """
    return type(tp).__name__