目前环境
python3
scrapy2.4
1、方案1: 多pipelines进行区分(scrapy版本必须是1.1以上)
settings.py
# Pipeline order values range 0-1000; a lower number means the pipeline
# runs earlier (i.e. has higher priority).
ITEM_PIPELINES = {
    'weather.pipelines.WeatherPipeline': 300,
    'weather.pipelines.WeatherHourPipeline': 302,
}
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
# 用于保存所抓取的数据的容器
# 定义字段内容
# 每日天气
# Container for one day's scraped weather data (daily forecast).
class WeatherItem(scrapy.Item):
    name = scrapy.Field()    # day label, e.g. today
    status = scrapy.Field()  # weather condition
    date = scrapy.Field()    # calendar date
    max = scrapy.Field()     # daily high temperature
    min = scrapy.Field()     # daily low temperature
# 每小时天气
# Container for one hour's scraped weather data (hourly forecast).
class WeatherHourItem(scrapy.Item):
    # unique identifier for the record
    date = scrapy.Field()
WeatherSpider.py
import scrapy
from weather.items import WeatherItem
# 爬虫逻辑
# Crawl logic for the daily-weather page.
class WeatherSpider(scrapy.Spider):
    # Spider name used by `scrapy crawl`; do not remove.
    name = "weather"
    allowed_domains = ['weather.com']
    start_urls = [
        'https://weather.com/zh-CN/weather/today/l/7f14186934f484d567841e8646abc61b81cce4d88470d519beeb5e115c9b425a',
    ]
    # Per-spider pipeline selection: route this spider's items only through
    # WeatherPipeline (overrides the project-wide ITEM_PIPELINES).
    custom_settings = {
        "ITEM_PIPELINES": {
            'weather.pipelines.WeatherPipeline': 300
        }
    }

    def parse(self, response):
        # Emit one (still empty) daily-weather item.
        yield WeatherItem()
WeatherHourSpider.py
import scrapy
from weather.items import WeatherHourItem
# 爬虫逻辑
# Crawl logic for the hour-by-hour weather page.
class WeatherHourSpider(scrapy.Spider):
    # Spider name used by `scrapy crawl`; do not remove.
    name = "weatherHour"
    allowed_domains = ['weather.com']
    start_urls = [
        'https://weather.com/zh-CN/weather/hourbyhour/l/7f14186934f484d567841e8646abc61b81cce4d88470d519beeb5e115c9b425a',
    ]
    # Per-spider pipeline selection: route this spider's items only through
    # WeatherHourPipeline (overrides the project-wide ITEM_PIPELINES).
    custom_settings = {
        "ITEM_PIPELINES": {
            'weather.pipelines.WeatherHourPipeline': 302
        }
    }

    def parse(self, response):
        # Emit one (still empty) hourly-weather item.
        yield WeatherHourItem()
pipelines.py
多个管道
class WeatherPipeline(object):
    """Pipeline dedicated to items from the daily-weather spider."""

    def process_item(self, item, spider):
        # Placeholder processing: just announce the pipeline and pass
        # the item through unchanged.
        print("Weather")
        return item
class WeatherHourPipeline(object):
    """Pipeline dedicated to items from the hourly-weather spider."""

    def process_item(self, item, spider):
        # Placeholder processing: just announce the pipeline and pass
        # the item through unchanged.
        print("WeatherHour")
        return item
2、方案2:单pipelines通过spider.name区分
settings.py
# Pipeline order values range 0-1000; a lower number means the pipeline
# runs earlier (i.e. has higher priority).
ITEM_PIPELINES = {
    'weather.pipelines.WeatherPipeline': 300,
}
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
# 用于保存所抓取的数据的容器
# 定义字段内容
# 每日天气
# Container for one day's scraped weather data (daily forecast).
class WeatherItem(scrapy.Item):
    name = scrapy.Field()    # day label, e.g. today
    status = scrapy.Field()  # weather condition
    date = scrapy.Field()    # calendar date
    max = scrapy.Field()     # daily high temperature
    min = scrapy.Field()     # daily low temperature
# 每小时天气
# Container for one hour's scraped weather data (hourly forecast).
class WeatherHourItem(scrapy.Item):
    # unique identifier for the record
    date = scrapy.Field()
WeatherSpider.py
import scrapy
from weather.items import WeatherItem
# 爬虫逻辑
# Crawl logic for the daily-weather page. No custom_settings here:
# items flow through the single shared pipeline from settings.py.
class WeatherSpider(scrapy.Spider):
    # Spider name used by `scrapy crawl`; do not remove.
    name = "weather"
    allowed_domains = ['weather.com']
    start_urls = [
        'https://weather.com/zh-CN/weather/today/l/7f14186934f484d567841e8646abc61b81cce4d88470d519beeb5e115c9b425a',
    ]

    def parse(self, response):
        # Emit one (still empty) daily-weather item.
        yield WeatherItem()
WeatherHourSpider.py
import scrapy
from weather.items import WeatherHourItem
# 爬虫逻辑
# Crawl logic for the hour-by-hour weather page. No custom_settings here:
# items flow through the single shared pipeline from settings.py.
class WeatherHourSpider(scrapy.Spider):
    # Spider name used by `scrapy crawl`; do not remove.
    name = "weatherHour"
    allowed_domains = ['weather.com']
    start_urls = [
        'https://weather.com/zh-CN/weather/hourbyhour/l/7f14186934f484d567841e8646abc61b81cce4d88470d519beeb5e115c9b425a',
    ]

    def parse(self, response):
        # Emit one (still empty) hourly-weather item.
        yield WeatherHourItem()
pipelines.py
同一个管道
class WeatherPipeline(object):
    """Single shared pipeline that dispatches on ``spider.name``.

    Bug fix: the original compared ``spider.name`` against ``"Weather"`` and
    ``"WeatherHour"``, but the spiders declare ``name = "weather"`` and
    ``name = "weatherHour"`` (see WeatherSpider / WeatherHourSpider), so the
    case-sensitive comparison never matched and neither branch ever ran.
    The comparisons now use the actual spider names.
    """

    def process_item(self, item, spider):
        if spider.name == "weather":
            # item came from the daily-weather spider
            print("Weather")
        elif spider.name == "weatherHour":
            # item came from the hourly-weather spider
            print("WeatherHour")
        # Always pass the item through unchanged to any later pipeline.
        return item