目录
1.scrapy框架概述
Scrapy Engine(引擎): 负责Spider、ItemPipeline、Downloader、Scheduler中间的通讯,信号、数据传递等。
Scheduler(调度器): 它负责接受引擎发送过来的Request请求,并按照一定的方式进行整理排列,入队,当引擎需要时,交还给引擎。(如果是重复请求,就不入队列)
Downloader(下载器):负责下载Scrapy Engine(引擎)发送的所有Requests请求,并将其获取到的Responses交还给Scrapy Engine(引擎),由引擎交给Spider来处理,
Spider(爬虫):它负责处理所有Responses,从中分析提取数据,获取Item字段需要的数据,并将需要跟进的URL提交给引擎,再次进入Scheduler(调度器),
Item Pipeline(管道):它负责处理Spider中获取到的Item,并进行后期处理(详细分析、过滤、存储等).
Downloader Middlewares(下载中间件):你可以当作是一个可以自定义扩展下载功能的组件。可以对经过的request 以及 response根据需求进行处理。
Spider Middlewares(Spider中间件):你可以理解为是一个可以自定义扩展和操作引擎和Spider中间通信的功能组件(比如进入Spider的Responses;和从Spider出去的Requests),可以对response与request进行处理,但不会对items进行处理,因为pipeline数据管道是专门对数据进行处理的地方
2.Scrapy爬取苏宁图书案例
先创建一个爬虫项目
scrapy startproject [项目名称]
示例:scrapy startproject book
再在项目路径下创建一个爬虫
scrapy genspider [爬虫名称] [所要爬取的域名]
scrapy genspider suning suning.com
3.设置配置文件
需要开启以下配置文件
4.如何获取USER_AGENT
将3中的USER_AGENT 替换为你的电脑的USER_AGENT
5.编写items.py文件
这里照着格式写。主要是写你所需要爬取的字段。
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class BookItem(scrapy.Item):
    """One Suning book record: three category labels plus detail-page fields."""
    # top-level category label
    b_label = scrapy.Field()
    # mid-level category label
    m_label = scrapy.Field()
    # small (leaf) category label
    s_label = scrapy.Field()
    # book title
    book_name = scrapy.Field()
    # price
    book_price = scrapy.Field()
    # author
    book_author = scrapy.Field()
    # publisher
    book_publish = scrapy.Field()
    # publication date
    publish_time = scrapy.Field()
6.编写爬虫suning.py程序
该程序主要写从网页上抓取信息。
import scrapy
from book.items import BookItem
import re
import time
class SuningSpider(scrapy.Spider):
    """Crawl Suning's book portal: category menu -> listing pages -> book details.

    Listing data arrives in two forms (plain HTML pages and a JSON endpoint);
    both are scheduled for every category and handled by ``page_parse``.
    """

    name = 'suning'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com/']

    def parse(self, response):
        """Walk the 3-level category menu and schedule listing-page requests.

        :param response: the book portal home page.
        :yields: ``scrapy.Request`` objects for HTML and JSON listing pages,
            each carrying the (label-only) item in ``meta``.
        """
        # The trailing two menu entries are not book categories, so drop them.
        menu_list = response.xpath("//div[@class='menu-list']//div[@class='menu-item']")[:-2]
        sub_list = response.xpath("//div[@class='menu-list']//div[@class='menu-sub']")
        for index, menu in enumerate(menu_list):
            # 1. top-level label
            b_label = menu.xpath(".//dl//dt//h3//a/text()").extract_first()
            sub = sub_list[index].xpath(".//div[@class='submenu-left']//p")
            for index1, p in enumerate(sub):
                # 2. mid-level label
                m_label = p.xpath(".//a/text()").extract_first()
                ssub = response.xpath(
                    f"//div[@class='menu-list']//div[@class='menu-sub'][{index + 1}]"
                    f"//div[@class='submenu-left']//ul[{index1 + 1}]")
                for k in ssub.xpath(".//li"):
                    # BUGFIX: build a FRESH item per leaf category. The
                    # original mutated one shared item inside these loops
                    # while requests referencing it were still pending, so
                    # every scraped row ended up with the last labels seen.
                    item = BookItem()
                    item["b_label"] = b_label
                    item["m_label"] = m_label
                    # 3. leaf label
                    item["s_label"] = k.xpath(".//a/text()").extract_first()
                    # 4. listing url; the second number in it is the category id
                    next_url = k.xpath(".//a/@href").extract_first()
                    if next_url is None:
                        continue  # guard: some <li> entries carry no link
                    numbers = re.findall(r"[0-9]+", next_url)
                    if len(numbers) < 2:
                        continue  # guard: url without a category id
                    url_key = numbers[1]
                    for page in range(0, 101):
                        # HTML listing page
                        page_url = f"https://list.suning.com/1-{url_key}-{page}.html"
                        yield scrapy.Request(
                            page_url,
                            callback=self.page_parse,
                            meta={"item": item})
                        # JSON listing endpoint (page 0 uses different params)
                        base_json_url = "https://list.suning.com/emall/showProductList.do?"
                        if page == 0:
                            json_url = base_json_url + f"ci={url_key}&pg=03&cp=0&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&" + "paging=1&sub=0"
                        else:
                            json_url = base_json_url + f"ci={url_key}&pg=03&cp={page}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&" + "&cc=020"
                        yield scrapy.Request(
                            json_url,
                            callback=self.page_parse,
                            meta={"item": item}
                        )

    def page_parse(self, response):
        """Extract product detail urls from a listing page and follow them."""
        item = response.meta["item"]
        # extract() always returns a list (possibly empty), never None —
        # the original `is not None` check could never be False.
        product_url_list = response.xpath("//div[@class='img-block']//a/@href").extract()
        for detail_url in product_url_list:
            yield scrapy.Request(
                "https:" + detail_url,
                callback=self.product_description,
                meta={"item": item}
            )

    def product_description(self, response):
        """Fill the detail fields from a product page and yield the item."""
        # Copy the category labels into a fresh item so concurrent detail
        # pages never overwrite each other's fields in a shared object.
        item = BookItem(response.meta["item"])
        body = response.body.decode()
        # Name and price live only in the embedded JS, not in the DOM, so
        # they must be pulled from the page source. Capture groups replace
        # the original split(":")[1], which kept the surrounding quotes and
        # truncated any title containing a ":".
        names = re.findall(r'"itemDisplayName":"(.*?)"', body)
        if not names:
            return  # guard: not a recognizable book detail page
        item["book_name"] = names[0]
        prices = re.findall(r'"itemPrice":"(.*?)"', body)
        item["book_price"] = prices[0] if prices else ""
        publish = response.xpath(".//ul[@class='bk-publish clearfix']")
        # extract_first() may be None on malformed pages; default to "".
        author_raw = publish.xpath(".//li[1]/text()").extract_first() or ""
        author_parts = author_raw.split("\n")
        item["book_author"] = (author_parts[1] if len(author_parts) > 1 else author_raw).strip()
        item["book_publish"] = (publish.xpath(".//li[2]/text()").extract_first() or "").strip()
        item["publish_time"] = (publish.xpath(".//li[3]//span[2]/text()").extract_first() or "").strip()
        # NOTE: the original time.sleep(1) was removed — a blocking sleep in
        # a callback stalls the whole Twisted reactor. Use the
        # DOWNLOAD_DELAY setting to throttle instead.
        print("大标签:%s" % item["b_label"])
        print("中标签:%s" % item["m_label"])
        print("小标签:%s" % item["s_label"])
        print("书名:%s" % item["book_name"])
        print("价格:%s" % item["book_price"])
        print("作者:%s" % item["book_author"])
        print("出版社:%s" % item["book_publish"])
        print("出版时间:%s" % item["publish_time"])
        print("")
        yield item
7.编写pipeline.py程序
这个文件主要用于信息的输出
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import xlwt
class BookPipeline:
    """Write scraped book items into an xlwt workbook, one row per item.

    Fixes over the original: the workbook is created ONCE per crawl
    (creating and saving a fresh workbook in every ``process_item`` call
    discarded all previously written rows), items are read by field name
    (integer indexing into a dict-like Item raises ``KeyError``), data rows
    start at 1 so they no longer overwrite the header, the missing author
    column is added, and ``process_item`` returns the item as the pipeline
    contract requires.
    """

    # Field names in column order, and their matching header titles.
    FIELDS = ["b_label", "m_label", "s_label", "book_name",
              "book_price", "book_author", "book_publish", "publish_time"]
    HEADERS = ['大标签', '中标签', '小标签', '图书名称',
               '图书价格', '作者', '出版社', '出版时间']

    def open_spider(self, spider):
        """Create the workbook and write the header row once per crawl."""
        self.workbook = xlwt.Workbook(encoding="utf-8")
        self.worksheet = self.workbook.add_sheet("sheet1")
        for col, title in enumerate(self.HEADERS):
            self.worksheet.write(0, col, title)
        self.row = 1  # row 0 holds the header

    def process_item(self, item, spider):
        """Append one item as a worksheet row and pass the item on."""
        for col, field in enumerate(self.FIELDS):
            self.worksheet.write(self.row, col, item.get(field, ""))
        self.row += 1
        return item  # required so later pipeline stages still see the item

    def close_spider(self, spider):
        """Persist the workbook once, after the crawl finishes."""
        self.workbook.save("book_list")