Scraping Qiushibaike Jokes with the Scrapy Framework
When there is nothing else to do, reading a few jokes online is a pretty nice way to pass the time.
1. Result Screenshots
When a project is successfully created with the Scrapy framework, it looks like the figure below:
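For reference, the project structure in the screenshot is what Scrapy's scaffolding commands generate; assuming the same project and spider names used below, it would have been created with:

scrapy startproject qsbk
cd qsbk
scrapy genspider qsbk_sp qiushibaike.com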
We now fill in each of the .py files in turn:
2. qsbk_sp Spider Code
import scrapy
# from scrapy.http.response.html import HtmlResponse
# from scrapy.selector.unified import SelectorList
from qsbk.items import QsbkItem  # adjust this import to your own project layout


class QsbkSpSpider(scrapy.Spider):
    name = 'qsbk_sp'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']
    base_url = "https://www.qiushibaike.com"

    def parse(self, response):
        # response.xpath() returns a SelectorList
        duanzidivs = response.xpath("//div[@class='col1 old-style-col1']/div")
        for duanzidiv in duanzidivs:
            # each duanzidiv is a single Selector
            author = duanzidiv.xpath(".//h2/text()").get().strip()
            content = duanzidiv.xpath(".//div[@class='content']//text()").getall()
            content = "".join(content).strip()
            item = QsbkItem(author=author, content=content)
            # (you could also collect the items into a list and return it,
            # but yielding them one by one is more memory-friendly)
            yield item
        # Crawl the following pages!!
        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        if not next_url:  # stop once we are on the last page
            return
        else:
            # hand the next page's URL back to parse(), repeating the steps above
            yield scrapy.Request(self.base_url + next_url, callback=self.parse)
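If the XPath expressions stop matching (qiushibaike.com has changed its markup before), you can test them interactively in the Scrapy shell before touching the spider. A quick check, assuming the text-channel URL above is still reachable:

scrapy shell https://www.qiushibaike.com/text/page/1/
>>> response.xpath("//div[@class='col1 old-style-col1']/div")
>>> response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()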
3. items.py Code
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class QsbkItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
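The pipelines below rely on the fact that a scrapy.Item behaves like a dict. A minimal sketch (the sample values here are made up):

from qsbk.items import QsbkItem

item = QsbkItem(author='anonymous', content='a joke...')
print(item['author'])  # field access works like a dict
print(dict(item))      # dict(item) is what json.dumps(dict(item)) in pipeline 1 relies on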
4. pipelines.py Code
(I implemented the data storage in three different ways; the first two are kept below as commented-out alternatives.)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# 1. The most basic way to save the data
# import json
#
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open("duanzi.json", 'w', encoding='utf-8')
#
#     def open_spider(self, spider):
#         print("Spider started >>>")
#
#     def process_item(self, item, spider):
#         item_json = json.dumps(dict(item), ensure_ascii=False)
#         self.fp.write(item_json + '\n')
#         return item
#
#     def close_spider(self, spider):
#         self.fp.close()
#         print("Spider finished >>>")

# 2. This version keeps all items in one in-memory list before writing, which
#    costs more memory (fine when there is little data)
# from scrapy.exporters import JsonItemExporter  # JsonItemExporter writes bytes
#
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open("duanzi1.json", 'wb')  # 'wb': open in binary mode
#         self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
#         self.exporter.start_exporting()  # start marker
#
#     def open_spider(self, spider):
#         print("Spider started >>>")
#
#     def process_item(self, item, spider):
#         self.exporter.export_item(item)
#         return item
#
#     def close_spider(self, spider):
#         self.exporter.finish_exporting()  # end marker
#         self.fp.close()
#         print("Spider finished >>>")

# 3. The most concise way to save the data (use it when there is a lot of data);
#    no start/finish markers are needed
from scrapy.exporters import JsonLinesItemExporter  # stores one dict per line, so it uses little memory


class QsbkPipeline(object):
    def __init__(self):
        self.fp = open("duanzi1.json", 'wb')  # 'wb': open in binary mode
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print("Spider started >>>")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("Spider finished >>>")
5. settings.py Code
# Scrapy settings for qsbk project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'qsbk'
SPIDER_MODULES = ['qsbk.spiders']
NEWSPIDER_MODULE = 'qsbk.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'qsbk (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'qsbk.middlewares.QsbkSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'qsbk.middlewares.QsbkDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'qsbk.pipelines.QsbkPipeline': 300,
}
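The number 300 is the pipeline's priority: pipelines with lower numbers run first, so several can be chained. For example, if you later added a deduplication pipeline (DuplicatesPipeline here is hypothetical), it could run before the exporter:

ITEM_PIPELINES = {
    'qsbk.pipelines.DuplicatesPipeline': 200,  # hypothetical: would run first
    'qsbk.pipelines.QsbkPipeline': 300,        # then the JSON-lines exporter
}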
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
6. start.py Code
(The purpose of this file: it replaces launching the spider from the cmd command line, which saves time.)
from scrapy import cmdline

# equivalent to running "scrapy crawl qsbk_sp" in the project directory
cmdline.execute(["scrapy", "crawl", "qsbk_sp"])
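An alternative to cmdline.execute is Scrapy's CrawlerProcess API, which runs the spider in-process and returns control to the script once the crawl finishes; a sketch, assuming it is run from the project directory so that get_project_settings() can find settings.py:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('qsbk_sp')  # a spider can be referenced by its name
process.start()           # blocks until the crawl is finished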
Finally, just run start.py!
Result of the run:
A file is generated at the end, containing the results we want: