版权声明:凡由本人原创,如有转载请注明出处https://me.csdn.net/qq_41424519,谢谢合作 https://blog.csdn.net/qq_41424519/article/details/87006094
1.启动splash:
使用 Docker 命令启动 Splash 服务:
sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 scrapinghub/splash
2.创建新的项目:icecreamJD,创建新的爬虫icecream
scrapy startproject icecreamJD
scrapy genspider icecream jd.com
3.打开setting:
# --- scrapy-splash integration settings ---

# Address of the running Splash rendering service.
SPLASH_URL = 'http://localhost:8050'

# Enable the two Splash downloader middlewares, and place
# HttpCompressionMiddleware after them so decompression happens
# on the Splash-rendered response.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Duplicate-request filter that is aware of Splash request arguments.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

# Spider middleware needed to support cache_args (optional).
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# HTTP cache storage that is aware of Splash requests.
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
4.打开spider:
# -*- coding: utf-8 -*-
import scrapy
# Use SplashRequest so pages are rendered through the Splash service.
from scrapy_splash import SplashRequest

# Lua script executed inside Splash: load the page, wait for it to
# settle, scroll the footer into view so JD's lazily-loaded items
# render, wait again, then return the final HTML.
lua = '''
function main(splash)
    splash:go(splash.args.url)
    splash:wait(3)
    splash:runjs("document.getElementById('footer-2017').scrollIntoView(true)")
    splash:wait(3)
    return splash:html()
end
'''


class IcecreamSpider(scrapy.Spider):
    """Crawl JD search results for "冰淇淋" (ice cream), rendered via Splash.

    ``parse`` reads the total page count from the first search page and
    schedules one Splash-rendered request per result page; ``parse_item``
    extracts the product name and price from each page.
    """
    name = 'icecream'
    allowed_domains = ['search.jd.com']
    start_urls = ['https://search.jd.com/Search?keyword=%E5%86%B0%E6%B7%87%E6%B7%8B&enc=utf-8']
    base_url = 'https://search.jd.com/Search?keyword=%E5%86%B0%E6%B7%87%E6%B7%8B&enc=utf-8'

    def parse(self, response):
        """Read the total page count, then yield one SplashRequest per page.

        Raises TypeError if the page-count selector matches nothing
        (extract_first() returns None) — i.e. the page layout changed.
        """
        page_num = int(response.css('span.fp-text i::text').extract_first())
        for i in range(page_num):
            # BUG FIX: base_url already contains a query string
            # (?keyword=...&enc=utf-8), so extra parameters must be joined
            # with '&', not a second '?'.  JD's search pagination uses odd
            # page numbers (1, 3, 5, ...), hence 2*i + 1.
            page_url = '%s&page=%s' % (self.base_url, 2 * i + 1)
            yield SplashRequest(page_url,
                                endpoint='execute',
                                args={'lua_source': lua},
                                callback=self.parse_item)

    def parse_item(self, response):
        """Extract name and price from each product cell on a result page."""
        for sel in response.css('div.gl-i-wrap'):
            yield {
                'name': sel.css('div.p-name em').extract_first(),
                'price': sel.css('div.p-price i::text').extract_first(),
            }