版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/shengshengshiwo/article/details/79586097
好久不见,今天给大家分享如何用自动化工具selenium和scrapy框架来爬取淘宝。
爬取网站时候的坑!
刚开始爬的时候,就想着直接进入淘宝主页,然后用selenium工具自动一步步执行然后爬取到自己想得到的数据,然而!令我没想到的是,利用自动化工具可以对关键词进行抓取之类,但是很奇怪的是数据抓不下来,于是不得不对进入的链接进行修正。
通过观察得到了这样的入口网址:start_urls = ['https://s.taobao.com/search?q={q}'.format(q=QUESTION)],其中 QUESTION 是要搜索的关键词。废话不多说,直接上代码吧。
spider的编写
为了网页的加载速度,我们一般不加载网页的图片。
# Configure Chrome not to load images, so listing pages render faster.
# Preference value 2 = "block" for managed default image content settings.
chrome_opt = webdriver.ChromeOptions()
chrome_opt.add_experimental_option(
    "prefs", {"profile.managed_default_content_settings.images": 2}
)
加载不同的浏览器,接受相应的信号等
def __init__(self):
    """Create the shared Selenium driver and register the close hook.

    Raises:
        ValueError: if DEFAULT_BROWSER names an unsupported browser —
            previously ``self.browser`` was silently left unset, causing a
            confusing AttributeError far from the real misconfiguration.
    """
    super(TaobaoSpider, self).__init__()
    if DEFAULT_BROWSER == 'Chrome':
        self.browser = webdriver.Chrome(chrome_options=self.chrome_opt)
    elif DEFAULT_BROWSER == 'PhantomJS':
        self.browser = webdriver.PhantomJS()
    else:
        raise ValueError('Unsupported DEFAULT_BROWSER: %r' % (DEFAULT_BROWSER,))
    self.browser.maximize_window()
    # Explicit waits on this driver time out after 5 seconds.
    self.wait = WebDriverWait(self.browser, 5)
    # Shut the browser down when Scrapy signals that the spider closed.
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
def spider_closed(self):
    """Shut down the Selenium driver when the spider finishes.

    Uses ``quit()`` instead of ``close()``: ``close()`` only closes the
    current window, which can leave the browser and driver processes
    running (a resource leak); ``quit()`` ends the whole session.
    """
    self.browser.quit()
解析商品列表页信息
# Parse the goods listing page.
def parse(self, response):
    """Yield a detail-page request (carrying the listing data) for every
    item on a search-result page, then follow the next results page.

    Fixes over the original:
    - the shop-name XPath is now relative (``.//``); the absolute ``//``
      form searched the whole document and returned the first shop name
      for every item on the page;
    - ``extract_first()`` may return None when a selector matches
      nothing, which previously crashed ``re.match()`` and the next-page
      URL concatenation on the last page — both are now guarded.
    """
    for good in response.css('div.item.J_MouserOnverReq'):
        # The title text is split across several nodes; join and trim it.
        title = ''.join(
            good.css('div.row.row-2.title a.J_ClickStat::text').extract()
        ).strip()
        price = good.css('div.price.g_price.g_price-highlight strong::text').extract_first()
        free_shipping = 'Yes' if good.css('div.ship.icon-service-free') else 'No'
        # Monthly sales count: keep only the leading digits.
        month_sale = good.css('div.deal-cnt::text').extract_first()
        if month_sale:
            sale_match = re.match(r'\d+', month_sale)
            month_sale = sale_match.group(0) if sale_match else None
        goods_url = good.css('div.row.row-2.title a.J_ClickStat::attr(href)').extract_first()
        # ".//" scopes the query to this item's subtree.
        shop = good.xpath('.//div[@class="shop"]/a/span[2]/text()').extract_first()
        shop_type = '天猫' if good.css('span.icon-service-tianmao') else '淘宝'
        addr = good.css('div.location::text').extract_first()
        data = {
            'title': title,
            'price': price,
            'free_shipping': free_shipping,
            'month_sale': month_sale,
            'goods_url': goods_url,
            'shop': shop,
            'shop_type': shop_type,
            'addr': addr,
        }
        # Without a detail URL there is nothing to follow.
        if goods_url:
            yield scrapy.Request(urljoin('https:', goods_url),
                                 meta={'data': data},
                                 callback=self.parse_grade)
    # Follow pagination only while the "next" link exposes its data
    # attributes (they are absent on the final page).
    next_key = response.css('li.next a::attr(data-key)').extract_first()
    next_value = response.css('li.next a::attr(data-value)').extract_first()
    if next_key and next_value:
        next_url = self.start_urls[0] + '&' + next_key + '=' + next_value
        yield scrapy.Request(next_url, callback=self.parse)
解析商品详情页信息
def parse_grade(self, response):
    """Build the final item from the listing-page data carried in
    ``response.meta`` plus the three shop rating scores scraped from the
    detail page (item-as-described / service / shipping)."""
    data = response.meta['data']
    item = TaobaospiderItem()
    for field in ('title', 'price', 'free_shipping', 'month_sale',
                  'goods_url', 'shop', 'shop_type', 'addr'):
        item[field] = data[field]
    grades = response.css('div.tb-shop-rate a::text').extract()
    # The rating block is only filled in when all three scores are present.
    if len(grades) == 3:
        described, service, shipping = (float(g.strip()) for g in grades)
        item['same_grade'] = described
        item['service_grade'] = service
        item['shipping_grade'] = shipping
    yield item
引用的类库
import scrapy
from selenium import webdriver
from ..settings import QUESTION, DEFAULT_BROWSER
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from ..items import TaobaospiderItem
import re
from urllib.parse import urljoin
from selenium.webdriver.support.ui import WebDriverWait
items.py
import scrapy
class TaobaospiderItem(scrapy.Item):
    """One Taobao product record: listing fields plus detail-page ratings."""
    title = scrapy.Field()
    price = scrapy.Field()
    free_shipping = scrapy.Field()
    month_sale = scrapy.Field()
    goods_url = scrapy.Field()
    shop = scrapy.Field()
    shop_type = scrapy.Field()
    addr = scrapy.Field()
    same_grade = scrapy.Field()
    service_grade = scrapy.Field()
    shipping_grade = scrapy.Field()

    def get_article_info_insert_sql(self):
        """Return ``(sql, params)`` for an upsert into the ``info`` table.

        Fixes over the original:
        - ``ON DUPLICATE KEY UPDATE`` now refreshes every scraped column,
          not just ``title``, so a re-crawl actually updates prices,
          sales and ratings of already-stored goods;
        - fields are read with ``.get()`` because the three grade fields
          are optional (``parse_grade`` skips them when the rating block
          is missing) and direct indexing raised KeyError.
        """
        insert_sql = """
            insert into info(
                title, price, free_shipping, month_sale, goods_url,
                shop, shop_type, addr, same_grade, service_grade, shipping_grade
            )
            values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
                title = VALUES(title),
                price = VALUES(price),
                free_shipping = VALUES(free_shipping),
                month_sale = VALUES(month_sale),
                shop = VALUES(shop),
                shop_type = VALUES(shop_type),
                addr = VALUES(addr),
                same_grade = VALUES(same_grade),
                service_grade = VALUES(service_grade),
                shipping_grade = VALUES(shipping_grade)
        """
        params = tuple(self.get(field) for field in (
            'title', 'price', 'free_shipping', 'month_sale', 'goods_url',
            'shop', 'shop_type', 'addr',
            'same_grade', 'service_grade', 'shipping_grade'))
        return insert_sql, params
middlewares.py
from scrapy.http import HtmlResponse
import logging
class ChromeMiddleware(object):
    """Downloader middleware that fetches every request with the spider's
    shared Selenium browser, so JavaScript-rendered listing pages are
    returned fully rendered instead of as raw HTML."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def process_request(self, request, spider):
        """Load the URL in the spider's browser and short-circuit Scrapy's
        downloader by returning the rendered page as an HtmlResponse."""
        browser = spider.browser
        browser.get(request.url)
        # Scroll to the bottom so lazily-loaded listing content renders.
        browser.execute_script('window.scrollTo(0,document.body.scrollHeight);var leftOfPage = document.body.scrollHeight;return leftOfPage;')
        # Lazy %-style args: the message is only formatted when DEBUG
        # logging is enabled (the old '+' concatenation always paid it).
        self.logger.debug('getting %s', request.url)
        return HtmlResponse(url=request.url, body=browser.page_source,
                            request=request, encoding='utf-8')
pipelines.py
import logging
from twisted.enterprise import adbapi
class MysqlTwistedPipeline(object):
    """Writes items to MySQL asynchronously via Twisted's adbapi pool."""

    def __init__(self, params):
        self.dbpool = adbapi.ConnectionPool('pymysql', **params)
        self.logger = logging.getLogger(__name__)

    @classmethod
    def from_settings(cls, settings):
        # Alternate constructor used by Scrapy: pull the connection
        # parameters from the project settings.
        return cls(settings.get('MYSQL_PARAMS'))

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert_article_info, item)
        query.addErrback(self.handle_error, item, spider)
        # Must return the item so any later pipelines still receive it —
        # the original returned None, silently dropping items from the chain.
        return item

    def do_insert_article_info(self, cursor, item):
        """Run the item's upsert inside a pool transaction.

        Exceptions are allowed to propagate so the Deferred's errback
        actually fires; the original ``except: print`` swallowed every
        insert failure and the errback never ran.
        """
        insert_sql, params = item.get_article_info_insert_sql()
        cursor.execute(insert_sql, params)

    def handle_error(self, failure, item, spider):
        # Log at ERROR, not DEBUG: a failed insert is data loss and must
        # be visible at default log levels.
        self.logger.error('failed to insert item: %s', failure)
settings.py
# Route requests through the Selenium middleware so pages are rendered
# by a real browser before Scrapy parses them.
DOWNLOADER_MIDDLEWARES = {
'taobaospider.middlewares.ChromeMiddleware': 543,
}
# Persist scraped items to MySQL.
ITEM_PIPELINES = {
'taobaospider.pipelines.MysqlTwistedPipeline': 300,
}
# Search keyword interpolated into the spider's start_urls.
QUESTION = 'Python'
# Which browser the spider drives: 'Chrome' or 'PhantomJS'.
DEFAULT_BROWSER = 'Chrome'
# Connection settings passed straight to adbapi.ConnectionPool('pymysql', ...).
# NOTE(review): charset 'utf8' is MySQL's 3-byte subset; product titles
# containing emoji will fail to store — consider 'utf8mb4' (confirm the
# table's collation first).
MYSQL_PARAMS = {
'host': 'localhost',
'port': 3306,
'user': 'root',
'password': 'root',
'db': '1',
'charset': 'utf8'
}
爬取结果
欢迎关注我的个人公众号。