Create the spider file tb.py
# -*- coding: utf-8 -*-
import json

import scrapy

from taobao.items import TaobaoItem


class TbSpider(scrapy.Spider):
    name = 'tb'
    allowed_domains = ['taobao.com']
    start_urls = ['https://s.m.taobao.com/search?event_submit_do_new_search_auction=1&_input_charset=utf-8&topSearch=1&atype=b&searchfrom=1&action=home%3Aredirect_app_action&from=1&sst=1&n=20&buying=buyitnow&m=api4h5&abtest=2&wlsort=2']

    def parse(self, response):
        # Search keywords: snacks, phones, appliances, menswear, shoes, womenswear,
        # games, anime, furniture, toys, bags, beauty, personal care, watches,
        # glasses, sports, instruments, food, fresh produce
        q_list = ['零食', '手机', '家电', '男装', '鞋子', '女装', '游戏', '动漫', '家具', '玩具',
                  '箱包', '美妆', '洗护', '手表', '眼镜', '运动', '乐器', '美食', '生鲜']
        for q in q_list:
            url = response.url + '&q={}'.format(q)
            yield scrapy.Request(url, callback=self.parse_pages)

    def parse_pages(self, response):
        # Request the first 100 result pages for each keyword
        for i in range(1, 101):
            url = response.url + '&page=%s' % i
            yield scrapy.Request(url, callback=self.parse_sp)

    def parse_sp(self, response):
        res_json = json.loads(response.text)
        res_list = res_json['listItem']
        for res in res_list:
            item = TaobaoItem()
            item['title'] = res['title']            # product title
            item['yuanjia'] = res['originalPrice']  # original price
            item['xianjia'] = res['price']          # current price
            item['dianming'] = res['nick']          # shop name
            item['weizhi'] = res['location']        # shop location
            item['tupianurl'] = res['pic_path']     # product image URL
            item['sold'] = res['sold']              # number of buyers
            yield item
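parse_sp trusts the search API to return JSON whose listItem array carries exactly the keys read above. For reference, each entry is assumed to look roughly like the sketch below; only the key names come from the spider code, the values are placeholders:

# Hypothetical shape of one entry in res_json['listItem']; the key names are
# the ones parse_sp reads, the values here are made-up placeholders.
sample_entry = {
    'title': '...',          # product title
    'originalPrice': '...',  # original price
    'price': '...',          # current price
    'nick': '...',           # shop name
    'location': '...',       # shop location
    'pic_path': '...',       # product image URL
    'sold': '...',           # number of buyers
}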
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class TaobaoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    yuanjia = scrapy.Field()
    xianjia = scrapy.Field()
    dianming = scrapy.Field()
    weizhi = scrapy.Field()
    tupianurl = scrapy.Field()
    sold = scrapy.Field()

    def get_insert_sql(self):
        # Build the INSERT statement and its parameter tuple for this item
        sql = 'INSERT INTO taobao (title, yuanjia, xianjia, dianming, weizhi, tupianurl, sold) ' \
              'VALUES (%s, %s, %s, %s, %s, %s, %s)'
        data = (self['title'], self['yuanjia'], self['xianjia'], self['dianming'],
                self['weizhi'], self['tupianurl'], self['sold'])
        return (sql, data)
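The INSERT in get_insert_sql assumes a taobao table already exists in the py10 database that MysqlHelper (next section) connects to. A one-off creation script might look like this; the column types are an assumption, not something the original states, so size them to taste:

# One-off script to create the target table; the column names match the
# INSERT above, but the types are assumed for illustration.
import pymysql

db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                     password='123456', database='py10', charset='utf8')
cursor = db.cursor()
cursor.execute(
    'CREATE TABLE IF NOT EXISTS taobao ('
    'id INT AUTO_INCREMENT PRIMARY KEY, '
    'title VARCHAR(255), '
    'yuanjia VARCHAR(64), '
    'xianjia VARCHAR(64), '
    'dianming VARCHAR(255), '
    'weizhi VARCHAR(255), '
    'tupianurl VARCHAR(512), '
    'sold VARCHAR(64))'
)
db.close()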
Database helper mysqlhelper.py
import pymysql


# Helper class that wraps the MySQL connection
class MysqlHelper(object):
    # Runs automatically when the class is instantiated
    def __init__(self):
        # Connect to MySQL; adjust the credentials to your own setup
        self.db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                  password='123456', database='py10', charset='utf8')
        # Create a cursor, which is what actually executes SQL statements
        self.cursor = self.db.cursor()

    # Called repeatedly: executes the given sql, with data as the values to insert
    def execute_modify_sql(self, sql, data=None):
        # Execute the statement
        self.cursor.execute(sql, data)
        # Commit the transaction
        self.db.commit()

    # Destructor: runs automatically once nothing references the object anymore
    def __del__(self):
        # Close the cursor
        self.cursor.close()
        # Close the database connection
        self.db.close()


if __name__ == '__main__':
    # Quick check that we can actually write to the database
    sql = 'insert into bole_test (title, zan) VALUES (%s, %s)'
    data = ('没人睡觉', 2)
    myhelper = MysqlHelper()
    myhelper.execute_modify_sql(sql, data)
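Note that MysqlHelper depends on the third-party pymysql package (pip install pymysql), and the connection parameters in __init__ are local defaults; point host, user, password, and database at your own MySQL instance before running the crawl.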
settings.py
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}

DOWNLOADER_MIDDLEWARES = {
    'taobao.middlewares.TaobaoDownloaderMiddleware': 543,
}

ITEM_PIPELINES = {
    # 'taobao.pipelines.TaobaoPipeline': 300,
    'taobao.pipelines.StoreMysqlScrapyPipeline': 300,
}
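DOWNLOADER_MIDDLEWARES enables taobao.middlewares.TaobaoDownloaderMiddleware, which scrapy startproject generates automatically and which this post does not show. If yours has been deleted, a minimal pass-through version looks roughly like this; it adds no project-specific behaviour:

# middlewares.py -- minimal pass-through downloader middleware. This mirrors
# the no-op skeleton Scrapy generates; it leaves requests and responses untouched.
class TaobaoDownloaderMiddleware(object):

    def process_request(self, request, spider):
        # Returning None tells Scrapy to keep processing the request as usual
        return None

    def process_response(self, request, response, spider):
        # Hand the response on unchanged
        return response

    def process_exception(self, request, exception, spider):
        # Returning None lets other middlewares handle the exception
        return None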
pipelines.py
from taobao.mysqlhelper import MysqlHelper


class TaobaoPipeline(object):
    def process_item(self, item, spider):
        return item


class StoreMysqlScrapyPipeline(object):
    def process_item(self, item, spider):
        # Ask the item for its INSERT statement, then write it to MySQL
        (insert_sql, data) = item.get_insert_sql()
        myhelper = MysqlHelper()
        myhelper.execute_modify_sql(insert_sql, data)
        return item
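One caveat about StoreMysqlScrapyPipeline as written: it builds a new MysqlHelper, and therefore a new database connection, for every single item. A common refinement is to open the connection once per crawl with Scrapy's open_spider hook; a sketch, reusing the same MysqlHelper:

class StoreMysqlScrapyPipeline(object):
    def open_spider(self, spider):
        # One connection for the whole crawl instead of one per item
        self.myhelper = MysqlHelper()

    def process_item(self, item, spider):
        (insert_sql, data) = item.get_insert_sql()
        self.myhelper.execute_modify_sql(insert_sql, data)
        return item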
Create the launcher file run_taobao.py
from scrapy.cmdline import execute
execute('scrapy crawl tb'.split())
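Running run_taobao.py (from an IDE, say) is equivalent to typing scrapy crawl tb in a terminal; either way, start it from the project root so Scrapy can find the project's scrapy.cfg.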