# pipelines.py
from twisted.enterprise import adbapi
import pymysql
import pymysql.cursors
class MysqlTwistedPipeline(object):
    """Item pipeline that inserts items into MySQL asynchronously.

    Uses twisted's adbapi connection pool so the (blocking) pymysql
    insert runs on a thread pool instead of stalling the reactor.
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the project settings (MYSQL_* keys)."""
        dbpool = adbapi.ConnectionPool(
            "pymysql",
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            charset="utf8",
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        return cls(dbpool)

    def process_item(self, item, spider):
        # Run the insert asynchronously on twisted's db thread pool.
        query = self.dbpool.runInteraction(self.do_insert, item)
        # BUG FIX: the original attached no errback, so database errors
        # vanished silently inside twisted.
        query.addErrback(self.handle_error, item, spider)
        # BUG FIX: the original returned None; Scrapy pipelines must
        # return the item so later pipelines still receive it.
        return item

    def handle_error(self, failure, item, spider):
        # Surface insert failures through the spider's logger.
        spider.logger.error(failure)

    def do_insert(self, cursor, item):
        # Each item type supplies its own SQL + params, so this pipeline
        # stays generic across different item classes.
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
# items.py
import scrapy
class TencentItem(scrapy.Item):
    """One Tencent job posting scraped from the listing table."""

    # Job title and link to the detail page.
    positionname = scrapy.Field()
    positionlink = scrapy.Field()
    # Category, headcount, location and publish date columns.
    positionType = scrapy.Field()
    positionNum = scrapy.Field()
    positionLocation = scrapy.Field()
    publishTime = scrapy.Field()

    def get_insert_sql(self):
        """Return the (insert_sql, params) pair consumed by the MySQL pipeline."""
        insert_sql = """
insert into tencent(positionname,positionlink,positionType,positionNum,positionLocation,publishTime)
VALUES (%s,%s,%s,%s,%s,%s)
"""
        field_order = (
            'positionname', 'positionlink', 'positionType',
            'positionNum', 'positionLocation', 'publishTime',
        )
        params = tuple(self[key] for key in field_order)
        return insert_sql, params
# settings.py
# Scrapy project settings.
# BUG FIX: the original had free-standing Chinese annotations in
# parentheses after several assignments, which is not valid Python and
# made this settings module unimportable; they are now comments.
BOT_NAME = 'tencent'

SPIDER_MODULES = ['tencent.spiders']
NEWSPIDER_MODULE = 'tencent.spiders'

ROBOTSTXT_OBEY = False

# The next three settings enable scrapy-redis distributed crawling;
# omit them for a plain single-machine crawl.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True  # keep the redis queue/dupefilter between runs

DOWNLOAD_DELAY = 2  # be polite: 2 s between requests

DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}

ITEM_PIPELINES = {
    # RedisPipeline is only needed for distributed crawling.
    'scrapy_redis.pipelines.RedisPipeline': 400,
    'tencent.pipelines.MysqlTwistedPipeline': 300,
}

# Redis master host/port (distributed crawling only).
REDIS_HOST = '172.21.118.56'
REDIS_PORT = 6379

# MySQL connection settings — replace the placeholders with your own
# database name, user and password.
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "tencent"
MYSQL_USER = "usrername"
MYSQL_PASSWORD = "userpassword"
# spiders/Tencent.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider
from tencent.items import TencentItem
class TencentSpider(RedisCrawlSpider):
    """Distributed crawl spider for Tencent job listings.

    Start URLs are pushed by the operator into the redis list named by
    ``redis_key``; pagination links matching ``start=<offset>`` are
    followed automatically by the crawl rule.
    """

    name = "Tencent"
    allowed_domains = ["tencent.com"]
    redis_key = 'TencentSpider:start_urls'

    page_link = LinkExtractor(allow=(r"start=\d+"))
    rules = [
        Rule(page_link, callback="parseContent", follow=True),
    ]

    def parseContent(self, response):
        """Yield one TencentItem per job row of the listing table."""
        # Job rows alternate between class="even" and class="odd".
        rows = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
        for row in rows:
            item = TencentItem()
            item['positionname'] = row.xpath("./td[1]/a/text()").extract()[0]
            item['positionlink'] = row.xpath("./td[1]/a/@href").extract()[0]
            # BUG FIX: take the first match like every other field; the
            # original stored the whole extract() list, which breaks the
            # parameterized MySQL insert downstream. (If this cell can be
            # empty on some rows, switch to extract_first('') instead —
            # TODO confirm against the live page.)
            item['positionType'] = row.xpath("./td[2]/text()").extract()[0]
            item['positionNum'] = row.xpath("./td[3]/text()").extract()[0]
            item['positionLocation'] = row.xpath("./td[4]/text()").extract()[0]
            item['publishTime'] = row.xpath("./td[5]/text()").extract()[0]
            yield item
# Source: https://www.cnblogs.com/huwei934/p/7116877.html