Snapshot of the Daomubiji (盗墓笔记) site as of May 10, 2018, kept for reference only; because the page layout has since changed, part of the content is no longer crawled.
Target site: the Daomubiji novel website
Target URL: http://www.daomubiji.com/
Target content: information about the Daomubiji novels, specifically the book title, chapter number, and chapter title
Output: the results are saved to MongoDB
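For a quick sanity check of the output, the stored records can be read back with pymongo. This is a minimal sketch, assuming the database name 'jikexueyuan' and the collection 'Book' configured in settings.py below:

# -*- coding: utf-8 -*-
# Minimal check of the scraped output (sketch; names taken from settings.py below).
import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['jikexueyuan']['Book']

print collection.count()          # number of chapter records stored
for doc in collection.find().limit(3):
    print doc                     # e.g. bookName, bookTitle, chapterNum, chapterName, chapterURL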
xxx\novelspider\novelspider\settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for novelspider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'novelspider'

SPIDER_MODULES = ['novelspider.spiders']
NEWSPIDER_MODULE = 'novelspider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# MongoDB connection settings read by the pipeline
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'jikexueyuan'
MONGODB_DOCNAME = 'Book'

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'novelspider.middlewares.NovelspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'novelspider.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'novelspider.pipelines.NovelspiderPipeline': 300,
}
###############################################################################
# With this configured, every `yield item` in the spider is handled by this
# pipeline. The number after the class path (300 here) is its priority.
# Several pipelines can be configured at once; Scrapy hands each item to them
# in priority order, and the result of one pipeline is passed on to the next,
# e.g.:
# ITEM_PIPELINES = {
#     'miao.pipelines.Pipeline00': 400,
#     'miao.pipelines.Pipeline01': 401,
#     'miao.pipelines.Pipeline02': 402,
#     'miao.pipelines.Pipeline03': 403,
#     # ...
# }
###############################################################################

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
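To make the pipeline-chaining note above concrete, here is a sketch of a second, hypothetical pipeline (StripWhitespacePipeline is not part of this project) that could be registered with a priority below 300 so it cleans each item before NovelspiderPipeline writes it to MongoDB:

# Hypothetical extra pipeline (sketch), placed in novelspider/pipelines.py and
# registered as 'novelspider.pipelines.StripWhitespacePipeline': 250
class StripWhitespacePipeline(object):
    def process_item(self, item, spider):
        for key, value in item.items():
            if isinstance(value, basestring):
                item[key] = value.strip()   # remove stray whitespace from every text field
        return item  # the cleaned item is handed on to the next pipeline (priority 300)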
xxx\novelspider\novelspider\pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo
from scrapy.conf import settings  # makes the values from settings.py available here


class NovelspiderPipeline(object):
    def __init__(self):
        host = settings['MONGODB_HOST']       # MongoDB host IP
        port = settings['MONGODB_PORT']       # port number
        dbName = settings['MONGODB_DBNAME']   # database name
        # host and port could be omitted here because the database runs locally
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[dbName]
        self.post = tdb[settings['MONGODB_DOCNAME']]  # the collection, named 'Book'

    def process_item(self, item, spider):
        bookInfo = dict(item)  # convert the item to a dict and insert it into the database
        self.post.insert(bookInfo)
        return item

# Found online: initializing a pymongo database
# from pymongo import MongoClient
# client = MongoClient()
# # Two ways to create/get a database:
# db = client.peopleinfo      # way 1
# db = client['peopleinfo']   # way 2
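One caveat: Collection.insert is accepted by older pymongo releases but was deprecated in pymongo 3.x. If the pipeline fails on a newer installation, insert_one is the drop-in replacement:

    def process_item(self, item, spider):
        bookInfo = dict(item)
        self.post.insert_one(bookInfo)  # insert_one replaces the deprecated insert()
        return item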
xxx/novelspider/main.py
#encoding=utf-8
from scrapy import cmdline

# cmdline is the Scrapy helper that runs command-line commands from a script.
# Executing it here is equivalent to typing `scrapy crawl novelspider` in the
# project folder; unlike an ordinary Python program started with `python <file>.py`,
# a Scrapy spider is launched through the `scrapy crawl <spider name>` command.
cmdline.execute("scrapy crawl novelspider".split())
xxx\novelspider\novelspider\items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy import Field, Item


class NovelspiderItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # These fields describe the data to be scraped.
    bookName = Field()
    chapterURL = Field()
    bookTitle = Field()
    chapterNum = Field()
    chapterName = Field()
    chapterName2 = Field()
    chapterName3 = Field()
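A NovelspiderItem behaves like a dictionary, which is why the pipeline can simply call dict(item) before inserting it into MongoDB; a small illustration (the values are placeholders):

# Illustration only: fields are set like dict keys, and dict(item) yields a
# plain dictionary ready for MongoDB.
item = NovelspiderItem()
item['bookName'] = u'some book title'
item['chapterNum'] = u'Chapter 1'
print dict(item)   # {'bookName': u'some book title', 'chapterNum': u'Chapter 1'}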
xxx\novelspider\novelspider\spiders\novspider.py
#encoding=utf-8
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request
from novelspider.items import NovelspiderItem
# import redis


class novSpider(CrawlSpider):
    name = 'novelspider'
    # redis_key = 'novelspider:start_urls'
    start_urls = ['http://www.daomubiji.com/dao-mu-bi-ji-1']

    # Build the index-page URLs of the remaining books. Books 2-8 follow the
    # pattern dao-mu-bi-ji-<n>; the last three books use their own slugs.
    j = 0
    url2 = []
    for i in range(2, 12):
        urls = ['dao-mu-bi-ji-2015', 'sha-hai', 'zang-hai-hua']
        if i < 9:
            a = 'http://www.daomubiji.com/dao-mu-bi-ji-' + str(i)
            url2.append(a)
        else:
            a = 'http://www.daomubiji.com/' + urls[j]
            url2.append(a)
            j = j + 1
    print url2

    def parse(self, response):
        selector = Selector(response)  # wrap the page source for XPath queries
        # print response.body
        bookName = selector.xpath('//h1[@class="focusbox-title"]/text()').extract()[0]
        # Each chapter is an <article class="excerpt excerpt-c3"> whose link holds the
        # chapter URL and a text like "<book title> <chapter number> <chapter name>"
        url = selector.xpath('//article[@class="excerpt excerpt-c3"]/a/@href').extract()
        excerpts = selector.xpath('//article[@class="excerpt excerpt-c3"]/a/text()').extract()
        print excerpts

        for i in range(len(url)):
            item = NovelspiderItem()  # a fresh item for every chapter
            item['bookName'] = bookName
            item['chapterURL'] = url[i]
            try:
                item['bookTitle'] = excerpts[i].split(' ')[0]
                item['chapterNum'] = excerpts[i].split(' ')[1]
                item['chapterName'] = excerpts[i].split(' ')[2]
                # item['chapterName2'] = excerpts[i].split(' ')[3]
                # item['chapterName3'] = excerpts[i].split(' ')[4]
            except Exception, e:
                # skip link texts that do not split into the expected parts
                continue
            yield item

        # Follow the index pages of the other books, using parse() as its own
        # callback; Scrapy's duplicate filter keeps them from being crawled twice.
        for u in self.url2:
            yield Request(u, callback=self.parse)
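items.py also defines chapterName2 and chapterName3 for link texts that split into more than three parts. A sketch of how the body of the try block could fill them (this is an assumption about the intended behaviour, not the original code):

# Sketch: inside the try block, keep the optional extra parts when they exist
parts = excerpts[i].split(' ')
item['bookTitle'] = parts[0]
item['chapterNum'] = parts[1]
item['chapterName'] = parts[2]
if len(parts) > 3:
    item['chapterName2'] = parts[3]
if len(parts) > 4:
    item['chapterName3'] = parts[4]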