# Crawling cnblogs posts with Scrapy and saving them to a database
# cnblog_itemloader.py
```python
# -*- coding: utf-8 -*-
import scrapy
from datetime import datetime

from ..items import CnblogItem, CnblogItemLoader

# Scrapy deduplicates URLs with a fingerprint filter:
#   url_id = {url1, url2}      # fingerprints already seen
#   task_q = Queue(r1, r2)     # pending requests
#
# ItemLoader data flow:
#   ['发布于 2013-9-8'] -> input processor -> ['2013-9-8']
#   -> output processor -> '2013-9-8' -> assigned to the item field


class CnblogSpider(scrapy.Spider):
    name = 'cnblog_itemloader'
    allowed_domains = ['cnblogs.com']

    # Settings that apply only to this spider
    custom_settings = {
        'ROBOTSTXT_OBEY': False,
        'CONCURRENT_REQUESTS': 100,
        'ITEM_PIPELINES': {
            'day13.pipelines.CnblogPipeline': 1,
        },
    }

    headers = {
        "Host": "www.cnblogs.com",
        "Connection": "keep-alive",
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        "Content-Type": "application/json; charset=UTF-8",
        "Accept-Language": "zh-CN,zh;q=0.9",
    }

    # Build the first request: POST the category ids to the SubCategories endpoint
    def start_requests(self):
        base_url = 'https://www.cnblogs.com/aggsite/SubCategories'
        body = '{"cateIds":"108698,2,108701,108703,108704,108705,108709,108712,108724,4"}'
        yield scrapy.Request(base_url, method='POST', headers=self.headers,
                             body=body, callback=self.parse)

    # Extract every sub-category link from the response
    def parse(self, response):
        cate_list = response.xpath('//a/@href').extract()
        for url in cate_list:
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_first, meta={'url': url})

    # Parse the first page of each category to find how many pages it has
    def parse_first(self, response):
        url = response.meta['url'] + '%d'
        try:
            max_page = response.xpath('//div[@class="pager"]/a/text()').extract()[-2]
            max_page = int(max_page)
        except Exception:
            max_page = 1
        for i in range(max_page, 0, -1):
            fullurl = url % i
            yield scrapy.Request(fullurl, callback=self.parse_list)

    # Parse a list page: fill an ItemLoader per article, then request the detail page
    def parse_list(self, response):
        article_list = response.xpath('//div[@class="post_item"]')
        for article in article_list:
            # Build the item through an ItemLoader scoped to this article node
            item = CnblogItem()
            itemloader = CnblogItemLoader(item=item, selector=article)
            itemloader.add_css('title', 'h3 a::text')
            itemloader.add_css('article_link', 'h3 a::attr(href)')
            itemloader.add_css('re_num', 'span.diggnum::text')
            itemloader.add_css('industry', 'p.post_item_summary::text')
            itemloader.add_css('author', 'div.post_item_foot a::text')
            itemloader.add_css('date_pub', 'div.post_item_foot::text')
            itemloader.add_css('comment', 'span.article_comment a::text')
            itemloader.add_css('read_num', 'span.article_view a::text')
            itemloader.add_value('crawl_time', datetime.now().strftime('%Y-%m-%d'))
            itemloader.add_value('spider_name', self.name)
            # print(itemloader.load_item())

            # Request the detail page, carrying the half-filled loader in meta
            article_link = article.css('h3 a::attr(href)').extract_first()
            yield scrapy.Request(article_link, callback=self.parse_detail,
                                 meta={'data': itemloader})

    # Parse the detail page: point the loader at the new response and finish the item
    def parse_detail(self, response):
        itemloader = response.meta['data']
        itemloader.selector = response
        itemloader.add_xpath('content', '//div[@id="post_detail"]')
        # print(itemloader.load_item())
        yield itemloader.load_item()
```
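For the initial POST, newer Scrapy releases (1.8+) also offer `JsonRequest`, which serializes a dict body and sets the `Content-Type` header automatically. A minimal sketch of an equivalent `start_requests`, assuming a recent Scrapy version; it is not part of the original spider:

```python
from scrapy.http import JsonRequest  # available since Scrapy 1.8

def start_requests(self):
    # JsonRequest dumps the dict to a JSON body and defaults to method='POST'
    yield JsonRequest(
        'https://www.cnblogs.com/aggsite/SubCategories',
        data={'cateIds': '108698,2,108701,108703,108704,108705,108709,108712,108724,4'},
        headers=self.headers,
        callback=self.parse,
    )
```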
# items.py
```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import re

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Join
from w3lib.html import remove_tags


class CnblogItemLoader(ItemLoader):
    # Every field collapses to its first value unless overridden per field
    default_output_processor = TakeFirst()


# ---- input processors (each receives the list of already-extracted values) ----

def v_format(value):
    # Strip surrounding whitespace from every extracted value
    return [item.strip() for item in value]


def process_date(date_pub):
    # The footer text nodes end with something like '\r\n 发布于 2013-09-08 ...';
    # keep only the date part
    date_pub[0] = date_pub[-1].strip('\r\n ').strip('发布于 ')
    return date_pub


def get_num(value):
    # Pull the first integer out of strings like '评论(12)' or '阅读(0)'
    num_pat = re.compile(r'\d+')
    res = num_pat.search(value[0])
    value[0] = int(res.group()) if res is not None else 0
    return value


def process_content(value):
    # Strip HTML tags from the post body
    value[0] = remove_tags(value[0]).strip()
    return value


def process_title(value):
    # Append a signature that Join('$') will attach to the title
    value.append('_Alice')
    return value


class CnblogItem(scrapy.Item):
    title = scrapy.Field(
        input_processor=process_title,
        output_processor=Join('$'),
    )
    content = scrapy.Field(
        input_processor=process_content,
    )
    article_link = scrapy.Field()
    re_num = scrapy.Field()
    industry = scrapy.Field(
        input_processor=v_format,
    )
    author = scrapy.Field()
    date_pub = scrapy.Field(
        input_processor=process_date,
    )
    comment = scrapy.Field(
        input_processor=get_num,
    )
    read_num = scrapy.Field(
        input_processor=get_num,
    )
    crawl_time = scrapy.Field()
    spider_name = scrapy.Field()

    def get_sql(self):
        sql = ('insert into py07_cnblog'
               '(title,content,article_link,re_num,industry,author,'
               'date_pub,comment,read_num,crawl_time,spider_name) '
               'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        data = (self["title"], self["content"], self["article_link"], self["re_num"],
                self["industry"], self["author"], self["date_pub"], self["comment"],
                self["read_num"], self["crawl_time"], self["spider_name"])
        return sql, data
```
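The processor flow sketched in the spider's comments (raw '发布于 ...' text → input processor → output processor → item field) can be checked outside a crawl. A minimal sketch, assuming it is run from the project root so the `day13` package is importable:

```python
from day13.items import CnblogItem, CnblogItemLoader

loader = CnblogItemLoader(item=CnblogItem())
loader.add_value('title', 'Scrapy 笔记')                    # process_title appends '_Alice'
loader.add_value('comment', '评论(12)')                     # get_num -> 12
loader.add_value('date_pub', '\r\n  发布于 2013-09-08')     # process_date keeps only the date
print(loader.load_item())
# {'comment': 12, 'date_pub': '2013-09-08', 'title': 'Scrapy 笔记$_Alice'}
```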
# pipelines.py
```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class Day13Pipeline(object):
    def process_item(self, item, spider):
        return item


class CnblogPipeline(object):
    def __init__(self):
        # Connect to the local MySQL database 'han' (keyword arguments work on
        # both old and current pymysql versions)
        self.conn = pymysql.connect(host='127.0.0.1', user='root',
                                    password='123456', database='han',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Insert the item into the database
        sql, data = item.get_sql()
        self.cursor.execute(sql, data)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
```
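The pipeline expects a `han` database with a `py07_cnblog` table whose columns match `CnblogItem.get_sql()`. The schema below is only an assumption for illustration (column types are not taken from the original project); adjust it to your data:

```python
import pymysql

# Hypothetical schema matching the columns used by CnblogItem.get_sql();
# the column types here are assumptions.
DDL = """
CREATE TABLE IF NOT EXISTS py07_cnblog (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    content TEXT,
    article_link VARCHAR(512),
    re_num VARCHAR(32),
    industry VARCHAR(512),
    author VARCHAR(64),
    date_pub VARCHAR(32),
    comment INT,
    read_num INT,
    crawl_time VARCHAR(32),
    spider_name VARCHAR(64)
) DEFAULT CHARSET=utf8mb4
"""

conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                       database='han', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(DDL)
conn.commit()
conn.close()
```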
# settings.py
```python
# -*- coding: utf-8 -*-

# Scrapy settings for day13 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'day13'

SPIDER_MODULES = ['day13.spiders']
NEWSPIDER_MODULE = 'day13.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'day13 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'day13.middlewares.Day13SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'day13.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'day13.pipelines.CnblogPipeline': 1,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
```
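Note that the project-level `CONCURRENT_REQUESTS = 1` and the commented-out `ITEM_PIPELINES` are both overridden for this spider: `custom_settings` in `CnblogSpider` takes precedence over `settings.py`, so the crawl actually runs with 100 concurrent requests and `CnblogPipeline` enabled.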
# main.py
```python
from scrapy import cmdline

# cmdline.execute('scrapy crawl cnblog'.split())
# cmdline.execute('scrapy crawl cnblog_all'.split())
cmdline.execute('scrapy crawl cnblog_itemloader'.split())
```
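Running `python main.py` from the directory that contains `scrapy.cfg` is equivalent to typing `scrapy crawl cnblog_itemloader` on the command line; the script simply hands the same arguments to Scrapy's `cmdline` module, which is convenient for debugging inside an IDE.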
# A sample of the crawled results: