\ArticleSpider\spiders\jobbole.py (spider code)
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.http import Request
from urllib import parse
from ..items import JobboleArticleItem,ArticleItemLoader
from ..utils.common import get_md5
import datetime
from scrapy.loader import ItemLoader
class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # Extract every article node on the list page: the detail-page URL plus the cover image URL
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)
        # Follow the next-page link and parse it with this same method
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
    def parse_detail(self, response):
        #article_item = JobboleArticleItem()
        #XPath selectors
        # title = response.xpath('//*[@id="post-112048"]/div[1]/h1/text()').extract_first()
        # create_date = response.xpath('//*[@id="post-112048"]/div[2]/p/text()[1]').extract()[0].strip().replace('.','')
        # praise_nums = response.xpath('//span[contains(@class," btn-bluet-bigger href-style vote-post-up register-user-only ")]/h10/text()').extract()[0]
        # fav_nums = response.xpath('//span[contains(@class,"btn-bluet-bigger href-style bookmark-btn register-user-only ")]/text()').extract()[0]
        # match_re = re.match(r".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        # comment_nums = response.xpath('//span[contains(@class,"btn-bluet-bigger href-style hide-on-480")]/text()').extract()[0]
        # match_re = re.match(r".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        # content = response.xpath('//div[@class="entry"]').extract()[0]
        # tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)

        #CSS selectors
        # front_image_url = response.meta.get("front_image_url", "")
        # title = response.css('.entry-header h1::text').extract()[0]
        # create_date = response.css('p.entry-meta-hide-on-mobile::text').extract()[0].strip().replace('.','')
        # praise_nums = response.css('.vote-post-up h10::text').extract()[0]
        # fav_nums = response.css('.bookmark-btn::text').extract()[0]
        # match_re = re.match(r".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        # comment_nums = response.css('a[href="#article-comment"] span::text').extract()[0]
        # match_re = re.match(r".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        # content = response.css('div.entry').extract()[0]
        # tag_list = response.css('p.entry-meta-hide-on-mobile a::text').extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)
        #
        # article_item["url_object_id"] = get_md5(response.url)
        # article_item["title"] = title
        # article_item["url"] = response.url
        # try:
        #     create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
        # except Exception as e:
        #     create_date = datetime.datetime.now().date()
        # article_item["create_date"] = create_date
        # article_item["front_image_url"] = [front_image_url]
        # article_item["praise_nums"] = praise_nums
        # article_item["comment_nums"] = comment_nums
        # article_item["fav_nums"] = fav_nums
        # article_item["tags"] = tags
        # article_item["content"] = content

        #Load the item via ItemLoader instead of filling fields by hand
        front_image_url = response.meta.get("front_image_url", "")
        item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", 'a[href="#article-comment"] span::text')
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")
        article_item = item_loader.load_item()
        yield article_item
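A note on the parse.urljoin(response.url, ...) calls above: they make the yielded requests work whether the page emits relative or absolute hrefs. A tiny standalone illustration (the article path is made up):

```python
from urllib import parse

base = "http://blog.jobbole.com/all-posts/"

# A relative href is resolved against the current page's URL...
print(parse.urljoin(base, "/110287/"))
# -> http://blog.jobbole.com/110287/

# ...while an absolute href passes through unchanged.
print(parse.urljoin(base, "http://blog.jobbole.com/110287/"))
# -> http://blog.jobbole.com/110287/
```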
\ArticleSpider\utils\common.py
import hashlib

def get_md5(url):
    # Hash the URL into a fixed-length id; md5 works on bytes, so encode str input first
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()

if __name__ == "__main__":
    print(get_md5("http://blog.jobbole.com"))
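get_md5 turns a URL of any length into a deterministic 32-character hex id, which is what the url_object_id field stores. A quick sanity check, assuming the module layout above:

```python
from ArticleSpider.utils.common import get_md5

a = get_md5("http://blog.jobbole.com/110287/")  # the URL itself is illustrative
b = get_md5("http://blog.jobbole.com/110287/")
assert a == b        # deterministic: the same URL always maps to the same id
assert len(a) == 32  # an md5 hex digest is always 32 characters
```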
\ArticleSpider\items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.loader.processors import MapCompose,TakeFirst,Join
import datetime
from scrapy.loader import ItemLoader
import re
class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

def add_jobbole(value):
    return value + "-jhy"

def date_convert(value):
    # Parse "2017/05/18"-style dates; fall back to today on bad input
    try:
        create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except Exception:
        create_date = datetime.datetime.now().date()
    return create_date

def get_nums(value):
    # Pull the first run of digits out of strings like " 2 收藏" or " 1 评论"
    match_re = re.match(r".*?(\d+).*", value)
    if match_re:
        nums = int(match_re.group(1))
    else:
        nums = 0
    return nums

def remove_comment_tags(value):
    # Blank out the "N 评论" (comment count) entry that the tag selector also picks up
    if "评论" in value:
        return ""
    else:
        return value

def return_value(value):
    return value

class ArticleItemLoader(ItemLoader):
    # Custom ItemLoader: by default, every field keeps only the first extracted value
    default_output_processor = TakeFirst()

class JobboleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert)
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        # Override TakeFirst so the value stays a list, as ImagesPipeline expects
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()
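The input processors run on every string the selectors extract, before the field value is stored. A quick standalone check of the three helpers above (the sample strings mimic the site's markup):

```python
from ArticleSpider.items import get_nums, date_convert, remove_comment_tags

print(get_nums(" 2 收藏"))             # -> 2, the first run of digits
print(get_nums("收藏"))                # -> 0, no digits found
print(date_convert("2017/05/18"))      # -> datetime.date(2017, 5, 18)
print(date_convert("not a date"))      # -> today's date (the fallback branch)
print(remove_comment_tags(" 1 评论"))  # -> "", comment counts are blanked out
print(remove_comment_tags("职场"))     # -> "职场", real tags pass through
```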
\ArticleSpider\pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
import codecs
import json
from scrapy.exporters import JsonItemExporter
import MySQLdb
from twisted.enterprise import adbapi
import MySQLdb.cursors

class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item

class JsonWithEncodingPipeline(object):
    # Custom JSON file export
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding="utf-8")
    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item
    def close_spider(self, spider):
        self.file.close()

# Synchronous MySQL storage
class MysqlPipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect('localhost', 'root', 'jhy', 'article_spider', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()
    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            values (%s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"]))
        self.conn.commit()
        return item

# Asynchronous MySQL storage
class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Use Twisted to turn the blocking MySQL insert into an asynchronous one
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)  # handle insert errors
        return item

    def handle_error(self, failure):
        print(failure)

    def do_insert(self, cursor, item):
        # Run the actual insert inside the connection pool's transaction
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            values (%s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"]))

class JsonExporterPipeline(object):
    # Export items to a JSON file using Scrapy's built-in exporter
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()
    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

class ArticleImagePipeline(ImagesPipeline):
    # Record where ImagesPipeline saved the cover image on the item
    def item_completed(self, results, item, info):
        if "front_image_url" in item:
            for ok, value in results:
                image_file_path = value["path"]
            item["front_image_path"] = image_file_path
        return item
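Both MySQL pipelines insert into a jobbole_article table that the project itself never creates. A minimal one-off setup sketch: the column names come from the insert statements above, but the types and lengths are assumptions.

```python
import MySQLdb

# Connection parameters match the ones used in MysqlPipeline above
conn = MySQLdb.connect('localhost', 'root', 'jhy', 'article_spider', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jobbole_article (
        title VARCHAR(200) NOT NULL,
        url VARCHAR(300) NOT NULL,
        create_date DATE,
        fav_nums INT DEFAULT 0
    )
""")
conn.commit()
conn.close()
```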
\ArticleSpider\settings.py
# -*- coding: utf-8 -*-
import os

# Scrapy settings for ArticleSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ArticleSpider'

SPIDER_MODULES = ['ArticleSpider.spiders']
NEWSPIDER_MODULE = 'ArticleSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ArticleSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ArticleSpider.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'ArticleSpider.pipelines.JsonExporterPipeline': 2,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    # 'ArticleSpider.pipelines.ArticleImagePipeline': 1,
    'ArticleSpider.pipelines.MysqlTwistedPipeline': 1,
}

# ImagesPipeline configuration: which item field holds the image URLs,
# where to store the downloaded files, and the minimum size to keep
IMAGES_URLS_FIELD = "front_image_url"
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, "images")
IMAGES_MIN_HEIGHT = 100
IMAGES_MIN_WIDTH = 100

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MySQL connection settings read by MysqlTwistedPipeline.from_settings
MYSQL_HOST = "localhost"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "jhy"
\main.py
from scrapy.cmdline import execute
import sys
import os

# Put the project root on sys.path so the crawl can be launched from an IDE
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])
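main.py simply invokes the scrapy command line in-process, so the spider can be started (and debugged with breakpoints) from an IDE. For reference, the same entry point can be written against Scrapy's crawler API; this is an equivalent alternative sketch, not part of the original project:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
process.crawl("jobbole")  # look the spider up by its name attribute
process.start()           # blocks until the crawl finishes
```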