本项目仅供学习交流,请勿商用;
CentOS7中爬虫部署
一、创建项目
# 创建scrapy爬虫项目
scrapy startproject meinv
cd meinv/
# 查看可用爬虫模板,并基于 crawl 模板创建全站爬虫
scrapy genspider -l
scrapy genspider -t crawl mm 2717.com
二、项目代码
./meinv/spiders/mm.py
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule
from ..items import MeinvItem
class MmSpider(CrawlSpider):
    """Whole-site crawler for 2717.com photo galleries.

    Follows list-page pagination and per-album pagination, and extracts
    the photo title and image URL from each single-photo page.
    """
    name = 'mm'
    allowed_domains = ['2717.com']
    start_urls = ['https://www.2717.com/ent/meinvtupian/']

    rules = (
        # "Next page" links of the big list pages. No callback, so
        # follow defaults to True and the crawl keeps paginating.
        Rule(LinkExtractor(allow=r'list_\d+_\d+\.html')),
        # Pagination inside a single album (dot escaped so it matches
        # a literal '.' only, not any character).
        Rule(LinkExtractor(allow=r'/ent/meinvtupian/2019/\d+\.html')),
        # Single-photo pages: extract content, do not follow further.
        Rule(LinkExtractor(allow=r'\d+_\d+\.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Load the photo's title (img/@alt) and image URL (img/@src)."""
        loader = ItemLoader(MeinvItem(), response=response)
        loader.add_xpath('title', '//div[@class="articleV4Body"]/p/a[1]/img/@alt')
        loader.add_xpath('url', '//div[@class="articleV4Body"]/p/a[1]/img/@src')
        yield loader.load_item()
./meinv/items.py
import scrapy, re
from scrapy.loader.processors import TakeFirst, Join, MapCompose
def delete_blank(data):
    """Remove all whitespace (spaces, tabs, newlines, etc.) from *data*.

    Used as an input processor for the ``title`` field so extracted
    text fragments are compacted before being joined.
    """
    return re.sub(r'\s+', '', data)
class MeinvItem(scrapy.Item):
    """Item holding one photo's cleaned title and its image URL."""
    title = scrapy.Field(
        # Strip all whitespace from each extracted fragment, then join
        # the fragments into a single string.
        input_processor=MapCompose(delete_blank),
        output_processor=Join()
    )
    url = scrapy.Field(
        # Keep only the first extracted @src value (a plain string,
        # not a list, after load_item()).
        output_processor=TakeFirst()
    )
./meinv/pipelines.py
# -*- coding: utf-8 -*-
import pymysql
class MeinvPipeline(object):
    """Persist scraped items into the MySQL table ``mm``."""

    def __init__(self):
        # NOTE(review): credentials are hard-coded here; in a real
        # deployment move them into settings.py and read them via
        # crawler settings (from_crawler).
        self.conn = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='******',
            database='scrapy_test',
            charset='utf8'
        )
        # Cursor that executes SQL and returns each row as a dict.
        self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)
        # Create the target table on first run (idempotent).
        self.cursor.execute('create table if not exists mm(id int not null primary key auto_increment, title varchar (255) default null , url varchar (255) default null );')
        self.conn.commit()

    def process_item(self, item, spider):
        if spider.name == 'mm':
            # With TakeFirst as the field's output processor item['url']
            # is already a string; item['url'][0] would have taken its
            # first *character*. Handle both shapes defensively.
            url = item['url']
            if isinstance(url, (list, tuple)):
                url = url[0]
            print('*' * 20)
            print(item['title'])
            print(url)
            print('*' * 20)
            print()
            # Parameterized query: lets the driver do the quoting and
            # avoids SQL injection via scraped text. Naming the columns
            # also avoids the fragile "values (0, ...)" auto_increment trick.
            self.cursor.execute(
                'insert into mm (title, url) values (%s, %s);',
                (item['title'], url)
            )
            self.conn.commit()
        # Pipelines must return the item so later pipelines receive it.
        return item
./meinv/settings.py
# Ignore robots.txt so the crawl is not blocked by the site's policy.
ROBOTSTXT_OBEY = False
# Wait 1 second between requests to reduce load and avoid bans.
DOWNLOAD_DELAY = 1
# Default headers sent with every request; the User-Agent masquerades
# as a desktop Chrome browser.
DEFAULT_REQUEST_HEADERS = {
'Accept': 'application/json, text/plain, */*',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
# Enable the MySQL pipeline; 300 is its priority (lower runs first).
ITEM_PIPELINES = {
'meinv.pipelines.MeinvPipeline': 300,
}
三、运行命令
scrapy crawl mm
四、服务器部署
1、安装 scrapyd
pip install scrapyd
2、配置
/data/virtualenvs/testdemo/lib/python3.6/site-packages/scrapyd/default_scrapyd.conf
bind_address = 0.0.0.0 # 外网
3、开启 5000 和 6800 端口
firewall-cmd --list-all
firewall-cmd --permanent --add-port=5000/tcp
firewall-cmd --permanent --add-port=6800/tcp
firewall-cmd --reload
4、开启服务测试
输入命令 scrapyd,开启 scrapyd 服务,浏览器访问 http://192.168.5.149:6800/
5、安装 scrapyd-client
pip install scrapyd-client
配置 ./scrapy.cfg
[settings]
default = meinv.settings
[deploy:mm]
url = http://192.168.5.149:6800/
project = meinv
scrapyd-deploy -l # 查看 scrapy.cfg 中配置的所有部署目标(target)
报错:
scrapyd-deploy:23: ScrapyDeprecationWarning: Module `scrapy.utils.http` is deprecated, Please import from `w3lib.http` instead. from scrapy.utils.http import basic_auth_header
解决:
将 from scrapy.utils.http import basic_auth_header 改为 from w3lib.http import basic_auth_header
6、开始运行
# 开启 scrapyd 服务
scrapyd
# 上传 scrapy 到 scrapyd 服务器 scrapyd-deploy <target> -p <project>
scrapyd-deploy mm -p meinv
# 开始运行
curl http://192.168.5.149:6800/schedule.json -d project=meinv -d spider=mm
五、supervisor 守护进程
1、安装
# 守护进程
pip install supervisor
# web 网页管理爬虫
pip install spiderkeeper
2、supervisor 配置文件
mkdir /etc/supervisor
echo_supervisord_conf > /etc/supervisor/supervisord.conf
vim /etc/supervisor/supervisord.conf
[include]
files = conf.d/*.conf
3、添加 scrapyd 的配置文件
mkdir /etc/supervisor/conf.d
vim /etc/supervisor/conf.d/scrapyd.conf
[program:scrapyd]
autostart=true
directory=/data/workspace/scrapy_project/meinv
command=/data/virtualenvs/testdemo/bin/scrapyd
user=root
stderr_logfile=/var/log/scrapyd.err.log
stdout_logfile=/var/log/scrapyd.out.log
4、spiderkeeper(用于web管理爬虫)
# 添加 spiderkeeper 的配置文件(spiderkeeper 可以识别多台 scrapyd,具体多加 --server 就可以)
vim /etc/supervisor/conf.d/spiderkeeper.conf
[program:spiderkeeper]
directory=/data/workspace/scrapy_project/meinv
command=/data/virtualenvs/testdemo/bin/spiderkeeper --server=http://192.168.5.149:6800
user=root
stderr_logfile=/var/log/spiderkeeper.err.log
stdout_logfile=/var/log/spiderkeeper.out.log
5、supervisor 命令
# 指定配置文件启动
supervisord -c /etc/supervisor/supervisord.conf
# 重启
supervisorctl reload
# 关闭
supervisorctl shutdown
6、supervisor 设置 systemctl
vim /lib/systemd/system/supervisord.service
[Unit]
Description=supervisord
[Service]
Type=forking
ExecStart=/usr/local/python36/bin/supervisord -c /etc/supervisor/supervisord.conf
ExecReload=/usr/local/python36/bin/supervisorctl reload
ExecStop=/usr/local/python36/bin/supervisorctl shutdown
KillMode=process
Restart=on-failure
RestartSec=42s
[Install]
WantedBy=multi-user.target
systemctl daemon-reload
systemctl enable supervisord.service
systemctl start supervisord
7、spiderkeeper 网页管理爬虫
# 登录
http://192.168.5.149:5000
# 网页中,新建 project
# 打包egg爬虫文件(需要提前 pip install scrapyd-client)
scrapyd-deploy --build-egg output.egg
# 然后在网站上操作运行