scrapy爬取海贼王漫画
1、创建项目scrapy startproject onepiecesScrapy
2、创建spider
cd onepiecesScrapy
scrapy genspider onepieces http://manhua.fzdm.com/02/
3、改写项目文件在settings中增加
ROBOTSTXT_OBEY = False # 这个不禁用,遵守协议还怎么爬,人家默认不让你爬啊
COOKIES_ENABLED = False
DOWNLOAD_DELAY = 0.25 # 250 ms of delay
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
4、在middlewares中增加随机User-Agent
我用的自己维护的代理池,这里就不写随机代理了,直接写随机User-Agent。
在middlewares.py中编写随机头。
class RandomUserAgentMiddleware():
    """Scrapy downloader middleware that sets a random User-Agent on every request.

    Scrapy calls ``process_request`` for each outgoing request; we overwrite the
    ``User-Agent`` header with one picked at random from a fixed pool, so the
    crawler does not present a single constant identity to the target site.
    """

    def __init__(self):
        # Pool of desktop and mobile User-Agent strings to rotate through.
        # NOTE: some entries appear more than once in the original list; they are
        # kept so the random-choice distribution is unchanged.
        self.user_agents = [
            "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
            "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
            "MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)",
            "Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
            "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
            "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Mobile/8J2",
            "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; SAMSUNG; OMNIA7)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; XBLWP7; ZuneWP7)",
            "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30",
            "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
            # FIX: original entry said "MSIE 60", a malformed IE version token.
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; TheWorld)",
        ]

    def process_request(self, request, spider):
        # Called by Scrapy for every outgoing request: pick one UA at random.
        request.headers['User-Agent'] = random.choice(self.user_agents)
并在settings中增加:
settings :
DOWNLOADER_MIDDLEWARES = {
    'onepiece.middlewares.RandomUserAgentMiddleware': 543,
}
代码分析
为了了解动漫的进展,看相应的漫画是个不错的选择。而KuKu动漫又是免费的试看平台,满足我的需求。奉上URL:http://manhua.fzdm.com/02/。
在此之前,不得不说的一句话就是:请勿将程序用于任何商业用途,仅供交流学习。尊重著作权,请购买正版漫画。
1、 spider分析
class OnepiecesSpider(scrapy.Spider):
    """Crawl One Piece chapters from manhua.fzdm.com and collect page-image URLs.

    Flow: chapter index (parse1) -> first page of each chapter (parse2) ->
    remaining pages, followed one by one (parse3). Image URLs are accumulated
    in ``item['img_url']`` and the item is handed to the pipeline for download.
    """

    name = 'onepieces'
    allowed_domains = ['manhua.fzdm.com']
    start_urls = ['http://manhua.fzdm.com/02/']
    # Image host prefix; the site pages embed only the path part of each image.
    server_img = 'http://p17.xiaoshidi.net/'
    # Matches the image path assigned to the JS variable `mhurl1` on each page.
    pattern_img = re.compile(r'var mhurl1="(.*?)"')

    def parse(self, response):
        # Unused default callback; the crawl starts via start_requests/parse1.
        pass

    def start_requests(self):
        # Kick off the crawl from the chapter index page.
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse1)

    def parse1(self, response):
        """Parse the chapter index: one ComicItem per chapter, then request its first page."""
        urls = response.xpath('//*[@id="content"]/li/a[1]/@href').extract()
        dir_names = response.xpath('//*[@id="content"]/li/a[1]/text()').extract()
        # zip() pairs each chapter link with its title and tolerates unequal lengths.
        for url, dir_name in zip(urls, dir_names):
            item = ComicItem()
            item['link_url'] = response.urljoin(url)
            item['dir_name'] = dir_name
            item['img_url'] = []
            # Pass the item along through meta so each callback can extend it.
            yield scrapy.Request(url=item['link_url'], meta={'item': item}, callback=self.parse2)

    def parse2(self, response):
        """First page of a chapter: collect its image URL(s), then follow to page 2."""
        item = response.meta['item']
        # The first page embeds its image path as `var mhurl=...` in an inline script.
        # FIX: guard the result — the original indexed [0] unconditionally and
        # raised IndexError whenever the script was absent.
        first = response.xpath('/html/body/script[7]').re(r'var mhurl="(.*?)"')
        if first:
            item['img_url'].append(self.server_img + first[0])
        # A second copy of the image path lives in the pjax container as `mhurl1`.
        script_text = response.xpath('//*[@id="pjax-container"]/script[2]/text()').extract_first()
        if script_text:
            matches = self.pattern_img.findall(script_text)
            if matches:
                item['img_url'].append(self.server_img + matches[0])
        # The "next page" button link; follow it to start walking the chapter.
        next_href = response.xpath('//*[@id="pjax-container"]//a[@class="pure-button pure-button-primary"]/@href').extract_first()
        yield scrapy.Request(url=response.urljoin(next_href), meta={'item': item}, callback=self.parse3)

    def parse3(self, response):
        """Subsequent pages: accumulate image URLs until `mhurl1` disappears (last page)."""
        item = response.meta['item']
        script_text = response.xpath('//*[@id="pjax-container"]/script[2]/text()').extract_first()
        # FIX: the original built img_url by indexing re.findall(...)[0] *before*
        # its `if img_url:` check, so the guard was dead code and the last page
        # crashed with IndexError. Test the findall result itself instead.
        matches = self.pattern_img.findall(script_text) if script_text else []
        if matches:
            img_url = self.server_img + matches[0]
            item['img_url'].append(img_url)
            print(item['dir_name'] + img_url)
            nexts = response.xpath('//*[@id="pjax-container"]//a[@class="pure-button pure-button-primary"]/@href').extract()
            # The second button is "next page"; it may be missing on the final page.
            if len(nexts) > 1:
                print('next :' + nexts[1])
                yield scrapy.Request(url=response.urljoin(nexts[1]), meta={'item': item}, callback=self.parse3)
        # Yield the item (as the original did on every call) so the pipeline
        # can download whatever has been collected so far.
        yield item
2、编写items文件
Item是保存爬取数据的容器,使用方法和字典相同。
创建Item需要继承scrapy.Item类,字段定义为scrapy.Field类型。
class ComicItem(scrapy.Item):
    """Container for one scraped chapter: title, link, image URLs, saved paths."""

    # Chapter title; also used as the per-chapter download directory name.
    dir_name = scrapy.Field()
    # Absolute URL of the chapter's first page.
    link_url = scrapy.Field()
    # List of full image URLs, one per page of the chapter.
    img_url = scrapy.Field()
    # Local file paths the images were written to (filled in by the pipeline).
    image_paths = scrapy.Field()
3、编写pipelines文件
class OnepiecePipeline(object):
    """No-op pipeline stage: every item passes through unchanged."""

    def process_item(self, item, spider):
        # Nothing to do here; image downloading happens in ComicImgDownloadPipeline.
        return item
class ComicImgDownloadPipeline(object):
    """Download every image URL on an item into a per-chapter directory.

    Files already on disk are skipped, so re-running the crawl only fetches
    the missing pages. The saved paths are recorded in ``item['image_paths']``.
    """

    def process_item(self, item, spider):
        # Only act on items that actually carry image links.
        if 'img_url' in item:
            images = []
            # One sub-directory per chapter under IMAGES_STORE, named after the title.
            dir_path = '%s/%s' % (settings.IMAGES_STORE, item['dir_name'])
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            for image_url in item['img_url']:
                # Derive the page number (prefix) and extension (suffix) from the
                # URL's file name, e.g. .../007.jpg -> '第007页.jpg'.
                name_parts = image_url.split('/')[-1].split('.')
                image_file_name = '第' + name_parts[0] + '页.' + name_parts[-1]
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append(file_path)
                # Skip pages that were already downloaded on a previous run.
                if os.path.exists(file_path):
                    continue
                # FIX: fetch *before* opening the file. The original opened the
                # file first, so a failed request left an empty file behind that
                # the exists() check above would then skip forever. Also reject
                # HTTP error responses instead of saving the error page bytes
                # as an "image".
                response = requests.get(image_url, stream=True)
                response.raise_for_status()
                with open(file_path, 'wb') as handle:
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)
            # Record where the images live for downstream consumers.
            item['image_paths'] = images
        return item
最后爬取scrapy crawl onepieces
代码地址:https://download.csdn.net/download/huangwencai123/11142906