版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/m0_37886429/article/details/83896766
今天在python中文社区上看到一篇文章:《Python爬虫基础:验证码的爬取和识别详解》
url地址为:https://profwang.cloudcare.cn/kl/article/fad57e166edd5bc278fb5cd304a2024e95ea7553?share=1
我感觉作者用xpath分析代码的时候不是很好,下面是我重新改善的
一、用lxml模块分析代码
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import requests
import time,os
from lxml import etree
def get_Page(url, headers):
    """Fetch *url* and return the response body as text, or None on a non-200 status."""
    resp = requests.get(url, headers=headers)
    if resp.status_code != 200:
        return None
    return resp.text
def parse_Page(html, headers):
    """Parse the captcha example page with lxml and download every captcha image.

    html    -- page source returned by get_Page()
    headers -- request headers reused for the image downloads

    Side effect: changes the working directory into the 'qcode' folder and
    writes one .jpg file per captcha found.
    """
    html_lxml = etree.HTML(html)
    # "|" in an xpath expression selects the union of several paths.
    datas = html_lxml.xpath('.//div[@class="captcha_images_left"]|.//div[@class="captcha_images_right"]')
    item = {}
    # Create the folder that will hold the captcha images.
    file = 'qcode'
    os.makedirs(file, exist_ok=True)  # no exists/mkdir race, no error if present
    os.chdir(file)
    for data in datas:
        names = data.xpath('.//h3')            # captcha names (list of elements)
        srcs = data.xpath('.//div/img/@src')   # captcha links (list of strings)
        # zip() pairs names with links and stops at the shorter list, avoiding
        # the IndexError the old range(len(name)) indexing risked whenever the
        # two lists differ in length.
        for name, src in zip(names, srcs):
            filename = name.text + '.jpg'      # captcha image file name
            item[filename] = 'https://captcha.com/' + src
    count = 0
    for imgname, imgurl in item.items():
        response = requests.get(imgurl, headers=headers)
        if response.status_code == 200:
            with open(imgname, 'wb') as f:
                f.write(response.content)  # raw image bytes
            count += 1
            print('保存第{}张验证码成功'.format(count))
        time.sleep(1)  # be polite to the server between downloads
def main():
    """Entry point: fetch the captcha example page and save every image."""
    target = 'https://captcha.com/captcha-examples.html?cst=corg'
    ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
    page = get_Page(target, ua)
    parse_Page(page, ua)


if __name__ == '__main__':
    main()
二、用bs4模块分析代码
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import requests
import time,os
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
def get_Page(url, headers):
    """Return the body of *url* as text, or None when the server does not answer 200."""
    response = requests.get(url, headers=headers)
    return response.text if response.status_code == 200 else None
def parse_Page(html, headers=None):
    """Parse the captcha example page with BeautifulSoup and download every image.

    html    -- page source returned by get_Page()
    headers -- optional request headers for the image downloads; defaults to a
               desktop Chrome User-Agent. (The original read a global `headers`
               defined in the __main__ block, which breaks on import.)

    Side effect: chdirs into the 'qcode' folder and writes one .jpg per captcha.
    """
    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
    soup = BeautifulSoup(html.encode(), 'html.parser', from_encoding='utf-8')
    data_left = soup.select('#main .captcha_images_left')
    data_right = soup.select('#main .captcha_images_right')
    data = {}
    # Create the folder that will hold the captcha images.
    file = 'qcode'
    os.makedirs(file, exist_ok=True)
    os.chdir(file)
    # i == 0 walks the left column, i == 1 the right column.
    for i in range(2):
        for row in zip(data_left, data_right):
            names = row[i].select('h3')    # all <h3> tags (captcha names)
            images = row[i].select('img')  # all <img> tags (captcha pictures)
            for tag_h, tag_img in zip(names, images):
                data[str(tag_h.text).strip() + '.jpg'] = 'https://captcha.com/' + tag_img['src']
    for imgname, imgurl in data.items():
        response = requests.get(imgurl, headers=headers)
        if response.status_code == 200:
            # Write the bytes we already fetched; the original re-downloaded
            # each image with urlretrieve (a second request, sent without the
            # User-Agent header).
            with open(imgname, 'wb') as f:
                f.write(response.content)
        time.sleep(1)  # be polite to the server between downloads
if __name__ == '__main__':
    # Fetch the example page and hand it to the bs4-based parser.
    target_url = 'https://captcha.com/captcha-examples.html?cst=corg'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
    page_source = get_Page(target_url, headers)
    parse_Page(page_source)
以上两种方式下载图片都比较慢,如果想快速爬取图片,最好用多线程和队列,我之前写过一个用多线程爬虫的,地址是 https://blog.csdn.net/m0_37886429/article/details/79005002
改善多线程爬虫,以下是python3的代码
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# author:Mr Yang
import requests
import time,os
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import threading, queue
def get_Page(url, headers):
    """Download *url*; return the page text on HTTP 200, otherwise None."""
    resp = requests.get(url, headers=headers)
    if resp.status_code != 200:
        return None
    return resp.text
def parse_Page(html, urlQueue):
    """Parse the captcha page and enqueue one {filename: url} dict per captcha.

    html     -- page source returned by get_Page()
    urlQueue -- queue.Queue that the downloader threads consume from

    Side effect: chdirs into the 'qcode' folder so the workers write files there.
    """
    soup = BeautifulSoup(html.encode(), 'html.parser', from_encoding='utf-8')
    data_left = soup.select('#main .captcha_images_left')
    data_right = soup.select('#main .captcha_images_right')
    # Create the folder that will hold the captcha images.
    file = 'qcode'
    os.makedirs(file, exist_ok=True)
    os.chdir(file)
    # Visit every column container, left column first then right column.
    # (The original range(2)/zip construction silently dropped containers
    # whenever the two selects returned different counts.)
    for column in data_left + data_right:
        names = column.select('h3')    # all <h3> tags (captcha names)
        images = column.select('img')  # all <img> tags (captcha pictures)
        for tag_h, tag_img in zip(names, images):
            urlQueue.put({str(tag_h.text).strip() + '.jpg': 'https://captcha.com/' + tag_img['src']})
def dowloadimg(urlQueue, headers):
    """Worker thread body: pull {filename: url} items off the queue and save them.

    urlQueue -- queue.Queue filled by parse_Page()
    headers  -- request headers for the image downloads

    The worker exits when the queue is drained.
    """
    while True:
        try:
            data = urlQueue.get_nowait()  # non-blocking read of the next item
        except queue.Empty:
            # Queue drained: this worker is done. (The original caught bare
            # Exception, which also hid real errors; it also kept an unused,
            # racy qsize() snapshot.)
            break
        for imgname, imgurl in data.items():
            response = requests.get(imgurl, headers=headers)
            if response.status_code == 200:
                # Save the bytes we already fetched; the original called
                # urlretrieve here, downloading every image a second time.
                with open(imgname, 'wb') as f:
                    f.write(response.content)
            time.sleep(1)  # be polite to the server between downloads
if __name__ == '__main__':
    # Build the work queue from the page, then fan out download workers.
    page_url = 'https://captcha.com/captcha-examples.html?cst=corg'
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
    work_queue = queue.Queue()
    page_source = get_Page(page_url, request_headers)
    parse_Page(page_source, work_queue)
    # Seven worker threads, matching the original threadNum.
    for _ in range(7):
        worker = threading.Thread(target=dowloadimg, args=(work_queue, request_headers))
        worker.start()