import requests, os, time
from lxml import etree
def get_page(url, headers, timeout=10):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The response text when the status code is 200, otherwise ``None``.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None


def parse_page(html, headers, save_dir='./verification_codes', timeout=10):
    """Download the captcha images referenced in ``html`` into ``save_dir``.

    Each ``captcha_images_left`` / ``captcha_images_right`` container is
    scanned for ``<h3>`` headings (used as file names) and ``<img>`` sources
    (resolved against ``https://captcha.com/``).  Images are written to
    ``save_dir`` by path; the process working directory is left untouched so
    that relative paths used later in the program keep their meaning.

    Args:
        html: Page markup as returned by ``get_page``; ``None`` or an empty
            string is accepted and results in no work being done.
        headers: HTTP headers sent with every image request.
        save_dir: Folder that receives the ``.jpg`` files; created on demand.
        timeout: Seconds to wait for each image request.

    Returns:
        None.
    """
    if not html:
        # get_page() yields None for a non-200 response: nothing to parse.
        return

    html_lxml = etree.HTML(html)
    datas = html_lxml.xpath(
        './/div[@class="captcha_images_left"]'
        '|.//div[@class="captcha_images_right"]'
    )

    # Create the folder that stores the verification codes (no error if it
    # already exists, and no check-then-create race).
    os.makedirs(save_dir, exist_ok=True)

    for data in datas:
        # Verification code names and the matching image links.
        names = data.xpath('.//h3')
        srcs = data.xpath('.//div/img/@src')
        # The counter restarts for every container, as in the original code.
        count = 0
        # zip() stops at the shorter list, so a heading without an image
        # (or the reverse) is skipped instead of raising IndexError.
        for title, src in zip(names, srcs):
            if not title.text:
                # A heading without direct text cannot provide a file name.
                continue
            # The heading comes from a remote page: keep only its final path
            # component so it cannot point outside ``save_dir``.
            filename = os.path.join(
                save_dir, os.path.basename(title.text) + '.jpg'
            )
            img_url = 'https://captcha.com/' + src
            response = requests.get(img_url, headers=headers, timeout=timeout)
            if response.status_code != 200:
                continue
            with open(filename, 'wb') as f:
                f.write(response.content)
            count += 1
            print('Save the {} verification code successfully '.format(count))
            # Pause between downloads to avoid hammering the server.
            time.sleep(1)


def main():
    """Download the captcha example page and save every image it lists."""
    url = 'https://captcha.com/captcha-examples.html?cst=corg'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
    }
    html = get_page(url, headers)
    if html is None:
        # Report the failed download instead of crashing inside the parser.
        print('Failed to download {}'.format(url))
        return
    parse_page(html, headers)


if __name__ == '__main__':
    main()
from aip import AipOcr
import os, time
# Run text recognition on every image saved in ``file_path`` and report how
# many images produced text and how many did not.
i = 0  # images for which no text was recognized
j = 0  # images for which at least one line of text was recognized

# NOTE(review): these credentials are committed in source. They can be
# overridden through the environment; the literals remain only as
# backward-compatible fallbacks and should be rotated/removed.
APP_ID = os.environ.get('BAIDU_OCR_APP_ID', '21416730')
API_KEY = os.environ.get('BAIDU_OCR_API_KEY', 'k3xPcHRwwgpbqwGDWTpsPRM7')
SECRET_KEY = os.environ.get(
    'BAIDU_OCR_SECRET_KEY', 'e1GzUwW4DzH6G6eMQb4oGgy2F05ZKlkM'
)
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

file_path = './verification_codes'
filenames = os.listdir(file_path)

# Request parameters are identical for every image, so build them once.
options = {
    'detect_direction': 'true',
    'language_type': 'ENG',
}

for filename in filenames:
    # Combining the folder with the file name gives the full path of a file.
    info = os.path.join(file_path, filename)
    if not os.path.isfile(info):
        # Sub-directories and other non-files cannot be opened as images.
        continue
    with open(info, 'rb') as fp:
        image = fp.read()

    # Call the universal text recognition interface on the local image.
    result = client.basicGeneral(image, options)
    print(result)

    # NOTE(review): presumably an error response carries no
    # 'words_result_num' key - confirm against the service documentation.
    # Such a response is counted as unrecognized instead of raising KeyError
    # and aborting the whole batch.
    if result.get('words_result_num', 0) == 0:
        print(filename + ':' + '----')
        i += 1
    else:
        for word in result.get('words_result', []):
            print(filename + ' : ' + word['words'])
        # Count the image once, however many text lines it produced, so the
        # totals below stay in "sheets".
        j += 1
    # Short pause between requests to the recognition service.
    time.sleep(0.2)

print('Total identification verification code {} sheets '.format(i + j))
print('Unrecognized text {} sheets'.format(i))
print('recognized text {} sheets'.format(j))