【一天一个Python小案例】验证码识别（基于百度OCR）

【一天一个Python小案例】验证码识别

第一步：爬取验证码示例页面中的验证码图片，并保存到本地的 ./verification_codes 目录（原文此处为配图，图片未能显示）。

import requests, os, time
from lxml import etree


def get_page(url, headers, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        headers: HTTP request headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without this a stalled server would
            block the script indefinitely.

    Returns:
        The response text when the status code is 200, otherwise ``None``.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text


def parse_page(html, headers):
    """Download every captcha image referenced in *html* to disk.

    The page is searched for the ``captcha_images_left`` and
    ``captcha_images_right`` containers. Inside each container every
    ``<h3>`` title is paired with the image ``src`` at the same position,
    and the image is saved as ``./verification_codes/<title>.jpg``.

    Args:
        html: Page markup as a string. ``None`` or an empty string is
            accepted and makes the function return without doing anything.
        headers: HTTP request headers used for the image downloads.

    Returns:
        None. Progress is reported with ``print``.
    """
    if not html:
        # Callers may hand over None when the page download failed;
        # there is nothing to parse in that case.
        return

    html_lxml = etree.HTML(html)
    datas = html_lxml.xpath(
        './/div[@class="captcha_images_left"]|.//div[@class="captcha_images_right"]'
    )

    # Build file paths explicitly instead of calling os.chdir(): changing the
    # working directory leaks out of this function and would nest the folder
    # inside itself if the function were called a second time.
    folder = './verification_codes'
    os.makedirs(folder, exist_ok=True)

    for data in datas:
        titles = data.xpath('.//h3')
        sources = data.xpath('.//div/img/@src')
        count = 0
        for title, src in zip(titles, sources):
            if title.text is None:
                # No usable text to build a file name from.
                continue
            img_url = 'https://captcha.com/' + src
            response = requests.get(img_url, headers=headers, timeout=10)
            if response.status_code != 200:
                # Skip failed downloads rather than writing stale or
                # undefined image data under this file name.
                continue
            filename = os.path.join(folder, title.text + '.jpg')
            with open(filename, 'wb') as f:
                f.write(response.content)
            # Only images that were actually written are counted.
            count += 1
        print('Save the {} verification code successfully '.format(count))

        # Pause between containers to avoid hammering the server.
        time.sleep(1)


def main():
    """Fetch the captcha examples page and save the images it lists."""
    url = 'https://captcha.com/captcha-examples.html?cst=corg'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
    }
    html = get_page(url, headers)
    if html is None:
        # get_page returns None for a non-200 response; stop here instead
        # of handing None to the HTML parser.
        print('Failed to download {}'.format(url))
        return
    parse_page(html, headers)


if __name__ == '__main__':
    main()

第二步：调用百度 OCR 通用文字识别接口，逐张识别 ./verification_codes 目录中已保存的验证码图片（原文此处为配图，图片未能显示）。

from aip import AipOcr
import os, time

# Counters for the final summary.
i = 0  # images in which no text was recognized
j = 0  # images in which at least one text line was recognized

# Baidu OCR credentials. Environment variables take precedence; the literal
# fallbacks keep the script working as before.
# NOTE(review): real keys should not be committed to source control -- confirm
# whether these fallbacks can be removed.
APP_ID = os.environ.get('BAIDU_OCR_APP_ID', '21416730')
API_KEY = os.environ.get('BAIDU_OCR_API_KEY', 'k3xPcHRwwgpbqwGDWTpsPRM7')
SECRET_KEY = os.environ.get('BAIDU_OCR_SECRET_KEY', 'e1GzUwW4DzH6G6eMQb4oGgy2F05ZKlkM')
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

file_path = './verification_codes'
filenames = os.listdir(file_path)

# Request parameters are identical for every image, so build them once.
options = {
    'detect_direction': 'true',
    'language_type': 'ENG',
}

for filename in filenames:
    # Full path of the current entry inside the image folder.
    info = os.path.join(file_path, filename)
    if not os.path.isfile(info):
        # Sub-directories and other non-file entries cannot be read as images.
        continue

    # Read the bytes first so the file is closed before the network call.
    with open(info, 'rb') as fp:
        image = fp.read()

    # Call the general text recognition interface.
    result = client.basicGeneral(image, options)
    print(result)

    # A reply without 'words_result_num' (for example an error reply) is
    # counted as unrecognized instead of raising KeyError.
    if result.get('words_result_num', 0) == 0:
        print(filename + ':' + '----')
        i += 1
    else:
        for word in result.get('words_result', []):
            print(filename + ' : ' + word['words'])
        j += 1

    # Brief pause between requests.
    time.sleep(0.2)

print('Total identification verification code {} sheets '.format(i + j))
print('Unrecognized text {} sheets'.format(i))
print('recognized text {} sheets'.format(j))

【一天一个Python小案例】验证码识别（基于百度OCR）

【一天一个Python小案例】验证码识别

猜你喜欢