# 易道博识服务器接口定义 (Yidao Boshi OCR server interface definitions)

import logging

import requests
from pyexcel_xls import get_data


class BoshiOCR(object):
    """Client for the Yidao Boshi (易道博识) OCR server HTTP API.

    Each recognition method accepts either a remote URL or a local file path
    (exactly one of the two), uploads the raw bytes to the matching endpoint
    and returns the decoded result.

    The methods are best-effort: on bad arguments, unreadable input, network
    errors or a non-zero ``error_code`` in the server reply they return empty
    values instead of raising, and the reason is written to the module logger.
    """

    # Separator inserted between recognised text fragments in joined output.
    WORDS_SEPARATOR = '#$#'

    _logger = logging.getLogger(__name__)

    def __init__(self, host='http://113.96.61.102:5000', timeout=(10, 300)):
        """Build the endpoint URLs for the given server.

        :param host: scheme, address and port of the OCR server; a trailing
            slash is ignored
        :param timeout: value passed as ``timeout`` to every ``requests``
            call (seconds, or a ``(connect, read)`` tuple); ``None`` waits
            forever
        """
        self.host = host.rstrip('/')
        self.timeout = timeout
        self.base_url = f'{self.host}/ocr/v1/'
        self.general_url = f'{self.base_url}general_ex'  # general text recognition
        self.table_api_url = f'{self.base_url}table_api'  # general table recognition
        self.table_test_url = f'{self.base_url}table_test'  # general table recognition (excel output)
        self.general_multi_pages_url = f'{self.base_url}general_multi_pages'  # general text recognition (multi-page PDF)
        # The three endpoints below are defined but no method of this class calls them.
        self.realestate_cert_url = f'{self.base_url}realestate_cert'  # property certificate recognition
        self.realestate_register_url = f'{self.base_url}realestate_register'  # real-estate registration certificate recognition
        self.realestate_cert_new_url = f'{self.base_url}realestate_cert_new'  # property certificate recognition (customised version)
        # Browser-like User-Agent, sent only when downloading the input image.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
        }

    @staticmethod
    def read(image_path):
        """Read a local file and return its bytes.

        :param image_path: path of the file to read
        :return: file content as ``bytes``, or ``None`` if it cannot be read
        """
        try:
            with open(image_path, 'rb') as f:
                return f.read()
        except (OSError, TypeError, ValueError):
            BoshiOCR._logger.warning('Cannot read %r', image_path, exc_info=True)
            return None

    def get_content(self, image_url):
        """Download ``image_url`` and return the response body.

        :param image_url: URL to fetch
        :return: response body as ``bytes``, or ``None`` when the request
            fails or the server answers with an HTTP error status
        """
        try:
            response = requests.get(image_url, headers=self.headers, timeout=self.timeout)
            # Do not forward an HTML error page to the OCR server as if it were an image.
            response.raise_for_status()
            return response.content
        except Exception:
            self._logger.warning('Cannot download %r', image_url, exc_info=True)
            return None

    def _load_binary(self, url, path):
        """Return the input bytes from exactly one of ``url``/``path``, else ``None``."""
        # Supplying both or neither is a usage error; callers map None to an empty result.
        if bool(url) == bool(path):
            return None
        return self.get_content(url) if url else self.read(path)

    def _post(self, api_url, image_binary, data=None):
        """Upload ``image_binary`` to ``api_url``; return the reply dict if ``error_code`` is 0, else ``None``."""
        try:
            response = requests.post(
                api_url,
                files={'image_binary': image_binary},
                data=data,
                timeout=self.timeout,
            )
            payload = response.json()
            error_code = payload.get('error_code')
        except Exception:
            self._logger.warning('Request to %s failed', api_url, exc_info=True)
            return None
        if error_code != 0:
            self._logger.warning('%s returned error_code=%r', api_url, error_code)
            return None
        return payload

    @classmethod
    def _join_words(cls, items):
        """Join the ``'words'`` value of every item with ``WORDS_SEPARATOR``; ``''`` on malformed items."""
        try:
            return cls.WORDS_SEPARATOR.join((item.get('words') or '') for item in items)
        except (AttributeError, TypeError):
            cls._logger.warning('Unexpected item structure in OCR reply', exc_info=True)
            return ''

    def general(self, image_url=None, image_path=None):
        """General text recognition.

        :param image_url: image URL (give this or ``image_path``, not both)
        :param image_path: local image path (give this or ``image_url``, not both)
        :return: tuple ``(result, result_words)`` where ``result`` is the
            ``'result'`` list of the reply and ``result_words`` the
            ``'words'`` of its items joined with ``'#$#'``; ``([], '')`` on failure
        """
        image_binary = self._load_binary(image_url, image_path)
        if not image_binary:
            return [], ''

        payload = self._post(self.general_url, image_binary)
        if payload is None:
            return [], ''

        result = payload.get('result') or []
        return result, self._join_words(result)

    def table_api(self, image_url=None, image_path=None):
        """General table recognition.

        Known issue noted by the original author: images that contain a
        table are recognised abnormally by this endpoint.

        :param image_url: image URL (give this or ``image_path``, not both)
        :param image_path: local image path (give this or ``image_url``, not both)
        :return: tuple ``(tables, text, text_words)`` taken from the
            ``'tables'`` and ``'text'`` keys of the reply, with
            ``text_words`` the ``'words'`` of the text items joined with
            ``'#$#'``; ``([], [], '')`` on failure
        """
        image_binary = self._load_binary(image_url, image_path)
        if not image_binary:
            return [], [], ''

        payload = self._post(self.table_api_url, image_binary)
        if payload is None:
            return [], [], ''

        tables = payload.get('tables') or []
        text = payload.get('text') or []
        return tables, text, self._join_words(text)

    def table_test(self, image_url=None, image_path=None):
        """General table recognition with Excel output (recommended for tables).

        The server replies with the path of a generated ``.xls`` file, which
        is downloaded with :meth:`save_excel` and parsed with :meth:`read_xls`.

        Known issue noted by the original author: images without a table are
        recognised abnormally by this endpoint.

        :param image_url: image URL (give this or ``image_path``, not both)
        :param image_path: local image path (give this or ``image_url``, not both)
        :return: list of ``{'sheet': name, 'content': rows}`` dicts, one per
            sheet; ``[]`` on failure
        """
        image_binary = self._load_binary(image_url, image_path)
        if not image_binary:
            return []

        payload = self._post(self.table_test_url, image_binary)
        if payload is None:
            return []

        # save_excel returns None for a missing/empty server path.
        local_path = self.save_excel(payload.get('result'))
        if not local_path:
            return []

        try:
            return self.read_xls(local_path)
        except Exception:
            # Keep this method best-effort like the others when the
            # downloaded workbook cannot be parsed.
            self._logger.warning('Cannot parse %r', local_path, exc_info=True)
            return []

    def save_excel(self, xls_path, save_path='test.xls'):
        """Download a server-side ``.xls`` file and store it locally.

        :param xls_path: path relative to the server host, e.g.
            ``images/ocr/table/38d1fb3cbf8f11ea9accfa163e888bd7.xls``
        :param save_path: local file to write; it is overwritten if present
        :return: ``save_path`` on success, ``None`` on failure
        """
        if not xls_path:
            return None

        try:
            url = f'{self.host}/{xls_path}'
            response = requests.get(url, timeout=self.timeout)
            # An HTTP error body is not a workbook; do not write it to disk.
            response.raise_for_status()
            with open(save_path, 'wb') as f:
                f.write(response.content)
        except Exception:
            self._logger.warning('Cannot save %r to %r', xls_path, save_path, exc_info=True)
            return None
        return save_path

    def read_xls(self, path):
        """Read a local ``.xls`` file via ``pyexcel_xls.get_data``.

        :param path: path of the workbook
        :return: list of ``{'sheet': name, 'content': rows}`` dicts in the
            order the sheets are returned by ``get_data``
        """
        data = get_data(path)
        return [{'sheet': name, 'content': rows} for name, rows in data.items()]

    def general_multi_pages(self, url=None, path=None):
        """General text recognition for multi-page PDF input.

        :param url: document URL (give this or ``path``, not both)
        :param path: local document path (give this or ``url``, not both)
        :return: tuple ``(result, text)`` where ``result`` is the
            ``'result'`` list of the reply and ``text`` the ``'words'`` of
            its items joined with ``'#$#'``; ``([], '')`` on failure
        """
        image_binary = self._load_binary(url, path)
        if not image_binary:
            return [], ''

        # origin=1 asks the server for plain-text output (per the original author's note).
        payload = self._post(self.general_multi_pages_url, image_binary, data={'origin': '1'})
        if payload is None:
            return [], ''

        result = payload.get('result') or []
        return result, self._join_words(result)


if __name__ == '__main__':
    # Manual smoke test: run Excel-based table recognition on a local image
    # and print the parsed sheets.
    #
    # The other entry points can be tried the same way, with either a URL
    # or a local path (never both):
    #   boshi_ocr.general(image_url=...)            -> (result, result_words)
    #   boshi_ocr.table_api(image_path=...)         -> (tables, text, result_words)
    #   boshi_ocr.table_test(image_url=...)         -> result
    #   boshi_ocr.general_multi_pages(url=...)      -> (result, text)
    # Sample inputs used previously:
    #   https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3452088381,843690983&fm=26&gp=0.jpg
    #   http://pythonscraping.com/pages/warandpeace/chapter1.pdf
    #   C:\Users\My\Desktop\2.png
    sample_image = r'C:\Users\My\Desktop\1.png'

    boshi_ocr = BoshiOCR()
    result = boshi_ocr.table_test(image_path=sample_image)
    print(result)
# 易道博识服务器接口定义

# 猜你喜欢