import logging

import requests
from pyexcel_xls import get_data
class BoshiOCR(object):
    """Client for the Yidao Boshi OCR server HTTP interface.

    Every public recognition method is best-effort: on any failure (bad
    arguments, unreadable image, network error, non-zero ``error_code`` in the
    server reply) it returns its documented "empty" value instead of raising,
    and the reason is written to the module logger.
    """

    # Separator placed between recognised text fragments in the joined string.
    WORDS_SEPARATOR = '#$#'

    _logger = logging.getLogger(__name__)

    def __init__(self, host='http://113.96.61.102:5000', timeout=60):
        """Build the endpoint URLs for one OCR server.

        :param host: scheme + host + port of the OCR server; a trailing slash
            is ignored.
        :param timeout: timeout in seconds passed to every ``requests`` call
            (a ``(connect, read)`` tuple is accepted too). ``None`` waits
            without limit.
        """
        self.host = host.rstrip('/')
        self.timeout = timeout
        self.base_url = f'{self.host}/ocr/v1/'
        # General text recognition.
        self.general_url = f'{self.base_url}general_ex'
        # General table recognition.
        self.table_api_url = f'{self.base_url}table_api'
        # General table recognition, result delivered as an Excel file.
        self.table_test_url = f'{self.base_url}table_test'
        # General text recognition for multi-page PDF documents.
        self.general_multi_pages_url = f'{self.base_url}general_multi_pages'
        # Property ownership certificate recognition.
        self.realestate_cert_url = f'{self.base_url}realestate_cert'
        # Real-estate registration certificate recognition.
        self.realestate_register_url = f'{self.base_url}realestate_register'
        # Property ownership certificate recognition (customised version).
        self.realestate_cert_new_url = f'{self.base_url}realestate_cert_new'
        # Sent only when downloading an image from a URL (see get_content).
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
        }

    @staticmethod
    def read(image_path):
        """Read a local file and return its raw bytes.

        :param image_path: path of the file to read
        :return: file content as bytes, or None if it cannot be read
        """
        try:
            with open(image_path, 'rb') as f:
                return f.read()
        except (OSError, TypeError, ValueError) as exc:
            # TypeError/ValueError cover malformed path arguments (wrong type,
            # embedded NUL); callers only ever see None.
            BoshiOCR._logger.warning('Cannot read %r: %s', image_path, exc)
            return None

    def get_content(self, image_url):
        """Download ``image_url`` and return the response body.

        :param image_url: URL of the image/document to fetch
        :return: response body as bytes, or None on any request failure or
            non-success HTTP status
        """
        try:
            response = requests.get(
                image_url, headers=self.headers, timeout=self.timeout)
            # Without this an HTML error page would be uploaded as an "image".
            response.raise_for_status()
        except requests.RequestException as exc:
            self._logger.warning('Cannot download %r: %s', image_url, exc)
            return None
        return response.content

    def _load_binary(self, url, path):
        """Return the bytes for exactly one of ``url``/``path``, else None."""
        if bool(url) == bool(path):
            # Neither or both were supplied; the two are mutually exclusive.
            return None
        return self.get_content(url) if url else self.read(path)

    def _post_image(self, api_url, image_binary, data=None):
        """Upload the image to ``api_url``; return the reply dict or None.

        None is returned when the request fails, the body is not a JSON
        object, or the reply's ``error_code`` is not 0.
        """
        try:
            response = requests.post(
                api_url,
                files={'image_binary': image_binary},
                data=data,
                timeout=self.timeout,
            )
            payload = response.json()
        except (requests.RequestException, ValueError) as exc:
            # ValueError: body was not valid JSON.
            self._logger.warning('Request to %s failed: %s', api_url, exc)
            return None
        if not isinstance(payload, dict):
            self._logger.warning('Unexpected reply from %s: %r', api_url, payload)
            return None
        if payload.get('error_code') != 0:
            self._logger.warning(
                '%s returned error_code=%r', api_url, payload.get('error_code'))
            return None
        return payload

    @classmethod
    def _join_words(cls, items):
        """Join the ``words`` field of every item; '' if items are malformed."""
        try:
            return cls.WORDS_SEPARATOR.join(
                item.get('words', '') for item in items)
        except (AttributeError, TypeError) as exc:
            cls._logger.warning('Cannot join recognised words: %s', exc)
            return ''

    def general(self, image_url=None, image_path=None):
        """Run general text recognition on one image.

        :param image_url: image URL (give exactly one of the two parameters)
        :param image_path: local image path (give exactly one of the two)
        :return: tuple ``(result, result_words)`` where ``result`` is the
            ``result`` list of the server reply and ``result_words`` is the
            ``words`` of its items joined with ``'#$#'``; ``([], '')`` on
            failure
        """
        image_binary = self._load_binary(image_url, image_path)
        if not image_binary:
            return [], ''
        payload = self._post_image(self.general_url, image_binary)
        if payload is None:
            return [], ''
        result = payload.get('result') or []
        return result, self._join_words(result)

    def table_api(self, image_url=None, image_path=None):
        """Run general table recognition on one image.

        Known issue noted by the original author: recognition misbehaves on
        images that contain a table.

        :param image_url: image URL (give exactly one of the two parameters)
        :param image_path: local image path (give exactly one of the two)
        :return: tuple ``(tables, text, text_words)`` taken from the
            ``tables`` and ``text`` fields of the server reply, with
            ``text_words`` being the ``words`` of the ``text`` items joined
            with ``'#$#'``; ``([], [], '')`` on failure
        """
        image_binary = self._load_binary(image_url, image_path)
        if not image_binary:
            return [], [], ''
        payload = self._post_image(self.table_api_url, image_binary)
        if payload is None:
            return [], [], ''
        tables = payload.get('tables') or []
        text = payload.get('text') or []
        return tables, text, self._join_words(text)

    def table_test(self, image_url=None, image_path=None):
        """Run table recognition and return the content of the produced Excel.

        The server replies with the path of an ``.xls`` file; that file is
        downloaded with :meth:`save_excel` (to ``test.xls`` in the current
        directory) and parsed with :meth:`read_xls`.

        Known issue noted by the original author: recognition misbehaves on
        images that contain no table (this endpoint is the recommended one for
        tables).

        :param image_url: image URL (give exactly one of the two parameters)
        :param image_path: local image path (give exactly one of the two)
        :return: list of ``{'sheet': name, 'content': rows}`` dicts, one per
            sheet; ``[]`` on failure
        """
        image_binary = self._load_binary(image_url, image_path)
        if not image_binary:
            return []
        payload = self._post_image(self.table_test_url, image_binary)
        if payload is None:
            return []
        xls_path = payload.get('result')
        if not xls_path:
            return []
        path = self.save_excel(xls_path)
        if not path:
            return []
        return self.read_xls(path)

    def save_excel(self, xls_path, save_path='test.xls'):
        """Download an ``.xls`` produced by the server and store it locally.

        :param xls_path: path relative to the server host, e.g.
            ``images/ocr/table/38d1fb3cbf8f11ea9accfa163e888bd7.xls``
        :param save_path: local file to write; it is overwritten on every
            call, so concurrent callers should pass distinct paths
        :return: ``save_path`` on success, None on failure
        """
        if not xls_path:
            return None
        url = f'{self.host}/{xls_path}'
        try:
            response = requests.get(url, timeout=self.timeout)
            # Do not persist an error page as if it were a spreadsheet.
            response.raise_for_status()
            with open(save_path, 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError) as exc:
            self._logger.warning('Cannot save %s to %r: %s', url, save_path, exc)
            return None
        return save_path

    def read_xls(self, path):
        """Parse a local ``.xls`` file with ``pyexcel_xls.get_data``.

        :param path: path of the spreadsheet to read
        :return: list of ``{'sheet': name, 'content': rows}`` dicts, one per
            sheet, in the order ``get_data`` yields them
        """
        data = get_data(path)
        return [{'sheet': sheet, 'content': data[sheet]} for sheet in data]

    def general_multi_pages(self, url=None, path=None):
        """Run general text recognition on a multi-page PDF.

        :param url: document URL (give exactly one of the two parameters)
        :param path: local document path (give exactly one of the two)
        :return: tuple ``(result, text)`` where ``result`` is the ``result``
            list of the server reply and ``text`` is the ``words`` of its
            items joined with ``'#$#'``; ``([], '')`` on failure
        """
        image_binary = self._load_binary(url, path)
        if not image_binary:
            return [], ''
        # origin='1' asks the server to output plain-text information.
        payload = self._post_image(
            self.general_multi_pages_url, image_binary, data={'origin': '1'})
        if payload is None:
            return [], ''
        result = payload.get('result') or []
        return result, self._join_words(result)
if __name__ == '__main__':
    # Manual smoke test: run Excel-based table recognition on a local image
    # and print the parsed sheets.
    ocr_client = BoshiOCR()
    sample_image = r'C:\Users\My\Desktop\1.png'
    print(ocr_client.table_test(image_path=sample_image))

    # Other calls that can be tried by hand (each takes either a URL or a
    # local path, never both):
    #
    # General text recognition:
    #   result, result_words = ocr_client.general(
    #       image_url=r'https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3452088381,843690983&fm=26&gp=0.jpg')
    #   result, result_words = ocr_client.general(image_path=r'C:\Users\My\Desktop\2.png')
    #   print(result, '\n', result_words, sep='')
    #
    # General table recognition:
    #   image_url = 'https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3452088381,843690983&fm=26&gp=0.jpg'
    #   tables, text, result_words = ocr_client.table_api(image_url=image_url)
    #   tables, text, result_words = ocr_client.table_api(image_path=r'C:\Users\My\Desktop\2.png')
    #   print(tables, '\n', text, '\n', result_words, sep='')
    #
    # Table recognition via Excel, from a URL:
    #   image_url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1594052571532&di=7bdf6bc4d45425151d0b7d6b6d70357b&imgtype=0&src=http%3A%2F%2Fimg4.imgtn.bdimg.com%2Fit%2Fu%3D3412895284%2C1452357231%26fm%3D214%26gp%3D0.jpg'
    #   result = ocr_client.table_test(image_url=image_url)
    #
    # Multi-page PDF text recognition:
    #   result, text = ocr_client.general_multi_pages(url='http://pythonscraping.com/pages/warandpeace/chapter1.pdf')
    #   result, text = ocr_client.general_multi_pages(
    #       url='https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1594052571532&di=7bdf6bc4d45425151d0b7d6b6d70357b&imgtype=0&src=http%3A%2F%2Fimg4.imgtn.bdimg.com%2Fit%2Fu%3D3412895284%2C1452357231%26fm%3D214%26gp%3D0.jpg')
    #   result, text = ocr_client.general_multi_pages(path=r'C:\Users\My\Desktop\2.png')
    #   print(result, '\n', text, sep='')
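# Yidao Boshi OCR server interface definition.
# (Scraped page residue: "You may also like")
# Reposted from blog.csdn.net/zhu6201976/article/details/107174064
# (Scraped page residue: "Today's recommendations")
# (Scraped page residue: "Weekly ranking")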