1,首先安装Python类库
安装Tesseract的Python封装库pytesseract,安装方式如下:
pip install pytesseract
2,安装Tesseract-OCR
下载地址:https://digi.bib.uni-mannheim.de/tesseract/
3,设置环境变量。
需要添加两个环境变量
1,path中添加C:\Program Files (x86)\Tesseract-OCR(安装路径)
2,新建一个TESSDATA_PREFIX变量,值为C:\Program Files (x86)\Tesseract-OCR\tessdata
3,在pytesseract源码中将tesseract_cmd修改为:
tesseract_cmd=r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
import numpy
import pytesseract
from PIL import Image, ImageDraw, ImageFont
from fontTools.ttLib import TTFont
def fontConvert(fontPath, rowCount=3):
    """Map the glyph codes of a downloaded web font to the characters they draw.

    The glyph names are read from the font file, every glyph is drawn onto a
    white canvas using that same font, the canvas is run through Tesseract
    OCR (``chi_sim`` language data), and the recognised characters are paired
    with the glyph codes in glyph order.

    Args:
        fontPath: Path of the font file to analyse.
        rowCount: Number of text rows the glyphs are spread over on the
            canvas. Defaults to 3, the value that was previously hard-coded.

    Returns:
        A dict whose keys are the glyph names with ``uni`` replaced by
        ``&#x`` and whose values are the single characters OCR recognised
        at the same position.

    Side effects:
        Prints the glyph names and their per-row split to stdout, and writes
        the rendered canvas to ``sss.jpg`` in the current working directory
        so it can be opened and inspected by hand.
    """
    # Only the glyph order is needed from fontTools; close the font right
    # away so the underlying file handle is not leaked.
    ttFont = TTFont(fontPath)
    try:
        # The first two glyph-order entries are skipped
        # (presumably placeholder glyphs - confirm against the font).
        codeList = ttFont.getGlyphOrder()[2:]
    finally:
        ttFont.close()
    print(codeList)

    rowHeight = 50
    # The canvas stays 1800x1000 unless more rows are requested than fit.
    canvasHeight = max(1000, rowHeight * rowCount)
    im = Image.new("RGB", (1800, canvasHeight), (255, 255, 255))
    dr = ImageDraw.Draw(im)
    drawFont = ImageFont.truetype(fontPath, 40)

    # Split the glyph names into rowCount chunks, one chunk per drawn row.
    arrayList = numpy.array_split(codeList, rowCount)
    print(arrayList)
    for t, row in enumerate(arrayList):
        # Turn "uniXXXX" names into "\uXXXX" escapes, then decode the
        # escapes into the actual characters to draw.
        text = "".join(name.replace("uni", "\\u") for name in row)
        text = text.encode('utf-8').decode('unicode_escape')
        dr.text((0, rowHeight * t), text, font=drawFont, fill="#000000")
    im.save("sss.jpg")

    result = pytesseract.image_to_string(im, lang="chi_sim")
    result = result.replace(" ", "").replace("\n", "")
    codeList = [name.replace("uni", "&#x") for name in codeList]
    # NOTE(review): zip pairs codes and characters purely by position and
    # stops at the shorter input, so the mapping is only correct when OCR
    # returns exactly one character per glyph, in order. No check is made.
    return dict(zip(codeList, result))
if __name__ == '__main__':
    # Script entry point: convert the font file "1.woff" (relative path)
    # and print the resulting code-to-character mapping.
    print(fontConvert('1.woff'))