使用 PaddleOCR 获取 PDF 的内容
企业开发
2023-08-15 20:54:27
阅读次数: 0
1. 环境安装
2. 实现代码
- 代码
import cv2
import fitz
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR, draw_ocr
# Build the OCR engine with text-angle classification enabled and the
# Chinese ("ch") language setting.
# NOTE(review): page_num=427 is forwarded to PaddleOCR; per its PDF support it
# appears to cap how many leading pages of a PDF are recognised -- confirm
# against the installed PaddleOCR version.
ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=427)
img_path = 'data/深度学习进阶自然语言处理.pdf'

# Run OCR over the whole PDF; `result` is indexed per page below.
result = ocr.ocr(img_path, cls=True)

# Print every recognised line, page by page.
for idx, res in enumerate(result):
    # PaddleOCR may hand back None for a page on which no text was detected;
    # iterating None would raise TypeError, so treat it as "no lines".
    for line in res or []:
        print(line)
# Render every page of the PDF to an OpenCV-style BGR array so the OCR boxes
# can later be drawn on top of it. imgs[i] is the rendering of page i.
imgs = []
with fitz.open(img_path) as pdf:
    # `page_count` / `get_pixmap` are the current PyMuPDF names; the camelCase
    # aliases (`pageCount` / `getPixmap`) were deprecated and are no longer
    # available in recent PyMuPDF releases.
    for pg in range(pdf.page_count):
        page = pdf[pg]
        # First try a 2x zoom rendering.
        pm = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
        # If the enlarged page exceeds 2000 px on either side, fall back to
        # the unscaled (1x) rendering instead.
        # NOTE(review): presumably this mirrors the scaling PaddleOCR applies
        # when it rasterises the PDF itself, so box coordinates line up with
        # these images -- confirm.
        if pm.width > 2000 or pm.height > 2000:
            pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
        # The pixmap is read as RGB; convert to BGR channel order for OpenCV.
        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        imgs.append(img)
# For each OCR'd page: write the recognised text to data/data_txt/text_<idx>.txt
# (one line of text per row) and save an annotated rendering of the page to
# data/images/page_<idx>.jpg. Both output directories must already exist.
for idx, res in enumerate(result):
    # PaddleOCR may return None for a page with no detected text; normalise it
    # to an empty list so the comprehensions below do not raise TypeError.
    page_lines = res or []
    # Each entry is indexed as [box, (text, score)].
    boxes = [line[0] for line in page_lines]
    txts = [line[1][0] for line in page_lines]
    scores = [line[1][1] for line in page_lines]

    # The file name must sit on a single line: a single-quoted f-string that
    # is broken inside the braces is a SyntaxError before Python 3.12.
    with open(f'data/data_txt/text_{idx}.txt', 'w', encoding='utf-8') as f:
        f.writelines(f'{txt}\n' for txt in txts)

    # Nothing to annotate on an empty page; the (empty) text file is still
    # written above so the text outputs stay aligned with page indices.
    if not page_lines:
        continue

    image = imgs[idx]
    # NOTE(review): the font path is relative to the working directory --
    # presumably the PaddleOCR repository root; verify before running.
    im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
    im_show = Image.fromarray(im_show)
    im_show.save('data/images/page_{}.jpg'.format(idx))
- 结果展示
转载自blog.csdn.net/m0_46926492/article/details/132020567