为了收集论文里的网络框架图,下载了280多篇论文,一篇一篇打开来看不现实,所以使用 Python 提取论文里的图片,这样一目了然,就能知道里面的框架内容了
pip install pymupdf
网上有很多帖子,但是里面用的 PyMuPDF 都是老版本,很多方法名都改了,现在照搬的话会报错
报错的是 doc._getXrefLength() 和 doc.getObjectString(i) 这两个方法
所以要改方法名:新版本里它们分别叫 doc.xref_length() 和 doc.xref_object(i)
import fitz
import re
import os
# Input PDF and output folder used by the single-file example below.
# NOTE(review): these are raw strings, so every `\\` is two literal backslashes
# in the resulting path -- confirm that the doubled separators are intended.
file_path = r'D:\\baidu\\最最最\\2105.13381.pdf' # path of the PDF file
dir_path = r'D:\\baidu\\最最最\\图片' # folder that receives the extracted images
def pdf2image1(path, pic_path):
    """Extract the embedded images of a PDF and save each one as a PNG file.

    Every object in the PDF cross-reference table whose definition contains
    ``/Subtype /Image`` is turned into a pixmap and written to ``pic_path``
    as ``img_<n>.png``, where ``<n>`` counts from 1 in xref order.

    Args:
        path: Path of the PDF file to read.
        pic_path: Directory that receives the PNG files; it is created when
            it does not exist yet.
    """
    image_marker = re.compile(r"/Subtype(?= */Image)")
    png_colorspaces = (fitz.csGRAY.name, fitz.csRGB.name)
    if pic_path:
        os.makedirs(pic_path, exist_ok=True)
    # The context manager closes the document even when a save fails.
    with fitz.open(path) as pdf:
        count = 1
        # Valid xref numbers start at 1 (entry 0 is reserved in a PDF).
        for xref in range(1, pdf.xref_length()):
            if not image_marker.search(pdf.xref_object(xref)):
                continue
            pix = fitz.Pixmap(pdf, xref)
            colorspace = pix.colorspace
            # PNG output only supports gray and RGB, so anything else (e.g.
            # CMYK) is converted first.  The colorspace can be None for image
            # masks; those pixmaps are passed to save() unconverted.
            if colorspace is not None and colorspace.name not in png_colorspaces:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.save(os.path.join(pic_path, f"img_{count}.png"))
            count += 1
            pix = None  # drop the pixmap before loading the next one
# Run the extraction for the single PDF configured above.
pdf2image1(file_path, dir_path)
完整批处理代码:
import fitz
import re
import os
import tqdm
def pdf2image1(path, pic_path, j):
    """Extract the embedded images of one PDF and save each one as a PNG file.

    Every object in the PDF cross-reference table whose definition contains
    ``/Subtype /Image`` is turned into a pixmap, printed, and written to
    ``pic_path`` as ``img_<j>_<n>.png``, where ``<n>`` counts from 1 in xref
    order.

    Args:
        path: Path of the PDF file to read.
        pic_path: Directory that receives the PNG files; it is created when
            it does not exist yet.
        j: Value embedded in the output file names so that images coming
            from different PDFs do not overwrite each other.
    """
    image_marker = re.compile(r"/Subtype(?= */Image)")
    png_colorspaces = (fitz.csGRAY.name, fitz.csRGB.name)
    if pic_path:
        os.makedirs(pic_path, exist_ok=True)
    # The context manager closes the document even when a save fails.
    with fitz.open(path) as pdf:
        count = 1
        # Valid xref numbers start at 1 (entry 0 is reserved in a PDF).
        for xref in range(1, pdf.xref_length()):
            if not image_marker.search(pdf.xref_object(xref)):
                continue
            pix = fitz.Pixmap(pdf, xref)
            colorspace = pix.colorspace
            # PNG output only supports gray and RGB, so anything else (e.g.
            # CMYK) is converted first.  The colorspace can be None for image
            # masks; guarding against that avoids an AttributeError on .name.
            if colorspace is not None and colorspace.name not in png_colorspaces:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            print(pix)
            pix.save(os.path.join(pic_path, f"img_{j}_{count}.png"))
            count += 1
            pix = None  # drop the pixmap before loading the next one
# Batch driver: extract the images of every PDF found in `path`.
path = r'D:\\baidu\\最最最\\新建文件夹 (3)' # folder containing the PDF files
dir_path = r'D:\\baidu\\最最最\\图片' # folder that receives the extracted images
# Keep only PDF entries; sorting keeps the per-file index j stable across runs.
fil = sorted(name for name in os.listdir(path) if name.lower().endswith('.pdf'))
# Wrapping the list (not the enumerate object) lets tqdm show the total count.
for j, name in enumerate(tqdm.tqdm(fil)):
    # Build the path from the current entry so that each PDF is processed,
    # instead of re-extracting one hard-coded file on every iteration.
    file_path = os.path.join(path, name)
    print(file_path)
    pdf2image1(file_path, dir_path, j)