Python 读取 PDF 文件中的文本、表格、图片
提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
一、文本读取
基于fitz
import fitz
# Extract the plain text of every page of a PDF, skipping image blocks and
# very short blocks, and print the concatenated result.
pdf_file = "example.pdf"

# Blocks whose bounding box is lower than this (in page coordinate units)
# are dropped. This is only an example heuristic: filter further on block
# position or other attributes as needed.
MIN_BLOCK_HEIGHT = 20

text_parts = []
# The context manager closes the document even if extraction raises.
with fitz.open(pdf_file) as pdf_document:
    for page_number in range(len(pdf_document)):
        page = pdf_document.load_page(page_number)
        # Each entry is (x0, y0, x1, y1, text, block_no, block_type).
        for block in page.get_text("blocks"):
            _, y0, _, y1 = block[:4]
            text_block = block[4]
            block_type = block[6]
            # Filter out small blocks by the height of their bounding box.
            if y1 - y0 < MIN_BLOCK_HEIGHT:
                continue
            # block_type 0 is text, 1 is an image. Testing the type avoids
            # discarding real text that merely contains the word "image".
            if block_type != 0:
                continue
            text_parts.append(text_block)

# Join once instead of growing a string inside the loop.
text = "".join(text_parts)
print(text)
二、图片读取
基于fitz
import fitz
# Save every image embedded in the PDF as a PNG file in the working
# directory, named "page_<page index>-image_<image number>.png".
with fitz.open("example.pdf") as doc:  # closed automatically, even on error
    for page_index, page in enumerate(doc):  # page_index is 0-based
        image_list = page.get_images()

        # Report the number of images found on the page.
        if image_list:
            print(f"Found {len(image_list)} images on page {page_index}")
        else:
            print("No images found on page", page_index)

        # Image numbering within a page starts at 1.
        for image_index, img in enumerate(image_list, start=1):
            xref = img[0]  # the XREF of the image is the first item
            pix = fitz.Pixmap(doc, xref)  # create a Pixmap from the XREF
            if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.save(f"page_{page_index}-image_{image_index}.png")
            pix = None  # drop the reference so the pixmap can be freed
三、表格读取
基于fitz(注意:以下代码与"二、图片读取"中的代码相同,只会导出页面内嵌的图片,并不解析表格结构)
import fitz
# NOTE: although this snippet sits under the table-reading heading, it only
# exports the images embedded in each page as PNG files; it does not parse
# table structure. A table is captured only if the PDF stores it as an image.
with fitz.open("example.pdf") as doc:  # closed automatically, even on error
    for page_index, page in enumerate(doc):  # page_index is 0-based
        image_list = page.get_images()

        # Report the number of images found on the page.
        if image_list:
            print(f"Found {len(image_list)} images on page {page_index}")
        else:
            print("No images found on page", page_index)

        # Image numbering within a page starts at 1.
        for image_index, img in enumerate(image_list, start=1):
            xref = img[0]  # the XREF of the image is the first item
            pix = fitz.Pixmap(doc, xref)  # create a Pixmap from the XREF
            if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                pix = fitz.Pixmap(fitz.csRGB, pix)
            # Files are named "page_<page index>-image_<image number>.png".
            pix.save(f"page_{page_index}-image_{image_index}.png")
            pix = None  # drop the reference so the pixmap can be freed
基于fitz,将表格数据当作文本内容抽取
import fitz
# Dump the plain text of every page (table cells included, as ordinary text)
# into output.txt, with a form feed after each page.
# Both resources are closed by the with-statement, even if a page fails.
with fitz.open("example.pdf") as doc, open("output.txt", "wb") as out:
    for page in doc:  # iterate the document pages
        text = page.get_text().encode("utf8")  # plain text, encoded as UTF-8
        out.write(text)  # write text of page
        out.write(b"\x0c")  # page delimiter (form feed 0x0C)
基于pdfplumber
import pdfplumber
import pandas as pd
# Open the PDF; the context manager closes it when the block exits.
with pdfplumber.open("example.pdf") as pdf:
    # Pages are 0-indexed, so index 1 is the second page.
    second_page = pdf.pages[1]
    # Detect the tables on the page; each table is a list of rows.
    tables = second_page.extract_tables(table_settings={})
    for table in tables:
        if not table:
            # No header row to build column names from.
            continue
        # The first row supplies the column names, the rest are the data.
        frame = pd.DataFrame(table[1:], columns=table[0])
        print(frame)