一.简介
PDF(Portable Document Format),中文名称为“便携式文档格式”,是我们经常会接触到的一种文件格式,文献、文档等很多都是PDF格式。它具有格式稳定的优势,使得我们在打印、分享、传输过程中能够最大程度地保持原有的色彩和格式。
二.PyPDF2
PyPDF2是一个第三方的python PDF库,它能够对PDF文件进行分割、合并、裁剪和转换页面。
另外,它还可以对PDF文件添加自定义数据、水印、密码,也可以从PDF文件中检索出文本和元数据。
1.安装
使用pip直接安装:
pip install PyPDF2
2.删除PDF页
注意:用这种方式删除页面后,生成的PDF不再保留原有的目录(书签)。
from PyPDF2 import PdfFileWriter, PdfFileReader
import os
def delete_pdf(index):
    """Write a copy of the source PDF that omits the given pages.

    Pages are read from the module-level reader ``input1`` and appended to
    the module-level writer ``output``; the result is saved as
    ``PyPDF2-output.pdf`` in the current working directory.

    :param index: iterable of 1-based page numbers to leave out.
    :return: None
    """
    # Build the lookup once: O(1) membership tests, and one-shot iterables
    # (e.g. generators) are consumed exactly once.
    skipped = set(index)
    for i in range(input1.getNumPages()):
        # Reader indices are 0-based while ``index`` holds 1-based numbers.
        if i + 1 in skipped:
            continue
        output.addPage(input1.getPage(i))
    # The context manager guarantees the output file is flushed and closed,
    # even if writing fails part-way through.
    with open("PyPDF2-output.pdf", "wb") as output_stream:
        output.write(output_stream)
# Work inside the folder that holds the sample PDF so relative names resolve.
os.chdir(r'F:\file\pyfile\1\Code\python-code\自动化\PDF')
output = PdfFileWriter()
# Keep the source file open for as long as the reader is in use (pages are
# pulled from the stream when the output is written), then close it
# deterministically instead of leaking the handle.
with open("数学建模技能图谱.pdf", "rb") as source_stream:
    input1 = PdfFileReader(source_stream)
    delete_pdf([2, 3, 4])
3.合并PDF
from PyPDF2 import PdfFileWriter, PdfFileReader
# Writer that accumulates the pages of the merged document.
output = PdfFileWriter()
# input1 is the base document; input2 supplies the pages that get inserted.
# Both streams stay open because merge_pdf() reads from them later.
example_stream = open("example.pdf", "rb")
input1 = PdfFileReader(example_stream)
simple2_stream = open("simple2.pdf", "rb")
input2 = PdfFileReader(simple2_stream)
def merge_pdf(add_index, origin_index):
    """Insert pages of ``input2`` in front of selected pages of ``input1``.

    ``add_index[j]`` and ``origin_index[j]`` form a pair: the page of
    ``input2`` at 0-based index ``origin_index[j]`` is inserted immediately
    before the page of ``input1`` whose 1-based number is ``add_index[j]``.
    Pairing is by position in the two sequences, so ``add_index`` does not
    have to be sorted, and a position listed several times receives several
    inserted pages. Positions outside ``input1``'s page range are ignored.

    The merged document is accumulated in the module-level writer ``output``
    and saved as ``PyPDF2-output.pdf`` in the current working directory.

    :param add_index: 1-based page numbers of ``input1`` to insert before.
    :param origin_index: 0-based page indices of ``input2`` to insert.
    :raises IndexError: if ``origin_index`` has fewer entries than ``add_index``.
    :return: None
    """
    if len(origin_index) < len(add_index):
        raise IndexError(
            "origin_index must supply one page for every entry in add_index")

    # Map each insertion position to the source pages destined for it, so the
    # pairing no longer depends on the order in which positions are visited.
    inserts = {}
    for position, source_page in zip(add_index, origin_index):
        inserts.setdefault(position, []).append(source_page)

    for i in range(input1.getNumPages()):
        for source_page in inserts.get(i + 1, ()):
            output.addPage(input2.getPage(source_page))
        output.addPage(input1.getPage(i))

    # Close (and flush) the output file deterministically.
    with open("PyPDF2-output.pdf", "wb") as output_stream:
        output.write(output_stream)
# Insert page 0 of the second document before pages 2, 3 and 4 of the first.
merge_pdf(add_index=[2, 3, 4], origin_index=[0, 0, 0])
- 导入PyPDF2的合并模块 PdfFileMerger;
- 读取需要处理和合并的PDF文档;
- 从第一个PDF文档中取出需要合并的前3页;
- 把第二个PDF文档的第一页插入到文档中;
- 把第三个PDF文档附到输出文档末尾;
除了上述介绍的2项主要功能,PyPDF2也有一些其他小功能:
4.旋转
# Rotate the page at index 1 by 90 degrees clockwise.
page_to_rotate = input1.getPage(1)
page_to_rotate.rotateClockwise(90)
使索引为1的页面(即第2页,页面索引从0开始)顺时针旋转90度。
5.添加与去除水印
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.pdf import ContentStream
from PyPDF2.generic import TextStringObject, NameObject
from PyPDF2.utils import b_
import os
def remove_watermark(input_file, output_file):
    """
    Remove a text watermark from a PDF by blanking ``Tj`` text operands.

    Every operand drawn with the ``Tj`` (show text) operator is replaced by
    an empty string, so ALL text rendered through that operator is removed,
    not only the watermark.

    :param input_file: path of the PDF to clean
    :param output_file: path the cleaned PDF is written to
    :return: None
    """
    with open(input_file, "rb") as f:
        # The second positional parameter of PdfFileReader is ``strict``,
        # not a file mode, so it is passed explicitly by keyword.
        source = PdfFileReader(f, strict=True)
        # Writer that collects the cleaned pages.
        output = PdfFileWriter()
        show_text = b_("Tj")
        for page_number in range(source.getNumPages()):
            page = source.getPage(page_number)
            content_object = page.getContents()
            # Pages without a content stream have nothing to clean.
            if content_object is not None:
                # Parse the raw content into (operands, operator) pairs.
                content = ContentStream(content_object, source)
                for operands, operator in content.operations:
                    if operator == show_text:
                        # Blank the text that would have been drawn.
                        operands[0] = TextStringObject('')
                # Swap the page's content for the edited stream.
                page[NameObject('/Contents')] = content
            output.addPage(page)
        # Write while the input file is still open: the pages added above
        # still refer back to the source reader.
        with open(output_file, "wb") as outputStream:
            output.write(outputStream)
def create_watermark(input_pdf, output, watermark):
    """
    Overlay the first page of a watermark PDF onto every page of a document.

    :param input_pdf: source PDF, handed to PdfFileReader
    :param output: path of the watermarked PDF to write
    :param watermark: PDF whose first page is used as the watermark
    :return: None
    """
    stamp_reader = PdfFileReader(watermark)
    stamp = stamp_reader.getPage(0)

    reader = PdfFileReader(input_pdf)
    writer = PdfFileWriter()

    # Merge the watermark into each page, then queue the page for output.
    for page_number in range(reader.getNumPages()):
        current = reader.getPage(page_number)
        current.mergePage(stamp)
        writer.addPage(current)

    with open(output, 'wb') as out_file:
        writer.write(out_file)
# Switch to the folder holding the sample PDFs so relative names resolve.
pdf_dir = r'F:\file\pyfile\1\Code\python-code\自动化\PDF'
os.chdir(pdf_dir)
# Example of adding a watermark (left disabled):
# create_watermark(
# input_pdf='数学建模技能图谱.pdf',
# output='new_数学建模技能图谱.pdf',
# watermark='水印.pdf')
remove_watermark(input_file='有水印.pdf', output_file='无水印.pdf')
6.加密
from PyPDF2 import PdfFileWriter, PdfFileReader
import os
def encrypt_pdf(input_path="数学建模技能图谱.pdf",
                output_path="PyPDF2-output.pdf",
                password="secret"):
    """Write a password-protected copy of a PDF.

    Called without arguments it encrypts ``数学建模技能图谱.pdf`` into
    ``PyPDF2-output.pdf`` with the password ``secret``.

    :param input_path: path of the PDF to encrypt.
    :param output_path: path the encrypted PDF is written to.
    :param password: password set on the output document.
    :return: None
    """
    # The input must stay open until the output has been written, and both
    # files are closed deterministically by the context managers.
    with open(input_path, "rb") as source_stream:
        input1 = PdfFileReader(source_stream)
        output = PdfFileWriter()
        for i in range(input1.getNumPages()):
            output.addPage(input1.getPage(i))
        # Encryption has to be set up before the document is written.
        output.encrypt(password)
        with open(output_path, "wb") as output_stream:
            output.write(output_stream)
# Run from the folder that contains the sample PDF.
sample_dir = r'F:\file\pyfile\1\Code\python-code\自动化\PDF'
os.chdir(sample_dir)
encrypt_pdf()
7.解密
from PyPDF2 import PdfFileWriter, PdfFileReader
import os
def decrypt_pdf(input_path="PyPDF2-output.pdf",
                output_path="new-PyPDF2-output.pdf",
                password='secret'):
    """Write an unencrypted copy of a password-protected PDF.

    Called without arguments it decrypts ``PyPDF2-output.pdf`` with the
    password ``secret`` and saves the result as ``new-PyPDF2-output.pdf``.

    :param input_path: path of the encrypted PDF.
    :param output_path: path the decrypted PDF is written to.
    :param password: password used to open the input document.
    :raises PdfReadError: if the password does not unlock the document.
    :return: None
    """
    # Imported here so the block does not depend on an extra top-level import.
    from PyPDF2.utils import PdfReadError

    # The input must stay open until the output has been written, and both
    # files are closed deterministically by the context managers.
    with open(input_path, "rb") as source_stream:
        input1 = PdfFileReader(source_stream)
        # decrypt() returns 0 when the password matches neither the user nor
        # the owner password; fail right away with a clear message.
        if not input1.decrypt(password):
            raise PdfReadError("Incorrect password: file has not been decrypted")
        output = PdfFileWriter()
        for i in range(input1.getNumPages()):
            output.addPage(input1.getPage(i))
        with open(output_path, "wb") as output_stream:
            output.write(output_stream)
# Run from the folder that contains the encrypted sample PDF.
sample_dir = r'F:\file\pyfile\1\Code\python-code\自动化\PDF'
os.chdir(sample_dir)
decrypt_pdf()