前言
对于将PDF转换成Word文档,我想很多人都了解过:通常需要付费,而且很贵。但是只要你会Python,这些问题都不再是问题。
老规矩,需要打包好的软件关注小编,QQ群:721195303领取。
pdf文件转换为word文件
import os
from concurrent.futures import ProcessPoolExecutor
from configparser import ConfigParser
from io import StringIO
from io import open

from docx import Document
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import process_pdf
def read_from_pdf(file_path):
    """Extract the text of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Everything the ``TextConverter`` device wrote while the PDF was
        processed, as a single string.
    """
    with open(file_path, 'rb') as file:
        resource_manager = PDFResourceManager()
        return_str = StringIO()
        lap_params = LAParams()
        device = TextConverter(
            resource_manager, return_str, laparams=lap_params)
        try:
            process_pdf(resource_manager, device, file)
        finally:
            # Close the device even when processing fails part-way through.
            device.close()

        # Read the buffer before closing it; getvalue() fails afterwards.
        content = return_str.getvalue()
        return_str.close()
        return content
def save_text_to_word(content, file_path):
    """Write text to a Word document, one paragraph per input line.

    Args:
        content: Text to store; it is split on ``'\\n'``.
        file_path: Path of the .docx file to create.
    """
    doc = Document()
    for line in content.split('\n'):
        paragraph = doc.add_paragraph()
        # Control characters are stripped from each line before it is added.
        paragraph.add_run(remove_control_characters(line))
    # Save once, after every paragraph has been added.
    doc.save(file_path)
def remove_control_characters(content):
    """Return *content* without the characters whose code point is 0-31.

    This removes tabs, carriage returns and newlines as well as the other
    ASCII control characters; the space character (32) and everything above
    it are kept.

    Args:
        content: The string to clean.

    Returns:
        A new string with the control characters deleted.
    """
    # Mapping a code point to None makes str.translate delete that character.
    mpa = dict.fromkeys(range(32))
    return content.translate(mpa)
def pdf_to_word(pdf_file_path, word_file_path):
    """Convert one PDF file to a Word document containing its text.

    Args:
        pdf_file_path: Path of the PDF file to read.
        word_file_path: Path of the .docx file to write.
    """
    content = read_from_pdf(pdf_file_path)
    save_text_to_word(content, word_file_path)
def main():
    """Convert every ``.pdf`` file in the configured folder to ``.docx``.

    Settings come from the ``default`` section of ``config.cfg`` in the
    current working directory:

    * ``max_worker``  - number of worker processes,
    * ``pdf_folder``  - folder that is scanned for ``.pdf`` files,
    * ``word_folder`` - folder that receives the ``.docx`` files.

    One ``pdf_to_word`` task per PDF is submitted to a process pool. When all
    tasks have finished the function prints a completion message and exits
    the interpreter with status 0.

    Raises:
        SystemExit: Always, with code 0, once every task has finished.
    """
    config_parser = ConfigParser()
    config_parser.read('config.cfg')
    config = config_parser['default']

    with ProcessPoolExecutor(max_workers=int(config['max_worker'])) as executor:
        for file in os.listdir(config['pdf_folder']):
            file_name, extension_name = os.path.splitext(file)
            if extension_name != '.pdf':
                continue
            pdf_file = config['pdf_folder'] + '/' + file
            word_file = config['word_folder'] + '/' + file_name + '.docx'
            print('正在处理:', file)
            # NOTE: the returned future is not inspected, so an exception
            # raised inside a worker is not reported here.
            executor.submit(pdf_to_word, pdf_file, word_file)

    # Leaving the ``with`` block waits for every submitted task, so no
    # polling of the futures is needed before reporting completion.
    print('完成')
    raise SystemExit(0)
# Run the conversion only when this file is executed as a script. The guard
# also matters for the process pool: when worker processes are started by
# spawning, they re-import this module and must not call main() again.
if __name__ == '__main__':
    main()
Word文件转换为pdf文件
# -*- encoding: utf-8 -*-
import os
from win32com import client
# pip install pywin32  (win32com is shipped as part of pywin32)
def doc2pdf(doc_name, pdf_name):
    """Convert a Word document to PDF through Word's COM automation.

    :param doc_name: path of the Word file to convert
    :param pdf_name: path of the PDF file to write; a file that already
        exists at this path is deleted first
    :return: ``pdf_name`` on success, ``1`` if any step fails
    """
    import logging

    try:
        # DispatchEx starts a separate Word instance for this conversion.
        word = client.DispatchEx("Word.Application")
        try:
            if os.path.exists(pdf_name):
                os.remove(pdf_name)
            worddoc = word.Documents.Open(doc_name, ReadOnly=1)
            try:
                # FileFormat 17 is Word's wdFormatPDF.
                worddoc.SaveAs(pdf_name, FileFormat=17)
            finally:
                worddoc.Close()
        finally:
            # Always shut the Word instance down so that no WINWORD process
            # is left running after the call.
            word.Quit()
        return pdf_name
    except Exception:
        # Keep the original contract (return 1 on failure) but record why.
        logging.getLogger(__name__).exception(
            "doc2pdf failed for %r", doc_name)
        return 1
# Example invocation with hard-coded sample paths.
if __name__ == '__main__':
    doc_name = "f:/test.doc"
    pdf_name = "f:/test.pdf"
    doc2pdf(doc_name, pdf_name)
doc转docx
from win32com import client
def doc2docx(doc_name, docx_name):
    """Convert a ``.doc`` file to ``.docx`` through Word's COM automation.

    The conversion is best-effort: a failure is logged and the function
    returns normally instead of raising.

    :param doc_name: path of the ``.doc`` file to open
    :param docx_name: path of the ``.docx`` file to write
    :return: ``None``
    """
    import logging

    try:
        word = client.Dispatch("Word.Application")
        try:
            doc = word.Documents.Open(doc_name)
            try:
                # FileFormat 16 (wdFormatDocumentDefault) saves as .docx.
                doc.SaveAs(docx_name, 16)
            finally:
                doc.Close()
        finally:
            # Quit on failure as well as on success.
            word.Quit()
    except Exception:
        logging.getLogger(__name__).exception(
            "doc2docx failed for %r", doc_name)
# Example invocation with hard-coded sample paths.
if __name__ == '__main__':
    doc2docx('f:/test.doc', 'f:/test.docx')
docx转html
#coding:utf-8
import docx
from docx2html import convert
import HTMLParser
def docx2html(docx_name, new_name):
    """Convert a ``.docx`` file to an HTML string.

    ``docx_name`` is opened with python-docx, and the file at ``new_name``
    is the one handed to ``docx2html.convert``. The conversion is
    best-effort: a failure is logged and ``None`` is returned.

    :param docx_name: path of a ``.docx`` file that is opened for reading
    :param new_name: path of the ``.docx`` file that is converted to HTML
    :return: the HTML with character references unescaped, or ``None`` if
        any step fails
    """
    import logging

    try:
        # NOTE(review): the original passed both paths to docx.Document(),
        # which accepts a single document argument, so the call always
        # failed. How docx_name and new_name are meant to relate is not
        # visible here - confirm against the caller.
        docx.Document(docx_name)
        # Turn the .docx file into an HTML string with the docx2html module.
        html = convert(new_name)
        # docx2html escapes Chinese text, so unescape the generated string.
        # NOTE(review): HTMLParser is the Python 2 module name; on Python 3
        # the equivalent call is html.unescape().
        return HTMLParser.HTMLParser().unescape(html)
    except Exception:
        logging.getLogger(__name__).exception(
            "docx2html failed for %r", new_name)
        return None
# Example invocation with hard-coded sample paths.
if __name__ == '__main__':
    docx2html('f:/test.docx', 'f:/test1.docx')