1.将pdf文件内容写入txt文件:
利用PDFminer3k
模块来抽取PDF内容,包括文本、图像、曲线等:
# -*- coding: utf-8 -*-
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

'''
Parse a PDF file and enumerate the objects it contains (pages, images,
curves, figures, horizontal text boxes); all text content is appended
to test.txt.

Fixes over the original:
  * the PDF handle and the output file are managed with ``with`` so they
    are always closed (the original leaked ``fp`` and re-opened test.txt
    once per text box);
  * the Python-2 ``import sys; importlib.reload(sys)`` encoding hack was
    removed — it is a no-op (and harmful) on Python 3.
'''


def parse(pdf_path):
    """Parse the PDF at *pdf_path*, dump its text to test.txt and print
    per-type object counts.

    :param pdf_path: path of the PDF file to analyse.
    :raises PDFTextExtractionNotAllowed: if the document forbids text
        extraction.
    """
    with open(pdf_path, 'rb') as fp:  # binary read mode
        # Build a parser from the file object and wire it to a document.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialize with an empty password (document has none).
        doc.initialize()
        # Bail out if the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Shared resource manager + page-aggregator device + interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Counters for pages, images, curves, figures, horizontal text boxes.
        # NOTE(review): num_figure is counted but was never printed in the
        # original summary line; the output format is kept unchanged.
        num_page, num_image, num_curve, num_figure, num_TextBoxHorizontal = 0, 0, 0, 0, 0
        # Open the output file once for the whole document.
        with open(r'test.txt', 'a') as f:
            for page in doc.get_pages():  # iterate over the page list
                num_page += 1
                interpreter.process_page(page)
                layout = device.get_result()  # LTPage object for this page
                for x in layout:
                    if isinstance(x, LTImage):    # image object
                        num_image += 1
                    if isinstance(x, LTCurve):    # curve object
                        num_curve += 1
                    if isinstance(x, LTFigure):   # figure object
                        num_figure += 1
                    if isinstance(x, LTTextBoxHorizontal):  # text content
                        num_TextBoxHorizontal += 1
                        results = x.get_text()
                        print(results, end='')
                        f.write(results)
        print('对象数量:\n', '页面数:%s\n' % num_page, '图片数:%s\n' % num_image,
              '曲线数:%s\n' % num_curve, '水平文本框:%s\n' % num_TextBoxHorizontal)


if __name__ == '__main__':
    pdf_path = r'D:\python tests\ZQfd_paiming\pdf\12.pdf'
    parse(pdf_path)
2.利用pdf2htmlEX工具,将pdf转化为html文件,分析源码,根据格式提取出需要的内容:
下载pdf2htmlEX,将需要处理的pdf转化为html:
(1)先处理了前两页,转换结果为 13.html:
# -*- coding: UTF-8 -*-
"""Extract candidate records (id, name, college/major, three grades) from
13.html — the first two PDF pages converted with pdf2htmlEX — and print
them as an aligned table.

Fixes over the original:
  * the HTML file is read inside ``with`` so the handle is closed, and
    ``file``/``id`` no longer shadow builtins;
  * the duplicate, conflicting ``gbk`` coding cookie was removed (Python
    honours only the first coding declaration);
  * the three "useless span" regexes are compiled once instead of once
    per candidate, and the stray ``|`` was dropped from the character
    classes (inside ``[...]`` it matches a literal pipe, which never
    occurs in pdf2htmlEX class names).
"""
from bs4 import BeautifulSoup
import re

# Read raw bytes: the hard-coded offsets below index into the markup
# exactly as it appears on disk.
with open('C:/Users/wdh/Desktop/pdf2htmlEX-win32-0.14.6-upx-with-poppler-data/13.html', 'rb') as fh:
    html = fh.read()

div_bf = BeautifulSoup(html, 'lxml')
div = div_bf.find_all('div', class_='c x0 y0 w2 h2')  # one div per page
# div = div_bf.find_all('div', class_='pf w0 h0')

# Divs that open each candidate record, the name cell and the grade cell.
pattern_id = re.compile('<div class="t m0 x8 h3 y.*? ff1 fs0 fc1 sc1 ls0 ws0">')
pattern_name = re.compile('<div class="t m0 x9 h3 y.*? ff1 fs0 fc1 sc1 ls0 ws0">')
pattern_grade = re.compile('<div class="t m0 xb h3 y.*? ff1 fs0 fc1 sc1 ls0 ws0">')

# Formatting-only spans/divs stripped out of the college text
# (constant — hoisted out of the per-candidate loop).
pattern_useless1 = re.compile('<span class="_ _[0-9a-f]"></span>')
pattern_useless2 = re.compile('<span class="_ _[0-9a-f]"> </span>')
pattern_useless3 = re.compile('</div><div class="t m0 x.*? h3 y.*? ff1 fs0 fc1 sc1 ls0 ws0">')

ID = []
NAME = []
GRADE1 = []   # preliminary-exam score
GRADE2 = []   # second-round score
GRADE3 = []   # total score
COLLEGE = []

page_num = len(div)
for i in range(page_num):
    # One sub-list per page for every field.
    ID += [[]]
    NAME += [[]]
    GRADE1 += [[]]
    GRADE2 += [[]]
    GRADE3 += [[]]
    COLLEGE += [[]]
    str_each = str(div[i])  # markup of page i
    per_id = re.findall(pattern_id, str_each)
    per_name = re.findall(pattern_name, str_each)
    per_grade = re.findall(pattern_grade, str_each)
    per_num = len(per_id)   # number of candidates on this page
    for j in range(per_num):
        # Student id: five characters right after the opening div tag;
        # +50 lands on (or just before) the tag's closing '>'.
        start_id = str_each.find(per_id[j]) + 50
        if str_each[start_id] == '>':
            sid = str_each[start_id + 1:start_id + 6]
        else:
            sid = str_each[start_id + 2:start_id + 7]
        ID[i].append(sid)

        # Name: from just past the name div's '>' to the next '<'.
        start_name = str_each.find(per_name[j]) + 50
        end_name = str_each.find('<', start_name)
        if str_each[start_name] == '>':
            name = str_each[start_name + 1:end_name]
        else:
            name = str_each[start_name + 2:end_name]
        NAME[i].append(name.replace(' ', ''))  # drop spaces inside names

        # Grades: three fixed-width fields at known offsets in the grade div.
        start_grade = str_each.find(per_grade[j]) + 50
        if str_each[start_grade] != '>':
            start_grade += 1
        grade1 = str_each[start_grade + 1:start_grade + 4]
        grade2 = str_each[start_grade + 31:start_grade + 36]
        grade3 = str_each[start_grade + 63:start_grade + 68]
        GRADE1[i].append(grade1)
        GRADE2[i].append(grade2)
        GRADE3[i].append(grade3)

        # College/major text sits between the name and the grades; some is
        # inline after the name div, some in its own div, so strip all the
        # formatting-only markup from the slice.
        college_info = str_each[end_name:start_grade + 1]
        for each in re.findall(pattern_useless1, college_info):
            college_info = college_info.replace(each, '')
        for each in re.findall(pattern_useless2, college_info):
            college_info = college_info.replace(each, '')
        for each in re.findall(pattern_useless3, college_info):
            college_info = college_info.replace(each, '')
        COLLEGE[i].append(college_info)

        # Print one aligned row, padding CJK columns with ideographic spaces.
        print('|%-10s|%s' % (ID[i][j], NAME[i][j]), end='')
        for x in range(10 - len(NAME[i][j])):
            print('\u3000', end='')
        print('|%s' % COLLEGE[i][j], end='')
        for x in range(30 - len(COLLEGE[i][j])):
            print('\u3000', end='')
        print('|%-7s|%-7s|%-7s|' % (GRADE1[i][j], GRADE2[i][j], GRADE3[i][j]))
(2)再处理所有的168页,结构不完全相同,使用的方法也有很大不同,转化后的文件为 zong1.html:
效果截图:
# -*- coding: UTF-8 -*-
"""Extract candidate records from zong1.html — all 168 PDF pages converted
with pdf2htmlEX. Unlike 13.html, only the student-id cell is uniform here:
the name sometimes follows the id inline (<span class="ff1">) and sometimes
sits in its own div; the same ambiguity holds for the grade cell.

Fixes over the original:
  * the HTML file is read inside ``with`` so the handle is closed, and
    ``file``/``id`` no longer shadow builtins;
  * the duplicate, conflicting ``gbk`` coding cookie was removed;
  * ``[0-9|a-f]`` character classes lost the stray ``|`` (a literal pipe
    inside ``[...]``, never present in pdf2htmlEX class names);
  * the dead, string-quoted "remarks" extraction code was removed (it was
    never executed; REMARKS is kept so re-enabling it stays easy).
"""
from bs4 import BeautifulSoup
import re

with open('C:/Users/wdh/Desktop/pdf2htmlEX-win32-0.14.6-upx-with-poppler-data/zong1.html', 'rb') as fh:
    # Raw bytes: the hard-coded offsets below index into the on-disk markup.
    html = fh.read()

div_bf = BeautifulSoup(html, 'lxml')
div = div_bf.find_all('div', class_='pf w0 h0')  # one div per page

# Div that opens each candidate record (the student id cell).
pattern_id = re.compile('<div class="t m0 x4 h3 y[0-9a-f]+ ff2 fs0 fc0 sc1 ls1 ws0">')
pattern_name = re.compile('<span class="ff1">.*?</span>')          # inline name
pattern_college1 = re.compile('<div class=".*?"')                  # any div opener
pattern_college2 = re.compile('<span class="_ _[0-9a-f]+">')       # spacer spans
pattern_grade = re.compile('<div class="t m0 xb h3 y.*? ff2 fs0 fc0 sc1 ls1 ws0">')

ID = []
NAME = []
GRADE1 = []   # preliminary-exam score
GRADE2 = []   # second-round score
GRADE3 = []   # total score
COLLEGE = []  # college / major
REMARKS = []  # remarks (extraction currently disabled)

page_num = len(div)
for i in range(page_num):
    # One sub-list per page for every field.
    ID += [[]]
    NAME += [[]]
    GRADE1 += [[]]
    GRADE2 += [[]]
    GRADE3 += [[]]
    COLLEGE += [[]]
    REMARKS += [[]]
    str_each = str(div[i])  # markup of page i
    per_id = re.findall(pattern_id, str_each)
    per_name = re.findall(pattern_name, str_each)
    per_college1 = re.findall(pattern_college1, str_each)
    per_college2 = re.findall(pattern_college2, str_each)
    per_grade = re.findall(pattern_grade, str_each)
    per_num = len(per_id)   # number of candidates on this page
    for j in range(per_num):
        # Student id: five characters right after the opening div tag;
        # +50 lands on (or just before) the tag's closing '>'.
        start_id = str_each.find(per_id[j]) + 50
        if str_each[start_id] == '>':
            sid = str_each[start_id + 1:start_id + 6]
        else:
            sid = str_each[start_id + 2:start_id + 7]
        ID[i].append(sid)

        # Name: either inline (<span class="ff1">…</span>) or in its own
        # div (<div class="t m0 x4 h3 y…">…</div>) — take whichever comes
        # first after the id.
        start_name1 = str_each.find('<span class="ff1">', start_id)
        start_name2 = str_each.find('<div class="t m0 x4 h3 y', start_id)
        if start_name1 == -1:
            if str_each[start_name2 + 50] != '>':
                start_name = start_name2 + 51
            else:
                start_name = start_name2 + 50
        elif start_name2 == -1:
            start_name = start_name1 + 18
        elif start_name1 < start_name2:
            start_name = start_name1 + 18
        else:
            if str_each[start_name2 + 50] != '>':
                start_name = start_name2 + 51
            else:
                start_name = start_name2 + 50
        end_name = str_each.find('</', start_name)
        name = str_each[start_name:end_name]
        # Drop spaces/stray '>' and pad '·' (minority-name separator) with
        # an ideographic space so column alignment holds.
        NAME[i].append(name.replace(' ', '').replace('>', '').replace('·', '\u3000'))

        # Grades: either inline (<span class="ff2">) or in their own div;
        # x10 divs may appear before xb divs, so prefer the earlier one.
        # NOTE(review): if only the x10 div exists (start_grade2 == -1),
        # the original logic never adopts start_grade3 — kept unchanged.
        start_grade1 = str_each.find('<span class="ff2">', start_id)
        start_grade2 = str_each.find('<div class="t m0 xb h3 y', start_id)
        start_grade3 = str_each.find('<div class="t m0 x10 h3 y', start_id)
        if start_grade3 != -1 and start_grade3 < start_grade2:
            start_grade2 = start_grade3
        if start_grade1 == -1:
            if str_each[start_grade2 + 50] != '>':
                start_grade = start_grade2 + 52
            else:
                start_grade = start_grade2 + 51
        elif start_grade2 == -1:
            start_grade = start_grade1 + 18
        elif start_grade1 < start_grade2:
            start_grade = start_grade1 + 18
        else:
            if str_each[start_grade2 + 50] != '>':
                start_grade = start_grade2 + 52
            else:
                start_grade = start_grade2 + 51
        end_grade = str_each.find('</div', start_grade)
        GRADE1[i].append(str_each[start_grade:start_grade + 3].replace('>', ''))
        # Some rows carry only the preliminary score; use '-' placeholders
        # when the second field is not numeric.
        if str_each[start_grade + 30].isdigit():
            GRADE2[i].append(str_each[start_grade + 30:start_grade + 35])
            GRADE3[i].append(str_each[start_grade + 62:start_grade + 67])
        else:
            GRADE2[i].append('-')
            GRADE3[i].append('-')

        # College/major lives between the name and the grades; strip every
        # div opener and spacer span, then the remaining tag fragments, and
        # normalise ASCII parentheses to full-width ones.
        college = str_each[end_name:start_grade]
        for each in per_college1:
            college = college.replace(each, '')
        for each in per_college2:
            college = college.replace(each, '')
        college = college.replace('<span class="ff2">', '').replace('</span>', '').replace('</div>', '').replace('>', '').replace(' ', '').replace('(', '(').replace(')', ')')
        COLLEGE[i].append(college)

        # Print one aligned row, padding CJK columns with ideographic spaces.
        print('|%-10s|%s' % (ID[i][j], NAME[i][j]), end='')
        for x in range(12 - len(NAME[i][j])):
            print('\u3000', end='')
        print('|%s' % COLLEGE[i][j], end='')
        for x in range(30 - len(COLLEGE[i][j])):
            print('\u3000', end='')
        print('|%-7s|%-7s|%-7s|' % (GRADE1[i][j], GRADE2[i][j], GRADE3[i][j]))
3.在上述代码的基础上添加几行,使用xlwt将字典写入excel中:
效果截图:
# coding=utf-8
"""Same extraction as the zong1.html script, plus an Excel export: every
candidate row is collected into a dict keyed by student id and written
out with xlwt.

Fixes over the original:
  * ``file_excel.save('data.xlsx')`` → ``save('data.xls')`` — xlwt writes
    the legacy BIFF (.xls) format only; saving it with an .xlsx extension
    produces a file Excel refuses to open;
  * the HTML file is read inside ``with`` so the handle is closed, and
    ``file``/``id`` no longer shadow builtins;
  * the duplicate, conflicting coding cookies were removed;
  * ``[0-9|a-f]`` character classes lost the stray literal ``|``;
  * the dead, string-quoted "remarks" code was removed;
  * the write-out loop uses distinct index names instead of reusing the
    parsing loop's ``i``/``j``.
"""
from xlwt import *  # needs the xlwt package
from bs4 import BeautifulSoup
import re

with open('C:/Users/wdh/Desktop/pdf2htmlEX-win32-0.14.6-upx-with-poppler-data/zong1.html', 'rb') as fh:
    # Raw bytes: the hard-coded offsets below index into the on-disk markup.
    html = fh.read()

div_bf = BeautifulSoup(html, 'lxml')
div = div_bf.find_all('div', class_='pf w0 h0')  # one div per page

# Div that opens each candidate record (the student id cell).
pattern_id = re.compile('<div class="t m0 x4 h3 y[0-9a-f]+ ff2 fs0 fc0 sc1 ls1 ws0">')
pattern_name = re.compile('<span class="ff1">.*?</span>')          # inline name
pattern_college1 = re.compile('<div class=".*?"')                  # any div opener
pattern_college2 = re.compile('<span class="_ _[0-9a-f]+">')       # spacer spans
pattern_grade = re.compile('<div class="t m0 xb h3 y.*? ff2 fs0 fc0 sc1 ls1 ws0">')

ID = []
NAME = []
GRADE1 = []   # preliminary-exam score
GRADE2 = []   # second-round score
GRADE3 = []   # total score
COLLEGE = []  # college / major
REMARKS = []  # remarks (extraction currently disabled)

page_num = len(div)

file_excel = Workbook(encoding='utf-8')   # workbook encoded as utf-8
table = file_excel.add_sheet('data')      # single sheet named 'data'
# Dict of rows keyed by student id; the first entry is the header row.
data = {'考生编号 (后五位)': ['姓名', '拟录取院系/专业', '初试成绩', '复试成绩', '总成绩']}

for i in range(page_num):
    # One sub-list per page for every field.
    ID += [[]]
    NAME += [[]]
    GRADE1 += [[]]
    GRADE2 += [[]]
    GRADE3 += [[]]
    COLLEGE += [[]]
    REMARKS += [[]]
    str_each = str(div[i])  # markup of page i
    per_id = re.findall(pattern_id, str_each)
    per_name = re.findall(pattern_name, str_each)
    per_college1 = re.findall(pattern_college1, str_each)
    per_college2 = re.findall(pattern_college2, str_each)
    per_grade = re.findall(pattern_grade, str_each)
    per_num = len(per_id)   # number of candidates on this page
    for j in range(per_num):
        # Student id: five characters right after the opening div tag;
        # +50 lands on (or just before) the tag's closing '>'.
        start_id = str_each.find(per_id[j]) + 50
        if str_each[start_id] == '>':
            sid = str_each[start_id + 1:start_id + 6]
        else:
            sid = str_each[start_id + 2:start_id + 7]
        ID[i].append(sid)

        # Name: either inline (<span class="ff1">…</span>) or in its own
        # div (<div class="t m0 x4 h3 y…">…</div>) — take whichever comes
        # first after the id.
        start_name1 = str_each.find('<span class="ff1">', start_id)
        start_name2 = str_each.find('<div class="t m0 x4 h3 y', start_id)
        if start_name1 == -1:
            if str_each[start_name2 + 50] != '>':
                start_name = start_name2 + 51
            else:
                start_name = start_name2 + 50
        elif start_name2 == -1:
            start_name = start_name1 + 18
        elif start_name1 < start_name2:
            start_name = start_name1 + 18
        else:
            if str_each[start_name2 + 50] != '>':
                start_name = start_name2 + 51
            else:
                start_name = start_name2 + 50
        end_name = str_each.find('</', start_name)
        name = str_each[start_name:end_name]
        # Drop spaces/stray '>' and pad '·' (minority-name separator) with
        # an ideographic space so column alignment holds.
        NAME[i].append(name.replace(' ', '').replace('>', '').replace('·', '\u3000'))

        # Grades: either inline (<span class="ff2">) or in their own div;
        # x10 divs may appear before xb divs, so prefer the earlier one.
        # NOTE(review): if only the x10 div exists (start_grade2 == -1),
        # the original logic never adopts start_grade3 — kept unchanged.
        start_grade1 = str_each.find('<span class="ff2">', start_id)
        start_grade2 = str_each.find('<div class="t m0 xb h3 y', start_id)
        start_grade3 = str_each.find('<div class="t m0 x10 h3 y', start_id)
        if start_grade3 != -1 and start_grade3 < start_grade2:
            start_grade2 = start_grade3
        if start_grade1 == -1:
            if str_each[start_grade2 + 50] != '>':
                start_grade = start_grade2 + 52
            else:
                start_grade = start_grade2 + 51
        elif start_grade2 == -1:
            start_grade = start_grade1 + 18
        elif start_grade1 < start_grade2:
            start_grade = start_grade1 + 18
        else:
            if str_each[start_grade2 + 50] != '>':
                start_grade = start_grade2 + 52
            else:
                start_grade = start_grade2 + 51
        end_grade = str_each.find('</div', start_grade)
        GRADE1[i].append(str_each[start_grade:start_grade + 3].replace('>', ''))
        # Some rows carry only the preliminary score; use '-' placeholders
        # when the second field is not numeric.
        if str_each[start_grade + 30].isdigit():
            GRADE2[i].append(str_each[start_grade + 30:start_grade + 35])
            GRADE3[i].append(str_each[start_grade + 62:start_grade + 67])
        else:
            GRADE2[i].append('-')
            GRADE3[i].append('-')

        # College/major lives between the name and the grades; strip every
        # div opener and spacer span, then the remaining tag fragments, and
        # normalise ASCII parentheses to full-width ones.
        college = str_each[end_name:start_grade]
        for each in per_college1:
            college = college.replace(each, '')
        for each in per_college2:
            college = college.replace(each, '')
        college = college.replace('<span class="ff2">', '').replace('</span>', '').replace('</div>', '').replace('>', '').replace(' ', '').replace('(', '(').replace(')', ')')
        COLLEGE[i].append(college)

        # Collect this candidate's row for the Excel export.
        data[ID[i][j]] = [NAME[i][j], COLLEGE[i][j],
                          GRADE1[i][j], GRADE2[i][j], GRADE3[i][j]]

        # Print one aligned row, padding CJK columns with ideographic spaces.
        print('|%-10s|%s' % (ID[i][j], NAME[i][j]), end='')
        for x in range(12 - len(NAME[i][j])):
            print('\u3000', end='')
        print('|%s' % COLLEGE[i][j], end='')
        for x in range(30 - len(COLLEGE[i][j])):
            print('\u3000', end='')
        print('|%-7s|%-7s|%-7s|' % (GRADE1[i][j], GRADE2[i][j], GRADE3[i][j]))

# Flatten the dict into rows of [id, name, college, g1, g2, g3]; dict
# insertion order keeps the header row first.
ldata = [[key] + data[key] for key in data]
for row, record in enumerate(ldata):
    for col, value in enumerate(record):
        table.write(row, col, value)
# xlwt emits the legacy BIFF format, so the extension must be .xls.
file_excel.save('data.xls')