Apache PDFBox是一个开源的、基于Java的PDF文档处理工具库,它可以用于创建新的PDF文档、修改现有的PDF文档,还可以从PDF文档中提取所需的内容。Apache PDFBox还包含了数个命令行工具。
Apache PDFBox于2016年4月26日发布了2.0.1版(本文撰写时的最新版本)。
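备注:本文的PDFUtil示例基于2.0及以上版本编写;文末“另外一种思路”和“其他的操作”两段参考代码使用的是1.8.x的旧版API(如getAllPages()、findResources()、PDXObjectImage),这些API在2.0中已不可用。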
官网地址:https://pdfbox.apache.org/index.html
PDFBox 2.0.1 API在线文档:https://pdfbox.apache.org/docs/2.0.1/javadocs/
1. JAR包
pdfbox-2.0.1.jar下载地址
fontbox-2.0.1.jar下载地址
访问网址 http://sourceforge.net/projects/pdfbox/ 。
package com.airport.demo.tcpReplace;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;
import javax.imageio.ImageIO;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;
import com.airport.demo.bean.MRZData;
import com.jogamp.opengl.util.packrect.Level;
/**
 * Demo helpers that read text and embedded images out of PDF files with the
 * PDFBox 2.x API.
 *
 * <p>Every input and output location is a hard-coded sample path, so the
 * methods are meant to be run as-is on the author's machine or adapted.
 */
public class PDFUtil {

    /** Directory that holds the sample PDFs used by this demo. */
    private static final String PDF_DIR = "C:\\Users\\weidx\\Documents\\My Access-IS Data\\PDFs\\";

    /** Sample document processed by {@link #readPDF()} and {@link #main(String[])}. */
    private static final String SAMPLE_PDF = PDF_DIR + "1.pdf";

    /**
     * Prints the text of the first page of the sample PDF to standard output.
     *
     * @return a freshly created {@code MRZData}; this method never fills it in
     */
    public static MRZData readPDF() {
        MRZData mData = new MRZData();
        File pdfFile = new File(SAMPLE_PDF);
        // try-with-resources closes the document even when text extraction fails.
        try (PDDocument document = PDDocument.load(pdfFile)) {
            PDFTextStripper stripper = new PDFTextStripper();
            // Emit the text ordered by its position on the page.
            stripper.setSortByPosition(true);
            // Restrict extraction to page 1 only.
            stripper.setStartPage(1);
            stripper.setEndPage(1);
            String content = stripper.getText(document);
            System.out.println(content);
        } catch (Exception e) {
            System.out.println(e);
        }
        return mData;
    }

    /**
     * Runs the demo: prints the first page's text, then exports the images of
     * the sample PDF to the desktop.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println(readPDF());
        String f = SAMPLE_PDF;
        String path = "C:\\Users\\weidx\\Desktop\\";
        pdfSaveImage(f, path);
    }

    /**
     * Writes every image XObject found in the page resources of a PDF to
     * {@code imgSavePath + n + ".jpg"}, numbering the files from 1.
     *
     * <p>A failure on one image is reported and the remaining images are still
     * processed. A failure to open or parse the PDF is printed and ends the
     * export.
     *
     * @param file        path of the PDF to read
     * @param imgSavePath prefix (directory plus optional file-name prefix) for the
     *                    exported images
     */
    public static void pdfSaveImage(String file, String imgSavePath) {
        try (FileInputStream fis = new FileInputStream(file);
                PDDocument document = PDDocument.load(fis)) {
            int pages = document.getNumberOfPages();
            int count = 1;
            // Page indices are 0-based, so start at 0 to include the first page.
            for (int j = 0; j < pages; j++) {
                PDResources resources = document.getPage(j).getResources();
                if (resources == null) {
                    continue;
                }
                Iterable<COSName> names = resources.getXObjectNames();
                if (names == null) {
                    continue;
                }
                for (COSName key : names) {
                    if (!resources.isImageXObject(key)) {
                        continue;
                    }
                    try {
                        PDImageXObject image = (PDImageXObject) resources.getXObject(key);
                        BufferedImage bimage = image.getImage();
                        File target = new File(imgSavePath + count + ".jpg");
                        // ImageIO.write returns false when no writer accepts the image.
                        if (!ImageIO.write(bimage, "jpg", target)) {
                            System.err.println("No JPEG writer accepted image " + key.getName());
                            continue;
                        }
                        count++;
                        System.out.println(count);
                    } catch (Exception e) {
                        // Best effort: report the broken image and keep going.
                        System.err.println("Failed to export image " + key.getName() + ": " + e);
                    }
                }
            }
        } catch (Exception e) {
            System.out.println(e);
        }
    }

    /**
     * Draws every image found in {@code in.pdf} onto the first page of
     * {@code out.pdf} and saves the result as {@code <n>.pdf} after each image,
     * where {@code n} counts the images from 0.
     *
     * <p>If either document cannot be loaded the stack trace is printed and the
     * method returns without processing anything.
     */
    public static void readImage() {
        // PDF to be parsed.
        File pdfFile = new File(PDF_DIR + "in.pdf");
        // Blank PDF that receives the images.
        File pdfFile_out = new File(PDF_DIR + "out.pdf");
        try (PDDocument document = PDDocument.load(pdfFile);
                PDDocument document_out = PDDocument.load(pdfFile_out)) {
            int pages_size = document.getNumberOfPages();
            System.out.println("getAllPages===============" + pages_size);
            int j = 0;
            for (int i = 0; i < pages_size; i++) {
                PDPage page = document.getPage(i);
                PDPage page1 = document_out.getPage(0);
                PDResources resources = page.getResources();
                if (resources == null) {
                    continue;
                }
                Iterable<COSName> names = resources.getXObjectNames();
                if (names == null) {
                    continue;
                }
                for (COSName key : names) {
                    if (!resources.isImageXObject(key)) {
                        continue;
                    }
                    try {
                        PDImageXObject image = (PDImageXObject) resources.getXObject(key);
                        // NOTE(review): all images are appended to the same page of the
                        // same output document, so each saved file also contains the
                        // images drawn before it. Confirm whether that is intended.
                        float scale = 1f;
                        try (PDPageContentStream contentStream = new PDPageContentStream(
                                document_out, page1, AppendMode.APPEND, true)) {
                            contentStream.drawImage(image, 20, 20,
                                    image.getWidth() * scale, image.getHeight() * scale);
                        }
                        document_out.save(PDF_DIR + j + ".pdf");
                        System.out.println(image.getSuffix() + "," + image.getHeight() + "," + image.getWidth());
                        // Alternative: copy image.createInputStream() into a
                        // FileOutputStream to store each image as a standalone file.
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                    j++;
                }
            }
            System.out.println(j);
        } catch (IOException e) {
            // Loading failed: there is nothing to iterate over.
            e.printStackTrace();
        }
    }
}
下面提供另外一种思路(使用的是PDFBox 1.8.x的API):
- /** Date pattern that dateFormat(Calendar) passes to SimpleDateFormat. */
- public static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
- /**
- * 解析pdf文档信息
- * @param pdfPath pdf文档路径
- * @throws Exception
- */
- public static void pdfParse( String pdfPath, String imgSavePath ) throws Exception
- {
- InputStream input = null;
- File pdfFile = new File( pdfPath );
- PDDocument document = null;
- try{
- input = new FileInputStream( pdfFile );
- //加载 pdf 文档
- document = PDDocument.load( input );
- /** 文档属性信息 **/
- PDDocumentInformation info = document.getDocumentInformation();
- System.out.println( "标题:" + info.getTitle() );
- System.out.println( "主题:" + info.getSubject() );
- System.out.println( "作者:" + info.getAuthor() );
- System.out.println( "关键字:" + info.getKeywords() );
- System.out.println( "应用程序:" + info.getCreator() );
- System.out.println( "pdf 制作程序:" + info.getProducer() );
- System.out.println( "作者:" + info.getTrapped() );
- System.out.println( "创建时间:" + dateFormat( info.getCreationDate() ));
- System.out.println( "修改时间:" + dateFormat( info.getModificationDate()));
- /** 文档页面信息 **/
- PDDocumentCatalog cata = document.getDocumentCatalog();
- List pages = cata.getAllPages();
- int count = 1;
- for( int i = 0; i < pages.size(); i++ )
- {
- PDPage page = ( PDPage ) pages.get( i );
- if( null != page )
- {
- PDResources res = page.findResources();
- //获取页面图片信息
- Map imgs = res.getImages();
- if( null != imgs )
- {
- Set keySet = imgs.keySet();
- Iterator it = keySet.iterator();
- while( it.hasNext() )
- {
- Object obj = it.next();
- PDXObjectImage img = ( PDXObjectImage ) imgs.get( obj );
- img.write2file( imgSavePath + count );
- count++;
- }
- }
- }
- }
- }catch( Exception e)
- {
- throw e;
- }finally{
- if( null != input )
- input.close();
- if( null != document )
- document.close();
- }
- }
- /**
-  * Formats a calendar value with the DATE_FORMAT pattern.
-  *
-  * @param calendar the time to format; may be null
-  * @return the formatted time, or null when calendar is null
-  * @throws Exception kept in the signature for existing callers; this
-  *                   implementation raises no checked exception
-  */
- public static String dateFormat( Calendar calendar ) throws Exception
- {
-     if( null == calendar )
-     {
-         return null;
-     }
-     // A new formatter per call: SimpleDateFormat is not safe to share between threads.
-     SimpleDateFormat format = new SimpleDateFormat( DATE_FORMAT );
-     return format.format( calendar.getTime() );
- }
以下是其他的操作(同样基于PDFBox 1.8.x的API),仅供参考:
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.util.PDFTextStripper;
importstatic readPDFContent.PDFParse.dateFormat;
/**
*
* @author Angela
*/publicclassPDFReader {/**
* 获取格式化后的时间信息
* @param calendar 时间信息
* @return */publicstatic String dateFormat( Calendar calendar ){
if( null == calendar )
returnnull;
String date = null;
String pattern = "yyyy-MM-dd HH:mm:ss";
SimpleDateFormat format = new SimpleDateFormat( pattern );
date = format.format( calendar.getTime() );
return date == null ? "" : date;
}
/**打印纲要**/publicstaticvoidgetPDFOutline(String file){
try {
//打开pdf文件流
FileInputStream fis = new FileInputStream(file);
//加载 pdf 文档,获取PDDocument文档对象
PDDocument document=PDDocument.load(fis);
//获取PDDocumentCatalog文档目录对象
PDDocumentCatalog catalog=document.getDocumentCatalog();
//获取PDDocumentOutline文档纲要对象
PDDocumentOutline outline=catalog.getDocumentOutline();
//获取第一个纲要条目(标题1)
PDOutlineItem item=outline.getFirstChild();
if(outline!=null){
//遍历每一个标题1while(item!=null){
//打印标题1的文本
System.out.println("Item:"+item.getTitle());
//获取标题1下的第一个子标题(标题2)
PDOutlineItem child=item.getFirstChild();
//遍历每一个标题2while(child!=null){
//打印标题2的文本
System.out.println(" Child:"+child.getTitle());
//指向下一个标题2
child=child.getNextSibling();
}
//指向下一个标题1
item=item.getNextSibling();
}
}
//关闭输入流
document.close();
fis.close();
} catch (FileNotFoundException ex) {
Logger.getLogger(PDFBOXReader.class.getName()).log(Level.SEVERE, null, ex);
} catch (IOException ex) {
Logger.getLogger(PDFBOXReader.class.getName()).log(Level.SEVERE, null, ex);
}
}
/**打印一级目录**/publicstaticvoidgetPDFCatalog(String file){
try {
//打开pdf文件流
FileInputStream fis = new FileInputStream(file);
//加载 pdf 文档,获取PDDocument文档对象
PDDocument document=PDDocument.load(fis);
//获取PDDocumentCatalog文档目录对象
PDDocumentCatalog catalog=document.getDocumentCatalog();
//获取PDDocumentOutline文档纲要对象
PDDocumentOutline outline=catalog.getDocumentOutline();
//获取第一个纲要条目(标题1)if(outline!=null){
PDOutlineItem item=outline.getFirstChild();
//遍历每一个标题1while(item!=null){
//打印标题1的文本
System.out.println("Item:"+item.getTitle());
//指向下一个标题1
item=item.getNextSibling();
}
}
//关闭输入流
document.close();
fis.close();
} catch (FileNotFoundException ex) {
Logger.getLogger(PDFBOXReader.class.getName()).log(Level.SEVERE, null, ex);
} catch (IOException ex) {
Logger.getLogger(PDFBOXReader.class.getName()).log(Level.SEVERE, null, ex);
}
}
/**获取PDF文档元数据**/publicstaticvoidgetPDFInformation(String file){
try {
//打开pdf文件流
FileInputStream fis = new FileInputStream(file);
//加载 pdf 文档,获取PDDocument文档对象
PDDocument document=PDDocument.load(fis);
/** 文档属性信息 **/ PDDocumentInformation info = document.getDocumentInformation();
System.out.println("页数:"+document.getNumberOfPages());
System.out.println( "标题:" + info.getTitle() );
System.out.println( "主题:" + info.getSubject() );
System.out.println( "作者:" + info.getAuthor() );
System.out.println( "关键字:" + info.getKeywords() );
System.out.println( "应用程序:" + info.getCreator() );
System.out.println( "pdf 制作程序:" + info.getProducer() );
System.out.println( "Trapped:" + info.getTrapped() );
System.out.println( "创建时间:" + dateFormat( info.getCreationDate() ));
System.out.println( "修改时间:" + dateFormat( info.getModificationDate()));
//关闭输入流
document.close();
fis.close();
} catch (FileNotFoundException ex) {
Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);
} catch (IOException ex) {
Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);
}
}
/**提取pdf文本**/publicstaticvoidextractTXT(String file){
try{
//打开pdf文件流
FileInputStream fis = new FileInputStream(file);
//实例化一个PDF解析器
PDFParser parser = new PDFParser(fis);
//解析pdf文档
parser.parse();
//获取PDDocument文档对象
PDDocument document=parser.getPDDocument();
//获取一个PDFTextStripper文本剥离对象
PDFTextStripper stripper = new PDFTextStripper();
//获取文本内容
String content = stripper.getText(document);
//打印内容
System.out.println( "内容:" + content );
document.close();
fis.close();
} catch (FileNotFoundException ex) {
Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);
} catch (IOException ex) {
Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);
}
}
/**
* 提取部分页面文本
* @param file pdf文档路径
* @param startPage 开始页数
* @param endPage 结束页数
*/publicstaticvoidextractTXT(String file,int startPage,int endPage){
try{
//打开pdf文件流
FileInputStream fis = new FileInputStream(file);
//实例化一个PDF解析器
PDFParser parser = new PDFParser(fis);
//解析pdf文档
parser.parse();
//获取PDDocument文档对象
PDDocument document=parser.getPDDocument();
//获取一个PDFTextStripper文本剥离对象
PDFTextStripper stripper = new PDFTextStripper();
// 设置起始页
stripper.setStartPage(startPage);
// 设置结束页
stripper.setEndPage(endPage);
//获取文本内容
String content = stripper.getText(document);
//打印内容
System.out.println( "内容:" + content );
document.close();
fis.close();
} catch (FileNotFoundException ex) {
Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);
} catch (IOException ex) {
Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);
}
}
/**
* 提取图片并保存
* @param file PDF文档路径
* @param imgSavePath 图片保存路径
*/publicstaticvoidextractImage(String file,String imgSavePath){
try{
//打开pdf文件流
FileInputStream fis = new FileInputStream(file);
//加载 pdf 文档,获取PDDocument文档对象
PDDocument document=PDDocument.load(fis);
/** 文档页面信息 **///获取PDDocumentCatalog文档目录对象
PDDocumentCatalog catalog = document.getDocumentCatalog();
//获取文档页面PDPage列表
List pages = catalog.getAllPages();
int count = 1;
int pageNum=pages.size(); //文档页数//遍历每一页for( int i = 0; i < pageNum; i++ ){
//取得第i页
PDPage page = ( PDPage ) pages.get( i );
if( null != page ){
PDResources resource = page.findResources();
//获取页面图片信息
Map<String,PDXObjectImage> imgs = resource.getImages();
for(Map.Entry<String,PDXObjectImage> me: imgs.entrySet()){
//System.out.println(me.getKey());
PDXObjectImage img = me.getValue();
//保存图片,会自动添加图片后缀类型
img.write2file( imgSavePath + count );
count++;
}
}
}
document.close();
fis.close();
} catch (FileNotFoundException ex) {
Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);
} catch (IOException ex) {
Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);
}
}
/**
* 提取文本并保存
* @param file PDF文档路径
* @param savePath 文本保存路径
*/publicstaticvoidextractTXT(String file,String savePath){
try{
//打开pdf文件流
FileInputStream fis = new FileInputStream(file);
//实例化一个PDF解析器
PDFParser parser = new PDFParser(fis);
//解析pdf文档
parser.parse();
//获取PDDocument文档对象
PDDocument document=parser.getPDDocument();
//获取一个PDFTextStripper文本剥离对象
PDFTextStripper stripper = new PDFTextStripper();
//创建一个输出流
Writer writer=new OutputStreamWriter(new FileOutputStream(savePath));
//保存文本内容
stripper.writeText(document, writer);
//关闭输出流
writer.close();
//关闭输入流
document.close();
fis.close();
} catch (FileNotFoundException ex) {
Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);
} catch (IOException ex) {
Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);
}
}
/**
* 提取部分页面文本并保存
* @param file PDF文档路径
* @param startPage 开始页数
* @param endPage 结束页数
* @param savePath 文本保存路径
*/publicstaticvoidextractTXT(String file,int startPage,
int endPage,String savePath){
try{
//打开pdf文件流
FileInputStream fis = new FileInputStream(file);
//实例化一个PDF解析器
PDFParser parser = new PDFParser(fis);
//解析pdf文档
parser.parse();
//获取PDDocument文档对象
PDDocument document=parser.getPDDocument();
//获取一个PDFTextStripper文本剥离对象
PDFTextStripper stripper = new PDFTextStripper();
//创建一个输出流
Writer writer=new OutputStreamWriter(new FileOutputStream(savePath));
// 设置起始页
stripper.setStartPage(startPage);
// 设置结束页
stripper.setEndPage(endPage);
//保存文本内容
stripper.writeText(document, writer);
//关闭输出流
writer.close();
//关闭输入流
document.close();
fis.close();
} catch (FileNotFoundException ex) {
Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);
} catch (IOException ex) {
Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);
}
}
publicstaticvoidmain(String args[]){
String file="F:\\pdf\\2013\\000608_阳光股份_2013年年度报告(更新后)_1.pdf";
String savePath="E:\\result1.txt";
long startTime=System.currentTimeMillis();
extractTXT(file,savePath);
long endTime=System.currentTimeMillis();
System.out.println("读写所用时间为:"+(endTime-startTime)+"ms");
}
}