前言:
本文使用常见的 Java 工具包(Apache POI 和 Apache PDFBox)对 Word 和 PDF 文件进行文字提取。
1.首先声明一下依赖的包
注意,Word 03和Word 07是两种不同类型的文件,
一种以.doc结尾,另一种以.docx结尾,依赖的包不一样。
<!-- Used for file operations, e.g. writing -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.15</version>
</dependency>
<!-- Required for parsing .docx files -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.15</version>
</dependency>
<!-- Required for parsing .doc files -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.15</version>
</dependency>
<!-- Required for parsing .pdf files -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.4</version>
</dependency>
<!-- Unit-test dependency; optional, you can write your own test class instead -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
2.把三种类型的文件抽取都写到一个类里面,如下
package com.chinamobile.cmss.i3.ba.extractText;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
/**
 * Extracts plain text from Word (.doc / .docx) and PDF files.
 *
 * <p>The file type is chosen from the file-name extension only; the file
 * content is not sniffed.
 */
public class FileExtract {

    /**
     * Extracts the text of the file at {@code filePath}.
     *
     * @param filePath path of the file to read; the extension is matched
     *                 case-insensitively against {@code .doc}, {@code .docx} and {@code .pdf}
     * @return the extracted text; an empty string when {@code filePath} is
     *         {@code null} or empty; the literal {@code "error file"} when the
     *         extension is not supported
     * @throws Exception if the file cannot be opened or parsed
     */
    public String fileExtractText(String filePath) throws Exception {
        if (filePath == null || filePath.isEmpty()) {
            return "";
        }
        // Match on the dotted extension so a name such as "mydoc" is not
        // mistaken for a Word document.
        String lowerPath = filePath.toLowerCase();
        if (lowerPath.endsWith(".doc")) {
            return extractDoc(filePath);
        }
        if (lowerPath.endsWith(".docx")) {
            return extractDocx(filePath);
        }
        if (lowerPath.endsWith(".pdf")) {
            return extractPdf(filePath);
        }
        return "error file";
    }

    /** Reads the text of a binary Word (.doc) file. */
    private String extractDoc(String filePath) throws Exception {
        // try-with-resources releases the file handle even when parsing fails.
        try (InputStream in = new FileInputStream(new File(filePath));
             WordExtractor wordExtractor = new WordExtractor(in)) {
            return wordExtractor.getText();
        }
    }

    /** Reads the text of an OOXML Word (.docx) file. */
    private String extractDocx(String filePath) throws Exception {
        OPCPackage opcPackage = POIXMLDocument.openPackage(filePath);
        try {
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            return extractor.getText();
        } finally {
            // openPackage() opens the file read-write and OPCPackage.close()
            // would save it back to disk. revert() releases the file without
            // writing, which is what a read-only text extraction wants.
            opcPackage.revert();
        }
    }

    /** Reads the text of a PDF file. */
    private String extractPdf(String filePath) throws Exception {
        try (InputStream in = new FileInputStream(new File(filePath));
             PDDocument pdDocument = PDDocument.load(in)) {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(pdDocument);
        }
    }
}
3.写一个测试文件,如下
import org.junit.Test;
public class ExtractTest {
@Test
public void FileExtractTest(){
String filePath = "D:\\baoxiao.docx";
FileExtract fileExtract = new FileExtract();
String text = null;
try {
text = fileExtract.fileExtractText(filePath);
} catch (Exception e) {
e.printStackTrace();
}
System.out.println(text);
}
}
4.测试结果
无论是 Word 还是 PDF,图片中的内容都无法识别,程序会自动跳过图片。
Word 和 PDF 中的文字(包括表格内的文字)都可以被提取出来。