引入 Maven 依赖
<dependencies>
<!-- poi -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.16</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.16</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.16</version>
</dependency>
<!-- pdf -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.4</version>
</dependency>
</dependencies>
提取内容
/**
 * Extracts the textual content of a document, picking the parser from the file extension.
 *
 * <p>Handled extensions (matched case-insensitively): {@code .txt}, {@code .doc}/{@code .docx},
 * {@code .xls}/{@code .xlsx}, {@code .ppt}/{@code .pptx} and {@code .pdf}. For the Office
 * formats the legacy binary parser is tried first and the OOXML parser is used as a fallback,
 * so a file whose extension does not match its real container format is still read.
 *
 * <p>This is a best-effort operation: parse and I/O failures are reported with
 * {@code printStackTrace()} and do not propagate; whatever text was collected so far
 * (possibly the empty string) is returned.
 *
 * @param file the document to read; must not be {@code null}
 * @return the extracted text, or an empty string when the extension is not handled or
 *         the file could not be parsed
 */
private static String read(File file) {
    StringBuilder builder = new StringBuilder();
    // Lower-case once so that "REPORT.PDF" is treated like "report.pdf".
    String name = file.getName().toLowerCase(java.util.Locale.ROOT);

    if (name.endsWith(".txt")) {
        // Decode through a Reader instead of converting each byte chunk separately:
        // a multi-byte character that straddles a chunk boundary would otherwise be corrupted.
        // The platform default charset is kept, matching the previous decoding.
        try (FileInputStream in = new FileInputStream(file);
             java.io.Reader reader = new java.io.InputStreamReader(
                     in, java.nio.charset.Charset.defaultCharset())) {
            char[] chunk = new char[1024];
            int len;
            while ((len = reader.read(chunk)) != -1) {
                builder.append(chunk, 0, len);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    POITextExtractor extractor = null;

    if (name.endsWith(".doc") || name.endsWith(".docx")) {
        // The streams are closed as soon as the document object has been built, including
        // on the failure path that triggers the fallback.
        try (FileInputStream in = new FileInputStream(file)) {
            extractor = new WordExtractor(new HWPFDocument(in));
        } catch (Exception legacyFailure) {
            try (FileInputStream in = new FileInputStream(file)) {
                extractor = new XWPFWordExtractor(new XWPFDocument(in));
            } catch (Exception ooxmlFailure) {
                // Both parsers failed: report it instead of silently returning nothing.
                ooxmlFailure.addSuppressed(legacyFailure);
                ooxmlFailure.printStackTrace();
            }
        }
    }

    if (name.endsWith(".xls") || name.endsWith(".xlsx")) {
        POIFSFileSystem fs = null;
        try {
            fs = new POIFSFileSystem(file);
            extractor = new ExcelExtractor(new HSSFWorkbook(fs));
        } catch (Exception legacyFailure) {
            // Release the file handle held by the legacy file system before falling back.
            if (fs != null) {
                try {
                    fs.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if releasing the handle fails.
                }
            }
            try {
                extractor = new XSSFExcelExtractor(new XSSFWorkbook(file));
            } catch (Exception ooxmlFailure) {
                ooxmlFailure.addSuppressed(legacyFailure);
                ooxmlFailure.printStackTrace();
            }
        }
    }

    if (name.endsWith(".ppt") || name.endsWith(".pptx")) {
        try (FileInputStream in = new FileInputStream(file)) {
            extractor = new PowerPointExtractor(in);
        } catch (Exception legacyFailure) {
            OPCPackage pkg = null;
            try {
                pkg = OPCPackage.open(file);
                extractor = new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
            } catch (Exception ooxmlFailure) {
                // Close the package without writing anything back to the file.
                if (pkg != null) {
                    pkg.revert();
                }
                ooxmlFailure.addSuppressed(legacyFailure);
                ooxmlFailure.printStackTrace();
            }
        }
    }

    if (extractor != null) {
        // try-with-resources guarantees the extractor is closed even if getText() throws.
        try (POITextExtractor open = extractor) {
            builder.append(open.getText());
        } catch (IOException ignored) {
            // Only close() can raise IOException here; the text has already been collected.
        }
    }

    if (name.endsWith(".pdf")) {
        // The document is closed even when text stripping fails.
        try (PDDocument document = PDDocument.load(file)) {
            builder.append(new PDFTextStripper().getText(document));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    return builder.toString();
}