<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.16</version>
</dependency>
<!-- Dependencies required for reading Word documents -->
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.16</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.16</version>
</dependency>
<!-- Dependencies required for reading Word documents -->
代码:
package springTimer;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.List;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import springTimer.util.CharIndexUtil;
/**
 * Demo that extracts the plain text of Word documents with Apache POI and
 * prints three colon-delimited sections of every {@code .docx} file found
 * under a hard-coded directory.
 */
public class testReadDocx {

    /**
     * Reads the text content of a Word file.
     *
     * <p>{@code .doc} files are read with {@link WordExtractor}; {@code .docx}
     * files are read with {@link XWPFWordExtractor}. The extension check is
     * case-insensitive. Any other path prints a notice and yields an empty
     * string. Failures are reported with a stack trace and also yield an
     * empty string (best-effort behaviour kept from the original code).
     *
     * @param path path of the file to read
     * @return the extracted text, or {@code ""} if the file is not a Word
     *         file or could not be read
     */
    public static String readWord(String path) {
        String buffer = "";
        try {
            if (hasExtension(path, ".doc")) {
                InputStream is = new FileInputStream(new File(path));
                try {
                    WordExtractor ex = new WordExtractor(is);
                    try {
                        buffer = ex.getText();
                    } finally {
                        ex.close();
                    }
                } finally {
                    // Closing again after the extractor is harmless and
                    // covers the case where the constructor itself failed.
                    is.close();
                }
            } else if (hasExtension(path, ".docx")) {
                OPCPackage opcPackage = POIXMLDocument.openPackage(path);
                boolean extractorCreated = false;
                try {
                    POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
                    extractorCreated = true;
                    try {
                        buffer = extractor.getText();
                    } finally {
                        extractor.close();
                    }
                } finally {
                    if (!extractorCreated) {
                        // No extractor exists to release the package, so
                        // discard it here without writing anything back.
                        opcPackage.revert();
                    }
                }
            } else {
                System.out.println("此文件不是word文件!");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return buffer;
    }

    /**
     * Walks the sample directory, prints the path of every {@code .docx} and
     * {@code .png} file, and for each {@code .docx} prints its full text plus
     * the three sections that follow the 5th, 6th and 7th ':' characters.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<File> files = DocumentUtil.getFiles("E:\\zhenglilunbotu\\testdocx\\");
        for (File file : files) {
            String path = file.getPath();
            String name = file.getName();
            if (hasExtension(name, ".docx")) {
                System.out.println(path);
                String content = testReadDocx.readWord(path);
                System.out.println("content====" + content);
                // NOTE(review): assumes getCharacterPosition returns the index
                // of the n-th occurrence of the character - confirm in CharIndexUtil.
                int index1 = CharIndexUtil.getCharacterPosition(content, ":", 5);
                int index2 = CharIndexUtil.getCharacterPosition(content, ":", 6);
                int index3 = CharIndexUtil.getCharacterPosition(content, ":", 7);
                // The "- 4" drops the four characters before the next colon,
                // presumably the four-character label of the following field.
                boolean parsable = isValidRange(content, index1 + 1, index2 - 4)
                        && isValidRange(content, index2 + 1, index3 - 4)
                        && isValidRange(content, index3 + 1, content.length());
                if (!parsable) {
                    // A document without the expected layout must not abort the batch.
                    System.err.println("Skipping " + path + ": expected field layout not found");
                    continue;
                }
                String sc = content.substring(index1 + 1, index2 - 4);
                String jl = content.substring(index2 + 1, index3 - 4);
                String ry = content.substring(index3 + 1);
                System.out.println(sc);
                System.out.println(jl);
                System.out.println(ry);
            } else if (hasExtension(name, ".png")) {
                System.out.println(path);
            }
        }
    }

    /** Returns true if {@code name} ends with {@code extension}, ignoring case. */
    private static boolean hasExtension(String name, String extension) {
        if (name == null || name.length() < extension.length()) {
            return false;
        }
        int offset = name.length() - extension.length();
        return name.regionMatches(true, offset, extension, 0, extension.length());
    }

    /** Returns true if {@code text.substring(start, end)} would not throw. */
    private static boolean isValidRange(String text, int start, int end) {
        return start >= 0 && start <= end && end <= text.length();
    }
}
package springTimer;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
//遍历文件夹下的所有文件
/**
 * Utility for collecting every file beneath a path.
 */
public class DocumentUtil {

    /**
     * Recursively collects all non-directory entries under {@code path}.
     *
     * <p>If {@code path} is not a directory, the returned list contains just
     * that path. Directories whose contents cannot be listed are skipped
     * instead of raising a {@link NullPointerException}.
     *
     * @param path location to scan, for example {@code D:\\videoUpload}
     * @return the entries found, in depth-first listing order; never {@code null}
     */
    public static List<File> getFiles(String path) {
        List<File> files = new ArrayList<File>();
        collect(new File(path), files);
        return files;
    }

    /** Appends {@code node}, or everything beneath it if it is a directory, to {@code sink}. */
    private static void collect(File node, List<File> sink) {
        if (!node.isDirectory()) {
            sink.add(node);
            return;
        }
        File[] children = node.listFiles();
        if (children == null) {
            // listFiles() returns null on an I/O error or when the directory
            // cannot be read; treat that directory as empty.
            return;
        }
        for (File child : children) {
            // Descend with the absolute form, as the original recursion did.
            collect(child.getAbsoluteFile(), sink);
        }
    }
}
package springTimer;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.List;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import springTimer.util.CharIndexUtil;
/**
 * Demo that extracts the plain text of Word documents with Apache POI and
 * prints three colon-delimited sections of every {@code .docx} file found
 * under a hard-coded directory.
 */
public class testReadDocx {

    /**
     * Reads the text content of a Word file.
     *
     * <p>{@code .doc} files are read with {@link WordExtractor}; {@code .docx}
     * files are read with {@link XWPFWordExtractor}. The extension check is
     * case-insensitive. Any other path prints a notice and yields an empty
     * string. Failures are reported with a stack trace and also yield an
     * empty string (best-effort behaviour kept from the original code).
     *
     * @param path path of the file to read
     * @return the extracted text, or {@code ""} if the file is not a Word
     *         file or could not be read
     */
    public static String readWord(String path) {
        String buffer = "";
        try {
            if (endsWithIgnoreCase(path, ".doc")) {
                InputStream is = new FileInputStream(new File(path));
                try {
                    WordExtractor ex = new WordExtractor(is);
                    try {
                        buffer = ex.getText();
                    } finally {
                        ex.close();
                    }
                } finally {
                    // Closing again after the extractor is harmless and
                    // covers the case where the constructor itself failed.
                    is.close();
                }
            } else if (endsWithIgnoreCase(path, ".docx")) {
                OPCPackage opcPackage = POIXMLDocument.openPackage(path);
                boolean extractorCreated = false;
                try {
                    POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
                    extractorCreated = true;
                    try {
                        buffer = extractor.getText();
                    } finally {
                        extractor.close();
                    }
                } finally {
                    if (!extractorCreated) {
                        // No extractor exists to release the package, so
                        // discard it here without writing anything back.
                        opcPackage.revert();
                    }
                }
            } else {
                System.out.println("此文件不是word文件!");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return buffer;
    }

    /**
     * Walks the sample directory and, for each {@code .docx} file, prints its
     * path, its full text, and the three sections that follow the 5th, 6th
     * and 7th ':' characters.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<File> files = DocumentUtil.getFiles("E:\\zhenglilunbotu\\testdocx\\");
        for (File file : files) {
            String path = file.getPath();
            if (!endsWithIgnoreCase(file.getName(), ".docx")) {
                continue;
            }
            System.out.println(path);
            String content = testReadDocx.readWord(path);
            System.out.println("content====" + content);
            // NOTE(review): assumes getCharacterPosition returns the index of
            // the n-th occurrence of the character - confirm in CharIndexUtil.
            int index1 = CharIndexUtil.getCharacterPosition(content, ":", 5);
            int index2 = CharIndexUtil.getCharacterPosition(content, ":", 6);
            int index3 = CharIndexUtil.getCharacterPosition(content, ":", 7);
            // The "- 4" drops the four characters before the next colon,
            // presumably the four-character label of the following field.
            boolean parsable = canSlice(content, index1 + 1, index2 - 4)
                    && canSlice(content, index2 + 1, index3 - 4)
                    && canSlice(content, index3 + 1, content.length());
            if (!parsable) {
                // A document without the expected layout must not abort the batch.
                System.err.println("Skipping " + path + ": expected field layout not found");
                continue;
            }
            String sc = content.substring(index1 + 1, index2 - 4);
            String jl = content.substring(index2 + 1, index3 - 4);
            String ry = content.substring(index3 + 1);
            System.out.println(sc);
            System.out.println(jl);
            System.out.println(ry);
        }
    }

    /** Returns true if {@code name} ends with {@code suffix}, ignoring case. */
    private static boolean endsWithIgnoreCase(String name, String suffix) {
        if (name == null || name.length() < suffix.length()) {
            return false;
        }
        int offset = name.length() - suffix.length();
        return name.regionMatches(true, offset, suffix, 0, suffix.length());
    }

    /** Returns true if {@code text.substring(start, end)} would not throw. */
    private static boolean canSlice(String text, int start, int end) {
        return start >= 0 && start <= end && end <= text.length();
    }
}
word文件
姓 名:张三
医 院:*医院
科 室:*内科
职 称:*医师
擅长领域:治疗内科常见病、多发病,对心脑血管、胃肠、妇科等病中医药治疗有专长。
执业经历:教授,*************,河北*人。19*年至今在*****医院中医科从事临床医疗、教学、科研工作。
所获荣誉:20*年开展的《*实验研究》获*一等奖(*主研人),发表*5篇,撰写著作6部。