import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Calendar; import java.util.List; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.util.PDFTextStripper; import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.syweb.mydemo.common.model.ImExtInfo; /** * @ClassName: FileInfoUtil * @Description: 读取各类文件工具类 * @author wl * @date 2018年5月9日 下午6:00:07 */ public class FileInfoUtil { private static final Logger logger = LoggerFactory.getLogger(FileInfoUtil.class); public static ImExtInfo readXFile(String inputPath){ ImExtInfo ImExtInfo = new ImExtInfo(); File file = new File(inputPath.trim()); String fileExt = FileUtil.getFileType(file); switch(fileExt){ case "pdf": ImExtInfo = readPdfFile(inputPath); break; case "docx": ImExtInfo = readDocxFile(inputPath); break; case "doc": ImExtInfo = readDocFile(inputPath); break; case "txt": ImExtInfo = readTxtFile(inputPath); break; case "ppt": ImExtInfo = readPPtFile(inputPath); break; case "xls": ImExtInfo = readXlsFile(inputPath); break; case "json": ImExtInfo = readTextFile(inputPath); break; case "md": ImExtInfo = readTextFile(inputPath); break; default: ImExtInfo = null; } return ImExtInfo; } /** * @Title: readDocxFile * @Description: docx读取 * @param @param inputPath * @param @return 参数说明 * @return 
ImExtInfo 返回类型 * @author wl * @date 2018年5月9日 下午5:11:01 */ public static ImExtInfo readDocxFile(String inputPath) { ImExtInfo ImExtInfo = new ImExtInfo(); try { File file = new File(inputPath.trim()); // 1.获取文件名称 String fileTitle = file.getName(); // 2.获取绝对路径 String filePath = file.getAbsolutePath(); // 3.获取文件内容 FileInputStream fis = new FileInputStream(filePath); XWPFDocument document=new XWPFDocument(fis); List<XWPFParagraph> paragraphs = document.getParagraphs(); StringBuilder contentBuilder = new StringBuilder(); for (XWPFParagraph para : paragraphs) { contentBuilder.append(para.getText().trim()); } String content = contentBuilder.toString(); ImExtInfo.setTitle(fileTitle); ImExtInfo.setPath(filePath); ImExtInfo.setContent(content.trim()); // 获取当前时间... String curTimeStamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime()); ImExtInfo.setCreate_time(curTimeStamp); fis.close(); } catch (Exception e) { e.printStackTrace(); logger.error("读取docx文件失败"); } return ImExtInfo; } /** * @Title: readDocFile * @Description: doc读取 * @param @param inputPath * @param @return 参数说明 * @return ImExtInfo 返回类型 * @author wl * @date 2018年5月9日 下午5:11:19 */ public static ImExtInfo readDocFile(String inputPath) { ImExtInfo ImExtInfo = new ImExtInfo(); try { File file = new File(inputPath.trim()); // 1.获取文件名称 String fileTitle = file.getName(); // 2.获取绝对路径 String filePath = file.getAbsolutePath(); // 3.获取文件内容 FileInputStream fis = new FileInputStream(file.getAbsolutePath()); WordExtractor document=new WordExtractor(fis); String paragraphs = document.getText(); StringBuilder contentBuilder = new StringBuilder(); contentBuilder.append(paragraphs.trim()); String content = contentBuilder.toString(); ImExtInfo.setTitle(fileTitle); ImExtInfo.setPath(filePath); ImExtInfo.setContent(content.trim()); // 获取当前时间... 
String curTimeStamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime()); ImExtInfo.setCreate_time(curTimeStamp); document.close(); fis.close(); } catch (Exception e) { e.printStackTrace(); logger.error("读取doc文件失败"); } return ImExtInfo; } /** * @Title: readPdfFile * @Description: 读取PDF * @param @param inputPath * @param @return 参数说明 * @return ImExtInfo 返回类型 * @author wl * @date 2018年5月9日 下午5:16:40 */ public static ImExtInfo readPdfFile(String inputPath) { ImExtInfo ImExtInfo = new ImExtInfo(); COSDocument cosDoc=null; PDDocument pdDoc = null; String paragraphs =null; try { File file = new File(inputPath.trim()); // 1.获取文件名称 String fileTitle = file.getName(); // 2.获取绝对路径 String filePath = file.getAbsolutePath(); // 3.获取文件内容 FileInputStream fis = new FileInputStream(file.getAbsolutePath()); cosDoc=parseDocument(fis); if (cosDoc.isEncrypted()) { if (cosDoc != null) closeCOSDocument(cosDoc); logger.info("该PDF文档是加密文档,无法处理"); } PDFTextStripper stripper = new PDFTextStripper(); String docText = stripper.getText(new PDDocument(cosDoc)); pdDoc = new PDDocument(cosDoc); PDDocumentInformation docInfo = pdDoc.getDocumentInformation(); if(docInfo.getTitle()!=null && !docInfo.getTitle().equals("")){ paragraphs = docInfo.getTitle(); } StringBuilder contentBuilder = new StringBuilder(); contentBuilder.append(docText.trim()); String content = contentBuilder.toString(); ImExtInfo.setTitle(fileTitle); ImExtInfo.setPath(filePath); ImExtInfo.setContent(content.trim()); // 获取当前时间... 
String curTimeStamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime()); ImExtInfo.setCreate_time(curTimeStamp); fis.close(); closeCOSDocument(cosDoc); closePDDocument(pdDoc); } catch (Exception e) { e.printStackTrace(); closeCOSDocument(cosDoc); closePDDocument(pdDoc); logger.error("读取PDF文件失败"); } return ImExtInfo; } /** * @Title: readTxtFile * @Description: 读取txt * @param @param inputPath * @param @return 参数说明 * @return ImExtInfo 返回类型 * @author wl * @date 2018年5月9日 下午5:37:11 */ public static ImExtInfo readTxtFile(String inputPath) { ImExtInfo ImExtInfo = new ImExtInfo(); try { File file = new File(inputPath.trim()); // 1.获取文件名称 String fileTitle = file.getName(); // 2.获取绝对路径 String filePath = file.getAbsolutePath(); // 3.获取文件内容 //构造一个BufferedReader类来读取文件 BufferedReader br = new BufferedReader(new FileReader(file)); String str = null; String result =null; while ((str = br.readLine()) != null) { result = result + "\n" + str; } StringBuilder contentBuilder = new StringBuilder(); contentBuilder.append(result.trim()); String content = contentBuilder.toString(); ImExtInfo.setTitle(fileTitle); ImExtInfo.setPath(filePath); ImExtInfo.setContent(content.trim()); // 获取当前时间... 
String curTimeStamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime()); ImExtInfo.setCreate_time(curTimeStamp); br.close(); } catch (Exception e) { e.printStackTrace(); logger.error("读取txt文件失败"); } return ImExtInfo; } /** * @Title: readPPtFile * @Description: PPT读取 * @param @param inputPath * @param @return 参数说明 * @return ImExtInfo 返回类型 * @author wl * @date 2018年5月9日 下午5:47:21 */ public static ImExtInfo readPPtFile(String inputPath) { ImExtInfo ImExtInfo = new ImExtInfo(); try { File file = new File(inputPath.trim()); // 1.获取文件名称 String fileTitle = file.getName(); // 2.获取绝对路径 String filePath = file.getAbsolutePath(); // 3.获取文件内容 FileInputStream fis = new FileInputStream(file.getAbsolutePath()); PowerPointExtractor document=new PowerPointExtractor(fis); String paragraphs = document.getText(); StringBuilder contentBuilder = new StringBuilder(); contentBuilder.append(paragraphs.trim()); String content = contentBuilder.toString(); ImExtInfo.setTitle(fileTitle); ImExtInfo.setPath(filePath); ImExtInfo.setContent(content.trim()); // 获取当前时间... 
String curTimeStamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime()); ImExtInfo.setCreate_time(curTimeStamp); fis.close(); document.close(); } catch (Exception e) { e.printStackTrace(); logger.error("读取ppt文件失败"); } return ImExtInfo; } /** * @Title: readXlsFile * @Description: 读取xls * @param @param inputPath * @param @return 参数说明 * @return ImExtInfo 返回类型 * @author wl * @date 2018年5月9日 下午5:54:13 */ public static ImExtInfo readXlsFile(String inputPath) { ImExtInfo ImExtInfo = new ImExtInfo(); try { File file = new File(inputPath.trim()); // 1.获取文件名称 String fileTitle = file.getName(); // 2.获取绝对路径 String filePath = file.getAbsolutePath(); // 3.获取文件内容 FileInputStream fis = new FileInputStream(file.getAbsolutePath()); HSSFWorkbook wb=new HSSFWorkbook(new POIFSFileSystem(fis)); ExcelExtractor extractor=new ExcelExtractor(wb); extractor.setFormulasNotResults(false); extractor.setIncludeSheetNames(true); String paragraphs = extractor.getText(); StringBuilder contentBuilder = new StringBuilder(); contentBuilder.append(paragraphs.trim()); String content = contentBuilder.toString(); ImExtInfo.setTitle(fileTitle); ImExtInfo.setPath(filePath); ImExtInfo.setContent(content.trim()); // 获取当前时间... 
String curTimeStamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime()); ImExtInfo.setCreate_time(curTimeStamp); fis.close(); extractor.close(); } catch (Exception e) { e.printStackTrace(); logger.error("读取xls文件失败"); } return ImExtInfo; } /** * @Title: readTextFile * @Description: 读取文本 * @param @param inputPath * @param @return 参数说明 * @return ImExtInfo 返回类型 * @author wl * @date 2018年5月9日 下午5:57:30 */ public static ImExtInfo readTextFile(String inputPath) { ImExtInfo ImExtInfo = new ImExtInfo(); try { File file = new File(inputPath.trim()); // 1.获取文件名称 String fileTitle = file.getName(); // 2.获取绝对路径 String filePath = file.getAbsolutePath(); // 3.获取文件内容 FileInputStream fis = new FileInputStream(file.getAbsolutePath()); WordExtractor extractor = new WordExtractor(fis); String paragraphs = extractor.getText(); StringBuilder contentBuilder = new StringBuilder(); contentBuilder.append(paragraphs.trim()); String content = contentBuilder.toString(); ImExtInfo.setTitle(fileTitle); ImExtInfo.setPath(filePath); ImExtInfo.setContent(content.trim()); // 获取当前时间... 
String curTimeStamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime()); ImExtInfo.setCreate_time(curTimeStamp); fis.close(); extractor.close(); } catch (Exception e) { e.printStackTrace(); logger.error("读取文件失败"); } return ImExtInfo; } /** * @Title: parseDocument * @Description: PD路径读取 * @author wl * @date 2018年5月9日 下午5:20:29 */ private static COSDocument parseDocument(InputStream is) throws IOException { PDFParser parser = new PDFParser(is); parser.parse(); return parser.getDocument(); } /** * @Title: closeCOSDocument * @Description: COS关闭 * @author wl * @date 2018年5月9日 下午5:20:59 */ private static void closeCOSDocument(COSDocument cosDoc) { if (cosDoc != null) { try { cosDoc.close(); } catch (IOException e) { } } } /** * @Title: closeCOSDocument * @Description: PDD关闭 * @author wl * @date 2018年5月9日 下午5:20:59 */ private static void closePDDocument(PDDocument pdDoc) { if (pdDoc != null) { try { pdDoc.close(); } catch (IOException e) { } } } }
// Note: the path argument of the first method and of every other method is a local disk file path, not a network path.