获取pdf、doc/docx文本数据

1、依赖关系

<dependency>
     <groupId>org.apache.pdfbox</groupId>
     <artifactId>pdfbox</artifactId>
     <version>2.0.12</version>
     </dependency>
     <dependency>
     <groupId>org.apache.poi</groupId>
     <artifactId>poi</artifactId>
     <version>3.17</version>
     </dependency>
     <dependency>
     <groupId>org.apache.poi</groupId>
     <artifactId>poi-ooxml</artifactId>
     <version>3.17</version>
     </dependency>
     <dependency>
     <groupId>org.apache.poi</groupId>
     <artifactId>poi-scratchpad</artifactId>
     <version>3.17</version>
     </dependency>

2、代码

package com.lucene.util;

import com.zxf.lucene.common.consts.FileSuffix;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileInputStream;

/**
 * Static helpers for extracting plain text from PDF, DOC and DOCX files,
 * plus small string utilities for file names.
 */
public class TextUtil {

    private static final Logger logger = LoggerFactory.getLogger(TextUtil.class);

    /** Utility class; not meant to be instantiated. */
    private TextUtil() {
    }

    /**
     * Extracts the text of a PDF, DOC or DOCX file with all whitespace removed.
     *
     * <p>The format is chosen from the file name's extension (case-insensitive).
     * Extraction failures are logged and reported as an empty string rather than
     * propagated to the caller.
     *
     * @param filepath path of the file to read
     * @return the extracted text without any whitespace, or an empty string if
     *         {@code filepath} is {@code null}, does not denote a regular file,
     *         has an unsupported extension, or could not be parsed
     */
    public static String getTextOfFile(String filepath) {
        if (filepath == null) {
            return "";
        }
        File file = new File(filepath);
        if (!file.isFile()) {
            return "";
        }

        String suffix = getSuffix(file.getName());
        String text;
        if (FileSuffix.PDF.equalsIgnoreCase(suffix)) {
            text = extractPdfText(file);
        } else if (FileSuffix.DOCX.equalsIgnoreCase(suffix)) {
            text = extractDocxText(file);
        } else if (FileSuffix.DOC.equalsIgnoreCase(suffix)) {
            text = extractDocText(file);
        } else {
            return "";
        }
        if (text == null) {
            return "";
        }
        // trim() also drops leading/trailing control characters that \s does not match;
        // \s already covers \r, \n and \t, so a single pass is sufficient.
        return text.trim().replaceAll("\\s", "");
    }

    /** Reads the raw text of a PDF file; returns an empty string (after logging) on failure. */
    private static String extractPdfText(File file) {
        // PDDocument.load(File) owns the underlying file handle and releases it
        // even when parsing fails, so nothing leaks on a corrupt PDF.
        try (PDDocument pdDocument = PDDocument.load(file)) {
            return new PDFTextStripper().getText(pdDocument);
        } catch (Exception e) {
            logger.error("获取pdf文本信息出错", e);
            return "";
        }
    }

    /** Reads the raw text of a .docx file; returns an empty string (after logging) on failure. */
    private static String extractDocxText(File file) {
        try (XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(OPCPackage.open(file))) {
            return xwpfWordExtractor.getText();
        } catch (Exception e) {
            logger.error("获取word文档(.docx)文本信息出错", e);
            return "";
        }
    }

    /** Reads the raw text of a legacy .doc file; returns an empty string (after logging) on failure. */
    private static String extractDocText(File file) {
        // The stream is a separate resource so it is closed even if the
        // WordExtractor constructor throws (e.g. the file is not a real .doc).
        try (FileInputStream in = new FileInputStream(file);
             WordExtractor wordExtractor = new WordExtractor(in)) {
            return wordExtractor.getText();
        } catch (Exception e) {
            logger.error("获取word文档(.doc)文本信息出错", e);
            return "";
        }
    }

    /**
     * Returns the extension of a file name, i.e. everything after the last {@code '.'}.
     *
     * @param string the file name to inspect
     * @return the extension without the dot, or an empty string if {@code string}
     *         is {@code null} or contains no dot
     */
    public static String getSuffix(String string) {
        if (string == null) {
            return "";
        }
        int dotIndex = string.lastIndexOf('.');
        // Without this check a dot-less name such as "pdf" would be returned whole
        // and mistaken for an extension.
        if (dotIndex < 0) {
            return "";
        }
        return string.substring(dotIndex + 1);
    }

    /**
     * Removes the characters {@code / : * ? " < > \ |} from the given value.
     *
     * @param fieldValue the text to clean; must not be {@code null}
     * @return {@code fieldValue} with every listed character removed
     */
    public static String clearIllegalCharacter(String fieldValue) {
        return fieldValue.replaceAll("[\\/:\\*\\?\"<>\\\\|]", "");
    }

}

 

上一篇:c#-使用OpenXML SDK调整DocX中现有图像的大小


下一篇:thinkphp PHPWord数据导出word固定模板,ZipArchive文件批量压缩打包下载