通用的读取方法:
读取 doc
/**
 * Reads the text of a legacy Word ({@code .doc}) file via {@code HWPFDocument}.
 *
 * <p>Failures are handled on a best-effort basis: any exception raised while opening,
 * parsing or closing the file is printed with {@code printStackTrace()} and the text
 * obtained so far (initially the empty string) is returned.
 *
 * @param file the {@code .doc} file to read
 * @return the value of {@code HWPFDocument.getDocumentText()}, or {@code ""} when the
 *     document could not be opened or parsed
 */
private static String contextOfDoc(File file){
    String str = "";
    // try-with-resources closes the document first and then the stream, including when
    // the constructor or getDocumentText() throws, so the file handle is never leaked.
    try (FileInputStream fis = new FileInputStream(file);
         HWPFDocument doc = new HWPFDocument(fis)) {
        str = doc.getDocumentText();
    } catch (Exception e) {
        // Intentionally broad: a null file or a malformed document surfaces as an
        // unchecked exception, and this method reports the failure instead of propagating it.
        e.printStackTrace();
    }
    return str;
}
读取 docx(以下为方法体片段,suffix、file、wordText、lineList 等变量需在外部声明)
// Extract the text of a .docx file into wordText; for any other suffix nothing is
// opened and wordText keeps its previous value.
FileInputStream docxStream = null;
XWPFWordExtractor textExtractor = null;
try {
    if (suffix.endsWith(".docx")) {
        docxStream = new FileInputStream(file);
        XWPFDocument docxDocument = new XWPFDocument(docxStream);
        textExtractor = new XWPFWordExtractor(docxDocument);
        wordText = textExtractor.getText();
    }
} catch (IOException ioe) {
    log.error("getWordContent error", ioe);
} finally {
    // Close the extractor first (only if one was created), then the file stream.
    if (textExtractor != null) {
        try {
            textExtractor.close();
        } catch (IOException ioe) {
            log.error("close stream failed", ioe);
        }
    }
    // Also reached with a null stream when nothing was opened;
    // presumably CloseUtil.closeStream tolerates null — TODO confirm.
    CloseUtil.closeStream(docxStream);
}
// Split the whole document text into lines, trim both ends of each line, and drop
// the lines that are empty after trimming.
// wordText is left untouched when extraction is skipped or fails, so guard against null
// (assumes it may start out as null outside this snippet — TODO confirm).
String[] lineArr = (wordText == null) ? new String[0] : wordText.split("\r\n|\n\n|\n");
for (String line : lineArr) {
    // Trim once and reuse the result for both the emptiness check and the stored value.
    String trimmed = line.trim();
    if (StringUtils.isNotEmpty(trimmed)) {
        lineList.add(trimmed);
    }
}