word文件与html文件互转

word文件与html文件互转

一、使用poi完成docx转html

  1. 引入maven依赖

    <dependency>
                <groupId>fr.opensagres.xdocreport</groupId>
                <artifactId>fr.opensagres.xdocreport.document</artifactId>
                <version>2.0.1</version>
            </dependency>
            <dependency>
                <groupId>fr.opensagres.xdocreport</groupId>
                <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
                <version>2.0.1</version>
            </dependency>
    

    此外还需要poi、poi-ooxml、poi-scratchpad、poi-ooxml-schemas这些依赖,并且一定要避免jar包版本冲突。我的项目中用到了elasticsearch(org.elasticsearch),它已经间接引入了这些jar包,所以我在pom文件中再次加入这些依赖后,一直因为冲突而报错。

    word文件与html文件互转

  2. 代码部分

    import org.apache.poi.xwpf.usermodel.XWPFDocument;
    import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
    import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
    
    /**
     * Converts a .docx file to an XHTML file with the XDocReport XHTML converter.
     *
     * <p>Images are extracted into the directory named by {@code tempPath} (declared outside
     * this snippet) and image URIs in the generated markup are resolved against that same
     * directory. Missing parent directories of the output file are created.
     *
     * <p>The input stream, the document and the output stream are all closed before this
     * method returns, whether or not the conversion succeeds.
     *
     * @param fileName   path of the source .docx file
     * @param outPutFile path of the HTML file to write
     * @throws IOException                  if the source cannot be read or the output cannot be written
     * @throws TransformerException         declared by the original signature; kept so existing callers compile
     * @throws ParserConfigurationException declared by the original signature; kept so existing callers compile
     * @throws XmlException                 declared by the original signature; kept so existing callers compile
     */
    public static void docx2Html(String fileName, String outPutFile) throws TransformerException, IOException, ParserConfigurationException, XmlException {
        // try-with-resources closes the stream and the document even when conversion fails.
        try (FileInputStream in = new FileInputStream(fileName);
             XWPFDocument document = new XWPFDocument(in)) {
            XHTMLOptions options = XHTMLOptions.create().indent(4);
            // Extract embedded images to disk and point the generated markup at them.
            File imageFolder = new File(tempPath);
            options.setExtractor(new FileImageExtractor(imageFolder));
            options.URIResolver(new FileURIResolver(imageFolder));

            File outFile = new File(outPutFile);
            // getParentFile() is null for a bare file name such as "out.html".
            File parentDir = outFile.getParentFile();
            if (parentDir != null) {
                parentDir.mkdirs();
            }
            try (OutputStream out = new FileOutputStream(outFile)) {
                XHTMLConverter.getInstance().convert(document, out, options);
            }
        }
    }
    /** Demo entry point: converts a sample .docx file on drive E: to HTML via docx2Html. */
    public static void main(String argv[]) throws Exception {
        final String sourceDocx = "E:\\aaaaaa.docx";
        final String targetHtml = "E:\\aaaaaa.html";
        docx2Html(sourceDocx, targetHtml);
    }
    
  3. 注意

    我是通过hutool中的Word07Writer类生成docx文件的,转换时会报“styles not found”以及与页眉页脚相关的NullPointerException(通过打断点进入源码发现)。最后通过以下方式解决:

    // Create the styles on the writer's underlying document
    // (the article's workaround for the styles-not-found error).
    writer.getDoc().createStyles();
    // Add a default header with the given text
    // (the article's workaround for the header/footer NullPointerException).
    createDefaultHeader(writer.getDoc(), "这是页眉内容");
    
    /**
     * Adds a default header to the document, consisting of one paragraph with the given text.
     *
     * <p>The document body's section properties are reused when present and created otherwise;
     * the header is then created through an {@code XWPFHeaderFooterPolicy} built on them.
     *
     * @param docx the document that receives the header
     * @param text the header text
     */
    private static void createDefaultHeader(final XWPFDocument docx, final String text){
        CTP ctp = CTP.Factory.newInstance();
        XWPFParagraph paragraph = new XWPFParagraph(ctp, docx);
        ctp.addNewR().addNewT().setStringValue(text);
        // ctp was created empty above, so run 0 / text 0 is the node just added. Flag THAT node
        // as space-preserving; previously a second, empty run was appended and flagged instead,
        // which left the real text unflagged.
        ctp.getRArray(0).getTArray(0).setSpace(SpaceAttribute.Space.PRESERVE);

        CTSectPr sectPr;
        if (docx.getDocument().getBody().isSetSectPr()) {
            sectPr = docx.getDocument().getBody().getSectPr();
        } else {
            sectPr = docx.getDocument().getBody().addNewSectPr();
        }

        XWPFHeaderFooterPolicy policy = new XWPFHeaderFooterPolicy(docx, sectPr);
        XWPFHeader header = policy.createHeader(STHdrFtr.DEFAULT, new XWPFParagraph[] { paragraph });
        header.setXWPFDocument(docx);
    }
    

二、通过docx4j将html转docx

  1. 引入依赖

    <dependency>
                <groupId>org.docx4j</groupId>
                <artifactId>docx4j-ImportXHTML</artifactId>
                <version>3.0.0</version>
    </dependency>
    
  2. 代码部分

    public static void main(String argv[]) throws Exception {
    //        String xhtml =
    //                "<table border=\"1\" cellpadding=\"1\" cellspacing=\"1\" style=\"width:100%;\"><tbody><tr><td>test</td><td>test</td></tr><tr><td>test</td><td>test</td></tr><tr><td>test</td><td>test</td></tr></tbody></table>";
            Document document = Jsoup.parse(new File("E:\\report.html"), "UTF-8");
        //html body体    
        String xhtml =document.body().outerHtml();
            // To docx, with content controls
            WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
    
            XHTMLImporterImpl XHTMLImporter = new XHTMLImporterImpl(wordMLPackage);
            //XHTMLImporter.setDivHandler(new DivToSdt());
    
            wordMLPackage.getMainDocumentPart().getContent().addAll(
                    XHTMLImporter.convert(xhtml, null));
    
            System.out.println(XmlUtils.marshaltoString(wordMLPackage
                    .getMainDocumentPart().getJaxbElement(), true, true));
    
    		wordMLPackage.save(new java.io.File("E:\\aaaaaa.docx"));
    
上一篇:对伪装docx文件病毒的逆向分析


下一篇:语法:赖氏经典英语语法.docx