废话不说,直接代码
Jacob 可以从官网下载,下载后把 jacob.dll 放到 Windows 的 system32 目录下(或当前使用的 jre\bin 目录下);jsoup 只需要引入一个 jar 包即可。
package com.sinosoft.util;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.jacob.activeX.ActiveXComponent;
import com.jacob.com.Dispatch;
import com.jacob.com.Variant;
/**
 * Converts between HTML and Word documents.
 *
 * <p>jsoup is used to normalize the HTML markup; JACOB drives a locally installed
 * Microsoft Word through COM so that the conversion is done by Office itself, which
 * gives good output fidelity.
 *
 * <p>Note: jacob.dll must be copied to windows/system32 or to the {@code jre\bin}
 * directory of the JRE that runs this code.
 *
 * <p>All files handled by an instance are named {@code <docName><suffix>} and live
 * in the {@code wordPath} directory.
 */
public class WordJacob {
    /** Format code passed to Word's SaveAs to store a document as HTML (WdSaveFormat wdFormatHTML). */
    private static final int WORD_HTML = 8;
    /** Format code passed to Word's SaveAs to store a document as docx (WdSaveFormat wdFormatXMLDocument). */
    private static final int HTML_WORD = 12;
    /** WdSaveOptions wdDoNotSaveChanges: quit Word without prompting to save open documents. */
    private static final int DO_NOT_SAVE_CHANGES = 0;
    /** Id of the table used purely for layout when printing to Word; it must not get borders. */
    private static final String PRINT_TABLE_ID = "printTable";

    /** Markup prepended by {@link #setHead(String)} so that Word opens the result in print layout view. */
    private static final String WORD_VIEW_HEAD =
            "<!DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN' 'http://www.w3.org/TR/html4/loose.dtd'>"
            // The trailing spaces below separate the xmlns attributes from each other.
            + "<html xmlns:v='urn:schemas-microsoft-com:vml' xmlns:o='urn:schemas-microsoft-com:office:office' "
            + "xmlns:w='urn:schemas-microsoft-com:office:word' xmlns:m='http://schemas.microsoft.com/office/2004/12/omml' "
            + "xmlns='http://www.w3.org/TR/REC-html40'><head>"
            + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
            + "<meta name=ProgId content=Word.Document ><meta name=Generator content='Microsoft Word 14' ><meta name=Originator content='Microsoft Word 14' >"
            + "<!--[if gte mso 9]><xml><w:WordDocument><w:View>Print</w:View><w:TrackMoves>false</w:TrackMoves>"
            + "<w:TrackFormatting/><w:ValidateAgainstSchemas/><w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>"
            + "<w:IgnoreMixedContent>false</w:IgnoreMixedContent>"
            + "<w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText><w:DoNotPromoteQF/>"
            + "<w:LidThemeOther>EN-US</w:LidThemeOther><w:LidThemeAsian>ZH-CN</w:LidThemeAsian>"
            + "<w:LidThemeComplexScript>X-NONE</w:LidThemeComplexScript>"
            + "<w:Compatibility><w:BreakWrappedTables/><w:SnapToGridInCell/><w:WrapTextWithPunct/>"
            + "<w:UseAsianBreakRules/><w:DontGrowAutofit/><w:SplitPgBreakAndParaMark/>"
            + "<w:DontVertAlignCellWithSp/><w:DontBreakConstrainedForcedTables/><w:DontVertAlignInTxbx/>"
            + "<w:Word11KerningPairs/><w:CachedColBalance/><w:UseFELayout/></w:Compatibility>"
            + "<w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel>"
            + "<m:mathPr><m:mathFont m:val='Cambria Math'/><m:brkBin m:val='before'/><m:brkBinSub m:val='--'/>"
            + "<m:smallFrac m:val='off'/><m:dispDef/><m:lMargin m:val='0'/> <m:rMargin m:val='0'/>"
            + "<m:defJc m:val='centerGroup'/><m:wrapIndent m:val='1440'/><m:intLim m:val='subSup'/>"
            + "<m:naryLim m:val='undOvr'/></m:mathPr></w:WordDocument></xml><![endif]-->";

    /** Directory in which the Word/HTML files and the image folder are stored. */
    private final String wordPath;
    /** Document name without a file extension. */
    private final String docName;

    /**
     * @param wordPath directory in which the documents are read and written
     * @param docName  document name; must not include a file extension
     */
    public WordJacob(String wordPath, String docName) {
        this.wordPath = wordPath;
        this.docName = docName;
    }

    /**
     * Example run: reads an HTML report, prepares it for Word and converts it to docx.
     */
    public static void main(String[] args) throws Exception {
        String wordPath = "D:/111/";
        // The content is simply the text of an existing HTML file.
        String content = ComFile.readTxtFile("d://report_sj_bn.html", "utf-8");
        String docName = "333";
        // Prepend the header that makes Word open the document in print layout view.
        String html = WordJacob.setHead(content);
        WordJacob word = new WordJacob(wordPath, docName);
        // Parse the HTML.
        org.jsoup.nodes.Document document = Jsoup.parse(html);
        // Normalize the HTML and extract the images (stored inline as base64) into files.
        word.convertReportHtml(document);
        // Write the HTML file to the same path that htmlToWord() reads from.
        word.writehtml(word.resolve(".html"), document.toString());
        // Convert the HTML plus its ".files" image folder into a Word document.
        word.htmlToWord();
    }

    /**
     * Builds the path of {@code <docName><suffix>} inside {@code wordPath}.
     * Every file location used by this class goes through here so that readers and
     * writers agree on the path whether or not {@code wordPath} ends with a separator.
     */
    private String resolve(String suffix) {
        return new File(wordPath, docName + suffix).getPath();
    }

    /**
     * Quits the given Word instance without saving. Passing wdDoNotSaveChanges keeps a
     * hidden Word from waiting on a "save changes?" prompt when a conversion failed
     * while a document was still open.
     */
    private static void quitWord(ActiveXComponent app) {
        app.invoke("Quit", new Variant[] { new Variant(DO_NOT_SAVE_CHANGES) });
    }

    /**
     * Converts {@code <wordPath>/<docName>.docx} to {@code <wordPath>/<docName>.html}
     * using Microsoft Word via JACOB.
     *
     * <p>Note: jacob.dll must be copied to windows/system32 or to the {@code jre\bin}
     * directory of the JRE in use.
     *
     * @throws Exception if Word cannot be started or the conversion fails
     */
    public void wordToHtml() throws Exception {
        String docfile = resolve(".docx");
        String htmlfile = resolve(".html");
        // Start the Word application.
        ActiveXComponent app = new ActiveXComponent("Word.Application");
        try {
            // Keep the Word window hidden.
            app.setProperty("Visible", new Variant(false));
            // "Documents" is the collection of all documents open in this Word instance.
            Dispatch docs = app.getProperty("Documents").toDispatch();
            // Open the Word file to convert (per Word's Documents.Open signature the extra
            // arguments are ConfirmConversions=false, ReadOnly=true).
            Dispatch doc = Dispatch.invoke(docs, "Open", Dispatch.Method,
                    new Object[] { docfile, new Variant(false), new Variant(true) }, new int[1]).toDispatch();
            // Save it in HTML format.
            Dispatch.invoke(doc, "SaveAs", Dispatch.Method, new Object[] { htmlfile, new Variant(WORD_HTML) },
                    new int[1]);
            // Close the document without saving changes.
            Dispatch.call(doc, "Close", new Variant(false));
        } finally {
            quitWord(app);
        }
    }

    /**
     * Converts {@code <wordPath>/<docName>.html} to {@code <wordPath>/<docName>.docx}
     * by inserting the HTML file into a new Word document via JACOB.
     *
     * <p>Note: jacob.dll must be copied to windows/system32 or to the {@code jre\bin}
     * directory of the JRE in use.
     *
     * @throws Exception if Word cannot be started or the conversion fails
     */
    public void htmlToWord() throws Exception {
        String wordFile = resolve(".docx");
        String htmlFile = resolve(".html");
        // Start the Word application.
        ActiveXComponent app = new ActiveXComponent("Word.Application");
        try {
            app.setProperty("Visible", new Variant(false));
            Dispatch documents = app.getProperty("Documents").toDispatch();
            // Create a new, empty document.
            Dispatch wordDoc = Dispatch.invoke(documents, "Add", Dispatch.Method, new Object[0], new int[1])
                    .toDispatch();
            // Insert the HTML file at the current selection of the new document.
            Dispatch.invoke(app.getProperty("Selection").toDispatch(), "InsertFile", Dispatch.Method,
                    new Object[] { htmlFile, "", new Variant(false), new Variant(false), new Variant(false) },
                    new int[3]);
            // Save as docx and close without further saving.
            Dispatch.invoke(wordDoc, "SaveAs", Dispatch.Method, new Object[] { wordFile, new Variant(HTML_WORD) },
                    new int[1]);
            Dispatch.call(wordDoc, "Close", new Variant(false));
        } finally {
            quitWord(app);
        }
    }

    /**
     * Recursively rewrites an HTML tree so that Word renders it well.
     *
     * <ul>
     * <li>the element with id {@code customerImg} and all {@code o:p} elements are removed;</li>
     * <li>tables and their {@code th}/{@code td} cells get inline border styles, except for
     * the layout table with id {@code printTable} and its cells;</li>
     * <li>{@code img} elements with a {@code data:} source are written to the
     * {@code <docName>.files} folder and re-pointed at that file; images whose source
     * contains both "svg" and "version" (Highcharts charts) are removed.</li>
     * </ul>
     *
     * @param element root of the subtree to process; it is modified in place
     * @throws Exception if an inline image cannot be decoded or saved
     */
    public void convertReportHtml(Element element) throws Exception {
        if (element.id().equals("customerImg")) {
            element.remove();
            return;
        }
        String tagName = element.tagName();
        if (tagName.equals("table")) {
            // printTable is only used for layout when printing to Word: no border for it.
            if (!PRINT_TABLE_ID.equals(element.id())) {
                appendStyle(element, "width: 100%; border:1px solid #CCC; border-collapse:collapse;");
            }
        } else if (tagName.equals("th")) {
            if (!isInsidePrintTable(element)) {
                appendStyle(element, " border:1px solid #CCC;");
            }
        } else if (tagName.equals("td")) {
            if (!isInsidePrintTable(element)) {
                appendStyle(element, " border:1px solid #EEE;");
            }
        } else if (tagName.equals("img")) {
            if (!convertImage(element)) {
                return;
            }
        } else if (tagName.equals("a")) {
            String href = element.attr("href");
            // NOTE(review): this rewrites href with its own value, so it only has an effect
            // on anchors without an href (they gain href=""). A base-URL prefix may have
            // been intended - confirm before changing.
            if (!href.startsWith("http")) {
                element.attr("href", href);
            }
        } else if (tagName.equals("meta")) {
            // NOTE(review): presumably added so jsoup serializes an explicit closing tag - confirm.
            element.appendText("");
        } else if (tagName.equals("o:p")) {
            // The subtree is detached from the document; nothing below it needs processing.
            element.remove();
            return;
        }
        // children() hands back a separate list, so removing a child while iterating is safe.
        for (Element child : element.children()) {
            convertReportHtml(child);
        }
    }

    /** Appends {@code extra} to the element's inline {@code style} attribute. */
    private static void appendStyle(Element element, String extra) {
        element.attr("style", element.attr("style") + extra);
    }

    /**
     * Tells whether the nearest enclosing table of a cell is the layout table.
     * Walking up to the table (instead of assuming a fixed cell/row/section depth)
     * also works for trees without a tbody and never dereferences a missing parent.
     */
    private static boolean isInsidePrintTable(Element cell) {
        for (Element ancestor = cell.parent(); ancestor != null; ancestor = ancestor.parent()) {
            if ("table".equals(ancestor.tagName())) {
                return PRINT_TABLE_ID.equals(ancestor.id());
            }
        }
        return false;
    }

    /**
     * Handles one {@code img} element.
     *
     * @return {@code false} if the image was removed from the document, {@code true} otherwise
     * @throws Exception if a {@code data:} image has no payload or cannot be saved
     */
    private boolean convertImage(Element img) throws Exception {
        String src = img.attr("src");
        // Highcharts charts are dropped. Check this before saving so that no orphan image
        // file is written for an element that is removed anyway.
        if (src.contains("svg") && src.contains("version")) {
            img.remove();
            return false;
        }
        if (src.startsWith("data:")) {
            // Only the part after the first comma is the base64 payload.
            int comma = src.indexOf(',');
            if (comma < 0 || comma == src.length() - 1) {
                throw new IllegalArgumentException("Malformed data URI in img src: no payload after ','");
            }
            String base64Str = src.substring(comma + 1);
            String imgName = ComStr.generateShortUuid() + ".png";
            saveWordImage(base64Str, imgName);
            // Point the image at the extracted file, relative to the HTML file.
            img.attr("src", docName + ".files" + "/" + imgName);
        }
        // NOTE(review): presumably added so jsoup serializes an explicit closing tag - confirm.
        img.appendText("");
        return true;
    }

    /**
     * Decodes a base64 image and stores it in the {@code <docName>.files} folder
     * next to the HTML file.
     *
     * @param base64Str base64-encoded image data
     * @param filename  name of the image file to create
     * @throws Exception if the converter does not return a file
     */
    private void saveWordImage(String base64Str, String filename) throws Exception {
        File f = ImageBase64Converter.convertBase64ToFile(base64Str, resolve(".files"), filename);
        if (f == null) {
            throw new Exception("保存word文档图片失败!");
        }
    }

    /**
     * Prepends the Word-specific header so that the document opens in print layout
     * view when the resulting file is opened in Word.
     *
     * @param html HTML content to wrap; {@code null} is treated as empty
     * @return the header followed by {@code html}
     */
    public static String setHead(String html) {
        return WORD_VIEW_HEAD + (html == null ? "" : html);
    }

    /**
     * Writes {@code content} to {@code file} as UTF-8, creating the parent directory
     * if it does not exist yet.
     *
     * @param file    path of the file to write; an existing file is overwritten
     * @param content text to write
     * @throws Exception if the file cannot be created or written
     */
    public void writehtml(String file, String content) throws Exception {
        File parent = new File(file).getAbsoluteFile().getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }
        // try-with-resources closes both streams even if one close() call fails.
        try (FileOutputStream fos = new FileOutputStream(file);
                OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8")) {
            osw.write(content);
        }
    }
}