/* ***************************************************************************** * This software is under the Apache License Version 2.0 * Author: Tao - mail:cn.java.river@gmail.com * Spreading Your Heart **************************************************************************** */ package atao.util.html; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import org.apache.commons.lang.StringUtils; /** * * A Simple HTML downloader which can also download Page resources. * <br/> * <b>Note: This Tool won‘t download related or sub HTML</b> * * @author <a href="mailto:cn.java.river@gmail.com">Tao</a> * @since 1.0 */ public class HtmlDownloader { //URL will be downloaded. private static String url = "http://pervasive2.morselli.unimo.it/~nicola/courses/IngegneriaDelSoftware/java/J6d_xml.html"; //workspace folder. private static String workspace = "download"; //sub css and js resources sign private static String urlSign = "<link href="; //sub image resources sign private static String urlSign2 = "src="; //URL parent. 
private static String rootUrl = null; public static void main (String[] args) throws Exception { long start = System.nanoTime (); setRootUrl (); URL u = new URL (url); InputStream is = u.openStream (); BufferedReader reader = new BufferedReader (new InputStreamReader (is)); File f = createDownloadFile ("download.html"); BufferedWriter writer = new BufferedWriter (new FileWriter (f)); String s; while ((s = reader.readLine ()) != null) { writer.write (s); writer.newLine (); if (hasSubUrl (s)) { downloadChild (getSubUrl (s)); } } is.close (); reader.close (); writer.close (); System.out.println ("Download time(s):" + String.format ("%.3f", (double)(System.nanoTime () - start)/ 1000000000.00)); } // end of main /** * set root url for the downloading html */ private static void setRootUrl () { int pos = url.lastIndexOf ("/"); rootUrl = url.substring (0, pos); System.out.println ("Root Url is:" + rootUrl); } /** * check if content includes sub resources. * * @param text line of html content. * @return Yes or Not */ private static boolean hasSubUrl (String text) { if (StringUtils.isNotEmpty (text)) { if (text.contains (urlSign) || text.contains (urlSign2)) { return true; } return false; } else { return false; } } /** * generate sub url from line content. */ private static String getSubUrl (String text) { int pos = text.indexOf (urlSign); pos = (pos == -1) ? 
text.indexOf (urlSign2) : pos; text = text.substring (pos); String[] ps = text.split ("\""); System.out.println ("subUrl is :" + ps[1]); return ps[1]; } /** * download sub resources,<b>Note: don‘t use Java Character Writers, * otherwise you can‘t get pictures correctly.</b> * * @param subUrl */ private static void downloadChild (String subUrl) { if (StringUtils.isNotEmpty (subUrl)) { if (subUrl.startsWith ("http:")) { System.out.println ("subUrl not support yet."); } else { long start = System.nanoTime (); try { String forUrl = subUrl.replace (" ", "%20"); if (!forUrl.startsWith ("/")) { forUrl = "/" + forUrl; } URL u = new URL (rootUrl + forUrl); InputStream reader = u.openStream (); File f = createDownloadFile (subUrl); FileOutputStream writer = new FileOutputStream (f); byte[] buff = new byte[1024]; int size = -1; while ((size = reader.read (buff)) != -1) { writer.write (buff, 0, size); } reader.close (); writer.close (); } catch (Exception e) { e.printStackTrace (); } System.out.println ("Source:" + subUrl +"download time(s):" + String.format ("%.3f", (double)(System.nanoTime () - start)/ 1000000000.00)); } } else { System.out.println ("subUrl is Empty."); } } /** * create sub file,create parent folders if necessary. * * @param url related path of a url source. * @return created file. */ private static File createDownloadFile (String url) { File f = new File (workspace, url); f.getParentFile ().mkdirs (); return f; } }
小程序:Java下载单页HTML(可下载引用资源)
本程序可下载页面所依赖的 CSS/JS、图片等引用资源,目前不包含下载关联 HTML 页面的功能。代码如下: