
/*
 * This software is under the Apache License Version 2.0
 * Author: Tao -  mail:cn.java.river@gmail.com
 * Spreading Your Heart
 */

package atao.util.html;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

/**
 * A simple HTML downloader which can also download page resources.
 * <br/>
 * <b>Note: This tool won't download related or sub HTML pages.</b>
 * @author <a href="mailto:cn.java.river@gmail.com">Tao</a>
 * @since 1.0
 */
public class HtmlDownloader
{
    /** URL of the page to download; {@link #main} replaces it with its first argument, if given. */
    private static String url = "http://pervasive2.morselli.unimo.it/~nicola/courses/IngegneriaDelSoftware/java/J6d_xml.html";
    /** Workspace folder receiving every downloaded file; {@link #main} replaces it with its second argument, if given. */
    private static String workspace = "download";
    /** Marker of sub css and js resources. */
    private static final String urlSign = "<link href=";
    /** Marker of sub image (and any other {@code src=}) resources. */
    private static final String urlSign2 = "src=";
    /** Parent of {@link #url}, without trailing slash; set by {@link #setRootUrl()}. */
    private static String rootUrl = null;
    /** Start of a sub url that is not relative to the page: {@code //host/...} or {@code scheme:...}. */
    private static final Pattern ABSOLUTE_URL = Pattern.compile ("//|[A-Za-z][A-Za-z0-9+.-]*:");

    /**
     * Downloads the page to {@code <workspace>/download.html} and, for every line that
     * carries a resource marker, the first resource referenced on that line.
     *
     * @param args optional: {@code args[0]} overrides the page url,
     *             {@code args[1]} overrides the workspace folder.
     * @throws Exception if the page itself cannot be read or written; failures of single
     *                   resources are reported and skipped.
     */
    public static void main (String[] args) throws Exception
    {
        if (args != null && args.length > 0)
        {
            url = args[0];
        }
        if (args != null && args.length > 1)
        {
            workspace = args[1];
        }
        long start = System.nanoTime ();
        setRootUrl ();
        // ISO-8859-1 maps every byte to exactly one char and back, so the page is copied
        // without being re-encoded, whatever charset it really uses. The markers searched
        // for are plain ASCII, so matching is unaffected.
        // NOTE(review): sub urls containing non-ASCII characters are not decoded properly this way.
        try (BufferedReader reader = new BufferedReader (
                 new InputStreamReader (new URL (url).openStream (), StandardCharsets.ISO_8859_1));
             BufferedWriter writer = new BufferedWriter (
                 new OutputStreamWriter (new FileOutputStream (createDownloadFile ("download.html")),
                                         StandardCharsets.ISO_8859_1)))
        {
            String s;
            while ((s = reader.readLine ()) != null)
            {
                writer.write (s);
                writer.newLine ();
                if (hasSubUrl (s))
                {
                    downloadChild (getSubUrl (s));
                }
            }
        }
        System.out.println ("Download time(s):" + String.format ("%.3f", (double)(System.nanoTime () - start)/ 1000000000.00));
    }

    /**
     * Set root url (everything before the last path separator) for the downloading html.
     * A url without any path, such as {@code http://host}, is its own root.
     */
    private static void setRootUrl ()
    {
        int schemeEnd = url.indexOf ("://");
        int pathStart = (schemeEnd == -1) ? 0 : schemeEnd + 3;
        int pos = url.lastIndexOf ('/');
        // A '/' that belongs to "://" is not a path separator.
        rootUrl = (pos >= pathStart) ? url.substring (0, pos) : url;
        System.out.println ("Root Url is:" + rootUrl);
    }

    /**
     * Check if content includes sub resources.
     * @param text line of html content, may be {@code null}.
     * @return {@code true} if the line contains one of the resource markers.
     */
    private static boolean hasSubUrl (String text)
    {
        return text != null && (text.contains (urlSign) || text.contains (urlSign2));
    }

    /**
     * Generate sub url from line content: the quoted value that directly follows the first
     * resource marker ({@code <link href=} is looked up before {@code src=}).
     * Both double and single quotes are accepted.
     *
     * @param text line of html content, may be {@code null}.
     * @return the sub url, or an empty string when the line has no marker or the marker is
     *         not followed by a complete quoted value (e.g. {@code img.src=x} in a script).
     */
    private static String getSubUrl (String text)
    {
        if (text == null)
        {
            return "";
        }
        String sign = urlSign;
        int pos = text.indexOf (sign);
        if (pos == -1)
        {
            sign = urlSign2;
            pos = text.indexOf (sign);
        }
        if (pos == -1)
        {
            return "";
        }
        int open = pos + sign.length ();
        while (open < text.length () && Character.isWhitespace (text.charAt (open)))
        {
            open++;
        }
        if (open >= text.length ())
        {
            return "";
        }
        char quote = text.charAt (open);
        if (quote != '"' && quote != '\'')
        {
            return "";
        }
        int close = text.indexOf (quote, open + 1);
        if (close == -1)
        {
            return "";
        }
        String subUrl = text.substring (open + 1, close);
        System.out.println ("subUrl is :" + subUrl);
        return subUrl;
    }

    /**
     * Download sub resources. <b>Note: don't use Java Character Writers,
     * otherwise you can't get pictures correctly.</b>
     * Absolute urls ({@code scheme:...} or {@code //host/...}) are not supported and skipped.
     * A resource that fails to download is reported on {@code System.err} and skipped so
     * that the remaining resources are still fetched.
     *
     * @param subUrl url of the resource, relative to the page; may be {@code null} or empty.
     */
    private static void downloadChild (String subUrl)
    {
        if (subUrl == null || subUrl.isEmpty ())
        {
            System.out.println ("subUrl is Empty.");
            return;
        }
        if (ABSOLUTE_URL.matcher (subUrl).lookingAt ())
        {
            System.out.println ("subUrl not support yet.");
            return;
        }
        long start = System.nanoTime ();
        try
        {
            URL u = resolveSubUrl (subUrl);
            // Validate the target before any network traffic happens.
            File f = createDownloadFile (localPathOf (subUrl));
            try (InputStream reader = u.openStream ();
                 FileOutputStream writer = new FileOutputStream (f))
            {
                byte[] buff = new byte[1024];
                int size;
                while ((size = reader.read (buff)) != -1)
                {
                    writer.write (buff, 0, size);
                }
            }
        }
        catch (IOException | RuntimeException e)
        {
            // Best effort: one broken resource must not abort the whole download.
            System.err.println ("Failed to download " + subUrl + ": " + e);
        }
        System.out.println ("Source:" + subUrl + " download time(s):" + String.format ("%.3f", (double)(System.nanoTime () - start)/ 1000000000.00));
    }

    /**
     * Resolve a relative sub url against {@link #rootUrl}; a leading {@code /} is resolved
     * against the host, as browsers do. Blanks are escaped as {@code %20}.
     *
     * @param subUrl relative url as found in the page.
     * @return the url to fetch.
     * @throws IOException if the result is not a valid url.
     */
    private static URL resolveSubUrl (String subUrl) throws IOException
    {
        return new URL (new URL (rootUrl + "/"), subUrl.replace (" ", "%20"));
    }

    /**
     * Path used to store a resource locally: the sub url without query string and fragment.
     *
     * @param subUrl relative url as found in the page.
     * @return the part of {@code subUrl} before the first {@code ?} or {@code #}.
     */
    private static String localPathOf (String subUrl)
    {
        int cut = subUrl.length ();
        int query = subUrl.indexOf ('?');
        if (query != -1)
        {
            cut = query;
        }
        int fragment = subUrl.indexOf ('#');
        if (fragment != -1 && fragment < cut)
        {
            cut = fragment;
        }
        return subUrl.substring (0, cut);
    }

    /**
     * Create sub file, create parent folders if necessary.
     * The path comes from downloaded (untrusted) html, so anything that would end up
     * outside of the workspace folder, e.g. through {@code ../}, is rejected.
     *
     * @param url related path of a url source.
     * @return file inside the workspace whose parent folders exist.
     * @throws IOException if the path leaves the workspace or the folders cannot be created.
     */
    private static File createDownloadFile (String url) throws IOException
    {
        File root = new File (workspace).getCanonicalFile ();
        File f = new File (root, url).getCanonicalFile ();
        String rootPath = root.getPath ();
        if (!rootPath.endsWith (File.separator))
        {
            rootPath += File.separator;
        }
        if (!f.getPath ().startsWith (rootPath))
        {
            throw new IOException ("Refusing to write outside of workspace: " + url);
        }
        File parent = f.getParentFile ();
        if (!parent.isDirectory () && !parent.mkdirs ())
        {
            throw new IOException ("Cannot create folder: " + parent);
        }
        return f;
    }
}




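// Previous post: JAVA JDK 8 has been officially released!

// Next post: C language - pointers, address arithmetic, character pointers, pointer arrays, multidimensional arrays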