Java实现网页抓取(HtmlUnit)

准备条件

加入依赖jar包

<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.15</version>
</dependency>


代码示例

private WebClient initWc() throws IOException {
    WebClient wc = new WebClient(BrowserVersion.CHROME);
    wc.getOptions().setJavaScriptEnabled(false);
    wc.getOptions().setCssEnabled(false);
    wc.getOptions().setTimeout(8000);
    wc.setJavaScriptTimeout(8000);
    wc.setAjaxController(new NicelyResynchronizingAjaxController());
    wc.waitForBackgroundJavaScript(8000);
//        Cache cache=new Cache();
//        wc.setCache(cache);
    wc.getOptions().setThrowExceptionOnScriptError(false);
//        wc.getOptions().setThrowExceptionOnFailingStatusCode(false);
    return wc;
}

public void loadData() {
  WebClient wc = null;

    if ( wc == null ) {
        try {
            wc = initWc();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    try {
        //图片中文字解析时使用
        IIORegistry registry = IIORegistry.getDefaultInstance();  
        registry.registerServiceProvider(new com.sun.media.imageioimpl.plugins.tiff.TIFFImageWriterSpi());  
        registry.registerServiceProvider(new com.sun.media.imageioimpl.plugins.tiff.TIFFImageReaderSpi());  

        StringBuffer errPage =new StringBuffer();
        for(int i =1 ; i<=97;i++){
            loadPage(i,errPage,wc);
            riskCompanyDao.flush();
        }
        log.info("errPage:"+errPage);
//            loadPage(27,errPage,wc);
    } catch (Exception e) {
        log.warn("loadData error! ", e);
    } finally {
        wc.closeAllWindows();
    }
}

private void loadPage(int pageNo,StringBuffer errPage, WebClient wc){
   HtmlPage page;
    try {
        String refer="http://www.baidu.com/";
        URL link=new URL("http://www.kstba.org/minglu-79-"+pageNo+".html");
        WebRequest request=new WebRequest(link);
        request.setCharset("UTF-8");
        request.setAdditionalHeader("Referer", refer);//设置请求报文头里的refer字段
        设置请求报文头里的User-Agent字段
        request.setAdditionalHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36");
        request.setAdditionalHeader("Connection", "keep-alive");
        request.setAdditionalHeader("Cookie", "ad_play_index=47; CNZZDATA1000215585=2014872656-1449554771-%7C1449572770");

        page = wc.getPage(request);

        HtmlPage pageResult = page;
        HtmlTable tableResult = (HtmlTable) pageResult.getElementsByTagName("table").get(0);
        HtmlTableBody body = (HtmlTableBody) tableResult.getChildNodes().get(1);
        int indexRow = 0;
        for ( DomNode node2 : body.getChildNodes() ) {

            if (node2 instanceof  HtmlTableRow ) {
                HtmlTableRow row = (HtmlTableRow) node2;
                List<HtmlTableCell> cells = row.getCells();
                HtmlTableCell cell0=cells.get(0);
                String companyName = cell0.getElementsByTagName("a").get(0).getTextContent();
                String industryName = cell0.getElementsByTagName("div").get(0).getTextContent();
                industryName = industryName.split(":")[1];
                String addr = cell0.getElementsByTagName("div").get(1).getTextContent();
                if (addr.split(":").length>1){
                    addr = addr.split(":")[1];
                }else{
                    addr=null;
                }
                String mobile =null;
                if (cell0.getElementsByTagName("div").get(2).getElementsByTagName("img").size()>0){
                    HtmlImage img =(HtmlImage)cell0.getElementsByTagName("div").get(2).getElementsByTagName("img").get(0);
                    String imgStr =img.getAttribute("src");
                    imgStr =imgStr.substring(0,imgStr.indexOf("&font=")).replace("fontsize=12", "fontsize=22");
                    mobile = ImageRead.getImgStr(imgStr);
                    log.info("mobile:"+mobile);
                }
               
            }
            indexRow++;
        }

    } catch (Exception e) {
        errPage.append(pageNo).append(",");
        log.warn("page error :"+pageNo,e);
    }

}


注意事项

    普通的HttpURLConnection请求容易被目标网站拦截,需设置请求报文头(如User-Agent、Referer等),模拟浏览器请求
    WebClient只需在发起请求前初始化一次,后续所有页面请求复用同一个实例即可
    不同浏览器版本(BrowserVersion)返回的HTML代码有一定差异,需分别调试

上一篇:重构 之 总结代码的坏味道 Bad Smell (一) 重复代码 过长函数 过大的类 过长参数列 发散式变化 霰弹式修改


下一篇:Linux之13——常用统计命令之wc