使用jsoup解析html页面内容案例

public String getFaGuiKuTitles(String type, int page) {
String href = "http://info.qd-n-tax.gov.cn/NewFaGuiKu/"+type+"/";
String baseUrl = href + "index"; int no = 0;
String msg = ""; if(page>0){
baseUrl = baseUrl + "_"+page;
} baseUrl += ".htm"; int totalPage = 0; List<FaGui> list = new ArrayList<FaGui>(); try {
URL url = new URL(baseUrl);
org.jsoup.nodes.Document doc = Jsoup.parse(url, 10000); org.jsoup.nodes.Element table = doc.select("table").get(0);
org.jsoup.nodes.Element tbody = table.select("tbody").get(0);
org.jsoup.select.Elements rows = tbody.select("tr"); int len = rows.size(); for (int i = 0; i < len; i++) {
org.jsoup.select.Elements cols = rows.get(i).select("td"); FaGui fg = new FaGui();
fg.setTitle(cols.get(0).text());
fg.setDate(cols.get(1).text()); if(cols.size()>2){
fg.setFwzh(cols.get(2).text());
} org.jsoup.nodes.Element a = cols.get(0).select("a").get(0);
fg.setHref(a.attr("href").replaceFirst("./", href)); list.add(fg);
} //翻页信息
String pager = doc.getElementsByClass("pager").get(0).html();
int start = pager.indexOf("(")+1;
int end = pager.indexOf(",");
pager = pager.substring(start, end);//截取页面中的总页数 if(pager.matches("\\d+")){
totalPage = Integer.parseInt(pager);
} no = 1;
msg = "SUCCESS"; log.info("获取税收法规库标题内容", "getFaGuiKuTitles");
} catch (MalformedURLException ex) {
Logger.getLogger(LocalServiceImpl.class.getName()).log(Level.SEVERE, null, ex);
msg = "获取税收法规库标题内容:baseUrl"+baseUrl+"不可用,ex:"+ex;
log.error(msg, "getFaGuiKuTitles");
} catch (IOException ex) {
Logger.getLogger(LocalServiceImpl.class.getName()).log(Level.SEVERE, null, ex);
msg = "获取税收法规库标题内容:IO异常,ex:"+ex;
log.error(msg, "getFaGuiKuTitles");
} return ResultUtil.getResult(no, msg, list,totalPage,page);
}
上一篇:从汇编来看c语言之指针


下一篇:linux学习笔记2-命令总结2