目录标题
一、爬取工具
- httpclient
  模拟浏览器请求,用于抓取html页面;抓取后,可以结合正则表达式提取数据。
- fastjson
  用于解析json。对于一些ajax请求,httpclient 是无法抓取这些后期由js动态生成的html的。因此我们需要先获取ajax请求的url,然后通过httpclient请求该url,获取到后台返回的json串,最后解析这些json串就能得到相应的数据。
- jsoup
  这个工具也是用来解析html的,它是通过类似css选择器的语法来获取节点的内容。
- htmlunit
  由于httpclient只能抓取静态的html页面,对于由js动态生成的html,httpclient是无能为力的,因此需要使用htmlunit这个工具。htmlunit可以在获取html页面之后,像浏览器一样执行异步JS代码,等到JS将html页面渲染完成之后,就可以愉快地获取页面上的节点信息了。
- java-selenium
  需要结合浏览器内核使用(需要下载浏览器对应版本内核的驱动,注意驱动版本必须与浏览器版本一致)。selenium会真正地模拟浏览器对网站进行操作。对于一些安全性较高的网站,一般需要登录才能获取数据,登录后的操作一般也要携带token令牌,这时候如果直接通过url进行爬取可能比较麻烦。因此我们可以通过真实模拟浏览器进行操作,从而抓取到这些数据。
二、httpclient爬取案例
爬取这个url的一些数据: http://www.xinhuanet.com/legal/ej.htm?page=fzzt
(一) httpclient get请求url 分析html信息
- 获取html的代码
/**
 * Downloads the Xinhuanet legal-topics page with Apache HttpClient and prints the response body.
 *
 * <p>The body is printed for every status code; for a non-200 status the line
 * {@code 返回状态不是200} is printed first. I/O failures are reported with a stack trace.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // 1. Create the GET request (like typing the address into the browser's address bar).
    HttpGet request = new HttpGet("http://www.xinhuanet.com/legal/ej.htm?page=fzzt");
    // Send a browser-like User-Agent header.
    request.setHeader("User-Agent","Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36");
    // Optional IP proxy (disabled):
    //HttpHost proxy = new HttpHost("112.85.168.223", 9999);
    //RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
    //request.setConfig(config);

    // 2. Create the client (like opening a browser) and execute the request (like pressing Enter).
    //    try-with-resources closes the response and then the client, even when an exception is thrown.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(request)) {
        // 3. Read the response content; a response may carry no entity at all.
        HttpEntity httpEntity = response.getEntity();
        String html = httpEntity == null ? "" : EntityUtils.toString(httpEntity, "utf-8");

        // 4. Only status 200 is treated as success.
        if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
            System.out.println(html);
        } else {
            // Any other status (e.g. 404, page not found) should be handled as the situation
            // requires; here it is only reported.
            System.out.println("返回状态不是200");
            System.out.println(html);
        }
    } catch (IOException e) {
        // ClientProtocolException is a subclass of IOException, so one handler covers both.
        e.printStackTrace();
    }
}
- 运行后获取到的html
<!DOCTYPE html>
<html>
<head>
<link href="/favicon.ico" rel="shortcut icon" type="image/x-icon" />
<meta charset="utf-8" /><meta name="publishid" content="11165612.0.100.0"/>
<meta name="nodeid" content="0"/>
<meta name="nodename" content="" />
<meta name="viewport" content="width=device-width,initial-scale=1.0,user-scalable=0,minimum-scale=1.0,maximum-scale=1.0" />
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black" />
<meta content="telephone=no" name="format-detection" />
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" />
<script src="http://www.news.cn/global/js/pageCore.js"></script>
<title> </title>
<meta name="description" content=" " />
<link rel="stylesheet" href="http://lib.news.cn/common/reset.css" />
<script src="http://res.wx.qq.com/open/js/jweixin-1.6.0.js"></script>
<script src="http://lib.news.cn/common/share.js"></script>
<script src="http://lib.news.cn/jquery/jquery1.12.4/jquery.min.js"></script>
<script src="http://lib.news.cn/xpage/xpage.min.js"></script>
<link rel="stylesheet" href="http://lib.news.cn/swiper/swiper3.4.2/swiper.min.css" />
<script src="http://lib.news.cn/swiper/swiper3.4.2/swiper.min.js"></script>
<!--[if lt IE 10]>
<link rel="stylesheet" href="http://lib.news.cn/swiper/swiper2.7.6/idangerous.swiper.css">
<script src="http://lib.news.cn/swiper/swiper2.7.6/idangerous.swiper.min.js"></script>
<style>
.item-style1 {display:inline-block;width:100%}
.item-style1 .img{ float:left}
.item-style5 .img a img{margin-right:1%}
</style>
<![endif]-->
<link rel="stylesheet" href="/politics/newpage2020/styles/ej.css" />
<style>
.lib-top { background:#2f65c0 !important; }
.nav .nav-cont a:first-child { background:#2f65c0 !important; }
.nav .nav-cont a.active { color:##2f65c0 !important; }
.nav .nav-cont a.active:before { background:#2f65c0 !important; }
.card .title a:before { background:#2f65c0 !important; }
.list .xpage-more-btn { background:#2f65c0 !important; }
.lib-foot { background:#2f65c0 !important; }
</style>
</head>
<body>
<script src="http://lib.news.cn/common/top.js"></script>
<script src="http://lib.news.cn/common/mobHead.js"></script>
<div class="main">
<div class="grid-1200 box clearfix">
<div class="grid-120 mr-30 nav domPc">
<div class="grid-120 nav-cont" id="navCont"></div>
</div>
<div class="grid-700 box-cont">
<div class="list">
<div class="xpage-container" id="list">
<ul class="xpage-content xpage-content-list"></ul>
<div class="xpage-more-btn"></div>
</div>
</div>
</div>
<div class="grid-320 ml-30 hot domPc">
<div class="grid-320 card">
<div class="title"><a href="ej.htm?page=fy" target="_blank">法 眼</a></div>
<div class="list-style2">
<ul>
<li class="clearfix">
<a href="http://www.news.cn/2021-10/05/c_1127930385.htm" target="_blank">
<div class="img"><img src="titlepic/112793/1127930558_1633396049044_title0h.png" alt="藏身网络药房 部分美瞳质量存疑" /></div>
<div class="tit">藏身网络药房 部分美瞳质量存疑</div>
</a>
</li>
<li class="clearfix">
<a href="http://www.news.cn/legal/2021-09/27/c_1127906926.htm" target="_blank">
<div class="img"><img src="titlepic/112790/1127907358_1632711219452_title0h.jpg" alt="惩治金融腐败和防控金融风险一体推进" /></div>
<div class="tit">惩治金融腐败和防控金融风险一体推进</div>
</a>
</li>
<li class="clearfix">
<a href="http://www.news.cn/legal/2021-09/27/c_1127906945.htm" target="_blank">
<div class="img"><img src="titlepic/112790/1127907354_1632711121596_title0h.jpg" alt="粮仓"硕鼠"现形记" /></div>
<div class="tit">粮仓"硕鼠"现形记</div>
</a>
</li>
</ul>
</div>
</div>
<div class="grid-320 card observe" id="observe">
<div class="title"><a href="ej.htm?page=fztj" target="_blank">法治图解</a></div>
<div class="swiper-container">
<div class="swiper-wrapper">
<div class="swiper-slide">
<a href="http://www.news.cn/legal/2021-10/06/c_1127932076.htm" target="_blank">
<div class="img"><img src="titlepic/112793/1127932991_1633508973562_title0h.jpg" alt="详解监察机关证据要求与标准" /></div>
<div class="tit">详解监察机关证据要求与标准</div>
</a>
</div>
<div class="swiper-slide">
<a href="http://www.news.cn/legal/2021-08/18/c_1211336053.htm" target="_blank">
<div class="img"><img src="titlepic/112777/1127776844_1629362752818_title0h.jpg" alt="最新立法“知识点”来了!" /></div>
<div class="tit">最新立法“知识点”来了!</div>
</a>
</div>
<div class="swiper-slide">
<a href="http://www.xinhuanet.com/politics/2021-08/17/c_1211334769.htm" target="_blank">
<div class="img"><img src="titlepic/112777/1127771007_1629248807162_title0h.jpg" alt="疫情防控,这四大意识必须有" /></div>
<div class="tit">疫情防控,这四大意识必须有</div>
</a>
</div>
<div class="swiper-slide">
<a href="http://www.xinhuanet.com/politics/2021-08/05/c_1211317032.htm" target="_blank">
<div class="img"><img src="titlepic/112773/1127735858_1628214570620_title0h.png" alt="这四类妨碍疫情防控的“恶行”,法办!" /></div>
<div class="tit">这四类妨碍疫情防控的“恶行”,法办!</div>
</a>
</div>
<div class="swiper-slide">
<a href="http://www.xinhuanet.com/legal/2021-07/30/c_1211268206.htm" target="_blank">
<div class="img"><img src="titlepic/112771/1127713933_1627634740365_title0h.jpg" alt="消灭“偷脸贼”势在必行!" /></div>
<div class="tit">消灭“偷脸贼”势在必行!</div>
</a>
</div>
</div>
<div class="swiper-pagination pagination"></div>
</div>
</div>
<div class="grid-320 card special" id="special">
<div class="title"><a href="ej.htm?page=fzzt" target="_blank">法治专题</a></div>
<div class="list-style3">
<ul class="clearfix">
<li class="clearfix">
<a href="http://www.xinhuanet.com/legal/fzldzt/quanmgjaqjy6.htm" target="_blank">
<div class="img"><img src="titlepic/112735/1127352605_1618907407390_title0h.jpg" alt="2021年第六个全民国家安全教育日" /></div>
<div class="tit">2021年第六个全民国家安全教育日</div>
</a>
</li>
<li class="clearfix">
<a href="http://www.xinhuanet.com/legal/fzldzt/2021zjw5qh.htm" target="_blank">
<div class="img"><img src="titlepic/112702/1127026351_1611631384945_title0h.png" alt="聚焦十九届*纪委五次全会" /></div>
<div class="tit">聚焦十九届*纪委五次全会</div>
</a>
</li>
</ul>
</div>
</div>
</div>
</div>
</div>
<script src="http://lib.news.cn/common/foot.js"></script>
<script src="http://lib.news.cn/common/rightFixed.js"></script>
<script src="/legal/newpage/js/ej.js"></script>
<div style="display:none"><div id="fwl">010020020120000000000000011200000000000000</div><script type="text/javascript" src="//webd.home.news.cn/webdig.js?z=1"></script><script type="text/javascript">wd_paramtracker("_wdxid=010020020120000000000000011200000000000000")</script><noscript><img src="//webd.home.news.cn/1.gif?z=1&_wdxid=010020020120000000000000011200000000000000" border="0" /></noscript></div> </body>
</html>
- 我们发现没有获取到所有html
运行后我们发现获取到的html是不完整的,缺少了一些数据。
(二)获取动态加载的html
1、分析url请求
- 打开浏览器控制台(开发者工具),抓取url请求、响应信息
- 查看跟js相关的数据
- 找到目标数据后,获取url
- 在新的标签页打开发现,这个请求会返回一个json数据,而这些数据正是html页面缺少的数据。
- 我们发现上面的url携带了很多参数,有一些参数并不是必须的,我们可以尝试删除掉。
修改后的url:http://da.wa.news.cn/nodeart/page?nid=11227931&pgnum=1
(三)使用json工具解析json
1、导入jar
<!-- fastjson 1.x releases before 1.2.83 are affected by a published autoType deserialization vulnerability; use 1.2.83 or later. -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.83</version>
</dependency>
2、具体用法
- JSONObject对象和JSONArray对象
- 先来理解json的格式。json由 { }和[ ] 组成。{ } 里面的内容为一个对象。[ ] 里面的内容为一个list集合(一个数组)
- 在fastJson中,JSONObject的数据是用 { } 来表示的,而JSONArray,顾名思义是由JSONObject构成的数组,用 [ { } , { } , … , { } ] 来表示。
- 比如:
JSONObject: { "id" : "123", "courseID" : "huangt-test", "title" : "提交作业", "content" : null }
JSONArray: [ { "id" : "123", "courseID" : "huangt-test", "title" : "提交作业" } , { "content" : null, "beginTime" : 1398873600000, "endTime" : null } ] ;当然:JSONObject里面也可以有JSONArray。
注意:
- JSONObject【等价于map】可以通过key(也就是冒号前面的名字),来获取value的值(也就是冒号后面的值)
- JSONArray【等价于list】可以用来遍历。JSONArray对象调用iterator()这个方法就能获得一个迭代器
- 举例
目标:解析出下面json串中所有Title的值。
jsonStr =
{
"status":0,
"data":{
"list":[
{
"DocID":1127940962,
"Title":"2021年国家网络安全宣传周",
"NodeId":11227931,
"PubTime":"2021-10-11 08:50:18",
"LinkUrl":"http://www.news.cn/politics/ldzt/wlaqxcz2021/2021fld.htm",
"Abstract":null,
"keyword":null,
"Editor":null,
"Author":"于子茹",
"IsLink":1,
"SourceName":null,
"PicLinks":"http://www.news.cn/legal/titlepic/112794/1127940962_1633940116800_title0h.jpg",
"IsMoreImg":0,
"imgarray":[
],
"SubTitle":null,
"Attr":63,
"m4v":null,
"tarray":[
],
"uarray":[
],
"allPics":[
"http://www.news.cn/legal/titlepic/112794/1127940962_1633940116800_title0h.jpg"
],
"IntroTitle":null,
"Ext1":null,
"Ext2":null,
"Ext3":null,
"Ext4":null,
"Ext5":null,
"Ext6":null,
"Ext7":null,
"Ext8":null,
"Ext9":null,
"Ext10":null
},
{
"DocID":1127352605,
"Title":"2021年第六个全民国家安全教育日",
"NodeId":11227931,
"PubTime":"2021-04-20 16:30:09",
"LinkUrl":"http://www.xinhuanet.com/legal/fzldzt/quanmgjaqjy6.htm",
"Abstract":null,
"keyword":null,
"Editor":null,
"Author":"于子茹",
"IsLink":1,
"SourceName":null,
"PicLinks":"http://www.xinhuanet.com/legal/titlepic/112735/1127352605_1618907407390_title0h.jpg",
"IsMoreImg":0,
"imgarray":[
],
"SubTitle":null,
"Attr":63,
"m4v":null,
"tarray":[
],
"uarray":[
],
"allPics":[
"http://www.xinhuanet.com/legal/titlepic/112735/1127352605_1618907407390_title0h.jpg"
],
"IntroTitle":null,
"Ext1":null,
"Ext2":null,
"Ext3":null,
"Ext4":null,
"Ext5":null,
"Ext6":null,
"Ext7":null,
"Ext8":null,
"Ext9":null,
"Ext10":null
},
{
"DocID":1127026351,
"Title":"聚焦十九届*纪委五次全会",
"NodeId":11227931,
"PubTime":"2021-01-26 11:23:06",
"LinkUrl":"http://www.xinhuanet.com/legal/fzldzt/2021zjw5qh.htm",
"Abstract":null,
"keyword":null,
"Editor":null,
"Author":"卢俊宇",
"IsLink":1,
"SourceName":null,
"PicLinks":"http://www.xinhuanet.com/legal/titlepic/112702/1127026351_1611631384945_title0h.png",
"IsMoreImg":0,
"imgarray":[
],
"SubTitle":null,
"Attr":63,
"m4v":null,
"tarray":[
],
"uarray":[
],
"allPics":[
"http://www.xinhuanet.com/legal/titlepic/112702/1127026351_1611631384945_title0h.png"
],
"IntroTitle":null,
"Ext1":null,
"Ext2":null,
"Ext3":null,
"Ext4":null,
"Ext5":null,
"Ext6":null,
"Ext7":null,
"Ext8":null,
"Ext9":null,
"Ext10":null
}
]
},
"totalnum":23
}
-
将json字符串转换成JSONObject。
注意只能一层一层的往里面解析
// Parse the raw JSON text into the top-level JSONObject.
JSONObject jsonObject = JSON.parseObject(jsonStr);
-
获取jsonObject 中key为data的内容,我们发现data的值是用{ }括号包围的,因此,我们需要通过getJSONObject("data")方法来获取JSONObject。
// Read the nested object stored under the "data" key.
JSONObject data = jsonObject.getJSONObject("data");
-
获取data(新的JSONObject)中key为list的内容,我们发现list的值是用[ ]括号包围的,因此,我们需要通过getJSONArray("list")方法来获取JSONArray。
// Read the array stored under the "list" key of the data object.
JSONArray list = data.getJSONArray("list");
-
有了JSONArray 后,我们就需要对它进行遍历
// Walk the array; every element is written as { ... }, so each one is handled as a JSONObject.
for (Object element : list) {
    JSONObject next = (JSONObject) element;
    // Once the value after the colon is a plain string there is nothing left to unwrap:
    // get(key) returns that value directly.
    articles.add(new Article((String) next.get("Title"), (String) next.get("PubTime"), null));
    System.out.println(next.get("Title"));
    System.out.println(next.get("PubTime"));
    System.out.println(next.get("LinkUrl"));
}
-
完整代码
package com.lihua.crawlingfzzt.service.impl; import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject; import com.lihua.crawlingfzzt.pojo.Article; import com.lihua.crawlingfzzt.service.CrawlingFzztService; import org.apache.http.HttpEntity; import org.apache.http.HttpStatus; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.utils.HttpClientUtils; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.springframework.stereotype.Service; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; /** * @author 15594 */ @Service public class CrawlingFzztServiceImpl implements CrawlingFzztService { @Override public List<Article> getArticles(int pageNum, int cnt) { //存储文章列表 List<Article> articles = new ArrayList<>(); //请求地址 String url = "http://da.wa.news.cn/nodeart/page?nid=11227931&pgnum="+pageNum+"&cnt="+cnt; // String url = "http://da.wa.news.cn/nodeart/page?nid=11227931&pgnum=2&cnt=10"; //生成httpclient,相当于该打开一个浏览器 CloseableHttpClient httpClient = HttpClients.createDefault(); CloseableHttpResponse response = null; //创建get请求,相当于在浏览器地址栏输入 网址 HttpGet request = new HttpGet(url); //伪装头 request.setHeader("User-Agent","Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36"); //IP代理 //HttpHost proxy = new HttpHost("112.85.168.223", 9999); //RequestConfig config = RequestConfig.custom().setProxy(proxy).build(); //request.setConfig(config); try { //执行get请求,相当于在输入地址栏后敲回车键 response = httpClient.execute(request); //判断响应状态为200,进行处理 if(response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { //5.获取响应内容 HttpEntity httpEntity = response.getEntity(); String 
html = EntityUtils.toString(httpEntity, "utf-8"); System.out.println(html); JSONObject jsonObject = JSON.parseObject(html); JSONObject data = jsonObject.getJSONObject("data"); JSONArray list = data.getJSONArray("list"); Iterator<Object> iterator = list.iterator(); while (iterator.hasNext()){ JSONObject next = (JSONObject)iterator.next(); articles.add(new Article((String) next.get("Title"),(String) next.get("PubTime"),null)); System.out.println(next.get("Title")); System.out.println(next.get("PubTime")); System.out.println(next.get("LinkUrl")); } } else { System.out.println("返回状态不是200"); System.out.println(EntityUtils.toString(response.getEntity(), "utf-8")); } } catch (ClientProtocolException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { //关闭资源 HttpClientUtils.closeQuietly(response); HttpClientUtils.closeQuietly(httpClient); } return articles; } }
三、参考
https://blog.csdn.net/gududedabai/article/details/78637186