前情提要:
公司有一个需求:用 Java 订阅知乎 RSS 的数据源。上网查阅资料后发现,jsoup 是个不错的选择,使用简单方便。
一、Maven 项目的 pom 依赖
<!-- jsoup: HTML parsing library -->
<!-- NOTE(review): 1.8.3 is an old jsoup release; check whether a newer version should be used. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
二、代码案例
package com.rssdemo.utils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
/**
 * Reads the content behind a web page URL.
 *
 * <p>Two approaches are demonstrated in {@link #main(String[])}: reading the raw response
 * text through {@link java.net.URL}, and fetching and parsing the page with jsoup to pick
 * out elements by CSS class name.
 *
 * <p>Date: 2020/8/14 13:35
 *
 * @author zy
 */
public class HtmlContent {

    /**
     * Demo entry point: prints the raw page text, a separator line, and then the text of the
     * first element whose CSS class is {@code NumberBoard-itemValue}.
     *
     * @param args unused
     * @throws Exception if the URL is malformed or the page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        // The URL must include its protocol prefix (http:// or https://); without it the JDK
        // fails with: java.net.MalformedURLException: no protocol: www.baidu.com
        String url = "https://www.zhihu.com/question/412188418/answer/1386120965?utm_campaign=rss&utm_medium=rss&utm_source=rss&utm_content=title";

        // Approach 1: read the response body line by line with plain JDK classes.
        HtmlContent htmlContent = new HtmlContent();
        String content = htmlContent.readUrl(url, StandardCharsets.UTF_8);
        System.out.println(content);
        System.out.println("--------------");

        // Approach 2: let jsoup fetch and parse the page, then select elements by class name.
        Document doc = Jsoup.connect(url).get();
        Elements elementsByClass = doc.getElementsByClass("NumberBoard-itemValue");
        if (elementsByClass.isEmpty()) {
            // The selection is empty when the page contains no such element; report it
            // instead of failing with IndexOutOfBoundsException on get(0).
            System.err.println("No element with class NumberBoard-itemValue found at " + url);
            return;
        }
        // Only the first match is printed; iterate over elementsByClass to print every match.
        String text = elementsByClass.get(0).text();
        System.out.println(text);
    }

    /**
     * Reads the content of the given URL as text.
     *
     * <p>The response is read line by line and each line is terminated with {@code "\r\n"} in
     * the returned string, regardless of the line terminator used by the server.
     *
     * @param url     the web page link, including its protocol prefix
     * @param charset the character encoding used to decode the response bytes
     * @return the decoded content, with every line followed by {@code "\r\n"}
     * @throws Exception if the URL is malformed or an I/O error occurs while reading
     */
    public String readUrl(String url, Charset charset) throws Exception {
        StringBuilder str = new StringBuilder();
        // try-with-resources closes the reader (and the underlying connection stream)
        // even when reading fails part-way through.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new URL(url).openConnection().getInputStream(), charset))) {
            String realLineStr;
            while ((realLineStr = br.readLine()) != null) {
                str.append(realLineStr).append("\r\n"); // terminate every line that was read
            }
        }
        return str.toString();
    }
}