Crawler task 1: use HttpClient to fetch the news titles and URLs from the Baidu News homepage (the page encoding is UTF-8)

The first small crawler task to get started with:

Maven project

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.zhaowu</groupId>
    <artifactId>pachong01</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
    </dependencies>
</project>
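
The dependencies split the work: httpclient fetches pages, jsoup parses the HTML, and commons-io is a general IO utility (declared here but not yet used in this example). Before writing the full crawler, a minimal sketch like the one below can confirm that the dependencies resolved; it uses jsoup's own connect API instead of HttpClient, and the class name and User-Agent string are placeholders of my own, not part of the original task.

package com.zhaowu.renwu1;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

// Hypothetical smoke test: fetch and parse the page in one step with jsoup
// just to verify the project builds and the network path works.
public class SmokeTest {
    public static void main(String[] args) throws Exception {
        Document doc = Jsoup.connect("https://news.baidu.com/")
                .userAgent("Mozilla/5.0") // any browser-like UA string
                .timeout(10000)           // milliseconds
                .get();
        System.out.println(doc.title()); // prints the page <title> if everything works
    }
}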

Code implementation:

package com.zhaowu.renwu1;

import java.io.IOException;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class News {
    public static void main(String[] args) throws ClientProtocolException, IOException {
        // Create an HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create an HttpGet instance for the Baidu News homepage
        HttpGet httpGet = new HttpGet("https://news.baidu.com/");
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(10000) // connection timeout: 10 seconds, in milliseconds
                .setSocketTimeout(10000)  // read timeout: 10 seconds
                .build();
        httpGet.setConfig(config);
        // Set the User-Agent request header to mimic a browser
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/59.0");
        // Execute the GET request
        CloseableHttpResponse response = httpClient.execute(httpGet);
        // Get the response entity
        HttpEntity entity = response.getEntity();
        // Read the entity content (the page is UTF-8 encoded)
        String content = EntityUtils.toString(entity, "utf-8");
        // System.out.println("Page content: " + content);
        // Parse the HTML into a document object
        Document doc = Jsoup.parse(content);
        // Select all <a> elements that carry an href attribute
        Elements hrefElements = doc.select("a[href]");
        for (Element e : hrefElements) {
            System.out.println("News title: " + e.text());
            System.out.println("News URL: " + e.attr("href"));
            System.out.println("------------------------");
        }
        // Release the response and the client
        response.close();
        httpClient.close();
    }
}
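
Note that the selector a[href] matches every anchor on the page, navigation links included, and many hrefs are relative. As a refinement sketch (not part of the original code), the loop below skips anchors without text and resolves relative links via Element.absUrl; this assumes the document was parsed with a base URI, i.e. Jsoup.parse(content, "https://news.baidu.com/"):

// Refined loop (sketch). Requires:
// Document doc = Jsoup.parse(content, "https://news.baidu.com/");
for (Element e : doc.select("a[href]")) {
    String title = e.text().trim();
    String url = e.absUrl("href"); // resolves relative hrefs against the base URI
    if (!title.isEmpty() && url.startsWith("http")) {
        System.out.println("News title: " + title);
        System.out.println("News URL: " + url);
        System.out.println("------------------------");
    }
}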