2021-03-27

Java爬虫(HttpClient+Jsoup)爬取高清大图实例

package practice;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls the desktop-wallpaper listing pages of umei.cc and saves the
 * full-size image behind every thumbnail link to a local directory.
 *
 * <p>Flow per listing page: fetch the listing HTML, follow each thumbnail link
 * to its detail page, read the first full-size {@code <img>} there and stream
 * it to disk.
 */
public class BaidouCrawler {
	/** Number of images saved so far, across all pages. */
	static int sum = 0;

	/** Listing page URL is this prefix + page number + ".htm". */
	private static final String LIST_URL_PREFIX = "https://www.umei.cc/bizhitupian/diannaobizhi/";

	/** Target directory prefix for the downloaded files. */
	private static final String SAVE_DIR = "C://爬虫图片//";

	/** HTTP status code of a successful response. */
	private static final int HTTP_OK = 200;

	/**
	 * Starts the crawl at listing page 1.
	 *
	 * @param args unused
	 * @throws ClientProtocolException on an HTTP protocol error while fetching a listing page
	 * @throws IOException if a listing page cannot be fetched
	 */
	public static void main(String[] args) throws ClientProtocolException, IOException {
		downLoadPicture(1);
	}

	/**
	 * Downloads every image starting at listing page {@code page} and continues
	 * with the following pages until a listing page is missing or contains no
	 * thumbnail links.
	 *
	 * <p>A failure on a single image is reported on {@code System.err} and the
	 * crawl continues with the next image.
	 *
	 * @param page number of the first listing page to crawl
	 * @throws ClientProtocolException on an HTTP protocol error while fetching a listing page
	 * @throws IOException if a listing page cannot be fetched
	 */
	public static void downLoadPicture(int page) throws ClientProtocolException, IOException {
		// One client is shared by all requests and closed when the crawl ends.
		try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
			int current = page;
			// Iterate instead of recursing so a long crawl cannot overflow the stack.
			while (crawlListPage(httpClient, current)) {
				current++;
			}
		}
		System.out.println("恭喜你,共为你下载了" + sum + "张图,程序结束了");
	}

	/**
	 * Crawls one listing page and downloads the image behind each thumbnail link.
	 *
	 * @return {@code false} when the page is missing or has no thumbnail links
	 *         (the signal to stop crawling), {@code true} otherwise
	 */
	private static boolean crawlListPage(CloseableHttpClient httpClient, int page) throws IOException {
		String listUrl = LIST_URL_PREFIX + page + ".htm";
		String listHtml = fetchHtml(httpClient, listUrl);
		if (listHtml == null) {
			return false;
		}
		// Passing the page URL as base URI lets absUrl() resolve relative links.
		Document listDocument = Jsoup.parse(listHtml, listUrl);
		Elements links = listDocument.select("div.TypeList  ul  li  a");
		if (links.isEmpty()) {
			return false;
		}
		// Use the real number of links rather than assuming a fixed count per page.
		for (int i = 0; i < links.size(); i++) {
			String detailUrl = links.get(i).absUrl("href");
			File target = new File(SAVE_DIR + page + "-" + i + ".png");
			try {
				if (downloadImage(httpClient, detailUrl, target)) {
					sum++;
					System.out.println("恭喜第" + page + " 页第"+(i+1)+ "张图,正在下载。。总共下载了" + sum + "张图;目录C:/爬虫图片");
				}
			} catch (IOException | IllegalArgumentException e) {
				// Best effort: one broken link or image must not abort the rest of the page.
				// IllegalArgumentException is what HttpGet throws for a malformed URL.
				System.err.println("第" + page + "页第" + (i + 1) + "张图下载失败,已跳过: " + e);
			}
		}
		return true;
	}

	/**
	 * Opens a detail page, locates its first full-size image and saves it to {@code target}.
	 *
	 * @return {@code true} if the image was written, {@code false} if the detail
	 *         page or the image could not be found
	 */
	private static boolean downloadImage(CloseableHttpClient httpClient, String detailUrl, File target)
			throws IOException {
		if (detailUrl.isEmpty()) {
			return false;
		}
		String detailHtml = fetchHtml(httpClient, detailUrl);
		if (detailHtml == null) {
			return false;
		}
		Document detailDocument = Jsoup.parse(detailHtml, detailUrl);
		Element image = detailDocument.select("div.wrap div.ImageBody  p  img").first();
		if (image == null) {
			return false;
		}
		String imageUrl = image.absUrl("src");
		if (imageUrl.isEmpty()) {
			return false;
		}
		try (CloseableHttpResponse response = httpClient.execute(new HttpGet(imageUrl))) {
			HttpEntity entity = response.getEntity();
			if (response.getStatusLine().getStatusCode() != HTTP_OK || entity == null) {
				return false;
			}
			try (InputStream stream = entity.getContent()) {
				FileUtils.copyInputStreamToFile(stream, target);
			}
			return true;
		}
	}

	/**
	 * Fetches {@code url} and returns the body decoded as UTF-8.
	 *
	 * @return the page source, or {@code null} if the status is not 200 or the
	 *         response has no body
	 */
	private static String fetchHtml(CloseableHttpClient httpClient, String url) throws IOException {
		// Closing the response releases the underlying connection.
		try (CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
			HttpEntity entity = response.getEntity();
			if (response.getStatusLine().getStatusCode() != HTTP_OK || entity == null) {
				return null;
			}
			return EntityUtils.toString(entity, "UTF-8");
		}
	}

}


运行效果如下

2021-03-27
爬取的是电脑壁纸
2021-03-27

上一篇:SQL基础语句


下一篇:SQL教程基础