Java 爬虫爬取高清图片

代码1:

package com.xy;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls a fixed range of picture detail pages on pic.netbian.com and saves the image
 * displayed on each page to local disk.
 */
public class Test1 {

    /** Site root; each image {@code src} read from a page is appended to it. */
    private static final String SITE_ROOT = "https://pic.netbian.com";

    /** Path prefix under which downloaded images are written. */
    private static final String OUTPUT_DIR = "D://img//imsge//";

    /**
     * Visits the detail pages {@code /tupian/27010.html} through {@code /tupian/27098.html},
     * extracts the first image matching {@code .photo-pic #img img} and downloads it.
     *
     * <p>A page without a matching image, or an image that fails to download, is reported
     * and skipped. A failure to fetch a detail page itself still aborts the run.
     *
     * @param args unused
     * @throws ClientProtocolException if a detail-page request violates the HTTP protocol
     * @throws IOException if a detail page cannot be fetched or the client cannot be closed
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        // try-with-resources closes the client even when a page fetch throws.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            for (int i = 10; i < 99; i++) {
                String pageUrl = SITE_ROOT + "/tupian/270" + i + ".html";
                System.out.println(pageUrl);

                // Parse the page and look up the target <img />.
                Document doc = Jsoup.parse(fetchPage(httpclient, pageUrl));
                Elements images = doc.select(".photo-pic #img img");
                if (images.isEmpty()) {
                    // Check explicitly instead of relying on an IndexOutOfBoundsException.
                    System.err.println("No image found on " + pageUrl);
                    continue;
                }

                // The src attribute is site-relative, so the site root is prepended.
                String url = SITE_ROOT + images.get(0).attr("src");
                System.out.println("第" + (i - 9) + "张" + url);

                try {
                    // File names must be unique; they are derived from the page index.
                    downloadImage(httpclient, url, new File(OUTPUT_DIR + i + "" + 1 + ".jpg"));
                } catch (IOException | RuntimeException e) {
                    // Best effort: one broken image must not stop the whole crawl.
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Fetches a page and returns its body decoded as UTF-8.
     *
     * @param client the HTTP client to use
     * @param pageUrl absolute URL of the page
     * @return the response body as text
     * @throws IOException if the request fails or the body cannot be read
     */
    private static String fetchPage(CloseableHttpClient client, String pageUrl) throws IOException {
        // Close every response so its connection is released before the next request.
        try (CloseableHttpResponse response = client.execute(new HttpGet(pageUrl))) {
            return EntityUtils.toString(response.getEntity(), "utf-8");
        }
    }

    /**
     * Downloads the resource at {@code imageUrl} into {@code target}.
     *
     * @param client the HTTP client to use
     * @param imageUrl absolute URL of the image
     * @param target file to write; an existing file is overwritten
     * @throws IOException if the request fails, the response has no body, or the file
     *     cannot be written
     */
    private static void downloadImage(CloseableHttpClient client, String imageUrl, File target)
            throws IOException {
        try (CloseableHttpResponse response = client.execute(new HttpGet(imageUrl))) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("No response body for " + imageUrl);
            }
            // copyToFile leaves the source stream open, so it is closed here.
            try (InputStream in = entity.getContent()) {
                FileUtils.copyToFile(in, target);
            }
        }
    }

}

代码2:

package com.xy;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls the search-result pages of pic.netbian.com and saves the preview image of each
 * listed entry to local disk.
 */
public class Test2 {

    /** Site root; each preview {@code src} read from a page is appended to it. */
    private static final String SITE_ROOT = "https://pic.netbian.com";

    /** Upper bound on the number of list entries processed per result page. */
    private static final int MAX_ITEMS_PER_PAGE = 15;

    /**
     * Walks result pages 0 through 10 of search id 2453, and for up to
     * {@value #MAX_ITEMS_PER_PAGE} entries matching {@code .slist ul li} on each page
     * downloads the image referenced by the entry's {@code a img} element.
     *
     * <p>Entries without a {@code src}, and images that fail to download, are reported and
     * skipped. A failure to fetch a result page itself still aborts the run.
     *
     * @param args unused
     * @throws ClientProtocolException if a result-page request violates the HTTP protocol
     * @throws IOException if a result page cannot be fetched or the client cannot be closed
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        // try-with-resources closes the client even when a page fetch throws.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            for (int i = 0; i <= 10; i++) {
                String pageUrl = SITE_ROOT + "/e/search/result/index.php?page=" + i + "&searchid=2453";
                System.out.println(pageUrl);

                // Parse the page and collect the list entries that hold the previews.
                Document doc = Jsoup.parse(fetchPage(httpclient, pageUrl));
                Elements items = doc.select(".slist ul li");

                // A page may list fewer entries than the cap, so bound by the actual size.
                int count = Math.min(MAX_ITEMS_PER_PAGE, items.size());
                for (int j = 0; j < count; j++) {
                    Element item = items.get(j);
                    String src = item.select("a img").attr("src");
                    if (src.isEmpty()) {
                        System.err.println("Entry " + j + " on " + pageUrl + " has no image src");
                        continue;
                    }

                    // The src has no scheme/host prefix, so the site root is prepended.
                    String imageUrl = SITE_ROOT + src;
                    System.out.println(imageUrl);

                    try {
                        // File names must be unique; they combine page and entry index.
                        downloadImage(httpclient, imageUrl, new File("D://img//" + i + "" + j + ".jpg"));
                    } catch (IOException | RuntimeException e) {
                        // Best effort: one broken image must not stop the whole crawl.
                        e.printStackTrace();
                    }
                }
            }
        }
    }

    /**
     * Fetches a page and returns its body decoded as UTF-8.
     *
     * @param client the HTTP client to use
     * @param pageUrl absolute URL of the page
     * @return the response body as text
     * @throws IOException if the request fails or the body cannot be read
     */
    private static String fetchPage(CloseableHttpClient client, String pageUrl) throws IOException {
        // Close every response so its connection is released before the next request.
        try (CloseableHttpResponse response = client.execute(new HttpGet(pageUrl))) {
            return EntityUtils.toString(response.getEntity(), "utf-8");
        }
    }

    /**
     * Downloads the resource at {@code imageUrl} into {@code target}.
     *
     * @param client the HTTP client to use
     * @param imageUrl absolute URL of the image
     * @param target file to write; an existing file is overwritten
     * @throws IOException if the request fails, the response has no body, or the file
     *     cannot be written
     */
    private static void downloadImage(CloseableHttpClient client, String imageUrl, File target)
            throws IOException {
        try (CloseableHttpResponse response = client.execute(new HttpGet(imageUrl))) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("No response body for " + imageUrl);
            }
            // copyToFile leaves the source stream open, so it is closed here.
            try (InputStream in = entity.getContent()) {
                FileUtils.copyToFile(in, target);
            }
        }
    }

}
上一篇:观看利用湖仓一体架构快速搭建企业数据中台有感


下一篇:Android 虚拟机访问本地服务器