直接爬取照片
介绍
该方法适用于可以直接从页面 HTML 中获取到图片(img)标签的情况,下面以爬取微信公众号文章中的图片为例。
打开目标网页
配置maven-导入jsoup
<!-- Web crawler: jsoup, used to fetch and parse the HTML page -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
编写代码
package com.chen.util;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
/**
 * Downloads every image referenced inside the {@code js_content} element of a
 * WeChat official-account article and saves them as sequentially numbered
 * files ({@code 1.jpg}, {@code 2.png}, ...) in {@link #OUTPUT_DIR}.
 */
public class Test {

    /** Directory the downloaded images are written to. */
    private static final String OUTPUT_DIR = "G:\\images\\";

    /** Connect/read timeout, in milliseconds, for the page and for each image. */
    private static final int TIMEOUT_MS = 10000;

    /**
     * Fetches the article, collects the {@code data-src} attribute of every
     * {@code img} tag under {@code js_content}, and downloads each image.
     * A failure on one image is reported and does not stop the remaining downloads.
     *
     * @param args unused
     * @throws Exception if the article page itself cannot be fetched or parsed
     */
    public static void main(String[] args) throws Exception {
        String url = "https://mp.weixin.qq.com/s?__biz=MzA3NTE5MzQzMA==&mid=2655450793&idx=1&sn=5394c0e352eff2583583b3967028cec8&chksm=84c61b90b3b19286602f44190416a0af87dbc164dc9ca33e88b5c8ea23f93cfa65d07258d1b0&mpshare=1&scene=23&srcid=01244eYMKK7Za3Mg8m0PXKjh&sharer_sharetime=1611471589507&sharer_shareid=73f49c4878d18eba06f9547be6e90474#rd";
        Document document = Jsoup.parse(new URL(url), TIMEOUT_MS); // fetch and parse the whole page
        Element element = document.getElementById("js_content");  // article body container
        if (element == null) {
            // getElementById returns null when the page has no such element
            System.err.println("未找到 id 为 js_content 的元素,无法获取图片!");
            return;
        }

        // FileOutputStream does not create missing directories, so make sure it exists first.
        new java.io.File(OUTPUT_DIR).mkdirs();

        Elements imgs = element.getElementsByTag("img"); // every img tag below the container
        int id = 1; // sequence number used as the file name
        for (Element img : imgs) {
            String src = img.attr("data-src"); // the real image address is kept in data-src
            if (src.isEmpty()) {
                continue; // attr() yields "" when the attribute is absent
            }
            String extension = extensionOf(src);
            if (extension == null) {
                System.err.println("跳过无法识别格式的图片:" + src);
                continue;
            }
            try {
                download(src, OUTPUT_DIR + id + extension);
                System.out.println((id++) + "号下载完毕!");
            } catch (java.io.IOException e) {
                // id is not advanced, so the next image reuses this number
                System.err.println(id + "号下载失败:" + e.getMessage());
            }
        }
    }

    /**
     * Chooses a file extension from the image address.
     *
     * @param src image address
     * @return {@code ".jpg"}, {@code ".png"} or {@code ".gif"} depending on which
     *         marker the address contains (checked in that order), or
     *         {@code null} when it contains none of them
     */
    private static String extensionOf(String src) {
        if (src.contains("jpg")) {
            return ".jpg";
        }
        if (src.contains("png")) {
            return ".png";
        }
        if (src.contains("gif")) {
            return ".gif";
        }
        return null;
    }

    /**
     * Copies the resource at {@code src} into the file {@code destination}.
     * Both streams are closed even when the transfer fails.
     *
     * @param src         address of the image to download
     * @param destination path of the file to write
     * @throws java.io.IOException if the address is malformed, the connection
     *                             fails, or the file cannot be written
     */
    private static void download(String src, String destination) throws java.io.IOException {
        URLConnection connection = new URL(src).openConnection();
        connection.setConnectTimeout(TIMEOUT_MS);
        connection.setReadTimeout(TIMEOUT_MS);
        try (InputStream in = connection.getInputStream();
             OutputStream out = new FileOutputStream(destination)) {
            byte[] buffer = new byte[8192];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        }
    }
}
效果展示
间接爬取照片
介绍
该方法适用于无法直接获取到图片标签的情况:页面中显示的图片是通过 JS 代码渲染出来的,HTML 源码中并没有对应的图片标签。下面以爬取 LOL 官网英雄图片为例。
打开目标网页
LOL官网里的英雄名称和图片都是通过JS代码渲染出来的,按F12打开开发者工具,在Network栏中点击XHR,就可以发现有一个hero_list.js文件。
配置maven-导入fastjson
<!-- fastjson: converts JSON strings to Java objects.
     Version 1.2.83 is used because releases up to and including 1.2.47 are affected by
     publicly documented autoType deserialization vulnerabilities, and the crawler
     parses data received from a remote server. -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.83</version>
</dependency>
调用第三方http接口
爬取英雄信息
通过JDK自带的网络类 java.net.HttpURLConnection 来实现,爬取英雄的ID、名称、别名、音频信息。
思路:先通过GET请求获取到hero_list.js文件,然后将其中的json字符串转换为java对象,最后通过键(key)取值的方式获取需要的信息,hero_list.js文件如下所示:
代码
package com.chen.util;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Fetches the hero list published by the LOL site with {@link HttpURLConnection}
 * and prints the id, name and ban/select audio address of every hero.
 */
public class Test {

    /** Address of the hero list file requested by the LOL site. */
    private static final String HERO_LIST_URL =
            "https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js";

    /** Connect/read timeout in milliseconds. */
    private static final int TIMEOUT_MS = 10000;

    /**
     * Requests the hero list, parses it as JSON and prints the file metadata
     * followed by one record per hero. Nothing is printed besides an error
     * message when the server does not answer with HTTP 200 or the body is empty.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HttpURLConnection httpURLConnection = null;
        try {
            // Step 1: create the URL object
            URL url = new URL(HERO_LIST_URL);
            // Step 2: open the connection
            httpURLConnection = (HttpURLConnection) url.openConnection();
            // Step 3: request method and timeouts
            httpURLConnection.setRequestMethod("GET");
            httpURLConnection.setConnectTimeout(TIMEOUT_MS);
            httpURLConnection.setReadTimeout(TIMEOUT_MS); // otherwise a stalled server blocks forever
            // Step 4: common request headers
            httpURLConnection.setRequestProperty("Accept", "*/*"); // accept any resource type
            httpURLConnection.setRequestProperty("Connection", "Keep-Alive"); // persistent connection
            httpURLConnection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36");
            httpURLConnection.setRequestProperty("Content-Type", "application/json;charset=utf-8");
            // Step 5: connect (a POST request would additionally need setDoOutput(true))
            httpURLConnection.setDoOutput(false); // nothing is written to the connection
            httpURLConnection.setDoInput(true);   // the response is read from the connection
            httpURLConnection.connect();
            // Step 6: check the response status code
            int responseCode = httpURLConnection.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                // Without this guard an empty body would be parsed to null and dereferenced below.
                System.err.println("请求失败,响应状态码:" + responseCode);
                return;
            }
            String data = readBody(httpURLConnection);

            // Convert the JSON string { ... } into a JSONObject
            JSONObject jsonObject = JSONObject.parseObject(data);
            if (jsonObject == null) {
                System.err.println("响应内容为空,无法解析!");
                return;
            }
            System.out.println("文件名:" + jsonObject.get("fileName"));
            System.out.println("文件时间:" + jsonObject.get("fileTime"));
            System.out.println("版本号:" + jsonObject.get("version"));

            // The "hero" entry is a JSON array [ ... ] with one object per hero
            JSONArray heros = jsonObject.getJSONArray("hero");
            if (heros == null) {
                return;
            }
            for (Object hero : heros) {
                JSONObject temp = (JSONObject) hero;
                Integer id = temp.getInteger("heroId");             // hero id
                String name = temp.getString("name");               // hero name
                String banAudio = temp.getString("banAudio");       // audio played when the hero is banned
                String selectAudio = temp.getString("selectAudio"); // audio played when the hero is picked
                System.out.println("英雄ID:" + id + "\n" +
                        "英雄名称:" + name + "\n" +
                        "英雄被禁用时的音频:" + banAudio + "\n" +
                        "英雄被选用时的音频:" + selectAudio + "\n" +
                        "------");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Step 7: disconnect and release resources
            if (httpURLConnection != null) {
                httpURLConnection.disconnect();
            }
        }
    }

    /**
     * Reads the whole response body as UTF-8 text. Lines are concatenated
     * without their line terminators.
     *
     * @param connection an already connected connection whose status is 200
     * @return the response body
     * @throws IOException if the body cannot be read
     */
    private static String readBody(HttpURLConnection connection) throws IOException {
        StringBuilder body = new StringBuilder();
        // Decode explicitly as UTF-8: the platform default (e.g. GBK) would garble Chinese names.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line);
            }
        }
        return body.toString();
    }
}
效果展示:
爬取英雄皮肤
随便点一个英雄头像,进入到单个英雄介绍页面,如下图所示,可以发现卡牌的英雄ID是4,而单英雄介绍页面恰恰有一个4.js文件,而里面有英雄信息,皮肤信息,还有技能信息。
这里我只想爬取一下英雄皮肤,如下所示:
代码
package com.chen.util;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import java.io.*;
import java.net.*;
/**
 * Downloads the skin images of every LOL hero. The hero list supplies the hero
 * ids, each hero's own {@code <id>.js} file supplies the skins, and every skin
 * image is stored as {@code G:\LOL\<hero name>\<skin name>.<ext>}.
 */
public class Test {

    /** Address of the file listing all heroes. */
    private static final String HERO_LIST_URL =
            "https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js";

    /** Prefix of the per-hero file; the hero id and {@code .js} are appended. */
    private static final String HERO_DETAIL_URL_PREFIX =
            "https://game.gtimg.cn/images/lol/act/img/js/hero/";

    /** Root directory for the downloaded skins. */
    private static final String OUTPUT_ROOT = "G:\\LOL\\";

    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36";

    /** Connect/read timeout in milliseconds for every request. */
    private static final int TIMEOUT_MS = 10000;

    /** Upper bound on guessed addresses tried for one skin, so probing always terminates. */
    private static final int MAX_PROBES = 50;

    /**
     * Fetches the hero list, prints its metadata and downloads the skins of each hero.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            String data = fetchText(HERO_LIST_URL);
            // Convert the JSON string { ... } into a JSONObject
            JSONObject jsonObject = data == null ? null : JSONObject.parseObject(data);
            if (jsonObject == null) {
                System.err.println("获取英雄列表失败!");
                return;
            }
            System.out.println("文件名:" + jsonObject.get("fileName"));
            System.out.println("文件时间:" + jsonObject.get("fileTime"));
            System.out.println("版本号:" + jsonObject.get("version"));

            // The "hero" entry is a JSON array [ ... ] with one object per hero
            JSONArray heros = jsonObject.getJSONArray("hero");
            if (heros == null) {
                return;
            }
            for (Object hero : heros) {
                Integer id = ((JSONObject) hero).getInteger("heroId");
                if (id == null) {
                    continue;
                }
                // The per-hero file is named after the hero id itself (hero 4 -> 4.js).
                downloadSkins(id);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads all skins of one hero.
     *
     * <p>Some skin entries carry no usable image address. For those the address
     * is guessed by incrementing the number at the end of the previous skin's
     * address until the server answers with HTTP 200. A skin whose image cannot
     * be resolved or saved is reported and skipped; the remaining skins are
     * still processed.
     *
     * @param id hero id, used to build the address of the hero's detail file
     */
    private static void downloadSkins(Integer id) {
        try {
            String data = fetchText(HERO_DETAIL_URL_PREFIX + id + ".js");
            JSONObject jsonObject = data == null ? null : JSONObject.parseObject(data);
            JSONArray skins = jsonObject == null ? null : jsonObject.getJSONArray("skins");
            if (skins == null) {
                System.err.println("获取英雄 " + id + " 的皮肤信息失败!");
                return;
            }

            String lastImageUrl = null; // address of the most recently resolved skin image
            for (Object skin : skins) {
                JSONObject temp = (JSONObject) skin;
                String heroName = temp.getString("heroName"); // hero name
                String name = temp.getString("name");         // skin name
                String mainImg = temp.getString("mainImg");   // skin image address

                if (!isUrl(mainImg)) {
                    // No address in the file: guess it from the previous skin's address.
                    System.out.println(name);
                    mainImg = probeNextImageUrl(lastImageUrl);
                    if (mainImg == null) {
                        System.err.println(name + " 找不到可用的图片地址,已跳过!");
                        continue;
                    }
                }
                lastImageUrl = mainImg;

                try {
                    saveImage(mainImg, heroName, name);
                    System.out.println(name + "下载完毕!");
                } catch (IOException e) {
                    System.err.println(name + " 下载失败:" + e.getMessage());
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Performs a GET request and returns the body decoded as UTF-8, with lines
     * concatenated without their terminators.
     *
     * @param address address to request
     * @return the response body, or {@code null} when the status is not 200
     * @throws IOException if the request fails
     */
    private static String fetchText(String address) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(address).openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setConnectTimeout(TIMEOUT_MS);
            connection.setReadTimeout(TIMEOUT_MS);
            connection.setRequestProperty("Accept", "*/*");
            connection.setRequestProperty("Connection", "Keep-Alive");
            connection.setRequestProperty("User-Agent", USER_AGENT);
            connection.setRequestProperty("Content-Type", "application/json;charset=utf-8");
            if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
                return null;
            }
            StringBuilder body = new StringBuilder();
            // Decode explicitly as UTF-8: the platform default (e.g. GBK) would garble Chinese names.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    body.append(line);
                }
            }
            return body.toString();
        } finally {
            connection.disconnect();
        }
    }

    /** Returns whether {@code value} is non-empty and parses as a URL. */
    private static boolean isUrl(String value) {
        if (value == null || value.isEmpty()) {
            return false;
        }
        try {
            new URL(value);
            return true;
        } catch (MalformedURLException e) {
            return false;
        }
    }

    /**
     * Starting from the address after {@code lastImageUrl}, tries successive
     * addresses until one answers with HTTP 200.
     *
     * @param lastImageUrl address of the previous skin image, may be {@code null}
     * @return the first address that exists, or {@code null} when there is no
     *         previous address, it has no trailing number, or none of the
     *         {@link #MAX_PROBES} candidates exists
     */
    private static String probeNextImageUrl(String lastImageUrl) {
        if (lastImageUrl == null) {
            return null;
        }
        String candidate = nextImageUrl(lastImageUrl);
        for (int attempt = 0; candidate != null && attempt < MAX_PROBES; attempt++) {
            if (imageExists(candidate)) {
                return candidate;
            }
            candidate = nextImageUrl(candidate);
        }
        return null;
    }

    /**
     * Increments the run of digits that directly precedes the file extension,
     * e.g. {@code .../big2007.jpg} becomes {@code .../big2008.jpg}. Only that
     * run is changed, and its width is kept by zero-padding.
     *
     * @param url image address
     * @return the address with the number incremented, or {@code null} when the
     *         address has no extension or no digits in front of it
     */
    private static String nextImageUrl(String url) {
        int dot = url.lastIndexOf('.');
        int start = dot;
        while (start > 0 && url.charAt(start - 1) >= '0' && url.charAt(start - 1) <= '9') {
            start--;
        }
        // No extension, no digits, or too many digits to fit in a long
        if (dot <= 0 || start == dot || dot - start > 18) {
            return null;
        }
        String digits = url.substring(start, dot);
        long next = Long.parseLong(digits) + 1;
        String incremented = String.format("%0" + digits.length() + "d", next);
        return url.substring(0, start) + incremented + url.substring(dot);
    }

    /** Returns whether a GET request to {@code address} is answered with HTTP 200. */
    private static boolean imageExists(String address) {
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(address).openConnection();
            connection.setRequestMethod("GET");
            connection.setConnectTimeout(TIMEOUT_MS);
            connection.setReadTimeout(TIMEOUT_MS);
            connection.setRequestProperty("Accept", "*/*");
            return connection.getResponseCode() == HttpURLConnection.HTTP_OK;
        } catch (IOException e) {
            return false;
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Downloads one skin image into the hero's directory, creating the
     * directory when needed. Streams are closed even when the transfer fails.
     *
     * @param imageUrl address of the image
     * @param heroName hero name, used as the directory name
     * @param skinName skin name, used as the file name
     * @throws IOException if the format is not jpg/png/gif, the directory cannot
     *                     be created, or the transfer fails
     */
    private static void saveImage(String imageUrl, String heroName, String skinName) throws IOException {
        String extension = extensionOf(imageUrl);
        if (extension == null) {
            throw new IOException("无法识别的图片格式:" + imageUrl);
        }
        File directory = new File(OUTPUT_ROOT + toFileName(heroName));
        // mkdirs also creates G:\LOL itself when it is missing
        if (!directory.exists() && !directory.mkdirs()) {
            throw new IOException("无法创建文件夹:" + directory);
        }
        URLConnection urlConnection = new URL(imageUrl).openConnection();
        urlConnection.setConnectTimeout(TIMEOUT_MS);
        urlConnection.setReadTimeout(TIMEOUT_MS);
        File destination = new File(directory, toFileName(skinName) + extension);
        try (InputStream in = urlConnection.getInputStream();
             OutputStream out = new FileOutputStream(destination)) {
            byte[] buffer = new byte[8192];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        }
    }

    /**
     * Chooses a file extension from the image address: {@code ".jpg"},
     * {@code ".png"} or {@code ".gif"} depending on which marker the address
     * contains (checked in that order), or {@code null} when it contains none.
     */
    private static String extensionOf(String imageUrl) {
        if (imageUrl.contains("jpg")) {
            return ".jpg";
        }
        if (imageUrl.contains("png")) {
            return ".png";
        }
        if (imageUrl.contains("gif")) {
            return ".gif";
        }
        return null;
    }

    /**
     * Replaces characters that Windows does not allow in file names with
     * {@code _}, so that a name containing e.g. {@code /} is not treated as a path.
     */
    private static String toFileName(String name) {
        return String.valueOf(name).replaceAll("[\\\\/:*?\"<>|]", "_");
    }
}
效果展示:
期间问题
拿狂战士举例,官网公布的皮肤有9个,但最近几年新出的皮肤信息没有写在 js 文件中;而 js 文件中的皮肤条目却有24个,我看了一下,多出来的是屠龙勇士的炫彩皮肤,但它们对应的图片URL是空的,没有信息。
不过问题不大:在代码中加一个判断,如果某个皮肤没有给出可用的图片URL,由于图片地址有固定的格式,只需取上一个皮肤地址中会变化的部分(末尾的数字编号),对其不断加一并逐个尝试请求,直到找到能够正常响应的真实地址即可,如下所示:
mainImg = temp.getString("mainImg"); // skin image address
// Build the URL the image is downloaded from
try {
    target = new URL(mainImg);
} catch (MalformedURLException e) {
    // mainImg is empty: this skin's address is not stored in the js file, so it is
    // guessed by incrementing the four digits in front of the extension of the
    // previous skin's address until the server answers with 200.
    System.out.println(name);
    int imgLength = oldMainImg.length();
    int a = Integer.parseInt(oldMainImg.substring(imgLength - 8, imgLength - 4)); // the four digits before ".xxx"
    mainImg = oldMainImg.replace(Integer.toString(a), Integer.toString(++a));
    HttpURLConnection httpURLConnection2 = null;
    try {
        boolean hh = true; // true while no existing address has been found
        while (hh) {
            URL url2 = new URL(mainImg);
            httpURLConnection2 = (HttpURLConnection) url2.openConnection();
            httpURLConnection2.setRequestMethod("GET");
            httpURLConnection2.setRequestProperty("Accept", "*/*"); // accept any resource type
            httpURLConnection2.setDoInput(true); // the response is read from the connection
            httpURLConnection2.connect();
            // Check the probe connection itself; the outer connection always reports 200 here.
            int responseCode2 = httpURLConnection2.getResponseCode();
            if (responseCode2 != 200) { // this guess does not exist, try the next number
                httpURLConnection2.disconnect(); // release this probe before opening the next one
                a = Integer.parseInt(mainImg.substring(imgLength - 8, imgLength - 4)); // the four digits before ".xxx"
                mainImg = mainImg.replace(Integer.toString(a), Integer.toString(++a));
            } else {
                // Download from the address just found instead of the previous skin's address.
                target = url2;
                hh = false;
            }
        }
    } catch (Exception e2) {
        e2.printStackTrace();
    } finally {
        // Step 7: disconnect and release resources
        if (null != httpURLConnection2) {
            try {
                httpURLConnection2.disconnect();
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
    }
}
End
仅供学习研究,了解至此,记录留存~