案例功能效果图
爬取数据的平台页面
这个案例能爬取的平台太多了,我没有全部截图出来,想看的你们自己下载源码自己跑起来!
爬取的热榜数据效果图
环境介绍
前端:vue+h5
后端:springboot+webMagic
jdk:1.8及以上
数据库:mysql
完整源码获取方式
源码获取方式:点击这里,暗号博客园!
核心代码介绍
pom.xml
<!-- https://mvnrepository.com/artifact/us.codecraft/webmagic-core -->
<!-- Crawler core; the slf4j-log4j12 binding it pulls in is excluded here. -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>
<!-- https://mvnrepository.com/artifact/us.codecraft/webmagic-extension -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>18.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.projectlombok/lombok (boilerplate-reduction tool) -->
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.8</version>
    <scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/junit/junit -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
    <scope>test</scope>
</dependency>
<!-- swagger2: API description generator plus its browser UI (both pinned to the same version) -->
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger2</artifactId>
    <version>2.9.1</version>
</dependency>
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger-ui</artifactId>
    <version>2.9.1</version>
</dependency>
application.yml
# HTTP port the Spring Boot application listens on.
server:
  port: 9004

spring:
  jackson:
    serialization:
      # Dates are serialized as numeric timestamps; the Vue pages divide createDate by 1000.
      write-dates-as-timestamps: true
  datasource:
    driverClassName: com.mysql.cj.jdbc.Driver
    # Host, schema and credentials below are placeholders; replace them for your environment.
    url: jdbc:mysql://feimeidehuoji:3306/feimeidehuoji?autoReconnect=true&useUnicode=true&characterEncoding=UTF-8&useSSL=false&useLegacyDatetimeCode=false&serverTimezone=UTC
    username: feimeidehuoji
    password: feimeidehuoji
  jpa:
    database: MySQL
    show-sql: true
    hibernate:
      ddl-auto: update
    database-platform: org.hibernate.dialect.MySQL5InnoDBDialect

# Custom keys injected into NodeController via @Value.
# Start page handed to the spider.
spiderUrl: https://tophub.today
# HTTP proxy the spider's downloader is routed through.
proxyUrl: 61.160.210.234
proxyPort: 808
NodeController.java
package cn.cesi.webMagic.webMagic;
import cn.cesi.webMagic.pieline.SpringPieline;
import cn.cesi.webMagic.pojo.Node;
import cn.cesi.webMagic.service.NodeService;
import cn.cesi.webMagic.util.Result;
import cn.cesi.webMagic.util.StatusCode;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import io.swagger.annotations.ApiParam;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.domain.Page;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.web.bind.annotation.CrossOrigin;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import javax.annotation.Resource;
import java.util.List;
import java.util.Map;
/**
 * REST endpoints for reading crawled hot-list entries, plus the scheduled job
 * that starts a crawl of the configured start page.
 */
@RestController
@CrossOrigin
@RequestMapping("/node")
// NOTE(review): the tag text says "user login API", which looks copied from another
// controller. It is left unchanged because Swagger UI groups endpoints by this value.
@Api(value = "获取数据接口",tags={"用户登录接口"})
public class NodeController {

    /** Start page handed to the spider. */
    @Value("${spiderUrl}")
    private String url;

    /** Host of the HTTP proxy used by the spider's downloader. */
    @Value("${proxyUrl}")
    private String proxyUrl;

    /** Port of the HTTP proxy used by the spider's downloader. */
    @Value("${proxyPort}")
    private Integer proxyPort;

    @Resource
    NodeService nodeService;

    @Autowired
    SpringPieline springPieline;

    /**
     * Returns one page of stored entries, filtered by the given criteria.
     *
     * @param typeName    category name to filter by (optional)
     * @param secondTitle second-level title to search for (optional)
     * @param page        page index (optional)
     * @param size        page size (optional)
     * @return a success {@link Result} whose data is the page returned by the service
     */
    @RequestMapping("")
    @ApiOperation(value = "查询数据接口")
    public Result getData(
            @ApiParam(value = "分类名称", required = false) String typeName,
            // The description used to repeat the category text; this parameter is the second-level title.
            @ApiParam(value = "二级标题", required = false) String secondTitle,
            @ApiParam(value = "当前页", required = false) Integer page,
            @ApiParam(value = "每页数据条数", required = false) Integer size) {
        Page<Node> nodes = nodeService.searchData(typeName, secondTitle, page, size);
        return success(nodes);
    }

    /**
     * Returns the list of all categories known to the service.
     *
     * @return a success {@link Result} whose data is the category list
     */
    @RequestMapping("/getType")
    @ApiOperation(value = "查询全部分类列表")
    public Result getData() {
        List<Map<String, String>> list = nodeService.findType();
        return success(list);
    }

    /**
     * Runs one crawl of the configured start page through the configured proxy and
     * hands the extracted items to {@link SpringPieline}.
     *
     * <p>{@code fixedDelay} is 480000 ms (1000 * 60 * 8), so the next run starts
     * 8 minutes after the previous one has finished. {@code Spider.run()} is called
     * on the scheduler thread, so this method returns only when the crawl is done.
     */
    @Scheduled(fixedDelay = 480000)
    public void tasks() {
        System.out.println("定时任务开始——————————————————————————————————");
        // Route all downloads through the configured proxy server.
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
        httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy(proxyUrl, proxyPort)));
        Spider.create(new WebProcess())
                .addUrl(url)
                .setDownloader(httpClientDownloader)
                .thread(2) // number of crawler threads
                .addPipeline(springPieline) // persists the extracted items
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000 * 10)))
                .run();
        System.out.println("定时任务结束——————————————————————————————————");
    }

    /** Builds the success envelope shared by both query endpoints. */
    private static Result success(Object data) {
        Result result = new Result();
        result.setFlag(true);
        result.setCode(StatusCode.OK);
        result.setMsg("查询成功!");
        result.setData(data);
        return result;
    }
}
WebProcess.java
package cn.cesi.webMagic.webMagic;
import cn.cesi.webMagic.pieline.SpringPieline;
import cn.cesi.webMagic.util.NodeEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;
import org.jsoup.select.Elements;
import java.util.*;
/**
 * WebMagic page processor that extracts the hot-list groups from the crawled page.
 *
 * <p>Each matched group yields one {@link NodeEntity} carrying the group title, its
 * logo URL and a list of entries (link, second-level title, heat text). The list of
 * entities is published on the page under the field name {@code "nodes"}.
 */
@Component
public class WebProcess implements PageProcessor {

    // Crawl settings returned by getSite().
    // NOTE(review): setSleepTime is given 2; if the unit is milliseconds this is a
    // 2 ms pause, which is unlikely to help against anti-crawling limits. Confirm the intended value.
    private final Site site = Site.me()
            .setSleepTime(2)          // pause between requests
            .setRetryTimes(3)         // number of retries
            .setRetrySleepTime(10000) // pause between retries
            .setTimeOut(60000)        // request timeout: 1000 * 60
            .setCharset("utf8");

    /**
     * Parses the page and binds the extracted groups to it.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // Debug output kept from the original: dumps the whole document.
        System.out.println(page.getHtml());

        // Every div nested under a .cc-cd card inside the div with id "Sortable".
        List<Selectable> groups = page.getHtml().css("div.bc div#Sortable div.cc-cd div").nodes();
        List<NodeEntity> nodes = new ArrayList<>();
        for (Selectable group : groups) {
            // Group title.
            String title = textOf(group.css("div.cc-cd-ih div a div span"));

            // Group logo: the src attribute of the header image, or "" when there is none.
            String logoHtml = group.css("div.cc-cd-ih div a div img").toString();
            String logoSrc = "";
            if (logoHtml != null) {
                logoSrc = Jsoup.parse(logoHtml).select("img[src]").attr("src");
            }

            // Second-level entries of this group.
            List<Map<String, String>> maps = new ArrayList<>();
            for (Selectable entry : group.css("div.cc-cd-cb div a").nodes()) {
                Map<String, String> map = new HashMap<>();
                map.put("url", entry.links().toString());
                // textOf() yields "" when the span is missing, so an incomplete row no
                // longer throws and aborts processing of the whole page.
                map.put("secondTitle", textOf(entry.css("div span.t")));
                map.put("hot", textOf(entry.css("div span.e")));
                maps.add(map);
            }

            NodeEntity node = new NodeEntity();
            node.setTitle(title);
            node.setLogo(logoSrc);
            node.setMaps(maps);
            nodes.add(node);
        }
        // Publish the result for the pipeline.
        page.putField("nodes", nodes);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Returns the plain text of the selection, or "" when nothing was selected.
     * The null check is on the extracted HTML string, the same check the original
     * code applied to the title and logo.
     */
    private static String textOf(Selectable selection) {
        String html = selection.toString();
        return html == null ? "" : Jsoup.parse(html).text();
    }
}
SpringPieline.java
package cn.cesi.webMagic.pieline;
import cn.cesi.webMagic.pojo.Node;
import cn.cesi.webMagic.service.NodeService;
import cn.cesi.webMagic.util.IdWorker;
import cn.cesi.webMagic.util.NodeEntity;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.*;
//存入数据库
/**
 * WebMagic pipeline that stores crawled hot-list entries in the database.
 *
 * <p>An entry is saved only when both its group title and its second-level title are
 * non-empty and no row with the same pair already exists.
 */
@Component
public class SpringPieline implements Pipeline {

    @Autowired
    NodeService nodeService;

    @Autowired
    IdWorker idWorker;

    /**
     * Persists every new entry found under the {@code "nodes"} field of the result.
     *
     * @param resultItems items produced by the page processor
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<NodeEntity> nodes = resultItems.get("nodes");
        if (nodes == null) {
            // The page produced no "nodes" field: nothing to store.
            return;
        }
        for (NodeEntity entity : nodes) {
            String title = entity.getTitle();
            List<Map<String, String>> entries = entity.getMaps();
            if (title == null || title.isEmpty() || entries == null) {
                continue;
            }
            for (Map<String, String> entry : entries) {
                // Failures are isolated per entry so one bad row does not discard the rest of the batch.
                try {
                    saveIfAbsent(title, entity.getLogo(), entry);
                } catch (Exception e) {
                    System.out.println("save failed for [" + title + "] " + entry.get("secondTitle") + ": " + e);
                }
            }
        }
    }

    /** Saves one entry unless its second-level title is blank or the (title, secondTitle) pair is already stored. */
    private void saveIfAbsent(String title, String logo, Map<String, String> entry) {
        String rawSecondTitle = entry.get("secondTitle");
        if (rawSecondTitle == null) {
            return;
        }
        String secondTitle = rawSecondTitle.trim();
        if (secondTitle.isEmpty()) {
            return;
        }
        List<Node> existing = nodeService.findByTitleAndSecondTitle(title, secondTitle);
        if (!existing.isEmpty()) {
            return;
        }
        // A fresh instance per row; the id is generated only for rows that are actually saved.
        Node node = new Node();
        node.setId(idWorker.nextId() + "");
        node.setTitle(title);
        node.setLogo(logo);
        node.setSecondTitle(secondTitle);
        node.setUrl(entry.get("url"));
        node.setCreateDate(new Date());
        node.setHot(entry.get("hot"));
        nodeService.save(node);
    }
}
index.vue
<template>
  <div class="tab__content">
    <h1 class="page__title">摸鱼热榜</h1>
    <van-search
      v-model="value"
      placeholder="请输入搜索关键词"
      @search="onSearch"
      @clear="onClear"
    />
    <!-- Category list: shown until a search has returned -->
    <div v-if="!searched">
      <div class="tab__tips">
        仿今日热榜!,关注java项目开发,学习更多案例!
      </div>
      <div class="cells-block">
        <div>
          <div class="cells__title">全部热榜</div>
          <div class="cells">
            <div
              v-for="(item, index) in typeList"
              :key="index"
              class="cell-row"
            >
              <div class="cell" @click="goDateils(item)">
                <div class="cell__hd">
                  <img
                    :src="item.logo"
                    :alt="item.title"
                    @error="imgError(item)"
                  />
                </div>
                <div class="cell__bd">{{ item.title }}</div>
                <div class="cell__ft">
                  <svg-icon
                    iconClass="index_right"
                    className="icon_search"
                  ></svg-icon>
                </div>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
    <!-- Search results, or the empty state when the search matched nothing -->
    <div v-else>
      <search-list v-if="listData.length" :list="listData" />
      <van-empty v-else description="暂无相关内容!" />
    </div>
  </div>
</template>
<script>
import SvgIcon from '@/components/icon/SvgIcon';
import searchList from '@/components/searchList/list';
export default {
  components: {
    SvgIcon,
    searchList
  },
  data() {
    return {
      value: '', // search keyword
      searched: false, // true once a search has returned; selects results vs. category list
      listData: [], // search results
      typeList: [], // all hot-list categories
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png' // fallback category image
    };
  },
  computed: {},
  watch: {
    // Emptying the input by hand never fires @clear, so return to the category list here too.
    value(keyword) {
      if (!keyword) {
        this.onClear();
      }
    }
  },
  created() {
    this.getAllType();
  },
  mounted() {},
  methods: {
    // Load every hot-list category.
    getAllType() {
      this.$api.getAllType().then(res => {
        if (res.code === 0) {
          this.typeList = res.data;
        }
      });
    },
    // Open the detail page of a category; the category travels as a JSON query string.
    goDateils(item) {
      this.$router.push({
        name: 'details',
        query: {
          item: JSON.stringify(item)
        }
      });
    },
    // Search across all categories by second-level title.
    onSearch(e) {
      const params = {
        typeName: '全部',
        size: 10000,
        secondTitle: e
      };
      this.$api.getAllInfoGzip(params).then(res => {
        if (res.code == 0) {
          const list = res.data.content;
          // Add the derived fields before assigning, so Vue observes them with the rest of the item.
          this.handleData(list);
          this.listData = list;
          this.searched = true;
        }
      });
    },
    // Search box cleared: drop the results and show the category list again.
    onClear() {
      this.listData = [];
      this.searched = false;
    },
    // Derive the "new" flag and the display time of every entry.
    handleData(data) {
      data.forEach(item => {
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new; // whether the entry counts as new
        item.CreateTime = diff.Time;
      });
    },
    // Image failed to load: fall back to the default image.
    imgError(item) {
      item.logo = this.defaultUrl;
    }
  }
};
</script>
details.vue
<template>
  <div class="topic-list">
    <!-- Header: category logo (used as background and as icon) and title -->
    <div class="info-top">
      <img class="info-bg" :src="details.logo" @error="imgError" alt="" />
      <div class="info-content">
        <div class="top-column">
          <!-- Navigates back to the root route -->
          <p @click="$router.push('/')">摸鱼热榜</p>
        </div>
        <img class="pic-icon" :src="details.logo" @error="imgError" alt="" />
        <h1 class="info-title">{{ details.title }}</h1>
      </div>
    </div>
    <!-- Entry list: pull down to refresh, scroll to load the next page -->
    <div class="divider">
      <van-pull-refresh v-model="refreshing" @refresh="onRefresh">
        <van-list
          v-model="loading"
          :finished="finished"
          @load="onLoad"
          :immediate-check="false"
        >
          <div class="panel_bd">
            <!-- One link per entry, pointing at the entry's own URL -->
            <a
              v-for="(item, index) in listData"
              :key="item.id"
              :href="item.url"
              class="media-box van-hairline--bottom"
            >
              <div class="media-box__bd">
                <h4 class="media-box__title">
                  {{ index + 1 }}、{{ item.secondTitle }}
                </h4>
                <div class="dec-row">
                  <!-- Heat text, shown only when present -->
                  <span class="tag" v-if="item.hot">
                    <span>{{ item.hot }}</span>
                  </span>
                  <!-- CreateTime and new are filled in by handleData() -->
                  <span class="time">
                    <span>{{ item.CreateTime }}</span>
                  </span>
                  <span class="new" v-if="item.new">新</span>
                </div>
              </div>
            </a>
          </div>
        </van-list>
      </van-pull-refresh>
    </div>
    <!-- End-of-list marker, shown once every page has been loaded -->
    <div class="footer-flag flex-center" v-if="finished">
      <p class="flex-center">我是有底线的</p>
    </div>
  </div>
</template>
<script>
export default {
  data() {
    return {
      page: 1, // current page number
      refreshing: false, // pull-to-refresh in progress
      loading: false, // load-more in progress
      finished: false, // no more pages to load
      listData: [], // entries shown in the list
      details: {}, // category being displayed
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png' // fallback category image
    };
  },
  computed: {},
  created() {},
  mounted() {
    // The category arrives as a JSON string in the route query (see goDateils on the index page).
    this.details = JSON.parse(this.$route.query.item);
    this.getList(this.details, this.page);
  },
  methods: {
    // Fetch one page of entries for the category and append it to the list.
    getList(item, page, loading = true) {
      const params = {
        typeName: item.title,
        size: 50,
        page
      };
      this.$api.getAllInfoGzip(params, loading).then(res => {
        // Clear the request flags whatever the response code is; otherwise a
        // failed response leaves the list stuck in its loading state.
        this.loading = false;
        this.refreshing = false;
        if (res.code == 0) {
          const merged = this.listData.concat(res.data.content);
          // Add the derived fields before assigning, so Vue observes them with the rest of the item.
          this.handleData(merged);
          this.listData = merged;
          // Last page reached.
          if (this.page >= res.data.totalPages) {
            this.finished = true;
          }
        }
      });
    },
    // Scroll reached the bottom: load the next page.
    onLoad() {
      this.loading = true;
      this.getList(this.details, ++this.page, false);
    },
    // Pull-to-refresh: reset the state and reload from the first page.
    onRefresh() {
      this.finished = false;
      this.loading = true;
      this.listData = [];
      this.page = 1;
      this.getList(this.details, 1, false);
    },
    // Derive the "new" flag and the display time of every entry.
    handleData(data) {
      data.forEach(item => {
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new; // whether the entry counts as new
        item.CreateTime = diff.Time;
      });
    },
    // Image failed to load: fall back to the default image.
    // The template binds details.logo, so that is the field to replace.
    imgError() {
      this.details.logo = this.defaultUrl;
    }
  }
};
</script>
xxx.sql
-- Schema script for the `node` table, which holds one row per crawled hot-list entry.
-- NOTE: the DROP TABLE below discards any existing `node` table together with its rows.
SET NAMES utf8mb4;
-- Foreign-key checks are switched off; NOTE(review): nothing in this script switches them back on.
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for node
-- ----------------------------
DROP TABLE IF EXISTS `node`;
CREATE TABLE `node` (
  `id` varchar(255) NOT NULL,               -- id assigned by the pipeline (idWorker.nextId() as text)
  `create_date` datetime DEFAULT NULL,      -- time the row was created by the pipeline
  `hot` varchar(1024) DEFAULT NULL,         -- heat text scraped next to the entry
  `second_title` longtext,                  -- entry (second-level) title
  `title` varchar(1024) DEFAULT NULL,       -- group / category title
  `url` longtext,                           -- link of the entry
  `logo` varchar(1024) DEFAULT NULL,        -- logo URL of the group
  PRIMARY KEY (`id`)
  -- NOTE(review): the utf8mb4_0900_ai_ci collation presumably needs MySQL 8.0 or later; confirm for your server.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
作者:Java开发项目
链接:https://mp.weixin.qq.com/s/z9J1gL7orSL90ngSQeRRhg
欢迎大家关注:有故事的程序员,每天更新Java技术知识点,还可以领取Java进阶学习资料哦~
资料包含的模块分为19个模块,分别是: Java 基础、容器、多线程、反射、对象拷贝、Java Web 、异常、网络、设计模式、Spring/Spring MVC、Spring Boot/Spring Cloud、Hibernate、MyBatis、RabbitMQ、Kafka、Zookeeper、MySQL、Redis、JVM 。