spring boot+vue实现爬取各大平台每日热榜数据功能

案例功能效果图
爬取数据的平台页面
spring boot+vue实现爬取各大平台每日热榜数据功能

这个案例能爬取的平台太多了,我没有全部截图出来,想看的你们自己下载源码自己跑起来!
爬取的热榜数据效果图
spring boot+vue实现爬取各大平台每日热榜数据功能

环境介绍
前端:vue+h5
后端:springboot+webMagic
jdk:1.8及以上
数据库:mysql

完整源码获取方式
源码获取方式:点击这里,暗号博客园!

核心代码介绍
pom.xml

<!-- https://mvnrepository.com/artifact/us.codecraft/webmagic-core -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <!-- https://mvnrepository.com/artifact/us.codecraft/webmagic-extension -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>18.0</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.4</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.4</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.projectlombok/lombok (Lombok: generates boilerplate code at compile time) -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.8</version>
            <scope>provided</scope>
        </dependency>

        <!-- https://mvnrepository.com/artifact/junit/junit -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>

        <!-- swagger2 -->
        <dependency>
            <groupId>io.springfox</groupId>
            <artifactId>springfox-swagger2</artifactId>
            <version>2.9.1</version>
        </dependency>

        <dependency>
            <groupId>io.springfox</groupId>
            <artifactId>springfox-swagger-ui</artifactId>
            <version>2.9.1</version>
        </dependency>

application.yml

# Application configuration for the hot-list crawler backend.
server:
  # HTTP port the application listens on.
  port: 9004
spring:
  jackson:
    serialization:
      # Dates are serialized as numeric timestamps; the Vue pages divide createDate by 1000.
      write-dates-as-timestamps: true
  datasource:
    driverClassName: com.mysql.cj.jdbc.Driver
    # NOTE(review): host, schema, username and password all look like placeholders — replace per environment.
    url: jdbc:mysql://feimeidehuoji:3306/feimeidehuoji?autoReconnect=true&useUnicode=true&characterEncoding=UTF-8&useSSL=false&useLegacyDatetimeCode=false&serverTimezone=UTC
    username: feimeidehuoji
    password: feimeidehuoji
  jpa:
    database: MySQL
    # Prints every generated SQL statement.
    show-sql: true
    hibernate:
      # Hibernate updates the schema from the entity mappings on startup.
      ddl-auto: update
    database-platform: org.hibernate.dialect.MySQL5InnoDBDialect
# Custom properties injected into NodeController via @Value:
# start URL of the crawl, and host/port of the HTTP proxy used by the downloader.
spiderUrl: https://tophub.today
proxyUrl: 61.160.210.234
proxyPort: 808

NodeController.java

package cn.cesi.webMagic.webMagic;
import cn.cesi.webMagic.pieline.SpringPieline;
import cn.cesi.webMagic.pojo.Node;
import cn.cesi.webMagic.service.NodeService;
import cn.cesi.webMagic.util.Result;
import cn.cesi.webMagic.util.StatusCode;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import io.swagger.annotations.ApiParam;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.domain.Page;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.web.bind.annotation.CrossOrigin;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import javax.annotation.Resource;
import java.util.List;
import java.util.Map;

/**
 * REST controller exposing the crawled hot-list data under {@code /node}, and host of the
 * scheduled job that runs the WebMagic spider.
 */
@RestController
@CrossOrigin
@RequestMapping("/node")
// NOTE(review): the tag below reads "user login API" although this controller only serves
// hot-list data. Left unchanged because Swagger UI groups endpoints by tag — confirm and rename.
@Api(value = "获取数据接口",tags={"用户登录接口"})
public class NodeController {

    /** Start URL of the crawl, read from the {@code spiderUrl} property. */
    @Value("${spiderUrl}")
    private String url;

    /** Proxy host used by the downloader, read from the {@code proxyUrl} property. */
    @Value("${proxyUrl}")
    private String proxyUrl;

    /** Proxy port used by the downloader, read from the {@code proxyPort} property. */
    @Value("${proxyPort}")
    private Integer proxyPort;

    @Resource
    NodeService nodeService;

    @Autowired
    SpringPieline springPieline;

    /**
     * Searches stored entries.
     *
     * <p>All four parameters are optional and are handed to
     * {@code nodeService.searchData} unchanged, so any of them may be {@code null}.
     *
     * @param typeName    category name to filter by
     * @param secondTitle entry title to filter by
     * @param page        page index
     * @param size        page size
     * @return a success {@link Result} wrapping the page returned by the service
     */
    @RequestMapping("")
    @ApiOperation(value = "查询数据接口")
    public Result getData(
            @ApiParam(value = "分类名称", required = false) String typeName
            ,@ApiParam(value = "二级标题", required = false) String secondTitle
            ,@ApiParam(value = "当前页", required = false)Integer page
            ,@ApiParam(value = "每页数据条数", required = false)Integer size){
        return success(nodeService.searchData(typeName, secondTitle, page, size));
    }

    /**
     * Lists all categories.
     *
     * @return a success {@link Result} wrapping the list returned by {@code nodeService.findType()}
     */
    @RequestMapping("/getType")
    @ApiOperation(value = "查询全部分类列表")
    public Result getData(){
        List<Map<String,String>> list = nodeService.findType();
        return success(list);
    }

    /**
     * Runs one crawl of {@link #url} through the configured proxy and stores the results via
     * {@link SpringPieline}.
     *
     * <p>{@code fixedDelay = 480000} ms is 1000 * 60 * 8, i.e. the next run starts 8 minutes
     * after the previous one has finished. {@code Spider.run()} is called on the scheduler
     * thread, so this method returns only when the crawl is done.
     */
    @Scheduled(fixedDelay = 480000)
    public void tasks(){
        System.out.println("定时任务开始——————————————————————————————————");
        // Route all requests through the configured proxy server.
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
        httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy(proxyUrl,proxyPort)));
        Spider.create(new WebProcess())
                .addUrl(url)
                .setDownloader(httpClientDownloader)
                .thread(2)  // number of crawler threads
                .addPipeline(springPieline) // persists the extracted items
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000*10)))
                .run();

        System.out.println("定时任务结束——————————————————————————————————");
    }

    /** Builds the success envelope shared by both query endpoints. */
    private static Result success(Object data) {
        Result result = new Result();
        result.setFlag(true);
        result.setCode(StatusCode.OK);
        result.setMsg("查询成功!");
        result.setData(data);
        return result;
    }
}

WebProcess.java

package cn.cesi.webMagic.webMagic;
import cn.cesi.webMagic.pieline.SpringPieline;
import cn.cesi.webMagic.util.NodeEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;

import org.jsoup.select.Elements;
import java.util.*;

/**
 * WebMagic page processor that extracts the hot-list cards from the crawled page.
 *
 * <p>For every matched card it collects the card title, the logo URL and the list of entries
 * (link, entry title, popularity text), and publishes them as a {@code List<NodeEntity>} under
 * the result field {@code "nodes"}.
 */
@Component
public class WebProcess implements PageProcessor {

    private final Site site = Site.me()
            // NOTE(review): WebMagic's sleep time is in milliseconds, so this is 2 ms between
            // requests — confirm that 2000 was not intended.
            .setSleepTime(2)
            .setRetryTimes(3)           // number of retries
            .setRetrySleepTime(10000)   // wait between retries, in ms
            .setTimeOut(60000)          // request timeout: 1000 * 60 = 1 minute
            .setCharset("utf8");

    /**
     * Extracts all cards from {@code page} and stores them in the page's result items.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        System.out.println(page.getHtml());
        // Each matched div is treated as one card candidate.
        List<Selectable> cards = page.getHtml().css("div.bc div#Sortable div.cc-cd div").nodes();

        List<NodeEntity> nodes = new ArrayList<>();
        for (Selectable card : cards) {
            String title = textOf(card.css("div.cc-cd-ih div a div span"));
            String logoSrc = logoSrcOf(card);

            // Entries listed inside the card.
            List<Map<String,String>> maps = new ArrayList<>();
            for (Selectable entry : card.css("div.cc-cd-cb div a").nodes()) {
                Map<String,String> map = new HashMap<>();
                map.put("url", entry.links().toString());
                map.put("secondTitle", textOf(entry.css("div span.t")));
                map.put("hot", textOf(entry.css("div span.e")));
                maps.add(map);
            }

            NodeEntity node = new NodeEntity();
            node.setTitle(title);
            node.setLogo(logoSrc);
            node.setMaps(maps);
            nodes.add(node);
        }

        // Hand the extracted cards to the pipelines.
        page.putField("nodes", nodes);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Returns the plain text of the first element matched by {@code selection}, or {@code ""}
     * when nothing matched.
     *
     * <p>{@code css()} always returns a {@link Selectable}; it is its {@code toString()} that is
     * {@code null} for an empty match, and passing that to {@code Jsoup.parse} throws. The null
     * check therefore has to be made on the string.
     */
    private static String textOf(Selectable selection) {
        String html = selection.toString();
        return html == null ? "" : Jsoup.parse(html).text();
    }

    /** Returns the {@code src} of the card's logo image, or {@code ""} when there is none. */
    private static String logoSrcOf(Selectable card) {
        String imgHtml = card.css("div.cc-cd-ih div a div img").toString();
        if (imgHtml == null) {
            return "";
        }
        Document document = Jsoup.parse(imgHtml);
        Elements imgTags = document.select("img[src]");
        return imgTags.attr("src");
    }
}

SpringPieline.java

package cn.cesi.webMagic.pieline;
import cn.cesi.webMagic.pojo.Node;
import cn.cesi.webMagic.service.NodeService;
import cn.cesi.webMagic.util.IdWorker;
import cn.cesi.webMagic.util.NodeEntity;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.util.*;

/**
 * WebMagic pipeline that stores the crawled entries in the database.
 *
 * <p>Reads the {@code "nodes"} result field and saves one {@link Node} row per entry, skipping
 * entries whose title or entry title is empty and entries for which
 * {@code nodeService.findByTitleAndSecondTitle} already returns a row.
 */
@Component
public class SpringPieline implements Pipeline {
    @Autowired
    NodeService nodeService;

    @Autowired
    IdWorker idWorker;

    /**
     * Persists every new entry found in {@code resultItems}.
     *
     * <p>A failure while handling one entry is printed and does not stop the remaining entries
     * from being processed.
     *
     * @param resultItems items produced by the page processor
     * @param task        the running spider task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<NodeEntity> nodes = resultItems.get("nodes");
        if (nodes == null) {
            // Nothing was extracted from this page.
            return;
        }
        for (NodeEntity entity : nodes) {
            if (entity == null || entity.getMaps() == null) {
                continue;
            }
            for (Map<String,String> row : entity.getMaps()) {
                try {
                    saveIfAbsent(entity.getTitle(), entity.getLogo(), row);
                } catch (Exception e) {
                    System.out.println(e);
                }
            }
        }
    }

    /**
     * Saves one entry unless it is blank or already stored.
     *
     * <p>A fresh {@link Node} is created for every saved row, so no state is shared between
     * rows, and an id is only generated when the row is actually saved.
     */
    private void saveIfAbsent(String title, String logo, Map<String,String> row) throws Exception {
        String rawSecondTitle = row.get("secondTitle");
        String secondTitle = rawSecondTitle == null ? "" : rawSecondTitle.trim();
        System.out.println(secondTitle);
        if (title == null || title.isEmpty() || secondTitle.isEmpty()) {
            return;
        }
        if (!nodeService.findByTitleAndSecondTitle(title, secondTitle).isEmpty()) {
            return;
        }

        Node node = new Node();
        node.setId(String.valueOf(idWorker.nextId()));
        node.setTitle(title);
        node.setLogo(logo);
        node.setSecondTitle(secondTitle);
        node.setUrl(row.get("url"));
        node.setCreateDate(new Date());
        node.setHot(row.get("hot"));
        nodeService.save(node);
    }
}

index.vue

<template>
  <div class="tab__content">
    <h1 class="page__title">摸鱼热榜</h1>
    <van-search
      v-model="value"
      placeholder="请输入搜索关键词"
      @search="onSearch"
      @clear="onClear"
    />

    <!-- Category list: shown until a search has completed -->
    <div v-if="!searched">
      <div class="tab__tips">
        仿今日热榜!,关注java项目开发,学习更多案例!
      </div>
      <div class="cells-block">
        <div>
          <div class="cells__title">全部热榜</div>
          <div class="cells">
            <div
              v-for="(item, index) in typeList"
              :key="index"
              class="cell-row"
            >
              <div class="cell" @click="goDateils(item)">
                <div class="cell__hd">
                  <img
                    :src="item.logo"
                    :alt="item.title"
                    @error="imgError(item)"
                  />
                </div>
                <div class="cell__bd">{{ item.title }}</div>
                <div class="cell__ft">
                  <svg-icon
                    iconClass="index_right"
                    className="icon_search"
                  ></svg-icon>
                </div>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <!-- Search results, or the empty state when the search matched nothing -->
    <div v-else>
      <search-list v-if="listData.length" :list="listData" />
      <van-empty v-else description="暂无相关内容!" />
    </div>
  </div>
</template>

<script>
import SvgIcon from '@/components/icon/SvgIcon';
import searchList from '@/components/searchList/list';

// Home page: lists every hot-list category and lets the user search entries by keyword.
export default {
  components: {
    SvgIcon,
    searchList
  },
  data() {
    return {
      value: '', // search keyword
      listData: [], // search results
      // True once a search has completed. The template needs this to tell
      // "no search yet" apart from "search returned nothing"; with listData.length
      // alone the empty state could never be displayed.
      searched: false,
      typeList: [], // all hot-list categories
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png' // fallback category logo
    };
  },
  computed: {},
  created() {
    this.getAllType();
  },
  mounted() {},
  methods: {
    // Load every hot-list category.
    getAllType() {
      this.$api.getAllType().then(res => {
        if (res.code === 0) {
          this.typeList = res.data;
        }
      });
    },

    // Open the detail page of a category; the item is passed as a JSON query string.
    goDateils(item) {
      this.$router.push({
        name: 'details',
        query: {
          item: JSON.stringify(item)
        }
      });
    },

    // Search entries of all categories by keyword.
    onSearch(e) {
      const params = {
        typeName: '全部',
        size: 10000,
        secondTitle: e
      };
      this.$api.getAllInfoGzip(params).then(res => {
        if (res.code == 0) {
          const rows = res.data.content;
          // Add the derived fields before the rows are assigned to component state,
          // so that Vue 2 picks them up as reactive properties.
          this.handleData(rows);
          this.listData = rows;
          this.searched = true;
        }
      });
    },

    // Search box cleared: go back to the category list.
    onClear() {
      this.listData = [];
      this.searched = false;
    },

    // Add the display fields derived from createDate to every row.
    handleData(data) {
      data.forEach(item => {
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new; // whether the entry counts as new
        item.CreateTime = diff.Time;
      });
    },

    // Logo failed to load: fall back to the default image.
    imgError(item) {
      item.logo = this.defaultUrl;
    }
  }
};
</script>

details.vue

<template>
  <div class="topic-list">
    <!-- Header: category logo (as background and as icon) and category title -->
    <div class="info-top">
      <img class="info-bg" :src="details.logo" @error="imgError" alt="" />
      <div class="info-content">
        <div class="top-column">
          <!-- Navigates back to the home route -->
          <p @click="$router.push('/')">摸鱼热榜</p>
        </div>
        <img class="pic-icon" :src="details.logo" @error="imgError" alt="" />
        <h1 class="info-title">{{ details.title }}</h1>
      </div>
    </div>
    <!-- Entry list: pull down to refresh (onRefresh), scroll to load more (onLoad) -->
    <div class="divider">
      <van-pull-refresh v-model="refreshing" @refresh="onRefresh">
        <van-list
          v-model="loading"
          :finished="finished"
          @load="onLoad"
          :immediate-check="false"
        >
          <div class="panel_bd">
            <!-- One link per entry, pointing at the entry's url -->
            <a
              v-for="(item, index) in listData"
              :key="item.id"
              :href="item.url"
              class="media-box van-hairline--bottom"
            >
              <div class="media-box__bd">
                <h4 class="media-box__title">
                  {{ index + 1 }}、{{ item.secondTitle }}
                </h4>
                <div class="dec-row">
                  <!-- Popularity text, only rendered when present -->
                  <span class="tag" v-if="item.hot">
                    <span>{{ item.hot }}</span>
                  </span>
                  <span class="time">
                    <span>{{ item.CreateTime }}</span>
                  </span>
                  <!-- "New" badge, driven by the item.new field set in handleData -->
                  <span class="new" v-if="item.new">新</span>
                </div>
              </div>
            </a>
          </div>
        </van-list>
      </van-pull-refresh>
    </div>
    <!-- End-of-list marker, shown once finished is true -->
    <div class="footer-flag flex-center" v-if="finished">
      <p class="flex-center">我是有底线的</p>
    </div>
  </div>
</template>

<script>
// Category detail page: paged list of the entries of one hot-list category.
export default {
  data() {
    return {
      page: 1, // current page number
      refreshing: false, // pull-to-refresh in progress
      loading: false, // load-more in progress
      finished: false, // no more pages to load
      listData: [], // entries loaded so far
      details: {}, // category passed in through the route query
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png' // fallback category logo
    };
  },
  computed: {},
  created() {},
  mounted() {
    // The home page passes the selected category as a JSON string in query.item.
    this.details = JSON.parse(this.$route.query.item);
    this.getList(this.details, this.page);
  },
  methods: {
    // Load one page of entries of the category and append it to the list.
    //   item    - category; its title is sent as typeName
    //   page    - page number to request
    //   loading - forwarded as second argument to $api.getAllInfoGzip
    getList(item, page, loading = true) {
      const previous = this.listData;
      const params = {
        typeName: item.title,
        size: 50,
        page
      };
      this.$api.getAllInfoGzip(params, loading).then(res => {
        if (res.code == 0) {
          const rows = res.data.content;
          // Add the derived fields to the new rows only, and before they are assigned to
          // component state, so that Vue 2 picks them up as reactive properties.
          this.handleData(rows);
          this.listData = previous.concat(rows);
          // Load-more and pull-to-refresh are done.
          this.loading = false;
          this.refreshing = false;
          // Compare the page that was actually requested, not the shared counter.
          if (page >= res.data.totalPages) {
            this.finished = true;
          }
        }
      });
    },

    // Scrolled to the bottom: load the next page.
    onLoad() {
      this.loading = true;
      this.getList(this.details, ++this.page, false);
    },

    // Pulled down: reset the list and reload the first page.
    onRefresh() {
      this.finished = false;
      this.loading = true;
      this.listData = [];
      this.page = 1;
      this.getList(this.details, 1, false);
    },

    // Add the display fields derived from createDate to every row.
    handleData(data) {
      data.forEach(item => {
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new; // whether the entry counts as new
        item.CreateTime = diff.Time;
      });
    },

    // Logo failed to load: fall back to the default image.
    // The template binds details.logo, so that is the field to replace.
    imgError() {
      this.details.logo = this.defaultUrl;
    }
  }
};
</script>

xxx.sql

SET NAMES utf8mb4;
-- Foreign key checks are switched off while the table is dropped and recreated.
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for node
-- One row per crawled hot-list entry.
-- ----------------------------
DROP TABLE IF EXISTS `node`;
-- NOTE: the utf8mb4_0900_ai_ci collation is only available from MySQL 8.0 on.
CREATE TABLE `node` (
  `id` varchar(255) NOT NULL,
  `create_date` datetime DEFAULT NULL,
  `hot` varchar(1024) DEFAULT NULL,
  `second_title` longtext,
  `title` varchar(1024) DEFAULT NULL,
  `url` longtext,
  `logo` varchar(1024) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

-- Restore foreign key checks for the rest of the session.
SET FOREIGN_KEY_CHECKS = 1;

作者:Java开发项目
链接:https://mp.weixin.qq.com/s/z9J1gL7orSL90ngSQeRRhg
欢迎大家关注:有故事的程序员,每天更新Java技术知识点,还可以领取Java进阶学习资料哦~
资料包含的模块分为19个模块,分别是: Java 基础、容器、多线程、反射、对象拷贝、Java Web 、异常、网络、设计模式、Spring/Spring MVC、Spring Boot/Spring Cloud、Hibernate、MyBatis、RabbitMQ、Kafka、Zookeeper、MySQL、Redis、JVM 。
spring boot+vue实现爬取各大平台每日热榜数据功能

上一篇:python爬虫之爬取图片(一)


下一篇:C#性能分析反射VS配置文件VS预编译