Automated Website Content Crawling with Spring Boot + WebMagic

Source: https://funyan.cn/p/6861.html

Preface

On a website, content is king. Your site may be feature-rich and good-looking, but without content it means little, and writing everything yourself is a very long road. A common shortcut is to borrow content from other sites to flesh out your own, which is exactly what crawlers are for. This post walks through building an automated content crawler with Spring Boot and WebMagic.

Development Process

Step 1: Add the WebMagic dependency

<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
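
Note: webmagic 0.7.3 pulls in an slf4j-log4j12 binding that can clash with Spring Boot's default Logback logging. If you see "multiple SLF4J bindings" warnings at startup, a common fix (a suggested tweak, not from the original post) is to exclude it:

<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <!-- avoid conflicting with Spring Boot's Logback binding -->
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>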

Step 2: Write the crawler utility class

package cn.funyan.blog.task;

import cn.funyan.blog.Article;
import cn.funyan.blog.ArticleSpider;
import cn.funyan.blog.service.ArticleSpiderService;
import cn.funyan.utils.SpringUtil;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

/**
 * @Description: automated crawler utility class
 * @Author: Chris.Ren
 * @Date: 2021-06-29
 */

@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@Slf4j
@Component
public class WebMagicUtil implements PageProcessor
{

	// id of the crawl configuration record
	private Integer id;
	// start url of the target site
	private String url;
	// CSS selector for the article list
	private String listDom;
	// CSS selector for the pagination link
	private String pageDom;
	// CSS selector for the detail-page link inside a list item
	private String articleDom;
	// CSS selector for the article title
	private String headDom;
	// CSS selector for the article content
	private String contentDom;
	// CSS selector for the article tags
	private String tagDom;
	// tag type: 1 reads tags from the page, otherwise tagDom holds the literal tags
	private Integer tagType;
	// title image for the article
	private String img;

	// Autowired here because the pipeline is needed when the spider is created
	@Autowired
	private SaveArticle saveArticle;

	private ArticleSpiderService articleSpiderService;


	// Copy constructor: Spider instantiates this processor outside the Spring
	// container, so the beans it needs are fetched manually via SpringUtil.
	public WebMagicUtil(WebMagicUtil webMagicUtil) {
		this.url = webMagicUtil.getUrl();
		this.listDom = webMagicUtil.getListDom();
		this.pageDom = webMagicUtil.getPageDom();
		this.articleDom = webMagicUtil.getArticleDom();
		this.headDom = webMagicUtil.getHeadDom();
		this.contentDom = webMagicUtil.getContentDom();
		this.tagDom = webMagicUtil.getTagDom();
		this.tagType = webMagicUtil.getTagType();
		this.img = webMagicUtil.getImg();
		saveArticle = SpringUtil.getBean(SaveArticle.class);
		articleSpiderService = SpringUtil.getBean(ArticleSpiderService.class);
	}

	@Override
	public void process(Page page) {

		Html html = page.getHtml();

		// select the article-list nodes; if the selector matches at most one
		// node, treat this as a detail page rather than a list page
		List<Selectable> linkList = html.css(this.listDom).nodes();
		if (linkList.size() <= 1) {
			// extract the article and stage it for the pipeline
			preSaveArticle(page);
		} else {
			// iterate over the article list and queue every detail page
			for (Selectable link : linkList) {
				// detail-page link
				String artLink = link.css(this.articleDom).links().toString();

				page.addTargetRequest(artLink);
			}

			// queue the next list page, if there is one
			String bkUrl = html.css(this.pageDom).links().toString();
			if (!StringUtils.isEmpty(bkUrl)) {
				page.addTargetRequest(bkUrl);
			}
		}
	}

	// parse an article detail page and stage the result for the pipeline
	private void preSaveArticle(Page page) {
		try {
			Html html = page.getHtml();
			// by this point there are already 15 detail pages queued
			Article article = new Article();
			article.setTitlePhoto(this.img);

			// title
			article.setHead(html.css(this.headDom, "text").toString());
			log.info("Parsing article: {}", article.getHead());

			// content
			String content = html.css(this.contentDom).toString();
			article.setContent(content);

			// tags: type 1 reads them from the page, otherwise tagDom holds the literal tags
			String tag = "";
			if (tagType.equals(1)) {
				List<Selectable> tagList = html.css(this.tagDom, "text").nodes();
				for (Selectable tags : tagList) {
					tag = tag + tags.toString() + ",";
				}
				// strip the trailing comma (safe even when no tags were found)
				article.setTags(StringUtils.removeEnd(tag, ","));
			} else {
				tag = this.tagDom;
				article.setTags(tag);
			}

			// classification and review status
			article.setArticleClassify(3);
			article.setCheckStatus(0);
			article.setArticleType(2);
			article.setDescription(article.getHead());
			// attribute the article to a random user id between 1 and 5
			article.setUserId((int) (Math.random() * 5 + 1));

			page.putField("artInfo", article);
		} catch (Exception e) {
			log.error("Failed to parse article page", e);
		}
	}

	// shared crawl settings: charset, timeout and retry policy
	private Site site = new Site()
			.setCharset("utf8")
			.setTimeOut(10000)
			.setRetrySleepTime(3000)
			.setRetryTimes(3);

	@Override
	public Site getSite() {
		return site;
	}


	public void processor(WebMagicUtil webMagicUtil) {
		this.url = webMagicUtil.getUrl();
		log.info("Crawling site: {}", url);

		// run the spider with a Bloom-filter-deduplicated queue, 10 threads,
		// and the SaveArticle pipeline for persistence
		Spider.create(new WebMagicUtil(webMagicUtil))
				.addUrl(url)
				.setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
				.thread(10)
				.addPipeline(saveArticle)
				.run();
	}

	// Quick local test entry point. Note the placeholder selectors ("111", "www"),
	// and that SpringUtil needs a running application context for the pipeline bean.
	public static void main(String[] args) {
		WebMagicUtil wb = new WebMagicUtil();
		wb.processor(WebMagicUtil
				.builder()
				.articleDom("111")
				.contentDom("www")
				.url("https://funyan.cn")
				.build());
	}

}
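
The copy constructor above looks beans up through cn.funyan.utils.SpringUtil, which the original post never shows. A minimal sketch of such a helper, assuming the usual ApplicationContextAware pattern, might look like this:

package cn.funyan.utils;

import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;

// Hypothetical sketch of the SpringUtil helper referenced above (not shown in
// the original post). It captures the ApplicationContext so that objects
// created with `new`, like the WebMagicUtil copies, can still look up beans.
@Component
public class SpringUtil implements ApplicationContextAware {

	private static ApplicationContext context;

	@Override
	public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
		context = applicationContext;
	}

	public static <T> T getBean(Class<T> clazz) {
		return context.getBean(clazz);
	}
}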

Step 3: Write the persistence class, i.e. the pipeline that saves what you crawl

package cn.funyan.blog.task;

import cn.funyan.blog.Article;
import cn.funyan.blog.service.ArticleService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * Pipeline that persists the articles extracted by the crawler.
 **/
@Slf4j
@Component
public class SaveArticle implements Pipeline
{

	@Autowired
	private ArticleService articleService;


	@Override
	public void process(ResultItems resultItems, Task task) {

		Article article = resultItems.get("artInfo");

		if (article != null) {
			log.info("Saving article: {}", article.getHead());
			articleService.sendNewArticle(article);
		}
	}
}
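
The Article entity is not shown in the original either. Reconstructed purely from the setters used in WebMagicUtil, a minimal sketch (field types are assumptions) could be:

package cn.funyan.blog;

import lombok.Data;

// Hypothetical sketch of the Article entity, reconstructed from the setters
// used in WebMagicUtil; field names follow the code above, types are guesses.
@Data
public class Article {
	// title image url
	private String titlePhoto;
	// article title
	private String head;
	// article html content
	private String content;
	// comma-separated tags
	private String tags;
	// classification, review status and article type flags
	private Integer articleClassify;
	private Integer checkStatus;
	private Integer articleType;
	// short description (here just the title)
	private String description;
	// id of the user the article is attributed to
	private Integer userId;
}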

Step 4: Invoke it. Since the utility class is managed by Spring, we just inject it wherever it is needed:

	@Autowired
	private WebMagicUtil webMagicUtil;


	@PostMapping("spiderArticle")
	public Res spiderArticle(Integer id) {
		// check whether this source has already been crawled
		ArticleSpider as = service.getById(id);
		if (as == null) {
			return new Res().failed();
		}

		if (as.getStatus().equals(1)) {
			return new Res().failed().setMsg("This article has already been crawled; do not crawl it again");
		}
		webMagicUtil.processor(WebMagicUtil
				.builder()
				.id(as.getId())
				.url(as.getUrl())
				.img(as.getImg())
				.listDom(as.getListDom())
				.articleDom(as.getArticleDom())
				.pageDom(as.getPageDom())
				.headDom(as.getHeadDom())
				.contentDom(as.getContentDom())
				.tagDom(as.getTagDom())
				.tagType(as.getTagType())
				.build());

		return new Res();
	}
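
One gap worth noting: the endpoint rejects repeat crawls based on the status flag, but nothing in the original ever sets that flag after a successful run. Since Spider.run() blocks until the crawl finishes, a sketch of the fix (assuming service is a MyBatis-Plus IService<ArticleSpider> and status 1 means "crawled") is to update the record just before the final return:

		// Mark the source as crawled so the status check above rejects repeat calls.
		// (Not in the original post; assumes a MyBatis-Plus IService<ArticleSpider>.)
		as.setStatus(1);
		service.updateById(as);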


That completes the Spring Boot + WebMagic automated crawler. From here you can call this endpoint from an admin backend, supply a site's URL and DOM selectors, and have content crawled automatically.

