I. Requirements Analysis
Target site: https://www.cnblogs.com/
Analysis:
Data needed: title, summary, original article URL, publish time.
The scraped data will be stored in a database.
II. Database Design
Fields to persist: title, summary, original article URL, publish time.
Article table t_bkyarticle: id (primary key), title (article title), summary (article summary), detailurl (original article URL), pubtime (publish time), ctime (record creation time).
SQL script:
create database db_data1906;
use db_data1906;
create table t_bkyarticle(
    id int primary key auto_increment,
    title varchar(100),
    summary text,
    detailurl varchar(200),
    pubtime date,
    ctime date
);
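The code in the next section also needs a datasource pointing at this database. A minimal application.properties sketch, assuming a local MySQL instance and the standard Spring Boot datasource properties; the username and password are placeholders:

spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
spring.datasource.url=jdbc:mysql://localhost:3306/db_data1906?useUnicode=true&characterEncoding=utf8&serverTimezone=Asia/Shanghai
spring.datasource.username=root
spring.datasource.password=123456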
III. Implementation
Tech stack: Spring Boot, with MyBatis-Plus for persistence and WebMagic for the crawler.
1. Create the project
Create a standard Spring Boot project.
2. Add the dependency JARs
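A minimal Maven sketch of the dependencies the code below uses (web starter, MyBatis-Plus, WebMagic, Lombok, springfox Swagger, MySQL driver). The version numbers are illustrative assumptions; align them with your build:

<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
    <groupId>com.baomidou</groupId>
    <artifactId>mybatis-plus-boot-starter</artifactId>
    <version>3.1.2</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
</dependency>
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger2</artifactId>
    <version>2.9.2</version>
</dependency>
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger-ui</artifactId>
    <version>2.9.2</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <scope>runtime</scope>
</dependency>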
3. Write the code layer by layer
Entity layer
import com.baomidou.mybatisplus.annotation.IdType;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;
import java.util.Date;

// Maps to the t_bkyarticle table created above
@TableName("t_bkyarticle")
@Data
public class BkyArticle {
    @TableId(type = IdType.AUTO) // auto-increment primary key
    private Integer id;
    private String title;     // article title
    private String summary;   // article summary
    private String detailurl; // original article URL
    private Date pubtime;     // publish time
    private Date ctime;       // record creation time
}
Persistence layer
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;

@Mapper // register with MyBatis so Spring can inject this interface
public interface BkyArticleDao extends BaseMapper<BkyArticle> {
    // custom insert so ctime is filled with now() on the database side
    @Insert("insert into t_bkyarticle(title,summary,detailurl,pubtime,ctime) values(#{title},#{summary},#{detailurl},#{pubtime},now())")
    int save(BkyArticle article);
}
Business logic layer
import com.baomidou.mybatisplus.extension.service.IService;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import org.springframework.stereotype.Service;

public interface BkyArticleService extends IService<BkyArticle> {
    boolean saveEntity(BkyArticle article);
}

@Service
public class BkyArticleServiceImpl extends ServiceImpl<BkyArticleDao, BkyArticle> implements BkyArticleService {
    @Override
    public boolean saveEntity(BkyArticle article) {
        // delegate to the custom mapper insert; >0 means one row was written
        return getBaseMapper().save(article) > 0;
    }
}
4. Write the crawler core code
Custom page processor (WebMagic PageProcessor)
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

@Service
public class BkyArticlePage implements PageProcessor {
    private String baseUrl = "https://www.cnblogs.com/";

    @Override
    public void process(Page page) {
        // 1. Parse the current page (the leading // lets the XPath match anywhere in the document)
        List<String> titles = page.getHtml().xpath("//div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/h3/a/text()").all();
        List<String> urls = page.getHtml().xpath("//div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/h3/a/@href").all();
        List<String> infos = page.getHtml().xpath("//div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/p[@class='post_item_summary']/text()").all();
        List<String> times = page.getHtml().xpath("//div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/div[@class='post_item_foot']/a/text()").all();
        // 2. Assemble the parsed results
        List<BkyArticle> articles = new ArrayList<>();
        for (int i = 0; i < titles.size(); i++) {
            BkyArticle article = new BkyArticle();
            article.setTitle(titles.get(i));
            article.setSummary(infos.get(i));
            article.setDetailurl(urls.get(i));
            article.setPubtime(parseTime(getTimeStr(times.get(i))));
            articles.add(article);
        }
        // 3. Hand the results to the result handler (Pipeline)
        page.putField("list", articles);
        // 4. Pagination: only on the first page, compute the other page URLs and queue them
        if (page.getUrl().get().equals(baseUrl)) {
            List<String> pageurls = new ArrayList<>();
            List<String> allpages = page.getHtml().xpath("//div[@id='paging_block']/div[@class='pager']/a/text()").all();
            // the last pager link is "Next", so the second-to-last entry holds the highest page number
            int maxPage = Integer.parseInt(allpages.get(allpages.size() - 2));
            for (int i = 2; i <= maxPage; i++) {
                // baseUrl already ends with '/', so no extra slash here
                pageurls.add(baseUrl + "sitehome/p/" + i);
            }
            // mark these pages to be crawled next
            page.addTargetRequests(pageurls);
        }
    }

    // Keep only the "yyyy-MM-dd HH:mm" part after the first space; null if there is none
    private String getTimeStr(String s) {
        String s1 = s.trim();
        if (s1.indexOf(' ') > 0) {
            return s1.substring(s1.indexOf(' ') + 1);
        } else {
            return null;
        }
    }

    private Date parseTime(String time) {
        if (time != null) {
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
            try {
                return sdf.parse(time);
            } catch (ParseException e) {
                e.printStackTrace();
                return new Date(); // fall back to "now" on an unexpected format
            }
        } else {
            return new Date();
        }
    }

    // 6s request timeout, 2s pause between requests to stay polite to the site
    private Site site = Site.me().setTimeOut(6000).setSleepTime(2000);

    @Override
    public Site getSite() {
        return site;
    }
}
Result handler (WebMagic Pipeline)
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.List;

@Component
public class BkyArticPipeline implements Pipeline {
    @Autowired
    private BkyArticleDao bkyArticleDao;

    @Override
    public void process(ResultItems resultItems, Task task) {
        // receive the list the page processor put into ResultItems and persist it
        List<BkyArticle> articleList = resultItems.get("list");
        System.out.println("Scraped records: " + articleList.size());
        for (BkyArticle a : articleList) {
            bkyArticleDao.save(a);
        }
    }
}
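Because the Pipeline is a Spring bean (and is injected into the controller below rather than created with new), the @Autowired DAO reference inside it is actually populated; a hand-instantiated Pipeline would have a null bkyArticleDao.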
5. Write the startup endpoints
Controller that runs the crawl
import io.swagger.annotations.Api;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;
import us.codecraft.webmagic.Spider;

@Api
@RestController
public class BkyArticController {
    @Autowired
    private BkyArticleService bkyArticleService;
    @Autowired
    private BkyArticlePage page;
    @Autowired
    private BkyArticPipeline pipeline;

    // Start the crawler; start() runs it asynchronously, so the HTTP request
    // returns immediately instead of blocking until the whole crawl finishes
    // (R is the project's generic response wrapper, defined elsewhere in the project)
    @GetMapping("/api/spider/start.do")
    public R start() {
        Spider.create(page).addPipeline(pipeline).addUrl("https://www.cnblogs.com/").thread(5).start();
        return R.ok("Crawl started");
    }

    // Query the scraped data
    @GetMapping("/api/bkyartic/all.do")
    public R all() {
        return R.ok(bkyArticleService.list());
    }
}
6. Configure Swagger
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import springfox.documentation.builders.ApiInfoBuilder;
import springfox.documentation.builders.RequestHandlerSelectors;
import springfox.documentation.service.ApiInfo;
import springfox.documentation.service.Contact;
import springfox.documentation.spi.DocumentationType;
import springfox.documentation.spring.web.plugins.Docket;
import springfox.documentation.swagger2.annotations.EnableSwagger2;

@Configuration // configuration class
@EnableSwagger2 // enable springfox Swagger 2
public class SwaggerConfig {
    // Build the API document description
    public ApiInfo createAI() {
        return new ApiInfoBuilder().title("Article API").description("A data API backed by a web crawler")
                .contact(new Contact("Feri", "http://www.17feri.top", "xingfei_work@163.com")).build();
    }

    // Register the Docket that tells Swagger which package to scan
    @Bean
    public Docket createD() {
        return new Docket(DocumentationType.SWAGGER_2).apiInfo(createAI()).select()
                .apis(RequestHandlerSelectors.basePackage("com.feri.point.controller")).build();
    }
}
7. Start and test
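A minimal sketch of the boot class needed to start everything; the class name SpiderApplication is hypothetical, and the package is assumed to sit above the controller package scanned by the Swagger config:

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class SpiderApplication { // hypothetical name; any Spring Boot main class works
    public static void main(String[] args) {
        SpringApplication.run(SpiderApplication.class, args);
    }
}

With the application running on the default port 8080, open http://localhost:8080/swagger-ui.html (the springfox 2 UI path), call GET /api/spider/start.do to kick off the crawl, then GET /api/bkyartic/all.do to check that rows are being saved.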