After Jianshu's redesign, the feature for searching my articles by title disappeared.
Jianshu does offer a bulk download of all your articles, but the downloaded files are plain Markdown and do not contain the article URLs, which does not meet my needs.
Being a programmer, if the feature is gone I can simply build it myself.
Open the Jianshu home page and you will see that only 8 articles are shown by default; scrolling to the bottom of the page triggers a lazy-load event that fetches more articles from the backend, so the article list is paginated on the server side.
Open Chrome Developer Tools and watch the network requests: in the request URL (for example https://www.jianshu.com/u/99b8712e8850?order_by=shared_at&page=2), 99b8712e8850 is my Jianshu user id, and page=2, 3, 4… is the page number.
Each page of articles is returned as an HTML fragment inside the response body.
The only two fields I care about are the article title and the article link.
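Before looking at the full crawler, here is a minimal sketch of just that extraction step, assuming each article appears in the returned HTML as a li > div > a node with an href of the form /p/&lt;id&gt; (the same structural assumption the full scripts below rely on, although they walk the DOM by hand instead of using a selector):

var { JSDOM } = require("jsdom");

// Minimal sketch: pull every (title, url) pair out of one page of response HTML.
// Assumes the li > div > a structure described above.
function extractArticles(html) {
    var document = new JSDOM(html).window.document;
    var links = document.querySelectorAll("li > div > a");
    var result = [];
    for (var i = 0; i < links.length; i++) {
        var href = links[i].getAttribute("href");
        // only article links look like /p/<id>
        if (href && href.indexOf("/p/") === 0) {
            result.push({
                title: links[i].textContent.trim(),
                url: "https://www.jianshu.com" + href
            });
        }
    }
    return result;
}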
My first attempt was a Node.js application that fires the page requests concurrently:
var request = require('request');
var jsdom = require("jsdom");
var JSDOM = jsdom.JSDOM;

const PREFIX = "https://www.jianshu.com";
const PAGE = "https://www.jianshu.com/u/99b8712e8850?order_by=shared_at&page=";
const MAX = 2;

// title -> full article URL
var mArticleResult = new Map();
var pageNumber;
/* a given article: https://www.jianshu.com/p/963cd23fb092
   value got from API: /p/5c1d0319dc42
*/
var lastPageReached = false;
var aHandlers = [];

// use a limited for loop to ease testing
for (var i = 0; i < MAX; i++) {
    pageNumber = i + 1;
    var url = PAGE + pageNumber;
    // console.log("current page: " + url);
    var pageOptions = {
        url: url,
        method: "GET",
        headers: {
            "Accept": "text/html"
        }
    };
    aHandlers.push(getArticles(pageOptions, pageNumber));
    // note: the flag is only set inside an async callback, so this check
    // never fires here; all MAX requests are issued concurrently
    if (lastPageReached)
        break;
}
console.log("promise handler size: " + aHandlers.length);

Promise.all(aHandlers).then(function () {
    var articleIndex = 0;
    for (var [key, value] of mArticleResult) {
        console.log("Article[" + articleIndex++ + "]: " + key + " = " + value);
    }
    console.log("done");
});

// fetch one page and collect every article title/link found in it
function getArticles(pageOptions, pageNumber) {
    return new Promise(function (resolve, reject) {
        var requestC = request.defaults({ jar: true });
        requestC(pageOptions, function (error, response, body) {
            if (error) {
                console.log("error: " + error);
                resolve(error);
                return;
            }
            var document = new JSDOM(body).window.document;
            var content = document.getElementsByTagName("li");
            for (var i = 0; i < content.length; i++) {
                var li = content[i];
                var children = li.childNodes;
                for (var j = 0; j < children.length; j++) {
                    var eachChild = children[j];
                    if (eachChild.nodeName == "DIV") {
                        var grandChild = eachChild.childNodes;
                        for (var k = 0; k < grandChild.length; k++) {
                            var grand = grandChild[k];
                            if (grand.nodeName == "A") {
                                var fragment = grand.getAttribute("href");
                                // only article links look like /p/<id>
                                if (fragment.indexOf("/p") < 0)
                                    continue;
                                console.log("title: " + grand.text);
                                var wholeURL = PREFIX + fragment;
                                console.log("url: " + wholeURL);
                                // a title seen before means this page was already fetched
                                if (mArticleResult.has(grand.text)) {
                                    lastPageReached = true;
                                    console.log("article size: " + mArticleResult.size);
                                    resolve(pageNumber);
                                    return;
                                }
                                mArticleResult.set(grand.text, wholeURL);
                            }
                        }
                    }
                }
            } // end of outer loop
            resolve(pageNumber);
        });
    });
}
The idea is to use the Node.js request module to fire several requests against jianshu.com concurrently, one request per page of articles.
It turned out that this approach breaks down once more than about 10 requests are in flight at the same time: jianshu.com rejects them and returns HTTP 429 (Too Many Requests).
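One gentler alternative (not the route I took) would be to keep the asynchronous request module but fetch the pages strictly one after another with a small pause between them, so the server never sees a burst of concurrent requests. A rough sketch, reusing the PAGE constant, the lastPageReached flag and the handleResponseHTML parser from the script below; the 300 ms delay is an arbitrary guess, not a documented limit:

var request = require("request");

function sleep(ms) {
    return new Promise(function (resolve) { setTimeout(resolve, ms); });
}

// fetch pages sequentially with a short pause to stay under the
// server's (undocumented) rate limit; PAGE, lastPageReached and
// handleResponseHTML are the same as in the script below
async function fetchAllPages(maxPages) {
    for (var page = 1; page <= maxPages && !lastPageReached; page++) {
        var body = await new Promise(function (resolve, reject) {
            request({ url: PAGE + page, headers: { "Accept": "text/html" } },
                function (error, response, html) {
                    if (error) reject(error);
                    else resolve(html);
                });
        });
        handleResponseHTML(body);
        await sleep(300); // arbitrary politeness delay
    }
}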
So in the end I settled on the simplest synchronous implementation: Node's sync-request package, called inside a plain loop.
var request = require("sync-request");
var jsdom = require("jsdom");
var JSDOM = jsdom.JSDOM;
var textEncoding = require('text-encoding');
var textDecoder = textEncoding.TextDecoder;
const PREFIX = "https://www.jianshu.com";
const PAGE = "https://www.jianshu.com/u/99b8712e8850?order_by=shared_at&page=";
const MAX = 100;
var mArticleResult = new Map();
var lastPageReached = false;
var pageNumber;
/* a given article: https://www.jianshu.com/p/963cd23fb092
value got from API: /p/5c1d0319dc42
*/
try {
// use limited for loop to ease testing
for (var i = 0; i < MAX; i++) {
if( lastPageReached)
break;
pageNumber = i + 1;
var url = PAGE + pageNumber;
console.log("current page: " + url);
var response = request('GET', url);
var html = new textDecoder("utf-8").decode(response.body);
handleResponseHTML(html);
}
}
catch (e) {
}
var articleIndex = 0;
var resultHTML = "<html>";
const fs = require('fs');
/*
<HTML>
<p>
<a href="https://www.baidu.com">eee</a>
</p>
<p><a>22</a></p>
<p><a>33</a></p>
</HTML>
*/
var index = 1;
for (var [key, value] of mArticleResult) {
var article = "<p><a href=\"" + key + "\">" +
index++ + ". " + value + "</a></p>" + "\n";
resultHTML = resultHTML + article;
console.log("Article[" + articleIndex++ + "]: " + value + " = " + key);
}
resultHTML = resultHTML + "</html>";
var pwd = process.cwd() + "/jianshu.html";
fs.appendFileSync(pwd, resultHTML);
console.log("done");
function handleResponseHTML(html) {
var document = new JSDOM(html).window.document;
var content = document.getElementsByTagName("li");
for (var i = 0; i < content.length; i++) {
var li = content[i];
var children = li.childNodes;
for (var j = 0; j < children.length; j++) {
var eachChild = children[j];
if (eachChild.nodeName == "DIV") {
var grandChild = eachChild.childNodes;
for (var k = 0; k < grandChild.length; k++) {
var grand = grandChild[k];
if (grand.nodeName == "A") {
var fragment = grand.getAttribute("href");
if (fragment.indexOf("/p") < 0)
continue;
// console.log("title: " + grand.text);
var wholeURL = PREFIX + fragment;
// console.log("url: " + wholeURL);
if (mArticleResult.has(wholeURL)) {
lastPageReached = true;
console.log("article size: " + mArticleResult.size);
return;
}
mArticleResult.set(wholeURL, grand.text);
}
}
}
}
}
}
After this Node.js application runs, it generates an HTML file locally containing the title and hyperlink of every article.
This article comes from "汪子熙", a Yunqi Community (云栖社区) partner; for more, follow the WeChat public account "汪子熙".