因为工作需要,用nodejs写了个简单的爬虫例子,之前也没用过nodejs,连搭环境加写大概用了5天左右,so。。。要多简陋有多简陋,放这里给以后的自己看~~
整体需求是:给一个有效的URL地址,返回该网页上所有无效链接的百分比(坏链率)
第一个文件:计算坏链率 urlSpider.js
/*================================================
 @author MissUU
 Link-crawling approach:

 1. Fetch the page content.
 2. Grab every "<a ...>" tag with a regex.
 3. Extract the href value and resolve relative links against the
    page URL (javascript: links are left alone).
 4. Keep only values that look like a common URL.
================================================*/
var http = require('http');
var async = require('async');
var dbHandle = require('./dbHandle.js');

// http.get can wait forever for a response; every request is
// aborted after this many milliseconds so the series cannot stall.
var REQUEST_TIMEOUT = 10000;

/*
 * Crawl one site record: fetch the page, probe every extracted link
 * and store the resulting bad-link rate.
 *
 * @param {Object}   obj      row from dbHandle.showUrls ({id, url})
 * @param {Function} callback async.eachSeries continuation; always
 *                            invoked, even on request errors
 */
var runUrlSpider = function(obj, callback){
    // Abort if no response arrives in time (see note at top of file).
    var request_timer = setTimeout(function() {
        req.abort();
        console.log('Request Timeout.');
    }, REQUEST_TIMEOUT);

    var urlBadLink = new UrlBadLink();
    var html = '';
    var req = http.get(obj.url, function(res) {
        clearTimeout(request_timer);

        res.setEncoding('utf8');
        res.on('data', function (chunk) {
            html += chunk;
        }).on('end', function(){
            console.log('******* extracting valid links ******');
            console.log(new Date().toLocaleString());
            console.log(obj.url);
            urlBadLink.host = obj.url;
            urlBadLink.id = obj.id;
            matchURL(html, urlBadLink, function(){
                callback();
            });
        });
    });

    req.on('error', function(e) {
        console.log('problem with request: ' + e.message);
        callback(); // keep the series going even when one site fails
    });
};

// Entry point: load the URL list of group 1 and crawl each site in turn.
var main = function(){
    dbHandle.showUrls(1, function(result){
        async.eachSeries(result, runUrlSpider, function(err){
            console.log('******this is the end, haha*******');
        });
    });
};

main();

/*
 * Probe every link found in the page and record how many are broken
 * (non-2xx/3xx status or network error).
 *
 * @param {string}     content    raw page HTML
 * @param {UrlBadLink} urlBadLink result holder; host and id are
 *                                already set by the caller
 * @param {Function}   callend    invoked once the result is stored
 */
function matchURL(content, urlBadLink, callend){
    var host = urlBadLink.host;
    var anchor = /<a\s[^>]*>/g;
    var matches = content.match(anchor);
    var badLink = 0;
    var flag = 0;

    // Probe a single URL; 4xx/5xx responses and network errors both
    // count as bad links.
    var HttpGet = function(url, callback){
        var request_timer = setTimeout(function() {
            req.abort();
            console.log('Request Timeout.');
        }, REQUEST_TIMEOUT);

        var req = http.get(url, function(res) {
            clearTimeout(request_timer);
            // Consume the body so the response stream reaches 'end'.
            res.on('data', function () {
            }).on('end', function(){
                console.log(++flag + ": " + url + ' response status: ' + res.statusCode);

                if(!(res.statusCode >= 200 && res.statusCode < 400)){
                    console.log('-----------------------');
                    badLink++;
                }

                callback();
            });
        });
        req.on('error', function(err){
            // BUG FIX: the timer was never cleared here, so it later
            // fired and aborted an already-dead request.
            clearTimeout(request_timer);
            console.log(++flag + ": " + 'problem with request: ' + err.message);
            badLink++;
            callback();
        });
    };

    var urls = filterUrl(matches, host);

    // BUG FIX: an empty array used to fall through to eachSeries with
    // total = 0, storing "NaN%" as the rate.
    if(urls !== null && urls.length > 0){
        var totalLink = urls.length;
        async.eachSeries(urls, HttpGet, function(err){
            urlBadLink.total = totalLink;
            urlBadLink.badCounts = badLink;
            dbHandle.updateBadLink(urlBadLink);
            callend();
        });
    }else{
        console.log('no links found');
        // No links: store a 0% rate (non-zero total avoids 0/0).
        urlBadLink.total = 10;
        urlBadLink.badCounts = 0;
        dbHandle.updateBadLink(urlBadLink);
        callend();
    }
}

/*
 * Extract the href value from an "<a ...>" tag and resolve it
 * against the page URL.
 *
 * @param {string} strUrl a single "<a ...>" tag
 * @param {string} host   the page URL relative links resolve against
 * @returns {string|null} absolute URL, or null when no href is found
 */
function URLFommat(strUrl, host){
    var urlPatten = /href=['"]?([^'"]*)['"]?/i;
    var temp = urlPatten.exec(strUrl);

    if(temp === null)
        return null;

    // BUG FIX: use the capture group instead of
    // temp[0].substring(6, length - 1), which chopped the last
    // character off unquoted href values.
    var url = temp[1].trim();

    if(url.indexOf("\"") != -1){
        url = url.slice(url.indexOf("\"") + 1);
    }

    // Strip any trailing slashes from the host once so joins below
    // always produce exactly one separator.
    var base = host.replace(/\/+$/, "");

    if(url.charAt(0) == "/"){
        // BUG FIX: "/path" used to be joined as host + path with the
        // leading slash removed, yielding "http://x.compath".
        return base + url;
    }else if((url.indexOf("http") == -1) &&
             (url.indexOf("javascript") == -1)){
        // Relative path; BUG FIX: insert the missing "/" separator.
        return base + "/" + url;
    }else
        return url;
}

/*
 * Loose check that a string looks like a common URL
 * (optional scheme, known TLD or IP, optional path).
 *
 * @param {string} strUrl candidate URL (may be null)
 * @returns {boolean} true when the string matches
 */
function IsURL(strUrl) {
    if(strUrl != null){
        // BUG FIX: the scheme group was (http?|ftp), which rejected
        // every https:// URL.
        var regular = /^\b(((https?|ftp):\/\/)?[-a-z0-9]+(\.[-a-z0-9]+)*\.(?:com|edu|gov|int|mil|net|org|biz|info|name|museum|asia|coop|aero|[a-z][a-z]|((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d))\b(\/[-a-z0-9_:\@&?=+,.!\/~%\$]*)?)$/i;
        return regular.test(strUrl);
    }else
        return false;
}

/*
 * Result record for one crawled site.
 *
 * @param {number} id        site id (t_site.id)
 * @param {string} host      the page URL
 * @param {number} total     number of links checked
 * @param {number} badCounts number of broken links
 */
function UrlBadLink(id, host, total, badCounts){
    this.id = id;
    this.host = host;
    this.total = total;
    this.badCounts = badCounts;
}

// Defined once on the prototype; the original re-assigned it inside
// the constructor on every instantiation.
// Returns the bad-link rate as a percentage string, e.g. "12.50%".
UrlBadLink.prototype.getRate = function(){
    return Number(Math.round(this.badCounts / this.total * 10000) / 100).toFixed(2) + '%';
};

/*
 * Turn raw "<a ...>" matches into a list of well-formed absolute URLs.
 *
 * @param {Array|null} arr  result of content.match (may be null)
 * @param {string}     host the page URL relative links resolve against
 * @returns {Array|null} filtered URLs, or null when arr was null
 */
function filterUrl(arr, host){
    if(arr === null)
        return null;
    var output = [];
    arr.forEach(function(item){
        var formatURL = URLFommat(item, host);
        if(IsURL(formatURL)){
            output.push(formatURL);
        }
    });
    return output;
}
第二个文件:将数据存库,dbHandle.js
/** * @author MissUU * @des MySql基本操作 * API: https://github.com/felixge/node-mysql */ var mysql = require(‘mysql‘); mysql.createConnection(‘mysql://root:apple@localhost/test?debug=false‘); var pool = mysql.createPool({ host : ‘10.102.1.00‘, user : ‘root‘, password : ‘root‘, database : ‘test‘, connectionLimit: 15 }); //读取urls exports.showUrls = function (groupId, callback){ console.log(‘this is showUrl()‘); pool.getConnection(function(err, conn){ if (err) { console.log("connection error!"); console.log(err); } conn.query(‘SELECT id,realurl as url FROM t_site WHERE siteGroupId = ?‘,groupId, function(err, result){ if(err){ console.log(err.message); } conn.release(); if(result.length){ // console.log(result instanceof Array); callback(result); return true; }else{ callback(‘‘); return false; } }); }); }; exports.updateBadLink = function (urlBadLink){ //若不含数据则不插入 if (!!urlBadLink) { pool.getConnection(function(err, conn){ if (err) { console.log("connection error!"); console.log(err); } var updateSql = "UPDATE a_qualityinfo SET brokenRate = ‘"+ urlBadLink.getRate() +"‘ WHERE siteId = " + urlBadLink.id; console.log(updateSql); conn.query(updateSql, function(err, result){ if(err){ console.log(err.message); console.log(‘update fail‘); } conn.release(); console.log(‘update success‘); });// conn.query });//pool.getConnection } };
代码后期还会改动,这里有几点需要注意的:
1、http.get有时会一直等待响应,所以一定要判断下,超时则认为出错,要不程序就卡住了。。。= =!
2、注意callback的使用,要不然很难规范执行顺序的,用过nodejs的都懂得。。。