使用 Node.js 采集新华网新闻数据
代码段:
/**
 * HTTP entry point: an Express app that exposes the news data scraped by
 * ./findData as JSON.
 *
 * Routes:
 *   GET /getIndexData                     grouped lists from the index page
 *   GET /getList[/start[,end]]            flat list, optionally sliced
 *   GET /getNewsList?pageNum=N&pageSize=M paged view over a cached flat list
 */
var express = require('express');
var $ = require('jQuery');
var app = express();
// `colors` is needed for the `.green` accessor used on the startup message below.
var colors = require('colors');
var message_list = require('./zui/message_list');
var findData = require('./findData');

// Page size used by /getNewsList when the query string has no usable pageSize.
var DEFAULT_PAGE_SIZE = 10;

/**
 * Parse a captured route parameter into a non-negative integer index.
 *
 * @param {string|undefined} raw - Text captured by the route regex, if any.
 * @returns {number|undefined} The parsed integer, or undefined when absent/invalid.
 */
function toIndex(raw) {
    var parsed = parseInt(raw, 10);
    return isNaN(parsed) ? undefined : parsed;
}

/**
 * Return one page of `list` as selected by `query.pageNum` / `query.pageSize`.
 * Missing, non-numeric or non-positive values fall back to page 1 and
 * DEFAULT_PAGE_SIZE instead of producing NaN slice bounds (an empty page).
 *
 * @param {Array} list - Full list to page through.
 * @param {Object} query - Parsed query string (values are strings).
 * @returns {Array} The items on the requested page.
 */
function paginate(list, query) {
    var pageNum = parseInt(query.pageNum, 10);
    var pageSize = parseInt(query.pageSize, 10);
    if (!(pageNum >= 1)) {
        pageNum = 1;
    }
    if (!(pageSize >= 1)) {
        pageSize = DEFAULT_PAGE_SIZE;
    }
    var start = (pageNum - 1) * pageSize;
    return list.slice(start, start + pageSize);
}

// Allow cross-origin access on every route and label every response as UTF-8 JSON.
app.all('*', function (req, res, next) {
    res.header("Access-Control-Allow-Origin", "*");
    res.header("Access-Control-Allow-Headers", "Origin, X-Requested-With, Content-Type, Accept");
    res.header("Access-Control-Allow-Methods", "PUT,POST,GET,DELETE,OPTIONS");
    res.header("X-Powered-By", ' 3.2.1');
    res.header("Content-Type", "application/json;charset=utf-8");
    next();
});

// Home page data: the lists found on the index page, grouped.
app.get('/getIndexData', function (req, res) {
    findData.indexData(req, res);
});

/*
 * Flat list data with optional range selection (indices start at 0).
 *   http://localhost:3000/getList/0,9   items 0..8
 *   http://localhost:3000/getList/0     everything from index 0
 */
app.get(/^\/getList(?:\/(\d+)(?:,(\d+))?)?/, function (req, res) {
    // Read the captures by position rather than via req.params.length:
    // depending on the Express version req.params is an array or a plain
    // object, and only the former has a length.
    var start = toIndex(req.params[0]);
    var end = toIndex(req.params[1]);

    findData.list(req, res, start, end, function (data) {
        for (var i = 0, j = data.length; i < j; i++) {
            data[i].orderby = "1";
        }
        return data;
    });
});

// Cached, decorated news list; filled by the first successful /getNewsList request.
var newsListData = undefined;

app.get('/getNewsList', function (req, res) {
    if (typeof newsListData != "undefined") {
        getNewsList(req, res);
        return;
    }

    findData.list(req, res, undefined, undefined, function (data, length) {
        for (var i = 0, j = data.length; i < j; i++) {
            data[i].orderby = Math.round(Math.random() * 6);
            data[i].summary = "暂无摘要";
            data[i].updateUserName = "admin";
        }
        // Do not cache an empty scrape, otherwise every later request would
        // keep returning nothing until the process is restarted.
        if (data.length > 0) {
            newsListData = data;
        }
        // findData.list sends whatever this callback returns, so hand back the
        // requested page instead of responding here; responding here as well
        // would write the response twice.
        return paginate(data, req.query);
    });
});

/**
 * Respond with the requested page of the cached news list.
 * Must only be called once `newsListData` has been populated.
 *
 * @param {Object} req - Express request; reads `req.query.pageNum` and `req.query.pageSize`.
 * @param {Object} res - Express response.
 */
function getNewsList(req, res) {
    res.send(JSON.stringify(paginate(newsListData, req.query)));
}

app.listen(3000);
console.log("nodejs was start".green);
var http = require(‘http‘), fs = require(‘fs‘), jquery = fs.readFileSync("lib/jquery.min.js", "utf-8"), jsdom = require(‘jsdom‘); function get(url, callback) { //使用代理 var opt = { host: ‘127.0.0.1‘, port: ‘7070‘, method: ‘get‘, //这里是发送的方法 path: url } //以下是接受数据的代码 var req = http.request(opt, function(res) { res.setEncoding(‘utf8‘); var html = ‘‘; res.on(‘data‘, function(d) { html += d; }).on(‘end‘, function() { jsdom.env({ html: html, src: [jquery], done: function(errors, window) { var $ = window.$; callback && callback(errors, $); window.close(); // 释放window相关资源,否则将会占用很高的内存 } }); }); }).on(‘error‘, function(e) { console.log("Got error: " + e.message); }) req.end(); } //导出list exports.indexData = function(request, response, start, end) { get(‘http://www.news.cn/edu/index.htm‘, function(errors, $) { var $list = $(".list"); var data = []; $list.each(function(index) { var $this = $(this), $li = $this.find("li"), li_data = []; $li.each(function(index) { var $a = $(this).find("a"); var href=$a.attr("href").replace("http://news.xinhuanet.com/edu/",""); var temp_href="/"+href.replace(/[^c]*/,""); var date=href.replace(temp_href,"").replace("/","-"); if(date.length>10){ date="2013-01-04"; } li_data.push({ title: $a.text(), href: $a.attr("href"), date:date, id: index }); }); data.push({ id: index, list: li_data }); }); if (typeof start != "undefined" && typeof end == "undefined") { data = data.slice(start); } else if (typeof start != "undefined" && typeof end != "undefined") { data = data.slice(start, end); } response.send(JSON.stringify(data)); }); } //导出list exports.list = function(request, response, start, end,callback) { get(‘http://www.news.cn/edu/index.htm‘, function(errors, $) { var $li = $(".list li"); var data = []; $li.each(function(index) { var $a = $(this).find("a"); var href=$a.attr("href").replace("http://news.xinhuanet.com/edu/",""); var temp_href="/"+href.replace(/[^c]*/,""); var date=href.replace(temp_href,"").replace("/","-"); if(date.length>10){ 
date="2013-01-04"; } data.push({ id: index, title: $a.text(), attachment: $a.attr("href"), updateTime:date }); }); var newData=data; if (typeof start != "undefined" && typeof end == "undefined") { newData = data.slice(start); } else if (typeof start != "undefined" && typeof end != "undefined") { newData = data.slice(start, end); } if(typeof callback=="function"){ newData=callback(newData,data.length); } //response.header("Access-Control-Allow-Origin", "*"); //设置跨域访问 response.send(JSON.stringify(newData)); }); }
访问地址
http://localhost:3000/getIndexData
http://localhost:3000/getList/0,4
http://localhost:3000/getNewsList?pageNum=1&pageSize=10
over