nodejs 采集新闻数据

使用 Node.js 采集新华网新闻数据


代码段:

// Server dependencies. The quotes must be plain ASCII quotes: typographic
// quotes are not valid JavaScript string delimiters.
var express = require('express');
var $ = require('jQuery');
var app = express();
var colors = require('colors');
var message_list = require('./zui/message_list');
var findData = require('./findData');

// Global middleware: allow cross-origin access on every route and method,
// and mark every response as UTF-8 JSON before handing over to the route.
app.all('*', function(req, res, next) {
    res.header("Access-Control-Allow-Origin", "*");
    res.header("Access-Control-Allow-Headers", "Origin, X-Requested-With, Content-Type, Accept");
    res.header("Access-Control-Allow-Methods","PUT,POST,GET,DELETE,OPTIONS");
    res.header("X-Powered-By",' 3.2.1');
    res.header("Content-Type", "application/json;charset=utf-8");
    next();
});

// Home-page list data: findData.indexData scrapes the page and writes the
// JSON response itself.
app.get('/getIndexData', function (req, res) {
	findData.indexData(req, res);
});


/*
 * List data with an optional range.
 * demo: http://localhost:3000/getList/0,9  items 0..8 (the first 9, zero-based)
 * demo: http://localhost:3000/getList/0    everything from index 0
 *
 * Every returned item is tagged with orderby = "1".
 */
app.get(/^\/getList(?:\/(\d+)(?:,(\d+))?)?/, function (req, res) {
	// Capture groups of a regex route are read by index. An optional group that
	// did not match is undefined, so no length check is needed; this works
	// whether req.params is an array or a plain object keyed by index.
	var start = toIndex(req.params[0]);
	var end = toIndex(req.params[1]);

	findData.list(req, res, start, end, function(data){
		for (var i = 0, j = data.length; i < j; i++) {
			data[i].orderby = "1";
		}
		// findData.list sends whatever this callback returns.
		return data;
	});

	// Convert a captured digit string to an integer; keep "not supplied" as undefined.
	function toIndex(raw) {
		return typeof raw === "string" ? parseInt(raw, 10) : undefined;
	}
});

// Cached news list. It is filled by the first /getNewsList request and reused
// by every later one, so the source page is scraped only once per process.
var newsListData=undefined;
app.get('/getNewsList', function (req, res) {
	// Cache already populated: answer straight from memory.
	if (typeof newsListData !== "undefined") {
		getNewsList(req, res);
		return;
	}

	// First request: scrape the full list, decorate every item, then cache it.
	findData.list(req, res, undefined, undefined, function(data) {
		for (var i = 0, j = data.length; i < j; i++) {
			data[i].orderby = Math.round(Math.random()*6);
			data[i].summary = "暂无摘要";
			data[i].updateUserName = "admin";
		}
		newsListData = data;
		// The page is sent from here by getNewsList; nothing is returned to findData.list.
		getNewsList(req, res);
	});
});

/**
 * Send one page of the cached news list (newsListData) as a JSON string.
 *
 * Query parameters:
 *   pageNum  - 1-based page number; defaults to 1 when missing or not a positive integer.
 *   pageSize - items per page; defaults to 10 when missing or not a positive integer.
 *
 * @param {Object} req request whose `query` carries pageNum / pageSize
 * @param {Object} res response; the page is written with res.send
 */
function getNewsList(req, res){
	var pageNum = parseInt(req.query.pageNum, 10);
	var pageSize = parseInt(req.query.pageSize, 10);

	// NaN fails every comparison, so these guards also catch missing or
	// non-numeric values, not only zero and negatives.
	if (!(pageNum >= 1)) {
		pageNum = 1;
	}
	if (!(pageSize >= 1)) {
		pageSize = 10;
	}

	var start = (pageNum - 1) * pageSize;
	var newData = newsListData.slice(start, start + pageSize);
	res.send(JSON.stringify(newData));
}

// Listen on port 3000. The banner is printed from the listen callback, so it
// appears only once the server is actually accepting connections.
// `.green` is supplied by the `colors` module.
app.listen(3000, function() {
	console.log("nodejs was start".green);
});


// Module dependencies. The jQuery source is read once at load time and later
// injected into every jsdom window created by get().
var http = require('http'),
	fs = require('fs'),
	jquery = fs.readFileSync("lib/jquery.min.js", "utf-8"),
	jsdom = require('jsdom');

/**
 * Download a page through the local HTTP proxy, load it into jsdom with
 * jQuery injected, and hand the resulting `$` to the caller.
 *
 * @param {string} url absolute URL to fetch; it is sent as the request path
 *        because the request goes to the proxy at 127.0.0.1:7070
 * @param {function(errors, $)} [callback] receives jsdom's errors value and the
 *        window's jQuery function (undefined when jsdom produced no window)
 *
 * NOTE: a failed HTTP request is only logged; the callback is not invoked in
 * that case.
 */
function get(url, callback) {
	// Go through the proxy
	var opt = {
		host: '127.0.0.1',
		port: '7070',
		method: 'get', // HTTP method used for the request
		path: url
	};
	// Collect the response body, then parse it
	var req = http.request(opt, function(res) {
		res.setEncoding('utf8');
		var html = '';
		res.on('data', function(d) {
			html += d;
		}).on('end', function() {
			jsdom.env({
				html: html,
				src: [jquery],
				done: function(errors, window) {
					try {
						callback && callback(errors, window && window.$);
					} finally {
						// Always release the window, even when the callback
						// throws; otherwise memory usage grows very quickly.
						if (window) {
							window.close();
						}
					}
				}
			});
		});
	}).on('error', function(e) {
		console.log("Got error: " + e.message);
	});
	req.end();
}

//导出list
exports.indexData = function(request, response, start, end) {
	get(‘http://www.news.cn/edu/index.htm‘, function(errors, $) {
		var $list = $(".list");
		var data = [];
		$list.each(function(index) {
			var $this = $(this),
				$li = $this.find("li"),
				li_data = [];
			$li.each(function(index) {
				var $a = $(this).find("a");
				var href=$a.attr("href").replace("http://news.xinhuanet.com/edu/","");
				var temp_href="/"+href.replace(/[^c]*/,"");
				var date=href.replace(temp_href,"").replace("/","-");
				if(date.length>10){
					date="2013-01-04";
				}

				li_data.push({
					title: $a.text(),
					href: $a.attr("href"),
					date:date,
					id: index
				});
			});
			data.push({
				id: index,
				list: li_data
			});
		});

		if (typeof start != "undefined" && typeof end == "undefined") {
			data = data.slice(start);
		} else if (typeof start != "undefined" && typeof end != "undefined") {
			data = data.slice(start, end);
		}
		response.send(JSON.stringify(data));
	});
}

//导出list
exports.list = function(request, response, start, end,callback) {
	get(‘http://www.news.cn/edu/index.htm‘, function(errors, $) {
		var $li = $(".list li");
		var data = [];
		$li.each(function(index) {
			var $a = $(this).find("a");
			var href=$a.attr("href").replace("http://news.xinhuanet.com/edu/","");
			var temp_href="/"+href.replace(/[^c]*/,"");
			var date=href.replace(temp_href,"").replace("/","-");
			if(date.length>10){
				date="2013-01-04";
			}
			
			data.push({
				id: index,
				title: $a.text(),
				attachment: $a.attr("href"),
				updateTime:date
			});
		});

		var newData=data;
		if (typeof start != "undefined" && typeof end == "undefined") {
			newData = data.slice(start);
		} else if (typeof start != "undefined" && typeof end != "undefined") {
			newData = data.slice(start, end);
		}
		if(typeof callback=="function"){
			newData=callback(newData,data.length);
		}
		//response.header("Access-Control-Allow-Origin", "*");   //设置跨域访问  
		response.send(JSON.stringify(newData));
	});
}

访问地址

http://localhost:3000/getIndexData
http://localhost:3000/getList/0,4
http://localhost:3000/getNewsList?pageNum=1&pageSize=10

over

nodejs 采集新闻数据

上一篇:php 不同类形的方法


下一篇:一个很好用的js倒计时