JS爬虫 利用axios和cheerio爬取好大夫病历并生成xlsx

var axios = require("axios")
var cheerio = require("cheerio")
var xlsx = require('node-xlsx');
var fs = require('fs');

// Pool of User-Agent strings rotated across requests to make the crawler
// look less uniform to the target site.
// Fix: the original list contained 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
// twice; the duplicate has been removed. Never reassigned, so declared const.
const userAgentPool = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
];
// --- Crawler state (shared by getUrls/getDatas/getItem) ---
let userAgentFlag = 0;   // rotating index into userAgentPool, advanced once per request
const urls = [];         // detail-page links harvested from the listing pages
let num = 1;             // phase 1: next listing-page number; phase 2: next index into urls
const data = [           // node-xlsx workbook: one sheet, header row first
    {
        name : 'sheet1',
        data : [
            [
                '',
                '疾病描述',
                '疾病',
                '病历概要'
            ]
        ]
    }
];

// Log before scheduling so the banner precedes any page output
// (the original scheduled the interval first, then logged).
console.log('开始爬取病症链接');
// One listing-page request per second; si1 is reused for phase 2 later.
let si1 = setInterval(getUrls, 1000);

/**
 * Phase 1 worker, driven by setInterval: fetch one listing page per tick
 * and collect every detail-page link found in its `.clearfix li` items.
 * After page 100 has been dispatched, stops the interval and starts
 * phase 2 via getDatas().
 *
 * Fixes vs. original:
 * - The page number is captured in a local before `num++`, so the success
 *   log reports the page that was actually fetched (the old `num - 1`
 *   read the shared counter at response time, which could be wrong).
 * - Added `.catch` so a network failure no longer produces an unhandled
 *   promise rejection; one bad page should not kill the crawl.
 */
function getUrls() {
    // Wrap the UA index so the pool is reused indefinitely.
    if (userAgentFlag === userAgentPool.length) userAgentFlag = 0;
    if (num > 100) {
        clearInterval(si1);
        console.log('病症链接爬取完成,爬取数量:' + urls.length);
        getDatas();
        return;
    }
    const page = num++; // snapshot before the async response arrives
    axios.get("https://zixun.haodf.com/dispatched/45001000.htm?p=" + page,
    {
        headers: {
            'User-Agent': userAgentPool[userAgentFlag]
        }
    }).then(resp => {
        const $ = cheerio.load(resp.data);
        const lis = $('.clearfix li');
        for (let i = 0; i < lis.length; i++) {
            // Look the href up once instead of twice.
            const href = lis.eq(i).find(".fl a").attr("href");
            if (href) {
                urls.push(href);
            }
        }
        console.log('已爬取第', page, '页', '总爬取数量:', 'urls:', urls.length, '该页末位链接:', urls[urls.length - 1]);
    }).catch(err => {
        console.error('第', page, '页抓取失败:', err.message);
    });
    userAgentFlag++;
}

/**
 * Phase 2 entry point: reset the shared counter so it now indexes into
 * `urls`, then schedule getItem to scrape one detail page every 100 ms.
 */
function getDatas() {
    console.log('开始爬取具体数据');
    num = 0;                         // repurpose: next index into urls
    si1 = setInterval(getItem, 100); // faster cadence than phase 1
}

/**
 * Phase 2 worker, driven by setInterval: fetch one detail page per tick,
 * extract the record fields with cheerio and append them as a worksheet
 * row. Once every URL has been dispatched AND every response recorded,
 * builds the workbook and writes it to ./res.xls.
 *
 * Fixes vs. original:
 * - `num` is now advanced synchronously (it was only incremented inside
 *   `.then`), so each 100 ms tick fetches the NEXT url instead of
 *   re-requesting the same one while its response is still in flight —
 *   the old code produced duplicate requests and duplicate rows.
 * - Added `.catch`: a failed URL is recorded as an empty row, so a single
 *   network error can no longer stall the crawl forever.
 * - The workbook is only written after all responses (one row per URL,
 *   plus the header) have been recorded, not merely dispatched.
 */
function getItem() {
    if (userAgentFlag === userAgentPool.length) userAgentFlag = 0;

    if (num >= urls.length) {
        // All requests dispatched; wait for the in-flight responses.
        // data[0].data holds the header plus one row per completed URL.
        if (data[0].data.length - 1 < urls.length) return;
        clearInterval(si1);

        var buffer = xlsx.build(data);
        // NOTE(review): xlsx.build produces .xlsx content but the file is
        // named res.xls — kept for compatibility, consider renaming.
        fs.writeFile('./res.xls', buffer, function (err)
        {
            if (err)
                throw err;
            console.log('Write to xls has finished');
        });

        return;
    }

    // Snapshot the index, then advance it so the next tick moves on.
    const index = num++;
    axios.get(urls[index],
    {
        headers: {
            'User-Agent': userAgentPool[userAgentFlag]
        }
    }).then(resp => {
        var $ = cheerio.load(resp.data);
        var section = $('.bccard section').eq(0).find('.info3-value p');

        data[0].data.push([
            index + 1, // 1-based row number, stable per URL
            section.eq(0).text().trim(),
            section.eq(2).text().trim(),
            $('.suggestions-content .suggestions-text-value').text().trim()
        ]);

        console.log('爬取数据:', data[0].data[data[0].data.length - 1]);
    }).catch(err => {
        // Record an empty row so the completion check still converges.
        data[0].data.push([index + 1, '', '', '']);
        console.error('抓取失败:', urls[index], err.message);
    });
    userAgentFlag++;
}

 

上一篇:cheerio常见用法学习笔记


下一篇:【重学Node.js 第4篇】实现一个简易爬虫&启动定时任务