Puppeteer是一个Node库,提供了一组用来操纵Chrome的API(默认以headless模式运行,也就是无UI的Chrome,也可以配置为有UI)。
有点类似于PhantomJS,但Puppeteer是Chrome官方团队进行维护的,前景更好。
使用Puppeteer,相当于同时具有Linux和Chrome的能力,应用场景会非常多。就爬虫领域来说,远比一般的爬虫工具功能更丰富,性能分析、自动化测试也不在话下。
简单用法
// Dependencies
const puppeteer = require('puppeteer')
const request = require('request')
const { createWriteStream, mkdirSync } = require('fs')
const { dirname, extname } = require('path')

// Directory that downloaded images are written to.
const IMAGE_DIR = './images'

// Extension used when the image URL's path does not carry one.
const DEFAULT_EXTENSION = '.jpg'

/**
 * Searches image.baidu.com for `val`, visits every result link found on the
 * first results page and downloads the image shown on each detail page.
 *
 * The browser is always closed, even when a step fails, so that no Chrome
 * process is left behind.
 *
 * @param {string} val - Search keyword typed into the search box.
 * @returns {Promise<void>} Resolves once every result has been processed.
 */
async function run(val) {
  const browser = await puppeteer.launch({ headless: false })
  console.log('打开浏览器')
  try {
    const page = await browser.newPage()
    console.log('新建页面')
    await page.goto('http://image.baidu.com/')
    console.log('进入一个网站')
    await page.waitForSelector('html')
    console.log('等待元素加载')
    await page.type('#kw', val)
    console.log('输入成功')
    await page.click('#homeSearchForm > span.s_search')
    console.log('点击搜索')
    await page.waitForSelector('.imgbox > a')
    console.log('等待加载')
    const urls = await page.$$eval('.imgbox > a', as => as.map(a => a.href))
    console.log('1')
    // Sequential on purpose: a single page object is reused for every result.
    for (const [index, url] of urls.entries()) {
      await page.goto(url)
      console.log('进入图片页面')
      await downloadImg(page, index)
    }
  } finally {
    await browser.close()
  }
}

/**
 * Derives a file extension (including the leading dot) from an image URL.
 * Only the URL's path is inspected, so query strings and fragments do not
 * end up in the file name.
 *
 * @param {string} src - Image URL.
 * @returns {string} The extension, or DEFAULT_EXTENSION when none is found.
 */
function imageExtension(src) {
  let extension = ''
  try {
    extension = extname(new URL(src).pathname)
  } catch (err) {
    // `src` is not a parseable absolute URL; fall back to the default below.
  }
  return extension || DEFAULT_EXTENSION
}

/**
 * Waits for the image on the current detail page and saves it to
 * `${IMAGE_DIR}/<index><extension>`.
 *
 * @param {object} page - Puppeteer page already navigated to a detail page.
 * @param {number} index - Position of the result; used as the file name.
 * @returns {Promise<void>} Resolves when the file has been written.
 */
async function downloadImg(page, index) {
  await page.waitForSelector('.currentImg')
  console.log('页面加载完毕')
  const src = await page.evaluate(() => document.querySelector('.currentImg').src)
  console.log('图片链接:' + src)
  await download(src, `${IMAGE_DIR}/${index}${imageExtension(src)}`)
  console.log('下载完成')
}

/**
 * Streams the resource at `path` into the local file `name`, creating the
 * target directory first if it does not exist.
 *
 * @param {string} path - URL of the resource to download.
 * @param {string} name - Destination file path.
 * @returns {Promise<void>} Resolves when the file is fully written; rejects on
 *   a directory-creation, request or write error.
 */
function download(path, name) {
  return new Promise((resolve, reject) => {
    // A throw here is turned into a rejection by the Promise constructor.
    mkdirSync(dirname(name), { recursive: true })
    const ws = createWriteStream(name)
    ws.on('finish', () => resolve())
    ws.on('error', reject)
    request({
      url: path,
      headers: {
        // NOTE(review): this is a Referrer-Policy value rather than a URL;
        // confirm whether the server expects a real Referer here.
        'Referer': 'no-referrer-when-downgrade',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
      }
    })
      .on('error', err => {
        // pipe() does not forward source errors, so without this handler the
        // promise would never settle on a network failure.
        ws.destroy()
        reject(err)
      })
      .pipe(ws)
  })
}

run('壁纸').catch(err => {
  console.error(err)
  process.exitCode = 1
})