作者:由来只有新人笑_谁能记得旧人哭 | 来源:互联网 | 2023-10-11 11:03
Puppeteer puppeteer官网 一、puppeteer是干什么的?引用puppeteer官网解释:Most things that you can do manually in the browser can be done using Puppeteer!
Puppeteer
puppeteer官网
一、puppeteer是干什么的?
引用puppeteer官网解释: Most things that you can do manually in the browser can be done using Puppeteer!
- 生成页面的屏幕截图和PDF。
- 抓取SPA并生成预渲染内容(即“SSR”)。
- 自动表单提交,UI测试,键盘输入等。
- 创建最新的自动化测试环境。 使用最新的JavaScript和浏览器功能,直接在最新版本的Chrome中运行测试。
- 捕获您网站的时间线跟踪(timeline trace)。
二、常用API
更多API
三、举个栗子:截取屏幕
3.1 代码
const puppeteer = require('puppeteer');
// Directory (exported by ./config/default.js) where the captured PNG is saved.
const { screenshot } = require('./config/default.js');

// Open the Baidu home page in a Puppeteer-controlled browser and save a
// timestamp-named PNG screenshot of it into the configured directory.
(async () => {
  // Get the browser instance.
  const browser = await puppeteer.launch();
  try {
    // Get a browser tab (page) instance.
    const page = await browser.newPage();
    // Navigate to the Baidu home page.
    await page.goto('https://www.baidu.com');
    // Take the screenshot; the file is named after the current timestamp.
    await page.screenshot({
      path: `${screenshot}/${Date.now()}.png`
    });
  } finally {
    // Close the browser even when navigation or the screenshot fails, so the
    // browser process is not left running.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
3.2 然后执行命令
node src/screenshot.js
3.3 最后在screenshot配置指定的路径下生成百度首页的截屏。
四、爬取百度图片列表
4.1 实现思路
- 模拟用户打开浏览器
- 模拟打开tab页
- 模拟前往百度图片页面
- 模拟focus到输入框,输入查询值, 点击查询按钮
- 抓取图片
- 通过writeFile,将图片下载到指定路径下。
4.2 目录结构
.
|-mn
|-src
| |-config
| | |-default.js
| |-helper
| | |-srcToImg.js
| |-mn.js
|-package.json
4.3 mn.js 主文件
const puppeteer = require('puppeteer');
const { mn } = require('./config/default');
const srcToImg = require('./helper/srcToImg');

// Search Baidu Images for a keyword, collect the `src` of every result image
// on the first result page and save each one into the `mn` directory.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://image.baidu.com');
    console.log('go to https://image.baidu.com');

    await page.setViewport({
      width: 1920,
      height: 1080
    });
    console.log('reset viewport');

    await page.focus('#kw');
    await page.keyboard.sendCharacter('狗');
    // Begin waiting for the navigation BEFORE clicking: a 'load' listener
    // attached after the click can miss the event if the page loads first.
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'load' }),
      page.click('.s_search')
    ]);
    console.log('go to search list');

    console.log('page loading done, start fetch ...');
    // Runs inside the page: gather the src attribute of every result image.
    const srcs = await page.evaluate(() => {
      const images = document.querySelectorAll('img.main_img');
      return Array.prototype.map.call(images, img => img.src);
    });
    console.log(`get ${srcs.length} image, start download`);

    // Wait for every download to settle before leaving the try block.
    // (`forEach` with an async callback would return immediately.) A single
    // failed image is logged and skipped so the others still complete.
    await Promise.all(srcs.map(src => srcToImg(src, mn).catch((err) => {
      console.error(`failed to save ${String(src).slice(0, 80)}: ${err.message}`);
    })));
  } finally {
    // Close the browser whether or not the crawl succeeded.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
4.4 default.js 路径
const path = require('path');
module.exports = {
screenshot: path.resolve(__dirname, '../../screenshot'),
mn: path.resolve(__dirname, '../../mn')
}
4.5 srcToImg.js 解析图片地址
const http = require('http');
const https = require('https');
const fs = require('fs');
const path = require('path');
const { promisify } = require('util');
const writeFile = promisify(fs.writeFile);
module.exports = async(src, dir) => {
if(/\.(jpg|png|gif)$/.test(src)) {
await urlToImg(src, dir);
}else {
await base64ToImg(src, dir);
}
}
/**
 * Download an image whose src is an http or https URL and store it in `dir`.
 *
 * The file keeps the URL's extension and is named after the current timestamp
 * plus a short random suffix, so that downloads started within the same
 * millisecond do not overwrite each other.
 *
 * @param {string} url - Image URL (http or https).
 * @param {string} dir - Target directory.
 * @returns {Promise<void>} Resolves once the file has been fully written;
 *   rejects on a request error, a non-2xx response or a write error.
 */
const urlToImg = promisify((url, dir, callback) => {
  const mod = /^https:/.test(url) ? https : http;
  const ext = path.extname(url);
  const suffix = Math.random().toString(36).slice(2, 8);
  const file = path.join(dir, `${Date.now()}-${suffix}${ext}`);

  // Several failure paths can fire for one request; report only the first
  // outcome so the promisified callback is never invoked twice.
  let settled = false;
  const done = (err) => {
    if (settled) {
      return;
    }
    settled = true;
    callback(err);
  };

  mod.get(url, (res) => {
    if (res.statusCode < 200 || res.statusCode >= 300) {
      // Drain the body so the socket is released, and do not save an error
      // page under an image file name.
      res.resume();
      done(new Error(`Request failed with status ${res.statusCode}: ${url}`));
      return;
    }
    const out = fs.createWriteStream(file);
    res.on('error', done);
    out.on('error', done);
    out.on('finish', () => {
      console.log(file);
      done();
    });
    res.pipe(out);
  }).on('error', done); // Without this listener a failed request throws an uncaught exception.
});
/**
 * Decode an image whose src is a base64 data URI and store it in `dir`.
 *
 * Expected form: data:image/jpeg;base64,<payload>. The file extension is the
 * MIME subtype ("jpeg" becomes "jpg"). The file is named after the current
 * timestamp plus a short random suffix, so that images saved within the same
 * millisecond do not overwrite each other.
 *
 * Best effort: an input that is not a base64 data URI, or a failed write, is
 * logged and does not reject.
 *
 * @param {string} base64Str - Data URI taken from an <img> src attribute.
 * @param {string} dir - Target directory.
 * @returns {Promise<void>}
 */
const base64ToImg = async (base64Str, dir) => {
  // Group 1 is the MIME type, group 2 the base64 payload.
  const matches = typeof base64Str === 'string'
    ? base64Str.match(/^data:(.+?);base64,(.+)$/)
    : null;
  const subtype = matches ? matches[1].split('/')[1] : undefined;
  if (!subtype) {
    console.log('不法 base64 字符串');
    return;
  }

  const ext = subtype.replace('jpeg', 'jpg');
  const suffix = Math.random().toString(36).slice(2, 8);
  const file = path.join(dir, `${Date.now()}-${suffix}.${ext}`);
  try {
    await writeFile(file, matches[2], 'base64');
    console.log(file);
  } catch (ex) {
    // Keep the best-effort contract, but say what actually went wrong rather
    // than blaming the input string.
    console.error(`failed to write ${file}: ${ex.message}`);
  }
};
4.6 最终在mn文件夹中存入爬取到的图片。
go to https://image.baidu.com
reset viewport
go to search list
page loading done, start fetch ...
get 46 image, start download
不法 base64 字符串
不法 base64 字符串
不法 base64 字符串
不法 base64 字符串
不法 base64 字符串
不法 base64 字符串
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351397.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351396.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351398.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351400.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351405.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351386.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351399.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351405.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351405.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351402.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351412.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351413.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351403.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351398.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351399.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351403.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351406.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351401.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351408.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351404.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351414.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351400.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351402.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351413.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351408.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351414.jpg
/Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351413.jpg
......
4.7 mn文件夹下