In my Puppeteer Node.js app I need to read localStorage and cookies from a browser web page, but for some reason I'm getting the following error:
UnhandledPromiseRejectionWarning: Error: Execution context was destroyed, most likely because of a navigation.
What am I doing wrong or missing in my JS?
const dayjs = require('dayjs');
const AdvancedFormat = require('dayjs/plugin/advancedFormat');
dayjs.extend(AdvancedFormat);

const puppeteer = require('puppeteer');
const { config } = require('./config');
const helpers = require('./helpers');
const logs = require('./logs');

const runEmulation = async (body) => {
  logs.debug('starting emulation');

  // vars
  const argOptions = [], journey = [];

  // sandbox config
  if (config.puppeteer.run_in_sandbox === 'true') {
    argOptions.push('--no-sandbox');
  }

  // initiate a Puppeteer instance with options and launch
  const browser = await puppeteer.launch({
    args: argOptions,
    headless: (config.puppeteer.run_in_headless === 'true') ? true : false
  });

  // launch a new page
  const page = await browser.newPage()

  // initiate a new CDP session
  const client = await page.target().createCDPSession();
  await client.send('Network.enable');

  await client.on('Network.requestWillBeSent', async (e) => {
    // if not a document, skip
    if (e.type !== "Document") return;

    const scrapablePageData = async () => {
      function getLocalStorage () {
        const values = [];
        const keys = Object.keys(localStorage);
        let index = keys.length;
        while (index--) {
          values.push({
            key: keys[index],
            value: localStorage.getItem(keys[index])
          });
        }
        return values ? values : [];
      }

      return {
        localStorage: getLocalStorage()
      }
    }

    const scrapable = await page.evaluate(scrapablePageData);
    const cookies = await page.cookies();

    // the data we want to log
    journey.push({
      url: e.documentURL,
      type: e.redirectResponse ? e.redirectResponse.status : 'JS Redirection',
      storage: {
        cookies: cookies ?? [],
        local: scrapable.localStorage ?? []
      },
      duration_in_ms: 0,
      duration_in_sec: 0,
      loaded_at: dayjs().valueOf()
    })
  })

  // set userAgent and go to the URL
  await page.setUserAgent(body.userAgent);
  await page.goto(body.url);
  await page.waitForNavigation();

  console.log(journey)
}

exports.runEmulation = runEmulation
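For context on the error itself: page.evaluate here runs inside a Network.requestWillBeSent handler, which fires while the old page is being torn down, so the execution context it targets can be destroyed mid-call. One possible workaround is a rough sketch like the following (not the original code; it reuses the same page and journey variables, is untested, and simply tolerates races by skipping a snapshot when the context goes away):

page.on('framenavigated', async (frame) => {
  if (frame !== page.mainFrame()) return;
  try {
    const local = await page.evaluate(() =>
      Object.entries(localStorage).map(([key, value]) => ({ key, value }))
    );
    const cookies = await page.cookies();
    journey.push({ url: frame.url(), storage: { cookies, local } });
  } catch (err) {
    // the frame can navigate again mid-evaluate; skip this snapshot
  }
});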
I'm writing an API with Express, puppeteer-cluster and cheerio that returns all anchor elements containing one or more words that can be added as query parameters. I want to use Puppeteer in order to also get elements that are JavaScript-generated. But for some reason it's not working: I get an empty array printed in the browser.
I'm still trying to understand this library, but it has been two days and I've made no progress. Any help is deeply appreciated.
Update: I added async to all my functions and they run now, but the result is still empty :(
Update 2: I started logging everything, every step, and found that data.name is being passed to the cheerio function as a Promise. I think that is the problem, but I don't know how to fix it yet.
Update 3: One of the issues was that the page content (HTML code) was not being handed properly to the cheerio function. In the browser, however, the response is empty and the console shows an error:
Error handling response: TypeError: Cannot read properties of undefined (reading 'innerText').
So I think the response is not JSON formatted. Is res.json() not the right way to do it?
My code:
app.js
const PORT = process.env.PORT || 8000;
var path = require("path");
const express = require("express");
const cors = require("cors"); // missing from the original post; app.use(cors()) needs it

// Routes
const indexRouter = require("./routes/index");
const allNews = require("./routes/news");
const clusterRouter = require("./routes/cluster");

const app = express();

app.use(cors());
app.use(express.json());
app.use(express.urlencoded({ extended: false }));
app.use(express.static(path.join(__dirname, "public")));

app.use("/", indexRouter);
app.use("/news", allNews);
app.use("/cluster", clusterRouter);

app.listen(PORT, () => console.log(`server running on PORT ${PORT}`));
cluster.js
const express = require("express");
const { Cluster } = require("puppeteer-cluster");
const puppeteer = require("puppeteer-extra");
const cheerio = require("cheerio");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

var router = express.Router();

const newspapers = [
  {
    "name": "CNN",
    "address": "https://edition.cnn.com/specials/world/cnn-climate",
    "base": "https://edition.cnn.com"
  },
  {
    "name": "The Guardian",
    "address": "https://www.theguardian.com/environment/climate-crisis",
    "base": "https://www.theguardian.com"
  }
];

const app = express();
puppeteer.use(StealthPlugin());

const result = [];

router.get("/", async (req, res) => {
  (async () => {
    // Query String
    const query = checkForQuery(req);
    const wordsToSearch = query ? verifyQuery(query) : "";
    console.log("Running tests.."); // This is printed on console

    // Functions
    function checkForQuery(request) {
      if (request.originalUrl.indexOf("?") !== -1) {
        console.log(request.query);
        return request.query;
      } else {
        return false;
      }
    }

    // Validates query and removes invalid values
    function verifyQuery(queryString) {
      const queryParams = {
        only: queryString.only ? queryString.only : "",
        also: queryString.also ? queryString.also : "",
      };

      // Creates new list containing valid terms for search
      var newList = {
        only: [],
        also: [],
      };

      for (const [key, value] of Object.entries(queryParams)) {
        const tempId = key.toString();
        const tempVal =
          queryParams[tempId].length >= 2
            ? queryParams[tempId].split(",")
            : queryParams[tempId];
        console.log(queryParams[tempId], " and ", tempVal);
        if (tempVal.length > 1) {
          console.log("helloooooo");
          tempVal.forEach((term) => {
            if (topics.indexOf(term) != -1) { // `topics` is not shown in the post
              newList[tempId].push(term);
            }
          });
        } else {
          if (topics.indexOf(queryParams[tempId]) != -1) {
            newList[tempId].push(queryParams[tempId]);
          }
        }
      }
      console.log(newList);
      return newList;
    }

    function storeData(element, base, name) {
      const results = [];
      element.find("style").remove();
      const title = element.text();
      const urlRaw = element.attr("href");
      const url =
        urlRaw.includes("www") || urlRaw.includes("http")
          ? urlRaw
          : base + urlRaw;

      // Check for duplicated url
      if (tempUrls.indexOf(url) === -1) { // `tempUrls` and `exceptions` are not shown in the post
        // Check for social media links and skip
        if (!exceptions.some((el) => url.toLowerCase().includes(el))) {
          tempUrls.push(url);

          // Get img if child of anchor tag
          const imageElement = element.find("img");
          if (imageElement.length > 0) {
            // Get the src attribute of the image element
            results.push({
              title: title.replace(/(\r\n|\n|\r)/gm, ""),
              url,
              source: name,
              imgUrl: getImageFromElement(imageElement), // also not shown in the post
            });
          } else {
            results.push({
              title: title.replace(/(\r\n|\n|\r)/gm, ""),
              url: url,
              source: name,
            });
          }
        }
      }
      return results;
    }

    function getElementsCheerio(html, base, name, searchterms) {
      console.log(html, base, name);
      const $ = cheerio.load(html);
      console.log(searchterms);
      const concatInfo = [];

      if (searchterms) {
        const termsAlso = searchterms.also;
        const termsOnly = searchterms.only;
        termsAlso.forEach((term) => {
          $(`a:has(:contains("climate"):contains(${term}))`).each(function () {
            const tempData = storeData($(this), base, name);
            tempData.map((el) => concatInfo.push(el));
          });
        });
        termsOnly.forEach((term) => {
          // $(`a:has(:contains(${term}))`).each(function () {
          $(`a:contains(${term})`).each(function () {
            const tempData = storeData($(this), base, name);
            tempData.map((el) => concatInfo.push(el));
          });
        });
      } else {
        $('a:contains("climate")').each(function () {
          const tempData = storeData($(this), base, name);
          tempData.map((el) => concatInfo.push(el));
        });
      }
      return concatInfo;
    }

    const cluster = await Cluster.launch({
      concurrency: Cluster.CONCURRENCY_CONTEXT,
      maxConcurrency: 2,
      puppeteerOptions: {
        headless: true,
        args: ["--no-sandbox", "--disable-setuid-sandbox"],
        userDataDir: "./tmp",
        defaultViewport: false,
      },
    });

    await cluster.task(async ({ page, data }) => {
      await page.goto(data.address);
      await page.waitForSelector("body");
      // console.log here prints that data.name is a Promise :(
      const elements = await getElementsCheerio(
        document.body.innerHTML,
        data.base,
        data.name,
        wordsToSearch
      );
      result.push(elements);
    });

    newspapers.map((newspaper) => {
      console.log("queue" + newspaper); // This logs correctly: queue[object Object]
      cluster.queue(newspaper);
    });

    await cluster.idle();
    await cluster.close();

    // Display final object
    res.json(result);
  })();
});

module.exports = router;
I don't get any errors, but on screen I get an empty [ ]. Can anyone see what I am doing wrong here? :(
In general, it's an antipattern to mix Puppeteer with another selection library like Cheerio. In addition to being redundant, the extra HTML parser doesn't work on the live document the way Puppeteer does: you have to snapshot the HTML at a particular moment with Puppeteer to capture it as a string, then plug that string into Cheerio, where it's re-parsed back into a traversable tree structure.
Introducing this extra step creates opportunity for bugs and confusion to creep in, and that's what happened here.
The code
const elements = await getElementsCheerio(
  document.body.innerHTML,
  data.base,
  data.name,
  wordsToSearch
);
is problematic. document.body.innerHTML doesn't refer to anything related to Puppeteer. Instead, use Puppeteer's await page.content() to snapshot the HTML.
As a minor point, there's no need for Cheerio functions to be async, because they never use await. It's a fully synchronous API.
Here's a minimal setup for using Cheerio with Puppeteer, assuming you accept the terms and conditions and are sure that introducing this usually unnecessary layer of indirection is appropriate for your use case:
const cheerio = require("cheerio"); // 1.0.0-rc.12
const puppeteer = require("puppeteer"); // ^19.0.0

let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  const url = "https://www.example.com";
  await page.goto(url, {waitUntil: "domcontentloaded"});
  const html = await page.content();
  const $ = cheerio.load(html);

  // do cheerio stuff synchronously
  console.log($("h1").text()); // => Example Domain
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
It's basically the same for puppeteer-cluster: just drop the lines starting with const html = await page.content(); into the cluster.task callback that operates on page, as sketched below.
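A sketch of that combination (assuming the question's cluster setup, the cheerio import, and newspaper objects with address/base/name fields):

await cluster.task(async ({ page, data }) => {
  await page.goto(data.address, { waitUntil: "domcontentloaded" });
  const html = await page.content(); // snapshot the live DOM as a string
  const $ = cheerio.load(html);      // re-parse it synchronously with cheerio
  $('a:contains("climate")').each(function () {
    result.push({ title: $(this).text().trim(), source: data.name });
  });
});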
Electron is not opening Spotify in desktop mode, as you can see in the screenshots below.
Here is the code:
const {BrowserWindow, app} = require("electron");
const pie = require("puppeteer-in-electron")
const puppeteer = require("puppeteer-core");
const fs = require("fs");
const path = require("path");

const main = async () => {
  const cookiesPath = path.join(__dirname, "cookies/open.spotify.com.cookies.json");
  const cookies = JSON.parse(await fs.readFileSync(cookiesPath, 'utf8'));
  await pie.initialize(app);
  const browser = await pie.connect(app, puppeteer);

  const window = new BrowserWindow();
  const url = "https://example.com/";
  await window.loadURL(url);

  const page = await pie.getPage(browser, window);
  await page.goto("https://open.spotify.com");

  for (const cookie of cookies) {
    if (cookie.name !== 'ig_lang') {
      await page.setCookie(cookie);
    }
  }
  await page.reload();
};

main();
main();
Note: I'm using puppeteer-in-electron so that I can automate web processes inside Electron. That's not the cause, though, because the issue persists even when I use Electron normally, without Puppeteer.
This is how it should've been: https://cdn.discordapp.com/attachments/1026704902925324410/1026710664611377202/unknown.png
This is how it is: https://cdn.discordapp.com/attachments/1026704902925324410/1026704903055343626/Screenshot_42.png
Hope I've explained it well.
Thanks
Install the package https://github.com/castlabs/electron-releases#v20.0.0+wvcus like this:
npm install "https://github.com/castlabs/electron-releases#v20.0.0+wvcus" --save-dev
Also import components:
const { BrowserWindow, app, components } = require("electron");
To open the site in desktop mode, just add a userAgent:
window.loadURL(url, {
  userAgent: "Chrome/105.0.0.0",
});
and create the BrowserWindow after app and components are ready:
app.whenReady().then(async () => {
  await components.whenReady();
  main();
});
Full code:
const { BrowserWindow, app, components } = require("electron");
const pie = require("puppeteer-in-electron");
const puppeteer = require("puppeteer-core");
const fs = require("fs");
const path = require("path");

pie.initialize(app);

const main = async () => {
  const cookiesPath = path.join(
    __dirname,
    "cookies/open.spotify.com.cookies.json",
  );
  const cookies = JSON.parse(await fs.readFileSync(cookiesPath, "utf8"));
  const browser = await pie.connect(app, puppeteer);

  const window = new BrowserWindow();
  const url = "https://example.com/";
  await window.loadURL(url, {
    userAgent: "Chrome/105.0.0.0",
  });

  const page = await pie.getPage(browser, window);
  await page.goto("https://open.spotify.com");

  for (const cookie of cookies) {
    if (cookie.name !== "ig_lang") {
      await page.setCookie(cookie);
    }
  }
  await page.reload();
};

app.whenReady().then(async () => {
  await components.whenReady();
  main();
});
Short version of the code:
const { BrowserWindow, app, components } = require("electron");

const main = () => {
  const window = new BrowserWindow();
  const url = "https://open.spotify.com";
  window.loadURL(url, {
    userAgent: "Chrome/105.0.0.0",
  });
};

app.whenReady().then(async () => {
  await components.whenReady();
  main();
});
I use Puppeteer to get data about stores. I search using the p.shop-page-content__text_large and span.shop-list-item__address selectors, but I ran into the problem that only one of them may be present on a given page. I tried to solve the problem in the following way, but it does not work. How can this be fixed?
const puppeteer = require('puppeteer');

const browser = await puppeteer.launch({
  headless: false,
  slowMo: 150,
});

const cities = [{'CITY': 'Town1', 'LINK': '/shops/town1/'}, {'CITY': 'Town2', 'LINK': '/shops/town2/'}];

async function getData(page, selector) {
  return await page.$$eval(selector, info => info.map((data) => {
    let str = data.textContent.trim(),
      from = str.search(','),
      to = str.length;
    return {
      'COUNTRY': 'unknow',
      'STREET' : str.substring(from, to)
    }
  }));
}

const result = [];
for (let val of cities) {
  console.log(val.LINK, val.CITY);
  const page = await browser.newPage();
  await page.goto('https://www.example-site.ru' + val.LINK);
  data = await page.waitForFunction('.shop-page-content').then(async() => {
    console.log('ok');
    return await getData(page, 'p.shop-page-content__text_large');
  }).catch(async (e) => {
    console.log('fail');
    await page.waitForSelector('.shops-info__section');
    return await getData(page, 'span.shop-list-item__address');
    // result.push(data);
  });
  result.push(data);
  await browser.close();
}
console.log(result);
It turned out like this:
const browser = await puppeteer.launch({
  headless: false,
  slowMo: 150,
});

const cities = [{'CITY': 'Town1', 'LINK': '/shops/town1/'}, {'CITY': 'Town2', 'LINK': '/shops/town2/'}];
const page = await browser.newPage();
const result = [];

for (let val of cities) {
  await page.goto('https://www.example-site.ru' + val.LINK);
  const list = await page.evaluate(() => {
    const data = [];
    const elements = document.querySelectorAll('p.shop-page-content__text_large').length
      ? document.querySelectorAll('p.shop-page-content__text_large')
      : document.querySelectorAll('span.shop-list-item__address');
    for (const element of elements) {
      data.push(element.innerText);
    }
    return data;
  });
  result.push({
    link: val.LINK,
    city: val.CITY,
    list
  })
}

await browser.close();
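An alternative sketch, in case the page needs time to render before scraping: both selectors can be combined into one CSS selector list, so a single waitForSelector resolves when either one appears (selector names taken from the question):

const either = 'p.shop-page-content__text_large, span.shop-list-item__address';
await page.waitForSelector(either);
const list = await page.$$eval(either, els => els.map(el => el.innerText.trim()));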
I'm trying to run this script, which visits a site with different IPs through a proxy pool, but after a few visits the CPU rises to 100% and it slows down more and more. Could you help me optimize it?
I should say up front that I am not a programmer, and I thank anyone who can help me solve this problem.
const express = require('express');
const app = express();
const port = process.env.PORT || 8080;
const validUrl = require('valid-url');

const parseUrl = function (url) {
  url = decodeURIComponent(url)
  if (!/^(?:f|ht)tps?:\/\//.test(url)) {
    url = 'https://' + url;
  }
  return url;
};

const getRandomDevice = () => {
  const puppeteer = require('puppeteer');
  const devices = Object.entries(puppeteer.devices)
  return devices[Math.floor(Math.random() * devices.length)][1]
}

app.get('/', function (req, res) {
  // const url = parseUrl(req.query.url);
  const url = 'https://www.example.com';
  const tries = req.query.tries || 100000;
  if (validUrl.isWebUri(url)) {
    console.log('Handling: ' + url);
    (async () => {
      const puppeteer = require('puppeteer');
      const browser = await puppeteer.launch({
        headless: true,
        // userDataDir: './myUserDataDir',
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--proxy-server=EXAMPLE-POOL-PROXY:13012'
        ]
      });
      let [page] = await browser.pages();
      for (let i = 0; i < tries; i++) {
        // enable request interception
        await page.setRequestInterception(true);
        const device = getRandomDevice()
        await page.setUserAgent(device.userAgent);
        await page.setViewport(device.viewport)
        // add header for the navigation requests
        page.on('request', request => {
          // Add a new header for navigation request.
          const headers = request.headers();
          headers['User-Agent'] = device.userAgent;
          headers['user-agent'] = device.userAgent;
          request.continue({headers});
        });
        await page.goto(url, {waitUntil: 'networkidle2', timeout: 1500000});
        try {
          //console.log(page);
        } catch (error) {
          console.error(error)
        } finally {
          // console.log(urls);
          // console.log(await page._client.send('Network.getAllCookies'));
          // await page.screenshot().then(function (buffer) {
          //   res.setHeader('Content-Disposition', 'attachment;filename="' + url + '.png"');
          //   res.setHeader('Content-Type', 'image/png');
          //   res.send(buffer)
          // });
          await page.screenshot({path: 'screenshot-' + i + '.png', fullPage: true})
          // If everything correct then no 'HeadlessChrome' sub string on userAgent
          console.log(await page.evaluate(() => navigator.userAgent));
          page = await browser.newPage();
        }
      }
      setTimeout(async () => {
        await browser.close();
      }, 500);
    })();
  } else {
    res.send('Invalid url: ' + url);
  }
});

app.listen(port, function () {
  console.log('App listening on port ' + port)
})
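Two things in the loop above stand out as likely causes: page.on('request', ...) registers an additional listener on every iteration, so each request gets continue()d repeatedly, and every pass opens another page via browser.newPage() without closing the old one, so pages accumulate. A hedged sketch of the hot path with the setup hoisted out of the loop and a single page reused (untested against the actual proxy pool):

let [page] = await browser.pages();
await page.setRequestInterception(true); // once, not per iteration
let currentUserAgent = '';
page.on('request', request => {          // one listener for the page's lifetime
  const headers = { ...request.headers(), 'user-agent': currentUserAgent };
  request.continue({ headers }).catch(() => {});
});
for (let i = 0; i < tries; i++) {
  const device = getRandomDevice();
  currentUserAgent = device.userAgent;
  await page.setUserAgent(device.userAgent);
  await page.setViewport(device.viewport);
  await page.goto(url, { waitUntil: 'networkidle2', timeout: 1500000 });
  await page.screenshot({ path: 'screenshot-' + i + '.png', fullPage: true });
  // reuse the same page instead of leaking a new one each pass
}
await browser.close();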
This is the module that collects and exports async data: scraper.js
const express = require('express')
const cheerio = require('cheerio')
const request = require("tinyreq")
const fs = require('fs')
const _ = require('lodash')
const uuid = require('uuid/v4')
const async = require('async')

const mental_models = {
  url: 'https://www.farnamstreetblog.com/mental-models/',
  data: {}
}

const decision_making = {
  url: 'https://www.farnamstreetblog.com/smart-decisions/',
  data: {}
}

const cognitive_bias = {
  url: 'https://betterhumans.coach.me/cognitive-bias-cheat-sheet-55a472476b18',
  data: {}
}

const DATA_URLS = [mental_models, decision_making, cognitive_bias]

const filterScrape = async (source, params) => {
  let filtered_data = {
    topics: [],
    content: [],
    additional_content: []
  }
  let response = await scrape(source)
  try {
    let $ = cheerio.load(response)
    params.forEach((elem) => {
      let headers = ['h1', 'h2', 'h3']
      if ($(elem) && headers.includes(elem)) {
        let topic = {}
        let content = {}
        let id = uuid()
        topic.id = id
        topic.text = $(elem).text()
        if ($(elem).closest('p')) {
          content.text = $(elem).closest('p').text()
          content.id = id
        }
        filtered_data.topics.push(topic)
        filtered_data.content.push(content)
      } else if ($(elem) && !headers.includes(elem)) {
        let content = {}
        let id = uuid()
        content.text = $(elem).text()
        content.id = id
        filtered_data.additional_content.push(content)
      } else {
      }
    })
  }
  catch (err) {
    console.log(err)
  }
  return filtered_data
}

const scrape = (source) => {
  return new Promise((resolve, reject) => {
    request(source.url, function (err, body) {
      if (err) {
        reject(err)
        return
      }
      resolve(body)
    })
  })
}

const DATA = _.map(DATA_URLS, async (source) => {
  let params = ['h1', 'h2', 'h3', 'p']
  let new_data = await filterScrape(source, params)
  try {
    source.data = new_data
  }
  catch (err) {
    console.log(err)
  }
})

module.exports = DATA
This is the module that imports the data: neural.js
const brain = require('brain')
const neural_net = new brain.NeuralNetwork()
const DATA = require('./scraper')
console.log(DATA)
Obviously there's not much going on; I've removed the rest of the code since the variable doesn't resolve. When logged, it logs a promise, but the promise does not resolve. In the imported module, however, the promise is logged and then resolves. What gives? Should I import a function that resolves the data?
Of course it would be best to import that function; however, that won't change the issue in your code, which is here:
const DATA = _.map(DATA_URLS, async (source) => {
Lodash doesn't support async iteration, so you need some other method. One option would be to use the newest Node.js version (10.x) and make use of async iteration, but that won't use the full power of asynchronous code.
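Another dependency-free option is to export an async function that resolves everything with Promise.all (a sketch reusing the question's filterScrape and DATA_URLS; note it runs all requests in parallel with no concurrency cap):

module.exports = async () =>
  Promise.all(DATA_URLS.map(async (source) => {
    const params = ['h1', 'h2', 'h3', 'p'];
    const data = await filterScrape(source, params);
    return { url: source.url, data };
  }));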
You can also use scramjet - a framework my company is supporting. The code above would take the following form:
const {DataStream} = require("scramjet");

const DATA_URLS = [mental_models, decision_making, cognitive_bias];

module.exports = async () => DataStream.fromArray(DATA_URLS)
  .setOptions({maxParallel: 2}) // if you need to limit that at all
  .map(async (source) => {      // destructuring only {url} here would leave `source` undefined below
    let params = ['h1', 'h2', 'h3', 'p'];
    let data = await filterScrape(source, params);
    return { url: source.url, data };
  })
  .toArray();
The other file would take the following form:
const brain = require('brain');
const neural_net = new brain.NeuralNetwork();
const scraper = require('./scraper'); // the semicolon matters, or the IIFE below is parsed as a call to require(...)

(async () => {
  const DATA = await scraper();
  console.log(DATA); // or do whatever else you were expecting...
})();