I have the following function that clicks a link with a rel="nofollow" attribute, waits for the new tab to open, and then loads its content. Occasionally, though, the page never loads, and instead of moving on to the next page the script hangs.
How can I put a timeout in here?
let clickAndWaitForTarget = async (clickSelector, page, browser) => {
  const pageTarget = page.target();
  await page.click(clickSelector);
  const newTarget = await browser.waitForTarget(
    target => target.opener() === pageTarget
  );
  const newPage = await newTarget.page(); // get the page object
  await page.waitFor(10000);
  return newPage;
};
I presume that if I can get it to time out, I can return false or something that the main code can check for success or failure?
newpage = await clickAndWaitForTarget("a.someSelector", page, browser);
if (newpage) {
  var url = await newpage.url();
  ...
Thanks in advance!
edit: I'm using Puppeteer version 2.0.0
waitForTarget has a timeout option. The default is 30 seconds; maybe that's too long for you.
You could do something like this:
let clickAndWaitForTarget = async (clickSelector, page, browser) => {
  const pageTarget = page.target();
  await page.click(clickSelector);
  try {
    const newTarget = await browser.waitForTarget(
      target => target.opener() === pageTarget,
      { timeout: 3000 } /* 3 seconds instead */
    );
    const newPage = await newTarget.page(); // get the page object
    await page.waitFor(10000);
    return newPage;
  } catch {
    return null;
  }
};
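In the calling code you can then treat null as the failure case; a minimal sketch based on the snippet from the question:

let newpage = await clickAndWaitForTarget("a.someSelector", page, browser);
if (newpage) {
  let url = newpage.url(); // url() is synchronous, no await needed
  // ... work with the new tab
} else {
  // the target never appeared within the timeout; skip to the next page
}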
I have no idea why the for-loop runs only once.
My intention is for the code to run through all the elements in foodGroupIdsOnPage1, but right now it only processes the first element.
Could anyone please explain? Thank you for your help in advance.
async function initialize() {
  const browser = await playwright.chromium.launch({
    headless: false
  });
  const context = await browser.newContext(); // create a new browser context, which means no cookies or cache saved
  const tab1 = await context.newPage();
  return { tab1, context };
}
async function GotoPage2() { // wait-for function >>> only run the next command after all the commands inside the next bracket have run
  const page1_foodGroupButton = id.querySelector('a') // beginning of the for loop
  await page1_foodGroupButton.click();
};
async function main() {
  const { tab1, context } = await initialize();
  await tab1.goto('https://www.cfs.gov.hk/tc_chi/nutrient/search1.php');
  const foodGroupIdsOnPage1 = await tab1.evaluate(async function getFoodGroupsOnPage1() {
    return [...document.querySelector('.tableResponsive').querySelectorAll('td ')].map(e => e.id);
  })
  for (let id of foodGroupIdsOnPage1) {
    await tab1.evaluate(id => {
      const page1_foodGroupButton = document.querySelector('[id=' + `"${id}"` + ']').querySelector('a') // beginning of the for loop
      page1_foodGroupButton.click();
    }, id);
    await tab1.waitForTimeout(2000);
    await tab1.click('[id^=grp] > a');
    await tab1.waitForTimeout(2000);
    const ArrayOfTabs = context.pages(); // get how many tabs chromium has
    let tab2 = ArrayOfTabs[1]; // make the active tab the second tab
    await tab2.evaluate(async function extractFoodGroupData() {
      let tableOfAllFoods = [];
      let rowsOnPage3 = document.querySelector(".colorTable2").querySelectorAll("tr");
      for (let row_OnPage3 of rowsOnPage3) {
        let arrayNutritionOfOneFood = [];
        let cellsInOneRow = row_OnPage3.querySelectorAll("td");
        for (let cell of cellsInOneRow) {
          arrayNutritionOfOneFood.push(cell.innerText);
        }
        tableOfAllFoods.push(arrayNutritionOfOneFood);
      }
    });
    tab2.close();
    tab1.goBack();
    return;
  }
}
main();
I found the solution.
The first problem is that the return statement terminates the iteration, so delete it.
The second problem is that the second iteration starts before tab1.goBack() finishes.
Simply add await before tab1.goBack(); to fix that.
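With both fixes applied, the tail of each loop iteration looks like this (a minimal sketch; awaiting tab2.close() as well is an extra precaution, not part of the original fix):

    await tab2.close();  // optionally await the close too
    await tab1.goBack(); // wait for the navigation before the next iteration
    // the `return` is removed, so the loop moves on to the next id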
I have a Puppeteer problem. I have an array of links:
let links = ["...", "...", "..."];
That's about 30 links. I wanted to scrape them all at once with Promise.all():
let descriptions = await Promise.all(
  links.map((url) => getDescription(page, url))
);
Every page has a description, but for some reason my descriptions array is populated with the same description 30 times.
Why is that? Do I need to scrape them one by one instead?
The description function:
export async function getDescription(page, url) {
  await page.goto(url);
  let selector = ".C4VMK > span";
  return page.$eval(selector, (handle) => handle.textContent);
}
I managed it like this: I create a new tab per URL and open the page there. (The original version ran all 30 goto calls against the same page object, so they raced each other and every $eval read whichever navigation finished last.)
export async function getDescription(browser, url) {
  let page = await browser.newPage();
  await page.goto(url);
  let selector = ".C4VMK > span";
  let result = await page.$eval(selector, (handle) => handle.textContent);
  await page.close(); // close() returns a promise, so await it
  return result;
}
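Since the function now takes the browser instead of a shared page, the Promise.all call site changes accordingly (a sketch):

let descriptions = await Promise.all(
  links.map((url) => getDescription(browser, url))
);

Be aware this opens around 30 tabs at once; if that turns out to be too heavy, scraping in smaller batches is an option.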
I'm building a Discord bot and I have an infinite loop with setInterval every 10 s, but on every cycle it gives me the data from every previous cycle as well. I'd like to know how to get only the new data from the latest cycle, not all of it.
const puppeteer = require('puppeteer');
const Discord = require('discord.js');
const client = new Discord.Client();
const url = 'url to scrape';
var clocks = [];
(async () => {
  const URL = url
  const browser = await puppeteer.launch()
  const page = await browser.newPage()
  await page.goto(URL, { 'waitUntil': 'networkidle2' });
  setInterval(async () => {
    let clock = await page.evaluate(() => {
      var a = document.getElementById("task-listing-datatable").getAttribute("data-tasks");
      var ar = eval(a);
      var keyword = ['asdad', 'asdakdada', 'mama', 'Duplicate Fashion Product Identification Task'];
      for (let i = 0; i < ar.length; i++) {
        for (let j = 0; j < keyword.length; j++) {
          if (ar[i][1] === keyword[j]) {
            let job = (`${ar[i][1]}`);
            return (`${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`);
          }
        }
      }
    });
    console.log(`==== first login ====`)
    console.log(`==================`)
    if (!clocks.includes(clock)) {
      client.on('message', (message) => {
        if (message.author.bot === false) {
          message.channel.send(clock);
        }
      });
      clocks.push(clock);
      // Save the clock so you will remember it next time.
    }
    await page.reload();
  }, 8000)
})()
client.login('discordjs token');
This is how the messages are shown:
[screenshot of the bot's messages]
As you can see, now it's giving each change, not all the data of each cycle:
[screenshot]
Every time your setInterval callback runs, it loads the page fresh, gathers information into clock, and sends it via Discord. The problem is that it does not know what it has already sent you, so you'll get some of the same data every time.
The solution is to save the data it finds, and then only create a Discord message if the current batch of data is different from all of the previous data.
So you want some kind of data store:
var clocks = [];
(async () => {
  setInterval(async () => {
    const URL = url
    const browser = await puppeteer.launch()
    // ...
And then once you've gotten the current clock back, you want to check if it is NOT in the data store.
if(!clocks.includes(clock)) {
If it isn't, then you know that you have a new piece of data to send.
if (!clocks.includes(clock)) {
  client.on('message', (message) => {
    message.channel.send(clock);
  });
  clocks.push(clock); // Save the clock so you will remember it next time.
}
So all in all you have something like:
var clocks = [];
(async () => {
  setInterval(async () => {
    const URL = url
    const browser = await puppeteer.launch()
    const page = await browser.newPage()
    await page.goto(URL, { 'waitUntil': 'networkidle2' })
    let clock = await page.evaluate(() => {
      var a = document.getElementById("task-listing-datatable").getAttribute("data-tasks");
      var ar = eval(a);
      var keyword = ['asdad', 'asdakdada', 'mama', 'What Is The Best Dialogue Category About Phones'];
      for (let i = 0; i < ar.length; i++) {
        for (let j = 0; j < keyword.length; j++) {
          if (ar[i][1] === keyword[j]) {
            let job = (`${ar[i][1]}`);
            return (`${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`);
          }
        }
      }
    });
    console.log(`==== first login ====`)
    console.log(`==================`)
    if (!clocks.includes(clock)) {
      client.on('message', (message) => {
        message.channel.send(clock);
      });
      clocks.push(clock); // Save the clock so you will remember it next time.
    }
    await page.reload();
    console.log(`after reload`)
  }, 8000)
})()
While we're at it though, there's no real reason to fire up a new browser window every 10 seconds; it will probably be easier on your computer to load the page once and then simply refresh it every 10 seconds.
var clocks = [];
(async () => {
  const URL = url
  const browser = await puppeteer.launch()
  const page = await browser.newPage()
  await page.goto(URL, { 'waitUntil': 'networkidle2' });
  setInterval(async () => {
    let clock = await page.evaluate(() => {
      var a = document.getElementById("task-listing-datatable").getAttribute("data-tasks");
      var ar = eval(a);
      var keyword = ['asdad', 'asdakdada', 'mama', 'What Is The Best Dialogue Category About Phones'];
      for (let i = 0; i < ar.length; i++) {
        for (let j = 0; j < keyword.length; j++) {
          if (ar[i][1] === keyword[j]) {
            let job = (`${ar[i][1]}`);
            return (`${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`);
          }
        }
      }
    });
    console.log(`==== first login ====`)
    console.log(`==================`)
    if (!clocks.includes(clock)) {
      client.on('message', (message) => {
        message.channel.send(clock);
      });
      clocks.push(clock); // Save the clock so you will remember it next time.
    }
    await page.reload();
  }, 8000)
})()
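One side note, independent of the scraping: client.on('message', ...) inside the interval registers an additional listener every time new data is found, so replies will multiply as the bot runs. Registering a single 'message' listener once, outside setInterval, and having it read the most recent clock would avoid that (a suggestion beyond the original question).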
Now, to make sure that your page function (clock) finds a new data point each time, we need to pass our past data points into it:
let clock = await page.evaluate(clocks => {
  // ...
}, clocks);
Now, inside of the page function you'll have access to the old data points.
Instead of
if (ar[i][1] === keyword[j]) {
  let job = (`${ar[i][1]}`); // What is this for?
  return (`${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`);
}
Check if the data point exists in your clocks array, and only return it if it's new.
if (ar[i][1] === keyword[j]) {
  let dataPoint = `${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`;
  if (!clocks.includes(dataPoint)) {
    return dataPoint;
  }
}
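Putting those two pieces together, the whole evaluate call would look roughly like this (a sketch that keeps your eval(a), though JSON.parse(a) would be safer if the data-tasks attribute holds JSON):

let clock = await page.evaluate((clocks) => {
  var a = document.getElementById("task-listing-datatable").getAttribute("data-tasks");
  var ar = eval(a); // consider JSON.parse(a) if the attribute is JSON
  var keyword = ['asdad', 'asdakdada', 'mama', 'What Is The Best Dialogue Category About Phones'];
  for (let i = 0; i < ar.length; i++) {
    for (let j = 0; j < keyword.length; j++) {
      if (ar[i][1] === keyword[j]) {
        let dataPoint = `${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`;
        if (!clocks.includes(dataPoint)) {
          return dataPoint; // only return data that hasn't been sent before
        }
      }
    }
  }
}, clocks);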
I am trying to make a simple web scraper using Node and Puppeteer to get the titles of posts on Reddit, but I'm having issues accessing a global variable, SUBREDDIT_NAME, from within only one function, extractItems(). It works fine in every other function, but for that one I have to create a local variable with the same value to make it work.
Am I completely misunderstanding variable scope in JavaScript?
I have tried everything I can think of, and the only thing that works is to create a local variable inside extractItems() with the value "news"; otherwise I get nothing.
const fs = require('fs');
const puppeteer = require('puppeteer');

const SUBREDDIT = (subreddit_name) => `https://reddit.com/r/${subreddit_name}/`;
const SUBREDDIT_NAME = "news";

function extractItems() {
  const extractedElements = document.querySelectorAll(`a[href*='r/${SUBREDDIT_NAME}/comments/'] h3`);
  const items = [];
  for (let element of extractedElements) {
    items.push(element.innerText);
  }
  return items;
}
async function scrapeInfiniteScrollItems(
  page,
  extractItems,
  itemTargetCount,
  scrollDelay = 1000,
) {
  let items = [];
  try {
    let previousHeight;
    while (items.length < itemTargetCount) {
      items = await page.evaluate(extractItems);
      previousHeight = await page.evaluate('document.body.scrollHeight');
      await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
      await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
      await page.waitFor(scrollDelay);
    }
  } catch (e) { }
  return items;
}
(async () => {
  // Set up browser and page.
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  const page = await browser.newPage();
  page.setViewport({ width: 1280, height: 926 });
  // Navigate to the demo page.
  await page.goto(SUBREDDIT(SUBREDDIT_NAME));
  // Scroll and extract items from the page.
  const items = await scrapeInfiniteScrollItems(page, extractItems, 100);
  // Save extracted items to a file.
  fs.writeFileSync('./items.txt', items.join('\n') + '\n');
  // Close the browser.
  await browser.close();
})();
I expect a text file with the first 100 titles found, but it only works when I hardcode the subreddit into the extractItems() function.
The problem is that the extractItems function is converted to a string (without evaluating the template literal) and executed in the page's context, where no SUBREDDIT_NAME variable exists. (That is presumably also why you see nothing rather than an error: the ReferenceError thrown inside the page is swallowed by the empty catch in scrapeInfiniteScrollItems.)
You can fix that by doing something like this:
function extractItems(name) {
  const extractedElements = document.querySelectorAll(`a[href*='r/${name}/comments/'] h3`);
  const items = [];
  for (let element of extractedElements) {
    items.push(element.innerText);
  }
  return items;
}

page.evaluate(`(${extractItems})(${JSON.stringify(SUBREDDIT_NAME)})`) // stringify so the value arrives as a quoted string, not a bare identifier
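Alternatively, page.evaluate accepts serializable arguments directly, which avoids building the string by hand:

items = await page.evaluate(extractItems, SUBREDDIT_NAME);

That form would replace the plain page.evaluate(extractItems) call inside scrapeInfiniteScrollItems.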
I'm having a play around with Node.js and the way we can now have a headless Chrome browser and interact with it, which is pretty awesome!
I have some code working for scraping one website without any issues. However, when I want to scrape multiple sites, my loop seems to mess things up, and I'm pretty sure it all comes down to async/await.
My loop is near the bottom of this code - does anyone have any suggestions?
Thanks heaps!
const HeadlessChrome = require('simple-headless-chrome')

const browser = new HeadlessChrome({
  headless: true, // If you turn this off, you can actually see the browser navigate with your instructions
})

async function navigateWebsite(urlToGoTo) {
  try {
    await browser.init()
    const mainTab = await browser.newTab({
      privateTab: false
    })
    await mainTab.inject('jquery')

    let cookieName = 'li_at'
    let cookieValue = 'cyzzzzzzzzz'
    let cookieDomain = '.www.linkedin.com'
    await mainTab.setCookie(cookieName, cookieValue, {
      domain: cookieDomain
    })

    // Navigate to a URL
    await mainTab.goTo(urlToGoTo)
    await mainTab.wait(2000);

    // Get an HTML tag value based on class id
    let businessName = await mainTab.evaluate(function (selector) {
      const selectorHtml = document.querySelector(selector)
      return selectorHtml.innerHTML
    }, '.org-top-card-module__name');
    let industry = await mainTab.evaluate(function (selector) {
      const selectorHtml = document.querySelector(selector)
      return selectorHtml.innerHTML
    }, '.company-industries');
    let followers = await mainTab.evaluate(function (selector) {
      const selectorHtml = document.querySelector(selector)
      return selectorHtml.innerHTML
    }, '.org-top-card-module__followers-count');

    let details = {
      businessName: cleanData(businessName),
      industry: cleanData(industry),
      followers: cleanData(followers)
    }
    console.log(details)

    // Resize the viewport to full screen size (one use is to take full-size screenshots)
    await mainTab.resizeFullScreen()

    // Take a screenshot
    await mainTab.saveScreenshot()

    // Close the browser
    await browser.close()
  } catch (err) {
    console.log('ERROR!', err)
  }
}

let websites = []
websites.push('https://www.linkedin.com/company/qrious-limited/')
websites.push('https://www.linkedin.com/company/wentworth-consulting-nz-/')
websites.push('https://www.linkedin.com/company/capita/')

websites.forEach(function (i) {
  navigateWebsite(i)
})

function cleanData(a) {
  return a.result.value.replace(/(\r\n|\n|\r)/gm, "").trim()
}
navigateWebsite() is asynchronous, but it's never awaited. You could use Promise.all(), mapping your list of websites to your nav function, or make sure to await each result:

await Promise.all(websites.map(w => navigateWebsite(w)));
// or, sequentially (inside an async function):
for (let w of websites) {
  await navigateWebsite(w);
}
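One caveat: navigateWebsite() calls browser.init() and browser.close() on the same shared browser instance, so the Promise.all() variant will have the parallel runs fighting over it as written. The sequential loop is the safer choice here; for real parallelism you'd want a separate browser (or at least a separate tab) per run.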