I have no idea why the for-loop runs only once.
My intention is for the code to run through all the elements in foodGroupIdsOnPage1, but right now it only runs through the first element.
Could anyone please explain this to me? Thank you in advance for your help.
const playwright = require('playwright');

async function initialize() {
  const browser = await playwright.chromium.launch({
    headless: false
  });
  const context = await browser.newContext(); // create a new browser context, i.e. no cookies or cache carried over
  const tab1 = await context.newPage();
  return { tab1, context };
}

async function GotoPage2() { // await: only run the next command after everything inside this function has finished
  const page1_foodGroupButton = id.querySelector('a') // beginning of the for loop
  await page1_foodGroupButton.click();
}
async function main() {
  const { tab1, context } = await initialize();
  await tab1.goto('https://www.cfs.gov.hk/tc_chi/nutrient/search1.php');
  const foodGroupIdsOnPage1 = await tab1.evaluate(async function getFoodGroupsOnPage1() {
    return [...document.querySelector('.tableResponsive').querySelectorAll('td')].map(e => e.id);
  });
  for (let id of foodGroupIdsOnPage1) {
    await tab1.evaluate(id => {
      const page1_foodGroupButton = document.querySelector('[id=' + `"${id}"` + ']').querySelector('a'); // beginning of the for loop
      page1_foodGroupButton.click();
    }, id);
    await tab1.waitForTimeout(2000);
    await tab1.click('[id^=grp] > a');
    await tab1.waitForTimeout(2000);
    const ArrayOfTabs = context.pages(); // get all open tabs in the browser context
    let tab2 = ArrayOfTabs[1]; // switch to the second tab
    await tab2.evaluate(async function extractFoodGroupData() {
      let tableOfAllFoods = [];
      let rowsOnPage3 = document.querySelector(".colorTable2").querySelectorAll("tr");
      for (let row_OnPage3 of rowsOnPage3) {
        let arrayNutritionOfOneFood = [];
        let cellsInOneRow = row_OnPage3.querySelectorAll("td");
        for (let cell of cellsInOneRow) {
          arrayNutritionOfOneFood.push(cell.innerText);
        }
        tableOfAllFoods.push(arrayNutritionOfOneFood);
      }
    });
    tab2.close();
    tab1.goBack();
    return;
  }
}
main();
I found the solution.
The first problem is that the return statement terminates the loop after the first iteration, so delete it.
The second problem is that the second iteration starts before
tab1.goBack();
has finished. Simply add await before tab1.goBack(); to fix that.
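For reference, the end of the loop body would then look roughly like this (just a sketch of the fixed version; awaiting tab2.close() as well is optional but tidier):

    await tab2.close();  // close the results tab before the next round
    await tab1.goBack(); // wait for the navigation back before the loop continues
    // no return here, so the loop moves on to the next food group id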
What's up friends,
I have a problem. I have DOM elements that display data from a useState variable where I store some "accounts".
I have a function that updates the data of a single element by calling the API, and a function that refreshes all the elements one by one. The problem is that when I use the refresh-all function the changes are not reflected correctly in the DOM, but when I use the single-element function (I have a button for that) I can see the changes reflected correctly.
I don't understand the error; if you can help me I would be very grateful.
Refresh Function
let [accounts, setAccounts] = useState([])

const refreshAccount = async (id, name, token) => {
  await setIsBusy(true)
  await updateWorkingStatus(id, true, name, token)
  let work = await axios.get(
    'http://192.168.0.101:3000/account/refresh/' + id,
  )
  await updateWorkingStatus(id, false, name, token)
  await setIsBusy(false)
  // console.log(work.data.result)
  let accs = [...accounts]
  let index = accs.findIndex((acc) => acc._id === id)
  accs[index] = work.data.result
  accs[index]._id = id
  // console.log(accs[index])
  setTimeout(() => {
    setAccounts(accs)
  }, 0)
  return 'OK'
}
Refresh Selected Items (I already have a function that selects each item and it works nicely)
const refreshSelected = async () => {
  let finalAccounts = accounts.filter((acc) => acc.selected)
  for (let i = 0; i < finalAccounts.length; i++) {
    await refreshAccount(
      finalAccounts[i]._id,
      finalAccounts[i].name,
      finalAccounts[i].token,
    )
  }
}
I know the problem is the state update, but I can't solve it. If I add a setTimeout, only the last element of the cycle (or the one before it) gets updated; the earlier ones are left with the outdated data (the first state).
I think the problem is using a for loop in React, but I don't know how to use it correctly.
The problem is probably that setAccounts doesn't update accounts synchronously: every refreshAccount call in the loop copies the same stale accounts array, so each setAccounts call overwrites the previous update and only the last one takes effect.
You could collect each refreshed account in an array and apply a single state update after all the API requests have finished.
let [accounts, setAccounts] = useState([])

const refreshAccount = async (id, name, token) => {
  await setIsBusy(true)
  await updateWorkingStatus(id, true, name, token)
  let work = await axios.get(
    'http://192.168.0.101:3000/account/refresh/' + id,
  )
  await updateWorkingStatus(id, false, name, token)
  await setIsBusy(false)
  // return only the refreshed account instead of calling setAccounts here
  return { ...work.data.result, _id: id }
}

const refreshSelected = async () => {
  let finalAccounts = accounts.filter((acc) => acc.selected)
  let refreshed = []
  for (let i = 0; i < finalAccounts.length; i++) {
    refreshed.push(
      await refreshAccount(
        finalAccounts[i]._id,
        finalAccounts[i].name,
        finalAccounts[i].token,
      ),
    )
  }
  // single state update once every request has finished
  setAccounts(
    accounts.map((acc) => refreshed.find((r) => r._id === acc._id) || acc),
  )
}
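Another option, if you prefer to keep updating state inside the loop, is the functional form of the state setter, so each update is computed from the latest state instead of the stale accounts closure. A minimal sketch (the helper name mergeAccount is just for illustration):

const mergeAccount = (updated, id) =>
  setAccounts((prev) =>
    prev.map((acc) => (acc._id === id ? { ...updated, _id: id } : acc)),
  )

With that, refreshAccount could call mergeAccount(work.data.result, id) instead of returning anything, and the loop in refreshSelected would no longer lose intermediate updates.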
I have the following function that clicks a link with a rel="_nofollow" attribute, waits for the new page tab to open and then loads its content. Occasionally, though, the page will not load, and instead of looping to the next page the script hangs.
How can I put a timeout in here?
let clickAndWaitForTarget = async (clickSelector, page, browser) => {
  const pageTarget = page.target();
  await page.click(clickSelector);
  const newTarget = await browser.waitForTarget(
    target => target.opener() === pageTarget
  );
  const newPage = await newTarget.page(); // get the page object
  await page.waitFor(10000);
  return newPage;
};
I presume that if I can get it to time out, I can then return false or something I can check in the main code for success or failure?
newpage = await clickAndWaitForTarget("a.someSelector", page, browser);
if (newpage) {
  var url = await newpage.url();
  ...
Thanks in advance!
edit: I'm using Puppeteer version 2.0.0
waitForTarget has a timeout option. The default is 30 seconds; maybe that's too long for you.
You could do something like this:
let clickAndWaitForTarget = async (clickSelector, page, browser) => {
  const pageTarget = page.target();
  await page.click(clickSelector);
  try {
    const newTarget = await browser.waitForTarget(
      target => target.opener() === pageTarget,
      { timeout: 3000 } /* 3 seconds instead */
    );
    const newPage = await newTarget.page(); // get the page object
    await page.waitFor(10000);
    return newPage;
  } catch {
    return null;
  }
};
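On the calling side you could then treat null as a failure and move on to the next link, something along these lines (a sketch only, reusing the selector from your example):

const newpage = await clickAndWaitForTarget("a.someSelector", page, browser);
if (newpage) {
  const url = newpage.url(); // page.url() is synchronous in Puppeteer
  // ...scrape the new tab...
} else {
  // the new tab never appeared within the timeout; skip to the next page
}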
I'm building a Discord bot and I have an infinite loop with setInterval every 10 seconds, but on every cycle it sends me the data from every previous cycle again. I'd like to know how to get only the new data from the latest cycle, not all of it.
const puppeteer = require('puppeteer');
const Discord = require('discord.js');
const client = new Discord.Client();
const url = 'url to scrape';
var clocks = [];

(async () => {
  const URL = url
  const browser = await puppeteer.launch()
  const page = await browser.newPage()
  await page.goto(URL, { 'waitUntil': 'networkidle2' });
  setInterval(async () => {
    let clock = await page.evaluate(() => {
      var a = document.getElementById("task-listing-datatable").getAttribute("data-tasks");
      var ar = eval(a);
      var keyword = ['asdad', 'asdakdada', 'mama', 'Duplicate Fashion Product Identification Task'];
      for (let i = 0; i < ar.length; i++) {
        for (let j = 0; j < keyword.length; j++) {
          if (ar[i][1] === keyword[j]) {
            let job = (`${ar[i][1]}`);
            return (`${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`);
          }
        }
      }
    });
    console.log(`==== first login ====`)
    console.log(`==================`)
    if (!clocks.includes(clock)) {
      client.on('message', (message) => {
        if (message.author.bot === false) {
          message.channel.send(clock);
        }
      });
      clocks.push(clock);
      // Save the clock so you will remember it next time.
    }
    await page.reload();
  }, 8000)
})()

client.login('discordjs token');
This is how the messages are shown (screenshot omitted).
As you can see, it's posting each change rather than only the data from the latest cycle (second screenshot omitted).
Every time your setInterval runs, it loads the page fresh, gathers information into clock, and sends it via Discord. The problem is that it does not know what it has already sent you, so you'll get some of the same data every time.
The solution to that is to save the data it finds, and then only create a discord message if the current batch of data is different from all of the previous data.
So you want some kind of data store:
var clocks = [];

(async () => {
  setInterval(async () => {
    const URL = url
    const browser = await puppeteer.launch()
    // ...
And then once you've gotten the current clock back, you want to check if it is NOT in the data store.
if(!clocks.includes(clock)) {
If it isn't, then you know that you have a new piece of data to send.
if (!clocks.includes(clock)) {
  client.on('message', (message) => {
    message.channel.send(clock);
  });
  clocks.push(clock); // Save the clock so you will remember it next time.
}
So all in all you have something like:
var clocks = [];

(async () => {
  setInterval(async () => {
    const URL = url
    const browser = await puppeteer.launch()
    const page = await browser.newPage()
    await page.goto(URL, { 'waitUntil': 'networkidle2' })
    let clock = await page.evaluate(() => {
      var a = document.getElementById("task-listing-datatable").getAttribute("data-tasks");
      var ar = eval(a);
      var keyword = ['asdad', 'asdakdada', 'mama', 'What Is The Best Dialogue Category About Phones'];
      for (let i = 0; i < ar.length; i++) {
        for (let j = 0; j < keyword.length; j++) {
          if (ar[i][1] === keyword[j]) {
            let job = (`${ar[i][1]}`);
            return (`${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`);
          }
        }
      }
    });
    console.log(`==== first login ====`)
    console.log(`==================`)
    if (!clocks.includes(clock)) {
      client.on('message', (message) => {
        message.channel.send(clock);
      });
      clocks.push(clock); // Save the clock so you will remember it next time.
    }
    await page.reload();
    console.log(`after reload`)
  }, 8000)
})()
While we're at it though, there's no real reason to fire up a new browser window every 10 seconds; it will probably be easier on your computer to load the page once and then simply refresh it every 10 seconds.
var clocks = [];

(async () => {
  const URL = url
  const browser = await puppeteer.launch()
  const page = await browser.newPage()
  await page.goto(URL, { 'waitUntil': 'networkidle2' });
  setInterval(async () => {
    let clock = await page.evaluate(() => {
      var a = document.getElementById("task-listing-datatable").getAttribute("data-tasks");
      var ar = eval(a);
      var keyword = ['asdad', 'asdakdada', 'mama', 'What Is The Best Dialogue Category About Phones'];
      for (let i = 0; i < ar.length; i++) {
        for (let j = 0; j < keyword.length; j++) {
          if (ar[i][1] === keyword[j]) {
            let job = (`${ar[i][1]}`);
            return (`${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`);
          }
        }
      }
    });
    console.log(`==== first login ====`)
    console.log(`==================`)
    if (!clocks.includes(clock)) {
      client.on('message', (message) => {
        message.channel.send(clock);
      });
      clocks.push(clock); // Save the clock so you will remember it next time.
    }
    await page.reload();
  }, 8000)
})()
Now, to make sure that your page function (clock) finds a new data point each time, we need to pass our past data points into it:
let clock = await page.evaluate(clocks => {
  // ...
}, clocks);
Now, inside of the page function you'll have access to the old data points.
Instead of
if (ar[i][1] === keyword[j]) {
  let job = (`${ar[i][1]}`); // What is this for?
  return (`${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`);
}
Check if the data point exists in your clocks array, and only return it if it's new.
if (ar[i][1] === keyword[j]) {
  let dataPoint = `${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`;
  if (!clocks.includes(dataPoint)) {
    return dataPoint;
  }
}
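Putting those last two pieces together, the evaluate call would end up looking roughly like this (same selector and keywords as above):

let clock = await page.evaluate(clocks => {
  const a = document.getElementById("task-listing-datatable").getAttribute("data-tasks");
  const ar = eval(a);
  const keyword = ['asdad', 'asdakdada', 'mama', 'What Is The Best Dialogue Category About Phones'];
  for (let i = 0; i < ar.length; i++) {
    for (let j = 0; j < keyword.length; j++) {
      if (ar[i][1] === keyword[j]) {
        const dataPoint = `${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`;
        // only report data points we have not already sent in a previous cycle
        if (!clocks.includes(dataPoint)) {
          return dataPoint;
        }
      }
    }
  }
}, clocks);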
I am trying to make a simple web scraper using Node and Puppeteer to get the titles of posts on Reddit, but I am having issues accessing a global variable, SUBREDDIT_NAME, from within only one function, extractItems(). It works fine with every other function, but for that one I have to create a local variable with the same value for it to work.
Am I completely misunderstanding variable scope in JavaScript?
I have tried everything I can think of, and the only thing that works is to create a local variable inside extractItems() with the value "news"; otherwise I get nothing.
const fs = require('fs');
const puppeteer = require('puppeteer');

const SUBREDDIT = (subreddit_name) => `https://reddit.com/r/${subreddit_name}/`;
const SUBREDDIT_NAME = "news";

function extractItems() {
  const extractedElements = document.querySelectorAll(`a[href*='r/${SUBREDDIT_NAME}/comments/'] h3`);
  const items = [];
  for (let element of extractedElements) {
    items.push(element.innerText);
  }
  return items;
}

async function scrapeInfiniteScrollItems(
  page,
  extractItems,
  itemTargetCount,
  scrollDelay = 1000,
) {
  let items = [];
  try {
    let previousHeight;
    while (items.length < itemTargetCount) {
      items = await page.evaluate(extractItems);
      previousHeight = await page.evaluate('document.body.scrollHeight');
      await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
      await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
      await page.waitFor(scrollDelay);
    }
  } catch (e) { }
  return items;
}

(async () => {
  // Set up browser and page.
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  const page = await browser.newPage();
  await page.setViewport({ width: 1280, height: 926 });
  // Navigate to the subreddit page.
  await page.goto(SUBREDDIT(SUBREDDIT_NAME));
  // Scroll and extract items from the page.
  const items = await scrapeInfiniteScrollItems(page, extractItems, 100);
  // Save extracted items to a file.
  fs.writeFileSync('./items.txt', items.join('\n') + '\n');
  // Close the browser.
  await browser.close();
})();
I expect a text file with the first 100 titles found, but it only works when I hardcode the subreddit name into the extractItems() function.
The problem is that the extractItems function is converted to a string (without processing the template literal) and executed in the page's context, where there is no SUBREDDIT_NAME variable.
You can fix that by doing something like this:
function extractItems(name) {
  const extractedElements = document.querySelectorAll(`a[href*='r/${name}/comments/'] h3`);
  const items = [];
  for (let element of extractedElements) {
    items.push(element.innerText);
  }
  return items;
}

page.evaluate(`(${extractItems})("${SUBREDDIT_NAME}")`)
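Alternatively, page.evaluate accepts extra arguments that are serialized into the page, so you could skip the string building entirely:

const items = await page.evaluate(extractItems, SUBREDDIT_NAME);

If you go that route, the page.evaluate(extractItems) call inside scrapeInfiniteScrollItems would need to forward the name the same way.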
I'm having a play around with Node.js and the way we can now have a headless Chrome browser and interact with it, which is pretty awesome!
I have some code working for scraping one website without any issues. However, when I want to scrape multiple sites, my loop just seems to mess it up, and I'm pretty sure it's all to do with async/await.
My loop is near the bottom of this code; does anyone have any suggestions?
Thanks heaps!
const HeadlessChrome = require('simple-headless-chrome')

const browser = new HeadlessChrome({
  headless: true, // If you turn this off, you can actually see the browser navigate with your instructions
})

async function navigateWebsite(urlToGoTo) {
  try {
    await browser.init()
    const mainTab = await browser.newTab({
      privateTab: false
    })
    await mainTab.inject('jquery')

    let cookieName = 'li_at'
    let cookieValue = 'cyzzzzzzzzz'
    let cookieDomain = '.www.linkedin.com'
    await mainTab.setCookie(cookieName, cookieValue, {
      domain: cookieDomain
    })

    // Navigate to a URL
    await mainTab.goTo(urlToGoTo)
    await mainTab.wait(2000);

    // Get a HTML tag value based on class id
    let businessName = await mainTab.evaluate(function (selector) {
      const selectorHtml = document.querySelector(selector)
      return selectorHtml.innerHTML
    }, '.org-top-card-module__name');
    let industry = await mainTab.evaluate(function (selector) {
      const selectorHtml = document.querySelector(selector)
      return selectorHtml.innerHTML
    }, '.company-industries');
    let followers = await mainTab.evaluate(function (selector) {
      const selectorHtml = document.querySelector(selector)
      return selectorHtml.innerHTML
    }, '.org-top-card-module__followers-count');

    let details = {
      businessName: cleanData(businessName),
      industry: cleanData(industry),
      followers: cleanData(followers)
    }
    console.log(details)

    // Resize the viewport to full screen size (one use is to take full-size screenshots)
    await mainTab.resizeFullScreen()

    // Take a screenshot
    await mainTab.saveScreenshot()

    // Close the browser
    await browser.close()
  } catch (err) {
    console.log('ERROR!', err)
  }
}

let websites = []
websites.push('https://www.linkedin.com/company/qrious-limited/')
websites.push('https://www.linkedin.com/company/wentworth-consulting-nz-/')
websites.push('https://www.linkedin.com/company/capita/')

websites.forEach(function (i) {
  navigateWebsite(i)
})

function cleanData(a) {
  return a.result.value.replace(/(\r\n|\n|\r)/gm, "").trim()
}
navigateWebsite() is asynchronous, but it's not awaited. You could use Promise.all(), mapping your list of websites to your nav function, or make sure to await each result.
Promise.all(websites.map(w => navigateWebsite(w)));

// or

for (let w of websites) {
  await navigateWebsite(w);
}
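Note that await is only valid inside an async function, so the sequential version would replace the forEach block with something like this sketch:

(async () => {
  for (const w of websites) {
    await navigateWebsite(w); // each site finishes completely before the next starts
  }
})();

Since navigateWebsite() inits and closes the shared browser instance on every call, the sequential loop is probably the safer of the two options here.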