I'm playing around with Node.js and the way we can now run a headless Chrome browser and interact with it, which is pretty awesome!
I have some code working for scraping one website without any issues. However, when I want to scrape multiple sites, my loop seems to mess it up, and I'm pretty sure it's all to do with async/await.
My loop is near the bottom of this code - does anyone have any suggestions?
Thanks heaps!
const HeadlessChrome = require('simple-headless-chrome')

const browser = new HeadlessChrome({
  headless: true, // If you turn this off, you can actually see the browser navigate with your instructions
})

async function navigateWebsite(urlToGoTo) {
  try {
    await browser.init()

    const mainTab = await browser.newTab({
      privateTab: false
    })

    await mainTab.inject('jquery')

    let cookieName = 'li_at'
    let cookieValue = 'cyzzzzzzzzz'
    let cookieDomain = '.www.linkedin.com'

    await mainTab.setCookie(cookieName, cookieValue, {
      domain: cookieDomain
    })

    // Navigate to a URL
    await mainTab.goTo(urlToGoTo)
    await mainTab.wait(2000);

    // Get a HTML tag value based on class id
    let businessName = await mainTab.evaluate(function (selector) {
      const selectorHtml = document.querySelector(selector)
      return selectorHtml.innerHTML
    }, '.org-top-card-module__name');

    let industry = await mainTab.evaluate(function (selector) {
      const selectorHtml = document.querySelector(selector)
      return selectorHtml.innerHTML
    }, '.company-industries');

    let followers = await mainTab.evaluate(function (selector) {
      const selectorHtml = document.querySelector(selector)
      return selectorHtml.innerHTML
    }, '.org-top-card-module__followers-count');

    let details = {
      businessName: cleanData(businessName),
      industry: cleanData(industry),
      followers: cleanData(followers)
    }
    console.log(details)

    // Resize the viewport to full screen size (One use is to take full size screen shots)
    await mainTab.resizeFullScreen()

    // Take a screenshot
    await mainTab.saveScreenshot()

    // Close the browser
    await browser.close()
  } catch (err) {
    console.log('ERROR!', err)
  }
}

let websites = []
websites.push('https://www.linkedin.com/company/qrious-limited/')
websites.push('https://www.linkedin.com/company/wentworth-consulting-nz-/')
websites.push('https://www.linkedin.com/company/capita/')

websites.forEach(function (i) {
  navigateWebsite(i)
})

function cleanData(a) {
  return a.result.value.replace(/(\r\n|\n|\r)/gm, "").trim()
}
navigateWebsite() is asynchronous but it's not awaited. You could use Promise.all() mapping your list of websites to your nav function or make sure to await each result.
Promise.all(websites.map(w => navigateWebsite(w)));

// or

for (let w of websites) {
  await navigateWebsite(w);
}
I have no idea why the for loop only runs once.
My intention is for the code to run through all elements in foodGroupIdsOnPage1, but it only runs through the first element now.
Could anyone please explain this to me? Thank you for your help in advance.
const playwright = require('playwright'); // needed for playwright.chromium.launch below

async function initialize() {
  const browser = await playwright.chromium.launch({
    headless: false
  });
  const context = await browser.newContext(); // create a new browser context, which means no cookies and cache saved
  const tab1 = await context.newPage();
  return { tab1, context };
}

async function GotoPage2() { // wait for function: only run the next command after all the commands inside the next bracket run
  const page1_foodGroupButton = id.querySelector('a') // beginning of the for loop
  await page1_foodGroupButton.click();
};

async function main() {
  const { tab1, context } = await initialize();
  await tab1.goto('https://www.cfs.gov.hk/tc_chi/nutrient/search1.php');
  const foodGroupIdsOnPage1 = await tab1.evaluate(async function getFoodGroupsOnPage1() {
    return [...document.querySelector('.tableResponsive').querySelectorAll('td ')].map(e => e.id);
  })
  for (let id of foodGroupIdsOnPage1) {
    await tab1.evaluate(id => {
      const page1_foodGroupButton = document.querySelector('[id=' + `"${id}"` + ']').querySelector('a') // beginning of the for loop
      page1_foodGroupButton.click();
    }, id);
    await tab1.waitForTimeout(2000);
    await tab1.click('[id^=grp] > a');
    await tab1.waitForTimeout(2000);
    const ArrayOfTabs = context.pages(); // get how many tabs chromium has open
    let tab2 = ArrayOfTabs[1]; // make the second tab the active tab
    await tab2.evaluate(async function extractFoodGroupData() {
      let tableOfAllFoods = [];
      let rowsOnPage3 = document.querySelector(".colorTable2").querySelectorAll("tr");
      for (let row_OnPage3 of rowsOnPage3) {
        let arrayNutritionOfOneFood = [];
        let cellsInOneRow = row_OnPage3.querySelectorAll("td");
        for (let cell of cellsInOneRow) {
          arrayNutritionOfOneFood.push(cell.innerText);
        }
        tableOfAllFoods.push(arrayNutritionOfOneFood);
      }
    });
    tab2.close();
    tab1.goBack();
    return;
  }
}

main();
I found the solution.
The first problem is that the return statement terminates the loop after the first iteration, so delete it.
The second problem is that the next iteration starts before tab1.goBack(); has finished.
Simply add await before tab1.goBack(); to fix the second problem.
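In other words, the end of the loop body ends up looking roughly like this (a minimal sketch; awaiting tab2.close() as well is optional but keeps things tidy):

for (let id of foodGroupIdsOnPage1) {
  // ... clicking and scraping as above ...
  await tab2.close();  // closing the extra tab is asynchronous too
  await tab1.goBack(); // wait for the navigation back to page 1 before the next iteration
  // no return here, so the loop continues with the next food group id
}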
I have a Puppeteer problem. I have an array of links:
let links = ["...", "...", "..."];
That's about 30 links. I wanted to scrape them all at once with Promise.all():
let descriptions = await Promise.all(
  links.map((url) => getDescription(page, url))
);
Every page has its own description, but for some reason my descriptions array is populated with the same description 30 times.
Why is that? Do I need to scrape them one by one instead?
The description function:
export async function getDescription(page, url) {
  await page.goto(url);
  let selector = ".C4VMK > span";
  return page.$eval(selector, (handle) => handle.textContent);
}
I managed it like this: I create a new tab for each URL and open the page there.
export async function getDescription(browser, url) {
  let page = await browser.newPage();
  await page.goto(url);
  let selector = ".C4VMK > span";
  let result = await page.$eval(selector, (handle) => handle.textContent);
  await page.close();
  return result;
}
I am trying to go to YouTube, log in, then go to each video link to take a screenshot.
But I get this error. I have read other similar questions and I have done what they suggested: use a for loop. Please don't label this as a duplicate.
(node:18337) MaxListenersExceededWarning: Possible EventEmitter memory leak detected. 11 response listeners added.
Here is my code. This is what I am trying to do.
var proccessALink = async (link, label) => {
  if (validUrl.isWebUri(urlToScreenshot)) {
    ;(async () => {
      await page.goto(link, {
        waitUntil: 'networkidle'
      })

      const video = await page.$('.html5-video-player')

      await page.evaluate(() => {
        // Hide youtube player controls.
        let dom = document.querySelector('.ytp-chrome-bottom')
        dom.style.display = 'none'
      })

      await video.screenshot({ path: downloadPath });
    })()
  } else {
    res.send('Invalid url: ' + urlToScreenshot)
  }
}

for (let i = 0; i < linksArr.length; i++) {
  var link = linksArr[i];
  var label = labelsArr[i];
  await proccessALink(link, label)
}
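One plausible cause (an assumption based only on the snippet shown): the inner async IIFE is never awaited, so await proccessALink(...) resolves immediately and every page.goto() runs concurrently on the same page, each navigation adding response listeners. A minimal sketch with the IIFE removed, assuming page, validUrl, downloadPath and res come from the surrounding code, and that the URL being validated should be link rather than the undefined urlToScreenshot:

var proccessALink = async (link, label) => {
  if (validUrl.isWebUri(link)) {
    // await directly instead of wrapping the work in an un-awaited IIFE,
    // so only one navigation is in flight at a time
    await page.goto(link, { waitUntil: 'networkidle' })
    const video = await page.$('.html5-video-player')
    await page.evaluate(() => {
      // Hide youtube player controls.
      let dom = document.querySelector('.ytp-chrome-bottom')
      dom.style.display = 'none'
    })
    await video.screenshot({ path: downloadPath })
  } else {
    res.send('Invalid url: ' + link)
  }
}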
I have the following function that clicks a link with a "rel=_nofollow" attribute, waits for the new tab to open and then loads the content, but occasionally the page will not load, and instead of looping to the next page the script hangs.
How can I put a timeout in here?
let clickAndWaitForTarget = async (clickSelector, page, browser) => {
  const pageTarget = page.target();
  await page.click(clickSelector);
  const newTarget = await browser.waitForTarget(
    target => target.opener() === pageTarget
  );
  const newPage = await newTarget.page(); //get the page object
  await page.waitFor(10000);
  return newPage;
};
I presume that if I can get it to time out, I can then return false or something to check in the main code for success or failure?
newpage = await clickAndWaitForTarget("a.someSelector", page, browser);
if (newpage) {
  var url = await newpage.url();
  ...
Thanks in advance!
edit: I'm using Puppeteer version 2.0.0
waitForTarget has a timeout option. The default is 30 seconds; maybe that's too long for your case.
You could do something like this:
let clickAndWaitForTarget = async (clickSelector, page, browser) => {
  const pageTarget = page.target();
  await page.click(clickSelector);
  try {
    const newTarget = await browser.waitForTarget(
      target => target.opener() === pageTarget,
      { timeout: 3000 } /* 3 seconds instead */
    );
    const newPage = await newTarget.page(); //get the page object
    await page.waitFor(10000);
    return newPage;
  } catch {
    return null;
  }
};
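The calling code from the question then just needs a null check before using the new tab (a small sketch reusing the snippet from the question):

const newpage = await clickAndWaitForTarget("a.someSelector", page, browser);
if (newpage) {
  const url = newpage.url(); // url() is synchronous on a Puppeteer page
  // ... continue working with the new tab ...
} else {
  // waitForTarget timed out, so skip this one and move on to the next page
}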
I am trying to make a simple web scraper using Node and Puppeteer to get the titles of posts on Reddit, but I am having issues accessing a global variable, SUBREDDIT_NAME, from within only one function, extractItems(). It works fine in every other function, but for that one I have to make a local variable with the same value for it to work.
Am I completely misunderstanding variable scope in JavaScript?
I have tried everything I can think of, and the only thing that works is to create a local variable inside extractItems() with the value "news"; otherwise I get nothing.
const fs = require('fs');
const puppeteer = require('puppeteer');

const SUBREDDIT = (subreddit_name) => `https://reddit.com/r/${subreddit_name}/`;
const SUBREDDIT_NAME = "news";

function extractItems() {
  const extractedElements = document.querySelectorAll(`a[href*='r/${SUBREDDIT_NAME}/comments/'] h3`);
  const items = [];
  for (let element of extractedElements) {
    items.push(element.innerText);
  }
  return items;
}

async function scrapeInfiniteScrollItems(
  page,
  extractItems,
  itemTargetCount,
  scrollDelay = 1000,
) {
  let items = [];
  try {
    let previousHeight;
    while (items.length < itemTargetCount) {
      items = await page.evaluate(extractItems);
      previousHeight = await page.evaluate('document.body.scrollHeight');
      await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
      await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
      await page.waitFor(scrollDelay);
    }
  } catch (e) { }
  return items;
}

(async () => {
  // Set up browser and page.
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  const page = await browser.newPage();
  page.setViewport({ width: 1280, height: 926 });

  // Navigate to the demo page.
  await page.goto(SUBREDDIT(SUBREDDIT_NAME));

  // Scroll and extract items from the page.
  const items = await scrapeInfiniteScrollItems(page, extractItems, 100);

  // Save extracted items to a file.
  fs.writeFileSync('./items.txt', items.join('\n') + '\n');

  // Close the browser.
  await browser.close();
})();
I expect a text file with the first 100 titles found, but it only works when I hardcode the subreddit into the extractItems() function.
The problem is that the extractItems function is converted to a string (without processing the template literal) and executed in the page's context, where there is no SUBREDDIT_NAME variable.
You can fix that by doing something like this:
function extractItems(name) {
  const extractedElements = document.querySelectorAll(`a[href*='r/${name}/comments/'] h3`);
  const items = [];
  for (let element of extractedElements) {
    items.push(element.innerText);
  }
  return items;
}
// JSON.stringify ensures the name arrives as a quoted string literal inside the page
page.evaluate(`(${extractItems})(${JSON.stringify(SUBREDDIT_NAME)})`)
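Alternatively, page.evaluate() can serialize arguments for you, so inside scrapeInfiniteScrollItems the call could simply become (a small sketch of that approach):

// let Puppeteer serialize the subreddit name and pass it to extractItems in the page
items = await page.evaluate(extractItems, SUBREDDIT_NAME);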