Scan multiple pages at once - javascript

I have a Puppeteer problem. I have an array of links:
let links = ["...", "...", "..."];
That's about 30 links. I wanted to scrape them all at once with Promise.all():
let descriptions = await Promise.all(
links.map((url) => getDescription(page, url))
);
Every page has a description, but for some reason my descriptions array is populated with the same description 30 times.
Why is that? Do I need to scrape them one by one?
The description function:
export async function getDescription(page, url) {
await page.goto(url);
let selector = ".C4VMK > span";
return page.$eval(selector, (handle) => handle.textContent);
}

I have managed it like this:
I create a new tab for each URL and scrape from there. The original version shared a single page object across all the Promise.all() calls, so the goto() calls raced each other and every $eval read from whichever URL happened to load last.
export async function getDescription(browser, url) {
let page = await browser.newPage();
await page.goto(url);
let selector = ".C4VMK > span";
let result = await page.$eval(selector, (handle) => handle.textContent);
await page.close();
return result;
}
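
One thing to watch with this approach: Promise.all() over ~30 links opens ~30 tabs at once, which can exhaust memory on larger lists. Below is a minimal sketch of batching, assuming the getDescription(browser, url) version above; the helper name getAllDescriptions and the batch size of 5 are arbitrary choices, not from the original code:
export async function getAllDescriptions(browser, links, batchSize = 5) {
  // process the links in fixed-size batches so at most batchSize tabs are open at a time
  let descriptions = [];
  for (let i = 0; i < links.length; i += batchSize) {
    let batch = links.slice(i, i + batchSize);
    let results = await Promise.all(batch.map((url) => getDescription(browser, url)));
    descriptions.push(...results);
  }
  return descriptions;
}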

Related

JavaScript Puppeteer Scraping a Variable

I'm trying to scrape HTML data from a variable that holds HTML data. You can see my annotations, they are marked with " << ".
Unfortunately, evaluate only works on a page, not on a div. Could someone tell me how I could scrape information from a variable containing HTML?
Are there perhaps other methods of scraping?
I tried this in the forEach loop as well, but it only returned the first meal name of the original document.
let mealName = htmlOfOneProduct.document.querySelector("div.meal__description-texts.js-meal-description-text > span > span").innerText;
My code with notes:
const puppeteer = require('puppeteer');
function run () {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(" "); << Meal website
let urls = await page.evaluate(() => {
let results = [];
let items = document.querySelectorAll('div.meal__wrapper'); << Gets all the meals from a page
items.forEach((item) => {
let htmlOfOneProduct = item.innerHTML; << Gets the HTML of each meal
let mealName = htmlOfOne.evaluate(() => document.querySelector('meal-name').textContent); << Not working, should get the meal-name from the div.
results.push({
mealName: mealName
});
});
return results;
})
browser.close();
return resolve(urls);
} catch (e) {
return reject(e);
}
})
}
run().then(console.log).catch(console.error);
As you did not provide the site URL, I cannot check my proposal, sorry.
item.innerHTML returns a string which has no evaluate() method. Try this simpler way:
items.forEach((item) => {
let mealName = item.querySelector('meal-name').textContent;
results.push({
mealName: mealName
});
});
Perhaps the line let htmlOfOneProduct = item.innerHTML; << Gets the HTML of each meal isn't necessary.
If you only need the content of something, you could directly use item.innerText, item.name, or any other property of the element.
In the end something like this should be possible:
items.forEach((item) => {
let mealName = item.querySelector('meal-name').innerText
results.push({
mealName: mealName
});
});
You can also combine your CSS selectors and use Array.from() to simplify scraping the innerText of the elements:
let urls = await page.evaluate(() => {
return Array.from(document.querySelectorAll('div.meal__wrapper span.meal-name'), e => ({
mealName: e.innerText,
}));
});
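If you prefer, Puppeteer's page.$$eval helper wraps the same querySelectorAll-plus-callback pattern in one call (a sketch, assuming the same span.meal-name selector as above):
let urls = await page.$$eval('div.meal__wrapper span.meal-name', elements =>
  // map each matched element to an object, same shape as the evaluate() version
  elements.map(e => ({ mealName: e.innerText }))
);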

Puppeteer timeout on WaitForTarget

I have the following function that clicks a link with a "rel=_nofollow" property, waits for the new tab to open, and then loads the content. But occasionally the page will not load, and instead of looping to the next page, the script hangs.
How can I put a timeout in here?
let clickAndWaitForTarget = async (clickSelector, page, browser) => {
const pageTarget = page.target();
await page.click(clickSelector);
const newTarget = await browser.waitForTarget(
target => target.opener() === pageTarget
);
const newPage = await newTarget.page(); //get the page object
await page.waitFor(10000);
return newPage;
};
I presume that if I can get it to time out, I can then return false or something to check in the main code for success or failure?
newpage = await clickAndWaitForTarget("a.someSelector", page, browser);
if(newpage){
var url = await newpage.url();
...
Thanks in advance!
edit: I'm using Puppeteer version 2.0.0
waitForTarget has a timeout option. The default is 30 seconds; maybe that's too long for you.
You could do something like this:
let clickAndWaitForTarget = async (clickSelector, page, browser) => {
const pageTarget = page.target();
await page.click(clickSelector);
try {
const newTarget = await browser.waitForTarget(
target => target.opener() === pageTarget,
{ timeout: 3000} /*3 seconds instead*/
);
const newPage = await newTarget.page(); //get the page object
await page.waitFor(10000);
return newPage;
} catch {
return null;
}
};
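On the caller's side you can then branch on the null result (a sketch based on the snippet from the question):
const newpage = await clickAndWaitForTarget("a.someSelector", page, browser);
if (newpage) {
  const url = newpage.url(); // page.url() is synchronous, no await needed
  // ... scrape the new tab ...
} else {
  // no new tab appeared within the timeout; skip this page and move on
}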

JS can't access global variable inside function

I am trying to make a simple web scraper using Node and Puppeteer to get the titles of posts on Reddit, but I'm having issues accessing a global variable, SUBREDDIT_NAME, from within just one function, extractItems(). It works fine with every other function, but for that one I have to make a local variable with the same value for it to work.
Am I completely misunderstanding variable scope in JavaScript?
I have tried everything I can think of, and the only thing that works is to create a local variable inside of extractItems() with the value "news"; otherwise I get nothing.
const fs = require('fs');
const puppeteer = require('puppeteer');
const SUBREDDIT = (subreddit_name) => `https://reddit.com/r/${subreddit_name}/`;
const SUBREDDIT_NAME= "news";
function extractItems() {
const extractedElements = document.querySelectorAll(`a[href*='r/${SUBREDDIT_NAME}/comments/'] h3`);
const items = [];
for (let element of extractedElements) {
items.push(element.innerText);
}
return items;
}
async function scrapeInfiniteScrollItems(
page,
extractItems,
itemTargetCount,
scrollDelay = 1000,
) {
let items = [];
try {
let previousHeight;
while (items.length < itemTargetCount) {
items = await page.evaluate(extractItems);
previousHeight = await page.evaluate('document.body.scrollHeight');
await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
await page.waitFor(scrollDelay);
}
} catch(e) { }
return items;
}
(async () => {
// Set up browser and page.
const browser = await puppeteer.launch({
headless: false,
args: ['--no-sandbox', '--disable-setuid-sandbox'],
});
const page = await browser.newPage();
await page.setViewport({ width: 1280, height: 926 });
// Navigate to the demo page.
await page.goto(SUBREDDIT(SUBREDDIT_NAME));
// Scroll and extract items from the page.
const items = await scrapeInfiniteScrollItems(page, extractItems, 100);
// Save extracted items to a file.
fs.writeFileSync('./items.txt', items.join('\n') + '\n');
// Close the browser.
await browser.close();
})();
I expect a text file with the first 100 titles found, but it only works when I hardcode the subreddit into the extractItems() function.
The problem is that the extractItems function is converted to a string (without processing the template literal) and executed in the page's context, where there is no SUBREDDIT_NAME variable.
You can fix that by doing something like this:
function extractItems(name) {
const extractedElements = document.querySelectorAll(`a[href*='r/${name}/comments/'] h3`);
const items = [];
for (let element of extractedElements) {
items.push(element.innerText);
}
return items;
}
page.evaluate(`(${extractItems})(${JSON.stringify(SUBREDDIT_NAME)})`)
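The JSON.stringify() quotes the value; without it, news would be spliced into the evaluated code as a bare, undefined identifier. Alternatively, you can let Puppeteer serialize the argument itself. Since SUBREDDIT_NAME is a module-level constant, it is in scope inside scrapeInfiniteScrollItems, and the call there can simply become:
// Puppeteer serializes extra evaluate() arguments and passes them to the function
items = await page.evaluate(extractItems, SUBREDDIT_NAME);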

Error: failed to find element matching selector for <img data-src="url>

Running on Puppeteer, all updated.
The intended process is to go to the website, where the URL is url/{search item}, and run through the list of search names. Then, for each search item's search page, get the name, price, and image URL for each listing. Now it throws an error that it cannot find the selector. Appreciate any help on this, many thanks!
The layout of the website's data is as follows:
<div class="items-box-content">
<section class="items-box">
<a href="https://listingurl">
<figure class="items-box-photo">
<img data-src="https://imageurl.jpg" class=" lazyloaded" src="https://imageurl.jpg">
</figure>
<div class="items-box-main">
<h3 class="items-box-name"> listing name </h3>
<div class="items-box-figure">
<div class="items-price font-4"> $29.95 </div> // item's price
</div>
</div>
And what I have now (which throws the error) is:
const puppeteer = require('puppeteer');
const searches = ["a", "b", "c"]; //appended to url
(async () => {
const browser = await puppeteer.launch({ headless: false });
let results =[];
for (const search of searches) {
try {
page = await browser.newPage();
await page.goto(`https://weburl/?keyword=${search}`);
await page.evaluate(() => { document.querySelector('div[class*="items-box"]').scrollIntoView();});
let elements = await page.$$('div[class*="items-box"]');
for (let element of elements) {
let listImg = await element.$eval(('img[class="items-box-photo]'), img => img.getAttribute('src'));
let listTitle = await element.$eval(('d[class="items-box-main"] > h[class="items-box-name"]'), node => node.innerText.trim());
let listPrice = await element.$eval(('d[class="items-box-figure"] > d[class="items-price"]'), node => node.innerText.trim());
let listUrl = await element.$eval(('d[class="items-box-content"] > a[class*="items-box"]'), node => node.getAttribute('href'));
results.push({
listImg,
listTitle,
listPrice,
listUrl
})
return results;
}
} finally {
await page.close();
}
}
})();
The error thrown is
(node:5168) UnhandledPromiseRejectionWarning: Error: Error: failed to
find element matching selector "img[class="items-box-photo]"
The problem is right there in the error message (Error: failed to find element matching selector ...).
The selectors are wrong in the following lines:
let listImg = await element.$eval(('img[class="items-box-photo]'), img => img.getAttribute('src'));
let listTitle = await element.$eval(('d[class="items-box-main"] > h[class="items-box-name"]'), node => node.innerText.trim());
let listPrice = await element.$eval(('d[class="items-box-figure"] > d[class="items-price"]'), node => node.innerText.trim());
let listUrl = await element.$eval(('d[class="items-box-content"] > a[class*="items-box"]'), node => node.getAttribute('href'));
According to the HTML code you have given, these should be:
let listImg = await element.$eval('img.lazyloaded', img => img.getAttribute('src'));
let listTitle = await element.$eval('h3.items-box-name', node => node.innerText.trim());
let listPrice = await element.$eval('div.items-price', node => node.innerText.trim());
let listUrl = await element.$eval('div.items-box-content a', node => node.getAttribute('href'));
Note that instead of using [class=...], the proper way to query a class is with the CSS class selector: a leading dot (e.g. .items-box-name).
I updated your code with my test/debug changes.
const puppeteer = require('puppeteer');
const searches = ["a"];
(async () => {
const browser = await puppeteer.launch({ headless: false });
function delay(timeout) {
return new Promise((resolve) => {
setTimeout(resolve, timeout);
});
}
let results = [];
for (const search of searches) {
try {
page = await browser.newPage();
await page.goto(`https:url/`);
await page.evaluate(() => { document.querySelector('section[class*="items-box"]').scrollIntoView(); });
let elements = await page.$$('section[class*="items-box"]');
console.log(elements.length)
console.log('wait 6 seconds')
await delay(6000);
for (let element of elements) {
// await delay(6000);
let listImg = await element.$eval(('img'), img => img.getAttribute('src'));
let listTitle = await element.$eval(('h3[class="items-box-name font-2"]'), node => node.innerText.trim());
let listPrice = await element.$eval(('div[class="items-box-price font-5"]'), node => node.innerText.trim());
let listUrl = await element.$eval(('div[class="items-box-content clearfix"] a'), node => node.getAttribute('href'));
results.push({
listImg,
listTitle,
listPrice,
listUrl
});
}
debugger;
} catch (error) {
console.log(error)
} finally {
//await page.close();
await browser.close();
}
}
console.log(results)
return results;
})();
Updated content:
1. Moved return results; out of the for loop, so the loop collects every element instead of returning after the first one:
for () {
return results;
}
=>
for () {
}
return results;
2. Updated the query selectors:
section[class*="items-box"] // instead of div[class*="items-box"]
img // there is only one img tag inside each element, so no class is needed
h3[class="items-box-name font-2"] // removed the outer element from the selector
div[class="items-box-figure"] > div[class="items-price font-4"] => div[class="items-box-price font-5"] // the class seems to have been renamed to items-box-price on my side
div[class="items-box-content clearfix"] a
3. Increased the sleep duration to 6 seconds; tune this relative to network speed (page load duration).
4. Wrapped each iteration in try/catch/finally: the catch lets the loop continue with the next search even if one step crashes.
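As a further suggestion: instead of a fixed 6-second sleep, you could wait for the lazy-loaded images explicitly, which is usually faster and more reliable (a sketch, assuming the img.lazyloaded class from the HTML posted in the question):
// wait until at least one lazy-loaded image is attached before scraping
await page.waitForSelector('section[class*="items-box"] img.lazyloaded', { timeout: 10000 });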

Headless chrome nodejs async loop for scraping

I'm having a play around with Node.js and the way we can now run a headless Chrome browser and interact with it, which is pretty awesome!
I have some code and it works for scraping 1 website without any issues. However, when I want to scrape multiple, my loop seems to mess it up, and I'm pretty sure it's all to do with async/await.
My loop is near the bottom of this code - does anyone have any suggestions?
Thanks heaps!
const HeadlessChrome = require('simple-headless-chrome')
const browser = new HeadlessChrome({
headless: true, // If you turn this off, you can actually see the browser navigate with your instructions,
})
async function navigateWebsite(urlToGoTo) {
try {
await browser.init()
const mainTab = await browser.newTab({
privateTab: false
})
await mainTab.inject('jquery')
let cookieName = 'li_at'
let cookieValue = 'cyzzzzzzzzz'
let cookieDomain = '.www.linkedin.com'
await mainTab.setCookie(cookieName, cookieValue, {
domain: cookieDomain
})
// Navigate to a URL
await mainTab.goTo(urlToGoTo)
await mainTab.wait(2000);
// Get a HTML tag value based on class id
let businessName = await mainTab.evaluate(function (selector) {
const selectorHtml = document.querySelector(selector)
return selectorHtml.innerHTML
}, '.org-top-card-module__name');
let industry = await mainTab.evaluate(function (selector) {
const selectorHtml = document.querySelector(selector)
return selectorHtml.innerHTML
}, '.company-industries');
let followers = await mainTab.evaluate(function (selector) {
const selectorHtml = document.querySelector(selector)
return selectorHtml.innerHTML
}, '.org-top-card-module__followers-count');
let details = {
businessName: cleanData(businessName),
industry: cleanData(industry),
followers: cleanData(followers)
}
console.log(details)
// Resize the viewport to full screen size (One use is to take full size screen shots)
await mainTab.resizeFullScreen()
// Take a screenshot
await mainTab.saveScreenshot()
// Close the browser
await browser.close()
} catch (err) {
console.log('ERROR!', err)
}
}
let websites = []
websites.push('https://www.linkedin.com/company/qrious-limited/')
websites.push('https://www.linkedin.com/company/wentworth-consulting-nz-/')
websites.push('https://www.linkedin.com/company/capita/')
websites.forEach(function (i) {
navigateWebsite(i)
})
function cleanData(a) {
return a.result.value.replace(/(\r\n|\n|\r)/gm, "").trim()
}
navigateWebsite() is asynchronous, but it's never awaited. You could use Promise.all(), mapping your list of websites to your nav function, or make sure to await each result:
Promise.all(websites.map(w => navigateWebsite(w)));
// or
for (let w of websites) {
await navigateWebsite(w);
}
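Note that await is only valid inside an async function, so the sequential version needs a wrapper. Also, each navigateWebsite() call inits and closes the same shared browser instance, so running the calls concurrently with Promise.all() may make them trip over each other; the sequential loop is the safer option here. A sketch:
(async () => {
  // visit each site one at a time, waiting for the previous scrape to finish
  for (let w of websites) {
    await navigateWebsite(w);
  }
})();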