I'm trying to scrape data from a variable that holds a chunk of HTML. You can see my annotations; they are marked with " << ".
Unfortunately, evaluate only works on a page and not on a div. Could someone tell me how I can scrape information from a variable containing HTML?
Are there perhaps other methods of scraping?
I tried the following inside the forEach loop as well, but it only returned the first meal name of the original document.
let mealName = htmlOfOneProduct.document.querySelector("div.meal__description-texts.js-meal-description-text > span > span").innerText;
My code with notes:
const puppeteer = require('puppeteer');
function run () {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(" "); << Meal website
let urls = await page.evaluate(() => {
let results = [];
let items = document.querySelectorAll('div.meal__wrapper'); << Gets all the meals from a page
items.forEach((item) => {
let htmlOfOneProduct = item.innerHTML; << Gets the HTML of each meal
let mealName = htmlOfOne.evaluate(() => document.querySelector('meal-name').textContent); << Not working, should get the meal-name from the div.
results.push({
mealName: mealName
});
});
return results;
})
browser.close();
return resolve(urls);
} catch (e) {
return reject(e);
}
})
}
run().then(console.log).catch(console.error);
As you did not provide the site URL, I cannot check my proposal, sorry.
item.innerHTML returns a string which has no evaluate() method. Try this simpler way:
items.forEach((item) => {
let mealName = item.querySelector('meal-name').textContent;
results.push({
mealName: mealName
});
});
Perhaps the line let htmlOfOneProduct = item.innerHTML; (<< Gets the HTML of each meal) isn't necessary at all.
If you only need the content of something, you can directly use item.innerText, item.name, or any other property of the element.
In the end something like this should be possible:
items.forEach((item) => {
let mealName = item.querySelector('meal-name').innerText
results.push({
mealName: mealName
});
});
You can also combine your CSS selectors and use Array.from() to simplify scraping the innerText of the elements:
let urls = await page.evaluate(() => {
return Array.from(document.querySelectorAll('div.meal__wrapper span.meal-name'), e => ({
mealName: e.innerText,
}));
});
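Put together with the run() wrapper from the question, a minimal sketch could look like this (the page URL is left blank as in the question, and the span.meal-name selector is an assumption based on the markup described):
const puppeteer = require('puppeteer');

async function run() {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto(" "); // << Meal website, omitted as in the question
        // Runs in the page context and returns one object per meal name found
        return await page.evaluate(() =>
            Array.from(document.querySelectorAll('div.meal__wrapper span.meal-name'), e => ({
                mealName: e.innerText,
            }))
        );
    } finally {
        await browser.close();
    }
}

run().then(console.log).catch(console.error);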
I'm making a program that consists of three different functions:
downloadPDF: download a PDF from the web
getPDF: read and parse the pdf
getData: loop through the sites and call getPDF for each one
The problem I'm having is in the third function (getData): its for...of loop runs getPDF, but it doesn't seem to let getPDF finish before trying to console.log the result that getPDF returns.
Here are the three functions:
async function downloadPDF(pdfURL, outputFilename) {
let pdfBuffer = await request.get({uri: pdfURL, encoding: null});
console.log("Writing downloaded PDF file to " + outputFilename + "...");
fs.writeFileSync(outputFilename, pdfBuffer);
}
async function getPDF(query, siteName, templateUrl, charToReplace) {
const currentWeek = currentWeekNumber().toString();
await downloadPDF(templateUrl.replace(charToReplace, currentWeek), "temp/pdf.pdf");
var resultsArray = []
let dataBuffer = fs.readFileSync("temp/pdf.pdf");
pdf(dataBuffer).then(function(data) {
pdfContent = data.text;
const splittedArray = pdfContent.split("\n");
const parsedArray = splittedArray.map((item, index) => {
if(item.includes(query)) {
resultsArray.push({result: item, caseId: splittedArray[index-1].split(',', 1)[0], site: siteName});
}
}).filter(value => value);
return(resultsArray);
});
fs.unlinkSync("temp/pdf.pdf"); //deletes the downloaded file
}
async function getData(query, desiredSites) {
var resultsArray = []
for (const value of desiredSites) {
let result = await getPDF(query, sitesList.sites[value].name, sitesList.sites[value].templateUrl, sitesList.sites[value].charToReplace);
console.log(result)
}
}
getData("test", ['a', 'b']);
In the bottom function (getData), the console.log prints undefined.
I'm guessing this has something to do with the promises. Any ideas? Thanks a lot!
In getPDF, you should chain all your async calls with await instead of .then (or the other way around, but not both).
You can mix await with .then, but then it is hard to keep the chain linear. The whole reason people use await is to make the code read linearly and stay easy to maintain.
async function downloadPDF(pdfURL, outputFilename) {
let pdfBuffer = await request.get({ uri: pdfURL, encoding: null });
console.log("Writing downloaded PDF file to " + outputFilename + "...");
fs.writeFileSync(outputFilename, pdfBuffer);
}
async function getPDF(query, siteName, templateUrl, charToReplace) {
const currentWeek = currentWeekNumber().toString();
await downloadPDF(
templateUrl.replace(charToReplace, currentWeek),
"temp/pdf.pdf"
);
let dataBuffer = fs.readFileSync("temp/pdf.pdf");
const data = await pdf(dataBuffer);
const pdfContent = data.text;
const splittedArray = pdfContent.split("\n");
const resultsArray = splittedArray
.map((item, index) => ({ item, index }))
.filter(({ item }) => item.includes(query))
.map(({ item, index }) => ({
result: item,
caseId: splittedArray[index - 1].split(",", 1)[0],
site: siteName,
}));
fs.unlinkSync("temp/pdf.pdf"); //deletes the downloaded file
return resultsArray;
}
async function getData(query, desiredSites) {
for (const value of desiredSites) {
let result = await getPDF(
query,
sitesList.sites[value].name,
sitesList.sites[value].templateUrl,
sitesList.sites[value].charToReplace
);
console.log(result);
}
}
getData("test", ["a", "b"])
.then(() => console.log("done"))
.catch(console.log);
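If getData should also return the combined results (the unused resultsArray in the original getData suggests that was the intent), a sketch of how it could collect them, using the same sitesList structure assumed by the question:
async function getData(query, desiredSites) {
  const allResults = [];
  for (const value of desiredSites) {
    const result = await getPDF(
      query,
      sitesList.sites[value].name,
      sitesList.sites[value].templateUrl,
      sitesList.sites[value].charToReplace
    );
    console.log(result);
    allResults.push(...result); // flatten each site's matches into one list
  }
  return allResults;
}

// getData("test", ["a", "b"]).then(all => console.log("total matches:", all.length));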
I'm fetching an article list from my API, and for each fetched article title I call the Unsplash API to get a related image.
This is my code:
let url = 'http://127.0.0.1:8000/api';
async function getData(url) {
const res = await fetch(url);
const objects = await res.json();
await Promise.all(objects.map(async (object) => {
const res = await fetch('https://api.unsplash.com/search/photos?client_id=XXX&content_filter=high&per_page=1&query=' + object.title);
const image = await res.json();
object.image_url = image.results[0].urls.small
object.image_alt = image.results[0].alt_description
}));
}
let articles_1 = getData(url + '/articles/index/1/');
let articles_2 = getData(url + '/articles/index/2/');
let articles_3 = getData(url + '/articles/index/3/');
I am showing three different categories at once on the same page. That's why I call that function three times.
Question:
When this function runs, results are shown only after both the article data and the images have been fetched. But I want to show the article data first, as soon as it has been fetched, and then the images once they arrive, to shorten the user's waiting time. How can I achieve this, whether with a Svelte reactive declaration or plain JavaScript?
You would want to separate the two concerns into their own functions, so they can be called in sequence.
const endpoint = 'http://127.0.0.1:8000/api';
const getArticles = async (url) => {
return (await fetch(url)).json();
};
const renderArticles = async (articles) => {
// render article set and return it as a DOM object
return articlesDOM;
};
const getImageForArticle = async (articleNode) => {
// assumes the rendered article node exposes the article title (e.g. via its title attribute)
const res = await fetch('https://api.unsplash.com/search/photos?client_id=XXX&content_filter=high&per_page=1&query=' + articleNode.title);
const data = await res.json();
const img = new Image();
img.src = data.results[0].urls.small;
img.alt = data.results[0].alt_description;
return {img, articleNode};
};
const renderImage = async (stuff) => {
const {img, articleNode} = stuff;
// inject your img into your article
};
// now call in sequence
getArticles(endpoint+'/articles/index/1/').then(renderArticles).then(articleNodes => {
const promises = articleNodes.map(articleNode => {
return getImageForArticle(articleNode).then(renderImage);
});
return Promise.all(promises);
});
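To cover the three category listings from the question, the same chain can simply be started once per endpoint, so each category's articles render as soon as they arrive and the images fill in afterwards (loadCategory is a hypothetical helper name; endpoint and the index URLs are taken from the question):
const loadCategory = (url) =>
    getArticles(url)
        .then(renderArticles)
        .then(articleNodes =>
            Promise.all(
                articleNodes.map(articleNode =>
                    getImageForArticle(articleNode).then(renderImage)
                )
            )
        );

loadCategory(endpoint + '/articles/index/1/');
loadCategory(endpoint + '/articles/index/2/');
loadCategory(endpoint + '/articles/index/3/');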
While I'm not completely sure what you're trying to do (I don't have a minimal working example), here's my best attempt at it:
var url = 'http://127.0.0.1:8000/api';
async function getData(url) {
var data = fetch(url)
.then(data => data.json())
await data.then(data => ArticleFunc(data))
await data.then(function(data) {
data.map(function(object) {
fetch('https://api.unsplash.com/search/photos?client_id=XXX&content_filter=high&per_page=1&query=' + object.title)
.then(res => res.json())
.then(function(image) {
object.image_url = image.results[0].urls.small
object.image_alt = image.results[0].alt_description
ImageFunc(object)
})
})
})
}
function ArticleFunc(data){
//display article
}
function ImageFunc(data){
//display image
}
getData(url + '/articles/index/1/');
getData(url + '/articles/index/2/');
getData(url + '/articles/index/3/');
Note that this is to be treated as pseudocode, as again, it is untested due to the absence of a minimal working example.
When taking screenshots with Puppeteer, I need to replace the innerHTML of the dynamic elements with the .menu__link class with a stub.
I use the BackstopJS puppet/onReady.js hook.
When I try this, only the first element on the page is replaced:
module.exports = async (page) => {
const myLocalValue = "Test";
const tweets = await page.$$('.menu__link');
for (const tweet of tweets) {
await page.$eval('.menu__link', (el, value) => el.innerHTML = value, myLocalValue)
}
};
And this code does not work at all:
module.exports = async (page) => {
const myLocalValue = "Test";
const tweets = await page.$$('.menu__link');
for (const tweet of tweets) {
await page.$eval(tweet, (el, value) => el.innerHTML = value, myLocalValue)
}
};
Please tell me how to replace innerHTML on the entire page for all .menu__link using puppeteer?
You can use $$eval:
await page.$$eval('.menu__link', (links, value) => links.forEach(el => el.innerHTML = value), myLocalValue);
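Dropped into the onReady.js hook from the question, that could look like this (a minimal sketch; myLocalValue is the stub value from the question):
module.exports = async (page) => {
    const myLocalValue = "Test";
    // Replaces the innerHTML of every .menu__link element in a single page-side pass
    await page.$$eval(
        '.menu__link',
        (links, value) => links.forEach(el => el.innerHTML = value),
        myLocalValue
    );
};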
Running on Puppeteer, everything up to date.
The intended process is to go to the website, where the URL is url/{search item}, and run through the list of search names. Then for each search item's results page, get the name, price and image URL of each listing. Now there's an error that it cannot find the selector. Appreciate any help on this, many thanks!
Layout of the data of the website is as follows:
<div class="items-box-content">
<section class="items-box">
<a href="https://listingurl">
<figure class="items-box-photo">
<img data-src="https://imageurl.jpg" class=" lazyloaded" src="https://imageurl.jpg">
</figure>
<div class="items-box-main">
<h3 class="items-box-name"> listing name </h3>
<div class="items-box-figure">
<div class="items-price font-4"> $29.95 </div> // item's price
</h3>
</div>
And what I have now is this (which throws the error):
const puppeteer = require('puppeteer');
const searches = ["a", "b", "c"]; //appended to url
(async () => {
const browser = await puppeteer.launch({ headless: false });
let results =[];
for (const search of searches) {
try {
page = await browser.newPage();
await page.goto(`https://weburl/?keyword=${search}`);
await page.evaluate(() => { document.querySelector('div[class*="items-box"]').scrollIntoView();});
let elements = await page.$$('div[class*="items-box"]');
for (let element of elements) {
let listImg = await element.$eval(('img[class="items-box-photo]'), img => img.getAttribute('src'));
let listTitle = await element.$eval(('d[class="items-box-main"] > h[class="items-box-name"]'), node => node.innerText.trim());
let listPrice = await element.$eval(('d[class="items-box-figure"] > d[class="items-price"]'), node => node.innerText.trim());
let listUrl = await element.$eval(('d[class="items-box-content"] > a[class*="items-box"]'), node => node.getAttribute('href'));
results.push({
listImg,
listTitle,
listPrice,
listUrl
})
return results;
}
} finally {
await page.close
}
}
})();
The error thrown is
(node:5168) UnhandledPromiseRejectionWarning: Error: Error: failed to
find element matching selector "img[class="items-box-photo]"
The problem is right there in the error message (Error: failed to find element matching selector ...).
The selectors are wrong in the following lines:
let listImg = await element.$eval(('img[class="items-box-photo]'), img => img.getAttribute('src'));
let listTitle = await element.$eval(('d[class="items-box-main"] > h[class="items-box-name"]'), node => node.innerText.trim());
let listPrice = await element.$eval(('d[class="items-box-figure"] > d[class="items-price"]'), node => node.innerText.trim());
let listUrl = await element.$eval(('d[class="items-box-content"] > a[class*="items-box"]'), node => node.getAttribute('href'));
According to the HTML code you have given, these should be:
let listImg = await element.$eval('img.lazyloaded', img => img.getAttribute('src'));
let listTitle = await element.$eval('h3.items-box-name', node => node.innerText.trim());
let listPrice = await element.$eval('div.items-price', node => node.innerText.trim());
let listUrl = await element.$eval('div.items-box-content a', node => node.getAttribute('href'));
Note that instead of using [class=...], the proper way to query a class is with the class selector: a leading dot, e.g. .items-box-name.
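For example, if an element carries more than one class, say class="items-box-name font-2" as in the other answer below, the [class=...] attribute selector has to match the full attribute string, while the class selector matches a single token (a quick illustration, not tied to the live site):
// <h3 class="items-box-name font-2">...</h3>
document.querySelector('h3[class="items-box-name"]');        // null: the attribute value must match exactly
document.querySelector('h3[class="items-box-name font-2"]'); // matches, but breaks if the class list changes
document.querySelector('h3.items-box-name');                 // matches: checks only for the single class token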
Here is your code updated with my test/debug changes.
const puppeteer = require('puppeteer');
const searches = ["a"];
(async () => {
const browser = await puppeteer.launch({ headless: false });
function delay(timeout) {
return new Promise((resolve) => {
setTimeout(resolve, timeout);
});
}
let results = [];
for (const search of searches) {
try {
page = await browser.newPage();
await page.goto(`https:url/`);
await page.evaluate(() => { document.querySelector('section[class*="items-box"]').scrollIntoView(); });
let elements = await page.$$('section[class*="items-box"]');
console.log(elements.length)
console.log('wait 6 seconds')
await delay(6000);
for (let element of elements) {
// await delay(6000);
let listImg = await element.$eval(('img'), img => img.getAttribute('src'));
let listTitle = await element.$eval(('h3[class="items-box-name font-2"]'), node => node.innerText.trim());
let listPrice = await element.$eval(('div[class="items-box-price font-5"]'), node => node.innerText.trim());
let listUrl = await element.$eval(('div[class="items-box-content clearfix"] a'), node => node.getAttribute('href'));
results.push({
listImg,
listTitle,
listPrice,
listUrl
});
}
debugger;
} catch (error) {
console.log(error)
} finally {
//await page.close();
await browser.close();
}
}
console.log(results)
return results;
})();
Updated content:
1. Moved the return out of the for loop:
for () {
return result;
}
=>
for () {
}
return result;
2. Updated the query selectors:
section[class*="items-box"]
img // there is only one img tag inside each "element"
h3[class="items-box-name font-2"] // removed the outer element
div[class="items-box-figure"] > div[class="items-price font-4"]
=> div[class="items-box-price font-5"] // the class name appears to be items-box-price on my side
div[class="items-box-content clearfix"] a
3. Updated the sleep duration to 6 seconds; the right value is relative to network speed (page load duration).
4. Added try/catch/finally: the catch lets the script move on to the next step even if one step crashes.
So I'm trying to crawl a site using Puppeteer. All the data I'm looking to grab is in multiple tables, and specifically I'm trying to grab the data from a single one of them. I was able to grab that table using a rather verbose .querySelector(table.myclass ~ table.myclass). My issue now is that my code grabs the first item of each table (starting from the correct one, which is the 2nd table), but I can't find a way to make it grab all the data in only the 2nd table.
const puppeteer = require('puppeteer');
const myUrl = "https://coolurl.com";
(async () => {
const browser = await puppeteer.launch({
headless: true
});
const page = (await browser.pages())[0];
await page.setViewport({
width: 1920,
height: 926
});
await page.goto(myUrl);
let gameData = await page.evaluate(() => {
let games = [];
let gamesElms = document.querySelectorAll('table.myclass ~ table.myclass');
gamesElms.forEach((gameelement) => {
let gameJson = {};
try {
gameJson.name = gameelement.querySelector('.myclass2').textContent;
} catch (exception) {
console.warn(exception);
}
games.push(gameJson);
});
return games;
})
console.log(gameData);
browser.close();
})();
You can use either of the following methods to select the second table:
let gamesElms = document.querySelectorAll('table.myclass')[1];
let gamesElms = document.querySelector('table.myclass:nth-child(2)');
Note that :nth-child(2) only works if the second table is also the second child of its parent element; the querySelectorAll(...)[1] form is independent of surrounding siblings.
Additionally, you can use the example below to push all of the data from the table to an array:
let games = Array.from(document.querySelectorAll('table.myclass:nth-child(2) tr'), row => {
return Array.from(row.querySelectorAll('th, td'), cell => cell.textContent);
});
// console.log(games[rowNum][cellNum]); <-- textContent
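For completeness, a sketch of how that could slot into the page.evaluate call from the question (table.myclass and the surrounding setup are taken from the original code; the row extraction is the same as above, just using the index-based table selection):
let gameData = await page.evaluate(() => {
    // The second table with the class, regardless of its position among siblings
    const table = document.querySelectorAll('table.myclass')[1];
    // Every row of that table, as an array of cell texts
    return Array.from(table.querySelectorAll('tr'), row =>
        Array.from(row.querySelectorAll('th, td'), cell => cell.textContent)
    );
});
console.log(gameData); // gameData[rowNum][cellNum]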