When taking screenshots using puppeteer, dynamic elements with the .menu__link class are required to change innerHTML to a stub.
I use BackstopJs puppet/onReady.js
When I try this, only the first element on the page is replaced:
module.exports = async (page) => {
const myLocalValue = "Test";
const tweets = await page.$$('.menu__link');
for (const tweet of tweets) {
await page.$eval('.menu__link', (el, value) => el.innerHTML = value, myLocalValue)
}
};
And this code does not work at all:
module.exports = async (page) => {
const myLocalValue = "Test";
const tweets = await page.$$('.menu__link');
for (const tweet of tweets) {
await page.$eval(tweet, (el, value) => el.innerHTML = value, myLocalValue)
}
};
Please tell me how to replace innerHTML on the entire page for all .menu__link using puppeteer?
You can use $$eval
await page.$$eval('. menu__link', (links, value) => links.forEach(el => el.innerHTML = value), 'myLocalValue');
Related
So I have been a mess for days. I am scraping a website for particular information. The problem is that the website has two css classes but with an identical name. I want to use the link and text from the first css class. Attached is the image of what I have. I want to only use the href values from 1 and not the ones from the two "regions".
const cheerio = require('cheerio');
const axios = require("axios");
const siteUrl = "https://worldpostalcode.com/nigeria/abia/";
const fetchData = async () => {
const result = await axios.get(siteUrl);
return cheerio.load(result.data);
};
const getData = async (html) => {
const stateList = []
const $ = await fetchData();
const stateUrl = $('.regions',html);
//console.log(stateUrl.length)
console.log(stateUrl.length)
for (let index = 0; index < 1; index++) {
let firstRegion = $(stateUrl[index],'a')
stateList.push(firstRegion)
}
console.log(stateList)
}
getData()
Help please
I would use the previous h2 text:
$('h2:contains(Regions) + div.regions a')
I'm trying to scrape HTML data from a variable that holds HTML data. You can see my annotations, they are marked with " << ".
Unfortunately, evaluate only works on a page on not in a div. Could someone tell me how I could scrape information from a variable containing HTML?
Are there perhaps other methods of scraping?
I tried this in the forEach loop as well, but this resulted in the first mealname of the original document.
let mealName = htmlOfOneProduct.document.querySelector("div.meal__description-texts.js-meal-description-text > span > span").innerText;
My code with notes:
const puppeteer = require('puppeteer');
function run () {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(" "); << Meal website
let urls = await page.evaluate(() => {
let results = [];
let items = document.querySelectorAll('div.meal__wrapper'); << Gets all the meals from a page
items.forEach((item) => {
let htmlOfOneProduct = item.innerHTML; << Gets the HTML of each meal
let mealName = htmlOfOne.evaluate(() => document.querySelector('meal-name').textContent); << Not working, should get the meal-name from the div.
results.push({
mealName: mealName
});
});
return results;
})
browser.close();
return resolve(urls);
} catch (e) {
return reject(e);
}
})
}
run().then(console.log).catch(console.error);
As you did not provide the site URL, I cannot check my proposal, sorry.
item.innerHTML returns a string which has no evaluate() method. Try this simpler way:
items.forEach((item) => {
let mealName = item.querySelector('meal-name').textContent;
results.push({
mealName: mealName
});
});
Perhaps let htmlOfOneProduct = item.innerHTML; << Gets the HTML of each meal it's not necessary.
If you only need the content of something you could directly do item.innerText or item.name or any other propriety of the element.
In the end something like this should be possible:
items.forEach((item) => {
let mealName = item.querySelector('meal-name').innerText
results.push({
mealName: mealName
});
});
You can also combine your CSS selectors and use Array.from() to simplify scraping the innerText of the elements:
let urls = await page.evaluate(() => {
return Array.from(document.querySelectorAll('div.meal__wrapper span.meal-name'), e => ({
mealName: e.innerText,
});
});
So I'm trying to crawl a site using Puppeteer. All the data I'm looking to grab is in multiple tables. Specifically, I'm trying to grab the data from a single table. I was able to grab the specific table using a very verbose .querySelector(table.myclass ~ table.myclass), so now my issue is, my code is grabbing the first item of each table (starting from the correct table, which is the 2nd table), but I can't find a way to get it to just grab all the data in only the 2nd table.
const puppeteer = require('puppeteer');
const myUrl = "https://coolurl.com";
(async () => {
const browser = await puppeteer.launch({
headless: true
});
const page = (await browser.pages())[0];
await page.setViewport({
width: 1920,
height: 926
});
await page.goto(myUrl);
let gameData = await page.evaluate(() => {
let games = [];
let gamesElms = document.querySelectorAll('table.myclass ~ table.myclass');
gamesElms.forEach((gameelement) => {
let gameJson = {};
try {
gameJson.name = gameelement.querySelector('.myclass2').textContent;
} catch (exception) {
console.warn(exception);
}
games.push(gameJson);
});
return games;
})
console.log(gameData);
browser.close();
})();
You can use either of the following methods to select the second table:
let gamesElms = document.querySelectorAll('table.myclass')[1];
let gamesElms = document.querySelector('table.myclass:nth-child(2)');
Additionally, you can use the example below to push all of the data from the table to an array:
let games = Array.from(document.querySelectorAll('table.myclass:nth-child(2) tr'), e => {
return Array.from(e.querySelectorAll('th, td'), e => e.textContent);
});
// console.log(games[rowNum][cellNum]); <-- textContent
Background:
Using NodeJS/CucumberJS/Puppeteer to build end-to-end regression test for an emberJS solution.
Problem:
Selecting (page.click) and getting textContent of one of the elements when there are several dynamic elements with the same selector? (In my case, I have 4 elements with the same selector = [data-test-foo4="true"])
I know, that with:
const text = await page.evaluate( () => document.querySelector('[data-test-foo4="true"]').textContent );
I can get the text of the first element, but how do I select the other elements with the same selector? I've tried:
var text = await page.evaluate( () => document.querySelectorAll('[data-test-foo4="true"]').textContent )[1];
console.log('text = ' + text);
but it gives me 'text = undefined'
Also, the following:
await page.click('[data-test-foo4="true"]');
selects the first elements with that selector, but how can I select the next one with that selector?
You can use Array.from() to create an array containing all of the textContent values of each element matching your selector:
const text = await page.evaluate(() => Array.from(document.querySelectorAll('[data-test-foo4="true"]'), element => element.textContent));
console.log(text[0]);
console.log(text[1]);
console.log(text[2]);
If you need to click more than one element containing a given selector, you can create an ElementHandle array using page.$$() and click each one using elementHandle.click():
const example = await page.$$('[data-test-foo4="true"]');
await example[0].click();
await example[1].click();
await example[2].click();
https://github.com/puppeteer/puppeteer/blob/v5.5.0/docs/api.md#frameselector-1
const pageFrame = page.mainFrame();
const elems = await pageFrame.$$(selector);
Not mentioned yet is the awesome page.$$eval which is basically a wrapper for this common pattern:
page.evaluate(() => callback([...document.querySelectorAll(selector)]))
For example,
const puppeteer = require("puppeteer"); // ^19.1.0
const html = `<!DOCTYPE html>
<html>
<body>
<ul>
<li data-test-foo4="true">red</li>
<li data-test-foo4="false">blue</li>
<li data-test-foo4="true">purple</li>
</ul>
</body>
</html>`;
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.setContent(html);
const sel = '[data-test-foo4="true"]';
const text = await page.$$eval(sel, els => els.map(e => e.textContent));
console.log(text); // => [ 'red', 'purple' ]
console.log(text[0]); // => 'red'
console.log(text[1]); // => 'purple'
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
If you want to pass additional data from Node for $$eval to use in the browser context, you can add additional arguments:
const text = await page.$$eval(
'[data-test-foo4="true"]',
(els, data) => els.map(e => e.textContent + data),
"X" // 'data' passed to the callback
);
console.log(text); // => [ 'redX', 'purpleX' ]
You can use page.$$eval to issue a native DOM click on each element or on a specific element:
// click all
await page.$$eval(sel, els => els.forEach(el => el.click()));
// click one (hardcoded)
await page.$$eval(sel, els => els[1].click());
// click one (passing `n` from Node)
await page.$$eval(sel, (els, n) => els[n].click(), n);
or use page.$$ to return the elements back to Node to issue trusted Puppeteer clicks:
const els = await page.$$('[data-test-foo4="true"]');
for (const el of els) {
await el.click();
}
// or click the n-th:
await els[n].click();
Pertinent to OP's question, you can always access the n-th item of these arrays with the usual syntax els[n] as shown above, but often, it's best to select based on the :nth-child pseudoselector. This depends on how the elements are arranged in the DOM, though, so it's not as general of a solution as array access.
Does anybody know how to get the innerHTML or text of an element? Or even better; how to click an element with a specific innerHTML? This is how it would work with normal JavaScript:
var found = false
$(selector).each(function() {
if (found) return;
else if ($(this).text().replace(/[^0-9]/g, '') === '5' {
$(this).trigger('click');
found = true
}
});
Thanks in advance for any help!
This is how i get innerHTML:
page.$eval(selector, (element) => {
return element.innerHTML
})
Returning innerHTML of an Element
You can use the following methods to return the innerHTML of an element:
page.$eval()
const inner_html = await page.$eval('#example', element => element.innerHTML);
page.evaluate()
const inner_html = await page.evaluate(() => document.querySelector('#example').innerHTML);
page.$() / elementHandle.getProperty() / jsHandle.jsonValue()
const element = await page.$('#example');
const element_property = await element.getProperty('innerHTML');
const inner_html = await element_property.jsonValue();
Clicking an Element with Specific innerHTML
You can use the following methods to click on an element based on the innerHTML that is contained within the element:
page.$$eval()
await page.$$eval('.example', elements => {
const element = elements.find(element => element.innerHTML === '<h1>Hello, world!</h1>');
element.click();
});
page.evaluate()
await page.evaluate(() => {
const elements = [...document.querySelectorAll('.example')];
const element = elements.find(element => element.innerHTML === '<h1>Hello, world!</h1>');
element.click();
});
page.evaluateHandle() / elementHandle.click()
const element = await page.evaluateHandle(() => {
const elements = [...document.querySelectorAll('.example')];
const element = elements.find(element => element.innerHTML === '<h1>Hello, world!</h1>');
return element;
});
await element.click();
This should work with puppeteer:)
const page = await browser.newPage();
const title = await page.evaluate(el => el.innerHTML, await page.$('h1'));
You can leverage the page.$$(selector) to get all your target elments and then use page.evaluate() to get the content(innerHTML), then apply your criteria. It should look something like:
const targetEls = await page.$$('yourFancySelector');
for(let target of targetEls){
const iHtml = await page.evaluate(el => el.innerHTML, target);
if (iHtml.replace(/[^0-9]/g, '') === '5') {
await target.click();
break;
}
}
I can never get the .innerHtml to work reliable. I always do the following:
let els = page.$$('selector');
for (let el of els) {
let content = await (await el.getProperty('textContent')).jsonValue();
}
Then you have your text in the 'content' variable.
With regard to this part of your question...
"Or even better; how to click an element with a specific innerHTML."
There are some particulars around innerHTML, innerText, and textContent that might give you grief. Which you can work-around using a sufficiently loose XPath query with Puppeteer v1.1.1.
Something like this:
const el = await page.$x('//*[text()[contains(., "search-text-here")]]');
await el[0].click({
button: 'left',
clickCount: 1,
delay: 50
});
Just keep in mind that you will get an array of ElementHandles back from that query. So... the particular item you are looking for might not be at [0] if your text isn't unique.
Options passed to .click() aren't necessary if all you need is a single left-click.
You can simply write as below. (no need await sentence in the last part)
const center = await page.$eval('h2.font-34.uppercase > strong', e => e.innerHTML);
<div id="innerHTML">Hello</div>
var myInnerHtml = document.getElementById("innerHTML").innerHTML;
console.log(myInnerHtml);