Capture a screenshot as a table using Puppeteer - javascript

I am learning to scrape items from a website using Puppeteer, and I am practicing on table data from basketball-reference.com. So far, I have used Puppeteer to search for the stats of my favorite player (Stephen Curry), open his stats page, and take a screenshot of the page, after which the script finishes and closes the browser. However, I cannot seem to scrape the table I need and I am completely stuck.
The following is the code I have written so far:
const puppeteer = require("puppeteer");
/**
 * Opens basketball-reference.com in a visible browser window, submits a search
 * for "Stephen Curry", tries to follow a result link, and saves a screenshot
 * of the resulting page to StephenCurryStats.png before closing everything.
 */
async function run() {
const browser = await puppeteer.launch({
headless: false,
ignoreHTTPSErrors: true,
});
const page = await browser.newPage();
await page.goto(`https://www.basketball-reference.com/`);
await page.waitForSelector("input[name=search]");
// Sets the search box value directly in the page rather than typing it.
await page.$eval("input[name=search]", (el) => (el.value = "Stephen Curry"));
await page.click('input[type="submit"]');
// NOTE: `secondPageLink` is not declared anywhere in this snippet.
await page.waitForSelector(`a[href='${secondPageLink}']`, { visible: true });
await page.click(`a[href='${secondPageLink}']`);
// NOTE: waitForSelector is called here without any selector argument.
await page.waitForSelector();
// Captures the visible viewport only (no fullPage option is passed).
await page.screenshot({
path: `StephenCurryStats.png`,
});
await page.close();
await browser.close();
}
run();
I am trying to scrape the PER GAME table on the following link and take its screenshot. However, I cannot seem to find the right selector to pick and scrape and I am very confused.
The URL is https://www.basketball-reference.com/players/c/curryst01.html

There seems to be at least a couple of issues here. I'm not sure what secondPageLink refers to or the intent behind await page.waitForSelector() (throws TypeError: Cannot read properties of undefined (reading 'startsWith') on my version). I would either select the first search result with .search-item-name a[href] or skip that page entirely by clicking on the first autocompleted name in the search after using page.type(). Even better, you can build the query string URL (e.g. https://www.basketball-reference.com/search/search.fcgi?search=stephen+curry) and navigate to that in your first goto.
The final page loads a video and a ton of Google ad junk. Best to block all requests that aren't relevant to the screenshot.
const puppeteer = require("puppeteer"); // ^16.2.0
let browser;
(async () => {
  browser = await puppeteer.launch({headless: true});
  const pages = await browser.pages();
  const page = pages[0];
  const url = "https://www.basketball-reference.com/";
  await page.setViewport({width: 1300, height: 600});

  // Only requests whose URL starts with one of these prefixes are let
  // through; everything else (video, ads, ...) is aborted.
  const allowed = [
    "https://www.basketball-reference.com",
    "https://cdn.ssref.net",
  ];
  const isAllowed = target => allowed.some(prefix => target.startsWith(prefix));

  await page.setRequestInterception(true);
  page.on("request", request => {
    if (!isAllowed(request.url())) {
      request.abort();
      return;
    }
    request.continue();
  });

  await page.goto(url, {waitUntil: "domcontentloaded"});
  await page.type('input[name="search"]', "Stephen Curry");

  // Click the first search suggestion as soon as it shows up.
  const suggestion = await page.waitForSelector(".search-results-item");
  await suggestion.click();

  // Drop the `.adblock` element from the DOM before capturing.
  const adblockEl = await page.waitForSelector(".adblock");
  await adblockEl.evaluate(node => node.remove());

  await page.waitForNetworkIdle();
  await page.screenshot({
    fullPage: true,
    path: "StephenCurryStats.png",
  });
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
If you just want to capture the per game table:
// same boilerplate above this line
await page.goto(url, {waitUntil: "domcontentloaded"});
await page.type('input[name="search"]', "Stephen Curry");

// Follow the first search suggestion to the player page.
const suggestion = await page.waitForSelector(".search-results-item");
await suggestion.click();

const table = await page.waitForSelector("#per_game");

// The `.scroll_note` element is optional: click it only when it exists.
const scrollNote = await page.$(".scroll_note");
await scrollNote?.click();

// Screenshot just the table element instead of the whole page.
await table.screenshot({path: "StephenCurryStats.png"});
But I'd probably want a CSV for maximum ingestion:
await page.goto(url, {waitUntil: "domcontentloaded"});
await page.type('input[name="search"]', "Stephen Curry");

// Follow the first search suggestion to the player page.
const suggestion = await page.waitForSelector(".search-results-item");
await suggestion.click();

// Poll inside the page until a button whose label contains "CSV" exists.
const btn = await page.waitForFunction(() => {
  const buttons = document.querySelectorAll(
    "#all_per_game-playoffs_per_game li button"
  );
  return [...buttons].find(button => button.textContent.includes("CSV"));
});
await btn.evaluate(el => el.click());

// The CSV text is read from the last child node of #csv_per_game.
const csvEl = await page.waitForSelector("#csv_per_game");
const csv = await csvEl.evaluate(el => {
  const lastNode = [...el.childNodes].at(-1);
  return lastNode.textContent.trim();
});

const rows = csv.split("\n");
const table = rows.map(row => row.split(",")); // TODO use proper CSV parser
console.log(table);

Related

Is it possible to fill in multiple inputs in parallel with Puppeteer?

I want to type into 2 inputs at the same time but in fact both texts go to the second input.
const puppeteer = require("puppeteer");
// Opens example.com in a visible browser and tries to fill two inputs at once.
(async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.goto("https://example.com");
// Both type() calls are started together; as reported in the question, the
// keystrokes interleave and all end up in the second input.
await Promise.all([
page.type("#user", "user"),
page.type("#password", "password"),
]);
await browser.close();
})();
The second input looks like upsaesrsword
The behavior is intended.
Related issue on GitHub:
https://github.com/puppeteer/puppeteer/issues/1958
Alternative solution:
// Set the field's value directly inside the page instead of simulating
// keystrokes, then notify listeners with a bubbling "change" event.
// Arguments after the callback ('user') are passed to it as `text`.
page.$eval(
  '#user',
  (handle, text) => {
    handle.value = text;
    // The event options need an explicit flag: a bare `{bubbles}` shorthand
    // would reference an undeclared variable and throw in the page.
    handle.dispatchEvent(new Event('change', {bubbles: true}));
  },
  'user'
);

Puppeteer getting Error: Evaluation failed: TypeError: Cannot read properties of null (reading 'innerText') when returning multiple values

I'm using Puppeteer to get some data from a site. I need to return multiple values scraped from the site, but for some reason I can only return one at a time. Any time I try returning multiple values (as in the code below), I get the following error: Error: Evaluation failed: TypeError: Cannot read properties of null (reading 'innerText'), and I can't figure out why.
Code
// Loads the SEC EDGAR search page and reads the text of the first matching
// table cells inside the page, logging the result or any error.
(async () => {
try {
// NOTE: `puppeterr` is not declared in this snippet — presumably `puppeteer` was meant.
const chromeBrowser = await puppeterr.launch({ headless: true });
const page = await chromeBrowser.newPage();
// timeout: 0 disables the navigation timeout.
await page.goto("https://www.sec.gov/edgar/search/#/category=form-cat2", {timeout: 0});
const getInfo = await page.evaluate(() => {
// querySelector yields null when nothing matches; reading .innerText on
// null is what produces the error quoted in the question.
const secTableEN = document.querySelector(".table td.entity-name");
const secTableFiled = document.querySelector(".table td.entity-filed");
// Queried but not included in the returned object below.
const secTableLink = document.querySelector(".table td.filetype");
return {
secTableEN: secTableEN.innerText,
secTableFiled: secTableFiled.innerText,
};
})
console.log(getInfo);
await page.close();
await chromeBrowser.close();
} catch (e) {
console.error(e)
}
})();
Two problems:
The page loads the data dynamically, so you should waitForSelector before querying.
.entity-filed should be .filed.
const puppeteer = require("puppeteer"); // ^19.0.0
const url = "<your URL>";
let browser;
(async () => {
  browser = await puppeteer.launch();
  const page = (await browser.pages())[0];

  // Wait for the selector to appear, then return the element's trimmed text.
  const readText = async selector => {
    const handle = await page.waitForSelector(selector);
    return handle.evaluate(node => node.textContent.trim());
  };

  await page.goto(url, {waitUntil: "domcontentloaded"});

  // Read the three cells one after another, in this order.
  const secTableEN = await readText(".table td.entity-name");
  const secTableFiled = await readText(".table td.filed");
  const secTableLink = await readText(".table td.filetype");
  const info = {secTableEN, secTableFiled, secTableLink};
  console.log(info);
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
As an aside, I wouldn't use {timeout: 0}. If the page doesn't load after a minute or so, something is wrong and you should probably report an error rather than hang the script forever.
Another approach is to avoid the DOM and simply intercept the API response with the payload you're interested in:
// ... same boilerplate as above ...
browser = await puppeteer.launch();
const [page] = await browser.pages();

// Register the response wait before navigating, then await it afterwards.
const isSearchIndex = response =>
  response.url() === "https://efts.sec.gov/LATEST/search-index";
const pendingResponse = page.waitForResponse(isSearchIndex);
await page.goto(url, {waitUntil: "domcontentloaded"});
const searchResponse = await pendingResponse;

// Parse the body as JSON and take the first hit's source record.
const body = await searchResponse.text();
const payload = JSON.parse(body);
const firstHit = payload.hits.hits[0]._source;

const info = {
  secTableEN: firstHit.display_names[0],
  secTableFiled: firstHit.file_date,
  // slightly different output than from the DOM
  secTableLink: firstHit.file_type,
};
console.log(info);
// ...

How to click on popup contents in Puppeteer?

I open the 'deliver to' popup but am not able to click on the input field and enter information.
// Opens the store flyer page, clicks the "deliver to" city label and then
// tries to type a postal code into what is assumed to be a new page.
(async () => {
const browser = await puppeteer.launch({headless: false});
const page = await browser.newPage();
const url = 'https://www.tntsupermarket.com/eng/store-flyer';
await page.goto(url, {waitUntil: 'networkidle0'});
// Resolves with the page of the next target the browser creates.
const newPagePromise = new Promise(x => browser.once('targetcreated', target => x(target.page())));
await page.evaluate(()=> {
document.querySelector('span[class="deliverCss-city-FJJ"]').click();
});
// Waits for a 'targetcreated' event before continuing.
const popup = await newPagePromise;
await popup.waitForSelector('input[aria-label="Enter your Postal Code"]');
await popup.focus('input[aria-label="Enter your Postal Code"]');
await popup.click('input[aria-label="Enter your Postal Code"]');
await popup.keyboard.type('a2b');
})();
The pop-up isn't a new page, just a modal element that's shown with JS and without navigation. Removing the navigation promise gives a pretty clear result:
const puppeteer = require("puppeteer"); // ^13.5.1
let browser;
(async () => {
  browser = await puppeteer.launch({headless: false});
  const [page] = await browser.pages();
  const url = "https://www.tntsupermarket.com/eng/store-flyer";
  await page.goto(url, {waitUntil: "networkidle0", timeout: 90000});

  // The "deliver to" pop-up is a modal in the same document, so everything
  // is driven through `page`; no new target/page is awaited.
  const cityEl = await page.waitForSelector('span[class="deliverCss-city-FJJ"]');
  await cityEl.evaluate(el => el.click());

  const postalSel = 'input[aria-label="Enter your Postal Code"]';
  const postalEl = await page.waitForSelector(postalSel);
  await postalEl.type("a2b");

  // just to show that the state is as we wish
  // A plain timer is used instead of page.waitForTimeout(), which is
  // deprecated and no longer available in newer Puppeteer releases.
  await new Promise(resolve => setTimeout(resolve, 30000));
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
This is a bit slow; there's an annoying pop-up you might wish to click off instead of using "networkidle0":
// ... same code
await page.goto(url, {timeout: 90000, waitUntil: "domcontentloaded"});

// Dismiss the promotional pop-up as soon as its close button appears.
await (await page.waitForSelector("#closeActivityPop")).click();

const cityEl = await page.waitForSelector('span[class="deliverCss-city-FJJ"]');
// same code ...
On quick glance, if the page is cached, the pop-up might not show, so you might want to abort page.waitForSelector("#closeActivityPop"); after 30 seconds or so and continue with the code without clicking on it, depending on how flexible you want the script to be.

storing page to variable in puppeteer doesn't work

I'm trying to store the page result in a variable so I can use it to access another page, but I encountered the error "TypeError: Cannot read property 'waitForSelector' of undefined".
const puppeteer = require('puppeteer');
// Logs in on the Instagram login form and then tries to treat the result of
// the submit click as a new page object.
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://www.instagram.com/accounts/login/');
await page.waitForSelector('input[name="username"]');
await page.type('input[name="username"]', 'username');
await page.type('input[name="password"]', 'password');
// page.click() does not return a page, so `mainPage` is undefined and the
// calls below fail with the TypeError quoted in the question.
const mainPage = await page.click('button[type="submit"]');
await mainPage.pdf({path: 'page.pdf', format: 'A4'});
// NOTE: the URL below is not wrapped in quotes in the original snippet.
mainPage.goto(https://www.instagram.com/direct/inbox/);
mainPage.waitForSelector('button[name="Send Message"]');
//some additional code
})();
page.click won't return a page. You can use waitForNavigation there.
// Start the navigation wait and the submit click together and await both;
// keep using `page` afterwards, since click() does not return a page.
await Promise.all([
page.waitForNavigation(),
page.click('button[type="submit"]')]);

Puppeteer unable to use get property

Cannot read property 'getProperty' of undefined is the error that I get.
const puppeteer = require('puppeteer');
/**
 * Opens the given URL in a visible browser and tries to log the text content
 * of one element located by an XPath expression.
 * Errors are caught and only their message is logged.
 *
 * @param {string} url - Page to open.
 */
async function scrapeUdemy(url) {
try {
const browser = await puppeteer.launch({headless: false, slowmo: 250});
const page = await browser.newPage()
await page.goto(url)
// `el` is undefined when the XPath query returns no elements, which leads
// to the "Cannot read property 'getProperty' of undefined" error.
const [el] = await page.$x('//*[#id="udemy"]/div[1]/div[4]/div/div/div[2]/div/div/div[1]/a/div[1]/div[1]');
const txt = await el.getProperty('textContent');
// NOTE: `src` and `srcTxt` are not declared anywhere in this snippet.
const rawTxt = await src.jsonValue();
console.log({srcTxt});
browser.close();
}
catch(err) {
console.log(err.message);
}
}
scrapeUdemy('https://www.udemy.com/user/eren-cem-salta/')
I tried using other versions, but it still does not work. It does not work with the catch block either.
The element that you want to get is loaded with AJAX after the page started and you have to wait until it appears in the DOM:
// Wait until an element matching the course-card heading selector is in the DOM.
await page.waitForSelector('[data-purpose="course-card-container"] div.udlite-heading-sm');
And why not use the same selector to get all of the cards:
// Collect the text of every course-card heading in a single in-page pass.
const titles = await page.evaluate(() =>
  Array.from(
    document.querySelectorAll(
      '[data-purpose="course-card-container"] div.udlite-heading-sm'
    ),
    (heading) => heading.textContent
  )
);

Categories