I am trying to scrape GitHub contributor insights using Node.js, Puppeteer, and cheerio.
const cheerio = require('cheerio');
const puppeteer = require('puppeteer');
const url = 'https://github.com/grey-software/grey.software/graphs/contributors';
// Opens the contributors graph page (`url`) in Puppeteer and logs every
// `.contrib-person` node that cheerio finds in the resulting markup.
(async () => {
  const chromium = await puppeteer.launch();
  const tab = await chromium.newPage();

  // This wait is issued on the freshly opened tab, before the navigation below.
  const waitOptions = { visible: true };
  await tab.waitForSelector('#contributors', waitOptions);

  await tab.goto(url);

  // Hand the serialized page markup to cheerio for querying.
  const html = await tab.content();
  const $ = cheerio.load(html);
  const people = $('.contrib-person');
  people.each((_index, node) => {
    console.log(node);
  });

  await chromium.close();
})();
I get the following error when I run the code above
UnhandledPromiseRejectionWarning: TimeoutError: waiting for selector "#contributors" failed: timeout 30000ms exceeded
The #contributors div should load within the 30 seconds, but I always get this timeout.
Note: page.waitForNavigation() gives the same timeout error
Your issue is that you are waiting for the selector to exist before you have navigated to the website in the first place. Call page.waitForSelector() after page.goto().
Related
import puppeteer from 'puppeteer'
/**
 * Opens `url` in a Puppeteer-controlled browser and logs the `value` of the
 * hidden `abuseID` input on that page.
 *
 * @param {string} url - Address of the page to load.
 * @returns {Promise<void>}
 */
async function scrapeProduct(url) {
  const ABUSE_ID_INPUT = 'input[type="hidden"][name="abuseID"]';

  const browser = await puppeteer.launch();
  const tab = await browser.newPage();
  await tab.goto(url);

  // The wait uses `hidden: true` with a 15000 ms timeout; whatever it resolves
  // to is forwarded as the argument of the in-page callback below.
  const waitOptions = { hidden: true, timeout: 15000 };
  const inputHandle = await tab.waitForSelector(ABUSE_ID_INPUT, waitOptions);

  const abuseID = await tab.evaluate((input) => input.value, inputHandle);
  console.log(abuseID);
}

scrapeProduct('https://steamcommunity.com/id/lupusRe');
I don't know where I am going wrong but I am getting an error of
file:///C:/Users/charl/Desktop/Code/abg/steam-search/node_modules/puppeteer/lib/esm/puppeteer/common/ExecutionContext.js:282
throw new Error('Evaluation failed: ' + getExceptionMessage(exceptionDetails));
^
Error: Evaluation failed: TypeError: Cannot read properties of null (reading 'value')
at pptr://__puppeteer_evaluation_script__:1:21
at ExecutionContext._ExecutionContext_evaluate (file:///C:/Users/charl/Desktop/Code/abg/steam-search/node_modules/puppeteer/lib/esm/puppeteer/common/ExecutionContext.js:282:15)
at processTicksAndRejections (node:internal/process/task_queues:96:5)
at async ExecutionContext.evaluate (file:///C:/Users/charl/Desktop/Code/abg/steam-search/node_modules/puppeteer/lib/esm/puppeteer/common/ExecutionContext.js:114:16)
at async scrapeProduct (file:///C:/Users/charl/Desktop/Code/abg/steam-search/test.js:13:21)
My goal is to be able to get the steam ID in the below tag
<input type="hidden" name="abuseID" value="76561198036553525">
I am open to using tools besides Puppeteer, but it's the main thing that I have been using for web scraping.
This is on the official steam community website if that matters.
Copy Xpath of an element
and use page.$x(Xpath)
Edit: I've tried a couple of things, and it seems like you can get a person's steamID through a <script> tag
import puppeteer from 'puppeteer'
/**
 * Loads `url`, collects the text of every <script> element on the page, and
 * logs the second comma-separated field of the script at `scriptIndex`.
 *
 * The browser is always closed, including when navigation or evaluation
 * fails, so a failed run does not leave a Chromium process behind.
 *
 * @param {string} url - Profile page to open.
 * @param {number} [scriptIndex=38] - Position of the <script> element to read.
 *   The default matches the index used when this snippet was written; the
 *   page layout may change, so callers can override it.
 * @returns {Promise<void>}
 * @throws {Error} When the page has no <script> element at `scriptIndex`.
 */
async function scrapeProduct(url, scriptIndex = 38) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    const scripts = await page.evaluate(() =>
      Array.from(document.querySelectorAll('script'), (element) => element.textContent),
    );

    // Fail with a descriptive message instead of "cannot read properties of
    // undefined" when the page carries fewer scripts than expected.
    const script = scripts[scriptIndex];
    if (script === undefined) {
      throw new Error(
        `Expected a <script> at index ${scriptIndex}, but the page only has ${scripts.length}`,
      );
    }

    const fields = script.split(',');
    console.log(fields[1]);
  } finally {
    // Awaited so the process does not exit (or move on) mid-shutdown.
    await browser.close();
  }
}

// Surface failures explicitly rather than leaving the promise floating.
scrapeProduct('https://steamcommunity.com/id/DaYanack').catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
try this and tell me if it works
Before I start the question: I am new to JavaScript and have only very basic knowledge of async JS, but I need to solve this so I can get my first project working.
I am trying to build a scraping app using Node and Puppeteer. Basically, the user enters a URL ("link" in the code below), Puppeteer goes through the website's code, tries to find a specific piece, and returns the data. That part I have working so far.
The problem is when a user enters a URL of a site that doesn't have that piece of code. In that case, I get UnhandledPromiseRejectionWarning: Error: Evaluation failed theme is not defined
What can I do so that, when an error like that occurs, I can catch it and redirect the page instead of getting an Internal Server Error?
// POST /results: loads the submitted link in Puppeteer, evaluates `theme.name`
// in the page, and renders the "index" view with the result.
app.post("/results", (req, res) => {
  const siteUrl = req.body.link;

  // Launches a browser, reads `theme.name` from the target page, closes the
  // browser, and resolves with the value that was read.
  const readThemeName = async (target) => {
    const browser = await puppeteer.launch({ args: ['--no-sandbox'] });
    const tab = await browser.newPage();
    await tab.goto(target, { waitUntil: 'networkidle2' });
    const themeName = await tab.evaluate('theme.name');
    await browser.close();
    return themeName;
  };

  // Only the fulfilment path is wired up here; no rejection handler is attached.
  readThemeName(siteUrl).then((data) => {
    res.render("index", { data, siteUrl });
  });
});
You can extend the async part to the whole route handler and do whatever you want on catch:
/**
 * POST /results
 *
 * Loads the URL submitted in `req.body.link`, evaluates the expression
 * `theme.name` inside that page, and renders the "index" view with the result.
 * Any failure (bad URL, navigation error, `theme` missing on the page, ...)
 * is logged and answered with a redirect to "/".
 *
 * NOTE(review): `link` comes straight from the request body and is handed to
 * the browser unvalidated; consider restricting it to http(s) URLs.
 */
app.post('/results', async (req, res) => {
  try {
    const link = req.body.link;
    const browser = await puppeteer.launch({ args: ['--no-sandbox'] });

    let data;
    try {
      const page = await browser.newPage();
      await page.goto(link, { waitUntil: 'networkidle2' });
      data = await page.evaluate('theme.name');
    } finally {
      // Close the browser on both the success and the failure path so a failed
      // scrape does not leak a Chromium process per request.
      await browser.close();
    }

    res.render('index', { data, siteUrl: link });
  } catch (e) {
    // Keep a trace of what went wrong before falling back to the redirect.
    console.error(e);
    res.redirect('/');
  }
});
I'm writing a simple script to check whether all the resources load correctly (I check the status codes of the responses). I've decided to use Puppeteer for this, and I wrote
// Launches a headful browser, logs the status of every response whose URL
// contains "favicon" while loading Stack Overflow, then closes the browser.
(async () => {
  const browser = await puppeteer.launch({ headless: false });
  const openPages = await browser.pages();
  const page = openPages[0];

  const onResponse = (response) => {
    const url = response.url();
    const status = response.status();
    // my functionality goes here
    if (!url.includes("favicon")) {
      return;
    }
    console.log(status, url); // not logging in headless
  };
  page.on("response", onResponse);

  await page.goto("https://stackoverflow.com/", { waitUntil: 'networkidle2' });
  await browser.close();
})();
The issue is that if I run my application in headless mode, the favicon is missing from the responses; I assume it has something to do with Puppeteer not loading a favicon in headless mode. Is there any built-in functionality or a workaround?
For lack of a better solution, right now I'm evaluating the favicon's URL and manually visiting it
/**
 * Visits the favicon of the document currently loaded in `page`, so that the
 * icon request is actually issued, and then navigates back.
 *
 * The icon URL is taken from the first `<link rel*="icon">` element. When the
 * document declares none, `/favicon.ico` at the page's origin is used instead
 * of failing.
 *
 * @param {object} page - Puppeteer page (uses `$$eval`, `url`, `goto`, `goBack`).
 * @returns {Promise<*>} Whatever `page.goto` resolves to for the icon URL, so
 *   callers can inspect the icon response (e.g. its status).
 */
async function checkFavicon(page) {
  // $$eval runs the callback on the (possibly empty) list of matches, so a
  // document without an icon link yields null here rather than an exception.
  const declaredIcon = await page.$$eval("link[rel*='icon']", (links) =>
    links.length > 0 ? links[0].href : null,
  );

  // `||` on purpose: an empty href is as unusable as a missing link.
  const iconUrl = declaredIcon || new URL('/favicon.ico', page.url()).href;

  const response = await page.goto(iconUrl);
  await page.goBack();
  return response;
}
I'm using Puppeteer for doing some web scraping and I'm having troubles. The website I'm trying to scrape is this one and I'm trying to create a screenshot of a calendar that appears after clicking the button "Reserve now" > "Dates".
const puppeteer = require('puppeteer');
const fs = require('fs');
// Opens the hotel page, clicks through to the date field inside the
// reservation modal, and saves a screenshot to myscreenshot.png.
// Any error along the way is logged to the console.
void (async () => {
  const RESERVE_BUTTON = '.m-hotel-info > .l-container > .l-header-section > .l-m-col-2 > .m-button';
  const DATE_FIELD = '.js-recent-search-inputs .js-datepick-container .l-h-field-input';
  const PAUSE_MS = 5000;

  try {
    const browser = await puppeteer.launch({ headless: false });
    const tab = await browser.newPage();
    await tab.goto('https://www.marriott.com/hotels/travel/reumd-le-meridien-ra-beach-hotel-and-spa');
    await tab.setViewport({ width: 1920, height: 938 });

    // Open the reservation modal and pause for a fixed delay.
    await tab.waitForSelector(RESERVE_BUTTON);
    await tab.click(RESERVE_BUTTON);
    await tab.waitForSelector('.modal-content');
    await tab.waitFor(PAUSE_MS);

    // Click the date field, then pause again before capturing.
    await tab.waitForSelector(DATE_FIELD);
    await tab.click(DATE_FIELD);
    await tab.waitFor(PAUSE_MS);

    await tab.screenshot({ path: 'myscreenshot.png' });
    await browser.close();
  } catch (error) {
    console.log(error);
  }
})();
This is what myscreenshot.png should contain:
but I'm getting this instead:
As you can see, myscreenshot.png doesn't contain the calendar. I don't understand what I'm doing wrong, since I click on the right node and I even give it enough time to load everything.
Thank you in advance!
Edit: I forgot to say that I have also tried Puppeteer recorder in order to achieve this and I haven't had luck either.
As you have many .l-h-field-input elements, I would try being more specific there.
This worked for me:
// Click via a selector scoped by ancestor classes, narrowing which of the
// many .l-h-field-input elements on the page is targeted.
await page.click('.js-recent-search-inputs .js-datepick-container .l-h-field-input');
I am using Puppeteer to test the login page of my application.
The code below runs fine on my local system, but when run on Jenkins, 9 times out of 10 I get an error (mentioned below).
I have tried different things to fix this but haven't been successful. I also added a setTimeout, but even that doesn't help.
Here is the sample code:
const puppeteer = require('puppeteer');
const CREDS = require('../creds');
// Jest suite: opens the login page in a fresh browser before each test, types
// the credentials, submits the form, and checks the resulting page title.
describe('Login & Title', () => {
  const LOGIN_URL = 'https://www.example.com/login';
  let browser;
  let page;

  beforeEach(async () => {
    const launchOptions = { args: ['--no-sandbox'], headless: true };
    browser = await puppeteer.launch(launchOptions);
    page = await browser.newPage();
    await page.goto(LOGIN_URL);
  });

  afterEach(() => {
    // The promise from close() is neither awaited nor returned here.
    browser.close();
  });

  test('Test for Page Title', async () => {
    // Schedules an empty callback; the test body carries on without waiting.
    const doNothing = () => {};
    setTimeout(doNothing, 500);

    const selectors = {
      username: '#inputUserName',
      password: '#inputPassword',
      submit: 'body > div > ng-view > form > div:nth-child(3) > button',
    };

    // Focus each field by clicking it, then type into it.
    await page.click(selectors.username);
    await page.keyboard.type(CREDS.username);
    await page.click(selectors.password);
    await page.keyboard.type(CREDS.password);

    // Submit, then wait for the navigation that follows.
    await page.click(selectors.submit);
    await page.waitForNavigation();

    const pageTitle = await page.title();
    expect(pageTitle).toBe("Example Website");
  });
});
I am getting error:
console.assert node_modules/puppeteer/lib/FrameManager.js:597
AssertionError [ERR_ASSERTION]: No node found for selector: #inputUserName
FAIL __tests__/login.test.js
Login & Title
✕ Test for Page Title (513ms)
● Login & Title functionality.. › Test for Page Title
TypeError: Cannot read property 'click' of null
at Frame.click (node_modules/puppeteer/lib/FrameManager.js:598:18)
Any pointers or help would be great! Thanks in Advance.