I'm not a pro, just trying to scrape some data from a website.
Someone here helped me select the first "frame", but I need to scrape data from the third frame and concatenate the data from frame 1 + frame 2 + frame 3 into a single result. This is the site.
This is what I have:
const puppeteer = require('puppeteer');

let scrape = async () => {
  const browser = await puppeteer.launch({
    headless: false,
    slowMo: 250
  });
  const page = await browser.newPage();
  await page.goto('', {
    waitUntil: "networkidle0" // note: the option is camelCase "waitUntil"
  });
  // page.frames() returns an array, so no await is needed here
  const frame = page.frames().find(f => f.name() === 'stanga');
  const button = await frame.$('body > form > font > select > option:nth-child(12)');
  await button.click(); // click() returns a promise and should be awaited
  await page.waitFor(1000);
  const frame1 = page.frames().find(a => a.name() === 'centru');
  const select = await frame1.$('body > form > font > select > option:nth-child(1)');
  await page.waitFor(500);
  await select.click();
  await page.waitFor(500);
  const result = await page.$$eval("body > font", (options) => {
    const timpi = options.map(option => option.innerText);
    return timpi;
  });
  await browser.close();
  return result;
};

scrape().then((value) => {
  console.log(value);
});
Thank you for any help.
I have fixed your script:
const puppeteer = require('puppeteer');

let scrape = async () => {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.goto('http://example.com/txt', { waitUntil: "networkidle2" });

  const optionSelector = 'body > form > font > select > option';
  const expectedFrames = ['stanga', 'centru'];
  const scrapedText = [];

  // Collect the text of every <option> in the given frame.
  const getOptions = (frame) => {
    return frame.$$eval(optionSelector, (options) => {
      return options.map(option => option.innerText);
    });
  };

  // page.frames() is synchronous; no await needed
  for (const frame of page.frames()) {
    const name = frame.name();
    if (expectedFrames.includes(name)) {
      await frame.click(optionSelector); // clicks the first matching option
      await page.waitFor(1000);
      const result = await getOptions(frame);
      scrapedText.push({ [name]: result });
    } else if (name === 'dreapta') {
      const result = await frame.$eval('body', elm => elm.innerText);
      scrapedText.push({ [name]: result.split(/\n+/g) });
    }
  }

  await browser.close();
  return scrapedText;
};

scrape().then((value) => {
  console.log(value);
});
OUTPUT:
[{
stanga: ['Mures','A Saguna', 'A.Guttenbrun_1', ... and more items]
},
{
centru: ['[0] E3']
},
{
dreapta: ['Linia: E3','2019-07-25 23:19:40','Sosire1: 23:39','Sosire2: 23:41']
}]
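Note that page.waitFor has been deprecated in newer Puppeteer releases. If you are on a recent version, the fixed delay has a drop-in replacement, though waiting for the options to actually exist is usually more reliable. A sketch of both, not tested against this page:

// Drop-in replacement for the deprecated page.waitFor(1000):
await page.waitForTimeout(1000);
// Usually better: wait until the options are present in the frame.
await frame.waitForSelector(optionSelector);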
You have to improve your scraper not just to click on the select, but also to pull the selected item's value from the select element:
const frame = page.frames().find(f => f.name() === "stanga");
const select1 = await frame.$(
  "body > form > font > select > option:nth-child(12)"
);
const select1Value = await frame.evaluate(
  select1 => select1.textContent,
  select1
);
select1Value will contain the text of the selected item in the select box. The same must be done for select2 in the next frame.
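Applied to the second frame ("centru"), with the option index taken from your original script, the same pattern looks roughly like this:

const frame2 = page.frames().find(f => f.name() === "centru");
const select2 = await frame2.$(
  "body > form > font > select > option:nth-child(1)"
);
const select2Value = await frame2.evaluate(
  select2 => select2.textContent,
  select2
);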
In your code you never select frame3; that's why you cannot read data from it.
I have updated your code and this is the result I could get out of it:
$ node scrape.js
Frame1: AT_Miresei_1
Frame2: [1] E1
Frame3: Linia: E12019-07-25 22:29:13Sosire1: 22:55 Sosire2: 23:00
This is what I ended up with, but there is a lot to improve (code quality and readability).
const puppeteer = require("puppeteer");

let scrape = async () => {
  let result;
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto("http://ratt.ro/txt", {
    waitUntil: "networkidle0"
  });

  // Frame 1
  const frame = page.frames().find(f => f.name() === "stanga");
  const button = await frame.$(
    "body > form > font > select > option:nth-child(12)"
  );
  const select1Value = await frame.evaluate(
    button => button.textContent,
    button
  );
  await button.click();
  await page.waitFor(1000);

  // Frame 2
  const frame1 = page.frames().find(a => a.name() === "centru");
  const select = await frame1.$(
    "body > form > font > select > option:nth-child(1)"
  );
  const select2Value = await frame1.evaluate(
    select => select.textContent,
    select
  );
  await page.waitFor(200);
  await select.click();
  await page.waitFor(200);

  // Frame 3
  const frame3 = page.frames().find(f => f.name() === "dreapta");
  const element = await frame3.$("body");
  const frame3Text = await frame3.evaluate(
    element => element.textContent,
    element
  );

  await browser.close();
  result =
    "Frame1: " + select1Value +
    "\nFrame2: " + select2Value +
    "\nFrame3: " + frame3Text.trim();
  return result;
};

scrape().then(value => {
  console.log(value);
});
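One way to cut the repetition would be a small helper that finds a frame by name, reads the option's text, and clicks it. A rough sketch only (untested; the helper name getSelectedText is made up), reusing the selectors above:

const getSelectedText = async (page, frameName, selector) => {
  const frame = page.frames().find(f => f.name() === frameName);
  const option = await frame.$(selector);
  const text = await frame.evaluate(el => el.textContent, option);
  await option.click();
  await page.waitFor(500);
  return text;
};

const select1Value = await getSelectedText(page, "stanga",
  "body > form > font > select > option:nth-child(12)");
const select2Value = await getSelectedText(page, "centru",
  "body > form > font > select > option:nth-child(1)");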
Related
I'm trying to navigate through all the pagination pages, get the deal links, and console.log them. The problem is that it won't click through to the next page: page.click() doesn't work inside page.evaluate(), so I tried to write the click in plain JS, and that doesn't work either.
const puppeteer = require("puppeteer");

(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    slowMo: 20,
    defaultViewport: null,
  });
  const page = await browser.newPage();
  await page.goto("https://www.amazon.com.tr/deals?ref_=nav_cs_gb", {
    waitUntil: "networkidle2",
  });
  //await page.waitForSelector('[data-testid="grid-deals-container"]');
  const siktir = await page.evaluate(() => {
    var while_status = true;
    var list = [];
    while (while_status) {
      setTimeout(() => {}, 5000);
      let sayi = document.querySelector('[data-testid="grid-deals-container"]')
        .children.length;
      for (let i = 0; i < sayi; i++) {
        list.push(
          document
            .querySelector('[data-testid="grid-deals-container"]')
            .children.item(i)
            .children.item(0)
            .children.item(0)
            .children.item(0).href
        );
      }
      if (document.querySelector(".a-last a") === null) {
        while_status = false;
      }
      setTimeout(() => {
        document.querySelector(".a-last a").click();
      }, 3000);
    }
    return list;
  });
  console.log(siktir);
  //await page.click(".a-last a",{delay:3000});
  await browser.close();
})();
A little help would be appreciated.
const puppeteer = require("puppeteer");

(async () => {
  const browser = await puppeteer.launch({
    headless: true,
    slowMo: 20,
    defaultViewport: null,
  });
  const page = await browser.newPage();
  await page.goto("https://www.amazon.com.tr/deals?ref_=nav_cs_gb", {
    waitUntil: "networkidle2",
  });
  // textContent of the last pagination entry, e.g. "7"; the string is
  // coerced to a number by the loop comparison below.
  const numberOfDivs = await page.evaluate(() => {
    return document.querySelector("li.a-disabled:nth-child(6)").textContent;
  });
  console.log(numberOfDivs);
  var sayfa = 0;
  for (let i = 0; i < numberOfDivs; i++) {
    await page.waitForTimeout(3000);
    sayfa++;
    console.log(sayfa);
    var lale = await page.evaluate(() => {
      let list = [];
      var sayi = document.querySelector('[data-testid="grid-deals-container"]')
        .children.length;
      for (let i = 0; i < sayi; i++) {
        list.push(
          document
            .querySelector('[data-testid="grid-deals-container"]')
            .children.item(i)
            .children.item(0)
            .children.item(0)
            .children.item(0).href
        );
      }
      return list;
    });
    console.log(lale);
    await page.click(".a-last a");
  }
  await browser.close();
})();
It still needs fixing, but at least I can get the links to the products.
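One possible way to harden that loop (a sketch only, assuming Amazon keeps the .a-last / a-disabled pagination markup and that clicking the arrow triggers a real navigation): stop when the next arrow becomes disabled instead of trusting a precomputed page count, and wait for the grid instead of sleeping:

const links = [];
while (true) {
  await page.waitForSelector('[data-testid="grid-deals-container"]');
  // Grab every link in the grid; dedupe later since a card may hold several anchors.
  const pageLinks = await page.$$eval(
    '[data-testid="grid-deals-container"] a',
    anchors => anchors.map(a => a.href)
  );
  links.push(...pageLinks);
  if (await page.$(".a-last.a-disabled") !== null) break; // last page reached
  await Promise.all([
    page.waitForNavigation({ waitUntil: "networkidle2" }),
    page.click(".a-last a"),
  ]);
}
console.log([...new Set(links)]);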
I am making this scraper to collect posts from public Facebook pages. My problem is that when I set it to collect more than about 10 posts, it's unable to scrape elements after scrolling far down. The way it works: the scraper goes to a public page, then scrolls the entire feed until it has grabbed the IDs for the number of posts you want to collect. After collecting all the IDs, it goes to every post and collects the specific info: comments, shares, reactions, etc. But when I'm far down the feed, it can't find posts higher up the feed by their ID; even though the selector works in Chrome dev tools, in Puppeteer it's undefined. So my question is: why does the scroll location affect Puppeteer's ability to read the DOM, and is there a better way for me to collect this information?
Sidenote: this scraper is expected to grab thousands of posts.
Here is my code so far:
const { scrollPageToBottom } = require('puppeteer-autoscroll-down')
const puppeteer = require('puppeteer');
const prompt = require('prompt-sync')();
const ObjectsToCsv = require('objects-to-csv');

(async () => {
  const fbPage = prompt('What FaceBook Page?');
  const browser = await puppeteer.launch({
    headless: false,
    defaultViewport: null,
    args: ['--start-maximized']
  });
  const page = await browser.newPage();
  await page.goto(`https://www.facebook.com/OfficialMensHumor/`, { waitUntil: 'networkidle2' }).catch(e => void 0);
  await scrapeArticles(page)
  await browser.close();
})();
async function autoScroll(page) {
  await page.evaluate(async () => {
    await new Promise((resolve, reject) => {
      var totalHeight = 0;
      var distance = 100;
      var timer = setInterval(() => {
        var scrollHeight = document.body.scrollHeight;
        window.scrollBy(0, distance);
        totalHeight += distance;
        if (totalHeight >= scrollHeight - window.innerHeight) {
          clearInterval(timer);
          resolve();
        }
      }, 1000);
    });
  });
}

async function getText(spans) {
  for (const ele of spans) {
    // const text = ele.getProperty('innerText')
    const text = await (await ele.getProperty('innerText')).jsonValue()
    console.log(text)
  }
}
async function scrapeArticles(
  page,
  // extractItems,
  postCount = 100,
  scrollDelay = 800,
) {
  let post = [];
  try {
    let previousHeight;
    while (post.length < postCount) {
      const content = await page.$('div[role="main"] > div.k4urcfbm')
      post = await content.evaluate(() => {
        const postDivs = Array.from(document.querySelectorAll('div.du4w35lb.l9j0dhe7 div[class=lzcic4wl][role="article"]'))
        return postDivs.map(post => ({ id: post.getAttribute('aria-posinset') }))
      })
      console.log(post)
      let isLoadingAvailable = true
      await scrollPageToBottom(page, { size: 500, delay: 250 })
    }
    console.log(1)
    await getPostUrls(page, post)
    await getComments(page, post)
    await getShares(page, post)
    await getReactions(page, post)
    await getPostImg(page, post)
    await getTime(page, post)
    console.log(post)
    saveToFile(post)
  } catch (e) {
    console.log(e)
  }
  // return items;
}
const getComments = async (page, articleNums) => {
  for (const obj of articleNums) {
    for (const key in obj) {
      if (key == 'id') {
        const article = await page.$(`div[aria-posinset="${obj[key]}"]`)
        // const handle = await article.waitForFunction('document.querySelector("span.d2edcug0.hpfvmrgz.qv66sw1b.c1et5uql.lr9zc1uh.a8c37x1j.fe6kdd0r.mau55g9w.c8b282yb.keod5gw0.nxhoafnm.aigsh9s9.d3f4x2em.iv3no6db.jq4qci2q.a3bd9o3v.b1v8xokw.m9osqain").innerText')
        // const handle = await article.waitForXPath("//span[contains(text(), 'Comments')]", {visible: true})
        const handle = await article.waitForSelector('div[aria-posinset="1"] div.gtad4xkn')
        // Comment String
        const commentNum = await (await handle.getProperty('innerText')).jsonValue()
        obj['commentsNum'] = commentNum
      }
    }
  }
  // console.log(articleNums)
}

const getShares = async (page, articleNums) => {
  for (const obj of articleNums) {
    for (const key in obj) {
      if (key == 'id') {
        const article = await page.$(`div[aria-posinset="${obj[key]}"]`)
        const handle = await article.waitForXPath("//span[contains(text(), 'Shares')]", {visible: true})
        // Share String
        const shareNum = await (await handle[0].getProperty('innerText')).jsonValue()
        obj['sharesNum'] = shareNum
      }
    }
  }
  // console.log(articleNums)
}

const getReactions = async (page, articleNums) => {
  for (const obj of articleNums) {
    for (const key in obj) {
      if (key == 'id') {
        const article = await page.$(`div[aria-posinset="${obj[key]}"]`)
        const handle = await article.$('span[aria-label="See who reacted to this"] + span[aria-hidden="true"]')
        // Reaction String
        const reactionsNum = await (await handle.getProperty('innerText')).jsonValue()
        obj['reactionsNum'] = reactionsNum
      }
    }
  }
  // console.log(articleNums)
}
const getPostImg = async (page, articleNums) => {
  for (const obj of articleNums) {
    for (const key in obj) {
      if (key == 'id') {
        const article = await page.$(`div[aria-posinset="${obj[key]}"]`)
        const imgDiv = await article.$('div[class="pmk7jnqg kr520xx4"]')
        const handle = await imgDiv.$('img[alt]')
        // Image URL
        const imgUrl = await (await handle.getProperty('src')).jsonValue()
        obj['imgUrl'] = imgUrl
      }
    }
  }
  // console.log(articleNums)
}

// And timestamp
const getTime = async (page, articleNums) => {
  for (const obj of articleNums) {
    for (const key in obj) {
      if (key == 'postUrl') {
        await page.goto(obj[key])
        const timeStamp = await page.$eval('abbr[data-shorten]', abbr => abbr.dataset.tooltipContent)
        obj['timestamp'] = timeStamp
      }
    }
  }
}

const getPostUrls = async (page, articleNums) => {
  for (const obj of articleNums) {
    for (const key in obj) {
      if (key == 'id') {
        const article = await page.$(`div[aria-posinset="${obj[key]}"]`)
        const postURLHandle = await article.$('a[role="link"][aria-label]')
        // Post URL
        const postURL = await (await postURLHandle.getProperty('href')).jsonValue()
        obj['postUrl'] = postURL
      }
    }
  }
  console.log(articleNums)
}

const saveToFile = async (list) => {
  const csv = new ObjectsToCsv(list);
  // Save to file:
  await csv.toDisk('./post_sample.csv');
}
These are the lines in question that keep returning undefined:
// const handle = await article.waitForFunction('document.querySelector("span.d2edcug0.hpfvmrgz.qv66sw1b.c1et5uql.lr9zc1uh.a8c37x1j.fe6kdd0r.mau55g9w.c8b282yb.keod5gw0.nxhoafnm.aigsh9s9.d3f4x2em.iv3no6db.jq4qci2q.a3bd9o3v.b1v8xokw.m9osqain").innerText')
// const handle = await article.waitForXPath("//span[contains(text(), 'Comments')]", {visible: true})
const handle = await article.waitForSelector('div[aria-posinset="1"] div.gtad4xkn')
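A likely explanation (an assumption, not verified against Facebook's current markup) is that the feed is virtualized: posts that scroll far out of view are detached from the DOM, so a selector that works in DevTools while you are scrolled near the post finds nothing once the page has scrolled on. If that holds, one workaround is to harvest each post's data during the scroll loop, while its node is still attached, roughly like this:

// Inside the while loop in scrapeArticles, before scrolling further
// (assumes `const seen = new Set()` is declared next to `post`):
const visible = await page.$$eval(
  'div[class=lzcic4wl][role="article"]',
  divs => divs.map(d => ({ id: d.getAttribute('aria-posinset') }))
);
for (const p of visible) {
  if (p.id && !seen.has(p.id)) {
    seen.add(p.id);
    post.push(p); // collect while attached; enrich with details here too
  }
}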
I am attempting to write a script that locates the largest image on a page. The first step of this process would be to retrieve all the image sources on a particular website. This is where I am stuck.
const puppeteer = require('puppeteer');

function ImageFetcher(pageURL, partName) {
  return new Promise(async (resolve, reject) => {
    try {
      const browser = await puppeteer.launch({
        headless: false,
      });
      const page1 = await browser.newPage();
      await page1.goto(pageURL);
      try {
        // Note: `page` is not defined here -- the page object is named
        // `page1` -- so this line throws a ReferenceError, which is what
        // triggers the "ERR Locator" log below.
        const images = await page.$$eval("img", els => els.map(x => x.getAttribute("src")));
        console.log(images);
      } catch (e) { console.log("ERR Locator") };
      await page1.close();
      await browser.close();
      return resolve();
    } catch (e) { console.log(`Error Image Fetcher Part Name: ${partName}`) };
  });
}

async function start() {
  pageURL = "https://www.grainger.com/product/NVENT-CADDY-Cushioned-Pipe-Clamp-1RVC3";
  partName = "10000";
  ImageFetcher(pageURL, partName);
}
start();
//ERR Locator
How about this:
const puppeteer = require("puppeteer");

let testing = async () => {
  const browser = await puppeteer.launch({
    headless: false
  });
  const page = await browser.newPage();
  await page.goto('https://www.grainger.com/product/NVENT-CADDY-Cushioned-Pipe-Clamp-1RVC3');
  const image = await extractLargestImage(page);
  return image;
};

async function extractLargestImage(page) {
  return page.evaluate(() => {
    let imgs = document.querySelectorAll('img');
    let largestImgSrc = 'none yet';
    let largestImgSize = 0;
    for (var img of imgs) {
      let imgSize = Number(img.height) * Number(img.width);
      if (imgSize > largestImgSize) {
        largestImgSize = imgSize;
        largestImgSrc = img.src;
      }
    }
    return largestImgSrc;
  });
}

testing().then((value) => {
  console.dir(value, {'maxArrayLength': null});
});
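One caveat about the snippet above: img.width and img.height are the rendered dimensions, so a huge file displayed as a thumbnail scores low. If you want the largest image by intrinsic size, comparing naturalWidth/naturalHeight inside the same loop may be closer to what you want (note both are 0 until the image has loaded):

let imgSize = img.naturalWidth * img.naturalHeight; // intrinsic, not rendered, size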
I need to wait until a new page, opened in a new tab after clicking a button, has loaded. That is, I click a button, a new page opens (which should load), and on it I click another button. I have some example code, but it doesn't work for some reason:
const page = await browser.newPage();
await page.goto('https://twitter.com/amazon/');
await page.click('.css-1dbjc4n:nth-child(1) > .css-1dbjc4n > .css-1dbjc4n > .css-901oao > .css-4rbku5',{waitUntil: ['load', 'domcontentloaded', 'networkidle0', 'networkidle2']});
const page2 = (await browser.pages())[2];
await page2.click('#nav-main > .nav-fill > #nav-xshop-container > #nav-xshop > .nav-a:nth-child(2)');
If I understand correctly, the problem is detecting when a new tab ("page") has opened and getting the new page object associated with the tab.
There are at least a couple of techniques available. One method is promisifying the browser's "targetcreated" event as described here:
const puppeteer = require("puppeteer");

(async () => {
  const browser = await puppeteer.launch({headless: false});
  const [page] = await browser.pages();
  await page.goto("https://twitter.com/amazon");
  const amzSel = `.css-1dbjc4n:nth-child(1) > .css-1dbjc4n >
    .css-1dbjc4n > .css-901oao > .css-4rbku5`;
  await page.waitForSelector(amzSel, {visible: true});
  console.log((await browser.pages()).length); // => 1

  // method 1
  const newPagePromise = new Promise(resolve =>
    browser.once("targetcreated", target => resolve(target.page()))
  );
  await page.click(amzSel);
  const newPage = await newPagePromise;
  // --------

  console.log((await browser.pages()).length); // => 2
  await newPage.waitForSelector("#nav-link-prime", {visible: true});
  await newPage.click("#nav-link-prime");
  const sel = "#prime-header-CTA-announce";
  await newPage.waitForSelector(sel, {visible: true});
  console.log(await newPage.$eval(sel, el => el.innerText.trim())); // => TRY PRIME
  //await browser.close();
})();
Another approach is to use browser.waitForTarget to check when the target's opener() is the previous page target, as described here:
const puppeteer = require("puppeteer");

(async () => {
  const browser = await puppeteer.launch({headless: false});
  const [page] = await browser.pages();
  await page.goto("https://twitter.com/amazon");
  const amzSel = `.css-1dbjc4n:nth-child(1) > .css-1dbjc4n >
    .css-1dbjc4n > .css-901oao > .css-4rbku5`;
  await page.waitForSelector(amzSel, {visible: true});
  console.log((await browser.pages()).length); // => 1

  // method 2
  const pageTarget = page.target();
  await page.click(amzSel);
  const newTarget = await browser.waitForTarget(target =>
    target.opener() === pageTarget
  );
  const newPage = await newTarget.page();
  // --------

  console.log((await browser.pages()).length); // => 2
  await newPage.waitForSelector("#nav-link-prime", {visible: true});
  await newPage.click("#nav-link-prime");
  const sel = "#prime-header-CTA-announce";
  await newPage.waitForSelector(sel, {visible: true});
  console.log(await newPage.$eval(sel, el => el.innerText.trim())); // => TRY PRIME
  //await browser.close();
})();
As an aside, I'm not sure how important or significant this particular Twitter/Amazon example is, but #nav-xshop > .nav-a:nth-child(2) doesn't seem like a reliable selector (it appears to have a race condition between "Best Sellers" and "Prime"). I'd use #nav-link-prime instead, since it's a direct id, if that's what you're looking for.
I have the following script with Puppeteer that works correctly; this code extracts all the information from the table.
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  // Note: this runs before page.goto and the result is never used.
  const tableRows = await page.$$('table > tbody tr');
  await page.goto("https://www.mismarcadores.com/baloncesto/espana/liga-endesa/partidos/");
  const time = await page.evaluate(() => {
    const tables = Array.from(document.querySelectorAll('table tr .time'));
    return tables.map(table => table.textContent)
  });
  const teamHome = await page.evaluate(() => {
    const tables = Array.from(document.querySelectorAll('table tr .team-home'));
    return tables.map(table => table.textContent)
  });
  const teamAway = await page.evaluate(() => {
    const tables = Array.from(document.querySelectorAll('table tr .team-away'));
    return tables.map(table => table.textContent)
  });
  for (let i = 0; i < time.length; i++) {
    console.log(time[i]);
    console.log(teamHome[i]);
    console.log(teamAway[i]);
  }
  await browser.close();
})();
Now I'm trying to write this in a better way, and I have the following code.
(async () => {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.goto("https://www.mismarcadores.com/baloncesto/espana/liga-endesa/partidos/");
  console.log("started evaluating");
  var data = await page.evaluate(() => {
    Array.from(
      document.querySelectorAll('table tr')
    ).map(row => {
      return {
        time: row.querySelector(".time"),
        teamHome: row.querySelector(".team-home"),
        teamAway: row.querySelector(".team-away")
      };
    });
  });
  console.log(data);
})();
When I try to execute the second script, I receive undefined.
The goal is to turn the first script into the second one.
Could anyone help me?
You need to narrow down the tr elements (for example by adding the .stage-scheduled class) and return the .textContent properties instead of the elements themselves. Try this:
var data = await page.evaluate(() => {
  return Array.from(
    document.querySelectorAll('table tr.stage-scheduled')
  ).map(row => {
    return {
      time: row.querySelector(".time").textContent,
      teamHome: row.querySelector(".team-home").textContent,
      teamAway: row.querySelector(".team-away").textContent,
    };
  });
});
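One caveat: if any matched row happens to lack one of those cells, querySelector returns null and reading .textContent throws inside the browser context. A defensive variant using optional chaining (supported by any recent Chromium) would be:

time: row.querySelector(".time")?.textContent ?? "",
teamHome: row.querySelector(".team-home")?.textContent ?? "",
teamAway: row.querySelector(".team-away")?.textContent ?? "",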