I am trying to take screenshots of all the table elements in a web page. For some web pages it works perfectly fine and I am able to take pictures, but some websites are not working: I am able to take screenshots, but they are mostly white or gray. Here is the code I am using.
const puppeteer = require('puppeteer');
const jsonfile = require('jsonfile');
/**
 * Opens `pageToGo` in a visible, maximized browser window and writes one PNG
 * per <table> element to `data\_table_<index>.png`.
 *
 * @param {string} pageToGo - URL of the page to capture.
 * @param {*} link - Forwarded to the in-page function for backward
 *   compatibility; the in-page code does not read it.
 * @returns {Promise<Array<{x: number, y: number, width: number, height: number, id: number}>>}
 *   The clip rectangle computed for every table, in document order.
 */
const getWebImages = async (pageToGo, link) => {
  // Await the launch so the returned promise settles only after all
  // screenshots are written and failures reach the caller.
  const browser = await puppeteer.launch({
    args: ['--start-maximized'],
    headless: false,
    defaultViewport: null,
  });

  try {
    const page = await browser.newPage();
    await page.goto(pageToGo, { waitUntil: 'networkidle2', timeout: 60000 });

    // Measure every table inside the page.
    const boxes = await page.evaluate((link) => {
      return Array.from(document.querySelectorAll('table'), (table, index) => {
        const box = table.getBoundingClientRect();
        return {
          // getBoundingClientRect() is viewport-relative; add the scroll
          // offset so the clip is expressed in page coordinates.
          x: box.left + window.scrollX,
          // The vertical origin is the rectangle's top edge, not its left edge.
          y: box.top + window.scrollY,
          width: box.width,
          height: box.height,
          id: index,
        };
      });
    }, link);

    for (const box of boxes) {
      try {
        await page.screenshot({
          path: 'data\\_table_' + box.id + '.png',
          clip: {
            x: box.x,
            y: box.y,
            width: box.width,
            height: box.height,
          },
        });
      } catch (e) {
        // Best effort: a table that cannot be captured is logged and skipped
        // so the remaining tables are still processed.
        console.log(e);
      }
    }

    console.log(boxes);
    return boxes;
  } finally {
    // Always release the browser, even when navigation or evaluation fails.
    await browser.close();
  }
};

getWebImages("https://www.csb.gc.ca/rates/", 11).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
I have tried different screen sizes and other things, like waiting for everything to load. When I watch the browser, I can clearly see the page load, and the screenshots are taken after it loads, but the images are just white screens the same size as the table area.
NOTE: I also downloaded some of the pages offline, and even that does not work.
My problem was that I was setting my viewport after goto(). I changed the code to this:
const puppeteer = require('puppeteer');

/**
 * Loads `url` in headless Chromium and saves a screenshot of every <table>
 * element as `link_<index>.png` in the current working directory.
 *
 * @param {string} url - Address of the page to capture.
 * @returns {Promise<void>} Settles once all screenshots are written and the
 *   browser is closed.
 */
async function run(url) {
  const VIEWPORT = { width: 1360, height: 780 };
  const browser = await puppeteer.launch({ headless: true });

  try {
    const page = await browser.newPage();
    // Set the viewport before navigating so the page is laid out at the
    // size it will be captured at.
    await page.setViewport(VIEWPORT);
    await page.goto(url, { waitUntil: 'domcontentloaded' });
    // A single selector wait is enough; the deprecated page.waitFor() call
    // duplicated it.
    await page.waitForSelector('table');

    const tables = await page.$$('table');
    for (const [i, table] of tables.entries()) {
      // ElementHandle.screenshot captures just this element.
      await table.screenshot({
        path: 'link_' + i + '.png',
      });
    }

    await page.close();
  } finally {
    // Release the browser even when navigation or a screenshot fails.
    await browser.close();
  }
}
Related
I have a vue.js project connected to Firebase. The app displays courses -> sections -> units. The client would like to print an entire section to PDF. Each section has X number of units.
For our example, let's say the section has 8 units.
The code works locally, but when live it only generates part of the content. In other words, using our example of 8 units, let's say the total number of pages is 150. Locally it will generate the full PDF (150 pages), but live it might generate only 30 or 50 pages.
Is there a way to fix this issue?
Below is the code on Firebase.
const functions = require('firebase-functions');
const puppeteer = require('puppeteer');
/**
 * Creates a promise that is resolved by a timer.
 *
 * @param {number} time - Delay handed to setTimeout, in milliseconds.
 * @returns {Promise<undefined>} Resolves (with no value) when the timer fires.
 */
function delay(time) {
  const armTimer = (done) => {
    setTimeout(done, time);
  };
  return new Promise(armTimer);
}
/**
 * Builds the HTTPS function that renders a section page to PDF with Puppeteer.
 *
 * The request path is expected to contain `<courseID>/<sectionID>`; the query
 * values `uid`, `uem` and `cid` are forwarded to the rendered page.
 *
 * @param {object} context - Must be an object; its `admin` property is
 *   destructured but not used by this handler.
 * @returns {*} The value produced by `functions.runWith(...).https.onRequest`.
 */
exports = module.exports = function (context) {
  const { admin } = context; // currently unused by the handler below

  // Minimal escaping for text interpolated into the HTML header template.
  const escapeHtml = (text) => String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');

  // NOTE(review): the 120 s function timeout caps the whole render; it may be
  // relevant to truncated output on very large sections -- confirm.
  return functions.runWith({
    timeoutSeconds: 120,
    memory: "4GB" //"2GB"
  }).https.onRequest(async (req, resp) => {
    if (req.method != "GET") {
      // NOTE(review): status and body kept as-is for existing clients, though
      // 400/"Unauthorized" does not describe a wrong HTTP method.
      resp.status(400).send("Unauthorized");
      return;
    }

    // A leading "/" produces an empty first segment; skip it.
    const parts = req.params[0].split('/');
    let x = parts[0] === '' ? 1 : 0;
    const courseID = parts[x++];
    const sectionID = parts[x++];

    // The previous `!url` check could never fire because the base URL is
    // always a non-empty string; validate the path segments instead.
    if (!courseID || !sectionID) {
      return resp.status(400).send(`Invalid url: ${req.params[0]}`);
    }

    let url = "https://XXXXXXXXXXXXX/sectionpdf";
    if (process.env.FUNCTIONS_EMULATOR === 'true') {
      url = `http://localhost:8080/sectionpdf`;
    }
    url += `/${encodeURIComponent(courseID)}/${encodeURIComponent(sectionID)}`;
    // Encode query values so characters such as "+", "&" or "#" survive.
    url += `?uid=${encodeURIComponent(req.query.uid)}`;
    url += `&uem=${encodeURIComponent(req.query.uem)}`;
    url += `&cid=${encodeURIComponent(req.query.cid)}`;

    const browser = await puppeteer.launch({
      args: ["--no-sandbox"],
      //headless: false
    });

    let pdf;
    try {
      const page = await browser.newPage();
      await page.goto(url, { waitUntil: "networkidle2" });
      await page.waitForSelector('div.section-pdf-container');
      await page.waitForSelector('#section-title');
      const unitTitle = await page.$eval('#section-title', el => el.textContent);
      // wait for questions and answers to load
      await page.waitForSelector('div.content-has-loaded');
      pdf = await page.pdf({
        displayHeaderFooter: true,
        headerTemplate: '<style>html { -webkit-print-color-adjust: exact;}</style><div style="font-size: 9px; color:rgba(0,0,0,.35); padding-left: 30px;">The Journey: ' + escapeHtml(unitTitle) + '</div>',
        footerTemplate: '<style>html { -webkit-print-color-adjust: exact;}</style><div style="font-size: 9px; color:rgba(0,0,0,.35); padding-right: 30px; width:100%; text-align:right;">Page <span class="pageNumber"></span> of <span class="totalPages"></span></div>',
        printBackground: true,
        margin: { top: '0.5in', bottom: '0.5in', left: '0.3in', right: '0.3in' }
      });
    } finally {
      // Close Chromium even if navigation, a selector wait or rendering fails.
      await browser.close();
    }

    resp.set("Content-Type", "application/pdf");
    resp.status(200).send(pdf);
  });
}
I use Puppeteer to get data about the store. I search using the p.shop-page-content__text_large and span.shop-list-item__address selectors, but I ran into a problem: only one of them can be present on a given page. I tried to solve the problem in the following way, but it does not work. How can this be fixed?
const puppeteer = require('puppeteer');
// Pages to visit: each entry pairs a city label with a site-relative path.
const cities = [
  { CITY: 'Town1', LINK: '/shops/town1/' },
  { CITY: 'Town2', LINK: '/shops/town2/' },
];

// Launch a visible (non-headless) browser with the slowMo option set to 150.
const browser = await puppeteer.launch({ slowMo: 150, headless: false });
/**
 * Builds one record per element matching `selector` on `page`.
 *
 * Each record carries the fixed COUNTRY value and a STREET taken from the
 * element's trimmed text, starting at the first comma (comma included). When
 * the text has no comma, the whole trimmed text is used.
 *
 * @param {object} page - Object exposing `$$eval(selector, fn)`.
 * @param {string} selector - CSS selector handed to `$$eval`.
 * @returns {Promise<Array<{COUNTRY: string, STREET: string}>>}
 */
async function getData(page, selector) {
  const records = await page.$$eval(selector, (nodes) => {
    // Defined inside the callback so the callback stays self-contained.
    const toRecord = (node) => {
      const text = node.textContent.trim();
      const commaAt = text.indexOf(',');
      // substring() clamps a negative start (-1: no comma) to 0.
      return {
        'COUNTRY': 'unknow',
        'STREET': text.substring(commaAt),
      };
    };
    return nodes.map((node) => toRecord(node));
  });
  return records;
}
// Visit every city page and collect its address records. A page shows
// either the single-shop layout or the shop-list layout, so wait for
// whichever appears and then read the selector that is actually present.
const result = [];
const SINGLE_SHOP_SELECTOR = 'p.shop-page-content__text_large';
const SHOP_LIST_SELECTOR = 'span.shop-list-item__address';

try {
  for (const val of cities) {
    console.log(val.LINK, val.CITY);
    const page = await browser.newPage();
    try {
      await page.goto('https://www.example-site.ru' + val.LINK);
      // waitForSelector (not waitForFunction) is the API that takes a CSS
      // selector; a selector list resolves as soon as either layout renders.
      await page.waitForSelector(`${SINGLE_SHOP_SELECTOR}, ${SHOP_LIST_SELECTOR}`);
      const singleShop = await page.$(SINGLE_SHOP_SELECTOR);
      const selector = singleShop ? SINGLE_SHOP_SELECTOR : SHOP_LIST_SELECTOR;
      const data = await getData(page, selector);
      result.push(data);
    } finally {
      // One tab per city; close it so tabs do not pile up.
      await page.close();
    }
  }
} finally {
  // Close the browser once, after the loop. Closing it inside the loop made
  // every iteration after the first fail.
  await browser.close();
}
console.log(result);
It turned out like this:
// Launch a visible browser with the slowMo option set to 150.
const browser = await puppeteer.launch({ slowMo: 150, headless: false });

// Pages to visit: a city label plus its site-relative path.
const cities = [
  { CITY: 'Town1', LINK: '/shops/town1/' },
  { CITY: 'Town2', LINK: '/shops/town2/' },
];

// A single tab is reused for every city.
const page = await browser.newPage();
const result = [];

for (const { LINK, CITY } of cities) {
  await page.goto('https://www.example-site.ru' + LINK);

  // Prefer the single-shop paragraphs; fall back to the shop-list spans
  // when the page has none of them.
  const list = await page.evaluate(() => {
    const primary = document.querySelectorAll('p.shop-page-content__text_large');
    const nodes = primary.length
      ? primary
      : document.querySelectorAll('span.shop-list-item__address');
    return Array.from(nodes, (node) => node.innerText);
  });

  result.push({ link: LINK, city: CITY, list });
}

await browser.close();
// Scrapes price/title pairs from the Marketplace search results page and
// prints them. `sentence1` / `sentence2` are left as empty placeholders.
(async () => {
  const url = "https://www.facebook.com/marketplace/nyc/search/?query=cars";
  const browser = await puppeteer.launch({ headless: false });

  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "networkidle2" });
    console.log("scraping...");

    const data = await page.evaluate(() => {
      const cars = [];
      // querySelectorAll always returns a (possibly empty) NodeList, so no
      // existence check is needed before iterating.
      document
        .querySelectorAll('div[class="fome6x0j tkqzz1yd aodizinl fjf4s8hc f7vcsfb0"]')
        .forEach((element) => {
          const prices = element.querySelectorAll(
            'span[class="d2edcug0 hpfvmrgz qv66sw1b"]'
          );
          const listings = element.querySelectorAll(
            'span[class="a8c37x1j ni8dbmo4 stjgntxs l9j0dhe7"]'
          );
          // Pair prices with titles by index. Bound the loop by the shorter
          // list; the original referenced an undefined `addLists`.
          const count = Math.min(prices.length, listings.length);
          for (let i = 0; i < count; i++) {
            cars.push({
              price: prices[i].innerText,
              listing: listings[i].innerText,
              sentence1: '',
              sentence2: '',
            });
          }
        });
      return cars;
    });

    console.log(data);
  } finally {
    // Release the browser whether or not scraping succeeded.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
How can I split the sentence that is rendered using listings? listings is the list of Facebook ad titles. I tried using the split() function, but it doesn't work. Is there any way that I can split that sentence and store it in sentence1 and sentence2?
You can give this a try.
// Loads the Marketplace search page, reads four text fields from every
// result tile and logs each resulting object. Any failure is reported by
// message only, and the browser is closed in all cases.
(async () => {
  const url = 'https://www.facebook.com/marketplace/nyc/search/?query=cars';
  let browser;
  let page;

  try {
    browser = await puppeteer.launch({ headless: false });
    page = await browser.newPage();
    await page.setViewport({ width: 1366, height: 500 });
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });

    const data = await page.evaluate(() => {
      // Tiles whose text is empty or one of these UI labels are skipped.
      const ignoredTexts = ["", "Create New Listing", "Log In", "Learn More"];
      const tiles = Array.from(document.getElementsByClassName('sonix8o1'));

      return tiles
        .filter((tile) => !ignoredTexts.includes(tile.innerText))
        .map((tile) => {
          const wrapper = tile.getElementsByClassName('rq0escxv')[0];
          const fields = wrapper.getElementsByClassName('aahdfvyu');
          return {
            price: fields[0].innerText,
            listing: fields[1].innerText,
            sentence1: fields[2].innerText,
            sentence2: fields[3].innerText,
          };
        });
    });

    for (const car of data) {
      console.log(car);
    }
  } catch (error) {
    console.log(error.message);
  } finally {
    if (browser) {
      await browser.close();
      console.log('closing browser');
    }
  }
})();
Update from the comment.
// Loads the Marketplace search page and logs one object per result tile:
// price, full listing title, and the first two space-separated words of the
// title as sentence1 / sentence2.
(async () => {
  let browser, page;
  let url = 'https://www.facebook.com/marketplace/nyc/search/?query=cars';
  try {
    browser = await puppeteer.launch({ headless: false });
    page = await browser.newPage();
    await page.setViewport({ width: 1366, height: 500 });
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
    let data = await page.evaluate(() => {
      // Tiles whose text is empty or one of these UI labels are not listings.
      const skipped = ["", "Create New Listing", "Log In", "Learn More"];
      const cars = [];
      for (const element of Array.from(document.getElementsByClassName('sonix8o1'))) {
        if (skipped.includes(element.innerText)) {
          continue;
        }
        // Skip tiles that lack the expected structure. Without this guard a
        // single odd tile throws and every collected result is lost.
        const carElements = element.getElementsByClassName('rq0escxv')[0];
        if (!carElements) {
          continue;
        }
        const car_info = carElements.getElementsByClassName('aahdfvyu');
        if (car_info.length < 2) {
          continue;
        }
        const listing = car_info[1].innerText;
        const splitText = listing.split(' ');
        cars.push({
          price: car_info[0].innerText,
          listing,
          sentence1: splitText[0],
          // undefined when the title is a single word
          sentence2: splitText[1]
        });
      }
      return cars;
    });
    data.forEach(e => console.log(e));
  } catch (error) {
    console.log(error.message);
  } finally {
    if (browser) {
      await browser.close();
      console.log('closing browser');
    }
  }
})();
That returns:
I am trying to get information from a web page using Puppeteer, but I can't find the selector that I need. I suppose that's because the page contains more than one HTML document, and I can't find a way to get the data that I need.
The inspection of the page:
Here is the code:
const puppeteer = require('puppeteer');
// Collects the job links from the openings page, then visits the first few
// and reads each job's title and location.
(async () => {
  const browser = await puppeteer.launch({ headless: false });

  try {
    const page = await browser.newPage();
    await page.goto('https://www.arrivia.com/careers/job-openings/');
    await page.waitForSelector('.job-search-result');

    const data = await page.evaluate(() => {
      const anchors = document.querySelectorAll('.job-search-result .job-btn-container a');
      // Local array (the original leaked `vacancies`/`element` as globals).
      return Array.from(anchors, (anchor) => anchor.href);
    });
    console.log(data.length);

    const vacancies = [];
    // Visit at most the first three links, without running past the list.
    const limit = Math.min(3, data.length);
    for (let i = 0; i < limit; i++) {
      const urljob = data[i];
      await page.goto(urljob);

      // The question's author reports that the error starts from here.
      // NOTE(review): the question text suggests `.app-title` is rendered
      // inside an embedded document (iframe); if so this wait times out on
      // the top-level page and the lookup must target that frame -- confirm.
      await page.waitForSelector(".app-title"); // one of the selectors the author could not find

      // Pass urljob in explicitly: the callback runs in the page and cannot
      // see Node-side variables.
      const job = await page.evaluate((urljob) => {
        return {
          title: document.querySelector(".app-title").innerText,
          location: document.querySelector(".location").innerText,
          url: urljob,
        };
      }, urljob);
      vacancies.push(job);
    }

    console.log(vacancies);
  } finally {
    // Close the browser even when a navigation or selector wait fails.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Iframes are not always the easiest things to deal with, in Puppeteer. But a way to bypass this could be to access directly the URL of the iframe, instead of accessing the page which hosts the iframe. It's also faster:
const puppeteer = require("puppeteer");
// Reads the job links from the openings page, then opens each job's embedded
// application page directly (instead of the host page that wraps it in an
// iframe) and collects title and location.
(async () => {
  // Upper bound on visited jobs; raise it (or drop the slice below) to
  // process every link in jobUrls.
  const MAX_JOBS = 10;

  const browser = await puppeteer.launch({ headless: false, defaultViewport: null });
  try {
    const page = await browser.newPage();
    await page.goto("https://www.arrivia.com/careers/job-openings/", {
      waitUntil: "domcontentloaded",
    });

    const jobUrls = await page.$$eval(".job-search-result .job-btn-container a",
      els => els.map(el => el.href));

    const vacancies = [];
    // slice() keeps the loop inside the list when fewer than MAX_JOBS exist.
    for (const url of jobUrls.slice(0, MAX_JOBS)) {
      const match = /job_id=(\d+)/.exec(url); // Extract the ID from the link
      if (!match) {
        // A link without a job_id cannot be mapped to an iframe URL.
        continue;
      }
      const jobId = match[1];

      await page.goto(
        `https://boards.greenhouse.io/embed/job_app?token=${jobId}`, // Go to iframe URL
        { waitUntil: "domcontentloaded" }
      );

      vacancies.push({
        title: await page.$eval(".app-title", el => el.innerText),
        location: await page.$eval(".location", el => el.innerText),
        url,
      });
    }

    console.log(vacancies);
  } finally {
    // Release the browser whether or not scraping succeeded.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Output:
[
{
title: 'Director of Account Management',
location: 'Scottsdale, AZ',
url: 'https://www.arrivia.com/careers/job/?job_id=2529695'
},
{
title: "Site Admin and Director's Assistant",
location: 'Albufeira, Portugal',
url: 'https://www.arrivia.com/careers/job/?job_id=2540303'
},
...
]
How can I select this element (an HTML anchor element to click in order to navigate to the Tutorial page) using Puppeteer?
I was doing this, and it is not working:
const puppeteer = require('puppeteer');
const url = process.argv[2];
if (!url) {
  // Throw an Error object (not a bare string) so a stack trace is available.
  throw new Error("Please provide URL as a first argument");
}

/**
 * Opens `url`, follows the third top-level navigation link, clicks the first
 * download button, clicks the link in the second paragraph of the contents
 * table, and saves the resulting page as `pdfGenerated.pdf`.
 *
 * @returns {Promise<void>} Settles after the PDF is written and the browser
 *   is closed.
 */
async function run() {
  const browser = await puppeteer.launch({
    headless: false,
    defaultViewport: null,
    slowMo: 10,
    args: ['--start-maximized', '--disable-notifications']
  });

  try {
    const page = await browser.newPage();
    await page.goto(url);

    const menuLink = ".python-navigation .navigation.menu .tier-1.element-3 a";
    await page.waitForSelector(menuLink, {
      visible: true
    });
    await page.click(menuLink);

    await page.waitForSelector(".documentation-banner .download-buttons ", {
      visible: true
    });
    const buttons = await page.$$('.documentation-banner .download-buttons a');
    await buttons[0].click();

    await page.waitForSelector(".contentstable", {
      visible: true
    });
    // page.$$() returns a promise and cannot be indexed or chained, so use a
    // single descendant selector. Start waiting for the navigation before
    // clicking so the PDF is taken from the destination page.
    await Promise.all([
      page.waitForNavigation(),
      page.click(".contentstable tbody tr td p:nth-child(2) a"),
    ]);

    // NOTE(review): some Puppeteer versions only support page.pdf() in
    // headless mode -- confirm against the installed version.
    await page.pdf({path: 'pdfGenerated.pdf',format:"A4"});
    console.log("Success");
  } finally {
    // Await the close so the process does not exit mid-shutdown and the
    // browser is released on failure too.
    await browser.close();
  }
}

run().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
What should I write instead of this line elem = await page.$$('.contentstable')[0].$$('tbody')[0].$$('tr')[0].$$('td')[0].$$('p')[1]; ?
To click on an element, you can use this code:
await page.click(".contentstable tbody tr td p:nth-child(2) a");