I've been using puppeteer to try and get pdfs - or its buffer response - from a website which does two requests after clicking on the link for the document (which open in a new tab):
The first request (http://epicdocs.planningni.gov.uk/ViewDocument.pa?uri=4157826&ext=PDF) retrieves the session guid to access the document
The second request (http://epicdocs.planningni.gov.uk/ViewDocument.aspx?guid=4ecd1fe5-43c6-4202-96e3-66b393fb819c) uses that guid to access the document and render the pdf on the browser.
The result of my attempts has been a blank pdf being generated, even if it was created after the page been loaded (checked with Fiddler).
I've tried
Intercepting targetcreated event to get the page
Get the second request url and use page.goto to get the pdf
Wait on a the page response to get the buffer
Set Page.setDownloadBehaviour to allow download instead of rendering it in the browser
Any guidance and help is appreciated.
The code tried is below:
const puppeteer = require("puppeteer");
let browser;
async function getDocument(index, title, page) {
if (index != 19) return "";
console.log("getDocument START");
console.log("#repDocuments__ctl" + index + "_lnkViewDoc\ntitle: " + title);
let docPagePromise = new Promise((resolve, reject) =>
browser.once("targetcreated", async target => {
let targetUrl = await target.url();
if (targetUrl.indexOf("ViewDocument.aspx?") !== -1) {
console.log(targetUrl);
return resolve(target.page());
} else {
console.log("Failed to detect the ViewDocument page");
}
})
);
/* Tried to set the download behaviour to download automatically the pdf but it didn't work */
// await page._client.send("Page.setDownloadBehaviour", {
// behaviour: "allow",
// downloadPath: "./"
// });
await page.click(`#repDocuments__ctl${index}_lnkViewDoc`);
let pdfResults = "";
let pdfPage = await docPagePromise;
/* If I get the target from the page returned from the promise I get the correct ur, however the page url is blank */
// let target = await pdfPage.target();
// let url = await target.url();
// let response = await pdfPage.goto(url);
// console.log(response);
pdfPage.on("console.log", msg => console.log(msg));
/* This is never called */
await pdfPage.on("response", async response => {
console.log("PDF PAGE Response");
let responseBuffer = await response.buffer();
let responseHeaders = response.headers();
console.log("PDF PAGE Response Header: " + responseHeaders);
console.log("PDF PAGE Response Buffer: " + responseBuffer);
return {
responseHeaders,
responseBuffer
};
});
console.log(pdfResults);
let pdfTitle = await pdfPage.title();
console.log("PDFPage URL: " + pdfPage.url());
console.log("PDFPage Title: " + pdfTitle);
let pdfTarget = await pdfPage.target();
console.log("PDFTarget URL: " + (await pdfTarget.url()));
console.log("PDFTarget Type: " + pdfTarget.type());
pdfPage = await pdfTarget.page();
console.log("PDFPage URL: " + pdfPage.url());
await pdfPage.waitFor(3000);
let pdf = await pdfPage.pdf({ path: title + ".pdf" });
console.log(pdf);
return pdf;
}
async function getAdditionalDocumentation(page) {
console.log("getAdditionalDocumentation START");
await page.waitForSelector("#repGroupSummary__ctl1_lnkGroupName");
await page.click("#repGroupSummary__ctl1_lnkGroupName");
await page.waitForSelector("#pnlDocumentList > table > tbody > tr");
await page.waitFor(2000);
const documents = await page.$$eval(
"#pnlDocumentList > table > tbody > tr",
docs =>
docs.map((doc, i) => ({
type: doc.querySelector(".tdl-subgroup > span").innerText,
datePublished: doc.querySelector(
".tdl-date > span[id*='DatePublished']"
).innerText,
dateReceived: doc.querySelector(".tdl-date > span[id*='DateReceived']")
.innerText,
docType: doc.querySelector(".tdl-doctype > span").innerText,
description: doc.querySelector(".tdl-description > span").innerText
// 'docBuffer': window.getDocument(i + 1, doc.querySelector('.tdl-description > span').innerText)
}))
);
for (let i = 0; i < documents.length; i++) {
documents[i].docBuffer = await getDocument(i + 1, documents[i].description, page);
}
await page.click("#btnSummary");
console.log("getAdditionalDocumentation FINISH");
return documents;
}
async function getDocuments(page, browser) {
console.log("getDocuments");
let newPagePromise = new Promise((resolve, reject) =>
browser.once("targetcreated", async target => {
let targetUrl = await target.url();
if (targetUrl.indexOf("ShowCaseFile.aspx?") !== -1) {
console.log(targetUrl);
return resolve(target.page());
} else {
console.log("Failed to detect the ShowCaseFile page");
}
})
);
await page.click("#tab_externalDocuments > span");
await page.waitForSelector("#hp-doc-link");
await page.click("#hp-doc-link");
const newPage = await newPagePromise;
const additionalDocumentation = await getAdditionalDocumentation(newPage);
return {
additionalDocumentation
};
}
async function run() {
try {
browser = await puppeteer.launch();
const page = await browser.newPage();
page.on("console", msg => console.log("PAGE LOG:", ...msg.args));
const planningReference = "LA04/2017/1388/F";
await page.goto(
"http://epicpublic.planningni.gov.uk/publicaccess/search.do?action=simple&searchType=Application"
);
await page.waitForSelector("#simpleSearchString");
await page.type("#simpleSearchString", planningReference);
await page.click("#simpleSearchForm > div.row3 > input.button.primary");
await page.waitForSelector("#simpleDetailsTable");
console.log("getDocuments START");
const documents = await getDocuments(page, browser);
console.log("getDocuments FINISH");
console.log(documents);
console.log(documents.additionalDocumentation.length);
} finally {
browser.close();
}
}
run();
Use exposefunction to write the buffer data to disk with:
page.exposeFunction("writeABString", async (strbuf, targetFile) => {
var str2ab = function _str2ab(str) { // Convert a UTF-8 String to an ArrayBuffer
var buf = new ArrayBuffer(str.length); // 1 byte for each char
var bufView = new Uint8Array(buf);
for (var i=0, strLen=str.length; i < strLen; i++) {
bufView[i] = str.charCodeAt(i);
}
return buf;
}
console.log("In 'writeABString' function...");
return new Promise((resolve, reject) => {
// Convert the ArrayBuffer string back to an ArrayBufffer, which in turn is converted to a Buffer
let buf = Buffer.from(str2ab(strbuf));
// Try saving the file.
fs.writeFile(targetFile, buf, (err, text) => {
if(err) reject(err);
else resolve(targetFile);
});
});
});
With the download link that you have use it in tandem with fetch api to get it as blob and convert it with:
page.evaluate( async () => {
function arrayBufferToString(buffer){ // Convert an ArrayBuffer to an UTF-8 String
var bufView = new Uint8Array(buffer);
var length = bufView.length;
var result = '';
var addition = Math.pow(2,8)-1;
for(var i = 0;i<length;i+=addition){
if(i + addition > length){
addition = length - i;
}
result += String.fromCharCode.apply(null, bufView.subarray(i,i+addition));
}
return result;
}
let geturl = "https://whateverurl.example.com";
return fetch(geturl, {
credentials: 'same-origin', // usefull when we are logged into a website and want to send cookies
responseType: 'arraybuffer', // get response as an ArrayBuffer
})
.then(response => response.arrayBuffer())
.then( arrayBuffer => {
var bufstring = arrayBufferToString(arrayBuffer);
return window.writeABString(bufstring, '/tmp/downloadtest.pdf');
})
.catch(function (error) {
console.log('Request failed: ', error);
});
});
For more info look at this issue on the github puppeteer page. The above solution was also suggested in the issue.
Source
Related
I am trying to store some data into a group of PDF files using Puppeteer. But when I try to run this code my app is freezing and I assume I am doing something wrong with the async and await code. It seems to be working properly, the documents are created properly in the folder of the project, but at some point node crashes and I must reboot my PC.
const path = require("path");
const puppeteer = require("puppeteer");
const fs = require('fs');
for (let index = 0; index < documentsToPDF.length; index ++) {
const result = await generatePDF(documentsToPDF[index]);
await createAndSavePDFInFile(documentsToPDF[index], result);
}
async function generatePDF(browser, numdoc) {
const filePage = await browser.newPage();
filePage.setDefaultNavigationTimeout(0);
await filePage.goto(renderPath(numdoc), {
waitUntil: "networkidle0",
});
return filePage.pdf({
format: "A4",
});
}
async function createAndSavePDFInFile(numdoc, result) {
console.log(result);
if(result) {
return fs.writeFile(
path.join(__dirname, `../results/result-${numdoc}.pdf`),
result,
(error, result) => {
if(error) console.log('error', err);
else console.log(`PDF result-${numdoc}.pdf `);
}
);
} else {
console.log(`numdoc `, numdoc, ' could not be saved into pdf');
}
}
Thanks for your help, I'm new to node and I'm learning :)
EDIT:
This loop now is working fine
for (let index = 0; index < documentsToPDF.length; index += 1) {
const browser = await puppeteer.launch(puppeteerParams);
const result = await generatePDF(browser,documentsToPDF[index]);
await createAndSavePDFInFile(documentsToPDF[index], result);
browser.close();
console.log('Exp: ', documentsToPDF[index], ' SUCCESS');
}
But can I do something like this?
const pdfPromises = [];
for (let index = 0; index < documentsToPDF.length; index += 1) {
pdfPromises.push(createPdf(puppeteer, puppeteerParams, documentsToPDF[index]))
}
await Promise.all(pdfPromises);
async function createPdf(puppeteer, puppeteerParams, numexp) {
const browser = await puppeteer.launch(puppeteerParams);
const result = await generatePDF(browser, numexp);
await createAndSavePDFInFile(numexp, result);
browser.close();
console.log('Exp: ', numexp, ' SUCCESS');
}
I've tried and it returns this error:
MaxListenersExceededWarning: Possible EventEmitter memory leak detected. 11 event listeners added. Use emitter.setMaxListeners() to increase limit.
I am trying to use setTimeout in a for loop so that my HTTP requests get sent once per second to avoid rate-limiting. However, it doesn't seem to be working. Could someone please help?
async function initiateSearchExperimental() {
const json = await getCollections();
for (let i = 0; i < json.result.data.length; i++) {
setTimeout(getData(json, i), 1000 * i)
}
}
function getData(json, i) {
fetch(`https://howrare.is${json.result.data[i].url}/?for_sale=on&sort_by=rank`).then(function(response) {
// The API call was successful!
return response.text();
}).then(function(html) {
// Convert the HTML string into a document object
var parser = new DOMParser();
var doc = parser.parseFromString(html, 'text/html');
var priceArray = getPriceArray(doc.querySelectorAll("div.featured_item"))
console.log(json.result.data[i].url, curateArrayTwo(priceArray, json.result.data[i]))
}).catch(function(err) {
// There was an error
console.warn('Something went wrong.', err);
});
}
No need for await and such
I suggest you DO use setTimeout, but do it in the success of the second fetch
let data;
let cnt = 0;
const getData() {
if (cnt >= data.length) return; // stop
fetch(`https://howrare.is${data[cnt].url}/?for_sale=on&sort_by=rank`)
.then(function(response) {
// The API call was successful!
return response.text();
}).then(function(html) {
// Convert the HTML string into a document object
var parser = new DOMParser();
var doc = parser.parseFromString(html, 'text/html');
var priceArray = getPriceArray(doc.querySelectorAll("div.featured_item"))
console.log(data[cnt].url, curateArrayTwo(priceArray, data[cnt]))
cnt++
setTimeout(getData, 1000)
}).catch(function(err) {
// There was an error
console.warn('Something went wrong.', err);
});
};
fetch(collectionurl)
.then(response => response.json())
.then(json => {
data = json.result.data;
getData()
});
This is the code to fetch all the results from the website.
const puppeteer = require('puppeteer');
let students = [];
let rollPrefix = '387EA';
let regPrefix = 'EA87S18';
let currRoll = 80;
let currReg = 80;
let i = 0;
(async () => {
const browser = await puppeteer.launch({
headless: false, // Show the window for debugging
slowMo: 150 // slow down by 50ms
});
const page = await browser.newPage();
let rolltemp = rollPrefix + pad(currRoll,3);
let regTemp = regPrefix + pad(currReg,3);
while(i < 4){
await page.goto('http://orissaresults.nic.in/CHSE');
await page.type('#txtRollNo', rolltemp);
await page.type('#txtRegNo', regTemp);
const element = await page.$("#divCaptch");
const text = await (await element.getProperty('textContent')).jsonValue();
await page.type('#txt_UserCaptcha', text);
await page.click('#btnSubmit');
page.on('dialog', async (dialog) => {
await dialog.dismiss().catch(() => {
console.log(dialog.message());
return new Result(TestStatus.FAIL, dialog.message());
})})
try{
await page.waitForNavigation()
await page.waitForSelector('table');
const RollNO = await page.evaluate(() => {
return document.querySelectorAll('table')[2].rows[0].cells[1].innerText.trim();
});
const Name = await page.evaluate(() => {
return document.querySelectorAll('table')[2].rows[2].cells[1].innerText.trim();
});
const RegNo = await page.evaluate(() => {
return document.querySelectorAll('table')[2].rows[1].cells[1].innerText.trim();
});
const Total = await page.evaluate(() => {
return document.querySelectorAll('table')[3].rows[8].cells[0].innerText.trim();
});
let student = new Student(RollNO,Name,RegNo,Total)
students.push(student)
}catch{
currReg++;
continue;
}
currRoll++;
i++;
}
await browser.close()
// let json = JSON.stringify(students);
// storeData(json,'test.json')
})();
// function delay(time) {
// return new Promise(function(resolve) {
// setTimeout(resolve, time)
// });
// }
function pad(num, size) {
var s = num+"";
while (s.length < size) s = "0" + s;
return s;
}
class Student {
constructor(roll,name,reg,total){
this.roll = roll;
this.name = name;
this.reg = reg;
this.total = total;
}
}
const fs = require('fs')
const storeData = (data, path) => {
try {
fs.writeFileSync(path, data)
} catch (err) {
console.error(err)
}
}
Here the variable value of currReg stays the same pls help
The code tries each roll no and reg no combinations but there are some reg no that doesnt match with roll no so in the code the roll no should stay the same but the reg no should increase by one..
Not really sure what should happen with each combination, but here's an implementation which inputs all combinations. Below a short explanation:
const puppeteer = require('puppeteer');
let students = [];
(async () => {
const browser = await puppeteer.launch({
headless: false, // Show the window for debugging
slowMo: 150 // slow down by 50ms
});
const page = await browser.newPage();
let i = 0;
let j = 0;
const rollPrefix = '387EA';
const regPrefix = 'EA87S18';
let currRoll = 80;
let currReg = 80;
while(i < 4){
while(j < 4) {
let rolltemp = rollPrefix + pad(currRoll,3);
let regTemp = regPrefix + pad(currReg,3);
console.log("rolltemp = ", rolltemp, " regtemp = ", regTemp);
await page.goto('http://orissaresults.nic.in/CHSE');
await page.type('#txtRollNo', rolltemp);
await page.type('#txtRegNo', regTemp);
const element = await page.$("#divCaptch");
const text = await (await element.getProperty('textContent')).jsonValue();
await page.type('#txt_UserCaptcha', text);
await page.click('#btnSubmit');
page.on('dialog', async (dialog) => {
await dialog.dismiss().catch(() => {
console.log(dialog.message());
return new Result(TestStatus.FAIL, dialog.message());
})})
try{
await page.waitForNavigation()
await page.waitForSelector('table');
const RollNO = await page.evaluate(() => {
return document.querySelectorAll('table')[2].rows[0].cells[1].innerText.trim();
});
const Name = await page.evaluate(() => {
return document.querySelectorAll('table')[2].rows[2].cells[1].innerText.trim();
});
const RegNo = await page.evaluate(() => {
return document.querySelectorAll('table')[2].rows[1].cells[1].innerText.trim();
});
const Total = await page.evaluate(() => {
return document.querySelectorAll('table')[3].rows[8].cells[0].innerText.trim();
});
let student = new Student(RollNO,Name,RegNo,Total)
students.push(student)
} catch {
continue;
}
currReg++;
j++;
}
currReg = 80;
j = 0;
currRoll++;
i++;
}
await browser.close()
// let json = JSON.stringify(students);
// storeData(json,'test.json')
})();
// function delay(time) {
// return new Promise(function(resolve) {
// setTimeout(resolve, time)
// });
// }
function pad(num, size) {
var s = num+"";
while (s.length < size) s = "0" + s;
return s;
}
class Student {
constructor(roll,name,reg,total){
this.roll = roll;
this.name = name;
this.reg = reg;
this.total = total;
}
}
const fs = require('fs')
const storeData = (data, path) => {
try {
fs.writeFileSync(path, data)
} catch (err) {
console.error(err)
}
}
Explanation
so, assuming you want all combinations of pairs {currentRol, currentReg}, you'll definitely need two loops. There are gonna be 4x4=16 combinations in total (I assume, basing on i < 4 condition). First mistake which you made was assigning regTemp before while loop, effectively not changing the strings entered to the inputs, only some unused, temporary values (currentRoll, currentReg). So, first and foremost is to move rollTemp and regTemp definitions into the while loop. Now, as I said, you're gonna need two nested loops, as you need to generate all possible combinations (for each currentRol, all currentRegs). One more thing to remember is that you'll have to reset currentReg with each outer loop iteration, as you want to test each reg for given roll.
Note about variables' scopes
This is a great example why variables scopes are critical when programming. Not only it increases readability and comprehensibility of the given code - it prevents other functions/scopes from using symbols which do not really belong to them. Please notice where the variables definitions are within my snippet. Probably it's not perfect, but why would you pollute global namespace as in your example?
So I am trying to pull out information using data scraping from this real estate website (https://www.zillow.com/vancouver-bc/)
I am able to get all the information about the listing on the page but with images (image links/src), after a few of them, the result is some garbage. I tried researching and found it was because of lazy loading. For which is tried almost all the methods available and answered by others but none seem to work - this includes scrolling to the bottom, scrolling with delays (https://www.npmjs.com/package/puppeteer-autoscroll-down), zooming out the browser as much as I can to get the images to render. But it still doesn't work. I have been looking everywhere for hours now before I decided to post my question and code here itself for anyone else to figure it out.
let cheerio = require('cheerio')
let puppeteer = require('puppeteer-extra')
const pluginStealth = require("puppeteer-extra-plugin-stealth")
puppeteer.use(pluginStealth())
let userAgent = require('random-useragent')
const baseURL = "https://www.zillow.com/vancouver-bc"
let estateData = []
let urlLinks = []
let scrollPageToBottom = require('puppeteer-autoscroll-down')
let getEstateData = async () => {
estateData = []
urlLinks = []
let url
for (let pgNum = 1; pgNum <= 1; pgNum++) {
if (pgNum === 1) {
url = baseURL + "/"
} else {
url = baseURL + ("/" + pgNum + "_p")
}
urlLinks.push(url)
}
await searchWebsite()
console.log("search over")
return estateData
//module.exports = estateData
}
let searchWebsite = async () => {
await puppeteer
.launch({headless : false})
.then(async function (browser) {
let page = await browser.newPage();
// await page.setRequestInterception(true)
//
// page.on('request', (req) => {
// if( req.resourceType() === 'image' || req.resourceType() === 'stylesheet' || req.resourceType() === 'font'){
// req.abort()
// }
// else {
// req.continue()
// }
//
// })
let html
await page.setUserAgent(userAgent.getRandom())
for(let url of urlLinks){
console.log(url)
await page.goto(url).then(async function () {
html = await page.content();
let obj = await cheerio('.list-card-link.list-card-info', html)
let imgObj = await cheerio(".list-card-top", html)
let geoLocation = await cheerio(".photo-cards.photo-cards_wow", html)
// await page.waitForSelector('img',{
// visible: true,
// })
// await page.evaluate(() => { window.scrollTo(0, document.body.scrollHeight)})
const scrollStep = 250 // default
const scrollDelay = 100 // default
const lastPosition = await scrollPageToBottom(page, scrollStep, scrollDelay)
await page.waitFor(2000)
let num = 0
console.log(obj.length)
for (let key in obj) {
if (obj[key].attribs) {
try {
let geoStr = await geoLocation[0].children[0].children[0].children[0].data
let geoObj = await (JSON.parse(geoStr)["geo"])
let extractedInfo = {
estateName : await obj[key].children[0].children[0].data,
estatePrice : await obj[key].children[2].children[0].children[0].data,
saleType : await obj[key].children[1].children[0].next.data,
estateConfig : {
beds : await obj[key].children[2].children[1].children[0].children[0].data,
bath : await obj[key].children[2].children[1].children[1].children[0].data,
area : await obj[key].children[2].children[1].children[2].children[0].data
},
estateLocation : {
longitude : await geoObj.longitude,
latitude : await geoObj.latitude
},
estateLink : await obj[key].attribs.href,
estateCoverImgLink : await imgObj[num++].children[2].children[0].attribs.src
}
console.log(extractedInfo.estateName, imgObj[num].children[2].children[0].attribs.src)
await estateData.push(extractedInfo)
}
catch (e) {
console.log("Estate Skipped - ", obj[key].children[0].children[0].data, obj[key].attribs.href)
console.log(e)
}
}
}
console.log(estateData.length)
});
}
//Now read the page
console.log("total - ", estateData.length)
await page.close()
await browser.close()
})
.catch(function (err) {
console.log(err)
});
}
module.exports.getEstateData = getEstateData
I had a similar issue and found a working answer here. Hopefully this works for you too. The interval was a little slow so I changed it from 100 to 30.
I was able to solve this with a pretty simple implementation using the puppeteer-autoscroll-down library as you mentioned. I'm not sure which images you were specifically attempting to grab, but this worked for me.
// Set the initial viewport and navigate to the page
await page.setViewport({ width: 1300, height: 1000 });
await page.goto('https://www.zillow.com/vancouver-bc/', { waitUntil: 'load' });
// Scroll to the very top of the page
await page.evaluate(_ => {
window.scrollTo(0, 0);
});
// Scroll to the bottom of the page with puppeteer-autoscroll-down
await scrollPageToBottom(page);
// Get your image links
let imageLinks = await page.$$eval('.list-card img', imgLinks => {
return imgLinks.map((i) => i.src);
});
imageLinks was an array with 40 fully formed links, https://photos.zillowstatic.com/p_e/ISz7wlfm278p501000000000.jpg is one example.
Hope that helps you, this was a pretty brutal one for me to solve as well.
I am looking for a way to check if all img src from a specific page results in a 200. I got this script so far:
test('Check if all images exist', async t => {
var images = Selector('img');
var count = await images.count;
for(var i=0; i < count; i++) {
var url = await images.nth(i).getAttribute('src');
if(!url.startsWith('data')) {
console.log(url);
console.log(getHTTPStatus(url));
console.log(await t.navigateTo(url));
}
}
});
Now we are able to read the src attribute and skip them if they start with "data" to avoid base64 images. If I use the navigateTo command now I see the image in the browser, but am not able to do anything else. Are you able to help me checking things?
To check that all image responses have 200 status, you can use TestCafe ClientFunction:
import { Selector, ClientFunction } from 'testcafe';
fixture `fixture`
.page `https://www.google.com`;
test('Check if all images exist', async t => {
var images = Selector('img');
var count = await images.count;
var requestsCount = 0;
var statuses = [];
var getRequestResult = ClientFunction(url => {
return new Promise(resolve => {
var xhr = new XMLHttpRequest();
xhr.open('GET', url);
xhr.onload = function () {
resolve(xhr.status);
};
xhr.send(null);
});
});
for (var i = 0; i < count; i++) {
var url = await images.nth(i).getAttribute('src');
if (!url.startsWith('data')) {
requestsCount++;
statuses.push(await getRequestResult(url));
}
}
await t.expect(requestsCount).eql(statuses.length);
for (const status of statuses)
await t.expect(status).eql(200);
});
Or, you can use some addition module, for example, a request to simplify the code:
import { Selector, ClientFunction } from 'testcafe';
import request from 'request';
fixture `fixture`
.page `https://www.google.com`;
const getLocation = ClientFunction(() => window.location.href);
test('Check if all images exist', async t => {
var images = Selector('img');
var count = await images.count;
var location = await getLocation();
var requestPromises = [];
for (var i = 0; i < count; i++) {
var url = await images.nth(i).getAttribute('src');
if (!url.startsWith('data')) {
requestPromises.push(new Promise(resolve => {
return request(location + url, function (error, response) {
resolve(response ? response.statusCode : 0);
});
}));
}
}
var statuses = await Promise.all(requestPromises);
for (const status of statuses)
await t.expect(status).eql(200);
});