So I am trying to pull out information using data scraping from this real estate website (https://www.zillow.com/vancouver-bc/)
I am able to get all the information about the listing on the page but with images (image links/src), after a few of them, the result is some garbage. I tried researching and found it was because of lazy loading. For which is tried almost all the methods available and answered by others but none seem to work - this includes scrolling to the bottom, scrolling with delays (https://www.npmjs.com/package/puppeteer-autoscroll-down), zooming out the browser as much as I can to get the images to render. But it still doesn't work. I have been looking everywhere for hours now before I decided to post my question and code here itself for anyone else to figure it out.
let cheerio = require('cheerio')
let puppeteer = require('puppeteer-extra')
const pluginStealth = require("puppeteer-extra-plugin-stealth")
puppeteer.use(pluginStealth())
let userAgent = require('random-useragent')
const baseURL = "https://www.zillow.com/vancouver-bc"
let estateData = []
let urlLinks = []
let scrollPageToBottom = require('puppeteer-autoscroll-down')
let getEstateData = async () => {
estateData = []
urlLinks = []
let url
for (let pgNum = 1; pgNum <= 1; pgNum++) {
if (pgNum === 1) {
url = baseURL + "/"
} else {
url = baseURL + ("/" + pgNum + "_p")
}
urlLinks.push(url)
}
await searchWebsite()
console.log("search over")
return estateData
//module.exports = estateData
}
let searchWebsite = async () => {
await puppeteer
.launch({headless : false})
.then(async function (browser) {
let page = await browser.newPage();
// await page.setRequestInterception(true)
//
// page.on('request', (req) => {
// if( req.resourceType() === 'image' || req.resourceType() === 'stylesheet' || req.resourceType() === 'font'){
// req.abort()
// }
// else {
// req.continue()
// }
//
// })
let html
await page.setUserAgent(userAgent.getRandom())
for(let url of urlLinks){
console.log(url)
await page.goto(url).then(async function () {
html = await page.content();
let obj = await cheerio('.list-card-link.list-card-info', html)
let imgObj = await cheerio(".list-card-top", html)
let geoLocation = await cheerio(".photo-cards.photo-cards_wow", html)
// await page.waitForSelector('img',{
// visible: true,
// })
// await page.evaluate(() => { window.scrollTo(0, document.body.scrollHeight)})
const scrollStep = 250 // default
const scrollDelay = 100 // default
const lastPosition = await scrollPageToBottom(page, scrollStep, scrollDelay)
await page.waitFor(2000)
let num = 0
console.log(obj.length)
for (let key in obj) {
if (obj[key].attribs) {
try {
let geoStr = await geoLocation[0].children[0].children[0].children[0].data
let geoObj = await (JSON.parse(geoStr)["geo"])
let extractedInfo = {
estateName : await obj[key].children[0].children[0].data,
estatePrice : await obj[key].children[2].children[0].children[0].data,
saleType : await obj[key].children[1].children[0].next.data,
estateConfig : {
beds : await obj[key].children[2].children[1].children[0].children[0].data,
bath : await obj[key].children[2].children[1].children[1].children[0].data,
area : await obj[key].children[2].children[1].children[2].children[0].data
},
estateLocation : {
longitude : await geoObj.longitude,
latitude : await geoObj.latitude
},
estateLink : await obj[key].attribs.href,
estateCoverImgLink : await imgObj[num++].children[2].children[0].attribs.src
}
console.log(extractedInfo.estateName, imgObj[num].children[2].children[0].attribs.src)
await estateData.push(extractedInfo)
}
catch (e) {
console.log("Estate Skipped - ", obj[key].children[0].children[0].data, obj[key].attribs.href)
console.log(e)
}
}
}
console.log(estateData.length)
});
}
//Now read the page
console.log("total - ", estateData.length)
await page.close()
await browser.close()
})
.catch(function (err) {
console.log(err)
});
}
module.exports.getEstateData = getEstateData
I had a similar issue and found a working answer here. Hopefully this works for you too. The interval was a little slow so I changed it from 100 to 30.
I was able to solve this with a pretty simple implementation using the puppeteer-autoscroll-down library as you mentioned. I'm not sure which images you were specifically attempting to grab, but this worked for me.
// Set the initial viewport and navigate to the page
await page.setViewport({ width: 1300, height: 1000 });
await page.goto('https://www.zillow.com/vancouver-bc/', { waitUntil: 'load' });
// Scroll to the very top of the page
await page.evaluate(_ => {
window.scrollTo(0, 0);
});
// Scroll to the bottom of the page with puppeteer-autoscroll-down
await scrollPageToBottom(page);
// Get your image links
let imageLinks = await page.$$eval('.list-card img', imgLinks => {
return imgLinks.map((i) => i.src);
});
imageLinks was an array with 40 fully formed links, https://photos.zillowstatic.com/p_e/ISz7wlfm278p501000000000.jpg is one example.
Hope that helps you, this was a pretty brutal one for me to solve as well.
Related
I am trying to store some data into a group of PDF files using Puppeteer. But when I try to run this code my app is freezing and I assume I am doing something wrong with the async and await code. It seems to be working properly, the documents are created properly in the folder of the project, but at some point node crashes and I must reboot my PC.
const path = require("path");
const puppeteer = require("puppeteer");
const fs = require('fs');
for (let index = 0; index < documentsToPDF.length; index ++) {
const result = await generatePDF(documentsToPDF[index]);
await createAndSavePDFInFile(documentsToPDF[index], result);
}
async function generatePDF(browser, numdoc) {
const filePage = await browser.newPage();
filePage.setDefaultNavigationTimeout(0);
await filePage.goto(renderPath(numdoc), {
waitUntil: "networkidle0",
});
return filePage.pdf({
format: "A4",
});
}
async function createAndSavePDFInFile(numdoc, result) {
console.log(result);
if(result) {
return fs.writeFile(
path.join(__dirname, `../results/result-${numdoc}.pdf`),
result,
(error, result) => {
if(error) console.log('error', err);
else console.log(`PDF result-${numdoc}.pdf `);
}
);
} else {
console.log(`numdoc `, numdoc, ' could not be saved into pdf');
}
}
Thanks for your help, I'm new to node and I'm learning :)
EDIT:
This loop now is working fine
for (let index = 0; index < documentsToPDF.length; index += 1) {
const browser = await puppeteer.launch(puppeteerParams);
const result = await generatePDF(browser,documentsToPDF[index]);
await createAndSavePDFInFile(documentsToPDF[index], result);
browser.close();
console.log('Exp: ', documentsToPDF[index], ' SUCCESS');
}
But can I do something like this?
const pdfPromises = [];
for (let index = 0; index < documentsToPDF.length; index += 1) {
pdfPromises.push(createPdf(puppeteer, puppeteerParams, documentsToPDF[index]))
}
await Promise.all(pdfPromises);
async function createPdf(puppeteer, puppeteerParams, numexp) {
const browser = await puppeteer.launch(puppeteerParams);
const result = await generatePDF(browser, numexp);
await createAndSavePDFInFile(numexp, result);
browser.close();
console.log('Exp: ', numexp, ' SUCCESS');
}
I've tried and it returns this error:
MaxListenersExceededWarning: Possible EventEmitter memory leak detected. 11 event listeners added. Use emitter.setMaxListeners() to increase limit.
i'm scrollig a page towards the bottom with puppeteer, when i press a key i want to exit from the function end return the results.
The script works almost, except i'm not able return values and the end of the script.
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({
headless: false,
userDataDir: "C:\\Users\\johndoe\\AppData\\Local\\Google\\Chrome\\User Data\\Default"
});
const page = await browser.newPage();
await page.setViewport({
width: 1920,
height: 1080,
deviceScaleFactor: 1,
});
await page.goto('https://www.facebook.com/groups/0000000/members',{waitUntil: 'networkidle0'});
let rawMembers = await page.evaluate(() => {
const intervall = 3000;
let stop = false;
document.addEventListener('keypress', e => stop = true); //press a key to exit
let results = [];
let i = 0;
let pageHeigth = 0;
let timerId = setTimeout(function tick() {
if ((stop === false) && (document.body.scrollHeight > pageHeigth)){
pageHeigth = document.body.scrollHeight //save the current page heigth
document.scrollingElement.scrollTop = pageHeigth; //move the scroll to the end of the page (page visible size), this will couse new content to be loaded - virtula scroll)
results.concat(pageHeigth); //<--- it should be the results
timerId = setTimeout(tick, intervall); //schedule a new timeout to repeat the function
}
else
{
clearTimeout(timerId)
return results;
}
}, intervall);
});
console.log('END')
//await browser.close();
})();
You can return a Promise and resolve it on timer termination (also, as mentioned in the other answer, concat() in not suited here, you can use push instead):
const rawMembers = page.evaluate(() => new Promise((resolve) => {
const intervall = 3000;
let stop = false;
document.addEventListener('keypress', () => { stop = true; });
const results = [];
let pageHeigth = 0;
let timerId = setTimeout(function tick() {
if (stop === false && document.body.scrollHeight > pageHeigth) {
pageHeigth = document.body.scrollHeight;
document.scrollingElement.scrollTop = pageHeigth;
results.push(pageHeigth);
timerId = setTimeout(tick, intervall);
} else {
clearTimeout(timerId);
resolve(results);
}
}, intervall);
}));
concat method returns you a new array. It is not modifying the existing array. When you call the concat method you need to assign it to a variable. So this might solve the problem
results.concat(pageHeight) //instead of this
let finalResult = results.concat(pageHeight) // you need to do this
Other ways your results array always be empty because concat method does not modify the array.
Here's the documentation. https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/concat#return_value
And you need to return finalResult instead of results at the end.
This is the code to fetch all the results from the website.
const puppeteer = require('puppeteer');
let students = [];
let rollPrefix = '387EA';
let regPrefix = 'EA87S18';
let currRoll = 80;
let currReg = 80;
let i = 0;
(async () => {
const browser = await puppeteer.launch({
headless: false, // Show the window for debugging
slowMo: 150 // slow down by 50ms
});
const page = await browser.newPage();
let rolltemp = rollPrefix + pad(currRoll,3);
let regTemp = regPrefix + pad(currReg,3);
while(i < 4){
await page.goto('http://orissaresults.nic.in/CHSE');
await page.type('#txtRollNo', rolltemp);
await page.type('#txtRegNo', regTemp);
const element = await page.$("#divCaptch");
const text = await (await element.getProperty('textContent')).jsonValue();
await page.type('#txt_UserCaptcha', text);
await page.click('#btnSubmit');
page.on('dialog', async (dialog) => {
await dialog.dismiss().catch(() => {
console.log(dialog.message());
return new Result(TestStatus.FAIL, dialog.message());
})})
try{
await page.waitForNavigation()
await page.waitForSelector('table');
const RollNO = await page.evaluate(() => {
return document.querySelectorAll('table')[2].rows[0].cells[1].innerText.trim();
});
const Name = await page.evaluate(() => {
return document.querySelectorAll('table')[2].rows[2].cells[1].innerText.trim();
});
const RegNo = await page.evaluate(() => {
return document.querySelectorAll('table')[2].rows[1].cells[1].innerText.trim();
});
const Total = await page.evaluate(() => {
return document.querySelectorAll('table')[3].rows[8].cells[0].innerText.trim();
});
let student = new Student(RollNO,Name,RegNo,Total)
students.push(student)
}catch{
currReg++;
continue;
}
currRoll++;
i++;
}
await browser.close()
// let json = JSON.stringify(students);
// storeData(json,'test.json')
})();
// function delay(time) {
// return new Promise(function(resolve) {
// setTimeout(resolve, time)
// });
// }
function pad(num, size) {
var s = num+"";
while (s.length < size) s = "0" + s;
return s;
}
class Student {
constructor(roll,name,reg,total){
this.roll = roll;
this.name = name;
this.reg = reg;
this.total = total;
}
}
const fs = require('fs')
const storeData = (data, path) => {
try {
fs.writeFileSync(path, data)
} catch (err) {
console.error(err)
}
}
Here the variable value of currReg stays the same pls help
The code tries each roll no and reg no combinations but there are some reg no that doesnt match with roll no so in the code the roll no should stay the same but the reg no should increase by one..
Not really sure what should happen with each combination, but here's an implementation which inputs all combinations. Below a short explanation:
const puppeteer = require('puppeteer');
let students = [];
(async () => {
const browser = await puppeteer.launch({
headless: false, // Show the window for debugging
slowMo: 150 // slow down by 50ms
});
const page = await browser.newPage();
let i = 0;
let j = 0;
const rollPrefix = '387EA';
const regPrefix = 'EA87S18';
let currRoll = 80;
let currReg = 80;
while(i < 4){
while(j < 4) {
let rolltemp = rollPrefix + pad(currRoll,3);
let regTemp = regPrefix + pad(currReg,3);
console.log("rolltemp = ", rolltemp, " regtemp = ", regTemp);
await page.goto('http://orissaresults.nic.in/CHSE');
await page.type('#txtRollNo', rolltemp);
await page.type('#txtRegNo', regTemp);
const element = await page.$("#divCaptch");
const text = await (await element.getProperty('textContent')).jsonValue();
await page.type('#txt_UserCaptcha', text);
await page.click('#btnSubmit');
page.on('dialog', async (dialog) => {
await dialog.dismiss().catch(() => {
console.log(dialog.message());
return new Result(TestStatus.FAIL, dialog.message());
})})
try{
await page.waitForNavigation()
await page.waitForSelector('table');
const RollNO = await page.evaluate(() => {
return document.querySelectorAll('table')[2].rows[0].cells[1].innerText.trim();
});
const Name = await page.evaluate(() => {
return document.querySelectorAll('table')[2].rows[2].cells[1].innerText.trim();
});
const RegNo = await page.evaluate(() => {
return document.querySelectorAll('table')[2].rows[1].cells[1].innerText.trim();
});
const Total = await page.evaluate(() => {
return document.querySelectorAll('table')[3].rows[8].cells[0].innerText.trim();
});
let student = new Student(RollNO,Name,RegNo,Total)
students.push(student)
} catch {
continue;
}
currReg++;
j++;
}
currReg = 80;
j = 0;
currRoll++;
i++;
}
await browser.close()
// let json = JSON.stringify(students);
// storeData(json,'test.json')
})();
// function delay(time) {
// return new Promise(function(resolve) {
// setTimeout(resolve, time)
// });
// }
function pad(num, size) {
var s = num+"";
while (s.length < size) s = "0" + s;
return s;
}
class Student {
constructor(roll,name,reg,total){
this.roll = roll;
this.name = name;
this.reg = reg;
this.total = total;
}
}
const fs = require('fs')
const storeData = (data, path) => {
try {
fs.writeFileSync(path, data)
} catch (err) {
console.error(err)
}
}
Explanation
so, assuming you want all combinations of pairs {currentRol, currentReg}, you'll definitely need two loops. There are gonna be 4x4=16 combinations in total (I assume, basing on i < 4 condition). First mistake which you made was assigning regTemp before while loop, effectively not changing the strings entered to the inputs, only some unused, temporary values (currentRoll, currentReg). So, first and foremost is to move rollTemp and regTemp definitions into the while loop. Now, as I said, you're gonna need two nested loops, as you need to generate all possible combinations (for each currentRol, all currentRegs). One more thing to remember is that you'll have to reset currentReg with each outer loop iteration, as you want to test each reg for given roll.
Note about variables' scopes
This is a great example why variables scopes are critical when programming. Not only it increases readability and comprehensibility of the given code - it prevents other functions/scopes from using symbols which do not really belong to them. Please notice where the variables definitions are within my snippet. Probably it's not perfect, but why would you pollute global namespace as in your example?
So basically im working on a cron job in my app that fires every 3 hours and updating users 'score' by calling the RiotApi
basically the function so far
exports.updatePlayersPoints = async () => {
console.log('STARTED UPDATING');
try {
const players = await UserLadder.findAll();
await Promise.all(
players.map(async (player) => {
const p = await RiotAccount.findOne({
where: {
userId: player.userId,
},
include: RiotRegions,
});
const beginTime = new Date(player.dataValues.createdAt);
let data;
try {
const res = await axios.get(
`https://${
p.dataValues.riot_region.dataValues.name
}.api.riotgames.com/lol/match/v4/matchlists/by-account/${
p.dataValues.accountId
}?queue=420&beginTime=${beginTime.getTime()}&api_key=${
process.env.RIOT_KEY
}`
);
data = res.data;
} catch (error) {
if (!error.response.status === 404) {
console.error(error);
}
}
if (!data) {
return;
}
let totalScore = player.dataValues.userPoints;
await Promise.all(
data.matches.map(async (match, i) => {
if (i < 15) {
const { data } = await axios.get(
`https://${p.dataValues.riot_region.dataValues.name}.api.riotgames.com/lol/match/v4/matches/${match.gameId}?api_key=${process.env.RIOT_KEY}`
);
const calculateScore = () => {
return new Promise((resolve) => {
const { stats } = _.find(
data.participants,
(o) => o.championId === match.champion
);
const killsPts = stats.kills * 2;
const deathPts = stats.deaths * -1.5;
const assistsPts = stats.assists;
const wardsPts = stats.wardsPlaced / 4;
const firstBloodPts = stats.firstBloodKill ? 3 : 0;
const firstBloodAssistPts = stats.firstBloodAssist ? 3 : 0;
const firstTowerPts = stats.firstTowerKill ? 2 : 0;
const firstTowerAssistPts = stats.firstTowerAssist ? 2 : 0;
const score =
killsPts +
deathPts +
assistsPts +
wardsPts +
firstBloodPts +
firstBloodAssistPts +
firstTowerPts +
firstTowerAssistPts;
totalScore += score;
resolve();
});
};
await calculateScore();
}
})
);
const user = await UserLadder.findOne({
where: {
userId: player.userId,
},
});
user.userPoints = parseFloat(totalScore);
user.lastGameId = data.matches[0].gameId;
await user.save();
})
);
console.log('FINISHED UPDATING');
} catch (error) {
console.error(error);
}
};
Basically it just looks up the table userladder to find the players that are signed to the ladder and for each one of these players it fires a map function that makes a request to the riotapi to get the match history of this player and then later make an inside map function to map each one of these matches.
but basically I updated it to now keep track of the game id of the last call before 3 hours so it doesn't have to make request that was already done.
user.lastGameId = data.matches[0].gameId;
but now in my second map function that maps the matches I wasn't it so that if the last game from my database matches the game id that currently being mapped I want to stop the map function and not continue this record or the ones after because it also means they all have been already counted.
but I can not seem to find a way to do it.
i tried using break; but it didn't work
any ideas?
using for loop
I tried a small test with for loop so I tried
for (let i = 0; i < 15; i++) {
await new Promise(async (resolve, reject) => {
const match = data.matches[i];
console.log(match);
resolve();
if (i === 1) {
break;
}
});
}
but I still go the same error
SyntaxError: Illegal break statement
Instead of trying to "break" a map, you should filter the matches that you want to process before you execute the map.
Something like this:
await Promise.all(
const filteredMatches = data.matches.filter(match => match.gameId > previousId);
filteredMatches.map(async (match, i) => { ...
More on filter() in javascript.
Edit: If generated id's are random and are not ordered, you can store all previous id's in a Set, and then just ask if it has been previously added
await Promise.all(
const filteredMatches = data.matches.filter(match => mySet.has(match.gameId));
filteredMatches.map(async (match, i) => { ...
More on Set in javascript.
I've been using puppeteer to try and get pdfs - or its buffer response - from a website which does two requests after clicking on the link for the document (which open in a new tab):
The first request (http://epicdocs.planningni.gov.uk/ViewDocument.pa?uri=4157826&ext=PDF) retrieves the session guid to access the document
The second request (http://epicdocs.planningni.gov.uk/ViewDocument.aspx?guid=4ecd1fe5-43c6-4202-96e3-66b393fb819c) uses that guid to access the document and render the pdf on the browser.
The result of my attempts has been a blank pdf being generated, even if it was created after the page been loaded (checked with Fiddler).
I've tried
Intercepting targetcreated event to get the page
Get the second request url and use page.goto to get the pdf
Wait on a the page response to get the buffer
Set Page.setDownloadBehaviour to allow download instead of rendering it in the browser
Any guidance and help is appreciated.
The code tried is below:
const puppeteer = require("puppeteer");
let browser;
async function getDocument(index, title, page) {
if (index != 19) return "";
console.log("getDocument START");
console.log("#repDocuments__ctl" + index + "_lnkViewDoc\ntitle: " + title);
let docPagePromise = new Promise((resolve, reject) =>
browser.once("targetcreated", async target => {
let targetUrl = await target.url();
if (targetUrl.indexOf("ViewDocument.aspx?") !== -1) {
console.log(targetUrl);
return resolve(target.page());
} else {
console.log("Failed to detect the ViewDocument page");
}
})
);
/* Tried to set the download behaviour to download automatically the pdf but it didn't work */
// await page._client.send("Page.setDownloadBehaviour", {
// behaviour: "allow",
// downloadPath: "./"
// });
await page.click(`#repDocuments__ctl${index}_lnkViewDoc`);
let pdfResults = "";
let pdfPage = await docPagePromise;
/* If I get the target from the page returned from the promise I get the correct ur, however the page url is blank */
// let target = await pdfPage.target();
// let url = await target.url();
// let response = await pdfPage.goto(url);
// console.log(response);
pdfPage.on("console.log", msg => console.log(msg));
/* This is never called */
await pdfPage.on("response", async response => {
console.log("PDF PAGE Response");
let responseBuffer = await response.buffer();
let responseHeaders = response.headers();
console.log("PDF PAGE Response Header: " + responseHeaders);
console.log("PDF PAGE Response Buffer: " + responseBuffer);
return {
responseHeaders,
responseBuffer
};
});
console.log(pdfResults);
let pdfTitle = await pdfPage.title();
console.log("PDFPage URL: " + pdfPage.url());
console.log("PDFPage Title: " + pdfTitle);
let pdfTarget = await pdfPage.target();
console.log("PDFTarget URL: " + (await pdfTarget.url()));
console.log("PDFTarget Type: " + pdfTarget.type());
pdfPage = await pdfTarget.page();
console.log("PDFPage URL: " + pdfPage.url());
await pdfPage.waitFor(3000);
let pdf = await pdfPage.pdf({ path: title + ".pdf" });
console.log(pdf);
return pdf;
}
async function getAdditionalDocumentation(page) {
console.log("getAdditionalDocumentation START");
await page.waitForSelector("#repGroupSummary__ctl1_lnkGroupName");
await page.click("#repGroupSummary__ctl1_lnkGroupName");
await page.waitForSelector("#pnlDocumentList > table > tbody > tr");
await page.waitFor(2000);
const documents = await page.$$eval(
"#pnlDocumentList > table > tbody > tr",
docs =>
docs.map((doc, i) => ({
type: doc.querySelector(".tdl-subgroup > span").innerText,
datePublished: doc.querySelector(
".tdl-date > span[id*='DatePublished']"
).innerText,
dateReceived: doc.querySelector(".tdl-date > span[id*='DateReceived']")
.innerText,
docType: doc.querySelector(".tdl-doctype > span").innerText,
description: doc.querySelector(".tdl-description > span").innerText
// 'docBuffer': window.getDocument(i + 1, doc.querySelector('.tdl-description > span').innerText)
}))
);
for (let i = 0; i < documents.length; i++) {
documents[i].docBuffer = await getDocument(i + 1, documents[i].description, page);
}
await page.click("#btnSummary");
console.log("getAdditionalDocumentation FINISH");
return documents;
}
async function getDocuments(page, browser) {
console.log("getDocuments");
let newPagePromise = new Promise((resolve, reject) =>
browser.once("targetcreated", async target => {
let targetUrl = await target.url();
if (targetUrl.indexOf("ShowCaseFile.aspx?") !== -1) {
console.log(targetUrl);
return resolve(target.page());
} else {
console.log("Failed to detect the ShowCaseFile page");
}
})
);
await page.click("#tab_externalDocuments > span");
await page.waitForSelector("#hp-doc-link");
await page.click("#hp-doc-link");
const newPage = await newPagePromise;
const additionalDocumentation = await getAdditionalDocumentation(newPage);
return {
additionalDocumentation
};
}
async function run() {
try {
browser = await puppeteer.launch();
const page = await browser.newPage();
page.on("console", msg => console.log("PAGE LOG:", ...msg.args));
const planningReference = "LA04/2017/1388/F";
await page.goto(
"http://epicpublic.planningni.gov.uk/publicaccess/search.do?action=simple&searchType=Application"
);
await page.waitForSelector("#simpleSearchString");
await page.type("#simpleSearchString", planningReference);
await page.click("#simpleSearchForm > div.row3 > input.button.primary");
await page.waitForSelector("#simpleDetailsTable");
console.log("getDocuments START");
const documents = await getDocuments(page, browser);
console.log("getDocuments FINISH");
console.log(documents);
console.log(documents.additionalDocumentation.length);
} finally {
browser.close();
}
}
run();
Use exposefunction to write the buffer data to disk with:
page.exposeFunction("writeABString", async (strbuf, targetFile) => {
var str2ab = function _str2ab(str) { // Convert a UTF-8 String to an ArrayBuffer
var buf = new ArrayBuffer(str.length); // 1 byte for each char
var bufView = new Uint8Array(buf);
for (var i=0, strLen=str.length; i < strLen; i++) {
bufView[i] = str.charCodeAt(i);
}
return buf;
}
console.log("In 'writeABString' function...");
return new Promise((resolve, reject) => {
// Convert the ArrayBuffer string back to an ArrayBufffer, which in turn is converted to a Buffer
let buf = Buffer.from(str2ab(strbuf));
// Try saving the file.
fs.writeFile(targetFile, buf, (err, text) => {
if(err) reject(err);
else resolve(targetFile);
});
});
});
With the download link that you have use it in tandem with fetch api to get it as blob and convert it with:
page.evaluate( async () => {
function arrayBufferToString(buffer){ // Convert an ArrayBuffer to an UTF-8 String
var bufView = new Uint8Array(buffer);
var length = bufView.length;
var result = '';
var addition = Math.pow(2,8)-1;
for(var i = 0;i<length;i+=addition){
if(i + addition > length){
addition = length - i;
}
result += String.fromCharCode.apply(null, bufView.subarray(i,i+addition));
}
return result;
}
let geturl = "https://whateverurl.example.com";
return fetch(geturl, {
credentials: 'same-origin', // usefull when we are logged into a website and want to send cookies
responseType: 'arraybuffer', // get response as an ArrayBuffer
})
.then(response => response.arrayBuffer())
.then( arrayBuffer => {
var bufstring = arrayBufferToString(arrayBuffer);
return window.writeABString(bufstring, '/tmp/downloadtest.pdf');
})
.catch(function (error) {
console.log('Request failed: ', error);
});
});
For more info look at this issue on the github puppeteer page. The above solution was also suggested in the issue.
Source