How to save web page content as mp4 using puppeteer in nodejs - javascript

Is there a way to download mp4 or video files using puppeteer?
Here's an example of what I want to download https://www.w3schools.com/html/mov_bbb.mp4
Here's my script so far
async downloadVideo(link = '') {
try {
this.browser = await puppeteer.launch({
headless: false,
args: ['--no-sandbox']
})
this.page = await this.browser.newPage();
if (!link) {
throw new Error(`No link provided, process skipped`);
}
await this.page.goto(link, { waitUntil: 'load', timeout: 60000 })
}
catch (error) {
throw error;
}
}

You can just use the https module of node to download the file using the URL. Documentation here: https://nodejs.org/api/https.html

Related

Puppeteer Application Error: A client side exception has occurred

I am using Puppeteer with NEXT.JS, trying to take a screenshot. And it works fine on localhost but returns an image with this error in production:
Application error: a client-side exception has occurred (see the browser console for more information).
Taking a screenshot
/**
 * Takes a screenshot of each address in urlArray and stores it on the
 * entry as a Buffer under `imgBase64`. Entries with an empty address
 * are skipped.
 * @param {Array<{address: string}>} urlArray - Mutated in place.
 * @returns {Promise<1|Error>} 1 on success, the caught error on failure.
 */
export const createImages = async (urlArray) => {
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: true,
      args: [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
      ],
      slowMo: 250, // slow down by 250ms
    });
    const page = await browser.newPage();
    for (const entry of urlArray) {
      if (entry.address === "") continue;
      await page.goto(entry.address, {
        waitUntil: "load",
        timeout: 30000,
      });
      // `encoding: "base64"` already yields a bare base64 string with no
      // `data:image/...;base64,` prefix, so it can be decoded directly —
      // the old prefix-stripping replace() was a no-op.
      const screenshotBase64 = await page.screenshot({
        encoding: "base64",
      });
      entry["imgBase64"] = Buffer.from(screenshotBase64, "base64");
    }
  } catch (err) {
    console.log(new Date(), "was not able to create images: ", err);
    return err;
  } finally {
    // Always release the browser, even when a navigation throws mid-loop.
    if (browser) await browser.close();
  }
  return 1;
};
When I open the URL manually in production, the page loads fine! And I have tried encoding the image to binary instead, but I still get the same issue. Any ideas?
At first I was listening only to the errors. But after I listened to all console messages using this command:
page.on('console', msg => console.log('PAGE LOG:', msg.text()))
I was able to see this error:
'THREE.WebGLRenderer: Error creating WebGL context.'
And it pointed out that the GPU used on the server is blacklisted because it's old.

Getting blank screenshot from Puppeteer

I have used puppeteer to capture the screenshot of my page in React JS. But it is taking a blank screenshot instead of the actual charts present on the page. Here is my code.
const puppeteer = require('puppeteer');

const url = process.argv[2];
if (!url) {
  // Throw a real Error (not a bare string) so the stack trace is preserved.
  throw new Error("Please provide URL as a first argument");
}

/**
 * Opens the URL, waits for the network to go idle so late-rendering
 * content (e.g. charts) is present, and returns a base64 JPEG screenshot.
 * No `new Promise` wrapper needed — an async function already returns one.
 * @returns {Promise<string>} Base64-encoded JPEG of the full page.
 */
async function run() {
  const browser = await puppeteer.launch({
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
    headless: true,
    ignoreHTTPSErrors: true,
  });
  try {
    const page = await browser.newPage();
    // "networkidle0" waits until there are no in-flight requests, giving
    // asynchronous chart rendering a chance to finish before the shot.
    await page.goto(url, { timeout: 30000, waitUntil: "networkidle0" });
    return await page.screenshot({
      quality: 100,
      fullPage: true,
      encoding: "base64",
      type: "jpeg",
    });
  } finally {
    // Close the browser whether the screenshot succeeded or failed.
    await browser.close();
  }
}

run().then(console.log).catch(console.error);
The reason for the same could be document is getting loaded first before the chart loads. And puppeteer takes the screenshot as soon as the document loads. Can anyone please help me with this? We have to be sure that there is no delay in chart loading after the document is loaded so that screenshot can be captured properly. Please help. Thanks in advance.

UnhandledPromiseRejectionWarning: Error: Evaluation failed theme is not defined

Before I start the question: I am new to JavaScript and have only a very basic knowledge of async JS, but I need to solve this so my first project can be functional.
I am trying to build a scraping app using Node and Puppeteer. Basically, the user enters a URL ("link" in the code below), puppeteer goes trough the website code, tries to find the specific piece and returns the data. That part I got working so far.
The problem is when a user enters a URL of a site that doesn't have that piece of code. In that case, I get UnhandledPromiseRejectionWarning: Error: Evaluation failed theme is not defined
What do I do so when there is an error like that, I can catch it and redirect the page instead of Getting Internal Server error.
// POST /results — scrape `theme.name` from the submitted link.
app.post("/results", function(req, res) {
  const link = req.body.link;
  (async () => {
    const browser = await puppeteer.launch({ args: ['--no-sandbox'] });
    try {
      const page = await browser.newPage();
      await page.goto(link, { waitUntil: 'networkidle2' });
      // Evaluates `theme.name` in the page context; rejects if the site
      // does not define a global `theme` object.
      return await page.evaluate('theme.name');
    } finally {
      // Release the browser even when evaluation fails.
      await browser.close();
    }
  })()
    .then(data => {
      res.render("index", { data: data, siteUrl: link });
    })
    .catch(() => {
      // A site without the expected code used to surface as an
      // UnhandledPromiseRejectionWarning / 500 — redirect instead.
      res.redirect('/');
    });
});
You can extend the async part to the whole route handler and do whatever you want on catch:
// POST /results — same handler with the async part covering the whole
// route so any failure can be handled in one place.
app.post('/results', async (req, res) => {
  const link = req.body.link;
  let browser;
  try {
    browser = await puppeteer.launch({ args: ['--no-sandbox'] });
    const page = await browser.newPage();
    await page.goto(link, { waitUntil: 'networkidle2' });
    const data = await page.evaluate('theme.name');
    res.render("index", { data: data, siteUrl: link });
  } catch (e) {
    // redirect or whatever
    res.redirect('/');
  } finally {
    // Previously the browser leaked whenever goto/evaluate threw;
    // always close it here.
    if (browser) await browser.close();
  }
});

Puppeteer is not working the same on local vs prod

// Load the HTML template for the signed document.
let templateHtml = fs.readFileSync(
  path.join(process.cwd(), '../signedDocs/template.html'),
  'utf8'
);
// making a compilable out of the HTML file
let template = handlebars.compile(templateHtml);
console.log('creafte pdf 1');
// passing the data to the HTML
let html = template(dataPDF);
// constructing the path where the generated PF file will be stored
let pdfPath = path.join(process.cwd(), '../signedDocs/' + userID + '.pdf');
console.log('creafte pdf 2');
// PDF configuration
let options = {
  width: '1230px',
  headerTemplate: '<p></p>',
  footerTemplate: '<p></p>',
  displayHeaderFooter: false,
  printBackground: true,
  pageRanges: '1-6',
  format: 'A4',
  preferCSSPageSize: true,
  margin: {
    top: '10px',
    right: '20px',
    bottom: '60px',
    left: '20px'
  },
  path: pdfPath
};
console.log('creafte pdf 3.1');
// starting the browser with Puppeteer
const browser = await puppeteer.launch({
  args: ['--no-sandbox', '--disable-setuid-sandbox'],
  headless: true
});
console.log('creafte pdf 3.2');
// starting a new blank page
let page = await browser.newPage();
try {
  // The rendered HTML must be URI-encoded before being embedded in a
  // data: URL — a raw '#' truncates the URL and raw '%' sequences
  // corrupt it, which silently loads a partial document.
  await page.goto(`data:text/html;charset=UTF-8,${encodeURIComponent(html)}`, {
    waitUntil: 'networkidle0' //command used so the page w/ modules waited to be loaded
  });
} catch (err) {
  console.log(err);
}
console.log('creafte pdf 4');
try {
  await page.pdf(options); // to generate the PDF
} catch (err) {
  console.log('errrr on page.pdf');
  console.log(err);
}
console.log('done');
await followUpEmail(user);
console.log('email sent');
await browser.close(); // for closing the browser
The above code works perfectly fine on my localhost. ( Running node.js 10 )
However i have now deployed my API to an EC2 instance and it runs until:
const browser = await puppeteer.launch({
  args: ['--no-sandbox', '--disable-setuid-sandbox'],
  // NOTE(review): this quoted snippet says `headless: false`, while the
  // main code above launches with `headless: true`. A non-headless
  // browser cannot start on a display-less EC2 host — confirm which
  // value was actually deployed.
  headless: false
});
I get the 3.1 console.log but nothing afterwards.
I'm starting to get the feeling it's something to do with my production environment. However, after trying all kinds of different approaches today, I'm a bit lost.
Now i'm really hoping someone here has encountered this issue and has an answer or a direction!
So it turned out that NPM does install a version of Chrome; however, it's missing a lot of dependencies.
I checked which dependencies were missing by using:
ldd chrome | grep not
I installed a few manually; however, some are not available in the package manager's repositories.
I then created a YUM config to install Chrome, installed it, and that came with the missing dependencies.

Puppeteer - How can I get the current page (application/pdf) as a buffer or file?

Using Puppeteer (https://github.com/GoogleChrome/puppeteer), I have a page that's a application/pdf. With headless: false, the page is loaded though the Chromium PDF viewer, but I want to use headless. How can I download the original .pdf file or use as a blob with another library, such as (pdf-parse https://www.npmjs.com/package/pdf-parse)?
Since Puppeteer does not currently support navigation to a PDF document in headless mode via page.goto() due to the upstream issue, you can use page.setRequestInterception() to enable request interception, and then you can listen for the 'request' event and detect whether the resource is a PDF before using the request client to obtain the PDF buffer.
After obtaining the PDF buffer, you can use request.abort() to abort the original Puppeteer request, or if the request is not for a PDF, you can use request.continue() to continue the request normally.
Here's a full working example:
'use strict';

const puppeteer = require('puppeteer');
const request_client = require('request-promise-native');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.setRequestInterception(true);
  page.on('request', request => {
    if (request.url().endsWith('.pdf')) {
      // Fetch the PDF ourselves, since headless Chromium cannot
      // navigate to a PDF document directly.
      request_client({
        uri: request.url(),
        encoding: null, // null => resolve with a raw Buffer, not a string
        headers: {
          'Content-type': 'application/pdf', // fixed typo: was 'applcation/pdf'
        },
      }).then(response => {
        console.log(response); // PDF Buffer
        // Abort the original navigation; we already have the bytes.
        request.abort();
      });
    } else {
      request.continue();
    }
  });
  // goto rejects because the PDF navigation is aborted — ignore that.
  await page.goto('https://example.com/hello-world.pdf').catch(error => {});
  await browser.close();
})();
Grant Miller's solution didn't work for me because I was logged in to the website. But if the PDF is public, this solution works out well.
The solution for my case was to add the cookies
await page.setRequestInterception(true);
page.on('request', async request => {
  if (request.url().indexOf('exibirFat.do') > 0) { //This condition is true only in pdf page (in my case of course)
    // Use the public HTTPRequest accessors instead of the private
    // `_method`/`_url`/`_postData`/`_headers` internals, which can
    // break between Puppeteer releases.
    const options = {
      encoding: null, // resolve with a raw Buffer
      method: request.method(),
      uri: request.url(),
      body: request.postData(),
      headers: request.headers()
    };
    /* add the cookies */
    const cookies = await page.cookies();
    options.headers.Cookie = cookies.map(ck => ck.name + '=' + ck.value).join(';');
    /* resend the request */
    const response = await request_client(options);
    //console.log(response); // PDF Buffer
    const buffer = response; // was an implicit global — keep it local
    let filename = 'file.pdf';
    fs.writeFileSync(filename, buffer); //Save file
    // Settle the intercepted request; without abort()/continue() the
    // original navigation hangs until it times out.
    request.abort();
  } else {
    request.continue();
  }
});

Categories