Reconnect To Previous Puppeteer Session In Hosted Environment - javascript

I'm currently working on a project that uses Puppeteer to control headless chrome. Right now I'm hosting my app using Firebase functions. This is working well if I have to do all my browsing in one session, but if I have to come back at a later time I am having trouble reestablishing a connection and resuming where I left off.
Here is my current script.
// Firebase HTTPS function exposing two Express routes:
//   GET /openpage   - launches headless Chrome, runs a search on reddit.com and
//                     stores the browser's WebSocket endpoint in the Realtime Database.
//   GET /screenshot - reads the stored endpoint and tries to reattach to that browser.
const express = require('express');
const functions = require('firebase-functions');
const puppeteer = require('puppeteer');
const admin = require('firebase-admin');
admin.initializeApp(functions.config().firebase);
const db = admin.database();
const app = express();
app.get('/openpage', async (req, res) => {
try {
const browser = await puppeteer.launch({ args: ['--no-sandbox'] });
const page = await browser.newPage();
const url = 'https://www.reddit.com/';
await page.goto(url, { waitUntil: 'networkidle2' });
// Runs inside the page: fills the search box and submits the search form.
await page.evaluate(() => {
document.querySelector('input[name="q"]').value = 'dog';
document.querySelector('[action="/search"]').submit();
});
// Here I want to save the current state of the browser
const endpoint = browser.wsEndpoint();
console.log('Endpoint', endpoint);
// NOTE(review): update() is not awaited here, so nothing waits for the write to finish.
db.ref('test/').update({ endpoint });
// NOTE(review): close() shuts the browser down, so the endpoint saved above
// presumably stops accepting connections once this line has run.
await browser.close();
} catch (err) {
console.log(err);
}
// Sent whether or not the try block succeeded.
res.send('Finished');
});
app.get('/screenshot', async (req, res) => {
try {
// Read back the endpoint that /openpage stored under test/endpoint.
const endpoint = await db.ref('test/endpoint').once('value').then(snap => snap.val());
const browser = await puppeteer.connect({ browserWSEndpoint: endpoint }); // This is where it fails
const pages = await browser.pages();
// Uses the first page reported by the browser.
const page = pages[0];
await page.screenshot({ path: 'picture' });
await browser.close();
} catch (err) {
console.error(err);
}
// Sent whether or not the try block succeeded.
res.send('Finished');
})
// Exposes the Express app as the `test` HTTPS function with 2GB memory and a 60 second timeout.
exports.test = functions.runWith({ memory: '2GB', timeoutSeconds: 60 }).https.onRequest(app);
With this setup, I can make a request to the /openpage endpoint and everything works fine, and I store the browser endpoint in the Firebase Realtime Database. But when I try to resume the session by calling /screenshot, I get an error saying the connection was refused in the puppeteer.connect() call. Is there a different way I should be going about this? Is this a Firebase limitation, or am I missing something about how connections are reestablished in Puppeteer?
Error message: Error: connect ECONNREFUSED 127.0.0.1:62222
On a side note you have to add "engines": { "node": "8" }, to your package.json to be able to run Puppeteer with Firebase Functions.

This is because you are closing your browser with this line, await browser.close();. This will disconnect and close the browser and you won't be able to connect again.
You should use browser.disconnect() instead.

Related

Read data from url in javascript

I'm trying to read data stored on Google Drive, or on other cloud storage, using JavaScript.
`
// Fetches the spreadsheet URL when the action button is clicked and logs the outcome.
const btn = document.getElementById('action-btn');
btn.addEventListener('click', () => {
  // let baseurl = document.getElementById('inputurl').value;
  // let guidDcoument = baseurl.split('#guid=')[1];
  // const query = encodeURIComponent('Select *')
  // fetch('', {
  // mode: "no-cors",
  // method: "get",
  // }).then(Response => console.log(Response))
  fetch('https://docs.google.com/spreadsheets/d/1kwfv6L2lBrPw8OjHGyhO7YHOXFNwHYyPI_noM5TUMLw/edit?pli=1#gid=1097732605', {
    mode: "no-cors"
  })
    // Log the response that actually arrived. `Response.error` is a static
    // method of the global Response class, not a property of this response.
    // Note that a "no-cors" response is opaque: its body cannot be read.
    .then((response) => { console.log(response); })
    // Pass the function itself; `console.error()` would run immediately and
    // hand `undefined` to catch().
    .catch(console.error);
});
`
What I need is that given a url, I can read the data from the file, and then show them on my website.
I think that when I try to access any other cloud storage I will have the same problem. That it will be necessary to access the account to be able to read the data that would be a csv document.
First of all, what you're trying to achieve is called 'web scraping' and you can't use fetch for that, instead you should use puppeteer (in the server side), which is the most popular library for web scraping.
Run npm init -y to initialize an npm project, install Puppeteer with npm i puppeteer, and also install Express and cors with npm i express cors in order to create a server that scrapes the data and sends it back to your client. So, instead of trying to scrape the information directly from the client, you do it from the server with Puppeteer.
Try the following .js server code:
const express = require('express')
const cors = require('cors')
const puppeteer = require('puppeteer')

const app = express()
app.use(express.json())
app.use(cors())

// Launch one shared browser, register the scrape route, and only then start
// listening, so no request can arrive before the route and browser exist.
;(async () => {
  const browser = await puppeteer.launch({headless: false})

  app.use('/', async (req, res) => {
    let page
    try {
      // A fresh page per request keeps concurrent requests from navigating
      // over each other on a single shared page.
      page = await browser.newPage()
      await page.goto('url-to-the-website')
      const data = {}
      return res.json(data)
    } catch (err) {
      // Express 4 does not catch rejections from async handlers, so answer here.
      console.error(err)
      return res.status(500).json({error: err.message})
    } finally {
      if (page) {
        // Best-effort cleanup: a failed close must not mask the response already sent.
        await page.close().catch(console.error)
      }
    }
  })

  app.listen(3000)
})().catch((err) => {
  // The browser could not be launched; there is nothing useful to serve.
  console.error(err)
  process.exit(1)
})
And learn more about puppeteer: https://pptr.dev/.
And your client code should connect to this server to send scrape requests to it like this:
// Requests the scraped data from the local server and logs the parsed JSON.
(async () => {
  try {
    const res = await fetch('http://localhost:3000')
    if (!res.ok) {
      throw new Error(`Request failed with status ${res.status}`)
    }
    // res.json() already returns the parsed value; running JSON.parse on it
    // again would throw a SyntaxError.
    const json = await res.json()
    console.log(json)
  } catch (err) {
    console.error(err)
  }
})() // the trailing () actually invokes the function
Note: we wrap the code in anonymous functions with the reserved word async in order to use async and await syntax. More information: https://javascript.info/async-await.

how access returned values from client side and display them

I was experimenting with Puppeteer and built a simple scraper that gets information from YouTube, and it works fine. What I was trying to add was displaying that scraped information on my web page in <p> tags. Is there any way to do this? Where I am stuck is that my name and avatarURL variables are local to my scrape function, so how can I get those values and insert them into my <p> tags? As a rough sketch of what I tried, I did: document.getElementById('nameId')=name; after importing my JS script (on the HTML side), but this won't work because name is a local variable and can't be accessed outside its scope. Any help is appreciated. Thanks in advance.
const puppeteer = require('puppeteer');
async function scrapeChannel(url) {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url);
const [el] = await page.$x('/html/body/ytd-app/div/ytd-page-manager/ytd-browse/div[3]/ytd-c4-tabbed-header-renderer/tp-yt-app-header-layout/div/tp-yt-app-header/div[2]/div[2]/div/div[1]/div/div[1]/ytd-channel-name/div/div/yt-formatted-string');
const text = await el.getProperty('textContent');
const name = await text.jsonValue();
const [el2] = await page.$x('//*[#id="img"]');
const src = await el2.getProperty('src');
const avatarURL = await src.jsonValue();
browser.close();
console.log({
name,
avatarlURL
})
return {
name,
avatarURL
}
}
scrapeChannel('https://www.youtube.com/channel/UCQOtt1RZbIbBqXhRa9-RB5g')
module.exports = {
scrapeChannel,
}
<!-- Page body with two placeholder paragraphs for the scraped name and avatar URL.
     NOTE(review): onload calls scrapeChannel() with no arguments, and no script
     defining that function is visible in this fragment. -->
<body onload="scrapeChannel()">
<p id="nameId">'put the scraped name here'</p>
<p id="avatarUrlId">'put the scraped avatar url here'</p>
<!--
document.getElementById('nameId')=name;
document.getElementById('avatartUrlId')=avatarURL;
-->
</body>
I have used cheerio in one of my projects and this is what I did in the backend and in the front end.
Node & Express JS Backend
In order to access your backend from the frontend, you need to set Routes in your backend. All your frontend requests are redirected to these routes. For more information read this Express Routes.
E.g Route.js code
const router = require("express").Router();
const { callscrapeChannel } = require("../scrape-code/scrape");

/**
 * GET /scrapedata
 * Runs the scraper and responds with its result as JSON.
 * Responds with HTTP 500 if the scrape fails. Express 4 does not catch
 * rejections from async handlers, so without the try/catch the request
 * would never be answered.
 */
router.route("/scrapedata").get(async (req, res) => {
  try {
    const result = await callscrapeChannel();
    return res.json(result);
  } catch (err) {
    console.error(err);
    return res.status(500).json({ error: "Failed to scrape channel" });
  }
});

module.exports = router;
scrapeChannel.js file
const puppeteer = require('puppeteer');
async function scrapeChannel(url) {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url);
const [el] = await page.$x('/html/body/ytd-app/div/ytd-page-manager/ytd-browse/div[3]/ytd-c4-tabbed-header-renderer/tp-yt-app-header-layout/div/tp-yt-app-header/div[2]/div[2]/div/div[1]/div/div[1]/ytd-channel-name/div/div/yt-formatted-string');
const text = await el.getProperty('textContent');
const name = await text.jsonValue();
const [el2] = await page.$x('//*[#id="img"]');
const src = await el2.getProperty('src');
const avatarURL = await src.jsonValue();
browser.close();
console.log({
name,
avatarURL
})
return {
name,
avatarURL
}
}
async function callscrapeChannel() {
const data = await scrapeChannel('https://www.youtube.com/channel/UCQOtt1RZbIbBqXhRa9-RB5g')
return data
}
module.exports = {
callscrapeChannel,
}
in your server.js file
// Entry point: builds the Express app, mounts the scrape router under /api
// and starts listening.
const express = require("express");
const cors = require("cors");
const scrapeRouter = require("./Routes/routes");

// Load the .env file before PORT is read below.
require("dotenv").config({ debug: process.env.DEBUG });

const listenPort = process.env.PORT || 5000;

// Logs the local address once the server is accepting connections.
function announce() {
  console.log(`server is running on port: http://localhost:${listenPort}`);
}

const app = express();
app
  .use(cors())
  .use(express.json())
  .use("/api", scrapeRouter);

app.listen(listenPort, announce);
dependencies you need (package.json)
"dependencies": {
"axios": "^0.21.1",
"body-parser": "^1.19.0",
"cors": "^2.8.5",
"cross-env": "^7.0.3",
"dotenv": "^8.2.0",
"esm": "^3.2.25",
"express": "^4.17.1",
"nodemon": "^2.0.7",
"puppeteer": "^8.0.0"
}
Frontend
In the front-end, I have used fetch. You need to send a get request to your backend. All you have to do is
<html>
<head>
<script>
// Fetches the scraped channel data from the backend and writes it into the page.
async function callScrapeData() {
  try {
    const res = await fetch(`http://localhost:5000/api/scrapedata`);
    if (!res.ok) {
      throw new Error(`Request failed with status ${res.status}`);
    }
    // res.json() already returns a promise; no extra Promise wrapper or delay is needed.
    const response = await res.json();
    console.log(response);
    // textContent, not innerHTML: scraped strings must never be parsed as markup.
    document.getElementById("nameId").textContent = response.name;
    document.getElementById("avatartUrlId").textContent = response.avatarURL;
  } catch (err) {
    console.error(err);
  }
}
</script>
</head>
<body>
<div>
<h1>scrape</h1>
<p id="nameId"></p>
<p id="avatartUrlId"></p>
<button onclick="callScrapeData()">click</button>
</div>
</body>
</html>
Remember, my backend server is running on port 5000
output
The above code is just an example and I have modified it to fit your question. I hope this helps you to some extent. It's straightforward. Let me know if you have any questions.
Note: I assume you have a server.js file in your backend and it is configured properly.

Using Puppeteer to screenshot an EJS template with NodeJS and Express

I have a NodeJs/Express app in which I would like to open a new browser window and render a local EJS view into it. I am trying to do it using Puppeteer.
// POST /new_window: opens a visible (non-headless) browser window and sets an
// EJS-rendered string as the page content.
// NOTE(review): `router` and `ejs` are used below but not defined in this fragment.
const puppeteer = require('puppeteer');
router.post('/new_window', async (req, res) => {
try {
const browser = await puppeteer.launch({headless: false});
const page = await browser.newPage();
// The first argument given to ejs.render() here is a path string, not the file's contents.
const pageContent = ejs.render('../views/mypage.ejs', {})
await page.setContent(pageContent)
//await page.screenshot({path: 'example.png'});
// await browser.close();
// NOTE(review): nothing is sent on `res` when the try block succeeds.
} catch (err) {
res.status(500)
console.log(err)
res.send(err.message)
}
})
In the browser instead of page layout I get:
../views/mypage.ejs
Instead of:
await page.goto(...); // This code is acting like your browser's address bar
Try
const pageContent = ejs.render('../views/mypage.ejs', {data to populate your .ejs page}) // This is pseudocode. Check the EJS docs on how to do this
await page.setContent(pageContent)
The code above will let you create your page on your server.
With page.setContent(..)you can load any string of HTML.
OP made an edit that correctly uses page.setContent rather than page.goto, however, there's still an issue. ejs.render() is used to run EJS on a template in string form, so it's treating the file path as the template itself. If you want to read the file into a string first (possibly when your app starts, if the template never changes), ejs.render() will work.
The other approach is to use the EJS method that accepts a path, ejs.renderFile(). Here's a minimal example showing the usage:
const ejs = require("ejs"); // 3.1.8
const express = require("express"); // ^4.18.1
const puppeteer = require("puppeteer"); // ^19.1.0

// Renders greet.ejs into the browser's first page and resolves with a PNG screenshot.
async function renderGreeting(chrome) {
  const [tab] = await chrome.pages();
  const markup = await ejs.renderFile("greet.ejs", {name: "world"});
  await tab.setContent(markup);
  return tab.screenshot();
}

const app = express();

// GET /greet: responds with a PNG of the rendered template, or 500 on failure.
app.get("/greet", async (req, res) => {
  let chrome;
  try {
    chrome = await puppeteer.launch();
    const png = await renderGreeting(chrome);
    res.contentType("image/png");
    res.send(png);
  } catch (err) {
    console.error(err);
    res.sendStatus(500);
  } finally {
    // The browser is closed on both paths; launch may have failed, hence the ?.
    await chrome?.close();
  }
});

app.listen(3000);
Where greet.ejs contains:
<!DOCTYPE html>
<html>
<body>
<%# Use the escaping tag so a name containing markup is shown as text rather than injected as HTML. %>
<h1>Hello, <%= name %></h1>
</body>
</html>
To make a PDF with Express, EJS and Puppeteer, see Express/Puppeteer: generate PDF from EJS template and send as response.
To reuse the browser across routes, see this answer for a possible approach.

Error while testing delete method in NodeJs with Mocha

I'm developing a billboard app to learn new technologies, and I'm currently having problems trying to test the DELETE method of the API using Mocha.
I've been trying different approaches, but I haven't found a solution yet. I'm using NodeJs and Hapi for the back-end server.
const mongoose = require('mongoose')
const chai = require('chai')
const hapiServer = require('../../../index')
const { expect } = chai
const should = chai.should()
chai.use(require('chai-http'))

// Id used in the DELETE request path below.
const movieIdToDelete = '5cd8a0eefc06344accd62a77'

// Start the server once before the tests and record the base URL they target.
before(async () => {
  await hapiServer()
  global.url = 'http://localhost:3000'
})

// Close the database connection once the tests have finished.
after(async () => {
  await mongoose.connection.close()
  //await mongoose.connection.db.dropCollection()
})

const id = new mongoose.Types.ObjectId('5cd8a0eefc06344accd62a76')

describe('Movies API', () => {
  it('Should DELETE a single movie', async () => {
    const client = chai.request(global.url)
    const res = await client.delete(`/api/movies/${movieIdToDelete}`)
    res.should.have.status(202)
  })
})
Here's the index.js
// Scoped hapi packages are named "@hapi/<name>".
const Hapi = require('@hapi/hapi')
const Inert = require('@hapi/inert')
const Vision = require('@hapi/vision')
const HapiSwagger = require('hapi-swagger')
const mongoose = require('mongoose')
const mongo = require('./config/mongo')
const Pack = require('./package')

/**
 * Builds the Billboard API server: connects to MongoDB, registers the Swagger
 * plugins and the movie routes, then starts listening on localhost:3000.
 * Connection and start-up errors are logged rather than thrown.
 *
 * @returns {Promise<object>} The Hapi server instance, so callers such as
 *   tests can use it.
 */
const start = async () => {
  // The constructor is synchronous, so there is nothing to await here.
  const server = new Hapi.Server({
    host: 'localhost',
    port: 3000,
  })

  // MongoDB Connection
  try {
    await mongoose
      .connect(mongo.configuration.getUri(process.env.NODE_ENV), {useNewUrlParser: true})
    console.log('MongoDB Connected...')
  } catch(err) {
    console.log(err)
  }

  const swaggerOptions = {
    info: {
      title: 'Billboard API Documentation',
      version: Pack.version,
    },
  }

  await server.register([
    Inert,
    Vision,
    {
      plugin: HapiSwagger,
      options: swaggerOptions
    }
  ])

  // Register the routes before starting, so the server never accepts a
  // request while its routing table is still empty.
  const moviesRoutes = require('./plugins/movies/routes')
  server.route(moviesRoutes)

  try {
    await server.start()
    console.log('Server running at:', server.info.uri)
  } catch(err) {
    console.log(err)
  }

  return server
}

start()
module.exports = start
So, the rest of the API tests are similar and they work fine. But when I try to test this method I get this response:
error: Error: cannot DELETE /api/movies/5cd8a0eefc06344accd62a77 (404)
at Response.toError (C:\Users\lucas\billboard-backend\node_modules\superagent\lib\node\response.js:94:15)
at ResponseBase._setStatusProperties (C:\Users\lucas\billboard-backend\node_modules\superagent\lib\response-base.js:123:16)
at new Response (C:\Users\lucas\billboard-backend\node_modules\superagent\lib\node\response.js:41:8)
at Test.Request._emitResponse (C:\Users\lucas\billboard-backend\node_modules\superagent\lib\node\index.js:752:20)
at C:\Users\lucas\billboard-backend\node_modules\superagent\lib\node\index.js:916:38
at IncomingMessage.<anonymous> (C:\Users\lucas\billboard-backend\node_modules\superagent\lib\node\parsers\json.js:19:7)
at IncomingMessage.emit (events.js:201:15)
at endReadableNT (_stream_readable.js:1130:12)
at processTicksAndRejections (internal/process/task_queues.js:84:9) {
status: 404,
text: '{"statusCode":404,"error":"Not Found","message":"Not Found"}',
method: 'DELETE',
path: '/api/movies/5cd8a0eefc06344accd62a77'
Any ideas why this is happening?
You need to add a route like this in your routes file assuming you're using controllers
const { Router } = require('express')
const { deleteMovie } = require('../controllers/movie.controller')

// Movie routes: DELETE /api/movies/:movieId is handled by the controller's deleteMovie.
const router = Router()
router.delete('/api/movies/:movieId', deleteMovie)

module.exports = router

Node.js Async/Await module export

I'm kinda new to module creation and was wondering about module.exports and waiting for async functions (like a mongo connect function for example) to complete and exporting the result. The variables get properly defined using async/await in the module, but when trying to log them by requiring the module, they show up as undefined. If someone could point me in the right direction, that'd be great. Here's the code I've got so far:
// module.js
// Connects to MongoDB inside an async IIFE and assigns { client, db } to
// module.exports once the connection attempt has settled.
const MongoClient = require('mongodb').MongoClient
const mongo_host = '127.0.0.1'
const mongo_db = 'test'
const mongo_port = '27017';
// The IIFE receives `module` as its parameter and is invoked immediately;
// nothing awaits the promise it returns.
(async module => {
var client, db
var url = `mongodb://${mongo_host}:${mongo_port}/${mongo_db}`
try {
// Use connect method to connect to the Server
client = await MongoClient.connect(url, {
useNewUrlParser: true
})
db = client.db(mongo_db)
} catch (err) {
console.error(err)
} finally {
// Exporting mongo just to test things
console.log(client) // Just to test things I tried logging the client here and it works. It doesn't show 'undefined' like test.js does when trying to console.log it from there
// This assignment runs only after the awaited connect() above has settled.
module.exports = {
client,
db
}
}
})(module)
And here's the js that requires the module
// test.js
// Destructures `client` from whatever ./module exports at the moment require() returns.
const {client} = require('./module')
console.log(client) // Logs 'undefined'
I'm fairly familiar with js and am still actively learning and looking into things like async/await and like features, but yeah... I can't really figure that one out
You have to export synchronously, so it's impossible to export client and db directly. However, you could export a Promise that resolves to client and db:
// Opens the MongoDB connection and resolves with the client and its database handle.
async function connect() {
  const connectOptions = { useNewUrlParser: true };
  const client = await MongoClient.connect(url, connectOptions);
  return { client, db: client.db(mongo_db) };
}

// The export is the pending promise itself, so importers must await it.
module.exports = connect();
So then you can import it as:
const {client, db} = await require("yourmodule");
(that has to be in an async function itself)
PS: console.error(err) is not a proper error handler; if you can't handle the error, just crash.
The solution provided above by @Jonas Wilms works, but it requires calling require in an async function each time we want to reuse the connection. An alternative is to use a callback function to return the MongoDB client object.
mongo.js:
const MongoClient = require('mongodb').MongoClient;
// Credentials and host are separated by "@" in a MongoDB connection string.
const uri = "mongodb+srv://<user>:<pwd>@<host and port>?retryWrites=true";

/**
 * Connects to MongoDB and hands the connected client to `cb`.
 *
 * @param {Function} [cb] - Called with the connected client; may be omitted.
 * @returns {Promise<object>} Resolves with the same client, so callers can
 *   await it instead of (or as well as) passing a callback. Rejects if the
 *   connection fails.
 */
const mongoClient = async function(cb) {
  const client = await MongoClient.connect(uri, {
    useNewUrlParser: true
  });
  if (typeof cb === 'function') {
    cb(client);
  }
  return client;
};
module.exports = {mongoClient}
Then we can use the mongoClient method in a different file (an Express route or any other js file).
app.js:
// Holds the shared MongoDB client; stays undefined until the connect callback fires.
let client;
const mongo = require('path to mongo.js');
// Promise.resolve() lets a failed connection be logged even if mongoClient
// does not itself return a promise.
Promise.resolve(
  mongo.mongoClient((connection) => {
    client = connection;
  })
).catch((err) => {
  console.error(err);
});
//declare express app and listen....
//simple post request to store a student..
app.post('/', async (req, res, next) => {
  // A request can arrive before the database connection has been established.
  if (!client) {
    return res.status(503).json({ error: 'Database connection not ready' });
  }
  const newStudent = {
    name: req.body.name,
    description: req.body.description,
    studentId: req.body.studentId,
    image: req.body.image
  };
  try {
    // Insert the student document itself; wrapping it as { newStudent } would
    // nest every field under a single "newStudent" key.
    await client.db('university').collection('students').insertOne(newStudent);
  } catch (err) {
    console.log(err);
    return res.status(500).json({ error: err.message });
  }
  return res.status(201).json({ message: 'Student added' });
});

Categories