Using wappalyzer and puppeteer in node.js - javascript

I am trying to build a scraper to monitor web projects automatically.
So far so good, the script is running, but now I want to add a feature that automatically analyses what libraries I used in the projects. The most powerful script for this job is wappalyser. They have a node package (https://www.npmjs.com/package/wappalyzer) and it's written that you can use it combined with pupperteer.
I managed to run pupperteer and to log the source code of the sites in the console, but I don't get the right way to pass the source code to the wappalyzer analyse function.
Do you guys have a hint for me?
I tryed this code but a am getting a TypeError: url.split is not a function
function getLibarys(url) {
(async () => {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.goto(url);
// get source code with puppeteer
const html = await page.content();
const wappalyzer = new Wappalyzer();
(async function () {
try {
await wappalyzer.init()
// Optionally set additional request headers
const headers = {}
const site = await wappalyzer.open(page, headers)
// Optionally capture and output errors
site.on('error', console.error)
const results = await site.analyze()
console.log(JSON.stringify(results, null, 2))
} catch (error) {
console.error(error)
}
await wappalyzer.destroy()
})()
await browser.close()
})()
}

Fixed it by using the sample code from wappalyzer.
function getLibarys(url) {
const Wappalyzer = require('wappalyzer');
const options = {
debug: false,
delay: 500,
headers: {},
maxDepth: 3,
maxUrls: 10,
maxWait: 5000,
recursive: true,
probe: true,
proxy: false,
userAgent: 'Wappalyzer',
htmlMaxCols: 2000,
htmlMaxRows: 2000,
noScripts: false,
noRedirect: false,
};
const wappalyzer = new Wappalyzer(options)
;(async function() {
try {
await wappalyzer.init()
// Optionally set additional request headers
const headers = {}
const site = await wappalyzer.open(url, headers)
// Optionally capture and output errors
site.on('error', console.error)
const results = await site.analyze()
console.log(JSON.stringify(results, null, 2))
} catch (error) {
console.error(error)
}
await wappalyzer.destroy()
})()
}

I do not know if you still need an answer to this. But this is what a wappalyzer collaborator told me:
Normally you'd run Wappalyzer like this:
const Wappalyzer = require('wappalyzer')
const wappalyzer = new Wappalyzer()
await wappalyzer.init() // Launches a Puppeteer instance
const site = await wappalyzer.open(url)
If you want to use your own browser instance, you can skip wappalyzer.init() and assign the instance to wappalyzer.browser:
const Wappalyzer = require('wappalyzer')
const wappalyzer = new Wappalyzer()
wappalyzer.browser = await puppeteer.launch() // Use your own Puppeteer launch logic
const site = await wappalyzer.open(url)
You can find the discussion here.
Hope this helps.

Related

Puppeteer Application Error: A client side exception has occurred

I am using Puppeteer with NEXT.JS, trying to take a screenshot. And it works fine on localhost but returns an image with this error in production:
Application error a client-side exception has occurred (see the browser console for more information!!
Taking a screenshot
export const createImages = async (urlArray) => {
try {
const browser = await puppeteer.launch({
headless: true,
args: [
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-dev-shm-usage",
],
slowMo: 250, // slow down by 250ms
})
const page = await browser.newPage()
for (let i = 0; i < urlArray.length; i++) {
if (urlArray[i].address === "") continue
await page.goto(urlArray[i].address, {
waitUntil: "load",
timeout: 30000,
})
const screenshotBase64 = await page.screenshot({
encoding: "base64",
})
const screenshot = Buffer.from(
await screenshotBase64.replace(/^data:image\/\w+;base64,/, ""),
"base64"
)
urlArray[i]["imgBase64"] = screenshot
}
await browser.close()
} catch (err) {
console.log(new Date(), "was not able to create images: ", err)
return err
}
return 1
}
When I open the url manually in production, the page loads fine! And I have tried encoding the image to Binary instead but still the same issue.. Any idea !?
At first I was listening only to the errors. But after I listened to all console messages using this command:
page.on('console', msg => console.log('PAGE LOG:', msg.text()))
I was able to see this error:
'THREE.WebGLRenderer: Error creating WebGL context.'
And it point out that the GPU used on the server is blacklisted because it's old.

Trying to use puppeteer inside async function inside async function which already has puppeteer

I'm trying to build telegram bot to parse page on use request. My parsing code works fine inside one async function, but completeky falls on its face if I try to put it inside another async function.
Here is the relevant code I have:
const puppeteer = require('puppeteer');
const fs = require('fs/promises');
const { Console } = require('console');
async function start(){
async function searcher(input) {
const browser = await puppeteer.launch();
const page = await browser.newPage();
const url = ; //here is a long url combining logic, that works fine
await page.goto(url);
const currentUrl = requestPage.url();
console.log(currentUrl); //returns nothing.
//here is some long parsing logic
await browser.close();
return combinedResult;
}
//here is a bot code
const { Telegraf } = require('telegraf');
const bot = new Telegraf('my bot ID');
bot.command('start', ctx => {
console.log(ctx.from);
bot.telegram.sendMessage(ctx.chat.id, 'Greatings message', {});
bot.telegram.sendMessage(ctx.chat.id, 'request prompt ', {});
})
bot.on('text', (ctx) => {
console.log(ctx.message.text);
const queryOutput = searcher(ctx.message.text);
bot.telegram.sendMessage(ctx.chat.id, queryOutput, {});
});
bot.launch()
}
start();
Here is an error message:
/Users/a.rassanov/Desktop/Fetch/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:218
return Promise.reject(new Error(`Protocol error (${method}): Session closed. Most likely the ${this._targetType} has been closed.`));
^
Error: Protocol error (Page.navigate): Session closed. Most likely the page has been closed.
I'm very new to this, and your help is really appriciated.

Preserving indentation with Tesseract.js library

I am using tessearct.js library in my angular code.
I want to preserve the white spaces, the indentation as it is. How to do it?
Currently I am using this piece of code to do it.
async doOCR {
const worker = createWorker({
logger: m => console.log(m),
});
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const value = await worker.recognize(this.selectedFile);
}
I am looking a method to do it on client side only, that's why not using its python library.
You can give it a try after version (3.04), they have added the preserve_interword_spaces`. You can try this and check if this works:
async doOCR {
const worker = createWorker({
logger: m => console.log(m),
});
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
// there is no proper documentation, but they have added this flag
// to run it as a command
await worker.setParameters({
preserve_interword_spaces: 1,
});
const value = await worker.recognize(this.selectedFile);
}

UnhandledPromiseRejectionWarning: Error: Evaluation failed theme is not defined

Before I start the question, I am new in JavaScript, and I have very basic knowledge of async js, but i need to solve this so i can have my first project functional.
I am trying to build a scraping app using Node and Puppeteer. Basically, the user enters a URL ("link" in the code below), puppeteer goes trough the website code, tries to find the specific piece and returns the data. That part I got working so far.
The problem is when a user enters a URL of a site that doesn't have that piece of code. In that case, I get UnhandledPromiseRejectionWarning: Error: Evaluation failed theme is not defined
What do I do so when there is an error like that, I can catch it and redirect the page instead of Getting Internal Server error.
app.post("/results", function(req, res) {
var link = req.body.link;
(async link => {
const browser = await puppeteer.launch({ args: ['--no-sandbox'] })
const page = await browser.newPage()
await page.goto(link, { waitUntil: 'networkidle2'})
const data = await page.evaluate('theme.name');
await browser.close()
return data
})(link)
.then(data => {
res.render("index", {data: data, siteUrl: link});
})
})
You can extend the async part to the whole route handler and do whatever you want on catch:
app.post('/results', async (req, res) => {
try {
const link = req.body.link
const browser = await puppeteer.launch({ args: ['--no-sandbox'] })
const page = await browser.newPage()
await page.goto(link, { waitUntil: 'networkidle2'})
const data = await page.evaluate('theme.name')
await browser.close()
res.render("index", {data: data, siteUrl: link})
} catch(e) {
// redirect or whatever
res.redirect('/')
}
});

I can't go from a page to another using page.goto() - Puppeteer

I'm trying to make a InstagramBot that logs in and then go to some profile, my code worked yesterday for awhile and than it just stopped working .
I've tried to clone my repository from github, but it does'n work either, sometimes it works again, but if I try to create another function, the code just ignore the line of the code that changes the page.
I've also tried to create a new page and then in this new page use the goto function and it worked, but the account doesn keep logged in
The version of puppeteer that I'm using: 1.16.0
The version of node.js that I'm using: v10.15.3
const puppeteer = require('puppeteer');
const BASE_URL = "https://www.instagram.com/accounts/login/?hl=en&source=auth_switcher";
const instagram = {
browser: null,
page: null,
profile_url: null,
initialize: async (profile) => {
instagram.browser = await puppeteer.launch({
headless: false
})
instagram.profile_url = await "https://www.instagram.com/" + profile;
instagram.page = await instagram.browser.newPage();
await instagram.page.goto(BASE_URL, {waitUntil: 'networkidle2'});
},
login: async(username, password) =>{
await instagram.page.waitFor(1000);
await instagram.page.type('input[name="username"]', username);
await instagram.page.type('input[name="password"', password);
await instagram.page.click('button[type="submit"]');
await instagram.page.waitFor(1500);
await console.log(instagram.profile_url);
await instagram.page.goto(instagram.profile_url, {timeout: 0, waitUntil: 'domcontentloaded'}); // the code just ignore this line
await instagram.page.waitFor(1000);
},
getPhotosLinks: async() => {
console.log("Do something here");
}
}
module.exports = instagram;
It doesn't give any error message, just doesn't work
Replace
await instagram.page.click('button[type="submit"]');
await instagram.page.waitFor(1500);
with
await Promise.all([
instagram.page.click('button[type="submit"]');,
instagram.page.waitForNavigation()
]);
and see if it works

Categories