Speed optimization using Piscina - javascript

I want to analyze a large number of URLs in a short period of time.
To analyze the URLs, I am using the Wappalyzer module.
I chose Piscina for speed optimization, and it is working well.
However, I am not satisfied with the current speed and need it to be faster.
Here is my current code.
index.js
const Piscina = require("piscina");
const { program } = require("commander");
const { readCsv, cleanFile } = require("./src/csv");
const { resolve, join } = require("path");
const { chunks } = require("./src/utils");
/**
 * Reads the URL list, splits it into chunks and hands each chunk to a
 * Piscina worker task, timing the whole operation.
 *
 * @param {object} params
 * @param {number} [params.chunksNum=500] Chunk size passed to `chunks`.
 * @param {boolean} [params.enableLogging=true] Forwarded to every worker task.
 * @param {string} params.inputFile Input CSV path, joined onto `__dirname`.
 * @param {string} params.outputFile Output CSV path, joined onto `__dirname`.
 * @param {boolean} [params.enableConfidence=false] Accepted but not used here.
 * @param {number} [params.limitTab=5] Forwarded to every worker task.
 * @returns {Promise<void>}
 */
async function run({
  chunksNum = 500,
  enableLogging = true,
  inputFile,
  outputFile,
  enableConfidence = false,
  limitTab = 5,
}) {
  console.time("Analyze time");
  const inputFileLocation = join(__dirname, inputFile);
  const outputFileLocation = join(__dirname, outputFile);

  console.log("Reading input csv file...");
  const allUrls = await readCsv(inputFileLocation);
  console.log(`* Done!, total urls will be analyzed: ${allUrls.length}`);

  try {
    const urlChunks = chunks(allUrls, chunksNum);
    console.log(`* Total threads: ${urlChunks.length}`);

    const workerFile = resolve(__dirname, "./src/worker-pool.js");
    const pool = new Piscina({ filename: workerFile });

    console.log("Start analyzing...");
    await cleanFile(outputFileLocation);

    // Queue one pool task per chunk, then wait for all of them together.
    const pendingTasks = [];
    urlChunks.forEach((chunk, threadIndex) => {
      console.log(`-- Thread ${threadIndex} will analyze ${chunk.length} urls`);
      pendingTasks.push(
        pool.run({
          urls: chunk,
          enableLogging,
          limitTab,
          threadIndex,
          outputFile: outputFileLocation,
        })
      );
    });
    await Promise.all(pendingTasks);

    console.log("Writing results to csv...");
    console.log("Writing results done!");
  } catch (error) {
    console.error(error);
  }
  console.timeEnd("Analyze time");
}
// Command-line definition. The defaults quoted in the help texts are the
// fallbacks applied below after parsing, not the defaults of `run` itself.
program
  .option(
    "-c, --chunks <number>",
    "Number of urls per chunk, one chunk per worker task (default 1000)"
  )
  .option(
    "-lm, --limitTab <number>",
    "Limit chrome tabs per chrome instance (default 10)"
  )
  .option("-l, --log", "Enable logging (default false)")
  .option("-i, --input <file>", "Input file (default urls.csv)")
  .option("-o, --output <file>", "Output file (default result.csv)")
  .option(
    "-cfd, --confidence",
    "Write confidence results to csv (default false)"
  );
program.parse(process.argv);
const options = program.opts();
// `||` (not `??`) is intentional: a missing or non-numeric value parses to NaN
// and must fall back to the default.
const chunksNum = Number.parseInt(options.chunks, 10) || 1000;
const enableLogging = options.log || false;
const enableConfidence = options.confidence || false;
const limitTab = Number.parseInt(options.limitTab, 10) || 10;
const inputFile = options.input || "urls.csv";
const outputFile = options.output || "result.csv";
run({
  chunksNum,
  enableLogging,
  inputFile,
  outputFile,
  enableConfidence,
  limitTab,
});
package.json
{
"name": "wappalyzer",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"commander": "^9.2.0",
"csv": "^6.0.5",
"piscina": "^3.2.0",
"wappalyzer": "^6.10.23"
}
}
urls.csv
akwam.io
bciseguros.cl
ctctusercontent.com
haodeplus.com
aicaimei.com
papik.pro
prixdubaril.com
cambiumast.com
fyeds2.com
jimcontent.com
dbankcloud.com
sura.cl
tazkarti.com
coke.com
secureholiday.net
petycjeonline.com
aso1.net
ookla.com
samsungpromotions.claims
shec.edu.cn
tui.ch
filmstreaming.al
bttba.com
guizumeimei.com
xiti.com
wakefern.com
housingauthority.gov.hk
quantserve.com
jabama.com
lord-film-cc.me
xiepp.cc
yourcovidrecovery.nhs.uk
leyuman.com
jse.edu.cn
kkkkwu.com
pingit.im
service-voyages.com
fiaformula2.com
junaeb.cl
yt.be
windowssearch-exp.com
tickets.com
manga1002.com
yuedyue.com
nakadashi.pw
tapmad.com
mywelfare.ie
freemangaraw.com
shukriya90.com
regudri.xyz
pgmall.my
secsso.net
uab.cat
job-terminal.com
swimsuitsforall.com
sberbank.com
vic.gov.au
hondadealers.com
mptgate.com
hgq26.com
lenzmx.com
minna-no-ginko.com
vchecks.io
feed-xml.com
optoutsub.net
mengyuanshucheng.com
utiitsl.com
beneficiosestudiantiles.cl
adpool.bet
csdiran.ir
aftrk3.com
100fanwo.com
bancolombia.com
binaamanhaji.com
tokyo-brain.clinic
tcloudbaseapp.com
qimiaomh.com
12cm.com.tw
amplifyapp.com
86zw.co
infinitummovil.net
sejam.ir
maeva.com
ghiseul.ro
sf024.com
capcut.net
fusetracking.com
mof.gov.cn
northgate-nes.co.uk
btbtt20.com
srvpcn.com
downxia.com
googleweblight.com
ajuda.globo
likecs.com
... more than 8970
src/analyze.js
/**
 * Opens one URL with the given Wappalyzer instance and analyzes it.
 *
 * NOTE(review): in this paste the function body is interrupted here; its
 * closing `return { url, ...results }` appears further down the page.
 *
 * @param {object} wappalyzerInstance Object exposing `open(url)`.
 * @param {string} url Target URL; `http://` is prepended when it does not start with "http".
 * @param {boolean} [enableLogging=false] When true, logs the start and end of the analysis.
 */
async function analyzeUrl(wappalyzerInstance, url, enableLogging = false) {
// check if the url string has a protocol, if not, add http://
if (!url.startsWith('http')) {
url = `http://${url}`
}
if (enableLogging) {
console.log('------ Start analyzing: ', url)
}
// open() yields a site object; analyze() resolves with the detection results
const site = await wappalyzerInstance.open(url)
const results = await site.analyze()
// detectSpotifyOrStripeInTechnologies is not defined in the visible code;
// its result is read below through `.exist` and `.label`
const detectResult = await detectSpotifyOrStripeInTechnologies(results.technologies)
if (enableLogging) {
console.log(`------ Finish analyzing: ${url}; Result: ${detectResult.exist ? `Found ${detectResult.label}!` : 'Not found.'}`)
}
src/csv.js
const fs = require('fs')
const { parse, stringify } = require('csv')
/**
 * Reads a CSV file and collects the first column of every row.
 *
 * @param {string} csvFile Path of the CSV file to read.
 * @returns {Promise<string[]>} First-column values in file order.
 * @throws Rejects when the file cannot be read or the CSV cannot be parsed.
 */
async function readCsv(csvFile) {
  const urls = []
  const parser = parse({ columns: false })
  const source = fs.createReadStream(csvFile)
  await new Promise((resolve, reject) => {
    // pipe() does not forward errors, so both streams need their own handler;
    // without them a missing file would leave this promise pending forever.
    source.on('error', reject)
    parser.on('error', reject)
    parser.on('data', (row) => {
      urls.push(row[0])
    })
    parser.on('end', resolve)
    source.pipe(parser)
  })
  return urls
}
//RETURN CSV
/**
 * Maps analysis results to the row objects written to the CSV.
 *
 * Rows that carry `technologies` become `{ url, detect, confidence }` with the
 * URL scheme removed; all other rows become `{ url, detect }`, where `detect`
 * is the row's `label` or '-' when there is none.
 *
 * @param {object[]} data Result rows.
 * @param {boolean} enableConfidence Currently unused; kept for the callers' signature.
 * @returns {object[]} Rows ready for CSV serialization.
 */
function generateOutputData(data, enableConfidence) {
  return data.map((row) => {
    if (row.technologies) {
      return {
        // Strip the scheme whether it is http:// or https://; a fixed
        // slice(7) would leave a stray "/" on https URLs.
        url: row.url.replace(/^https?:\/\//, ''),
        detect: row.technologies,
        confidence: row.confidence,
      }
    }
    return { url: row.url, detect: row.label || '-' }
  })
}
/**
 * Truncates the output file to zero length, creating it if necessary.
 *
 * @param {string} outputFile Path of the file to empty.
 * @returns {Promise<void>}
 */
async function cleanFile(outputFile) {
  // Use the promise API so the await is meaningful and the event loop is not blocked.
  await fs.promises.writeFile(outputFile, '', 'utf-8')
}
/**
 * Serializes the results with a header row and overwrites the output file.
 *
 * Failures are logged rather than thrown (as before), but the returned promise
 * now settles only after the write has finished or failed.
 *
 * @param {string} outputFile Destination CSV path.
 * @param {object[]} data Result rows, passed through `generateOutputData`.
 * @param {boolean} enableConfidence Forwarded to `generateOutputData`.
 * @returns {Promise<void>}
 */
async function writeCsv(outputFile, data, enableConfidence) {
  const outputData = generateOutputData(data, enableConfidence)
  await new Promise((resolve) => {
    stringify(outputData, { header: true }, (err, output) => {
      if (err) {
        console.error(err)
        // Nothing to write: `output` is not usable after a stringify failure.
        resolve()
        return
      }
      fs.writeFile(outputFile, output, (writeErr) => {
        if (writeErr) {
          console.error(writeErr)
        }
        resolve()
      })
    })
  })
}
/**
 * Serializes the results without a header row and appends them to the output file.
 *
 * Failures are logged rather than thrown (as before), but the returned promise
 * now settles only after the append has finished or failed, so `await appendCsv(...)`
 * really waits for the data to be handed to the file system.
 *
 * @param {string} outputFile Destination CSV path.
 * @param {object[]} data Result rows, passed through `generateOutputData`.
 * @param {boolean} enableConfidence Forwarded to `generateOutputData`.
 * @returns {Promise<void>}
 */
async function appendCsv(outputFile, data, enableConfidence) {
  const outputData = generateOutputData(data, enableConfidence)
  await new Promise((resolve) => {
    stringify(outputData, { header: false }, (err, output) => {
      if (err) {
        console.error(err)
        // Nothing to append: `output` is not usable after a stringify failure.
        resolve()
        return
      }
      fs.appendFile(outputFile, output, (appendErr) => {
        if (appendErr) {
          console.error(appendErr)
        }
        resolve()
      })
    })
  })
}
module.exports = {
readCsv, writeCsv, appendCsv, cleanFile
}
return {
url, ...results
}
}
module.exports = { analyzeUrl }
src/utils.js
/**
 * Splits an array into consecutive chunks of at most `chunkSize` elements.
 *
 * The input array is left untouched. A `chunkSize` of exactly 0 returns the
 * input array itself (unchunked), as before.
 *
 * @param {Array} arr Items to split.
 * @param {number} chunkSize Maximum chunk length; fractional values are truncated.
 * @returns {Array[]|Array} The chunks, or `arr` itself when `chunkSize` is 0.
 * @throws {RangeError} When `chunkSize` does not truncate to a positive number
 *   (such values previously caused an endless loop).
 */
function chunks(arr, chunkSize) {
  if (chunkSize === 0) return arr;
  const size = Math.trunc(chunkSize);
  if (!(size > 0)) {
    throw new RangeError(`chunkSize must be a positive number, got ${chunkSize}`);
  }
  const results = [];
  // slice() copies each window instead of consuming the caller's array.
  for (let start = 0; start < arr.length; start += size) {
    results.push(arr.slice(start, start + size));
  }
  return results;
}
module.exports = { chunks };
src/worker-pool.js
const Wappalyzer = require("wappalyzer");
const { analyzeUrl } = require("./analyze");
const { chunks } = require("./utils");
const { appendCsv } = require("./csv");
module.exports = async ({
urls,
enableLogging,
limitTab = 5,
threadIndex,
outputFile,
}) => {
const options = {
debug: false,
delay: 5000,
maxDepth: 3,
maxUrls: 3,
maxWait: 10000,
recursive: true,
userAgent: "Wappalyzer",
};
const urlChunks = chunks(urls, limitTab);
const results = [];
let finishUrl = 0;
for (let chunk of urlChunks) {
console.log("---- Create new chrome instance");
const wappalyzer = new Wappalyzer(options);
await wappalyzer.init();
console.log(`---- Open ${chunk.length} tabs in thread ${threadIndex}`);
const result = await Promise.all(
chunk.map((url) => analyzeUrl(wappalyzer, url, enableLogging))
);
await appendCsv(outputFile, result.flat(), false);
finishUrl += chunk.length;
const totalUrls = urlChunks.flat().length;
console.log(
`---- Finish ${finishUrl}/${totalUrls} urls in thread ${threadIndex}`
);
results.push(result);
await wappalyzer.destroy();
console.log("---- Destroy chrome instance");
}
console.log(`-- Finish thread ${threadIndex}`);
return results.flat();
};

Related

405 Error using slash command handler Discord.js

I got a 405: Method Not Allowed error when I was running the slash command builder.
There's the code:
const { glob } = require("glob");
const { promisify } = require("util");
const { Client } = require("discord.js");
const mongoose = require("mongoose");
const globPromise = promisify(glob);
/**
* #param {Client} client
*/
module.exports = async (client) => {
// Commands
const commandFiles = await globPromise(`${process.cwd()}/commands/**/*.js`);
commandFiles.map((value) => {
const file = require(value);
const splitted = value.split("/");
const directory = splitted[splitted.length - 2];
if (file.name) {
const properties = { directory, ...file };
client.commands.set(file.name, properties);
}
});
// Events
const eventFiles = await globPromise(`${process.cwd()}/events/*.js`);
eventFiles.map((value) => require(value));
// Slash Commands
const slashCommands = await globPromise(
`${process.cwd()}/SlashCommands/*/*.js`
);
const arrayOfSlashCommands = [];
slashCommands.map((value) => {
const file = require(value);
if (!file?.name) return;
client.slashCommands.set(file.name, file);
if (["MESSAGE", "USER"].includes(file.type)) delete file.description;
if (file.userPermissions) file.defaultPermission = false;
arrayOfSlashCommands.push(file);
});
client.on("ready", async () => {
// Register for a single guild
const guild = client.guilds.cache.get("1014471738844790844");
await guild.commands.set(arrayOfSlashCommands).then((cmd) => {
const getRoles = (commandName) => {
const permissions = arrayOfSlashCommands.find((x) => x.name === commandName).userPermissions;
if (!permissions) return null;
return guild.roles.cache.filter(x => x.permissions.has(permissions) && !x.managed);
};
const fullPermissions = cmd.reduce((accumulator, x) => {
const roles = getRoles(x.name);
if (!roles) return accumulator;
const permissions = roles.reduce((a, v) => {
return [
...a,
{
id: v.id,
type: 'ROLE',
permission: true,
}
];
}, []);
return [
...accumulator,
{
id: x.id,
permission: permissions,
}
];
}, []);
guild.commands.permissions.set({ fullPermissions });
});
// Register for all the guilds the bot is in
// await client.application.commands.set(arrayOfSlashCommands);
});
// mongoose
const { mongooseConnectionString } = require('../config.json')
if (!mongooseConnectionString) return;
mongoose.connect(mongooseConnectionString).then(() => console.log('Connected to mongodb'));
};
There's the interactionCreate.js file:
const client = require("../index");
// Dispatches incoming interactions to the matching entry of client.slashCommands.
client.on("interactionCreate", async (interaction) => {
  // Slash Command Handling
  if (interaction.isCommand()) {
    await interaction.deferReply({ ephemeral: false }).catch(() => { });
    const cmd = client.slashCommands.get(interaction.commandName);
    if (!cmd)
      return interaction.followUp({ content: "An error has occurred " });

    // Flatten the option tree into a positional argument list:
    // sub-command name first, then every truthy option value.
    const args = [];
    for (const option of interaction.options.data) {
      if (option.type === "SUB_COMMAND") {
        if (option.name) args.push(option.name);
        option.options?.forEach((x) => {
          if (x.value) args.push(x.value);
        });
      } else if (option.value) args.push(option.value);
    }

    // NOTE(review): interaction.guild is dereferenced unconditionally here;
    // confirm this handler is never reached outside a guild.
    interaction.member = interaction.guild.members.cache.get(interaction.user.id);
    if (!interaction.member.permissions.has(cmd.userPermissions || [])) return interaction.followUp({ content: `Error: Missing Permissions.` });
    cmd.run(client, interaction, args);
  }
  // Context Menu Handling
  if (interaction.isContextMenu()) {
    await interaction.deferReply({ ephemeral: false });
    const command = client.slashCommands.get(interaction.commandName);
    if (command) command.run(client, interaction);
  }
});
And there's the error I got:
C:\Users\pines\OneDrive\桌面\ROBO_Head-v2\node_modules\discord.js\src\rest\RequestHandler.js:350
throw new DiscordAPIError(data, res.status, request);
^
DiscordAPIError: 405: Method Not Allowed
at RequestHandler.execute (C:\Users\pines\OneDrive\桌面\ROBO_Head-v2\node_modules\discord.js\src\rest\RequestHandler.js:350:13)
at processTicksAndRejections (node:internal/process/task_queues:96:5)
at async RequestHandler.push (C:\Users\pines\OneDrive\桌面\ROBO_Head-v2\node_modules\discord.js\src\rest\RequestHandler.js:51:14)
at async ApplicationCommandPermissionsManager.set (C:\Users\pines\OneDrive\桌面\ROBO_Head-v2\node_modules\discord.js\src\managers\ApplicationCommandPermissionsManager.js:186:18) {
method: 'put',
path: '/applications/991997852538634311/guilds/1014471738844790844/commands/permissions',
code: 0,
httpStatus: 405,
requestData: { json: [], files: [] }
}
Discord.js version is 13.11.0, bot has administrator permission on the server.
Please help me solve this.

How to initialize App Data in node js and access it without being undefined in jest test?

I am initializing a Node.js app in index.js with data from a database that is crucial for the app to work.
index.ts
import {getInitialData} from 'initData.ts';
export let APP_DATA: AppData;
/**
 * Loads the initial data into APP_DATA and, outside of the test environment,
 * initializes mongoose and starts the server. Any error is logged.
 */
export const initializeAppData = async () => {
  try {
    const initialData = await getInitialData();
    APP_DATA = initialData as AppData;

    // Under test, only the data is loaded; nothing is started.
    const isTestRun = process.env.NODE_ENV === 'test';
    if (isTestRun) {
      return;
    }
    initializeMongoose();
    startServer();
  } catch (error) {
    console.log(error);
  }
};
initData.ts
// Database to read the initial data from: 'testDb' when NODE_ENV is 'test',
// otherwise 'initialData'.
let dbName: string =
  process.env.NODE_ENV === 'test' ? 'testDb' : 'initialData';
// Connection string built from MONGODB_URI plus fixed pool/write-concern options.
const uri = `${process.env.MONGODB_URI}/?maxPoolSize=20&w=majority`;
/**
 * Reads the complete `config` and `aao` collections (without `_id`) from the
 * database selected by `dbName`.
 *
 * @returns `{ config, aao }` on success; `undefined` when an error occurred
 *   (the error is logged).
 */
export async function getInitialData() {
  const client = new MongoClient(uri);
  try {
    await client.connect();
    const database = client.db(dbName);
    const configCursor = database
      .collection('config')
      .find({}, { projection: { _id: 0 } });
    const config = await configCursor.toArray();
    const aaoCursor = database
      .collection('aao')
      .find({}, { projection: { _id: 0 } });
    const aao = await aaoCursor.toArray();
    return { config, aao };
  } catch (err) {
    // Log the error itself; the previous catch body only declared an arrow
    // function and never called it, so failures vanished silently.
    console.log(err);
  } finally {
    await client.close();
  }
}
I'm using this array in another file and import it there.
missionCreateHandler
import { APP_DATA } from '../index';
/**
 * Appends the resources defined for an alarm keyword to a mission.
 *
 * Looks up the APP_DATA.aao entry whose first key equals `alarmKeyword`; for
 * every `type: count` pair found in that entry's values, pushes `count`
 * resources of that type with status 'unarranged' onto `newMission.resources`.
 *
 * @param alarmKeyword Keyword to look up.
 * @param newMission Mission whose `resources` array is extended in place.
 * @throws Error when APP_DATA is not set or has no entry for the keyword.
 */
export const addMissionResources = (
  alarmKeyword: AlarmKeyword,
  newMission: MissionDocument
) => {
  const alarmKeywordObject = APP_DATA?.aao.find(
    (el) => Object.keys(el)[0] === alarmKeyword
  );
  // Fail with a descriptive message instead of the opaque
  // "Cannot convert undefined or null to object" from Object.values.
  if (!alarmKeywordObject) {
    throw new Error(
      `No resources defined for alarm keyword "${alarmKeyword}" (is APP_DATA initialized?)`
    );
  }
  const resourceCommand = Object.values(alarmKeywordObject);
  resourceCommand.forEach((el) => {
    Object.entries(el).forEach(([key, value]) => {
      for (let ii = 1; ii <= value; ii++) {
        newMission.resources?.push({
          initialType: key,
          status: 'unarranged',
        });
      }
    });
  });
};
I'm setting up a mongodb-memory-server in globalSetup.ts for Jest and copy the relevant data to the database from json-files.
globalSetup.ts
/**
 * Jest global setup: starts an in-memory MongoDB server, exposes its URI and
 * a JWT secret through process.env, and seeds the `aao` collection of 'testDb'.
 */
export = async function globalSetup() {
  const instance = await MongoMemoryServer.create({
    instance: { dbName: 'testDb' },
  });
  const uri = instance.getUri();
  // Stored on the global object; presumably for a matching teardown - confirm.
  (global as any).__MONGOINSTANCE = instance;
  // Keep only the part before the last '/' so callers can append their own path/options.
  process.env.MONGODB_URI = uri.slice(0, uri.lastIndexOf('/'));
  process.env.JWT_SECRET = 'testSECRET';
  const client = new MongoClient(
    `${process.env.MONGODB_URI}/?maxPoolSize=20&w=majority`
  );
  try {
    await client.connect();
    const database = client.db('testDb');
    // Await the creation so it cannot race with the insert or the close below.
    await database.createCollection('aao');
    // @ts-ignore
    await database.collection('aao').insertMany(aao['default']);
  } catch (error) {
    console.log(error);
  } finally {
    await client.close();
  }
};
missionCreateHandler.test.ts
// Creates a mission for keyword 'R1' and checks the first resource that
// addMissionResources appends to it.
test('it adds the correct mission resources to the array', async () => {
  const address = { street: 'test', houseNr: 23 };
  const newMission = await Mission.create({ address, alarmKeyword: 'R1' });
  const expected = { initialType: 'rtw', status: 'unarranged' };

  addMissionResources('R1', newMission);

  expect(newMission.resources[0].initialType).toEqual(expected.initialType);
  expect(newMission.resources[0].status).toEqual(expected.status);
});
When running the test, I get a 'TypeError: Cannot convert undefined or null to object at Function.values ()'. So it seems that the APP_DATA object is not set. I checked that the mongodb-memory-server is set up correctly and fed with the needed data.
When I hardcode the content of APP_DATA in index.ts, the test runs without problems.
So my questions are: What is the best practice for setting up initial data in a Node.js app, and where should it be stored (a global object, or a simple variable that is imported in the files where it is needed)? How can I get the test to run successfully, or is my code simply untestable?
Thank you!

Run code after executing promise in Javascript

I am trying to save to json the values returned from indeed api. I use indeed-scraper code from github https://github.com/rynobax/indeed-scraper
My code:
... required files ...
const parsedResults = []
indeed.query(queryOptions).then(response => {
response.forEach((res,i) => {
setTimeout(function(){
let url = res.url
let resultCount = 0
console.log(`\n Scraping of ${url} initiated...\n`)
const getWebsiteContent = async (url) => {
try {
const response = await axios.get(url)
const $ = cheerio.load(response.data)
...get scraped data...
parsedResults.push(metadata)
} catch (error) {
exportResults(parsedResults)
console.error(error)
}
}
getWebsiteContent(url)
}
, i*3000);
});
});
const outputFile = 'data.json'
const fs = require('fs');
/**
 * Writes the results as pretty-printed JSON to `outputFile`.
 *
 * @param {Array} parsedResults Results to serialize.
 */
const exportResults = (parsedResults) => {
  fs.writeFile(outputFile, JSON.stringify(parsedResults, null, 4), (err) => {
    if (err) {
      console.log(err)
      // Do not report success when the write failed.
      return
    }
    console.log(`\n ${parsedResults.length} Results exported successfully to ${outputFile}\n`)
  })
}
parsedResults is not accessible in the last portion of the script, so I cannot save it as a JSON file.
Any help appreciated!

Problem with parsing data at filestream and cvparser

I'm having a problem reading this piece of code in a project that we are developing with colleagues. I'd be grateful if you could help me and tell me what this piece of code means. I understand that it takes the technologies.csv file and parses it, and that if there is an error we must reject with that error. At the end of the code there are some cards, but I don't know what the idea of these cards is. Finally, we have to export the module with the data, and I think the data comes from the const with the programming languages. If someone can explain it in detail, it would be more than perfect. Thanks in advance! :)
const fs = require('fs');
const parse = require('csv-parse');
const path = require('path');
// Static configuration for the CSV import.
const constants = {
testData: {
// Property names assigned to the values of each CSV record, by position.
csvColumns: [
'ruby',
'python',
'vuejs',
'angular',
'react',
'nodejs',
],
},
};
/**
 * Builds the location of the technologies CSV for the given name prefix.
 *
 * @param {string} name Prefix placed directly before "technologies.csv".
 * @returns {{ filename: string }} Object holding the path inside `__dirname`.
 */
const configFileLocation = (name) => {
  // The object literal must start on the same line as `return`; with a line
  // break in between, automatic semicolon insertion makes the function
  // return undefined.
  return {
    filename: path.join(__dirname, `${name}technologies.csv`),
  };
};
/**
 * Parses the technologies CSV for `name` into an array of "cards".
 *
 * Every CSV record becomes one object whose keys are taken, by position, from
 * `constants.testData.csvColumns`. The first record is dropped before
 * resolving (presumably the header row - confirm against the CSV).
 *
 * @param {string} name Prefix passed to `configFileLocation`.
 * @returns {Promise<object[]>} The parsed cards.
 * @throws Rejects when the file is missing, cannot be read, or cannot be parsed.
 */
const getData = (name) =>
  new Promise((resolve, reject) => {
    const fileLocation = configFileLocation(name).filename;
    if (!fs.existsSync(fileLocation)) {
      reject(new Error(`File ${fileLocation} is missing.`));
      // Stop here; otherwise a read stream would still be opened on the missing file.
      return;
    }

    const csvParser = parse({
      delimiter: ',',
    });

    // Error takes a string message; the underlying error is kept as `cause`.
    const wrapError = (message, error) => {
      const wrapped = new Error(message);
      wrapped.cause = error;
      return wrapped;
    };

    const csvFileStream = fs.createReadStream(fileLocation);
    csvFileStream.on('ready', () => {
      csvFileStream.pipe(csvParser);
    });
    csvFileStream.on('error', (error) => {
      reject(wrapError('csvParseCards#csvFileStream on error', error));
    });
    csvParser.on('error', (error) => {
      reject(wrapError('csvParseCards#csvParser on error', error));
    });

    const cards = [];
    const columns = constants.testData.csvColumns;
    csvParser.on('readable', () => {
      let record;
      // read() returns null once the parser's buffer is drained.
      while ((record = csvParser.read())) {
        if (record.length !== columns.length) {
          console.warn('Column mismatch', record);
        }
        const card = {};
        record.forEach((value, index) => {
          card[columns[index]] = value;
        });
        cards.push(card);
      }
    });
    csvParser.on('end', () => {
      cards.shift();
      resolve(cards);
    });
  });
module.exports = getData;

Node js Multiple Query Promises

How do we handle multiple query promises in Node.js and MongoDB using an async function?
For example, I want to add the new query shown below and then handle the gallery result the same way the `file` result is handled.
const gallery = await gfs.findManyAsync({ metadata: vehicle.VIN })
Here is my current code
/**
 * Loads one page of vehicles, resolves each vehicle's profile image and
 * gallery files from GridFS, and sends the page as a JSON response.
 *
 * Reads `query`, `pagination`, `sortOrd`, `sortType`, `size`, `pageNo` and
 * `res` from the enclosing scope.
 */
async function myFunction() {
  gfs.findOneAsync = promisify(gfs.findOne);
  // NOTE(review): assumes gfs.files is a MongoDB collection whose find()
  // returns a cursor exposing toArray(); such a method cannot be promisified
  // as a node-style callback function - confirm against the driver version in use.
  gfs.findManyAsync = (filter) => gfs.files.find(filter).toArray();

  const totalCount = await Vehicle.model.count({})
  const vehicles = await Vehicle.model
    .find(query, {}, pagination)
    .sort(sortOrd + sortType)
  const totalPages = Math.ceil(totalCount / size)

  const fileUrl = (filename) => `http://localhost:3002/api/vehicle/file/${filename}`

  const promises = vehicles.map(async (vehicle) => {
    const ImageList = vehicle.ImageList.split(',')
    const profile_name = ImageList[0].split('/').pop();
    // The two lookups are independent, so run them concurrently.
    const [file, gallery] = await Promise.all([
      gfs.findOneAsync({
        $and: [{
          $or: [{
            metadata: vehicle.VIN
          }]
        },
        {
          $or: [{
            filename: profile_name
          }]
        }
        ]
      }),
      gfs.findManyAsync({ metadata: vehicle.VIN }),
    ])
    if (file) {
      vehicle.FileData = fileUrl(file.filename)
    }
    // Expose the gallery the same way as the profile image: as a list of URLs.
    vehicle.GalleryData = gallery.map((galleryFile) => fileUrl(galleryFile.filename))
  })
  await Promise.all(promises)

  return res.status(200).send({
    message: "success",
    pageNo: pageNo,
    totalRecords: vehicles.length,
    data: vehicles,
    totalPages: totalPages
  })
}

Categories