I have multiple scrapers like below:
await scraper1.run()
await scraper2.run()
// etc
To improve performance and response time I used websockets: I pass the socket connection down to each scraper and emit an event for each single item of the result.
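Each scraper receives the shared browser and the socket, and emits every item as soon as it is scraped. Roughly like this (a sketch only; the URL, selector, and event name are placeholders, not my real code):

// scraper1.js (sketch)
module.exports.run = async function (browser, socket, query) {
  const page = await browser.newPage();
  await page.goto(`https://example.com/search?q=${encodeURIComponent(query)}`);
  // grab the text of every matching node
  const items = await page.$$eval('.result', (nodes) => nodes.map((n) => n.textContent));
  for (const item of items) {
    socket.emit('item', item); // emit each single item as it is found
  }
  await page.close();
};

And the server wiring: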
const express = require('express')
const app = express()
const http = require('http').Server(app)
const cors = require('cors')
const puppeteer = require('puppeteer')
const io = require('socket.io')(http)
const mongoose = require('mongoose')
const _ = require('lodash')
const scraper1 = require('./scraper1')
const scraper2 = require('./scraper2')
mongoose.connect("mongodb://localhost:27017/test")
;(async function () {
  try {
    const browser = await puppeteer.launch({
      headless: false
    })
    io.on('connection', async function (socket) {
      socket.on('search', async function (query) {
        // check whether a document exists for the user's IP address and return it,
        // otherwise run the scrapers
        await scraper1.run(browser, socket, query)
        await scraper2.run(browser, socket, query)
      })
    })
  } catch (e) {
    console.log(e)
  }
})()
http.listen(3000)
Context: When the user refreshes the page, multiple socket connections are made, the scrapers run multiple times, and the data is duplicated. I prevented the duplication using MongoDB, but the performance issue remains because the scrapers run until their results are ready before I check against the database.
Question: How can I lock or prevent the scrapers from running multiple times, and also wait for each scraper to be done, while still using the websocket?
I can propose the following solution (I didn't test it, but I think you can figure out what I am trying to achieve; the explanation is below the example):
'use strict';
const queryState = {
};
const getQueryKey = (query) => {
  // base64, but it could be a hash like sha256
  const key = Buffer.from(query).toString('base64');
  return key;
};
/**
 * Return query state
 * @param {String} query
 * @return {String} state [PENDING, DONE, null] - null if the query doesn't exist
 */
const getQueryState = (query) => {
  const key = getQueryKey(query);
  const state = queryState[key] || null;
  return state;
};
/**
 * Add a query and initialize it as pending
 * @param {String} query
 * @return {String} state
 */
const addQuery = (query) => {
  const key = getQueryKey(query);
  const state = 'PENDING';
  queryState[key] = state;
  return state;
};
/**
 * Hashmap associating pending queries with the socket connections to be
 * notified when the query is done.
 * This structure keeps an array of callbacks per query key.
 */
const observers = {
};

const addObserver = (query, callback) => {
  const key = getQueryKey(query);
  if (typeof observers[key] === 'undefined') {
    observers[key] = [callback];
  } else {
    observers[key] = [...observers[key], callback];
  }
};
const notifyObservers = (query) => {
  const key = getQueryKey(query);
  const callbacks = observers[key] || [];
  // TODO: get the query's scraper data from a cache / database / etc
  const data = getDataFromQuery(query);
  callbacks.forEach((callback) => {
    callback(data);
  });
};
/**
 * Update query status to done
 * Precondition: query must exist in queryState (previously added using addQuery)
 * @param {String} query
 * @return {String} state
 */
const endQuery = (query) => {
  const key = getQueryKey(query);
  const state = 'DONE';
  queryState[key] = state;
  return state;
};
io.on('connection', async function (socket) {
  socket.on('search', async function (query) {
    /**
     * If the query doesn't exist yet, scrape it
     */
    const state = getQueryState(query);
    if (state === null) {
      addQuery(query);
      await scraper1.run(browser, socket, query);
      await scraper2.run(browser, socket, query);
      endQuery(query);
      // store the scraper data in a cache / database / etc and
      // send scraperData to the user over the socket,
      // then notify the pending queries so their scraper data is sent too
      notifyObservers(query);
    } else if (state === 'PENDING') {
      // add a callback to return the data to this user when the query finishes
      addObserver(query, (scraperData) => {
        // send scraperData to the user over the socket
      });
    } else {
      // query is DONE: send the stored scraperData to the user
    }
  });
});
To keep things simple, the example is minimal and not the best one in terms of performance / architecture. This solution implements:
Scenario 1 (someone asks for query1 the first time)
A request (socket connection) comes to the backend asking for a query
This query doesn't exist yet, so mark it as PENDING and start the scrapers
Scenario 2 (someone else asks for query1)
A second connection appears asking for the same query as in scenario 1
The query is in state PENDING, so we add a callback to be called when the query finishes
Scenario 3 (query1 finishes)
The scrapers started in Scenario 1 end, so query1 is marked as DONE
Each request (observer) waiting for query1 is notified
This solution can be implemented in multiple ways, but my point is to expose the idea to you so you can modify it in the way you want.
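For instance, the same dedup-and-wait behaviour can be compressed by keeping a map of in-flight promises instead of explicit states plus observers (a sketch, untested like the above; it reuses getQueryKey and getDataFromQuery from the example):

const inFlight = new Map(); // query key -> Promise of scraper data

const runScrapersOnce = (browser, socket, query) => {
  const key = getQueryKey(query);
  if (!inFlight.has(key)) {
    inFlight.set(key, (async () => {
      // only the first caller's socket receives the per-item events
      await scraper1.run(browser, socket, query);
      await scraper2.run(browser, socket, query);
      return getDataFromQuery(query); // cache / database / etc
    })());
  }
  return inFlight.get(key); // later callers await the same promise
};

io.on('connection', (socket) => {
  socket.on('search', async (query) => {
    const data = await runScrapersOnce(browser, socket, query);
    socket.emit('result', data);
  });
});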
Hope this helps
Related
I need to create a service in NodeJS that periodically executes a GET request to an API that returns all the jobs/tasks. The service then needs to create a CronJob for each task returned, while continuing to check for new tasks and creating new CronJobs for any new ones.
I made something similar with a service that runs a GET and then does a forEach loop creating new CronJobs. But this doesn't take into account tasks that are created after the first initialization. How do I solve this? How do I make a service that is always looking for new tasks and dynamically creates jobs for them?
EDIT1: the axios.post just posts a log to a database, nothing special
const axios = require("axios");
const CronJob = require("cron").CronJob;

const startCron = async () => {
  const schedules = await axios
    .get("http://127.0.0.1:4000/")
    .then((res) => {
      return res.data;
    })
    .catch((err) => console.log(err));
  if (!schedules) return; // the GET failed, nothing to schedule
  schedules.forEach((schedule) => {
    return new CronJob(`${schedule.timing} * * * * *`, () => {
      let d = new Date();
      console.log(schedule.message + " in data: " + d);
      axios.post(`http://127.0.0.1:4000/${schedule.id}`);
    }).start();
  });
};
startCron();
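One way to sketch the dynamic part of the question above (untested; it assumes each task returned by the API has a stable id field) is to keep a map of the jobs already created and poll the endpoint itself on a schedule, so tasks added later are picked up on the next poll:

const axios = require("axios");
const { CronJob } = require("cron");

const jobs = new Map(); // schedule.id -> CronJob

const syncJobs = async () => {
  const { data: schedules } = await axios.get("http://127.0.0.1:4000/");
  schedules.forEach((schedule) => {
    if (jobs.has(schedule.id)) return; // job already created, skip it
    const job = new CronJob(`${schedule.timing} * * * * *`, () => {
      console.log(schedule.message + " in data: " + new Date());
      axios.post(`http://127.0.0.1:4000/${schedule.id}`);
    });
    job.start();
    jobs.set(schedule.id, job);
  });
};

// re-check the API for new tasks every minute
new CronJob("0 * * * * *", () => syncJobs().catch(console.error)).start();
syncJobs().catch(console.error);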
The goal is to call a function from my main script that connects to a database, reads a document from it, stores pieces of that document in a new object, and returns that object to my main script. The problem is I cannot get it all to work together: if I try one thing, I get the results but my program locks up; if I try something else, I get undefined results.
Long story short: how do I open a database and retrieve something from it for another script?
The program is a quiz site and I want to return the quiz name and the questions.
const myDb = require('./app.js');
var myData = myDb.fun((myData) => {
  console.log(myData.quizName);
});
Here is the script that tries to open the database and find the data
const { MongoClient } = require("mongodb");

// connection uri goes here, but my name is hard-coded into it
// at the moment so I removed it for privacy
const uri = "<connection uri>";
const client = new MongoClient(uri);

const fun = async (cback) => {
  try {
    await client.connect();
    const database = client.db('Quiz-Capstone');
    const quizzes = database.collection('Quiz');
    const query = {quizName: "CIS01"};
    const options = {
      sort: {},
      projection: {}
    };
    const quiz = await quizzes.findOne(query, options);
    var quizObject = {
      quizName: quiz.quizName,
      quizQuestions: quiz.quizQuestions
    }
    //console.log(testOb);
  } finally {
    await client.close();
    cback(quizObject);
  }
}

fun().catch(console.dir);

module.exports = {
  fun: fun
}
UPDATE: Still stuck. I have read several different threads here about asynchronous calls and callbacks, but I cannot get my function located in one file to return a value to the caller located in another file.
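For reference, the way this is usually untangled is to return a promise instead of taking a callback, and to close the client only after the result has been read. A minimal sketch of both files (assuming the same database and collection names as above; calling fun() without a callback also explains the crash, since cback is not a function in the finally block):

// db.js
const { MongoClient } = require("mongodb");

const uri = "<connection uri>"; // same placeholder as above
const client = new MongoClient(uri);

const fun = async () => {
  try {
    await client.connect();
    // the awaited result is computed before the finally block closes the client
    const quiz = await client
      .db("Quiz-Capstone")
      .collection("Quiz")
      .findOne({ quizName: "CIS01" });
    return { quizName: quiz.quizName, quizQuestions: quiz.quizQuestions };
  } finally {
    await client.close();
  }
};

module.exports = { fun };

// main.js
const myDb = require("./db");
myDb.fun()
  .then((myData) => console.log(myData.quizName))
  .catch(console.dir);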
I'm running a fastify server, and when I send a request I get a ton of data from MongoDB and start a for loop to process each item. This can take ~30 minutes. Each item is "processed" by sending it to ffmpeg, then redis pub/sub, and then over a socket to the client.
// Streams controller
exports.renderStreams = async function (req, reply) {
  const streams = await Stream.find({}).sort({ createdAt: -1 })//.limit(5)
  const renderedStreams = renderStreams(this, streams);
  return { success: true, streams: streams.length };
}

// renderStreams
const renderStreams = (fastify, streams = []) => {
  const { redis } = fastify;
  const channel = "streams";
  for (let i = 0; i < streams.length; i++) {
    setTimeout(async () => {
      const stream = streams[i];
      await renderStream(redis, channel, stream);
    }, i * 200)
  }
}
I am wondering how I can either "pause" this for loop or stop it completely (or both?) via another request, for example when I call /api/streams/stop.
How would this be possible?
You can use socket.io to communicate with your script while it is still running: create a function that stops the loop, and run it when you get the notification from socket.io. Documentation link: https://socket.io/docs/v4/
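For example, a minimal sketch on top of the renderStreams code above (the stopped flag and the stop controller are assumptions, and the staggered setTimeout calls are replaced by one sequential loop so there is a single place to check the flag):

let stopped = false;

const renderStreams = async (fastify, streams = []) => {
  const { redis } = fastify;
  const channel = "streams";
  for (let i = 0; i < streams.length; i++) {
    if (stopped) break; // another request flipped the flag
    await renderStream(redis, channel, streams[i]);
    await new Promise((resolve) => setTimeout(resolve, 200)); // keep the 200 ms spacing
  }
  stopped = false; // reset so the next run can proceed
};

// stop controller, e.g. mounted at /api/streams/stop
exports.stopStreams = async function (req, reply) {
  stopped = true;
  return { success: true };
};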
I have a function that inserts into the database via a POST method. For debugging, I test it with Postman by sending an empty POST request so it executes the controller.
The function calls two more functions, one of which inserts into the DB. I want to execute this function automatically with node-cron.
My functions
import fs = require("fs");
import path = require("path");

export class GettingInfo {
  ReadingFileFromServer = () => {
    const file = path.resolve(__dirname, '../../../dist/entity/PRUEBA.txt')
    try {
      const data = fs.readFileSync(file, 'utf-8');
      const lines = data.split("\n")
      let values = []
      let bi = []
      lines.forEach(line => {
        values = line.trim().split("|", 6).map(a => a.trim());
        bi.push(values)
        console.log(bi)
      })
      const convert = this.TransformingFiletoJson(bi)
      console.log(convert)
      const save = this.SavingReferences(convert)
      console.log(save)
    } catch (err) {
      console.error(err, 'something has happened to the file');
    }
  }
For the moment, to test it, I call it in a controller.ts:
@Post('data')
createData() {
  const tasks = new GettingInfo(this.referenceService)
  tasks.ReadingFileFromServer()
  return "created! 201 test.."
}
But now I want it to run on its own, so I created a file "execute.ts" with the following code, and it does not run by itself:
import cron = require("node-cron")
import {GettingInfo} from "./reference.task";

cron.schedule("5 * * * * *", () => {
  const echale = new GettingInfo(this.referenceService)
  echale.ReadingFileFromServer()
  console.log("Executing...")
})
From what I can see in the node-cron documentation, you need to start the task in order to start the scheduled cron executions.
Change your code to:
const task = cron.schedule("5 * * * * *", () => {
  const echale = new GettingInfo(this.referenceService)
  echale.ReadingFileFromServer()
  console.log("Executing...")
})

task.start()
And it should work.
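Note also that a standalone execute.ts has no this, so the service has to be constructed explicitly (ReferenceService and its import path are assumptions here; inside a NestJS app you would normally let dependency injection provide it):

import cron = require("node-cron");
import { GettingInfo } from "./reference.task";
// hypothetical import path for the service
import { ReferenceService } from "./reference.service";

const referenceService = new ReferenceService();
const task = cron.schedule("5 * * * * *", () => {
  new GettingInfo(referenceService).ReadingFileFromServer();
  console.log("Executing...");
});
task.start();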
I would like this function to run by itself at time intervals. As it is now, I have to visit the '/getCompanyInfo' path to trigger it; I would like it to run every minute, as if I were visiting the '/getCompanyInfo' path each minute. The app is on Heroku, and I would like the function to execute without any pages open.
The original function that is triggered by visiting the path.
const express = require('express');
const app = express();

/**
 * getCompanyInfo ()
 */
app.get('/getCompanyInfo', function(req, res) {
  const companyID = oauthClient.getToken().realmId;
  console.log(companyID)
  const url = OAuthClient.environment.production;
  oauthClient.makeApiCall({url: url + 'v3/company/0000000000/salesreceipt/8?minorversion=41'})
    .then(function(authResponse){
      console.log("The response for API call is: " + JSON.parse(JSON.stringify(authResponse)));
      res.send(authResponse);
    })
    .catch(function(e) {
      console.error(e);
    });
});
One of my attempts here was to put it in a function that executes each minute using node-schedule.
This one doesn't do anything other than print 'This will run once a minute.' to the console.
I tried removing the app.get(function(req,res){ line and the matching }) below it, but that made the app (hosted on Heroku) fail to build.
const express = require('express');
const app = express();
var schedule = require('node-schedule');

var j = schedule.scheduleJob('* * * * *', function(){
  console.log('This will run once a minute.');
  app.get(function(req,res){
    const companyID = oauthClient.getToken().realmId;
    console.log(companyID)
    const url = OAuthClient.environment.production;
    oauthClient.makeApiCall({url: url + 'v3/company/0000000000/salesreceipt/8?minorversion=41'})
      .then(function(authResponse){
        console.log("The response for API call is: " + JSON.parse(JSON.stringify(authResponse)));
        res.send(authResponse);
      })
      .catch(function(e) {
        console.error(e);
      });
  });
});
More Context:
It is inside an app I have on Heroku. I would like to set the app to make a request for JSON data from the API every x time without me having to touch it.
app.get registers an API route handler, i.e. your API route definition: the thing that will respond when you call GET /getCompanyInfo via a web browser or some other client. You should not redefine it repeatedly from your scheduled action.
The failed build after you removed the route handler is probably because of the res.send(authResponse); left behind.
You could have something like:
// function that will be used to get the data
const getCompanyInfo = (done) => {
  const companyID = oauthClient.getToken().realmId
  console.log(companyID)
  const url = OAuthClient.environment.production
  oauthClient.makeApiCall({url: url + 'v3/company/0000000000/salesreceipt/8?minorversion=41'})
    .then((authResponse) => {
      console.log("The response for API call is: " + JSON.parse(JSON.stringify(authResponse)))
      done(authResponse)
    })
    .catch((e) => {
      console.error(e)
    })
}

// this will trigger the function regularly at the specified interval
const j = schedule.scheduleJob('* * * * *', () => {
  getCompanyInfo((companyInfo) => {
    // ...do whatever you want with the info
  })
})

// this will return the data on demand, when you call GET /getCompanyInfo via browser
app.get('/getCompanyInfo', function(req, res) {
  getCompanyInfo((companyInfo) => {
    res.send(companyInfo)
  })
})
Heroku has an add-on called Heroku Scheduler that does what you want. The node-schedule npm package might also do the job, but as you mentioned, you probably aren't going to be able to see the execution/results/logs of jobs that run every 24 hours without building some interface for it on your own.
For your issue, calling app.get doesn't make a lot of sense; that just tells node about the route. Assuming your /getCompanyInfo route is up and running, you just need to call it from your scheduled job, not re-register it every time.
You could also just do this (http being the http client you're using):
var j = schedule.scheduleJob('* * * * *', async function(){
  console.log('This will run once a minute.');
  const result = await http.get('/getCompanyInfo');
  console.log(result);
});
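Note that, depending on the client, you will likely need an absolute URL here (e.g. http://localhost:3000/getCompanyInfo) rather than a relative path, since the job runs outside of any request context.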