Bufferizing data from a stream in Node.js for performing bulk inserts - javascript

How can I efficiently buffer events from a stream in Node.js and bulk insert them, instead of issuing one insert per record received from the stream? Here's the pseudocode I have in mind:
// Open MongoDB connection
mystream.on('data', (record) => {
  // bufferize data into an array
  // if the buffer is full (1000 records),
  // bulk insert into MongoDB and empty buffer
})
mystream.on('end', () => {
  // close connection
})
Does this look realistic?
Is there any possible optimization? Are there existing libraries that facilitate this?

Using Node.js's stream library, this can be implemented concisely and efficiently as:
const stream = require('stream');
const util = require('util');
const mongo = require('mongodb');

let streamSource; // A stream of objects from somewhere

// Establish DB connection (the awaits below assume an async context or an ES module)
const client = new mongo.MongoClient("uri");
await client.connect();

// The specific collection to store our documents
const collection = client.db("my_db").collection("my_collection");

await util.promisify(stream.pipeline)(
  streamSource,
  new stream.Writable({
    objectMode: true,
    highWaterMark: 1000, // allow up to 1000 objects to buffer before signalling backpressure
    writev: async (chunks, next) => {
      try {
        const documents = chunks.map(({ chunk }) => chunk);
        await collection.insertMany(documents, { ordered: false });
        next();
      } catch (error) {
        next(error);
      }
    }
  })
);
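A side note on the util.promisify wrapper: on Node 15 and later the stream module also ships a promise-based pipeline, so the same call can be written without it. A minimal sketch of just that substitution, where batchingWritable is an illustrative name for the objectMode Writable defined above:
const { pipeline } = require('stream/promises');
// batchingWritable is the Writable with the writev batcher shown above
await pipeline(streamSource, batchingWritable);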

I ended up with a solution that needs no dependencies beyond the MongoDB driver.
const { MongoClient } = require("mongodb");

const url = process.env.MONGO_URI || "mongodb://localhost:27019";
const connection = MongoClient.connect(url, { useNewUrlParser: true, useUnifiedTopology: true });

Promise.resolve(connection)
  .then((client) => {
    const dbName = "databaseName";
    const collection = "collection";
    const dbo = client.db(dbName);

    let buffer = [];

    stream.on("data", (row) => {
      buffer.push(row);
      // once the buffer is full, bulk insert it and start a new one
      if (buffer.length > 10000) {
        dbo.collection(collection).insertMany(buffer, { ordered: false });
        buffer = [];
      }
    });

    stream.on("end", () => {
      // insert the last, partially filled chunk
      dbo.collection(collection).insertMany(buffer, { ordered: false })
        .then(() => {
          console.log("Done!");
          client.close();
        });
    });

    stream.on("error", (err) => console.log(err));
  })
  .catch((err) => {
    console.log(err);
  });
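One caveat with the 'data' handler above: the intermediate insertMany calls are not awaited, so nothing slows the source down while a batch is in flight and their errors go unhandled. A minimal sketch of one way to add that backpressure, assuming the same stream and dbo variables, is to pause the stream around each bulk insert:
stream.on("data", (row) => {
  buffer.push(row);
  if (buffer.length >= 10000) {
    const batch = buffer;
    buffer = [];
    stream.pause(); // stop receiving rows while the bulk insert runs
    dbo.collection(collection).insertMany(batch, { ordered: false })
      .then(() => stream.resume())
      .catch((err) => stream.destroy(err));
  }
});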

Related

How to filter with Mongo and Node using GridFS?

I am building an API with Node and Mongo that receives large volumes of data. I was getting an error because the size of the records stored in Mongo exceeded 16 MB, so I opted for the alternative Mongo offers in its GridFS documentation for inserting the records, which worked without problems. But I am stuck on inserting and filtering: I have read the documentation and there are several approaches, but I can't figure out how to filter (find a record by one of its fields) or how to update.
The function that creates a record works, but it goes through some steps I would like to avoid, such as storing the JSON it receives in a file and then reading that file back to create the record. I would prefer a more practical solution that inserts the received JSON directly, without having to create a file with its content and then read the information back out of it. I'm attaching the code in case you can tell me how to solve this:
const { MongoClient, ObjectId, GridFSBucket } = require('mongodb');
const { config } = require('../../config');
// const USER = encodeURIComponent(config.noRelDbUser);
// const PASSWORD = encodeURIComponent(config.noRelDbPassword);
const DB_NAME = config.noRelDbName;
const fs = require('fs');
const removeFile = require('../../modules/results/utils/RemoveFile');
// const MONGO_URI = `mongodb://${USER}:${PASSWORD}@${config.dbHost}:${config.dbPort}/admin?retryWrites=true&w=majority`
const MONGO_URI = `mongodb://${config.noRelDbHost}:${config.noRelDbPort}`;

class MongoLib {
  constructor() {
    this.client = new MongoClient(MONGO_URI, { useNewUrlParser: true, useUnifiedTopology: true });
    this.dbName = DB_NAME;
  }

  connect() {
    if (!MongoLib.connection) {
      MongoLib.connection = new Promise((resolve, reject) => {
        this.client.connect((err) => {
          if (err) {
            reject(err);
          }
          resolve(this.client.db(this.dbName));
        });
      });
    }
    return MongoLib.connection;
  }

  create(collection, data) {
    return this.connect()
      .then((db) => {
        return db.collection(collection).insertOne(data);
      })
      .then((result) => result.insertedId);
  }

  async createWithForBigData(collection, data, vr_id, remove = false) {
    let vrule_id = vr_id;
    return this.connect().then((db) => {
      try {
        var bucket = new GridFSBucket(db, {
          bucketName: collection,
          chunkSizeBytes: 260000,
        });
        let uploadStream = fs.createReadStream(data).pipe(bucket.openUploadStream(`resultsdetail${vrule_id}`));
        let id = uploadStream.id;
        uploadStream.on('error', (err) => {
          console.log({ message: "Error uploading file" });
          throw new Error(err);
        });
        uploadStream.on('finish', () => {
          console.log({ message: "File uploaded successfully, stored under Mongo ObjectID: " + id });
          if (remove === true) {
            console.log('removing file from the storebigdata directory');
            removeFile(data);
          }
          return id;
        });
      } catch (err) {
        console.log('an error occurred while storing big data', err);
        throw new Error(err);
      }
    });
  }

  findBigData() {
    //
  }

  UpdateBigData() {
    //
  }
}

module.exports = MongoLib;
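No answer is recorded for this question, but for reference: GridFS stores files rather than queryable documents, so "filtering by a field" usually means querying the file's filename or metadata with bucket.find(), and "updating" usually means uploading a new revision and deleting the old file. A minimal, hypothetical sketch (the uploadJson/findJson helpers and the metadata.vrId field are illustrative, not part of the original class) of uploading the received JSON straight from memory and reading it back:
const { GridFSBucket } = require('mongodb');
const { Readable } = require('stream');

// Hypothetical helper: stream the serialized JSON directly from memory, no temp file.
async function uploadJson(db, collection, data, vrId) {
  const bucket = new GridFSBucket(db, { bucketName: collection });
  const upload = bucket.openUploadStream(`resultsdetail${vrId}`, { metadata: { vrId } });
  await new Promise((resolve, reject) => {
    Readable.from([Buffer.from(JSON.stringify(data))]).pipe(upload)
      .on('error', reject)
      .on('finish', resolve);
  });
  return upload.id;
}

// Hypothetical helper: filter on metadata, then download and parse the stored file.
async function findJson(db, collection, vrId) {
  const bucket = new GridFSBucket(db, { bucketName: collection });
  const [file] = await bucket.find({ 'metadata.vrId': vrId }).toArray();
  if (!file) return null;
  const chunks = [];
  for await (const chunk of bucket.openDownloadStream(file._id)) chunks.push(chunk);
  return JSON.parse(Buffer.concat(chunks).toString('utf8'));
}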

FS.Readable Stream skipping rows when importing to noSQL using Mongoose schemas

I am attempting to import a CSV using my Mongoose model, and regardless of its size, I end up importing the first 2 rows and then every other row.
const fs = require('mz/fs');
const { parse } = require('@fast-csv/parse');
const streamToIterator = require('stream-to-iterator');
const mongoose = require('mongoose');
const Product = require('./schemas/Product');

mongoose.Promise = global.Promise;
mongoose.set('debug', true);

const options = {
  useNewUrlParser: true,
  useUnifiedTopology: true,
};

const database = mongoose
  .connect(process.env.DATABASE_URL, options)
  .then((db) =>
    (async function () {
      console.log('Connected to database.');
      try {
        await Promise.all(
          Object.entries(db.models).map(([k, m]) => m.deleteMany())
        );
        let headers = Object.keys(Product.schema.paths).filter(
          (k) => ['_id', '__v'].indexOf(k) === -1
        );
        if (await fs.exists('./database.csv')) {
          let stream = fs
            .createReadStream('./database.csv')
            .pipe(parse({ headers }));
          const iterator = await streamToIterator(stream).init();
          let buffer = [],
            counter = 0;
          for (let docPromise of iterator) {
            let doc = await docPromise;
            buffer.push(doc);
            counter++;
            if (counter > 10000) {
              await Product.insertMany(buffer);
              buffer = [];
              counter = 0;
            }
          }
          if (counter > 0) {
            await Product.insertMany(buffer);
            buffer = [];
            counter = 0;
          }
        }
      } catch (e) {
        console.error(e);
      }
    })()
  )
  .catch((err) => console.error('Error connecting to database:', err));

module.exports = database;
When I look at my doc variable, it is already malformed (every other row), and the stream is already malformed when I read it, so I'm assuming the problem occurs around there.
What I ended up doing to resolve this was to turn the CSV into JSON and import it as normal. Not ideal, and it doesn't really address the underlying issue, but my database has what it needs.
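As a point of comparison, readable streams have been async iterable since Node 10, so the stream-to-iterator wrapper can be dropped and the batching loop run directly against the parse stream. A minimal sketch under that assumption (it reuses the Product model and headers from the code above; it restructures the loop rather than diagnosing the row-skipping itself):
const fs = require('fs');
const { parse } = require('@fast-csv/parse');
const Product = require('./schemas/Product');

async function importCsv(path, headers, batchSize = 10000) {
  const csvStream = fs.createReadStream(path).pipe(parse({ headers }));
  let buffer = [];
  // for await pulls rows one at a time, so awaiting insertMany also applies backpressure
  for await (const row of csvStream) {
    buffer.push(row);
    if (buffer.length >= batchSize) {
      await Product.insertMany(buffer);
      buffer = [];
    }
  }
  if (buffer.length > 0) {
    await Product.insertMany(buffer); // flush the final partial batch
  }
}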

string to bufferstream not always writing data

I have a Cloud Function receiving a JSON string from a Pub/Sub topic.
The goal is to extract some data into a new JSON string, then parse it as JSONL, and finally stream it to Google Cloud Storage.
I notice that sometimes the files contain data and sometimes they do not.
Pub/Sub is working fine and data is coming into this Cloud Function just fine.
I tried adding some async/awaits where they seemed to fit, but I'm afraid it has to do with the buffer stream; both are topics I have trouble getting my head around.
What could be the issue?
const stream = require('stream');
const { Storage } = require('@google-cloud/storage');

// Initiate the source
const bufferStream = new stream.PassThrough();

// Creates a client
const storage = new Storage();

// save stream to bucket
const toBucket = (message, filename) => {
  // Write your buffer
  bufferStream.end(Buffer.from(message));
  const myBucket = storage.bucket(process.env.BUCKET);
  const file = myBucket.file(filename);
  // Pipe the 'bufferStream' into a 'file.createWriteStream' method.
  bufferStream.pipe(file.createWriteStream({
    validation: 'md5',
  }))
    .on('error', (err) => { console.error(err); })
    .on('finish', () => {
      // The file upload is complete.
      console.log(`${filename} is uploaded`);
    });
};

// extract correct fields
const extract = (entry) => ({
  id: entry.id,
  status: entry.status,
  date_created: entry.date_created,
  discount_total: entry.discount_total,
  discount_tax: entry.discount_tax,
  shipping_total: entry.shipping_total,
  shipping_tax: entry.shipping_tax,
  total: entry.total,
  total_tax: entry.total_tax,
  customer_id: entry.customer_id,
  payment_method: entry.payment_method,
  payment_method_title: entry.payment_method_title,
  transaction_id: entry.transaction_id,
  date_completed: entry.date_completed,
  billing_city: entry.billing.city,
  billing_state: entry.billing.state,
  billing_postcode: entry.billing.postcode,
  coupon_lines_id: entry.coupon_lines.id,
  coupon_lines_code: entry.coupon_lines.code,
  coupon_lines_discount: entry.coupon_lines.discount,
  coupon_lines_discount_tax: entry.coupon_lines.discount_tax,
});

// format json to jsonl
const format = async (message) => {
  let jsonl;
  try {
    // extract only the necessary
    const jsonMessage = await JSON.parse(message);
    const rows = await jsonMessage.map((row) => {
      const extractedRow = extract(row);
      return `${JSON.stringify(extractedRow)}\n`;
    });
    // join all lines as one string with no join symbol
    jsonl = rows.join('');
    console.log(jsonl);
  } catch (e) {
    console.error('jsonl conversion failed');
  }
  return jsonl;
};

exports.jsonToBq = async (event, context) => {
  const message = Buffer.from(event.data, 'base64').toString();
  const { filename } = event.attributes;
  console.log(filename);
  const jsonl = await format(message, filename);
  toBucket(jsonl, filename);
};
It's fixed by moving the bufferStream const into the toBucket function.
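For reference, a sketch of what that fix could look like: the PassThrough is created inside toBucket so each invocation gets a fresh stream instead of reusing one that has already ended, and the function returns a promise so jsonToBq can await the upload before the Cloud Function returns (the promise wrapper is an addition, not part of the original fix):
const stream = require('stream');
const { Storage } = require('@google-cloud/storage');

const storage = new Storage();

const toBucket = (message, filename) => new Promise((resolve, reject) => {
  // A new PassThrough per call: an ended stream is never reused across invocations.
  const bufferStream = new stream.PassThrough();
  const file = storage.bucket(process.env.BUCKET).file(filename);
  bufferStream
    .pipe(file.createWriteStream({ validation: 'md5' }))
    .on('error', reject)
    .on('finish', () => {
      console.log(`${filename} is uploaded`);
      resolve();
    });
  bufferStream.end(Buffer.from(message));
});

// ...and in the handler: await toBucket(jsonl, filename);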

How to recreate a Mongo database using the official Node.js MongoDB driver?

I have a simple case: I want to drop a database if it exists and then create a new one. I am using the official Node.js driver.
Now I am trying to do it with the following code:
const client = new MongoClient(dbUrl, { useNewUrlParser: true });
client.connect(() => {
  const db = client.db();
  db.dropDatabase();
  createUsers(db);
  client.close();
  console.log(`Database "${db.databaseName}" created successfully`);
}, (error) => {
  if (error) throw error;
});
But I get a "Topology was destroyed" error from MongoDB, and there is no method to create a database after dropping one. How can I recreate the database correctly?
Delete client.close(). It runs before the asynchronous dropDatabase() and createUsers() calls have finished, which is what destroys the topology.
Look at my code:
const { MongoClient } = require('mongodb');

MongoClient.connect('mongodb://localhost:27017', { useNewUrlParser: true }).then((client) => {
  console.log('Connected to MongoDB server');
  const dbOld = client.db('db01');
  // drop db01
  dbOld.dropDatabase().then(() => {
    console.log(`${dbOld.databaseName} drop successfully `);
    // create db01 again
    const dbNew = client.db('db01');
    console.log(`${dbNew.databaseName} recreated successfully `);
  }, e => console.log(e));
});
Good Luck!
The finished solution, based on @padeq's answer:
/**
 * Recreates a database
 * @param {String} dbUrl Database url
 */
const recreateTestDb = (dbUrl) => {
  MongoClient.connect(dbUrl, { useNewUrlParser: true }).then((client) => {
    const currentDb = client.db();
    currentDb.dropDatabase().then(() => {
      const newDb = client.db();
      createUsers(newDb);
      client.close().then(() => {
        console.log(`New database "${newDb.databaseName}" recreated successfully`);
      });
    });
  });
};
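The same thing reads a bit more plainly with async/await; a sketch under the assumption that createUsers returns a promise (MongoDB creates a database implicitly on the first write, so "recreating" it is simply writing to it again after the drop):
const { MongoClient } = require('mongodb');

const recreateTestDb = async (dbUrl) => {
  const client = await MongoClient.connect(dbUrl, { useNewUrlParser: true });
  try {
    const db = client.db();
    await db.dropDatabase();
    // The first write (here, inside createUsers) recreates the database implicitly.
    await createUsers(db);
    console.log(`New database "${db.databaseName}" recreated successfully`);
  } finally {
    await client.close();
  }
};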

Exit Node.js when a stream that writes JSON strings into MongoDB finishes its work

I use the request module to download a zip file that contains a .csv file, then I pipe the content through the unzip and split modules, and then I parse and write the result into MongoDB with the mongoose-object-stream module.
My code:
// index.js
var request = require('request');
var bun = require('bun');
var split = require('split');
var unzip = require('./lib/unzip');
var tomongo = require('./lib/tomongo');

var pipeline = bun([ unzip(), split() ]);

request.get( "http://someurl/somefile.zip" )
  .pipe( pipeline )
  .pipe( tomongo() );

// tomongo.js
var mySchema = require('../schema.json');
var through = require('through2');
var mos = require('mongoose-object-stream');
var mongoose = require('mongoose');
var models = require('../models');

const dbpath = "mongodb://localhost:27017/test";
const mongo = mongoose.connect(dbpath, { useNewUrlParser: true });

mongo.then(() => {
  console.log('mongoDB connected');
}).catch((err) => {
  console.log('err', err);
});

var db = mongoose.connection;
db.on('error', console.error.bind(console, 'connection error:'));

var modelStream = new mos(models.books);

function parser(){
  var columns = mySchema;
  var parseandwrite = function( chunk, _, cb ){
    var row = {}, cells = chunk.toString('utf-8').split('\t');
    cells.forEach( function( cell, i ){
      row[ columns[ i ] ] = ( cell || '' ).trim();
    });
    if( !!chunk ){
      modelStream.write( row );
    }
    cb();
  };
  return through.obj( parseandwrite );
}

module.exports = parser;
I want to do something when the stream ends and all records are stored in the DB.
I tried adding .on('finish', function(){ process.exit() }) or .on('end', function(){ process.exit() }) to the pipe, but Node keeps running.
I did it! The through2 stream needs a .on("data", function(){}) handler before the .on("end", ...) one: a readable only emits 'end' once its data is actually consumed, and attaching a 'data' listener switches it into flowing mode.
Now the process gracefully disconnects from the database and exits.
var request = require('request');
var bun = require('bun');
var split = require('split');
var mongoose = require('mongoose');
var unzip = require('./lib/unzip');
var tomongo = require('./lib/tomongo');

var pipeline = bun([unzip(), split()]);

function streamToDB(url) {
  return new Promise((resolve, reject) => {
    request.get(url)
      .pipe(pipeline)
      .pipe(tomongo())
      .on("data", function (data) {
        new aModel(data).save(); // here I save to the db
      })
      .on("error", reject)
      .on("end", resolve);
  });
}

mongoose.connect("mongodb://localhost:27017/test", {
  useNewUrlParser: true
}).then(() => {
  console.log('mongoDB connected');
  return streamToDB("http://someurl/somefile.zip");
}).catch((err) => {
  console.log('err', err);
}).then(() => {
  return mongoose.disconnect();
});

// tomongo.js
var parseandwrite = function (chunk, _, cb) {
  var row = {}, cells = chunk.toString('utf-8').split('\t');
  cells.forEach(function (cell, i) {
    row[columns[i]] = (cell || '').trim();
  });
  if (!!chunk) {
    this.push(row); // here I push the row to the stream
  }
  cb();
};
Assuming that your parser method is not the problem here, I would suggest moving the database connection logic into your index: you should connect to the DB before attempting to stream data to it. If you wrap the streaming logic in a Promise, you can handle the DB connection logic in one Promise chain.
Here's an example of what that might look like:
var Promise = require('bluebird');
var mongoose = require('mongoose');
var MongooseObjectStream = require('mongoose-object-stream');
var request = require('request');
var split = require('split');
var through = require('through2');
var unzip = require('unzip-stream');

function streamToDB(url) {
  return new Promise((resolve, reject) => {
    request.get(url)
      .pipe(unzip.Parse())
      .pipe(through.obj(function (entry, enc, cb) {
        if (entry.path === 'file_with_content') {
          entry.on('end', cb)
            .on('error', cb)
            .on('data', (data) => this.push(data));
        } else {
          entry.autodrain()
            .on('error', cb)
            .on('finish', cb);
        }
      }))
      .pipe(split())
      .pipe(through.obj((line, enc, cb) => {
        cb(null, line.split('\t')); // Convert to "real" object here
      }))
      .pipe(new MongooseObjectStream(mongoose, 'Model', {}, { strict: false }))
      .on('error', reject)
      .on('finish', resolve);
  });
}

mongoose.connect('mongodb://localhost:27017/test', {
  useNewUrlParser: true,
  promiseLibrary: Promise
}).then(() => {
  return streamToDB('http://someurl/somefile.zip')
    .finally(() => mongoose.disconnect());
}).catch((err) => {
  console.error(err);
});
