MongoDB to Elasticsearch indexing - javascript

I'm stuck at the point of indexing a data collection in Elasticsearch.
Following is the code I'm using to index the data from Mongo.
const elasticsearch = require('elasticsearch');
// instantiate an Elasticsearch client
var MongoClient = require('mongodb').MongoClient;
var ObjectID = require('mongodb').ObjectID;
var mongoDBName = 'mydb'; // Name of mongodb goes here
var mongoCollectionName = 'mycollection'; // Collection name of mongodb goes here
var connectionString = 'mongodb://127.0.0.1:27017/'; // put username and password for mongo here
var esIndexName = 'new-collection'; // Elasticsearch index name will go here
var bulk = [];
const client = new elasticsearch.Client({
  hosts: ['http://localhost:9200']
});
// ping the client to be sure Elasticsearch is up
client.ping({
  requestTimeout: 30000,
}, function(error) {
  // at this point, Elasticsearch is down, please check your Elasticsearch service
  if (error) {
    console.error('Elasticsearch cluster is down!');
  } else {
    console.log('Everything is ok');
  }
});
MongoClient.connect(connectionString + mongoDBName, function(err, db) {
  if (err) throw err;
  // for each object in a collection
  var collection = db.collection(mongoCollectionName);
  var counter = 0;
  collection.find().each(function(err, item, response, status) {
    console.log(item)
    Array.from(item).forEach(itemdata => {
      bulk.push({
        index: {
          _index: esIndexName,
          _type: mongoCollectionName,
        }
      })
      bulk.push(itemdata)
    })
    // perform bulk indexing of the data passed
    client.bulk({ body: bulk }, function(err, response) {
      if (err) {
        console.log("Failed Bulk operation".red, err)
      } else {
        console.log("Successfully imported %s".green, mongoCollectionName.length);
      }
      console.log(response);
    });
    if (item != null) {
      if (counter % 100 == 0) console.log("Syncing object id: " + item['_id'] + " #: " + counter);
      client.indices.create(
        { index: esIndexName },
        function(error, response) {
          if (error) {
            console.log(error);
          } else {
            console.log("created a new index", response);
          }
        }
      );
    }
    counter += 1;
  });
});
So here I'm trying to index data into Elasticsearch. I'm able to create the collection index, but I fail to insert the data into the Elasticsearch index. Can anyone help me here?
Where am I going wrong, and what mistake am I making?
I'm using Node.js here, just a simple function to test; later I will add a Lambda function to handle updates/deletes and react to any change.

First of all, I would suggest tidying up your code; it's very difficult to see how the blocks are nested.
Now, there are several problems with your code:
Why are you doing Array.from(item).forEach(itemdata => {? item is a document object from Mongo, so calling Array.from on it has no effect.
You are calling the bulk API inside the .each callback, meaning you'll make an API call for each document. I don't think this is what you want.
You are creating the index after the bulk operation. This is wrong. You should create your ES index once and for all before inserting documents. It's important because in the future, you'll want a more advanced configuration to process your documents.
Your ping call to ES is nice, but it doesn't prevent the rest of your code from running if the cluster is down.
So what you should do (a sketch follows below):
Create your ES index before iterating over your documents.
Iterate over your MongoDB documents and accumulate them in your body object.
When you have a batch of n documents, call the bulk API and reset your body.
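Here is a rough sketch of that flow (not a drop-in solution), reusing the legacy elasticsearch client and the connection settings from your question; the batch size of 500 is just an illustrative choice:
const elasticsearch = require('elasticsearch');
const MongoClient = require('mongodb').MongoClient;

const esClient = new elasticsearch.Client({ hosts: ['http://localhost:9200'] });
const BATCH_SIZE = 500; // illustrative, tune to your document size

async function sync() {
  // 1. Create the index once, before any documents are sent.
  await esClient.indices.create({ index: 'new-collection' })
    .catch(() => { /* index may already exist */ });

  const mongoClient = await MongoClient.connect('mongodb://127.0.0.1:27017/', { useNewUrlParser: true });
  const cursor = mongoClient.db('mydb').collection('mycollection').find();

  let body = [];
  while (await cursor.hasNext()) {
    const doc = await cursor.next();
    const { _id, ...source } = doc; // keep _id out of the document source
    // 2. Accumulate one action line and one document line per document.
    body.push({ index: { _index: 'new-collection', _type: 'mycollection' } });
    body.push(source);
    // 3. Flush the batch once it is full, then reset it.
    if (body.length >= BATCH_SIZE * 2) {
      await esClient.bulk({ body });
      body = [];
    }
  }
  if (body.length) {
    await esClient.bulk({ body }); // flush the remainder
  }
  await mongoClient.close();
}

sync().catch(console.error);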

Here is the solution you are looking for
index.js
//MongoDB client config
var MongoClient = require('mongodb').MongoClient;
var mongoDBName = 'mydb'; // Name of mongodb goes here
var mongoCollectionName = 'mycollection'; // Collection name of mongodb goes here
var connectionString = 'mongodb://127.0.0.1:27017/'; // put username and password for mongo here

//Elasticsearch client config
const { Client } = require('@elastic/elasticsearch')
const esClient = new Client({ node: 'http://localhost:9200' });
var esIndexName = 'new-collection'; // Elasticsearch index name will go here

let bulk = [];

async function indexData() {
  const client = await MongoClient.connect(connectionString, { useNewUrlParser: true })
    .catch(err => { console.log(err); });
  if (!client) {
    return;
  }
  try {
    const db = client.db(mongoDBName);
    let collection = db.collection(mongoCollectionName);
    await collection.find().forEach((doc) => {
      bulk.push({
        index: {
          _index: esIndexName,
        }
      })
      let { _id, ...data } = doc;
      bulk.push(data);
    })
    console.log(bulk);
    await esClient.indices.create({
      index: esIndexName,
    }, { ignore: [400] })
    const { body: bulkResponse } = await esClient.bulk({ refresh: true, body: bulk })
    if (bulkResponse.errors) {
      const erroredDocuments = []
      // The items array has the same order as the dataset we just indexed.
      // The presence of the `error` key indicates that the operation
      // that we did for the document has failed.
      bulkResponse.items.forEach((action, i) => {
        const operation = Object.keys(action)[0]
        if (action[operation].error) {
          erroredDocuments.push({
            // If the status is 429 it means that you can retry the document,
            // otherwise it's very likely a mapping error, and you should
            // fix the document before trying it again.
            status: action[operation].status,
            error: action[operation].error,
            operation: bulk[i * 2],
            document: bulk[i * 2 + 1]
          })
        }
      })
      console.log(erroredDocuments)
    }
    const { body: count } = await esClient.count({ index: esIndexName })
    console.log(count)
  } catch (err) {
    console.log(err);
  } finally {
    client.close();
  }
}

indexData();
package.json
{
  "name": "elastic-node-mongo",
  "version": "1.0.0",
  "description": "Simple example to connect ElasticSearch, MongoDB and NodeJS",
  "main": "index.js",
  "dependencies": {
    "@elastic/elasticsearch": "^7.3.0",
    "mongodb": "^3.3.2",
    "nodemon": "1.18.3"
  },
  "scripts": {
    "dev": "nodemon",
    "start": "node index.js"
  },
  "keywords": [
    "nodejs",
    "node",
    "mongodb",
    "elasticsearch",
    "docker"
  ],
  "author": "Sathishkumar Rakkiasmy",
  "license": "ISC"
}
Clarifications
I'm able to create the collection index but failed to insert the data
in an index of elastic search.
The sentence above makes sense: the bulk variable is still unaltered at the moment client.bulk is called, because the code that fills it runs asynchronously, after the call.
Refer to the links below for why the bulk variable is unaltered:
Why is my variable unaltered after I modify it inside of a function? - Asynchronous code reference
How do I return the response from an asynchronous call?
To learn more about asynchronous programming:
https://developer.mozilla.org/en-US/docs/Learn/JavaScript/Asynchronous
https://developer.mozilla.org/en-US/docs/Learn/JavaScript/Asynchronous/Async_await
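A minimal illustration of that pitfall, independent of MongoDB (the setTimeout stands in for the asynchronous cursor callback):
let bulk = [];
setTimeout(() => {            // stands in for the asynchronous .each()/.forEach() callback
  bulk.push({ index: {} });   // runs on a later tick of the event loop
}, 0);
console.log(bulk.length);     // prints 0 - the push has not happened yet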

You can use Logstash to import data from MongoDB into Elasticsearch. Please find the configuration below for your reference.
input {
  mongodb {
    codec => "json"
    uri => 'mongodb://localhost:27017/NewDb'
    placeholder_db_dir => '/home/devbrt.shukla/Desktop/scalaoutput/ELK/logstash-6.4.1/db_dir'
    placeholder_db_name => 'Employee_sqlite.db'
    collection => 'Employee'
    batch_size => 5000
    generateId => 'true'
    parse_method => "simple"
  }
}
filter {
  mutate {
    remove_field => [ "_id" ]
  }
}
output {
  elasticsearch {
    hosts => ["localhost:9200"]
    index => "employee-%{+YYYY.MM.dd}"
  }
  stdout { codec => rubydebug }
}
In Logstash there are three sections: Input, Filter and Output.
Input: takes data from SQL, MongoDB, MySQL, etc.
Filter: in this section, we can frame customized JSON to index into Elasticsearch.
Output: in this section we put the index name, doc type and IP address of the output side, i.e. Elasticsearch.

Related

How to Insert items into multiple Arrays in Node.js & MongoDB

This might be a weird question, but I believe nothing is completely impossible.
I have a list of users in MongoDB; each user has, among other things, a properties array which is currently empty.
In an Excel sheet, I have data that represents each user's properties, which I want to programmatically insert into each user's properties array.
Importing the Excel sheet is fast and easy; populating each user's properties is what gives me the problem.
I have added userId and propeId from the users and the properties they bought, to identify them, as seen below:
router.put('/importdata', async (req, res) => {
  // upload queries
  const imported = req.files.importdata;
  const uploadpath = path.resolve(`public/excel_maneger/uploads/ ${imported.name}`);
  if (imported.truncated) {
    throw new Error("Uploaded File is too big, should not be more than 20 MB");
  }
  await imported.mv(uploadpath);
  const file = render.readFile(uploadpath);
  const sheets = file.SheetNames;
  const data = [];
  for (let i = 0; i < sheets.length; i++) {
    const sheetname = sheets[i];
    const sheetData = render.utils.sheet_to_json(file.Sheets[sheetname]);
    sheetData.forEach((item) => {
      data.push(item);
    });
  }
  try {
    const users = await User.find({ role: 'Customer' })
    for (let i = 0; i < users.length; i++) {
      data.forEach((d) => {
        if (users[i].id == d.id) {
          User.updateMany(
            {},
            {
              $set: {
                properties: {
                  propeId: d.propeId,
                },
              },
            },
            (err, d) => {
              if (err) console.log(err);
            }
          );
        }
      });
    }
  } catch (err) {
    console.log(err)
  }
})
The problem is that this code updates everyone in the database (including non-specified users) with the same information. Please, I need help; I am trying to import 11 thousand users' information from Excel into the database.
When you call User.updateMany(), you are not passing in the ID.
What happens is that when the if statement is true, it updates all the users. You can use findByIdAndUpdate instead.
Also, you should be using async/await, since that is what you are already using to find the users:
await User.findByIdAndUpdate( users[i]._id,{ $set: { properties: { propeId: d.propeId }}})
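Putting it together, a rough sketch of that suggestion, reusing the users and data variables from the question and keeping the same matching condition:
for (let i = 0; i < users.length; i++) {
  for (const d of data) {
    if (users[i].id == d.id) {
      // update only the matched user, and wait for the write to finish
      await User.findByIdAndUpdate(users[i]._id, {
        $set: { properties: { propeId: d.propeId } },
      });
    }
  }
}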
I have finally figured out where I made the mistake: I am supposed to use the $in operator to match multiple IDs as desired.
Here's the solution:
data.forEach((d) => {
  User.updateMany(
    { _id: { $in: [d.id] } },
    {
      $push: {
        properties: d.propeId,
      },
    },
    (err, d) => {
      if (err) return false;
    }
  );
});
The above solved the problem amazingly well.

Error while deleting a value of element in mongoDB array using filter function?

I tried to find solutions over here, but I was unable to get anywhere while using $pull, as the array values I have do not contain a `mongo_id`.
So the scenario is that I am trying to delete a specific comment of a particular user, which I am passing through query params.
My mongo data looks like this:
Now I am making a DELETE API request like this: http://localhost:8000/api/articles/learn-react/delete-comment?q=1 on my localhost.
And finally my code looks like this:
import express from "express";
import bodyParser from "body-parser";
import { MongoClient } from "mongodb";

const withDB = async (operations, res) => {
  try {
    const client = await MongoClient.connect(
      "mongodb://localhost:27017",
      { useNewUrlParser: true },
      { useUnifiedTopology: true }
    );
    const db = client.db("my-blog");
    await operations(db);
    client.close();
  } catch (error) {
    res.status(500).json({ message: "Error connecting to db", error });
  }
};

app.delete("/api/articles/:name/delete-comment", (req, res) => {
  const articleName = req.params.name;
  const commentIndex = req.query.q;
  withDB(async (db) => {
    try {
      const articleInfo = await db.collection('articles').findOne({ name: articleName });
      let articleAllComment = articleInfo.comments;
      console.log("before =", articleAllComment)
      const commentToBeDeleted = articleInfo.comments[commentIndex];
      //console.log(commentToBeDeleted)
      // articleAllComment.update({
      //   $pull: { 'comments': { username: commentToBeDeleted.username } }
      // });
      articleAllComment = articleAllComment.filter((item) => item != commentToBeDeleted);
      await articleAllComment.save();
      console.log("after - ", articleAllComment);
      // need the index here, but not sure how to get it
      // articleInfo.comments = gives the article comments
      res.status(200).send(articleAllComment);
    }
    catch (err) {
      res.status(500).send("Error occurred")
    }
  }, res);
});
I have used the filter function; it is not showing any error in the terminal, but I am still getting a 500 status in Postman.
I am unable to figure out the error.
I believe you'll find a good answer here:
https://stackoverflow.com/a/4588909/9951599
Something to consider...
You can use MongoDB's built-in projection methods to simplify your code.
https://docs.mongodb.com/manual/reference/operator/projection/positional/#mongodb-projection-proj.-
By assigning a "unique ID" to each of your comments, you can find/modify the comment quickly using an update command instead of pulling out the comment by order in the array. This is more efficient, and much simpler. Plus, multiple read/writes at once won't interfere with this logic during busy times, ensuring that you're always deleting the right comment.
Solution #1: The recommended way, with atomic operators
Here is how you can let MongoDB pull it for you if you give each of your comments an ID.
await db.collection('articles').updateOne(
  { name: articleName },
  {
    // $pull takes the array field and a condition matching the elements to remove
    $pull: { comments: { id: commentID } }
  });
// Or
await db.collection('articles').updateOne(
  { name: articleName, "comments.id": commentID },
  {
    $unset: { "comments.$": 0 }
  });
Solution #2 - Not recommended
Alternatively, you could remove it by index:
// I'm using "3" here statically; put the index of your comment there instead.
db.collection('articles').updateOne({ name: articleName }, {
  $unset: { "comments.3": 0 }
})
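One caveat: $unset on an array element leaves a null placeholder at that position rather than shrinking the array, so the usual follow-up is a $pull of the null values:
// remove the null left behind by the $unset above
await db.collection('articles').updateOne(
  { name: articleName },
  { $pull: { comments: null } }
);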
I do not know why your filter is erroring, but I would recommend bypassing the filter altogether and trying to utilize MongoDB's atomic operators instead.

How to batch insert data into MySQL in NodeJS?

I am developing a REST API. One of the endpoints I have receives a list of data like below.
[
  {
    "iduser": 3,
    "title": "House in kandala",
    "description": "Built a house in kandala area"
  },
  {
    "iduser": 3,
    "title": "House in NYC",
    "description": "Built a house in greater NYC area"
  }
]
I need to save the list into the database. Below is my code.
const mysql = require('mysql2');
const errorCodes = require('source/error-codes');
const PropertiesReader = require('properties-reader');
const prop = PropertiesReader('properties.properties');

const con = mysql.createConnection({
  host: prop.get('server.host'),
  user: prop.get("server.username"),
  password: prop.get("server.password"),
  port: prop.get("server.port"),
  database: prop.get("server.dbname")
});

exports.saveSellerPortfolioItem = (event, context, callback) => {
  context.callbackWaitsForEmptyEventLoop = false;
  if (event.body == null && event.body == undefined) {
    var response = errorCodes.missing_parameters;
    callback(null, response)
  }
  else {
    let body = JSON.parse(event.body)
    console.log("body", body);
    let iduser = Number(body.iduser);
    let title = body.title;
    let description = body.description;
    if (isNaN(iduser)) {
      var response = errorCodes.invalid_parameter;
      callback(null, response);
    }
    else {
      // allows for using callbacks as finish/error-handlers
      const sql = "INSERT INTO seller_portfolio_item (iduser, title, description) VALUES (?,?,?)";
      con.execute(sql, [iduser, title, description], function (err, result) {
        if (err) {
          console.log(err.toString());
          if (err.toString().indexOf('cannot be null') >= 0) {
            var response = errorCodes.not_null_parameters;
            callback(null, response);
          }
          var response = errorCodes.internal_server_error;
          callback(null, response);
        }
        else {
          var response = {
            "statusCode": 200,
            "headers": {
              "Content-Type": "application/json"
            },
            "body": JSON.stringify({ insertId: result.insertId }),
            "isBase64Encoded": false
          };
          callback(null, response)
        }
      });
    }
  }
};
My code is capable of inserting just one record; it is not suitable for saving multiple records when I am sending a list. As a result, the client program would have to call the same method again and again in a loop.
How can I read the list and insert multiple records?
You are correct that going forward it is better to use mysql instead of mysql2. Below is one approach that can be used to batch insert multiple records.
Be sure to run npm install mysql --save to ensure you have the necessary package installed.
Working with multiple records requires some additional thinking and planning as well. You should consider:
does your table contain any unique keys other than the primary?
is it possible your API function will ever attempt to insert a duplicate?
in the event of a duplicate how should it be handled?
do you need to know the insert ID for every new record created?
will every object in your list always have the same number of entries, the same keys, and expected values?
Depending on your answers to the above considerations, the example I provided below might require additional code and complications. This example is the simplest implementation of the idea.
// package changed, remember to npm install…
const mysql = require('mysql');
const errorCodes = require('source/error-codes');
const PropertiesReader = require('properties-reader');
const prop = PropertiesReader('properties.properties');

const con = mysql.createPool({
  connectionLimit: 10,
  host: prop.get('server.host') || '127.0.0.1',
  user: prop.get("server.username") || 'local_user',
  password: prop.get("server.password") || 'local_password',
  database: prop.get("server.dbname") || 'local_database',
  multipleStatements: true, // necessary to run chained queries
  charset: 'utf8mb4' // necessary if you might need support for emoji characters - table charset must match
});

exports.saveSellerPortfolioItem = (event, context, callback) => {
  context.callbackWaitsForEmptyEventLoop = false;
  // It is better to check for the existence of your
  // expected request body in the controller stage of
  // your app but I've included this for consistency
  // with your original code.
  let query_object = event.body ? JSON.parse(event.body) : null;
  console.log('query_object', query_object);
  if (!query_object.length) {
    let response = errorCodes.missing_parameters;
    callback(null, response)
  }
  else {
    // use the keys of the first object to define the field names.
    // you don't have to use this approach but it provides flexibility
    // if you will not always use the same fields
    let keys = Object.keys(query_object[0]);
    // map the values into a supported format
    let values = query_object.map(obj => keys.map(key => obj[key]));
    // VALUES ? lets the mysql package expand the nested array of values
    // into (row1),(row2),... - note that values is wrapped in one more array below
    let sql = 'INSERT INTO seller_portfolio_item (' + keys.join(',') + ') VALUES ?;'
    con.query(sql, [values], function (error, results, fields) {
      if (error) callback(null, error);
      // when inserting multiples you will only get back the
      // insert id of the first record. if there are updates
      // due to duplicate keys, you won't even get that.
      // results will look like this:
      console.log(results);
      // Expected output
      // OkPacket {
      //   fieldCount: 0,
      //   affectedRows: 3,
      //   insertId: 1,
      //   serverStatus: 2,
      //   warningCount: 6,
      //   message: '&Records: 3 Duplicates: 0 Warnings: 6',
      //   protocol41: true,
      //   changedRows: 0
      // }
      let response = {
        "statusCode": 200,
        "headers": {
          "Content-Type": "application/json"
        },
        "body": JSON.stringify({ records_inserted: results.affectedRows }),
        "isBase64Encoded": false
      };
      callback(null, response)
    });
  }
};

Google Cloud Functions, resolveMX is not working with a list of domain

I have the following function. I have a list of domains (a very big list, more than 100,000), and I'm trying to loop over them, resolveMx all of them, and save the MX records in another database.
Edit: this is the complete function:
const dns = require('dns');
const { BigQuery } = require('@google-cloud/bigquery');
const bigquery = new BigQuery(project="smartiodomains");
const functions = require('firebase-functions');

exports.getMxRecords = functions.https.onRequest(async (req, res) => {
  const query = "SELECT string_field_0 FROM smartiodomains.Domains.sk_domains_table";
  const options = {
    query: query,
    location: 'US',
  };
  const [job] = await bigquery.createQueryJob(options);
  const [rows] = await job.getQueryResults();
  const datasetId = 'Domains';
  const tableId = 'smartio_records';
  var index = 0;
  rows.forEach((row) => {
    dns.resolveMx(row.string_field_0, function (error, addresses) {
      if (error) {
        const rows = [
          { domain: row.string_field_0, mx_records: 'No data found.', priority: 'No data found.' }
        ];
        // Insert data into a table
        bigquery
          .dataset(datasetId)
          .table(tableId)
          .insert(rows);
        res.write("Something");
      } else {
        res.write("Something else");
        addresses.forEach(address => {
          const rows = [
            { domain: row.string_field_0, mx_records: address.exchange, priority: address.priority }
          ];
          // Insert data into a table
          bigquery
            .dataset(datasetId)
            .table(tableId)
            .insert(rows).then((foundErrors) => {
              if (foundErrors && foundErrors.insertErrors != undefined) {
                console.log('Error: ', err);
              }
            })
            .catch((err) => {
              console.error('ERROR:', err);
            });
        });
      }
    });
  });
});
As @Doug Stevenson suggested, I added a response (res.write("Something")). Now I have one error and a warning:
1.- Memory limit exceeded
2.- TeenyStatisticsWarning: Possible excessive concurrent requests detected. 5000 requests in-flight, which exceeds the configured threshold of 5000. Use the TEENY_REQUEST_WARN_CONCURRENT_REQUESTS environment variable or the concurrentRequests option of teeny-request to increase or disable (0) this warning.
Old error:
With this implementation I got this error in the logs of GCF:
getMxRecordsp5ter5a8u17q { Error: queryMx ETIMEOUT marketingweb.sk
Sorry for my bad English. And thanks for any help.
An HTTP function requires that you send a response to the client after all of the asynchronous work is complete. The function terminates immediately after you send that response. Right now, you're not sending any response, so the function never terminates, and it will always time out. You should send a response after all the calls to dns.resolveMx are fully complete.
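A rough sketch of that advice (not a drop-in solution), assuming the same BigQuery query as in the question and omitting the BigQuery inserts for brevity; resolving the domains one at a time with await also keeps the number of in-flight requests low, which relates to the concurrency warning above:
const dns = require('dns');
const util = require('util');
const functions = require('firebase-functions');
const { BigQuery } = require('@google-cloud/bigquery');

const resolveMx = util.promisify(dns.resolveMx);
const bigquery = new BigQuery();

exports.getMxRecords = functions.https.onRequest(async (req, res) => {
  const [job] = await bigquery.createQueryJob({
    query: "SELECT string_field_0 FROM smartiodomains.Domains.sk_domains_table",
    location: 'US',
  });
  const [rows] = await job.getQueryResults();

  const results = [];
  for (const row of rows) {
    try {
      // awaiting each lookup keeps the number of concurrent DNS requests low
      const addresses = await resolveMx(row.string_field_0);
      results.push({ domain: row.string_field_0, addresses });
    } catch (err) {
      results.push({ domain: row.string_field_0, error: err.code });
    }
  }

  // all asynchronous work is finished: send one response, then the function may terminate
  res.status(200).json({ processed: results.length });
});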

Firebase functions: sync with Algolia doesn't work

I am currently trying to sync my firestore documents with algolia upon a new document creation or the update of a document. The path to the collection in firestore is videos/video. The function seems to be triggering fine, however after triggering, the firebase function does not seem to relay any of the information to algolia (no records are being created). I am not getting any errors in the log. (I also double checked the rules and made sure the node could be read by default, and yes I am on the blaze plan). Does anyone know how to sync a firestore node and algolia? Thanks for all your help!
"use strict";
const functions = require("firebase-functions");
const admin = require("firebase-admin");
const algoliasearch_1 = require("algoliasearch");
// Set up Firestore.
admin.initializeApp();
const env = functions.config();
// Set up Algolia.
// The app id and API key are coming from the cloud functions environment, as we set up in Part 1,
const algoliaClient = algoliasearch_1.default(env.algolia.appid, env.algolia.apikey);
// Since I'm using develop and production environments, I'm automatically defining
// the index name according to which environment is running. functions.config().projectId is a default property set by Cloud Functions.
const collectionindexvideo = algoliaClient.initIndex('videos');
exports.collectionvideoOnCreate = functions.firestore.document('videos/{uid}').onCreate(async(snapshot, context) => {
await savevideo(snapshot);
});
exports.collectionvideoOnUpdate = functions.firestore.document('videos/{uid}').onUpdate(async(change, context) => {
await updatevideo(change);
});
exports.collectionvideoOnDelete = functions.firestore.document('videos/{uid}').onDelete(async(snapshot, context) => {
await deletevideo(snapshot);
});
async function savevideo(snapshot) {
if (snapshot.exists) {
const document = snapshot.data();
// Essentially, you want your records to contain any information that facilitates search,
// display, filtering, or relevance. Otherwise, you can leave it out.
const record = {
objectID: snapshot.id,
uid: document.uid,
title: document.title,
thumbnailurl: document.thumbnailurl,
date: document.date,
description: document.description,
genre: document.genre,
recipe: document.recipe
};
if (record) { // Removes the possibility of snapshot.data() being undefined.
if (document.isIncomplete === false) {
// In this example, we are including all properties of the Firestore document
// in the Algolia record, but do remember to evaluate if they are all necessary.
// More on that in Part 2, Step 2 above.
await collectionindexvideo.saveObject(record); // Adds or replaces a specific object.
}
}
}
}
async function updatevideo(change) {
const docBeforeChange = change.before.data();
const docAfterChange = change.after.data();
if (docBeforeChange && docAfterChange) {
if (docAfterChange.isIncomplete && !docBeforeChange.isIncomplete) {
// If the doc was COMPLETE and is now INCOMPLETE, it was
// previously indexed in algolia and must now be removed.
await deletevideo(change.after);
} else if (docAfterChange.isIncomplete === false) {
await savevideo(change.after);
}
}
}
async function deletevideo(snapshot) {
if (snapshot.exists) {
const objectID = snapshot.id;
await collectionindexvideo.deleteObject(objectID);
}
}
Still don't know what I did wrong; however, if anyone else is stuck in this situation, this repository is a great resource: https://github.com/nayfin/algolia-firestore-sync. I used it and was able to properly sync Firebase and Algolia. Cheers!
// Takes the Algolia index and the key of the document to be deleted
const removeObject = (index, key) => {
  // then it deletes the document
  return index.deleteObject(key, (err) => {
    if (err) throw err
    console.log('Key Removed from Algolia Index', key)
  })
}

// Takes the Algolia index and the data to be added or updated
const upsertObject = (index, data) => {
  // then it adds or updates it
  return index.saveObject(data, (err, content) => {
    if (err) throw err
    console.log(`Document ${data.objectID} Updated in Algolia Index `)
  })
}

exports.syncAlgoliaWithFirestore = (index, change, context) => {
  const data = change.after.exists ? change.after.data() : null;
  const key = context.params.id; // gets the id of the document changed
  // If no data then it was a delete event
  if (!data) {
    // so delete the document from Algolia index
    return removeObject(index, key);
  }
  data['objectID'] = key;
  // upsert the data to the Algolia index
  return upsertObject(index, data);
};
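For reference, one possible way to wire that helper into a single Firestore trigger, reusing the collectionindexvideo index from the question; the module path and the {id} wildcard name are illustrative (the helper reads context.params.id, so the wildcard must be called id):
const functions = require('firebase-functions');
const { syncAlgoliaWithFirestore } = require('./algolia-sync'); // hypothetical local module path

exports.syncVideos = functions.firestore
  .document('videos/{id}') // onWrite covers create, update and delete in one trigger
  .onWrite((change, context) =>
    syncAlgoliaWithFirestore(collectionindexvideo, change, context));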
