Find Email-adresses in the mailbody with Mailparser - javascript

I'm quite new to the topic and i'm still having some issues with my mailparser. Though searching and finding emails in the email header (mail.from) does work, it doesn't work in the email body. Does anybody have some experience with that and is willing to help? You can find the function i'm talking about under the "// Check for other addresses in Mail-Body (Doesn't work yet)"-comment. I think, that my Regex is correct. Also if the matchAll-Function give back an array and it can't be saved in the the subscriber.email-object, it shall be at least logged to the console. Also i checked manually in the inbox if there are mails with email adresses in the mail body. There are at least two, which shall be found..
The part of the App.js, that does the mailparsing:
const simpleParser = require('mailparser').simpleParser;
//const htmlparser = require("htmlparser2");
var fs = require('fs');
var config = require('./config');
var Imap = require('imap');
var imap = new Imap(config.imap);
var blacklistString = '';
String.prototype.matchAll = function(regexp) {
var matches = [];
this.replace(regexp, function() {
var arr = ([]).slice.call(arguments, 0);
var extras = arr.splice(-2);
arr.index = extras[0];
arr.input = extras[1];
matches.push(arr);
});
return matches.length ? matches : null;
};
function openInbox(subbox,cb) {
imap.openBox('INBOX.'+subbox, true, cb);
}
function getBoxes(cb) {
imap.getBoxes(cb);
}
function showBoxes(boxes) {
imap.end();
}
function logArrayElements(element) {
if(element[1].indexOf('placeholder.de')==-1){
addToBlacklistString(element[1]);
}
}
function addToBlacklistString(str) {
blacklistString += str+"\n";
}
function writeBlacklistFile() {
fs.appendFile('data/data.csv', blacklistString, function (err) {
if (err) throw err;
console.log('Saved!');
});
}
function search(searchArray, regex){
imap.search(searchArray, function(err, results) {
if (err) throw err;
var temp = 0;
var mailtemp = [];
var f = imap.fetch(results, { bodies: '' });
f.on('message', function(msg, seqno) {
console.log('Message #%d', seqno);
var prefix = '(#' + seqno + ') ';
msg.on('body', function(stream, info) {
simpleParser(stream, (err, mail)=>{
//console.log(temp);
//console.log(mail.subject);
/*fs.writeFile('data/'+seqno+'.txt',mail.text, function(err){
console.log(err);
});*/
//var text = mail.text;
// New Subscriber Object
var subscr = new Subscriber({nr: '', mailIdent: '', from: '', emails: '', text:'', uLink: '', anwalt: false });
subscr.nr = seqno;
//Check for From-Address
if(!!mail.from) {
//console.log(mail.from.value);
for(var i = 0; i < mail.from.value.length; i++) {
mailtemp = mail.from.value[i].address.matchAll(regex);
mailtemp.forEach(function(element){
/*fs.appendFile('data/data.csv', element[0] + "\n", function(error){
console.log(error);
});*/
subscr.from = element[0];
});
if(!!mailtemp) {
mailtemp.forEach(logArrayElements);
}
}
}else{
//console.log(mail.text);
}
// Message-ID
if(!!mail.messageId) {
subscr.mailIdent = mail.messageId;
}
console.log(mail.messageId);
// Check for other addresses in Mail-Body (Doesn't work yet)
var regexEmails = new RegExp('/([\w\.\-\_\#\+]+#[\w\.\-\_äüö]+\.[a-zA-Z]+)/g');
if(!!mail.text){
if(mail.text.matchAll(regexEmails)!=null) {
subscr.emails = mail.text.matchAll(regexEmails);
console.log(subscr.emails);
}
}
/* Split mail.text at substrings in substr-array. Extend if necessary..
*
* Also check for 'Anwalt'-Expression in splitted Substring
*
* If mail.text doesn't exist -> Check for html body and convert it to text-format
*/
//var regexLink = new RegExp('\.de\/(unsubscribe|austragen)\/([^\"]+)');
var regexAnwalt = new RegExp('nwalt|echtsanwalt|rechtlicher');
if(!!mail.text) {
var substr = ["schrieb pplaceholder.de", "Von: \"placeholder.de", "Von: pplaceholder.de", "From: placeholder.de", "Ursprüngliche Nachricht"];
for (var i = 0; i<substr.length; i++) {
if(mail.text.indexOf(substr[i]) > -1) {
var textTemp = mail.text;
var arr = textTemp.split(substr[i]);
if(arr[0].matchAll(regexAnwalt)!=null) {
subscr.anwalt = true;
};
subscr.text = arr[0];
break;
} else {
subscr.text = mail.text;
}
}
//console.log(arr);
}
else
{
var html = mail.html;
var text = htmlToText.fromString(html, {
noLinkBrackets: true,
ignoreImage: true,
uppercaseHeadings: false,
preserveNewlines: false,
wordwrap:130,
format: {
heading: function (node, fn, options) {
var h = fn(node.children, options);
return '\n==== ' + h + ' ====\n\n';
}
}
});
subscr.text = text;
}
mail.headers.forEach(function(value, key) {
//console.log(value);
});
subscr.save();
//console.log(subscr);
temp++;
});
});
msg.once('end', function() {
console.log(prefix + 'Finished');
});
});
f.once('error', function(err) {
console.log('Fetch error: ' + err);
});
f.once('end', function() {
console.log('Done fetching all messages!');
//writeBlacklistFile();
imap.end();
});
});
}
imap.once('ready', function() {
openInbox('Test',function(err, box) {
var searchArray = [['FROM', '#']];
search(searchArray,/([\w\.\-\_\#\+]+#[\w\.\-\_äüö]+\.[a-zA-Z]+)/g);
});
});
imap.once('error', function(err) {
console.log(err);
});
imap.once('end', function() {
console.log('Connection ended');
});
imap.connect();
app.listen(2700, function(){
console.log("Listening on Port 2700")
});
module.exports = app;
subscriber.js
const mongoose = require('mongoose');
var subscriberSchema = mongoose.Schema({
nr: Number,
mailIdent: String,
from: String,
emails: String,
text: String,
uLink: String,
anwalt: Boolean
});
var Subscriber = module.exports = mongoose.model('Subscriber', subscriberSchema);
//get Subscriber
module.exports.getSubscribers = function(callback, limit){
Subscriber.find(callback).limit(limit);
};
module.exports.getSubscriberByID = function(_id, callback){
Subscriber.findById(_id, callback);
};

The Regex for the Emails was a little bit wrong.
Also i didn't noticed that the matchAll-Fct. is giving back a two-dimensional Array. Here is the changed part of the code:
var regexEmails = new RegExp("([\\w\\.\\-\\_\\#\\+]+#[\\w\\.\\-\\_äüö]+\\.[a-zA-Z]+)");
var temp1 = mail.text.matchAll(regexEmails);
if(!!temp1){
//console.log(temp1);
for(var i =0; i<temp1.length; i++) {
if(temp1[0][i]!=='info#service.placeholder.de' && temp1[0][i] !== "info#placeholder.de"){
subscr.emails += temp1[0][i];
}
}
}

Related

ReplaceAll causing issues in array.reduce

I am still pretty new to this, so forgive me if I dont' say this correctly. We have an array.reduce that calls a method with a returning promise that iterates through a list of files and post results to the db. Everything was working great, until it ran into a field that had an apostrophe in it and then the db insert fails. This is the field value. 'Expected 100002822' to be 100002822.'
I tried adding a replaceAll on the field and now get an error in the array.reduce.
Here is the .reduce
console.log('Found test results in ' + matches.length + ' files. Parsing and posting to the database now...');
var startTime = moment();
var parser = new Parser();
matches.reduce(function (p, val) {
return p.then(function () {
return parser.parseResults(val);
});
}, Promise.resolve()).then(function (finalResult) {
var endTime = moment();
var testDuration = moment.duration(endTime.diff(startTime));
console.log(chalk.blue('*** File parsing time: ' + testDuration.humanize() + ' ***'));
if (finalResult.insertSuccess == matches.length) {
var publishOut = {
totalFiles: matches.length,
totalTests: 0,
totalTestsSuccess: 0,
totalTestsFailed: 0
}
publishOut.totalTests += finalResult.totalTests;
publishOut.totalTestsSuccess += finalResult.testPassedCount;
publishOut.totalTestsFailed += finalResult.testFailedCount;
console.log(`Successfully inserted ${finalResult.insertSuccess} of ${publishOut.totalTests} test results.`);
// for (var i = 0; i < matches.length; i++) {
// var currentFile = `./testing/results/${matches[i]}`;
// fs.unlinkSync(currentFile);
// }
resolve(publishOut);
} else {
reject('Only ' + finalResult.insertSuccess + ' of ' + matches.length + ' successfully posted to the database');
}
}, function (err) {
reject('error in reduce', err);
});
I have tried several different ways of using the replaceAll with the same failure. It hits this code from the array.reduce
}, function (err) {
reject('error in reduce', err);
});
And this is the called method. The added code causing the failure in the .reduce is this Message = expectation.message.replaceAll("'", "");
protractorParser.prototype.parseResults = function (fileName) {
return new Promise((resolve, reject) => {
//console.log('In parseresults', fileName);
var currentFile = './testing/results/' + fileName
json.readFile(currentFile, function (err, obj) {
if (err != null) {
console.log('error reading file', err);
reject(err);
} else {
resolve(obj);
}
});
}).then(function (obj) {
var results = [];
for (var suite in obj) {
var specs = obj[suite].specs;
for (let i = 0; i < specs.length; i++) {
const assert = specs[i];
const tcR = /TC[\d]+/;
const tc = assert.description.match(tcR);
let Passed = 1;
let Message = '';
let Stack = '';
testResults.totalTests++;
if (assert.failedExpectations.length) {
const expectation = assert.failedExpectations[assert.failedExpectations.length - 1];
Passed = 0;
Message = expectation.message.replaceAll("'", "");
Stack = expectation.stack.split('\n')[1].trim();
testResults.testFailedCount++
} else {
testResults.testPassedCount++
}
if (tc != null) {
const time = moment().utcOffset(config.get('settings.timeOffset')).format('YYYY-MM-DDTHH:mm:ss');
const promise = utility.TestDataManager.insertAutomationResults(tc[0], assert.description, Passed, process.env.testBuild, 'P', Message, Stack, 0, time, '');
results.push(promise.then(() => {
//fs.unlinkSync(currentFile);
testResults.insertSuccess++;
//console.log('insertSuccess', testResults.insertSuccess);
},
err => { console.log('… failed', err); throw err; }
));
} else {
console.log('no test case found for test: ' + assert.description + ' -- skipping');
// I don't think you want to `throw err` here, right?
}
}
}
return Promise.all(results).then(() => testResults);
});
};

Node.js readStream for large data processing

I'm having trouble creating a line by line node.js method of processing large nessus xml files without high RAM usage. In its current form, it is saving data in MongoDB correctly, however the RAM usage keeps increasing, and errors out with files over ~1.5GB.
I've tried using .pause() on the readStream, however, I must have implemented it incorrectly, because it never seemed to actually pause the stream.
Here is the code:
// LR.JS Imports
var fs = require('fs');
var readline = require('readline');
var stream = require('stream');
var instream = fs.createReadStream('test.nessus');
var outstream = new stream;
var rl = readline.createInterface(instream, outstream);
var buff = false;
var parseString = require('xml2js').parseString;
var buffStream = '';
//Mongoose Imports
var mongoose = require('mongoose');
var ReportHostDoc = require('./schemas/report-host.model.js');
var ReportItemDoc = require('./schemas/report-item.model.js');
var PluginDetailDoc = require('./schemas/plugin-detail.model.js');
mongoose.Promise = require('bluebird');
// Mongoose Connect
mongoose.connect('mongodb://localhost/test');
var db = mongoose.connection;
db.on('error', console.error.bind(console, 'connection error:'));
db.once('open', () => {
// Create counters for _taskCheck
var reportHostCounter = 0;
var reportHostSaveCounter = 0;
rl.on('line', (line) => {
// process line here
if (/[<]ReportHost/.test(line)) {
buff = true;
reportHostCounter++;
}
if (buff == true) {
buffStream += line + '\n';
}
if (/[<][/]ReportHost/i.test(line)) {
buff = false; // changed to = not == 9/6
// XML2JS Parse ReportHost Buffstream
parseString(buffStream, (err, result) => {
// Loop through ReportHost properties to reliably find IP
var reportHostIP = '';
var reportHostOS = '';
result.ReportHost.HostProperties[0].tag.forEach((entry) => {
if (entry.$.name === 'host-ip') {
reportHostIP = entry._;
}
if (entry.$.name === 'operating-system') {
reportHostOS = entry._;
}
});
// Save Report Host Document
var host = new ReportHostDoc({
hostname: result.ReportHost.$.name,
ip: reportHostIP,
os: reportHostOS,
high: 0,
critical: 0
});
// Process Each Report Item
result.ReportHost.ReportItem.forEach((entry) => {
var cvssScore = '';
if (entry.cvss_base_score) {
cvssScore = JSON.stringify(entry.cvss_base_score).slice(2, 5)
} else {
cvssScore = 0;
}
var item = new ReportItemDoc({
itemName: entry.$.pluginName,
pluginID: entry.$.pluginID,
ipAddress: reportHostIP,
exploitAvailable: entry.exploit_available,
cvssBaseScore: cvssScore,
pluginPublishedDate: entry.plugin_publication_date,
pluginModifiedDate: entry.plugin_modification_date,
description: entry.description
})
if (item.cvssBaseScore >= 7 && item.cvssBaseScore < 10) {
host.high++;
}
if (item.cvssBaseScore == 10) {
host.critical++;
}
item.save((err, item) => {
if (err) return console.log(err);
})
});
host.save((err, host) => {
if (err) return console.log(err);
reportHostSaveCounter++;
});
})
buffStream = ''; // Empty buffer for next report host
}
});
rl.on('close', () => { // Read Stream Finished
console.log('Log Parse finished!');
var _taskCheck = setInterval(() => { // Async loop waits for all tasks to finish
if (reportHostCounter == reportHostSaveCounter) {
clearInterval(_taskCheck);
var pluginCounter = 0;
var pluginSaveCounter = 0;
ReportItemDoc.distinct('pluginID', (err, ids) => {
ids.forEach((id) => {
pluginCounter++;
ReportItemDoc.findOne({
'pluginID': id
}, (err, plugin) => {
ReportItemDoc.count({
'pluginID': id
}, (err, count) => {
var pluginSeverity = '';
var cvss = plugin.cvssBaseScore;
if (cvss >= 7 && cvss < 10) {
pluginSeverity = 'High';
}
if (cvss == 10) {
pluginSeverity = 'Critical';
}
item = new PluginDetailDoc({
pluginName: plugin.itemName,
pluginID: id,
severity: pluginSeverity,
quantity: count,
description: plugin.description
})
item.save((err, host) => {
if (err) return console.log(err);
pluginSaveCounter++;
});
})
});
})
})
var _pluginTaskCheck = setInterval(() => { // Async loop waits for all tasks to finish
if (pluginCounter == pluginSaveCounter) {
clearInterval(_pluginTaskCheck);
mongoose.connection.close();
}
}, 100);
}
}, 100);
});
});

Send event from server to client to run a function

I looked at various answers but I can't not find a way to get this going for myself.
I have a function (in node.js) that selects a winner out of a pool, when it selects the winner though, I need it to send a event to the client where it runs a function with data. The data would be the array index of the winner.
Selecting a winner:
var endRound = function() {
ref.child('currentJackpot').once('value', function(data) {
var currentJackpot = data.val();
var winnerArray = [];
var winnerObj = {};
winnerObj.items = [];
for (var i = 0; i < currentJackpot.players.length; i++) {
winnerObj.items = winnerObj.items.concat(currentJackpot.players[i].items);
var playerValue = currentJackpot.players[i].itemsValue * 100;
currentJackpot.players[i].chance = ((currentJackpot.players[i].itemsValue / currentJackpot.jackpotValue) * 100).toFixed(2);
for (var j = 0; j < playerValue; j++) {
winnerArray.push(i);
}
}
var formatted = currentJackpot.roundHash.replace(/[.#$/]/g, "");
sgRef.child(formatted).once('value', function(data) {
var sgData = data.val();
salt = sgData.salt;
rngStr = sgData.rngStr;
console.log('ROUND ENDED! hash: ', hash, ' salt: ', salt, ' rngStr: ', rngStr);
currentJackpot.tickets = currentJackpot.jackpotValue * 100;
currentJackpot.winningTicket = Math.floor((parseFloat(rngStr, 2) * currentJackpot.tickets));
currentJackpot.winningNumber = (parseFloat(rngStr, 2) * 100).toFixed(2) + "%";
currentJackpot.winner = currentJackpot.players[winnerArray[currentJackpot.winningTicket]];
currentJackpot.salt = salt;
currentJackpot.rngStr = rngStr;
winnerObj.jackpotValue = currentJackpot.jackpotValue;
currentJackpot.jackpotValue = currentJackpot.jackpotValue.toFixed(2);
winnerObj.winner = currentJackpot.winner;
winnerObj.tradeToken = currentJackpot.winner.tradeToken;
ref.child('endedJackpots').push(currentJackpot);
bcrypt.genSalt(10, function(err, data) {
salt = data;
rngStr = JSON.stringify(rng());
bcrypt.hash(rngStr, salt, function(err, data) {
hash = data;
ref.child('currentJackpot').set({
itemsCount: 0,
jackpotValue: 0,
roundHash: hash,
}, function() {
console.log('NEW ROUND! hash: ', hash, 'salt: ', salt, 'rngStr: ', rngStr);
var formatted = hash.replace(/[.#$/]/g, "");
var sgJackpotRef = sgRef.child(formatted);
sgRef.set({}, function() {
sgJackpotRef.set({
salt: salt,
rngStr: rngStr,
}, function() {
request.post({
url: '*******',
body: winnerObj,
json: true,
}, function(error, response, body) {
if (error) {
console.log(error);
setPollTimer(10000);
} else {
usersRef.child(winnerObj.winner.id).once('value', function(data) {
var userData = data.val();
if (data.child('won').exists()) {
userData.won = (Math.floor(parseFloat(userData.won, 2)) + Math.floor(parseFloat(winnerObj.jackpotValue, 2))).toFixed(2);
} else {
userData.won = (Math.floor(parseFloat(winnerObj.jackpotValue, 2))).toFixed(2);
}
usersRef.child(winnerObj.winner.id).update({
won: userData.won
}, function() {
console.log('Added winnings to user data');
});
});
console.log('Making a withdraw request now to bot');
setPollTimer(10000);
}
});
});
});
});
});
});
});
});
};
And the function it should run client side:
function slotMachine(winnerIndex) {
var params = {
active: 3,
randomize: function(activeElementIndex){
return 1;
}
};
var machine = $('.slot').slotMachine( params );
$("#slotMachineButton").click(function(){
machine.shuffle(3, function(){
$(this).text("Index: " + this.active);
});
});
}
I'm completely stuck and stressed though, this isn't my code and It's a lot of code I can't grasp where to begin to do this. Any help?

Trouble using nested callbacks in NodeJS

I'm writing a program that scrapes a site for links, then scrapes these links for information. In order to scrape the site, it is necessary to log in first. And so the order is: Log in -> Scrape the index for links -> Scrape the links for info
The callback to the login function prints an empty array { results: [], hasMore: true }, so something is wrong with my code (the scraping part works):
var request = require('request');
var request = request.defaults({jar: true}); // necessary for persistent login
var cheerio = require('cheerio');
var url1 = "https://example.org/torrents/browse/index/";
var loginUrl = "https://example.org/user/account/login/";
var credentials = {
username: 'user1',
password: 'passpass'
};
login(function (result) {
console.log(result);
});
function login(callback) {
request.post({
uri: loginUrl,
headers: { 'content-type': 'application/x-www-form-urlencoded' },
body: require('querystring').stringify(credentials)
}, function(err, res, body){
if(err) {
console.log("Login error");
return;
}
scrapeTorrents(url1, function (result) {
callback(result);
});
});
}
function scrapeTorrents(url, callback) {
request(url, function(err, res, body) {
if(err) {
console.log("Main scrape error");
return;
}
var links = []
var $ = cheerio.load(body);
$('span.title').each(function(i, element){
var title = $(this);
var a = $(this).children().eq(0);
var detailsUrl = a.attr('href');
//console.log(detailsUrl);
links.push(detailsUrl);
});
scrapeTorrentDetails(links, function (result) {
callback(result);
});
});
}
function scrapeTorrentDetails(links, callback) {
var results = [];
function getDetails(url) {
request(url, function(err, res, body) {
if(err) {
console.log("Detail scrape error");
return;
}
console.log("Scraping: " + url);
var $ = cheerio.load(body);
var tds = $('td');
var title = $(tds).get(1).firstChild.data;
var hash = $(tds).get(3).firstChild.data.trim();
var size = $(tds).get(9).firstChild.data;
// console.log(tds.length);
if (tds.length > 23) {
var rlsDate = $(tds).get(23).firstChild.data || '';;
var genres = $(tds).get(27).firstChild.data || '';;
var runtime = $(tds).get(31).firstChild.data || '';;
if ( $(tds).get(33).firstChild != null) {
var plot = $(tds).get(33).firstChild.data || '';;
}
var rating = $('#imdb_rating').parent().next().text() || '';; // of 10
var imdb_id = $('[name=imdbID]').get(0).attribs.value || '';;
var cover = $('#cover').children().eq(0).get(0).attribs.href || '';;
var thumb = $('[alt=Cover]').get(0).attribs.src || '';;
if (typeof cover == 'undefined') {
cover = thumb;
}
} else {
var rlsDate = "notfound";
var genres = "notfound";
var runtime = "notfound";
var plot = "notfound";
var rating = "notfound"; // of 10
var imdb_id = "notfound";
var cover = "notfound";
var thumb = "notfound";
}
var movie = {
type: 'movie',
imdb_id: imdb_id,
title: title,
year: rlsDate,
genre: genres,
rating: rating,
runtime: runtime,
image: thumb,
cover: cover,
synopsis: plot,
torrents: {
magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
filesize: size
}
};
results.push(movie);
});
}
for (var i=0; i<links.length; i++){
getDetails("https://example.org" + links[i]);
}
callback( {
results: results,
hasMore: true
});
}
Maybe Q promises would be better. How would I implement that in the code above?
If you're wondering what the code is for, I'm planning to modify Popcorn-time to use another torrent-tracker (without an API).
Thanks
A main problem is with this code:
for (var i=0; i<links.length; i++){
getDetails("https://example.org" + links[i]);
}
callback( {
results: results,
hasMore: true
});
getDetails() is async, but you just call it links.length times and move on - acting like they have all completed. So, none of the requests in getDetails() is done before you call the callback and try to pass the results. But, none of the results have yet been filled in so they will be empty.
You have all these other nested callbacks everywhere through your code (as required), yet you dropped the ball in this one place. You need to know when all the getDetails() calls are done before you call the final callback with the results.
In addition, you also have to decide if you're OK calling all the getDetails() calls in parallel (all in flight at once) or if what you really want to do is to call one, wait for it to finish, then call the next, etc... Right now you are putting them all in-flight at once which can work if the destination server doesn't object to that many requests all at once.
There are several potential strategies for fixing this.
Add a callback to getDetails() and then keep a count of when you've gotten links.length callbacks from getDetails() and only when the entire count has finished so you call the final callback.
Change getDetails() to return a promise. You can then use something like links.map(getDetails) to create an array of promises that you can then use Promise.all() with to know when they are all done.
Personally, I would change all of your code to use promises and I'd use the Bluebird promises library for it's extra features such as Promise.map() to make this even simpler.
Here's a fix that adds a callback to getDetails() and then counts how many are done:
function scrapeTorrentDetails(links, callback) {
var results = [];
function getDetails(url, done) {
request(url, function(err, res, body) {
if(err) {
console.log("Detail scrape error");
done(err);
return;
}
console.log("Scraping: " + url);
var $ = cheerio.load(body);
var tds = $('td');
var title = $(tds).get(1).firstChild.data;
var hash = $(tds).get(3).firstChild.data.trim();
var size = $(tds).get(9).firstChild.data;
// console.log(tds.length);
if (tds.length > 23) {
var rlsDate = $(tds).get(23).firstChild.data || '';;
var genres = $(tds).get(27).firstChild.data || '';;
var runtime = $(tds).get(31).firstChild.data || '';;
if ( $(tds).get(33).firstChild != null) {
var plot = $(tds).get(33).firstChild.data || '';;
}
var rating = $('#imdb_rating').parent().next().text() || '';; // of 10
var imdb_id = $('[name=imdbID]').get(0).attribs.value || '';;
var cover = $('#cover').children().eq(0).get(0).attribs.href || '';;
var thumb = $('[alt=Cover]').get(0).attribs.src || '';;
if (typeof cover == 'undefined') {
cover = thumb;
}
} else {
var rlsDate = "notfound";
var genres = "notfound";
var runtime = "notfound";
var plot = "notfound";
var rating = "notfound"; // of 10
var imdb_id = "notfound";
var cover = "notfound";
var thumb = "notfound";
}
var movie = {
type: 'movie',
imdb_id: imdb_id,
title: title,
year: rlsDate,
genre: genres,
rating: rating,
runtime: runtime,
image: thumb,
cover: cover,
synopsis: plot,
torrents: {
magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
filesize: size
}
};
results.push(movie);
done();
});
}
var doneCnt = 0;
for (var i=0; i<links.length; i++){
getDetails("https://example.org" + links[i], function() {
++doneCnt;
if (doneCnt === links.length) {
callback( {
results: results,
hasMore: true
});
}
});
}
}
The following is the given sample code rewritten to use bind, a custom this object and a count of the requests that have yet to complete (I think promises obscure the execution path).
The reason that the callback is returning an empty array seems to be that there are no spans in the document with a title attribute, so as a result no further requests are triggered.
var
request = require('request').defaults({
jar: true
}), // necessary for persistent login
cheerio = require('cheerio'),
process = require('process'),
url1 = "https://example.org/torrents/browse/index/",
loginUrl = "https://example.org/user/account/login/",
login = function(callback) {
request.post({
uri: loginUrl,
headers: {
'content-type': 'application/x-www-form-urlencoded'
},
body: require('querystring').stringify({
username: 'user1',
password: 'passpass'
})
}, fna.bind({
callback: callback
}));
},
fna = function(err, res, body) {
if (err) {
console.log("Login error");
return;
}
request(url1, fnb.bind(this));
},
fnb = function(err, res, body) {
if (err) {
console.log("Main scrape error");
return;
}
var
$ = cheerio.load(body),
links = [],
fnd = fne.bind(this);
$('span.title').each(function() {
links.push($(this).children().first().attr('href'));
});
this.results = [];
this.resultCount = links.length;
if (this.resultCount) {
fnd = fnc.bind(this);
for (var i = 0; i < links.length; i++) {
request("https://example.org" + links[i], fnd);
}
} else {
process.nextTick(fnd);
}
},
fnc = function(err, res, body) {
if (err) {
console.log("Detail scrape error");
return;
}
console.log("Scraping: " + url);
var
$ = cheerio.load(body),
tds = $('td'),
title = $(tds).get(1).firstChild.data,
hash = $(tds).get(3).firstChild.data.trim(),
size = $(tds).get(9).firstChild.data,
rlsDate = "notfound",
genres = "notfound",
runtime = "notfound",
plot = "notfound",
rating = "notfound", // of 10
imdb_id = "notfound",
cover = "notfound",
thumb = "notfound";
if (tds.length > 23) {
rlsDate = $(tds).get(23).firstChild.data || '';
genres = $(tds).get(27).firstChild.data || '';
runtime = $(tds).get(31).firstChild.data || '';
if ($(tds).get(33).firstChild != null) {
plot = $(tds).get(33).firstChild.data || '';
}
rating = $('#imdb_rating').parent().next().text() || ''; // of 10
imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
cover = $('#cover').children().eq(0).get(0).attribs.href || '';
thumb = $('[alt=Cover]').get(0).attribs.src || '';
if (typeof cover == 'undefined') {
cover = thumb;
}
}
this.results.push({
type: 'movie',
imdb_id: imdb_id,
title: title,
year: rlsDate,
genre: genres,
rating: rating,
runtime: runtime,
image: thumb,
cover: cover,
synopsis: plot,
torrents: {
magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
filesize: size
}
});
this.resultCount--;
if (this.resultCount === 0) {
this.callback({
results: this.results,
hasMore: true
});
}
},
fne = function() {
this.callback({
results: this.results,
hasMore: true
});
};
login(function(result) {
console.log(result);
});

Extract common code from sails.js / mongodb

I'm current trying to use sails.js with mongodb, I need some custom mapReduce function to group data.
Now I could achieve what I want by using waterline's native function, but have some questions.
These function has only small variation actually, but I found myself keep repeating codes like the following one:
function getSomeData() {
// First-query
Log.native(function(err, logCollection) {
var mapFunction = function() {
function dateFormatter(date) {
return date.getFullYear() + "-" + (date.getMonth() + 1)
}
//! Generate Grouping Key
emit(dateFormatter(this.emb_date), this.bad_qty)
}
var reduceFunction = function (key, values) {
return Array.sum(values);
}
var outputControl = {
out: {inline: 1},
//! Filters
query: {order_type: product}
}
logCollection.mapReduce(mapFunction, reduceFunction, outputControl, function (err, result) {
if (err) {
callback(err);
return;
}
var resultSet = [];
//! post-processing
for (var i = 0; i < result.length; i++) {
//.....
}
callback(err, resultSet);
});
});
}
Second-query:
function getAnotherData() {
Log.native(function(err, logCollection) {
var mapFunction = function() {
//! Generate Grouping Key
emit(dateFormatter(this.product), this.bad_qty)
}
var reduceFunction = function (key, values) {
return Array.sum(values);
}
var outputControl = {
out: {inline: 1},
//! Filters
query: {order_type: product}
}
logCollection.mapReduce(mapFunction, reduceFunction, outputControl, function (err, result) {
if (err) {
callback(err);
return;
}
var resultSet = [];
//! post-processing
for (var i = 0; i < result.length; i++) {
//......
}
callback(err, resultSet);
});
});
}
As you can see, these two snippet shares lots of common code, only has difference in three place (Generate grouping key, filters, post-process).
So I would really like to extract the common part to make my code cleaner, but have no success.
I first try to make dateFromatter is provided by a callback instead of hard-coding like the following:
function dateFormatter(data) {
return data.emb_date.getFullYear() + "-" + (data.emb_date.getMonth() + 1)
}
function getSomeData(groupingKey) {
// First-query
Log.native(function(err, logCollection) {
var mapFunction = function() {
//! Generate Grouping Key
emit(groupingKey(this.emb_date), this.bad_qty)
}
var reduceFunction = function (key, values) {
return Array.sum(values);
}
var outputControl = {
out: {inline: 1},
//! Filters
query: {order_type: product}
}
logCollection.mapReduce(mapFunction, reduceFunction, outputControl, function (err, result) {
if (err) {
callback(err);
return;
}
var resultSet = [];
//! post-processing
for (var i = 0; i < result.length; i++) {
//.....
}
callback(err, resultSet);
});
});
}
But without any luck, I keep getting error like the following one:
MongoError: exception: ReferenceError: groupingKey is not defined near 'emit(groupingKey(this), this.bad_qty' (line 3)
at Object.toError (/home/brianhsu/zh800/dashboard/node_modules/sails-mongo/node_modules/mongodb/lib/mongodb/utils.js:114:11)
What should I do if I would like to reduce those duplicate part of code?
Finally I found that I need pass the option called 'scope' to mongodb, I come up with the following solution which works quite well.
exports.defineOn = function(options) {
var model = options.model
var groupingFunction = options.groupingFunction
var mongoFilters = options.mongoFilters
var customFilter = options.customFilter
var converter = options.converter
var sorting = options.sorting
return function(callback) {
model.native(function(err, collection) {
var mapFunction = function() { emit(groupingFunction(this), this.bad_qty) }
var reduceFunction = function(key, values) { return Array.sum(values); }
var mapReduceOptions = {
out: {inline: 1},
query: mongoFilters,
scope: {
groupingFunction: groupingFunction,
mongoFilters: mongoFilters,
customFilter: customFilter,
converter: converter
}
}
var processCallback = function (err, result) {
if (err) {
callback(err);
return;
}
if (sorting) {
result.sort(sorting);
}
var resultSet = [];
for (var i = 0; i < result.length; i++) {
if (customFilter && customFilter(result[i])) {
resultSet.push(converter(result[i]));
} else if (!customFilter) {
resultSet.push(converter(result[i]));
}
}
callback(err, resultSet);
}
collection.mapReduce(mapFunction, reduceFunction, mapReduceOptions, processCallback);
});
}
}
Usage:
function machineDetail (year, month, date, machine, callback) {
var startDate = new Date(+year, +(month-1), +date);
var endDate = new Date(+year, +(month-1), (+date) + 1);
var mapReducer = MapReducer.defineOn({
model: Log,
groupingFunction: function(data) {
return {date: data.emb_date, error: data.defact_id};
},
mongoFilters: {
mach_id: machine,
emb_date: {$gte: startDate, $lt: endDate}
},
converter: function (data) {
return {
name: data._id,
value: data.value,
};
}
});
mapReducer(callback);
}

Categories