I'm having trouble creating a line-by-line Node.js method of processing large Nessus XML files without high RAM usage. In its current form, it saves data to MongoDB correctly; however, RAM usage keeps climbing, and it errors out with files over ~1.5 GB.
I've tried using .pause() on the readStream, but I must have implemented it incorrectly, because it never seemed to actually pause the stream.
Here is the code:
// LR.JS Imports
var fs = require('fs');
var readline = require('readline');
var stream = require('stream');
var instream = fs.createReadStream('test.nessus');
var outstream = new stream;
var rl = readline.createInterface(instream, outstream);
var buff = false;
var parseString = require('xml2js').parseString;
var buffStream = '';
//Mongoose Imports
var mongoose = require('mongoose');
var ReportHostDoc = require('./schemas/report-host.model.js');
var ReportItemDoc = require('./schemas/report-item.model.js');
var PluginDetailDoc = require('./schemas/plugin-detail.model.js');
mongoose.Promise = require('bluebird');
// Mongoose Connect
mongoose.connect('mongodb://localhost/test');
var db = mongoose.connection;
db.on('error', console.error.bind(console, 'connection error:'));
db.once('open', () => {
// Create counters for _taskCheck
var reportHostCounter = 0;
var reportHostSaveCounter = 0;
rl.on('line', (line) => {
// process line here
if (/[<]ReportHost/.test(line)) {
buff = true;
reportHostCounter++;
}
if (buff == true) {
buffStream += line + '\n';
}
if (/[<][/]ReportHost/i.test(line)) {
buff = false; // changed to = not == 9/6
// XML2JS Parse ReportHost Buffstream
parseString(buffStream, (err, result) => {
// Loop through ReportHost properties to reliably find IP
var reportHostIP = '';
var reportHostOS = '';
result.ReportHost.HostProperties[0].tag.forEach((entry) => {
if (entry.$.name === 'host-ip') {
reportHostIP = entry._;
}
if (entry.$.name === 'operating-system') {
reportHostOS = entry._;
}
});
// Save Report Host Document
var host = new ReportHostDoc({
hostname: result.ReportHost.$.name,
ip: reportHostIP,
os: reportHostOS,
high: 0,
critical: 0
});
// Process Each Report Item
result.ReportHost.ReportItem.forEach((entry) => {
var cvssScore = '';
if (entry.cvss_base_score) {
cvssScore = JSON.stringify(entry.cvss_base_score).slice(2, 5)
} else {
cvssScore = 0;
}
var item = new ReportItemDoc({
itemName: entry.$.pluginName,
pluginID: entry.$.pluginID,
ipAddress: reportHostIP,
exploitAvailable: entry.exploit_available,
cvssBaseScore: cvssScore,
pluginPublishedDate: entry.plugin_publication_date,
pluginModifiedDate: entry.plugin_modification_date,
description: entry.description
})
if (item.cvssBaseScore >= 7 && item.cvssBaseScore < 10) {
host.high++;
}
if (item.cvssBaseScore == 10) {
host.critical++;
}
item.save((err, item) => {
if (err) return console.log(err);
})
});
host.save((err, host) => {
if (err) return console.log(err);
reportHostSaveCounter++;
});
})
buffStream = ''; // Empty buffer for next report host
}
});
rl.on('close', () => { // Read Stream Finished
console.log('Log Parse finished!');
var _taskCheck = setInterval(() => { // Async loop waits for all tasks to finish
if (reportHostCounter == reportHostSaveCounter) {
clearInterval(_taskCheck);
var pluginCounter = 0;
var pluginSaveCounter = 0;
ReportItemDoc.distinct('pluginID', (err, ids) => {
ids.forEach((id) => {
pluginCounter++;
ReportItemDoc.findOne({
'pluginID': id
}, (err, plugin) => {
ReportItemDoc.count({
'pluginID': id
}, (err, count) => {
var pluginSeverity = '';
var cvss = plugin.cvssBaseScore;
if (cvss >= 7 && cvss < 10) {
pluginSeverity = 'High';
}
if (cvss == 10) {
pluginSeverity = 'Critical';
}
var item = new PluginDetailDoc({
pluginName: plugin.itemName,
pluginID: id,
severity: pluginSeverity,
quantity: count,
description: plugin.description
})
item.save((err, host) => {
if (err) return console.log(err);
pluginSaveCounter++;
});
})
});
})
})
var _pluginTaskCheck = setInterval(() => { // Async loop waits for all tasks to finish
if (pluginCounter == pluginSaveCounter) {
clearInterval(_pluginTaskCheck);
mongoose.connection.close();
}
}, 100);
}
}, 100);
});
});
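For reference, here is a minimal sketch (not the original code) of how .pause()/.resume() is typically wired into a readline loop so the file stream stops while each ReportHost block is parsed and saved. saveReportHost() is a hypothetical placeholder for the parseString + Mongoose work above; note that readline may still flush a few already-buffered lines after pause() is called:
var fs = require('fs');
var readline = require('readline');
var rl = readline.createInterface({ input: fs.createReadStream('test.nessus') });
var buff = false;
var buffStream = '';
rl.on('line', function (line) {
  if (/<ReportHost/.test(line)) buff = true;
  if (buff) buffStream += line + '\n';
  if (/<\/ReportHost/i.test(line)) {
    buff = false;
    var chunk = buffStream;
    buffStream = ''; // reset the buffer before the async work starts
    rl.pause(); // stop emitting 'line' events until the save completes
    saveReportHost(chunk, function (err) { // hypothetical async wrapper around parseString + the Mongoose saves
      if (err) console.error(err);
      rl.resume(); // resume reading once MongoDB has caught up
    });
  }
});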
My main file is a "file watcher" that watches a folder using the "node-watch" module. For each file placed in the folder, it has to make a couple of MSSQL calls and then process the file accordingly.
The file watcher script looks like this:
var watch = require('node-watch');
const timestamp = require('./SHR_modules/timestamp');
const sql = require('./SHR_modules/sql');
const configuration = require('./SHR_modules/config');
const logger = require('./SHR_modules/logger');
watch('./Incoming', { recursive: true }, async function (evt, name) {
if (evt == 'update') {
logger.addLog(configuration.logFile.watcher, 'File found: ' + name);
var startTime = await timestamp.dateTimeDelimited(); //capture time at processing start
filePath = await name.replace(name.split('\\')[name.split('\\').length-1], '');
fileName = await name.replace(filePath, '');
var myLoad = await sql.readLoadSpec(fileName, filePath).catch(err => {
logger.addLog(configuration.logFile.error, 'Error while reading Load Specs for file: ' + name);
logger.addLog(configuration.logFile.error, err);
});
if (typeof myLoad !== 'undefined' && myLoad){
console.log(evt);
logger.addLog(configuration.logFile.watcher, 'Finished processing: ' + name);
};
var finishTime = await timestamp.dateTimeDelimited(); //capture time at processing finish
}
else {
logger.addLog(configuration.logFile.watcher, 'File removed: ' + name);
}
console.log(startTime);
console.log(finishTime);
});
and the SQL function:
var readLoadSpec = (fileName, filePath) => {
return new Promise(async (resolve, reject) => {
specIDs = await findSpecs(fileName, filePath)
.catch(err => {
reject(err);
console.log(fileName + ' not found')
});
if (typeof specIDs !== 'undefined' && specIDs) {
console.log('FOUND!!!!');
var addLoadSpec = [];
for (let i = 0; i < specIDs.length; i++) {
console.log(specIDs);
var tempQuery1 = `select * from LoadSpec where LoadID = ${specIDs[i]}`;
var tempQuery2 = `select * from Mappings where LoadID = ${specIDs[i]} ORDER BY OutputColumn`;
console.log(specIDs); // <========= code works up to this line
const results = await queryDB(tempQuery1);
const resultsMapping = await queryDB(tempQuery2);
console.log(fileName);
console.log(results.recordset[0].OutputFileName);
console.log(specIDs);
const inputFrmt = {
name: results.recordset[0].InputFormatName,
format: results.recordset[0].InputFormat,
delimiter: results.recordset[0].InputDelimiter,
headerRows: results.recordset[0].InputHeaderRows
};
.
.
.
console.log(results.recordset[0].OutputColumnCount);
addLoadSpec.push(loadSpec);
};
resolve(addLoadSpec);
};
});
};
So the code runs to the row I've marked (before the await for the query results) and then loops back to the beginning.
So all is fine if I drop a single file into the folder, but when multiple files arrive at more or less the same time, the specIDs var takes the value of another file before the code resumes after the awaited query completes. specIDs is just an array, so let's say for file 'a.csv' it's [1,2], for file 'b.csv' it's [3,4] and for file 'c.csv' it's undefined. So before it gets the answer from the DB, this loop:
specIDs = await findSpecs(fileName, filePath)
.catch(err => {
reject(err);
console.log(fileName + ' not found')
});
if (typeof specIDs !== 'undefined' && specIDs) {
console.log('FOUND!!!!');
var addLoadSpec = [];
for (let i = 0; i < specIDs.length; i++) {
Gets an error for index of undefined.
How do I make it process files one by one?
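Two things stand out in the code above. First, filePath, fileName and specIDs are all assigned without var/let/const, so they become implicit globals shared by every concurrent watch event; declaring them locally removes part of the cross-talk. Second, nothing stops two events from running at once. A minimal sketch of one common way to serialize them, chaining every event onto a single promise (processFile() is a hypothetical wrapper around the existing handler body):
let queue = Promise.resolve();
watch('./Incoming', { recursive: true }, function (evt, name) {
  queue = queue
    .then(() => processFile(evt, name)) // hypothetical: the async body of the original handler
    .catch(err => logger.addLog(configuration.logFile.error, err)); // keep the chain alive on errors
});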
I have a problem when using await in a for loop. Every time it hits the await function it executes it fine, but it stops looping through the rest of the array. I'm using Node.js with axios to send HTTP requests to a REST API. The function in the API is big, so I thought maybe that was the problem, but it wasn't. The API also uses its own await functions, but that wasn't the problem either (as far as I know).
Here is my code for the request
var files = await globPromise("assets/series/**/*.json");
var json = null;
var file = null;
var series = [];
for (i = 0; i < files.length; i++) {
file = files[i];
var raw = fs.readFileSync(file);
json = JSON.parse(raw);
if (json.type === "series") {
try {
var userId = null;
if (req.user == null) {
userId = req.query.userid;
} else {
userId = req.user.id;
}
const response = await axios.get("http://" + host + "/series/info/" + json.id + "?userid=" + userId);
series.push(JSON.parse(response.data));
} catch (error) {
console.log(error);
series.push(json);
}
}
}
res.json(JSON.stringify(series));
});
And also the api side:
app.get('/series/info/:id', async(req, res) => {
var files = await globPromise("assets/series/**/*.json");
var json = null;
var file = null;
for (i = 0; i < files.length; i++) {
file = files[i];
var raw = fs.readFileSync(file);
json = JSON.parse(raw);
if (json.type === "series" && json.id === req.params.id) {
break;
}
json = null;
}
let path = pathFs.dirname(file) + "/";
try {
var seriesFiles = await fsPromise.readdir(path);
var latestWatchedVideo = null;
var latestWatchedTime = null;
var latestWatchTime = null;
var latestWatchDuration = null;
var seasonCount = 0;
var seasons = [];
for (i = 0; i < seriesFiles.length; i++) {
seriesFile = seriesFiles[i];
if (fs.lstatSync(path + "/" + seriesFile).isDirectory()) {
if (!seriesFile.startsWith("s")) {
continue;
}
seasonCount++;
try {
var videoFiles = await fsPromise.readdir(path + "/" + seriesFile + "/");
var videos = [];
for (let i = 0; i < videoFiles.length; i++) {
const video = videoFiles[i];
if (video.endsWith(".json")) {
var rawVideo = fs.readFileSync(path + "/" + seriesFile + "/" + video);
videoJson = JSON.parse(rawVideo);
const query = util.promisify(con.query).bind(con);
var userId = null;
if (req.user == null) {
userId = req.query.userid;
} else {
userId = req.user.id;
}
var results = await query(`SELECT * FROM watched WHERE video_id = "${videoJson.id}" AND user_id = "${userId}"`);
if (results.length > 0) {
var updated = JSON.parse(JSON.stringify(results[0].updated));
var duration = JSON.parse(JSON.stringify(results[0].duration));
var time = JSON.parse(JSON.stringify(results[0].time));
if (latestWatchedVideo == null) {
latestWatchedVideo = videoJson.id;
latestWatchedTime = updated;
latestWatchTime = time;
latestWatchDuration = duration;
} else {
if (latestWatchedTime < updated) {
latestWatchedVideo = videoJson.id;
latestWatchedTime = updated;
latestWatchTime = time;
latestWatchDuration = duration;
}
}
}
videos.push(videoJson);
}
}
function compare(a, b) {
if (a.episode < b.episode) {
return -1;
}
if (a.episode > b.episode) {
return 1;
}
return 0;
}
videos.sort(compare);
seasons.push({
season: seasonCount,
title: seriesFile.replace("s" + seasonCount, ""),
videos: videos
});
} catch (error) {
console.log(error);
}
}
}
json.seasonCount = seasonCount;
json.seasons = seasons;
json.latestWatchDuration = latestWatchDuration;
json.latestWatchTime = latestWatchTime;
json.latestWatchedVideo = latestWatchedVideo;
json.latestWatchedTime = latestWatchedTime;
res.json(JSON.stringify(json));
} catch (error) {
console.log(error);
}
});
Is there something (important) about await and async that I've missed?
Edit: my problem is that it loops fine through the first item and the await works fine too, but then it stops the loop and executes the next lines of code as if there were no other items in my array.
Solved: I tried using for...of and it works now. I don't know what is so different between the default for and this one, but it works!
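A likely explanation, given the code shown: both the client loop and the API's own loops use for (i = 0; ...) without declaring i, so in sloppy mode they all share a single global i, and the axios call runs the API route (apparently in the same process) while the outer loop is suspended at the await. By the time the request returns, the route's loops have pushed the shared i past files.length, so the outer loop exits after its first item. A minimal, hypothetical demonstration of the effect:
async function inner() {
  for (i = 0; i < 10; i++) { /* work */ } // undeclared i -> global (sloppy mode)
}
async function outer() {
  for (i = 0; i < 3; i++) {   // same global i
    await inner();            // inner() leaves i at 10
    console.log('outer iteration', i);
  }
}
outer(); // logs a single iteration: the shared counter was clobbered
for...of (or for (let i = 0; ...)) gives each loop its own binding, which is why switching fixed it.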
Imagine, for example, that you want to store paginated data from an API in a database.
let db;
let pageitems = 35
var offset = 0;
dbConnect //establish connection to database
.then( () => fetch(apiLink + '?offset=' + offset))
.then( res => res.json())
.then( res => {
var total = res.count
return collection.insertMany(res.data, {ordered: false})
// If offset is less than total, I want to increase offset and go back to the fetch-event.
.catch( err => {
if(err.code !== 11000){log(err)}
else{log({completed: err.result.nInserted, duplicates:
err.result.result.writeErrors.length});}
})
.then(() => {
connection.close();
})
You could just use a regular loop:
(async function() {
const conn = await dbConnect;
for(let offset = 0; true; offset++) {
const { data, count } = await (await fetch(`api?page=${offset}`)).json();
// Exit if the page is empty
if(count === 0) break;
await collection.insertMany(data, { ordered: false });
}
})();
To speed that up you could execute multiple requests in parallel:
const chunkSize = 10; // 10 in parallel
for(let offset = 0; offset < chunkSize; offset++) {
(async function() {
const conn = await dbConnect;
for(let offset2 = 0; true; offset2 += chunkSize) {
const { data, count } = await (await fetch(`api?page=${offset + offset2}`)).json();
// Exit if the page is empty
if(count === 0) break;
await collection.insertMany(data, { ordered: false });
}
})();
}
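Each of the chunkSize workers walks its own stride of pages (worker k fetches pages k, k + 10, k + 20, ...) and stops independently when it hits an empty page, so between them they cover every page without any coordination.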
Basically, you will want to wrap your fetch-and-insert into a function that you call many times. See the example below to illustrate the point...
let pageitems = 35;
var offset = 0;
var db = dbConnect(); // establish connection to database
function fetch_and_insert(offset) {
    return db
        .then(() => fetch(apiLink + "?offset=" + offset))
        .then(res => res.json())
        .then(res => {
            var total = res.count;
            return collection.insertMany(res.data, { ordered: false })
                .catch(err => {
                    if (err.code !== 11000) { log(err) }
                    else {
                        log({
                            completed: err.result.nInserted, duplicates: err.result.result.writeErrors.length
                        });
                    }
                })
                .then(() => {
                    // keep paging until we've passed the API's total count
                    if (offset < total) return fetch_and_insert(offset + pageitems);
                    return null;
                });
        });
}
fetch_and_insert(offset)
.then(() => {
connection.close();
})
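Note that the promise is returned at every level (return db..., return collection.insertMany(...), return fetch_and_insert(...)), which is what lets the final .then(() => connection.close()) wait until the last page has actually been inserted.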
I'm quite new to the topic and I'm still having some issues with my mailparser. Searching for and finding emails in the email header (mail.from) works, but it doesn't work in the email body. Does anybody have some experience with that and is willing to help? You can find the function I'm talking about under the "// Check for other addresses in Mail-Body (Doesn't work yet)" comment. I think that my regex is correct. Also, if the matchAll function gives back an array and it can't be saved in the subscriber.email object, it should at least be logged to the console. I also checked manually in the inbox whether there are mails with email addresses in the mail body; there are at least two, which should be found.
The part of App.js that does the mail parsing:
const simpleParser = require('mailparser').simpleParser;
//const htmlparser = require("htmlparser2");
var fs = require('fs');
var config = require('./config');
var Imap = require('imap');
var imap = new Imap(config.imap);
var htmlToText = require('html-to-text'); // used below for htmlToText.fromString (missing from the original snippet)
var Subscriber = require('./subscriber'); // the Mongoose model defined in subscriber.js (missing from the original snippet)
var blacklistString = '';
// Simple matchAll shim: collects every match (with index and input) by abusing String#replace
String.prototype.matchAll = function(regexp) {
var matches = [];
this.replace(regexp, function() {
var arr = ([]).slice.call(arguments, 0);
var extras = arr.splice(-2);
arr.index = extras[0];
arr.input = extras[1];
matches.push(arr);
});
return matches.length ? matches : null;
};
function openInbox(subbox,cb) {
imap.openBox('INBOX.'+subbox, true, cb);
}
function getBoxes(cb) {
imap.getBoxes(cb);
}
function showBoxes(boxes) {
imap.end();
}
function logArrayElements(element) {
if(element[1].indexOf('placeholder.de')==-1){
addToBlacklistString(element[1]);
}
}
function addToBlacklistString(str) {
blacklistString += str+"\n";
}
function writeBlacklistFile() {
fs.appendFile('data/data.csv', blacklistString, function (err) {
if (err) throw err;
console.log('Saved!');
});
}
function search(searchArray, regex){
imap.search(searchArray, function(err, results) {
if (err) throw err;
var temp = 0;
var mailtemp = [];
var f = imap.fetch(results, { bodies: '' });
f.on('message', function(msg, seqno) {
console.log('Message #%d', seqno);
var prefix = '(#' + seqno + ') ';
msg.on('body', function(stream, info) {
simpleParser(stream, (err, mail)=>{
//console.log(temp);
//console.log(mail.subject);
/*fs.writeFile('data/'+seqno+'.txt',mail.text, function(err){
console.log(err);
});*/
//var text = mail.text;
// New Subscriber Object
var subscr = new Subscriber({nr: '', mailIdent: '', from: '', emails: '', text:'', uLink: '', anwalt: false });
subscr.nr = seqno;
//Check for From-Address
if(!!mail.from) {
//console.log(mail.from.value);
for(var i = 0; i < mail.from.value.length; i++) {
mailtemp = mail.from.value[i].address.matchAll(regex);
mailtemp.forEach(function(element){
/*fs.appendFile('data/data.csv', element[0] + "\n", function(error){
console.log(error);
});*/
subscr.from = element[0];
});
if(!!mailtemp) {
mailtemp.forEach(logArrayElements);
}
}
}else{
//console.log(mail.text);
}
// Message-ID
if(!!mail.messageId) {
subscr.mailIdent = mail.messageId;
}
console.log(mail.messageId);
// Check for other addresses in Mail-Body (Doesn't work yet)
var regexEmails = new RegExp('/([\w\.\-\_\@\+]+@[\w\.\-\_äüö]+\.[a-zA-Z]+)/g');
if(!!mail.text){
if(mail.text.matchAll(regexEmails)!=null) {
subscr.emails = mail.text.matchAll(regexEmails);
console.log(subscr.emails);
}
}
/* Split mail.text at the substrings in the substr array. Extend if necessary.
*
* Also check for the 'Anwalt' expression in the split substring.
*
* If mail.text doesn't exist -> check for an html body and convert it to text format.
*/
//var regexLink = new RegExp('\.de\/(unsubscribe|austragen)\/([^\"]+)');
var regexAnwalt = new RegExp('nwalt|echtsanwalt|rechtlicher');
if(!!mail.text) {
var substr = ["schrieb pplaceholder.de", "Von: \"placeholder.de", "Von: pplaceholder.de", "From: placeholder.de", "Ursprüngliche Nachricht"];
for (var i = 0; i<substr.length; i++) {
if(mail.text.indexOf(substr[i]) > -1) {
var textTemp = mail.text;
var arr = textTemp.split(substr[i]);
if(arr[0].matchAll(regexAnwalt)!=null) {
subscr.anwalt = true;
};
subscr.text = arr[0];
break;
} else {
subscr.text = mail.text;
}
}
//console.log(arr);
}
else
{
var html = mail.html;
var text = htmlToText.fromString(html, {
noLinkBrackets: true,
ignoreImage: true,
uppercaseHeadings: false,
preserveNewlines: false,
wordwrap:130,
format: {
heading: function (node, fn, options) {
var h = fn(node.children, options);
return '\n==== ' + h + ' ====\n\n';
}
}
});
subscr.text = text;
}
mail.headers.forEach(function(value, key) {
//console.log(value);
});
subscr.save();
//console.log(subscr);
temp++;
});
});
msg.once('end', function() {
console.log(prefix + 'Finished');
});
});
f.once('error', function(err) {
console.log('Fetch error: ' + err);
});
f.once('end', function() {
console.log('Done fetching all messages!');
//writeBlacklistFile();
imap.end();
});
});
}
imap.once('ready', function() {
openInbox('Test',function(err, box) {
var searchArray = [['FROM', '@']];
search(searchArray,/([\w\.\-\_\@\+]+@[\w\.\-\_äüö]+\.[a-zA-Z]+)/g);
});
});
imap.once('error', function(err) {
console.log(err);
});
imap.once('end', function() {
console.log('Connection ended');
});
imap.connect();
app.listen(2700, function(){
console.log("Listening on Port 2700")
});
module.exports = app;
subscriber.js
const mongoose = require('mongoose');
var subscriberSchema = mongoose.Schema({
nr: Number,
mailIdent: String,
from: String,
emails: String,
text: String,
uLink: String,
anwalt: Boolean
});
var Subscriber = module.exports = mongoose.model('Subscriber', subscriberSchema);
//get Subscriber
module.exports.getSubscribers = function(callback, limit){
Subscriber.find(callback).limit(limit);
};
module.exports.getSubscriberByID = function(_id, callback){
Subscriber.findById(_id, callback);
};
The regex for the emails was a little bit wrong.
Also, I hadn't noticed that the matchAll function gives back a two-dimensional array. Here is the changed part of the code:
var regexEmails = new RegExp("([\\w\\.\\-\\_\\#\\+]+#[\\w\\.\\-\\_äüö]+\\.[a-zA-Z]+)");
var temp1 = mail.text.matchAll(regexEmails);
if(!!temp1){
//console.log(temp1);
for(var i = 0; i < temp1.length; i++) {
if(temp1[i][0] !== 'info@service.placeholder.de' && temp1[i][0] !== "info@placeholder.de"){
subscr.emails += temp1[i][0]; // temp1[i][0] is the full match of the i-th result
}
}
}
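As an aside, here is a minimal sketch assuming a newer Node (12+), where String.prototype.matchAll is built in and returns an iterator, so the custom shim above isn't needed; the built-in version requires the /g flag:
const regexEmails = /([\w.\-_äüö+]+@[\w.\-_äüö]+\.[a-zA-Z]+)/g;
const blacklist = new Set(['info@service.placeholder.de', 'info@placeholder.de']);
const found = [];
for (const match of mail.text.matchAll(regexEmails)) {
  if (!blacklist.has(match[0])) found.push(match[0]); // match[0] is the full matched address
}
subscr.emails = found.join(', ');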
I have the following code:
var Promise = require('bluebird');
Promise.longStackTraces();
var path = require('path');
var fs = Promise.promisifyAll(require('fs-extra'));
var clone = require('nodegit').Clone.clone;
var tar = require('tar-fs');
var zlib = require('zlib');
var gzip = zlib.createGzip();
var globAsync = Promise.promisify(require('glob'));
module.exports = Archive;
function Archive(pkg) {
var self = this;
var tmp_dir_name = '.tmp';
var code_dir_name = 'code';
var files_dir_name = 'files';
var output_dir_name = 'archives';
var coverall_docs_dir_name = 'coverall_documents';
// the archive's name (no extension):
self.name = pkg.name;
self.recipient_name = pkg.recipient_name;
// path to letter.tex:
self.tex_letter_path = path.resolve(pkg.files.letter);
// path to resume.tex:
self.tex_resume_path = path.resolve(pkg.files.resume);
// path to merged.pdf (letter.pdf + resume.pdf):
self.pdf_package_path = path.resolve(pkg.compiled_files.package);
// temp dir where the archive is assembled:
self.tmp_path = path.resolve(tmp_dir_name, pkg.name);
// path to final archive:
self.output_path = path.resolve(output_dir_name, self.name + '.tar.gz');
// where to copy files to be added to the archive:
self.files_path = path.resolve(tmp_dir_name, self.name, files_dir_name);
// where the tex files are within the archive:
self.coverall_docs_path = path.resolve(self.files_path, code_dir_name, coverall_docs_dir_name);
}
Archive.prototype.make = Promise.method(function() {
var self = this;
return self._prepareFilesDir()
.then(self._copyFiles.bind(self))
.then(self._writeArchive.bind(self))
.then(self._delTmpDir.bind(self));
});
// ********************************
// * Private functions
// ********************************
Archive.prototype._prepareFilesDir = function() {
var self = this;
return fs.emptyDirAsync(self.tmp_path);
};
Archive.prototype._copyFiles = function() {
var self = this;
var sources = {
tex_letter_path: path.resolve(self.tex_letter_path, '..'),
tex_resume_path: path.resolve(self.tex_resume_path, '..'),
tex_letter_shared_path: path.resolve(self.tex_letter_path, '../../shared'),
pdf_package_path: self.pdf_package_path
};
var destinations = {
letter_path: path.resolve(self.coverall_docs_path, 'coverletters', self.recipient_name.toLowerCase()),
resume_path: path.resolve(self.coverall_docs_path, 'resume'),
letter_shared_path: path.resolve(self.coverall_docs_path, 'coverletters/shared'),
pdf_package_path: path.resolve(self.files_path, 'pdf', self.recipient_name.toLowerCase() + '.pdf'),
coverall_repo_path: path.resolve(self.files_path, 'code/coverall')
};
var filters = {
tex: function(filename) {
var contains_dot = /\./gm;
var hidden = /\/\./gm;
var cls_or_tex_file = /\.(cls|tex)$/gm;
var is_a_dir = !contains_dot.test(filename);
var is_not_hidden = (contains_dot.test(filename) && !hidden.test(filename));
var is_cls_or_tex = cls_or_tex_file.test(filename);
// it doesn't contain a dot or it isn't a hidden file or it is a cls/tex file
var is_allowed = is_a_dir || is_not_hidden || is_cls_or_tex;
return is_allowed;
},
pdf: /[^\.].*\.pdf/
};
var copyLetter = function() {
return fs.copyAsync(sources.tex_letter_path, destinations.letter_path, { filter: filters.tex });
};
function copyShared() {
return fs.copyAsync(sources.tex_letter_shared_path, destinations.letter_shared_path, { filter: filters.tex });
}
function copyResume() {
return fs.copyAsync(sources.tex_resume_path, destinations.resume_path, { filter: filters.tex });
}
function copyPdf() {
return fs.copyAsync(sources.pdf_package_path, destinations.pdf_package_path, { filter: filters.pdf });
}
function copyJs() {
return clone('https://github.com/coaxial/coverall.git', destinations.coverall_repo_path);
}
return Promise.all([
copyLetter(),
copyShared(),
copyResume(),
copyPdf(),
copyJs()
]);
};
Archive.prototype._writeArchive = function() {
var self = this;
var archive_dir_path = path.resolve(self.output_path, '..');
var tarPromise = function() {
return new Promise(function(resolve, reject) {
tar.pack(self.files_path)
.pipe(gzip)
.pipe(fs.createWriteStream(self.output_path))
.on('error', reject)
.on('finish', resolve);
});
};
return fs.ensureDirAsync(archive_dir_path)
.then(tarPromise);
};
Archive.prototype._delTmpDir = function() {
var self = this;
return fs.removeAsync(self.tmp_path);
};
and I am testing it with:
/*eslint-env mocha */
var chai = require('chai');
var chaiAsPromised = require("chai-as-promised");
var expect = chai.expect;
var Promise = require('bluebird');
Promise.longStackTraces();
var Archive = require('../lib/archive');
var path = require('path');
var fs = Promise.promisifyAll(require('fs-extra'));
var globAsync = Promise.promisify(require('glob'));
var tar = require('tar-fs');
var zlib = Promise.promisifyAll(require('zlib'));
var _ = require('lodash');
chai.use(chaiAsPromised);
describe.only('Archive', function() {
var pkg;
beforeEach(function() {
pkg = {
name: 'test_0790feebb1',
recipient_name: 'Test',
files: {
letter: '../coverall_documents/coverletters/test/letter.tex',
resume: '../coverall_documents/resume/resume.tex'
},
compiled_files: {
package: '../coverall_documents/coverletters/test/test.pdf'
}
};
});
// after(function() {
// return Promise.all([
// 'archives/test*',
// 'test/.tmp'
// ].map(function(glob_pattern) {
// return globAsync(glob_pattern)
// .each(function(filename) {
// // make every file writeable so the git packfiles can be removed
// return fs.chmodAsync(filename, '755')
// .then(function() { fs.removeAsync(filename); });
// })
// }));
// });
describe('#make', function() {
it('creates an archive', function() {
var modified_pkg = _.cloneDeep(pkg);
modified_pkg.name = 'test_0000000001';
var archive_location = path.resolve('archives', modified_pkg.name + '.tar.gz');
var test_archive = new Archive(modified_pkg);
return test_archive.make()
.then(function() { return fs.statAsync(archive_location); })
.then(function(file) { return expect(file).to.exist; })
.catch(function(e) { return expect(e).to.not.exist; });
});
it('creates a gzip compressed archive', function() {
var modified_pkg = _.cloneDeep(pkg);
modified_pkg.name = 'test_0000000002';
var archive_location = path.resolve('archives', modified_pkg.name + '.tar.gz');
var test_archive = new Archive(modified_pkg);
// inspired from https://github.com/mafintosh/gunzip-maybe/blob/master/index.js#L6-L11
var isGzipped = function(data) {
var GZIP_MAGIC_BYTES = [0x1f, 0x8b];
var DEFLATE_COMPRESSION_METHOD = 0x08;
var buffer = data[1];
if (buffer[0] !== GZIP_MAGIC_BYTES[0] && buffer[1] !== GZIP_MAGIC_BYTES[1]) return false;
if (buffer[2] !== DEFLATE_COMPRESSION_METHOD) return false;
return true;
};
return test_archive.make()
.then(function() { return fs.openAsync(archive_location, 'r'); })
.then(function(fd) {
var buffer = new Buffer(10);
var buffer_offset = 0;
var buffer_length = 10;
var file_position = 0;
return fs.readAsync(fd, buffer, buffer_offset, buffer_length, file_position);
})
.then(function(data) { console.log('data', data); return data; })
.then(function(data) { return expect(isGzipped(data)).to.be.true; })
});
it('has the correct directory structure', function() {
var modified_pkg = _.cloneDeep(pkg);
modified_pkg.name = 'test_0000000003';
var archive_location = path.resolve('archives', modified_pkg.name + '.tar.gz');
var test_archive = new Archive(modified_pkg);
var tmp_extract_path = path.resolve('test/.tmp');
var tarPromise = function(archive_path) {
return new Promise(function(resolve, reject) {
fs.createReadStream(archive_path)
.pipe(zlib.Unzip())
.pipe(tar.extract(tmp_extract_path))
.on('error', reject)
.on('finish', resolve);
})
};
var verifyDir = function() {
return Promise.all([
'code',
'pdf',
'code/coverall',
'code/coverall_documents',
'code/coverall_documents/coverletters',
'code/coverall_documents/coverletters/test',
'code/coverall_documents/coverletters/shared',
'code/coverall_documents/resume',
'code/coverall_documents/coverletters'
].map(function(subpath) {
return expect(fs.statAsync(path.resolve(tmp_extract_path, subpath)))
.to.be.fulfilled;
}))
};
return test_archive.make()
.then(function() { return tarPromise(archive_location); })
.then(function() { return verifyDir(); });
});
it('removes the temporary dir', function() {
var modified_pkg = _.cloneDeep(pkg);
modified_pkg.name = 'test_0000000004';
var archive_location = path.resolve('archives', modified_pkg.name + '.tar.gz');
var test_archive = new Archive(modified_pkg);
var tmp_dir = path.resolve('.tmp');
return test_archive.make()
.then(function() { return expect(fs.statAsync(tmp_dir)).to.be.rejected; });
});
});
});
Which results in:
$ mocha test
Archive
#make
✓ creates an archive (644ms)
1) creates a gzip compressed archive
2) has the correct directory structure
3) removes the temporary dir
1 passing (2s)
3 failing
1) Archive #make creates a gzip compressed archive:
Uncaught Error: write after end
at writeAfterEnd (_stream_writable.js:167:12)
at Gzip.Writable.write (_stream_writable.js:214:5)
at ondata (node_modules/tar-fs/node_modules/tar-stream/node_modules/readable-stream/lib/_stream_readable.js:574:20)
at readableAddChunk (node_modules/tar-fs/node_modules/tar-stream/node_modules/readable-stream/lib/_stream_readable.js:198:16)
at Readable.push (node_modules/tar-fs/node_modules/tar-stream/node_modules/readable-stream/lib/_stream_readable.js:162:10)
at Pack._encode (node_modules/tar-fs/node_modules/tar-stream/pack.js:154:17)
at Pack.entry (node_modules/tar-fs/node_modules/tar-stream/pack.js:100:10)
at onstat (node_modules/tar-fs/index.js:108:19)
at node_modules/tar-fs/index.js:40:9
at FSReqWrap.oncomplete (fs.js:95:15)
2) Archive #make has the correct directory structure:
AssertionError: expected false to be true
at Context.<anonymous> (test/archive_spec.js:96:10)
3) Archive #make removes the temporary dir:
Uncaught Error: write after end
at writeAfterEnd (_stream_writable.js:167:12)
at Gzip.Writable.write (_stream_writable.js:214:5)
at ondata (node_modules/tar-fs/node_modules/tar-stream/node_modules/readable-stream/lib/_stream_readable.js:574:20)
at readableAddChunk (node_modules/tar-fs/node_modules/tar-stream/node_modules/readable-stream/lib/_stream_readable.js:198:16)
at Readable.push (node_modules/tar-fs/node_modules/tar-stream/node_modules/readable-stream/lib/_stream_readable.js:162:10)
at Pack._encode (node_modules/tar-fs/node_modules/tar-stream/pack.js:154:17)
at Pack.entry (node_modules/tar-fs/node_modules/tar-stream/pack.js:100:10)
at onstat (node_modules/tar-fs/index.js:108:19)
at node_modules/tar-fs/index.js:40:9
at FSReqWrap.oncomplete (fs.js:95:15)
I suspected a race condition so I commented out the after block to see if it would make any difference but it doesn't.
I do not understand what the Uncaught Error: write after end is about nor why the stacktrace is unusable, even though I am using Promise.longStackTraces(). What is causing this error?
My tests look overly complicated for what they are doing and I am repeating code several times when instantiating the different test_archive objects. How could I refactor them?
You're trying to re-use the same gzip instance for every archive, which won't work: a zlib stream can only be written to and ended once, so the second archive writes into a stream the first one already ended. This also explains why the first test works just fine.
So move your var gzip = zlib.createGzip(); line to right inside your Archive.prototype._writeArchive function, so each call creates a fresh stream.
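A sketch of what _writeArchive looks like with that change, every call getting its own Gzip stream:
Archive.prototype._writeArchive = function() {
  var self = this;
  var archive_dir_path = path.resolve(self.output_path, '..');
  var tarPromise = function() {
    return new Promise(function(resolve, reject) {
      tar.pack(self.files_path)
        .pipe(zlib.createGzip()) // fresh instance per archive instead of the shared module-level one
        .pipe(fs.createWriteStream(self.output_path))
        .on('error', reject)
        .on('finish', resolve);
    });
  };
  return fs.ensureDirAsync(archive_dir_path)
    .then(tarPromise);
};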