Node.js monitor file for changes and parse them - javascript

I need to monitor a file for changes. Because a large number of new entries are written to this file, I need to watch it continuously and retrieve only the newly inserted content so that I can parse it.
I found this code:
/**
 * Listener handed to fs.watchFile; logs a notice each time it is invoked.
 */
function onQueryLogChanged() {
  console.log('File Changed ...');
  //how to get the new line which is now inserted?
}

fs.watchFile('var/log/query.log', onQueryLogChanged);

Here is an example of how I used fs.watchFile to monitor a log file for a game called Hearthstone and pick up new log entries to monitor game events as they happened while playing. https://github.com/chevex-archived/hearthstone-log-watcher/blob/master/index.js
var fs = require('fs');
// Settings shared by the log watcher.
var options = {
  // Absolute path of the file to watch. Node's fs functions do not expand a
  // leading "~" (that is a shell feature), so the home directory is resolved
  // explicitly instead of being written as '~/Library/...'.
  logFile: require('path').join(
    require('os').homedir(),
    'Library', 'Preferences', 'Blizzard', 'Hearthstone', 'log.config'
  ),
  // Line terminator of the platform this process is running on.
  endOfLineChar: require('os').EOL
};
// Obtain the initial size of the log file before we begin watching it, so
// that only data written after start-up is ever reported.
var fileSize = fs.statSync(options.logFile).size;

// Watch the log file and hand every newly appended byte range to parseBuffer.
// `current` and `previous` are the fs.Stats objects supplied by fs.watchFile.
fs.watchFile(options.logFile, function (current, previous) {
  // The handler can fire for changes that did not add data. If the modified
  // time did not advance, nothing changed so don't bother parsing.
  if (current.mtime <= previous.mtime) { return; }

  // We're only going to read the portion of the file that we have not read
  // so far. Obtain the new file size and the size difference.
  var newFileSize = fs.statSync(options.logFile).size;
  var sizeDiff = newFileSize - fileSize;

  // If less than zero then the log file was truncated since we last read it.
  // Start again from the beginning and read the whole current content.
  if (sizeDiff < 0) {
    fileSize = 0;
    sizeDiff = newFileSize;
  }

  // Nothing new to read: avoid reporting a phantom empty line.
  if (sizeDiff === 0) { return; }

  // Zero-filled buffer for exactly the data we intend to read.
  // (`new Buffer(size)` is deprecated and returns uninitialised memory.)
  var buffer = Buffer.alloc(sizeDiff);

  var fileDescriptor = fs.openSync(options.logFile, 'r');
  var bytesRead;
  try {
    // Synchronously read from where we stopped last time.
    bytesRead = fs.readSync(fileDescriptor, buffer, 0, sizeDiff, fileSize);
  } finally {
    // Always release the descriptor, even when the read throws.
    fs.closeSync(fileDescriptor);
  }

  // Advance by what was actually read; a short read is picked up next time.
  fileSize += bytesRead;

  // Parse only the bytes that were really filled in.
  parseBuffer(buffer.subarray(0, bytesRead));
});
/**
 * Stop watching the log file configured in options.logFile.
 */
function stop () {
  var watchedPath = options.logFile;
  fs.unwatchFile(watchedPath);
}
/**
 * Decode a buffer of freshly read log data and visit each line in it.
 * Lines are separated by options.endOfLineChar.
 *
 * @param {Buffer} buffer - Bytes read from the log file.
 */
function parseBuffer (buffer) {
  var text = buffer.toString();
  var lines = text.split(options.endOfLineChar);
  for (var index = 0; index < lines.length; index += 1) {
    var line = lines[index];
    // Do stuff with the line :)
  }
}
It first calculates the initial size of the file because in this log watcher module I only want to read new data as it's being written by the game. I don't care about existing data. It then starts watching the file for changes. When the change handler fires we check if the modified time is really newer because some other changes about the file can trigger the handler when no data we care about actually changed. We wanted this watcher to be as performant as we could.
We then read the new size of the file and calculate the difference from the last time. This tells us exactly how much data to read from the file to get only the newly written data. Then we store the data in a buffer and parse it as a string. Just split the string by newline characters. Using core module os to get os.EOL will give you the correct line ending character for the operating system you are running on (windows line ending character is different from linux/unix).
Now you have an array of lines written to the file :)

On bash you would do something like that with tail --follow.
There is also a package called tail available.
you can watch on a file, and get new lines with an event:
const { Tail } = require('tail');

// Watch the query log and print each new line delivered by the "line" event.
var tail = new Tail("var/log/query.log");
tail.watch();

function printLine(data) {
  console.log(data);
}
tail.on("line", printLine);

Related

node.js - How to adjust the the stdout buffer size (aka highWaterMark) of child_process.spawn()?

I'm attempting to adjust the buffer size (highWaterMark) for the stdout from a child_process.spawn() call, and I'm having a hard time doing this successfully. Below is some code that illustrates what I'm looking for. I'm trying to perform some benchmarking with various stdout buffer sizes, but until I can actually adjust the buffer size, I won't be able to do so. Any advice would be appreciated, thanks!
// I'm using the `stdout` from a child_process.spawn() call to feed into another stream, and I'm having a hard time increasing the buffer size on stdout.
// The code below illustrates the issue:
const child_process = require('child_process');
const stream = require('stream');
const fs = require('fs');
// Experiment: spawn `cat` on a large file, try several ways of raising the
// buffer size of the child's stdout, then log the size of every chunk received.
const compressStream = child_process.spawn("cat", ["testfile_32MB"], {writableHighWaterMark: 1024*1024, highWaterMark: 1024*1024}); // <-- child_process.spawn does not have highWaterMark or writableHighWaterMark options, so including them has no effect, but I'm including it just as an illustration of what I want.
// Each assignment below is a separate attempt to enlarge the stdout buffer to 1MB.
compressStream.stdout.writableLength = 1024*1024; // <-- This does not help.
compressStream.stdout._writableState.writableLength = 1024*1024; // <-- This does not help.
compressStream.stdout._writableState.writableHighWaterMark = 1024*1024; // <-- This does not help.
compressStream.stdout.writableHighWaterMark = 1024*1024; // <-- This does not help.
compressStream.stdout._readableState.highWaterMark = 1024*1024; // <-- This does not help.
compressStream.stdout._writableState.highWaterMark = 1024*1024; // <-- This does not help.
compressStream.stdout.on("data", (data) => {
console.log("data length [from spawn()]:", data.length); // <-- data.length will always be <= 64KB! How can we get it higher?
});
// Reference case: the same file read through fs.createReadStream with a 1MB
// highWaterMark, logging the size of each chunk for comparison.
const readStream = fs.createReadStream("testfile_32MB", {highWaterMark: 1024*1024});
readStream.on("data", (data) => {
console.log("data length [from createReadStream()]:", data.length); // <-- This prints values <= 1MB, which is what I wish I had for child_process.spawn/stdout...
});
Note, you'll need a file called "testfile_32MB" in place to run this example code, and it should be at least a few megabytes in size.
Alas, you cannot do this. highWaterMark is a mechanism that is used to signal the writer to stop because the reader is not reading and its buffer is full.
A Socket will send you the data as soon as it is available. It will always use an internal hard-coded 64k buffer. Increasing the highWaterMark will make it capable of holding more data, but it will still send you 64Kb chunks - except that you will get bursts of several calls per event loop rotation.
You don't have access to the Stream creation options when using spawn - you can eventually pass a preconstructed named pipe stream - but this won't solve your problem.

Parsing huge binary files in Node.js

I want to create a Node.js module which should be able to parse huge binary files (some larger than 200GB). Each file is divided into chunks and each chunk can be larger than 10GB. I tried using flowing and non-flowing methods to read the file, but the problem is that the end of the buffer being read is reached while parsing a chunk, so parsing of that chunk must be terminated before the next onData event occurs. This is what I've tried:
// Obtain the stream and pass every buffer it emits on 'data' to parseChunk.
var s = getStream();
s.on('data', function(a){
parseChunk(a);
});
// Parses the buffer `a` delivered by one 'data' event (body elided in the question).
function parseChunk(a){
/*
There is a lot of code and many functions here.
One chunk is larger than the buffer passed to this function,
so when the end of this buffer is reached, the parseChunk
function must be terminated before the parsing process is finished.
Also, when the next buffer is passed, it is not the start of
a new chunk because the previous chunk has not been parsed to the end.
*/
}
Loading a whole chunk into process memory isn't possible because I have only 8GB of RAM. How can I synchronously read data from the stream, or how can I pause the parseChunk function when the end of the buffer is reached and wait until new data is available?
Maybe I'm missing something, but as far as I can tell, I don't see a reason why this couldn't be implemented using streams with a different syntax. I'd use
let chunk;
let Nbytes; // # of bytes to read into a chunk
// Drain the stream in pieces of Nbytes each time it signals 'readable'.
stream.on('readable', () => {
  // The parentheses around the assignment are required: `!==` binds tighter
  // than `=`, so without them `chunk` would receive the boolean result of the
  // comparison instead of the data returned by read().
  while ((chunk = stream.read(Nbytes)) !== null) {
    // call whatever you like on the chunk of data of size Nbytes
  }
});
Note that if you specify the size of the chunk yourself, like done here, null will be returned if the amount of bytes requested are not available at the end of the stream. This doesn't mean there is no data anymore to stream. So just be aware that you should expect back a 'trimmed' buffer object of size < Nbytes at the end of the file.

Can files be updated and read using the JavaScript FileReader interface?

I am reading a local CSV file using a web UI, and the HTML5 FileReader interface to handle the local file stream. This works great.
However, sometimes I want the file being read to be updated continuously, after the initial load. I am having problems, and I think it might have something to do with the FileReader API. Specifically, after the initial file load, I maintain a reference to the file. Then, when I detect that the size of the file has increased, I slice off the new part of the file, and get a new Blob object. However, there appears to be no data in these new Blobs.
I am using PapaParse to handle the CSV parsing, though I don't think that is the source of the problem (though it may be).
The source code is too voluminous to post here, but here is some pseudocode:
// One FileReader is shared by every read; loadChunk is installed as its onload handler.
var reader = new FileReader();
reader.onload = loadChunk;
// The selected file; kept so that readUpdatedFile can slice it again later.
var file = null;
// Stores the first file from the event target and reads it chunk by chunk.
// NOTE(review): presumably wired to a file input's change event — not shown here.
function readLocalFile(event) {
file = event.target.files[0];
// code that divides file up into chunks.
// for each chunk:
readChunk(chunk);
}
// Starts reading the given chunk as text on the shared reader.
function readChunk(chunk) {
reader.readAsText(chunk);
}
// onload handler: returns the result of the completed read.
function loadChunk(event) {
return event.target.result;
}
// this is run when file size has increased
// Slices the range from oldLength to newLength out of the stored file and reads it.
function readUpdatedFile(oldLength, newLength) {
var newData = file.slice(oldLength, newLength);
readChunk(newData);
}
The output of loadChunk when the file is first loading is a string, but after the file has been updated it is a blank string. I am not sure if the problem is with my slice method, or if there is something going on with FileReader that I am not aware of.
The spec for File objects shouldn't allow this: http://www.w3.org/TR/FileAPI/#file -- it's supposed to be like a snapshot.
The fact that you can detect that the size has changed is probably a shortcoming of an implementation.

How NOT to stop reading file when meeting EOF?

I'm trying to implement a routine for Node.js that would allow one to open a file, that is being appended to by some other process at this very time, and then return chunks of data immediately as they are appended to file. It can be thought as similar to tail -f UNIX command, however acting immediately as chunks are available, instead of polling for changes over time. Alternatively, one can think of it as of working with a file as you do with socket — expecting on('data') to trigger from time to time until a file is closed explicitly.
In C land, if I were to implement this, I would just open the file, feed its file descriptor to select() (or any alternative function with similar designation), and then just read chunks as file descriptor is marked "readable". So, when there is nothing to be read, it won't be readable, and when something is appended to file, it's readable again.
I somewhat expected this kind of behavior for following code sample in Javascript:
/**
 * Open `filename` as a UTF-8 read stream and attach the error, open and data
 * handlers. The stream is created with autoClose disabled.
 *
 * @param {string} filename - Path of the file to read.
 */
function readThatFile(filename) {
  const streamOptions = {
    flags: 'r',
    encoding: 'utf8',
    autoClose: false // I thought this would prevent file closing on EOF too
  };

  fs.createReadStream(filename, streamOptions)
    .on('error', function onError(err) {
      // handle error
    })
    .on('open', function onOpen(fd) {
      // save fd, so I can close it later
    })
    .on('data', function onData(chunk) {
      // process chunk
      // fs.close() if I no longer need this file
    });
}
However, this code sample just bails out when EOF is encountered, so I can't wait for new chunk to arrive. Of course, I could reimplement this using fs.open and fs.read, but that somewhat defeats Node.js purpose. Alternatively, I could fs.watch() file for changes, but it won't work over network, and I don't like an idea of reopening file all the time instead of just keeping it open.
I've tried to do this:
// Attempt to wrap the descriptor of a regular file in a net.Socket so that it
// can be consumed like a socket. As reported in the question, this call is
// rejected with "TypeError: Unsupported fd type: FILE".
const fd = fs.openSync(filename, 'r'); // sync for readability' sake
const stream = net.Socket({ fd: fd, readable: true, writable: false });
But had no luck — net.Socket isn't happy and throws TypeError: Unsupported fd type: FILE.
So, any solutions?
UPD: this isn't possible, my answer explains why.
I haven't looked into the internals of the read streams for files, but it's possible that they don't support waiting for a file to have more data written to it. However, the fs package definitely supports this with its most basic functionality.
To explain how tailing would work, I've written a somewhat hacky tail function which will read an entire file and invoke a callback for every line (separated by \n only) and then wait for the file to have more lines written to it. Note that a more efficient way of doing this would be to have a fixed size line buffer and just shuffle bytes into it (with a special case for extremely long lines), rather than modifying JavaScript strings.
var fs = require('fs');
/**
 * Follow a file in the manner of `tail -f`: read it from the beginning, invoke
 * `callback` once per line (lines are separated by "\n" only) and then keep
 * polling for data appended later.
 *
 * @param {string} path - File to follow.
 * @param {function(?Error, ?string)} callback - Called with (null, line) for
 *   every complete line, or with (err, null) when opening, stat-ing or reading
 *   fails. After an error no further reads are scheduled.
 * @returns {{close: function(function(?Error)=)}} Handle whose close() stops
 *   the polling and releases the file descriptor.
 */
function tail(path, callback) {
  var POLL_INTERVAL_MS = 500;
  var NEWLINE = 0x0a;

  var descriptor;
  var position = 0;               // Number of bytes consumed so far.
  var chunk = Buffer.alloc(256);  // Reused read buffer (`new Buffer` is deprecated).
  var pending = Buffer.alloc(0);  // Bytes of the current, still incomplete line.
  var timer = null;               // Pending poll timer, if any.
  var closed = false;
  var closeRequest = null;        // close() callback that arrived before fs.open finished.

  // Default close callback: surface a failure instead of swallowing it.
  function rethrow(err) {
    if (err) { throw err; }
  }

  function parse(err, bytesRead) {
    // A read that completes after close() must not emit anything.
    if (closed) { return; }
    if (err) {
      callback(err, null);
      return;
    }
    // Keep track of the bytes we have consumed already.
    position += bytesRead;
    // Accumulate raw bytes and decode only complete lines. Decoding each read
    // separately would corrupt multi-byte UTF-8 characters that straddle two
    // reads; the newline byte can never occur inside such a character.
    pending = Buffer.concat([pending, chunk.subarray(0, bytesRead)]);
    var start = 0;
    var end;
    while (!closed && (end = pending.indexOf(NEWLINE, start)) !== -1) {
      // Callback with a single line at a time (newline excluded).
      callback(null, pending.toString('utf-8', start, end));
      start = end + 1;
    }
    // Only keep the unparsed bytes for the next iteration.
    pending = pending.subarray(start);
    // Keep reading in the next tick (avoids CPU hogging).
    process.nextTick(read);
  }

  function read() {
    timer = null;
    if (closed) { return; }
    var stat;
    try {
      stat = fs.fstatSync(descriptor);
    } catch (err) {
      callback(err, null);
      return;
    }
    if (stat.size <= position) {
      // We're currently at the end of the file. Check again later.
      timer = setTimeout(read, POLL_INTERVAL_MS);
      return;
    }
    fs.read(descriptor, chunk, 0, chunk.length, position, parse);
  }

  fs.open(path, 'r', function (err, fd) {
    if (closed) {
      // close() was called before the descriptor existed: finish that request.
      if (err) {
        closeRequest(null);
      } else {
        fs.close(fd, closeRequest);
      }
      return;
    }
    if (err) {
      callback(err, null);
      return;
    }
    descriptor = fd;
    read();
  });

  // Note: this `callback` is the optional close-completion callback and
  // intentionally shadows the line callback of tail().
  return {close: function close(callback) {
    var done = typeof callback === 'function' ? callback : rethrow;
    if (closed) {
      // Closing twice is harmless.
      process.nextTick(done, null);
      return;
    }
    closed = true;
    // Stop polling; otherwise the next poll would stat a closed descriptor.
    if (timer !== null) {
      clearTimeout(timer);
      timer = null;
    }
    if (descriptor === undefined) {
      // Still opening: the fs.open callback completes the close.
      closeRequest = done;
      return;
    }
    fs.close(descriptor, done);
  }};
}
// Follow the system log on a Mac and print every (error, line) pair.
function printTailResult(err, line) {
  console.log(err, line);
}
var t = tail('/var/log/system.log', printTailResult);

// Close the file handle, without ceremony, once a minute has passed.
setTimeout(t.close, 60 * 1000);
All that said, you should also try to leverage the NPM community. With some searching, I found the tail-stream package which might do what you want, with streams.
Previous answers have mentioned tail-stream's approach which uses fs.watch, fs.read and fs.stat together to create the effect of streaming the contents of the file. You can see that code in action here.
Another, perhaps hackier, approach might be to just use tail by spawning a child process with it. This of course comes with the limitation that tail must exist on the target platform, but one of node's strengths is using it to do asynchronous systems development via spawn and even on windows, you can execute node in an alternate shell like msysgit or cygwin to get access to the tail utility.
The code for this:
var spawn = require('child_process').spawn;

// Run `tail -f my.log` as a child process.
var child = spawn('tail', ['-f', 'my.log']);

// Build a 'data' listener that logs each chunk behind the given prefix.
function logWithPrefix(prefix) {
  return function (data) {
    console.log(prefix + data);
  };
}

child.stdout.on('data', logWithPrefix('tail output: '));
child.stderr.on('data', logWithPrefix('err data: '));
So, it seems people are still looking for an answer to this question for five years already, and there is yet no answer on topic.
In short: you can't. Not in Node.js particularly, you can't at all.
Long answer: there are few reasons for this.
First, POSIX standard clarifies select() behavior in this regard as follows:
File descriptors associated with regular files shall always select true for ready to read, ready to write, and error conditions.
So, select() can't help with detecting a write beyond the file end.
With poll() it's similar:
Regular files shall always poll TRUE for reading and writing.
I can't tell for sure with epoll(), since it's not standardized and you have to read quite a lengthy implementation, but I would assume it's similar.
Since libuv, which is in core of Node.js implementation, uses read(), pread() and preadv() in its uv__fs_read(), neither of which would block when invoked at the end of file, it would always return empty buffer when EOF is encountered. So, no luck here too.
So, summarizing, if such functionality is desired, something must be wrong with your design, and you should revise it.
What you're trying to do is a FIFO file (acronym for First In First Out), which as you said works like a socket.
There's a node.js module that allows you to work with fifo files.
I don't know what do you want that for, but there are better ways to work with sockets on node.js. Try socket.io instead.
You could also have a look at this previous question:
Reading a file in real-time using Node.js
Update 1
I'm not familiar with any module that would do what you want with a regular file, instead of with a socket type one. But as you said, you could use tail -f to do the trick:
// filename must exist at the time of running the script
var filename = 'somefile.txt';
var spawn = require('child_process').spawn;
var tail = spawn('tail', ['-f', filename]);

// Print every chunk written by `tail -f`, minus leading and trailing whitespace.
tail.stdout.on('data', function (chunk) {
  var text = chunk.toString().trim();
  console.log(text);
});
Then from the command line try echo someline > somefile.txt and watch the console.
You might also like to have a look at this: https://github.com/layerssss/node-tailer

Write a data URI to a file in a Firefox extension

I am developing a Firefox addon. I need to save a bunch of data URI images to the disk. How do I approach to this?
I have browsed through the file I/O snippets on MDN, but the snippets don't help me much.
There are async and sync methods. I would like to use the async method, but how can I write a binary file using the async method?
// Question code: copy image data into test.png through a safe file output
// stream using NetUtil.asyncCopy.
Components.utils.import("resource://gre/modules/NetUtil.jsm");
Components.utils.import("resource://gre/modules/FileUtils.jsm");
// file is nsIFile
var file = FileUtils.getFile("Desk", ["test.png"]);
// You can also optionally pass a flags parameter here. It defaults to
// FileUtils.MODE_WRONLY | FileUtils.MODE_CREATE | FileUtils.MODE_TRUNCATE;
var ostream = FileUtils.openSafeFileOutputStream(file);
//base64 image that needs to be saved
// NOTE(review): `image` is assigned without var/let/const.
image ="iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==";
// How can I create an inputstream from the image data URI?
// createInputstream is not defined in this snippet; it stands for the missing piece being asked about.
var inputstream = createInputstream(image);
// The last argument (the callback) is optional.
NetUtil.asyncCopy(inputstream , ostream, function(status) {
// A status that is not a success code means the copy failed.
if (!Components.isSuccessCode(status)) {
// Handle error!
return;
}
// Data has been written to the file.
});
It sounds like you'd like to write not the data URI but the binary data it "contains", so I'll answer that.
First, let's assume we got some actual data URI (if not, adding data:application/octet-stream;base64, isn't too hard ;)
// Placeholder data URI: the payload is btoa("helloworld") ;)
var imageDataURI = "data:application/octet-stream;base64,aGVsbG93b3JsZA==";
Option 1 - Using OS.File
OS.File has the benefit that it is truly async. On the other hand, NetUtil is only mostly async, in that there will be stat calls on the main thread and the file will be opened and potentially closed on the main thread as well (which can lead to buffer flushes and hence block the main thread while the flush is happening).
After constructing a path (with some constants help), OS.File.writeAtomic is suited for the job.
Components.utils.import("resource://gre/modules/osfile.jsm");

// Destination: test.png in the desktop directory.
var file = OS.Path.join(OS.Constants.Path.desktopDir, "test.png");

// Drop everything up to and including ";base64," and decode the remainder
// to a byte string.
var str = atob(imageDataURI.replace(/^.*?;base64,/, ""));

// Turn the byte string into an Uint8Array, because OS.File.writeAtomic
// expects an ArrayBuffer(View).
var data = Uint8Array.from(str, function (ch) {
  return ch.charCodeAt(0);
});

// To support Firefox 24 and earlier, you'll need to provide a tmpPath. See MDN.
// There is in my opinion no need to support these, as they are end-of-life and
// contain known security issues. Let's not encourage users. ;)
var promised = OS.File.writeAtomic(file, data);

function onWritten() {
  // Success!
}

function onWriteFailed(ex) {
  // Failed. Error information in ex
}

promised.then(onWritten, onWriteFailed);
Option 2 - Using NetUtil
NetUtil has some drawbacks in that it is not fully async, as already stated above.
We can take a shortcut in that we can use NetUtil.asyncFetch to directly fetch the URL, which gives us a stream we can pass along to .asyncCopy.
Components.utils.import("resource://gre/modules/NetUtil.jsm");
Components.utils.import("resource://gre/modules/FileUtils.jsm");

// file is nsIFile
var file = FileUtils.getFile("Desk", ["test.png"]);

// Fetch the data URI, then copy the resulting stream into the file.
NetUtil.asyncFetch(imageDataURI, onFetched);

// Receives the stream produced by asyncFetch and starts the copy.
function onFetched(inputstream, status) {
  var fetched = inputstream && Components.isSuccessCode(status);
  if (!fetched) {
    // Failed to read data URI.
    // Handle error!
    return;
  }
  // You can also optionally pass a flags parameter here. It defaults to
  // FileUtils.MODE_WRONLY | FileUtils.MODE_CREATE | FileUtils.MODE_TRUNCATE;
  var ostream = FileUtils.openSafeFileOutputStream(file);
  // The last argument (the callback) is optional.
  NetUtil.asyncCopy(inputstream , ostream, onCopied);
}

// Receives the final status of the copy.
function onCopied(status) {
  if (!Components.isSuccessCode(status)) {
    // Handle error!
    return;
  }
  // Data has been written to the file.
}

Categories