I have an application (KafkaConnect) that is generating Avro files into S3.
These files are compressed with the Avro codec "snappy".
I'm trying to read them with javascript (I'm not a very strong javascript developer as you will be able to guess).
I tried to use avro-js or avsc as libraries to help me with this since they are referenced in most of the online examples I found for doing this.
The most complete and most useful example I found was here.
Anyway, it seems most examples I found use snappy version 6, which seems to be a bit different from version 7 (the latest).
One of the main things I noticed is that version 7 now provides two ways to uncompress: a synchronous one and one that returns a promise, but none that accepts a callback function.
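For reference, the two flavours in snappy 7 look roughly like this (a small sketch, not yet tied to Avro, round-tripping a buffer just to have something to uncompress):
const snappy = require('snappy');

const compressed = snappy.compressSync(Buffer.from('hello world'));

// Synchronous variant: returns the uncompressed Buffer directly.
console.log(snappy.uncompressSync(compressed).toString());

// Promise-based variant: resolves with the uncompressed Buffer;
// there is no callback-style variant.
snappy.uncompress(compressed).then((buf) => console.log(buf.toString()));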
Anyway, I think this is not an issue because I could make it work regardless, but my best attempt at reading these files looks something like this (with avsc).
const avsc = require('avsc');
const snappy = require('snappy');
const codecs = {
  snappy: function (buf, cb) {
    // Avro appends checksums to compressed blocks, which we skip here.
    const buffer = snappy.uncompressSync(buf.slice(0, buf.length - 4));
    return cb(buffer);
  }
};

avsc.createFileDecoder('person-10.snappy.avro', {codecs})
  .on('metadata', function (writerType) {
    console.log(writerType.name);
  })
  .on('data', function (obj) {
    console.log('on data ');
    console.log(obj);
  })
  .on('end', function () {
    console.log('end');
  });
Anyway, the processing of the metadata works without issues (I can access the full schema information), but reading the data always fails with
Uncaught Error: snappy codec decompression error
I'm looking for someone who has, for some reason, worked with avro and snappy in their latest versions and managed to make this work.
Because I'm really struggling to understand this, I created a fork of the official avsc repo and tried to introduce my examples there to see how this works, but if it's more useful I could try to create a simpler reproducible scenario.
The documentation of the package I was using has been updated and the problem is now fixed:
https://github.com/mtth/avsc/wiki/API#class-blockdecoderopts
Mainly, I was just wrong about how to call the callback function and how to hand the buffer to snappy.
This is the correct way (as documented):
const avro = require('avsc');
const crc32 = require('buffer-crc32');
const snappy = require('snappy');

const blockDecoder = new avro.streams.BlockDecoder({
  codecs: {
    snappy: (buf, cb) => {
      // Avro appends checksums to compressed blocks.
      const len = buf.length;
      const checksum = buf.slice(len - 4, len);
      snappy.uncompress(buf.slice(0, len - 4))
        .then((inflated) => {
          if (!checksum.equals(crc32(inflated))) {
            // We make sure that the checksum matches.
            throw new Error('invalid checksum');
          }
          cb(null, inflated);
        })
        .catch(cb);
    }
  }
});
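For completeness, the decoder above can then be fed the file through a plain read stream; a short sketch, reusing the person-10.snappy.avro file from the question:
const fs = require('fs');

fs.createReadStream('person-10.snappy.avro')
  .pipe(blockDecoder)
  .on('metadata', (writerType) => {
    console.log(writerType.name);
  })
  .on('data', (record) => {
    console.log(record);
  })
  .on('end', () => {
    console.log('done');
  });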
I'm trying to implement a routine for Node.js that would allow one to open a file that is being appended to by some other process at this very moment, and then return chunks of data immediately as they are appended to the file. It can be thought of as similar to the tail -f UNIX command, however acting immediately as chunks become available, instead of polling for changes over time. Alternatively, one can think of it as working with a file the way you do with a socket — expecting on('data') to trigger from time to time until the file is closed explicitly.
In C land, if I were to implement this, I would just open the file, feed its file descriptor to select() (or any alternative function with a similar purpose), and then just read chunks as the file descriptor is marked "readable". So, when there is nothing to be read, it won't be readable, and when something is appended to the file, it becomes readable again.
I somewhat expected this kind of behavior from the following code sample in JavaScript:
function readThatFile(filename) {
  const stream = fs.createReadStream(filename, {
    flags: 'r',
    encoding: 'utf8',
    autoClose: false // I thought this would prevent file closing on EOF too
  });
  stream.on('error', function(err) {
    // handle error
  });
  stream.on('open', function(fd) {
    // save fd, so I can close it later
  });
  stream.on('data', function(chunk) {
    // process chunk
    // fs.close() if I no longer need this file
  });
}
However, this code sample just bails out when EOF is encountered, so I can't wait for new chunk to arrive. Of course, I could reimplement this using fs.open and fs.read, but that somewhat defeats Node.js purpose. Alternatively, I could fs.watch() file for changes, but it won't work over network, and I don't like an idea of reopening file all the time instead of just keeping it open.
I've tried to do this:
const fd = fs.openSync(filename, 'r'); // sync for readability' sake
const stream = net.Socket({ fd: fd, readable: true, writable: false });
But had no luck — net.Socket isn't happy and throws TypeError: Unsupported fd type: FILE.
So, any solutions?
UPD: this isn't possible, my answer explains why.
I haven't looked into the internals of the read streams for files, but it's possible that they don't support waiting for a file to have more data written to it. However, the fs package definitely supports this with its most basic functionality.
To explain how tailing would work, I've written a somewhat hacky tail function which will read an entire file and invoke a callback for every line (separated by \n only) and then wait for the file to have more lines written to it. Note that a more efficient way of doing this would be to have a fixed size line buffer and just shuffle bytes into it (with a special case for extremely long lines), rather than modifying JavaScript strings.
var fs = require('fs');

function tail(path, callback) {
  var descriptor, bytes = 0, buffer = Buffer.alloc(256), line = '';

  function parse(err, bytesRead, buffer) {
    if (err) {
      callback(err, null);
      return;
    }
    // Keep track of the bytes we have consumed already.
    bytes += bytesRead;
    // Combine the buffered line with the new string data.
    line += buffer.toString('utf-8', 0, bytesRead);
    var i = 0, j;
    while ((j = line.indexOf('\n', i)) != -1) {
      // Callback with a single line at a time.
      callback(null, line.substring(i, j));
      // Skip the newline character.
      i = j + 1;
    }
    // Only keep the unparsed string contents for the next iteration.
    line = line.substr(i);
    // Keep reading in the next tick (avoids CPU hogging).
    process.nextTick(read);
  }

  function read() {
    var stat = fs.fstatSync(descriptor);
    if (stat.size <= bytes) {
      // We're currently at the end of the file. Check again in 500 ms.
      setTimeout(read, 500);
      return;
    }
    fs.read(descriptor, buffer, 0, buffer.length, bytes, parse);
  }

  fs.open(path, 'r', function (err, fd) {
    if (err) {
      callback(err, null);
    } else {
      descriptor = fd;
      read();
    }
  });

  return {close: function close(callback) {
    fs.close(descriptor, callback);
  }};
}

// This will tail the system log on a Mac.
var t = tail('/var/log/system.log', function (err, line) {
  console.log(err, line);
});

// Unceremoniously close the file handle after one minute.
setTimeout(t.close, 60000);
// This will tail the system log on a Mac.
var t = tail('/var/log/system.log', function (err, line) {
console.log(err, line);
});
// Unceremoniously close the file handle after one minute.
setTimeout(t.close, 60000);
All that said, you should also try to leverage the NPM community. With some searching, I found the tail-stream package which might do what you want, with streams.
Previous answers have mentioned tail-stream's approach which uses fs.watch, fs.read and fs.stat together to create the effect of streaming the contents of the file. You can see that code in action here.
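To give a rough idea of the shape of that approach, here is a sketch that combines fs.watch, fs.stat and fs.read (this is not tail-stream's actual code, and the error handling is deliberately minimal):
var fs = require('fs');

function watchTail(path, onChunk) {
  var position = fs.statSync(path).size; // start at the current end of the file
  var watcher = fs.watch(path, function () {
    fs.stat(path, function (err, stat) {
      if (err || stat.size <= position) return; // nothing new (or file truncated)
      var length = stat.size - position;
      var buffer = Buffer.alloc(length);
      fs.open(path, 'r', function (err, fd) {
        if (err) return;
        fs.read(fd, buffer, 0, length, position, function (err, bytesRead) {
          fs.close(fd, function () {});
          if (!err && bytesRead > 0) {
            position += bytesRead;
            onChunk(buffer.slice(0, bytesRead));
          }
        });
      });
    });
  });
  return watcher; // call watcher.close() to stop watching
}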
Another, perhaps hackier, approach might be to just use tail by spawning a child process with it. This of course comes with the limitation that tail must exist on the target platform, but one of node's strengths is using it for asynchronous systems development via spawn, and even on Windows you can execute node in an alternate shell like msysgit or cygwin to get access to the tail utility.
The code for this:
var spawn = require('child_process').spawn;

var child = spawn('tail', ['-f', 'my.log']);

child.stdout.on('data', function (data) {
  console.log('tail output: ' + data);
});

child.stderr.on('data', function (data) {
  console.log('err data: ' + data);
});
So, it seems people have been looking for an answer to this question for five years already, and there is still no answer on topic.
In short: you can't. Not just in Node.js; you can't at all.
Long answer: there are a few reasons for this.
First, the POSIX standard clarifies select() behavior in this regard as follows:
File descriptors associated with regular files shall always select true for ready to read, ready to write, and error conditions.
So, select() can't help with detecting a write beyond the end of the file.
With poll() it's similar:
Regular files shall always poll TRUE for reading and writing.
I can't tell for sure for epoll(), since it's not standardized and you'd have to read its quite lengthy implementation, but I would assume it's similar.
Since libuv, which is at the core of the Node.js implementation, uses read(), pread() and preadv() in its uv__fs_read(), none of which blocks when invoked at the end of a file, it will always return an empty buffer when EOF is encountered. So, no luck here either.
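To illustrate that last point, a read issued at the current end of a regular file returns immediately with zero bytes instead of waiting for more data to be appended (a small sketch; some.log is just a placeholder for any existing file):
const fs = require('fs');

const fd = fs.openSync('some.log', 'r');
const { size } = fs.fstatSync(fd);
const buf = Buffer.alloc(64);

// Reading at the current end of the file does not block until new data
// is appended; it returns 0 bytes right away.
const bytesRead = fs.readSync(fd, buf, 0, buf.length, size);
console.log(bytesRead); // 0

fs.closeSync(fd);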
So, summarizing, if such functionality is desired, something must be wrong with your design, and you should revise it.
What you're looking for is a FIFO file (an acronym for First In, First Out), which, as you said, works like a socket.
There's a node.js module that allows you to work with FIFO files.
I don't know what you want this for, but there are better ways to work with sockets in node.js. Try socket.io instead.
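For what it's worth, a named pipe created with mkfifo can be read with a plain fs.createReadStream, since the kernel treats it as a stream rather than a regular file. A minimal sketch (the path is a placeholder, and behavior can vary between platforms and Node versions):
// Create the pipe beforehand with: mkfifo /tmp/my.fifo
var fs = require('fs');

var stream = fs.createReadStream('/tmp/my.fifo', { encoding: 'utf8' });

stream.on('data', function (chunk) {
  console.log('got:', chunk);
});

stream.on('end', function () {
  console.log('writer closed the pipe');
});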
You could also have a look at this previous question:
Reading a file in real-time using Node.js
Update 1
I'm not familiar with any module that would do what you want with a regular file, as opposed to a socket-like one. But as you said, you could use tail -f to do the trick:
// filename must exist at the time of running the script
var filename = 'somefile.txt';
var spawn = require('child_process').spawn;
var tail = spawn('tail', ['-f', filename]);
tail.stdout.on('data', function (data) {
  data = data.toString().replace(/^[\s]+/i, '').replace(/[\s]+$/i, '');
  console.log(data);
});
Then from the command line try echo someline > somefile.txt and watch the console.
You might also like to have a look at this: https://github.com/layerssss/node-tailer
I'm building something like Ambilight and I need to capture the screen (1920x1080) as fast as possible and process it to get colors for the LEDs.
I'm using Node.js as the programming language. I tried to capture the screen using the VNC protocol (with my own client implementation), but it gave me about 1 FPS with a delay of around 3 seconds. I need the fastest way to capture the screen of the computer that runs node.js.
I'm using an Ubuntu-based Linux distro.
This is going to be difficult to answer because screen capturing is system dependent. If I were going to write this type of thing for my current system on Mac OS X, I would use the command line for taking a screenshot.
//from Terminal
>screencapture ~/Desktop/test.png
Then incorporate this into Node.js:
// screenshot.js
var path = require('path');
var worker = require('child_process');
var fs = require('fs');

// Use an absolute path: the shell expands '~' for screencapture,
// but fs.readFileSync would not.
var file = path.join(process.env.HOME, 'Desktop', 'test.png');

worker.exec('screencapture ' + file, function (err) {
  if (err) return console.error(err);
  // Process the image: read the PNG into a Buffer...
  var imageBuffer = fs.readFileSync(file);
  // ...then decode it (e.g. with a PNG library) and extract the pixel colors.
});
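Since the question mentions an Ubuntu-based distro, a similar sketch for Linux could shell out to ImageMagick's import command instead (this assumes ImageMagick is installed and an X11 session is running):
// linux-screenshot.js
var worker = require('child_process');
var fs = require('fs');

var file = '/tmp/screen.png';

worker.exec('import -window root ' + file, function (err) {
  if (err) return console.error(err);
  var imageBuffer = fs.readFileSync(file);
  // decode imageBuffer with a PNG library and average regions into LED colors
});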
I am trying to determine the best way to read a live file line by line.
Each line will be sent for consumption and then discarded.
The file is live, meaning it is being written to by another application (it's a log file).
The file could be large, and therefore I don't want to read the whole thing into memory and then process it.
Read line
Process it
Keep required data
Read next line
etc..
It seems there are many plugins aka modules. Not sure what the best (fast and efficient) way is.
I am using node.js version 0.10.33
Thanks
Use tail. It's just like the unix tail command, but in node.
npm install tail
a usage example from the npm page:
Tail = require('tail').Tail;

tail = new Tail("fileToTail", "\n", {}, true);

tail.on("line", function(data) {
  console.log(data);
});

tail.on("error", function(error) {
  console.log('ERROR: ', error);
});
You can create a read stream (http://nodejs.org/api/fs.html#fs_fs_createreadstream_path_options) and buffer the data until the next newline. Something like this:
var lines = [],
    line = '',
    rs = require('fs').createReadStream('/etc/passwd', { encoding: 'utf8' });

rs.on('data', function (chunk) {
  line += chunk;
  var indx;
  while ((indx = line.indexOf('\n')) !== -1) {
    lines.push(line.slice(0, indx)); // we add the line, without the "\n" symbol
    line = line.slice(indx + 1);     // we keep the remainder for the next chunk
  }
});

rs.on('end', function () {
  if (line) lines.push(line); // flush any trailing data without a newline
  console.log(lines);
});
A Linux-only solution would be to spawn a child_process running tail -f /path/to/your/log and do something with its stdout. Not very elegant, but it would work.
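A minimal sketch of that (the log path is a placeholder), using readline to turn tail's stdout into line-by-line events on reasonably recent Node versions:
var spawn = require('child_process').spawn;
var readline = require('readline');

var child = spawn('tail', ['-f', '/path/to/your/log']);
var rl = readline.createInterface({ input: child.stdout });

rl.on('line', function (line) {
  // process the line, keep what you need, then let it be discarded
  console.log(line);
});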
I have asked this question on the Appcelerator forum as well, but as I often get better answers from you lovely people here on Stack Overflow, I am also asking it here just in case anyone can shed some light.
I have created a downloadQueue of urls and am using it to download files with the httpclient. Each file in the downloadQueue is sent to the httpclient one at a time, with the next download being initiated only after the previous one has completed.
When I start the download, it seems to be working correctly and manages to download several files before it simply freezes and I get an "out of memory" error in the DDMS error log.
I tried implementing suggestions found in other posts a sample of which are:
http://developer.appcelerator.com/question/28911/httpclient-leaks-easily-or-can-we-have-a-close-method#answer-104241
http://developer.appcelerator.com/question/35041/large-file-download-on-mobile
http://developer.appcelerator.com/question/120129/httpclient-and-setfile
http://developer.appcelerator.com/question/95521/httpclient---save-response-directly-to-file
I tried all of the following:
- moving larger downloaded files directly from the nativePath rather than simply saving to file, in order to ensure that tmp files are not kept longer than necessary;
- using the undocumented setFile method of the httpclient (this stopped my code dead without any error message, and as it is undocumented I have no idea if it was ever implemented on Android anyway);
- using a setTimeout in httpclient.onload after the file has been downloaded, to pause for 1 second before requesting the next file (I have no idea how this would help, but I am clutching at straws now).
Below are the relevant parts of my code (which is complete except for the GetFileUrls function, which I excluded for simplicity's sake, as all it does is return an array of URLs).
Can anyone spot anything that might be causing my memory issue? Does anyone have any ideas, as I have tried everything I can think of? (HELP!)
var count = 0;
var downloadQueue = [];
var rootDir = Ti.Filesystem.getExternalStorageDirectory();

downloadQueue = GetFileUrls(); /* this function is not included in order to keep my post as short as possible, but it returns an array of urls */
DownloadFile(downloadQueue[count]);

var downloader = Ti.Network.createHTTPClient({timeout: 10000});

downloader.onerror = function() {
    Ti.API.info(this.responseData);
};

downloader.onload = function() {
    SaveFile(this.folderName, this.fileName, this.responseData);
    count += 1;
    setTimeout(function() { DownloadFile(); }, 1000);
};

function DownloadFile() {
    if (count < downloadQueue.length) {
        var fileUrl = downloadQueue[count];
        var fileName = fileUrl.substring(fileUrl.lastIndexOf('/') + 1);
        downloader.fileName = fileName;
        downloader.folderName = rootDir;
        downloader.open('GET', fileUrl);
        downloader.send();
    }
}

function SaveFile(foldername, filename, response) {
    if (response.type == 1) {
        var f = Ti.Filesystem.getFile(response.nativePath);
        var dest = Ti.Filesystem.getFile(foldername, filename);
        if (dest.exists()) {
            dest.deleteFile();
        }
        f.move(dest.nativePath);
    } else {
        var dest = Ti.Filesystem.getFile(foldername, filename);
        dest.write(response);
    }
}
Try to use events instead of the nested recursion that you are using. Android does not seem to like that too much.
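A rough sketch of what that could look like, using Ti.App's application-level events to drive the queue instead of calling DownloadFile from inside onload (the event name here is made up):
// Fire an app-level event when a download finishes, instead of recursing.
downloader.onload = function () {
    SaveFile(this.folderName, this.fileName, this.responseData);
    count += 1;
    Ti.App.fireEvent('app:downloadNext');
};

// The listener picks the next item off the queue on a fresh call stack.
Ti.App.addEventListener('app:downloadNext', function () {
    if (count < downloadQueue.length) {
        DownloadFile();
    }
});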