python to node.js confusion - javascript

So I have this python code that I'm trying to convert to node.js, but I am not sure how.
import urllib.request, re
def getDef(word):
    """Return the first Merriam-Webster definition found for ``word``.

    Downloads the dictionary page for ``word`` and returns the text captured
    by the ``<p>: ...</p><p>`` pattern, or the string "No match" when the
    page does not contain that pattern.
    """
    link = "http://www.merriam-webster.com/dictionary/%s" % word
    data = urllib.request.urlopen(link).read().decode()
    match = re.search("<p>: (.*?)</p><p>", data)
    # re.search returns None when nothing matches; test for that explicitly
    # instead of hiding every possible error behind a bare ``except``.
    if match is None:
        return "No match"
    return match.group(1)
class newDefinition:
    """Holds the dictionary definition looked up for a word."""

    def __init__(self, word):
        # getDef is called synchronously, so the attribute is populated
        # before __init__ returns.
        self.definition = getDef(word)
>>> definition = newDefintion("color")
>>> print(definition.definition)
a quality such as red, blue, green, yellow, etc., that you see when you look at something
In node.js, however, I can't seem to return the value the way I do in Python because of its callback way of doing things, which is why I'm asking: how would I write the node.js equivalent, or is there no equivalent? Here is what I have so far; maybe you can spot what I'm doing wrong and how to fix it.
var urllib = require("urllib"); // installed with npm
// Attempt to look up `word` on Merriam-Webster and return its definition.
// NOTE: as written this does not return the definition. getDef returns
// whatever urllib.request() itself returns (stored in urlData), and the
// definition is only computed later, inside the request callback.
var getDef = function(word){
var link = "http://www.merriam-webster.com/dictionary/" + word;
var urlData = urllib.request(link, {}, function(err, data, res){
// `err` is not checked here, and exec() yields null when nothing matches,
// in which case the [1] index below throws.
var re = new RegExp("<p>: (.*?)</p><p>");
var results = data.toString();
var match = re.exec(results)[1];
// This return value goes back to whoever invoked the callback, not to the
// caller of getDef.
return match; // Expected it to give urlData the definition
});
return urlData;
}
var Definition = function(word){
this.definition = getDef(word);
}
definition = new Definition("color");
console.log(definition.definition); // this won't give the definition but the information of the urllib itself rather.
So in general, what I'm trying to figure out is how to use asynchronous code so that I can return the things I need. I am not used to this concept, so is there an equivalent to it in Python? Also, if you can point me to some good documentation on asynchronous code, that would be great.

A return inside the request callback only hands the value back to the code that invoked the callback, not to the caller of getDef, so you need to pass in a callback of your own. It would look like this:
var urllib = require("urllib");
/**
 * Looks up `word` on Merriam-Webster and hands the first definition to `callback`.
 *
 * @param {string} word - Word to look up.
 * @param {function(?string, Error=)} callback - Called once with the definition,
 *   or with "No match" when the page has no definition in the expected markup.
 *   When the request itself fails it is called with `null` and the error.
 */
var getDef = function(word, callback){
  var link = 'http://www.merriam-webster.com/dictionary/' + encodeURIComponent(word);
  urllib.request(link, {}, function(err, data, res) {
    if (err) {
      // Report the failure instead of throwing on data.toString().
      return callback(null, err);
    }
    var match = /<p>: (.*?)<\/p><p>/.exec(data.toString());
    // exec() returns null when nothing matches, so guard before indexing.
    callback(match ? match[1] : 'No match');
  });
};
Then you would pass a callback while calling the function:
getDef('color', function(definition) {
console.log(definition);
});
Edit: Setting an object's property has the same idea. It might look like this instead:
/**
 * Object whose `definition` property is filled in asynchronously.
 *
 * @param {string} word - Word to look up via getDef().
 * @param {function()} [callback] - Invoked with `this` set to the new
 *   instance once `definition` has been assigned.
 */
var Definition = function(word, callback) {
  var self = this;
  // The completion callback belongs to the constructor, not to the function
  // handed to getDef, which only ever receives the definition.
  getDef(word, function(definition) {
    self.definition = definition;
    if (typeof callback === 'function') {
      callback.call(self);
    }
  });
};
And would be called like so:
var definition = new Definition('color', function() {
console.log(definition.definition);
});

Here is my two cent worth suggestion.
Never ever use regular expressions to parse HTML (Refer here for more details), instead use the XPath like library to parse the document. You can use libraries like cheerio or phantomjs.
Here is a clean solution.
var request = require('request'),
when = require('when'),
cheerio = require('cheerio');
var URL = 'http://www.merriam-webster.com/dictionary/';
/**
 * Fetches the dictionary page for a word and extracts its definitions.
 *
 * @param {string} word - Word to search the dictionary for.
 * @returns {Promise<string[]>} Promise that resolves to an array of
 *   definitions of the word, and rejects with an Error when the request
 *   fails or the server answers with a non-200 status.
 */
var getDef = function(word){
  var defer = when.defer();
  request(URL + encodeURIComponent(word), function(err, res, body){
    if (err || res.statusCode !== 200){
      // Stop here: without the return, execution fell through to
      // cheerio.load() and then tried to resolve an already rejected deferred.
      defer.reject(err || new Error('Unexpected HTTP status ' + res.statusCode));
      return;
    }
    var defs = [];
    var $ = cheerio.load(body);
    $('.wordclick .headword:first-child p').each(function(i, ele){
      defs.push($(ele).text());
    });
    defer.resolve(defs);
  });
  return defer.promise;
};
getDef('happy').then(function(words){
console.log(words);
});
Note: Here I am using when (a Promise+ library) instead of the Node's standard CPS style.

Related

Why isn't my readFileSync function executing? [duplicate]

This question already has answers here:
Anonymous function passed to readFileSync is not returning any data
(2 answers)
Closed 5 years ago.
I'm trying to read from a file in Node. Here is my code:
const cheerio = require('cheerio');
var fs = require('fs');
var path = process.argv[2];
var glossArr = []
fs.readFileSync(path, {encoding: "utf8"}, function (err, markup){
console.log('function executing')
if (err) throw err;
const $ = cheerio.load(markup);
var glossar = $('body').children().last();
var index = $('body').children().last().prev();
glossar.children().children().children().each(function(i, elem) {
var obj = {};
var container = $(this).children();
var unter = container.children();
var begriff = unter.first().text();
var text = unter.last().text();
obj[begriff] = text;
obj['file'] = path;
glossArr.push(obj)
});
});
console.log('done reading file...')
var glossString = JSON.stringify(glossArr)
var result = 'export default ' + glossString
fs.writeFileSync('./data/data.js', result)
For some reason, the readFileSync doesn't execute at all. The only thing that's logged is 'done reading file...'
However, when I changed it to readFile() (instead of sync), the function executes and works as expected. What am I missing?
readFileSync doesn't accept a callback parameter because it's synchronous. You need to change your code to move the code from within the callback to beneath the synchronous function:
var markup = fs.readFileSync(path, {encoding: "utf8"});
const $ = cheerio.load(markup);
// ...
To clarify, the readFileSync is being executed, it's just that you aren't doing anything with the result and your callback parameter is being ignored.
fs.readFileSync is the synchronous version of fs.readFile(): it returns the contents of the file at the given path.
Once the content is returned, you can perform whatever task you need with it.
For more detail, see the documentation for readFileSync.

how to read a file, store data and then write it

I have a text file with a ton of values that I want to convert to meaningful JSON using node.js fs module.
I want to store the first value of every line in an array unless the value is already present.
7000111,-1.31349,36.699959,1004,
7000111,-1.311739,36.698589,1005,
8002311,-1.262245,36.765884,2020,
8002311,-1.261135,36.767544,2021,
So for this case, I'd like to write to a file:
[7000111, 8002311]
Here's what I have so far. It writes [] to the file.
var fs = require('fs');
var through = require('through');
var split = require('split');
var shape_ids = [];
var source = fs.createReadStream('data/shapes.txt');
var target = fs.createWriteStream('./output3.txt');
var tr = through(write, end);
source
.pipe(split())
.pipe(tr)
// Function definitions
/**
 * through() write handler: records the first comma-separated field of each
 * line in the shared shape_ids array, once per distinct value.
 *
 * @param {string|Buffer} line - One line of the input file.
 */
function write(line){
  var id = line.toString().split(',')[0];
  // The chunk after the final newline is empty; it is not an id.
  if (id === '') {
    return;
  }
  // Add the id only when it is NOT in the array yet. The previous test
  // (indexOf(...) > -1) was inverted, so nothing was ever pushed.
  if (shape_ids.indexOf(id) === -1){
    shape_ids.push(id);
  }
}
/**
 * through() end handler: writes the collected ids to the target stream as a
 * JSON array and closes the stream.
 */
function end(){
  // Serialize into a local so shape_ids remains an array for any later use.
  var json = JSON.stringify(shape_ids);
  // end() writes the final chunk and then closes the stream.
  target.end(json);
  console.log('data written');
}
The code is using the split and through modules
How do I store values in the array and write the populated array to the file?
== === ====== =================
Update:
This is what I want to do, but it's in Ruby:
shape_ids = []
File.open("data/shapes.txt").readlines.each do |line|
data = line.split(',')
shape_id = data.first
if !shape_ids.include? shape_id
shape_ids.push(shape_id)
end
end
puts shape_ids # array of unique shape_ids
Can I do this in javascript?
Unless you are super comfortable with the new Stream API in node, use the event-stream module to accomplish this:
var fs = require('fs');
var es = require('event-stream');
/**
 * Collects the distinct first-column values of a comma-separated file and
 * writes them to `target` as JSON of the form {"ids": [...]}.
 *
 * @param {string} src - Path of the file to read.
 * @param {string} target - Path of the JSON file to write.
 * @param {function(?Error)} [callback] - Called when writing finished or
 *   when reading or writing failed.
 */
function getIds(src, target, callback) {
  // A Set gives constant-time membership tests and keeps first-seen order.
  var uniqueIDs = new Set();
  es.pipeline(
    fs.createReadStream(src),
    es.split(),
    es.map(function (line, done) {
      var id = line.split(',').shift();
      // Skip blank lines such as the empty chunk after a trailing newline.
      if (id !== '') uniqueIDs.add(id);
      // No data is passed on; only the side effect on uniqueIDs matters.
      done();
    }),
    es.wait(function (err, text) {
      if (err) {
        if ('function' == typeof callback) callback(err);
        return;
      }
      // A bare top-level array would be valid JSON as well; the object
      // wrapper is kept so the output format stays what it was.
      var data = JSON.stringify({ ids: Array.from(uniqueIDs) });
      fs.writeFile(target, data, function (err) {
        if ('function' == typeof callback) callback(err);
      });
    })
  );
}
getIds('./values.txt', './output.json');
Unfortunately there is no "easy" way to keep this as a pure stream flow so you have to "wait" until the data is done filtering before turning into a JSON string. Hope that helps!

mocha import external script failed

I have this opcode.js file and need to test it with mocha.An example can be seen here :
// Table of opcode handlers keyed by opcode number (as a string). Each entry
// provides decode (raw data -> object) and encode (object -> buffer).
var opcode = {
    '0': {
        // Reads the 16-bit sid that follows the opcode byte.
        decode: function (data) {
            var ocBuf = new OpcodeBuffer(data);
            var kpo = {};
            kpo.opcode = 0x00;
            ocBuf.setIndex(1);
            kpo.sid = ocBuf.readUInt16();
            return kpo;
        },
        // Writes the opcode byte followed by the 16-bit sid.
        encode: function (kpo) {
            // NOTE(review): encodeLength is not defined in this excerpt;
            // confirm it exists on opcode['0'] in the full file.
            var ocBuf = new OpcodeBuffer(opcode['0'].encodeLength(kpo));
            ocBuf.writeUInt8(0x00);
            ocBuf.writeUInt16(kpo.sid);
            return ocBuf.buf;
        }
    }
};
module.exports = opcode;
and the write in my test_ack.js file:
var op = require('./ack.js');
var assert = require('assert');
opcode = op.opcode;
var decode = require('opcode').decode();
var encode = require('opcode').encode();
The problem is that I keep getting error messages saying that encode and decode are not defined. I still can't work out how to import them from my directory.
Given the code you show us, this would be the way you could import your two functions:
// Load the module once and take both functions from its '0' entry, without
// calling them. The './' prefix makes this a relative path; a bare 'opcode'
// would be looked up in node_modules instead.
// NOTE(review): assumes opcode.js sits next to the test file; adjust the path otherwise.
var opcode0 = require('./opcode')['0'];
var decode = opcode0.decode;
var encode = opcode0.encode;
I'd suggest additionally avoiding calling require twice. Among other things, the code you currently have calls the functions instead of just importing them.

How can I use these Node modules to accept HTML through a file or URL and then output JSON as validation of existing HTML elements?

Essentially what I need to do is to take a local grader.js file and then use it at the command line to input HTML, which will then output JSON data to the console to validate the existence of several HTML elements. The usage looks something like this:
./grader.js --checks checks.json --file index.html
./grader.js --checks checks.json --url http://google.com
The Node modules being used are Commander (for working at the command line), Cheerio (for HTML), and Restler (for getting HTML from URL).
The checks.json file is straightforward in that it's simply asking to check for the existence of a few simple HTML elements to find out whether or not they exist on the page:
["h1",
".navigation",
".logo",
".blank",
".about",
".heading",
".subheading",
".pitch",
".video",
".thermometer",
".order",
".social",
".section1",
".section2",
".faq",
".footer"]
The grader.js file is where things get a little more complicated. The following code actually works insofar as it takes the command line arguments and does indicate a true or false value as to whether the HTML elements exist. But it doesn't work properly after adding the URL check at the bottom. There is something wrong with my checkURL function and the way that I implement it using the Commander code at the bottom. Even though the true and false values are correct dependent upon the HTML file/URL I use, I end up spitting out both checks to the console even if I only want to check either the file or the URL, not both. I'm fairly new to this so I'm surprised that it works at all. It may have something to do with the default values, but when I try to make those changes the checkURL function seems to break down. Thanks in advance for your help I really do appreciate it.
#!/usr/bin/env node
var fs = require('fs');
var program = require('commander');
var cheerio = require('cheerio');
var rest = require('restler');
var HTMLFILE_DEFAULT = "index.html";
var CHECKSFILE_DEFAULT = "checks.json";
var URL_DEFAULT = "http://cryptic-spire-7925.herokuapp.com/index.html";
/**
 * Converts `infile` to a string path and makes sure it exists on disk.
 * Logs a message and terminates the process with exit code 1 when it does not.
 *
 * @param {*} infile - Value whose toString() yields the path to check.
 * @returns {string} The path as a string.
 */
var assertFileExists = function(infile) {
    var candidate = infile.toString();
    var found = fs.existsSync(candidate);
    if (found) {
        return candidate;
    }
    console.log("%s does not exist. Exiting.", candidate);
    process.exit(1); // http://nodejs.org/api/process.html#process_process_exit_code
    return candidate;
};
var cheerioHtmlFile = function(htmlfile) {
return cheerio.load(fs.readFileSync(htmlfile));
};
/**
 * Reads the checks file synchronously and parses it as JSON.
 *
 * @param {string} checksfile - Path of the JSON file to load.
 * @returns {*} The parsed JSON value.
 */
var loadChecks = function(checksfile) {
    var raw = fs.readFileSync(checksfile);
    var parsed = JSON.parse(raw);
    return parsed;
};
/**
 * Checks which selectors from the checks file match at least one element in
 * a local HTML file.
 *
 * @param {string} htmlfile - Path of the HTML file to inspect.
 * @param {string} checksfile - Path of the JSON file listing the selectors.
 * @returns {Object<string, boolean>} Map from selector to whether it matched.
 */
var checkHtmlFile = function(htmlfile, checksfile) {
    // Declared locally: assigning to an undeclared `$` created an implicit
    // global, which is a ReferenceError in strict mode.
    var $ = cheerioHtmlFile(htmlfile);
    var checks = loadChecks(checksfile).sort();
    var out = {};
    // Index loop instead of for...in, which is meant for object keys.
    for (var ii = 0; ii < checks.length; ii++) {
        out[checks[ii]] = $(checks[ii]).length > 0;
    }
    return out;
};
/**
 * Downloads `url` and logs, for every selector in the checks file, whether
 * it matches at least one element of the downloaded HTML.
 *
 * The result is only logged from inside the request callback; nothing is
 * returned to the caller.
 *
 * @param {string} url - Address of the page to inspect.
 * @param {string} checksfile - Path of the JSON file listing the selectors.
 */
var checkUrl = function(url, checksfile) {
    rest.get(url).on('complete', function(data) {
        // restler hands over an Error instance when the request failed.
        if (data instanceof Error) {
            console.error('Could not fetch %s: %s', url, data.message);
            return;
        }
        // Local declaration avoids leaking `$` as an implicit global.
        var $ = cheerio.load(data);
        var checks = loadChecks(checksfile).sort();
        var out = {};
        for (var ii = 0; ii < checks.length; ii++) {
            out[checks[ii]] = $(checks[ii]).length > 0;
        }
        console.log(out);
    });
};
/**
 * Returns a new function that calls `fn` with a fresh empty object as `this`.
 *
 * @param {Function} fn - Function to wrap.
 * @returns {Function} Bound copy of `fn`.
 */
var clone = function(fn) {
    // Workaround for commander.js issue.
    // http://stackoverflow.com/a/6772648
    var freshThis = {};
    return Function.prototype.bind.call(fn, freshThis);
};
if(require.main == module) {
program
.option('-f, --file <html_file>', 'Path to index.html', clone(assertFileExists), HTMLFILE_DEFAULT)
.option('-u, --url <url>', 'URL to index.html', URL_DEFAULT)
.option('-c, --checks <check_file>', 'Path to checks.json', clone(assertFileExists), CHECKSFILE_DEFAULT)
.parse(process.argv);
var checkJson = checkHtmlFile(program.file, program.checks);
var outJson = JSON.stringify(checkJson, null, 4);
console.log(outJson);
var checkJson2 = checkUrl(program.url, program.checks);
var outJson2 = JSON.stringify(checkJson2, null, 4);
console.log(outJson2);
}
else {
exports.checkHtmlFile = checkHtmlFile;
}
Depending on the arguments call either one of checkHtmlFile() or checkUrl()
Something like:
if (program.url)
checkUrl(program.url, program.checks);
else checkHtmlFile(program.file, program.checks);
Read this for more references: commander.js option parsing
Also, checkJson2 is undefined as checkUrl() isn't returning anything.
Those commander .option lines look wrong to me.
Delete the clone function and revise your option lines as follows:
.option('-f, --file <html_file>', 'Path to index.html', HTMLFILE_DEFAULT)
.option('-u, --url <url>', 'URL to index.html', URL_DEFAULT)
.option('-c, --checks <check_file>', 'Path to checks.json', CHECKSFILE_DEFAULT)
This should solve your commander problem.
Here is the updated checkUrl function after the helpful hints from @David and @ankitsabharwal.
/**
 * Downloads `url` and logs, as pretty-printed JSON, whether each selector in
 * the checks file matches at least one element of the downloaded HTML.
 *
 * The result is only logged from inside the request callback; nothing is
 * returned to the caller.
 *
 * @param {string} url - Address of the page to inspect.
 * @param {string} checksfile - Path of the JSON file listing the selectors.
 */
var checkUrl = function(url, checksfile) {
    rest.get(url).on('complete', function(data) {
        // restler hands over an Error instance when the request failed.
        if (data instanceof Error) {
            console.error('Could not fetch %s: %s', url, data.message);
            return;
        }
        // Local declaration avoids leaking `$` as an implicit global.
        var $ = cheerio.load(data);
        var checks = loadChecks(checksfile).sort();
        var out = {};
        for (var ii = 0; ii < checks.length; ii++) {
            out[checks[ii]] = $(checks[ii]).length > 0;
        }
        console.log(JSON.stringify(out, null, 4));
    });
};
And here is the updated Commander code below:
// Command-line entry point: check a URL when --url is given, otherwise check
// the local HTML file.
if(require.main == module) {
    program
        .option('-f, --file <html_file>', 'Path to index.html', HTMLFILE_DEFAULT)
        .option('-u, --url <url>', 'URL to index.html')
        // Keep a default here: both branches need a checks file.
        .option('-c, --checks <check_file>', 'Path to checks.json', CHECKSFILE_DEFAULT)
        .parse(process.argv);
    if (program.url) {
        // checkUrl logs its own result once the download has completed.
        checkUrl(program.url, program.checks);
    } else {
        // Called once; the extra call whose result was discarded is gone.
        var checkJson = checkHtmlFile(program.file, program.checks);
        var outJson = JSON.stringify(checkJson, null, 4);
        console.log(outJson);
    }
}

Read a file one line at a time in node.js?

I am trying to read a large file one line at a time. I found a question on Quora that dealt with the subject but I'm missing some connections to make the whole thing fit together.
var Lazy=require("lazy");
new Lazy(process.stdin)
.lines
.forEach(
function(line) {
console.log(line.toString());
}
);
process.stdin.resume();
The bit that I'd like to figure out is how I might read one line at a time from a file instead of STDIN as in this sample.
I tried:
fs.open('./VeryBigFile.csv', 'r', '0666', Process);
function Process(err, fd) {
if (err) throw err;
// DO lazy read
}
but it's not working. I know that in a pinch I could fall back to using something like PHP, but I would like to figure this out.
I don't think the other answer would work as the file is much larger than the server I'm running it on has memory for.
Since Node.js v0.12 and as of Node.js v4.0.0, there is a stable readline core module. Here's the easiest way to read lines from a file, without any external modules:
const fs = require('fs');
const readline = require('readline');
async function processLineByLine() {
const fileStream = fs.createReadStream('input.txt');
const rl = readline.createInterface({
input: fileStream,
crlfDelay: Infinity
});
// Note: we use the crlfDelay option to recognize all instances of CR LF
// ('\r\n') in input.txt as a single line break.
for await (const line of rl) {
// Each line in input.txt will be successively available here as `line`.
console.log(`Line from file: ${line}`);
}
}
processLineByLine();
Or alternatively:
var lineReader = require('readline').createInterface({
input: require('fs').createReadStream('file.in')
});
lineReader.on('line', function (line) {
console.log('Line from file:', line);
});
The last line is read correctly (as of Node v0.12 or later), even if there is no final \n.
UPDATE: this example has been added to Node's API official documentation.
For such a simple operation there shouldn't be any dependency on third-party modules. Go easy.
var fs = require('fs'),
readline = require('readline');
// Stream the file through readline and print each line as it arrives.
var rd = readline.createInterface({
    input: fs.createReadStream('/path/to/file'),
    output: process.stdout,
    // `terminal` is the option that stops readline from treating the output
    // as a TTY; `console` is not a createInterface option.
    terminal: false
});
rd.on('line', function(line) {
    console.log(line);
});
Update in 2019
An awesome example is already posted on official Nodejs documentation. here
This requires the latest Nodejs is installed on your machine. >11.4
const fs = require('fs');
const readline = require('readline');
async function processLineByLine() {
const fileStream = fs.createReadStream('input.txt');
const rl = readline.createInterface({
input: fileStream,
crlfDelay: Infinity
});
// Note: we use the crlfDelay option to recognize all instances of CR LF
// ('\r\n') in input.txt as a single line break.
for await (const line of rl) {
// Each line in input.txt will be successively available here as `line`.
console.log(`Line from file: ${line}`);
}
}
processLineByLine();
You don't have to open the file, but instead, you have to create a ReadStream.
fs.createReadStream
Then pass that stream to Lazy
require('fs').readFileSync('file.txt', 'utf-8').split(/\r?\n/).forEach(function(line){
console.log(line);
})
there is a very nice module for reading a file line by line, it's called line-reader
with it you simply just write:
var lineReader = require('line-reader');
lineReader.eachLine('file.txt', function(line, last) {
console.log(line);
// do whatever you want with line...
if(last){
// or check if it's the last one
}
});
you can even iterate the file with a "java-style" interface, if you need more control:
lineReader.open('file.txt', function(reader) {
if (reader.hasNextLine()) {
reader.nextLine(function(line) {
console.log(line);
});
}
});
Old topic, but this works:
var rl = readline.createInterface({
input : fs.createReadStream('/path/file.txt'),
output: process.stdout,
terminal: false
})
rl.on('line',function(line){
console.log(line) //or parse line
})
Simple. No need for an external module.
You can always roll your own line reader. I haven't benchmarked this snippet yet, but it correctly splits the incoming stream of chunks into lines without the trailing '\n'.
// Text received so far that is not yet terminated by a newline.
var last = "";
// Let the stream decode UTF-8 so that a multi-byte character split across
// two chunks is not corrupted.
process.stdin.setEncoding('utf8');
process.stdin.on('data', function(chunk) {
  var lines = (last + chunk).split("\n");
  // The final piece is an incomplete line; keep it for the next chunk.
  last = lines.pop();
  lines.forEach(function(line) {
    console.log("line: " + line);
  });
});
process.stdin.on('end', function() {
  // Input that ends with a newline leaves nothing behind; do not print an
  // extra empty line in that case.
  if (last.length > 0) {
    console.log("line: " + last);
  }
});
process.stdin.resume();
I did come up with this when working on a quick log parsing script that needed to accumulate data during the log parsing and I felt that it would nice to try doing this using js and node instead of using perl or bash.
Anyway, I do feel that small nodejs scripts should be self contained and not rely on third party modules so after reading all the answers to this question, each using various modules to handle line parsing, a 13 SLOC native nodejs solution might be of interest .
With the carrier module:
var carrier = require('carrier');
process.stdin.resume();
carrier.carry(process.stdin, function(line) {
console.log('got one line: ' + line);
});
I ended up with a massive, massive memory leak using Lazy to read line by line when trying to then process those lines and write them to another stream due to the way drain/pause/resume in node works (see: http://elegantcode.com/2011/04/06/taking-baby-steps-with-node-js-pumping-data-between-streams/ (i love this guy btw)). I haven't looked closely enough at Lazy to understand exactly why, but I couldn't pause my read stream to allow for a drain without Lazy exiting.
I wrote the code to process massive csv files into xml docs, you can see the code here: https://github.com/j03m/node-csv2xml
If you run the previous revisions with Lazy line it leaks. The latest revision doesn't leak at all and you can probably use it as the basis for a reader/processor. Though I have some custom stuff in there.
Edit: I guess I should also note that my code with Lazy worked fine until I found myself writing large enough xml fragments that drain/pause/resume became a necessity. For smaller chunks it was fine.
In most cases this should be enough:
const fs = require("fs")
// Reads the whole file into memory and prints it line by line.
fs.readFile('./file', 'utf-8', (err, file) => {
  // `file` is undefined when reading failed, so surface the error first.
  if (err) throw err;
  // Accept both Unix (\n) and Windows (\r\n) line endings.
  const lines = file.split(/\r?\n/);
  for (const line of lines) {
    console.log(line);
  }
});
Edit:
Use a transform stream.
With a BufferedReader you can read lines.
new BufferedReader ("lorem ipsum", { encoding: "utf8" })
.on ("error", function (error){
console.log ("error: " + error);
})
.on ("line", function (line){
console.log ("line: " + line);
})
.on ("end", function (){
console.log ("EOF");
})
.read ();
I was frustrated by the lack of a comprehensive solution for this, so I put together my own attempt (git / npm). Copy-pasted list of features:
Interactive line processing (callback-based, no loading the entire file into RAM)
Optionally, return all lines in an array (detailed or raw mode)
Interactively interrupt streaming, or perform map/filter like processing
Detect any newline convention (PC/Mac/Linux)
Correct eof / last line treatment
Correct handling of multi-byte UTF-8 characters
Retrieve byte offset and byte length information on per-line basis
Random access, using line-based or byte-based offsets
Automatically map line-offset information, to speed up random access
Zero dependencies
Tests
NIH? You decide :-)
Since posting my original answer, I found that split is a very easy to use node module for line reading in a file; Which also accepts optional parameters.
var split = require('split');
fs.createReadStream(file)
.pipe(split())
.on('data', function (line) {
//each chunk now is a seperate line!
});
Haven't tested on very large files. Let us know if you do.
/**
 * Streams a file and emits each '\n'-terminated line as a Buffer.
 *
 * Events on the returned emitter:
 *   "line"  (Buffer) - one line without its trailing '\n'
 *   "end"            - the file has been read completely
 *   "error" (Error)  - the underlying read stream failed
 *
 * @param {string} fileName - Path of the file to read.
 * @returns {EventEmitter} Emitter for the events listed above.
 */
function createLineReader(fileName){
  var EM = require("events").EventEmitter
  var ev = new EM()
  var stream = require("fs").createReadStream(fileName)
  // Bytes after the last newline of the previous chunk, or null.
  var remainder = null;
  stream.on("data",function(data){
    if(remainder != null){
      // Buffer.concat replaces the deprecated new Buffer(size) plus copies.
      data = Buffer.concat([remainder, data])
    }
    var start = 0;
    for(var i=0; i<data.length; i++){
      if(data[i] == 10){ //\n new line
        ev.emit("line", data.slice(start,i))
        start = i+1;
      }
    }
    remainder = start<data.length ? data.slice(start) : null;
  })
  stream.on("error",function(err){
    ev.emit("error", err)
  })
  stream.on("end",function(){
    // The last line has no trailing '\n' to trigger the loop above.
    if(null!=remainder) ev.emit("line",remainder)
    ev.emit("end")
  })
  return ev
}
//---------main---------------
fileName = process.argv[2]
lineReader = createLineReader(fileName)
lineReader.on("line",function(line){
console.log(line.toString())
//console.log("++++++++++++++++++++")
})
I wanted to tackle this same problem, basically what in Perl would be:
while (<>) {
process_line($_);
}
My use case was just a standalone script, not a server, so synchronous was fine. These were my criteria:
The minimal synchronous code that could reuse in many projects.
No limits on file size or number of lines.
No limits on length of lines.
Able to handle full Unicode in UTF-8, including characters beyond the BMP.
Able to handle *nix and Windows line endings (old-style Mac not needed for me).
Line endings character(s) to be included in lines.
Able to handle last line with or without end-of-line characters.
Not use any external libraries not included in the node.js distribution.
This is a project for me to get a feel for low-level scripting type code in node.js and decide how viable it is as a replacement for other scripting languages like Perl.
After a surprising amount of effort and a couple of false starts this is the code I came up with. It's pretty fast but less trivial than I would've expected: (fork it on GitHub)
var fs = require('fs'),
StringDecoder = require('string_decoder').StringDecoder,
util = require('util');
/**
 * Synchronously reads the file behind `fd` and passes every line, including
 * its end-of-line characters, to processLine().
 *
 * The file is read in CHUNK_SIZE-byte pieces and decoded through a
 * StringDecoder, so a UTF-8 character split across two reads stays intact.
 *
 * @param {number} fd - File descriptor opened for reading.
 */
function lineByLine(fd) {
  var CHUNK_SIZE = 16384;
  var decoder = new StringDecoder('utf8');
  // Buffer.alloc replaces the deprecated new Buffer(size).
  var chunk = Buffer.alloc(CHUNK_SIZE);
  var blob = '';        // decoded text that has not been fully consumed yet
  var blobStart = 0;    // index in blob where the next line begins
  var blobEnd = 0;      // index in blob of the last character of that line
  var lastChunk = false;

  // each line
  while (true) {
    // append more chunks onto the blob until it holds an EOL or we hit EOF
    var eolPos = blob.indexOf('\n', blobStart);
    while (eolPos === -1 && !lastChunk) {
      var bytesRead = fs.readSync(fd, chunk, 0, CHUNK_SIZE, null);
      if (bytesRead === 0) {
        // Only a zero-length read means EOF; a short read does not.
        lastChunk = true;
        blob += decoder.end();
      } else {
        blob += decoder.write(chunk.slice(0, bytesRead));
      }
      eolPos = blob.indexOf('\n', blobStart);
    }
    // whole line (with LF), or the last line (no LF)
    blobEnd = eolPos !== -1 ? eolPos : blob.length;

    if (blobStart >= blob.length) {
      return;
    }
    processLine(blob.substring(blobStart, blobEnd + 1));
    blobStart = blobEnd + 1;

    // keep blob from growing indefinitely: drop the consumed prefix once it
    // is at least one chunk long
    if (blobStart >= CHUNK_SIZE) {
      blob = blob.substring(blobStart);
      blobStart = 0;
    }
  }
}
It could probably be cleaned up further, it was the result of trial and error.
Generator based line reader: https://github.com/neurosnap/gen-readlines
var fs = require('fs');
var readlines = require('gen-readlines');
fs.open('./file.txt', 'r', function(err, fd) {
if (err) throw err;
fs.fstat(fd, function(err, stats) {
if (err) throw err;
for (var line of readlines(fd, stats.size)) {
console.log(line.toString());
}
});
});
A new function was added in Node.js v18.11.0 to read files line by line
filehandle.readLines([options])
This is how you use this with a text file you want to read
import { open } from 'node:fs/promises';
myFileReader();
async function myFileReader() {
const file = await open('./TextFileName.txt');
for await (const line of file.readLines()) {
console.log(line)
}
}
To understand more read Node.js documentation here is the link for file system readlines():
https://nodejs.org/api/fs.html#filehandlereadlinesoptions
If you want to read a file line by line and writing this in another:
var fs = require('fs');
var readline = require('readline');
var Stream = require('stream');
function readFileLineByLine(inputFile, outputFile) {
var instream = fs.createReadStream(inputFile);
var outstream = new Stream();
outstream.readable = true;
outstream.writable = true;
var rl = readline.createInterface({
input: instream,
output: outstream,
terminal: false
});
rl.on('line', function (line) {
fs.appendFileSync(outputFile, line + '\n');
});
};
var fs = require('fs');
/**
 * Reads a file in 1024-byte pieces and reports it line by line.
 *
 * Lines may end in "\n", "\r\n" or "\r". A final line without a line ending
 * is reported too; a file that ends with a line ending does not produce an
 * extra empty line.
 *
 * NOTE: each piece is decoded on its own, so a multi-byte character that
 * straddles two reads may be decoded incorrectly.
 *
 * @param {string} name - Path of the file to read.
 * @param {function(string, number)} online - Called with each line (without
 *   its line ending) and its 1-based line number.
 * @param {function(number)} onend - Called once with the number of lines.
 * @param {string} [encoding="utf8"] - Encoding used to decode the file.
 */
function readfile(name,online,onend,encoding) {
  var bufsize = 1024;
  // Buffer.alloc replaces the deprecated new Buffer(size).
  var buffer = Buffer.alloc(bufsize);
  var fd = fs.openSync(name,'r');
  var position = 0;
  var eof = false;
  var data = "";
  var lines = 0;
  var NEWLINE = /\r\n|\r|\n/;
  encoding = encoding || "utf8";
  function readbuf() {
    var bufread = fs.readSync(fd,buffer,0,bufsize,position);
    position += bufread;
    eof = bufread === 0;
    data += buffer.toString(encoding,0,bufread);
  }
  function getLine() {
    var m = NEWLINE.exec(data);
    // Read more when no line break is buffered yet, or when the only break
    // is a "\r" at the very end that may be the first half of "\r\n".
    while (!eof && (m === null || (m[0] === "\r" && m.index === data.length - 1))) {
      readbuf();
      m = NEWLINE.exec(data);
    }
    if (m === null) {
      // End of file: report what is left, if anything, and finish.
      fs.closeSync(fd);
      if (data.length > 0) online(data,++lines);
      return onend(lines);
    }
    var line = data.substr(0,m.index);
    data = data.substr(m.index + m[0].length);
    online(line,++lines);
    process.nextTick(getLine);
  }
  getLine();
}
I had the same problem and came up with the solution above.
It looks similar to the others, but it is async and can read large files very quickly.
Hope this helps.
Two questions we must ask ourselves while doing such operations are:
What's the amount of memory used to perform it?
Is the memory consumption increasing drastically with the file size?
Solutions like require('fs').readFileSync() loads the whole file into memory. That means that the amount of memory required to perform operations will be almost equivalent to the file size. We should avoid these for anything larger than 50mbs
We can easily track the amount of memory used by a function by placing these lines of code after the function invocation :
const used = process.memoryUsage().heapUsed / 1024 / 1024;
console.log(
`The script uses approximately ${Math.round(used * 100) / 100} MB`
);
Right now the best way to read particular lines from a large file is using node's readline. The documentation has amazing examples.
This is my favorite way of going through a file, a simple native solution for a progressive (as in not a "slurp" or all-in-memory way) file read with modern async/await. It's a solution that I find "natural" when processing large text files without having to resort to the readline package or any non-core dependency.
// Text after the last line break seen so far (an incomplete line).
let buf = '';
// With an encoding set, the stream yields strings, so a multi-byte character
// split across two chunks is not corrupted.
for await ( const chunk of fs.createReadStream('myfile', { encoding: 'utf8' }) ) {
    const lines = (buf + chunk).split(/\r?\n/);
    // The final piece is incomplete until the next chunk arrives.
    buf = lines.pop() ?? '';
    for( const line of lines ) {
        console.log(line);
    }
}
if(buf.length) console.log(buf); // last line, if file does not end with newline
You can adjust encoding in the fs.createReadStream or use chunk.toString(<arg>). Also this let's you better fine-tune the line splitting to your taste, ie. use .split(/\n+/) to skip empty lines and control the chunk size with fs.createReadStream('myfile', { highWaterMark: <chunkSize> }).
Don't forget to create a function like processLine(line) to avoid repeating the line processing code twice due to the ending buf leftover. Unfortunately, the ReadStream instance does not update its end-of-file flags in this setup, so there's no way, afaik, to detect within the loop that we're in the last iteration without some more verbose tricks like comparing the file size from a fs.Stats() with .bytesRead. Hence the final buf processing solution, unless you're absolutely sure your file ends with a newline \n, in which case the for await loop should suffice.
Performance Considerations
Chunk sizes are important for performance, the default is 64k for text files and, for multi MB files, larger chunks can improve speed by an order of magnitude.
The above snippet runs at least the same speed (or even 5% faster sometimes) as code based on NodeJS v18's fs.readLine() or based on the readline module (the accepted answer), once you tune highWaterMark to something that your machine can handle, ie. setting it to the same size as the file, if your available memory allows it, is the fastest.
In any case, any of NodeJS line-reading answers here are an order of magnitude slower than the Perl or native *Nix solutions.
Similar alternatives
★ If you prefer the evented asynchronous version, this would be it:
// Text after the last line break seen so far (an incomplete line).
let buf = '';
// With an encoding set, 'data' delivers strings, so a multi-byte character
// split across two chunks is not corrupted.
fs.createReadStream('myfile', { encoding: 'utf8' })
    .on('data', chunk => {
        const lines = (buf + chunk).split(/\r?\n/);
        // The final piece is incomplete until the next chunk arrives.
        buf = lines.pop();
        for( const line of lines ) {
            console.log(line);
        }
    })
    .on('end', () => buf.length && console.log(buf) );
★ Now if you don't mind importing the stream core package, then this is the equivalent piped stream version, which allows for chaining transforms like gzip decompression:
const { Writable } = require('stream');
const { StringDecoder } = require('string_decoder');
// Decodes incoming Buffers while holding back incomplete multi-byte
// characters until the rest arrives.
const decoder = new StringDecoder('utf8');
// Text after the last line break seen so far (an incomplete line).
let buf = '';
fs.createReadStream('myfile').pipe(
    new Writable({
        write: (chunk, enc, next) => {
            const lines = (buf + decoder.write(chunk)).split(/\r?\n/);
            // The final piece is incomplete until the next chunk arrives.
            buf = lines.pop();
            for (const line of lines) {
                console.log(line);
            }
            next();
        }
    })
).on('finish', () => {
    // Flush whatever the decoder still holds, then print the last line.
    buf += decoder.end();
    if (buf.length) console.log(buf);
});
I have a little module which does this well and is used by quite a few other projects: npm readline. Note that in node v10 there is a native readline module, so I republished my module as linebyline: https://www.npmjs.com/package/linebyline
If you don't want to use the module, the function is very simple:
var fs = require('fs'),
EventEmitter = require('events').EventEmitter,
util = require('util'),
newlines = [
13, // \r
10 // \n
];
/**
 * Streams `file` and re-emits its content line by line.
 *
 * Can be called with or without `new`. The returned object is an
 * EventEmitter that emits:
 *   'open'  (fd)          - forwarded from the underlying read stream
 *   'line'  (text, count) - one per non-empty line; `count` is its line number
 *   'error' (err)         - forwarded from the underlying read stream
 *   'end'                 - after the last line has been emitted
 *   'close'               - forwarded from the underlying read stream
 *
 * Empty lines are counted but not emitted.
 *
 * @param {string} file - path handed to fs.createReadStream
 * @param {Object} [opts] - accepted for compatibility; not read by this function
 */
var readLine = module.exports = function(file, opts) {
  // Forward opts too, so calling without `new` behaves like calling with it.
  if (!(this instanceof readLine)) return new readLine(file, opts);
  EventEmitter.call(this);
  opts = opts || {};
  var self = this,
    line = [],
    lineCount = 0,
    prevWasCR = false,
    emit = function(line, count) {
      // Buffer.from replaces the deprecated `new Buffer(array)` constructor.
      self.emit('line', Buffer.from(line).toString(), count);
    };
  this.input = fs.createReadStream(file);
  this.input.on('open', function(fd) {
    self.emit('open', fd);
  })
  .on('data', function(data) {
    for (var i = 0; i < data.length; i++) {
      var byte = data[i];
      // Treat "\r\n" as ONE line break (also when the pair is split across
      // two chunks); otherwise every CRLF line would bump the counter twice.
      if (byte === 10 && prevWasCR) {
        prevWasCR = false;
        continue;
      }
      prevWasCR = byte === 13;
      if (0 <= newlines.indexOf(byte)) { // Newline char was found.
        lineCount++;
        if (line.length) emit(line, lineCount);
        line = []; // Empty buffer.
      } else {
        line.push(byte); // Buffer new line data.
      }
    }
  }).on('error', function(err) {
    self.emit('error', err);
  }).on('end', function() {
    // Emit last line if anything left over since EOF won't trigger it.
    if (line.length) {
      lineCount++;
      emit(line, lineCount);
    }
    self.emit('end');
  }).on('close', function() {
    self.emit('close');
  });
};
util.inherits(readLine, EventEmitter);
Another solution is to run the logic via the sequential executor nsynjs. It reads the file line by line using the node readline module, and it doesn't use promises or recursion, so it is not going to fail on large files. Here is what the code looks like:
var nsynjs = require('nsynjs');
var textFile = require('./wrappers/nodeReadline').textFile; // this file is part of nsynjs
// Opens 'path/to/file', logs every line it yields, then closes the handle.
// This function is not called directly; it is handed to nsynjs.run() below.
// NOTE(review): at module scope this declaration shadows Node's global `process` — confirm that is acceptable here.
function process(textFile) {
var fh = new textFile();
fh.open('path/to/file');
var s;
// Keep reading until readLine() returns a result whose `.data` is undefined.
// NOTE(review): `nsynjsCtx` is not declared in this snippet — presumably supplied by nsynjs at run time; confirm.
while (typeof(s = fh.readLine(nsynjsCtx).data) != 'undefined')
console.log(s);
fh.close();
}
// Hand process() to nsynjs together with the textFile wrapper.
// NOTE(review): presumably `{}` is the `this` object and the trailing function runs on completion — confirm against the nsynjs docs.
var ctx = nsynjs.run(process,{},textFile,function () {
console.log('done');
});
The code above is based on this example: https://github.com/amaksr/nsynjs/blob/master/examples/node-readline/index.js
i use this:
/**
 * Makes `stream` emit a 'line' event for every delimiter-terminated piece of
 * the data it produces.
 *
 * Each emitted line still ends with the text matched by the delimiter. Any
 * text left over when the stream ends is emitted as a final 'line' without
 * a delimiter.
 *
 * @param {EventEmitter} stream - emitter producing 'data' and 'end' events
 * @param {RegExp} [re=/\n/] - delimiter pattern
 */
function emitLines(stream, re){
  // Default the delimiter only when none was supplied.
  re = re || /\n/;
  var buffer = '';
  stream.on('data', stream_data);
  stream.on('end', stream_end);

  function stream_data(data){
    buffer += data;
    flush();
  }//stream_data

  function stream_end(){
    if(buffer) stream.emit('line', buffer);
  }//stream_end

  // Emit every complete line currently sitting in `buffer`.
  function flush(){
    var match;
    // A caller-supplied /g or /y regex keeps state between exec() calls;
    // always search from the start of the (shrinking) buffer.
    re.lastIndex = 0;
    while(match = re.exec(buffer)){
      // An empty match would never consume input and would loop forever.
      if(match[0].length === 0) break;
      var index = match.index + match[0].length;
      stream.emit('line', buffer.substring(0, index));
      buffer = buffer.substring(index);
      re.lastIndex = 0;
    }
  }//flush
}//emitLines
Use this function on a stream and listen to the line events that it will emit.
gr-
While you should probably use the readline module as the top answer suggests, readline appears to be oriented toward command line interfaces rather than line reading. It's also a little bit more opaque regarding buffering. (Anyone who needs a streaming line oriented reader probably will want to tweak buffer sizes). The readline module is ~1000 lines while this, with stats and tests, is 34.
const EventEmitter = require('events').EventEmitter;
/**
 * Splits the text chunks of a stream into lines and re-emits each one as a
 * 'line' event, while keeping simple character and line counters.
 *
 * The stream must deliver string chunks (e.g. opened with an encoding).
 * When the stream ends, the counters are logged to the console.
 */
class LineReader extends EventEmitter{
  /**
   * @param {EventEmitter} f - source emitting 'data' (string) and 'end'
   * @param {string} [delim='\n'] - line delimiter
   */
  constructor(f, delim='\n'){
    super();
    this.totalChars = 0;
    this.totalLines = 0;
    this.leftover = '';
    f.on('data', (chunk)=>{
      this.totalChars += chunk.length;
      const lines = (this.leftover + chunk).split(delim);
      // The last piece is always incomplete: '' when the chunk ended on a
      // delimiter, otherwise the start of a line that continues in the next
      // chunk. Popping it unconditionally avoids emitting a bogus empty line.
      this.leftover = lines.pop();
      this.totalLines += lines.length;
      for (const l of lines) this.onLine(l);
    });
    f.on('end', ()=>{
      // Input that does not end with the delimiter leaves its final line
      // here; emit it so it is not silently dropped.
      if (this.leftover){
        const last = this.leftover;
        this.leftover = '';
        this.totalLines += 1;
        this.onLine(last);
      }
      console.log('chars', this.totalChars, 'lines', this.totalLines);
    });
  }
  /** Emits one 'line' event carrying `l`. */
  onLine(l){
    this.emit('line', l);
  }
}
// Command-line driver: <script> <file> [delimiter]
// Prints every line of <file>; with no delimiter argument LineReader's
// default is used.
const [, , inputPath, delim] = process.argv;
const f = require('fs').createReadStream(inputPath, 'utf8');
const lineReader = new LineReader(f, delim);
lineReader.on('line', function printLine(line) {
  console.log(line);
});
Here's an even shorter version, without the stats, at 19 lines:
/**
 * Minimal line splitter: re-emits the text chunks of a stream as 'line'
 * events. The stream must deliver string chunks.
 */
class LineReader extends require('events').EventEmitter{
  /**
   * @param {EventEmitter} f - source emitting 'data' (string) and 'end'
   * @param {string} [delim='\n'] - line delimiter
   */
  constructor(f, delim='\n'){
    super();
    this.leftover = '';
    f.on('data', (chunk)=>{
      const lines = (this.leftover + chunk).split(delim);
      // The last piece is always incomplete ('' if the chunk ended on a
      // delimiter); popping it unconditionally avoids a bogus empty line.
      this.leftover = lines.pop();
      for (const l of lines)
        this.emit('line', l);
    });
    f.on('end', ()=>{
      // Emit a final line that was not terminated by the delimiter.
      if (this.leftover){
        const last = this.leftover;
        this.leftover = '';
        this.emit('line', last);
      }
    });
  }
}
const fs = require("fs")
// Read './file' as UTF-8, log it, and build an HTML fragment with a '<br>'
// after every line. `innerContent` is only built here; the snippet does not
// use it further.
fs.readFile('./file', 'utf-8', (err, data) => {
  // Surface the real I/O error instead of failing later on `data` being
  // undefined.
  if (err) throw err;
  console.log("Asynchronous read: " + data.toString());
  // Start from '' - an uninitialised variable would prefix the result with
  // the text "undefined".
  let innerContent = '';
  const lines = data.toString().split('\n');
  for (const line of lines) {
    innerContent += line + '<br>';
  }
});
I wrapped the whole logic of daily line processing into an npm module: line-kit
https://www.npmjs.com/package/line-kit
// Example: count the lines of /etc/issue.
// NOTE(review): presumably line-kit calls the first callback once per line
// and the second once the stream is exhausted — confirm against its README.
var count = 0;
const lineKit = require('line-kit');
const issueStream = require('fs').createReadStream('/etc/issue');
const onLine = () => {
  count += 1;
};
const onDone = () => {
  console.log(`seen ${count} lines`);
};
lineKit(issueStream, onLine, onDone);
I use the code below to read lines after verifying that the path is not a directory and is not included in the list of files that do not need to be checked.
// Scans every file matched by the '**' glob for a set of keywords.
// Directories and the paths listed in `exclude` are skipped. Each hit is
// logged, and the process exits with status 1 if any keyword was found,
// 0 otherwise.
(function () {
  var fs = require('fs');
  var glob = require('glob-fs')();
  var path = require('path');
  var result = 0;
  // Paths that must not be scanned.
  var exclude = ['LICENSE',
    path.join('e2e', 'util', 'db-ca', 'someother-file'),
    path.join('src', 'favicon.ico')];
  // Keywords to look for in every line.
  var patternString = [
    'trade',
    'order',
    'market',
    'securities'
  ];
  var files = glob.readdirSync('**');
  files.forEach((file) => {
    try {
      if (fs.lstatSync(file).isDirectory() || exclude.includes(file)) {
        return;
      }
      fs.readFileSync(file).toString().split(/\r?\n/).forEach((line) => {
        patternString.forEach((pattern) => {
          if (line.includes(pattern)) {
            console.log(file + ' contain `' + pattern + '` in line "' + line + '";');
            result = 1;
          }
        });
      });
    } catch (e) {
      // Best effort: report the unreadable file and carry on with the rest.
      console.log('Error:', e.stack);
    }
  });
  process.exit(result);
})();
I have looked through all the above answers; all of them use a third-party library to solve it. There is a simple solution in Node's API, e.g.:
const fs= require('fs')
// Stream the file chunk by chunk. The original snippet ended with `}))`,
// a stray parenthesis that made it a syntax error.
let stream = fs.createReadStream('<filename>', { autoClose: true })
stream.on('data', chunk => {
  // `row` holds the whole chunk decoded as ASCII text; it is not split
  // into individual lines here.
  let row = chunk.toString('ascii')
})

Categories