I'm trying to copy a sqlite database from the data folder in my extension directory, to the profile folder, in order to use it.
So for now, I'm trying with that:
const {Cc, Ci, Cu} = require("chrome");
const {NetUtils} = Cu.import("resource://gre/modules/NetUtil.jsm");
const data = require('sdk/self').data;
Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/FileUtils.jsm");
var file = Cc["#mozilla.org/file/directory_service;1"].
getService(Ci.nsIProperties).
get("TmpD", Ci.nsIFile);
file.append("searchEngines.sqlite");
file.createUnique(Ci.nsIFile.NORMAL_FILE_TYPE, 0666);
// Then, we need an output stream to our output file.
var ostream = Cc["#mozilla.org/network/file-output-stream;1"].createInstance(Ci.nsIFileOutputStream);
ostream.init(file, -1, -1, 0);
// Finally, we need an input stream to take data from.
var iStreamData = NetUtil.ioService.newChannel(data.url("searchEngines.sqlite"), null, null).open();
let istream = Cc["#mozilla.org/io/string-input-stream;1"].createInstance(Ci.nsIStringInputStream);
istream.setData(iStreamData, iStreamData.length);
NetUtil.asyncCopy(istream, ostream, function(aResult) {
console.log(aResult); // return 0
})
console.log(FileUtils.getFile("ProfD", ["searchEngines.sqlite"]).exists()); // return false
let dbConn = Services.storage.openDatabase(file);
The file seems to exist since the console.log(file.exists()) return FALSE and is not populated (the console.log(aResult) return 0).
Where is my mistake, and is there a better way to do that?
Besides that it uses sync I/O (opening the channel with .open instead of .asyncOpen), the NetUtil.asyncCopy operation is still async, meaning the code
NetUtil.asyncCopy(istream, ostream, function(aResult) {
console.log(aResult); // return 0
})
console.log(FileUtils.getFile("ProfD", ["searchEngines.sqlite"]).exists()); // return false
let dbConn = Services.storage.openDatabase(file);
will try to open the file before the copy likely finishes!
However, file.exists() will likely be true, because you already opened the file for writing. It's just that the file is still blank because the data copy isn't done (or even started) yet. (Actually, in your code it is false, because you're checking searchEngines.sqlite in ProfD and not TmpD, but if you correct that, the previous statement would apply.)
You can only use the file when/after your callback to .asyncCopy is done, e.g.
NetUtil.asyncCopy(istream, ostream, function(aResult) {
console.log(aResult);
console.log(FileUtils.getFile("ProfD", ["searchEngines.sqlite"]).exists()); // return false
let dbConn = Services.storage.openDatabase(file);
// ...
});
PS: You might want to .asyncOpen the channel, then use NetUtil.asyncFetch and pass the resulting stream to .asyncCopy to be truly async for smallish files, since this caches the contents in memory first.
For large files you could create a variant of the NetUtil.asyncFetch implementation that feeds the .outputStream end directly to NetUtil.asyncCopy. That is a bit more complicated, so I won't be writing this up in detail until somebody is truly interested in this and asks the corresponding question.
Edit, so here is how I'd write it:
const data = require('sdk/self').data;
Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/NetUtil.jsm");
/**
 * Asynchronously copies the resource at `url` into `file`.
 *
 * The resource is fetched with NetUtil.asyncFetch and then written to the
 * target with NetUtil.asyncCopy.
 *
 * @param {string} url       URL of the resource to copy (e.g. a data.url() value).
 * @param {nsIFile} file     Destination file.
 * @param {function} [callback] Called as callback(file, result) when the copy
 *                           has finished; `result` is the status that
 *                           NetUtil.asyncCopy reports.
 */
function copyDataURLToFile(url, file, callback) {
    NetUtil.asyncFetch(url, function(istream) {
        // Contract IDs start with "@"; "#mozilla.org/..." does not resolve.
        var ostream = Cc["@mozilla.org/network/file-output-stream;1"].
            createInstance(Ci.nsIFileOutputStream);
        // -1/-1 selects the default open flags and permissions; DEFER_OPEN
        // postpones opening the file until the first write.
        ostream.init(file, -1, -1, Ci.nsIFileOutputStream.DEFER_OPEN);
        NetUtil.asyncCopy(istream, ostream, function(result) {
            callback && callback(file, result);
        });
    });
}
// Copy the bundled database into the temporary directory and report the
// outcome once the asynchronous copy has completed.
var file = Services.dirsvc.get("TmpD", Ci.nsIFile);
file.append("searchEngines.sqlite");
// The bundled resource is "searchEngines.sqlite" (plural), the same name as
// the target file above.
copyDataURLToFile(data.url("searchEngines.sqlite"), file, function(file, result) {
    console.log(result);
    console.log(file.exists());
    console.log(file.fileSize);
});
Try using OS.File; it's much more straightforward.
Cu.import("resource://gre/modules/FileUtils.jsm");
Cu.import("resource://gre/modules/Services.jsm"); // provides Services.storage
Cu.import("resource://gre/modules/osfile.jsm");

// Keep the nsIFile objects around: OS.File works with path strings, but
// Services.storage.openDatabase() expects an nsIFile.
var fromFile = FileUtils.getFile("ProfD", ["searchEngines.sqlite"]);
var toFile = FileUtils.getFile("TmpD", ["searchEngines.sqlite"]);
var fromPath = fromFile.path;
var toPath = toFile.path;

// OS.File.copy performs the copy off the main thread and returns a promise.
var promise = OS.File.copy(fromPath, toPath);
var dbConn;
promise.then(
    function() {
        // Only open the database once the copy has actually finished.
        alert('success will now open connection');
        dbConn = Services.storage.openDatabase(toFile);
    },
    function(aReason) {
        console.log('promise rejected', aReason);
        alert('copy failed, see console for details');
    }
);
Related
I have a file which stores many JavaScript objects in JSON form and I need to read the file, create each of the objects, and do something with them (insert them into a db in my case). The JavaScript objects can be represented in one of two formats:
Format A:
[{name: 'thing1'},
....
{name: 'thing999999999'}]
or Format B:
{name: 'thing1'} // <== My choice.
...
{name: 'thing999999999'}
Note that the ... indicates a lot of JSON objects. I am aware I could read the entire file into memory and then use JSON.parse() like this:
// Loads the complete file into memory, parses it in one go and prints the
// result. Read errors are rethrown from the callback.
fs.readFile(filePath, 'utf-8', function onFileRead(err, fileContents) {
    if (err) {
        throw err;
    }
    var parsed = JSON.parse(fileContents);
    console.log(parsed);
});
However, the file could be really large, I would prefer to use a stream to accomplish this. The problem I see with a stream is that the file contents could be broken into data chunks at any point, so how can I use JSON.parse() on such objects?
Ideally, each object would be read as a separate data chunk, but I am not sure on how to do that.
// Naive streaming attempt: parse every chunk as if it were one JSON object.
var importStream = fs.createReadStream(filePath, {flags: 'r', encoding: 'utf-8'});
importStream.on('data', function(chunk) {
    // NOTE: a chunk is an arbitrary slice of the file, not one object, so
    // JSON.parse throws whenever a chunk boundary falls inside an object.
    // This is exactly the problem being asked about.
    var pleaseBeAJSObject = JSON.parse(chunk);
    // insert pleaseBeAJSObject in a database
});
// 'end' passes no arguments to its listener.
importStream.on('end', function() {
    console.log("Woot, imported objects into the database!");
});
Note, I wish to prevent reading the entire file into memory. Time efficiency does not matter to me. Yes, I could try to read a number of objects at once and insert them all at once, but that's a performance tweak - I need a way that is guaranteed not to cause a memory overload, no matter how many objects are contained in the file.
I can choose to use FormatA or FormatB or maybe something else, just please specify in your answer. Thanks!
To process a file line-by-line, you simply need to decouple the reading of the file and the code that acts upon that input. You can accomplish this by buffering your input until you hit a newline. Assuming we have one JSON object per line (basically, format B):
// Read the file as UTF-8 text and accumulate it in `buf`; complete lines are
// handed to processLine() by pump().
var stream = fs.createReadStream(filePath, {flags: 'r', encoding: 'utf-8'});
var buf = '';
stream.on('data', function(d) {
    buf += d.toString(); // when data is read, stash it in a string buffer
    pump(); // then process the buffer
});
stream.on('end', function() {
    // pump() only emits newline-terminated lines, so a final line without a
    // trailing newline would otherwise be silently dropped.
    if (buf.length > 0) {
        processLine(buf);
        buf = '';
    }
});
/**
 * Drains every complete (newline-terminated) line from the shared `buf`
 * string, passing each non-empty one to processLine(). Whatever follows the
 * last newline stays in `buf` until more data arrives.
 */
function pump() {
    var newlineAt = buf.indexOf('\n');
    while (newlineAt !== -1) {
        // A newline at index 0 means an empty line: nothing to hand off.
        if (newlineAt > 0) {
            processLine(buf.substring(0, newlineAt));
        }
        // Drop the consumed line (or the lone newline) from the buffer.
        buf = buf.substring(newlineAt + 1);
        newlineAt = buf.indexOf('\n');
    }
}
/**
 * Handles one line of input: strips a trailing carriage return (CRLF files),
 * skips empty lines, and parses the rest as JSON.
 *
 * @param {string} line A single line without its terminating newline.
 * @throws {SyntaxError} If the line is not valid JSON.
 */
function processLine(line) {
    var text = line.endsWith('\r') ? line.slice(0, -1) : line;
    if (text.length === 0) {
        return; // nothing to parse on an empty line
    }
    console.log(JSON.parse(text)); // do something with the data here!
}
Each time the file stream receives data from the file system, it's stashed in a buffer, and then pump is called.
If there's no newline in the buffer, pump simply returns without doing anything. More data (and potentially a newline) will be added to the buffer the next time the stream gets data, and then we'll have a complete object.
If there is a newline, pump slices off the buffer from the beginning to the newline and hands it off to processLine. It then checks again if there's another newline in the buffer (the while loop). In this way, we can process all of the lines that were read in the current chunk.
Finally, processLine is called once per input line. If present, it strips off the carriage return character (to avoid issues with line endings – LF vs CRLF), and then calls JSON.parse on the line. At this point, you can do whatever you need to with your object.
Note that JSON.parse is strict about what it accepts as input; you must quote your identifiers and string values with double quotes. In other words, {name:'thing1'} will throw an error; you must use {"name":"thing1"}.
Because no more than a chunk of data will ever be in memory at a time, this will be extremely memory efficient. It will also be extremely fast. A quick test showed I processed 10,000 rows in under 15ms.
Just as I was thinking that it would be fun to write a streaming JSON parser, I also thought that maybe I should do a quick search to see if there's one already available.
Turns out there is.
JSONStream "streaming JSON.parse and stringify"
Since I just found it, I've obviously not used it, so I can't comment on its quality, but I'll be interested to hear if it works.
It does work. Consider the following JavaScript, which uses _.isString:
// Parse the incoming stream with JSONStream and, for every parsed value,
// print its type and whether lodash/underscore considers it a string.
stream
    .pipe(JSONStream.parse('*'))
    .on('data', function (d) {
        var kind = typeof d;
        console.log(kind);
        console.log("isString: " + _.isString(d));
    });
This will log objects as they come in if the stream is an array of objects. Therefore the only thing being buffered is one object at a time.
As of October 2014, you can just do something like the following (using JSONStream) - https://www.npmjs.org/package/JSONStream
var fs = require('fs');
var JSONStream = require('JSONStream');

/**
 * Opens myData.json and pipes it through a JSONStream parser.
 *
 * @returns {Stream} The parser stream that myData.json is piped into.
 */
var getStream = function () {
    var jsonData = 'myData.json',
        stream = fs.createReadStream(jsonData, { encoding: 'utf8' }),
        parser = JSONStream.parse('*');
    return stream.pipe(parser);
};

// NOTE(review): .pipe() does not forward 'error' events, so this handler
// presumably only sees errors raised by the last stream in the chain.
getStream().pipe(MyTransformToDoWhateverProcessingAsNeeded).on('error', function (err) {
    // handle any errors
});
To demonstrate with a working example:
npm install JSONStream event-stream
data.json:
{
"greeting": "hello world"
}
hello.js:
var fs = require('fs');
var JSONStream = require('JSONStream');
var es = require('event-stream');

// Builds the pipeline: data.json (read as UTF-8) -> JSONStream parser.
var getStream = function () {
    var source = fs.createReadStream('data.json', { encoding: 'utf8' });
    return source.pipe(JSONStream.parse('*'));
};

// Print every value that comes out of the parser.
var parsed = getStream();
parsed.pipe(es.mapSync(function (data) {
    console.log(data);
}));
$ node hello.js
// hello world
I had a similar requirement: I needed to read a large JSON file in Node.js, process the data in chunks, call an API, and save the results in MongoDB.
inputFile.json is like:
{
"customers":[
{ /*customer data*/},
{ /*customer data*/},
{ /*customer data*/}....
]
}
Now I used JSONStream and event-stream to achieve this synchronously, i.e. processing one customer at a time.
var fs = require("fs");
var JSONStream = require("JSONStream");
var es = require("event-stream");

// Stream the file and emit each element of the top-level "customers" array
// as a separate object.
var fileStream = fs.createReadStream(filePath, { encoding: "utf8" });
fileStream.pipe(JSONStream.parse("customers.*")).pipe(
    // es.through(write, end): both handlers belong to es.through, not to
    // .pipe(), whose second argument is an options object.
    es.through(
        function write(data) {
            console.log("printing one customer object read from file ::");
            console.log(data);
            // Hold back further customers until this one has been saved.
            this.pause();
            processOneCustomer(data, this);
            return data;
        },
        function end() {
            console.log("stream reading ended");
            this.emit("end");
        }
    )
);

/**
 * Saves one customer and resumes the paused stream afterwards.
 *
 * @param {object} data   The customer object read from the file.
 * @param {Stream} stream The paused through-stream to resume.
 */
function processOneCustomer(data, stream) {
    // NOTE(review): `data` is not passed to DataModel.save here, as in the
    // original snippet - confirm how DataModel receives the customer.
    DataModel.save(function(err, dataModel) {
        if (err) {
            console.error("failed to save customer:", err);
        }
        // Resume even after a failure so that the stream cannot stall.
        stream.resume();
    });
}
I realize that you want to avoid reading the whole JSON file into memory if possible, however if you have the memory available it may not be a bad idea performance-wise. Using node.js's require() on a json file loads the data into memory really fast.
I ran two tests to see what the performance looked like on printing out an attribute from each feature from a 81MB geojson file.
In the 1st test, I read the entire geojson file into memory using var data = require('./geo.json'). That took 3330 milliseconds and then printing out an attribute from each feature took 804 milliseconds for a grand total of 4134 milliseconds. However, it appeared that node.js was using 411MB of memory.
In the second test, I used @arcseldon's answer with JSONStream + event-stream. I modified the JSONPath query to select only what I needed. This time the memory never went higher than 82MB, however, the whole thing now took 70 seconds to complete!
I wrote a module that can do this, called BFJ. Specifically, the method bfj.match can be used to break up a large stream into discrete chunks of JSON:
const bfj = require('bfj');
const fs = require('fs');

const stream = fs.createReadStream(filePath);

// Select every top-level value (depth 0) of the newline-delimited JSON input.
bfj.match(stream, (key, value, depth) => depth === 0, { ndjson: true })
    .on('data', object => {
        // do whatever you need to do with object
    })
    .on('dataError', error => {
        // a syntax error was found in the JSON
    })
    .on('error', error => {
        // some kind of operational error occurred
    })
    // 'end' carries no argument; errors arrive via the two handlers above.
    .on('end', () => {
        // finished processing the stream
    });
Here, bfj.match returns a readable, object-mode stream that will receive the parsed data items, and is passed 3 arguments:
A readable stream containing the input JSON.
A predicate that indicates which items from the parsed JSON will be pushed to the result stream.
An options object indicating that the input is newline-delimited JSON (this is to process format B from the question, it's not required for format A).
Upon being called, bfj.match will parse JSON from the input stream depth-first, calling the predicate with each value to determine whether or not to push that item to the result stream. The predicate is passed three arguments:
The property key or array index (this will be undefined for top-level items).
The value itself.
The depth of the item in the JSON structure (zero for top-level items).
Of course a more complex predicate can also be used as necessary according to requirements. You can also pass a string or a regular expression instead of a predicate function, if you want to perform simple matches against property keys.
If you have control over the input file, and it's an array of objects, you can solve this more easily. Arrange to output the file with each record on one line, like this:
[
{"key": value},
{"key": value},
...
]
This is still valid JSON.
Then, use the node.js readline module to process them one line at a time.
var fs = require("fs");
var lineReader = require('readline').createInterface({
input: fs.createReadStream("input.txt")
});
lineReader.on('line', function (line) {
line = line.trim();
if (line.charAt(line.length-1) === ',') {
line = line.substr(0, line.length-1);
}
if (line.charAt(0) === '{') {
processRecord(JSON.parse(line));
}
});
function processRecord(record) {
// Process the records one at a time here!
}
I solved this problem using the split npm module. Pipe your stream into split, and it will "Break up a stream and reassemble it so that each line is a chunk".
Sample code:
var fs = require('fs')
, split = require('split')
;
var stream = fs.createReadStream(filePath, {flags: 'r', encoding: 'utf-8'});
var lineStream = stream.pipe(split());
linestream.on('data', function(chunk) {
var json = JSON.parse(chunk);
// ...
});
Using @josh3736's answer, but for ES2021 and Node.js 16+ with async/await + AirBnb rules:
import fs from 'node:fs';
const file = 'file.json';
/**
 * @callback itemProcessorCb
 * @param {object} item The current item
 * @returns {Promise<void>|void}
 */

/**
 * Process a line-oriented JSON stream item by item.
 *
 * The input must contain one JSON value per line. Each line may end with a
 * comma, and lines that are blank or consist only of "[" or "]" are skipped,
 * so both newline-delimited JSON and a JSON array written one element per
 * line are accepted. Text after the last newline of a chunk is kept and
 * joined with the next chunk, so items split across chunks are handled.
 *
 * @param {import('fs').ReadStream} readable The readable stream (any async
 *   iterable of string chunks works)
 * @param {itemProcessorCb} itemProcessor A function to process each item
 * @returns {Promise<void>} Resolves when every item has been processed;
 *   rejects if a complete line is not valid JSON or the processor fails.
 */
async function processChunk(readable, itemProcessor) {
  // Parses one physical line; returns undefined when the line holds no item.
  const parseLine = (rawLine) => {
    // trim() also removes the "\r" of CRLF line endings.
    const line = rawLine.trim().replace(/\s*,$/, '');
    if (line === '' || line === '[' || line === ']') {
      return undefined;
    }
    return JSON.parse(line);
  };

  // Parses a batch of complete lines and processes their items in parallel.
  const processLines = async (lines) => {
    const items = lines.map(parseLine).filter((item) => item !== undefined);
    await Promise.all(items.map((item) => itemProcessor(item)));
    return items.length;
  };

  let pending = '';
  let total = 0;

  // eslint-disable-next-line no-restricted-syntax
  for await (const chunk of readable) {
    const lines = (pending + chunk).split('\n');
    // The last element is whatever follows the final newline: a partial line
    // (or '') that must wait for the next chunk.
    pending = lines.pop();
    total += await processLines(lines);
  }

  // The stream ended: what is left is the final line (no trailing newline).
  total += await processLines([pending]);

  console.log(`${total} items processed.`);
}
/**
 * Process each item (here: just print it).
 *
 * @param {object} item The parsed item
 */
async function processItem(item) {
  console.log(item);
}

// Init
try {
  const readable = fs.createReadStream(file, {
    flags: 'r',
    encoding: 'utf-8',
  });
  // Await the promise so that read, parse and processor failures reach the
  // catch below instead of becoming an unhandled rejection.
  await processChunk(readable, processItem);
} catch (error) {
  console.error(error.message);
}
For a JSON like:
[
{ "name": "A", "active": true },
{ "name": "B", "active": false },
...
]
// Download url1, collect the whole response body in memory and print it.
https.get(url1, function(response) {
    var data = "";
    response.on('data', function(chunk) {
        data += chunk.toString();
    })
    .on('end', function() {
        console.log(data);
    });
}).on('error', function(err) {
    // Without a listener, a failed request (DNS, refused connection, ...)
    // would be thrown as an unhandled 'error' event.
    console.error(err);
});
I think you need to use a database. MongoDB is a good choice in this case because it is JSON compatible.
UPDATE:
You can use mongoimport tool to import JSON data into MongoDB.
mongoimport --collection collection --file collection.json
Closed. This question is not reproducible or was caused by typos. It is not currently accepting answers.
This question was caused by a typo or a problem that can no longer be reproduced. While similar questions may be on-topic here, this one was resolved in a way less likely to help future readers.
Closed 3 years ago.
Improve this question
Anyone able to explain what I'm doing wrong with my use of asynchronous functions in Javascript?
Basically, I must use an asynchronous function in my Node.js code to grab an open port for me to use. There is a local variable that is being set outside of the asynchronous call that I can access/use just fine until I await for the asynchronous function to return. After that, the local variable is undefined.
(async () => {
console.log("CHECK AFTER ASYNC1: " + csvFilePath);
// First, grab a valid open port
var port;
while (!port || portsInProcess.indexOf(port) >= 0) {
console.log("CHECK AFTER ASYNC2: " + csvFilePath);
port = await getPort();
console.log(port);
}
console.log("CHECK AFTER ASYNC3: " + csvFilePath);
portsInProcess.push(port);
// ... more code below...
Checks #1 and 2 are fine for the csvFilePath variable, but check #3 shows that it's undefined. The port number, however, is fine. This leads me to believe that there's some weirdness with asynchronous function calls in Javascript that ONLY affects local variables; the global variables I use further down are just fine. Unfortunately here, I cannot make the csvFilePath variable global since that will introduce race conditions on that variable too (which I'm preventing elsewhere; the while loop is to help prevent race conditions on the port number, which is basically unused in my simple tests on localhost).
Just in case it's helpful, here's the output I'm getting:
CHECK AFTER ASYNC1: data/text/crescent_topics.csv
CHECK AFTER ASYNC2: data/text/crescent_topics.csv
58562
CHECK AFTER ASYNC3: null
It might also be worth mentioning it's really only those first few lines of code to dynamically grab an open port that are the lines of code I added. The code that I had before which used a fixed port number worked just fine (including this csvFilePath variable remaining stable).
My understanding of the await functionality was that it makes the asynchronous function act more or less synchronously, which is what seems to be happening here; the code I have farther down that uses the port number is not running until after the port number is set. (But even if that wasn't the case, why is the csvFilePath variable being unset since I'm not altering it or using it in any way here?)
EDIT: Here's some more code to provide additional context
var spawn = require('child_process').spawn;
var fs = require("fs");
var async = require('async');
var zmq = require('zmq');
var readline = require('readline');
const getPort = require('get-port');
/* Export the Nebula class */
module.exports = Nebula;
/* Location of the data for the Crescent dataset */
var textDataPath = "data/text/";
var crescentRawDataPath = textDataPath + "crescent_raw";
var crescentTFIDF = textDataPath + "crescent tfidf.csv";
var crescentTopicModel = textDataPath + "crescent_topics.csv";
/* Location of the data for the UK Health dataset */
var ukHealthRawDataPath = textDataPath + "uk_health_raw";
var ukHealthTFIDF = textDataPath + "uk_health.csv";
/* Map CSV files for text data to raw text location */
var textRawDataMappings = {};
textRawDataMappings[crescentTFIDF] = crescentRawDataPath;
textRawDataMappings[crescentTopicModel] = crescentRawDataPath;
textRawDataMappings[ukHealthTFIDF] = ukHealthRawDataPath;
textRawDataMappings[textDataPath + "uk_health_sm.csv"] = ukHealthRawDataPath;
/* The pipelines available to use */
var flatTextUIs = ["cosmos", "composite", "sirius", "centaurus"];
var pipelines = {
andromeda: {
file: "pipelines/andromeda.py",
defaultData: "data/highD/Animal_Data_study.csv"
},
cosmos: {
file: "pipelines/cosmos.py",
defaultData: textDataPath + "crescent tfidf.csv"
},
sirius: {
file: "pipelines/sirius.py",
defaultData: "data/highD/Animal_Data_paper.csv"
},
centaurus: {
file: "pipelines/centaurus.py",
defaultData: "data/highD/Animal_Data_paper.csv"
},
twitter: {
file: "pipelines/twitter.py",
},
composite: {
file: "pipelines/composite.py",
defaultData: textDataPath + "crescent tfidf.csv"
},
elasticsearch: {
file: "pipelines/espipeline.py",
args: []
}
};
/* The locations of the different types of datasets on the server */
var textDataFolder = "data/text/";
var highDDataFolder = "data/highD/";
var customCSVFolder = "data/customCSV/";
var sirius_prototype = 2;
// An array to track the ports being processed to eliminate race conditions
// as much as possible
var portsInProcess = [];
var nextSessionNumber = 0;
var usedSessionNumbers = [];
/* Nebula class constructor */
function Nebula(io, pipelineAddr) {
/* This allows you to use "Nebula(obj)" as well as "new Nebula(obj)" */
if (!(this instanceof Nebula)) {
return new Nebula(io);
}
/* The group of rooms currently active, each with a string identifier
* Each room represents an instance of a visualization that can be shared
* among clients.
*/
this.rooms = {};
this.io = io;
/* For proper use in callback functions */
var self = this;
/* Accept new WebSocket clients */
io.on('connection', function(socket) {
// Skipped some irrelevant Socket.io callbacks
**// Use the csvFilePath to store the name of a user-defined CSV file
var csvFilePath = null;**
/* Helper function to tell the client that the CSV file is now ready for them
 * to use. They are also sent a copy of the data.
 *
 * Emits "csvDataReady" immediately, then reads the file line by line. The
 * first line supplies the column names; every later line becomes an object
 * keyed by those names. When the file has been read, "csvDataReadComplete"
 * is emitted with the rows and the name of the first column.
 *
 * Lines are split on every "," - quoted fields containing commas are not
 * supported.
 */
var csvFileReady = function(csvFilePath) {
    // Let the client know that the CSV file is now ready to be used on
    // the server
    socket.emit("csvDataReady");

    // Prepare to parse the CSV file
    var csvData = [];
    const rl = readline.createInterface({
        input: fs.createReadStream(csvFilePath),
        crlfDelay: Infinity
    });

    // Print any error messages we encounter
    rl.on('error', function (err) {
        console.log("Error while parsing CSV file: " + csvFilePath);
        console.log(err);
    });

    // Read each line of the CSV file one at a time and parse it
    var columnHeaders = [];
    var firstColumnName;
    rl.on('line', function (data) {
        var dataColumns = data.split(",");

        // If we haven't saved any column names yet, do so first
        if (columnHeaders.length == 0) {
            columnHeaders = dataColumns;
            firstColumnName = columnHeaders[0];
        }
        // Process each individual line of data in the CSV file
        else {
            var dataObj = {};
            for (var i = 0; i < dataColumns.length; i++) {
                dataObj[columnHeaders[i]] = dataColumns[i];
            }
            csvData.push(dataObj);
        }
    });

    // All lines are read, file is closed now.
    rl.on('close', function () {
        // On certain OSs, like Windows, an extra, blank line may be read.
        // Such a line parses to an object with exactly one key whose value
        // is "". Check for this and remove it if it exists.
        var lastObservation = csvData[csvData.length-1];
        // Guard: a file with no data rows leaves csvData empty.
        if (lastObservation) {
            var lastObservationKeys = Object.keys(lastObservation);
            // Compare with ===; the former "length = 1" assigned to length
            // instead of testing the number of keys.
            if (lastObservationKeys.length === 1 && lastObservation[lastObservationKeys[0]] === "") {
                csvData.pop();
            }
        }

        // Provide the CSV data to the client
        socket.emit("csvDataReadComplete", csvData, firstColumnName);
    });
};
**/* Allows the client to specify a CSV file already on the server to use */
socket.on("setCSV", function(csvName) {
console.log("setCSV CALLED");
csvFilePath = "data/" + csvName;
csvFileReady(csvFilePath);
console.log("CSV FILE SET: " + csvFilePath);
});**
// Skipped some more irrelevant callbacks
/* a client/ a room. If the room doesn't next exist yet,
* initiate it and send the new room to the client. Otherwise, send
* the client the current state of the room.
*/
socket.on('join', function(roomName, user, pipeline, args) {
console.log("Join called for " + pipeline + " pipeline; room " + roomName);
socket.roomName = roomName;
socket.user = user;
socket.join(roomName);
console.log("CSV FILE PATH: " + csvFilePath);
var pipelineArgsCopy = [];
if (!self.rooms[roomName]) {
var room = {};
room.name = roomName;
room.count = 1;
room.points = new Map();
room.similarity_weights = new Map();
if (pipeline == "sirius" || pipeline == "centaurus") {
room.attribute_points = new Map();
room.attribute_similarity_weights = new Map();
room.observation_data = [];
room.attribute_data = [];
}
/* Create a pipeline client for this room */
console.log("CHECK BEFORE ASYNC: " + csvFilePath);
**// Here's the code snippet I provided above**
**(async () => {
console.log("CHECK AFTER ASYNC1: " + csvFilePath);
// First, grab a valid open port
var port;
while (!port || portsInProcess.indexOf(port) >= 0) {
console.log("CHECK AFTER ASYNC2: " + csvFilePath);
port = await getPort();
console.log(port);
}
console.log("CHECK AFTER ASYNC3: " + csvFilePath);**
portsInProcess.push(port);
console.log("CHECK AFTER ASYNC4: " + csvFilePath);
if (!pipelineAddr) {
var pythonArgs = ["-u"];
if (pipeline in pipelines) {
// A CSV file path should have already been set. This
// file path should be used to indicate where to find
// the desired file
console.log("LAST CHECK: " + csvFilePath);
if (!csvFilePath) {
csvFilePath = pipelines[pipeline].defaultData;
}
console.log("FINAL CSV FILE: " + csvFilePath);
pipelineArgsCopy.push(csvFilePath);
// If the UI supports reading flat text files, tell the
// pipeline where to find the files
if (flatTextUIs.indexOf(pipeline) >= 0) {
pipelineArgsCopy.push(textRawDataMappings[csvFilePath]);
}
// Set the remaining pipeline args
pythonArgs.push(pipelines[pipeline].file);
pythonArgs.push(port.toString());
if (pipeline != "twitter" && pipeline != "elasticsearch") {
pythonArgs = pythonArgs.concat(pipelineArgsCopy);
}
}
else {
pythonArgs.push(pipelines.cosmos.file);
pythonArgs.push(port.toString());
pythonArgs.push(pipelines.cosmos.defaultData);
pythonArgs.push(crescentRawDataPath);
}
// used in case of CosmosRadar
for (var key in args) {
if (args.hasOwnProperty(key)) {
pythonArgs.push("--" + key);
pythonArgs.push(args[key]);
}
}
// Dynamically determine which distance function should be
// used
if (pythonArgs.indexOf("--dist_func") < 0) {
if (pipeline === "twitter" || pipeline === "elasticsearch" ||
csvFilePath.startsWith(textDataPath)) {
pythonArgs.push("--dist_func", "cosine");
}
else {
pythonArgs.push("--dist_func", "euclidean");
}
}
console.log(pythonArgs);
console.log("");
var pipelineInstance = spawn("python2.7", pythonArgs, {stdout: "inherit"});
pipelineInstance.on("error", function(err) {
console.log("python2.7.exe not found. Trying python.exe");
pipelineInstance = spawn("python", pythonArgs,{stdout: "inherit"});
pipelineInstance.stdout.on("data", function(data) {
console.log("Pipeline: " + data.toString());
});
pipelineInstance.stderr.on("data", function(data) {
console.log("Pipeline error: " + data.toString());
});
});
/* Data received by node app from python process,
* ouptut this data to output stream(on 'data'),
* we want to convert that received data into a string and
* append it to the overall data String
*/
pipelineInstance.stdout.on("data", function(data) {
console.log("Pipeline STDOUT: " + data.toString());
});
pipelineInstance.stderr.on("data", function(data) {
console.log("Pipeline error: " + data.toString());
});
room.pipelineInstance = pipelineInstance;
}
/* Connect to the pipeline */
pipelineAddr = pipelineAddr || "tcp://127.0.0.1:" + port.toString();
room.pipelineSocket = zmq.socket('pair');
room.pipelineSocket.connect(pipelineAddr);
pipelineAddr = null;
portsInProcess.splice(portsInProcess.indexOf(port), 1);
/* Listens for messages from the pipeline */
room.pipelineSocket.on('message', function (msg) {
self.handleMessage(room, msg);
});
self.rooms[roomName] = socket.room = room;
invoke(room.pipelineSocket, "reset");
})();
}
else {
socket.room = self.rooms[roomName];
socket.room.count += 1;
if (pipeline == "sirius" || pipeline == "centaurus") {
socket.emit('update', sendRoom(socket.room, true), true);
socket.emit('update', sendRoom(socket.room, false), false);
}
else {
socket.emit('update', sendRoom(socket.room));
}
}
// Reset the csvFilePath to null for future UIs...
// I don't think this is actually necessary since
// csvFilePath is local to the "connections" message,
// which is called for every individual room
csvFilePath = null;
});
// Skipped the rest of the code; it's irrelevant
});
}
Full printouts:
setCSV CALLED
CSV FILE SET: data/text/crescent_topics.csv
Join called for sirius pipeline; room sirius0
CSV FILE PATH: data/text/crescent_topics.csv
CHECK BEFORE ASYNC: data/text/crescent_topics.csv
CHECK AFTER ASYNC1: data/text/crescent_topics.csv
CHECK AFTER ASYNC2: data/text/crescent_topics.csv
58562
CHECK AFTER ASYNC3: null
CHECK AFTER ASYNC4: null
LAST CHECK: null
FINAL CSV FILE: data/highD/Animal_Data_paper.csv
[ '-u',
'pipelines/sirius.py',
'58562',
'data/highD/Animal_Data_paper.csv',
undefined,
'--dist_func',
'euclidean' ]
Since bolding of code doesn't work, just search for the "**" to find the relevant pieces I've marked.
TL;DR There's a lot of communication happening between the client and server to establish an individualized communication that is directly linked to a specific dataset. The user has the ability to upload a custom CSV file to the system, but the code I'm working with right now is just trying to select an existing CSV file on the server, so I omitted the callbacks for the custom CSV file. Once the file has been selected, the client asks to "join" a room/session. The case I'm working with right now assumes that this is a new room/session as opposed to trying to do some shared room/session with another client. (Yes, I know, the code is messy for sharing rooms/sessions, but it works for the most part for now and is not my main concern.) Again, all this code worked just fine before the asynchronous code was added (and using a static port variable), so I don't know what changed so much by adding it.
Since you now included the whole code context, we can see that the issue is that the code after your async IIFE is what is causing the problem.
An async function returns a promise as soon as it hits an await. And, while that await is waiting for its asynchronous operation, the code following the call to the async function runs. In your case, you're essentially doing this:
var csvFilePath = someGoodValue;
(async () => {
port = await getPort();
console.log(csvFilePath); // this will be null
})();
csvFilePath = null; // this runs as soon as the above code hits the await
So, as soon as you hit your first await, the async function returns a promise and the code following it continues to run, hitting the line of code that resets your csvFilePath.
There are probably cleaner ways to restructure your code, but a simple thing you could do is this:
// Same flow, but the reset is attached to the promise returned by the async IIFE,
// so it only runs once that promise has settled.
var csvFilePath = someGoodValue;
(async () => {
port = await getPort();
console.log(csvFilePath); // still someGoodValue here: the reset below has not run yet
})().finally(() => {
csvFilePath = null; // deferred until the async function above has fulfilled or rejected
});
Note: .finally() is supported in node v10+. If you're using an older version, you can reset the path in both .then() and .catch().
Or, as your comment says, maybe you can just remove the resetting of the csvFilePath entirely.
I realized after some silly tests I tried that I'm resetting csvFilePath to null outside the asynchronous call, which is what is causing the error... Oops!
I'm building a web app that uses EvaporateJS to upload large files to Amazon S3 using Multipart Uploads. I noticed an issue where every time a new chunk was started the browser would freeze for ~2 seconds. I want the user to be able to continue to use my app while the upload is in progress, and this freezing makes that a bad experience.
I used Chrome's Timeline to look into what was causing this and found that it was SparkMD5's hashing. So I've moved the entire upload process into a Worker, which I thought would fix the issue.
Well the issue is now fixed in Edge and Firefox, but Chrome still has the exact same problem.
Here's a screenshot of my Timeline:
As you can see, during the freezes my main thread is doing basically nothing, with <8ms of JavaScript running during that time. All the work is occurring in my Worker thread, and even that is only running for ~600ms or so, not the 1386ms that my frame takes.
I'm really not sure what's causing the issue, are there any gotchas with Workers that I should be aware of?
Here's the code for my Worker:
var window = self; // For Worker-unaware scripts
// Minimal stand-in for `document` so that Evaporate can run inside a Worker.
// Only createElement() is provided; it returns an anchor-like object whose
// `href` setter parses the URL and exposes its components as plain properties.
var document = {
  createElement: function() {
    // Names of the URL components mirrored onto the element, in property order.
    var componentNames = ["protocol", "hostname", "pathname", "port", "search", "hash", "host"];
    // Last value assigned to `href`, returned verbatim by the getter.
    var storedHref = undefined;
    var anchor = {
      set href(url) {
        // Parse first: if the URL is invalid this throws before anything is changed.
        var parsed = new URL(url);
        componentNames.forEach(function(name) {
          anchor[name] = parsed[name];
        });
        storedHref = url;
      },
      get href() {
        return storedHref;
      }
    };
    // Every component starts out undefined until an href is assigned.
    componentNames.forEach(function(name) {
      anchor[name] = undefined;
    });
    return anchor;
  }
};
importScripts("/lib/sha256/sha256.min.js");
importScripts("/lib/spark-md5/spark-md5.min.js");
importScripts("/lib/url-parse/url-parse.js");
importScripts("/lib/xmldom/xmldom.js");
importScripts("/lib/evaporate/evaporate.js");
DOMParser = self.xmldom.DOMParser;
// Baseline Evaporate configuration. Entries left `undefined` here (region, key,
// bucket) carry no default value in this object.
var defaultConfig = {
  computeContentMd5: true,
  // btoa-encodes whatever SparkMD5.ArrayBuffer.hash(data, true) returns.
  cryptoMd5Method: function (data) {
    var digest = SparkMD5.ArrayBuffer.hash(data, true);
    return btoa(digest);
  },
  cryptoHexEncodedHash256: sha256,
  awsSignatureVersion: "4",
  awsRegion: undefined,
  aws_url: "https://s3-ap-southeast-2.amazonaws.com",
  aws_key: undefined,
  // Signing is delegated: a "signingRequest" message is posted and the returned
  // promise settles when the matching entry in queuedSigningRequests is invoked.
  customAuthMethod: function(signParams, signHeaders, stringToSign, timestamp, awsRequest) {
    return new Promise(function(resolve, reject) {
      var requestId = currentSigningRequestId++;
      postMessage(["signingRequest", requestId, signParams.videoId, timestamp, awsRequest.signer.canonicalRequest()]);
      var settle = function(signature) {
        // One-shot: clear the slot before settling the promise.
        queuedSigningRequests[requestId] = undefined;
        if(!signature) {
          reject();
          return;
        }
        resolve(signature);
      };
      queuedSigningRequests[requestId] = settle;
    });
  },
  //logging: false,
  bucket: undefined,
  allowS3ExistenceOptimization: false,
  maxConcurrentParts: 5
}
// ---- Worker-level upload state ----
// Id handed to the next outgoing "signingRequest" message.
var currentSigningRequestId = 0;
// Pending signing callbacks, indexed by signing request id.
var queuedSigningRequests = [];
// Evaporate instance created by the "init" message; undefined until it is ready.
var e = undefined;
// "<bucket>/<name>" key of the current upload, passed to pause/resume/cancel.
var filekey = undefined;

/**
 * Handles messages from the parent thread. `event.data` is an array whose
 * first element is the message type:
 *   ["init", globalOverrides, uploadConfig]  create Evaporate and start the upload
 *   ["pause"] / ["resume"] / ["cancel"]      control the current upload
 *   ["signature", requestId, signature]      answer to an earlier "signingRequest"
 *
 * The parameter is deliberately NOT called `e`: a parameter of that name would
 * shadow the worker-level Evaporate instance, and pause/resume/cancel would
 * then be invoked on the message event instead of on Evaporate.
 */
onmessage = function(event) {
  var message = event.data;
  var messageType = message[0];
  switch(messageType) {
    case "init":
      // Defaults first, then the caller's overrides on top.
      var globalConfig = {};
      var k;
      for(k in defaultConfig) {
        globalConfig[k] = defaultConfig[k];
      }
      for(k in message[1]) {
        globalConfig[k] = message[1][k];
      }
      var uploadConfig = message[2];
      Evaporate.create(globalConfig).then(function(evaporate) {
        // Assign to the worker-level variables (no `var`) so that later
        // pause/resume/cancel messages can reach this instance.
        e = evaporate;
        filekey = globalConfig.bucket + "/" + uploadConfig.name;
        uploadConfig.progress = function(p, stats) {
          postMessage(["progress", p, stats]);
        };
        // NOTE(review): `xhr` is forwarded as-is; confirm that the object
        // Evaporate passes here can be structured-cloned by postMessage.
        uploadConfig.complete = function(xhr, awsObjectKey, stats) {
          postMessage(["complete", xhr, awsObjectKey, stats]);
        };
        uploadConfig.info = function(msg) {
          postMessage(["info", msg]);
        };
        uploadConfig.warn = function(msg) {
          postMessage(["warn", msg]);
        };
        uploadConfig.error = function(msg) {
          postMessage(["error", msg]);
        };
        e.add(uploadConfig);
      }).catch(function(reason) {
        // Report a failed set-up to the parent instead of leaving an
        // unhandled rejection inside the worker.
        postMessage(["error", reason && reason.message ? reason.message : String(reason)]);
      });
      break;
    case "pause":
    case "resume":
    case "cancel":
      if(!e) {
        // "init" has not completed yet, so there is nothing to control.
        postMessage(["warn", "Ignoring \"" + messageType + "\": upload has not been initialised yet"]);
        break;
      }
      // The message type is also the name of the Evaporate method to call.
      e[messageType](filekey);
      break;
    case "signature":
      var signingRequestId = message[1];
      var signature = message[2];
      var settle = queuedSigningRequests[signingRequestId];
      if(typeof settle !== "function") {
        // Unknown id, or a request that was already answered.
        postMessage(["warn", "Ignoring signature for unknown signing request " + signingRequestId]);
        break;
      }
      settle(signature);
      break;
  }
}
Note that it relies on the calling thread to provide it with the AWS Public Key, AWS Bucket Name and AWS Region, AWS Object Key and the input File object, which are all provided in the 'init' message. When it needs something signed, it sends a 'signingRequest' message to the parent thread, which is expected to provide the signature in a 'signature' message once it's been fetched from my API's signing endpoint.
I can't give a very good example or analyze what you are doing with only the Worker code, but I strongly suspect that the issue has to do with either the reading of the chunk on the main thread or some unexpected processing that you are doing on the chunk on the main thread. Maybe post the main thread code that calls postMessage to the Worker?
If I were debugging it right now, I'd try moving your FileReader operations into the Worker. If you don't mind the Worker blocking while it loads a chunk, you could also use FileReaderSync.
Post-comments update
Does generating the presigned URL require hashing the file content + metadata + a key? Hashing file content is going to take O(n) in the size of the chunk and it's possible, if the hash is the first operation that reads from the Blob, that the loading of the file content could be deferred until the hashing starts. Unless you are compelled to keep the signing in the main thread (you don't trust the worker with key material?) that would be another good thing to bring into the worker.
If moving the signing into the Worker is too much, you could have the worker do something to force the Blob to be read and/or pass the ArrayBuffer(or Uint8Array or what have you) of file content back to the main thread for signing; this would ensure that reading the chunk does not occur on the main thread.
I have a large json file that looks like that:
[
{"name": "item1"},
{"name": "item2"},
{"name": "item3"}
]
I want to stream this file (pretty easy so far) and, for each line, run an asynchronous function (that returns a promise); when that promise resolves or rejects, I want to edit that line.
The result of the input file could be:
[
{"name": "item1", "response": 200},
{"name": "item2", "response": 404},
{"name": "item3"} // not processed yet
]
I do not wish to create another file, I want to edit on the fly the SAME FILE (if possible!).
Thanks :)
I don't really answer the question, but don't think it can be answered in a satisfactory way anyway, so here are my 2 cents.
I assume that you know how to stream line by line, and run the function, and that the only problem you have is editing the file that you are reading from.
Consequences of inserting
It is not possible to natively insert data into any file (which is what you want to do by changing the JSON live). A file can only grow at its end.
So inserting 10 bytes of data at the beginning of a 1GB file means that you need to write 1GB to the disk (to move all the data 10 bytes further).
Your filesystem does not understand JSON, and just sees that you are inserting bytes in the middle of a big file so this is going to be very slow.
So, yes it is possible to do.
Write a wrapper over the file API in NodeJS with an insert() method.
Then write some more code to be able to know where to insert bytes into a JSON file without loading the whole file and not producing invalid JSON at the end.
Now I would not recommend it :)
=> Read this question: Is it possible to prepend data to an file without rewriting?
Why do it then?
I assume that you want to either:
Be able to kill your process at any time, and easily resume work by reading the file again.
Retry partially treated files to fill only the missing bits.
First solution: Use a database
Abstracting the work that needs to be done to live edit files at random places is the sole purpose of existence of databases.
They all exist only to abstract the magic that is behind UPDATE mytable SET name = 'a_longer_name_that_the_name_that_was_there_before' where name = 'short_name'.
Have a look at LevelUP/Down, sqlite, etc...
They will abstract all the magic that needs to be done in your JSON file!
Second solution: Use multiple files
When you stream your file, write two new files!
One that contains the current position in the input file and the lines that need to be retried
The other one the expected result.
You will also be able to kill your process at any time and restart
According to this answer writing to the same file while reading is not reliable. As a commenter there says, better to write to a temporary file, and then delete the original and rename the temp file over it.
To create a stream of lines you can use byline. Then for each line, apply some operation and pipe it out to the output file.
Something like this:
var fs = require('fs');
var stream = require('stream');
var util = require('util');
var LineStream = require('byline').LineStream;
/**
 * Transform stream that expects one line per chunk (as produced by the
 * LineStream it is piped from below), applies a simulated asynchronous
 * modification to it and emits the result followed by a newline.
 *
 * @param {Object} [options] Passed through to stream.Transform.
 */
function Modify(options) {
  stream.Transform.call(this, options);
}
util.inherits(Modify, stream.Transform);

/**
 * @param {Buffer|string} chunk One line of input.
 * @param {string} encoding Unused; the chunk is converted with toString().
 * @param {Function} done Called once the modified line has been pushed.
 */
Modify.prototype._transform = function(chunk, encoding, done) {
  var self = this;
  setTimeout(function() {
    // your modifications here, note that the exact regex depends on
    // your json format and is probably the most brittle part of this
    var modifiedChunk = chunk.toString();
    if (modifiedChunk.search('response:[^,}]+') === -1) {
      modifiedChunk = modifiedChunk
        .replace('}', ', response: ' + new Date().getTime() + '}');
    }
    // Terminate EVERY line, not only the ones modified above; otherwise a
    // line that already carries a response is glued to the line after it.
    self.push(modifiedChunk + '\n');
    done();
  }, Math.random() * 2000 + 1000); // to simulate an async modification
};
// Stream the input line by line through Modify into a temporary output file,
// then move the output over the input once the write stream has closed.
var inPath = './data.json';
var outPath = './out.txt';
fs.createReadStream(inPath)
  .pipe(new LineStream())
  .pipe(new Modify())
  .pipe(fs.createWriteStream(outPath))
  .on('close', function() {
    // replace input with output
    // fs.rename overwrites an existing destination, so no prior unlink is
    // needed (and the input is never missing in between). The callback is
    // mandatory on current Node versions; without it fs.rename throws.
    fs.rename(outPath, inPath, function(err) {
      if (err) {
        console.error('Could not replace ' + inPath + ' with ' + outPath + ':', err);
      }
    });
  });
Note that the above results in only one async operation happening at a time. You could also save the modifications to an array and once all of them are done write the lines from the array to a file, like this:
var fs = require('fs');
var stream = require('stream');
var LineStream = require('byline').LineStream;
// Collects every modified line in memory (by input position) and rewrites the
// input file once the whole file has been read AND every line has been handled.
var modifiedLines = [];
var modifiedCount = 0;
var inPath = './data.json';
var allModified = new Promise(function(resolve, reject) {
  // Set once the line stream has delivered its last line. Without this flag
  // the "count === length" check can succeed while lines are still arriving
  // (e.g. the first timer fires before the second line is read), and it can
  // never succeed for an empty file.
  var inputEnded = false;
  function resolveWhenComplete() {
    if (inputEnded && modifiedCount === modifiedLines.length) {
      resolve();
    }
  }
  fs.createReadStream(inPath)
    .on('error', reject) // e.g. missing input file; pipe() does not forward errors
    .pipe(new LineStream())
    .on('error', reject)
    .on('data', function(chunk) {
      // Reserve this line's slot now so results land in input order even
      // though the timers below finish in random order.
      var index = modifiedLines.length;
      modifiedLines.length++;
      setTimeout(function() {
        // your modifications here
        var modifiedChunk = chunk.toString();
        if (modifiedChunk.search('response:[^,}]+') === -1) {
          modifiedChunk = modifiedChunk
            .replace('}', ', response: ' + new Date().getTime() + '}');
        }
        modifiedLines[index] = modifiedChunk;
        modifiedCount++;
        resolveWhenComplete();
      }, Math.random() * 2000 + 1000);
    })
    .on('end', function() {
      inputEnded = true;
      resolveWhenComplete();
    });
}).then(function() {
  // fs.writeFile requires a callback; wrap it so that write errors reach the
  // .catch() below instead of being lost.
  return new Promise(function(resolve, reject) {
    fs.writeFile(inPath, modifiedLines.join('\n'), function(err) {
      if (err) {
        reject(err);
      } else {
        resolve();
      }
    });
  });
}).catch(function(reason) {
  console.error(reason);
});
If instead of lines you wish to stream chunks of valid json which would be a more robust approach, take a look at JSONStream.
As mentioned in the comment, the file you have is not proper JSON, although it is valid JavaScript. In order to generate proper JSON, JSON.stringify() could be used. I think it would make life difficult for others to parse nonstandard JSON as well, therefore I would recommend furnishing a new output file instead of keeping the original one.
However, it is still possible to parse the original file as JSON. This is possible via eval('(' + procline + ')');, however it is not secure to take external data into node.js like this.
const fs = require('fs');
const readline = require('readline');
const fr = fs.createReadStream('file1');
const rl = readline.createInterface({
input: fr
});
// For every input line that contains "{name": strip one trailing comma,
// evaluate the remainder as an object literal, set `response` to 200 and
// print the object as one line of JSON.
rl.on('line', function (line) {
  // Lines without an item (e.g. the surrounding brackets) are skipped.
  if (!line.match(new RegExp("\{name"))) {
    return;
  }
  var trimmed = line.trim();
  var procline = trimmed.endsWith(',')
    ? trimmed.substring(0, trimmed.length - 1)
    : trimmed;
  // SECURITY: eval() executes whatever the file contains. It is used here
  // only because the input is not strict JSON; do not run this on data you
  // do not fully trust.
  var lineObj = eval('(' + procline + ')');
  lineObj.response = 200;
  console.log(JSON.stringify(lineObj));
});
The output would be like this:
{"name":"item1","response":200}
{"name":"item2","response":200}
{"name":"item3","response":200}
Which is line-delimited JSON (LDJSON) and could be useful for streaming stuff, without the need for leading and trailing [, ], or ,. There is an ldjson-stream package for it as well.
I've got gulpfile.js set up like this:
var scripts = [
'bower_components/timezone-js/src/date.js',
'bower_components/jquery/jquery.min.js',
'bower_components/jquery-migrate/jquery-migrate.js',
'bower_components/jquery-ui/ui/minified/jquery-ui.min.js',
'bower_components/jqueryui-touch-punch/jquery.ui.touch-punch.min.js',
...
];
gulp.task('scripts', function () {
return gulp.src(scripts, {base: '.'})
.pipe(plumber(plumberOptions))
.pipe(sourcemaps.init({
loadMaps: false,
debug: debug,
}))
...
i.e., all my script files are exact matches. No globbing.
Every now and then I mess up a file path or the author changes the directory structure. I want to be notified when this happens instead of the script silently being excluded and causing run-time errors.
Is there some way for me to make gulp.src report these kinds of errors?
Use gulp-expect-file as per this answer.
// Example task: pipe the sources through gulp-expect-file before the real work.
var coffee = require('gulp-coffee');
var expect = require('gulp-expect-file');
gulp.task('mytask', function() {
// Deliberately non-existent path, to demonstrate the missing-file case.
var files = ['idontexist.html'];
return gulp.src(files)
// The same explicit list is handed to gulp-expect-file; presumably this is what
// lets it flag entries that never arrive in the stream — confirm in the plugin's README.
.pipe(expect(files))
.pipe(coffee());
});
(Thanks rve)
gulp.src is actually just an alias to vinyl-fs.src which looks like this:
/**
 * Builds the stream returned for a glob (or array of globs).
 *
 * @param {string|Array} glob Glob pattern(s); validated with isValidGlob.
 * @param {Object} [opt] Options; `read` and `buffer` default to true.
 * @returns {Stream} The pass-through stream that the pipeline ends in.
 * @throws {Error} When the glob argument is not valid.
 */
function src(glob, opt) {
  var settings = opt || {};
  // Created up front: it is also what an empty glob list returns.
  var pass = through.obj();

  if (!isValidGlob(glob)) {
    throw new Error('Invalid glob argument: ' + glob);
  }

  // An empty array yields a stream that is ended on the next tick.
  var isEmptyList = Array.isArray(glob) && glob.length === 0;
  if (isEmptyList) {
    process.nextTick(pass.end.bind(pass));
    return pass;
  }

  var options = defaults(settings, {
    read: true,
    buffer: true
  });

  // glob stream -> createFile -> getStats
  var fileStream = gs.create(glob, options).pipe(through.obj(createFile));
  var outputStream = fileStream.pipe(getStats(options));

  // getContents is added unless the caller passed read: false.
  if (options.read !== false) {
    outputStream = outputStream.pipe(getContents(options));
  }

  return outputStream.pipe(pass);
}
It in turn uses glob-stream which uses glob. You can probably bypass most of that and use through2 directly to create a pipe from the array files. I haven't figured out how to do this yet.