I'm aware that I'm doing something computationally infeasible, essentially Get all substrings of a string in JavaScript but with the bytes of a 1 MB exe instead of the characters in a string.
But I wanted to see how many bytes all the segments would add up to, at least until my program crashed. Well, it does crash, but I think my byte count is wrong.
const fs = require("fs");

const bytesPerKB = 1000;
const bytesPerMB = bytesPerKB * 1000;
const bytesPerGB = bytesPerMB * 1000;

function getAllSegments(buffer, skip = 1) {
    let i, j, result = [], bytes = 0;
    for (i = 0; i < buffer.length; i += skip) {
        if (i % 1000 === 0) console.log('getting ranges for byte', i, 'with a total of', bytes / bytesPerGB, 'GB stored')
        for (j = i + 1; j < buffer.length + 1; j++) {
            const entry = buffer.slice(i, j)
            bytes += entry.length
            result.push(entry);
        }
    }
    return result;
}
console.log('ready')

fs.promises.readFile('../data/scraped/test-1MB.exe').then(data => {
    console.log('read file', data)
    let segments = getAllSegments(data, 10000)
    console.log('segments', segments);
})
output (truncated): the logged total reaches roughly 8 TB of "stored" data before the crash.
I'm pretty sure I don't have 8 TB of storage on my PC, much less 8 TB of swap space allocated. What did I do wrong with the byte-counting math?
For every single start position in the buffer, you accumulate the lengths of all possible sub-buffers from that start position to the end of the buffer. Then you repeat that processing, starting one skip further into the buffer. There are gazillions of duplicates and lots of overlap that you are counting, so of course it all adds up to way more than the size of the file or the size of your memory.
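As a back-of-envelope check (a sketch assuming the 1 MB file and skip = 10000 from the question), the counted total is easy to predict:

// For each start i, the slices i..j contribute 1 + 2 + ... + (n - i) bytes in total.
const n = 1000000, skip = 10000;
let total = 0;
for (let i = 0; i < n; i += skip) {
    const m = n - i;
    total += m * (m + 1) / 2; // sum of the lengths of all slices starting at i
}
console.log((total / 1e12).toFixed(1), 'TB'); // ≈ 16.9 TB counted in total

So a byte count in the terabytes is exactly what the code asks for, and crashing somewhere past the 8 TB mark is consistent with that.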
As for memory usage: buffer.slice() returns a new Buffer object that references the original memory, which is why memory usage doesn't blow up. Each of your sub-buffers is not a separate copy of the data; it's just a new Buffer object that "points" into the existing buffer with an offset and a length.
From the doc for buffer.slice():
Returns a new Buffer that references the same memory as the original, but offset and cropped by the start and end indices.
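You can see this view behaviour directly (a minimal sketch, not from the question):

const buf = Buffer.from([1, 2, 3, 4]);
const view = buf.slice(1, 3); // no copy: a window onto buf
view[0] = 99;
console.log(buf); // <Buffer 01 63 03 04>: writing through the view changed the original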
Related
I need to process lots of binary data in a Node.js app. The data arrives in many small chunks (as Buffer objects) via a callback, and I have to do MANY operations per second on it, like appending, slicing, etc.
I was tempted to store the binary data in a JavaScript string, which supports appending, slicing, etc. But unfortunately I can't really convert arbitrary binary data to a string, which has to have a valid encoding, like UTF-8.
With Buffer objects, the appending operation becomes very expensive. For example, the following code snippet took 1.5 seconds on my P7 processor.
var a = new Buffer([1])
var b = new Buffer([2])

var start = new Date()
for (var i = 0; i < 100000; i++) {
    a = Buffer.concat([a, b], a.length + 1)
}
console.log(new Date() - start)
If I were doing simple string appending (a += b, with a and b as strings), it would take only 0.01 seconds.
I wonder if there is an object in Javascript that can store arbitrary binary data and support appending very efficiently.
Thanks in advance
Update 1
I tried typed arrays; the speed is a little better, but it's still far slower than string appending.
var a = new Uint8Array(),
    b = new Uint8Array(1);
var c
b[0] = 11

var start = new Date()
for (var i = 0; i < 100000; i++) {
    c = new Uint8Array(a.length + b.length)
    c.set(a, 0)
    c.set(b, a.length)
    a = c
}
console.log(new Date() - start)
console.log(a.length)
I think smart-buffer might be what you're after? It allows you to write other buffers into it and will dynamically resize as needed.
Testing script:
const SmartBuffer = require('smart-buffer').SmartBuffer;

// set up buffers
var a = new Buffer([1])
var smart_a = new SmartBuffer();
smart_a.writeInt8(1);
var b = new Buffer([2])

// time buffer concatenation method
console.time("Buffer concatenation");
for (let i = 0; i < 100000; i++) {
    a = Buffer.concat([a, b], a.length + 1)
}
console.timeEnd("Buffer concatenation");

// time smart buffer writeBuffer method
console.time("Smart Buffer writing");
for (let i = 0; i < 100000; i++) {
    smart_a.writeBuffer(b);
}
let final_smart_a = smart_a.toBuffer();
console.timeEnd("Smart Buffer writing");

// check that resulting buffers match
for (let i = 0; i < 100000; i++) {
    console.assert(a[i] == final_smart_a[i]);
}
Results (1 trial):
Buffer concatenation: 2110.282ms
Smart Buffer writing: 14.971ms
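The speedup comes from amortization: Buffer.concat copies the whole accumulated buffer on every append, so n appends copy O(n²) bytes in total, while a growable writer enlarges a backing buffer geometrically and just tracks a write offset. A minimal sketch of the same idea with plain Buffers (the names capacity, backing, and append are illustrative, not smart-buffer's API):

var capacity = 16;
var backing = Buffer.alloc(capacity);
var length = 0;

function append(byte) {
    if (length === capacity) {
        // out of room: double the backing store and copy once
        capacity *= 2;
        var bigger = Buffer.alloc(capacity);
        backing.copy(bigger, 0, 0, length);
        backing = bigger;
    }
    backing[length++] = byte;
}

for (var i = 0; i < 100000; i++) append(2);
var result = backing.slice(0, length); // view of just the written bytes
console.log(result.length);            // 100000

The doubling copies 16 + 32 + ... + ~n bytes overall, which is O(n) rather than O(n²).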
I am working on the Codility Peak problem:
Divide an array into the maximum number of same-sized blocks, each of which should contain an index P such that A[P - 1] < A[P] > A[P + 1].
My own solution is provided below, but it only scores 45%. So my question is:
How can I still improve my solution?
The code snippet is a bit long, since I added some extra comments to make things clearer:
function solution(A) {
    var storage = [], counter = 0;
    // 1. First, use a loop to find all the peaks
    //    and store them in an array called storage
    for (var i = 1; i < A.length - 1; i++) {
        if (A[i] > A[i-1] && A[i] > A[i+1]) {
            storage.push(i);
        }
    }
    // 2. canBeSeparatedInto (below) checks a given number of parts
    // 3. Loop over all part counts, keeping the largest that works
    for (var j = 1; j < A.length; j++) {
        if (canBeSeparatedInto(j, A, storage)) {
            counter = j;
        }
    }
    return counter;
}

/* This function tells whether it is possible to divide the given array into the given number of parts.
 * #param parts [number]: number of parts that we intend to divide the array into
 * #param array [array]: the original array
 * #param peaks [array]: a storage array holding the indices of all the peaks
 * #return [boolean]: true if the given array can be divided into the given parts
 */
function canBeSeparatedInto(parts, array, peaks) {
    var i = 1, result = false;
    var blockSize = array.length / parts;
    peaks.forEach(function(elem) {
        // test whether this peak belongs to the ith part
        if ((elem + 1) / blockSize <= i && (elem + 1) / blockSize > i - 1) {
            i++;
        }
    });
    // set the result to true if there is indeed a peak for every part
    if (i > parts) {
        result = true;
    }
    return result;
}
The main problem with my code is that it does not pass the performance test. Could you give me some hints on that?
I would suggest this algorithm:
Sort the peaks by their distance from the previous peak. To do that, it might be more intuitive to identify "valleys", i.e. maximal ranges without peaks, and sort those by their size in descending order.
Identify the divisors of the array length, as the solution must be one of those. For example, it is a waste of time to test solutions when the array length is prime: in that case the answer can only be 1 (or zero if it has no peaks).
Try each of the divisors in ascending order (representing the size of the array chunks), and check for each valley whether such a split would put one of the chunks completely inside that valley, i.e. the chunk would not contain a peak. In that case reject that size as a solution, and try the next size.
Implementation with interactive input of the array:
"use strict";
// Helper function to collect the integer divisors of a given n
function divisors(n) {
var factors = [],
factors2 = [],
sq = Math.sqrt(n);
for (var i = 1; i <= sq; i++) {
if (n % i === 0) {
factors.push(n / i);
// Save time by storing complementary factor as well
factors2.push(i);
}
}
// Eliminate possible duplicate when n is a square
if (factors[factors.length-1] === factors2[factors2.length-1]) factors.pop();
// Return them sorted in descending order, so smallest is at end
return factors.concat(factors2.reverse());
}
function solution(A) {
var valleys = [],
start = 0,
size, sizes, i;
// Collect the maximum ranges that have no peeks
for (i = 1; i < A.length - 1; i++) {
if (A[i] > A[i-1] && A[i] > A[i+1]) {
valleys.push({
start,
end: i,
size: i - start,
});
start = i + 1;
}
}
// Add final valley
valleys.push({
start,
end: A.length,
size: A.length - start
});
if (valleys.length === 1) return 0; // no peeks = no solution
// Sort the valleys by descending size
// to improve the rest of the algorithm's performance
valleys.sort( (a, b) => b.size - a.size );
// Collect factors of n, as all chunks must have same, integer size
sizes = divisors(A.length)
// For each valley, require that a solution must not
// generate a chunk that falls completely inside it
do {
size = sizes.pop(); // attempted solution (starting with small size)
for (i = 0;
i < valleys.length &&
// chunk must not fit entirely inside this valley
Math.ceil(valleys[i].start / size) * size + size > valleys[i].end; i++) {
}
} while (i < valleys.length); // keep going until all valleys pass the test
// Return the number of chunks
return A.length / size;
}
// Helper function: chops up a given array into an
// array of sub arrays, which all have given size,
// except maybe last one, which could be smaller.
function chunk(arr, size) {
var chunks = [];
for (var i = 0; i < arr.length; i += size) {
chunks.push(arr.slice(i, i + size));
}
return chunks;
}
// I/O management
inp.oninput = function () {
// Get input as an array of positive integers (ignore non-digits)
if (!this.value) return;
var arr = this.value.match(/\d+/g).map(v => +v);
var parts = solution(arr);
// Output the array, chopped up into its parts:
outCount.textContent = parts;
outChunks.textContent = chunk(arr, arr.length / parts).join('\n');
}
Array (positive integers, any separator): <input id="inp" style="width:100%">
Chunks: <span id="outCount"></span>
<pre id="outChunks"></pre>
When checking whether the array can be split into K parts, you will in the worst case (an array like [1,2,1,2,1,...]) do N/2 checks, since you are looking at every peak.
This can be done in K steps by using a clever data structure:
Represent the peaks as a binary array (0 = no peak, 1 = peak) and calculate prefix sums over it. If you want to check whether a block contains a peak, just compare the prefix sums at the start and end of the block.
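A minimal sketch of that idea (the function name is illustrative):

function peakPrefixSums(A) {
    // prefix[k] = number of peaks among indices 0 .. k-1
    var prefix = [0];
    for (var i = 0; i < A.length; i++) {
        var isPeak = i > 0 && i < A.length - 1 && A[i] > A[i-1] && A[i] > A[i+1];
        prefix.push(prefix[i] + (isPeak ? 1 : 0));
    }
    return prefix;
}

// A block covering indices [start, end) contains a peak iff
// prefix[end] - prefix[start] > 0.
var prefix = peakPrefixSums([1, 2, 1, 3, 1]);
console.log(prefix[3] - prefix[0] > 0); // true: the block [0, 3) contains the peak at index 1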
You also have another small problem there: you should only check block counts that divide the size of the array.
Consider the following two alternatives:
const mb_before = process.memoryUsage().heapUsed / 1024 / 1024;
const n = 15849;
const o = 115;
const entries = [];
for (var i = 0; i < n; i++) {
    const subarr = [];
    for (var j = 0; j < o; j++) {
        subarr.push(Math.random());
    }
    entries.push(subarr);
}
const mb_after = process.memoryUsage().heapUsed / 1024 / 1024;
console.log('arr using ' + (mb_after - mb_before) + ' megabyte');
// arr using 15.110992431640625 megabyte
and
const mb_before = process.memoryUsage().heapUsed / 1024 / 1024;
const n = 15849;
const o = 115;
const entries = new Array(n);
for (var i = 0; i < n; i++) {
    const subarr = new Array(o);
    for (var j = 0; j < o; j++) {
        subarr[j] = Math.random();
    }
    entries[i] = subarr;
}
const mb_after = process.memoryUsage().heapUsed / 1024 / 1024;
console.log('arr using ' + (mb_after - mb_before) + ' megabyte');
// arr using 12.118911743164062 megabyte
From my understanding, the two arrays' sizes should be identical; only the way they were instantiated differs. How can it be explained that the resulting memory usage is consistently different?
I believe this has to do with the way array memory is allocated. When you instantiate an array giving it a specific size, as in the second example, it allocates exactly that much storage.
When you grow an array past its current allocation instead, it allocates some extra space to handle further growth, and as the array keeps growing, those extra allocations get bigger. That results in the extra, still-unused space you are measuring in the first example.
I don't find this surprising at all. Although standard arrays aren't really arrays at all*, JavaScript engines default to optimization: treating them as though they were really arrays when they can.
In your first example, V8 doesn't know how big each of the arrays is going to get — it just keeps growing, and in order to treat it as an optimized array (rather than an object with special properties), V8 has to keep reallocating and copying to make it bigger periodically. So it's not surprising that the most recent proactive allocation left a lot of extra room in case it kept growing.
In your second example, you've given V8 a big old clue in advance of how big you intend to make the array. So it's reasonable that V8 would use that information to optimize the allocation it does for the underlying true array.
* (that's a post on my anemic little blog)
I have some data which is represented as an array of integers and can be up to 200 000 elements. The integer value can vary from 0 to 200 000.
To emulate this data (for debugging purposes) I can do the following:
let data = [];
let len = 200000;
for (let i = 0; i < len; i++) {
    data[i] = i;
}
To convert this array of integers to a Unicode string, I do this:
let dataAsText = data.map((e) => {
    return String.fromCodePoint(e);
}).join('');
When I convert back to an array of integers, the array turns out to be longer:
let dataBack = dataAsText.split('').map((e) => {
    return e.codePointAt(0);
});
console.log(dataBack.length);
How can that be? What is wrong?
Extra information:
I use codePointAt/fromCodePoint because they can deal with all Unicode values (up to 21 bits), while charCodeAt/fromCharCode fails.
Using, for example, .join('123') and .split('123') makes dataBack the same length as data, but this isn't an elegant solution, because the string dataAsText becomes unnecessarily large.
If len is less than or equal to 65536 (which is 2^16), everything works fine. Why is that?
EDIT:
I use codePoint because I need to convert the data to Unicode text so that the data stays short.
More about codePoint vs charCode with an example:
If we convert 150000 to a character then back to an integer with codePoint:
console.log(String.fromCodePoint("150000").codePointAt(0));
this gives us 150000, which is correct. Doing the same with charCode fails and prints 18928 (not 150000):
console.log(String.fromCharCode("150000").charCodeAt(0));
That's because higher code point values are encoded as two UTF-16 code units, as can be seen in this snippet:
var s = String.fromCodePoint(0x2F804);
console.log(s);                     // shows one character
console.log('length = ', s.length); // 2, because the encoding is \uD87E\uDC04

var i = s.codePointAt(0);
console.log('CodePoint value at 0: ', i); // correct

var i = s.codePointAt(1); // should not do this: it starts in the middle of a surrogate pair!
console.log('CodePoint value at 1: ', i); // misleading
In your code things go wrong when you do split: there, the code units making up the string are all split apart individually, discarding the fact that some pairs are intended to combine into a single character.
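You can see the breakage directly (a tiny sketch; U+1D11E is a code point above 0xFFFF):

console.log('\u{1D11E}'.split('').length); // 2: the surrogate pair is broken apart
console.log([...'\u{1D11E}'].length);      // 1: spread keeps code points together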
You can use the ES6 spread syntax, which takes this into account:
let dataBack = [...dataAsText].map((e, i) => {
// etc.
Now your counts will be the same.
Example:
// (Only 20 instead of 200000)
let data = [];
for (let i = 199980; i < 200000; i++) {
    data.push(i);
}
let dataAsText = data.map(e => String.fromCodePoint(e)).join("");
console.log("String length: " + dataAsText.length);
let dataBack = [...dataAsText].map(e => e.codePointAt(0));
console.log(dataBack);
Surrogates
Be aware that the range 0 ... 65535 contains sub-ranges reserved for so-called surrogates, which only represent a character when combined with another value. You should not iterate over those expecting each value to represent a character on its own. So in your original code, this is another source of error.
To fix this, you should really skip over those values:
for (let i = 0; i < len; i++) {
    if (i < 0xd800 || i > 0xdfff) data.push(i);
}
In fact, there are many other code points that do not represent a character.
I have a feeling split doesn't work with Unicode values; a quick test above 65536 shows that the results become double the length after splitting.
Perhaps look at this post and its answers, as they ask a similar question.
I don't think you want codePointAt (or charCodeAt) at all. To convert a number to a string, just use String. To keep all the values in a single delimited string, use a delimiter (like ,). To convert back to a number, use whichever of Number, unary +, parseInt, or parseFloat is appropriate (in your case, probably Number or +):
// Only 20 instead of 200000
let data = [];
for (let i = 199980; i < 200000; i++) {
    data.push(i);
}
let dataAsText = data.join(",");
console.log(dataAsText);
let dataBack = dataAsText.split(",").map(Number);
console.log(dataBack);
If your goal with codePointAt is to keep the dataAsText string short, then you can do that, but you can't use split to recreate the array because JavaScript strings are UTF-16 (effectively) and split("") will split at each 16-bit code unit rather than keeping code points together.
A delimiter would help there too:
// Again, only 20 instead of 200000
let data = [];
for (let i = 199980; i < 200000; i++) {
    data.push(i);
}
let dataAsText = data.map(e => String.fromCodePoint(e)).join(",");
console.log("String length: " + dataAsText.length);
let dataBack = dataAsText.split(",").map(e => e.codePointAt(0));
console.log(dataBack);
If you're looking for a way to encode a list of integers so that you can safely transmit it over a network, node Buffers with base64 encoding might be a better option:
let data = [];
for (let i = 0; i < 200000; i++) {
    data.push(i);
}

// encoding
var ta = new Int32Array(data);
var buf = Buffer.from(ta.buffer);
var encoded = buf.toString('base64');

// decoding
var buf = Buffer.from(encoded, 'base64');
var ta = new Uint32Array(buf.buffer, buf.byteOffset, buf.byteLength >> 2);
var decoded = Array.from(ta);

// same?
console.log(decoded.join() == data.join())
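For scale: 200000 32-bit integers occupy 800000 bytes as an Int32Array, and base64 inflates that by a factor of about 4/3 to roughly 1.07 MB, which is still compact and, unlike the code-point approach, round-trips every value exactly.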
Your original approach can't work in general, because not every integer has a corresponding code point in Unicode.
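For example (code points only go up to 0x10FFFF):

String.fromCodePoint(0x110000); // throws RangeError: not a valid code point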
UPD: if you don't need the encoded form to be printable text, there's no need for base64; just store the buffer as is:
const fs = require('fs');

// saving
var ta = new Int32Array(data);
fs.writeFileSync('whatever', Buffer.from(ta.buffer));

// loading
var buf = fs.readFileSync('whatever');
var loadedData = Array.from(new Uint32Array(buf.buffer, buf.byteOffset, buf.byteLength >> 2));

// same?
console.log(loadedData.join() == data.join())
I have tried to implement the following knapsack-problem solution algorithm in JavaScript, but the solution s_opt I get has a total weight greater than L_max.
What am I doing wrong?
I suspect it could be something related to closures in the recursion.
/*
GENERAL:
Assume we have a knapsack and we want to bring as much stuff as possible.
Of each thing we have several variants to choose from. Each of these variants
has a different value and takes a different amount of space.

DEFINITIONS:
L_max = integer, size of the knapsack for the entire problem having N items
l     = matrix, having the elements l[i-1][j-1] representing the space taken
        by variant j of item i (-1 since matrix indexing starts at zero,
        i.e. item i is stored at position i-1)
p     = matrix, having the elements p[i-1][j-1] representing the value given
        by variant j of item i
n     = total number of items (used in a sub-problem)
N     = total number of items (used in the full problem, N >= n)
s_opt = vector having the optimal combination of variant selections s_i,
        i.e. s_opt = arg max p_sum
*/
function knapsack(L_max, l, p) {
    // construction (initialization) - these are private members
    var self = this; // so that private functions can read the object's variables
    this.N = l.length;
    var DCached = []; // only used by a private function, so it doesn't need to be public via this.*
    this.s_opt = [];
    this.p_mean = null;
    this.L_max = L_max;

    // define the public optimization function for the entire problem;
    // when this has completed the user can read
    //   s_opt  to get the solution and
    //   p_mean to know the quality of the solution
    this.optimize = function() {
        self.p_mean = D(self.N, self.L_max) / Math.max(1, self.N);
    }

    // define the private sub-problem optimization function
    var D = function(n, r) {
        if (r < 0)
            return -Infinity;
        if (n == 0)
            return 0;
        if (DCached[n-1] != null) {
            if (DCached[n-1][r-1] != null) {
                return DCached[n-1][r-1];
            }
        }
        var p_max = -Infinity;
        var p_sum;
        var J = l[n-1].length;
        for (var j = 0; j < J; j++) {
            p_sum = p[n-1][j] + D(n-1, r - l[n-1][j]);
            if (p_sum > p_max) {
                p_max = p_sum;
                self.s_opt[n-1] = j;
            }
        }
        DCached[n-1] = [];
        DCached[n-1][r-1] = p_max;
        return p_max;
    }
}
The client using this knapsack solver does the following:
var knapsackSolution = new knapsack(5,l,p);
knapsackSolution.optimize();
// now the client can access knapsackSolution.s_opt containing the solution.
I found a solution. When solving a sub-problem D(n, r), the code in the question returned the optimized value, but it didn't maintain the s_opt array properly. In the modified solution below I fixed this: instead of returning only the optimized knapsack value, an array of the chosen variants (i.e. the arg of the max) is returned as well. The cache is also modified to hold both parts of the solution (the max value and the arg max).
The code below also adds a feature: the user can pass a value maxComputingComplexity that limits the computational size of the problem in a somewhat heuristic manner.
/*
GENERAL:
Assume we have a knapsack and we want to bring as much stuff as possible.
Of each thing we have several variants to choose from. Each of these variants
has a different value and takes a different amount of space.
The quantity of each variant is one.

DEFINITIONS:
L_max = integer, size of the knapsack, e.g. max number of letters, for the
        entire problem having N items
l     = matrix, having the elements l[i-1][j-1] representing the space taken
        by variant j of item i (-1 since matrix indexing starts at zero,
        i.e. item i is stored at position i-1)
p     = matrix, having the elements p[i-1][j-1] representing the value given
        by variant j of item i
maxComputingComplexity = value limiting the product L_max*self.N*M_max in
        order to make the optimization complete in a limited amount of time.
        It has a serious implication, since it may cut the list of alternatives
        so that only the first alternatives are used in the computation,
        meaning that the input should be well ordered
n     = total number of items (used in a sub-problem)
N     = total number of items (used in the full problem, N >= n)
M_i   = number of variants of item i
s_i   = which variant of item i is chosen to pack
s     = vector of elements s_i representing a possible solution
r     = maximum total space in the knapsack, i.e. sum(l[i][s_i]) <= r
p_sum = sum of the values of the selected variants, i.e. sum(p[i][s_i])
s_opt = vector having the optimal combination of variant selections s_i,
        i.e. s_opt = arg max p_sum

In order to solve this, let us see p_sum as a function
D(n, r) = p_sum (seeing it as a function of the sub-problem n combined with
the maximum total space r)
*/
function knapsack(L_max, l, p, maxComputingComplexity) {
    // construction (initialization) - these are private members
    var self = this; // so that private functions can read the object's variables
    this.N = l.length;
    var DCached = []; // only used by a private function, so it doesn't need to be public via this.*
    this.L_max = L_max;
    this.maxComputingComplexity = maxComputingComplexity;

    // object to store a solution (for both the big problem and the sub-problems)
    function result(p_max, s_opt) {
        this.p_max = p_max; // max value
        this.s_opt = s_opt; // arg max value
    }

    // define the public optimization function for the entire problem;
    // it returns a result object whose p_max is the optimal total value
    // and whose s_opt is the selection of variants achieving it.
    // computing complexity is O(L_max*self.N*M_max),
    // think O = L_max*N*M_max => M_max = O/L_max/N => 3 = x/140/20 => x = 3*140*20 => x = 8400
    this.optimize = function() {
        var M_max = Math.max(maxComputingComplexity / (L_max * self.N), 2); // totally useless if not at least two
        console.log("optimize: Setting M_max = " + M_max);
        return D(self.N, self.L_max, M_max);
    }

    // Define the private sub-problem optimization function.
    // The function reads the "global" variables p and l,
    // and as arguments it takes
    //   n      delimiting which sub-set of items may be included (from p and l)
    //   r      setting the max space that this sub-set of items may take
    //   M_max  the max number of variants per item to consider
    // Based on these arguments the function optimizes D and returns
    //   p_max  the max value that can be obtained by combining the things
    //   s_opt  the selection (array of length n) of things optimizing D
    var D = function(n, r, M_max) {
        // Start by checking whether the value is already cached...
        if (DCached[n-1] != null) {
            if (DCached[n-1][r-1] != null) {
                return DCached[n-1][r-1];
            }
        }
        var D_result = new result(-Infinity, []); // here we will manage the result
        if (r < 0) {
            return D_result;
        }
        if (n == 0) {
            D_result.p_max = 0;
            return D_result;
        }
        var p_sum;
        var J = Math.min(l[n-1].length, M_max);
        var D_minusOneResult; // the result of optimizing all previous items given a max length
        for (var j = 0; j < J; j++) {
            D_minusOneResult = D(n-1, r - l[n-1][j], M_max);
            p_sum = p[n-1][j] + D_minusOneResult.p_max;
            if (p_sum > D_result.p_max) {
                D_result.p_max = p_sum;
                // copy the sub-solution so we don't mutate the cached sub-result
                D_result.s_opt = D_minusOneResult.s_opt.slice();
                D_result.s_opt[n-1] = j;
            }
        }
        DCached[n-1] = [];
        DCached[n-1][r-1] = D_result;
        return D_result;
    }
}
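A minimal usage sketch with made-up data (the matrices here are illustrative, not from the question):

// Two items, each with two variants.
// l[i][j] = space of variant j of item i; p[i][j] = value of variant j of item i.
var l = [[2, 3], [4, 1]];
var p = [[3, 4], [5, 2]];

var solver = new knapsack(5, l, p, 1000);
var best = solver.optimize();
console.log(best.p_max); // 6: variant 1 of both items (space 3 + 1 <= 5)
console.log(best.s_opt); // [1, 1]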