Reading bytes from a JavaScript string - javascript

I have a string containing binary data in JavaScript. Now I want to read, for example, an integer from it. So I get the first 4 characters, use charCodeAt, do some shifting, etc. to get an integer.
The problem is that strings in JavaScript are UTF-16 (instead of ASCII) and charCodeAt often returns values higher than 256.
The Mozilla reference states that "The first 128 Unicode code points are a direct match of the ASCII character encoding." (what about ASCII values > 128?).
How can I convert the result of charCodeAt to an ASCII value? Or is there a better way to convert a string of four characters to a 4 byte integer?

I believe that you can do this with relatively simple bit operations:
/**
 * Splits every UTF-16 code unit of a string into its bytes, most significant
 * byte first, and returns them as one flat array.
 *
 * A code unit up to 0xFF yields one byte; a larger one yields two (a high byte
 * of zero is never emitted, so the output is not fixed-width).
 *
 * @param {string} str - the string to read.
 * @returns {number[]} byte values in the range 0-255.
 */
function stringToBytes ( str ) {
  var re = [];
  for (var i = 0; i < str.length; i++ ) {
    var ch = str.charCodeAt(i); // at most 16 bits wide
    // Push directly instead of building and concatenating a temporary array
    // per character, which made the whole conversion quadratic.
    if ( ch > 0xFF ) {
      re.push( ch >> 8 ); // high byte first ("big-endian" order)
    }
    re.push( ch & 0xFF );
  }
  // return an array of bytes
  return re;
}
stringToBytes( "A\u1242B\u4123C" ); // [65, 18, 66, 66, 65, 35, 67]
It should be a simple matter to sum the output up by reading the byte array as if it were memory and adding it up into larger numbers:
/**
 * Reads four consecutive bytes from arr, starting at offs, as a big-endian
 * 32-bit integer. Because the shifts are 32-bit signed operations, a first
 * byte of 0x80 or above yields a negative result.
 *
 * @param {number[]} arr - array of byte values.
 * @param {number} offs - index of the most significant byte.
 * @returns {number} the combined (signed) value.
 */
function getIntAt ( arr, offs ) {
  var top = arr[offs + 0] << 24;
  var upper = arr[offs + 1] << 16;
  var lower = arr[offs + 2] << 8;
  var bottom = arr[offs + 3]; // added as-is, not shifted
  return top + upper + lower + bottom;
}
/**
 * Reads two consecutive bytes from arr, starting at offs, as a big-endian
 * 16-bit value.
 *
 * @param {number[]} arr - array of byte values.
 * @param {number} offs - index of the high byte.
 * @returns {number} the combined value.
 */
function getWordAt ( arr, offs ) {
  var high = arr[offs + 0] << 8;
  var low = arr[offs + 1];
  return high + low;
}
'\\u' + getWordAt( stringToBytes( "A\u1242" ), 1 ).toString(16); // "1242"

Borgar's answer seems correct.
Just wanted to clarify one point. Javascript treats bitwise operations as '32-bit signed int's, where the last (left-most) bit is the sign bit. Ie,
getIntAt([0x7f,0,0,0],0).toString(16) // "7f000000"
getIntAt([0x80,0,0,0],0).toString(16) // "-80000000"
However, for octet-data processing (eg, network stream, etc), usually want the 'unsigned int' representation. This can be accomplished by adding a '>>> 0' (zero-fill right-shift) operator which internally tells Javascript to treat this as unsigned.
/**
 * Reads four consecutive bytes from arr, starting at offs, as a big-endian
 * unsigned 32-bit integer.
 *
 * @param {number[]} arr - array of byte values.
 * @param {number} offs - index of the most significant byte.
 * @returns {number} a value in the range 0 to 4294967295.
 */
function getUIntAt ( arr, offs ) {
  var signedSum = (arr[offs + 0] << 24) +
    (arr[offs + 1] << 16) +
    (arr[offs + 2] << 8) +
    arr[offs + 3];
  // `>>>` binds looser than `+`, so the zero-fill shift applies to the whole
  // sum and reinterprets it as unsigned.
  return signedSum >>> 0;
}
getUIntAt([0x80,0,0,0],0).toString(16) // "80000000"

There are two methods for encoding and decoding utf-8 string to a byte array and back.
// Helpers for converting between a JavaScript string and its UTF-8 bytes.
var utf8 = {};

/**
 * Encodes a string as an array of UTF-8 byte values.
 *
 * Characters outside the Basic Multilingual Plane are stored as surrogate
 * pairs; both halves are handed to encodeURIComponent together, because
 * encoding a single half throws a URIError.
 *
 * @param {string} str - the string to encode.
 * @returns {number[]} UTF-8 bytes (0-255).
 * @throws {URIError} if str contains an unpaired surrogate.
 */
utf8.toByteArray = function(str) {
  var byteArray = [];
  for (var i = 0; i < str.length; i++) {
    var code = str.charCodeAt(i);
    if (code <= 0x7F) {
      // ASCII is encoded as itself
      byteArray.push(code);
      continue;
    }
    var ch = str.charAt(i);
    if (code >= 0xD800 && code <= 0xDBFF && i + 1 < str.length) {
      var next = str.charCodeAt(i + 1);
      if (next >= 0xDC00 && next <= 0xDFFF) {
        // high + low surrogate: encode them as one code point
        ch += str.charAt(i + 1);
        i++;
      }
    }
    // "%D0%94" -> "D0%94" -> ["D0", "94"]
    var h = encodeURIComponent(ch).slice(1).split('%');
    for (var j = 0; j < h.length; j++) {
      byteArray.push(parseInt(h[j], 16));
    }
  }
  return byteArray;
};

/**
 * Decodes an array of UTF-8 byte values back into a string.
 *
 * @param {number[]} byteArray - UTF-8 bytes (0-255).
 * @returns {string} the decoded string.
 * @throws {URIError} if the bytes are not valid UTF-8.
 */
utf8.parse = function(byteArray) {
  var str = '';
  for (var i = 0; i < byteArray.length; i++) {
    var b = byteArray[i];
    if (b > 0x7F) {
      // always two hex digits, since b >= 0x80
      str += "%" + b.toString(16).toUpperCase();
    } else if (b === 0x25) {
      // a literal "%" must be escaped so that it is not read as an escape
      str += "%25";
    } else {
      str += String.fromCharCode(b);
    }
  }
  return decodeURIComponent(str);
};
// sample
var str = "Да!";
var ba = utf8.toByteArray(str);
alert(ba); // 208, 148, 208, 176, 33
alert(ba.length); // 5
alert(utf8.parse(ba)); // Да!

While @Borgar answers the question correctly, his solution is pretty slow. It took me a while to track it down (I used his function somewhere in a larger project), so I thought I would share my insight.
I ended up with something like @Kadm's approach. It's not just a few percent faster, it's about 500 times faster (no exaggeration!). I wrote a little benchmark, so you can see for yourself :)
/**
 * Splits every UTF-16 code unit of a string into its bytes, most significant
 * byte first, writing straight into the result array.
 *
 * A code unit up to 0xFF yields one byte; a larger one yields two. No
 * temporary array is created for any character (the earlier fast path only
 * covered values below 127, so 127-255 still allocated one).
 *
 * @param {string} str - the string to read.
 * @returns {number[]} byte values in the range 0-255.
 */
function stringToBytesFaster ( str ) {
  var re = [], j = 0;
  for (var i = 0; i < str.length; i++ ) {
    var ch = str.charCodeAt(i); // at most 16 bits wide
    if (ch > 0xFF)
    {
      re[j++] = ch >> 8; // high byte first
    }
    re[j++] = ch & 0xFF;
  }
  // return an array of bytes
  return re;
}

Borga's solution works perfectly. In case you want a more concrete implementation, you may want to have a look at the BinaryReader class from vjeux (which, for the records, is based on the binary-parser class from Jonas Raoni Soares Silva).

How did you get the binary data into the string in the first place? How the binary data gets encoded into a string is an IMPORTANT consideration, and you need an answer to that question before you can proceed.
One way I know of to get binary data into a string, is to use the XHR object, and set it to expect UTF-16.
Once it's in utf-16, you can retrieve 16-bit numbers from the string using "....".charCodeAt(0)
which will be a number between 0 and 65535
Then, if you like, you can convert that number into two numbers between 0 and 255 like this:
var leftByte = mynumber>>>8;
var rightByte = mynumber&255;

borgars solution improvement:
...
do {
st.unshift( ch & 0xFF ); // push byte to stack
ch = ch >> 8; // shift value down by 1 byte
}
while ( ch );
// add stack contents to result
// done because chars have "wrong" endianness
re = re.concat( st );
...

One nice and quick hack is to use a combination of encodeURI and unescape :
// Collects the UTF-8 byte values of the sample string into t.
// Note: t, s and i are assigned without a declaration.
t=[];
// encodeURI percent-encodes every non-ASCII character as its UTF-8 bytes;
// unescape then turns each %XX into one character whose code is XX, so s
// holds one character per byte and charCodeAt reads that byte back.
for(s=unescape(encodeURI("zażółć gęślą jaźń")),i=0;i<s.length;++i)
t.push(s.charCodeAt(i));
t
[122, 97, 197, 188, 195, 179, 197, 130, 196, 135, 32, 103, 196, 153, 197, 155, 108, 196, 133, 32, 106, 97, 197, 186, 197, 132]
Perhaps some explanation is necessary why the heck it works, so let me split it into steps:
encodeURI("zażółć gęślą jaźń")
returns
"za%C5%BC%C3%B3%C5%82%C4%87%20g%C4%99%C5%9Bl%C4%85%20ja%C5%BA%C5%84"
which -- if you look closely -- is the original string in which all characters with values>127 got replaced with (possibly more than one) hexadecimal bytes representations.
For example letter "ż" became "%C5%BC". The fact is encodeURI escapes also some regular ascii characters like spaces, but it does not matter. What matters is that at this point each byte of the original string is either represented verbatim (as is the case with "z", "a", "g", or "j") or as a percent-encoded sequence of bytes (as was the case with "ż" which was originaly two bytes 197 and 188 and got converted to %C5 and %BC).
Now, we apply unescape:
unescape("za%C5%BC%C3%B3%C5%82%C4%87%20g%C4%99%C5%9Bl%C4%85%20ja%C5%BA%C5%84")
which gives
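"zaÅ¼Ã³Å\x82Ä\x87 gÄ\x99Å\x9BlÄ\x85 jaÅºÅ\x84" (the \xNN escapes stand for non-printable characters, which are invisible when the string is displayed)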
If you are not native Polish speaker you might not notice, that this result is in fact way different from the original "zażółć gęślą jaźń". For starters, it has a different number of characters :)
For sure, you can tell, that this strange versions of big letter A do not belong to standard ascii set. In fact this "Å" has value 197. (which is exactly C5 in hexadecimal).
Now, if you are like me, you would ask yourself: wait a minute... if this is really a sequence of bytes with values 122, 97, 197, 188, and JS is really using UTF, then why do I see these "Å¼" characters, and not the original "ż"?
Well, the thing is (I believe) that this sequence 122, 97, 197, 188 (which we see when applying charCodeAt) is not a sequence of bytes, but a sequence of character codes. The character "Å" has the code 197, but in UTF-8 it is actually a two-byte sequence: C3 85.
So, the trick works because unescape treats numbers occuring in percent-encoded string as codes, not as byte values - or, to be more specific: unescape knows nothing about multibyte characters, so when it decodes bytes one-by-one, handling values lower than 128 just great, but not-so-good when they are above 127 and multibyte -- unescape in such cases simply returns a multibyte character which happens to have a code equal to the requested byte value. This "bug" is actually useful feature.

I'm going to assume for a second that your objective is to read arbitrary bytes from a string.
My first suggestion would be to make your string representation a hexadecimal representation of the binary data.
You can read the values using conversions to numbers from hex:
var BITS_PER_BYTE = 8;
// One hexadecimal digit encodes 4 bits, so a byte takes two characters.
var BITS_PER_CHAR = 4;

/**
 * Reads the first numBytes bytes of a hex string as one unsigned number.
 *
 * @param {string} hexString - hexadecimal digits, most significant first.
 * @param {number} numBytes - how many bytes (two hex digits each) to read.
 * @returns {number} the parsed value, or NaN if no hex digits could be read.
 */
function readBytes(hexString, numBytes) {
  return parseInt( hexString.substr(0, numBytes * (BITS_PER_BYTE / BITS_PER_CHAR) ), 16 );
}

/**
 * Returns the hex string with its first numBytes bytes dropped.
 *
 * @param {string} hexString - hexadecimal digits.
 * @param {number} numBytes - how many bytes (two hex digits each) to drop.
 * @returns {string} the remaining hex digits.
 */
function removeBytes(hexString, numBytes) {
  return hexString.substr( numBytes * (BITS_PER_BYTE / BITS_PER_CHAR) );
}
The functions can then be used to read whatever you want:
var hex = '4ef2c3382fd';
alert( 'We had: ' + hex );
var intVal = readBytes(hex,2);
alert( 'Two bytes: ' + intVal.toString(2) );
hex = removeBytes(hex,2);
alert( 'Now we have: ' + hex );
You can then interpret the byte string however you want.
Hope this helps!
Cheers!

Related

BigInteger to a Uint8Array of bytes

I need to get the bytes of a big integer in JavaScript.
I've tried a couple of big integer libraries, but the one that actually offered this function wouldn't work.
I am not quite sure how to implement this myself, given a string containing a large number, which is generally what the libraries give access to.
Is there a library that works and allows to do this?
Or is it actually not hard, and I am just missing something?
I was googling for quick and elegant solution of this problem in JavaScript, but the only what I found was the method of conversion, based on intermediate hex-string. What is suboptimal for sure and that code also didn't work for me, unfortunately. So, I implemented my own code and wanted to post it as an answer to my own question, but found this one.
Explanation
First of all, I will answer to the opposite question, since it is more illustrative.
Reading BigInteger from a bytes array
What is an array of bytes for us? This is a number in 256-base numeral system, which we want to convert to more convenient for us 10-base (decimal) system.
For instance, let's take an array of bytes
[AA][BB][CC][DD] (1 byte is 8 bits or 2 hexadecimal digits).
Depending on the side we start from (see https://en.wikipedia.org/wiki/Endianness), we can read it as:
(AA*1 + BB*256 + CC*256^2 + DD*256^3) in little-endian
or (DD*1 + CC*256 + BB*256^2 + AA*256^3) in big-endian.
Let's use little-endian here. So, our number encoded by the array [AA][BB][CC][DD] is:
AA + BB*256 + CC*256^2 + DD*256^3
= 170 + 187*256 + 204*65536 + 221*16777216
= 170 + 47872 + 13369344 + 3707764736
= 3721182122
Writing BigInteger to a bytes array
For writing a number into an array of bytes we have to perform an opposite operation, i.e. having a number in decimal system to find all digits of it in 256-base numeral system. Let's take the same number: 3721182122
To find its least significant byte (https://en.wikipedia.org/wiki/Bit_numbering#Least_significant_byte), we just divide the number by 256: the remainder is the least significant byte, and the quotient represents the higher digits. So we divide the quotient by 256 again, and so on, until the quotient is 0:
3721182122 = 14535867*256 + 170
14535867 = 56780*256 + 187
56780 = 221*256 + 204
221 = 0*256 + 221
So, the result is [170][187][204][221] in decimal, [AA][BB][CC][DD] in hex.
Solution in JavaScript
Now, here is this algorithm encoded in NodeJS with big-integer library.
const BigInteger = require('big-integer');
const zero = BigInteger(0);
const one = BigInteger(1);
const n256 = BigInteger(256);
/**
 * Builds a big integer from bytes given least significant byte first,
 * i.e. bytes[0] + bytes[1]*256 + bytes[2]*256^2 + ...
 *
 * @param {Array|Uint8Array} bytes - byte values, least significant first.
 * @returns the accumulated big-integer value (zero for no bytes).
 */
function fromLittleEndian(bytes) {
  const start = { sum: zero, weight: one };
  const finished = bytes.reduce(function (state, byte) {
    return {
      sum: state.sum.add(state.weight.multiply(BigInteger(byte))),
      // the next byte is worth 256 times more
      weight: state.weight.multiply(n256),
    };
  }, start);
  return finished.sum;
}
/**
 * Builds a big integer from bytes given most significant byte first.
 *
 * The bytes are reversed on a copy: reverse() works in place, so reversing
 * the argument itself would silently flip the caller's array.
 *
 * @param {Array|Uint8Array} bytes - byte values, most significant first.
 * @returns whatever fromLittleEndian returns for the reversed bytes.
 */
function fromBigEndian(bytes) {
  return fromLittleEndian(bytes.slice().reverse());
}
/**
 * Writes a big integer into a fixed 32-byte Uint8Array, least significant
 * byte first, by repeatedly taking the value modulo 256 and dividing by 256.
 *
 * Values that are not greater than zero produce an all-zero array. Digits
 * beyond the 32nd byte do not fit and are not stored.
 *
 * @param bigNumber - big-integer value to convert.
 * @returns {Uint8Array} 32 bytes, little-endian, zero padded.
 */
function toLittleEndian(bigNumber) {
  const out = new Uint8Array(32);
  let rest = bigNumber;
  for (let pos = 0; rest.greater(zero); pos += 1) {
    out[pos] = rest.mod(n256);   // current lowest base-256 digit
    rest = rest.divide(n256);    // drop that digit
  }
  return out;
}
/**
 * Converts a big integer to bytes with the most significant byte last in
 * significance order reversed, i.e. the output of toLittleEndian flipped.
 *
 * Despite its name, the `bytes` argument is the big-integer value: it is
 * passed unchanged to toLittleEndian.
 *
 * @param bytes - big-integer value to convert.
 * @returns the reversed result of toLittleEndian.
 */
function toBigEndian(bytes) {
  const ordered = toLittleEndian(bytes);
  ordered.reverse(); // in place, on the freshly created array
  return ordered;
}
console.log('Reading BigInteger from an array of bytes');
let bigInt = fromLittleEndian(new Uint8Array([170, 187, 204, 221]));
console.log(bigInt.toString());
console.log('Writing BigInteger to an array of bytes');
let bytes = toLittleEndian(bigInt);
console.log(bytes);
Benchmark
I have written small benchmark for this approach. Anybody is welcome to modify it for his own conversion method and to compare with my one.
https://repl.it/repls/EvenSturdyEquipment
Set "i" to be your BigInt's value. You can see the bytes by looking at "a" after running this:
i=11111n;n=1500;a=new Uint8Array(n);while(i>0){a[--n]=Number(i&255n);i>>=8n}
You can also extract the BigInt back out from the Uint8Array:
a.reduce((p,c)=>BigInt(p)*256n+BigInt(c))
I've got a version that works with BigInt that's supported by the browser:
const big0 = BigInt(0)
const big1 = BigInt(1)
const big8 = BigInt(8)
// Converts a bigint into a Uint8Array, most significant byte first.
// Negative values are first turned into a two's-complement style positive
// value; non-negative values are written as their plain magnitude.
bigToUint8Array(big: bigint) {
if (big < big0) {
// Width in bits for the negative value, rounded up to a whole number of
// bytes. toString(2) of a negative bigint includes the leading '-', and the
// bigint division truncates, so the "+ big1" always adds one more byte.
const bits: bigint = (BigInt(big.toString(2).length) / big8 + big1) * big8
// 2^bits; adding it maps the negative value to its two's complement.
const prefix1: bigint = big1 << bits
big += prefix1
}
let hex = big.toString(16)
// Pad to an even number of hex digits so every byte has two digits.
if (hex.length % 2) {
hex = '0' + hex
}
const len = hex.length / 2
const u8 = new Uint8Array(len)
// i indexes the output bytes, j the hex digits (two per byte).
var i = 0
var j = 0
while (i < len) {
u8[i] = parseInt(hex.slice(j, j + 2), 16)
i += 1
j += 2
}
return u8
}
I've got a BigDecimal implementation that works with sending & receiving bytes as arbitary precision big decimal: https://jackieli.dev/posts/bigint-to-uint8array/

Make a utf-8 string shorter with a utf-32 encoding in Javascript?

I'm trying to find a way to compress/decompress a string in Javascript. By compress I mean to make the string look shorter (less char). That's my goal.
Here's an example of how things should work:
// The string that I want to make shorter
// It will only contain [a-zA-Z0-9] chars and some punctuation like ()[]{}.,;'"!
var string = "I like bananas !";
// The compressed string, maybe something like "䐓㐛꯱字",
// which is shorter than the original
var shortString = compress(string);
// The original string, "I like bananas !"
var originalString = decompress(shortString);
Here's my first idea (maybe there's a better way to get to my goal, and if so I'm interested in it).
I know that my original string will be in utf-8. So I'm thinking of using utf-32 for the encoding, which should divide by 4 the length of the string.
But I don't know how to do these 2 functions that construct new strings with different encoding. Here's the code I have so far that doesn't work...
/**
 * The asker's first attempt (stated not to work as a compressor): converts
 * the string to its UTF-8 bytes, reads each byte's decimal digits as an
 * octal number, and appends that number written in base 32.
 *
 * @param {string} string - text to convert.
 * @returns {string} the concatenated base-32 pieces ("NaN" for any byte
 *   whose decimal form does not start with an octal digit).
 */
function compress(string) {
  // one character per UTF-8 byte
  var utf8Bytes = unescape(encodeURIComponent(string));
  var pieces = [];
  for (var idx = 0; idx < utf8Bytes.length; idx++) {
    var byteValue = utf8Bytes.charCodeAt(idx);
    pieces.push(parseInt(byteValue, 8).toString(32));
  }
  return pieces.join('');
}
Since you're using a set of less than 100 characters and that javascript strings are encoded in UTF-16 (which mean you have 65536 possible characters), what you can do is concatenate the character codes so as to have one "compressed" character per two basic character. This allows you to compress strings to half the length.
Like this for example:
document.getElementById('compressBtn').addEventListener('click', function() {
var stringToCompress = document.getElementById('tocompress').value;
var compressedString = compress(stringToCompress);
var decompressedString = decompress(compressedString);
if (stringToCompress === decompressedString) {
document.getElementById('display').innerHTML = stringToCompress + ", length of " + stringToCompress.length  + " characters compressed to " + compressedString + ", length of " + compressedString.length + " characters back to " + decompressedString;
} else {
document.getElementById('display').innerHTML = "This string cannot be compressed"
}
})
/**
 * Packs each pair of characters into one UTF-16 character by concatenating
 * decimal digits: the first character's code, followed by the second
 * character's code minus 31 padded to at least two digits.
 *
 * The input is first converted to its UTF-8 bytes. A trailing unpaired
 * character is copied through unchanged.
 *
 * @param {string} string - text to pack.
 * @returns {string} the packed string, roughly half as long.
 */
function compress(string) {
  var bytes = unescape(encodeURIComponent(string));
  var packed = '';
  for (var pos = 0; pos < bytes.length; pos += 2) {
    if (pos + 1 >= bytes.length) {
      // odd length: the last character has no partner
      packed += bytes.charAt(pos);
      continue;
    }
    var firstCode = bytes.charCodeAt(pos);
    // shift printable ASCII (32..) down so it fits in two decimal digits
    var secondCode = bytes.charCodeAt(pos + 1) - 31;
    var secondDigits = secondCode.toLocaleString('en', {
      minimumIntegerDigits: 2
    });
    var combinedDigits = String(firstCode) + secondDigits;
    packed += String.fromCharCode(parseInt(combinedDigits, 10));
  }
  return packed;
}
/**
 * Reverses the decimal-digit packing: every character with a code above 132
 * is split into two characters (all but the last two decimal digits give the
 * first code; the last two digits plus 31 give the second). Other characters
 * are copied through unchanged.
 *
 * @param {string} string - packed text.
 * @returns {string} the unpacked text.
 */
function decompress(string) {
  var unpacked = '';
  for (var pos = 0; pos < string.length; pos++) {
    var code = string.charCodeAt(pos);
    if (code <= 132) {
      // not a packed pair
      unpacked += string.charAt(pos);
      continue;
    }
    var firstCode = Math.floor(code / 100); // leading decimal digits
    var secondCode = (code % 100) + 31;     // last two digits, offset restored
    unpacked += String.fromCharCode(firstCode, secondCode);
  }
  return unpacked;
}
var stringToCompress = 'I like bananas!';
var compressedString = compress(stringToCompress);
var decompressedString = decompress(compressedString);
document.getElementById('display').innerHTML = stringToCompress + ", length of " + stringToCompress.length  + " characters compressed to " + compressedString + ", length of " + compressedString.length + " characters back to " + decompressedString;
body {
padding: 10px;
}
#tocompress {
width: 200px;
}
<input id="tocompress" placeholder="enter string to compress" />
<button id="compressBtn">
Compress input
</button>
<div id="display">
</div>
Regarding the possible use of UTF-32 to further compress, I'm not sure it's possible, I might be wrong on that, but from my understanding it's not feasible. Here's why:
The approach above is basically concatenating two 1 byte values in one 2 bytes value. This is possible because javascript strings are encoded in 2 bytes (or 16 bits) (note that from what I understand the engine could decide to store differently making this compression unnecessary from a purely memory space point of view - that being said, in the end, one character is considered being 16 bits). A cleaner way to make the compression above would in fact to user the binary numbers instead of the decimal, it would make much more sense. Like this for example:
document.getElementById('compressBtn').addEventListener('click', function() {
var stringToCompress = document.getElementById('tocompress').value;
var compressedString = compress(stringToCompress);
var decompressedString = decompress(compressedString);
if (stringToCompress === decompressedString) {
document.getElementById('display').innerHTML = stringToCompress + ", length of " + stringToCompress.length + " characters compressed to " + compressedString + ", length of " + compressedString.length + " characters back to " + decompressedString;
} else {
document.getElementById('display').innerHTML = "This string cannot be compressed"
}
})
/**
 * Packs each pair of characters into one UTF-16 character by concatenating
 * their binary digits, each left-padded with zeros to at least 7 bits.
 *
 * The input is first converted to its UTF-8 bytes. A trailing unpaired
 * character is copied through unchanged.
 *
 * @param {string} string - text to pack.
 * @returns {string} the packed string, roughly half as long.
 */
function compress(string) {
  var bytes = unescape(encodeURIComponent(string));
  var packed = '';
  for (var pos = 0; pos < bytes.length; pos += 2) {
    if (pos + 1 >= bytes.length) {
      // odd length: the last character has no partner
      packed += bytes.charAt(pos);
      continue;
    }
    // values needing more than 7 bits are left at their natural width
    var firstBits = bytes.charCodeAt(pos).toString(2).padStart(7, '0');
    var secondBits = bytes.charCodeAt(pos + 1).toString(2).padStart(7, '0');
    packed += String.fromCharCode(parseInt(firstBits + secondBits, 2));
  }
  return packed;
}
/**
 * Reverses the 7-bit binary packing: every character with a code above 132
 * is split into two characters (the low 7 bits give the second code, the
 * remaining high bits give the first). Other characters are copied through
 * unchanged.
 *
 * @param {string} string - packed text.
 * @returns {string} the unpacked text.
 */
function decompress(string) {
  var unpacked = '';
  for (var pos = 0; pos < string.length; pos++) {
    var code = string.charCodeAt(pos);
    if (code <= 132) {
      // not a packed pair
      unpacked += string.charAt(pos);
      continue;
    }
    var firstCode = code >> 7;     // everything above the low 7 bits
    var secondCode = code & 0x7F;  // the low 7 bits
    unpacked += String.fromCharCode(firstCode, secondCode);
  }
  return unpacked;
}
var stringToCompress = 'I like bananas!';
var compressedString = compress(stringToCompress);
var decompressedString = decompress(compressedString);
document.getElementById('display').innerHTML = stringToCompress + ", length of " + stringToCompress.length + " characters compressed to " + compressedString + ", length of " + compressedString.length + " characters back to " + decompressedString;
<input id="tocompress" placeholder="enter string to compress" />
<button id="compressBtn">
Compress input
</button>
<div id="display">
</div>
So why not push the logic and use utf-32, which should be 4 bytes, meaning four 1 byte characters. One problem is that javascript has 2 bytes string. It's true that you can use pairs of 16 bits characters to represent utf-32 characters. Like this:
document.getElementById('test').innerHTML = "\uD834\uDD1E";
<div id="test"></div>
But if you test the length of the resulting string, you'll see that it's 2, even if there's only one "character". So from a javascript perspective, you're not reducing the actual string length.
The other thing is that UTF-32 in fact covers fewer than 2^21 code points. See here: https://en.wikipedia.org/wiki/UTF-32
It is a protocol to encode Unicode code points that uses exactly 32
bits per Unicode code point (but a number of leading bits must be zero
as there are fewer than 2^21 Unicode code points)
So you don't really have 4 bytes, in fact you don't even have 3, which would be needed to encode 3. So UTF-32 doesn't seem to be a way to compress even more. And since javascript has native 2 bytes strings, it seems to me to be the most efficient - using that approach at least.
If your strings only contain ASCII characters [0, 127] you can "compress" the string using a custom 6 or 7-bit code page.
You can do this several ways, but I think one of the simpler methods is to define an array holding all allowed characters - a LUT, lookup-table if you like, then use its index value as the encoded value. You would of course have to manually mask and shift the encoded value into a typed array.
If your LUT looked like this:
var lut = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.,:;!(){}";
you would in this case deal with a LUT of length 72 (indices 0 to 71), which means we would need to use a 7-bit range or [0, 127] (if the length were 64 or less we could have reduced it to 6-bit [0, 63] values).
Then you would take each characters in the string and convert to index values (you would normally do all the following steps in a single operation but I have separated them for simplicity):
// Lookup table of every character that may be encoded; a character's
// position in this string is its encoded value.
var lut = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.,:;!(){}";
var str = "I like bananas !";
// Intermediate "page": one LUT index per character of str.
var page = [];
Array.prototype.forEach.call(str, function(ch) {
  var i = lut.indexOf(ch);
  // Throw a real Error (not a bare string) so the failure carries a stack trace.
  if (i < 0) throw new Error("Invalid character - can't encode");
  page.push(i);
});
console.log("Intermediate page:", page);
You can always tweak the LUT so that the most used characters are in the beginning, then support variable encoding bit-range, find max value and use that to determine what range you want to encode in. You can add an initial bit as a flag as to which range the encoding uses (for example bit 0 set if 6-bit fits, otherwise use 7-bit range).
Now that you know the indices we can start to encode the binary output itself using a 7-bit approach. Since JavaScript only support byte values, i.e. 8-bit width, we have to do all the split, shift and merge operations manually.
This means we need to keep track of remainder and position on a bit-level.
Say first index value was the following 7-bit value (full 7-bit range for readability - all in pseudo format):
&b01111111
The first step would be to shift it over to bit position 0 and keep track of a remainder:
&b01111111 << 1
Resulting in:
&b11111110
^
new bit position: 7
new remainder : 1
Then the next index value, for example:
&b01010101
would be encoded like this - first convert to 7-bit value in its own byte representation:
&b01010101 << 1 => &b10101010
Then get the reminder part first. To obtain this will shift everything right-wise using 8-bit minus the current remainder (within modulo of 8):
remainderValue = &b10101010 >>> (8 - remainder)
leaving us with the following representation:
&b00000001
(Note that we use triple >>> to shift right to avoid issues with sign.)
Next step now is to merge this value with our previous value that has already been encoded and stored into our destination byte array - for this we'll use an OR operation:
Index 0 New value Result in index 0 (index of dst. array)
&b11111110 | &b00000001 => &b11111111
then go to next index in our destination array and store the rest of the current value, then update the remainder and position.
The "leftover" of the byte is calculated like this using the original (after shifting it) 7-bit byte value:
leftover = &b10101010 << remainder => &b01010100
which we now put into the next position:
Index 0 Index 1 (destination array index, not page index)
&b11111111 01010100
^
new bit position: 14
new remainder : 2
And so on with the remaining index values. See this answer for actual code on how you can do this in JavaScript - the code in this answer doesn't deal with string encoding per-se, but it shows how you can shift byte buffers bit-wise which is essentially the same you need for this task.
To calculate the remainder step, use 8-bits minus your custom bit-range:
step = 8 - newRange (here 7) => 1
This will also be the start remainder. For each character, you'll add the step to remainder after it has been processed, but remember to use modulo 8 (byte width) when you use it for shifting:
remainder += step;
numOfBitsToShift = remainder % 8;
Bit-position uses of course the bit-range, in this case 7:
bitPosition += 7;
Then to find which indices you're dealing with you divide the bitPosition on 8, if any decimal you have to deal with two indexes (old and new), if no decimal the current position represents new index only (only shift is needed for current index value).
You can also use modulo and when modulo of remainder = step you know you that you are dealing with a single index in the destination.
To calculate the final length you would use the bit-length and length of string, then ceil the result so that all characters will fit into an array of 8-bit bytes, which is the only kind of byte array we can get in JavaScript:
dstLength = Math.ceil(7 * str.length / 8);
To decode you just reverse all the steps.
An alternative, if you use long strings or have to move forward fast, is to use an established compressor such as zlib which has a very compact header as well as good performance in JavaScript in the case of the linked solution. This will also deal with "patterns" in the string to further optimize the resulting size.
Disclaimer: as this is mostly a theoretical answer there might be some errors. Feel free to comment if any are found. Refer to linked answer for actual code example.
for full code see here: https://repl.it/NyMl/1
using the Uint8Array you can work with the bytes.
// Copy the character codes of msg into a byte array, then view the same
// memory as 16-bit values.
let msg = "This is some message";
// one UTF-16 code unit value per character
let data = Array.from({ length: msg.length }, (unused, idx) => msg.charCodeAt(idx));
let i8 = new Uint8Array(data);
// shares i8's buffer: each element covers two consecutive bytes
let i16 = new Uint16Array(i8.buffer);
you could also think of a compression like this: http://pieroxy.net/blog/pages/lz-string/demo.html
if you don't want to use a 3rd party library, the lz based compression should be fairly simple. see here (wikipedia)
I use the same library mentioned above, lz-string https://github.com/pieroxy/lz-string, and it creates file sizes that are smaller than most of the binary formats like Protocol Buffers.
I compress via Node.js like this:
var compressedString = LZString.compressToUTF16(str);
And I decompress client side like this:
var decompressedString = LZString.decompressFromUTF16(str);

Robust conversion of hexadecimal string to byte values in JavaScript

I try to extract the byte values from a string containing hexadecimal byte representations. The string also contains (unknown) non-hexadecimal characters which needs to be ignored (delimiters, whitespace formatting).
Given an input string "f5 df 45:f8 a 8 f53", the result would be the array [245, 223, 69, 248, 168, 245]. Note that byte values are only output from two hexadecimal digits (hence, the last 3 is ignored).
As an additional constraint, the code needs to work in ecmascript 3 environments.
So far, I have used this approach:
/**
 * Extracts byte values from a string of hex digits, ignoring every other
 * character. Digits are paired in order; a final unpaired digit is dropped.
 *
 * @param {string} hex - text containing hexadecimal digits.
 * @returns {number[]} one value (0-255) per pair of digits.
 */
function parseHex(hex){
  var digits = hex.replace(/[^0-9a-fA-F]/g, '');
  var pairCount = digits.length >> 1; // a trailing single digit is ignored
  var bin = [];
  for (var n = 0; n < pairCount; n++) {
    bin.push(parseInt(digits.substring(n * 2, n * 2 + 2), 16));
  }
  return bin;
}
However, I feel that it would be possible to find a more elegant solution to this, so the question is:
Is there a better solution to this problem (that would perform better or solve the problem with less code)?
Updated answer (ES3)
Since you mentioned in the comment to my original answer that you're limited to ES3, you should just be able to do this then:
/**
 * Extracts byte values from a string of hex digits, ignoring every other
 * character. Digits are paired in order; a final unpaired digit is dropped.
 * Uses only ES3 features.
 *
 * @param {string} string - text containing hexadecimal digits.
 * @returns {number[]} one value (0-255) per pair of digits; empty when the
 *   input holds fewer than two hex digits.
 */
function parseHex(string) {
  // remove all non-hex characters, and then separate them into an array in groups of 2 characters;
  // match() returns null when nothing matches, so fall back to an empty array
  var arr = string.replace(/[^0-9a-fA-F]/g, '').match(/[0-9a-fA-F]{2}/g) || [];
  // mutate the array in-place with the correct decimal values
  for(var i = 0; i<arr.length; i++) {
    arr[i] = parseInt(arr[i], 16);
  }
  return arr;
}
parseHex('f5 df 45:f8 a 8 f53'); // => [245, 223, 69, 248, 168, 245]
It'll essentially do what map does, except it has less space complexity than map because it's mutating the array in place. See the updated jsfiddle.
Previous answer (ES5)
You can do this (here's a jsbin example):
'f5 df 45:f8 a 8 f53'.replace(/[^0-9a-fA-F]/g, '').match(/[0-9a-fA-F]{2}/g).map(function(hex) {
return parseInt(hex, 16);
});
// => [245, 223, 69, 248, 168, 245]
You can make it a function like this:
/**
 * Extracts byte values from a string of hex digits, ignoring every other
 * character. Digits are paired in order; a final unpaired digit is dropped.
 *
 * @param {string} string - text containing hexadecimal digits.
 * @returns {number[]} one value (0-255) per pair of digits; empty when the
 *   input holds fewer than two hex digits.
 */
function parseHex(string) {
  // match() returns null when nothing matches, so fall back to an empty array
  var pairs = string.replace(/[^0-9a-fA-F]/g, '').match(/[0-9a-fA-F]{2}/g) || [];
  return pairs.map(function(hex) {
    return parseInt(hex, 16);
  });
}
parseHex('f5 df 45:f8 a 8 f53');
Essentially you remove non-hex characters from the string, then match groups of two hex characters (as per your requirements). This answer describes the parseInt(hex, 16) portion (where the reverse would be hex.toString(16)).
TL;DR
Using regex methods lead to less code, but worse performance. A non-regex solution gives better performance, at the cost of slightly more code.
Regex approaches
After some more research/googling (and seeing Josh Beams answer use .match()), I figured that there are several possible regex approaches that could improve on the original approach.
Using .match() directly (without .replace()), inspired by Josh Beams answer:
/**
 * Extracts byte values from a string of hex digits, ignoring every other
 * character. The array of matched digits is reused for the result: pair k
 * overwrites slot k, and the array is then cut to the number of pairs.
 *
 * @param {string} hex - text containing hexadecimal digits.
 * @returns {number[]} one value (0-255) per pair of digits; empty when the
 *   input holds fewer than two hex digits.
 */
function parseHex(hex){
  // match() returns null when there is no hex digit at all
  hex = hex.match(/[\da-f]/gi) || [];
  for(var i = 0; i < hex.length - 1; i += 2){
    hex[i >> 1] = +('0x' + hex[i] + hex[i + 1]);
  }
  // i is now twice the number of complete pairs
  hex.length = i >> 1;
  return hex;
}
Use .replace() for iteration (inspired by this):
/**
 * Extracts byte values from a string of hex digits, ignoring every other
 * character. String.replace is used only to iterate over the matches; its
 * return value is discarded.
 *
 * @param {string} hex - text containing hexadecimal digits.
 * @returns {number[]} one value (0-255) per pair of digits.
 */
function parseHex(hex){
  var bin = [];
  // two hex digits, optionally separated by non-hex characters
  var pairPattern = /([\da-f])[^\da-f]*([\da-f])/gi;
  function collectPair(whole, highDigit, lowDigit){
    bin.push(parseInt(highDigit + lowDigit, 16));
  }
  hex.replace(pairPattern, collectPair);
  return bin;
}
Looping with .exec() (also inspired by this):
/**
 * Extracts byte values from a string of hex digits, ignoring every other
 * character, by stepping a global regex through the input with exec().
 *
 * @param {string} hex - text containing hexadecimal digits.
 * @returns {number[]} one value (0-255) per pair of digits.
 */
function parseHex(hex){
  // two hex digits, optionally separated by non-hex characters
  var pairPattern = /([\da-f])[^\da-f]*([\da-f])/gi;
  var bin = [];
  var found = pairPattern.exec(hex);
  while(found !== null){
    bin.push(parseInt(found[1] + found[2], 16));
    found = pairPattern.exec(hex); // resumes after the previous match
  }
  return bin;
}
Performance and a non-regex solution
After running performance tests here, none of the regex approaches seem to perform significantly better than the original approach. Out of curiosity, I attempted a non-regex solution, which significantly outperforms the other approaches (at the cost of slightly more code):
/**
 * Parse hex digits into bytes without any regular expression.
 *
 * Characters are classified by char code; every hex digit contributes four
 * bits, and a byte is emitted each time a second digit completes a pair.
 *
 * @param {string} hex - Text containing hex digits and arbitrary separators.
 * @returns {number[]} One number (0-255) per complete pair of hex digits;
 *   a trailing unpaired digit is ignored.
 */
function parseHex(hex) {
  const bytes = [];
  let pendingHigh = -1; // -1 means "no first digit of a pair waiting"
  for (let pos = 0; pos < hex.length; pos += 1) {
    const code = hex.charCodeAt(pos);
    let nibble;
    if (code >= 48 && code <= 57) {
      nibble = code - 48; // '0'-'9'
    } else if (code >= 65 && code <= 70) {
      nibble = code - 55; // 'A'-'F'
    } else if (code >= 97 && code <= 102) {
      nibble = code - 87; // 'a'-'f'
    } else {
      continue; // not a hex digit: skip it
    }
    if (pendingHigh < 0) {
      pendingHigh = nibble;
    } else {
      bytes.push(pendingHigh * 16 + nibble);
      pendingHigh = -1;
    }
  }
  return bytes;
}
I will probably go for the non-regex approach.

Distribution of Number of Digits of Random Numbers

I encountered this curious phenomenon while trying to implement a UUID generator in JavaScript.
Basically, in JavaScript, if I generate a large list of random numbers with the built-in Math.random() on Node 4.2.2:
// Tally, for one million Math.random() values, how many characters each
// value has once converted to a string. Keys of `records` are the lengths.
var records = {};
var l;
for (var i = 0; i < 1e6; i++) {
  l = String(Math.random()).length;
  // A length not seen before reads as undefined, so start its count from 0.
  records[l] = (records[l] || 0) + 1;
}
console.log(records);
The numbers of digits have a strange pattern:
{ '12': 1,
'13': 11,
'14': 65,
'15': 663,
'16': 6619,
'17': 66378,
'18': 611441,
'19': 281175,
'20': 30379,
'21': 2939,
'22': 282,
'23': 44,
'24': 3 }
I thought this is a quirk of the random number generator of V8, but similar pattern appears in Python 3.4.3:
12 : 2
13 : 5
14 : 64
15 : 672
16 : 6736
17 : 66861
18 : 610907
19 : 280945
20 : 30455
21 : 3129
22 : 224
And the Python code is as follows:
import random

# Tally, for one million random.random() values, how many characters each
# value has once converted with str(). Keys of `records` are the lengths.
random.seed()
records = {}
for i in range(0, 1000000):
    n = random.random()
    l = len(str(n))
    # A length not seen before starts from 0.
    records[l] = records.get(l, 0) + 1
# Print the tallies in ascending order of length.
for i in sorted(records):
    print(i, ':', records[i])
The pattern from 18 to below is expected: say if random number should have 20 digits, then if the last digit of a number is 0, it effectively has only 19 digits. If the random number generator is good, the probability of that happening is roughly 1/10.
But why is the pattern reversed for 19 and beyond?
I guess this is related to float numbers' binary representation, but I can't figure out exactly why.
The reason is indeed related to floating point representation. A floating point number representation has a maximum number of (binary) digits it can represent, and a limited exponent value range. Now when you print this out without using scientific notation, you might in some cases need to have some zeroes after the decimal point before the significant digits start to follow.
You can visualize this effect by printing those random numbers which have the longest length when converted to string:
// Same tally of string lengths as before, but additionally print every
// random value whose string form is exactly 23 characters long.
var records = {};
var l, r;
for (var i = 0; i < 1e6; i++) {
  r = Math.random();
  l = String(r).length;
  if (l === 23) {
    console.log(r);
  }
  // A length not seen before reads as undefined, so start its count from 0.
  records[l] = (records[l] || 0) + 1;
}
This prints only the 23-long strings, and you will get numbers like these:
0.000007411070483631654
0.000053944830052166104
0.000018188989763578967
0.000029525788901141325
0.000009613635131744402
0.000005937417234758158
0.000021099748521158368
Notice the zeroes before the first non-zero digit. These are actually not stored in the number part of a floating point representation, but implied by its exponent part.
If you were to take out the leading zeroes, and then make a count:
// Tally string lengths again, this time after removing the leading run of
// "0" and "." characters so only the digits from the first non-zero one
// onwards are counted.
var records = {};
var l, r, s;
for (var i = 0; i < 1e6; i++) {
  r = Math.random();
  s = String(r).replace(/^[0\.]+/, '');
  l = s.length;
  // A length not seen before reads as undefined, so start its count from 0.
  records[l] = (records[l] || 0) + 1;
}
... you'll get results which are less strange.
However, you will see some irregularity that is due to how javascript converts tiny numbers to string: when they get too small, the scientific notation is used in the string representation. You can see this with the following script (not sure if every browser has the same breaking point, so maybe you need to play a bit with the number):
// Print the string form of a small number and of the same number divided
// by ten, to compare the notation each one gets.
var i = 0.00000123456789012345678;
var asIs = String(i);
var tenthOfIt = String(i / 10);
console.log(asIs, tenthOfIt);
This gives me the following output:
0.0000012345678901234567 1.2345678901234568e-7
So very small numbers will get a more fixed string length as a result, quite often 22 characters, while in the non-scientific notation a length of 23 is common. This influences also the second script I provided and length 22 will get more hits than 23.
It should be noted that javascript does not switch to scientific notation when converting to string in binary representation:
// Convert a very small number to its base-2 string and print it.
var i = 0.1234567890123456789e-120;
var binaryText = i.toString(2);
console.log(binaryText);
The above will print a string of over 450 binary digits!
It's because some of the values are like this:
0.00012345...
And thus they're longer.

compress/urlencode a series of 100 base-4 numbers in javascript

First thing: must be done entirely in javascript. (JQuery/mootools optional)
I have a series of 100 numbers each set 0,1,2, or 3 - these represents settings on the page. I would like to encode these to the shortest string possible to create a permalink to the page.
I am thinking the best way would be to store them in binary couplets, convert those couplets to a string, and then urlencode the string.
However, the best I have found so far is parseInt( binary_var, 2 ), which converts a binary number to a base-10 number. To get the string short enough, though, I'll need a better system.
If I could convert to 64-bit encoding I could store all the data in just 4 chars, I think. I know urls support unicode now, and I believe I can use escape and unescape to encode/decode 64-bit chars, so the main thing I am looking for is a way to encode/decode binary data to 64-bit characters.
Of course I am not 100% sure this is the best way, or that it will even work, so if I am completely off track feel free to point me in the right direction.
Thanks!
You can encode such arrays of numbers into a string, 3 per character, like this:
/**
 * Pack a list of base-4 values into a string, three values per character.
 *
 * Each character code is 32 plus a 6-bit number: the first value of the
 * group in the lowest two bits, the second in the middle two, the third in
 * the top two. Missing values in the final group count as 0.
 *
 * @param {number[]} base4 - Values to pack; only the low two bits of each
 *   entry are used.
 * @returns {string} One character (code 32-95) per group of three values.
 */
function encodeBase4(base4) {
  const codes = [];
  const groupCount = Math.floor((base4.length + 2) / 3);
  for (let group = 0; group < groupCount; group += 1) {
    const start = group * 3;
    const low = (base4[start] || 0) & 3;
    const mid = (base4[start + 1] || 0) & 3;
    const high = (base4[start + 2] || 0) & 3;
    codes.push(32 + low + mid * 4 + high * 16);
  }
  return String.fromCharCode(...codes);
}
You can then convert the other direction like this:
/**
 * Unpack a string into base-4 values, three values per character.
 *
 * For each character, 32 is subtracted from its code and the result is
 * split into three 2-bit values, lowest bits first.
 *
 * @param {string} str - Packed text.
 * @returns {number[]} Three values (each 0-3) per input character, so the
 *   length is always a multiple of three.
 */
function decodeBase4(str) {
  const values = [];
  for (let pos = 0; pos < str.length; pos += 1) {
    const packed = str.charCodeAt(pos) - 32;
    values.push(
      packed & 3,
      Math.trunc(packed / 4) & 3,
      Math.trunc(packed / 16) & 3
    );
  }
  return values;
}
Here's the jsfiddle which seems to work on its simple test case. (Note that you end up with a list that's a multiple of 3 in length; you'd have to know how many real values there are and just ignore the zeros at the end.)
Now these result strings will be "dirty" and require URL encoding if you're putting them in URLs. If you packed only 2 numbers per character, you could make the resulting strings be all alphabetic, and thus you'd avoid the encoding penalty; however they'd be longer, of course.
100 pieces of information with 2 bits each require 200 bits in total. With base 64 encoding you would require ceil(200/log2(64)) = 34 characters.
A URI path segment allows 79 characters that don’t require being encoded using the percent-encoding. If you add the path segment separator / you have 80 characters and thus require ceil(200/log2(80)) = 32 characters. That’s the optimum you can achieve using the path alone.
You could use more than these characters, even Unicode characters. But those would need to be encoded with the percent-encoding as URIs are only allowed to contain US-ASCII. A URI path like /ä (ä = U+00E4) is actually /%C3%A4 and only the browser displays it as /ä.
Here’s an example (functions taken from arbitrary base conversion in javascript):
/**
 * Look up the numeric value of a digit character in an alphabet.
 *
 * @param {string} digit - The character to look up.
 * @param {string} alphabet - All digit characters, ordered by value.
 * @returns {number} Position of the digit in the alphabet, or -1 if absent.
 */
function getValueOfDigit(digit, alphabet)
{
  return alphabet.indexOf(digit);
}

/**
 * Convert a non-negative integer written in one alphabet into another.
 *
 * The base of each alphabet is its length. The conversion works digit by
 * digit (repeated long division), so it stays exact for inputs of any
 * length; it never funnels the value through a floating-point Number,
 * which would lose precision above 2^53.
 *
 * @param {string} src - The number to convert, most significant digit first.
 *   Other values are converted with String() first.
 * @param {string} srcAlphabet - Digit characters of the source base.
 * @param {string} dstAlphabet - Digit characters of the destination base.
 * @returns {string} The same value written with dstAlphabet, without
 *   leading zero digits; the zero digit alone when the value is 0 or src
 *   is empty.
 * @throws {RangeError} If dstAlphabet has fewer than two characters, or if
 *   src contains a character that is not in srcAlphabet.
 */
function convert(src, srcAlphabet, dstAlphabet)
{
  const srcBase = srcAlphabet.length;
  const dstBase = dstAlphabet.length;
  if (dstBase < 2)
  {
    // Dividing by a base below 2 never shrinks the value, so the loop
    // below could not terminate.
    throw new RangeError("Destination alphabet needs at least two characters");
  }

  const text = String(src);
  const digits = [];
  for (let i = 0; i < text.length; i += 1)
  {
    const value = getValueOfDigit(text.charAt(i), srcAlphabet);
    if (value < 0)
    {
      throw new RangeError("Digit \"" + text.charAt(i) + "\" is not in the source alphabet");
    }
    digits.push(value);
  }

  let ret = "";
  let start = 0;
  for (;;)
  {
    // Leading zero digits carry no value; skip them so each pass is shorter.
    while (start < digits.length && digits[start] === 0)
    {
      start += 1;
    }
    if (start === digits.length)
    {
      break;
    }
    // Long division of the remaining number by dstBase, in place. The
    // remainder of the whole division is the next destination digit,
    // least significant first.
    let remainder = 0;
    for (let j = start; j < digits.length; j += 1)
    {
      const acc = remainder * srcBase + digits[j];
      digits[j] = Math.floor(acc / dstBase);
      remainder = acc % dstBase;
    }
    ret = dstAlphabet.charAt(remainder) + ret;
  }
  return ret === "" ? dstAlphabet.charAt(0) : ret;
}
// base79Alphabet lists the characters RFC 3986 allows unescaped inside a
// path segment: letters, digits, "-._~", "!$&'()*+,;=", ":" and "@".
// ("#" is not among them: it starts the fragment part of a URI.)
var base4Alphabet = "0123",
base79Alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~!$&'()*+,;=:@",
base80Alphabet = base79Alphabet+"/";
// convert() takes the digit string itself. Passing it through
// getValueOfDigit() first would look the whole string up as a single digit
// and hand convert() -1 instead.
alert(convert("010203210", base4Alphabet, base80Alphabet)); // "C+U"

Categories