javascript FileReader - parsing long file in chunks

I have a long file I need to parse. Because it's very long, I need to do it chunk by chunk. I tried this:
function parseFile(file){
    var chunkSize = 2000;
    var fileSize = (file.size - 1);

    var foo = function(e){
        console.log(e.target.result);
    };

    for(var i = 0; i < fileSize; i += chunkSize)
    {
        (function( fil, start ) {
            var reader = new FileReader();
            var blob = fil.slice(start, chunkSize + 1);
            reader.onload = foo;
            reader.readAsText(blob);
        })( file, i );
    }
}
After running it I see only the first chunk in the console. If I change console.log to a jQuery append to some div, I see only the first chunk in that div. What about the other chunks? How can I make it work?

The FileReader API is asynchronous, so you should handle it with callbacks. A plain for loop won't do the trick, since it doesn't wait for each read to complete before starting the next chunk.
Here's a working approach.
function parseFile(file, callback) {
    var fileSize = file.size;
    var chunkSize = 64 * 1024; // bytes
    var offset = 0;
    var self = this; // we need a reference to the current object
    var chunkReaderBlock = null;

    var readEventHandler = function(evt) {
        if (evt.target.error == null) {
            offset += evt.target.result.length;
            callback(evt.target.result); // callback for handling read chunk
        } else {
            console.log("Read error: " + evt.target.error);
            return;
        }
        if (offset >= fileSize) {
            console.log("Done reading file");
            return;
        }

        // off to the next chunk
        chunkReaderBlock(offset, chunkSize, file);
    }

    chunkReaderBlock = function(_offset, length, _file) {
        var r = new FileReader();
        var blob = _file.slice(_offset, length + _offset);
        r.onload = readEventHandler;
        r.readAsText(blob);
    }

    // now let's start the read with the first block
    chunkReaderBlock(offset, chunkSize, file);
}
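For example, a minimal usage sketch (assuming someFile comes from an input[type=file] element; the callback just logs each chunk as it arrives):
var someFile = document.querySelector('input[type="file"]').files[0];

parseFile(someFile, function (chunkText) {
    // Handle each chunk as it arrives, e.g. feed it to a parser or append it to a div
    console.log('Read chunk of', chunkText.length, 'characters');
});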

You can take advantage of Response (part of fetch) to convert most things to almost anything else: blob, text, json. You can also get a ReadableStream from it, which helps you read the blob in chunks 👍
var dest = new WritableStream({
  write (str) {
    console.log(str)
  }
})

var blob = new Blob(['bloby']);

(blob.stream ? blob.stream() : new Response(blob).body)
  // Decode the binary-encoded response to string
  .pipeThrough(new TextDecoderStream())
  .pipeTo(dest)
  .then(() => {
    console.log('done')
  })
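For completeness, Response also offers one-shot conversions when you don't need chunking; a minimal sketch (only sensible for blobs small enough to hold in memory at once):
var jsonBlob = new Blob(['{"hello":"world"}'], { type: 'application/json' })

// Whole-blob conversions via Response (no chunking involved)
new Response(jsonBlob).text().then(text => console.log(text))
new Response(jsonBlob).json().then(obj => console.log(obj))
new Response(jsonBlob).arrayBuffer().then(buf => console.log(buf.byteLength))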
Old answer (WritableStream's pipeTo and pipeThrough were not implemented at the time)
I came up with an interesting idea that is probably very fast, since it converts the blob to a ReadableByteStreamReader. It's probably much easier too, since you don't need to handle things like chunk size and offset and then do it all recursively in a loop.
function streamBlob(blob) {
  const reader = new Response(blob).body.getReader()
  const pump = reader => reader.read()
    .then(({ value, done }) => {
      if (done) return
      // uint8array chunk (use TextDecoder to read as text)
      console.log(value)
      return pump(reader)
    })
  return pump(reader)
}

streamBlob(new Blob(['bloby'])).then(() => {
  console.log('done')
})
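As the comment notes, each chunk arrives as a Uint8Array; a minimal sketch of decoding the chunks to text with TextDecoder (the stream: true option keeps multi-byte characters intact across chunk boundaries):
function streamBlobAsText(blob) {
  const reader = new Response(blob).body.getReader()
  const decoder = new TextDecoder()
  const pump = () => reader.read().then(({ value, done }) => {
    if (done) {
      // Flush any buffered partial character at the end
      console.log(decoder.decode())
      return
    }
    console.log(decoder.decode(value, { stream: true }))
    return pump()
  })
  return pump()
}

streamBlobAsText(new Blob(['bloby']))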

The second argument of slice is actually the end byte. Your code should look something like:
function parseFile(file){
    var chunkSize = 2000;
    var fileSize = (file.size - 1);

    var foo = function(e){
        console.log(e.target.result);
    };

    for(var i = 0; i < fileSize; i += chunkSize) {
        (function( fil, start ) {
            var reader = new FileReader();
            var blob = fil.slice(start, chunkSize + start);
            reader.onload = foo;
            reader.readAsText(blob);
        })(file, i);
    }
}
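Keep in mind that this fires all the reads in parallel, so the onload callbacks are not guaranteed to arrive in file order. A minimal sketch, assuming you want to reassemble the text in order, is to index the chunks and join them once all reads have finished:
function parseFileInOrder(file, done) {
    var chunkSize = 2000;
    var chunks = [];
    var pending = 0;

    for (var i = 0; i < file.size; i += chunkSize) {
        pending++;
        (function (index, start) {
            var reader = new FileReader();
            reader.onload = function (e) {
                chunks[index] = e.target.result; // keep chunks in file order
                if (--pending === 0) done(chunks.join(''));
            };
            reader.readAsText(file.slice(start, start + chunkSize));
        })(i / chunkSize, i);
    }
}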
Or you can use this BlobReader for an easier interface:
BlobReader(blob)
    .readText(function (text) {
        console.log('The text in the blob is', text);
    });
More information:
README.md
Docs

Revamped @alediaferia's answer as a class (TypeScript version here), returning the result in a promise. Brave coders would even have wrapped it into an async iterator… (a sketch of that follows the example below).
class FileStreamer {
    constructor(file) {
        this.file = file;
        this.offset = 0;
        this.defaultChunkSize = 64 * 1024; // bytes
        this.rewind();
    }
    rewind() {
        this.offset = 0;
    }
    isEndOfFile() {
        return this.offset >= this.getFileSize();
    }
    readBlockAsText(length = this.defaultChunkSize) {
        const fileReader = new FileReader();
        const blob = this.file.slice(this.offset, this.offset + length);

        return new Promise((resolve, reject) => {
            fileReader.onloadend = (event) => {
                const target = (event.target);
                if (target.error == null) {
                    const result = target.result;
                    this.offset += result.length;
                    this.testEndOfFile();
                    resolve(result);
                }
                else {
                    reject(target.error);
                }
            };
            fileReader.readAsText(blob);
        });
    }
    testEndOfFile() {
        if (this.isEndOfFile()) {
            console.log('Done reading file');
        }
    }
    getFileSize() {
        return this.file.size;
    }
}
Example printing a whole file in the console (within an async context):
const fileStreamer = new FileStreamer(aFile);

while (!fileStreamer.isEndOfFile()) {
    const data = await fileStreamer.readBlockAsText();
    console.log(data);
}
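As for that async iterator: a minimal sketch built on the FileStreamer class above (the generator name is just illustrative):
async function* streamFileAsText(file, chunkSize = 64 * 1024) {
    const streamer = new FileStreamer(file);
    while (!streamer.isEndOfFile()) {
        yield await streamer.readBlockAsText(chunkSize);
    }
}

// Usage (within an async context):
for await (const chunk of streamFileAsText(aFile)) {
    console.log(chunk);
}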

Parsing a large file into small chunks using a simple method:
//Parse large file in to small chunks
var parseFile = function (file) {
    var chunkSize = 1024 * 1024 * 16; // 16MB chunk size
    var fileSize = file.size;
    var currentChunk = 1;
    var totalChunks = Math.ceil(fileSize / chunkSize);

    while (currentChunk <= totalChunks) {
        var offset = (currentChunk - 1) * chunkSize;
        var currentFilePart = file.slice(offset, offset + chunkSize);

        console.log('Current chunk number is ', currentChunk);
        console.log('Current chunk data', currentFilePart);

        currentChunk++;
    }
};
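Note that slice only creates Blob references; to actually read each chunk's contents you still need a FileReader or Blob.text(). A minimal sketch, assuming an async context and a modern browser:
var readFileInChunks = async function (file) {
    var chunkSize = 1024 * 1024 * 16; // 16MB chunk size
    for (var offset = 0; offset < file.size; offset += chunkSize) {
        var chunkBlob = file.slice(offset, offset + chunkSize);
        var chunkText = await chunkBlob.text(); // Blob.text() resolves with the chunk contents
        console.log('Read chunk starting at byte', offset, 'with length', chunkText.length);
    }
};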

Related

Why is writing the same data to IndexedDB a second time consistently slower?

I stored some JPEG files (exactly 350, same files, same size; total: 336.14 MB) as Blobs in IndexedDB. It took around 1 second to complete the transaction. Then I read all the data from IndexedDB into an array and stored it to IndexedDB again. But this time it takes around 15 seconds. I observed this as consistent behavior. Is anything wrong here? I used performance.now() to get the time difference.
Files: 350,
Size of each: 937 KB,
Browser: Chrome and Chromium Edge
//Open
var dbOpen = indexedDB.open(INDEXED_DB_NAME, INDEXED_DB_VERSION);
dbOpen.onupgradeneeded = function (e) {
    console.log("onupgradeneeded");
    var store = e.currentTarget.result.createObjectStore(
        IMAGE_DATA_STORE, { autoIncrement: true });
};
dbOpen.onsuccess = function (e) {
    image_data_db = dbOpen.result;
    console.log("indexed DB opened");
};

//Initial Write
var inputFiles = document.getElementById('inputFiles');
for (var i = 0; i < inputFiles.files.length; i++) {
    let file = inputFiles.files[i];
    var b = new Blob([file], { type: file.type });
    fileblobs.push(b);
}
StoreIdb(fileblobs); // < First write

//StoreIdb()
t0 = performance.now();
var trx = image_data_db.transaction(IMAGE_DATA_STORE, 'readwrite');
var imagestore = trx.objectStore(IMAGE_DATA_STORE);
for (i = 0; i < fileblobs.length; i++) {
    request = imagestore.add(fileblobs[i]);
    request.onsuccess = function (e) {
        console.log('added');
    };
    request.onerror = function (e) {
        console.error("Request Error", this.error);
    };
}
trx.onabort = function (e) {
    console.error("Exception:", this.error, this.error.name);
};
trx.oncomplete = function (e) {
    console.log('completed');
    t1 = performance.now();
    timetaken = t1 - t0;
}

//Read
var objectStore = image_data_db.transaction(IMAGE_DATA_STORE).objectStore(IMAGE_DATA_STORE);
objectStore.openCursor().onsuccess = function (e) {
    var cursor = e.target.result;
    if (cursor) {
        blobArray.push(cursor.value.blob);
        cursor.continue();
    }
    else {
        // completed
    }
}
// blobArray will be used for second time << Second Write
I figured it out. The first time, it was storing the File instance blob. I changed the File instance blob to an ArrayBuffer-backed Blob, just to ensure the data type is the same in both cases. Now it takes the same time.
for (var i = 0; i < inputFiles.files.length; i++) {
    let file = inputFiles.files[i];
    file.arrayBuffer().then((arrayBuffer) => {
        let blob = new Blob([new Uint8Array(arrayBuffer)], { type: file.type });
        blobs.push(blob);
        if (blobs.length == inputFiles.files.length) {
            callback(blobs);
        }
    });
}

js can I read a portion of a file with FileReader? [duplicate]


Merge multiple pdf blobs into one

Okay, so I'm converting HTML into PDFs. The PDF returned from my backend is converted into a new Blob with type: 'application/pdf', and this all works fine. I now want to merge multiple Blobs into one, and I'm using the following function to do so.
function ConcatenateBlobs(blobs, type, callback) {
    var buffers = [];
    var index = 0;

    function readAsArrayBuffer() {
        if (!blobs[index]) {
            return concatenateBuffers();
        }
        var reader = new FileReader();
        reader.onload = function(event) {
            buffers.push(event.target.result);
            index++;
            readAsArrayBuffer();
        };
        reader.readAsArrayBuffer(blobs[index]);
    }
    readAsArrayBuffer();

    function concatenateBuffers() {
        var byteLength = 0;
        buffers.forEach(function(buffer) {
            byteLength += buffer.byteLength;
        });

        var tmp = new Uint16Array(byteLength);
        var lastOffset = 0;
        buffers.forEach(function(buffer) {
            // BYTES_PER_ELEMENT == 2 for Uint16Array
            var reusableByteLength = buffer.byteLength;
            if (reusableByteLength % 2 != 0) {
                buffer = buffer.slice(0, reusableByteLength - 1)
            }
            tmp.set(new Uint16Array(buffer), lastOffset);
            lastOffset += reusableByteLength;
        });

        var blob = new Blob([tmp.buffer], {
            type: type
        });

        callback(blob);
    }
}
But for some reason I only get the last PDF in the array showing up in the result.
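For what it's worth, the offset bookkeeping above mixes byte counts with Uint16Array element indexes; a byte-wise concatenation with Uint8Array avoids that. A minimal sketch (note that simply concatenating the raw bytes of several PDFs does not by itself produce a valid merged PDF document; a PDF library is needed for a real merge):
function concatenateBlobBytes(blobs, type, callback) {
    // Read every blob as an ArrayBuffer, then copy the raw bytes into one buffer
    Promise.all(blobs.map(function (b) { return b.arrayBuffer(); }))
        .then(function (buffers) {
            var total = buffers.reduce(function (sum, buf) { return sum + buf.byteLength; }, 0);
            var tmp = new Uint8Array(total);
            var offset = 0;
            buffers.forEach(function (buf) {
                tmp.set(new Uint8Array(buf), offset);
                offset += buf.byteLength;
            });
            callback(new Blob([tmp.buffer], { type: type }));
        });
}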

Calculate MD5 hash of a large file using javascript

How do you upload a 500 MB file and get an MD5 hash of it with CryptoJS?
Here is my code:
$('#upload-file').change(function(){
    var reader = new FileReader();
    reader.addEventListener('load', function () {
        var hash = CryptoJS.MD5(CryptoJS.enc.Latin1.parse(this.result));
        window.md5 = hash.toString(CryptoJS.enc.Hex);
    });
    reader.readAsBinaryString(this.files[0]);
});
If the file is under 200 MB, it works. Anything bigger and this.result is an empty "".
I've tried:
filereader api on big files
javascript FileReader - parsing long file in chunks
and almost got this to work, but the console is complaining about .join("")
http://dojo4.com/blog/processing-huge-files-with-an-html5-file-input
CryptoJS has a progressive API for hash digests. The rest is taken from alediaferia's answer with slight modifications.
function process() {
    getMD5(
        document.getElementById("my-file-input").files[0],
        prog => console.log("Progress: " + prog)
    ).then(
        res => console.log(res),
        err => console.error(err)
    );
}

function readChunked(file, chunkCallback, endCallback) {
    var fileSize = file.size;
    var chunkSize = 4 * 1024 * 1024; // 4MB
    var offset = 0;

    var reader = new FileReader();
    reader.onload = function() {
        if (reader.error) {
            endCallback(reader.error || {});
            return;
        }
        offset += reader.result.length;
        // callback for handling read chunk
        // TODO: handle errors
        chunkCallback(reader.result, offset, fileSize);
        if (offset >= fileSize) {
            endCallback(null);
            return;
        }
        readNext();
    };

    reader.onerror = function(err) {
        endCallback(err || {});
    };

    function readNext() {
        var fileSlice = file.slice(offset, offset + chunkSize);
        reader.readAsBinaryString(fileSlice);
    }
    readNext();
}

function getMD5(blob, cbProgress) {
    return new Promise((resolve, reject) => {
        var md5 = CryptoJS.algo.MD5.create();
        readChunked(blob, (chunk, offs, total) => {
            md5.update(CryptoJS.enc.Latin1.parse(chunk));
            if (cbProgress) {
                cbProgress(offs / total);
            }
        }, err => {
            if (err) {
                reject(err);
            } else {
                // TODO: Handle errors
                var hash = md5.finalize();
                var hashHex = hash.toString(CryptoJS.enc.Hex);
                resolve(hashHex);
            }
        });
    });
}
<script src="https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.2/components/core.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.2/components/md5.js"></script>
<input id="my-file-input" type="file">
<button onclick="process()">Process</button>
You don't need to read the whole file at once and feed it all in one go to the CryptoJS routines.
You can create the hasher object, feed it chunks as you read them, and then get the final result.
Sample taken from the CryptoJS documentation:
var sha256 = CryptoJS.algo.SHA256.create();
sha256.update("Message Part 1");
sha256.update("Message Part 2");
sha256.update("Message Part 3");
var hash = sha256.finalize();
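Combined with the chunked reader above, a minimal sketch of hashing a File incrementally (assuming the readChunked helper from the previous answer is in scope and the CryptoJS sha256 component is loaded):
function getSHA256(file, cbProgress) {
    return new Promise(function (resolve, reject) {
        var sha256 = CryptoJS.algo.SHA256.create();
        readChunked(file, function (chunk, offs, total) {
            // Each chunk arrives as a binary string; parse it as Latin1 for CryptoJS
            sha256.update(CryptoJS.enc.Latin1.parse(chunk));
            if (cbProgress) cbProgress(offs / total);
        }, function (err) {
            if (err) reject(err);
            else resolve(sha256.finalize().toString(CryptoJS.enc.Hex));
        });
    });
}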

Multiple XMLHttpRequest.send or eventlisteners memory leak?

I'm currently implementing file uploads. Because I have to handle huge files here and there, I've started to slice files and send them in 1 MB chunks, which works great as long as the files are below roughly 500 MB. After that it seems that memory isn't freed, seemingly at random, and I can't figure out what I'm missing here.
Prepare chunks
var sliceCount = 0;
var sendCount = 0;
var fileID = generateUUID();
var maxChunks = 0;
var userNotified = false;

function parseFile(file)
{
    var fileSize = file.size;
    var chunkSize = 1024 * 1024; //64 * 1024; // bytes
    var offset = 0;
    var self = this; // we need a reference to the current object
    var chunkReaderBlock = null;
    var numberOfChunks = fileSize / chunkSize;
    maxChunks = Math.ceil(numberOfChunks);

    // gets called if chunk is read into memory
    var readEventHandler = function (evt)
    {
        if (evt.target.error == null) {
            offset += evt.target.result.byteLength;
            sendChunkAsBinary(evt.target.result);
        }
        else
        {
            console.log("Read error: " + evt.target.error);
            return;
        }
        if (offset >= fileSize) {
            console.log("Done reading file");
            return;
        }

        // off to the next chunk
        chunkReaderBlock(offset, chunkSize, file);
    }

    chunkReaderBlock = function (_offset, length, _file)
    {
        var r = new FileReader();
        var blob = _file.slice(_offset, length + _offset);
        sliceCount++;
        console.log("Slicecount: " + sliceCount);
        r.onload = readEventHandler;
        r.readAsArrayBuffer(blob);
        blob = null;
        r = null;
    }

    // now let's start the read with the first block
    chunkReaderBlock(offset, chunkSize, file);
}
Send Chunks
function sendChunkAsBinary(chunk)
{
    var progressbar = $("#progressbar"), bar = progressbar.find('.uk-progress-bar');

    // create XHR instance
    var xhr = new XMLHttpRequest();

    // send the file through POST
    xhr.open("POST", 'upload.php', true);

    var progressHandler = function (e)
    {
        // get percentage of how much of the current file has been sent
        var position = e.loaded || e.position;
        var total = e.total || e.totalSize;
        var percentage = Math.round((sendCount / maxChunks) * 100);

        // set bar width to keep track of progress
        bar.css("width", percentage + "%").text(percentage + "%");
    }

    // let's track upload progress
    var eventSource = xhr.upload || xhr;
    eventSource.addEventListener("progress", progressHandler);

    // state change observer - we need to know when and if the file was successfully uploaded
    xhr.onreadystatechange = function ()
    {
        if (xhr.readyState == 4)
        {
            if (xhr.status == 200)
            {
                eventSource.removeEventListener("progress", progressHandler);
                if (sendCount == maxChunks && !userNotified)
                {
                    userNotified = true;
                    notifyUserSuccess("Datei hochgeladen!");
                    setTimeout(function ()
                    {
                        progressbar.addClass("uk-invisible");
                        bar.css("width", "0%").text("0%");
                    }, 250);
                    updateDocList();
                }
            }
            else
            {
                notifyUser("Fehler beim hochladen der Datei!");
            }
        }
    };

    var blob;
    if (typeof window.Blob == "function") {
        blob = new Blob([chunk]);
    } else {
        var bb = new (window.MozBlobBuilder || window.WebKitBlobBuilder || window.BlobBuilder)();
        bb.append(chunk);
        blob = bb.getBlob();
    }

    sendCount++;

    var formData = new FormData();
    formData.append("chunkNumber", sendCount);
    formData.append("maxChunks", maxChunks);
    formData.append("fileID", fileID);
    formData.append("chunkpart", blob);

    xhr.send(formData);
    progressbar.removeClass("uk-invisible");
    console.log("Sendcount: " + sendCount);
}
If I attach the debugger in Visual Studio 2015, it takes a bit, but soon I get an OutOfMemoryException in the send function, at exactly this line: blob = new Blob([chunk]);. It's always the same line where the exception occurs.
As soon as the exception happens I get POST [...]/upload.php net::ERR_FILE_NOT_FOUND, yet I still receive the chunks in my PHP file.
Here's a timeline graph of my error.
What I don't understand: I can't see memory increasing in the Task Manager (a few MB of course, but nothing close to the 16 GB of RAM I have).
So can anyone tell me where this leak comes from? What am I missing?
