I want to compare two large files (5GB+) and find out whether they are the same or not. One solution I considered is hashing both with crypto and then comparing the hashes. But this would take a lot of time, since I would have to go through the entire files instead of stopping as soon as a difference is found.
Another solution I thought of was to compare the files as they are being streamed with fs.createReadStream() and break when a difference is found.
stream.on('data', (data) => {
//compare the data from this stream with the other stream
})
But I am not quite sure how I can have two streams that are synchronized.
As requested in your comments, here's one way to implement this. Here's how it works:
Open each of the two files
Compare the two file sizes. If not the same, resolve false.
Allocate two 8k buffers (you can choose the size of buffer to use)
Read 8k of each file (or less if not 8k left in the file) into your buffers
Compare those two buffers. If not identical, resolve false.
When you finish comparing all the bytes, resolve true
Here's the code:
const fs = require('fs');
const fsp = fs.promises;
// resolves to true or false
async function compareFiles(fname1, fname2) {
const kReadSize = 1024 * 8;
let h1, h2;
try {
h1 = await fsp.open(fname1);
h2 = await fsp.open(fname2);
const [stat1, stat2] = await Promise.all([h1.stat(), h2.stat()]);
if (stat1.size !== stat2.size) {
return false;
}
const buf1 = Buffer.alloc(kReadSize);
const buf2 = Buffer.alloc(kReadSize);
let pos = 0;
let remainingSize = stat1.size;
while (remainingSize > 0) {
let readSize = Math.min(kReadSize, remainingSize);
let [r1, r2] = await Promise.all([h1.read(buf1, 0, readSize, pos), h2.read(buf2, 0, readSize, pos)]);
if (r1.bytesRead !== readSize || r2.bytesRead !== readSize) {
throw new Error("Failed to read desired number of bytes");
}
if (buf1.compare(buf2, 0, readSize, 0, readSize) !== 0) {
return false;
}
remainingSize -= readSize;
pos += readSize;
}
return true;
} finally {
if (h1) {
await h1.close();
}
if (h2) {
await h2.close();
}
}
}
// sample usage
compareFiles("temp.bin", "temp2.bin").then(result => {
console.log(result);
}).catch(err => {
console.log(err);
});
This could be sped up a bit by opening and closing the files in parallel, using Promise.allSettled() to track when they are both open and then both closed. But because of the complications when one file opens successfully and the other doesn't (and you don't want to leak the one opened file handle), it takes a bit more code to do that perfectly, so I kept it simpler here.
And, if you really wanted to optimize for performance, it would be worth testing larger buffers to see if it makes things faster or not.
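For example, a rough way to test that would be to make the read size a parameter (a hypothetical third argument, not part of the code above) and time a few runs:
// Hypothetical sketch: assumes compareFiles() has been modified to accept
// the read size as an optional third parameter instead of the kReadSize constant.
async function timeBufferSizes() {
    for (const size of [8 * 1024, 64 * 1024, 1024 * 1024]) {
        const start = Date.now();
        await compareFiles("temp.bin", "temp2.bin", size);
        console.log(`${size}-byte buffers: ${Date.now() - start} ms`);
    }
}
timeBufferSizes().catch(err => console.log(err));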
It's also possible that buf1.equals(buf2) might be faster than buf1.compare(buf2), but you have to make sure that a partial buffer read at the end of the file still works properly when using it, since .equals() always compares the entire buffer. You could build two versions and compare their performance.
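For reference, here's a minimal, untested sketch of how the .equals() variant could handle a partial read at the end of the file, by comparing only the filled portion of each buffer:
// Inside the read loop: compare only the first readSize bytes of each buffer,
// so a partial final read never compares stale bytes past the end of the data.
if (!buf1.subarray(0, readSize).equals(buf2.subarray(0, readSize))) {
    return false;
}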
Here's a more complicated version that opens and closes the files in parallel and might be slightly faster:
const fs = require('fs');
const fsp = fs.promises;
async function compareFiles(fname1, fname2) {
const kReadSize = 1024 * 8;
let h1, h2;
try {
let openResults = await Promise.allSettled([fsp.open(fname1), fsp.open(fname2)]);
let err;
if (openResults[0].status === "fulfilled") {
h1 = openResults[0].value;
} else {
err = openResults[0].reason;
}
if (openResults[1].status === "fulfilled") {
h2 = openResults[1].value;
} else {
err = openResults[1].reason;
}
// after h1 and h2 are set (so they can be properly closed)
// throw any error we got
if (err) {
throw err;
}
const [stat1, stat2] = await Promise.all([h1.stat(), h2.stat()]);
if (stat1.size !== stat2.size) {
return false;
}
const buf1 = Buffer.alloc(kReadSize);
const buf2 = Buffer.alloc(kReadSize);
let pos = 0;
let remainingSize = stat1.size;
while (remainingSize > 0) {
let readSize = Math.min(kReadSize, remainingSize);
let [r1, r2] = await Promise.all([h1.read(buf1, 0, readSize, pos), h2.read(buf2, 0, readSize, pos)]);
if (r1.bytesRead !== readSize || r2.bytesRead !== readSize) {
throw new Error("Failed to read desired number of bytes");
}
if (buf1.compare(buf2, 0, readSize, 0, readSize) !== 0) {
return false;
}
remainingSize -= readSize;
pos += readSize;
}
return true;
} finally {
// does not return file close errors
// but does hold resolving the promise until the files are closed
// or had an error trying to close them
// Since we didn't write to the files, a close error would be fairly
// unprecedented unless the disk went down
const closePromises = [];
if (h1) {
closePromises.push(h1.close());
}
if (h2) {
closePromises.push(h2.close());
}
await Promise.allSettled(closePromises);
}
}
compareFiles("temp.bin", "temp2.bin").then(result => {
console.log(result);
}).catch(err => {
console.log(err);
});
There are certainly libraries that do this, and file-sync-cmp is very popular (270k weekly downloads). It does the comparison in the simplest way, by reading the same number of bytes from the two files in different buffers, and then comparing the buffers byte by byte.
There's also a more modern library, filecompare, "using native Promises and native BufferTools (alloc and Buffer comparisons)".
Whenever practical, don't reinvent the wheel :)
Since the difference might be at the very end of the files, calculating a hash of each file is the most straightforward and reliable process, though also a costly one.
Did you try the MD5-File npm package and get some performance indicators?
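If you do go the hashing route, here's a minimal sketch using Node's built-in crypto module rather than any particular package (the file names are placeholders):
const crypto = require('crypto');
const fs = require('fs');
// Hash one file by streaming it through a SHA-256 hash.
function hashFile(fname) {
    return new Promise((resolve, reject) => {
        const hash = crypto.createHash('sha256');
        fs.createReadStream(fname)
            .on('error', reject)
            .on('data', chunk => hash.update(chunk))
            .on('end', () => resolve(hash.digest('hex')));
    });
}
// Compare the two digests; note that this always reads both files in full.
Promise.all([hashFile('temp.bin'), hashFile('temp2.bin')])
    .then(([h1, h2]) => console.log(h1 === h2))
    .catch(err => console.log(err));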
I'm trying to read through a file using a stream. The file is a custom file format that has some number of sections, each being 66 bytes. I have to check the value of the first 32 bytes against a given 32 bytes, and if they match, return the remaining 34 bytes. Currently I'm doing the following:
const fs = require('fs')
/**
* @param {string} known
*/
function check(known) {
let stream = fs.createReadStream('path/to/file', {highWaterMark: 66});
let out = '';
stream.on('data', (chunk) => {
let temp = '';
for (let i = 0; i < 32; i++) {
temp += String.fromCharCode(chunk.at(i));
}
if (temp === known) {
for (let i = 32; i < 66; i++) {
out += String.fromCharCode(chunk.at(i));
}
}
});
return out;
}
However (from checking with console.log) I know that the function in stream.on is run after check "finishes", so regardless of whether the given string is found, the returned value will always be the same. How can I fix this? Thanks in advance.
Streams are non-blocking and asynchronous. They call their event handlers some time in the future, after your check() function has returned. That is how they work.
As such, you cannot directly return your result from the function because the function returns before the result is known. Instead, you have to return the result using an asynchronous mechanism such as an event, a callback or a promise (I would prefer a promise in this case). see How to return the response from an asynchronous call for more detail on returning your asynchronously retrieved value.
But, using a stream to read a file that is structured in 66 byte chunks is the hard way to write this code because data blocks are going to be randomly sized and are not guaranteed to line up with your 66 byte blocks. It would be simpler to use const fileHandle = await fs.promises.open(...) and fileHandle.read(...) to read a 66 byte chunk from the beginning of the file, check the 32 bytes you want to check and then communicate back the remaining 34 bytes using a promise.
Here's one such way to write it using handle.read() instead:
const fsp = require('fs').promises;
async function check(known) {
const kReadLen = 66;
const kHeaderLen = 32;
let handle = await fsp.open('path/to/file');
try {
let buffer = Buffer.alloc(kReadLen, 0);
let { bytesRead } = await handle.read(buffer, 0, kReadLen, 0);
if (bytesRead !== kReadLen) {
throw new Error("Insufficient data in the file");
}
const headerStr = buffer.toString('utf8', 0, kHeaderLen);
if (headerStr === known) {
return buffer.toString('utf8', kHeaderLen, kReadLen);
} else {
return "";
}
} finally {
await handle.close();
}
}
Sample Usage:
check(testKnown).then(result => {
console.log(result);
}).catch(err => {
console.log(err);
})
Hi, how can I break out of the for loop? I want to be able to break out of it in the callback, in the if statement.
I want this program to create a folder in the given directory. Every time it throws an error because the folder already exists, I want it to change the folder name by adding a number to it, so it keeps trying unique folder names until one no longer throws an error.
I will check for the error code later; help me solve this first.
const fs = require('fs');
const path = require('path');
function folder(folderName) {
for (let i = 1; i <= 10; i++) {
let pathNumber = i;
let fullPath = folderName + pathNumber;
fs.mkdir(path.join("D:", fullPath), (err) => {
if (!err) {
return; // I want to break out of the loop here
}
})
}
}
folder("folder");
You can't write the code that way because the for loop will already be done before any of the fs.mkdir() callbacks are called. They are asynchronous and happen LATER.
If you want to execute one iteration of the loop, including the fs.mkdir() before moving on to any other, then you can use async/await with fs.promises.mkdir().
Here's what a solution could look like with fs.promises.mkdir(). I've also added error handling for the case where all 10 sub-dir names you're trying already exist.
async function folder(folderName) {
let lastError;
for (let pathNumber = 1; pathNumber <= 10; pathNumber++) {
let fullPath = path.join("D:", folderName + pathNumber);
try {
await fs.promises.mkdir(fullPath);
return fullPath;
} catch(e) {
lastError = e;
// ignore error so we keep trying other numbers
}
}
throw lastError;
}
folder("folder").then(fullPath => {
console.log(`dir created: ${fullPath}`);
}).catch(err => {
console.log(err);
});
Much simpler without await:
const numFolders = 10,
    folders = Array.from(Array(numFolders), (_, i) => `folder${i + 1}`),
    len = folders.length;
let cnt = 0;
const makeFolder = () => {
    if (cnt >= len) return; // stop because all names have been tried
    fs.mkdir(path.join("D:", folders[cnt]), (err) => {
        cnt++;
        if (err) {
            makeFolder(); // only call again if there was an error
        }
    });
};
makeFolder();
I am giving workers in JS a try, and I made a simple sort using the same JS sort function in both cases. The comparison I am making just sorts 60000 random numbers with an async function. The first one sorts the random numbers the way we are traditionally used to doing it:
async function normalSort(arr) {
return new Promise((res) => {
let copy = arr;
copy.sort((a, b) => a > b ? 1 : -1);
return res(copy)
})
}
The other is a normal function which will be called from a workersHandler function:
const { Worker, parentPort, workerData } = require('worker_threads');
function sort(data) {
let copy = data;
copy.sort((a, b) => a > b ? 1 : -1);
parentPort.postMessage(copy)
process.exit();
}
sort(workerData);
The workers handler function:
const os = require('os');
const path = require('path');
const { Worker } = require('worker_threads');
async function workersHandler(arr) {
const startTime = Date.now();
const cpusAmount = os.cpus().length;
const chSize = Math.ceil(arr.length / cpusAmount)
let promises = [];
for (let i = 0; i < arr.length; i += chSize) {
const end = i + chSize;
const currentChunk = arr.slice(i, end);
const promise = new Promise((res, rej) => {
//@ts-ignore
const worker = new Worker(path.join(__dirname, '..', '/utils/sort.js'), { workerData: currentChunk })
worker.on('message', res)
worker.on('error', rej)
})
promises.push(promise);
}
let result = await Promise.all(promises)
return result;
}
And the main function, which will call the other functions:
function main() {
let arr = new Array(60000).fill(0).map((_, i) => Math.round(Math.random() * 100));
const startTime = Date.now();
workersHandler(arr).then(r => console.log('workers sort', Date.now() - startTime + ' ms'))
normalSort(arr).then(r => console.log('normal sort', Date.now() - startTime + ' ms'))
}
main();
Surprisingly, the normal sort function is way faster, even though it works in one thread.
I am getting 101 ms for the workers function
and 53 ms for the normal sort function.
Could someone explain these weird results to me? Are workers not that fast, or am I making a wrong implementation?
Basically, using a single worker thread and waiting for it to do the work will always be slower than doing the work in the local thread, because:
Creating threads takes time.
Sending data between threads takes time.
Where you might get gains is if you have isolated pieces of work that can be handled in parallel, and multiple CPU cores to work with. In that situation, you can send different pieces of work to multiple workers (up to as many CPU cores as are available), provided the work isn't constrained by some other single resource they'd all be competing for.
Below I've posted a program that sorts 12 arrays locally and via workers with repeated races. (When sorting in workers, it transfers the array data to the worker and then back rather than copying it.) It starts the workers in advance and reuses them, but it includes the time that took when determining the average time the workers took to do their work, so we're including all overhead.
On my workstation, with four CPU cores and letting it have a worker for each core, workers easily win:
# of workers: 4
Local average: 8790.010573029518ms
Workers' average: 3550.658817946911ms
Workers win, taking 40.39425% of the time local did
If I limit it to one worker, though, the worker is pure overhead and the local thread wins:
# of workers: 1
Local average: 8907.022233068943ms
Workers' average: 8953.339844942093ms
Local wins, taking 99.48268% of the time workers did
Even just two workers wins, because they can work in parallel on this multi-core machine:
# of workers: 2
Local average: 8782.853852927685ms
Workers' average: 4754.60275799036ms
Workers win, taking 54.13505% of the time local did
On a single core machine (if you can find one anymore), those two workers would be pure overhead again, and the local thread would win.
Here's main.js:
const os = require('os');
const { Worker } = require('worker_threads');
const { performance } = require('perf_hooks');
const MAX_UINT32 = (2**32)-1;
const ARRAY_SIZE = 100000;
const ARRAY_COUNT = 12;
const workerCount = +process.argv[2] || os.cpus().length;
const raceCount = +process.argv[3] || 5;
class WorkerQueue {
#workers;
#available;
#pending;
#checkPending = () => { // private methods still aren't unflagged in v13, so...
if (this.#available.length && this.#pending.length) {
const resolve = this.#pending.shift();
const worker = this.#available.shift();
resolve(worker);
}
};
constructor(...workers) {
this.#workers = new Set(workers);
this.#available = [...this.#workers];
this.#pending = [];
}
get() {
return new Promise(resolve => {
this.#pending.push(resolve);
this.#checkPending();
});
}
release(worker) {
if (!this.#workers.has(worker)) {
throw new Error("Uknown worker");
}
this.#available.push(worker);
this.#checkPending();
}
terminate() {
for (const worker of this.#workers) {
worker.terminate();
}
this.#workers = new Set();
this.#available = [];
this.#pending = [];
}
}
const {workers, workerCreationTime} = createWorkers();
main();
function createWorkers() {
const start = performance.now();
const workers = new WorkerQueue(
...Array.from({length: workerCount}, () => new Worker("./worker.js"))
);
const workerCreationTime = performance.now() - start;
return {workers, workerCreationTime};
}
async function main() {
try {
console.log(`Workers: ${workerCount} (in ${workerCreationTime}ms), races: ${raceCount}`);
let localAverage = 0;
let workersAverage = 0;
for (let n = 1; n <= raceCount; ++n) {
console.log(`Race #${n}:`);
const {localTime, workersTime} = await sortRace();
localAverage += localTime;
workersAverage += workersTime;
}
// Include the time it took to create the workers in the workers' average, as
// though we'd created them for each race. (We didn't because doing so would
// have given the local thread an advantage: after the first race, it's warmed
// up, but a new worker would be cold. So we let the workers be warm but add
// the full creation time into each race.)
workersAverage += workerCreationTime;
console.log("----");
console.log(`# of workers: ${workerCount}`);
console.log(`Local average: ${localAverage}ms`);
console.log(`Workers' average: ${workersAverage}ms`);
if (localAverage > workersAverage) {
showWinner("Workers win", "local", workersAverage, localAverage);
} else {
showWinner("Local wins", "workers", localAverage, workersAverage);
}
workers.terminate();
} catch (e) {
console.error(e.message, e.stack);
}
}
function showWinner(msg, loser, winnerAverage, loserAverage) {
const percentage = (winnerAverage * 100) / loserAverage;
console.log(`${msg}, taking ${percentage.toFixed(5)}% of the time ${loser} did`);
}
async function sortRace() {
// Create a bunch of arrays for local to sort
const localArrays = Array.from({length: ARRAY_COUNT}, () => createRandomArray(ARRAY_SIZE));
// Copy those array so the workers are dealing with the same values
const workerArrays = localArrays.map(array => new Uint32Array(array));
const localStart = performance.now();
const localResults = await Promise.all(localArrays.map(sortLocal));
const localTime = performance.now() - localStart;
checkResults(localResults);
console.log(`Local time: ${localTime}ms`);
const workerStart = performance.now();
const workersResults = await Promise.all(workerArrays.map(sortViaWorker));
const workersTime = performance.now() - workerStart;
checkResults(workersResults);
console.log(`Workers' time: ${workersTime}ms`);
return {localTime, workersTime};
}
async function sortLocal(array) {
await Promise.resolve(); // To make it start asynchronously, like `sortViaWorker` does
array.sort((a, b) => a - b);
return array;
}
async function sortViaWorker(array) {
const worker = await workers.get();
return new Promise(resolve => {
worker.once("message", result => {
workers.release(worker);
resolve(result.array);
});
worker.postMessage({array}, [array.buffer]);
});
}
function checkResults(arrays) {
for (const array of arrays) {
const badIndex = array.findIndex((value, index) => index > 0 && array[index-1] > value);
if (badIndex !== -1) {
throw new Error(
`Error, array entry ${badIndex} has value ${array[badIndex]} ` +
`which is > previous value ${array[badIndex-1]}`
);
}
}
}
function createRandomArray(length) {
const array = new Uint32Array(Uint32Array.BYTES_PER_ELEMENT * length);
return randomFillArray(array);
}
function randomFillArray(array) {
for (let length = array.length, i = 0; i < length; ++i) {
array[i] = Math.random() * MAX_UINT32;
}
return array;
}
and worker.js:
const { parentPort } = require("worker_threads");
parentPort.on("message", ({array}) => {
array.sort((a, b) => a - b);
parentPort.postMessage({array}, [array.buffer]);
});
60000 elements may not be enough; IPC times matter.
Btw, regarding IPC: generic JavaScript data types, including generic JS arrays, are heavy when copied to workers, but there are binary array types: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/TypedArray
postMessage() has a transfer argument, but it applies to a limited number of types only.
https://nodejs.org/api/worker_threads.html#worker_threads_port_postmessage_value_transferlist and https://developer.mozilla.org/en-US/docs/Web/API/Worker/postMessage:
postMessage(value[, transferList])
node: transferList may be a list of ArrayBuffer and MessagePort objects. After transferring, they will not be usable on the sending side of the channel anymore (even if they are not contained in value).
MDN: An optional array of Transferable objects to transfer ownership of. If the ownership of an object is transferred, it becomes unusable (neutered) in the context it was sent from and becomes available only to the worker it was sent to. Transferable objects are instances of classes like ArrayBuffer, MessagePort or ImageBitmap objects that can be transferred.
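For example, in Node's worker_threads a transfer (rather than a copy) of a typed array's underlying buffer could look roughly like this (the worker script name is hypothetical):
const { Worker } = require('worker_threads');
const worker = new Worker('./sort-worker.js'); // hypothetical worker script
const data = new Uint32Array(1000000);
// Transfer ownership of the underlying ArrayBuffer instead of copying it;
// after this call, `data` is no longer usable in this thread.
worker.postMessage({ array: data }, [data.buffer]);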
Effect of types:
let typ=prompt("Type: 0/1/2/3 (Array/Float64Array/Float32Array/Uint32Array)");
let len=parseInt(prompt("Length"));
let basearray;
switch(typ){
case "1":basearray=new Float64Array(len);break;
case "2":basearray=new Float32Array(len);break;
case "3":basearray=new Uint32Array(len);break;
default: basearray=new Array(len);break;
}
for(let i=0;i<basearray.length;i++)
basearray[i]=Math.random()*0x1000000;
let cpus=4,
chunksize=basearray.length/cpus,
chunks=[],chunksw=[];
for(let i=0;i<cpus;i++)
chunksw[i]=(chunks[i]=basearray.slice(i*chunksize,(i+1)*chunksize)).slice();
let start=Date.now();
for(let i=0;i<cpus;i++)
chunks[i].sort((a,b)=>a-b);
console.log("Seq:",Date.now()-start);
let code="onmessage=event=>postMessage(event.data.sort((a,b)=>a-b));";
let ws=[],cnt=0;
for(let i=0;i<cpus;i++){
ws[i]=new Worker("data:text/plain,"+escape(code));
let j=i;
ws[i].onmessage=event=>{
chunksw[j]=event.data;
if(++cnt===cpus){
console.log("Par:",Date.now()-start);
if(len<=20)
for(let i=0;i<cpus;i++)
console.log(chunks[i],chunksw[i]);
}
};
}
start=Date.now();
for(let i=0;i<cpus;i++)
ws[i].postMessage(chunksw[i]);
Specify a length divisible by 4. If the length is 20 or less, the resulting sorted chunks are logged too, for verification purposes. JS Arrays are reliably slower for me when passed around (compared to the thread-less run), regardless of whether they contain 20 or 6000000 elements (while a 6-million-element JS array runs for 8 seconds for me on an older laptop, it may still be safer to start with something less). The other types are faster when threaded, Uint being the fastest.
Actually anything which is not 1/2/3 is going to result in a JS Array (the slowest one), including the empty string.
The effect of transfer is not that spectacular, but it already appears from the beginning (with 4 elements it is 59-69 ms vs 20-22 ms on my PC):
let typ=prompt("Type: 0/1/2 (Float64Array/Float32Array/Uint32Array)");
let len=parseInt(prompt("Length"));
let basearray;
switch(typ){
case "1":basearray=new Float32Array(len);break;
case "2":basearray=new Uint32Array(len);break;
default:basearray=new Float64Array(len);
}
for(let i=0;i<basearray.length;i++)
basearray[i]=Math.random()*0x1000000;
let cpus=4,
chunksize=basearray.length/cpus,
chunksw=[],chunkswt=[];
for(let i=0;i<cpus;i++)
chunkswt[i]=(chunksw[i]=basearray.slice(i*chunksize,(i+1)*chunksize)).slice();
let start;
let code="onmessage=event=>postMessage(event.data.sort((a,b)=>a-b));";
let ws=[],cnt=0;
for(let i=0;i<cpus;i++){
ws[i]=new Worker("data:text/plain,"+escape(code));
let j=i;
ws[i].onmessage=event=>{
chunksw[j]=event.data;
if(++cnt===cpus){
console.log("Non-transfer:",Date.now()-start);
// launch transfer measurement
cnt=0;start=Date.now();
for(let i=0;i<cpus;i++)
wst[i].postMessage(chunkswt[i].buffer,[chunkswt[i].buffer]); }
};
}
let codet;
switch(typ){
case "1":
codet="onmessage=event=>{"+
"let arr=new Float32Array(event.data);"+
"arr.sort((a,b)=>a-b);"+
"postMessage(event.data,[event.data]);};";
break;
case "2":
codet="onmessage=event=>{"+
"let arr=new Uint32Array(event.data);"+
"arr.sort((a,b)=>a-b);"+
"postMessage(event.data,[event.data]);};";
break;
default:
codet="onmessage=event=>{"+
"let arr=new Float64Array(event.data);"+
"arr.sort((a,b)=>a-b);"+
"postMessage(event.data,[event.data]);};";
}
let wst=[];
for(let i=0;i<cpus;i++){
wst[i]=new Worker("data:text/plain,"+escape(codet));
let j=i;
wst[i].onmessage=event=>{
switch(typ){
case "1":chunkswt[j]=new Float32Array(event.data);break;
case "2":chunkswt[j]=new Uint32Array(event.data);break;
default:chunkswt[j]=new Float64Array(event.data);
}
if(++cnt===cpus){
console.log("Transfer:",Date.now()-start);
if(len<=20)
for(let i=0;i<cpus;i++)
console.log(chunksw[i],chunkswt[i]);
}
};
}
// launch non-transfer measurement
start=Date.now();
for(let i=0;i<cpus;i++)
ws[i].postMessage(chunksw[i]);
This code is a bit messy because it is the buffer which can be transferred, not the typed arrays themselves, and also, while the second measurement is initialized as a direct copy-paste (which already isn't that pretty), it is then launched from inside the completion function of the first one.
(I do not wish to provide exact measurement results because my PC is doing some other things too. Just run the snippets a couple times with varied or even repeated parameters)
I have a vanity URL pointing to a GitBook. GitBook doesn't support the insertion of arbitrary JavaScript snippets. At the moment GitBook has only 4 "integrations".
I could route through my own VM server to accomplish this, but I have CloudFlare and I want to try out workers. (Javascript running at the CDN edge).
The CloudFlare worker environment makes header injection very easy, but there is no obvious way to do this.
It's important to process with a TransformStream so that processing is async and doesn't require memory buffering (for scalability and to minimise GC) - there's only a 5ms CPU time budget.
Overview:
To use this for yourself, change the strings forHeadStart, forHeadEnd, and forBodyEnd.
This deferredInjection approach is the recommended way to minimise CPU time for the worker. It's more efficient because it only needs to parse the very start of the HTML. The other approach requires parsing the whole head section for headInjection, and if you use bodyInjection it practically needs to parse the whole HTML response.
The deferredInjection approach works by injecting the content into the start of the head tag; then, on the client side at runtime, your HTML content is deployed to the desired places.
You can inject directly if needed using headInjection and/or bodyInjection: uncomment the related code, including the code in injectScripts, and set the strings for tagBytes that will be encoded.
This solution will only parse HTML content types
This solution works directly on bytes (not strings) for better efficiency, searching for the bytes of the end-tag strings.
You could potentially target more end-tags, but usually you don't need to target more than these two
Processes data with streaming (the whole HTML string is not cached in memory). This lowers peak memory usage and speeds up time to first byte.
Handles a rare edge case where the closing tag is on a text read boundary. I believe a boundary might occur every ~1000 bytes (TCP packets 1000-1500 bytes each), and this can vary due to gzip compression.
Keeps the injection-parsing code separate from the code that simply forwards the rest, for clarity.
You can disable the second body-tag injector by commenting it out if you don't need it - that will speed up processing.
I have tested this exact code myself and it works. There might be remaining bugs (depending on the location of the closing tag, and depending on whether your server replies with partial HTML templates (body only)). I may have fixed one today (2019-06-28).
Code
addEventListener('fetch', event => {
event.passThroughOnException();
event.respondWith(handleRequest(event.request))
})
/**
* Fetch and log a request
* #param {Request} request
*/
async function handleRequest(request) {
const response = await fetch(request);
var ctype = response.headers.get('content-type');
if (!ctype || ctype.startsWith('text/html') === false)
return response; //Only parse html body
let { readable, writable } = new TransformStream();
let promise = injectScripts(response.body, writable);
return new Response(readable, response);
}
let encoder = new TextEncoder('utf-8');
let deferredInjection = function() {
let forHeadStart = `<script>var test = 1; //Start of head section</script>`;
let forHeadEnd = `<script>var test = 2; //End of head section</script>`;
let forBodyEnd = `<script>var test = 3; //End of body section</script><button>click</button>`;
let helper = `
${forHeadStart}
<script>
function appendHtmlTo(element, htmlContent) {
var temp = document.createElement('div');
temp.innerHTML = htmlContent;
while (temp.firstChild) {
element.appendChild(temp.firstChild);
};
}
let forHeadEnd = "${ btoa(forHeadEnd) }";
let forBodyEnd = "${ btoa(forBodyEnd) }";
if (forHeadEnd.length > 0) appendHtmlTo(document.head, atob(forHeadEnd));
if (forBodyEnd.length > 0) window.onload = function() {
appendHtmlTo(document.body, atob(forBodyEnd));
};
</script>
`;
return {
forInjection: encoder.encode(helper),
tagBytes: encoder.encode("<head>"),
insertAfterTag: true
};
}();
// let headInjection = {
// forInjection: encoder.encode("<script>var test = 1;</script>"),
// tagBytes: encoder.encode("</head>"), //case sensitive
// insertAfterTag: false
// };
// let bodyInjection = {
// forInjection: encoder.encode("<script>var test = 1;</script>"),
// tagBytes: encoder.encode("</body>"), //case sensitive
// insertAfterTag: false
// }
//console.log(bodyTagBytes);
encoder = null;
async function injectScripts(readable, writable) {
let processingState = {
readStream: readable,
writeStream: writable,
reader: readable.getReader(),
writer: writable.getWriter(),
leftOvers: null, //data left over after a closing tag is found
inputDone: false,
result: {charactersFound: 0, foundIndex: -1, afterHeadTag: -1} //Reused object for the duration of the request
};
await parseForInjection(processingState, deferredInjection);
//await parseForInjection(processingState, headInjection);
//await parseForInjection(processingState, bodyInjection);
await forwardTheRest(processingState);
}
///Return object will have foundIndex: -1, if there is no match, and no partial match at the end of the array
///If there is an exact match, return object will have charactersFound:(tagBytes.Length)
///If there is a partial match at the end of the array, return object charactersFound will be < (tagBytes.Length)
///The result object needs to be passed in to reduce Garbage Collection - we can reuse the object
function searchByteArrayChunkForClosingTag(chunk, tagBytes, result)
{
//console.log('search');
let searchStart = 0;
//console.log(tagBytes.length);
//console.log(chunk.length);
for (;;) {
result.charactersFound = 0;
result.foundIndex = -1;
result.afterHeadTag = -1;
//console.log(result);
let sweepIndex = chunk.indexOf(tagBytes[0], searchStart);
if (sweepIndex === -1)
return; //Definitely not found
result.foundIndex = sweepIndex;
sweepIndex++;
searchStart = sweepIndex; //where we start searching from next
result.charactersFound++;
result.afterHeadTag = sweepIndex;
//console.log(result);
for (let i = 1; i < tagBytes.length; i++)
{
if (sweepIndex === chunk.length) return; //Partial match
if (chunk[sweepIndex++] !== tagBytes[i]) { result.charactersFound = 0; result.afterHeadTag = -1; break; } //Failed to match (even partially to boundary)
result.charactersFound++;
result.afterHeadTag = sweepIndex; //Because we work around the actual found tag in case it's across a boundary
}
if (result.charactersFound === tagBytes.length)
return; //Found
}
}
function continueSearchByteArrayChunkForClosingTag(chunk, tagBytes, lastSplitResult, result)
{
//console.log('continue');
//Finish the search (no need to check the last buffer at all)
//console.log('finish the search');
result.charactersFound = lastSplitResult.charactersFound; //We'll be building on the progress from the lastSplitResult
result.foundIndex = (-1 * result.charactersFound); //This won't be used, but a negative value is indicative of chunk spanning
let sweepIndex = 0;
result.afterHeadTag = 0;
for (let i = lastSplitResult.charactersFound; i < tagBytes.length; i++) //Zero-based
{
if (sweepIndex === chunk.length) return result; //So we support working on a chunk that's smaller than the tagBytes search size
if (chunk[sweepIndex++] !== tagBytes[i]) { result.charactersFound = 0; result.afterHeadTag = -1; break; }
result.charactersFound++;
result.afterHeadTag = sweepIndex;
}
}
function continueOrNewSearch(chunk, tagBytes, lastSplitResult, result)
{
//console.log('continueOrNewSearch');
if (lastSplitResult == null)
searchByteArrayChunkForClosingTag(chunk, tagBytes, result);
else
{
continueSearchByteArrayChunkForClosingTag(chunk, tagBytes, lastSplitResult, result);
if (result.charactersFound === tagBytes.length)
return result;
else
return searchByteArrayChunkForClosingTag(chunk, tagBytes, result); //Keep searching onward
}
}
async function parseForInjection(processingState, injectionJob)
{
if (processingState.inputDone) return; //Very edge case: Somehow </head> is never found?
if (!injectionJob) return;
if (!injectionJob.tagBytes) return;
if (!injectionJob.forInjection) return;
let reader = processingState.reader;
let writer = processingState.writer;
let result = processingState.result;
let tagBytes = injectionJob.tagBytes;
//(reader, writer, tagBytes, forInjection)
let lastSplitResult = null;
let chunk = null;
processingState.inputDone = false;
for (;;) {
if (processingState.leftOvers)
{
chunk = processingState.leftOvers;
processingState.leftOvers = null;
}
else
{
let readerResult = await reader.read();
chunk = readerResult.value;
processingState.inputDone = readerResult.done;
}
if (processingState.inputDone) {
if (lastSplitResult !== null) {
//Very edge case: Somehow tagBytes is never found?
console.log('edge');
throw 'tag not found'; //Causing the system to fall back to the direct request
}
await writer.close();
return true;
}
//console.log(value.length);
continueOrNewSearch(chunk, tagBytes, lastSplitResult, result)
//console.log(result);
if (result.charactersFound === tagBytes.length) //Complete match
{
//Inject
//console.log('inject');
if (result.foundIndex > 0)
{
let partValue = chunk.slice(0, result.foundIndex);
//console.log(partValue);
await writer.write(partValue);
}
console.log('injected');
if (injectionJob.insertAfterTag)
{
// insert the content just after the tag (e.g. just inside <head>)
await writer.write(injectionJob.tagBytes);
await writer.write(injectionJob.forInjection);
}
else
{
// insert the content just before the tag (e.g. just before </head> or </body>)
await writer.write(injectionJob.forInjection);
await writer.write(injectionJob.tagBytes);
}
let remainder = chunk.slice(result.afterHeadTag); // keep everything after the tag (slice end is exclusive)
processingState.leftOvers = remainder;
lastSplitResult = null;
return;
}
if (lastSplitResult !== null)
{
//console.log('no match over boundary');
//The remainder wasn't found, so write the partial match from before (maybe `<` or `</`)
let failedLastBit = injectionJob.tagBytes.slice(0, lastSplitResult.charactersFound);
await writer.write(failedLastBit);
lastSplitResult = null;
}
if (result.charactersFound === 0)
{
//console.log('not found')
await writer.write(chunk);
continue;
}
if (result.charactersFound < tagBytes.length)
{
//console.log('boundary: ' + result.charactersFound);
lastSplitResult = result;
let partValue = chunk.slice(0, result.foundIndex);
//console.log(partValue);
await writer.write(partValue);
continue;
}
}
}
async function forwardTheRest(processingState)
{
try
{
if (processingState.inputDone) return; //Very edge case: Somehow </head> is never found?
if (processingState.leftOvers)
{
const chunk = processingState.leftOvers;
await processingState.writer.write(chunk);
}
processingState.reader.releaseLock();
processingState.writer.releaseLock();
await processingState.readStream.pipeTo(processingState.writeStream);
//Should there be an explicit close method called? I couldn't find one
}
catch (e)
{
console.log(e);
}
}
Further explanation of working directly with (utf-8) bytes:
Only working with byte values. This is possible at least by searching for the first distinctive UTF-8 byte of a character (< 128 or >= 192). But in this case, we're searching for </head>, which is made up of lower-than-128 bytes, so it is very easy to work with.
Given the nature of searching for utf-8 (which is the trickiest), this should work with ['utf-8', 'utf8', 'iso-8859-1', 'us-ascii']. You will need to change the snippet encoder to match.
This isn't thoroughly tested. The boundary case didn't trigger for me. Ideally, we would have a testing rig for the core functions.
Thanks to Kenton Varda for challenging me.
Please let me know if there's a CloudFlare workers way to do pipeTo in the forwardTheRest function
You might find continueOrNewSearch and the two sub-functions to be an interesting approach to finding multi-bytes across a chunk boundary. Up until the boundary we just count how many bytes are found. There's no need to keep those bytes (we know what they are). Then on the next chunk we continue where we left off. We always cut the array buffer around the header, and make sure we write the header bytes (using the tagBytes)
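For instance, here is a tiny, untested illustration of that boundary handling, calling continueOrNewSearch from the code above with made-up chunk contents:
const tagBytes = new TextEncoder().encode('</head>');
const result = { charactersFound: 0, foundIndex: -1, afterHeadTag: -1 };
// The first chunk ends in the middle of the closing tag.
const chunkA = new TextEncoder().encode('<title>Hi</title></he');
continueOrNewSearch(chunkA, tagBytes, null, result);
// result.charactersFound is now 4 ("</he"), a partial match at the boundary.
// The next chunk starts with the rest of the tag, so the search resumes
// at tagBytes[4] instead of starting over.
const chunkB = new TextEncoder().encode('ad><body></body>');
continueOrNewSearch(chunkB, tagBytes, { charactersFound: 4 }, result);
// result.charactersFound === tagBytes.length, so the tag has been found.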
I want to get the raw download size of a page request using Puppeteer. That is, the total amount of data downloaded across all resources (including video/media), similar to that returned by Chrome DevTools' Network tab.
There doesn't seem to be any way to do this as of January 2018 that works with all resource types (listening for the response event fails for videos), and that correctly counts compressed resources.
The best workaround seems to be to listen for the Network.dataReceived event, and process the event manually:
const resources = {};
page._client.on('Network.dataReceived', (event) => {
const request = page._networkManager._requestIdToRequest.get(
event.requestId
);
if (request && request.url().startsWith('data:')) {
return;
}
const url = request.url();
// encodedDataLength is supposed to be the amount of data received
// over the wire, but it's often 0, so just use dataLength for consistency.
// https://chromedevtools.github.io/devtools-protocol/tot/Network/#event-dataReceived
// const length = event.encodedDataLength > 0 ?
// event.encodedDataLength : event.dataLength;
const length = event.dataLength;
if (url in resources) {
resources[url] += length;
} else {
resources[url] = length;
}
});
// page.goto(...), etc.
// totalCompressedBytes is unavailable; see comment above
const totalUncompressedBytes = Object.values(resources).reduce((a, n) => a + n, 0);
The solution of @mjs works perfectly, even in 2021. You just need to replace page._networkManager with page._frameManager._networkManager.
Full example that works for me:
const resources = {};
page._client.on('Network.dataReceived', (event) => {
const request = page._frameManager._networkManager._requestIdToRequest.get(
event.requestId
);
if (request && request.url().startsWith('data:')) {
return;
}
const url = request.url();
const length = event.dataLength;
if (url in resources) {
resources[url] += length;
} else {
resources[url] = length;
}
});
await page.goto('https://stackoverflow.com/questions/48263345/how-can-i-get-the-raw-download-size-of-a-request-using-puppeteer');
const totalUncompressedBytes = Object.values(resources).reduce((a, n) => a + n, 0);
console.log(totalUncompressedBytes);
If you are using Puppeteer, you have server-side Node... Why not pipe the request through a stream (or streams) and then calculate the content size?
Also there is https://github.com/watson/request-stats
Also, you may want to call page.waitForNavigation, as you may be wrestling with async timing issues.
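For what it's worth, a minimal sketch of the response-event approach (inside an async function, after the page has been created) could look like this, with the caveat the question already mentions: it doesn't work for every resource type (e.g. videos):
let totalBytes = 0;
page.on('response', async (response) => {
    try {
        const buffer = await response.buffer(); // uncompressed body bytes
        totalBytes += buffer.length;
    } catch (e) {
        // some responses (e.g. redirects or aborted media requests) have no body buffer
    }
});
await page.goto('https://example.com');
console.log(totalBytes);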
const images_width = await page.$$eval('img', anchors => [].map.call(anchors, img => img.width));
const images_height = await page.$$eval('img', anchors => [].map.call(anchors, img => img.height));