how to open a local PDF in PDFJS using file input? - javascript

I would like to know if there is a way to select a pdf file using input type="file" and open it using PDFJS

You should be able to use a FileReader to get the contents of a file object as a typed array, which pdfjs accepts (https://mozilla.github.io/pdf.js/examples/)
// Step 1: Get the file from the input element.
inputElement.onchange = function (event) {
  const file = event.target.files[0];
  // The change event can fire with an empty selection; readAsArrayBuffer()
  // would throw if it were handed `undefined`.
  if (!file) {
    return;
  }

  // Step 2: Read the file using a FileReader.
  const fileReader = new FileReader();
  fileReader.onload = function () {
    // Step 4: Turn the ArrayBuffer into a typed array.
    const typedarray = new Uint8Array(this.result);
    // Step 5: pdf.js accepts the typed array directly.
    const loadingTask = pdfjsLib.getDocument(typedarray);
    loadingTask.promise
      .then((pdf) => {
        // The document is loaded here...
      })
      .catch((error) => {
        // Report load failures instead of leaving an unhandled rejection.
        console.error("Failed to load the PDF:", error);
      });
  };
  fileReader.onerror = function () {
    console.error("Failed to read the file:", fileReader.error);
  };

  // Step 3: Read the file as an ArrayBuffer; onload runs once the data is ready.
  fileReader.readAsArrayBuffer(file);
};
Edit: The pdfjs API changed at some point since I wrote this first answer in 2015. The code above has been updated to reflect the new API as of 2021 (thanks to @Chiel for the updated answer).

If getDocument().then is not a function:
I reckon I have managed to solve the new problem with the new API. As mentioned in this GitHub issue, the getDocument function now returns a loading task, and the promise is exposed as a property of that task.
In short, this:
// Legacy form: .then() is called directly on the value returned by getDocument().
PDFJS.getDocument(typedarray).then(function(pdf) {
// The document is loaded here...
});
became this:
// Current form: getDocument() returns a loading task; wait on its `promise` property.
const loadingTask = pdfjsLib.getDocument(typedarray);
loadingTask.promise.then(pdf => {
// The document is loaded here...
});
Adapting the older answer to the new API, to comply with the bounty, gives the following result:
// Step 1: react to the user picking a file in the input element.
inputElement.onchange = function (changeEvent) {
  // Use the File object itself, not its path (a file path won't work because of security issues).
  const pickedFile = changeEvent.target.files[0];
  const reader = new FileReader();

  // Step 2: once the bytes are available, hand them to pdf.js.
  reader.onload = function () {
    // `this` is the reader; its result is the ArrayBuffer requested below.
    const pdfBytes = new Uint8Array(this.result);
    // New API: the promise lives on the loading task returned by getDocument().
    pdfjsLib.getDocument(pdfBytes).promise.then((pdf) => {
      // The document is loaded here...
    });
  };

  // Step 3: read the file as an ArrayBuffer.
  reader.readAsArrayBuffer(pickedFile);
};
I have created an example below, using the official pdf.js release builds, to show that it works.
/* Official release of the pdf.js worker (same 2.5.207 release as the pdf.js script the page loads). */
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.5.207/pdf.worker.js';

// Render the first page of the chosen PDF into the #pdfCanvas element.
document.getElementById('file').onchange = function (event) {
  const file = event.target.files[0];
  // Nothing to render when the selection is empty; readAsArrayBuffer()
  // would throw if it were handed `undefined`.
  if (!file) {
    return;
  }

  const fileReader = new FileReader();
  fileReader.onload = function () {
    const typedarray = new Uint8Array(this.result);
    console.log(typedarray);

    const loadingTask = pdfjsLib.getDocument(typedarray);
    // One flat chain so a failure in loading, getPage() or render() reaches
    // the single catch at the end.
    loadingTask.promise
      .then((pdf) => {
        // The document is loaded here...
        // What follows is just for demonstration purposes, showing that it
        // works with the modern API.
        return pdf.getPage(1);
      })
      .then((page) => {
        console.log('Page loaded');
        const scale = 1.5;
        const viewport = page.getViewport({ scale: scale });

        // Size the canvas to the page before drawing.
        const canvas = document.getElementById('pdfCanvas');
        const context = canvas.getContext('2d');
        canvas.height = viewport.height;
        canvas.width = viewport.width;

        // Render PDF page into canvas context
        const renderContext = {
          canvasContext: context,
          viewport: viewport,
        };
        return page.render(renderContext).promise;
      })
      .then(() => {
        console.log('Page rendered');
      })
      .catch((error) => {
        console.error('Could not render the PDF:', error);
      });
    // end of example code
  };
  fileReader.onerror = function () {
    console.error('Could not read the file:', fileReader.error);
  };
  fileReader.readAsArrayBuffer(file);
};
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>pdf.js file input example</title>
    <!-- The official release -->
    <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.5.207/pdf.js"></script>
  </head>
  <body>
    <!-- `accept` narrows the file picker to PDF documents -->
    <input type="file" id="file" accept="application/pdf">
    <h2>Rendered pdf:</h2>
    <!-- Target canvas; the script resizes it to the rendered page -->
    <canvas id="pdfCanvas" width="300" height="300"></canvas>
  </body>
</html>
Hope this helps! If not, please comment.
Note:
This might not work in jsFiddle.

I adopted your code and it worked! Then I was browsing for more tips here and there, then I learned there is an even more convenient method.
You can get the URL of client-loaded file with
URL.createObjectURL()
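It reduces nesting by one level and you don't need to read the file, convert it to an array, etc. For example, you can pass the result straight to pdf.js: pdfjsLib.getDocument(URL.createObjectURL(file)).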

Related

Using pdf.js to render a PDF but it doesn't work and I don't get any error messages to help me debug the issue

I'm trying to build a Flask app where I upload pdf's and I'm working on previewing them before submitting to the back-end.
The script I'm using is as follows:
/**
 * Wires a file input to a PDF preview.
 *
 * Calling imageUploadValidation(selector) looks up the input matching
 * `selector`, listens for its "change" event, reads the chosen file as an
 * ArrayBuffer and passes it to pdf.js.
 *
 * @param {string} selector - CSS selector of the file input.
 * @returns {{init: function(string): void}} the public API object.
 * @throws {Error} if the selector is not a non-empty string or matches nothing.
 */
const imageUploadValidation = (function () {
  "use strict";

  // NOTE(review): this URL is the main pdf.js build, while `workerSrc` names
  // the worker script. Possibly related to the loading promise never settling
  // (the problem described in the question) - confirm. Left unchanged on purpose.
  pdfjsLib.GlobalWorkerOptions.workerSrc =
    "https://mozilla.github.io/pdf.js/build/pdf.js";

  const onFilePicked = function (event) {
    // FileList from the input; only the first file is used.
    const files = event.target.files;
    // The change event can carry an empty selection; bail out instead of
    // throwing on files[0].name.
    if (!files || files.length === 0) {
      return;
    }
    const file = files[0];
    const filename = file.name;
    // Reject names without an extension (no dot, or a leading dot only).
    if (filename.lastIndexOf(".") <= 0) {
      alert("Please add a valid file!");
      return;
    }

    const fileReader = new FileReader();
    fileReader.onload = function (e) {
      const pdfData = e.target.result;
      const loadingTask = pdfjsLib.getDocument({ data: pdfData });
      loadingTask.promise
        .then(function (pdf) {
          console.log("PDF loaded", pdf);
          // Return the page promise so a getPage() failure reaches the catch below.
          return pdf.getPage(1).then((page) => {
            console.log("page loaded", page);
            const iframe = document.getElementById("image-preview");
            // NOTE(review): `page` is the object resolved by getPage(), not a
            // URL string - confirm this assignment can display anything.
            iframe.src = page;
            // Original canvas-based rendering, kept for reference:
            // var scale = 1.5;
            // var viewport = page.getViewport({ scale: scale });
            // var context = canvas.getContext("2d");
            // canvas.height = viewport.height;
            // canvas.width = viewport.width;
            // var renderContext = {
            // canvasContext: context,
            // viewport: viewport,
            // };
            // var renderTask = page.render(renderContext);
            // renderTask.promise.then(function () {
            // console.log("Page rendered");
            // });
          });
        })
        .catch((error) => {
          console.log(error);
        });
    };
    fileReader.onerror = function () {
      console.log(fileReader.error);
    };

    // readAsArrayBuffer() returns nothing; the data arrives through onload.
    console.log("reading file as ArrayBuffer", filename);
    fileReader.readAsArrayBuffer(file);
  };

  const Constructor = function (selector) {
    const publicAPI = {};
    const changeHandler = (e) => {
      onFilePicked(e);
    };

    publicAPI.init = function (selector) {
      // Validate the selector before it is used for the DOM query.
      if (!selector || typeof selector !== "string") {
        throw new Error("Please provide a valid selector");
      }
      const fileInput = document.querySelector(selector);
      if (!fileInput) {
        throw new Error(`No element matches selector: ${selector}`);
      }
      fileInput.addEventListener("change", changeHandler);
    };

    publicAPI.init(selector);
    return publicAPI;
  };

  return Constructor;
})();
imageUploadValidation("form input[type=file]");
The loading task promise never seems to run. Everything seems to work up until that point. I'm not familiar with this Promise syntax, so I can't be sure if the problem is there or how I'm passing in the pdf file.
P.S. The commented-out code is the original way I had this set up; what's uncommented was just me testing a different way.
Check Datatype
First you might want to check what you're getting back from your FileReader, specifically the datatype of pdfData. If you have a look at the documentation (direct link), getDocument expects a Uint8Array or a binary string.
Add Missing Parameters
The next problem is that you're missing required parameters in your call to getDocument. Here are the minimum required arguments:
// Options object for the getDocument() call discussed in the text.
const args = {
  // Location of the PDF to load.
  url: "https://example.com/the-pdf-to-load.pdf",
  // Relative or absolute path to the cmap folder.
  cMapUrl: "./cmaps/",
  cMapPacked: true,
};
I have never used the data argument in place of the url but as long as you supply the correct datatype you should be fine. Notice that cMapUrl should be a relative or absolute path to the cmap folder. PDFJS often needs these files to actually interpret a PDF file. Here are all the files from the demo repository (GitHub pages): cmaps You'll need to add these to your project.
Instead of using data, I would recommend uploading your files as blobs; then all you have to do is supply the blob URL as url. I am not familiar with how to do that, I just know it's possible in modern browsers.
Where Is Your Viewer / You Don't Need iFrame or Canvas
PDFJS just needs a div to place the PDF inside of. It's picky about some of the CSS rules; for example, the div MUST be positioned absolute, otherwise PDFJS generates the pages with 0px height.
I don't see PDFViewer or PDFLinkService in your code. It looks like you are trying to build the entire viewer from scratch yourself. This is no small endeavor. When you get loadingTask working correctly the response should be handled something like this:
loadingTask.promise.then(
  // Success function: hand the loaded document to the viewer components.
  function (doc) {
    // viewer is holding: new pdfjsViewer.PDFViewer()
    // linkService is: new pdfjsViewer.PDFLinkService()
    viewer.setDocument(doc);
    linkService.setDocument(doc);
  },
  // Error function: the document could not be loaded.
  function (exception) {
    // What type of error occurred? (Strict comparison: no type coercion.)
    if (exception.name === 'PasswordException') {
      // Password missing, prompt the user and try again.
      elem.appendChild(getPdfPasswordBox());
    } else {
      // Some other error, stop trying to load this PDF.
      console.error(exception);
    }
    /**
     * Additional exceptions can be reverse engineered from here:
     * https://github.com/mozilla/pdf.js/blob/master/examples/mobile-viewer/viewer.js
     */
  }
);
Notice that PDFViewer does all the hard work for you. PDFLinkService is needed if you want links in the PDF to work. You really should checkout the live demo and the example files.
Its a lot of work but these example files specifically can teach you all you need to know about PDFJS.
Example / Sample Code
Here is some sample code from a project I did with PDFJS. The code is a bit advanced but it should help you reverse engineer how PDFJS is working under the hood a bit better.
pdfObj = An object to store all the info and objects for this PDF file. I load multiple PDFs on a single page so I need this to keep them separate from each other.
updatePageInfo = My custom function that is called by PDFJS's eventBus when the user changes pages in the PDF; this happens as they scroll from page to page.
pdfjsViewer.DownloadManager = I allow users to download the PDFs so I need to use this.
pdfjsViewer.EventBus = Handles events like loading, page changing, and so on for the PDF. I am not 100% certain but I think the PDFViewer requires this.
pdfjsViewer.PDFViewer = What handles actually showing your PDF to users. container is the element on the page to render in, remember it must be positioned absolute.
// Create a new PDF object for this PDF.
var pdfObj = {
'container': elem.querySelector('.pdf-view-wrapper'),
'document': null,
'download': new pdfjsViewer.DownloadManager(),
'eventBus': new pdfjsViewer.EventBus(),
'history': null,
'id': id,
'linkService': null,
'loaded': 0,
'loader': null,
'pageTotal': 0,
'src': elem.dataset.pdf,
'timeoutCount': 0,
'viewer': null
};
// Update the eventBus to dispatch page change events to our own function.
pdfObj.eventBus.on( 'pagechanging', function pagechange(evt) {
updatePageInfo( evt );
} );
// Create and attach the PDFLinkService that handles links and navigation in the viewer.
var linkService = new pdfjsViewer.PDFLinkService( {
'eventBus': pdfObj.eventBus,
'externalLinkEnabled': true,
'externalLinkRel': 'noopener noreferrer nofollow',
'externalLinkTarget': 2 // Blank
} );
pdfObj.linkService = linkService;
// Create the actual PDFViewer that shows the PDF to the user.
var pdfViewer = new pdfjsViewer.PDFViewer(
{
'container': pdfObj.container,
'enableScripting': false, // Block embeded scripts for security
'enableWebGL': true,
'eventBus': pdfObj.eventBus,
'linkService': pdfObj.linkService,
'renderInteractiveForms': true, // Allow form fields to be editable
'textLayerMode': 2
}
);
pdfObj.viewer = pdfViewer;
pdfObj.linkService.setViewer( pdfObj.viewer );

How to correctly set new Image in the following situation?

I'm trying to read the width and height of files that I get after clicking the Browser button:
// For every selected panorama, create an image and try to log its width.
for (let i = 0; i < this.uploadingPanoramas.length; i++) {
const img = new Image() // eslint-disable-line
console.log('file', this.uploadingPanoramas[i].file)
// NOTE(review): `file` is described in the question as a File object, not a URL;
// the logged element shows src="[object File]".
img.src = this.uploadingPanoramas[i].file
console.log('img', img)
// NOTE(review): this calls img.onload as a function rather than assigning a
// handler to it (img.onload = () => ...) - confirm intent.
img.onload(() => {
console.log('width', img.width)
})
}
console.log('file') logs:
console.log('img') logs: <img src="[object File]">
So img.onload doesn't actually work because I'm not getting the image apparently.
What's the correct way of doing this?
EDIT:
this.uploadingPanoramas is an array of objects that contain the files:
// Illustrative shape of this.uploadingPanoramas: each entry holds a File under
// `file` and a numeric `progress` (0 here).
[{
file: File,
progress: 0
}, {
file: File,
progress: 0
}]
As this.uploadingPanoramas[i].file is File type you can use FileReader to generate data URL using readAsDataURL() method.
The FileReader object lets web applications asynchronously read the contents of files (or raw data buffers) stored on the user's computer, using File or Blob objects to specify the file or data to read.
// Read the File as a data URL; the image source is assigned once the read completes.
const reader = new FileReader();
reader.onload = (loadEvent) => {
  img.src = loadEvent.target.result;
};
reader.readAsDataURL(this.uploadingPanoramas[i].file);

Read samples from wav-file

I'm trying to make a webpage in html5 which stores sample-data from a wav-file in an array. Is there any way to get the sample-data with javascript?
I'm using a file-input to select the wav-file.
In the javascript I already added:
// Run readFile whenever the #fileinput element reports a change.
// NOTE(review): the handler shown later in the question is named openFile, not readFile - confirm which one is registered.
document.getElementById('fileinput').addEventListener('change', readFile, false);
but I have no idea what to do in readFile.
EDIT:
I tried to get the file in an ArrayBuffer, pass it to the decodeAudioData method and get a typedArraybuffer out of it.
This is my code:
// Change handler: reads the picked file into an ArrayBuffer and passes it to
// the audio context's decodeAudioData(), with decodedDone as the callback.
var openFile = function (event) {
  const fileInput = event.target;
  const audioContext = new AudioContext();
  const fileReader = new FileReader();

  fileReader.onload = () => {
    const rawBytes = fileReader.result;
    console.log("arrayBuffer:");
    console.log(rawBytes);
    audioContext.decodeAudioData(rawBytes, decodedDone);
  };

  fileReader.readAsArrayBuffer(fileInput.files[0]);
};
// Callback passed to decodeAudioData(); logs the decoded value and the first ten typed-array entries.
function decodedDone(decoded) {
// NOTE(review): `decoded` is whatever decodeAudioData() hands back (presumably an
// audio buffer object rather than an ArrayBuffer), so this constructor likely does
// not view the sample data - the question reports every element reading 0.
// The later version of this function uses decoded.getChannelData(0) instead.
var typedArray = new Uint32Array(decoded, 1, decoded.length);
console.log("decoded");
console.log(decoded);
console.log("typedArray");
console.log(typedArray);
// NOTE(review): `i` is never declared, so this loop relies on an implicit global
// (a ReferenceError in strict mode).
for (i=0; i<10; i++)
{
console.log(typedArray[i]);
}
}
The elements of typedArray are all 0. Is my way of creating the typedArray wrong, or did I do something else wrong?
EDIT:
I finally got it. This is my code:
/**
 * Change handler for the file input: reads the selected file into an
 * ArrayBuffer and passes it to decodeAudioData(), with decodedDone as the
 * success callback.
 *
 * @param {Event} event - change event whose target is the file input.
 */
var openFile = function (event) {
  const file = event.target.files[0];
  // The change event can fire with an empty selection; do not create an
  // audio context or start a read in that case.
  if (!file) {
    return;
  }

  const audioContext = new AudioContext();
  const reader = new FileReader();
  reader.onload = function () {
    const arrayBuffer = reader.result;
    console.log("arrayBuffer:");
    console.log(arrayBuffer);
    // Third argument: error callback, so a file that cannot be decoded is
    // reported instead of failing silently.
    audioContext.decodeAudioData(arrayBuffer, decodedDone, function (error) {
      console.error("Could not decode the audio data:", error);
    });
  };
  reader.onerror = function () {
    console.error("Could not read the file:", reader.error);
  };
  reader.readAsArrayBuffer(file);
};
/**
 * Success callback for decodeAudioData(): logs the samples of channel 0.
 *
 * @param {object} decoded - decoded audio exposing getChannelData(channel).
 */
function decodedDone(decoded) {
  // getChannelData() already supplies the sample array, so there is no need
  // to allocate a Float32Array first only to overwrite the reference.
  const typedArray = decoded.getChannelData(0);
  console.log("typedArray:");
  console.log(typedArray);
}
Thanks for the answers!
You'll need to learn a lot about Web APIs to accomplish that, but in the end it's quite simple.
Get the file contents in an ArrayBuffer with the File API
Pass it to the Web Audio API's decodeAudioData method.
Get a typed ArrayBuffer with the raw samples you wanted.
Edit: If you want to implement an equalizer, you're wasting your time, there's a native equalizer node in the Audio API. Depending on the length of your wave file it might be better not to load it all in memory and instead to just create a source that reads from it and connect that source to an equalizer node.
Here's a simple code example to get a Float32Array from a wav audio file in JavaScript:
// Download the file and fail loudly on an HTTP error status, so an error
// page is never handed to the decoder as if it were audio.
const response = await fetch("https://example.com/test.wav");
if (!response.ok) {
  throw new Error(`Could not fetch the audio file: HTTP ${response.status}`);
}
const audioData = await response.arrayBuffer();
const audioCtx = new AudioContext({ sampleRate: 44100 });
const decodedData = await audioCtx.decodeAudioData(audioData); // audio is resampled to the AudioContext's sampling rate
console.log(decodedData.length, decodedData.duration, decodedData.sampleRate, decodedData.numberOfChannels);
const float32Data = decodedData.getChannelData(0); // Float32Array for channel 0

Clarification required on Javascript in HTML5 fileAPI for image preview

I was looking at the new HTML5 file API for showing a preview of an image to be uploaded. I googled for some code and almost every example had the same structure, almost the same code. I don't mind copying, particularly when it works, but I need to understand it. So I tried to understand the code but I am stuck with one area and need someone to explain that small part:
The code refers to a HTML form input field and when the file is selected shows the preview image in a img tag. Nothing fancy. Simple. Here it is after removing all the noise:
// Preview the chosen image in the #img_id element as soon as a file is picked.
$('input[type=file]').change(function (changeEvent) {
  const $input = $(this);
  const pickedFile = changeEvent.target.files[0];
  const fileReader = new FileReader();
  //Part I could not understand starts here
  // An immediately invoked function expression receives the file and returns
  // the actual load handler.
  fileReader.onload = (function bindFile(theFile) {
    return function onLoaded(loadEvent) {
      const imageDataUrl = loadEvent.target.result;
      $('#img_id').attr('src', imageDataUrl);
    };
  }(pickedFile));
  fileReader.readAsDataURL(pickedFile);
  //Upto here
});
I think that reader.onload needs to be assigned a plain event handler, so I replaced the entire section marked above to:
// The read is started first; the plain handler is assigned right after it.
reader.readAsDataURL(file);
reader.onload = (loadEvent) => {
  //#img_id is the id of an img tag
  $('#img_id').attr('src', loadEvent.target.result);
};
And it worked as I expected it to work.
QUESTION: What's the original code doing that the above simplified code is missing? I understand that it is a function expression returning a function and then calling it… but for what? There is just too much of the original code copied around under tutorials and what not on it but no good explanation. Please explain. Thanks
Sure.
The function-wrapped function, here serves the specific purpose of remembering which file it was you were looking at.
This might be less of a problem using your exact codebase, but if you had a multiple-upload widget, and you wanted to display a row of previews:
// Show a row of previews: one <img> per selected file.
// `var` is kept on purpose: `file` and `reader` are shared by every iteration,
// which is the situation the closure discussion that follows is about.
var my_files = [].slice.call(file_input.files),
    file, reader,
    i = 0, l = my_files.length;

for (; i < l; i += 1) {
  file = my_files[i];
  reader = new FileReader();
  // Attach the handler before starting the read (the operation is async), so
  // the load event cannot be missed.
  reader.onload = function (e) {
    // e.target.result holds the outcome of the readAsDataURL() call below.
    var blob_url = e.target.result,
        img = new Image();
    img.src = blob_url;
    document.body.appendChild(img);
  };
  // The FileReader method is spelled readAsDataURL; "readAsDataUrl" does not
  // exist and fails with "is not a function".
  reader.readAsDataURL(file);
}
That should all work fine. ...except...
And it's clean and readable.
The issue that was being resolved is simply this:
We're dealing with async-handlers, which means that the value of file isn't necessarily the same when the callback fires, as it was before...
There are a number of ways to potentially solve that.
Pretty much all of them that don't generate random ids/sequence-numbers/time-based hashes to check against on return, rely on closure.
And why would I want to create a whole management-system, when I could wrap it in a function and be done?
// Handler factory: keeps `given_file` in a closure and returns the load
// handler that can still reach it when the read finishes.
var save_file_reference_and_return_new_handler = (given_file) => (loadEvent) => {
  const dataUrl = loadEvent.target.result;
  const fileName = given_file.name;
  //...
};
So if you had that at the top of the function (above the loop), you could say:
reader = new FileReader();
// The factory captures this iteration's file, so the handler keeps the right one.
reader.onload = save_file_reference_and_return_new_handler(file);
// The FileReader method is spelled readAsDataURL; "readAsDataUrl" does not exist.
reader.readAsDataURL(file);
And now it will work just fine.
Of course, JS people don't always feel compelled to write named functions, just to store one item in closure to remember later...
// Anonymous variant: an immediately invoked function captures the current file
// and yields the load handler, with no named factory needed.
reader.onload = ((current_file) => (loadEvent) => {
  const dataUrl = loadEvent.target.result;
  const fileName = current_file.name;
})(file);

Extract images from PDF file with JavaScript

I want to write JavaScript code to extract all image files from a PDF file, perhaps getting them as JPG or some other image format. There is already some JavaScript code for reading a PDF file, for example in the PDF viewer pdf-js.
window.addEventListener('change', function webViewerChange(evt) {
var files = evt.target.files;
if (!files || files.length === 0)
return;
// Read the local file into a Uint8Array.
var fileReader = new FileReader();
fileReader.onload = function webViewerChangeFileReaderOnload(evt) {
var buffer = evt.target.result;
var uint8Array = new Uint8Array(buffer);
PDFView.open(uint8Array, 0);
};
var file = files[0];
fileReader.readAsArrayBuffer(file);
PDFView.setTitleUsingUrl(file.name);
........
Can this code be used to extract images from a PDF file?
If you open a page with pdf.js, for example
// Load the document (<pdf file> is a placeholder for the PDF's URL) and keep
// page 1 on window.page so the following snippets can use it.
PDFJS.getDocument({url: <pdf file>}).then(function (doc) {
doc.getPage(1).then(function (page) {
window.page = page;
})
})
you can then use getOperatorList to search for paintJpegXObject objects and grab the resources.
// Collect, in window.objs, the first argument of every paintJpegXObject
// operation found in the page's operator list.
window.objs = [];
page.getOperatorList().then(function (ops) {
  for (let i = 0; i < ops.fnArray.length; i++) {
    // fnArray[i] is the operation; argsArray[i] holds its arguments.
    if (ops.fnArray[i] === PDFJS.OPS.paintJpegXObject) {
      window.objs.push(ops.argsArray[i][0]);
    }
  }
});
Now window.objs will have a list of the resources from that page that you need to fetch.
// The names were collected in window.objs; the callback has to return the
// looked-up object, otherwise map() produces an array of undefined.
console.log(window.objs.map(function (a) { return page.objs.get(a); }));
should print to the console a bunch of <img /> objects with data-uri src= attributes. These can be directly inserted into the page, or you can do more scripting to get at the raw data.
It only works for embedded JPEG objects, but it's a start!
Here is link to working example of getting images from pdf and adding alpha channel to Uint8ClampedArray to be able to display it. It displays images in canvas.
Example in codepen: https://codepen.io/allandiego/pen/RwVGbyj
Getting data url from canvas to be able to display it in img tag:
// Draw the image data onto an off-screen canvas and export it as a data URL.
const canvas = document.createElement('canvas');
canvas.width = imageWidth;
canvas.height = imageHeight;
const ctx = canvas.getContext('2d');
// getContext() can return null; check explicitly instead of asserting it away.
if (!ctx) {
  throw new Error('2D canvas context is not available');
}
ctx.putImageData(imageData, 0, 0);
const dataURL = canvas.toDataURL();

Categories