I paste all of my code because it might have a connection with the function I am asking for. I had help to make one of my functions run. Look at the parseJSON() function. Why I have to use 2 functions (parseJSON() and nested makeNav(navigation)), but not only one parseJSON(navigation) (and ofc to change the inner elements from makeNav to parseJSON). Can someone explain why it works only that way for me. Because I want to understand it, not just to do my exercise and go on.
var new_json;
$.get('navigation.json', function (json){
new_json = json;
parseJSON();
var reload_page;
var this_hash = window.location.hash;
if( this_hash.length == 0 ){
reload_page = "home";
}else{
reload_page = this_hash.replace('#', '');
};
loading(reload_page + '.html');
});
var cache = {};
function loading(url){
if( typeof(cache[url]) == 'undefined' ) {
console.log( 'cache A does not exists' );
container.load(url + ' .container>*', function(){
cache[url] = container.html();
});
}else {
console.log( 'cache A exists' );
container.html(cache[url]);
};
};
$('#navigation li a, #logo a').live('click',function(){
var url = $(this).attr('href');
window.location.hash = url.replace('.html', '');
loading(url);
return false;
});
function parseJSON() {
function makeNav(navigation) {
var nav_html = '';
console.log( navigation.length );
for (var i = 0; i < navigation.length; i++) {
var name = navigation[i]['name'];
var href = navigation[i]['href'];
var submenu = navigation[i]['navigation'];
nav_html += '<li>' + name + '<span class="ddArrow"></span>';
if( typeof(submenu) != 'undefined' ){
nav_html += '<ul>';
nav_html += makeNav(submenu);
nav_html += '</ul>';
}
nav_html += '</li>';
}
return nav_html;
}
$('#navigation ul').html(makeNav( new_json['navigation'] ));
}
Probable reason is that your parseJSON does extra things: $('#navigation ul').html(makeNav( new_json['navigation'])); and when you make recursive call to makeNav you don't need to set this html content. Reason for the nested definition of makeNav inside parseJSON could be that you don't want makeNav to be visible outside of the scope of parseJSON because you simply don't use it out of this scope and you don't want to pollute the "environment".
Anyway, I really don't think that's the best way to implement it...but that should be discussed at https://codereview.stackexchange.com/.
To use a single function, without the nested makeNav you can do something like:
var new_json;
$.get('navigation.json', function (json){
new_json = json;
parseJSON();
var reload_page;
var this_hash = window.location.hash;
if( this_hash.length == 0 ){
reload_page = "home";
}else{
reload_page = this_hash.replace('#', '');
};
loading(reload_page + '.html');
});
var cache = {};
function loading(url){
if( typeof(cache[url]) == 'undefined' ) {
console.log( 'cache A does not exists' );
container.load(url + ' .container>*', function(){
cache[url] = container.html();
});
}else {
console.log( 'cache A exists' );
container.html(cache[url]);
};
};
$('#navigation li a, #logo a').live('click',function(){
var url = $(this).attr('href');
window.location.hash = url.replace('.html', '');
loading(url);
return false;
});
function makeNav(navigation) {
var nav_html = '';
console.log( navigation.length );
for (var i = 0; i < navigation.length; i++) {
var name = navigation[i]['name'];
var href = navigation[i]['href'];
var submenu = navigation[i]['navigation'];
nav_html += '<li>' + name + '<span class="ddArrow"></span>';
if( typeof(submenu) != 'undefined' ){
nav_html += '<ul>';
nav_html += makeNav(submenu);
nav_html += '</ul>';
}
nav_html += '</li>';
}
return nav_html;
}
Related
i want to match two float number but unable check below:
https://jsfiddle.net/mcsab3aa/2/
js code:
$('#checkButton').click(function() {
var getusertarget = parseFloat(jQuery("#targetval").val());
var currentval = $("#demo").find( "h1" ).html();
currentval = parseFloat(currentval.replace('$',''));
console.log(currentval);
console.log(getusertarget);
var dividerval = (currentval/targetval); // it should be 1
console.log(dividerval);
if (dividerval==1) {
$('.coins_drags').hide();
//$("#demo").find( "h1" ).html('$' + sum.toFixed(2) + '<br />Great job');
console.log('great');
}
else {
//$("#demo").find( "h1" ).html('$' + sum.toFixed(2) + '<br />Try again');
//sum = 0;
console.log('try');
}
});
var dividerval = (currentval/targetval);
targetval is undefined. You might want to do:
var dividerval = (currentval/getusertarget);
Try like this
var dividerval = (currentval/getusertarget);
instead of
var dividerval = (currentval/targetval);
https://jsfiddle.net/mcsab3aa/4/
var h1 = $("#demo").find( "h1" ),
targetVal = f2num(h1.text()),
current = $("#targetval"),
msg = $('.msg');
msg.text('click after typing your guess');
$('#checkButton').click(function () {
if (current.val() == targetVal) {
msg.text('Great job');
h1.html('$' + targetVal).show();
} else {
h1.hide();
msg.text('try again!');
}
});
function f2num(n) {
return parseFloat((n+"").replace(/[^0-9\.]+/g, ''));
}
The problem was that you were trying to parse number with $ in it. And had handlers reversed.
How can I check if URL contains variable or not? I have my function like this
My 2nd problem is, let says the URL already pass the lang variable, something like this (http://index.php?id=23&lang=en) i want that when they run this function again it will replace the lang variable to the new on instead of add new lang variable like this (http://index.php?id=23&lang=en&lang=jp)
function menu_goto(newvalue)
{
var baseurl = window.location.href ;
var langwithpara = "&lang=" + newvalue;
var langwopara = "?lang=" + newvalue;
if (newvalue != 0) {
if(baseurl.match(/?/)){
alert ('123');
location.href = baseurl + langwithpara ;
}
else{
alert ('456');
location.href = baseurl + langwopara ;
}
}
}
My new coding (work)
function menu_goto(newvalue)
{
var baseurl = window.location.href ;
var url = baseurl + ( (baseurl.match(/\?/))? '&':'?' ) + 'lang=' + newvalue;
location.href = url ;
}
window.location is actually an object, it has a 'search' property that make it easier to parse the query string.
function getParam(param){
var qs = window.location.search.substring(1).split('&');
var qsp;
for (p in qs){
qsp = qs[p].split('=');
if (qsp[0] == param) return qsp[1];
}
return null;
}
to check for a specific parameter :
var value = getParam('name');
the problem is probably the missing escaping of "?" in your RegExp:
if (baseurl.match(/\?/)) { // ...
a bit shorter would be:
var url = baseurl + ( (baseurl.match(/\?/))? '&':'?' ) + 'lang=' + newvalue;
You would probably want to clean any param "lang" before, so it doesn't become an array by multiple occurrence.
It would probably better to assemble the url anew like
function menu_goto(newvalue) {
var params = {};
if (self.location.search) {
var pairs = self.location.search.substring(1).split("&"); // skip the "?"
for (var i = 0; i < pairs.length; i++) {
var parts = pairs[i].split('=');
params[parts[0]] = parts[1];
}
}
params.lang = newvalue;
var query = new Array();
for (var p in params) query.push(p + '=' + params[p]);
var url = self.location.protocol + '//' + self.location.host + self.location.pathname
+ '?' + query.join('&');
self.location.href = url;
}
Here is yet another solution using RegExps:
function menu_goto2(newvalue) {
var url = self.location.href;
url = url.replace(/#.*/, ''); // clean any hash at end
url = url.replace(/[&\?]lang=[^&]*/, ''); // clean any param lang and its value
// we might have had two or more params and "lang" was the first one
// then we might have lost the "?" => replace first "&" by "?"
if (url.indexOf('?') < 0 && url.indexOf('&') >= 0) url = url.replace(/&/, '?');
url += ( url.match(/\?/)? '&':'?') + 'lang=' + newvalue; // add the new param lang
self.location.href = url;
}
Which could be shortened to
function menu_goto3(newvalue) {
var url = self.location.href.replace(/#.*/, '').replace(/[&\?]lang=[^&]*/, '');
if (url.indexOf('?') < 0 && url.indexOf('&') >= 0) url = url.replace(/&/, '?');
url += ( url.match(/\?/)? '&':'?') + 'lang=' + newvalue;
self.location.href = url;
}
You can simply use indexOf() method for this
if(window.location.href.indexOf("your value") != -1) {
alert("It contains");
}
else {
alert("Nope");
}
function menu_goto(newvalue)
{
var baseurl = window.location.href ;
var langwithpara = "&lang=" + newvalue;
var langwopara = "?lang=" + newvalue;
if (newvalue != 0) {
if(baseurl.match(/\?/)){ // change /?/ to /\?/
alert('123');
location.href = baseurl + langwithpara ;
}
else{
alert('456');
location.href = baseurl + langwopara ;
}
}
}
I'm wondering how I can add a new parameter to an existing url.
The problem is: the url may also contain an anchor.
For example:
http://www.example.com?foo=bar#hashme
And I want to add another parameter to it, so it results in this:
http://www.example.com?foo=bar&x=y#hashme
I used parts of The Awesome One's solution, and a solution found on this question:
Adding a parameter to the URL with JavaScript
Combining them into this script:
function addParameter(url, parameterName, parameterValue, atStart/*Add param before others*/){
replaceDuplicates = true;
if(url.indexOf('#') > 0){
var cl = url.indexOf('#');
urlhash = url.substring(url.indexOf('#'),url.length);
} else {
urlhash = '';
cl = url.length;
}
sourceUrl = url.substring(0,cl);
var urlParts = sourceUrl.split("?");
var newQueryString = "";
if (urlParts.length > 1)
{
var parameters = urlParts[1].split("&");
for (var i=0; (i < parameters.length); i++)
{
var parameterParts = parameters[i].split("=");
if (!(replaceDuplicates && parameterParts[0] == parameterName))
{
if (newQueryString == "")
newQueryString = "?";
else
newQueryString += "&";
newQueryString += parameterParts[0] + "=" + (parameterParts[1]?parameterParts[1]:'');
}
}
}
if (newQueryString == "")
newQueryString = "?";
if(atStart){
newQueryString = '?'+ parameterName + "=" + parameterValue + (newQueryString.length>1?'&'+newQueryString.substring(1):'');
} else {
if (newQueryString !== "" && newQueryString != '?')
newQueryString += "&";
newQueryString += parameterName + "=" + (parameterValue?parameterValue:'');
}
return urlParts[0] + newQueryString + urlhash;
};
Example: addParameter('http://www.example.com?foo=bar#hashme', 'bla', 'valuebla', false)
Results in http://www.example.com?foo=bar&bla=valuebla#hashme
This can be another good solution, this version is even able to replace the parameter if it already exists, add parameter without value:
function addParam(url, param, value) {
var a = document.createElement('a'), regex = /(?:\?|&|&)+([^=]+)(?:=([^&]*))*/g;
var match, str = []; a.href = url; param = encodeURIComponent(param);
while (match = regex.exec(a.search))
if (param != match[1]) str.push(match[1]+(match[2]?"="+match[2]:""));
str.push(param+(value?"="+ encodeURIComponent(value):""));
a.search = str.join("&");
return a.href;
}
url = "http://www.example.com#hashme";
newurl = addParam(url, "ciao", "1");
alert(newurl);
http://jsfiddle.net/bknE4/81/
Try this:
location.href = location.href.replace(location.hash, '') + '&x=y' + location.hash
Update
What about this:
var a = document.createElement('a');
a.href = "http://www.example.com?foo=bar#hashme";
var url = a.href.replace(a.hash, '') + '&x=y' + a.hash;
I found out that the location object can be created by an anchor element(from Creating a new Location object in javascript).
You can use this JS lib called URI.JS
// mutating URLs
URI("http://example.org/foo.html?hello=world")
.username("rodneyrehm")
// -> http://rodneyrehm#example.org/foo.html?hello=world
.username("")
// -> http://example.org/foo.html?hello=world
.directory("bar")
// -> http://example.org/bar/foo.html?hello=world
.suffix("xml")
// -> http://example.org/bar/foo.xml?hello=world
.hash("hackernews")
// -> http://example.org/bar/foo.xml?hello=world#hackernews
.fragment("")
// -> http://example.org/bar/foo.xml?hello=world
.search("") // alias of .query()
// -> http://example.org/bar/foo.xml
.tld("com")
// -> http://example.com/bar/foo.xml
.search({ foo: "bar", hello: ["world", "mars"] });
// -> http://example.com/bar/foo.xml?foo=bar&hello=world&hello=mars
or
URI("?hello=world")
.addSearch("hello", "mars")
// -> ?hello=world&hello=mars
.addSearch({ foo: ["bar", "baz"] })
// -> ?hello=world&hello=mars&foo=bar&foo=baz
.removeSearch("hello", "mars")
// -> ?hello=world&foo=bar&foo=baz
.removeSearch("foo")
// -> ?hello=world
Easy.
<script>
function addPar(URL,param,value){
var url = URL;
var hash = url.indexOf('#');
if(hash==-1)hash=url.length;
var partOne = url.substring(0,hash);
var partTwo = url.substring(hash,url.length);
var newURL = partOne+'&'+param+'='+value+partTwo
return newURL;
}
document.write(addPar('http://www.example.com?foo=bar','x','y')) // returns what you asked for
</script>
The code could be modified a bit, and made a little more efficient, but this should work fine.
#Sangol's solution's better. Didn't know a location.hash property existed.
#freedev answer is great, but if you need something very simple (to insert key=value pair to the url and assume that key doesn't already exist), there's a much faster way to do it:
var addSearchParam = function(url,keyEqualsValue) {
var parts=url.split('#');
parts[0]=parts[0]+(( parts[0].indexOf('?') !== -1) ? '&' : '?')+keyEqualsValue;
return parts.join('#');
}
Example usage: addSearchParam('http://localhost?a=1#hash','b=5');
Here is an improved version of the answer by #skerit. This one supports # in URL path.
function addParameter(url, parameterName, parameterValue, atStart/*Add param before others*/) {
var replaceDuplicates = true;
var cl, urlhash;
parameterName = encodeURIComponent(parameterName);
parameterValue = encodeURIComponent(parameterValue);
if (url.lastIndexOf('#') > 0) {
cl = url.lastIndexOf('#');
urlhash = url.substring(cl, url.length);
} else {
urlhash = '';
cl = url.length;
}
var sourceUrl = url.substring(0, cl);
var urlParts = sourceUrl.split("?");
var newQueryString = "";
if (urlParts.length > 1) {
var parameters = urlParts[1].split("&");
for (var i=0; (i < parameters.length); i++) {
var parameterParts = parameters[i].split("=");
if (!(replaceDuplicates && parameterParts[0] === parameterName)) {
if (newQueryString === "") {
newQueryString = "?";
} else {
newQueryString += "&";
}
newQueryString += parameterParts[0] + "=" + (parameterParts[1]?parameterParts[1]:'');
}
}
}
if (newQueryString === "") {
newQueryString = "?";
}
if (atStart) {
newQueryString = '?'+ parameterName + "=" + parameterValue + (newQueryString.length>1?'&'+newQueryString.substring(1):'');
} else {
if (newQueryString !== "" && newQueryString != '?') {
newQueryString += "&";
}
newQueryString += parameterName + "=" + (parameterValue?parameterValue:'');
}
return urlParts[0] + newQueryString + urlhash;
}
Examples:
addParameter('http://www.example.com?foo=bar#hashme', 'bla', 'valuebla', false);
// Returns: http://www.example.com?foo=bar&bla=valuebla#hashme
addParameter('http://www.example.com/#iAmNotUrlHash/?foo=bar#hashme', 'bla', 'valuebla', false);
// Returns: http://www.example.com/#iAmNotUrlHash/?foo=bar&bla=valuebla#hashme
Something like this ?
var param = "x=y";
var split = url.split('#');
url = split[0] + '&' + param + "#" + split[1];
I always use this code, and its working fine ...
var currenturl=location.href;
var url = location.href+"?ts="+true;
window.location.replace(url,"_self");
if you are trying to add to the url parameter by html anchor tag, and, you have something like this just like I do:
<div class="wrapper-option">
JS
CSS
popular
newest
</div>
you can do something like:
// anchor setup
const anchors = this.document.querySelectorAll('.wrapper-option a');
anchors.forEach(a=>{
a.onclick = e => {
e.preventDefault();
if(a.href){
let uri = new URL(a.href);
let url = new URL(window.location.href);
for(const [k, v] of uri.searchParams){
url.searchParams.set(k, v);
}
window.location.href = url.href;
}
}
});
this will add param to url if it does exist, rewrite if exist. and will not allow duplicate and list.
For checking server side validation errors, I am using "afterSubmit" property of jqGRid.
afterSubmit: checkForErrors
And my checkForErrors is something like this:
function checkForErrors(response,postdata){
var success = true;
var message = ""
var json = eval('(' + response.responseText + ')');
if(json.errors.length>0) {
success = false;
for(i=0; i < json.errors.length; i++) {
message += json.errors[i] + '<br/>';
}
}
var new_id = null;
return [success,message,new_id];
}
The only thing i need to fix is change the default error class/style used by jqgrid (ie. ui-state-error), to my custom css class. How can i do that.
Thanks!
Try this:
function checkForErrors(response,postdata){
var success = true;
var message = ""
var json = eval('(' + response.responseText + ')');
if(json.errors.length>0) {
success = false;
for(i=0; i < json.errors.length; i++) {
message += json.errors[i] + '<br/>';
}
}
var new_id = null;
//Here pass the right class name instead of ".dialog" which you are using in your code
$(".dialog").find(".ui-state-error").removeClass("ui-state-error").addClass("newClass");
return [success,message,new_id];
}
I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is an string array textContent.bidiTexts[]. You concatenate them to get the text of 1 page. Text blocks' coordinates are used to judge whether newline or space need to be inserted. (This may not be totally robust, but from my test it seems ok.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
function Pdf2TextClass(){
var self = this;
this.complete = 0;
/**
*
* #param data ArrayBuffer of the pdf file content
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done;
* 2) total number of pages in file.
* #param callbackAllDone The input parameter of callback function is
* the result of extracted text from pdf file.
*
*/
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
PDFJS.getDocument( data ).then( function(pdf) {
var div = document.getElementById('viewer');
var total = pdf.numPages;
callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
if( null != textContent.bidiTexts ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.bidiTexts.length; k++ ){
var block = textContent.bidiTexts[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere and you nee pdf.js and pdf.worker.js to get this working.
/**
* Extract text from PDFs with PDF.js
* Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
*/
this.pdfToText = function(data) {
PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
PDFJS.cMapPacked = true;
return PDFJS.getDocument(data).then(function(pdf) {
var pages = [];
for (var i = 0; i < pdf.numPages; i++) {
pages.push(i);
}
return Promise.all(pages.map(function(pageNumber) {
return pdf.getPage(pageNumber + 1).then(function(page) {
return page.getTextContent().then(function(textContent) {
return textContent.items.map(function(item) {
return item.str;
}).join(' ');
});
});
})).then(function(pages) {
return pages.join("\r\n");
});
});
}
usage:
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
function convert() {
var fr=new FileReader();
var pdff = new Pdf2TextClass();
fr.onload=function(){
pdff.pdfToText(fr.result, null, (text) => { document.getElementById('result').innerText += text; });
}
fr.readAsDataURL(document.getElementById('pdffile').files[0])
}
function Pdf2TextClass() {
var self = this;
this.complete = 0;
this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
console.assert(data instanceof ArrayBuffer || typeof data == 'string');
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function (pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++) {
pdf.getPage(i).then(function (page) {
var n = page.pageNumber;
page.getTextContent().then(function (textContent) {
//console.log(textContent.items[0]);0
if (null != textContent.items) {
var page_text = "";
var last_block = null;
for (var k = 0; k < textContent.items.length; k++) {
var block = textContent.items[k];
if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
if (block.x < last_block.x)
page_text += "\r\n";
else if (last_block.y != block.y && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total) {
window.setTimeout(function () {
var full_text = "";
var num_pages = Object.keys(layers).length;
for (var j = 1; j <= num_pages; j++)
full_text += layers[j];
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");
// listen for messages from the processor
window.addEventListener("message", function(event){
if (event.source != processor.contentWindow) return;
switch (event.data){
// "ready" = the processor is ready, so fetch the PDF file
case "ready":
var xhr = new XMLHttpRequest;
xhr.open('GET', input.getAttribute("src"), true);
xhr.responseType = "arraybuffer";
xhr.onload = function(event) {
processor.contentWindow.postMessage(this.response, "*");
};
xhr.send();
break;
// anything else = the processor has returned the text of the PDF
default:
output.textContent = event.data.replace(/\s+/g, " ");
break;
}
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
#gm2008's answer was a great starting point (please read it and its comments for more info), but needed some updates (08/19) and had some unused code. I also like examples that are more full. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');
let pathToPDF = 'path/to/myPDFfileToText.pdf';
let toText = Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
let onFinish = function(fullText) { console.log(fullText) };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
function Pdf2TextObj() {
let self = this;
this.complete = 0;
/**
*
* #param path Path to the pdf file.
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done.
* 2) total number of pages in file.
* 3) the `page` object itself or null.
* #param callbackAllDone Called after all text has been collected. Input parameters:
* 1) full text of parsed pdf.
*
*/
this.pdfToText = function(path, callbackPageDone, callbackAllDone) {
// console.assert(typeof path == 'string');
PDFJS.getDocument(path).promise.then(function(pdf) {
let total = pdf.numPages;
callbackPageDone(0, total, null);
let pages = {};
// For some (pdf?) reason these don't all come in consecutive
// order. That's why they're stored as an object and then
// processed one final time at the end.
for (let pagei = 1; pagei <= total; pagei++) {
pdf.getPage(pagei).then(function(page) {
let pageNumber = page.pageNumber;
page.getTextContent().then(function(textContent) {
if (null != textContent.items) {
let page_text = "";
let last_item = null;
for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
let item = textContent.items[itemsi];
// I think to add whitespace properly would be more complex and
// would require two loops.
if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
let itemX = item.transform[5]
let lastItemX = last_item.transform[5]
let itemY = item.transform[4]
let lastItemY = last_item.transform[4]
if (itemX < lastItemX)
page_text += "\r\n";
else if (itemY != lastItemY && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
} // ends if may need to add whitespace
page_text += item.str;
last_item = item;
} // ends for every item of text
textContent != null && console.log("page " + pageNumber + " finished.") // " content: \n" + page_text);
pages[pageNumber] = page_text + "\n\n";
} // ends if has items
++self.complete;
callbackPageDone(self.complete, total, page);
// If all done, put pages in order and combine all
// text, then pass that to the callback
if (self.complete == total) {
// Using `setTimeout()` isn't a stable way of making sure
// the process has finished. Watch out for missed pages.
// A future version might do this with promises.
setTimeout(function() {
let full_text = "";
let num_pages = Object.keys(pages).length;
for (let pageNum = 1; pageNum <= num_pages; pageNum++)
full_text += pages[pageNum];
callbackAllDone(full_text);
}, 1000);
}
}); // ends page.getTextContent().then
}); // ends page.then
} // ends for every page
});
}; // Ends pdfToText()
return self;
}; // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
function Pdf2TextClass(){
var self = this;
this.complete = 0;
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function(pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
//console.log(textContent.items[0]);0
if( null != textContent.items ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.items.length; k++ ){
var block = textContent.items[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
console.log(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
* Created by velten on 25.04.16.
*/
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
var pdfParser = require('pdf2json');
let pdfPipe = request({url: pdfUrl, encoding:null}).pipe(pdfParser);
pdfPipe.on("pdfParser_dataError", err => console.error(err) );
pdfPipe.on("pdfParser_dataReady", pdf => {
//optionally:
//let pdf = pdfParser.getMergedTextBlocksIfNeeded();
let count1 = 0;
//get text on a particular page
for (let page of pdf.formImage.Pages) {
count1 += page.Texts.length;
}
console.log(count1);
pdfParser.destroy();
});
It is possible but:
you would have to use the server anyway, there's no way you can get content of a file on user computer without transferring it to server and back
I don't thing anyone has written such library yet
So if you have some free time you can learn pdf format and write such a library yourself, or you can just use server side library of course.