I have a problem in slideAC(data) function.
If condition seem to have some problem in the picture condition (data[0] == "picture")
I have already try to alert the data for testing the input value by alert(data[0]);
and the result is "picture" as well I have no idea what is the problem??
since other test condition work correctly.
HERE IS THE INPUT DATA IN THE extractData(data <-- array) FUCNCTION
(It's Already Split from other function by use split("\n"))
slide
,- width 400
,- height 300
,- into #slide1
,- picture
,+[pic/001.png]
,+[pic/002.jpg]
,+[pic/003.jpg]
,+[pic/004.jpg]
,
The purpose of the code is extract the words from the above text and generate some code.
function extractData(data){
alert(data);
var n = 0;
var step1 = "";
var step2 = "";
var step3 = "";
var step4 = "";
var picture =[];
//check '#' command by call the first line data
if(data[0].indexOf("slide") !== -1){
for(var i=1; i<data.length; i++){
// alert(n);
switch (n){
case 0:///////////////////////////////////////
// alert("case1");
//extract from '-'
if(data[i].indexOf('-') !== -1){
step1 = data[i].replace('-','');
step2 = step1.split(' ');
step3 = step2.slice(1,step2.length);
//slide Attribute Compiler
n = slideAC(step3);
}else{
alert("out");
}//end of if condition
break;
case 1:///////////////////////////////////////
// alert("case2");
//extract from '+'
if(data[i].indexOf('+') !== -1){
step1 = data[i].replace('+','');
step2 = step1.replace("[","");
step3 = step2.replace("]","");
picture.push(step3);
alert(step3);
}else if(data[i].indexOf('-') !== -1){
step1 = data[i].replace('-','');
step2 = step1.split(' ');
step3 = step2.slice(1,step2.length);
//slide Attribute Compiler
n = slideAC(step3);
}else{
//Error
// alert("wrong pic syntax");
// javascript_abort();
}//end of if condition
break;
}//end of iswitch case
}//end of item for loop
}else if(data[0].indexOf("menu") !== -1){
//Provision
}else if(data[0].indexOf("form") !== -1){
//Provision
}else{
javascript_abort();
}//end of if condition
}//end of syntaxCompiler
//slide Attribute Compiler
function slideAC(data){
// alert(data[0]);
var a = 0
if(data[0] == "width"){
var propWidth = data[1];
// alert(data[0] + " : " + propWidth);
// alert(typeof data);
a = 0;
}else if(data[0] == "height"){
var propHeight = data[1];
// alert(data[0] + " : " + propHeight);
// alert(typeof data);
a = 0;
}else if(data[0] == "into"){
var propInto = data[1];
// alert(data[0] + " : " + propInto);
// alert(typeof data);
a = 0;
}else if(data[0] == "picture"){
a = 1;
}else{
alert("why");
// javascript_abort();
}//end of if condition
return a;
}//end of slide attribute compiler
Do you have some advise ?? Please help
I just want the if condition work correctly
Thanks in advance
PS. Sorry If my wording make you confuse.
I'm guessing there may be space characters around the words. Hard to tell because you haven't shown a good picture of the result of the split, or shown the original input, and how you're splitting it.
If this is the case, you could trim it. I used a switch statement instead of your if/else if/else.
// trim the string-------v
switch(data[0].trim()) {
case "width":
var propWidth = data[1];
// alert(data[0] + " : " + propWidth);
// alert(typeof data);
a = 0;
break;
case "height":
var propHeight = data[1];
// alert(data[0] + " : " + propHeight);
// alert(typeof data);
a = 0;
break;
case "into":
var propInto = data[1];
// alert(data[0] + " : " + propInto);
// alert(typeof data);
a = 0;
break;
case "picture":
a = 1;
break;
default:
alert("why");
// javascript_abort();
}
You'll need a patch for the .trim() method if you support old browsers.
If js says it's false, then it is! You should eval your vars in a debugger to see what is happening. If using Chrome, just place a call to debugger to break and inspect vars values.
http://msdn.microsoft.com/en-us/library/ie/0bwt76sk(v=vs.94).aspx
Related
My system fatches data every 500 ms and my screen is full of html tables apart from each other . And every cell has unique key attribute. I am caching all of them anyway.
I have a global JavaScript object(_cellColorTimeouts) which contains settimeout functions for cellElements of tableRows that I mentioned above. After caching of cells, system creates timeout functions which is to wipe css out for spesific cell (in 3000ms).
In code block below uiElementKey_X and uiElementKey_Y are exact same but cached like are different. Adding unique suffix into table id makes them different. This proccess is done for row and cell items aswell.
example of _cellColorTimeouts data is
//array object keys are names of unique cell items.
_cellColorTimeouts = [uiElementKey_X_1, uiElementKey_X_2, uiElementKey_X_3,
uiElementKey_Y_1, uiElementKey_X_2, uiElementKey_Y_3];
.
. //does somethings to change cell colour
.
//after 3 seconds i need to clear css of this cell without looping the dom so i do it via cached dom.
if (_cellColorTimeouts.hasOwnProperty(uiElementKey) && _cellColorTimeouts[uiElementKey] != null) {
clearTimeout(_cellColorTimeouts[uiElementKey]);
_cellColorTimeouts[uiElementKey] = null;
}
_cellColorTimeouts[uiElementKey] = setTimeout(function () {
clearColourOfCell(cell);
}, 3000);
}
function clearColourOfCell(cell) {
cell.style.backgroundColor = cell.rowBGColour;
cell.style.color = "black";
_cellColorTimeouts[cell.uiElementKey] == null;
clearTimeout(_cellColorTimeouts[cell.uiElementKey]);
}
So the problem is settimeout function is not working for the first table but second is totally fine. I have checked is there any settimeout function return id from global, yes it has. For the first table somehow it does not work. I know this question is too unique for my case but any idea will be preciated?
---- EDIT ---- FULL FUNCTION UNCUT VERSION -----
function setWidgetData(widgetId, rowId, colId, value, colIndex) {
"use strict";
// check colIndex
if (colIndex === undefined || colIndex === null) {
colIndex = 0;
}
// loop on ui tables
var uiTables = _widgetUIElements[widgetId];
//var timeout;
for (var tableId in uiTables) {
var uiTable = uiTables[tableId];
var uiElementKey = tableId + "#" + rowId + "#" + colId + "#" + colIndex;
var cellCachedObject = uiTable[uiElementKey];
// check cell
if (cellCachedObject == undefined) {
//console.log("cell is undefined : " + "widgetId : " + widgetId + " - " + "rowId : " + rowId + " - " + "colId : " + colId + " - " + "colIndex : " + colIndex);
}
else {
// get cell
var cell = cellCachedObject["domElement"];
// set sell value
var cellValue = value;
// is value numeric? it means we will make some conversions on value
if (isNumeric(cellValue)) {
var canPaint = false;
// check cell entity
switch (cellCachedObject["entity"]) {
// date-time?
case "DATETIME":
// convert unix date time to readable date time
cellValue = new Date(fixDecimalSeparator(cellValue) * 1000);
cellValue = fixDateTimeDigits((cellValue.getDate())) + "/" + fixDateTimeDigits((cellValue.getMonth() + 1)) + " " + fixDateTimeDigits(cellValue.getHours()) + ":" + fixDateTimeDigits(cellValue.getMinutes());
break;
// date?
case "DATE":
// convert unix date time to readable date time
cellValue = new Date(fixDecimalSeparator(cellValue) * 1000);
cellValue = fixDateTimeDigits((cellValue.getDate())) + "/" + fixDateTimeDigits((cellValue.getMonth() + 1));
break;
// numeric?
case "NR":
// fix "," character in value
cellValue = fixDecimalSeparator(cellValue);
//just format the presicion
cellValue = number_format(cellValue, cellCachedObject["precision"], '.', ',');
canPaint = true;
break;
// other?
default:
// fix "," character in value
cellValue = fixDecimalSeparator(cellValue);
// if cell is number, no entity conversion
// entity convertion
cellValue = entityConverter(cellCachedObject["entity"], cellCachedObject["entityTo"], cellValue);
cellValue = new Number(cellValue).toFixed(cellCachedObject["precision"]);
// if widget currency is not USD. it means user selected currency from currency list or default user currency
if (cellCachedObject["isConvertable"]) {
// this scoop is not active with the new xml. if FOREX1 widget entity is RECIPCUR but never should not be
if (cellCachedObject["widgetIsFOREX1"]) {
cellValue = _currencyConverter.convertTrend(cellValue, cellCachedObject.currencyValueType, cellCachedObject["currencyTo"], cellCachedObject["rowId"], cellValue);
}
else {
cellValue = _currencyConverter.convert(cellValue, cellCachedObject["currency"], null, cellCachedObject["precision"]);
}
}
canPaint = true;
}
// if it is not date time
if (canPaint) {
// get current value of cell
var currentValue = cell.getAttribute("currentValue");
// check current value of cell make them coloured.
if (currentValue !== undefined) {
// new value is bigger than old value
var newVal = parseFloat(value);
var oldVal = parseFloat(currentValue);
var rowBGColour = cellCachedObject["rowBGColor"];
cell.rowBGColour = rowBGColour;
cell.uiElementKey = uiElementKey;
if (newVal > oldVal) {
//cell.css({ "background-color": "Green", "color": "White" });
cell.style.backgroundColor = "green";
cell.style.color = "white";
}
// new value is smaller than old value
if (newVal < oldVal) {
//cell.css({ "background-color": "Red", "color": "White" });
cell.style.backgroundColor = "red";
cell.style.color = "white";
}
if (_cellColorTimeouts.hasOwnProperty(uiElementKey) && _cellColorTimeouts[uiElementKey] != null) {
clearTimeout(_cellColorTimeouts[uiElementKey]);
_cellColorTimeouts[uiElementKey] = null;
}
_cellColorTimeouts[uiElementKey] = setTimeout(function () {
return function () {
clearColourOfCell(cell);
};
} (cell), 3000);
newVal = oldVal = rowBGColour = null;
}
currentValue = null;
}
canPaint = null;
// set new value as a current value
cell.setAttribute("currentValue", value);
}
cell.innerHTML = '';
cell.innerHTML = cellValue;
cellValue = null;
}
uiTable = uiElementKey = cellCachedObject = null;
}
uiTables = null;
}
You didn't post enough code for me to know for sure that this is the problem, but it's a good bet:
_cellColorTimeouts[uiElementKey] = setTimeout(function () {
return function() {
clearColourOfCell(cell);
};
}(cell), 3000);
By setting up the timeout handler like that, you ensure that the handler has its own private copy of that "cell" variable, so that no matter how "cell" is changed before the handler is finally invoked, that copy will retain the correct value.
I need to read a file line by line, and change a variable accordingly.
I would normally write this in PHP... but I decided to take the challenge.
I wrote:
fs = require('fs');
Lazy = require('lazy');
path = require('path');
files = fs.readdirSync('.');
var software = {};
files.forEach( function(fileName){
var m;
if( m = fileName.match(/^(.*)\.txt$/) ){
name = m[1];
console.log("Processing file: " + fileName);
software[name] = {};
console.log("Software 1: %j",software);
var section = 'unset';
new Lazy(fs.createReadStream(fileName)).lines.forEach(
function(line){
var m;
line = line + '';
if( m = line.match(/^([a-zA-Z_]*):$/)){
section = m[1];
software[name][section] = '';
console.log("Switching to section " + m[1]);
console.log("Software 2: %j",software);
} else if (line == '.'){
section = 'unset'
} else if (line == ''){
section = 'unset'
} else {
console.log("LINE: " + line) ;
software[name][section] = software[name][section] + line + "\n";
console.log("Software 3: %j",software);
}
}
);
}
});
console.log("Software 4: %j",software);
Apart from the code being very ugly and very unoptimised, I am having trouble as when the last line prints, the "software" variable is not YET populated! I am guessing Lazy is asyncronous. So, it basically works, but "at some point later". This is great, but... where do I write code when that important cycle, that fills in the software variable, is actually finished?!?
As requested: data to play with!
simply create "something.txt" and write:
name:
Name 1
.
Option 1:
Value 1
.
Option 2:
Value 2
.
Option 3:
Multi
Line
Value
.
Another_section:
Again
.
Merc.
The instances of Lazy returned by the library are EventEmitters, and it emits en event called pipe when a "set" of operations is complete:
new Lazy(
...
).on('pipe', function() {
// all done
});
Modifying your code to use this event results in (the only change is near the bottom):
fs = require('fs');
Lazy = require('lazy');
path = require('path');
files = fs.readdirSync('.');
var software = {};
files.forEach( function(fileName){
var m;
if( m = fileName.match(/^(.*)\.txt$/) ){
name = m[1];
console.log("Processing file: " + fileName);
software[name] = {};
console.log("Software 1: %j",software);
var section = 'unset';
new Lazy(fs.createReadStream(fileName)).lines.forEach(
function(line){
var m;
line = line + '';
if( m = line.match(/^([a-zA-Z_]*):$/)){
section = m[1];
software[name][section] = '';
console.log("Switching to section " + m[1]);
console.log("Software 2: %j",software);
} else if (line == '.'){
section = 'unset'
} else if (line == ''){
section = 'unset'
} else {
console.log("LINE: " + line) ;
software[name][section] = software[name][section] + line + "\n";
console.log("Software 3: %j",software);
}
}
).on('pipe', function() {
console.log("Software 4: %j",software);
});
}
});
[Edit] To answer your question regarding how I found this info:
I did indeed check out the source file for the project; I knew the library had a sum method that could be chained to instances of Lazy to sum up everything at the end; the code for that method calls foldr, and the code for that method listens for an event called pipeName, which is defaulted in line 22 as pipe.
As title says i have some problems with IE8 and Javascript. It's known about its bug in interpretation of global variables: simply it does not get them if you don't declare as:
var variable1 = something;
The problem it's i'm trying to make a script that change the body background clicking on a button and i need a global variable wrapping the actual status (what bg-x.png i'm loading). This script work on FF, Safari and Chrome but not, obviously, on IE. Help? (the problem is on the variable "status")
$('#change').click(function() {
var numStates = 2;
var name = $(this).text();
if(!(status)) {
status = parseInt(1,10);
}
if(status<numStates) {
status = parseInt(status,10) + 1;
}
else {
status = parseInt(1,10);
}
alert(status);
var bgvar = null;
switch(parseInt(status,10)) {
case 1: var bgvar = ' #7097ab url(./img/bg-' + status + '.png) top center repeat';
var name = 'Pattern';
break;
case 2: var bgvar = ' #7097ab url(./img/bg-' + status + '.png) top center repeat-x';
var name = 'Sfumato';
break;
default: alert('Default');
}
$('body').css({
background:bgvar,
});
$(this).text(name);
}
);
Working code even with IE (Thanks to Zeta):
$('#change').click(function() {
var numStates = 2;
var name = $(this).text();
// If data-status isn't defined set it to the initial value
if($('body').data('status') === undefined)
$('body').data('status',1);
// Extract the status
var status = parseInt($('body').data('status'),10);
// Handle the status
if(status < numStates)
status++;
else
status = 1;
// Save the status
$('body').data('status',status);
switch(status) {
case 1: bgvar = ' #7097ab url(./img/bg-' + status + '.png) top center repeat';
name = 'Pattern';
break;
case 2: bgvar = ' #7097ab url(./img/bg-' + status + '.png) top center repeat-x';
name = 'Sfumato';
break;
default: alert('Default');
}
$('body').css({
background:bgvar,
});
$(this).text(name);
}
);
status is a predefined member of the window-object and points to the content of the statusbar. Use another variable-name
Since you're already using jQuery you could use .data() instead of global variables:
$('#change').click(function() {
var numStates = 2;
var name = $(this).text();
// If data-status isn't defined set it to the initial value
if($('body').data('status') === undefined)
$('body').data('status',1);
// Extract the status
var status = parseInt($('body').data('status'),10);
// Handle the status
if(status < numStates)
status++;
else
status = 1;
// Save the status
$('body').data('status',status);
/* ... Rest of your code ... */
Note that this won't work in XML documents in IE (according to the jQuery doc).
Do you know when to use »var«?
I just cleaned your code in the follwing ways:
Add "var status;" to make it a local variable
Delete "var" for already defined variables in your switch statement
Delete the unnecessary comma behind "background: bgvar" which will cause errors in IE
$('#change').click(function () {
var numStates = 2;
var name = $(this).text();
var status;
if (!(status)) {
status = parseInt(1, 10);
}
if (status < numStates) {
status = parseInt(status, 10) + 1;
} else {
status = parseInt(1, 10);
}
alert(status);
var bgvar = null;
switch (parseInt(status, 10)) {
case 1:
bgvar = ' #7097ab url(./img/bg-' + status + '.png) top center repeat';
name = 'Pattern';
break;
case 2:
bgvar = ' #7097ab url(./img/bg-' + status + '.png) top center repeat-x';
name = 'Sfumato';
break;
default:
alert('Default');
}
$('body').css({
background: bgvar
});
$(this).text(name);
});
Does it work now?
P.S. Use http://www.jshint.com/ to prevent those kind of errors.
I have written a script that checks a set of radiobuttons to be checked. But due to different possibilities different radiobuttons will show. Is there a way to suppress JavaScript errors when it pops undefined/getElementById is null? Something like the #-char does in PHP?
Update:
A bit more background info. I've made a website where users can submit images and another party for whom the images are can select their top 3 of the images. So each image has three radiobuttons. The difficulty here lies in the fact that the radiobuttons must be controlled dimensional (horizontal and vertical), because a submitted image may only be at place 1, 2 or 3. This is my working code. But adding many if(!var == undefined) doesn't make the code prettier. Therefor I'm wondering if there is something like #suppressMe is possible?
function HandleRadioButtons(id, type, idString, img)
{
var idArray = idString.split("|");
var place1 = document.getElementById("G_" + id);
var place2 = document.getElementById("S_" + id);
var place3 = document.getElementById("B_" + id);
var img1 = document.getElementById("Winner1");
var img2 = document.getElementById("Winner2");
var img3 = document.getElementById("Winner3");
switch(type)
{
case "G" :
place2.checked = false;
place2.disabled = true;
place3.checked = false;
place3.disabled = true;
img1.style.background = 'url(' + img + ') no-repeat center center #FFF';
break;
case "S" :
place1.checked = false;
place1.disabled = true;
place3.checked = false;
place3.disabled = true;
img2.style.background = 'url(' + img + ') no-repeat center center #FFF';
break;
case "B" :
place1.checked = false;
place1.disabled = true;
place2.checked = false;
place2.disabled = true;
img3.style.background = 'url(' + img + ') no-repeat center center #FFF';
break;
}
var current1, current2, current3 = "";
for(i = 0; i < idArray.length - 1; i++)
{
var place1 = document.getElementById("G_" + idArray[i]);
var place2 = document.getElementById("S_" + idArray[i]);
var place3 = document.getElementById("B_" + idArray[i]);
if(place1.checked == true)
{
var current1 = idArray[i];
}
if(place2.checked == true)
{
var current2 = idArray[i];
}
if(place3.checked == true)
{
var current3 = idArray[i];
}
}
for(i = 0; i < idArray.length - 1; i++)
{
var place1 = document.getElementById("G_" + idArray[i]);
var place2 = document.getElementById("S_" + idArray[i]);
var place3 = document.getElementById("B_" + idArray[i]);
if(idArray[i] != id && idArray[i] != current1 && idArray[i] != current2 && idArray[i] != current3)
{
switch(type)
{
case "G" :
place1.disabled = false;
place2.disabled = false;
place3.disabled = false;
break;
case "S" :
place1.disabled = false;
place2.disabled = false;
place3.disabled = false;
break;
case "B" :
place1.disabled = false;
place2.disabled = false;
place3.disabled = false;
break;
}
}
}
}
You can easily test for a null or undefined value in JavaScript, as both these values are falsy:
var element = document.getElementById('some-id');
if (element) {
element.value = 'Hello';
}
You could also consider using a try/catch block:
try {
var element = document.getElementById('some-id');
element.value = 'Hello';
// ... the rest of your code here.
}
catch (e) {
if (!(e instanceof TypeError)) {
// The exception is not a TypeError, so throw it again.
throw e;
}
}
However be careful that the above will suppress all the TypeError exceptions and that might make your code more difficult to debug.
you can check if buttons are existing by getElementById and then check its length. Are you using any framework?
Try this
var element = document.getElementById('some-id');
element?.value = 'Hello';
I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is an string array textContent.bidiTexts[]. You concatenate them to get the text of 1 page. Text blocks' coordinates are used to judge whether newline or space need to be inserted. (This may not be totally robust, but from my test it seems ok.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
function Pdf2TextClass(){
var self = this;
this.complete = 0;
/**
*
* #param data ArrayBuffer of the pdf file content
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done;
* 2) total number of pages in file.
* #param callbackAllDone The input parameter of callback function is
* the result of extracted text from pdf file.
*
*/
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
PDFJS.getDocument( data ).then( function(pdf) {
var div = document.getElementById('viewer');
var total = pdf.numPages;
callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
if( null != textContent.bidiTexts ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.bidiTexts.length; k++ ){
var block = textContent.bidiTexts[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere and you nee pdf.js and pdf.worker.js to get this working.
/**
* Extract text from PDFs with PDF.js
* Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
*/
this.pdfToText = function(data) {
PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
PDFJS.cMapPacked = true;
return PDFJS.getDocument(data).then(function(pdf) {
var pages = [];
for (var i = 0; i < pdf.numPages; i++) {
pages.push(i);
}
return Promise.all(pages.map(function(pageNumber) {
return pdf.getPage(pageNumber + 1).then(function(page) {
return page.getTextContent().then(function(textContent) {
return textContent.items.map(function(item) {
return item.str;
}).join(' ');
});
});
})).then(function(pages) {
return pages.join("\r\n");
});
});
}
usage:
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
function convert() {
var fr=new FileReader();
var pdff = new Pdf2TextClass();
fr.onload=function(){
pdff.pdfToText(fr.result, null, (text) => { document.getElementById('result').innerText += text; });
}
fr.readAsDataURL(document.getElementById('pdffile').files[0])
}
function Pdf2TextClass() {
var self = this;
this.complete = 0;
this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
console.assert(data instanceof ArrayBuffer || typeof data == 'string');
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function (pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++) {
pdf.getPage(i).then(function (page) {
var n = page.pageNumber;
page.getTextContent().then(function (textContent) {
//console.log(textContent.items[0]);0
if (null != textContent.items) {
var page_text = "";
var last_block = null;
for (var k = 0; k < textContent.items.length; k++) {
var block = textContent.items[k];
if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
if (block.x < last_block.x)
page_text += "\r\n";
else if (last_block.y != block.y && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total) {
window.setTimeout(function () {
var full_text = "";
var num_pages = Object.keys(layers).length;
for (var j = 1; j <= num_pages; j++)
full_text += layers[j];
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");
// listen for messages from the processor
window.addEventListener("message", function(event){
if (event.source != processor.contentWindow) return;
switch (event.data){
// "ready" = the processor is ready, so fetch the PDF file
case "ready":
var xhr = new XMLHttpRequest;
xhr.open('GET', input.getAttribute("src"), true);
xhr.responseType = "arraybuffer";
xhr.onload = function(event) {
processor.contentWindow.postMessage(this.response, "*");
};
xhr.send();
break;
// anything else = the processor has returned the text of the PDF
default:
output.textContent = event.data.replace(/\s+/g, " ");
break;
}
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
#gm2008's answer was a great starting point (please read it and its comments for more info), but needed some updates (08/19) and had some unused code. I also like examples that are more full. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');
let pathToPDF = 'path/to/myPDFfileToText.pdf';
let toText = Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
let onFinish = function(fullText) { console.log(fullText) };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
function Pdf2TextObj() {
let self = this;
this.complete = 0;
/**
*
* #param path Path to the pdf file.
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done.
* 2) total number of pages in file.
* 3) the `page` object itself or null.
* #param callbackAllDone Called after all text has been collected. Input parameters:
* 1) full text of parsed pdf.
*
*/
this.pdfToText = function(path, callbackPageDone, callbackAllDone) {
// console.assert(typeof path == 'string');
PDFJS.getDocument(path).promise.then(function(pdf) {
let total = pdf.numPages;
callbackPageDone(0, total, null);
let pages = {};
// For some (pdf?) reason these don't all come in consecutive
// order. That's why they're stored as an object and then
// processed one final time at the end.
for (let pagei = 1; pagei <= total; pagei++) {
pdf.getPage(pagei).then(function(page) {
let pageNumber = page.pageNumber;
page.getTextContent().then(function(textContent) {
if (null != textContent.items) {
let page_text = "";
let last_item = null;
for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
let item = textContent.items[itemsi];
// I think to add whitespace properly would be more complex and
// would require two loops.
if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
let itemX = item.transform[5]
let lastItemX = last_item.transform[5]
let itemY = item.transform[4]
let lastItemY = last_item.transform[4]
if (itemX < lastItemX)
page_text += "\r\n";
else if (itemY != lastItemY && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
} // ends if may need to add whitespace
page_text += item.str;
last_item = item;
} // ends for every item of text
textContent != null && console.log("page " + pageNumber + " finished.") // " content: \n" + page_text);
pages[pageNumber] = page_text + "\n\n";
} // ends if has items
++self.complete;
callbackPageDone(self.complete, total, page);
// If all done, put pages in order and combine all
// text, then pass that to the callback
if (self.complete == total) {
// Using `setTimeout()` isn't a stable way of making sure
// the process has finished. Watch out for missed pages.
// A future version might do this with promises.
setTimeout(function() {
let full_text = "";
let num_pages = Object.keys(pages).length;
for (let pageNum = 1; pageNum <= num_pages; pageNum++)
full_text += pages[pageNum];
callbackAllDone(full_text);
}, 1000);
}
}); // ends page.getTextContent().then
}); // ends page.then
} // ends for every page
});
}; // Ends pdfToText()
return self;
}; // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
function Pdf2TextClass(){
var self = this;
this.complete = 0;
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function(pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
//console.log(textContent.items[0]);0
if( null != textContent.items ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.items.length; k++ ){
var block = textContent.items[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
console.log(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
* Created by velten on 25.04.16.
*/
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
var pdfParser = require('pdf2json');
let pdfPipe = request({url: pdfUrl, encoding:null}).pipe(pdfParser);
pdfPipe.on("pdfParser_dataError", err => console.error(err) );
pdfPipe.on("pdfParser_dataReady", pdf => {
//optionally:
//let pdf = pdfParser.getMergedTextBlocksIfNeeded();
let count1 = 0;
//get text on a particular page
for (let page of pdf.formImage.Pages) {
count1 += page.Texts.length;
}
console.log(count1);
pdfParser.destroy();
});
It is possible but:
you would have to use the server anyway, there's no way you can get content of a file on user computer without transferring it to server and back
I don't thing anyone has written such library yet
So if you have some free time you can learn pdf format and write such a library yourself, or you can just use server side library of course.