I am using an old version of UltraWebGrid by Infragistics and need to replace some of the built-in JavaScript. The compiled JS looks like it's adding a bunch of functions to an object type as an API of sorts, formatted like:
var igtbl_ptsBand = ["functionname1",function(){...},"functionname2",function(){...},...
and so on. How would I override this?
Basically the control is adding html to the page in a way that is not compatible with newer browsers and the javascript code that does this just needs a little tweak. I found the code... I just need to change it.
The code can be found here
I added an answer to dump code examples in and whatnot. I will not select this answer
Similar SO question
The array you mentioned seems to be a function table of sorts:
var igtbl_ptsBand = ["func1", function() { }, "func2", function() { } ]
I would recommend using chaining instead of just an override. With chaining you can inject your own code, but still call the original function. Let's say you want to replace "func2" and chain. You could do something like this:
var origFunc, findex, ix;
// The table alternates name, function, name, function, ... so names live in
// the even slots. Step by two and remember the slot of the matching function.
// A plain loop is used (instead of indexOf(...) + 1) because a missing name
// must leave findex at -1; indexOf(...) + 1 would yield 0 and overwrite slot 0.
findex = -1;
for (ix = 0; ix < igtbl_ptsBand.length; ix += 2) {
  if (igtbl_ptsBand[ix] === "func2") {
    findex = ix + 1;
    break;
  }
}
if (findex >= 0) {
  // Found it, chain
  origFunc = igtbl_ptsBand[findex];
  igtbl_ptsBand[findex] = function() {
    // Your new pre-code here
    // Call original func (chain), forwarding `this` and all arguments so the
    // original behaves exactly as if it had been called directly.
    var result = origFunc.apply(this, arguments);
    // Your new post-code here
    return result;
  };
}
origFunc may have arguments, of course, and you may want to use the JavaScript call() function to set the "this pointer" to something specific, e.g.:
origFunc.call(customThis, arg1, arg2...);
If the arguments are in an array, you can use apply() instead of call().
I would not recommend doing this. You should always try to work with a third party library, not against it. That being said, this should work:
// Look the name up first: when it is missing indexOf returns -1, and blindly
// adding 1 would overwrite slot 0 (the first *name* in the table).
var overrideIndex = igtbl_ptsBand.indexOf("functionYouWantToOverwrite");
if (overrideIndex !== -1) {
  // The function belonging to a name is stored in the slot right after it.
  igtbl_ptsBand[overrideIndex + 1] = function () {
    // your new stuff...
  };
}
Ok here is what I am doing. I will update this with my progress.
This fixes my problem. It turns out I had to apply the function array to all child objects of the parent "rows". To do this I added the code in my "Fix Rows" function. I split it up because I am running across other browser JS errors, which I am fixing in this JS file.
Here is the js file that I added to my .net page like so...
</form>
<script type="text/javascript" src="../../scripts/BrowserCompat.js"></script>
</body>
</html>
.
// Apply the grid fixes as soon as this script is evaluated. The call may
// precede the definition because function declarations are hoisted.
Brows_FixUltraWebGrid();
// Entry point for the UltraWebGrid compatibility fixes; currently it only
// delegates to FixRows().
function Brows_FixUltraWebGrid() {
FixRows();
}
// Patches the "render" entry of the igtbl_ptsRows table, then copies every
// (name, function) pair of that table onto igtbl_Rows.prototype.
function FixRows() {
  FixGridRows_render();

  var slot = 0;
  while (slot < igtbl_ptsRows.length) {
    var methodName = igtbl_ptsRows[slot];
    var method = igtbl_ptsRows[slot + 1];
    igtbl_Rows.prototype[methodName] = method;
    slot += 2;
  }
}
/**
 * Replaces the "render" entry of the igtbl_ptsRows function table with a
 * version that wraps the transformed row markup in a <tbody> when the markup
 * does not already contain one.
 *
 * igtbl_ptsRows alternates name, function, name, function, ...; if no "render"
 * name is present the table is left untouched.
 */
function FixGridRows_render() {
  var findex = -1, ix;
  // Names sit in the even slots. A missing name must leave findex at -1
  // (indexOf(...) + 1 would give 0 and clobber the first entry).
  for (ix = 0; ix < igtbl_ptsRows.length; ix += 2) {
    if (igtbl_ptsRows[ix] === "render") {
      findex = ix + 1;
      break;
    }
  }
  if (findex < 0) {
    return;
  }

  // The original render is replaced outright, not chained.
  igtbl_ptsRows[findex] = function() {
    var strTransform = this.applyXslToNode(this.Node);
    if (strTransform) {
      var anId = (this.AddNewRow ? this.AddNewRow.Id : null);
      // Include a tbody if the markup has none. The test is case-insensitive
      // and accepts attributes so "<TBODY>" or "<tbody id=...>" is not wrapped twice.
      var tadd1 = '';
      var tadd2 = '';
      if (!(/<tbody[\s>]/i.test(strTransform))) {
        tadd1 = '<tbody>';
        tadd2 = '</tbody>';
      }
      this.Grid._innerObj.innerHTML =
        "<table style=\"table-layout:fixed;\">" + tadd1 + strTransform + tadd2 + "</table>";
      var tbl = this.Element.parentNode;
      // firstChild.firstChild is the <tbody> of the table built just above.
      igtbl_replaceChild(tbl, this.Grid._innerObj.firstChild.firstChild, this.Element);
      igtbl_fixDOEXml();
      var _b = this.Band;
      var headerDiv = igtbl_getElementById(this.Grid.Id + "_hdiv");
      var footerDiv = igtbl_getElementById(this.Grid.Id + "_fdiv");
      if (this.AddNewRow) {
        if (_b.Index > 0 || (_b.AddNewRowView == 1 && !headerDiv) || (_b.AddNewRowView == 2 && !footerDiv)) {
          var anr = this.AddNewRow.Element;
          anr.parentNode.removeChild(anr);
          if (_b.AddNewRowView == 1 && tbl.tBodies[0].rows.length > 0)
            tbl.tBodies[0].insertBefore(anr, tbl.tBodies[0].rows[0]);
          else
            tbl.tBodies[0].appendChild(anr);
        }
        this.AddNewRow.Element = igtbl_getElementById(anId);
        this.AddNewRow.Element.Object = this.AddNewRow;
      }
      this.Element = tbl.tBodies[0];
      this.Element.Object = this;
      this._setupFilterRow();
      for (var i = 0; i < this.Band.Columns.length; i++) {
        var column = this.Band.Columns[i];
        if (column.Selected && column.hasCells()) {
          var col = this.getColumn(i);
          if (col)
            igtbl_selColRI(this.Grid.Id, col, this.Band.Index, i);
        }
      }
      if (this.ParentRow) {
        this.ParentRow.ChildRowsCount = this.length;
        this.ParentRow.VisChildRowsCount = this.length;
      }
    }
    // Guarded: some old browsers only define `console` while dev tools are open.
    if (typeof console !== "undefined" && console.log) {
      console.log('overridden row render function executed');
    }
  };
}
Related
I'm using JavaScript to remove, order up, order down a text row, it runs normally in IE, but not in Chrome or Firefox.
When I run, I received a message from console bug:
Uncaught TypeError: Failed to execute 'removeChild' on 'Node': parameter 1 is not of type 'Node'.
How to fix the error?
/**
 * Removes every selected option from the "ans_list<index>" list of the
 * "writeForm" form, clears the matching "ans<index>" field and then calls
 * setting_val(index).
 *
 * @param index suffix identifying which ans_list / ans pair to operate on
 */
function dels(index) {
  var frm = document.writeForm;
  var opts = frm["ans_list" + index].options;
  // Walk backwards: removing an option shifts the ones after it, so a forward
  // loop would skip the element that slides into the freed slot.
  for (var i = opts.length - 1; i >= 0; i--) {
    var opt = opts[i];
    if (opt.selected) {
      // removeChild must be called on the parent, with the child as argument.
      opt.parentNode.removeChild(opt);
    }
  }
  frm["ans" + index].value = "";
  setting_val(index);
}
/**
 * Moves every selected option of the "ans_list<index>" list one position up,
 * then calls setting_val(index). The first option cannot move any higher.
 *
 * @param index suffix identifying which ans_list to reorder
 */
function up_move(index) {
  var frm = document.writeForm;
  var opts = frm["ans_list" + index].options;
  // Start at 1: there is nothing above the first option.
  for (var i = 1; i < opts.length; i++) {
    var opt = opts[i];
    if (opt.selected) {
      var previous = opts[i - 1];
      // insertBefore relocates a node that is already in the document, so no
      // clone/remove round trip is needed.
      previous.parentNode.insertBefore(opt, previous);
      // Re-assert the selection in case the move reset it.
      opt.selected = true;
    }
  }
  setting_val(index);
}
**(UPDATED)**
/**
 * Moves every selected option of the "ans_list<index>" list one position
 * down, then calls setting_val(index). The last option cannot move any lower.
 *
 * @param index suffix identifying which ans_list to reorder
 */
function down_move(index)
{
  var frm = document.writeForm;
  var opts = frm["ans_list" + index].options;
  // Walk from the bottom so an option that was just moved is not visited again;
  // start at length - 2 because the last option has nothing below it.
  for (var i = opts.length - 2; i >= 0; i--) {
    var opt = opts[i];
    if (opt.selected) {
      var next = opts[i + 1];
      // Insert after `next`; a null reference appends at the end.
      next.parentNode.insertBefore(opt, next.nextSibling);
      // Re-assert the selection in case the move reset it.
      opt.selected = true;
    }
  }
  setting_val(index);
}
<span class="bt_test_admin bg_type_01">Delete</span>
<span class="bt_test_admin bg_type_01">▲ Order</span>
<span class="bt_test_admin bg_type_01">▼ Order</span>
Wrong use of removeChild
if (opts[i].selected) {
opts[i--].removeChild(true);
}
The function is intended as:
ParentNode.removeChild(ChildNode);
// OR
ChildNode.parentNode.removeChild(ChildNode);
MDN Documentation on removeChild
Also, you can replace all your evals
eval("frm.ans" + index + ".value = '' ")
eval("frm.ans_list" + index + ".options")
It would be better written as
frm["ans" + index].value = ""
frm["ans_list" + index].options
Finally,
tmp = opts[i].cloneNode(true);
opts[i].removeChild(true);
opts[i].insertAdjacentElement("afterEnd", tmp).selected = true;
Cloning a node, appending the clone, and removing the original is better done by simply moving the original to its new location.
But you try to remove the original first and then insert the clone after the original, which is odd.
If I correctly understood what you try to do, this function could help you.
/**
 * Reverses the order of the children of `select_element` in place and keeps
 * its current value.
 */
function reverse_options_order(select_element)
{
  // Remember the value now; it is written back once the children are reordered.
  const previous_value = select_element.value;

  // Children are parked in a detached fragment while they are reordered.
  const holder = document.createDocumentFragment();

  // Always take the current last child: appending it to the fragment removes
  // it from the element, so the fragment ends up holding the reversed sequence.
  for (let node = select_element.lastChild; node; node = select_element.lastChild)
  {
    holder.appendChild(node);
  }

  // Appending a fragment appends all of its children in order.
  select_element.appendChild(holder);
  select_element.value = previous_value;
}
You can use the same method to reverse any nodes order
Basic javascript function to scroll the text in the title bar, I'm calling it via a setInterval("rotateTitle()", 1000); call after onload.
This function, which takes text from an array, works perfectly.
// Offset of the element that starts the next scroll frame.
var counter = 0;

// Writes the base title followed by the next scroll frame into the document title.
function rotateTitle() {
  var baseTitle = "www.mydomain.com - now with JavaScript";
  var titleArray = ["a", "b", "c", "d", "e", "f", "g"];
  var titleString = "abcdefg"; // string variant of the same text, see the discussion below
  var scrollText = getNextScroll(titleArray);
  window.document.title = baseTitle.concat(scrollText);
}

/**
 * Returns the next frame of the scrolling text: a leading space followed by
 * the elements of inValue rotated left by the current `counter`, then advances
 * `counter` by one step (wrapping at the end).
 *
 * @param inValue array (or string) whose elements are concatenated
 * @returns the rotated text, prefixed with one space
 */
function getNextScroll(inValue) {
  var length = inValue.length;
  var str = " ";
  if (length === 0) {
    counter = 0;
    return str;
  }
  // Normalising with % keeps the offset valid even if the input length changed
  // between calls, and wraps after exactly `length` frames (wrapping only once
  // counter exceeded length showed the first frame twice in a row).
  var start = counter % length;
  for (var i = 0; i < length; i++) {
    str += inValue[(start + i) % length];
  }
  counter = (start + 1) % length;
  return str;
}
Edited here for clarity:
Now if I rewrite the function to scroll a string (not an array), I change the line
str += inValue[index];
to
str.concat(inValue.charAt(index));
and change getNextScroll(titleArray) to getNextScroll(titleString), the script seems to execute, but only the baseTitle is shown.
Why is this wrong?
You have to assign the result of str.concat back to str; otherwise you'll miss the concat operation. Instead of charAt you must use inValue[index].
Do like this:
str = str.concat(inValue[index]);
Here's a JS Bin: http://jsbin.com/aCEBAju/2/
In your original code you have this:
str.concat(inValue.charAt(index));
debugging in Chrome it barks: array has no method charAt.
The solution to the problem is that str.concat(inValue.charAt(index)); must change to str = str.concat(inValue.charAt(index)); or str += inValue.charAt(index);. Str must be assigned the new value. This is the entire working function:
// Offset of the character that starts the next scroll frame.
var counter = 0;

// Writes the base title followed by the next scroll frame into the document title.
function rotateTitle() {
  var baseTitle = "www.berrmal.com - now with JavaScript";
  var titleString = "berrmal: bigger, longer, uncut";
  var scrollText = getNextScroll(titleString);
  window.document.title = baseTitle.concat(scrollText);
}

/**
 * Returns the next frame of the scrolling text: a leading space followed by
 * inString rotated left by the current `counter`, then advances `counter` by
 * one step (wrapping at the end).
 *
 * @param inString text to scroll
 * @returns the rotated text, prefixed with one space
 */
function getNextScroll(inString) {
  var length = inString.length;
  var str = " ";
  if (length === 0) {
    counter = 0;
    return str;
  }
  // Normalising with % wraps after exactly `length` frames; wrapping only once
  // counter exceeded length showed the first frame twice in a row.
  var start = counter % length;
  for (var i = 0; i < length; i++) {
    str += inString.charAt((start + i) % length);
  }
  counter = (start + 1) % length;
  return str;
}
I figured out the answer to the problem based on Leniel Macaferi's answer, though his posted code is not correct. This method runs successfully in Firefox 23.0 with no error in the console.
I'm trying to build a database based on some arbitrary data on a website. It's complex and changes for each site so I'll spare the details. Here's basically what I'm trying to do
// Starts a top-level entry: appends "<arg> = {" to the textarea content.
function level0(arg) {
  var opening = arg + ' = {';
  textarea.innerHTML += opening;
}
// Starts a second-level entry on a new, tab-indented line: "<arg>: [".
function level1(arg) {
  var opening = '\n\t' + arg + ': [';
  textarea.innerHTML += opening;
}
// Appends one third-level item followed by ", ".
function level2(arg) {
  var item = arg + ', ';
  textarea.innerHTML += item;
}
And so on. The thing is some level1's don't have any children and I can't get the formatting right.
My three problems are as follows.
The ending commas are going to break in IE (thank you MS)
Empty level1's shouldn't be printed if they don't have any children
Closing /curly?brackets/
HERE'S A DEMO of what I have so far. Notice the ending commas, the empty sub2 which shouldn't be printed, and no closing brackets or braces
Do I need to redesign the entire thing?
Is there also a way to have this all in one function so I don't have to worry if I add another layer?
EDIT
This needs to be done in a string format, I can't build an object and then stringify it, mostly because I need to know which element I'm in the middle of adding to.
Overall it looks that you still might want to build an object, but in case you insist on not building it - here is some sample solution:
/**
 * Incrementally builds a three-level textual structure:
 *   level0 -> "name = {"   level1 -> "\tname: ["   level2 -> plain items.
 * Each levelN call first closes whatever is still open at that level or
 * deeper, adding the delimiter needed before the new entry.
 */
function Printer() {
  var result = '',
      lastLevel = null,
      // Text that closes an entry of each level, and the separator that goes
      // between two sibling entries of that level.
      close = {0: '\n}', 1: ']', 2: ''},
      delimiter = {0: ',\n', 1: ',\n', 2: ','};

  // Returns the text that closes every open level from the last one written
  // up to `level`. The delimiter is added only at `level` itself and only
  // when noDelimiter is falsy. Returns '' while nothing has been written.
  function closingFor(level, noDelimiter) {
    var text = '', l;
    if (lastLevel === null)
      return text;
    for (l = lastLevel; l >= level; l--) {
      text += close[l];
      if (l == level && !noDelimiter)
        text += delimiter[l];
    }
    return text;
  }

  this.level0 = function(arg) {
    result += closingFor(0) + arg + ' = {\n';
    lastLevel = 0;
  };
  this.level1 = function(arg) {
    result += closingFor(1) + '\t' + arg + ': [';
    lastLevel = 1;
  };
  this.level2 = function(arg) {
    result += closingFor(2) + arg;
    lastLevel = 2;
  };

  // Returns the text built so far with every open level closed. Closing goes
  // all the way to level 0 (not just the last level written), and the internal
  // state is left untouched so the method can be called repeatedly.
  this.getResult = function() {
    return result + closingFor(0, true);
  };
}
var p = new Printer();
p.level0('head');
p.level1('sub1');
p.level2('item1');p.level2('item2');p.level2('item3');
p.level1('sub2');
p.level1('sub3');
p.level2('newthing');
p.level0('head2');
document.getElementById('textarea').value = p.getResult();
You could see it in action here.
I'm not sure why you're building what looks like objects with nested arrays, using string concatenation. Something like this would be much simpler, since it wouldn't require fixing trailing commas, etc:
Edit: I've updated the code to make it keep track of the last level put in.
/**
 * Collects a three-level structure (name -> name -> list of items) and
 * renders it as text. level0/level1 open a new entry and make it the current
 * one; level2 appends an item to the current level1 list.
 */
function Db() {
  // The level0 object and level1 array that new children are currently added to.
  var level0, level1;
  var data = {};

  this.level0 = function(arg) {
    level0 = {};
    data[arg] = level0;
  };
  this.level1 = function(arg) {
    level1 = [];
    level0[arg] = level1;
  };
  this.level2 = function(arg) {
    level1.push(arg);
  };

  // Renders one line per level0 name followed by one tab-indented line per
  // non-empty level1 list; empty level1 lists are skipped.
  this.toString = function() {
    var s = '';
    // Loop variables are declared locally; undeclared ones would leak as
    // globals and throw a ReferenceError in strict mode.
    for (var i in data) {
      s += i + '\n';
      for (var j in data[i]) {
        if (data[i][j].length > 0) {
          s += '\t' + j + ': [' + data[i][j] + ']\n';
        }
      }
    }
    return s;
  };
}
Use like this:
var db = new Db();
db.level0('head');
db.level1('sub1');
db.level2('item1');db.level2('item2');db.level2('item3');
I've tested this in the demo you linked and it works just fine.
Hello,
I have 3 different functions in JavaScript; the first one replaces HTML select boxes with custom select boxes built from ULs,
and the other 2 replace checkboxes and radio buttons respectively.
Now I want to derive classes from these functions and need your suggestions: what is the best way to organize these functions into classes, and is inheritance possible?
I really appreciate your help.
Thanks.
Here is some sample code.
/**
 * Replaces every <select> with a non-empty class name inside the given form
 * by a custom div-based widget: a "selectArea" (left/right/center parts plus a
 * button) inserted before the hidden select, and an "optionsDiv" appended to
 * <body> holding one link per option.
 *
 * Relies on names defined outside this function: $ / Event / setStyle,
 * cic.addEvent, clickObserver, selectMe, showOptions, findPosX, findPosY.
 *
 * @param formid id (or element) handed to $() to locate the form
 */
function replaceSelect(formid) {
  var form = $(formid);
  if (!form) return;

  // NOTE(review): invisibleSelectboes, selectAreaHeight and
  // selectAreaOptionsOverlap are assigned without `var`, i.e. as globals.
  // Kept as-is because other scripts may read them - confirm before localising.
  invisibleSelectboes = document.getElementsByClassName("optionsDivInvisible");
  // Remove leftovers of a previous run. Walk backwards: the native
  // getElementsByClassName returns a live collection that shrinks on every
  // removal, so a forward loop would skip every other element.
  for (var i = invisibleSelectboes.length - 1; i >= 0; i--) {
    document.body.removeChild(invisibleSelectboes[i]);
  }

  var selects = [];
  var selectboxes = form.getElementsByTagName('select');
  var selectRightSideWidth = 21;
  var selectLeftSideWidth = 8;
  selectAreaHeight = 21;
  selectAreaOptionsOverlap = 2;

  // Snapshot the select boxes of the form into a plain array.
  for (var cfs = 0; cfs < selectboxes.length; cfs++) {
    selects.push(selectboxes[cfs]);
  }

  // Replace the select boxes
  for (var q = 0; q < selects.length; q++) {
    // Selects without a class name are left alone.
    if (selects[q].className == "") continue;
    var onchangeEvent = selects[q].onchange;

    //create and build div structure
    var selectArea = document.createElement('div');
    var left = document.createElement('div');
    var right = document.createElement('div');
    var center = document.createElement('div');
    var button = document.createElement('a');
    var text = document.createTextNode('');
    center.id = "mySelectText" + q;

    // Width comes from the "selectWidth" attribute, else from a "width_NNN"
    // class name. Radix 10 so a leading zero is never read as octal.
    var selectWidth;
    if (!!selects[q].getAttribute("selectWidth")) {
      selectWidth = parseInt(selects[q].getAttribute("selectWidth"), 10);
    } else {
      selectWidth = parseInt(selects[q].className.replace(/width_/g, ""), 10);
    }
    center.style.width = selectWidth + 'px';
    selectArea.style.width = selectWidth + selectRightSideWidth + selectLeftSideWidth + 'px';
    if (selects[q].style.display == 'none' || selects[q].style.visibility == 'hidden') {
      selectArea.style.display = 'none';
    }
    button.style.width = selectWidth + selectRightSideWidth + selectLeftSideWidth + 'px';
    button.style.marginLeft = -selectWidth - selectLeftSideWidth + 'px';
    // The IIFE pins the current q for the handler.
    Event.observe(button, 'click', function (q) {
      return function (event) {
        clickObserver(event, q)
      }
    }(q));
    button.onkeydown = this.selectListener;
    button.className = "selectButton"; //class used to check for mouseover
    selectArea.className = "selectArea";
    selectArea.id = "sarea" + q;
    left.className = "left";
    right.className = "right";
    center.className = "center";
    right.appendChild(button);
    center.appendChild(text);
    selectArea.appendChild(left);
    selectArea.appendChild(right);
    selectArea.appendChild(center);

    //hide the select field
    selects[q].style.display = 'none';
    //insert select div
    selects[q].parentNode.insertBefore(selectArea, selects[q]);

    //build & place options div
    var optionsDiv = document.createElement('div');
    if (selects[q].getAttribute('width')) optionsDiv.style.width = selects[q].getAttribute('width') + 'px';
    else optionsDiv.style.width = selectWidth + 8 + 'px';
    optionsDiv.className = "optionsDivInvisible";
    optionsDiv.id = "optionsDiv" + q;
    optionsDiv.style.left = findPosX(selectArea) + 'px';
    optionsDiv.style.top = findPosY(selectArea) + selectAreaHeight - selectAreaOptionsOverlap + 'px';

    //get select's options and add to options div
    for (var w = 0; w < selects[q].options.length; w++) {
      var optionHolder = document.createElement('p');
      var optionLink = document.createElement('a');
      if (selects[q].options[w].className == "informal") {
        // "informal" options are display-only: their events are swallowed.
        optionLink.innerHTML = selects[q].options[w].getAttribute('text');
        optionLink.className = "informal";
        cic.addEvent(optionLink, 'click', function (event) {
          Event.stop(event);
        });
        Event.observe(optionLink, 'mouseover', function (event) {
          Event.stop(event);
        });
        Event.observe(optionLink, 'mouseout', function (event) {
          Event.stop(event);
        });
      }
      else {
        var optionTxt = document.createTextNode(selects[q].options[w].text);
        optionLink.appendChild(optionTxt);
        // The IIFE pins the current id/w/q/onchange for the handler.
        cic.addEvent(optionLink, 'click', function (id, w, q, onchangeEvent) {
          return function () {
            showOptions(q);
            selectMe(selects[q].id, w, q, onchangeEvent);
          }
        }(selects[q].id, w, q, onchangeEvent));
      }
      optionHolder.appendChild(optionLink);
      optionsDiv.appendChild(optionHolder);
      if (selects[q].options[w].selected) {
        selectMe(selects[q].id, w, q);
      }
    }
    document.getElementsByTagName("body")[0].appendChild(optionsDiv);

    // Pin this iteration's div: `var optionsDiv` is function-scoped, so a
    // plain closure would make every handler hide only the LAST options div.
    Event.observe(optionsDiv, 'mouseleave', function (div) {
      return function () {
        div.className = 'optionsDivInvisible';
      };
    }(optionsDiv));
    // Clicks inside the options div must not bubble further.
    cic.addEvent(optionsDiv, 'click', function (event) {
      if (event.stopPropagation) event.stopPropagation();
      else event.cancelBubble = true;
    });
  }
  form.setStyle({
    visibility: 'visible'
  });
}
From the sounds of it, you're looking to create a unified API to encapsulate all of this "form enhancing" functionality. Possibly something like this:
// Namespace object grouping one constructor per enhanced form control.
// The constructor bodies are placeholders to be filled in.
var formEnhancement = {
SelectBox: function(){ /* ... */ },
CheckBox: function(){ /* ... */ },
RadioButton: function(){ /* ... */ }
};
// Methods shared by all SelectBox instances go on the prototype.
formEnhancement.SelectBox.prototype = { /* ... define methods ... */ };
// etc. (other prototypes)
// Call something:
// Instantiates the SelectBox constructor, passing the element looked up by id.
var myEnhancedSelectBox = new formEnhancement.SelectBox(
document.getElementById('id-of-a-select-box')
);
Does this answer your query?
I'd go with
// Module pattern: an immediately-invoked function whose return value becomes
// `Library`. The functions declared inside are reachable from outside only
// through the properties of the returned object.
var Library = (function()
{
function _selectBox()
{
// stuff
}
function _checkBox()
{
// stuff
}
function _radioButton()
{
// stuff
}
// Public API: maps the exported names to the inner functions.
return {
SelectBox : _selectBox,
CheckBox : _checkBox,
RadioButton : _radioButton
};
})();
or
// Same module pattern as the previous variant, but the public functions are
// defined inline in the returned object instead of as named inner functions.
var Library = (function()
{
return {
SelectBox : function()
{
// stuff
},
CheckBox : function()
{
// stuff
},
RadioButton : function()
{
// stuff
}
};
})();
[Edit]
this way, you can actually declare "private" variables that can be accessible only from the library itself, just declaring var foo="bar"; inside Library's declaration, makes a foo variable that can't be accessed from outside, but can be accessed by anything within Library, this is why functions like _selectBox in my example remain private, but can still be accessed through Library.SelectBox, which would be the "public getter"
[/Edit]
also, instead of
var Library = (function(){})();
you could do something like this:
var Library = Library || {};
Library.UI = (function(){})();
this way, you can keep separate parts of your code library, you can keep them in separate files, which don't care about the order in which they are loaded, as long as they have
var Library = Library || {};
on top of them
the functions would then be called like this:
Library.SelectBox();
or in the case you chose to go with "subclasses"
Library.UI.SelectBox();
All the answers are general patterns; I think none of them is really helpful. Just putting your 3 huge functions into an object doesn't make your code modular, reusable, or maintainable.
So my first suggestion is to utilize function decomposition. You've mentioned inheritance. Now if your code is basically made of these 3 giant functions, nothing can be inherited or shared. You should separate the function logic by purpose into smaller, more straightforward functions.
A good example is that, as you've mentioned, replacing is relevant in all your cases. Maybe you can set up a function that is responsible for DOM replacement independently of the element's type. Such a function can be shared between your modules, making your code more robust and allowing you to stay DRY.
The best way to organize this process is called wishful thinking: you solve your problem with functions that are intuitive and helpful even though they may not exist yet. This is related to how you can design effective interfaces.
Put the functions in a namespace:
Declare it like this:
// Declared with `var`: assigning to an undeclared name creates an implicit
// global and throws a ReferenceError in strict mode.
var FormUtils = {};
and add its properties, which will be your functions
FormUtils.replaceSelect = function () {/*your code*/};
FormUtils.replaceCheckbox = function () {/*your code*/};
FormUtils.replaceRadio = function () {/*your code*/};
then you call this functions with their namespace:
FormUtils.replaceSelect();
This is a simple and widely accepted design pattern in JavaScript.
I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is a string array textContent.bidiTexts[]. You concatenate its entries to get the text of one page. The text blocks' coordinates are used to judge whether a newline or a space needs to be inserted. (This may not be totally robust, but from my tests it seems OK.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
/**
 * Extracts the text of a PDF with pdf.js (uses the global `PDFJS`).
 * Create with `new Pdf2TextClass()` and call pdfToText().
 */
function Pdf2TextClass(){
  var self = this;
  // Number of pages finished during the current pdfToText() run.
  this.complete = 0;

  // Horizontal / vertical position of a text block. Older pdf.js text blocks
  // carry x/y directly; newer "items" carry a transform matrix
  // [a, b, c, d, e, f] whose e/f entries are the offsets.
  function blockX(block) {
    return block.x !== undefined ? block.x : (block.transform ? block.transform[4] : undefined);
  }
  function blockY(block) {
    return block.y !== undefined ? block.y : (block.transform ? block.transform[5] : undefined);
  }

  // Concatenates the text blocks of one page. Between two blocks (unless the
  // previous one already ends with a space) a line break is inserted when the
  // new block starts further left, otherwise a space when the vertical
  // position changed and the previous block does not end in a lone letter.
  function joinBlocks(blocks) {
    var page_text = "";
    var last_block = null;
    for (var k = 0; k < blocks.length; k++) {
      var block = blocks[k];
      if (typeof block.str != 'string')
        continue; // nothing to print for entries without text
      if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
        if (blockX(block) < blockX(last_block))
          page_text += "\r\n";
        else if (blockY(last_block) != blockY(block) && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
          page_text += ' ';
      }
      page_text += block.str;
      last_block = block;
    }
    return page_text;
  }

  /**
   * @param data URL string or ArrayBuffer of the pdf file content
   * @param callbackPageDone Optional. Informs about progress each time a page
   *        is finished. Its input parameters are:
   *        1) number of pages done;
   *        2) total number of pages in file.
   * @param callbackAllDone Receives the text extracted from the whole file
   *        once every page has been processed.
   */
  this.pdfToText = function(data, callbackPageDone, callbackAllDone){
    console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
    // Start counting from zero so the instance can be reused for another file.
    self.complete = 0;
    PDFJS.getDocument( data ).then( function(pdf) {
      var total = pdf.numPages;
      if (typeof callbackPageDone == 'function')
        callbackPageDone( 0, total );
      // Page texts keyed by page number: pages may finish out of order.
      var layers = {};
      for (var i = 1; i <= total; i++){
        pdf.getPage(i).then( function(page){
          var n = page.pageNumber;
          page.getTextContent().then( function(textContent){
            // Older builds expose bidiTexts, newer ones items.
            var blocks = textContent.bidiTexts || textContent.items;
            if( null != blocks ){
              layers[n] = joinBlocks(blocks) + "\n\n";
              console.log("page " + n + " finished.");
            }
            ++ self.complete;
            if (typeof callbackPageDone == 'function')
              callbackPageDone( self.complete, total );
            if (self.complete == total){
              // Every page has stored its text by now, so the result can be
              // assembled immediately. Iterate 1..total so that a page
              // without text cannot shift or drop the pages after it.
              var full_text = "";
              for( var j = 1; j <= total; j++)
                full_text += layers[j] || "";
              callbackAllDone(full_text);
            }
          }); // end of page.getTextContent().then
        }); // end of page.then
      } // of for
    });
  }; // end of pdfToText()
} // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere, and you need pdf.js and pdf.worker.js to get this working.
/**
 * Extract text from PDFs with PDF.js
 * Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
 *
 * Returns a promise for the text of the whole document: the strings of each
 * page are joined with single spaces, the pages with "\r\n".
 */
this.pdfToText = function(data) {
  PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
  PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
  PDFJS.cMapPacked = true;

  // Resolves with the space-joined strings of one page.
  function readPageText(pdf, pageNumber) {
    return pdf.getPage(pageNumber).then(function(page) {
      return page.getTextContent();
    }).then(function(textContent) {
      var strings = [];
      textContent.items.forEach(function(item) {
        strings.push(item.str);
      });
      return strings.join(' ');
    });
  }

  return PDFJS.getDocument(data).then(function(pdf) {
    // pdf.js page numbers are 1-based.
    var pending = [];
    for (var pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      pending.push(readPageText(pdf, pageNumber));
    }
    return Promise.all(pending);
  }).then(function(pageTexts) {
    return pageTexts.join("\r\n");
  });
}
usage:
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
// Reads the file chosen in the #pdffile input and appends the extracted text
// to #result. Does nothing when no file has been selected.
function convert() {
  var file = document.getElementById('pdffile').files[0];
  if (!file) {
    // Without this guard FileReader would be handed `undefined` and throw.
    return;
  }
  var fr = new FileReader();
  var pdff = new Pdf2TextClass();
  fr.onload = function () {
    // fr.result is a data: URL string, passed on as-is to pdfToText.
    pdff.pdfToText(fr.result, null, (text) => { document.getElementById('result').innerText += text; });
  };
  fr.readAsDataURL(file);
}
/**
 * Extracts the text of a PDF with pdf.js (uses the global `pdfjsLib`).
 * Create with `new Pdf2TextClass()` and call pdfToText().
 */
function Pdf2TextClass() {
  var self = this;
  // Number of pages finished during the current pdfToText() run.
  this.complete = 0;

  // Horizontal / vertical position of a text item. Items carry a transform
  // matrix [a, b, c, d, e, f] whose e/f entries are the offsets; plain x/y
  // properties (older text blocks) are used as a fallback.
  function blockX(block) {
    return block.transform ? block.transform[4] : block.x;
  }
  function blockY(block) {
    return block.transform ? block.transform[5] : block.y;
  }

  // Concatenates the text items of one page. Between two items (unless the
  // previous one already ends with a space) a line break is inserted when the
  // new item starts further left, otherwise a space when the vertical
  // position changed and the previous item does not end in a lone letter.
  function joinBlocks(blocks) {
    var page_text = "";
    var last_block = null;
    for (var k = 0; k < blocks.length; k++) {
      var block = blocks[k];
      if (typeof block.str != 'string')
        continue; // nothing to print for entries without text
      if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
        if (blockX(block) < blockX(last_block))
          page_text += "\r\n";
        else if (blockY(last_block) != blockY(block) && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
          page_text += ' ';
      }
      page_text += block.str;
      last_block = block;
    }
    return page_text;
  }

  /**
   * @param data URL string (including data: URLs) or ArrayBuffer of the pdf
   * @param callbackPageDone Optional. Called with (pages done, total pages)
   *        once before the first page and after every finished page.
   * @param callbackAllDone Called with the text of the whole document.
   */
  this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
    console.assert(data instanceof ArrayBuffer || typeof data == 'string');
    // Start counting from zero so the instance can be reused for another file.
    self.complete = 0;
    var loadingTask = pdfjsLib.getDocument(data);
    loadingTask.promise.then(function (pdf) {
      // Public page count (instead of the private pdf._pdfInfo).
      var total = pdf.numPages;
      if (typeof callbackPageDone == 'function')
        callbackPageDone(0, total);
      // Page texts keyed by page number: pages may finish out of order.
      var layers = {};
      for (var i = 1; i <= total; i++) {
        pdf.getPage(i).then(function (page) {
          var n = page.pageNumber;
          page.getTextContent().then(function (textContent) {
            if (null != textContent.items) {
              layers[n] = joinBlocks(textContent.items) + "\n\n";
              console.log("page " + n + " finished.");
            }
            ++self.complete;
            if (typeof callbackPageDone == 'function')
              callbackPageDone(self.complete, total);
            if (self.complete == total) {
              // Every page has stored its text by now, so the result can be
              // assembled immediately. Iterate 1..total so that a page
              // without text cannot shift or drop the pages after it.
              var full_text = "";
              for (var j = 1; j <= total; j++)
                full_text += layers[j] || "";
              callbackAllDone(full_text);
            }
          }); // end of page.getTextContent().then
        }); // end of page.then
      } // of for
    });
  }; // end of pdfToText()
} // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
// Elements looked up by id; the handler below reads input's "src" attribute,
// processor's contentWindow and output's textContent.
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");

// listen for messages from the processor
window.addEventListener("message", function(event){
  // Ignore messages that do not come from the processor.
  if (event.source != processor.contentWindow) return;

  if (event.data === "ready") {
    // "ready" = the processor is ready, so fetch the PDF file as binary data
    // and hand the response over to the processor.
    var xhr = new XMLHttpRequest;
    xhr.open('GET', input.getAttribute("src"), true);
    xhr.responseType = "arraybuffer";
    xhr.onload = function(event) {
      processor.contentWindow.postMessage(this.response, "*");
    };
    xhr.send();
  } else {
    // anything else = the processor has returned the text of the PDF;
    // whitespace runs are collapsed to single spaces before display.
    output.textContent = event.data.replace(/\s+/g, " ");
  }
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
@gm2008's answer was a great starting point (please read it and its comments for more info), but it needed some updates (08/19) and had some unused code. I also like examples that are more complete. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');
let pathToPDF = 'path/to/myPDFfileToText.pdf';
let toText = Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
let onFinish = function(fullText) { console.log(fullText) };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
/**
 * Factory for a PDF-to-text converter built on pdf.js (uses the `PDFJS`
 * binding of the surrounding file). Returns an object with:
 *   complete   - number of pages finished during the current run
 *   pdfToText  - see below
 */
function Pdf2TextObj() {
  // A plain object rather than `this`: the factory is called without `new`,
  // where `this` is undefined in strict code and the global object otherwise
  // (which would leak `complete` and `pdfToText` as globals).
  const self = { complete: 0 };

  // Concatenates the text items of one page. Between two items (unless the
  // previous one already ends with a space) a line break is inserted when
  // transform[5] decreased, otherwise a space when transform[4] changed and
  // the previous item does not end in a lone letter.
  // NOTE(review): in pdf.js the transform is [a, b, c, d, e, f]; index 4 is
  // presumably the horizontal and index 5 the vertical offset - confirm.
  function joinPageItems(items) {
    let page_text = "";
    let last_item = null;
    for (const item of items) {
      if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
        const itemLine = item.transform[5];
        const lastItemLine = last_item.transform[5];
        const itemOffset = item.transform[4];
        const lastItemOffset = last_item.transform[4];
        if (itemLine < lastItemLine)
          page_text += "\r\n";
        else if (itemOffset != lastItemOffset && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
          page_text += ' ';
      } // ends if may need to add whitespace
      page_text += item.str;
      last_item = item;
    } // ends for every item of text
    return page_text;
  }

  /**
   * @param path Path to the pdf file.
   * @param callbackPageDone Optional. Informs about progress each time a page
   *        is finished. Its input parameters are:
   *        1) number of pages done.
   *        2) total number of pages in file.
   *        3) the `page` object itself or null.
   * @param callbackAllDone Called after all text has been collected. Input parameters:
   *        1) full text of parsed pdf.
   */
  self.pdfToText = function(path, callbackPageDone, callbackAllDone) {
    // Start counting from zero so the converter can be reused for another file.
    self.complete = 0;
    PDFJS.getDocument(path).promise.then(function(pdf) {
      const total = pdf.numPages;
      if (typeof callbackPageDone == 'function')
        callbackPageDone(0, total, null);
      // Pages do not necessarily finish in consecutive order. That's why
      // they're stored by page number and combined once at the end.
      const pages = {};
      for (let pagei = 1; pagei <= total; pagei++) {
        pdf.getPage(pagei).then(function(page) {
          const pageNumber = page.pageNumber;
          page.getTextContent().then(function(textContent) {
            if (null != textContent.items) {
              pages[pageNumber] = joinPageItems(textContent.items) + "\n\n";
              console.log("page " + pageNumber + " finished.");
            } // ends if has items
            ++self.complete;
            if (typeof callbackPageDone == 'function')
              callbackPageDone(self.complete, total, page);
            // If all done, put pages in order and combine all text, then pass
            // that to the callback. Every page has stored its text before the
            // counter reaches the total, so no timer is needed; iterating
            // 1..total keeps the order even if a page produced no text.
            if (self.complete == total) {
              let full_text = "";
              for (let pageNum = 1; pageNum <= total; pageNum++)
                full_text += pages[pageNum] || "";
              callbackAllDone(full_text);
            }
          }); // ends page.getTextContent().then
        }); // ends page.then
      } // ends for every page
    });
  }; // Ends pdfToText()

  return self;
} // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
/**
 * Browser-side PDF text extractor built on pdf.js (expects the `pdfjsLib`
 * global provided by the pdf.js script). Instantiate with `new`.
 *
 * Instance members:
 *   complete  - number of pages finished by the most recently started
 *               `pdfToText` call.
 *   pdfToText - see below.
 */
function Pdf2TextClass(){
  const self = this;
  this.complete = 0;

  // Text that is a lone letter, or that ends in "<whitespace><letter>".
  // No space is inserted after such a fragment.
  const ENDS_WITH_LONE_LETTER = /^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/;

  /**
   * Concatenates the text blocks of one page, inserting line breaks and
   * spaces based on each block's position relative to the previous block.
   *
   * @param {Array<{str: string, transform: number[]}>} blocks Text items.
   * @returns {string} The page text (without a trailing page separator).
   */
  function joinBlocks(blocks){
    let pageText = "";
    let lastBlock = null;
    for (const block of blocks){
      if (lastBlock !== null && !lastBlock.str.endsWith(" ")){
        // Positions come from the transform matrix: index 4 is the
        // horizontal and index 5 the vertical translation. Text items do
        // not carry plain `x` / `y` properties.
        const x = block.transform[4];
        const y = block.transform[5];
        const lastX = lastBlock.transform[4];
        const lastY = lastBlock.transform[5];
        if (y < lastY){
          // The block sits lower than the previous one: start a new line.
          pageText += "\r\n";
        } else if (x !== lastX && !ENDS_WITH_LONE_LETTER.test(lastBlock.str)){
          pageText += " ";
        }
      }
      pageText += block.str;
      lastBlock = block;
    }
    return pageText;
  }

  /**
   * Extracts the text of every page of a PDF.
   *
   * @param {ArrayBuffer|string} data PDF bytes or a URL of the PDF.
   * @param {function(number, number)=} callbackPageDone Optional progress
   *   callback: (pages done, total pages). Called once with 0 pages done,
   *   then once per finished page.
   * @param {function(string)=} callbackAllDone Optional completion callback
   *   receiving the full text, pages in document order. When omitted, the
   *   full text is written to the console instead.
   * @returns {Promise<string>} Resolves with the full text; rejects if
   *   loading or parsing fails, or if a callback throws.
   */
  this.pdfToText = function(data, callbackPageDone, callbackAllDone){
    console.assert( data instanceof ArrayBuffer || typeof data == 'string' );

    // Per-call counter so one instance can process several documents.
    let done = 0;
    self.complete = 0;

    const reportProgress = function(total){
      if (typeof callbackPageDone === 'function'){
        callbackPageDone(done, total);
      }
    };

    const loadingTask = pdfjsLib.getDocument(data);
    return loadingTask.promise.then(function(pdf){
      // Public page count (rather than the private `_pdfInfo` field).
      const total = pdf.numPages;
      reportProgress(total);

      // Pages finish in arbitrary order; Promise.all keeps the results in
      // page order and resolves only once every page is in, so no timer is
      // needed before assembling the text.
      const pageJobs = [];
      for (let pageIndex = 1; pageIndex <= total; pageIndex++){
        pageJobs.push(
          pdf.getPage(pageIndex).then(function(page){
            return page.getTextContent().then(function(textContent){
              // A page without an item list contributes an empty page
              // instead of leaving a hole in the result.
              const blocks = textContent.items != null ? textContent.items : [];
              const pageText = joinBlocks(blocks) + "\n\n";
              console.log("page " + page.pageNumber + " finished.");
              done += 1;
              self.complete = done;
              reportProgress(total);
              return pageText;
            }); // end of page.getTextContent().then
          }) // end of page.then
        );
      } // of for
      return Promise.all(pageJobs);
    }).then(function(pageTexts){
      const fullText = pageTexts.join("");
      if (typeof callbackAllDone === 'function'){
        callbackAllDone(fullText);
      } else {
        console.log(fullText);
      }
      return fullText;
    });
  }; // end of pdfToText()
} // end of class
// Example usage: create an extractor and start converting a PDF.
// 'PDF_URL' is presumably a placeholder for the address of the PDF to read.
// No callbacks are passed here.
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
 * Created by velten on 25.04.16.
 *
 * Downloads a PDF over HTTP, streams it into pdf2json and prints the total
 * number of text blocks found across all pages.
 */
"use strict";
const pdfUrl = "http://example.com/example.pdf";
const request = require('request');
const PDFParser = require('pdf2json');

// pdf2json exports the parser class; piping into it and listening for its
// events requires an instance.
const pdfParser = new PDFParser();

// `encoding: null` makes `request` deliver the raw bytes instead of a
// decoded string. Download errors are reported instead of crashing the
// process with an unhandled 'error' event.
const pdfPipe = request({url: pdfUrl, encoding: null})
  .on("error", err => console.error(err))
  .pipe(pdfParser);

pdfPipe.on("pdfParser_dataError", err => console.error(err));
pdfPipe.on("pdfParser_dataReady", pdf => {
  //optionally:
  //let pdf = pdfParser.getMergedTextBlocksIfNeeded();

  // This script reads pages from `formImage.Pages`.
  // NOTE(review): newer pdf2json releases reportedly expose `Pages` directly
  // on the data object - both shapes are accepted here; confirm against the
  // installed version.
  const pages = (pdf.formImage || pdf).Pages;

  // Count the text blocks on every page of the document.
  let count1 = 0;
  for (const page of pages) {
    count1 += page.Texts.length;
  }
  console.log(count1);
  pdfParser.destroy();
});
It is possible but:
you would have to use the server anyway, there's no way you can get content of a file on user computer without transferring it to server and back
I don't think anyone has written such a library yet
So if you have some free time you can learn the PDF format and write such a library yourself, or you can just use a server-side library, of course.