I'm trying to build a database based on some arbitrary data on a website. It's complex and changes for each site so I'll spare the details. Here's basically what I'm trying to do
function level0(arg) { textarea.innerHTML += arg + ' = {'; }
function level1(arg) { textarea.innerHTML += '\n\t' + arg + ': ['; }
function level2(arg) { textarea.innerHTML += arg + ', '; }
And so on. The thing is some level1's don't have any children and I can't get the formatting right.
My three problems are as follows.
The ending commas are going to break in IE (thank you MS)
Empty level1's shouldn't be printed if they don't have any children
Closing /curly?brackets/
HERE'S A DEMO of what I have so far. Notice the ending commas, the empty sub2 which shouldn't be printed, and no closing brackets or braces
Do I need to redesign the entire thing?
Is there also a way to have this all in one function so I don't have to worry if I add another layer?
EDIT
This needs to be done in a string format, I can't build an object and then stringify it, mostly because I need to know which element I'm in the middle of adding to.
Overall it looks that you still might want to build an object, but in case you insist on not building it - here is some sample solution:
function Printer() {
var result = '',
lastLevel = null,
close = {0:'\n}', 1:']', 2:''},
delimiter = {0: ',\n', 1:',\n', 2:','};
function closeLevel(level, noDelimiter) {
if(lastLevel === null)
return;
var l = lastLevel, d = level == lastLevel;
while(l >= level) {
result += close[l] + (l == level && !noDelimiter ? delimiter[l]:'');
l--;
}
}
this.level0 = function(arg) {
closeLevel(0);
result += arg + ' = {\n';
lastLevel = 0;
};
this.level1 = function(arg) {
closeLevel(1);
result += '\t' + arg + ': [';
lastLevel = 1;
};
this.level2 = function(arg) {
closeLevel(2);
result += arg;
lastLevel = 2;
};
this.getResult = function() {
closeLevel(lastLevel, true);
return result;
}
}
var p = new Printer();
p.level0('head');
p.level1('sub1');
p.level2('item1');p.level2('item2');p.level2('item3');
p.level1('sub2');
p.level1('sub3');
p.level2('newthing');
p.level0('head2');
document.getElementById('textarea').value = p.getResult();
You could see it in action here.
I'm not sure why you're building what looks like objects with nested arrays, using string concatenation. Something like this would be much simpler, since it wouldn't require fixing trailing commas, etc:
Edit: I've updated the code to make it keep track of the last level put in.
function Db() {
var level0, level1;
var data = new Object();
this.level0 = function(arg) {
level0 = new Object();
data[arg] = level0;
}
this.level1 = function(arg) {
level1 = new Array();
level0[arg] = level1;
}
this.level2 = function(arg) {
level1.push(arg);
}
this.toString = function() {
var s = '';
for(i in data) {
s += i + '\n';
for(j in data[i]) {
if(data[i][j].length>0) {
s += '\t' + j + ': [' + data[i][j] + ']\n' ;
}
}
}
return s;
}
}
Use like this:
var db = new Db();
db.level0('head');
db.level1('sub1');
db.level2('item1');db.level2('item2');db.level2('item3');
I've tested this in the demo you linked and it works just fine.
Related
I'm using JavaScript to remove, order up, order down a text row, it runs normally in IE, but not in Chrome or Firefox.
When I run, I received a message from console bug:
Uncaught TypeError: Failed to execute 'removeChild' on 'Node': parameter 1 is not of type 'Node'.
How to fix the error?
function dels(index) {
var frm = document.writeForm;
var opts = frm['ans' + index].value = ''; // eval("frm.ans_list" + index + ".options");
for (var i = 0; i < opts.length; i++) {
if (opts[i].selected) {
opts[i--].removeChild(true);
}
}
eval("frm.ans" + index + ".value = '' ");
setting_val(index);
}
function up_move(index) {
var frm = document.writeForm;
var opts = eval("frm.ans_list" + index + ".options"); // frm['ans' + index].value = '';
for (var i = 0; i < opts.length; i++) {
if (opts[i].selected && i > 0) {
tmp = opts[i].cloneNode(true);
opts[i].removeChild(true);
opts[i - 1].insertAdjacentElement("beforeBegin", tmp).selected = true;
}
}
setting_val(index);
}
**(UPDATED)**
function down_move(index)
{
var frm = document.writeForm;
var opts=frm["ans_list" + index].options // eval("frm.ans_list" + index + ".options"); // frm['ans' + index].value = '';
for (var i=opts.length-1; i>=0; i--) {
if (opts[i].selected && i<opts.length-1) {
tmp = opts[i].cloneNode(true);
opts[i].removeChild(true);
opts[i].insertAdjacentElement("afterEnd", tmp).selected = true;
}
}
setting_val(index);
}
<span class="bt_test_admin bg_type_01">Delete</span>
<span class="bt_test_admin bg_type_01">▲ Order</span>
<span class="bt_test_admin bg_type_01">▼ Order</span>
Wrong use of removeChild
if (opts[i].selected) {
opts[i--].removeChild(true);
}
The function is intended as:
ParentNode.removeChild(ChildNode);
// OR
ChildNode.parentNode.removeChild(ChildNode);
MDN Documentation on removeChild
Also, you can replace all your evals
eval("frm.ans" + index + ".value = '' ")
eval("frm.ans_list" + index + ".options")
It would be better written as
frm["ans" + index].value = ""
frm["ans_list" + index].options
Finally,
tmp = opts[i].cloneNode(true);
opts[i].removeChild(true);
opts[i].insertAdjacentElement("afterEnd", tmp).selected = true;
Cloning a node, appending the clone, and removing the original would be optimized as moving the original to its new location.
But, you try to remove the original, then insert the clone after the original. It's odd.
If I correctly understood what you try to do, this function could help you.
function reverse_options_order(select_element)
{
// we store the current value to restore it after reordering
const selected_value = select_element.value;
// document fragment will temporarily hold the children
const fragment = document.createDocumentFragment();
while (select_element.lastChild)
{
// last child become first child, effectively reversing the order
fragment.appendChild(select_element.lastChild);
}
// appending a fragment is equal to appending all its children
// the fragment will "merge" with the select_element seamlessly
select_element.appendChild(fragment);
select_element.value = selected_value;
}
You can use the same method to reverse any nodes order
I'm trying to make a dropdown to display the results of a request given what the user writes in a field.
The problem I'm encountering is that when I try to add an onclick event to each item in the dropdown, only the last one acts like expected.
The dropdown is a section and I try to include sections in it.
Here is the dropdown :
<section id="projectDrop">
</section>
Here is the code :
var j = 0;
var tmp;
for (var i=0;((i<infos.projects.length) && (i<5));i++)
{
if (infos.projects[i].name.toLowerCase().match(projectName.value.toLowerCase()))
{
projectDrop.innerHTML += '<section id="project' + j + '">' + infos.projects[i].name + '</section>';
tmp = document.getElementById('project' + j);
projectDrop.style.height = (j+1)*20 + 'px';
tmp.style.top = j*20 + 'px';
tmp.style.height = '20 px';
tmp.style.width = '100%';
tmp.style.color = 'rgb(0, 0, 145)';
tmp.style.textAlign = 'center';
tmp.style.cursor = 'pointer';
tmp.style.zIndex = 5;
tmp.onclick = function(name, key)
{
return function()
{
return insertProject(name, key);
};
} (infos.projects[i].name, infos.projects[i].key);
++j;
}
}
The result is visually as I expected, I can see the dropdown with all my projects listed and a pointer while hovering etc...
But only the last project is clickable and trigger the "insertProject" function while the other do nothing.
If someone could help me solve that !
You need to store the key somewhere. Take a look at the solution below, I have used the data-key attribute on the <section> to store the key.
Also note how I have changed the code to create the element object and assign its properties, instead of building a raw string of HTML. The problem with building HTML as a string is you have to worry about escaping quotes, whereas this way you don't.
var j = 0;
var tmp;
for (var i=0;((i<infos.projects.length) && (i<5));i++)
{
if (infos.projects[i].name.toLowerCase().match(projectName.value.toLowerCase()))
{
tmp = document.createElement('section');
tmp.id = "project" + j;
tmp.setAttribute('data-key', infos.projects[i].key);
tmp.innerHTML = infos.projects[i].name;
projectDrop.style.height = (j+1)*20 + 'px';
tmp.style.top = j*20 + 'px';
tmp.style.height = '20 px';
tmp.style.width = '100%';
tmp.style.color = 'rgb(0, 0, 145)';
tmp.style.textAlign = 'center';
tmp.style.cursor = 'pointer';
tmp.style.zIndex = 5;
tmp.onclick = function(){
insertProject(this.innerHTML, this.getAttribute('data-key'));
};
projectDrop.appendChild(tmp);
++j;
}
}
Change:
tmp.onclick = function(name, key)
{
return function()
{
return insertProject(name, key);
};
} (infos.projects[i].name, infos.projects[i].key);
to
tmp.onclick = function(j){
return function(name, key)
{
return function()
{
return insertProject(name, key);
};
} (infos.projects[j].name, infos.projects[j].key);
}(i)
I am using an old version of UltraWebGrid by Infragistics and need to replace some of the built in javascript. The compiled js looks like its adding a bunch of functions to an object type as an api of sorts. formatted like:
var igtbl_ptsBand = ["functionname1",function(){...},"functionname2",function(){...},...
and so on. How would I override this?
Basically the control is adding html to the page in a way that is not compatible with newer browsers and the javascript code that does this just needs a little tweak. I found the code... I just need to change it.
The code can be found here
I added an answer to dump code examples in and whatnot. I will not select this answer
Similar SO question
The array you mentioned seems to be a function table of sorts:
var igtbl_ptsBand = ["func1", function() { }, "func2", function() { } ]
I would recommend using chaining instead of just an override. With chaining you can inject your own code, but still call the original function. Let's say you want to replace "func2" and chain. You could do something like this:
var origFunc, findex, ix;
if (igtbl_ptsBand.indexOf) {
// indexOf is supported, use it
findex = igtbl_ptsBand.indexOf("func2") + 1;
} else {
// Crippled browser such as IE, no indexOf, use loop
findex = -1;
for (ix = 0; ix < igtbl_ptsBand.length; ix += 2) {
if (igtbl_ptsBand[ix] === "func2") {
findex = ix + 1;
break;
}
}
}
if (findex >= 0) {
// Found it, chain
origFunc = igtbl_ptsBand[findex];
igtbl_ptsBand[findex] = function() {
// Your new pre-code here
// Call original func (chain)
origFunc();
// Your new post-code here
};
}
origFunc may have arguments, of course, and you may want to use the JavaScript call() function to set the "this pointer" to something specific, e.g.:
origFunc.call(customThis, arg1, arg2...);
If the arguments are in an array, you can use apply() instead of call().
I would not recommend doing this. You should always try to work with a third party library, not against it. That being said, this should work:
igtbl_ptsBand[igtbl_ptsBand.indexOf("functionYouWantToOverwrite") + 1] = function () {
// your new stuff...
};
Ok here is what I am doing. I will update this with my progress.
This fixes my problem. It turns out I had to apply the functionarray to all child objects of the parent "rows". To do this I added the code in my "Fix Rows" function. I split it up because i am running accross other browser JS error which I am fixing in this js file.
Here is the js file that I added to my .net page like so...
</form>
<script type="text/javascript" src="../../scripts/BrowserCompat.js"></script>
</body>
</html>
.
Brows_FixUltraWebGrid();
function Brows_FixUltraWebGrid() {
FixRows();
}
function FixRows() {
FixGridRows_render();
for (var i = 0; i < igtbl_ptsRows.length; i += 2)
igtbl_Rows.prototype[igtbl_ptsRows[i]] = igtbl_ptsRows[i + 1];
}
function FixGridRows_render() {
var origFunc, findex, ix;
if (igtbl_ptsRows.indexOf) {
// indexOf is supported, use it
findex = igtbl_ptsRows.indexOf("render") + 1;
} else {
// Crippled browser such as IE, no indexOf, use loop
findex = -1;
for (ix = 0; ix < igtbl_ptsRows.length; ix += 2) {
if (igtbl_ptsRows[ix] === "render") {
findex = ix + 1;
break;
}
}
}
if (findex >= 0) {
// Found it, chain
origFunc = igtbl_ptsRows[findex];
igtbl_ptsRows[findex] = function() {
// Your new pre-code here
// Call original func (chain)
//origFunc();
// Your new post-code here
var strTransform = this.applyXslToNode(this.Node);
if (strTransform) {
var anId = (this.AddNewRow ? this.AddNewRow.Id : null);
//new logic to include tbody if it is not there
var tadd1 = '';
var tadd2 = '';
if (!(/\<tbody\>/.test(strTransform))) {
tadd1 = '<tbody>';
tadd2 = '</tbody>';
}
this.Grid._innerObj.innerHTML =
"<table style=\"table-layout:fixed;\">" + tadd1 + strTransform + tadd2 + "</table>";
//old line
//this.Grid._innerObj.innerHTML = "<table style=\"table-layout:fixed;\">" + strTransform + "</table>";
var tbl = this.Element.parentNode;
igtbl_replaceChild(tbl, this.Grid._innerObj.firstChild.firstChild, this.Element);
igtbl_fixDOEXml();
var _b = this.Band;
var headerDiv = igtbl_getElementById(this.Grid.Id + "_hdiv");
var footerDiv = igtbl_getElementById(this.Grid.Id + "_fdiv");
if (this.AddNewRow) {
if (_b.Index > 0 || _b.AddNewRowView == 1 && !headerDiv || _b.AddNewRowView == 2 && !footerDiv) {
var anr = this.AddNewRow.Element;
anr.parentNode.removeChild(anr);
if (_b.AddNewRowView == 1 && tbl.tBodies[0].rows.length > 0)
tbl.tBodies[0].insertBefore(anr, tbl.tBodies[0].rows[0]);
else
tbl.tBodies[0].appendChild(anr);
}
this.AddNewRow.Element = igtbl_getElementById(anId);
this.AddNewRow.Element.Object = this.AddNewRow;
}
this.Element = tbl.tBodies[0];
this.Element.Object = this;
this._setupFilterRow();
for (var i = 0; i < this.Band.Columns.length; i++) {
var column = this.Band.Columns[i];
if (column.Selected && column.hasCells()) {
var col = this.getColumn(i);
if (col)
igtbl_selColRI(this.Grid.Id, col, this.Band.Index, i);
}
}
if (this.ParentRow) {
this.ParentRow.ChildRowsCount = this.length;
this.ParentRow.VisChildRowsCount = this.length;
}
}
console.log('overridden row render function executed');
};
}
}
I am trying to select dynamically all valid emails which are entered into a text area and spaced by either a space or a comma. (I'm not sure how to use a regex to achieve this in javascript).
My main issue is a number of false positives and extra duplicate information is being displayed, (I assume from using keyup), is there a way to fix this problem so it only shows each valid email once?
$(document).ready(function(){
$('.emails').keyup(function () {
var matches = $('.emails').val().split(' ');
for (var i = 0; i < matches.length; i++){
if (validEmail(matches[i])){
$('#emails-send').append("<div class='newmail'>" + matches[i] + "</div>");
}
}
});
function validEmail(emailAddress) {
var pattern = new RegExp(/^((([a-z]|\d|[!#\$%&'\*\+\-\/=\?\^_`{\|}~]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])+(\.([a-z]|\d|[!#\$%&'\*\+\-\/=\?\^_`{\|}~]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])+)*)|((\x22)((((\x20|\x09)*(\x0d\x0a))?(\x20|\x09)+)?(([\x01-\x08\x0b\x0c\x0e-\x1f\x7f]|\x21|[\x23-\x5b]|[\x5d-\x7e]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(\\([\x01-\x09\x0b\x0c\x0d-\x7f]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF]))))*(((\x20|\x09)*(\x0d\x0a))?(\x20|\x09)+)?(\x22)))#((([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])*([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])))\.)+(([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])*([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])))\.?$/i);
return pattern.test(emailAddress);
};
});
You could store the results in another object and check if they're already appended like this: http://jsfiddle.net/LKPwg/2/
this also uses a timeout to prevent incomplete emails addresses to be added.
var results = {};
var timer = null;
$(document).ready(function() {
$('.emails').keyup(function() {
clearTimeout(timer);
timer = setTimeout("parseEmails()", 500);
});
});
function parseEmails() {
var matches = $('.emails').val().split(' ');
for (var i = 0; i < matches.length; i++) {
if (validEmail(matches[i])) {
if (results[matches[i]] == undefined) {
results[matches[i]] = matches[i];
$('#emails-send').append("<div class='newmail'>" + matches[i] + "</div>");
}
}
}
}
function validEmail(emailAddress) {
var pattern = new RegExp(/^((([a-z]|\d|[!#\$%&'\*\+\-\/=\?\^_`{\|}~]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])+(\.([a-z]|\d|[!#\$%&'\*\+\-\/=\?\^_`{\|}~]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])+)*)|((\x22)((((\x20|\x09)*(\x0d\x0a))?(\x20|\x09)+)?(([\x01-\x08\x0b\x0c\x0e-\x1f\x7f]|\x21|[\x23-\x5b]|[\x5d-\x7e]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(\\([\x01-\x09\x0b\x0c\x0d-\x7f]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF]))))*(((\x20|\x09)*(\x0d\x0a))?(\x20|\x09)+)?(\x22)))#((([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])*([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])))\.)+(([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])*([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])))\.?$/i);
return pattern.test(emailAddress);
};
as for the comma/whitespace seperation, you could replace comma with whitespace first:
$('.emails').val().replace(',', ' ').split(' ')
UPDATE
http://jsfiddle.net/LKPwg/4/
alternate way of checking for duplicates, by searching for a substring of the found token in the results and just updating the result-div in that case:
(this example doesn't need the timeout and uses the data-attribute to identify results)
var substr = matches[i].substr(0, (matches[i].length - 1));
if (results[substr]) {
delete results[substr];
$('#emails-send .newmail[data-email="' + substr + '"]').html(matches[i]).attr('data-email', matches[i]);
}
else {
$('#emails-send').append("<div class='newmail' data-email=" + matches[i] + ">" + matches[i] + "</div>");
}
I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is an string array textContent.bidiTexts[]. You concatenate them to get the text of 1 page. Text blocks' coordinates are used to judge whether newline or space need to be inserted. (This may not be totally robust, but from my test it seems ok.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
function Pdf2TextClass(){
var self = this;
this.complete = 0;
/**
*
* #param data ArrayBuffer of the pdf file content
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done;
* 2) total number of pages in file.
* #param callbackAllDone The input parameter of callback function is
* the result of extracted text from pdf file.
*
*/
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
PDFJS.getDocument( data ).then( function(pdf) {
var div = document.getElementById('viewer');
var total = pdf.numPages;
callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
if( null != textContent.bidiTexts ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.bidiTexts.length; k++ ){
var block = textContent.bidiTexts[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere and you nee pdf.js and pdf.worker.js to get this working.
/**
* Extract text from PDFs with PDF.js
* Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
*/
this.pdfToText = function(data) {
PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
PDFJS.cMapPacked = true;
return PDFJS.getDocument(data).then(function(pdf) {
var pages = [];
for (var i = 0; i < pdf.numPages; i++) {
pages.push(i);
}
return Promise.all(pages.map(function(pageNumber) {
return pdf.getPage(pageNumber + 1).then(function(page) {
return page.getTextContent().then(function(textContent) {
return textContent.items.map(function(item) {
return item.str;
}).join(' ');
});
});
})).then(function(pages) {
return pages.join("\r\n");
});
});
}
usage:
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
function convert() {
var fr=new FileReader();
var pdff = new Pdf2TextClass();
fr.onload=function(){
pdff.pdfToText(fr.result, null, (text) => { document.getElementById('result').innerText += text; });
}
fr.readAsDataURL(document.getElementById('pdffile').files[0])
}
function Pdf2TextClass() {
var self = this;
this.complete = 0;
this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
console.assert(data instanceof ArrayBuffer || typeof data == 'string');
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function (pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++) {
pdf.getPage(i).then(function (page) {
var n = page.pageNumber;
page.getTextContent().then(function (textContent) {
//console.log(textContent.items[0]);0
if (null != textContent.items) {
var page_text = "";
var last_block = null;
for (var k = 0; k < textContent.items.length; k++) {
var block = textContent.items[k];
if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
if (block.x < last_block.x)
page_text += "\r\n";
else if (last_block.y != block.y && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total) {
window.setTimeout(function () {
var full_text = "";
var num_pages = Object.keys(layers).length;
for (var j = 1; j <= num_pages; j++)
full_text += layers[j];
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");
// listen for messages from the processor
window.addEventListener("message", function(event){
if (event.source != processor.contentWindow) return;
switch (event.data){
// "ready" = the processor is ready, so fetch the PDF file
case "ready":
var xhr = new XMLHttpRequest;
xhr.open('GET', input.getAttribute("src"), true);
xhr.responseType = "arraybuffer";
xhr.onload = function(event) {
processor.contentWindow.postMessage(this.response, "*");
};
xhr.send();
break;
// anything else = the processor has returned the text of the PDF
default:
output.textContent = event.data.replace(/\s+/g, " ");
break;
}
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
#gm2008's answer was a great starting point (please read it and its comments for more info), but needed some updates (08/19) and had some unused code. I also like examples that are more full. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');
let pathToPDF = 'path/to/myPDFfileToText.pdf';
let toText = Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
let onFinish = function(fullText) { console.log(fullText) };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
function Pdf2TextObj() {
let self = this;
this.complete = 0;
/**
*
* #param path Path to the pdf file.
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done.
* 2) total number of pages in file.
* 3) the `page` object itself or null.
* #param callbackAllDone Called after all text has been collected. Input parameters:
* 1) full text of parsed pdf.
*
*/
this.pdfToText = function(path, callbackPageDone, callbackAllDone) {
// console.assert(typeof path == 'string');
PDFJS.getDocument(path).promise.then(function(pdf) {
let total = pdf.numPages;
callbackPageDone(0, total, null);
let pages = {};
// For some (pdf?) reason these don't all come in consecutive
// order. That's why they're stored as an object and then
// processed one final time at the end.
for (let pagei = 1; pagei <= total; pagei++) {
pdf.getPage(pagei).then(function(page) {
let pageNumber = page.pageNumber;
page.getTextContent().then(function(textContent) {
if (null != textContent.items) {
let page_text = "";
let last_item = null;
for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
let item = textContent.items[itemsi];
// I think to add whitespace properly would be more complex and
// would require two loops.
if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
let itemX = item.transform[5]
let lastItemX = last_item.transform[5]
let itemY = item.transform[4]
let lastItemY = last_item.transform[4]
if (itemX < lastItemX)
page_text += "\r\n";
else if (itemY != lastItemY && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
} // ends if may need to add whitespace
page_text += item.str;
last_item = item;
} // ends for every item of text
textContent != null && console.log("page " + pageNumber + " finished.") // " content: \n" + page_text);
pages[pageNumber] = page_text + "\n\n";
} // ends if has items
++self.complete;
callbackPageDone(self.complete, total, page);
// If all done, put pages in order and combine all
// text, then pass that to the callback
if (self.complete == total) {
// Using `setTimeout()` isn't a stable way of making sure
// the process has finished. Watch out for missed pages.
// A future version might do this with promises.
setTimeout(function() {
let full_text = "";
let num_pages = Object.keys(pages).length;
for (let pageNum = 1; pageNum <= num_pages; pageNum++)
full_text += pages[pageNum];
callbackAllDone(full_text);
}, 1000);
}
}); // ends page.getTextContent().then
}); // ends page.then
} // ends for every page
});
}; // Ends pdfToText()
return self;
}; // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
function Pdf2TextClass(){
var self = this;
this.complete = 0;
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function(pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
//console.log(textContent.items[0]);0
if( null != textContent.items ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.items.length; k++ ){
var block = textContent.items[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
console.log(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
* Created by velten on 25.04.16.
*/
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
var pdfParser = require('pdf2json');
let pdfPipe = request({url: pdfUrl, encoding:null}).pipe(pdfParser);
pdfPipe.on("pdfParser_dataError", err => console.error(err) );
pdfPipe.on("pdfParser_dataReady", pdf => {
//optionally:
//let pdf = pdfParser.getMergedTextBlocksIfNeeded();
let count1 = 0;
//get text on a particular page
for (let page of pdf.formImage.Pages) {
count1 += page.Texts.length;
}
console.log(count1);
pdfParser.destroy();
});
It is possible but:
you would have to use the server anyway, there's no way you can get content of a file on user computer without transferring it to server and back
I don't thing anyone has written such library yet
So if you have some free time you can learn pdf format and write such a library yourself, or you can just use server side library of course.