I am interested in extracting links from sites where the links are dynamically generated with JavaScript and are essentially invisible in HTML source. For instance here is an example site where the links are inserted via a js menu:
http://www.stcroixwebsolutions.com/
When I hover with the mouse over the links, I see the links, but they are not discernible in HTML source.
I would like to output the links like so:
http://www.stcroixwebsolutions.com/?110000
http://www.stcroixwebsolutions.com/?110010
etc.
What do you recommend I use to extract these links?
You could try something like this... This will at least get you started!
http://jsfiddle.net/Qv4St/
/**
 * Lists the distinct, non-empty href of every anchor in the document inside
 * the element whose id is "linksList", one URL per line, and logs the same
 * list to the console once.
 */
function showLinks() {
  var links = document.getElementsByTagName('a');
  // Keyed by href so a URL that is linked several times is listed only once.
  var list = {};
  for (var i = 0; i < links.length; i++) {
    // Anchors without an href report an empty string; nothing to list for them.
    if (links[i].href) {
      list[links[i].href] = i;
    }
  }
  var hrefs = Object.keys(list);
  console.log(hrefs);

  var linksList = document.getElementById('linksList');
  // Assigning the object itself would render as "[object Object]", so the
  // markup is built explicitly; &, < and > are escaped because the value is
  // written through innerHTML.
  linksList.innerHTML = hrefs.map(function (href) {
    return href.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
  }).join('<br>');
}
/**
 * Collects the target of every anchor on the current page as an absolute URL.
 *
 * @returns {string} One URL per line, followed by a final line of the form
 *     "<count> total links".
 */
var getLinks = function () {
    "use strict";
    var anchors = document.getElementsByTagName("a"),
        found = [],
        rawHref = "",
        i = 0;
    for (i = 0; i < anchors.length; i += 1) {
        rawHref = anchors[i].getAttribute("href");
        // Same filter as before: skip anchors without an href attribute and
        // very short values such as "#".
        if (typeof rawHref === "string" && rawHref.length > 4) {
            // The href *property* is already resolved against the page by the
            // browser. Gluing the attribute onto a hand-trimmed location.href
            // dropped the "/" in front of "?query" links and resolved
            // root-relative links against the current directory.
            found.push(anchors[i].href);
        }
    }
    return found.join("\n") + "\n" + found.length + " total links";
},
    myLinks = getLinks(); //myLinks variable will contain the desired output.
//To output to the console just replace the line with 'return' with this code:
//console.log(d.join("\n") + "\n" + d.length + " total links");
Run this code to return a list of all hyperlinks on the given page in a list with each result on its own line.
EDIT: I now convert relative links to absolute URIs.
There is a standard document.links collection that is all the links in a document. Simply iterate over that.
Related
I'm using JavaScript to remove, order up, order down a text row, it runs normally in IE, but not in Chrome or Firefox.
When I run, I received a message from console bug:
Uncaught TypeError: Failed to execute 'removeChild' on 'Node': parameter 1 is not of type 'Node'.
How to fix the error?
/**
 * Deletes every selected option from the list `ans_list<index>` of the form
 * `writeForm`, clears the field `ans<index>` and then calls
 * setting_val(index).
 *
 * @param index Suffix that identifies the list/field pair inside the form.
 */
function dels(index) {
  var frm = document.writeForm;
  var opts = frm['ans_list' + index].options;
  // Walk backwards: `options` is live, so removing an entry shifts the ones
  // after it and a forward loop would skip neighbours.
  for (var i = opts.length - 1; i >= 0; i--) {
    if (opts[i].selected) {
      // removeChild is called on the parent and takes the child to remove.
      opts[i].parentNode.removeChild(opts[i]);
    }
  }
  frm['ans' + index].value = '';
  setting_val(index);
}
/**
 * Moves every selected option of the list `ans_list<index>` one position up,
 * then calls setting_val(index).
 *
 * @param index Suffix that identifies the list inside the form `writeForm`.
 */
function up_move(index) {
  var frm = document.writeForm;
  var opts = frm['ans_list' + index].options;
  // Start at 1: the first option has nothing above it.
  for (var i = 1; i < opts.length; i++) {
    if (opts[i].selected) {
      // insertBefore relocates the existing node, so no clone/remove pair is
      // needed and the option keeps its selected state.
      opts[i].parentNode.insertBefore(opts[i], opts[i - 1]);
    }
  }
  setting_val(index);
}
**(UPDATED)**
/**
 * Moves every selected option of the list `ans_list<index>` one position
 * down, then calls setting_val(index).
 *
 * @param index Suffix that identifies the list inside the form `writeForm`.
 */
function down_move(index)
{
  var frm = document.writeForm;
  var opts = frm["ans_list" + index].options;
  // Start at the second-to-last option (the last one cannot move down) and
  // walk upwards so an option that was just moved is not visited again.
  for (var i = opts.length - 2; i >= 0; i--) {
    if (opts[i].selected) {
      // Pulling the following option in front of the selected one swaps the
      // two without cloning, so the selection is preserved.
      opts[i].parentNode.insertBefore(opts[i + 1], opts[i]);
    }
  }
  setting_val(index);
}
<span class="bt_test_admin bg_type_01">Delete</span>
<span class="bt_test_admin bg_type_01">▲ Order</span>
<span class="bt_test_admin bg_type_01">▼ Order</span>
Wrong use of removeChild
if (opts[i].selected) {
opts[i--].removeChild(true);
}
The function is intended as:
ParentNode.removeChild(ChildNode);
// OR
ChildNode.parentNode.removeChild(ChildNode);
MDN Documentation on removeChild
Also, you can replace all your evals
eval("frm.ans" + index + ".value = '' ")
eval("frm.ans_list" + index + ".options")
It would be better written as
frm["ans" + index].value = ""
frm["ans_list" + index].options
Finally,
tmp = opts[i].cloneNode(true);
opts[i].removeChild(true);
opts[i].insertAdjacentElement("afterEnd", tmp).selected = true;
Cloning a node, appending the clone, and removing the original would be optimized as moving the original to its new location.
But, you try to remove the original, then insert the clone after the original. It's odd.
If I correctly understood what you try to do, this function could help you.
/**
 * Reverses the order of the children of `select_element` in place and
 * restores the value it had before the reordering.
 */
function reverse_options_order(select_element)
{
  // Remember the current value; it is written back once the children return.
  const previous_value = select_element.value;
  // Off-document holder for the children while they are being reordered.
  const holder = document.createDocumentFragment();
  // Repeatedly take the current last child and append it to the holder:
  // the former last child ends up first, which reverses the sequence.
  for (let tail = select_element.lastChild; tail; tail = select_element.lastChild)
  {
    holder.appendChild(tail);
  }
  // Appending the fragment moves all of its children into the element.
  select_element.appendChild(holder);
  select_element.value = previous_value;
}
You can use the same method to reverse any nodes order
I have a problem with the JavaScript replace function that I can't manage to resolve.
This is my code : https://jsfiddle.net/r36k20sa/1/
// Words to look for in `content` (a string defined elsewhere).
var tags = ['zazie', 'johnny'];
tags.forEach(function (tagName) {
  // Whole-word, case-insensitive, global, multiline match of the tag, guarded
  // by the two negative lookaheads that are meant to skip text inside <a>.
  var wholeWord = new RegExp("(?!<a.*?>.*?)(\\b" + tagName + "\\b)(?!.*?<\\/a>)", "igm");
  // '$1' re-inserts the captured word unchanged.
  // NOTE(review): the replacement presumably wrapped $1 in link markup that
  // is not present in this copy - confirm against the original fiddle.
  content = content.replace(wholeWord, '$1');
});
If I reverse the tags array ("johnny" first, then "zazie"), all tags are matched correctly; otherwise some tags are missed (the last one in this example). What could be the trick?
How can this be explained? It looks as if the JavaScript replace function runs asynchronously.
Thanks for your help.
Are you seriously using regex to process HTML when you have a DOM parser at your fingertips?
// Root element (id "content") that is handed to findTextNodes further down.
var content = document.getElementById('content');
/**
 * Depth-first collection of the text nodes below `root`, in document order.
 * Subtrees rooted at an A element are skipped entirely.
 *
 * @param root Node whose childNodes are scanned.
 * @param ret  Optional array to append to; a new array is used when omitted.
 * @returns The array holding the collected text nodes.
 */
function findTextNodes(root, ret) {
  var collected = ret || [];
  var kids = root.childNodes;
  var count = kids.length;
  for (var idx = 0; idx < count; idx++) {
    var kid = kids[idx];
    if (kid.nodeType == 3) { // TextNode
      collected.push(kid);
      continue;
    }
    // ElementNode: descend unless it is a link (BUTTON could be excluded the
    // same way if needed).
    if (kid.nodeType == 1 && kid.nodeName != "A") {
      findTextNodes(kid, collected);
    }
  }
  return collected;
}
// Text nodes under `content`, excluding text that is already inside a link.
var textNodes = findTextNodes(content);
// now search those text node contents for matching tags.
// `regexes` has to start out as an array; assigning into an uninitialised
// variable throws a TypeError.
var tags = ['zazie','johnny'], tagcount = tags.length, regexes = [], tag;
for( tag=0; tag<tagcount; tag++) {
  // "\\b" is required here: inside a string literal a single "\b" is the
  // backspace character, not the regex word boundary.
  regexes[tag] = new RegExp("\\b"+tags[tag]+"\\b","i");
}
var node, match, index, tagtext, newnode;
while(node = textNodes.shift()) {
  for( tag=0; tag<tagcount; tag++) {
    if( match = node.nodeValue.match(regexes[tag])) {
      index = match.index;
      // Split off the text after the match and queue it so it is scanned too.
      textNodes.unshift(node.splitText(index + tags[tag].length));
      // Split again so `tagtext` holds exactly the matched word.
      tagtext = node.splitText(index);
      newnode = document.createElement('a');
      newnode.href = "";
      newnode.className = "esk-seo-plu-link";
      newnode.style.cssText = "background:red;color:white";
      // Put the new link where the word was, then move the word inside it.
      tagtext.parentNode.replaceChild(newnode,tagtext);
      newnode.appendChild(tagtext);
    }
  }
}
// and done - no more action needed since it was in-place.
See it in action
Please replace . with \\.
// Words to look for in `content` (a string defined elsewhere).
var tags = ['zazie', 'johnny'];
tags.forEach(function (tagName) {
  // Same pattern as in the question, except that each "." inside the two
  // lookaheads is escaped, so those positions match literal dots only.
  var wholeWord = new RegExp("(?!<a.*?>\\.*?)(\\b" + tagName + "\\b)(?!\\.*?<\\/a>)", "igm");
  // '$1' re-inserts the captured word unchanged.
  content = content.replace(wholeWord, '$1');
});
Can someone give me a hand and tell me what does this "+e+" do in the following script (taken from
https://tracking.crealytics.com/lib/multi_conversion.min.js
)? I highlighted it in black:
/**
 * Conversion-tracking snippet, re-flowed from the minified original (the line
 * wrapping had split its string literals and `return` statements).
 * It publishes `__multi_conversion_tracking(e, n)` on the global object.
 */
(function () {
  var t, e, n;

  // Appends a hidden <div id="multi_conversion_tracking"> holding an iframe
  // whose src is the tracking URL built from the two arguments.
  // Returns the result of body.appendChild(div).
  this.__multi_conversion_tracking = function (e, n) {
    var i, c, r;
    i = document.getElementsByTagName("body")[0];
    c = document.createElement("div");
    c.id = "multi_conversion_tracking";
    c.style.display = "none";
    r = document.createElement("iframe");
    r.src = t(e, n, 1);
    c.appendChild(r);
    return i.appendChild(c);
  };

  // Scheme of the current page, without the trailing colon.
  n = function () {
    return "https:" === location.protocol.toLowerCase() ? "https" : "http";
  };

  // Builds the tracking URL: `t` becomes a path segment, `e` the value of the
  // `data` parameter and `i` (default 1) the `frame` parameter. `+ e +` is
  // plain string concatenation. The URL contains no spaces; the ones in the
  // pasted copy came from line wrapping.
  t = function (t, e, i) {
    if (null == i) {
      i = 1;
    }
    return "" + n() + "://tracking.crealytics.com/" + t + "/multi_check.php?data=" + e + "&random=" + (new Date).getTime() + "&frame=" + i;
  };

  // "ends with" helper; not called anywhere in this snippet.
  e = function (t, e) {
    return -1 !== t.indexOf(e, t.length - e.length);
  };
  // `this` is undefined when the code is loaded as a module; fall back to the
  // global object so the function is still published.
}).call(this || globalThis);
I am trying to figure out why this script is not parsing correctly the following tag:
<script
src="https://tracking.crealytics.com/lib/multi_conversion.min.js"></script>
<script type="text/javascript"> var transactionString =
{{CrealyticsProductsInfo}};__multi_conversion_tracking(70,
"transactionString"); </script> <noscript> <div style="display:inline;"> <img
src="https://tracking.crealytics.com/70/multi_check.php?data=transactionString">
</div> </noscript>
This is how I assign the variable in my tracking code:
// Build the element that carries the transaction data in a data attribute.
// `products_info` is defined elsewhere. The element is only created here;
// nothing in this snippet inserts it into the document.
var divElement = document.createElement("Div");
divElement.setAttribute('data-transaction-string', products_info);
divElement.setAttribute("id", "transactionString");
It is supposed to mimic the following div element:
<div id='transactionString' data-transaction-string='DATA'></div>
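The `__multi_conversion_tracking` function takes two parameters, `e` and `n`, and passes them on as `t(e, n, 1)`. Inside `t` the parameters are named `(t, e, i)`, so the `e` in `+ e +` is the second argument of `__multi_conversion_tracking` (the one it calls `n`); its value is appended to the `data` parameter of the query string being composed.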
`e` is just an argument of the function `t`, which concatenates it with the other parts of the URL.
__multi_conversion_tracking call t function r.src = t(e, n, 1)
/**
 * Conversion-tracking snippet, formatted for reading.
 * It publishes `__multi_conversion_tracking(e, n)` on the global object.
 */
(function() {
  var t, e, n;

  // Appends a hidden <div id="multi_conversion_tracking"> holding an iframe
  // whose src is the tracking URL built from the two arguments.
  // Returns the result of body.appendChild(div).
  this.__multi_conversion_tracking = function(e, n) {
    var i, c, r;
    i = document.getElementsByTagName("body")[0];
    c = document.createElement("div");
    c.id = "multi_conversion_tracking";
    c.style.display = "none";
    r = document.createElement("iframe");
    r.src = t(e, n, 1);
    c.appendChild(r);
    return i.appendChild(c);
  };

  // Scheme of the current page, without the trailing colon.
  n = function() {
    return "https:" === location.protocol.toLowerCase() ? "https" : "http";
  };

  // Builds the tracking URL: `t` becomes a path segment, `e` the value of the
  // `data` parameter and `i` (default 1) the `frame` parameter. `+ e +` is
  // plain string concatenation. No spaces are emitted around "?" and "&":
  // they would end up percent-encoded inside the request URL.
  t = function(t, e, i) {
    if (null == i) {
      i = 1;
    }
    return "" + n() + "://tracking.crealytics.com/" + t + "/multi_check.php?data=" + e + "&random=" + (new Date).getTime() + "&frame=" + i;
  };

  // "ends with" helper; not called anywhere in this snippet.
  e = function(t, e) {
    return -1 !== t.indexOf(e, t.length - e.length);
  };
  // `this` is undefined when the code is loaded as a module; fall back to the
  // global object so the function is still published.
}).call(this || globalThis)
You concat a string with a variable, or multiple variables, with this.
For example
// The value that gets spliced into the sentence.
var e = "johan855";
// "Hello " + e + "." joins the first literal, the value of e and the last
// literal, left to right, into a single string.
var string = "Hello " + e + ".";
console.log(string);
And / or
// Same result as the previous example, with every part held in a variable.
var e = "johan855";
var a = "Hello ";
var dot = ".";
// + concatenates the three string values in the order written.
var string = a + e + dot;
console.log(string);
will output
Hello johan855.
The "+e+" part in this javascript code is just a concatenation of the var e with other elements to create a string.
I'm trying to build a database based on some arbitrary data on a website. It's complex and changes for each site so I'll spare the details. Here's basically what I'm trying to do
// Each helper appends one fragment to the markup of `textarea` (an element
// defined elsewhere). None of them emits a closing bracket or brace.

// Opens a top-level entry: "<arg> = {".
function level0(arg) {
  var opener = arg + ' = {';
  textarea.innerHTML += opener;
}
// Opens a list on a new, tab-indented line: "<arg>: [".
function level1(arg) {
  var opener = '\n\t' + arg + ': [';
  textarea.innerHTML += opener;
}
// Adds one list item followed by ", ".
function level2(arg) {
  var entry = arg + ', ';
  textarea.innerHTML += entry;
}
And so on. The thing is some level1's don't have any children and I can't get the formatting right.
My three problems are as follows.
The ending commas are going to break in IE (thank you MS)
Empty level1's shouldn't be printed if they don't have any children
Closing /curly?brackets/
HERE'S A DEMO of what I have so far. Notice the ending commas, the empty sub2 which shouldn't be printed, and no closing brackets or braces
Do I need to redesign the entire thing?
Is there also a way to have this all in one function so I don't have to worry if I add another layer?
EDIT
This needs to be done in a string format, I can't build an object and then stringify it, mostly because I need to know which element I'm in the middle of adding to.
Overall it looks that you still might want to build an object, but in case you insist on not building it - here is some sample solution:
/**
 * Builds the nested "name = { sub: [items] }" text incrementally.
 *
 * level0(arg) starts a top-level entry, level1(arg) starts a list inside it,
 * level2(arg) adds an item to the current list. Whatever is still open is
 * closed automatically when a shallower or equal level starts, and the
 * matching delimiter is inserted between siblings.
 */
function Printer() {
    var result = '',
        lastLevel = null,
        close = {0:'\n}', 1:']', 2:''},
        delimiter = {0: ',\n', 1:',\n', 2:','};

    // Text that closes every open level from the current one down to `level`.
    // The delimiter of `level` itself is appended unless `noDelimiter` is set.
    // Returns '' while nothing has been printed yet. Does not change state.
    function closing(level, noDelimiter) {
        var text = '';
        if (lastLevel === null)
            return text;
        for (var l = lastLevel; l >= level; l--) {
            text += close[l] + (l == level && !noDelimiter ? delimiter[l] : '');
        }
        return text;
    }

    this.level0 = function(arg) {
        result += closing(0) + arg + ' = {\n';
        lastLevel = 0;
    };
    this.level1 = function(arg) {
        result += closing(1) + '\t' + arg + ': [';
        lastLevel = 1;
    };
    this.level2 = function(arg) {
        result += closing(2) + arg;
        lastLevel = 2;
    };

    // Returns the text built so far with every open level closed.
    // Closing always goes down to level 0 (closing only the current level
    // left "[" and "{" open after a trailing level1/level2 call), and the
    // closers are not stored, so repeated calls return the same text.
    this.getResult = function() {
        return result + closing(0, true);
    };
}
// Demo: replay a fixed sequence of level calls, then show the text.
var p = new Printer();
[
    ['level0', 'head'],
    ['level1', 'sub1'],
    ['level2', 'item1'],
    ['level2', 'item2'],
    ['level2', 'item3'],
    ['level1', 'sub2'],
    ['level1', 'sub3'],
    ['level2', 'newthing'],
    ['level0', 'head2']
].forEach(function (step) {
    // step[0] names the Printer method, step[1] is its argument.
    p[step[0]](step[1]);
});
document.getElementById('textarea').value = p.getResult();
You could see it in action here.
I'm not sure why you're building what looks like objects with nested arrays, using string concatenation. Something like this would be much simpler, since it wouldn't require fixing trailing commas, etc:
Edit: I've updated the code to make it keep track of the last level put in.
/**
 * Collects the data as real objects and arrays and renders it on demand.
 *
 * level0(arg) starts a new top-level entry, level1(arg) starts a new list
 * inside the most recent entry, level2(arg) adds an item to the most recent
 * list. level1 must be preceded by level0, and level2 by level1.
 */
function Db() {
    var level0, level1;
    var data = {};

    this.level0 = function(arg) {
        level0 = {};
        data[arg] = level0;
    };
    this.level1 = function(arg) {
        level1 = [];
        level0[arg] = level1;
    };
    this.level2 = function(arg) {
        level1.push(arg);
    };

    // Renders each entry name on its own line, followed by one tab-indented
    // "name: [a,b,c]" line per non-empty list; empty lists are omitted.
    this.toString = function() {
        var s = '';
        // The loop variables are declared locally: undeclared they leak into
        // the global scope and throw in strict or module code.
        Object.keys(data).forEach(function(head) {
            s += head + '\n';
            Object.keys(data[head]).forEach(function(sub) {
                var items = data[head][sub];
                if (items.length > 0) {
                    s += '\t' + sub + ': [' + items.join(',') + ']\n';
                }
            });
        });
        return s;
    };
}
Use like this:
// Demo: one entry "head" with one list "sub1" holding three items.
var db = new Db();
db.level0('head');
db.level1('sub1');
['item1', 'item2', 'item3'].forEach(function (item) {
    db.level2(item);
});
I've tested this in the demo you linked and it works just fine.
I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is an array of text blocks, textContent.bidiTexts[]. You concatenate their strings to get the text of one page. The text blocks' coordinates are used to judge whether a newline or a space needs to be inserted. (This may not be totally robust, but from my tests it seems OK.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
/**
 * Extracts the text of a PDF with pdf.js through the global `PDFJS` object.
 *
 * `complete` holds the number of pages finished by the most recent
 * pdfToText() call.
 */
function Pdf2TextClass(){
    var self = this;
    this.complete = 0;

    // Reads the two coordinates the whitespace heuristic compares.
    // Blocks that carry `x`/`y` (the bidiTexts format) are used as they are.
    // NOTE(review): blocks that only carry a `transform` array are read as
    // transform[5] / transform[4], the same choice the Node variant of this
    // code makes - confirm against real documents.
    function coordsOf(block){
        var t = block.transform;
        return {
            x: block.x !== undefined ? block.x : (t ? t[5] : undefined),
            y: block.y !== undefined ? block.y : (t ? t[4] : undefined)
        };
    }

    // Concatenates the strings of one page. Between two blocks, unless the
    // previous one already ends with a space: a line break when `x` went
    // backwards; otherwise a space when `y` changed and the previous block is
    // not a lone letter (or does not end in " <letter>").
    function blocksToText(blocks){
        var page_text = "";
        var last_block = null;
        var last_pos = null;
        for( var k = 0; k < blocks.length; k++ ){
            var block = blocks[k];
            var pos = coordsOf(block);
            if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
                if( pos.x < last_pos.x )
                    page_text += "\r\n";
                else if ( last_pos.y != pos.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
                    page_text += ' ';
            }
            page_text += block.str;
            last_block = block;
            last_pos = pos;
        }
        return page_text;
    }

    /**
     *
     * @param data ArrayBuffer of the pdf file content, or a URL string
     * @param callbackPageDone To inform the progress each time
     *        when a page is finished. The callback function's input parameters are:
     *        1) number of pages done;
     *        2) total number of pages in file.
     *        Skipped when it is not a function.
     * @param callbackAllDone The input parameter of callback function is
     *        the result of extracted text from pdf file.
     *
     */
    this.pdfToText = function(data, callbackPageDone, callbackAllDone){
        console.assert( data instanceof ArrayBuffer  || typeof data == 'string' );
        // Counted per call so the instance can be reused for another file.
        var done = 0;
        self.complete = 0;
        PDFJS.getDocument( data ).then( function(pdf) {
            var total = pdf.numPages;
            if (typeof callbackPageDone == 'function')
                callbackPageDone( 0, total );
            // Pages finish in arbitrary order; keep them by page number.
            var layers = {};
            for (var i = 1; i <= total; i++){
                pdf.getPage(i).then( function(page){
                    var n = page.pageNumber;
                    page.getTextContent().then( function(textContent){
                        // Older builds expose `bidiTexts`, newer ones `items`.
                        var blocks = textContent.bidiTexts || textContent.items;
                        if( null != blocks ){
                            layers[n] = blocksToText(blocks) + "\n\n";
                            console.log("page " + n + " finished.");
                        } else {
                            // Keep an entry for the page so nothing undefined
                            // is concatenated into the final text.
                            layers[n] = "";
                        }
                        done += 1;
                        self.complete = done;
                        if (typeof callbackPageDone == 'function')
                            callbackPageDone( done, total );
                        // Every page has stored its text by the time the
                        // counter reaches the total, so no delay is needed.
                        if (done == total){
                            var full_text = "";
                            for( var j = 1; j <= total; j++)
                                full_text += layers[j];
                            callbackAllDone(full_text);
                        }
                    }); // end of page.getTextContent().then
                }); // end of page.then
            } // of for
        });
    }; // end of pdfToText()
} // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere, and you need pdf.js and pdf.worker.js to get this working.
/**
* Extract text from PDFs with PDF.js
* Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
*/
// Resolves with the text of every page; the strings of one page are joined
// with single spaces and the pages with "\r\n".
this.pdfToText = function(data) {
    PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
    PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
    PDFJS.cMapPacked = true;

    // Text of one page (page numbers start at 1).
    function readPage(pdf, pageNumber) {
        return pdf.getPage(pageNumber).then(function(page) {
            return page.getTextContent().then(function(textContent) {
                var strings = textContent.items.map(function(item) {
                    return item.str;
                });
                return strings.join(' ');
            });
        });
    }

    return PDFJS.getDocument(data).then(function(pdf) {
        var pending = [];
        for (var pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
            pending.push(readPage(pdf, pageNumber));
        }
        // Promise.all keeps the page order regardless of completion order.
        return Promise.all(pending).then(function(pages) {
            return pages.join("\r\n");
        });
    });
}
usage:
// `self` and `files` come from the surrounding application code.
// The promise resolves with the text of all pages.
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
// Click handler of the "Process" button: reads the chosen file as a data URL
// and appends the extracted text to the #result element.
function convert() {
    var reader = new FileReader();
    var extractor = new Pdf2TextClass();
    reader.onload = function () {
        var appendText = (text) => {
            document.getElementById('result').innerText += text;
        };
        // No per-page progress callback is supplied.
        extractor.pdfToText(reader.result, null, appendText);
    };
    var chosenFile = document.getElementById('pdffile').files[0];
    reader.readAsDataURL(chosenFile);
}
/**
 * Extracts the text of a PDF with pdf.js through the global `pdfjsLib`.
 *
 * `complete` holds the number of pages finished by the most recent
 * pdfToText() call.
 */
function Pdf2TextClass() {
    var self = this;
    this.complete = 0;

    // Reads the two coordinates the whitespace heuristic compares.
    // Blocks that carry `x`/`y` are used as they are.
    // NOTE(review): text items that only carry a `transform` array are read
    // as transform[5] / transform[4], the same choice the Node variant of
    // this code makes - confirm against real documents. Without this
    // fallback both values are undefined and no whitespace is ever inserted.
    function coordsOf(block) {
        var t = block.transform;
        return {
            x: block.x !== undefined ? block.x : (t ? t[5] : undefined),
            y: block.y !== undefined ? block.y : (t ? t[4] : undefined)
        };
    }

    // Concatenates the strings of one page. Between two blocks, unless the
    // previous one already ends with a space: a line break when `x` went
    // backwards; otherwise a space when `y` changed and the previous block is
    // not a lone letter (or does not end in " <letter>").
    function blocksToText(blocks) {
        var page_text = "";
        var last_block = null;
        var last_pos = null;
        for (var k = 0; k < blocks.length; k++) {
            var block = blocks[k];
            var pos = coordsOf(block);
            if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
                if (pos.x < last_pos.x)
                    page_text += "\r\n";
                else if (last_pos.y != pos.y && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
                    page_text += ' ';
            }
            page_text += block.str;
            last_block = block;
            last_pos = pos;
        }
        return page_text;
    }

    /**
     * @param data ArrayBuffer of the pdf file content, or a URL string
     *        (a data URL works as well).
     * @param callbackPageDone Optional progress callback, called with
     *        1) number of pages done; 2) total number of pages in file.
     *        Skipped when it is not a function.
     * @param callbackAllDone Called once with the extracted text of the
     *        whole file.
     */
    this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
        console.assert(data instanceof ArrayBuffer || typeof data == 'string');
        // Counted per call so the instance can be reused for another file.
        var done = 0;
        self.complete = 0;
        var loadingTask = pdfjsLib.getDocument(data);
        loadingTask.promise.then(function (pdf) {
            // Public page count instead of the private `_pdfInfo` field.
            var total = pdf.numPages;
            if (typeof callbackPageDone == 'function')
                callbackPageDone(0, total);
            // Pages finish in arbitrary order; keep them by page number.
            var layers = {};
            for (var i = 1; i <= total; i++) {
                pdf.getPage(i).then(function (page) {
                    var n = page.pageNumber;
                    page.getTextContent().then(function (textContent) {
                        if (null != textContent.items) {
                            layers[n] = blocksToText(textContent.items) + "\n\n";
                            console.log("page " + n + " finished.");
                        } else {
                            // Keep an entry for the page so nothing undefined
                            // is concatenated into the final text.
                            layers[n] = "";
                        }
                        done += 1;
                        self.complete = done;
                        if (typeof callbackPageDone == 'function')
                            callbackPageDone(done, total);
                        // Every page has stored its text by the time the
                        // counter reaches the total, so no delay is needed.
                        if (done == total) {
                            var full_text = "";
                            for (var j = 1; j <= total; j++)
                                full_text += layers[j];
                            callbackAllDone(full_text);
                        }
                    }); // end of page.getTextContent().then
                }); // end of page.then
            } // of for
        });
    }; // end of pdfToText()
} // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");

// listen for messages from the processor
window.addEventListener("message", function(event){
  // Ignore messages that do not come from the processor frame.
  if (event.source != processor.contentWindow) return;

  // "ready" = the processor is ready, so fetch the PDF file
  if (event.data === "ready") {
    var xhr = new XMLHttpRequest;
    xhr.open('GET', input.getAttribute("src"), true);
    xhr.responseType = "arraybuffer";
    xhr.onload = function(event) {
      // Hand the downloaded bytes to the processor frame.
      processor.contentWindow.postMessage(xhr.response, "*");
    };
    xhr.send();
    return;
  }

  // anything else = the processor has returned the text of the PDF;
  // runs of whitespace are collapsed to single spaces before display
  output.textContent = event.data.replace(/\s+/g, " ");
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
@gm2008's answer was a great starting point (please read it and its comments for more info), but needed some updates (08/19) and had some unused code. I also like examples that are more complete. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');

let pathToPDF = 'path/to/myPDFfileToText.pdf';

// Nothing to do between pages.
let onPageDone = () => {};
// Print the whole text once every page has been read.
let onFinish = (fullText) => { console.log(fullText); };

let toText = Pdf2TextObj();
toText.pdfToText(pathToPDF, onPageDone, onFinish);
/**
 * Factory for a PDF-to-text converter built on the `PDFJS` module.
 *
 * Returns an object with `pdfToText(path, callbackPageDone, callbackAllDone)`
 * and `complete`, the number of pages finished by the most recent call.
 */
function Pdf2TextObj() {
    // A plain object rather than `this`: the factory is called without `new`,
    // where `this` is the global object in sloppy mode and undefined in
    // strict or module code.
    let self = {};
    self.complete = 0;

    // Concatenates the strings of one page. Between two items, unless the
    // previous one already ends with a space: a line break when transform[5]
    // decreased; otherwise a space when transform[4] changed and the previous
    // item is not a lone letter (or does not end in " <letter>").
    // NOTE(review): transform[4] / transform[5] look like the horizontal and
    // vertical translation of the item - confirm against the pdf.js docs.
    function itemsToText(items) {
        let page_text = "";
        let last_item = null;
        for (let itemsi = 0; itemsi < items.length; itemsi++) {
            let item = items[itemsi];
            if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
                let breakCoord = item.transform[5];
                let lastBreakCoord = last_item.transform[5];
                let gapCoord = item.transform[4];
                let lastGapCoord = last_item.transform[4];
                if (breakCoord < lastBreakCoord)
                    page_text += "\r\n";
                else if (gapCoord != lastGapCoord && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
                    page_text += ' ';
            } // ends if may need to add whitespace
            page_text += item.str;
            last_item = item;
        } // ends for every item of text
        return page_text;
    }

    /**
     *
     * @param path Path to the pdf file.
     * @param callbackPageDone To inform the progress each time
     *        when a page is finished. The callback function's input parameters are:
     *        1) number of pages done.
     *        2) total number of pages in file.
     *        3) the `page` object itself or null.
     * @param callbackAllDone Called after all text has been collected. Input parameters:
     *        1) full text of parsed pdf.
     *
     */
    self.pdfToText = function(path, callbackPageDone, callbackAllDone) {
        // Counted per call so the object can be reused for another file.
        let done = 0;
        self.complete = 0;
        PDFJS.getDocument(path).promise.then(function(pdf) {
            let total = pdf.numPages;
            callbackPageDone(0, total, null);

            // Pages do not necessarily finish in consecutive order. That's
            // why they're stored by page number and combined at the end.
            let pages = {};
            for (let pagei = 1; pagei <= total; pagei++) {
                pdf.getPage(pagei).then(function(page) {
                    let pageNumber = page.pageNumber;
                    page.getTextContent().then(function(textContent) {
                        if (null != textContent.items) {
                            pages[pageNumber] = itemsToText(textContent.items) + "\n\n";
                            console.log("page " + pageNumber + " finished.");
                        } else {
                            // Keep an entry for the page so nothing undefined
                            // is concatenated into the final text.
                            pages[pageNumber] = "";
                        }
                        done += 1;
                        self.complete = done;
                        callbackPageDone(done, total, page);

                        // Every page has stored its text by the time the
                        // counter reaches the total, so the pages can be put
                        // in order and handed over without any delay.
                        if (done == total) {
                            let full_text = "";
                            for (let pageNum = 1; pageNum <= total; pageNum++)
                                full_text += pages[pageNum];
                            callbackAllDone(full_text);
                        }
                    }); // ends page.getTextContent().then
                }); // ends page.then
            } // ends for every page
        });
    }; // Ends pdfToText()

    return self;
} // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
/**
 * Extracts the text of a PDF with pdf.js through the global `pdfjsLib`.
 *
 * `complete` holds the number of pages finished by the most recent
 * pdfToText() call.
 */
function Pdf2TextClass(){
    var self = this;
    this.complete = 0;

    // Reads the two coordinates the whitespace heuristic compares.
    // Blocks that carry `x`/`y` are used as they are.
    // NOTE(review): text items that only carry a `transform` array are read
    // as transform[5] / transform[4], the same choice the Node variant of
    // this code makes - confirm against real documents. Without this
    // fallback both values are undefined and no whitespace is ever inserted.
    function coordsOf(block){
        var t = block.transform;
        return {
            x: block.x !== undefined ? block.x : (t ? t[5] : undefined),
            y: block.y !== undefined ? block.y : (t ? t[4] : undefined)
        };
    }

    // Concatenates the strings of one page. Between two blocks, unless the
    // previous one already ends with a space: a line break when `x` went
    // backwards; otherwise a space when `y` changed and the previous block is
    // not a lone letter (or does not end in " <letter>").
    function blocksToText(blocks){
        var page_text = "";
        var last_block = null;
        var last_pos = null;
        for( var k = 0; k < blocks.length; k++ ){
            var block = blocks[k];
            var pos = coordsOf(block);
            if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
                if( pos.x < last_pos.x )
                    page_text += "\r\n";
                else if ( last_pos.y != pos.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
                    page_text += ' ';
            }
            page_text += block.str;
            last_block = block;
            last_pos = pos;
        }
        return page_text;
    }

    /**
     * @param data ArrayBuffer of the pdf file content, or a URL string.
     * @param callbackPageDone Optional progress callback, called with
     *        1) number of pages done; 2) total number of pages in file.
     * @param callbackAllDone Optional; receives the extracted text of the
     *        whole file. When it is omitted the text is written to the
     *        console instead.
     */
    this.pdfToText = function(data, callbackPageDone, callbackAllDone){
        console.assert( data instanceof ArrayBuffer  || typeof data == 'string' );
        // Counted per call so the instance can be reused for another file.
        var done = 0;
        self.complete = 0;
        var loadingTask = pdfjsLib.getDocument(data);
        loadingTask.promise.then(function(pdf) {
            // Public page count instead of the private `_pdfInfo` field.
            var total = pdf.numPages;
            if (typeof callbackPageDone == 'function')
                callbackPageDone( 0, total );
            // Pages finish in arbitrary order; keep them by page number.
            var layers = {};
            for (var i = 1; i <= total; i++){
                pdf.getPage(i).then( function(page){
                    var n = page.pageNumber;
                    page.getTextContent().then( function(textContent){
                        if( null != textContent.items ){
                            layers[n] = blocksToText(textContent.items) + "\n\n";
                            console.log("page " + n + " finished.");
                        } else {
                            // Keep an entry for the page so nothing undefined
                            // is concatenated into the final text.
                            layers[n] = "";
                        }
                        done += 1;
                        self.complete = done;
                        if (typeof callbackPageDone == 'function')
                            callbackPageDone( done, total );
                        // Every page has stored its text by the time the
                        // counter reaches the total, so no delay is needed.
                        if (done == total){
                            var full_text = "";
                            for( var j = 1; j <= total; j++)
                                full_text += layers[j];
                            if (typeof callbackAllDone == 'function')
                                callbackAllDone(full_text);
                            else
                                console.log(full_text);
                        }
                    }); // end of page.getTextContent().then
                }); // end of page.then
            } // of for
        });
    }; // end of pdfToText()
} // end of class
// Usage: create an extractor and start it on a PDF URL
// ('PDF_URL' is presumably a placeholder for a real address).
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
 * Created by velten on 25.04.16.
 *
 * Downloads a PDF, pipes it into pdf2json and prints the total number of
 * text entries found across all pages.
 */
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
// NOTE(review): the module export itself is used as the stream target here;
// check whether the installed pdf2json version expects an instance instead.
var pdfParser = require('pdf2json');

// encoding:null makes request deliver the body as raw bytes.
let pdfPipe = request({url: pdfUrl, encoding:null}).pipe(pdfParser);

pdfPipe.on("pdfParser_dataError", (err) => {
    console.error(err);
});

pdfPipe.on("pdfParser_dataReady", (pdf) => {
    //optionally:
    //let pdf = pdfParser.getMergedTextBlocksIfNeeded();

    // Sum the number of text entries of every page.
    let count1 = 0;
    for (const { Texts } of pdf.formImage.Pages) {
        count1 += Texts.length;
    }
    console.log(count1);
    pdfParser.destroy();
});
It is possible but:
you would have to use the server anyway; there's no way you can get the content of a file on the user's computer without transferring it to the server and back
I don't think anyone has written such a library yet
So if you have some free time you can learn pdf format and write such a library yourself, or you can just use server side library of course.