after dynamicly creating some iframes, setting src and onload, I expect that the contents would be available for putting it to an array and sort the array. Next would be removing the iframes (interesting content already put in array) and create a table on my HTML-page. When no alert is set before sorting, the array is empty as the onload fires just before finishing the script. When an alert is place before sorting,I found the onload-procedure fired, array was populated and everything works fine?!! But I don't want to put an alert, can anyone explain what I did wrong? It's my first script so please help me to understand.
function LoadFile( ) {
var FName, PName, myID, myFrame;
var myBody = document.getElementById("StartList");
for ( var i = 0; i < FileList.length; i++ ) {
// Read input file into frame
FName = FileList[i].FName;
PName = FName + ".html";
myID = "iframe" + i;
// Create frame.
myFrame = document.createElement("iframe");
myFrame.setAttribute("id", myID);
myFrame.setAttribute("width","0");
myFrame.setAttribute("height","0");
myFrame.setAttribute('src', PName);
//Attach onload-event to frame, triggering ReadTableInfo.
if (myFrame.attachEvent){
myFrame.attachEvent("onload", function(){
ReadTableInfo();
});
} else {
myFrame.onload = function(){
ReadTableInfo();
};
}
myBody.appendChild(myFrame);
}
}
function ReadTableInfo() {
var a = document.getElementsByTagName("iframe")[idx];
var p = FileList[idx].FName;
var b = (a.contentWindow || a.contentDocument);
var td;
if ( b.document) {
b = b.document;
// Get and process table with functions and procedures.
var myTable = b.getElementsByTagName("Table")[5];
var myList = myTable.getElementsByTagName("TR");
var Name = "";
var Desc = "";
for ( var j = 0; j < myList.length; j++) {
Name = myTable.getElementsByTagName("TR") [j].getElementsByTagName("A")[0].innerHTML;
if ( myTable.getElementsByTagName("TR")[j].getElementsByTagName("TD")[1] != null) {
td = myTable.getElementsByTagName("TR")[j].getElementsByTagName("TD")[1];
Desc = td.textContent || td.innerText || "";
}
if ( searchval == "" || ( TestVal.test(Name) && searchkey == 1 ) || ( TestVal.test(Desc) && searchkey == 2 ) ) {
ProcList[ProcList.length++] = new AddProcList(Name.toLowerCase(), p.toLowerCase(), Desc);
}
Name = "";
Desc = "";
}
idx++;
}
}
function UpdateList( opt ) {
searchval = document.getElementById("edtSearchVal").value;
TestVal = new RegExp(".", "i");
if ( searchval !== "" ) {
if ( opt == 2 ) {
TestVal = new RegExp(searchval, "i"); // searchpattern for RegExp descriptions
} else {
TestVal = new RegExp(searchval.replace(" ","_"), "i"); // searchpattern for RegExp.
}
}
switch ( opt ) {
case 1: searchkey = 1;
break;
case 2: searchkey = 2;
break;
default:
searchkey = 3;
}
Init();
// Get package names from index.
SetFileList(); // Determines which external files to examine.
LoadFile(); // Loads the external files into iframes to be used later.
alert("Start generating list, this may take a while."); // Necessary to read frames! Don't know why???
var sortkeys = {FName:"a",PName:"a"}; // Sorting order of ProcList, which will be the listorder.
ProcList.keySort(sortkeys); // Sort ProcList.
TableCreate(); // Make new table with all entries in ProcList
}
Thanks for any comments, just here to learn from others :))
your LoadFile(); is loading content to iframe , and alert() is giving just enough time for iframe content to load completely, hence rest of the code works. If you remove alert then rest of the code is executed immediately before iframe content is loaded.
I would suggest to put onload even inside iframe page as well. Then propagate this event to parent window. To access parent container window you can do window.parent inside iframe.
Related
I have a function for adding the contents of a separate google document at the cursor point within the active document, but I haven't been able to get it to work. I keep getting the "Element does not contain the specified child element" exception, and then the contents gets pasted to the bottom of the document rather than at the cursor point!
function AddTable() {
//here you need to get document id from url (Example, 1oWyVMa-8fzQ4leCrn2kIk70GT5O9pqsXsT88ZjYE_z8)
var FileTemplateFileId = "1MFG06knf__tcwHWdybaBk124Ia_Mb0gBE0Gk8e0URAM"; //Browser.inputBox("ID der Serienbriefvorlage (aus Dokumentenlink kopieren):");
var doc = DocumentApp.openById(FileTemplateFileId);
var DocName = doc.getName();
//Create copy of the template document and open it
var docCopy = DocumentApp.getActiveDocument();
var totalParagraphs = doc.getBody().getParagraphs(); // get the total number of paragraphs elements
Logger.log(totalParagraphs);
var cursor = docCopy.getCursor();
var totalElements = doc.getNumChildren();
var elements = [];
for (var j = 0; j < totalElements; ++j) {
var body = docCopy.getBody();
var element = doc.getChild(j).copy();
var type = element.getType();
if (type == DocumentApp.ElementType.PARAGRAPH) {
body.appendParagraph(element);
} else if (type == DocumentApp.ElementType.TABLE) {
body.appendTable(element);
} else if (type == DocumentApp.ElementType.LIST_ITEM) {
body.appendListItem(element);
}
// ...add other conditions (headers, footers...
}
Logger.log(element.editAsText().getText());
elements.push(element); // store paragraphs in an array
Logger.log(element.editAsText().getText());
for (var el = 0; el < elements.length; el++) {
var paragraph = elements[el].copy();
var doc = DocumentApp.getActiveDocument();
var bodys = doc.getBody();
var cursor = doc.getCursor();
var element = cursor.getElement();
var container = element.getParent();
try {
var childIndex = body.getChildIndex(container);
bodys.insertParagraph(childIndex, paragraph);
} catch (e) {
DocumentApp.getUi().alert("There was a problem: " + e.message);
}
}
}
You want to copy the objects (paragraphs, tables and lists) from the document of 1MFG06knf__tcwHWdybaBk124Ia_Mb0gBE0Gk8e0URAM to the active Document.
You want to copy the objects to the cursor position on the active Document.
You want to achieve this using Google Apps Script.
If my understanding is correct, how about this answer? Please think of this as just one of several possible answers.
Modification points:
In your script, appendParagraph, appendTable and appendListItem are used at the 1st for loop. I think that the reason that the copied objects are put to the last of the document is due to this.
var body = docCopy.getBody(); can be put to the out of the for loop.
In your case, I think that when the 1st for loop is modified, 2nd for loop is not required.
When above points are reflected to your script, it becomes as follows.
Modified script:
function AddTable() {
var FileTemplateFileId = "1MFG06knf__tcwHWdybaBk124Ia_Mb0gBE0Gk8e0URAM";
var doc = DocumentApp.openById(FileTemplateFileId);
var docCopy = DocumentApp.getActiveDocument();
var body = docCopy.getBody();
var cursor = docCopy.getCursor();
var cursorPos = docCopy.getBody().getChildIndex(cursor.getElement());
var totalElements = doc.getNumChildren();
for (var j = 0; j < totalElements; ++j) {
var element = doc.getChild(j).copy();
var type = element.getType();
if (type == DocumentApp.ElementType.PARAGRAPH) {
body.insertParagraph(cursorPos + j, element);
} else if (type == DocumentApp.ElementType.TABLE) {
body.insertTable(cursorPos + j, element);
} else if (type == DocumentApp.ElementType.LIST_ITEM) {
body.insertListItem(cursorPos + j, element);
}
}
}
It seems that DocName is not used in your script.
References:
insertParagraph()
insertTable()
insertListItem()
If I misunderstood your question and this was not the result you want, I apologize. At that time, can you provide the sample source Document? By this, I would like to confirm it.
I have a problem with the javascript replace function and I don't succeed to resolve it.
This is my code : https://jsfiddle.net/r36k20sa/1/
var tags = ['zazie', 'johnny'];
tags.forEach(function(element) {
content = content.replace(
new RegExp("(?!<a.*?>.*?)(\\b" + element + "\\b)(?!.*?<\\/a>)", "igm"),
'$1'
);
});
In the tags array, if I reverse the array "johnny" then "zazie" all tags are well selected otherwise, some tags are missing. (The last in this example). What can be the trick?
What can be explained that ? It seems like the javascript replace function runs asynchronous?
Thanks for your help.
Are you seriously using regex to process HTML when you have a DOM parser at your fingertips?
var content = document.getElementById('content');
function findTextNodes(root,ret) {
// recursively descend into child nodes and return an array of text nodes
var children = root.childNodes, l = children.length, i;
ret = ret || [];
for( i=0; i<l; i++) {
if( children[i].nodeType == 1) { // ElementNode
// excluding A tags here, you might also want to exclude BUTTON tags
if( children[i].nodeName != "A") {
findTextNodes(children[i],ret);
}
}
if( children[i].nodeType == 3) { // TextNode
ret.push(children[i]);
}
}
return ret;
}
var textNodes = findTextNodes(content);
// now search those text node contents for matching tags.
var tags = ['zazie','johnny'], tagcount = tags.length, regexes, tag;
for( tag=0; tag<tagcount; tag++) {
regexes[tag] = new RegExp("\b"+tags[tag]+"\b","i");
}
var node, match, index, tagtext, newnode;
while(node = textNodes.shift()) {
for( tag=0; tag<tagcount; tag++) {
if( match = node.nodeValue.match(regexes[tag])) {
index = match.index;
textNodes.unshift(node.splitText(index + tags[tag].length));
tagtext = node.splitText(index);
newnode = document.createElement('a');
newnode.href = "";
newnode.className = "esk-seo-plu-link";
newnode.style.cssText = "background:red;color:white";
tagtext.parentNode.replaceChild(newnode,tagtext);
newnode.appendChild(tagtext);
}
}
}
// and done - no more action needed since it was in-place.
See it in action
Please replace . with \\.
var tags = ['zazie', 'johnny'];
tags.forEach(function(element) {
content = content.replace(
new RegExp("(?!<a.*?>\\.*?)(\\b" + element + "\\b)(?!\\.*?<\\/a>)", "igm"),
'$1'
);
});
Following the documentation sample, I'm trying to create a function that searchs for a numerated list in a google document and, if finds it, adds a new item to that list. But I get this error: Cannot find method setListId(string). (line 21, file "test") or, if I change line 21 content (replacing elementContentfor newElement), I get the message: Preparing for execution... and nothing happens. How to fix it?
This is my code:
function test() {
var elementContent = "New item testing"; // a paragraph with its formating
var targetDocId = "1R2c3vo9oOOjjlDR_n5L6Tf9yb-luzt4IxpHwwZoTeLE";
var targetDoc = DocumentApp.openById(targetDocId);
var body = targetDoc.getBody();
for (var i = 0; i < targetDoc.getNumChildren(); i++) {
var child = targetDoc.getChild(i);
if (child.getType() == DocumentApp.ElementType.LIST_ITEM){
var listId = child.getListId();
var newElement = body.appendListItem(elementContent);
newElement.setListId(newElement);
Logger.log("child = " + child);
}
}
}
Following my comment, I tried to play with your script to see what happened and I came up with that code below...
I'm not saying it solves your issue and/or is the best way to achieve what you want but at least it gives a result that works as expected.
Please consider it as a "new playground" and keep experimenting on it to make it better ;-)
function test() {
var elementContent = "New item testing"; // a paragraph with its formating
var targetDocId = DocumentApp.getActiveDocument().getId();
var targetDoc = DocumentApp.openById(targetDocId);
var body = targetDoc.getBody();
var childIndex = 0;
for (var i = 0; i < targetDoc.getNumChildren(); i++) {
var child = targetDoc.getChild(i);
if (child.getType() == DocumentApp.ElementType.LIST_ITEM){
while(child.getType() == DocumentApp.ElementType.LIST_ITEM){
child = targetDoc.getChild(i)
childIndex = body.getChildIndex(child);
Logger.log(childIndex)
i++
}
child = targetDoc.getChild(i-2)
var listId = child.getListId();
Logger.log(childIndex)
var newElement = child.getParent().insertListItem(childIndex, elementContent);
newElement.setListId(child);
break;
}
}
}
So I have been able to figure out in Alfresco, there is a form called skin.css that allows me to change the highlighted color of data table items. However, I only want to be able to change this property during the course of a workflow and not as it applies to all data list elements throughout the entire Share website.
To start, I have a script which kicks off based on a rule and moves any updated/new files into a specified folder and then kicks off a workflow for that file. Within starting the workflow, the package items list is populated with all the documents within the same folder as the document that just got moved/the workflow started on. Below is the script:
function main()
{
var counter=0;
//Administrative Adjudication space/folder MUST exist under companyhome.
var rootSpaceName = companyhome.childByNamePath("mainFolder");
//If the rootspacename is null (not previously created), then exit the program as we have nothing to do.
if(rootSpaceName == null)
{
logger.log("Company Home/mainFolder does not exist, so we have nothing to do.");
return;
}
else
{
logger.log("Company Home/mainFolder exists, so carry on our process.");
//Creates an array of all the children under the rootSpaceName
var childList = rootSpaceName.children;
//Creates a variable which counts the number of children in the childList array
var count = childList.length;
//var seconds = new Date().getTime() / 1000;
//If there are no children in the rootSpaceName folder, exit the program.
if(count == 0)
{
logger.log("Company Home/mainFolder does not have child, nothing to do.");
return;
}
else
{
for(var i = 0; i < count; i++)
{
//Title MUST exist.
var childTitle = childList[i].properties["hearing:childTitle"];
//Author MUST exist.
var childAuthor = childList[i].properties["hearing:childAuthor"];
logger.log("childTitle: " + childTitle);
logger.log("childAuthor: " + childAuthor);
if(childTitle == null || childAuthor == null)
{
logger.log(i + ". Both the childTitle and childAuthor are null...");
continue;
}
var child = childList[i];
if(child.isContainer == false)
{
for(var j = 0; j < count; j++)
{
var newChildName = childList[j].properties.name;
logger.log("New child name: " + newChildName);
var newChild = childList[j];
if((newChild.isContainer == true) && (childTitle == newChildName))
{
logger.log("There is a currently existing folder with the same name as the title of original child");
var newSpaceName = rootSpaceName.childByNamePath(newChildName);
var newChildList = newSpaceName.children;
var newCount = newChildList.length;
for(var k = 0; k < newCount; k++)
{
var newNewChildName = newChildList[k].properties.name;
var newNewchildAuthor = newChildList[k].properties.author;
var newNewChild = newChildList[k];
if((newNewChild.isContainer == true) && (newNewchildAuthor == childAuthor))
{
var currentSpace = newSpaceName.childByNamePath(newNewChildName);
if(child.isDocument == true)
{
//Only want the workflow to run once so we increment count
counter=counter+1;
child.move(currentSpace);
//If Count is 1, then run workflow
if(counter==1)
{
//starts HelloWorldUI workflow
var wfdef=workflow.getDefinitionByName("activiti$helloWorldUI");
if(wfdef)
{
var wfparams=new Array();
wfparams["bpm:workflowDescription"]="";
wfparams["bpm:groupAssignee"]=people.getGroup("GROUP_Managers");
var wfpackage=workflow.createPackage();
var rootSpaceName=currentSpace;
var childList=rootSpaceName.children;
var count=childList.length;
//add all existing documents in the space to the workflow
for(var i = 0; i < count; i++)
{
wfpackage.addNode(childList[i]);
}
var wfpath=wfdef.startWorkflow(wfpackage,wfparams);
var tasks=wfpath.getTasks();
for each(task in tasks)
{
task.endTask(null);
}
}
}
}
}
}
}
else
{
// If title folder is already created, not need to create again.
var newSpaceName = companyhome.childByNamePath("mainFolder/" + childTitle);
if(newSpaceName == null)
{
newSpaceName = rootSpaceName.createFolder(childTitle);
logger.log("mainFolder/" + childTitle + " is created.");
}
// If author folder is already created, not need to create again.
var newNewSpaceName = companyhome.childByNamePath("mainFolder/" + childTitle + "/" + childAuthor);
if(newNewSpaceName == null)
{
newNewSpaceName = newSpaceName.createFolder(childAuthor);
logger.log("mainFolder/" + childTitle + "/" + childAuthor + " is created.");
}
if(child.isDocument == true)
{
counter=counter + 1;
child.move(newNewSpaceName);
if(counter == 1)
{
var wfdef=workflow.getDefinitionByName("activiti$helloWorldUI");
if(wfdef)
{
var wfparams=new Array();
wfparams["bpm:workflowDescription"]="";
wfparams["bpm:groupAssignee"]=people.getGroup("GROUP_Managers");
var wfpackage=workflow.createPackage();
var rootSpaceName=newNewSpaceName;
var childList=rootSpaceName.children;
var count=childList.length;
//add all items from the space to the workflow
for(var i = 0; i <c ount; i++)
{
wfpackage.addNode(childList[i]);
}
var wfpath=wfdef.startWorkflow(wfpackage,wfparams);
var tasks=wfpath.getTasks();
for each(task in tasks)
{
task.endTask(null);
}
}
}
logger.log("Moving file " + child.properties.name);
}
}
}
}
}
}
}
return;
}
main();
I would like to be able to create a function of some sort that can be called to access the skin.css file only during the course of the workflow and basically set .yui-skin-default tr.yui-dt-first{background-color:#FFF} in the CSS file. Does anyone know how I would go about doing that?
If you want to change only in start workflow page,
your css should write in start-workflow.css which is pointed by start-workflow.get.head.ftl. This css will override in other css file like skin.css.
Like this way, you can override any css to affect in only start workflow page not others.
You can try for other workflow related pages.
I've found a bookmarklet that will allow you to inject a CSS file on any page you'd like. Only down side is that you'll have to run it every time you load your page.
http://allben.net/post/2010/01/30/CSS-JavaScript-Injection-Bookmarklets.aspx
I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is an string array textContent.bidiTexts[]. You concatenate them to get the text of 1 page. Text blocks' coordinates are used to judge whether newline or space need to be inserted. (This may not be totally robust, but from my test it seems ok.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
function Pdf2TextClass(){
var self = this;
this.complete = 0;
/**
*
* #param data ArrayBuffer of the pdf file content
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done;
* 2) total number of pages in file.
* #param callbackAllDone The input parameter of callback function is
* the result of extracted text from pdf file.
*
*/
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
PDFJS.getDocument( data ).then( function(pdf) {
var div = document.getElementById('viewer');
var total = pdf.numPages;
callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
if( null != textContent.bidiTexts ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.bidiTexts.length; k++ ){
var block = textContent.bidiTexts[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere and you nee pdf.js and pdf.worker.js to get this working.
/**
* Extract text from PDFs with PDF.js
* Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
*/
this.pdfToText = function(data) {
PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
PDFJS.cMapPacked = true;
return PDFJS.getDocument(data).then(function(pdf) {
var pages = [];
for (var i = 0; i < pdf.numPages; i++) {
pages.push(i);
}
return Promise.all(pages.map(function(pageNumber) {
return pdf.getPage(pageNumber + 1).then(function(page) {
return page.getTextContent().then(function(textContent) {
return textContent.items.map(function(item) {
return item.str;
}).join(' ');
});
});
})).then(function(pages) {
return pages.join("\r\n");
});
});
}
usage:
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
function convert() {
var fr=new FileReader();
var pdff = new Pdf2TextClass();
fr.onload=function(){
pdff.pdfToText(fr.result, null, (text) => { document.getElementById('result').innerText += text; });
}
fr.readAsDataURL(document.getElementById('pdffile').files[0])
}
function Pdf2TextClass() {
var self = this;
this.complete = 0;
this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
console.assert(data instanceof ArrayBuffer || typeof data == 'string');
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function (pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++) {
pdf.getPage(i).then(function (page) {
var n = page.pageNumber;
page.getTextContent().then(function (textContent) {
//console.log(textContent.items[0]);0
if (null != textContent.items) {
var page_text = "";
var last_block = null;
for (var k = 0; k < textContent.items.length; k++) {
var block = textContent.items[k];
if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
if (block.x < last_block.x)
page_text += "\r\n";
else if (last_block.y != block.y && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total) {
window.setTimeout(function () {
var full_text = "";
var num_pages = Object.keys(layers).length;
for (var j = 1; j <= num_pages; j++)
full_text += layers[j];
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");
// listen for messages from the processor
window.addEventListener("message", function(event){
if (event.source != processor.contentWindow) return;
switch (event.data){
// "ready" = the processor is ready, so fetch the PDF file
case "ready":
var xhr = new XMLHttpRequest;
xhr.open('GET', input.getAttribute("src"), true);
xhr.responseType = "arraybuffer";
xhr.onload = function(event) {
processor.contentWindow.postMessage(this.response, "*");
};
xhr.send();
break;
// anything else = the processor has returned the text of the PDF
default:
output.textContent = event.data.replace(/\s+/g, " ");
break;
}
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
#gm2008's answer was a great starting point (please read it and its comments for more info), but needed some updates (08/19) and had some unused code. I also like examples that are more full. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');
let pathToPDF = 'path/to/myPDFfileToText.pdf';
let toText = Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
let onFinish = function(fullText) { console.log(fullText) };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
function Pdf2TextObj() {
let self = this;
this.complete = 0;
/**
*
* #param path Path to the pdf file.
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done.
* 2) total number of pages in file.
* 3) the `page` object itself or null.
* #param callbackAllDone Called after all text has been collected. Input parameters:
* 1) full text of parsed pdf.
*
*/
this.pdfToText = function(path, callbackPageDone, callbackAllDone) {
// console.assert(typeof path == 'string');
PDFJS.getDocument(path).promise.then(function(pdf) {
let total = pdf.numPages;
callbackPageDone(0, total, null);
let pages = {};
// For some (pdf?) reason these don't all come in consecutive
// order. That's why they're stored as an object and then
// processed one final time at the end.
for (let pagei = 1; pagei <= total; pagei++) {
pdf.getPage(pagei).then(function(page) {
let pageNumber = page.pageNumber;
page.getTextContent().then(function(textContent) {
if (null != textContent.items) {
let page_text = "";
let last_item = null;
for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
let item = textContent.items[itemsi];
// I think to add whitespace properly would be more complex and
// would require two loops.
if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
let itemX = item.transform[5]
let lastItemX = last_item.transform[5]
let itemY = item.transform[4]
let lastItemY = last_item.transform[4]
if (itemX < lastItemX)
page_text += "\r\n";
else if (itemY != lastItemY && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
} // ends if may need to add whitespace
page_text += item.str;
last_item = item;
} // ends for every item of text
textContent != null && console.log("page " + pageNumber + " finished.") // " content: \n" + page_text);
pages[pageNumber] = page_text + "\n\n";
} // ends if has items
++self.complete;
callbackPageDone(self.complete, total, page);
// If all done, put pages in order and combine all
// text, then pass that to the callback
if (self.complete == total) {
// Using `setTimeout()` isn't a stable way of making sure
// the process has finished. Watch out for missed pages.
// A future version might do this with promises.
setTimeout(function() {
let full_text = "";
let num_pages = Object.keys(pages).length;
for (let pageNum = 1; pageNum <= num_pages; pageNum++)
full_text += pages[pageNum];
callbackAllDone(full_text);
}, 1000);
}
}); // ends page.getTextContent().then
}); // ends page.then
} // ends for every page
});
}; // Ends pdfToText()
return self;
}; // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
function Pdf2TextClass(){
var self = this;
this.complete = 0;
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function(pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
//console.log(textContent.items[0]);0
if( null != textContent.items ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.items.length; k++ ){
var block = textContent.items[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
console.log(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
* Created by velten on 25.04.16.
*/
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
var pdfParser = require('pdf2json');
let pdfPipe = request({url: pdfUrl, encoding:null}).pipe(pdfParser);
pdfPipe.on("pdfParser_dataError", err => console.error(err) );
pdfPipe.on("pdfParser_dataReady", pdf => {
//optionally:
//let pdf = pdfParser.getMergedTextBlocksIfNeeded();
let count1 = 0;
//get text on a particular page
for (let page of pdf.formImage.Pages) {
count1 += page.Texts.length;
}
console.log(count1);
pdfParser.destroy();
});
It is possible but:
you would have to use the server anyway, there's no way you can get content of a file on user computer without transferring it to server and back
I don't thing anyone has written such library yet
So if you have some free time you can learn pdf format and write such a library yourself, or you can just use server side library of course.