How to Access local CSS file from within Alfresco Javascript? - javascript

So I have been able to figure out in Alfresco, there is a form called skin.css that allows me to change the highlighted color of data table items. However, I only want to be able to change this property during the course of a workflow and not as it applies to all data list elements throughout the entire Share website.
To start, I have a script which kicks off based on a rule and moves any updated/new files into a specified folder and then kicks off a workflow for that file. Within starting the workflow, the package items list is populated with all the documents within the same folder as the document that just got moved/the workflow started on. Below is the script:
function main()
{
var counter=0;
//Administrative Adjudication space/folder MUST exist under companyhome.
var rootSpaceName = companyhome.childByNamePath("mainFolder");
//If the rootspacename is null (not previously created), then exit the program as we have nothing to do.
if(rootSpaceName == null)
{
logger.log("Company Home/mainFolder does not exist, so we have nothing to do.");
return;
}
else
{
logger.log("Company Home/mainFolder exists, so carry on our process.");
//Creates an array of all the children under the rootSpaceName
var childList = rootSpaceName.children;
//Creates a variable which counts the number of children in the childList array
var count = childList.length;
//var seconds = new Date().getTime() / 1000;
//If there are no children in the rootSpaceName folder, exit the program.
if(count == 0)
{
logger.log("Company Home/mainFolder does not have child, nothing to do.");
return;
}
else
{
for(var i = 0; i < count; i++)
{
//Title MUST exist.
var childTitle = childList[i].properties["hearing:childTitle"];
//Author MUST exist.
var childAuthor = childList[i].properties["hearing:childAuthor"];
logger.log("childTitle: " + childTitle);
logger.log("childAuthor: " + childAuthor);
if(childTitle == null || childAuthor == null)
{
logger.log(i + ". Both the childTitle and childAuthor are null...");
continue;
}
var child = childList[i];
if(child.isContainer == false)
{
for(var j = 0; j < count; j++)
{
var newChildName = childList[j].properties.name;
logger.log("New child name: " + newChildName);
var newChild = childList[j];
if((newChild.isContainer == true) && (childTitle == newChildName))
{
logger.log("There is a currently existing folder with the same name as the title of original child");
var newSpaceName = rootSpaceName.childByNamePath(newChildName);
var newChildList = newSpaceName.children;
var newCount = newChildList.length;
for(var k = 0; k < newCount; k++)
{
var newNewChildName = newChildList[k].properties.name;
var newNewchildAuthor = newChildList[k].properties.author;
var newNewChild = newChildList[k];
if((newNewChild.isContainer == true) && (newNewchildAuthor == childAuthor))
{
var currentSpace = newSpaceName.childByNamePath(newNewChildName);
if(child.isDocument == true)
{
//Only want the workflow to run once so we increment count
counter=counter+1;
child.move(currentSpace);
//If Count is 1, then run workflow
if(counter==1)
{
//starts HelloWorldUI workflow
var wfdef=workflow.getDefinitionByName("activiti$helloWorldUI");
if(wfdef)
{
var wfparams=new Array();
wfparams["bpm:workflowDescription"]="";
wfparams["bpm:groupAssignee"]=people.getGroup("GROUP_Managers");
var wfpackage=workflow.createPackage();
var rootSpaceName=currentSpace;
var childList=rootSpaceName.children;
var count=childList.length;
//add all existing documents in the space to the workflow
for(var i = 0; i < count; i++)
{
wfpackage.addNode(childList[i]);
}
var wfpath=wfdef.startWorkflow(wfpackage,wfparams);
var tasks=wfpath.getTasks();
for each(task in tasks)
{
task.endTask(null);
}
}
}
}
}
}
}
else
{
// If title folder is already created, not need to create again.
var newSpaceName = companyhome.childByNamePath("mainFolder/" + childTitle);
if(newSpaceName == null)
{
newSpaceName = rootSpaceName.createFolder(childTitle);
logger.log("mainFolder/" + childTitle + " is created.");
}
// If author folder is already created, not need to create again.
var newNewSpaceName = companyhome.childByNamePath("mainFolder/" + childTitle + "/" + childAuthor);
if(newNewSpaceName == null)
{
newNewSpaceName = newSpaceName.createFolder(childAuthor);
logger.log("mainFolder/" + childTitle + "/" + childAuthor + " is created.");
}
if(child.isDocument == true)
{
counter=counter + 1;
child.move(newNewSpaceName);
if(counter == 1)
{
var wfdef=workflow.getDefinitionByName("activiti$helloWorldUI");
if(wfdef)
{
var wfparams=new Array();
wfparams["bpm:workflowDescription"]="";
wfparams["bpm:groupAssignee"]=people.getGroup("GROUP_Managers");
var wfpackage=workflow.createPackage();
var rootSpaceName=newNewSpaceName;
var childList=rootSpaceName.children;
var count=childList.length;
//add all items from the space to the workflow
for(var i = 0; i <c ount; i++)
{
wfpackage.addNode(childList[i]);
}
var wfpath=wfdef.startWorkflow(wfpackage,wfparams);
var tasks=wfpath.getTasks();
for each(task in tasks)
{
task.endTask(null);
}
}
}
logger.log("Moving file " + child.properties.name);
}
}
}
}
}
}
}
return;
}
main();
I would like to be able to create a function of some sort that can be called to access the skin.css file only during the course of the workflow and basically set .yui-skin-default tr.yui-dt-first{background-color:#FFF} in the CSS file. Does anyone know how I would go about doing that?

If you want to change only in start workflow page,
your css should write in start-workflow.css which is pointed by start-workflow.get.head.ftl. This css will override in other css file like skin.css.
Like this way, you can override any css to affect in only start workflow page not others.
You can try for other workflow related pages.

I've found a bookmarklet that will allow you to inject a CSS file on any page you'd like. Only down side is that you'll have to run it every time you load your page.
http://allben.net/post/2010/01/30/CSS-JavaScript-Injection-Bookmarklets.aspx

Related

How to customize the Split node in Node-Red in a way not to send all msgs at once?

I am trying to customize the Split node in Node-Red in a way to send the next message only when the first arrives to the Join node; as I am doing some processing in between, and would like to process each msg separately before joining them.
So I have cloned the Split node from Node-Red project, and at the part where the splitting of an array happens; I register listeners to events (random IDs generated by the original Split node).
else if (Array.isArray(msg.payload)) { // then split array into messages
msg.parts.type = 'array';
var count = msg.payload.length / node.arraySplt;
if (Math.floor(count) !== count) {
count = Math.ceil(count);
}
msg.parts.count = count;
var pos = 0;
var data = msg.payload;
msg.parts.len = node.arraySplt;
for (let i = 0; i < count; i++) {
msg.payload = data.slice(pos, pos + node.arraySplt);
if (node.arraySplt === 1) {
msg.payload = msg.payload[0];
}
msg.parts.index = i;
pos += node.arraySplt;
if (i === 0) {
send(RED.util.cloneMessage(msg));
continue;
}
let eventName = msg.parts.id + '-' + i;
addHandler(eventName, send, msg);
}
done();
}
My handler function
function addHandler(eventName, send, msg) {
RED.events.addListener(eventName, () => {
send(RED.util.cloneMessage(msg));
});
}
And at the Join node (which is in the same js file)
// commented below is the original code of the join function
// if ((tcnt > 0 && group.currentCount > tcnt) || msg.hasOwnProperty('complete')) {
// completeSend(partId);
// return;
// } else
if (msg.parts.index <= (msg.parts.count - 2)) {
let eventName = msg.parts.id + '-' + (parseInt(msg.parts.index) + 1);
RED.events.emit(eventName);
} else if (msg.parts.index >= (msg.parts.count - 1)) {
completeSend(partId);
return;
}
However, this would send the first msg (as I directly send it, and not through an event), and the last msg only; it would skip whatever in between.
Modifying the split node is the wrong way to do this.
There are rate limiting nodes that will do this for you. e.g. https://flows.nodered.org/node/node-red-contrib-semaphore
Or you can use the delay node with it's release function

Comparing JPG files with Photoshop Layers

Is it possible to compare filenames for a set of files that are imported as Photoshop layers ?
I have a folder of 50 jpg images which I have used in a PSD file.
Now I want to check whether all the JPG files are used or not ?
Is it possible to do so ?
As I've said, Photoshop scripting can help you achieve this by using File Objects and basic javascript knowledge. I've modified my old script as you've desired and now it should work well with any nested groups and images.
I highly encourage you to learn scripting and ask questions here wherever you feels confused.
Save below code as 'Script.jsx' and run it from 'File > Scripts > Browse'
Update 2 : Now it saves log.txt file too as per you requested. P.S. Learn from this script and tweak it to your desired result.
// Managing Document
var docs = app.documents;
// Progress Bar
var win = new Window("window{text:'Progress',bounds:[100,100,400,150],bar:Progressbar{bounds:[20,20,280,31] , value:0,maxvalue:100}};");
// assigning activeDocument
if (docs.length != 0) {
var docRef = app.activeDocument;
// Defining the folder
alert("You will be prompted for the folder containing your images.\n" +
"Files will be selected with a '.png'/'.jpg/.jpeg' on the end in the same folder.");
var folder = Folder.selectDialog();
if (!folder) {
exit;
}
var photoFiles = folder.getFiles(/\.(jpg|jpeg|png)$/i);
var matchFiles = [];
var photoFilesName = [];
//Searching for used images
var increment = parseFloat(0);
var divider = parseFloat(100/photoFiles.length);
win.show();
for (var i = 0; i < photoFiles.length; i++) {
increment = increment + divider;
var indexPhotoName = removeExtension(photoFiles[i].displayName);
photoFilesName.push(indexPhotoName);
var doc = activeDocument;
var curLayer;
goThroughLayers(doc, indexPhotoName);
}
function goThroughLayers(parentLayer, targetName) {
for (var i = 0; i < parentLayer.layers.length; i++) {
curLayer = parentLayer.layers[i];
doc.activeLayer = curLayer;
if (curLayer.typename == 'LayerSet') {
goThroughLayers(curLayer, targetName)
} else {
if (curLayer.name == targetName) {
// if (curLayer.name.match(/[e]/ig)) {
matchFiles.push(targetName);
// }
} //end if
} //end else
} //end loop
} //end function
function arr_diff(a1, a2) {
var a = [],
diff = [];
for (var i = 0; i < a1.length; i++) {
a[a1[i]] = true;
}
for (var i = 0; i < a2.length; i++) {
if (a[a2[i]]) {
delete a[a2[i]];
} else {
a[a2[i]] = true;
}
}
for (var k in a) {
diff.push(k);
}
return diff;
}
function removeExtension(str) {
return str.split('.').slice(0, -1).join('.');
}
var missItems = arr_diff(matchFiles, photoFilesName);
if (missItems.length > 0) {
var missFolder = new Folder(photoFiles[0].path + '/Missed%20Files');
if(!missFolder.exists){
missFolder.create();
}
for (var y = 0; y < photoFiles.length; y++) {
var photoTrimName = removeExtension(photoFiles[y].displayName);
for( var x = 0; x < missItems.length ; x++){
if(photoTrimName == missItems[x]){
photoFiles[y].copy(new File(missFolder+'/'+photoFiles[y].displayName));
}
}
};
win.close();
alert("You've missed total " + missItems.length + " files. Press OK to open folder containing missing files. Log report is generated wherever PSD is saved.");
var FileStr = "";
for(var m=0; m<missItems.length; m++){
FileStr = FileStr + '\n' + (m+1) + '. ' + missItems[m];
}
var str = "Your missed files are : " + FileStr;
saveTxt(str);
missFolder.execute();
} else {
win.close();
saveTxt('All Photos are used');
alert('All Photos are used');
}
} else {
alert('Open atleast one document');
}
function saveTxt(txt)
{
var Name = "LogReport_" + app.activeDocument.name.replace(/\.[^\.]+$/, '');
var Ext = decodeURI(app.activeDocument.name).replace(/^.*\./,'');
if (Ext.toLowerCase() != 'psd')
return;
var Path = app.activeDocument.path;
var saveFile = File(Path + "/" + Name +".txt");
if(saveFile.exists)
saveFile.remove();
saveFile.encoding = "UTF8";
saveFile.open("e", "TEXT", "????");
saveFile.writeln(txt);
saveFile.close();
}
In Javascript, it is possible to get some information related to PSD file layers using PSD.js library

Add a paragraph or table etc. at Cursor

I have a function for adding the contents of a separate google document at the cursor point within the active document, but I haven't been able to get it to work. I keep getting the "Element does not contain the specified child element" exception, and then the contents gets pasted to the bottom of the document rather than at the cursor point!
function AddTable() {
//here you need to get document id from url (Example, 1oWyVMa-8fzQ4leCrn2kIk70GT5O9pqsXsT88ZjYE_z8)
var FileTemplateFileId = "1MFG06knf__tcwHWdybaBk124Ia_Mb0gBE0Gk8e0URAM"; //Browser.inputBox("ID der Serienbriefvorlage (aus Dokumentenlink kopieren):");
var doc = DocumentApp.openById(FileTemplateFileId);
var DocName = doc.getName();
//Create copy of the template document and open it
var docCopy = DocumentApp.getActiveDocument();
var totalParagraphs = doc.getBody().getParagraphs(); // get the total number of paragraphs elements
Logger.log(totalParagraphs);
var cursor = docCopy.getCursor();
var totalElements = doc.getNumChildren();
var elements = [];
for (var j = 0; j < totalElements; ++j) {
var body = docCopy.getBody();
var element = doc.getChild(j).copy();
var type = element.getType();
if (type == DocumentApp.ElementType.PARAGRAPH) {
body.appendParagraph(element);
} else if (type == DocumentApp.ElementType.TABLE) {
body.appendTable(element);
} else if (type == DocumentApp.ElementType.LIST_ITEM) {
body.appendListItem(element);
}
// ...add other conditions (headers, footers...
}
Logger.log(element.editAsText().getText());
elements.push(element); // store paragraphs in an array
Logger.log(element.editAsText().getText());
for (var el = 0; el < elements.length; el++) {
var paragraph = elements[el].copy();
var doc = DocumentApp.getActiveDocument();
var bodys = doc.getBody();
var cursor = doc.getCursor();
var element = cursor.getElement();
var container = element.getParent();
try {
var childIndex = body.getChildIndex(container);
bodys.insertParagraph(childIndex, paragraph);
} catch (e) {
DocumentApp.getUi().alert("There was a problem: " + e.message);
}
}
}
You want to copy the objects (paragraphs, tables and lists) from the document of 1MFG06knf__tcwHWdybaBk124Ia_Mb0gBE0Gk8e0URAM to the active Document.
You want to copy the objects to the cursor position on the active Document.
You want to achieve this using Google Apps Script.
If my understanding is correct, how about this answer? Please think of this as just one of several possible answers.
Modification points:
In your script, appendParagraph, appendTable and appendListItem are used at the 1st for loop. I think that the reason that the copied objects are put to the last of the document is due to this.
var body = docCopy.getBody(); can be put to the out of the for loop.
In your case, I think that when the 1st for loop is modified, 2nd for loop is not required.
When above points are reflected to your script, it becomes as follows.
Modified script:
function AddTable() {
var FileTemplateFileId = "1MFG06knf__tcwHWdybaBk124Ia_Mb0gBE0Gk8e0URAM";
var doc = DocumentApp.openById(FileTemplateFileId);
var docCopy = DocumentApp.getActiveDocument();
var body = docCopy.getBody();
var cursor = docCopy.getCursor();
var cursorPos = docCopy.getBody().getChildIndex(cursor.getElement());
var totalElements = doc.getNumChildren();
for (var j = 0; j < totalElements; ++j) {
var element = doc.getChild(j).copy();
var type = element.getType();
if (type == DocumentApp.ElementType.PARAGRAPH) {
body.insertParagraph(cursorPos + j, element);
} else if (type == DocumentApp.ElementType.TABLE) {
body.insertTable(cursorPos + j, element);
} else if (type == DocumentApp.ElementType.LIST_ITEM) {
body.insertListItem(cursorPos + j, element);
}
}
}
It seems that DocName is not used in your script.
References:
insertParagraph()
insertTable()
insertListItem()
If I misunderstood your question and this was not the result you want, I apologize. At that time, can you provide the sample source Document? By this, I would like to confirm it.

For loop variable undefined in Javascript

I'm working on implementing a system where elements can be dragged and dropped to create flowcharts. My Issue is with saving the flowchart so that it could be reloaded when needed. For now I've created a method that saves all the previous data of the element onto the final array that holds only elements that are dropped on the container. But I'm getting a Trivial Error as Undefined variable on the debugging interface. Hence I'm not getting the intended output and the alert messages that I included are not being printed when the condition is met.
Code in Context
function saveFlowchart()
{
var nodes = [];
var matches = [];
var searchEles = document.getElementById("container").children;
for(var i = 0; i < searchEles.length; i++)
{
matches.push(searchEles[i]);
var idOfEl = searchEles[i].id;
if(searchEles[i].id !=null || searchEles[i].id !="")
{
var $element = $("#" + searchEles[i].id);
var dropElem = $("#" + searchEles[i].id).attr('class');
var position = $element.position();
position.bottom = position.top + $element.height();
position.right = position.left + $element.width();
alert("class:"+dropElem+"\nTop position: " + position.top + "\nLeft position: " + position.left + "\nBottom position: " + position.bottom + "\nRight position: " + position.right);
finalArray[idOfEl-1][0]= idOfEl;
finalArray[idOfEl-1][1]= dropElem;
var elId = parseInt(idOfEl);
if (dropElem == "stream ui-draggable")
{
for(var count=0;count<100;count++)
{
alert("One loop opened with count="+count);
if(createdImportStreamArray[count][0]==elId)
{
finalArray[elId-1][2]= createdImportStreamArray[count][1]; //Selected Stream from Predefined Streams
finalArray[elId-1][3]= createdImportStreamArray[count][2]; //asName
alert("createdImportStreamArray[count][0]==elId");
}
else if(createdExportStreamArray[count][0]==elId)
{
finalArray[elId-1][2]= createdExportStreamArray[count][1]; //Selected Stream from Predefined Streams
finalArray[elId-1][3]= createdExportStreamArray[count][2]; //asName
}
else if(createdDefinedStreamArray[count][0]==elId)
{
finalArray[elId-1][2]= createdDefinedStreamArray[count][1]; //Stream Name
finalArray[elId-1][3]= createdDefinedStreamArray[count][4]; //Number of Attributes
finalArray[elId-1][4]=[];
for(var f=0;f<createdDefinedStreamArray[r][4];f++)
{
finalArray[elId-1][4][f][0]=createdDefinedStreamArray[count][2][f][0]; //Attribute Name
finalArray[elId-1][4][f][1]=createdDefinedStreamArray[count][2][f][1]; // Attribute Type
}
}
alert("One loop closed with count="+count);
}
alert("Loop ended with count="+count);
}
else if (dropElem == "wstream ui-draggable")
{
ElementType="wstream";
}
// else if conditions...
alert(finalArray);
}
}
//Code to output the connection details in a json format
//The following is not affected by the above mentioned error
$(".node").each(function (idx, elem) {
var $elem = $(elem);
var endpoints = jsPlumb.getEndpoints($elem.attr('id'));
console.log('endpoints of '+$elem.attr('id'));
console.log(endpoints);
nodes.push({
blockId: $elem.attr('id'),
nodetype: $elem.attr('data-nodetype'),
positionX: parseInt($elem.css("left"), 10),
positionY: parseInt($elem.css("top"), 10)
});
});
var connections = [];
$.each(jsPlumb.getConnections(), function (idx, connection) {
connections.push({
connectionId: connection.id,
pageSourceId: connection.sourceId,
pageTargetId: connection.targetId
});
});
var flowChart = {};
flowChart.nodes = nodes;
flowChart.connections = connections;
flowChart.numberOfElements = numberOfElements;
var flowChartJson = JSON.stringify(flowChart);
//console.log(flowChartJson);
$('#jsonOutput').val(flowChartJson);
}
Debugging Interface
According to this the count variable in the for loop is undefined. I've tried taking the first statement var count=0 outside the loop declaration part and defining it in the very beginning of the method. But that simply checks for count=0 against the conditions and doesn't increment at all.
Any help in this regard will be highly appreciated as I've been working on this minor error for almost 2 days now.

How to extract text from a PDF in JavaScript

I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is an string array textContent.bidiTexts[]. You concatenate them to get the text of 1 page. Text blocks' coordinates are used to judge whether newline or space need to be inserted. (This may not be totally robust, but from my test it seems ok.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
function Pdf2TextClass(){
var self = this;
this.complete = 0;
/**
*
* #param data ArrayBuffer of the pdf file content
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done;
* 2) total number of pages in file.
* #param callbackAllDone The input parameter of callback function is
* the result of extracted text from pdf file.
*
*/
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
PDFJS.getDocument( data ).then( function(pdf) {
var div = document.getElementById('viewer');
var total = pdf.numPages;
callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
if( null != textContent.bidiTexts ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.bidiTexts.length; k++ ){
var block = textContent.bidiTexts[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere and you nee pdf.js and pdf.worker.js to get this working.
/**
* Extract text from PDFs with PDF.js
* Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
*/
this.pdfToText = function(data) {
PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
PDFJS.cMapPacked = true;
return PDFJS.getDocument(data).then(function(pdf) {
var pages = [];
for (var i = 0; i < pdf.numPages; i++) {
pages.push(i);
}
return Promise.all(pages.map(function(pageNumber) {
return pdf.getPage(pageNumber + 1).then(function(page) {
return page.getTextContent().then(function(textContent) {
return textContent.items.map(function(item) {
return item.str;
}).join(' ');
});
});
})).then(function(pages) {
return pages.join("\r\n");
});
});
}
usage:
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
function convert() {
var fr=new FileReader();
var pdff = new Pdf2TextClass();
fr.onload=function(){
pdff.pdfToText(fr.result, null, (text) => { document.getElementById('result').innerText += text; });
}
fr.readAsDataURL(document.getElementById('pdffile').files[0])
}
function Pdf2TextClass() {
var self = this;
this.complete = 0;
this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
console.assert(data instanceof ArrayBuffer || typeof data == 'string');
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function (pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++) {
pdf.getPage(i).then(function (page) {
var n = page.pageNumber;
page.getTextContent().then(function (textContent) {
//console.log(textContent.items[0]);0
if (null != textContent.items) {
var page_text = "";
var last_block = null;
for (var k = 0; k < textContent.items.length; k++) {
var block = textContent.items[k];
if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
if (block.x < last_block.x)
page_text += "\r\n";
else if (last_block.y != block.y && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total) {
window.setTimeout(function () {
var full_text = "";
var num_pages = Object.keys(layers).length;
for (var j = 1; j <= num_pages; j++)
full_text += layers[j];
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");
// listen for messages from the processor
window.addEventListener("message", function(event){
if (event.source != processor.contentWindow) return;
switch (event.data){
// "ready" = the processor is ready, so fetch the PDF file
case "ready":
var xhr = new XMLHttpRequest;
xhr.open('GET', input.getAttribute("src"), true);
xhr.responseType = "arraybuffer";
xhr.onload = function(event) {
processor.contentWindow.postMessage(this.response, "*");
};
xhr.send();
break;
// anything else = the processor has returned the text of the PDF
default:
output.textContent = event.data.replace(/\s+/g, " ");
break;
}
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
#gm2008's answer was a great starting point (please read it and its comments for more info), but needed some updates (08/19) and had some unused code. I also like examples that are more full. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');
let pathToPDF = 'path/to/myPDFfileToText.pdf';
let toText = Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
let onFinish = function(fullText) { console.log(fullText) };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
function Pdf2TextObj() {
let self = this;
this.complete = 0;
/**
*
* #param path Path to the pdf file.
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done.
* 2) total number of pages in file.
* 3) the `page` object itself or null.
* #param callbackAllDone Called after all text has been collected. Input parameters:
* 1) full text of parsed pdf.
*
*/
this.pdfToText = function(path, callbackPageDone, callbackAllDone) {
// console.assert(typeof path == 'string');
PDFJS.getDocument(path).promise.then(function(pdf) {
let total = pdf.numPages;
callbackPageDone(0, total, null);
let pages = {};
// For some (pdf?) reason these don't all come in consecutive
// order. That's why they're stored as an object and then
// processed one final time at the end.
for (let pagei = 1; pagei <= total; pagei++) {
pdf.getPage(pagei).then(function(page) {
let pageNumber = page.pageNumber;
page.getTextContent().then(function(textContent) {
if (null != textContent.items) {
let page_text = "";
let last_item = null;
for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
let item = textContent.items[itemsi];
// I think to add whitespace properly would be more complex and
// would require two loops.
if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
let itemX = item.transform[5]
let lastItemX = last_item.transform[5]
let itemY = item.transform[4]
let lastItemY = last_item.transform[4]
if (itemX < lastItemX)
page_text += "\r\n";
else if (itemY != lastItemY && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
} // ends if may need to add whitespace
page_text += item.str;
last_item = item;
} // ends for every item of text
textContent != null && console.log("page " + pageNumber + " finished.") // " content: \n" + page_text);
pages[pageNumber] = page_text + "\n\n";
} // ends if has items
++self.complete;
callbackPageDone(self.complete, total, page);
// If all done, put pages in order and combine all
// text, then pass that to the callback
if (self.complete == total) {
// Using `setTimeout()` isn't a stable way of making sure
// the process has finished. Watch out for missed pages.
// A future version might do this with promises.
setTimeout(function() {
let full_text = "";
let num_pages = Object.keys(pages).length;
for (let pageNum = 1; pageNum <= num_pages; pageNum++)
full_text += pages[pageNum];
callbackAllDone(full_text);
}, 1000);
}
}); // ends page.getTextContent().then
}); // ends page.then
} // ends for every page
});
}; // Ends pdfToText()
return self;
}; // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
function Pdf2TextClass(){
var self = this;
this.complete = 0;
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function(pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
//console.log(textContent.items[0]);0
if( null != textContent.items ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.items.length; k++ ){
var block = textContent.items[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
console.log(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
* Created by velten on 25.04.16.
*/
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
var pdfParser = require('pdf2json');
let pdfPipe = request({url: pdfUrl, encoding:null}).pipe(pdfParser);
pdfPipe.on("pdfParser_dataError", err => console.error(err) );
pdfPipe.on("pdfParser_dataReady", pdf => {
//optionally:
//let pdf = pdfParser.getMergedTextBlocksIfNeeded();
let count1 = 0;
//get text on a particular page
for (let page of pdf.formImage.Pages) {
count1 += page.Texts.length;
}
console.log(count1);
pdfParser.destroy();
});
It is possible but:
you would have to use the server anyway, there's no way you can get content of a file on user computer without transferring it to server and back
I don't thing anyone has written such library yet
So if you have some free time you can learn pdf format and write such a library yourself, or you can just use server side library of course.

Categories