Comparing JPG files with Photoshop Layers - javascript

Is it possible to compare filenames for a set of files that are imported as Photoshop layers ?
I have a folder of 50 jpg images which I have used in a PSD file.
Now I want to check whether all the JPG files are used or not ?
Is it possible to do so ?

As I've said, Photoshop scripting can help you achieve this by using File Objects and basic javascript knowledge. I've modified my old script as you've desired and now it should work well with any nested groups and images.
I highly encourage you to learn scripting and to ask questions here whenever you feel confused.
Save below code as 'Script.jsx' and run it from 'File > Scripts > Browse'
Update 2: It now also saves a log.txt file, as you requested. P.S. Learn from this script and tweak it to get your desired result.
// Managing Document
var docs = app.documents;
// Progress Bar
var win = new Window("window{text:'Progress',bounds:[100,100,400,150],bar:Progressbar{bounds:[20,20,280,31] , value:0,maxvalue:100}};");
// assigning activeDocument
if (docs.length != 0) {
var docRef = app.activeDocument;
// Defining the folder
alert("You will be prompted for the folder containing your images.\n" +
"Files will be selected with a '.png'/'.jpg/.jpeg' on the end in the same folder.");
var folder = Folder.selectDialog();
if (!folder) {
exit;
}
var photoFiles = folder.getFiles(/\.(jpg|jpeg|png)$/i);
var matchFiles = [];
var photoFilesName = [];
//Searching for used images
var increment = parseFloat(0);
var divider = parseFloat(100/photoFiles.length);
win.show();
for (var i = 0; i < photoFiles.length; i++) {
increment = increment + divider;
var indexPhotoName = removeExtension(photoFiles[i].displayName);
photoFilesName.push(indexPhotoName);
var doc = activeDocument;
var curLayer;
goThroughLayers(doc, indexPhotoName);
}
/**
 * Recursively visits every layer below `parentLayer`. Each visited layer is
 * stored in the script-level `curLayer` and made the active layer of the
 * script-level `doc`; whenever a non-group layer is named `targetName`, that
 * name is appended to the script-level `matchFiles` array.
 *
 * @param parentLayer Document or LayerSet whose `layers` are scanned.
 * @param targetName  Layer name to look for.
 */
function goThroughLayers(parentLayer, targetName) {
    var position = 0;
    while (position < parentLayer.layers.length) {
        curLayer = parentLayer.layers[position];
        position += 1;
        doc.activeLayer = curLayer;
        if (curLayer.typename != 'LayerSet') {
            // Plain layer: record a hit when the names are equal.
            if (curLayer.name == targetName) {
                matchFiles.push(targetName);
            }
            continue;
        }
        // Group: descend into its children.
        goThroughLayers(curLayer, targetName);
    }
}
/**
 * Returns the symmetric difference of two lists of names: every name that
 * occurs in exactly one of `a1` / `a2`, as strings, without duplicates, in
 * first-seen order (names only in `a1` first, then names only in `a2`).
 *
 * Names are tracked in plain objects under a "$" prefix so that names such
 * as "toString", "constructor" or "length" cannot collide with built-in
 * properties, and duplicates inside one list no longer toggle membership.
 *
 * @param a1 First list of names.
 * @param a2 Second list of names.
 * @returns Array of names present in only one of the two lists.
 */
function arr_diff(a1, a2) {
    var inFirst = {},
        inSecond = {},
        emitted = {},
        diff = [];
    var i;
    for (i = 0; i < a1.length; i++) {
        inFirst["$" + a1[i]] = true;
    }
    for (i = 0; i < a2.length; i++) {
        inSecond["$" + a2[i]] = true;
    }
    // Appends the entries of `list` that are absent from `other`, once each.
    function collectMissing(list, other) {
        for (var n = 0; n < list.length; n++) {
            var key = "$" + list[n];
            if (other[key] !== true && emitted[key] !== true) {
                emitted[key] = true;
                diff.push(String(list[n]));
            }
        }
    }
    collectMissing(a1, inSecond);
    collectMissing(a2, inFirst);
    return diff;
}
/**
 * Strips the last ".ext" suffix from a file name.
 * Names without an extension (no dot, or only a leading dot such as
 * ".hidden") are returned unchanged instead of collapsing to "".
 *
 * @param str File name, e.g. "photo.final.jpg".
 * @returns The name without its final extension, e.g. "photo.final".
 */
function removeExtension(str) {
    var name = String(str);
    var lastDot = name.lastIndexOf('.');
    return lastDot > 0 ? name.substring(0, lastDot) : name;
}
// Photoshop script, part 2: report which image files are not used as layers.
// Names that appear in only one of matchFiles / photoFilesName.
var missItems = arr_diff(matchFiles, photoFilesName);
if (missItems.length > 0) {
// Unused images are copied into a "Missed Files" subfolder next to the images
// ('%20' is the URI-encoded space in the folder name).
var missFolder = new Folder(photoFiles[0].path + '/Missed%20Files');
if(!missFolder.exists){
missFolder.create();
}
// Copy every image whose extension-less name is in the missing list.
for (var y = 0; y < photoFiles.length; y++) {
var photoTrimName = removeExtension(photoFiles[y].displayName);
for( var x = 0; x < missItems.length ; x++){
if(photoTrimName == missItems[x]){
photoFiles[y].copy(new File(missFolder+'/'+photoFiles[y].displayName));
}
}
};
win.close();
alert("You've missed total " + missItems.length + " files. Press OK to open folder containing missing files. Log report is generated wherever PSD is saved.");
// Build a numbered, one-name-per-line list for the log file.
var FileStr = "";
for(var m=0; m<missItems.length; m++){
FileStr = FileStr + '\n' + (m+1) + '. ' + missItems[m];
}
var str = "Your missed files are : " + FileStr;
saveTxt(str);
// Opens the folder with the copied files in the operating system's file browser.
missFolder.execute();
} else {
// Nothing is missing: close the progress window, log and tell the user.
win.close();
saveTxt('All Photos are used');
alert('All Photos are used');
}
} else {
// No document is open, so there are no layers to compare against.
alert('Open atleast one document');
}
/**
 * Writes `txt` to "LogReport_<document name>.txt" in the folder of the
 * active document. Does nothing unless the active document's name ends in
 * ".psd" (case-insensitive). An existing report file is removed first.
 *
 * @param txt Text to write as the report.
 */
function saveTxt(txt)
{
    var documentName = app.activeDocument.name;
    var extension = decodeURI(documentName).replace(/^.*\./,'');
    if (extension.toLowerCase() != 'psd') {
        return;
    }
    var reportName = "LogReport_" + documentName.replace(/\.[^\.]+$/, '');
    var saveFile = File(app.activeDocument.path + "/" + reportName +".txt");
    if (saveFile.exists) {
        saveFile.remove();
    }
    saveFile.encoding = "UTF8";
    saveFile.open("e", "TEXT", "????");
    saveFile.writeln(txt);
    saveFile.close();
}

In Javascript, it is possible to get some information related to PSD file layers using PSD.js library

Related

Looping through layers and setting them to visible

I'm trying to loop through the layers in an InDesign doc and set all of them to visible. This is to ensure that file collection occurs correctly.
I put together the following
var myDocument = app.activeDocument;
// Make all layers visible.
for (i = 0; i < myDocument.layers.length; i++) {
// NOTE: `=` here is an assignment, not a comparison. It sets `visible` to
// false and the condition evaluates to false, so the body never runs.
if(myDocument.layers[i].visible = false) {
myDocument.layers[i].visible = true;
};
};
This is excerpted from a larger script that automates the file collect, this is just the routine for the layers.
For context here's the actual script.
/**
 * Returns the first `n` characters of `str` (converted to a string).
 * Yields "" when `n` is zero or negative, and hands back `str` itself,
 * unconverted, when `n` exceeds its length.
 */
function Left(str, n){
    var text = String(str);
    if (n <= 0) {
        return "";
    }
    if (n > text.length) {
        return str;
    }
    return text.slice(0, n);
}
/**
 * Returns the last `n` characters of `str` (converted to a string).
 * Yields "" when `n` is zero or negative, and hands back `str` itself,
 * unconverted, when `n` exceeds its length.
 */
function Right(str, n){
    var text = String(str);
    if (n <= 0) {
        return "";
    }
    if (n > text.length) {
        return str;
    }
    return text.slice(text.length - n);
}
// Packages the active InDesign document into ~/Desktop/<docName>/ and exports
// an IDML copy plus low-res and hi-res PDFs into subfolders of that folder.
if (app.documents.length != 0){
var myDocument = app.activeDocument;
var docName = myDocument.name;
// Drop the last 5 characters of the name (presumably the ".indd" suffix).
docName = Left(docName, String(docName).length-5)
//alert(docName);
var myFolder = new Folder ("~/Desktop/"+docName+"/");
//myFolder.create("Bob");s
/*new Folder ("~/Desktop/Collected/Hi-Res PDF/");
new Folder ("~/Desktop/Collected/RELEASE INFO/");*/
// Make all layers visible before collecting. The previous test
// `if (layer.visible = false)` was an assignment: it hid every layer and
// never ran its body. Setting the flag unconditionally needs no test.
for (var i = 0; i < myDocument.layers.length; i++) {
myDocument.layers[i].visible = true;
}
myDocument.packageForPrint (myFolder,1,1,0,1,0,0,0);
var newFolder = new Folder ("~/Desktop/"+docName+"/RELEASE INFO/");
newFolder.create();
var inddFolder = new Folder ("~/Desktop/"+docName+"/Indesign Files/");
inddFolder.create();
var newFolder = new Folder ("~/Desktop/"+docName+"/IDML Files/");
newFolder.create();
//Export IDML File
myDocument.exportFile(ExportFormat.INDESIGN_MARKUP, File("~/Desktop/"+docName+"/IDML Files/"+docName+".idml"), false);
//Move INDD File
//var myInddfile = File("~/Desktop/"+docName+"/"+docName+".indd");
//myDocument.changePath(File(inddFolder),false);
//Rip Low Res PDFs
var myPDFExportPreset = app.pdfExportPresets.item("CP3 Low Rez");
app.activeDocument.exportFile(ExportFormat.pdfType,
File("~/Desktop/"+docName+"/RELEASE INFO/"+docName+"_LR.pdf"), false, myPDFExportPreset);
//Now export the document. You'll have to fill in your own file path.
//app.activeDocument.exportFile(ExportFormat.pdfType, File("~/Desktop/"+docName+"_FILM/RELEASE INFO/"+docName+"_LR.pdf"), false);
var newFolder = new Folder ("~/Desktop/"+docName+"/Hi-Res PDF/");
newFolder.create();
//Rip Hi-Res PDF
var myPDFExportPreset = app.pdfExportPresets.item("Kern Hi Rez Print");
app.activeDocument.exportFile(ExportFormat.pdfType,
File("~/Desktop/"+docName+"/Hi-Res PDF/"+docName+"_HiRes.pdf"), false, myPDFExportPreset);
//Now export the document. You'll have to fill in your own file path.
//app.activeDocument.exportFile(ExportFormat.pdfType, File("~/Desktop/"+docName+"_FILM/Hi-Res PDF/"+docName+"_HiRes.pdf"), false);
// Open the package folder in the operating system's file browser.
myFolder.execute();
}
else{
alert("Please open a document and try again.");
}
Hopefully, when the script executes, all the layers will be set to visible then the file collect will occur.
Use triple equals in your if statement for strict equality. For instance:
// Variant 1: compare with `===` so the condition tests the flag instead of assigning to it.
for (i = 0; i < myDocument.layers.length; i++) {
if(myDocument.layers[i].visible === false) { // <-- Note the `===` instead of `=`
myDocument.layers[i].visible = true;
};
};
Or even better, you can change it to utilize the Logical NOT ! operator
// Variant 2: `!visible` is true for a hidden layer, so only hidden layers are switched on.
for (i = 0; i < myDocument.layers.length; i++) {
if (!myDocument.layers[i].visible) { // <-- Change to this.
myDocument.layers[i].visible = true;
};
};
Note: Given your example, the conditional if statement is not actually necessary. You could simply do this instead:
// Variant 3: no condition at all; every layer's flag is simply set to true.
for (i = 0; i < myDocument.layers.length; i++) {
myDocument.layers[i].visible = true;
};
Set everything to be visible
If you actually want to make everything visible - including; InDesign document layers and all page items on the sub-layer(s), then you'll need to do something like this example:
var myDocument = app.activeDocument;
// ...
/**
 * Makes every layer of the script-level `myDocument` visible, and also every
 * page item listed in each layer's `allPageItems`.
 * All loop variables are declared locally so the function no longer creates
 * (or overwrites) global `i`, `max`, `x` and `len`.
 */
function makeAllVisible() {
  var layers = myDocument.layers;
  for (var i = 0, max = layers.length; i < max; i++) {
    var currentLayer = layers[i];
    currentLayer.visible = true; // Make the top level layer visible.
    // Make all page items on the current layer visible as well.
    var currentLayerPageItems = currentLayer.allPageItems;
    for (var x = 0, len = currentLayerPageItems.length; x < len; x++) {
      currentLayerPageItems[x].visible = true;
    }
  }
}
makeAllVisible(); // Invoke the function.
// ...

Photoshop Javascript to get all layers in the active document

I'm sure it should be discussed before by Photoshop scripters. I write a solution as following. I think it's logically right, but the result is not correct. Anybody can help to check where's wrong in the code, or have ideas for this topic? I want to get all the layers in a document.
Code:
/**
 * Collects the art layers of every layer set in `layerNodes`, descending
 * into nested layer sets. For each set, the layers of its nested sets come
 * first, followed by the set's own art layers.
 *
 * The earlier version copied `tmp[i]` (outer index) instead of `tmp[j]`, so
 * nested results were duplicated or undefined.
 *
 * @param layerNodes Collection of layer sets (each with `layerSets` and `artLayers`).
 * @returns Flat array of art layers; empty when `layerNodes` is missing or empty.
 */
function getAllLayersInLayerSets(layerNodes) {
    var retList = [];
    if (!layerNodes) {
        return retList;
    }
    for (var i = 0; i < layerNodes.length; i++) {
        var node = layerNodes[i];
        var nested = getAllLayersInLayerSets(node.layerSets);
        for (var j = 0; j < nested.length; j++) {
            retList.push(nested[j]);
        }
        for (var layerIndex = 0; layerIndex < node.artLayers.length; layerIndex++) {
            retList.push(node.artLayers[layerIndex]);
        }
    }
    return retList;
}
Many thanks for any help or discussion.
I know this is an old thread, but this might be useful for someone.
I was looking for a function that would get me all the ArtLayers in a Photoshop comp, including layers nested in groups. The above function was returning undefined, so I modified it and got it to work.
var doc = app.activeDocument;
var allLayers = [];
var allLayers = collectAllLayers(doc, allLayers);
/**
 * Appends every ArtLayer found under `doc` to `allLayers`, in document
 * order, descending into anything that is not an ArtLayer.
 *
 * @param doc       Document or group exposing a `layers` collection.
 * @param allLayers Array that receives the layers (mutated).
 * @returns The same `allLayers` array.
 */
function collectAllLayers (doc, allLayers){
    var position = 0;
    while (position < doc.layers.length) {
        var candidate = doc.layers[position];
        position += 1;
        if (candidate.typename !== "ArtLayer") {
            collectAllLayers(candidate, allLayers);
            continue;
        }
        allLayers.push(candidate);
    }
    return allLayers;
}
Minor expansion on Ghoul Fool's post to only get all VISIBLE art layers in the active document. :P
// Get layers in a document
var sourceDocument = app.activeDocument;
var visibleLayers = [];
var visibleLayers = collectAllLayers(sourceDocument, visibleLayers);
// Print out total layers found
alert(visibleLayers.length);
// Recursively get all visible art layers in a given document
/**
 * Appends every visible ArtLayer found under `parent` to `allLayers`,
 * descending into anything that is not an ArtLayer (regardless of that
 * group's own visibility flag).
 *
 * @param parent    Document or group exposing a `layers` collection.
 * @param allLayers Array that receives the visible layers (mutated).
 * @returns The same `allLayers` array.
 */
function collectAllLayers (parent, allLayers)
{
    var position = 0;
    while (position < parent.layers.length)
    {
        var candidate = parent.layers[position];
        position += 1;
        if (candidate.typename !== "ArtLayer")
        {
            collectAllLayers(candidate, allLayers);
            continue;
        }
        if (candidate.visible)
        {
            allLayers.push(candidate);
        }
    }
    return allLayers;
}
To get all the layers (and sub layers) you have to have a recursive function
var allLayers = new Array();
var theLayers = collectAllLayers(app.activeDocument, 0);
/**
 * Walks the layer tree from the last child to the first and records every
 * non-ArtLayer entry (i.e. every group) in the script-level `allLayers`
 * array as "<nesting level><group name>", descending into each group.
 * ArtLayers themselves are not recorded.
 *
 * @param theParent Document or group exposing a `layers` collection.
 * @param level     Nesting depth of `theParent`'s children (0 for the document).
 * @returns The script-level `allLayers` array, so callers that assign the
 *          result no longer receive `undefined`.
 */
function collectAllLayers (theParent, level)
{
    for (var m = theParent.layers.length - 1; m >= 0; m--)
    {
        var theLayer = theParent.layers[m];
        if (theLayer.typename != "ArtLayer")
        {
            allLayers.push(level + theLayer.name);
            collectAllLayers(theLayer, level + 1);
        }
    }
    return allLayers;
}
/**
 * Runs the 'selectAllLayers' action, passing a descriptor whose 'null' key
 * references the ('Lyr ', 'Ordn', 'Trgt') enumerated layer target, with
 * dialogs suppressed.
 */
function selectAllLayers() {
    var targetLayer = new ActionReference();
    targetLayer.putEnumerated(charIDToTypeID('Lyr '), charIDToTypeID('Ordn'), charIDToTypeID('Trgt'));

    var descriptor = new ActionDescriptor();
    descriptor.putReference(charIDToTypeID('null'), targetLayer);

    executeAction(stringIDToTypeID('selectAllLayers'), descriptor, DialogModes.NO);
}

Spliting String and getting appropriate value in JavaScript

I have a string where |||| means next to it is the directory. ||| means the user is allowed to access this directory and || means the files allocated to these users follow.
I need to find allocated file names of a specific user from this string. I have tried to split the string and assign values to an array but I am not able to get the result I'm looking for.
This is the string:
||||Root|||adil001,km11285c,km61052,km61639c,adil001,kl04707c,km47389,km58184,km61052,kq61023c,adil001,km11285c,km61052,km61639c,||LimitTest_20140528164643.xlsx,testTask2_20140528140033.xlsx,||||1400842226669|||adil001,km11285c,km61052,km61639c,adil001,kl04707c,km47389,km58184,km61052,kq61023c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,||LimitTest_20140528164643.xlsx,testTask2_20140528140033.xlsx,testTask1_20140528135944.xlsx,testTask2_20140528140033.xlsx,||||1401191909489|||adil001,km11285c,km61052,km61639c,adil001,kl04707c,km47389,km58184,km61052,kq61023c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,adil001,kl04707c,km47389,km58184,km61052,kq61023c,||LimitTest_20140528164643.xlsx,testTask2_20140528140033.xlsx,testTask1_20140528135944.xlsx,testTask2_20140528140033.xlsx,LimitTest_20140528164643.xlsx,
And here is my attempt:
// Asker's attempt: splits the data on "||||" (directory marker) and then each
// segment on "|||" (user-list marker). `user` and `files` are declared but
// never used, so no per-user lookup happens here.
function getData() {
var user = 'km11285c';
var value = "||||Root|||adil001,km11285c,km61052,km61639c,adil001,kl04707c,km47389,km58184,km61052,kq61023c,adil001,km11285c,km61052,km61639c,||LimitTest_20140528164643.xlsx,testTask2_20140528140033.xlsx,||||1400842226669|||adil001,km11285c,km61052,km61639c,adil001,kl04707c,km47389,km58184,km61052,kq61023c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,||LimitTest_20140528164643.xlsx,testTask2_20140528140033.xlsx,testTask1_20140528135944.xlsx,testTask2_20140528140033.xlsx,||||1401191909489|||adil001,km11285c,km61052,km61639c,adil001,kl04707c,km47389,km58184,km61052,kq61023c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,adil001,kl04707c,km47389,km58184,km61052,kq61023c,||LimitTest_20140528164643.xlsx,testTask2_20140528140033.xlsx,testTask1_20140528135944.xlsx,testTask2_20140528140033.xlsx,LimitTest_20140528164643.xlsx,";
var users = null;
var files = null;
var Dir = value.split("||||");
var arrayLength = Dir.length;
for (var i = 0; i < arrayLength; i++) {
// Each pass overwrites `users`, so only the split of the last segment
// survives the loop.
users = Dir[i].split("|||");
}
// Returns [directory name, "users||files" remainder] of the last segment only.
return users;
}
console.log(getData());
and the jsFiddle
I changed your jsfiddle example a bit so maybe you need to change the code here and there, but something like this should work:
/**
 * Parses the directory string into a list of directories.
 *
 * Format: "||||<dir>|||<user>,<user>,...||<file>,<file>,..." repeated once
 * per directory. Every list in the data ends with a trailing comma; the
 * resulting empty entries are dropped so `users` and `files` contain real
 * names only.
 *
 * @param data The raw string.
 * @returns Array of `{ name, users, files }`; `users` / `files` are empty
 *          arrays when a segment lacks that part. Empty segments (such as
 *          the one before the first "||||") are skipped.
 */
function buildTree(data) {
    // Splits "a,b," into ["a", "b"]; a missing or empty list gives [].
    function splitList(csv) {
        if (!csv) {
            return [];
        }
        return csv.split(",").filter(function (entry) {
            return entry !== "";
        });
    }

    var tree = [];
    var dirs = data.split("||||");
    for (var i = 0; i < dirs.length; ++i) {
        if (dirs[i] === "") {
            continue;
        }
        var nameAndRest = dirs[i].split("|||");
        var usersAndFiles = (nameAndRest[1] || "").split("||");
        tree.push({
            name: nameAndRest[0],
            users: splitList(usersAndFiles[0]),
            files: splitList(usersAndFiles[1])
        });
    }
    return tree;
}
/**
 * Parses the sample data with buildTree() and logs, for each directory whose
 * user list contains 'km11285c', the directory name and its files.
 */
function getData() {
    var user = 'km11285c';
    var value="||||Root|||adil001,km11285c,km61052,km61639c,adil001,kl04707c,km47389,km58184,km61052,kq61023c,adil001,km11285c,km61052,km61639c,||LimitTest_20140528164643.xlsx,testTask2_20140528140033.xlsx,||||1400842226669|||adil001,km11285c,km61052,km61639c,adil001,kl04707c,km47389,km58184,km61052,kq61023c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,||LimitTest_20140528164643.xlsx,testTask2_20140528140033.xlsx,testTask1_20140528135944.xlsx,testTask2_20140528140033.xlsx,||||1401191909489|||adil001,km11285c,km61052,km61639c,adil001,kl04707c,km47389,km58184,km61052,kq61023c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,adil001,km11285c,km61052,km61639c,adil001,kl04707c,km47389,km58184,km61052,kq61023c,||LimitTest_20140528164643.xlsx,testTask2_20140528140033.xlsx,testTask1_20140528135944.xlsx,testTask2_20140528140033.xlsx,LimitTest_20140528164643.xlsx,";
    buildTree(value).forEach(function (entry) {
        // Skip directories the user is not listed for.
        if (entry.users.indexOf(user) < 0) {
            return;
        }
        console.log("User '" + user + "' has access to directory '" + entry.name + "', which contains these files: " + entry.files.join(","));
    });
}
getData();

How to Access local CSS file from within Alfresco Javascript?

So I have been able to figure out in Alfresco, there is a form called skin.css that allows me to change the highlighted color of data table items. However, I only want to be able to change this property during the course of a workflow and not as it applies to all data list elements throughout the entire Share website.
To start, I have a script which kicks off based on a rule and moves any updated/new files into a specified folder and then kicks off a workflow for that file. Within starting the workflow, the package items list is populated with all the documents within the same folder as the document that just got moved/the workflow started on. Below is the script:
function main()
{
var counter=0;
//Administrative Adjudication space/folder MUST exist under companyhome.
var rootSpaceName = companyhome.childByNamePath("mainFolder");
//If the rootspacename is null (not previously created), then exit the program as we have nothing to do.
if(rootSpaceName == null)
{
logger.log("Company Home/mainFolder does not exist, so we have nothing to do.");
return;
}
else
{
logger.log("Company Home/mainFolder exists, so carry on our process.");
//Creates an array of all the children under the rootSpaceName
var childList = rootSpaceName.children;
//Creates a variable which counts the number of children in the childList array
var count = childList.length;
//var seconds = new Date().getTime() / 1000;
//If there are no children in the rootSpaceName folder, exit the program.
if(count == 0)
{
logger.log("Company Home/mainFolder does not have child, nothing to do.");
return;
}
else
{
for(var i = 0; i < count; i++)
{
//Title MUST exist.
var childTitle = childList[i].properties["hearing:childTitle"];
//Author MUST exist.
var childAuthor = childList[i].properties["hearing:childAuthor"];
logger.log("childTitle: " + childTitle);
logger.log("childAuthor: " + childAuthor);
if(childTitle == null || childAuthor == null)
{
logger.log(i + ". Both the childTitle and childAuthor are null...");
continue;
}
var child = childList[i];
if(child.isContainer == false)
{
for(var j = 0; j < count; j++)
{
var newChildName = childList[j].properties.name;
logger.log("New child name: " + newChildName);
var newChild = childList[j];
if((newChild.isContainer == true) && (childTitle == newChildName))
{
logger.log("There is a currently existing folder with the same name as the title of original child");
var newSpaceName = rootSpaceName.childByNamePath(newChildName);
var newChildList = newSpaceName.children;
var newCount = newChildList.length;
for(var k = 0; k < newCount; k++)
{
var newNewChildName = newChildList[k].properties.name;
var newNewchildAuthor = newChildList[k].properties.author;
var newNewChild = newChildList[k];
if((newNewChild.isContainer == true) && (newNewchildAuthor == childAuthor))
{
var currentSpace = newSpaceName.childByNamePath(newNewChildName);
if(child.isDocument == true)
{
//Only want the workflow to run once so we increment count
counter=counter+1;
child.move(currentSpace);
//If Count is 1, then run workflow
if(counter==1)
{
//starts HelloWorldUI workflow
var wfdef=workflow.getDefinitionByName("activiti$helloWorldUI");
if(wfdef)
{
var wfparams=new Array();
wfparams["bpm:workflowDescription"]="";
wfparams["bpm:groupAssignee"]=people.getGroup("GROUP_Managers");
var wfpackage=workflow.createPackage();
var rootSpaceName=currentSpace;
var childList=rootSpaceName.children;
var count=childList.length;
//add all existing documents in the space to the workflow
for(var i = 0; i < count; i++)
{
wfpackage.addNode(childList[i]);
}
var wfpath=wfdef.startWorkflow(wfpackage,wfparams);
var tasks=wfpath.getTasks();
for each(task in tasks)
{
task.endTask(null);
}
}
}
}
}
}
}
else
{
// If title folder is already created, not need to create again.
var newSpaceName = companyhome.childByNamePath("mainFolder/" + childTitle);
if(newSpaceName == null)
{
newSpaceName = rootSpaceName.createFolder(childTitle);
logger.log("mainFolder/" + childTitle + " is created.");
}
// If author folder is already created, not need to create again.
var newNewSpaceName = companyhome.childByNamePath("mainFolder/" + childTitle + "/" + childAuthor);
if(newNewSpaceName == null)
{
newNewSpaceName = newSpaceName.createFolder(childAuthor);
logger.log("mainFolder/" + childTitle + "/" + childAuthor + " is created.");
}
if(child.isDocument == true)
{
counter=counter + 1;
child.move(newNewSpaceName);
if(counter == 1)
{
var wfdef=workflow.getDefinitionByName("activiti$helloWorldUI");
if(wfdef)
{
var wfparams=new Array();
wfparams["bpm:workflowDescription"]="";
wfparams["bpm:groupAssignee"]=people.getGroup("GROUP_Managers");
var wfpackage=workflow.createPackage();
var rootSpaceName=newNewSpaceName;
var childList=rootSpaceName.children;
var count=childList.length;
//add all items from the space to the workflow
for(var i = 0; i <c ount; i++)
{
wfpackage.addNode(childList[i]);
}
var wfpath=wfdef.startWorkflow(wfpackage,wfparams);
var tasks=wfpath.getTasks();
for each(task in tasks)
{
task.endTask(null);
}
}
}
logger.log("Moving file " + child.properties.name);
}
}
}
}
}
}
}
return;
}
main();
I would like to be able to create a function of some sort that can be called to access the skin.css file only during the course of the workflow and basically set .yui-skin-default tr.yui-dt-first{background-color:#FFF} in the CSS file. Does anyone know how I would go about doing that?
If you want to change only in start workflow page,
your css should write in start-workflow.css which is pointed by start-workflow.get.head.ftl. This css will override in other css file like skin.css.
Like this way, you can override any css to affect in only start workflow page not others.
You can try for other workflow related pages.
I've found a bookmarklet that will allow you to inject a CSS file on any page you'd like. Only down side is that you'll have to run it every time you load your page.
http://allben.net/post/2010/01/30/CSS-JavaScript-Injection-Bookmarklets.aspx

How to extract text from a PDF in JavaScript

I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is a string array, textContent.bidiTexts[]. You concatenate its entries to get the text of one page. The text blocks' coordinates are used to judge whether a newline or a space needs to be inserted. (This may not be totally robust, but from my tests it seems OK.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
/**
 * Extracts plain text from a PDF through pdf.js (global `PDFJS`).
 *
 * `complete` holds the number of pages finished during the current
 * pdfToText() run; it is reset at the start of every run.
 */
function Pdf2TextClass(){
    var self = this;
    this.complete = 0;

    // Text blocks come either with `x`/`y` fields (old `bidiTexts` entries) or
    // with a `transform` array (`items` entries). For the latter the same
    // slots are read as elsewhere in this file: transform[5] for the
    // "new line" comparison and transform[4] for the "add a space" comparison.
    function newlineCoord(block){
        if (typeof block.x == 'number') return block.x;
        return block.transform ? block.transform[5] : undefined;
    }
    function spaceCoord(block){
        if (typeof block.y == 'number') return block.y;
        return block.transform ? block.transform[4] : undefined;
    }

    // Concatenates the blocks of one page, inserting "\r\n" or ' ' between
    // blocks based on their coordinates.
    function joinBlocks(blocks){
        var page_text = "";
        var last_block = null;
        for( var k = 0; k < blocks.length; k++ ){
            var block = blocks[k];
            if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
                if( newlineCoord(block) < newlineCoord(last_block) )
                    page_text += "\r\n";
                else if ( spaceCoord(last_block) != spaceCoord(block) && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
                    page_text += ' ';
            }
            page_text += block.str;
            last_block = block;
        }
        return page_text;
    }

    /**
     * @param data ArrayBuffer of the pdf file content, or a URL string
     * @param callbackPageDone Optional. Informs about progress each time a
     *        page is finished. Its input parameters are:
     *        1) number of pages done;
     *        2) total number of pages in file.
     * @param callbackAllDone Called once, as soon as the last page has been
     *        processed, with the text extracted from the whole file.
     */
    this.pdfToText = function(data, callbackPageDone, callbackAllDone){
        console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
        var notifyPage = typeof callbackPageDone == 'function' ? callbackPageDone : function(){};
        self.complete = 0;
        PDFJS.getDocument( data ).then( function(pdf) {
            var total = pdf.numPages;
            notifyPage( 0, total );
            // Page texts keyed by page number: pages may finish out of order.
            var layers = {};
            for (var i = 1; i <= total; i++){
                pdf.getPage(i).then( function(page){
                    var n = page.pageNumber;
                    page.getTextContent().then( function(textContent){
                        // Newer pdf.js exposes `items`, older builds `bidiTexts`.
                        var blocks = textContent.items || textContent.bidiTexts;
                        if( null != blocks ){
                            layers[n] = joinBlocks(blocks) + "\n\n";
                            console.log("page " + n + " finished.");
                        }
                        ++ self.complete;
                        notifyPage( self.complete, total );
                        if (self.complete == total){
                            // Every page has reported, so the text can be assembled
                            // right away; pages without text are skipped.
                            var full_text = "";
                            for( var j = 1; j <= total; j++)
                                if (layers[j] != null) full_text += layers[j];
                            callbackAllDone(full_text);
                        }
                    }); // end of page.getTextContent().then
                }); // end of page.then
            } // of for
        });
    }; // end of pdfToText()
}; // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere, and you need pdf.js and pdf.worker.js to get this working.
/**
* Extract text from PDFs with PDF.js
* Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
*/
this.pdfToText = function(data) {
    // Point pdf.js at its worker script and character maps.
    PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
    PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
    PDFJS.cMapPacked = true;

    // Resolves with the strings of one page (1-based number), joined by spaces.
    function readPageText(pdf, pageNumber) {
        return pdf.getPage(pageNumber).then(function(page) {
            return page.getTextContent();
        }).then(function(textContent) {
            var strings = [];
            for (var k = 0; k < textContent.items.length; k++) {
                strings.push(textContent.items[k].str);
            }
            return strings.join(' ');
        });
    }

    // Request all pages up front, then join their texts in page order.
    return PDFJS.getDocument(data).then(function(pdf) {
        var pending = [];
        for (var pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
            pending.push(readPageText(pdf, pageNumber));
        }
        return Promise.all(pending);
    }).then(function(pageTexts) {
        return pageTexts.join("\r\n");
    });
}
usage:
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
// Reads the file chosen in #pdffile as a data URL, extracts its text with
// Pdf2TextClass and appends that text to #result.
function convert() {
    const reader = new FileReader();
    const extractor = new Pdf2TextClass();
    const appendToResult = (text) => {
        document.getElementById('result').innerText += text;
    };
    reader.onload = function () {
        extractor.pdfToText(reader.result, null, appendToResult);
    };
    const selectedFile = document.getElementById('pdffile').files[0];
    reader.readAsDataURL(selectedFile);
}
/**
 * Extracts plain text from a PDF through pdf.js (global `pdfjsLib`).
 *
 * `complete` holds the number of pages finished during the current
 * pdfToText() run; it is reset at the start of every run.
 */
function Pdf2TextClass() {
    var self = this;
    this.complete = 0;

    // Text items carry their position in a `transform` array rather than in
    // `x`/`y` fields, so comparing `block.x`/`block.y` never inserted any
    // whitespace. The same slots are read as in the Node example of this
    // file: transform[5] for the "new line" comparison and transform[4] for
    // the "add a space" comparison. `x`/`y` are still honoured when present.
    function newlineCoord(block) {
        if (typeof block.x == 'number') return block.x;
        return block.transform ? block.transform[5] : undefined;
    }
    function spaceCoord(block) {
        if (typeof block.y == 'number') return block.y;
        return block.transform ? block.transform[4] : undefined;
    }

    // Concatenates the items of one page, inserting "\r\n" or ' ' between
    // items based on their coordinates.
    function joinBlocks(blocks) {
        var page_text = "";
        var last_block = null;
        for (var k = 0; k < blocks.length; k++) {
            var block = blocks[k];
            if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
                if (newlineCoord(block) < newlineCoord(last_block))
                    page_text += "\r\n";
                else if (spaceCoord(last_block) != spaceCoord(block) && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
                    page_text += ' ';
            }
            page_text += block.str;
            last_block = block;
        }
        return page_text;
    }

    /**
     * @param data ArrayBuffer of the pdf file content, or a URL string
     * @param callbackPageDone Optional progress callback (pages done, total
     *        pages); may be null.
     * @param callbackAllDone Called once, as soon as the last page has been
     *        processed, with the text extracted from the whole file.
     */
    this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
        console.assert(data instanceof ArrayBuffer || typeof data == 'string');
        var notifyPage = typeof callbackPageDone == 'function' ? callbackPageDone : function () {};
        self.complete = 0;
        var loadingTask = pdfjsLib.getDocument(data);
        loadingTask.promise.then(function (pdf) {
            // Public page count instead of the private `_pdfInfo` field.
            var total = pdf.numPages;
            notifyPage(0, total);
            // Page texts keyed by page number: pages may finish out of order.
            var layers = {};
            for (var i = 1; i <= total; i++) {
                pdf.getPage(i).then(function (page) {
                    var n = page.pageNumber;
                    page.getTextContent().then(function (textContent) {
                        if (null != textContent.items) {
                            layers[n] = joinBlocks(textContent.items) + "\n\n";
                            console.log("page " + n + " finished.");
                        }
                        ++self.complete;
                        notifyPage(self.complete, total);
                        if (self.complete == total) {
                            // Every page has reported, so the text can be assembled
                            // right away; pages without text are skipped.
                            var full_text = "";
                            for (var j = 1; j <= total; j++)
                                if (layers[j] != null) full_text += layers[j];
                            callbackAllDone(full_text);
                        }
                    }); // end of page.getTextContent().then
                }); // end of page.then
            } // of for
        });
    }; // end of pdfToText()
}; // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");
// React to messages posted by the processor frame only.
window.addEventListener("message", function(event){
    if (event.source != processor.contentWindow) return;
    if (event.data === "ready") {
        // The processor is ready: download the PDF named by the input's
        // `src` attribute as binary data and hand it over.
        var request = new XMLHttpRequest;
        request.open('GET', input.getAttribute("src"), true);
        request.responseType = "arraybuffer";
        request.onload = function(event) {
            processor.contentWindow.postMessage(this.response, "*");
        };
        request.send();
        return;
    }
    // Any other message is the extracted text: show it with runs of
    // whitespace collapsed to single spaces.
    output.textContent = event.data.replace(/\s+/g, " ");
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
@gm2008's answer was a great starting point (please read it and its comments for more info), but it needed some updates (08/19) and had some unused code. I also like examples that are more complete. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');
let pathToPDF = 'path/to/myPDFfileToText.pdf';
let toText = Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
let onFinish = function(fullText) { console.log(fullText) };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
/**
 * Factory for a PDF-to-text converter built on pdf.js (module-level `PDFJS`).
 *
 * Returns a fresh object each time. The earlier version used `this`, which,
 * when the factory is called without `new`, is the global object in sloppy
 * mode and `undefined` in strict mode / ES modules.
 *
 * @returns {{complete: number, pdfToText: function}} `complete` is the
 *          number of pages finished during the current pdfToText() run.
 */
function Pdf2TextObj() {
    const self = {};
    self.complete = 0;

    // Concatenates the text items of one page, inserting "\r\n" or ' '
    // between items based on two slots of each item's `transform` array.
    // NOTE(review): in a PDF matrix [a, b, c, d, e, f] slot 4 is usually the
    // horizontal and slot 5 the vertical offset, so the line-break test looks
    // at the vertical position — confirm against the pdf.js docs.
    function joinItems(items) {
        let page_text = "";
        let last_item = null;
        for (let itemsi = 0; itemsi < items.length; itemsi++) {
            const item = items[itemsi];
            if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
                const breakCoord = item.transform[5];
                const lastBreakCoord = last_item.transform[5];
                const gapCoord = item.transform[4];
                const lastGapCoord = last_item.transform[4];
                if (breakCoord < lastBreakCoord)
                    page_text += "\r\n";
                else if (gapCoord != lastGapCoord && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
                    page_text += ' ';
            } // ends if may need to add whitespace
            page_text += item.str;
            last_item = item;
        } // ends for every item of text
        return page_text;
    }

    /**
     * @param path Path to the pdf file.
     * @param callbackPageDone Optional. Informs about progress each time a
     *        page is finished. Its input parameters are:
     *        1) number of pages done.
     *        2) total number of pages in file.
     *        3) the `page` object itself or null.
     * @param callbackAllDone Called once, as soon as the last page has been
     *        processed. Input parameters:
     *        1) full text of parsed pdf.
     */
    self.pdfToText = function(path, callbackPageDone, callbackAllDone) {
        const notifyPage = typeof callbackPageDone == 'function' ? callbackPageDone : function() {};
        self.complete = 0;
        PDFJS.getDocument(path).promise.then(function(pdf) {
            const total = pdf.numPages;
            notifyPage(0, total, null);
            // Pages do not necessarily finish in consecutive order, so their
            // texts are stored by page number and combined at the end.
            const pages = {};
            for (let pagei = 1; pagei <= total; pagei++) {
                pdf.getPage(pagei).then(function(page) {
                    const pageNumber = page.pageNumber;
                    page.getTextContent().then(function(textContent) {
                        if (null != textContent.items) {
                            pages[pageNumber] = joinItems(textContent.items) + "\n\n";
                            console.log("page " + pageNumber + " finished.");
                        } // ends if has items
                        ++self.complete;
                        notifyPage(self.complete, total, page);
                        // Once every page has reported, put the pages in order,
                        // skip pages without text, and pass the result on
                        // directly (no timer needed).
                        if (self.complete == total) {
                            let full_text = "";
                            for (let pageNum = 1; pageNum <= total; pageNum++)
                                if (pages[pageNum] != null) full_text += pages[pageNum];
                            callbackAllDone(full_text);
                        }
                    }); // ends page.getTextContent().then
                }); // ends page.then
            } // ends for every page
        });
    }; // Ends pdfToText()
    return self;
}; // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
/**
 * Extracts the plain text of a PDF in the browser using Mozilla's pdf.js
 * (expects the `pdfjsLib` global provided by pdf.js).
 *
 * Instantiate with `new Pdf2TextClass()`.
 */
function Pdf2TextClass(){
    var self = this;
    // Number of pages finished during the most recent pdfToText() call.
    this.complete = 0;

    /**
     * Joins the text items of one page, inserting line breaks and spaces
     * based on each item's position.
     *
     * pdf.js text items carry their placement in `transform` as
     * [a, b, c, d, e, f] (index 4: horizontal offset, index 5: vertical
     * offset); they have no `x` / `y` properties, so comparing those never
     * inserted any whitespace.
     */
    function joinBlocks(blocks){
        var page_text = "";
        var last_block = null;
        for( var k = 0; k < blocks.length; k++ ){
            var block = blocks[k];
            if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
                if( block.transform[5] < last_block.transform[5] ){
                    // The vertical offset dropped: the block starts a new line.
                    page_text += "\r\n";
                } else if ( last_block.transform[4] != block.transform[4] && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null )){
                    // Different horizontal offset, and the previous block does
                    // not end in a lone letter: separate the two with a space.
                    page_text += ' ';
                }
            }
            page_text += block.str;
            last_block = block;
        }
        return page_text;
    }

    /**
     * Extracts the text of every page and logs the combined text.
     *
     * @param data URL string or ArrayBuffer of the PDF.
     * @param callbackPageDone Optional. Called with (pagesDone, totalPages)
     *        once before parsing starts and again after each finished page.
     * @param callbackAllDone Optional. Called with the full text once all
     *        pages are collected.
     * @returns {Promise<string>} Resolves with the full text; rejects when
     *          loading or parsing fails.
     */
    this.pdfToText = function(data, callbackPageDone, callbackAllDone){
        console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
        var onPageDone = typeof callbackPageDone == 'function' ? callbackPageDone : function(){};
        var onAllDone = typeof callbackAllDone == 'function' ? callbackAllDone : function(){};
        // Start counting from zero so the instance can be reused.
        self.complete = 0;

        var loadingTask = pdfjsLib.getDocument(data);
        return loadingTask.promise.then(function(pdf) {
            var total = pdf.numPages;
            onPageDone( 0, total );

            // Pages finish in arbitrary order. One promise per page, kept in
            // page order, lets Promise.all return the texts already sorted
            // and only after every page is really done.
            var pagePromises = [];
            for (var i = 1; i <= total; i++){
                pagePromises.push( pdf.getPage(i).then( function(page){
                    var n = page.pageNumber;
                    return page.getTextContent().then( function(textContent){
                        var page_text = "";
                        if( textContent != null && textContent.items != null ){
                            page_text = joinBlocks(textContent.items) + "\n\n";
                            console.log("page " + n + " finished.");
                        }
                        ++ self.complete;
                        onPageDone( self.complete, total );
                        return page_text;
                    }); // end of page.getTextContent().then
                })); // end of page.then
            } // of for

            return Promise.all(pagePromises).then( function(pageTexts){
                var full_text = pageTexts.join("");
                console.log(full_text);
                onAllDone(full_text);
                return full_text;
            });
        });
    }; // end of pdfToText()
} // end of class
// Create an extractor instance.
var pdff = new Pdf2TextClass();
// Start parsing; 'PDF_URL' is presumably a placeholder for the address of the
// PDF to read. No callbacks are passed; pdfToText writes the collected text
// to the console.
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
 * Created by velten on 25.04.16.
 *
 * Downloads a PDF over HTTP, streams it into pdf2json and prints the total
 * number of text blocks found across all pages.
 */
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
// pdf2json exports the parser class; a stream has to be piped into an
// instance of it, not into the class itself.
let PDFParser = require('pdf2json');
let pdfParser = new PDFParser();
// encoding: null keeps the response body as raw bytes instead of a string.
let download = request({url: pdfUrl, encoding: null});
// Without a listener, a network failure would surface as an unhandled
// 'error' event.
download.on("error", err => console.error(err) );
let pdfPipe = download.pipe(pdfParser);
pdfPipe.on("pdfParser_dataError", err => console.error(err) );
pdfPipe.on("pdfParser_dataReady", pdf => {
    //optionally:
    //let pdf = pdfParser.getMergedTextBlocksIfNeeded();
    // NOTE(review): older pdf2json releases nest the pages under `formImage`;
    // newer ones appear to expose `Pages` at the top level - confirm against
    // the installed version. Both shapes are accepted here.
    let pages = (pdf.formImage || pdf).Pages || [];
    let count1 = 0;
    //count the text blocks on every page
    for (let page of pages) {
        count1 += page.Texts.length;
    }
    console.log(count1);
    pdfParser.destroy();
});
It is possible but:
you would have to use a server anyway; there's no way to get the content of a file on the user's computer without transferring it to the server and back
I don't think anyone has written such a library yet
So if you have some free time, you can learn the PDF format and write such a library yourself, or you can just use a server-side library, of course.

Categories