Script runs extremely slow, needs to be more efficient - javascript

Goal:
My script takes every folder and layer in photoshop, gets the center point coordinates, and saves them to a txt file.
Issue:
The script works very well and gives the exact data I need. However, the script runs extremely slowly when I have lots of Photoshop layers. For example, I ran the script on a PSD that has, let's say, 200 small layers. This took about 20 minutes to produce the output text file that I need. My question, and bear in mind I am not a programmer, is how to increase the efficiency of this code and have it run faster.
Here is a Sample of the Output Data:
1 -MUD ROOM/GARAGE: 483.5x,559y
130A: 307.5x,681y
Lighting_icon_square_4x copy 19: 382x,749y
Lighting_icon_square_4x copy 19: 382x,681y
Lighting_icon_square_4x copy 19: 382x,613y
Lighting_icon_square_4x copy 18: 233x,749y
Lighting_icon_square_4x copy 17: 233x,681y
Lighting_icon_square_4x copy 13: 233x,613y
The Code:
// Bring application forward
app.bringToFront();
// Set active Document variable and decode name for output.
// slice(0, -4) drops the last four characters of the decoded name, i.e. it
// assumes a dot plus a three-letter extension such as ".psd".
var docRef = app.activeDocument;
var docName = decodeURI(activeDocument.name).slice(0, -4);
// Remember the user's ruler units, then switch to pixels before any bounds
// are read. The saved value is put back near the end of the script.
var defaultRulerUnits = preferences.rulerUnits;
preferences.rulerUnits = Units.PIXELS;
// Number of art layers in the active document.
// NOTE(review): layerNum is not read anywhere else in the visible script.
var layerNum = app.activeDocument.artLayers.length;
// Start from the active layer. recurseLayers() reassigns layerRef, x and y for
// every layer it visits, so the values set here only act as placeholders.
var layerRef = app.activeDocument.activeLayer;
// x and y hold bounds[2] - bounds[0] and bounds[3] - bounds[1]: the extent of
// the layer along each axis (used below as width and height), not a position.
var x = (layerRef.bounds[2].value) - (layerRef.bounds[0].value);
var y = (layerRef.bounds[3].value) - (layerRef.bounds[1].value);
// Accumulates one output line per layer; handed to writeFile() at the end.
var coords = "";
// Walk every layer of a container depth-first and append one line per layer,
// formatted "<name>: <centerX>x,<centerY>y", to the global `coords` string.
//
// currLayers: an object exposing a `layers` collection (the document or a
//             layer set).
//
// The `layers` collection, its length and each layer's `bounds` are read
// exactly once per use site and kept in locals. The original re-read
// `currLayers.layers` three times and `bounds` six times for every layer,
// which multiplies the number of round trips into the host application.
// The globals layerRef, x and y are still updated as before.
function recurseLayers(currLayers) {
    var children = currLayers.layers;
    var count = children.length;
    for (var i = 0; i < count; i++) {
        var layer = children[i];
        var bounds = layer.bounds;
        var left = bounds[0].value;
        var topEdge = bounds[1].value;
        layerRef = layer;
        x = bounds[2].value - left;
        y = bounds[3].value - topEdge;
        coords += layer.name + ": " + (left + x/2) + "x" + "," + (topEdge + y/2) + "y" + "\n";
        // Descend into layer sets so nested layers are listed right after their parent.
        if (isLayerSet(layer)) {
            recurseLayers(layer);
        }
    }
}
// Test whether `layer` is a layer set that has at least one child.
//
// Returns true when `layer.layers` exists and is non-empty, false in every
// other case. The original fell off the end of the function (returning
// undefined) for an empty layer set; the result is now always a boolean.
function isLayerSet(layer) {
    try {
        return layer.layers.length > 0;
    }
    catch (err) {
        // Reading `.length` throws when there is no `layers` collection at all.
        return false;
    }
}
// Let the user pick the destination folder (null when the dialog is cancelled).
var FPath = Folder.selectDialog("Save exported coordinates to");
// Choose the line-feed style for the output file from the host OS name.
fileLineFeed = /windows/i.test($.os) ? "Windows" : "Macintosh";
// Export to txt file.
//
// Writes `info` to "<FPath>/<docName>.txt", removing any previous file of that
// name first. Uses the globals FPath, docName and fileLineFeed.
//
// Returns true when the text was written, false when there was no target
// folder or the file could not be opened or written. Failures are reported
// with alert() instead of being swallowed, and the file is closed on every path.
function writeFile(info) {
    // The folder dialog yields null when cancelled; do not build a "null/..." path.
    if (FPath == null) {
        return false;
    }
    var f = new File(FPath + "/" + docName + ".txt");
    try {
        f.remove();
        if (!f.open('a')) {
            alert("Could not open " + f.fsName + " for writing: " + f.error);
            return false;
        }
        f.lineFeed = fileLineFeed;
        f.write(info);
        return true;
    }
    catch (e) {
        alert("Could not write " + f.fsName + ": " + e);
        return false;
    }
    finally {
        f.close();
    }
}
// Run the export and show the result.
// The cancel check comes first, so a cancelled folder dialog no longer walks
// every layer and attempts a write before reporting "Export aborted".
if ( FPath == null ) {
    preferences.rulerUnits = defaultRulerUnits; // Set preferences back to user's defaults
    alert("Export aborted", "Canceled");
}
else {
    try {
        recurseLayers(docRef);
    }
    finally {
        // Restore the user's ruler units even if the layer walk throws.
        preferences.rulerUnits = defaultRulerUnits;
    }
    writeFile(coords);
    alert("Exported " + docName + " x/y coordinates to " + FPath + "/" + docName + ".txt ");
}

For the sake of other readers facing performance bugs, I've shortened your code up a bit. This runs fast on my machine.
// Ask where to save, then stream the coordinates of every layer of the
// active document into that file.
// Returns nothing. Does nothing if the save dialog is cancelled; shows an
// alert if the file cannot be opened.
function main() {
    var file = File.saveDialog("Save exported coordinates");
    if (file == null) return;
    if (!file.open('w')) {
        alert("Aborted", "Could not write file.");
        return;
    }
    try {
        write(activeDocument.layers, file);
    }
    finally {
        // The original never closed the file; close it even if write() throws.
        file.close();
    }
    alert("Exported x/y coordinates to " + file.path);
}
// Write one line per layer, "<name>: <centerX>x, <centerY>y", to `file`,
// descending into layer sets right after their own line.
//
// layers: a layer collection (indexable, with `length`).
// file:   an opened file object with a write(string) method.
//
// `bounds` is read once per layer and its four numeric `.value`s are used
// directly; the original looked `layer.bounds` up six times per layer.
// NOTE(review): this script never sets the ruler units, so the numbers are
// presumably in whatever units the user has configured — confirm.
function write(layers, file) {
    var count = layers.length;
    for (var i = 0; i < count; i++) {
        var layer = layers[i];
        var bounds = layer.bounds;
        var left = bounds[0].value;
        var topEdge = bounds[1].value;
        var centerX = left + (bounds[2].value - left) / 2;
        var centerY = topEdge + (bounds[3].value - topEdge) / 2;
        file.write(layer.name + ": " + centerX + "x, " + centerY + "y\n");
        if (layer instanceof LayerSet) {
            write(layer.layers, file);
        }
    }
}
main();

Related

Is it possible to load build result in VS Output window?

I have been trying to switch IDEs.
What if I want to load a MISRA check result file into Visual Studio?
Is there any direct or simpler way?
Made indirect workaround:
// Convert a tab-separated MISRA export into "file(line): warning|error ..."
// lines echoed to standard output, one per input row that has a File:Line column.
// Runs under Windows Script Host (cscript) in the folder holding the export.
var fso = new ActiveXObject("Scripting.FileSystemObject");
var rootPath = fso.GetFolder(".");
var cStat = rootPath.files;

// Pick the first file in the current folder whose extension is .txt.
// The original compared positions inside ShortName and, when nothing matched,
// went on to open whichever file happened to be enumerated last (or failed on
// an empty folder). Now the extension is tested directly and a missing export
// is reported explicitly.
var strFileName = null;
for (var objEnum = new Enumerator(cStat); !objEnum.atEnd(); objEnum.moveNext()) {
    var candidate = objEnum.item();
    if (fso.GetExtensionName(candidate.Name).toUpperCase() == "TXT") {
        strFileName = candidate;
        break;
    }
}
if (strFileName == null) {
    WScript.Echo("No .txt file found in " + rootPath);
    WScript.Quit(1);
}

var ts = strFileName.OpenAsTextStream(1); // 1 = open for reading
try {
    while (!ts.AtEndOfStream) {
        // IAR MISRA line: Description Rule Severity File:Line
        var fields = ts.ReadLine().split('\t');
        if (fields[3])
        {
            // "path:123" -> "path(123)"
            var res = fields[3].replace(/(.+):(\d+)/g, "$1($2)");
            if (fields[2] == "Low")
            {
                res += ": warning " + fields[1] + ": " + fields[0];
            }
            else
            {
                res += ": error " + fields[1] + ": " + fields[0] + ' ' + fields[2];
            }
            WScript.Echo(res);
        }
    }
}
finally {
    // Release the file even if a row fails to convert.
    ts.Close();
}
Using fake NMAKE project with build command like cscript /NoLogo PrintLog.js.
Now I can open files reported by MISRA in VS by copying export txt file to this project and running build.
Similar older IAR warning(s) filter used as pipe by command:
...\iarbuild "project.ewp" ReleaseCfg | cscript /NoLogo IARfilterPipe.js.
// Filter for build output piped in on standard input.
// - Lines containing '[' are treated as possible diagnostics: "Remark[..]:" is
//   rewritten to "Warning ..:", "Error[..]:" to "Error ..:", and the current
//   folder prefix is stripped. Diagnostics not listed by oldBad() are echoed
//   with a leading '!' and collected as new; the rest are echoed with '_'.
// - Other lines are echoed unless they contain a source file name
//   (.s/.cpp/.c); a line with exactly one such name is recorded instead.
// Finally buildFiles.txt and newWarnings.txt are written and a summary echoed.
var fso = new ActiveXObject("Scripting.FileSystemObject");
var rootPath = fso.GetFolder(".") + '\\';

// Lookup table of already-known diagnostic lines.
// Indexed loop with a declared counter instead of `for (i in x)`, which
// leaked a global `i` and enumerated the array with for-in.
var x = oldBad(), skip = {};
for (var i = 0; i < x.length; i++) skip[x[i]] = 1;

var stat = [0, 0], all = [], newWarnings = []; // stat = [remark count, error count]

// Test for end of input BEFORE reading: the original do/while called
// ReadLine() once even when standard input was already empty.
while (!WScript.StdIn.AtEndOfStream) {
    var line = WScript.StdIn.ReadLine();
    if (line.indexOf('[') > 0) // possible warning line
    {
        all.push(line);
        line = line.replace(rootPath, "");
        var fit = 0;
        if (line.indexOf("Remark[") > -1)
        {
            fit++;
            stat[0]++;
            line = line
                .replace(/\d+>\s+/g, "")
                .replace(/Remark\[(\S+)\]:/g, "Warning $1:");
        }
        else if (line.indexOf("Error[") > -1)
        {
            fit++;
            stat[1]++;
            line = line
                .replace(/\d+>\s+/g, "")
                .replace(/Error\[(\S+)\]:/g, "Error $1:");
        }
        if (skip[line] != 1 && fit)
        {
            newWarnings.push(line);
            WScript.Echo('!' + line);
        } else {
            WScript.Echo('_' + line);
        }
    }
    else {
        var m = line.match(/\s*[^\s.]+\.(s|cpp|c)/g);
        if (m == null) // no name.ext
        {
            WScript.Echo(line);
        }
        else if (m.length == 1) // single filename
        {
            all.push(line);
        }
    }
}
if (all.length)
{
    all = all.sort();
    writeFile("buildFiles.txt", all.join('\n'));
}
if (newWarnings.length)
{
    writeFile("newWarnings.txt", newWarnings.join('\n'));
    WScript.Echo("========== New warnings: ==========");
}
for (var l = 0; l < newWarnings.length; l++)
{
    WScript.Echo(newWarnings[l]);
}
if (stat[0] + stat[1])
{
    WScript.Echo("========== Build Result - Warnings " + stat[0] + " Errors " + stat[1] + " ==========");
}
WScript.Quit(0); // (do not work from file after Echo => -1)
// Create `filename` through the global FileSystemObject `fso` and write
// `content` into it.
// The stream is closed in a finally block so it is released even when
// Write() throws; the error itself still propagates to the caller.
function writeFile(filename, content)
{
    var TextStream = fso.CreateTextFile(filename);
    try {
        TextStream.Write(content);
    }
    finally {
        TextStream.Close();
    }
}
// Returns the list of already-known diagnostic lines. The caller turns it into
// a lookup table and compares each rewritten build-output line against it
// verbatim, so entries must match the rewritten text exactly.
// NOTE: the trailing `...` in the listing stands for further entries that were
// left out; it has to be removed or replaced before the script will parse.
function oldBad()
{
return [
'somefile.cpp(42) : Warning Pe340: value copied to temporary, reference to temporary used', ...
];
}

Website not responding - I think due to javascript code

I'm trying to make a game in javascript and display it on my website, but the website isn't responding anymore. The internet connection is just fine.
Here is the code where I think it went wrong:
// Once the DOM is ready, run the setup routine and log that startup happened.
$(function onDocumentReady() {
    init();
    console.log("Main Init Called");
});
// Fill the FilesBrd and RanksBrd lookup arrays: every entry is first marked
// SQUARES.OFFBOARD, then each (file, rank) pair in the board range stores its
// file and rank at the index returned by FR2SQ(file, rank). Logs a few entries
// and both arrays afterwards.
function InitFilesRanksBrd() {
// These initial values are all overwritten by the loops below before being read.
var index = 0;
var file = FILES.FILE_A;
var rank = RANKS.RANK_1;
var sq = SQUARES.A1;
// Pass 1: mark every square as off-board.
for(index = 0; index < BRD_SQ_NUM; ++index) {
FilesBrd[index] = SQUARES.OFFBOARD;
RanksBrd[index] = SQUARES.OFFBOARD;
}
// Pass 2: record file and rank for the on-board squares.
for(rank = RANKS.RANK_1; rank <= RANKS.RANK_5; ++rank) {
// BUG: the condition tests `rank`, not `file`. `rank` does not change inside
// this inner loop, so once the condition is true the loop never terminates
// and the page stops responding. The condition should compare `file`.
// NOTE(review): this loop starts at FILES.FILE_1 while the declaration above
// uses FILES.FILE_A — confirm which name the FILES table actually defines.
for(file = FILES.FILE_1; rank <= FILES.FILE_E; ++file) {
sq = FR2SQ(file,rank);
FilesBrd[sq] = file;
RanksBrd[sq] = rank;
}
}
console.log("FilesBrd[0]:" + FilesBrd[0] + " RanksBrd[0]:" + RanksBrd[0]);
console.log("FilesBrd[SQUARES.A1]:" + FilesBrd[SQUARES.A1] + " RanksBrd[SQUARES.A1]:" + RanksBrd[SQUARES.A1]);
//console.log("FilesBrd[SQUARES.C5]:" + FilesBrd[SQUARES.C5] + " RanksBrd[SQUARES.C5]:" + RanksBrd[SQUARES.C5]);
console.log(FilesBrd);
console.log(RanksBrd);
}
// Setup entry point: announce the call on the console, then build the
// file/rank lookup tables.
function init() {
    console.log("init() called");
    InitFilesRanksBrd();
}
In the inner for, the variable that you are comparing is rank, not file. It should be:
for(file = FILES.FILE_1; file <= FILES.FILE_E; ++file)
Because the rank variable isn't incremented until the body of the outer loop ends, and that body never ends because the inner loop never terminates.

Node and Lazy: How do I know when it's done?

I need to read a file line by line, and change a variable accordingly.
I would normally write this in PHP... but I decided to take the challenge.
I wrote:
// Read every "*.txt" file in the current directory line by line and collect
// its sections into software[<file name without .txt>][<section name>].
// NOTE: fs, Lazy, path, files and (below) name are assigned without
// var/let/const, so they become implicit globals; `path` is not used in this listing.
fs = require('fs');
Lazy = require('lazy');
path = require('path');
files = fs.readdirSync('.');
var software = {};
files.forEach( function(fileName){
var m;
if( m = fileName.match(/^(.*)\.txt$/) ){
// NOTE(review): `name` is one shared global. The line callback below runs
// later than this loop (see the question), so with several .txt files it
// presumably sees the name of the last file — confirm.
name = m[1];
console.log("Processing file: " + fileName);
software[name] = {};
console.log("Software 1: %j",software);
// Current section; 'unset' means "not inside a section".
var section = 'unset';
new Lazy(fs.createReadStream(fileName)).lines.forEach(
function(line){
var m;
// Coerce the line to a string before matching.
line = line + '';
// A section header is letters/underscores followed by ':' and nothing else,
// so a header such as "Option 1:" (space and digit) does not match.
if( m = line.match(/^([a-zA-Z_]*):$/)){
section = m[1];
software[name][section] = '';
console.log("Switching to section " + m[1]);
console.log("Software 2: %j",software);
} else if (line == '.'){
// A lone '.' or an empty line ends the current section.
section = 'unset'
} else if (line == ''){
section = 'unset'
} else {
console.log("LINE: " + line) ;
// NOTE: software[name]['unset'] is never initialised, so text outside any
// section is appended to undefined and stored as "undefined<line>\n".
software[name][section] = software[name][section] + line + "\n";
console.log("Software 3: %j",software);
}
}
);
}
});
// Runs right after the streams are set up; as the question reports, the
// line callbacks have not populated `software` yet at this point.
console.log("Software 4: %j",software);
Apart from the code being very ugly and very unoptimised, I am having trouble because when the last line prints, the "software" variable is not yet populated! I am guessing Lazy is asynchronous. So, it basically works, but "at some point later". This is great, but... where do I write code that runs once that important cycle, the one that fills in the software variable, has actually finished?
As requested: data to play with!
simply create "something.txt" and write:
name:
Name 1
.
Option 1:
Value 1
.
Option 2:
Value 2
.
Option 3:
Multi
Line
Value
.
Another_section:
Again
.
Merc.
The instances of Lazy returned by the library are EventEmitters, and they emit an event called pipe when a "set" of operations is complete:
new Lazy(
...
).on('pipe', function() {
// all done
});
Modifying your code to use this event results in (the only change is near the bottom):
// Same reader as in the question: collect the sections of every "*.txt" file
// in the current directory into software[<file name>][<section name>]. The
// only difference is the 'pipe' handler attached near the bottom.
// NOTE: fs, Lazy, path, files and name are implicit globals (no var/let/const).
fs = require('fs');
Lazy = require('lazy');
path = require('path');
files = fs.readdirSync('.');
var software = {};
files.forEach( function(fileName){
var m;
if( m = fileName.match(/^(.*)\.txt$/) ){
// NOTE(review): `name` is shared by all files; with more than one .txt file
// the asynchronous callbacks presumably all see the last value — confirm.
name = m[1];
console.log("Processing file: " + fileName);
software[name] = {};
console.log("Software 1: %j",software);
var section = 'unset';
new Lazy(fs.createReadStream(fileName)).lines.forEach(
function(line){
var m;
line = line + '';
// Header lines: letters/underscores followed by ':' only.
if( m = line.match(/^([a-zA-Z_]*):$/)){
section = m[1];
software[name][section] = '';
console.log("Switching to section " + m[1]);
console.log("Software 2: %j",software);
} else if (line == '.'){
section = 'unset'
} else if (line == ''){
section = 'unset'
} else {
console.log("LINE: " + line) ;
// NOTE: text outside a section lands in the uninitialised 'unset' key.
software[name][section] = software[name][section] + line + "\n";
console.log("Software 3: %j",software);
}
}
// Per the answer, 'pipe' is emitted when the chain of operations is complete.
// The handler is registered inside the per-file loop, so this log is set up
// once for every .txt file rather than once overall.
).on('pipe', function() {
console.log("Software 4: %j",software);
});
}
});
[Edit] To answer your question regarding how I found this info:
I did indeed check out the source file for the project; I knew the library had a sum method that could be chained to instances of Lazy to sum up everything at the end; the code for that method calls foldr, and the code for that method listens for an event called pipeName, which is defaulted in line 22 as pipe.

Indesign CS6 Scripting - Exporting images

I'm having trouble writing a js script in indesign cs6 to export my formatted images. the code below (found on this website and slightly modified) only opens the document.
ideally the script would loop through all of the formatted/cropped images in my document and export them into a new folder on the desktop, but with the original file names.
any help would be much appreciated:
test();
// Open the InDesign document at a fixed path and export every rectangle found
// inside its groups as a 300 ppi, maximum-quality JPEG named after the
// rectangle, into the Export folder on the desktop.
function test(){
    // Apply the JPEG settings, then export a single rectangle.
    function exportAsJpeg(rectangle) {
        app.jpegExportPreferences.exportResolution = 300;
        app.jpegExportPreferences.jpegQuality = JPEGOptionsQuality.MAXIMUM;
        var target = new File('/Users/StudioA/Desktop/Export/' + rectangle.name + '.jpg');
        rectangle.exportFile(ExportFormat.JPG, target);
    }

    var groups = app.open('/Users/StudioA/Desktop/file.indd').groups;
    var g = 0;
    while (g < groups.length) {
        var r = 0;
        while (r < groups[g].rectangles.length) {
            exportAsJpeg(groups[g].rectangles[r]);
            r++;
        }
        g++;
    }
}
The file name isn't located on the rectangle but on the link related to the placed graphic.
This should do what you want given an open document:
test();
// Export every rectangle of the active document that holds a valid graphic as
// a 300 ppi, maximum-quality JPEG on the desktop. The output name is the
// linked file's name with its 2-4 letter extension replaced by ".jpg".
// Items are taken from the end of allPageItems until the list is exhausted.
function test() {
    var items = app.activeDocument.allPageItems;
    var item, jpgName, target;
    for (item = items.pop(); item; item = items.pop()) {
        if (item instanceof Rectangle && item.graphics[0].isValid) {
            // The file name lives on the link of the placed graphic, not on the rectangle.
            jpgName = File ( item.graphics[0].itemLink.filePath ).name
                .replace( /\.[a-z]{2,4}$/i, '.jpg' );
            app.jpegExportPreferences.exportResolution = 300;
            app.jpegExportPreferences.jpegQuality = JPEGOptionsQuality.MAXIMUM;
            target = new File (Folder.desktop+"/"+ jpgName);
            item.exportFile(ExportFormat.JPG, target);
        }
    }
}
Just adding my verbose version of this, which works from the current selection in InDesign and provides console feedback. It renames the images with the prefix "crop_" and saves them to ~/temp
exportSelectedImages();
// Export each selected object that contains an image as a 72 ppi,
// high-quality JPEG at "~/temp/crop_<linked file name>.jpg", logging progress
// to the console.
//
// Selected objects without an image are now skipped. The original still
// exported them, using whatever file name was left over from the previous
// object (or "undefined" for the first one), which overwrote earlier output.
function exportSelectedImages() {
    // configure export settings
    app.jpegExportPreferences.exportResolution = 72;
    app.jpegExportPreferences.jpegQuality = JPEGOptionsQuality.HIGH;
    // collect selected objects
    var selected = app.activeDocument.selection;
    $.writeln("Got " + selected.length + " selected objects...");
    // process selected objects
    for (var i = 0; i < selected.length; i++) {
        var cursor = selected[i];
        $.writeln("Processing #" + (i+1) + "/" + selected.length);
        $.writeln("\t Type: " + cursor.constructor.name);
        // verify if object contains an image or not
        if (!cursor.images || cursor.images.length === 0) {
            $.writeln("\t Not an image");
            continue;
        }
        var img = cursor.images[0];
        $.writeln("\t Contains image of type " + img.imageTypeName);
        var imageFileName = img.itemLink.name;
        $.writeln("\t File Name: " + imageFileName);
        // save the object to a jpeg in path specified below
        var myFile = new File('~/temp/' + "crop_" + imageFileName + '.jpg');
        cursor.exportFile(ExportFormat.JPG, myFile);
    }
    $.writeln("Done.");
}

How to extract text from a PDF in JavaScript

I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is a string array, textContent.bidiTexts[]. You concatenate its entries to get the text of one page. The text blocks' coordinates are used to judge whether a newline or a space needs to be inserted. (This may not be totally robust, but from my tests it seems OK.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
/**
 * Extracts the text of a PDF with pdf.js (the global `PDFJS`), reading the
 * `bidiTexts` blocks of each page.
 *
 * Changes from the original listing: the page loop counter is declared (it
 * was an implicit global, which throws in strict mode and ES modules); the
 * unused `document.getElementById('viewer')` lookup is gone; the completion
 * counter restarts on every call so an instance can be reused; a page without
 * text contributes an empty entry instead of the string "undefined"; and the
 * result is delivered as soon as the last page is done rather than after an
 * arbitrary one-second timer.
 */
function Pdf2TextClass(){
    var self = this;
    this.complete = 0;
    /**
     * @param data ArrayBuffer of the pdf file content, or a URL string
     * @param callbackPageDone To inform the progress each time
     *        when a page is finished. The callback function's input parameters are:
     *        1) number of pages done;
     *        2) total number of pages in file.
     *        May be omitted or null.
     * @param callbackAllDone The input parameter of callback function is
     *        the result of extracted text from pdf file.
     */
    this.pdfToText = function(data, callbackPageDone, callbackAllDone){
        console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
        self.complete = 0;
        PDFJS.getDocument( data ).then( function(pdf) {
            var total = pdf.numPages;
            if (callbackPageDone) callbackPageDone( 0, total );
            // Pages finish in arbitrary order, so texts are keyed by page number.
            var layers = {};
            for (var i = 1; i <= total; i++){
                pdf.getPage(i).then( function(page){
                    var n = page.pageNumber;
                    page.getTextContent().then( function(textContent){
                        var page_text = "";
                        if( null != textContent.bidiTexts ){
                            var last_block = null;
                            for( var k = 0; k < textContent.bidiTexts.length; k++ ){
                                var block = textContent.bidiTexts[k];
                                // Only consider inserting whitespace when the previous block
                                // does not already end with a space.
                                if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
                                    if( block.x < last_block.x )
                                        page_text += "\r\n";
                                    else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
                                        page_text += ' ';
                                }
                                page_text += block.str;
                                last_block = block;
                            }
                            console.log("page " + n + " finished."); //" content: \n" + page_text);
                        }
                        layers[n] = page_text + "\n\n";
                        ++ self.complete;
                        if (callbackPageDone) callbackPageDone( self.complete, total );
                        if (self.complete == total){
                            // Every page stored its text before bumping the counter,
                            // so the pages can be joined in order right away.
                            var full_text = "";
                            for( var j = 1; j <= total; j++)
                                full_text += layers[j] ;
                            callbackAllDone(full_text);
                        }
                    }); // end of page.getTextContent().then
                }); // end of page.then
            } // of for
        });
    }; // end of pdfToText()
}; // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere, and you need pdf.js and pdf.worker.js to get this working.
/**
 * Extract text from PDFs with PDF.js
 * Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
 *
 * Takes whatever PDFJS.getDocument accepts and returns a promise for the
 * document text: the strings of each page joined with single spaces, and the
 * pages joined with "\r\n" in page order.
 */
this.pdfToText = function(data) {
    PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
    PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
    PDFJS.cMapPacked = true;

    // Promise for the space-joined strings of one page (1-based page number).
    function readPage(pdf, pageNumber) {
        return pdf.getPage(pageNumber).then(function(page) {
            return page.getTextContent();
        }).then(function(textContent) {
            var strings = [];
            for (var k = 0; k < textContent.items.length; k++) {
                strings.push(textContent.items[k].str);
            }
            return strings.join(' ');
        });
    }

    return PDFJS.getDocument(data).then(function(pdf) {
        var pending = [];
        for (var pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
            pending.push(readPage(pdf, pageNumber));
        }
        return Promise.all(pending);
    }).then(function(pageTexts) {
        return pageTexts.join("\r\n");
    });
}
usage:
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
// Click handler for the "Process" button: read the file chosen in the
// #pdffile input as a data URL, run it through Pdf2TextClass and append the
// extracted text to #result.
// Does nothing when no file has been chosen; the original passed `undefined`
// to readAsDataURL in that case, which throws.
function convert() {
    var file = document.getElementById('pdffile').files[0];
    if (!file) {
        return;
    }
    var fr = new FileReader();
    var pdff = new Pdf2TextClass();
    fr.onload = function () {
        pdff.pdfToText(fr.result, null, (text) => { document.getElementById('result').innerText += text; });
    };
    fr.readAsDataURL(file);
}
// Extracts the text of a PDF with pdf.js (the global `pdfjsLib`).
//
// pdfToText(data, callbackPageDone, callbackAllDone)
//   data             ArrayBuffer or URL string handed to pdfjsLib.getDocument
//   callbackPageDone optional; called with (pages done, total pages)
//   callbackAllDone  called once with the text of all pages in page order
//
// Changes from the original listing: the page loop counter is declared (it
// was an implicit global); the page count comes from the public `numPages`
// instead of the private `_pdfInfo`; whitespace decisions use each item's
// `transform` because text items carry no `x`/`y` properties, so the old
// comparisons never inserted anything; the completion counter restarts on
// every call; pages without items contribute an empty entry; and the result
// is delivered without the arbitrary one-second timer.
function Pdf2TextClass() {
    var self = this;
    this.complete = 0;
    this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
        console.assert(data instanceof ArrayBuffer || typeof data == 'string');
        self.complete = 0;
        var loadingTask = pdfjsLib.getDocument(data);
        loadingTask.promise.then(function (pdf) {
            var total = pdf.numPages;
            if (callbackPageDone) callbackPageDone(0, total);
            // Pages finish in arbitrary order, so texts are keyed by page number.
            var layers = {};
            for (var i = 1; i <= total; i++) {
                pdf.getPage(i).then(function (page) {
                    var n = page.pageNumber;
                    page.getTextContent().then(function (textContent) {
                        var page_text = "";
                        if (null != textContent.items) {
                            var last_block = null;
                            for (var k = 0; k < textContent.items.length; k++) {
                                var block = textContent.items[k];
                                if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
                                    // transform is [a, b, c, d, e, f]: e is the horizontal and
                                    // f the vertical position of the item.
                                    var x = block.transform[4], y = block.transform[5];
                                    var lastX = last_block.transform[4], lastY = last_block.transform[5];
                                    if (y < lastY)
                                        page_text += "\r\n"; // moved down the page: new line
                                    else if (x != lastX && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
                                        page_text += ' ';
                                }
                                page_text += block.str;
                                last_block = block;
                            }
                            console.log("page " + n + " finished."); //" content: \n" + page_text);
                        }
                        layers[n] = page_text + "\n\n";
                        ++self.complete;
                        if (callbackPageDone) callbackPageDone(self.complete, total);
                        if (self.complete == total) {
                            // Every page stored its text before bumping the counter.
                            var full_text = "";
                            for (var j = 1; j <= total; j++)
                                full_text += layers[j];
                            callbackAllDone(full_text);
                        }
                    }); // end of page.getTextContent().then
                }); // end of page.then
            } // of for
        });
    }; // end of pdfToText()
}; // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");

// React to messages posted by the processor frame; anything from another
// source is ignored.
window.addEventListener("message", function(event){
    if (event.source != processor.contentWindow) return;

    if (event.data === "ready") {
        // The processor is ready: download the PDF named by the input's
        // "src" attribute as an ArrayBuffer and post it to the frame.
        var xhr = new XMLHttpRequest;
        xhr.open('GET', input.getAttribute("src"), true);
        xhr.responseType = "arraybuffer";
        xhr.onload = function(event) {
            processor.contentWindow.postMessage(this.response, "*");
        };
        xhr.send();
    } else {
        // Any other message is the extracted text; collapse whitespace runs.
        output.textContent = event.data.replace(/\s+/g, " ");
    }
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
@gm2008's answer was a great starting point (please read it and its comments for more info), but it needed some updates (08/19) and had some unused code. I also like examples that are more complete. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
// pdf.js distribution; used as the global `PDFJS` inside Pdf2TextObj below.
let PDFJS = require('pdfjs-dist');
// Placeholder path; replace with the PDF to convert.
let pathToPDF = 'path/to/myPDFfileToText.pdf';
// Pdf2TextObj is called as a plain function here (no `new`); the function
// declaration further down is hoisted, so calling it before its definition works.
let toText = Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
// Print the complete text once every page has been processed.
let onFinish = function(fullText) { console.log(fullText) };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
/**
 * Factory for a PDF-to-text converter built on pdf.js (the `PDFJS` binding).
 *
 * Returns an object with a `complete` counter and a `pdfToText` method. The
 * object is created here rather than taken from `this`: the factory is called
 * without `new`, where `this` is either undefined (strict mode, ES modules)
 * or the global object. Calling it with `new` still yields the same object.
 */
function Pdf2TextObj() {
    let self = { complete: 0 };
    /**
     * @param path Path to the pdf file.
     * @param callbackPageDone To inform the progress each time
     *        when a page is finished. The callback function's input parameters are:
     *        1) number of pages done.
     *        2) total number of pages in file.
     *        3) the `page` object itself or null.
     * @param callbackAllDone Called after all text has been collected. Input parameters:
     *        1) full text of parsed pdf.
     */
    self.pdfToText = function(path, callbackPageDone, callbackAllDone) {
        // Restart the counter so the same object can convert several files.
        self.complete = 0;
        PDFJS.getDocument(path).promise.then(function(pdf) {
            let total = pdf.numPages;
            callbackPageDone(0, total, null);
            // Pages do not finish in consecutive order, so their text is
            // stored by page number and joined in order at the end.
            let pages = {};
            for (let pagei = 1; pagei <= total; pagei++) {
                pdf.getPage(pagei).then(function(page) {
                    let pageNumber = page.pageNumber;
                    page.getTextContent().then(function(textContent) {
                        let page_text = "";
                        if (null != textContent.items) {
                            let last_item = null;
                            for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
                                let item = textContent.items[itemsi];
                                if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
                                    // transform is [a, b, c, d, e, f]: index 4 is the horizontal
                                    // and index 5 the vertical position. The comparisons are the
                                    // same as before; only the variable names are corrected.
                                    let itemY = item.transform[5];
                                    let lastItemY = last_item.transform[5];
                                    let itemX = item.transform[4];
                                    let lastItemX = last_item.transform[4];
                                    if (itemY < lastItemY)
                                        page_text += "\r\n"; // moved down the page: new line
                                    else if (itemX != lastItemX && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
                                        page_text += ' ';
                                } // ends if may need to add whitespace
                                page_text += item.str;
                                last_item = item;
                            } // ends for every item of text
                            console.log("page " + pageNumber + " finished."); // " content: \n" + page_text);
                        } // ends if has items
                        // A page without items still gets an (empty) entry, so the
                        // final join never appends the string "undefined".
                        pages[pageNumber] = page_text + "\n\n";
                        ++self.complete;
                        callbackPageDone(self.complete, total, page);
                        // Every page stores its text before the counter is bumped, so once
                        // the counter reaches the total all text is present and no timer
                        // is needed before combining it.
                        if (self.complete == total) {
                            let full_text = "";
                            for (let pageNum = 1; pageNum <= total; pageNum++)
                                full_text += pages[pageNum];
                            callbackAllDone(full_text);
                        }
                    }); // ends page.getTextContent().then
                }); // ends page.then
            } // ends for every page
        });
    }; // Ends pdfToText()
    return self;
}; // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
// Extracts the text of a PDF with pdf.js (the global `pdfjsLib`) and logs it.
//
// pdfToText(data, callbackPageDone, callbackAllDone)
//   data             ArrayBuffer or URL string handed to pdfjsLib.getDocument
//   callbackPageDone optional; called with (pages done, total pages)
//   callbackAllDone  optional; called once with the text of all pages
//
// The full text is still written to the console as before. Changes from the
// original listing: the page loop counter is declared (it was an implicit
// global); the page count comes from the public `numPages`; whitespace
// decisions use each item's `transform` because text items carry no `x`/`y`;
// the callbacks, which were accepted but never invoked, are called when
// supplied; the completion counter restarts on every call; and the arbitrary
// one-second timer is gone.
function Pdf2TextClass(){
    var self = this;
    this.complete = 0;
    this.pdfToText = function(data, callbackPageDone, callbackAllDone){
        console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
        self.complete = 0;
        var loadingTask = pdfjsLib.getDocument(data);
        loadingTask.promise.then(function(pdf) {
            var total = pdf.numPages;
            if (callbackPageDone) callbackPageDone( 0, total );
            // Pages finish in arbitrary order, so texts are keyed by page number.
            var layers = {};
            for (var i = 1; i <= total; i++){
                pdf.getPage(i).then( function(page){
                    var n = page.pageNumber;
                    page.getTextContent().then( function(textContent){
                        var page_text = "";
                        if( null != textContent.items ){
                            var last_block = null;
                            for( var k = 0; k < textContent.items.length; k++ ){
                                var block = textContent.items[k];
                                if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
                                    // transform is [a, b, c, d, e, f]: e is the horizontal and
                                    // f the vertical position of the item.
                                    var x = block.transform[4], y = block.transform[5];
                                    var lastX = last_block.transform[4], lastY = last_block.transform[5];
                                    if( y < lastY )
                                        page_text += "\r\n"; // moved down the page: new line
                                    else if ( x != lastX && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
                                        page_text += ' ';
                                }
                                page_text += block.str;
                                last_block = block;
                            }
                            console.log("page " + n + " finished."); //" content: \n" + page_text);
                        }
                        layers[n] = page_text + "\n\n";
                        ++ self.complete;
                        if (callbackPageDone) callbackPageDone( self.complete, total );
                        if (self.complete == total){
                            // Every page stored its text before bumping the counter.
                            var full_text = "";
                            for( var j = 1; j <= total; j++)
                                full_text += layers[j] ;
                            console.log(full_text);
                            if (callbackAllDone) callbackAllDone(full_text);
                        }
                    }); // end of page.getTextContent().then
                }); // end of page.then
            } // of for
        });
    }; // end of pdfToText()
}; // end of class
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
* Created by velten on 25.04.16.
*/
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
var pdfParser = require('pdf2json');

// Download the PDF as raw bytes (encoding: null) and pipe it into the parser.
let pdfPipe = request({url: pdfUrl, encoding:null}).pipe(pdfParser);

// Add up the number of text entries over all pages of the parsed document.
const countTexts = pdf => {
    let total = 0;
    //optionally:
    //let pdf = pdfParser.getMergedTextBlocksIfNeeded();
    for (const page of pdf.formImage.Pages) {
        total += page.Texts.length;
    }
    return total;
};

// Parse failures are reported on stderr.
pdfPipe.on("pdfParser_dataError", function (err) {
    console.error(err);
});

// When parsing is done, print the number of text entries and release the parser.
pdfPipe.on("pdfParser_dataReady", function (pdf) {
    console.log(countTexts(pdf));
    pdfParser.destroy();
});
It is possible but:
you would have to use the server anyway, there's no way you can get content of a file on user computer without transferring it to server and back
I don't think anyone has written such a library yet
So if you have some free time you can learn pdf format and write such a library yourself, or you can just use server side library of course.

Categories