I am trying to customize the Split node in Node-Red in a way to send the next message only when the first arrives to the Join node; as I am doing some processing in between, and would like to process each msg separately before joining them.
So I have cloned the Split node from Node-Red project, and at the part where the splitting of an array happens; I register listeners to events (random IDs generated by the original Split node).
else if (Array.isArray(msg.payload)) { // then split array into messages
msg.parts.type = 'array';
var count = msg.payload.length / node.arraySplt;
if (Math.floor(count) !== count) {
count = Math.ceil(count);
}
msg.parts.count = count;
var pos = 0;
var data = msg.payload;
msg.parts.len = node.arraySplt;
for (let i = 0; i < count; i++) {
msg.payload = data.slice(pos, pos + node.arraySplt);
if (node.arraySplt === 1) {
msg.payload = msg.payload[0];
}
msg.parts.index = i;
pos += node.arraySplt;
if (i === 0) {
send(RED.util.cloneMessage(msg));
continue;
}
let eventName = msg.parts.id + '-' + i;
addHandler(eventName, send, msg);
}
done();
}
My handler function
function addHandler(eventName, send, msg) {
RED.events.addListener(eventName, () => {
send(RED.util.cloneMessage(msg));
});
}
And at the Join node (which is in the same js file)
// commented below is the original code of the join function
// if ((tcnt > 0 && group.currentCount > tcnt) || msg.hasOwnProperty('complete')) {
// completeSend(partId);
// return;
// } else
if (msg.parts.index <= (msg.parts.count - 2)) {
let eventName = msg.parts.id + '-' + (parseInt(msg.parts.index) + 1);
RED.events.emit(eventName);
} else if (msg.parts.index >= (msg.parts.count - 1)) {
completeSend(partId);
return;
}
However, this would send the first msg (as I directly send it, and not through an event), and the last msg only; it would skip whatever in between.
Modifying the split node is the wrong way to do this.
There are rate limiting nodes that will do this for you. e.g. https://flows.nodered.org/node/node-red-contrib-semaphore
Or you can use the delay node with it's release function
Is it possible to compare filenames for a set of files that are imported as Photoshop layers ?
I have a folder of 50 jpg images which I have used in a PSD file.
Now I want to check whether all the JPG files are used or not ?
Is it possible to do so ?
As I've said, Photoshop scripting can help you achieve this by using File Objects and basic javascript knowledge. I've modified my old script as you've desired and now it should work well with any nested groups and images.
I highly encourage you to learn scripting and ask questions here wherever you feels confused.
Save below code as 'Script.jsx' and run it from 'File > Scripts > Browse'
Update 2 : Now it saves log.txt file too as per you requested. P.S. Learn from this script and tweak it to your desired result.
// Managing Document
var docs = app.documents;
// Progress Bar
var win = new Window("window{text:'Progress',bounds:[100,100,400,150],bar:Progressbar{bounds:[20,20,280,31] , value:0,maxvalue:100}};");
// assigning activeDocument
if (docs.length != 0) {
var docRef = app.activeDocument;
// Defining the folder
alert("You will be prompted for the folder containing your images.\n" +
"Files will be selected with a '.png'/'.jpg/.jpeg' on the end in the same folder.");
var folder = Folder.selectDialog();
if (!folder) {
exit;
}
var photoFiles = folder.getFiles(/\.(jpg|jpeg|png)$/i);
var matchFiles = [];
var photoFilesName = [];
//Searching for used images
var increment = parseFloat(0);
var divider = parseFloat(100/photoFiles.length);
win.show();
for (var i = 0; i < photoFiles.length; i++) {
increment = increment + divider;
var indexPhotoName = removeExtension(photoFiles[i].displayName);
photoFilesName.push(indexPhotoName);
var doc = activeDocument;
var curLayer;
goThroughLayers(doc, indexPhotoName);
}
function goThroughLayers(parentLayer, targetName) {
for (var i = 0; i < parentLayer.layers.length; i++) {
curLayer = parentLayer.layers[i];
doc.activeLayer = curLayer;
if (curLayer.typename == 'LayerSet') {
goThroughLayers(curLayer, targetName)
} else {
if (curLayer.name == targetName) {
// if (curLayer.name.match(/[e]/ig)) {
matchFiles.push(targetName);
// }
} //end if
} //end else
} //end loop
} //end function
function arr_diff(a1, a2) {
var a = [],
diff = [];
for (var i = 0; i < a1.length; i++) {
a[a1[i]] = true;
}
for (var i = 0; i < a2.length; i++) {
if (a[a2[i]]) {
delete a[a2[i]];
} else {
a[a2[i]] = true;
}
}
for (var k in a) {
diff.push(k);
}
return diff;
}
function removeExtension(str) {
return str.split('.').slice(0, -1).join('.');
}
var missItems = arr_diff(matchFiles, photoFilesName);
if (missItems.length > 0) {
var missFolder = new Folder(photoFiles[0].path + '/Missed%20Files');
if(!missFolder.exists){
missFolder.create();
}
for (var y = 0; y < photoFiles.length; y++) {
var photoTrimName = removeExtension(photoFiles[y].displayName);
for( var x = 0; x < missItems.length ; x++){
if(photoTrimName == missItems[x]){
photoFiles[y].copy(new File(missFolder+'/'+photoFiles[y].displayName));
}
}
};
win.close();
alert("You've missed total " + missItems.length + " files. Press OK to open folder containing missing files. Log report is generated wherever PSD is saved.");
var FileStr = "";
for(var m=0; m<missItems.length; m++){
FileStr = FileStr + '\n' + (m+1) + '. ' + missItems[m];
}
var str = "Your missed files are : " + FileStr;
saveTxt(str);
missFolder.execute();
} else {
win.close();
saveTxt('All Photos are used');
alert('All Photos are used');
}
} else {
alert('Open atleast one document');
}
function saveTxt(txt)
{
var Name = "LogReport_" + app.activeDocument.name.replace(/\.[^\.]+$/, '');
var Ext = decodeURI(app.activeDocument.name).replace(/^.*\./,'');
if (Ext.toLowerCase() != 'psd')
return;
var Path = app.activeDocument.path;
var saveFile = File(Path + "/" + Name +".txt");
if(saveFile.exists)
saveFile.remove();
saveFile.encoding = "UTF8";
saveFile.open("e", "TEXT", "????");
saveFile.writeln(txt);
saveFile.close();
}
In Javascript, it is possible to get some information related to PSD file layers using PSD.js library
I have a function for adding the contents of a separate google document at the cursor point within the active document, but I haven't been able to get it to work. I keep getting the "Element does not contain the specified child element" exception, and then the contents gets pasted to the bottom of the document rather than at the cursor point!
function AddTable() {
//here you need to get document id from url (Example, 1oWyVMa-8fzQ4leCrn2kIk70GT5O9pqsXsT88ZjYE_z8)
var FileTemplateFileId = "1MFG06knf__tcwHWdybaBk124Ia_Mb0gBE0Gk8e0URAM"; //Browser.inputBox("ID der Serienbriefvorlage (aus Dokumentenlink kopieren):");
var doc = DocumentApp.openById(FileTemplateFileId);
var DocName = doc.getName();
//Create copy of the template document and open it
var docCopy = DocumentApp.getActiveDocument();
var totalParagraphs = doc.getBody().getParagraphs(); // get the total number of paragraphs elements
Logger.log(totalParagraphs);
var cursor = docCopy.getCursor();
var totalElements = doc.getNumChildren();
var elements = [];
for (var j = 0; j < totalElements; ++j) {
var body = docCopy.getBody();
var element = doc.getChild(j).copy();
var type = element.getType();
if (type == DocumentApp.ElementType.PARAGRAPH) {
body.appendParagraph(element);
} else if (type == DocumentApp.ElementType.TABLE) {
body.appendTable(element);
} else if (type == DocumentApp.ElementType.LIST_ITEM) {
body.appendListItem(element);
}
// ...add other conditions (headers, footers...
}
Logger.log(element.editAsText().getText());
elements.push(element); // store paragraphs in an array
Logger.log(element.editAsText().getText());
for (var el = 0; el < elements.length; el++) {
var paragraph = elements[el].copy();
var doc = DocumentApp.getActiveDocument();
var bodys = doc.getBody();
var cursor = doc.getCursor();
var element = cursor.getElement();
var container = element.getParent();
try {
var childIndex = body.getChildIndex(container);
bodys.insertParagraph(childIndex, paragraph);
} catch (e) {
DocumentApp.getUi().alert("There was a problem: " + e.message);
}
}
}
You want to copy the objects (paragraphs, tables and lists) from the document of 1MFG06knf__tcwHWdybaBk124Ia_Mb0gBE0Gk8e0URAM to the active Document.
You want to copy the objects to the cursor position on the active Document.
You want to achieve this using Google Apps Script.
If my understanding is correct, how about this answer? Please think of this as just one of several possible answers.
Modification points:
In your script, appendParagraph, appendTable and appendListItem are used at the 1st for loop. I think that the reason that the copied objects are put to the last of the document is due to this.
var body = docCopy.getBody(); can be put to the out of the for loop.
In your case, I think that when the 1st for loop is modified, 2nd for loop is not required.
When above points are reflected to your script, it becomes as follows.
Modified script:
function AddTable() {
var FileTemplateFileId = "1MFG06knf__tcwHWdybaBk124Ia_Mb0gBE0Gk8e0URAM";
var doc = DocumentApp.openById(FileTemplateFileId);
var docCopy = DocumentApp.getActiveDocument();
var body = docCopy.getBody();
var cursor = docCopy.getCursor();
var cursorPos = docCopy.getBody().getChildIndex(cursor.getElement());
var totalElements = doc.getNumChildren();
for (var j = 0; j < totalElements; ++j) {
var element = doc.getChild(j).copy();
var type = element.getType();
if (type == DocumentApp.ElementType.PARAGRAPH) {
body.insertParagraph(cursorPos + j, element);
} else if (type == DocumentApp.ElementType.TABLE) {
body.insertTable(cursorPos + j, element);
} else if (type == DocumentApp.ElementType.LIST_ITEM) {
body.insertListItem(cursorPos + j, element);
}
}
}
It seems that DocName is not used in your script.
References:
insertParagraph()
insertTable()
insertListItem()
If I misunderstood your question and this was not the result you want, I apologize. At that time, can you provide the sample source Document? By this, I would like to confirm it.
I'm working on implementing a system where elements can be dragged and dropped to create flowcharts. My Issue is with saving the flowchart so that it could be reloaded when needed. For now I've created a method that saves all the previous data of the element onto the final array that holds only elements that are dropped on the container. But I'm getting a Trivial Error as Undefined variable on the debugging interface. Hence I'm not getting the intended output and the alert messages that I included are not being printed when the condition is met.
Code in Context
function saveFlowchart()
{
var nodes = [];
var matches = [];
var searchEles = document.getElementById("container").children;
for(var i = 0; i < searchEles.length; i++)
{
matches.push(searchEles[i]);
var idOfEl = searchEles[i].id;
if(searchEles[i].id !=null || searchEles[i].id !="")
{
var $element = $("#" + searchEles[i].id);
var dropElem = $("#" + searchEles[i].id).attr('class');
var position = $element.position();
position.bottom = position.top + $element.height();
position.right = position.left + $element.width();
alert("class:"+dropElem+"\nTop position: " + position.top + "\nLeft position: " + position.left + "\nBottom position: " + position.bottom + "\nRight position: " + position.right);
finalArray[idOfEl-1][0]= idOfEl;
finalArray[idOfEl-1][1]= dropElem;
var elId = parseInt(idOfEl);
if (dropElem == "stream ui-draggable")
{
for(var count=0;count<100;count++)
{
alert("One loop opened with count="+count);
if(createdImportStreamArray[count][0]==elId)
{
finalArray[elId-1][2]= createdImportStreamArray[count][1]; //Selected Stream from Predefined Streams
finalArray[elId-1][3]= createdImportStreamArray[count][2]; //asName
alert("createdImportStreamArray[count][0]==elId");
}
else if(createdExportStreamArray[count][0]==elId)
{
finalArray[elId-1][2]= createdExportStreamArray[count][1]; //Selected Stream from Predefined Streams
finalArray[elId-1][3]= createdExportStreamArray[count][2]; //asName
}
else if(createdDefinedStreamArray[count][0]==elId)
{
finalArray[elId-1][2]= createdDefinedStreamArray[count][1]; //Stream Name
finalArray[elId-1][3]= createdDefinedStreamArray[count][4]; //Number of Attributes
finalArray[elId-1][4]=[];
for(var f=0;f<createdDefinedStreamArray[r][4];f++)
{
finalArray[elId-1][4][f][0]=createdDefinedStreamArray[count][2][f][0]; //Attribute Name
finalArray[elId-1][4][f][1]=createdDefinedStreamArray[count][2][f][1]; // Attribute Type
}
}
alert("One loop closed with count="+count);
}
alert("Loop ended with count="+count);
}
else if (dropElem == "wstream ui-draggable")
{
ElementType="wstream";
}
// else if conditions...
alert(finalArray);
}
}
//Code to output the connection details in a json format
//The following is not affected by the above mentioned error
$(".node").each(function (idx, elem) {
var $elem = $(elem);
var endpoints = jsPlumb.getEndpoints($elem.attr('id'));
console.log('endpoints of '+$elem.attr('id'));
console.log(endpoints);
nodes.push({
blockId: $elem.attr('id'),
nodetype: $elem.attr('data-nodetype'),
positionX: parseInt($elem.css("left"), 10),
positionY: parseInt($elem.css("top"), 10)
});
});
var connections = [];
$.each(jsPlumb.getConnections(), function (idx, connection) {
connections.push({
connectionId: connection.id,
pageSourceId: connection.sourceId,
pageTargetId: connection.targetId
});
});
var flowChart = {};
flowChart.nodes = nodes;
flowChart.connections = connections;
flowChart.numberOfElements = numberOfElements;
var flowChartJson = JSON.stringify(flowChart);
//console.log(flowChartJson);
$('#jsonOutput').val(flowChartJson);
}
Debugging Interface
According to this the count variable in the for loop is undefined. I've tried taking the first statement var count=0 outside the loop declaration part and defining it in the very beginning of the method. But that simply checks for count=0 against the conditions and doesn't increment at all.
Any help in this regard will be highly appreciated as I've been working on this minor error for almost 2 days now.
I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is an string array textContent.bidiTexts[]. You concatenate them to get the text of 1 page. Text blocks' coordinates are used to judge whether newline or space need to be inserted. (This may not be totally robust, but from my test it seems ok.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
function Pdf2TextClass(){
var self = this;
this.complete = 0;
/**
*
* #param data ArrayBuffer of the pdf file content
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done;
* 2) total number of pages in file.
* #param callbackAllDone The input parameter of callback function is
* the result of extracted text from pdf file.
*
*/
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
PDFJS.getDocument( data ).then( function(pdf) {
var div = document.getElementById('viewer');
var total = pdf.numPages;
callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
if( null != textContent.bidiTexts ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.bidiTexts.length; k++ ){
var block = textContent.bidiTexts[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere and you nee pdf.js and pdf.worker.js to get this working.
/**
* Extract text from PDFs with PDF.js
* Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
*/
this.pdfToText = function(data) {
PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
PDFJS.cMapPacked = true;
return PDFJS.getDocument(data).then(function(pdf) {
var pages = [];
for (var i = 0; i < pdf.numPages; i++) {
pages.push(i);
}
return Promise.all(pages.map(function(pageNumber) {
return pdf.getPage(pageNumber + 1).then(function(page) {
return page.getTextContent().then(function(textContent) {
return textContent.items.map(function(item) {
return item.str;
}).join(' ');
});
});
})).then(function(pages) {
return pages.join("\r\n");
});
});
}
usage:
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
function convert() {
var fr=new FileReader();
var pdff = new Pdf2TextClass();
fr.onload=function(){
pdff.pdfToText(fr.result, null, (text) => { document.getElementById('result').innerText += text; });
}
fr.readAsDataURL(document.getElementById('pdffile').files[0])
}
function Pdf2TextClass() {
var self = this;
this.complete = 0;
this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
console.assert(data instanceof ArrayBuffer || typeof data == 'string');
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function (pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++) {
pdf.getPage(i).then(function (page) {
var n = page.pageNumber;
page.getTextContent().then(function (textContent) {
//console.log(textContent.items[0]);0
if (null != textContent.items) {
var page_text = "";
var last_block = null;
for (var k = 0; k < textContent.items.length; k++) {
var block = textContent.items[k];
if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
if (block.x < last_block.x)
page_text += "\r\n";
else if (last_block.y != block.y && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total) {
window.setTimeout(function () {
var full_text = "";
var num_pages = Object.keys(layers).length;
for (var j = 1; j <= num_pages; j++)
full_text += layers[j];
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");
// listen for messages from the processor
window.addEventListener("message", function(event){
if (event.source != processor.contentWindow) return;
switch (event.data){
// "ready" = the processor is ready, so fetch the PDF file
case "ready":
var xhr = new XMLHttpRequest;
xhr.open('GET', input.getAttribute("src"), true);
xhr.responseType = "arraybuffer";
xhr.onload = function(event) {
processor.contentWindow.postMessage(this.response, "*");
};
xhr.send();
break;
// anything else = the processor has returned the text of the PDF
default:
output.textContent = event.data.replace(/\s+/g, " ");
break;
}
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
#gm2008's answer was a great starting point (please read it and its comments for more info), but needed some updates (08/19) and had some unused code. I also like examples that are more full. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');
let pathToPDF = 'path/to/myPDFfileToText.pdf';
let toText = Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
let onFinish = function(fullText) { console.log(fullText) };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
function Pdf2TextObj() {
let self = this;
this.complete = 0;
/**
*
* #param path Path to the pdf file.
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done.
* 2) total number of pages in file.
* 3) the `page` object itself or null.
* #param callbackAllDone Called after all text has been collected. Input parameters:
* 1) full text of parsed pdf.
*
*/
this.pdfToText = function(path, callbackPageDone, callbackAllDone) {
// console.assert(typeof path == 'string');
PDFJS.getDocument(path).promise.then(function(pdf) {
let total = pdf.numPages;
callbackPageDone(0, total, null);
let pages = {};
// For some (pdf?) reason these don't all come in consecutive
// order. That's why they're stored as an object and then
// processed one final time at the end.
for (let pagei = 1; pagei <= total; pagei++) {
pdf.getPage(pagei).then(function(page) {
let pageNumber = page.pageNumber;
page.getTextContent().then(function(textContent) {
if (null != textContent.items) {
let page_text = "";
let last_item = null;
for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
let item = textContent.items[itemsi];
// I think to add whitespace properly would be more complex and
// would require two loops.
if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
let itemX = item.transform[5]
let lastItemX = last_item.transform[5]
let itemY = item.transform[4]
let lastItemY = last_item.transform[4]
if (itemX < lastItemX)
page_text += "\r\n";
else if (itemY != lastItemY && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
} // ends if may need to add whitespace
page_text += item.str;
last_item = item;
} // ends for every item of text
textContent != null && console.log("page " + pageNumber + " finished.") // " content: \n" + page_text);
pages[pageNumber] = page_text + "\n\n";
} // ends if has items
++self.complete;
callbackPageDone(self.complete, total, page);
// If all done, put pages in order and combine all
// text, then pass that to the callback
if (self.complete == total) {
// Using `setTimeout()` isn't a stable way of making sure
// the process has finished. Watch out for missed pages.
// A future version might do this with promises.
setTimeout(function() {
let full_text = "";
let num_pages = Object.keys(pages).length;
for (let pageNum = 1; pageNum <= num_pages; pageNum++)
full_text += pages[pageNum];
callbackAllDone(full_text);
}, 1000);
}
}); // ends page.getTextContent().then
}); // ends page.then
} // ends for every page
});
}; // Ends pdfToText()
return self;
}; // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
function Pdf2TextClass(){
var self = this;
this.complete = 0;
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function(pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
//console.log(textContent.items[0]);0
if( null != textContent.items ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.items.length; k++ ){
var block = textContent.items[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
console.log(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
* Created by velten on 25.04.16.
*/
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
var pdfParser = require('pdf2json');
let pdfPipe = request({url: pdfUrl, encoding:null}).pipe(pdfParser);
pdfPipe.on("pdfParser_dataError", err => console.error(err) );
pdfPipe.on("pdfParser_dataReady", pdf => {
//optionally:
//let pdf = pdfParser.getMergedTextBlocksIfNeeded();
let count1 = 0;
//get text on a particular page
for (let page of pdf.formImage.Pages) {
count1 += page.Texts.length;
}
console.log(count1);
pdfParser.destroy();
});
It is possible but:
you would have to use the server anyway, there's no way you can get content of a file on user computer without transferring it to server and back
I don't thing anyone has written such library yet
So if you have some free time you can learn pdf format and write such a library yourself, or you can just use server side library of course.