I am trying to display JSON data on a list but my javascript code gets stuck in the middle of wlCommonInit() function.
function wlCommonInit(){
WL.Logger.debug("inside the wlcommoninit");
busyIndicator = new WL.BusyIndicator('AppBody');
var $list = $("ul#myList");
$list.append('<li> ' + "resrere" +
'</li>');
getData();
$list.append('<li> ' + "resrere" +
'</li>');
}
function getData() {
$.mobile.showPageLoadingMsg();
WL.App.overrideBackButton (function(){WL.App.close();});
var invocationData = {
adapter : 'StudentInfo',
procedure : 'getStudentInfos'
};
WL.Client.invokeProcedure(invocationData,{
onSuccess : loadFeedsSuccess,
onFailure : getDataFailure,
});
}
function loadFeedsSuccess(result){
if (!result || !result.invocationResult || !result.invocationResult.items || result.invocationResult.items.length == 0)
alert("Could not retrieve feeds");
feeds = result.invocationResult.items;
$("ul#myList").empty();
for (var i=0; i<feeds.length; i++){
var dataItem = feeds[i];
var listItem = $("<li>" + dataItem.question + "</li>");
$("ul#myList").append(listItem);
}
}
In this code, it sticks at getData(); it displays the first "resrere" but it does not display second "resrere". I can not find the problem, so I can not create any solution. Thanks for your help.
items should be resultSet.
Tested locally and verified to work after the below changes.
Change this:
if (!result || !result.invocationResult ||
!result.invocationResult.items || result.invocationResult.items.length
== 0)
To This:
if (!result || !result.invocationResult ||
!result.invocationResult.resultSet ||
result.invocationResult.resultSet.length == 0)
And this:
feeds = result.invocationResult.items;
To this:
feeds = result.invocationResult.resultSet;
Related
I am receiving a JavaScript error due to the ASP.net bundle functionality, the JS code works fine before it gets minified but not after.
The issue Is have a isSelected variable that contains a boolean. But when the code gets minified it shortens this variable name to i. But in the $.each method I create an inline function with two parameters one being i.
It seems the minification logic failed to realize that renaming the variable to i would conflict with the code in my inline function.
Is this a bug with the ASP.net bundling system? If not how am I recommended to solve this, I am concerned that although I could just rename the parameter name that this bug could crop up in other parts of my web application.
Original JS code
function ItemManagerUpdateUi(instance) {
var selectedValue = instance.ItemCombo.GetValue();
var selectedItem = ItemManagerGetSelectedItem(instance);
var isSelected = selectedValue != null;
var isNewSelected = selectedValue == -1;
var applyText = isNewSelected ? "Add" : "Apply";
var text = isNewSelected ? "" : instance.ItemCombo.GetText();
var highestSortOrder = 0;
$.each(instance.Items, function (i, e) {
if (e.SortOrder > highestSortOrder)
highestSortOrder = e.SortOrder;
});
var sortOrder = isNewSelected || !isSelected ? highestSortOrder + 1 : selectedItem.SortOrder;
$('#' + instance.EditPanelId).toggle(isSelected);
$('#' + instance.ApplyButtonId).val(applyText);
$('#' + instance.DeleteButtonId).toggle(!isNewSelected);
var labelWidth = $(instance.ItemCombo.GetMainElement()).closest('.rbox-clearfix').find('.labelStyle').width() + 15;
$('#' + instance.ButtonPanelId).css('margin-left', labelWidth + 'px');
instance.TextControl.SetText(text);
instance.SortOrderControl.SetNumber(sortOrder);
$.each(instance.CustomFields, function (i, e) {
var value = isNewSelected || !isSelected ? null : selectedItem[e.DataName];
var dxControl = eval(e.ControlName);
dxControl.SetValue(value);
dxControl.SetIsValid(true);
});
if (!isNewSelected)
$(instance.ItemCombo.GetMainElement()).find('.dms-combo-main-input').removeClass('newActivityItem');
instance.TextControl.SetIsValid(true);
}
JS code after minification
function ItemManagerUpdateUi(n) {
var u = n.ItemCombo.GetValue(),
f = ItemManagerGetSelectedItem(n),
i = u != null,
t = u == -1,
s = t ? "Add" : "Apply",
h = t ? "" : n.ItemCombo.GetText(),
r = 0,
e,
o;
$.each(n.Items, function(n, t) {
t.SortOrder > r && (r = t.SortOrder)
});
e = t || !i ? r + 1 : f.SortOrder;
$("#" + n.EditPanelId).toggle(i);
$("#" + n.ApplyButtonId).val(s);
$("#" + n.DeleteButtonId).toggle(!t);
o = $(n.ItemCombo.GetMainElement()).closest(".rbox-clearfix").find(".labelStyle").width() + 15;
$("#" + n.ButtonPanelId).css("margin-left", o + "px");
n.TextControl.SetText(h);
n.SortOrderControl.SetNumber(e);
console.log(i);
$.each(n.CustomFields, function(i, e) {
console.log(i);
var value = t || !i ? null : f[e.DataName]
, dxControl = eval(e.ControlName);
dxControl.SetValue(value);
dxControl.SetIsValid(!0)
});
t || $(n.ItemCombo.GetMainElement()).find(".dms-combo-main-input").removeClass("newActivityItem");
n.TextControl.SetIsValid(!0)
}
EDIT
After more research it seems removing the eval line of code fixes the issue. For the time being I changed my code to not use eval although this still seems like a bug in the bundle process, for some reason you have these strange behaviour when using eval
I am using a handlebars template to stick some info on my nav bar. The info is coming from a Rails controller via an AJAX call. The AJAX, given that it's asynchronous, is finishing after the template has received it's variables, thus a variable that should be set in the AJAX call never does so. Here's the code:
export default {
name: "data-menu-item",
initialize: function(container) {
$(document).ready(function() {
var source = $("#notification-menu-item").html();
var template = Handlebars.compile(source);
var user = Discourse.User.current();
var pro = false;
var logged_user = false;
var data_url = "";
$.ajax("/custom_group_names", {
type: 'GET'
}).done(function(res){
if(res.custom_group_names){
console.log(res.group_names);
for (var i=0; i < res.group_names.length; i++) {
// Agents, Brokers, ManagingBrokers, MortageBrokers, admins
if (res.group_names[i]["name"] === "Brokers" || res.group_names[i]["name"] === "ManagingBrokers" || res.group_names[i]["name"] === "MortageBrokers") {
console.log("groups were brokers, etc.");
pro = true;
data_url = "twobydev.com/brokerdashboard";
} else if (res.group_names[i]["name"] === "admins" || res.group_names[i]["name"] === "Agents") {
console.log("groups were admin or agents");
pro = true;
data_url = "twobydev.com/agentdashboard";
console.log(pro);
console.log(data_url);
}
}
}
});
if(user) {
logged_user = true;
if(user.total_unread_notifications > 0) {
new_notification_class = "new-notifications"
notification_count = "(" + user.total_unread_notifications + ")";
}
}
var html = template({pro: pro, logged_user: logged_user, data_url: data_url});
$('body').prepend(html);
});
}
}
logged_user gets set because it is outside of the ajax call, however, I need pro and data_url to be set as well. Any advice or help is much appreciated!
Move the template processing into the ajax callback
function processTemplate(){
var html = template({pro: pro, logged_user: logged_user, data_url: data_url});
$('body').prepend(html);
}
$.ajax({
.....
}).done(function(res){
/* existing processing code */
// now process template
processTemplate()
});
You just need to move this...
if(user) {
logged_user = true;
if(user.total_unread_notifications > 0) {
new_notification_class = "new-notifications"
notification_count = "(" + user.total_unread_notifications + ")";
}
}
var html = template({pro: pro, logged_user: logged_user, data_url: data_url});
$('body').prepend(html);
inside of your done function.
I'm trying to get the ContentTypeId of an item in sharepoint to get the full url of the item to get the binary of it and after send it to another plateform.
So here i put this code in element.xml to get the list ID and the document ids of the items i'm selecting, after this i send them to an ASPX page in a Sharepoint Dialog to define the destination of the items and after this in the postback, stream the binary and send it to the another platform. The problem is : To get the full url of my items i need ListId, ItemId and ContentTypeId.
Because i've found a code to stream the binary here :
How to Programatically Download files from sharepoint document library
And i need the full url of my items.
Any idea?
thanks
var iddocs ='';
var listId ='';
function geturl()
{
var context = SP.ClientContext.get_current();
this.web = context.get_web();
listId = SP.ListOperation.Selection.getSelectedList();
var list = this.web.get_lists().getById(listId);
var ok = false;
try
{
if ( SP.ListOperation.Selection.getSelectedItems(context) !== false)
{
var items = SP.ListOperation.Selection.getSelectedItems(context);
var url='listId:'+listId+ ' Number of selected items: ' + items.length ;
var i = 0;
if(items.length==0)
{
}else{
while( i != items.length )
{
url += ' Doc' + i + ': ' + items[i].id;
if(i>0){iddocs += '-'};
iddocs += items[i].id;
i++;
};
ok = true;
alert(url+' Id of clicked item:'+{ItemId});
};
};
}
catch(err)
{
};
return ok;
};
function OpenDialog(pidliste) {
var options = SP.UI.$create_DialogOptions();
options.width = 600;
options.height = 600;
options.title = 'Envoyer vers Nuxeo';
options.url ='/_Layouts/SPTest.CustomMenuItem/index.aspx?click={ItemId}';
if(pidliste){options.url += '&ids='+pidliste +'-'+ iddocs;};
options.dialogReturnValueCallback = Function.createDelegate(null, CloseCallback);
SP.UI.ModalDialog.showModalDialog(options);
}
function CloseCallback(result, target) {
if (result == SP.UI.DialogResult.OK) {
}
if (result == SP.UI.DialogResult.cancel) {
SP.UI.Notify.addNotification('Opération canceled', false, '', null);
}
}
if(geturl())
{
OpenDialog(listId);
}else{
alert('Please select an item');
};
I've found the solution. In fact, items can be reached via :
{SiteUrl}+{ItemUrl}
The download function is linked in my first Post. But it doesn't work for multiple items, with this method you can only reach the properties of the item you're selecting.
You have to note that if you want to access to a SP file, you have to set your request.credential via :
request.Credentials = System.Net.CredentialCache.DefaultCredentials;
which will take the current credential you're using.
Hope it helps.
Ok, feeling stupid here, but wondering what the problem is here exactly.
Although the function works as it should, I get this JS Error in Opera. Not sure about other browsers...
Uncaught exception: TypeError: Cannot
convert
'document.getElementById("shoutbox_area"
+ moduleId)' to object
oElement = document.getElementById("shoutbox_area"
+ moduleId).childNodes;
Here is the relevant code:
function appendShout(XMLDoc)
{
var shoutData = XMLDoc.getElementsByTagName("item");
var oElement = [];
if (shoutData.length > 0)
{
var moduleId = shoutData[0].getAttribute("moduleid");
if (shoutData[shoutData.length - 1].getAttribute("lastshout") != "undefined")
{
for (var i = 0; i < shoutData.length; i++)
if (shoutData[i].firstChild.nodeValue != 0)
document.getElementById("shoutbox_area" + moduleId).innerHTML += shoutData[i].firstChild.nodeValue;
oElement = document.getElementById("shoutbox_area" + moduleId).childNodes;
var i = oElement.length;
while (i--)
{
if (i % 2 == 0)
oElement[i].className = "windowbg2";
else
oElement[i].className = "windowbg";
}
oElement[oElement.length - 2].style.borderBottom = "1px black dashed";
}
}
}
Can someone please help me to understand why it is giving me an error here:
oElement = document.getElementById("shoutbox_area" + moduleId).childNodes;
Can I not assign an array to the childNodes?
EDIT:
This JS Error occurs when I try and delete a shout. The JS function for deleting a shout is this:
function removeShout(shout, moduleID)
{
var shoutContainer = shout.parentNode.parentNode;
var send_data = "id_shout=" + shout.id;
var url = smf_prepareScriptUrl(smf_scripturl) + "action=dream;sa=shoutbox;xml;" + "delete_shout;" + "canmod=" + canMod[moduleID] + ";" + sessVar + "=" + sessId;
sendXMLDocument(url, send_data);
var shoutID = 0;
while (shoutID !== null)
{
var shoutID = document.getElementById(shout.parentNode.id);
var moduleID = shoutID.parentNode.getAttribute("moduleid");
if (shoutID.parentNode.lastChild)
{
var url = smf_prepareScriptUrl(smf_scripturl) + "action=dream;sa=shoutbox;xml;get_shouts=" + (shoutID.parentNode.lastChild.id.replace("shout_", "") - 1) + ";membercolor=" + memberColor[moduleID] + ";maxcount=" + maxCount[moduleID] + ";shoutboxid=" + shoutboxID[moduleID] + ";textsize=" + textSize[moduleID] + ";parsebbc=" + parseBBC[moduleID] + ";moduleid=" + moduleID + ";maxcount=" + maxCount[moduleID] + ";canmod=" + canMod[moduleID] + ";" + sessVar + "=" + sessId;
getXMLDocument(url, appendShout);
}
element = shoutID.parentNode.childNodes;
var i = element.length;
while (i--)
{
if (i % 2 == 0)
element[i].className = "windowbg2";
else
element[i].className = "windowbg";
}
shoutID.parentNode.removeChild(shoutID);
}
}
Am using the following functions for the sending and getting the XMLHttpRequest as you may have noticed already in the removeShout function above:
// Load an XML document using XMLHttpRequest.
function getXMLDocument(sUrl, funcCallback)
{
if (!window.XMLHttpRequest)
return null;
var oMyDoc = new XMLHttpRequest();
var bAsync = typeof(funcCallback) != 'undefined';
var oCaller = this;
if (bAsync)
{
oMyDoc.onreadystatechange = function () {
if (oMyDoc.readyState != 4)
return;
if (oMyDoc.responseXML != null && oMyDoc.status == 200)
{
if (funcCallback.call)
{
funcCallback.call(oCaller, oMyDoc.responseXML);
}
// A primitive substitute for the call method to support IE 5.0.
else
{
oCaller.tmpMethod = funcCallback;
oCaller.tmpMethod(oMyDoc.responseXML);
delete oCaller.tmpMethod;
}
}
};
}
oMyDoc.open('GET', sUrl, bAsync);
oMyDoc.send(null);
return oMyDoc;
}
// Send a post form to the server using XMLHttpRequest.
function sendXMLDocument(sUrl, sContent, funcCallback)
{
if (!window.XMLHttpRequest)
return false;
var oSendDoc = new window.XMLHttpRequest();
var oCaller = this;
if (typeof(funcCallback) != 'undefined')
{
oSendDoc.onreadystatechange = function () {
if (oSendDoc.readyState != 4)
return;
if (oSendDoc.responseXML != null && oSendDoc.status == 200)
funcCallback.call(oCaller, oSendDoc.responseXML);
else
funcCallback.call(oCaller, false);
};
}
oSendDoc.open('POST', sUrl, true);
if ('setRequestHeader' in oSendDoc)
oSendDoc.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded');
oSendDoc.send(sContent);
return true;
}
Hopefully this is good enough, you can do a view source on it to see the actual HTML, but there are attributes that get added to the Shoutbox tags at runtime so as to be XHTML compliant, etc..
Please let me know if there is anything else you need?
Thanks :)
The code is breaking because shoutID is null in the second of these two lines, the second time through the loop:
var shoutID = document.getElementById(shout.parentNode.id);
var moduleID = shoutID.parentNode.getAttribute("moduleid");
The first of those lines is strange. Why not just use var shoutID = shout.parentNode;?
Also, the moduleId attribute seems to be nowhere around.
What are you trying to achieve with the while loop?
I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is an string array textContent.bidiTexts[]. You concatenate them to get the text of 1 page. Text blocks' coordinates are used to judge whether newline or space need to be inserted. (This may not be totally robust, but from my test it seems ok.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
function Pdf2TextClass(){
var self = this;
this.complete = 0;
/**
*
* #param data ArrayBuffer of the pdf file content
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done;
* 2) total number of pages in file.
* #param callbackAllDone The input parameter of callback function is
* the result of extracted text from pdf file.
*
*/
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
PDFJS.getDocument( data ).then( function(pdf) {
var div = document.getElementById('viewer');
var total = pdf.numPages;
callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
if( null != textContent.bidiTexts ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.bidiTexts.length; k++ ){
var block = textContent.bidiTexts[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere and you nee pdf.js and pdf.worker.js to get this working.
/**
* Extract text from PDFs with PDF.js
* Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
*/
this.pdfToText = function(data) {
PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
PDFJS.cMapPacked = true;
return PDFJS.getDocument(data).then(function(pdf) {
var pages = [];
for (var i = 0; i < pdf.numPages; i++) {
pages.push(i);
}
return Promise.all(pages.map(function(pageNumber) {
return pdf.getPage(pageNumber + 1).then(function(page) {
return page.getTextContent().then(function(textContent) {
return textContent.items.map(function(item) {
return item.str;
}).join(' ');
});
});
})).then(function(pages) {
return pages.join("\r\n");
});
});
}
usage:
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
function convert() {
var fr=new FileReader();
var pdff = new Pdf2TextClass();
fr.onload=function(){
pdff.pdfToText(fr.result, null, (text) => { document.getElementById('result').innerText += text; });
}
fr.readAsDataURL(document.getElementById('pdffile').files[0])
}
function Pdf2TextClass() {
var self = this;
this.complete = 0;
this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
console.assert(data instanceof ArrayBuffer || typeof data == 'string');
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function (pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++) {
pdf.getPage(i).then(function (page) {
var n = page.pageNumber;
page.getTextContent().then(function (textContent) {
//console.log(textContent.items[0]);0
if (null != textContent.items) {
var page_text = "";
var last_block = null;
for (var k = 0; k < textContent.items.length; k++) {
var block = textContent.items[k];
if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
if (block.x < last_block.x)
page_text += "\r\n";
else if (last_block.y != block.y && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total) {
window.setTimeout(function () {
var full_text = "";
var num_pages = Object.keys(layers).length;
for (var j = 1; j <= num_pages; j++)
full_text += layers[j];
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");
// listen for messages from the processor
window.addEventListener("message", function(event){
if (event.source != processor.contentWindow) return;
switch (event.data){
// "ready" = the processor is ready, so fetch the PDF file
case "ready":
var xhr = new XMLHttpRequest;
xhr.open('GET', input.getAttribute("src"), true);
xhr.responseType = "arraybuffer";
xhr.onload = function(event) {
processor.contentWindow.postMessage(this.response, "*");
};
xhr.send();
break;
// anything else = the processor has returned the text of the PDF
default:
output.textContent = event.data.replace(/\s+/g, " ");
break;
}
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
#gm2008's answer was a great starting point (please read it and its comments for more info), but needed some updates (08/19) and had some unused code. I also like examples that are more full. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');
let pathToPDF = 'path/to/myPDFfileToText.pdf';
let toText = Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
let onFinish = function(fullText) { console.log(fullText) };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
function Pdf2TextObj() {
let self = this;
this.complete = 0;
/**
*
* #param path Path to the pdf file.
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done.
* 2) total number of pages in file.
* 3) the `page` object itself or null.
* #param callbackAllDone Called after all text has been collected. Input parameters:
* 1) full text of parsed pdf.
*
*/
this.pdfToText = function(path, callbackPageDone, callbackAllDone) {
// console.assert(typeof path == 'string');
PDFJS.getDocument(path).promise.then(function(pdf) {
let total = pdf.numPages;
callbackPageDone(0, total, null);
let pages = {};
// For some (pdf?) reason these don't all come in consecutive
// order. That's why they're stored as an object and then
// processed one final time at the end.
for (let pagei = 1; pagei <= total; pagei++) {
pdf.getPage(pagei).then(function(page) {
let pageNumber = page.pageNumber;
page.getTextContent().then(function(textContent) {
if (null != textContent.items) {
let page_text = "";
let last_item = null;
for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
let item = textContent.items[itemsi];
// I think to add whitespace properly would be more complex and
// would require two loops.
if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
let itemX = item.transform[5]
let lastItemX = last_item.transform[5]
let itemY = item.transform[4]
let lastItemY = last_item.transform[4]
if (itemX < lastItemX)
page_text += "\r\n";
else if (itemY != lastItemY && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
} // ends if may need to add whitespace
page_text += item.str;
last_item = item;
} // ends for every item of text
textContent != null && console.log("page " + pageNumber + " finished.") // " content: \n" + page_text);
pages[pageNumber] = page_text + "\n\n";
} // ends if has items
++self.complete;
callbackPageDone(self.complete, total, page);
// If all done, put pages in order and combine all
// text, then pass that to the callback
if (self.complete == total) {
// Using `setTimeout()` isn't a stable way of making sure
// the process has finished. Watch out for missed pages.
// A future version might do this with promises.
setTimeout(function() {
let full_text = "";
let num_pages = Object.keys(pages).length;
for (let pageNum = 1; pageNum <= num_pages; pageNum++)
full_text += pages[pageNum];
callbackAllDone(full_text);
}, 1000);
}
}); // ends page.getTextContent().then
}); // ends page.then
} // ends for every page
});
}; // Ends pdfToText()
return self;
}; // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
function Pdf2TextClass(){
var self = this;
this.complete = 0;
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function(pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
//console.log(textContent.items[0]);0
if( null != textContent.items ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.items.length; k++ ){
var block = textContent.items[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
console.log(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
* Created by velten on 25.04.16.
*/
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
var pdfParser = require('pdf2json');
let pdfPipe = request({url: pdfUrl, encoding:null}).pipe(pdfParser);
pdfPipe.on("pdfParser_dataError", err => console.error(err) );
pdfPipe.on("pdfParser_dataReady", pdf => {
//optionally:
//let pdf = pdfParser.getMergedTextBlocksIfNeeded();
let count1 = 0;
//get text on a particular page
for (let page of pdf.formImage.Pages) {
count1 += page.Texts.length;
}
console.log(count1);
pdfParser.destroy();
});
It is possible but:
you would have to use the server anyway, there's no way you can get content of a file on user computer without transferring it to server and back
I don't thing anyone has written such library yet
So if you have some free time you can learn pdf format and write such a library yourself, or you can just use server side library of course.