I am trying to fix all of the hyperlinks in my InDesign files and replace the https with http. Right now, in order for it to work, I run this script:
// Walk every hyperlink URL destination of the active document and put
// "http://" in front of each URL that does not already contain "http://".
// Note that a URL starting with "https://" does not contain "http://",
// so it is prefixed as well.
var i, currentUrl;
hls = app.activeDocument.hyperlinkURLDestinations;
i = 0;
while (i < hls.length) {
    currentUrl = hls[i].destinationURL;
    if (currentUrl.match('http://') === null) {
        hls[i].destinationURL = 'http://' + currentUrl;
    }
    i += 1;
}
followed by this script, choosing https to be replaced by http...
Menu for find/replace
main();

/**
 * Shows a two-field dialog (find pattern, replacement) and applies the
 * replacement to the URL of every hyperlink URL destination in the first
 * open document. The find field is compiled as a global regular expression.
 */
function main(){
    var dialog = app.dialogs.add({name:"Replace Hyperlink URL Values"});
    var labelColumn = dialog.dialogColumns.add();
    var fieldColumn = dialog.dialogColumns.add();
    labelColumn.staticTexts.add({staticLabel:"Find (GREP):"});
    labelColumn.staticTexts.add({staticLabel:"Replace:"});
    var find = fieldColumn.textEditboxes.add({minWidth:100});
    var change = fieldColumn.textEditboxes.add({minWidth:100});

    // A falsy result from show() is treated as "cancelled".
    var accepted = dialog.show();
    if (!accepted) {
        dialog.destroy();
        return;
    }

    // Read the field values before the dialog is released.
    var pattern = RegExp(find.editContents, "g");
    var replacement = change.editContents;
    dialog.destroy();

    var urlDestinations = app.documents[0].hyperlinkURLDestinations.everyItem().getElements();
    var index = 0;
    var destination;
    while (index < urlDestinations.length) {
        destination = urlDestinations[index];
        destination.destinationURL = destination.destinationURL.replace(pattern, replacement);
        index += 1;
    }
}
Once both of these have been run, I notice that the "http://" has been duplicated on the hyperlinks that already contain "http://".
So I run the second script again replacing (http:// + http://) with "http://" which solves the problem.
My question, is how to make it into a single script that would work the first time.
**Note:**The second script presents this error if the first is not run, which baffles me as well.
Any and all help would be appreciated.
On the first script you get http:// duplicated because you are adding it to its own reference, i.e. "http://"+"http://…". You have to replace the string, not add to it:
// For every hyperlink URL destination of the active document whose URL does
// not contain "http://", rewrite a leading "https" to "http".
var i, currentUrl;
hls = app.activeDocument.hyperlinkURLDestinations;
i = 0;
while (i < hls.length) {
    currentUrl = hls[i].destinationURL;
    if (currentUrl.match('http://') === null) {
        hls[i].destinationURL = currentUrl.replace(/^https/,"http");
    }
    i += 1;
}
Another approach:
/**
 * Regular-expression find/replace on the URLs of this hyperlink's destinations.
 *
 * Does nothing when there are no destinations, when findString or repString
 * is empty or not a string, or when specifiers is given but is not a string.
 *
 * NOTE(review): reads this.destination.length, so it expects the destination
 * property to be array-like, as when called through everyItem() -- confirm.
 *
 * @param {String} findString  source text of the regular expression to search for
 * @param {String} repString   replacement text
 * @param {String} specifiers  optional RegExp flags; "gi" is used when omitted
 */
Hyperlink.prototype.grep = function(findString,repString, specifiers){
    var r, dests = this.destination, url, dest, n = dests.length;
    if ( !n
    || !findString
    || !repString
    || typeof (findString) != "string"
    || typeof (repString) != "string"
    || ( specifiers && typeof ( specifiers )!="string" )
    ) return;
    r = new RegExp ( findString, specifiers? specifiers:"gi" );
    // Walk the destinations from last to first; only URL destinations carry a destinationURL.
    while (n-- ) {
        dest = dests[n];
        if ( dest instanceof HyperlinkURLDestination ) {
            url = dest.destinationURL;
            dest.destinationURL = url.replace ( r, repString );
        }
    }
};
main();

/**
 * Asks for a find pattern and a replacement (pre-filled with "^https" and
 * "http") and applies them to every hyperlink of the first open document
 * through Hyperlink.prototype.grep.
 */
function main(){
    var d = app.dialogs.add({name:"Replace Hyperlink URL Values"});
    var col1 = d.dialogColumns.add();
    var col2 = d.dialogColumns.add();
    col1.staticTexts.add({staticLabel:"Find (GREP):"});
    col1.staticTexts.add({staticLabel:"Replace:"});
    var find = col2.textEditboxes.add({minWidth:100, editContents:"^https"});
    var change = col2.textEditboxes.add({minWidth:100, editContents:"http"});
    var result = d.show();
    if(!result){
        d.destroy();
        return;
    }
    // Copy the field values and release the dialog before touching the
    // document, so the dialog is not left behind if the replacement throws.
    var findPattern = find.editContents;
    var replacement = change.editContents;
    d.destroy();
    app.documents[0].hyperlinks.everyItem().grep(findPattern, replacement, "g");
}
Bass
I ran through all the configurations and, with the exception of an empty URL destination that did indeed throw an error, I can't reproduce what you are facing.
Maybe give this new snippet a try ?
If still failing, any chance you share the file ? Go at ozalto.com on the contact page if you prefer.
/**
 * Regular-expression find/replace on the URLs of this hyperlink's destinations.
 * Destinations whose URL is empty are skipped.
 *
 * Does nothing when there are no destinations, when findString or repString
 * is empty or not a string, or when specifiers is given but is not a string.
 *
 * NOTE(review): reads this.destination.length, so it expects the destination
 * property to be array-like, as when called through everyItem() -- confirm.
 *
 * @param {String} findString  source text of the regular expression to search for
 * @param {String} repString   replacement text
 * @param {String} specifiers  optional RegExp flags; "gi" is used when omitted
 */
Hyperlink.prototype.grep = function(findString,repString, specifiers){
    var r, dests = this.destination, url, dest, n = dests.length;
    if ( !n
    || !findString
    || !repString
    || typeof (findString) != "string"
    || typeof (repString) != "string"
    || ( specifiers && typeof ( specifiers )!="string" )
    ) return;
    r = new RegExp ( findString, specifiers? specifiers:"gi" );
    // Walk the destinations from last to first; only URL destinations carry a destinationURL.
    while (n-- ) {
        dest = dests[n];
        if ( dest instanceof HyperlinkURLDestination ) {
            url = dest.destinationURL;
            // An explicit "if" replaces `url!="" && dest.destinationURL = ...`,
            // which is not a valid assignment target in standard JavaScript.
            if ( url != "" ) {
                dest.destinationURL = url.replace ( r, repString );
            }
        }
    }
};
main();

/**
 * Asks for a find pattern and a replacement (pre-filled with "^https" and
 * "http") and applies them to every hyperlink of the first open document
 * through Hyperlink.prototype.grep.
 */
function main(){
    var d = app.dialogs.add({name:"Replace Hyperlink URL Values"});
    var col1 = d.dialogColumns.add();
    var col2 = d.dialogColumns.add();
    col1.staticTexts.add({staticLabel:"Find (GREP):"});
    col1.staticTexts.add({staticLabel:"Replace:"});
    var find = col2.textEditboxes.add({minWidth:100, editContents:"^https"});
    var change = col2.textEditboxes.add({minWidth:100, editContents:"http"});
    var result = d.show();
    if(!result){
        d.destroy();
        return;
    }
    // Copy the field values and release the dialog before touching the
    // document, so the dialog is not left behind if the replacement throws.
    var findPattern = find.editContents;
    var replacement = change.editContents;
    d.destroy();
    app.documents[0].hyperlinks.everyItem().grep(findPattern, replacement, "g");
}
Related
Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 6 months ago.
Improve this question
I want to extract the content from an xlsx gmail attachment via google apps script. And then put the information into a Google Sheet. It's working fine for CSV files, but I don't get the content of a xlsx file.
Unlike csv files, xlsx file data cannot be directly inserted into a spreadsheet
What you can do instead:
Save the attachment on your disc in its original mimeType
Convert it to a Google Sheets document with e.g. Drive.Files.copy
Delete the excel file from your disc
Sample:
/**
 * Takes the first attachment of the first message in the first inbox thread,
 * stores it on Drive under its own name and content type, asks Drive to copy
 * it as a Google Sheets file, then trashes the stored original.
 */
function GmailToDrive() {
  const firstMessage = GmailApp.getInboxThreads()[0].getMessages()[0];
  const attachment = firstMessage.getAttachments()[0];

  // Keep the attachment's own content type and file name for the Drive copy.
  const excelBlob = attachment.getAs(attachment.getContentType());
  excelBlob.setName(attachment.getName());
  const excelFile = DriveApp.createFile(excelBlob);

  const conversion = {mimeType: MimeType.GOOGLE_SHEETS};
  Drive.Files.copy(conversion, excelFile.getId());
  excelFile.setTrashed(true);
}
Note that Drive is an advanced service that needs to be enabled beforehand.
You can extract data directly from MS Excel files stored in Google Drive or in a Gmail attachment without any upload or conversion to Google Spreadsheet.
Since xlsx workbooks are zipped XML files you can unzip the xlsx blob, process the XML files and extract data needed like this.
/**
 * Parses an MS Excel (.xlsx) file and returns its cell values as plain arrays.
 *
 * The blob is unzipped and the XML parts are scanned with plain string
 * splitting (no XML parser): workbook relationships, the sheet list, the
 * shared-strings table and finally each requested worksheet.
 *
 * @param {BlobSource} blob the blob from MS Excel file
 * @param {String[]} requiredSheets the array of required sheet names (if omitted returns all)
 * @return {Object} Object of sheet names and values (2D arrays), or
 *   {"Error": "Requested worksheets not found"} when no requested sheet exists
 */
function parseMSExcelBlob(blob, requiredSheets){
  var col_cache = {};
  // XML entities and the characters they stand for. "&amp;" is decoded last so
  // that text such as "&amp;quot;" ends up as "&quot;" and is not decoded twice.
  var forbidden_chars = [
    ["&lt;", "<"],
    ["&gt;", ">"],
    ["&apos;", "'"],
    ["&quot;", '"'],
    ["&amp;", "&"]
  ];

  blob.setContentType("application/zip");
  var parts = Utilities.unzip(blob);

  // Index the archive entries by name so each XML part is looked up directly.
  var parts_by_name = {};
  for( var part of parts ){
    parts_by_name[part.getName()] = part;
  }

  // Relationship id -> archive path of the target part.
  var relationships = {};
  var rels_txt = readPart("xl/_rels/workbook.xml.rels");
  if( rels_txt !== null ){
    breakUpString(rels_txt, '<Relationship ', '/>').forEach(function(rel){
      var rId = breakUpString(rel, 'Id="', '"')[0];
      var target = breakUpString(rel, 'Target="', '"')[0];
      if( !rId || !target ) return;
      // A target starting with "/" is already relative to the archive root.
      relationships[rId] = target.charAt(0) === "/" ? target.slice(1) : "xl/" + target;
    });
  }

  // Worksheet path -> sheet name, in workbook order.
  var worksheets = {};
  var workbook_txt = readPart("xl/workbook.xml");
  if( workbook_txt !== null ){
    breakUpString(workbook_txt, '<sheet ', '/>').forEach(function(sheet){
      var sh_name = decodeForbiddenChars(breakUpString(sheet, 'name="', '"')[0]);
      var path = relationships[breakUpString(sheet, 'r:id="', '"')[0]];
      // Skip sheets without a resolvable relationship instead of throwing.
      if( path && path.includes("worksheets") ){
        worksheets[path] = sh_name;
      }
    });
  }

  var wanted = Array.isArray(requiredSheets) ? requiredSheets : [];
  var worksheets_needed = Object.keys(worksheets).filter(function(path){
    return !wanted.length || wanted.includes(worksheets[path]);
  });
  if( !worksheets_needed.length ) return {"Error": "Requested worksheets not found"};

  // One entry per <si> item, so a rich-text item (several <t> runs) still
  // occupies exactly one index and later indexes stay aligned.
  var sharedStrings = [];
  var strings_txt = readPart("xl/sharedStrings.xml");
  if( strings_txt !== null ){
    sharedStrings = breakUpString(strings_txt, '<si>', '</si>').map(function(si){
      return decodeForbiddenChars(textRuns(si));
    });
  }

  var result = {};
  worksheets_needed.forEach(function(path){
    var txt = readPart(path);
    if( txt === null ) return;
    var tbl = [[]];
    breakUpString(txt, '<c ', '</c>').forEach(function(cell){
      var ref = breakUpString(cell, 'r="', '"')[0];
      if( !ref ) return;
      // "B12" -> zero-based row 11, zero-based column 1.
      var row = ref.replace(/[A-Z]/g, "") - 1;
      var col = colNum(ref.replace(/[0-9]/g, "")) - 1;
      if( !tbl[row] ) tbl[row] = [];
      tbl[row][col] = cellValue(cell);
    });
    result[worksheets[path]] = squareTbl(tbl);
  });
  return result;

  // Returns the text of the named archive entry, or null when it is absent.
  function readPart(name){
    return Object.prototype.hasOwnProperty.call(parts_by_name, name)
      ? parts_by_name[name].getDataAsString()
      : null;
  }

  // Concatenates every <t>...</t> run in the fragment, ignoring phonetic (<rPh>) runs.
  function textRuns(xml){
    var cleaned = xml
      .replace(/ xml:space="preserve"/g, "")
      .replace(/<rPh[\s\S]*?<\/rPh>/g, "");
    return breakUpString(cleaned, '<t>', '</t>').join("");
  }

  // Converts one cell fragment into its value according to the t="..." attribute.
  function cellValue(cell){
    var type = breakUpString(cell, 't="', '"')[0];
    if( type === "inlineStr" ){
      return decodeForbiddenChars(textRuns(cell));
    }
    var v = breakUpString(cell, '<v>', '</v>')[0];
    if( type === "s" ) return sharedStrings[v];    // index into the shared strings
    if( type === "str" ) return decodeForbiddenChars(v); // string result of a formula
    return Number(v);                               // NaN for empty cells, blanked by squareTbl
  }

  // Replaces the XML entities listed in forbidden_chars with their characters.
  function decodeForbiddenChars(txt){
    if( typeof txt !== "string" ) return txt;
    forbidden_chars.forEach(function(pair){
      txt = txt.split(pair[0]).join(pair[1]);
    });
    return txt;
  }

  // Returns every substring found between start_patern and the next end_patern.
  function breakUpString(str, start_patern, end_patern){
    var arr = [], raw = str.split(start_patern), i = 1, len = raw.length;
    while( i < len ){ arr[i - 1] = raw[i].split(end_patern, 1)[0]; i++; }
    return arr;
  }

  // Converts column letters to a 1-based column number ("A" -> 1, "AA" -> 27).
  function colNum(letters){
    if( col_cache[letters] ) return col_cache[letters];
    var alph = "ABCDEFGHIJKLMNOPQRSTUVWXYZ", i, j, num = 0;
    for( i = 0, j = letters.length - 1; i < letters.length; i++, j-- ){
      num += Math.pow(alph.length, j) * (alph.indexOf(letters[i]) + 1);
    }
    col_cache[letters] = num;
    return num;
  }

  // Pads the sparse table to a rectangle, using "" for missing or falsy (non-zero) cells.
  function squareTbl(arr){
    var tbl = [];
    var x_max = 0;
    var y_max = arr.length;
    for( var y = 0; y < y_max; y++ ){
      arr[y] = arr[y] || [];
      if( arr[y].length > x_max ){ x_max = arr[y].length; }
    }
    for( var y2 = 0; y2 < y_max; y2++ ){
      var row = [];
      for( var x = 0; x < x_max; x++ ){
        row.push(arr[y2][x] || arr[y2][x] === 0 ? arr[y2][x] : "");
      }
      tbl.push(row);
    }
    return tbl.length ? tbl : [[]];
  }
}
Using the function parseMSExcelBlob(blob, requiredSheets) you can put the data in a gsheet.
/**
 * Reads the first attachment of the first message in the first inbox thread,
 * parses its "Funny corgi names" worksheet with parseMSExcelBlob and writes
 * the resulting table into the "Corgi names" sheet of the active spreadsheet.
 */
function getDataFromGmail(){
  const firstMessage = GmailApp.getInboxThreads()[0].getMessages()[0];
  const blob = firstMessage.getAttachments()[0].copyBlob();

  // Without the second argument every worksheet would be parsed.
  const data = parseMSExcelBlob(blob, ["Funny corgi names"]);
  // 2D array of cell values for the requested worksheet.
  const tbl = data["Funny corgi names"];

  // Replace the current contents of the target sheet with the parsed table.
  const sh = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Corgi names");
  sh.clearContents();
  const target = sh.getRange(1, 1, tbl.length, tbl[0].length);
  target.setValues(tbl);
}
Also find it in this GitHub repo.
The "Options" should all create a Data Validation dropdown based off of the "Category" input. ManualAutomatic and Option A work fine and return correct dropdowns. Options B thru E return a Data Validation dropdown that contains only "Undefined"
I am new to google scripting. I started with Learn Google Spreadsheet's tutorial and built from there. I apologize in advance if I am asking a question that is answered elsewhere.
// Names of the sheet being edited and of the sheet holding the lookup data.
var mainWsName = "Bid Sheet";
var nameData = "Data";
// Column numbers on the main sheet; they are passed as the column argument of
// ws.getRange(row, column) and compared with getColumn() in onEdit.
var Category = 1;
var ManualAutomatic = 2;
var OptionA = 3;
var OptionB = 4;
var OptionC = 5;
var OptionD = 6;
var OptionE = 7;
var ws = SpreadsheetApp.getActiveSpreadsheet().getSheetByName(mainWsName);
var wsData = SpreadsheetApp.getActiveSpreadsheet().getSheetByName(nameData);
// Lookup table read from the data sheet: starts at row 2, column 1, spans
// every remaining row and 3 columns (the last argument is the column count).
var manAutoOption = wsData.getRange(2, 1,wsData.getLastRow()-1,3).getValues();
/**
 * Edit handler: reacts only to edits on the main sheet below row 3.
 * An edit in the Category column rebuilds the dependent validations for that
 * row; an edit in the ManualAutomatic column calls NothingScriptRemove.
 */
function onEdit(e){
  var editedCell = e.range;
  var val = editedCell.getValue();
  var r = editedCell.getRow();
  var c = editedCell.getColumn();
  var wsName = editedCell.getSheet().getName();

  // Ignore other sheets and the first three rows.
  if (wsName !== mainWsName || !(r > 3)) {
    return;
  }

  switch (c) {
    case Category:
      OptionsValidation (val,r);
      break;
    case ManualAutomatic:
      //NothingScriptRemove is useless find a way to remove without breaking
      NothingScriptRemove (val,r);
      break;
  }
}//end onEdit
// Rebuilds the dependent cells of row r on the main sheet after the Category
// value changed. val is the new Category value; r is the edited row number.
// NOTE(review): this listing ends without the closing braces of the else
// branch and of the function.
function OptionsValidation (val,r){
if(val === ""){
// Category was cleared: wipe both the values and the validation rules of
// every dependent column.
ws.getRange(r,ManualAutomatic).clearContent();
ws.getRange(r,ManualAutomatic).clearDataValidations();
ws.getRange(r,OptionA).clearContent();
ws.getRange(r,OptionA).clearDataValidations();
ws.getRange(r,OptionB).clearContent();
ws.getRange(r,OptionB).clearDataValidations();
ws.getRange(r,OptionC).clearContent();
ws.getRange(r,OptionC).clearDataValidations();
ws.getRange(r,OptionD).clearContent();
ws.getRange(r,OptionD).clearDataValidations();
ws.getRange(r,OptionE).clearContent();
ws.getRange(r,OptionE).clearDataValidations();
} else {
// Category has a value: clear the old values, then attach a new validation
// list to each dependent column.
ws.getRange(r,ManualAutomatic).clearContent();
ws.getRange(r,OptionA).clearContent();
ws.getRange(r,OptionB).clearContent();
ws.getRange(r,OptionC).clearContent();
ws.getRange(r,OptionD).clearContent();
ws.getRange(r,OptionE).clearContent();
// Lookup rows whose first column equals the chosen Category.
var filterOptions = manAutoOption.filter(function(o){ return o[0] === val });
var listToApply = filterOptions.map(function (o) { return o[1] });
var cell = ws.getRange(r,ManualAutomatic);
applyValidationtoCell(listToApply,cell);
// Same filter again, this time keyed on the value re-read from the sheet.
var firstLevelColValue = ws.getRange(r, Category).getValue();
var filterOptions = manAutoOption.filter(function(o){ return o[0] === firstLevelColValue});
var listToApplyA = filterOptions.map(function (o) { return o[2] });
var cell = ws.getRange(r,OptionA);
applyValidationtoCell(listToApplyA,cell);
// NOTE(review): manAutoOption is read from a 3-column range, so o[3]..o[6]
// below are presumably undefined for every row -- verify the range width.
var listToApplyB = filterOptions.map(function (o) { return o[3] });
var cell = ws.getRange(r,OptionB);
applyValidationtoCell(listToApplyB,cell);
var listToApplyC = filterOptions.map(function (o) { return o[4] });
var cell = ws.getRange(r,OptionC);
applyValidationtoCell(listToApplyC,cell);
var listToApplyD = filterOptions.map(function (o) { return o[5] });
var cell = ws.getRange(r,OptionD);
applyValidationtoCell(listToApplyD,cell);
var listToApplyE = filterOptions.map(function (o) { return o[6] });
var cell = ws.getRange(r,OptionE);
applyValidationtoCell(listToApplyE,cell);
So I found the answer. A good night's sleep and a clear head make all the difference.
For anyone who stumbles on this looking for answers to similar issues...
This variable isn't pulling the correct data. Or more specifically it isn't pulling the right number of columns.
var manAutoOption = wsData.getRange(2, 1,wsData.getLastRow()-1,3).getValues();
"wsData.getLastRow()-1,3)" The last number is the number of columns that get pulled. Change that number to the number of columns you need to reference and everything else works great.
I'm not very familiar with js and now I need to do something very important for me, but I really don't know how to do it.
I'd like to include google translation api to my site, but I need to change some code in their js files. I have the element.js file on local host:
/**
 * Loader for the Google Translate sectional element: publishes the endpoint
 * constants under window.google.translate._const, then injects the element's
 * stylesheet and main script into the page. Does nothing more when
 * google.translate.Element already exists.
 */
(function () {
    var d = window,
        e = document,
        f = ".",
        g = "UTF-8",
        h = "complete",
        k = "head",
        l = "link",
        m = "script",
        n = "stylesheet",
        p = "text/css",
        q = "text/javascript";
    // The result is discarded; kept as in the original listing.
    Math.random();

    // Appends node b to <head>, creating a <head> next to <body> when the document has none.
    function r(b) {
        var a = e.getElementsByTagName(k)[0];
        a || (a = e.body.parentNode.appendChild(e.createElement(k)));
        a.appendChild(b)
    }

    // Injects a <script> element loading URL b.
    function _loadJs(b) {
        var a = e.createElement(m);
        a.type = q;
        a.charset = g;
        a.src = b;
        r(a)
    }

    // Injects a stylesheet <link> element loading URL b.
    function _loadCss(b) {
        var a = e.createElement(l);
        a.type = p;
        a.rel = n;
        a.charset = g;
        a.href = b;
        r(a)
    }

    // True when every segment of the dotted path b resolves to a truthy value starting from window.
    function _isNS(b) {
        b = b.split(f);
        for (var a = d, c = 0; c < b.length; ++c) if (!(a = a[b[c]])) return !1;
        return !0
    }

    // Returns the object at dotted path b, creating empty objects for missing segments.
    function _setupNS(b) {
        b = b.split(f);
        for (var a = d, c = 0; c < b.length; ++c) a = a[b[c]] || (a[b[c]] = {});
        return a
    }

    // Where document.readyState is not provided, set it to "complete" once the DOM has loaded.
    d.addEventListener && "undefined" == typeof e.readyState && d.addEventListener("DOMContentLoaded",
        function () {
            e.readyState = h
        }, !1);

    if (_isNS('google.translate.Element')) {
        return
    }

    var c = _setupNS('google.translate._const');
    c._cl = 'en';
    c._cuc = 'googleSectionalElementInit';
    c._cac = '';
    c._cam = '';
    // The host name has its own variable. The original re-declared `h` here,
    // which made the readyState handler above assign the host name instead of
    // "complete".
    var host = 'translate.googleapis.com';
    // b is the protocol-qualified base URL of the translate host.
    var b = (window.location.protocol == 'https:' ? 'https://' : 'http://') + host;
    c._pah = host;
    c._pbi = b + '/translate_static/img/te_bk.gif';
    c._pci = b + '/translate_static/img/te_ctrl3.gif';
    c._phf = host + '/translate_static/js/element/hrs.swf';
    c._pli = b + '/translate_static/img/loading.gif';
    c._plla = host + '/translate_a/l';
    c._pmi = b + '/translate_static/img/mini_google.png';
    c._ps = b + '/translate_static/css/sectionalelement.css';
    c._puh = 'translate.google.com';
    _loadCss(c._ps);
    _loadJs(b + '/translate_static/js/element/main_se.js');
})();
(If it's important, link to this file from web page is "element.js?cb=googleSectionalElementInit&ug=section&hl=en" )
And I need to get main_se.js (the last link in the file) on localhost too, but I don't know how to change link in element.js to this file to make it local. I need it, because I have to replace some html tags in this file to make api work properly for me.
I hope that somebody can advise me what to do.
If I understand correctly, element.js produces a <script> tag with src pointing to translate.googleapis.com and you want it to point to localhost.
The answer is quite easy in this case: simply remove the b+. Since b is http://translate.googleapis.com, you will then get the following script tag
<script src="/translate_static/js/element/main_se.js"></script>
All you have to do now is make sure you return the right file (your localhost copy) from this path.
Let me know if you need anything else.
var test = "http://www.example.org/search?q=whatever&another=moretext";
How can I extract another's value (moretext) in the query string above and make a variable out of it?
// URL whose query string carries the value we are after.
var test = "http://www.example.org/search?q=whatever&another=moretext";
// Splitting on the parameter name puts everything after "another=" into the
// second array element.
var another = test.split('another=');
another is an array with another[0] = 'http://www.example.org/search?q=whatever&' and another[1] = 'moretext'.
keep this function in your bag :
/**
 * Looks up a query-string parameter by name (case-insensitive).
 *
 * @param {string} qsName  parameter name to look for
 * @param {string} [url]   full URL or bare query string; when omitted,
 *                         window.location.search is used
 * @returns {?string} the raw (not URI-decoded) value of the first matching
 *                    parameter, undefined when the parameter is present
 *                    without "=", or null when it is absent
 */
function querySt(qsName, url)
{
    var query = (url == null) ? window.location.search.substring(1) : String(url);

    // A full URL was passed: keep only what follows "?" and drop any "#fragment".
    var queryStart = query.indexOf("?");
    if (queryStart !== -1) {
        query = query.substring(queryStart + 1);
        var hashStart = query.indexOf("#");
        if (hashStart !== -1) {
            query = query.substring(0, hashStart);
        }
    }

    var wanted = qsName.toLowerCase();
    var pairs = query.split("&");
    for (var i = 0; i < pairs.length; i++) {
        // Split on the first "=" only, so values that contain "=" stay intact.
        var separator = pairs[i].indexOf("=");
        var name = separator === -1 ? pairs[i] : pairs[i].substring(0, separator);
        if (name.toLowerCase() === wanted) {
            return separator === -1 ? undefined : pairs[i].substring(separator + 1);
        }
    }
    return null;
}
Usages
alert(querySt("another")+' '+querySt("q"));
I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is a string array textContent.bidiTexts[]. You concatenate them to get the text of 1 page. Text blocks' coordinates are used to judge whether newline or space need to be inserted. (This may not be totally robust, but from my test it seems ok.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
/**
 * Text extractor built on the PDFJS global.
 * `complete` holds the number of pages finished by the most recent
 * pdfToText() run.
 */
function Pdf2TextClass(){
    var self = this;
    this.complete = 0;

    /**
     * Extracts the text of every page of a PDF, in page order.
     *
     * @param data ArrayBuffer of the pdf file content (or a URL string)
     * @param callbackPageDone To inform the progress each time
     *        when a page is finished. The callback function's input parameters are:
     *        1) number of pages done;
     *        2) total number of pages in file.
     *        Ignored when it is not a function.
     * @param callbackAllDone The input parameter of callback function is
     *        the result of extracted text from pdf file.
     */
    this.pdfToText = function(data, callbackPageDone, callbackAllDone){
        console.assert( data instanceof ArrayBuffer || typeof data == 'string' );

        function notifyPage(done, total){
            if( typeof callbackPageDone == 'function' ) callbackPageDone( done, total );
        }
        function notifyAll(text){
            if( typeof callbackAllDone == 'function' ) callbackAllDone( text );
        }

        // Coordinate whose decrease starts a new line: the legacy block.x, or
        // transform[5] on blocks that carry a transform matrix.
        // NOTE(review): the transform mapping mirrors the transform-based
        // variant of this routine -- confirm against the pdf.js version in use.
        function lineCoord(block){
            return block.transform ? block.transform[5] : block.x;
        }
        // Coordinate whose change inserts a space: the legacy block.y, or transform[4].
        function gapCoord(block){
            return block.transform ? block.transform[4] : block.y;
        }

        // Joins the text blocks of one page, inserting line breaks and spaces
        // according to how consecutive blocks are positioned.
        function joinBlocks(blocks){
            var page_text = "";
            var last_block = null;
            for( var k = 0; k < blocks.length; k++ ){
                var block = blocks[k];
                if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
                    if( lineCoord(block) < lineCoord(last_block) )
                        page_text += "\r\n";
                    else if ( gapCoord(last_block) != gapCoord(block) && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
                        page_text += ' ';
                }
                page_text += block.str;
                last_block = block;
            }
            return page_text;
        }

        // Every run counts from zero so the instance can be reused.
        self.complete = 0;
        PDFJS.getDocument( data ).then( function(pdf) {
            var total = pdf.numPages;
            var finished = 0;   // per-run counter, mirrored into self.complete
            var layers = {};    // page number -> page text; pages may finish out of order
            notifyPage( 0, total );
            if( total === 0 ){
                notifyAll( "" );
                return;
            }
            for( var i = 1; i <= total; i++ ){
                pdf.getPage(i).then( function(page){
                    var n = page.pageNumber;
                    page.getTextContent().then( function(textContent){
                        // Older pdf.js exposes bidiTexts, newer versions expose items.
                        var blocks = textContent.bidiTexts || textContent.items;
                        if( blocks != null ){
                            layers[n] = joinBlocks( blocks ) + "\n\n";
                            console.log("page " + n + " finished.");
                        }
                        self.complete = ++finished;
                        notifyPage( finished, total );
                        if( finished == total ){
                            // Every page has reported, so the text can be assembled
                            // right away; pages without text are skipped.
                            var full_text = "";
                            for( var j = 1; j <= total; j++ ){
                                if( layers[j] !== undefined ) full_text += layers[j];
                            }
                            notifyAll( full_text );
                        }
                    }); // end of page.getTextContent().then
                }); // end of page.then
            } // of for
        });
    }; // end of pdfToText()
} // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere, and you need pdf.js and pdf.worker.js to get this working.
/**
 * Extract text from PDFs with PDF.js
 * Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
 *
 * Resolves with the text of all pages: the strings of one page are joined
 * with single spaces, the pages with "\r\n".
 */
this.pdfToText = function(data) {
    PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
    PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
    PDFJS.cMapPacked = true;

    // Resolves with the space-joined strings of one page (1-based page number).
    function readPageText(pdf, pageNumber) {
        return pdf.getPage(pageNumber).then(function(page) {
            return page.getTextContent();
        }).then(function(textContent) {
            var strings = [];
            textContent.items.forEach(function(item) {
                strings.push(item.str);
            });
            return strings.join(' ');
        });
    }

    return PDFJS.getDocument(data).then(function(pdf) {
        var pending = [];
        for (var pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
            pending.push(readPageText(pdf, pageNumber));
        }
        return Promise.all(pending);
    }).then(function(pageTexts) {
        return pageTexts.join("\r\n");
    });
}
usage:
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
/**
 * Click handler of the "Process" button: reads the file chosen in #pdffile
 * as a data URL, extracts its text with Pdf2TextClass and appends the text to
 * #result. Does nothing when no file has been chosen.
 */
function convert() {
    var selected = document.getElementById('pdffile').files[0];
    if (!selected) {
        // Nothing chosen yet; readAsDataURL would throw on undefined.
        return;
    }
    var fr = new FileReader();
    var pdff = new Pdf2TextClass();
    fr.onload = function () {
        pdff.pdfToText(fr.result, null, function (text) {
            document.getElementById('result').innerText += text;
        });
    };
    fr.readAsDataURL(selected);
}
/**
 * Text extractor built on the pdfjsLib global.
 * `complete` holds the number of pages finished by the most recent
 * pdfToText() run.
 */
function Pdf2TextClass() {
    var self = this;
    this.complete = 0;

    /**
     * Extracts the text of every page of a PDF, in page order.
     *
     * @param data ArrayBuffer of the pdf file content, or a URL / data-URL string
     * @param callbackPageDone optional; called with (pages done, total pages)
     *        before the first page and after each finished page
     * @param callbackAllDone called once with the extracted text of the whole file
     */
    this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
        console.assert(data instanceof ArrayBuffer || typeof data == 'string');

        function notifyPage(done, total) {
            if (typeof callbackPageDone == 'function') callbackPageDone(done, total);
        }
        function notifyAll(text) {
            if (typeof callbackAllDone == 'function') callbackAllDone(text);
        }

        // Text items carry a transform matrix instead of x/y properties.
        // transform[5] decreasing starts a new line; a change of transform[4]
        // inserts a space. Blocks that still expose x/y are read as before.
        // NOTE(review): mapping mirrors the transform-based variant of this
        // routine -- confirm against the pdf.js version in use.
        function lineCoord(block) {
            return block.transform ? block.transform[5] : block.x;
        }
        function gapCoord(block) {
            return block.transform ? block.transform[4] : block.y;
        }

        // Joins the text items of one page, inserting line breaks and spaces
        // according to how consecutive items are positioned.
        function joinBlocks(blocks) {
            var page_text = "";
            var last_block = null;
            for (var k = 0; k < blocks.length; k++) {
                var block = blocks[k];
                if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
                    if (lineCoord(block) < lineCoord(last_block))
                        page_text += "\r\n";
                    else if (gapCoord(last_block) != gapCoord(block) && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
                        page_text += ' ';
                }
                page_text += block.str;
                last_block = block;
            }
            return page_text;
        }

        // Every run counts from zero so the instance can be reused.
        self.complete = 0;
        var loadingTask = pdfjsLib.getDocument(data);
        loadingTask.promise.then(function (pdf) {
            var total = pdf.numPages;   // public counterpart of pdf._pdfInfo.numPages
            var finished = 0;           // per-run counter, mirrored into self.complete
            var layers = {};            // page number -> page text; pages may finish out of order
            notifyPage(0, total);
            if (total === 0) {
                notifyAll("");
                return;
            }
            for (var i = 1; i <= total; i++) {
                pdf.getPage(i).then(function (page) {
                    var n = page.pageNumber;
                    page.getTextContent().then(function (textContent) {
                        if (null != textContent.items) {
                            layers[n] = joinBlocks(textContent.items) + "\n\n";
                            console.log("page " + n + " finished.");
                        }
                        self.complete = ++finished;
                        notifyPage(finished, total);
                        if (finished == total) {
                            // Every page has reported, so the text can be assembled
                            // right away; pages without text are skipped.
                            var full_text = "";
                            for (var j = 1; j <= total; j++) {
                                if (layers[j] !== undefined) full_text += layers[j];
                            }
                            notifyAll(full_text);
                        }
                    }); // end of page.getTextContent().then
                }); // end of page.then
            } // of for
        });
    }; // end of pdfToText()
} // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
// Elements involved: the PDF source element, the processor frame, and the
// element that receives the extracted text.
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");

// listen for messages from the processor
window.addEventListener("message", function(event){
    // Ignore messages that do not come from the processor frame.
    if (event.source != processor.contentWindow) return;

    if (event.data === "ready") {
        // "ready" = the processor is ready, so fetch the PDF file
        var xhr = new XMLHttpRequest();
        xhr.open('GET', input.getAttribute("src"), true);
        xhr.responseType = "arraybuffer";
        xhr.onload = function() {
            // Hand the downloaded bytes over to the processor.
            processor.contentWindow.postMessage(xhr.response, "*");
        };
        xhr.send();
        return;
    }

    // anything else = the processor has returned the text of the PDF
    output.textContent = event.data.replace(/\s+/g, " ");
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
@gm2008's answer was a great starting point (please read it and its comments for more info), but needed some updates (08/19) and had some unused code. I also like examples that are more full. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
const PDFJS = require('pdfjs-dist');
const pathToPDF = 'path/to/myPDFfileToText.pdf';
const toText = Pdf2TextObj();
// Progress notifications between pages are ignored.
const onPageDone = () => {};
// Print the extracted text once every page has been processed.
const onFinish = (fullText) => { console.log(fullText); };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
/**
 * Factory for a PDF text extractor built on the PDFJS module in scope.
 * Works with or without `new`: it builds and returns its own object instead
 * of writing to `this`, which is not an instance when the function is called
 * plainly.
 *
 * @returns {{complete: number, pdfToText: Function}} extractor object;
 *          `complete` is the number of pages finished by the most recent run
 */
function Pdf2TextObj() {
    const self = { complete: 0 };

    // Joins the text items of one page, inserting line breaks and spaces
    // according to how consecutive items are positioned.
    function joinItems(items) {
        let page_text = "";
        let last_item = null;
        for (let itemsi = 0; itemsi < items.length; itemsi++) {
            const item = items[itemsi];
            // I think to add whitespace properly would be more complex and
            // would require two loops.
            if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
                // transform[4] / transform[5] are the x / y translation of the item.
                const itemY = item.transform[5];
                const lastItemY = last_item.transform[5];
                const itemX = item.transform[4];
                const lastItemX = last_item.transform[4];
                if (itemY < lastItemY)
                    page_text += "\r\n";
                else if (itemX != lastItemX && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
                    page_text += ' ';
            } // ends if may need to add whitespace
            page_text += item.str;
            last_item = item;
        } // ends for every item of text
        return page_text;
    }

    /**
     * Extracts the text of every page of a PDF, in page order.
     *
     * @param path Path to the pdf file.
     * @param callbackPageDone To inform the progress each time
     *        when a page is finished. The callback function's input parameters are:
     *        1) number of pages done.
     *        2) total number of pages in file.
     *        3) the `page` object itself or null.
     * @param callbackAllDone Called after all text has been collected. Input parameters:
     *        1) full text of parsed pdf.
     */
    self.pdfToText = function(path, callbackPageDone, callbackAllDone) {
        const notifyPage = function(done, total, page) {
            if (typeof callbackPageDone == 'function') callbackPageDone(done, total, page);
        };
        const notifyAll = function(text) {
            if (typeof callbackAllDone == 'function') callbackAllDone(text);
        };

        // Every run counts from zero so the object can be reused.
        self.complete = 0;
        PDFJS.getDocument(path).promise.then(function(pdf) {
            const total = pdf.numPages;
            let finished = 0; // per-run counter, mirrored into self.complete
            // Pages do not necessarily finish in consecutive order. That's why
            // they're stored by page number and combined at the end.
            const pages = {};
            notifyPage(0, total, null);
            if (total === 0) {
                notifyAll("");
                return;
            }
            for (let pagei = 1; pagei <= total; pagei++) {
                pdf.getPage(pagei).then(function(page) {
                    const pageNumber = page.pageNumber;
                    page.getTextContent().then(function(textContent) {
                        if (null != textContent.items) {
                            pages[pageNumber] = joinItems(textContent.items) + "\n\n";
                            console.log("page " + pageNumber + " finished.");
                        } // ends if has items
                        self.complete = ++finished;
                        notifyPage(finished, total, page);
                        // If all done, put pages in order and combine all
                        // text, then pass that to the callback. Every page has
                        // reported by now, so no timer is needed; pages
                        // without items are skipped.
                        if (finished == total) {
                            let full_text = "";
                            for (let pageNum = 1; pageNum <= total; pageNum++) {
                                if (pages[pageNum] !== undefined) full_text += pages[pageNum];
                            }
                            notifyAll(full_text);
                        }
                    }); // ends page.getTextContent().then
                }); // ends page.then
            } // ends for every page
        });
    }; // Ends pdfToText()

    return self;
} // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
/**
 * Text extractor built on the pdfjsLib global.
 * `complete` holds the number of pages finished by the most recent
 * pdfToText() run.
 */
function Pdf2TextClass(){
    var self = this;
    this.complete = 0;

    /**
     * Extracts the text of every page of a PDF, in page order, and logs it to
     * the console.
     *
     * @param data ArrayBuffer of the pdf file content, or a URL string
     * @param callbackPageDone optional; called with (pages done, total pages)
     *        before the first page and after each finished page
     * @param callbackAllDone optional; called once with the extracted text
     */
    this.pdfToText = function(data, callbackPageDone, callbackAllDone){
        console.assert( data instanceof ArrayBuffer || typeof data == 'string' );

        function notifyPage(done, total){
            if( typeof callbackPageDone == 'function' ) callbackPageDone( done, total );
        }
        // The text is always logged; the callback is invoked as well when given.
        function notifyAll(text){
            console.log(text);
            if( typeof callbackAllDone == 'function' ) callbackAllDone( text );
        }

        // Text items carry a transform matrix instead of x/y properties.
        // transform[5] decreasing starts a new line; a change of transform[4]
        // inserts a space. Blocks that still expose x/y are read as before.
        // NOTE(review): mapping mirrors the transform-based variant of this
        // routine -- confirm against the pdf.js version in use.
        function lineCoord(block){
            return block.transform ? block.transform[5] : block.x;
        }
        function gapCoord(block){
            return block.transform ? block.transform[4] : block.y;
        }

        // Joins the text items of one page, inserting line breaks and spaces
        // according to how consecutive items are positioned.
        function joinBlocks(blocks){
            var page_text = "";
            var last_block = null;
            for( var k = 0; k < blocks.length; k++ ){
                var block = blocks[k];
                if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
                    if( lineCoord(block) < lineCoord(last_block) )
                        page_text += "\r\n";
                    else if ( gapCoord(last_block) != gapCoord(block) && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
                        page_text += ' ';
                }
                page_text += block.str;
                last_block = block;
            }
            return page_text;
        }

        // Every run counts from zero so the instance can be reused.
        self.complete = 0;
        var loadingTask = pdfjsLib.getDocument(data);
        loadingTask.promise.then(function(pdf) {
            var total = pdf.numPages;   // public counterpart of pdf._pdfInfo.numPages
            var finished = 0;           // per-run counter, mirrored into self.complete
            var layers = {};            // page number -> page text; pages may finish out of order
            notifyPage( 0, total );
            if( total === 0 ){
                notifyAll( "" );
                return;
            }
            for( var i = 1; i <= total; i++ ){
                pdf.getPage(i).then( function(page){
                    var n = page.pageNumber;
                    page.getTextContent().then( function(textContent){
                        if( null != textContent.items ){
                            layers[n] = joinBlocks( textContent.items ) + "\n\n";
                            console.log("page " + n + " finished.");
                        }
                        self.complete = ++finished;
                        notifyPage( finished, total );
                        if( finished == total ){
                            // Every page has reported, so the text can be assembled
                            // right away; pages without text are skipped.
                            var full_text = "";
                            for( var j = 1; j <= total; j++ ){
                                if( layers[j] !== undefined ) full_text += layers[j];
                            }
                            notifyAll( full_text );
                        }
                    }); // end of page.getTextContent().then
                }); // end of page.then
            } // of for
        });
    }; // end of pdfToText()
} // end of class
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
 * Created by velten on 25.04.16.
 *
 * Streams a PDF from a URL into pdf2json and prints the total number of text
 * entries found across all pages.
 */
"use strict";
const request = require('request');
const pdfParser = require('pdf2json');

const pdfUrl = "http://example.com/example.pdf";

// encoding:null is passed through to request unchanged, as in the original call.
const pdfPipe = request({url: pdfUrl, encoding:null}).pipe(pdfParser);

pdfPipe.on("pdfParser_dataError", function (err) {
    console.error(err);
});

pdfPipe.on("pdfParser_dataReady", function (pdf) {
    //optionally:
    //let pdf = pdfParser.getMergedTextBlocksIfNeeded();

    // Sum the number of text entries over every page.
    let textCount = 0;
    for (const currentPage of pdf.formImage.Pages) {
        textCount = textCount + currentPage.Texts.length;
    }
    console.log(textCount);

    pdfParser.destroy();
});
It is possible but:
you would have to use the server anyway, there's no way you can get content of a file on user computer without transferring it to server and back
I don't think anyone has written such a library yet
So if you have some free time you can learn pdf format and write such a library yourself, or you can just use server side library of course.