This problem has been successfully resolved. I am editing my post to document my experience for posterity and future reference.
The Task
I have 117 PDF files (average size ~238 KB) uploaded to Google Drive. I want to convert them all to Google Docs and keep them in a different Drive folder.
The Problem
I attempted to convert the files using Drive.Files.insert. However, at most 5 files could be converted this way before the function terminated prematurely with this error:
Limit Exceeded: DriveApp. (line #, file "Code")
where the referenced line is the one on which the insert function is called. After the first call to this function, subsequent calls typically failed immediately without creating any additional Google Docs.
Approach
I tried three main approaches. One used Drive.Files.insert, as mentioned above. The other two used Drive.Files.copy and batches of HTTP requests. These last two methods were suggested by Tanaike, and I recommend reading his answer below for more information. The insert and copy functions are from the Google Drive REST v2 API, while the batched HTTP requests use Drive REST v3.
With Drive.Files.insert, I ran into the execution limits described in the Problem section above. One workaround was to run the function multiple times, and for that I needed a way to keep track of which files had already been converted. I had two options for this: a spreadsheet or a continuation token. That gave me 4 different methods to test: the two just mentioned, batching HTTP requests, and calling Drive.Files.copy.
Because Team Drives behave differently from regular drives, I felt it necessary to try each of those methods twice: once with the folder containing the PDFs being a regular, non-Team Drive folder, and once with that folder under a Team Drive. In total, this meant I had 8 different variants to test.
These are the exact functions I used. Each was run twice, with the only variation being the IDs of the source and destination folders (for the reasons stated above):
Method A: Using Drive.Files.insert and a spreadsheet
function toDocs() {
var sheet = SpreadsheetApp.openById(/* spreadsheet id*/).getSheets()[0];
var range = sheet.getRange("A2:E118");
var table = range.getValues();
var len = table.length;
var resources = {
title: null,
mimeType: MimeType.GOOGLE_DOCS,
parents: [{id: /* destination folder id */}]
};
var count = 0;
var files = DriveApp.getFolderById(/* source folder id */).getFiles();
while (files.hasNext()) {
var blob = files.next().getBlob();
var blobName = blob.getName();
for (var i=0; i<len; i++) {
if (table[i][0] === blobName.slice(5, 18)) {
if (table[i][4])
break;
resources.title = blobName;
Drive.Files.insert(resources, blob); // Limit Exceeded: DriveApp. (line 51, file "Code")
table[i][4] = "yes";
}
}
if (++count === 10) {
range.setValues(table);
Logger.log("time's up");
}
}
}
Method B: Using Drive.Files.insert and a continuation token
function toDocs() {
var folder = DriveApp.getFolderById(/* source folder id */);
var sprop = PropertiesService.getScriptProperties();
var contToken = sprop.getProperty("contToken");
var files = contToken ? DriveApp.continueFileIterator(contToken) : folder.getFiles();
var options = {
ocr: true
};
var resource = {
title: null,
mimeType: null,
parents: [{id: /* destination folder id */}]
};
while (files.hasNext()) {
var blob = files.next().getBlob();
resource.title = blob.getName();
resource.mimeType = blob.getContentType();
Drive.Files.insert(resource, blob, options); // Limit Exceeded: DriveApp. (line 113, file "Code")
sprop.setProperty("contToken", files.getContinuationToken());
}
}
Method C: Using Drive.Files.copy
Credit for this function goes to Tanaike -- see his answer below for more details.
function toDocs() {
var sourceFolderId = /* source folder id */;
var destinationFolderId = /* destination folder id */;
var files = DriveApp.getFolderById(sourceFolderId).getFiles();
while (files.hasNext()) {
var res = Drive.Files.copy({parents: [{id: destinationFolderId}]}, files.next().getId(), {convert: true, ocr: true});
Logger.log(res)
}
}
Method D: Sending batches of HTTP requests
Credit for this function goes to Tanaike -- see his answer below for more details.
function toDocs() {
var sourceFolderId = /* source folder id */;
var destinationFolderId = /* destination folder id */;
var files = DriveApp.getFolderById(sourceFolderId).getFiles();
var rBody = [];
while (files.hasNext()) {
rBody.push({
method: "POST",
endpoint: "https://www.googleapis.com/drive/v3/files/" + files.next().getId() + "/copy",
requestBody: {
mimeType: "application/vnd.google-apps.document",
parents: [destinationFolderId]
}
});
}
var cycle = 20; // Number of API calls at 1 batch request.
for (var i = 0; i < Math.ceil(rBody.length / cycle); i++) {
var offset = i * cycle;
var body = rBody.slice(offset, offset + cycle);
var boundary = "xxxxxxxxxx";
var contentId = 0;
var data = "--" + boundary + "\r\n";
body.forEach(function(e){
data += "Content-Type: application/http\r\n";
data += "Content-ID: " + ++contentId + "\r\n\r\n";
data += e.method + " " + e.endpoint + "\r\n";
data += e.requestBody ? "Content-Type: application/json; charset=utf-8\r\n\r\n" : "\r\n";
data += e.requestBody ? JSON.stringify(e.requestBody) + "\r\n" : "";
data += "--" + boundary + "\r\n";
});
var options = {
method: "post",
contentType: "multipart/mixed; boundary=" + boundary,
payload: Utilities.newBlob(data).getBytes(),
headers: {'Authorization': 'Bearer ' + ScriptApp.getOAuthToken()},
muteHttpExceptions: true,
};
var res = UrlFetchApp.fetch("https://www.googleapis.com/batch", options).getContentText();
// Logger.log(res); // If you use this, please remove the comment.
}
}
What Worked and What Didn't
None of the functions using Drive.Files.insert worked. Every function using insert for conversion failed with this error:
Limit Exceeded: DriveApp. (line #, file "Code")
(line number replaced with a generic symbol). No further details or description of the error could be found. A notable variation was the one in which I used a spreadsheet and the PDFs were in a Team Drive folder; while all other variations failed instantly without converting a single file, this one converted 5 before failing. However, when considering why this variation did better than the others, I think it was more of a fluke than anything related to the particular resources used (spreadsheet, Team Drive, etc.).
Using Drive.Files.copy and batch HTTP requests worked only when the source folder was a personal (non-Team Drive) folder. Attempting to use the copy function while reading from a Team Drive folder fails with this error:
File not found: 1RAGxe9a_-euRpWm3ePrbaGaX5brpmGXu (line #, file "Code")
(line number replaced with a generic symbol). The line being referenced is
var res = Drive.Files.copy({parents: [{id: destinationFolderId}]}, files.next().getId(), {convert: true, ocr: true});
Using batch HTTP requests while reading from a Team Drive folder does nothing: no Doc files are created and no errors are thrown. The function silently terminates without having accomplished anything.
Conclusion
If you wish to convert a large number of PDFs to Google Docs or text files, use Drive.Files.copy or send batches of HTTP requests, and make sure the PDFs are stored in a personal drive rather than a Team Drive.
Special thanks to @tehhowch for taking such an avid interest in my question and for repeatedly coming back to provide feedback, and to @Tanaike for providing code along with explanations that successfully solved my problem (with a caveat; read above for details).
You want to convert PDF files in a folder to Google Documents. The PDF files are in a Team Drive folder. You want to place the converted documents in a folder of your own Google Drive. If my understanding is correct, how about this approach?
PDF files can be converted to Google Documents using not only Drive.Files.insert() but also Drive.Files.copy(). The advantages of using Drive.Files.copy() are:
Although Drive.Files.insert() has a size limit of 5 MB, Drive.Files.copy() can handle files larger than 5 MB.
In my environment, it was faster than Drive.Files.insert().
For this method, I would like to propose the following 2 patterns.
Pattern 1 : Using Drive API v2
In this case, Drive API v2 of Advanced Google Services is used for converting files.
function myFunction() {
var sourceFolderId = "/* source folder id */";
var destinationFolderId = "/* dest folder id */";
var files = DriveApp.getFolderById(sourceFolderId).getFiles();
while (files.hasNext()) {
var res = Drive.Files.copy({parents: [{id: destinationFolderId}]}, files.next().getId(), {convert: true, ocr: true});
// Logger.log(res) // If you use this, please remove the comment.
}
}
Pattern 2 : Using Drive API v3
In this case, Drive API v3 is used for converting the files, via batch requests. A single batch request can include up to 100 API calls, which helps avoid API quota issues.
function myFunction() {
var sourceFolderId = "/* source folder id */";
var destinationFolderId = "/* dest folder id */";
var files = DriveApp.getFolderById(sourceFolderId).getFiles();
var rBody = [];
while (files.hasNext()) {
rBody.push({
method: "POST",
endpoint: "https://www.googleapis.com/drive/v3/files/" + files.next().getId() + "/copy",
requestBody: {
mimeType: "application/vnd.google-apps.document",
parents: [destinationFolderId]
}
});
}
var cycle = 100; // Number of API calls at 1 batch request.
for (var i = 0; i < Math.ceil(rBody.length / cycle); i++) {
var offset = i * cycle;
var body = rBody.slice(offset, offset + cycle);
var boundary = "xxxxxxxxxx";
var contentId = 0;
var data = "--" + boundary + "\r\n";
body.forEach(function(e){
data += "Content-Type: application/http\r\n";
data += "Content-ID: " + ++contentId + "\r\n\r\n";
data += e.method + " " + e.endpoint + "\r\n";
data += e.requestBody ? "Content-Type: application/json; charset=utf-8\r\n\r\n" : "\r\n";
data += e.requestBody ? JSON.stringify(e.requestBody) + "\r\n" : "";
data += "--" + boundary + "\r\n";
});
var options = {
method: "post",
contentType: "multipart/mixed; boundary=" + boundary,
payload: Utilities.newBlob(data).getBytes(),
headers: {'Authorization': 'Bearer ' + ScriptApp.getOAuthToken()},
muteHttpExceptions: true,
};
var res = UrlFetchApp.fetch("https://www.googleapis.com/batch", options).getContentText();
// Logger.log(res); // If you use this, please remove the comment.
}
}
Note:
If 100 API calls per batch request is too many for your situation, please modify var cycle = 100.
If Drive API v3 cannot be used for the Team Drive, please tell me. I can convert the script to Drive API v2.
If the Team Drive is the cause of the issue in your situation, can you try this after copying the PDF files to your own Google Drive? A rough sketch of that pre-copy step is shown below.
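For illustration only (this sketch is my addition, not part of the original answer), copying the PDFs from the Team Drive folder into a personal folder first could look like the following, assuming DriveApp can read the Team Drive folder with your account and that the folder IDs are placeholders:
function copyPdfsToMyDrive() {
  var teamDriveFolderId = "/* Team Drive folder id */";
  var personalFolderId = "/* personal folder id */";
  var destination = DriveApp.getFolderById(personalFolderId);
  var files = DriveApp.getFolderById(teamDriveFolderId).getFiles();
  while (files.hasNext()) {
    var file = files.next();
    // makeCopy() only duplicates the PDF; the conversion to Google Docs
    // is then done by Pattern 1 or Pattern 2 above on the personal folder.
    file.makeCopy(file.getName(), destination);
  }
}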
Reference :
Batching Requests
If these are not useful for you, I'm sorry.
You can first fetch the IDs of all the files and store them in a Google Sheet. Then you can process each file normally using its ID, and after processing a file, mark it as processed. Before processing a file, check whether it has already been processed.
If there are many files, you can also store the row number up to which you have processed, and continue from there next time.
Finally, create a trigger to execute your function every 10 minutes or so.
This way you can work around the execution time limit for a single execution. API request quotas will not be bypassed by this method, though. A rough sketch of this idea is shown below.
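A minimal sketch of this pattern (my own illustration; the sheet layout with file IDs in column A and a "processed" flag in column B, and the function names, are assumptions), reusing Drive.Files.copy from the answers above for the conversion itself:
function convertNextBatch() {
  var sheet = SpreadsheetApp.openById("/* tracking spreadsheet id */").getSheets()[0];
  var destinationFolderId = "/* destination folder id */";
  var data = sheet.getDataRange().getValues();
  var start = new Date().getTime();
  for (var i = 1; i < data.length; i++) {                    // row 0 is assumed to be a header
    if (data[i][1]) continue;                                // column B: already processed
    if (new Date().getTime() - start > 4 * 60 * 1000) break; // stop well before the execution limit
    Drive.Files.copy({parents: [{id: destinationFolderId}]}, data[i][0], {convert: true, ocr: true});
    sheet.getRange(i + 1, 2).setValue("yes");                // mark the row as processed
  }
}

function installTrigger() {
  // run convertNextBatch every 10 minutes until all rows are marked
  ScriptApp.newTrigger("convertNextBatch").timeBased().everyMinutes(10).create();
}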
I'm creating an automatic e-mail which includes a lot of data that changes every week.
I'm new to Apps Script and I would like to add an image at the end of the e-mail.
Here is the code:
// Drive where is stored the image
const folder = DriveApp.getFolderById("1XXXXXXXXX");
// Retrieve ID file where is stored the image
const file = folder.getFilesByName("file")
const fileIDs = [];
while (file.hasNext()) {
var files = file.next();
fileIDs.push(files.getId());
}
var ssFile = SpreadsheetApp.openById(fileIDs[0]);
SpreadsheetApp.setActiveSpreadsheet(ssFile);
//Spreadsheet
var mail = ssFile.getSheetByName("Mail");
//Retrieve image from the spreadsheet
var retrieveImage = mail.getImages()[0];
var arrayImage = new Array();
var image = {};
arrayImage[0] = retrieveImage.getAs('image/png')
image["image"+0] = arrayImage[0];
//Fonction to send mail
function sendEmailS(){
var message = "Test";
message += "<img src='cid:image" +0+ "'> <br>";
GmailApp.sendEmail("email#email.com", "subject", "",
{
htmlBody: message,
inlineImages: image
}
);
}
I get the error that getAs is not a function. Could you help me or give me any clue to finish my script?
Issue and workaround:
From your script and the error getAs is not a function, I think the reason for your issue is that the image cannot be retrieved as a blob from the Spreadsheet.
At the current stage, unfortunately, there is no method for directly retrieving an image on a Spreadsheet as a blob. So, in this answer, I would like to propose a workaround that uses a Google Apps Script library. This library supports processes that the current Google services cannot directly achieve.
Usage:
1. Install the Google Apps Script library.
Please install the DocsServiceApp Google Apps Script library. You can see how to install it here.
2. Modified script.
When your script is modified using this library, it becomes as follows.
function sendEmailS() {
// Drive where is stored the image
const folder = DriveApp.getFolderById("1XXXXXXXXX");
// Retrieve ID file where is stored the image
const file = folder.getFilesByName("file")
const fileIDs = [];
while (file.hasNext()) {
var files = file.next();
fileIDs.push(files.getId());
}
var ssFile = SpreadsheetApp.openById(fileIDs[0]);
SpreadsheetApp.setActiveSpreadsheet(ssFile);
//Spreadsheet
var mail = ssFile.getSheetByName("Mail");
//Retrieve image from the spreadsheet
var retrieveImage = mail.getImages()[0];
var arrayImage = new Array();
var image = {};
const anchor = retrieveImage.getAnchorCell().getA1Notation();
const res = DocsServiceApp.openBySpreadsheetId(fileIDs[0]).getSheetByName("Mail").getImages();
const obj = res.find(({ range: { a1Notation } }) => a1Notation == anchor);
if (!obj) return;
arrayImage[0] = obj.image.blob;
image["image" + 0] = arrayImage[0];
//Fonction to send mail
var message = "Test";
message += "<img src='cid:image" + 0 + "'> <br>";
GmailApp.sendEmail("email#email.com", "subject", "",
{
htmlBody: message,
inlineImages: image
}
);
}
3. Testing.
When this script is run, the image of mail.getImages()[0] is retrieved as a blob, and an email is sent using that blob.
Reference:
DocsServiceApp of Google Apps Script library
Maybe there are some ideas here for you...
...this gets an image file from Google Drive and emails it...
function emailImage(){
var fileList = DriveApp.getFilesByName('imageNameInDrive.jpg');
var image;
while (fileList.hasNext()) { image = fileList.next().getId(); }
var insertImage = DriveApp.getFileById(image).getBlob();
var message = 'Test<br>';
message += '<img src="cid:insertImage" > <br>';
GmailApp.sendEmail("email#gmail.com", "subject", "",
{
htmlBody: message,
inlineImages: {
insertImage: insertImage
}
}
);
}
In addition to Tanaike's answer, which in my opinion would be a good workaround, there is an open Feature Request for converting Spreadsheet images to BlobSource.
Remember to hit the +1 button to tell Google that you are also interested.
Update OverGridImage to support BlobSource interface
I have a large JSON file that looks like this:
[
{"name": "item1"},
{"name": "item2"},
{"name": "item3"}
]
I want to stream this file (pretty easy so far), run an asynchronous function (that returns a promise) for each line, and, upon the resolve/reject of that promise, edit that line.
The input file could then end up looking like this:
[
{"name": "item1", "response": 200},
{"name": "item2", "response": 404},
{"name": "item3"} // not processed yet
]
I do not wish to create another file; I want to edit the SAME FILE on the fly (if possible!).
Thanks :)
I don't really answer the question, but I don't think it can be answered in a satisfactory way anyway, so here are my 2 cents.
I assume that you know how to stream line by line, and run the function, and that the only problem you have is editing the file that you are reading from.
Consequences of inserting
It is not possible to natively insert data into the middle of a file (which is what you want to do by changing the JSON live). A file can only grow at its end.
So inserting 10 bytes of data at the beginning of a 1 GB file means rewriting the whole 1 GB to disk (to move all the data 10 bytes further).
Your filesystem does not understand JSON; it just sees that you are inserting bytes in the middle of a big file, so this is going to be very slow.
So, yes, it is possible to do:
Write a wrapper over the file API in NodeJS with an insert() method.
Then write some more code to be able to know where to insert bytes into a JSON file without loading the whole file and without producing invalid JSON at the end (a sketch of what such a helper boils down to is shown below).
Now, I would not recommend it :)
=> Read this question: Is it possible to prepend data to a file without rewriting?
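To make the cost concrete, here is a rough sketch (my own illustration, not from any answer here) of what such an insert() helper ends up doing: read everything after the insertion point and write it back shifted.
var fs = require('fs');

// Insert `text` into the file at `path` at byte offset `position` by rewriting the tail.
// The I/O cost is proportional to (file size - position), which is why it gets slow on big files.
function insertIntoFile(path, position, text) {
  var fd = fs.openSync(path, 'r+');
  var stats = fs.fstatSync(fd);
  var tail = Buffer.alloc(stats.size - position);
  fs.readSync(fd, tail, 0, tail.length, position);  // read everything after the insertion point
  fs.writeSync(fd, text, position);                 // write the new data...
  fs.writeSync(fd, tail, 0, tail.length, position + Buffer.byteLength(text)); // ...then the shifted tail
  fs.closeSync(fd);
}
Even with a helper like this, every insert near the start of a large file rewrites almost the whole file, which is the point being made above.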
Why do it then?
I assume that you want to either:
Be able to kill your process at any time, and easily resume work by reading the file again.
Retry partially treated files to fill in only the missing bits.
First solution: Use a database
Abstracting the work that needs to be done to live-edit files at random places is the sole reason databases exist.
They all exist only to abstract the magic behind UPDATE mytable SET name = 'a_longer_name_than_the_name_that_was_there_before' WHERE name = 'short_name'.
Have a look at LevelUP/Down, sqlite, etc...
They will abstract all the magic that needs to be done in your JSON file!
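For instance, a rough sketch using the sqlite3 npm package to keep per-item status (the table name and columns are my own illustration, not something this answer specifies):
var sqlite3 = require('sqlite3');
var db = new sqlite3.Database('./progress.db');

db.serialize(function() {
  // one row per item; response stays NULL until the async work has finished
  db.run('CREATE TABLE IF NOT EXISTS items (name TEXT PRIMARY KEY, response INTEGER)');
  db.run('INSERT OR IGNORE INTO items (name) VALUES (?)', ['item1']);

  // recording a result is a tiny UPDATE instead of rewriting a big JSON file
  db.run('UPDATE items SET response = ? WHERE name = ?', [200, 'item1']);

  // on restart, only the unprocessed items are selected
  db.each('SELECT name FROM items WHERE response IS NULL', function(err, row) {
    console.log('still to do:', row.name);
  });
});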
Second solution: Use multiple files
When you stream your file, write two new files!
One contains the current position in the input file and the lines that need to be retried.
The other one contains the expected result.
You will also be able to kill your process at any time and restart it; a short sketch of this is shown below.
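A small sketch of that bookkeeping, assuming line-by-line processing with the readline module (the file names and the progress format are illustrative assumptions):
var fs = require('fs');
var readline = require('readline');

var input = fs.createReadStream('./data.json');
var results = fs.createWriteStream('./results.ldjson', { flags: 'a' }); // processed lines get appended here
var progressFile = './progress.json';                                   // where we record how far we got

var lineNo = 0;
var rl = readline.createInterface({ input: input });

rl.on('line', function (line) {
  lineNo++;
  // (the real async work on `line` would go here; this sketch just copies it through)
  results.write(line + '\n');
  // record the position so a restart can skip already-processed lines
  fs.writeFileSync(progressFile, JSON.stringify({ lastLine: lineNo }));
});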
According to this answer, writing to the same file while reading from it is not reliable. As a commenter there says, it is better to write to a temporary file, and then delete the original and rename the temp file over it.
To create a stream of lines you can use byline. Then for each line, apply some operation and pipe it out to the output file.
Something like this:
var fs = require('fs');
var stream = require('stream');
var util = require('util');
var LineStream = require('byline').LineStream;
function Modify(options) {
stream.Transform.call(this, options);
}
util.inherits(Modify, stream.Transform);
Modify.prototype._transform = function(chunk, encoding, done) {
var self = this;
setTimeout(function() {
// your modifications here, note that the exact regex depends on
// your json format and is probably the most brittle part of this
var modifiedChunk = chunk.toString();
if (modifiedChunk.search('response:[^,}]+') === -1) {
modifiedChunk = modifiedChunk
.replace('}', ', response: ' + new Date().getTime() + '}') + '\n';
}
self.push(modifiedChunk);
done();
}, Math.random() * 2000 + 1000); // to simulate an async modification
};
var inPath = './data.json';
var outPath = './out.txt';
fs.createReadStream(inPath)
.pipe(new LineStream())
.pipe(new Modify())
.pipe(fs.createWriteStream(outPath))
.on('close', function() {
// replace input with output
fs.unlink(inPath, function() {
fs.rename(outPath, inPath);
});
});
Note that the above results in only one async operation happening at a time. You could also save the modifications to an array and once all of them are done write the lines from the array to a file, like this:
var fs = require('fs');
var stream = require('stream');
var LineStream = require('byline').LineStream;
var modifiedLines = [];
var modifiedCount = 0;
var inPath = './data.json';
var allModified = new Promise(function(resolve, reject) {
fs.createReadStream(inPath).pipe(new LineStream()).on('data', function(chunk) {
modifiedLines.length++;
var index = modifiedLines.length - 1;
setTimeout(function() {
// your modifications here
var modifiedChunk = chunk.toString();
if (modifiedChunk.search('response:[^,}]+') === -1) {
modifiedChunk = modifiedChunk
.replace('}', ', response: ' + new Date().getTime() + '}');
}
modifiedLines[index] = modifiedChunk;
modifiedCount++;
if (modifiedCount === modifiedLines.length) {
resolve();
}
}, Math.random() * 2000 + 1000);
});
}).then(function() {
fs.writeFile(inPath, modifiedLines.join('\n'), function(err) { if (err) console.error(err); });
}).catch(function(reason) {
console.error(reason);
});
If, instead of lines, you wish to stream chunks of valid JSON, which would be a more robust approach, take a look at JSONStream.
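For example, a minimal sketch of consuming the top-level array with JSONStream's parse('*') pattern (the processing step is a placeholder):
var fs = require('fs');
var JSONStream = require('JSONStream');

fs.createReadStream('./data.json')
  .pipe(JSONStream.parse('*'))          // emits each element of the top-level array
  .on('data', function (item) {
    // item is a parsed object such as {name: 'item1'}; process it here
    console.log(item.name);
  })
  .on('end', function () {
    console.log('done');
  });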
As mentioned in the comments, the file you have is not proper JSON, although it is valid JavaScript. In order to generate proper JSON, JSON.stringify() could be used. I also think nonstandard JSON makes life difficult for anyone else who has to parse it, so I would recommend producing a new output file instead of keeping the original one.
However, it is still possible to parse the original file. This can be done via eval('(' + procline + ')');, although it is not secure to feed external data into node.js like this.
const fs = require('fs');
const readline = require('readline');
const fr = fs.createReadStream('file1');
const rl = readline.createInterface({
input: fr
});
rl.on('line', function (line) {
if (line.match(new RegExp("\{name"))) {
var procline = "";
if (line.trim().split('').pop() === ','){
procline = line.trim().substring(0,line.trim().length-1);
}
else{
procline = line.trim();
}
var lineObj = eval('(' + procline + ')');
lineObj.response = 200;
console.log(JSON.stringify(lineObj));
}
});
The output would be like this:
{"name":"item1","response":200}
{"name":"item2","response":200}
{"name":"item3","response":200}
Which is line-delimited JSON (LDJSON) and could be useful for streaming stuff, without the need for leading and trailing [, ], or ,. There is an ldjson-stream package for it as well.
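As a small illustration (my addition, with a placeholder file name), line-delimited JSON produced this way can later be consumed with nothing more than readline and JSON.parse:
var fs = require('fs');
var readline = require('readline');

var rl = readline.createInterface({ input: fs.createReadStream('./out.ldjson') });
rl.on('line', function (line) {
  if (!line.trim()) return;        // skip blank lines
  var obj = JSON.parse(line);      // each line is a complete JSON document
  console.log(obj.name, obj.response);
});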
I have a JavaScript which uses the Google Drive API. Sometimes it works flawlessly and other times it works only in part.
Basically it retrieves a list of files from Google Drive, downloads the files in a browser, puts the files in another folder in Google Drive and finally erases the files from their original folder.
Here is the portion of the script which carries out the above process.
function makeRequest() {
console.log("make request");
var request = gapi.client.drive.files.list({
'q' : "'0BxTSfcTBxwlXflNjeVRZeWFQbUIzcDJMeElER1pDVEZla0NNVjhpWUpGTzY1ZDVUTS0yTFE' in Parents"}); //get only the files in a specified folder
request.execute(function(resp) {
var x = []; //array for revised list of files, to only include those which have the prefix #FHM#
for (i = 0; i < resp.items.length; i++) {
if (resp.items[i].title.substring(0, 5) == "#FHM#") {
x.push([resp.items[i].title, resp.items[i].webContentLink, resp.items[i].id]);
}
}
if (x.length == 0) {
document.getElementById("downloadButton").value = "There are no files to download";
}
var originalFolder = "0BxTSfcTBxwlXflNjeVRZeWFQbUIzcDJMeElER1pDVEZla0NNVjhpWUpGTzY1ZDVUTS0yTFE" //original google drive folder ID
var processedFolder = "0BxTSfcTBxwlXfkVlYVRCdnBaMFRubWM4eUt5V0FYVEdIcHdjWDd6SFhYb3pPSjBZeFZ2T3M" // google drive folder for processed files
//loop through all files and trigger a download event for each
for (i = 0; i < x.length; i++) {
console.log("download loop " + i);
var dlUrl = x[i][1];
var fileIdentity = x[i][2];
var fileTitle = x[i][0];//only used in the console log
downloadUrl(dlUrl);
}
//loop through all of final list of files and trigger the function moveFile (i.e link file to a new folder location)
for (i = 0; i < x.length; i++) {
console.log("moveFile loop " + i);
var dlUrl = x[i][1];
fileIdentity = x[i][2];
fileTitle = x[i][0];//only used in the console log
setTimeout(moveFile(processedFolder, originalFolder, fileIdentity, fileTitle),1000);
}
//loop through all of final list of files and trigger the function moveFile2 (i.e the one that deletes the original parent folder)
for (i = 0; i < x.length; i++) {
console.log("moveFile2 loop " + i);
var dlUrl = x[i][1];
fileIdentity = x[i][2];
fileTitle = x[i][0];//only used in the console log
setTimeout(moveFile2(processedFolder, originalFolder, fileIdentity, fileTitle),1000);
}
});
}
function downloadUrl(url) {
var iframe = document.createElement("iframe");
iframe.src = url;
iframe.style.display = "none";
document.body.appendChild(iframe);
console.log("download triggered");
}
function moveFile(processedFolder, originalFolder, fileIdentity, fileTitle) {
//move the file to the new processed folder
var body = {
'id': processedFolder
};
var request = gapi.client.drive.parents.insert({
'fileId': fileIdentity,
'resource': body
});
request.execute(function(resp) {});
console.log("file should be in new location : " + fileTitle);
}
function moveFile2(processedFolder, originalFolder, fileIdentity, fileTitle) {
//this bit removes the file from the the original folder
var request = gapi.client.drive.parents.delete({
'parentId': originalFolder,
'fileId': fileIdentity
});
request.execute(function(resp) {});
console.log("file should be removed from old location : " + fileTitle);
}
OK, so when I run this with 1 file to process, the console.log statements list the events as:
Download loop 0
download triggered
Movefile loop 0
file should be in new location : filename is XXXXX
movefile2 loop 0
file should be removed from original location
So I think that tells me the order in which the events have fired, yet sometimes I don't get all the events completing as expected. I might have some downloads missed, or some files won't move, or some will still be in the original location.
If I run any one of the 3 for loops on its own it works flawlessly every time, but as soon as I do all 3 it starts to fall apart and I get all sorts of console error messages about being forbidden, about GET requests denied, deleted POST requests - all sorts of stuff.
So.....
Is this to do with the way JavaScript handles events?
Do I need to somehow delay the firing of events until the previous for loop has finished? (I've tried a setTimeout for the 2nd and 3rd for loops but it doesn't seem to help.)
Or am I missing something fundamental about how JavaScript works?
It's like if I try to add a file to a folder before the download has physically arrived on my hard drive, then it falls apart; similarly, if I remove a parent folder before it has finished, it also falls apart. As far as I understand, Google Drive is not physically moving the file, it is simply adding a label to it. The actual physical location in Google's cloud remains unchanged throughout.
help!!!!!
Too much code for me to read on my mobile, but there are a couple of obvious issues.
1/ The use of setTimeout is almost always wrong, since it introduces timing errors.
2/ You have execute calls with an empty callback function, followed by a console.log. The API call is only complete within the callback, not on the next line of JavaScript.
So I'd say your guess that you've misunderstood the event handling is correct; a sketch of chaining the calls inside the callbacks follows.
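As a rough illustration of that point (this is my sketch, not the original poster's code), the move and the parent removal can be sequenced inside the callbacks so the delete only fires after the insert has actually completed:
function moveThenCleanup(processedFolder, originalFolder, fileIdentity, fileTitle) {
  // add the new parent first...
  gapi.client.drive.parents.insert({
    'fileId': fileIdentity,
    'resource': {'id': processedFolder}
  }).execute(function(insertResp) {
    console.log("added to new folder: " + fileTitle);
    // ...and only remove the old parent once the insert has come back
    gapi.client.drive.parents.delete({
      'parentId': originalFolder,
      'fileId': fileIdentity
    }).execute(function(deleteResp) {
      console.log("removed from old folder: " + fileTitle);
    });
  });
}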
I have a batch file that launches a .js file which, via WinSCP, checks whether a file exists and reports back to the batch file whether it does or not.
The problem is: it always returns "not found", and I cannot figure out why. I am unsure how to use a wildcard in this scenario.
The batch file looks like this:
cscript /nologo file.js
if errorlevel 1 goto notfound
exit
:notfound
(another script to copy a file over)
Only one file can exist on the server at a time. So every ten minutes, this batch file will run, check if there is a file, and if not, copy one over.
The file.js:
// Configuration
// Remote file search for
var FILEPATH = "../filepath/TSS*";
// Session to connect to
var SESSION = "mysession#someplace.come";
// Path to winscp.com
var WINSCP = "c:\\program files (x86)\\winscp\\winscp.com";
var filesys = WScript.CreateObject("Scripting.FileSystemObject");
var shell = WScript.CreateObject("WScript.Shell");
var logfilepath = filesys.GetSpecialFolder(2) + "\\" + filesys.GetTempName() + ".xml";
var p = FILEPATH.lastIndexOf('/');
var path = FILEPATH.substring(0, p);
var filename = FILEPATH.substring(p + 1);
var exec;
// run winscp to check for file existence
exec = shell.Exec("\"" + WINSCP + "\" /log=\"" + logfilepath + "\"");
exec.StdIn.Write(
"option batch abort\n" +
"open \"" + SESSION + "\"\n" +
"ls \"" + path + "\"\n" +
"exit\n");
// wait until the script finishes
while (exec.Status == 0)
{
WScript.Sleep(100);
WScript.Echo(exec.StdOut.ReadAll());
}
if (exec.ExitCode != 0)
{
WScript.Echo("Error checking for file existence");
WScript.Quit(1);
}
// look for log file
var logfile = filesys.GetFile(logfilepath);
if (logfile == null)
{
WScript.Echo("Cannot find log file");
WScript.Quit(1);
}
// parse XML log file
var doc = new ActiveXObject("MSXML2.DOMDocument");
doc.async = false;
doc.load(logfilepath);
doc.setProperty("SelectionNamespaces",
"xmlns:w='http://winscp.net/schema/session/1.0'");
var nodes = doc.selectNodes("//w:file/w:filename[@value='" + filename + "']");
if (nodes.length > 0)
{
WScript.Echo("File found");
// signalize file existence to calling process;
// you can also continue with processing (e.g. downloading the file)
// directly from the script here
WScript.Quit(0);
}
else
{
WScript.Echo("File not found");
WScript.Quit(1);
}
On line 4 it says:
var FILEPATH = "../filepath/TSS*";
That star is what is giving me issues, I think. I need to look for a file which STARTS WITH TSS but will have a timestamp tacked on the end, so I need to use a wildcard after TSS.
So what I need help with is: making this process return true if any file matching TSS* exists.
Any help would be much appreciated.
EDIT:
var nodes = doc.selectNodes("//w:file/w:filename[starts-with(@value, 'TSS')]");
This code seems to not work. If this code worked, it seems like it would solve all my problems.
You need to correct the XPath expression in the var nodes... line.
Try something like this:
doc.setProperty("SelectionLanguage", "XPath"); //added in edit
var nodes = doc.selectNodes("//w:file/w:filename[starts-with(@value, '" + filename + "')]");
and delete the asterisk from FILEPATH.
Note: the first line is required in order to use XPath as the query language instead of the default (and old) XSLPattern, which doesn't support methods such as starts-with or contains.
SelectionLanguage Property (MSDN).
You can use the stat command. You can even inline the WinSCP script into the batch file:
#echo off
set REMOTE_PATH=/home/user/test.txt
winscp.com /command ^
"option batch abort" ^
"open mysession" ^
"stat %REMOTE_PATH%" ^
"exit"
if errorlevel 1 goto error
echo File %REMOTE_PATH% exists
rem Do something
exit 0
:error
echo Error or file %REMOTE_PATH% not exists
exit 1
An alternative is using the Session.FileExists method from the WinSCP .NET assembly.
For further details, see the WinSCP article Checking file existence.
I get a PKCS#7 crypto package from a 3rd party system.
The package is not compressed and not encrypted; it is PEM-encoded and signed with an X.509 certificate.
I also have a PEM cert file from the provider.
The data inside is XML
I need to do the following in Node.JS:
extract the data
verify the signature
A sample package (no sensitive info, data refers to our qa system) http://pastebin.com/7ay7F99e
OK, finally got it.
First of all, PKCS messages are complex structures binary-encoded using ASN.1.
Second, they can be serialized to binary files (DER encoding) or to text PEM files using Base64 encoding.
Third, the PKCS#7 format specifies several package types, of which mine is called Signed Data. These formats are distinguished by the OBJECT IDENTIFIER value at the beginning of the ASN.1 object (the first element of the wrapper sequence); you can go to http://lapo.it/asn1js/ and paste the package text to see the fully parsed structure.
Next, we need to parse the package (Base64 -> ASN.1 -> some object representation). Unfortunately, there's no npm package for that. I found quite a good project, forge, that is not published to the npm registry (though it is npm-compatible). It parses the PEM format, but the resulting tree is quite an unpleasant thing to traverse. Based on their Encrypted Data and Enveloped Data implementations, I created a partial implementation of Signed Data in my own fork. UPD: my pull request was later merged into the forge project.
Now finally we have the whole thing parsed.
At that point I found a great (and probably the only on the whole web) explanative article on signed PKCS#7 verification: http://qistoph.blogspot.com/2012/01/manual-verify-pkcs7-signed-data-with.html
I was able to extract and successfully decode the signature from the file, but the hash inside was different from the data's hash. God bless Chris who explained what actually happens.
The data signing process has 2 steps:
the original content's hash is calculated
a set of "Authorized Attributes" is constructed, including: the type of the data signed, the signing time, and the data hash
Then the set from step 2 is signed using the signer's private key.
Due to PKCS#7 specifics, this set of attributes is stored inside a context-specific constructed type (class=0x80, type=0), but it should be signed and validated as a normal SET (class=0, type=17).
As Chris mentions (https://stackoverflow.com/a/16154756/108533) this only verifies that the attributes in the package are valid. We should also validate the actual data hash against the digest attribute.
So finally here's a code doing validation (cert.pem is a certificate file that the provider sent me, package is a PEM-encoded message I got from them over HTTP POST):
var fs = require('fs');
var crypto = require('crypto');
var forge = require('forge');
var pkcs7 = forge.pkcs7;
var asn1 = forge.asn1;
var oids = forge.pki.oids;
var folder = '/a/path/to/files/';
var pkg = fs.readFileSync(folder + 'package').toString();
var cert = fs.readFileSync(folder + 'cert.pem').toString();
var res = true;
try {
var msg = pkcs7.messageFromPem(pkg);
var attrs = msg.rawCapture.authenticatedAttributes;
var set = asn1.create(asn1.Class.UNIVERSAL, asn1.Type.SET, true, attrs);
var buf = Buffer.from(asn1.toDer(set).data, 'binary');
var sig = msg.rawCapture.signature;
var v = crypto.createVerify('RSA-SHA1');
v.update(buf);
if (!v.verify(cert, sig)) {
console.log('Wrong authorized attributes!');
res = false;
}
var h = crypto.createHash('SHA1');
var data = msg.rawCapture.content.value[0].value[0].value;
h.update(data);
var attrDigest = null;
for (var i = 0, l = attrs.length; i < l; ++i) {
if (asn1.derToOid(attrs[i].value[0].value) === oids.messageDigest) {
attrDigest = attrs[i].value[1].value[0].value;
}
}
var dataDigest = h.digest();
if (dataDigest !== attrDigest) {
console.log('Wrong content digest');
res = false;
}
}
catch (_e) {
console.dir(_e);
res = false;
}
if (res) {
console.log("It's OK");
}
Your answer is a big step in the right direction. You are, however, missing an essential part of the validation!
You should verify the hash of the data against the digest contained in the signed attributes. Otherwise it would be possible for someone to replace the content with malicious data. Try for example validating the following 'package' with your code (and have a look at the content): http://pastebin.com/kaZ2XQQc
I'm not much of a NodeJS developer (this is actually my first try :p), but here's a suggestion to help you get started.
var fs = require('fs');
var crypto = require('crypto');
var pkcs7 = require('./js/pkcs7'); // forge from my own fork
var asn1 = require('./js/asn1');
var folder = '';
var pkg = fs.readFileSync(folder + 'package').toString();
var cert = fs.readFileSync(folder + 'cert.pem').toString();
try {
var msg = pkcs7.messageFromPem(pkg);
var attrs = msg.rawCapture.authenticatedAttributes; // got the list of auth attrs
var set = asn1.create(asn1.Class.UNIVERSAL, asn1.Type.SET, true, attrs); // packed them inside of the SET object
var buf = new Buffer(asn1.toDer(set).data, 'binary'); // DO NOT forget 'binary', otherwise it tries to interpret bytes as UTF-8 chars
var sig = msg.rawCapture.signature;
var shasum = crypto.createHash('sha1'); // better be based on msg.rawCapture.digestAlgorithms
shasum.update(msg.rawCapture.content.value[0].value[0].value);
for(var n in attrs) {
var attrib = attrs[n].value;
var attrib_type = attrib[0].value;
var attrib_value = attrib[1].value[0].value;
if(attrib_type == "\x2a\x86\x48\x86\xf7\x0d\x01\x09\x04") { // better would be to use the OID (1.2.840.113549.1.9.4)
if(shasum.digest('binary') == attrib_value) {
console.log('hash matches');
var v = crypto.createVerify('RSA-SHA1');
v.update(buf);
console.log(v.verify(cert, sig)); // -> should type true
} else {
console.log('hash mismatch');
}
}
}
}
catch (_e) {
console.dir(_e);
}
Based on inspiration from this answer, I've implemented a sample for signing and verifying PDF files using node-signpdf and node-forge.