local PDF file scraping in node.js

local PDF file scraping in node.js - javascript

I have uploaded a pdf via a MEAN stack web application using fs. I want to extract certain fields from the pdf and display them on the web app. I have looked at a couple npm packages like pdf.js, pdf2json. I can't figure out the documentation and javascript callbacks used in the examples available. Please help!

I hope I can help answer your question. pdf2json can be used to parse a PDF and extract its text. There are a couple of steps that need to be taken to get it working. I have adapted the example from https://github.com/modesty/pdf2json.
The setup is to install pdf2json in the node app, and also underscore. The example page didn't explain the need to define your own callback functions. It also used self instead of this to register them. So, with the appropriate changes the code to extract all the text from the pdf will be something like this:
// Get the dependencies that have already been installed
// to ./node_modules with `npm install <dep>` in the root directory
// of your app
var _ = require('underscore'),
PDFParser = require('pdf2json');
var pdfParser = new PDFParser();
// Create a function to handle the pdf once it has been parsed.
// In this case we cycle through all the pages, extract all the
// text blocks and print them to the console.
// If you do `console.log(JSON.stringify(pdf))` you will
// see how the parsed pdf is composed. Drill down into it
// to find the data you are looking for.
//
// @param {Object} pdf - payload of the `pdfParser_dataReady` event; pages
//   are read from `pdf.data.Pages`, and each page's text blocks from `Texts`.
var _onPDFBinDataReady = function (pdf) {
  console.log('Loaded pdf:\n');
  // Iterate with forEach instead of for...in: for...in walks enumerable
  // keys (including inherited ones), not just the array elements.
  pdf.data.Pages.forEach(function (page) {
    page.Texts.forEach(function (text) {
      // pdf2json delivers the text of a run URI-encoded (e.g. "%20" for a
      // space), so decode it before printing.
      console.log(decodeURIComponent(text.R[0].T));
    });
  });
};
// Error handler for the parser: print whatever was reported to the console.
var _onPDFBinDataError = function (parseError) {
  console.log(parseError);
};
// Bind both handlers with underscore before registering them. `this` is
// used as the bound context (the upstream example used `self`, which has
// no meaning here).
var boundDataReadyHandler = _.bind(_onPDFBinDataReady, this);
var boundDataErrorHandler = _.bind(_onPDFBinDataError, this);

// The parser emits one of these two events once loading has finished.
pdfParser.on('pdfParser_dataReady', boundDataReadyHandler);
pdfParser.on('pdfParser_dataError', boundDataErrorHandler);

// Path of the pdf to parse.
var pdfFilePath = 'test3.pdf';

// Kick off loading; the data-ready handler is called when parsing is done.
pdfParser.loadPDF(pdfFilePath);

I am running the code out of my server side controller.
module.exports = (function() {
return {
add: function(req, res) {
var tmp_path = req.files.pdf.path;
var target_path = './uploads/' + req.files.pdf.name;
fs.rename(tmp_path, target_path, function(err) {
if (err) throw err;
// delete the temporary file, so that the explicitly set temporary upload dir does not get filled with unwanted files
fs.unlink(tmp_path, function() {
if (err) throw err;
//edit here pdf parser
res.redirect('#/');
});
})
},
// Parse a fixed PDF from ./uploads and dump the parsed structure to the console.
// NOTE(review): nothing in this handler writes to `res`, so the request is
// never answered from here; the parsed data only reaches the server console.
show: function(req, res) {
var pdfParser = new PDFParser();
// Called with the parsed pdf once pdf2json has finished loading it.
var _onPDFBinDataReady = function (pdf) {
console.log('Loaded pdf:\n');
// Both loops currently do no work: the statements that printed the text
// are commented out, so the only output is the JSON dump after the loops.
for (var i in pdf.data.Pages) {
var page = pdf.data.Pages[i];
// console.log(page.Texts);
for (var j in page.Texts) {
var text = page.Texts[j];
// console.log(text.R[0].T);
}
}
// Print the whole parsed document so its structure can be inspected.
console.log(JSON.stringify(pdf));
};
// Create an error handling function
var _onPDFBinDataError = function (error) {
console.log(error);
};
// Register the data ready function
pdfParser.on('pdfParser_dataReady', _.bind(_onPDFBinDataReady, this));
// Register error handling function
pdfParser.on('pdfParser_dataError', _.bind(_onPDFBinDataError, this));
// Construct the file path of the pdf
// NOTE(review): the path is hard-coded rather than derived from `req`.
var pdfFilePath = './uploads/Invoice_template.pdf';
// Load the pdf. When it is loaded your data ready function will be called.
pdfParser.loadPDF(pdfFilePath);
},
//end controller
}

Related

Watson Assistant context is not updated

I use watson assistant v1
My problem is that every time I make a call to the code in Nodejs, where I return the context, to have a coordinated conversation, the context is only updated once and I get stuck in a node of the conversation
this is my code
// For every incoming message: load the stored Watson context (if any), send
// the message text to Watson Assistant, save the returned context to
// contexto.json and forward Watson's reply to the sender.
client.on('message', message => {
//general variables
// (the <...> values below are placeholders from the original post)
var carpetaIndividual = <../../../>
var cuerpoMensaje = <....>
var emisorMensaje = <....>
//detect if context exists
if(fs.existsSync(carpetaIndividual+'/contexto.json')) {
// NOTE(review): require() caches what it loads, so after the first message
// this keeps returning the first version of contexto.json that was read,
// not the file's current contents.
var watsonContexto = require(carpetaIndividual+'/contexto.json');
var variableContexto = watsonContexto;
} else {
var variableContexto = {}
}
//conection with Watson Assistant
assistant.message(
{
input: { text: cuerpoMensaje },
workspaceId: '<>',
context: variableContexto,
})
.then(response => {
// Only the first text entry of the response is used as the reply.
let messageWatson = response.result.output.text[0];
let contextoWatson = response.result.context;
console.log('Chatbot: ' + messageWatson);
//Save and create JSON file for context
// NOTE(review): the write is asynchronous and nothing waits for it, so a
// later message could read the file before this write has completed.
fs.writeFile(carpetaIndividual+'/contexto.json', JSON.stringify(contextoWatson), 'utf8', function (err) {
if (err) {
console.error(err);
}
});
//Send messages to my application
client.sendMessage(emisorMensaje, messageWatson)
})
.catch(err => {
console.log(err);
});
// NOTE(review): the client.on( call opened above is closed here with `}`
// only; the closing `)` is missing in this excerpt.
}
client.initialize();
the context.json file is updated, but when it is read the code only reads the first update of the context.json file and not the other updates

This will be because you are using require to read the .json file. For all subsequent requires of an already-required file, the data is cached and reused.
You will need to use fs.readFileSync (or fs.readFile) and JSON.parse instead:
// detect if context exists
// Read the file directly rather than calling fs.existsSync first: the file
// could appear or disappear between the check and the read, and a single
// declaration avoids declaring `variableContexto` in both branches.
var variableContexto;
try {
  // Converting to JSON
  variableContexto = JSON.parse(
    fs.readFileSync(carpetaIndividual + '/contexto.json', 'utf8')
  );
} catch (err) {
  // A missing file just means there is no stored context yet. Anything
  // else (unreadable file, invalid JSON) is a real error and is rethrown.
  if (err.code !== 'ENOENT') {
    throw err;
  }
  variableContexto = {};
}
There is another subtle problem with your code, in that you are relying on
your async call to fs.writeFile completing before you read the file. This will be the case most of the time, but as you don't wait for the fs.writeFile to complete there is the chance that you may try to read the file, before it is written.

Passing HTML input file as the parameter of Firebase Cloud Function

I am pretty new this area and I started firebase cloud function 2 days ago.
Sorry, I am still a student so I might not understand clearly some documentation.
I tried to figure out how the parameter is passed from my client-side javascript to firebase cloud function.
my cloud function
// Callable cloud function: runs Google Vision document text detection on
// the value received from the client and logs the detected text.
exports.OCR = functions.https.onCall((req) => {
// NOTE(review): '#google-cloud/vision' is presumably a mangled
// '@google-cloud/vision' - confirm against the original post.
const vision = require('#google-cloud/vision');
// Creates a client
const client = new vision.ImageAnnotatorClient();
console.log(req);
// Performs document text detection on the received value
// NOTE(review): this promise chain is not returned from the handler, so the
// values returned inside .then()/.catch() never become the function's
// result; the handler itself returns undefined.
client
.documentTextDetection(req)
.then((results) => {
console.log("Entered");
console.log(req);
const fullTextAnnotation = results[0].fullTextAnnotation;
console.log(fullTextAnnotation.text);
return results[0].fullTextAnnotation.text;
})
.catch(err => {
console.error('ERROR:', err);
return "error";
});
})
I am using firebase cloud function and Google Vision API.
actually I tried to pass the parameter like this
My client-side code
// Open the file picker, then hand whichever file the user selects to OCR.
document.getElementById("fileInput").click();
var file = document.getElementById("fileInput");
var fileInput = document.getElementById('fileInput');

fileInput.addEventListener('change', function (changeEvent) {
  // The OCR result and any OCR failure are both simply logged.
  var logOutcome = function (outcome) {
    console.log(outcome);
  };

  var file = changeEvent.target.files[0];
  // Do something with the image file.
  var tmppath = URL.createObjectURL(file);
  console.log(file);
  console.log(tmppath);
  //var url = "https://firebasestorage.googleapis.com/v0/b/recette-f3ef5.appspot.com/o/FB1.gif?alt=media&token=28727220-181c-440e-87ae-4808b5c9ba28";
  OCR(file).then(logOutcome).catch(logOutcome);
});
and it did not work. I always got null return when I trigger the function.
So, my question is that how can I pass the file (HTML INPUT TAG) to my cloud function?
p.s: when I tried the code with node the_code.js it works.

According to the Google Cloud Node.js library documentation the documentTextDetection function should receive a JS object like this:
// Request object: the image is identified by its URI under the `source` key.
var image = { source: { imageUri: 'gs://path/to/image.jpg' } };

vision
  .documentTextDetection(image)
  .then(function (response) {
    // doThingsWith(response);
  })
  .catch(function (err) {
    console.error(err);
  });
The file you are passing to OCR function has probably a different structure than that defined in documentation.
There are some variants to this:
If the key is source, the value should be another object containing
imageUri or filename as a key and a string as a value.
If the key is content, the value should be a Buffer.
So your code should look something like this.
console.log(tmppath);
//var url = "https://firebasestorage.googleapis.com/v0/b/recette-f3ef5.appspot.com/o/FB1.gif?alt=media&token=28727220-181c-440e-87ae-4808b5c9ba28";
// Declare `image` with `var`: a bare assignment would create an implicit
// global (and throws a ReferenceError in strict mode).
// The request object identifies the image by URI under the `source` key.
var image = {source: {imageUri: 'https://firebasestorage.googleapis.com/v0/b/recette-f3ef5.appspot.com/o/FB1.gif?alt=media&token=28727220-181c-440e-87ae-4808b5c9ba28'}};
OCR(image)
Please provide the complete error messages and a description of what `file` contains.

Create plugin gulp with stream

I created plugin for send json data in json file.
But I don't understand why send my object json in pipe, and not write file directly in my plugin.
I want to use my plugin with this syntax:
gulp.task('js-hash', function()
{
    // Get all js in redis
    // Return the stream so gulp can tell when the task has finished;
    // without the return the task has no completion signal.
    // NOTE(review): gulp.dest() normally receives an output directory, not
    // a file name - confirm './build/js/hash.json' is what is intended.
    return gulp.src('./build/js/**/*.js')
        .pipe(getHashFile('/build/js/'))
        .pipe(gulp.dest('./build/js/hash.json'));
});
And not that:
gulp.task('js-hash', function()
{
    // Get all js in redis
    // Return the stream so gulp can tell when the task has finished;
    // without the return the task has no completion signal.
    return gulp.src('./build/js/**/*.js')
        .pipe(getHashFile('./build/js/hash.json', '/build/js/'));
});
This is my plugin:
var through = require('through2');
var gutil = require('gulp-util');
var crypto = require('crypto');
var fs = require('fs');
var PluginError = gutil.PluginError;
// Consts
const PLUGIN_NAME = 'get-hash-file';
var json = {};
/**
 * Gulp plugin: computes the SHA-256 hash of every buffered file passing
 * through the stream and, once the stream finishes, writes a JSON map of
 * { path: hash } to `filename`.
 *
 * @param {string} filename - path of the JSON file to write (required)
 * @param {string} basename - prefix which, appended to `file.cwd`, is
 *   stripped from each file path to build the map key
 * @returns {stream.Transform} object stream to use in `.pipe()`
 * @throws {PluginError} when `filename` is missing
 */
function getHashFile(filename, basename)
{
    if (!filename) {
        // PluginError is a constructor and must be called with `new`.
        throw new PluginError(PLUGIN_NAME, "Missing filename !");
    }

    // Hash map local to this call, so separate invocations (e.g. repeated
    // runs in one process) do not inherit each other's entries.
    var hashes = {};

    // Creating a stream through which each file will pass
    var stream = through.obj(function (file, enc, callback) {
        if (file.isNull()) {
            this.push(file); // Do nothing if no contents
            return callback();
        }

        if (file.isStream()) {
            this.emit('error', new PluginError(PLUGIN_NAME, 'Stream not supported!'));
            return callback();
        }

        if (file.isBuffer()) {
            // Hash the raw bytes; converting to a string first is lossy for
            // contents that are not valid UTF-8.
            var hash = crypto.createHash('sha256').update(file.contents).digest('hex');
            hashes[file.path.replace(file.cwd+basename, '')] = hash;
        }

        // Always signal completion so the stream can never stall on a file.
        return callback();
    }).on('finish', function () {
        fs.writeFile(filename, JSON.stringify(hashes), function(err) {
            if (err) {
                // Report through the stream rather than throwing from an
                // async callback, where no caller could catch it.
                stream.emit('error', new PluginError(PLUGIN_NAME, err));
            }
        });
    });

    // returning the file stream
    return stream;
}

// Exporting the plugin main function
module.exports = getHashFile;
Any ideas?

Nothing prevents you from doing this... besides not respecting plugins guidelines!
Users actually assume a plugin will stream files and that they can pipe them to other plugins.
If I get your code right, you're trying to generate a file that contains all sha hashes of inbound files. Why not let users take this file and pipe it to other plugins? You'd be surprised what people could do.
While this question looks a bit opinion-based, you could definitely put the focus on how to deal with files that may not belong to the main stream of files. Issues like this can be found in many plugins; for example, gulp-uglify authors are wondering how they can add source-maps without mixing js and source map downstream.

node.js never exits after insert to couchbase, opposite of most node questions

My problem seems to be the opposite of every node.js question :-) I have a simple forEach loop to read a list of files and insert them into a Couchbase database. This works great, but it never exits after reading all the lines. So I added a counter to shutdown the couchbase connection after all inserts are complete. This works.
This process is intended to load hundreds of thousands of files, so I brought the async module into the mix to batch the inserts into groups of 100. The async.eachLimit is used to iterate over the array and insert documents in batches. Now the orig problem is back. Whatever magic async.eachLimit uses to recognize the process is complete is not happening.
I've been going through javascript scoping, callbacks, async, etc. Google searches are hitting keywords but not this issue. I've reduced the code down to the following testcase. To test, create three files and add their names to testlist.txt.
The async.eachLimit in place works up until it hits the limit, then hangs. Comment this out and uncomment array.forEach line and it works. Thanks in advance!
// Test case: read a list of file names from testlist.txt and store each
// file's contents in Couchbase under its file name.
var fs = require('fs');
var couchbase = require('couchbase');
var async = require('async');
var filelist = 'testlist.txt';
// Number of inserts still outstanding; the connection is shut down at zero.
var key_count = 0;
var cb_config = { host: 'localhost:8091', bucket: 'default'};
var db = new couchbase.Connection(cb_config, function(err) {
if (err) {
// NOTE(review): cb_config is an object, so this concatenation prints
// "[object Object]" rather than the configuration values.
console.log('ERRR connect to couchbase at config['+cb_config+']');
throw err;
}
});
// Read one file and insert it into Couchbase, keyed by its name.
// NOTE(review): this function accepts only `line`. async.eachLimit also
// passes a completion callback as a second argument, and it is never
// called here, so eachLimit is never told that an item has finished.
var insertFile=function(line) {
console.log('LOAD ['+line+']');
fs.readFile(line, function(file_err, f_doc) {
if(file_err) throw file_err;
db.set(line, f_doc, function(db_err, db_res){
if (db_err) {
console.log('FAIL ['+line+'] err['+db_err+']');
} else {
console.log('PASS ['+line+']');
}
// Count this insert as done (pass or fail) and shut down after the last one.
key_count--;
if (key_count == 0) {
console.log('DONE Shutting down client, no more keys');
db.shutdown();
}
});
});
}
// read list of files into data array from file filelist
fs.readFile(filelist, function(filelist_err, lines) {
if(filelist_err) throw filelist_err;
// HACK split adds empty line to array, use replace to fix
var array = lines.toString().replace(/\n$/, '').split('\n');
key_count = array.length;
console.log('INIT lines['+key_count+']');
// Run insertFile over the list with at most 2 items in flight at a time.
// NOTE(review): the final callback logs 'FAIL' whenever it runs, even when
// `err` is empty.
async.eachLimit(array, 2, insertFile, function(err) { console.log('FAIL async err['+err+']');} );
//array.forEach(function(data){insertFile(data);return;});
});
Testcase output using array.forEach:
INIT lines[3]
LOAD [files.big.txt]
LOAD [files.little.txt]
LOAD [files.txt]
PASS [files.little.txt]
PASS [files.big.txt]
PASS [files.txt]
DONE Shutting down client, no more keys
Testcase output using async.eachLimit:
INIT lines[3]
LOAD [files.big.txt]
LOAD [files.little.txt]
PASS [files.little.txt]
PASS [files.big.txt]
... hang, never gets to 3...

After review with a coworker, they spotted my mistake. I missed the async callback in my insertFile function. Adding that in works and allows me to remove the key counter! Solution code below:
var fs = require('fs');
var couchbase = require('couchbase');
var async = require('async');
var filelist = 'testlist.txt';
var key_count = 0;
var cb_config = { host: 'localhost:8091', bucket: 'default'};
// Open the Couchbase connection; a connection failure is logged and rethrown.
var db = new couchbase.Connection(cb_config, function(err) {
  if (err) {
    // Serialize the config: concatenating the object itself would only
    // print "[object Object]".
    console.log('ERRR connect to couchbase at config['+JSON.stringify(cb_config)+']');
    throw err;
  }
});
/**
 * Read one file and store its contents in Couchbase under the file name.
 * Written as an async.js iterator: `callback` is called exactly once, with
 * an error on failure and with no arguments on success.
 *
 * @param {string} line - path of the file to load; also used as the key
 * @param {function(Error=)} callback - completion callback from async.js
 */
var insertFile=function(line, callback) {
  console.log('LOAD ['+line+']');
  fs.readFile(line, function(file_err, f_doc) {
    if (file_err) {
      // Report read failures through the callback instead of throwing:
      // a throw inside this async callback cannot be caught by the caller.
      console.log('FAIL ['+line+'] err['+file_err+']');
      return callback(file_err);
    }
    db.set(line, f_doc, function(db_err, db_res){
      if (db_err) {
        console.log('FAIL ['+line+'] err['+db_err+']');
        callback(db_err);
      } else {
        console.log('PASS ['+line+']');
        callback();
      }
    });
  });
};
// Read the list of file names from `filelist`, insert the files two at a
// time, and shut the Couchbase client down once every insert has finished.
fs.readFile(filelist, function(filelist_err, data) {
  if(filelist_err) throw filelist_err;
  // Split on both newline styles and drop blank lines, so a trailing
  // newline, CRLF line endings or an empty list never yield an empty or
  // '\r'-suffixed file name.
  var array = data.toString().split(/\r?\n/).filter(function (name) {
    return name.trim() !== '';
  });
  key_count = array.length;
  console.log('READ files['+key_count+']');
  async.eachLimit(array, 2, insertFile, function(err) {
    if (err) console.log('LAST with async err['+err+']');
    console.log('DONE Shutting down client, no more keys');
    db.shutdown();
  });
});
And successful output:
$ node testcase.js
READ files[3]
LOAD [files.big.txt]
LOAD [files.little.txt]
PASS [files.little.txt]
LOAD [files.txt]
PASS [files.big.txt]
PASS [files.txt]
DONE Shutting down client, no more keys

Node.js - Organising code and closures - SFTP/Inotify

I was hoping I could get some advice on why my nodejs program is behaving in the way it is.
I am using two modules, node-sftp and node-inotify. I have setup node-inotify to watch a directory and call a function when something is written there, the function being an sftp upload.
Now the problem I have is that processing one file at a time is fine but when I drop 4 files in one go there, the function is called four times but only one sftp upload goes through.
Do I need to order my code in a particular way to ensure that the sftp upload occurs x times, is this something to do with closures perhaps?
This is a basic version of my code...
"event_handler" is called when something happens on a "watched" directory
"check_event" figures out if this type of event is one we want, in this case it's a "write"
"ftp_to_server" prepare connection details
"do_ftp" basically uses the node-sftp module to perform the sftp upload
// Called when something happens on a watched directory: hands the event to
// check_event, with ftp_to_server as the action for events of interest.
event_handler = function(event){
  // check_event is declared as (event, handler), so the handler must be
  // the second argument. The previous call put an unset `supplier` and an
  // undefined `type` in between, leaving `handler` undefined.
  check_event(event, ftp_to_server);
};
=================
// Decide whether an event is of the type we care about and, if so, invoke
// `handler` with the file name, its directory and the remote directory.
// Events of any other type are ignored.
function check_event(event, handler)
{
  if (event.type !== 'xxxxxx') {
    return;
  }

  var fileName = 'abc';
  var fileDir = 'abc';
  var remoteDir = 'abc';
  handler(fileName, fileDir, remoteDir);
}
// Look up the configured connection details and pass them, together with
// the file name, local directory and remote directory, on to do_ftp.
function ftp_to_server(file_to_process_name, file_to_process_dir, remote_dir) {
  var configuredConnections = conf.ftp.connections;

  do_ftp(
    configuredConnections,
    file_to_process_name,
    file_to_process_dir,
    remote_dir
  );
}
// Upload one local file to the remote directory over SFTP.
// connection_details   - connection settings passed in by ftp_to_server
//                        (not read anywhere in this excerpt)
// file_to_process_name - file name, used for both the local and remote path
// file_to_process_dir  - local directory, prepended to the name as-is
// remote_dir           - remote directory, prepended to the name as-is
function do_ftp(connection_details, file_to_process_name, file_to_process_dir, remote_dir) {
var credentials = {
// FTP settings here
};
// Paths are built by plain concatenation, so both directories are expected
// to already end with a path separator.
var local_file = file_to_process_dir + file_to_process_name;
var remote_file = remote_dir + file_to_process_name;
// NOTE(review): `connection` is assigned without var/let/const in this
// function, so every call writes to the same outer variable; overlapping
// uploads would overwrite each other's connection before the callback runs.
connection = new sftp(credentials, function(err) {
if (err){
throw err;
}
// The local file is read synchronously as UTF-8 text and then written to
// the remote path.
connection.writeFile(remote_file, fs.readFileSync(local_file, "utf8"), null, function(err) {
if (err) {
throw err;
}
console.info('FTP PUT DONE');
});
});
};

Your "connection = new sftp(credentials, function(err) {"
should be
var connection = new sftp(credentials, function(err) {
The way you currently have it coded, "connection" is a global and you are writing over it.

We Keep Coding

JavaScript is the programming language of the Web.

local PDF file scraping in node.js - javascript

Related

Watson Assistant context is not updated

Passing HTML input file as the parameter of Firebase Cloud Function

Create plugin gulp with stream

node.js never exits after insert to couchbase, opposite of most node questions

Node.js - Organising code and closures - SFTP/Inotify

Categories

Resources