I'm using pdf.js-extract to read data from a PDF file at a URL. This is my code, and it runs well:
const fs = require('fs');
const PDFExtract = require('pdf.js-extract').PDFExtract;
const pdfExtract = new PDFExtract();
const https = require('https');
const url = 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf'
/**
 * Downloads the PDF at `url` into the temporary file `./dummy.pdf`, extracts
 * its contents with pdf.js-extract and logs the extracted data.
 *
 * @param {string} url - Address of the PDF; fetched with `https.get`.
 * @returns {Promise<void>} Resolves immediately; the download and the
 *   extraction continue in the background and report failures on stderr.
 */
const readData = async (url) => {
  https
    .get(url, (response) => {
      const file = fs.createWriteStream('./dummy.pdf');
      response.pipe(file);
      file.on("finish", () => {
        pdfExtract
          .extract('./dummy.pdf', {})
          .then((data) => {
            console.log(data);
          })
          .catch((err) => {
            console.error(err);
          })
          // Delete the temporary file only after extraction has settled;
          // unlinking it right after starting extract() races the read.
          .finally(() => {
            fs.unlink('./dummy.pdf', (err) => {
              if (err) {
                console.error(err);
              }
            });
          });
      });
    })
    .on("error", (err) => {
      // Connection-level failures would otherwise surface as an unhandled 'error' event.
      console.error(err);
    });
};
readData(url)
But I have two problems with this code.
The first is how to handle the case where the URL changes from https
to http. I tried node-fetch, but it doesn't seem to work as I expected.
The second is that the code takes too long when the PDF
file is large: about 5 seconds for a 2 MB file. I wonder if there is a
faster way, such as reading the file from a buffer,
without having to save it as a temporary file.
Thanks for your attention
Related
I'm using the azure file storage, and using express JS to write a backend to render the contents stored in the azure file storage.
I am writing the code based on https://learn.microsoft.com/en-us/javascript/api/@azure/storage-file-share/shareserviceclient?view=azure-node-latest
// Scoped npm packages start with "@"; a leading "#" would be treated by Node
// as a package-internal subpath import and fail to resolve.
const { ShareServiceClient, StorageSharedKeyCredential } = require("@azure/storage-file-share");

// Placeholders: fill in the storage account name and its access key.
const account = "<account>";
const accountKey = "<accountkey>";

// Shared-key credential used to authenticate the service client below.
const credential = new StorageSharedKeyCredential(account, accountKey);
const serviceClient = new ShareServiceClient(
  `https://${account}.file.core.windows.net`,
  credential
);

// Placeholders: the share and the file to look up.
const shareName = "<share name>";
const fileName = "<file name>";
/**
 * [Node.js only] Collects everything a readable stream emits into one Buffer.
 *
 * @param {NodeJS.ReadableStream} readableStream - Stream to drain.
 * @returns {Promise<Buffer>} Resolves with the concatenated data once the
 *   stream ends; rejects with the stream's error if it emits one.
 */
async function streamToBuffer(readableStream) {
  const pieces = [];
  // Non-Buffer chunks (for example strings) are converted before being stored.
  const asBuffer = (piece) => (Buffer.isBuffer(piece) ? piece : Buffer.from(piece));

  return new Promise((resolve, reject) => {
    const onData = (piece) => {
      pieces.push(asBuffer(piece));
    };
    const onEnd = () => {
      resolve(Buffer.concat(pieces));
    };

    readableStream.on("data", onData);
    readableStream.on("end", onEnd);
    readableStream.on("error", reject);
  });
}
And you can view the contents through
// Download the file, then read its whole body into a Buffer and decode it as text.
const downloadFileResponse = await fileClient.download();
// The await must be parenthesised so that toString() is called on the Buffer, not on the Promise.
const output = (await streamToBuffer(downloadFileResponse.readableStreamBody)).toString();
The thing is, I only want to find out whether the file exists, without spending time downloading the entire file. How could I do this?
I looked at https://learn.microsoft.com/en-us/javascript/api/@azure/storage-file-share/shareserviceclient?view=azure-node-latest
to see if the file client class has what I want, but it doesn't seem to have any methods useful for this.
If you are using the @azure/storage-file-share (version 12.x) Node package, there's an exists method in ShareFileClient. You can use that to find out whether a file exists. Something like:
// Ask the client whether the file exists; the awaited value is a boolean.
const fileExists = await fileClient.exists(); // resolves to true or false
I am trying to download a file to my Windows PC using Node.js.
I tried the following code. The problem is that the file I download with Node.js is 185 KB, while the original file is 113 KB (found by downloading it directly from the browser).
request = require('request');
/**
 * Downloads `url` to the file `dest` using the `request` library.
 *
 * @param {string} url - Address of the resource to download.
 * @param {string} dest - Path of the file to write.
 * @param {function(Error=): void} cb - Called once: without arguments when the
 *   file has been written and closed, or with the error that stopped the download.
 */
function download(url, dest, cb) {
  let finished = false;
  // Several error sources can fire for one failure; report only the first outcome.
  const finish = function (err) {
    if (finished) {
      return;
    }
    finished = true;
    if (err) {
      cb(err);
    } else {
      cb();
    }
  };

  request.head(url, function (err) {
    if (err) {
      finish(err);
      return;
    }
    request(url)
      .on('error', finish)
      .pipe(fs.createWriteStream(dest))
      .on('error', finish)
      .on('close', function () {
        finish();
      });
  });
}
I also tried downloading the file using a different code
/**
 * Downloads `url` over HTTPS into the file `dest`.
 *
 * @param {string} url - HTTPS address of the resource to download.
 * @param {string} dest - Path of the file to write.
 * @param {function(Error=): void} cb - Called once: without arguments after
 *   the file has been flushed and closed, or with the error that occurred.
 */
function download(url, dest, cb) {
  const file = fs.createWriteStream(dest);
  let finished = false;
  // Report only the first outcome, whichever event delivers it.
  const finish = function (err) {
    if (finished) {
      return;
    }
    finished = true;
    if (err) {
      file.destroy();
      cb(err);
    } else {
      cb();
    }
  };

  file.on('error', finish);
  file.on('finish', function () {
    // 'finish' already means end() has completed, so only close() is needed here.
    file.close(finish);
  });

  https
    .get(url, function (response) {
      // A redirect or error page would otherwise be saved as if it were the file.
      if (response.statusCode < 200 || response.statusCode >= 300) {
        response.resume();
        finish(new Error(`Download failed with HTTP status ${response.statusCode}`));
        return;
      }
      response.on('error', finish);
      response.pipe(file);
    })
    .on('error', finish);
}
But the same bug happened
The problem is that I am trying to open the file in Photoshop, but it fails, saying the file is corrupted. Please help.
This code (using the built-in https module) should work correctly. The stream will close automatically, there's no need to close it, the autoClose parameter defaults to true when creating a write file stream.
See docs at: fs.createWriteStream.
If the file is still too large it is likely that you are not using the direct image link, try selecting "View image" / "Open image in new tab" etc. in your browser and using that link instead.
const https = require("https");
const fs = require("fs");
// Direct link to the image file itself (not to a page that embeds it).
const url = "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e1/FullMoon2010.jpg/1024px-FullMoon2010.jpg";
const fileStream = fs.createWriteStream("test.jpg");
// pipe() ends the write stream when the response ends, and the stream then
// closes itself (autoClose defaults to true), so no explicit close is needed.
https
  .get(url, (response) => {
    response.pipe(fileStream);
  })
  .on("error", (err) => {
    // Without a listener, a connection failure is thrown as an unhandled 'error' event.
    fileStream.destroy();
    console.error(err);
  });
You can also use the request library:
const request = require("request");
const fs = require("fs");
// Direct link to the image that is downloaded.
const url = "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e1/FullMoon2010.jpg/1024px-FullMoon2010.jpg";
const fileStream = fs.createWriteStream("request-test.jpg");
const req = request(url);

// Forward the response body into the file as soon as the response arrives.
const saveBody = (incoming) => {
  incoming.pipe(fileStream);
};
req.on("response", saveBody);
I want to import data from a data.json file into a neural network (which uses the Brain.js framework). Here is the part that is supposed to bring that data into the network and analyse it:
// A bare specifier such as 'data.js' is looked up as an installed package;
// the "./" prefix makes Node load the data.json file next to this module.
const result = brain.likely(require('./data.json'), net);
// Node has no alert(); print the result to the console instead.
console.log(`This is the result: ${result}`);
And get that data analysed by the neural network and shown to the user.
Here are the contents of the data.json file for reference:
{
"Rating1": 0.12434213,
"Rating2": 0.987653236,
"Rating3": 0.432543654
}
For your information, this is written for the Node.js environment.
Assuming your data.json file is in the same directory:
// Load data.json relative to the current page and feed it to the network.
fetch('data.json')
  .then((response) => {
    // fetch() only rejects on network failure; HTTP errors must be checked explicitly.
    if (!response.ok) {
      throw new Error(`Failed to load data.json: HTTP ${response.status}`);
    }
    return response.json();
  })
  .then((json) => {
    const result = brain.likely(json, net);
  })
  .catch((err) => {
    console.error(err);
  });
Alternatively, with async/await:
// Same as the promise chain, written with async/await inside an immediately invoked function.
(async () => {
  try {
    const response = await fetch('data.json');
    const json = await response.json();
    const result = brain.likely(json, net);
  } catch (err) {
    // Nothing awaits this function, so failures have to be reported here.
    console.error(err);
  }
})();
If done through a file upload:
// target input element
const input = document.querySelector('input');
// upload event
input.addEventListener('change', () => {
  // Arrow functions do not bind `this` to the element, so read the files from `input` directly.
  const file = input.files[0];
  if (file === undefined) {
    // The selection was cleared; there is nothing to read.
    return;
  }
  const reader = new FileReader();
  reader.addEventListener('load', (e) => {
    const json = JSON.parse(e.target.result);
    const result = brain.likely(json, net);
  });
  reader.addEventListener('error', () => {
    console.error(reader.error);
  });
  reader.readAsText(file);
});
If done through Node:
// Load data.json from the directory of this module.
const json = require('./data.json');
// NOTE(review): the value returned by brain.likely is not stored here, unlike the other variants.
brain.likely(json, net);
Useful resources for handling files:
Using files from web apps - practical examples on how to use the FileReader API
Fetch API - how to use files already on your server in the browser
Node's File System readFileSync method - to read file contents synchronously in a Node environment
JSON.parse - native JS method to convert a JSON string into a JavaScript value, useful in all environments
I'm trying to do this with just pure Javascript and the SDK. I am not using Node.js. I'm converting my application from v2 to v10 of the SDK azure-storage-js-v10
The azure-storage.blob.js bundled file is compatible with UMD
standard, if no module system is found, following global variable
will be exported: azblob
My code is here:
// Service URL built from the account host name with the SAS string appended;
// `account`, `accountSas` and `pipeline` are defined outside this snippet.
const serviceURL = new azblob.ServiceURL(`https://${account}.blob.core.windows.net${accountSas}`, pipeline);
const containerName = "container";
// Derive the container URL from the service URL, then the blob URL from the container URL.
const containerURL = azblob.ContainerURL.fromServiceURL(serviceURL, containerName);
const blobURL = azblob.BlobURL.fromContainerURL(containerURL, blobName);
// NOTE(review): the second argument (0) is presumably the start offset — confirm against the SDK docs.
const downloadBlobResponse = await blobURL.download(azblob.Aborter.none, 0);
The downloadBlobResponse looks like this:
downloadBlobResponse
Using v10, how can I convert the downloadBlobResponse into a new blob so it can be used in the FileSaver saveAs() function?
In azure-storage-js-v2 this code worked on smaller files:
// Pieces of the blob, collected in arrival order.
const blobParts = [];
const readStream = blobService.createReadStream(containerName, blobName, (err, res) => {
  // The callback parameter is `err`; testing an undeclared `error` would throw.
  if (err) {
    // Handle read blob error
  }
});
// Use event listener to receive data
readStream.on('data', (data) => {
  // Uint8Array retrieved
  // Copy each chunk into its own Blob right away so later chunks cannot overwrite it
  blobParts.push(new Blob([new Uint8Array(data)]));
});
// Save once, after the last chunk: saving inside the 'data' handler writes one
// file per chunk, which only works for blobs small enough to arrive in one piece.
// NOTE(review): assumes the v2 read stream emits 'end' like a Node readable — confirm.
readStream.on('end', () => {
  // Convert the collected chunks back into a single blob
  const newBlob = new Blob(blobParts);
  // Saves file to the user's downloads directory
  saveAs(newBlob, blobName); // FileSaver.js
});
I've tried everything to get v10 working, any help would be greatly appreciated.
Thanks,
You need to get the body by awaiting blobBody.
// Declared with const so the assignment does not create an implicit global.
const downloadBlobResponse = await blobURL.download(azblob.Aborter.none, 0);
// data is a browser Blob type
const data = await downloadBlobResponse.blobBody;
Thanks, Mike Coop and Xiaoning Liu!
I was busy making a Vue.js plugin to download blobs from a storage account. Thanks to you, I was able to make this work.
const FileSaver = require('file-saver');
// Scoped npm packages start with "@", not "#".
const { BlobServiceClient } = require("@azure/storage-blob");
const downloadButton = document.getElementById("download-button");

/**
 * Downloads every blob selected in `fileList` and hands each one to FileSaver.
 * Progress and failures are shown through `reportStatus`; `fileList`,
 * `reportStatus` and `listFiles` are defined elsewhere in the application.
 *
 * @returns {Promise<void>}
 */
const downloadFiles = async () => {
  try {
    if (fileList.selectedOptions.length === 0) {
      reportStatus("No files selected.");
      return;
    }
    reportStatus("Downloading files...");

    // The clients do not depend on the selected blob, so build them once.
    const account = '<account name>';
    const sas = '<blob sas token>';
    const containerName = '< container name>';
    const blobServiceClient = new BlobServiceClient(`https://${account}.blob.core.windows.net${sas}`);
    const containerClient = blobServiceClient.getContainerClient(containerName);

    // selectedOptions is a synchronous collection; a plain for...of is enough.
    for (const option of fileList.selectedOptions) {
      const blobName = option.text;
      const blobClient = containerClient.getBlobClient(blobName);
      // download() takes (offset, count, options); start at byte 0 and read to the end.
      const downloadBlockBlobResponse = await blobClient.download(0);
      const data = await downloadBlockBlobResponse.blobBody;
      // Saves file to the user's downloads directory
      FileSaver.saveAs(data, blobName); // FileSaver.js
    }
    reportStatus("Done.");
    listFiles();
  } catch (error) {
    reportStatus(error.message);
  }
};
downloadButton.addEventListener("click", downloadFiles);
Thanks Xiaoning Liu!
I'm still learning about async javascript functions and promises. Guess I was just missing another "await". I saw that "downloadBlobResponse.blobBody" was a promise and also a blob type, but, I couldn't figure out why it wouldn't convert to a new blob. I kept getting the "Iterator getter is not callable" error.
Here's my final working solution:
// Create a BlobURL
const blobURL = azblob.BlobURL.fromContainerURL(containerURL, blobName);
// Download blob (declared with const so no implicit global is created)
const downloadBlobResponse = await blobURL.download(azblob.Aborter.none, 0);
// In browsers, get downloaded data by awaiting downloadBlobResponse.blobBody
const data = await downloadBlobResponse.blobBody;
// Saves file to the user's downloads directory
saveAs(data, blobName); // FileSaver.js
I need to get a lot of images from a few websites and download them to my disk so that I can use them (will upload them to a blob (azure) and then save the link to my DB).
GETTING THE IMAGES
I know how to get the images from the html with JS, for example one of them I would make a for-loop and do:
document.getElementsByClassName('person')[i].querySelector('div').querySelector('img').getAttribute('src')
And there I would have the links to all the images.
SAVING THE IMAGES
I also saw that I can save the files to disk using node and the fs module, by doing:
/**
 * Downloads the image at `url` and writes it to `localPath`.
 * Failures are logged to stderr; nothing is returned.
 *
 * @param {string} url - Address of the image (http or https).
 * @param {string} localPath - Path of the file to write.
 */
function saveImageToDisk(url, localPath) {
  // https.get throws on "http:" URLs, so use the module that matches the protocol.
  const client = String(url).startsWith('http:') ? require('http') : https;
  const file = fs.createWriteStream(localPath);
  file.on('error', (err) => {
    console.error(err);
  });
  client
    .get(url, (response) => {
      response.pipe(file);
    })
    .on('error', (err) => {
      // Release the file handle when the request itself fails.
      file.destroy();
      console.error(err);
    });
}
HOW TO PUT IT ALL TOGETHER
This is where I am stuck: I don't know exactly how to connect the two parts (the script and the Node.js code). I want to get the image and also the image name (the alt attribute in this case) and then use them in Node to upload the image to a blob and put the name and image blob URL in my DB.
I thought I could download the html page and then put the JS script on the bottom of the body but then I don't know how to pass the url to the nodejs code.
How can I do this?
I am not very used to using scripts, I mostly used node without them and I get a bit confused by their interactions and how to connect js scripts to my code.
Also is this the best way to go about this or is there a simpler/better way I am not seeing?
This feels like you should use a crawler. The following code should work (using the npm module crawler):
const Crawler = require("crawler");

const c = new Crawler({
  /**
   * Runs for every crawled page and logs the src/alt of each matching image.
   *
   * @param {Error|null} error - Set when the page could not be fetched.
   * @param {object} res - Crawler response; `res.$` queries the page with a CSS selector.
   * @param {function(): void} done - Must be called when processing is finished.
   */
  callback(error, res, done) {
    if (error) {
      console.log({ error });
    } else {
      const images = res.$('.person div img');
      images.each((index) => {
        // here you can save the file or save them in an array to download them later
        console.log({
          src: images[index].attribs.src,
          alt: images[index].attribs.alt,
        });
      });
    }
    // Signal completion so the crawler can release this slot and move on.
    done();
  },
});

c.queue('https://www.yoursite.com');
You need a bridge between Web API (for DOM parsing etc) and Node.js API. For example, some headless browser managing tool for Node.js. Say, you can use puppeteer with this script:
'use strict';
const puppeteer = require('puppeteer');
const https = require('https');
const fs = require('fs');

/**
 * Opens the page in a headless browser, collects the URLs of the selected
 * images, then downloads each one with https into `<index>.<ext>`.
 */
(async function main() {
  try {
    const browser = await puppeteer.launch();
    let imgURLs;
    try {
      const [page] = await browser.pages();
      await page.goto('https://en.wikipedia.org/wiki/Image');
      imgURLs = await page.evaluate(() =>
        Array.from(
          document.querySelectorAll('#mw-content-text img.thumbimage'),
          ({ src }) => src,
        )
      );
      console.log(imgURLs);
    } finally {
      // Close the browser even when navigation or evaluation fails,
      // otherwise the browser process keeps Node alive.
      await browser.close();
    }
    imgURLs.forEach((imgURL, i) => {
      https
        .get(imgURL, (response) => {
          // The file is named after the image's position; the extension is the last three characters of the URL.
          response.pipe(fs.createWriteStream(`${i}.${imgURL.slice(-3)}`));
        })
        .on('error', (err) => {
          console.error(err);
        });
    });
  } catch (err) {
    console.error(err);
  }
})();
You can even download images just once, using pictures already downloaded by the browser. This script saves identical images, but with one session of requests, without using https Node.js module (this saves time, network traffic and server workload):
'use strict';
const puppeteer = require('puppeteer');
const fs = require('fs');

/**
 * Opens the page in a headless browser, records every image response the
 * browser receives, and writes the selected images to `<index>.<ext>` from
 * those recorded responses, without requesting them a second time.
 */
(async function main() {
  try {
    const browser = await puppeteer.launch();
    try {
      const [page] = await browser.pages();
      // Image responses keyed by URL; a Map avoids clashes with Object.prototype keys.
      const allImgResponses = new Map();
      page.on('response', (response) => {
        if (response.request().resourceType() === 'image') {
          allImgResponses.set(response.url(), response);
        }
      });
      await page.goto('https://en.wikipedia.org/wiki/Image');
      const selectedImgURLs = await page.evaluate(() =>
        Array.from(
          document.querySelectorAll('#mw-content-text img.thumbimage'),
          ({ src }) => src,
        )
      );
      console.log(selectedImgURLs);
      for (const [i, imgURL] of selectedImgURLs.entries()) {
        const imgResponse = allImgResponses.get(imgURL);
        // An image whose response was never recorded is skipped instead of aborting the rest.
        if (imgResponse === undefined) {
          console.error(`No captured response for ${imgURL}`);
          continue;
        }
        fs.writeFileSync(
          `${i}.${imgURL.slice(-3)}`,
          await imgResponse.buffer(),
        );
      }
    } finally {
      // Close the browser on failure too, so the process can exit.
      await browser.close();
    }
  } catch (err) {
    console.error(err);
  }
})();
I recommend you to use the dom-parser module. See here: https://www.npmjs.com/package/dom-parser
By doing so, you can download the whole html-File with http.get() and parse it using the dom-parser. Then extract all the information you need from the HTML-File. With the Image URL, use your saveImageToDisk() function.
Following your idea, you have to add the JS script to the html-File as you mentioned. But in addition you have to use Ajax (xmlHttpRequest) to post the URL to a nodeJS-Server.
You can use a Promise and, inside it, get all the images and put the image URLs in an array. Then, inside the then method, you can either iterate over the array and call saveImageToDisk each time, or send the array to the middle layer with a slight modification. The second option is better since it makes only one network call.
/**
 * Collects the `src` attribute of every <img> inside the first <div> of the
 * first element with class "person".
 *
 * @returns {Promise<Array<string|null>>} Resolves with the list of image URLs;
 *   rejects if the expected elements are not present in the document.
 */
async function getImages() {
  // An async function already returns a promise and turns a thrown error
  // into a rejection, so no explicit Promise wrapper is needed.
  const container = document.getElementsByClassName('person')[0].querySelector('div');
  return Array.from(container.querySelectorAll('img'), (item) => item.getAttribute('src'));
}
getImages()
  .then((d) => {
    // it will work only after the promise is resolved
    console.log('****', d);
    // Visit every collected URL; a bare arrow function here would never be invoked.
    d.forEach((item) => {
      // call saveImageToDisk function
    });
  })
  .catch((err) => {
    console.error(err);
  });
/**
 * Downloads the image at `url` and writes it to `localPath`.
 * Failures are logged to stderr; nothing is returned.
 *
 * @param {string} url - Address of the image (http or https).
 * @param {string} localPath - Path of the file to write.
 */
function saveImageToDisk(url, localPath) {
  // https.get throws on "http:" URLs, so use the module that matches the protocol.
  const client = String(url).startsWith('http:') ? require('http') : https;
  const file = fs.createWriteStream(localPath);
  file.on('error', (err) => {
    console.error(err);
  });
  client
    .get(url, (response) => {
      response.pipe(file);
    })
    .on('error', (err) => {
      // Release the file handle when the request itself fails.
      file.destroy();
      console.error(err);
    });
}
<div class='person'>
<div>
<img src='https://www.fast-growing-trees.com/images/P/Leyland-Cypress-450-MAIN.jpg'>
<img src='http://cdn.shopify.com/s/files/1/2473/3486/products/Cypress_Leyland_2_Horticopia_d1b5b63a-8bf7-4897-96fb-05320bf3d81b_grande.jpg?v=1532991076'>
<img src='https://www.fast-growing-trees.com/images/P/Live-Oak-Tree-450w.jpg'>
<img src='https://www.greatgardenplants.com/images/uploads/452_1262_popup.jpg'>
<img src='https://shop.arborday.org/data/default/images/catalog/600/Turnkey/1/Leyland-Cypress_3-828.jpg'>
<img src='https://images-na.ssl-images-amazon.com/images/I/51RZkKnrlSL._SX425_.jpg'>
<img src='https://thumbs-prod.si-cdn.com/Z3JYiuJ96ReLq04NCT1B94sTd4E=/800x600/filters:no_upscale()/https://public-media.si-cdn.com/filer/06/9c/069cfb16-c46c-4742-85f0-3c7e45fa139d/mar2018_a05_talkingtrees.jpg'>
</div>