I am building a web scraper to get all of user's submissions on codeforces.
I don't know much about async, await, promises.
I have used axios (promise based) to request codeforces and cheerio to parse HTML .
app.post("/", (req, res) => {
const usernameorhandle = req.body.userName;
getstatus(usernameorhandle).then ( ()=> {
var output = fs.createWriteStream(__dirname + '/Data/solutions.zip');
var archive = archiver('zip', {
zlib: { level: 9 } // Sets the compression level.
});
output.on('close', function() {
console.log(archive.pointer() + ' total bytes');
console.log('archiver has been finalized and the output file descriptor has closed.');
});
output.on('end', function() {
console.log('Data has been drained');
});
res.attachment(__dirname + "/Data/Problems", 'Codeforces-Solutions');
archive.pipe(res);
archive.directory(__dirname + "/Data/Problems", 'Codeforces-Solutions');
archive.finalize();
}) })
I am using to accept post request.
I am putting all the solutions into a folder and creating zip folder and then send to res.
Below is my getstatus function.
async function getstatus(handle){
return new Promise(async (resolve, reject)=> {
console.log("HELLLLLLLOOOOOOOO");
await axios.get("https://codeforces.com/api/user.status?handle=" + handle + "&from=1")
.then(response => {
if(response.data.status === 'OK'){
let results = response.data.result;
console.log("AAAAAAAAAAAAAAAAAAAAAAAa");
scrape(results).then( () =>{
console.log("DONE");
resolve();
})
.catch(err => console.log(err));
// resolve();
}
else console.log(submissions.comment);
})
})
}
I use scrape function to obtain HTML data and put to folder named Problems.
async function scrape (results){
console.log("inside scrape");
// console.log("HELLO");
return new Promise( async (resolve, reject) => {
await results.forEach(async (result)=> {
if(result.verdict === 'OK'){
await axios.get("https://codeforces.com/contest/" + result.contestId + "/submission/" + result.id)
.then(solutionPage => {
const $ = cheerio.load(solutionPage.data);
const path = "/home/srujan/Desktop/crawlerapp/Data/Problems/" + result.problem.name + ".cpp";
fs.writeFile(path, $('#program-source-text').text(), function(err){
if(err){
console.log(err);
}
else{
console.log("Saved file");
}
})
})
.catch( error => {
console.log("HTML PARSE ERROR" + error);
})
}
})
console.log("hey");
resolve();
})
The problem is I am getting
HELLLLLLLOOOOOOOO
AAAAAAAAAAAAAAAAAAAAAAAa
inside scrape
hey
DONE
saved file
saved file
...
Browser downloads after DONE and then files are saved.
I am new to js and don't know why I am getting this.
PS : I know this is very long question. I tried reading a lot about this. Didn't understand properly how to do that. I copy pasted some code which I didn't understand like how to zip a folder.
forEach(callback) executes callback. If callback returns a promise (ie, it's an async function), the promise won't be resolved before calling the callback on the next element of the array.
So, basically, you can't use async functions inside forEach... But you can use for-loops or Promise.all instead!
Also, fs.writeFile works with sync + callback, but there exists a fs.promise.writeFile that uses promises instead.
Here's a scrape function that should work better:
async function scrape(results) {
for (const result of results) {
if(result.verdict === 'OK') {
const solutionPage = await axios.get("https://codeforces.com/contest/" + result.contestId + "/submission/" + result.id);
const $ = cheerio.load(solutionPage.data);
const path = "/home/srujan/Desktop/crawlerapp/Data/Problems/" + result.problem.name + ".cpp";
try {
await fs.promises.writeFile(path, $('#program-source-text').text());
} catch(err) { console.log(err) }
}
}
}
The problem is to use result.forEach
Try to use a simple for(let i = 0; i < result.length; i++) without async.
If that doesn't work, try to return anything inside the then.
This is how I would construct getstatus function with await async
async function getstatus(handle) {
const response = await axios.get("https://codeforces.com/api/user.status?handle=" + handle + "&from=1")
if(response.data.status === 'OK') {
let results = response.data.result;
try {
await scrape(results);
console.log("DONE");
}
catch(error) {
}
}
}
and scrape function accordingly...
const fs = require('fs').promises;
async function scrape (results) {
results.forEach(async (result)=> {
if(result.verdict === 'OK') {
const solutionPage = await axios.get("https://codeforces.com/contest/" + result.contestId + "/submission/" + result.id)
const $ = cheerio.load(solutionPage.data);
const path = "/home/srujan/Desktop/crawlerapp/Data/Problems/" + result.problem.name + ".cpp";
try {
await fs.writeFile(path, $('#program-source-text').text())
console.log("Saved file");
}
catch(error) {
}
}
}
}
Related
I am new to node and stuck with this issue. Here' the file:
I am running 'startProcess' function and I want to run 'downloadFiles' and wait until it's completed and save the files before executing any code after it.
This code always ends up running 'runVideoUploadEngine' even before the download has been completed?
const downloadAndSaveFiles = async ({ url, dir }) => {
try {
https.get(url, (res) => {
// File will be stored at this path
console.log('dir: ', dir);
var filePath = fs.createWriteStream(dir);
res.pipe(filePath);
filePath.on('finish', () => {
filePath.close();
console.log('Download Completed');
});
});
return true;
} catch (e) {
console.log(e);
throw e;
}
};
const downloadFiles = async ({ data }) => {
try {
mediaUrl = data.mediaUrl;
thumbnailUrl = data.thumbnailUrl;
const mediaExt = path.extname(mediaUrl);
const thumbExt = path.extname(thumbnailUrl);
mediaDir = `${__dirname}/temp/${'media'}${mediaExt}`;
thumbDir = `${__dirname}/temp/${'thumb'}${thumbExt}`;
await downloadAndSaveFiles({ url: mediaUrl, dir: mediaDir });
await downloadAndSaveFiles({ url: thumbnailUrl, dir: thumbDir });
return { mediaDir, thumbDir };
} catch (e) {
console.log(e);
throw e;
}
};
module.exports = {
startProcess: async ({ message }) => {
//check if message is proper
data = JSON.parse(message.Body);
//download video and thumbnail and store in temp.
console.log('starting download..');
const { mediaDir, thumbDir } = await downloadFiles({ data });
console.log('dir:- ', mediaDir, thumbDir);
pageAccessToken =
'myRandomToken';
_pageId = 'myRandomPageID';
console.log('running engine');
await runVideoUploadEngine({ pageAccessToken, _pageId, mediaDir, thumbDir });
//start videoUploadEngine
//on success: delete video/thumbnail
},
};
What am I doing wrong?
downloadAndSaveFiles returns a promise (because the function is async) but that promise doesn't "wait" for https.get or fs.createWriteStream to finish, and therefore none of the code that calls downloadAndSaveFiles can properly "wait".
If you interact with callback APIs you cannot really use async/await. You have to create the promise manually. For example:
const downloadAndSaveFiles = ({ url, dir }) => {
return new Promise((resolve, reject) => {
// TODO: Error handling
https.get(url, (res) => {
// File will be stored at this path
console.log('dir: ', dir);
var filePath = fs.createWriteStream(dir);
filePath.on('finish', () => {
filePath.close();
console.log('Download Completed');
resolve(); // resolve promise once everything is done
});
res.pipe(filePath);
});
});
};
I'm tryng to upgrade this code for a better maintenance, this code uploads two images to a server, i know it's possible to get rid of those .catch, by applying async await functions, and try catch blocks, but it's pretty confusing for me, any help will be apreciated.
this._uploadService.makeFileRequest(Global.url + "/upload-image1/" + response.product._id, [], this.filesToUpload1, 'image')
.then((result: Product) => {
this.filesToUpload1 = null;
this._uploadService.makeFileRequest(Global.url + "/upload-image/" + response.product._id, [], this.filesToUpload, 'image')
.then((result: Product) => {
this.filesToUpload = null;
setTimeout( () => this._router.navigate(['/detail', this.saveProduct._id]), 800 );
})
.catch(err => {
console.log(err);
this._router.navigate(['/detail', this.saveProduct._id]);
})
})
.catch(err => {
console.log(err);
this._router.navigate(['/detail', this.saveProduct._id]);
})
I suggest using a pen and paper to draw a block diagram for the logic involved, i.e. which api gets called first, with what kind of data, then which api comes afterwards; also include any logical conditionals through branching.
After that, you should attempt to write something like
const aggregateFunction = async() => {
try {
const someResponse = await callFirstApi(); // return response
await callSecondApi(someResponse); // use the response of the first api for the second api
if (someConditional) {
await callThirdApi(); // response not returned (i.e. when not required)
}
} catch (error) { // catch all errors from all await if they're not within another try-catch
console.log(error);
}
}
This pattern should eliminate all then and catch blocks. If you need more specific error handling for calling say a specific api, wrap function call inside another try-catch block, but everything should still be within the outer try-catch so that all errors will be caught regardless.
this._uploadService.makeFileRequest = function(){
return new Promise(resolve => {
// do logic of file request
resolve(true);
})
}
var waitForTime = function() {
return new Promise(resolve => {
setTimeout( () => {
this._router.navigate(['/detail', this.saveProduct._id]),
resolve(true)
}, 800 );
})
}
var f = async function(){
try {
await this._uploadService.makeFileRequest(Global.url + "/upload-image1/" + response.product._id, [], this.filesToUpload1, 'image');
await this.fileToUpload1 = null;
await this._uploadService.makeFileRequest(Global.url + "/upload-image/" + response.product._id, [], this.filesToUpload, 'image')
await this.fileToUpload = null;
await waitForTime();
}
catch(e) {
// error logic
}
}
if (this.filesToUpload1 && this.filesToUpload) {
f()
}
this might be another cleaner approach with async,await and promise
In the 1st function 'get_files()' I can log the file_list variable, it is correct here, however when I log it again in my 2nd function 'get_diffs()' it is undefined..
// Get files
async function get_files() {
await fs.readdir(dirPath, function (err, files) {
(async () => {
if (await err) {
console.log("Error getting directory information.", err)
} else {
var file_list = []; // Reset
await files.forEach(function (file) {
file_list.push(file);
});
console.log('1st Call = ' + file_list); // Correct
return await file_list;
}
})();
});
}
// Get Diffs
async function get_diffs() {
const file_list = await get_files();
console.log('2nd Call = ' + file_list); // Undefined
const dates = await get_dates();
return await files.filter(x => !dates.includes(x));
}
You have misunderstood async/await. Learn the basics here
function get_files() {
return new Promise((resolve, reject) => {
fs.readdir(dirPath, function (err, files) {
if (err) {
reject(err);
} else {
var file_list = []; // Reset
files.forEach(function (file) {
file_list.push(file);
});
console.log('1st Call = ' + file_list); // Correct
resolve(file_list);
}
});
})
}
fs.readdir does not return a promise. Use the promise based function fs.promise.readdir instead.
async function get_diffs() {
const file_list = await fs.promise.readdir(dirPath);
// ...
}
So you don't really need the other function. It had many problems anyway. await makes not much sense when used with an expression that is not a promise. All the places where you have used await in get_files, the expression that follows it does not represent a promise.
I have an array of video info from a YouTube playlist. I'm trying to retrieve the audio of each video using npm module youtube-mp3-downloader.
I'm want to iterate over the array asynchronously, in sequence.
The array I'm iterating has a bunch of objects like the following one:
{
snippet: {
title: 'Video Title'
},
contentDetails: {
videoId: 'video-Id'
}
}
I'm running the following loop and it successfully runs the 1st iteration (downloads the audio file and resolves the promise). Then it enters the 2nd iteration, it does console.log("started iteration " + i) but stops right there, before entering await getFromYT()
async function getAudio(list) {
for (let i = 0; i < list.length; i++) {
console.log("started iteration " + i)
await getFromYT(list[i]).then(message =>
console.log(message + " " + i)
).catch(error =>
console.log(error)
)
}
}
here is what I get in the console:
started iteration 0
Finished 0
started iteration 1
And here is the function that is being awaited:
function getFromYT(item) {
return new Promise((resolve, reject) => {
YD.download(item.contentDetails.videoId, item.snippet.title + '.mp3');
YD.on("error", function (error) {
reject(error);
});
YD.on("finished", function (err, data) {
resolve("Finished");
});
})
}
I have been trying to figure this out for a while now and haven't gotten anywhere.
Ideally I would like to know what's wrong with the code that I have. Why doesn't it continue the loop?
However, if you can suggest a straight forward way to achieve what I'm trying to do that would be helpful too.
You are mixing async/await style and promises. the function which loops through should look like this
async function getAudio(list) {
for (let i = 0; i < list.length; i++) {
try {
console.log("started iteration " + i)
const message = await getFromYT(list[i]);
console.log(message + " " + i);
} catch(err) {
console.error(err);
}
}
}
if the operations can be done in parallel, consider using Promise.all with .map
Here's how I'd do the same using a combination of async/await and Promise.
const getFromYT = (item) => new Promise((resolve, reject) => {
YD.download(item.contentDetails.videoId, item.snippet.title + '.mp3');
YD.on("error", (error) => {
reject(new Error(error));
});
YD.on("finished", (err, data) => {
resolve({ id: item.id, data});
});
});
const getAudio = async (list) => {
const promises = list.map((listItem) => {
return getFromYT(listItem)
.catch((err) => console.log(err));
});
let data = null;
try {
data = await Promise.all(promises);
} catch (err) {
console.log(err);
}
return data;
};
So, what I've done here is simply instantiated all the promises together in a loop using the map function.
Then, I execute all the promises parallely using Promise.all and await for all the promises to resolve parallely.
I ensure that the promise chain in getAudio doesn't break by catching rejections at the individual getFromYT promises itself.
You can get the data and the corresponding item without any mismatch as follows :-
const allData = await getAudio(list);
allData.forEach((datum) => {
const { id, data } = datum;
console.log(id);
console.log(data);
});
Hope this helps!
Edit 1: So as #bergi said, I can't return / throw from event-emitters, I need to promisify them. Here's how!
I know the question is asked really often, and I might get downvoted for that. But I really struggle to understand how I can wait for a function to process data before returning a value.
I had a look on many popular posts (such as here), but I cannot achieve what I want.
Here is my code:
app.post("/video_url", function (req, res) {
videoProcessed = videoScraper(req.body.videoURL.videoURL);
res.send(videoProcessed);
});
It does not wait for this function to process the data:
function videoScraper(url) {
console.log("URL to Scraper: " + url);
const options = {
uri: `${url}`,
transform: function(body) {
return cheerio.load(body);
}
};
var videoProcessed;
rp(options)
.then(($) => {
videoProcessed = $("body").find("iframe").attr("src");
return videoProcessed;
})
.catch((err) => {
console.log(err);
});
}
I tried using callbacks but it gets really messy, and I don't know were to put the promise (if any) in my code.
Add await and async (if you have node 8+):
app.post("/video_url", async function (req, res) {
const videoProcessed = await videoScraper(req.body.videoURL.videoURL);
res.send(videoProcessed);
});
And in your videoScraper function, you need to return rp! :
function videoScraper(url) {
console.log("URL to Scraper: " + url);
const options = {
uri: `${url}`,
transform: function(body) {
return cheerio.load(body);
}
};
return rp(options)
.then($ => $("body").find("iframe").attr("src"))
.catch((err) => {
console.error(err);
});
}
That would depend on the videoScrapper working fine, I've no idea what rp is so I can't tell.
Don't forget to handle videoProcessed === undefined (error case) in the first code snippet. It can also be abstracted using express-promise-router that will even catch async errors... That's further down the road.
Don't hesitate to read up on await & async, it's really wonderful to write asynchronous code in the same manner as synchronous code.
Use async/await
app.post("/video_url", async (req, res)=> {
try{
let videoProcessed = await videoScraper(req.body.videoURL.videoURL);
res.send(videoProcessed);
}
catch(ex){
// handle the exception
}
});
const videoScraper = async fuction(url) {
console.log("URL to Scraper: " + url);
let options = {
uri: `${url}`,
transform: function(body) {
return cheerio.load(body);
}
};
try{
let temp = await rp(options);
let videoProcessed = $("body").find("iframe").attr("src");// you can return it directly
return videoProcessed;
}
catch(ex){
// handle the exception
}
}
if you your node is < 8 then use promises (bluebird module)
const bluebird = require('bluebird');
function videoScraper(url){
return new bluebird(function(resolve,reject){
let options = {
uri: `${url}`,
transform: function(body) {
return cheerio.load(body);
}
};
rp(options)
.then(($)=>{
resolve($("body").find("iframe").attr("src"));
})
.catch(err=>{
return err;
})
})
}
app.post("/video_url", (req, res)=> {
videoScraper(req.body.videoURL.videoURL)
.then(result=>{
res.send(result)
})
.catch(err=>{
// handle the error
})
});
Do not use const for variable declaration unless its value is constant, and usually use let instead of var
You can try the following:
app.post("/video_url", function (req, res) {
videoScraper(req.body.videoURL.videoURL)
.then(videoProcessed => {
res.send(videoProcessed);
})
.catch(err => {
// render proper response with error message
})
});
And change the function to something as below, so as to return a promise from the same:
function videoScraper(url) {
console.log("URL to Scraper: " + url);
const options = {
uri: `${url}`,
transform: function(body) {
return cheerio.load(body);
}
};
return rp(options)
.then(($) => {
videoProcessed = $("body").find("iframe").attr("src");
return videoProcessed;
});
}