Resolving async function after a loop end - javascript

I expect when I call an async function to resolve promise at the end, not before.
const urls = await uploadImages({ imageChanges, questions });
// ...next step
// I will use urls
But after calling await uploadImages() it continues to run until const data = await fetch(image.src);
And then ...next step starts. How can I make it wait for imageChanges.forEach loop finish ? Should I create another nested function inside ?
const uploadImages = async ({ imageChanges, questions }) => {
if (!imageChanges.length) return null;
const storage = firebase.storage();
let urls;
try {
//** convert each new image's src from blob to downloadUrl. */
imageChanges.forEach(async image => {
const questionId = questions.findIndex(q => q.id === image.questionId);
const imagePath = `${questionId}.jpg`;
const storageRef = storage.ref(imagePath);
// **
const data = await fetch(image.src);
const blob = await data.blob();
const uploadTaskSnapshot = await storageRef.put(blob);
const downloadURL = await uploadTaskSnapshot.ref.getDownloadURL();
urls.push(downloadURL)
});
return urls;
} catch (error) {
console.log(error.message);
}
};

forEach with async doesn't work as expected. Read this answer for more info.
Try like this
const uploadImages = async ({ imageChanges, questions }) => {
if (!imageChanges.length) return null;
const storage = firebase.storage();
try {
const imageChangesUrlPromise = imageChanges.map(async () => {
const questionId = questions.findIndex(q => q.id === image.questionId);
const imagePath = `${questionId}.jpg`;
const storageRef = storage.ref(imagePath);
const data = await fetch(image.src);
const blob = await data.blob();
const uploadTaskSnapshot = await storageRef.put(blob);
const downloadURL = await uploadTaskSnapshot.ref.getDownloadURL();
return downloadURL;
})
return await Promise.all(imageChangesUrlPromise);
} catch (error) {
console.log(error.message);
}
};
and then
const urls = await uploadImages({ imageChanges, questions });
...

JavaScript does this because forEach is not promise-aware. It cannot support async and await. You cannot use await in forEach.
If you use await in a map, map will always return an array of promises. This is because asynchronous functions always return promises.
By littile modification to your code, this should work,
const uploadImages = async ({ imageChanges, questions }) => {
if (!imageChanges.length) return null;
const storage = firebase.storage();
let urls;
try {
//** convert each new image's src from blob to downloadUrl. */
await Promise.all(imageChanges.map(async image => {
const questionId = questions.findIndex(q => q.id === image.questionId);
const imagePath = `${questionId}.jpg`;
const storageRef = storage.ref(imagePath);
// **
const data = await fetch(image.src);
const blob = await data.blob();
const uploadTaskSnapshot = await storageRef.put(blob);
const downloadURL = await uploadTaskSnapshot.ref.getDownloadURL();
urls.push(downloadURL)
}));
return urls;
} catch (error) {
console.log(error.message);
}
};
const urls = await uploadImages({ imageChanges, questions });

Related

Js puppeteer cant find elements after scroll

I am making this scraper to collect post from public Facebook pages. My problem is when I turn the scraper up to collect more than like 10 post, it's unable to scrape elements after scrolling way down. So the way it works is the scraper goes to a public page then scrolls the entire feed until it grabs the IDs for the number of post you want to collect. After collecting all the IDs it will then go to every post and collect what specific info like comments, shares, reactions etc. But when I'm far down the feed it can't find the post higher up the feed by ID, even tho when in Chrome dev tools the selector works in puppeteer its undefined. So my question is why does the scroll location affect puppeteer being able to read the dom. And if there is a better way for me to collect this information.
-Sidenote: this scraper is expected to grab thosands of post
Here is my code so far
const { scrollPageToBottom } = require('puppeteer-autoscroll-down')
const puppeteer = require('puppeteer');
const prompt = require('prompt-sync')();
const ObjectsToCsv = require('objects-to-csv');
(async () => {
const fbPage = prompt('What FaceBook Page?');
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null,
args: ['--start-maximized']
});
const page = await browser.newPage();
await page.goto(`https://www.facebook.com/OfficialMensHumor/`, {waitUntil : 'networkidle2' }).catch(e => void 0);
await scrapeArticles(page)
await browser.close();
})();
async function autoScroll(page){
await page.evaluate(async () => {
await new Promise((resolve, reject) => {
var totalHeight = 0;
var distance = 100;
var timer = setInterval(() => {
var scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
if(totalHeight >= scrollHeight - window.innerHeight){
clearInterval(timer);
resolve();
}
}, 1000);
});
});
}
async function getText(spans){
for (const ele of spans){
// const text = ele.getProperty('innerText')
const text = await (await ele.getProperty('innerText')).jsonValue()
console.log(text)
}
}
async function scrapeArticles(
page,
// extractItems,
postCount=100,
scrollDelay = 800,
) {
let post = [];
try {
let previousHeight;
while (post.length < postCount) {
const content = await page.$('div[role="main"] > div.k4urcfbm')
post = await content.evaluate(()=>{
const postDivs = Array.from(document.querySelectorAll('div.du4w35lb.l9j0dhe7 div[class=lzcic4wl][role="article"]'))
return postDivs.map(post=>({id:post.getAttribute('aria-posinset')}))
})
console.log(post)
let isLoadingAvailable = true
await scrollPageToBottom(page, { size: 500 , delay: 250})
}
console.log(1)
await getPostUrls(page, post)
await getComments(page, post)
await getShares(page, post)
await getReactions(page, post)
await getPostImg(page, post)
await getTime(page, post)
console.log(post)
saveToFile(post)
} catch(e) {
console.log(e)
}
// return items;
}
const getComments = async (page, articleNums) =>{
for (const obj of articleNums){
for(const key in obj){
if(key == 'id'){
const article = await page.$(`div[aria-posinset="${obj[key]}"]`)
// const handle = await article.waitForFunction('document.querySelector("span.d2edcug0.hpfvmrgz.qv66sw1b.c1et5uql.lr9zc1uh.a8c37x1j.fe6kdd0r.mau55g9w.c8b282yb.keod5gw0.nxhoafnm.aigsh9s9.d3f4x2em.iv3no6db.jq4qci2q.a3bd9o3v.b1v8xokw.m9osqain").innerText')
// const handle = await article.waitForXPath("//span[contains(text(), 'Comments')]", {visible: true})
const handle = await article.waitForSelector('div[aria-posinset="1"] div.gtad4xkn')
// Comment String
const commentNum = await (await handle.getProperty('innerText')).jsonValue()
obj['commentsNum'] = commentNum
}
}
}
// console.log(articleNums)
}
const getShares = async (page, articleNums) => {
for (const obj of articleNums){
for(const key in obj){
if(key == 'id'){
const article = await page.$(`div[aria-posinset="${obj[key]}"]`)
const handle = await article.waitForXPath("//span[contains(text(), 'Shares')]", {visible: true})
// Share String
const shareNum = await (await handle[0].getProperty('innerText')).jsonValue()
obj['sharesNum'] = shareNum
}
}
}
// console.log(articleNums)
}
const getReactions = async (page, articleNums) =>{
for (const obj of articleNums){
for(const key in obj){
if(key == 'id'){
const article = await page.$(`div[aria-posinset="${obj[key]}"]`)
const handle = await article.$('span[aria-label="See who reacted to this"] + span[aria-hidden="true"]')
// Share String
const reactionsNum = await (await handle.getProperty('innerText')).jsonValue()
obj['reactionsNum'] = reactionsNum
}
}
}
// console.log(articleNums)
}
const getPostImg = async (page, articleNums)=>{
for (const obj of articleNums){
for(const key in obj){
if(key == 'id'){
const article = await page.$(`div[aria-posinset="${obj[key]}"]`)
const imgDiv = await article.$('div[class="pmk7jnqg kr520xx4"]')
const handle = await imgDiv.$('img[alt]')
// Share String
const imgUrl = await (await handle.getProperty('src')).jsonValue()
obj['imgUrl'] = imgUrl
}
}
}
// console.log(articleNums)
}
// And timestamp
const getTime = async (page, articleNums)=>{
for (const obj of articleNums){
for (const key in obj){
if(key == 'postUrl'){
await page.goto(obj[key])
const timeStamp = await page.$eval('abbr[data-shorten]', abbr=>abbr.dataset.tooltipContent)
obj['timestamp'] = timeStamp
}
}
}
}
const getPostUrls = async (page, articleNums)=>{
for (const obj of articleNums){
for(const key in obj){
if(key == 'id'){
const article = await page.$(`div[aria-posinset="${obj[key]}"]`)
const postURLHandle = await article.$('a[role="link"][aria-label]')
// Share String
const postURL = await (await postURLHandle.getProperty('href')).jsonValue()
obj['postUrl'] = postURL
}
}
}
console.log(articleNums)
}
const saveToFile = async (list) =>{
const csv = new ObjectsToCsv(list);
// Save to file:
await csv.toDisk('./post_sample.csv');
}
These are the lines in question that are continously returning undefined
/ const handle = await article.waitForFunction('document.querySelector("span.d2edcug0.hpfvmrgz.qv66sw1b.c1et5uql.lr9zc1uh.a8c37x1j.fe6kdd0r.mau55g9w.c8b282yb.keod5gw0.nxhoafnm.aigsh9s9.d3f4x2em.iv3no6db.jq4qci2q.a3bd9o3v.b1v8xokw.m9osqain").innerText')
// const handle = await article.waitForXPath("//span[contains(text(), 'Comments')]", {visible: true})
const handle = await article.waitForSelector('div[aria-posinset="1"] div.gtad4xkn')

Asynchronous function recognizes function is done when really it still has more to do

The page.on is reconized by the async for loop at the bottom as finished and ready to run the function again, but its not actually done. It still needs to run everything up to page.close. How do I let the async function know that it is done after page.close, not page.on? Let me know if you need anymore info, thanks.
const puppeteer = require('puppeteer');
const fs = require('fs');
const req = require('request');
const got = require('got');
const NodeID3 = require('node-id3');
const readline = require('readline');
const selectors = require('./selectors');
const getDownloadUrl = async (url, browser) => {
const page = await browser.newPage();
await page.goto(url);
page.setRequestInterception(true);
await page._client.send('Page.setDownloadBehavior', {behavior: 'allow', downloadPath: './Songs'})
const baseUrl = 'https://cf-hls-media.sndcdn.com/media/';
await page.on('request', async (request) => {
if(request.url().includes(baseUrl)){
const downloadUrl = fixUrl(request.url());
const info = await getSongInfo(page);
downloadSong(downloadUrl, info.title);
await tagSong(info);
await request.abort();
await page.close();
} else {
request.continue();
}
});
};
const fixUrl = (url) => {
...
};
const downloadSong = (url, title) => {
...
};
const getSongInfo = async (page) => {
...
};
const tagSong = async (info) => {
...
};
(() => {
const readInterface = readline.createInterface({
input: fs.createReadStream('../Song Urls.csv'),
output: process.stdout,
console: false,
terminal: false,
});
let urls = [];
readInterface.on('line', function(line) {
urls.push(line);
}).on('close', async () => {
const browser = await puppeteer.launch({headless: false});
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
await getDownloadUrl(url, browser);
}
});
})();
/*
Issue: The loop recognizes that the getDownloadUrl function is done even though it's
not and continues anyways.
*/
await only works with promises, and page.on looks to be a callback-based event listener, not something that returns a promise. If you want to be able to await it, you will need to create a promise around it.
await new Promise((resolve) => {
page.on('request', async (request) => {
if(request.url().includes(baseUrl)){
const downloadUrl = fixUrl(request.url());
const info = await getSongInfo(page);
downloadSong(downloadUrl, info.title);
await tagSong(info);
await request.abort();
await page.close();
resolve();
} else {
request.continue();
}
});
})

Not able to read the dom content in puppeteer [node.js] using class & objects

I'm trying to read the dom content from indian superleague for example goals, attacking, mins per goal, etc using class and object. It gives an error like this
Evaluation failed: TypeError: Cannot read property 'textContent' of undefined
at puppeteer_evaluation_script:7:64
Here's the code
config.js
const puppeteer = require('puppeteer')
class Puppeteer{
constructor(){
this.param = {
path: 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe',
url: 'https://indiansuperleague.com',
}
}
async connect(){
this.param.browser = await puppeteer.launch({executablePath: this.param.path, headless: false})
this.param.page = await this.param.browser.newPage()
await this.param.page.goto(this.param.url, {timeout: 0})
}
async disconnect(){
await this.param.browser.close()
}
}
module.exports = Puppeteer
states.js
class States{
constructor(param){
this.param = param
}
async fetchData(){
const page = this.param.page
const res = await page.evaluate(() => {
const title = 'si-fkt-sctn-title', value = 'si-fkt-sctn-number'
// const titleArray = document.getElementsByClassName(title)
// const valueArray = document.getElementsByClassName(value)
let key = document.getElementsByClassName(title)[0].textContent.trim()
let num = document.getElementsByClassName(value)[0].textContent.trim()
/* for(let i=0; i<titleArray.length; i++){
key[i] = titleArray[i].textContent.trim()
num[i] = valueArray[i].textContent.trim()
// Object.defineProperty(temp, key, {value:num,writable: true,configurable: true,enumerable: true})
} */
return {key, num}
})
console.log(res)
}
}
module.exports = States
app.js
const Puppeteer = require('./config')
const States = require('./modules/states')
const puppeteer = new Puppeteer()
const states = new States(puppeteer.param)
puppeteer.connect().then(async() => {
let res = await states.fetchData()
console.log(res)
await puppeteer.disconnect()
}).catch(e => console.log(e))
What is the solution?
The elements may be created dynamically after some time. You can try to use page.waitForSelector() before retrieving the data from them. For example:
'use strict';
const puppeteer = require('puppeteer');
(async function main() {
try {
const browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto('https://indiansuperleague.com');
await Promise.all([
page.waitForSelector('.si-fkt-sctn-title'),
page.waitForSelector('.si-fkt-sctn-number'),
]);
const data = await page.evaluate(() => {
return [
document.querySelector('.si-fkt-sctn-title').textContent,
document.querySelector('.si-fkt-sctn-number').textContent,
];
});
console.log(data);
await browser.close();
} catch (err) {
console.error(err);
}
})();
Output:
[ ' Goals', ' 63' ]

How to fetch data from multiple urls at once?

I have a function that fetches from a url in React
const DataContextProvider = (props) => {
const [isLoading, setLoading] = useState(false);
const [cocktails, setCocktails] = useState([]);
useEffect(() => {
const fetchCocktailList = async () => {
const baseUrl = 'https://www.thecocktaildb.com/api/json/v1/1/';
setLoading(true);
try {
const res = await fetch(`${baseUrl}search.php?s=margarita`);
const data = await res.json();
console.log(data);
setCocktails(data.drinks);
setLoading(false);
} catch (err) {
console.log('Error fetching data');
setLoading(false);
}
};
fetchCocktailList();
}, []);
How I'm mapping data so far.
const DrinkList = () => {
const { cocktails } = useContext(DataContext);
return (
<div className='drink-list-wrapper'>
{cocktails.length > 0 &&
cocktails.map((drink) => {
return <DrinkItem drink={drink} key={drink.idDrink} />;
})}
</div>
);
};
However I want to fetch from this url also ${baseUrl}search.php?s=martini
I would like a good clean way to do this and set my state to both of the returned data.
First base the data fetch function on a parameter:
const fetchCocktail = async (name) => {
const baseUrl = 'https://www.thecocktaildb.com/api/json/v1/1/';
try {
const res = await fetch(`${baseUrl}search.php?s=` + name);
const data = await res.json();
return data.drinks;
} catch (err) {
console.log('Error fetching data');
}
}
Then use Promise.all to await all results:
setLoading(true);
var promises = [
fetchCocktail(`margarita`),
fetchCocktail(`martini`)
];
var results = await Promise.all(promises);
setLoading(false);
DrinkList(results);
Where results will be an array with the responses that you can use on the DrinkList function.
Here's a method which will let you specify the cocktail names as dependencies to the useEffect so you can store them in your state and fetch new drink lists if you want new recipes. If not, it'll just be a static state variable.
I've also added another state variable errorMessage which you use to pass an error message in the case of failure.
Also, you should include the appropriate dependencies in your useEffect hook. The setState functions returned by calls to useState are stable and won't trigger a re-run of the effect, and the cocktailNames variable won't trigger a re-run unless you update it with new things to fetch.
const DataContextProvider = (props) => {
const [isLoading, setLoading] = useState(false);
const [cocktails, setCocktails] = useState([]);
const [errorMessage, setErrorMessage] = useState(''); // holds an error message in case the network request dosn't succeed
const [cocktailNames, setCocktailNames] = useState(['margarita', 'martini']); // the search queries for the `s` parameter at your API endpoint
useEffect(() => {
const fetchCocktailLists = async (...cocktailNames) => {
const fetchCocktailList = async (cocktailName) => {
const baseUrl = 'https://www.thecocktaildb.com/api/json/v1/1/search.php';
const url = new URL(baseUrl);
const params = new URLSearchParams({s: cocktailName});
url.search = params.toString(); // -> '?s=cocktailName'
const res = await fetch(url.href); // -> 'https://www.thecocktaildb.com/api/json/v1/1/search.php?s=cocktailName'
const data = await res.json();
const {drinks: drinkList} = data; // destructured form of: const drinkList = data.drinks;
return drinkList;
};
setLoading(true);
try {
const promises = [];
for (const cocktailName of cocktailNames) {
promises.push(fetchCocktailList(cocktailName));
}
const drinkLists = await Promise.all(promises); // -> [[drink1, drink2], [drink3, drink4]]
const allDrinks = drinkLists.flat(1); // -> [drink1, drink2, drink3, drink4]
setCocktails(allDrinks);
}
catch (err) {
setErrorMessage(err.message /* or whatever custom message you want */);
}
setLoading(false);
};
fetchCocktailList(...cocktailNames);
}, [cocktailNames, setCocktails, setErrorMessage, setLoading]);
};
var promises = [
fetchCocktail(api1),
fetchCocktail(api2)
];
var results = await Promise.allSettled(promises);

How to Test a Nested await function in JS

const test = async (url, id) => {
if (!isValidUrl(url)) {
throw new Error('Invalid URL')
}
const storage = new Storage(Indexdb, id);
const cae = new valueExtract(url);
const data = await cae.fetch();
const obj = await new ZIPExtractor(data); // shudder. A constructor should never return a promise
const zip = await obj.getZip();
const list = await zip.getList();
const sI = storage.connection;
await Promise.all(Object.keys(list).map(async (fileName, index) => {
const blob = await new FileExtractor(list[fileName]);
const store = new StoreObject(fileName, 'testData', blob);
await sI.setItemForce(fileName, store.dataObject);
}));
return sI; // or something?
}
This is the answer of a question i had asked , I wanted to also know how do we test such nested awaited functions.
unit Test of Normal Fun
test('Tes scenario', () => {
let storage = new StorageAdapter(
type,
dbName
)
expect(storage.storageName).toBe(dbName)
expect(storage.storageType).toEqual(type)
expect(storage.connection).not.toBeNull()
})

Categories