I have a scenario where I am calling an API that has pagination.
What I'd like to do is the following, 1 page at a time.
Call API Page 1
For each of the items in the response, call a Promise to get more data and store in an array
Send the array to an API
Repeat until all pages are complete
What I currently have is the following, however I think I am possibly complicating this too much, although unsure on how to proceed.
// Walks the paginated items API: for every page it looks up an image URL per
// item, collects { id, imageURL } records and posts them to the 'sync' endpoint.
// NOTE(review): `async() {` is missing the `=>` arrow, so this declaration does not parse as written.
export const importData = async() {
const pSize = 15;
// The first request is only used to read the total item count.
const response = await getItems(pSize, 1);
const noPage = Math.ceil(response.totalMerchandiseCount/pSize);
// NOTE(review): page numbers start at 1, so `i < noPage` stops before the last page — confirm whether `<=` was intended.
for (let i = 1; i < noPage; i++) {
const items = [];
const data = await getItems(pSize, i);
// NOTE(review): presumably `async` is the caolan/async library; verify against its docs whether
// `each` returns an awaitable promise when a final callback is passed, and whether an
// async iteratee receives a `cb` argument at all.
// The iteratee parameter `i` shadows the page counter `i` of the enclosing loop.
await async.each(data.merchandiseList, async(i, cb) => {
const imageURL = await getImageURL(i.id, i.type);
items.push({
id: i.id,
imageURL: imageURL,
});
cb();
}, async() => {
// Final callback: send whatever has been collected for this page.
return await api.mockable('sync', items);
});
}
}
/**
 * Fetches the page for an item and extracts its image URL.
 *
 * @param {string|number} id - Item id, used to build the page URL.
 * @param {*} type - Item type (not used by the current implementation).
 * @returns {Promise<*>} The extracted image, or null when the request or the
 *   extraction fails (the error is logged).
 */
export const getImageURL = async (id, type) => {
  const pageUrl = `https://example.com/${id}`;

  const extractImage = (response) => {
    const $ = cheerio.load(response.data);
    // do stuff to get imageUrl
    return image;
  };

  const logAndGiveUp = (e) => {
    console.log(e);
    return null;
  };

  return axios.get(pageUrl).then(extractImage).catch(logAndGiveUp);
};
The issue I have at the moment is that it seems to wait until all pages are complete before calling api.mockable. Items is also empty at this point.
Can anyone suggest a way to make this a bit neater and help me get it working?
If this is all meant to be serial, then you can just use a for-of loop:
/**
 * Imports all merchandise strictly in series: one page at a time, and within a
 * page one image lookup at a time. After each page its { id, imageURL } records
 * are posted to the 'sync' endpoint before the next page is requested.
 *
 * @returns {Promise<void>} Resolves once every page has been synced.
 */
export const importData = async () => {
  const pSize = 15;
  const firstPage = await getItems(pSize, 1);
  const noPage = Math.ceil(firstPage.totalMerchandiseCount / pSize);
  // Page numbers are 1-based, so the bound must be inclusive; with `<` the last
  // page (and a single-page result entirely) would never be synced.
  for (let page = 1; page <= noPage; page++) {
    // Page 1 was already fetched to learn the total count; reuse it.
    const data = page === 1 ? firstPage : await getItems(pSize, page);
    const items = [];
    for (const { id, type } of data.merchandiseList) {
      const imageURL = await getImageURL(id, type);
      items.push({ id, imageURL });
    }
    await api.mockable('sync', items);
  }
};
I also threw some destructuring and shorthand properties in there. :-)
If it's just the pages in serial but you can get the items in parallel, you can replace the for-of with map and Promise.all on the items:
/**
 * Imports all merchandise with pages processed in series but the image lookups
 * of a single page running in parallel. Each page's { id, imageURL } records
 * are posted to the 'sync' endpoint before the next page is requested.
 *
 * @returns {Promise<void>} Resolves once every page has been synced.
 */
export const importData = async () => {
  const pSize = 15;
  const firstPage = await getItems(pSize, 1);
  const noPage = Math.ceil(firstPage.totalMerchandiseCount / pSize);
  // Page numbers are 1-based, so the bound must be inclusive; with `<` the last
  // page (and a single-page result entirely) would never be synced.
  for (let page = 1; page <= noPage; page++) {
    // Page 1 was already fetched to learn the total count; reuse it.
    const data = page === 1 ? firstPage : await getItems(pSize, page);
    // Promise.all keeps the result order aligned with merchandiseList.
    const items = await Promise.all(data.merchandiseList.map(async ({ id, type }) => {
      const imageURL = await getImageURL(id, type);
      return { id, imageURL };
    }));
    await api.mockable('sync', items);
  }
};
That async function call to map can be slightly more efficient as a non-async function:
/**
 * Same as the parallel-per-page import, but the mapper is a plain function that
 * returns the promise chain directly instead of wrapping it in an async function.
 *
 * @returns {Promise<void>} Resolves once every page has been synced.
 */
export const importData = async () => {
  const pSize = 15;
  const firstPage = await getItems(pSize, 1);
  const noPage = Math.ceil(firstPage.totalMerchandiseCount / pSize);
  // Page numbers are 1-based, so the bound must be inclusive; with `<` the last
  // page (and a single-page result entirely) would never be synced.
  for (let page = 1; page <= noPage; page++) {
    // Page 1 was already fetched to learn the total count; reuse it.
    const data = page === 1 ? firstPage : await getItems(pSize, page);
    const items = await Promise.all(data.merchandiseList.map(({ id, type }) =>
      getImageURL(id, type).then((imageURL) => ({ id, imageURL }))
    ));
    await api.mockable('sync', items);
  }
};
Related
I am making this scraper to collect posts from public Facebook pages. My problem is that when I turn the scraper up to collect more than about 10 posts, it is unable to scrape elements after scrolling far down. The way it works is that the scraper goes to a public page and then scrolls the feed until it has grabbed the IDs for the number of posts you want to collect. After collecting all the IDs, it goes to every post and collects specific info such as comments, shares, reactions, etc. But when I'm far down the feed, it can't find the posts higher up the feed by ID: even though the selector works in Chrome dev tools, in Puppeteer the result is undefined. So my question is: why does the scroll location affect Puppeteer's ability to read the DOM? And is there a better way for me to collect this information?
-Sidenote: this scraper is expected to grab thousands of posts
Here is my code so far
const { scrollPageToBottom } = require('puppeteer-autoscroll-down')
const puppeteer = require('puppeteer');
const prompt = require('prompt-sync')();
const ObjectsToCsv = require('objects-to-csv');
// Entry point: launches a visible, maximized browser, opens the Facebook page
// and runs the article scraper against it.
(async () => {
  // NOTE(review): the answer is read but never used — the URL below is hard-coded.
  const fbPage = prompt('What FaceBook Page?');
  const browser = await puppeteer.launch({
    headless: false,
    defaultViewport: null,
    args: ['--start-maximized']
  });
  try {
    const page = await browser.newPage();
    // Navigation errors (e.g. a networkidle2 timeout) are deliberately ignored
    // so scraping is still attempted on whatever has loaded.
    await page.goto(`https://www.facebook.com/OfficialMensHumor/`, {waitUntil : 'networkidle2' }).catch(e => void 0);
    await scrapeArticles(page);
  } finally {
    // Always release the browser, even when a step above throws.
    await browser.close();
  }
})().catch((err) => {
  // Surface unexpected failures instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
/**
 * Scrolls the page down in 100px steps, one step per second, until the
 * accumulated distance reaches the document height minus the viewport height.
 *
 * @param {object} page - Object exposing `evaluate(fn)`; the function is run in
 *   the page context, where `document` and `window` are available.
 * @returns {Promise<void>} Resolves when scrolling has finished.
 */
async function autoScroll(page) {
  const scrollUntilBottom = async () => {
    await new Promise((done) => {
      const step = 100;
      let travelled = 0;
      const ticker = setInterval(() => {
        // Re-read the height on every tick: it can grow as content loads.
        const fullHeight = document.body.scrollHeight;
        window.scrollBy(0, step);
        travelled += step;
        if (travelled >= fullHeight - window.innerHeight) {
          clearInterval(ticker);
          done();
        }
      }, 1000);
    });
  };
  await page.evaluate(scrollUntilBottom);
}
/**
 * Logs the `innerText` of every element handle, one after another.
 *
 * @param {Iterable<object>} spans - Handles exposing `getProperty(name)` whose
 *   result exposes `jsonValue()`.
 * @returns {Promise<void>}
 */
async function getText(spans) {
  for (const span of spans) {
    const innerTextProp = await span.getProperty('innerText');
    const value = await innerTextProp.jsonValue();
    console.log(value);
  }
}
/**
 * Scrolls the feed until at least `postCount` post ids have been read, then
 * enriches every post (URL, comments, shares, reactions, image, timestamp) and
 * hands the list to `saveToFile`. Any error is logged, not rethrown.
 *
 * @param {object} page - Puppeteer page positioned on the feed.
 * @param {number} [postCount=100] - Minimum number of posts to collect.
 * @param {number} [scrollDelay=800] - Currently unused.
 * @returns {Promise<void>}
 */
async function scrapeArticles(
  page,
  // extractItems,
  postCount = 100,
  scrollDelay = 800,
) {
  // Runs in the page context: reads the aria-posinset of every rendered article.
  const readPostIds = () => {
    const postDivs = Array.from(document.querySelectorAll('div.du4w35lb.l9j0dhe7 div[class=lzcic4wl][role="article"]'))
    return postDivs.map(post=>({id:post.getAttribute('aria-posinset')}))
  };

  let collected = [];
  try {
    while (collected.length < postCount) {
      const feed = await page.$('div[role="main"] > div.k4urcfbm');
      // The list is re-read from scratch on every pass, not appended to.
      collected = await feed.evaluate(readPostIds);
      console.log(collected);
      await scrollPageToBottom(page, { size: 500, delay: 250 });
    }
    console.log(1);
    // Each step mutates the post objects in place; getTime relies on the
    // postUrl written by getPostUrls.
    await getPostUrls(page, collected);
    await getComments(page, collected);
    await getShares(page, collected);
    await getReactions(page, collected);
    await getPostImg(page, collected);
    await getTime(page, collected);
    console.log(collected);
    // Not awaited: the write is started and left to finish on its own.
    saveToFile(collected);
  } catch (e) {
    console.log(e);
  }
  // return items;
}
// Reads the comment-count text of every collected post and stores it on the
// post object as `commentsNum`.
const getComments = async (page, articleNums) =>{
for (const obj of articleNums){
// Only the `id` key triggers a lookup; every other key on the object is skipped.
for(const key in obj){
if(key == 'id'){
// Locate the post element by its aria-posinset value.
const article = await page.$(`div[aria-posinset="${obj[key]}"]`)
// const handle = await article.waitForFunction('document.querySelector("span.d2edcug0.hpfvmrgz.qv66sw1b.c1et5uql.lr9zc1uh.a8c37x1j.fe6kdd0r.mau55g9w.c8b282yb.keod5gw0.nxhoafnm.aigsh9s9.d3f4x2em.iv3no6db.jq4qci2q.a3bd9o3v.b1v8xokw.m9osqain").innerText')
// const handle = await article.waitForXPath("//span[contains(text(), 'Comments')]", {visible: true})
// NOTE(review): this selector hard-codes aria-posinset="1" instead of using the current post's id — confirm this is intended.
const handle = await article.waitForSelector('div[aria-posinset="1"] div.gtad4xkn')
// Comment String
const commentNum = await (await handle.getProperty('innerText')).jsonValue()
obj['commentsNum'] = commentNum
}
}
}
// console.log(articleNums)
}
/**
 * Reads the "Shares" text of every collected post and stores it on the post
 * object as `sharesNum`. Posts without an `id` are left untouched.
 *
 * @param {object} page - Puppeteer page containing the feed.
 * @param {Array<object>} articleNums - Post objects, mutated in place.
 * @returns {Promise<void>}
 */
const getShares = async (page, articleNums) => {
  for (const obj of articleNums) {
    if (!('id' in obj)) continue;
    const article = await page.$(`div[aria-posinset="${obj.id}"]`);
    // waitForXPath resolves to a single element handle, not an array, so the
    // handle is used directly (indexing it with [0] yields undefined).
    const handle = await article.waitForXPath("//span[contains(text(), 'Shares')]", {visible: true});
    // Share String
    const shareNum = await (await handle.getProperty('innerText')).jsonValue();
    obj.sharesNum = shareNum;
  }
};
/**
 * Reads the reaction-count text of every collected post and stores it on the
 * post object as `reactionsNum`.
 *
 * @param {object} page - Puppeteer page containing the feed.
 * @param {Array<object>} articleNums - Post objects, mutated in place.
 * @returns {Promise<void>}
 */
const getReactions = async (page, articleNums) => {
  for (const entry of articleNums) {
    for (const field in entry) {
      // Only the id key identifies a post element.
      if (field !== 'id') continue;
      const postNode = await page.$(`div[aria-posinset="${entry[field]}"]`);
      const countSpan = await postNode.$('span[aria-label="See who reacted to this"] + span[aria-hidden="true"]');
      const textProp = await countSpan.getProperty('innerText');
      const reactionsNum = await textProp.jsonValue();
      entry['reactionsNum'] = reactionsNum;
    }
  }
};
/**
 * Reads the image source of every collected post and stores it on the post
 * object as `imgUrl`.
 *
 * @param {object} page - Puppeteer page containing the feed.
 * @param {Array<object>} articleNums - Post objects, mutated in place.
 * @returns {Promise<void>}
 */
const getPostImg = async (page, articleNums) => {
  for (const entry of articleNums) {
    for (const field in entry) {
      // Only the id key identifies a post element.
      if (field !== 'id') continue;
      const postNode = await page.$(`div[aria-posinset="${entry[field]}"]`);
      const imageWrapper = await postNode.$('div[class="pmk7jnqg kr520xx4"]');
      const imageNode = await imageWrapper.$('img[alt]');
      const srcProp = await imageNode.getProperty('src');
      const imgUrl = await srcProp.jsonValue();
      entry['imgUrl'] = imgUrl;
    }
  }
};
// And timestamp
/**
 * Visits the `postUrl` of every collected post and stores the tooltip content
 * of its `abbr[data-shorten]` element on the post object as `timestamp`.
 *
 * @param {object} page - Puppeteer page; it is navigated away from the feed.
 * @param {Array<object>} articleNums - Post objects, mutated in place.
 * @returns {Promise<void>}
 */
const getTime = async (page, articleNums) => {
  for (const entry of articleNums) {
    for (const field in entry) {
      // Posts without a postUrl are skipped.
      if (field !== 'postUrl') continue;
      await page.goto(entry[field]);
      const tooltip = await page.$eval('abbr[data-shorten]', abbr => abbr.dataset.tooltipContent);
      entry['timestamp'] = tooltip;
    }
  }
};
/**
 * Reads the permalink of every collected post and stores it on the post object
 * as `postUrl`, then logs the whole list.
 *
 * @param {object} page - Puppeteer page containing the feed.
 * @param {Array<object>} articleNums - Post objects, mutated in place.
 * @returns {Promise<void>}
 */
const getPostUrls = async (page, articleNums) => {
  for (const entry of articleNums) {
    for (const field in entry) {
      // Only the id key identifies a post element.
      if (field !== 'id') continue;
      const postNode = await page.$(`div[aria-posinset="${entry[field]}"]`);
      const linkNode = await postNode.$('a[role="link"][aria-label]');
      const hrefProp = await linkNode.getProperty('href');
      const postURL = await hrefProp.jsonValue();
      entry['postUrl'] = postURL;
    }
  }
  console.log(articleNums);
};
/**
 * Writes the given records to ./post_sample.csv.
 *
 * @param {Array<object>} list - Records to serialise, passed to ObjectsToCsv.
 * @returns {Promise<void>} Resolves when the file has been written.
 */
const saveToFile = async (list) => {
  const writer = new ObjectsToCsv(list);
  await writer.toDisk('./post_sample.csv');
};
These are the lines in question that are continuously returning undefined:
// const handle = await article.waitForFunction('document.querySelector("span.d2edcug0.hpfvmrgz.qv66sw1b.c1et5uql.lr9zc1uh.a8c37x1j.fe6kdd0r.mau55g9w.c8b282yb.keod5gw0.nxhoafnm.aigsh9s9.d3f4x2em.iv3no6db.jq4qci2q.a3bd9o3v.b1v8xokw.m9osqain").innerText')
// const handle = await article.waitForXPath("//span[contains(text(), 'Comments')]", {visible: true})
const handle = await article.waitForSelector('div[aria-posinset="1"] div.gtad4xkn')
// Requests top headlines for every interest category, pushes the first
// numOfArticlesArray[index] articles of each response into `news`, and logs `news`.
// NOTE(review): none of the fetch promises are awaited, so `console.log(news)`
// runs before any `.then` callback has pushed an article.
export function fetchNews(data) {
const news = []
// Per-category article counts, index-aligned with `data`.
var numOfArticlesArray = fetchNewsPreprocessing(data)
// One request is started per interest; the array returned by map is discarded.
data.map((interest, index) => {
fetch(`https://newsapi.org/v2/top-headlines?country=us&category=${interest}&apiKey=`)
.then(res => res.json())
.then(res => res.articles)
.then(res => {
for (let i = 0; i < numOfArticlesArray[index]; i++) {
news.push(res[i])
}
})
// Failures are logged per request and do not affect the other categories.
.catch(err => console.log(err))
})
console.log(news);
}
So here is the function, my issue is that I'm getting this console.log(news); before I finish appending to my news array in here news.push(res[i]) which results in a blank array.
I tried adding async and await to the function like this async function fetchNews(data) and await data.map((interest, index) => { but no use.
thanks in advance.
Do you want to execute your fetch() calls serially, or in parallel?
If you want to execute them serially then something like this will work:
/**
 * Fetches top headlines for each interest strictly one after another and
 * collects the first numOfArticlesArray[index] articles of every response.
 *
 * A failed request is logged and skipped; the remaining interests are still
 * fetched.
 *
 * @param {Array<string>} data - Interest categories.
 * @returns {Promise<Array<object>>} The collected articles, in `data` order.
 */
export async function fetchNews(data) {
  const news = [];
  const numOfArticlesArray = fetchNewsPreprocessing(data);
  // for...of with await is what makes the requests serial; an async callback
  // passed to map would start them all at once and return before any finished.
  for (const [index, interest] of data.entries()) {
    const url = `https://newsapi.org/v2/top-headlines?country=us&category=${interest}&apiKey=`;
    try {
      const response = await fetch(url);
      const { articles } = await response.json();
      // Never read past the end of the response, so no `undefined` is pushed.
      const limit = Math.min(numOfArticlesArray[index], articles.length);
      for (let i = 0; i < limit; i++) {
        news.push(articles[i]);
      }
    } catch (err) {
      console.log(err);
    }
  }
  console.log(news);
  return news;
}
If you want to execute them in parallel, however, then something like this is what you want:
/**
 * Fetches top headlines for all interests in parallel and collects the first
 * numOfArticlesArray[index] articles of every response, in `data` order.
 *
 * @param {Array<string>} data - Interest categories.
 * @returns {Promise<Array<object>>} The collected articles.
 * @throws Rejects if any single request fails (Promise.all is fail-fast).
 */
export async function fetchNews(data) {
  const news = [];
  const numOfArticlesArray = fetchNewsPreprocessing(data);
  // Start every request before awaiting any of them.
  const requests = data.map((interest) => {
    const url = `https://newsapi.org/v2/top-headlines?country=us&category=${interest}&apiKey=`;
    return fetch(url).then((res) => res.json());
  });
  const responses = await Promise.all(requests);
  // The index must be mutable: a `const` loop counter throws on the first `++i`.
  for (let i = 0; i < responses.length; ++i) {
    const articles = responses[i].articles ?? [];
    // Never read past the end of the response, so no `undefined` is pushed.
    const limit = Math.min(numOfArticlesArray[i], articles.length);
    for (let j = 0; j < limit; ++j) {
      news.push(articles[j]);
    }
  }
  console.log(news);
  return news;
}
You should put await in front of fetch() instead. For example, this piece of code will output the news array with the test element:
/**
 * Minimal demonstration: waits for the request to finish, then records a
 * marker element and logs the array.
 *
 * @param {*} data - Unused.
 * @returns {Promise<void>}
 */
async function fetchNews(data) {
  const news = [];
  // Nothing runs past this line until the request has completed.
  await fetch(url);
  news.push('test');
  console.log(news);
}
I expect when I call an async function to resolve promise at the end, not before.
const urls = await uploadImages({ imageChanges, questions });
// ...next step
// I will use urls
But after calling await uploadImages() it continues to run until const data = await fetch(image.src);
And then ...next step starts. How can I make it wait for imageChanges.forEach loop finish ? Should I create another nested function inside ?
// Uploads every changed image to Firebase storage and is meant to return the
// resulting download URLs. Returns null when there is nothing to upload.
const uploadImages = async ({ imageChanges, questions }) => {
if (!imageChanges.length) return null;
const storage = firebase.storage();
// NOTE(review): `urls` is declared without an initial value, so `urls.push` below
// is called on undefined.
let urls;
try {
//** convert each new image's src from blob to downloadUrl. */
// NOTE(review): forEach ignores the promises returned by the async callback, so
// the function reaches `return urls` before any upload has finished, and the
// surrounding try/catch does not see rejections raised inside the callback.
imageChanges.forEach(async image => {
// The index of the matching question is used as the file name.
const questionId = questions.findIndex(q => q.id === image.questionId);
const imagePath = `${questionId}.jpg`;
const storageRef = storage.ref(imagePath);
// **
const data = await fetch(image.src);
const blob = await data.blob();
const uploadTaskSnapshot = await storageRef.put(blob);
const downloadURL = await uploadTaskSnapshot.ref.getDownloadURL();
urls.push(downloadURL)
});
return urls;
} catch (error) {
console.log(error.message);
}
};
forEach with async doesn't work as expected. Read this answer for more info.
Try like this
/**
 * Uploads every changed image to Firebase storage in parallel and returns the
 * download URLs in the same order as `imageChanges`.
 *
 * @param {object} args
 * @param {Array<{questionId: *, src: string}>} args.imageChanges - Images to upload.
 * @param {Array<{id: *}>} args.questions - Questions; the index of the matching
 *   question is used as the file name.
 * @returns {Promise<Array<string>|null|undefined>} The download URLs, null when
 *   there is nothing to upload, or undefined when an upload failed (the error
 *   message is logged).
 */
const uploadImages = async ({ imageChanges, questions }) => {
  if (!imageChanges.length) return null;
  const storage = firebase.storage();
  try {
    // The callback must receive `image`; without the parameter every use of
    // `image` below is a ReferenceError.
    const imageChangesUrlPromise = imageChanges.map(async (image) => {
      const questionId = questions.findIndex(q => q.id === image.questionId);
      const imagePath = `${questionId}.jpg`;
      const storageRef = storage.ref(imagePath);
      const data = await fetch(image.src);
      const blob = await data.blob();
      const uploadTaskSnapshot = await storageRef.put(blob);
      const downloadURL = await uploadTaskSnapshot.ref.getDownloadURL();
      return downloadURL;
    });
    // Awaited inside the try so that a failed upload is caught below.
    return await Promise.all(imageChangesUrlPromise);
  } catch (error) {
    console.log(error.message);
  }
};
and then
const urls = await uploadImages({ imageChanges, questions });
...
JavaScript does this because forEach is not promise-aware. It cannot support async and await. You cannot use await in forEach.
If you use await in a map, map will always return an array of promises. This is because asynchronous functions always return promises.
With a little modification to your code, this should work:
/**
 * Uploads every changed image to Firebase storage in parallel and returns the
 * download URLs in the same order as `imageChanges`.
 *
 * @param {object} args
 * @param {Array<{questionId: *, src: string}>} args.imageChanges - Images to upload.
 * @param {Array<{id: *}>} args.questions - Questions; the index of the matching
 *   question is used as the file name.
 * @returns {Promise<Array<string>|null|undefined>} The download URLs, null when
 *   there is nothing to upload, or undefined when an upload failed (the error
 *   message is logged).
 */
const uploadImages = async ({ imageChanges, questions }) => {
  if (!imageChanges.length) return null;
  const storage = firebase.storage();
  try {
    //** convert each new image's src from blob to downloadUrl. */
    // The URLs are taken from Promise.all's result instead of being pushed into
    // a shared array: that array was never initialised, and pushing from
    // parallel callbacks would order the URLs by completion time.
    const urls = await Promise.all(imageChanges.map(async image => {
      const questionId = questions.findIndex(q => q.id === image.questionId);
      const imagePath = `${questionId}.jpg`;
      const storageRef = storage.ref(imagePath);
      const data = await fetch(image.src);
      const blob = await data.blob();
      const uploadTaskSnapshot = await storageRef.put(blob);
      return uploadTaskSnapshot.ref.getDownloadURL();
    }));
    return urls;
  } catch (error) {
    console.log(error.message);
  }
};
const urls = await uploadImages({ imageChanges, questions });
Here's some code to join something from 3 object stores:
// Shared handle to the opened database; assigned once the open request succeeds.
let db;
indexedDB.open('db', 1).onsuccess = ev => {
db = ev.target.result;
// One transaction spanning the three stores that are joined below.
const tran = db.transaction(['s1', 's2', 's3']);
// Each lookup is started from the previous lookup's success handler, which is
// what produces the nesting.
tran.objectStore('s1').get('third').onsuccess = ev1 =>
tran.objectStore('s2').index('connectTo').get('third').onsuccess = ev2 =>
tran.objectStore('s3').index('connectTo').get('third').onsuccess = ev3 => {
const [res1, res2, res3] = [ev1.target.result, ev2.target.result, ev3.target.result];
// Merge the three records; when keys clash the later spread wins.
const result = {...res1, ...res2, ...res3};
......
}
}
Can I use promises or other means like async/await to avoid the heavy nesting? It'd be good if I can put these query processes in a function and get the result object as the return value.
Something like this should work.
/**
 * Opens the 'db' database, looks up 'third' in stores s1, s2 and s3 and merges
 * the three records into one object (later stores win on key clashes).
 *
 * @returns {Promise<object>} The merged record.
 * @throws Rejects with the request's error if opening or any lookup fails.
 */
const someFunction = async () => {
  // IDBRequest objects are event-based, not thenables, so `await request` would
  // just yield the request itself. Wrap each one in a promise that resolves
  // with its success event.
  const whenDone = (request) =>
    new Promise((resolve, reject) => {
      request.onsuccess = (event) => resolve(event);
      request.onerror = () => reject(request.error);
    });

  const openDB = await whenDone(indexedDB.open('db', 1));
  const db = openDB.target.result;
  const tran = db.transaction(['s1', 's2', 's3']);
  // All three lookups are issued synchronously, before the first await, so the
  // transaction cannot auto-commit between them.
  const [ev1, ev2, ev3] = await Promise.all([
    whenDone(tran.objectStore('s1').get('third')),
    whenDone(tran.objectStore('s2').index('connectTo').get('third')),
    whenDone(tran.objectStore('s3').index('connectTo').get('third')),
  ]);
  const [res1, res2, res3] = [
    ev1.target.result,
    ev2.target.result,
    ev3.target.result,
  ];
  const result = { ...res1, ...res2, ...res3 };
  return result;
};
someFunction()
Personally I would store the results like this and eliminate the need for copies (if possible for you).
const result = { ...ev1.target.result, ...ev2.target.result, ...ev3.target.result }
I have a very simple script that gets some info by mapping over an array of around 150 records. The code seems to work fine with a smaller number of records, but every time I run it with these 150 records it just stops working and doesn't continue. I think it might be a Promise.all problem.
any idea?
code:
const request = require('request');
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs').promises;
let champions = [];
/**
 * Scrapes every champion page concurrently, writes the collected list to
 * json/champions-skins.json and returns it.
 *
 * @param {Array<{href: string}>} hrefs - Pages to scrape.
 * @returns {Promise<Array<{champName: string, skins: string[]}|undefined>>}
 *   One entry per input, in input order; undefined where a page failed.
 */
const getChampData = async hrefs => {
  // Scrapes one page. A failure is logged and leaves undefined in the results.
  const scrapeChampion = async ({ href }) => {
    try {
      const page = await axios.get(href);
      const $ = cheerio.load(page.data);
      const champName = $('.style__Title-sc-14gxj1e-3 span').text();
      const skins = [];
      $('.style__CarouselItemText-sc-1tlyqoa-16').each((_, node) => {
        skins.push($(node).text());
      });
      const champion = { champName, skins };
      console.log(champion);
      return champion;
    } catch (err) {
      console.error(err);
    }
  };

  // All requests are started at once; Promise.all preserves input order.
  const results = await Promise.all(hrefs.map(scrapeChampion));
  await fs.writeFile('json/champions-skins.json', JSON.stringify(results));
  return results;
};
edit #1:
I used a package called p-map with it and now everything works just fine!
const axios = require('axios');
const pMap = require('p-map');
const cheerio = require('cheerio');
const fs = require('fs').promises;
/**
 * Scrapes every champion page through pMap and writes the collected list to
 * champions-with-skins-list.json. Any failure is logged by message and no file
 * is written.
 *
 * @param {Array<{href: string}>} hrefs - Pages to scrape.
 * @returns {Promise<void>}
 */
const getChampData = async hrefs => {
  // const champions = JSON.parse(await fs.readFile('json/champions.json'));

  // Scrapes one page into a { champName, skins } record.
  const toChampion = async ({ href }) => {
    const { data } = await axios(href);
    const $ = cheerio.load(data);
    const champName = $('.style__Title-sc-14gxj1e-3 span').text();
    const skins = [];
    $('.style__CarouselItemText-sc-1tlyqoa-16').each((_, node) => {
      skins.push($(node).text());
    });
    const champion = { champName, skins };
    console.log(champion);
    return champion;
  };

  try {
    const champsList = await pMap(hrefs, toChampion);
    const serialised = JSON.stringify(champsList);
    await fs.writeFile('champions-with-skins-list.json', serialised);
  } catch (err) {
    console.error(err.message);
  }
};
The return is missing in the error case. It looks like an issue with one of the URLs being fetched.
// Variant of getChampData whose per-URL handler returns a value from the catch
// branch, so a failed request still yields an entry in the results.
const getChampData = async hrefs => {
// One promise per href; errors thrown inside the try block are caught per request.
const requests = hrefs.map(async ({ href }) => {
try {
const html = await axios.get(href);
// rest of the code
} catch (err) {
console.error(err);
// A failed request contributes an empty array to the results instead of undefined.
return []
}
});
// Waits for every per-URL promise, then persists and returns the combined list.
const results = await Promise.all(requests);
await fs.writeFile("json/champions-skins.json", JSON.stringify(results));
return results;
};