How to call web service endpoints one after the other? - javascript

I want to implement a function in JavaScript which calls a series of web service endpoints and checks for a value in the response of each API call.
The first endpoint page should be called first, and a filter method should then pick out the specific object from the response. If the object is found, the process should stop and the object must be returned. However, if the object is not found in the first endpoint's response, the second endpoint must be called and the same process repeated until the object is found.
The Web service endpoint that I am working on is:
https://jsonmock.hackerrank.com/api/countries?page=1
This API returns a list of country data. The value of the page query parameter varies from 1 to 25. I need to call the endpoint and check for a specific country, going from page 1 to 25, until the country object is found.
I tried achieving this using JavaScript Promises and the Fetch API but couldn't think of a way to call the APIs one after the other.
I am really looking forward to your answer. Thank you in advance.

You can use async and await for this:
async function findCountry(country) {
  for (let page = 1; page < 26; page++) {
    console.log("page = " + page); // for debugging only
    let response = await fetch("https://jsonmock.hackerrank.com/api/countries?page=" + page);
    let {data} = await response.json();
    let obj = data.find(obj => obj.name == country);
    if (obj) return obj;
  }
}

let country = "Belgium";
findCountry(country).then(obj => {
  if (obj) {
    console.log("The capital of " + country + " is " + obj.capital);
  } else {
    console.log("Could not find " + country);
  }
});
If you know that the data is sorted by country name, then you could reduce the average number of requests by using a binary search.
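For illustration, a minimal sketch of that idea, assuming the mock API keeps the countries sorted by name across its pages (an assumption about the data, not something the API guarantees; the 25-page bound is taken from the question):

async function findCountryBinarySearch(country) {
  let lo = 1;
  let hi = 25; // the question states pages run from 1 to 25
  while (lo <= hi) {
    const page = Math.floor((lo + hi) / 2);
    const response = await fetch("https://jsonmock.hackerrank.com/api/countries?page=" + page);
    const {data} = await response.json();
    const match = data.find(obj => obj.name === country);
    if (match) return match;
    if (data.length === 0 || country < data[0].name) {
      hi = page - 1; // the country would sort before this page
    } else if (country > data[data.length - 1].name) {
      lo = page + 1; // the country would sort after this page
    } else {
      return undefined; // it would belong on this page but is not there
    }
  }
}

This cuts the worst case from 25 requests down to about 5, at the cost of relying on the sort order.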

Here's a way that you can do it.
const url = 'https://jsonmock.hackerrank.com/api/countries'

const fetchFromApi = async (countryName, page) => {
  const res = await fetch(`${url}?page=${page}`)
  return await res.json()
}

const getCountryFromResults = (countryName, data) => {
  const country = countryName.toLowerCase()
  return data.find(({name}) => name.toLowerCase() === country)
}

const findCountry = async (countryName) => {
  let page = 1;
  let totalPages = 1;
  while (page <= totalPages) {
    const res = await fetchFromApi(countryName, page);
    if (totalPages < res.total_pages) {
      totalPages = res.total_pages
    }
    const country = getCountryFromResults(countryName, res.data)
    if (country) {
      return country
    }
    page = page + 1
  }
};

(async () => {
  console.log(await findCountry("Afghanistan"))
  console.log(await findCountry("Argentina"))
})()

Related

Promise.all fetch continue executing after throwing error?

I am trying to fetch JSON data from the WordPress Developer Reference site. I need to search for a keyword without knowing whether it's a function, class, hook, or method, which determines part of the URL I need to fetch. So I'm using Promise.all to cycle through all the possible URLs. It works: if the response status is above 299 it throws the error immediately, and if the response is ok it continues to .then. Fine, but occasionally it returns an ok status even though the JSON only contains an empty array. So I need to check whether the JSON data is an empty array, which I can't seem to do in the first part; I can only check in the second part as far as I know. And if it throws the error, it doesn't continue trying the other URLs. Any suggestions?
var keyword = 'AtomParser';
const refs = ['function', 'hook', 'class', 'method'];

// Store the promises
let promises = [];

// Cycle through each type until we find one we're looking for
for (let t = 0; t < refs.length; t++) {
  const url =
    'https://developer.wordpress.org/wp-json/wp/v2/wp-parser-' +
    refs[t] +
    '?search=' +
    keyword;
  // console.log(url);
  promises.push(fetch(url));
}

Promise.all(promises)
  .then(function(response) {
    console.log(response[0]);
    // Get the status
    console.log('Status code: ' + response[0].status);
    if (response[0].status <= 299) {
      // The API call was successful!
      return response[0].json();
    } else {
      throw new Error('Broken link status code: ' + response[0].status);
    }
  })
  .then(function(data) {
    // This is the HTML from our response as a text string
    console.log(data);
    // Make sure we have data
    if (data.length == 0) {
      throw new Error('Empty Array');
    }
    // ref
    const reference = data[0];
    // Only continue if not null or empty
    if (reference !== null && reference !== undefined && data.length > 0) {
      // Success
      // Return what I want from the reference
    }
  })
  .catch(function handleError(error) {
    console.log('Error' + error);
  });
Is there some way to get the JSON data in the first part so I can check if it's in an array while I'm checking the response status?
I would recommend encapsulating the success / failure logic for individual requests, then you can determine all the resolved and rejected responses based on the result of that encapsulation.
For example
const checkKeyword = async (ref, keyword) => {
  const params = new URLSearchParams({ search: keyword });
  const res = await fetch(
    `https://developer.wordpress.org/wp-json/wp/v2/wp-parser-${encodeURIComponent(
      ref
    )}?${params}`
  );
  if (!res.ok) {
    throw new Error(`${res.status}: ${await res.text()}`);
  }
  const data = await res.json();
  if (data.length === 0) {
    throw new Error(`Empty results for '${ref}'`);
  }
  return { ref, data };
};
Now you can use something like Promise.any() or Promise.allSettled() to find the first successful request or all successful requests, respectively
const keyword = "AtomParser";
const refs = ["function", "hook", "class", "method"];
const promises = refs.map((ref) => checkKeyword(ref, keyword));
// First success
Promise.any(promises)
.then(({ ref, data }) => {
console.log(ref, data);
})
.catch(console.error);
// All successes
Promise.allSettled(promises)
.then((responses) =>
responses.reduce(
(arr, { status, value }) =>
status === "fulfilled" ? [...arr, value] : arr,
[]
)
)
.then((results) => {
// results has all the successful responses
});
For whatever reason I couldn't get Phil's answer to work, so I ended up doing the following, which works fine for me (this is for a Discord bot, in case you're wondering what the other stuff is about).
var keyword = 'AtomParser';
const refs = ['function', 'hook', 'class', 'method'];

// Store the successful result or error
let final: any[] = [];
let finalError = '';

// Cycle through each type until we find one we're looking for
for (let t = 0; t < refs.length; t++) {
  const url =
    'https://developer.wordpress.org/wp-json/wp/v2/wp-parser-' +
    refs[t] +
    '?search=' +
    keyword;
  console.log(url);
  // Try to fetch it
  await fetch(url)
    .then(function (response) {
      console.log(response);
      // Get the status
      console.log('Status code: ' + response.status);
      if (response.status > 299) {
        finalError = '`' + refs[t] + '` does not exist.';
        throw new Error(finalError);
      } else {
        // The API call was successful!
        return response.json();
      }
    })
    .then(function (data) {
      // This is the HTML from our response as a text string
      console.log(data);
      // Make sure we have data
      if (data.length == 0) {
        finalError = "Sorry, I couldn't find `" + keyword + '`';
        throw new Error(finalError);
      }
      // Only continue if not null or empty
      if (data[0] !== null && data[0] !== undefined && data.length > 0) {
        for (let d = 0; d < data.length; d++) {
          // Add it to the final array
          final.push(data[d]);
        }
      }
    })
    .catch(function handleError(error) {
      console.log(error);
    });
}

if (final.length > 0) {
  for (let f = 0; f < final.length; f++) {
    // ref
    const reference = final[f];
    // Get the link
    const link = reference.link;
    // Get the title
    var title = reference.title.rendered;
    title = title.replace('&gt;', '>'); // decode the escaped ">" in the title
    // Get the excerpt
    var excerpt = reference.excerpt.rendered;
    excerpt = excerpt.replace('<p>', '');
    excerpt = excerpt.replace('</p>', '');
    excerpt = excerpt.replace('<b>', '**');
    excerpt = excerpt.replace('</b>', '**');
    console.log(excerpt);
    message.reply(
      new discord.Embed({
        title: `${title}`,
        url: link,
        description: `${excerpt}\n\n`,
        footer: {
          text: `WordPress Developer Code Reference\nhttps://developer.wordpress.org/`,
        },
      })
    );
  }
} else if (finalError != '') {
  message.reply(finalError);
} else {
  message.reply('Something went wrong...');
}
wp module
Phil's answer puts you on the right track, but I want to expand on some of his ideas. Use of URLSearchParams is great, but you can improve by using the high-level URL API and forgo encodeURIComponent and constructing search params manually. Notice I'm putting this code in its own wp module so I can separate concerns more easily. We don't want all of this code leaking into your main program.
// wp.js
import { fetch } from "whatwg-fetch" // or your chosen implementation
const baseURL = "https://developer.wordpress.org"
async function search1(path, query) {
  const u = new URL(path, baseURL)
  u.searchParams.set("search", query)
  const result = await fetch(u)
  if (!result.ok) throw Error(`Search failed (${result.status}): ${u}`)
  return result.json()
}
search1 searches one path, but we can write search to search all the necessary paths. I don't think there's any reason to get fancy with each path here, so just write them out -
// wp.js (continued)
function search(query) {
  const endpoints = [
    "/wp-json/wp/v2/wp-parser-function",
    "/wp-json/wp/v2/wp-parser-hook",
    "/wp-json/wp/v2/wp-parser-class",
    "/wp-json/wp/v2/wp-parser-method"
  ]
  return Promise
    .all(endpoints.map(e => search1(e, query)))
    .then(results => results.flat())
}

export { search }
main module
Notice we only exported search as search1 is internal to the wp module. Let's see how we can use it in our main module now -
// main.js
import { search } from "./wp.js"
for (const result of await search("database"))
if(result.guid.rendered)
console.log(`${result.title.rendered}\n${result.guid.rendered}\n`)
In this example, we first search for "database" -
wp_should_replace_insecure_home_url()
https://developer.wordpress.org/reference/functions/wp_should_replace_insecure_home_url/
wp_delete_signup_on_user_delete()
https://developer.wordpress.org/reference/functions/wp_delete_signup_on_user_delete/
get_post_datetime()
https://developer.wordpress.org/reference/functions/get_post_datetime/
wp_ajax_health_check_get_sizes()
https://developer.wordpress.org/reference/functions/wp_ajax_health_check_get_sizes/
wp_should_replace_insecure_home_url
https://developer.wordpress.org/reference/hooks/wp_should_replace_insecure_home_url/
comments_pre_query
https://developer.wordpress.org/reference/hooks/comments_pre_query/
users_pre_query
https://developer.wordpress.org/reference/hooks/users_pre_query/
WP_Object_Cache
http://developer.wordpress.org/reference/classes/wp_object_cache/
wpdb
http://developer.wordpress.org/reference/classes/wpdb/
WP_REST_Menu_Items_Controller::prepare_item_for_database()
https://developer.wordpress.org/reference/classes/wp_rest_menu_items_controller/prepare_item_for_database/
WP_REST_Global_Styles_Controller::prepare_item_for_database()
https://developer.wordpress.org/reference/classes/wp_rest_global_styles_controller/prepare_item_for_database/
WP_REST_Menus_Controller::prepare_item_for_database()
https://developer.wordpress.org/reference/classes/wp_rest_menus_controller/prepare_item_for_database/
WP_REST_Templates_Controller::prepare_item_for_database()
https://developer.wordpress.org/reference/classes/wp_rest_templates_controller/prepare_item_for_database/
WP_REST_Application_Passwords_Controller::prepare_item_for_database()
https://developer.wordpress.org/reference/classes/wp_rest_application_passwords_controller/prepare_item_for_database/
wpdb::db_server_info()
https://developer.wordpress.org/reference/classes/wpdb/db_server_info/
WP_REST_Attachments_Controller::insert_attachment()
https://developer.wordpress.org/reference/classes/wp_rest_attachments_controller/insert_attachment/
WP_Debug_Data::get_database_size()
https://developer.wordpress.org/reference/classes/wp_debug_data/get_database_size/
WP_REST_Meta_Fields::update_multi_meta_value()
https://developer.wordpress.org/method/wp_rest_meta_fields/update_multi_meta_value/
another search example
Now let's search for "image" -
for (const result of await search("image"))
if(result.guid.rendered)
console.log(`${result.title.rendered}\n${result.guid.rendered}\n`)
get_adjacent_image_link()
https://developer.wordpress.org/reference/functions/get_adjacent_image_link/
get_next_image_link()
https://developer.wordpress.org/reference/functions/get_next_image_link/
get_previous_image_link()
https://developer.wordpress.org/reference/functions/get_previous_image_link/
wp_robots_max_image_preview_large()
https://developer.wordpress.org/reference/functions/wp_robots_max_image_preview_large/
wp_getimagesize()
https://developer.wordpress.org/reference/functions/wp_getimagesize/
is_gd_image()
https://developer.wordpress.org/reference/functions/is_gd_image/
wp_show_heic_upload_error()
https://developer.wordpress.org/reference/functions/wp_show_heic_upload_error/
wp_image_src_get_dimensions()
https://developer.wordpress.org/reference/functions/wp_image_src_get_dimensions/
wp_image_file_matches_image_meta()
https://developer.wordpress.org/reference/functions/wp_image_file_matches_image_meta/
_wp_check_existing_file_names()
https://developer.wordpress.org/reference/functions/_wp_check_existing_file_names/
edit_custom_thumbnail_sizes
https://developer.wordpress.org/reference/hooks/edit_custom_thumbnail_sizes/
get_header_image_tag_attributes
https://developer.wordpress.org/reference/hooks/get_header_image_tag_attributes/
image_editor_output_format
https://developer.wordpress.org/reference/hooks/image_editor_output_format/
wp_image_src_get_dimensions
https://developer.wordpress.org/reference/hooks/wp_image_src_get_dimensions/
wp_get_attachment_image
https://developer.wordpress.org/reference/hooks/wp_get_attachment_image/
image_sideload_extensions
https://developer.wordpress.org/reference/hooks/image_sideload_extensions/
wp_edited_image_metadata
https://developer.wordpress.org/reference/hooks/wp_edited_image_metadata/
wp_img_tag_add_loading_attr
https://developer.wordpress.org/reference/hooks/wp_img_tag_add_loading_attr/
wp_image_file_matches_image_meta
https://developer.wordpress.org/reference/hooks/wp_image_file_matches_image_meta/
get_custom_logo_image_attributes
https://developer.wordpress.org/reference/hooks/get_custom_logo_image_attributes/
Custom_Image_Header
http://developer.wordpress.org/reference/classes/custom_image_header/
WP_Image_Editor_Imagick
http://developer.wordpress.org/reference/classes/wp_image_editor_imagick/
WP_Embed
http://developer.wordpress.org/reference/classes/wp_embed/
WP_Image_Editor
http://developer.wordpress.org/reference/classes/wp_image_editor/
WP_Customize_Background_Image_Setting
http://developer.wordpress.org/reference/classes/wp_customize_background_image_setting/
WP_Customize_Header_Image_Setting
http://developer.wordpress.org/reference/classes/wp_customize_header_image_setting/
WP_Image_Editor_GD
http://developer.wordpress.org/reference/classes/wp_image_editor_gd/
WP_Customize_Header_Image_Control
http://developer.wordpress.org/reference/classes/wp_customize_header_image_control/
WP_REST_Server::add_image_to_index()
https://developer.wordpress.org/reference/classes/wp_rest_server/add_image_to_index/
WP_REST_URL_Details_Controller::get_image()
https://developer.wordpress.org/reference/classes/wp_rest_url_details_controller/get_image/
WP_Image_Editor::get_default_quality()
https://developer.wordpress.org/reference/classes/wp_image_editor/get_default_quality/
WP_Theme_JSON::get_blocks_metadata()
https://developer.wordpress.org/reference/classes/wp_theme_json/get_blocks_metadata/
WP_Image_Editor_Imagick::pdf_load_source()
https://developer.wordpress.org/reference/classes/wp_image_editor_imagick/pdf_load_source/
WP_Image_Editor_Imagick::write_image()
https://developer.wordpress.org/reference/classes/wp_image_editor_imagick/write_image/
WP_Image_Editor_Imagick::maybe_exif_rotate()
https://developer.wordpress.org/reference/classes/wp_image_editor_imagick/maybe_exif_rotate/
WP_Image_Editor_Imagick::make_subsize()
https://developer.wordpress.org/reference/classes/wp_image_editor_imagick/make_subsize/
WP_Image_Editor_GD::make_subsize()
https://developer.wordpress.org/reference/classes/wp_image_editor_gd/make_subsize/
empty search result
Searching for "zzz" will yield no results -
for (const result of await search("zzz"))
if(result.guid.rendered)
console.log(`${result.title.rendered}\n${result.guid.rendered}\n`)
<empty result>

Google Apps Script Working on backend but not on sheets

I am trying to create a script that pulls from the CoinMarketCap API and displays the current price. The script is working fine on the back end when I assign the variable a value. However, when I try to run the function in Sheets, the returned value is null.
function marketview(ticker) {
  var url = "https://pro-api.coinmarketcap.com/v1/cryptocurrency/quotes/latest?CMC_PRO_API_KEY=XXX&symbol=" + ticker;
  var data = UrlFetchApp.fetch(url);
  const jsondata = JSON.parse(data);
  Logger.log(jsondata.data[ticker].quote['USD'].price)
}
My execution logs show that the scripts are running, but when I use the function and try to quote ETH, for example, the script runs for BTC.
When I do this on the back end and assign ETH, the script works fine and returns the right quote. Any ideas on what I'm missing?
I did the same with the CoinGecko API and had an issue with all my requests being rejected with a quota exceeded error.
I understood that Google Sheets server IP addresses were already spamming the CoinGecko server (I was obviously not the only one to try this).
This is why I used an external service like apify.com to pull the data and re-expose it over their API.
This is my Apps Script coingecko.gs:
/**
 * get latest coingecko market prices dataset
 */
async function GET_COINGECKO_PRICES(key, actor) {
  const coinGeckoUrl = `https://api.apify.com/v2/acts/${actor}/runs/last/dataset/items?token=${key}&status=SUCCEEDED`
  return ImportJSON(coinGeckoUrl);
}
You need the ImportJSON function, available here: https://github.com/bradjasper/ImportJSON/blob/master/ImportJSON.gs
Then in a cell I write =GET_COINGECKO_PRICES(APIFY_API_KEY,APIFY_COINGECKO_MARKET_PRICES); you will have to create two named fields called APIFY_API_KEY and APIFY_COINGECKO_MARKET_PRICES in order for this to work.
Then register on apify.com; you'll have to create an actor by forking the apify-webscraper actor.
I set the StartURLs to https://api.coingecko.com/api/v3/coins/list. This gives me the total number of existing cryptos (approx. 11,000 as of today) and the number of pages, so I can run the requests concurrently (the rate limit is 10 concurrent requests on CoinGecko). Then I just replace /list with /markets and set the proper limit to get all the pages I need.
I use the following for the task's page function:
async function pageFunction(context) {
  let marketPrices = [];
  const ENABLE_CONCURRENCY_BATCH = true;
  const PRICE_CHANGE_PERCENTAGE = ['1h', '24h', '7d'];
  const MAX_PAGE_TO_SCRAP = 10;
  const MAX_PER_PAGE = 250;
  const MAX_CONCURRENCY_BATCH_LIMIT = 10;

  await context.WaitFor(5000);

  const cryptoList = readJson();
  const totalPage = Math.ceil(cryptoList.length / MAX_PER_PAGE);
  context.log.info(`[Coingecko total cryptos count: ${cryptoList.length} (${totalPage} pages)]`)

  function readJson() {
    try {
      const preEl = document.querySelector('body > pre');
      return JSON.parse(preEl.innerText);
    } catch (error) {
      throw Error(`Failed to read JSON: ${error.message}`)
    }
  }

  async function loadPage($page) {
    try {
      const params = {
        vs_currency: 'usd',
        page: $page,
        per_page: MAX_PER_PAGE,
        price_change_percentage: PRICE_CHANGE_PERCENTAGE.join(','),
        sparkline: true,
      }
      let pageUrl = `${context.request.url.replace(/\/list$/, '/markets')}?`;
      pageUrl += [
        `vs_currency=${params.vs_currency}`,
        `page=${params.page}`,
        `per_page=${params.per_page}`,
        `price_change_percentage=${params.price_change_percentage}`,
      ].join('&');
      context.log.info(`GET page ${params.page} URL: ${pageUrl}`);
      const page = await fetch(pageUrl).then((response) => response.json());
      context.log.info(`Done GET page ${params.page} size ${page.length}`);
      marketPrices = [...marketPrices, ...page];
      return page
    } catch (error) {
      throw Error(`Fail to load page ${$page}: ${error.message}`)
    }
  }

  try {
    if (ENABLE_CONCURRENCY_BATCH) {
      const fetchers = Array.from({ length: totalPage }).map((_, i) => {
        const pageIndex = i + 1;
        if (pageIndex > MAX_PAGE_TO_SCRAP) {
          return null;
        }
        return () => loadPage(pageIndex);
      }).filter(Boolean);
      while (fetchers.length) {
        await Promise.all(
          fetchers.splice(0, MAX_CONCURRENCY_BATCH_LIMIT).map((f) => f())
        );
      }
    } else {
      let pageIndex = 1
      let page = await loadPage(pageIndex)
      while (page.length !== 0 && pageIndex <= MAX_PAGE_TO_SCRAP) {
        pageIndex += 1
        page = await loadPage(pageIndex)
      }
    }
  } catch (error) {
    context.log.info(`Fetchers failed: ${error.message}`);
  }

  context.log.info(`End: Updated ${marketPrices.length} prices for ${cryptoList.length} cryptos`);
  const data = marketPrices.sort((a, b) => a.id.toLowerCase() > b.id.toLowerCase() ? 1 : -1);
  context.log.info(JSON.stringify(data.find((item) => item.id.toLowerCase() === 'bitcoin')));

  function sanitizer(item) {
    item.symbol = item.symbol.toUpperCase()
    return item;
  }

  return data.map(sanitizer)
}
I presume you are hitting the same issue I had with CoinMarketCap, and that you could do the same with it.
You're not returning anything to the sheet, just logging it. Return it:
return jsondata.data[ticker].quote['USD'].price
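Put together, the custom function would look like this (a sketch based on the code in the question; the API key placeholder is left unchanged):

function marketview(ticker) {
  var url = "https://pro-api.coinmarketcap.com/v1/cryptocurrency/quotes/latest?CMC_PRO_API_KEY=XXX&symbol=" + ticker;
  var data = UrlFetchApp.fetch(url);
  const jsondata = JSON.parse(data);
  // Return the price so it shows up in the cell instead of only in the execution logs
  return jsondata.data[ticker].quote['USD'].price;
}

Called from a cell as =marketview("ETH"), the sheet then displays the returned price.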

How to get an item's value from JSON

I am getting JSON from an API GET call. I am trying to get the value of an item which I think is an array. I am trying to console.log the low price from the JSON.
I tried to cycle through it like an array, e.g. open.openDate.btcusd[5] and so on.
//JSON DATA FROM API
btcusd":{
"high":"9206.36",
"low":"8804.57",
"volume":"1291.122483",
"last":"8989.64",
"bid":"8987.88",
"ask":"8998.24"
//Call
coin.getOpen()
  .then(data => {
    coin.ui(data);
  });

//Function
async getOpen() {
  const openres = await fetch(`https://api.lakebtc.com/api_v2/ticker`);
  const openBtc = await openres.json();
  return {
    openDate: openBtc
  }
}

//New Function to console.log
ui(open) {
  console.log(open.openDate.btcusd); //I want the low value
}
Call this function like this:
async function getOpen() {
  const openres = await fetch(`https://api.lakebtc.com/api_v2/ticker`);
  const openBtc = await openres.json();
  // console.log(openBtc.btcusd.low);
  return {
    openDate: openBtc
  }
}

// function call
getOpen()
  .then(res => console.log(res.openDate.btcusd.low))
  .catch(err => console.error(err))
I didn't completely get your requirement here, but the response is an object, not an array.
We can convert it to an array and print the low values for the different entries like this:
async function getOpen() {
  const openres = await fetch(`https://api.lakebtc.com/api_v2/ticker`);
  const openBtc = await openres.json();
  return {
    openDate: openBtc
  }
};

getOpen().then(data => {
  ui(data);
});

function ui(obj) {
  var arr = Object.entries(obj.openDate);
  var lowValues = arr.map(d => console.log(d[0] + " value of low is " + d[1].low));
}

Batch get DocumentReferences?

I'm trying to improve a Firestore get function. I have something like:
return admin.firestore().collection("submissions").get().then(
  async (x) => {
    var toRet: any = [];
    for (var i = 0; i < 10; i++) {
      try {
        var hasMedia = x.docs[i].data()['mediaRef'];
        if (hasMedia != null) {
          var docData = (await x.docs[i].data()) as MediaSubmission;
          let submission: MediaSubmission = new MediaSubmission();
          submission.author = x.docs[i].data()['author'];
          submission.description = x.docs[i].data()['description'];
          var mediaRef = await admin.firestore().doc(docData.mediaRef).get();
          submission.media = mediaRef.data() as MediaData;
          toRet.push(submission);
        }
      }
      catch (e) {
        console.log("ERROR GETTING MEDIA: " + e);
      }
    }
    return res.status(200).send(toRet);
  });
The first get is fine, but the performance is worst on this line:
var mediaRef = await admin.firestore().doc(docData.mediaRef).get();
I think this is because the calls are not batched.
Would it be possible to do a batch get on an array of mediaRefs to improve performance?
Essentially I have a collection of documents with foreign references stored as strings pointing to paths in a separate collection, and getting those references has proven to be slow.
What about this? I did some refactoring to use more async/await code; hopefully my comments are helpful.
The main idea is to use Promise.all and await all the mediaRef retrievals at once.
async function test(req, res) {
  // get all docs
  const { docs } = await admin
    .firestore()
    .collection('submissions')
    .get();

  // get data property only of docs with mediaRef
  const datas = await Promise.all(
    docs.map(doc => doc.data()).filter(data => data.mediaRef),
  );

  // get all media in one batch - this is the important change
  const mediaRefs = await Promise.all(
    datas.map(({ mediaRef }) =>
      admin
        .firestore()
        .doc(mediaRef)
        .get(),
    ),
  );

  // create return object
  const toRet = datas.map((data: MediaSubmission, i) => {
    const submission = new MediaSubmission();
    submission.author = data.author;
    submission.description = data.description;
    submission.media = mediaRefs[i].data() as MediaData;
    return submission;
  });

  return res.status(200).send(toRet);
}
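A further option, assuming the installed Admin SDK exposes Firestore's getAll() (recent versions of @google-cloud/firestore do), is to hand all the document references to a single batched read instead of issuing one get() per reference. A rough sketch reusing the names above:

// Sketch only: assumes admin.firestore().getAll() is available in the SDK version in use
const refs = datas.map(({ mediaRef }) => admin.firestore().doc(mediaRef));
const mediaSnapshots = refs.length ? await admin.firestore().getAll(...refs) : [];
// mediaSnapshots[i].data() lines up with datas[i], just like mediaRefs[i] in the version above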

Web scraper iterating over pages with Rx.js

About a month ago I built this web scraper using async/await to collect info asynchronously. I'm trying to build that very same scraper again using Rx.js. I've read through the docs and it seems to make sense; starting off is the hardest bit, but after that hump I've made some progress.
You can see here that I get the first page on the site (page 0) and I need to use that page to get the count of pages (which is around 6000). I have that count, and using getPageURI(page) I can create each page URL. However, my issue is that I can't figure out how to trigger, fire, or pipe information back into the original pageRequestStream. I have this page count and I need a way to iterate over it, pushing the page URLs back into the original pageRequestStream.
import cheerio from 'cheerio'
import Rx from 'rx'
import fetch from 'isomorphic-fetch'

const DIGITAL_NYC_URI = 'http://www.digital.nyc'

let getPageURI = (page) => `${DIGITAL_NYC_URI}/startups?page=${page}`
let getProfileURI = (profile) => `${DIGITAL_NYC_URI}${profile}`

function fetchURL(stream, dataType = 'json') {
  return stream.flatMap(requestURL => {
    return Rx.Observable.fromPromise(fetch(requestURL).then(res => res[dataType]()))
  })
}

function getNumberOfPages($) {
  let summary = $('.result-summary').text()
  let match = summary.match(/Showing 1 - 20 of (\d+) Startups/)
  return parseInt(match[1], 10)
}

function getCompaniesOnPage($) {
  let companySelector = 'h3.node-title a'
  let companies = $(companySelector).map(function (i, el) {
    let name = $(this).text()
    let profile = $(this).attr('href')
    return {
      'name': name,
      'profile': profile
    }
  }).get()
  return companies
}

let pageRequestStream = Rx.Observable.just(getPageURI(0))
let pageResponseStream = fetchURL(pageRequestStream, 'text')
let parsedPageHTMLStream = pageResponseStream.map(html => cheerio.load(html))
let numberOfPagesStream = parsedPageHTMLStream.map(html => getNumberOfPages(html))

// not sure how to get this to iterate over count and fire url's into pageRequestStream
numberOfPagesStream.subscribe(pageCount => console.log(pageCount))

let companiesOnPageStream = parsedPageHTMLStream.flatMap(html => getCompaniesOnPage(html))

// not sure how to build up the company object to include async value company.profileHTML
companiesOnPageStream.subscribe(companies => console.log(companies))

// let companyProfileStream = companiesOnPageStream.map((company) => {
//   return fetch(getProfileURI(company.profile))
//     .then(res => res.html())
//     .then(html => {
//       company.profileHTML = html
//       return company
//     })
// })
Have a look at Subjects; they allow you to fire events as you go.
Maybe this can serve as some inspiration:
import cheerio from 'cheerio';
import Rx from 'rx';
import fetch from 'isomorphic-fetch';

function getCheerio(url) {
  var promise = fetch(url)
    .then(response => response.text())
    .then(body => cheerio.load(body));
  return Rx.Observable.fromPromise(promise);
}

const DIGITAL_NYC_URI = 'http://www.digital.nyc';
var pageRequest = new Rx.Subject();

pageRequest
  .flatMap(pageUrl => getCheerio(pageUrl))
  .flatMap(page$ => {
    // here we pipe back urls into our original observable.
    var nextPageUrl = page$('ul.pagination li.arrow a').attr('href');
    if (nextPageUrl) pageRequest.onNext(DIGITAL_NYC_URI + '/' + nextPageUrl);
    var profileUrls = page$('h3.node-title a')
      .map(function() {
        var url = page$(this).attr('href');
        return DIGITAL_NYC_URI + '/' + url;
      });
    return Rx.Observable.from(profileUrls);
  })
  .flatMap(url => getCheerio(url))
  .map(profile$ => {
    // build the company profile here
    return profile$('title').text();
  })
  .subscribe(value => console.log('profile ', value));

pageRequest.onNext(DIGITAL_NYC_URI + '/startups');
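As an alternative to a Subject, since the question's code already derives a page count, that count can be expanded into page URLs with Rx.Observable.range and fed through the same fetch pipeline. A sketch reusing getPageURI, fetchURL, getCompaniesOnPage, and numberOfPagesStream from the question, assuming numberOfPagesStream really does emit a page count as that code treats it:

// Sketch: expand the page count into a stream of page URLs
// (re-fetches page 0 for simplicity)
let pageURLStream = numberOfPagesStream.flatMap(pageCount =>
  Rx.Observable.range(0, pageCount).map(page => getPageURI(page))
)
let allCompaniesStream = fetchURL(pageURLStream, 'text')
  .map(html => cheerio.load(html))
  .flatMap(html => getCompaniesOnPage(html))
allCompaniesStream.subscribe(company => console.log(company))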
