I have a script that is pulling 25,000 records from AWS Athena which is basically a PrestoDB Relational SQL Database. Lets say that I'm generating a request for each one of these records, which means I have to make 25,000 requests to Athena, then when the data comes back I have to make 25,000 requests to my Redis Cluster.
What would be the ideal amount of requests to make at one time from node to Athena?
The reason I ask is because I tried to do this by creating an array of 25,000 promises and then calling Promise.all(promiseArray) on it, but the app just hanged forever.
So I decided instead to fire off 1 at a time and use recursion to splice the first index out and then pass the remaining records to the calling function after the promise has been resolved.
The problem with this is that it takes forever. I took about an hour break and came back and there were 23,000 records remaining.
I tried to google how many requests Node and Athena can handle at once, but I came up with nothing. I'm hoping someone might know something about this and be able to share it with me.
Thank you.
Here is my code just for reference:
As a sidenote, what I would like to do differently is instead of sending one request at a time I could send 4, 5, 6, 7 or 8 at a time depending on how fast it would execute.
Also, how would a Node cluster effect the performance of something like this?
exports.storeDomainTrends = () => {
return new Promise((resolve, reject)=>{
athenaClient.execute(`SELECT DISTINCT the_column from "the_db"."the_table"`,
(err, data) => {
var getAndStoreDomainData = (records) => {
if(records.length){
return new promise((resolve, reject) => {
var subrecords = records.splice(0, )[0]
athenaClient.execute(`
SELECT
field,
field,
field,
SUM(field) as field
FROM "the_db"."the_table"
WHERE the_field IN ('Month') AND the_field = '`+ record.domain_name +`'
GROUP BY the_field, the_field, the_field
`, (err, domainTrend) => {
if(err) {
console.log(err)
reject(err)
}
redisClient.set(('Some String' + domainTrend[0].domain_name), JSON.stringify(domainTrend))
resolve(domainTrend);
})
})
.then(res => {
getAndStoreDomainData(records);
})
}
}
getAndStoreDomainData(data);
})
})
}
Using the lib your code could look something like this:
const Fail = function(reason){this.reason=reason;};
const isFail = x=>(x&&x.constructor)===Fail;
const distinctDomains = () =>
new Promise(
(resolve,reject)=>
athenaClient.execute(
`SELECT DISTINCT domain_name from "endpoint_dm"."bd_mb3_global_endpoints"`,
(err,data)=>
(err)
? reject(err)
: resolve(data)
)
);
const domainDetails = domain_name =>
new Promise(
(resolve,reject)=>
athenaClient.execute(
`SELECT
timeframe_end_date,
agg_type,
domain_name,
SUM(endpoint_count) as endpoint_count
FROM "endpoint_dm"."bd_mb3_global_endpoints"
WHERE agg_type IN ('Month') AND domain_name = '${domain_name}'
GROUP BY timeframe_end_date, agg_type, domain_name`,
(err, domainTrend) =>
(err)
? reject(err)
: resolve(domainTrend)
)
);
const redisSet = keyValue =>
new Promise(
(resolve,reject)=>
redisClient.set(
keyValue,
(err,res)=>
(err)
? reject(err)
: resolve(res)
)
);
const process = batchSize => limitFn => resolveValue => domains =>
Promise.all(
domains.slice(0,batchSize)
.map(//map domains to promises
domain=>
//maximum 5 active connections
limitFn(domainName=>domainDetails(domainName))(domain.domain_name)
.then(
domainTrend=>
//the redis client documentation makes no sense whatsoever
//https://redis.io/commands/set
//no mention of a callback
//https://github.com/NodeRedis/node_redis
//mentions a callback, since we need the return value
//and best to do it async we will use callback to promise
redisSet([
`Endpoint Profiles - Checkin Trend by Domain - Monthly - ${domainTrend[0].domain_name}`,
JSON.stringify(domainTrend)
])
)
.then(
redisReply=>{
//here is where things get unpredictable, set is documented as
// a synchronous function returning "OK" or a function that
// takes a callback but no mention of what that callback recieves
// as response, you should try with one or two records to
// finish this on reverse engineering because documentation
// fails 100% here and can not be relied uppon.
console.log("bad documentation of redis client... reply is:",redisReply);
(redisReply==="OK")
? domain
: Promise.reject(`Redis reply not OK:${redisReply}`)
}
)
.catch(//catch failed, save error and domain of failed item
e=>
new Fail([e,domain])
)
)
).then(
results=>{
console.log(`got ${batchSize} results`);
const left = domains.slice(batchSize);
if(left.length===0){//nothing left
return resolveValue.conat(results);
}
//recursively call process untill done
return process(batchSize)(limitFn)(resolveValue.concat(results))(left)
}
);
const max5 = lib.throttle(5);//max 5 active connections to athena
distinctDomains()//you may want to limit the results to 50 for testing
//you may want to limit batch size to 10 for testing
.then(process(1000)(max5)([]))//we have 25000 domains here
.then(
results=>{//have 25000 results
const successes = results.filter(x=>!isFail(x));
//array of failed items, a failed item has a .reason property
// that is an array of 2 items: [the error, domain]
const failed = results.filter(isFail);
}
)
You should figure out what redis client does, I tried to figure it out using the documentation but may as well ask my goldfish. Once you've reverse engineered the client behavior it is best to try with small batch size to see if there are any errors. You have to import lib to use it, you can find it here.
I was able to take what Kevin B said to find a much quicker way to query the data. What I did was change the query so that I could get the trend for all domains from Athena. I ordered it by domain_name and then sent it as a Node stream so that I could separate out each domain name into it's own JSON as the data was coming in.
Anyways this is what I ended up with.
exports.storeDomainTrends = () => {
return new Promise((resolve, reject)=>{
var streamObj = athenaClient.execute(`
SELECT field,
field,
field,
SUM(field) AS field
FROM "db"."table"
WHERE field IN ('Month')
GROUP BY field, field, field
ORDER BY field desc`).toStream();
var data = [];
streamObj.on('data', (record)=>{
if (!data.length || record.field === data[0].field){
data.push(record)
} else if (data[0].field !== record.field){
redisClient.set(('Key'), JSON.stringify(data))
data = [record]
}
})
streamObj.on('end', resolve);
streamObj.on('error', reject);
})
.then()
}
Related
I am trying to append numbers that I get from an api call (a promise) into an array. When I test the array's length it's always returning 1 as if each api call resets the array and puts in a new number.
here's the code:
The API call
wiki()
.page("COVID-19_pandemic_in_Algeria")
.then((page) => page.fullInfo())
.then((info) => {
(data.confirmed.value = info.general.confirmedCases),
(data.recovered.value = info.general.recoveryCases),
(data.deaths.value = info.general.deaths);
});
const data = {
confirmed: { value: 0 },
deaths: { value: 0 },
recovered: { value: 0 },
};
Now I want to put the deaths count into an array, so that I have a list of numbers over the next days to keep track of.
function countStats() {
const counter = [];
var deathCounter = data.deaths.value;
counter.push(deathCounter);
console.log(counter.length);
return counter;
}
countStats();
every time the functions run (wiki() and countStats()) the counter array's length is always 1. Why is that?
Unless ...
the data source provides multi-day data, or
you are going to run an extremely long javascript session (which is impractical and unsafe),
... then javascript can't, on its own, meet the objective of processing/displaying data arising from multiple days'.
Let's assume that the data source provides data that is correct for the current day.
You will need a permanent data store, in which scraped data can be accumulated, and retreived on demand. Exactly what you choose for your permanent data store is dependant on the environment in which you propose to run your javascript (essentially client-side browser or server-side NODE), and that choice is beyond the scope of this question.
Your master function might be something like this ...
function fetchCurrentDataAndRenderAll() {
return fetchCurrentData()
.then(writeToFile)
.then(readAllFromFile)
.then(data => {
// Here, you have the multi-day data that you want.
return renderData(data); // let's assume the data is to be rendered, say as a graph.
})
.catch(error => {
// something went wrong
console.log(error);
throw error;
});
}
... and the supporting functions might be something like this:
function fetchCurrentData() {
return wiki() // as given in the question ...
.page("COVID-19_pandemic_in_Algeria")
.then(page => page.fullInfo())
.then(info => ({
'timeStamp': Date.now(), // you will most likely need to timestamp the data
'confirmed': info.general.confirmedCases,
'recovered': info.general.recoveryCases,
'deaths': info.general.deaths
}));
}
function writeToFile(scrapedData) {
// you need to write this ...
// return Promise.
}
function readAllFromFile() {
// you need to write this ...
// return Promise.
}
function renderData(data) {
// you need to write this ...
// optionally: return Promise (necessary if rendering is asynchronous).
}
You can use Promise.all(). I take it that you'll not be requesting the same page 10 times but requesting a different page in each call e.g. const Pages = ['COVID-19_pandemic_in_Algeria','page2','page3','page4','page5','page6','page7','page8','page9','page10']. Then you could make the 10 calls as follows:
//const wiki = ......
const Pages = ['COVID-19_pandemic_in_Algeria','page2','page3','page4','page5','page6','page7','page8','page9','page10'];
let counter = [];
Promise.all(
Pages.map(Page => wiki().page(Page))
)
.then(results => {
for (page of results) {
let infoGeneral = page.fullInfo().general;
counter.push(infoGeneral.deaths);
}
console.log( counter.length ); //10
console.log( counter ); //[10 deaths results one for each page]
})
.catch(err => console.log(err.message));
I have about 650 products and each product has a lot of additional information relating to it being stored in metafields. I need all the metafield info to be stored in an array so I can filter through certain bits of info and display it to the user.
In order to get all the metafiled data, you need to make one API call per product using the product id like so: /admin/products/#productid#/metafields.json
So what I have done is got all the product ids then ran a 'for in loop' and made one call at a time. The problem is I run into a '429 error' because I end up making more than 2 requests per second. Is there any way to get around this like with some sort of queuing system?
let products = []
let requestOne = `/admin/products.json?page=1&limit=250`
let requestTwo = `/admin/products.json?page=2&limit=250`
let requestThree = `/admin/products.json?page=3&limit=250`
// let allProducts will return an array with all products
let allProducts
let allMetaFields = []
let merge
$(document).ready(function () {
axios
.all([
axios.get(`${requestOne}`),
axios.get(`${requestTwo}`),
axios.get(`${requestThree}`),
])
.then(
axios.spread((firstResponse, secondResponse, thirdResponse) => {
products.push(
firstResponse.data.products,
secondResponse.data.products,
thirdResponse.data.products
)
})
)
.then(() => {
// all 3 responses into one array
allProducts = [].concat.apply([], products)
})
.then(function () {
for (const element in allProducts) {
axios
.get(
`/admin/products/${allProducts[element].id}/metafields.json`
)
.then(function (response) {
let metafieldsResponse = response.data.metafields
allMetaFields.push(metafieldsResponse)
})
}
})
.then(function () {
console.log("allProducts: " + allProducts)
console.log("allProducts: " + allMetaFields)
})
.catch((error) => console.log(error))
})
When you hit 429 error, check for Retry-After header and wait for the number of seconds specified there.
You can also use X-Shopify-Shop-Api-Call-Limit header in each response to understand how many requests left until you exceed the bucket size limit.
See more details here: REST Admin API rate limits
By the way, you're using page-based pagination which is deprecated and will become unavailable soon.
Use cursor-based pagination instead.
I have a list of id and for each of them I'm fetching the corresponding item from a DynamoDB table using GetItem.
The thing is some ids are not present.
My question is: Let's say if I go through my list, there are 5000 ids that doesn't match any item in table, and I make each call with a 2 seconds delay between each of them.
What should I expect to happen to my table ?
const dynamo = new AWS.DynamoDB.DocumentClient();
const getItem = (key) => {
const getParams = {
TableName: 'my-table',
Key: {
id: key
}
};
return dynamo
.get(getParams)
.promise()
.then(result => {
const item = result.Item;
if(item){
return Promise.resolve(item);
}
return Promise.reject();
}).catch(error => {
console.log('Could not retrieve item with id', key);
return Promise.reject(error);
});
};
Well nothing will happen to DynamoDB Table. It will still serve normally. It's very scalable and fast. But, attention here if you missed that. This might trick you into increasing costs ->
If you perform a read operation on an item that does not exist,
DynamoDB still consumes provisioned read throughput: A strongly
consistent read request consumes one read capacity unit, while an
eventually consistent read request consumes 0.5 of a read capacity
unit.
See here
please apologize the unprecise title for this question, I am not an experienced programmer and even less so in node.js
My intent is a simple one: I want to use the bitfinex-api-node package (a node.js wrapper for bitfinex cryptocurrency exchange) that I installed via npm to read price data of various currency-pairs from the exchange to calculate better trading strategies.
The example code provided in the readme.md works fine, this is a stripped down version that creates a BFX-object which subscribes to a ticker of a given currency-pair and constantly outputs ticker-data:
const BFX = require('bitfinex-api-node')
const API_KEY = 'secret'
const API_SECRET = 'secret'
const opts = {
version: 2,
transform: true
}
const bws = new BFX(API_KEY, API_SECRET, opts).ws
bws.on('open', () => {
bws.subscribeTicker('BTCUSD')
})
bws.on('ticker', (pair, ticker) => {
console.log('Ticker:', ticker)
})
bws.on('error', console.error)
so far so good. Now for the sake of a simple example let's say I want to get the current price of two currency pairs (BTC/USD, ETH/USD) and add them an display the result. My obviously naive approach is like this:
const BFX = require('bitfinex-api-node')
const API_KEY = 'secret'
const API_SECRET = 'secret'
const opts = {
version: 2,
transform: true
}
const bws1 = new BFX(API_KEY, API_SECRET, opts).ws
const bws2 = new BFX(API_KEY, API_SECRET, opts).ws
var priceBTCUSD;
var priceETHBTC;
bws1.on('open', () => {
bws1.subscribeTicker('BTCUSD')
})
bws2.on('open', () => {
bws2.subscribeTicker('ETHUSD')
})
bws1.on('ticker', (pair, ticker) => {
//console.log('Ticker1:', ticker.LAST_PRICE)
priceBTCUSD = ticker.LAST_PRICE
})
bws2.on('ticker', (pair, ticker) => {
//console.log('Ticker2:', ticker.LAST_PRICE)
priceETHBTC = ticker.LAST_PRICE
})
bws1.on('error', console.error)
bws2.on('error', console.error)
//HERE IT COMES:
console.log(priceBTCUSD+priceETHBTC)
where the resulting output of the last line is "NaN". It seems the last line that logs the desired result to the console is executed before the BFX-objects establish a connection and receive any data.
How do I set this up properly? How can I retrieve data from the received data-stream? Do I really need a BFX-websocket object per currency pair? How would I read the price-data once, close down the websocket connection (which is not needed after reading the price once) and reconnect to read the price for a different currency pair?
Thank you! Feel free to request more data if my question isn't clear enough.
Kind regards,
s
Oh, your console.log is too soon there. Try this (I skipped a few lines):
bws1.on('ticker', (pair, ticker) => {
//console.log('Ticker1:', ticker.LAST_PRICE)
priceBTCUSD = ticker.LAST_PRICE;
printResults();
})
bws2.on('ticker', (pair, ticker) => {
//console.log('Ticker2:', ticker.LAST_PRICE)
priceETHBTC = ticker.LAST_PRICE
printResults();
})
bws1.on('error', console.error)
bws2.on('error', console.error)
//HERE IT COMES:
function printResults() {
if (priceBTCUSD && priceETHBTC)
console.log(priceBTCUSD+priceETHBTC)
}
Now, this is not the best approach, but it gets you of the ground. A better way is to have both prices asked on the same websocket, so when you get both prices back, call this function to calculate your results.
Background
I'm trying to construct an observable stream of values from the Stash Rest Api of pull requests. Unfortunately, the information of whether or not a PR has merge conflicts is available at a different endpoint to the list of merges.
The list of open pull requests is visible at, say, http://my.stash.com/rest/api/1.0/projects/myproject/repos/myrepo/pull-requests
For each PR, the data on merge conflicts is visible at http://my.stash.com/rest/api/1.0/projects/myproject/repos/myrepo/pull-requests/[PR-ID]/merge
Using the atlas-stash package, I can create and subscribe to an observable stream of pull requests (updated every second):
let pullRequestsObs = Rx.Observable.create(function(o) {
stash.pullRequests(project, repo)
.on('error', function(error) {o.onError(error)})
.on('allPages', function(data) {
o.onNext(data);
o.onCompleted();
});
});
let pullRequestStream = pullRequestsObs
.take(1)
.merge(
Rx.Observable
.interval(1000)
.flatMapLatest(pullRequestsObs)
);
pullRequestsStream.subscribe(
(data) => {
console.log(data)
// do something with data
},
(error) => log.error(error),
() => log.info('done')
);
This works as I want and expect. In the end, the pullRequestsStream is an observable whose values are lists of JSON objects.
My Goal
I would like the pullRequestsStream values to be updated so every element of the list includes information from the [PR-ID]/merge api.
I assume that this can be achieved using a map on pullRequestsStream, but I'm not succeeding in doing this.
let pullRequestWithMergeStream = pullRequestStream.map(function(prlist) {
_.map(prlist, function(pr) {
let mergeObs = Rx.Observable.create(function(o) {
stash.pullRequestMerge(project, repo, pr['id'])
.on('error', function(error) {o.onError(error)})
.on('newPage', function(data) {
o.onNext(data);
o.onCompleted();
}).take(1);
});
mergeObs.subscribe(
(data) => {
pr['merge'] = data;
return pr; // this definitely isn't right
},
(error) => log.error(error),
() => log.info('done')
);
});
});
With a bit of logging, I can see that both the pull-request and the merge apis are being hit correctly, but when I subscribe to pullRequestWithMergeStream I
get undefined values.
Using return within the the subscribe step within a map doesn't work (and doesn't seem like it should) but I can't figure out what pattern/idiom would achieve what I want.
Is there a correct way of doing this? Have I gone completely down the wrong track?
tl;dr
Can I update values from an Rxjs.Observable with information from a different observable?
You could use flatMap or concatMap to have one task trigger another one. You could use forkJoin to request the merges in parallel and collect the result in one place. It is not tested, but it should go like this :
pullRequestStream.concatMap(function (prlist){
var arrayRequestMerge = prlist.map(function(pr){
return Rx.Observable.create(function(o) {...same as your code});
});
return Rx.Observable.forkJoin(arrayRequestMerge)
.do(function(arrayData){
prlist.map(function(pr, index){pr['merge']=arrayData[index]
})})
.map(function(){return prlist})
})
PS : I supposed prlist was an array.
UPDATE
Following your comment, here is a version that will run only maxConcurrent calls in parallels.
pullRequestStream.concatMap(function (prlist){
var arrayRequestMerge = prlist.map(function(pr, index){
return Rx.Observable.create(function(o) {
stash.pullRequestMerge(project, repo, pr['id'])
.on('error', function(error) {o.onError(error)})
.on('newPage', function(data) {
o.onNext({data: data, index : index});
o.onCompleted();
}).take(1);
});
});
var maxConcurrent = 2;
Rx.Observable.from(arrayRequestMerge)
.merge(maxConcurrent)
.do(function(obj){
prlist[obj.index]['merge'] = obj.data
})})
.map(function(){return prlist})
})