I have a json like
{
"connectEnd": 1366.2749999930384,
"connectStart": 175.91999999422114,
"decodedBodySize": 3360,
"domComplete": 10424.984999990556,
"domContentLoadedEventEnd": 6581.454999992275,
"domContentLoadedEventStart": 6581.454999992275,
"domInteractive": 6581.420000002254,
"domainLookupEnd": 175.91999999422114,
"domainLookupStart": 12.000000002444722,
"duration": 10425.015000000712,
"encodedBodySize": 1279,
"entryType": "navigation",
"fetchStart": 0.22499999613501132,
"initiatorType": "navigation",
"loadEventEnd": 10425.015000000712,
"loadEventStart": 10424.994999993942,
"name": "https://something/login",
"nextHopProtocol": "http/1.1",
"redirectCount": 0,
"redirectEnd": 0,
"redirectStart": 0,
"requestStart": 1366.394999990007,
"responseEnd": 2062.7999999996973,
"responseStart": 2059.7599999891827,
"secureConnectionStart": 414.94000000238884,
"serverTiming": [],
"startTime": 0,
"transferSize": 2679,
"type": "navigate",
"unloadEventEnd": 0,
"unloadEventStart": 0,
"workerStart": 0,
"workerTiming": []
}
I used papaparse to convert JSON into csv and I am getting this:
"Request time","Time to first byte","Response time","Request Response time","Cache seek plus response time","Dom interactive","Dom complete","Transfer Size","duration","Domain lookup time taken","Connection time taken"
693.3649999991758,3.040000010514632,3.040000010514632,696.4050000096904,2062.5750000035623,6581.420000002254,10424.984999990556,2679,10425.015000000712,163.91999999177642,1190.3549999988172
I am planning to use Jenkins plugin called BenchMark Evaluator
This plugin accepts csv only in this format:
csv table image link
My problem statement: How do I change the structure of the parsed CSV into the desired CSV format? Is there an npm package that can give me directly what I want, or does the parsed CSV need to be converted?
A more elegant way to convert json to csv is to use the map function without any framework:
// Build a CSV string from json3.items without any external library.
var json = json3.items
// Column order comes from the keys of the first record.
var fields = Object.keys(json[0])
// JSON.stringify replacer: emit '' instead of null so empty cells stay empty.
var replacer = function (key, value) {
  return value === null ? '' : value
}
var csv = []
for (var i = 0; i < json.length; i++) {
  var cells = []
  for (var j = 0; j < fields.length; j++) {
    // stringify quotes strings but leaves numbers bare
    cells.push(JSON.stringify(json[i][fields[j]], replacer))
  }
  csv.push(cells.join(','))
}
csv.unshift(fields.join(',')) // add header column
csv = csv.join('\r\n');
console.log(csv)
Output:
title,description,link,timestamp,image,embed,language,user,user_image,user_link,user_id,geo,source,favicon,type,domain,id
"Apple iPhone 4S Sale Cancelled in Beijing Amid Chaos (Design You Trust)","Advertise here with BSA Apple cancelled its scheduled sale of iPhone 4S in one of its stores in China’s capital Beijing on January 13. Crowds outside the store in the Sanlitun district were waiting on queues overnight. There were incidents of scuffle between shoppers and the store’s security staff when shoppers, hundreds of them, were told that the sales [...]Source : Design You TrustExplore : iPhone, iPhone 4, Phone","http://wik.io/info/US/309201303","1326439500","","","","","","","","","wikio","http://wikio.com/favicon.ico","blogs","wik.io","2388575404943858468"
"Apple to halt sales of iPhone 4S in China (Fame Dubai Blog)","SHANGHAI – Apple Inc said on Friday it will stop selling its latest iPhone in its retail stores in Beijing and Shanghai to ensure the safety of its customers and employees. Go to SourceSource : Fame Dubai BlogExplore : iPhone, iPhone 4, Phone","http://wik.io/info/US/309198933","1326439320","","","","","","","","","wikio","http://wikio.com/favicon.ico","blogs","wik.io","16209851193593872066"
Use this less dense syntax and also JSON.stringify to add quotes to strings while keeping numbers unquoted:
// Same conversion in denser ES2015 style. JSON.stringify adds quotes to
// strings while keeping numbers unquoted.
const items = json3.items
// Map null to '' so empty values produce empty CSV cells.
const replacer = (key, value) => (value === null ? '' : value)
const header = Object.keys(items[0])
// One CSV line per record, fields in header order.
const rows = items.map((row) =>
  header.map((fieldName) => JSON.stringify(row[fieldName], replacer)).join(',')
)
const csv = [header.join(','), ...rows].join('\r\n')
console.log(csv)
Related
I used CRYPTOFINANCE.ai with Google Spreadsheets for some months now and I want to move on, to be able to get cryptocurrency data by "myself".
I discovered the CoinMarketCap API and that should do it. I succeeded in importing one or many quotes. Now I would like to import the full listings data so I can have all the prices updated, in order to get a realistic value of my portfolio.
Here is what I have now, but it isn't importing the full listings :
/**
 * Fetches the latest ETH quote from the CoinMarketCap API and writes the
 * USD price into cell B1 (row 1, column 2) of the sheet "Feuille 4".
 *
 * Fix: the original options object mixed in fields from the npm "request"
 * library (uri, qs, json, gzip) that Apps Script's UrlFetchApp does not
 * understand — so the start/limit/convert query string was never sent.
 * UrlFetchApp only honors params such as "method" and "headers"; query
 * parameters must be part of the URL itself.
 */
function price() {
  var sh1 = SpreadsheetApp.getActiveSpreadsheet().getSheetByName('Feuille 4');
  // Only UrlFetchApp-recognized options remain.
  var requestOptions = {
    'method': 'GET',
    'headers': {'X-CMC_PRO_API_KEY': '**********'}
  };
  // The symbol to quote is passed in the URL's query string.
  var url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/quotes/latest?symbol=ETH';
  var result = UrlFetchApp.fetch(url, requestOptions);
  var txt = result.getContentText();
  var d = JSON.parse(txt);
  // data.ETH.quote.USD.price is the current USD price in the API response.
  sh1.getRange(1, 2).setValue(d.data.ETH.quote.USD.price)
}
I know it has something to deal with: https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest but I couldn't figure it out by myself.
I am not familiar with this site but when I try the URL on its own it returns an error stating the API key is missing
{
"status": {
"timestamp": "2021-03-31T12:29:45.203Z",
"error_code": 1002,
"error_message": "API key missing.",
"elapsed": 0,
"credit_count": 0
}
}
Also, it looks like Yahoo Finance has crypto historical data for free, no API required, just make an appropriate web request. This one grabs BTC from March 31, 2020 thru March 31, 2021.
https://query1.finance.yahoo.com/v7/finance/download/BTC-USD?period1=1585662494&period2=1617198494&interval=1d&events=history&includeAdjustedClose=true
One would need to decipher the format for period1 and period2, I believe they are UNIX timestamps, I was able to confirm at this site:
https://www.unixtimestamp.com/
Then this code would download the data into the current sheet:
/**
 * Downloads Yahoo Finance's historical BTC-USD CSV and writes it into the
 * active sheet starting at A1.
 *
 * Fix: the original snippet was a broken copy-paste — the function's own
 * source had been pasted inside the csvUrl string literal, which is not
 * valid JavaScript. This is the intended single definition.
 */
function importCSVFromWeb() {
  // Provide the full URL of the CSV file.
  // period1/period2 are UNIX timestamps bounding the date range.
  var csvUrl = "https://query1.finance.yahoo.com/v7/finance/download/BTC-USD?period1=1585662494&period2=1617198494&interval=1d&events=history&includeAdjustedClose=true";
  var csvContent = UrlFetchApp.fetch(csvUrl).getContentText();
  var csvData = Utilities.parseCsv(csvContent);
  var sheet = SpreadsheetApp.getActiveSheet();
  // Write the whole parsed table in one batch call.
  sheet.getRange(1, 1, csvData.length, csvData[0].length).setValues(csvData);
}
BTW, it looks like Yahoo Finance gets their crypto data from CoinMarketCap
I was mixing up two requests: /listings and /quotes.
Listing gives you a list of multiple currencies.
Quotes gives you data from one coin only
I'm trying to display a graph with data from this .tsv file:
// Load GDP.tsv and draw a simple line chart with MetricsGraphics.
// Fix: the original snippet never closed the d3.tsv(...) call or its
// callback, leaving the parentheses unbalanced.
d3.tsv('GDP.tsv', function(data) {
  // Inspect every parsed row in the console.
  for (var i = 0; i < data.length; i++) {
    console.log(data[i]);
  }
  MG.data_graphic({
    title: "Line Chart",
    description: "This is a simple line chart. You can remove the area portion by adding area: false to the arguments list.",
    data: data, /*This is very probably part of the issue*/
    width: 240,
    height: 128,
    target: document.getElementById('graph-gdp'),
    x_accessor: 'value', /*This is very probably part of the issue*/
    y_accessor: 'time' /*This is very probably part of the issue*/
  });
}); // closes both the callback and the d3.tsv call
This is outputting each row as:
{na_item,unit,geo\time: "B1GQ,CP_MEUR,AL",
2005 : "6475.3 ",
2006 : "7090.8 ",
2007 : "7809.8 ",
2008 : "8800.3 ",
2009 : "8662.2 ",
2010 : "8996.6 ",
2011 : "9268.3 ",
2012 : "9585.8 ",
2013 : "9625.4 "(etc...)}
How would I display a single row, selected by its name ("B1GQ,CP_MEUR,AL", for example), and show that data on a graph, with x=year and y=value?
Sorry if this is a rather noobish question, but I'm new to .js, .tsv files, and web development in general. I've tried figuring this out by myself, but have been failing pathetically.
Once you loaded that TSV using d3.tsv, the first step is filtering the data array, getting only that row you chose:
// Keep only the row whose "na_item,unit,geo\time" column equals the series
// we want to plot; the backslash in the property name must be escaped.
var filtered = data.filter(d => d["na_item,unit,geo\\time"] == "B1GQ,CP_EUR_HAB,AL");
//escaping the backslash here ---------------------^
However, since that TSV has strange headers (and values as well...), remember to escape the backslash.
That filtered array has only one object, and you're not going too far away with it. Thus, the next step is converting it in an array of several objects, one for each data point, using d3.entries:
var selectedData = d3.entries(filtered[0]).filter(d => d.key != ["na_item,unit,geo\\time"])
With that array, you can create your graph.
Here is a demo code, the array is printed in the console ("key" is the year, that you're gonna use in the x axis, and "value" is the value, that you're gonna use in the y axis):
// Parse the embedded <pre> TSV, keep only the chosen series row, then turn
// that single object into an array of {key, value} pairs — key is the year
// (x axis) and value is the figure (y axis) — dropping the header cell.
data = d3.tsvParse(d3.select("#tsv").text());
var filtered = data.filter(function (row) {
  return row["na_item,unit,geo\\time"] == "B1GQ,CP_EUR_HAB,AL";
});
var selectedData = d3.entries(filtered[0]).filter(function (entry) {
  return entry.key != ["na_item,unit,geo\\time"];
});
console.log(selectedData);
/* Hide the raw TSV <pre> block; it only serves as input for the script. */
pre {
display: none;
}
<script src="https://d3js.org/d3.v4.min.js"></script>
<pre id="tsv">na_item,unit,geo\time 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
B1GQ,CP_EUR_HAB,AL 2200 2400 2600 3000 3000 3100 3200 3300 3300 3400 p 3600 p :
B1GQ,CP_EUR_HAB,AT 30800 32200 34000 35100 34300 35200 36800 37600 38000 38700 39400 40000
B1GQ,CP_EUR_HAB,BE 29700 31000 32500 33100 32300 33500 34500 35100 35300 35900 36600 37400
B1GQ,CP_EUR_HAB,BG 3100 3600 4300 5000 5000 5200 5600 5700 5800 5900 p 6300 p 6600 p</pre>
PS: Again, that TSV has several problems with the values.
PPS: This answer only shows you how to filter and prepare your data array, and that only.
I have some raw imdb data in two strings:
// Raw IMDb header line and one record. In the original data the fields are
// TAB-separated (the tabs were lost when pasting here).
// Fix: the two sibling declarations inconsistently mixed `var` and `const`;
// neither is reassigned, so both are `const` now.
const headers = "ID imdbID Title Year Rating Runtime Genre Released Director Writer Cast Metacritic imdbRating imdbVotes Poster Plot FullPlot Language Country Awards lastUpdated";
const content = "1 tt0000001 Carmencita 1894 NOT RATED 1 min Documentary, Short William K.L. Dickson Carmencita 5.8 1136 http://ia.media-imdb.com/images/M/MV5BMjAzNDEwMzk3OV5BMl5BanBnXkFtZTcwOTk4OTM5Ng##._V1_SX300.jpg Performing on what looks like a small wooden stage, wearing a dress with a hoop skirt and white high-heeled pumps, Carmencita does a dance with kicks and twirls, a smile always on her face. USA 2016-05-04 00:03:31.600000000";
Of course I am cleaning out the tabs and spaces etc and creating two arrays like so:
// Split the record on runs of tabs; the capture group keeps the separators
// in the result, so filter the tab parts back out.
const values = content.split(/(\t+)/).filter( (part) => !/\t/.test(part) );
// Same idea for the header line, splitting on any whitespace.
// Fix: `keys` was declared with `let` but never reassigned — use `const`.
const keys = headers.split(/(\s+)/).filter( (part) => !/\s+/.test(part) );
Now I want to map the keys in keys array to the values in the values array accurately. So I do a reduce like so:
// Pair each value with the header at the same index.
// Fix: the original abused `reduce` purely for its side effect (returning an
// assignment expression as the accumulator); `forEach` states the intent.
var result = {};
values.forEach( (val, index) => {
  result[ keys[index] ] = val;
} );
This gives me back a final result object that looks like this:
{
Cast: "1136",
Director: "Carmencita",
Genre: "Documentary, Short",
ID: "1",
imdbID: "tt0000001",
imdbRating: "Performing on what looks like a small wooden stage, wearing a dress with a hoop skirt and white high-heeled pumps, Carmencita does a dance with kicks and twirls, a smile always on her face.",
imdbVotes: "USA",
Metacritic: "http://ia.media-imdb.com/images/M/MV5BMjAzNDEwMzk3OV5BMl5BanBnXkFtZTcwOTk4OTM5Ng##._V1_SX300.jpg",
Poster: "2016-05-04 00:03:31.600000000",
Rating: "NOT RATED",
Released: "William K.L. Dickson",
Runtime: "1 min",
Title: "Carmencita",
Writer: "5.8",
Year: "1894"
}
As you can see, the order is all mixed up! How do I get the right order in my data so I get title key mapped to the movie title value etc. ?
I am open to using a library like lodash to achieve this but can't tell what I should use? Here is a jsbin demo of the present state: https://jsbin.com/gekawi/edit?js,console
If you take the raw data with the tabs inside, you could use them for splitting.
// Split both raw strings on the TAB characters they originally contained,
// then zip the header fields with the record fields into a single object —
// index i of the header names index i of the content.
var headers = "ID imdbID Title Year Rating Runtime Genre Released Director Writer Cast Metacritic imdbRating imdbVotes Poster Plot FullPlot Language Country Awards lastUpdated";
var content = "1 tt0000001 Carmencita 1894 NOT RATED 1 min Documentary, Short William K.L. Dickson Carmencita 5.8 1136 http://ia.media-imdb.com/images/M/MV5BMjAzNDEwMzk3OV5BMl5BanBnXkFtZTcwOTk4OTM5Ng##._V1_SX300.jpg Performing on what looks like a small wooden stage, wearing a dress with a hoop skirt and white high-heeled pumps, Carmencita does a dance with kicks and twirls, a smile always on her face. USA 2016-05-04 00:03:31.600000000";
var headerArray = headers.split(/\t/);
var contentArray = content.split(/\t/);
var object = {};
for (var i = 0; i < headerArray.length; i++) {
  object[headerArray[i]] = contentArray[i];
}
console.log(object);
I am using scrapy to gather schedule information on uslpro website. The site I am crawling is http://uslpro.uslsoccer.com/schedules/index_E.html.
The content of the page is rendered when the page is loaded. So I can't get the table data directly from source code. I looked at the source code and found that the schedule objects are stored in one object.
Here is the JavaScript Code.
preRender: function(){
var gmsA=diiH2A(DIISnapshot.gamesHolder);
....
This gmsA object has all schedule information. Is there any way to get this JS object using scrapy? Thank you very much for your help.
For starters, you have multiple options to choose from:
parse the javascript file containing the data (which is I'm describing below)
use Scrapy with scrapyjs tool
automate a real browser with the help of selenium
Okay, the first option (is arguably the most complicated).
The page is loaded via a separate call to a .js file which contains the information about matches and teams in two separate objects:
DIISnapshot.gms = {
"4428801":{"code":"1","tg":65672522,"fg":"2953156","fac":"22419","facn":"Blackbaud Stadium","tm1":"13380700","tm2":"22310","sc1":"1","sc2":"1","gmapply":"","dt":"22-MAR-2014","tim":"30-DEC-1899 19:30:00.0000","se":"65672455","modst":"","gmlabel":"","golive":0,"gmrpt":"67842863","urlvideo":"http://www.youtube.com/watch?v=JHi6_nnuAsQ","urlaudio":""}
, "4428803":{"code":"2","tg":65672522,"fg":"2953471","fac":"1078448","facn":"StubHub Center","tm1":"33398866","tm2":"66919078","sc1":"1","sc2":"3","gmapply":"","dt":"22-MAR-2014","tim":"30-DEC-1899 22:30:00.0000","se":"65672455","modst":"","gmlabel":"","golive":0,"gmrpt":"67846731","urlvideo":"http://www.youtube.com/watch?v=nLaRaTi7BgE","urlaudio":""}
...
, "5004593":{"code":"217","tg":65672522,"fg":"66919058","fac":"66919059","facn":"Bonney Field","tm1":"934394","tm2":"65674034","sc1":"0","sc2":"2","gmapply":"3","dt":"27-SEP-2014","tim":"30-DEC-1899 22:30:00.0000","se":"65672455","modst":"21-SEP-2014 1:48:26.5710","gmlabel":"FINAL","golive":0,"gmrpt":"72827154","urlvideo":"https://www.youtube.com/watch?v=QPhL8Ktkz4M","urlaudio":""}
};
DIISnapshot.tms = {
"13380700":{"name":"Orlando City SC","club":"","nick":"Orlando","primarytg":"65672522"}
...
, "8969532":{"name":"Pittsburgh Riverhounds","club":"","nick":"Pittsburgh","primarytg":"65672522"}
, "934394":{"name":"Harrisburg City Islanders","club":"","nick":"Harrisburg","primarytg":"65672522"}
};
And things are getting a bit more difficult because the URL to that js file is also constructed with javascript in the following script tag:
<script type="text/javascript">
var DIISnapshot = {
goLive: function(gamekey) {
clickpop1=window.open('http://uslpro.uslsoccer.com/scripts/runisa.dll?M2:gp::72013+Elements/DisplayBlank+E+2187955++'+gamekey+'+65672455','clickpop1','toolbar=0,location=0,status=0,menubar=0,scrollbars=1,resizable=0,top=100,left=100,width=315,height=425');
}
};
var DIISchedule = {
MISL_lgkey: '36509042',
sename:'2014',
sekey: '65672455',
lgkey: '2792331',
tg: '65672522',
...
fetchInfo:function(){
var fname = DIISchedule.tg;
if (fname === '') fname = DIISchedule.sekey;
new Ajax.Request('/schedules/' + DIISchedule.seSeq + '/' + fname + '.js?'+rand4(),{asynchronous: false});
DIISnapshot.gamesHolder = DIISnapshot.gms;
DIISnapshot.teamsHolder = DIISnapshot.tms;
DIISnapshot.origTeams = [];
for (var teamkey in DIISnapshot.tms) DIISnapshot.origTeams.push(teamkey);
},
...
DIISchedule.scheduleLoaded = true;
}
}
document.observe('dom:loaded',DIISchedule.init);
</script>
Okay, let's use BeautifulSoup HTML parser and slimit javascript parser to get the dynamic part (that tg value is the name of the js with the data) used to construct the URL, then make a request to a URL, parse the javascript and print out the matches:
import json
import random
import re
from bs4 import BeautifulSoup
import requests
from slimit import ast
from slimit.parser import Parser
from slimit.visitors import nodevisitor
# start a session
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36'}
session = requests.Session()
response = session.get('http://uslpro.uslsoccer.com/schedules/index_E.html', headers=headers)
# get the dynamic part of the JS url
soup = BeautifulSoup(response.content)
script = soup.find('script', text=lambda x: x and 'var DIISchedule' in x)
tg = re.search(r"tg: '(\d+)',", script.text).group(1)
# request to JS url
js_url = "http://uslpro.uslsoccer.com/schedules/2014/{tg}.js?{rand}".format(tg=tg, rand=random.randint(1000, 9999))
response = session.get(js_url, headers=headers)
# parse js
parser = Parser()
tree = parser.parse(response.content)
matches, teams = [json.loads(node.right.to_ecma())
for node in nodevisitor.visit(tree)
if isinstance(node, ast.Assign) and isinstance(node.left, ast.DotAccessor)]
for match in matches.itervalues():
print teams[match['tm1']]['name'], '%s : %s' % (match['sc1'], match['sc2']), teams[match['tm2']]['name']
Prints:
Arizona United SC 0 : 2 Orange County Blues FC
LA Galaxy II 1 : 0 Seattle Sounders FC Reserves
LA Galaxy II 1 : 3 Harrisburg City Islanders
New York Red Bulls Reserves 0 : 1 OKC Energy FC
Wilmington Hammerheads FC 2 : 1 Charlotte Eagles
Richmond Kickers 3 : 2 Harrisburg City Islanders
Charleston Battery 0 : 2 Orlando City SC
Charlotte Eagles 0 : 2 Richmond Kickers
Sacramento Republic FC 2 : 1 Dayton Dutch Lions FC
OKC Energy FC 0 : 5 LA Galaxy II
...
The part printing the list of matches is for demonstration purposes. You can use matches and teams dictionaries to output the data in a format you need.
As this is not a popular tag I don't expect any upvotes - most importantly, it was an interesting challenge for me.
I'm working on a scraper for The List as a JS project, and my regex-fu could be better than it is.
Given a data structure like
<a name="may_21"><b>Wed May 21</b></a>
<ul>
<li><b>Ace of Spades, Sacramento</b> Christina Perri, Birdy a/a $20 7pm **
...
</ul>
I've written the following to leverage cheerio to grab a date, venue, and list of bands:
# Draft scraper for foopee.com "The List": fetch the page with request,
# parse it with cheerio, collect the date anchors, then walk each date's
# sibling elements for venue/band/time data.
# NOTE(review): indentation appears to have been lost in this paste;
# CoffeeScript is whitespace-sensitive, so the original must have been
# indented.
request(url, (error, response, html)->
if(!error)
$ = cheerio.load(html)
# empty templates for the per-show data to collect
concert = { bands : {}, location : {venue: "", address : ""}, date: {date: "", time: ""}}
calendar = {}
dates = []
# grab dates: each top-level <li> begins with an <a><b>date</b></a>
$('body > ul > li > a').each(->
data = $(this)
$dates = data.children().first()
dates.push($dates.text())
)
# build concerts: for every date, visit the siblings of its anchor;
# the time extraction (commented .match()) is still unfinished here
for date in dates
$("a:contains('" + date + "')").siblings().each(->
$venue = $(this).children().find("b")
$bands = $venue.siblings("a")
$time = $venue.parent()#.match()
)
)
As you can see, I'm having trouble figuring out how to grab the time from the above structure.
Typically, that is going to be a bit of plain text at the end of a li that corresponds to a specific show, so that for something like
Bottom of the Hill, S.F. Matt Pond PA, Lighthouse And The Whaler, Kyle M. Terrizzi a/a $14/$16 8pm/9pm **
I would be looking to grab the "8pm/9pm" text out of
<li><b>Bottom of the Hill, S.F.</b> Matt Pond PA, Lighthouse And The Whaler, Kyle M. Terrizzi a/a $14/$16 8pm/9pm **
Sometimes it will be in the form of "8pm", sometimes "8pm/9pm", and sometimes it won't be there at all.
What's the best way to structure a regex to grab this data?
Don't regex the full raw html (general advice).
Instead, try loading the html to a temporary container-div (or documentFragment but you need some custom basic getter-shims).
Now work your way (loop) through the known structure, discarding everything you don't need (like anchors), and finally loop through the containers (in what's left over) to grab your final data (using a much simpler regex that matches, for example: /(\d+[ap]m\/?){1,2}$/i).
PS, a word from a scraper: you often only know your final routine once you fully and successfully completed your scrape! (Like you usually find lost stuff in the last place you look..).
As Tomalak commented: pitfall no 1: data that doesn't match what you anticipate. Try to research your expected data-formats!!
EDIT:
Extra advice: add as much error-checking you can. Try to translate every flaw you find during testing to a check. You NEED any help you can get once you start scraping massive amounts of data.
Consider a chunking-approach : If a check fails, you don't need to start over from the beginning of the data. Instead, add extra check/solution and continue your scrape.
Otherwise just testing/debugging your scraper might even look like DOS behavior/traffic.
got this working, here's the code that I ended up using
# Working scraper for foopee.com "The List": for every date <li>, walk its
# nested concert <li>s, pulling venue, time, cost, all-ages flag and band
# list into a calendar keyed by a random ID.
fs = require('fs')
request = require('request')
cheerio = require('cheerio')
crypto = require("crypto")
url = 'http://www.foopee.com/punk/the-list/by-date.0.html'
# Run `regex` against `text`; on a match, return it comma-stripped and
# trimmed, otherwise undefined (note the ?. guard on match()).
getConcertItem = (text, regex)->
return text.match(regex)?.toString().replace(/,/g, '').trim()
request(url, (error, response, html)->
if(!error)
$ = cheerio.load(html)
#print(html)
calendar = {}
$dates = $('body > ul > li')
# one top-level <li> per date
$dates.each(->
date = $(this).find("a").first().text()
$concerts = $(this).children("ul").children()
$concerts.each( ->
#todo: use the import-style ID generator
# random 32-bit integer used as the calendar key
ID = parseInt(crypto.randomBytes(4).toString('hex'), 16)
concert = {bands : [], location : {venue: "", address : ""}, date: {date: "", time: ""}, cost: "", allAges: false}
$venue = $(this).find("b")
concert.location.venue = $venue.text()
# clone the <li>, drop its child elements, keep only the loose text
# (the "a/a $14 8pm" tail) for the regexes below
concertText = $venue.parent().clone().children().remove().end().text()
timeRegex = /(\d?:?\d+[ap]m\/?\s?\w*\s?)/g
concert.date.date = date
concert.date.time = getConcertItem(concertText, timeRegex)
costRegex = /(\$\d+[\/-]?)/g
concert.cost = getConcertItem(concertText, costRegex)
allAgesRegex = /(a\/a)/g
if getConcertItem(concertText, allAgesRegex)
concert.allAges = true
# every sibling of the bold venue name is a band link
$bands = $venue.siblings()
bands = []
$bands.each( ->
band = $(this).text()
bands.push(band)
)
concert.bands = bands
calendar[ID] = concert
)
)
)