Web scraping dynamic content with phantomjs - javascript

Im trying to scrape data from a site that generates its data via a javascript but I cant seem to locate the invoking script. When I look at the page source there is an in-page script that has a variable which has an array of the data I wish to retrieve but there is another script which contains all the codes for the various companies I wish to retrieve data from. This is what i've tried so far:
var url = 'http://www.asx.com.au/asx/share-price-research/company/ZAM/details';
var page = require('webpage').create();
page.open(url, function (status) {
var digitalData = page.evaluate(function () {
return window.digitalData;
})page.then(function (digitalData) {
console.log
}
('DigtalData is ' + digitalData));
phantom.exit();
});
Unfortunately the above script doesnt output anything. The script which contains the codes for the companies I want to get the data from I think is this:
var locationPath = window.location.pathname.split('/');
var companyCode = locationPath[locationPath.length - 1].trim();
var sectorCodes = {
"MOQ": "soft",
"1PG": "soft",
"ONT": "heal",
"1ST": "heal",
"T3D": "food",
"TGP": "real",
"TIX": "real",
"TDO": "ener",
"DDD": "mate",
"3PL": "cons",
"4DS": "semi"
};
setTimeout(function () {
googletag.cmd.push(function () {
googletag.defineSlot('/76291182/ASX_leaderboard_com_info', [728, 90], 'div-gpt-ad-1450158832871-0').addService(googletag.pubads());
googletag.defineSlot('/76291182/ASX_MREC_com_info', [[300, 250], [300, 600]], 'div-gpt-ad-1450158832871-1').addService(googletag.pubads());
googletag.defineSlot('/76291182/ASX_MREC_lower_com_info', [300, 250], 'div-gpt-ad-1450158832871-2').addService(googletag.pubads());
googletag.defineSlot('/76291182/ASX_skyscraper_com_info', [160, 600], 'div-gpt-ad-1450158832871-3').addService(googletag.pubads());
googletag.defineSlot('/76291182/ASX_half_page_com_info', [300, 600], 'div-gpt-ad-1450158832871-4').addService(googletag.pubads());
googletag.pubads().setTargeting("cc", companyCode);
if (typeof sectorCodes[companyCode] != 'undefined') {
googletag.pubads().setTargeting('sec', sectorCodes[companyCode]);
}
googletag.enableServices();
});
}, 2000);
This script seems to do a lot more than just contain the codes for the companies but its the only JS script I can find that has the company codes. I believe the data I want is from this script:
var currentURL = (document.URL);
var part = currentURL.split("/")[6];
// var dwsDTM = $('#company-code-title').text();
var digitalData = {
"page": {
"pageInfo": {
"pageID": "3345",
"pageName": "Company info " + part,
"pageURL": window.location.href,
"issueDate": "n/a",
"updatedDate": "n/a",
"brand": "ASX",
"generator": "OpenText",
"domain": "Website",
"sysEnv": "",
"delayType": "Normal"
},
"category": {
"primaryCategory": "Prices and research",
"subCategory1": "Company information",
"subCategory2": "Company info " + part,
"subCategory3": "",
"pageType": ""
},
"productInfo": {}
},
"user": {
"profileInfo": {
"memberB2B": "",
"businessMemberID": "",
"memberRetail": "",
"retailMemberID": ""
},
"version": "1.0",
"events": [],
"vendor": {
"GoogleAnalytics": {
"account": "UA-9950793-3",
"eventCategory": ""
}
}
}
};
I believe its generated to this HTML:
<div class="view-content" ui-view></div>
Im new to scraping dynamic content and this looks so hard to do. Could someone point me in the right direction to retrieve this data?
Thanks
UPDATE: So after a few hours of playing on the site I figured out that they use angularJS for their data call backs and upon further playing around I found the links they use to store their data. They have an API but I dont think its publicly available. In any case I think I can write a simple DOM Script to retrieve and format the data. Will keep you all posted

I figured out a solution. Utilising their API and using HTML DOM and PHP I was able to retrieve the data I wanted and then output the data to CSV. I cant discuss my code or the API because it would be in violation of their terms and service.

Related

How to retrieve data from json file

I am having trouble with accessing my json file from my javascript file. I would like to change the object to a different text in my json file once a submit button is clicked on the webpage. I am aware that I would use ajax to achieve this goal, but I do not know how to access the json file.
This is the db.json file
{
{
"assets": [
{
"id": "0946",
"manufacturer": "SONY",
},
{
"id": "0949",
"manufacturer": "AUDIOTECNIA"
}
],
"transfers": [
{
"id": 1,
"status": "in-progress"
}
]
}
This is my Javascript file
$('form').on('submit', function(e){
e.preventDefault();
parsedData = JSON.parse(db.json);
console.log(parsedData[0].id)
//Changing Status
$.ajax({
type: "PATCH",
url: `http://localhost:3000/transfers/`
});
I've tried using parseData because I read that is how to retrieve the object, from the json file, but I do not believe I am writing it correctly. What documentation or steps would one recommend for solving this issue?
You have an extra comma after "in-progress",
const parsedData = JSON.parse(`{
"transfers": [ {
"id": 1,
"status": "in-progress"
}]
}`)
Then, to access id in parsedData:
console.log(parsedData.transfers[0].id)
You did not initialize the variable parsedData.
var parsedData = JSON.parse(db.json);

Error with Remote Pagination in Tabulator 5.0

My tabulator 5.0 table works fine. But when I attempt to use remote pagination it no longer works at all.
var table = new Tabulator("#account-tran-detail-table", {
pagination:true,
paginationMode:"remote", //if this line is commented out then it works fine without pagination
paginationSize: 12,
dataSendParams:{
page: "page",
size: "page_size",
},
dataReceiveParams:{
last_page:"total_pages",
} ,
ajaxResponse:function(url, params, response){
return response.results; // return the array of table items
},
});
table.on("tableBuilt", function(){
table.setColumns(columns);
});
function generateReport () {
table.clearData();
var columns = [
{title:"id", field:"id", headerFilter:false, visible:false, download:true},
{title:"name", field:"name", headerFilter:false, visible:false, download:true},
];
var gender = "m";
var url = "/api/v1/myendpointname/";
var append_params = "?gender=" + gender;
$("#tableBuilt").destroy;
table.setData(url + append_params);
};
The response looks like this:
{
"count": 10,
"total_pages": 3,
"next": "http://localhost:8000/myendpointname/?gender=m&page=2&size=3
"previous": null,
"results": [
{
"id": 1,
"gender": "m",
"name": "abc",
},
{
"id": 2,
"gender": "m",
"name": "def",
},
{
"id": 3,
"gender": "m",
"name": "ghi",
},
]
}
You can see where I'm asking tabulator to read total_pages instead of the default last_page by using dataReceiveParams. And you can also see where I'm asking tabulator to send page_size instead of the default size by using dataSendParams to generate the request.
I get two warnings . . .
Remote Pagination Error - Server response missing 'undefined' property Page.js:707:11
Remote Pagination Error - Server response missing 'undefined' property Page.js:754:11
Followed by an error.
Data Loading Error - Unable to process data due to invalid data type
Expecting: array
Received: undefined
Data: undefined
The code referenced by the warnings:
707: console.warn("Remote Pagination Error - Server response missing '" + this.dataReceivedNames.last_page + "' property");
754: console.warn("Remote Pagination Error - Server response missing '" + this.dataReceivedNames.data + "' property");
So it seems to me that tabulator isn't "seeing" the total_pages portion of the response. Why is it not using defaultReceiveParams? Could it be because of my ajaxResponse function returns response.results and strips the rest of the response away?
(Same behavior for Chrome and Firefox)
Use ajaxResponse to return data and last_page as
Codesandbox

How to correctly retrieve a document from PropDB using specific formatting in the http Trigger GET request (Azure Function)

Here is my issue:
I am trying to set-up an http trigger using Azure functions in javascript. I have been able to post this data into Cosmosdb using my POST function.
CosmosDB Example Item I am looking for:
{
"id": "POLL:FAVECOLORS:LM:LMBWZ18",
"partition": "POLL:LM",
"value": {
"name": "fave colors poll",
"question": "Which color do you like the most?",
"answers": [
{
"text": "Orange",
"count": 0
},
{
"text": "Yellow",
"count": 0
},
{
"text": "Blue",
"count": 0
}
]
},
"_rid": "<info>",
"_self": "<info>",
"_etag": "\"<info>\"",
"_attachments": "<info>/",
"_ts": <info>
}
I am trying to pull this information via input into my Azure Function.
Here is my function.json.
{
"bindings": [
{
"authLevel": "function",
"type": "httpTrigger",
"direction": "in",
"name": "req",
"route": "<MyURL>/{type}/{objname}/{brand}/{site}",
"methods": [
"get",
"post",
"patch"
]
},
{
"type": "http",
"direction": "out",
"name": "$return"
},
{
"name": "CosmosSend",
"type": "documentDB",
"databaseName": "PropDB",
"collectionName": "WithoutCCN",
"createIfNotExists": false,
"connection": "<Connection-Info>",
"direction": "out"
},
{
"type": "documentDB",
"name": "documents",
"databaseName": "PropDB",
"collectionName": "WithoutCCN",
"connection": "<Connection-Info>",
"direction": "in",
"sqlQuery": "SELECT * FROM c WHERE c.id = {id}"
}
],
"disabled": false
}
I am not sure how I can change my requests format so that it uses the request parameters for the query instead of looking for "https:///{type}/{objname}/{brand}/{site}?id=POLL:FAVECOLORS:LM:LMBWZ18"
module.exports = function (context, req) {
var documents = context.bindings.documents;
var totalDocuments = documents.length;
var azureAppId = req.headers["azure-app-id"];
context.log('azure-app-id = ' + azureAppId);
var idx = '' + (req.params.type + ':' + req.params.objname + ':' + req.params.brand + ':' + req.params.site).toUpperCase();
var paritionx = '' + (req.params.type + ":" + req.params.brand).toUpperCase();
var valuex = req.body;
if (req.method === 'GET') {
context.log('Found '+ totalDocuments +' documents');
if(totalDocuments === 0)
{
context.done(null, createResult(200, 'application/json', "The
requested document was not found."));
}
else {
context.done(null, createResult(200, 'application/json', documents));
}
} else if (req.method === 'POST') {
if (typeof req.body === 'object') {
context.bindings.CosmosSend = JSON.stringify({
id: idx,
partition: paritionx,
value: valuex
});
context.done(null, createResult(201, 'application/json', req.body));
}
else {
context.done(null, createResult(400, 'text/plain', 'The message must be of type application/json.'));
}
I have tried to take the variable 'idx' and put that into the query since it is already in the same format as the POST sends the 'id', but since it is not apart of the function.json itself it cannot be found. If I change the query in the function to:
"sqlQuery": "SELECT * FROM c WHERE c.id = /"{TYPE}:{OBJNAME}:{BRAND}:{SITE}/""
and anything remotely close to that kinda thinking it doesnt work. If I shove what I am looking for exactly into it:
"sqlQuery": "SELECT * FROM c WHERE c.id = \"POLL:FAVECOLORS:LM:LMBWZ18\""
It will find it perfectly every time, but only that one record, and it defeats the purpose of the GET request including its own target information.
I have really been racking my brain on this issue and any tips would really help. I have read a lot of microsoft docs related to the Azure Function and javascript but nothing has helped with this specific issue.
The expected results will be the item in cosmos returned when given the http GET request from the Azure Function app. it will look for the item by the information located in the URL of the request.
Have you tried adding the id as an optional parameter in the route?
"route": "<MyURL>/{type:alpha}/{objname:alpha}/{brand:alpha}/{site:alpha}/{id:alpha?}",
Keep in mind though, that doing this in the same HTTP Trigger Function will always execute the query, even on a POST where you don't send the id. So you are consuming RU's that you are not using.
Ideally you would split this into separate Functions, one for POST, one for GET, which helps them scale independently and you have a greater separation of concerns.

Getting undefined when trying to acces a JSON item [closed]

This question is unlikely to help any future visitors; it is only relevant to a small geographic area, a specific moment in time, or an extraordinarily narrow situation that is not generally applicable to the worldwide audience of the internet. For help making this question more broadly applicable, visit the help center.
Closed 9 years ago.
i am trying to get a JSON and access its objects to retrieve data, all my paths seems to be working good but one specifically is not working. I just checked and the path is the same the other objects i am trying to access, in the console i keep getting undefined as a result. i will appreciate if you guys could tell me whats wrong with the code i will leave you the JSON structure and also the java Scripti hope you guys can help me figuring out, and also would be good if you could give me some tips on my coding because I am learning.
Java Script:
This function is called when you click on some item in a list.
function checkUser(){
console.log("clicked");
var user = $(this).html();
$.getJSON("js/options.json", function(json){
var itemsLength = json.chat.OnlineContacts.length;
for ( var i = 0; i < itemsLength; i++) {
var jsonUserName = json.chat.OnlineContacts[i].name;
var jsonUserStatus = json.chat.OnlineContacts[i].status;
var jsonUserAvatar = json.chat.OnlineContacts[i].avatar;
if(user == jsonUserName){
/*displayChatWindow(jsonUserName, jsonUserStatus, jsonUserAvatar);*/
console.log(jsonUserAvatar);
}
};
});
}
function displayChatWindow(user, status, avatar){
/*var template = _.template($("#windowTemplate").html(), {userName: user, userStatus: status, userAvatar: avatar});
$("body").prepend(template);*/
$(".messages-container").slimScroll({
height: '200',
size: '10px',
position: 'right',
color: '#535a61',
alwaysVisible: false,
distance: '0',
railVisible: true,
railColor: '#222',
railOpacity: 0.3,
wheelStep: 10,
disableFadeOut: false,
start: "bottom"
});
}
And this is the JSON:
{
"chat": {
"NumberOfOnlineContacts": "7",
"NumberOfOfflineContacts": "800",
"OnlineContacts": [
{
"name": "Nandy Torres",
"status": "online",
"avatar": "img/profile-picture2.jpg"
},
{
"name": "Catherine Varela",
"status": "Busy",
"avatar": "img/profile-picture3.jpg"
},
{
"name": "Jhonnatan Gonzalez",
"status": "online",
"avatar": "img/profile-picture4.jpg"
},
{
"name": "Juan Prieto",
"status": "away",
"avatar": "img/profile-picture5.jpg"
},
{
"name": "Alexander Barranco",
"status": "Busy",
"avatar": "img/profile-picture6.jpg"
}
],
"OfflineContacts": [
{
"name": "Nandy Torres"
},
{
"name": "Catherine Varela"
},
{
"name": "Jhonnatan Gonzalez"
},
{
"name": "Juan Prieto"
},
{
"name": "Jhonathan Sanchez"
}
]
}
}
When I try to do the console log on avatar like is in the example I just get undefined, but if i do console.log of name or status y just get the correct values.
Consider restructuring the original JSON object so you can access users directly by their unique id - in this case it appears to be "name".
For example:
{
"chat": {
"NumberOfOnlineContacts": "7",
"NumberOfOfflineContacts": "800",
"OnlineContacts": [
{
"Nandy Torres" : {
"status": "online",
"avatar": "img/profile-picture2.jpg"
}
},
{
"Catherine Varela" : {
"status": "Busy",
"avatar": "img/profile-picture3.jpg"
}
},
{...}
]
}
}
This could be accessed in the following ways without the expense of a loop:
json.chat.OnlineContacts[0].avatar
json.chat.OnlineContacts["Nandy Torres"].avatar
json.chat.OnlineContacts[user].avatar
There's an obvious cost-benefit here, but you may get the best of both by inlcuding the "name" attr as well:
"Nandy Torres" : {
"name": "Nandy Torres",
"status": "online",
"avatar": "img/profile-picture2.jpg"
}
And eventually moving to a more formal unique id:
"123abc" : {
"name": "Nandy Torres",
"status": "online",
"avatar": "img/profile-picture2.jpg"
}
There's lots of excellent ideas for structuring JSON at json.com
Jhonnatan, I'm glad you got your code working. So this isn't really an "answer" to your original question. But you also asked for tips on the coding and how to improve it. That's great that you're looking for ways to improve your code!
Here's one suggestion for you, a simpler way to write the checkUser() function. This is just a direct conversion of your function that should work with the same data format.
function checkUser() {
console.log( "clicked" );
var user = $(this).html();
$.getJSON( "js/options.json", function( json ) {
$.each( json.chat.OnlineContacts, function( i, contact ) {
if( user == contact.name ) {
displayChatWindow( contact.name, contact.status, contact.avatar );
console.log( contact.avatar );
}
});
});
}
Note how the use of the contact object makes it unnecessary to create all those jsonUserXxxx variables, because code like contact.avatar is just as simple and clear as jsonUserAvatar.
I also highly recommend #DaveRomero's excellent answer for a more strategic look at the question of how best to structure your JSON data.
More related reading: this discussion of JSON restructuring that I posted in response to this question earlier today.

Only want to pull JSON Class not all data

Trying to only pull text from a class with a unique name using $.getJSON and YQL. Right now it pulls all data and strips out tags. Anyone know if this can be achieved?
function filterData(data){
// filter all the nasties out
// no body tags
data = data.replace(/<?\/body[^>]*>/g,'');
// no linebreaks
data = data.replace(/[\r|\n]+/g,'');
// no comments
data = data.replace(/<--[\S\s]*?-->/g,'');
// no noscript blocks
data = data.replace(/<noscript[^>]*>[\S\s]*?<\/noscript>/g,'');
// no script blocks
data = data.replace(/<script[^>]*>[\S\s]*?<\/script>/g,'');
// no self closing scripts
data = data.replace(/<script.*\/>/,'');
// the below doesn't work of course, but if I could use jQuery I would do as follows:
data = $(data).find('.count').text();
return data;
}
Here is part of the JSON it pulls:
"div": [
{
"id": "store-page-rating",
"div": [
{
"id": "store-six-month-rating",
"div": {
"style": "float:left;",
"span": {
"class": "rating",
"img": {
"alt": "Rating 8.68/10",
"class": "stars fourhalf",
"src": "http://images4.resellerratings.com/CDN-1328669559/static/images/blankstar.gif",
"style": "width: 80px; height: 16px;",
"title": "Rating 8.68/10"
},
"span": [
{
"class": "count",
"content": "27"
},
How can I just display the "27" on class "count" instead of ALL the data on the domain?
I don't think you have much of a choice here, yql provides you a proxy to query the resources that are not on your domain. you can make your own server side proxy that can filter the results and return you the desired output in json format.
What I had to do and was unaware of was to use XPATH at the end of the YQL statement.
select content from html where url="http://www.resellerratings.com/store/Burkett_Restaurant_Equipment_Supplies"
and xpath="//span[#class='count']"
See here

Categories