I try to retrieve data (scraping) from a url with Get method (axios) but it doesn't retrieve the data what i looking for (piano) and no error is show. I use json to retrieve the data. Any idea what i'm doing wrong .dddddddddddddddddddddddddddd
const axios = require('axios');
const cheerio = require ('cheerio');
const fs = require('fs');
const baseUrl = '**********/';
const axiosCookieJarSupport = require('axios-cookiejar-support').default;
const tough = require('tough-cookie');
axiosCookieJarSupport(axios);
const cookieJar = new tough.CookieJar();
function scrapeUrl(url, items=[]) {
const params = {
keywords: "piano",
latitude:40.489353,
longitude:-3.6827461,
};
return axios
.get(baseUrl + "search", {
params,
jar: cookieJar,
withCredentials: true,
})
.then(response => {
const $ = cheerio.load(response.data);
const pageItems = $('.container-wall .card-product-content').toArray()
tracked').toArray()
.map(item => {
const $item = $(item);
// console.log('todo', $item);
return {
// id: $item.attr('data-adid'),
title: $item.find('.card-product-image').attr('alt'),
link: baseUrl + $item.parent('a').attr('href'),
image: $item.find('.card-product-image').attr('src'),
price: $item.find('.product-info-price').text(),
};
});
const allItems = items.concat(pageItems);
console.log(pageItems.length,'items retrieved', allItems.length,
'acumulated');
const nextUrl = $('.pagination .next a').attr('href');
return nextUrl ? scrapeUrl(baseUrl + nextUrl, allItems) : allItems;
})
.catch(error => {
console.log('error', error);
return items;
});
}
scrapeUrl(baseUrl + initialUrl)
.then(items => {
process.stdout.write(JSON.stringify(items));
fs.writeFile('./items.json', JSON.stringify(items), 'utf8', function(error) {
if (error) return console.log('error', error);
console.log(items.length, 'items saved');
});
});
i switch to puppeteer and headless Chrome as Chris sayed it's better than axios/cheerio becouse nowadays when many of the websites are built as a single page application and gets rendered dynamically on the client it might not be possible to get the content.
Related
So I'm triyng to update some ids from a categories tree using TreeModelJS.
after editing I would like to dump the tree to a file in JSON format.
but when outputing other keys from TreeModel gets outputed as well.
How could I output edited tree as JSON (model only)?
I managed to replace other keys values with null and so far I got this:
const axios = require('axios')
const TreeModel = require('tree-model')
const fs = require('fs')
const url = 'https://my-api-uri-for-categories'
const dumpPath = `${process.cwd()}/data/test/categories.json`
const getCategories = async () => {
try {
const response = await axios.get(url)
return response.data.categories
} catch (error) {
console.log('Error reading categories', error)
}
}
const dumpJsonTofile = data => {
try {
console.log('Dumping to file')
console.log(data)
fs.writeFileSync(
dumpPath,
JSON.stringify(data, (k, v) => {
if (k === 'parent' || k === 'config' || k === 'children') return null
else return v
}),
'utf8'
) // write it back
} catch (error) {
console.log('Error dumping categories', error)
}
}
const scraping = async category => {
try {
const response = await axios.get(category.url)
const document = response.data
const json = document.match(/{"searchTerm"(.*);/g)[0]
const data = JSON.parse(json.replace(';', ''))
return data
} catch (error) {
console.log(`Error while scraping category: ${category.name}`, error)
}
}
async function run() {
const categories = await getCategories()
const categoriesTree = new TreeModel({
childrenPropertyName: 'items',
})
const root = categoriesTree.parse({ id: 0, origin: {}, items: categories })
root.walk(async node => {
const category = node.model
console.log(`scraping category: ${category.name}...`)
if (!category.url) return console.log(`skipping (root?)...`)
const data = await scraping(category)
category.id = data.categoryId
})
dumpJsonTofile(root)
}
run()
but that still outputs a Node object like this:
{
"config":null,
"model":{},
"children":null
}
I need to output all the tree showing only the model key value for each item
Try JSON.stringify(root.model).
Can this anyhow in the feature damage the flow they belong to?
I have a lambda that works behind a API Gateway websocket endpoint.
This simply asks for a clientId and a message payload, query all connections on dynamo for that clientId (multi device realtime dashboard frontend) and updates all interested users.
It's working fine if you test trought "wscat" on command line but it is buggy on real world browser using js websocket api or c# websocket api.
Doest this exceptin has anything to do with it?
const AWS = require("aws-sdk");
let dynamo = new AWS.DynamoDB.DocumentClient();
require("aws-sdk/clients/apigatewaymanagementapi");
const ORDERS_TABLE = "ordersTable";
const successfullResponse = {
statusCode: 200,
body: "everything is alright"
};
module.exports.sendMessageHandler = (event, context, callback) => {
console.log(event);
sendMessageToAllConnectedClientDevices(event)
.then(data => {
console.log("sucesso", data);
callback(null, successfullResponse);
})
.catch(err => {
console.log("erro: ", err);
callback(null, JSON.stringify(err));
});
};
const sendMessageToAllConnectedClientDevices = async event => {
try {
const body = JSON.parse(event.body);
const { clientId } = body;
console.log(
"handler.sendMessageToAllConnectedClientDevices.clientId: ",
clientId
);
const connectionIds = await getConnectionIds(clientId);
return await Promise.all(
connectionIds.Items.map(connectionId => {
send(event, connectionId.connectionId);
})
);
} catch (error) {
console.log("erro sendMessageToAllConnectedClientDevices");
return error;
}
};
const getConnectionIds = async clientId => {
console.log("handler.getConnectionIds.clientId: ", clientId);
const params = {
TableName: ORDERS_TABLE,
// IndexName: "client_gsi",
FilterExpression: "clientId = :cliend_id",
// KeyConditionExpression: "clientId = :cliend_id",
ProjectionExpression: "connectionId",
ExpressionAttributeValues: {
":cliend_id": clientId
}
};
console.log("handler.getConnectionIds.params: ", JSON.stringify(params));
const data = await dynamo.scan(params).promise();
return data;
};
const send = async (event, connectionId) => {
const body = JSON.parse(event.body);
const postData = body.data;
const endpoint =
event.requestContext.domainName + "/" + event.requestContext.stage;
const apigwManagementApi = new AWS.ApiGatewayManagementApi({
apiVersion: "2018-11-29",
endpoint: endpoint
});
const params = {
ConnectionId: connectionId,
Data: postData
};
return await apigwManagementApi.postToConnection(params).promise();
};
ERROR Unhandled Promise Rejection
I think problem is with API Gateway, check how you are handling information passing through to Lambda function (because browser sends some extra information as compared to command line call)
I've got a function that returns the number of records from a DynamoDB table (Things):
const table = 'Things';
const region = 'us-east-1';
const profile = 'development';
process.env.AWS_SDK_LOAD_CONFIG = true;
process.env.AWS_PROFILE = profile;
const AWS = require('aws-sdk');
AWS.config.update({ region: region });
function ddb_table_has_records(table_name) {
const ddb_client = new AWS.DynamoDB.DocumentClient();
const ddb_query_parameters = {
TableName: table_name,
Select: 'COUNT'
}
const results = ddb_client.scan(ddb_query_parameters).promise();
results.then((data) => {
console.log(data.Count);
return data;
}).catch((err) => {
console.log("Error: ", err);
})
}
console.log(ddb_table_has_records(table));
When I run this code, I get the following...
PS C:\> node .\get-count-thing.js
undefined
3951
I'm not capturing the data from the scan in the following; although, I see it in the console.log() call:
console.log(ddb_table_has_records(table));
What am I mucking up?
Posting my fix in-case anyone has the same question. I had to make two changes to retrieve the items from the table; I needed to...
...project ALL_ATTRIBUTES
...iterate over the collection of Items returned
The following was my function with changes:
function ddb_table_has_records(table_name) {
const ddb_client = new AWS.DynamoDB.DocumentClient();
const ddb_query_parameters = {
TableName: table_name,
Select: 'ALL_ATTRIBUTES'
}
const results = ddb_client.scan(ddb_query_parameters).promise();
results.then((data) => {
console.log(data.Count);
data.Items.forEach((thing) => {
console.log(thing);
});
}).catch((err) => {
console.log("Error: ", err);
})
}
I have a app.get which inside of it is quite a bit of logic. Which everything works great aside from some of the logic being called twice for some reason. I have noticed when I was saving something to by db that it would save two rows.
So I put a console.log in that area and sure enough it was logging it twice.
Any reason why this is happening?
app.get('/shopify/callback', (req, res) => {
const { shop, hmac, code, state } = req.query;
const stateCookie = cookie.parse(req.headers.cookie).state;
if (state !== stateCookie) {
return res.status(403).send('Request origin cannot be verified');
}
if (shop && hmac && code) {
// DONE: Validate request is from Shopify
const map = Object.assign({}, req.query);
delete map['signature'];
delete map['hmac'];
const message = querystring.stringify(map);
const providedHmac = Buffer.from(hmac, 'utf-8');
const generatedHash = Buffer.from(
crypto
.createHmac('sha256', config.oauth.client_secret)
.update(message)
.digest('hex'),
'utf-8'
);
let hashEquals = false;
try {
hashEquals = crypto.timingSafeEqual(generatedHash, providedHmac)
} catch (e) {
hashEquals = false;
};
if (!hashEquals) {
return res.status(400).send('HMAC validation failed');
}
// DONE: Exchange temporary code for a permanent access token
const accessTokenRequestUrl = 'https://' + shop + '/admin/oauth/access_token';
const accessTokenPayload = {
client_id: config.oauth.api_key,
client_secret: config.oauth.client_secret,
code,
};
request.post(accessTokenRequestUrl, { json: accessTokenPayload })
.then((accessTokenResponse) => {
const accessToken = accessTokenResponse.access_token;
// DONE: Use access token to make API call to 'shop' endpoint
const shopRequestUrl = 'https://' + shop + '/admin/shop.json';
const shopRequestHeaders = {
'X-Shopify-Access-Token': accessToken,
}
request.get(shopRequestUrl, { headers: shopRequestHeaders })
.then((shopResponse) => {
const response = JSON.parse(shopResponse);
const shopData = response.shop;
console.log('BEING CALLED TWICE...')
res.render('pages/brand_signup',{
shop: shopData.name
})
})
.catch((error) => {
res.status(error.statusCode).send(error.error.error_description);
});
})
.catch((error) => {
res.status(error.statusCode).send(error.error.error_description);
});
} else {
res.status(400).send('Required parameters missing');
}
});
I'm trying to scrape a website with load more button, but I can't do a recursive function with in nightmare. my code is something like this:
const Nightmare = require('nightmare');
const nightmare = Nightmare({
show:true
});// }
const request = require('request');
const cheerio = require('cheerio');
let url = 'https://www.housers.com/es/proyectos/avanzado';
let propertyArray = [];
var getThePage = function() {
nightmare
.goto('https://www.housers.com/es/proyectos/avanzado')
.wait(1500)
.click('#loadMore')
.evaluate(() =>{
return document.querySelector('.all-info').innerHTML;
})
.end()
.then((result) => {
let $ = cheerio.load(result);
let loadMore = $('#loadMore')
if (loadMore) {
getThePage();
}
return result
})
.catch((error) => {
console.error('Search failed:', error);
});
}
getThePage()
I don't know if you have any way to do it by this method or any other idea
If you want to scrap the data in the table, you don't need to use nightmare. From the network tab, you would see that it calls this endpoint :
https://www.housers.com/es/proyectos/avanzado/scroll
with some pagination & page size, let's take 200 per page (don't know if it's above the limit).
Then you just have to parse html & put data in an array :
const axios = require('axios');
const querystring = require('querystring');
const cheerio = require('cheerio');
const entities = require("entities");
const url = 'https://www.housers.com/es/proyectos/avanzado/scroll';
const prices = [];
function doRequest(url, page){
return axios.post(url + '?page=' + page + '&size=200', querystring.stringify({
word: "",
country: "",
type: "",
order: "STOCK_PRICE_VARIATION",
orderDirection: "DESC"
}));
}
async function getPrices() {
var empty = false;
var page = 0;
while (!empty) {
//call API
console.log("GET page n°" + page);
var res = await doRequest(url, page);
page++;
//parse HTML
const $ = cheerio.load(res.data,{
xmlMode: true,
normalizeWhitespace: true,
decodeEntities: true
});
if (res.data.trim() !== ""){
//extract prices : put it in array
$('tr').map(function(){
var obj = [];
$(this).children('td').map(function(){
obj.push(entities.decodeHTML($(this).text().trim()));
});
prices.push(obj);
});
}
else {
empty = true;
}
}
console.log(prices);
console.log("total length : " + prices.length);
}
getPrices();