I am creating a new Apify actor with Cheerio to read an input file of URLs and return two main items: (1) the HTTP status code and (2) the HTML title. As part of our process, I would like to try up to 4 variations of each input URL, such as:
HTTP://WWW.SOMEURL.COM
HTTPS://WWW.SOMEURL.COM
HTTP://SOMEURL.COM
HTTPS://SOMEURL.COM
If one of the 4 variations is successful, then the process should ignore the other variations and move to the next input URL.
I read the original input list into a RequestList, and then would like to create the variations in a RequestQueue. Is this the most efficient way to do it? Please see code below, and thank you!
const Apify = require('apify');
const {
    utils: { enqueueLinks },
} = Apify;
const urlParse = require('url');

Apify.main(async () => {
    const input = await Apify.getInput();
    const inputFile = input.inputFile;
    console.log('INPUT FILE: ' + inputFile);

    const requestList = await Apify.openRequestList('urls', [
        { requestsFromUrl: inputFile, userData: { isFromUrl: true } },
    ]);
    const requestQueue = await Apify.openRequestQueue();
    const proxyConfiguration = await Apify.createProxyConfiguration();

    const handlePageFunction = async ({ $, request, response }) => {
        const parsedHost = urlParse.parse(request.url).host;
        const simplifiedHost = parsedHost.replace('www.', '');
        const urlPrefixes = ['HTTP://WWW.', 'HTTPS://WWW.', 'HTTP://', 'HTTPS://'];
        for (let i = 0; i < urlPrefixes.length; i++) {
            const newUrl = urlPrefixes[i] + simplifiedHost;
            console.log('NEW URL: ' + newUrl);
            await requestQueue.addRequest({ url: newUrl });
        }

        console.log(`Processing ${request.url}`);
        const results = {
            inputUrl: request.url,
            httpCode: response.statusCode,
            title: $('title').first().text().trim(),
            responseUrl: response.url,
        };
        await Apify.pushData(results);
    };

    const crawler = new Apify.CheerioCrawler({
        proxyConfiguration,
        maxRequestRetries: 0,
        handlePageTimeoutSecs: 60,
        requestTimeoutSecs: 60,
        requestList,
        requestQueue,
        handlePageFunction,
        handleFailedRequestFunction: async ({ request }) => {
            await Apify.pushData({ inputUrl: request.url, httpCode: '000', title: '', responseUrl: '' });
        },
    });

    await crawler.run();
});
You should create your URL list beforehand. The handlePageFunction is only used for the actual scraping part, and you should only have the Apify.pushData call there:
//...
const initRequestList = await Apify.openRequestList('urls', [
    { requestsFromUrl: inputFile },
]);

const parsedRequests = [];
let req;
while ((req = await initRequestList.fetchNextRequest())) {
    const parsedHost = urlParse.parse(req.url).host;
    const simplifiedHost = parsedHost.replace('www.', '');
    const urlPrefixes = ['HTTP://WWW.', 'HTTPS://WWW.', 'HTTP://', 'HTTPS://'];
    for (let i = 0; i < urlPrefixes.length; i++) {
        const newUrl = urlPrefixes[i] + simplifiedHost;
        console.log('NEW URL: ' + newUrl);
        parsedRequests.push({
            url: newUrl,
            userData: { isFromUrl: true },
        });
    }
}

const requestList = await Apify.openRequestList('starturls', parsedRequests);
//...
const crawler = new Apify.CheerioCrawler({
    proxyConfiguration,
    maxRequestRetries: 0,
    handlePageTimeoutSecs: 60,
    requestTimeoutSecs: 60,
    handlePageFunction,
    requestList,
    handleFailedRequestFunction: async ({ request }) => {
        await Apify.pushData({ inputUrl: request.url, httpCode: '000', title: '', responseUrl: '' });
    },
});
//...
requestsFromUrl is a greedy function that tries to parse all URLs from the given resource, so you'll have to perform the variation processing as an additional step.
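The question's other requirement, skipping the remaining variations once one succeeds, isn't handled above. A minimal sketch of one way to do it, assuming all four variants of a host run within the same actor run: keep an in-memory Set of hosts that have already yielded a successful response and bail out early in the handler:

const succeededHosts = new Set(); // hosts that already returned a page

const handlePageFunction = async ({ $, request, response }) => {
    const host = urlParse.parse(request.url).host.replace('www.', '');
    if (succeededHosts.has(host)) return; // another variation already worked
    succeededHosts.add(host);
    await Apify.pushData({
        inputUrl: request.url,
        httpCode: response.statusCode,
        title: $('title').first().text().trim(),
    });
};

The same succeededHosts.has(host) guard can go at the top of handleFailedRequestFunction so failed variants of an already-resolved host don't push '000' rows.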
I'm trying to call the balanceOf() method on a specific Tron contract (TYukBQZ2XXCcRCReAUguyXncCWNY9CEiDQ), but all I get is "Failed to execute". If I do not provide the parameters, I get "Invalid argument count provided", meaning that at some level it works for this contract.
What is interesting is that it works well on contracts other than the ones created with the JustSwap Factory contract, e.g. https://tronscan.io/#/contract/TYukBQZ2XXCcRCReAUguyXncCWNY9CEiDQ/code.
The code includes the standard TRC20 methods, including balanceOf(). I'm stuck and have tried everything possible from my side, but let's just say I'm not fluent in the tronweb API.
My code:
export const getDataToken = async (contractAddress, account, account2) => {
  try {
    const instance = await window.tronWeb.contract(
      [
        {
          constant: true,
          inputs: [{ name: "owner", type: "address" }],
          name: "balanceOf",
          outputs: [{ name: "", type: "uint256" }],
          payable: false,
          stateMutability: "view",
          type: "function"
        }
      ],
      contractAddress
    );
    console.log(instance);
    if (instance.balanceOf) {
      console.log("dadadad if");
      const tokenBalance = await instance.balanceOf(account).call();
      const tokenBalance2 = await instance.balanceOf(account2).call();
      return {
        tokenBalance: (tokenBalance / Math.pow(10, 18)).toString(),
        tokenContract: instance,
        tokenBalance2: (tokenBalance2 / Math.pow(10, 18)).toString()
      };
    }
  } catch (message) {
    console.log("error getData :" + message);
  }
};

const { tokenBalance, tokenContract, tokenBalance2 } = getDataToken(
  "TYukBQZ2XXCcRCReAUguyXncCWNY9CEiDQ",
  "TL4HzzxGMc1LMfs3XCi4yTJikaBVubz5y4",
  "TTFp171XD4JdUB33sDq2ydXJyUEEZjNhLD"
);
This function can help (example for getting JustSwap pair price):
async function takePrice(contractAddress, token) {
    const functionSelector = 'getTokenToTrxInputPrice(uint256)';
    const parameter = [
        {
            type: 'uint256',
            value: token
        }
    ];
    const options = {};
    const transaction = await window.tronWeb.transactionBuilder.triggerConstantContract(contractAddress, functionSelector, options, parameter);
    return window.tronWeb.BigNumber("0x" + transaction['constant_result'][0]);
}

priceUSDTTRX = window.tronWeb.fromSun(await takePrice(USDTTRX_ADDRESS, "1000000"));
priceSomeTone18TRX = window.tronWeb.fromSun(await takePrice(SomeTone18TRX_ADDRESS, "1" + "0".repeat(18)));
I got a good result with this:
const TronWeb = require("tronweb");
const ethers = require('ethers');

const MAINNET_RPC = "https://api.trongrid.io";
const PLACEHOLDER_PRIVATE_KEY = "YOUR_PRIVATE_KEY";

const HttpProvider = TronWeb.providers.HttpProvider;
const fullNode = new HttpProvider(MAINNET_RPC);
const solidityNode = new HttpProvider(MAINNET_RPC);
const eventServer = new HttpProvider(MAINNET_RPC);
const tronWeb = new TronWeb(fullNode, solidityNode, eventServer, PLACEHOLDER_PRIVATE_KEY);

const startJustSwap = async () => {
    try {
        const contractTokenExchangeUSDT = 'TQn9Y2khEsLJW1ChVWFMSMeRDow5KcbLSE'; // S-USDT-TRX Token
        const parameter = [{ type: 'uint256', value: 10000000 }];
        const tx = await tronWeb.transactionBuilder.triggerConstantContract(contractTokenExchangeUSDT, 'trxToTokenSwapInput(uint256,uint256)', {}, parameter);
        console.log(tx);
    } catch (e) {
        console.log(e);
    }
};

startJustSwap();
Try:
const tokenBalance = await instance.balanceOf(account).call({ _isConstant: true });
It works for me. transactionBuilder.triggerConstantContract works too, but mine is simpler.
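Applied to the getDataToken function from the question, only the two calls change (a sketch; the rest of the function stays as posted):

// inside getDataToken, with the _isConstant option added:
const tokenBalance = await instance.balanceOf(account).call({ _isConstant: true });
const tokenBalance2 = await instance.balanceOf(account2).call({ _isConstant: true });

Note also that getDataToken is async, so the destructuring at the call site only gets values if it awaits the returned promise from inside another async function.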
I have a function that fetches data from an API. I now want to move the program logic into a service to keep the controller clean.
I get the data fine with async/await; unfortunately, I don't know how to move it into a service.
Anyone have an idea?
Here is my homeController.js:
const ispwrapper = require('../lib/ispwrapper');
require('dotenv').config();

const BASE_URL = process.env.API_BASE_URL;
const OPTIONS = {
    username: process.env.API_USERNAME,
    password: process.env.API_PASSWORD
};

const renderHome = async (req, res) => {
    let domain = [],
        message = '';
    try {
        let a = new ispwrapper.ISPConfig(BASE_URL, OPTIONS);
        const response = await a.getDataByPrimaryId('sites_web_domain_get', { active: 'y' });
        for (let i = 0; i < response.length; i++) {
            domain.push(response[i].domain);
        }
    } catch (err) {
        message = 'Error when retrieving domains from API';
    } finally {
        res.render('home', {
            title: 'ISPConfig',
            heading: 'Welcome to my ISPConfig Dashboard',
            homeActive: true,
            domain,
            message
        });
    }
};

module.exports = {
    renderHome
};
My homeService.js:
const ispwrapper = require('../lib/ispwrapper');
require('dotenv').config();

const BASE_URL = process.env.API_BASE_URL;
const OPTIONS = {
    username: process.env.API_USERNAME,
    password: process.env.API_PASSWORD
};

const getDomains = async () => {
    // I have no idea how to use my renderHome() logic here
};

module.exports = {
    getDomains
};
Here is my solution.
domainService.js:
const ispwrapper = require('../lib/ispwrapper');
require('dotenv').config();

const BASE_URL = process.env.API_BASE_URL;
const OPTIONS = {
    username: process.env.API_USERNAME,
    password: process.env.API_PASSWORD
};

const getDomains = async () => {
    let data = [];
    try {
        let a = new ispwrapper.ISPConfig(BASE_URL, OPTIONS);
        let response = await a.getDataByPrimaryId('sites_web_domain_get', { active: 'y' });
        for (let i = 0; i < response.length; i++) {
            data.push(response[i].domain);
        }
        return data;
    } catch (err) {
        console.log(err);
    }
};

module.exports = {
    getDomains
};
homeController.js:
const homeService = require('../services/domainService');

const renderHome = async (req, res) => {
    let message = '',
        domain = [];
    try {
        let response = await homeService.getDomains();
        for (let i = 0; i < response.length; i++) {
            domain.push(response[i]);
        }
    } catch (err) {
        message = 'Error when retrieving domains from API';
    } finally {
        res.render('home', {
            title: 'ISPConfig',
            heading: 'Welcome to my ISPConfig Dashboard',
            homeActive: true,
            domain,
            message
        });
    }
};

module.exports = {
    renderHome
};
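One design note on this solution: getDomains catches errors itself and then implicitly returns undefined, so the controller's catch block only fires because iterating over undefined throws a TypeError. If the controller is supposed to own the error message, a variant that rethrows after logging makes that explicit (a sketch of the same service function):

const getDomains = async () => {
    try {
        const a = new ispwrapper.ISPConfig(BASE_URL, OPTIONS);
        const response = await a.getDataByPrimaryId('sites_web_domain_get', { active: 'y' });
        return response.map((site) => site.domain);
    } catch (err) {
        console.log(err);
        throw err; // let the controller decide how to report the failure
    }
};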
I cannot figure out why the script below won't run. It is likely the script won't do exactly what I want yet, but running
node ./contentful/contentful-assets.js
in the terminal does nothing: no errors, nothing logged for me to even start debugging. However, if I remove the async, it attempts to run the script and shoots back an error.
./contentful/contentful-assets.js
const contentful = require('contentful-management');
const iterator = require('make-iterator');
const assets = require('./assetObject.js');

async resolve => {
    console.log('Creating Contentful client');
    const client = contentful.createClient({
        accessToken: 'token',
        logHandler: (level, data) => console.log(`${level} | ${data}`)
    });

    const iterableAssets = iterator(assets);
    const space = await client.getSpace('space');
    const environment = await space.getEnvironment('enviroment');
    const cmsAssets = [];
    const assetProcessingTimes = [];
    const inProcess = new Map();
    let processedAssetsCounter = 0;

    const createAndPublishSingleAsset = async ({ asset, done, index }) => {
        if (done) {
            if (inProcess.size > 0) return false;
            return resolve(cmsAssets);
        }
        const start = Date.now();
        const id = '' + start + Math.round(Math.random() * 100);
        inProcess.set(id, true);
        let cmsAsset;
        try {
            cmsAsset = await environment.createAssetWithId(asset.postId, {
                fields: {
                    title: {
                        'en-US': asset.title
                    },
                    description: {
                        'en-US': asset.description
                    },
                    file: {
                        'en-US': {
                            contentType: 'image/jpg',
                            fileName: asset.filename,
                            upload: asset.link
                        }
                    }
                }
            });
        } catch (e) {
            console.log(`Asset "${asset.title}" failed to create, retrying...`);
            createAndPublishSingleAsset({
                asset,
                done,
                index
            });
        }
        try {
            const processedCMSAsset = await cmsAsset.processForAllLocales();
            const publishedCMSAsset = await processedCMSAsset.publish();
            cmsAssets.push(publishedCMSAsset);
            assetProcessingTimes.push((Date.now() - start) / 1000);
            inProcess.clear(id);
            const eta = Math.floor(
                assetProcessingTimes.reduce((a, b) => a + b, 0) /
                assetProcessingTimes.length *
                (assets.length - index) /
                60
            );
            processedAssetsCounter += 1;
            console.log(
                `Processed asset ${processedAssetsCounter}/${assets.length} - eta: ${eta}m`
            );
            createAndPublishSingleAsset(iterableAssets.next());
        } catch (e) {
            console.log(`Asset "${asset.title}" failed to process, retrying...`);
            await cmsAsset.delete();
            createAndPublishSingleAsset({
                asset,
                done,
                index
            });
        }
    };

    console.log('Starting to create assets');
    createAndPublishSingleAsset(iterableAssets.next());
    createAndPublishSingleAsset(iterableAssets.next());
    createAndPublishSingleAsset(iterableAssets.next());
};
assetObject.js
[
    {
        link: 'https://example.com/example1.jpg',
        title: 'Example 1',
        description: 'Description of example 1',
        postId: '1234567890',
        filename: 'example1.jpeg'
    }, ... // Many more
]
What have I missed here?
I suspect you are not calling the function. Could you try the following?
const contentful = require('contentful-management');
const iterator = require('make-iterator');
const assets = require('./assetObject.js');

const doWork = async resolve => {
    console.log('Creating Contentful client');
    ...
};

doWork();
You are just declaring an async function that contains all of the code, but you are never actually calling it.
In this code snippet you are declaring a function, but never invoking it:
// declaring an async function, with "resolve" as the argument
async resolve => {
    // function definition
};
In order to be able to reference the function later and invoke it, you can assign it to a const/let/etc.:
const createAssets = async resolve => { };

// now, invoke
createAssets();
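If you don't want a named function at all, another common pattern is an async IIFE, which defines and immediately invokes the function in one expression (sketch, with the body elided):

// define and invoke in one step
(async () => {
    console.log('Creating Contentful client');
    // ... rest of the original body ...
})();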
I'm trying to scrape a website with a load-more button, but I can't get a recursive function to work in Nightmare. My code is something like this:
const Nightmare = require('nightmare');
const nightmare = Nightmare({
    show: true
});
const request = require('request');
const cheerio = require('cheerio');

let url = 'https://www.housers.com/es/proyectos/avanzado';
let propertyArray = [];

var getThePage = function () {
    nightmare
        .goto('https://www.housers.com/es/proyectos/avanzado')
        .wait(1500)
        .click('#loadMore')
        .evaluate(() => {
            return document.querySelector('.all-info').innerHTML;
        })
        .end()
        .then((result) => {
            let $ = cheerio.load(result);
            let loadMore = $('#loadMore');
            if (loadMore) {
                getThePage();
            }
            return result;
        })
        .catch((error) => {
            console.error('Search failed:', error);
        });
};

getThePage();
Is there a way to do it with this method, or does anyone have another idea?
If you want to scrape the data in the table, you don't need to use Nightmare. From the network tab, you can see that the page calls this endpoint:
https://www.housers.com/es/proyectos/avanzado/scroll
with some pagination & page-size parameters. Let's take 200 per page (I don't know if that's above the limit).
Then you just have to parse the HTML & put the data in an array:
const axios = require('axios');
const querystring = require('querystring');
const cheerio = require('cheerio');
const entities = require("entities");

const url = 'https://www.housers.com/es/proyectos/avanzado/scroll';
const prices = [];

function doRequest(url, page) {
    return axios.post(url + '?page=' + page + '&size=200', querystring.stringify({
        word: "",
        country: "",
        type: "",
        order: "STOCK_PRICE_VARIATION",
        orderDirection: "DESC"
    }));
}

async function getPrices() {
    var empty = false;
    var page = 0;
    while (!empty) {
        // call API
        console.log("GET page n°" + page);
        var res = await doRequest(url, page);
        page++;

        // parse HTML
        const $ = cheerio.load(res.data, {
            xmlMode: true,
            normalizeWhitespace: true,
            decodeEntities: true
        });

        if (res.data.trim() !== "") {
            // extract prices: put them in the array
            $('tr').map(function () {
                var obj = [];
                $(this).children('td').map(function () {
                    obj.push(entities.decodeHTML($(this).text().trim()));
                });
                prices.push(obj);
            });
        } else {
            empty = true;
        }
    }
    console.log(prices);
    console.log("total length : " + prices.length);
}

getPrices();
This is the module that collects and exports async data: scraper.js
const express = require('express')
const cheerio = require('cheerio')
const request = require("tinyreq")
const fs = require('fs')
const _ = require('lodash')
const uuid = require('uuid/v4')
const async = require('async')

const mental_models = {
    url: 'https://www.farnamstreetblog.com/mental-models/',
    data: {}
}
const decision_making = {
    url: 'https://www.farnamstreetblog.com/smart-decisions/',
    data: {}
}
const cognitive_bias = {
    url: 'https://betterhumans.coach.me/cognitive-bias-cheat-sheet-55a472476b18',
    data: {}
}
const DATA_URLS = [mental_models, decision_making, cognitive_bias]

const filterScrape = async (source, params) => {
    let filtered_data = {
        topics: [],
        content: [],
        additional_content: []
    }
    let response = await scrape(source)
    try {
        let $ = cheerio.load(response)
        params.forEach((elem) => {
            let headers = ['h1', 'h2', 'h3']
            if ($(elem) && headers.includes(elem)) {
                let topic = {}
                let content = {}
                let id = uuid()
                topic.id = id
                topic.text = $(elem).text()
                if ($(elem).closest('p')) {
                    content.text = $(elem).closest('p').text()
                    content.id = id
                }
                filtered_data.topics.push(topic)
                filtered_data.content.push(content)
            } else if ($(elem) && !headers.includes(elem)) {
                let content = {}
                let id = uuid()
                content.text = $(elem).text()
                content.id = id
                filtered_data.additional_content.push(content)
            } else {
            }
        })
    }
    catch (err) {
        console.log(err)
    }
    return filtered_data
}

const scrape = (source) => {
    return new Promise((resolve, reject) => {
        request(source.url, function (err, body) {
            if (err) {
                reject(err)
                return
            }
            resolve(body)
        })
    })
}

const DATA = _.map(DATA_URLS, async (source) => {
    let params = ['h1', 'h2', 'h3', 'p']
    let new_data = await filterScrape(source, params)
    try {
        source.data = new_data
    }
    catch (err) {
        console.log(err)
    }
})

module.exports = DATA
This is the module that imports the data: neural.js
const brain = require('brain')
const neural_net = new brain.NeuralNetwork()
const DATA = require('./scraper')
console.log(DATA)
Obviously there is not much going on; I've removed the rest of the code since the variable doesn't resolve. When logged, DATA logs promises, but the promises do not resolve. However, in the imported module, the promise is logged and then resolves. What gives? Should I import a function that resolves the data?
Of course it would be best to import that function; however, that won't change the issue in your code, which is here:
const DATA = _.map(DATA_URLS, async (source) => {
Lodash doesn't support async iteration, so DATA ends up as an array of pending promises and you need some other method. One would be to use the newest Node.js version (10.x) and make use of async iteration, but that won't use the full power of asynchronous code.
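A dependency-free alternative (a sketch reusing the question's own filterScrape and DATA_URLS; getData is a name introduced here) is to export an async function that collects the promises and awaits them with Promise.all:

// map with an async callback returns an array of promises;
// Promise.all waits for all of them before the data is used.
const getData = async () => {
    const params = ['h1', 'h2', 'h3', 'p'];
    return Promise.all(DATA_URLS.map(async (source) => {
        source.data = await filterScrape(source, params);
        return source;
    }));
};

module.exports = getData;

This fires all the requests at once with no concurrency limit, though.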
You can also use scramjet, a framework my company supports. The code above would take the following form:
const { DataStream } = require("scramjet");

const DATA_URLS = [mental_models, decision_making, cognitive_bias];

module.exports = async () => DataStream.fromArray(DATA_URLS)
    .setOptions({ maxParallel: 2 }) // if you need to limit that at all
    .map(async (source) => {
        let params = ['h1', 'h2', 'h3', 'p'];
        let data = await filterScrape(source, params);
        return { url: source.url, data };
    })
    .toArray();
The other file would take the following form:
const brain = require('brain');
const neural_net = new brain.NeuralNetwork();
const scraper = require('./scraper');

(async () => {
    const DATA = await scraper();
    console.log(DATA); // or do whatever else you were expecting...
})();