Apify - How to Include Failed Results in Dataset - javascript

We are using the Apify Web Scraper actor to create a URL validation task that returns the input URL, the page's title, and the HTTP response status code. We have a set of 5 test URLs we are using: 4 valid, and 1 non-existent. The successful results are always included in the dataset, but never the failed URL.
Logging indicates that the pageFunction is not even reached for the failed URL:
2021-05-05T14:50:08.489Z ERROR PuppeteerCrawler: handleRequestFunction failed, reclaiming failed request back to the list or queue {"url":"http://www.invalidurl.com","retryCount":1,"id":"XS9JTk8dYRM8bpM"}
2021-05-05T14:50:08.490Z Error: gotoFunction timed out after 30 seconds.
2021-05-05T14:50:08.490Z at PuppeteerCrawler._handleRequestTimeout (/home/myuser/node_modules/apify/build/crawlers/puppeteer_crawler.js:387:15)
2021-05-05T14:50:08.496Z at PuppeteerCrawler._handleRequestFunction (/home/myuser/node_modules/apify/build/crawlers/puppeteer_crawler.js:329:26)
Eventually it times out, based on our settings:
2021-05-05T14:50:42.052Z ERROR Request http://www.invalidurl.com failed and will not be retried anymore. Marking as failed.
2021-05-05T14:50:42.052Z Last Error Message: Error: gotoFunction timed out after 30 seconds.
I tried wrapping the code in the pageFunction in a try/catch block, but again, because the pageFunction is not getting reached for the invalid URL, that does not resolve the issue. Is there a way to still include the failed result in the dataset with a hard-coded response status code of '000'? (See pageFunction code below.) Please let me know if I can provide any additional information, and thanks in advance!
async function pageFunction(context) {
context.log.info("Starting pageFunction");
// use jQuery as $
const { request, jQuery: $ } = context;
const { url } = request;
context.log.info("Trying " + url);
let title = null;
let responseCode = null;
try {
context.log.info("In try block for " + url);
title = $('title').first().text().trim();
responseCode = context.response.status;
} catch (error) {
context.log.info("EXCEPTION for " + url);
title = "";
responseCode = "000";
}
return {
url,
title,
responseCode
};
}

you can use https://sdk.apify.com/docs/typedefs/puppeteer-crawler-options#handlefailedrequestfunction:
you can then push it to the when all retries fail:
handleFailedRequestFunction: async ({ request }) => {
// failed all retries
await Apify.pushData({ url: request.url, responseCode: '000' });
}

Related

How to rerun and stop app script based on the execution log info?

I am looking to run below app script with multiple var url such as
https://www.testurl.com/link1=processing
https://www.testurl.com/link2=processing
https://www.testurl.com/link3=processing
How to add multiple url to below code:
function url() {
var url = 'https://www.testurl.com/link1=processing';
var options = {
'method': 'get'
};
var response = UrlFetchApp.fetch(url, options);
Logger.log(response);
}
Each run with single url shows below execution log
3:33:30 AM Notice Execution started
3:33:31 AM Info {"status":200,"message":"Import 25 of 200 completed."}
3:33:31 AM Notice Execution completed
I am looking to rerun script for first url till i get the log message containing "is not triggered" and then run second url and rerun till i get the log message containing "is not triggered" and then run third url and rerun till i get the log message containing "is not triggered" and then stop.
3:33:30 AM Notice Execution started
3:33:31 AM Info {"status":403,"message":"Import #16 is not triggered. Request skipped."}
3:33:31 AM Notice Execution completed
There is really no way for me to test this but I think it illustrates how this can be done:
function getUrls() {
try {
var urls = [ 'https://www.testurl.com/link1=processing',
'https://www.testurl.com/link2=processing',
'https://www.testurl.com/link3=processing' ];
function getUrl(url) {
var options = { 'method': 'get' };
var response = UrlFetchApp.fetch(url, options);
while( response.message.indexOf("is not triggered") < 0 ) {
Logger.log(response);
response = UrlFetchApp.fetch(url, options);
}
}
urls.forEach( getUrl );
}
catch(err) {
Logger.log(err);
}
}

Postman Get value of url request parameter to do a test

I am trying to get a url from the get request params.
How to Construct request URL from environment variables
In Pre-request script urlwfsservice and orderid is set from environment variables
{{urlwfsservice}}/v1/merchantorders/{{orderId}}/BoardingActivities.updateMerchantToHub.REQUEST/details/
When use
var urlnew =request.url;
console.log(request.url);
I get this output as variables name not actual value or url
{{urlwfsservice}}/v1/merchantorders/{{orderId}}/BoardingActivities.updateMerchantToHub.REQUEST/details/
how can I get output like below simpleurl ?
var simpleurl = “https://dev-someweburl.com/v1/merchantorders/ZN2aB/BoardingActivities.updateMerchantToHub.REQUEST/details/”;
Complete Pre-request script Code
// how to Construct request URL from environment variables
console.log("logging url");
var urlnew =request.url;
console.log(urlnew);
//var url = "https://dev-someweburl.com/v1/merchantorders/ZN2aB/BoardingActivities.updateMerchantToHub.REQUEST/details/";
var retryDelay = 200;
var retryLimit = 5;
function isProcessingComplete(retryCount) {
pm.sendRequest(urlnew, function (err, response) {
if(err) {
// hmmm. Should I keep trying or fail this run? Just log it for now.
console.log(err);
} else {
// I could also check for response.json().results.length > 0, but that
// would omit SUCCESS with empty results which may be valid
if(response.json().auditRecords.length === 0) {
if (retryCount < retryLimit) {
console.log('Job is still PENDING. Retrying in ' + retryDelay + 'ms');
setTimeout(function() {
isProcessingComplete(++retryCount);
}, retryDelay);
} else {
console.log('Retry limit reached, giving up.');
postman.setNextRequest(null);
}
}
}
});
}
isProcessingComplete(1);
console.log(pm.variables.replaceIn(pm.request.url.toString()))
you can use replaceIn method to replace variables to its actual values. Also use pm. as pm is the new api or syntax in postman

How to do a JSON api call with error response?

I use Javascript to retrieve through a JSON api call the amount of active products pasted in a certain filter.
My typicall response would be
{"total":34,"product_ids":["PRODUCT1","PRODUCT2",....."]}
My script is working fine when products are present but when none of the products are active the response will be:
{"error":"No products found, please check request settings"}
In this case the script will crash.
What I tried to do is to set the var NumEdPicks to 0 when I get an error but I don't really know how as the script is crashing when it doesn't find "total".
This is what the retrieve part of the script looks like
// Retrieve
var url = 'http://api.jetlore.com/products/products_by_filter.json?jl_cid=' + clientID + '&filter=' + filterName + '&per_page=' + maxCount + '&page=1';
var response = HTTP.Get(url);
var responseObj = Platform.Function.ParseJSON(response["Content"]);
var NumEditorsPick = responseObj.total;
if(NumEditorsPick>maxCount){ var NumEditorsPick = maxCount;}
I would like to set NumEditorsPick to 0 when I get the error response.
Some things I was thinking about but which isn't working:
var NumEditorsPick = responseObj.total || 0
or
var NumEditorsPick = ‘total’ in responseObj ? responseObj.total : 0
How to define NumEditorsPick when there is no total?
I've tried so far:
if (responseObj.hasOwnProperty('total')){
var NumEditorsPick = responseObj.total;
}else{
var NumEditorsPick = 0;
}
And
if (responseObj.has("total")){var NumEditorsPick = responseObj.total;
}
if (responseObj.has("error")){var NumEditorsPick = 0;
}
Both are crashing the execution of my script, so I'm starting to think that when there is an error response it just stops the script and ignores the rest, would that be possible? In that case, how to ignore this error response?
EDIT:
After using the try/catch method as suggested in the comments, I managed to finally make it work:
var NumEditorsPick;
try {
var response = HTTP.Get(url);
var responseObj = Platform.Function.ParseJSON(response["Content"]);
NumEditorsPick = responseObj.total;
} catch (error) {
NumEditorsPick = 0;
}
You can use Javascript's hasOwnProperty() to check if the parse JSON has the key you're looking for.
In this case, I'd be something like:
var responseObj = Platform.Function.ParseJSON(response["Content"]);
if (responseObj.hasOwnProperty('error')){
// handle error msg
}else{
// do something else
}
Here's a simple example using the JSON input you've provided.
Update
Ok, so my initial answer was based on what you said here:
My script is working fine when products are present but when none of
the products are active the response will be:
{"error":"No products found, please check request settings"}
But the service you're calling does not return a JSON string containing the error. Instead it returns a 404 and therefore, any attempt to parse or use the response content is not valid.
So, to start, you could try wrapping your HTTP.Get(url)in a try/catch method and on the catch clause set the NumEdPicks to zero.
Another option would be to check HTTP.Get() method documentation to see if the response object has a status (e.g: response.Status) or if you can pass a callback function for response and error, like this example in AJAX:
$.ajax({
url: 'yourUrl',
type: 'GET',
success: function(data){
// Set NumEdPicks to total and do other stuff
},
error: function(data) {
// Set NumEdPicks to zero
}
});

Error in HTTP GET Request with gadgets.io.makeRequest

Sorry but I spent a half a day practicing first with gadgets.io.makeRequest, and can not understand why the request response contains an error. The code is Javascript working as OpenSocial gadget:
requestURI = "https://jazz.server.com:9443/rm/views?projectURL=https%3A%2F%2Fjazz.server.com%3A9443%2Frm%2Fprocess%2Fproject-areas%2F_FvrWIG3nEeexYJvvGxVsZg&oslc.query=true&oslc.prefix=rt=<https://jazz.server.com:9443/rm/types/>&oslc.select=rt:_W0SGoW3nEeexYJvvGxVsZg";
makeGETRequest(requestURI);
...
function makeGETRequest(url) {
try {
var params = {};
params[gadgets.io.RequestParameters.METHOD] = gadgets.io.MethodType.GET;
params[gadgets.io.RequestParameters.HEADERS] = {
"Accept" : "application/rdf+xml",
"OSLC-Core-Version": "2.0"
}
gadgets.io.makeRequest(url, function(obj) {
console.log("===== HTTP REQUEST START =====");
console.log("Method : GET");
console.log("URL : " + url);
console.log("Response : " + obj.text);
console.log("====== HTTP REQUEST END ======");
}, params);
}
catch(err) {
console.log("Can not perform HTTP request because of error: " + err.message);
}
};
When I do the same request with REST Client in Firefox, everything works properly. But if I do that with the code above, then I get an error in the log (abbreviated):
===== HTTP REQUEST START =====
common.js:311 Method : GET
common.js:312 URL : https://jazz.server.com:9443/rm/views?projectURL=https%3A%2F%2Fjazz.server.…roject-areas%2F_FvrWIG3nEeexYJvvGxVsZg&oslc.query=true&oslc.prefix=rt=<https://jazz.server.com:9443/rm/types/>&oslc.select=rt:_W0SGoW3nEeexYJvvGxVsZg
common.js:313 Response : {"errorMessage":"Illegal character in query at index 178: https://jazz.server.com:9443/rm/views?projectURL=https%3A%2F%2Fjazz.server.com%3A9443%2Frm%2Fprocess%2Fproject-areas%2F_FvrWIG3nEeexYJvvGxVsZg&amp;oslc.query=true&oslc.prefix=rt=<https://jazz.server.com:9443/rm/types/>&oslc.select=rt:_W0SGoW3nEeexYJvvGxVsZg","errorClass":"java.lang.IllegalArgumentException","errorTrace":["java.net.URI.create(URI.java:871)","org.apache.http.client.methods.HttpGet.<init>
...
common.js:314 ====== HTTP REQUEST END ======
I tried to replace greater and less symbols by their hex values but there's no result. And there's no ideas currently.
May be somebody could make a fresh sight to the code and define the problem on the fly. Help me please, I'm at a dead end.
Thank you very much in advance for any advice!
The error in your Response indicates the Java system on the server side can't create a valid URI from your query. Therefor it throws back an error
My best guess would be the dot just before query=true in oslc.query=true. And therefor all following uses of oslcDOT .
From RFC 1738 specification:
Thus, only alphanumerics, the special characters "$-_.+!*'(),", and reserved characters used for their reserved purposes may be used unencoded within a URL.
I discovered that gadgets.io.makeRequest isn't very stable as I would like to expect. May be I do some wrong but sometimes this function completes without any feedback and without starting the response function in the parameters. I changed to next code:
function makeGETRequest(urlValue) {
try {
$.ajax({
url: urlValue,
type: 'GET',
dataType: 'text',
headers: {
'Accept': 'application/rdf+xml',
'OSLC-Core-Version': '2.0'
},
success: function (result) {
var data = result;
},
error: function (error) {
console.log("Can not perform HTTP request because of error: " + error.message);
}
});
}
catch(err) {
console.log("Can not perform HTTP request because of error: " + err.message);
}
};
And there's no problem!

How do I capture an aborted call or is setting the timeout to 0 correct?

I have a JavaScript client that works in Chrome and Firefox, but fails in IE. Looking at the network trace in the IE debugger it shows that multiple of the AJAX calls have been aborted.
I've been able to get around it by setting the timeout to 0. I'd like to know if this is the correct way to handle my requests being aborted? Basically what could go wrong?
My initial thought was that I should capture and resend on error, and if multiple resubmits do not result in a completed request, finally alert the user. I'd still like to know how to do this even if the setTimeout is the proper way to address my immediate issue.
Also the application will process an excel workbook of addresses, call a web service to add some data to them and then allow the user to download the enhanced file.
This is what I have so far, first in the app.js
var requestWithFeedback = function (args) {
$(".loader").removeClass('hidden');
var oldConfig = args.config || function () { };
args.config = function (xhr) {
xhr.setRequestHeader("Authorization", "Bearer " + localStorage.token);
oldConfig(xhr);
extract: extract;
};
var deferred = m.deferred();
setTimeout(function () { // <== This solved in IE, but is this the way to handle this?
m.request(args).then(deferred.resolve, function(err){
if (err === "Invalid token!"){
m.route('/');
}
})}, 0);
$(".loader").addClass('hidden');
return deferred.promise;
}
From the model.js
app.MarkedAddresses.ProcessAddressBatch = function () {
var requestData = {
Addresses: app.MarkedAddresses.vm.addresses
}
return requestWithFeedback({
method: "POST"
, url: "API/server.ashx"
, data: requestData
, deserialize: function (value) { return value; }
})
.then(function (value) {
var responseJSON = $.parseJSON(value);
$.merge(app.MarkedAddresses.vm.results, responseJSON)
app.MarkedAddresses.vm.currentRecord(app.MarkedAddresses.vm.results.length);
app.MarkedAddresses.vm.progress(Math.max(app.MarkedAddresses.vm.progress(), ~~(app.MarkedAddresses.vm.currentRecord() / app.MarkedAddresses.vm.totalRecords() * 100)));
m.redraw(); //Force redraw for progress bar
return value;
},
function (error) { console.log(error) } // <== I thought error would show up here, but I never hit a breakpoint here.
);
}
Added loops
function process_wb(wb) {
app.MarkedAddresses.vm.results.length = 0;
$('.descending').removeClass("descending");
$('.ascending').removeClass("ascending");
app.MarkedAddresses.vm.progress(.1);
m.redraw();
var header = mapHeader(wb);
var addressJSON = to_json(wb, header);
app.MarkedAddresses.vm.totalRecords(addressJSON.length);
for (var i = 0; (i < addressJSON.length + 1) ; i += 1000) {
app.MarkedAddresses.vm.addresses = addressJSON.slice(i, Math.min(((i) + 1000), addressJSON.length));
app.MarkedAddresses.vm.response(new app.MarkedAddresses.vm.processAddressBatch());
}
}
Why isn't the error triggered in the section of the code?
It seems like I should add a deferred section here, but anything I've tried has been a syntax error.

Categories