Nightmare.js does not work with Azure webjob - javascript

I am trying to run an azure webjob which takes a json object and renders a webpage, then prints it to pdf, via the electron browser in Nightmare.js.
When I run this locally it works perfectly, but when I run it in azure webjob it never completes.
I get the two console.log statements output to the log, but seeing as I can not output anything from the nightmare.js calls, nor display the electron browser window, I have no idea what is going wrong.
There is also a webserver in the script, omitted as it seems to take the request with the json object and pass it to createPage just fine.
I have verified that index.html file is in the right directory. Does anyone know what might be wrong?
var Nightmare = require('nightmare'),
http = require('http');
function createPage(o, final) {
var start = new Date().getTime();
var page = Nightmare({
//show: true, //uncomment to show electron browser window
//openDevTools: { mode: 'detach'}, //uncomment to open developer console ('show: true' needs to be set)
gotoTimeout: 300000, //set timeout for .goto() to 2 minutes
waitTimeout: 300000, //set timeout for .wait() to 5 minutes
executionTimeout: 600000 //set timeout for .evaluate() to 10 minutes
})
.goto('file:\\\\' + __dirname + '\\index.html');
page.wait("#ext-quicktips-tip") //wait till HTML is loaded
.wait(function () { // wait till JS is loaded
console.log('Extjs loaded.');
return !!(Ext.isReady && window.App && App.app);
});
console.log("CreatePage()1");
page.evaluate(function (template, form, lists, printOptions) {
App.pdf.Builder.create({
template: template,
form: form,
lists: lists,
format: o.printOptions.format,
});
console.log('Create done');
}, template, form, o.lists, printOptions);
console.log("CreatePage()2");
page.wait(function () {
console.log('Content created. ' + App.pdf.Builder.ready);
return App.pdf.Builder.ready;
})
.pdf(o.outputDir + form.filename, { "pageSize": "A4", "marginsType": 1 })
.end()
.then(function () {
console.log('Pdf printed, time: ' + (new Date().getTime() - start) / 1000 + ' seconds');
final(true);
})
.catch(function (err) {
console.log('Print Error: ' + err.message);
});
}
Solved
As Rick states in his answer, this will not currently work!
This document lists the current state of webjobs sandbox:
https://github.com/projectkudu/kudu/wiki/Azure-Web-App-sandbox
It has the following paragraph relating to my issue:
PDF generation from HTML
There are multiple libraries used to convert HTML to PDF. Many Windows/.NET specific versions leverage IE APIs and therefore leverage User32/GDI32 extensively. These APIs are largely blocked in the sandbox (regardless of plan) and therefore these frameworks do not work in the sandbox.
There are some frameworks that do not leverage User32/GDI32 extensively (wkhtmltopdf, for example) and we are working on enabling these in Basic+ the same way we enabled SQL Reporting.

I guess for nightmare.js to work you need desktop interaction, which you're not getting on a WebJob.
Taken from this issue on Github:
Nightmare isn't truly headless: it requires an Electron instance to
work, which in turn requires a framebuffer to render properly (at
least, for now).
This will not fly on an Azure WebJob.

Related

Does anyone know how to define navigator online in main process in electron?

I know you can use navigator onLine inside the renderer process because it's a rendered inside a browser. But what I'm trying to do is something like this in the main process:
if (navigator.onLine){
mainWindow.loadURL("https://google.com")
} else {
mainWindow.loadFile(path.join(__dirname, 'index.html'));
}
So basically if the user is offline, just load a local html file, and if they're online, take them to a webpage. But, like expected, I keep getting the error that 'navigator is not defined'. Does anyone know how can I somehow import the navigate cdn in the main process? Thanks!
TL;DR: The easiest thing to do is to just ask Electron. You can do this via the net module from within the Main Process:
const { net } = require ("electron");
const isInternetAvailable = () => return net.isOnline ();
// To check:
if (isInternetAvailable ()) { /* do something... */ }
See Electron's documentation on the method; specifically, this approach doesn't tell you whether your service is accessible via the internet, but rather that a service can be contacted (or not even this, as the documentation mentions links which would not involve any HTTP request at all).
However, this is not a reliable measurement and you might want to increase its hit rate by manuallly checking whether a certain connection can be made.
In order to check whether an internet connection is available, you'll have to make a connection yourself and see if it fails. This can be done from the Main Process using plain NodeJS:
// HTTP code basically from the NodeJS HTTP tutorial at
// https://nodejs.dev/learn/making-http-requests-with-nodejs/
const https = require('https');
const REMOTE_HOST = "google.com"; // Or your domain
const REMOTE_EP = "/"; // Or your endpoint
const REMOTE_PAGE = "https://" + REMOTE_HOST + REMOTE_EP;
function checkInternetAvailability () {
return new Promise ((resolve, reject) => {
const options = {
hostname: REMOTE_HOST,
port: 443,
path: REMOTE_EP,
method: 'GET',
};
// Try to fetch the given page
const req = https.request (options, res => {
// Yup, that worked. Tell the depending code.
resolve (true);
req.destroy (); // This is no longer needed.
});
req.on ('error', error => {
reject (error);
});
req.on ('timeout', () => {
// No, connection timed out.
resolve (false);
req.destroy ();
});
req.end ();
});
}
// ... Your window initialisation code ...
checkInternetAvailability ().then (
internetAvailable => {
if (internetAvailable) mainWindow.loadURL (REMOTE_PAGE);
else mainWindow.loadFile (path.join (__dirname, 'index.html'));
// Call any code needed to be executed after this here!
}
).catch (error => {
console.error ("Oops, couldn't initialise!", error);
app.quit (1);
});
Please note that this code here might not be the most desirable since it just "crashes" your app with exit code 1 if there is any error other than connection timeout.
This, however, makes your startup asynchronous, which means that you need to pay attention on the execution chain of your app startup. Also, startup may be really slow in case the timeout is reached, it may be worth considering NodeJS' http module documentation.
Also, it makes sense to actually try to retrieve the page you're wanting to load in the BrowserWindow (constant values REMOTE_HOST and REMOTE_EP), because that also gives you an indication whether your server is up or not, although that means that the page will be fetched twice (in the best case, when the connection test succeeds and when Electron loads the page into the window). However, that should not be that big of a problem, since no external assets (images, CSS, JS) will be loaded.
One last note: This is not a good metric of whether any internet connection is available, it just tells you whether your server answered within the timeout window. It might very well be that any other service works or that the connection just is very slow (i.e., expect false negatives). Should be "good enough" for your use-case though.

Unable to load a specific URL with Cypress

I’m unable to load the following URL with Cypress. Getting timeout error. I have set the page load time to 2 mins, still same issue. General URLs eg. (https://www.google.co.nz/) works fine.
it(‘First Test’, () => {
cy.visit(‘https://shop.countdown.co.nz/‘)
})
Here's a way, not the best, could be improved...
The Countdown site has an aversion to being run in an iframe, but it can be tested in a child window, see custom command here Cypress using child window
Cypress.Commands.add('openWindow', (url, features) => {
const w = Cypress.config('viewportWidth')
const h = Cypress.config('viewportHeight')
if (!features) {
features = `width=${w}, height=${h}`
}
console.log('openWindow %s "%s"', url, features)
return new Promise(resolve => {
if (window.top.aut) {
console.log('window exists already')
window.top.aut.close()
}
// https://developer.mozilla.org/en-US/docs/Web/API/Window/open
window.top.aut = window.top.open(url, 'aut', features)
// letting page enough time to load and set "document.domain = localhost"
// so we can access it
setTimeout(() => {
cy.state('document', window.top.aut.document)
cy.state('window', window.top.aut)
resolve()
}, 10000)
})
})
Can test with that like this
cy.openWindow('https://shop.countdown.co.nz/').then(() => {
cy.contains('Recipes').click()
cy.contains('Saved Recipes', {timeout:10000}) // if this is there, have navigated
})
I bumped the setTimeout() in custom command to 10 seconds, cause this site drags it's feet a bit.
Configuration:
// cypress.json
{
"baseUrl": "https://shop.countdown.co.nz/",
"chromeWebSecurity": false,
"defaultCommandTimeout": 20000 // see below for better way
}
Command timeout error
Using Gleb's child window command, there's a timeout error that I can't track the source of.
To avoid it I set "defaultCommandTimeout": 20000 in config, but since it's only needed for the openWindow call it's better to remove the global setting and use this instead
cy.then({timeout:20000}, () => {
cy.openWindow('https://shop.countdown.co.nz/', {}).then(() => {
cy.contains('Recipes').click()
cy.contains('Saved Recipes', {timeout:10000}) // if this is there, have navigated
})
})
To check if the long command timeout only applies once, break one of the inner test commands and check that that it times out in the standard 4000 ms.
cy.then({timeout:20000}, () => {
cy.openWindow('https://shop.countdown.co.nz/', {}).then(() => {
cy.contains('Will not find this').click() // Timed out retrying after 4000ms
The quotes are wrong. Try the below code:
it('First Test', ()=>{ cy.visit('https://shop.countdown.co.nz/') })
On trying to visit the URL I am getting the error:
cy.visit() failed trying to load:
https://shop.countdown.co.nz/
We attempted to make an http request to this URL but the request
failed without a response.
We received this error at the network level:
Error: ESOCKETTIMEDOUT
Common situations why this would fail:
you don't have internet access
you forgot to run / boot your web server
your web server isn't accessible
you have weird network configuration settings on your computer
Error Screenshot:
Lets look into the common situations where this might happen:
you don't have internet access: I have a internet access, so this can be ruled out.
you forgot to run / boot your web server - Your site is accessible from a normal browser, this can be ruled out as well.
your web server isn't accessible - This is a possibility where may be there are firewall settings at the server end because of which cypress is not getting any response when accessing the site.
you have weird network configuration settings on your computer - This can be ruled out as well.
I had a similar issue, so what I observed in my case was that the URL was not getting added to the iframe src property and hence cy.visit() was getting timed out each time.
So, I added the URL manually to the src property of the iframe.
Here's my custom command for reference:
Cypress.Commands.add('goto', url => {
return new Promise(res => {
setTimeout(() => {
const frame = window.top.document.getElementsByClassName('aut-iframe')[0];
frame.src = url;
var evt = window.top.document.createEvent('Event');
evt.initEvent('load', false, false);
window.dispatchEvent(evt);
res();
}, 300);
});
});
Now use cy.goto('https://yoururl.com') and you are good to go.

Retrieve html content of a page several seconds after it's loaded

I'm coding a script in nodejs to automatically retrieve data from an online directory.
Knowing that I had never done this, I chose javascript because it is a language I use every day.
I therefore from the few tips I could find on google use request with cheerios to easily access components of dom of the page.
I found and retrieved all the necessary information, the only missing step is to recover the link to the next page except that the one is generated 4 seconds after loading of page and link contains a hash so that this step Is unavoidable.
What I would like to do is to recover dom of page 4-5 seconds after its loading to be able to recover the link
I looked on the internet, and much advice to use PhantomJS for this manipulation, but I can not get it to work after many attempts with node.
This is my code :
#!/usr/bin/env node
require('babel-register');
import request from 'request'
import cheerio from 'cheerio'
import phantom from 'node-phantom'
phantom.create(function(err,ph) {
return ph.createPage(function(err,page) {
return page.open(url, function(err,status) {
console.log("opened site? ", status);
page.includeJs('http://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js', function(err) {
//jQuery Loaded.
//Wait for a bit for AJAX content to load on the page. Here, we are waiting 5 seconds.
setTimeout(function() {
return page.evaluate(function() {
var tt = cheerio.load($this.html())
console.log(tt)
}, function(err,result) {
console.log(result);
ph.exit();
});
}, 5000);
});
});
});
});
but i get this error :
return ph.createPage(function (page) {
^
TypeError: ph.createPage is not a function
Is what I am about to do is the best way to do what I want to do? If not what is the simplest way? If so, where does my error come from?
If You dont have to use phantomjs You can use nightmare to do it.
It is pretty neat library to solve problems like yours, it uses electron as web browser and You can run it with or without showing window (You can also open developer tools like in Google Chrome)
It has only one flaw if You want to run it on server without graphical interface that You must install at least framebuffer.
Nightmare has method like wait(cssSelector) that will wait until some element appears on website.
Your code would be something like:
const Nightmare = require('nightmare');
const nightmare = Nightmare({
show: true, // will show browser window
openDevTools: true // will open dev tools in browser window
});
const url = 'http://hakier.pl';
const selector = '#someElementSelectorWitchWillAppearAfterSomeDelay';
nightmare
.goto(url)
.wait(selector)
.evaluate(selector => {
return {
nextPage: document.querySelector(selector).getAttribute('href')
};
}, selector)
.then(extracted => {
console.log(extracted.nextPage); //Your extracted data from evaluate
});
//this variable will be injected into evaluate callback
//it is required to inject required variables like this,
// because You have different - browser scope inside this
// callback and You will not has access to node.js variables not injected
Happy hacking!

PhanthomJS - How do you capture continuous requests?

If you go to CNBC.COM and open your debugging tools in "Chrome" or whichever browser you prefer, you can see the network trace. Once the page loads, the quote data for the stocks continuously update.
You'll see something similar like this repeating every few seconds.
http://quote.cnbc.com/quote-html-webservice/quote.htm?partnerId=2&requestMethod=quick&exthrs=1&noform=1&fund=1&output=jsonp&symbols=.SPX|.IXIC|.RUT|.VIX|.GDAXI|.FTSE|.FCHI|.FTMIB|.STOXX|.N225|.SSEC|.HSI|.AXJO|.KS11|%40CL.1|%40LCO.1|%40NG.1|%40RB.1|%40HO.1|%40SI.1|%40GC.1|%40HG.1|%40PL.1|%40PA.1|US10Y|DE10Y-DE|JP10Y-JP|UK10Y-GB|FR10Y-FR|EUR%3D&callback=quoteHandler1
When I use the following code:
var page = require('webpage').create();
page.onResourceRequested = function(request) {
console.log('Request ' + JSON.stringify(request, undefined, 4));
};
page.onResourceReceived = function(response) {
console.log('Receive ' + JSON.stringify(response, undefined, 4));
};
page.open(url)
I see all the initial requested resources for the page. But, not the continuous quote data. I tried setting timeouts setTimeout() for like 20 seconds, to give it additional time to capture the additional requests, but it doesn't.
I just need like 20 seconds worth of additional capture of all the requests to get a few additional qoute.cnbc.com in the trace.
I just started using PhantomJS and I tried just about everything I seen on the web and stackoverflow and nothing helped.

Loading external file via $.get often not returning response, even that response code is 200

I have a website running on Azure. In there, I'm playing with Knockout.JS components and custom loaders. I've used a very simple example from KO documentation found here:
(A component loader that loads external files using custom code)
http://knockoutjs.com/documentation/component-loaders.html#custom-component-loader
// loader helper
var templateFromUrlLoader = {
loadTemplate: function(name, templateConfig, callback) {
if (templateConfig.url) {
var today = new Date();
var fullUrl = templateConfig.url + "?v=" + today.getTime();
$.get(fullUrl, function(markupString) {
ko.components.defaultLoader.loadTemplate(name, markupString, callback);
});
} else {
callback(null);
}
}
};
// component is registered
ko.components.register('postcode-lookup', {
viewModel: function() {
// component js
},
template: {
url: 'https://www.purplebricks.com/content/lib/component-postcode-lookup/dist/component/templates/postcode-lookup.html'
}
});
ko.components.loaders.unshift(templateFromUrlLoader);
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.10.2/jquery.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/knockout/3.4.0/knockout-min.js"></script>
Now, locally, everything works perfectly, each time a page gets refreshed, the component template is fetched:
and the preview of the response is here:
The problem arises when the page is deployed to staging/live production. It will work for every person just fine the first time, but as soon as you refresh the page, the probability of template returning blank response is very very high = almost 90%. It is clearly cached, the response code becomes 304 not modified, but the response is blank. I've tried adding a query string that adds a timestamp to it - it made almost 0 difference. Now I have a 200 status code, but the time of the request goes into pending and seems to last forever:
It almost feels like Azure is caching this incorrectly, perhaps DNS issues?
This appears to be down to an incorrectly set gzip header, try adding the following to your route config to prevent ASP.NET MVC from processing the static content:
routes.IgnoreRoute("Content/{*pathInfo}");

Categories