Retrieve HTML content of a page several seconds after it's loaded - JavaScript

I'm coding a script in Node.js to automatically retrieve data from an online directory.
Since I had never done this before, I chose JavaScript because it is a language I use every day.
Based on the few tips I could find on Google, I use request with cheerio to easily access the DOM components of the page.
I found and retrieved all the necessary information; the only missing step is to get the link to the next page. The problem is that this link is generated about 4 seconds after the page loads and contains a hash, so this step cannot be avoided.
What I would like to do is retrieve the DOM of the page 4-5 seconds after it loads, so that I can extract the link.
I looked on the internet, and much of the advice recommends PhantomJS for this, but after many attempts I cannot get it to work with Node.
This is my code:
#!/usr/bin/env node
require('babel-register');
import request from 'request'
import cheerio from 'cheerio'
import phantom from 'node-phantom'

phantom.create(function(err, ph) {
  return ph.createPage(function(err, page) {
    return page.open(url, function(err, status) {
      console.log("opened site? ", status);
      page.includeJs('http://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js', function(err) {
        // jQuery loaded.
        // Wait for a bit for AJAX content to load on the page. Here, we are waiting 5 seconds.
        setTimeout(function() {
          return page.evaluate(function() {
            var tt = cheerio.load($this.html())
            console.log(tt)
          }, function(err, result) {
            console.log(result);
            ph.exit();
          });
        }, 5000);
      });
    });
  });
});
but I get this error:
return ph.createPage(function (page) {
^
TypeError: ph.createPage is not a function
Is what I am trying to do the best way to achieve what I want? If not, what is the simplest way? If so, where does my error come from?

If you don't have to use PhantomJS, you can use Nightmare to do it.
It is a pretty neat library for solving problems like yours; it uses Electron as the web browser, and you can run it with or without showing the window (you can also open the developer tools like in Google Chrome).
Its only flaw is that if you want to run it on a server without a graphical interface, you must install at least a framebuffer (e.g. Xvfb).
Nightmare has a wait(cssSelector) method that will wait until the given element appears on the page.
Your code would be something like:
const Nightmare = require('nightmare');
const nightmare = Nightmare({
  show: true,        // will show the browser window
  openDevTools: true // will open dev tools in the browser window
});
const url = 'http://hakier.pl';
const selector = '#someElementSelectorWhichWillAppearAfterSomeDelay';

nightmare
  .goto(url)
  .wait(selector)
  .evaluate(selector => {
    // This callback runs in the browser scope, so it cannot see Node.js
    // variables unless they are injected like `selector` below.
    return {
      nextPage: document.querySelector(selector).getAttribute('href')
    };
  }, selector) // the Node.js `selector` variable is injected into the evaluate callback here
  .end()       // close the Electron instance once we are done
  .then(extracted => {
    console.log(extracted.nextPage); // your extracted data from evaluate
  });
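Once you have that href, you can feed it back into the request + cheerio pipeline you already use for the rest of the directory data. A minimal sketch, where parseDirectoryPage() is a hypothetical stand-in for your existing extraction code:

const request = require('request');
const cheerio = require('cheerio');

function scrapeStaticPage(pageUrl) {
  request(pageUrl, (err, res, body) => {
    if (err) return console.error(err);
    const $ = cheerio.load(body); // same request + cheerio usage as in the question
    parseDirectoryPage($);        // hypothetical: your existing extraction logic
  });
}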
Happy hacking!

Related

Does anyone know how to use navigator.onLine in the main process in Electron?

I know you can use navigator.onLine inside the renderer process because it's rendered inside a browser. But what I'm trying to do is something like this in the main process:
if (navigator.onLine) {
  mainWindow.loadURL("https://google.com")
} else {
  mainWindow.loadFile(path.join(__dirname, 'index.html'));
}
So basically, if the user is offline, just load a local HTML file, and if they're online, take them to a webpage. But, as expected, I keep getting the error that 'navigator is not defined'. Does anyone know how I can somehow use the navigator API in the main process? Thanks!
TL;DR: The easiest thing to do is to just ask Electron. You can do this via the net module from within the Main Process:
const { net } = require ("electron");
const isInternetAvailable = () => net.isOnline ();
// To check:
if (isInternetAvailable ()) { /* do something... */ }
See Electron's documentation on the method; note that this approach doesn't tell you whether your service is reachable over the internet, only that some network connection exists (and possibly not even that, as the documentation mentions network links, which would not involve any HTTP request at all).
However, this is not a reliable measurement, and you might want to increase its hit rate by manually checking whether a certain connection can be made.
In order to check whether an internet connection is available, you'll have to make a connection yourself and see if it fails. This can be done from the Main Process using plain NodeJS:
// HTTP code basically from the NodeJS HTTP tutorial at
// https://nodejs.dev/learn/making-http-requests-with-nodejs/
const https = require('https');

const REMOTE_HOST = "google.com"; // Or your domain
const REMOTE_EP = "/"; // Or your endpoint
const REMOTE_PAGE = "https://" + REMOTE_HOST + REMOTE_EP;

function checkInternetAvailability () {
  return new Promise ((resolve, reject) => {
    const options = {
      hostname: REMOTE_HOST,
      port: 443,
      path: REMOTE_EP,
      method: 'GET',
      // 5000 ms is an arbitrary choice; without an explicit timeout,
      // the 'timeout' event below never fires.
      timeout: 5000,
    };
    // Try to fetch the given page
    const req = https.request (options, res => {
      // Yup, that worked. Tell the depending code.
      resolve (true);
      req.destroy (); // This is no longer needed.
    });
    req.on ('error', error => {
      reject (error);
    });
    req.on ('timeout', () => {
      // No, connection timed out.
      resolve (false);
      req.destroy ();
    });
    req.end ();
  });
}
// ... Your window initialisation code ...
checkInternetAvailability ().then (
  internetAvailable => {
    if (internetAvailable) mainWindow.loadURL (REMOTE_PAGE);
    else mainWindow.loadFile (path.join (__dirname, 'index.html'));
    // Call any code needed to be executed after this here!
  }
).catch (error => {
  console.error ("Oops, couldn't initialise!", error);
  app.quit (1);
});
Please note that this code might not be the most desirable, since it simply "crashes" your app with exit code 1 if there is any error other than a connection timeout.
This, however, makes your startup asynchronous, which means you need to pay attention to the execution chain of your app's startup. Also, startup may be really slow if the timeout is reached; it may be worth consulting NodeJS' http module documentation on how to configure the request timeout.
Also, it makes sense to actually try to retrieve the page you want to load in the BrowserWindow (the constants REMOTE_HOST and REMOTE_EP), because that also gives you an indication of whether your server is up, although it means the page will be fetched twice (in the best case: once when the connection test succeeds and once when Electron loads the page into the window). However, that should not be too big of a problem, since no external assets (images, CSS, JS) will be loaded by the test request.
One last note: this is not a good metric of whether any internet connection is available; it just tells you whether your server answered within the timeout window. It might very well be that every other service works or that the connection is just very slow (i.e., expect false negatives). It should be "good enough" for your use case though.
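For completeness, here is a small sketch (my own combination, not part of the original answer) that pairs the quick net.isOnline() hint with the slower HTTPS probe defined above, so the probe is skipped when Electron already knows there is no network connection at all:

const { net } = require ("electron");

async function isRemoteReachable () {
  if (!net.isOnline ()) return false;          // no network links at all, skip the probe
  try {
    return await checkInternetAvailability (); // the HTTPS probe from above
  } catch (error) {
    return false;                               // DNS/socket errors count as "not reachable"
  }
}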

Unable to load a specific URL with Cypress

I'm unable to load the following URL with Cypress; I'm getting a timeout error. I have set the page load timeout to 2 mins, still the same issue. General URLs, e.g. https://www.google.co.nz/, work fine.
it(‘First Test’, () => {
  cy.visit(‘https://shop.countdown.co.nz/‘)
})
Here's a way; not the best, and it could be improved...
The Countdown site has an aversion to being run in an iframe, but it can be tested in a child window; see the custom command here: Cypress using child window.
Cypress.Commands.add('openWindow', (url, features) => {
  const w = Cypress.config('viewportWidth')
  const h = Cypress.config('viewportHeight')
  if (!features) {
    features = `width=${w}, height=${h}`
  }
  console.log('openWindow %s "%s"', url, features)
  return new Promise(resolve => {
    if (window.top.aut) {
      console.log('window exists already')
      window.top.aut.close()
    }
    // https://developer.mozilla.org/en-US/docs/Web/API/Window/open
    window.top.aut = window.top.open(url, 'aut', features)
    // letting page enough time to load and set "document.domain = localhost"
    // so we can access it
    setTimeout(() => {
      cy.state('document', window.top.aut.document)
      cy.state('window', window.top.aut)
      resolve()
    }, 10000)
  })
})
You can test with it like this:
cy.openWindow('https://shop.countdown.co.nz/').then(() => {
  cy.contains('Recipes').click()
  cy.contains('Saved Recipes', {timeout:10000}) // if this is there, have navigated
})
I bumped the setTimeout() in the custom command to 10 seconds, because this site drags its feet a bit.
Configuration:
// cypress.json
{
  "baseUrl": "https://shop.countdown.co.nz/",
  "chromeWebSecurity": false,
  "defaultCommandTimeout": 20000 // see below for better way
}
Command timeout error
Using Gleb's child window command, there's a timeout error that I can't track the source of.
To avoid it I set "defaultCommandTimeout": 20000 in config, but since it's only needed for the openWindow call it's better to remove the global setting and use this instead
cy.then({timeout:20000}, () => {
  cy.openWindow('https://shop.countdown.co.nz/', {}).then(() => {
    cy.contains('Recipes').click()
    cy.contains('Saved Recipes', {timeout:10000}) // if this is there, have navigated
  })
})
To check that the long command timeout only applies once, break one of the inner test commands and verify that it times out in the standard 4000 ms.
cy.then({timeout:20000}, () => {
  cy.openWindow('https://shop.countdown.co.nz/', {}).then(() => {
    cy.contains('Will not find this').click() // Timed out retrying after 4000ms
The quotes are wrong. Try the below code:
it('First Test', ()=>{ cy.visit('https://shop.countdown.co.nz/') })
On trying to visit the URL I am getting the error:
cy.visit() failed trying to load:
https://shop.countdown.co.nz/
We attempted to make an http request to this URL but the request
failed without a response.
We received this error at the network level:
Error: ESOCKETTIMEDOUT
Common situations why this would fail:
you don't have internet access
you forgot to run / boot your web server
your web server isn't accessible
you have weird network configuration settings on your computer
Let's look into the common situations where this might happen:
you don't have internet access: I have internet access, so this can be ruled out.
you forgot to run / boot your web server: your site is accessible from a normal browser, so this can be ruled out as well.
your web server isn't accessible: this is a possibility; maybe there are firewall settings on the server end because of which Cypress is not getting any response when accessing the site.
you have weird network configuration settings on your computer - This can be ruled out as well.
I had a similar issue; what I observed in my case was that the URL was not getting added to the iframe src property, and hence cy.visit() was timing out each time.
So, I added the URL manually to the src property of the iframe.
Here's my custom command for reference:
Cypress.Commands.add('goto', url => {
  return new Promise(res => {
    setTimeout(() => {
      const frame = window.top.document.getElementsByClassName('aut-iframe')[0];
      frame.src = url;
      var evt = window.top.document.createEvent('Event');
      evt.initEvent('load', false, false);
      window.dispatchEvent(evt);
      res();
    }, 300);
  });
});
Now use cy.goto('https://yoururl.com') and you are good to go.

Can you disable a service worker before the page loads?

My team and I have a project that was originally built as a PWA, but have since decided to scrap that idea as we realized it would need to change much more frequently than originally intended. However, the service worker is already live, as well as a newly redesigned landing page for the website. Despite all our efforts to clear the PWA caching, our clients are still reporting that they are receiving the old cached version of the website.
Currently, we have the service worker set up to delete all caches upon install (and whenever anything at all happens as a precaution), as well as some JavaScript to unregister the service worker when the new page actually loads. However, the problem is that none of this runs until the user makes a request to the website, and at that point the browser is already loading the cached content. Is it possible to clear this cache and prevent the browser from loading any content that was already cached?
Current service-worker.js
// Caching
var cacheCore = 'mkeSculptCore-0330121058';
var cacheAssets = 'mkeSculptAssets-0330121058';
self.addEventListener('install', function (event) {
  self.skipWaiting();
  caches.keys().then(function (names) {
    for (let name of names)
      caches.delete(name);
  });
});

self.addEventListener('activate', function (event) {
  caches.keys().then(function (names) {
    for (let name of names)
      caches.delete(name);
  });
});

self.addEventListener('fetch', function (event) {
  caches.keys().then(function (names) {
    for (let name of names)
      caches.delete(name);
  });
});
Script in index.html
(function () {
  if ('serviceWorker' in navigator) {
    navigator.serviceWorker.getRegistrations().then(function (registrations) {
      // returns installed service workers
      if (registrations.length) {
        for (let registration of registrations) {
          registration.unregister();
        }
      }
    });
  }
})();
So far, I've read a few other similar StackOverflow answers, including this one, but they tend to rely on users manually doing something to fetch the new content, i.e. via a hard reload or by disabling the service worker manually through the browser settings. However, in my case, we cannot rely on manual user actions.
One way to solve this issue is to add a timestamp to the end of the file name (js, css), so that each time a request is made, the cache key is not found in the service worker's cache and a fresh version of the file is fetched on each load.
<script type="text/javascript" src="/js/scipt1.js?t=05042018121212"/>
For appending a new timestamp dynamically to the file name, please check this answer.
But this may not be reliable if the HTML itself is cached.
Add this beforehand to "update" all contents:
$.each(['index.html','file1.js','file2.js','file3.js'], function(index, file) {
  $.get(file + '?t=' + new Date().getTime(), function(){});
});
location.reload(true);
For the service worker to stop, all windows using it must be closed.
If it's a web app, you can use window.close();
This code just loads a fresh version of the files in the list.
If there are any internal caches, they will all be updated.
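A related approach worth mentioning (a sketch of the commonly used "self-destroying service worker" pattern, not taken from the answers above): deploy a replacement service-worker.js at the same URL that cleans up and unregisters itself as soon as it activates, so clients drop the worker on their next request without any manual action:

// service-worker.js (replacement "kill switch" worker) — sketch only
self.addEventListener('install', function (event) {
  self.skipWaiting(); // activate immediately instead of waiting
});

self.addEventListener('activate', function (event) {
  event.waitUntil(
    caches.keys()
      .then(function (names) {
        // actually wait for every cache to be deleted
        return Promise.all(names.map(function (name) {
          return caches.delete(name);
        }));
      })
      .then(function () {
        // remove the registration so future loads bypass the worker entirely
        return self.registration.unregister();
      })
      .then(function () {
        return self.clients.matchAll({ type: 'window' });
      })
      .then(function (clients) {
        // reload open tabs so they stop using cached responses
        clients.forEach(function (client) {
          client.navigate(client.url);
        });
      })
  );
});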

Nightmare.js does not work with Azure webjob

I am trying to run an Azure WebJob which takes a JSON object and renders a webpage, then prints it to PDF via the Electron browser in Nightmare.js.
When I run this locally it works perfectly, but when I run it in an Azure WebJob it never completes.
I get the two console.log statements output to the log, but since I cannot output anything from the Nightmare.js calls, nor display the Electron browser window, I have no idea what is going wrong.
There is also a web server in the script, omitted here, as it seems to take the request with the JSON object and pass it to createPage just fine.
I have verified that the index.html file is in the right directory. Does anyone know what might be wrong?
var Nightmare = require('nightmare'),
    http = require('http');

function createPage(o, final) {
  var start = new Date().getTime();
  var page = Nightmare({
    //show: true, //uncomment to show electron browser window
    //openDevTools: { mode: 'detach'}, //uncomment to open developer console ('show: true' needs to be set)
    gotoTimeout: 300000,     // set timeout for .goto() to 5 minutes
    waitTimeout: 300000,     // set timeout for .wait() to 5 minutes
    executionTimeout: 600000 // set timeout for .evaluate() to 10 minutes
  })
  .goto('file:\\\\' + __dirname + '\\index.html');

  page.wait("#ext-quicktips-tip") // wait till HTML is loaded
    .wait(function () {           // wait till JS is loaded
      console.log('Extjs loaded.');
      return !!(Ext.isReady && window.App && App.app);
    });

  console.log("CreatePage()1");

  page.evaluate(function (template, form, lists, printOptions) {
    App.pdf.Builder.create({
      template: template,
      form: form,
      lists: lists,
      format: o.printOptions.format,
    });
    console.log('Create done');
  }, template, form, o.lists, printOptions);

  console.log("CreatePage()2");

  page.wait(function () {
    console.log('Content created. ' + App.pdf.Builder.ready);
    return App.pdf.Builder.ready;
  })
  .pdf(o.outputDir + form.filename, { "pageSize": "A4", "marginsType": 1 })
  .end()
  .then(function () {
    console.log('Pdf printed, time: ' + (new Date().getTime() - start) / 1000 + ' seconds');
    final(true);
  })
  .catch(function (err) {
    console.log('Print Error: ' + err.message);
  });
}
Solved
As Rick states in his answer, this will not currently work!
This document lists the current state of webjobs sandbox:
https://github.com/projectkudu/kudu/wiki/Azure-Web-App-sandbox
It has the following paragraph relating to my issue:
PDF generation from HTML
There are multiple libraries used to convert HTML to PDF. Many Windows/.NET specific versions leverage IE APIs and therefore leverage User32/GDI32 extensively. These APIs are largely blocked in the sandbox (regardless of plan) and therefore these frameworks do not work in the sandbox.
There are some frameworks that do not leverage User32/GDI32 extensively (wkhtmltopdf, for example) and we are working on enabling these in Basic+ the same way we enabled SQL Reporting.
I guess for nightmare.js to work you need desktop interaction, which you're not getting on a WebJob.
Taken from this issue on Github:
Nightmare isn't truly headless: it requires an Electron instance to
work, which in turn requires a framebuffer to render properly (at
least, for now).
This will not fly on an Azure WebJob.

How to use PhantomJS in a Node.js environment for dynamic-page web scraping?

I am working on web scraping, with a few tasks to complete.
I have used the Node.js request module for page scraping.
It works fine and is great for cookies, sessions and all that.
But it fails when it comes to rendering dynamic pages built with a JavaScript framework like Angular or Backbone.
I am trying PhantomJS to overcome this, as I found on Google that it is helpful in such cases.
I also found a Node.js bridge for PhantomJS: phantom.
With PhantomJS and this bridge module I am able to achieve the same thing, nothing more.
var phantom = require('phantom');
var fs = require('fs');

var sitepage = null;
var phInstance = null;

phantom.create()
  .then(instance => {
    phInstance = instance;
    console.log("Instance created");
    return instance.createPage();
  })
  .then(page => {
    sitepage = page;
    console.log("creating page");
    return page.open('https://paytm.com/shop/p/carrier-estrella-plus-1-5-ton-3-star-window-ac-LARCARRIER-ESTRPLAN5550519593A34?src=grid&tracker=%7C%7C%7C%7C%2Fg%2Felectronics%2Flarge-appliances%2F1-5-ton-3-star-ac-starting-at-rs-22699%7C88040%7C1');
  })
  .then(status => {
    //console.log(status);
    console.log("getting content of page");
    return sitepage.property('content');
  })
  .then(content => {
    console.log("success");
    //console.log(content);
    fs.writeFile("ok.text", content);
    sitepage.close();
    phInstance.exit();
  })
  .catch(error => {
    console.log("errr");
    //console.log(error);
    phInstance.exit();
  });
Above is the code I am trying, in order to load a dynamic website page built with the Angular framework.
Can anybody guide me, or correct the above code where I am missing something?
You're getting the content of the page before the dynamic code has run; you need to wait for the load to complete.
The block after page.open needs to wait for the page to finish rendering; if there is an element you know is fetched from the back end, you can lie in wait for that element (see the waitfor example in the PhantomJS docs).
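As a rough sketch of that idea with the same phantom bridge (the URL and the selector '#some-dynamic-element' are placeholders; use an element your Angular app renders once its data has arrived):

var phantom = require('phantom');

// Poll the page until the selector exists or the timeout is reached.
function waitForSelector(page, selector, timeoutMs) {
  var deadline = Date.now() + timeoutMs;
  function poll() {
    return page.evaluate(function (sel) {
      return document.querySelector(sel) !== null;
    }, selector).then(function (found) {
      if (found) return true;
      if (Date.now() > deadline) throw new Error('Timed out waiting for ' + selector);
      return new Promise(function (resolve) { setTimeout(resolve, 250); }).then(poll);
    });
  }
  return poll();
}

phantom.create()
  .then(function (instance) {
    return instance.createPage()
      .then(function (page) {
        return page.open('https://example.com/angular-page') // placeholder URL
          .then(function () { return waitForSelector(page, '#some-dynamic-element', 30000); })
          .then(function () { return page.property('content'); })
          .then(function (content) {
            console.log('Rendered HTML length: ' + content.length);
          });
      })
      .then(function () { return instance.exit(); });
  })
  .catch(function (error) {
    console.error(error);
  });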
