I am trying to get Amazon pricing information with Node.js.
Here's the target url:
http://aws.amazon.com/ec2/pricing/
But the pricing tables I read in Node.js are never fully rendered; the response contains only JavaScript.
So far I have tried jsdom, jquerygo and phantom, but without success. Even setting timeouts does not help. Can anyone provide a working solution for this specific case?
Thanks and best regards.
There are different ways to scrape a web page using Node.js. I was inspired by SpookyJS:
var Spooky = require('spooky');

var spooky = new Spooky({
    child: {
        transport: 'http'
    },
    casper: {
        logLevel: 'debug',
        verbose: true
    }
}, function (err) {
    if (err) {
        var e = new Error('Failed to initialize SpookyJS');
        e.details = err;
        throw e;
    }

    spooky.start('http://en.wikipedia.org/wiki/Spooky_the_Tuff_Little_Ghost');

    spooky.then(function () {
        this.emit('hello', 'Hello, from ' + this.evaluate(function () {
            return document.title;
        }));
    });

    spooky.run();
});

spooky.on('error', function (e, stack) {
    console.error(e);
    if (stack) {
        console.log(stack);
    }
});

spooky.on('console', function (line) {
    console.log(line);
});

spooky.on('hello', function (greeting) {
    console.log(greeting);
});

spooky.on('log', function (log) {
    if (log.space === 'remote') {
        console.log(log.message.replace(/ \- .*/, ''));
    }
});
Note: SpookyJS gives you the flexibility to run CasperJS and PhantomJS from Node.js.
This solved my issue:
I noticed that when installing the phantom module in Node, it complained about the installed PhantomJS version (version 2) and downloaded version 1.9.8 into a temporary location.
So I installed version 1.9.8 instead and pointed the PATH variable at it. And it worked!
Also note that inside the page.open(...) callback you must set a timeout for quite a long time (in my case about 35 seconds) so that the whole page is fully loaded and rendered before you read it.
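For reference, a minimal sketch of that setup, assuming the callback-style API of the phantom module v1.x (the URL and the 35-second delay are from my case; tune them for yours):
var phantom = require('phantom');

phantom.create(function (ph) {
    ph.createPage(function (page) {
        page.open('http://aws.amazon.com/ec2/pricing/', function (status) {
            // give the page's JavaScript plenty of time to render the pricing tables
            setTimeout(function () {
                page.get('content', function (content) {
                    console.log(content); // the fully rendered HTML
                    ph.exit();
                });
            }, 35000);
        });
    });
});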
I am making a Node.js application that can resize images. I am able to do this successfully with jimp. However, the resize is synchronous and freezes the UI while it runs. I want to avoid this, so I tried using a web worker.
// main.js
var worker = new Worker(__dirname + '\\worker.js');

worker.addEventListener('message', function(e) {
    if (e.data == 'done') { worker.terminate() } // Done
}, false);

worker.postMessage({'buf': buf, 'filename': filename}); // Start the worker ('buf' matches e.data.buf in worker.js)
// worker.js
const Jimp = require('jimp'); // Oops, this doesn't work

self.addEventListener('message', function(e) {
    resize(e.data.buf, e.data.filename);
}, false);

function resize(buf, filename) {
    Jimp.read(buf).then(image => {
        image.resize(1920, Jimp.AUTO);
        image.writeAsync(filename).then(cb => { self.postMessage('done') });
    });
}
What I found is that I cannot use Node.js functions like require() in a web worker. How can I use Jimp in a web worker, or resize an image in a different asynchronous way?
Edit: I am trying to use webworkify, following the answer from #RubyJunk. When I try to create the worker I get the error Uncaught TypeError: Cannot convert undefined or null to object at Function.keys (<anonymous>). Does anyone know how to fix that?
Edit 2: I am using Electron to create my Node.js app. It has a window property (nodeIntegrationInWorker) that makes it seem like I can run Node.js functions in a Web Worker, but when I try to use require() it still tells me it is not a function.
Try using webworkify, and create your worker like this instead:
var work = require('webworkify');
var worker = work(require('./worker.js'));
and put your worker method into module.exports, like so:
const Jimp = require('jimp');

module.exports = function(self) {
    self.addEventListener('message', function(e) {
        resize(e.data.buf, e.data.filename);
    }, false);

    function resize(buf, filename) {
        Jimp.read(buf).then(image => {
            image.resize(1920, Jimp.AUTO);
            image.writeAsync(filename).then(cb => { self.postMessage('done') });
        });
    }
}
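For completeness, a sketch of what the main.js side would then look like (reusing the buf and filename variables from the question):
var work = require('webworkify');

var worker = work(require('./worker.js'));
worker.addEventListener('message', function (e) {
    if (e.data === 'done') { worker.terminate(); } // resize finished
}, false);
worker.postMessage({ buf: buf, filename: filename }); // start the resize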
I am running Node.js on Raspbian and trying to save/update a file every 2-3 seconds using the following code:
var saveFileSaving = false;

function loop() {
    mainLoop = setTimeout(function() {
        // update data
        saveSaveFile(data, function() {
            //console.log("Saved data to file");
            loop();
        });
    }, 1500);
}

function saveSaveFile(data, callback) {
    if (!saveFileSaving) {
        saveFileSaving = true;
        var wstream = fs.createWriteStream(path.join(__dirname, 'save.json'));
        wstream.on('finish', function () {
            saveFileSaving = false;
            callback(data);
        });
        wstream.on('error', function (error) {
            console.log(error);
            saveFileSaving = false;
            wstream.end();
            callback(null);
        });
        wstream.write(JSON.stringify(data));
        wstream.end();
    } else {
        callback(null);
    }
}
When I run this it works fine for an hour then starts spitting out:
[25/May/2016 11:3:4 am] { [Error: EROFS, open '<path to file>']
errno: 56,
code: 'EROFS',
path: '<path to file>' }
I have tried the jsonfile plugin, which throws a similar write error after an hour.
I have tried both fileSystem.writeFile and fileSystem.writeFileSync; both give the same error after an hour.
I was thinking it had to do with the handle not being released before a new save occurs, which is why I started using the saveFileSaving flag.
Resetting the system via a hard reset fixes the issue (a soft reset does not work, as the system seems to be locked up).
Any suggestions? I have searched the web and only found one other, slightly similar question from four years ago, which was left in limbo.
Note: I am using the callback function from the code to continue with the main loop.
I was able to get this working by unlinking the file and recreating it on every save. While it is not pretty, it works and shouldn't cause too much overhead.
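A minimal sketch of that unlink-then-write approach, assuming the same save.json path as in the question (an ENOENT from unlink just means the file does not exist yet):
var fs = require('fs');
var path = require('path');

function saveSaveFile(data, callback) {
    var file = path.join(__dirname, 'save.json');
    // remove the old file first, then write a fresh one
    fs.unlink(file, function (unlinkErr) {
        if (unlinkErr && unlinkErr.code !== 'ENOENT') {
            console.log(unlinkErr);
            return callback(null);
        }
        fs.writeFile(file, JSON.stringify(data), function (writeErr) {
            if (writeErr) {
                console.log(writeErr);
                return callback(null);
            }
            callback(data);
        });
    });
}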
I also added a backup solution which saves a backup every 5 minutes in case the save file has issues.
Thank you for everyone's help.
Here are my ideas:
1) Check free space when this problem happens by typing in a terminal:
df -h
2) Also check whether the file is still editable when the problem occurs, e.g. with nano or vim.
3) Your code is too complicated for simply scheduling data manipulation and writing it to a file. Whenever the file is busy (saveFileSaving is true) you lose that iteration's data until the next one. Try this code instead:
var async = require('async'),
    fs = require('fs'),
    path = require('path');

async.forever(function(next) {
    // some data manipulation
    try {
        fs.writeFileSync(path.join(__dirname, 'save.json'), JSON.stringify(data));
    }
    catch (ex) {
        console.error('Error writing data to file:', ex);
    }
    setTimeout(next, 2000);
});
4) How about keeping the file descriptor open?
var async = require('async'),
    fs = require('fs'),
    path = require('path');

var file = fs.createWriteStream(path.join(__dirname, 'save.json'));

async.forever(function(next) {
    // some data manipulation
    file.write(JSON.stringify(data));
    setTimeout(next, 2000);
});

var handleSignal = function (exc) {
    // close file
    file.end();
    if (exc) {
        console.log('STOPPING PROCESS BECAUSE OF:', exc);
    }
    process.exit(-1);
}

process.on('uncaughtException', handleSignal);
process.on('SIGHUP', handleSignal);
5) Hardware or software problems (maybe OS drivers) with the Raspberry Pi's storage controller. Note that EROFS means 'read-only file system': the kernel has most likely remounted the SD card read-only after an I/O error, which would also explain why only a hard reset recovers the system.
I have used PouchDB in one application and now I want to introduce CouchDB to sync the documents to a remote server. So I followed http://pouchdb.com/getting-started.html and used the code below to replicate the data to CouchDB:
var db2 = new PouchDB('todos');
var remoteCouch = 'http://localhost:5984/_utils/database.html?couchdb_sample';

db2.changes({
    since: 'now',
    live: true
}).on('change', showTodos);

sync();

function sync() {
    //alert("sync");
    //syncDom.setAttribute('data-sync-state', 'syncing');
    //var opts = {live: true};
    db2.replicate.to(remoteCouch).on('complete', function () {
        console.log("done");
    }).on('error', function (err) {
        console.log(err);
    });
}

function addTodo(text) {
    var todo = {
        _id: $("#eid").val() + $("#version").val(),
        title: text,
        name: $("#nameid").val(),
        version: $("#version").val(),
        completed: false
    };
    db2.put(todo, function callback(err, result) {
        if (!err) {
            console.log('Successfully posted a todo!');
        } else {
            console.log(err);
        }
    });
}
Here the title has an XML string as its value. But I am facing the error below:
SyntaxError: Unexpected token <
at Object.parse (native)
for the line db2.replicate.to(remoteCouch). I manually created a new document in the CouchDB database and entered the same data; it gave no error, but when I try replicating it shows the syntax error. Can anyone please hint at where I have gone wrong?
http://localhost:5984/_utils/database.html?couchdb_sample
points to an HTML page in CouchDB's admin UI (copied over from the browser's address bar, right?). That is also where the SyntaxError: Unexpected token < comes from: PouchDB expects JSON from the remote URL but receives an HTML document, which starts with <. Remove the middle part:
http://localhost:5984/couchdb_sample
It looks like you have not defined the remote database in the way PouchDB is expecting. You should use the "new PouchDB" call. The second line of your code is:
var remoteCouch = 'http://localhost:5984/_utils/database.html?couchdb_sample';
but I think it should be like this:
var remoteCouch = new PouchDB('http://localhost:5984/couchdb_sample');
I am not clear from your code what the name of the remote database is, but it would not normally end in ".html", as Ingo Radatz pointed out, so I have assumed it is couchdb_sample above. There is more information about replication on the PouchDB site.
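Putting both corrections together, a minimal sketch (still assuming the remote database is actually named couchdb_sample):
var db2 = new PouchDB('todos');
var remoteCouch = new PouchDB('http://localhost:5984/couchdb_sample');

db2.replicate.to(remoteCouch).on('complete', function () {
    console.log("done");
}).on('error', function (err) {
    console.log(err);
});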
I have the following code in server/statusboard.js:
var require = __meteor_bootstrap__.require,
    request = require("request");

function getServices(services) {
    services = [];
    request('http://some-server/vshell/index.php?type=services&mode=json', function (error, response, body) {
        var resJSON = JSON.parse(body);
        _.each(resJSON, function(data) {
            var host = data["host_name"];
            var service = data["service_description"];
            var hardState = data["last_hard_state"];
            var currState = data["current_state"];
            services.push({host: host, service: service, hardState: hardState, currState: currState});
            Services.insert({host: host, service: service, hardState: hardState, currState: currState});
        });
    });
}

Meteor.startup(function () {
    var services = [];
    getServices(services);
    console.log(services);
});
Basically, it's pulling some data from a JSON feed and trying to push it into a collection.
When I start up Meteor I get the following exception:
app/packages/livedata/livedata_server.js:781
throw exception;
^
Error: Meteor code must always run within a Fiber
at [object Object].withValue (app/packages/meteor/dynamics_nodejs.js:22:15)
at [object Object].apply (app/packages/livedata/livedata_server.js:767:45)
at [object Object].insert (app/packages/mongo-livedata/collection.js:199:21)
at app/server/statusboard.js:15:16
at Array.forEach (native)
at Function.<anonymous> (app/packages/underscore/underscore.js:76:11)
at Request._callback (app/server/statusboard.js:9:7)
at Request.callback (/usr/local/meteor/lib/node_modules/request/main.js:108:22)
at Request.<anonymous> (/usr/local/meteor/lib/node_modules/request/main.js:468:18)
at Request.emit (events.js:67:17)
Exited with code: 1
I'm not too sure what that error means. Does anyone have any ideas, or can suggest a different approach?
Just wrapping your function in a Fiber might not be enough and can lead to unexpected behavior. The reason is that, along with the Fiber itself, Meteor requires a set of variables attached to the fiber. Meteor uses the data attached to a fiber as a dynamic scope, and the easiest way to use it with a third-party API is Meteor.bindEnvironment.
T.post('someurl', Meteor.bindEnvironment(function (err, res) {
    // do stuff
    // can access Meteor.userId
    // still have MongoDB write fence
}, function () { console.log('Failed to bind environment'); }));
Watch these videos on EventedMind if you want to know more:
https://www.eventedmind.com/posts/meteor-dynamic-scoping-with-environment-variables
https://www.eventedmind.com/posts/meteor-what-is-meteor-bindenvironment
As mentioned above, this is because you're executing code within a callback.
Any code you're running on the server side needs to be contained within a Fiber.
Try changing your getServices function to look like this:
var Fiber = __meteor_bootstrap__.require('fibers'); // same bootstrap require as in the question

function getServices(services) {
    Fiber(function() {
        services = [];
        request('http://some-server/vshell/index.php?type=services&mode=json', function (error, response, body) {
            var resJSON = JSON.parse(body);
            _.each(resJSON, function(data) {
                var host = data["host_name"];
                var service = data["service_description"];
                var hardState = data["last_hard_state"];
                var currState = data["current_state"];
                services.push({host: host, service: service, hardState: hardState, currState: currState});
                Services.insert({host: host, service: service, hardState: hardState, currState: currState});
            });
        });
    }).run();
}
I just ran into a similar problem and this worked for me. I should say, though, that I am very new to this and do not know whether this is how it should be done.
You could probably get away with only wrapping your insert statement in the Fiber, but I am not positive.
Based on my tests, you do have to wrap the insert itself; the code I tested is similar to the above example.
For example, I did this and it still failed with the Fibers error:
function insertPost(args) {
    if (args) {
        Fiber(function() {
            post_text = args.text.slice(0, 140);
            T.post('statuses/update', { status: post_text },
                function(err, reply) {
                    if (reply) {
                        // TODO remove console output
                        console.log('reply: ' + JSON.stringify(reply, 0, 4));
                        console.log('incoming twitter string: ' + reply.id_str);
                        // TODO insert record
                        var ts = Date.now();
                        id = Posts.insert({
                            post: post_text,
                            twitter_id_str: reply.id_str,
                            created: ts
                        });
                    } else {
                        console.log('error: ' + JSON.stringify(err, 0, 4));
                        // TODO maybe store locally even though it failed on twitter
                        // and run service in background to push them later?
                    }
                }
            );
        }).run();
    }
}
I did this and it ran fine with no errors.
function insertPost(args) {
    if (args) {
        post_text = args.text.slice(0, 140);
        T.post('statuses/update', { status: post_text },
            function(err, reply) {
                if (reply) {
                    // TODO remove console output
                    console.log('reply: ' + JSON.stringify(reply, 0, 4));
                    console.log('incoming twitter string: ' + reply.id_str);
                    // TODO insert record
                    var ts = Date.now();
                    Fiber(function() {
                        id = Posts.insert({
                            post: post_text,
                            twitter_id_str: reply.id_str,
                            created: ts
                        });
                    }).run();
                } else {
                    console.log('error: ' + JSON.stringify(err, 0, 4));
                    // TODO maybe store locally even though it failed on twitter
                    // and run service in background to push them later?
                }
            }
        );
    }
}
I thought this might help others encountering this issue. I have not yet tested calling the async type of external service after internal code and wrapping that in a Fiber; that might be worth testing as well. In my case I needed to know the remote action had happened before doing my local action.
Hope this contributes to this question thread.
I'm looking for an example of requesting a webpage, waiting for the JavaScript to render (JavaScript modifies the DOM), and then grabbing the HTML of the page.
This should be a simple example with an obvious use case for PhantomJS. I can't find a decent example; the documentation seems to be all about command-line use.
From your comments, I'd guess you have two options:
Try to find a phantomjs node module - https://github.com/amir20/phantomjs-node
Run phantomjs as a child process inside node - http://nodejs.org/api/child_process.html
Edit:
It seems the child process route is suggested by PhantomJS as a way of interacting with Node; see the FAQ - http://code.google.com/p/phantomjs/wiki/FAQ
Edit:
Example PhantomJS script for getting a page's HTML markup:
var page = require('webpage').create();

page.open('http://www.google.com', function (status) {
    if (status !== 'success') {
        console.log('Unable to access network');
    } else {
        var p = page.evaluate(function () {
            return document.getElementsByTagName('html')[0].innerHTML;
        });
        console.log(p);
    }
    phantom.exit();
});
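To drive that script from Node (option 2 above), here is a minimal sketch using the built-in child_process module; the script filename getHtml.js is hypothetical:
var execFile = require('child_process').execFile;

// run the PhantomJS script above and collect the HTML it prints to stdout
execFile('phantomjs', ['getHtml.js'], { maxBuffer: 10 * 1024 * 1024 }, function (err, stdout, stderr) {
    if (err) {
        return console.error(err);
    }
    console.log(stdout); // the rendered HTML
});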
With v2 of phantomjs-node it's pretty easy to print the HTML after it has been processed.
var phantom = require('phantom');

phantom.create().then(function(ph) {
    ph.createPage().then(function(page) {
        page.open('https://stackoverflow.com/').then(function(status) {
            console.log(status);
            page.property('content').then(function(content) {
                console.log(content);
                page.close();
                ph.exit();
            });
        });
    });
});
This will show the output as it would have been rendered with the browser.
Edit 2019:
You can use async/await:
const phantom = require('phantom');

(async function() {
    const instance = await phantom.create();
    const page = await instance.createPage();

    await page.on('onResourceRequested', function(requestData) {
        console.info('Requesting', requestData.url);
    });

    const status = await page.open('https://stackoverflow.com/');
    const content = await page.property('content');
    console.log(content);

    await instance.exit();
})();
Or if you just want to test, you can use npx:
npx phantom@latest https://stackoverflow.com/
I've used two different ways in the past, including the page.evaluate() method that queries the DOM, which Declan mentioned. The other way is to pass info from the web page by printing it to console.log() there, and in the phantomjs script use:
page.onConsoleMessage = function (msg, line, source) {
    console.log('console [' + source + ':' + line + ']> ' + msg);
}
I might also inspect the variable msg in onConsoleMessage and search for some encapsulated data; it depends on how you want to use the output.
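For example, one hypothetical convention (the PAYLOAD: prefix is mine, not part of PhantomJS): emit structured data from the page context so it is easy to pick out of the console stream:
// inside the PhantomJS script, after the page has loaded
page.evaluate(function () {
    // console.log here fires page.onConsoleMessage in the outer script
    console.log('PAYLOAD:' + JSON.stringify({ title: document.title }));
});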
Then in the Node.js script, you would have to scan the output of the PhantomJS script:
var spawn = require('child_process').spawn; // needed to launch the phantomjs binary

var yourfunc = function(...params...) {
    var phantom = spawn('phantomjs', [...args]);

    phantom.stdout.setEncoding('utf8');
    phantom.stdout.on('data', function(data) {
        //parse or echo data
        var str_phantom_output = data.toString();
        // The above will get triggered one or more times, so you'll need to
        // add code to parse for whatever info you're expecting from the browser
    });
    phantom.stderr.on('data', function(data) {
        // do something with error data
    });
    phantom.on('exit', function(code) {
        if (code !== 0) {
            // console.log('phantomjs exited with code ' + code);
        } else {
            // clean exit: do something else such as a passed-in callback
        }
    });
}
Hope that helps some.
Why not just use this?
var page = require('webpage').create();

page.open("http://example.com", function (status) {
    if (status !== 'success') {
        console.log('FAIL to load the address');
    } else {
        console.log('Success in fetching the page');
        console.log(page.content);
    }
    phantom.exit();
});
Late update, in case anyone stumbles on this question:
A project on GitHub developed by a colleague of mine aims at helping you do exactly that: https://github.com/vmeurisse/phantomCrawl.
It's still a bit young and certainly missing some documentation, but the example provided should help with basic crawling.
Here's an old version that I use running Node, Express and PhantomJS, which saves the page out as a .png. You could tweak it fairly quickly to get the HTML.
https://github.com/wehrhaus/sitescrape.git