I'm using PhantomJS to go through an array of URLs, loading each one and saving it as an HTML file.
So far my code sort of works: it starts loading and saving each page, but after a certain number of pages it suddenly stops loading new pages and just saves the same one over and over again.
I believe that's partly because I'm not calling page.close(), but when I do, the code doesn't work at all.
I'm looking for help, especially an explanation of what's causing the problem; if someone also has a solution, it would be greatly appreciated. I know the code is messy at the moment, but I'm waiting to clean it up until I've found the problem.
var fs = require('fs');
/* This was used to get an array of the URLs I'm trying to load.
function linkfinder(){
    var array = fs.read('C:\\Users\\jacob\\Documents\\SDD\\links.txt').toString().split('\n');
    console.log(array[1]);
    console.log('ffff');
    return array;
}*/
var urls = [
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1476',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1548',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1781',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1506',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1321',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1390',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1430',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1707',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1477',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1431',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1678',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1409',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1239',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1765',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=2203',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1889',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=2240',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1650',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1490',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1514',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1322',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1744'
];
var page = new WebPage();

function handle_page(url){
    page.open(url, function(){
        page.injectJs('jquery.min.js');
        // var html = page.evaluate(function(){
        //     return document.getElementsByTagName('html')[0].innerHTML;
        // });
        // save to file
        page.onLoadFinished = function() {
            console.log("page load finished");
            var path = 'C:\\Users\\jacob\\Documents\\SDD\\schools\\.html';
            var linked = url.substr(63, 4);
            var output = [path.slice(0, 37), linked, path.slice(37)].join('');
            console.log(output);
            //page.render('C:\\Users\\jacob\\Documents\\export.png');
            fs.write(output, page.content, 'w');
        };
        // page.close();
        next_page();
    });
}
function next_page(){
    var url = urls.shift();
    if(!url){
        phantom.exit(0);
        return;
    }
    handle_page(url);
}

next_page();
This will work, but you need to specify the right path (I'm working on Linux; /root/pjs is my path there).
var page = require('webpage').create(), fs = require('fs');
// page.onLoadFinished = function() {} // won't work at all: you get the same content for every page
var urls = [//an array
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1476",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1548",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1781",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1506",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1321",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1390",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1430",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1707",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1477",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1431",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1678",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1409",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1239",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1765",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=2203",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1889",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=2240",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1650",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1490",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1514",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1322",
"http://www.njcaa.org/member_colleges/college-profile?collegeId=1744"
]
var i = 0;

function on_a_page(status){
    i++;
    console.log("page load finished");
    // You need to specify the right path (I'm working on Linux; '/root/pjs' is my path there).
    var output = '/root/pjs/' + page.url.substr(63, 4) + '.html';
    console.log(output);
    fs.write(output, page.content, 'w');
    if(i < urls.length){ to_open(); } else { phantom.exit(); }
}

function to_open(){
    page.open(urls[i], on_a_page);
    console.log(i);
}

to_open();
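For completeness, if you would rather create a fresh page for each URL and call page.close() after saving (the part that wasn't working in the original script), a recursive approach along these lines should also work. This is only a sketch: it reuses the urls array and the url.substr(63, 4) naming trick from above, and it assumes the '/root/pjs/' output directory exists.
var webpage = require('webpage');
var fs = require('fs');

function processNext(list, saveDir){
    if(list.length === 0){
        phantom.exit();
        return;
    }
    var url = list.shift();
    var page = webpage.create();      // fresh page per URL
    page.open(url, function(status){
        if(status === 'success'){
            fs.write(saveDir + url.substr(63, 4) + '.html', page.content, 'w');
        } else {
            console.log('Failed to load ' + url);
        }
        page.close();                 // safe here because this page object is never reused
        processNext(list, saveDir);
    });
}

processNext(urls.slice(), '/root/pjs/');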
Related
I am trying to download multiple files from a OneDrive folder. Below is my code, but it will only download the last file and not all of them.
for(const f in files)
{
    var fileURL = (files[f]["#microsoft.graph.downloadUrl"]);
    var fileName = (JSON.stringify(files[f].name)).slice(1, -1);
    var request = https.get(fileURL, function(response){
        console.log(fileURL);
        if(response.statusCode == 200){
            var file = fs.createWriteStream(`./temp/${userId}/${fileName}`);
            response.pipe(file);
        }
        request.setTimeout(60000, function(){
            request.destroy();
        });
    });
}
i.e. the console log would print
FILE_URL1
FILE_URL1
FILE_URL1
rather than
FILE_URL1
FILE_URL2
FILE_URL3
Note that if the console.log(fileURL) is placed before the var request = https.get(...) line, it prints out the 3 file URLs. I'm not sure if it's a problem with the loop or if there is something else. I am quite new at JavaScript, so I don't know a lot.
Replace the var with const or let and you will see a different result.
Description: What's the difference between using "let" and "var"?
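For context, here is a minimal sketch of the corrected loop (keeping the files, userId and temp path from the question). With var, fileURL and fileName are function-scoped, so by the time the asynchronous https.get callbacks run they all see the values from the last iteration; const (or let) gives each iteration its own binding.
for(const f in files){
    const fileURL = files[f]["#microsoft.graph.downloadUrl"];
    const fileName = JSON.stringify(files[f].name).slice(1, -1);
    const request = https.get(fileURL, function(response){
        console.log(fileURL);   // now logs a different URL for each iteration
        if(response.statusCode == 200){
            const file = fs.createWriteStream(`./temp/${userId}/${fileName}`);
            response.pipe(file);
        }
    });
    request.setTimeout(60000, function(){
        request.destroy();
    });
}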
I am working with Protractor to test an AngularJS application. I have written code to read data from an Excel sheet. My scenario is an end-to-end flow that should execute: the code takes the URL, username and password from the Excel sheet and runs the entire flow, then iterates again with the next values. But it is not going into the loop.
My code is:
var Excel = require('exceljs');
var XLSX = require('xlsx');
var os = require('os');
var TEMP_DIR = os.tmpdir();
var wrkbook = new Excel.Workbook();
//---------------------Duration as Days------------------------------------------
describe('Open the clinicare website by logging into the site', function () {
    it('IP Medication Simple flows for Patient Keerthi for Days,Weeks and Months', function () {
        console.log("hello6");
        browser.driver.manage().window().maximize();
        var wb = XLSX.readFile('E:\\LAM WAH EE_Testing Enviornment\\IP_Medication_Flow\\Patients_Entry.xlsx');
        var ws = wb.Sheets.Sheet1;
        var json = XLSX.utils.sheet_to_json(wb.Sheets.Sheet1);
        console.log("json", json);
        //var json = XLSX.utils.sheet_to_json(wb.Sheets.Sheet1);
        //console.log("json", json);
        for(var a = 0; a < json.length; a++){
            console.log("Test_URL", json[a].Test_URL);
            console.log("User_Name", json[a].User_Name);
            console.log("Password", json[a].Password);
            browser.get(json[a].Test_URL);
            console.log("hello10");
            //Perform Login: UserName
            element(by.model('accessCode')).sendKeys(json[a].User_Name);
            browser.sleep(6000);
            // browser.driver.sleep(6000);
            //Perform Login: Password
            element(by.model('password')).sendKeys(json[a].Password);
            browser.sleep(6000);
            //Hospital Name
            element(by.cssContainingText('option', 'HLWE')).click();
            browser.sleep(6000);
            //Perform Login: LoginButton
            element(by.css('.btn.btn-primary.pull-right')).click();
            browser.sleep(6000);
            //Clicking on Admitted Tab
            element(by.xpath("//span[contains(text(),' Admitted(25)')]")).click();
            browser.sleep(6000);
            // browser.driver.sleep(6000);
            //Clicking on First Admitted Patient
            element(by.cssContainingText('span.clearfloat', '35690')).element(by.xpath('//*[@id="searchPatientImgAdmittedF"]')).click();
            jasmine.DEFAULT_TIMEOUT_INTERVAL = 600000;
            // browser.sleep(600);
            //Clicking anywhere to proceed
            element(by.xpath('/html/body/div[3]/div[1]/div[16]/div[1]/div/table[4]/tbody/tr[2]/td/div/div/div[3]/table/tbody/tr[1]/td[3]')).click();
            jasmine.DEFAULT_TIMEOUT_INTERVAL = 10000;
            browser.sleep(800);
Anyone's help is appreciated. Thanks in advance.
Alright, I was initially confused by the 'exceljs' node module; it is not actually used in your test. I think the major problem here is that the file does not exist.
readFile and ENOENT
First, readFile is an alias for readFileSync, which calls readSync, which (probably) calls read_binary, which offloads to node's fs.readFileSync. More than likely, fs.readFileSync is throwing the ENOENT because the path does not exist.
Looking at your path, you might need a backslash before your spaces.
var wb = XLSX.readFile('E:\\LAM\ WAH\ EE_Testing Enviornment\\IP_Medication_Flow\\Patients_Entry.xlsx');
It could be good practice to resolve the file path with path.resolve prior to calling the read-file method.
var path = require('path');
var patientEntryFilePath = path.resolve('E:\\LAM\ WAH\ EE_Testing Enviornment\\IP_Medication_Flow\\Patients_Entry.xlsx');
console.log(patientEntryFilePath);
var wb = XLSX.readFile(patientEntryFilePath);
Additional comments and thoughts about the original code snippet
Some additional comments about the code snippet from the original question, as considerations for future cleanup.
Think about using a beforeAll or beforeEach for setting your browser driver window size and reading in a file. Reading in the file once is potentially a time and resource saver.
describe('Open the clinicare website by logging into the site', function () {
    var json = null;
    beforeAll(() => {
        browser.driver.manage().window().maximize();
        var wb = XLSX.readFile('E:\\LAM\ WAH\ EE_Testing Enviornment\\IP_Medication_Flow\\Patients_Entry.xlsx');
        var ws = wb.Sheets.Sheet1;
        json = XLSX.utils.sheet_to_json(wb.Sheets.Sheet1);
    });
    it('IP Medication Simple flows for Patient Keerthi for Days,Weeks and Months', function () {
        console.log("json", json);
        ...
Looking at your test, it is a login and appears to have the same flow each time, so you really only need to test it once. The for loop is acceptable, since the json data is already resolved and each line is executed in the control flow that Protractor uses.
Avoid using xpath. It is better to find elements by css, id, or a partial path. If a developer adds an extra div to that list of divs, it will break your test, making it more fragile and requiring more upkeep.
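To illustrate that last point, here is a sketch of how the long absolute xpath from the question might be replaced. The second selector is hypothetical, since the real markup isn't shown; the id in the first comes from the question's own locator.
// Prefer a locator anchored on a stable id or class over an absolute xpath
// such as element(by.xpath('/html/body/div[3]/.../td[3]')).click();
element(by.id('searchPatientImgAdmittedF')).click();       // id taken from the question
element(by.css('.patient-row .proceed-cell')).click();     // hypothetical css selector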
This is because the Protractor API executes asynchronously, while the for loop executes synchronously. You can get a detailed explanation here; it is the same issue as yours.
To fix your issue, we can use a JavaScript closure.
for(var a = 0; a < json.length; a++) {
    (function(a){
        console.log("Test_URL", json[a].Test_URL);
        console.log("User_Name", json[a].User_Name);
        console.log("Password", json[a].Password);
        browser.get(json[a].Test_URL);
        console.log("hello10");
        //Perform Login: UserName
        element(by.model('accessCode')).sendKeys(json[a].User_Name);
        browser.sleep(6000);
        // browser.driver.sleep(6000);
        //Perform Login: Password
        element(by.model('password')).sendKeys(json[a].Password);
        browser.sleep(6000);
        ...
    })(a)
}
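An equivalent, more modern fix (a sketch, not from the original answer) is to declare the loop variable with let, which gives each iteration its own binding, so no wrapping IIFE is needed:
for (let a = 0; a < json.length; a++) {
    browser.get(json[a].Test_URL);
    element(by.model('accessCode')).sendKeys(json[a].User_Name);
    element(by.model('password')).sendKeys(json[a].Password);
    // ...rest of the flow from the question, unchanged
}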
Right now I'm using the following command to run phantomJS
exec('./phantomjs table.js',$op,$er);
table.js
var page = require('webpage').create();
page.open('table.php', function () {
    page.render('table.png');
    phantom.exit();
});
This serves the purpose. But now I'm required to work with a dynamic variable, namely date. So is it possible to pass a PHP or Javascript variable inside the exec command line so that I can use that variable inside table.js?
Update
I tried modifying my code according to a solution posted here Passing a variable to PhantomJS via exec
exec('./phantomjs table.js http://www.yahoo.com',$op,$er);
table.js
var args = require('system').args;
var page = require('webpage').create();
var address = system.args[1];
page.open(address, function () {
    page.render('table.png');
    phantom.exit();
});
But this results in 2 problems:
The whole process takes about 3-4 minutes to finish
After that I get a "Server Not Found" message
If I remove the modified code, everything works as expected.
More Debugging
Inside table.js I used this:
var args = require('system').args;
args.forEach(function(arg, i) {
    console.log(i + '::' + arg);
});

var page = require('webpage').create();
var address = 'http://www.gmail.com';
page.open(address, function () {
    page.render('github.png');
    phantom.exit();
});
On running this, my $op (from the exec command) printed out this:
Array ( [0] => 0::table.js [1] => 1::http://www.yahoo.com )
So far so good. But as soon as I put in the code below, the same problems are encountered:
var args = require('system').args;
var page = require('webpage').create();
var address = system.args[1]; // <--- This line is creating the problem, the culprit
page.open(address, function () {
    page.render('github.png');
    phantom.exit();
});
Seems like that is not the correct syntax. Anything obvious that I'm unable to see?
The problem with your code is a simple oversight.
You have already stored the args using
var args = require('system').args;
So when you need to reference them you only have to do:
var address = args[1];
The use of "system" is looking in a completely different array
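Putting that together, a corrected table.js might look like the sketch below. It only reuses what is already shown in the question (the args variable, page.render to table.png); nothing else is assumed.
// table.js -- sketch of the corrected script, using the args variable that was already defined
var args = require('system').args;
var page = require('webpage').create();
var address = args[1];   // args[0] is the script name, args[1] is the URL passed from exec()

page.open(address, function () {
    page.render('table.png');
    phantom.exit();
});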
I had to do this too, and these answers pointed me toward my final answer. However, as some people mentioned here, my browser was crashing. I found the problem and the solution and thought it was worth sharing.
This will work perfectly fine with:
exec('phantomjs phdemo.js http://google.com', $o, $e);
var page = require('webpage').create();
var system = require('system');
var address = system.args[1];
page.open(address, function () {
    page.render('output.pdf');
    phantom.exit();
});
However, if you want to pass more than one parameter in the URL, for example google.com?searchtext&date=today, I found that the '&' character crashes the call, because it is treated as a separate command.
My solution was to do the same thing, but instead of '&' I used the '#' sign, so the URL looks something like google.com?searchtext#date=today.
Then at the other end I added a string replace:
var address = address.replace(/#/gi,"&");
Then everything works perfectly fine. There may be other ways of doing it, but this worked perfectly for me.
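A different approach, for anyone who would rather not repurpose '#': URL-encode the argument on the calling side (and quote it), then decode it inside the PhantomJS script. This is only a sketch under that assumption; the table.png file name is carried over from the question.
// The caller passes an encoded, quoted URL, e.g.
//   ./phantomjs table.js "http%3A%2F%2Fgoogle.com%3Fsearchtext%26date%3Dtoday"
var args = require('system').args;
var page = require('webpage').create();
var address = decodeURIComponent(args[1]);   // restores the original URL, '&' included

page.open(address, function () {
    page.render('table.png');
    phantom.exit();
});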
Well, I found an alternative to the above problem. Instead of using
var address = system.args[1];
I'm doing it with the modification below:
var args = require('system').args;
var address = '';
args.forEach(function(arg, i) {
    if(i == 1)
    {
        address = arg;
    }
});

var page = require('webpage').create();
page.open(address, function () { // <-- use that address variable from above
    page.render('github.png');
    phantom.exit();
});
I'm looking for an example of requesting a webpage, waiting for the JavaScript to render (JavaScript modifies the DOM), and then grabbing the HTML of the page.
This should be a simple example with an obvious use-case for PhantomJS, but I can't find a decent example; the documentation seems to be all about command-line use.
From your comments, I'd guess you have 2 options
Try to find a phantomjs node module - https://github.com/amir20/phantomjs-node
Run phantomjs as a child process inside node - http://nodejs.org/api/child_process.html
Edit:
It seems the child process approach is suggested by PhantomJS as a way of interacting with node; see the FAQ - http://code.google.com/p/phantomjs/wiki/FAQ
Edit:
Example PhantomJS script for getting the page's HTML markup:
var page = require('webpage').create();
page.open('http://www.google.com', function (status) {
    if (status !== 'success') {
        console.log('Unable to access network');
    } else {
        var p = page.evaluate(function () {
            return document.getElementsByTagName('html')[0].innerHTML;
        });
        console.log(p);
    }
    phantom.exit();
});
With v2 of phantomjs-node it's pretty easy to print the HTML after it has been processed.
var phantom = require('phantom');

phantom.create().then(function(ph) {
    ph.createPage().then(function(page) {
        page.open('https://stackoverflow.com/').then(function(status) {
            console.log(status);
            page.property('content').then(function(content) {
                console.log(content);
                page.close();
                ph.exit();
            });
        });
    });
});
This will show the output as it would have been rendered with the browser.
Edit 2019:
You can use async/await:
const phantom = require('phantom');

(async function() {
    const instance = await phantom.create();
    const page = await instance.createPage();
    await page.on('onResourceRequested', function(requestData) {
        console.info('Requesting', requestData.url);
    });
    const status = await page.open('https://stackoverflow.com/');
    const content = await page.property('content');
    console.log(content);
    await instance.exit();
})();
Or if you just want to test, you can use npx
npx phantom@latest https://stackoverflow.com/
I've used two different ways in the past, including the page.evaluate() method that queries the DOM, which Declan mentioned. The other way I've passed info from the web page is to spit it out via console.log() from there, and in the PhantomJS script use:
page.onConsoleMessage = function (msg, line, source) {
    console.log('console [' + source + ':' + line + ']> ' + msg);
};
I might also trap the msg variable in onConsoleMessage and search it for some encapsulated data; it depends on how you want to use the output.
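For example (just a sketch; the 'DATA:' prefix is made up), the page side can tag its messages and the PhantomJS side can filter for that tag:
// Assign the handler first, then run code in the page that tags its output.
page.onConsoleMessage = function (msg) {
    if (msg.indexOf('DATA:') === 0) {
        var payload = JSON.parse(msg.slice('DATA:'.length));
        console.log('Got payload with title: ' + payload.title);
    }
};

page.evaluate(function () {
    console.log('DATA:' + JSON.stringify({ title: document.title }));
});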
Then in the Node.js script, you would have to scan the output of the PhantomJS script:
var spawn = require('child_process').spawn;

var yourfunc = function(...params...) {
    var phantom = spawn('phantomjs', [...args]);
    phantom.stdout.setEncoding('utf8');
    phantom.stdout.on('data', function(data) {
        // parse or echo data
        var str_phantom_output = data.toString();
        // The above will get triggered one or more times, so you'll need to
        // add code to parse for whatever info you're expecting from the browser
    });
    phantom.stderr.on('data', function(data) {
        // do something with error data
    });
    phantom.on('exit', function(code) {
        if (code !== 0) {
            // console.log('phantomjs exited with code ' + code);
        } else {
            // clean exit: do something else such as a passed-in callback
        }
    });
};
Hope that helps some.
Why not just use this?
var page = require('webpage').create();
page.open("http://example.com", function (status)
{
    if (status !== 'success')
    {
        console.log('FAIL to load the address');
    }
    else
    {
        console.log('Success in fetching the page');
        console.log(page.content);
    }
    phantom.exit();
});
Late update in case anyone stumbles on this question:
A project on GitHub developed by a colleague of mine aims at helping you do exactly that: https://github.com/vmeurisse/phantomCrawl.
It is still a bit young and certainly missing some documentation, but the example provided should help with basic crawling.
Here's an old version that I use, running node, express and phantomjs, which saves the page out as a .png. You could tweak it fairly quickly to get the HTML.
https://github.com/wehrhaus/sitescrape.git
I'm trying to do something like a C #include "filename.c", or PHP include(dirname(__FILE__)."filename.php"), but in JavaScript. I know I can do this if I can get the URL the JS file was loaded from (e.g. the URL given in the src attribute of the script tag). Is there any way for the JavaScript to know that?
Alternatively, is there any good way to load JavaScript dynamically from the same domain (without knowing the domain specifically)? For example, let's say we have two identical servers (QA and production), but they clearly have different URL domains. Is there a way to do something like include("myLib.js"); where myLib.js will load from the domain of the file loading it?
Sorry if that's worded a little confusingly.
Within the script:
var scripts = document.getElementsByTagName("script"),
    src = scripts[scripts.length - 1].src;
This works because the browser loads and executes scripts in order, so while your script is executing, the document it was included in is sure to have your script element as the last one on the page. This code of course must be 'global' to the script, so save src somewhere where you can use it later. Avoid leaking global variables by wrapping it in:
(function() { ... })();
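Put together, the pattern looks roughly like this (a sketch; loadRelative is just an illustrative helper name, and myLib.js is the file from the question):
(function () {
    // Must run at the top level of this file, while it is the last script parsed.
    var scripts = document.getElementsByTagName("script"),
        src = scripts[scripts.length - 1].src,
        baseUrl = src.substring(0, src.lastIndexOf('/') + 1);

    // Illustrative helper: load another script relative to this file's location.
    function loadRelative(file) {
        var s = document.createElement('script');
        s.src = baseUrl + file;
        document.head.appendChild(s);
    }

    loadRelative('myLib.js');   // resolves against the same domain/path as this script
})();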
All browsers except Internet Explorer (any version) have document.currentScript, which always works (no matter how the file was included: async, bookmarklet, etc.).
If you want to know the full URL of the JS file you're in right now:
var script = document.currentScript;
var fullUrl = script.src;
Tadaa.
I just made this little trick:
window.getRunningScript = () => {
    return () => {
        return new Error().stack.match(/([^ \n])*([a-z]*:\/\/\/?)*?[a-z0-9\/\\]*\.js/ig)[0];
    };
};

console.log('%c Currently running script:', 'color: blue', getRunningScript()());
✅ Works on: Chrome, Firefox, Edge, Opera
Enjoy!
The accepted answer here does not work if you have inline scripts in your document. To avoid this you can use the following to only target <script> tags with a [src] attribute.
/**
 * Current Script Path
 *
 * Get the dir path to the currently executing script file,
 * which is always the last one in the scripts array with
 * an [src] attr.
 */
var currentScriptPath = function () {
    var scripts = document.querySelectorAll('script[src]');
    var currentScript = scripts[scripts.length - 1].src;
    var currentScriptChunks = currentScript.split('/');
    var currentScriptFile = currentScriptChunks[currentScriptChunks.length - 1];
    return currentScript.replace(currentScriptFile, '');
};
This effectively captures the last external .js file, solving some issues I encountered with inline JS templates.
Refining the answers found here, I came up with the following:
getCurrentScript.js
var getCurrentScript = function() {
    if (document.currentScript) {
        return document.currentScript.src;
    } else {
        var scripts = document.getElementsByTagName('script');
        return scripts[scripts.length - 1].src;
    }
};

// module.exports = getCurrentScript;
console.log({ log: getCurrentScript() });
getCurrentScriptPath.js
var getCurrentScript = require('./getCurrentScript');

var getCurrentScriptPath = function () {
    var script = getCurrentScript();
    var path = script.substring(0, script.lastIndexOf('/'));
    return path;
};

module.exports = getCurrentScriptPath;
BTW: I'm using the CommonJS module format and bundling with webpack.
I've more recently found a much cleaner approach to this, which can be executed at any time, rather than being forced to do it synchronously when the script loads.
Use stackinfo to get a stack trace at the current location, and grab the info.file name off the top of the stack.
info = stackinfo()
console.log('This is the url of the script '+info[0].file)
I've coded a simple function which lets you get the absolute location of the current JavaScript file, using a try/catch method.
// Get script file location
// (doesn't work for older browsers)
var getScriptLocation = function() {
    var fileName = "fileName";
    var stack = "stack";
    var stackTrace = "stacktrace";
    var loc = null;

    var matcher = function(stack, matchedLoc) { return loc = matchedLoc; };

    try {
        // Invalid code
        0();
    } catch (ex) {
        if (fileName in ex) { // Firefox
            loc = ex[fileName];
        } else if (stackTrace in ex) { // Opera
            ex[stackTrace].replace(/called from line \d+, column \d+ in (.*):/gm, matcher);
        } else if (stack in ex) { // WebKit, Blink and IE10
            ex[stack].replace(/at.*?\(?(\S+):\d+:\d+\)?$/g, matcher);
        }
        return loc;
    }
};
You can see it here.
Refining the answers found here (the "little trick" above and the getCurrentScript / getCurrentScriptPath answer), I came up with the following:
//Thanks to https://stackoverflow.com/a/27369985/5175935
var getCurrentScript = function() {
    if (document.currentScript && (document.currentScript.src !== ''))
        return document.currentScript.src;
    var scripts = document.getElementsByTagName('script'),
        str = scripts[scripts.length - 1].src;
    if (str !== '')
        return str;
    //Thanks to https://stackoverflow.com/a/42594856/5175935
    return new Error().stack.match(/(https?:[^:]*)/)[0];
};
//Thanks to https://stackoverflow.com/a/27369985/5175935
var getCurrentScriptPath = function() {
    var script = getCurrentScript(),
        path = script.substring(0, script.lastIndexOf('/'));
    return path;
};

console.log({ path: getCurrentScriptPath() });
Regardless of whether it's a script, an HTML file (for a frame, for example), a CSS file, an image, or whatever, if you don't specify a server/domain, the path of the HTML doc is the default, so you could do, for example:
<script type=text/javascript src='/dir/jsfile.js'></script>
or
<script type=text/javascript src='../../scripts/jsfile.js'></script>
If you don't provide the server/domain, the path will be relative either to the path of the page or to the main document's path.
I may be misunderstanding your question but it seems you should just be able to use a relative path as long as the production and development servers use the same path structure.
<script language="javascript" src="js/myLib.js" />
I've thrown together some spaghetti code that will get the currently running .js file (e.g. if you run a script with "node .", you can use this to get the directory of the script that's running).
This returns it in the form "file://path/to/directoryWhere/fileExists".
var thisFilesDirectoryPath = stackinfo()[0].traceline.substring("readFile (".length, stackinfo()[0].traceline.length - ")".length-"readFile (".length);
This requires an npm package (I'm sure it's on other platforms as well):
npm i stackinfo
import stackinfo from 'stackinfo'; or var {stackinfo} = require("stackinfo");
function getCurrnetScriptName() {
    const url = new URL(document.currentScript.src);
    const { length: len, [len - 1]: last } = url.pathname.split('/');
    return last.slice(0, -3);
}