How to go to the next page for scraping in PhantomJS - javascript

I'm trying to get several elements from a website with several pages. I'm currently using PhantomJS to do that work and my code almost works, but the issue is that my code scrapes twice the first page even if (according to the log) it seems that I already moved to the second one.
Here's the code:
var page = require('webpage').create();
page.viewportSize = { width: 1061, height: 1000 }; //To specify the window size
page.open("website", function () {
function fetch_names(){
var name = page.evaluate(function () {
return [].map.call(document.querySelectorAll('div.pepitesteasermain h2 a'), function(name){
return name.getAttribute('href');
});
});
console.log(name.join('\n'));
page.render('1.png');
window.setTimeout(function (){
goto_next_page();
}, 5000);
}
function goto_next_page(){
page.evaluate(function () {
var a = document.querySelector('#block-system-main .next a');
var e = document.createEvent('MouseEvents');
e.initMouseEvent('click', true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
a.dispatchEvent(e);
waitforload = true;
});
fetch_names();
}
fetch_names();
});
You can try it by yourself to understand how all of that work.

You need to wait for the page to load after you click and not before you click by moving setTimeout() from fetch_names to goto_next_page:
function fetch_names(){
var name = page.evaluate(function () {
return [].map.call(document.querySelectorAll('div.pepitesteasermain h2 a'), function(name){
return name.getAttribute('href');
});
});
console.log(name.join('\n'));
page.render('1.png');
goto_next_page();
}
function goto_next_page(){
page.evaluate(function () {
var a = document.querySelector('#block-system-main .next a');
var e = document.createEvent('MouseEvents');
e.initMouseEvent('click', true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
a.dispatchEvent(e);
waitforload = true;
});
window.setTimeout(function (){
fetch_names();
}, 5000);
}
Note that there are many more ways to wait for something other than the static timeout. Instead, you can
register to the page.onLoadFinished event:
page.onLoadFinished = fetch_names;
wait for a specific selector to appear with the waitFor() function from the examples.

Related

How to trigger an element by id? (".click();" doesn't work)

I want to trigger this element in Google Translate, so that it would always auto-correct everything I type.
https://i.snag.gy/NRsWFB.jpg
The element's id is "spelling-correction".
I tried this:
document.getElementById('spelling-correction').click();
And this:
function clickLink(link) {
var cancelled = false;
if (document.createEvent) {
var event = document.createEvent("MouseEvents");
event.initMouseEvent("click", true, true, window,
0, 0, 0, 0, 0,
false, false, false, false,
0, null);
cancelled = !link.dispatchEvent(event);
}
else if (link.fireEvent) {
cancelled = !link.fireEvent("onclick");
}
if (!cancelled) {
window.location = link.href;
}
}
setInterval(function copyText() {
var correction123 = document.getElementById("spelling-correction");
correction123.clickLink();
}, 100);
But they don't work unfortunately. I would like to somehow trigger this "spelling-correction", so that whatever I write would be auto-corrected.
Thank you in advance!
The issue is you're clicking on a div. Divs do nothing when clicked (unless otherwise specified).
Since what you want seems to be clicking on the link itself, you should try something like this instead:
childAnchors = document.querySelectorAll("#spelling-correction > a");
childAnchors[0].click();

smoothstate.js - back button sometimes doesn't work?

I'm using the plugin smoothstate.js on my website. For some reason, every now and again when I navigate through the pages using the back and forward buttons, the back button stops working.
The URL changes but the content remains the same?
I've checked the console for errors this is displaying:
TypeError: Cannot read property 'state' of undefined
Does anyone know why this is happening? Like I said, the majority of the time it works okay but all of sudden it doesn't.
The code I'm using is like so:
$(function(){
'use strict';
var options = {
prefetch: true,
debug:true,
cacheLength: 0,
repeatDelay: 500,
onStart: {
duration: 0, // Duration of our animation
render: function ($container) {
// Add your CSS animation reversing class
$container.addClass('is-exiting');
// Restart your animation
smoothState.restartCSSAnimations();
}
},
onProgress: {
// How long this animation takes
duration: 0,
// A function that dictates the animations that take place
render: function ($container) {
$container.addClass('is-loading');
$('#progressBar').append('<div id="bar"></div>');
var progress = '100%';
$('#bar').animate({
width: progress
}, 400);
}
},
onReady: {
duration: 0,
render: function ($container, $newContent) {
$container.removeClass('is-loading is-exiting');
// Inject the new content
$container.html($newContent);
},
},
onAfter: function() {
navbarAnimate();
closeMenu();
ImageSliders();
initPhotoSwipeFromDOM('.gallery');
ImageOverlay();
window.parsePinBtns();
backToTop();
}
},
smoothState = $('#main').smoothState(options).data('smoothState');
});
I also ran into this issue which happens when the back and forward buttons are clicked too fast without the page fully loading. A hacky solution for me was to reload the page if page.cache[page.href] is undefined.
/** Handles the popstate event, like when the user hits 'back' */
onPopState = function(e) {
if(e.state !== null || typeof e.state !== undefined) {
var url = window.location.href;
var $page = $('#' + e.state.id);
var page = $page.data('smoothState');
if (typeof(page.cache[page.href]) !== 'undefined') {
var diffUrl = (page.href !== url && !utility.isHash(url, page.href));
var diffState = (e.state !== page.cache[page.href].state);
if(diffUrl || diffState) {
if (diffState) {
page.clear(page.href);
}
page.load(url, false);
}
}
else {
//reload the page if page.cache[page.href] is undefined
location.reload();
}
}
},

Disable AJAX Load of Content via Function

I have a custom plugin that was originally loading html content via ajax into the page by appending a hash marker and page ID to the URL.
I am very new to this level of complexity and would like to 'undo' this functionality, so the plugin can initialize without the Router function. I've been looking at this for a couple days and am pretty lost...
The entire plugin seems to be initialized by this function. Any tips or suggestions on how to turn 'off' this feature, so the code still initializes without appending the # to the URL would be greatly appreciated.
$(function() {
var connections = [],
IEversion = detectIE(),
killConnections = null,
node = null,
randomBehaviour,
rootIndex = 1,
silentRoute = null;
// Splash.
var splash = {
init: function() {
$('#splash').addClass('active');
$('.node.splash').draggable({
containment: 'parent',
drag: function() {
if(!splash.destroyed) {
$('.node.splash').addClass('dragging');
splash.destroy('drag');
splash.destroyed = true;
}
},
scroll: false,
disabled: false
});
setTimeout(function() {
if($('.arrow').is(':visible')) {
splash.destroy();
}
}, 4000);
},
destroy: function(event) {
if(event === 'drag') {
$('.arrow').hide();
} else {
$('.arrow').fadeOut(500);
}
$('#splash-wrapper p, .node.splash').fadeOut(500);
setTimeout(function() {
$('#splash').remove();
nody.init();
}, 500);
},
destroyed: false
};
// Router.
var routes = {
'/': function() {
if(!splash.destroyed) {
splash.init();
}else {
nody.unloadMenu();
}
}
};
var router = Router(routes);
router.configure({
strict: false,
before: function() {
if(silentRoute) {
silentRoute = false;
return false;
}
}
}).init('/');
});
An easy answer-
$.mobile.ajaxEnabled = false;
Using jQuery Mobile.
Detail is here.
If u want to use jQuery, then there is not a direct way of doing it.
So u should use a global variable for doing it like this way-
var is_ajax_enabled = true;
$(document).ready(function()
{
...................
...................
$("selector").click(function()
{
...................
//before AJAX call, just do a checking like it-
if(is_ajax_enabled())
{
//make your AJAX call here
$.ajax({url: "demo_test.txt", success: function(result)
{
$("#div1").html(result);
}});
}
...................
});
...................
...................
});
function disable_ajax()
{
is_ajax_enabled=false;
}
function enable_ajax()
{
is_ajax_enabled=true;
}
function is_ajax_enabled()
{
return is_ajax_enabled;
}
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js"></script>
Update-
If u want to have only one request available at a time and make a queue for the request, u can try this jQuery-Ajax-Singleton like this way-
$.ajax(options || {})

How to click on selectbox options using PhantomJS

There is the page testkrok.org.ua with a consistent selection of parameters. So, I need to create a series of 5 clicks on each of the options of 5 select boxes that depend on each other.
document.querySelector('select.se1')[3]
document.querySelector('select.se2')[1]
document.querySelector('select.se3')[1]
document.querySelector('select.se4')[1]
document.querySelector('select.se5')[3]
to redirect to the page with tests.
But on snapshot taken after the first click the second panel does not appear?
Maybe I don't hit the the element?
var page = require('webpage').create();
page.open('https://testkrok.org.ua', function(status) {
console.log("Status: " + status);
if(status === "success") {
page.evaluate(function() {
var theEvent = document.createEvent("MouseEvent");
theEvent.initMouseEvent("click", true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
var element = document.querySelector('select.se1')[3];
element.dispatchEvent(theEvent);
});
}
setTimeout( function() {
page.render('snapshot.png');
phantom.exit()
}, 5000);
});
You can't click (trigger a click event) on options of a select box. You need to change the selected option and then trigger a change event. For example:
var sel = document.querySelector('select.se1');
sel.selectedIndex = 2;
var event = new UIEvent("change", {
"view": window,
"bubbles": true,
"cancelable": true
});
sel.dispatchEvent(event);
You can package that in a function
function selectOption(selector, optionIndex) {
page.evaluate(function(selector, optionIndex){
var sel = document.querySelector(selector);
sel.selectedIndex = optionIndex;
var event = new UIEvent("change", {
"view": window,
"bubbles": true,
"cancelable": true
});
sel.dispatchEvent(event);
}, selector, optionIndex);
}
Then you can call it one after the other
selectOption("select.se1", 2);
selectOption("select.se2", 0);
selectOption("select.se3", 0);
...
You get the idea. In case the onChange event of the select box needs remote data for example through AJAX, then you will need to wait between the calls. Either use a static wait time (see following example) or use waitFor().
setTimeout(function(){
selectOption("select.se1", 2);
}, 1000);
setTimeout(function(){
selectOption("select.se2", 0);
}, 2000);
setTimeout(function(){
selectOption("select.se3", 0);
}, 3000);
...

How to make a click over a button inside a dynamically loaded iframe

The title say it all. I am working with a iframe whose the only thing I know is part of their src attribute. Until now I can reach the target element (an anchor) by their (known) id:
var f = $('iframe[src^="url"]', newTabBrowser.contentDocument);
if ( ! f.length)
return;
var b = f.contents().find('#button');
if ( ! b.length)
return;
At this point I have the desired anchor element into the jQuery variable b, but I can't click it. The anchor is like this:
I have tried:
b.click();
and:
simulateClick(b);
function simulateClick(elm) {
var evt = document.createEvent("MouseEvents");
evt.initMouseEvent("click", true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
var canceled = !elm.dispatchEvent(evt);
if(canceled) {
return false;
} else {
return true;
}
}
None of both works. Any idea about how to proceed or another technique to try?
OBS: This is part of a FF addon. That's why I use newTabBrowser.contentDocument
Does this work for you:
$(document).ready(function() {
var frame = $('#iframeID').get(0).contentDocument;
$('#button', frame).click(function() {
alert("Clicked me..!");
});
});
Hope it helps in some sense.

Categories