Selenium Click() not working with scrapy spider

Selenium Click() not working with scrapy spider - javascript

I am trying to scrape links to product pages from a listing page using a scrapy spider. The page shows the first 10 machines and has a button for 'show all machines' that calls some javascript. The javascript is reasonably complicated (i.e. I can't just look at the function and see the url that the button points to). I'm trying to use the selenium webdriver to simulate a click on the button but it isn't working for some reason. When I scrape the product links I only get the first 10, not the complete list.
Can anybody tell me why it doesn't work?
The page I'm trying to scrape is http://www.ncservice.com/en/second-hand-milling-machines
The spider is
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request, FormRequest
from scrapy import log
from scrapy.exceptions import DropItem
from scrapy import signals
from mtispider.items import MachineItem
import urlparse
import time
import MySQLdb
import unicodedata
import re
from mtispider import tools
from selenium import webdriver
class MachineSpider(CrawlSpider):
name = 'nc-spider'
allowed_domains = ['ncservice.com']
def start_requests(self):
requests = list(super(MachineSpider, self).start_requests())
requests.append(Request('http://www.ncservice.com/en/second-hand-milling-machines', callback=self.parsencmilllist))
return requests
def parsencmilllist(self,response):
hxs=HtmlXPathSelector(response)
driver= webdriver.Firefox()
driver.get(response.url)
try:
driver.FindElement(By.Id("mas-resultados-fresadoras")).Click()
except:
log.msg("Couldnt get all the machines", level=log.INFO)
ncmachs = hxs.select('//div[#id="resultados"]//a/#href').extract()
for ncmach in ncmachs:
yield Request(ncmach,
meta = {'type':'Milling'},
callback=self.parsencmachine)
driver.quit()
def parsencmachine(self,response):
#scrape the machine
return item
Thanks!

The main problem is that you need to initialize your Selector from the webdriver's page_source and not the response passed into the callback:
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from scrapy import Selector
from selenium import webdriver
class MachineSpider(CrawlSpider):
name = 'nc-spider'
allowed_domains = ['ncservice.com']
def start_requests(self):
yield Request('http://www.ncservice.com/en/second-hand-milling-machines',
callback=self.parsencmilllist)
def parsencmilllist(self, response):
driver = webdriver.Firefox()
driver.get(response.url)
driver.find_element_by_id("mas-resultados-fresadoras").click()
sel = Selector(text=driver.page_source)
driver.quit()
links = sel.xpath('//div[#id="resultados"]//a/#href').extract()
for link in links:
yield Request(link,
meta={'type': 'Milling'},
callback=self.parsencmachine)
def parsencmachine(self, response):
print response.url

Related

Use Python to Parse HTML from dynamic js webpage with qt5 and Beautifulsoup [duplicate]

I am trying to get HTML of a page loaded in PyQT5 QWebEngineView. Here is a simple example:
import sys
from PyQt5.QtCore import *
from PyQt5.QtWebEngineWidgets import *
from PyQt5.QtWidgets import *
def callback_function(html):
print(html)
def on_load_finished():
web.page().runJavaScript("document.getElementsByTagName('html')[0]", callback_function)
app = QApplication(sys.argv)
web = QWebEngineView()
web.load(QUrl("https://stackoverflow.com"))
web.show()
web.loadFinished.connect(on_load_finished)
sys.exit(app.exec_())
I was hoping to be able to return html from the runJavaScript() call but i get a blank in the callback function.
What is incorrect in my code and what alternatives are available for obtaining HTML of a page?

Using my old answer written C++ and translating the solution to Python:
import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication
def callback_function(html):
print(html)
def on_load_finished():
web.page().runJavaScript("document.documentElement.outerHTML", callback_function)
app = QApplication(sys.argv)
web = QWebEngineView()
web.load(QUrl("https://stackoverflow.com"))
web.show()
web.resize(640, 480)
web.loadFinished.connect(on_load_finished)
sys.exit(app.exec_())
Update:
The problem in your case is that getElementsByTagName() returns a list of js elements, and that element cannot be exported to python, what you should do is get the innerHTML:
import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication
def callback_function(html):
print(html)
def on_load_finished():
web.page().runJavaScript(
"document.getElementsByTagName('html')[0].innerHTML", callback_function
)
# or document.getElementsByTagName('html')[0].outerHTML
app = QApplication(sys.argv)
web = QWebEngineView()
web.load(QUrl("https://stackoverflow.com"))
web.show()
web.resize(640, 480)
web.loadFinished.connect(on_load_finished)
sys.exit(app.exec_())

How do I scrape data from JavaScript website?

I am trying to scrape data from this dynamic JavaScript website. Since the page is dynamic I am using Selenium to extract the data from the table. Please suggest me how to scrape the data from the dynamic table. Here is my code.
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import lxml.html as LH
import requests
# specify the url
urlpage = 'http://www.sotaventogalicia.com/en/real-time-data/historical'
print(urlpage)
# run firefox webdriver from executable path of your choice
driver = webdriver.Chrome('C:/Users/Shresth Suman/Downloads/chromedriver_win32/chromedriver.exe')
##driver = webdriver.Firefox(executable_path = 'C:/Users/Shresth Suman/Downloads/geckodriver-v0.26.0-win64/geckodriver.exe')
# get web page
driver.get(urlpage)
# execute script to scroll down the page
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
# sleep for 5s
time.sleep(5)
# driver.quit()
# find elements by xpath
##results = driver.find_elements_by_xpath("//div[#id='div_taboa']//table[#id='taboa']/tbody")
##results = driver.find_elements_by_xpath("//*[#id='page-title']")
##results = driver.find_elements_by_xpath("//*[#id='div_main']/h2[1]")
results = driver.find_elements_by_xpath("//*[#id = 'frame_historicos']")
print(results)
print(len(results))
# create empty array to store data
data = []
# loop over results
for result in results:
heading = result.text
print(heading)
headingfind = result.find_element_by_tag_name('h1')
# append dict to array
data.append({"head" : headingfind, "name" : heading})
# close driver
driver.quit()
###################################################################
# save to pandas dataframe
df = pd.DataFrame(data)
print(df)
# write to csv
df.to_csv('testsot.csv')
I want to extract data from 2005 till present with Averages/Totals of 10 min which gives me data for only one month.

Induce WebDriverWait And element_to_be_clickable()
Install Beautiful soup library
Using pandas read_html()
I haven't create list. you should create startdate and enddate list and itearte for all those month since 1/1/2005
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
urlpage = 'http://www.sotaventogalicia.com/en/real-time-data/historical'
driver = webdriver.Chrome('C:/Users/Shresth Suman/Downloads/chromedriver_win32/chromedriver.exe')
driver.get(urlpage)
WebDriverWait(driver,20).until(EC.frame_to_be_available_and_switch_to_it((By.ID,"frame_historicos")))
inputstartdate=WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"(//input[#class='dijitReset dijitInputInner'])[1]")))
inputstartdate.clear()
inputstartdate.send_keys("1/1/2005")
inputenddate=WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"(//input[#class='dijitReset dijitInputInner'])[last()]")))
inputenddate.clear()
inputenddate.send_keys("1/31/2005")
WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//input[#class='form-submit'][#value='REFRESH']"))).click()
WebDriverWait(driver,20).until(EC.visibility_of_element_located((By.CSS_SELECTOR,"table#taboa")))
time.sleep(3)
soup=BeautifulSoup(driver.page_source,"html.parser")
table=soup.find("table", id="taboa")
df=pd.read_html(str(table))
df.to_csv('testsot.csv')
print(df)

Python Selenium BeautifulSoup Page Source Does Not Display Everything

Goal:
Hello I am pretty new to Web and Selenium. I am currently trying to grab a value from my JIRA Board. Here:
Problem:
For some reason that value does not show up in the page source. I think it might be a JavaScript rendered value? or maybe it gets generated after the page loads. I tried using implicitly_wait, WebDriverWait, and switch_Frame but nothing seems to work. =/
Code:
#!/usr/local/bin/python2.7
#import requests
import json
import base64
import sys
import getopt
import argparse
from datetime import datetime
from datetime import timedelta
from bs4 import BeautifulSoup
from jira import JIRA
from jira.client import GreenHopper
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
JIRA_INSTALLATION = "jira.turn.com"
STATE_IN_PROGRESS = "In Progress"
STATE_RESOLVED = "Resolved"
STATE_CLOSED = "Closed"
options = {'server': 'https://jira.turn.com'}
CUR_TIMEZONE_SHIFT = timedelta(hours=7)
def main(argv):
p=argparse.ArgumentParser(description="Gets a set of completed stories and list them with their implementation time.")
p.add_argument('filter_id', help="Id of the filter that contains completed stories.")
p.add_argument('-u', dest='username', help="JIRA username. Needs to have read access to all tickets returned from the search filter.")
p.add_argument('-p', dest='password', help="Password for the JIRA user to use for API calls.")
args = p.parse_args(argv)
driver = webdriver.Firefox()
driver.get('https://jira.turn.com/')
driver.find_element_by_id("login-form-username").send_keys(args.username)
driver.find_element_by_id ("login-form-password").send_keys(args.password)
driver.find_element_by_id("login").click()
#driver.implicitly_wait(10)
#ele = WebDriverWait(driver, 10)
driver.get('https://jira.turn.com/secure/RapidBoard.jspa?rapidView=184&view=reporting&chart=controlChart&days=30&column=1214&column=1298')
#WebDriverWait
soup_level1 = BeautifulSoup(driver.page_source, 'lxml')#'html.parser')#'lxml')
print soup_level1.find(id='ghx-chart-snapshot')#find(id='content').find(id="gh").find(id="ghx-content-main").find(id="ghx-chart-header"))
print soup_level1.find(id='ghx-chart-snapshot').find(id='ghx-chart-snapshot')
driver.quit()
return
if __name__ == "__main__":
main(sys.argv[1:])
Output:
<div id="ghx-chart-snapshot"></div>
None

Form scraping using scrapy with Javascript embedded

I am trying to scrape this website. My spider is functional but the website has javascript embedded in the form to get the result, which I can t get through.
I've read about selenium and how I have to include a browser to get through it but I still with how to scrape after the dynamically generated HTML is loaded, while still passing form arguments.
Here is the code for my spider, any help, referrals or code snippets are welcome. I've navigated and read many threads to no avail.
from scrapy.spiders import BaseSpider
from scrapy.http import FormRequest
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from tax1.items import Tax1Item
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from selenium import webdriver
import time
class tax1Spider(BaseSpider):
name = "tax1Spider"
allowed_domains = ["http://taxesejour.impots.gouv.fr"]
start_urls = ["http://taxesejour.impots.gouv.fr/DTS_WEB/UK/"]
def parse(self, response):
yield FormRequest.from_response(response,
formname='PAGE_DELIBV2',
formdata={'A8':'05 - Hautes-alpes',
'A10':'AIGUILLES'},
callback = self.parse1)
rules = (
Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[#class="lh0 dzSpan dzA15"]',)), callback="parse1", follow= True))
# lh0 dzSpan dzA15
def __init__(self):
CrawlSpider.__init__(self)
# use any browser you wish
self.browser = webdriver.Firefox()
def __del__(self):
self.browser.close()
def parse1(self, response):
#hxs = HtmlXPathSelector(response)
self.browser.get(response.url)
time.sleep(3)
Selector(text=self.browser.page_source)
items = []
#item = Tax1Item()
item['message'] = hxs.select('//td[#id="tzA18"]').extract()
print item['message']
return item

How to retrieve the exact HTML as in a browser

I'm using a Python script to render web pages and retrieve their HTML's. It works fine with most of the pages, but with some of them the HTML retrieved is incomplete. And I don't quite understand why. This is the script I'm using to scrap this page, for some reason, the link to every product is not in the HTML:
Link: http://www.pullandbear.com/es/es/mujer/vestidos-c29016.html
Python script:
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4 import QtNetwork
from PyQt4 import QtCore
url = sys.argv[1]
path = sys.argv[2]
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.request = QtNetwork.QNetworkRequest()
self.request.setUrl(QtCore.QUrl(url))
self.request.setRawHeader("Accept-Language", QtCore.QByteArray ("es ,*"))
self.mainFrame().load(self.request)
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
r = Render(url)
result = r.frame.toHtml()
html_file = open(path, "w")
html_file.write("%s" % result.encode("utf-8"))
html_file.close()
sys.exit(app.exec_())
This code was taken from here: https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/
Am I missing something? What are the limitations of this framework?
Thanks in advance,

If you want headless browsing you can combine phantomjs with selenium, the following gets all the source:
url = "http://www.pullandbear.com/es/es/mujer/vestidos-c29016.html"
from selenium import webdriver
dr = webdriver.PhantomJS()
dr.get(url)
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
element = WebDriverWait(dr, 5).until(
EC.presence_of_element_located((By.CLASS_NAME, "grid_itemContainer"))
)
Just using selenium without the WebDriverWait did not always return the full source, adding the wait until the a tags with the grid_itemContainer class were visible makes sure the html has been generated, the xpath below returns all your links:
print([a.get_attribute('href') for a in dr.find_elements_by_xpath("//a[#class='grid_itemContainer']")])
[u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-detalle-crochet-pechera-c29016p100064004.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-bordado-escote-pico-c29016p100123006.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-manga-larga-espalda-abierta-c29016p100147503.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-hombros-descubiertos-beads-c29016p100182001.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-jacquard-capa-c29016p100255505.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-vaquero-eyelets-c29016p100336010.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-liso-oversized-c29016p100289013.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-liso-oversized-c29016p100289013.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-camisero-oversized-c29016p100036616.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cuello-pico-c29016p100166506.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-estampado-rayas-c29016p100234507.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-manga-corta-liso-c29016p100262008.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-largo-cuello-halter-liso-c29016p100036162.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-capa-jacquard-%C3%A9tnico-c29016p100259002.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-largo-cuello-halter-rayas-c29016p100036161.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-capa-jacquard-tri%C3%A1ngulo-c29016p100255506.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-marinero-escote-bardot-c29016p100259003.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-rayas-escote-espalda-c29016p100262007.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cruzado-c29016p100216013.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-flores-canes%C3%BA-bordado-c29016p100203011.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-bordados-c29016p100037160.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-flores-volante-c29016p100216014.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-lencero-c29016p100104515.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cuadros-detalle-encaje-c29016p100216016.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-drapeado-abertura-bajo-c29016p100129011.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-drapeado-abertura-bajo-c29016p100129011.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-vaquero-bolsillo-plastr%C3%B3n-c29016p100036822.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-rayas-bajo-desigual-c29016p100123010.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-camisero-vaquero-c29016p100036575.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-midi-estampado-rayas-c29016p100189011.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-midi-rayas-manga-3-4-c29016p100149507.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-midi-canal%C3%A9-ajustado-c29016p100149508.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-estampado-bolsillos-c29016p100212503.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-corte-evas%C3%A9-bolsillos-c29016p100189012.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-vaquero-camisero-cuadros-c29016p100036624.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/pichi-vaquero-c29016p100073526.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-estampado-geom%C3%A9trico-cuello-halter-c29016p100037021.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cuello-perkins-manga-larga-c29016p100036882.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cuello-perkins-manga-larga-c29016p100036882.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cuello-perkins-manga-larga-c29016p100036882.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cuello-perkins-manga-larga-c29016p100036882.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-jacquard-evas%C3%A9-c29016p100037207.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cr%C3%AApe-evas%C3%A9-estampado-flores-manga-3-4-c29016p100036932.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cr%C3%AApe-evas%C3%A9-estampado-flores-manga-3-4-c29016p100037280.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cuello-perkins-parche-c29016p100037464.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cr%C3%AApe-evas%C3%A9-liso-manga-3-4-c29016p100036930.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cr%C3%AApe-evas%C3%A9-liso-manga-3-4-c29016p100036930.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cuello-alto-liso-c29016p100037156.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cuello-alto-estampado-flores-c29016p100036921.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-cuello-alto-estampado-corbatero-c29016p100037155.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-largo-manga-sisa-c29016p100170011.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-largo-manga-sisa-rayas-c29016p100170012.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-manga-acampanada-c29016p100149506.html', u'http://www.pullandbear.com/es/es/mujer/vestidos/vestido-punto-espalda-abierta-c29016p100195504.html']
If you want to write the source:
with open("out.html", "w") as f:
f.write(dr.page_source)

I think you can use http://ghost-py.readthedocs.org/en/latest/ for this case. It's loads web page like real browser and run JavaScript.
Also you can try PhantomJS for example, but it written on nodeJS.

We Keep Coding

JavaScript is the programming language of the Web.

Selenium Click() not working with scrapy spider - javascript

Related

Use Python to Parse HTML from dynamic js webpage with qt5 and Beautifulsoup [duplicate]

How do I scrape data from JavaScript website?

Python Selenium BeautifulSoup Page Source Does Not Display Everything

Form scraping using scrapy with Javascript embedded

How to retrieve the exact HTML as in a browser

Categories

Resources