How do I scrape data from a JavaScript website? - javascript

I am trying to scrape data from this dynamic JavaScript website. Since the page is rendered dynamically, I am using Selenium to extract the data from the table. Please suggest how to scrape the data from the dynamic table. Here is my code.
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import lxml.html as LH
import requests
# specify the url
urlpage = 'http://www.sotaventogalicia.com/en/real-time-data/historical'
print(urlpage)
# run Chrome webdriver from executable path of your choice
driver = webdriver.Chrome('C:/Users/Shresth Suman/Downloads/chromedriver_win32/chromedriver.exe')
##driver = webdriver.Firefox(executable_path = 'C:/Users/Shresth Suman/Downloads/geckodriver-v0.26.0-win64/geckodriver.exe')
# get web page
driver.get(urlpage)
# execute script to scroll down the page
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
# sleep for 5s
time.sleep(5)
# driver.quit()
# find elements by xpath
##results = driver.find_elements_by_xpath("//div[@id='div_taboa']//table[@id='taboa']/tbody")
##results = driver.find_elements_by_xpath("//*[@id='page-title']")
##results = driver.find_elements_by_xpath("//*[@id='div_main']/h2[1]")
results = driver.find_elements_by_xpath("//*[@id='frame_historicos']")
print(results)
print(len(results))
# create empty array to store data
data = []
# loop over results
for result in results:
    heading = result.text
    print(heading)
    headingfind = result.find_element_by_tag_name('h1')
    # append dict to array
    data.append({"head" : headingfind, "name" : heading})
# close driver
driver.quit()
###################################################################
# save to pandas dataframe
df = pd.DataFrame(data)
print(df)
# write to csv
df.to_csv('testsot.csv')
I want to extract data from 2005 till the present with Averages/Totals of 10 min, but the form only gives me data for one month at a time.

Induce WebDriverWait and element_to_be_clickable()
Install the Beautiful Soup library
Use pandas read_html()
I haven't created the list here. You should build start-date and end-date lists and iterate over all those months since 1/1/2005 (see the sketch after the code below).
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
urlpage = 'http://www.sotaventogalicia.com/en/real-time-data/historical'
driver = webdriver.Chrome('C:/Users/Shresth Suman/Downloads/chromedriver_win32/chromedriver.exe')
driver.get(urlpage)
WebDriverWait(driver,20).until(EC.frame_to_be_available_and_switch_to_it((By.ID,"frame_historicos")))
inputstartdate=WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"(//input[@class='dijitReset dijitInputInner'])[1]")))
inputstartdate.clear()
inputstartdate.send_keys("1/1/2005")
inputenddate=WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"(//input[@class='dijitReset dijitInputInner'])[last()]")))
inputenddate.clear()
inputenddate.send_keys("1/31/2005")
WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//input[@class='form-submit'][@value='REFRESH']"))).click()
WebDriverWait(driver,20).until(EC.visibility_of_element_located((By.CSS_SELECTOR,"table#taboa")))
time.sleep(3)
soup=BeautifulSoup(driver.page_source,"html.parser")
table=soup.find("table", id="taboa")
df=pd.read_html(str(table))[0]
df.to_csv('testsot.csv')
print(df)
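A minimal sketch of that month-by-month loop, building on the snippet above (driver already switched into frame_historicos, same input and refresh locators, same M/D/YYYY date format). The month_ranges helper is something I am introducing here as an assumption about how you might generate the date pairs; each month's table is collected and written out once at the end.
from datetime import date, timedelta

def month_ranges(start_year=2005):
    # Build (start, end) date strings, one pair per month, from Jan 2005 until today.
    today = date.today()
    ranges = []
    year, month = start_year, 1
    while (year, month) <= (today.year, today.month):
        nxt = date(year + 1, 1, 1) if month == 12 else date(year, month + 1, 1)
        last = nxt - timedelta(days=1)          # last day of the current month
        ranges.append(("%d/%d/%d" % (month, 1, year),
                       "%d/%d/%d" % (last.month, last.day, last.year)))
        year, month = nxt.year, nxt.month
    return ranges

frames = []
for start, end in month_ranges():
    inputstartdate = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
        (By.XPATH, "(//input[@class='dijitReset dijitInputInner'])[1]")))
    inputstartdate.clear()
    inputstartdate.send_keys(start)
    inputenddate = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
        (By.XPATH, "(//input[@class='dijitReset dijitInputInner'])[last()]")))
    inputenddate.clear()
    inputenddate.send_keys(end)
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
        (By.XPATH, "//input[@class='form-submit'][@value='REFRESH']"))).click()
    WebDriverWait(driver, 20).until(EC.visibility_of_element_located(
        (By.CSS_SELECTOR, "table#taboa")))
    time.sleep(3)  # mirror the pause in the answer above while the table re-renders
    table = BeautifulSoup(driver.page_source, "html.parser").find("table", id="taboa")
    frames.append(pd.read_html(str(table))[0])

pd.concat(frames, ignore_index=True).to_csv('testsot.csv')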

Related

Web scraping javascript based websites using selenium gives error

I've been working on a project that sends a few numbers, scraped from a JavaScript-based website, to a specific Discord server. I've gotten to the point where I only need to scrape the numbers, but I am having issues with it. When I try to get the numbers, this error pops up:
Traceback (most recent call last):
File "C:\Users\Administrator\Desktop\cukor4_dry.py", line 48, in <module>
element = wait.until(EC.visibility_of_element_located((By.ID, "mainbgsection")))
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38-32\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
code I use:
#import libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import time
from twill.commands import *
import pyautogui
import os
import subprocess
from dhooks import Webhook, File
import sys
#set settings
chrome_options = webdriver.ChromeOptions()
webdriver = webdriver.Chrome("chromedriver.exe", options=chrome_options)
hook = Webhook('webhook link')
time.sleep(4)
print('form')
showforms()
try:
    #try to log into page
    webdriver.get('url')
    webdriver.find_element_by_id('username').send_keys('username')
    webdriver.find_element_by_id('password').send_keys('password')
    webdriver.find_element_by_name('actionButton').click()
    print('submit')
except:
    #already logged in
    pass
print('waited')
#try to scrape the website
url = "url"
webdriver.get(url)
wait = WebDriverWait(webdriver, 10)
element = wait.until(EC.visibility_of_element_located((By.ID, "mainbgsection")))
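One hedged guess at the TimeoutException: the element is not in the top-level document when the wait runs, for example because it is rendered inside an iframe or the login never actually completed. A minimal sketch of switching into a frame before the wait, assuming the same driver; the frame id used here is hypothetical and must be taken from the real page source.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

wait = WebDriverWait(webdriver, 30)  # give the JS-rendered page more time
# If "mainbgsection" sits inside an iframe (assumption), switch into it first;
# "content_frame" is a hypothetical id and needs to be replaced with the real one.
wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, "content_frame")))
element = wait.until(EC.visibility_of_element_located((By.ID, "mainbgsection")))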

How do I extract the data from these JavaScript tables using Selenium and Python?

I am very new to Python, JavaScript, and web scraping. I am trying to write code that writes all of the data in tables like this into a CSV file. The webpage is "https://www.mcmaster.com/cam-lock-fittings/material~aluminum/"
I started by trying to find the data in the HTML but then realized that the website uses JavaScript. I then tried using Selenium, but I cannot find the actual data that is displayed in these tables anywhere in the JavaScript code. I wrote this code to see if I could find the displayed data anywhere, but I was unable to find it.
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
url = 'https://www.mcmaster.com/cam-lock-fittings/material~aluminum/'
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(executable_path='C:/Users/Brian Knoll/Desktop/chromedriver.exe', options=options)
driver.get(url)
html = driver.execute_script("return document.documentElement.outerHTML")
driver.close()
filename = "McMaster Text.txt"
fo = open(filename, "w")
fo.write(html)
fo.close()
I'm sure there's an obvious answer that is just going over my head. Any help would be greatly appreciated! Thank you!
I guess you need to wait till the table you're looking for is loaded.
To do so, add the following line to wait up to 10 seconds before starting to scrape the data:
fullLoad = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'ItmTblCntnr')]")))
Here is the full code:
import os
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
url = 'https://www.mcmaster.com/cam-lock-fittings/material~aluminum/'
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(executable_path=os.path.abspath("chromedriver"), options=options)
driver.get(url)
fullLoad = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'ItmTblCntnr')]")))
html = driver.execute_script("return document.documentElement.outerHTML")
driver.close()
filename = "McMaster Text.txt"
fo = open(filename, "w")
fo.write(html)
fo.close()
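Once the page source has been dumped, the table data still has to be pulled out of the file. A hedged sketch that reads the saved file back with pandas.read_html, assuming the grid is real <table> markup (if McMaster builds it from divs, BeautifulSoup selectors would be needed instead); the output file name is only an example.
import pandas as pd

# Read the dumped page back and let pandas extract any <table> elements it finds.
with open("McMaster Text.txt") as fo:
    html = fo.read()

tables = pd.read_html(html)                  # one DataFrame per <table> found
print("found", len(tables), "tables")
tables[0].to_csv("mcmaster_cam_lock_fittings.csv", index=False)  # example file name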

Selenium Python - Get Table Data Instead of JavaScript Code

I need some help with a data scraping task on this site: https://soilhealth.dac.gov.in/NewHomePage/NutriPage
I managed to fill the dropdown menus and to click on View using this code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
url = "https://soilhealth.dac.gov.in/NewHomePage/NutriPage"
driver = webdriver.Chrome(executable_path='./chromedriver.exe')
driver.get(url)
select = Select(driver.find_element_by_id('NutriCatId'))
select.select_by_visible_text('Sample Wise')
select = Select(driver.find_element_by_id('CycleId'))
select.select_by_visible_text('All Cycle')
select = Select(driver.find_element_by_id('State_Code'))
select.select_by_visible_text('Andaman And Nicobar Islands')
driver.implicitly_wait(5)
select = Select(driver.find_element_by_id('District_Code'))
select.select_by_visible_text('Nicobars')
driver.find_element_by_id('s').click()
driver.implicitly_wait(30)
soup_level1=BeautifulSoup(driver.page_source, 'lxml')
I need to scrape the table data from the source code, but instead of having it in soup_level1, I only have the JavaScript code.
Any help to know whether scraping the data is possible using Selenium, and how I can do it, would be awesome.
Thank you for your help.
Hey, the code below does the job, but it is slow since the table is huge and it takes a bit of time to parse. I observed that the report has an export option available, so do try to download it directly using Selenium. As for the explanation: the report is generated inside an iframe, which is separate from the default source of the page, so you need to switch to that frame to get the info. Do let me know if you need any clarification. The required data is in the df variable.
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 24 11:08:32 2020
#author: prakh
"""
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import pandas as pd
import time
url = "https://soilhealth.dac.gov.in/NewHomePage/NutriPage"
driver = webdriver.Chrome(executable_path='C:/Users/prakh/Documents/PythonScripts/chromedriver.exe')
driver.get(url)
select = Select(driver.find_element_by_id('NutriCatId'))
select.select_by_visible_text('Sample Wise')
select = Select(driver.find_element_by_id('CycleId'))
select.select_by_visible_text('All Cycle')
select = Select(driver.find_element_by_id('State_Code'))
select.select_by_visible_text('Andaman And Nicobar Islands')
driver.implicitly_wait(5)
select = Select(driver.find_element_by_id('District_Code'))
select.select_by_visible_text('Nicobars')
driver.find_element_by_id('s').click()
driver.implicitly_wait(30)
#soup_level1=BeautifulSoup(driver.page_source, 'lxml')
#src = driver.find_element_by_xpath('//*[@id="report"]/iframe').get_attribute("src")
driver.switch_to.frame(driver.find_element_by_xpath('//*[@id="report"]/iframe'))
time.sleep(10)
html = driver.page_source
df_list = pd.read_html(html)
df = df_list[-3]
driver.quit()
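A short follow-up sketch on top of the code above: df_list holds every table pandas found inside the frame, and picking df_list[-3] is a bit brittle, so it can help to inspect the candidates before writing the chosen one to CSV (the output file name here is just an example).
# Print the shape of each table pandas recovered so the report table can be
# identified by inspection rather than by a hard-coded index.
for i, frame in enumerate(df_list):
    print(i, frame.shape)

df = df_list[-3]
df.to_csv('soil_health_nicobars.csv', index=False)  # example output file name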

Python Selenium BeautifulSoup Page Source Does Not Display Everything

Goal:
Hello, I am pretty new to the web and Selenium. I am currently trying to grab a value from my JIRA board.
Problem:
For some reason that value does not show up in the page source. I think it might be a JavaScript-rendered value, or maybe it gets generated after the page loads. I tried using implicitly_wait, WebDriverWait, and switch_to.frame, but nothing seems to work. =/
Code:
#!/usr/local/bin/python2.7
#import requests
import json
import base64
import sys
import getopt
import argparse
from datetime import datetime
from datetime import timedelta
from bs4 import BeautifulSoup
from jira import JIRA
from jira.client import GreenHopper
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
JIRA_INSTALLATION = "jira.turn.com"
STATE_IN_PROGRESS = "In Progress"
STATE_RESOLVED = "Resolved"
STATE_CLOSED = "Closed"
options = {'server': 'https://jira.turn.com'}
CUR_TIMEZONE_SHIFT = timedelta(hours=7)
def main(argv):
    p=argparse.ArgumentParser(description="Gets a set of completed stories and list them with their implementation time.")
    p.add_argument('filter_id', help="Id of the filter that contains completed stories.")
    p.add_argument('-u', dest='username', help="JIRA username. Needs to have read access to all tickets returned from the search filter.")
    p.add_argument('-p', dest='password', help="Password for the JIRA user to use for API calls.")
    args = p.parse_args(argv)
    driver = webdriver.Firefox()
    driver.get('https://jira.turn.com/')
    driver.find_element_by_id("login-form-username").send_keys(args.username)
    driver.find_element_by_id("login-form-password").send_keys(args.password)
    driver.find_element_by_id("login").click()
    #driver.implicitly_wait(10)
    #ele = WebDriverWait(driver, 10)
    driver.get('https://jira.turn.com/secure/RapidBoard.jspa?rapidView=184&view=reporting&chart=controlChart&days=30&column=1214&column=1298')
    #WebDriverWait
    soup_level1 = BeautifulSoup(driver.page_source, 'lxml')  # or 'html.parser'
    print soup_level1.find(id='ghx-chart-snapshot')  # .find(id='content').find(id="gh").find(id="ghx-content-main").find(id="ghx-chart-header")
    print soup_level1.find(id='ghx-chart-snapshot').find(id='ghx-chart-snapshot')
    driver.quit()
    return

if __name__ == "__main__":
    main(sys.argv[1:])
Output:
<div id="ghx-chart-snapshot"></div>
None
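The empty <div id="ghx-chart-snapshot"></div> in the output fits the guess that the chart is filled in by JavaScript after the initial load, so the page source is being read too early. A minimal sketch of waiting until that div actually has child elements before parsing, assuming the same driver and login flow as above (a guess, not a verified fix for this JIRA version):
from selenium.webdriver.support.ui import WebDriverWait

# Block (up to 30 s) until the chart container has at least one child element,
# i.e. the JavaScript has rendered something inside it.
WebDriverWait(driver, 30).until(
    lambda d: d.find_element_by_id('ghx-chart-snapshot').find_elements_by_xpath('./*'))
soup_level1 = BeautifulSoup(driver.page_source, 'lxml')
print soup_level1.find(id='ghx-chart-snapshot')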

Selenium Click() not working with scrapy spider

I am trying to scrape links to product pages from a listing page using a Scrapy spider. The page shows the first 10 machines and has a button for 'show all machines' that calls some JavaScript. The JavaScript is reasonably complicated (i.e. I can't just look at the function and see the URL that the button points to). I'm trying to use the Selenium webdriver to simulate a click on the button, but it isn't working for some reason. When I scrape the product links I only get the first 10, not the complete list.
Can anybody tell me why it doesn't work?
The page I'm trying to scrape is http://www.ncservice.com/en/second-hand-milling-machines
The spider is
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request, FormRequest
from scrapy import log
from scrapy.exceptions import DropItem
from scrapy import signals
from mtispider.items import MachineItem
import urlparse
import time
import MySQLdb
import unicodedata
import re
from mtispider import tools
from selenium import webdriver
class MachineSpider(CrawlSpider):
    name = 'nc-spider'
    allowed_domains = ['ncservice.com']

    def start_requests(self):
        requests = list(super(MachineSpider, self).start_requests())
        requests.append(Request('http://www.ncservice.com/en/second-hand-milling-machines', callback=self.parsencmilllist))
        return requests

    def parsencmilllist(self, response):
        hxs = HtmlXPathSelector(response)
        driver = webdriver.Firefox()
        driver.get(response.url)
        try:
            driver.FindElement(By.Id("mas-resultados-fresadoras")).Click()
        except:
            log.msg("Couldnt get all the machines", level=log.INFO)
        ncmachs = hxs.select('//div[@id="resultados"]//a/@href').extract()
        for ncmach in ncmachs:
            yield Request(ncmach,
                          meta={'type': 'Milling'},
                          callback=self.parsencmachine)
        driver.quit()

    def parsencmachine(self, response):
        #scrape the machine
        return item
Thanks!
The main problem is that you need to initialize your Selector from the webdriver's page_source and not the response passed into the callback:
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from scrapy import Selector
from selenium import webdriver
class MachineSpider(CrawlSpider):
    name = 'nc-spider'
    allowed_domains = ['ncservice.com']

    def start_requests(self):
        yield Request('http://www.ncservice.com/en/second-hand-milling-machines',
                      callback=self.parsencmilllist)

    def parsencmilllist(self, response):
        driver = webdriver.Firefox()
        driver.get(response.url)
        driver.find_element_by_id("mas-resultados-fresadoras").click()
        sel = Selector(text=driver.page_source)
        driver.quit()
        links = sel.xpath('//div[@id="resultados"]//a/@href').extract()
        for link in links:
            yield Request(link,
                          meta={'type': 'Milling'},
                          callback=self.parsencmachine)

    def parsencmachine(self, response):
        print response.url
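If the click fires but the extra rows are added asynchronously, page_source can still be read too early. A hedged refinement is to wait explicitly before building the Selector; a minimal sketch that would sit between the click and the Selector line above (the threshold of 10 links is an assumption taken from the question's "first 10 machines"):
from selenium.webdriver.support.ui import WebDriverWait

# Wait up to 10 s for the JavaScript to append more than the initial 10 result links.
WebDriverWait(driver, 10).until(
    lambda d: len(d.find_elements_by_xpath('//div[@id="resultados"]//a')) > 10)
sel = Selector(text=driver.page_source)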
