Goal:
Hello, I am pretty new to web development and Selenium. I am currently trying to grab a value from my JIRA board.
Problem:
For some reason that value does not show up in the page source. I think it might be rendered by JavaScript, or perhaps it gets generated after the page loads. I tried using implicitly_wait, WebDriverWait, and switch_to.frame, but nothing seems to work.
Code:
#!/usr/local/bin/python2.7
#import requests
import json
import base64
import sys
import getopt
import argparse
from datetime import datetime
from datetime import timedelta
from bs4 import BeautifulSoup
from jira import JIRA
from jira.client import GreenHopper
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
JIRA_INSTALLATION = "jira.turn.com"
STATE_IN_PROGRESS = "In Progress"
STATE_RESOLVED = "Resolved"
STATE_CLOSED = "Closed"
options = {'server': 'https://jira.turn.com'}
CUR_TIMEZONE_SHIFT = timedelta(hours=7)
def main(argv):
    p = argparse.ArgumentParser(description="Gets a set of completed stories and lists them with their implementation time.")
    p.add_argument('filter_id', help="Id of the filter that contains completed stories.")
    p.add_argument('-u', dest='username', help="JIRA username. Needs to have read access to all tickets returned from the search filter.")
    p.add_argument('-p', dest='password', help="Password for the JIRA user to use for API calls.")
    args = p.parse_args(argv)

    driver = webdriver.Firefox()
    driver.get('https://jira.turn.com/')
    driver.find_element_by_id("login-form-username").send_keys(args.username)
    driver.find_element_by_id("login-form-password").send_keys(args.password)
    driver.find_element_by_id("login").click()
    #driver.implicitly_wait(10)
    #ele = WebDriverWait(driver, 10)
    driver.get('https://jira.turn.com/secure/RapidBoard.jspa?rapidView=184&view=reporting&chart=controlChart&days=30&column=1214&column=1298')
    soup_level1 = BeautifulSoup(driver.page_source, 'lxml')
    print soup_level1.find(id='ghx-chart-snapshot')
    print soup_level1.find(id='ghx-chart-snapshot').find(id='ghx-chart-snapshot')
    driver.quit()
    return

if __name__ == "__main__":
    main(sys.argv[1:])
Output:
<div id="ghx-chart-snapshot"></div>
None
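Since the chart is rendered by JavaScript after the page loads, one approach is to wait until the snapshot div has rendered children before parsing. This is a sketch, not a verified fix: the child selector '#ghx-chart-snapshot *' is an assumption, and the chart may instead live inside an iframe that needs switch_to.frame first.
# Sketch: wait up to 30 s for anything to render inside the snapshot div
# before grabbing page_source. The child selector is an assumption.
try:
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#ghx-chart-snapshot *')))
except TimeoutException:
    print 'chart did not render; check whether it lives inside an iframe'
soup_level1 = BeautifulSoup(driver.page_source, 'lxml')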
Related
I am working with www.freightquote.com, and at some point I need to sign in, otherwise it does not allow me to get freight rates for more than 45 pairs.
I would like to enter the sign-in information for this website, but for some reason it is not working and I could not understand the problem.
You can go directly to this page: https://account.chrobinson.com/
I have trouble entering the information I am asked for. Here is what I did:
from selenium import webdriver
from time import sleep
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.service import Service
PATH = r'C:\Users\b\Desktop\Webscraping\chromedriver.exe'
s = Service(PATH)
driver = webdriver.Chrome(service=s)

link = "https://www.freightquote.com/book/#/free-quote/pickup"
driver.get(link)
sleep(2)
driver.maximize_window()
sleep(2)

driver.find_elements(by=By.XPATH, value='//button[@type="button"]')[0].click()
sleep(3)

# Username:
driver.find_element(by=By.XPATH, value='//input[@type="email"]').send_keys('USERNAME')
driver.find_elements(by=By.XPATH, value='//input[@class="button button-primary" and @type="submit"]')[0].click()

# Password:
driver.find_element(by=By.XPATH, value='//input[@type="password"]').send_keys('PASSWORD')
driver.find_elements(by=By.XPATH, value='//input[@class="button button-primary" and @type="submit"]')[0].click()
sleep(2)
Your code and your technique have too many problems; you should learn how to code in Selenium properly before writing scripts like this.
I modified your code up to the point of entering the email; please complete the code accordingly.
driver = webdriver.Chrome()
link = "https://www.freightquote.com/book/#/free-quote/pickup"
driver.get(link)
driver.maximize_window()
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.XPATH,
        '(//button[@type="button"])[1]'))).click()
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.XPATH,
        '//input[@type="email"]'))).send_keys('USERNAME')
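A hedged continuation for the next steps, reusing the question's locators (they are assumptions carried over from the question, not verified against the site):
# Sketch: continue with the submit button and the password field, using the
# question's locators (unverified assumptions about the page).
WebDriverWait(driver, 30).until(
    EC.element_to_be_clickable((By.XPATH,
        '//input[@class="button button-primary" and @type="submit"]'))).click()
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.XPATH,
        '//input[@type="password"]'))).send_keys('PASSWORD')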
Also, you don't need to add the chromedriver path in your code. If you use Windows or Linux you should add it to your virtualenv, in the /bin folder,
and if you use a Mac you should add it to /usr/local/bin.
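On recent Selenium (4.6+), the bundled Selenium Manager can resolve a matching driver automatically, so no path is needed at all; a minimal sketch:
# Assumes Selenium 4.6+; Selenium Manager downloads a matching chromedriver.
from selenium import webdriver

driver = webdriver.Chrome()  # no Service or executable path required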
To enter the sign-in information for the website, you need to induce WebDriverWait for element_to_be_clickable(), and you can use the following locator strategies:
Using CSS_SELECTOR:
driver.get("https://account.chrobinson.com/")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']"))).send_keys("Ribella")
driver.find_element(By.CSS_SELECTOR, "input[name='password']").send_keys("Ribella")
driver.find_element(By.CSS_SELECTOR, "input[value='Sign In']").click()
Note: You have to add the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
My code goes into a webpage and identifies each block on the page.
Each block contains the same style format for its information.
When trying to get the title, however, I am not able to pull anything.
Ideally I want the title, abstract, and author.
Here is my code so far, trying it for the title using XPath:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
driver = webdriver.Chrome()
driver.get('https://meetinglibrary.asco.org/results?filters=JTVCJTdCJTIyZmllbGQlMjIlM0ElMjJmY3RNZWV0aW5nTmFtZSUyMiUyQyUyMnZhbHVlJTIyJTNBJTIyQVNDTyUyMEFubnVhbCUyME1lZXRpbmclMjIlMkMlMjJxdWVyeVZhbHVlJTIyJTNBJTIyQVNDTyUyMEFubnVhbCUyME1lZXRpbmclMjIlMkMlMjJjaGlsZHJlbiUyMiUzQSU1QiU1RCUyQyUyMmluZGV4JTIyJTNBMCUyQyUyMm5lc3RlZFBhdGglMjIlM0ElMjIwJTIyJTdEJTJDJTdCJTIyZmllbGQlMjIlM0ElMjJZZWFyJTIyJTJDJTIydmFsdWUlMjIlM0ElMjIyMDIxJTIyJTJDJTIycXVlcnlWYWx1ZSUyMiUzQSUyMjIwMjElMjIlMkMlMjJjaGlsZHJlbiUyMiUzQSU1QiU1RCUyQyUyMmluZGV4JTIyJTNBMSUyQyUyMm5lc3RlZFBhdGglMjIlM0ElMjIxJTIyJTdEJTVE')
time.sleep(4)
page_source = driver.page_source
soup=BeautifulSoup(page_source,'html.parser')
productlist=soup.find_all('div',class_='ng-star-inserted')
for item in productlist:
    title = item.find_element_by_xpath("//span[@class='ng-star-inserted']").text
    print(title)
Try the code below and let me know if you have any queries -
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 60)
driver.get(
'https://meetinglibrary.asco.org/results?filters=JTVCJTdCJTIyZmllbGQlMjIlM0ElMjJmY3RNZWV0aW5nTmFtZSUyMiUyQyUyMnZhbH'
'VlJTIyJTNBJTIyQVNDTyUyMEFubnVhbCUyME1lZXRpbmclMjIlMkMlMjJxdWVyeVZhbHVlJTIyJTNBJTIyQVNDTyUyMEFubnVhbCUyME1lZXRpbmclM'
'jIlMkMlMjJjaGlsZHJlbiUyMiUzQSU1QiU1RCUyQyUyMmluZGV4JTIyJTNBMCUyQyUyMm5lc3RlZFBhdGglMjIlM0ElMjIwJTIyJTdEJTJDJTdCJTIy'
'ZmllbGQlMjIlM0ElMjJZZWFyJTIyJTJDJTIydmFsdWUlMjIlM0ElMjIyMDIxJTIyJTJDJTIycXVlcnlWYWx1ZSUyMiUzQSUyMjIwMjElMjIlMkMlMjJ'
'jaGlsZHJlbiUyMiUzQSU1QiU1RCUyQyUyMmluZGV4JTIyJTNBMSUyQyUyMm5lc3RlZFBhdGglMjIlM0ElMjIxJTIyJTdEJTVE')
AllRecords = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='record']")))
for SingleRecord in AllRecords:
    print("Title :- " + SingleRecord.find_element_by_xpath(
        "./descendant::div[contains(@class,'record__title')]/span").text)
    print("Author :- " + SingleRecord.find_element_by_xpath(
        "./descendant::div[contains(text(),'Author')]/following-sibling::div").text)
    print("Abstract :- " + SingleRecord.find_element_by_xpath(
        "./descendant::span[contains(text(),'Abstract')]/parent::div/following-sibling::span").text)
    print("-------------------------------------------------")
The output prints each record's title, author, and abstract, one after another.
If it resolves your issue, please mark it as the answer.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
wait=WebDriverWait(driver, 40)
driver.get('https://meetinglibrary.asco.org/results?filters=JTVCJTdCJTIyZmllbGQlMjIlM0ElMjJmY3RNZWV0aW5nTmFtZSUyMiUyQyUyMnZhbHVlJTIyJTNBJTIyQVNDTyUyMEFubnVhbCUyME1lZXRpbmclMjIlMkMlMjJxdWVyeVZhbHVlJTIyJTNBJTIyQVNDTyUyMEFubnVhbCUyME1lZXRpbmclMjIlMkMlMjJjaGlsZHJlbiUyMiUzQSU1QiU1RCUyQyUyMmluZGV4JTIyJTNBMCUyQyUyMm5lc3RlZFBhdGglMjIlM0ElMjIwJTIyJTdEJTJDJTdCJTIyZmllbGQlMjIlM0ElMjJZZWFyJTIyJTJDJTIydmFsdWUlMjIlM0ElMjIyMDIxJTIyJTJDJTIycXVlcnlWYWx1ZSUyMiUzQSUyMjIwMjElMjIlMkMlMjJjaGlsZHJlbiUyMiUzQSU1QiU1RCUyQyUyMmluZGV4JTIyJTNBMSUyQyUyMm5lc3RlZFBhdGglMjIlM0ElMjIxJTIyJTdEJTVE')
productList = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='record']")))
for product in productList:
    title = product.find_element_by_xpath(".//span[@class='ng-star-inserted']").text
    print(title)
Use .// and wait for the elements to be present. Also, the div class you used was off.
Outputs
A post-COVID survey of current and future parents among faculty, trainees, and research staff at an...
Novel approach to improve the diagnosis of pediatric cancer in Kenya via telehealth education.
Sexual harassment of oncologists.
Overall survival with circulating tumor DNA-guided therapy in advanced non-small cell lung cancer.
The other two locators are:
.//div[@class='record__ellipsis']
.//span[.=' Abstract ']/following::span
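Putting the three locators together, a minimal sketch (written with the Selenium 4 find_element style rather than the deprecated find_element_by_xpath):
# Sketch: print title, author, and abstract for each record using the
# three relative locators above (Selenium 4 element API).
for product in productList:
    title = product.find_element(By.XPATH, ".//span[@class='ng-star-inserted']").text
    author = product.find_element(By.XPATH, ".//div[@class='record__ellipsis']").text
    abstract = product.find_element(By.XPATH, ".//span[.=' Abstract ']/following::span").text
    print(title, author, abstract, sep='\n')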
My code accesses a page, and I am trying to click the button that says "Physician Program" in the menu list. If you click it in the browser, it directs you to a new webpage.
However, there is no href in the HTML of the page that would help me find this link via code (I am assuming because it is JavaScript?). Currently, I just used its XPath.
My question is: if I am able to click on it in a browser, shouldn't I be able to click on it using Selenium? If so, how can this be done?
import time
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://www.kidney.org/spring-clinical/program')
time.sleep(6)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
element1 = driver.find_element_by_xpath('//*[@id="dx-c7ad8807-6124-b55e-d292-29a4389dee8e"]/div')
element1.click()
The element is inside an iframe, so you need to switch to the iframe first:
driver.switch_to.frame("SCM20 Advanced Practitioner Program")
element1 = driver.find_element_by_xpath("//div[text()='Physician Program']")
element1.click()
Ideally you should use WebDriverWait and wait for the frame to be available:
WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.NAME, "SCM20 Advanced Practitioner Program")))
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[text()='Physician Program']"))).click()
You need to import the libraries below:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import subprocess
#other imports
import time
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://www.kidney.org/spring-clinical/program')
time.sleep(6)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
frame= WebDriverWait(driver,10).until(EC.presence_of_element_located(
(By.NAME, "SCM20 Advanced Practitioner Program")))
driver.switch_to.frame(frame)
options = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located(
(By.CSS_SELECTOR, '[class="track-selector-popup"] [role="option"]')))
options[0].click()
input()  # keeps the browser open until you press Enter
The element is inside an iframe, so switch to it and also use explicit waits. To switch back and interact with elements outside the frame, use:
driver.switch_to.default_content()
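A minimal sketch of the whole pattern, using the frame name from the answer above:
# Sketch: switch into the frame, click, then switch back out.
WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it(
    (By.NAME, "SCM20 Advanced Practitioner Program")))
WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
    (By.XPATH, "//div[text()='Physician Program']"))).click()
driver.switch_to.default_content()  # back to the top-level document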
I've been working on a project to send a few numbers, scraped from a JavaScript-based website, to a specific Discord server. I've gotten to the point where I only need to scrape the numbers, but I am having issues with it. When I try to get the numbers, this error pops up:
Traceback (most recent call last):
File "C:\Users\Administrator\Desktop\cukor4_dry.py", line 48, in <module>
element = wait.until(EC.visibility_of_element_located((By.ID, "mainbgsection")))
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38-32\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
The code I use:
#import libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import time
from twill.commands import *
import pyautogui
import os
import subprocess
from dhooks import Webhook, File
import sys
#set settings
chrome_options = webdriver.ChromeOptions()
webdriver = webdriver.Chrome("chromedriver.exe", options=chrome_options)
hook = Webhook('webhook link')
time.sleep(4)
print('form')
showforms()
try:
    # try to log into page
    webdriver.get('url')
    webdriver.find_element_by_id('username').send_keys('username')
    webdriver.find_element_by_id('password').send_keys('password')
    webdriver.find_element_by_name('actionButton').click()
    print('submit')
except:
    # already logged in
    pass
print('waited')
#try to scrape the website
url = "url"
webdriver.get(url)
wait = WebDriverWait(webdriver, 10)
element = wait.until(EC.visibility_of_element_located((By.ID, "mainbgsection")))
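A common cause of this TimeoutException is that the element sits inside an iframe; a hedged check for that case follows. The generic By.TAG_NAME "iframe" locator is an assumption, so inspect the page to confirm. (Separately, the script rebinds the module name webdriver to the driver instance at the top; renaming it to driver would avoid confusion.)
# Sketch: if "mainbgsection" lives inside an iframe, switch into it first.
# The iframe locator is an assumption; inspect the page to confirm.
wait.until(EC.frame_to_be_available_and_switch_to_it((By.TAG_NAME, "iframe")))
element = wait.until(EC.visibility_of_element_located((By.ID, "mainbgsection")))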
I am trying to scrape data from this dynamic JavaScript website. Since the page is dynamic, I am using Selenium to extract the data from the table. Please suggest how to scrape data from the dynamic table. Here is my code.
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import lxml.html as LH
import requests
# specify the url
urlpage = 'http://www.sotaventogalicia.com/en/real-time-data/historical'
print(urlpage)
# run chrome webdriver from executable path of your choice
driver = webdriver.Chrome('C:/Users/Shresth Suman/Downloads/chromedriver_win32/chromedriver.exe')
##driver = webdriver.Firefox(executable_path = 'C:/Users/Shresth Suman/Downloads/geckodriver-v0.26.0-win64/geckodriver.exe')
# get web page
driver.get(urlpage)
# execute script to scroll down the page
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
# sleep for 5s
time.sleep(5)
# driver.quit()
# find elements by xpath
##results = driver.find_elements_by_xpath("//div[@id='div_taboa']//table[@id='taboa']/tbody")
##results = driver.find_elements_by_xpath("//*[@id='page-title']")
##results = driver.find_elements_by_xpath("//*[@id='div_main']/h2[1]")
results = driver.find_elements_by_xpath("//*[@id='frame_historicos']")
print(results)
print(len(results))
# create empty array to store data
data = []
# loop over results
for result in results:
    heading = result.text
    print(heading)
    headingfind = result.find_element_by_tag_name('h1')
    # append dict to array
    data.append({"head": headingfind, "name": heading})
# close driver
driver.quit()
###################################################################
# save to pandas dataframe
df = pd.DataFrame(data)
print(df)
# write to csv
df.to_csv('testsot.csv')
I want to extract data from 2005 till the present with Averages/Totals of 10 min, but this gives me data for only one month at a time.
Induce WebDriverWait and element_to_be_clickable().
Install the Beautiful Soup library.
Use pandas read_html().
I haven't created the date lists; you should create start-date and end-date lists and iterate over all the months since 1/1/2005 (see the sketch after the code below).
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
urlpage = 'http://www.sotaventogalicia.com/en/real-time-data/historical'
driver = webdriver.Chrome('C:/Users/Shresth Suman/Downloads/chromedriver_win32/chromedriver.exe')
driver.get(urlpage)
WebDriverWait(driver,20).until(EC.frame_to_be_available_and_switch_to_it((By.ID,"frame_historicos")))
inputstartdate = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "(//input[@class='dijitReset dijitInputInner'])[1]")))
inputstartdate.clear()
inputstartdate.send_keys("1/1/2005")
inputenddate = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "(//input[@class='dijitReset dijitInputInner'])[last()]")))
inputenddate.clear()
inputenddate.send_keys("1/31/2005")
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//input[@class='form-submit'][@value='REFRESH']"))).click()
WebDriverWait(driver,20).until(EC.visibility_of_element_located((By.CSS_SELECTOR,"table#taboa")))
time.sleep(3)
soup=BeautifulSoup(driver.page_source,"html.parser")
table=soup.find("table", id="taboa")
df = pd.read_html(str(table))[0]  # read_html returns a list of DataFrames; take the first
df.to_csv('testsot.csv')
print(df)
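To cover 2005 to the present a month at a time, a sketch of the iteration described above. It assumes the fill/refresh/parse steps are wrapped in a helper called fetch_month(start, end), which is hypothetical and left to you; the date format matches the one the inputs above accept.
# Sketch of the month-by-month loop; fetch_month() is a hypothetical helper
# that fills the two date inputs, clicks REFRESH, waits for table#taboa,
# and returns the parsed DataFrame for that window.
import pandas as pd

def month_windows(first='2005-01-01'):
    # Yield (start, end) date strings covering each month from `first` to today.
    starts = pd.date_range(pd.Timestamp(first), pd.Timestamp.today(), freq='MS')
    for start in starts:
        end = start + pd.offsets.MonthEnd(0)  # last day of the same month
        yield (f'{start.month}/{start.day}/{start.year}',
               f'{end.month}/{end.day}/{end.year}')

frames = [fetch_month(s, e) for s, e in month_windows()]  # fetch_month is hypothetical
df = pd.concat(frames, ignore_index=True)
df.to_csv('testsot.csv')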