Parsing URLs from a JavaScript-driven page with BeautifulSoup and Selenium - javascript

I want to parse all URLs of Git repository files in which e-mail addresses occur.
I use https://grep.app for the search.
The code:
from bs4 import BeautifulSoup
from selenium import webdriver
url = 'https://grep.app/search?current=100&q=%40gmail.com'
chrome = "/home/dev/chromedriver"
browser = webdriver.Chrome(executable_path=chrome)
browser.get(url)
html = browser.page_source
soup = BeautifulSoup(html, 'lxml')
tags = soup.select('a')
print(tags)
When the code runs, Chrome starts and the page with the results loads. In Chrome's Developer Tools I can see plenty of <a> tags with href attributes for the URLs in the page source, like:
lib/plugins/revert/lang/eu/lang.php
But my code returns only the tags from the footer:
"[<span class="slashes">//</span>grep.app, Contact]"
As I understand it, something is wrong with the JavaScript parsing.
Please advise what I'm doing wrong.

Full code:
from bs4 import BeautifulSoup
from selenium import webdriver
url = 'https://grep.app/search?current=100&q=%40gmail.com'
chrome = "/home/dev/chromedriver"
browser = webdriver.Chrome(executable_path=chrome)
browser.get(url)
html = browser.page_source
soup = BeautifulSoup(html, 'lxml')
links = []
tags = soup.find_all('a', href=True)
for tag in tags:
    links.append(tag['href'])
print(links)
Output:
['/', 'mailto:hello@grep.app']
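A likely cause is that page_source is read before the JavaScript app has rendered the result list, so only the static footer links exist yet. Below is a minimal sketch of an explicit wait (untested against grep.app's current markup, and assuming the results render as ordinary <a href> elements once the app finishes):
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
url = 'https://grep.app/search?current=100&q=%40gmail.com'
browser = webdriver.Chrome(executable_path="/home/dev/chromedriver")
browser.get(url)
# Wait up to 15 seconds until more than the two footer links are present,
# i.e. until the JS-rendered results are in the DOM.
WebDriverWait(browser, 15).until(
    lambda driver: len(driver.find_elements_by_tag_name('a')) > 2
)
soup = BeautifulSoup(browser.page_source, 'lxml')
links = [tag['href'] for tag in soup.find_all('a', href=True)]
print(links)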

Related

Python BeautifulSoup html.parser not working

I have a script to pull book information from Amazon. It was running successfully before, but it failed today. I am not able to figure out exactly what is going wrong, but I assume it is parser- or JavaScript-related. I am using the code below.
from bs4 import BeautifulSoup
import requests
response = requests.get('https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=9780307397980',headers={'User-Agent': b'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'})
html = response.content
soup = BeautifulSoup(html, "html.parser")
resultcol = soup.find('div', attrs={'id':'resultsCol'})
Previously I used to get data in resultcol, but now it is blank. When I check html I can see the tag I am looking for, i.e. <div id="resultsCol" class=''>. But soup does not contain this text. Can anyone help me debug this? It was working perfectly fine before, but now it is not.
Remove headers, and it should work.
from bs4 import BeautifulSoup
import requests
response = requests.get('https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=9780307397980')
html = response.content
soup = BeautifulSoup(html, "html.parser")
resultcol = soup.find('div', attrs={'id':'resultsCol'})
You need to wait until the page is completely loaded. You can use PhantomJS to make sure the page has loaded correctly.
I was able to get the correct element with the following code.
from bs4 import BeautifulSoup
from selenium import webdriver
url = ("https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3D"
       "stripbooks&field-keywords=9780307397980")
browser = webdriver.PhantomJS()
browser.get(url)
html = browser.page_source
soup = BeautifulSoup(html, 'lxml')
resultcol = soup.find('img', attrs={'class': 's-access-image'})
print(resultcol)

Scraping a dynamic web page that requires authentication using python requests, BeautifulSoup and Selenium

I am working on a web-scraping project for a site that requires login. I managed to log in successfully. The site contains a dynamic table. When I run my code it scrapes the page, but not the dynamic content. I tried to use Selenium, but it always asks me to log in again in Chrome instead of taking me to the page.
The following is my login code to the page:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
server = requests.Session()
login_page_url = 'https://connect.data.com/login'
loginProcess_url = 'https://connect.data.com/loginProcess'
html = server.get(login_page_url).content
soup = BeautifulSoup(html, 'html.parser')
csrf = soup.find(id="CSRF_TOKEN")['value']
login_detail = {
    'j_username': '******',
    'j_password': '******',
    'CSRF_TOKEN': csrf,
}
server.post(loginProcess_url, data=login_detail)
r = server.get('https://connect.data.com/search#p=searchresult;;t=companies;;ss=advancedsearch;;q=H4sIAAAAAAAAAE2PzQ6CQAyE36VnDgsKGq48gMar4UCWqptAa_YHYwjv7nYJ6mUyO52v2c5wM4NH66CeQXMgbw3GxxWOSiloMzDUB_dNc1UoGWSQF7vNqWpzufpOk5UFOD4HfuPKlzLci-xECpGjyEGkSoDFCSms_V8rQeV_NZGxzy-KBzzMc_2hANAuGXTaGyZ3ooaHMFI6UbIJGyYfXUocWw819Og0LJHSwVokf-7uCHVeZuDZd8MFNds-7lrzUi0flYbWRDoBAAA')
soup = BeautifulSoup(r.text, 'html.parser')
print (soup.find('table',{"class":"result"}))
The following is the code I added to scrape the dynamic content:
path_to_driver = '/Users/Moment/Desktop/phantomjs'
url = 'https://connect.data.com/search#p=searchresult;;t=companies;;ss=advancedsearch;;q=H4sIAAAAAAAAAE2PzQ6CQAyE36VnDgsKGq48gMar4UCWqptAa_YHYwjv7nYJ6mUyO52v2c5wM4NH66CeQXMgbw3GxxWOSiloMzDUB_dNc1UoGWSQF7vNqWpzufpOk5UFOD4HfuPKlzLci-xECpGjyEGkSoDFCSms_V8rQeV_NZGxzy-KBzzMc_2hANAuGXTaGyZ3ooaHMFI6UbIJGyYfXUocWw819Og0LJHSwVokf-7uCHVeZuDZd8MFNds-7lrzUi0flYbWRDoBAAA'
browser = webdriver.PhantomJS(executable_path = path_to_driver)
browser.get(url)
html = browser.page_source
soup = BeautifulSoup(html, "lxml")
print(soup.prettify())
The first section of code logs me in, but whenever I add the second section I am no longer logged in; instead I get the login page.
I have tried both ChromeDriver and PhantomJS.
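A likely reason is that the Selenium browser is a separate process and does not share the cookies held by the requests session, so it is not logged in. Below is a rough, untested sketch of copying the session cookies into the driver after the login code above has run:
# assumes `server` (the logged-in requests session), `path_to_driver` and `url` from the code above
browser = webdriver.PhantomJS(executable_path=path_to_driver)
browser.get('https://connect.data.com/')  # must be on the domain before adding cookies
for cookie in server.cookies:
    browser.add_cookie({'name': cookie.name,
                        'value': cookie.value,
                        'domain': cookie.domain})
browser.get(url)  # the dynamic search page, now loaded while authenticated
soup = BeautifulSoup(browser.page_source, 'lxml')
print(soup.find('table', {'class': 'result'}))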

Selenium + PhantomJS in Scrapy

I am trying to use Selenium and PhantomJS to get the dynamic content of a website. Here's my code
from scrapy import Spider
from scrapy.selector import Selector
from selenium import webdriver

class judge(Spider):
    name = "judge"
    start_urls = ["http://wenshu.court.gov.cn/List/List?sorttype=1&conditions=searchWord+2+AJLX++%E6%A1%88%E4%BB%B6%E7%B1%BB%E5%9E%8B:%E6%B0%91%E4%BA%8B%E6%A1%88%E4%BB%B6"]

    def init_driver(self):
        driver = webdriver.Chrome()
        return driver

    def parse(self, response):
        driver = self.init_driver()
        driver.get(self.start_urls[0])
        sel = Selector(text=driver.page_source)
        self.logger.info(u'---------------Parsing----------------')
        print(sel.xpath("//div[@class='dataItem'][1]/table/tbody/tr[1]/td/div[@class='wstitle']/a/text()").extract())
        self.logger.info(u'---------------success----------------')
When I run my script with driver = webdriver.Chrome(), sel.xpath("//div[@class='dataItem']") gives the desired content and everything works fine. But when I instead use driver = webdriver.PhantomJS(), sel.xpath("//div[@class='dataItem']") is empty. I have tried using WebDriverWait after driver.get() to make sure the page is fully loaded, but it does not work.
You might try this:
driver = webdriver.PhantomJS('add your directory of phantomjs here')
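If the page is still empty after pointing Selenium at the PhantomJS binary, it may also be worth retrying an explicit wait on top of that. A rough sketch follows (the XPath is taken from the question and may need adjusting):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.PhantomJS('add your directory of phantomjs here')
driver.get("http://wenshu.court.gov.cn/List/List?sorttype=1&conditions=searchWord+2+AJLX++%E6%A1%88%E4%BB%B6%E7%B1%BB%E5%9E%8B:%E6%B0%91%E4%BA%8B%E6%A1%88%E4%BB%B6")
# Wait up to 30 seconds for the dynamically inserted result rows to appear.
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.XPATH, "//div[@class='dataItem']"))
)
html = driver.page_source  # the result rows should now be present in the source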

Python web scraping for javascript generated content

I am trying to use Python 3 to return the BibTeX citation generated by http://www.doi2bib.org/. The URLs are predictable, so the script can work out the URL without having to interact with the web page. I have tried using Selenium, bs4, etc. but can't get the text inside the box.
url = "http://www.doi2bib.org/#/doi/10.1007/s00425-007-0544-9"
import urllib.request
from bs4 import BeautifulSoup
text = BeautifulSoup(urllib.request.urlopen(url).read())
print(text)
Can anyone suggest a way of returning the bibtex citation as a string (or whatever) in python?
You don't need BeautifulSoup here. There is an additional XHR request sent to the server to fill out the bibtex citation, simulate it, for example, with requests:
import requests
bibtex_id = '10.1007/s00425-007-0544-9'
url = "http://www.doi2bib.org/#/doi/{id}".format(id=bibtex_id)
xhr_url = 'http://www.doi2bib.org/doi2bib'
with requests.Session() as session:
    session.get(url)
    response = session.get(xhr_url, params={'id': bibtex_id})
print(response.content)
Prints:
@article{Burgert_2007,
doi = {10.1007/s00425-007-0544-9},
url = {http://dx.doi.org/10.1007/s00425-007-0544-9},
year = 2007,
month = {jun},
publisher = {Springer Science $\mathplus$ Business Media},
volume = {226},
number = {4},
pages = {981--987},
author = {Ingo Burgert and Michaela Eder and Notburga Gierlinger and Peter Fratzl},
title = {Tensile and compressive stresses in tracheids are induced by swelling based on geometrical constraints of the wood cell},
journal = {Planta}
}
You can also solve it with selenium. The key trick here is to use an Explicit Wait to wait for the citation to become visible:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get('http://www.doi2bib.org/#/doi/10.1007/s00425-007-0544-9')
element = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, '//pre[@ng-show="bib"]')))
print(element.text)
driver.close()
Prints the same as the above solution.

How to get html with javascript rendered sourcecode by using selenium

I run a query on one web page, then I get a result URL. If I right-click and view the HTML source, I can see the HTML code generated by JS. If I simply use urllib, Python cannot get the JS-generated code. So I saw some solutions using Selenium. Here's my code:
from selenium import webdriver
url = 'http://www.archives.com/member/Default.aspx?_act=VitalSearchResult&lastName=Smith&state=UT&country=US&deathYear=2004&deathYearSpan=10&location=UT&activityID=9b79d578-b2a7-4665-9021-b104999cf031&RecordType=2'
driver = webdriver.PhantomJS(executable_path='C:\python27\scripts\phantomjs.exe')
driver.get(url)
print(driver.page_source)
The output is just <html><head></head><body></body></html>, which is obviously not right!
Here's the source code I need from the right-click view-source window (I want the INFORMATION part):
</script></div><div class="searchColRight"><div id="topActions" class="clearfix
noPrint"><div id="breadcrumbs" class="left"><a title="Results Summary"
href="Default.aspx? _act=VitalSearchR ...... <<INFORMATION I NEED>> ...
to view the entire record.</p></div><script xmlns:msxsl="urn:schemas-microsoft-com:xslt">
jQuery(document).ready(function() {
jQuery(".ancestry-information-tooltip").actooltip({
href: "#AncestryInformationTooltip", orientation: "bottomleft"});
});
So my question is: How to get the information generated by JS?
You will need to get the document via JavaScript; you can use Selenium's execute_script function:
from time import sleep # this should go at the top of the file
sleep(5)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
print(html)
That will get everything inside of the <html> tag.
It's not necessary to use that workaround; you can use this instead:
driver = webdriver.PhantomJS()
driver.get('http://www.google.com/')
html = driver.find_element_by_tag_name('html').get_attribute('innerHTML')
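If you then want to query that HTML, the string can be handed straight to BeautifulSoup. A small sketch, assuming lxml is installed (the class name comes from the snippet in the question):
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find('div', attrs={'class': 'searchColRight'}))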
I had the same problem getting JavaScript-generated source code from the Internet, and I solved it using Victory's suggestion above.
*First: execute_script
driver=webdriver.Chrome()
driver.get(urls)
innerHTML = driver.execute_script("return document.body.innerHTML")
#print(driver.page_source)
*Second: parse the HTML using BeautifulSoup (you can install BeautifulSoup with pip)
import bs4 #import beautifulsoup
import re
from time import sleep
sleep(1) #wait one second
root=bs4.BeautifulSoup(innerHTML,"lxml") #parse HTML using beautifulsoup
viewcount=root.find_all("span",attrs={'class':'short-view-count style-scope yt-view-count-renderer'}) #find the value which you need.
*Third: print out the value you need
for span in viewcount:
    print(span.string)
*Full code
from selenium import webdriver
import lxml
urls="http://www.archives.com/member/Default.aspx?_act=VitalSearchResult&lastName=Smith&state=UT&country=US&deathYear=2004&deathYearSpan=10&location=UT&activityID=9b79d578-b2a7-4665-9021-b104999cf031&RecordType=2"
driver = webdriver.PhantomJS()
##driver=webdriver.Chrome()
driver.get(urls)
innerHTML = driver.execute_script("return document.body.innerHTML")
##print(driver.page_source)
import bs4
import re
from time import sleep
sleep(1)
root=bs4.BeautifulSoup(innerHTML,"lxml")
viewcount=root.find_all("span",attrs={'class':'short-view-count style-scope yt-view-count-renderer'})
for span in viewcount:
    print(span.string)
driver.quit()
I think you are getting the source code before the JavaScript has rendered the dynamic HTML.
Initially, try putting a few seconds of sleep between navigating and getting the page source.
If this works, then you can change to a different wait strategy.
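For example, here is a rough sketch of an explicit wait (the searchColRight class comes from the HTML snippet in the question and may need adjusting):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'http://www.archives.com/member/Default.aspx?_act=VitalSearchResult&lastName=Smith&state=UT&country=US&deathYear=2004&deathYearSpan=10&location=UT&activityID=9b79d578-b2a7-4665-9021-b104999cf031&RecordType=2'
driver = webdriver.PhantomJS(executable_path='C:\\python27\\scripts\\phantomjs.exe')
driver.get(url)
# Block up to 20 seconds until the results column has been rendered into the DOM.
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'searchColRight'))
)
print(driver.page_source)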
You could try Dryscrape; this headless browser fully supports heavy JS code. Give it a try; I hope it works for you.
I ran into the same problem and finally solved it by using desired_capabilities.
from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType
proxy = Proxy(
    {
        'proxyType': ProxyType.MANUAL,
        'httpProxy': 'ip_or_host:port'
    }
)
desired_capabilities = webdriver.DesiredCapabilities.PHANTOMJS.copy()
proxy.add_to_capabilities(desired_capabilities)
driver = webdriver.PhantomJS(desired_capabilities=desired_capabilities)
driver.get('test_url')
print(driver.page_source)
