How to scrape Javascript rendered data? - javascript

I have been trying to scrape this page for a while now and no matter what I try I can't seem to get the table data. The closest I've gotten is the table headers and titles.
I've tried using PhantomJS, Selenium and other methods but I'm stuck.
Site: http://marketwatch.dfm.ae/?isRedirected=true
import sys
from bs4 import BeautifulSoup
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
    """QWebPage that loads a single URL and blocks until the load finishes.

    Constructing an instance creates a QApplication, starts loading *url* in
    the page's main frame and runs the Qt event loop. The loop is left when
    ``loadFinished`` fires; the main frame is then available as ``self.frame``.
    """

    def __init__(self, url):
        # Create the application first, then initialise the QWebPage base.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        target = QUrl(url)
        self.mainFrame().load(target)
        # Blocks here until _loadFinished() calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Store the main frame and stop the event loop.

        The value delivered by the ``loadFinished`` signal is not inspected.
        """
        self.frame = self.mainFrame()
        self.app.quit()
# Render the page with QtWebKit, then hand the resulting markup to BeautifulSoup.
url = 'http://marketwatch.dfm.ae/?isRedirected=true'
r = Render(url)
# Name the parser explicitly so the parse result does not depend on which
# optional parsers happen to be installed (and bs4 does not warn about guessing).
soup = BeautifulSoup(unicode(r.frame.toHtml()), "html.parser")
print(soup)
And I've tried this
import platform
from bs4 import BeautifulSoup
from selenium import webdriver
PHANTOMJS_PATH = './phantomjs'
browser = webdriver.PhantomJS(PHANTOMJS_PATH)
try:
    browser.get('http://marketwatch.dfm.ae/')
    # let's parse our html
    soup = BeautifulSoup(browser.page_source, "html.parser")
finally:
    # Always shut the PhantomJS process down, even if loading the page fails.
    browser.quit()
# get all the table data
tabdata = soup.find_all('tr', {'id': 'mw'})
print(tabdata)
Any help is appreciated.
Thank you

Related

Use Python to Parse HTML from dynamic js webpage with qt5 and Beautifulsoup [duplicate]

I am trying to get HTML of a page loaded in PyQT5 QWebEngineView. Here is a simple example:
import sys
from PyQt5.QtCore import *
from PyQt5.QtWebEngineWidgets import *
from PyQt5.QtWidgets import *
def callback_function(html):
    """Print whatever value runJavaScript() hands back."""
    print(html)


def on_load_finished():
    """Evaluate a script in the loaded page and pass its result to the callback."""
    script = "document.getElementsByTagName('html')[0]"
    web.page().runJavaScript(script, callback_function)


app = QApplication(sys.argv)
web = QWebEngineView()
web.load(QUrl("https://stackoverflow.com"))
web.show()
web.loadFinished.connect(on_load_finished)
# Run the event loop and propagate its exit status to the shell.
exit_code = app.exec_()
sys.exit(exit_code)
I was hoping to get the HTML back from the runJavaScript() call, but I get a blank value in the callback function.
What is incorrect in my code and what alternatives are available for obtaining HTML of a page?
Using my old answer written in C++ and translating the solution to Python:
import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication
def callback_function(html):
    """Print the string delivered by runJavaScript()."""
    print(html)


def on_load_finished():
    """Ask the page for the outer HTML of its document element."""
    script = "document.documentElement.outerHTML"
    web.page().runJavaScript(script, callback_function)


app = QApplication(sys.argv)
web = QWebEngineView()
web.load(QUrl("https://stackoverflow.com"))
web.show()
web.resize(640, 480)
web.loadFinished.connect(on_load_finished)
# Run the event loop and propagate its exit status to the shell.
exit_code = app.exec_()
sys.exit(exit_code)
Update:
The problem in your case is that getElementsByTagName() returns a list of JavaScript elements, and such an element cannot be exported to Python. What you should do instead is get its innerHTML:
import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication
def callback_function(html):
    """Print the string delivered by runJavaScript()."""
    print(html)


def on_load_finished():
    """Ask the page for the inner HTML of its <html> element."""
    script = "document.getElementsByTagName('html')[0].innerHTML"
    web.page().runJavaScript(script, callback_function)
    # or document.getElementsByTagName('html')[0].outerHTML


app = QApplication(sys.argv)
web = QWebEngineView()
web.load(QUrl("https://stackoverflow.com"))
web.show()
web.resize(640, 480)
web.loadFinished.connect(on_load_finished)
# Run the event loop and propagate its exit status to the shell.
exit_code = app.exec_()
sys.exit(exit_code)

Scraping with Python returns NONE

I'm trying to scrape info from a site that is using JavaScript overlaid on Google Maps. Using the inspector I can clearly see the data I want to recover, but I have been unsuccessful thus far.
I have tried the following: requests, and Beautiful Soup with PyQt (Sentdex example) - this works for his example but I can't get it to work for mine. See code... and I've tried many others. Any ideas? Thanks.
import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
class Page(QWebEnginePage):
    """QWebEnginePage that loads one URL and blocks until its HTML is captured.

    Constructing an instance creates a QApplication, starts loading *url* and
    runs the Qt event loop. When the load finishes the page HTML is requested;
    once it arrives it is stored in ``self.html`` and the event loop is left.
    """

    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        # Stays '' until Callable() receives the page markup.
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        # Blocks here until Callable() calls quit().
        self.app.exec_()

    def _on_load_finished(self):
        """Request the page HTML once loading has finished."""
        # toHtml() delivers its result through the callback and returns
        # nothing, so its return value must not be assigned to self.html.
        self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        """Receive the page HTML, store it and stop the event loop."""
        self.html = html_str
        self.app.quit()
def main():
    """Render the page and print the first ``infoBoxValue`` div found in it."""
    rendered = Page('https://inzameling.spaarnelanden.nl/')
    document = bs.BeautifulSoup(rendered.html, 'lxml')
    print(document.find('div', class_='infoBoxValue'))


if __name__ == '__main__':
    main()

getting dynamic data using python

I'm new to Python and got interested in writing scripts. I'm currently building a crawler that goes to a page and extracts copy from tags. Right now I can only list tags; I'm having trouble getting the text out of tags and I'm not sure why exactly. I'm also using BeautifulSoup and PyQt4 to get dynamic data (this might need a new question).
So based on the code below, I should be getting the "Images" copy from the Google homepage, or at least the span tag itself. Instead, I'm getting None returned.
I tried reading the docs for BeautifulSoup and it was a little overwhelming. I'm still reading it, but I think I keep going down a rabbit hole. I can print all anchor tags or all divs, but targeting a specific one is where I'm struggling.
import urllib
import re
from bs4 import BeautifulSoup, Comment
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
    """QWebPage that loads a single URL and blocks until the load finishes.

    Constructing an instance creates a QApplication, starts loading *url* in
    the page's main frame and runs the Qt event loop. The loop is left when
    ``loadFinished`` fires; the main frame is then available as ``self.frame``.
    """

    def __init__(self, url):
        # Create the application first, then initialise the QWebPage base.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        target = QUrl(url)
        self.mainFrame().load(target)
        # Blocks here until _loadFinished() calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Store the main frame and stop the event loop.

        The value delivered by the ``loadFinished`` signal is not inspected.
        """
        self.frame = self.mainFrame()
        self.app.quit()
# Download the response body with urllib and look for one specific anchor.
url = 'http://google.com'
soup = BeautifulSoup(urllib.urlopen(url).read(), 'html.parser')
js_test = soup.find("a", class_="gb_P")
print(js_test)

Form scraping using scrapy with Javascript embedded

I am trying to scrape this website. My spider is functional, but the website has JavaScript embedded in the form to get the result, which I can't get through.
I've read about Selenium and how I have to include a browser to get through it, but I still struggle with how to scrape after the dynamically generated HTML is loaded, while still passing form arguments.
Here is the code for my spider, any help, referrals or code snippets are welcome. I've navigated and read many threads to no avail.
from scrapy.spiders import BaseSpider
from scrapy.http import FormRequest
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from tax1.items import Tax1Item
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from selenium import webdriver
import time
class tax1Spider(BaseSpider):
    """Submit the search form on taxesejour.impots.gouv.fr and scrape the result.

    ``parse`` posts the form; ``parse1`` re-opens the response URL in a
    Selenium-driven browser and parses the browser's page source.
    """

    name = "tax1Spider"
    # allowed_domains takes bare host names, not URLs.
    allowed_domains = ["taxesejour.impots.gouv.fr"]
    start_urls = ["http://taxesejour.impots.gouv.fr/DTS_WEB/UK/"]

    def parse(self, response):
        """Fill in and submit the PAGE_DELIBV2 form; results go to parse1."""
        yield FormRequest.from_response(
            response,
            formname='PAGE_DELIBV2',
            formdata={'A8': '05 - Hautes-alpes',
                      'A10': 'AIGUILLES'},
            callback=self.parse1,
        )

    # NOTE(review): `rules` is a CrawlSpider feature and this class derives
    # from BaseSpider - confirm whether it is still needed.
    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(),
                restrict_xpaths=('//div[@class="lh0 dzSpan dzA15"]',),
            ),
            callback="parse1",
            follow=True,
        ),  # trailing comma keeps `rules` a tuple of Rule objects
    )
    # lh0 dzSpan dzA15

    def __init__(self, *args, **kwargs):
        # Initialise the class this spider actually inherits from.
        BaseSpider.__init__(self, *args, **kwargs)
        # use any browser you wish
        self.browser = webdriver.Firefox()

    def __del__(self):
        self.browser.close()

    def parse1(self, response):
        """Load the response URL in the browser and extract the message cell.

        Returns a Tax1Item whose ``message`` field holds the extracted
        ``td#tzA18`` nodes.
        """
        self.browser.get(response.url)
        # Crude fixed wait before the page source is read.
        time.sleep(3)
        # Parse what the browser rendered, not the original response body.
        hxs = Selector(text=self.browser.page_source)
        item = Tax1Item()
        item['message'] = hxs.xpath('//td[@id="tzA18"]').extract()
        print(item['message'])
        return item

How do you pass html to Selenium

I have a webcrawler and I want to pass the html+javascript it retrieves into selenium, is this possible? To clarify I do not want to use webdriver.get to retrieve the page with selenium since my crawler is faster.
I ended up scraping the webpage with PyQt4 on a xvfb server since I was using amazon ec2 which doesn't come with x11. The code below loads the webpage containing JavaScript and waits 7 seconds before returning the html so all the JavaScript will have finished loading.
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from xvfbwrapper import Xvfb
class Render(QWebPage):
    """QWebPage that loads a URL and waits a fixed delay before finishing.

    When ``loadFinished`` fires, a single-shot 7000 ms timer is started; when
    it times out, the main frame is stored as ``self.frame`` and the Qt event
    loop started by the constructor is left.
    """

    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        # Single-shot timer that fires getHtml() 7000 ms after it is started.
        delay = QTimer()
        delay.setInterval(7000)
        delay.setSingleShot(True)
        delay.timeout.connect(self.getHtml)
        self.timerScreen = delay
        self.loadFinished.connect(self.timerScreen.start)
        target = QUrl(url)
        self.mainFrame().load(target)
        # Blocks here until getHtml() calls quit().
        self.app.exec_()

    def getHtml(self):
        """Store the main frame and stop the event loop."""
        self.frame = self.mainFrame()
        self.app.quit()
# Render the page inside a virtual X display and save the resulting HTML.
args = {"nolisten": "tcp"}
vdisplay = Xvfb(**args)
vdisplay.start()
try:
    url = 'url here'
    r = Render(url)
    html = r.frame.toHtml()
    print(html)
    # The context manager closes the file even if the write fails.
    with open("./test.html", "wb") as f:
        f.write(html.__str__().encode("utf-8"))
    #stri = str(html).encode("utf-8")
finally:
    # Tear the virtual display down even when rendering raises.
    vdisplay.stop()

Categories