PySide: extracting DOM HTML with QNetworkAccessManager - javascript

I need to extract all the calendar data from a page like
"http://www.dukascopy.com/swiss/english/marketwatch/calendars/eccalendar/" - first of all, to extract the full HTML including the inner DOM.
I'm using Eclipse and Python 3.3 on Windows 7. I searched the answers here and wrote something based on them.
It looks like this:
#!/usr/bin/env python
import sys

from PySide import QtGui, QtDeclarative
from PySide.QtGui import QApplication, QDesktopServices, QImage, QPainter
from PySide.QtCore import QByteArray, QUrl, QTimer, QEventLoop, QIODevice, QObject
from PySide.QtWebKit import QWebFrame, QWebView, QWebPage, QWebSettings
from PySide.QtNetwork import QNetworkAccessManager, QNetworkProxy, QNetworkRequest, QNetworkReply, QNetworkDiskCache

"""
app = QApplication(sys.argv)
web = QWebView()
web.load(QUrl("http://www.dukascopy.com/swiss/english/marketwatch/calendars/eccalendar/"))
web.show()
sys.exit(app.exec_())
"""

app = QApplication(sys.argv)
w = QWebView()
request = QNetworkRequest(QUrl("http://www.dukascopy.com/swiss/english/marketwatch/calendars/eccalendar/"))
reply = w.page().networkAccessManager().get(request)
print(reply)
byte_array = reply.readAll()
plist = reply.rawHeaderList()
print(plist)
print(byte_array)
When loading the page into QWebView() it works fine (the commented-out code), but I couldn't find how to extract all the HTML from the QWebView(). So I tried going through the "request" instead (the uncommented code), and nothing prints.

Try it with signals:

def print_content():
    print(web.page().mainFrame().toHtml())  # or toPlainText()
    # or
    # print(web.page().currentFrame().toHtml())  # or toPlainText()

and

web.page().mainFrame().loadFinished.connect(print_content)
# or web.loadFinished.connect(print_content)
web.load(QUrl("http://www.dukascopy.com/swiss/english/marketwatch/calendars/eccalendar/"))
web.show()

print_content should be called when the loadFinished signal arrives.
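Putting those pieces together, a minimal complete sketch could look like this (it assumes, on my part, that WebKit has finished running the calendar's JavaScript by the time loadFinished fires; the slot accepts the signal's boolean argument):

import sys
from PySide.QtCore import QUrl
from PySide.QtGui import QApplication
from PySide.QtWebKit import QWebView

app = QApplication(sys.argv)
web = QWebView()

def print_content(ok):
    # toHtml() serializes the current DOM, including nodes created by JavaScript
    print(web.page().mainFrame().toHtml())
    app.quit()

web.loadFinished.connect(print_content)
web.load(QUrl("http://www.dukascopy.com/swiss/english/marketwatch/calendars/eccalendar/"))
app.exec_()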

Related

Programmatically change page in PDF.js with QWebEngineView

I am making an application in PyQt5 that involves displaying a PDF using the QWebEngineView and PDF.js by Mozilla.
I am able to display the PDF no problem, but I cannot figure out how to either:
1: set the page on load, or
2: update the page after it is already loaded
I have tried the numerous options from other Stackoverflow posts that involve using self.runJavaScript() to change it, but it always results in either "Cannot set property of undefined" or "Object is NoneType".
Here is my method:
def load_file(self, file, page=0) -> None:
    url = QtCore.QUrl().fromLocalFile(os.path.abspath("./pdfjs/web/viewer.html"))
    query = QtCore.QUrlQuery()
    query.addQueryItem("file", os.path.normpath(os.path.abspath(file)))
    url.setQuery(query)
    self.pdf_view.load(url)
where self.pdf_view is QWebEngineView.
I would appreciate any help on how to accomplish this.
EDIT: I was able to specify the page on load with the # symbol, but how to change the page without re-loading the whole thing is still unknown to me.
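For reference, that fragment approach can be expressed directly in the question's load_file method; this is just a sketch using QUrl.setFragment, relying on PDF.js reading the 1-based #page=N parameter when the viewer loads:

def load_file(self, file, page=1) -> None:
    url = QtCore.QUrl.fromLocalFile(os.path.abspath("./pdfjs/web/viewer.html"))
    query = QtCore.QUrlQuery()
    query.addQueryItem("file", os.path.normpath(os.path.abspath(file)))
    url.setQuery(query)
    url.setFragment("page=%d" % page)  # PDF.js jumps to this page on load
    self.pdf_view.load(url)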
The PDF.js viewer loads some scripts that create a PDFViewer object with all the necessary properties for programmatically navigating pages. So you just need to run some simple javascript on the main viewer page to get the functionality you need. To make things a little nicer to work with, it's also helpful to provide a way to run the javascript synchronously so that return values can be accessed more easily.
Below is a simple working demo that implements that (only tested on Linux). Hopefully it should be clear how to adapt it to work with your own application:
import sys, os
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets

# PDFJS = '/usr/share/pdf.js/web/viewer.html'
PDFJS = './pdfjs/web/viewer.html'

class Window(QtWidgets.QWidget):
    def __init__(self):
        super().__init__()
        self.buttonNext = QtWidgets.QPushButton('Next Page')
        self.buttonNext.clicked.connect(lambda: self.changePage(+1))
        self.buttonPrev = QtWidgets.QPushButton('Previous Page')
        self.buttonPrev.clicked.connect(lambda: self.changePage(-1))
        self.viewer = QtWebEngineWidgets.QWebEngineView()
        layout = QtWidgets.QGridLayout(self)
        layout.addWidget(self.viewer, 0, 0, 1, 2)
        layout.addWidget(self.buttonPrev, 1, 0)
        layout.addWidget(self.buttonNext, 1, 1)

    def loadFile(self, file):
        url = QtCore.QUrl.fromLocalFile(os.path.abspath(PDFJS))
        query = QtCore.QUrlQuery()
        query.addQueryItem('file', os.path.abspath(file))
        url.setQuery(query)
        self.viewer.load(url)

    def execJavaScript(self, script):
        result = None

        def callback(data):
            nonlocal result
            result = data
            loop.quit()

        loop = QtCore.QEventLoop()
        QtCore.QTimer.singleShot(
            0, lambda: self.viewer.page().runJavaScript(script, callback))
        loop.exec()
        return result

    def changePage(self, delta):
        page = self.execJavaScript(
            'PDFViewerApplication.pdfViewer.currentPageNumber')
        self.setCurrentPage(page + int(delta))

    def setCurrentPage(self, page):
        count = self.execJavaScript(
            'PDFViewerApplication.pdfViewer.pagesCount')
        if 1 <= page <= count:
            self.execJavaScript(
                f'PDFViewerApplication.pdfViewer.currentPageNumber = {page}')

if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    window = Window()
    if len(sys.argv) > 1:
        window.loadFile(sys.argv[1])
    window.setGeometry(600, 50, 800, 600)
    window.show()
    sys.exit(app.exec_())
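Assuming a pdf.js checkout sits next to the script (with PDFJS pointing at its viewer.html), the demo takes a PDF path as a command-line argument, e.g. python demo.py document.pdf (the script name here is hypothetical).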

Parsing html from a javascript rendered url with python object

I would like to extract the market information from the following url and all of its subsequent pages:
https://uk.reuters.com/investing/markets/index/.FTSE?sortBy=&sortDir=&pn=1
I have successfully parsed the data that I want from the first page using some code from the following url:
https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages
I have also been able to parse out the URL for the next page, to feed into a loop in order to grab data from the next page. The problem is that it crashes before the next page loads, for a reason I don't fully understand.
I have a hunch that the class I have borrowed from 'impythonist' may be causing the problem. I don't know enough object-oriented programming to work out the problem. Here is my code, much of which is borrowed from the URL above:
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
import re
from bs4 import BeautifulSoup

class Render(QWebPage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()

base_url = 'https://uk.reuters.com'
complete_next_page = 'https://uk.reuters.com/investing/markets/index/.FTSE?sortBy=&sortDir=&pn=1'

# LOOP TO RENDER PAGES AND GRAB DATA
while complete_next_page != '':
    print('NEXT PAGE: ', complete_next_page, '\n')
    r = Render(complete_next_page)  # USE THE CLASS TO RENDER JAVASCRIPT FROM PAGE
    result = r.frame.toHtml()  # ERROR IS THROWN HERE ON 2nd PAGE
    # PARSE THE HTML
    soup = BeautifulSoup(result, 'lxml')
    row_data = soup.find('div', attrs={'class': 'column1 gridPanel grid8'})
    print(len(row_data))
    # PARSE ALL ROW DATA
    stripe_rows = row_data.findAll('tr', attrs={'class': 'stripe'})
    non_stripe_rows = row_data.findAll('tr', attrs={'class': ''})
    print(len(stripe_rows))
    print(len(non_stripe_rows))
    # PARSE SPECIFIC ROW DATA FROM INDEX COMPONENTS
    # non_stripe_rows: from 4 to 18 (inclusive) contain data
    # stripe_rows: from 2 to 16 (inclusive) contain data
    i = 2
    while i < len(stripe_rows):
        print('CURRENT LINE IS: ', str(i))
        print(stripe_rows[i])
        print('###############################################')
        print(non_stripe_rows[i + 2])
        print('\n')
        i += 1
    # GETS LINK TO NEXT PAGE
    next_page = str(soup.find('div', attrs={'class': 'pageNavigation'}).find('li', attrs={'class': 'next'}).find('a')['href'])
    complete_next_page = base_url + next_page
I have annotated the bits of code that I wrote and understand, but I don't really know what's going on in the 'Render' class well enough to diagnose the error. Or is it something else?
Here is the error:
result = r.frame.toHtml()
AttributeError: 'Render' object has no attribute 'frame'
I don't need to keep the information in the class once I have parsed it out, so I was thinking it could perhaps be cleared or reset somehow and then updated to hold the new URL information from pages 2 to n, but I have no idea how to do this.
Alternatively if anyone knows another way to grab this specific data from this page and the following ones then that would be equally helpful?
Many thanks in advance.
How about using Selenium and PhantomJS instead of PyQt?
You can easily get Selenium by executing "pip install selenium".
If you use a Mac you can get PhantomJS by executing "brew install phantomjs".
If your PC runs Windows, use choco instead of brew; on Ubuntu, use apt-get.
from selenium import webdriver
from bs4 import BeautifulSoup

base_url = "https://uk.reuters.com"
first_page = "/business/markets/index/.FTSE?sortBy=&sortDir=&pn=1"

browser = webdriver.PhantomJS()

# PARSE THE HTML
browser.get(base_url + first_page)
soup = BeautifulSoup(browser.page_source, "lxml")
row_data = soup.find('div', attrs={'class': 'column1 gridPanel grid8'})

# PARSE ALL ROW DATA
stripe_rows = row_data.findAll('tr', attrs={'class': 'stripe'})
non_stripe_rows = row_data.findAll('tr', attrs={'class': ''})
print(len(stripe_rows), len(non_stripe_rows))

# GO TO THE NEXT PAGE
next_button = soup.find("li", attrs={"class": "next"})
while next_button:
    next_page = next_button.find("a")["href"]
    browser.get(base_url + next_page)
    soup = BeautifulSoup(browser.page_source, "lxml")
    row_data = soup.find('div', attrs={'class': 'column1 gridPanel grid8'})
    stripe_rows = row_data.findAll('tr', attrs={'class': 'stripe'})
    non_stripe_rows = row_data.findAll('tr', attrs={'class': ''})
    print(len(stripe_rows), len(non_stripe_rows))
    next_button = soup.find("li", attrs={"class": "next"})

# DON'T FORGET THIS!!
browser.quit()
I know the code above is not efficient (it feels too slow to me), but I think it will bring you the results you desire. In addition, if the web page you want to scrape does not use JavaScript, even PhantomJS and Selenium are unnecessary: you can use the requests module. However, since I wanted to show you the contrast with PyQt, I used PhantomJS and Selenium in this answer.
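To illustrate that last point, here is a minimal requests sketch; it only works under the assumption that the rows are present in the static HTML, which is not the case for this particular Reuters page:

import requests
from bs4 import BeautifulSoup

response = requests.get("https://uk.reuters.com/investing/markets/index/.FTSE?sortBy=&sortDir=&pn=1")
soup = BeautifulSoup(response.text, "lxml")
# same parsing as above, but without driving a browser
row_data = soup.find('div', attrs={'class': 'column1 gridPanel grid8'})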

Simulating a JavaScript button click with Scrapy

My intent is to run a Scrapy crawler on this web page: http://visit.rio/en/o-que-fazer/outdoors/ . However, some resources in id="container" only load after a click on a JavaScript button ("VER MAIS"). I've read some stuff about Selenium, but I've got nothing.
You read right; your best bet would be scrapy + selenium, using a Firefox browser or a headless one like PhantomJS for faster scraping.
Example adapted from https://stackoverflow.com/a/17979285/2781701
import scrapy
from selenium import webdriver

class ProductSpider(scrapy.Spider):
    name = "product_spider"
    allowed_domains = ['visit.rio']
    start_urls = ['http://visit.rio/en/o-que-fazer/outdoors']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)
        while True:
            try:
                # re-query the button each pass; stop when it no longer exists
                next = self.driver.find_element_by_xpath('//div[@id="show_more"]/a')
                next.click()
                # get the data and write it to scrapy items
            except:
                break
        self.driver.close()
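If the spider lives in a standalone file, it can then be run with Scrapy's runspider command (the filename here is hypothetical):

scrapy runspider product_spider.py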

How to get html with javascript rendered sourcecode by using selenium

I run a query on one web page, then I get a result URL. If I right-click and view the HTML source, I can see the HTML code generated by JS. If I simply use urllib, Python cannot get the JS-generated code. So I saw some solutions using Selenium. Here's my code:
from selenium import webdriver
url = 'http://www.archives.com/member/Default.aspx?_act=VitalSearchResult&lastName=Smith&state=UT&country=US&deathYear=2004&deathYearSpan=10&location=UT&activityID=9b79d578-b2a7-4665-9021-b104999cf031&RecordType=2'
driver = webdriver.PhantomJS(executable_path='C:\python27\scripts\phantomjs.exe')
driver.get(url)
print driver.page_source
which prints:
>>> <html><head></head><body></body></html>
Obviously that's not right!
Here's the source code I need from the right-click view (I want the INFORMATION part):
</script></div><div class="searchColRight"><div id="topActions" class="clearfix
noPrint"><div id="breadcrumbs" class="left"><a title="Results Summary"
href="Default.aspx? _act=VitalSearchR ...... <<INFORMATION I NEED>> ...
to view the entire record.</p></div><script xmlns:msxsl="urn:schemas-microsoft-com:xslt">
jQuery(document).ready(function() {
jQuery(".ancestry-information-tooltip").actooltip({
href: "#AncestryInformationTooltip", orientation: "bottomleft"});
});
So my question is: How to get the information generated by JS?
You will need to get the document via JavaScript; you can use Selenium's execute_script function:
from time import sleep # this should go at the top of the file
sleep(5)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
print html
That will get everything inside of the <html> tag
It's not necessary to use that workaround, you can use instead:
driver = webdriver.PhantomJS()
driver.get('http://www.google.com/')
html = driver.find_element_by_tag_name('html').get_attribute('innerHTML')
I had the same problem getting JavaScript-generated source code from the Internet, and I solved it using Victory's suggestion above.
*First: execute_script
driver=webdriver.Chrome()
driver.get(urls)
innerHTML = driver.execute_script("return document.body.innerHTML")
#print(driver.page_source)
*Second: parse the HTML using BeautifulSoup (you can download BeautifulSoup with the pip command)
import bs4 #import beautifulsoup
import re
from time import sleep
sleep(1) #wait one second
root=bs4.BeautifulSoup(innerHTML,"lxml") #parse HTML using beautifulsoup
viewcount=root.find_all("span",attrs={'class':'short-view-count style-scope yt-view-count-renderer'}) #find the value which you need.
*Third: print out the value you need
for span in viewcount:
    print(span.string)
*Full code
from selenium import webdriver
import lxml

urls = "http://www.archives.com/member/Default.aspx?_act=VitalSearchResult&lastName=Smith&state=UT&country=US&deathYear=2004&deathYearSpan=10&location=UT&activityID=9b79d578-b2a7-4665-9021-b104999cf031&RecordType=2"
driver = webdriver.PhantomJS()
##driver = webdriver.Chrome()
driver.get(urls)
innerHTML = driver.execute_script("return document.body.innerHTML")
##print(driver.page_source)

import bs4
import re
from time import sleep

sleep(1)
root = bs4.BeautifulSoup(innerHTML, "lxml")
viewcount = root.find_all("span", attrs={'class': 'short-view-count style-scope yt-view-count-renderer'})
for span in viewcount:
    print(span.string)
driver.quit()
I am thinking that you are getting the source code before the JavaScript has rendered the dynamic HTML.
Initially, try putting a few seconds of sleep between navigating and getting the page source.
If this works, then you can change to a different wait strategy.
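As an example of such a wait strategy, Selenium's explicit waits block until a condition holds instead of sleeping for a fixed time. A sketch, where the selector is an assumption based on the searchColRight div in the snippet above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# wait up to 10 seconds for a JS-rendered element to appear in the DOM
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.searchColRight")))
print driver.page_source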
You could also try Dryscrape; this browser fully supports heavy JS code. Try it, I hope it works for you.
I met the same problem and finally solved it by using desired_capabilities.
from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType

proxy = Proxy(
    {
        'proxyType': ProxyType.MANUAL,
        'httpProxy': 'ip_or_host:port'
    }
)
desired_capabilities = webdriver.DesiredCapabilities.PHANTOMJS.copy()
proxy.add_to_capabilities(desired_capabilities)
driver = webdriver.PhantomJS(desired_capabilities=desired_capabilities)
driver.get('test_url')
print driver.page_source

Navigate through content generated by JavaScript using Python in Selenium?

I've written a script to test a process involving data input and several pages, but after writing it I've found that the forms and main content are generated by JavaScript.
The following is a snippet of the script I wrote; after that initial link, the content is generated by JS (it's my first Python script, so excuse any mistakes):
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time
browser = webdriver.Firefox()
browser.get('http://127.0.0.1:46727/?ajax=1')
assert "Home" in browser.title
# Find and click the Employer Database link
empDatabaseLink = browser.find_element_by_link_text('Employer Database')
click = ActionChains(browser).click(on_element = empDatabaseLink)
click.perform()
# Content loaded by the above link is generated by the JS
# Find and click the Add Employer button
addEmployerButton = browser.find_element_by_id('Add Employer')
addEmployer = ActionChains(browser).click(on_element = addEmployerButton)
addEmployer.perform()
browser.save_screenshot(r'images\Add_Employer_Form.png')
# Input Employer name
employerName = browser.find_element_by_id('name')
employerName.send_keys("Selenium")
browser.save_screenshot(r'images\Entered_Employer_Name.png')
# Move to next
nextButton = browser.find_element_by_name('button_next')
moveForward = ActionChains(browser).click(on_element=nextButton)
moveForward.perform()
# Move through various steps
# Then
# Move to Finish
moveForward = ActionChains(browser).click(on_element=nextButton)
moveForward.perform()
How do you access page elements that aren't in the source? I've been looking around and found getEval, but I haven't found anything I can use :/
Well, to the people of the future: our above conversation appears to have led to the conclusion that XPath is what Mark was looking for. So remember to try XPath, and to use the Selenium IDE and Firebug to locate particularly obstinate page elements.
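For example, an XPath located with the Selenium IDE or Firebug can be used directly from Python. The expression below is a hypothetical one for the "Add Employer" button in the script above:

# locate a JS-generated element via XPath once it exists in the live DOM
addEmployerButton = browser.find_element_by_xpath('//button[text()="Add Employer"]')
addEmployerButton.click()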
