Programmatically change page in PDF.js with QWebEngineView - javascript

I am making an application in PyQt5 that involves displaying a PDF using the QWebEngineView and PDF.js by Mozilla.
I am able to display the PDF no problem, but I cannot figure out how to either:
1: set the page on load, or
2: update the page after it is already loaded
I have tried the numerous options from other Stackoverflow posts that involve using self.runJavaScript() to change it, but it always results in either "Cannot set property of undefined" or "Object is NoneType".
Here is my method:
def load_file(self, file, page=0) -> None:
url = QtCore.QUrl().fromLocalFile(os.path.abspath("./pdfjs/web/viewer.html"))
query = QtCore.QUrlQuery()
query.addQueryItem("file", os.path.normpath(os.path.abspath(file)))
url.setQuery(query)
self.pdf_view.load(url)
where self.pdf_view is QWebEngineView.
I would appreciate any help on how to accomplish this.
EDIT: I was able to specify the page on load with the # symbol, but as for changing the page without re-loading the whole thing is still unknown to me.

The PDF.js viewer loads some scripts that create a PDFViewer object with all the necessary properties for programmatically navigating pages. So you just need to run some simple javascript on the main viewer page to get the functionality you need. To make things a little nicer to work with, it's also helpful to provide a way to run the javascript synchronously so that return values can be accessed more easily.
Below is a simple working demo that implements that (only tested on Linux). Hopefully it should be clear how to adapt it to work with your own application:
import sys, os
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
# PDFJS = '/usr/share/pdf.js/web/viewer.html'
PDFJS = './pdfjs/web/viewer.html'
class Window(QtWidgets.QWidget):
def __init__(self):
super().__init__()
self.buttonNext = QtWidgets.QPushButton('Next Page')
self.buttonNext.clicked.connect(lambda: self.changePage(+1))
self.buttonPrev = QtWidgets.QPushButton('Previous Page')
self.buttonPrev.clicked.connect(lambda: self.changePage(-1))
self.viewer = QtWebEngineWidgets.QWebEngineView()
layout = QtWidgets.QGridLayout(self)
layout.addWidget(self.viewer, 0, 0, 1, 2)
layout.addWidget(self.buttonPrev, 1, 0)
layout.addWidget(self.buttonNext, 1, 1)
def loadFile(self, file):
url = QtCore.QUrl.fromLocalFile(os.path.abspath(PDFJS))
query = QtCore.QUrlQuery()
query.addQueryItem('file', os.path.abspath(file))
url.setQuery(query)
self.viewer.load(url)
def execJavaScript(self, script):
result = None
def callback(data):
nonlocal result
result = data
loop.quit()
loop = QtCore.QEventLoop()
QtCore.QTimer.singleShot(
0, lambda: self.viewer.page().runJavaScript(script, callback))
loop.exec()
return result
def changePage(self, delta):
page = self.execJavaScript(
'PDFViewerApplication.pdfViewer.currentPageNumber')
self.setCurrentPage(page + int(delta))
def setCurrentPage(self, page):
count = self.execJavaScript(
'PDFViewerApplication.pdfViewer.pagesCount')
if 1 <= page <= count:
self.execJavaScript(
f'PDFViewerApplication.pdfViewer.currentPageNumber = {page}')
if __name__ == '__main__':
app = QtWidgets.QApplication(sys.argv)
window = Window()
if len(sys.argv) > 1:
window.loadFile(sys.argv[1])
window.setGeometry(600, 50, 800, 600)
window.show()
sys.exit(app.exec_())

Related

Python Selenium Take a Screenshot Of An Whole Page Without Using Headless Mode

I need to take a screenshot of an element which is very long and not fit on the screen, I can use headless mode to do this but site doesn't allow me to do even with user-agent and other stuff.
But I can access the site with undetectedChromeDriver, so there's a extension to do this stuff called 'HTML Elements Screenshot'.
That extension will allow you to select element and take the screenshot of an whole element for you.
I automated that process with pyautogui and cv2 but I want to do it without this libraries. Is there any Javascript code to do it ? I did my research but can't find any useful. Thanks in advance
The codes I tried:
def save_screenshot(driver, path: str = 'screenshot.png') -> None:
input('Let it go when u ready.')
driver.switch_to.window(driver.window_handles[-1])
original_size = driver.get_window_size()
required_width = driver.execute_script('return document.body.parentNode.scrollWidth')
required_height = driver.execute_script('return document.body.parentNode.scrollHeight')
driver.set_window_size(required_width, required_height)
driver.save_screenshot(path) # has scrollbar
#driver.find_element_by_tag_name('body').screenshot(path) # avoids scrollbar
driver.set_window_size(original_size['width'], original_size['height'])
def saveScreenshot(driver,path: str="screenshot.png"):
input('Let it go when u ready.')
driver.switch_to.window(driver.window_handles[-1])
el = driver.find_element(By.TAG_NAME,"body")
el.screenshot(path)
driver.quit()
For python you can use pyppeteer.
For javascript you can use puppeteer
You can find the documentation here

Webscraping website that has a button to click

I am trying to webscrape a website that has multiple javascript rendered pages (https://openlibrary.ecampusontario.ca/catalogue/). I am able to get the content from the first page, but I am not sure how to get my script to click on the buttons on the subsequent pages to get that content. Here is my script.
import time
from bs4 import BeautifulSoup as soup
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json
# The path to where you have your chrome webdriver stored:
webdriver_path = '/Users/rawlins/Downloads/chromedriver'
# Add arguments telling Selenium to not actually open a window
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--window-size=1920x1080')
# Fire up the headless browser
browser = webdriver.Chrome(executable_path = webdriver_path,
chrome_options = chrome_options)
# Load webpage
url = "https://openlibrary.ecampusontario.ca/catalogue/"
browser.get(url)
# to ensure that the page has loaded completely.
time.sleep(3)
data = []
# Parse HTML, close browser
page_soup = soup(browser.page_source, 'lxml')
containers = page_soup.findAll("div", {"class":"result-item tooltip"})
for container in containers:
item = {}
item['type'] = "Textbook"
item['title'] = container.find('h4', {'class' : 'textbook-title'}).text.strip()
item['author'] = container.find('p', {'class' : 'textbook-authors'}).text.strip()
item['link'] = "https://openlibrary.ecampusontario.ca/catalogue/" + container.find('h4', {'class' : 'textbook-title'}).a["href"]
item['source'] = "eCampus Ontario"
item['base_url'] = "https://openlibrary.ecampusontario.ca/catalogue/"
data.append(item) # add the item to the list
with open("js-webscrape-2.json", "w") as writeJSON:
json.dump(data, writeJSON, ensure_ascii=False)
browser.quit()
You do not have to actually click on any button. For example, to search for items with the keyword 'electricity', you navigate to the url
https://openlibrary-repo.ecampusontario.ca/rest/filtered-items?query_field%5B%5D=*&query_op%5B%5D=matches&query_val%5B%5D=(%3Fi)electricity&filters=is_not_withdrawn&offset=0&limit=10000
This will return a json string of items with the first item being:
{"items":[{"uuid":"6af61402-b0ec-40b1-ace2-1aa674c2de9f","name":"Introduction to Electricity, Magnetism, and Circuits","handle":"123456789/579","type":"item","expand":["metadata","parentCollection","parentCollectionList","parentCommunityList","bitstreams","all"],"lastModified":"2019-05-09 15:51:06.91","parentCollection":null,"parentCollectionList":null,"parentCommunityList":null,"bitstreams":null,"withdrawn":"false","archived":"true","link":"/rest/items/6af61402-b0ec-40b1-ace2-1aa674c2de9f","metadata":null}, ...
Now, to get that item, you use its uuid, and navigate to:
https://openlibrary.ecampusontario.ca/catalogue/item/?id=6af61402-b0ec-40b1-ace2-1aa674c2de9f
You can proceed like this for any interaction with that website (this is not always working for all websites, but it is working for your website).
To find out what are the urls that are navigated to when you click such and such button or enter text (what I did for the above urls), you can use fiddler.
I made a little script that can help you (selenium).
what this script does is "while the last page of the catalogue is not selected (in this case, contain 'selected' in it's class), i'll scrape , then click next"
while "selected" not in driver.find_elements_by_css_selector("[id='results-pagecounter-pages'] a")[-1].get_attribute("class"):
#your scraping here
driver.find_element_by_css_selector("[id='next-btn']").click()
There's probably a problem that you'll run into using this method, it doesn't wait for the results to load, but you can figure out what to do from here onwards.
Hope it helps

Parsing html from a javascript rendered url with python object

I would like to extract the market information from the following url and all of its subsequent pages:
https://uk.reuters.com/investing/markets/index/.FTSE?sortBy=&sortDir=&pn=1
I have successfully parsed the data that I want from the first page using some code from the following url:
https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages
I have also been able to parse out the url for the next page to feed into a loop in order to grab data from the next page. The problem is it crashes before the next page loads for a reason I don't fully understand.
I have a hunch that the class that I have borrowed from 'impythonist' may be causing the problem. I don't know enough object orientated programming to work out the problem. Here is my code, much of which is borrowed from the the url above:
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
import re
from bs4 import BeautifulSoup
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
base_url='https://uk.reuters.com'
complete_next_page='https://uk.reuters.com/investing/markets/index/.FTSE?sortBy=&sortDir=&pn=1'
#LOOP TO RENDER PAGES AND GRAB DATA
while complete_next_page != '':
print ('NEXT PAGE: ',complete_next_page, '\n')
r = Render(complete_next_page) # USE THE CLASS TO RENDER JAVASCRIPT FROM PAGE
result = r.frame.toHtml() # ERROR IS THROWN HERE ON 2nd PAGE
# PARSE THE HTML
soup = BeautifulSoup(result, 'lxml')
row_data=soup.find('div', attrs={'class':'column1 gridPanel grid8'})
print (len(row_data))
# PARSE ALL ROW DATA
stripe_rows=row_data.findAll('tr', attrs={'class':'stripe'})
non_stripe_rows=row_data.findAll('tr', attrs={'class':''})
print (len(stripe_rows))
print (len(non_stripe_rows))
# PARSE SPECIFIC ROW DATA FROM INDEX COMPONENTS
#non_stripe_rows: from 4 to 18 (inclusive) contain data
#stripe_rows: from 2 to 16 (inclusive) contain data
i=2
while i < len(stripe_rows):
print('CURRENT LINE IS: ',str(i))
print(stripe_rows[i])
print('###############################################')
print(non_stripe_rows[i+2])
print('\n')
i+=1
#GETS LINK TO NEXT PAGE
next_page=str(soup.find('div', attrs={'class':'pageNavigation'}).find('li', attrs={'class':'next'}).find('a')['href']) #GETS LINK TO NEXT PAGE WORKS
complete_next_page=base_url+next_page
I have annotated the bits of code that I have written and understand but I don't really know what's going on in the 'Render' class enough to diagnose the error? Unless its something else?
Here is the error:
result = r.frame.toHtml()
AttributeError: 'Render' object has no attribute 'frame'
I don't need to keep the information in the class once I have parsed it out so I was thinking perhaps it could be cleared or reset somehow and then updated to hold the new url information from page 2:n but I have no idea how to do this?
Alternatively if anyone knows another way to grab this specific data from this page and the following ones then that would be equally helpful?
Many thanks in advance.
How about using selenium and phantomjs instead of PyQt.
You can easily get selenium by executing "pip install selenium".
If you use Mac you can get phantomjs by executing "brew install phantomjs".
If your PC is Windows use choco instead of brew, or Ubuntu use apt-get.
from selenium import webdriver
from bs4 import BeautifulSoup
base_url = "https://uk.reuters.com"
first_page = "/business/markets/index/.FTSE?sortBy=&sortDir=&pn=1"
browser = webdriver.PhantomJS()
# PARSE THE HTML
browser.get(base_url + first_page)
soup = BeautifulSoup(browser.page_source, "lxml")
row_data = soup.find('div', attrs={'class':'column1 gridPanel grid8'})
# PARSE ALL ROW DATA
stripe_rows = row_data.findAll('tr', attrs={'class':'stripe'})
non_stripe_rows = row_data.findAll('tr', attrs={'class':''})
print(len(stripe_rows), len(non_stripe_rows))
# GO TO THE NEXT PAGE
next_button = soup.find("li", attrs={"class":"next"})
while next_button:
next_page = next_button.find("a")["href"]
browser.get(base_url + next_page)
soup = BeautifulSoup(browser.page_source, "lxml")
row_data = soup.find('div', attrs={'class':'column1 gridPanel grid8'})
stripe_rows = row_data.findAll('tr', attrs={'class':'stripe'})
non_stripe_rows = row_data.findAll('tr', attrs={'class':''})
print(len(stripe_rows), len(non_stripe_rows))
next_button = soup.find("li", attrs={"class":"next"})
# DONT FORGET THIS!!
browser.quit()
I know the code above is not efficient (too slow I feel), but I think that it will bring you the results you desire. In addition, if the web page you want to scrape does not use Javascript, even PhantomJS and selenium are unnecessary. You can use the requests module. However, since I wanted to show you the contrast with PyQt, I used PhantomJS and Selenium in this answer.

Can I extract comments of any page from https://www.rt.com/ using python3?

I am writing a web crawler. I extracted heading and Main Discussion of the this link but I am unable to find any one of the comment (Ctrl+u -> Ctrl+f . Comment Text). I think the comments are written in JavaScript. Can I extract it?
RT are using a service from spot.im for comments
you need to do make two POST requests, first https://api.spot.im/me/network-token/spotim to get a token, then https://api.spot.im/conversation-read/spot/sp_6phY2k0C/post/353493/get to get the comments as JSON.
i wrote a quick script to do this
import requests
import re
import json
def get_rt_comments(article_url):
spotim_spotId = 'sp_6phY2k0C' # spotim id for RT
post_id = re.search('([0-9]+)', article_url).group(0)
r1 = requests.post('https://api.spot.im/me/network-token/spotim').json()
spotim_token = r1['token']
payload = {
"count": 25, #number of comments to fetch
"sort_by":"best",
"cursor":{"offset":0,"comments_read":0},
"host_url": article_url,
"canonical_url": article_url
}
r2_url ='https://api.spot.im/conversation-read/spot/' + spotim_spotId + '/post/'+ post_id +'/get'
r2 = requests.post(r2_url, data=json.dumps(payload), headers={'X-Spotim-Token': spotim_token , "Content-Type": "application/json"})
return r2.json()
if __name__ == '__main__':
url = 'https://www.rt.com/usa/353493-clinton-speech-affairs-silence/'
comments = get_rt_comments(url)
print(comments)
Yes, if it can be viewed with a web browser, you can extract it.
If you look at the source it is really an iframe that loads a piece of javascript, that then creates a new tag in the document with the source of that script tag loading bundle.js, which really contains the commenting software. This in turns then fetches the actual comments.
Instead of going through this manually, you could consider using for example webkit to create a headless browser that executes the javascript like an ordinary browser. Then you can scrape from that instead of having to manually make your crawler fetch the external resources.
Examples of such headless browsers could be Spynner, Dryscape, or the PhantomJS derived PhantomPy (the latter seems to be an abandoned project now).

how to open pdf in a browser with with disabling the context menu in django?

I have the following code to open up a webpage inside views.py:
def pdfView(request,filesPats):
f = open(filesPats,'rb')
cont = f.read()
response = HttpResponse(cont, mimetype='application/pdf')
response['Content-Disposition'] = 'inline;filename="'+my_file+'.pdf"'
f.close()
return response
I use django 1.4 and python 2.7.
It returns a PDF view and it works fine. But the question is : how to disable the context menu inside the pdf viewer?
You need to take a look at this plugin.
All you need to do is customize it according to your need.

Categories