Follow each link of a page and scrape content, Scrapy + Selenium

Follow each link of a page and scrape content, Scrapy + Selenium - javascript

This is the website I'm working on. On each page, there are 18 posts in a table. I want to access each post and scrape its content, and repeat this for the first 5 pages.
My approach is to make my spider to scrape all links in the 5 pages and iterate over them to get the content. Because the "next page" button and certain text in each post is written by JavaScript, I use Selenium and Scrapy. I ran my spider and could see that Firefox webdriver displays the first 5 pages, but then the spider stopped without scraping any content. Scrapy returns no error message either.
Now I suspect that the failure may be due to:
1) No link is stored into all_links.
2) Somehow parse_content did not run.
My diagnosis may be wrong and I need help with finding the problem. Thank you very much!
This is my spider:
import scrapy
from bjdaxing.items_bjdaxing import BjdaxingItem
from selenium import webdriver
from scrapy.http import TextResponse
import time
all_links = [] # a global variable to store post links
class Bjdaxing(scrapy.Spider):
name = "daxing"
allowed_domains = ["bjdx.gov.cn"] # DO NOT use www in allowed domains
start_urls = ["http://app.bjdx.gov.cn/cms/daxing/lookliuyan_bjdx.jsp"] # This has to start with http
def __init__(self):
self.driver = webdriver.Firefox()
def parse(self, response):
self.driver.get(response.url) # request the start url in the browser
i = 1
while i <= 5: # The number of pages to be scraped in this session
response = TextResponse(url = response.url, body = self.driver.page_source, encoding='utf-8') # Assign page source to response. I can treat response as if it's a normal scrapy project.
global all_links
all_links.extend(response.xpath("//a/#href").extract()[0:18])
next = self.driver.find_element_by_xpath(u'//a[text()="\u4e0b\u9875\xa0"]') # locate "next" button
next.click() # Click next page
time.sleep(2) # Wait a few seconds for next page to load.
i += 1
def parse_content(self, response):
item = BjdaxingItem()
global all_links
for link in all_links:
self.driver.get("http://app.bjdx.gov.cn/cms/daxing/") + link
response = TextResponse(url = response.url, body = self.driver.page_source, encoding = 'utf-8')
if len(response.xpath("//table/tbody/tr[1]/td[2]/text()").extract() > 0):
item['title'] = response.xpath("//table/tbody/tr[1]/td[2]/text()").extract()
else:
item['title'] = ""
if len(response.xpath("//table/tbody/tr[3]/td[2]/text()").extract() > 0):
item['netizen'] = response.xpath("//table/tbody/tr[3]/td[2]/text()").extract()
else:
item['netizen'] = ""
if len(response.xpath("//table/tbody/tr[3]/td[4]/text()").extract() > 0):
item['sex'] = response.xpath("//table/tbody/tr[3]/td[4]/text()").extract()
else:
item['sex'] = ""
if len(response.xpath("//table/tbody/tr[5]/td[2]/text()").extract() > 0):
item['time1'] = response.xpath("//table/tbody/tr[5]/td[2]/text()").extract()
else:
item['time1'] = ""
if len(response.xpath("//table/tbody/tr[11]/td[2]/text()").extract() > 0):
item['time2'] = response.xpath("//table/tbody/tr[11]/td[2]/text()").extract()
else:
item['time2'] = ""
if len(response.xpath("//table/tbody/tr[7]/td[2]/text()").extract()) > 0:
question = "".join(response.xpath("//table/tbody/tr[7]/td[2]/text()").extract())
item['question'] = "".join(map(unicode.strip, question))
else: item['question'] = ""
if len(response.xpath("//table/tbody/tr[9]/td[2]/text()").extract()) > 0:
reply = "".join(response.xpath("//table/tbody/tr[9]/td[2]/text()").extract())
item['reply'] = "".join(map(unicode.strip, reply))
else: item['reply'] = ""
if len(response.xpath("//table/tbody/tr[13]/td[2]/text()").extract()) > 0:
agency = "".join(response.xpath("//table/tbody/tr[13]/td[2]/text()").extract())
item['agency'] = "".join(map(unicode.strip, agency))
else: item['agency'] = ""
yield item

Multiple problems and possible improvements here:
you don't have any "link" between the parse() and the parse_content() methods
using global variables is usually a bad practice
you don't need selenium here at all. To follow the pagination you just need to make a POST request to the same url providing the currPage parameter
The idea is to use .start_requests() and create a list/queue of requests to handle the pagination. Follow the pagination and gather the links from the table. Once the queue of requests is empty, switch to following the previously gathered links. Implementation:
import json
from urlparse import urljoin
import scrapy
NUM_PAGES = 5
class Bjdaxing(scrapy.Spider):
name = "daxing"
allowed_domains = ["bjdx.gov.cn"] # DO NOT use www in allowed domains
def __init__(self):
self.pages = []
self.links = []
def start_requests(self):
self.pages = [scrapy.Request("http://app.bjdx.gov.cn/cms/daxing/lookliuyan_bjdx.jsp",
body=json.dumps({"currPage": str(page)}),
method="POST",
callback=self.parse_page,
dont_filter=True)
for page in range(1, NUM_PAGES + 1)]
yield self.pages.pop()
def parse_page(self, response):
base_url = response.url
self.links += [urljoin(base_url, link) for link in response.css("table tr td a::attr(href)").extract()]
try:
yield self.pages.pop()
except IndexError: # no more pages to follow, going over the gathered links
for link in self.links:
yield scrapy.Request(link, callback=self.parse_content)
def parse_content(self, response):
# your parse_content method here

Related

Beautiful Soup find_All doesn't find all blocks

I'm trying to parse headhunter.kz website.
In use: python 3.9, beautifulsoup4.
When i parse pages with vacancies, i parse only 20 div-block with "serp-item" classes, hen in fact there are 40 div blocks. (I open the html file in the browser and see the presence of 40 blocks).
import requests
import os
import time
import re
from bs4 import BeautifulSoup
import csv
import pandas as pd
df = pd.DataFrame({})
global_url = "https://almaty.hh.kz/"
headers = {
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
}
def get_all_pages():
with open("data/page_1.html") as file:
src = file.read()
#
soup = BeautifulSoup(src,"lxml")
#find("span", {"class":"pager-item-not-in-short-range"}).
pages_count = int(soup.find("div",{"class":"pager"}).find_all("a")[-2].text)
for i in range(1,pages_count+1):
url = f"https://almaty.hh.kz/search/vacancy?area=160&clusters=true&enable_snippets=true&ored_clusters=true&professional_role=84&professional_role=116&professional_role=36&professional_role=157&professional_role=125&professional_role=156&professional_role=160&professional_role=10&professional_role=150&professional_role=25&professional_role=165&professional_role=73&professional_role=96&professional_role=164&professional_role=104&professional_role=112&professional_role=113&professional_role=148&professional_role=114&professional_role=121&professional_role=124&professional_role=20&search_period=30&hhtmFrom=vacancy_search_list&page={i}"
r = requests.get(url = url,headers = headers)
with open(f"data/page_{i}.html","w") as file:
file.write(r.text)
time.sleep(3)
return pages_count+1
def collect_data(pages_count):
for page in range(1, pages_count+1):
with open(f"data/page_{page}.html") as file:
src = file.read()
soup = BeautifulSoup(src,"lxml")
# item_cards = soup.find_all("div",{"class":"a-card__body ddl_product_link"})
# print(len(item_cards))
# for items in item_cards:
# product_title = items.find("a",{"class":"a-card__title link"}).text
# product_price = items.find("span",{"class":"a-card__price-text"}).text
# product_geo = items.find("div",{"class":"a-card__subtitle"}).text
# print(f"Title:{product_title} - Price: {product_price} - GEO: {product_geo}")
#items_divs = soup.find_all("div",{"class":"serp-item"})
items_divs = soup.find_all("div",{"class":"serp-item"})
print(len(items_divs))
urls =[]
for item in items_divs:
item_url = item.find("span",{"data-page-analytics-event":"vacancy_search_suitable_item"}).find("a",{"class":"serp-item__title"}).get("href")
urls.append(item_url)
with open("items_urls.txt","w") as file:
for url in urls:
file.write(f"{url}\n")
get_data(file_path="items_urls.txt")
def get_data(file_path):
result_list = []
with open(file_path) as file:
urls_list = file.readlines()
clear_urls_list =[]
for url in urls_list:
url = url.strip()
clear_urls_list.append(url)
i=0
for url in clear_urls_list:
i+=1
response = requests.get(url=url,headers=headers)
soup = BeautifulSoup(response.text,"lxml")
try:
item_name = soup.find("div",{"class":"main-content"}).find("h1",{"data-qa":"vacancy-title"}).text.strip()
except:
item_name = 'E1'
try:
item_salary = soup.find("div",{"class":"main-content"}).find("div",{"data-qa":"vacancy-salary"}).text.strip()
except:
item_salary = 'E2'
try:
item_exp = soup.find("div",{"class":"main-content"}).find("span",{"data-qa":"vacancy-experience"}).text.strip()
except:
item_exp = 'E3'
try:
company_name = soup.find("div",{"class":"main-content"}).find("span",{"class":"vacancy-company-name"}).find("span").text.strip()
except:
company_name = 'E4'
try:
if soup.find("div",{"class":"main-content"}).find("p",{"class":"vacancy-creation-time-redesigned"}):
date = soup.find("div",{"class":"main-content"}).find("p",{"class":"vacancy-creation-time-redesigned"}).text.strip()
else:
date = soup.find("div",{"class":"main-content"}).find("p",{"class":"vacancy-creation-time"}).text.strip()
except:
date = 'E5'
try:
if soup.find("div",{"class":"main-content"}).find("span",{"data-qa":"vacancy-view-raw-address"}):
address = soup.find("div",{"class":"main-content"}).find("span",{"data-qa":"vacancy-view-raw-address"}).text
elif soup.find("div",{"class":"main-content"}).find("div",{"class":"vacancy-company-bottom"}).find("p", {"data-qa":"vacancy-view-location"}):
address = soup.find("div",{"class":"main-content"}).find("div",{"class":"vacancy-company-bottom"}).find("p", {"data-qa":"vacancy-view-location"}).text
elif soup.find("div",{"class":"main-content"}).find("div",{"class":"block-employer--jHuyqacEkkrEkSl3Yg3M"}):
address = soup.find("div",{"class":"main-content"}).find("div",{"class":"block-employer--jHuyqacEkkrEkSl3Yg3M"}).find("p", {"data-qa":"vacancy-view-location"}).text
except:
address = 'Алматы'
try:
zanyatost = soup.find("div",{"class":"main-content"}).find("p",{"data-qa":"vacancy-view-employment-mode"}).find("span").text.strip()
except:
zanyatost = 'E7'
try:
zanyatost2 = soup.find("div",{"class":"main-content"}).find("p",{"data-qa":"vacancy-view-employment-mode"}).text.lstrip(', ')
except:
zanyatost2 = 'E8'
print(i)
with open('test.csv','a',encoding ="utf-8") as file:
writer = csv.writer(file)
writer.writerow(
(
item_name,
item_salary,
item_exp,
company_name,
date,
address,
zanyatost,
zanyatost2
)
)
def main():
with open('test.csv','w',encoding ="utf-8") as file:
writer = csv.writer(file)
writer.writerow(
(
'Должность',
"Зарплата",
"Опыт",
"Компания",
"Дата обьявления",
"Район",
"Тип занятости",
"Тип занятости2"
)
)
pages_count = get_all_pages()
#print(pages_count)
collect_data(pages_count=pages_count)
# #get_data(file_path="items_urls.txt")
# df.to_excel('./test.xlsx')
if __name__ == '__main__':
main()
I tried to use html5lib, html.parser and lxml, but i have the same results.
Also i tried to use soup.select to find the number of div-block with "serp-item" class, but it gives me the same result. I think, that info from remaining block are stored in JS, if i'm right, can someone explain, how to parse remaining blocks?

I think you should you use selenium and try to scroll to end of page before you parse any data
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height

Is there another better way to plot Flask graphs on Html front end

I have a flask boilerplate that I am using to display graphs at the frontend of the app. However, this code on HTML div class is giving me an error. What specifically am I missing out here
HTML CODE:
<div class="layer w-100 pX-20 pT-20">
<h6 class="lh-1">Monthly Stats</h6>
<div class="chart" id="bargraph">
<script>
var graphs = {{plot | safe}};
Plotly.plot('bargraph',graphs,{});
</script>
</div>
</div>
The Errors I am getting are
1. Identifier string,literal or numeric expected
2. statement expected
3. Unnecessary semicolon
4. Unterminated statement
5. var used instead of let or const
6. unresolved variable 'plot'
7. Expression is not assignment or call
8. Unresolved variable 'safe'
This is how my index code looks like
from flask import Flask, render_template,request
import plotly
import plotly.graph_objs as go
import pathlib
import pandas as pd
import numpy as np
import json
app = Flask(__name__)
#create path
PATH = pathlib.Path(__file__).parent
DATA_PATH = PATH.joinpath("data").resolve()
#Data function
def data_used():
# Import data
df = pd.read_csv(DATA_PATH.joinpath("fulldata.csv"), low_memory=False)
return df
#####################################################################################
def create_df():
# get counts per NAR type
df_nar = pd.DataFrame(data_used().groupby('Hosp_id')['Document Source'].value_counts())
df_nar = df_nar.rename({'Document Source': 'Doc count'}, axis='columns')
df_nar = df_nar.reset_index()
return df_nar
def create_plot(df_nar):
# set up plotly figure
fig = go.Figure()
# Manage NAR types (who knows, there may be more types with time?)
nars = df_nar['Document Source'].unique()
nars = nars.tolist()
nars.sort(reverse=False)
# add one trace per NAR type and show counts per hospital
data = []
for nar in nars:
# subset dataframe by NAR type
df_ply = df_nar[df_nar['Document Source'] == nar]
# add trace
fig.add_trace(go.Bar(x=df_ply['Hospital_id'], y=df_ply['Doc count'], name='Document Type=' + str(nar)))
# make the figure a bit more presentable
fig.update_layout(title='Document Use per hospital',
yaxis=dict(title='<i>count of Docuement types</i>'),
xaxis=dict(title='<i>Hospital</i>'))
graphJSON = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)
return graphJSON
#app.route('/')
def index():
bar = create_plot(create_df())
return render_template('index.html', plot=bar)
if __name__ == '__main__':
app.run(debug = True, port = 8686)
Its Javascript I am using to display the plot on that Div class. the backend has been well coded, I am only getting this error from that section of div class. Anyone experienced it to help

Start with var graphs = "{{plot | safe }}";

Pull variable value from javascript source using BeautifulSoup4 Python

I'm newbie in python programming. I'm learning beautifulsoup to scrap website.
I want to extract and store the value of "stream" to my variable.
My Python code as follows :
import bs4 as bs #Importing BeautifulSoup4 Python Library.
import urllib.request
import requests
import json
import re
headers = {'User-Agent':'Mozilla/5.0'}
url = "http://thoptv.com/partners/mhdTVlive/Core.php?level=1200&channel=Dsports_HD"
page = requests.get(url)
soup = bs.BeautifulSoup(page.text,"html.parser")
pattern = re.compile('var stream = (.*?);')
scripts = soup.find_all('script')
for script in scripts:
if(pattern.match(str(script.string))):
data = pattern.match(script.string)
links = json.loads(data.groups()[0])
print(links)
This is the source javascript code to get the stream url value.
https://content.jwplatform.com/libraries/oncyToRO.js'>if( navigator.userAgent.match(/android/i)||
navigator.userAgent.match(/webOS/i)||
navigator.userAgent.match(/iPhone/i)||
navigator.userAgent.match(/iPad/i)||
navigator.userAgent.match(/iPod/i)||
navigator.userAgent.match(/BlackBerry/i)||
navigator.userAgent.match(/Windows Phone/i)) {var stream =
"http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=ibxIPxc6rkq1yIUJb4RlEV&pxe=1504146411&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs.AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw";}else{var
stream =
"http://hd.simiptv.com:8080//index.m3u8?key=VIoVSsGRLRouHWGNo1epzX&exp=932213423&domain=thoptv.stream&id=461";}jwplayer("THOPTVPlayer").setup({"title":
'thoptv.stream',"stretching":"exactfit","width": "100%","file":
none,"height": "100%","skin": "seven","autostart": "true","logo":
{"file":"https://i.imgur.com/EprI2uu.png","margin":"-0",
"position":"top-left","hide":"false","link":"http://mhdtvlive.co.in"},"androidhls":
true,});jwplayer("THOPTVPlayer").onError(function(){jwplayer().load({file:"http://content.jwplatform.com/videos/7RtXk3vl-52qL9xLP.mp4",image:"http://content.jwplatform.com/thumbs/7RtXk3vl-480.jpg"});jwplayer().play();});jwplayer("THOPTVPlayer").onComplete(function(){window.location
= window.location.href;});jwplayer("THOPTVPlayer").onPlay(function(){clearTimeout(theTimeout);});
I need to extract the url from stream.
var stream = "http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=ibxIPxc6rkq1yIUJb4RlEV&pxe=1504146411&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs.AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw";}

Rather then thinking complicated with regex, if the link is the only dynamically changing part, you can split the string with some known separating tokens.
x = """
https://content.jwplatform.com/libraries/oncyToRO.js'>if( navigator.userAgent.match(/android/i)|| navigator.userAgent.match(/webOS/i)|| navigator.userAgent.match(/iPhone/i)|| navigator.userAgent.match(/iPad/i)|| navigator.userAgent.match(/iPod/i)|| navigator.userAgent.match(/BlackBerry/i)|| navigator.userAgent.match(/Windows Phone/i)) {var stream = "http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=ibxIPxc6rkq1yIUJb4RlEV&pxe=1504146411&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs.AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw";}else{var stream = "http://hd.simiptv.com:8080//index.m3u8?key=VIoVSsGRLRouHWGNo1epzX&exp=932213423&domain=thoptv.stream&id=461";}jwplayer("THOPTVPlayer").setup({"title": 'thoptv.stream',"stretching":"exactfit","width": "100%","file": none,"height": "100%","skin": "seven","autostart": "true","logo": {"file":"https://i.imgur.com/EprI2uu.png","margin":"-0", "position":"top-left","hide":"false","link":"http://mhdtvlive.co.in"},"androidhls": true,});jwplayer("THOPTVPlayer").onError(function(){jwplayer().load({file:"http://content.jwplatform.com/videos/7RtXk3vl-52qL9xLP.mp4",image:"http://content.jwplatform.com/thumbs/7RtXk3vl-480.jpg"});jwplayer().play();});jwplayer("THOPTVPlayer").onComplete(function(){window.location = window.location.href;});jwplayer("THOPTVPlayer").onPlay(function(){clearTimeout(theTimeout);});
"""
left1, right1 = x.split("Phone/i)) {var stream =")
left2, right2 = right1.split(";}else")
print(left2)
# "http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=ibxIPxc6rkq1yIUJb4RlEV&pxe=1504146411&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs.AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw"

pattern.match() matches the pattern from the beginning of the string. Try using pattern.search() instead - it will match anywhere within the string.
Change your for loop to this:
for script in scripts:
data = pattern.search(script.text)
if data is not None:
stream_url = data.groups()[0]
print(stream_url)
You can also get rid of the surrounding quotes by changing the regex pattern to:
pattern = re.compile('var stream = "(.*?)";')
so that the double quotes are not included in the group.
You might also have noticed that there are two possible stream variables depending on the accessing user agent. For tablet like devices the first would be appropriate, while all other user agents should use the second stream. You can use pattern.findall() to get all of them:
>>> pattern.findall(script.text)
['"http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=LEurobVVelOhbzOZ6EkTwr&pxe=1571716053&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs.*AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw*"', '"http://hd.simiptv.com:8080//index.m3u8?key=vaERnLJswnWXM8THmfvDq5&exp=944825312&domain=thoptv.stream&id=461"']

this code works for me
import bs4 as bs #Importing BeautifulSoup4 Python Library.
import urllib.request
import requests
import json
headers = {'User-Agent':'Mozilla/5.0'}
url = "http://thoptv.com/partners/mhdTVlive/Core.php?
level=1200&channel=Dsports_HD"
page = requests.get(url)
soup = bs.BeautifulSoup(page.text,"html.parser")
scripts = soup.find_all('script')
out = list()
for c, i in enumerate(scripts): #go over list
text = i.text
if(text[:2] == "if"): #if the (if) comes first
for count, t in enumerate(text): # then we have reached the correct item in the list
if text[count] == "{" and text[count + 1] == "v" and text[count + 5] == "s": # and if this is here that stream is set
tmp = text[count:] # add this to the tmp varible
break # and end
co = 0
for m in tmp: #loop over the results from prev. result
if m == "\"" and co == 0: #if string is starting
co = 1 #set count to "true" 1
elif m == "\"" and co == 1: # if it is ending stop
print(''.join(out)) #results
break
elif co == 1:
# as long as we are looping over the rigth string
out.append(m) #add to out list
pass
result = ''.join(out) #set result
it basicly filters the string manuely.
but if we use user1767754 method (brilliant by the way) we will end up something like this:
import bs4 as bs #Importing BeautifulSoup4 Python Library.
import urllib.request
import requests
import json
headers = {'User-Agent':'Mozilla/5.0'}
url = "http://thoptv.com/partners/mhdTVlive/Core.php?level=1200&channel=Dsports_HD"
page = requests.get(url)
soup = bs.BeautifulSoup(page.text,"html.parser")
scripts = soup.find_all('script')
x = scripts[3].text
left1, right1 = x.split("Phone/i)) {var stream =")
left2, right2 = right1.split(";}else")
print(left2)

Python BeautifulSoup - Scraping Google Finance historical data

I was trying to scrap Google Finance historical data. I was need of to total number of rows, which is located along with the pagination. The following is the div tag which is responsible for displaying the total number of rows:
<div class="tpsd">1 - 30 of 1634 rows</div>
I tried using the following code to get the data, but its returning an empty list:
soup.find_all('div', 'tpsd')
I tried getting the entire table but even then I was not successful, when I checked the page source I was able to find the value inside a JavaScript function. When I Googled how to get values from script tag, it was mentioned to used regex. So, I tried using regex and the following is my code:
import requests
import re
from bs4 import BeautifulSoup
r = requests.get('https://www.google.com/finance/historical?cid=13564339&startdate=Jan+01%2C+2010&enddate=Aug+18%2C+2016&num=30&ei=ilC1V6HlPIasuASP9Y7gAQ')
soup = BeautifulSoup(r.content,'lxml')
var = soup.find_all("script")[8].string
a = re.compile('google.finance.applyPagination\((.*)\'http', re.DOTALL)
b = a.search(var)
num = b.group(1)
print(num.replace(',','').split('\n')[3])
I am able to get the values which I want, but my doubt is whether the above code which I used to get the values is correct, or is there any other way better way. Kindly help.

You can easily pass an offset i.e start=.. to the url getting 30 rows at a time which is exactly what is happening with the pagination logic:
from bs4 import BeautifulSoup
import requests
url = "https://www.google.com/finance/historical?cid=13564339&startdate=Jan+01%2C+2010&" \
"enddate=Aug+18%2C+2016&num=30&ei=ilC1V6HlPIasuASP9Y7gAQ&start={}"
with requests.session() as s:
start = 0
req = s.get(url.format(start))
soup = BeautifulSoup(req.content, "lxml")
table = soup.select_one("table.gf-table.historical_price")
all_rows = table.find_all("tr")
while True:
start += 30
soup = BeautifulSoup(s.get(url.format(start)).content, "lxml")
table = soup.select_one("table.gf-table.historical_price")
if not table:
break
all_rows.extend(table.find_all("tr"))
You can also get the total rows using the script tag and use that with range:
with requests.session() as s:
req = s.get(url.format(0))
soup = BeautifulSoup(req.content, "lxml")
table = soup.select_one("table.gf-table.historical_price")
scr = soup.find("script", text=re.compile('google.finance.applyPagination'))
total = int(scr.text.split(",", 3)[2])
all_rows = table.find_all("tr")
for start in range(30, total+1, 30):
soup = BeautifulSoup(s.get(url.format(start)).content, "lxml")
table = soup.select_one("table.gf-table.historical_price")
all_rows.extend(table.find_all("tr"))
print(len(all_rows))
The num=30 is the amount of rows per page, to make less requests you can set it to 200 which seems to be the max and work your step/offset from that.
url = "https://www.google.com/finance/historical?cid=13564339&startdate=Jan+01%2C+2010&" \
"enddate=Aug+18%2C+2016&num=200&ei=ilC1V6HlPIasuASP9Y7gAQ&start={}"
with requests.session() as s:
req = s.get(url.format(0))
soup = BeautifulSoup(req.content, "lxml")
table = soup.select_one("table.gf-table.historical_price")
scr = soup.find("script", text=re.compile('google.finance.applyPagination'))
total = int(scr.text.split(",", 3)[2])
all_rows = table.find_all("tr")
for start in range(200, total+1, 200):
soup = BeautifulSoup(s.get(url.format(start)).content, "lxml")
print(url.format(start)
table = soup.select_one("table.gf-table.historical_price")
all_rows.extend(table.find_all("tr"))
If we run the code, you will see we get 1643 rows:
In [7]: with requests.session() as s:
...: req = s.get(url.format(0))
...: soup = BeautifulSoup(req.content, "lxml")
...: table = soup.select_one("table.gf-table.historical_price")
...: scr = soup.find("script", text=re.compile('google.finance.applyPagination'))
...: total = int(scr.text.split(",", 3)[2])
...: all_rows = table.find_all("tr")
...: for start in range(200, total+1, 200):
...: soup = BeautifulSoup(s.get(url.format(start)).content, "lxml")
...: table = soup.select_one("table.gf-table.historical_price")
...: all_rows.extend(table.find_all("tr"))
...: print(len(all_rows))
...:
1643
In [8]:

You can just use the python module: https://pypi.python.org/pypi/googlefinance
The api is simple:
#The google finance API that we need.
from googlefinance import getQuotes
#The json handeler, since the API returns a JSON.
import json
intelJSON = (getQuotes('INTC'))
intelDump = json.dumps(intelJSON, indent=2)
intelInfo = json.loads(intelDump)
intelPrice = intelInfo[0]['LastTradePrice']
intelTime = intelInfo[0]['LastTradeDateTimeLong']
print ("As of " + intelTime + ", Intel stock is trading at: " + intelPrice)

I prefer having all the raw CSV files that are available for download from Google Finance. I wrote a quick python script to automatically download all the historical price info for a list of companies -- it's equivalent to how a human might use the "Download to Spreadsheet" link manually.
Here's the GitHub repo, with the downloaded CSV files for all S&P 500 stocks (in the rawCSV folder): https://github.com/liezl200/stockScraper
It uses this link http://www.google.com/finance/historical?q=googl&startdate=May+3%2C+2012&enddate=Apr+30%2C+2017&output=csv where the key here is the last output parameter, output=csv. I use urllib.urlretrieve(download_url, local_csv_filename) to retrieve the CSV.

Multiple entries of Model instances in Django admin on save

I am running a django website on Heroku using uwsgi with 4 processes.
Often , whenever I click save and continue editing I get multiple entries of same instance, separated by 1-2 secs in their creation time.
What could be the issue behind this? I have even tried to prevent multiple clicks by disabling the button using javascript.
Also, the model has its save overridden . Although I dont think this makes any difference.
Venue ModelAdmin - simple calculations and save here
def save_model(self, request, obj, form, change):
obj.checked_by = request.user
if not obj.created_by:
obj.created_by = request.user.username
spaces = obj.venuetospacetypemapper_set.all().values('min_seating_capacity', 'max_seating_capacity')
obj.min_seating_capacity = min([space.get('min_seating_capacity') or 0 for space in spaces] or [0])
obj.max_seating_capacity = sum([space.get('max_seating_capacity') or 0 for space in spaces] or [0])
obj.save()
Since there are a number of inlines in Venue admin, only
VenueMedia model has save overridden:
def save(self, *args, **kwargs):
start=datetime.datetime.now()
print "starting save in admin.save", start
update_image = True
if self.id: # if updating record, then check if image path has changed
update_image = False
orig = VenueMedia.objects.get(id=self.id)
if self.url != orig.url:
update_image = True
print "in save in admin.save, seconds passed=", (datetime.datetime.now()-start).seconds
if update_image:
print "in update_image in admin.save, seconds passed=", (datetime.datetime.now()-start).seconds
image = Img.open(StringIO.StringIO(self.url.read()))
orignal_ratio = float(image.size[0])/image.size[1]
new_height = int(round(orignal_ratio * 171))
image.thumbnail((new_height,171), Img.ANTIALIAS)
output = StringIO.StringIO()
image.save(output, format='JPEG', quality=75)
output.seek(0)
self.thumbnail_url = InMemoryUploadedFile(output, 'ImageField', "%s" % str(self.url.name), 'image/jpeg', output.len, None)
if not self.venueid.thumb_image_path:
self.venueid.thumb_image_path = self.thumbnail_url
self.venueid.save()
print "before calling django.save in admin.save, seconds passed=", (datetime.datetime.now()-start).seconds
super(VenueMedia, self).save(*args, **kwargs)

We Keep Coding

JavaScript is the programming language of the Web.

Follow each link of a page and scrape content, Scrapy + Selenium - javascript

Related

Beautiful Soup find_All doesn't find all blocks

Is there another better way to plot Flask graphs on Html front end

Pull variable value from javascript source using BeautifulSoup4 Python

Python BeautifulSoup - Scraping Google Finance historical data

Multiple entries of Model instances in Django admin on save

Categories

Resources