Beautiful Soup find_All doesn't find all blocks - javascript

I'm trying to parse the headhunter.kz website.
In use: Python 3.9, beautifulsoup4.
When I parse pages with vacancies, I only get 20 div blocks with the "serp-item" class, when in fact there are 40 div blocks. (I open the HTML file in the browser and can see all 40 blocks.)
import requests
import os
import time
import re
from bs4 import BeautifulSoup
import csv
import pandas as pd

df = pd.DataFrame({})

global_url = "https://almaty.hh.kz/"

headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
}

def get_all_pages():
    with open("data/page_1.html") as file:
        src = file.read()
    #
    soup = BeautifulSoup(src,"lxml")
    #find("span", {"class":"pager-item-not-in-short-range"}).
    pages_count = int(soup.find("div",{"class":"pager"}).find_all("a")[-2].text)
    for i in range(1,pages_count+1):
        url = f"https://almaty.hh.kz/search/vacancy?area=160&clusters=true&enable_snippets=true&ored_clusters=true&professional_role=84&professional_role=116&professional_role=36&professional_role=157&professional_role=125&professional_role=156&professional_role=160&professional_role=10&professional_role=150&professional_role=25&professional_role=165&professional_role=73&professional_role=96&professional_role=164&professional_role=104&professional_role=112&professional_role=113&professional_role=148&professional_role=114&professional_role=121&professional_role=124&professional_role=20&search_period=30&hhtmFrom=vacancy_search_list&page={i}"
        r = requests.get(url = url,headers = headers)
        with open(f"data/page_{i}.html","w") as file:
            file.write(r.text)
        time.sleep(3)
    return pages_count+1

def collect_data(pages_count):
    for page in range(1, pages_count+1):
        with open(f"data/page_{page}.html") as file:
            src = file.read()
        soup = BeautifulSoup(src,"lxml")

        # item_cards = soup.find_all("div",{"class":"a-card__body ddl_product_link"})
        # print(len(item_cards))
        # for items in item_cards:
        #     product_title = items.find("a",{"class":"a-card__title link"}).text
        #     product_price = items.find("span",{"class":"a-card__price-text"}).text
        #     product_geo = items.find("div",{"class":"a-card__subtitle"}).text
        #     print(f"Title:{product_title} - Price: {product_price} - GEO: {product_geo}")

        #items_divs = soup.find_all("div",{"class":"serp-item"})
        items_divs = soup.find_all("div",{"class":"serp-item"})
        print(len(items_divs))

        urls =[]
        for item in items_divs:
            item_url = item.find("span",{"data-page-analytics-event":"vacancy_search_suitable_item"}).find("a",{"class":"serp-item__title"}).get("href")
            urls.append(item_url)

        with open("items_urls.txt","w") as file:
            for url in urls:
                file.write(f"{url}\n")

        get_data(file_path="items_urls.txt")

def get_data(file_path):
    result_list = []
    with open(file_path) as file:
        urls_list = file.readlines()

    clear_urls_list =[]
    for url in urls_list:
        url = url.strip()
        clear_urls_list.append(url)

    i=0
    for url in clear_urls_list:
        i+=1
        response = requests.get(url=url,headers=headers)
        soup = BeautifulSoup(response.text,"lxml")

        try:
            item_name = soup.find("div",{"class":"main-content"}).find("h1",{"data-qa":"vacancy-title"}).text.strip()
        except:
            item_name = 'E1'

        try:
            item_salary = soup.find("div",{"class":"main-content"}).find("div",{"data-qa":"vacancy-salary"}).text.strip()
        except:
            item_salary = 'E2'

        try:
            item_exp = soup.find("div",{"class":"main-content"}).find("span",{"data-qa":"vacancy-experience"}).text.strip()
        except:
            item_exp = 'E3'

        try:
            company_name = soup.find("div",{"class":"main-content"}).find("span",{"class":"vacancy-company-name"}).find("span").text.strip()
        except:
            company_name = 'E4'

        try:
            if soup.find("div",{"class":"main-content"}).find("p",{"class":"vacancy-creation-time-redesigned"}):
                date = soup.find("div",{"class":"main-content"}).find("p",{"class":"vacancy-creation-time-redesigned"}).text.strip()
            else:
                date = soup.find("div",{"class":"main-content"}).find("p",{"class":"vacancy-creation-time"}).text.strip()
        except:
            date = 'E5'

        try:
            if soup.find("div",{"class":"main-content"}).find("span",{"data-qa":"vacancy-view-raw-address"}):
                address = soup.find("div",{"class":"main-content"}).find("span",{"data-qa":"vacancy-view-raw-address"}).text
            elif soup.find("div",{"class":"main-content"}).find("div",{"class":"vacancy-company-bottom"}).find("p", {"data-qa":"vacancy-view-location"}):
                address = soup.find("div",{"class":"main-content"}).find("div",{"class":"vacancy-company-bottom"}).find("p", {"data-qa":"vacancy-view-location"}).text
            elif soup.find("div",{"class":"main-content"}).find("div",{"class":"block-employer--jHuyqacEkkrEkSl3Yg3M"}):
                address = soup.find("div",{"class":"main-content"}).find("div",{"class":"block-employer--jHuyqacEkkrEkSl3Yg3M"}).find("p", {"data-qa":"vacancy-view-location"}).text
        except:
            address = 'Алматы'

        try:
            zanyatost = soup.find("div",{"class":"main-content"}).find("p",{"data-qa":"vacancy-view-employment-mode"}).find("span").text.strip()
        except:
            zanyatost = 'E7'

        try:
            zanyatost2 = soup.find("div",{"class":"main-content"}).find("p",{"data-qa":"vacancy-view-employment-mode"}).text.lstrip(', ')
        except:
            zanyatost2 = 'E8'

        print(i)

        with open('test.csv','a',encoding ="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow(
                (
                    item_name,
                    item_salary,
                    item_exp,
                    company_name,
                    date,
                    address,
                    zanyatost,
                    zanyatost2
                )
            )

def main():
    with open('test.csv','w',encoding ="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(
            (
                'Должность',
                "Зарплата",
                "Опыт",
                "Компания",
                "Дата обьявления",
                "Район",
                "Тип занятости",
                "Тип занятости2"
            )
        )

    pages_count = get_all_pages()
    #print(pages_count)
    collect_data(pages_count=pages_count)
    # #get_data(file_path="items_urls.txt")
    # df.to_excel('./test.xlsx')

if __name__ == '__main__':
    main()
I tried using html5lib, html.parser and lxml, but I get the same results.
I also tried using soup.select to count the div blocks with the "serp-item" class, but it gives me the same result. I think the info from the remaining blocks is loaded by JavaScript. If I'm right, can someone explain how to parse the remaining blocks?

I think you should use Selenium and try to scroll to the end of the page before you parse any data:
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
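For context, here is a minimal sketch of how that scroll loop could be combined with BeautifulSoup for this page. It assumes geckodriver is installed; the URL is a shortened example of the search URL from the question, and the 3-second pause mirrors the delay the question already uses.

# Hedged sketch: load a search page in Selenium, scroll to the bottom so the
# JavaScript-rendered vacancies appear, then hand the final HTML to BeautifulSoup.
import time
from bs4 import BeautifulSoup
from selenium import webdriver

SCROLL_PAUSE_TIME = 3  # assumption: same pause the question uses between requests

driver = webdriver.Firefox()  # or webdriver.Chrome(), whichever driver is installed
driver.get("https://almaty.hh.kz/search/vacancy?area=160&page=1")  # shortened example URL

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

soup = BeautifulSoup(driver.page_source, "lxml")
items_divs = soup.find_all("div", {"class": "serp-item"})
print(len(items_divs))  # should now include the blocks injected by JavaScript
driver.quit()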

Related

request_html not returning/rendering whole web page

I am trying to parse a Google Shopping page, and I am trying to do it faster than Selenium. I stumbled across requests_html and it's been working pretty well. I have almost everything I need from it except one element it isn't parsing from the page. If you go to this Google Shopping page, you will notice that you can hover over some of the product images and see a second one. I am parsing the information from each product, but when it comes to both images, for some reason requests_html is only retrieving the second (hovered) image and not the first (main) one. I have attached my code below. I have been trying to find a good way to represent the output of requests_html to show what it IS retrieving, but I haven't found a way for it to print in a readable manner. To my knowledge requests_html can render JavaScript on pages, and in my case it is just weird that it is getting everything but the first image. I have viewed the 'inspect' part of the page to get the HTML, and the div class of the image I am trying to get is '.gOenxf'. Why is requests_html not rendering the first image of each product?
for google_post in google_initiate(request):
    # parse what is needed

def google_initiate(request):
    form = SearchForm(request.POST or None)
    if form.is_valid():
        url = 'https://www.google.com/search?biw=1866&bih=1043&tbm=shop&q=desk&tbs=mr:1,price:1,ppr_min:,ppr_max:,avg_rating:None'
        session = HTMLSession()
        response = session.get(url)
        print(response.html)
        google_parsed = response.html.find('.sh-dgr__gr-auto.sh-dgr__grid-result')
        response.close()
        session.close()
        return google_parsed
UPDATE:
import requests
from requests_html import HTMLSession

for google_post in google_initiate(request):
    post_website = 'Google'
    post_parse_page = google_initiate.google_parse_page
    try:
        post_title = google_post.find('.Xjkr3b', first=True).text
    except:
        post_title = ''
    try:
        post_url = str(google_post.find('.xCpuod'))
        post_url = 'https://www.google.com' + post_url[post_url.find("href='") + len("href='"):post_url.rfind("'")]
    except:
        post_url = ''
    try:
        post_second_website = google_post.find('.aULzUe.IuHnof', first=True).text
        if 'Amazon' in post_second_website or 'eBay' in post_second_website or 'Walmart' in post_second_website or 'AliExpress' in post_second_website or 'Craigslist' in post_second_website or 'Facebook Marketplace' in post_second_website or 'Oodle' in post_second_website:
            post_second_website = ''
    except:
        post_second_website = ''
    try:
        post_second_url = str(google_post.find('.shntl'))
        post_second_url = post_second_url[post_second_url.find("href='/url?url=") + len("href='/url?url="):post_second_url.rfind("'")]
        if '%' in post_second_url:
            post_second_url = post_second_url.split('%')[0]
    except:
        post_second_url = ''
    try:
        post_second_image_url = str(google_post.find('img'))
        if 'encrypted' in post_second_image_url:
            post_second_image_url = post_second_image_url[post_second_image_url.find("data-image-src='") + len("data-image-src='"):post_second_image_url.rfind('')]
        else:
            post_second_image_url = NO_IMAGE
    except:
        post_second_image_url = ''
    try:
        post_price = google_post.find('.a8Pemb.OFFNJ', first=True).text
        post_price = str(post_price.split()[0])
        try:  # string first
            if '.' not in post_price:
                post_price = post_price + '.00'
            elif len(post_price.split('.')[1]) == 1:
                post_price = post_price + '0'
            elif len(post_price.split('.')[1]) == 0:
                post_price = post_price + '00'
            post_sort_by = post_price.replace(',', '')
            post_sort_by = float(post_sort_by.split('$')[1])
        except:
            post_price = 'n/a'
            post_sort_by = ''
    except:
        post_price = 'n/a'
        post_sort_by = ''
    try:
        post_rating = google_post.find('.Rsc7Yb', first=True).text
    except:
        post_rating = ''
    try:
        post_rating_quantity = google_post.find('.NzUzee', first=True).text
        post_rating_quantity = str(post_rating_quantity.split()[1])
    except:
        post_rating_quantity = ''
    try:
        post_image_url = str(google_post.find('.gOenxf'))
        if 'encrypted' in post_image_url:
            post_image_url = post_image_url[post_image_url.find("src='") + len("src='"):post_image_url.rfind("'")]
        else:
            post_image_url = NO_IMAGE
    except:
        post_image_url = ''
    google_final_postings.append((post_title, post_url, post_price, post_image_url, post_rating, post_rating_quantity, post_website, post_second_website, post_second_url, post_second_image_url, post_parse_page, post_sort_by))

def google_initiate(request):
    form = SearchForm(request.POST or None)
    if form.is_valid():
        url = 'https://www.google.com/search?biw=1866&bih=1043&tbm=shop&q=desk&tbs=mr:1,price:1,ppr_min:,ppr_max:,avg_rating:None'
        session = HTMLSession()
        response = session.get(url)
        google_parsed = response.html.find('.sh-dgr__gr-auto.sh-dgr__grid-result')
        print(google_parsed)
        response.close()
        session.close()
        return google_parsed
Assuming the content in question is dynamically injected by Javascript, you need to call response.html.render() before seeking the element.
def google_initiate(request):
    form = SearchForm(request.POST or None)
    if form.is_valid():
        url = 'https://www.google.com/search?biw=1866&bih=1043&tbm=shop&q=desk&tbs=mr:1,price:1,ppr_min:,ppr_max:,avg_rating:None'
        session = HTMLSession()
        response = session.get(url)
        response.html.render()
        print(response.html)
        google_parsed = response.html.find('.sh-dgr__gr-auto.sh-dgr__grid-result')
        response.close()
        session.close()
        return google_parsed
See example in official docs.
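If the first image is still missing after a plain render(), it may help to give the page a little more time to settle; render() accepts a few tuning parameters for that. A minimal sketch, where the sleep, scrolldown, and timeout values are illustrative assumptions rather than known-good settings for this page:

# Hedged sketch: re-render the page with extra settling time before querying it.
from requests_html import HTMLSession

session = HTMLSession()
response = session.get('https://www.google.com/search?tbm=shop&q=desk')  # example query URL
# sleep pauses after the initial render; scrolldown scrolls the page a few times,
# which can trigger lazy-loaded images.
response.html.render(sleep=2, scrolldown=3, timeout=20)
images = response.html.find('.gOenxf')  # the main-image class mentioned in the question
print(len(images))
response.close()
session.close()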

Flask app returns value to html in javascript function [duplicate]

This question already has answers here:
Return JSON response from Flask view
(15 answers)
Closed 12 months ago.
I would like to return two values, the model prediction and the accuracy, from my Flask app; however, the second element (the RMSE accuracy) is not returned. I would like to return the prediction on the first line and then the accuracy on the second one. How can I return two values from the app to the JS function?
Code in the Flask app:
from flask import Flask, render_template, request
import numpy as np
import pandas as pd
import pickle

app = Flask(__name__)
df = pd.read_csv("Cleaned_data_Wien.csv")
# pipe = pickle.load(open("RandomForestModel.pkl", "rb"))

@app.route('/')
def index():
    locations = sorted(df['location'].unique())
    rooms = sorted(df['rooms'].unique())
    return render_template('index.html', locations=locations, rooms=rooms)

@app.route('/predict', methods=['POST'])
def predict():
    location = request.form.get('location')  # name
    room = request.form.get('room')
    m2 = request.form.get('m2')
    print(location, room, m2)

    # Splitting
    X = df.drop(columns=['price'])
    y = df.price
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Preprocessing
    from sklearn.compose import make_column_transformer
    from sklearn.preprocessing import OneHotEncoder
    column_trans = make_column_transformer((OneHotEncoder(sparse=False), ['location']),  # non-numeric
                                           remainder='passthrough')
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()

    # Random forest regression
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor(n_estimators=500, random_state=0)
    from sklearn.pipeline import make_pipeline
    pipe = make_pipeline(column_trans, scaler, model)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    outcome = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    outcome['difference'] = outcome['y_test'] - outcome['y_pred']
    outcome['difference_percentage'] = round(outcome.difference/(outcome.y_test/100), 6)
    PROC = round(outcome.difference_percentage.abs().mean(), 2)
    MAE = round(mean_absolute_error(y_test, y_pred), 4)
    RMSE = round(np.sqrt(mean_squared_error(y_test, y_pred)), 4)
    R2 = round(r2_score(y_test, y_pred), 4)

    # sample
    input = pd.DataFrame([[room, m2, location]], columns=['rooms', 'm2', 'location'])
    input.location = input.location.astype(int)  # if pickle --> must be str
    input.rooms = input.rooms.astype(int)
    input.m2 = input.m2.astype(float)
    prediction = round(pipe.predict(input)[0], 2)

    return str(prediction), str(RMSE)  # THE OUTPUT

if __name__ == "__main__":
    app.run(debug=True, port=5001)
The JavaScript function on the index page:
function send_data()
{
    document.querySelector('form').addEventListener("submit", form_handler);
    var fd = new FormData(document.querySelector('form'));
    var xhr = new XMLHttpRequest();
    xhr.open('POST', '/predict', true);
    document.getElementById("prediction").innerHTML = "Please wait predicting price...";
    xhr.onreadystatechange = function(){
        if(xhr.readyState == XMLHttpRequest.DONE){
            document.getElementById('prediction').innerHTML = "Prediction: EUR " + xhr.responseText;
            document.getElementById('model').innerHTML = "Model: Random Forest Regression";
            document.getElementById('RMSE').innerHTML = "RMSE: " + xhr.responseText;
        };
    };
    xhr.onload = function(){};
    xhr.send(fd);
}
You shouldn't be returning a tuple from your predict route. Read here how a view function's response is interpreted by Flask.
You should return a dictionary containing the two values, e.g.
return {'prediction': prediction, 'RMSE': RMSE}
This will be automatically JSONified. In the frontend, you should decode the JSON object and appropriately display the two values (now you're showing the entire response in both elements), e.g.
let result = JSON.parse(xhr.responseText);
document.getElementById('prediction').innerHTML = `Prediction: EUR ${result.prediction}`;
document.getElementById('RMSE').innerHTML = `RMSE: ${result.RMSE}`;
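For completeness, here is a minimal sketch of the response shape on the Flask side, using jsonify explicitly. The dummy values are placeholders standing in for the prediction and RMSE computed by the model code in the question; everything else in the route is unchanged and omitted.

# Hedged sketch: return both values as a single JSON object.
from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    prediction, RMSE = 123456.78, 0.1234  # placeholders for the real computed values
    return jsonify({'prediction': str(prediction), 'RMSE': str(RMSE)})

if __name__ == "__main__":
    app.run(debug=True, port=5001)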

Pull variable value from javascript source using BeautifulSoup4 Python

I'm a newbie in Python programming. I'm learning BeautifulSoup to scrape websites.
I want to extract the value of "stream" and store it in a variable.
My Python code is as follows:
import bs4 as bs #Importing BeautifulSoup4 Python Library.
import urllib.request
import requests
import json
import re

headers = {'User-Agent':'Mozilla/5.0'}
url = "http://thoptv.com/partners/mhdTVlive/Core.php?level=1200&channel=Dsports_HD"
page = requests.get(url)
soup = bs.BeautifulSoup(page.text,"html.parser")
pattern = re.compile('var stream = (.*?);')
scripts = soup.find_all('script')
for script in scripts:
    if(pattern.match(str(script.string))):
        data = pattern.match(script.string)
        links = json.loads(data.groups()[0])
        print(links)
This is the JavaScript source code that sets the stream URL value:
https://content.jwplatform.com/libraries/oncyToRO.js'>if( navigator.userAgent.match(/android/i)||
navigator.userAgent.match(/webOS/i)||
navigator.userAgent.match(/iPhone/i)||
navigator.userAgent.match(/iPad/i)||
navigator.userAgent.match(/iPod/i)||
navigator.userAgent.match(/BlackBerry/i)||
navigator.userAgent.match(/Windows Phone/i)) {var stream =
"http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=ibxIPxc6rkq1yIUJb4RlEV&pxe=1504146411&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs.AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw";}else{var
stream =
"http://hd.simiptv.com:8080//index.m3u8?key=VIoVSsGRLRouHWGNo1epzX&exp=932213423&domain=thoptv.stream&id=461";}jwplayer("THOPTVPlayer").setup({"title":
'thoptv.stream',"stretching":"exactfit","width": "100%","file":
none,"height": "100%","skin": "seven","autostart": "true","logo":
{"file":"https://i.imgur.com/EprI2uu.png","margin":"-0",
"position":"top-left","hide":"false","link":"http://mhdtvlive.co.in"},"androidhls":
true,});jwplayer("THOPTVPlayer").onError(function(){jwplayer().load({file:"http://content.jwplatform.com/videos/7RtXk3vl-52qL9xLP.mp4",image:"http://content.jwplatform.com/thumbs/7RtXk3vl-480.jpg"});jwplayer().play();});jwplayer("THOPTVPlayer").onComplete(function(){window.location
= window.location.href;});jwplayer("THOPTVPlayer").onPlay(function(){clearTimeout(theTimeout);});
I need to extract the url from stream.
var stream = "http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=ibxIPxc6rkq1yIUJb4RlEV&pxe=1504146411&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs.AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw";}
Rather than thinking up a complicated regex, if the link is the only dynamically changing part, you can split the string on some known separating tokens.
x = """
https://content.jwplatform.com/libraries/oncyToRO.js'>if( navigator.userAgent.match(/android/i)|| navigator.userAgent.match(/webOS/i)|| navigator.userAgent.match(/iPhone/i)|| navigator.userAgent.match(/iPad/i)|| navigator.userAgent.match(/iPod/i)|| navigator.userAgent.match(/BlackBerry/i)|| navigator.userAgent.match(/Windows Phone/i)) {var stream = "http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=ibxIPxc6rkq1yIUJb4RlEV&pxe=1504146411&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs.AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw";}else{var stream = "http://hd.simiptv.com:8080//index.m3u8?key=VIoVSsGRLRouHWGNo1epzX&exp=932213423&domain=thoptv.stream&id=461";}jwplayer("THOPTVPlayer").setup({"title": 'thoptv.stream',"stretching":"exactfit","width": "100%","file": none,"height": "100%","skin": "seven","autostart": "true","logo": {"file":"https://i.imgur.com/EprI2uu.png","margin":"-0", "position":"top-left","hide":"false","link":"http://mhdtvlive.co.in"},"androidhls": true,});jwplayer("THOPTVPlayer").onError(function(){jwplayer().load({file:"http://content.jwplatform.com/videos/7RtXk3vl-52qL9xLP.mp4",image:"http://content.jwplatform.com/thumbs/7RtXk3vl-480.jpg"});jwplayer().play();});jwplayer("THOPTVPlayer").onComplete(function(){window.location = window.location.href;});jwplayer("THOPTVPlayer").onPlay(function(){clearTimeout(theTimeout);});
"""
left1, right1 = x.split("Phone/i)) {var stream =")
left2, right2 = right1.split(";}else")
print(left2)
# "http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=ibxIPxc6rkq1yIUJb4RlEV&pxe=1504146411&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs.AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw"
pattern.match() matches the pattern from the beginning of the string. Try using pattern.search() instead - it will match anywhere within the string.
Change your for loop to this:
for script in scripts:
    data = pattern.search(script.text)
    if data is not None:
        stream_url = data.groups()[0]
        print(stream_url)
You can also get rid of the surrounding quotes by changing the regex pattern to:
pattern = re.compile('var stream = "(.*?)";')
so that the double quotes are not included in the group.
You might also have noticed that there are two possible stream variables depending on the accessing user agent. For tablet-like devices the first would be appropriate, while all other user agents should use the second stream. You can use pattern.findall() to get all of them:
>>> pattern.findall(script.text)
['"http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=LEurobVVelOhbzOZ6EkTwr&pxe=1571716053&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs.*AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw*"', '"http://hd.simiptv.com:8080//index.m3u8?key=vaERnLJswnWXM8THmfvDq5&exp=944825312&domain=thoptv.stream&id=461"']
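Tying the pieces of this answer together, a minimal sketch (assuming the page still embeds a script like the one quoted in the question) could look like this:

# Hedged sketch: fetch the page, search every <script> tag, and capture both
# stream URLs (mobile first, desktop second) without the surrounding quotes.
import re
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
url = "http://thoptv.com/partners/mhdTVlive/Core.php?level=1200&channel=Dsports_HD"
pattern = re.compile(r'var stream = "(.*?)";')

page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, "html.parser")

for script in soup.find_all('script'):
    streams = pattern.findall(script.text or "")
    if streams:
        mobile_stream = streams[0]    # served to phone/tablet user agents
        desktop_stream = streams[-1]  # served to everything else
        print(mobile_stream)
        print(desktop_stream)
        break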
This code works for me:
import bs4 as bs #Importing BeautifulSoup4 Python Library.
import urllib.request
import requests
import json

headers = {'User-Agent':'Mozilla/5.0'}
url = "http://thoptv.com/partners/mhdTVlive/Core.php?level=1200&channel=Dsports_HD"
page = requests.get(url)
soup = bs.BeautifulSoup(page.text,"html.parser")
scripts = soup.find_all('script')
out = list()
for c, i in enumerate(scripts): #go over list
    text = i.text
    if(text[:2] == "if"): #if the (if) comes first
        for count, t in enumerate(text): # then we have reached the correct item in the list
            if text[count] == "{" and text[count + 1] == "v" and text[count + 5] == "s": # and if this is here that stream is set
                tmp = text[count:] # add this to the tmp varible
                break # and end
        co = 0
        for m in tmp: #loop over the results from prev. result
            if m == "\"" and co == 0: #if string is starting
                co = 1 #set count to "true" 1
            elif m == "\"" and co == 1: # if it is ending stop
                print(''.join(out)) #results
                break
            elif co == 1:
                # as long as we are looping over the rigth string
                out.append(m) #add to out list
                pass
result = ''.join(out) #set result
It basically filters the string manually.
But if we use user1767754's method (brilliant, by the way), we will end up with something like this:
import bs4 as bs #Importing BeautifulSoup4 Python Library.
import urllib.request
import requests
import json
headers = {'User-Agent':'Mozilla/5.0'}
url = "http://thoptv.com/partners/mhdTVlive/Core.php?level=1200&channel=Dsports_HD"
page = requests.get(url)
soup = bs.BeautifulSoup(page.text,"html.parser")
scripts = soup.find_all('script')
x = scripts[3].text
left1, right1 = x.split("Phone/i)) {var stream =")
left2, right2 = right1.split(";}else")
print(left2)

Python BeautifulSoup - Scraping Google Finance historical data

I was trying to scrape Google Finance historical data. I needed the total number of rows, which is located along with the pagination. The following is the div tag responsible for displaying the total number of rows:
<div class="tpsd">1 - 30 of 1634 rows</div>
I tried using the following code to get the data, but it returns an empty list:
soup.find_all('div', 'tpsd')
I tried getting the entire table, but even then I was not successful. When I checked the page source, I was able to find the value inside a JavaScript function. When I Googled how to get values from a script tag, it was suggested to use regex. So I tried using regex, and the following is my code:
import requests
import re
from bs4 import BeautifulSoup
r = requests.get('https://www.google.com/finance/historical?cid=13564339&startdate=Jan+01%2C+2010&enddate=Aug+18%2C+2016&num=30&ei=ilC1V6HlPIasuASP9Y7gAQ')
soup = BeautifulSoup(r.content,'lxml')
var = soup.find_all("script")[8].string
a = re.compile('google.finance.applyPagination\((.*)\'http', re.DOTALL)
b = a.search(var)
num = b.group(1)
print(num.replace(',','').split('\n')[3])
I am able to get the values I want, but my doubt is whether the above code I used to get the values is correct, or whether there is a better way. Kindly help.
You can easily pass an offset, i.e. start=.., to the URL, getting 30 rows at a time, which is exactly what happens with the pagination logic:
from bs4 import BeautifulSoup
import requests

url = "https://www.google.com/finance/historical?cid=13564339&startdate=Jan+01%2C+2010&" \
      "enddate=Aug+18%2C+2016&num=30&ei=ilC1V6HlPIasuASP9Y7gAQ&start={}"

with requests.session() as s:
    start = 0
    req = s.get(url.format(start))
    soup = BeautifulSoup(req.content, "lxml")
    table = soup.select_one("table.gf-table.historical_price")
    all_rows = table.find_all("tr")
    while True:
        start += 30
        soup = BeautifulSoup(s.get(url.format(start)).content, "lxml")
        table = soup.select_one("table.gf-table.historical_price")
        if not table:
            break
        all_rows.extend(table.find_all("tr"))
You can also get the total rows using the script tag and use that with range:
with requests.session() as s:
    req = s.get(url.format(0))
    soup = BeautifulSoup(req.content, "lxml")
    table = soup.select_one("table.gf-table.historical_price")
    scr = soup.find("script", text=re.compile('google.finance.applyPagination'))
    total = int(scr.text.split(",", 3)[2])
    all_rows = table.find_all("tr")
    for start in range(30, total+1, 30):
        soup = BeautifulSoup(s.get(url.format(start)).content, "lxml")
        table = soup.select_one("table.gf-table.historical_price")
        all_rows.extend(table.find_all("tr"))
    print(len(all_rows))
num=30 is the number of rows per page; to make fewer requests you can set it to 200, which seems to be the max, and work your step/offset from that.
url = "https://www.google.com/finance/historical?cid=13564339&startdate=Jan+01%2C+2010&" \
      "enddate=Aug+18%2C+2016&num=200&ei=ilC1V6HlPIasuASP9Y7gAQ&start={}"

with requests.session() as s:
    req = s.get(url.format(0))
    soup = BeautifulSoup(req.content, "lxml")
    table = soup.select_one("table.gf-table.historical_price")
    scr = soup.find("script", text=re.compile('google.finance.applyPagination'))
    total = int(scr.text.split(",", 3)[2])
    all_rows = table.find_all("tr")
    for start in range(200, total+1, 200):
        soup = BeautifulSoup(s.get(url.format(start)).content, "lxml")
        print(url.format(start))
        table = soup.select_one("table.gf-table.historical_price")
        all_rows.extend(table.find_all("tr"))
If we run the code, you will see we get 1643 rows:
In [7]: with requests.session() as s:
   ...:     req = s.get(url.format(0))
   ...:     soup = BeautifulSoup(req.content, "lxml")
   ...:     table = soup.select_one("table.gf-table.historical_price")
   ...:     scr = soup.find("script", text=re.compile('google.finance.applyPagination'))
   ...:     total = int(scr.text.split(",", 3)[2])
   ...:     all_rows = table.find_all("tr")
   ...:     for start in range(200, total+1, 200):
   ...:         soup = BeautifulSoup(s.get(url.format(start)).content, "lxml")
   ...:         table = soup.select_one("table.gf-table.historical_price")
   ...:         all_rows.extend(table.find_all("tr"))
   ...:     print(len(all_rows))
   ...:
1643

In [8]:
You can just use the python module: https://pypi.python.org/pypi/googlefinance
The api is simple:
#The google finance API that we need.
from googlefinance import getQuotes
#The json handeler, since the API returns a JSON.
import json
intelJSON = (getQuotes('INTC'))
intelDump = json.dumps(intelJSON, indent=2)
intelInfo = json.loads(intelDump)
intelPrice = intelInfo[0]['LastTradePrice']
intelTime = intelInfo[0]['LastTradeDateTimeLong']
print ("As of " + intelTime + ", Intel stock is trading at: " + intelPrice)
I prefer having all the raw CSV files that are available for download from Google Finance. I wrote a quick python script to automatically download all the historical price info for a list of companies -- it's equivalent to how a human might use the "Download to Spreadsheet" link manually.
Here's the GitHub repo, with the downloaded CSV files for all S&P 500 stocks (in the rawCSV folder): https://github.com/liezl200/stockScraper
It uses this link http://www.google.com/finance/historical?q=googl&startdate=May+3%2C+2012&enddate=Apr+30%2C+2017&output=csv where the key here is the last output parameter, output=csv. I use urllib.urlretrieve(download_url, local_csv_filename) to retrieve the CSV.
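As a minimal sketch of that approach in Python 3 (the ticker, date range, and output filename below are illustrative assumptions; the answer's original script lives in the linked repo):

# Hedged sketch: download one historical-price CSV the way the answer describes,
# using the output=csv parameter and urlretrieve.
from urllib.request import urlretrieve  # the answer used Python 2's urllib.urlretrieve

ticker = "googl"  # example ticker
download_url = (
    "http://www.google.com/finance/historical"
    f"?q={ticker}&startdate=May+3%2C+2012&enddate=Apr+30%2C+2017&output=csv"
)
local_csv_filename = f"{ticker}_historical.csv"
urlretrieve(download_url, local_csv_filename)
print(f"Saved {local_csv_filename}")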

Follow each link of a page and scrape content, Scrapy + Selenium

This is the website I'm working on. On each page, there are 18 posts in a table. I want to access each post, scrape its content, and repeat this for the first 5 pages.
My approach is to make my spider scrape all the links on the 5 pages and iterate over them to get the content. Because the "next page" button and certain text in each post are written by JavaScript, I use Selenium together with Scrapy. I ran my spider and could see that the Firefox webdriver displays the first 5 pages, but then the spider stopped without scraping any content. Scrapy returned no error message either.
Now I suspect that the failure may be due to:
1) No link is stored in all_links.
2) Somehow parse_content did not run.
My diagnosis may be wrong and I need help finding the problem. Thank you very much!
This is my spider:
import scrapy
from bjdaxing.items_bjdaxing import BjdaxingItem
from selenium import webdriver
from scrapy.http import TextResponse
import time

all_links = [] # a global variable to store post links

class Bjdaxing(scrapy.Spider):
    name = "daxing"
    allowed_domains = ["bjdx.gov.cn"] # DO NOT use www in allowed domains
    start_urls = ["http://app.bjdx.gov.cn/cms/daxing/lookliuyan_bjdx.jsp"] # This has to start with http

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url) # request the start url in the browser
        i = 1
        while i <= 5: # The number of pages to be scraped in this session
            response = TextResponse(url = response.url, body = self.driver.page_source, encoding='utf-8') # Assign page source to response. I can treat response as if it's a normal scrapy project.
            global all_links
            all_links.extend(response.xpath("//a/@href").extract()[0:18])
            next = self.driver.find_element_by_xpath(u'//a[text()="\u4e0b\u9875\xa0"]') # locate "next" button
            next.click() # Click next page
            time.sleep(2) # Wait a few seconds for next page to load.
            i += 1

    def parse_content(self, response):
        item = BjdaxingItem()
        global all_links
        for link in all_links:
            self.driver.get("http://app.bjdx.gov.cn/cms/daxing/") + link
            response = TextResponse(url = response.url, body = self.driver.page_source, encoding = 'utf-8')
            if len(response.xpath("//table/tbody/tr[1]/td[2]/text()").extract() > 0):
                item['title'] = response.xpath("//table/tbody/tr[1]/td[2]/text()").extract()
            else:
                item['title'] = ""
            if len(response.xpath("//table/tbody/tr[3]/td[2]/text()").extract() > 0):
                item['netizen'] = response.xpath("//table/tbody/tr[3]/td[2]/text()").extract()
            else:
                item['netizen'] = ""
            if len(response.xpath("//table/tbody/tr[3]/td[4]/text()").extract() > 0):
                item['sex'] = response.xpath("//table/tbody/tr[3]/td[4]/text()").extract()
            else:
                item['sex'] = ""
            if len(response.xpath("//table/tbody/tr[5]/td[2]/text()").extract() > 0):
                item['time1'] = response.xpath("//table/tbody/tr[5]/td[2]/text()").extract()
            else:
                item['time1'] = ""
            if len(response.xpath("//table/tbody/tr[11]/td[2]/text()").extract() > 0):
                item['time2'] = response.xpath("//table/tbody/tr[11]/td[2]/text()").extract()
            else:
                item['time2'] = ""
            if len(response.xpath("//table/tbody/tr[7]/td[2]/text()").extract()) > 0:
                question = "".join(response.xpath("//table/tbody/tr[7]/td[2]/text()").extract())
                item['question'] = "".join(map(unicode.strip, question))
            else: item['question'] = ""
            if len(response.xpath("//table/tbody/tr[9]/td[2]/text()").extract()) > 0:
                reply = "".join(response.xpath("//table/tbody/tr[9]/td[2]/text()").extract())
                item['reply'] = "".join(map(unicode.strip, reply))
            else: item['reply'] = ""
            if len(response.xpath("//table/tbody/tr[13]/td[2]/text()").extract()) > 0:
                agency = "".join(response.xpath("//table/tbody/tr[13]/td[2]/text()").extract())
                item['agency'] = "".join(map(unicode.strip, agency))
            else: item['agency'] = ""
            yield item
Multiple problems and possible improvements here:

- you don't have any "link" between the parse() and the parse_content() methods
- using global variables is usually a bad practice
- you don't need selenium here at all. To follow the pagination you just need to make a POST request to the same url providing the currPage parameter
The idea is to use .start_requests() and create a list/queue of requests to handle the pagination. Follow the pagination and gather the links from the table. Once the queue of requests is empty, switch to following the previously gathered links. Implementation:
import json
from urlparse import urljoin

import scrapy

NUM_PAGES = 5

class Bjdaxing(scrapy.Spider):
    name = "daxing"
    allowed_domains = ["bjdx.gov.cn"]  # DO NOT use www in allowed domains

    def __init__(self):
        self.pages = []
        self.links = []

    def start_requests(self):
        self.pages = [scrapy.Request("http://app.bjdx.gov.cn/cms/daxing/lookliuyan_bjdx.jsp",
                                     body=json.dumps({"currPage": str(page)}),
                                     method="POST",
                                     callback=self.parse_page,
                                     dont_filter=True)
                      for page in range(1, NUM_PAGES + 1)]
        yield self.pages.pop()

    def parse_page(self, response):
        base_url = response.url
        self.links += [urljoin(base_url, link) for link in response.css("table tr td a::attr(href)").extract()]
        try:
            yield self.pages.pop()
        except IndexError:  # no more pages to follow, going over the gathered links
            for link in self.links:
                yield scrapy.Request(link, callback=self.parse_content)

    def parse_content(self, response):
        # your parse_content method here
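If it helps, here is one way the spider could be run and its items saved to a file. This is a minimal sketch assuming the Bjdaxing class above is defined in the same module and a reasonably recent Scrapy (the FEEDS setting and the output filename are assumptions, not part of the answer above):

# Hedged sketch: run the spider programmatically instead of via `scrapy crawl daxing`.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    "FEEDS": {"daxing_posts.json": {"format": "json"}},  # write scraped items to a JSON file
})
process.crawl(Bjdaxing)  # the spider class from the answer above
process.start()          # blocks until the crawl is finished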
