I'm trying to scrape the href of each model from this webpage: https://www.aprilia.com/en_EN/index.
The html showing the href data can be seen only after clicking on two buttons (the one on the top right corner and the one on the left called "Models"), one after the other.
Sub get_info()
Dim ie As Object
Dim address, str_chk As String
Dim my_data As Object
Dim oHTML_Element As IHTMLElement
Dim i As Long
address = "https://www.aprilia.com/en_EN/index"
Set ie = CreateObject("InternetExplorer.Application")
ie.navigate address 'the one mentioned above
ie.Visible = False
Do While ie.Busy Or ie.readyState <> 4
DoEvents
Loop
For Each oHTML_Element In ie.document.getElementsByName("button")
If oHTML_Element.className = "header__menu-services__nav button button--icon" Then
oHTML_Element.Click
End If
Next
Application.Wait Now + #12:00:05 AM#
For Each oHTML_Element In ie.document.getElementsByName("Models")
oHTML_Element.Click
Next
Application.Wait Now + #12:00:05 AM#
'==>
Set my_data = html.getElementsByClassName("card-product card-product--family")
For Each elem In my_data
For i = 0 To elem.getElementsByTagName("a").Length - 1
str_chk = elem.getElementsByTagName("a")(i).href
ws.Cells(9 + j, 7).Value = str_chk
j = j + 1
Next i
Next elem
ie.Quit
Set ie = Nothing
End Sub
I got
Error '424' - Object Required
where I set my_data.
I guess that means that I'm not able to click on the two buttons and, as a consequence, html code is not available.
***************** Revised code:
Sub get_info22()
Dim address As String
Dim ie, ELE, nodes As Object
Dim i As Long
Dim t As Date
Const MAX_WAIT_SEC As Long = 10
address = "https://www.aprilia.com/en_EN/index"
Set ie = CreateObject("InternetExplorer.Application")
ie.navigate address 'the one mentioned above
ie.Visible = False
Do While ie.Busy Or ie.readyState <> 4
DoEvents
Loop
'************** click on first button
t = Timer
Do
On Error Resume Next
Set ELE = ie.document.querySelector(".header__menu-services__nav")
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While ELE Is Nothing
If Not ELE Is Nothing Then
ELE.Click
End If
While ie.Busy Or ie.readyState <> 4: DoEvents: Wend
'************** click on second button
Do
On Error Resume Next
Set ELE = ie.document.querySelector("li > button")
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While ELE Is Nothing
If Not ELE Is Nothing Then
ELE.Click
End If
While ie.Busy Or ie.readyState <> 4: DoEvents: Wend
'************** get href for each model
Set nodes = ie.document.querySelectorAll(".card-product--family")
ReDim hrefs(nodes.Length - 1)
For i = 0 To nodes.Length - 1
hrefs(i) = nodes.Item(i).href
ActiveSheet.Cells(9 + i, 7).Value = hrefs(i)
Next
Try first to use more precise selectors. For the first button use:
ie.document.querySelector(".header__menu-services__nav").click
That targets the element by one of its classes. Then have a pause e.g.
While ie.Busy Or ie.ReadyState<>4:DoEvents:Wend
Or, use an explicit wait time or loop until next desired element is present.
Then target the next element with type selectors and child combinator as you want the first child button within a li element:
ie.document.querySelector("li > button").click
Then you need another wait.
Finally, you can use a single class from the target elements, with the links, and extract the href attributes and store in an array (for example)
Dim nodes As Object, hrefs(), i As Long
Set nodes = ie.Document.querySelectorAll(".card-product")
ReDim hrefs(nodes.Length - 1)
For i = 0 To nodes.Length - 1
hrefs(i) = nodes.Item(i).href
Next
EDIT:
Seems page uses ajax to retrieve the listings which makes this easier. I show to versions. The first where I grab just those links you describe after two button clicks; the second, where I grab model subtype links as well.
In both I mimic the request the page makes to get that info. In the first I then parse the returned json with a json parser and pull out the model links. With the second, I regex out all href info ie. all submodels.
Json library:
I use jsonconverter.bas. Download raw code from here and add to standard module called JsonConverter . You then need to go VBE > Tools > References > Add reference to Microsoft Scripting Runtime. Remove the top Attribute line from the copied code.
1)
Option Explicit
Public Sub ScrapeModelLinks1()
Dim data As Object, links() As Variant, s As String
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.aprilia.com/en_EN/aprilia/en/index?ajax=true", False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
s = .responseText
End With
Set data = JsonConverter.ParseJson(s)("pageData")("main")("component-06")("items")
ReDim links(data.Count)
Dim item As Long, base As String
base = "https://www.aprilia.com"
For item = 1 To data.Count
links(item) = base & data(item)("href")
Next
Stop
End Sub
Public Sub ScrapeModelLinks2()
'grab all href which will include model subtypes
Dim s As String
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.aprilia.com/en_EN/aprilia/en/index?ajax=true", False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
s = .responseText
End With
Dim re As Object, matches As Object, links() As Variant
Set re = CreateObject("VBScript.RegExp")
re.Pattern = """href"":""(.*?)"""
re.Global = True
Set matches = re.Execute(s)
ReDim links(matches.Count - 1)
Dim item As Long, base As String
base = "https://www.aprilia.com"
For item = 0 To matches.Count - 1
links(item) = base & matches(item).submatches(0)
Next
Stop
End Sub
Regex explanation:
Related
Unfortunately my old account got deleted when changing website hosts. I looked throughout the site for a solution but only found people dealing with entire tables rather than just trying to obtain one value. I am having an issue grabbing Net Assets with innerText from the following webpage:
https://finance.yahoo.com/quote/UMBWX?p=UMBWX
Here is my attempt:
Sub FetchAssets()
Dim HTML As New HTMLDocument, elem As Object, URL As String
URL = https://finance.yahoo.com/quote/UMBWX?p=UMBWX
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
Application.Wait Now + 1 / (24 * 60 * 60# * 2)
.send
Application.Wait Now + 1 / (24 * 60 * 60# * 2)
HTML.body.innerHTML = .responseText
Set elem = HTML.querySelector("NET_ASSETS-value")
MsgBox elem.innerText
End With
End Sub
This attempt results in: Run-time error '91': Object variable or With block variable not set
I have tried the exact same code and swapped elem.innerText with elem.innerHTML and it results in the same error. The error happens on the MsgBox line of code, it sets elem fine.
Due to my reputation being under 10 I can not embed an image, but here is the line of code from the webpage:
<td class="Ta(end) Fw(600) Lh(14px)" data-test="NET_ASSETS-value" data-reactid="89"><span class="Trsdu(0.3s) " data-reactid="90">539.25M</span></td>
Try this
Sub FetchAssets()
Dim html As New HTMLDocument, elem As Object, sURL As String
sURL = "https://finance.yahoo.com/quote/UMBWX?p=UMBWX"
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", sURL, False
.send
html.body.innerHTML = .responseText
Set elem = html.querySelector("td[data-test='NET_ASSETS-value']")
MsgBox elem.innerText
End With
End Sub
I need to get the page number in order to extract text from that specific page in a .PDF document. I am using Excel VBA function that makes use of the JSObject from the Acrobat Type Library 10.0
Here is the code snippet and the code hicks up on when I am trying to reference the pageNum property from Doc object. I am trying to avoid the AV Layer and use the PD Layer only, so my macro runs in the background only and doesn't invoke Acrobat Application.
Function getTextFromPDF_JS(ByVal strFilename As String) As String
Dim pdDoc As New AcroPDDoc
Dim pdfPage As Acrobat.AcroPDPage
Dim pdfBookmark As Acrobat.AcroPDBookmark
Dim jso As Object
Dim BookMarkRoot As Object
Dim vBookmark As Variant
Dim objSelection As AcroPDTextSelect
Dim objHighlight As AcroHiliteList
Dim currPage As Integer
Dim strText As String
Dim BM_flag As Boolean
Dim count As Integer
Dim word As Variant
strText = ""
If (pdDoc.Open(strFilename)) Then
Set jso = pdDoc.GetJSObject
Set BookMarkRoot = jso.BookMarkRoot
vBookmark = jso.BookMarkRoot.Children
'Add a function call to see if a particular bookmark exists within the .PDF
Set pdfBookmark = CreateObject("AcroExch.PDBookmark")
BM_flag = pdfBookmark.GetByTitle(pdDoc, "Title Page")
If (BM_flag) Then
For i = 0 To UBound(vBookmark)
If vBookmark(i).Name = "Title Page" Then
vBookmark(i).Execute
jso.pageNum
Set pdfPage = pdDoc.AcquirePage(pageNum)
Set objHighlight = New AcroHiliteList
objHighlight.Add 0, 10000 ' Adjust this up if it's not getting all the text on the page
Set objSelection = pdfPage.CreatePageHilite(objHighlight)
If Not objSelection Is Nothing Then
For tCount = 0 To objSelection.GetNumText - 1
strText = strText & objSelection.GetText(tCount)
Next tCount
End If
Exit For
End If
pdDoc.Close
End If
End If
getTextFromPDF_JS = strText
End Function
jso.pageNum = 0; set a page number
pageNo = jso.pageNum; get a page number
edit: 3.3.19
Mmmh, it seems you have to work with AVDoc in order to get the current actual page via jso.pageNum . Also if you work with AVdoc the Acobat window stay hidden in the background. Example:
strFilename = "d:\Test2.pdf"
set avDoc = CreateObject("AcroExch.AVDoc")
If (avDoc.Open(strFilename,"")) Then
Set pdDoc = avDoc.getPDDoc()
Set jso = pdDoc.GetJSObject
pageNo = jso.pageNum
msgbox(pageNo)
end if
I have a routine that logs in and calls a script from the page where you are logged in, but this report is available to save from the internet and I need it saved automatically without user intervention.
Sub ConnectWeb()
Dim ie As InternetExplorer
Dim C
Dim ULogin As Boolean, ieForm
Dim MyPass As String, MyLogin As String
Dim Linha As Integer
Dim PN As String
MyLogin = Application.InputBox("Por Favor entre com o Login", "Empresa", Default:="User", Type:=2)
MyPass = Application.InputBox("Por favor entre com a senha", "Empresa", Default:="Password", Type:=2)
Set ie = New InternetExplorer
ie.Visible = True
ie.Navigate "http://url"
Do Until ie.ReadyState = READYSTATE_COMPLETE
Loop
ie.Document.all("cuser").innerText = MyLogin
ie.Document.all("cpass").innerText = MyPass
ie.Document.getElementById("cent").Value = "BR"
ie.Document.forms(0).submit
Do While ie.Busy
DoEvents
Loop
PN = "D515005-5304"
'JavaScript to create file
ie.Document.parentWindow.execScript ("printPL('" & PN & "','N%2FC','no')")
End Sub
After the Java script call appears the message with the file generated and the option of Internet Explorer Save, at the moment I need the file to be saved automatically, and preferably in a specific path.
Since then, thanks for the help.
I chose to use chrome and found that javascript sends information byurl, so I used the following logic
Sub test()
Dim chromePath, PN As String
Dim Linha As Integer
chromePath = """C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"""
Linha = 2
Cells(Linha, "A").Select
Do While ActiveCell <> ""
PN = Cells(Linha, "A").Value
strURL = "http://cdroyal4.cdzodiac.com/cgi-bin/wspd_cgi.sh/WService=wslive/wpt/function.r?funct=print.pl&drawing-no=" & PN & "&prelim=no&rev=N%2FC&filename=" & PN
Shell (chromePath & strURL)
Linha = Linha + 1
Cells(Linha, "A").Select
Loop
MsgBox "Complete", vbInformation, "Empresa"
End Sub
With Chrome the download is done automatically, resulting in the expected result.
Thanks to all who have reviewed
I'm trying to loop through a bunch of PDFs and, for each one, flatten it using the Acrobat API to access the PDF's JS Object. Because VBA doesn't have a code-accessible stack trace, I have a wrapper class AcroExchWrapper that wraps each Acrobat API function to identify the function if it fails, e.g.
Public Sub ClosePDF()
On Error GoTo errHandler
'AcroDoc is created with CreateObject("AcroExch.PDDoc")
AcroDoc.Close
Exit Sub
errHandler:
Err.Raise Err.Number, "AcroExchWrapper.ClosePDF (" & Err.source & ")", Err.Description, Err.HelpFile, Err.HelpContext
End Sub
Each PDF is flattened using the following code:
Private Function flattenPDF(pdfPath As String, savePath As String, deleteOld As Boolean) As String
Dim pageCount As Integer
Dim jso As Object
Dim saveResult As Long
Dim openResult As Long
Dim creatingApp As String
On Error GoTo errHandler
creatingApp = ""
If deleteOld Then
If fso.FileExists(savePath) Then
fso.DeleteFile savePath
End If
End If
acroExch.ClosePDF
openResult = acroExch.OpenPDF(pdfPath)
If openResult = 0 Then
Err.Raise "-1", "Flattener.flattenPDF", "unable to open PDF"
End If
DoEvents
Set jso = acroExch.GetJSObject
DoEvents
If jso Is Nothing Then
Err.Raise "-1", "Flattener.flattenPDF", "unable to get JS object"
Else
pageCount = acroExch.GetNumPages
creatingApp = acroExch.GetInfo("Creator")
If pageCount <> -1 Then
flattenPages jso, 0, pageCount - 1
Else
flattenPages jso
End If
DoEvents
saveResult = acroExch.SavePDF(savePath)
If saveResult = 0 Then
Err.Raise "-1", "Flattener.flattenPDF", "unable to save PDF"
End If
acroExch.ClosePDF
If pdfPath <> savePath Then
loggerObj.addEntry pdfPath, savePath, "flattened", creatingApp
End If
End If
flattenPDF = savePath
Exit Function
errHandler:
loggerObj.addErrorEntryFromErrorObj pdfPath, "", "flattenPDF", Err, creatingApp
flattenPDF = ""
End Function
'wrapper for jso.flattenPages to allow for stack trace
Private Sub flattenPages(jso As Object, Optional startPage As Long = -1, Optional endPage As Long = -1)
On Error GoTo errHandler
If startPage = -1 Then
jso.flattenPages
Else
jso.flattenPages startPage, endPage
End If
Exit Sub
errHandler:
Err.Raise Err.Number, "flattenPages (" & Err.source & ")", Err.Description, Err.HelpFile, Err.HelpContext
End Sub
After going through several thousand files--the number is different each time I run the script--flattenPages() raises the following error:
Automation error
The remote procedure call failed.
After that, for all remaining files, when flattenPDF() runs, the first call to acroExch.ClosePDF() raises this error:
The remote server machine does not exist or is unavailable
I haven't been able to find any documentation on why these errors occur when using Acrobat API, whether with the Javascript API (jso.flattenPages()) or the IAC API (PDDoc.Close()). More mysterious is that the application runs fine until it gets to some varying number of files and only then starts throwing these exceptions.
EDIT: I added the following function to AcroExchWrapper to reset Acrobat, to be performed every 100 files:
Public Sub Reset()
ClosePDF
AcroApp.Exit
Set AcroApp = CreateObject("AcroExch.App")
Set AcroDoc = CreateObject("AcroExch.PDDoc")
Exit Sub
However, the same exceptions are still being thrown. This seems to be occurring once 1500-2500 files have been processed.
I am new to vba and coding so, I need help from you guys.
PurpOse of this vba code:
1. open internet explore
2. Put my user id & password and login.
3. select current date.
4. Select a particular option (say X1) from 1st dropdown.
5. Then select particular option (say Y1 which come after selecting X1, if i select X2 diffrent options will come in 2nd dropdown) from 2nd drop down.
6. then select Z1 option in 3rd dropdown which will come only if i am selecting Y1 in 2nd dropdown.
7. then click "save" my selections
Now, i have wriiten a code which correctly performs till step 4. but after that it is not able to select the correct option in 2nd and 3rd drop downs.
Don't know why.
Code I am using:
Dim HTMLDoc As HTMLDocument
Dim oBrowser As InternetExplorer
Private Declare PtrSafe Sub Sleep Lib "kernel32" (ByVal dwMilliseconds As LongPtr)
Private Declare PtrSafe Function apiShowWindow Lib "user32" Alias "ShowWindow" _
(ByVal hwnd As Long, ByVal nCmdShow As Long) As Long
Const SW_MAXIMIZE = 3
Const SW_SHOWNORMAL = 1
Const SW_SHOWMINIMIZED = 2
Private Sub Merchtimetracker_Click()
Dim oHTML_Element As IHTMLElement
Dim sURL As String
Dim ie As Object
Set ie = CreateObject("internetexplorer.application")
On Error GoTo Err_Clear
With ie
.Visible = True
apiShowWindow ie.hwnd, SW_MAXIMIZE
.navigate "https://xxxxxxx"
Do While .Busy
DoEvents
Loop
Do While .readyState <> 4
DoEvents
Loop
End With
Set emailid = ie.document.getelementbyid("emailid")
emailid.Value = "xxxxxx"
Set Password = ie.document.getelementbyid("password")
Password.Value = "xxxxx"
ie.document.getelementsbyname("login_now")(0).Click
Err_Clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
Do While ie.readyState <> 4 Or ie.Busy = True
DoEvents
Loop
ie.document.getelementbyid("datepicker").Value = Format(Date, "yyyy-mm-dd") 'write "Format (Date, "yyyy-mm-dd")" when want to give today's date and if not then just write ("2016-09-13")
Set project = ie.document.getelementbyid("project")
For i = 0 To project.Options.Length - 1
If project.Options(i).Text = "X1" Then
project.selectedindex = i
For j = 0 To task.Options.Length - 1
If task.Options(j).Text = "Y1" Then
task.selectedindex = j
Exit for
End If
Next j
Exit For
End If
Next i
End subcode
Here is a (mostly) simple way to pass a value to a ComboBox.
Sub passValueToComboBox1()
Dim ie As Object
Dim oHTML_Element As IHTMLElement
Set ie = CreateObject("InternetExplorer.Application")
ie.Visible = True
ie.navigate "http://peterschleif.bplaced.net/excel/combobox/index.php"
While ie.Busy Or ie.readyState <> 4: DoEvents: Wend
Set oHTML_Element = ie.document.getElementsByName("selectedReportClass")(0)
If Not oHTML_Element Is Nothing Then oHTML_Element.Value = "com.db.moap.report.FUBU7"
For Each oHTML_Element In ie.document.getElementsByTagName("input")
If oHTML_Element.Type = "submit" Then oHTML_Element.Click: Exit For
Next
End Sub