shahin
Active Member
Hi there! Hope you all are doing well. Recently working with some code i got stuck pulling specific data from a certain
web. So far when i worked with sites i could notice that every documents are usually embedded in a regular way
as in, when i go deeper i meant, in multilayered sites i noticed that every link connected to another finally
ends at some point where the elements by which they are called are same. But, in a specific site when i run my
code i see that going to the second layer some links end and some go even deeper. So, for the links end there and
for those going deeper are not with the same elements to be called. A slight tuning will help me accomplish my code.
If they ended at the same time i could do that myself. I tried to make you understand what i'm facing with. For your consideration. I've uploaded the file and gonna paste here the code.
[Const pageurl As String = "http://www.bjs.com"
Public Sub parsehtml()
Dim http As New MSXML2.XMLHTTP60, html As New HTMLDocument, hmm As New HTMLDocument, hmc As New HTMLDocument
Dim topics As Object, topic As Object, fla As Object, z As String, zz As String, vla As Object
Dim i As Long, x As Long, mla As Object, link As String, aa As String, qq As String, docs As Object
Dim cc As String, posts As Object, dla As Object, m As Long, la As Object, validlinks As String, refinedlinks As String
x = 2
http.Open "GET", "http://www.bjs.com/", False
http.send
html.body.innerHTML = http.responseText
Set topics = html.getElementsByClassName("shop-categories")(0)
Set mla = topics.getElementsByTagName("a")
For m = 0 To mla.Length - 1
z = mla(m).getAttribute("href")
link = pageurl & Mid(z, InStr(z, ":") + 1)
Next m
http.Open "GET", link, False
http.send
hmm.body.innerHTML = http.responseText
Set posts = hmm.getElementsByClassName("brick")
For Each fla In posts
Set dla = fla.getElementsByTagName("a")(0)
aa = dla.getAttribute("href")
qq = IIf(Right(aa, 2) = ".1", aa, "")
zz = pageurl & Mid(qq, InStr(qq, ":") + 1)
cc = IIf(Right(zz, 2) = ".1", zz, "")
If cc <> "" Then
refinedlinks = cc
End If
validlinks = refinedlinks
' Cells(x, 1) = validlinks ''Now it produces valid links with some duplicates that
' ''i don't want moreover some go deep some end here. ''So links are here with different lengths.
' x = x + 1
Next fla
''' I'm stuck at this point. Not i can pull links from here nor can go '''deeper. Because object elements are not same for all the links.
http.Open "GET", validlinks, False
http.send
hmc.body.innerHTML = http.responseText
Set topic = hmc.getElementsByClassName("category ng-scope")
For Each docs In topic
Set vla = docs.getElementsByTagName("a")(0)
Cells(x, 1) = vla.getAttribute("href")
x = x = 1
Next docs
End Sub]
web. So far when i worked with sites i could notice that every documents are usually embedded in a regular way
as in, when i go deeper i meant, in multilayered sites i noticed that every link connected to another finally
ends at some point where the elements by which they are called are same. But, in a specific site when i run my
code i see that going to the second layer some links end and some go even deeper. So, for the links end there and
for those going deeper are not with the same elements to be called. A slight tuning will help me accomplish my code.
If they ended at the same time i could do that myself. I tried to make you understand what i'm facing with. For your consideration. I've uploaded the file and gonna paste here the code.
[Const pageurl As String = "http://www.bjs.com"
Public Sub parsehtml()
Dim http As New MSXML2.XMLHTTP60, html As New HTMLDocument, hmm As New HTMLDocument, hmc As New HTMLDocument
Dim topics As Object, topic As Object, fla As Object, z As String, zz As String, vla As Object
Dim i As Long, x As Long, mla As Object, link As String, aa As String, qq As String, docs As Object
Dim cc As String, posts As Object, dla As Object, m As Long, la As Object, validlinks As String, refinedlinks As String
x = 2
http.Open "GET", "http://www.bjs.com/", False
http.send
html.body.innerHTML = http.responseText
Set topics = html.getElementsByClassName("shop-categories")(0)
Set mla = topics.getElementsByTagName("a")
For m = 0 To mla.Length - 1
z = mla(m).getAttribute("href")
link = pageurl & Mid(z, InStr(z, ":") + 1)
Next m
http.Open "GET", link, False
http.send
hmm.body.innerHTML = http.responseText
Set posts = hmm.getElementsByClassName("brick")
For Each fla In posts
Set dla = fla.getElementsByTagName("a")(0)
aa = dla.getAttribute("href")
qq = IIf(Right(aa, 2) = ".1", aa, "")
zz = pageurl & Mid(qq, InStr(qq, ":") + 1)
cc = IIf(Right(zz, 2) = ".1", zz, "")
If cc <> "" Then
refinedlinks = cc
End If
validlinks = refinedlinks
' Cells(x, 1) = validlinks ''Now it produces valid links with some duplicates that
' ''i don't want moreover some go deep some end here. ''So links are here with different lengths.
' x = x + 1
Next fla
''' I'm stuck at this point. Not i can pull links from here nor can go '''deeper. Because object elements are not same for all the links.
http.Open "GET", validlinks, False
http.send
hmc.body.innerHTML = http.responseText
Set topic = hmc.getElementsByClassName("category ng-scope")
For Each docs In topic
Set vla = docs.getElementsByTagName("a")(0)
Cells(x, 1) = vla.getAttribute("href")
x = x = 1
Next docs
End Sub]
Attachments
Last edited: