Home » excel » excel – Parsing HTML with VBA

excel – Parsing HTML with VBA

Posted by: admin April 23, 2020 Leave a comment

Questions:

I am trying to pull data from some 500 urls of a website. All the pages are same in structure. I am facing a problem with understanding the HTML of this particular site

https://www.coworker.com/s-f/6033/united-states_hawaii_honolulu_impact-hub-honolulu

I want to extract Name, Address, Tel and website. My current code:

Sub GetData()
    Dim wsSheet As Worksheet, Rows As Long, links As Variant, IE As Object, link As Variant
    Set wb = ThisWorkbook
    Set wsSheet = wb.Sheets("Sheet1")

    Set IE = New InternetExplorer

    Rows = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
    links = wsSheet.Range("A1:A" & Rows)

    With IE
        .Visible = True
        For Each link In links
            .navigate (link)
            While .Busy Or .readyState <> 4: DoEvents: Wend

        Next
    End With
End Sub
How to&Answers:

Here you go. Without more links to test with this is very fragile. It relies heavily on consistent styling across pages.


XHR Looping link list:

Option Explicit
Public Sub GetInfo()
    Dim wsSheet As Worksheet, Rows As Long, links(), link As Long, wb As Workbook, html As HTMLDocument
    Set wb = ThisWorkbook: Set wsSheet = wb.Sheets("Sheet1")
    Application.ScreenUpdating = False

    With wsSheet
        Rows = .Cells(.Rows.Count, "A").End(xlUp).Row
        If Rows = 1 Then
            ReDim links(1 To 1, 1 To 1)
            links(1, 1) = wsSheet.Range("A1")
        Else
            links = wsSheet.Range("A1:A" & Rows).Value
        End If
        Dim r As Long
        For link = LBound(links, 1) To UBound(links, 1)
            r = r + 1
            Set html = GetHTML(links(link, 1))
            On Error Resume Next
            Dim aNodeList As Object: Set aNodeList = html.querySelectorAll(".col-xs-12.pade_none.muchroom_mail")
            .Cells(r, 2) = "Name: " & html.querySelector(".col-sm-9.col-md-9.col-xs-12.pade_none.kohub_space_headings h2").innerText
            .Cells(r, 3) = "Address: " & aNodeList.item(0).innerText
            .Cells(r, 4) = "Tel: " & aNodeList.item(1).innerText
            .Cells(r, 5) = "Website: " & html.querySelector(".website-link-text a[href]").getAttribute("href")
             On Error GoTo 0
        Next link
    End With
    Application.ScreenUpdating = True
End Sub

Public Function GetHTML(ByVal url As String) As HTMLDocument
    Dim sResponse As String, html As New HTMLDocument
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", url, False
        .send
        sResponse = StrConv(.responseBody, vbUnicode)
    End With
    sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))

    With html
        .body.innerHTML = sResponse
    End With
    Set GetHTML = html
End Function

Output:

output


References (VBE>Tools>References):

  1. HTML object Library

Internet Explorer:

Option Explicit
Public Sub GetInfo()
    Dim wsSheet As Worksheet, Rows As Long, links(), link As Long, wb As Workbook, html As HTMLDocument, ie As InternetExplorer
    Set wb = ThisWorkbook: Set wsSheet = wb.Sheets("Sheet1")
    Application.ScreenUpdating = False
    With wsSheet
        Rows = .Cells(.Rows.Count, "A").End(xlUp).Row
        If Rows = 1 Then
            ReDim links(1, 1)
            links(1, 1) = wsSheet.Range("A1")
        Else
            links = wsSheet.Range("A1:A" & Rows).Value
        End If
        Dim r As Long

        Set ie = New InternetExplorer
        ie.Visible = True

        For link = LBound(links, 1) To UBound(links, 1)
            ie.navigate links(link, 1)
            While ie.Busy Or ie.readyState < 4: DoEvents: Wend
          '  Application.Wait Now + TimeSerial(0, 0, 10)
            On Error Resume Next
            r = r + 1: Set html = ie.document
            .Cells(r, 2) = "Name: " & html.querySelector(".col-sm-9.col-md-9.col-xs-12.pade_none.kohub_space_headings h2").innerText
            .Cells(r, 3) = "Address: " & html.querySelector(".col-xs-12.pade_none.muchroom_mail").innerText
            .Cells(r, 4) = "Tel: " & html.querySelector(".fa.fa-phone.fa-rotate-270 ~ a").innerText
            .Cells(r, 5) = "Website: " & html.querySelector(".website-link-text a[href]").getAttribute("href")
            On Error GoTo 0
        Next link
        ie.Quit
    End With
    Application.ScreenUpdating = True
End Sub

References (VBE>Tools>References):

  1. HTML object Library
  2. Microsoft Internet Controls