Home » excel » arrays – Issue counting word frequencies with VBA: same data, different numbers

arrays – Issue counting word frequencies with VBA: same data, different numbers

Posted by: admin May 14, 2020 Leave a comment

Questions:

I’ve made two different scripts in VBA to count the frequency of words contained in a CSV. Both scripts run fine, but I get different numbers for each word and I don’t know why. Here are some of the steps that lead to the moment when the difference appears

Script 1:

Sub Dict_Array_1()

Dim Wb As Workbook, Wb1 As Workbook
Dim Ws As Worksheet, Ws1 As Worksheet
Dim Fd As Office.FileDialog
Dim StrFile As String
Dim i As Long, a As Long, LastR As Long
Dim Arr() As Variant
Dim Ban_() As String, T As String
Dim Ban As Object, Dict As Object
Dim Carac As Variant, w As Variant, Key As Variant 

Application.ScreenUpdating = False
Application.Calculation = xlCalculationManual
Application.DisplayStatusBar = False
Application.EnableEvents = False
ActiveSheet.DisplayPageBreaks = False
Application.DisplayAlerts = False

Set Wb = ActiveWorkbook
Set Ws = Wb.ActiveSheet


'---------- CSV ---------------------------------------------------------------------------------------------------------------

Set Fd = Application.FileDialog(msoFileDialogFilePicker)
With Fd
  .AllowMultiSelect = False
  .Title = "Select doc"
  .Filters.Clear
  .Filters.Add "Doc CSV (*.csv)", "*.csv"

    If .Show Then

        On Error GoTo ErrOpen 'ignore this
        Set Wb1 = Workbooks.Open(.SelectedItems(1), ReadOnly:=True, Local:=False) 
        On Error GoTo 0

        Set Ws1 = Wb1.Sheets(1)
        With Ws1
            LastR = .Cells(.Rows.Count, "S").End(xlUp).Row 

            Arr = .Range(Cells(1, 19), Cells(LastR, 19)).Value2 
        End With

        Wb1.Close 0
        Set Wb1 = Nothing
        Set Ws1 = Nothing
    Else
        Exit Sub
    End If
End With

'---------------------------------------- COUNT ----------------------------------------------------------------------------------------------------
'Array with words i want to ban
Ban_ = Split("word1,word2,word3,etc", ",")

'Array with caract i want to ban
Carac = Array(".", ",", ";", ":", "!", "#", "$", "%", "&", "(", ")", "- ", "_", "--", "+", _
                            "=", "~", "/", "\", "{", "}", "[", "]", """", "?", "*", ">>", "»", "«")

Set Ban = CreateObject("Scripting.Dictionary") 'need late binding
Ban.CompareMode = vbTextCompare 'case insensitive
For i = 0 To UBound(Ban_)
    Ban.Add Ban_(i), 1
Next i
Erase Ban_

'Dict to count words
Set Dict = CreateObject("Scripting.Dictionary")
Dict.CompareMode = vbTextCompare 'case insensitive
For a = 1 To UBound(Arr, 1) 
    If Not IsError(Arr(a, 1)) 
        T = Arr(a, 1)
        For i = 0 To UBound(Carac)
            T = Replace(T, Carac(i), "", , , vbTextCompare) 
        Next i
        T = Application.Trim(T) 


        For Each w In Split(T, " ")
            If Not Ban.exists(w) Then
                If Not Dict.exists(w) Then
                    Dict.Add w, 1
                Else
                    Dict.Item(w) = Dict.Item(w) + 1 
                End If
            End If
        Next w
    End If
Next a
Exit Sub

Erase Arr
Erase Carac
Set Ban = Nothing

Script 2 is basically the same, only difference is that I access the .CSV in another way:

Sub Dict_ADODB()
Dim Wb As Workbook, Wb1 As Workbook
Dim Ws As Worksheet, Ws1 As Worksheet
Dim Fd As Office.FileDialog
Dim StrFile As String
Dim i As Long, a As Long, LastR As Long
Dim Arr() As Variant
Dim Ban_() As String, T As String
Dim Ban As Object, Dict As Object
Dim Carac As Variant, w As Variant, Key As Variant 
Dim ObjC As Object, ObjR As Object 'Object Connection / Object Recordset
Const adOpenStatic = 3
Const adLockOptimistic = 3

Application.ScreenUpdating = False
Application.Calculation = xlCalculationManual
Application.DisplayStatusBar = False
Application.EnableEvents = False
ActiveSheet.DisplayPageBreaks = False
Application.DisplayAlerts = False

Set Wb = ActiveWorkbook
Set Ws = Wb.ActiveSheet


'---------- CSV ---------------------------------------------------------------------------------------------------------------

Set Fd = Application.FileDialog(msoFileDialogFilePicker)
With Fd
  .AllowMultiSelect = False
  .Title = "Select doc"
  .Filters.Clear
  .Filters.Add "Doc CSV (*.csv)", "*.csv"

    If .Show Then
        '----------- ADODB ---
        Set ObjC = CreateObject("ADODB.Connection")
        Set ObjR = CreateObject("ADODB.RecordSet")

        On Error GoTo ErrOpen 
        ObjC.Open "Provider=Microsoft.Jet.OLEDB.4.0;" & _
                  "Data Source=" & .InitialFileName & ";" & _
                  "Extended Properties=""text;HDR=YES;FMT=Delimited;CharacterSet=65001""" 
        On Error GoTo 0
        'I just need one column
        ObjR.Open "SELECT Message FROM " & Right(.SelectedItems(1), Len(.SelectedItems(1)) - InStrRev(.SelectedItems(1), "\")) & _
                    " WHERE Message IS NOT NULL", _
          ObjC, adOpenStatic, adLockOptimistic
        Arr = ObjR.GetRows() 

        ObjR.Close
        ObjC.Close
        Set ObjR = Nothing
        Set ObjC = Nothing
    Else
        Exit Sub
    End If
End With

'---------------------------------------- COUNT ----------------------------------------------------------------------------------------------------
'Array with word I don't need
Ban_ = Split("word1,word2", ",")

Carac = Array(".", ",", ";", ":", "!", "#", "$", "%", "&", "(", ")", "- ", "_", "--", "+", _
                            "=", "~", "/", "\", "{", "}", "[", "]", """", "?", "*", ">>", "»", "«")

Set Ban = CreateObject("Scripting.Dictionary") 
Ban.CompareMode = vbTextCompare 
For i = 0 To UBound(Ban_)
    Ban.Add Ban_(i), 1
Next i
Erase Ban_

Set Dict = CreateObject("Scripting.Dictionary")
Dict.CompareMode = vbTextCompare 'case insensitive
For a = 0 To UBound(Arr, 2) 
    If Not IsError(Arr(0, a)) Then 
        T = Arr(0, a)
        For i = 0 To UBound(Carac)
            T = Replace(T, Carac(i), "", , , vbTextCompare) 
        Next i
        T = Application.Trim(T) 

        For Each w In Split(T, " ")
            If Not Ban.exists(w) Then
                If Not Dict.exists(w) Then
                    Dict.Add w, 1
                Else
                    Dict.Item(w) = Dict.Item(w) + 1 
                End If
            End If
        Next w
    End If
Next a

Erase Arr
Erase Carac
Set Ban = Nothing
Exit Sub

Here you go. When I do dict.count I do find that the total number of entries is different, which is only partly explained by the use of “WHERE Message IS NOT NULL”. Any idea why would be greatly appreciated!

How to&Answers:

The best case to see what is happening is to write some log this line:

Dict.Add w, 1

E.g., if the values are up to 200, then write:

Dim cnt as long
Dict.Add w, 1
cnt = cnt + 1
Debug.Print cnt, w

If the values are above 200, then only the last 200 would be displayed on the immediate window, thus it will not help you a lot. You can build a String with the log and print the String in a Notepad exactly with the same.

Dim cnt       as Long
Dim logString as String
Dict.Add w, 1
cnt = cnt + 1
logString = logString & VbCrLF & cnt, w

And at the end CreateLogFile logString:

Sub CreateLogFile(Optional strPrint As String)

    Dim fs                      As Object
    Dim obj_text                As Object
    Dim str_filename            As String
    Dim str_new_file            As String
    Dim str_shell               As String

    str_new_file = "\tests_info\"

    str_filename = ThisWorkbook.Path & str_new_file
    If Dir(ThisWorkbook.Path & str_new_file, vbDirectory) = vbNullString Then
         MkDir ThisWorkbook.Path & str_new_file
    End If

    Set fs = CreateObject("Scripting.FileSystemObject")
    Set obj_text = fs.CreateTextFile(str_filename & "\sometext.txt", True)

    obj_text.writeline (strPrint)
    obj_text.Close

    str_shell = "C:\WINDOWS\notepad.exe "
    str_shell = str_shell & str_filename & "\sometext.txt"
    Shell str_shell

End Sub

Answer:

Alright, using a Schema.ini seems to have fixed my issue. Something that is not clear in the documentation is that one should set “colX= Y Type” for each column in the CSV until the one he wants to select (at first I only set “Col19=Message” but it failed because the previous columns where not set…).

I’m sharing the relevant part of the code for anyone interested (Excel 2010 / X86 version):

  Set fs = CreateObject("Scripting.FileSystemObject")
  Set obj_text = fs.CreateTextFile(.InitialFileName & "\Schema.ini", True) 
  obj_text.write ("[" & Right(.SelectedItems(1), Len(.SelectedItems(1)) - InStrRev(.SelectedItems(1), "\")) & "]" & vbNewLine & _
                  "ColNameHeader=False" & vbNewLine & _
                  "CharacterSet=65001" & vbNewLine & _
                  "Format=CSVDelimited" & vbNewLine & _
                  "DecimalSymbol=." & vbNewLine & _
                  "Col1=1 Text" & vbNewLine & _
                  "Col2=2 Text" & vbNewLine & _
                  "Col3=3 Text" & vbNewLine & _
                  "Col4=4 Text" & vbNewLine & _
                  "Col5=5 Text" & vbNewLine & _
                  "Col6=6 Text" & vbNewLine & _
                  "Col7=7 Text" & vbNewLine & _
                  "Col8=8 Text" & vbNewLine & _
                  "Col9=9 Text" & vbNewLine & _
                  "Col10=10 Text" & vbNewLine & _
                  "Col11=11 Text" & vbNewLine & _
                  "Col12=12 Text" & vbNewLine & _
                  "Col13=13 Text" & vbNewLine & _
                  "Col14=14 Text" & vbNewLine & _
                  "Col15=15 Text" & vbNewLine & _
                  "Col16=16 Text" & vbNewLine & _
                  "Col17=17 Text" & vbNewLine & _
                  "Col18=18 Text" & vbNewLine & _
                  "Col19=GOODONE Memo") 'set all the previous cols until the one I need!
  obj_text.Close

  Set ObjC = CreateObject("ADODB.Connection")
  Set ObjR = CreateObject("ADODB.RecordSet")

  ObjC.Open "Provider=Microsoft.Jet.OLEDB.4.0;" & _
            "Data Source=" & .InitialFileName & ";" & _
            "Extended Properties=""text;HDR=No;"""

  ObjR.Open "SELECT GOODONE FROM " & Right(.SelectedItems(1), Len(.SelectedItems(1)) - InStrRev(.SelectedItems(1), "\")), _
    ObjC, 0, 1 

  Arr = ObjR.GetRows()