From 2cc97d254b62b8d23289c539ef79a401a31bc9bd Mon Sep 17 00:00:00 2001 From: youronlydimwit <79888425+youronlydimwit@users.noreply.github.com> Date: Wed, 1 Nov 2023 13:03:28 +0700 Subject: [PATCH] Add files via upload --- VBA/CosineSimilarity.bas | 50 ++++++++++++++++++++++++++++++++++ VBA/JaroWinklerSimilarity.bas | 47 ++++++++++++++++++++++++++++++++ VBA/LevenshteinSimilarity.bas | 25 +++++++++++++++++ VBA/NGramJaccardSimilarity.bas | 45 ++++++++++++++++++++++++++++++ VBA/SorensenDiceSimilarity.bas | 40 +++++++++++++++++++++++++++ 5 files changed, 207 insertions(+) create mode 100644 VBA/CosineSimilarity.bas create mode 100644 VBA/JaroWinklerSimilarity.bas create mode 100644 VBA/LevenshteinSimilarity.bas create mode 100644 VBA/NGramJaccardSimilarity.bas create mode 100644 VBA/SorensenDiceSimilarity.bas diff --git a/VBA/CosineSimilarity.bas b/VBA/CosineSimilarity.bas new file mode 100644 index 0000000..586a715 --- /dev/null +++ b/VBA/CosineSimilarity.bas @@ -0,0 +1,50 @@ +Attribute VB_Name = "Module4" +Function SimilarityCosine(s1 As String, s2 As String) As Double + Dim words1 As Object + Set words1 = CreateObject("Scripting.Dictionary") + + Dim words2 As Object + Set words2 = CreateObject("Scripting.Dictionary") + + Dim i As Integer + Dim j As Integer + Dim dotProduct As Double + Dim magnitude1 As Double + Dim magnitude2 As Double + + ' Split the strings into words and create vectors + Dim words As Variant + words = Split(s1 & " " & s2, " ") + + For i = LBound(words) To UBound(words) + If Trim(words(i)) <> "" Then + words1(Trim(words(i))) = 0 + words2(Trim(words(i))) = 0 + End If + Next i + + ' Populate the vectors + For i = LBound(words) To UBound(words) + If Trim(words(i)) <> "" Then + If i <= UBound(Split(s1, " ")) Then + words1(Trim(words(i))) = words1(Trim(words(i))) + 1 + Else + words2(Trim(words(i))) = words2(Trim(words(i))) + 1 + End If + End If + Next i + + ' Calculate dot product and magnitudes + For Each Key In words1.Keys + dotProduct = dotProduct + (words1(Key) * words2(Key)) + magnitude1 = magnitude1 + (words1(Key) * words1(Key)) + magnitude2 = magnitude2 + (words2(Key) * words2(Key)) + Next Key + + ' Calculate cosine similarity + If magnitude1 * magnitude2 = 0 Then + SimilarityCosine = 0 + Else + SimilarityCosine = dotProduct / (Sqr(magnitude1) * Sqr(magnitude2)) + End If +End Function diff --git a/VBA/JaroWinklerSimilarity.bas b/VBA/JaroWinklerSimilarity.bas new file mode 100644 index 0000000..a87aa7b --- /dev/null +++ b/VBA/JaroWinklerSimilarity.bas @@ -0,0 +1,47 @@ +Attribute VB_Name = "Module2" +Function SimilarityJaroWinkler(s1 As String, s2 As String) As Double + Dim m As Integer ' Number of matching characters + Dim t As Integer ' Number of transpositions + Dim l1 As Integer ' Length of string 1 + Dim l2 As Integer ' Length of string 2 + Dim maxLen As Integer + Dim prefix As Integer + Dim jw As Double ' Jaro-Winkler similarity + + l1 = Len(s1) + l2 = Len(s2) + maxLen = IIf(l1 > l2, l1, l2) + + ' Calculate the number of matching characters + m = 0 + t = 0 + For i = 1 To l1 + For j = IIf(i - 2 > 0, i - 2, 1) To IIf(i + 2 <= l2, i + 2, l2) + If Mid(s1, i, 1) = Mid(s2, j, 1) Then + m = m + 1 + If i <> j Then t = t + 1 + Exit For + End If + Next j + Next i + + ' Calculate the prefix length + prefix = 0 + For i = 1 To IIf(l1 > 4, 4, l1) + If Mid(s1, i, 1) = Mid(s2, i, 1) Then + prefix = prefix + 1 + Else + Exit For + End If + Next i + + ' Calculate the Jaro similarity + If m = 0 Then + jw = 0 + Else + jw = (m / l1 + m / l2 + (m - t) / m) / 3 + jw = jw + (prefix * 0.1 * (1 - jw)) + End If + + SimilarityJaroWinkler = jw +End Function diff --git a/VBA/LevenshteinSimilarity.bas b/VBA/LevenshteinSimilarity.bas new file mode 100644 index 0000000..6c120ed --- /dev/null +++ b/VBA/LevenshteinSimilarity.bas @@ -0,0 +1,25 @@ +Attribute VB_Name = "Module1" +Function SimilarityLevenshtein(target As String, reference As String) As Double + Dim maxLen As Long + Dim minLen As Long + Dim distance As Double + Dim similarity As Double + + ' Remove spaces and convert to lowercase + target = Replace(LCase(target), " ", "") + reference = Replace(LCase(reference), " ", "") + + maxLen = IIf(Len(target) > Len(reference), Len(target), Len(reference)) + minLen = IIf(Len(target) < Len(reference), Len(target), Len(reference)) + + distance = maxLen - minLen + For i = 1 To minLen + If Mid(target, i, 1) <> Mid(reference, i, 1) Then + distance = distance + 1 + End If + Next i + + similarity = (1 - distance / maxLen) + SimilarityLevenshtein = similarity +End Function + diff --git a/VBA/NGramJaccardSimilarity.bas b/VBA/NGramJaccardSimilarity.bas new file mode 100644 index 0000000..64fd09c --- /dev/null +++ b/VBA/NGramJaccardSimilarity.bas @@ -0,0 +1,45 @@ +Attribute VB_Name = "Module5" +Function SimilarityNGramJaccard(s1 As String, s2 As String, n As Integer) As Double + Dim set1 As Object + Set set1 = CreateObject("Scripting.Dictionary") + + Dim set2 As Object + Set set2 = CreateObject("Scripting.Dictionary") + + Dim i As Integer + Dim ngram1 As String + Dim ngram2 As String + + ' Create n-grams from string 1 + For i = 1 To Len(s1) - n + 1 + ngram1 = Mid(s1, i, n) + set1(ngram1) = 1 + Next i + + ' Create n-grams from string 2 + For i = 1 To Len(s2) - n + 1 + ngram2 = Mid(s2, i, n) + set2(ngram2) = 1 + Next i + + ' Calculate the size of the intersection of the n-gram sets + Dim intersectionSize As Integer + intersectionSize = 0 + + For Each Key In set1.Keys + If set2.Exists(Key) Then + intersectionSize = intersectionSize + 1 + End If + Next Key + + ' Calculate the n-gram similarity + Dim similarity As Double + + If set1.Count + set2.Count = 0 Then + similarity = 1 + Else + similarity = intersectionSize / (set1.Count + set2.Count - intersectionSize) + End If + + SimilarityNGramJaccard = similarity +End Function diff --git a/VBA/SorensenDiceSimilarity.bas b/VBA/SorensenDiceSimilarity.bas new file mode 100644 index 0000000..593d440 --- /dev/null +++ b/VBA/SorensenDiceSimilarity.bas @@ -0,0 +1,40 @@ +Attribute VB_Name = "Module3" +Function SimilaritySorensenDiceCoef(s1 As String, s2 As String) As Double + Dim set1 As Object + Set set1 = CreateObject("Scripting.Dictionary") + + Dim set2 As Object + Set set2 = CreateObject("Scripting.Dictionary") + + Dim i As Integer + + ' Create sets of characters from both strings + For i = 1 To Len(s1) + set1(Mid(s1, i, 1)) = 1 + Next i + + For i = 1 To Len(s2) + set2(Mid(s2, i, 1)) = 1 + Next i + + ' Calculate the size of the intersection of the sets + Dim intersectionSize As Integer + intersectionSize = 0 + + For Each Key In set1.Keys + If set2.Exists(Key) Then + intersectionSize = intersectionSize + 1 + End If + Next Key + + ' Calculate the Sørensen-Dice coefficient + Dim coefficient As Double + + If set1.Count + set2.Count = 0 Then + coefficient = 1 + Else + coefficient = 2 * intersectionSize / (set1.Count + set2.Count) + End If + + SimilaritySorensenDiceCoef = coefficient +End Function