mirror of
https://github.com/youronlydimwit/Data_ScienceUse_Cases.git
synced 2025-12-18 16:21:15 +01:00
Add files via upload
This commit is contained in:
50
VBA/CosineSimilarity.bas
Normal file
50
VBA/CosineSimilarity.bas
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
Attribute VB_Name = "Module4"
|
||||||
|
Function SimilarityCosine(s1 As String, s2 As String) As Double
|
||||||
|
Dim words1 As Object
|
||||||
|
Set words1 = CreateObject("Scripting.Dictionary")
|
||||||
|
|
||||||
|
Dim words2 As Object
|
||||||
|
Set words2 = CreateObject("Scripting.Dictionary")
|
||||||
|
|
||||||
|
Dim i As Integer
|
||||||
|
Dim j As Integer
|
||||||
|
Dim dotProduct As Double
|
||||||
|
Dim magnitude1 As Double
|
||||||
|
Dim magnitude2 As Double
|
||||||
|
|
||||||
|
' Split the strings into words and create vectors
|
||||||
|
Dim words As Variant
|
||||||
|
words = Split(s1 & " " & s2, " ")
|
||||||
|
|
||||||
|
For i = LBound(words) To UBound(words)
|
||||||
|
If Trim(words(i)) <> "" Then
|
||||||
|
words1(Trim(words(i))) = 0
|
||||||
|
words2(Trim(words(i))) = 0
|
||||||
|
End If
|
||||||
|
Next i
|
||||||
|
|
||||||
|
' Populate the vectors
|
||||||
|
For i = LBound(words) To UBound(words)
|
||||||
|
If Trim(words(i)) <> "" Then
|
||||||
|
If i <= UBound(Split(s1, " ")) Then
|
||||||
|
words1(Trim(words(i))) = words1(Trim(words(i))) + 1
|
||||||
|
Else
|
||||||
|
words2(Trim(words(i))) = words2(Trim(words(i))) + 1
|
||||||
|
End If
|
||||||
|
End If
|
||||||
|
Next i
|
||||||
|
|
||||||
|
' Calculate dot product and magnitudes
|
||||||
|
For Each Key In words1.Keys
|
||||||
|
dotProduct = dotProduct + (words1(Key) * words2(Key))
|
||||||
|
magnitude1 = magnitude1 + (words1(Key) * words1(Key))
|
||||||
|
magnitude2 = magnitude2 + (words2(Key) * words2(Key))
|
||||||
|
Next Key
|
||||||
|
|
||||||
|
' Calculate cosine similarity
|
||||||
|
If magnitude1 * magnitude2 = 0 Then
|
||||||
|
SimilarityCosine = 0
|
||||||
|
Else
|
||||||
|
SimilarityCosine = dotProduct / (Sqr(magnitude1) * Sqr(magnitude2))
|
||||||
|
End If
|
||||||
|
End Function
|
||||||
47
VBA/JaroWinklerSimilarity.bas
Normal file
47
VBA/JaroWinklerSimilarity.bas
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
Attribute VB_Name = "Module2"
|
||||||
|
Function SimilarityJaroWinkler(s1 As String, s2 As String) As Double
|
||||||
|
Dim m As Integer ' Number of matching characters
|
||||||
|
Dim t As Integer ' Number of transpositions
|
||||||
|
Dim l1 As Integer ' Length of string 1
|
||||||
|
Dim l2 As Integer ' Length of string 2
|
||||||
|
Dim maxLen As Integer
|
||||||
|
Dim prefix As Integer
|
||||||
|
Dim jw As Double ' Jaro-Winkler similarity
|
||||||
|
|
||||||
|
l1 = Len(s1)
|
||||||
|
l2 = Len(s2)
|
||||||
|
maxLen = IIf(l1 > l2, l1, l2)
|
||||||
|
|
||||||
|
' Calculate the number of matching characters
|
||||||
|
m = 0
|
||||||
|
t = 0
|
||||||
|
For i = 1 To l1
|
||||||
|
For j = IIf(i - 2 > 0, i - 2, 1) To IIf(i + 2 <= l2, i + 2, l2)
|
||||||
|
If Mid(s1, i, 1) = Mid(s2, j, 1) Then
|
||||||
|
m = m + 1
|
||||||
|
If i <> j Then t = t + 1
|
||||||
|
Exit For
|
||||||
|
End If
|
||||||
|
Next j
|
||||||
|
Next i
|
||||||
|
|
||||||
|
' Calculate the prefix length
|
||||||
|
prefix = 0
|
||||||
|
For i = 1 To IIf(l1 > 4, 4, l1)
|
||||||
|
If Mid(s1, i, 1) = Mid(s2, i, 1) Then
|
||||||
|
prefix = prefix + 1
|
||||||
|
Else
|
||||||
|
Exit For
|
||||||
|
End If
|
||||||
|
Next i
|
||||||
|
|
||||||
|
' Calculate the Jaro similarity
|
||||||
|
If m = 0 Then
|
||||||
|
jw = 0
|
||||||
|
Else
|
||||||
|
jw = (m / l1 + m / l2 + (m - t) / m) / 3
|
||||||
|
jw = jw + (prefix * 0.1 * (1 - jw))
|
||||||
|
End If
|
||||||
|
|
||||||
|
SimilarityJaroWinkler = jw
|
||||||
|
End Function
|
||||||
25
VBA/LevenshteinSimilarity.bas
Normal file
25
VBA/LevenshteinSimilarity.bas
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
Attribute VB_Name = "Module1"
|
||||||
|
Function SimilarityLevenshtein(target As String, reference As String) As Double
|
||||||
|
Dim maxLen As Long
|
||||||
|
Dim minLen As Long
|
||||||
|
Dim distance As Double
|
||||||
|
Dim similarity As Double
|
||||||
|
|
||||||
|
' Remove spaces and convert to lowercase
|
||||||
|
target = Replace(LCase(target), " ", "")
|
||||||
|
reference = Replace(LCase(reference), " ", "")
|
||||||
|
|
||||||
|
maxLen = IIf(Len(target) > Len(reference), Len(target), Len(reference))
|
||||||
|
minLen = IIf(Len(target) < Len(reference), Len(target), Len(reference))
|
||||||
|
|
||||||
|
distance = maxLen - minLen
|
||||||
|
For i = 1 To minLen
|
||||||
|
If Mid(target, i, 1) <> Mid(reference, i, 1) Then
|
||||||
|
distance = distance + 1
|
||||||
|
End If
|
||||||
|
Next i
|
||||||
|
|
||||||
|
similarity = (1 - distance / maxLen)
|
||||||
|
SimilarityLevenshtein = similarity
|
||||||
|
End Function
|
||||||
|
|
||||||
45
VBA/NGramJaccardSimilarity.bas
Normal file
45
VBA/NGramJaccardSimilarity.bas
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
Attribute VB_Name = "Module5"
|
||||||
|
Function SimilarityNGramJaccard(s1 As String, s2 As String, n As Integer) As Double
|
||||||
|
Dim set1 As Object
|
||||||
|
Set set1 = CreateObject("Scripting.Dictionary")
|
||||||
|
|
||||||
|
Dim set2 As Object
|
||||||
|
Set set2 = CreateObject("Scripting.Dictionary")
|
||||||
|
|
||||||
|
Dim i As Integer
|
||||||
|
Dim ngram1 As String
|
||||||
|
Dim ngram2 As String
|
||||||
|
|
||||||
|
' Create n-grams from string 1
|
||||||
|
For i = 1 To Len(s1) - n + 1
|
||||||
|
ngram1 = Mid(s1, i, n)
|
||||||
|
set1(ngram1) = 1
|
||||||
|
Next i
|
||||||
|
|
||||||
|
' Create n-grams from string 2
|
||||||
|
For i = 1 To Len(s2) - n + 1
|
||||||
|
ngram2 = Mid(s2, i, n)
|
||||||
|
set2(ngram2) = 1
|
||||||
|
Next i
|
||||||
|
|
||||||
|
' Calculate the size of the intersection of the n-gram sets
|
||||||
|
Dim intersectionSize As Integer
|
||||||
|
intersectionSize = 0
|
||||||
|
|
||||||
|
For Each Key In set1.Keys
|
||||||
|
If set2.Exists(Key) Then
|
||||||
|
intersectionSize = intersectionSize + 1
|
||||||
|
End If
|
||||||
|
Next Key
|
||||||
|
|
||||||
|
' Calculate the n-gram similarity
|
||||||
|
Dim similarity As Double
|
||||||
|
|
||||||
|
If set1.Count + set2.Count = 0 Then
|
||||||
|
similarity = 1
|
||||||
|
Else
|
||||||
|
similarity = intersectionSize / (set1.Count + set2.Count - intersectionSize)
|
||||||
|
End If
|
||||||
|
|
||||||
|
SimilarityNGramJaccard = similarity
|
||||||
|
End Function
|
||||||
40
VBA/SorensenDiceSimilarity.bas
Normal file
40
VBA/SorensenDiceSimilarity.bas
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
Attribute VB_Name = "Module3"
|
||||||
|
Function SimilaritySorensenDiceCoef(s1 As String, s2 As String) As Double
|
||||||
|
Dim set1 As Object
|
||||||
|
Set set1 = CreateObject("Scripting.Dictionary")
|
||||||
|
|
||||||
|
Dim set2 As Object
|
||||||
|
Set set2 = CreateObject("Scripting.Dictionary")
|
||||||
|
|
||||||
|
Dim i As Integer
|
||||||
|
|
||||||
|
' Create sets of characters from both strings
|
||||||
|
For i = 1 To Len(s1)
|
||||||
|
set1(Mid(s1, i, 1)) = 1
|
||||||
|
Next i
|
||||||
|
|
||||||
|
For i = 1 To Len(s2)
|
||||||
|
set2(Mid(s2, i, 1)) = 1
|
||||||
|
Next i
|
||||||
|
|
||||||
|
' Calculate the size of the intersection of the sets
|
||||||
|
Dim intersectionSize As Integer
|
||||||
|
intersectionSize = 0
|
||||||
|
|
||||||
|
For Each Key In set1.Keys
|
||||||
|
If set2.Exists(Key) Then
|
||||||
|
intersectionSize = intersectionSize + 1
|
||||||
|
End If
|
||||||
|
Next Key
|
||||||
|
|
||||||
|
' Calculate the S<>rensen-Dice coefficient
|
||||||
|
Dim coefficient As Double
|
||||||
|
|
||||||
|
If set1.Count + set2.Count = 0 Then
|
||||||
|
coefficient = 1
|
||||||
|
Else
|
||||||
|
coefficient = 2 * intersectionSize / (set1.Count + set2.Count)
|
||||||
|
End If
|
||||||
|
|
||||||
|
SimilaritySorensenDiceCoef = coefficient
|
||||||
|
End Function
|
||||||
Reference in New Issue
Block a user