mirror of
https://github.com/youronlydimwit/Data_ScienceUse_Cases.git
synced 2025-12-14 10:50:36 +01:00
Add files via upload
This commit is contained in:
50
VBA/CosineSimilarity.bas
Normal file
50
VBA/CosineSimilarity.bas
Normal file
@@ -0,0 +1,50 @@
|
||||
Attribute VB_Name = "Module4"
|
||||
Function SimilarityCosine(s1 As String, s2 As String) As Double
|
||||
Dim words1 As Object
|
||||
Set words1 = CreateObject("Scripting.Dictionary")
|
||||
|
||||
Dim words2 As Object
|
||||
Set words2 = CreateObject("Scripting.Dictionary")
|
||||
|
||||
Dim i As Integer
|
||||
Dim j As Integer
|
||||
Dim dotProduct As Double
|
||||
Dim magnitude1 As Double
|
||||
Dim magnitude2 As Double
|
||||
|
||||
' Split the strings into words and create vectors
|
||||
Dim words As Variant
|
||||
words = Split(s1 & " " & s2, " ")
|
||||
|
||||
For i = LBound(words) To UBound(words)
|
||||
If Trim(words(i)) <> "" Then
|
||||
words1(Trim(words(i))) = 0
|
||||
words2(Trim(words(i))) = 0
|
||||
End If
|
||||
Next i
|
||||
|
||||
' Populate the vectors
|
||||
For i = LBound(words) To UBound(words)
|
||||
If Trim(words(i)) <> "" Then
|
||||
If i <= UBound(Split(s1, " ")) Then
|
||||
words1(Trim(words(i))) = words1(Trim(words(i))) + 1
|
||||
Else
|
||||
words2(Trim(words(i))) = words2(Trim(words(i))) + 1
|
||||
End If
|
||||
End If
|
||||
Next i
|
||||
|
||||
' Calculate dot product and magnitudes
|
||||
For Each Key In words1.Keys
|
||||
dotProduct = dotProduct + (words1(Key) * words2(Key))
|
||||
magnitude1 = magnitude1 + (words1(Key) * words1(Key))
|
||||
magnitude2 = magnitude2 + (words2(Key) * words2(Key))
|
||||
Next Key
|
||||
|
||||
' Calculate cosine similarity
|
||||
If magnitude1 * magnitude2 = 0 Then
|
||||
SimilarityCosine = 0
|
||||
Else
|
||||
SimilarityCosine = dotProduct / (Sqr(magnitude1) * Sqr(magnitude2))
|
||||
End If
|
||||
End Function
|
||||
47
VBA/JaroWinklerSimilarity.bas
Normal file
47
VBA/JaroWinklerSimilarity.bas
Normal file
@@ -0,0 +1,47 @@
|
||||
Attribute VB_Name = "Module2"
|
||||
Function SimilarityJaroWinkler(s1 As String, s2 As String) As Double
|
||||
Dim m As Integer ' Number of matching characters
|
||||
Dim t As Integer ' Number of transpositions
|
||||
Dim l1 As Integer ' Length of string 1
|
||||
Dim l2 As Integer ' Length of string 2
|
||||
Dim maxLen As Integer
|
||||
Dim prefix As Integer
|
||||
Dim jw As Double ' Jaro-Winkler similarity
|
||||
|
||||
l1 = Len(s1)
|
||||
l2 = Len(s2)
|
||||
maxLen = IIf(l1 > l2, l1, l2)
|
||||
|
||||
' Calculate the number of matching characters
|
||||
m = 0
|
||||
t = 0
|
||||
For i = 1 To l1
|
||||
For j = IIf(i - 2 > 0, i - 2, 1) To IIf(i + 2 <= l2, i + 2, l2)
|
||||
If Mid(s1, i, 1) = Mid(s2, j, 1) Then
|
||||
m = m + 1
|
||||
If i <> j Then t = t + 1
|
||||
Exit For
|
||||
End If
|
||||
Next j
|
||||
Next i
|
||||
|
||||
' Calculate the prefix length
|
||||
prefix = 0
|
||||
For i = 1 To IIf(l1 > 4, 4, l1)
|
||||
If Mid(s1, i, 1) = Mid(s2, i, 1) Then
|
||||
prefix = prefix + 1
|
||||
Else
|
||||
Exit For
|
||||
End If
|
||||
Next i
|
||||
|
||||
' Calculate the Jaro similarity
|
||||
If m = 0 Then
|
||||
jw = 0
|
||||
Else
|
||||
jw = (m / l1 + m / l2 + (m - t) / m) / 3
|
||||
jw = jw + (prefix * 0.1 * (1 - jw))
|
||||
End If
|
||||
|
||||
SimilarityJaroWinkler = jw
|
||||
End Function
|
||||
25
VBA/LevenshteinSimilarity.bas
Normal file
25
VBA/LevenshteinSimilarity.bas
Normal file
@@ -0,0 +1,25 @@
|
||||
Attribute VB_Name = "Module1"
|
||||
Function SimilarityLevenshtein(target As String, reference As String) As Double
|
||||
Dim maxLen As Long
|
||||
Dim minLen As Long
|
||||
Dim distance As Double
|
||||
Dim similarity As Double
|
||||
|
||||
' Remove spaces and convert to lowercase
|
||||
target = Replace(LCase(target), " ", "")
|
||||
reference = Replace(LCase(reference), " ", "")
|
||||
|
||||
maxLen = IIf(Len(target) > Len(reference), Len(target), Len(reference))
|
||||
minLen = IIf(Len(target) < Len(reference), Len(target), Len(reference))
|
||||
|
||||
distance = maxLen - minLen
|
||||
For i = 1 To minLen
|
||||
If Mid(target, i, 1) <> Mid(reference, i, 1) Then
|
||||
distance = distance + 1
|
||||
End If
|
||||
Next i
|
||||
|
||||
similarity = (1 - distance / maxLen)
|
||||
SimilarityLevenshtein = similarity
|
||||
End Function
|
||||
|
||||
45
VBA/NGramJaccardSimilarity.bas
Normal file
45
VBA/NGramJaccardSimilarity.bas
Normal file
@@ -0,0 +1,45 @@
|
||||
Attribute VB_Name = "Module5"
|
||||
Function SimilarityNGramJaccard(s1 As String, s2 As String, n As Integer) As Double
|
||||
Dim set1 As Object
|
||||
Set set1 = CreateObject("Scripting.Dictionary")
|
||||
|
||||
Dim set2 As Object
|
||||
Set set2 = CreateObject("Scripting.Dictionary")
|
||||
|
||||
Dim i As Integer
|
||||
Dim ngram1 As String
|
||||
Dim ngram2 As String
|
||||
|
||||
' Create n-grams from string 1
|
||||
For i = 1 To Len(s1) - n + 1
|
||||
ngram1 = Mid(s1, i, n)
|
||||
set1(ngram1) = 1
|
||||
Next i
|
||||
|
||||
' Create n-grams from string 2
|
||||
For i = 1 To Len(s2) - n + 1
|
||||
ngram2 = Mid(s2, i, n)
|
||||
set2(ngram2) = 1
|
||||
Next i
|
||||
|
||||
' Calculate the size of the intersection of the n-gram sets
|
||||
Dim intersectionSize As Integer
|
||||
intersectionSize = 0
|
||||
|
||||
For Each Key In set1.Keys
|
||||
If set2.Exists(Key) Then
|
||||
intersectionSize = intersectionSize + 1
|
||||
End If
|
||||
Next Key
|
||||
|
||||
' Calculate the n-gram similarity
|
||||
Dim similarity As Double
|
||||
|
||||
If set1.Count + set2.Count = 0 Then
|
||||
similarity = 1
|
||||
Else
|
||||
similarity = intersectionSize / (set1.Count + set2.Count - intersectionSize)
|
||||
End If
|
||||
|
||||
SimilarityNGramJaccard = similarity
|
||||
End Function
|
||||
40
VBA/SorensenDiceSimilarity.bas
Normal file
40
VBA/SorensenDiceSimilarity.bas
Normal file
@@ -0,0 +1,40 @@
|
||||
Attribute VB_Name = "Module3"
|
||||
Function SimilaritySorensenDiceCoef(s1 As String, s2 As String) As Double
|
||||
Dim set1 As Object
|
||||
Set set1 = CreateObject("Scripting.Dictionary")
|
||||
|
||||
Dim set2 As Object
|
||||
Set set2 = CreateObject("Scripting.Dictionary")
|
||||
|
||||
Dim i As Integer
|
||||
|
||||
' Create sets of characters from both strings
|
||||
For i = 1 To Len(s1)
|
||||
set1(Mid(s1, i, 1)) = 1
|
||||
Next i
|
||||
|
||||
For i = 1 To Len(s2)
|
||||
set2(Mid(s2, i, 1)) = 1
|
||||
Next i
|
||||
|
||||
' Calculate the size of the intersection of the sets
|
||||
Dim intersectionSize As Integer
|
||||
intersectionSize = 0
|
||||
|
||||
For Each Key In set1.Keys
|
||||
If set2.Exists(Key) Then
|
||||
intersectionSize = intersectionSize + 1
|
||||
End If
|
||||
Next Key
|
||||
|
||||
' Calculate the S<>rensen-Dice coefficient
|
||||
Dim coefficient As Double
|
||||
|
||||
If set1.Count + set2.Count = 0 Then
|
||||
coefficient = 1
|
||||
Else
|
||||
coefficient = 2 * intersectionSize / (set1.Count + set2.Count)
|
||||
End If
|
||||
|
||||
SimilaritySorensenDiceCoef = coefficient
|
||||
End Function
|
||||
Reference in New Issue
Block a user