-
Notifications
You must be signed in to change notification settings - Fork 1.5k
/
OptimalStringAlignment.cs
157 lines (141 loc) · 6.57 KB
/
OptimalStringAlignment.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
using System;
namespace Algorithms.Strings.Similarity
{
/// <summary>
/// Provides methods to calculate the Optimal String Alignment distance between two strings.
///
/// The Optimal String Alignment distance, also known as the restricted Damerau-Levenshtein distance,
/// is a string metric used to measure the difference between two sequences. It is similar to the
/// Levenshtein distance, but it also considers transpositions (swapping of two adjacent characters)
/// as a single operation. This metric is particularly useful when adjacent characters are commonly
/// transposed, such as in typographical errors.
///
/// The OSA distance between two strings is defined as the minimum number of operations required to
/// transform one string into the other, where the operations include:
///
/// 1. Insertion: Adding a single character.
/// 2. Deletion: Removing a single character.
/// 3. Substitution: Replacing one character with another.
/// 4. Transposition: Swapping two adjacent characters (this is what distinguishes OSA from the
/// traditional Levenshtein distance).
///
/// The OSA distance algorithm ensures that no operation is applied more than once to the same
/// character in the same position. This is the main difference between the OSA and the more general
/// Damerau-Levenshtein distance, which does not have this restriction.
///
/// <example>
/// Example Usage:
/// <code>
/// int distance = OptimalStringAlignmentDistance("example", "exmaple");
/// Console.WriteLine(distance); // Output: 1
/// </code>
/// In this example, the strings "example" and "exmaple" differ by one transposition of adjacent characters ('a' and 'm'),
/// so the OSA distance is 1.
///
/// <code>
/// int distance = OptimalStringAlignmentDistance("kitten", "sitting");
/// Console.WriteLine(distance); // Output: 3
/// </code>
/// Here, the strings "kitten" and "sitting" have three differences (substitutions 'k' to 's', 'e' to 'i', and insertion of 'g'),
/// resulting in an OSA distance of 3.
/// </example>
/// </summary>
/// <remarks>
/// This algorithm has a time complexity of O(n * m), where n and m are the lengths of the two input strings.
/// It is efficient for moderate-sized strings but may become computationally expensive for very long strings.
/// </remarks>
public static class OptimalStringAlignment
{
/// <summary>
/// Calculates the Optimal String Alignment distance between two strings.
/// </summary>
/// <param name="firstString">The first string.</param>
/// <param name="secondString">The second string.</param>
/// <returns>The Optimal String Alignment distance between the two strings.</returns>
/// <exception cref="ArgumentNullException">Thrown when either of the input strings is null.</exception>
public static double Calculate(string firstString, string secondString)
{
ArgumentNullException.ThrowIfNull(nameof(firstString));
ArgumentNullException.ThrowIfNull(nameof(secondString));
if (firstString == secondString)
{
return 0.0;
}
if (firstString.Length == 0)
{
return secondString.Length;
}
if (secondString.Length == 0)
{
return firstString.Length;
}
var distanceMatrix = GenerateDistanceMatrix(firstString.Length, secondString.Length);
distanceMatrix = CalculateDistance(firstString, secondString, distanceMatrix);
return distanceMatrix[firstString.Length, secondString.Length];
}
/// <summary>
/// Generates the initial distance matrix for the given lengths of the two strings.
/// </summary>
/// <param name="firstLength">The length of the first string.</param>
/// <param name="secondLength">The length of the second string.</param>
/// <returns>The initialized distance matrix.</returns>
private static int[,] GenerateDistanceMatrix(int firstLength, int secondLength)
{
var distanceMatrix = new int[firstLength + 2, secondLength + 2];
for (var i = 0; i <= firstLength; i++)
{
distanceMatrix[i, 0] = i;
}
for (var j = 0; j <= secondLength; j++)
{
distanceMatrix[0, j] = j;
}
return distanceMatrix;
}
/// <summary>
/// Calculates the distance matrix for the given strings using the Optimal String Alignment algorithm.
/// </summary>
/// <param name="firstString">The first string.</param>
/// <param name="secondString">The second string.</param>
/// <param name="distanceMatrix">The initial distance matrix.</param>
/// <returns>The calculated distance matrix.</returns>
private static int[,] CalculateDistance(string firstString, string secondString, int[,] distanceMatrix)
{
for (var i = 1; i <= firstString.Length; i++)
{
for (var j = 1; j <= secondString.Length; j++)
{
var cost = 1;
if (firstString[i - 1] == secondString[j - 1])
{
cost = 0;
}
distanceMatrix[i, j] = Minimum(
distanceMatrix[i - 1, j - 1] + cost, // substitution
distanceMatrix[i, j - 1] + 1, // insertion
distanceMatrix[i - 1, j] + 1); // deletion
if (i > 1 && j > 1
&& firstString[i - 1] == secondString[j - 2]
&& firstString[i - 2] == secondString[j - 1])
{
distanceMatrix[i, j] = Math.Min(
distanceMatrix[i, j],
distanceMatrix[i - 2, j - 2] + cost); // transposition
}
}
}
return distanceMatrix;
}
/// <summary>
/// Returns the minimum of three integers.
/// </summary>
/// <param name="a">The first integer.</param>
/// <param name="b">The second integer.</param>
/// <param name="c">The third integer.</param>
/// <returns>The minimum of the three integers.</returns>
private static int Minimum(int a, int b, int c)
{
return Math.Min(a, Math.Min(b, c));
}
}
}