Skip to content

Commit 3d228b4

Browse files
committedMay 15, 2012
Reorganized, added some documentation
1 parent c6aa21e commit 3d228b4

File tree

1 file changed

+140
-124
lines changed

1 file changed

+140
-124
lines changed
 

‎Sufa/SuffixArray.cs

+140-124
Original file line numberDiff line numberDiff line change
@@ -26,62 +26,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2626

2727
namespace Sufa
2828
{
29-
[Serializable]
30-
internal class Chain : IComparable<Chain>
31-
{
32-
public int head;
33-
public int length;
34-
private string m_str;
35-
36-
public Chain(string str)
37-
{
38-
m_str = str;
39-
}
40-
41-
public int CompareTo(Chain other)
42-
{
43-
return m_str.Substring(head, length).CompareTo(m_str.Substring(other.head, other.length));
44-
}
45-
46-
public override string ToString()
47-
{
48-
return m_str.Substring(head, length);
49-
}
50-
}
51-
52-
[Serializable]
53-
internal class CharComparer : System.Collections.Generic.EqualityComparer<char>
54-
{
55-
public override bool Equals(char x, char y)
56-
{
57-
return x.Equals(y);
58-
}
59-
60-
public override int GetHashCode(char obj)
61-
{
62-
return obj.GetHashCode();
63-
}
64-
}
65-
66-
internal struct SuffixRank
67-
{
68-
public int head;
69-
public int rank;
70-
}
71-
72-
class SuffixRankComparer : IComparer<SuffixRank>
73-
{
74-
public bool Equals(SuffixRank x, SuffixRank y)
75-
{
76-
return x.rank.Equals(y.rank);
77-
}
78-
79-
public int Compare(SuffixRank x, SuffixRank y)
80-
{
81-
return x.rank.CompareTo(y.rank);
82-
}
83-
}
84-
8529
[Serializable]
8630
public class SuffixArray
8731
{
@@ -91,10 +35,9 @@ public class SuffixArray
9135
private int[] m_lcp;
9236
private C5.HashDictionary<char, int> m_chainHeadsDict = new HashDictionary<char, int>(new CharComparer());
9337
private List<Chain> m_chainStack = new List<Chain>();
94-
ArrayList<Chain> m_subChains = new ArrayList<Chain>();
38+
private ArrayList<Chain> m_subChains = new ArrayList<Chain>();
9539
private int m_nextRank = 1;
9640
private string m_str;
97-
//private List<int> m_currentChain = new List<int>();
9841

9942
public int Length
10043
{
@@ -116,8 +59,20 @@ public string Str
11659
get { return m_str; }
11760
}
11861

119-
public SuffixArray(string str) : this(str, true) {}
62+
///
63+
/// <summary>
64+
/// Build a suffix array from string str
65+
/// </summary>
66+
/// <param name="str">A string for which to build a suffix array with LCP information</param>
67+
/// <remarks>The LCP array is always built by this overload; use the (str, buildLcps) overload to control that.</remarks>
68+
public SuffixArray(string str) : this(str, true) { }
12069

70+
///
71+
/// <summary>
72+
/// Build a suffix array from string str
73+
/// </summary>
74+
/// <param name="str">A string for which to build a suffix array</param>
75+
/// <param name="buildLcps">Also calculate LCP information</param>
12176
public SuffixArray(string str, bool buildLcps)
12277
{
12378
m_str = str;
@@ -134,11 +89,47 @@ public SuffixArray(string str, bool buildLcps)
13489
BuildLcpArray();
13590
}
13691

137-
/// <summary>
138-
/// Link all suffixes that have the same first character
139-
/// </summary>
92+
///
93+
/// <summary>Find the index of a substring </summary>
94+
/// <param name="substr">Substring to look for</param>
95+
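/// <returns>Index in the original string of an occurrence of substr: the start of the first matching suffix in suffix-array order, which is not necessarily the leftmost occurrence. -1 if not found, or if substr is null or empty</returns>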
96+
public int IndexOf(string substr)
97+
{
98+
int l = 0;
99+
int r = m_sa.Length;
100+
int m = -1;
101+
102+
if ((substr == null) || (substr.Length == 0))
103+
{
104+
return -1;
105+
}
106+
107+
// Binary search for substring
108+
while (r > l)
109+
{
110+
m = (l + r) / 2;
111+
if (m_str.Substring(m_sa[m]).CompareTo(substr) < 0)
112+
{
113+
l = m + 1;
114+
}
115+
else
116+
{
117+
r = m;
118+
}
119+
}
120+
if ((l == r) && (l < m_str.Length) && (m_str.Substring(m_sa[l]).StartsWith(substr)))
121+
{
122+
return m_sa[l];
123+
}
124+
else
125+
{
126+
return -1;
127+
}
128+
}
129+
140130
private void FormInitialChains()
141131
{
132+
// Link all suffixes that have the same first character
142133
FindInitialChains();
143134
SortAndPushSubchains();
144135
}
@@ -205,61 +196,59 @@ private void RefineChains(Chain chain)
205196
m_subChains.Clear();
206197
while (chain.head != EOC)
207198
{
208-
// TODO - refactor this to get rid of the side effect of changing m_isa
209199
int nextIndex = m_isa[chain.head];
210-
UpdateSubChains(chain);
200+
if (chain.head + chain.length > m_str.Length - 1)
201+
{
202+
RankSuffix(chain.head);
203+
}
204+
else
205+
{
206+
ExtendChain(chain);
207+
}
211208
chain.head = nextIndex;
212209
}
213-
// Keep stack lexically sorted
210+
// Keep stack sorted
214211
SortAndPushSubchains();
215212
}
216213

217-
private void UpdateSubChains(Chain chain)
214+
private void ExtendChain(Chain chain)
218215
{
219-
if (chain.head + chain.length > m_str.Length - 1)
216+
char sym = m_str[chain.head + chain.length];
217+
if (m_chainHeadsDict.Contains(sym))
220218
{
221-
RankSuffix(chain.head);
219+
// Continuation of an existing chain, this is the leftmost
220+
// occurrence currently known (others may come up later)
221+
m_isa[m_chainHeadsDict[sym]] = chain.head;
222+
m_isa[chain.head] = EOC;
222223
}
223224
else
224225
{
225-
char sym = m_str[chain.head + chain.length];
226-
if (m_chainHeadsDict.Contains(sym))
227-
{
228-
// Continuation of a known chain, this is the leftmost
229-
// occurence currently known (others may come up later)
230-
m_isa[m_chainHeadsDict[sym]] = chain.head;
231-
m_isa[chain.head] = EOC;
232-
}
233-
else
234-
{
235-
// This is the beginning of a new subchain
236-
m_isa[chain.head] = EOC;
237-
Chain newChain = new Chain(m_str);
238-
newChain.head = chain.head;
239-
newChain.length = chain.length + 1;
240-
m_subChains.Add(newChain);
241-
}
242-
// Save index in case we find a continuation of this chain
243-
m_chainHeadsDict[sym] = chain.head;
226+
// This is the beginning of a new subchain
227+
m_isa[chain.head] = EOC;
228+
Chain newChain = new Chain(m_str);
229+
newChain.head = chain.head;
230+
newChain.length = chain.length + 1;
231+
m_subChains.Add(newChain);
244232
}
233+
// Save index in case we find a continuation of this chain
234+
m_chainHeadsDict[sym] = chain.head;
245235
}
246236

247237
private void RefineChainWithInductionSorting(Chain chain)
248238
{
249-
// TODO - refactor m_chainHeadsDict and m_subChains into a subchains class, remove class members and pass a
250-
// variable instead (get rid of global state)
239+
// TODO - refactor/beautify some
240+
ArrayList<SuffixRank> notedSuffixes = new ArrayList<SuffixRank>();
251241
m_chainHeadsDict.Clear();
252242
m_subChains.Clear();
253243

254-
// TODO - and refactor notedSuffixes too
255-
ArrayList<SuffixRank> notedSuffixes = new ArrayList<SuffixRank>();
256-
257244
while (chain.head != EOC)
258245
{
259246
int nextIndex = m_isa[chain.head];
260-
// TODO - refactor
261247
if (chain.head + chain.length > m_str.Length - 1)
262248
{
249+
// If this substring reaches end of string it cannot be extended.
250+
// At this point it's the first in lexicographic order so it's safe
251+
// to just go ahead and rank it.
263252
RankSuffix(chain.head);
264253
}
265254
else if (m_isa[chain.head + chain.length] < 0)
@@ -271,11 +260,11 @@ private void RefineChainWithInductionSorting(Chain chain)
271260
}
272261
else
273262
{
274-
UpdateSubChains(chain);
263+
ExtendChain(chain);
275264
}
276265
chain.head = nextIndex;
277266
}
278-
// Keep stack lexically sorted
267+
// Keep stack sorted
279268
SortAndPushSubchains();
280269
SortAndRankNotedSuffixes(notedSuffixes);
281270
}
@@ -318,39 +307,66 @@ private int CalcLcp(int i, int j)
318307
return lcp;
319308
}
320309

321-
public int IndexOf(string substr)
310+
}
311+
312+
#region HelperClasses
313+
[Serializable]
314+
internal class Chain : IComparable<Chain>
315+
{
316+
public int head;
317+
public int length;
318+
private string m_str;
319+
320+
public Chain(string str)
322321
{
323-
int l = 0;
324-
int r = m_sa.Length;
325-
int m = -1;
322+
m_str = str;
323+
}
326324

327-
if ((substr == null) || (substr.Length == 0))
328-
{
329-
return -1;
330-
}
325+
public int CompareTo(Chain other)
326+
{
327+
return m_str.Substring(head, length).CompareTo(m_str.Substring(other.head, other.length));
328+
}
331329

332-
// Binary search for substring
333-
while (r > l)
334-
{
335-
m = (l + r) / 2;
336-
if (m_str.Substring(m_sa[m]).CompareTo(substr) < 0)
337-
{
338-
l = m + 1;
339-
}
340-
else
341-
{
342-
r = m;
343-
}
344-
}
345-
if ((l == r) && (l < m_str.Length) && (m_str.Substring(m_sa[l]).StartsWith(substr)))
346-
{
347-
return m_sa[l];
348-
}
349-
else
350-
{
351-
return -1;
352-
}
330+
public override string ToString()
331+
{
332+
return m_str.Substring(head, length);
333+
}
334+
}
335+
336+
[Serializable]
337+
internal class CharComparer : System.Collections.Generic.EqualityComparer<char>
338+
{
339+
public override bool Equals(char x, char y)
340+
{
341+
return x.Equals(y);
342+
}
343+
344+
public override int GetHashCode(char obj)
345+
{
346+
return obj.GetHashCode();
347+
}
348+
}
349+
350+
[Serializable]
351+
internal struct SuffixRank
352+
{
353+
public int head;
354+
public int rank;
355+
}
356+
357+
[Serializable]
358+
internal class SuffixRankComparer : IComparer<SuffixRank>
359+
{
360+
public bool Equals(SuffixRank x, SuffixRank y)
361+
{
362+
return x.rank.Equals(y.rank);
363+
}
364+
365+
public int Compare(SuffixRank x, SuffixRank y)
366+
{
367+
return x.rank.CompareTo(y.rank);
353368
}
354369
}
370+
#endregion
355371
}
356372

0 commit comments

Comments
 (0)
Please sign in to comment.