@@ -26,62 +26,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
26
27
27
namespace Sufa
28
28
{
29
- [ Serializable ]
30
- internal class Chain : IComparable < Chain >
31
- {
32
- public int head ;
33
- public int length ;
34
- private string m_str ;
35
-
36
- public Chain ( string str )
37
- {
38
- m_str = str ;
39
- }
40
-
41
- public int CompareTo ( Chain other )
42
- {
43
- return m_str . Substring ( head , length ) . CompareTo ( m_str . Substring ( other . head , other . length ) ) ;
44
- }
45
-
46
- public override string ToString ( )
47
- {
48
- return m_str . Substring ( head , length ) ;
49
- }
50
- }
51
-
52
- [ Serializable ]
53
- internal class CharComparer : System . Collections . Generic . EqualityComparer < char >
54
- {
55
- public override bool Equals ( char x , char y )
56
- {
57
- return x . Equals ( y ) ;
58
- }
59
-
60
- public override int GetHashCode ( char obj )
61
- {
62
- return obj . GetHashCode ( ) ;
63
- }
64
- }
65
-
66
- internal struct SuffixRank
67
- {
68
- public int head ;
69
- public int rank ;
70
- }
71
-
72
- class SuffixRankComparer : IComparer < SuffixRank >
73
- {
74
- public bool Equals ( SuffixRank x , SuffixRank y )
75
- {
76
- return x . rank . Equals ( y . rank ) ;
77
- }
78
-
79
- public int Compare ( SuffixRank x , SuffixRank y )
80
- {
81
- return x . rank . CompareTo ( y . rank ) ;
82
- }
83
- }
84
-
85
29
[ Serializable ]
86
30
public class SuffixArray
87
31
{
@@ -91,10 +35,9 @@ public class SuffixArray
91
35
private int [ ] m_lcp ;
92
36
private C5 . HashDictionary < char , int > m_chainHeadsDict = new HashDictionary < char , int > ( new CharComparer ( ) ) ;
93
37
private List < Chain > m_chainStack = new List < Chain > ( ) ;
94
- ArrayList < Chain > m_subChains = new ArrayList < Chain > ( ) ;
38
+ private ArrayList < Chain > m_subChains = new ArrayList < Chain > ( ) ;
95
39
private int m_nextRank = 1 ;
96
40
private string m_str ;
97
- //private List<int> m_currentChain = new List<int>();
98
41
99
42
public int Length
100
43
{
@@ -116,8 +59,20 @@ public string Str
116
59
get { return m_str ; }
117
60
}
118
61
119
- public SuffixArray ( string str ) : this ( str , true ) { }
62
+ ///
63
+ /// <summary>
64
+ /// Build a suffix array from string str by delegating to SuffixArray(str, true)
65
+ /// </summary>
66
+ /// <param name="str">A string for which to build a suffix array with LCP information</param>
67
+ /// <remarks>This overload takes no buildLcps argument; it always passes true for it to the two-argument constructor</remarks>
68
+ public SuffixArray ( string str ) : this ( str , true ) { }
120
69
70
+ ///
71
+ /// <summary>
72
+ /// Build a suffix array from string str
73
+ /// </summary>
74
+ /// <param name="str">A string for which to build a suffix array</param>
75
+ /// <param name="buildLcps">Also calculate LCP information</param>
121
76
public SuffixArray ( string str , bool buildLcps )
122
77
{
123
78
m_str = str ;
@@ -134,11 +89,47 @@ public SuffixArray(string str, bool buildLcps)
134
89
BuildLcpArray ( ) ;
135
90
}
136
91
137
- /// <summary>
138
- /// Link all suffixes that have the same first character
139
- /// </summary>
92
+ ///
93
+ /// <summary>Find the index of a substring by binary-searching m_sa and checking the suffix the search stops at</summary>
94
+ /// <param name="substr">Substring to look for; null or empty yields -1</param>
95
+ /// <returns>m_sa[l] for the slot l where the search stops, if that suffix starts with substr; -1 if not found. NOTE(review): this is the first match in m_sa order, which may differ from the leftmost position in the original string - confirm the intended contract</returns>
96
+ public int IndexOf ( string substr )
97
+ {
98
+ int l = 0 ; // inclusive lower bound of the search window over m_sa
99
+ int r = m_sa . Length ; // exclusive upper bound of the search window
100
+ int m = - 1 ; // midpoint; only assigned inside the loop
101
+
102
+ if ( ( substr == null ) || ( substr . Length == 0 ) )
103
+ {
104
+ return - 1 ;
105
+ }
106
+
107
+ // Binary search for substring: shrink [l, r) to the first slot whose suffix does not compare less than substr
108
+ while ( r > l )
109
+ {
110
+ m = ( l + r ) / 2 ;
111
+ if ( m_str . Substring ( m_sa [ m ] ) . CompareTo ( substr ) < 0 ) // NOTE(review): Substring copies the suffix on every probe, and String.CompareTo(string) is culture-sensitive - confirm that matches the order m_sa was built in
112
+ {
113
+ l = m + 1 ;
114
+ }
115
+ else
116
+ {
117
+ r = m ;
118
+ }
119
+ }
120
+ if ( ( l == r ) && ( l < m_str . Length ) && ( m_str . Substring ( m_sa [ l ] ) . StartsWith ( substr ) ) ) // the l < m_str.Length test guards m_sa[l] when the search ran off the end (assumes m_sa and m_str have equal length - TODO confirm)
121
+ {
122
+ return m_sa [ l ] ;
123
+ }
124
+ else
125
+ {
126
+ return - 1 ;
127
+ }
128
+ }
129
+
140
130
private void FormInitialChains ( ) // Builds the initial chains by running the two helper calls below, in this order
141
131
{
132
+ // Link all suffixes that have the same first character (the helpers' bodies are not shown in this excerpt)
142
133
FindInitialChains ( ) ;
143
134
SortAndPushSubchains ( ) ; // NOTE(review): presumably sorts the collected subchains and pushes them onto m_chainStack - confirm against its definition
144
135
}
@@ -205,61 +196,59 @@ private void RefineChains(Chain chain)
205
196
m_subChains . Clear ( ) ;
206
197
while ( chain . head != EOC )
207
198
{
208
- // TODO - refactor this to get rid of the side effect of changing m_isa
209
199
int nextIndex = m_isa [ chain . head ] ;
210
- UpdateSubChains ( chain ) ;
200
+ if ( chain . head + chain . length > m_str . Length - 1 )
201
+ {
202
+ RankSuffix ( chain . head ) ;
203
+ }
204
+ else
205
+ {
206
+ ExtendChain ( chain ) ;
207
+ }
211
208
chain . head = nextIndex ;
212
209
}
213
- // Keep stack lexically sorted
210
+ // Keep stack sorted
214
211
SortAndPushSubchains ( ) ;
215
212
}
216
213
217
- private void UpdateSubChains ( Chain chain )
214
+ private void ExtendChain ( Chain chain ) // Files chain.head under the symbol found at offset chain.length; does no bounds check, so callers must ensure chain.head + chain.length is inside m_str
218
215
{
219
- if ( chain . head + chain . length > m_str . Length - 1 )
216
+ char sym = m_str [ chain . head + chain . length ] ; // the symbol right after the chain.length characters starting at chain.head
217
+ if ( m_chainHeadsDict . Contains ( sym ) )
220
218
{
221
- RankSuffix ( chain . head ) ;
219
+ // Continuation of an existing chain, this is the leftmost
220
+ // occurrence currently known (others may come up later)
221
+ m_isa [ m_chainHeadsDict [ sym ] ] = chain . head ; // link the position previously recorded for sym to this one
222
+ m_isa [ chain . head ] = EOC ; // this position becomes the current end of that chain
222
223
}
223
224
else
224
225
{
225
- char sym = m_str [ chain . head + chain . length ] ;
226
- if ( m_chainHeadsDict . Contains ( sym ) )
227
- {
228
- // Continuation of a known chain, this is the leftmost
229
- // occurence currently known (others may come up later)
230
- m_isa [ m_chainHeadsDict [ sym ] ] = chain . head ;
231
- m_isa [ chain . head ] = EOC ;
232
- }
233
- else
234
- {
235
- // This is the beginning of a new subchain
236
- m_isa [ chain . head ] = EOC ;
237
- Chain newChain = new Chain ( m_str ) ;
238
- newChain . head = chain . head ;
239
- newChain . length = chain . length + 1 ;
240
- m_subChains . Add ( newChain ) ;
241
- }
242
- // Save index in case we find a continuation of this chain
243
- m_chainHeadsDict [ sym ] = chain . head ;
226
+ // This is the beginning of a new subchain: first time sym is seen since m_chainHeadsDict last held no entry for it
227
+ m_isa [ chain . head ] = EOC ;
228
+ Chain newChain = new Chain ( m_str ) ;
229
+ newChain . head = chain . head ;
230
+ newChain . length = chain . length + 1 ; // the subchain's strings are one symbol longer than the parent chain's
231
+ m_subChains . Add ( newChain ) ;
244
232
}
233
+ // Save index in case we find a continuation of this chain (runs for both branches above)
234
+ m_chainHeadsDict [ sym ] = chain . head ;
245
235
}
246
236
247
237
private void RefineChainWithInductionSorting ( Chain chain )
248
238
{
249
- // TODO - refactor m_chainHeadsDict and m_subChains into a subchains class, remove class members and pass a
250
- // variable instead (get rid of global state)
239
+ // TODO - refactor/beautify some
240
+ ArrayList < SuffixRank > notedSuffixes = new ArrayList < SuffixRank > ( ) ;
251
241
m_chainHeadsDict . Clear ( ) ;
252
242
m_subChains . Clear ( ) ;
253
243
254
- // TODO - and refactor notedSuffixes too
255
- ArrayList < SuffixRank > notedSuffixes = new ArrayList < SuffixRank > ( ) ;
256
-
257
244
while ( chain . head != EOC )
258
245
{
259
246
int nextIndex = m_isa [ chain . head ] ;
260
- // TODO - refactor
261
247
if ( chain . head + chain . length > m_str . Length - 1 )
262
248
{
249
+ // If this substring reaches end of string it cannot be extended.
250
+ // At this point it's the first in lexicographic order so it's safe
251
+ // to just go ahead and rank it.
263
252
RankSuffix ( chain . head ) ;
264
253
}
265
254
else if ( m_isa [ chain . head + chain . length ] < 0 )
@@ -271,11 +260,11 @@ private void RefineChainWithInductionSorting(Chain chain)
271
260
}
272
261
else
273
262
{
274
- UpdateSubChains ( chain ) ;
263
+ ExtendChain ( chain ) ;
275
264
}
276
265
chain . head = nextIndex ;
277
266
}
278
- // Keep stack lexically sorted
267
+ // Keep stack sorted
279
268
SortAndPushSubchains ( ) ;
280
269
SortAndRankNotedSuffixes ( notedSuffixes ) ;
281
270
}
@@ -318,39 +307,66 @@ private int CalcLcp(int i, int j)
318
307
return lcp ;
319
308
}
320
309
321
- public int IndexOf ( string substr )
310
+ }
311
+
312
+ #region HelperClasses
313
+ [ Serializable ]
314
+ internal class Chain : IComparable < Chain > // A (head, length) window into a string, ordered by the text of that window
315
+ {
316
+ public int head ; // start offset of the window in m_str; not set by the constructor
317
+ public int length ; // number of characters in the window; not set by the constructor
318
+ private string m_str ; // the string the window refers to
319
+
320
+ public Chain ( string str ) // Stores str only; callers assign head and length afterwards
322
321
{
323
- int l = 0 ;
324
- int r = m_sa . Length ;
325
- int m = - 1 ;
322
+ m_str = str ;
323
+ }
326
324
327
- if ( ( substr == null ) || ( substr . Length == 0 ) )
328
- {
329
- return - 1 ;
330
- }
325
+ public int CompareTo ( Chain other ) // Compares the two windows' text; other is dereferenced without a null check, and both windows are read from this instance's m_str
326
+ {
327
+ return m_str . Substring ( head , length ) . CompareTo ( m_str . Substring ( other . head , other . length ) ) ; // NOTE(review): allocates two substrings per call, and String.CompareTo(string) is culture-sensitive - confirm that is the intended ordering
328
+ }
331
329
332
- // Binary search for substring
333
- while ( r > l )
334
- {
335
- m = ( l + r ) / 2 ;
336
- if ( m_str . Substring ( m_sa [ m ] ) . CompareTo ( substr ) < 0 )
337
- {
338
- l = m + 1 ;
339
- }
340
- else
341
- {
342
- r = m ;
343
- }
344
- }
345
- if ( ( l == r ) && ( l < m_str . Length ) && ( m_str . Substring ( m_sa [ l ] ) . StartsWith ( substr ) ) )
346
- {
347
- return m_sa [ l ] ;
348
- }
349
- else
350
- {
351
- return - 1 ;
352
- }
330
+ public override string ToString ( ) // Returns the window's text
331
+ {
332
+ return m_str . Substring ( head , length ) ;
333
+ }
334
+ }
335
+
336
+ [ Serializable ]
337
+ internal class CharComparer : System . Collections . Generic . EqualityComparer < char > // Equality comparer for char that forwards to char's own Equals and GetHashCode
338
+ {
339
+ public override bool Equals ( char x , char y ) // True when x.Equals(y)
340
+ {
341
+ return x . Equals ( y ) ;
342
+ }
343
+
344
+ public override int GetHashCode ( char obj ) // Hash code as reported by obj itself
345
+ {
346
+ return obj . GetHashCode ( ) ;
347
+ }
348
+ }
349
+
350
+ [ Serializable ]
351
+ internal struct SuffixRank // Plain pair of two ints, head and rank
352
+ {
353
+ public int head ; // NOTE(review): presumably the start position of a suffix in the source string - confirm against the code that fills it
354
+ public int rank ; // the value SuffixRankComparer orders and equates by
355
+ }
356
+
357
+ [ Serializable ]
358
+ internal class SuffixRankComparer : IComparer < SuffixRank > // Orders SuffixRank values by their rank field only; head is ignored
359
+ {
360
+ public bool Equals ( SuffixRank x , SuffixRank y ) // True when the two ranks are equal. NOTE(review): IComparer<T> declares only Compare, so this looks like an extra helper - confirm whether anything calls it
361
+ {
362
+ return x . rank . Equals ( y . rank ) ;
363
+ }
364
+
365
+ public int Compare ( SuffixRank x , SuffixRank y ) // Result of x.rank.CompareTo(y.rank)
366
+ {
367
+ return x . rank . CompareTo ( y . rank ) ;
353
368
}
354
369
}
370
+ #endregion
355
371
}
356
372
0 commit comments