-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetrics.py
177 lines (139 loc) · 4.09 KB
/
metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# -*- coding: utf-8 -*-
from collections import defaultdict
def regularity(sequence):
"""
Compute the regularity of a sequence.
The regularity basically measures what percentage of a user's
visits are to a previously visited place.
Parameters
----------
sequence : list
A list of symbols.
Returns
-------
float
1 minus the ratio between unique and total symbols in the sequence.
"""
n = len(sequence)
n_unique = len(set(sequence))
if n_unique <= 1:
return 1.0
if n_unique == n:
return .0
return 1 - (n_unique / n)
def stationarity(sequence):
"""
Compute the stationarity of a sequence.
A stationary transition is one whose source and destination symbols
are the same. The stationarity measures the percentage of transitions
to the same location.
Parameters
----------
sequence : list
A list of symbols.
Returns
-------
float
Percentage of the sequence that is stationary.
"""
n = len(sequence)
n_unique = len(set(sequence))
if n <= 1:
return 1.0
if n == n_unique:
return .0
stationary_transitions = 0
for i in range(1, n):
if sequence[i - 1] == sequence[i]:
stationary_transitions += 1
return stationary_transitions / (n - 1)
def _suffix_array_manber_myers(s):
"""
Compute the suffix array of a string using the Manber-Meyers algorithm.
Reference: http://algorithmicalley.com/archive/2013/06/30/suffix-arrays.aspx
Parameters
----------
s : string
The input string.
Returns
-------
list
The suffix array of the input string.
"""
def sort_bucket(s, bucket, order):
d = defaultdict(list)
for i in bucket:
key = ''.join(s[i + order // 2:i + order])
d[key].append(i)
result = []
for k, v in sorted(d.items()):
if len(v) > 1:
result += sort_bucket(s, v, 2 * order)
else:
result.append(v[0])
return result
return sort_bucket(s, range(len(s)), 1)
def _kasai(s, sa):
"""
Computes the logest common prefix (LCP) array of a string given its suffix
array using Kasai's algorithm.
References:
- https://web.stanford.edu/class/cs166/lectures/03/Small03.pdf
- https://web.stanford.edu/class/archive/cs/cs166/cs166.1146/
Parameters
----------
s : string
The input string.
sa : list
The suffix array of the input string.
Returns
-------
list
The LCP array of the input string.
"""
n = len(s)
k = 0
lcp = [0] * n
rank = [0] * n
for i in range(n):
rank[sa[i]] = i
for i in range(n):
k = k - 1 if k > 0 else 0
if rank[i] == n - 1:
k = 0
continue
j = sa[rank[i] + 1]
while i + k < n and j + k < n and s[i + k] == s[j + k]:
k += 1
lcp[rank[i]] = k
return lcp
def diversity(sequence):
"""
Returns the ratio of distinct substrings over the total number of
substrings in the sequence. The number of distinct substrings is
computed using the LCP array. The total number of substrings is
computed using a closed-formula. A naive implementation, which is
O(n^2), would be too inefficient for large strings. This implementation
is O(n log n), where n is the size of the input sequence.
Parameters
----------
sequence : list
The input sequence of symbols, where each symbol is seen as a
character in a string.
Returns
-------
float
The ratio of distinct substrings over the total number of
substrings in the sequence
"""
n = len(sequence)
n_unique = len(set(sequence))
if n <= 1:
return 0.0
if n == n_unique:
return .0
total_substrs = (n * (n + 1)) // 2
suffix_array = _suffix_array_manber_myers(sequence)
lcp = _kasai(sequence, suffix_array)
distinct_substrs = total_substrs - sum(lcp)
return distinct_substrs / total_substrs