ngrams.py
import collections
from os import path

import numpy as np

## Stop words are stored in the repository at the same location as this file:
stopword_file = path.join(path.dirname(path.abspath(__file__)),
                          'stopwords.txt')
with open(stopword_file, 'r') as f:
    STOP_WORDS = f.read().split()
def renormalize(data, newrange, oldrange=None):
    """
    Linearly rescales data to lie between newmin and newmax,
    where newrange = (newmin, newmax). oldrange defaults to
    the min/max of data. This is similar to MATLAB's mat2gray,
    but with no clipping.
    """
    (newmin, newmax) = newrange
    data = data.astype('float64')
    if oldrange is None:
        (oldmin, oldmax) = (np.min(data), np.max(data))
    else:
        (oldmin, oldmax) = oldrange
    slope = (newmin - newmax) / (oldmin - oldmax)
    out = slope * (data - oldmin) + newmin
    return out
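# Usage sketch (not in the original file; assumes a NumPy array as input):
#   >>> renormalize(np.array([0., 5., 10.]), (0., 1.))
#   array([0. , 0.5, 1. ])
# With oldrange left at its default, the data's own min/max (0 and 10) are used.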
class WordCounter(collections.Counter):
    """A Counter with convenience methods for cleaning up word counts."""

    def remove_below_threshold(self, threshold):
        # Return a copy with words occurring <= threshold times removed.
        new = self.copy()
        for word in self:
            if self[word] <= threshold:
                new.pop(word)
        return new

    def remove_stopwords(self, stopwords=STOP_WORDS):
        # Return a copy with stop words removed.
        new = self.copy()
        for word in self:
            if word in stopwords:
                new.pop(word)
        return new

    def top_N(self, N):
        # The N most common (word, count) pairs, sorted alphabetically by word.
        top = self.most_common(N)
        return sorted(top)

    # collections.Counter's versions of these are not actually
    # in place (and are therefore too slow).
    def __iadd__(self, other):
        for word in other:
            self[word] += other[word]
        return self

    def __isub__(self, other):
        for word in other:
            self[word] -= other[word]
            assert self[word] >= 0
            if self[word] == 0:
                self.pop(word)
        return self
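

# Illustrative usage sketch (not part of the original module). The sample text
# is made up, and the exact output depends on the contents of stopwords.txt.
if __name__ == '__main__':
    counts = WordCounter('the cat sat on the mat the cat slept'.split())
    counts += WordCounter(['cat', 'nap'])   # in-place merge via __iadd__
    counts -= WordCounter(['nap'])          # in-place removal via __isub__
    cleaned = counts.remove_stopwords().remove_below_threshold(1)
    print(cleaned.top_N(5))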