import numpy as np
import pickle
import scipy.io as sio
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import WordNetLemmatizer
import re


def getAllWords(d):
    """Lowercase the text, keep only letters and spaces, and index each word's positions."""
    # strip non-alphabetic characters and split on spaces
    text_filtered = re.sub(r'[^a-z ]', '', d.lower()).split(" ")
    # drop the empty tokens produced by consecutive spaces
    text_filtered = [w for w in text_filtered if w != '']
    # map each word to the list of positions where it occurs
    word_dict = {}
    for i, w in enumerate(text_filtered):
        if w not in word_dict:
            word_dict[w] = [i]
        else:
            word_dict[w].append(i)
    return word_dict, text_filtered


def countCorrectWords(guess, truth):
    """Return the fraction of words in `guess` that also occur in `truth`."""
    # apply the same filtering as getAllWords to the generated text
    filtered_guess = re.sub(r'[^a-z ]', '', guess.lower()).split(" ")
    filtered_guess = [w for w in filtered_guess if w != '']
    num_correct_words = sum([1 if w in truth else 0 for w in filtered_guess])
    total_words = len(filtered_guess)
    # res = "Valid words generated {} out of {}".format(num_correct_words, total_words)
    return num_correct_words / total_words


def bleu(guess, truth, word_dict):
    """Modified BLEU-style score: squared longest-match lengths, averaged over the guess."""
    def match_seq_len(guess_word_array, truth_word_array):
        # length of the common prefix of the two word sequences
        match_count = 0
        j = 0
        while (j < len(guess_word_array) and j < len(truth_word_array)
               and guess_word_array[j] == truth_word_array[j]):
            match_count += 1
            j += 1
        return match_count

    # apply the same filtering as getAllWords to the generated text
    filtered_guess = re.sub(r'[^a-z ]', '', guess.lower()).split(" ")
    filtered_guess = [w for w in filtered_guess if w != '']
    n_grams = []
    for i in range(len(filtered_guess)):
        n_gram_curr_word = 0
        guess_to_match = filtered_guess[i:]
        if filtered_guess[i] in word_dict:
            # try every position where this word occurs in the reference text
            for m in word_dict[filtered_guess[i]]:
                truth_to_match = truth[m:]
                tmp = match_seq_len(guess_to_match, truth_to_match)
                if tmp > n_gram_curr_word:
                    n_gram_curr_word = tmp
        n_grams.append(n_gram_curr_word)
    return np.sum(np.square(np.array(n_grams))) / len(filtered_guess)
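

# Illustrative only: a small self-check of the word-overlap metrics above.
# This helper is not part of the original module; the example strings are
# made up and the call sequence is only a sketch of how getAllWords,
# countCorrectWords, and bleu are meant to be combined.
def _demoWordMetrics():
    reference = "the cat sat on the mat."
    generated = "the cat sat on a hat."
    word_dict, truth_words = getAllWords(reference)
    frac_valid = countCorrectWords(generated, truth_words)  # fraction of generated words seen in the reference
    score = bleu(generated, truth_words, word_dict)          # modified BLEU-style overlap score
    print("valid-word fraction:", frac_valid)
    print("bleu-style score:", score)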


def getCharData(fpath: str) -> list:
    # read text file
    with open(fpath, 'r') as fo:
        data = fo.readlines()
    # get sentences
    sentences = ''.join(data).split('.')
    # get data container
    data = []
    # define charDrop list
    dropChars = ['3', '¤', '#', '&']
    for sentence in sentences:
        sentence = list(sentence)
        sentence = [char for char in sentence if char not in dropChars]
        data.append(sentence)
    return data


def getTextData(fpath: str) -> list:
    """Read a text file and return a list of sentences as filtered, stemmed word tokens."""
    # read text file
    with open(fpath, 'r') as fo:
        data = fo.readlines()
    # get sentences
    sentences = ''.join(data).split('.')
    data = []
    # build the stopword list and stemmer once, outside the sentence loop
    stops = stopwords.words('english')
    stemmer = SnowballStemmer('english')
    for sentence in sentences:
        # split lines into tokens
        sentence = word_tokenize(''.join(sentence))
        # keepList = ['.', ',', ':', ';', '\\n']
        keepList = []
        sentence = [word for word in sentence if word.isalpha() or word in keepList]
        # remove stopwords
        sentence = [word for word in sentence if word not in stops]
        # stem words
        sentence = [stemmer.stem(word) for word in sentence]
        # # lemmatize words
        # wnl = WordNetLemmatizer()
        # sentence = [wnl.lemmatize(word.lower()) for word in sentence]
        data.append(sentence)
    return data


def readData(fpath: str) -> object:
    with open(fpath, 'r') as fo:
        data = fo.read()
    # with open(fpath, 'r') as fo:
    #     data = fo.readlines()
    # # split lines into words and words into chars
    # data = [char
    #         for line in data
    #         for word in list(line)
    #         for char in list(word)
    #         ]
    return data


def prepareData(data: object) -> tuple:
    # map integer keys to the unique characters in the data, plus the inverse mapping
    uniqueChars = set(data)
    keyToChar = dict(enumerate(uniqueChars))
    # keyToChar = dict(enumerate(np.unique(data)))
    charToKey = {val: key for key, val in keyToChar.items()}
    return keyToChar, charToKey


def generateSequences(data: np.array, seq_length: int) -> tuple:
    # slide a window of seq_length + 1 characters over the data
    X = []
    for i in range(len(data) - seq_length - 1):
        X.append(data[i:i+seq_length+1])
    # hold out the last 10% of sequences for validation
    train_frac = 0.9
    train_n = int(len(X) * train_frac)
    X_train = X[:train_n]
    X_val = X[train_n:]
    return X_train, X_val


def sigmoid(S: np.array) -> np.array:
    """
    Parameters
    ----------
    S : dxN score matrix

    Returns
    -------
    S : dxN score matrix w. applied sigmoid activation
    """
    return 1 / (1 + np.exp(-S))


def softMax(S: np.array, temperature=1.0) -> np.array:
    """
    Parameters
    ----------
    S : dxN score matrix
    temperature : scales the variance of the output probability distribution

    Returns
    -------
    S : dxN score matrix w. applied softmax activation
    """
    S = S / temperature
    S = np.exp(S)
    return S / np.sum(S, axis=0)


def oneHotEncode(k: np.array) -> np.array:
    """
    Parameters
    ----------
    k : Nx1 label vector

    Returns
    -------
    Y : NxK one-hot encoded label matrix
    """
    numCats = np.max(k)
    return np.array([
        [1 if idx == label else 0 for idx in range(numCats + 1)]
        for label in k
    ])


def oneHotEncode_v2(k: int, K: int) -> np.array:
    """
    Parameters
    ----------
    k : label
    K : category size

    Returns
    -------
    y : 1xK one-hot encoded label matrix
    """
    y = np.zeros(shape=(1, K))
    y[0, k] = 1
    return y
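

# Illustrative only: a rough sketch of how the character-level helpers above
# are meant to fit together. The file path 'data/text.txt' and the seq_length
# of 25 are assumptions for the example, not values taken from the original
# project.
if __name__ == '__main__':
    text = readData('data/text.txt')             # assumed path to a plain-text corpus
    keyToChar, charToKey = prepareData(text)
    K = len(keyToChar)                           # number of unique characters
    keys = [charToKey[c] for c in text]          # text as a sequence of integer labels
    X_train, X_val = generateSequences(keys, seq_length=25)
    # one-hot encode the first training sequence, one character at a time
    first_seq = np.concatenate([oneHotEncode_v2(k, K) for k in X_train[0]], axis=0)
    print(first_seq.shape)                       # (seq_length + 1, K)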