-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_utils.py
62 lines (44 loc) · 1.97 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# modules for data loading and data preparation
import numpy as np
def load_dataset(data_path, encoding='utf-8', max_chars=None):
    """Read a text corpus, lower-case it and derive its character vocabulary.

    Args:
        data_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to
            'utf-8'; pass 'latin-1' for corpora that are not valid UTF-8
            (e.g. the linux kernel dataset).
        max_chars: If not None, keep only the first ``max_chars`` characters
            of the lower-cased text (e.g. 2000000 for the linux kernel
            weights). The vocabulary is built from the truncated text.

    Returns:
        A tuple ``(vocab_chars, text_data)`` where ``vocab_chars`` is the
        sorted list of distinct characters in ``text_data`` and
        ``text_data`` is the lower-cased (and possibly truncated) text.
    """
    with open(data_path, encoding=encoding) as file:
        text_data = file.read().lower()

    # Truncate after lower-casing so the limit applies to the text that is
    # actually returned and tokenized.
    if max_chars is not None:
        text_data = text_data[:max_chars]

    # vocabulary character tokens, in a deterministic (sorted) order
    vocab_chars = sorted(set(text_data))
    return vocab_chars, text_data
# creates char to index mapping and reverse mapping
def create_mapping(vocab_chars):
    """Build the lookup tables between characters and their integer indices.

    Args:
        vocab_chars: Iterable of vocabulary characters; a character's index
            is its position in this iterable.

    Returns:
        A tuple ``(char_idx, idx_char)``: ``char_idx`` maps each character
        to its position and ``idx_char`` maps each position back to its
        character.
    """
    # forward table: character -> position
    char_to_position = {
        symbol: position for position, symbol in enumerate(vocab_chars)
    }
    # reverse table: position -> character
    position_to_char = dict(enumerate(vocab_chars))
    return char_to_position, position_to_char
# tokenize the text: split it into fixed-length input sentences, each paired with the character that follows it
def tokenize_split(text, Tx, step=3):
    """Slice ``text`` into fixed-length inputs and their next-character targets.

    Every input is a window of ``Tx`` consecutive characters and its target
    is the single character that immediately follows the window.

    Args:
        text: The full training text.
        Tx: Number of input timesteps, i.e. the length of every input window
            (e.g. 40).
        step: Distance between the starting positions of two consecutive
            windows. Defaults to 3.

    Returns:
        A tuple ``(X_input, Y_output)`` of equal-length lists where
        ``X_input[k]`` is a window of ``Tx`` characters and ``Y_output[k]``
        is the character following it. Both lists are empty when ``text``
        has no more than ``Tx`` characters.

    Raises:
        ValueError: If ``step`` is not a positive integer.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")

    # Stop at len(text) - Tx so text[start + Tx] (the target) always exists.
    starts = range(0, len(text) - Tx, step)
    X_input = [text[start: start + Tx] for start in starts]
    Y_output = [text[start + Tx] for start in starts]
    return X_input, Y_output
# for creating One hot encoded representation of data
def do_input_OHE(X_input, Y_output, Tx, vocab_chars, char_idx):
    """One-hot encode the input sentences and their target characters.

    Args:
        X_input: Sequence of input sentences, each at most ``Tx`` characters.
        Y_output: Sequence of target characters; ``Y_output[i]`` belongs to
            ``X_input[i]``.
        Tx: Number of input timesteps (second dimension of ``X``).
        vocab_chars: Vocabulary characters; its length is the size of the
            one-hot dimension.
        char_idx: Mapping from character to its one-hot index.

    Returns:
        A tuple ``(X, Y)`` of boolean arrays. ``X`` has shape
        ``(m, Tx, len(vocab_chars))`` and ``Y`` has shape
        ``(m, len(vocab_chars))``, where ``m = len(X_input)``.

    Raises:
        KeyError: If a character is missing from ``char_idx``.
        IndexError: If a sentence is longer than ``Tx`` or ``Y_output`` is
            shorter than ``X_input``.
    """
    # no. of training examples
    m = len(X_input)
    vocab_size = len(vocab_chars)

    # The builtin ``bool`` is used as dtype: the ``np.bool`` alias was
    # deprecated in NumPy 1.20 and removed in 1.24.
    X = np.zeros((m, Tx, vocab_size), dtype=bool)
    Y = np.zeros((m, vocab_size), dtype=bool)

    # loop over every sentence
    for i, sentence in enumerate(X_input):
        # mark the character seen at each timestep
        for timestep, char in enumerate(sentence):
            X[i, timestep, char_idx[char]] = True
        # mark the character that follows the sentence
        Y[i, char_idx[Y_output[i]]] = True
    return X, Y