-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
98 lines (78 loc) · 3.97 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from filehelper import FileHelper
import datetime
class Preprocessing:
    """Turn a raw text corpus into integer-encoded training patterns.

    The instance is driven by a nested configuration mapping. The methods
    read the following entries of ``config['preprocessing']``:

    * ``input_file`` -- path handed to ``FileHelper.read_data_lower``.
    * ``sequence_chars_length`` -- length of each input window.
    * ``checkpoints`` -- mapping with the keys ``vocabulary_file``,
      ``char2intDict_file`` and ``int2charDict_file``, each handed to
      ``FileHelper.save_object_to_file`` as the save target.
    """

    def __init__(self, config):
        """Store the configuration mapping used by the other methods.

        Args:
            config: Nested mapping as described in the class docstring.
        """
        self.config = config

    @staticmethod
    def _build_vocabulary(data):
        """Return the unique elements of *data* as a sorted list.

        Both dictionary generators derive their mapping from this single
        helper so that the two mappings are guaranteed to be inverses of
        each other for the same input.
        """
        # A set removes duplicates; sorting makes the index assignment
        # deterministic across runs.
        return sorted(set(data))

    def generate_char_to_int_dictionary(self, data):
        """Create the mapping of unique characters to unique integers.

        As a side effect the sorted vocabulary is saved through
        ``FileHelper.save_object_to_file`` to the configured
        ``vocabulary_file`` and a summary line is printed.

        Args:
            data: Iterable of hashable, mutually orderable items
                (typically a string) that also supports ``len()``.

        Returns:
            dict mapping each unique character to its index in the sorted
            vocabulary.
        """
        vocab = self._build_vocabulary(data)
        FileHelper.save_object_to_file(
            self.config['preprocessing']['checkpoints']['vocabulary_file'],
            vocab)

        chars_len = len(data)
        vocab_len = len(vocab)
        print('Input data consists of %d Total Characters and a Vocabular of %d Characters' % (
            chars_len, vocab_len))

        return {character: index for index, character in enumerate(vocab)}

    def generate_int_to_char_dictionary(self, data):
        """Create the mapping of unique integers to unique characters.

        This is the inverse of the mapping produced by
        ``generate_char_to_int_dictionary`` for the same *data*.

        Args:
            data: Iterable of hashable, mutually orderable items
                (typically a string).

        Returns:
            dict mapping each index in the sorted vocabulary to its
            character.
        """
        vocab = self._build_vocabulary(data)
        return dict(enumerate(vocab))

    def clean_data_map(self, map):
        """Return the data map cleaned of unwanted characters.

        Cleaning is not implemented yet: the argument is returned
        unchanged (the very same object).

        Args:
            map: The data map to clean. The name shadows the ``map``
                builtin but is kept so keyword callers keep working.

        Returns:
            The object that was passed in.
        """
        # TODO Remove unwanted characters from map and reevaluate how well
        # the algorithm performs afterwards
        return map

    def generate_training_patterns(self, data, char2intDict, saveToFile=False):
        """Generate the input/output training pairs encoded as integers.

        A window of ``sequence_chars_length`` items slides over *data* one
        index at a time. Each window becomes one input pattern and the
        item directly after the window becomes the matching output.

        Args:
            data: Sliceable sequence (typically a string) to encode.
            char2intDict: Mapping from each item of *data* to its integer.
            saveToFile: Currently unused; kept for interface compatibility.

        Returns:
            Tuple ``(X, Y)``: ``X`` is a list of integer lists (one per
            window) and ``Y`` the list of integers that follow each window.
            Both are empty when *data* is not longer than the window.

        Raises:
            ValueError: If the configured sequence length is smaller
                than 1.
            KeyError: If an item of *data* is missing from *char2intDict*.
        """
        seq_length = self.config['preprocessing']['sequence_chars_length']
        if seq_length < 1:
            # A zero length yields empty inputs and a negative one makes the
            # output index wrap around to the end of the data; both would
            # silently produce meaningless patterns.
            raise ValueError(
                'sequence_chars_length must be at least 1, got %r'
                % (seq_length,))

        pattern_count = max(len(data) - seq_length, 0)

        # Encode every item once instead of once per window it appears in.
        # When at least one pattern exists the windows plus their outputs
        # cover all of data, so this looks up exactly the same items.
        encoded = [char2intDict[character] for character in data] if pattern_count else []

        # The input: one window of integers per start index
        X = [encoded[start:start + seq_length] for start in range(pattern_count)]
        # The output: the integer that directly follows each window
        Y = encoded[seq_length:seq_length + pattern_count]

        print('Generated %d Patterns from data' % len(X))
        # TODO Save to file so that I don't need to recalculate this over
        # and over again
        return X, Y

    def preprocess(self):
        """Execute the preprocessing which generates the data for learning.

        Reads the configured input file through
        ``FileHelper.read_data_lower`` (presumably lower-cased text --
        confirm against FileHelper), builds both lookup dictionaries, saves
        them to their configured checkpoint files and generates the
        training patterns. Progress and elapsed time are printed.

        Returns:
            Tuple ``(X, Y, char2intDict, int2CharDict)``.
        """
        print('Starting Preprocessing Phase...')
        start = datetime.datetime.now()

        checkpoints = self.config['preprocessing']['checkpoints']
        raw_data = FileHelper.read_data_lower(
            self.config['preprocessing']['input_file'])

        # Model the characters as integers
        char2intDict = self.generate_char_to_int_dictionary(raw_data)
        FileHelper.save_object_to_file(
            checkpoints['char2intDict_file'], char2intDict)

        int2CharDict = self.generate_int_to_char_dictionary(raw_data)
        FileHelper.save_object_to_file(
            checkpoints['int2charDict_file'], int2CharDict)

        # Generate the text patterns
        X, Y = self.generate_training_patterns(raw_data, char2intDict)

        deltaTime = datetime.datetime.now() - start
        print('Preprocessing finished: %ds' % deltaTime.total_seconds())
        return X, Y, char2intDict, int2CharDict