-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerator.py
127 lines (107 loc) · 4.91 KB
/
generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# A Text Generator AI that generates a text based on a previously learned text
# This is no model where classification accuracy is the optimization problem.
# This algorithm tries to generalize the dataset and generate new text.
# See: https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/
# author: Tobias Freundorfer
import numpy
import datetime
# Class that handles everything regarding Files (e.g. saving or loading objects)
from filehelper import FileHelper
# Class that handles the Preprocessing phase
from preprocessing import Preprocessing
# Class that handles the Training phase
from training import Training
# Imports for the NN model used
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
def generate_random_seed(X):
    """Pick a random training sequence to use as the text-generation seed.

    Parameters:
        X: list of integer-encoded input sequences from preprocessing.

    Returns:
        A copy of one randomly chosen sequence. A copy is returned so that
        the generation loop's in-place window updates cannot corrupt the
        original training data.

    Raises:
        ValueError: if X is empty.
    """
    if not X:
        raise ValueError('Cannot draw a seed from an empty dataset')
    # numpy.random.randint's upper bound is EXCLUSIVE, so the bound must be
    # len(X) — using len(X)-1 would make the last sequence unreachable and
    # crash on a single-sequence dataset.
    startIndex = numpy.random.randint(0, len(X))
    return list(X[startIndex])
def generate_text(desiredTextLength, int2charDict, vocabulary, seed, model):
    """Generate ``desiredTextLength`` characters from the trained model.

    Parameters:
        desiredTextLength: number of characters to generate.
        int2charDict: mapping from integer class index back to character.
        vocabulary: collection of distinct characters; its length is used to
            rescale inputs, mirroring the normalization applied at training.
        seed: integer-encoded starting sequence (the sliding input window).
            The caller's list is NOT modified.
        model: trained Keras model exposing ``predict``.

    Returns:
        The generated text as a single string.
    """
    print('Starting Text-Generation Phase...')
    start = datetime.datetime.now()
    # Work on a copy: the original code's first append mutated the caller's
    # list (a row of the training data).
    window = list(seed)
    chars = []
    vocabSize = float(len(vocabulary))
    for _ in range(desiredTextLength):
        # Shape to [samples, timesteps, features] and rescale to 0-1,
        # matching the preprocessing applied at training time.
        x = numpy.reshape(window, (1, len(window), 1))
        x = x / vocabSize
        # verbose=0: per-character progress bars just flood the console.
        prediction = model.predict(x, verbose=0)
        # Greedy decoding: take the class with the highest probability.
        predictedIndex = numpy.argmax(prediction)
        chars.append(str(int2charDict[predictedIndex]))
        # Slide the window forward by one character.
        window.append(predictedIndex)
        window = window[1:]
    deltaTime = datetime.datetime.now() - start
    print('Generation finished: %ds' % deltaTime.total_seconds())
    # Join once instead of quadratic string concatenation.
    return ''.join(chars)
def main():
    """Entry point: load or preprocess data, build/train the model, generate text.

    Driven entirely by 'config.json'; each phase (preprocessing, training,
    generation) can be toggled on or off via its ``exec_*`` config flag.
    """
    char2intDict = None
    int2charDict = None

    config = FileHelper.load_config('config.json')
    seq_length = config['preprocessing']['sequence_chars_length']
    checkpoints = config['preprocessing']['checkpoints']

    # Either reuse checkpointed preprocessing results or recompute them.
    if not config['preprocessing']['exec_preprocessing']:
        X = FileHelper.load_object_from_file(checkpoints['X_file'])
        Y = FileHelper.load_object_from_file(checkpoints['Y_file'])
        char2intDict = FileHelper.load_object_from_file(
            checkpoints['char2intDict_file'])
        int2charDict = FileHelper.load_object_from_file(
            checkpoints['int2charDict_file'])
    else:
        preprocessing = Preprocessing(config)
        X, Y, char2intDict, int2charDict = preprocessing.preprocess()
        # Checkpoint everything the load branch expects, including the two
        # dicts (previously only X and Y were saved, so a later run with
        # exec_preprocessing=false would read stale or missing dict files).
        FileHelper.save_object_to_file(checkpoints['X_file'], X)
        FileHelper.save_object_to_file(checkpoints['Y_file'], Y)
        FileHelper.save_object_to_file(
            checkpoints['char2intDict_file'], char2intDict)
        FileHelper.save_object_to_file(
            checkpoints['int2charDict_file'], int2charDict)

    # NOTE(review): the vocabulary checkpoint is read in both cases —
    # presumably Preprocessing.preprocess() writes it; verify.
    vocabulary = FileHelper.load_object_from_file(
        checkpoints['vocabulary_file'])

    # Keep the unshaped X: generation seeds are drawn from it later.
    X_unshaped = X
    # Reshape to the [samples, timesteps, features] layout the LSTM expects.
    X = numpy.reshape(X, (len(X), seq_length, 1))
    # Normalize/rescale all integers to range 0-1.
    X = X / float(len(vocabulary))
    # One-hot encode the categorical output variable (vector of zeros with a
    # single 1 per class).
    Y = np_utils.to_categorical(Y)

    training = Training(config)
    model = training.define_model(X, Y)
    if config['training']['exec_training']:
        model = training.train(X, Y, char2intDict, vocabulary, model)
    else:
        # Reuse previously trained weights instead of training again.
        model.load_weights(config['training']['load_weights_filename'])
        model.compile(loss='categorical_crossentropy', optimizer='adam')

    if config['generation']['exec_generation']:
        # Random training sequence as the starting point for generation.
        seed = generate_random_seed(X_unshaped)
        generatedText = generate_text(
            config['generation']['text_chars_length'], int2charDict,
            vocabulary, seed, model)
        # Persist the generated text under a timestamped filename.
        outputFilename = config['generation']['foldername'] + '/' + \
            datetime.datetime.now().strftime('%Y%m%d_%H_%M_%S') + '.txt'
        FileHelper.write_data(outputFilename, generatedText)


if __name__ == '__main__':
    main()