# -*- coding: utf-8 -*-
"""generate_text.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/13yr0w4ltEw4qVL3PemzjukyMjCSzxRgZ
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
#importing keras modules
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding,Dropout,GRU,TimeDistributed
from tensorflow.keras.losses import sparse_categorical_crossentropy
url="https://en.wikipedia.org/wiki/Machine_learning"
import urllib.request
from bs4 import BeautifulSoup
import requests
text = requests.get(url).content.decode('utf-8')
print(text[:1000])
from html.parser import HTMLParser
#simple HTML parser that keeps only the visible text and skips
#everything inside <script> and <style> tags
class MyHTMLParser(HTMLParser):
    script = False
    res = ""
    def handle_starttag(self, tag, attrs):
        if tag.lower() in ["script", "style"]:
            self.script = True
    def handle_endtag(self, tag):
        if tag.lower() in ["script", "style"]:
            self.script = False
    def handle_data(self, data):
        if str.strip(data) == "" or self.script:
            return
        self.res += ' ' + data.replace('[ edit ]', '')
parser = MyHTMLParser()
parser.feed(text)
text = parser.res
#strip non-ASCII characters from the parsed text
encoded_string = text.encode("ascii", "ignore")
text = encoded_string.decode()
print(text[:1000])
#save the cleaned text to disk and read it back
text_file = open("file.txt", "w")
text_file.write(text)
text_file.close()
#text = open('file.txt','r').read()
#path_to_file = 'file.txt'
text = open('file.txt','r').read()
print(text[:105])
seq_len = 24
#build the character vocabulary and the lookup tables in both directions
vocab = sorted(set(text))
vocab_size = len(vocab) #used later when building the model
char_to_ind = {char: ind for ind, char in enumerate(vocab)}
ind_to_char = np.array(vocab)
#encode the full text as an array of character indices
encoded_text = np.array([char_to_ind[c] for c in text])
total_num_seq = len(text) // (seq_len + 1)
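#Added illustrative sanity check (my sketch, not from the original notebook):
#round-trip a short string through the lookup tables to confirm that
#char_to_ind and ind_to_char are inverses of each other.
sample = "machine"
sample_ids = [char_to_ind[c] for c in sample]
print(sample_ids)
print("".join(ind_to_char[sample_ids])) #should print "machine" again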
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)
sequences = char_dataset.batch(seq_len+1, drop_remainder=True)
#split each (seq_len + 1)-character chunk into an input sequence and a
#target sequence shifted one character to the right
def create_seq_targets(seq):
    input_text = seq[:-1]
    target_text = seq[1:]
    return input_text, target_text
dataset = sequences.map(create_seq_targets)
#inspect one (input, target) pair
for input_text, target_text in dataset.take(1):
    print(input_text.numpy())
    print("".join(ind_to_char[input_text.numpy()]))
    print('\n')
    print(target_text.numpy())
    print("".join(ind_to_char[target_text.numpy()]))
for input_txt, target_txt in dataset.take(1):
    print(''.join([ind_to_char[i] for i in np.array(input_txt)]))
    print('\n')
    print(''.join([ind_to_char[i] for i in np.array(target_txt)]))
#batched:
batch_size = 128 #number of sequence tuples in each batch
buffer_size = 10000 #shuffle this many sequences in the dataset
#first shuffle the dataset and divide it into batches
#drop the last sequences < batch_size
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
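#Added illustrative note (shapes inferred from the parameters above): after
#shuffling and batching, each element of `dataset` is a pair of integer
#tensors of shape (batch_size, seq_len) = (128, 24) holding the input
#indices and the one-step-shifted target indices.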
#training
seq_len = 100
total_num_seq = len(text)//(seq_len+1)
print('Total Number of Sequences: ', total_num_seq)
#Create training sequences
#tf.data.Dataset.from_tensor_slices function converts a text vector
#into a stream of character indices
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)
for i in char_dataset.take(500):
    print(ind_to_char[int(i)], end="")
#batch method converts these individual character calls into sequences
#which we can feed in as a batch
#we use seq_len+1 because we will use seq_len characters
#and shift them one step forward
#drop_remainder drops the trailing characters that do not fill a full sequence
sequences = char_dataset.batch(seq_len+1, drop_remainder=True)
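#Added illustrative sketch (toy values, not from the original notebook):
#for a toy chunk [10, 11, 12, 13], create_seq_targets yields the pair
#([10, 11, 12], [11, 12, 13]), i.e. the same characters shifted one step,
#so at every position the model learns to predict the next character.
toy_chunk = tf.constant([10, 11, 12, 13])
print(create_seq_targets(toy_chunk))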
import re
batch_size = 128 #number of sequence tuples in each batch
buffer_size = 10000 #shuffle this many sequences in the dataset
#re-map the longer sequences into (input, target) pairs, then shuffle the
#dataset and divide it into batches, dropping the last incomplete batch
dataset = sequences.map(create_seq_targets)
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
#x=re.sub("\s","",text)
#x=re.findall("\D",x)
#print(x,end=" ")
"""
#Get all the unique characters
vocab = sorted(set(x))
vocab_size = len(vocab)
print(vocab)
print('Total unique characters: ', vocab_size)
l = 0
for i in dataset:
    l += 2
l
"""
#total batches
l = 0
for i in dataset:
    l += 1
print('Total Batches:', l)
print('Sequences in each batch: ', batch_size)
print('Characters in each sequence:', seq_len)
print('Characters in dataset: ', len(list(text)))
#using sparse_categorical_crossentropy because
#our targets are integer character indices and not one-hot encodings
#we need to define a custom loss function so that we can set
#the from_logits parameter to True
def customize_loss(y_true, y_pred):
    return sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
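#Added illustrative sketch (toy values, mine, not from the notebook):
#the final Dense layer in the model below has no softmax, so it outputs
#raw logits; from_logits=True tells the loss to apply the softmax itself.
toy_logits = tf.constant([[[0.1, 0.3, 2.0]]]) #shape (batch, time, vocab)
toy_target = tf.constant([[2]])               #correct class is index 2
print(customize_loss(toy_target, toy_logits)) #small loss: index 2 has the largest logit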
def create_model(batch_size):
    vocab_size_func = vocab_size
    embed_dim = 64 #the embedding dimension
    rnn_neurons = 1024 #number of rnn units
    batch_size_func = batch_size
    model = Sequential()
    model.add(Embedding(vocab_size_func,
                        embed_dim,
                        batch_input_shape=[batch_size_func, None]))
    model.add(GRU(rnn_neurons,
                  return_sequences=True,
                  stateful=True,
                  recurrent_initializer='glorot_uniform'))
    #model.add(LSTM(rnn_neurons, return_sequences=True, stateful=True,
    #              recurrent_initializer='glorot_uniform'))
    #model.add(GRU(300, return_sequences=True,
    #              stateful=True, recurrent_initializer='glorot_uniform'))
    #model.add(GRU(rnn_neurons, return_sequences=True,
    #              stateful=True, recurrent_initializer='glorot_uniform'))
    #the final Dense layer outputs one logit per vocabulary character
    model.add(Dense(vocab_size_func))
    model.compile(optimizer='adam', loss=customize_loss)
    return model
#note: since the model is untrained at this point, it will generate random characters
#dataset.take(1) contains 1 batch = 128 sequence tuples
#for every character in each input sequence the model outputs a vector of
#logits over the vocab_size characters in the vocabulary
model = create_model(batch_size)
model.summary()
for ex_input, ex_target in dataset.take(1):
    ex_pred = model(ex_input)
    print(ex_pred.shape)
#sample from the character logits to get integer indices
sampled_indices = tf.random.categorical(ex_pred[0], num_samples=1)
#map those integers back to characters
char_pred = ''.join([ind_to_char[int(i)] for i in sampled_indices])
print(char_pred)
for ex_input, ex_target in dataset.take(1):
    ex_pred = model(ex_input)
    print(ex_pred.shape)
#dataset.take(1)
model.fit(dataset, epochs=30, verbose=1)
model.save('generate.h5')
# importing load_model to load the keras model
from tensorflow.keras.models import load_model
#create a new model with a batch size of 1
model = create_model(batch_size=1)
#load the weights from the previous model to our new model
model.load_weights('generate.h5')
#build the model
model.build(tf.TensorShape([1, None]))
#view model summary
print(model.summary())
def generate_text(model_, start_seed, gen_size=100, temp=1.0):
    num_generate = gen_size
    #encode the seed string as character indices and add a batch dimension
    input_ = [char_to_ind[s] for s in start_seed]
    input_ = tf.expand_dims(input_, 0) #shape (1, len(start_seed))
    text_generated = []
    temperature = temp
    model_.reset_states()
    for i in range(num_generate):
        predictions = model_(input_)
        predictions = tf.squeeze(predictions, 0)
        #lower temperature makes the sampling more conservative
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        #feed the sampled character back in as the next input
        input_ = tf.expand_dims([predicted_id], 0)
        text_generated.append(ind_to_char[predicted_id]) #predicting and joining
    return (start_seed + "".join(text_generated))
#generate text from a seed string
#note that this output is not part of the dataset
#but completely auto generated
auto_text = generate_text(model, 'machine learning', gen_size=1000)
print(auto_text)
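#Added illustrative usage (my example, not from the original notebook):
#the temp argument controls sampling randomness; lower values stay closer
#to the most likely next character, higher values produce more varied text.
print(generate_text(model, 'machine learning is ', gen_size=300, temp=0.5))
print(generate_text(model, 'machine learning is ', gen_size=300, temp=1.5))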
#download the saved model
#model.save('shakespeare.h5')
#from IPython.display import FileLink
#FileLink(r'shakespeare.h5')
def customize_loss(y_true, y_pred):
    return sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
#alternative, deeper model builder (stacked LSTM + GRU layers);
#defined here but not used for the generation calls below
def create_deep_model(Input_dimen, out_dimen, batch_size):
    model = Sequential()
    model.add(Embedding(input_dim=Input_dimen, output_dim=out_dimen,
                        batch_input_shape=[batch_size, None]))
    model.add(LSTM(1024, return_sequences=True, stateful=True,
                   recurrent_initializer='glorot_uniform'))
    model.add(GRU(500, return_sequences=True,
                  stateful=True, recurrent_initializer='glorot_uniform'))
    model.add(GRU(300, return_sequences=True,
                  stateful=True, recurrent_initializer='glorot_uniform'))
    model.add(TimeDistributed(Dense(Input_dimen)))
    model.compile(optimizer='adam', loss=customize_loss)
    return model
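#Added illustrative note (my sketch, not from the original notebook): the deeper
#builder above is never trained in this script, so the generation calls below
#still use the single-GRU model trained earlier. To try the deeper architecture,
#you would build it with the vocabulary size and train it the same way, e.g.:
#deep_model = create_deep_model(vocab_size, 64, batch_size)
#deep_model.fit(dataset, epochs=30)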
auto_text = generate_text(model, 'machine learning', gen_size = 100)
print(auto_text)
print(generate_text(model, 'Son', gen_size=1500))