# -*- coding: utf-8 -*-
"""generate_text.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/13yr0w4ltEw4qVL3PemzjukyMjCSzxRgZ
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
#importing keras modules
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding,Dropout,GRU,TimeDistributed
from tensorflow.keras.losses import sparse_categorical_crossentropy
url="https://en.wikipedia.org/wiki/Machine_learning"
import urllib.request
from bs4 import BeautifulSoup
import requests
text = requests.get(url).content.decode('utf-8')
print(text[:1000])
from html.parser import HTMLParser
#simple HTML parser that keeps only the visible text and skips
#everything inside <script> and <style> tags
class MyHTMLParser(HTMLParser):
    script = False
    res = ""
    def handle_starttag(self, tag, attrs):
        if tag.lower() in ["script", "style"]:
            self.script = True
    def handle_endtag(self, tag):
        if tag.lower() in ["script", "style"]:
            self.script = False
    def handle_data(self, data):
        if str.strip(data) == "" or self.script:
            return
        self.res += ' ' + data.replace('[ edit ]', '')
parser = MyHTMLParser()
parser.feed(text)
text = parser.res
#strip non-ASCII characters from the parsed text
encoded_string = text.encode("ascii", "ignore")
text = encoded_string.decode()
print(text[:1000])
#save the cleaned text to disk and read it back
text_file = open("file.txt", "w")
text_file.write(text)
text_file.close()
#text = open('file.txt','r').read()
#path_to_file = 'file.txt'
text = open('file.txt','r').read()
print(text[:105])
seq_len = 24
#build the character vocabulary and the lookup tables in both directions
vocab = sorted(set(text))
vocab_size = len(vocab) #used later when building the model
char_to_ind = {char: ind for ind, char in enumerate(vocab)}
ind_to_char = np.array(vocab)
#encode the full text as an array of character indices
encoded_text = np.array([char_to_ind[c] for c in text])
total_num_seq = len(text) // (seq_len + 1)
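#Added illustrative sanity check (my sketch, not from the original notebook):
#round-trip a short string through the lookup tables to confirm that
#char_to_ind and ind_to_char are inverses of each other.
sample = "machine"
sample_ids = [char_to_ind[c] for c in sample]
print(sample_ids)
print("".join(ind_to_char[sample_ids])) #should print "machine" again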
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)
sequences = char_dataset.batch(seq_len+1, drop_remainder=True)
#split each (seq_len + 1)-character chunk into an input sequence and a
#target sequence shifted one character to the right
def create_seq_targets(seq):
    input_text = seq[:-1]
    target_text = seq[1:]
    return input_text, target_text
dataset = sequences.map(create_seq_targets)
#inspect one (input, target) pair
for input_text, target_text in dataset.take(1):
    print(input_text.numpy())
    print("".join(ind_to_char[input_text.numpy()]))
    print('\n')
    print(target_text.numpy())
    print("".join(ind_to_char[target_text.numpy()]))
for input_txt, target_txt in dataset.take(1):
    print(''.join([ind_to_char[i] for i in np.array(input_txt)]))
    print('\n')
    print(''.join([ind_to_char[i] for i in np.array(target_txt)]))
#batched:
batch_size = 128 #number of sequence tuples in each batch
buffer_size = 10000 #shuffle this many sequences in the dataset
#first shuffle the dataset and divide it into batches
#drop the last sequences < batch_size
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
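#Added illustrative note (shapes inferred from the parameters above): after
#shuffling and batching, each element of `dataset` is a pair of integer
#tensors of shape (batch_size, seq_len) = (128, 24) holding the input
#indices and the one-step-shifted target indices.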
#training
seq_len = 100
total_num_seq = len(text)//(seq_len+1)
print('Total Number of Sequences: ', total_num_seq)
#Create training sequences
#tf.data.Dataset.from_tensor_slices function converts a text vector
#into a stream of character indices
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)
for i in char_dataset.take(500):
    print(ind_to_char[int(i)], end="")
#batch method converts these individual character calls into sequences
#which we can feed in as a batch
#we use seq_len+1 because we will use seq_len characters
#and shift them one step forward
#drop_remainder drops the trailing characters that do not fill a full sequence
sequences = char_dataset.batch(seq_len+1, drop_remainder=True)
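#Added illustrative sketch (toy values, not from the original notebook):
#for a toy chunk [10, 11, 12, 13], create_seq_targets yields the pair
#([10, 11, 12], [11, 12, 13]), i.e. the same characters shifted one step,
#so at every position the model learns to predict the next character.
toy_chunk = tf.constant([10, 11, 12, 13])
print(create_seq_targets(toy_chunk))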
import re
batch_size = 128 #number of sequence tuples in each batch
buffer_size = 10000 #shuffle this many sequences in the dataset
#re-map the longer sequences into (input, target) pairs, then shuffle the
#dataset and divide it into batches, dropping the last incomplete batch
dataset = sequences.map(create_seq_targets)
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
#x=re.sub("\s","",text)
#x=re.findall("\D",x)
#print(x,end=" ")
"""
#Get all the unique characters
vocab = sorted(set(x))
vocab_size = len(vocab)
print(vocab)
print('Total unique characters: ', vocab_size)
l = 0
for i in dataset:
    l += 2
l
"""
#total batches
l = 0
for i in dataset:
    l += 1
print('Total Batches:', l)
print('Sequences in each batch: ', batch_size)
print('Characters in each sequence:', seq_len)
print('Characters in dataset: ', len(list(text)))
#using sparse_categorical_crossentropy because
#our targets are integer character indices and not one-hot encodings
#we need to define a custom loss function so that we can set
#the from_logits parameter to True
def customize_loss(y_true, y_pred):
    return sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
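#Added illustrative sketch (toy values, mine, not from the notebook):
#the final Dense layer in the model below has no softmax, so it outputs
#raw logits; from_logits=True tells the loss to apply the softmax itself.
toy_logits = tf.constant([[[0.1, 0.3, 2.0]]]) #shape (batch, time, vocab)
toy_target = tf.constant([[2]])               #correct class is index 2
print(customize_loss(toy_target, toy_logits)) #small loss: index 2 has the largest logit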
def create_model(batch_size):
    vocab_size_func = vocab_size
    embed_dim = 64 #the embedding dimension
    rnn_neurons = 1024 #number of rnn units
    batch_size_func = batch_size
    model = Sequential()
    model.add(Embedding(vocab_size_func,
                        embed_dim,
                        batch_input_shape=[batch_size_func, None]))
    model.add(GRU(rnn_neurons,
                  return_sequences=True,
                  stateful=True,
                  recurrent_initializer='glorot_uniform'))
    #model.add(LSTM(rnn_neurons, return_sequences=True, stateful=True,
    #              recurrent_initializer='glorot_uniform'))
    #model.add(GRU(300, return_sequences=True,
    #              stateful=True, recurrent_initializer='glorot_uniform'))
    #model.add(GRU(rnn_neurons, return_sequences=True,
    #              stateful=True, recurrent_initializer='glorot_uniform'))
    #the final Dense layer outputs one logit per vocabulary character
    model.add(Dense(vocab_size_func))
    model.compile(optimizer='adam', loss=customize_loss)
    return model
#note: since the model is untrained at this point, it will generate random characters
#dataset.take(1) contains 1 batch = 128 sequence tuples
#for every character in each input sequence the model outputs a vector of
#logits over the vocab_size characters in the vocabulary
model = create_model(batch_size)
model.summary()
for ex_input, ex_target in dataset.take(1):
    ex_pred = model(ex_input)
    print(ex_pred.shape)
#sample from the character logits to get integer indices
sampled_indices = tf.random.categorical(ex_pred[0], num_samples=1)
#map those integers back to characters
char_pred = ''.join([ind_to_char[int(i)] for i in sampled_indices])
print(char_pred)
for ex_input, ex_target in dataset.take(1):
    ex_pred = model(ex_input)
    print(ex_pred.shape)
#dataset.take(1)
model.fit(dataset, epochs=30, verbose=1)
model.save('generate.h5')
# importing load_model to load the keras model
from tensorflow.keras.models import load_model
#create a new model with a batch size of 1
model = create_model(batch_size=1)
#load the weights from the previous model to our new model
model.load_weights('generate.h5')
#build the model
model.build(tf.TensorShape([1, None]))
#view model summary
print(model.summary())
def generate_text(model_, start_seed, gen_size=100, temp=1.0):
    num_generate = gen_size
    #encode the seed string as character indices and add a batch dimension
    input_ = [char_to_ind[s] for s in start_seed]
    input_ = tf.expand_dims(input_, 0) #shape (1, len(start_seed))
    text_generated = []
    temperature = temp
    model_.reset_states()
    for i in range(num_generate):
        predictions = model_(input_)
        predictions = tf.squeeze(predictions, 0)
        #lower temperature makes the sampling more conservative
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        #feed the sampled character back in as the next input
        input_ = tf.expand_dims([predicted_id], 0)
        text_generated.append(ind_to_char[predicted_id]) #predicting and joining
    return (start_seed + "".join(text_generated))
#generate text from a seed string
#note that this output is not part of the dataset
#but completely auto generated
auto_text = generate_text(model, 'machine learning', gen_size=1000)
print(auto_text)
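#Added illustrative usage (my example, not from the original notebook):
#the temp argument controls sampling randomness; lower values stay closer
#to the most likely next character, higher values produce more varied text.
print(generate_text(model, 'machine learning is ', gen_size=300, temp=0.5))
print(generate_text(model, 'machine learning is ', gen_size=300, temp=1.5))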
#download the saved model
#model.save('shakespeare.h5')
#from IPython.display import FileLink
#FileLink(r'shakespeare.h5')
def customize_loss(y_true, y_pred):
    return sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
#alternative, deeper model builder (stacked LSTM + GRU layers);
#defined here but not used for the generation calls below
def create_deep_model(Input_dimen, out_dimen, batch_size):
    model = Sequential()
    model.add(Embedding(input_dim=Input_dimen, output_dim=out_dimen,
                        batch_input_shape=[batch_size, None]))
    model.add(LSTM(1024, return_sequences=True, stateful=True,
                   recurrent_initializer='glorot_uniform'))
    model.add(GRU(500, return_sequences=True,
                  stateful=True, recurrent_initializer='glorot_uniform'))
    model.add(GRU(300, return_sequences=True,
                  stateful=True, recurrent_initializer='glorot_uniform'))
    model.add(TimeDistributed(Dense(Input_dimen)))
    model.compile(optimizer='adam', loss=customize_loss)
    return model
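#Added illustrative note (my sketch, not from the original notebook): the deeper
#builder above is never trained in this script, so the generation calls below
#still use the single-GRU model trained earlier. To try the deeper architecture,
#you would build it with the vocabulary size and train it the same way, e.g.:
#deep_model = create_deep_model(vocab_size, 64, batch_size)
#deep_model.fit(dataset, epochs=30)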
auto_text = generate_text(model, 'machine learning', gen_size = 100)
print(auto_text)
print(generate_text(model, 'Son', gen_size=1500))