
Commit

Renamed zzzmain.py to main.py, moved all chatbot code to its own dir
jopokemine committed Feb 21, 2021
1 parent 6431e75 commit 2f2f12a
Showing 9 changed files with 435 additions and 264 deletions.
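The structural change is that bot.py moves into a new chatbot/ directory next to an empty chatbot/__init__.py (listed below), which makes chatbot an importable package. A minimal sketch of what that enables, assuming the renamed main.py (whose contents are not shown in this view) imports the moved module:

# Hypothetical usage, not taken from this diff: main.py's real contents are not shown here.
from chatbot import bot  # resolvable because chatbot/__init__.py marks the directory as a package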
Empty file added chatbot/__init__.py
Empty file.
270 changes: 133 additions & 137 deletions bot.py → chatbot/bot.py
@@ -18,26 +18,22 @@
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join("data", corpus_name)
# corpus_name = "cornell movie-dialogs corpus"


def printLines(file, n=10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)
# def printLines(file, n=10):
# with open(file, 'rb') as datafile:
# lines = datafile.readlines()
# for line in lines[:n]:
# print(line)


printLines(os.path.join(corpus, "movie_lines.txt"))
# # Define path to new file
# datafile = os.path.join("data", "formatted_lines_combined.txt")


# Define path to new file
datafile = os.path.join("data", "formatted_lines_qa.txt")

# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)
# # Print a sample of lines
# print("\nSample lines from file:")
# printLines(datafile)

# Default word tokens
PAD_token = 0 # Used for padding short sentences
@@ -46,8 +42,7 @@ def printLines(file, n=10):


class Voc:
    def __init__(self, name):
        self.name = name
    def __init__(self):
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
@@ -115,14 +110,14 @@ def normalizeString(s):


# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
def readVocs(datafile):
print("Reading lines...")
# Read the file and split into lines
lines = open(datafile, encoding='utf-8').\
read().strip().split('\n')
# Split every line into pairs and normalize
pairs = [[normalizeString(s) for s in line.split('\t')] for line in lines]
voc = Voc(corpus_name)
voc = Voc()
return voc, pairs


Expand All @@ -145,9 +140,9 @@ def filterPairs(pairs):


# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
def loadPrepareData(datafile, save_dir):
print("Start preparing training data ...")
voc, pairs = readVocs(datafile, corpus_name)
voc, pairs = readVocs(datafile)
print("Read {!s} sentence pairs".format(len(pairs)))
pairs = filterPairs(pairs)
print("Trimmed to {!s} sentence pairs".format(len(pairs)))
@@ -160,12 +155,12 @@ def loadPrepareData(corpus, corpus_name, datafile, save_dir):


# Load/Assemble voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)
# save_dir = os.path.join("data", "save")
# voc, pairs = loadPrepareData(corpus_name, datafile, save_dir)
# # Print some pairs to validate
# print("\npairs:")
# for pair in pairs[:10]:
# print(pair)

MIN_COUNT = 3 # Minimum word count threshold for trimming

@@ -200,7 +195,7 @@ def trimRareWords(voc, pairs, MIN_COUNT):


# Trim voc and pairs
pairs = trimRareWords(voc, pairs, MIN_COUNT)
# pairs = trimRareWords(voc, pairs, MIN_COUNT)


def indexesFromSentence(voc, sentence):
@@ -238,7 +233,7 @@ def outputVar(line, voc):
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.BoolTensor(mask)
    mask = torch.ByteTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

@@ -256,15 +251,15 @@ def batch2TrainData(voc, pair_batch):


# Example for validation
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches
# small_batch_size = 5
# batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
# input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)
# print("input_variable:", input_variable)
# print("lengths:", lengths)
# print("target_variable:", target_variable)
# print("mask:", mask)
# print("max_target_len:", max_target_len)


class EncoderRNN(nn.Module):
@@ -460,7 +455,7 @@ def train(input_variable, lengths, target_variable, mask, max_target_len, encode
    return sum(print_losses) / n_totals


def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, dataset, loadFilename):

    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
@@ -470,8 +465,8 @@
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1
    # if loadFilename:
    #     start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
@@ -493,7 +488,7 @@ def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, deco

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            directory = os.path.join(save_dir, model_name, dataset, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
@@ -518,7 +513,7 @@ def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
@@ -577,112 +572,113 @@ def evaluateInput(encoder, decoder, searcher, voc):
        except KeyError:
            print("Error: Encountered unknown word.")


##########################################################
# RUN MODE ###############################################
##########################################################


# Configure models
model_name = 'cb_model'
attn_model = 'dot'
# attn_model = 'general'
# attn_model = 'concat'
# model_name = 'cb_model'
# attn_model = 'dot'
# # attn_model = 'general'
# # attn_model = 'concat'
hidden_size = 1000
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 1000
# loadFilename = os.path.join(save_dir, model_name, corpus_name,
# '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
# '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    # checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')
# encoder_n_layers = 2
# decoder_n_layers = 2
# dropout = 0.1
# batch_size = 64

# # Set checkpoint to load from; set to None if starting from scratch
# loadFilename = None
# checkpoint_iter = 1000
# # loadFilename = os.path.join(save_dir, model_name, corpus_name,
# # '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
# # '{}_checkpoint.tar'.format(checkpoint_iter))


# # Load model if a loadFilename is provided
# if loadFilename:
# # If loading on same machine the model was trained on
# checkpoint = torch.load(loadFilename)
# # If loading a model trained on GPU to CPU
# # checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
# encoder_sd = checkpoint['en']
# decoder_sd = checkpoint['de']
# encoder_optimizer_sd = checkpoint['en_opt']
# decoder_optimizer_sd = checkpoint['de_opt']
# embedding_sd = checkpoint['embedding']
# voc.__dict__ = checkpoint['voc_dict']


# print('Building encoder and decoder ...')
# # Initialize word embeddings
# embedding = nn.Embedding(voc.num_words, hidden_size)
# if loadFilename:
# embedding.load_state_dict(embedding_sd)
# # Initialize encoder & decoder models
# encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
# decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
# if loadFilename:
# encoder.load_state_dict(encoder_sd)
# decoder.load_state_dict(decoder_sd)
# # Use appropriate device
# encoder = encoder.to(device)
# decoder = decoder.to(device)
# print('Models built and ready to go!')

###################################################
# RUN TRAINING ####################################
###################################################

# Configure training/optimization
clip = 50.0
# clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 1000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# If you have cuda, configure cuda to call
# for state in encoder_optimizer.state.values():
# for k, v in state.items():
# if isinstance(v, torch.Tensor):
# state[k] = v.cuda()

# for state in decoder_optimizer.state.values():
# for k, v in state.items():
# if isinstance(v, torch.Tensor):
# state[k] = v.cuda()

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
print_every, save_every, clip, corpus_name, loadFilename)

############################################
# RUN EVALUATION ###########################
############################################

# Set dropout layers to eval mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting (uncomment and run the following line to begin)
evaluateInput(encoder, decoder, searcher, voc)
# learning_rate = 0.0001
# decoder_learning_ratio = 5.0
# n_iteration = 1000
# print_every = 1
# save_every = 500

# # Ensure dropout layers are in train mode
# encoder.train()
# decoder.train()

# # Initialize optimizers
# print('Building optimizers ...')
# encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
# decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
# if loadFilename:
# encoder_optimizer.load_state_dict(encoder_optimizer_sd)
# decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# # If you have cuda, configure cuda to call
# # for state in encoder_optimizer.state.values():
# # for k, v in state.items():
# # if isinstance(v, torch.Tensor):
# # state[k] = v.cuda()

# # for state in decoder_optimizer.state.values():
# # for k, v in state.items():
# # if isinstance(v, torch.Tensor):
# # state[k] = v.cuda()

# # Run training iterations
# print("Starting Training!")
# trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
# embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
# print_every, save_every, clip, corpus_name, loadFilename)

# ############################################
# # RUN EVALUATION ###########################
# ############################################

# # Set dropout layers to eval mode
# encoder.eval()
# decoder.eval()

# # Initialize search module
# searcher = GreedySearchDecoder(encoder, decoder)

# # Begin chatting (uncomment and run the following line to begin)
# evaluateInput(encoder, decoder, searcher, voc)
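
For context, the blocks commented out above (RUN MODE, RUN TRAINING, RUN EVALUATION) are exactly the driver code a top-level script would now own. Below is a minimal sketch of such an entry point, assuming the renamed main.py calls into chatbot/bot.py along these lines; the dataset path and every other concrete value are illustrative, not taken from the repository.

# Hypothetical main.py sketch; the real file is not shown in this commit view.
import os
from chatbot.bot import loadPrepareData, trimRareWords, MIN_COUNT

datafile = os.path.join("data", "formatted_lines_qa.txt")  # assumed dataset path
save_dir = os.path.join("data", "save")

# Build the vocabulary and training pairs using the new signatures from this commit
voc, pairs = loadPrepareData(datafile, save_dir)
pairs = trimRareWords(voc, pairs, MIN_COUNT)
# From here, construct the encoder/decoder and call trainIters(...) or evaluateInput(...)
# in the same way the commented-out sections above did.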