diff --git a/dataset.py b/dataset.py
new file mode 100644
index 0000000..f3a0a15
--- /dev/null
+++ b/dataset.py
@@ -0,0 +1,14 @@
+from torch.utils.data import Dataset
+
+
+class IntentDataset(Dataset):
+
+    def __init__(self, x, y):
+        self.x_data = x
+        self.y_data = y
+
+    def __len__(self):
+        return len(self.x_data)
+
+    def __getitem__(self, index):
+        return self.x_data[index], self.y_data[index]
diff --git a/main.py b/main.py
index 39a9308..234941b 100644
--- a/main.py
+++ b/main.py
@@ -1,17 +1,50 @@
+import numpy as np
+from torch.utils.data import DataLoader
+
 # TODO this is prob not needed lol
 from pipeop import pipes
-
 import preprocess
+from dataset import IntentDataset
 
-test_string = "very simple python chatbot to suck less at nlp"
+test_data = [
+    ("advice", "In my younger and more vulnerable years my father gave me some advice that I've been turning over in my mind ever since."),
+    ("criticism", "Whenever you feel like criticizing any one, he told me, just remember that all the people in this world haven't had the advantages that you've had."),
+    ("communication", "He didn't say any more but we've always been unusually communicative in a reserved way, and I understood that he meant a great deal more than that.")
+]
 
 @pipes
 def run():
-    print(test_string
-        >> preprocess.tokenize
-        >> preprocess.filter_stopwords
-        >> preprocess.stem
+    word_dict = []
+    xy = []
+
+    # load data: tokenize each tagged sentence and grow the stemmed vocabulary
+    for (tag, test_string) in test_data:
+        tokenized = test_string >> preprocess.tokenize
+        xy.append((tokenized, tag))
+
+        word_dict.extend(
+            tokenized >> preprocess.filter_stopwords >> preprocess.stem
+        )
+
+    word_dict = word_dict >> set >> sorted
+    print(word_dict)
+    print(xy)
+
+    # build training data: one bag-of-words vector per sentence
+    x_data = np.array([
+        preprocess.bag_words(tokenized, word_dict) for (tokenized, tag) in xy
+    ])
+    y_data = np.array([tag for (tokenized, tag) in xy])
+    dataset = IntentDataset(x_data, y_data)
+
+    batch_size = 8
+    num_workers = 2
+    loader = DataLoader(
+        dataset=dataset,
+        batch_size=batch_size,
+        shuffle=True,
+        num_workers=num_workers
     )
 
 
 run()
diff --git a/preprocess.py b/preprocess.py
index 2c2a408..efbe79e 100644
--- a/preprocess.py
+++ b/preprocess.py
@@ -1,4 +1,5 @@
+import numpy as np
 from nltk.tokenize import sent_tokenize, word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import PorterStemmer
 
@@ -19,5 +20,14 @@
 def stem(tokenized):
     return [stemmer.stem(token) for token in tokenized]
 
-def bag_words():
-    raise NotImplementedError
+def bag_words(tokenized, word_dict):
+    # stem the incoming tokens so they can match the stemmed entries in word_dict
+    tokens = [stemmer.stem(token) for token in tokenized]
+    bag = np.zeros(len(word_dict), dtype=np.float32)
+    # mark each vocabulary word (by its position in word_dict) present in the sentence
+    for i, word in enumerate(word_dict):
+        if word in tokens:
+            bag[i] = 1.0
+
+    return bag
+
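
A few notes below the patch, each with a small sketch (none of it is part of the diff):

The `__get_item__` -> `__getitem__` rename in dataset.py is load-bearing: indexing (`ds[0]`) and DataLoader both go through the `__getitem__` dunder, so the misspelled name would raise a TypeError on first use. A minimal check with dummy arrays:

    import numpy as np
    from dataset import IntentDataset

    ds = IntentDataset(np.zeros((3, 5), dtype=np.float32), np.array([0, 1, 2]))
    print(len(ds))   # 3, via __len__
    print(ds[0])     # (array of 5 zeros, 0), via __getitem__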
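
bag_words stems its input before matching, since word_dict holds stemmed entries, and sets the bit at the word's position in word_dict, not at the token's position in the sentence. A quick check with a made-up vocabulary (the Porter stemmer reduces "advice" to "advic"):

    import preprocess

    word_dict = ["advic", "father", "gave", "mind", "turn"]  # pre-stemmed, sorted
    print(preprocess.bag_words(["my", "father", "gave", "advice"], word_dict))
    # -> [1. 1. 1. 0. 0.]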
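
One thing the diff leaves for later: y_data holds raw tag strings, which a CrossEntropyLoss-style objective won't accept. A rough sketch of the follow-up I'd expect (the tag_idx mapping is hypothetical, not in this diff):

    # hypothetical: encode tags as integer class labels before training
    tags = sorted({tag for (tag, _) in test_data})
    tag_idx = {tag: i for i, tag in enumerate(tags)}  # {"advice": 0, "communication": 1, "criticism": 2}
    y_data = np.array([tag_idx[tag] for (_, tag) in xy], dtype=np.int64)

    for x_batch, y_batch in loader:  # default collate yields float32 / int64 tensors
        print(x_batch.shape, y_batch)

Also note that iterating the loader with num_workers=2 from module level can break on spawn-based platforms (macOS/Windows), since worker processes re-import main.py and re-run run(); an `if __name__ == "__main__":` guard around the run() call would avoid that.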