Commit

dataset
MrPicklePinosaur committed Jun 5, 2022
1 parent 5869e92 commit 2fcecc3
Showing 3 changed files with 62 additions and 8 deletions.
12 changes: 12 additions & 0 deletions dataset.py
@@ -0,0 +1,12 @@

# Simple map-style dataset pairing bag-of-words vectors (x) with intent tags (y).
class IntentDataset:

    def __init__(self, x, y):
        self.x_data = x
        self.y_data = y

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
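
A minimal usage sketch (not part of the commit), assuming bag-of-words feature vectors and string tags like the ones built in main.py; the toy x and y values here are hypothetical:

import numpy as np
from torch.utils.data import DataLoader

from dataset import IntentDataset

# hypothetical toy data: three 4-dimensional feature vectors and their intent tags
x = np.eye(3, 4, dtype=np.float32)
y = np.array(["greeting", "farewell", "thanks"])

loader = DataLoader(IntentDataset(x, y), batch_size=2, shuffle=True)
for batch_x, batch_y in loader:
    print(batch_x.shape, batch_y)  # e.g. torch.Size([2, 4]) and a batch of tag strings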
47 changes: 41 additions & 6 deletions main.py
@@ -1,17 +1,52 @@

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# TODO this is prob not needed lol
from pipeop import pipes

import preprocess
from dataset import IntentDataset

test_string = "very simple python chatbot to suck less at nlp"
test_data = [
("advice", "In my younger and more vulnerable years my father gave me some advice that I've been turning over in my mind ever since."),
("critism", "Whenever you feel like criticizing any one, he told me, just remember that all the people in this world haven't had the advantages that you've had."),
("communication", "He didn't say any more but we've always been unusually communicative in a reserved way, and I understood that he meant a great deal more than that.")
]

@pipes
def run():
    word_dict = []
    xy = []

    # load data
    for (tag, test_string) in test_data:
        tokenized = test_string >> preprocess.tokenize
        xy.append((tokenized, tag))

        word_dict.extend(
            tokenized >> preprocess.filter_stopwords >> preprocess.stem
        )

    word_dict = word_dict >> set >> sorted
    print(word_dict)
    print(xy)

    # build training data
    x_data = np.array([
        preprocess.bag_words(tokenized, word_dict) for (tokenized, tag) in xy
    ])
    y_data = np.array([tag for (tokenized, tag) in xy])
    dataset = IntentDataset(x_data, y_data)

    batch_size = 8
    num_workers = 2
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )

run()
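
Since pipeop may be dropped later (per the TODO above), here is a rough sketch of what the `>>` pipeline desugars to with plain function calls; under @pipes, `x >> f` becomes `f(x)`. The sentence used below is just the test_string from this file, for illustration:

import preprocess

tokens = preprocess.tokenize("very simple python chatbot to suck less at nlp")

# "tokens >> preprocess.filter_stopwords >> preprocess.stem" is equivalent to:
stemmed = preprocess.stem(preprocess.filter_stopwords(tokens))

# "word_dict >> set >> sorted" is equivalent to:
word_dict = sorted(set(stemmed))
print(word_dict)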
11 changes: 9 additions & 2 deletions preprocess.py
@@ -1,4 +1,5 @@

import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
@@ -19,5 +20,11 @@ def stem(tokenized):
    return [stemmer.stem(token) for token in tokenized]


def bag_words(tokenized, word_dict):
    # one-hot bag of words: mark each vocabulary entry that appears in the tokenized sentence
    bag = np.zeros(len(word_dict), dtype=np.float32)
    for i, word in enumerate(word_dict):
        if word in tokenized:
            bag[i] = 1.0

    return bag
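
As a quick sanity check, a small worked example of the bag-of-words encoding; the vocabulary and tokenized sentence below are hypothetical and not taken from the repo's data:

import numpy as np

from preprocess import bag_words

# hypothetical inputs, for illustration only
word_dict = ["chatbot", "hello", "python", "simple"]  # sorted vocabulary
tokenized = ["hello", "python"]                       # one preprocessed sentence

print(bag_words(tokenized, word_dict))
# expected: [0. 1. 1. 0.]  ("hello" and "python" appear, "chatbot" and "simple" do not)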
