Commit

dataset
MrPicklePinosaur committed Jun 5, 2022
1 parent 5869e92 commit 2fcecc3
Showing 3 changed files with 62 additions and 8 deletions.
12 changes: 12 additions & 0 deletions dataset.py
@@ -0,0 +1,12 @@

# Simple map-style dataset pairing bag-of-words vectors (x) with intent tags (y).
class IntentDataset:

    def __init__(self, x, y):
        self.x_data = x
        self.y_data = y

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
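
A minimal usage sketch (not part of the commit), assuming bag-of-words feature vectors and string tags like the ones built in main.py; the toy x and y values here are hypothetical:

import numpy as np
from torch.utils.data import DataLoader

from dataset import IntentDataset

# hypothetical toy data: three 4-dimensional feature vectors and their intent tags
x = np.eye(3, 4, dtype=np.float32)
y = np.array(["greeting", "farewell", "thanks"])

loader = DataLoader(IntentDataset(x, y), batch_size=2, shuffle=True)
for batch_x, batch_y in loader:
    print(batch_x.shape, batch_y)  # e.g. torch.Size([2, 4]) and a batch of tag strings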
47 changes: 41 additions & 6 deletions main.py
@@ -1,17 +1,52 @@

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# TODO this is prob not needed lol
from pipeop import pipes

import preprocess
from dataset import IntentDataset

test_string = "very simple python chatbot to suck less at nlp"
test_data = [
("advice", "In my younger and more vulnerable years my father gave me some advice that I've been turning over in my mind ever since."),
("critism", "Whenever you feel like criticizing any one, he told me, just remember that all the people in this world haven't had the advantages that you've had."),
("communication", "He didn't say any more but we've always been unusually communicative in a reserved way, and I understood that he meant a great deal more than that.")
]

@pipes
def run():
    word_dict = []
    xy = []

    # load data
    for (tag, test_string) in test_data:
        tokenized = test_string >> preprocess.tokenize
        xy.append((tokenized, tag))

        word_dict.extend(
            tokenized >> preprocess.filter_stopwords >> preprocess.stem
        )

    word_dict = word_dict >> set >> sorted
    print(word_dict)
    print(xy)

    # build training data
    x_data = np.array([
        preprocess.bag_words(tokenized, word_dict) for (tokenized, tag) in xy
    ])
    y_data = np.array([tag for (tokenized, tag) in xy])
    dataset = IntentDataset(x_data, y_data)

    batch_size = 8
    num_workers = 2
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )

run()
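
Since pipeop may be dropped later (per the TODO above), here is a rough sketch of what the `>>` pipeline desugars to with plain function calls; under @pipes, `x >> f` becomes `f(x)`. The sentence used below is just the test_string from this file, for illustration:

import preprocess

tokens = preprocess.tokenize("very simple python chatbot to suck less at nlp")

# "tokens >> preprocess.filter_stopwords >> preprocess.stem" is equivalent to:
stemmed = preprocess.stem(preprocess.filter_stopwords(tokens))

# "word_dict >> set >> sorted" is equivalent to:
word_dict = sorted(set(stemmed))
print(word_dict)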
11 changes: 9 additions & 2 deletions preprocess.py
@@ -1,4 +1,5 @@

import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
@@ -19,5 +20,11 @@ def stem(tokenized):
    return [stemmer.stem(token) for token in tokenized]


def bag_words(tokenized, word_dict):
    # one-hot bag of words: mark each vocabulary entry that appears in the tokenized sentence
    bag = np.zeros(len(word_dict), dtype=np.float32)
    for i, word in enumerate(word_dict):
        if word in tokenized:
            bag[i] = 1.0

    return bag
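
As a quick sanity check, a small worked example of the bag-of-words encoding; the vocabulary and tokenized sentence below are hypothetical and not taken from the repo's data:

import numpy as np

from preprocess import bag_words

# hypothetical inputs, for illustration only
word_dict = ["chatbot", "hello", "python", "simple"]  # sorted vocabulary
tokenized = ["hello", "python"]                       # one preprocessed sentence

print(bag_words(tokenized, word_dict))
# expected: [0. 1. 1. 0.]  ("hello" and "python" appear, "chatbot" and "simple" do not)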
