-
Notifications
You must be signed in to change notification settings - Fork 0
/
playground.py
62 lines (49 loc) · 1.48 KB
/
playground.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# %%
from baseDataLoader import processTxtAsCsv, customDataSet
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
import pandas as pd
from matplotlib import pyplot as plt
# %%
# Paths of the train/test CSV files; both are handed to processTxtAsCsv and
# then to customDataSet further down in this script.
trainPath = "data/train.csv"
testPath = "data/test.csv"
# NOTE(review): batchSize is not referenced by any code visible in this file;
# presumably intended for a DataLoader -- confirm before relying on it.
batchSize = 4
def trainValDataset(dataset, val_split=0.2, random_state=None):
    """Split a dataset into random training and validation subsets.

    Args:
        dataset: Object supporting ``len()``; it is wrapped (not copied) by
            ``torch.utils.data.Subset``.
        val_split: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            same split can be reproduced across runs. The default ``None``
            keeps the previous behaviour (a fresh shuffle on every call).

    Returns:
        A dict with the keys ``'train'`` and ``'val'``, each mapping to a
        ``Subset`` of ``dataset`` restricted to the sampled indices.
    """
    # Split the index range rather than the data itself, so the subsets
    # remain lightweight views over the original dataset.
    train_idx, val_idx = train_test_split(
        list(range(len(dataset))),
        test_size=val_split,
        random_state=random_state,
    )
    return {
        'train': Subset(dataset, train_idx),
        'val': Subset(dataset, val_idx),
    }
# %%
# NOTE(review): processTxtAsCsv presumably writes the CSV files at these two
# paths before they are loaded below -- confirm against baseDataLoader.
processTxtAsCsv(trainPath, testPath)
trainSet = customDataSet(trainPath)
testSet = customDataSet(testPath)

# Length of the first element of every training item (plotted below as the
# token length). The sample count comes from the dataset itself instead of a
# hard-coded 25000, so smaller or larger datasets are handled correctly.
trainLengths = [len(trainSet[idx][0]) for idx in range(len(trainSet))]
plt.hist(trainLengths)
plt.title("Train Set Token Lengths")
plt.show()
# Report NaN rather than raising ZeroDivisionError on an empty dataset.
trainMean = sum(trainLengths) / len(trainLengths) if trainLengths else float("nan")
print("Mean: ", trainMean)
# %%
# Length of the first element of every *test* item. This cell previously
# indexed trainSet, so the "Test Set" histogram and mean were really a repeat
# of the training statistics. The sample count now comes from the dataset
# itself instead of a hard-coded 25000.
testLengths = [len(testSet[idx][0]) for idx in range(len(testSet))]
plt.hist(testLengths)
plt.title("Test Set Token Lengths")
plt.show()
# Report NaN rather than raising ZeroDivisionError on an empty dataset.
testMean = sum(testLengths) / len(testLengths) if testLengths else float("nan")
print("Mean: ", testMean)
# Let's cut the token lengths at 110 because that's the mean
# NOTE(review): the 110 figure was observed while this cell was still reading
# trainSet; re-check it against the printed test-set mean.
# %%
from collections import Counter
# Flatten the first element of every training item into one long list, then
# tally how often each entry occurs.
allWords = [
    token
    for sampleIdx in range(len(trainSet))
    for token in trainSet[sampleIdx][0]
]
print("total word count: ", len(allWords))

countWords = Counter(allWords)
# Number of distinct entries is the number of keys in the Counter.
allWordCount = len(countWords)
print("distinct word count: ", allWordCount)
print(countWords)
# %%