create_dataset.py
import os
import re
import random
import pickle

import numpy as np

import one_hot_encode_word

# Folder holding one word list per language, e.g. media/English.txt,
# with one word per line.
DATA_FOLDER_DIR = "media/"
LANGUAGES = ["English", "Danish", "German", "French", "Italian", "Spanish"]
def create_data(folder):
    """Read each language's word list and return one-hot encoded words with labels."""
    lines = []
    encoder = one_hot_encode_word.OneHotEncode()
    for index_of_language, language in enumerate(LANGUAGES):
        full_path = os.path.join(folder, language + ".txt")
        with open(full_path, "rb") as infile:
            for line in infile:
                line = line.decode("unicode_escape")
                # Remove whitespace, dots and slashes from the raw line.
                line = re.sub(r'\s+|\.|\/', '', line)
                # Only keep words of at most 10 characters.
                if len(line) <= 10:
                    line = line.lower()
                    line_one_hot = encoder.encode(line)
                    lines.append([line_one_hot, index_of_language])
    # Shuffle so samples are not grouped by language.
    random.shuffle(lines)
    words = []
    labels = []
    for word, label in lines:
        words.append(word)
        labels.append(label)
    words = np.array(words)
    labels = np.array(labels)
    return words, labels
# Build the dataset and serialise it for later use.
X, y = create_data(DATA_FOLDER_DIR)

with open("X.pickle", "wb") as pickle_out:
    pickle.dump(X, pickle_out)

with open("y.pickle", "wb") as pickle_out:
    pickle.dump(y, pickle_out)
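
# Sketch: how a downstream script might reload the pickled arrays.
# Only the file names written above are assumed; X holds the one-hot
# encoded words and y the corresponding indices into LANGUAGES.
#
#     import pickle
#     with open("X.pickle", "rb") as f:
#         X = pickle.load(f)
#     with open("y.pickle", "rb") as f:
#         y = pickle.load(f)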