one_hot_vectorization.py
import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Build a word -> integer index, starting at 1 so that index 0 stays unused.
# Note that split() keeps punctuation, so 'mat.' and 'homework.' are tokens.
token_index = {}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1
print("Token Index: ", token_index)
# One-hot encode each sample: one row per word position (up to max_length),
# one column per token index.
max_length = 10
results = np.zeros(shape=(len(samples),
                          max_length,
                          max(token_index.values()) + 1))
print("results shape: ", results.shape)
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        print("word: ", word, " j: ", j, " index: ", index)
        results[i, j, index] = 1.
print("Vectorized Results: ", results)
print("Result vector shape: ", results.shape)
# Keras word-level one-hot encoding
from keras.preprocessing.text import Tokenizer

# Keep only the 1,000 most frequent words (far more than this toy corpus has).
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)
print("tokenizer: ", tokenizer)
seq = tokenizer.texts_to_sequences(samples)
print("seq: ", seq)
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
print("one_hot res: ", one_hot_results, " shape: ", one_hot_results.shape)
word_index = tokenizer.word_index
print("Word_index: ", word_index, " Found unique tokens: ", len(word_index))