-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
34 lines (28 loc) · 843 Bytes
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import pandas as pd
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence
import numpy as np
# Largest word index (inclusive) that encode_review keeps.
top_words = 5000
# Passed as ``maxlen`` to pad_sequences in encode_batch.
review_len = 500

# Word -> integer index table supplied by the Keras IMDB dataset helper.
word_dict = imdb.get_word_index()
# Reverse table (integer index -> word), used by decode_word.
inv_map = {index: word for word, index in word_dict.items()}
def encode_review(text):
    """Turn a review string into a list of integer word indices.

    The text is tokenized with ``text_to_word_sequence`` (lower-cased,
    split on single spaces). Each token is mapped through ``encode_word``
    and its index is kept only when it is not ``None`` and does not exceed
    ``top_words``.

    Args:
        text: The raw review text.

    Returns:
        A list of the indices that passed the filter, in token order.
    """
    tokens = text_to_word_sequence(text, lower=True, split=" ")
    # Lazy generator: each token is looked up right before it is filtered,
    # exactly one lookup per token.
    indices = (encode_word(token) for token in tokens)
    # NOTE(review): encode_word as written returns 0 (never None) for
    # unknown tokens, so they pass this filter and appear as 0 in the
    # result. The bound is also inclusive (index == top_words is kept).
    # Confirm both match what the downstream model expects.
    return [idx for idx in indices if idx is not None and idx <= top_words]
def encode_word(word):
    """Look up the integer index of ``word`` in ``word_dict``.

    Args:
        word: The token to look up.

    Returns:
        The index stored in ``word_dict`` for ``word``, or ``0`` when the
        word is not present.
    """
    return word_dict.get(word, 0)
def decode_word(ind):
    """Look up the word stored for index ``ind`` in ``inv_map``.

    Args:
        ind: The integer index to translate back into a word.

    Returns:
        The matching word, or ``None`` when ``ind`` is not in ``inv_map``.
    """
    return inv_map.get(ind)
def encode_batch(arr):
    """Encode one review and wrap it as a single-sequence padded batch.

    Despite its name, ``arr`` is passed straight to ``encode_review``, so
    it is the review text itself rather than a collection of reviews.

    Args:
        arr: The raw review text.

    Returns:
        The result of ``sequence.pad_sequences`` applied to a one-element
        list holding the encoded review, with ``maxlen=review_len``.
    """
    encoded = encode_review(arr)
    batch = [encoded]  # pad_sequences takes a list of sequences
    return sequence.pad_sequences(batch, maxlen=review_len)