-
Notifications
You must be signed in to change notification settings - Fork 8
/
text_util.py
69 lines (55 loc) · 2.18 KB
/
text_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
'''
This file is modified from https://raw.githubusercontent.com/spro/practical-pytorch/master/char-rnn-classification/data.py
'''
import glob
import os
import string
import unicodedata

import torch
# Alphabet used for one-hot encoding: every character in string.printable
# (digits, ASCII letters, punctuation and whitespace).
all_letters = string.printable
# Size of the alphabet; used below as the width of each one-hot letter vector.
n_letters = len(all_letters)
def findFiles(path):
    """Return the list of paths that match the glob pattern *path*."""
    matches = glob.glob(path)
    return matches
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip accents from *s* and drop characters outside ``all_letters``.

    The string is NFD-normalized so that accents become separate combining
    marks (Unicode category ``Mn``); those marks are discarded, and so is
    every remaining character that is not part of ``all_letters``.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Combining mark left over from the decomposition: drop it.
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)
# Read a file and split into lines
def readLines(filename, encoding=None):
    """Read *filename* and return its lines converted with ``unicodeToAscii``.

    Leading and trailing whitespace of the whole file is stripped before it
    is split on ``'\\n'``, so an empty file yields a single empty string.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding passed to ``open``; ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list with one ASCII-folded string per line of the file.
    """
    # The context manager guarantees the handle is closed even if read() fails.
    with open(filename, encoding=encoding) as handle:
        content = handle.read()
    return [unicodeToAscii(line) for line in content.strip().split('\n')]
# Build the category_lines dictionary, a list of lines per category.
# A category is named after its data file: the base name up to the first dot.
category_lines = {}
all_categories = []
for filename in findFiles('../data/names/*.txt'):
    # os.path.basename honours the platform separator; on Windows glob puts a
    # backslash before the file name, which a plain split('/') would not remove.
    category = os.path.basename(filename).split('.')[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines
# Number of categories found on disk (0 when the data directory is missing).
n_categories = len(all_categories)
# Find letter index from all_letters, e.g. "0" = 0 and "a" = 10
def letterToIndex(letter):
    """Return the position of *letter* in ``all_letters``, or -1 if absent."""
    position = all_letters.find(letter)
    return position
# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    """Encode *line* as a ``(len(line), 1, n_letters)`` one-hot tensor.

    Args:
        line: String whose characters are encoded one per row.

    Returns:
        A zero-initialized tensor with a single 1 per row at the letter's
        index. Rows for characters missing from the alphabet stay all-zero.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        index = letterToIndex(letter)
        # letterToIndex returns -1 for unknown letters; used as an index that
        # would wrap around and wrongly mark the last letter of the alphabet.
        if index >= 0:
            tensor[li][0][index] = 1
    return tensor
# Turn multiple lines into a <batch_size x line_length x n_letters>
def linesToTensor(lines):
    """Encode a batch of strings as one left-padded one-hot tensor.

    Args:
        lines: Sized sequence of strings; may be empty.

    Returns:
        A tensor of shape ``(len(lines), longest_line, n_letters)``. Shorter
        lines are right-aligned, so their leading rows are all-zero padding.
        Rows for characters missing from the alphabet also stay all-zero.
        An empty batch yields a tensor of shape ``(0, 0, n_letters)``.
    """
    # default=0 keeps an empty batch from raising ValueError inside max().
    line_length = max((len(line) for line in lines), default=0)
    tensor = torch.zeros(len(lines), line_length, n_letters)
    for b, line in enumerate(lines):
        # Right-align: the last letter of every line lands on the last row.
        offset = line_length - len(line)
        for li, letter in enumerate(line):
            index = letterToIndex(letter)
            # -1 means "unknown letter"; as an index it would wrap around to
            # the last column, so such positions are left as zeros instead.
            if index >= 0:
                tensor[b][offset + li][index] = 1
    return tensor
def word_linesToTensor(lines, corpus):
    """Encode a batch of whitespace-separated sentences as one-hot word tensors.

    Args:
        lines: Sized sequence of strings; each is split on whitespace.
        corpus: Vocabulary object supporting ``len(corpus)`` and
            ``corpus.index_word(word)``, which must return the word's column.

    Returns:
        A tensor of shape ``(len(lines), longest_sentence, len(corpus))``.
        Shorter sentences are right-aligned, so their leading rows are
        all-zero padding. An empty batch yields shape ``(0, 0, len(corpus))``.
    """
    # Split every line once instead of re-splitting it for each word.
    tokenized = [line.split() for line in lines]
    # default=0 keeps an empty batch from raising ValueError inside max().
    line_length = max((len(words) for words in tokenized), default=0)
    tensor = torch.zeros(len(lines), line_length, len(corpus))
    for b, words in enumerate(tokenized):
        # Right-align: the last word of every sentence lands on the last row.
        offset = line_length - len(words)
        for li, word in enumerate(words):
            tensor[b][offset + li][corpus.index_word(word)] = 1
    return tensor