preprocessing.py (forked from scotthlee/enriched-LSTMs)
import pandas as pd
import numpy as np
import argparse
import h5py
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import load_npz, save_npz, hstack, csr_matrix
import tools.generic as tg
import tools.text as tt
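
# This script turns the raw records into two sets of model inputs: padded
# integer word sequences for the free text (word_sents.hdf5) and a sparse
# wide-format matrix for the remaining discrete columns (sparse_records.npz),
# along with their lookup files (word_dict.csv, sparse_vocab.csv) and the
# clipped records (records_clipped.csv).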


def parse_arguments(parser):
    parser.add_argument('--data_dir', type=str, default=None,
                        help='directory holding the raw data')
    parser.add_argument('--input_file', type=str, default=None,
                        help='file holding the original dataset')
    parser.add_argument('--file_type', type=str, default='csv',
                        choices=['csv', 'tsv'],
                        help='format for the input file')
    parser.add_argument('--encoding', type=str, default='latin1',
                        help='encoding used in the input file')
    parser.add_argument('--text_column', type=str, default='text',
                        help='column holding the free text')
    parser.add_argument('--target_column', type=str, default='code',
                        help='column to use as the target for classification')
    # store_true flags avoid the argparse type=bool pitfall, where any
    # non-empty string (including 'False') parses as True
    parser.add_argument('--clean_text', action='store_true',
                        help='whether to clean the free text')
    parser.add_argument('--convert_numerals', action='store_true',
                        help='whether to convert numerals to words')
    parser.add_argument('--min_df', type=int, default=5,
                        help='frequency cutoff below which tokens are replaced with rareword')
    parser.add_argument('--max_length', type=int, default=100,
                        help='number of tokens at which to truncate each document')
    args = parser.parse_args()
    return args
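
# Example invocation (directory and file names are illustrative):
#   python preprocessing.py --data_dir data/ --input_file records.csv \
#       --text_column text --target_column code --clean_text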


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    args = parse_arguments(parser)

    '''
    Part 1: Reading in the data and processing the text
    '''
    # Importing the data
    if args.file_type == 'csv':
        records = pd.read_csv(args.data_dir + args.input_file,
                              encoding=args.encoding)
    elif args.file_type == 'tsv':
        records = pd.read_csv(args.data_dir + args.input_file,
                              encoding=args.encoding,
                              sep='\t')

    # Optionally cleaning the free text before vectorization
    if args.clean_text:
        text = tt.clean_column(records[args.text_column].astype(str),
                               remove_empty=False,
                               numerals=args.convert_numerals)
    else:
        text = [doc for doc in records[args.text_column].astype(str)]

    # First-pass vectorization to get the overall vocab
    text_vec = CountVectorizer(binary=False,
                               ngram_range=(1, 1),
                               token_pattern=r'(?u)\b\w+\b',
                               decode_error='ignore')
    text_vec.fit(text)
    vocab = text_vec.vocabulary_
    vocab_size = len(vocab)
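    # (text_vec.vocabulary_ maps each token to its column index in the
    # document-term matrix built below, which is how the rare columns are
    # identified)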

    # Replacing words with corpus counts below min_df with 'rareword'
    doctermat = text_vec.transform(text)
    word_sums = np.sum(doctermat, axis=0)
    lim_cols = np.where(word_sums < args.min_df)[1]
    where_lim = np.where(np.sum(doctermat[:, lim_cols], axis=1) > 0)[0]
    lim_col_set = set(lim_cols)
    for num in where_lim:
        doc = text[num]
        # Rebuilding the doc token by token so only whole rare tokens are
        # swapped out (a plain str.replace would also hit substrings)
        text[num] = ' '.join(['rareword' if vocab.get(word) in lim_col_set
                              else word for word in doc.split()])
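    # After this loop the rare tokens literally read 'rareword' in the text,
    # so the second vectorizer below collapses them into a single vocab entry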

    # Second-pass vectorization with the reduced vocabulary
    min_vec = CountVectorizer(binary=False,
                              analyzer='word',
                              ngram_range=(1, 1),
                              token_pattern=r'\b\w+\b',
                              decode_error='ignore')
    min_vec.fit(text)
    vocab = min_vec.vocabulary_
    vocab_size = len(vocab)

    # Adding 1 to each vocab index to allow for 0 masking
    for word in vocab:
        vocab[word] += 1
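    # Index 0 is now reserved for padding, so a padded sequence such as
    # [12, 7, 3, 0, 0] (values illustrative) ends in entries that map to no
    # real token and can be masked downstream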

    # Writing the vocabulary to disk
    vocab_df = pd.DataFrame.from_dict(vocab, orient='index')
    vocab_df['word'] = vocab_df.index
    vocab_df.columns = ['value', 'word']
    vocab_df.to_csv(args.data_dir + 'word_dict.csv', index=False)

    # Clipping the docs to a max of max_length tokens before converting
    # them to sequences of integers
    max_length = args.max_length
    clipped_docs = text.copy()
    for i, doc in enumerate(text):
        if len(doc.split()) > max_length:
            clipped_docs[i] = ' '.join(doc.split()[:max_length])

    # Weeding out docs containing tokens that aren't in the vocabulary
    # (e.g. because of the vectorizer's lowercasing or token pattern), so
    # the integer conversion below can't fail on an unknown token
    in_vocab = np.where([np.all([word in vocab
                                 for word in doc.split()])
                         for doc in clipped_docs])[0]
    good_docs = [clipped_docs[i] for i in in_vocab]
    good_recs = records.iloc[in_vocab, :]
    good_recs.to_csv(args.data_dir + 'records_clipped.csv', index=False)
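    # records_clipped.csv is row-aligned with the integer sequences written
    # below, so the two files can be matched up later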

    # Preparing the HDF5 file to hold the output
    output = h5py.File(args.data_dir + 'word_sents.hdf5', mode='w')

    # Converting the docs to padded integer sequences and saving them;
    # np.uint16 works for the dtype since the reduced vocab is much smaller
    # than before (and assumed to fit within the uint16 range)
    int_sents = np.array([tt.pad_integers(tt.to_integer(doc.split(), vocab),
                                          max_length, 0)
                          for doc in good_docs],
                         dtype=np.uint16)
    output['sents'] = int_sents
    output.close()
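    # The sequences can be read back later with standard h5py calls, e.g.:
    #   with h5py.File(args.data_dir + 'word_sents.hdf5', 'r') as f:
    #       int_sents = f['sents'][:]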

    '''
    Part 2: Converting the discrete variables to wide format
    '''
    # Dropping the free-text and target columns, keeping the discrete
    # variables from the clipped records
    slim_cols = list(records.columns.drop([args.text_column,
                                           args.target_column]))
    records = good_recs[slim_cols]

    # Making the sparse matrices
    sparse_out = [tg.sparsify(records[col].astype(str)) for col in slim_cols]
    sparse_csr = hstack([col['data'] for col in sparse_out], format='csr')
    sparse_vocab = [col['vocab'] for col in sparse_out]
    sparse_vocab = pd.Series([item for sublist in sparse_vocab
                              for item in sublist])

    # Writing the files to disk
    save_npz(args.data_dir + 'sparse_records', sparse_csr)
    sparse_vocab.to_csv(args.data_dir + 'sparse_vocab.csv',
                        index=False,
                        header=False)
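
    # The sparse features can be read back with the (already imported)
    # scipy.sparse.load_npz, e.g.:
    #   sparse_csr = load_npz(args.data_dir + 'sparse_records.npz')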