preprocess.py
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords  # only needed if stopword removal is enabled below

# Stopword removal is disabled by default; swap in the commented set to enable it.
STOPWORDS = set()  # set(stopwords.words('english'))
def clean_text(text):
    """Lowercase, strip non-letter characters, and drop stopwords."""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = ' '.join([word for word in text.split() if word not in STOPWORDS])
    return text
def tokenize(text, word_to_idx):
    """Map each word in the text to its integer index in the vocabulary."""
    tokens = []
    for word in text.split():
        tokens.append(word_to_idx[word])
    return tokens
def pad_and_truncate(messages, max_length=30):
    """Left-pad token sequences with zeros and truncate them to max_length."""
    features = np.zeros((len(messages), max_length), dtype=int)
    for i, sms in enumerate(messages):
        if len(sms):
            # Shorter sequences fill the right end of the row (zeros on the left);
            # longer ones keep only their first max_length tokens.
            features[i, -len(sms):] = sms[:max_length]
    return features
if __name__ == '__main__':
    # Load the tab-separated SMS Spam Collection: one (label, message) pair per line.
    data = pd.read_csv('./data/SMSSpamCollection.txt', sep='\t', header=None, names=['label', 'sms'])
    data.sms = data.sms.apply(clean_text)

    # Build the vocabulary, starting indices at 1 so that 0 is reserved for padding.
    words = set((' '.join(data.sms)).split())
    word_to_idx = {word: i for i, word in enumerate(words, 1)}

    # Convert each message to a fixed-length sequence of word indices.
    tokens = data.sms.apply(lambda x: tokenize(x, word_to_idx))
    inputs = pad_and_truncate(tokens)

    # Binary labels: 1 for spam, 0 for ham.
    labels = np.array((data.label == 'spam').astype(int))
    np.save('./data/labels.npy', labels)
    np.save('./data/inputs.npy', inputs)
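
A minimal sketch of how the saved arrays could be consumed downstream, assuming a PyTorch training script (the consumer is not part of this file, so the batch size and tensor dtypes here are illustrative choices only):

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader

# Load the arrays written by preprocess.py (paths assumed unchanged).
inputs = np.load('./data/inputs.npy')   # shape: (num_messages, 30)
labels = np.load('./data/labels.npy')   # shape: (num_messages,)

# Wrap them so batches of (token indices, label) can be drawn for training.
dataset = TensorDataset(torch.from_numpy(inputs).long(),
                        torch.from_numpy(labels).float())
loader = DataLoader(dataset, batch_size=64, shuffle=True)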