# data_preprocessing.py
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer

# NOTE: the tokenizer and stemmer below are instantiated but the pipeline in
# this script only uses the lemmatizer; they are kept for parity with the
# original code.
tokenizer = WordPunctTokenizer()
stemmer = PorterStemmer()
lmtzr = WordNetLemmatizer()
cachedStopWords = stopwords.words("english")
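# The NLTK resources used above (the English stopword list and the WordNet
# data behind the lemmatizer) are assumed to be installed locally; if they
# are not, a one-time download along these lines is needed:
#
#   import nltk
#   nltk.download('stopwords')
#   nltk.download('wordnet')
#   nltk.download('omw-1.4')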
# In[2]:
def replace_special_character(document):
    # Replace everything except letters, newlines and periods with spaces,
    # then turn the periods into spaces as well.
    result = re.sub(r'[^a-zA-Z\n.]', ' ', document).replace('.', ' ')
    # Collapse whitespace runs and drop any remaining line breaks.
    result = ' '.join(result.split())
    result = ''.join(result.splitlines())
    # Drop very short tokens (1-3 characters).
    result = re.sub(r'\b\w{1,3}\b', '', result)
    return result.strip()
# In[3]:
def removestopword(document):
    # Lowercase the document and drop English stopwords.
    text = ' '.join(word for word in document.strip().lower().split()
                    if word not in cachedStopWords)
    return text
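# A minimal sanity check of the two cleaners above on a made-up sentence
# (the string is illustrative only, not taken from data.csv):
#
#   sample = "The U.S. market grew by 4.5% in early 2020."
#   cleaned = removestopword(replace_special_character(sample))
#
# replace_special_character strips digits, punctuation and tokens of three
# characters or fewer, and removestopword then drops English stopwords, so
# `cleaned` ends up as roughly "market grew early".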
# In[4]:
def readTestFile():
    # data.csv is assumed to be a headerless two-column CSV: a label
    # followed by the raw document text.
    file = 'data.csv'
    data_frame = pd.read_csv(file, names=['label', 'text'])
    print('finished reading files ...')
    data_frame['text'] = data_frame['text'].apply(replace_special_character)
    print('finished cleaning ...')
    return data_frame
# In[5]:
data = readTestFile()
# In[6]:
data.head()  # NOTE: outside a notebook, wrap this in print(...) to see the preview
# In[7]:
def pre_process(strng, enable_trivial=False):
    # NOTE: enable_trivial is accepted but never used in this script.
    # Keep only letters and spaces, lowercase, then lemmatize each token.
    strng = re.sub('[^a-zA-Z ]', '', strng)
    words = []
    for token in strng.lower().split():
        words.append(str(lmtzr.lemmatize(token)))
    return words
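# Note: WordNetLemmatizer.lemmatize() treats every token as a noun when no
# POS tag is passed, so noun plurals ("cars" -> "car") are reduced but
# inflected verbs such as "running" come back unchanged.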
def tokenize_and_stem(text):
    # Despite the name, this only splits on whitespace and lemmatizes via
    # pre_process(); no sentence tokenization or stemming is performed.
    tokens = pre_process(text)
    return tokens
# In[8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: scikit-learn ignores token_pattern (with a warning) when a custom
# tokenizer is supplied; it is kept here as in the original.
tf = TfidfVectorizer(analyzer='word', lowercase=True, tokenizer=tokenize_and_stem,
                     token_pattern=r'\b[a-zA-Z]+\b', ngram_range=(1, 2), norm='l2')
# In[9]:
matrix = tf.fit_transform(data['text'].tolist())
# get_feature_names() was removed in scikit-learn 1.2; the *_out variant
# works on 1.0+. On older versions, fall back to tf.get_feature_names().
features = tf.get_feature_names_out()
# In[11]:
matrix
# In[12]:
features[:10]
# In[ ]:
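# A minimal inspection sketch (an addition, not part of the original
# notebook): report the shape of the TF-IDF matrix and the highest-weighted
# terms of the first document, using the `matrix` and `features` objects
# built above.
print('tf-idf matrix shape:', matrix.shape)
row = matrix[0].toarray().ravel()
top_idx = np.argsort(row)[::-1][:10]
print([(features[i], round(float(row[i]), 3)) for i in top_idx if row[i] > 0])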