#####################################################################
# Text Helper
#
# To process text data
#
#####################################################################

__author__ = "Kornraphop Kawintiranon"
__email__ = "[email protected]"

import string
import re
import concurrent.futures
import multiprocessing

import tqdm
from nltk.corpus import stopwords
from nltk.tokenize.destructive import NLTKWordTokenizer

def parallel_tokenize(corpus, tokenizer=None, n_jobs=-1):
    """Tokenize a list of strings in parallel, one worker process per core."""
    if tokenizer is None:
        tokenizer = NLTKWordTokenizer()
    if n_jobs < 0:
        # Leave one core free, but always keep at least one worker.
        n_jobs = max(1, multiprocessing.cpu_count() - 1)
    with concurrent.futures.ProcessPoolExecutor(max_workers=n_jobs) as executor:
        corpus_tokenized = list(
            tqdm.tqdm(
                executor.map(tokenizer.tokenize, corpus, chunksize=200),
                total=len(corpus),
                desc="Tokenizing",
            )
        )
    return corpus_tokenized
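
# Note (caveat): ProcessPoolExecutor spawns worker processes, so on platforms
# that use the "spawn" start method (Windows, and macOS on recent Pythons)
# parallel_tokenize should only be called from inside an
# `if __name__ == "__main__":` guard, as the test block at the bottom of this
# file does.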

def remove_stopwords(corpus, language='english'):
    """Drop NLTK stopwords from each token list in the corpus."""
    stop_words = set(stopwords.words(language))
    processed_corpus = []
    for words in corpus:
        words = [w for w in words if w not in stop_words]
        processed_corpus.append(words)
    return processed_corpus
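
# Note: the NLTK stopword lists are lowercase, so matching here is
# case-sensitive; a capitalized token such as "I" is kept. Lowercase the
# tokens first if case-insensitive removal is wanted.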

def remove_punctuations(corpus):
    """Drop punctuation tokens from each token list in the corpus."""
    punctuations = string.punctuation
    processed_corpus = []
    for words in corpus:
        words = [w for w in words if w not in punctuations]
        processed_corpus.append(words)
    return processed_corpus
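
# Note: membership is tested against the string.punctuation characters, so
# only tokens that occur verbatim inside that string are removed; multi-char
# punctuation tokens such as "..." or "--" pass through unchanged.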

def decontract(corpus):
    """Expand common English contractions in each string of the corpus."""
    processed_corpus = []
    for phrase in tqdm.tqdm(corpus, desc="Decontracting"):
        # Normalize curly apostrophes so the patterns below match.
        phrase = re.sub(r"’", "'", phrase)
        # Specific, irregular contractions first.
        phrase = re.sub(r"won't", "will not", phrase)
        phrase = re.sub(r"can't", "can not", phrase)
        # General patterns. These are heuristics: "'s" also matches
        # possessives, and "'d" can mean "had" as well as "would".
        phrase = re.sub(r"n't", " not", phrase)
        phrase = re.sub(r"'re", " are", phrase)
        phrase = re.sub(r"'s", " is", phrase)
        phrase = re.sub(r"'d", " would", phrase)
        phrase = re.sub(r"'ll", " will", phrase)
        phrase = re.sub(r"'t", " not", phrase)
        phrase = re.sub(r"'ve", " have", phrase)
        phrase = re.sub(r"'m", " am", phrase)
        processed_corpus.append(phrase)
    return processed_corpus
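
# Example (illustrative): decontract(["I can't go, but she'll stay"])
# returns ["I can not go, but she will stay"].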

def get_word_counts(corpus):
    """Count how many times each token appears across the whole corpus."""
    d = {}
    for words in tqdm.tqdm(corpus, desc="Word Counting"):
        for w in words:
            d[w] = d.get(w, 0) + 1
    return d
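
# A minimal standard-library equivalent, shown for reference:
#
#   import itertools, collections
#   counts = collections.Counter(itertools.chain.from_iterable(corpus))
#
# Counter is a dict subclass, so it is a drop-in replacement for the dict
# returned above.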

if __name__ == "__main__":
    # Quick smoke test of the helpers above.
    texts = [
        "I love NLP.",
        "I also really like machine learning. I would like to work on this field."
    ]

    tokenized_texts = parallel_tokenize(texts)
    for tokens in tokenized_texts:
        print(tokens)

    processed_texts = remove_stopwords(tokenized_texts)
    for tokens in processed_texts:
        print(tokens)
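
    # Illustrative additions (not in the original test): exercise the
    # remaining helpers on small inputs.
    decontracted = decontract(["I can't go, but she'll stay"])
    print(decontracted)  # ['I can not go, but she will stay']

    no_punct = remove_punctuations(processed_texts)
    print(no_punct)

    word_counts = get_word_counts(no_punct)
    print(word_counts)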