-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
85 lines (66 loc) · 2.16 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from glob import glob
import pandas as pd
import re
import nltk
nltk.download('punkt')
from nltk import word_tokenize as lib_tokenizer
from cleantext import clean
import string
# Base directory the stopword file below is read from.
ROOT_DIR = '/code/tai_data'

# Stopword entries, one per line of the file. The file is decoded explicitly
# as UTF-8 so the result does not depend on the platform's default locale
# encoding. Splitting on '\n' keeps an empty trailing entry when the file
# ends with a newline.
with open(f'{ROOT_DIR}/vietnamese-stopwords-dash.md', 'r', encoding='utf-8') as f:
    STOPWORDS = f.read().split('\n')

# Memo table filled by word_tokenizer: raw whitespace-delimited chunk ->
# its tokenized, space-joined form.
dict_map = {}
def word_tokenizer(text):
    """Tokenize ``text`` chunk by chunk, memoizing the result per chunk.

    ``text`` is split on whitespace. Each chunk is passed to
    ``lib_tokenizer``, the resulting tokens are joined back with single
    spaces, and the two-character quote markers (double backtick and double
    apostrophe) are replaced with a plain double quote. The normalized form
    of every chunk is stored in the module-level ``dict_map`` so a chunk
    seen before is not tokenized again.

    Returns:
        A list with one normalized string per whitespace-delimited chunk;
        an entry may itself contain spaces when the tokenizer split the
        chunk into several tokens.
    """
    normalized = []
    for chunk in text.split():
        cached = dict_map.get(chunk)
        if cached is None:
            pieces = lib_tokenizer(chunk)
            cached = ' '.join(pieces).replace('``', '"').replace("''", '"')
            dict_map[chunk] = cached
        normalized.append(cached)
    return normalized
# Emoticon patterns blanked out by word_normalizer, applied one after another
# in exactly this order. They are raw strings so the regex escapes are not
# treated as (invalid) Python string escapes, and are compiled once here
# instead of being rebuilt on every call.
_EMOTICON_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'[:=]\)+',
        r'[:=]\(+',
        r'<3',
        r':[<>v3]',
    )
)


def word_normalizer(text):
    """Strip URLs, markup, emoticons and unwanted characters from ``text``.

    Steps, in order:
      1. Replace anything starting with "http" up to the next whitespace,
         and anything shaped like an angle-bracket tag, with a space.
      2. Replace a quoted "RE:" prefix (an apostrophe followed by "RE:" that
         has a closing apostrophe later on) and any run of word characters
         starting with "ynm" with a space.
      3. Pass the text through ``clean`` with emoji and line-break removal
         switched on and ASCII transliteration and lowercasing switched off
         (everything else is left to that library's defaults).
      4. Replace the emoticon patterns in ``_EMOTICON_PATTERNS`` with a space.
      5. Collapse every whitespace run into a single space.
      6. Replace every character outside U+0000-U+024F and U+1E00-U+1EFF
         with a space.
      7. Delete the literal "BULLET : : : :" and then every "=", "/" and
         backtick character.

    Whitespace is not collapsed again after step 5, so the result can
    contain consecutive spaces and is not stripped at either end.

    Returns:
        The normalized string.
    """
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'<[^>]+>', ' ', text)
    # The quoted "RE:" prefix is removed before the "ynm..." tokens, matching
    # the evaluation order of the original nested call.
    text = re.sub(r"'RE:(?=[^']*')", ' ', text)
    text = re.sub(r'ynm\w*', ' ', text)
    text = clean(text, no_emoji=True, to_ascii=False, no_line_breaks=True, lower=False)
    # Sequential substitution (not one combined alternation): a later pattern
    # must see the text as already modified by the earlier ones.
    for emoticon in _EMOTICON_PATTERNS:
        text = emoticon.sub(' ', text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]', ' ', text)
    text = text.replace("BULLET : : : :", "")
    for unwanted in ("=", "/", "`"):
        text = text.replace(unwanted, "")
    return text
# Characters trimmed from either end of an answer string.
_ANSWER_EDGE_PUNCT = '.,/><;:\'"[]{}+=-_)(*&^!~`'


def strip_answer_string(text):
    """Trim surrounding whitespace and punctuation from an answer string.

    Punctuation is removed one character at a time, first from the end and
    then from the start, with whitespace stripped after each removal. Two
    kinds of delimiters are kept because they look balanced inside the text:

    * a trailing ")" when the text does not start with "(" but contains one,
      e.g. ``Ho Chi Minh (HCM)``;
    * a trailing or leading double quote when the opposite end is not a
      double quote and the text holds more than one, e.g. ``say "hi"``.

    Empty input, whitespace-only input, and input made up entirely of
    strippable punctuation yield an empty string instead of raising
    ``IndexError``.

    Returns:
        The trimmed string (possibly empty).
    """
    text = text.strip()

    # Trailing side. The `text and` guard stops the loop once everything has
    # been consumed, so text[-1] / text[0] are never read on an empty string.
    while text and text[-1] in _ANSWER_EDGE_PUNCT:
        if text[0] != '(' and text[-1] == ')' and '(' in text:
            break
        if text[-1] == '"' and text[0] != '"' and text.count('"') > 1:
            break
        text = text[:-1].strip()

    # Leading side.
    while text and text[0] in _ANSWER_EDGE_PUNCT:
        if text[0] == '"' and text[-1] != '"' and text.count('"') > 1:
            break
        text = text[1:].strip()

    # Every mutation above already strips, so no final strip is needed.
    return text
def strip_context(text):
    """Return ``text`` on a single line with whitespace normalized.

    Newlines become spaces, every run of whitespace is collapsed into one
    space, and leading/trailing whitespace is removed.
    """
    flattened = text.replace('\n', ' ')
    return re.sub(r'\s+', ' ', flattened).strip()
def remove_stopwords(text):
    """Drop stopwords and join what is left with single spaces.

    ``text`` is iterated item by item and every item that is not found in
    the module-level ``STOPWORDS`` is kept, in order.
    NOTE(review): presumably ``text`` is a list of word tokens rather than a
    raw string (a string would be filtered character by character) --
    confirm against callers.

    Returns:
        The kept items joined into one space-separated string.
    """
    kept = [token for token in text if token not in STOPWORDS]
    return ' '.join(kept)