-
Notifications
You must be signed in to change notification settings - Fork 2
/
nlp.py
120 lines (99 loc) · 6.41 KB
/
nlp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# importing the nltk suite
from turtle import pos
from nltk import pos_tag
from nltk import word_tokenize
from nltk.corpus import brown
from nltk.tag import UnigramTagger
'''
Basic NLP for AUME by Joshua Kranabetter Feb 2022.
NLTK part of speech (pos) tagging and word filtering based on tag type.
'''
class WordFilter(object):
def __init__(self):
# Using data
train_sents = brown.tagged_sents()
self.uni_tagger = UnigramTagger(train_sents)
self.pos_discard_cache = []
self.output_cache = []
self.parts_of_speech_filter = {
'$' : False, # dollar [, −, −, —, A, A, C, HK, HK, M, NZ, NZ, S, U.S., U.S, …
'”' : False, # closing quotation mark [‘, ”]
'(' : False, # opening parenthesis [(, [, {]
')' : False, # closing parenthesis [), ], }]
',' : False, # comma [,]
'—' : False, # dash [–]
'.' : False, # sentence terminator [., !, ?]
':' : False, # colon or ellipsis [:, ;, …]
'CC' : False, # conjunction, coordinating [&, ‘n, and, both, but, either, et, for, less,…
'CD' : False, # numeral, cardinal [mid-1890, nine-thirty, forty-two, one-tenth, …
'DT' : False, # determiner [all, an, another, any, both, del, each, eithe…
'EX' : False, # existential there [there]
'FW' : False, # foreign word [gemeinschaft, hund, ich, jeux, habeas, Haemen…
'IN' : False, # preposition or conjunction, subordinating [astride, among, uppon, whether, out, inside, …
'JJ' : True, # adjective or numeral, ordinal [third, ill-mannered, pre-war, regrettable, oi…
'JJR' : True, # adjective, comparative [bleaker, braver, breezier, briefer, brighter,…
'JJS' : True, # adjective, superlative [calmest, cheapest, choicest, classiest, clean…
'LS' : False, # list item marker [A, A., B, B., C, C., D, E, F, First, G, H, I,…
'MD' : False, # modal auxiliary [can, cannot, could, couldn’t, dare, may, migh…
'NN' : False, # noun, common, singular or mass [common-carrier, cabbage, knuckle-duster, Casi…
'NNP' : False, # noun, proper, singular [Motown, Venneboerger, Czestochwa, Ranzer, Con…
'NNPS' : False, # noun, proper, plural [Americans, Americas, Amharas, Amityvilles, Am…
'NNS' : False, # noun, common, plural [undergraduates, scotches, bric-a-brac, produc…
'PDT' : False, # pre-determiner [all, both, half, many, quite, such, sure, this]
'POS' : False, # genitive marker [‘, ‘s]
'PRP' : False, # pronoun, personal [hers, herself, him, himself, hisself, it, its…
'PRP$' : False, # pronoun, possessive [her, his, mine, my, our, ours, their, thy, your]
'RB' : True, # adverb [occasionally, unabatingly, maddeningly, adven…
'RBR' : True, # adverb, comparative [further, gloomier, grander, graver, greater, …
'RBS' : True, # adverb, superlative [best, biggest, bluntest, earliest, farthest, …
'RP' : False, # particle [aboard, about, across, along, apart, around, …
'SYM' : False, # symbol [%, &, ‘, ”, ”., ), )., *, +, ,., <, =, >, @…
'TO' : False, # “to” as preposition or infinitive marker [to]
'UH' : False, # interjection [Goodbye, Goody, Gosh, Wow, Jeepers, Jee-sus, …
'VB' : False, # verb, base form [ask, assemble, assess, assign, assume, atone,…
'VBD' : False, # verb, past tense [dipped, pleaded, swiped, regummed, soaked, ti…
'VBG' : True, # verb, present participle or gerund [telegraphing, stirring, focusing, angering, j…
'VBN' : False, # verb, past participle [multihulled, dilapidated, aerosolized, chaire…
'VBP' : False, # verb, present tense, not 3rd person singular [predominate, wrap, resort, sue, twist, spill,…
'VBZ' : False, # verb, present tense, 3rd person singular [bases, reconstructs, marks, mixes, displeases…
'WDT' : False, # WH-determiner [that, what, whatever, which, whichever]
'WP' : False, # WH-pronoun [that, what, whatever, whatsoever, which, who,…
'WP$' : False, # WH-pronoun, possessive [whose]
'WRB' : False, # Wh-adverb [how, however, whence, whenever, where, whereby…
'“' : False, # opening quotation mark [`, “]
'None': False,
}
def pos_filter(self, tokens):
# if the input is a string, we tokenize the string, else its already a list
if(isinstance(tokens, str)):
print('ERROR: input to pos taggins is list of strings not a string')
# get POS tags, a list of tuples
pos_tags = pos_tag(tokens)
# filter the tags, keeping only tags labelled true in the above parts filter
output = [x[0] for x in pos_tags if self.parts_of_speech_filter[x[1]]]
# add discards to a list
discard = []
discard += [x[0] for x in pos_tags if not self.parts_of_speech_filter[x[1]]]
return output, discard, pos_tags
def uni_filter(self, tokens):
# if the input is a string, we tokenize the string, else its already a list
if(isinstance(tokens, str)):
print('ERROR: input to pos taggins is list of strings not a string')
# get POS tags, a list of tuples
pos_tags = self.uni_tagger.tag(tokens)
pos_list = []
for item in pos_tags:
part = item[1]
if len(part) > 3: print('yep: ', part)
if part is None: part = 'None'
pos_list.append([item[0], part])
print('printing list')
print(pos_list)
# filter the tags, keeping only tags labelled true in the above parts filter
output = [x[0] for x in pos_list if self.parts_of_speech_filter[str(x[1])]]
# add discards to a list
discard = []
discard += [x[0] for x in pos_list if not self.parts_of_speech_filter[x[1]]]
return output, discard, pos_list
def tokenize(self, input):
return word_tokenize(input)