# Extract address from unstructured text
import pickle
import nltk
import string
from nltk import pos_tag
from nltk import word_tokenize
from nltk.chunk import ChunkParserI
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tag import ClassifierBasedTagger
from nltk.tag.util import untag
from nltk.stem.snowball import SnowballStemmer

# Chunk label used for address spans (reuses NLTK's GPE -- geo-political entity -- tag)
GPE_TAG = "GPE"


class AddressChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=self.features,
            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
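
    # Illustrative only: for text containing an address like "44 West 22nd Street",
    # the returned tree nests the address tokens under a GPE node, roughly:
    #   Tree('S', [..., Tree('GPE', [('44', 'CD'), ('West', 'NNP'), ...]), ...])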

    def features(self, tokens, index, history):
        # for more details see: http://nlpforhackers.io/named-entity-extraction/
        """
        `tokens` = a POS-tagged sentence [(w1, t1), ...]
        `index` = the index of the token we want to extract features for
        `history` = the previous predicted IOB tags
        """
        # init the stemmer
        stemmer = SnowballStemmer('english')

        # Pad the sequence with placeholders
        tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)
        # shift the index by 2 to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]

        contains_dash = '-' in word
        contains_dot = '.' in word
        # True only when every character is a lowercase ASCII letter
        allascii = all(c in string.ascii_lowercase for c in word)
        allcaps = word == word.upper()
        capitalized = word[0] in string.ascii_uppercase
        prevallcaps = prevword == prevword.upper()
        prevcapitalized = prevword[0] in string.ascii_uppercase
        nextallcaps = nextword == nextword.upper()
        nextcapitalized = nextword[0] in string.ascii_uppercase

        f = {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-ascii': allascii,
            'next-word': nextword,
            'next-lemma': stemmer.stem(nextword),
            'next-pos': nextpos,
            'next-next-word': nextnextword,
            'next-next-pos': nextnextpos,
            'prev-word': prevword,
            'prev-lemma': stemmer.stem(prevword),
            'prev-pos': prevpos,
            'prev-prev-word': prevprevword,
            'prev-prev-pos': prevprevpos,
            'prev-iob': previob,
            'contains-dash': contains_dash,
            'contains-dot': contains_dot,
            'all-caps': allcaps,
            'capitalized': capitalized,
            'prev-all-caps': prevallcaps,
            'prev-capitalized': prevcapitalized,
            'next-all-caps': nextallcaps,
            'next-capitalized': nextcapitalized,
        }
        return f


def get_address_chunker(dataset_file_name):
    """
    Returns an AddressChunker instance trained on the samples in `dataset_file_name`.

    `dataset_file_name` = file name of a pickled list of CoNLL IOB format sentences
    """
    with open(dataset_file_name, 'rb') as fp:
        dataset = pickle.load(fp)
    print("Loaded %d training sentences" % len(dataset))
    chunker = AddressChunker(dataset)
    return chunker
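
# Illustrative sketch (assumption, not taken from the actual dataset file): each
# pickled sentence is expected to be a list of ((word, pos_tag), iob_label) pairs,
# with address tokens labelled B-GPE / I-GPE and everything else O, e.g.:
#
#   [(("Joe", "NNP"), "O"), (("lives", "VBZ"), "O"), (("at", "IN"), "O"),
#    (("44", "CD"), "B-GPE"), (("West", "NNP"), "I-GPE"),
#    (("22nd", "CD"), "I-GPE"), (("Street", "NNP"), "I-GPE")]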


def get_chunker_accuracy(chunker, test_samples):
    """
    Returns the accuracy score of the chunker against the gold standard.

    `test_samples` = list of sentences in the same ((word, pos), iob) format as the training data
    """
    score = chunker.evaluate([
        conlltags2tree([(w, t, iob) for (w, t), iob in iobs])
        for iobs in test_samples
    ])
    return score.accuracy()
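
# Evaluation sketch (illustrative, not part of the original script): assuming the
# pickled layout described above, one could hold out part of the data as a gold
# standard:
#
#   with open('dataset/IOB_tagged_addresses.pkl', 'rb') as fp:
#       data = pickle.load(fp)
#   split = int(len(data) * 0.8)
#   print(get_chunker_accuracy(AddressChunker(data[:split]), data[split:]))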


def get_tagged_sentence(chunker, sentence):
    """
    returns IOB tagged tree of sentence
    """
    return chunker.parse(pos_tag(word_tokenize(sentence)))


def extract_address(chunker, sentence):
    """
    returns all addresses in sentence
    """
    def tree_filter(tree):
        return GPE_TAG == tree.label()

    tagged_tree = get_tagged_sentence(chunker, sentence)
    addresses = list()
    for subtree in tagged_tree.subtrees(filter=tree_filter):
        addresses.append(untag(subtree.leaves()))
    return addresses
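
# Usage sketch (illustrative): each extracted address is returned as a list of
# tokens, so a naive way to get a printable string back is a plain join:
#
#   for tokens in extract_address(chunker, text):
#       print(" ".join(tokens))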
print("Loading dataset...")
chunker = get_address_chunker('dataset/IOB_tagged_addresses.pkl')
print("Done.")
print(extract_address(chunker, "Hey man! Joe lives here: 44 West 22nd Street, New York, NY 12345. Can you contact him now? If you need any help, call me on 12345678"))