# tag_heb_pos.py
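"""Tag each word of the language-tagged Talmud corpus with its part of speech
by querying a locally running YAP server, writing one POS-tagged JSON file per
masekhet to ./data/pos_tagged_talmud/."""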
import requests
import os
import json
from utils.deconstruct import *
from utils.rlprint import rlprint
data_path = './data/lang_tagged_talmud/'
output_path = './data/pos_tagged_talmud/'
yap_host_address = 'http://localhost:8000/yap/heb/joint'
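# Assumes a YAP server is listening locally on port 8000 (with the OnlpLab/yap
# distribution, typically started via `./yap api`).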
def remove_aram_prefixes_from_start(word):
    # Thin Aramaic-specific ('A') wrapper around utils.deconstruct.detach_prefixes.
    return detach_prefixes(word, lang='A')
def get_yap_tagging(heb_phrase):
    # YAP's HTTP API expects a GET request carrying a JSON body. Serialize with
    # json.dumps so quotes and backslashes in the phrase are escaped correctly,
    # which hand-rolled str.format templating would not do.
    data = json.dumps({'text': heb_phrase}).encode('utf-8')
    yap_response = requests.get(yap_host_address, data=data, headers={'content-type': 'application/json'})
    yap_json = json.loads(yap_response.text)
    return yap_json
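# The joint endpoint's response (as consumed below) is assumed to look like:
#
#     {"ma_lattice": "...", "md_lattice": "...", "dep_tree": "..."}
#
# Only 'md_lattice' -- a tab-separated, CoNLL-style morpheme lattice -- is
# used by clean_yap_rval.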
def clean_yap_rval(yap_rval):
    """
    Returns a list of POS tags, one per original input token.
    """
    token_info = yap_rval['md_lattice'].split('\n')
    # Each info line is structured as:
    # INDEX  INDEX+1  TOKEN  REVISED_TOKEN  POS  POS  OTHER_INFO  ORIGINAL_INDEX
    token_info = [info.split('\t') for info in token_info]
    pos_list = []
    curr = None  # ORIGINAL_INDEX of the last row handled (indices arrive as strings)
    for info in token_info:
        if len(info) == 1:  # the table ends with empty strings
            continue
        pos = info[4]
        index = info[-1]
        # Several rows can share one ORIGINAL_INDEX when YAP splits off
        # prefixes; the final POS for a given index is the one that matters.
        if curr == index:
            pos_list[-1] = pos
        else:
            curr = index
            pos_list.append(pos)
    return pos_list
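# A hypothetical md_lattice for a two-token phrase, where the first token is
# split into a prefix row and a stem row sharing ORIGINAL_INDEX 1:
#
#     0   1   ה     ה     DEF  DEF  _  1
#     1   2   בית   בית   NN   NN   _  1
#     2   3   גדול  גדול  JJ   JJ   _  2
#
# clean_yap_rval would return ['NN', 'JJ']: one tag per original token, with
# the prefix's tag overwritten by the stem's.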
def tag_heb_pos(seq):
    # The trailing space is deliberate: YAP expects space-terminated input.
    heb_phrases = ' '.join(seq) + ' '
    rlprint(heb_phrases)
    tagged_heb_sequence = get_yap_tagging(heb_phrases)
    heb_tags_only = clean_yap_rval(tagged_heb_sequence)
    return heb_tags_only
def prep_for_yap(chk_text):
    # Use the second recorded form of each word when present, else the first.
    words_for_tagging = [word_forms['word'][1] if word_forms['word'][1] != '' else word_forms['word'][0]
                         for word_forms in chk_text]
    words_for_tagging = [heb_plural(to_hitpael(wd)) for wd in words_for_tagging]
    heb_words_only = []
    for i in range(len(words_for_tagging)):
        # Only Hebrew ('R') words go to YAP; the geresh check ensures
        # abbreviations such as 'מתני׳' aren't included in the phrase.
        # Everything else becomes the placeholder '.', so the output still
        # has one entry per input word.
        for_yap = alph_only(words_for_tagging[i]) if chk_text[i]['lang'] == 'R' \
            and '׳' not in words_for_tagging[i] \
            else '.'
        heb_words_only.append(for_yap)
    return heb_words_only
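# A sketch of the intended behavior, with hypothetical chunk records (the
# exact 'word' pair semantics come from the upstream language tagger, and the
# normalizers are assumed to leave plain Hebrew words unchanged):
#
#     chk_text = [{'lang': 'R', 'word': ['בית', 'בית']},
#                 {'lang': 'A', 'word': ['דביתא', '']}]
#     prep_for_yap(chk_text)  # -> ['בית', '.']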
if __name__ == '__main__':
    print('REMINDER: YAP must be running in order for this program to run.')
    run = input('Is YAP running? y/n: ')
    if run == 'n':
        raise SystemExit('Start YAP, then rerun this script.')
    files = os.listdir(data_path)
    for file in files:
        title = file[:-5]  # strip the '.json' extension
        do_masekhet = input('Proceed with ' + title + '? y/n: ')
        if do_masekhet == 'n':
            continue
        with open(data_path + file, encoding='utf-8') as f:
            text = json.load(f)
        text_with_pos_tags = []
        continued = False
        prev_chunk_text = []
        for page in text:
            print('-' * 10 + 'next page' + '-' * 10)
            # What will be written to the file -- just the chunks themselves,
            # with each word POS- and language-tagged
            text_with_pos_tags.append([])
            for chunk in page:
                print('=' * 10)
                chunk_text = chunk['text']
                if not continued:
                    hebrew_words_only = prep_for_yap(chunk_text)
                else:
                    # Re-tag the carried-over chunk together with its
                    # continuation so YAP sees the full phrase.
                    hebrew_words_only = prep_for_yap(prev_chunk_text + chunk_text)
                # Reversed so pos_tags.pop() hands out tags in sentence order.
                pos_tags = tag_heb_pos(hebrew_words_only)[::-1]
                if continued:
                    prev_chunk_tagged = [{'lang': word_forms['lang'],
                                          'word': word_forms['word'],
                                          'pos': pos_tags.pop()} for word_forms in prev_chunk_text]
                    # Overwrite the previous page's final chunk with its re-tagged version.
                    text_with_pos_tags[-2][-1] = prev_chunk_tagged
                    print('*' * 10)
                    for w in prev_chunk_tagged:
                        rlprint(w['word'][1], end='\t\t\t')
                        print(w['pos'])
                    print('*' * 10)
                curr_chunk_tagged = [{'lang': word_forms['lang'],
                                      'word': word_forms['word'],
                                      'pos': pos_tags.pop()} for word_forms in chunk_text]
                text_with_pos_tags[-1].append(curr_chunk_tagged)
                # A chunk of type 'mc' or 'gc' whose final word isn't the '.'
                # placeholder is treated as continuing onto the next page
                # (the indexing above assumes such a chunk is the last on its page).
                if not continued and chunk['type'] in ('mc', 'gc') and hebrew_words_only[-1] != '.':
                    prev_chunk_text = chunk_text
                    continued = True
                else:
                    prev_chunk_text = []
                    continued = False
                for w in curr_chunk_tagged:
                    rlprint(w['word'][1], end='\t\t\t')
                    print(w['pos'])
        with open(output_path + file, 'w+', encoding='utf-8') as f:
            json.dump(text_with_pos_tags, f, ensure_ascii=False, indent=4)
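# Shape of each output file: a list of pages, each a list of chunks, each a
# list of word records, e.g.
#
#     [ [ [ {"lang": "R", "word": [...], "pos": "NN"}, ... ], ... ], ... ]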