-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathedit_raw_defs.py
70 lines (55 loc) · 1.74 KB
/
edit_raw_defs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Go through one of the raw files and rewrite the definitions
# so that Collins words are marked properly.
import bisect
# Bare inflection suffixes such as "-S", "-ED", "-ING" are never looked up.
SKIPPED_SUFFIXES = ('S', 'ED', 'ING')


def mark_segment(seg, words):
    """Return *seg* with a '#' or '*' marker inserted after its word, if needed.

    seg   -- one whitespace-delimited token from an "A:" line.
    words -- sorted list of raw lines from the marked word list (each entry
             is a word, optionally followed by '#', usually followed by a
             newline).

    The token is returned unchanged when it is not all uppercase, when it is
    one of the bare suffixes S/ED/ING, when it already carries a '#', when
    its letters do not form a leading run of the token, or when the lookup
    finds an unflagged entry.  Otherwise '#' is inserted if the entry found
    in the word list is flagged with '#', and '*' is inserted if the entry
    at the lookup position does not start with the word.
    """
    # Only care about stuff that is all uppercase.
    if not seg.isupper():
        return seg

    # We want just the word: every alphabetic character of the token.
    justtheword = ''.join(ch for ch in seg if ch.isalpha())
    wordlen = len(justtheword)

    # Ignore -s, -ed, -ing.
    if justtheword in SKIPPED_SUFFIXES:
        return seg

    # The rebuild below assumes the letters are the first `wordlen`
    # characters of the token.  For tokens such as "(FOO)" or "FOO-BAR" that
    # is not true and rebuilding would drop or duplicate characters, so such
    # tokens are left exactly as they were.
    if seg[:wordlen] != justtheword:
        return seg

    # The word is already marked.  (Slicing avoids an index check.)
    if seg[wordlen:wordlen + 1] == '#':
        return seg

    # Find the appropriate spot in the word list.
    k = bisect.bisect_left(words, justtheword)
    if k >= len(words):
        # Word sorts after every entry; left unmarked, as before.
        return seg

    # NOTE(review): this is a prefix comparison, not an exact match, so a
    # word that is absent from the list but is a prefix of the next entry
    # is treated as present.  Kept as-is because the leading "A:" token
    # relies on it to stay unmarked -- confirm before tightening.
    entry = words[k]
    if entry[:wordlen] != justtheword:
        addsymbol = '*'
    elif entry[wordlen:wordlen + 1] == '#':
        # Slice rather than index: the entry may be exactly the word with
        # nothing after it (e.g. a final line without a newline).
        addsymbol = '#'
    else:
        return seg

    # Reconstruct the token: word, marker, then whatever followed the word.
    return justtheword + addsymbol + seg[wordlen:]


def mark_definition_line(line, words):
    """Return *line* with every token of an "A:" line passed through mark_segment.

    Lines that do not start with 'A:' (including empty or one-character
    lines) are returned unchanged.  "A:" lines are split on whitespace and
    re-joined with single spaces, terminated by a newline.
    """
    if not line.startswith('A:'):
        return line
    marked = [mark_segment(seg, words) for seg in line.split()]
    # Remake the whole line.
    return ' '.join(marked) + '\n'


def main(raw_path='9s_all_raw_csw15.txt',
         words_path='all_csw_marked_alpha.txt',
         out_path='9s_all_raw_defsym.txt'):
    """Read the raw file, mark the words on its "A:" lines, write the result.

    The word list is read fully into memory (it is searched with bisect and
    is therefore expected to be sorted); the raw file is streamed line by
    line.  All files are closed even if processing fails part-way.
    """
    with open(words_path) as wordfile:
        words = wordfile.readlines()
    with open(raw_path) as data, open(out_path, 'w') as wf:
        for line in data:
            wf.write(mark_definition_line(line, words))


if __name__ == '__main__':
    main()