-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearch.py
334 lines (270 loc) · 12.4 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
import itertools
import numpy as np
from multiprocessing import Pool
from create_matrix import *
from formula_parser import *
from valuation import *
def load_property_vectors(ud_prefixes, uriel_prefixes):
"""Load the language-property matrix and return the property vectors.
Args:
ud_prefixes (list of str): A list of desired UD property categories (i.e. prefixes of the property names).
uriel_prefixes (list of str): A list of desired URIEL property categories (i.e. prefixes of the property names).
Returns:
dict of str:np.array: A dictionary which maps property names to property vectors.
list of str: The list of properties in alphabetically sorted order.
list of str: The list of languages in the same order as in the property vectors.
"""
# load matrices
matrixUD, languagesUD, propertiesUD = load_language_vectors("matrices/matrixUD.pickle", name="UD", save_overwrite=False)
matrixURIEL, languagesURIEL, propertiesURIEL = load_language_vectors("matrices/matrixURIEL.pickle", name="URIEL", save_overwrite=False)
# select languages
languages = sorted(list(set(languagesUD).intersection(set(languagesURIEL))))
# select properties
matrixUD, languagesUD, propertiesUD = select_language_vectors((matrixUD, languagesUD, propertiesUD), suffixes=languages, prefixes=ud_prefixes, none_replacement=None)
matrixURIEL, languagesURIEL, propertiesURIEL = select_language_vectors((matrixURIEL, languagesURIEL, propertiesURIEL), suffixes=languages, prefixes=uriel_prefixes, none_replacement=None)
property_vectors = {}
# refine UD property selection
untinteresting_relations = []#["dep", "discourse", "dislocated", "fixed", "goeswith", "list", "orphan", "parataxis", "reparandum", "vocative"] + ["compound", "flat", "punct", "root"]
c = {"S" : 100, "D" : 600, "C" : 600}
matrixUD = np.transpose(np.array(matrixUD, dtype=np.float64))
for i, v in enumerate(matrixUD):
p = propertiesUD[i]
if c[p[0]] > 0 and len([r for r in untinteresting_relations if r in p]) == 0:
property_vectors[p] = v
c[p[0]] -= 1
if p[0] in ["D", "C"] and (i+1) % 6 == 0:
px = p.split("#")
ps = px[1].split("-")
hd, d1, d2 = ps[2], ps[1], ps[0]
property_vectors[px[0]+"#"+hd+"-ini;["+d1+","+d2+"]"] = matrixUD[i-2]+matrixUD[i-5]
property_vectors[px[0]+"#"+hd+"-med;["+d1+","+d2+"]"] = matrixUD[i-1]+matrixUD[i-4]
property_vectors[px[0]+"#"+hd+"-fin;["+d1+","+d2+"]"] = matrixUD[i-0]+matrixUD[i-3]
if p[0] == "D":
property_vectors[px[0]+"#"+d1+"-"+d2+";["+hd+"]"] = matrixUD[i-3]+matrixUD[i-4]+matrixUD[i-5]
# property_vectors[px[0]+"#"+d2+"-"+d1+";["+hd+"]"] = matrixUD[i-0]+matrixUD[i-1]+matrixUD[i-2]
# refine URIEL property selection
matrixURIEL = np.transpose(np.array(matrixURIEL, dtype=np.float64))
for i, v in enumerate(matrixURIEL):
p = propertiesURIEL[i]
if p[0] == "S" or (p[0] == "F" and sum(v) > 1):
property_vectors[p] = v
return property_vectors, sorted(property_vectors.keys()), languages
def build_network(variables, max_vars):
"""Generates disjunctive normal forms (DNFs) by iteratively expanding nodes,
starting with variables, applying negation, conjunction and disjunction to nodes.
Don't use because very slow and runtime-inefficient!
Args:
variables (list of str): A list of variable names.
max_vars (int): The maximum number of variables involved in a formula.
All appearances of a variable are counted individually.
Returns:
dict of str:Term: The network, i.e. a mapping from formulas in DNF to respective terms.
"""
nodes = {variable : Term([variable]) for variable in variables}
checked = set()
while True:
ks = list(nodes.keys())
ks.sort()
new_nodes = {}
for i, k1 in enumerate(ks):
term1 = nodes[k1]
if k1 not in checked:
term_neg = convert_to_dnf(Term([term1], "NEG"))
term_neg_str = traverse_inorder(term_neg)
if term_neg_str not in ["0", "1"] and len(term_neg.atoms()) <= max_vars:
new_nodes[term_neg_str] = term_neg
checked.add(k1)
for j, k2 in enumerate(ks):
if j >= i:
break
if (k1,k2) not in checked:
term2 = nodes[k2]
term_con = convert_to_dnf(Term([term1, term2], "AND"))
term_con_str = traverse_inorder(term_con)
if term_con_str not in ["0", "1"] and len(term_con.atoms()) <= max_vars:
new_nodes[term_con_str] = term_con
term_dis = convert_to_dnf(Term([term1, term2], "OR"))
term_dis_str = traverse_inorder(term_dis)
if term_dis_str not in ["0", "1"] and len(term_dis.atoms()) <= max_vars:
new_nodes[term_dis_str] = term_dis
checked.add((k1,k2))
len1 = len(nodes)
nodes.update(new_nodes)
len2 = len(nodes)
if len2 == len1:
break
return nodes
class SearchState():
"""A class to store information about disjunctive normal forms (DNFs), which are search states.
"""
def __init__(self, term, values, length, disjuncts):
"""Constructor of the class `SearchState`.
Args:
term (Term): The term of the DNF.
values (np.array): The values after evaluating the formula for every language.
length (int): The length (in terms of variables) of the formula.
disjuncts (set of (tuple of str)): The disjuncts of the formula.
A disjunct is represented as a tuple which contains the names of its atomic variables in sorted order.
Negated variables are appened by "NEG ".
"""
self.term = term
self.values = values
self.length = length
self.disjuncts = disjuncts
def __str__(self):
"""String representation of the class `SearchState`.
Returns:
str: inorder traversal of the DNF, vector, length, set of disjuncts as stringified tuple
"""
return str((traverse_inorder(self.term), self.values, self.length, self.disjuncts))
vn_averaging = VProduct() # global variable
def average_value(values, weights):
"""Calculate the weighted average of a vector.
Args:
values (np.array): The value vector.
weights (np.array): The weight vector.
Returns:
float: The weighted average.
"""
vn_averaging.values = values
vn_averaging.weights = weights
return vn_averaging.collapse()
def worthy(values, weights, nan_p=0.25, t=0.5):
"""Checks whether a vector is worth to be kept.
Args:
values (np.array): The vector.
weights (np.array): The corresponding weights.
nan_p (float): Maximum percentage of NaN values.
t (float): Minimum average value.
Returns:
boolean: True iff the vector should be kept.
"""
return np.count_nonzero(np.isnan(values)) <= nan_p*len(values) and average_value(values, weights) >= t
def sort_out_ones(states, weights, max_me=0.05):
"""Separates final states from non-final states.
States with an average value close to 1 are final.
Args:
states (list of SearchState): List of states.
weights (np.array): Weight vector for averaging.
max_me (float): Maximum distance to 1.
Returns:
list of SearchState: List of states with an average value close to 1.
list of SearchState: List of the other states.
"""
ones = []
no_ones = []
for state in states:
if average_value(1-state.values, weights) <= max_me:
ones.append(state)
else:
no_ones.append(state)
return ones, no_ones
def improved(operator, state1, state2, values, weights, i=0.05):
"""Checks whether the connection of two states significantly improved the average value.
disjunction: the new average value is greater than the greatest average value of the state to connect
conjunction: the new average value is smaller than the smallest average value of the state to connect
Args:
operator (str): "OR" or "AND", the connective
state1 (SearchState): 1st state to connect
state2 (SearchState): 2nd state to connect
values (np.array): vector of the connection
weights (np.array): weights for weighted averaging
i (float): minimum improvement
Returns:
boolean: True iff the connection shows an improvement.
"""
m1 = average_value(state1.values, weights)
m2 = average_value(state2.values, weights)
m = average_value(values, weights)
if operator == "OR":
l = len(state1.disjuncts)+len(state2.disjuncts)
return m >= max(m1, m2) + (l-1)*i
elif operator == "AND":
l = len(next(iter(state1.disjuncts)))+len(next(iter(state2.disjuncts)))
return m <= min(m1, m2) - (l-1)*i
return None
def minimal_disjuncts(dnf1, dnf2):
"""Checks whether the disjunction of two DNFs has minimal disjuncts, i.e.
1) there is no atom common to all disjuncts.
2) there are no two disjuncts which are identical except for one atom which
is contained as unnegated atom in one disjunct and as negated atom in the other disjunct.
Args:
dnf1 (SearchState): 1st DNF
dnf2 (SearchState): 2nd DNF
Returns:
boolean: True iff the DNFs can be combined without causing a non-minimal DNF.
"""
disjuncts = dnf1.disjuncts | dnf2.disjuncts
if len(set.intersection(*[set(c) for c in disjuncts])) > 0:
return False
for c1, c2 in itertools.combinations(disjuncts, 2):
if 2*(len(set(c1+c2))-1) == len(c1+c2):
x = list(set(c1).difference(set(c2)) | set(c2).difference(set(c1)))
if len(x) == 2 and x[0].replace("NEG ", "") == x[1].replace("NEG ", ""):
return False
return True
def search(property_vectors, languages, weighted=True, max_vars=4, k_best=100, max_me=0.05, tr_th=0.5, improve=0.05):
"""Searach disjunctive normal forms (DNFs) with maximal average scores.
Args:
property_vectors (dict of str:np.array): A dictionary which maps property names to property vectors.
languages (list of str): The languages with values in the property vectors.
weighted (boolean): True iff phylogenetic weights should be used.
max_vars (int): The maximum number of variables involved in a formula.
All appearances of a variable are counted individually.
k_best (int): Only keeps the k best disjuncts when iteratively constructing DNFs.
max_me (float): Maximum distance to 1 for final states.
tr_th (float): Minimum average value for disjuncts.
improve (float): Minimum improvement per disjunct.
Returns:
list of SearchState: List of ranked final states.
"""
valuation = VFuzzy(property_vectors)
pweights = np.ones(len(languages))
if weighted:
pweights = np.array(get_phylogenetic_weights(languages))
pweights = np.array(pweights)
# construct atoms
atoms = []
for p in sorted(property_vectors.keys()):
# type(p) can be `str` or `numpy.str_` but must be `str`
atoms.append(SearchState(Term([str(p)]), property_vectors[p], 1, set([(str(p),)])))
atoms.append(SearchState(Term([Term([str(p)])], "NEG"), 1-property_vectors[p], 1, set([("NEG " + str(p),)])))
atoms = [atom for atom in atoms if worthy(atom.values, pweights, t=tr_th)]
final_states, atoms = sort_out_ones(atoms, pweights, max_me=max_me)
dnfs = [atoms]
# construct disjuncts (conjunctions)
for n in range(max_vars-1):
conjuncts = dnfs[n]
new_disjuncts = []
for conjunct in conjuncts:
conjunct_atoms = sorted(conjunct.term.atoms(True))
for atom in dnfs[0]:
concatenated_atoms = conjunct_atoms + atom.term.atoms(True)
if atom.term.atoms()[0] not in conjunct.term.atoms() and tuple(concatenated_atoms) == tuple(sorted(concatenated_atoms)):
operands = (conjunct.term.operands if n > 0 else [conjunct.term])
values = valuation._and([conjunct.values, atom.values])
if worthy(values, pweights, t=tr_th):# and improved("AND", conjunct, atom, values, pweights, i=improve):
new_disjuncts.append(SearchState(Term(operands + [atom.term], "AND"), values, 2+n, set([tuple(sorted(concatenated_atoms))])))
dnfs.append(new_disjuncts)
dnfs = sum(dnfs, [])
# construct disjunctions (DNFs)
for n in range(max_vars-1):
dnfs = sorted(dnfs, key=lambda dnf: average_value(dnf.values, pweights), reverse=True)
dnfs = dnfs[:k_best]
new_dnfs = []
for i, dnf1 in enumerate(dnfs):
disjuncts1 = (dnf1.term.operands if dnf1.term.operator == "OR" else [dnf1.term])
for j, dnf2 in enumerate(dnfs):
if j >= i:
break
disjuncts2 = (dnf2.term.operands if dnf2.term.operator == "OR" else [dnf2.term])
if dnf1.length + dnf2.length <= max_vars and len(dnf1.disjuncts.intersection(dnf2.disjuncts)) == 0 and minimal_disjuncts(dnf1, dnf2):
values = valuation._or([dnf1.values, dnf2.values])
if worthy(values, pweights, t=tr_th) and improved("OR", dnf1, dnf2, values, pweights, i=improve):
new_dnfs.append(SearchState(Term(disjuncts1 + disjuncts2, "OR"), values, dnf1.length + dnf2.length, dnf1.disjuncts | dnf2.disjuncts))
new_final_states, new_dnfs = sort_out_ones(new_dnfs, pweights, max_me=max_me)
final_states.extend(new_final_states)
dnfs.extend(new_dnfs)
final_states.extend(dnfs)
final_states = sorted(final_states, key=lambda state: average_value(state.values, pweights), reverse=True)
return final_states