This repository has been archived by the owner on Dec 14, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
script_dataset.py
342 lines (302 loc) · 17.6 KB
/
script_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
"""
By Vincent Derkinderen (DTAI lab KU Leuven, 2019)
A script to create a ProbLog dataset from Bayesian networks that were compiled into ProbLog programs using ProbLog's
conversion script (12/2019).
Given a ProbLog program of a BN, this can create files used for making decisions or learning (task_type)
* task_type = d: Files used in decision making. Adds decisions based on Annotated Disjunctions and probabilistic facts,
and randomly assigns utilities to terms.
* task_type = d2: Files used in decision making. Adds decisions based on Annotated Disjunctions and probabilistic facts.
For the utilities, new terms are introduced and the body of their rules are constructed using program samples.
* else: files used for learning. Real model: utilities are randomly assigned to terms. Input model: some utilities are
randomly made unknown. Examples to learn from: the real program is sampled and observations are randomly left out.
Input files are read from ./data/raw/inputfile.pl and all files will be created into ./data/processed/
"""
import sys
import shutil
import experiments_utils as utils
import random
from problog.engine import DefaultEngine
from problog.program import PrologFile
def main(argv):
    """
    Command-line entry point: parse argv and build the files for the requested task type.

    The task type selects which template is created:
     * 'd'  - a decision model with utilities on existing terms (_create_decision_template).
     * 'd2' - a decision model with newly introduced utility nodes (_create_decision_template2).
     * anything else - the learning files: real model, input model and examples (_create_template).

    The parsed --output and --verbose options are not consulted by this function.

    :param argv: The full argument vector, including the program name at index 0.
    :type argv: list[str]
    """
    options = _argparser().parse_args(argv[1:])
    task = options.task_type

    if task == 'd':
        # decisions + utility on existing nodes
        _create_decision_template(name=options.inputfile, seed=options.seed)
        return

    if task == 'd2':
        # new utility nodes
        _create_decision_template2(name=options.inputfile, seed=options.seed,
                                   nb_of_utility_nodes=options.nb_util_nodes, nb_of_rules=options.nb_rules)
        return

    # utility on existing nodes + unknowns + observations
    _create_template(name=options.inputfile, seed=options.seed, drop_prob=options.drop,
                     prob_of_unknown=options.unknown_prob, n=options.nb_of_examples)
def _create_decision_template(name, seed):
    """
    Seed the random module and create the decision model '<name>_decision.pl' from '<name>.pl'.

    :param name: The model name without the '.pl' extension.
    :param seed: The seed handed to random.seed before the dataset is created.
    :type name: str
    :type seed: int
    """
    random.seed(seed)
    source = name + '.pl'
    target = name + "_decision.pl"
    # A single decision model only: no learning files are requested.
    create_dataset(
        filename=source,
        n=1,
        new_filename=target,
        decisions=True,
        learning=False,
        verbose=True,
    )
def _create_decision_template2(name, seed, nb_of_utility_nodes, nb_of_rules):
    """
    Seed the random module and create the decision model '<name>_decision2.pl' from '<name>.pl'.

    :param name: The model name without the '.pl' extension.
    :param seed: The seed handed to random.seed before the dataset is created.
    :param nb_of_utility_nodes: Forwarded to create_dataset2 as nb_utility_nodes.
    :param nb_of_rules: Forwarded to create_dataset2 as samples_per_utility_node.
    :type name: str
    :type seed: int
    :type nb_of_utility_nodes: int
    :type nb_of_rules: int
    """
    random.seed(seed)
    source = name + '.pl'
    target = name + "_decision2.pl"
    # A single decision model only: no learning files are requested.
    create_dataset2(
        filename=source,
        n=1,
        new_filename=target,
        decisions=True,
        learning=False,
        nb_utility_nodes=nb_of_utility_nodes,
        samples_per_utility_node=nb_of_rules,
        verbose=True,
    )
def _create_template(name, seed, drop_prob, prob_of_unknown, n):
    """
    Seed the random module and create the learning files for '<name>.pl'.

    No explicit output name is given, so create_dataset derives it from the input name and the parameters.

    :param name: The model name without the '.pl' extension.
    :param seed: The seed handed to random.seed before the dataset is created.
    :param drop_prob: Forwarded to create_dataset as drop_prob.
    :param prob_of_unknown: Forwarded to create_dataset as prob_of_unknown.
    :param n: The number of examples to create.
    :type name: str
    :type seed: int
    :type drop_prob: float
    :type prob_of_unknown: float
    :type n: int
    """
    random.seed(seed)
    source = name + '.pl'
    create_dataset(
        filename=source,
        n=n,
        decisions=False,
        learning=True,
        drop_prob=drop_prob,
        prob_of_unknown=prob_of_unknown,
        verbose=True,
    )
def create_dataset(filename, n, new_filename=None, decisions=True, learning=True, drop_prob=0.25, prob_of_unknown=0.5,
                   verbose=False):
    """
    Build the processed dataset files for the ProbLog model ./data/raw/filename.

    Always written:
     * ./data/processed/new_filename - the model with each term queried and utilities attached. When
     decisions=True, decisions are also present.
    Additionally written when learning=True:
     * ./data/processed/new_filename-without-pl_input.pl - the model with each term queried and utilities
     attached, where some utilities are unknown.
     * ./data/processed/new_filename-without-pl_examples.pl - n partially observed examples and their utility.

    :param filename: The name of the ProbLog file to create a dataset for.
    :param n: The number of samples to take, aka the number of examples in the resulting examples file.
    :param new_filename: The filename to use for the new files. When None, the original filename (with '.pl'
        removed) appended with _{drop_prob}_{prob_of_unknown}_{n}.pl is used.
    :param decisions: Whether to add decisions to the dataset.
    :param learning: Whether to also create a file with unknown utilities and a file with observations.
    :param drop_prob: The probability that a term is dropped in an example.
    :param prob_of_unknown: The probability that a utility becomes unknown.
    :param verbose: Whether to print when a stage is finished.
    :type filename: str
    :type n: int
    :type new_filename: str
    :type decisions: bool
    :type learning: bool
    :type drop_prob: float
    :type prob_of_unknown: float
    :type verbose: bool
    :return: The real model path when learning=False; otherwise the tuple
        (real model path, input model path, examples path).
    """
    def report(template, *values):
        # Progress messages are only emitted in verbose mode.
        if verbose:
            print(template % values)

    def draw_utility(t, p):
        # Both arguments are ignored: every utility is a uniform integer in [-50, 50].
        return random.randint(-50, 50)

    source_path = "./data/raw/" + filename
    if new_filename is None:
        new_filename = filename.replace('.pl', '') + f"_{drop_prob}_{prob_of_unknown}_{n}.pl"
    real_model_path = "./data/processed/" + new_filename
    stem = new_filename.replace('.pl', '')
    input_model_path = f"./data/processed/{stem}_input.pl"
    examples_path = f"./data/processed/{stem}_examples.pl"

    # Add queries to file
    db = _add_queries(source_path)
    if decisions:
        db, nb_of_decisions = utils.add_decisions_to_db(db)
        report("%s decisions added in: %s", nb_of_decisions, real_model_path)

    # Two extensions of the same base: one for the real model, one for the input model.
    db_utilities = db.extend()
    db_unknown_utilities = db.extend()

    # Create real model - add utilities
    utils.add_utilities_to_db(db_utilities, draw_utility, prob_of_utility_for_pos=0.8, prob_of_utility_for_neg=0.5)
    utils.save_db(db_utilities, real_model_path)
    report("Utilities added in: %s", real_model_path)

    if not learning:
        return real_model_path

    # Create input model - add unknown utilities
    utils.add_unknown_utilities_to_db(db_utilities, db_unknown_utilities, prob_of_unknown=prob_of_unknown)
    utils.save_db(db_unknown_utilities, input_model_path)
    report("Unknown utilities added in: %s", input_model_path)

    # Create observation file
    observations = utils.create_observations(real_model_path, n=n)
    observations = utils.drop_observations(observations, drop_prob=drop_prob)
    utils.save_examples(observations, examples_path)
    report("Examples constructed: %s", examples_path)
    return real_model_path, input_model_path, examples_path
def create_datasets(filename, n, new_filenames=None, decisions=True, learning=True, drop_probs={0.25},
                    prob_of_unknown=0.5, verbose=False):
    """
    Create one dataset per drop probability for the ProbLog model ./data/raw/filename.

    The real model (and, when learning=True, the input model) is generated once, saved under the name of the
    first drop probability and copied to the names of the remaining ones. Only the examples files differ per
    drop probability.

    For each drop probability the following file is created:
     * ./data/processed/new_filename - contains the model with each term queried and utilities attached. When
     decisions=True, decisions are also present.
    If learning=True then the following files are also created:
     * ./data/processed/new_filename-without-pl_input.pl - contains the model with each term queried and utilities
     attached. Some utilities are unknown.
     * ./data/processed/new_filename-without-pl_examples.pl - contains n partially observed examples and their
     utility.

    :param filename: The name of the ProbLog file to create a dataset for.
    :param n: The number of samples to take, aka the number of examples in each resulting examples file.
    :param new_filenames: The filenames to use for the new files (dict[drop_prob]:filename). When None, the original
        filename (with '.pl' removed) appended with _{drop_prob}_{prob_of_unknown}_{n}.pl is used.
    :param decisions: Whether to add decisions to the dataset.
    :param learning: Whether to also create a file with unknown utilities and a file with observations.
    :param drop_probs: The set of probabilities with which a term is dropped in an example. The length of this
        determines the amount of datasets that are created. Must not be empty.
    :param prob_of_unknown: The probability that a utility becomes unknown.
    :param verbose: Whether to print when a stage is finished.
    :type filename: str
    :type n: int
    :type new_filenames: dict[float, str]
    :type decisions: bool
    :type learning: bool
    :type drop_probs: set[float]
    :type prob_of_unknown: float
    :type verbose: bool
    :return: When learning=False the dict drop_prob -> real model path; otherwise the tuple of dicts
        (real model paths, input model paths, examples paths), each keyed by drop_prob.
    :raises ValueError: When drop_probs is empty.
    """
    raw_model_filename = "./data/raw/" + filename
    # Copy into a list: this fixes an order and never mutates the (shared) default set.
    drop_probs = list(drop_probs)
    if not drop_probs:
        # Fail before any file is read or written instead of with an IndexError halfway through.
        raise ValueError("drop_probs must contain at least one drop probability")
    first_drop_prob = drop_probs[0]

    if new_filenames is None:
        base_name = filename.replace('.pl', '')
        new_filenames = {drop_prob: f"{base_name}_{drop_prob}_{prob_of_unknown}_{n}.pl" for drop_prob in drop_probs}

    processed_real_model_filenames = {}
    processed_input_model_filenames = {}
    examples_filenames = {}
    for drop_prob in drop_probs:
        stem = new_filenames[drop_prob].replace('.pl', '')
        processed_real_model_filenames[drop_prob] = "./data/processed/" + new_filenames[drop_prob]
        processed_input_model_filenames[drop_prob] = f"./data/processed/{stem}_input.pl"
        examples_filenames[drop_prob] = f"./data/processed/{stem}_examples.pl"

    def next_random_utility(t, p):
        # Both arguments are ignored: every utility is a uniform integer in [-50, 50].
        return random.randint(-50, 50)

    # Add queries to file
    db = _add_queries(raw_model_filename)

    if decisions:
        db, nb_of_decisions = utils.add_decisions_to_db(db)
        if verbose:
            print("%s decisions added in: %s" % (nb_of_decisions, processed_real_model_filenames))

    db_utilities = db.extend()
    db_unknown_utilities = db.extend()

    # Create real model - add utilities. Saved once, then copied for every other drop probability.
    utils.add_utilities_to_db(db_utilities, next_random_utility, prob_of_utility_for_pos=0.8,
                              prob_of_utility_for_neg=0.5)
    utils.save_db(db_utilities, processed_real_model_filenames[first_drop_prob])
    for drop_prob in drop_probs[1:]:
        shutil.copyfile(processed_real_model_filenames[first_drop_prob], processed_real_model_filenames[drop_prob])
    if verbose:
        print("Utilities added in: %s" % processed_real_model_filenames)

    if not learning:
        return processed_real_model_filenames

    # Create input model - add unknown utilities. Saved once, then copied for every other drop probability.
    utils.add_unknown_utilities_to_db(db_utilities, db_unknown_utilities, prob_of_unknown=prob_of_unknown)
    utils.save_db(db_unknown_utilities, processed_input_model_filenames[first_drop_prob])
    for drop_prob in drop_probs[1:]:
        shutil.copyfile(processed_input_model_filenames[first_drop_prob], processed_input_model_filenames[drop_prob])
    if verbose:
        # The mapping is keyed by drop probability, so indexing it with 0 raised a KeyError; print the whole
        # mapping like the other progress messages do.
        print("Unknown utilities added in: %s" % processed_input_model_filenames)

    # Create observation files: one shared set of full examples, thinned once per drop probability.
    full_examples = utils.create_observations(processed_real_model_filenames[first_drop_prob], n=n)
    random_seed = random.randint(0, 2147000000)
    for drop_prob, examples_filename in examples_filenames.items():
        # Re-seed with the same value before every pass so each drop probability starts from the same random
        # state (presumably so the datasets differ only in their drop probability - confirm against utils).
        random.seed(a=random_seed)
        examples = utils.drop_observations(full_examples, drop_prob=drop_prob)
        utils.save_examples(examples, examples_filename)
        if verbose:
            print("Examples constructed: %s" % examples_filename)
    return processed_real_model_filenames, processed_input_model_filenames, examples_filenames
def create_dataset2(filename, n, new_filename=None, decisions=True, learning=True, drop_prob=0.25,
                    prob_of_unknown=0.5, nb_utility_nodes=5, samples_per_utility_node=5, verbose=False):
    """
    Build the processed dataset files for the ProbLog model ./data/raw/filename, introducing new utility facts.

    For each utility fact, samples_per_utility_node rules are introduced such that
    utility_node :- sampled_observation.

    Always written:
     * ./data/processed/new_filename - the model with each term queried and new utility facts introduced.
     When decisions=True, decisions are also present.
    Additionally written when learning=True:
     * ./data/processed/new_filename-without-pl_input.pl - the model with each term queried and utilities
     attached, where some utilities are unknown.
     * ./data/processed/new_filename-without-pl_examples.pl - n partially observed examples and their utility.

    :param filename: The name of the ProbLog file to create a dataset for.
    :param n: The number of samples to take, aka the number of examples in the resulting examples file.
    :param new_filename: The filename to use for the new files. When None, the original filename (with '.pl'
        removed) appended with
        _{drop_prob}_{prob_of_unknown}_{n}_{nb_utility_nodes}_{samples_per_utility_node}.pl is used.
    :param decisions: Whether to add decisions to the dataset.
    :param learning: Whether to also create a file with unknown utilities and a file with observations.
    :param drop_prob: The probability that a term is dropped in an example.
    :param prob_of_unknown: The probability that a utility becomes unknown.
    :param nb_utility_nodes: The amount of utility nodes to introduce to the model.
    :param samples_per_utility_node: The amount of rules to add for each introduced utility fact f: 'f :- sample'.
    :param verbose: Whether to print when a stage is finished.
    :type filename: str
    :type n: int
    :type new_filename: str
    :type decisions: bool
    :type learning: bool
    :type drop_prob: float
    :type prob_of_unknown: float
    :type nb_utility_nodes: int
    :type samples_per_utility_node: int
    :type verbose: bool
    :return: The real model path when learning=False; otherwise the tuple
        (real model path, input model path, examples path).
    """
    def report(template, *values):
        # Progress messages are only emitted in verbose mode.
        if verbose:
            print(template % values)

    source_path = "./data/raw/" + filename
    if new_filename is None:
        suffix = f"_{drop_prob}_{prob_of_unknown}_{n}_{nb_utility_nodes}_{samples_per_utility_node}.pl"
        new_filename = filename.replace('.pl', '') + suffix
    real_model_path = "./data/processed/" + new_filename
    stem = new_filename.replace('.pl', '')
    input_model_path = f"./data/processed/{stem}_input.pl"
    examples_path = f"./data/processed/{stem}_examples.pl"

    # Add queries to file
    db = _add_queries(source_path)

    # Create real model - add utilities (the file itself is only saved after the optional decisions step)
    db_utilities = utils.add_utility_terms_to_db(db, nb_utility_nodes=nb_utility_nodes,
                                                 samples_per_utility_node=samples_per_utility_node)
    report("Utilities added in: %s", real_model_path)

    if decisions:
        db_utilities, nb_of_decisions = utils.add_decisions_to_db(db_utilities)
        report("%s decisions added in: %s", nb_of_decisions, real_model_path)
    utils.save_db(db_utilities, real_model_path)

    if not learning:
        return real_model_path

    # Create input model - add unknown utilities (extends the queried db, not db_utilities)
    db_unknown_utilities = db.extend()
    utils.add_unknown_utilities_to_db(db_utilities, db_unknown_utilities, prob_of_unknown=prob_of_unknown)
    utils.save_db(db_unknown_utilities, input_model_path)
    report("Unknown utilities added in: %s", input_model_path)

    # Create observation file
    observations = utils.create_observations(real_model_path, n=n)
    observations = utils.drop_observations(observations, drop_prob=drop_prob)
    utils.save_examples(observations, examples_path)
    report("Examples constructed: %s", examples_path)
    return real_model_path, input_model_path, examples_path
def _add_queries(raw_model_filename):
    """
    Load the ProbLog file raw_model_filename into a database and add 'query(x).' for each term x.

    The database is prepared by a DefaultEngine created with label_all=True and keep_order=True; the queries
    are added by utils.query_all_terms.

    :param raw_model_filename: The ProbLog file to create a database for.
    :type raw_model_filename: str
    :return: The resulting database.
    :rtype: ClauseDB
    """
    model = PrologFile(raw_model_filename)
    database = DefaultEngine(label_all=True, keep_order=True).prepare(model)
    utils.query_all_terms(database)
    return database
def _argparser():
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('inputfile', help="The name of the file located in ./data/raw/inputfile.pl")
parser.add_argument('--drop', '-d', type=float, default=0.25,
help='The probability that a term is dropped frm the observations.')
parser.add_argument('--unknown_prob', '-u', type=float, default=0.5,
help='The probability with which a utility is made unknown.')
parser.add_argument('--task_type', '-t', type=str, default='',
help='d or d2 to create decision models, otherwise learning are models are created.')
parser.add_argument('--nb_of_examples', '-e', type=int, default=150,
help='The number of examples to create. Only relevant when task type is not d or d2.')
parser.add_argument('--nb_util_nodes', '-n', type=int, default=5,
help='The number of utility nodes to add. Only relevant when task type is d2.')
parser.add_argument('--nb_rules', '-r', type=int, default=5,
help='The number of utility nodes to add. Only relevant when task type is d2.')
parser.add_argument('--seed', '-s', type=int, default=5,
help='The seed to use by the random module.')
parser.add_argument('-o', '--output', type=str, default=None,
help='Write output to given file (default: write to stdout)')
parser.add_argument('-v', '--verbose', action='count', help='Increase verbosity')
return parser
if __name__ == '__main__':
    # Only run the command-line interface when executed as a script, not when imported.
    main(sys.argv)