# Description: This script runs the Markov algorithm and parameter selection (MAPS) simulation for the linear Gaussian
# (lg) simulation type. The script calculates the CFI, NFI, and NNFI for a given model using lavaan. The script also
# calculates the adjacency precision, adjacency recall, arrowhead precision, arrowhead recall, BIC, F1 adjacency, F1 all,
# F0.5, F2.0, SHD, average squared distance, average minimum squared difference, and average maximum squared difference.
# The script saves the results to a file in the specified output directory.
#
# This version of the script adds a line to the full results for the true model.
#
# The script uses the following parameters:
# - location: The output directory for the results.
# - file: The file to save the results to.
# - num_nodes: The number of nodes in the graph.
# - avg_degree: The average degree of the graph.
# - num_latents: The number of latent variables in the graph.
# - sample_size: The number of samples to generate.
# - sim_type: The simulation type (lg).
#
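# A minimal usage sketch (the argument values here are illustrative only):
#
#   find = FindGoodModel('output_dir', file=None, num_nodes=10, avg_degree=2,
#                        num_latents=0, sample_size=1000, sim_type='lg')
#   find.print_parameter_defs()
#   find.header()
#   find.save_lines('fges', [2.0, 1.0])
#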
# The script defines the following methods:
# - get_stats: Calculates the CFI, NFI, and NNFI for a given model using lavaan.
# - save_lines: Saves the result lines to the output file.
# - print_info: Prints a message to the console.
# - print_parameter_defs: Prints the parameter definitions to the console.
# - get_train: Returns the training data.
# - get_test: Returns the test data.
# - get_graph: Returns the true graph.
# - get_sem_im: Returns the SEM IM.
# - print_lines: Prints the given lines to the console.
# - my_print: Prints a string to the console and to the output file.
# - table_line: Builds a result-table line for the given algorithm and parameter.
# - header: Prints the header for the result table.
# - pchc_graph: Builds a Tetrad graph from the output of the pchc algorithm.
# - index: Extracts the zero-based variable index from a variable name.
# - accuracy: Calculates the accuracy statistics for an estimated graph.
# - markov_check: Checks the Markov condition.
# - construct_graph: Constructs a Tetrad graph from an adjacency matrix.
# - bnl_to_tetrad: Converts a bnlearn-style arc list to a Tetrad graph.
# - make_data_cont_dao: Generates continuous data using the DaO simulation package.
# - get_model: Runs the given algorithm with the given parameter and returns the estimated graph.
# - cpdag: Checks whether a graph is a legal CPDAG.
# - create_coef_diff_histograms: Saves histograms comparing true coefficients to those recovered in the CPDAG.
# - cafs: Runs the cross-algorithm frugality search (the MAPS simulation loop) over all conditions.
#
# The script uses the following R packages:
# - base: Basic R functions.
# - lavaan: Latent variable analysis.
# - performance: Model performance statistics.
# - BiDAG: Bayesian structure learning via iterative MCMC.
# - pchc: The PCHC and MMHC structure-learning algorithms.
# The script uses the following Java packages:
# - edu.cmu.tetrad.search: Search algorithms.
# - edu.cmu.tetrad.graph: Graph classes and utilities.
# - edu.cmu.tetrad.data: Data handling.
# - edu.cmu.tetrad.sem: Structural equation models.
# - edu.cmu.tetrad.util: Utility classes.
# - edu.cmu.tetrad.algcomparison.independence: Independence-test wrappers.
# - edu.cmu.tetrad.algcomparison.statistic: Comparison statistics.
# - java.util: Java utility classes.
#
# The script uses the following rpy2 functions and objects (for calling R from Python):
# - ListVector: Creates an R list vector.
# - numpy2rpy: Converts a numpy array to an R array.
# - default_converter: The default Python-to-R conversion rules.
# - get_conversion: Returns the active conversion context.
# - converter: The pandas-to-R converter.
# - importr: Imports an R package.
#
# The script uses the following Python packages and classes:
# - numpy: Numerical computing.
# - pandas: Data manipulation.
# - train_test_split: Splits the data into training and test sets (from scikit-learn).
# - DirectLiNGAM: The DirectLiNGAM algorithm (from the lingam package).
# - DagmaLinear: The linear DAGMA algorithm (from the dagma package).
#
# The script uses the following Tetrad classes:
# - Params, Parameters: Parameter settings for the algorithms.
# - ContinuousVariable: Continuous variables.
# - GraphNode: Graph nodes.
# - EdgeListGraph: Edge-list graphs.
# - GraphTransforms: Graph transforms, including DAG-to-CPDAG conversion.
# - Edges: Edge utilities.
# - SemPm: SEM parametric models.
# - SemIm: SEM instantiated models.
# - CovarianceMatrix: Covariance matrices.
# - IdaCheck: The IDA check.
# - GraphUtils: Graph utilities.
# - GraphSaveLoadUtils: Saving and loading graphs (including lavaan export).
import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
BASE_DIR = "../py-tetrad/pytetrad"
sys.path.append(BASE_DIR)
# Start the JVM and import the necessary Java packages
import jpype.imports
jpype.startJVM("-Xmx20g", classpath=[f"{BASE_DIR}/resources/tetrad-current.jar"])
import pytetrad_tools.TetradSearch as TetradSearch
import pytetrad_tools.translate as translate
import java.util as util
import edu.cmu.tetrad.search as tetrad_search
import edu.cmu.tetrad.graph as tetrad_graph
import edu.cmu.tetrad.data as tetrad_data
import edu.cmu.tetrad.sem as tetrad_sem
from edu.cmu.tetrad.util import Params, Parameters
import edu.cmu.tetrad.algcomparison.independence as independence
import edu.cmu.tetrad.algcomparison.statistic as statistic
from edu.cmu.tetrad.graph import Edges
# For linear simulations.
import dao
from lingam import DirectLiNGAM
from dagma.linear import DagmaLinear
# Import R packages
from rpy2.robjects import ListVector
from rpy2.robjects.numpy2ri import numpy2rpy
from rpy2.robjects import default_converter
from rpy2.robjects.conversion import get_conversion
from rpy2.robjects.pandas2ri import converter
from rpy2.robjects.packages import importr
base = importr("base")
lavaan = importr("lavaan")
performance = importr("performance")
import matplotlib.pyplot as plt
class FindGoodModel:
def __init__(self, output_dir, file=None, num_nodes=5, avg_degree=2, num_latents=0, sample_size=100, sim_type='lg',
histogram_dir=None):
print("FindGoodModel", "output_dir", output_dir, "num_nodes", num_nodes, "avg_degree", avg_degree, "num_latents",
num_latents, "sample_size", sample_size, "sim_type", sim_type)
# Create the output directory if it does not exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
self.histogram_dir = histogram_dir
self.num_nodes = num_nodes
self.avg_degree = avg_degree
self.num_latents = num_latents
self.sample_size = sample_size
        self.file = file
        self.sim_type = sim_type
        self.location = output_dir
        self.num_starts = 2
        self.alpha = 0.01
        self.percentResample = 0.5
        self.params = Parameters()
        self.params.set(Params.ALPHA, self.alpha)
        self.params.set(Params.NUM_STARTS, self.num_starts)
        self.frac_dep_under_null = 0
        self.structure_prior = 0
        self.base = importr('base')
        self.bidag = importr('BiDAG')
        self.pchc = importr('pchc')
data, nodes, graph, num_nodes, avg_deg, sem_im, B = self.make_data_cont_dao(num_nodes, avg_degree, sample_size)
self.train, self.test = train_test_split(data, test_size=.5) # , random_state=42)
self.train_java = translate.pandas_data_to_tetrad(self.train)
self.train_numpy = self.train.to_numpy()
self.nodes = self.train_java.getVariables()
self.graph = graph
self.sem_im = sem_im
self.dagma_l1 = 0.03
self.dagma_w = 0.1
self.dagma_T = 5
self.mmhc_max_k = 10
self.mmhc_starts = 10
self.pchc_starts = 10
self.B = B
    # Calculates the CFI, NFI, and NNFI for a given model using lavaan.
def get_stats(self, df, graph):
dag = tetrad_graph.GraphTransforms.dagFromCpdag(graph)
model = str(tetrad_graph.GraphSaveLoadUtils.graphToLavaan(dag))
with (default_converter + converter).context():
r_df = get_conversion().py2rpy(df)
fit = lavaan.lavaan(model, data=r_df)
perf = performance.model_performance(fit)
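            # perf is a one-row R data frame of fit indices; map each column name to its
            # scalar value (rx uses R's 1-based indexing).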
return {col: perf.rx(i + 1)[0][0] for i, col in enumerate(perf.colnames)}
def save_lines(self, alg, params):
for param in params:
            graph, p_ad, fd_indep, edges, line, _, data_java = self.table_line(alg, param)
self.my_print(line)
def print_info(self, msg):
self.my_print()
self.my_print(msg)
self.my_print()
def print_parameter_defs(self):
self.my_print('THE FOLLOWING CAN BE GIVEN WITHOUT KNOWING THE GROUND TRUTH:')
self.my_print()
self.my_print('alg = the chosen algorithm')
self.my_print("param = the parameter that's being varied (only one for this script)")
self.my_print('nodes = # of measured nodes in the true graph')
self.my_print(
'cpdag = 1 if the result is a CPDAG, 0 if not')
self.my_print('|G| = # edges in the estimated graph')
self.my_print('num_params = the number of parameters in the model')
self.my_print(
'numind = the number of valid independence tests that were performed for independencies implied by the estimated graph')
self.my_print('p_ad = p-value of the Anderson Darling test of Uniformity')
self.my_print(f'|alpha| = distance of the p-value of the independence test from alpha = {self.alpha}')
self.my_print('bic = the standard BIC score of the estimated graph')
self.my_print('edges = # edges in the estimated graph')
self.my_print(f'sample size = {self.sample_size}')
self.my_print()
self.my_print('THE FOLLOWING REQUIRE KNOWING THE GROUND TRUTH:')
self.my_print()
self.my_print('|G*| = # edges in the true graph')
self.my_print('ap = adjacency precision')
self.my_print('ar = adjacency recall')
self.my_print('ahp = arrowhead precision')
self.my_print('ahr = arrowhead recall')
self.my_print('f1 = adjacency F1 score')
self.my_print('f0.5 = adjacency F0.5 score')
self.my_print('f2 = adjacency F2 score')
self.my_print()
def get_train(self):
return self.train
def get_test(self):
return self.test
def get_graph(self):
return self.graph
def get_sem_im(self):
return self.sem_im
def print_lines(self, lines):
self.header()
for _line in lines:
self.my_print(_line)
    def my_print(self, s=''):
        print(s, file=self.file, flush=True)
        print(s, flush=True)
def table_line(self, alg, param):
graph = self.get_model(alg, param)
self.create_coef_diff_histograms(graph, alg, param, self.histogram_dir)
dag = tetrad_graph.GraphTransforms.dagFromCpdag(graph)
data_java = translate.pandas_data_to_tetrad(self.test)
ap, ar, ahp, ahr, bic, f1_adj, f1_all, f_beta_point5_adj, f_beta_2_adj, shd, avgsd, avgminsd, avgmaxsd, num_params \
= self.accuracy(self.graph, graph, data_java)
        test_java = data_java
cpdag, a2Star, p_ad, p_ks, kl_div, frac_dep_null, num_test_indep, num_test_dep \
= self.markov_check(graph, alg, test_java, self.params)
stats = self.get_stats(self.test, dag)
cfi = stats["CFI"]
nfi = stats["NFI"]
nnfi = stats["NNFI"]
chisq = stats["Chi2"]
dof = stats["Chi2_df"]
likelihood = stats["Loglikelihood"]
p_value = stats["p_Chi2"]
edges = graph.getNumEdges()
dist_alpha = abs(frac_dep_null - self.alpha)
line = (f"{alg:14} {param:8.3f} {self.graph.getNumNodes():5} {edges:3} {num_params:7.0f}"
f" {cpdag:6} {num_test_indep:9} "
f" {a2Star:8.4f} {p_ad:8.4f} {p_ks:8.4f} {kl_div:8.4f} {likelihood:8.4f}"
f" {dist_alpha:8.4f} {bic:12.4f} {cfi:6.4f} {nfi:6.4f} {nnfi:6.4f} {chisq:6.4f} {dof:6.4f} {p_value:6.4f} "
f" {self.graph.getNumEdges():5} {ap:5.4f} {ar:5.4f} {ahp:5.4f} {ahr:5.4f} {f1_adj:6.4f} {f1_all:6.4f} "
f" {f_beta_point5_adj:5.4f} {f_beta_2_adj:5.4f} {shd:6}")
return graph, p_ad, frac_dep_null, edges, line, graph, data_java
    def header(self):
        s = (
            f"alg param nodes |G| num_params cpdag numind a2* p_ad p_ks kldiv loglik |alpha|"
            f" bic cfi nfi nnfi chisq dof pvalue"
            f" |G*| ap ar ahp ahr"
            f" f1 f1_all f0.5 f2.0 shd")
        self.my_print(s)
        self.my_print('-' * len(s))
    # paramValue is the single parameter value being used for this run. For score-based
    # algorithms it is the penalty discount; for constraint-based ones it is alpha.
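    # For example, get_model('fges', 2.0) runs FGES with penalty discount 2.0, while
    # get_model('pc', 0.05) runs PC with alpha = 0.05.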
# def get_model(self, alg, paramValue):
# return tetrad_graph.EdgeListGraph()
# Could also use pchc::bnmat(a$dag)
def pchc_graph(self, a, nodes):
dag = a.rx2('dag')
graph = tetrad_graph.EdgeListGraph(nodes)
try:
arcs = dag.rx2('arcs')
half = int(len(arcs) / 2)
for i in range(0, half):
x = arcs[i]
y = arcs[i + half]
graph.addDirectedEdge(nodes.get(self.index(x)), nodes.get(self.index(y)))
except Exception:
print('Arcs not available.')
cpdag = tetrad_graph.GraphTransforms.dagToCpdag(graph)
return cpdag
def index(self, variable_name):
import re
# Extracting digits from the string
digits = re.findall(r'\d+', variable_name)
# Convert the first group of digits to integer
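        # e.g. index('X12') returns 11 (zero-based); names with no digits return None.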
return int(digits[0]) - 1 if digits else None
def accuracy(self, true_graph, est_graph, data):
est_graph = tetrad_graph.GraphUtils.replaceNodes(est_graph, true_graph.getNodes())
true_comparison_graph = tetrad_graph.GraphTransforms.dagToCpdag(true_graph)
est_comparison_graph = tetrad_graph.GraphTransforms.dagToCpdag(est_graph)
ap = statistic.AdjacencyPrecision().getValue(true_comparison_graph, est_comparison_graph, data)
ar = statistic.AdjacencyRecall().getValue(true_comparison_graph, est_comparison_graph, data)
ahp = statistic.ArrowheadPrecision().getValue(true_comparison_graph, est_comparison_graph, data)
ahr = statistic.ArrowheadRecall().getValue(true_comparison_graph, est_comparison_graph, data)
bic = statistic.BicEst().getValue(true_comparison_graph, est_comparison_graph, data)
f1_adj = statistic.F1Adj().getValue(true_comparison_graph, est_comparison_graph, data)
f1_all = statistic.F1All().getValue(true_comparison_graph, est_comparison_graph, data)
shd = statistic.StructuralHammingDistance().getValue(true_comparison_graph, est_comparison_graph, data)
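        # F_beta weights recall beta times as much as precision, so beta = 0.5 favors
        # precision and beta = 2 favors recall.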
fb1 = statistic.FBetaAdj()
fb1.setBeta(0.5)
f_beta_point5_adj = fb1.getValue(true_comparison_graph, est_comparison_graph, data)
fb2 = statistic.FBetaAdj()
fb2.setBeta(2)
f_beta_2_adj = fb2.getValue(true_comparison_graph, est_comparison_graph, data)
avgsd = np.nan
avgminsd = np.nan
avgmaxsd = np.nan
import traceback
        if self.sem_im is not None:
try:
ida_check = tetrad_search.IdaCheck(est_comparison_graph, data, self.sem_im)
avgsd = ida_check.getAverageSquaredDistance(ida_check.getOrderedPairs())
avgminsd = ida_check.getAvgMinSquaredDiffEstTrue(ida_check.getOrderedPairs())
avgmaxsd = ida_check.getAvgMaxSquaredDiffEstTrue(ida_check.getOrderedPairs())
except Exception as e:
print("An error occurred:", str(e))
print(traceback.format_exc())
if self.sim_type == 'anclg':
num_params = est_graph.getNumEdges()
else:
num_params = statistic.NumParametersEst().getValue(true_comparison_graph, est_comparison_graph, data)
return ap, ar, ahp, ahr, bic, f1_adj, f1_all, f_beta_point5_adj, f_beta_2_adj, shd, avgsd, avgminsd, avgmaxsd, num_params
def markov_check(self, graph, alg, data, params):
cpdag = self.cpdag(graph)
test = independence.FisherZ().getTest(data, params)
mc = tetrad_search.MarkovCheck(graph, test, tetrad_search.ConditioningSetType.ORDERED_LOCAL_MARKOV)
mc.setPercentResample(0.5)
# We generate results until we have a minimum of p-values for the uniformity test. For
# this, the percent sample needs to be 0.5, so that new samples are generated each time.
#
    # Note that an exception is thrown if any method returns a graph that is not a legal CPDAG, for which no
    # valid order exists. This is because we're using the ordered local Markov condition. Skip these cases.
try:
mc.generateResults(False)
print("# samples now = " + str(mc.getResults(True).size()))
while mc.getResults(True).size() > 0 and mc.getResults(True).size() < 200:
try:
mc.generateResults(False)
print("# samples now = " + str(mc.getResults(True).size()))
except Exception as e:
break
except Exception as e:
print(f"An error occurred for algorithm {alg}:", str(e))
a2Star = mc.getAndersonDarlingA2Star(True)
p_ad = mc.getAndersonDarlingP(True)
p_ks = mc.getKsPValue(True)
fd_indep = mc.getFractionDependent(True)
num_tests_indep = mc.getNumTests(True)
num_test_dep = mc.getNumTests(False)
results = mc.getResults(True)
p_values = mc.getPValues(results)
# Calculate KL-divergence
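        # For discrete distributions, KL(p || q) = sum_i p_i * log(p_i / q_i). Here q is
        # uniform over the bins, and the per-bin terms are averaged rather than summed,
        # so the reported value is KL divided by the number of bins.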
bins = 20
dist = np.histogram(p_values, bins)[0] / len(p_values)
        # Different from uniform?
unif = np.array([1 / bins for _ in range(bins)])
kldiv = np.mean(dist * np.log(np.clip(dist, 1e-6, 1) / unif)) # dist could be 0 :-(
return cpdag, a2Star, p_ad, p_ks, kldiv, fd_indep, num_tests_indep, num_test_dep
def construct_graph(self, g, nodes, cpdag=True):
graph = tetrad_graph.EdgeListGraph(nodes)
for i, a in enumerate(nodes):
for j, b in enumerate(nodes):
if g[i, j]: graph.addDirectedEdge(b, a)
if cpdag: graph = tetrad_graph.GraphTransforms.dagToCpdag(graph)
return graph
def bnl_to_tetrad(self, bnl, cpdag=True):
idx = {f"X{i + 1}": i for i in range(len(self.nodes))}
num_edges = len(bnl) // 2
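        # Assumed layout of the arc vector: all tails first, then all heads, so
        # bnl = ['X1', 'X2', 'X3', 'X4'] encodes the arcs X1 -> X3 and X2 -> X4.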
edges = [(bnl[i], bnl[i + num_edges]) for i in range(num_edges)]
graph = tetrad_graph.EdgeListGraph(self.nodes)
for edge in edges:
graph.addDirectedEdge(self.nodes[idx[edge[0]]], self.nodes[idx[edge[1]]])
        if cpdag: graph = tetrad_graph.GraphTransforms.dagToCpdag(graph)
return graph
def make_data_cont_dao(self, num_nodes, avg_deg, sample_size):
"""
Picks a random graph and generates data from it, using the DaO simulation package
(Andrews, B., & Kummerfeld, E. (2024). Better Simulations for Validating Causal Discovery
with the DAG-Adaptation of the Onion Method. arXiv preprint arXiv:2405.13100.)
:param num_nodes: The number of nodes in the graph.
:param avg_deg: The average degree of the graph.
:param num_latents: The number of latent variables in the graph.
:param sample_size: The number of samples to generate.
:return: The data, nodes, graph, number of nodes, and average degree.
"""
p = num_nodes # number of variables
ad = avg_deg # average degree
n = sample_size # number of samples
g = dao.er_dag(p, ad=ad)
g = dao.sf_out(g)
g = dao.randomize_graph(g)
R, B, O = dao.corr(g)
        if self.sim_type == 'exp':
X = dao.simulate(B, O, n, err=lambda *x: np.random.exponential(x[0], x[1]))
else:
X = dao.simulate(B, O, n)
X = dao.standardize(X)
num_columns = X.shape[1] # Number of columns in the array
column_names = [f'X{i + 1}' for i in range(num_columns)]
df = pd.DataFrame(X, columns=column_names)
cols = df.columns
nodes = util.ArrayList()
for col in cols:
nodes.add(tetrad_data.ContinuousVariable(str(col)))
graph = self.construct_graph(g, nodes)
dag = self.construct_graph(g, nodes, cpdag=False)
        # Construct the SEM IM given the DAG and the covariance matrix.
cov = tetrad_data.CovarianceMatrix(nodes, R, n)
sem_pm = tetrad_sem.SemPm(dag)
sem_im = tetrad_sem.SemIm(sem_pm, cov)
return df, nodes, graph, num_nodes, avg_deg, sem_im, B
def get_model(self, alg, paramValue):
_search = TetradSearch.TetradSearch(self.train)
_search.set_verbose(False)
_search.use_sem_bic(penalty_discount=paramValue)
nodes = util.ArrayList()
for col in self.train.columns:
nodes.add(tetrad_graph.GraphNode(col))
if alg == 'true':
return self.graph
if alg == 'fges':
_search.use_sem_bic(penalty_discount=paramValue)
_search.run_fges(faithfulness_assumed=False)
elif alg == 'boss':
_search.use_sem_bic(penalty_discount=paramValue)
_search.run_boss()
elif alg == 'grasp':
_search.use_sem_bic(penalty_discount=paramValue)
_search.use_fisher_z(0.05)
_search.run_grasp()
elif alg == 'sp':
_search.use_sem_bic(penalty_discount=paramValue)
_search.run_sp()
elif alg == 'pc':
_search.use_fisher_z(paramValue)
_search.run_pc()
elif alg == 'cpc':
_search.use_fisher_z(paramValue)
_search.run_cpc()
elif alg == 'lingam':
dlingam = DirectLiNGAM()
dlingam.fit(self.train)
W = dlingam.adjacency_matrix_
return self.construct_graph(W, nodes, True)
elif alg == 'bidag':
bge = self.bidag.scoreparameters("bge", numpy2rpy(self.train_numpy), bgepar=ListVector({"am": 1.0}))
itmcmc = self.bidag.iterativeMCMC(scorepar=bge, softlimit=9, hardlimit=12, alpha=self.alpha,
verbose=False)
cpdag = self.construct_graph(np.array(self.base.as_matrix(itmcmc[1]), dtype=int).T, nodes, True)
return cpdag
elif alg == 'pchc':
print("pchc")
bnl = self.pchc.pchc(numpy2rpy(self.train.values), alpha=self.alpha, restart=self.pchc_starts)
return self.bnl_to_tetrad(bnl[1][2], cpdag=True)
elif alg == 'mmhc':
bnl = self.pchc.mmhc(numpy2rpy(self.train.values), max_k=self.mmhc_max_k, alpha=self.alpha, restart=self.mmhc_starts)
return self.bnl_to_tetrad(bnl[1][2], cpdag=True)
elif alg == 'dagma':
            model = DagmaLinear(loss_type='l2')  # create a linear model with least-squares loss
            W = model.fit(self.train.to_numpy(), lambda1=paramValue)  # fit with L1 regularization (coefficient paramValue)
return self.construct_graph(W.T, nodes, True)
else:
raise Exception('Unrecognized alg name: ' + alg)
return _search.get_java()
def cpdag(self, graph):
return graph.paths().isLegalCpdag()
# CAFS = Cross-Algorithm Frugality Search
def cafs(self):
dir = f'markov_check_{self.sim_type}'
penalties = [10.0, 5.0, 4.0, 3, 2.5, 2, 1.75, 1.5, 1.25, 1]
alphas = [0.001, 0.01, 0.05, 0.1, 0.2]
for num_nodes in range(5, 30 + 1, 5): # 5, 10, 15, 20, 25, 30
            for avg_degree in range(1, 6 + 1):  # 1, 2, 3, 4, 5, 6
if avg_degree > num_nodes - 1:
continue
# Create the output directory if it does not exist
if not os.path.exists(f'{self.location}/{dir}'):
os.makedirs(f'{self.location}/{dir}')
result_file = f'{self.location}/{dir}/result_{num_nodes}_{avg_degree}.txt'
if os.path.exists(result_file):
print("result file exists: " + result_file)
continue
with (open(result_file, 'w') as file,
open(f'{self.location}/{dir}/graph_{num_nodes}_{avg_degree}.txt', 'w') as graph_file,
open(f'{self.location}/{dir}/train_{num_nodes}_{avg_degree}.txt', 'w') as train_file,
open(f'{self.location}/{dir}/test_{num_nodes}_{avg_degree}.txt', 'w') as test_file):
find = FindGoodModel(self.location, file, num_nodes, avg_degree, 0, 1000,
self.sim_type, histogram_dir=self.histogram_dir)
# print parameter defs and header
find.print_parameter_defs()
find.header()
# go through algorithms and parameter choices and save the best lines (print all lines)
find.save_lines('true', [0])
find.save_lines('dagma', [0.1, 0.2, 0.3])
find.save_lines('pc', alphas)
find.save_lines('cpc', alphas)
find.save_lines('fges', penalties)
find.save_lines('grasp', penalties)
find.save_lines('boss', penalties)
find.save_lines('bidag', [0])
find.save_lines('mmhc', [0])
find.save_lines('pchc', [0])
train = translate.pandas_data_to_tetrad(find.get_train())
test = translate.pandas_data_to_tetrad(find.get_test())
graph = find.get_graph()
# get_stats(train, graph)
print(graph, file=graph_file)
print(train, file=train_file)
print(test, file=test_file)
                    # The with-statement closes all four files automatically.
def create_coef_diff_histograms(self, cpdag, alg, param, histogram_dir):
        if histogram_dir is None:
return
        if self.num_nodes != 25 or self.avg_degree != 5:  # only plot the 25-node, average-degree-5 condition
return
if not os.path.exists(histogram_dir):
os.makedirs(histogram_dir)
dag = self.sem_im.getSemPm().getGraph()
# Get the B coefficients from the sem_im
B = np.array(self.sem_im.getEdgeCoef().toArray())
# Now, get all the coefficients for all of the edges in B--i.e. all of the entries in B that are not zero
non_zero_B = B[B != 0]
# Now find the list of directed edges in dag that are also in cpdag and add these to a list.
edges = dag.getEdges()
directed_edges = []
for e in edges:
if Edges.isDirectedEdge(e):
directed_edges.append(e)
# Now, get the coefficients for all the directed edges in the list of directed edges
in_cpdag = []
not_in_cpdag_but_adjacent = []
not_in_cpdag = []
nodes = cpdag.getNodes()
for e in directed_edges:
n1 = cpdag.getNode(Edges.getDirectedEdgeTail(e).getName())
n2 = cpdag.getNode(Edges.getDirectedEdgeHead(e).getName())
tail = nodes.indexOf(Edges.getDirectedEdgeTail(e))
head = nodes.indexOf(Edges.getDirectedEdgeHead(e))
# Check that e is also in cpdag
if cpdag.containsEdge(Edges.directedEdge(n1, n2)):
in_cpdag.append(B[tail, head])
if not cpdag.containsEdge(Edges.directedEdge(n1, n2)):
not_in_cpdag.append(B[tail, head])
if not cpdag.containsEdge(Edges.directedEdge(n1, n2)) and cpdag.isAdjacentTo(n1, n2):
not_in_cpdag_but_adjacent.append(B[tail, head])
in_cpdag = np.array(in_cpdag)
not_in_cpdag = np.array(not_in_cpdag)
not_in_cpdag_but_adjacent = np.array(not_in_cpdag_but_adjacent)
abs_nonzero_B = np.abs(non_zero_B)
abs_in_cpdag = np.abs(in_cpdag)
abs_not_in_cpdag = np.abs(not_in_cpdag)
abs_not_in_cpdag_and_adjacent = np.abs(not_in_cpdag_but_adjacent)
# Create two subplots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9, 4)) # Adjusted the figsize for better spacing with two plots
# Plot the first histogram with percentages
ax1.hist(abs_nonzero_B, bins=10, edgecolor='black', weights=np.ones(len(abs_nonzero_B)) / len(abs_nonzero_B) * 100)
ax1.set_title("True Coefficients")
ax1.set_xlabel("Coefficient Value")
ax1.set_ylabel("Percentage")
ax1.set_xlim(0, 1)
ax1.set_ylim(0, 30)
# Plot the second histogram with percentages
ax2.hist(abs_not_in_cpdag_and_adjacent, bins=10, edgecolor='black', weights=np.ones(len(abs_not_in_cpdag_and_adjacent)) / len(abs_not_in_cpdag_and_adjacent) * 100)
ax2.set_title("In DAG, not in CPDAG but Adjacent")
ax2.set_xlabel("Coefficient Value")
ax2.set_ylabel("Percentage")
ax2.set_xlim(0, 1)
ax2.set_ylim(0, 30)
# Adjust layout to make room for the suptitle
plt.tight_layout(rect=(0.0, 0.0, 1.0, 0.95))
# Add a general caption above all subplots
# fig.suptitle(f"Comparison of Coefficient Distributions for {alg}_{self.num_nodes}_{self.avg_degree}", fontsize=16)
# Save the plot to a file
histogram_file = f"histograms_{self.num_nodes}_{self.avg_degree}_{alg}_{param}.png"
plt.savefig(f"{histogram_dir}/{histogram_file}")
# # Show the plot
# plt.show()
output_dir = 'alg_output_with_true'
histogram_dir = "plots/histograms/coef_histograms"
FindGoodModel(output_dir, sim_type='lg', histogram_dir=histogram_dir).cafs()