# Description: This script runs the Markov algorithm and parameter selection (MAPS) simulation for the linear Gaussian
# (lg) simulation type. The script calculates the CFI, NFI, and NNFI for a given model using lavaan. The script also
# calculates the adjacency precision, adjacency recall, arrowhead precision, arrowhead recall, BIC, F1 adjacency, F1 all,
# F0.5, F2.0, SHD, average squared distance, average minimum squared difference, and average maximum squared difference.
# The script saves the results to a file in the specified output directory.
#
# This version of the script adds a line to the full results for the true model.
#
# The script uses the following parameters:
# - location: The output directory for the results.
# - file: The file to save the results to.
# - num_nodes: The number of nodes in the graph.
# - avg_degree: The average degree of the graph.
# - num_latents: The number of latent variables in the graph.
# - sample_size: The number of samples to generate.
# - sim_type: The simulation type (lg).
#
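# A minimal usage sketch (the argument values here are illustrative only):
#
#   find = FindGoodModel('output_dir', file=None, num_nodes=10, avg_degree=2,
#                        num_latents=0, sample_size=1000, sim_type='lg')
#   find.print_parameter_defs()
#   find.header()
#   find.save_lines('fges', [2.0, 1.0])
#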
# The script defines the following methods:
# - get_stats: Calculates the CFI, NFI, and NNFI for a given model using lavaan.
# - save_lines: Saves the result lines to the output file.
# - print_info: Prints a message to the console.
# - print_parameter_defs: Prints the parameter definitions to the console.
# - get_train: Returns the training data.
# - get_test: Returns the test data.
# - get_graph: Returns the true graph.
# - get_sem_im: Returns the SEM IM.
# - print_lines: Prints the given lines to the console.
# - my_print: Prints a string to the console and to the output file.
# - table_line: Builds a result-table line for the given algorithm and parameter.
# - header: Prints the header for the result table.
# - pchc_graph: Builds a Tetrad graph from the output of the pchc algorithm.
# - index: Extracts the zero-based variable index from a variable name.
# - accuracy: Calculates the accuracy statistics for an estimated graph.
# - markov_check: Checks the Markov condition.
# - construct_graph: Constructs a Tetrad graph from an adjacency matrix.
# - bnl_to_tetrad: Converts a bnlearn-style arc list to a Tetrad graph.
# - make_data_cont_dao: Generates continuous data using the DaO simulation package.
# - get_model: Runs the given algorithm with the given parameter and returns the estimated graph.
# - cpdag: Checks whether a graph is a legal CPDAG.
# - create_coef_diff_histograms: Saves histograms comparing true coefficients to those recovered in the CPDAG.
# - cafs: Runs the cross-algorithm frugality search (the MAPS simulation loop) over all conditions.
#
# The script uses the following R packages:
# - base: Basic R functions.
# - lavaan: Latent variable analysis.
# - performance: Model performance statistics.
# - BiDAG: Bayesian structure learning via iterative MCMC.
# - pchc: The PCHC and MMHC structure-learning algorithms.
# The script uses the following Java packages:
# - edu.cmu.tetrad.search: Search algorithms.
# - edu.cmu.tetrad.graph: Graph classes and utilities.
# - edu.cmu.tetrad.data: Data handling.
# - edu.cmu.tetrad.sem: Structural equation models.
# - edu.cmu.tetrad.util: Utility classes.
# - edu.cmu.tetrad.algcomparison.independence: Independence-test wrappers.
# - edu.cmu.tetrad.algcomparison.statistic: Comparison statistics.
# - java.util: Java utility classes.
#
# The script uses the following rpy2 functions and objects (for calling R from Python):
# - ListVector: Creates an R list vector.
# - numpy2rpy: Converts a numpy array to an R array.
# - default_converter: The default Python-to-R conversion rules.
# - get_conversion: Returns the active conversion context.
# - converter: The pandas-to-R converter.
# - importr: Imports an R package.
#
# The script uses the following Python packages and classes:
# - numpy: Numerical computing.
# - pandas: Data manipulation.
# - train_test_split: Splits the data into training and test sets (from scikit-learn).
# - DirectLiNGAM: The DirectLiNGAM algorithm (from the lingam package).
# - DagmaLinear: The linear DAGMA algorithm (from the dagma package).
#
# The script uses the following Tetrad classes:
# - Params, Parameters: Parameter settings for the algorithms.
# - ContinuousVariable: Continuous variables.
# - GraphNode: Graph nodes.
# - EdgeListGraph: Edge-list graphs.
# - GraphTransforms: Graph transforms, including DAG-to-CPDAG conversion.
# - Edges: Edge utilities.
# - SemPm: SEM parametric models.
# - SemIm: SEM instantiated models.
# - CovarianceMatrix: Covariance matrices.
# - IdaCheck: The IDA check.
# - GraphUtils: Graph utilities.
# - GraphSaveLoadUtils: Saving and loading graphs (including lavaan export).
import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
BASE_DIR = "../py-tetrad/pytetrad"
sys.path.append(BASE_DIR)
# Start the JVM and import the necessary Java packages
import jpype.imports
jpype.startJVM("-Xmx20g", classpath=[f"{BASE_DIR}/resources/tetrad-current.jar"])
import pytetrad_tools.TetradSearch as TetradSearch
import pytetrad_tools.translate as translate
import java.util as util
import edu.cmu.tetrad.search as tetrad_search
import edu.cmu.tetrad.graph as tetrad_graph
import edu.cmu.tetrad.data as tetrad_data
import edu.cmu.tetrad.sem as tetrad_sem
from edu.cmu.tetrad.util import Params, Parameters
import edu.cmu.tetrad.algcomparison.independence as independence
import edu.cmu.tetrad.algcomparison.statistic as statistic
from edu.cmu.tetrad.graph import Edges
# For linear simulations.
import dao
from lingam import DirectLiNGAM
from dagma.linear import DagmaLinear
# Import R packages
from rpy2.robjects import ListVector
from rpy2.robjects.numpy2ri import numpy2rpy
from rpy2.robjects import default_converter
from rpy2.robjects.conversion import get_conversion
from rpy2.robjects.pandas2ri import converter
from rpy2.robjects.packages import importr
base = importr("base")
lavaan = importr("lavaan")
performance = importr("performance")
import matplotlib.pyplot as plt
class FindGoodModel:
def __init__(self, output_dir, file=None, num_nodes=5, avg_degree=2, num_latents=0, sample_size=100, sim_type='lg',
histogram_dir=None):
print("FindGoodModel", "output_dir", output_dir, "num_nodes", num_nodes, "avg_degree", avg_degree, "num_latents",
num_latents, "sample_size", sample_size, "sim_type", sim_type)
# Create the output directory if it does not exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
self.histogram_dir = histogram_dir
self.num_nodes = num_nodes
self.avg_degree = avg_degree
self.num_latents = num_latents
self.sample_size = sample_size
        self.file = file
        self.sim_type = sim_type
        self.location = output_dir
        self.num_starts = 2
        self.alpha = 0.01
        self.percentResample = 0.5
        self.params = Parameters()
        self.params.set(Params.ALPHA, self.alpha)
        self.params.set(Params.NUM_STARTS, self.num_starts)
        self.frac_dep_under_null = 0
        self.structure_prior = 0
        self.base = importr('base')
        self.bidag = importr('BiDAG')
        self.pchc = importr('pchc')
data, nodes, graph, num_nodes, avg_deg, sem_im, B = self.make_data_cont_dao(num_nodes, avg_degree, sample_size)
self.train, self.test = train_test_split(data, test_size=.5) # , random_state=42)
self.train_java = translate.pandas_data_to_tetrad(self.train)
self.train_numpy = self.train.to_numpy()
self.nodes = self.train_java.getVariables()
self.graph = graph
self.sem_im = sem_im
self.dagma_l1 = 0.03
self.dagma_w = 0.1
self.dagma_T = 5
self.mmhc_max_k = 10
self.mmhc_starts = 10
self.pchc_starts = 10
self.B = B
    # Calculates the CFI, NFI, and NNFI for a given model using lavaan.
def get_stats(self, df, graph):
dag = tetrad_graph.GraphTransforms.dagFromCpdag(graph)
model = str(tetrad_graph.GraphSaveLoadUtils.graphToLavaan(dag))
with (default_converter + converter).context():
r_df = get_conversion().py2rpy(df)
fit = lavaan.lavaan(model, data=r_df)
perf = performance.model_performance(fit)
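            # perf is a one-row R data frame of fit indices; map each column name to its
            # scalar value (rx uses R's 1-based indexing).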
return {col: perf.rx(i + 1)[0][0] for i, col in enumerate(perf.colnames)}
def save_lines(self, alg, params):
for param in params:
            graph, p_ad, fd_indep, edges, line, _, data_java = self.table_line(alg, param)
self.my_print(line)
def print_info(self, msg):
self.my_print()
self.my_print(msg)
self.my_print()
def print_parameter_defs(self):
self.my_print('THE FOLLOWING CAN BE GIVEN WITHOUT KNOWING THE GROUND TRUTH:')
self.my_print()
self.my_print('alg = the chosen algorithm')
self.my_print("param = the parameter that's being varied (only one for this script)")
self.my_print('nodes = # of measured nodes in the true graph')
self.my_print(
'cpdag = 1 if the result is a CPDAG, 0 if not')
self.my_print('|G| = # edges in the estimated graph')
self.my_print('num_params = the number of parameters in the model')
self.my_print(
'numind = the number of valid independence tests that were performed for independencies implied by the estimated graph')
self.my_print('p_ad = p-value of the Anderson Darling test of Uniformity')
self.my_print(f'|alpha| = distance of the p-value of the independence test from alpha = {self.alpha}')
self.my_print('bic = the standard BIC score of the estimated graph')
self.my_print('edges = # edges in the estimated graph')
self.my_print(f'sample size = {self.sample_size}')
self.my_print()
self.my_print('THE FOLLOWING REQUIRE KNOWING THE GROUND TRUTH:')
self.my_print()
self.my_print('|G*| = # edges in the true graph')
self.my_print('ap = adjacency precision')
self.my_print('ar = adjacency recall')
self.my_print('ahp = arrowhead precision')
self.my_print('ahr = arrowhead recall')
self.my_print('f1 = adjacency F1 score')
self.my_print('f0.5 = adjacency F0.5 score')
self.my_print('f2 = adjacency F2 score')
self.my_print()
def get_train(self):
return self.train
def get_test(self):
return self.test
def get_graph(self):
return self.graph
def get_sem_im(self):
return self.sem_im
def print_lines(self, lines):
self.header()
for _line in lines:
self.my_print(_line)
    def my_print(self, s=''):
        print(s, file=self.file, flush=True)
        print(s, flush=True)
def table_line(self, alg, param):
graph = self.get_model(alg, param)
self.create_coef_diff_histograms(graph, alg, param, self.histogram_dir)
dag = tetrad_graph.GraphTransforms.dagFromCpdag(graph)
data_java = translate.pandas_data_to_tetrad(self.test)
ap, ar, ahp, ahr, bic, f1_adj, f1_all, f_beta_point5_adj, f_beta_2_adj, shd, avgsd, avgminsd, avgmaxsd, num_params \
= self.accuracy(self.graph, graph, data_java)
        test_java = data_java
cpdag, a2Star, p_ad, p_ks, kl_div, frac_dep_null, num_test_indep, num_test_dep \
= self.markov_check(graph, alg, test_java, self.params)
stats = self.get_stats(self.test, dag)
cfi = stats["CFI"]
nfi = stats["NFI"]
nnfi = stats["NNFI"]
chisq = stats["Chi2"]
dof = stats["Chi2_df"]
likelihood = stats["Loglikelihood"]
p_value = stats["p_Chi2"]
edges = graph.getNumEdges()
dist_alpha = abs(frac_dep_null - self.alpha)
line = (f"{alg:14} {param:8.3f} {self.graph.getNumNodes():5} {edges:3} {num_params:7.0f}"
f" {cpdag:6} {num_test_indep:9} "
f" {a2Star:8.4f} {p_ad:8.4f} {p_ks:8.4f} {kl_div:8.4f} {likelihood:8.4f}"
f" {dist_alpha:8.4f} {bic:12.4f} {cfi:6.4f} {nfi:6.4f} {nnfi:6.4f} {chisq:6.4f} {dof:6.4f} {p_value:6.4f} "
f" {self.graph.getNumEdges():5} {ap:5.4f} {ar:5.4f} {ahp:5.4f} {ahr:5.4f} {f1_adj:6.4f} {f1_all:6.4f} "
f" {f_beta_point5_adj:5.4f} {f_beta_2_adj:5.4f} {shd:6}")
return graph, p_ad, frac_dep_null, edges, line, graph, data_java
    def header(self):
        s = (
            f"alg param nodes |G| num_params cpdag numind a2* p_ad p_ks kldiv loglik |alpha|"
            f" bic cfi nfi nnfi chisq dof pvalue"
            f" |G*| ap ar ahp ahr"
            f" f1 f1_all f0.5 f2.0 shd")
        self.my_print(s)
        self.my_print('-' * len(s))
    # paramValue is the single parameter value being used for this run. For score-based
    # algorithms it is the penalty discount; for constraint-based ones it is alpha.
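    # For example, get_model('fges', 2.0) runs FGES with penalty discount 2.0, while
    # get_model('pc', 0.05) runs PC with alpha = 0.05.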
# def get_model(self, alg, paramValue):
# return tetrad_graph.EdgeListGraph()
# Could also use pchc::bnmat(a$dag)
def pchc_graph(self, a, nodes):
dag = a.rx2('dag')
graph = tetrad_graph.EdgeListGraph(nodes)
try:
arcs = dag.rx2('arcs')
half = int(len(arcs) / 2)
for i in range(0, half):
x = arcs[i]
y = arcs[i + half]
graph.addDirectedEdge(nodes.get(self.index(x)), nodes.get(self.index(y)))
except Exception:
print('Arcs not available.')
cpdag = tetrad_graph.GraphTransforms.dagToCpdag(graph)
return cpdag
def index(self, variable_name):
import re
# Extracting digits from the string
digits = re.findall(r'\d+', variable_name)
# Convert the first group of digits to integer
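        # e.g. index('X12') returns 11 (zero-based); names with no digits return None.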
return int(digits[0]) - 1 if digits else None
def accuracy(self, true_graph, est_graph, data):
est_graph = tetrad_graph.GraphUtils.replaceNodes(est_graph, true_graph.getNodes())
true_comparison_graph = tetrad_graph.GraphTransforms.dagToCpdag(true_graph)
est_comparison_graph = tetrad_graph.GraphTransforms.dagToCpdag(est_graph)
ap = statistic.AdjacencyPrecision().getValue(true_comparison_graph, est_comparison_graph, data)
ar = statistic.AdjacencyRecall().getValue(true_comparison_graph, est_comparison_graph, data)
ahp = statistic.ArrowheadPrecision().getValue(true_comparison_graph, est_comparison_graph, data)
ahr = statistic.ArrowheadRecall().getValue(true_comparison_graph, est_comparison_graph, data)
bic = statistic.BicEst().getValue(true_comparison_graph, est_comparison_graph, data)
f1_adj = statistic.F1Adj().getValue(true_comparison_graph, est_comparison_graph, data)
f1_all = statistic.F1All().getValue(true_comparison_graph, est_comparison_graph, data)
shd = statistic.StructuralHammingDistance().getValue(true_comparison_graph, est_comparison_graph, data)
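        # F_beta weights recall beta times as much as precision, so beta = 0.5 favors
        # precision and beta = 2 favors recall.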
fb1 = statistic.FBetaAdj()
fb1.setBeta(0.5)
f_beta_point5_adj = fb1.getValue(true_comparison_graph, est_comparison_graph, data)
fb2 = statistic.FBetaAdj()
fb2.setBeta(2)
f_beta_2_adj = fb2.getValue(true_comparison_graph, est_comparison_graph, data)
avgsd = np.nan
avgminsd = np.nan
avgmaxsd = np.nan
import traceback
        if self.sem_im is not None:
try:
ida_check = tetrad_search.IdaCheck(est_comparison_graph, data, self.sem_im)
avgsd = ida_check.getAverageSquaredDistance(ida_check.getOrderedPairs())
avgminsd = ida_check.getAvgMinSquaredDiffEstTrue(ida_check.getOrderedPairs())
avgmaxsd = ida_check.getAvgMaxSquaredDiffEstTrue(ida_check.getOrderedPairs())
except Exception as e:
print("An error occurred:", str(e))
print(traceback.format_exc())
if self.sim_type == 'anclg':
num_params = est_graph.getNumEdges()
else:
num_params = statistic.NumParametersEst().getValue(true_comparison_graph, est_comparison_graph, data)
return ap, ar, ahp, ahr, bic, f1_adj, f1_all, f_beta_point5_adj, f_beta_2_adj, shd, avgsd, avgminsd, avgmaxsd, num_params
def markov_check(self, graph, alg, data, params):
cpdag = self.cpdag(graph)
test = independence.FisherZ().getTest(data, params)
mc = tetrad_search.MarkovCheck(graph, test, tetrad_search.ConditioningSetType.ORDERED_LOCAL_MARKOV)
mc.setPercentResample(0.5)
# We generate results until we have a minimum of p-values for the uniformity test. For
# this, the percent sample needs to be 0.5, so that new samples are generated each time.
#
    # Note that an exception is thrown if any method returns a graph that is not a legal CPDAG, for which no
    # valid order exists. This is because we're using the ordered local Markov condition. Skip these cases.
try:
mc.generateResults(False)
print("# samples now = " + str(mc.getResults(True).size()))
while mc.getResults(True).size() > 0 and mc.getResults(True).size() < 200:
try:
mc.generateResults(False)
print("# samples now = " + str(mc.getResults(True).size()))
except Exception as e:
break
except Exception as e:
print(f"An error occurred for algorithm {alg}:", str(e))
a2Star = mc.getAndersonDarlingA2Star(True)
p_ad = mc.getAndersonDarlingP(True)
p_ks = mc.getKsPValue(True)
fd_indep = mc.getFractionDependent(True)
num_tests_indep = mc.getNumTests(True)
num_test_dep = mc.getNumTests(False)
results = mc.getResults(True)
p_values = mc.getPValues(results)
# Calculate KL-divergence
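        # For discrete distributions, KL(p || q) = sum_i p_i * log(p_i / q_i). Here q is
        # uniform over the bins, and the per-bin terms are averaged rather than summed,
        # so the reported value is KL divided by the number of bins.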
bins = 20
dist = np.histogram(p_values, bins)[0] / len(p_values)
        # Different from uniform?
unif = np.array([1 / bins for _ in range(bins)])
kldiv = np.mean(dist * np.log(np.clip(dist, 1e-6, 1) / unif)) # dist could be 0 :-(
return cpdag, a2Star, p_ad, p_ks, kldiv, fd_indep, num_tests_indep, num_test_dep
def construct_graph(self, g, nodes, cpdag=True):
graph = tetrad_graph.EdgeListGraph(nodes)
for i, a in enumerate(nodes):
for j, b in enumerate(nodes):
if g[i, j]: graph.addDirectedEdge(b, a)
if cpdag: graph = tetrad_graph.GraphTransforms.dagToCpdag(graph)
return graph
def bnl_to_tetrad(self, bnl, cpdag=True):
idx = {f"X{i + 1}": i for i in range(len(self.nodes))}
num_edges = len(bnl) // 2
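        # Assumed layout of the arc vector: all tails first, then all heads, so
        # bnl = ['X1', 'X2', 'X3', 'X4'] encodes the arcs X1 -> X3 and X2 -> X4.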
edges = [(bnl[i], bnl[i + num_edges]) for i in range(num_edges)]
graph = tetrad_graph.EdgeListGraph(self.nodes)
for edge in edges:
graph.addDirectedEdge(self.nodes[idx[edge[0]]], self.nodes[idx[edge[1]]])
        if cpdag: graph = tetrad_graph.GraphTransforms.dagToCpdag(graph)
return graph
def make_data_cont_dao(self, num_nodes, avg_deg, sample_size):
"""
Picks a random graph and generates data from it, using the DaO simulation package
(Andrews, B., & Kummerfeld, E. (2024). Better Simulations for Validating Causal Discovery
with the DAG-Adaptation of the Onion Method. arXiv preprint arXiv:2405.13100.)
:param num_nodes: The number of nodes in the graph.
:param avg_deg: The average degree of the graph.
:param num_latents: The number of latent variables in the graph.
:param sample_size: The number of samples to generate.
:return: The data, nodes, graph, number of nodes, and average degree.
"""
p = num_nodes # number of variables
ad = avg_deg # average degree
n = sample_size # number of samples
g = dao.er_dag(p, ad=ad)
g = dao.sf_out(g)
g = dao.randomize_graph(g)
R, B, O = dao.corr(g)
        if self.sim_type == 'exp':
X = dao.simulate(B, O, n, err=lambda *x: np.random.exponential(x[0], x[1]))
else:
X = dao.simulate(B, O, n)
X = dao.standardize(X)
num_columns = X.shape[1] # Number of columns in the array
column_names = [f'X{i + 1}' for i in range(num_columns)]
df = pd.DataFrame(X, columns=column_names)
cols = df.columns
nodes = util.ArrayList()
for col in cols:
nodes.add(tetrad_data.ContinuousVariable(str(col)))
graph = self.construct_graph(g, nodes)
dag = self.construct_graph(g, nodes, cpdag=False)
        # Construct the SEM IM given the DAG and the covariance matrix.
cov = tetrad_data.CovarianceMatrix(nodes, R, n)
sem_pm = tetrad_sem.SemPm(dag)
sem_im = tetrad_sem.SemIm(sem_pm, cov)
return df, nodes, graph, num_nodes, avg_deg, sem_im, B
def get_model(self, alg, paramValue):
_search = TetradSearch.TetradSearch(self.train)
_search.set_verbose(False)
_search.use_sem_bic(penalty_discount=paramValue)
nodes = util.ArrayList()
for col in self.train.columns:
nodes.add(tetrad_graph.GraphNode(col))
if alg == 'true':
return self.graph
if alg == 'fges':
_search.use_sem_bic(penalty_discount=paramValue)
_search.run_fges(faithfulness_assumed=False)
elif alg == 'boss':
_search.use_sem_bic(penalty_discount=paramValue)
_search.run_boss()
elif alg == 'grasp':
_search.use_sem_bic(penalty_discount=paramValue)
_search.use_fisher_z(0.05)
_search.run_grasp()
elif alg == 'sp':
_search.use_sem_bic(penalty_discount=paramValue)
_search.run_sp()
elif alg == 'pc':
_search.use_fisher_z(paramValue)
_search.run_pc()
elif alg == 'cpc':
_search.use_fisher_z(paramValue)
_search.run_cpc()
elif alg == 'lingam':
dlingam = DirectLiNGAM()
dlingam.fit(self.train)
W = dlingam.adjacency_matrix_
return self.construct_graph(W, nodes, True)
elif alg == 'bidag':
bge = self.bidag.scoreparameters("bge", numpy2rpy(self.train_numpy), bgepar=ListVector({"am": 1.0}))
itmcmc = self.bidag.iterativeMCMC(scorepar=bge, softlimit=9, hardlimit=12, alpha=self.alpha,
verbose=False)
cpdag = self.construct_graph(np.array(self.base.as_matrix(itmcmc[1]), dtype=int).T, nodes, True)
return cpdag
elif alg == 'pchc':
print("pchc")
bnl = self.pchc.pchc(numpy2rpy(self.train.values), alpha=self.alpha, restart=self.pchc_starts)
return self.bnl_to_tetrad(bnl[1][2], cpdag=True)
elif alg == 'mmhc':
bnl = self.pchc.mmhc(numpy2rpy(self.train.values), max_k=self.mmhc_max_k, alpha=self.alpha, restart=self.mmhc_starts)
return self.bnl_to_tetrad(bnl[1][2], cpdag=True)
elif alg == 'dagma':
            model = DagmaLinear(loss_type='l2')  # create a linear model with least-squares loss
            W = model.fit(self.train.to_numpy(), lambda1=paramValue)  # fit with L1 regularization (coefficient paramValue)
return self.construct_graph(W.T, nodes, True)
else:
raise Exception('Unrecognized alg name: ' + alg)
return _search.get_java()
def cpdag(self, graph):
return graph.paths().isLegalCpdag()
# CAFS = Cross-Algorithm Frugality Search
def cafs(self):
dir = f'markov_check_{self.sim_type}'
penalties = [10.0, 5.0, 4.0, 3, 2.5, 2, 1.75, 1.5, 1.25, 1]
alphas = [0.001, 0.01, 0.05, 0.1, 0.2]
for num_nodes in range(5, 30 + 1, 5): # 5, 10, 15, 20, 25, 30
            for avg_degree in range(1, 6 + 1):  # 1, 2, 3, 4, 5, 6
if avg_degree > num_nodes - 1:
continue
# Create the output directory if it does not exist
if not os.path.exists(f'{self.location}/{dir}'):
os.makedirs(f'{self.location}/{dir}')
result_file = f'{self.location}/{dir}/result_{num_nodes}_{avg_degree}.txt'
if os.path.exists(result_file):
print("result file exists: " + result_file)
continue
with (open(result_file, 'w') as file,
open(f'{self.location}/{dir}/graph_{num_nodes}_{avg_degree}.txt', 'w') as graph_file,
open(f'{self.location}/{dir}/train_{num_nodes}_{avg_degree}.txt', 'w') as train_file,
open(f'{self.location}/{dir}/test_{num_nodes}_{avg_degree}.txt', 'w') as test_file):
find = FindGoodModel(self.location, file, num_nodes, avg_degree, 0, 1000,
self.sim_type, histogram_dir=self.histogram_dir)
# print parameter defs and header
find.print_parameter_defs()
find.header()
# go through algorithms and parameter choices and save the best lines (print all lines)
find.save_lines('true', [0])
find.save_lines('dagma', [0.1, 0.2, 0.3])
find.save_lines('pc', alphas)
find.save_lines('cpc', alphas)
find.save_lines('fges', penalties)
find.save_lines('grasp', penalties)
find.save_lines('boss', penalties)
find.save_lines('bidag', [0])
find.save_lines('mmhc', [0])
find.save_lines('pchc', [0])
train = translate.pandas_data_to_tetrad(find.get_train())
test = translate.pandas_data_to_tetrad(find.get_test())
graph = find.get_graph()
# get_stats(train, graph)
print(graph, file=graph_file)
print(train, file=train_file)
print(test, file=test_file)
                    # The with-statement closes all four files automatically.
def create_coef_diff_histograms(self, cpdag, alg, param, histogram_dir):
        if histogram_dir is None:
return
        if self.num_nodes != 25 or self.avg_degree != 5:  # only plot the 25-node, average-degree-5 condition
return
if not os.path.exists(histogram_dir):
os.makedirs(histogram_dir)
dag = self.sem_im.getSemPm().getGraph()
# Get the B coefficients from the sem_im
B = np.array(self.sem_im.getEdgeCoef().toArray())
# Now, get all the coefficients for all of the edges in B--i.e. all of the entries in B that are not zero
non_zero_B = B[B != 0]
# Now find the list of directed edges in dag that are also in cpdag and add these to a list.
edges = dag.getEdges()
directed_edges = []
for e in edges:
if Edges.isDirectedEdge(e):
directed_edges.append(e)
# Now, get the coefficients for all the directed edges in the list of directed edges
in_cpdag = []
not_in_cpdag_but_adjacent = []
not_in_cpdag = []
nodes = cpdag.getNodes()
for e in directed_edges:
n1 = cpdag.getNode(Edges.getDirectedEdgeTail(e).getName())
n2 = cpdag.getNode(Edges.getDirectedEdgeHead(e).getName())
tail = nodes.indexOf(Edges.getDirectedEdgeTail(e))
head = nodes.indexOf(Edges.getDirectedEdgeHead(e))
# Check that e is also in cpdag
if cpdag.containsEdge(Edges.directedEdge(n1, n2)):
in_cpdag.append(B[tail, head])
if not cpdag.containsEdge(Edges.directedEdge(n1, n2)):
not_in_cpdag.append(B[tail, head])
if not cpdag.containsEdge(Edges.directedEdge(n1, n2)) and cpdag.isAdjacentTo(n1, n2):
not_in_cpdag_but_adjacent.append(B[tail, head])
in_cpdag = np.array(in_cpdag)
not_in_cpdag = np.array(not_in_cpdag)
not_in_cpdag_but_adjacent = np.array(not_in_cpdag_but_adjacent)
abs_nonzero_B = np.abs(non_zero_B)
abs_in_cpdag = np.abs(in_cpdag)
abs_not_in_cpdag = np.abs(not_in_cpdag)
abs_not_in_cpdag_and_adjacent = np.abs(not_in_cpdag_but_adjacent)
# Create two subplots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9, 4)) # Adjusted the figsize for better spacing with two plots
# Plot the first histogram with percentages
ax1.hist(abs_nonzero_B, bins=10, edgecolor='black', weights=np.ones(len(abs_nonzero_B)) / len(abs_nonzero_B) * 100)
ax1.set_title("True Coefficients")
ax1.set_xlabel("Coefficient Value")
ax1.set_ylabel("Percentage")
ax1.set_xlim(0, 1)
ax1.set_ylim(0, 30)
# Plot the second histogram with percentages
ax2.hist(abs_not_in_cpdag_and_adjacent, bins=10, edgecolor='black', weights=np.ones(len(abs_not_in_cpdag_and_adjacent)) / len(abs_not_in_cpdag_and_adjacent) * 100)
ax2.set_title("In DAG, not in CPDAG but Adjacent")
ax2.set_xlabel("Coefficient Value")
ax2.set_ylabel("Percentage")
ax2.set_xlim(0, 1)
ax2.set_ylim(0, 30)
# Adjust layout to make room for the suptitle
plt.tight_layout(rect=(0.0, 0.0, 1.0, 0.95))
# Add a general caption above all subplots
# fig.suptitle(f"Comparison of Coefficient Distributions for {alg}_{self.num_nodes}_{self.avg_degree}", fontsize=16)
# Save the plot to a file
histogram_file = f"histograms_{self.num_nodes}_{self.avg_degree}_{alg}_{param}.png"
plt.savefig(f"{histogram_dir}/{histogram_file}")
# # Show the plot
# plt.show()
output_dir = 'alg_output_with_true'
histogram_dir = "plots/histograms/coef_histograms"
FindGoodModel(output_dir, sim_type='lg', histogram_dir=histogram_dir).cafs()