diff --git a/README.rst b/README.rst
index b1c6969..6b5098c 100644
--- a/README.rst
+++ b/README.rst
@@ -18,6 +18,8 @@
      - | |license|
    * - stats
      - | |downloads_stats| |downloads_monthly| |downloads_weekly|
+   * - style
+     - | |Black|
 
 Repository of a data modeling and analysis tool based on Bayesian networks
 
@@ -241,3 +243,6 @@ Citation
 .. |coverage| image:: https://codecov.io/github/aimclub/BAMT/branch/master/graph/badge.svg?token=fA4qsxGqTC
    :target: https://codecov.io/github/aimclub/BAMT
+
+.. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
+.. _Black: https://github.com/psf/black
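The badge added above advertises the Black code style, and every hunk that follows is the result of running Black over the test suite: single quotes become double quotes, literals like .6 gain a leading zero, and long calls are exploded one argument per line with a magic trailing comma. A minimal sketch of the same normalization through Black's Python API (the snippet and its output are illustrative, not part of the patch):

import black

# Format a string of source the same way `black .` formats files on disk.
# black.format_str raises black.InvalidInput if the snippet does not parse.
src = "discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')"
print(black.format_str(src, mode=black.Mode()))
# Here only the quotes change; calls are exploded onto multiple lines
# only once they exceed Black's default 88-column limit.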
("Gross", "Netpay"), - ("Lithology", "Permeability")] +structure = [ + ("Tectonic regime", "Structural setting"), + ("Gross", "Netpay"), + ("Lithology", "Permeability"), +] bn.set_structure(edges=structure) diff --git a/tests/MainTest.py b/tests/MainTest.py index 91d1db2..ff7a26b 100644 --- a/tests/MainTest.py +++ b/tests/MainTest.py @@ -6,20 +6,29 @@ from bamt.preprocessors import Preprocessor import bamt.networks as Networks -''' +""" Optional: You can also uncomment print() that you need. -''' +""" hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv") -cont_data = hack_data[['Gross', 'Netpay', 'Porosity', - 'Permeability', 'Depth']].dropna() -disc_data = hack_data[['Tectonic regime', 'Period', - 'Lithology', 'Structural setting']].dropna() -hybrid_data = hack_data[['Tectonic regime', 'Period', - 'Lithology', 'Structural setting', - 'Gross', 'Netpay', 'Porosity', - 'Permeability', 'Depth']].dropna() +cont_data = hack_data[["Gross", "Netpay", "Porosity", "Permeability", "Depth"]].dropna() +disc_data = hack_data[ + ["Tectonic regime", "Period", "Lithology", "Structural setting"] +].dropna() +hybrid_data = hack_data[ + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +].dropna() cont_test_data = cont_data[cont_data.columns[:-1]] cont_target = cont_data[cont_data.columns[-1]] @@ -29,27 +38,24 @@ hybrid_target = hybrid_data[hybrid_data.columns[-1]] encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer(n_bins=5, - encode='ordinal', - strategy='uniform') -p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)]) +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform") +p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) # Discrete pipeline discretized_data, _ = p.apply(disc_data) disc_bn = Networks.DiscreteBN() info = p.info disc_bn.add_nodes(info) -disc_bn.add_edges(data=discretized_data, scoring_function=('K2', K2Score)) +disc_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) disc_bn.fit_parameters(data=disc_data) disc_bn.calculate_weights(discretized_data) disc_predicted_values = disc_bn.predict(test=disc_test_data) -disc_predicted_values = pd.DataFrame.from_dict( - disc_predicted_values, orient='columns') +disc_predicted_values = pd.DataFrame.from_dict(disc_predicted_values, orient="columns") synth_disc_data = disc_bn.sample(50) -disc_bn.save('./disc_bn.json') +disc_bn.save("./disc_bn.json") disc_bn2 = Networks.DiscreteBN() -disc_bn2.load('./disc_bn.json') +disc_bn2.load("./disc_bn.json") synth_disc_data2 = disc_bn2.sample(50) # print(disc_bn.weights) # print(disc_bn2.weights) @@ -63,17 +69,16 @@ cont_bn = Networks.ContinuousBN(use_mixture=True) info = p.info cont_bn.add_nodes(info) -cont_bn.add_edges(data=discretized_data, scoring_function=('K2', K2Score)) +cont_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) cont_bn.fit_parameters(data=cont_data) cont_bn.calculate_weights(discretized_data) cont_predicted_values = cont_bn.predict(test=cont_test_data) -cont_predicted_values = pd.DataFrame.from_dict( - cont_predicted_values, orient='columns') +cont_predicted_values = pd.DataFrame.from_dict(cont_predicted_values, orient="columns") synth_cont_data = cont_bn.sample(50) -cont_bn.save('./cont_bn.json') +cont_bn.save("./cont_bn.json") cont_bn2 = Networks.ContinuousBN(use_mixture=True) -cont_bn2.load('./cont_bn.json') +cont_bn2.load("./cont_bn.json") synth_cont_data2 = 
diff --git a/tests/MainTest.py b/tests/MainTest.py
index 91d1db2..ff7a26b 100644
--- a/tests/MainTest.py
+++ b/tests/MainTest.py
@@ -6,20 +6,29 @@ from bamt.preprocessors import Preprocessor
 import bamt.networks as Networks
 
-'''
+"""
 Optional:
 You can also uncomment print() that you need.
-'''
+"""
 
 hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv")
 
-cont_data = hack_data[['Gross', 'Netpay', 'Porosity',
-                       'Permeability', 'Depth']].dropna()
-disc_data = hack_data[['Tectonic regime', 'Period',
-                       'Lithology', 'Structural setting']].dropna()
-hybrid_data = hack_data[['Tectonic regime', 'Period',
-                         'Lithology', 'Structural setting',
-                         'Gross', 'Netpay', 'Porosity',
-                         'Permeability', 'Depth']].dropna()
+cont_data = hack_data[["Gross", "Netpay", "Porosity", "Permeability", "Depth"]].dropna()
+disc_data = hack_data[
+    ["Tectonic regime", "Period", "Lithology", "Structural setting"]
+].dropna()
+hybrid_data = hack_data[
+    [
+        "Tectonic regime",
+        "Period",
+        "Lithology",
+        "Structural setting",
+        "Gross",
+        "Netpay",
+        "Porosity",
+        "Permeability",
+        "Depth",
+    ]
+].dropna()
 
 cont_test_data = cont_data[cont_data.columns[:-1]]
 cont_target = cont_data[cont_data.columns[-1]]
@@ -29,27 +38,24 @@ hybrid_target = hybrid_data[hybrid_data.columns[-1]]
 
 encoder = pp.LabelEncoder()
-discretizer = pp.KBinsDiscretizer(n_bins=5,
-                                  encode='ordinal',
-                                  strategy='uniform')
-p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])
+discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform")
+p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)])
 
 # Discrete pipeline
 discretized_data, _ = p.apply(disc_data)
 disc_bn = Networks.DiscreteBN()
 info = p.info
 disc_bn.add_nodes(info)
-disc_bn.add_edges(data=discretized_data, scoring_function=('K2', K2Score))
+disc_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score))
 disc_bn.fit_parameters(data=disc_data)
 disc_bn.calculate_weights(discretized_data)
 disc_predicted_values = disc_bn.predict(test=disc_test_data)
-disc_predicted_values = pd.DataFrame.from_dict(
-    disc_predicted_values, orient='columns')
+disc_predicted_values = pd.DataFrame.from_dict(disc_predicted_values, orient="columns")
 synth_disc_data = disc_bn.sample(50)
 
-disc_bn.save('./disc_bn.json')
+disc_bn.save("./disc_bn.json")
 disc_bn2 = Networks.DiscreteBN()
-disc_bn2.load('./disc_bn.json')
+disc_bn2.load("./disc_bn.json")
 synth_disc_data2 = disc_bn2.sample(50)
 # print(disc_bn.weights)
 # print(disc_bn2.weights)
@@ -63,17 +69,16 @@ cont_bn = Networks.ContinuousBN(use_mixture=True)
 info = p.info
 cont_bn.add_nodes(info)
-cont_bn.add_edges(data=discretized_data, scoring_function=('K2', K2Score))
+cont_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score))
 cont_bn.fit_parameters(data=cont_data)
 cont_bn.calculate_weights(discretized_data)
 cont_predicted_values = cont_bn.predict(test=cont_test_data)
-cont_predicted_values = pd.DataFrame.from_dict(
-    cont_predicted_values, orient='columns')
+cont_predicted_values = pd.DataFrame.from_dict(cont_predicted_values, orient="columns")
 synth_cont_data = cont_bn.sample(50)
 
-cont_bn.save('./cont_bn.json')
+cont_bn.save("./cont_bn.json")
 cont_bn2 = Networks.ContinuousBN(use_mixture=True)
-cont_bn2.load('./cont_bn.json')
+cont_bn2.load("./cont_bn.json")
 synth_cont_data2 = cont_bn2.sample(50)
 # print(cont_bn.weights)
 # print(cont_bn2.weights)
@@ -91,21 +96,22 @@ info = p.info
 hybrid_bn.add_nodes(info)
 hybrid_bn2.add_nodes(info)
-hybrid_bn.add_edges(data=discretized_data, scoring_function=('K2', K2Score))
-hybrid_bn2.add_edges(data=discretized_data, scoring_function=('K2', K2Score))
+hybrid_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score))
+hybrid_bn2.add_edges(data=discretized_data, scoring_function=("K2", K2Score))
 hybrid_bn.fit_parameters(data=hybrid_data)
 hybrid_bn2.fit_parameters(data=hybrid_data)
 hybrid_bn.calculate_weights(discretized_data)
 hybrid_bn2.calculate_weights(discretized_data)
 hybrid_predicted_values = hybrid_bn.predict(test=hybrid_test_data)
 hybrid_predicted_values = pd.DataFrame.from_dict(
-    hybrid_predicted_values, orient='columns')
+    hybrid_predicted_values, orient="columns"
+)
 synth_hybrid_data = hybrid_bn.sample(50)
 synth_hybrid_data2 = hybrid_bn2.sample(50)
 
-hybrid_bn.save('./hybrid_bn.json')
+hybrid_bn.save("./hybrid_bn.json")
 hybrid_bn3 = Networks.HybridBN(use_mixture=True)
-hybrid_bn3.load('./hybrid_bn.json')
+hybrid_bn3.load("./hybrid_bn.json")
 synth_hybrid_data3 = hybrid_bn3.sample(50)
 # print(hybrid_bn.weights)
 # print(hybrid_bn2.weights)
@@ -124,9 +130,9 @@ hybrid_bn = Networks.HybridBN(use_mixture=True)
 info = p.info
 hybrid_bn.add_nodes(info)
-hybrid_bn.add_edges(data=discretized_data, scoring_function=('K2', K2Score))
+hybrid_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score))
 hybrid_bn.fit_parameters(data=hybrid_data)
-hybrid_bn.save('./hybrid_bn_without_weights.json')
+hybrid_bn.save("./hybrid_bn_without_weights.json")
 hybrid_bn2 = Networks.HybridBN(use_mixture=True)
-hybrid_bn2.load('./hybrid_bn_without_weights.json')
+hybrid_bn2.load("./hybrid_bn_without_weights.json")
 # print(hybrid_bn2.weights)
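Throughout MainTest.py, predict() hands back a plain dict keyed by node name, which the script immediately converts with DataFrame.from_dict. A toy illustration of that conversion (the dict values here are invented, not real predictions):

import pandas as pd

preds = {"Porosity": [0.1, 0.2, 0.3], "Depth": [2.1, 2.0, 1.9]}  # invented stand-in
# orient="columns" turns each key into a column, one row per predicted sample.
frame = pd.DataFrame.from_dict(preds, orient="columns")
print(frame.shape)  # (3, 2): one column per predicted node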
diff --git a/tests/MetricsTest.py b/tests/MetricsTest.py
index 74997ca..05dc87f 100644
--- a/tests/MetricsTest.py
+++ b/tests/MetricsTest.py
@@ -12,15 +12,16 @@
 h = pd.read_csv("data/real data/hack_processed_with_rf.csv")
 cols = [
-    'Tectonic regime',
-    'Period',
-    'Lithology',
-    'Structural setting',
-    'Gross',
-    'Netpay',
-    'Porosity',
-    'Permeability',
-    'Depth']
+    "Tectonic regime",
+    "Period",
+    "Lithology",
+    "Structural setting",
+    "Gross",
+    "Netpay",
+    "Porosity",
+    "Permeability",
+    "Depth",
+]
 h = h[cols]
 
 print(h.describe())
@@ -29,12 +30,9 @@
 print(f"Time elapsed for preparing data: {p2 - p1}")
 
 encoder = pp.LabelEncoder()
-discretizer = pp.KBinsDiscretizer(
-    n_bins=5,
-    encode='ordinal',
-    strategy='quantile')
+discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile")
 
-p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])
+p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)])
 
 # -----------
 discrete_data, est = p.apply(h)
@@ -42,15 +40,15 @@
 bn = Networks.HybridBN(has_logit=True)  # all may vary
 bn.add_nodes(descriptor=info)
-bn.add_edges(data=discrete_data, optimizer='HC', scoring_function=('MI',))
+bn.add_edges(data=discrete_data, optimizer="HC", scoring_function=("MI",))
 
 bn.get_info(as_df=False)
 t1 = time.time()
 bn.fit_parameters(data=h)
 t2 = time.time()
-print(f'PL elapsed: {t2 - t1}')
+print(f"PL elapsed: {t2 - t1}")
 
-columns = ['Lithology', 'Structural setting', 'Porosity', 'Depth']
+columns = ["Lithology", "Structural setting", "Porosity", "Depth"]
 validY = h[columns].dropna()
 validX = h.drop(columns, axis=1).dropna()
 
@@ -58,4 +56,4 @@
 pred_param = bn.predict(validX, parall_count=3)
 time_2 = time.time()
 print(pred_param)
-print(f'Predict elapsed: {time_2 - time_1}')
+print(f"Predict elapsed: {time_2 - time_1}")
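Unlike the other scripts, MetricsTest.py discretizes with strategy="quantile" rather than "uniform". Both are stock sklearn options: uniform cuts equal-width bins, quantile cuts equal-population bins, which matters for skewed geological variables. A self-contained comparison on synthetic data:

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

rng = np.random.default_rng(0)
x = rng.exponential(size=(1000, 1))  # skewed, so the two strategies disagree

for strategy in ("uniform", "quantile"):
    disc = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy=strategy)
    codes = disc.fit_transform(x)
    print(strategy, np.bincount(codes.astype(int).ravel(), minlength=5))
# "quantile" puts roughly 200 samples in every bin;
# "uniform" piles most samples into bin 0 for this distribution.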
diff --git a/tests/NetworksTest.py b/tests/NetworksTest.py
index 7d9b7f4..c665b44 100644
--- a/tests/NetworksTest.py
+++ b/tests/NetworksTest.py
@@ -1,6 +1,7 @@
 import json
 import time
 import itertools
+
 # import abc
 
 import pandas as pd
@@ -18,12 +19,14 @@
 class NetworkTest(object):
-    def __init__(self,
-                 directory: str,
-                 verbose: bool = False,
-                 case_id: int = 0,
-                 sample_n: int = 500,
-                 sample_tol: float = .6):
+    def __init__(
+        self,
+        directory: str,
+        verbose: bool = False,
+        case_id: int = 0,
+        sample_n: int = 500,
+        sample_tol: float = 0.6,
+    ):
         """
         sample_n: number of rows in sample
         sample_tol: precent of acceptable number of nans.
@@ -49,26 +52,23 @@ def test_preprocess(self):
 
         if self.case_id == 0:
             self.discrete_cols = [
-                'Tectonic regime',
-                'Period',
-                'Lithology',
-                'Structural setting']
-            self.cont_cols = [
-                'Gross',
-                'Netpay',
-                'Porosity',
-                'Permeability',
-                'Depth']
+                "Tectonic regime",
+                "Period",
+                "Lithology",
+                "Structural setting",
+            ]
+            self.cont_cols = ["Gross", "Netpay", "Porosity", "Permeability", "Depth"]
             self.hybrid_cols = [
-                'Tectonic regime',
-                'Period',
-                'Lithology',
-                'Structural setting',
-                'Gross',
-                'Netpay',
-                'Porosity',
-                'Permeability',
-                'Depth']
+                "Tectonic regime",
+                "Period",
+                "Lithology",
+                "Structural setting",
+                "Gross",
+                "Netpay",
+                "Porosity",
+                "Permeability",
+                "Depth",
+            ]
             # Base of standards
             self.base = "hack_" + self.type
         else:
@@ -85,9 +85,10 @@ def test_preprocess(self):
 
         encoder = pp.LabelEncoder()
         discretizer = pp.KBinsDiscretizer(
-            n_bins=5, encode='ordinal', strategy='uniform')
+            n_bins=5, encode="ordinal", strategy="uniform"
+        )
 
-        p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])
+        p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)])
         discretized_data, est = p.apply(data)
 
         info = p.info
@@ -96,17 +97,14 @@ def test_preprocess(self):
             assert info == json.load(open(f"{self.base}/hack_info.json"))
         except AssertionError:
             failed = True
-            self.verboseprint(
-                self._tabularize_output("ERROR", "Bad descriptor")
-            )
+            self.verboseprint(self._tabularize_output("ERROR", "Bad descriptor"))
 
         try:
             assert_frame_equal(
                 discretized_data,
-                pd.read_csv(
-                    f"{self.base}/hack_data.csv",
-                    index_col=0),
-                check_dtype=False)
+                pd.read_csv(f"{self.base}/hack_data.csv", index_col=0),
+                check_dtype=False,
+            )
         except Exception as ex:
             failed = True
             self.verboseprint(self._tabularize_output("ERROR", str(ex)))
@@ -157,8 +155,11 @@ def test_predict(self):
         else:
             raise Exception("Inner error")
 
-        preds = self.bn.predict(test=pd.read_csv(self.directory)[cols[:2]].dropna(),
-                                progress_bar=False, parall_count=2)
+        preds = self.bn.predict(
+            test=pd.read_csv(self.directory)[cols[:2]].dropna(),
+            progress_bar=False,
+            parall_count=2,
+        )
 
         # with open(f"{self.base}/hack_predict.json", "r") as f:
         #     p = json.load(f)
@@ -166,47 +167,59 @@ def test_predict(self):
         if self.type == "continuous":
             # cols: ['Porosity', 'Permeability', 'Depth']
             for node in preds.keys():
-                right_val = json.load(
-                    open(f"{self.base}/hack_predict.json"))[self.sf][node]
-                test_val = np.mean(
-                    [mx for mx in preds[node] if not np.isnan(mx)])
-                assert np.all(np.isclose(test_val, right_val, rtol=.4)
-                              ), f"Predict failed: {node, right_val, test_val}"
+                right_val = json.load(open(f"{self.base}/hack_predict.json"))[self.sf][
+                    node
+                ]
+                test_val = np.mean([mx for mx in preds[node] if not np.isnan(mx)])
+                assert np.all(
+                    np.isclose(test_val, right_val, rtol=0.4)
+                ), f"Predict failed: {node, right_val, test_val}"
         elif self.type == "discrete":
             # cols: ['Lithology', 'Structural setting']
             for node in preds.keys():
                 test_vals = pd.Series(preds[node]).value_counts().to_dict()
                 for category, right_val in json.load(
-                        open(f"{self.base}/hack_predict.json"))[self.sf][node].items():
+                    open(f"{self.base}/hack_predict.json")
+                )[self.sf][node].items():
                     try:
-                        assert np.all(np.isclose(test_vals[category], right_val, atol=5)), \
-                            f"Predict failed: {node, test_vals[category], right_val}"
+                        assert np.all(
+                            np.isclose(test_vals[category], right_val, atol=5)
+                        ), f"Predict failed: {node, test_vals[category], right_val}"
                     except KeyError as ex:
                         print("Unknown preds category: ", ex.args[0])
                         continue
         elif self.type == "hybrid":
             cont_nodes = [
-                node for node in self.bn.nodes_names if self.info["types"][node] == "cont"]
+                node
+                for node in self.bn.nodes_names
+                if self.info["types"][node] == "cont"
+            ]
             for node in preds.keys():
                 if node in cont_nodes:
-                    right_val = json.load(
-                        open(f"{self.base}/hack_predict.json"))[self.sf][node]
-                    test_val = np.mean(
-                        [mx for mx in preds[node] if not np.isnan(mx)])
+                    right_val = json.load(open(f"{self.base}/hack_predict.json"))[
+                        self.sf
+                    ][node]
+                    test_val = np.mean([mx for mx in preds[node] if not np.isnan(mx)])
                     # p[self.sf][node] = test_val
                     s = [right_val, test_val]
-                    assert np.all(np.isclose(min(s), max(s), atol=5, rtol=.6)), \
-                        f"Predict failed: {node, test_val, right_val}"
+                    assert np.all(
+                        np.isclose(min(s), max(s), atol=5, rtol=0.6)
+                    ), f"Predict failed: {node, test_val, right_val}"
                 else:
                     test_vals = pd.Series(preds[node]).value_counts().to_dict()
                     # p[self.sf][node] = test_vals
                     for category, right_val in json.load(
-                            open(f"{self.base}/hack_predict.json"))[self.sf][node].items():
+                        open(f"{self.base}/hack_predict.json")
+                    )[self.sf][node].items():
                         try:
-                            assert np.all(np.isclose(min(test_vals[category], right_val),
-                                                     max(right_val, test_vals[category]),
-                                                     atol=100, rtol=.5)), \
-                                f"Predict failed: {node, test_vals[category], right_val}"
+                            assert np.all(
+                                np.isclose(
+                                    min(test_vals[category], right_val),
+                                    max(right_val, test_vals[category]),
+                                    atol=100,
+                                    rtol=0.5,
+                                )
+                            ), f"Predict failed: {node, test_vals[category], right_val}"
                         except KeyError as ex:
                             print("Unknown preds category: ", ex.args[0])
                             continue
@@ -233,7 +246,7 @@ def use_rules(*args, **kwargs):
 class TestDiscreteBN(NetworkTest):
     def __init__(self, **kwargs):
         super(TestDiscreteBN, self).__init__(**kwargs)
-        self.type = 'discrete'
+        self.type = "discrete"
 
     def test_structure_learning(self):
         failed = False
@@ -243,22 +256,23 @@
 
         try:
             assert bn.nodes_names == [
-                'Tectonic regime',
-                'Period',
-                'Lithology',
-                'Structural setting']
+                "Tectonic regime",
+                "Period",
+                "Lithology",
+                "Structural setting",
+            ]
         except AssertionError:
             failed = True
             self.verboseprint(
                 self._tabularize_output(
-                    "ERROR",
-                    "first stage failed (wrong init nodes)."))
+                    "ERROR", "first stage failed (wrong init nodes)."
+ ) + ) bn.add_edges(self.data, (self.sf,), progress_bar=False) try: - assert bn.edges == json.load( - open(f"{self.base}/hack_edges.json"))[self.sf] + assert bn.edges == json.load(open(f"{self.base}/hack_edges.json"))[self.sf] except AssertionError: failed = True self.verboseprint(f"Stage 2 failed with {self.sf}.") @@ -278,14 +292,15 @@ def test_parameters_learning(self): self.bn.fit_parameters(pd.read_csv(self.directory)[self.discrete_cols]) try: - assert self.bn.distributions == json.load( - open(f"{self.base}/hack_params.json"))[self.sf] + assert ( + self.bn.distributions + == json.load(open(f"{self.base}/hack_params.json"))[self.sf] + ) except AssertionError: failed = True self.verboseprint( - self._tabularize_output( - f"Parameters ({self.sf})", - "bad distributions")) + self._tabularize_output(f"Parameters ({self.sf})", "bad distributions") + ) if not failed: status = "OK" @@ -303,10 +318,7 @@ def apply(self): t1 = time.time() self.test_structure_learning() if not self.bn: - print( - self._tabularize_output( - f"Error on {sf}", - "No structure")) + print(self._tabularize_output(f"Error on {sf}", "No structure")) print("-" * 8) continue self.test_parameters_learning() @@ -321,32 +333,30 @@ def apply(self): class TestContinuousBN(NetworkTest): def __init__(self, **kwargs): super(TestContinuousBN, self).__init__(**kwargs) - self.type = 'continuous' + self.type = "continuous" def test_setters(self): failed = False bn = Networks.ContinuousBN() ns = [] - for d in [ - Nodes.GaussianNode( - name="Node" + - str(id)) for id in range( - 0, - 4)]: + for d in [Nodes.GaussianNode(name="Node" + str(id)) for id in range(0, 4)]: ns.append(d) bn.set_structure(nodes=ns) bn.set_classifiers( classifiers={ - 'Node0': DecisionTreeClassifier(), - 'Node1': RandomForestClassifier(), - 'Node2': KNeighborsClassifier( - n_neighbors=2)}) - - assert [str(bn[node].classifier) for node in ["Node0", "Node1", "Node2"]] == \ - ["DecisionTreeClassifier()", "RandomForestClassifier()", - "KNeighborsClassifier(n_neighbors=2)"], "Setter | Classifiers are wrong." + "Node0": DecisionTreeClassifier(), + "Node1": RandomForestClassifier(), + "Node2": KNeighborsClassifier(n_neighbors=2), + } + ) + + assert [str(bn[node].classifier) for node in ["Node0", "Node1", "Node2"]] == [ + "DecisionTreeClassifier()", + "RandomForestClassifier()", + "KNeighborsClassifier(n_neighbors=2)", + ], "Setter | Classifiers are wrong." if not failed: status = "OK" @@ -363,20 +373,29 @@ def test_structure_learning(self, use_mixture: bool = False): bn.add_nodes(descriptor=self.info) try: - assert bn.nodes_names == json.load( - open(f"{self.base}/hack_nodes.json"))[f"use_mixture={use_mixture}"][self.sf] + assert ( + bn.nodes_names + == json.load(open(f"{self.base}/hack_nodes.json"))[ + f"use_mixture={use_mixture}" + ][self.sf] + ) except AssertionError: failed = True self.verboseprint( self._tabularize_output( - "ERROR", - "first stage failed (wrong init nodes).")) + "ERROR", "first stage failed (wrong init nodes)." 
+ ) + ) bn.add_edges(self.data, (self.sf,), progress_bar=False) try: - assert bn.edges == json.load( - open(f"{self.base}/hack_edges.json"))[f"use_mixture={use_mixture}"][self.sf] + assert ( + bn.edges + == json.load(open(f"{self.base}/hack_edges.json"))[ + f"use_mixture={use_mixture}" + ][self.sf] + ) except AssertionError: failed = True self.verboseprint(f"Stage 2 failed with {self.sf}.") @@ -389,8 +408,9 @@ def test_structure_learning(self, use_mixture: bool = False): print( self._tabularize_output( - f"Structure ({self.sf}, use_mixture={self.use_mixture})", - status)) + f"Structure ({self.sf}, use_mixture={self.use_mixture})", status + ) + ) def test_parameters_learning(self): failed = False @@ -401,18 +421,26 @@ def test_parameters_learning(self): empty_data = {"mean": [], "covars": [], "coef": []} for k, v in self.bn.distributions.items(): assert all( - [v[obj] != empty for obj, empty in empty_data.items()]), f"Empty data in {k}." - assert .9 <= sum( - v["coef"]) <= 1.1, f"{sum(v['coef'])} || {k}'s: coefs are wrong." + [v[obj] != empty for obj, empty in empty_data.items()] + ), f"Empty data in {k}." + assert ( + 0.9 <= sum(v["coef"]) <= 1.1 + ), f"{sum(v['coef'])} || {k}'s: coefs are wrong." else: - assert self.bn.distributions == json.load(open( - f"{self.base}/hack_params.json"))["use_mixture=False"][self.sf], "Bad distributions." + assert ( + self.bn.distributions + == json.load(open(f"{self.base}/hack_params.json"))[ + "use_mixture=False" + ][self.sf] + ), "Bad distributions." except AssertionError as ex: failed = True self.verboseprint( self._tabularize_output( f"Parameters ({self.sf}, use_mixture={self.use_mixture})", - ex.args[0])) + ex.args[0], + ) + ) if not failed: status = "OK" @@ -421,8 +449,9 @@ def test_parameters_learning(self): print( self._tabularize_output( - f"Parameters ({self.sf}, use_mixture={self.use_mixture})", - status)) + f"Parameters ({self.sf}, use_mixture={self.use_mixture})", status + ) + ) def apply(self): print(f"Executing {self.type} BN tests.") @@ -436,10 +465,7 @@ def apply(self): t1 = time.time() self.test_structure_learning(use_mixture=use_mixture) if not self.bn: - print( - self._tabularize_output( - f"Error on {sf}", - "No structure")) + print(self._tabularize_output(f"Error on {sf}", "No structure")) print("-" * 8) continue self.test_parameters_learning() @@ -454,7 +480,7 @@ def apply(self): class TestHybridBN(NetworkTest): def __init__(self, **kwargs): super(TestHybridBN, self).__init__(**kwargs) - self.type = 'hybrid' + self.type = "hybrid" def test_setters(self): failed = False @@ -462,48 +488,58 @@ def test_setters(self): bn = Networks.HybridBN(has_logit=True) ns = [] for d, g in zip( - [Nodes.GaussianNode(name="Node" + str(id)) for id in range(0, 3)], - [Nodes.DiscreteNode(name="Node" + str(id)) for id in range(3, 6)]): + [Nodes.GaussianNode(name="Node" + str(id)) for id in range(0, 3)], + [Nodes.DiscreteNode(name="Node" + str(id)) for id in range(3, 6)], + ): ns.append(d) ns.append(g) - edges = [('Node0', 'Node3'), ('Node3', 'Node1'), - ('Node1', 'Node4'), ('Node4', 'Node2'), ('Node2', 'Node5')] + edges = [ + ("Node0", "Node3"), + ("Node3", "Node1"), + ("Node1", "Node4"), + ("Node4", "Node2"), + ("Node2", "Node5"), + ] test_info = { - 'types': { - 'Node0': 'cont', - 'Node1': 'cont', - 'Node2': 'cont', - 'Node3': 'disc', - 'Node4': 'disc', - 'Node5': 'disc'}, - 'signs': { - 'Node0': 'pos', - 'Node1': 'pos', - 'Node2': 'pos'}} + "types": { + "Node0": "cont", + "Node1": "cont", + "Node2": "cont", + "Node3": "disc", + "Node4": "disc", + 
"Node5": "disc", + }, + "signs": {"Node0": "pos", "Node1": "pos", "Node2": "pos"}, + } # Structure setter - bn.set_structure(info=test_info, - nodes=ns, - edges=edges) - - assert ['Gaussian (LinearRegression)', 'Logit (LogisticRegression)', 'ConditionalGaussian (LinearRegression)', - 'Logit (LogisticRegression)', 'ConditionalGaussian (LinearRegression)', - 'Logit (LogisticRegression)'] == \ - [node.type for node in bn.nodes], "Setter | Nodes are not the same." + bn.set_structure(info=test_info, nodes=ns, edges=edges) + + assert [ + "Gaussian (LinearRegression)", + "Logit (LogisticRegression)", + "ConditionalGaussian (LinearRegression)", + "Logit (LogisticRegression)", + "ConditionalGaussian (LinearRegression)", + "Logit (LogisticRegression)", + ] == [node.type for node in bn.nodes], "Setter | Nodes are not the same." assert edges == bn.edges, "Setter | Edges are not the same." # Classifiers setters bn.set_classifiers( classifiers={ - 'Node3': DecisionTreeClassifier(), - 'Node4': RandomForestClassifier(), - 'Node5': KNeighborsClassifier( - n_neighbors=2)}) - - assert [str(bn[node].classifier) for node in ["Node3", "Node4", "Node5"]] == \ - ["DecisionTreeClassifier()", "RandomForestClassifier()", - "KNeighborsClassifier(n_neighbors=2)"], "Setter | Classifiers are wrong." + "Node3": DecisionTreeClassifier(), + "Node4": RandomForestClassifier(), + "Node5": KNeighborsClassifier(n_neighbors=2), + } + ) + + assert [str(bn[node].classifier) for node in ["Node3", "Node4", "Node5"]] == [ + "DecisionTreeClassifier()", + "RandomForestClassifier()", + "KNeighborsClassifier(n_neighbors=2)", + ], "Setter | Classifiers are wrong." # Parameters setters @@ -521,9 +557,8 @@ def test_setters(self): print(self._tabularize_output("Setters", status)) def test_structure_learning( - self, - use_mixture: bool = False, - has_logit: bool = False): + self, use_mixture: bool = False, has_logit: bool = False + ): self.use_mixture = use_mixture self.has_logit = has_logit failed = False @@ -532,20 +567,29 @@ def test_structure_learning( bn.add_nodes(descriptor=self.info) try: - assert bn.nodes_names == json.load(open(f"{self.base}/hack_nodes.json"))[ - f"use_mixture={use_mixture}"][f"has_logit={has_logit}"][self.sf] + assert ( + bn.nodes_names + == json.load(open(f"{self.base}/hack_nodes.json"))[ + f"use_mixture={use_mixture}" + ][f"has_logit={has_logit}"][self.sf] + ) except AssertionError: failed = True self.verboseprint( self._tabularize_output( - "ERROR", - "first stage failed (wrong init nodes).")) + "ERROR", "first stage failed (wrong init nodes)." + ) + ) bn.add_edges(self.data, (self.sf,), progress_bar=False) try: - assert bn.edges == json.load(open(f"{self.base}/hack_edges.json"))[ - f"use_mixture={use_mixture}"][f"has_logit={has_logit}"][self.sf] + assert ( + bn.edges + == json.load(open(f"{self.base}/hack_edges.json"))[ + f"use_mixture={use_mixture}" + ][f"has_logit={has_logit}"][self.sf] + ) except AssertionError: failed = True self.verboseprint(f"Stage 2 failed with {self.sf}.") @@ -559,24 +603,27 @@ def test_structure_learning( print( self._tabularize_output( f"Structure ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", - status)) + status, + ) + ) @staticmethod def non_empty_gaussian_nodes(name, node_params): empty_data = {"mean": [], "covars": [], "coef": []} - assert all([node_params[obj] != empty for obj, - empty in empty_data.items()]), f"Empty data in {name}." + assert all( + [node_params[obj] != empty for obj, empty in empty_data.items()] + ), f"Empty data in {name}." 
@staticmethod def non_empty_logit_nodes(name, node_params): empty_data = {"classes": [], "classifier_obj": None} - assert all([node_params[obj] != empty for obj, - empty in empty_data.items()]), f"Empty data in {name}." + assert all( + [node_params[obj] != empty for obj, empty in empty_data.items()] + ), f"Empty data in {name}." @staticmethod def sum_equals_to_1(name, node_params): - assert .9 <= sum( - node_params["coef"]) <= 1.1, f"{name}'s: coefs are wrong." + assert 0.9 <= sum(node_params["coef"]) <= 1.1, f"{name}'s: coefs are wrong." def _validate_node(self, name, type, node_params, true_vals): try: @@ -585,32 +632,34 @@ def _validate_node(self, name, type, node_params, true_vals): self.non_empty_gaussian_nodes, self.sum_equals_to_1, name=name, - node_params=node_params) + node_params=node_params, + ) elif type == "ConditionalMixtureGaussian": for comb, data in node_params["hybcprob"].items(): self.use_rules( self.non_empty_gaussian_nodes, self.sum_equals_to_1, name=name, - node_params=data) + node_params=data, + ) elif type.startswith("Logit"): self.use_rules( - self.non_empty_logit_nodes, - name=name, - node_params=node_params) + self.non_empty_logit_nodes, name=name, node_params=node_params + ) elif type.startswith("ConditionalLogit"): for comb, data in node_params["hybcprob"].items(): self.use_rules( - self.non_empty_logit_nodes, - name=name, - node_params=data) + self.non_empty_logit_nodes, name=name, node_params=data + ) else: assert node_params == true_vals, f"Parameters error on {name}, {type}" except AssertionError as ex: self.verboseprint( self._tabularize_output( f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", - ex.args[0])) + ex.args[0], + ) + ) def test_parameters_learning(self): failed = False @@ -618,7 +667,8 @@ def test_parameters_learning(self): self.bn.fit_parameters(pd.read_csv(self.directory)[self.hybrid_cols]) try: true_params = json.load(open(f"{self.base}/hack_params.json"))[ - f"use_mixture={self.use_mixture}"][f"has_logit={self.has_logit}"][self.sf] + f"use_mixture={self.use_mixture}" + ][f"has_logit={self.has_logit}"][self.sf] node_type_dict = {node.name: node.type for node in self.bn.nodes} for name, type in node_type_dict.items(): @@ -629,7 +679,9 @@ def test_parameters_learning(self): self.verboseprint( self._tabularize_output( f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", - ex.args[0])) + ex.args[0], + ) + ) if not failed: status = "OK" @@ -639,7 +691,9 @@ def test_parameters_learning(self): print( self._tabularize_output( f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", - status)) + status, + ) + ) def apply(self): print(f"Executing {self.type} BN tests.") @@ -647,19 +701,16 @@ def apply(self): self.test_setters() t0 = time.time() - for use_mixture, has_logit in itertools.product( - [True, False], repeat=2): + for use_mixture, has_logit in itertools.product([True, False], repeat=2): for sf in ["MI", "K2", "BIC"]: self.sf = sf t1 = time.time() self.test_structure_learning( - use_mixture=use_mixture, has_logit=has_logit) + use_mixture=use_mixture, has_logit=has_logit + ) self.test_parameters_learning() if not self.bn: - print( - self._tabularize_output( - f"Error on {sf}", - "No structure")) + print(self._tabularize_output(f"Error on {sf}", "No structure")) print("-" * 8) continue self.test_sampling() diff --git a/tests/SaveBN.py b/tests/SaveBN.py index 04b3042..8fb2401 100644 --- a/tests/SaveBN.py +++ b/tests/SaveBN.py @@ -2,17 +2,27 @@ 
import pandas as pd from sklearn import preprocessing as pp import bamt.networks as Networks + # import json hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[ - ['Tectonic regime', 'Period', 'Lithology', 'Structural setting', - 'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth']] + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +] encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer( - n_bins=5, encode='ordinal', strategy='uniform') +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform") -p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)]) +p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) discretized_data, est = p.apply(hack_data) @@ -21,9 +31,11 @@ bn.add_nodes(info) -structure = [("Tectonic regime", "Structural setting"), - ("Gross", "Netpay"), - ("Lithology", "Permeability")] +structure = [ + ("Tectonic regime", "Structural setting"), + ("Gross", "Netpay"), + ("Lithology", "Permeability"), +] bn.set_structure(edges=structure) diff --git a/tests/main.py b/tests/main.py index 520b62e..bc1904a 100644 --- a/tests/main.py +++ b/tests/main.py @@ -5,15 +5,17 @@ import traceback # Print only errors -logging.getLogger('preprocessor').setLevel(logging.ERROR) +logging.getLogger("preprocessor").setLevel(logging.ERROR) if __name__ == "__main__": t0 = time.time() dir = r"../data/real data/hack_processed_with_rf.csv" - tests = [TestHybridBN(directory=dir), - TestDiscreteBN(directory=dir), - TestContinuousBN(directory=dir)] + tests = [ + TestHybridBN(directory=dir), + TestDiscreteBN(directory=dir), + TestContinuousBN(directory=dir), + ] for test in tests: try: diff --git a/tests/sendingClassifiersLogit.py b/tests/sendingClassifiersLogit.py index 5045077..a0c3613 100644 --- a/tests/sendingClassifiersLogit.py +++ b/tests/sendingClassifiersLogit.py @@ -11,17 +11,23 @@ import pandas as pd hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv")[ - ['Tectonic regime', 'Period', 'Lithology', 'Structural setting', - 'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth']] + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +] encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer( - n_bins=5, - encode='ordinal', - strategy='quantile') +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile") -p = preprocessors.Preprocessor( - [('encoder', encoder), ('discretizer', discretizer)]) +p = preprocessors.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) discretized_data, est = p.apply(hack_data) @@ -40,9 +46,13 @@ bn.add_nodes(info) bn.add_edges(discretized_data, scoring_function=("BIC",), progress_bar=False) -bn.set_classifiers(classifiers={'Structural setting': DecisionTreeClassifier(), - 'Lithology': RandomForestClassifier(), - 'Period': KNeighborsClassifier(n_neighbors=2)}) +bn.set_classifiers( + classifiers={ + "Structural setting": DecisionTreeClassifier(), + "Lithology": RandomForestClassifier(), + "Period": KNeighborsClassifier(n_neighbors=2), + } +) bn.fit_parameters(hack_data) diff --git a/tests/sendingRegressors.py b/tests/sendingRegressors.py index 0168661..35f23a1 100644 --- a/tests/sendingRegressors.py +++ b/tests/sendingRegressors.py @@ -12,17 +12,23 @@ import pandas as pd hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv")[ - 
['Tectonic regime', 'Period', 'Lithology', 'Structural setting', - 'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth']] + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +] encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer( - n_bins=5, - encode='ordinal', - strategy='quantile') +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile") -p = preprocessors.Preprocessor( - [('encoder', encoder), ('discretizer', discretizer)]) +p = preprocessors.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) discretized_data, est = p.apply(hack_data) @@ -42,11 +48,11 @@ bn.set_regressor( regressors={ - 'Depth': CatBoostRegressor( - logging_level="Silent", - allow_writing_files=False), - 'Gross': RandomForestRegressor(), - 'Porosity': DecisionTreeRegressor()}) + "Depth": CatBoostRegressor(logging_level="Silent", allow_writing_files=False), + "Gross": RandomForestRegressor(), + "Porosity": DecisionTreeRegressor(), + } +) bn.fit_parameters(hack_data) diff --git a/tests/test_builders.py b/tests/test_builders.py index d117e31..491a9ca 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -21,322 +21,713 @@ class TestStructureBuilder(unittest.TestCase): - def setUp(self): self.data = pd.DataFrame(columns=["Node0", "Node1", "Node2"]) - self.descriptor = {"types": {"Node0": "cont", - "Node1": "disc", - "Node2": "disc_num"}, - "signs": {"Node0": "pos"}} + self.descriptor = { + "types": {"Node0": "cont", "Node1": "disc", "Node2": "disc_num"}, + "signs": {"Node0": "pos"}, + } self.SB = StructureBuilder(descriptor=self.descriptor) def test_restrict(self): self.SB.has_logit = True - self.SB.restrict(data=self.data, - init_nodes=None, - bl_add=None) + self.SB.restrict(data=self.data, init_nodes=None, bl_add=None) self.assertEqual(self.SB.black_list, [], msg="Restrict wrong edges.") # --------- self.SB.has_logit = False - self.SB.restrict(data=self.data, - init_nodes=None, - bl_add=None) + self.SB.restrict(data=self.data, init_nodes=None, bl_add=None) self.assertEqual( - self.SB.black_list, [ - ('Node0', 'Node1'), ('Node0', 'Node2')], msg="Restricted edges are allowed.") + self.SB.black_list, + [("Node0", "Node1"), ("Node0", "Node2")], + msg="Restricted edges are allowed.", + ) def test_get_family(self): self.assertIsNone(self.SB.get_family()) - self.SB.skeleton['V'] = [GaussianNode(name="Node0"), - DiscreteNode(name="Node1"), - DiscreteNode(name="Node2")] + self.SB.skeleton["V"] = [ + GaussianNode(name="Node0"), + DiscreteNode(name="Node1"), + DiscreteNode(name="Node2"), + ] self.assertIsNone(self.SB.get_family()) # Note that the method get_family is not supposed to be used by user (only developer), # so we don't cover a case with restricted edges here (we did this in # the previous test). 
- self.SB.skeleton['E'] = [("Node1", "Node0"), ("Node2", "Node1"), - ("Node2", "Node0")] + self.SB.skeleton["E"] = [ + ("Node1", "Node0"), + ("Node2", "Node1"), + ("Node2", "Node0"), + ] self.SB.get_family() # Node: [[cont_parents], [disc_parents], [children]] data = [ - [[], [], ['Node1', 'Node0']], - [[], ['Node2'], ['Node0']], - [[], ['Node1', 'Node2'], []] + [[], [], ["Node1", "Node0"]], + [[], ["Node2"], ["Node0"]], + [[], ["Node1", "Node2"], []], ] for node_nummer in range(3): - self.assertEqual(self.SB.skeleton["V"][node_nummer].cont_parents, - data[node_nummer][0]) - self.assertEqual(self.SB.skeleton["V"][node_nummer].disc_parents, - data[node_nummer][1]) - self.assertEqual(self.SB.skeleton["V"][node_nummer].children, - data[node_nummer][2]) + self.assertEqual( + self.SB.skeleton["V"][node_nummer].cont_parents, data[node_nummer][0] + ) + self.assertEqual( + self.SB.skeleton["V"][node_nummer].disc_parents, data[node_nummer][1] + ) + self.assertEqual( + self.SB.skeleton["V"][node_nummer].children, data[node_nummer][2] + ) class TestVerticesDefiner(unittest.TestCase): - def setUp(self): - self.descriptor = {"types": {"Node0": "cont", - "Node1": "cont", - "Node2": "cont", - "Node3": "cont", - "Node4": "disc", - "Node5": "disc", - "Node6": "disc_num", - "Node7": "disc_num"}, - "signs": {"Node0": "pos", "Node1": "neg"}} - - self.VD = VerticesDefiner( - descriptor=self.descriptor, regressor=None) + self.descriptor = { + "types": { + "Node0": "cont", + "Node1": "cont", + "Node2": "cont", + "Node3": "cont", + "Node4": "disc", + "Node5": "disc", + "Node6": "disc_num", + "Node7": "disc_num", + }, + "signs": {"Node0": "pos", "Node1": "neg"}, + } + + self.VD = VerticesDefiner(descriptor=self.descriptor, regressor=None) def test_first_level(self): self.assertEqual( - self.VD.vertices, [ - GaussianNode( - name="Node0"), GaussianNode( - name="Node1"), GaussianNode( - name="Node2"), GaussianNode( - name="Node3"), DiscreteNode( - name="Node4"), DiscreteNode( - name="Node5"), DiscreteNode( - name="Node6"), DiscreteNode( - name="Node7")]) + self.VD.vertices, + [ + GaussianNode(name="Node0"), + GaussianNode(name="Node1"), + GaussianNode(name="Node2"), + GaussianNode(name="Node3"), + DiscreteNode(name="Node4"), + DiscreteNode(name="Node5"), + DiscreteNode(name="Node6"), + DiscreteNode(name="Node7"), + ], + ) def test_overwrite_vetrex(self): - self.assertEqual(self.VD.skeleton, {'V': [], 'E': []}) + self.assertEqual(self.VD.skeleton, {"V": [], "E": []}) def reload(): - self.VD.skeleton['V'] = self.VD.vertices - self.VD.skeleton['E'] = [ - ("Node0", - "Node7"), - ("Node0", - "Node1"), - ("Node0", - "Node2"), - ("Node0", - "Node5"), - ("Node4", - "Node2"), - ("Node4", - "Node5"), - ("Node4", - "Node6"), - ("Node4", - "Node3")] + self.VD.skeleton["V"] = self.VD.vertices + self.VD.skeleton["E"] = [ + ("Node0", "Node7"), + ("Node0", "Node1"), + ("Node0", "Node2"), + ("Node0", "Node5"), + ("Node4", "Node2"), + ("Node4", "Node5"), + ("Node4", "Node6"), + ("Node4", "Node3"), + ] self.VD.get_family() data = { "True, True": { - 'Node0': 'MixtureGaussian', - 'Node4': 'Discrete', - 'Node7': 'Logit (LogisticRegression)', - 'Node1': 'MixtureGaussian', - 'Node2': 'ConditionalMixtureGaussian', - 'Node5': 'ConditionalLogit (LogisticRegression)', - 'Node6': 'Discrete', - 'Node3': 'ConditionalMixtureGaussian'}, + "Node0": "MixtureGaussian", + "Node4": "Discrete", + "Node7": "Logit (LogisticRegression)", + "Node1": "MixtureGaussian", + "Node2": "ConditionalMixtureGaussian", + "Node5": "ConditionalLogit 
(LogisticRegression)", + "Node6": "Discrete", + "Node3": "ConditionalMixtureGaussian", + }, "True, False": { - 'Node0': 'MixtureGaussian', - 'Node4': 'Discrete', - 'Node7': 'Discrete', - 'Node1': 'MixtureGaussian', - 'Node2': 'ConditionalMixtureGaussian', - 'Node5': 'Discrete', - 'Node6': 'Discrete', - 'Node3': 'ConditionalMixtureGaussian'}, + "Node0": "MixtureGaussian", + "Node4": "Discrete", + "Node7": "Discrete", + "Node1": "MixtureGaussian", + "Node2": "ConditionalMixtureGaussian", + "Node5": "Discrete", + "Node6": "Discrete", + "Node3": "ConditionalMixtureGaussian", + }, "False, True": { - 'Node0': 'Gaussian (LinearRegression)', - 'Node4': 'Discrete', - 'Node7': 'Logit (LogisticRegression)', - 'Node1': 'Gaussian (LinearRegression)', - 'Node2': 'ConditionalGaussian (LinearRegression)', - 'Node5': 'ConditionalLogit (LogisticRegression)', - 'Node6': 'Discrete', - 'Node3': 'ConditionalGaussian (LinearRegression)'}, + "Node0": "Gaussian (LinearRegression)", + "Node4": "Discrete", + "Node7": "Logit (LogisticRegression)", + "Node1": "Gaussian (LinearRegression)", + "Node2": "ConditionalGaussian (LinearRegression)", + "Node5": "ConditionalLogit (LogisticRegression)", + "Node6": "Discrete", + "Node3": "ConditionalGaussian (LinearRegression)", + }, "False, False": { - 'Node0': 'Gaussian (LinearRegression)', - 'Node4': 'Discrete', - 'Node7': 'Discrete', - 'Node1': 'Gaussian (LinearRegression)', - 'Node2': 'ConditionalGaussian (LinearRegression)', - 'Node5': 'Discrete', - 'Node6': 'Discrete', - 'Node3': 'ConditionalGaussian (LinearRegression)'}} - - for use_mixture, has_logit in itertools.product( - [True, False], repeat=2): + "Node0": "Gaussian (LinearRegression)", + "Node4": "Discrete", + "Node7": "Discrete", + "Node1": "Gaussian (LinearRegression)", + "Node2": "ConditionalGaussian (LinearRegression)", + "Node5": "Discrete", + "Node6": "Discrete", + "Node3": "ConditionalGaussian (LinearRegression)", + }, + } + + for use_mixture, has_logit in itertools.product([True, False], repeat=2): reload() self.VD.overwrite_vertex( has_logit=has_logit, use_mixture=use_mixture, classifier=None, - regressor=None) + regressor=None, + ) self.assertEqual( - { - node.name: node.type for node in self.VD.skeleton["V"]}, + {node.name: node.type for node in self.VD.skeleton["V"]}, data[f"{use_mixture}, {has_logit}"], - msg=f"failed on use_mixture={use_mixture} and has_logit={has_logit}") + msg=f"failed on use_mixture={use_mixture} and has_logit={has_logit}", + ) class TestHillClimbDefiner(unittest.TestCase): def setUp(self): - self.descriptor = {'signs': {'Depth': 'pos', - 'Gross': 'pos', - 'Netpay': 'pos', - 'Permeability': 'pos', - 'Porosity': 'pos'}, - 'types': {'Depth': 'cont', - 'Gross': 'cont', - 'Lithology': 'disc', - 'Netpay': 'cont', - 'Period': 'disc', - 'Permeability': 'cont', - 'Porosity': 'cont', - 'Structural setting': 'disc', - 'Tectonic regime': 'disc'}} + self.descriptor = { + "signs": { + "Depth": "pos", + "Gross": "pos", + "Netpay": "pos", + "Permeability": "pos", + "Porosity": "pos", + }, + "types": { + "Depth": "cont", + "Gross": "cont", + "Lithology": "disc", + "Netpay": "cont", + "Period": "disc", + "Permeability": "cont", + "Porosity": "cont", + "Structural setting": "disc", + "Tectonic regime": "disc", + }, + } self.data = { - 'Tectonic regime': [0, 1, 4, 4, 0, 2, 0, 0, 0, 0, 3, 1, 0, 3, 0, 1, 4, 0, 4, 3, 4, 0, 1, 1, 1, 0, 1, 1, 1, - 1, 1, 0, 0, 3, 2, 3, 2, 3, 3, 3, 0], - 'Period': [3, 1, 4, 4, 1, 1, 0, 0, 3, 5, 3, 9, 0, 5, 0, 3, 5, 3, 2, 4, 4, 1, 5, 7, 7, 7, 1, 1, 1, 1, 4, 6, - 8, 4, 4, 5, 
4, 7, 5, 5, 0], - 'Lithology': [2, 4, 6, 4, 2, 2, 2, 2, 4, 4, 4, 4, 1, 4, 1, 4, 4, 4, 5, 3, 2, 2, 2, 4, 1, 1, 3, 4, 4, 4, 4, - 2, 0, 3, 4, 4, 4, 4, 4, 4, 2], - 'Structural setting': [2, 6, 10, 10, 7, 5, 8, 8, 2, 2, 6, 6, 3, 7, 3, 6, 10, 9, 3, 0, 0, 7, 6, 6, 6, 7, 6, - 6, 6, 6, 8, 2, 9, 4, 7, 6, 1, 8, 4, 4, 3], - 'Gross': [1, 3, 1, 3, 1, 0, 2, 3, 0, 4, 4, 4, 0, 3, 0, 0, 3, 4, 0, 4, 3, 2, 2, 4, 0, 4, 1, 2, 2, 4, 2, 4, 3, - 1, 1, 1, 2, 3, 0, 2, 1], - 'Netpay': [3, 2, 1, 4, 2, 0, 2, 2, 1, 4, 3, 4, 0, 3, 1, 1, 0, 4, 1, 3, 4, 3, 3, 4, 0, 4, 0, 1, 2, 4, 2, 3, - 2, 1, 2, 0, 2, 4, 1, 3, 0], - 'Porosity': [3, 0, 4, 3, 3, 1, 0, 0, 3, 0, 2, 1, 2, 3, 0, 2, 3, 0, 0, 4, 2, 4, 2, 2, 1, 1, 1, 3, 3, 2, 4, 3, - 1, 4, 4, 4, 3, 1, 4, 4, 0], - 'Permeability': [4, 0, 3, 3, 2, 1, 1, 1, 1, 0, 4, 4, 1, 3, 1, 4, 3, 0, 0, 3, 0, 1, 2, 0, 2, 2, 1, 2, 3, 4, - 3, 2, 2, 2, 4, 4, 3, 0, 4, 4, 0], - 'Depth': [1, 4, 3, 4, 1, 3, 1, 3, 1, 4, 3, 4, 1, 2, 1, 4, 0, 4, 0, 0, 3, 2, 3, 2, 2, 3, 4, 2, 2, 4, 1, 0, 2, - 0, 4, 0, 1, 2, 0, 0, 3]} + "Tectonic regime": [ + 0, + 1, + 4, + 4, + 0, + 2, + 0, + 0, + 0, + 0, + 3, + 1, + 0, + 3, + 0, + 1, + 4, + 0, + 4, + 3, + 4, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 3, + 2, + 3, + 2, + 3, + 3, + 3, + 0, + ], + "Period": [ + 3, + 1, + 4, + 4, + 1, + 1, + 0, + 0, + 3, + 5, + 3, + 9, + 0, + 5, + 0, + 3, + 5, + 3, + 2, + 4, + 4, + 1, + 5, + 7, + 7, + 7, + 1, + 1, + 1, + 1, + 4, + 6, + 8, + 4, + 4, + 5, + 4, + 7, + 5, + 5, + 0, + ], + "Lithology": [ + 2, + 4, + 6, + 4, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + 4, + 1, + 4, + 1, + 4, + 4, + 4, + 5, + 3, + 2, + 2, + 2, + 4, + 1, + 1, + 3, + 4, + 4, + 4, + 4, + 2, + 0, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 2, + ], + "Structural setting": [ + 2, + 6, + 10, + 10, + 7, + 5, + 8, + 8, + 2, + 2, + 6, + 6, + 3, + 7, + 3, + 6, + 10, + 9, + 3, + 0, + 0, + 7, + 6, + 6, + 6, + 7, + 6, + 6, + 6, + 6, + 8, + 2, + 9, + 4, + 7, + 6, + 1, + 8, + 4, + 4, + 3, + ], + "Gross": [ + 1, + 3, + 1, + 3, + 1, + 0, + 2, + 3, + 0, + 4, + 4, + 4, + 0, + 3, + 0, + 0, + 3, + 4, + 0, + 4, + 3, + 2, + 2, + 4, + 0, + 4, + 1, + 2, + 2, + 4, + 2, + 4, + 3, + 1, + 1, + 1, + 2, + 3, + 0, + 2, + 1, + ], + "Netpay": [ + 3, + 2, + 1, + 4, + 2, + 0, + 2, + 2, + 1, + 4, + 3, + 4, + 0, + 3, + 1, + 1, + 0, + 4, + 1, + 3, + 4, + 3, + 3, + 4, + 0, + 4, + 0, + 1, + 2, + 4, + 2, + 3, + 2, + 1, + 2, + 0, + 2, + 4, + 1, + 3, + 0, + ], + "Porosity": [ + 3, + 0, + 4, + 3, + 3, + 1, + 0, + 0, + 3, + 0, + 2, + 1, + 2, + 3, + 0, + 2, + 3, + 0, + 0, + 4, + 2, + 4, + 2, + 2, + 1, + 1, + 1, + 3, + 3, + 2, + 4, + 3, + 1, + 4, + 4, + 4, + 3, + 1, + 4, + 4, + 0, + ], + "Permeability": [ + 4, + 0, + 3, + 3, + 2, + 1, + 1, + 1, + 1, + 0, + 4, + 4, + 1, + 3, + 1, + 4, + 3, + 0, + 0, + 3, + 0, + 1, + 2, + 0, + 2, + 2, + 1, + 2, + 3, + 4, + 3, + 2, + 2, + 2, + 4, + 4, + 3, + 0, + 4, + 4, + 0, + ], + "Depth": [ + 1, + 4, + 3, + 4, + 1, + 3, + 1, + 3, + 1, + 4, + 3, + 4, + 1, + 2, + 1, + 4, + 0, + 4, + 0, + 0, + 3, + 2, + 3, + 2, + 2, + 3, + 4, + 2, + 2, + 4, + 1, + 0, + 2, + 0, + 4, + 0, + 1, + 2, + 0, + 0, + 3, + ], + } def test_apply_K2(self): - hcd = HillClimbDefiner(data=pd.DataFrame(self.data), - descriptor=self.descriptor, - scoring_function=("K2",)) - - hcd.apply_K2(data=pd.DataFrame(self.data), - init_edges=None, - progress_bar=False, - remove_init_edges=False, - white_list=None) + hcd = HillClimbDefiner( + data=pd.DataFrame(self.data), + descriptor=self.descriptor, + scoring_function=("K2",), + ) + + hcd.apply_K2( + data=pd.DataFrame(self.data), + init_edges=None, + progress_bar=False, + 
remove_init_edges=False, + white_list=None, + ) right_edges = [ - [ - 'Tectonic regime', 'Structural setting'], [ - 'Tectonic regime', 'Depth'], [ - 'Tectonic regime', 'Netpay'], [ - 'Period', 'Porosity'], [ - 'Period', 'Tectonic regime'], [ - 'Period', 'Netpay'], [ - 'Lithology', 'Permeability'], [ - 'Lithology', 'Period'], [ - 'Lithology', 'Tectonic regime'], [ - 'Structural setting', 'Netpay'], [ - 'Netpay', 'Gross'], [ - 'Porosity', 'Permeability'], [ - 'Porosity', 'Depth'], [ - 'Porosity', 'Netpay'], [ - 'Permeability', 'Netpay']] + ["Tectonic regime", "Structural setting"], + ["Tectonic regime", "Depth"], + ["Tectonic regime", "Netpay"], + ["Period", "Porosity"], + ["Period", "Tectonic regime"], + ["Period", "Netpay"], + ["Lithology", "Permeability"], + ["Lithology", "Period"], + ["Lithology", "Tectonic regime"], + ["Structural setting", "Netpay"], + ["Netpay", "Gross"], + ["Porosity", "Permeability"], + ["Porosity", "Depth"], + ["Porosity", "Netpay"], + ["Permeability", "Netpay"], + ] self.assertEqual(hcd.skeleton["E"], right_edges) def test_apply_group1(self): - hcd = HillClimbDefiner(data=pd.DataFrame(self.data), - descriptor=self.descriptor, - scoring_function=("MI",)) - - hcd.restrict( - data=pd.DataFrame( - self.data), - bl_add=None, - init_nodes=None) + hcd = HillClimbDefiner( + data=pd.DataFrame(self.data), + descriptor=self.descriptor, + scoring_function=("MI",), + ) + + hcd.restrict(data=pd.DataFrame(self.data), bl_add=None, init_nodes=None) hcd.apply_group1( - data=pd.DataFrame( - self.data), + data=pd.DataFrame(self.data), progress_bar=False, init_edges=None, remove_init_edges=False, - white_list=None) + white_list=None, + ) right_edges = [ - [ - 'Lithology', 'Depth'], [ - 'Period', 'Gross'], [ - 'Netpay', 'Gross'], [ - 'Period', 'Netpay'], [ - 'Depth', 'Period'], [ - 'Depth', 'Permeability'], [ - 'Netpay', 'Permeability'], [ - 'Period', 'Porosity'], [ - 'Netpay', 'Porosity'], [ - 'Permeability', 'Structural setting'], [ - 'Netpay', 'Structural setting'], [ - 'Period', 'Tectonic regime'], [ - 'Netpay', 'Tectonic regime']] + ["Lithology", "Depth"], + ["Period", "Gross"], + ["Netpay", "Gross"], + ["Period", "Netpay"], + ["Depth", "Period"], + ["Depth", "Permeability"], + ["Netpay", "Permeability"], + ["Period", "Porosity"], + ["Netpay", "Porosity"], + ["Permeability", "Structural setting"], + ["Netpay", "Structural setting"], + ["Period", "Tectonic regime"], + ["Netpay", "Tectonic regime"], + ] self.assertEqual(hcd.skeleton["E"], right_edges) class TestEvoStructureBuilder(unittest.TestCase): - def setUp(self): self.data = pd.read_csv(r"data/benchmark/asia.csv", index_col=0) - self.descriptor = {'types': {'asia': 'disc', - 'tub': 'disc', - 'smoke': 'disc', - 'lung': 'disc', - 'bronc': 'disc', - 'either': 'disc', - 'xray': 'disc', - 'dysp': 'disc'}, - 'signs': {}} - self.evo_builder = EvoStructureBuilder(data=self.data, - descriptor=self.descriptor, - regressor=None, - has_logit=True, - use_mixture=True) + self.descriptor = { + "types": { + "asia": "disc", + "tub": "disc", + "smoke": "disc", + "lung": "disc", + "bronc": "disc", + "either": "disc", + "xray": "disc", + "dysp": "disc", + }, + "signs": {}, + } + self.evo_builder = EvoStructureBuilder( + data=self.data, + descriptor=self.descriptor, + regressor=None, + has_logit=True, + use_mixture=True, + ) # Replace this with your actual reference DAG self.reference_dag = [ - ('asia', 'tub'), - ('tub', 'either'), - ('smoke', 'lung'), - ('smoke', 'bronc'), - ('lung', 'either'), - ('bronc', 'dysp'), - ('either', 'xray'), - 
('either', 'dysp') - ] + ("asia", "tub"), + ("tub", "either"), + ("smoke", "lung"), + ("smoke", "bronc"), + ("lung", "either"), + ("bronc", "dysp"), + ("either", "xray"), + ("either", "dysp"), + ] def test_build(self): # placeholder kwargs kwargs = {} self.evo_builder.build( - data=self.data, - classifier=None, - regressor=None, - **kwargs) + data=self.data, classifier=None, regressor=None, **kwargs + ) - obtained_dag = self.evo_builder.skeleton['E'] + obtained_dag = self.evo_builder.skeleton["E"] num_edges = len(obtained_dag) - self.assertGreaterEqual(num_edges, 1, msg="Obtained graph should have at least one edge.") + self.assertGreaterEqual( + num_edges, 1, msg="Obtained graph should have at least one edge." + ) - dist = precision_recall(obtained_dag, self.reference_dag)['SHD'] + dist = precision_recall(obtained_dag, self.reference_dag)["SHD"] self.assertLess( dist, 15, - msg=f"Structural Hamming Distance should be less than 15, obtained SHD = {dist}") + msg=f"Structural Hamming Distance should be less than 15, obtained SHD = {dist}", + ) if __name__ == "__main__": diff --git a/tests/test_networks.py b/tests/test_networks.py index 8eb367d..fc1c590 100644 --- a/tests/test_networks.py +++ b/tests/test_networks.py @@ -34,17 +34,27 @@ def assertIsDir(self, path): def prepare_bn_and_data(self): # prepare bn where models were set by set_model hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[ - ['Tectonic regime', 'Period', 'Lithology', 'Structural setting', - 'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth']] + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] + ] encoder = pp.LabelEncoder() discretizer = pp.KBinsDiscretizer( - n_bins=5, - encode='ordinal', - strategy='quantile') + n_bins=5, encode="ordinal", strategy="quantile" + ) p = preprocessors.Preprocessor( - [('encoder', encoder), ('discretizer', discretizer)]) + [("encoder", encoder), ("discretizer", discretizer)] + ) discretized_data, est = p.apply(hack_data) @@ -54,31 +64,36 @@ def prepare_bn_and_data(self): self.bn.add_nodes(info) self.bn.add_edges( - discretized_data, scoring_function=( - "BIC",), progress_bar=False) + discretized_data, scoring_function=("BIC",), progress_bar=False + ) self.bn.set_regressor( regressors={ - 'Depth': CatBoostRegressor( - logging_level="Silent", - allow_writing_files=False), - 'Gross': RandomForestRegressor(), - 'Porosity': DecisionTreeRegressor()}) + "Depth": CatBoostRegressor( + logging_level="Silent", allow_writing_files=False + ), + "Gross": RandomForestRegressor(), + "Porosity": DecisionTreeRegressor(), + } + ) return hack_data class TestBaseNetwork(TestCaseBase): - def setUp(self): self.bn = BaseNetwork() - self.nodes = [GaussianNode(name="Node0"), DiscreteNode(name="Node1"), - GaussianNode(name="Node2")] + self.nodes = [ + GaussianNode(name="Node0"), + DiscreteNode(name="Node1"), + GaussianNode(name="Node2"), + ] self.edges = [("Node0", "Node1"), ("Node1", "Node2")] - self.descriptor = {"types": {"Node0": "cont", "Node1": "disc", - "Node2": "cont"}, - "signs": {"Node0": "pos", "Node1": "neg"}} + self.descriptor = { + "types": {"Node0": "cont", "Node1": "disc", "Node2": "cont"}, + "signs": {"Node0": "pos", "Node1": "neg"}, + } def test_validate(self): descriptor_t = {"types": {"Node0": "Abstract", "Node1": "Abstract"}} @@ -92,7 +107,7 @@ def test_update_descriptor(self): # Nodes out self.bn.nodes = [GaussianNode(name="Node0")] self.bn.update_descriptor() - 
self.assertEqual({'Node0': 'cont'}, self.bn.descriptor["types"]) + self.assertEqual({"Node0": "cont"}, self.bn.descriptor["types"]) # It uses only Vertices Definer, test of this is in builders tests. def test_add_nodes(self): @@ -111,21 +126,20 @@ def __init__(self, name): self.name = name # set without mapping - self.assertIsNone( - self.bn.set_nodes( - nodes=[ - GaussianNode( - name="Node0")])) + self.assertIsNone(self.bn.set_nodes(nodes=[GaussianNode(name="Node0")])) - map = {"types": {"Node0": "cont", "Node1": "disc", "Node2": "cont"}, - "signs": {}} + map = { + "types": {"Node0": "cont", "Node1": "disc", "Node2": "cont"}, + "signs": {}, + } self.bn.set_nodes(nodes=self.nodes, info=map) self.assertEqual(self.bn.nodes, self.nodes) - self.bn.set_nodes(nodes=[MyNode(name="Node-1"), MyNode("Node-2")], - info={"types": {"Node-1": "cont", "Node-2": "disc"}, - "signs": {}}) + self.bn.set_nodes( + nodes=[MyNode(name="Node-1"), MyNode("Node-2")], + info={"types": {"Node-1": "cont", "Node-2": "disc"}, "signs": {}}, + ) self.assertEqual(self.bn.nodes, []) def test_set_edges(self): @@ -135,7 +149,7 @@ def test_set_edges(self): self.bn.set_nodes(nodes=self.nodes, info=self.descriptor) self.bn.set_edges(edges=self.edges) - self.assertEqual([('Node1', 'Node2')], self.bn.edges) + self.assertEqual([("Node1", "Node2")], self.bn.edges) # The test consists of 2 previous methods that are tested, # plus methods of builders, they are tested as well. @@ -176,6 +190,7 @@ def test_save(self): def test_fit_parameters(self): from bamt.networks.base import STORAGE + # here we test only initialization of the folder self.bn.has_logit = True self.bn.nodes = [LogitNode(name="Node0")] @@ -193,103 +208,839 @@ def test_joblib_pathsave(self): self.bn.fit_parameters(hack_data) self.assertGreater( - self.bn.sample( - 100, - progress_bar=False).size, - 0, - "Sampling is broken") + self.bn.sample(100, progress_bar=False).size, 0, "Sampling is broken" + ) - saveloc = self.bn.distributions["Gross"]['hybcprob']["['COMPRESSION']"]['regressor_obj'] + saveloc = self.bn.distributions["Gross"]["hybcprob"]["['COMPRESSION']"][ + "regressor_obj" + ] self.assertIsFile(saveloc) def test_sample(self): data = { - 'Tectonic regime': [0, 1, 4, 4, 0, 2, 0, 0, 0, 0, 3, 1, 0, 3, 0, 1, 4, 0, 4, 3, 4, 0, 1, 1, 1, 0, 1, 1, 1, - 1, 1, 0, 0, 3, 2, 3, 2, 3, 3, 3, 0], - 'Period': [3, 1, 4, 4, 1, 1, 0, 0, 3, 5, 3, 9, 0, 5, 0, 3, 5, 3, 2, 4, 4, 1, 5, 7, 7, 7, 1, 1, 1, 1, 4, 6, - 8, 4, 4, 5, 4, 7, 5, 5, 0], - 'Lithology': [2, 4, 6, 4, 2, 2, 2, 2, 4, 4, 4, 4, 1, 4, 1, 4, 4, 4, 5, 3, 2, 2, 2, 4, 1, 1, 3, 4, 4, 4, 4, - 2, 0, 3, 4, 4, 4, 4, 4, 4, 2], - 'Structural setting': [2, 6, 10, 10, 7, 5, 8, 8, 2, 2, 6, 6, 3, 7, 3, 6, 10, 9, 3, 0, 0, 7, 6, 6, 6, 7, 6, - 6, 6, 6, 8, 2, 9, 4, 7, 6, 1, 8, 4, 4, 3], - 'Gross': [1, 3, 1, 3, 1, 0, 2, 3, 0, 4, 4, 4, 0, 3, 0, 0, 3, 4, 0, 4, 3, 2, 2, 4, 0, 4, 1, 2, 2, 4, 2, 4, 3, - 1, 1, 1, 2, 3, 0, 2, 1], - 'Netpay': [3, 2, 1, 4, 2, 0, 2, 2, 1, 4, 3, 4, 0, 3, 1, 1, 0, 4, 1, 3, 4, 3, 3, 4, 0, 4, 0, 1, 2, 4, 2, 3, - 2, 1, 2, 0, 2, 4, 1, 3, 0], - 'Porosity': [3, 0, 4, 3, 3, 1, 0, 0, 3, 0, 2, 1, 2, 3, 0, 2, 3, 0, 0, 4, 2, 4, 2, 2, 1, 1, 1, 3, 3, 2, 4, 3, - 1, 4, 4, 4, 3, 1, 4, 4, 0], - 'Permeability': [4, 0, 3, 3, 2, 1, 1, 1, 1, 0, 4, 4, 1, 3, 1, 4, 3, 0, 0, 3, 0, 1, 2, 0, 2, 2, 1, 2, 3, 4, - 3, 2, 2, 2, 4, 4, 3, 0, 4, 4, 0], - 'Depth': [1, 4, 3, 4, 1, 3, 1, 3, 1, 4, 3, 4, 1, 2, 1, 4, 0, 4, 0, 0, 3, 2, 3, 2, 2, 3, 4, 2, 2, 4, 1, 0, 2, - 0, 4, 0, 1, 2, 0, 0, 3]} + "Tectonic regime": [ + 0, + 1, + 4, + 4, + 0, + 2, + 0, + 0, + 0, + 
 
         nodes = [
-            DiscreteNode(
-                name='Tectonic regime'), DiscreteNode(
-                name='Period'), DiscreteNode(
-                name='Lithology'), DiscreteNode(
-                name='Structural setting'), DiscreteNode(
-                name='Gross'), DiscreteNode(
-                name='Netpay'), DiscreteNode(
-                name='Porosity'), DiscreteNode(
-                name='Permeability'), DiscreteNode(
-                name='Depth')]
+            DiscreteNode(name="Tectonic regime"),
+            DiscreteNode(name="Period"),
+            DiscreteNode(name="Lithology"),
+            DiscreteNode(name="Structural setting"),
+            DiscreteNode(name="Gross"),
+            DiscreteNode(name="Netpay"),
+            DiscreteNode(name="Porosity"),
+            DiscreteNode(name="Permeability"),
+            DiscreteNode(name="Depth"),
+        ]
 
-        self.bn.set_nodes(
-            nodes, info={
-                "types": {
-                    k.name: "disc" for k in nodes}})
-        self.bn.set_edges([["Tectonic regime", "Period"],
-                           ["Structural setting", "Period"],
-                           ["Tectonic regime", "Lithology"],
-                           ["Lithology", "Structural setting"]])
+        self.bn.set_nodes(nodes, info={"types": {k.name: "disc" for k in nodes}})
+        self.bn.set_edges(
+            [
+                ["Tectonic regime", "Period"],
+                ["Structural setting", "Period"],
+                ["Tectonic regime", "Lithology"],
+                ["Lithology", "Structural setting"],
+            ]
+        )
 
         self.bn.fit_parameters(pd.DataFrame.from_records(data))
-        self.assertIsNotNone(
-            self.bn.sample(
-                50,
-                as_df=False,
-                progress_bar=False))
+        self.assertIsNotNone(self.bn.sample(50, as_df=False, progress_bar=False))
 
     def test_predict(self):
         seq = {
-            'Tectonic regime': [0, 1, 4, 4, 0, 2, 0, 0, 0, 0, 3, 1, 0, 3, 0, 1, 4, 0, 4, 3, 4, 0, 1, 1, 1, 0, 1, 1, 1,
-                                1, 1, 0, 0, 3, 2, 3, 2, 3, 3, 3, 0],
-            'Period': [3, 1, 4, 4, 1, 1, 0, 0, 3, 5, 3, 9, 0, 5, 0, 3, 5, 3, 2, 4, 4, 1, 5, 7, 7, 7, 1, 1, 1, 1, 4, 6,
-                       8, 4, 4, 5, 4, 7, 5, 5, 0],
-            'Lithology': [2, 4, 6, 4, 2, 2, 2, 2, 4, 4, 4, 4, 1, 4, 1, 4, 4, 4, 5, 3, 2, 2, 2, 4, 1, 1, 3, 4, 4, 4, 4,
-                          2, 0, 3, 4, 4, 4, 4, 4, 4, 2],
-            'Structural setting': [2, 6, 10, 10, 7, 5, 8, 8, 2, 2, 6, 6, 3, 7, 3, 6, 10, 9, 3, 0, 0, 7, 6, 6, 6, 7, 6,
-                                   6, 6, 6, 8, 2, 9, 4, 7, 6, 1, 8, 4, 4, 3],
-            'Gross': [1, 3, 1, 3, 1, 0, 2, 3, 0, 4, 4, 4, 0, 3, 0, 0, 3, 4, 0, 4, 3, 2, 2, 4, 0, 4, 1, 2, 2, 4, 2, 4, 3,
-                      1, 1, 1, 2, 3, 0, 2, 1],
-            'Netpay': [3, 2, 1, 4, 2, 0, 2, 2, 1, 4, 3, 4, 0, 3, 1, 1, 0, 4, 1, 3, 4, 3, 3, 4, 0, 4, 0, 1, 2, 4, 2, 3,
-                       2, 1, 2, 0, 2, 4, 1, 3, 0],
-            'Porosity': [3, 0, 4, 3, 3, 1, 0, 0, 3, 0, 2, 1, 2, 3, 0, 2, 3, 0, 0, 4, 2, 4, 2, 2, 1, 1, 1, 3, 3, 2, 4, 3,
-                         1, 4, 4, 4, 3, 1, 4, 4, 0],
-            'Permeability': [4, 0, 3, 3, 2, 1, 1, 1, 1, 0, 4, 4, 1, 3, 1, 4, 3, 0, 0, 3, 0, 1, 2, 0, 2, 2, 1, 2, 3, 4,
-                             3, 2, 2, 2, 4, 4, 3, 0, 4, 4, 0],
-            'Depth': [1, 4, 3, 4, 1, 3, 1, 3, 1, 4, 3, 4, 1, 2, 1, 4, 0, 4, 0, 0, 3, 2, 3, 2, 2, 3, 4, 2, 2, 4, 1, 0, 2,
-                      0, 4, 0, 1, 2, 0, 0, 3]}
+            "Tectonic regime": [
+                0, 1, 4, 4, 0, 2, 0, 0, 0, 0, 3, 1, 0, 3, 0, 1, 4, 0, 4, 3, 4,
+                0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 3, 2, 3, 2, 3, 3, 3, 0,
+            ],
+            "Period": [
+                3, 1, 4, 4, 1, 1, 0, 0, 3, 5, 3, 9, 0, 5, 0, 3, 5, 3, 2, 4, 4,
+                1, 5, 7, 7, 7, 1, 1, 1, 1, 4, 6, 8, 4, 4, 5, 4, 7, 5, 5, 0,
+            ],
+            "Lithology": [
+                2, 4, 6, 4, 2, 2, 2, 2, 4, 4, 4, 4, 1, 4, 1, 4, 4, 4, 5, 3, 2,
+                2, 2, 4, 1, 1, 3, 4, 4, 4, 4, 2, 0, 3, 4, 4, 4, 4, 4, 4, 2,
+            ],
+            "Structural setting": [
+                2, 6, 10, 10, 7, 5, 8, 8, 2, 2, 6, 6, 3, 7, 3, 6, 10, 9, 3, 0, 0,
+                7, 6, 6, 6, 7, 6, 6, 6, 6, 8, 2, 9, 4, 7, 6, 1, 8, 4, 4, 3,
+            ],
+            "Gross": [
+                1, 3, 1, 3, 1, 0, 2, 3, 0, 4, 4, 4, 0, 3, 0, 0, 3, 4, 0, 4, 3,
+                2, 2, 4, 0, 4, 1, 2, 2, 4, 2, 4, 3, 1, 1, 1, 2, 3, 0, 2, 1,
+            ],
+            "Netpay": [
+                3, 2, 1, 4, 2, 0, 2, 2, 1, 4, 3, 4, 0, 3, 1, 1, 0, 4, 1, 3, 4,
+                3, 3, 4, 0, 4, 0, 1, 2, 4, 2, 3, 2, 1, 2, 0, 2, 4, 1, 3, 0,
+            ],
+            "Porosity": [
+                3, 0, 4, 3, 3, 1, 0, 0, 3, 0, 2, 1, 2, 3, 0, 2, 3, 0, 0, 4, 2,
+                4, 2, 2, 1, 1, 1, 3, 3, 2, 4, 3, 1, 4, 4, 4, 3, 1, 4, 4, 0,
+            ],
+            "Permeability": [
+                4, 0, 3, 3, 2, 1, 1, 1, 1, 0, 4, 4, 1, 3, 1, 4, 3, 0, 0, 3, 0,
+                1, 2, 0, 2, 2, 1, 2, 3, 4, 3, 2, 2, 2, 4, 4, 3, 0, 4, 4, 0,
+            ],
+            "Depth": [
+                1, 4, 3, 4, 1, 3, 1, 3, 1, 4, 3, 4, 1, 2, 1, 4, 0, 4, 0, 0, 3,
+                2, 3, 2, 2, 3, 4, 2, 2, 4, 1, 0, 2, 0, 4, 0, 1, 2, 0, 0, 3,
+            ],
+        }
 
         data = pd.DataFrame.from_records(seq)
 
         nodes = [
-            DiscreteNode(
-                name='Tectonic regime'), DiscreteNode(
-                name='Period'), DiscreteNode(
-                name='Lithology'), DiscreteNode(
-                name='Structural setting')]
+            DiscreteNode(name="Tectonic regime"),
+            DiscreteNode(name="Period"),
+            DiscreteNode(name="Lithology"),
+            DiscreteNode(name="Structural setting"),
+        ]
 
-        self.bn.set_nodes(
-            nodes, info={
-                "types": {
"types": { - k.name: "disc" for k in nodes}}) - self.bn.set_edges([["Tectonic regime", "Period"], - ["Structural setting", "Period"], - ["Tectonic regime", "Lithology"], - ["Lithology", "Structural setting"]]) + self.bn.set_nodes(nodes, info={"types": {k.name: "disc" for k in nodes}}) + self.bn.set_edges( + [ + ["Tectonic regime", "Period"], + ["Structural setting", "Period"], + ["Tectonic regime", "Lithology"], + ["Lithology", "Structural setting"], + ] + ) self.bn.fit_parameters(data) - result = self.bn.predict( - data.iloc[:, :3], parall_count=2, progress_bar=False) + result = self.bn.predict(data.iloc[:, :3], parall_count=2, progress_bar=False) self.assertNotEqual(result, {}) for v in result.values(): diff --git a/tests/test_nodes.py b/tests/test_nodes.py index f437725..87d2695 100644 --- a/tests/test_nodes.py +++ b/tests/test_nodes.py @@ -4,6 +4,7 @@ import numpy as np from bamt.nodes import * + logging.getLogger("nodes").setLevel(logging.CRITICAL) @@ -57,13 +58,12 @@ def test_choose_serialization(self): class TestDiscreteNode(unittest.TestCase): - def setUp(self): self.node = discrete_node.DiscreteNode(name="test") self.data_dict = { "node0": np.random.normal(0, 4, 30), - "node1": np.random.normal(0, .1, 30), - "node2": np.random.normal(0, .3, 30), + "node1": np.random.normal(0, 0.1, 30), + "node2": np.random.normal(0, 0.3, 30), "test": np.random.choice(["cat1", "cat2", "cat3"], 30), "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), @@ -75,8 +75,7 @@ def setUp(self): self.node.children = ["node6"] def test_fit_parameters(self): - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertIsNotNone(params["vals"]) self.assertNotEqual(params["vals"], []) @@ -85,31 +84,26 @@ def test_fit_parameters(self): def test_choose(self): pvals = ["cat4", "cat7"] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue([self.node.choose(params, pvals) in params["vals"]]) def test_predict(self): pvals = ["cat4", "cat7"] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue([self.node.predict(params, pvals) in params["vals"]]) - self.assertRaises( - KeyError, self.node.predict, params, [ - "bad", "values"]) + self.assertRaises(KeyError, self.node.predict, params, ["bad", "values"]) class TestGaussianNode(unittest.TestCase): - def setUp(self): self.node = gaussian_node.GaussianNode(name="test") self.data_dict = { "node0": np.random.normal(1, 4, 30), - "node1": np.random.normal(2, .1, 30), - "foster-son": np.random.normal(2.5, .2, 30), - "test": np.random.normal(3, .3, 30), + "node1": np.random.normal(2, 0.1, 30), + "foster-son": np.random.normal(2.5, 0.2, 30), + "test": np.random.normal(3, 0.3, 30), "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), @@ -125,9 +119,11 @@ def test_fit_parameters(self): node_without_parents.children = ["node6", "node5"] params_parents = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + pd.DataFrame.from_records(self.data_dict) + ) params_foster = node_without_parents.fit_parameters( - 
pd.DataFrame.from_records(self.data_dict)) + pd.DataFrame.from_records(self.data_dict) + ) self.assertIsNotNone(params_parents["regressor_obj"]) self.assertTrue(pd.isna(params_parents["mean"])) @@ -139,31 +135,25 @@ def test_fit_parameters(self): def test_choose(self): pvals = [1.05, 1.95] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue(isinstance(self.node.choose(params, pvals), float)) def test_predict(self): pvals = [1.05, 1.95] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue(isinstance(self.node.predict(params, pvals), float)) - self.assertRaises( - ValueError, self.node.predict, params, [ - "bad", "values"]) + self.assertRaises(ValueError, self.node.predict, params, ["bad", "values"]) class TestConditionalGaussianNode(unittest.TestCase): - def setUp(self): - self.node = conditional_gaussian_node.ConditionalGaussianNode( - name="test") + self.node = conditional_gaussian_node.ConditionalGaussianNode(name="test") self.data_dict = { "node0": np.random.normal(1, 4, 30), - "node1": np.random.normal(2, .1, 30), - "foster-son": np.random.normal(2.5, .2, 30), - "test": np.random.normal(3, .3, 30), + "node1": np.random.normal(2, 0.1, 30), + "foster-son": np.random.normal(2.5, 0.2, 30), + "test": np.random.normal(3, 0.3, 30), "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), @@ -177,17 +167,19 @@ def setUp(self): def fit_parameters(self, regressor=None): if regressor is not None: self.node.regressor = regressor - self.node.type = 'ConditionalGaussian' + \ - f" ({type(regressor).__name__})" + self.node.type = "ConditionalGaussian" + f" ({type(regressor).__name__})" node_without_parents = conditional_gaussian_node.ConditionalGaussianNode( - name="foster-son") + name="foster-son" + ) node_without_parents.children = ["node6", "node5"] params_parents = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict))["hybcprob"] + pd.DataFrame.from_records(self.data_dict) + )["hybcprob"] params_foster = node_without_parents.fit_parameters( - pd.DataFrame.from_records(self.data_dict))["hybcprob"]['[]'] + pd.DataFrame.from_records(self.data_dict) + )["hybcprob"]["[]"] self.assertIsNone(params_foster["regressor_obj"]) self.assertIsNotNone(params_foster["mean"]) @@ -220,29 +212,24 @@ def fit_parameters(self, regressor=None): def test_choose(self): pvals = [1.05, 1.95, "cat4", "cat7"] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue(isinstance(self.node.choose(params, pvals), float)) def test_predict(self): pvals = [1.05, 1.95, "cat4", "cat7"] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue(isinstance(self.node.predict(params, pvals), float)) - self.assertRaises( - KeyError, self.node.predict, params, [ - "bad", "values"]) + self.assertRaises(KeyError, self.node.predict, params, ["bad", "values"]) class TestMixtureGaussianNode(unittest.TestCase): - def setUp(self): self.node = mixture_gaussian_node.MixtureGaussianNode(name="test") self.data_dict = { "node0": 
-            "node1": np.random.normal(2, .1, 30),
-            "test": np.random.normal(3, .3, 30),
+            "node1": np.random.normal(2, 0.1, 30),
+            "test": np.random.normal(3, 0.3, 30),
             "node2": np.random.choice(["cat1", "cat2", "cat3"], 30),
             "node4": np.random.choice(["cat4", "cat5", "cat6"], 30),
             "node5": np.random.choice(["cat7", "cat8", "cat9"], 30),
@@ -254,33 +241,30 @@ def setUp(self):
         self.node.children = ["node6"]
 
     def test_fit_parameters(self):
-        params = self.node.fit_parameters(
-            pd.DataFrame.from_records(self.data_dict))
+        params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict))
         self.assertAlmostEqual(sum(params["coef"]), 1, delta=1e-5)
 
     def test_choose(self):
         pvals = [1.05, 1.95]
-        params = self.node.fit_parameters(
-            pd.DataFrame.from_records(self.data_dict))
+        params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict))
 
         self.assertTrue(isinstance(self.node.choose(params, pvals), float))
 
     def test_predict(self):
         pvals = [1.05, 1.95]
-        params = self.node.fit_parameters(
-            pd.DataFrame.from_records(self.data_dict))
+        params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict))
 
         self.assertTrue(isinstance(self.node.predict(params, pvals), float))
 
 
 class TestConditionalMixtureGaussianNode(unittest.TestCase):
-
     def setUp(self):
         self.node = conditional_mixture_gaussian_node.ConditionalMixtureGaussianNode(
-            name="test")
+            name="test"
+        )
         self.data_dict = {
             "node0": np.random.normal(1, 4, 30),
-            "node1": np.random.normal(2, .1, 30),
-            "test": np.random.normal(3, .3, 30),
+            "node1": np.random.normal(2, 0.1, 30),
+            "test": np.random.normal(3, 0.3, 30),
             "node2": np.random.choice(["cat1", "cat2", "cat3"], 30),
             "node4": np.random.choice(["cat4", "cat5", "cat6"], 30),
             "node5": np.random.choice(["cat7", "cat8", "cat9"], 30),
@@ -292,9 +276,9 @@ def setUp(self):
         self.node.children = ["node6"]
 
     def test_fit_parameters(self):
-        params = self.node.fit_parameters(
-            pd.DataFrame.from_records(
-                self.data_dict))["hybcprob"]
+        params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict))[
+            "hybcprob"
+        ]
         report = []
         # a combination's data can sometimes be empty, so we track the share
         # of empty combinations
@@ -307,26 +291,23 @@ def test_fit_parameters(self):
 
     def test_choose(self):
         pvals = [1.05, 1.95, "cat4", "cat7"]
-        params = self.node.fit_parameters(
-            pd.DataFrame.from_records(self.data_dict))
+        params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict))
 
         self.assertTrue(isinstance(self.node.choose(params, pvals), float))
 
     def test_predict(self):
         pvals = [1.05, 1.95, "cat4", "cat7"]
-        params = self.node.fit_parameters(
-            pd.DataFrame.from_records(self.data_dict))
+        params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict))
 
         self.assertTrue(isinstance(self.node.predict(params, pvals), float))
 
 
 class TestLogitNode(unittest.TestCase):
-
     def setUp(self):
         self.node = logit_node.LogitNode(name="test")
         self.data_dict = {
             "node0": np.random.normal(1, 4, 30),
-            "node1": np.random.normal(2, .1, 30),
-            "node2": np.random.normal(3, .3, 30),
+            "node1": np.random.normal(2, 0.1, 30),
+            "node2": np.random.normal(3, 0.3, 30),
             "test": np.random.choice(["cat1", "cat2", "cat3"], 30),
             "node4": np.random.choice(["cat4", "cat5", "cat6"], 30),
             "node5": np.random.choice(["cat7", "cat8", "cat9"], 30),
@@ -338,23 +319,19 @@ def setUp(self):
         self.node.children = ["node6"]
 
     def test_fit_parameters(self):
-        params = self.node.fit_parameters(
-            pd.DataFrame.from_records(self.data_dict))
+        params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict))
         self.assertIsNotNone(params["classifier_obj"])
 
     def test_choose(self):
         pvals = [1.05, 1.95]
-        params = self.node.fit_parameters(
-            pd.DataFrame.from_records(self.data_dict))
+        params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict))
 
         self.assertTrue(self.node.choose(params, pvals) in params["classes"])
 
     def test_predict(self):
         pvals = [1.05, 1.95]
-        params = self.node.fit_parameters(
-            pd.DataFrame.from_records(self.data_dict))
+        params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict))
 
-        self.assertTrue(
-            [self.node.predict(params, pvals) in params["classes"]])
+        self.assertIn(self.node.predict(params, pvals), params["classes"])
 
 
 if __name__ == "__main__":
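
Reviewer note: a few hunks above intentionally go one step beyond formatting and
replace the pattern assertTrue([value in container]) with
assertIn(value, container). The original spelling always passes, because a
single-element list such as [False] is truthy, so those assertions could never
fail. The remaining hunks are meant to be behavior-preserving; a quick way to
check, assuming the project's pinned Black version is installed, is
"black --check tests". A minimal, self-contained sketch of the assertion
difference (test names and values are illustrative, not taken from this patch):

import unittest


class MembershipAssertionDemo(unittest.TestCase):
    def test_list_wrapped_check_always_passes(self):
        # "cat0" is not in the list, so the inner check is False, but
        # [False] is a non-empty list and therefore truthy: this "passes".
        self.assertTrue(["cat0" in ["cat1", "cat2"]])

    def test_assert_in_actually_checks(self):
        # assertIn performs the membership test itself and fails with a
        # clear message when the value is absent.
        self.assertIn("cat1", ["cat1", "cat2"])


if __name__ == "__main__":
    unittest.main()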