From e68d85a5b52b1d04a9065556f8556a14772423d4 Mon Sep 17 00:00:00 2001
From: jkaminski
Date: Sat, 30 Mar 2024 00:30:40 +0300
Subject: [PATCH 01/18] initial commit for new structure

---
 bamt/builders/__init__.py | 1 -
 bamt/builders/builders_base.py | 222 ---
 bamt/builders/composite_builder.py | 293 ----
 bamt/builders/evo_builder.py | 231 ----
 bamt/builders/hc_builder.py | 208 ---
 bamt/{external => dag_optimizers}/__init__.py | 0
 .../constraint}/__init__.py | 0
 .../constraint/constraint_dag_optimizer.py} | 0
 bamt/dag_optimizers/dag_optimizer.py | 10 +
 .../hybrid}/__init__.py | 0
 .../hybrid/hybrid_dag_optimizer.py} | 0
 .../score}/__init__.py | 0
 .../score/bigbravebn.py} | 0
 bamt/dag_optimizers/score/golem_genetic.py | 0
 bamt/dag_optimizers/score/hill_climbing.py | 0
 bamt/dag_optimizers/score/lsevobn.py | 0
 .../score/score_dag_optimizer.py | 9 +
 bamt/display/__init__.py | 9 -
 bamt/display/display.py | 175 ---
 bamt/external/pyBN/__init__.py | 2 -
 .../pyBN/classes/_tests/test_bayesnet.py | 184 ---
 bamt/external/pyBN/classes/bayesnet.py | 393 ------
 bamt/external/pyBN/utils/__init__.py | 4 -
 .../utils/_tests/test_independence_tests.py | 56 -
 .../pyBN/utils/_tests/test_markov_blanket.py | 36 -
 .../pyBN/utils/_tests/test_orient_edges.py | 38 -
 .../pyBN/utils/_tests/test_random_sample.py | 33 -
 bamt/external/pyBN/utils/class_equivalence.py | 33 -
 bamt/external/pyBN/utils/data.py | 24 -
 bamt/external/pyBN/utils/graph.py | 54 -
 .../external/pyBN/utils/independence_tests.py | 181 ---
 .../pyitlib/DiscreteRandomVariableUtils.py | 852 ------------
 bamt/graph/__init__.py | 0
 bamt/graph/dag.py | 24 +
 bamt/graph/graph.py | 30 +
 bamt/log.py | 67 -
 bamt/logging.conf | 51 -
 bamt/mi_entropy_gauss.py | 310 -----
 bamt/models/__init__.py | 0
 .../__init__.py | 0
 .../bayesian_network.py | 19 +
 .../composite_bayesian_network.py | 15 +
 .../continuous_bayesian_network.py | 15 +
 .../discrete_bayesian_network.py | 15 +
 .../hybrid_bayesian_network.py | 15 +
 .../probabilistic_structural_model.py | 14 +
 bamt/networks/__init__.py | 6 -
 bamt/networks/base.py | 998 --------------
 bamt/networks/big_brave_bn.py | 105 --
 bamt/networks/composite_bn.py | 114 --
 bamt/networks/continuous_bn.py | 15 -
 bamt/networks/discrete_bn.py | 15 -
 bamt/networks/hybrid_bn.py | 27 -
 bamt/nodes/__init__.py | 10 -
 bamt/nodes/base.py | 59 -
 bamt/nodes/child_nodes/__init__.py | 0
 bamt/nodes/child_nodes/child_node.py | 6 +
 .../conditional_continuous_node.py | 7 +
 .../child_nodes/conditional_discrete_node.py | 7 +
 .../nodes/child_nodes/node_models/__init__.py | 0
 bamt/nodes/composite_continuous_node.py | 17 -
 bamt/nodes/composite_discrete_node.py | 20 -
 bamt/nodes/conditional_gaussian_node.py | 173 ---
 bamt/nodes/conditional_logit_node.py | 172 ---
 .../conditional_mixture_gaussian_node.py | 227 ----
 bamt/nodes/discrete_node.py | 113 --
 bamt/nodes/gaussian_node.py | 96 --
 bamt/nodes/logit_node.py | 91 --
 bamt/nodes/mixture_gaussian_node.py | 154 ---
 bamt/nodes/node.py | 6 +
 bamt/nodes/root_nodes/__init__.py | 0
 bamt/nodes/root_nodes/continuous_node.py | 6 +
 bamt/nodes/root_nodes/discrete_node.py | 0
 bamt/nodes/root_nodes/root_node.py | 6 +
 bamt/nodes/schema.py | 47 -
 bamt/parameter_estimators/__init__.py | 0
 .../maximum_likelihood_estimator.py | 9 +
 .../parameters_estimator.py | 10 +
 bamt/preprocess/discretization.py | 165 ---
 bamt/preprocess/graph.py | 42 -
 bamt/preprocess/numpy_pandas.py | 57 -
 bamt/preprocessors.py | 126 --
 bamt/redef_HC.py | 316 -----
 bamt/redef_info_scores.py | 179 ---
 bamt/score_functions/__init__.py | 0
 bamt/score_functions/k2_score.py | 9 +
 .../mutual_information_score.py | 9 +
 bamt/score_functions/score_function.py | 10 +
 bamt/utils/EvoUtils.py | 116 --
 bamt/utils/GraphUtils.py | 158 ---
 bamt/utils/MathUtils.py | 161 ---
 bamt/utils/check_utils.py | 9 -
 .../CompositeGeneticOperators.py | 189 ---
 bamt/utils/composite_utils/CompositeModel.py | 17 -
 bamt/utils/composite_utils/MLUtils.py | 126 --
 bamt/utils/composite_utils/lgbm_params.json | 11 -
 bamt/utils/composite_utils/models_repo.json | 447 -------
 bamt/utils/serialization_utils.py | 150 ---
 tests/BigbraveBNTest.py | 59 -
 tests/LoadBN.py | 54 -
 tests/MainTest.py | 137 --
 tests/MetricsTest.py | 61 -
 tests/NetworksTest.py | 722 ----------
 tests/SaveBN.py | 49 -
 tests/main.py | 27 -
 tests/sendingClassifiersLogit.py | 61 -
 tests/sendingRegressors.py | 62 -
 tests/test_builders.py | 728 ----------
 tests/test_graph_analyzer.py | 98 --
 tests/test_networks.py | 1192 -----------------
 tests/test_nodes.py | 464 -------
 tests/test_params.json | 1 -
 112 files changed, 251 insertions(+), 11900 deletions(-)
 delete mode 100644 bamt/builders/__init__.py
 delete mode 100644 bamt/builders/builders_base.py
 delete mode 100644 bamt/builders/composite_builder.py
 delete mode 100644 bamt/builders/evo_builder.py
 delete mode 100644 bamt/builders/hc_builder.py
 rename bamt/{external => dag_optimizers}/__init__.py (100%)
 rename bamt/{external/pyBN/classes => dag_optimizers/constraint}/__init__.py (100%)
 rename bamt/{external/pyBN/classes/_tests/__init__.py => dag_optimizers/constraint/constraint_dag_optimizer.py} (100%)
 create mode 100644 bamt/dag_optimizers/dag_optimizer.py
 rename bamt/{external/pyBN/utils/_tests => dag_optimizers/hybrid}/__init__.py (100%)
 rename bamt/{external/pyitlib/__init__.py => dag_optimizers/hybrid/hybrid_dag_optimizer.py} (100%)
 rename bamt/{preprocess => dag_optimizers/score}/__init__.py (100%)
 rename bamt/{utils/__init__.py => dag_optimizers/score/bigbravebn.py} (100%)
 create mode 100644 bamt/dag_optimizers/score/golem_genetic.py
 create mode 100644 bamt/dag_optimizers/score/hill_climbing.py
 create mode 100644 bamt/dag_optimizers/score/lsevobn.py
 create mode 100644 bamt/dag_optimizers/score/score_dag_optimizer.py
 delete mode 100644 bamt/display/__init__.py
 delete mode 100644 bamt/display/display.py
 delete mode 100644 bamt/external/pyBN/__init__.py
 delete mode 100644 bamt/external/pyBN/classes/_tests/test_bayesnet.py
 delete mode 100644 bamt/external/pyBN/classes/bayesnet.py
 delete mode 100644 bamt/external/pyBN/utils/__init__.py
 delete mode 100644 bamt/external/pyBN/utils/_tests/test_independence_tests.py
 delete mode 100644 bamt/external/pyBN/utils/_tests/test_markov_blanket.py
 delete mode 100644 bamt/external/pyBN/utils/_tests/test_orient_edges.py
 delete mode 100644 bamt/external/pyBN/utils/_tests/test_random_sample.py
 delete mode 100644 bamt/external/pyBN/utils/class_equivalence.py
 delete mode 100644 bamt/external/pyBN/utils/data.py
 delete mode 100644 bamt/external/pyBN/utils/graph.py
 delete mode 100644 bamt/external/pyBN/utils/independence_tests.py
 delete mode 100644 bamt/external/pyitlib/DiscreteRandomVariableUtils.py
 create mode 100644 bamt/graph/__init__.py
 create mode 100644 bamt/graph/dag.py
 create mode 100644 bamt/graph/graph.py
 delete mode 100644 bamt/log.py
 delete mode 100644 bamt/logging.conf
 delete mode 100644 bamt/mi_entropy_gauss.py
 create mode 100644 bamt/models/__init__.py
 create mode 100644 bamt/models/probabilistic_structural_models/__init__.py
 create mode 100644 bamt/models/probabilistic_structural_models/bayesian_network.py
 create mode 100644 bamt/models/probabilistic_structural_models/composite_bayesian_network.py
 create mode 100644 bamt/models/probabilistic_structural_models/continuous_bayesian_network.py
 create mode 100644 bamt/models/probabilistic_structural_models/discrete_bayesian_network.py
 create mode 100644 bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py
 create mode 100644 bamt/models/probabilistic_structural_models/probabilistic_structural_model.py
 delete mode 100644 bamt/networks/__init__.py
 delete mode 100644 bamt/networks/base.py
 delete mode 100644 bamt/networks/big_brave_bn.py
 delete mode 100644 bamt/networks/composite_bn.py
 delete mode 100644 bamt/networks/continuous_bn.py
 delete mode 100644 bamt/networks/discrete_bn.py
 delete mode 100644 bamt/networks/hybrid_bn.py
 delete mode 100644 bamt/nodes/base.py
 create mode 100644 bamt/nodes/child_nodes/__init__.py
 create mode 100644 bamt/nodes/child_nodes/child_node.py
 create mode 100644 bamt/nodes/child_nodes/conditional_continuous_node.py
 create mode 100644 bamt/nodes/child_nodes/conditional_discrete_node.py
 create mode 100644 bamt/nodes/child_nodes/node_models/__init__.py
 delete mode 100644 bamt/nodes/composite_continuous_node.py
 delete mode 100644 bamt/nodes/composite_discrete_node.py
 delete mode 100644 bamt/nodes/conditional_gaussian_node.py
 delete mode 100644 bamt/nodes/conditional_logit_node.py
 delete mode 100644 bamt/nodes/conditional_mixture_gaussian_node.py
 delete mode 100644 bamt/nodes/discrete_node.py
 delete mode 100644 bamt/nodes/gaussian_node.py
 delete mode 100644 bamt/nodes/logit_node.py
 delete mode 100644 bamt/nodes/mixture_gaussian_node.py
 create mode 100644 bamt/nodes/node.py
 create mode 100644 bamt/nodes/root_nodes/__init__.py
 create mode 100644 bamt/nodes/root_nodes/continuous_node.py
 create mode 100644 bamt/nodes/root_nodes/discrete_node.py
 create mode 100644 bamt/nodes/root_nodes/root_node.py
 delete mode 100644 bamt/nodes/schema.py
 create mode 100644 bamt/parameter_estimators/__init__.py
 create mode 100644 bamt/parameter_estimators/maximum_likelihood_estimator.py
 create mode 100644 bamt/parameter_estimators/parameters_estimator.py
 delete mode 100644 bamt/preprocess/discretization.py
 delete mode 100644 bamt/preprocess/graph.py
 delete mode 100644 bamt/preprocess/numpy_pandas.py
 delete mode 100644 bamt/preprocessors.py
 delete mode 100644 bamt/redef_HC.py
 delete mode 100644 bamt/redef_info_scores.py
 create mode 100644 bamt/score_functions/__init__.py
 create mode 100644 bamt/score_functions/k2_score.py
 create mode 100644 bamt/score_functions/mutual_information_score.py
 create mode 100644 bamt/score_functions/score_function.py
 delete mode 100644 bamt/utils/EvoUtils.py
 delete mode 100644 bamt/utils/GraphUtils.py
 delete mode 100644 bamt/utils/MathUtils.py
 delete mode 100644 bamt/utils/check_utils.py
 delete mode 100644 bamt/utils/composite_utils/CompositeGeneticOperators.py
 delete mode 100644 bamt/utils/composite_utils/CompositeModel.py
 delete mode 100644 bamt/utils/composite_utils/MLUtils.py
 delete mode 100644 bamt/utils/composite_utils/lgbm_params.json
 delete mode 100644 bamt/utils/composite_utils/models_repo.json
 delete mode 100644 bamt/utils/serialization_utils.py
 delete mode 100644 tests/BigbraveBNTest.py
 delete mode 100644 tests/LoadBN.py
 delete mode 100644 tests/MainTest.py
 delete mode 100644 tests/MetricsTest.py
 delete mode 100644 tests/NetworksTest.py
 delete mode 100644 tests/SaveBN.py
 delete mode 100644 tests/main.py
 delete mode 100644
tests/sendingClassifiersLogit.py delete mode 100644 tests/sendingRegressors.py delete mode 100644 tests/test_builders.py delete mode 100644 tests/test_graph_analyzer.py delete mode 100644 tests/test_networks.py delete mode 100644 tests/test_nodes.py delete mode 100644 tests/test_params.json diff --git a/bamt/builders/__init__.py b/bamt/builders/__init__.py deleted file mode 100644 index d886ad6..0000000 --- a/bamt/builders/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__all__ = ["builders_base", "evo_builder", "hc_builder"] diff --git a/bamt/builders/builders_base.py b/bamt/builders/builders_base.py deleted file mode 100644 index e04bbff..0000000 --- a/bamt/builders/builders_base.py +++ /dev/null @@ -1,222 +0,0 @@ -import itertools -from typing import Dict, List, Optional, Tuple, Callable, TypedDict, Sequence, Union - -from pandas import DataFrame - -from bamt.log import logger_builder -from bamt.nodes.conditional_gaussian_node import ConditionalGaussianNode -from bamt.nodes.conditional_logit_node import ConditionalLogitNode -from bamt.nodes.conditional_mixture_gaussian_node import ConditionalMixtureGaussianNode -from bamt.nodes.discrete_node import DiscreteNode -from bamt.nodes.gaussian_node import GaussianNode -from bamt.nodes.logit_node import LogitNode -from bamt.nodes.mixture_gaussian_node import MixtureGaussianNode -from bamt.utils import GraphUtils as gru - - -class ParamDict(TypedDict, total=False): - init_edges: Optional[Sequence[str]] - init_nodes: Optional[List[str]] - remove_init_edges: bool - white_list: Optional[Tuple[str, str]] - bl_add: Optional[List[str]] - - -class StructureBuilder(object): - """ - Base Class for Structure Builder. - It can restrict nodes defined by RESTRICTIONS - """ - - def __init__(self, descriptor: Dict[str, Dict[str, str]]): - """ - :param descriptor: a dict with types and signs of nodes - Attributes: - black_list: a list with restricted connections; - """ - self.skeleton = {"V": [], "E": []} - self.descriptor = descriptor - - self.has_logit = bool - - self.black_list = None - - def restrict( - self, - data: DataFrame, - init_nodes: Optional[List[str]], - bl_add: Optional[List[str]], - ): - """ - :param data: data to deal with - :param init_nodes: nodes to begin with (thus they have no parents) - :param bl_add: additional vertices - """ - node_type = self.descriptor["types"] - blacklist = [] - datacol = data.columns.to_list() - - if not self.has_logit: - # Has_logit flag allows BN building edges between cont and disc - RESTRICTIONS = [("cont", "disc"), ("cont", "disc_num")] - for x, y in itertools.product(datacol, repeat=2): - if x != y: - if (node_type[x], node_type[y]) in RESTRICTIONS: - blacklist.append((x, y)) - else: - self.black_list = [] - if init_nodes: - blacklist += [(x, y) for x in datacol for y in init_nodes if x != y] - if bl_add: - blacklist = blacklist + bl_add - self.black_list = blacklist - - def get_family(self): - """ - A function that updates a skeleton; - """ - if not self.skeleton["V"]: - logger_builder.error("Vertex list is None") - return None - if not self.skeleton["E"]: - logger_builder.error("Edges list is None") - return None - for node_instance in self.skeleton["V"]: - node = node_instance.name - children = [] - parents = [] - for edge in self.skeleton["E"]: - if node in edge: - if edge.index(node) == 0: - children.append(edge[1]) - if edge.index(node) == 1: - parents.append(edge[0]) - - disc_parents = [] - cont_parents = [] - for parent in parents: - if self.descriptor["types"][parent] in ["disc", "disc_num"]: - 
disc_parents.append(parent) - else: - cont_parents.append(parent) - - id = self.skeleton["V"].index(node_instance) - self.skeleton["V"][id].disc_parents = disc_parents - self.skeleton["V"][id].cont_parents = cont_parents - self.skeleton["V"][id].children = children - - ordered = gru.toporder(self.skeleton["V"], self.skeleton["E"]) - not_ordered = [node.name for node in self.skeleton["V"]] - mask = [not_ordered.index(name) for name in ordered] - self.skeleton["V"] = [self.skeleton["V"][i] for i in mask] - - -class VerticesDefiner(StructureBuilder): - """ - Main class for defining vertices - """ - - def __init__( - self, descriptor: Dict[str, Dict[str, str]], regressor: Optional[object] - ): - """ - Automatically creates a list of nodes - """ - super(VerticesDefiner, self).__init__(descriptor=descriptor) - # Notice that vertices are used only by Builders - self.vertices = [] - - node = None - # LEVEL 1: Define a general type of node: Discrete or Gaussian - for vertex, type in self.descriptor["types"].items(): - if type in ["disc_num", "disc"]: - node = DiscreteNode(name=vertex) - elif type == "cont": - node = GaussianNode(name=vertex, regressor=regressor) - else: - msg = f"""First stage of automatic vertex detection failed on {vertex} due TypeError ({type}). - Set vertex manually (by calling set_nodes()) or investigate the error.""" - logger_builder.error(msg) - continue - - self.vertices.append(node) - - def overwrite_vertex( - self, - has_logit: bool, - use_mixture: bool, - classifier: Optional[Callable], - regressor: Optional[Callable], - ): - """ - Level 2: Redefined nodes according structure (parents) - :param classifier: an object to pass into logit, condLogit nodes - :param regressor: an object to pass into gaussian nodes - :param has_logit allows edges from cont to disc nodes - :param use_mixture allows using Mixture - """ - for node_instance in self.vertices: - node = node_instance - if has_logit: - if "Discrete" in node_instance.type: - if node_instance.cont_parents: - if not node_instance.disc_parents: - node = LogitNode( - name=node_instance.name, classifier=classifier - ) - - elif node_instance.disc_parents: - node = ConditionalLogitNode( - name=node_instance.name, classifier=classifier - ) - - if use_mixture: - if "Gaussian" in node_instance.type: - if not node_instance.disc_parents: - node = MixtureGaussianNode(name=node_instance.name) - elif node_instance.disc_parents: - node = ConditionalMixtureGaussianNode(name=node_instance.name) - else: - continue - else: - if "Gaussian" in node_instance.type: - if node_instance.disc_parents: - node = ConditionalGaussianNode( - name=node_instance.name, regressor=regressor - ) - else: - continue - - if node_instance == node: - continue - - id = self.skeleton["V"].index(node_instance) - node.disc_parents = node_instance.disc_parents - node.cont_parents = node_instance.cont_parents - node.children = node_instance.children - self.skeleton["V"][id] = node - - -class EdgesDefiner(StructureBuilder): - def __init__(self, descriptor: Dict[str, Dict[str, str]]): - super(EdgesDefiner, self).__init__(descriptor) - - -class BaseDefiner(VerticesDefiner, EdgesDefiner): - def __init__( - self, - data: DataFrame, - descriptor: Dict[str, Dict[str, str]], - scoring_function: Union[Tuple[str, Callable], Tuple[str]], - regressor: Optional[object] = None, - ): - self.scoring_function = scoring_function - self.params = { - "init_edges": None, - "init_nodes": None, - "remove_init_edges": True, - "white_list": None, - "bl_add": None, - } - 
super().__init__(descriptor, regressor=regressor) - self.optimizer = None # will be defined in subclasses diff --git a/bamt/builders/composite_builder.py b/bamt/builders/composite_builder.py deleted file mode 100644 index 764363a..0000000 --- a/bamt/builders/composite_builder.py +++ /dev/null @@ -1,293 +0,0 @@ -from datetime import timedelta -from typing import Dict, Optional, List, Tuple, Callable - -from golem.core.adapter import DirectAdapter -from golem.core.dag.verification_rules import has_no_cycle, has_no_self_cycled_nodes -from golem.core.log import Log -from golem.core.optimisers.genetic.gp_optimizer import EvoGraphOptimizer -from golem.core.optimisers.genetic.gp_params import GPAlgorithmParameters -from golem.core.optimisers.genetic.operators.crossover import CrossoverTypesEnum -from golem.core.optimisers.genetic.operators.inheritance import GeneticSchemeTypesEnum -from golem.core.optimisers.genetic.operators.selection import SelectionTypesEnum -from golem.core.optimisers.objective import Objective, ObjectiveEvaluate -from golem.core.optimisers.optimization_parameters import GraphRequirements -from golem.core.optimisers.optimizer import GraphGenerationParams -from pandas import DataFrame -from sklearn import preprocessing - -import bamt.preprocessors as pp -from bamt.builders.builders_base import VerticesDefiner, EdgesDefiner -from bamt.log import logger_builder -from bamt.nodes.composite_continuous_node import CompositeContinuousNode -from bamt.nodes.composite_discrete_node import CompositeDiscreteNode -from bamt.nodes.discrete_node import DiscreteNode -from bamt.nodes.gaussian_node import GaussianNode -from bamt.utils import EvoUtils as evo -from bamt.utils.composite_utils import CompositeGeneticOperators -from bamt.utils.composite_utils.CompositeModel import CompositeModel, CompositeNode - - -class CompositeDefiner(VerticesDefiner, EdgesDefiner): - """ - Object that might take additional methods to decompose structure builder class - """ - - def __init__( - self, - descriptor: Dict[str, Dict[str, str]], - regressor: Optional[object] = None, - ): - super().__init__(descriptor, regressor) - - # Notice that vertices are used only by Builders - self.vertices = [] - - # LEVEL 1: Define a general type of node: Discrete or Сontinuous - for vertex, type in self.descriptor["types"].items(): - if type in ["disc_num", "disc"]: - node = CompositeDiscreteNode(name=vertex) - elif type == "cont": - node = CompositeContinuousNode(name=vertex, regressor=regressor) - else: - msg = f"""First stage of automatic vertex detection failed on {vertex} due TypeError ({type}). - Set vertex manually (by calling set_nodes()) or investigate the error.""" - logger_builder.error(msg) - continue - - self.vertices.append(node) - - -class CompositeStructureBuilder(CompositeDefiner): - """ - This class uses an evolutionary algorithm based on GOLEM to generate a Directed Acyclic Graph (DAG) that represents - the structure of a Composite Bayesian Network. - - Attributes: - data (DataFrame): Input data used to build the structure. - descriptor (dict): Descriptor describing node types and signs. 
- """ - - def __init__( - self, - data: DataFrame, - descriptor: Dict[str, Dict[str, str]], - regressor: Optional[object], - ): - super(CompositeStructureBuilder, self).__init__( - descriptor=descriptor, regressor=regressor - ) - self.data = data - self.parent_models_dict = {} - self.descriptor = descriptor - self.regressor = regressor - self.params = { - "init_edges": None, - "init_nodes": None, - "remove_init_edges": True, - "white_list": None, - "bl_add": None, - } - self.default_n_jobs = -1 - self.default_pop_size = 15 - self.default_crossover_prob = 0.9 - self.default_mutation_prob = 0.8 - self.default_max_arity = 100 - self.default_max_depth = 100 - self.default_timeout = 180 - self.default_num_of_generations = 50 - self.default_early_stopping_iterations = 50 - self.objective_metric = CompositeGeneticOperators.composite_metric - self.default_crossovers = [ - CrossoverTypesEnum.exchange_edges, - CrossoverTypesEnum.exchange_parents_one, - CrossoverTypesEnum.exchange_parents_both, - CompositeGeneticOperators.custom_crossover_all_model, - ] - self.default_mutations = [ - CompositeGeneticOperators.custom_mutation_add_structure, - CompositeGeneticOperators.custom_mutation_delete_structure, - CompositeGeneticOperators.custom_mutation_reverse_structure, - CompositeGeneticOperators.custom_mutation_add_model, - ] - self.default_selection = [SelectionTypesEnum.tournament] - self.default_constraints = [ - has_no_self_cycled_nodes, - has_no_cycle, - evo.has_no_duplicates, - ] - self.verbose = True - self.logging_level = 50 - - def overwrite_vertex( - self, - regressor: Optional[Callable], - ): - for node_instance in self.vertices: - node = node_instance - if ( - len(node_instance.cont_parents + node_instance.disc_parents) < 1 - and type(node_instance).__name__ == "CompositeContinuousNode" - ): - node = GaussianNode(name=node_instance.name, regressor=regressor) - elif ( - len(node_instance.cont_parents + node_instance.disc_parents) < 1 - and type(node_instance).__name__ == "CompositeDiscreteNode" - ): - node = DiscreteNode(name=node_instance.name) - else: - continue - id_node = self.skeleton["V"].index(node_instance) - node.disc_parents = node_instance.disc_parents - node.cont_parents = node_instance.cont_parents - node.children = node_instance.children - self.skeleton["V"][id_node] = node - - def build( - self, - data: DataFrame, - classifier: Optional[object], - regressor: Optional[object], - **kwargs, - ): - """ - Calls the search method to execute all the evolutionary computations. - - Args: - data (DataFrame): The data from which to build the structure. - classifier (Optional[object]): A classification model for discrete nodes. - regressor (Optional[object]): A regression model for continuous nodes. - """ - best_graph_edge_list, parent_models = self.search(data, **kwargs) - - # Convert the best graph to the format used by the Bayesian Network - self.skeleton["V"] = self.vertices - self.skeleton["E"] = best_graph_edge_list - self.parent_models_dict = parent_models - - self.get_family() - self.overwrite_vertex(regressor=regressor) - - def search(self, data: DataFrame, **kwargs) -> [List[Tuple[str, str]], Dict]: - """ - Executes all the evolutionary computations and returns the best graph's edge list. - - Args: - data (DataFrame): The data from which to build the structure. - - Returns: - best_graph_edge_list (List[Tuple[str, str]]): The edge list of the best graph found by the search. 
- """ - # Get the list of node names - vertices = list(data.columns) - - encoder = preprocessing.LabelEncoder() - p = pp.Preprocessor([("encoder", encoder)]) - preprocessed_data, _ = p.apply(data) - - # Create the initial population - initial = kwargs.get( - "custom_initial_structure", - [ - CompositeModel( - nodes=[ - CompositeNode( - nodes_from=None, - content={ - "name": vertex, - "type": p.nodes_types[vertex], - "parent_model": None, - }, - ) - for vertex in vertices - ] - ) - ], - ) - - # Define the requirements for the evolutionary algorithm - requirements = GraphRequirements( - max_arity=kwargs.get("max_arity", self.default_max_arity), - max_depth=kwargs.get("max_depth", self.default_max_depth), - num_of_generations=kwargs.get( - "num_of_generations", self.default_num_of_generations - ), - timeout=timedelta(minutes=kwargs.get("timeout", self.default_timeout)), - early_stopping_iterations=kwargs.get( - "early_stopping_iterations", self.default_early_stopping_iterations - ), - n_jobs=kwargs.get("n_jobs", self.default_n_jobs), - ) - - # Set the parameters for the evolutionary algorithm - optimizer_parameters = GPAlgorithmParameters( - pop_size=kwargs.get("pop_size", self.default_pop_size), - crossover_prob=kwargs.get("crossover_prob", self.default_crossover_prob), - mutation_prob=kwargs.get("mutation_prob", self.default_mutation_prob), - genetic_scheme_type=GeneticSchemeTypesEnum.steady_state, - mutation_types=kwargs.get("custom_mutations", self.default_mutations), - crossover_types=kwargs.get("custom_crossovers", self.default_crossovers), - selection_types=kwargs.get("selection_type", self.default_selection), - ) - - # Set the adapter for the conversion between the graph and the data - # structures used by the optimizer - adapter = DirectAdapter( - base_graph_class=CompositeModel, base_node_class=CompositeNode - ) - - # Set the constraints for the graph - - constraints = kwargs.get("custom_constraints", []) - - constraints.extend(self.default_constraints) - - if kwargs.get("blacklist", None) is not None: - constraints.append(evo.has_no_blacklist_edges) - if kwargs.get("whitelist", None) is not None: - constraints.append(evo.has_only_whitelist_edges) - - graph_generation_params = GraphGenerationParams( - adapter=adapter, rules_for_constraint=constraints - ) - - # Define the objective function to optimize - objective = Objective( - {"custom": kwargs.get("custom_metric", self.objective_metric)} - ) - - # Initialize the optimizer - optimizer = EvoGraphOptimizer( - objective=objective, - initial_graphs=initial, - requirements=requirements, - graph_generation_params=graph_generation_params, - graph_optimizer_params=optimizer_parameters, - ) - - # Define the function to evaluate the objective function - objective_eval = ObjectiveEvaluate(objective, data=preprocessed_data) - - if not kwargs.get("verbose", self.verbose): - Log().reset_logging_level(logging_level=50) - - # Run the optimization - optimized_graph = optimizer.optimise(objective_eval)[0] - - parent_models = self._get_parent_models(optimized_graph) - - # Get the best graph - best_graph_edge_list = optimized_graph.operator.get_edges() - best_graph_edge_list = self._convert_to_strings(best_graph_edge_list) - - return best_graph_edge_list, parent_models - - @staticmethod - def _convert_to_strings(nested_list): - return [[str(item) for item in inner_list] for inner_list in nested_list] - - @staticmethod - def _get_parent_models(graph): - parent_models = {} - for node in graph.nodes: - parent_models[node.content["name"]] = 
node.content["parent_model"] - return parent_models diff --git a/bamt/builders/evo_builder.py b/bamt/builders/evo_builder.py deleted file mode 100644 index 2921347..0000000 --- a/bamt/builders/evo_builder.py +++ /dev/null @@ -1,231 +0,0 @@ -from datetime import timedelta -from typing import Dict, Optional, List, Tuple - -from golem.core.adapter import DirectAdapter -from golem.core.dag.verification_rules import has_no_cycle, has_no_self_cycled_nodes -from golem.core.log import Log -from golem.core.optimisers.genetic.gp_optimizer import EvoGraphOptimizer -from golem.core.optimisers.genetic.gp_params import GPAlgorithmParameters -from golem.core.optimisers.genetic.operators.crossover import CrossoverTypesEnum -from golem.core.optimisers.genetic.operators.inheritance import GeneticSchemeTypesEnum -from golem.core.optimisers.genetic.operators.selection import SelectionTypesEnum -from golem.core.optimisers.objective import Objective, ObjectiveEvaluate -from golem.core.optimisers.optimization_parameters import GraphRequirements -from golem.core.optimisers.optimizer import GraphGenerationParams -from pandas import DataFrame - -from bamt.builders.builders_base import BaseDefiner -from bamt.utils import EvoUtils as evo - - -class EvoDefiner(BaseDefiner): - """ - Object that might take additional methods to decompose structure builder class - """ - - def __init__( - self, - data: DataFrame, - descriptor: Dict[str, Dict[str, str]], - regressor: Optional[object] = None, - ): - super().__init__(data, descriptor, regressor) - - -class EvoStructureBuilder(EvoDefiner): - """ - This class uses an evolutionary algorithm based on GOLEM to generate a Directed Acyclic Graph (DAG) that represents - the structure of a Bayesian Network. - - Attributes: - data (DataFrame): Input data used to build the structure. - descriptor (dict): Descriptor describing node types and signs. - regressor (object): A regression model for continuous nodes. - has_logit (bool): Indicates whether a logit link function should be used. - use_mixture (bool): Indicates whether a mixture model should be used. 
- """ - - def __init__( - self, - data: DataFrame, - descriptor: Dict[str, Dict[str, str]], - regressor: Optional[object], - has_logit: bool, - use_mixture: bool, - ): - super(EvoStructureBuilder, self).__init__( - data=data, descriptor=descriptor, regressor=regressor - ) - self.data = data - self.descriptor = descriptor - self.has_logit = has_logit - self.use_mixture = use_mixture - self.regressor = regressor - self.params = { - "init_edges": None, - "init_nodes": None, - "remove_init_edges": True, - "white_list": None, - "bl_add": None, - } - self.default_n_jobs = -1 - self.default_pop_size = 15 - self.default_crossover_prob = 0.9 - self.default_mutation_prob = 0.8 - self.default_max_arity = 100 - self.default_max_depth = 100 - self.default_timeout = 180 - self.default_num_of_generations = 50 - self.default_early_stopping_iterations = 50 - self.logging_level = 50 - self.objective_metric = evo.K2_metric - self.default_crossovers = [ - CrossoverTypesEnum.exchange_edges, - CrossoverTypesEnum.exchange_parents_one, - CrossoverTypesEnum.exchange_parents_both, - ] - self.default_mutations = [ - evo.custom_mutation_add, - evo.custom_mutation_delete, - evo.custom_mutation_reverse, - ] - self.default_selection = [SelectionTypesEnum.tournament] - self.default_constraints = [ - has_no_self_cycled_nodes, - has_no_cycle, - evo.has_no_duplicates, - ] - self.verbose = True - - def build( - self, - data: DataFrame, - classifier: Optional[object], - regressor: Optional[object], - **kwargs - ): - """ - Calls the search method to execute all the evolutionary computations. - - Args: - data (DataFrame): The data from which to build the structure. - classifier (Optional[object]): A classification model for discrete nodes. - regressor (Optional[object]): A regression model for continuous nodes. - """ - best_graph_edge_list = self.search(data, **kwargs) - - # Convert the best graph to the format used by the Bayesian Network - self.skeleton["V"] = self.vertices - self.skeleton["E"] = best_graph_edge_list - - self.get_family() - self.overwrite_vertex( - has_logit=self.has_logit, - use_mixture=self.use_mixture, - classifier=classifier, - regressor=regressor, - ) - - def search(self, data: DataFrame, **kwargs) -> List[Tuple[str, str]]: - """ - Executes all the evolutionary computations and returns the best graph's edge list. - - Args: - data (DataFrame): The data from which to build the structure. - - Returns: - best_graph_edge_list (List[Tuple[str, str]]): The edge list of the best graph found by the search. 
- """ - # Get the list of node names - nodes_types = data.columns.to_list() - - # Create the initial population - initial = [ - evo.CustomGraphModel( - nodes=kwargs.get( - "init_nodes", - [evo.CustomGraphNode(node_type) for node_type in nodes_types], - ) - ) - ] - - # Define the requirements for the evolutionary algorithm - requirements = GraphRequirements( - max_arity=kwargs.get("max_arity", self.default_max_arity), - max_depth=kwargs.get("max_depth", self.default_max_depth), - num_of_generations=kwargs.get( - "num_of_generations", self.default_num_of_generations - ), - timeout=timedelta(minutes=kwargs.get("timeout", self.default_timeout)), - early_stopping_iterations=kwargs.get( - "early_stopping_iterations", self.default_early_stopping_iterations - ), - n_jobs=kwargs.get("n_jobs", self.default_n_jobs), - ) - - # Set the parameters for the evolutionary algorithm - optimizer_parameters = GPAlgorithmParameters( - pop_size=kwargs.get("pop_size", self.default_pop_size), - crossover_prob=kwargs.get("crossover_prob", self.default_crossover_prob), - mutation_prob=kwargs.get("mutation_prob", self.default_mutation_prob), - genetic_scheme_type=GeneticSchemeTypesEnum.steady_state, - mutation_types=kwargs.get("custom_mutations", self.default_mutations), - crossover_types=kwargs.get("custom_crossovers", self.default_crossovers), - selection_types=kwargs.get("selection_type", self.default_selection), - ) - - # Set the adapter for the conversion between the graph and the data - # structures used by the optimizer - adapter = DirectAdapter( - base_graph_class=evo.CustomGraphModel, base_node_class=evo.CustomGraphNode - ) - - # Set the constraints for the graph - - constraints = kwargs.get("custom_constraints", []) - - constraints.extend(self.default_constraints) - - if kwargs.get("blacklist", None) is not None: - constraints.append(evo.has_no_blacklist_edges) - if kwargs.get("whitelist", None) is not None: - constraints.append(evo.has_only_whitelist_edges) - - graph_generation_params = GraphGenerationParams( - adapter=adapter, - rules_for_constraint=constraints, - available_node_types=nodes_types, - ) - - # Define the objective function to optimize - objective = Objective( - {"custom": kwargs.get("custom_metric", self.objective_metric)} - ) - - # Initialize the optimizer - optimizer = EvoGraphOptimizer( - objective=objective, - initial_graphs=initial, - requirements=requirements, - graph_generation_params=graph_generation_params, - graph_optimizer_params=optimizer_parameters, - ) - - # Define the function to evaluate the objective function - objective_eval = ObjectiveEvaluate(objective, data=data) - - if not kwargs.get("verbose", self.verbose): - Log().reset_logging_level(logging_level=50) - - # Run the optimization - optimized_graph = optimizer.optimise(objective_eval)[0] - - # Get the best graph - best_graph_edge_list = optimized_graph.operator.get_edges() - best_graph_edge_list = self._convert_to_strings(best_graph_edge_list) - - return best_graph_edge_list - - @staticmethod - def _convert_to_strings(nested_list): - return [tuple([str(item) for item in inner_list]) for inner_list in nested_list] diff --git a/bamt/builders/hc_builder.py b/bamt/builders/hc_builder.py deleted file mode 100644 index 3f92e14..0000000 --- a/bamt/builders/hc_builder.py +++ /dev/null @@ -1,208 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Callable, Union - -from pandas import DataFrame -from pgmpy.base import DAG -from pgmpy.estimators import HillClimbSearch - -from bamt.builders.builders_base import ParamDict, 
BaseDefiner -from bamt.log import logger_builder -from bamt.redef_HC import hc as hc_method -from bamt.utils import GraphUtils as gru - - -class HillClimbDefiner(BaseDefiner): - """ - Object to define structure and pass it into skeleton - """ - - def __init__( - self, - data: DataFrame, - descriptor: Dict[str, Dict[str, str]], - scoring_function: Union[Tuple[str, Callable], Tuple[str]], - regressor: Optional[object] = None, - ): - super().__init__(data, descriptor, scoring_function, regressor) - self.optimizer = HillClimbSearch(data) - - def apply_K2( - self, - data: DataFrame, - init_edges: Optional[List[Tuple[str, str]]], - progress_bar: bool, - remove_init_edges: bool, - white_list: Optional[List[Tuple[str, str]]], - ): - """ - :param init_edges: list of tuples, a graph to start learning with - :param remove_init_edges: allows changes in a model defined by user - :param data: user's data - :param progress_bar: verbose regime - :param white_list: list of allowed edges - """ - if not all([i in ["disc", "disc_num"] for i in gru.nodes_types(data).values()]): - logger_builder.error( - f"K2 deals only with discrete data. Continuous data: {[col for col, type in gru.nodes_types(data).items() if type not in ['disc', 'disc_num']]}" - ) - return None - - if len(self.scoring_function) != 2: - from pgmpy.estimators import K2Score - - scoring_function = K2Score - else: - scoring_function = self.scoring_function[1] - - if not init_edges: - best_model = self.optimizer.estimate( - scoring_method=scoring_function(data), - black_list=self.black_list, - white_list=white_list, - show_progress=progress_bar, - ) - else: - if remove_init_edges: - startdag = DAG() - nodes = [str(v) for v in self.vertices] - startdag.add_nodes_from(nodes=nodes) - startdag.add_edges_from(ebunch=init_edges) - best_model = self.optimizer.estimate( - black_list=self.black_list, - white_list=white_list, - start_dag=startdag, - show_progress=False, - ) - else: - best_model = self.optimizer.estimate( - black_list=self.black_list, - white_list=white_list, - fixed_edges=init_edges, - show_progress=False, - ) - - structure = [list(x) for x in list(best_model.edges())] - self.skeleton["E"] = structure - - def apply_group1( - self, - data: DataFrame, - progress_bar: bool, - init_edges: Optional[List[Tuple[str, str]]], - remove_init_edges: bool, - white_list: Optional[List[Tuple[str, str]]], - ): - """ - This method implements the group of scoring functions. - Group: - "MI" - Mutual Information, - "LL" - Log Likelihood, - "BIC" - Bayesian Information Criteria, - "AIC" - Akaike information Criteria. 
- """ - column_name_dict = dict([(n.name, i) for i, n in enumerate(self.vertices)]) - blacklist_new = [] - for pair in self.black_list: - blacklist_new.append((column_name_dict[pair[0]], column_name_dict[pair[1]])) - if white_list: - white_list_old = white_list[:] - white_list = [] - for pair in white_list_old: - white_list.append( - (column_name_dict[pair[0]], column_name_dict[pair[1]]) - ) - if init_edges: - init_edges_old = init_edges[:] - init_edges = [] - for pair in init_edges_old: - init_edges.append( - (column_name_dict[pair[0]], column_name_dict[pair[1]]) - ) - - bn = hc_method( - data, - metric=self.scoring_function[0], - restriction=white_list, - init_edges=init_edges, - remove_geo_edges=remove_init_edges, - black_list=blacklist_new, - debug=progress_bar, - ) - structure = [] - nodes = sorted(list(bn.nodes())) - for rv in nodes: - for pa in bn.F[rv]["parents"]: - structure.append( - [ - list(column_name_dict.keys())[ - list(column_name_dict.values()).index(pa) - ], - list(column_name_dict.keys())[ - list(column_name_dict.values()).index(rv) - ], - ] - ) - self.skeleton["E"] = structure - - -class HCStructureBuilder(HillClimbDefiner): - """ - Final object with build method - """ - - def __init__( - self, - data: DataFrame, - descriptor: Dict[str, Dict[str, str]], - scoring_function: Tuple[str, Callable], - regressor: Optional[object], - has_logit: bool, - use_mixture: bool, - ): - """ - :param data: train data - :param descriptor: map for data - """ - - super(HCStructureBuilder, self).__init__( - descriptor=descriptor, - data=data, - scoring_function=scoring_function, - regressor=regressor, - ) - self.use_mixture = use_mixture - self.has_logit = has_logit - - def build( - self, - data: DataFrame, - progress_bar: bool, - classifier: Optional[object], - regressor: Optional[object], - params: Optional[ParamDict] = None, - **kwargs, - ): - if params: - for param, value in params.items(): - self.params[param] = value - - init_nodes = self.params.pop("init_nodes") - bl_add = self.params.pop("bl_add") - - # Level 1 - self.skeleton["V"] = self.vertices - - self.restrict(data, init_nodes, bl_add) - if self.scoring_function[0] == "K2": - self.apply_K2(data=data, progress_bar=progress_bar, **self.params) - elif self.scoring_function[0] in ["MI", "LL", "BIC", "AIC"]: - self.apply_group1(data=data, progress_bar=progress_bar, **self.params) - - # Level 2 - - self.get_family() - self.overwrite_vertex( - has_logit=self.has_logit, - use_mixture=self.use_mixture, - classifier=classifier, - regressor=regressor, - ) diff --git a/bamt/external/__init__.py b/bamt/dag_optimizers/__init__.py similarity index 100% rename from bamt/external/__init__.py rename to bamt/dag_optimizers/__init__.py diff --git a/bamt/external/pyBN/classes/__init__.py b/bamt/dag_optimizers/constraint/__init__.py similarity index 100% rename from bamt/external/pyBN/classes/__init__.py rename to bamt/dag_optimizers/constraint/__init__.py diff --git a/bamt/external/pyBN/classes/_tests/__init__.py b/bamt/dag_optimizers/constraint/constraint_dag_optimizer.py similarity index 100% rename from bamt/external/pyBN/classes/_tests/__init__.py rename to bamt/dag_optimizers/constraint/constraint_dag_optimizer.py diff --git a/bamt/dag_optimizers/dag_optimizer.py b/bamt/dag_optimizers/dag_optimizer.py new file mode 100644 index 0000000..b9c992f --- /dev/null +++ b/bamt/dag_optimizers/dag_optimizer.py @@ -0,0 +1,10 @@ +from abc import ABC, abstractmethod + + +class DAGOptimizer(ABC): + def __init__(self): + pass + + @abstractmethod + def 
optimize(self): + pass diff --git a/bamt/external/pyBN/utils/_tests/__init__.py b/bamt/dag_optimizers/hybrid/__init__.py similarity index 100% rename from bamt/external/pyBN/utils/_tests/__init__.py rename to bamt/dag_optimizers/hybrid/__init__.py diff --git a/bamt/external/pyitlib/__init__.py b/bamt/dag_optimizers/hybrid/hybrid_dag_optimizer.py similarity index 100% rename from bamt/external/pyitlib/__init__.py rename to bamt/dag_optimizers/hybrid/hybrid_dag_optimizer.py diff --git a/bamt/preprocess/__init__.py b/bamt/dag_optimizers/score/__init__.py similarity index 100% rename from bamt/preprocess/__init__.py rename to bamt/dag_optimizers/score/__init__.py diff --git a/bamt/utils/__init__.py b/bamt/dag_optimizers/score/bigbravebn.py similarity index 100% rename from bamt/utils/__init__.py rename to bamt/dag_optimizers/score/bigbravebn.py diff --git a/bamt/dag_optimizers/score/golem_genetic.py b/bamt/dag_optimizers/score/golem_genetic.py new file mode 100644 index 0000000..e69de29 diff --git a/bamt/dag_optimizers/score/hill_climbing.py b/bamt/dag_optimizers/score/hill_climbing.py new file mode 100644 index 0000000..e69de29 diff --git a/bamt/dag_optimizers/score/lsevobn.py b/bamt/dag_optimizers/score/lsevobn.py new file mode 100644 index 0000000..e69de29 diff --git a/bamt/dag_optimizers/score/score_dag_optimizer.py b/bamt/dag_optimizers/score/score_dag_optimizer.py new file mode 100644 index 0000000..a9ae0fb --- /dev/null +++ b/bamt/dag_optimizers/score/score_dag_optimizer.py @@ -0,0 +1,9 @@ +from bamt.dag_optimizers.dag_optimizer import DAGOptimizer + + +class ScoreDAGOptimizer(DAGOptimizer): + def __init__(self): + super().__init__() + + def optimize(self): + pass diff --git a/bamt/display/__init__.py b/bamt/display/__init__.py deleted file mode 100644 index a22b2c3..0000000 --- a/bamt/display/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .display import Display - - -def plot_(output, *args): - return Display(output).build(*args) - - -def get_info_(bn, as_df): - return Display(output=None).get_info(bn, as_df) diff --git a/bamt/display/display.py b/bamt/display/display.py deleted file mode 100644 index 2568fb7..0000000 --- a/bamt/display/display.py +++ /dev/null @@ -1,175 +0,0 @@ -import random -from typing import Dict - -import matplotlib -import matplotlib.pyplot as plt -import networkx as nx -import numpy as np -from pandas import DataFrame -from pyvis.network import Network - -from bamt.log import logger_display - - -class Display(object): - """Display object""" - - def __init__(self, output): - """ - :param output: output file name - """ - self.output = output - self.class2color = Dict - self.name2class = Dict - - @staticmethod - def _validate_dir(output): - """Format validation""" - if not output.endswith(".html"): - logger_display.error("This version allows only html format.") - return None - return True - - @staticmethod - def _make_network(nodes, edges, **kwargs): - """Make network and graph - :param nodes: nodes - :param edges: edges - :param kwargs: a dict passed to pyvis.network.Network - """ - g = nx.DiGraph() - network_params = dict( - height=kwargs.get("height", "800px"), - width=kwargs.get("width", "100%"), - notebook=kwargs.get("notebook", True), - directed=kwargs.get("directed", nx.is_directed(g)), - layout=kwargs.get("layout", "hierarchical"), - ) - - g.add_nodes_from(nodes) - g.add_edges_from(edges) - - network = Network(**network_params) - return network, g - - def _init_colors(self, nodes): - # Qualitative class of colormaps - q_classes = [ - "Pastel1", - 
"Pastel2", - "Paired", - "Accent", - "Dark2", - "Set1", - "Set2", - "Set3", - "tab10", - "tab20", - "tab20b", - "tab20c", - ] - - hex_colors = [] - for cls in q_classes: - rgb_colors = plt.get_cmap(cls).colors - hex_colors.extend( - [matplotlib.colors.rgb2hex(rgb_color) for rgb_color in rgb_colors] - ) - - class_number = len(set([node.type for node in nodes])) - hex_colors_indexes = [ - random.randint(0, len(hex_colors) - 1) for _ in range(class_number) - ] - - hex_colors = np.array(hex_colors) - - hex_colors_picked = hex_colors[hex_colors_indexes] - class2color = { - cls: color - for cls, color in zip(set([node.type for node in nodes]), hex_colors_picked) - } - name2class = {node.name: node.type for node in nodes} - self.class2color = class2color - self.name2class = name2class - - def build(self, nodes, edges, **kwargs): - """Build and show network - - :param nodes: nodes - :param edges: edges - :param kwargs: a dict passed to pyvis.network.Network.add_node - """ - if not self._validate_dir(self.output): - logger_display.error(f"Output error: {self.output}") - return None - - if not kwargs: - node_params = dict(font={"size": 36}, size=45) - else: - node_params = kwargs - - self._init_colors(nodes) - - nodes_names = [node.name for node in nodes] - - network, g = self._make_network(nodes_names, edges) - nodes_sorted = np.array(list(nx.topological_generations(g)), dtype=object) - - for level in range(len(nodes_sorted)): - for node_i in range(len(nodes_sorted[level])): - name = nodes_sorted[level][node_i] - cls = self.name2class[name] - color = self.class2color[cls] - network.add_node( - name, - label=name, - color=color, - level=level, - title=f"{name} ({cls})", - **node_params, - ) - - for edge in g.edges: - network.add_edge(edge[0], edge[1]) - - network.hrepulsion(node_distance=300, central_gravity=0.5) - - return network.show(self.output) - - @staticmethod - def get_info(bn, as_df): - if as_df: - names = [] - types_n = [] - types_d = [] - parents = [] - parents_types = [] - for n in bn.nodes: - names.append(n) - types_n.append(n.type) - types_d.append(bn.descriptor["types"][n.name]) - parents_types.append( - [ - bn.descriptor["types"][name] - for name in n.cont_parents + n.disc_parents - ] - ) - parents.append([name for name in n.cont_parents + n.disc_parents]) - return DataFrame( - { - "name": names, - "node_type": types_n, - "data_type": types_d, - "parents": parents, - "parents_types": parents_types, - } - ) - else: - for n in bn.nodes: - print( - f"{n.name: <20} | " - f"{n.type: <50} | " - f"{bn.descriptor['types'][n.name]: <10} | " - f"{str([bn.descriptor['types'][name] for name in n.cont_parents + n.disc_parents]): <50} | " - f"{str([name for name in n.cont_parents + n.disc_parents])}" - ) diff --git a/bamt/external/pyBN/__init__.py b/bamt/external/pyBN/__init__.py deleted file mode 100644 index 321c5ec..0000000 --- a/bamt/external/pyBN/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from bamt.external.pyBN.classes import * -from bamt.external.pyBN.utils import * diff --git a/bamt/external/pyBN/classes/_tests/test_bayesnet.py b/bamt/external/pyBN/classes/_tests/test_bayesnet.py deleted file mode 100644 index 8beca5c..0000000 --- a/bamt/external/pyBN/classes/_tests/test_bayesnet.py +++ /dev/null @@ -1,184 +0,0 @@ -""" -******** -UnitTest -BayesNet -******** - -Method Checks that -assertEqual(a, b) a == b -assertNotEqual(a, b) a != b -assertTrue(x) bool(x) is True -assertFalse(x) bool(x) is False -assertIs(a, b) a is b -assertIsNot(a, b) a is not b -assertIsNone(x) x is None 
-assertIsNotNone(x) x is not None -assertIn(a, b) a in b -assertNotIn(a, b) a not in b -assertIsInstance(a, b) isinstance(a, b) -assertNotIsInstance(a, b) not isinstance(a, b) - -assertAlmostEqual(a, b) round(a-b, 7) == 0 -assertNotAlmostEqual(a, b) round(a-b, 7) != 0 -assertGreater(a, b) a > b -assertGreaterEqual(a, b) a >= b -assertLess(a, b) a < b -assertLessEqual(a, b) a <= b - -assertListEqual(a, b) lists -assertTupleEqual(a, b) tuples -assertSetEqual(a, b) sets or frozensets -assertDictEqual(a, b) dicts - -""" -__author__ = """Nicholas Cullen """ - -import os -import unittest -from os.path import dirname - -from external.pyBN.classes.bayesnet import BayesNet -from external.pyBN.readwrite.read import read_bn - - -class BayesNetTestCase(unittest.TestCase): - def setUp(self): - self.bn = BayesNet() - self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") - self.bn_bif = read_bn(os.path.join(self.dpath, "cancer.bif")) - self.bn_bn = read_bn(os.path.join(self.dpath, "cmu.bn")) - - def tearDown(self): - pass - - def test_isinstance(self): - self.assertIsInstance(self.bn, BayesNet) - - def test_V_bif(self): - self.assertListEqual( - self.bn_bif.V, ["Smoker", "Pollution", "Cancer", "Xray", "Dyspnoea"] - ) - - def test_E_bif(self): - self.assertDictEqual( - self.bn_bif.E, - { - "Cancer": ["Xray", "Dyspnoea"], - "Dyspnoea": [], - "Pollution": ["Cancer"], - "Smoker": ["Cancer"], - "Xray": [], - }, - ) - - def test_F_bif(self): - self.assertDictEqual( - self.bn_bif.F, - { - "Cancer": { - "cpt": [0.03, 0.97, 0.05, 0.95, 0.001, 0.999, 0.02, 0.98], - "parents": ["Pollution", "Smoker"], - "values": ["True", "False"], - }, - "Dyspnoea": { - "cpt": [0.65, 0.35, 0.3, 0.7], - "parents": ["Cancer"], - "values": ["True", "False"], - }, - "Pollution": { - "cpt": [0.9, 0.1], - "parents": [], - "values": ["low", "high"], - }, - "Smoker": { - "cpt": [0.3, 0.7], - "parents": [], - "values": ["True", "False"], - }, - "Xray": { - "cpt": [0.9, 0.1, 0.2, 0.8], - "parents": ["Cancer"], - "values": ["positive", "negative"], - }, - }, - ) - - def test_V_bn(self): - self.assertListEqual( - self.bn_bn.V, ["Burglary", "Earthquake", "Alarm", "JohnCalls", "MaryCalls"] - ) - - def test_E_bn(self): - self.assertDictEqual( - self.bn_bn.E, - { - "Alarm": ["JohnCalls", "MaryCalls"], - "Burglary": ["Alarm"], - "Earthquake": ["Alarm"], - "JohnCalls": [], - "MaryCalls": [], - }, - ) - - def test_F_bn(self): - self.assertDictEqual( - self.bn_bn.F, - { - "Alarm": { - "cpt": [0.999, 0.001, 0.71, 0.29, 0.06, 0.94, 0.05, 0.95], - "parents": ["Earthquake", "Burglary"], - "values": ["No", "Yes"], - }, - "Burglary": { - "cpt": [0.999, 0.001], - "parents": [], - "values": ["No", "Yes"], - }, - "Earthquake": { - "cpt": [0.998, 0.002], - "parents": [], - "values": ["No", "Yes"], - }, - "JohnCalls": { - "cpt": [0.95, 0.05, 0.1, 0.9], - "parents": ["Alarm"], - "values": ["No", "Yes"], - }, - "MaryCalls": { - "cpt": [0.99, 0.01, 0.3, 0.7], - "parents": ["Alarm"], - "values": ["No", "Yes"], - }, - }, - ) - - def test_nodes(self): - n = list(self.bn_bn.nodes()) - self.assertListEqual( - n, ["Burglary", "Earthquake", "Alarm", "JohnCalls", "MaryCalls"] - ) - - def test_cpt(self): - cpt = list(self.bn_bn.cpt("Alarm")) - self.assertListEqual(cpt, [0.999, 0.001, 0.71, 0.29, 0.06, 0.94, 0.05, 0.95]) - - def test_card(self): - self.assertEqual(self.bn_bn.card("Alarm"), 2) - - def test_scope(self): - self.assertListEqual( - self.bn_bn.scope("Alarm"), ["Alarm", "Earthquake", "Burglary"] - ) - - def test_parents(self): - 
self.assertListEqual(self.bn_bn.parents("Alarm"), ["Earthquake", "Burglary"]) - - def test_values(self): - self.assertListEqual(self.bn_bn.values("Alarm"), ["No", "Yes"]) - - def test_values_idx(self): - self.assertEqual(self.bn_bn.values("Alarm")[1], "Yes") - - -if __name__ == "__main__": - unittest.main(exit=False) diff --git a/bamt/external/pyBN/classes/bayesnet.py b/bamt/external/pyBN/classes/bayesnet.py deleted file mode 100644 index 88c6893..0000000 --- a/bamt/external/pyBN/classes/bayesnet.py +++ /dev/null @@ -1,393 +0,0 @@ -""" -************** -BayesNet Class -************** - -Overarching class for Discrete Bayesian Networks. - - -Design Specs ------------- - -- Bayesian Network - - - - F - - key: - - rv - - values: - - children - - - parents - - - values - - - cpt - - - - V - - - list of rvs - - - - E - - key: - - rv - - values: - - list of rv's children - -Notes ------ -- Edges can be inferred from Factorization, but Vertex values - must be specified. -""" - -__author__ = """Nicholas Cullen """ - -from copy import deepcopy - -import numpy as np - -from bamt.external.pyBN.utils.class_equivalence import are_class_equivalent -from bamt.external.pyBN.utils.graph import topsort - - -class BayesNet(object): - """ - Overarching class for Bayesian Networks - - """ - - def __init__(self, E=None, value_dict=None, file=None): - """ - Initialize the BayesNet class. - - Arguments - --------- - *V* : a list of strings - vertices in topsort order - *E* : a dict, where key = vertex, val = list of its children - *F* : a dict, - where key = rv, - val = another dict with - keys = - 'parents', - 'values', - 'cpt' - - *V* : a dict - - Notes - ----- - - """ - if file is not None: - import pyBN.io.read as ior - - bn = ior.read_bn(file) - self.V = bn.V - self.E = bn.E - self.F = bn.F - else: - if E is not None: - # assert (value_dict is not None), 'Must set values if E is set.' - self.set_structure(E, value_dict) - else: - self.V = [] - self.E = {} - self.F = {} - - def __eq__(self, y): - """ - Tests whether two Bayesian Networks are - equivalent - i.e. they contain the same - node/edge structure, and equality of - conditional probabilities. - """ - return are_class_equivalent(self, y) - - def __hash__(self): - """ - Allows BayesNet objects to be used - as keys in a dictionary (i.e. hashable) - """ - return hash((str(self.V), str(self.E))) - - def copy(self): - V = deepcopy(self.V) - E = deepcopy(self.E) - F = {} - for v in V: - F[v] = {} - F[v]["cpt"] = deepcopy(self.F[v]["cpt"]) - F[v]["parents"] = deepcopy(self.F[v]["parents"]) - F[v]["values"] = deepcopy(self.F[v]["values"]) - bn = BayesNet() - bn.V = V - bn.E = E - bn.F = F - - return bn - - def add_node(self, rv, cpt=[], parents=[], values=[]): - self.V.append(rv) - self.F[rv] = {"cpt": cpt, "parents": parents, "values": values} - - def add_edge(self, u, v): - if not self.has_node(u): - self.add_node(u) - if not self.has_node(v): - self.add_node(v) - if self.has_edge(u, v): - print("Edge already exists") - else: - self.E[u].append(v) - self.F[v]["parents"].append(u) - # self.V = topsort(self.E) - # HOW DO I RECALCULATE CPT? 
- - def remove_edge(self, u, v): - self.E[u].remove(v) - self.F[v]["parents"].remove(u) - - def reverse_arc(self, u, v): - if self.has_edge(u, v): - self.E[u].remove(v) - self.E[v].append(u) - - def set_data(self, rv, data): - assert isinstance(data, dict), "data must be dictionary" - self.F[rv] = data - - def set_cpt(self, rv, cpt): - self.F[rv]["cpt"] = cpt - - def set_parents(self, rv, parents): - self.F[rv]["parents"] = parents - - def set_values(self, rv, values): - self.F[rv]["values"] = values - - def nodes(self): - for v in self.V: - yield v - - def node_idx(self, rv): - try: - return self.V.index(rv) - except ValueError: - return -1 - - def has_node(self, rv): - return rv in self.V - - def has_edge(self, u, v): - return v in self.E[u] - - def edges(self): - for u in self.nodes(): - for v in self.E[u]: - yield (u, v) - - def num_edges(self): - num = 0 - for u in self.nodes(): - num += len(self.E[u]) - return num - - def num_params(self): - num = 0 - for u in self.nodes(): - num += len(self.F[u]["cpt"]) - return num - - def scope_size(self, rv): - return len(self.F[rv]["parents"]) + 1 - - def num_nodes(self): - return len(self.V) - - def cpt(self, rv): - return self.F[rv]["cpt"] - - def card(self, rv): - return len(self.F[rv]["values"]) - - def scope(self, rv): - scope = [rv] - scope.extend(self.F[rv]["parents"]) - return scope - - def parents(self, rv): - return self.F[rv]["parents"] - - def children(self, rv): - return self.E[rv] - - def degree(self, rv): - return len(self.parents(rv)) + len(self.children(rv)) - - def values(self, rv): - return self.F[rv]["values"] - - def value_idx(self, rv, val): - try: - return self.F[rv]["values"].index(val) - except ValueError: - print("Value Index Error") - return -1 - - def stride(self, rv, n): - if n == rv: - return 1 - else: - card_list = [self.card(rv)] - card_list.extend([self.card(p) for p in self.parents(rv)]) - n_idx = self.parents(rv).index(n) + 1 - return int(np.prod(card_list[0:n_idx])) - - def flat_cpt(self, by_var=False, by_parents=False): - """ - Return all cpt values in the BN as a flattened - numpy array ordered by bn.nodes() - i.e. topsort - """ - if by_var: - cpt = np.array([sum(self.cpt(rv)) for rv in self.nodes()]) - elif by_parents: - cpt = np.array( - [ - sum(self.cpt(rv)[i : (i + self.card(rv))]) - for rv in self.nodes() - for i in range(len(self.cpt(rv)) / self.card(rv)) - ] - ) - else: - cpt = np.array([val for rv in self.nodes() for val in self.cpt(rv)]) - return cpt - - def cpt_indices(self, target, val_dict): - """ - Get the index of the CPT which corresponds - to a dictionary of rv=val sets. This can be - used for parameter learning to increment the - appropriate cpt frequency value based on - observations in the data. - - There is definitely a fast way to do this. 
- -- check if (idx - rv_stride*value_idx) % (rv_card*rv_stride) == 0 - - Arguments - --------- - *target* : a string - Main RV - - *val_dict* : a dictionary, where - key=rv,val=rv value - - """ - stride = dict([(n, self.stride(target, n)) for n in self.scope(target)]) - # if len(val_dict)==len(self.parents(target)): - # idx = sum([self.value_idx(rv,val)*stride[rv] \ - # for rv,val in val_dict.items()]) - # else: - card = dict([(n, self.card(n)) for n in self.scope(target)]) - idx = set(range(len(self.cpt(target)))) - for rv, val in val_dict.items(): - val_idx = self.value_idx(rv, val) - rv_idx = [] - s_idx = val_idx * stride[rv] - while s_idx < len(self.cpt(target)): - rv_idx.extend(range(s_idx, (s_idx + stride[rv]))) - s_idx += stride[rv] * card[rv] - idx = idx.intersection(set(rv_idx)) - - return list(idx) - - def cpt_str_idx(self, rv, idx): - """ - Return string representation of RV=VAL and - Parents=Val for the given idx of the given rv's cpt. - """ - rv_val = self.values(rv)[idx % self.card(rv)] - s = str(rv) + "=" + str(rv_val) + "|" - _idx = 1 - for parent in self.parents(rv): - for val in self.values(parent): - if idx in self.cpt_indices(rv, {rv: rv_val, parent: val}): - s += str(parent) + "=" + str(val) - if _idx < len(self.parents(rv)): - s += "," - _idx += 1 - return s - - def set_structure(self, edge_dict, value_dict=None): - """ - Set the structure of a BayesNet object. This - function is mostly used to instantiate a BN - skeleton after structure learning algorithms. - - See "structure_learn" folder & algorithms - - Arguments - --------- - *edge_dict* : a dictionary, - where key = rv, - value = list of rv's children - NOTE: THIS MUST BE DIRECTED ALREADY! - - *value_dict* : a dictionary, - where key = rv, - value = list of rv's possible values - - Returns - ------- - None - - Effects - ------- - - sets self.V in topsort order from edge_dict - - sets self.E - - creates self.F structure and sets the parents - - Notes - ----- - - """ - - self.V = topsort(edge_dict) - self.E = edge_dict - self.F = dict([(rv, {}) for rv in self.nodes()]) - for rv in self.nodes(): - self.F[rv] = { - "parents": [p for p in self.nodes() if rv in self.children(p)], - "cpt": [], - "values": [], - } - if value_dict is not None: - self.F[rv]["values"] = value_dict[rv] - - def adj_list(self): - """ - Returns adjacency list of lists, where - each list element is a vertex, and each sub-list is - a list of that vertex's neighbors. - """ - adj_list = [[] for _ in self.V] - vi_map = dict((self.V[i], i) for i in range(len(self.V))) - for u, v in self.edges(): - adj_list[vi_map[u]].append(vi_map[v]) - return adj_list - - def moralized_edges(self): - """ - Moralized graph is the original graph PLUS - an edge between every set of common effect - structures - - i.e. all parents of a node are connected. - - This function has be validated. - - Returns - ------- - *e* : a python list of parent-child tuples. 
- - """ - e = set() - for u in self.nodes(): - for p1 in self.parents(u): - e.add((p1, u)) - for p2 in self.parents(u): - if p1 != p2 and (p2, p1) not in e: - e.add((p1, p2)) - return list(e) diff --git a/bamt/external/pyBN/utils/__init__.py b/bamt/external/pyBN/utils/__init__.py deleted file mode 100644 index 1de46d8..0000000 --- a/bamt/external/pyBN/utils/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from bamt.external.pyBN.utils.class_equivalence import * -from bamt.external.pyBN.utils.data import * -from bamt.external.pyBN.utils.graph import * -from bamt.external.pyBN.utils.independence_tests import * diff --git a/bamt/external/pyBN/utils/_tests/test_independence_tests.py b/bamt/external/pyBN/utils/_tests/test_independence_tests.py deleted file mode 100644 index fefc12c..0000000 --- a/bamt/external/pyBN/utils/_tests/test_independence_tests.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -**************** -UnitTest -Constraint Tests -**************** - -""" -__author__ = """Nicholas Cullen """ - -import os -import unittest -from os.path import dirname - -import numpy as np -from external.pyBN.independence.constraint_tests import mi_test - - -class ConstraintTestsTestCase(unittest.TestCase): - def setUp(self): - self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") - self.data = np.loadtxt( - os.path.join(self.dpath, "lizards.csv"), - delimiter=",", - dtype="int32", - skiprows=1, - ) - - def tearDown(self): - pass - - def test_mi_two_vars_value_a(self): - self.assertEqual(mi_test(self.data[:, (0, 1)]), 0.0004) - - def test_mi_two_vars_value_b(self): - self.assertEqual(mi_test(self.data[:, (0, 2)]), 0.0014) - - def test_mi_two_vars_symmetry(self): - self.assertEqual(mi_test(self.data[:, (1, 0)]), mi_test(self.data[:, (0, 1)])) - - def test_mi_three_vars_value_a(self): - self.assertEqual(mi_test(self.data), 0.0009) - - def test_mi_three_vars_symmetry(self): - self.assertEqual( - mi_test(self.data[:, (0, 1, 2)]), mi_test(self.data[:, (1, 0, 2)]) - ) - - def test_mi_random_three(self): - np.random.seed(3636) - self.data = np.random.randint(1, 10, size=((10000, 3))) - self.assertEqual(mi_test(self.data), 0.0211) - - def test_mi_random_four(self): - np.random.seed(3636) - self.data = np.random.randint(1, 10, size=((10000, 4))) - self.assertEqual(mi_test(self.data), 0.0071) diff --git a/bamt/external/pyBN/utils/_tests/test_markov_blanket.py b/bamt/external/pyBN/utils/_tests/test_markov_blanket.py deleted file mode 100644 index af97eb1..0000000 --- a/bamt/external/pyBN/utils/_tests/test_markov_blanket.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -************** -Markov Blanket -Unit Test -************** -""" - -__author__ = """Nicholas Cullen """ - -import os -import unittest -from os.path import dirname - -from external.pyBN.independence.markov_blanket import markov_blanket -from external.pyBN.readwrite.read import read_bn - - -class ConstraintTestsTestCase(unittest.TestCase): - def setUp(self): - self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") - self.bn = read_bn(os.path.join(self.dpath, "cmu.bn")) - - def tearDown(self): - pass - - def test_markov_blanket(self): - self.assertDictEqual( - markov_blanket(self.bn), - { - "Alarm": ["Earthquake", "Burglary", "JohnCalls", "MaryCalls"], - "Burglary": ["Alarm", "Earthquake"], - "Earthquake": ["Alarm", "Burglary"], - "JohnCalls": ["Alarm"], - "MaryCalls": ["Alarm"], - }, - ) diff --git a/bamt/external/pyBN/utils/_tests/test_orient_edges.py b/bamt/external/pyBN/utils/_tests/test_orient_edges.py deleted file mode 
100644 index 4242fc1..0000000 --- a/bamt/external/pyBN/utils/_tests/test_orient_edges.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -******** -UnitTest -OrientEdges -******** - -""" -__author__ = """Nicholas Cullen """ - -import os -import unittest -from os.path import dirname - -import numpy as np -from external.pyBN.structure_learn.orient_edges import orient_edges_gs, orient_edges_pc - - -class OrientEdgesTestCase(unittest.TestCase): - def setUp(self): - pass - - def tearDown(self): - pass - - def test_orient_edges_pc(self): - e = {0: [1, 2], 1: [0], 2: [0]} - b = {0: [], 1: {2: (0,)}, 2: {1: (0,)}} - self.assertDictEqual(orient_edges_pc(e, b), {0: [1, 2], 1: [], 2: []}) - - def test_orient_edges_gs(self): - e = {0: [1, 2], 1: [0], 2: [0]} - b = {0: [1, 2], 1: [0], 2: [0]} - dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") - path = os.path.join(dpath, "lizards.csv") - data = np.loadtxt(path, dtype="int32", skiprows=1, delimiter=",") - self.assertDictEqual( - orient_edges_gs(e, b, data, 0.05), {0: [1, 2], 1: [], 2: []} - ) diff --git a/bamt/external/pyBN/utils/_tests/test_random_sample.py b/bamt/external/pyBN/utils/_tests/test_random_sample.py deleted file mode 100644 index bf3e92d..0000000 --- a/bamt/external/pyBN/utils/_tests/test_random_sample.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -******** -UnitTest -Misc -******** - -""" -__author__ = """Nicholas Cullen """ - -import os -import unittest -from os.path import dirname - -import numpy as np -from external.pyBN.readwrite.read import read_bn -from external.pyBN.utils.random_sample import random_sample - - -class RandomSampleTestCase(unittest.TestCase): - def setUp(self): - self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") - self.bn = read_bn(os.path.join(self.dpath, "cancer.bif")) - - def tearDown(self): - pass - - def test_random_sample(self): - np.random.seed(3636) - sample = random_sample(self.bn, 5) - self.assertListEqual( - list(sample.ravel()), - [0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0], - ) diff --git a/bamt/external/pyBN/utils/class_equivalence.py b/bamt/external/pyBN/utils/class_equivalence.py deleted file mode 100644 index 9e284b3..0000000 --- a/bamt/external/pyBN/utils/class_equivalence.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -**************** -Equivalence Code -**************** - -This code is for testing whether or not -two Bayesian networks belong to the same -equivalence class - i.e. they have the same -edges when viewed as undirected graphs. - -Also, this code is for actually generating -equivalent Bayesian networks from a given BN. - -""" - - -def are_class_equivalent(x, y): - """ - Check whether two Bayesian networks belong - to the same equivalence class. - """ - are_equivalent = True - - if set(list(x.nodes())) != set(list(y.nodes())): - are_equivalent = False - else: - for rv in x.nodes(): - rv_x_neighbors = set(x.parents(rv)) + set(y.children(rv)) - rv_y_neighbors = set(y.parents(rv)) + set(y.children(rv)) - if rv_x_neighbors != rv_y_neighbors: - are_equivalent = False - break - return are_equivalent diff --git a/bamt/external/pyBN/utils/data.py b/bamt/external/pyBN/utils/data.py deleted file mode 100644 index ad4781a..0000000 --- a/bamt/external/pyBN/utils/data.py +++ /dev/null @@ -1,24 +0,0 @@ -""" -These are functions for dealing with datasets -that have strings as values - as those values -must be converted to integers in order to be -used in many structure learning functions, for -example. 
- -There is, of course, the issue of how to get those -string values back into the underlying representation -after the learning occurs.. -""" -import numpy as np - - -def unique_bins(data): - """ - Get the unique values for each column in a dataset. - """ - bins = np.empty(len(data.T), dtype=np.int32) - i = 0 - for col in data.T: - bins[i] = len(np.unique(col)) - i += 1 - return bins diff --git a/bamt/external/pyBN/utils/graph.py b/bamt/external/pyBN/utils/graph.py deleted file mode 100644 index 92daf4d..0000000 --- a/bamt/external/pyBN/utils/graph.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Collection of Graph Algorithms. - -Any networkx dependencies should exist here only, but -there are currently a few exceptions which will -eventually be corrected. - -""" - -import networkx as nx - - -def would_cause_cycle(e, u, v, reverse=False): - """ - Test if adding the edge u -> v to the BayesNet - object would create a DIRECTED (i.e. illegal) cycle. - """ - G = nx.DiGraph(e) - if reverse: - G.remove_edge(v, u) - G.add_edge(u, v) - try: - nx.find_cycle(G, source=u) - return True - except BaseException: - return False - - -def topsort(edge_dict, root=None): - """ - List of nodes in topological sort order from edge dict - where key = rv and value = list of rv's children - """ - queue = [] - if root is not None: - queue = [root] - else: - for rv in edge_dict.keys(): - prior = True - for p in edge_dict.keys(): - if rv in edge_dict[p]: - prior = False - if prior: - queue.append(rv) - - visited = [] - while queue: - vertex = queue.pop(0) - if vertex not in visited: - visited.append(vertex) - for nbr in edge_dict[vertex]: - queue.append(nbr) - # queue.extend(edge_dict[vertex]) # add all vertex's children - return visited diff --git a/bamt/external/pyBN/utils/independence_tests.py b/bamt/external/pyBN/utils/independence_tests.py deleted file mode 100644 index cc803fe..0000000 --- a/bamt/external/pyBN/utils/independence_tests.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -****************************** -Conditional Independence Tests -for Constraint-based Learning -****************************** - -Implemented Constraint-based Tests ----------------------------------- -- mutual information -- Pearson's X^2 - -I may consider putting this code into its own class structure. The -main benefit I could see from doing this would be the ability to -cache joint/marginal/conditional probabilities for expedited tests. 
- -""" - -__author__ = """Nicholas Cullen """ - -import numpy as np - -from bamt.external.pyBN.utils.data import unique_bins - - -def mutual_information(data, conditional=False): - # bins = np.amax(data, axis=0)+1 # read levels for each variable - bins = unique_bins(data) - if len(bins) == 1: - hist, _ = np.histogramdd(data, bins=(bins)) # frequency counts - Px = (hist / hist.sum()) + (1e-7) - MI = -1 * np.sum(Px * np.log(Px)) - return round(MI, 4) - - if len(bins) == 2: - hist, _ = np.histogramdd(data, bins=bins[0:2]) # frequency counts - - Pxy = hist / hist.sum() # joint probability distribution over X,Y,Z - Px = np.sum(Pxy, axis=1) # P(X,Z) - Py = np.sum(Pxy, axis=0) # P(Y,Z) - - PxPy = np.outer(Px, Py) - Pxy += 1e-7 - PxPy += 1e-7 - MI = np.sum(Pxy * np.log(Pxy / (PxPy))) - return round(MI, 4) - elif len(bins) > 2 and conditional: - # CHECK FOR > 3 COLUMNS -> concatenate Z into one column - if len(bins) > 3: - data = data.astype("str") - ncols = len(bins) - for i in range(len(data)): - data[i, 2] = "".join(data[i, 2:ncols]) - data = data.astype(np.int64)[:, 0:3] - - bins = np.amax(data, axis=0) - hist, _ = np.histogramdd(data, bins=bins) # frequency counts - - Pxyz = hist / hist.sum() # joint probability distribution over X,Y,Z - Pz = np.sum(Pxyz, axis=(0, 1)) # P(Z) - Pxz = np.sum(Pxyz, axis=1) # P(X,Z) - Pyz = np.sum(Pxyz, axis=0) # P(Y,Z) - - Pxy_z = Pxyz / (Pz + 1e-7) # P(X,Y | Z) = P(X,Y,Z) / P(Z) - Px_z = Pxz / (Pz + 1e-7) # P(X | Z) = P(X,Z) / P(Z) - Py_z = Pyz / (Pz + 1e-7) # P(Y | Z) = P(Y,Z) / P(Z) - - Px_y_z = np.empty((Pxy_z.shape)) # P(X|Z)P(Y|Z) - for i in range(bins[0]): - for j in range(bins[1]): - for k in range(bins[2]): - Px_y_z[i][j][k] = Px_z[i][k] * Py_z[j][k] - Pxyz += 1e-7 - Pxy_z += 1e-7 - Px_y_z += 1e-7 - MI = np.sum(Pxyz * np.log(Pxy_z / (Px_y_z))) - - return round(MI, 4) - elif len(bins) > 2 and conditional == False: - data = data.astype("str") - ncols = len(bins) - for i in range(len(data)): - data[i, 1] = "".join(data[i, 1:ncols]) - data = data.astype(np.int64)[:, 0:2] - - hist, _ = np.histogramdd(data, bins=bins[0:2]) # frequency counts - - Pxy = hist / hist.sum() # joint probability distribution over X,Y,Z - Px = np.sum(Pxy, axis=1) # P(X,Z) - Py = np.sum(Pxy, axis=0) # P(Y,Z) - - PxPy = np.outer(Px, Py) - Pxy += 1e-7 - PxPy += 1e-7 - MI = np.sum(Pxy * np.log(Pxy / (PxPy))) - return round(MI, 4) - - -def entropy(data): - """ - In the context of structure learning, and more specifically - in constraint-based algorithms which rely on the mutual information - test for conditional independence, it has been proven that the variable - X in a set which MAXIMIZES mutual information is also the variable which - MINIMIZES entropy. This fact can be used to reduce the computational - requirements of tests based on the following relationship: - - Entropy is related to marginal mutual information as follows: - MI(X;Y) = H(X) - H(X|Y) - - Entropy is related to conditional mutual information as follows: - MI(X;Y|Z) = H(X|Z) - H(X|Y,Z) - - For one varibale, H(X) is equal to the following: - -1 * sum of p(x) * log(p(x)) - - For two variables H(X|Y) is equal to the following: - sum over x,y of p(x,y)*log(p(y)/p(x,y)) - - For three variables, H(X|Y,Z) is equal to the following: - -1 * sum of p(x,y,z) * log(p(x|y,z)), - where p(x|y,z) = p(x,y,z)/p(y)*p(z) - Arguments - ---------- - *data* : a nested numpy array - The data from which to learn - must have at least three - variables. All conditioned variables (i.e. Z) are compressed - into one variable. 
- - Returns - ------- - *H* : entropy value - - """ - try: - cols = data.shape[1] - except IndexError: - cols = 1 - - bins = np.amax(data, axis=0) - if isinstance(bins, np.ndarray): - for i in range(len(bins)): - if bins[i] == 0: - bins[i] = 1 - else: - bins = 1 - # bins = unique_bins(data) - - if cols == 1: - hist, _ = np.histogramdd(data, bins=(bins)) # frequency counts - Px = hist / hist.sum() + (1e-7) - H = -1 * np.sum(Px * np.log(Px)) - - elif cols == 2: # two variables -> assume X then Y - hist, _ = np.histogramdd(data, bins=bins[0:2]) # frequency counts - - Pxy = hist / hist.sum() # joint probability distribution over X,Y,Z - Py = np.sum(Pxy, axis=0) # P(Y) - Py += 1e-7 - Pxy += 1e-7 - H = np.sum(Pxy * np.log(Py / Pxy)) - - else: - # CHECK FOR > 3 COLUMNS -> concatenate Z into one column - if cols > 3: - data = data.astype("str") - ncols = len(bins) - for i in range(len(data)): - data[i, 2] = "".join(data[i, 2:ncols]) - data = data.astype(np.int64)[:, 0:3] - - bins = np.amax(data, axis=0) - hist, _ = np.histogramdd(data, bins=bins) # frequency counts - - Pxyz = hist / hist.sum() # joint probability distribution over X,Y,Z - Pyz = np.sum(Pxyz, axis=0) - - Pxyz += 1e-7 # for log -inf - Pyz += 1e-7 - H = -1 * np.sum(Pxyz * np.log(Pxyz)) + np.sum(Pyz * np.log(Pyz)) - - return round(H, 4) diff --git a/bamt/external/pyitlib/DiscreteRandomVariableUtils.py b/bamt/external/pyitlib/DiscreteRandomVariableUtils.py deleted file mode 100644 index 004a237..0000000 --- a/bamt/external/pyitlib/DiscreteRandomVariableUtils.py +++ /dev/null @@ -1,852 +0,0 @@ -import warnings - -import numpy as np -import pandas as pd -import sklearn.preprocessing - -NONE_REPLACEMENT = -32768 - - -def information_mutual_conditional( - x, - y, - z, - cartesian_product=False, - base=2, - fill_value=-1, - estimator="ML", - alphabet_x=None, - alphabet_y=None, - Alphabet_Z=None, - keep_dims=False, -): - x, fill_value_X = _sanitise_array_input(x, fill_value) - y, fill_value_Y = _sanitise_array_input(y, fill_value) - z, fill_value_Z = _sanitise_array_input(z, fill_value) - if alphabet_x is not None: - alphabet_x, fill_value_Alphabet_X = _sanitise_array_input( - alphabet_x, fill_value - ) - alphabet_x, _ = _autocreate_alphabet(alphabet_x, fill_value_Alphabet_X) - else: - alphabet_x, fill_value_Alphabet_X = _autocreate_alphabet(x, fill_value_X) - if alphabet_y is not None: - alphabet_y, fill_value_Alphabet_Y = _sanitise_array_input( - alphabet_y, fill_value - ) - alphabet_y, _ = _autocreate_alphabet(alphabet_y, fill_value_Alphabet_Y) - else: - alphabet_y, fill_value_Alphabet_Y = _autocreate_alphabet(y, fill_value_Y) - if Alphabet_Z is not None: - Alphabet_Z, fill_value_Alphabet_Z = _sanitise_array_input( - Alphabet_Z, fill_value - ) - Alphabet_Z, _ = _autocreate_alphabet(Alphabet_Z, fill_value_Alphabet_Z) - else: - Alphabet_Z, fill_value_Alphabet_Z = _autocreate_alphabet(z, fill_value_Z) - - if x.size == 0: - raise ValueError("arg X contains no elements") - if y.size == 0: - raise ValueError("arg Y contains no elements") - if z.size == 0: - raise ValueError("arg Z contains no elements") - if np.any(_isnan(x)): - raise ValueError("arg X contains NaN values") - if np.any(_isnan(y)): - raise ValueError("arg Y contains NaN values") - if np.any(_isnan(z)): - raise ValueError("arg Z contains NaN values") - if alphabet_x.size == 0: - raise ValueError("arg Alphabet_X contains no elements") - if np.any(_isnan(alphabet_x)): - raise ValueError("arg Alphabet_X contains NaN values") - if alphabet_y.size == 0: - raise ValueError("arg 
Alphabet_Y contains no elements") - if np.any(_isnan(alphabet_y)): - raise ValueError("arg Alphabet_Y contains NaN values") - if Alphabet_Z.size == 0: - raise ValueError("arg Alphabet_Z contains no elements") - if np.any(_isnan(Alphabet_Z)): - raise ValueError("arg Alphabet_Z contains NaN values") - if _isnan(fill_value_X): - raise ValueError("fill value for arg X is NaN") - if _isnan(fill_value_Y): - raise ValueError("fill value for arg Y is NaN") - if _isnan(fill_value_Z): - raise ValueError("fill value for arg Z is NaN") - if x.shape[:-1] != alphabet_x.shape[:-1]: - raise ValueError("leading dimensions of args X and Alphabet_X do not " "match") - if y.shape[:-1] != alphabet_y.shape[:-1]: - raise ValueError("leading dimensions of args Y and Alphabet_Y do not " "match") - if z.shape[:-1] != Alphabet_Z.shape[:-1]: - raise ValueError("leading dimensions of args Z and Alphabet_Z do not " "match") - if not cartesian_product and (x.shape != y.shape or x.shape != z.shape): - raise ValueError("dimensions of args X, Y, Z do not match") - if cartesian_product and (x.shape[-1] != y.shape[-1] or x.shape[-1] != z.shape[-1]): - raise ValueError("trailing dimensions of args X, Y, Z do not match") - if not (np.isscalar(base) and np.isreal(base) and base > 0): - raise ValueError("arg base not a positive real-valued scalar") - - S, fill_value = _map_observations_to_integers( - (x, alphabet_x, y, alphabet_y, z, Alphabet_Z), - ( - fill_value_X, - fill_value_Alphabet_X, - fill_value_Y, - fill_value_Alphabet_Y, - fill_value_Z, - fill_value_Alphabet_Z, - ), - ) - x, alphabet_x, y, alphabet_y, z, Alphabet_Z = S - - if not cartesian_product: - I = np.empty(x.shape[:-1]) - if I.ndim > 0: - I[:] = np.NaN - else: - I = np.float64(np.NaN) - else: - shapeI_Z = z.shape[:-1] - z = np.reshape(z, (-1, z.shape[-1])) - Alphabet_Z = np.reshape(Alphabet_Z, (-1, Alphabet_Z.shape[-1])) - I = [] - for i in range(z.shape[0]): - def f(X, Y, Alphabet_X, Alphabet_Y): - return information_mutual_conditional( - X, - Y, - z[i], - False, - base, - fill_value, - estimator, - Alphabet_X, - Alphabet_Y, - Alphabet_Z[i], - ) - - I.append(_cartesian_product_apply(x, y, f, alphabet_x, alphabet_y)) - shapeI_XY = I[0].shape - if len(shapeI_Z) == 0: - I = np.array(I)[0].reshape(shapeI_XY) - else: - I = np.array(I) - I = np.rollaxis(I, 0, len(I.shape)) - I = I.reshape(np.append(shapeI_XY, shapeI_Z).astype("int")) - return I - - # Re-shape H, X,Y,Z so that we may handle multi-dimensional arrays - # equivalently and iterate across 0th axis - x = np.reshape(x, (-1, x.shape[-1])) - y = np.reshape(y, (-1, y.shape[-1])) - z = np.reshape(z, (-1, z.shape[-1])) - alphabet_x = np.reshape(alphabet_x, (-1, alphabet_x.shape[-1])) - alphabet_y = np.reshape(alphabet_y, (-1, alphabet_y.shape[-1])) - Alphabet_Z = np.reshape(Alphabet_Z, (-1, Alphabet_Z.shape[-1])) - orig_shape_I = I.shape - I = np.reshape(I, (-1, 1)) - - for i in range(x.shape[0]): - I_ = ( - entropy_joint( - np.vstack((x[i], z[i])), - base, - fill_value, - estimator, - _vstack_pad((alphabet_x[i], Alphabet_Z[i]), fill_value), - ) - + entropy_joint( - np.vstack((y[i], z[i])), - base, - fill_value, - estimator, - _vstack_pad((alphabet_y[i], Alphabet_Z[i]), fill_value), - ) - - entropy_joint( - np.vstack((x[i], y[i], z[i])), - base, - fill_value, - estimator, - _vstack_pad((alphabet_x[i], alphabet_y[i], Alphabet_Z[i]), fill_value), - ) - - entropy_joint(z[i], base, fill_value, estimator, Alphabet_Z[i]) - ) - I[i] = I_ - - # Reverse re-shaping - I = np.reshape(I, orig_shape_I) - - if keep_dims and not 
cartesian_product: - I = I[..., np.newaxis] - - return I - - -def information_mutual( - X, - Y=None, - cartesian_product=False, - base=2, - fill_value=-1, - estimator="ML", - Alphabet_X=None, - Alphabet_Y=None, - keep_dims=False, -): - H_conditional = entropy_conditional( - X, Y, cartesian_product, base, fill_value, estimator, Alphabet_X, Alphabet_Y - ) - H = entropy(X, base, fill_value, estimator, Alphabet_X) - - H = np.reshape( - H, np.append(H.shape, np.ones(H_conditional.ndim - H.ndim)).astype("int") - ) - - H = H - H_conditional - - if keep_dims and not cartesian_product: - H = H[..., np.newaxis] - - return H - - -def entropy_pmf(P, base=2, require_valid_pmf=True, keep_dims=False): - P, _ = _sanitise_array_input(P) - - if P.size == 0: - raise ValueError("arg P contains no elements") - if np.any(_isnan(P)): - raise ValueError("arg P contains NaN values") - if np.any(np.logical_or(P < 0, P > 1)): - raise ValueError("arg P contains values outside unit interval") - if require_valid_pmf and not np.allclose(np.sum(P, axis=-1), 1): - raise ValueError("arg P does not sum to unity across last axis") - if not (np.isscalar(base) and np.isreal(base) and base > 0): - raise ValueError("arg base not a positive real-valued scalar") - - H = -np.sum(P * np.log2(P + np.spacing(0)), axis=-1) - H = H / np.log2(base) - - if keep_dims: - H = H[..., np.newaxis] - - return H - - -def entropy_joint( - X, base=2, fill_value=-1, estimator="ML", Alphabet_X=None, keep_dims=False -): - X, fill_value_X = _sanitise_array_input(X, fill_value) - if Alphabet_X is not None: - Alphabet_X, fill_value_Alphabet_X = _sanitise_array_input( - Alphabet_X, fill_value - ) - Alphabet_X, _ = _autocreate_alphabet(Alphabet_X, fill_value_Alphabet_X) - else: - Alphabet_X, fill_value_Alphabet_X = _autocreate_alphabet(X, fill_value_X) - - if X.size == 0: - raise ValueError("arg X contains no elements") - if np.any(_isnan(X)): - raise ValueError("arg X contains NaN values") - if Alphabet_X.size == 0: - raise ValueError("arg Alphabet_X contains no elements") - if np.any(_isnan(Alphabet_X)): - raise ValueError("arg Alphabet_X contains NaN values") - if _isnan(fill_value_X): - raise ValueError("fill value for arg X is NaN") - if X.shape[:-1] != Alphabet_X.shape[:-1]: - raise ValueError("leading dimensions of args X and Alphabet_X do not " "match") - if not (np.isscalar(base) and np.isreal(base) and base > 0): - raise ValueError("arg base not a positive real-valued scalar") - - S, fill_value = _map_observations_to_integers( - (X, Alphabet_X), (fill_value_X, fill_value_Alphabet_X) - ) - X, Alphabet_X = S - - X = np.reshape(X, (-1, X.shape[-1])) - Alphabet_X = np.reshape(Alphabet_X, (-1, Alphabet_X.shape[-1])) - - _verify_alphabet_sufficiently_large(X, Alphabet_X, fill_value) - - for i in range(X.shape[0]): - X = X[:, X[i].argsort(kind="mergesort")] - - B = np.any(X[:, 1:] != X[:, :-1], axis=0) - I = np.append(np.where(B), X.shape[1] - 1) - L = np.diff(np.append(-1, I)) - - alphabet_X = X[:, I] - if estimator != "ML": - n_additional_empty_bins = _determine_number_additional_empty_bins( - L, alphabet_X, Alphabet_X, fill_value - ) - else: - n_additional_empty_bins = 0 - L, _ = _remove_counts_at_fill_value(L, alphabet_X, fill_value) - if not np.any(L): - return np.float64(np.NaN) - - # P_0 is the probability mass assigned to each additional empty bin - P, P_0 = _estimate_probabilities(L, estimator, n_additional_empty_bins) - H_0 = n_additional_empty_bins * P_0 * -np.log2(P_0 + np.spacing(0)) / np.log2(base) - H = entropy_pmf(P, base, 
require_valid_pmf=False) + H_0 - - if keep_dims: - H = H[..., np.newaxis] - - return H - - -def entropy_conditional( - X, - Y=None, - cartesian_product=False, - base=2, - fill_value=-1, - estimator="ML", - Alphabet_X=None, - Alphabet_Y=None, - keep_dims=False, -): - if Y is None: - Y = X - cartesian_product = True - Alphabet_Y = Alphabet_X - - X, fill_value_X = _sanitise_array_input(X, fill_value) - Y, fill_value_Y = _sanitise_array_input(Y, fill_value) - if Alphabet_X is not None: - Alphabet_X, fill_value_Alphabet_X = _sanitise_array_input( - Alphabet_X, fill_value - ) - Alphabet_X, _ = _autocreate_alphabet(Alphabet_X, fill_value_Alphabet_X) - else: - Alphabet_X, fill_value_Alphabet_X = _autocreate_alphabet(X, fill_value_X) - if Alphabet_Y is not None: - Alphabet_Y, fill_value_Alphabet_Y = _sanitise_array_input( - Alphabet_Y, fill_value - ) - Alphabet_Y, _ = _autocreate_alphabet(Alphabet_Y, fill_value_Alphabet_Y) - else: - Alphabet_Y, fill_value_Alphabet_Y = _autocreate_alphabet(Y, fill_value_Y) - - if X.size == 0: - raise ValueError("arg X contains no elements") - if Y.size == 0: - raise ValueError("arg Y contains no elements") - if np.any(_isnan(X)): - raise ValueError("arg X contains NaN values") - if np.any(_isnan(Y)): - raise ValueError("arg Y contains NaN values") - if Alphabet_X.size == 0: - raise ValueError("arg Alphabet_X contains no elements") - if np.any(_isnan(Alphabet_X)): - raise ValueError("arg Alphabet_X contains NaN values") - if Alphabet_Y.size == 0: - raise ValueError("arg Alphabet_Y contains no elements") - if np.any(_isnan(Alphabet_Y)): - raise ValueError("arg Alphabet_Y contains NaN values") - if _isnan(fill_value_X): - raise ValueError("fill value for arg X is NaN") - if _isnan(fill_value_Y): - raise ValueError("fill value for arg Y is NaN") - if X.shape[:-1] != Alphabet_X.shape[:-1]: - raise ValueError("leading dimensions of args X and Alphabet_X do not " "match") - if Y.shape[:-1] != Alphabet_Y.shape[:-1]: - raise ValueError("leading dimensions of args Y and Alphabet_Y do not " "match") - if not cartesian_product and X.shape != Y.shape: - raise ValueError("dimensions of args X and Y do not match") - if cartesian_product and X.shape[-1] != Y.shape[-1]: - raise ValueError("trailing dimensions of args X and Y do not match") - if not (np.isscalar(base) and np.isreal(base) and base > 0): - raise ValueError("arg base not a positive real-valued scalar") - - S, fill_value = _map_observations_to_integers( - (X, Alphabet_X, Y, Alphabet_Y), - (fill_value_X, fill_value_Alphabet_X, fill_value_Y, fill_value_Alphabet_Y), - ) - X, Alphabet_X, Y, Alphabet_Y = S - - if not cartesian_product: - H = np.empty(X.shape[:-1]) - if H.ndim > 0: - H[:] = np.NaN - else: - H = np.float64(np.NaN) - else: - - def f(X, Y, Alphabet_X, Alphabet_Y): - return entropy_conditional( - X, Y, False, base, fill_value, estimator, Alphabet_X, Alphabet_Y - ) - - return _cartesian_product_apply(X, Y, f, Alphabet_X, Alphabet_Y) - - # Re-shape H, X and Y, so that we may handle multi-dimensional arrays - # equivalently and iterate across 0th axis - X = np.reshape(X, (-1, X.shape[-1])) - Y = np.reshape(Y, (-1, Y.shape[-1])) - Alphabet_X = np.reshape(Alphabet_X, (-1, Alphabet_X.shape[-1])) - Alphabet_Y = np.reshape(Alphabet_Y, (-1, Alphabet_Y.shape[-1])) - orig_shape_H = H.shape - H = np.reshape(H, (-1, 1)) - - for i in range(X.shape[0]): - H[i] = entropy_joint( - np.vstack((X[i], Y[i])), - base, - fill_value, - estimator, - _vstack_pad((Alphabet_X[i], Alphabet_Y[i]), fill_value), - ) - entropy(Y[i], base, 
fill_value, estimator, Alphabet_Y[i]) - - # Reverse re-shaping - H = np.reshape(H, orig_shape_H) - - if keep_dims and not cartesian_product: - H = H[..., np.newaxis] - - return H - - -def entropy(X, base=2, fill_value=-1, estimator="ML", Alphabet_X=None, keep_dims=False): - X, fill_value_X = _sanitise_array_input(X, fill_value) - if Alphabet_X is not None: - Alphabet_X, fill_value_Alphabet_X = _sanitise_array_input( - Alphabet_X, fill_value - ) - Alphabet_X, _ = _autocreate_alphabet(Alphabet_X, fill_value_Alphabet_X) - else: - Alphabet_X, fill_value_Alphabet_X = _autocreate_alphabet(X, fill_value_X) - - if X.size == 0: - raise ValueError("arg X contains no elements") - if np.any(_isnan(X)): - raise ValueError("arg X contains NaN values") - if Alphabet_X.size == 0: - raise ValueError("arg Alphabet_X contains no elements") - if np.any(_isnan(Alphabet_X)): - raise ValueError("arg Alphabet_X contains NaN values") - if _isnan(fill_value_X): - raise ValueError("fill value for arg X is NaN") - if X.shape[:-1] != Alphabet_X.shape[:-1]: - raise ValueError("leading dimensions of args X and Alphabet_X do not " "match") - if not (np.isscalar(base) and np.isreal(base) and base > 0): - raise ValueError("arg base not a positive real-valued scalar") - - S, fill_value = _map_observations_to_integers( - (X, Alphabet_X), (fill_value_X, fill_value_Alphabet_X) - ) - X, Alphabet_X = S - - H = np.empty(X.shape[:-1]) - if H.ndim > 0: - H[:] = np.NaN - else: - H = np.float64(np.NaN) - - # Re-shape H and X, so that we may handle multi-dimensional arrays - # equivalently and iterate across 0th axis - X = np.reshape(X, (-1, X.shape[-1])) - Alphabet_X = np.reshape(Alphabet_X, (-1, Alphabet_X.shape[-1])) - orig_shape_H = H.shape - H = np.reshape(H, (-1, 1)) - - _verify_alphabet_sufficiently_large(X, Alphabet_X, fill_value) - - # NB: This is not joint entropy. 
Elements in each row are sorted - # independently - X = np.sort(X, axis=1) - - # Compute symbol run-lengths - # Compute symbol change indicators - B = X[:, 1:] != X[:, :-1] - for i in range(X.shape[0]): - # Obtain symbol change positions - I = np.append(np.where(B[i]), X.shape[1] - 1) - # Compute run lengths - L = np.diff(np.append(-1, I)) - - alphabet_X = X[i, I] - if estimator != "ML": - n_additional_empty_bins = _determine_number_additional_empty_bins( - L, alphabet_X, Alphabet_X[i], fill_value - ) - else: - n_additional_empty_bins = 0 - L, _ = _remove_counts_at_fill_value(L, alphabet_X, fill_value) - if not np.any(L): - continue - - # P_0 is the probability mass assigned to each additional empty bin - P, P_0 = _estimate_probabilities(L, estimator, n_additional_empty_bins) - H_0 = ( - n_additional_empty_bins - * P_0 - * -np.log2(P_0 + np.spacing(0)) - / np.log2(base) - ) - H[i] = entropy_pmf(P, base, require_valid_pmf=False) + H_0 - - # Reverse re-shaping - H = np.reshape(H, orig_shape_H) - - if keep_dims: - H = H[..., np.newaxis] - - return H - - -def _autocreate_alphabet(X, fill_value): - Lengths = np.apply_along_axis(lambda x: np.unique(x).size, axis=-1, arr=X) - max_length = np.max(Lengths) - - def pad_with_fillvalue(x): - return np.append(x, np.tile(fill_value, int(max_length - x.size))) - - Alphabet = np.apply_along_axis( - lambda x: pad_with_fillvalue(np.unique(x)), axis=-1, arr=X - ) - return (Alphabet, fill_value) - - -def _sanitise_array_input(x, fill_value=-1): - # Avoid Python 3 issues with numpy arrays containing None elements - if np.any(np.equal(x, None)) or fill_value is None: - x = np.array(x, copy=False) - assert np.all(x != NONE_REPLACEMENT) - m = np.equal(x, None) - x = np.where(m, NONE_REPLACEMENT, x) - if fill_value is None: - x = np.array(x, copy=False) - fill_value = NONE_REPLACEMENT - - if isinstance(x, (pd.core.frame.DataFrame, pd.core.series.Series)): - # Create masked array, honouring Dataframe/Series missing entries - # NB: We transpose for convenience, so that quantities are computed for - # each column - x = np.ma.MaskedArray(x, x.isnull()) - - if isinstance(x, np.ma.MaskedArray): - fill_value = x.fill_value - - if np.any(x == fill_value): - warnings.warn("Masked array contains data equal to fill value") - - if x.dtype.kind in ("S", "U"): - kind = x.dtype.kind - current_dtype_len = int(x.dtype.str.split(kind)[1]) - if current_dtype_len < len(fill_value): - # Fix numpy's broken array string type behaviour which causes - # X.filled() placeholder entries to be no longer than - # non-placeholder entries - warnings.warn( - "Changing numpy array dtype internally to " - "accommodate fill_value string length" - ) - m = x.mask - x = np.array(x.filled(), dtype=kind + str(len(fill_value))) - x[m] = fill_value - else: - x = x.filled() - else: - x = x.filled() - else: - x = np.array(x, copy=False) - - if x.dtype.kind not in "biufcmMOSUV": - raise TypeError("Unsupported array dtype") - - if x.size == 1 and x.ndim == 0: - x = np.array((x,)) - - return x, np.array(fill_value) - - -def _map_observations_to_integers(Symbol_matrices, Fill_values): - assert len(Symbol_matrices) == len(Fill_values) - FILL_VALUE = -1 - if np.any([A.dtype != "int" for A in Symbol_matrices]) or np.any( - np.array(Fill_values) != FILL_VALUE - ): - L = sklearn.preprocessing.LabelEncoder() - F = [np.atleast_1d(v) for v in Fill_values] - L.fit(np.concatenate([A.ravel() for A in Symbol_matrices] + F)) - Symbol_matrices = [ - L.transform(A.ravel()).reshape(A.shape) for A in Symbol_matrices - ] - 
Fill_values = [L.transform(np.atleast_1d(f)) for f in Fill_values] - - for A, f in zip(Symbol_matrices, Fill_values): - assert not np.any(A == FILL_VALUE) - A[A == f] = FILL_VALUE - - assert np.all([A.dtype == "int" for A in Symbol_matrices]) - return Symbol_matrices, FILL_VALUE - - -def _isnan(X): - X = np.array(X, copy=False) - if X.dtype in ("int", "float"): - return np.isnan(X) - else: - f = np.vectorize(_isnan_element) - return f(X) - - -def _isnan_element(x): - if isinstance(x, type(np.nan)): - return np.isnan(x) - else: - return False - - -def _determine_number_additional_empty_bins( - Counts, Alphabet, Full_Alphabet, fill_value -): - alphabet_sizes = np.sum(np.atleast_2d(Full_Alphabet) != fill_value, axis=-1) - if np.any(alphabet_sizes != fill_value): - joint_alphabet_size = np.prod(alphabet_sizes[alphabet_sizes > 0]) - if joint_alphabet_size <= 0: - raise ValueError( - "Numerical overflow detected. Joint alphabet " "size too large." - ) - else: - joint_alphabet_size = 0 - return joint_alphabet_size - np.sum( - np.all(np.atleast_2d(Alphabet) != fill_value, axis=0) - ) - - -def _estimate_probabilities(Counts, estimator, n_additional_empty_bins=0): - # TODO Documentation should present the following guidelines: - # 1) Good-Turing may be used if slope requirement satisfied and if - # unobserved symbols have been defined (TODO Clarify what the requirement - # is) - # 2) James-Stein approach may be used as an alternative - # 3) Dirichlet prior may be used in all other cases - - assert (np.sum(Counts) > 0) - assert (np.all(Counts.astype('int') == Counts)) - assert (n_additional_empty_bins >= 0) - Counts = Counts.astype('int') - - if isinstance(estimator, str): - estimator = estimator.upper().replace(' ', '') - - if np.isreal(estimator) or estimator in ('ML', 'PERKS', 'MINIMAX'): - if np.isreal(estimator): - alpha = estimator - elif estimator == 'PERKS': - alpha = 1.0 / (Counts.size + n_additional_empty_bins) - elif estimator == 'MINIMAX': - alpha = np.sqrt(np.sum(Counts)) / \ - (Counts.size + n_additional_empty_bins) - else: - alpha = 0 - Theta = (Counts + alpha) / \ - (1.0 * np.sum(Counts) + alpha * (Counts.size + n_additional_empty_bins)) - # Theta_0 is the probability mass assigned to each additional empty bin - if n_additional_empty_bins > 0: - Theta_0 = alpha / (1.0 * np.sum(Counts) + - alpha * (Counts.size + n_additional_empty_bins)) - else: - Theta_0 = 0 - - elif estimator == 'GOOD-TURING': - # TODO We could also add a Chen-Chao vocabulary size estimator (See - # Bhat Suma's thesis) - - # The following notation is based on Gale and Sampson (1995) - # Determine histogram of counts N_r (index r denotes count) - X = np.sort(Counts) - B = X[1:] != X[:-1] # Compute symbol change indicators - I = np.append(np.where(B), X.size - 1) # Obtain symbol change positions - N_r = np.zeros(X[I[-1]] + 1) - N_r[X[I]] = np.diff(np.append(-1, I)) # Compute run lengths - N_r[0] = 0 # Ensures that unobserved symbols do not interfere - - # Compute Z_r, a locally averaged version of N_r - R = np.where(N_r)[0] - Q = np.append(0, R[:-1]) - T = np.append(R[1:], 2 * R[-1] - Q[-1]) - Z_r = np.zeros_like(N_r) - Z_r[R] = N_r[R] / (0.5 * (T - Q)) - - # Fit least squares regression line to plot of log(Z_r) versus log(r) - x = np.log10(np.arange(1, Z_r.size)) - with np.errstate(invalid='ignore', divide='ignore'): - y = np.log10(Z_r[1:]) - x = x[np.isfinite(y)] - y = y[np.isfinite(y)] - m, c = np.linalg.lstsq(np.vstack([x, np.ones(x.size)]).T, y, - rcond=None)[0] - if m >= -1: - warnings.warn("Regression slope < -1 
requirement in linear " - "Good-Turing estimate not satisfied") - # Compute smoothed value of N_r based on interpolation - # We need to refer to SmoothedN_{r+1} for all observed values of r - SmoothedN_r = np.zeros(N_r.size + 1) - SmoothedN_r[1:] = 10 ** (np.log10(np.arange(1, SmoothedN_r.size)) * - m + c) - - # Determine threshold value of r at which to use smoothed values of N_r - # (SmoothedN_r), as apposed to straightforward N_r. - # Variance of Turing estimate - with np.errstate(invalid='ignore', divide='ignore'): - VARr_T = (np.arange(N_r.size) + 1) ** 2 * \ - (1.0 * np.append(N_r[1:], 0) / (N_r ** 2)) * \ - (1 + np.append(N_r[1:], 0) / N_r) - x = (np.arange(N_r.size) + 1) * 1.0 * np.append(N_r[1:], 0) / N_r - y = (np.arange(N_r.size) + 1) * \ - 1.0 * SmoothedN_r[1:] / (SmoothedN_r[:-1]) - assert (np.isinf(VARr_T[0]) or np.isnan(VARr_T[0])) - turing_is_sig_diff = np.abs(x - y) > 1.96 * np.sqrt(VARr_T) - assert (turing_is_sig_diff[0] == np.array(False)) - # NB: 0th element can be safely ignored, since always 0 - T = np.where(turing_is_sig_diff == np.array(False))[0] - if T.size > 1: - thresh_r = T[1] - # Use smoothed estimates from the first non-significant - # np.abs(SmoothedN_r-N_r) position onwards - SmoothedN_r[:thresh_r] = N_r[:thresh_r] - else: - # Use only non-smoothed estimates (except for SmoothedN_r[-1]) - SmoothedN_r[:-1] = N_r - - # Estimate probability of encountering one particular symbol among the - # objects observed r times, r>0 - p_r = np.zeros(N_r.size) - N = np.sum(Counts) - p_r[1:] = (np.arange(1, N_r.size) + 1) * \ - 1.0 * SmoothedN_r[2:] / (SmoothedN_r[1:-1] * N) - # Estimate probability of observing any unseen symbol - p_r[0] = 1.0 * N_r[1] / N - - # Assign probabilities to observed symbols - Theta = np.array([p_r[r] for r in Counts]) - Theta[Counts == 0] = 0 - - # Normalise probabilities for observed symbols, so that they sum to one - if np.any(Counts == 0) or n_additional_empty_bins > 0: - Theta = (1 - p_r[0]) * Theta / np.sum(Theta) - else: - warnings.warn("No unobserved outcomes specified. 
Disregarding the " - "probability mass allocated to any unobserved " - "outcomes.") - Theta = Theta / np.sum(Theta) - - # Divide p_0 among unobserved symbols - with np.errstate(invalid='ignore', divide='ignore'): - p_emptybin = p_r[0] / (np.sum(Counts == 0) + - n_additional_empty_bins) - Theta[Counts == 0] = p_emptybin - # Theta_0 is the probability mass assigned to each additional empty bin - if n_additional_empty_bins > 0: - Theta_0 = p_emptybin - else: - Theta_0 = 0 - - elif estimator == 'JAMES-STEIN': - Theta, _ = _estimate_probabilities(Counts, 'ML') - p_uniform = 1.0 / (Counts.size + n_additional_empty_bins) - with np.errstate(invalid='ignore', divide='ignore'): - Lambda = (1 - np.sum(Theta ** 2)) / \ - ((np.sum(Counts) - 1) * - (np.sum((p_uniform - Theta) ** 2) + - n_additional_empty_bins * p_uniform ** 2)) - - if Lambda > 1: - Lambda = 1 - elif Lambda < 0: - Lambda = 0 - elif np.isnan(Lambda): - Lambda = 1 - - Theta = Lambda * p_uniform + (1 - Lambda) * Theta - # Theta_0 is the probability mass assigned to each additional empty bin - if n_additional_empty_bins > 0: - Theta_0 = Lambda * p_uniform - else: - Theta_0 = 0 - else: - raise ValueError("invalid value specified for estimator") - - return Theta, Theta_0 - - -def _remove_counts_at_fill_value(Counts, Alphabet, fill_value): - I = np.any(np.atleast_2d(Alphabet) == fill_value, axis=0) - if np.any(I): - Counts = Counts[~I] - Alphabet = Alphabet.T[~I].T - return (Counts, Alphabet) - - -def _cartesian_product_apply(X, Y, function, Alphabet_X=None, Alphabet_Y=None): - assert X.ndim > 0 and Y.ndim > 0 - assert X.size > 0 and Y.size > 0 - if Alphabet_X is not None or Alphabet_Y is not None: - assert Alphabet_X.ndim > 0 and Alphabet_Y.ndim > 0 - assert Alphabet_X.size > 0 and Alphabet_Y.size > 0 - - H = np.empty(np.append(X.shape[:-1], Y.shape[:-1]).astype("int")) - if H.ndim > 0: - H[:] = np.NaN - else: - H = np.float64(np.NaN) - - X = np.reshape(X, (-1, X.shape[-1])) - Y = np.reshape(Y, (-1, Y.shape[-1])) - if Alphabet_X is not None or Alphabet_Y is not None: - Alphabet_X = np.reshape(Alphabet_X, (-1, Alphabet_X.shape[-1])) - Alphabet_Y = np.reshape(Alphabet_Y, (-1, Alphabet_Y.shape[-1])) - orig_shape_H = H.shape - H = np.reshape(H, (-1, 1)) - - n = 0 - for i in range(X.shape[0]): - for j in range(Y.shape[0]): - if Alphabet_X is not None or Alphabet_Y is not None: - H[n] = function(X[i], Y[j], Alphabet_X[i], Alphabet_Y[j]) - else: - H[n] = function(X[i], Y[j]) - n = n + 1 - - # Reverse re-shaping - H = np.reshape(H, orig_shape_H) - - return H - - -def _verify_alphabet_sufficiently_large(X, Alphabet, fill_value): - assert not np.any(X == np.array(None)) - assert not np.any(Alphabet == np.array(None)) - for i in range(X.shape[0]): - I = X[i] != fill_value - J = Alphabet[i] != fill_value - # NB: This causes issues when both arguments contain None. But it is - # always called after observations have all been mapped to integers. 
- if np.setdiff1d(X[i, I], Alphabet[i, J]).size > 0: - raise ValueError( - "provided alphabet does not contain all observed " "outcomes" - ) - - -def _vstack_pad(arrays, fill_value): - max_length = max([A.shape[-1] for A in arrays]) - arrays = [ - np.append( - A, - np.tile( - fill_value, - np.append(A.shape[:-1], max_length - A.shape[-1]).astype(int), - ), - ) - for A in arrays - ] - return np.vstack(arrays) diff --git a/bamt/graph/__init__.py b/bamt/graph/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bamt/graph/dag.py b/bamt/graph/dag.py new file mode 100644 index 0000000..03a1770 --- /dev/null +++ b/bamt/graph/dag.py @@ -0,0 +1,24 @@ +from .graph import Graph + + +class DirectedAcyclicGraph(Graph): + def __init__(self): + super().__init__() + + def add_edge(self): + pass + + def remove_edge(self): + pass + + def get_parents(self): + pass + + def get_children(self): + pass + + def get_edges(self): + pass + + def get_nodes(self): + pass diff --git a/bamt/graph/graph.py b/bamt/graph/graph.py new file mode 100644 index 0000000..c2628fc --- /dev/null +++ b/bamt/graph/graph.py @@ -0,0 +1,30 @@ +from abc import ABC, abstractmethod + + +class Graph(ABC): + def __init__(self): + pass + + @abstractmethod + def add_edge(self): + pass + + @abstractmethod + def remove_edge(self): + pass + + @abstractmethod + def get_parents(self): + pass + + @abstractmethod + def get_children(self): + pass + + @abstractmethod + def get_edges(self): + pass + + @abstractmethod + def get_nodes(self): + pass diff --git a/bamt/log.py b/bamt/log.py deleted file mode 100644 index 74d65fc..0000000 --- a/bamt/log.py +++ /dev/null @@ -1,67 +0,0 @@ -import logging -import logging.config -import os - - -class BamtLogger: - def __init__(self): - self.file_handler = False - self.enabled = True - self.base = os.path.join(os.path.dirname(os.path.abspath(__file__))) - self.log_file_path = os.path.join(self.base, "logging.conf") - logging.config.fileConfig(self.log_file_path) - - self.loggers = dict( - logger_builder=logging.getLogger("builder"), - logger_network=logging.getLogger("network"), - logger_preprocessor=logging.getLogger("preprocessor"), - logger_nodes=logging.getLogger("nodes"), - logger_display=logging.getLogger("display"), - ) - - def has_handler(self, logger, handler_type): - """Check if a logger has a handler of a specific type.""" - return any(isinstance(handler, handler_type) for handler in logger.handlers) - - def remove_handler_type(self, logger, handler_type): - """Remove all handlers of a specific type from a logger.""" - for handler in logger.handlers[:]: - if isinstance(handler, handler_type): - logger.removeHandler(handler) - - def switch_console_out(self, value: bool): - """ - Turn off console output from logger. 
- """ - assert isinstance(value, bool) - handler_class = logging.StreamHandler if not value else logging.NullHandler - - for logger in self.loggers.values(): - if self.has_handler(logger, handler_class): - self.remove_handler_type(logger, handler_class) - logger.addHandler(logging.NullHandler() if not value else logging.root.handlers[0]) - - def switch_file_out(self, value: bool, log_file: str): - """ - Send all messages in file - """ - assert isinstance(value, bool) - - for logger in self.loggers.values(): - if value: - file_handler = logging.FileHandler(log_file, mode="a") - file_handler.setFormatter(logging.root.handlers[0].formatter) - logger.addHandler(file_handler) - else: - self.remove_handler_type(logger, logging.FileHandler) - - -bamt_logger = BamtLogger() - -( - logger_builder, - logger_network, - logger_preprocessor, - logger_nodes, - logger_display, -) = list(bamt_logger.loggers.values()) diff --git a/bamt/logging.conf b/bamt/logging.conf deleted file mode 100644 index 87663ef..0000000 --- a/bamt/logging.conf +++ /dev/null @@ -1,51 +0,0 @@ -[loggers] -keys=root, preprocessor, builder, nodes, network, display - -[handlers] -keys=consoleHandler - -[formatters] -keys=simpleFormatter - -[logger_root] -level=INFO -handlers=consoleHandler - -[logger_preprocessor] -level=INFO -qualname=preprocessor -handlers=consoleHandler -propagate=0 - -[logger_network] -level=INFO -qualname=network -handlers=consoleHandler -propagate=0 - -[logger_builder] -level=INFO -qualname=builder -handlers=consoleHandler -propagate=0 - -[logger_nodes] -level=INFO -qualname=nodes -handlers=consoleHandler -propagate=0 - -[logger_display] -level=INFO -qualname=display -handlers=consoleHandler -propagate=0 - -[handler_consoleHandler] -class=StreamHandler -level=INFO -formatter=simpleFormatter -args=(sys.stdout,) - -[formatter_simpleFormatter] -format=%(asctime)s | %(levelname)-8s | %(filename)s-%(funcName)s-%(lineno)04d | %(message)s diff --git a/bamt/mi_entropy_gauss.py b/bamt/mi_entropy_gauss.py deleted file mode 100644 index 48b1ef9..0000000 --- a/bamt/mi_entropy_gauss.py +++ /dev/null @@ -1,310 +0,0 @@ -import math -import sys -from copy import copy -from typing import List - -import numpy as np -import pandas as pd - -from bamt.external.pyBN.utils.independence_tests import mutual_information, entropy -from bamt.preprocess.discretization import get_nodes_type -from bamt.preprocess.graph import edges_to_dict -from bamt.preprocess.numpy_pandas import loc_to_DataFrame - - -def query_filter(data: pd.DataFrame, columns: List, values: List): - """ - Filters the data according to the column-value list - Arguments - ---------- - *data* : pandas.DataFrame - Returns - ------- - *data_trim* : pandas.DataFrame - Filtered data. - Effects - ------- - None - """ - data_copy = copy(data) - filter_str = "`" + str(columns[0]) + "`" + " == " + str(values[0]) - if len(columns) == 1: - return data_copy.query(filter_str) - else: - for i in range(1, len(columns)): - filter_str += " & " + "`" + str(columns[i]) + "`" + " == " + str(values[i]) - data_trim = data_copy.query(filter_str) - return data_trim - - -def entropy_gauss(pd_data): - """ - Calculate entropy for Gaussian multivariate distributions. - Arguments - ---------- - *data* : pd.DataFrame - Returns - ------- - *entropy* : a float - The entropy for Gaussian multivariate distributions. 
- Effects - ------- - None - """ - if not isinstance(pd_data, pd.Series): - data = copy(pd_data).values.T - else: - data = np.array(copy(pd_data)).T - if data.size == 0: - return 0.0 - flag_row = False - flag_col = False - - if isinstance(data[0], np.float64): - flag_row = True - elif (len(data[0]) < 2) | (data.ndim < 2): - flag_row = True - elif data.shape[0] < 2: - flag_row = True - if isinstance(copy(data).T[0], np.float64): - flag_col = True - elif (len(copy(data).T) < 2) | (copy(data).T.ndim < 2): - flag_col = True - elif data.shape[1] < 2: - flag_col = True - - if flag_row & flag_col: - return sys.float_info.max - elif flag_row | flag_col: - var = np.var(data) - if var > 1e-16: - return 0.5 * (1 + math.log(var * 2 * math.pi)) - else: - return sys.float_info.min - else: - var = np.linalg.det(np.cov(data)) - N = var.ndim - if var > 1e-16: - return 0.5 * (N * (1 + math.log(2 * math.pi)) + math.log(var)) - else: - return sys.float_info.min - - -def entropy_all(data, method="MI"): - """ - For one varibale, H(X) is equal to the following: - -1 * sum of p(x) * log(p(x)) - For two variables H(X|Y) is equal to the following: - sum over x,y of p(x,y)*log(p(y)/p(x,y)) - For three variables, H(X|Y,Z) is equal to the following: - -1 * sum of p(x,y,z) * log(p(x|y,z)), - where p(x|y,z) = p(x,y,z)/p(y)*p(z) - Arguments - ---------- - *data* : pd.DataFrame - Returns - ------- - *H* : entropy value""" - if isinstance(data, np.ndarray): - return entropy_all(loc_to_DataFrame(data), method=method) - elif isinstance(data, pd.Series): - return entropy_all(pd.DataFrame(data), method) - elif isinstance(data, pd.DataFrame): - nodes_type = get_nodes_type(data) - column_disc = [] - for key in nodes_type: - if nodes_type[key] == "disc": - column_disc.append(key) - column_cont = [] - for key in nodes_type: - if nodes_type[key] == "cont": - column_cont.append(key) - data_disc = data[column_disc] - data_cont = data[column_cont] - - if len(column_cont) == 0: - return entropy(data_disc.values) - elif len(column_disc) == 0: - return entropy_gauss(data_cont) - else: - H_disc = entropy(data_disc.values) - dict_comb = {} - comb_prob = {} - for i in range(len(data_disc)): - row = data_disc.iloc[i] - comb = "" - for _, val in row.items(): - comb = comb + str(val) + ", " - if comb not in dict_comb: - dict_comb[comb] = row - comb_prob[comb] = 1 - else: - comb_prob[comb] += 1 - - H_cond = 0.0 - for key in list(dict_comb.keys()): - filtered_data = query_filter(data, column_disc, list(dict_comb[key])) - filtered_data = filtered_data[column_cont] - if comb_prob[key] == 1: - if (method == "BIC") | (method == "AIC"): - H_cond += ( - comb_prob[key] - / len(data_disc) - * entropy_gauss(data[column_cont]) - ) - else: - H_cond += comb_prob[key] / len(data_disc) * sys.float_info.max - else: - H_cond += ( - comb_prob[key] / len(data_disc) * entropy_gauss(filtered_data) - ) - if (method == "BIC") | (method == "AIC"): - if H_cond > entropy_gauss(data[column_cont]): - H_cond = entropy_gauss(data[column_cont]) - - return H_disc + H_cond - - -def entropy_cond(data, column_cont, column_disc, method): - data_cont = data[column_cont] - data_disc = data[column_disc] - H_gauss = entropy_gauss(data_cont) - H_cond = 0.0 - - dict_comb = {} - comb_prob = {} - for i in range(len(data_disc)): - row = data_disc.iloc[i] - comb = "" - for _, val in row.items(): - comb = comb + str(val) + ", " - if comb not in dict_comb: - dict_comb[comb] = row - comb_prob[comb] = 1 - else: - comb_prob[comb] += 1 - - for key in list(dict_comb.keys()): - filtered_data = 
query_filter(data, column_disc, list(dict_comb[key])) - filtered_data = filtered_data[column_cont] - if comb_prob[key] == 1: - if (method == "BIC") | (method == "AIC"): - H_cond += comb_prob[key] / len(data_disc) * H_gauss - else: - H_cond += comb_prob[key] / len(data_disc) * sys.float_info.max - else: - H_cond += comb_prob[key] / len(data_disc) * entropy_gauss(filtered_data) - if (method == "BIC") | (method == "AIC"): - if H_cond > H_gauss: - return H_gauss - else: - return H_cond - return H_cond - - -def mi_gauss(data, method="MI", conditional=False): - """ - Calculate Mutual Information based on entropy. - In the case of continuous uses entropy for Gaussian multivariate distributions. - Arguments - ---------- - *data* : pandas.DataFrame - Returns - ------- - *MI* : a float - The Mutual Information - Effects - ------- - None - Notes - ----- - - Need to preprocess data with code_categories - """ - if isinstance(data, np.ndarray): - return mi_gauss(loc_to_DataFrame(data), method, conditional) - elif isinstance(data, pd.Series): - return mi_gauss(pd.DataFrame(data)) - elif isinstance(data, pd.DataFrame): - nodes_type = get_nodes_type(data) - if conditional: - # Hill-Climbing does not use conditional MI, but other algorithms may require it - # At the moment it counts on condition of the last row in the list - # of columns - print("Warning: conditional == True") - nodes_type_trim = copy(nodes_type) - data_trim = copy(data) - list_keys = list(nodes_type_trim.keys) - del nodes_type_trim[list_keys[-1]] - del data_trim[list_keys[-1]] - return mi_gauss(data, nodes_type, method) - mi_gauss( - data_trim, nodes_type, method - ) - else: - column_disc = [] - for key in nodes_type: - if nodes_type[key] == "disc": - column_disc.append(key) - column_cont = [] - for key in nodes_type: - if nodes_type[key] == "cont": - column_cont.append(key) - data_disc = data[column_disc] - data_cont = data[column_cont] - - H_gauss = 0.0 - H_cond = 0.0 - - if len(column_cont) == 0: - return mutual_information(data_disc.values, conditional=False) - elif len(column_disc) == 0: - if len(column_cont) == 1: - return entropy_gauss(data_cont) - else: - data_last = data_cont[[column_cont[-1]]] - column_cont_trim = copy(column_cont) - del column_cont_trim[-1] - data_cont_trim = data[column_cont_trim] - - H_gauss = ( - entropy_gauss(data_last) - + entropy_gauss(data_cont_trim) - - entropy_gauss(data_cont) - ) - H_gauss = min( - H_gauss, entropy_gauss(data_last), entropy_gauss(data_cont_trim) - ) - # H_gauss = entropy_gauss(data_cont) - H_cond = 0.0 - else: - H_gauss = entropy_gauss(data_cont) - H_cond = entropy_cond(data, column_cont, column_disc, method) - - return H_gauss - H_cond - - -def mi(edges: list, data: pd.DataFrame, method="MI"): - """ - Bypasses all nodes and summarizes scores, - taking into account the parent-child relationship. 
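The mi() function whose docstring appears just above sums a score over node families: each node is scored together with its parents, and parentless columns are scored on their own. A minimal illustration follows; it is not from the patch, and edges_to_dict_sketch merely mimics how edges_to_dict appears to be used here (mapping each child to the list of its parents).

def edges_to_dict_sketch(edges):
    # child -> list of parents, keyed only by nodes that have at least one parent
    parents = {}
    for parent, child in edges:
        parents.setdefault(child, []).append(parent)
    return parents

edges = [("A", "B"), ("A", "C"), ("B", "C")]
print(edges_to_dict_sketch(edges))  # {'B': ['A'], 'C': ['A', 'B']}
# mi() would then score data[["B", "A"]] and data[["C", "A", "B"]] with mi_gauss,
# score the parentless column "A" on its own, and sum the results.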
- Arguments - ---------- - *edges* : list - *data* : pd.DataFrame - Returns - ------- - *sum_score* : float - Effects - ------- - None - """ - parents_dict = edges_to_dict(edges) - sum_score = 0.0 - nodes_with_edges = parents_dict.keys() - for var in nodes_with_edges: - child_parents = [var] - child_parents.extend(parents_dict[var]) - sum_score += mi_gauss(copy(data[child_parents]), method) - nodes_without_edges = list(set(data.columns).difference(set(nodes_with_edges))) - for var in nodes_without_edges: - sum_score += mi_gauss(copy(data[var]), method) - return sum_score diff --git a/bamt/models/__init__.py b/bamt/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bamt/models/probabilistic_structural_models/__init__.py b/bamt/models/probabilistic_structural_models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bamt/models/probabilistic_structural_models/bayesian_network.py b/bamt/models/probabilistic_structural_models/bayesian_network.py new file mode 100644 index 0000000..36f6f2f --- /dev/null +++ b/bamt/models/probabilistic_structural_models/bayesian_network.py @@ -0,0 +1,19 @@ +from .probabilistic_structural_model import ProbabilisticStructuralModel +from abc import abstractmethod + + +class BayesianNetwork(ProbabilisticStructuralModel): + def __init__(self): + super().__init__() + + @abstractmethod + def fit(self): + pass + + @abstractmethod + def predict(self): + pass + + @abstractmethod + def sample(self): + pass diff --git a/bamt/models/probabilistic_structural_models/composite_bayesian_network.py b/bamt/models/probabilistic_structural_models/composite_bayesian_network.py new file mode 100644 index 0000000..d8c2f86 --- /dev/null +++ b/bamt/models/probabilistic_structural_models/composite_bayesian_network.py @@ -0,0 +1,15 @@ +from .bayesian_network import BayesianNetwork + + +class CompositeBayesianNetwork(BayesianNetwork): + def __init__(self): + super().__init__() + + def fit(self): + pass + + def predict(self): + pass + + def sample(self): + pass diff --git a/bamt/models/probabilistic_structural_models/continuous_bayesian_network.py b/bamt/models/probabilistic_structural_models/continuous_bayesian_network.py new file mode 100644 index 0000000..d8c2f86 --- /dev/null +++ b/bamt/models/probabilistic_structural_models/continuous_bayesian_network.py @@ -0,0 +1,15 @@ +from .bayesian_network import BayesianNetwork + + +class ContinuousBayesianNetwork(BayesianNetwork): + def __init__(self): + super().__init__() + + def fit(self): + pass + + def predict(self): + pass + + def sample(self): + pass diff --git a/bamt/models/probabilistic_structural_models/discrete_bayesian_network.py b/bamt/models/probabilistic_structural_models/discrete_bayesian_network.py new file mode 100644 index 0000000..fcec53e --- /dev/null +++ b/bamt/models/probabilistic_structural_models/discrete_bayesian_network.py @@ -0,0 +1,15 @@ +from .bayesian_network import BayesianNetwork + + +class DiscreteBayesianNetwork(BayesianNetwork): + def __init__(self): + super().__init__() + + def fit(self): + pass + + def predict(self): + pass + + def sample(self): + pass diff --git a/bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py b/bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py new file mode 100644 index 0000000..8dcf844 --- /dev/null +++ b/bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py @@ -0,0 +1,15 @@ +from .bayesian_network import BayesianNetwork + + +class HybridBayesianNetwork(BayesianNetwork): + def
__init__(self): + super().__init__() + + def fit(self): + pass + + def predict(self): + pass + + def sample(self): + pass diff --git a/bamt/models/probabilistic_structural_models/probabilistic_structural_model.py b/bamt/models/probabilistic_structural_models/probabilistic_structural_model.py new file mode 100644 index 0000000..b2b3b56 --- /dev/null +++ b/bamt/models/probabilistic_structural_models/probabilistic_structural_model.py @@ -0,0 +1,14 @@ +from abc import ABC, abstractmethod + + +class ProbabilisticStructuralModel(ABC): + def __init__(self): + pass + + @abstractmethod + def fit(self): + pass + + @abstractmethod + def predict(self): + pass diff --git a/bamt/networks/__init__.py b/bamt/networks/__init__.py deleted file mode 100644 index 7f8df8f..0000000 --- a/bamt/networks/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from bamt.networks.base import BaseNetwork -from bamt.networks.big_brave_bn import BigBraveBN -from bamt.networks.composite_bn import CompositeBN -from bamt.networks.continuous_bn import ContinuousBN -from bamt.networks.discrete_bn import DiscreteBN -from bamt.networks.hybrid_bn import HybridBN diff --git a/bamt/networks/base.py b/bamt/networks/base.py deleted file mode 100644 index 3d8af8d..0000000 --- a/bamt/networks/base.py +++ /dev/null @@ -1,998 +0,0 @@ -import json -import os.path as path -import random -import re -from copy import deepcopy -from typing import Dict, Tuple, List, Callable, Optional, Type, Union, Any, Sequence - -import numpy as np -import pandas as pd -from joblib import Parallel, delayed -from pgmpy.estimators import K2Score -from sklearn.preprocessing import LabelEncoder -from tqdm import tqdm - -import bamt.builders as builders -from bamt.builders.builders_base import ParamDict -from bamt.builders.evo_builder import EvoStructureBuilder -from bamt.builders.hc_builder import HCStructureBuilder -from bamt.display import plot_, get_info_ -from bamt.external.pyitlib.DiscreteRandomVariableUtils import ( - entropy, - information_mutual, - information_mutual_conditional, - entropy_conditional, -) -from bamt.log import logger_network -from bamt.nodes.base import BaseNode -from bamt.utils import GraphUtils, serialization_utils, check_utils - - -class BaseNetwork(object): - """ - Base class for Bayesian Network - """ - - def __init__(self): - """ - nodes: a list of nodes instances - edges: a list of edges - distributions: dict - """ - self.sf_name = None - self.type = "Abstract" - self._allowed_dtypes = ["Abstract"] - self.nodes = [] - self.edges = [] - self.weights = {} - self.descriptor = {"types": {}, "signs": {}} - self.distributions = {} - self.has_logit = False - self.use_mixture = False - self.encoders = {} - - @property - def nodes_names(self) -> List[str]: - return [node.name for node in self.nodes] - - def __getitem__(self, node_name: str) -> Type[BaseNode]: - if node_name in self.nodes_names: - index = self.nodes_names.index(node_name) - return self.nodes[index] - - def validate(self, descriptor: Dict[str, Dict[str, str]]) -> bool: - types = descriptor["types"] - return ( - True if all([a in self._allowed_dtypes for a in types.values()]) else False - ) - - def update_descriptor(self): - new_nodes_names = [node.name for node in self.nodes] - self.descriptor["types"] = { - node: type - for node, type in self.descriptor["types"].items() - if node in new_nodes_names - } - if "cont" in self.descriptor["types"].values(): - self.descriptor["signs"] = { - node: sign - for node, sign in self.descriptor["signs"].items() - if node in new_nodes_names - } - - 
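The probabilistic_structural_models package introduced above only fixes the interface: fit/predict/sample stay abstract and the concrete subclasses are empty stubs for now. A hedged sketch of how a concrete network might satisfy that contract, assuming the package layout from this commit; the dataframe-based signatures and the marginal-frequency model are illustrative assumptions, not part of the diff:

    import pandas as pd
    from bamt.models.probabilistic_structural_models.bayesian_network import BayesianNetwork

    class ToyMarginalNetwork(BayesianNetwork):
        """Illustrative only: learns independent marginal frequencies per column."""

        def __init__(self):
            super().__init__()
            self.marginals = {}

        def fit(self, data: pd.DataFrame):
            # hypothetical signature; the abstract methods take no arguments yet
            self.marginals = {c: data[c].value_counts(normalize=True) for c in data.columns}
            return self

        def predict(self, data: pd.DataFrame) -> dict:
            # most frequent value per requested column
            return {c: self.marginals[c].idxmax() for c in data.columns}

        def sample(self, n: int) -> pd.DataFrame:
            return pd.DataFrame({
                c: dist.sample(n, weights=dist.values, replace=True).index
                for c, dist in self.marginals.items()
            })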
def add_nodes(self, descriptor: Dict[str, Dict[str, str]]): - """ - Function for initializing nodes in Bayesian Network - descriptor: dict with types and signs of nodes - """ - if not self.validate(descriptor=descriptor): - if not self.type == "Hybrid" or not self.type == "Composite": - logger_network.error( - f"{self.type} BN does not support {'discrete' if self.type == 'Continuous' else 'continuous'} data" - ) - return - else: - logger_network.error( - f"Descriptor validation failed due to wrong type of column(s)." - ) - return - elif ["Abstract"] in self._allowed_dtypes: - return None - self.descriptor = descriptor - worker_1 = builders.builders_base.VerticesDefiner(descriptor, regressor=None) - - # first stage - worker_1.skeleton["V"] = worker_1.vertices - # second stage - worker_1.overwrite_vertex( - has_logit=False, - use_mixture=self.use_mixture, - classifier=None, - regressor=None, - ) - self.nodes = worker_1.vertices - - def add_edges( - self, - data: pd.DataFrame, - scoring_function: Union[Tuple[str, Callable], Tuple[str]] = ("K2", K2Score), - progress_bar: bool = True, - classifier: Optional[object] = None, - regressor: Optional[object] = None, - params: Optional[ParamDict] = None, - optimizer: str = "HC", - **kwargs, - ): - """ - Base function for Structure learning - scoring_function: tuple with the following format (NAME, scoring_function) or (NAME, ) - Params: - init_edges: list of tuples, a graph to start learning with - remove_init_edges: allows changes in a model defined by user - white_list: list of allowed edges - """ - if not self.has_logit and check_utils.is_model(classifier): - logger_network.error("Classifiers dict with use_logit=False is forbidden.") - return None - - # params validation - if params: - # init_edges validation - if not self.has_logit and "init_edges" in params.keys(): - type_map = np.array( - [ - [ - self.descriptor["types"][node1], - self.descriptor["types"][node2], - ] - for node1, node2 in params["init_edges"] - ] - ) - failed = (type_map[:, 0] == "cont") & ( - (type_map[:, 1] == "disc") | (type_map[:, 1] == "disc_num") - ) - if sum(failed): - logger_network.warning( - f"Edges between continuous nodes and disc nodes are forbidden (has_logit = {self.has_logit}), " - f"they will be ignored. 
Indexes: {np.where(failed)[0]}" - ) - params["init_edges"] = [ - params["init_edges"][i] - for i in range(len(params["init_edges"])) - if i not in np.where(failed)[0] - ] - - if not self.validate(descriptor=self.descriptor): - logger_network.error( - f"{self.type} BN does not support {'discrete' if self.type == 'Continuous' else 'continuous'} data" - ) - return None - if optimizer == "HC": - worker = HCStructureBuilder( - data=data, - descriptor=self.descriptor, - scoring_function=scoring_function, - has_logit=self.has_logit, - use_mixture=self.use_mixture, - regressor=regressor, - ) - elif optimizer == "Evo": - worker = EvoStructureBuilder( - data=data, - descriptor=self.descriptor, - has_logit=self.has_logit, - use_mixture=self.use_mixture, - regressor=regressor, - ) - else: - logger_network.error(f"Optimizer {optimizer} is not supported") - return None - - self.sf_name = scoring_function[0] - - worker.build( - data=data, - params=params, - classifier=classifier, - regressor=regressor, - progress_bar=progress_bar, - **kwargs, - ) - - # update family - self.nodes = worker.skeleton["V"] - self.edges = worker.skeleton["E"] - - def calculate_weights(self, discretized_data: pd.DataFrame): - """ - Provide calculation of link strength according mutual information between node and its parent(-s) values. - """ - import bamt.utils.GraphUtils as gru - - data_descriptor = gru.nodes_types(discretized_data) - if not all([i in ["disc", "disc_num"] for i in data_descriptor.values()]): - logger_network.error( - f"calculate_weghts() method deals only with discrete data. Continuous data: " - + f"{[col for col, type in data_descriptor.items() if type not in ['disc', 'disc_num']]}" - ) - if not self.edges: - logger_network.error( - "Bayesian Network hasn't fitted yet. Please add edges with add_edges() method" - ) - if not self.nodes: - logger_network.error( - "Bayesian Network hasn't fitted yet. Please add nodes with add_nodes() method" - ) - weights = dict() - - for node in self.nodes: - parents = node.cont_parents + node.disc_parents - if parents is None: - continue - y = discretized_data[node.name].values - if len(parents) == 1: - x = discretized_data[parents[0]].values - ls_true = information_mutual(X=y, Y=x) - entropy_value = entropy(X=y) - weight = ls_true / entropy_value - weights[(parents[0], node.name)] = weight - else: - for parent_node in parents: - x = discretized_data[parent_node].values - other_parents = [tmp for tmp in parents if tmp != parent_node] - z = list() - for other_parent in other_parents: - z.append(list(discretized_data[other_parent].values)) - ls_true = np.average( - information_mutual_conditional( - x=y, y=x, z=z, cartesian_product=True - ) - ) - entropy_value = ( - np.average( - entropy_conditional(X=y, Y=z, cartesian_product=True) - ) - + 1e-8 - ) - weight = ls_true / entropy_value - weights[(parent_node, node.name)] = weight - self.weights = weights - - def set_nodes(self, nodes: List, info: Optional[Dict] = None): - """ - additional function to set nodes manually. User should be aware that - nodes must be a subclass of BaseNode. - Params: - nodes: dict with name and node (if a lot of nodes should be added) - info: descriptor - """ - if not info and not self.descriptor["types"]: - logger_network.error( - "In case of manual setting nodes user should set map for them as well." 
- ) - return - self.nodes = [] - for node in nodes: - if issubclass(type(node), BaseNode): - self.nodes.append(node) - else: - logger_network.error(f"{node} is not an instance of {BaseNode}") - - if info: - self.descriptor = info - - def set_edges(self, edges: Optional[List[Sequence[str]]] = None): - """ - additional function to set edges manually. User should be aware that - nodes must be a subclass of BaseNode. - param: edges dict with name and node (if a lot of nodes should be added) - """ - - if not self.nodes: - logger_network.error("Graph without nodes") - self.edges = [] - for node1, node2 in edges: - if isinstance(node1, str) and isinstance(node2, str): - if self[node1] and self[node2]: - if ( - not self.has_logit - and self.descriptor["types"][node1] == "cont" - and self.descriptor["types"][node2] == "disc" - ): - logger_network.warning( - f"Restricted edge detected (has_logit=False) : [{node1}, {node2}]" - ) - continue - else: - self.edges.append((node1, node2)) - else: - logger_network.error(f"Unknown nodes : [{node1}, {node2}]") - continue - else: - logger_network.error( - f"Unknown node(s) type: [{node1.__class__}, {node2.__class__}]" - ) - continue - self.update_descriptor() - - def set_structure( - self, - info: Optional[Dict] = None, - nodes: Optional[List] = None, - edges: Optional[List[Sequence[str]]] = None, - ): - """ - Function to set structure manually - info: Descriptor - nodes, edges: - """ - if nodes and (info or (self.descriptor["types"] and self.descriptor["signs"])): - if ( - any("mixture" in node.type.lower() for node in nodes) - and not self.use_mixture - ): - logger_network.error("Compatibility error: use mixture.") - return - if ( - any("logit" in node.type.lower() for node in nodes) - and not self.has_logit - ): - logger_network.error("Compatibility error: has logit.") - return - self.set_nodes(nodes=nodes, info=info) - if isinstance(edges, list): - if not self.nodes: - logger_network.error("Nodes/info detection failed.") - return - - builder = builders.builders_base.VerticesDefiner( - descriptor=self.descriptor, regressor=None - ) # init worker - builder.skeleton["V"] = builder.vertices # 1 stage - if len(edges) != 0: - # set edges and register members - self.set_edges(edges=edges) - builder.skeleton["E"] = self.edges - builder.get_family() - - builder.overwrite_vertex( - has_logit=self.has_logit, - use_mixture=self.use_mixture, - classifier=None, - regressor=None, - ) - - self.set_nodes(nodes=builder.skeleton["V"]) - - def _param_validation(self, params: Dict[str, Any]) -> bool: - if all(self[i] for i in params.keys()): - for name, info in params.items(): - try: - self[name].choose(node_info=info, pvals=[]) - except Exception as ex: - logger_network.error("Validation failed", exc_info=ex) - return False - return True - else: - logger_network.error("Param validation failed due to unknown nodes.") - return False - - def set_parameters(self, parameters: Dict): - if not self.nodes: - logger_network.error("Failed on search of BN's nodes.") - - self.distributions = parameters - - for node, data in self.distributions.items(): - if "hybcprob" in data.keys(): - if "mixture" in self[node].type.lower(): - continue - else: - if "gaussian" in self[node].type.lower(): - model_type = "regressor" - else: - model_type = "classifier" - - model = None - for v in data["hybcprob"].values(): - if v[model_type]: - model = v[model_type] - break - else: - continue - if not model: - logger_network.warning( - f"Classifier/regressor for {node} hadn't been used." 
- ) - - self[node].type = re.sub( - r"\([\s\S]*\)", f"({model})", self[node].type - ) - else: - if data.get("serialization", False): - regressor = data.get("regressor", None) - classifier = data.get("classifier", None) - self[node].type = re.sub( - r"\([\s\S]*\)", f"({regressor or classifier})", self[node].type - ) - else: - self[node].type = re.sub( - r"\([\s\S]*\)", f"({None})", self[node].type - ) - - @staticmethod - def _save_to_file(outdir: str, data: Union[dict, list]): - """ - Function to save data to json file - :param outdir: output directory - :param data: dictionary to be saved - """ - if not outdir.endswith(".json"): - raise TypeError( - f"Unappropriated file format. Expected: .json. Got: {path.splitext(outdir)[-1]}" - ) - with open(outdir, "w+") as out: - json.dump(data, out) - return True - - def save_params(self, outdir: str): - """ - Function to save BN params to json file - outdir: output directory - """ - return self._save_to_file(outdir, self.distributions) - - def save_structure(self, outdir: str): - """ - Function to save BN edges to json file - outdir: output directory - """ - return self._save_to_file(outdir, self.edges) - - def save(self, bn_name, models_dir: str = "models_dir"): - """ - Function to save the whole BN to json file. - - :param bn_name: unique name of bn user want to save. It will be used as file name (e.g. bn_name.json). - :param models_dir: if picklization is broken, joblib will serialize models in compressed files - in models directory. - - :return: saving status. - """ - distributions = deepcopy(self.distributions) - new_weights = {str(key): self.weights[key] for key in self.weights} - - to_serialize = {} - # separate logit and gaussian nodes from distributions to serialize bn's models - for node_name in distributions.keys(): - if "Mixture" in self[node_name].type: - continue - if self[node_name].type.startswith("Gaussian"): - if not distributions[node_name]["regressor"]: - continue - if ( - "Gaussian" in self[node_name].type - or "Logit" in self[node_name].type - or "ConditionalLogit" in self[node_name].type - ): - to_serialize[node_name] = [ - self[node_name].type, - distributions[node_name], - ] - - serializer = serialization_utils.ModelsSerializer( - bn_name=bn_name, models_dir=models_dir - ) - serialized_dist = serializer.serialize(to_serialize) - - for serialized_node in serialized_dist.keys(): - distributions[serialized_node] = serialized_dist[serialized_node] - - outdict = { - "info": self.descriptor, - "edges": self.edges, - "parameters": distributions, - "weights": new_weights, - } - return self._save_to_file(f"{bn_name}.json", outdict) - - def load(self, - input_data: Union[str, Dict], - models_dir: str = "/"): - """ - Function to load the whole BN from json file. - :param input_data: input path to json file with bn. - :param models_dir: directory with models. - - :return: loading status. - """ - if isinstance(input_data, str): - with open(input_data) as f: - input_dict = json.load(f) - elif isinstance(input_data, dict): - input_dict = deepcopy(input_data) - else: - logger_network.error(f"Unknown input type: {type(input_data)}") - return - - self.add_nodes(input_dict["info"]) - self.set_structure(edges=input_dict["edges"]) - - # check compatibility with father network. - if not self.use_mixture: - for node_data in input_dict["parameters"].values(): - if "hybcprob" not in node_data.keys(): - continue - else: - # Since we don't have information about types of nodes, we - # should derive it from parameters. 
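For reference, the save()/load() pair being removed here round-trips the whole network through a single JSON document keyed by "info", "edges", "parameters" and "weights", with tuple weight keys stringified on save and rebuilt with eval() on load. A minimal sketch of that file shape; the example values are made up:

    import json

    outdict = {
        "info": {"types": {"A": "disc", "B": "cont"}, "signs": {"B": "pos"}},
        "edges": [["A", "B"]],
        "parameters": {},                      # per-node distributions
        "weights": {"('A', 'B')": 0.42},       # tuple keys stringified for JSON
    }
    with open("my_bn.json", "w+") as out:
        json.dump(outdict, out)

    with open("my_bn.json") as f:
        loaded = json.load(f)
    # load() restores tuple keys from their string form
    weights = {eval(k): v for k, v in loaded["weights"].items()}
    print(weights)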
- if any( - list(node_keys.keys()) == ["covars", "mean", "coef"] - for node_keys in node_data["hybcprob"].values() - ): - logger_network.error( - f"This crucial parameter is not the same as father's parameter: use_mixture." - ) - return - - # check if edges before and after are the same.They can be different in - # the case when user sets forbidden edges. - if not self.has_logit: - if not all( - edges_before == [edges_after[0], edges_after[1]] - for edges_before, edges_after in zip(input_dict["edges"], self.edges) - ): - logger_network.error( - f"This crucial parameter is not the same as father's parameter: has_logit." - ) - return - - deserializer = serialization_utils.Deserializer(models_dir) - - to_deserialize = {} - # separate logit and gaussian nodes from distributions to deserialize bn's models - for node_name in input_dict["parameters"].keys(): - if "Mixture" in self[node_name].type: - continue - if self[node_name].type.startswith("Gaussian"): - if not input_dict["parameters"][node_name]["regressor"]: - continue - - if ( - "Gaussian" in self[node_name].type - or "Logit" in self[node_name].type - or "ConditionalLogit" in self[node_name].type - ): - if input_dict["parameters"][node_name].get("serialization", False): - to_deserialize[node_name] = [ - self[node_name].type, - input_dict["parameters"][node_name], - ] - elif "hybcprob" in input_dict["parameters"][node_name].keys(): - to_deserialize[node_name] = [ - self[node_name].type, - input_dict["parameters"][node_name], - ] - else: - continue - - deserialized_parameters = deserializer.apply(to_deserialize) - distributions = input_dict["parameters"].copy() - - for serialized_node in deserialized_parameters.keys(): - distributions[serialized_node] = deserialized_parameters[serialized_node] - - self.set_parameters(parameters=distributions) - - if input_dict.get("weights", False): - str_keys = list(input_dict["weights"].keys()) - tuple_keys = [eval(key) for key in str_keys] - weights = {} - for tuple_key in tuple_keys: - weights[tuple_key] = input_dict["weights"][str(tuple_key)] - self.weights = weights - return True - - def fit_parameters(self, data: pd.DataFrame, n_jobs: int = 1): - """ - Base function for parameter learning - """ - if data.isnull().values.any(): - logger_network.error("Dataframe contains NaNs.") - return - - if type(self).__name__ == "CompositeBN": - data = self._encode_categorical_data(data) - - # Turn all discrete values to str for learning algorithm - if "disc_num" in self.descriptor["types"].values(): - columns_names = [ - name - for name, t in self.descriptor["types"].items() - if t in ["disc_num"] - ] - data[columns_names] = data.loc[:, columns_names].astype("str") - - def worker(node): - return node.fit_parameters(data) - - results = Parallel(n_jobs=n_jobs)(delayed(worker)(node) for node in self.nodes) - - # code for debugging, do not remove - # results = [worker(node) for node in self.nodes] - - for result, node in zip(results, self.nodes): - self.distributions[node.name] = result - - def get_info(self, as_df: bool = True) -> Optional[pd.DataFrame]: - """Return a table with name, type, parents_type, parents_names""" - return get_info_(self, as_df) - - def sample( - self, - n: int, - models_dir: Optional[str] = None, - progress_bar: bool = True, - evidence: Optional[Dict[str, Union[str, int, float]]] = None, - as_df: bool = True, - predict: bool = False, - parall_count: int = 1, - filter_neg: bool = True, - ) -> Union[None, pd.DataFrame, List[Dict[str, Union[str, int, float]]]]: - """ - Sampling from Bayesian 
Network - n: int number of samples - evidence: values for nodes from user - parall_count: number of threads. Defaults to 1. - filter_neg: either filter negative vals or not. - """ - from joblib import Parallel, delayed - - random.seed() - if not self.distributions.items(): - logger_network.error( - "Parameter learning wasn't done. Call fit_parameters method" - ) - return None - if evidence: - for node in self.nodes: - if (node.type == "Discrete") & (node.name in evidence.keys()): - if not (isinstance(evidence[node.name], str)): - evidence[node.name] = str(int(evidence[node.name])) - - def wrapper(): - output = {} - for node in self.nodes: - parents = node.cont_parents + node.disc_parents - if evidence and node.name in evidence.keys(): - output[node.name] = evidence[node.name] - else: - if not parents: - pvals = None - else: - if self.type == "Discrete": - pvals = [str(output[t]) for t in parents] - else: - pvals = [output[t] for t in parents] - - # If any nan from parents, sampling from node blocked. - if any(pd.isnull(pvalue) for pvalue in pvals): - output[node.name] = np.nan - continue - node_data = self.distributions[node.name] - if models_dir and ("hybcprob" in node_data.keys()): - for obj, obj_data in node_data["hybcprob"].items(): - if "serialization" in obj_data.keys(): - if "gaussian" in node.type.lower(): - model_type = "regressor" - else: - model_type = "classifier" - if ( - obj_data["serialization"] == "joblib" - and obj_data[f"{model_type}_obj"] - ): - new_path = ( - models_dir - + f"\\{node.name.replace(' ', '_')}\\{obj}.joblib.compressed" - ) - node_data["hybcprob"][obj][ - f"{model_type}_obj" - ] = new_path - if predict: - output[node.name] = node.predict(node_data, pvals=pvals) - else: - output[node.name] = node.choose(node_data, pvals=pvals) - return output - - if predict: - seq = [] - for _ in tqdm(range(n), position=0, leave=True): - result = wrapper() - seq.append(result) - elif progress_bar: - seq = Parallel(n_jobs=parall_count)( - delayed(wrapper)() for _ in tqdm(range(n), position=0, leave=True) - ) - else: - seq = Parallel(n_jobs=parall_count)(delayed(wrapper)() for _ in range(n)) - - # code for debugging, don't remove - # seq = [] - # for _ in tqdm(range(n), position=0, leave=True): - # result = wrapper() - # seq.append(result) - - seq_df = pd.DataFrame.from_dict(seq, orient="columns") - seq_df.dropna(inplace=True) - cont_nodes = [ - c.name - for c in self.nodes - if type(c).__name__ - not in ( - "DiscreteNode", - "LogitNode", - "CompositeDiscreteNode", - "ConditionalLogitNode", - ) - ] - positive_columns = [ - c for c in cont_nodes if self.descriptor["signs"][c] == "pos" - ] - if filter_neg: - seq_df = seq_df[(seq_df[positive_columns] >= 0).all(axis=1)] - seq_df.reset_index(inplace=True, drop=True) - - seq = seq_df.to_dict("records") - sample_output = pd.DataFrame.from_dict(seq, orient="columns") - - if as_df: - if type(self).__name__ == "CompositeBN": - sample_output = self._decode_categorical_data(sample_output) - return sample_output - else: - return seq - - def predict( - self, - test: pd.DataFrame, - parall_count: int = 1, - progress_bar: bool = True, - models_dir: Optional[str] = None, - ) -> Dict[str, Union[List[str], List[int], List[float]]]: - """ - Function to predict columns from given data. - Note that train data and test data must have different columns. - Both train and test datasets must be cleaned from NaNs. - - Args: - test (pd.DataFrame): test dataset - parall_count (int, optional):number of threads. Defaults to 1. - progress_bar: verbose mode. 
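The wrapper() closure above is essentially ancestral sampling: visit nodes in an order where parents precede children, pin evidence values when given, and otherwise draw from the node's conditional given the already-sampled parent values. A stripped-down, self-contained sketch of that loop over plain conditional-sampling callables; everything here is illustrative, not the removed implementation:

    import random
    from typing import Callable, Dict, List, Optional

    def ancestral_sample(
        order: List[str],                                  # topological order of node names
        parents: Dict[str, List[str]],
        samplers: Dict[str, Callable[[dict], object]],     # name -> draw(parent_values)
        evidence: Optional[dict] = None,
    ) -> dict:
        evidence = evidence or {}
        out = {}
        for name in order:
            if name in evidence:
                out[name] = evidence[name]
                continue
            pvals = {p: out[p] for p in parents.get(name, [])}
            out[name] = samplers[name](pvals)
        return out

    # toy network: rain -> wet grass (hypothetical nodes)
    order = ["rain", "wet"]
    parents = {"rain": [], "wet": ["rain"]}
    samplers = {
        "rain": lambda pv: random.random() < 0.3,
        "wet": lambda pv: random.random() < (0.9 if pv["rain"] else 0.1),
    }
    print(ancestral_sample(order, parents, samplers, evidence={"rain": True}))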
- - Returns: - predicted data (dict): dict with column as key and predicted data as value - """ - if test.isnull().any().any(): - logger_network.error("Test data contains NaN values.") - return {} - - from joblib import Parallel, delayed - - def wrapper(bn, test: pd.DataFrame, columns: List[str], models_dir: str): - preds = {column_name: list() for column_name in columns} - - if len(test) == 1: - for i in range(test.shape[0]): - test_row = dict(test.iloc[i, :]) - for n, key in enumerate(columns): - try: - sample = bn.sample( - 1, - evidence=test_row, - predict=True, - progress_bar=False, - models_dir=models_dir, - ) - if sample.empty: - preds[key].append(np.nan) - continue - if bn.descriptor["types"][key] == "cont": - if (bn.descriptor["signs"][key] == "pos") & ( - sample.loc[0, key] < 0 - ): - # preds[key].append(np.nan) - preds[key].append(0) - else: - preds[key].append(sample.loc[0, key]) - else: - preds[key].append(sample.loc[0, key]) - except Exception as ex: - logger_network.error(ex) - preds[key].append(np.nan) - return preds - else: - logger_network.error("Wrapper for one row from pandas.DataFrame") - return {} - - columns = list(set(self.nodes_names) - set(test.columns.to_list())) - if not columns: - logger_network.error("Test data is the same as train.") - return {} - - preds = {column_name: list() for column_name in columns} - - if progress_bar: - processed_list = Parallel(n_jobs=parall_count)( - delayed(wrapper)(self, test.loc[[i]], columns, models_dir) - for i in tqdm(test.index, position=0, leave=True) - ) - else: - processed_list = Parallel(n_jobs=parall_count)( - delayed(wrapper)(self, test.loc[[i]], columns, models_dir) - for i in test.index - ) - - for i in range(test.shape[0]): - curr_pred = processed_list[i] - for n, key in enumerate(columns): - preds[key].append(curr_pred[key][0]) - - return preds - - def set_classifiers(self, classifiers: Dict[str, object]): - """ - Set classifiers for logit nodes. - classifiers: dict with node_name and Classifier - """ - if not self.has_logit: - logger_network.error("Logit nodes are forbidden.") - return None - - for node in self.nodes: - if "Logit" in node.type: - if node.name in classifiers.keys(): - node.classifier = classifiers[node.name] - node.type = re.sub( - r"\([\s\S]*\)", f"({type(node.classifier).__name__})", node.type - ) - else: - continue - - def set_regressor(self, regressors: Dict[str, object]): - """ - Set classifiers for gaussian nodes. - classifiers: dict with node_name and Classifier - """ - - for node in self.nodes: - if "Gaussian" in node.type: - if node.name in regressors.keys(): - node.regressor = regressors[node.name] - node.type = re.sub( - r"\([\s\S]*\)", f"({type(node.regressor).__name__})", node.type - ) - else: - continue - - def plot(self, output: str): - """ - Visualize a Bayesian Network. Result will be saved - in the parent directory in folder visualization_result. - output: str name of output file - """ - return plot_(output, self.nodes, self.edges) - - def markov_blanket(self, node_name, plot_to: Optional[str] = None): - """ - A method to get markov blanket of a node. - :param node_name: name of node - :param plot_to: directory to plot graph, the file must have html extension. 
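The markov_blanket() helper above delegates to GraphUtils.GraphAnalyzer; conceptually the blanket of a node is its parents, its children, and the children's other parents. A small self-contained sketch of that set computation over an edge list; the node names are toy values:

    from typing import List, Tuple

    def markov_blanket_nodes(edges: List[Tuple[str, str]], node: str) -> set:
        # parents, children, and the children's other parents (spouses)
        parents = {a for a, b in edges if b == node}
        children = {b for a, b in edges if a == node}
        spouses = {a for a, b in edges if b in children and a != node}
        return parents | children | spouses

    edges = [("A", "C"), ("B", "C"), ("C", "D"), ("E", "D")]
    print(markov_blanket_nodes(edges, "C"))   # {'A', 'B', 'D', 'E'}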
- - :return structure: json with {"nodes": [...], "edges": [...]} - """ - structure = GraphUtils.GraphAnalyzer(self).markov_blanket(node_name) - if plot_to: - plot_( - plot_to, [self[name] for name in structure["nodes"]], structure["edges"] - ) - return structure - - def find_family( - self, - node_name: str, - height: int = 1, - depth: int = 1, - with_nodes: Optional[List] = None, - plot_to: Optional[str] = None, - ): - """ - A method to get markov blanket of a node. - :param node_name: name of node - :param height: a number of layers up to node (its parents) that will be taken - :param depth: a number of layers down to node (its children) that will be taken - :param with_nodes: include nodes in return - :param plot_to: directory to plot graph, the file must have html extension. - - :return structure: json with {"nodes": [...], "edges": [...]} - """ - structure = GraphUtils.GraphAnalyzer(self).find_family( - node_name, height, depth, with_nodes - ) - - if plot_to: - plot_( - plot_to, [self[name] for name in structure["nodes"]], structure["edges"] - ) - - def fill_gaps(self, df: pd.DataFrame, **kwargs): - """ - Fill NaNs with sampled values. - - :param df: dataframe with NaNs - :param kwargs: the same params as bn.predict - - :return df, failed: filled DataFrame and list of failed rows (sometimes predict can return np.nan) - """ - if not self.distributions: - logger_network.error("To call this method you must train parameters.") - - # create a mimic row to get a dataframe from iloc method - list = [np.nan for _ in range(df.shape[1])] - df.loc["mimic"] = list - - def fill_row(df, i): - row = df.iloc[[i, -1], :].drop(["mimic"], axis=0) - - evidences = row.dropna(axis=1) - - return row.index[0], self.predict(evidences, progress_bar=False, **kwargs) - - failed = [] - for index in range(df.shape[0] - 1): - if df.iloc[index].isna().any(): - true_pos, result = fill_row(df, index) - if any(pd.isna(v[0]) for v in result.values()): - failed.append(true_pos) - continue - else: - for column, value in result.items(): - df.loc[true_pos, column] = value[0] - else: - continue - df.drop(failed, inplace=True) - return df.drop(["mimic"]), failed - - def get_dist(self, node_name: str, pvals: Optional[dict] = None): - """ - Get a distribution from node with known parent values (conditional distribution). - - :param node_name: name of node - :param pvals: parent values - """ - if not self.distributions: - logger_network.error("Empty parameters. 
Call fit_params first.") - return - node = self[node_name] - - parents = node.cont_parents + node.disc_parents - if not parents: - return self.distributions[node_name] - - pvals = [pvals[parent] for parent in parents] - - return node.get_dist(node_info=self.distributions[node_name], pvals=pvals) - - def _encode_categorical_data(self, data): - for column in data.select_dtypes(include=["object", "string"]).columns: - encoder = LabelEncoder() - data[column] = encoder.fit_transform(data[column]) - self.encoders[column] = encoder - return data - - def _decode_categorical_data(self, data): - data = data.apply( - lambda col: pd.to_numeric(col).astype(int) if col.dtype == "object" else col - ) - for column, encoder in self.encoders.items(): - data[column] = encoder.inverse_transform(data[column]) - return data diff --git a/bamt/networks/big_brave_bn.py b/bamt/networks/big_brave_bn.py deleted file mode 100644 index e4adc9a..0000000 --- a/bamt/networks/big_brave_bn.py +++ /dev/null @@ -1,105 +0,0 @@ -import math - -import numpy as np -import pandas as pd -from sklearn.metrics import mutual_info_score -from sklearn.preprocessing import OrdinalEncoder - - -class BigBraveBN: - def __init__(self): - self.possible_edges = [] - - def set_possible_edges_by_brave( - self, - df: pd.DataFrame, - n_nearest: int = 5, - threshold: float = 0.3, - proximity_metric: str = "MI", - ) -> list: - """Returns list of possible edges for structure learning and sets it into attribute - - Args: - df (pd.DataFrame): Data. - n_nearest (int): Number of nearest neighbors to consider. Default is 5. - threshold (float): Threshold for selecting edges. Default is 0.3. - proximity_metric (str): Metric used to calculate proximity. Default is "MI". - - Returns: - None: Modifies the object's possible_edges attribute. 
- """ - df_copy = df.copy(deep=True) - proximity_matrix = self._get_proximity_matrix(df_copy, proximity_metric) - brave_matrix = self._get_brave_matrix(df_copy.columns, proximity_matrix, n_nearest) - - threshold_value = brave_matrix.max(numeric_only=True).max() * threshold - filtered_brave_matrix = brave_matrix[brave_matrix > threshold_value].stack() - self.possible_edges = filtered_brave_matrix.index.tolist() - return self.possible_edges - - @staticmethod - def _get_n_nearest( - data: pd.DataFrame, columns: list, corr: bool = False, number_close: int = 5 - ) -> list: - """Returns N nearest neighbors for every column of dataframe.""" - groups = [] - for c in columns: - close_ind = data[c].sort_values(ascending=not corr).index.tolist() - groups.append(close_ind[: number_close + 1]) - return groups - - @staticmethod - def _get_proximity_matrix(df: pd.DataFrame, proximity_metric: str) -> pd.DataFrame: - """Returns matrix of proximity for the dataframe.""" - encoder = OrdinalEncoder() - df_coded = df.copy() - columns_to_encode = list(df_coded.select_dtypes(include=["category", "object"])) - df_coded[columns_to_encode] = encoder.fit_transform(df_coded[columns_to_encode]) - - if proximity_metric == "MI": - df_distance = pd.DataFrame( - np.zeros((len(df.columns), len(df.columns))), - columns=df.columns, - index=df.columns, - ) - for c1 in df.columns: - for c2 in df.columns: - dist = mutual_info_score(df_coded[c1].values, df_coded[c2].values) - df_distance.loc[c1, c2] = dist - return df_distance - - elif proximity_metric == "pearson": - return df_coded.corr(method="pearson") - - def _get_brave_matrix( - self, df_columns: pd.Index, proximity_matrix: pd.DataFrame, n_nearest: int = 5 - ) -> pd.DataFrame: - """Returns matrix of Brave coefficients for the DataFrame.""" - brave_matrix = pd.DataFrame( - np.zeros((len(df_columns), len(df_columns))), - columns=df_columns, - index=df_columns, - ) - groups = self._get_n_nearest( - proximity_matrix, df_columns.tolist(), corr=True, number_close=n_nearest - ) - - for c1 in df_columns: - for c2 in df_columns: - a = b = c = d = 0.0 - if c1 != c2: - for g in groups: - a += (c1 in g) & (c2 in g) - b += (c1 in g) & (c2 not in g) - c += (c1 not in g) & (c2 in g) - d += (c1 not in g) & (c2 not in g) - - divisor = (math.sqrt((a + c) * (b + d))) * ( - math.sqrt((a + b) * (c + d)) - ) - br = (a * len(groups) + (a + c) * (a + b)) / ( - divisor if divisor != 0 else 0.0000000001 - ) - brave_matrix.loc[c1, c2] = br - - return brave_matrix diff --git a/bamt/networks/composite_bn.py b/bamt/networks/composite_bn.py deleted file mode 100644 index e3739aa..0000000 --- a/bamt/networks/composite_bn.py +++ /dev/null @@ -1,114 +0,0 @@ -import re -from typing import Optional, Dict - -import pandas as pd - -from bamt.builders.composite_builder import CompositeStructureBuilder, CompositeDefiner -from bamt.log import logger_network -from bamt.networks.base import BaseNetwork -from bamt.utils.composite_utils.MLUtils import MlModels - - -class CompositeBN(BaseNetwork): - """ - Composite Bayesian Network with Machine Learning Models support - """ - - def __init__(self): - super(CompositeBN, self).__init__() - self._allowed_dtypes = ["cont", "disc", "disc_num"] - self.type = "Composite" - self.parent_models = {} - - def add_nodes(self, descriptor: Dict[str, Dict[str, str]]): - """ - Function for initializing nodes in Bayesian Network - descriptor: dict with types and signs of nodes - """ - self.descriptor = descriptor - - worker_1 = CompositeDefiner(descriptor=descriptor, regressor=None) - 
self.nodes = worker_1.vertices - - def add_edges( - self, - data: pd.DataFrame, - progress_bar: bool = True, - classifier: Optional[object] = None, - regressor: Optional[object] = None, - **kwargs, - ): - worker = CompositeStructureBuilder( - data=data, descriptor=self.descriptor, regressor=regressor - ) - - worker.build( - data=data, - classifier=classifier, - regressor=regressor, - progress_bar=progress_bar, - **kwargs, - ) - - # update family - self.nodes = worker.skeleton["V"] - self.edges = worker.skeleton["E"] - self.parent_models = worker.parent_models_dict - self.set_models(self.parent_models) - - def set_models(self, parent_models): - ml_models = MlModels() - ml_models_dict = ml_models.dict_models - for node in self.nodes: - if ( - type(node).__name__ == "CompositeDiscreteNode" - and parent_models[node.name] is not None - and len(node.cont_parents + node.disc_parents) > 0 - ): - self.set_classifiers( - {node.name: ml_models_dict[parent_models[node.name]]()} - ) - logger_network.info( - msg=f"{ml_models_dict[parent_models[node.name]]} classifier has been set for {node.name}" - ) - elif ( - type(node).__name__ == "CompositeContinuousNode" - and parent_models[node.name] is not None - and len(node.cont_parents + node.disc_parents) > 0 - ): - self.set_regressor( - {node.name: ml_models_dict[parent_models[node.name]]()} - ) - logger_network.info( - msg=f"{ml_models_dict[parent_models[node.name]]} regressor has been set for {node.name}" - ) - else: - pass - - def set_classifiers(self, classifiers: Dict[str, object]): - """ - Set classifiers for logit nodes. - classifiers: dict with node_name and Classifier - """ - for node in self.nodes: - if node.name in classifiers.keys(): - node.classifier = classifiers[node.name] - node.type = re.sub( - r"\([\s\S]*\)", f"({type(node.classifier).__name__})", node.type - ) - else: - continue - - def set_regressor(self, regressors: Dict[str, object]): - """ - Set regressor for gaussian nodes. 
- classifiers: dict with node_name and regressors - """ - for node in self.nodes: - if node.name in regressors.keys(): - node.regressor = regressors[node.name] - node.type = re.sub( - r"\([\s\S]*\)", f"({type(node.regressor).__name__})", node.type - ) - else: - continue diff --git a/bamt/networks/continuous_bn.py b/bamt/networks/continuous_bn.py deleted file mode 100644 index 2c2f327..0000000 --- a/bamt/networks/continuous_bn.py +++ /dev/null @@ -1,15 +0,0 @@ -from .base import BaseNetwork - - -class ContinuousBN(BaseNetwork): - """ - Bayesian Network with Continuous Types of Nodes - """ - - def __init__(self, use_mixture: bool = False): - super(ContinuousBN, self).__init__() - self.type = "Continuous" - self._allowed_dtypes = ["cont"] - self.has_logit = None - self.use_mixture = use_mixture - self.scoring_function = "" diff --git a/bamt/networks/discrete_bn.py b/bamt/networks/discrete_bn.py deleted file mode 100644 index 90a0d07..0000000 --- a/bamt/networks/discrete_bn.py +++ /dev/null @@ -1,15 +0,0 @@ -from .base import BaseNetwork - - -class DiscreteBN(BaseNetwork): - """ - Bayesian Network with Discrete Types of Nodes - """ - - def __init__(self): - super(DiscreteBN, self).__init__() - self.type = "Discrete" - self.scoring_function = "" - self._allowed_dtypes = ["disc", "disc_num"] - self.has_logit = None - self.use_mixture = None diff --git a/bamt/networks/hybrid_bn.py b/bamt/networks/hybrid_bn.py deleted file mode 100644 index 2942da8..0000000 --- a/bamt/networks/hybrid_bn.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import Dict - -from .base import BaseNetwork - - -class HybridBN(BaseNetwork): - """ - Bayesian Network with Mixed Types of Nodes - """ - - def __init__(self, has_logit: bool = False, use_mixture: bool = False): - super(HybridBN, self).__init__() - self._allowed_dtypes = ["cont", "disc", "disc_num"] - self.type = "Hybrid" - self.has_logit = has_logit - self.use_mixture = use_mixture - - def validate(self, descriptor: Dict[str, Dict[str, str]]) -> bool: - types = descriptor["types"] - s = set(types.values()) - return ( - True - if ({"cont", "disc", "disc_num"} == s) - or ({"cont", "disc"} == s) - or ({"cont", "disc_num"} == s) - else False - ) diff --git a/bamt/nodes/__init__.py b/bamt/nodes/__init__.py index f896bc3..e69de29 100644 --- a/bamt/nodes/__init__.py +++ b/bamt/nodes/__init__.py @@ -1,10 +0,0 @@ -__all__ = [ - "base", - "conditional_gaussian_node", - "conditional_logit_node", - "conditional_mixture_gaussian_node", - "discrete_node", - "logit_node", - "gaussian_node", - "mixture_gaussian_node", -] diff --git a/bamt/nodes/base.py b/bamt/nodes/base.py deleted file mode 100644 index f42b0a3..0000000 --- a/bamt/nodes/base.py +++ /dev/null @@ -1,59 +0,0 @@ -import pickle -from typing import Union - - -class BaseNode(object): - """ - Base class for nodes. 
- """ - - def __init__(self, name: str): - """ - :param name: name for node (taken from column name) - type: node type - disc_parents: list with discrete parents - cont_parents: list with continuous parents - children: node's children - """ - self.name = name - self.type = "abstract" - - self.disc_parents = [] - self.cont_parents = [] - self.children = [] - - def __repr__(self): - return f"{self.name}" - - def __eq__(self, other): - if not isinstance(other, BaseNode): - # don't attempt to compare against unrelated types - return NotImplemented - - return ( - self.name == other.name - and self.type == other.type - and self.disc_parents == other.disc_parents - and self.cont_parents == other.cont_parents - and self.children == other.children - ) - - @staticmethod - def choose_serialization(model) -> Union[str, Exception]: - try: - ex_b = pickle.dumps(model, protocol=4) - model_ser = ex_b.decode("latin1").replace("'", '"') - - if type(model).__name__ == "CatBoostRegressor": - a = model_ser.encode("latin1") - else: - a = model_ser.replace('"', "'").encode("latin1") - - classifier_body = pickle.loads(a) - return "pickle" - except Exception as ex: - return ex - - @staticmethod - def get_dist(node_info, pvals): - pass diff --git a/bamt/nodes/child_nodes/__init__.py b/bamt/nodes/child_nodes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bamt/nodes/child_nodes/child_node.py b/bamt/nodes/child_nodes/child_node.py new file mode 100644 index 0000000..d99d45d --- /dev/null +++ b/bamt/nodes/child_nodes/child_node.py @@ -0,0 +1,6 @@ +from bamt.nodes.node import Node + + +class ChildNode(Node): + def __init__(self): + super().__init__() diff --git a/bamt/nodes/child_nodes/conditional_continuous_node.py b/bamt/nodes/child_nodes/conditional_continuous_node.py new file mode 100644 index 0000000..171b4a8 --- /dev/null +++ b/bamt/nodes/child_nodes/conditional_continuous_node.py @@ -0,0 +1,7 @@ +from .child_node import ChildNode + + +class ConditionalContinuousNode(ChildNode): + def __init__(self): + super().__init__() + self._model = None diff --git a/bamt/nodes/child_nodes/conditional_discrete_node.py b/bamt/nodes/child_nodes/conditional_discrete_node.py new file mode 100644 index 0000000..0a01b00 --- /dev/null +++ b/bamt/nodes/child_nodes/conditional_discrete_node.py @@ -0,0 +1,7 @@ +from .child_node import ChildNode + + +class ConditionalDiscreteNode(ChildNode): + def __init__(self): + super().__init__() + self._model = None diff --git a/bamt/nodes/child_nodes/node_models/__init__.py b/bamt/nodes/child_nodes/node_models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bamt/nodes/composite_continuous_node.py b/bamt/nodes/composite_continuous_node.py deleted file mode 100644 index b446402..0000000 --- a/bamt/nodes/composite_continuous_node.py +++ /dev/null @@ -1,17 +0,0 @@ -from typing import Optional, Union - -from sklearn import linear_model - -from .gaussian_node import GaussianNode -from .schema import GaussianParams, HybcprobParams - -NodeInfo = Union[GaussianParams, HybcprobParams] - - -class CompositeContinuousNode(GaussianNode): - def __init__(self, name, regressor: Optional[object] = None): - super(CompositeContinuousNode, self).__init__(name) - if regressor is None: - regressor = linear_model.LinearRegression() - self.regressor = regressor - self.type = "CompositeContinuous" + f" ({type(self.regressor).__name__})" diff --git a/bamt/nodes/composite_discrete_node.py b/bamt/nodes/composite_discrete_node.py deleted file mode 100644 index 6d82db5..0000000 --- 
a/bamt/nodes/composite_discrete_node.py +++ /dev/null @@ -1,20 +0,0 @@ -from typing import Optional - -from sklearn import linear_model - -from .logit_node import LogitNode - - -class CompositeDiscreteNode(LogitNode): - """ - Class for composite discrete node. - """ - - def __init__(self, name, classifier: Optional[object] = None): - super(CompositeDiscreteNode, self).__init__(name) - if classifier is None: - classifier = linear_model.LogisticRegression( - multi_class="multinomial", solver="newton-cg", max_iter=100 - ) - self.classifier = classifier - self.type = "CompositeDiscrete" + f" ({type(self.classifier).__name__})" diff --git a/bamt/nodes/conditional_gaussian_node.py b/bamt/nodes/conditional_gaussian_node.py deleted file mode 100644 index 58801b6..0000000 --- a/bamt/nodes/conditional_gaussian_node.py +++ /dev/null @@ -1,173 +0,0 @@ -import itertools -import math -import random -from typing import Dict, Optional, List, Union - -import numpy as np -from pandas import DataFrame -from sklearn import linear_model -from sklearn.base import clone -from sklearn.metrics import mean_squared_error as mse - -from .base import BaseNode -from .schema import CondGaussParams - - -class ConditionalGaussianNode(BaseNode): - """ - Main class for Conditional Gaussian Node - """ - - def __init__(self, name, regressor: Optional[object] = None): - super(ConditionalGaussianNode, self).__init__(name) - if regressor is None: - regressor = linear_model.LinearRegression() - self.regressor = regressor - self.type = "ConditionalGaussian" + f" ({type(self.regressor).__name__})" - - def fit_parameters(self, data: DataFrame) -> Dict[str, Dict[str, CondGaussParams]]: - """ - Train params for Conditional Gaussian Node. - Return: - {"hybcprob": { : CondGaussParams}} - """ - hycprob = dict() - values = [] - combinations = [] - for d_p in self.disc_parents: - values.append(np.unique(data[d_p].values)) - for xs in itertools.product(*values): - combinations.append(list(xs)) - for comb in combinations: - mask = np.full(len(data), True) - for col, val in zip(self.disc_parents, comb): - mask = (mask) & (data[col] == val) - new_data = data[mask] - key_comb = [str(x) for x in comb] - if new_data.shape[0] > 0: - if self.cont_parents: - model = clone(self.regressor) - model.fit( - new_data[self.cont_parents].values, new_data[self.name].values - ) - predicted_value = model.predict(new_data[self.cont_parents].values) - variance = mse( - new_data[self.name].values, predicted_value, squared=False - ) - hycprob[str(key_comb)] = { - "variance": variance, - "mean": np.nan, - "regressor_obj": model, - "regressor": type(self.regressor).__name__, - "serialization": None, - } - else: - mean_base = np.mean(new_data[self.name].values) - variance = np.var(new_data[self.name].values) - hycprob[str(key_comb)] = { - "variance": variance, - "mean": mean_base, - "regressor_obj": None, - "regressor": None, - "serialization": None, - } - else: - hycprob[str(key_comb)] = { - "variance": np.nan, - "regressor": None, - "regressor_obj": None, - "serialization": None, - "mean": np.nan, - } - return {"hybcprob": hycprob} - - def get_dist(self, node_info, pvals): - dispvals = [] - lgpvals = [] - for pval in pvals: - if isinstance(pval, str): - dispvals.append(pval) - else: - lgpvals.append(pval) - - lgdistribution = node_info["hybcprob"][str(dispvals)] - - # JOBLIB - - if self.cont_parents: - flag = False - for el in lgpvals: - if str(el) == "nan": - flag = True - break - if flag: - return np.nan, np.nan - else: - if lgdistribution["regressor"]: - model = 
lgdistribution["regressor_obj"] - - cond_mean = model.predict(np.array(lgpvals).reshape(1, -1))[0] - variance = lgdistribution["variance"] - return cond_mean, variance - else: - return np.nan, np.nan - - else: - return lgdistribution["mean"], math.sqrt(lgdistribution["variance"]) - - def choose( - self, - node_info: Dict[str, Dict[str, CondGaussParams]], - pvals: List[Union[str, float]], - ) -> float: - """ - Return value from ConditionalLogit node - params: - node_info: nodes info from distributions - pvals: parent values - """ - - cond_mean, variance = self.get_dist(node_info, pvals) - if np.isnan(cond_mean) or np.isnan(variance): - return np.nan - - return random.gauss(cond_mean, variance) - - def predict( - self, - node_info: Dict[str, Dict[str, CondGaussParams]], - pvals: List[Union[str, float]], - ) -> float: - """ - Return value from ConditionalLogit node - params: - node_info: nodes info from distributions - pvals: parent values - """ - - dispvals = [] - lgpvals = [] - for pval in pvals: - if isinstance(pval, str): - dispvals.append(pval) - else: - lgpvals.append(pval) - - lgdistribution = node_info["hybcprob"][str(dispvals)] - - if self.cont_parents: - flag = False - for el in lgpvals: - if str(el) == "nan": - flag = True - break - if flag: - return np.nan - else: - if lgdistribution["regressor"]: - model = lgdistribution["regressor_obj"] - return model.predict(np.array(lgpvals).reshape(1, -1))[0] - else: - return np.nan - else: - return lgdistribution["mean"] diff --git a/bamt/nodes/conditional_logit_node.py b/bamt/nodes/conditional_logit_node.py deleted file mode 100644 index 08010c0..0000000 --- a/bamt/nodes/conditional_logit_node.py +++ /dev/null @@ -1,172 +0,0 @@ -import itertools -import random -from typing import Optional, List, Union, Dict - -import numpy as np -from pandas import DataFrame -from sklearn import linear_model -from sklearn.base import clone - -from .base import BaseNode -from .schema import LogitParams - - -class ConditionalLogitNode(BaseNode): - """ - Main class for Conditional Logit Node - """ - - def __init__(self, name: str, classifier: Optional[object] = None): - super(ConditionalLogitNode, self).__init__(name) - if classifier is None: - classifier = linear_model.LogisticRegression( - multi_class="multinomial", solver="newton-cg", max_iter=100 - ) - self.classifier = classifier - self.type = "ConditionalLogit" + f" ({type(self.classifier).__name__})" - - def fit_parameters(self, data: DataFrame) -> Dict[str, Dict[str, LogitParams]]: - """ - Train params on data - Return: - {"hybcprob": { : LogitParams}} - """ - hycprob = dict() - values = [] - combinations = [] - for d_p in self.disc_parents: - values.append(np.unique(data[d_p].values)) - for xs in itertools.product(*values): - combinations.append(list(xs)) - for comb in combinations: - mask = np.full(len(data), True) - for col, val in zip(self.disc_parents, comb): - mask = (mask) & (data[col] == val) - new_data = data[mask] - # mean_base = [np.nan] - classes = [np.nan] - key_comb = [str(x) for x in comb] - if new_data.shape[0] != 0: - model = clone(self.classifier) - values = set(new_data[self.name]) - if len(values) > 1: - model.fit( - X=new_data[self.cont_parents].values, - y=new_data[self.name].values, - ) - classes = list(model.classes_) - hycprob[str(key_comb)] = { - "classes": classes, - "classifier_obj": model, - "classifier": type(self.classifier).__name__, - "serialization": None, - } - else: - classes = list(values) - hycprob[str(key_comb)] = { - "classes": classes, - "classifier": 
type(self.classifier).__name__, - "classifier_obj": None, - "serialization": None, - } - - else: - hycprob[str(key_comb)] = { - "classes": list(classes), - "classifier": type(self.classifier).__name__, - "classifier_obj": None, - "serialization": None, - } - return {"hybcprob": hycprob} - - @staticmethod - def get_dist(node_info, pvals, **kwargs): - dispvals = [] - lgpvals = [] - for pval in pvals: - if isinstance(pval, str): - dispvals.append(pval) - else: - lgpvals.append(pval) - - if any(parent_value == "nan" for parent_value in dispvals): - return np.nan - - lgdistribution = node_info["hybcprob"][str(dispvals)] - - # JOBLIB - if len(lgdistribution["classes"]) > 1: - model = lgdistribution["classifier_obj"] - distribution = model.predict_proba(np.array(lgpvals).reshape(1, -1))[0] - - if not kwargs.get("inner", False): - return distribution - else: - return distribution, lgdistribution - else: - if not kwargs.get("inner", False): - return np.array([1.0]) - else: - return np.array([1.0]), lgdistribution - - def choose( - self, - node_info: Dict[str, Dict[str, LogitParams]], - pvals: List[Union[str, float]], - ) -> str: - """ - Return value from ConditionalLogit node - params: - node_info: nodes info from distributions - pvals: parent values - """ - - distribution, lgdistribution = self.get_dist(node_info, pvals, inner=True) - - # JOBLIB - if len(lgdistribution["classes"]) > 1: - rand = random.random() - rindex = 0 - lbound = 0 - ubound = 0 - for interval in range(len(lgdistribution["classes"])): - ubound += distribution[interval] - if lbound <= rand < ubound: - rindex = interval - break - else: - lbound = ubound - return str(lgdistribution["classes"][rindex]) - else: - return str(lgdistribution["classes"][0]) - - @staticmethod - def predict( - node_info: Dict[str, Dict[str, LogitParams]], pvals: List[Union[str, float]] - ) -> str: - """ - Return value from ConditionalLogit node - params: - node_info: nodes info from distributions - pvals: parent values - """ - - dispvals = [] - lgpvals = [] - for pval in pvals: - if isinstance(pval, str): - dispvals.append(pval) - else: - lgpvals.append(pval) - - lgdistribution = node_info["hybcprob"][str(dispvals)] - - # JOBLIB - if len(lgdistribution["classes"]) > 1: - model = lgdistribution["classifier_obj"] - pred = model.predict(np.array(lgpvals).reshape(1, -1))[0] - - return str(pred) - - else: - return str(lgdistribution["classes"][0]) diff --git a/bamt/nodes/conditional_mixture_gaussian_node.py b/bamt/nodes/conditional_mixture_gaussian_node.py deleted file mode 100644 index bdd3263..0000000 --- a/bamt/nodes/conditional_mixture_gaussian_node.py +++ /dev/null @@ -1,227 +0,0 @@ -import itertools -from typing import Union, List, Optional, Dict - -import numpy as np -from gmr import GMM -from pandas import DataFrame - -from bamt.utils.MathUtils import component -from .base import BaseNode -from .schema import CondMixtureGaussParams - - -class ConditionalMixtureGaussianNode(BaseNode): - """ - Main class for Conditional Mixture Gaussian Node - """ - - def __init__(self, name): - super(ConditionalMixtureGaussianNode, self).__init__(name) - self.type = "ConditionalMixtureGaussian" - - def fit_parameters( - self, data: DataFrame - ) -> Dict[str, Dict[str, CondMixtureGaussParams]]: - """ - Train params for Conditional Mixture Gaussian Node. 
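The conditional and mixture Gaussian nodes removed below lean on the gmr package: fit a GMM over (child, continuous parents), then condition on the parent dimensions to sample or predict the child. A minimal sketch of those gmr calls under the same column ordering (child first); the toy data, component count and parent value are illustrative:

    import numpy as np
    from gmr import GMM

    rng = np.random.default_rng(0)
    parent = rng.normal(size=(500, 1))
    child = 2.0 * parent + 0.3 * rng.normal(size=(500, 1))
    joint = np.hstack([child, parent])          # column 0 = child, column 1 = parent

    gmm = GMM(n_components=2).from_samples(joint, n_iter=500, init_params="kmeans++")

    # condition on the parent column (index 1) to get p(child | parent = 1.5)
    cond = gmm.condition([1], [[1.5]])
    draw = cond.sample(1)[0][0]                 # one sampled child value
    point = gmm.predict([1], [[1.5]])[0][0]     # point prediction of the child
    print(draw, point)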
- Return: - {"hybcprob": { : CondMixtureGaussParams}} - """ - hycprob = dict() - values = [] - combinations = [] - for d_p in self.disc_parents: - values.append(np.unique(data[d_p].values)) - for xs in itertools.product(*values): - combinations.append(list(xs)) - for comb in combinations: - mask = np.full(len(data), True) - for col, val in zip(self.disc_parents, comb): - mask = (mask) & (data[col] == val) - new_data = data[mask] - new_data.reset_index(inplace=True, drop=True) - key_comb = [str(x) for x in comb] - nodes = [self.name] + self.cont_parents - if new_data.shape[0] > 5: - if self.cont_parents: - # component(new_data, nodes, - # 'LRTS')#int((component(new_data, nodes, 'aic') + - # component(new_data, nodes, 'bic')) / 2) - n_comp = int( - ( - component(new_data, nodes, "aic") - + component(new_data, nodes, "bic") - ) - / 2 - ) - # n_comp = 3 - gmm = GMM(n_components=n_comp).from_samples( - new_data[nodes].values, n_iter=500, init_params="kmeans++" - ) - else: - # component(new_data, [node], - # 'LRTS')#int((component(new_data, [node], 'aic') + - # component(new_data, [node], 'bic')) / 2) - n_comp = int( - ( - component(new_data, [self.name], "aic") - + component(new_data, [self.name], "bic") - ) - / 2 - ) - # n_comp = 3 - gmm = GMM(n_components=n_comp).from_samples( - np.transpose([new_data[self.name].values]), - n_iter=500, - init_params="kmeans++", - ) - means = gmm.means.tolist() - cov = gmm.covariances.tolist() - # weigts = np.transpose(gmm.to_responsibilities(np.transpose([new_data[node].values]))) - w = gmm.priors.tolist() # [] - # for row in weigts: - # w.append(np.mean(row)) - hycprob[str(key_comb)] = {"covars": cov, "mean": means, "coef": w} - elif new_data.shape[0] != 0: - n_comp = 1 - gmm = GMM(n_components=n_comp) - if self.cont_parents: - gmm.from_samples(new_data[nodes].values) - else: - gmm.from_samples(np.transpose([new_data[self.name].values])) - means = gmm.means.tolist() - cov = gmm.covariances.tolist() - # weigts = np.transpose(gmm.to_responsibilities(np.transpose([new_data[node].values]))) - w = gmm.priors.tolist() # [] - # for row in weigts: - # w.append(np.mean(row)) - hycprob[str(key_comb)] = {"covars": cov, "mean": means, "coef": w} - else: - if self.cont_parents: - hycprob[str(key_comb)] = { - "covars": np.nan, - "mean": np.nan, - "coef": [], - } - else: - hycprob[str(key_comb)] = { - "covars": np.nan, - "mean": np.nan, - "coef": [], - } - return {"hybcprob": hycprob} - - @staticmethod - def get_dist(node_info, pvals): - lgpvals = [] - dispvals = [] - - for pval in pvals: - if (isinstance(pval, str)) | (isinstance(pval, int)): - dispvals.append(pval) - else: - lgpvals.append(pval) - - lgdistribution = node_info["hybcprob"][str(dispvals)] - mean = lgdistribution["mean"] - covariance = lgdistribution["covars"] - w = lgdistribution["coef"] - - if len(w) != 0: - if len(lgpvals) != 0: - indexes = [i for i in range(1, (len(lgpvals) + 1), 1)] - if not np.isnan(np.array(lgpvals)).all(): - n_comp = len(w) - gmm = GMM( - n_components=n_comp, - priors=w, - means=mean, - covariances=covariance, - ) - cond_gmm = gmm.condition(indexes, [lgpvals]) - return cond_gmm.means, cond_gmm.covariances, cond_gmm.priors - else: - return np.nan, np.nan, np.nan - else: - n_comp = len(w) - gmm = GMM( - n_components=n_comp, priors=w, means=mean, covariances=covariance - ) - return gmm.means, gmm.covariances, gmm.priors - else: - return np.nan, np.nan, np.nan - - def choose( - self, - node_info: Dict[str, Dict[str, CondMixtureGaussParams]], - pvals: List[Union[str, float]], - ) -> 
Optional[float]: - """ - Function to get value from ConditionalMixtureGaussian node - params: - node_info: nodes info from distributions - pvals: parent values - """ - mean, covariance, w = self.get_dist(node_info, pvals) - - # check if w is nan or list of weights - if not isinstance(w, np.ndarray): - return np.nan - - n_comp = len(w) - - gmm = GMM( - n_components=n_comp, - priors=w, - means=mean, - covariances=covariance, - ) - sample = gmm.sample(1)[0][0] - return sample - - @staticmethod - def predict( - node_info: Dict[str, Dict[str, CondMixtureGaussParams]], - pvals: List[Union[str, float]], - ) -> Optional[float]: - """ - Function to get prediction from ConditionalMixtureGaussian node - params: - node_info: nodes info from distributions - pvals: parent values - """ - - dispvals = [] - lgpvals = [] - for pval in pvals: - if (isinstance(pval, str)) | (isinstance(pval, int)): - dispvals.append(pval) - else: - lgpvals.append(pval) - lgdistribution = node_info["hybcprob"][str(dispvals)] - mean = lgdistribution["mean"] - covariance = lgdistribution["covars"] - w = lgdistribution["coef"] - if len(w) != 0: - if len(lgpvals) != 0: - indexes = [i for i in range(1, (len(lgpvals) + 1), 1)] - if not np.isnan(np.array(lgpvals)).all(): - n_comp = len(w) - gmm = GMM( - n_components=n_comp, - priors=w, - means=mean, - covariances=covariance, - ) - sample = gmm.predict(indexes, [lgpvals])[0][0] - else: - sample = np.nan - else: - # n_comp = len(w) - # gmm = GMM(n_components=n_comp, priors=w, means=mean, covariances=covariance) - sample = 0 - for ind, wi in enumerate(w): - sample += wi * mean[ind][0] - else: - sample = np.nan - return sample diff --git a/bamt/nodes/discrete_node.py b/bamt/nodes/discrete_node.py deleted file mode 100644 index 705f1ee..0000000 --- a/bamt/nodes/discrete_node.py +++ /dev/null @@ -1,113 +0,0 @@ -import random -from itertools import product -from typing import Type, Dict, Union, List - -import numpy as np -from pandas import DataFrame, crosstab - -from .base import BaseNode -from .schema import DiscreteParams - - -class DiscreteNode(BaseNode): - """ - Main class of Discrete Node - """ - - def __init__(self, name): - super(DiscreteNode, self).__init__(name) - self.type = "Discrete" - - def fit_parameters(self, data: DataFrame, num_workers: int = 1): - """ - Train params for Discrete Node - data: DataFrame to train on - num_workers: number of Parallel Workers - Method returns probas dict with the following format {[: value]} - and vals, list of appeared values in combinations - """ - - def worker(node: Type[BaseNode]) -> DiscreteParams: - parents = node.disc_parents + node.cont_parents - dist = data[node.name].value_counts(normalize=True).sort_index() - vals = [str(i) for i in dist.index.to_list()] - - if not parents: - cprob = dist.to_list() - else: - cprob = { - str([str(i) for i in comb]): [1 / len(vals) for _ in vals] - for comb in product(*[data[p].unique() for p in parents]) - } - - conditional_dist = crosstab( - data[node.name].to_list(), - [data[p] for p in parents], - normalize="columns", - ).T - tight_form = conditional_dist.to_dict("tight") - - for comb, probs in zip(tight_form["index"], tight_form["data"]): - if len(parents) > 1: - cprob[str([str(i) for i in comb])] = probs - else: - cprob[f"['{comb}']"] = probs - return {"cprob": cprob, "vals": vals} - - # pool = ThreadPoolExecutor(num_workers) - # future = pool.submit(worker, self) - result = worker(self) - return result - - @staticmethod - def get_dist(node_info, pvals): - if not pvals: - return 
node_info["cprob"] - else: - # noinspection PyTypeChecker - return node_info["cprob"][str(pvals)] - - def choose(self, node_info: Dict[str, Union[float, str]], pvals: List[str]) -> str: - """ - Return value from discrete node - params: - node_info: nodes info from distributions - pvals: parent values - """ - vals = node_info["vals"] - dist = np.array(self.get_dist(node_info, pvals)) - - cumulative_dist = np.cumsum(dist) - - rand = np.random.random() - rindex = np.searchsorted(cumulative_dist, rand) - - return vals[rindex] - - @staticmethod - def predict(node_info: Dict[str, Union[float, str]], pvals: List[str]) -> str: - """function for prediction based on evidence values in discrete node - - Args: - node_info (Dict[str, Union[float, str]]): parameters of node - pvals (List[str]): values in parents nodes - - Returns: - str: prediction - """ - - vals = node_info["vals"] - disct = [] - if not pvals: - dist = node_info["cprob"] - else: - # noinspection PyTypeChecker - dist = node_info["cprob"][str(pvals)] - max_value = max(dist) - indices = [index for index, value in enumerate(dist) if value == max_value] - max_ind = 0 - if len(indices) == 1: - max_ind = indices[0] - else: - max_ind = random.choice(indices) - return vals[max_ind] diff --git a/bamt/nodes/gaussian_node.py b/bamt/nodes/gaussian_node.py deleted file mode 100644 index 15144a8..0000000 --- a/bamt/nodes/gaussian_node.py +++ /dev/null @@ -1,96 +0,0 @@ -import math -import random -from typing import Optional, List - -import numpy as np -from pandas import DataFrame -from sklearn import linear_model -from sklearn.metrics import mean_squared_error as mse - -from .base import BaseNode -from .schema import GaussianParams - - -class GaussianNode(BaseNode): - """ - Main class for Gaussian Node - """ - - def __init__(self, name, regressor: Optional[object] = None): - super(GaussianNode, self).__init__(name) - if regressor is None: - regressor = linear_model.LinearRegression() - self.regressor = regressor - self.type = "Gaussian" + f" ({type(self.regressor).__name__})" - - def fit_parameters(self, data: DataFrame, **kwargs) -> GaussianParams: - parents = self.cont_parents - if type(self).__name__ == "CompositeContinuousNode": - parents = parents + self.disc_parents - if parents: - self.regressor.fit(data[parents].values, data[self.name].values, **kwargs) - predicted_value = self.regressor.predict(data[parents].values) - variance = mse(data[self.name].values, predicted_value, squared=False) - return { - "mean": np.nan, - "regressor_obj": self.regressor, - "regressor": type(self.regressor).__name__, - "variance": variance, - "serialization": None, - } - else: - mean_base = np.mean(data[self.name].values) - variance = np.var(data[self.name].values) - return { - "mean": mean_base, - "regressor_obj": None, - "regressor": None, - "variance": variance, - "serialization": None, - } - - def get_dist(self, node_info, pvals): - var = node_info["variance"] - if pvals: - for el in pvals: - if str(el) == "nan": - return np.nan - model = node_info["regressor_obj"] - - if type(self).__name__ == "CompositeContinuousNode": - pvals = [int(item) if isinstance(item, str) else item for item in pvals] - - cond_mean = model.predict(np.array(pvals).reshape(1, -1))[0] - return cond_mean, var - else: - return node_info["mean"], math.sqrt(var) - - def choose(self, node_info: GaussianParams, pvals: List[float]) -> float: - """ - Return value from Logit node - params: - node_info: nodes info from distributions - pvals: parent values - """ - - cond_mean, var = 
self.get_dist(node_info, pvals) - return random.gauss(cond_mean, var) - - @staticmethod - def predict(node_info: GaussianParams, pvals: List[float]) -> float: - """ - Return prediction from Logit node - params: - node_info: nodes info from distributions - pvals: parent values - """ - - if pvals: - for el in pvals: - if str(el) == "nan": - return np.nan - model = node_info["regressor_obj"] - pred = model.predict(np.array(pvals).reshape(1, -1))[0] - return pred - else: - return node_info["mean"] diff --git a/bamt/nodes/logit_node.py b/bamt/nodes/logit_node.py deleted file mode 100644 index 9b16004..0000000 --- a/bamt/nodes/logit_node.py +++ /dev/null @@ -1,91 +0,0 @@ -import random -from typing import Optional, List, Union - -import numpy as np -from pandas import DataFrame -from sklearn import linear_model - -from .base import BaseNode -from .schema import LogitParams - - -class LogitNode(BaseNode): - """ - Main class for logit node - """ - - def __init__(self, name, classifier: Optional[object] = None): - super(LogitNode, self).__init__(name) - if classifier is None: - classifier = linear_model.LogisticRegression( - multi_class="multinomial", solver="newton-cg", max_iter=100 - ) - self.classifier = classifier - self.type = "Logit" + f" ({type(self.classifier).__name__})" - - def fit_parameters(self, data: DataFrame, **kwargs) -> LogitParams: - parents = self.cont_parents + self.disc_parents - self.classifier.fit(X=data[parents].values, y=data[self.name].values, **kwargs) - - return { - "classes": list(self.classifier.classes_), - "classifier_obj": self.classifier, - "classifier": type(self.classifier).__name__, - "serialization": None, - } - - def get_dist(self, node_info, pvals): - if len(node_info["classes"]) > 1: - model = node_info["classifier_obj"] - if type(self).__name__ == "CompositeDiscreteNode": - pvals = [int(item) if isinstance(item, str) else item for item in pvals] - - return model.predict_proba(np.array(pvals).reshape(1, -1))[0] - else: - return np.array([1.0]) - - def choose(self, node_info: LogitParams, pvals: List[Union[float]]) -> str: - """ - Return value from Logit node - params: - node_info: nodes info from distributions - pvals: parent values - """ - - rindex = 0 - - distribution = self.get_dist(node_info, pvals) - - if len(node_info["classes"]) > 1: - rand = random.random() - lbound = 0 - ubound = 0 - for interval in range(len(node_info["classes"])): - ubound += distribution[interval] - if lbound <= rand < ubound: - rindex = interval - break - else: - lbound = ubound - - return str(node_info["classes"][rindex]) - else: - return str(node_info["classes"][0]) - - @staticmethod - def predict(node_info: LogitParams, pvals: List[Union[float]]) -> str: - """ - Return prediction from Logit node - params: - node_info: nodes info from distributions - pvals: parent values - """ - - if len(node_info["classes"]) > 1: - model = node_info["classifier_obj"] - pred = model.predict(np.array(pvals).reshape(1, -1))[0] - - return str(pred) - - else: - return str(node_info["classes"][0]) diff --git a/bamt/nodes/mixture_gaussian_node.py b/bamt/nodes/mixture_gaussian_node.py deleted file mode 100644 index 22cd2b3..0000000 --- a/bamt/nodes/mixture_gaussian_node.py +++ /dev/null @@ -1,154 +0,0 @@ -from typing import Union, List, Optional - -import numpy as np -from gmr import GMM -from pandas import DataFrame - -from bamt.utils.MathUtils import component -from .base import BaseNode -from .schema import MixtureGaussianParams - - -class MixtureGaussianNode(BaseNode): - """ - Main class for 
Mixture Gaussian Node - """ - - def __init__(self, name): - super(MixtureGaussianNode, self).__init__(name) - self.type = "MixtureGaussian" - - def fit_parameters(self, data: DataFrame) -> MixtureGaussianParams: - """ - Train params for Mixture Gaussian Node - """ - parents = self.disc_parents + self.cont_parents - if not parents: - n_comp = int( - ( - component(data, [self.name], "aic") - + component(data, [self.name], "bic") - ) - / 2 - ) # component(data, [node], 'LRTS')# - # n_comp = 3 - gmm = GMM(n_components=n_comp).from_samples( - np.transpose([data[self.name].values]), - n_iter=500, - init_params="kmeans++", - ) - means = gmm.means.tolist() - cov = gmm.covariances.tolist() - # weigts = np.transpose(gmm.to_responsibilities(np.transpose([data[node].values]))) - w = gmm.priors.tolist() # [] - # for row in weigts: - # w.append(np.mean(row)) - return {"mean": means, "coef": w, "covars": cov} - if parents: - if not self.disc_parents and self.cont_parents: - nodes = [self.name] + self.cont_parents - new_data = data[nodes] - new_data.reset_index(inplace=True, drop=True) - n_comp = int( - ( - component(new_data, nodes, "aic") - + component(new_data, nodes, "bic") - ) - / 2 - ) # component(new_data, nodes, 'LRTS')# - # n_comp = 3 - gmm = GMM(n_components=n_comp).from_samples( - new_data[nodes].values, n_iter=500, init_params="kmeans++" - ) - means = gmm.means.tolist() - cov = gmm.covariances.tolist() - # weigts = np.transpose(gmm.to_responsibilities(new_data[nodes].values)) - w = gmm.priors.tolist() # [] - # for row in weigts: - # w.append(np.mean(row)) - return {"mean": means, "coef": w, "covars": cov} - - @staticmethod - def get_dist(node_info, pvals): - mean = node_info["mean"] - covariance = node_info["covars"] - w = node_info["coef"] - n_comp = len(node_info["coef"]) - if n_comp != 0: - if pvals: - indexes = [i for i in range(1, len(pvals) + 1)] - if not np.isnan(np.array(pvals)).all(): - gmm = GMM( - n_components=n_comp, - priors=w, - means=mean, - covariances=covariance, - ) - cond_gmm = gmm.condition(indexes, [pvals]) - return cond_gmm.means, cond_gmm.covariances, cond_gmm.priors - else: - return np.nan, np.nan, np.nan - else: - gmm = GMM( - n_components=n_comp, priors=w, means=mean, covariances=covariance - ) - return gmm.means, gmm.covariances, gmm.priors - else: - return np.nan, np.nan, np.nan - - def choose( - self, node_info: MixtureGaussianParams, pvals: List[Union[str, float]] - ) -> Optional[float]: - """ - Func to get value from current node - node_info: nodes info from distributions - pvals: parent values - Return value from MixtureGaussian node - """ - mean, covariance, w = self.get_dist(node_info, pvals) - - n_comp = len(w) - - gmm = GMM( - n_components=n_comp, - priors=w, - means=mean, - covariances=covariance, - ) - return gmm.sample(1)[0][0] - - @staticmethod - def predict( - node_info: MixtureGaussianParams, pvals: List[Union[str, float]] - ) -> Optional[float]: - """ - Func to get prediction from current node - node_info: nodes info from distributions - pvals: parent values - Return value from MixtureGaussian node - """ - mean = node_info["mean"] - covariance = node_info["covars"] - w = node_info["coef"] - n_comp = len(node_info["coef"]) - if n_comp != 0: - if pvals: - indexes = [i for i in range(1, len(pvals) + 1)] - if not np.isnan(np.array(pvals)).all(): - gmm = GMM( - n_components=n_comp, - priors=w, - means=mean, - covariances=covariance, - ) - sample = gmm.predict(indexes, [pvals])[0][0] - else: - sample = np.nan - else: - # gmm = GMM(n_components=n_comp, 
priors=w, means=mean, covariances=covariance) - sample = 0 - for ind, wi in enumerate(w): - sample += wi * mean[ind][0] - else: - sample = np.nan - return sample diff --git a/bamt/nodes/node.py b/bamt/nodes/node.py new file mode 100644 index 0000000..6dc4140 --- /dev/null +++ b/bamt/nodes/node.py @@ -0,0 +1,6 @@ +from abc import ABC + + +class Node(ABC): + def __init__(self): + pass diff --git a/bamt/nodes/root_nodes/__init__.py b/bamt/nodes/root_nodes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bamt/nodes/root_nodes/continuous_node.py b/bamt/nodes/root_nodes/continuous_node.py new file mode 100644 index 0000000..67e551a --- /dev/null +++ b/bamt/nodes/root_nodes/continuous_node.py @@ -0,0 +1,6 @@ +from .root_node import RootNode + + +class ContinuousNode(RootNode): + def __init__(self): + super().__init__() diff --git a/bamt/nodes/root_nodes/discrete_node.py b/bamt/nodes/root_nodes/discrete_node.py new file mode 100644 index 0000000..e69de29 diff --git a/bamt/nodes/root_nodes/root_node.py b/bamt/nodes/root_nodes/root_node.py new file mode 100644 index 0000000..030e05a --- /dev/null +++ b/bamt/nodes/root_nodes/root_node.py @@ -0,0 +1,6 @@ +from bamt.nodes.node import Node + + +class RootNode(Node): + def __init__(self): + super().__init__() diff --git a/bamt/nodes/schema.py b/bamt/nodes/schema.py deleted file mode 100644 index 2db2642..0000000 --- a/bamt/nodes/schema.py +++ /dev/null @@ -1,47 +0,0 @@ -from typing import Dict, List, Any, Union, TypedDict, Optional - -from numpy import ndarray - - -class DiscreteParams(TypedDict): - cprob: Union[List[Union[list, Any]], Dict[str, list]] - vals: List[str] - - -class MixtureGaussianParams(TypedDict): - mean: List[float] - coef: List[float] - covars: List[float] - - -class GaussianParams(TypedDict): - regressor: str - regressor_obj: Optional[object] - variance: Union[ndarray, float] - mean: Union[ndarray, float] - serialization: Optional[str] - - -class CondGaussParams(TypedDict): - regressor: str - regressor_obj: Optional[object] - variance: Union[ndarray, float] - mean: Union[ndarray, float] - serialization: Optional[str] - - -class CondMixtureGaussParams(TypedDict): - mean: Optional[List[float]] - coef: List[float] - covars: Optional[List[float]] - - -class LogitParams(TypedDict): - classes: List[int] - classifier: str - classifier_obj: Optional[object] - serialization: Optional[str] - - -class HybcprobParams(TypedDict): - hybcprob: Dict[str, CondGaussParams] diff --git a/bamt/parameter_estimators/__init__.py b/bamt/parameter_estimators/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bamt/parameter_estimators/maximum_likelihood_estimator.py b/bamt/parameter_estimators/maximum_likelihood_estimator.py new file mode 100644 index 0000000..63783b5 --- /dev/null +++ b/bamt/parameter_estimators/maximum_likelihood_estimator.py @@ -0,0 +1,9 @@ +from .parameters_estimator import ParametersEstimator + + +class MaximumLikelihoodEstimator(ParametersEstimator): + def __init__(self): + pass + + def estimate(self): + pass diff --git a/bamt/parameter_estimators/parameters_estimator.py b/bamt/parameter_estimators/parameters_estimator.py new file mode 100644 index 0000000..b93734c --- /dev/null +++ b/bamt/parameter_estimators/parameters_estimator.py @@ -0,0 +1,10 @@ +from abc import ABC, abstractmethod + + +class ParametersEstimator(ABC): + def __init__(self): + pass + + @abstractmethod + def estimate(self): + pass diff --git a/bamt/preprocess/discretization.py b/bamt/preprocess/discretization.py deleted file mode 
100644 index 9a36f86..0000000 --- a/bamt/preprocess/discretization.py +++ /dev/null @@ -1,165 +0,0 @@ -from copy import copy -from typing import Tuple - -import pandas as pd -from sklearn import preprocessing -from sklearn.preprocessing import KBinsDiscretizer - - -def get_nodes_sign(data: pd.DataFrame) -> dict: - """Function to define sign of the node - neg - if node has negative values - pos - if node has only positive values - - Args: - data (pd.DataFrame): input dataset - - Returns: - dict: output dictionary where 'key' - node name and 'value' - sign of data - """ - nodes_types = get_nodes_type(data) - columns_sign = dict() - for c in data.columns.to_list(): - if nodes_types[c] == "cont": - if (data[c] < 0).any(): - columns_sign[c] = "neg" - else: - columns_sign[c] = "pos" - return columns_sign - - -def get_nodes_type(data: pd.DataFrame) -> dict: - """Function to define the type of the node - disc - discrete node - cont - continuous - Args: - data (pd.DataFrame): input dataset - - Returns: - dict: output dictionary where 'key' - node name and 'value' - node type - """ - column_type = dict() - for c in data.columns.to_list(): - if (data[c].dtypes == "float64") | (data[c].dtypes == "float32"): - column_type[c] = "cont" - if ( - (data[c].dtypes == "str") - | (data[c].dtypes == "O") - | (data[c].dtypes == "b") - ): - column_type[c] = "disc" - if (data[c].dtypes == "int64") | (data[c].dtypes == "int32"): - column_type[c] = "disc" - return column_type - - -def discretization( - data: pd.DataFrame, method: str, columns: list, bins: int = 5 -) -> Tuple[pd.DataFrame, KBinsDiscretizer]: - """Discretization of continuous parameters - - Args: - data (pd.DataFrame): input dataset - method (str): discretization approach (equal_intervals, equal_frequency, kmeans) - columns (list): name of columns for discretization - bins (int, optional): number of bins. Defaults to 5. 
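        As a rough illustration of how the method names map onto sklearn
        strategies (see strategy_dict just below: "equal_intervals" -> "uniform",
        "equal_frequency" -> "quantile", "kmeans" -> "kmeans"), a minimal sketch
        on a toy single-column frame; the column name "x" is assumed purely for
        illustration:

            import pandas as pd
            from sklearn.preprocessing import KBinsDiscretizer

            df = pd.DataFrame({"x": [0.1, 0.4, 0.9, 1.7, 2.2, 3.5]})
            est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy="quantile")
            df[["x"]] = est.fit_transform(df[["x"]].values).astype("int")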
- - Returns: - pd.DataFrame: output dataset with discretized parameters - KBinsDiscretizer: fitted exemplar of discretization class - """ - data = data.dropna() - data.reset_index(inplace=True, drop=True) - d_data = copy(data) - est = KBinsDiscretizer(n_bins=bins, encode="ordinal") - strategy_dict = { - "equal_intervals": "uniform", - "equal_frequency": "quantile", - "kmeans": "kmeans", - } - if method in strategy_dict: - est.strategy = strategy_dict[method] - data_discrete = est.fit_transform(d_data.loc[:, columns].values) - d_data[columns] = data_discrete.astype("int") - else: - raise Exception("This discretization method is not supported") - - return d_data, est - - -def label_encoding(data, columns): - d_data = copy(data) - encoder_dict = dict() - for column in columns: - le = preprocessing.LabelEncoder() - d_data[column] = le.fit_transform(d_data[column].values) - mapping = dict(zip(le.classes_, range(len(le.classes_)))) - encoder_dict[column] = mapping - return d_data, encoder_dict - - -def onehot_encoding(data, columns): - d_data = pd.get_dummies(data, columns=columns) - return d_data, None - - -def code_categories( - data: pd.DataFrame, method: str, columns: list -) -> Tuple[pd.DataFrame, dict]: - """Encoding categorical parameters - - Args: - data (pd.DataFrame): input dataset - method (str): method of encoding (label or onehot) - columns (list): name of categorical columns - - Returns: - pd.DataFrame: output dataset with encoded parameters - dict: dictionary with values and codes - """ - data = data.dropna() - data.reset_index(inplace=True, drop=True) - encoding_func_dict = {"label": label_encoding, "onehot": onehot_encoding} - if method in encoding_func_dict: - d_data, encoder_dict = encoding_func_dict[method](data, columns) - else: - raise Exception("This encoding method is not supported") - - return d_data, encoder_dict - - -def inverse_discretization( - data: pd.DataFrame, columns: list, discretizer: KBinsDiscretizer -) -> pd.DataFrame: - """Inverse discretization for numeric params - - Args: - data (pd.DataFrame): input dataset with discrete values - columns (list): colums for inverse_discretization - discretizer (KBinsDiscretizer): fitted exemplar of discretization class - - Returns: - pd.DataFrame: output dataset with continuous values - """ - new_data = copy(data) - new_data[columns] = discretizer.inverse_transform(new_data[columns].values) - - return new_data - - -def decode(data: pd.DataFrame, columns: list, encoder_dict: dict) -> pd.DataFrame: - """Decoding categorical params to initial labels - - Args: - data (pd.DataFrame): input dataset with encoded params - columns (list): columns for decoding - encoder_dict (dict): dictionary with values and codes - Returns: - pd.DataFrame: output dataset with decoded params - """ - for column in columns: - dict_parameter = encoder_dict[column] - inv_map = {v: k for k, v in dict_parameter.items()} - data[column] = data[column].apply(lambda x: inv_map(x)) - - return data diff --git a/bamt/preprocess/graph.py b/bamt/preprocess/graph.py deleted file mode 100644 index 87e688a..0000000 --- a/bamt/preprocess/graph.py +++ /dev/null @@ -1,42 +0,0 @@ -from collections import defaultdict - - -def nodes_from_edges(edges: list): - """ - Retrieves all nodes from the list of edges. - Arguments - ---------- - *edges* : list - - Returns - ------- - *nodes* : list - - Effects - ------- - None - """ - return set().union(edges) - - -def edges_to_dict(edges: list): - """ - Transfers the list of edges to the dictionary of parents. 
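    For intuition, the parent mapping this aims to produce can be sketched
    directly with a defaultdict; the toy edge list is assumed only for
    illustration:

        from collections import defaultdict

        edges = [("A", "C"), ("B", "C")]   # toy DAG: A -> C, B -> C
        parents = defaultdict(list)
        for parent, child in edges:
            parents[child].append(parent)
        # dict(parents) == {"C": ["A", "B"]}; nodes without parents
        # default to an empty list on lookup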
- Arguments - ---------- - *edges* : list - - Returns - ------- - *parents_dict* : dict - - Effects - ------- - None - """ - nodes = nodes_from_edges(edges) - parents_dict = defaultdict(list, {node: [] for node in nodes}) - parents_dict.update( - {child: parents_dict[child] + [parent] for parent, child in edges} - ) - return parents_dict diff --git a/bamt/preprocess/numpy_pandas.py b/bamt/preprocess/numpy_pandas.py deleted file mode 100644 index 32242df..0000000 --- a/bamt/preprocess/numpy_pandas.py +++ /dev/null @@ -1,57 +0,0 @@ -# currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) -# parentdir = os.path.dirname(currentdir) -# sys.path.insert(0,parentdir) -import numpy as np -import pandas as pd - - -def loc_to_DataFrame(data: np.array): - """Function to convert array to DataFrame - Args: - data (np.array): input array - - Returns: - data (pd.DataFrame): with string columns for filtering - """ - nodes_type = get_type_numpy(data) - if data.T.ndim == 1: - data = data.T - nodes_type = {0: nodes_type[0]} - dtype = { - key: "int64" if value == "disc" else "float64" - for key, value in nodes_type.items() - if value in ["disc", "cont"] - } - df = pd.DataFrame(data).astype(dtype) - df.columns = df.columns.map(str) - return df - - -def get_type_numpy(data: np.array): - """Function to define the type of the columns of array - disc - discrete node - cont - continuous - Args: - data (np.array): input array - - Returns: - dict: output dictionary where 'key' - node name and 'value' - node type - Notes: - -- You may have problems with confusing rows and columns - """ - arr = data.T - - column_type = {} - for i, row in enumerate(arr): - if row.ndim == 0 or row.T.ndim == 0: - row_is_integer = np.issubdtype(row, np.integer) or row.is_integer() - column_type[i] = "disc" if row_is_integer else "cont" - else: - all_row_is_integer = all( - np.issubdtype(x, np.integer) or x.is_integer() for x in row - ) - column_type[i] = "disc" if all_row_is_integer else "cont" - if column_type[i] not in ["disc", "cont"]: - print("get_type_numpy: Incorrect type of row") - print(row) - return column_type diff --git a/bamt/preprocessors.py b/bamt/preprocessors.py deleted file mode 100644 index 847ec9c..0000000 --- a/bamt/preprocessors.py +++ /dev/null @@ -1,126 +0,0 @@ -from typing import Tuple, Dict - -from pandas import DataFrame - -from bamt.log import logger_preprocessor -from bamt.utils import GraphUtils as gru - - -class BasePreprocessor(object): - """ - Base for Preprocessor - """ - - def __init__(self): - self.nodes_signs = {} - self.nodes_types = {} - - @staticmethod - def get_nodes_types(data): - return gru.nodes_types(data=data) - - def get_nodes_signs(self, data): - return gru.nodes_signs(nodes_types=self.nodes_types, data=data) - - def code_categories( - self, data: DataFrame, encoder - ) -> Tuple[DataFrame, Dict[str, Dict]]: - """Encoding categorical parameters - - Args: - data (DataFrame): input dataset - encoder: any object with fit_transform method - - Returns: - pd.DataFrame: output dataset with encoded parameters - dict: dictionary with values and codes - """ - columns = [ - col for col in data.columns.to_list() if self.nodes_types[col] == "disc" - ] - df = data.copy() # INPUT DF. Debugging SettingWithCopyWarning - if not columns: - return df, None - data = df[columns] # DATA TO CATEGORIZE - encoder_dict = dict() - - for col_name, column in data.items(): - # Iterate over (column name, Series) pairs. 
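                # A standalone sketch of this per-column encoding step, assuming
                # sklearn's LabelEncoder (the method itself reuses whatever
                # encoder object it is given); column names are illustrative only:
                #
                #     import pandas as pd
                #     from sklearn.preprocessing import LabelEncoder
                #
                #     toy = pd.DataFrame({"color": ["red", "blue", "red"],
                #                         "size": ["S", "M", "S"]})
                #     mappings = {}
                #     for col, series in toy.items():
                #         le = LabelEncoder()
                #         toy[col] = le.fit_transform(series.values)
                #         mappings[col] = dict(zip(le.classes_, range(len(le.classes_))))
                #     # mappings, e.g. {"color": {"blue": 0, "red": 1}, ...}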
- try: - df[col_name] = encoder.fit_transform(column.values) - except TypeError as exc: - logger_preprocessor.error( - f"Wrond data types on {col_name} ({df[col_name].dtypes}). Message: {exc}" - ) - try: - mapping = dict(zip(encoder.classes_, range(len(encoder.classes_)))) - encoder_dict[col_name] = mapping - except BaseException: - pass - return df, encoder_dict - - def discretize(self, data: DataFrame, discretizer) -> tuple: - columns = [ - col for col in data.columns.to_list() if self.nodes_types[col] == "cont" - ] - df = data.copy() - if not columns: - return df, None - data = df[columns] - - data_discrete = discretizer.fit_transform(data.values) - df[columns] = data_discrete.astype("int") - - return df, discretizer - - def decode(self): - pass - - -class Preprocessor(BasePreprocessor): - def __init__(self, pipeline: list): - super().__init__() - assert isinstance(pipeline, list), "pipeline must be list" - self.pipeline = pipeline - self.coder = None - - @property - def info(self): - return {"types": self.nodes_types, "signs": self.nodes_signs} - - def scan(self, data: DataFrame): - """ - Function to scan data. If something is wrong, it will be send to log file - """ - columns_cont = [ - col for col in data.columns.to_list() if self.nodes_types[col] == "cont" - ] - if not columns_cont: - logger_preprocessor.info("No one column is continuous") - - columns_disc = [ - col - for col in data.columns.to_list() - if self.nodes_types[col] in ["disc", "disc_num"] - ] - if not columns_disc: - logger_preprocessor.info("No one column is discrete") - - def apply(self, data: DataFrame) -> Tuple[DataFrame, Dict]: - """ - Apply pipeline - data: data to apply on - """ - df = data.copy() - self.nodes_types = self.get_nodes_types(data) - if list(self.nodes_types.keys()) != data.columns.to_list(): - logger_preprocessor.error("Nodes_types dictionary are not full.") - return None, None - self.nodes_signs = self.get_nodes_signs(data) - self.scan(df) - for name, instrument in self.pipeline: - if name == "encoder": - df, self.coder = self.code_categories(data=data, encoder=instrument) - if name == "discretizer": - df, est = self.discretize(data=df, discretizer=instrument) - return df, self.coder diff --git a/bamt/redef_HC.py b/bamt/redef_HC.py deleted file mode 100644 index bab3119..0000000 --- a/bamt/redef_HC.py +++ /dev/null @@ -1,316 +0,0 @@ -""" -********************** -Greedy Hill-Climbing -for Structure Learning -********************** - -Code for Searching through the space of -possible Bayesian Network structures. - -Various optimization procedures are employed, -from greedy search to simulated annealing, and -so on - mostly using scipy.optimize. - -Local search - possible moves: -- Add edge -- Delete edge -- Invert edge - -Strategies to improve Greedy Hill-Climbing: -- Random Restarts - - when we get stuck, take some number of - random steps and then start climbing again. -- Tabu List - - keep a list of the K steps most recently taken, - and say that the search cannt reverse (undo) any - of these steps. 
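A tabu list of this kind is mentioned here only as a possible improvement (the
hc routine below does not itself maintain one); a minimal sketch, where the
(operation, edge) move encoding is an illustrative assumption:

    from collections import deque

    K = 10
    tabu = deque(maxlen=K)                 # remembers the K most recent moves

    def allowed(move):
        # a move is blocked while its reversal is still on the tabu list
        op, edge = move
        reverse = {"add": "del", "del": "add", "rev": "rev"}[op]
        return (reverse, edge) not in tabu

    move = ("add", ("A", "B"))
    if allowed(move):
        tabu.append(move)                  # so it cannot be undone for K steps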
-""" - -from bamt.external.pyBN.classes.bayesnet import BayesNet -from bamt.external.pyBN.utils.graph import would_cause_cycle -from bamt.mi_entropy_gauss import mi_gauss -from bamt.redef_info_scores import log_lik_local, BIC_local, AIC_local - - -def hc( - data, - metric="MI", - max_iter=200, - debug=False, - init_nodes=None, - restriction=None, - init_edges=None, - remove_geo_edges=True, - black_list=None, -): - """ - Greedy Hill Climbing search proceeds by choosing the move - which maximizes the increase in fitness of the - network at the current step. It continues until - it reaches a point where there does not exist any - feasible single move that increases the network fitness. - - It is called "greedy" because it simply does what is - best at the current iteration only, and thus does not - look ahead to what may be better later on in the search. - - For computational saving, a Priority Queue (python's heapq) - can be used to maintain the best operators and reduce the - complexity of picking the best operator from O(n^2) to O(nlogn). - This works by maintaining the heapq of operators sorted by their - delta score, and each time a move is made, we only have to recompute - the O(n) delta-scores which were affected by the move. The rest of - the operator delta-scores are not affected. - - For additional computational efficiency, we can cache the - sufficient statistics for various families of distributions - - therefore, computing the mutual information for a given family - only needs to happen once. - - The possible moves are the following: - - add edge - - delete edge - - invert edge - - Arguments - --------- - *data* : pd.DataFrame - The data from which the Bayesian network - structure will be learned. - - *metric* : a string - Which score metric to use. - Options: - - AIC - - BIC / MDL - - LL (log-likelihood) - - *max_iter* : an integer - The maximum number of iterations of the - hill-climbing algorithm to run. Note that - the algorithm will terminate on its own if no - improvement is made in a given iteration. - - *debug* : boolean - Whether to print(the scores/moves of the) - algorithm as its happening. - - *init_nodes* : a list of initialize nodes (number of nodes according to the dataset) - - *restriction* : a list of 2-tuples - For MMHC algorithm, the list of allowable edge additions. 
- - Returns - ------- - *bn* : a BayesNet object - - """ - nrow = data.shape[0] - ncol = data.shape[1] - - names = range(ncol) - - # INITIALIZE NETWORK W/ NO EDGES - # maintain children and parents dict for fast lookups - c_dict = dict([(n, []) for n in names]) - p_dict = dict([(n, []) for n in names]) - if init_edges: - for edge in init_edges: - c_dict[edge[0]].append(edge[1]) - p_dict[edge[1]].append(edge[0]) - - bn = BayesNet(c_dict) - - mutual_information = mi_gauss - if metric == "BIC": - mutual_information = BIC_local - if metric == "AIC": - mutual_information = AIC_local - if metric == "LL": - mutual_information = log_lik_local - - data = data.values - - cache = dict() - - _iter = 0 - improvement = True - - while improvement: - improvement = False - max_delta = 0 - - if debug: - print("ITERATION: ", _iter) - - ### TEST ARC ADDITIONS ### - for u in bn.nodes(): - for v in bn.nodes(): - if ( - v not in c_dict[u] - and u != v - and not would_cause_cycle(c_dict, u, v) - and len(p_dict[v]) != 3 - ): - # FOR MMHC ALGORITHM -> Edge Restrictions - if ( - (init_nodes is None or not (v in init_nodes)) - and (restriction is None or (u, v) in restriction) - and (black_list is None or not ((u, v) in black_list)) - ): - # SCORE FOR 'V' -> gaining a parent - # without 'u' as parent - old_cols = (v,) + tuple(p_dict[v]) - if old_cols not in cache: - cache[old_cols] = mutual_information(data[:, old_cols]) - mi_old = cache[old_cols] - new_cols = old_cols + (u,) # with'u' as parent - if new_cols not in cache: - cache[new_cols] = mutual_information(data[:, new_cols]) - mi_new = cache[new_cols] - delta_score = nrow * (mi_old - mi_new) - - if delta_score > max_delta: - if debug: - print("Improved Arc Addition: ", (u, v)) - print("Delta Score: ", delta_score) - max_delta = delta_score - max_operation = "Addition" - max_arc = (u, v) - - # ### TEST ARC DELETIONS ### - for u in bn.nodes(): - for v in bn.nodes(): - if v in c_dict[u]: - # SCORE FOR 'V' -> losing a parent - old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent - if old_cols not in cache: - cache[old_cols] = mutual_information(data[:, old_cols]) - mi_old = cache[old_cols] - new_cols = tuple([i for i in old_cols if i != u]) - if new_cols not in cache: - cache[new_cols] = mutual_information(data[:, new_cols]) - mi_new = cache[new_cols] - delta_score = nrow * (mi_old - mi_new) - - if delta_score > max_delta: - if init_edges is None: - if debug: - print("Improved Arc Deletion: ", (u, v)) - print("Delta Score: ", delta_score) - max_delta = delta_score - max_operation = "Deletion" - max_arc = (u, v) - else: - if (u, v) in init_edges: - if remove_geo_edges: - if debug: - print("Improved Arc Deletion: ", (u, v)) - print("Delta Score: ", delta_score) - max_delta = delta_score - max_operation = "Deletion" - max_arc = (u, v) - else: - if debug: - print("Improved Arc Deletion: ", (u, v)) - print("Delta Score: ", delta_score) - max_delta = delta_score - max_operation = "Deletion" - max_arc = (u, v) - - # ### TEST ARC REVERSALS ### - for u in bn.nodes(): - for v in bn.nodes(): - if ( - v in c_dict[u] - and not would_cause_cycle(c_dict, v, u, reverse=True) - and len(p_dict[u]) != 3 - and (init_nodes is None or not (u in init_nodes)) - and (restriction is None or (v, u) in restriction) - and (black_list is None or not ((v, u) in black_list)) - ): - old_cols = (u,) + tuple(p_dict[v]) # without 'v' as parent - if old_cols not in cache: - cache[old_cols] = mutual_information(data[:, old_cols]) - mi_old = cache[old_cols] - new_cols = old_cols + (v,) # with 'v' as 
parent - if new_cols not in cache: - cache[new_cols] = mutual_information(data[:, new_cols]) - mi_new = cache[new_cols] - delta1 = -1 * nrow * (mi_old - mi_new) - # SCORE FOR 'V' -> losing 'u' as parent - old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent - if old_cols not in cache: - cache[old_cols] = mutual_information(data[:, old_cols]) - mi_old = cache[old_cols] - # without 'u' as parent - new_cols = tuple([u for i in old_cols if i != u]) - if new_cols not in cache: - cache[new_cols] = mutual_information(data[:, new_cols]) - mi_new = cache[new_cols] - delta2 = nrow * (mi_old - mi_new) - # COMBINED DELTA-SCORES - delta_score = delta1 + delta2 - - if delta_score > max_delta: - if init_edges is None: - if debug: - print("Improved Arc Reversal: ", (u, v)) - print("Delta Score: ", delta_score) - max_delta = delta_score - max_operation = "Reversal" - max_arc = (u, v) - else: - if (u, v) in init_edges: - if remove_geo_edges: - if debug: - print("Improved Arc Reversal: ", (u, v)) - print("Delta Score: ", delta_score) - max_delta = delta_score - max_operation = "Reversal" - max_arc = (u, v) - else: - if debug: - print("Improved Arc Reversal: ", (u, v)) - print("Delta Score: ", delta_score) - max_delta = delta_score - max_operation = "Reversal" - max_arc = (u, v) - - if max_delta != 0: - improvement = True - u, v = max_arc - if max_operation == "Addition": - if debug: - print("ADDING: ", max_arc, "\n") - c_dict[u].append(v) - p_dict[v].append(u) - - elif max_operation == "Deletion": - if debug: - print("DELETING: ", max_arc, "\n") - c_dict[u].remove(v) - p_dict[v].remove(u) - - elif max_operation == "Reversal": - if debug: - print("REVERSING: ", max_arc, "\n") - c_dict[u].remove(v) - p_dict[v].remove(u) - c_dict[v].append(u) - p_dict[u].append(v) - - else: - if debug: - print("No Improvement on Iter: ", _iter) - - ### TEST FOR MAX ITERATION ### - _iter += 1 - if _iter > max_iter: - if debug: - print("Max Iteration Reached") - break - - bn = BayesNet(c_dict) - - return bn diff --git a/bamt/redef_info_scores.py b/bamt/redef_info_scores.py deleted file mode 100644 index 0890550..0000000 --- a/bamt/redef_info_scores.py +++ /dev/null @@ -1,179 +0,0 @@ -import sys -import warnings -from copy import copy - -import numpy as np -import pandas as pd - -from bamt.mi_entropy_gauss import mi_gauss as mutual_information, entropy_all as entropy -from bamt.preprocess.graph import edges_to_dict -from bamt.preprocess.numpy_pandas import get_type_numpy - - -def info_score(edges: list, data: pd.DataFrame, method="LL"): - score_funcs = {"LL": log_lik_local, "BIC": BIC_local, "AIC": AIC_local} - score = score_funcs.get(method.upper(), BIC_local) - - parents_dict = edges_to_dict(edges) - nodes_with_edges = parents_dict.keys() - scores = [ - score(data[child_parents].copy(), method) - for var in nodes_with_edges - for child_parents in ([var] + parents_dict[var],) - ] - scores += [ - score(data[[var]].copy(), method) - for var in set(data.columns).difference(set(nodes_with_edges)) - ] - return sum(scores) - - -##### INFORMATION-THEORETIC SCORING FUNCTIONS ##### - - -def log_likelihood(bn, data, method="LL"): - """ - Determining log-likelihood of the parameters - of a Bayesian Network. This is a quite simple - score/calculation, but it is useful as a straight-forward - structure learning score. 
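    On empirical counts the per-family contribution reduces to
    N * ( I(X; parents(X)) - H(X) ); a toy sketch of that quantity for a single
    edge A -> B, using standard scipy/sklearn estimators purely for illustration
    (the library's own mi_gauss / entropy_all handle the mixed-data case):

        import pandas as pd
        from scipy.stats import entropy
        from sklearn.metrics import mutual_info_score

        df = pd.DataFrame({"A": [0, 0, 1, 1, 1, 0], "B": [0, 0, 1, 1, 0, 0]})
        N = len(df)

        mi = mutual_info_score(df["B"], df["A"])             # empirical I(B; A), nats
        h_b = entropy(df["B"].value_counts(normalize=True))  # empirical H(B), nats
        ll_family = N * (mi - h_b)                           # local score of {B | A}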
- - Semantically, this can be considered as the evaluation - of the log-likelihood of the data, given the structure - and parameters of the BN: - - log( P( D | Theta_G, G ) ) - where Theta_G are the parameters and G is the structure. - - However, for computational reasons it is best to take - advantage of the decomposability of the log-likelihood score. - - As an example, if you add an edge from A->B, then you simply - need to calculate LOG(P'(B|A)) - Log(P(B)), and if the value - is positive then the edge improves the fitness score and should - therefore be included. - - Even more, you can expand and manipulate terms to calculate the - difference between the new graph and the original graph as follows: - Score(G') - Score(G) = M * I(X,Y), - where M is the number of data points and I(X,Y) is - the marginal mutual information calculated using - the empirical distribution over the data. - - In general, the likelihood score decomposes as follows: - LL(D | Theta_G, G) = - M * Sum over Variables ( I ( X , Parents(X) ) ) - - M * Sum over Variables ( H( X ) ), - where 'I' is mutual information and 'H' is the entropy, - and M is the number of data points - - Moreover, it is clear to see that H(X) is independent of the choice - of graph structure (G). Thus, we must only determine the difference - in the mutual information score of the original graph which had a given - node and its original parents, and the new graph which has a given node - and new parents. - - NOTE: This assumes the parameters have already - been learned for the BN's given structure. - - LL = LL - f(N)*|B|, where f(N) = 0 - - Arguments - --------- - *bn* : a BayesNet object - Must have both structure and parameters - instantiated. - Notes - ----- - NROW = data.shape[0] - mi_score = 0 - ent_score = 0 - for rv in bn.nodes(): - cols = tuple([bn.V.index(rv)].extend([bn.V.index(p) for p in bn.parents(rv)])) - mi_score += mutual_information(data[:,cols]) - ent_score += entropy(data[:,bn.V.index(rv)]) - - return NROW * (mi_score - ent_score) - """ - - NROW = data.shape[0] - mi_scores = [ - mutual_information( - data[:, (bn.V.index(rv),) + tuple([bn.V.index(p) for p in bn.parents(rv)])], - method=method, - ) - for rv in bn.nodes() - ] - ent_scores = [entropy(data[:, bn.V.index(rv)], method=method) for rv in bn.nodes()] - return NROW * (sum(mi_scores) - sum(ent_scores)) - - -def log_lik_local(data, method="LL"): - NROW = data.shape[0] - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - if isinstance(data, pd.DataFrame): - return NROW * ( - mutual_information(data, method=method) - - entropy(data.iloc[:, 0], method=method) - ) - elif isinstance(data, pd.Series): - return 0.0 - elif isinstance(data, np.ndarray): - return NROW * ( - mutual_information(data, method=method) - - entropy(data[:, 0], method=method) - ) - - -def BIC_local(data, method="BIC"): - NROW = data.shape[0] - log_score = log_lik_local(data, method=method) - try: - penalty = 0.5 * num_params(data) * np.log(NROW) - except OverflowError as err: - penalty = sys.float_info.max - return log_score - penalty - - -def num_params(data): - # Convert pandas DataFrame to numpy array - if isinstance(data, pd.DataFrame): - data = data.values - # Convert pandas Series to numpy array - if isinstance(data, pd.Series): - data = np.array(copy(data)) - - # Calculate number of parameters for numpy array - if isinstance(data, np.ndarray): - node_type = get_type_numpy(data) - columns_for_discrete = [ - param for param, node in node_type.items() if node == "cont" - ] - 
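        # Note on naming: despite the names, the list above gathers the
        # *continuous* columns ("cont"), while columns_for_code below gathers
        # the *discrete* ones ("disc"). The resulting count is roughly the
        # product of the number of unique values of each discrete column,
        # multiplied by the number of continuous columns (if any), which is
        # what the code further down computes.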
columns_for_code = [ - param for param, node in node_type.items() if node == "disc" - ] - - prod = 1 - for var in columns_for_code: - prod *= ( - len(np.unique(data[:, var])) if data.ndim != 1 else len(np.unique(data)) - ) - if columns_for_discrete: - prod *= len(columns_for_discrete) - - # Handle overflow error - try: - return prod - except OverflowError: - return sys.float_info.max - - # Raise an error if data type is unexpected - print("Num_params: Unexpected data type") - print(data) - return None - - -def AIC_local(data, method="AIC"): - log_score = log_lik_local(data, method=method) - penalty = num_params(data) - return log_score - penalty diff --git a/bamt/score_functions/__init__.py b/bamt/score_functions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bamt/score_functions/k2_score.py b/bamt/score_functions/k2_score.py new file mode 100644 index 0000000..4a46b6a --- /dev/null +++ b/bamt/score_functions/k2_score.py @@ -0,0 +1,9 @@ +from .score_function import ScoreFunction + + +class K2Score(ScoreFunction): + def __init__(self): + super().__init__() + + def estimate(self): + pass diff --git a/bamt/score_functions/mutual_information_score.py b/bamt/score_functions/mutual_information_score.py new file mode 100644 index 0000000..0bd2b7e --- /dev/null +++ b/bamt/score_functions/mutual_information_score.py @@ -0,0 +1,9 @@ +from .score_function import ScoreFunction + + +class MutualInformationScore(ScoreFunction): + def __init__(self): + super().__init__() + + def estimate(self): + pass diff --git a/bamt/score_functions/score_function.py b/bamt/score_functions/score_function.py new file mode 100644 index 0000000..e3ec70f --- /dev/null +++ b/bamt/score_functions/score_function.py @@ -0,0 +1,10 @@ +from abc import ABC, abstractmethod + + +class ScoreFunction(ABC): + def __init__(self): + pass + + @abstractmethod + def estimate(self): + pass diff --git a/bamt/utils/EvoUtils.py b/bamt/utils/EvoUtils.py deleted file mode 100644 index 5266a75..0000000 --- a/bamt/utils/EvoUtils.py +++ /dev/null @@ -1,116 +0,0 @@ -import random - -import pandas as pd -from golem.core.dag.convert import graph_structure_as_nx_graph -from golem.core.dag.graph_utils import ordered_subnodes_hierarchy -from golem.core.optimisers.graph import OptGraph, OptNode -from pgmpy.estimators import K2Score -from pgmpy.models import BayesianNetwork - - -class CustomGraphModel(OptGraph): - def evaluate(self, data: pd.DataFrame): - nodes = data.columns.to_list() - _, labels = graph_structure_as_nx_graph(self) - return len(nodes) - - -class CustomGraphNode(OptNode): - def __str__(self): - return f'{self.content["name"]}' - - -def K2_metric(graph: CustomGraphModel, data: pd.DataFrame): - graph_nx, labels = graph_structure_as_nx_graph(graph) - struct = [] - for meta_edge in graph_nx.edges(): - l1 = str(labels[meta_edge[0]]) - l2 = str(labels[meta_edge[1]]) - struct.append([l1, l2]) - - bn_model = BayesianNetwork(struct) - bn_model.add_nodes_from(data.columns) - - score = K2Score(data).score(bn_model) - return -score - - -def custom_mutation_add(graph: CustomGraphModel, **kwargs): - num_mut = 100 - try: - for _ in range(num_mut): - rid = random.choice(range(len(graph.nodes))) - random_node = graph.nodes[rid] - other_random_node = graph.nodes[random.choice(range(len(graph.nodes)))] - nodes_not_cycling = random_node.descriptive_id not in [ - n.descriptive_id for n in ordered_subnodes_hierarchy(other_random_node) - ] and other_random_node.descriptive_id not in [ - n.descriptive_id for n in 
ordered_subnodes_hierarchy(random_node) - ] - if nodes_not_cycling: - random_node.nodes_from.append(other_random_node) - break - - except Exception as ex: - print(f"Incorrect connection: {ex}") - return graph - - -def custom_mutation_delete(graph: OptGraph, **kwargs): - num_mut = 100 - try: - for _ in range(num_mut): - rid = random.choice(range(len(graph.nodes))) - random_node = graph.nodes[rid] - other_random_node = graph.nodes[random.choice(range(len(graph.nodes)))] - if ( - random_node.nodes_from is not None - and other_random_node in random_node.nodes_from - ): - random_node.nodes_from.remove(other_random_node) - break - except Exception as ex: - print(ex) - return graph - - -def custom_mutation_reverse(graph: OptGraph, **kwargs): - num_mut = 100 - try: - for _ in range(num_mut): - rid = random.choice(range(len(graph.nodes))) - random_node = graph.nodes[rid] - other_random_node = graph.nodes[random.choice(range(len(graph.nodes)))] - if ( - random_node.nodes_from is not None - and other_random_node in random_node.nodes_from - ): - random_node.nodes_from.remove(other_random_node) - other_random_node.nodes_from.append(random_node) - break - except Exception as ex: - print(ex) - return graph - - -def has_no_duplicates(graph): - _, labels = graph_structure_as_nx_graph(graph) - if len(labels.values()) != len(set(labels.values())): - raise ValueError("Custom graph has duplicates") - return True - - -def has_no_blacklist_edges(graph, blacklist): - nx_graph, _ = graph_structure_as_nx_graph(graph) - for edge in nx_graph.edges(): - if edge in blacklist: - raise ValueError("Graph contains blacklisted edges") - return True - - -def has_only_whitelist_edges(graph, whitelist): - nx_graph, _ = graph_structure_as_nx_graph(graph) - for edge in nx_graph.edges(): - if edge not in whitelist: - raise ValueError("Graph contains non-whitelisted edges") - return True diff --git a/bamt/utils/GraphUtils.py b/bamt/utils/GraphUtils.py deleted file mode 100644 index ccffae1..0000000 --- a/bamt/utils/GraphUtils.py +++ /dev/null @@ -1,158 +0,0 @@ -from typing import Dict, List, Tuple, Type - -import networkx as nx -from pandas import DataFrame - -from bamt.log import logger_preprocessor -from bamt.nodes.base import BaseNode - - -def nodes_types(data: DataFrame) -> Dict[str, str]: - """ - Function to define the type of the node - disc - discrete node - cont - continuous - Args: - data: input dataset - - Returns: - dict: output dictionary where 'key' - node name and 'value' - node type - """ - - column_type = dict() - for c in data.columns.to_list(): - disc = ["str", "O", "b", "categorical", "object", "bool"] - disc_numerical = ["int32", "int64"] - cont = ["float32", "float64"] - if data[c].dtype.name in disc: - column_type[c] = "disc" - elif data[c].dtype.name in cont: - column_type[c] = "cont" - elif data[c].dtype.name in disc_numerical: - column_type[c] = "disc_num" - else: - logger_preprocessor.error(f"Unsupported data type. 
Dtype: {data[c].dtypes}") - - return column_type - - -def nodes_signs(nodes_types: dict, data: DataFrame) -> Dict[str, str]: - """Function to define sign of the node - neg - if node has negative values - pos - if node has only positive values - - Args: - data (pd.DataFrame): input dataset - nodes_types (dict): dict with nodes_types - - Returns: - dict: output dictionary where 'key' - node name and 'value' - sign of data - """ - if list(nodes_types.keys()) != data.columns.to_list(): - logger_preprocessor.error("Nodes_types dictionary is not full.") - return - columns_sign = dict() - for c in data.columns.to_list(): - if nodes_types[c] == "cont": - if (data[c] < 0).any(): - columns_sign[c] = "neg" - else: - columns_sign[c] = "pos" - return columns_sign - - -def get_descriptor(data) -> Dict[str, Dict[str, str]]: - return {"types": nodes_types(data), "signs": nodes_signs(nodes_types(data), data)} - - -def toporder(nodes: List[Type[BaseNode]], edges: List[Tuple]) -> List[List[str]]: - """ - Function for topological sorting - """ - G = nx.DiGraph() - G.add_nodes_from([node.name for node in nodes]) - G.add_edges_from(edges) - return list(nx.topological_sort(G)) - - -class GraphAnalyzer(object): - """ - Object to analyze DAG. - """ - - def __init__(self, bn): - self.bn = bn - - def _isolate_structure(self, nodes): - isolated_edges = [] - for edge in self.bn.edges: - if edge[0] in nodes and edge[1] in nodes: - isolated_edges.append(edge) - return isolated_edges - - def markov_blanket(self, node_name: str): - node = self.bn[node_name] - - parents = node.cont_parents + node.disc_parents - children = node.children - fremd_eltern = [] - - for child in node.children: - all_parents = self.bn[child].cont_parents + self.bn[child].disc_parents - - if all_parents == [node_name]: - continue - else: - new = all_parents - fremd_eltern.extend(new) - - nodes = parents + children + fremd_eltern + [node_name] - - edges = self._isolate_structure(nodes) - return {"nodes": list(set(nodes)), "edges": edges} - - def _collect_height(self, node_name, height): - nodes = [] - node = self.bn[node_name] - if height <= 0: - return [] - - if height == 1: - return node.disc_parents + node.cont_parents - - for parent in node.cont_parents + node.disc_parents: - nodes.append(parent) - nodes.extend(self._collect_height(parent, height=height - 1)) - return nodes - - def _collect_depth(self, node_name, depth): - nodes = [] - node = self.bn[node_name] - - if depth <= 0: - return [] - - if depth == 1: - return node.children - - for child in node.children: - nodes.append(child) - nodes.extend(self._collect_depth(child, depth=depth - 1)) - - return nodes - - def find_family(self, *args): - node_name, height, depth, with_nodes = args - if not with_nodes: - with_nodes = [] - else: - with_nodes = list(with_nodes) - nodes = ( - self._collect_depth(node_name, depth) - + self._collect_height(node_name, height) - + [node_name] - ) - - nodes = list(set(nodes + with_nodes)) - - return {"nodes": nodes, "edges": self._isolate_structure(nodes + with_nodes)} diff --git a/bamt/utils/MathUtils.py b/bamt/utils/MathUtils.py deleted file mode 100644 index ed08105..0000000 --- a/bamt/utils/MathUtils.py +++ /dev/null @@ -1,161 +0,0 @@ -import math - -import numpy as np -from scipy import stats -from scipy.stats.distributions import chi2 -from sklearn.mixture import GaussianMixture - - -def lrts_comp(data): - n = 0 - biggets_p = -1 * np.infty - comp_biggest = 0 - max_comp = 10 - if len(data) < max_comp: - max_comp = len(data) - for i in range(1, max_comp + 1, 
1): - gm1 = GaussianMixture(n_components=i, random_state=0) - gm2 = GaussianMixture(n_components=i + 1, random_state=0) - gm1.fit(data) - ll1 = np.mean(gm1.score_samples(data)) - gm2.fit(data) - ll2 = np.mean(gm2.score_samples(data)) - LR = 2 * (ll2 - ll1) - p = chi2.sf(LR, 1) - if p > biggets_p: - biggets_p = p - comp_biggest = i - n = comp_biggest - return n - - -def mix_norm_cdf(x, weights, means, covars): - mcdf = 0.0 - for i in range(len(weights)): - mcdf += weights[i] * stats.norm.cdf(x, loc=means[i][0], scale=covars[i][0][0]) - return mcdf - - -def theoretical_quantile(data, n_comp): - model = GaussianMixture(n_components=n_comp, random_state=0) - model.fit(data) - q = [] - x = [] - # step = ((np.max(model.sample(100000)[0])) - (np.min(model.sample(100000)[0])))/1000 - step = (np.max(data) - np.min(data)) / 1000 - d = np.arange(np.min(data), np.max(data), step) - for i in d: - x.append(i) - q.append(mix_norm_cdf(i, model.weights_, model.means_, model.covariances_)) - return x, q - - -def quantile_mix(p, vals, q): - ind = q.index(min(q, key=lambda x: abs(x - p))) - return vals[ind] - - -def probability_mix(val, vals, q): - ind = vals.index(min(vals, key=lambda x: abs(x - val))) - return q[ind] - - -def sum_dist(data, vals, q): - percs = np.linspace(1, 100, 10) - x = np.quantile(data, percs / 100) - y = [] - for p in percs: - y.append(quantile_mix(p / 100, vals, q)) - dist = 0 - for xi, yi in zip(x, y): - dist = dist + (abs(-1 * xi + yi)) / math.sqrt(2) - return dist - - -def component(data, columns, method): - n = 1 - max_comp = 10 - x = [] - if data.shape[0] < max_comp: - max_comp = data.shape[0] - if len(columns) == 1: - x = np.transpose([data[columns[0]].values]) - else: - x = data[columns].values - if method == "aic": - lowest_aic = np.infty - comp_lowest = 0 - for i in range(1, max_comp + 1, 1): - gm1 = GaussianMixture(n_components=i, random_state=0) - gm1.fit(x) - aic1 = gm1.aic(x) - if aic1 < lowest_aic: - lowest_aic = aic1 - comp_lowest = i - n = comp_lowest - - if method == "bic": - lowest_bic = np.infty - comp_lowest = 0 - for i in range(1, max_comp + 1, 1): - gm1 = GaussianMixture(n_components=i, random_state=0) - gm1.fit(x) - bic1 = gm1.bic(x) - if bic1 < lowest_bic: - lowest_bic = bic1 - comp_lowest = i - n = comp_lowest - - if method == "LRTS": - n = lrts_comp(x) - if method == "quantile": - biggest_p = -1 * np.infty - comp_biggest = 0 - for i in range(1, max_comp, 1): - vals, q = theoretical_quantile(x, i) - dist = sum_dist(x, vals, q) - p = probability_mix(dist, vals, q) - if p > biggest_p: - biggest_p = p - comp_biggest = i - n = comp_biggest - return n - - -def _child_dict(net: list): - res_dict = dict() - for e0, e1 in net: - if e1 in res_dict: - res_dict[e1].append(e0) - else: - res_dict[e1] = [e0] - return res_dict - - -def precision_recall(pred_net: list, true_net: list, decimal=4): - pred_dict = _child_dict(pred_net) - true_dict = _child_dict(true_net) - corr_undirected = 0 - corr_dir = 0 - for e0, e1 in pred_net: - flag = True - if e1 in true_dict: - if e0 in true_dict[e1]: - corr_undirected += 1 - corr_dir += 1 - flag = False - if (e0 in true_dict) and flag: - if e1 in true_dict[e0]: - corr_undirected += 1 - pred_len = len(pred_net) - true_len = len(true_net) - shd = pred_len + true_len - corr_undirected - corr_dir - return { - "AP": round(corr_undirected / pred_len, decimal), - "AR": round(corr_undirected / true_len, decimal), - # 'F1_undir': round(2 * (corr_undirected / pred_len) * (corr_undirected / true_len) / (corr_undirected / pred_len + corr_undirected 
/ true_len), decimal), - "AHP": round(corr_dir / pred_len, decimal), - "AHR": round(corr_dir / true_len, decimal), - # 'F1_directed': round(2*(corr_dir/pred_len)*(corr_dir/true_len)/(corr_dir/pred_len+corr_dir/true_len), decimal), - "SHD": shd, - } diff --git a/bamt/utils/check_utils.py b/bamt/utils/check_utils.py deleted file mode 100644 index ec44a33..0000000 --- a/bamt/utils/check_utils.py +++ /dev/null @@ -1,9 +0,0 @@ -def is_model(model): - try: - methods = dir(model) - if "fit" in methods: - return True - else: - return False - except Exception: - return False diff --git a/bamt/utils/composite_utils/CompositeGeneticOperators.py b/bamt/utils/composite_utils/CompositeGeneticOperators.py deleted file mode 100644 index f42493a..0000000 --- a/bamt/utils/composite_utils/CompositeGeneticOperators.py +++ /dev/null @@ -1,189 +0,0 @@ -from math import log10 -from random import choice - -import pandas as pd -from golem.core.dag.graph_utils import ordered_subnodes_hierarchy -from numpy import std, mean, log -from scipy.stats import norm -from sklearn.metrics import mean_squared_error -from sklearn.model_selection import train_test_split - -from .CompositeModel import CompositeModel -from .MLUtils import MlModels - - -def custom_crossover_all_model( - graph_first: CompositeModel, graph_second: CompositeModel, max_depth -): - num_cros = 100 - try: - for _ in range(num_cros): - selected_node1 = choice(graph_first.nodes) - if selected_node1.nodes_from is None or selected_node1.nodes_from == []: - continue - - selected_node2 = graph_second.get_nodes_by_name(str(selected_node1))[0] - if selected_node2.nodes_from is None or selected_node2.nodes_from == []: - continue - - model1 = selected_node1.content["parent_model"] - model2 = selected_node2.content["parent_model"] - - selected_node1.content["parent_model"] = model2 - selected_node2.content["parent_model"] = model1 - - break - - except Exception as ex: - print(ex) - return graph_first, graph_second - - -def custom_mutation_add_structure(graph: CompositeModel, **kwargs): - num_mut = 100 - try: - for _ in range(num_mut): - rid = choice(range(len(graph.nodes))) - random_node = graph.nodes[rid] - other_random_node = graph.nodes[choice(range(len(graph.nodes)))] - nodes_not_cycling = random_node.descriptive_id not in [ - n.descriptive_id for n in ordered_subnodes_hierarchy(other_random_node) - ] and other_random_node.descriptive_id not in [ - n.descriptive_id for n in ordered_subnodes_hierarchy(random_node) - ] - if nodes_not_cycling: - other_random_node.nodes_from.append(random_node) - ml_models = MlModels() - other_random_node.content[ - "parent_model" - ] = ml_models.get_model_by_children_type(other_random_node) - break - - except Exception as ex: - graph.log.warn(f"Incorrect connection: {ex}") - return graph - - -def custom_mutation_delete_structure(graph: CompositeModel, **kwargs): - num_mut = 100 - try: - for _ in range(num_mut): - rid = choice(range(len(graph.nodes))) - random_node = graph.nodes[rid] - other_random_node = graph.nodes[choice(range(len(graph.nodes)))] - if ( - random_node.nodes_from is not None - and other_random_node in random_node.nodes_from - ): - random_node.nodes_from.remove(other_random_node) - if not random_node.nodes_from: - random_node.content["parent_model"] = None - break - except Exception as ex: - print(ex) - return graph - - -def custom_mutation_reverse_structure(graph: CompositeModel, **kwargs): - num_mut = 100 - try: - for _ in range(num_mut): - rid = choice(range(len(graph.nodes))) - random_node = graph.nodes[rid] - 
other_random_node = graph.nodes[choice(range(len(graph.nodes)))] - if ( - random_node.nodes_from is not None - and other_random_node in random_node.nodes_from - ): - random_node.nodes_from.remove(other_random_node) - if not random_node.nodes_from: - random_node.content["parent_model"] = None - other_random_node.nodes_from.append(random_node) - ml_models = MlModels() - other_random_node.content[ - "parent_model" - ] = ml_models.get_model_by_children_type(other_random_node) - break - except Exception as ex: - print(ex) - return graph - - -def custom_mutation_add_model(graph: CompositeModel, **kwargs): - try: - all_nodes = graph.nodes - nodes_with_parents = [ - node - for node in all_nodes - if (node.nodes_from != [] and node.nodes_from is not None) - ] - if not nodes_with_parents: - return graph - node = choice(nodes_with_parents) - ml_models = MlModels() - node.content["parent_model"] = ml_models.get_model_by_children_type(node) - except Exception as ex: - print(ex) - return graph - - -def composite_metric(graph: CompositeModel, data: pd.DataFrame, percent=0.02): - data_all = data - data_train, data_test = train_test_split(data_all, train_size=0.8, random_state=42) - score, len_data = 0, len(data_train) - for node in graph.nodes: - data_of_node_train = data_train[node.content["name"]] - data_of_node_test = data_test[node.content["name"]] - if node.nodes_from is None or node.nodes_from == []: - if node.content["type"] == "cont": - mu, sigma = mean(data_of_node_train), std(data_of_node_train) - score += norm.logpdf( - data_of_node_test.values, loc=mu, scale=sigma - ).sum() - else: - count = data_of_node_train.value_counts() - frequency = log(count / len_data) - index = frequency.index.tolist() - for value in data_of_node_test: - if value in index: - score += frequency[value] - else: - model, columns, target, idx = ( - MlModels().dict_models[node.content["parent_model"]](), - [n.content["name"] for n in node.nodes_from], - data_of_node_train.to_numpy(), - data_train.index.to_numpy(), - ) - setattr(model, "max_iter", 100000) - features = data_train[columns].to_numpy() - if len(set(target)) == 1: - continue - fitted_model = model.fit(features, target) - - features = data_test[columns].to_numpy() - target = data_of_node_test.to_numpy() - if node.content["type"] == "cont": - predict = fitted_model.predict(features) - mse = mean_squared_error(target, predict, squared=False) + 0.0000001 - a = norm.logpdf(target, loc=predict, scale=mse) - score += a.sum() - else: - predict_proba = fitted_model.predict_proba(features) - idx = pd.array(list(range(len(target)))) - li = [] - - for i in idx: - a = predict_proba[i] - try: - b = a[target[i]] - except BaseException: - b = 0.0000001 - if b < 0.0000001: - b = 0.0000001 - li.append(log(b)) - score += sum(li) - - edges_count = len(graph.get_edges()) - score -= (edges_count * percent) * log10(len_data) * edges_count - - return -score diff --git a/bamt/utils/composite_utils/CompositeModel.py b/bamt/utils/composite_utils/CompositeModel.py deleted file mode 100644 index 344a7c8..0000000 --- a/bamt/utils/composite_utils/CompositeModel.py +++ /dev/null @@ -1,17 +0,0 @@ -from typing import Optional, Union, List - -from golem.core.dag.graph_delegate import GraphDelegate -from golem.core.dag.linked_graph_node import LinkedGraphNode - - -class CompositeNode(LinkedGraphNode): - def __str__(self): - return self.content["name"] - - -class CompositeModel(GraphDelegate): - def __init__( - self, nodes: Optional[Union[LinkedGraphNode, List[LinkedGraphNode]]] = None - ): - 
super().__init__(nodes) - self.unique_pipeline_id = 1 diff --git a/bamt/utils/composite_utils/MLUtils.py b/bamt/utils/composite_utils/MLUtils.py deleted file mode 100644 index 9c3cefb..0000000 --- a/bamt/utils/composite_utils/MLUtils.py +++ /dev/null @@ -1,126 +0,0 @@ -import json -from random import choice - -import pkg_resources -from typing import Union - -from catboost import CatBoostClassifier, CatBoostRegressor -from golem.core.dag.graph_node import GraphNode -from sklearn.cluster import KMeans -from sklearn.ensemble import ( - AdaBoostRegressor, - ExtraTreesRegressor, - GradientBoostingRegressor, - RandomForestClassifier, - RandomForestRegressor, -) -from sklearn.linear_model import ( - Lasso, - LinearRegression, - LogisticRegression, - Ridge, - SGDRegressor, -) -from sklearn.naive_bayes import BernoulliNB, MultinomialNB -from sklearn.neural_network import MLPClassifier -from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor -from xgboost import XGBClassifier, XGBRegressor - -from .CompositeModel import CompositeNode - -# Try to import LGBMRegressor and LGBMClassifier from lightgbm, if not available set to None -try: - from lightgbm.sklearn import LGBMRegressor, LGBMClassifier -except ModuleNotFoundError: - LGBMRegressor = None - LGBMClassifier = None - -lgbm_params = "lgbm_params.json" -models_repo = "models_repo.json" -lgbm_params_path = pkg_resources.resource_filename(__name__, lgbm_params) -models_repo_path = pkg_resources.resource_filename(__name__, models_repo) - - -class MlModels: - def __init__(self): - self.operations_by_types = { - "xgbreg": "XGBRegressor", - "adareg": "AdaBoostRegressor", - "gbr": "GradientBoostingRegressor", - "dtreg": "DecisionTreeRegressor", - "treg": "ExtraTreesRegressor", - "rfr": "RandomForestRegressor", - "linear": "LinearRegression", - "ridge": "Ridge", - "lasso": "Lasso", - "sgdr": "SGDRegressor", - "lgbmreg": "LGBMRegressor", - "catboostreg": "CatBoostRegressor", - "xgboost": "XGBClassifier", - "logit": "LogisticRegression", - "bernb": "BernoulliNB", - "multinb": "MultinomialNB", - "dt": "DecisionTreeClassifier", - "rf": "RandomForestClassifier", - "mlp": "MLPClassifier", - "catboost": "CatBoostClassifier", - "kmeans": "KMeans", - } - - self.dict_models = { - "XGBRegressor": XGBRegressor, - "AdaBoostRegressor": AdaBoostRegressor, - "GradientBoostingRegressor": GradientBoostingRegressor, - "DecisionTreeRegressor": DecisionTreeRegressor, - "ExtraTreesRegressor": ExtraTreesRegressor, - "RandomForestRegressor": RandomForestRegressor, - "LinearRegression": LinearRegression, - "Ridge": Ridge, - "Lasso": Lasso, - "SGDRegressor": SGDRegressor, - "LGBMRegressor": LGBMRegressor, - "CatBoostRegressor": CatBoostRegressor, - "XGBClassifier": XGBClassifier, - "LogisticRegression": LogisticRegression, - "BernoulliNB": BernoulliNB, - "MultinomialNB": MultinomialNB, - "DecisionTreeClassifier": DecisionTreeClassifier, - "RandomForestClassifier": RandomForestClassifier, - "MLPClassifier": MLPClassifier, - "LGBMClassifier": LGBMClassifier, - "CatBoostClassifier": CatBoostClassifier, - "KMeans": KMeans, - } - - # Include LGBMRegressor and LGBMClassifier if they were imported successfully - if LGBMRegressor is not None: - self.dict_models["LGBMRegressor"] = LGBMRegressor - self.operations_by_types["lgbmreg"] = "LGBMRegressor" - if LGBMClassifier is not None: - self.dict_models["LGBMClassifier"] = LGBMClassifier - self.operations_by_types["lgbm"] = "LGBMClassifier" - - if LGBMClassifier and LGBMRegressor is not None: - with open(lgbm_params_path) as 
file: - self.lgbm_dict = json.load(file) - - def get_model_by_children_type(self, node: Union[GraphNode, CompositeNode]): - candidates = [] - if node.content["type"] == "cont": - type_model = "regr" - else: - type_model = "class" - forbidden_tags = ["non-default", "expensive"] - with open(models_repo_path, "r") as f: - models_json = json.load(f) - models = models_json["operations"] - if LGBMClassifier and LGBMRegressor is not None: - models = models | self.lgbm_dict - for model, value in models.items(): - if ( - model not in ["knnreg", "knn", "qda"] - and list(set(value["tags"]).intersection(forbidden_tags)) == [] - and type_model in value["meta"] - ): - candidates.append(model) - return self.operations_by_types[choice(candidates)] diff --git a/bamt/utils/composite_utils/lgbm_params.json b/bamt/utils/composite_utils/lgbm_params.json deleted file mode 100644 index 5c5fa9d..0000000 --- a/bamt/utils/composite_utils/lgbm_params.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "lgbm": { - "meta": "sklearn_class", - "tags": ["boosting", "tree", "non_linear", "mix"] - }, - "lgbmreg": { - "meta": "sklearn_regr", - "presets": ["*tree"], - "tags": ["boosting", "tree", "non_multi", "non_linear", "mix"] - } -} \ No newline at end of file diff --git a/bamt/utils/composite_utils/models_repo.json b/bamt/utils/composite_utils/models_repo.json deleted file mode 100644 index 38a4d20..0000000 --- a/bamt/utils/composite_utils/models_repo.json +++ /dev/null @@ -1,447 +0,0 @@ -{ - "metadata": { - "custom_class": { - "accepted_node_types": [ - "any" - ], - "description": "Implementations of the custom classification models", - "forbidden_node_types": "[]", - "input_type": "[DataTypesEnum.table]", - "output_type": "[DataTypesEnum.table]", - "strategies": [ - "fedot.core.operations.evaluation.classification", - "FedotClassificationStrategy" - ], - "tags": [ - "ml", - "custom" - ], - "tasks": "[TaskTypesEnum.classification]" - }, - "custom_regr": { - "accepted_node_types": [ - "any" - ], - "description": "Implementations of the custom regression models", - "forbidden_node_types": "[]", - "input_type": "[DataTypesEnum.table]", - "output_type": "[DataTypesEnum.table]", - "strategies": [ - "fedot.core.operations.evaluation.regression", - "FedotRegressionStrategy" - ], - "tags": [ - "ml", - "custom" - ], - "tasks": "[TaskTypesEnum.regression]" - }, - "sklearn_class": { - "accepted_node_types": [ - "any" - ], - "description": "Implementations of the classification models from scikit-learn framework", - "forbidden_node_types": "[]", - "input_type": "[DataTypesEnum.table]", - "output_type": "[DataTypesEnum.table]", - "strategies": [ - "fedot.core.operations.evaluation.classification", - "SkLearnClassificationStrategy" - ], - "tags": [ - "ml", - "sklearn" - ], - "tasks": "[TaskTypesEnum.classification]" - }, - "sklearn_clust": { - "accepted_node_types": [ - "any" - ], - "description": "Implementations of the clustering models from scikit-learn framework", - "forbidden_node_types": "[]", - "input_type": "[DataTypesEnum.table]", - "output_type": "[DataTypesEnum.table]", - "strategies": [ - "fedot.core.operations.evaluation.clustering", - "SkLearnClusteringStrategy" - ], - "tags": [ - "ml", - "sklearn" - ], - "tasks": "[TaskTypesEnum.clustering]" - }, - "sklearn_regr": { - "accepted_node_types": [ - "any" - ], - "description": "Implementations of the regression models from scikit-learn framework", - "forbidden_node_types": "[]", - "input_type": "[DataTypesEnum.table]", - "output_type": "[DataTypesEnum.table]", - "strategies": [ - 
"fedot.core.operations.evaluation.regression", - "SkLearnRegressionStrategy" - ], - "tags": [ - "ml", - "sklearn", - "composition" - ], - "tasks": "[TaskTypesEnum.regression, TaskTypesEnum.ts_forecasting]" - }, - "ts_model": { - "description": "Implementations of the time series models", - "input_type": "[DataTypesEnum.ts, DataTypesEnum.multi_ts]", - "output_type": "[DataTypesEnum.table]", - "strategies": [ - "fedot.core.operations.evaluation.time_series", - "FedotTsForecastingStrategy" - ], - "tags": [ - "time_series" - ], - "tasks": "[TaskTypesEnum.ts_forecasting]" - }, - "custom_model": { - "description": "Implementations of the models specified by user with external code source", - "input_type": "[DataTypesEnum.ts, DataTypesEnum.table, DataTypesEnum.text]", - "output_type": "[DataTypesEnum.table]", - "strategies": [ - "fedot.core.operations.evaluation.custom", - "CustomModelStrategy" - ], - "tags": [ - "non-default" - ], - "tasks": "[TaskTypesEnum.regression, TaskTypesEnum.ts_forecasting, TaskTypesEnum.classification, TaskTypesEnum.clustering]" - } - }, - "operations": { - "adareg": { - "meta": "sklearn_regr", - "presets": ["fast_train", "ts", "*tree"], - "tags": [ - "boosting", - "non_multi", - "non_linear", - "cont" - ] - }, - "ar": { - "meta": "ts_model", - "presets": ["fast_train", "ts"], - "tags": [ - "simple", - "interpretable", - "non_lagged", - "linear", - "correct_params" - ], - "input_type": "[DataTypesEnum.ts]" - }, - "arima": { - "meta": "ts_model", - "presets": ["ts"], - "tags": [ - "simple", - "interpretable", - "non_lagged", - "linear", - "new_data_refit" - ], - "input_type": "[DataTypesEnum.ts]" - }, - "clstm": { - "meta": "ts_model", - "tags": [ - "non_lagged", - "non_linear" - ], - "input_type": "[DataTypesEnum.ts]" - }, - "bernb": { - "meta": "sklearn_class", - "presets": ["fast_train"], - "tags": [ - "bayesian", "non_multi", "linear", "disc" - ] - }, - "catboost": { - "meta": "sklearn_class", - "presets": ["*tree"], - "tags": [ - "boosting", "non-default", "non_linear", "mix" - ] - }, - "catboostreg": { - "meta": "sklearn_regr", - "presets": ["*tree"], - "tags": [ - "boosting", "non_multi", "non-default", "non_linear", "mix" - ] - }, - "dt": { - "meta": "sklearn_class", - "presets": ["fast_train", "*tree"], - "tags": [ - "tree", - "interpretable", - "non_linear", - "cont" - ] - }, - "dtreg": { - "meta": "sklearn_regr", - "presets": ["fast_train", "ts", "*tree"], - "tags": [ - "tree", - "interpretable", - "non_linear", - "cont" - ] - }, - "gbr": { - "meta": "sklearn_regr", - "presets": ["*tree"], - "tags": [ - "boosting", - "non_multi", - "non_linear", - "cont" - ] - }, - "kmeans": { - "meta": "sklearn_clust", - "presets": ["fast_train"], - "tags": ["linear"] - }, - "knn": { - "meta": "custom_class", - "presets": ["fast_train"], - "tags": [ - "simple", - "correct_params", - "non_linear", - "cont" - ] - }, - "knnreg": { - "meta": "custom_regr", - "presets": ["fast_train", "ts"], - "tags": [ - "simple", - "correct_params", - "non_linear", - "cont" - ] - }, - "lasso": { - "meta": "sklearn_regr", - "presets": ["fast_train", "ts"], - "tags": [ - "simple", - "linear", - "interpretable", - "cont" - ] - }, - "lda": { - "meta": "custom_class", - "presets": ["fast_train"], - "tags": [ - "discriminant", "linear", "correct_params", "non-default", "cont" - ] - }, - "linear": { - "meta": "sklearn_regr", - "presets": ["fast_train", "ts"], - "tags": [ - "simple", "linear", "interpretable", "cont" - ] - }, - "logit": { - "meta": "sklearn_class", - "presets": ["fast_train"], - "tags": 
[ - "simple", - "linear", - "interpretable", - "non_multi", - "cont" - ] - }, - "mlp": { - "meta": "sklearn_class", - "tags": [ - "neural", - "non_linear", - "cont" - ] - }, - "multinb": { - "meta": "sklearn_class", - "presets": ["fast_train"], - "tags": [ - "non-default", - "bayesian", - "non_multi", - "linear", - "disc" - ] - }, - "qda": { - "meta": "custom_class", - "presets": ["fast_train"], - "tags": [ - "discriminant", - "quadratic", - "non_linear", - "cont" - ] - }, - "rf": { - "meta": "sklearn_class", - "presets": ["fast_train", "*tree"], - "tags": ["tree", "non_linear", "cont"] - }, - "rfr": { - "meta": "sklearn_regr", - "presets": ["fast_train", "*tree"], - "tags": ["tree", "non_linear", "cont"] - }, - "ridge": { - "meta": "sklearn_regr", - "presets": ["fast_train", "ts"], - "tags": [ - "simple", - "linear", - "interpretable", - "cont" - ] - }, - "polyfit": { - "meta": "ts_model", - "presets": ["fast_train", "ts"], - "tags": [ - "simple", - "non_lagged", - "non_linear", - "interpretable", - "correct_params" - ], - "input_type": "[DataTypesEnum.ts]" - }, - "sgdr": { - "meta": "sklearn_regr", - "presets": ["fast_train", "ts"], - "tags": [ - "non_multi", "non_linear", "cont" - ] - }, - "stl_arima": { - "meta": "ts_model", - "presets": ["ts"], - "tags": [ - "simple", - "interpretable", - "non_lagged", - "linear", - "new_data_refit" - ], - "input_type": "[DataTypesEnum.ts]" - }, - "glm": { - "meta": "ts_model", - "presets": ["fast_train", "ts"], - "tags": [ - "simple", - "interpretable", - "non_lagged", - "correct_params", - "non_linear" - ], - "input_type": "[DataTypesEnum.ts]" - }, - "ets": { - "meta": "ts_model", - "presets": ["fast_train", "ts"], - "tags": [ - "simple", - "interpretable", - "non_lagged", - "non_linear" - ], - "input_type": "[DataTypesEnum.ts]" - }, - "locf": { - "meta": "ts_model", - "presets": ["fast_train", "ts"], - "tags": [ - "non_linear", - "simple", - "interpretable", - "non_lagged" - ], - "input_type": "[DataTypesEnum.ts]" - }, - "ts_naive_average": { - "meta": "ts_model", - "presets": ["fast_train", "ts"], - "tags": [ - "non_linear", - "simple", - "interpretable", - "non_lagged" - ], - "input_type": "[DataTypesEnum.ts]" - }, - "svc": { - "meta": "custom_class", - "tags": [ - "no_prob", - "expensive", - "non_linear" - ] - }, - "treg": { - "meta": "sklearn_regr", - "presets": ["*tree"], - "tags": [ - "tree", - "non_linear", - "cont" - ] - }, - "xgboost": { - "meta": "sklearn_class", - "presets": ["*tree"], - "tags": [ - "boosting", "tree", "non-default", "non_linear", "mix" - ] - }, - "xgbreg": { - "meta": "sklearn_regr", - "presets": ["*tree"], - "tags": [ - "boosting", "tree", "non_multi", "non-default", "non_linear", "cont" - ] - }, - "cnn": { - "meta": "custom_class", - "tags": [ - "deep", "non-default", "non_linear" - ], - "input_type": "[DataTypesEnum.image]", - "output_type": "[DataTypesEnum.table]" - }, - "custom": { - "meta": "custom_model", - "tags": [ - "custom_model", - "non-default" - ] - } - } -} diff --git a/bamt/utils/serialization_utils.py b/bamt/utils/serialization_utils.py deleted file mode 100644 index 1e4ff74..0000000 --- a/bamt/utils/serialization_utils.py +++ /dev/null @@ -1,150 +0,0 @@ -import os -import pickle -from typing import Union, Tuple - -import joblib - -import bamt.utils.check_utils as check_utils -from bamt.log import logger_nodes - - -class ModelsSerializer: - def __init__(self, bn_name, models_dir): - self.bn_name = bn_name - self.models_dir = models_dir - self.serialization = None - - if os.path.isdir(models_dir): - if 
bn_name in os.listdir(models_dir): - raise AssertionError(f"Name must be unique. | {os.listdir(models_dir)}") - - @staticmethod - def choose_serialization(model) -> Tuple[str, Union[Exception, int]]: - try: - ex_b = pickle.dumps(model, protocol=4) - model_ser = ex_b.decode("latin1").replace("'", '"') - - if type(model).__name__ == "CatBoostRegressor": - a = model_ser.encode("latin1") - else: - a = model_ser.replace('"', "'").encode("latin1") - - classifier_body = pickle.loads(a) - return "pickle", 1 - except Exception: - return "joblib", -1 - - def get_path_joblib(self, models_dir, node_name: str) -> str: - """ - Args: - node_name: name of node - specific: more specific unique name for node. - For example, combination. - - Return: - Path to node. - """ - path = os.path.join(models_dir, self.bn_name, f"{node_name.replace(' ', '_')}") - - if not os.path.isdir(path): - os.makedirs( - os.path.join(models_dir, self.bn_name, f"{node_name.replace(' ', '_')}") - ) - return path - - def serialize_instance(self, instance: dict, model_type, node_name, specific=False): - """Every distribution contains a dict with params with models""" - model_ser = None - - model = instance[f"{model_type}_obj"] - if not check_utils.is_model(model): - return instance - - serialization, status = self.choose_serialization(model) - if status == -1: - logger_nodes.warning( - f"{node_name}:{'' if not specific else specific}::Pickle failed. BAMT will use Joblib." - ) - if serialization == "pickle": - ex_b = pickle.dumps(model, protocol=4) - model_ser = ex_b.decode("latin1") - elif serialization == "joblib": - path = self.get_path_joblib( - models_dir=self.models_dir, node_name=node_name.replace(" ", "_") - ) - if not specific: - destination = f"{node_name.replace(' ', '_')}.joblib.compressed" - else: - destination = f"{specific}.joblib.compressed" - path = os.path.abspath(os.path.join(path, destination)) - joblib.dump(model, path, compress=True, protocol=4) - else: - logger_nodes.error("Serialization detection failed.") - return - instance["serialization"] = serialization - instance[f"{model_type}_obj"] = model_ser or path - return instance - - def serialize(self, distributions): - result = {} - for node_name, [node_type, dist] in distributions.items(): - result[node_name] = {} - model_type = "regressor" if "Gaussian" in node_type else "classifier" - if "Conditional" in node_type: - result[node_name]["hybcprob"] = {} - for combination, dist_nested in dist["hybcprob"].items(): - instance_serialized = self.serialize_instance( - instance=dist_nested, - model_type=model_type, - node_name=node_name, - specific=combination, - ) - result[node_name]["hybcprob"][combination] = instance_serialized - else: - instance_serialized = self.serialize_instance( - instance=dist, model_type=model_type, node_name=node_name - ) - result[node_name] = instance_serialized - return result - - -class Deserializer: - def __init__(self, models_dir): - self.models_dir = models_dir - - @staticmethod - def deserialize_instance(instance: dict, model_type): - model_repr = instance[f"{model_type}_obj"] - if model_repr is None: - return instance - - serialization = instance["serialization"] - - if serialization == "pickle": - bytes_model = model_repr.encode("latin1") - model = pickle.loads(bytes_model) - else: - model = joblib.load(model_repr) - - instance[f"{model_type}_obj"] = model - return instance - - def apply(self, distributions): - result = {} - for node_name, [node_type, dist] in distributions.items(): - model_type = "regressor" if "Gaussian" in 
node_type else "classifier" - result[node_name] = {} - if "Conditional" in node_type: - result[node_name]["hybcprob"] = {} - for combination, dist_nested in dist["hybcprob"].items(): - instance_deserialized = self.deserialize_instance( - instance=dist_nested, - model_type=model_type, - ) - result[node_name]["hybcprob"][combination] = instance_deserialized - else: - instance_deserialized = self.deserialize_instance( - instance=dist, model_type=model_type - ) - result[node_name] = instance_deserialized - return result diff --git a/tests/BigbraveBNTest.py b/tests/BigbraveBNTest.py deleted file mode 100644 index d21d215..0000000 --- a/tests/BigbraveBNTest.py +++ /dev/null @@ -1,59 +0,0 @@ -import pandas as pd -from pgmpy.estimators import K2Score -from sklearn import preprocessing - -import bamt.preprocessors as pp -from bamt.networks.big_brave_bn import BigBraveBN -from bamt.networks.continuous_bn import ContinuousBN -from bamt.networks.discrete_bn import DiscreteBN - -data_discrete = pd.read_csv(r"../Data/benchmark/pigs.csv") -data_continuous = pd.read_csv(r"../Data/benchmark/arth150.csv") - -encoder = preprocessing.LabelEncoder() -discretizer = preprocessing.KBinsDiscretizer( - n_bins=5, encode="ordinal", strategy="uniform" -) - -p = pp.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) -discretized_data, est = p.apply(data_discrete) - -info = p.info - -space_restrictor = BigBraveBN() - -space_restrictor.set_possible_edges_by_brave(df=data_discrete) - -ps = space_restrictor.possible_edges - -bn_discrete = DiscreteBN() - -bn_discrete.add_nodes(descriptor=info) - -params = {"white_list": ps} -bn_discrete.add_edges(discretized_data, scoring_function=("K2", K2Score), params=params) - -encoder = preprocessing.LabelEncoder() -discretizer = preprocessing.KBinsDiscretizer( - n_bins=5, encode="ordinal", strategy="uniform" -) - -p = pp.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) -discretized_data, est = p.apply(data_continuous) - -info = p.info - -space_restrictor = BigBraveBN() - -space_restrictor.set_possible_edges_by_brave(df=data_continuous) - -ps = space_restrictor.possible_edges - -bn_continuous = ContinuousBN() - -bn_continuous.add_nodes(descriptor=info) - -params = {"white_list": ps} -bn_continuous.add_edges( - discretized_data, scoring_function=("K2", K2Score), params=params -) diff --git a/tests/LoadBN.py b/tests/LoadBN.py deleted file mode 100644 index 6335bb2..0000000 --- a/tests/LoadBN.py +++ /dev/null @@ -1,54 +0,0 @@ -import json - -import pandas as pd -from sklearn import preprocessing as pp - -import bamt.networks as Networks -from bamt.preprocessors import Preprocessor - -hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[ - [ - "Tectonic regime", - "Period", - "Lithology", - "Structural setting", - "Gross", - "Netpay", - "Porosity", - "Permeability", - "Depth", - ] -] - -encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform") - -p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) - -discretized_data, est = p.apply(hack_data) - -bn = Networks.HybridBN(use_mixture=True, has_logit=True) -info = p.info - -bn.add_nodes(info) - -structure = [ - ("Tectonic regime", "Structural setting"), - ("Gross", "Netpay"), - ("Lithology", "Permeability"), -] - -bn.set_structure(edges=structure) - -bn.get_info(as_df=False) - -with open("hack_p.json") as params: - params = json.load(params) - bn.set_parameters(params) - -bn.plot("gg3.html") - -bn2 = 
Networks.HybridBN(use_mixture=True, has_logit=True) -bn2.load("hack.json") - -print(bn2.sample(50)) diff --git a/tests/MainTest.py b/tests/MainTest.py deleted file mode 100644 index 4796fe6..0000000 --- a/tests/MainTest.py +++ /dev/null @@ -1,137 +0,0 @@ -import pandas as pd -from pgmpy.estimators import K2Score -from sklearn import preprocessing as pp - -import bamt.networks as Networks -from bamt.preprocessors import Preprocessor - -""" -Optional: -You can also uncomment print() that you need. -""" - -hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv") -cont_data = hack_data[["Gross", "Netpay", "Porosity", "Permeability", "Depth"]].dropna() -disc_data = hack_data[ - ["Tectonic regime", "Period", "Lithology", "Structural setting"] -].dropna() -hybrid_data = hack_data[ - [ - "Tectonic regime", - "Period", - "Lithology", - "Structural setting", - "Gross", - "Netpay", - "Porosity", - "Permeability", - "Depth", - ] -].dropna() - -cont_test_data = cont_data[cont_data.columns[:-1]] -cont_target = cont_data[cont_data.columns[-1]] -disc_test_data = disc_data[disc_data.columns[:-1]] -disc_target = disc_data[disc_data.columns[-1]] -hybrid_test_data = hybrid_data[hybrid_data.columns[:-1]] -hybrid_target = hybrid_data[hybrid_data.columns[-1]] - -encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform") -p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) - -# Discrete pipeline -discretized_data, _ = p.apply(disc_data) -disc_bn = Networks.DiscreteBN() -info = p.info -disc_bn.add_nodes(info) -disc_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) -disc_bn.fit_parameters(data=disc_data) -disc_bn.calculate_weights(discretized_data) -disc_predicted_values = disc_bn.predict(test=disc_test_data) -disc_predicted_values = pd.DataFrame.from_dict(disc_predicted_values, orient="columns") -synth_disc_data = disc_bn.sample(50) - -disc_bn.save("./disc_bn.json") -disc_bn2 = Networks.DiscreteBN() -disc_bn2.load("./disc_bn.json") -synth_disc_data2 = disc_bn2.sample(50) -# print(disc_bn.weights) -# print(disc_bn2.weights) -# print(disc_bn.get_info()) -# print(disc_bn2.get_info()) -# print(synth_disc_data) -# print(synth_disc_data2) - -# Continuous pipeline -discretized_data, _ = p.apply(cont_data) -cont_bn = Networks.ContinuousBN(use_mixture=True) -info = p.info -cont_bn.add_nodes(info) -cont_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) -cont_bn.fit_parameters(data=cont_data) -cont_bn.calculate_weights(discretized_data) -cont_predicted_values = cont_bn.predict(test=cont_test_data) -cont_predicted_values = pd.DataFrame.from_dict(cont_predicted_values, orient="columns") -synth_cont_data = cont_bn.sample(50) - -cont_bn.save("./cont_bn.json") -cont_bn2 = Networks.ContinuousBN(use_mixture=True) -cont_bn2.load("./cont_bn.json") -synth_cont_data2 = cont_bn2.sample(50) -# print(cont_bn.weights) -# print(cont_bn2.weights) -# print('RMSE on predicted values with continuous data: ' + -# f'{mse(cont_target, cont_predicted_values, squared=False)}') -# print(cont_bn.get_info()) -# print(cont_bn2.get_info()) -# print(synth_cont_data) -# print(synth_cont_data2) - -# Hybrid pipeline -discretized_data, _ = p.apply(hybrid_data) -hybrid_bn = Networks.HybridBN(use_mixture=True) -hybrid_bn2 = Networks.HybridBN(use_mixture=True, has_logit=True) -info = p.info -hybrid_bn.add_nodes(info) -hybrid_bn2.add_nodes(info) -hybrid_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) 
-hybrid_bn2.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) -hybrid_bn.fit_parameters(data=hybrid_data) -hybrid_bn2.fit_parameters(data=hybrid_data) -hybrid_bn.calculate_weights(discretized_data) -hybrid_bn2.calculate_weights(discretized_data) -hybrid_predicted_values = hybrid_bn.predict(test=hybrid_test_data) -hybrid_predicted_values = pd.DataFrame.from_dict( - hybrid_predicted_values, orient="columns" -) -synth_hybrid_data = hybrid_bn.sample(50) -synth_hybrid_data2 = hybrid_bn2.sample(50) - -hybrid_bn.save("./hybrid_bn.json") -hybrid_bn3 = Networks.HybridBN(use_mixture=True) -hybrid_bn3.load("./hybrid_bn.json") -synth_hybrid_data3 = hybrid_bn3.sample(50) -# print(hybrid_bn.weights) -# print(hybrid_bn2.weights) -# print(hybrid_bn3.weights) -# print('RMSE on predicted values with hybrid data: ' + -# f'{mse(hybrid_target, hybrid_predicted_values, squared=False)}') -# print(hybrid_bn.get_info()) -# print(hybrid_bn2.get_info()) -# print(hybrid_bn3.get_info()) -# print(synth_hybrid_data) -# print(synth_hybrid_data2) -# print(synth_hybrid_data3) - -# Save and load BN without weights -discretized_data, _ = p.apply(hybrid_data) -hybrid_bn = Networks.HybridBN(use_mixture=True) -info = p.info -hybrid_bn.add_nodes(info) -hybrid_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) -hybrid_bn.fit_parameters(data=hybrid_data) -hybrid_bn.save("./hybrid_bn_without_weights.json") -hybrid_bn2 = Networks.HybridBN(use_mixture=True) -hybrid_bn2.load("./hybrid_bn_without_weights.json") -# print(hybrid_bn2.weights) diff --git a/tests/MetricsTest.py b/tests/MetricsTest.py deleted file mode 100644 index 5f04deb..0000000 --- a/tests/MetricsTest.py +++ /dev/null @@ -1,61 +0,0 @@ -import time - -import pandas as pd -from sklearn import preprocessing as pp - -import bamt.networks as Networks -from bamt.preprocessors import Preprocessor - -start = time.time() - - -p1 = time.time() -print(f"Time elapsed for importing: {p1 - start}") - -h = pd.read_csv("data/real data/hack_processed_with_rf.csv") -cols = [ - "Tectonic regime", - "Period", - "Lithology", - "Structural setting", - "Gross", - "Netpay", - "Porosity", - "Permeability", - "Depth", -] -h = h[cols] - -print(h.describe()) -print("-----") -p2 = time.time() -print(f"Time elapsed for preparing data: {p2 - p1}") - -encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile") - -p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) - -# ----------- -discrete_data, est = p.apply(h) -info = p.info - -bn = Networks.HybridBN(has_logit=True) # all may vary -bn.add_nodes(descriptor=info) -bn.add_edges(data=discrete_data, optimizer="HC", scoring_function=("MI",)) - -bn.get_info(as_df=False) -t1 = time.time() -bn.fit_parameters(data=h) -t2 = time.time() -print(f"PL elapsed: {t2 - t1}") - -columns = ["Lithology", "Structural setting", "Porosity", "Depth"] -validY = h[columns].dropna() -validX = h.drop(columns, axis=1).dropna() - -time_1 = time.time() -pred_param = bn.predict(validX, parall_count=3) -time_2 = time.time() -print(pred_param) -print(f"Predict elapsed: {time_2 - time_1}") diff --git a/tests/NetworksTest.py b/tests/NetworksTest.py deleted file mode 100644 index 607bdf9..0000000 --- a/tests/NetworksTest.py +++ /dev/null @@ -1,722 +0,0 @@ -import itertools -import json -import time - -import numpy as np -import pandas as pd -from pandas.testing import assert_frame_equal -from sklearn import preprocessing as pp -from sklearn.ensemble import RandomForestClassifier 
-from sklearn.neighbors import KNeighborsClassifier -from sklearn.tree import DecisionTreeClassifier - -import bamt.networks as Networks -import bamt.nodes as Nodes -from bamt.preprocessors import Preprocessor - - -# import abc - - -class NetworkTest(object): - def __init__( - self, - directory: str, - verbose: bool = False, - case_id: int = 0, - sample_n: int = 500, - sample_tol: float = 0.6, - ): - """ - sample_n: number of rows in sample - sample_tol: precent of acceptable number of nans. - If number of nan more than sample_int * sample_tol, then sample test failed. - """ - self.bn = Networks.BaseNetwork() - self.sf = "" - self.type = "abstract" - self.case_id = case_id - self.sample_n = sample_n - self.sample_tol = sample_tol - self.verboseprint = print if verbose else lambda *a: None - self.directory = directory - - @staticmethod - def _tabularize_output(message1: str, message2: str): - # message1 - usually class of output (error, success) - # message2 - usually description of message 1 - return f"{message1: <52} | {message2: >3}" - - def test_preprocess(self): - failed = False - - if self.case_id == 0: - self.discrete_cols = [ - "Tectonic regime", - "Period", - "Lithology", - "Structural setting", - ] - self.cont_cols = ["Gross", "Netpay", "Porosity", "Permeability", "Depth"] - self.hybrid_cols = [ - "Tectonic regime", - "Period", - "Lithology", - "Structural setting", - "Gross", - "Netpay", - "Porosity", - "Permeability", - "Depth", - ] - # Base of standards - self.base = "hack_" + self.type - else: - self.discrete_cols = [] - self.cont_cols = [] - self.hybrid_cols = [] - - if self.type == "discrete": - data = pd.read_csv(self.directory)[self.discrete_cols] - elif self.type == "continuous": - data = pd.read_csv(self.directory)[self.cont_cols] - else: - data = pd.read_csv(self.directory)[self.hybrid_cols] - - encoder = pp.LabelEncoder() - discretizer = pp.KBinsDiscretizer( - n_bins=5, encode="ordinal", strategy="uniform" - ) - - p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) - - discretized_data, est = p.apply(data) - info = p.info - - try: - assert info == json.load(open(f"{self.base}/hack_info.json")) - except AssertionError: - failed = True - self.verboseprint(self._tabularize_output("ERROR", "Bad descriptor")) - - try: - assert_frame_equal( - discretized_data, - pd.read_csv(f"{self.base}/hack_data.csv", index_col=0), - check_dtype=False, - ) - except Exception as ex: - failed = True - self.verboseprint(self._tabularize_output("ERROR", str(ex))) - - if not failed: - status = "OK" - else: - status = "Failed" - print(self._tabularize_output("Preprocess", status)) - - self.info = info - self.data = discretized_data - - def test_structure_learning(self): - pass - - def test_parameters_learning(self): - pass - - def test_sampling(self): - failed = False - - sample = self.bn.sample(n=self.sample_n, progress_bar=False) - - if sample.empty: - self.verboseprint("Sampling", "Dataframe is empty") - return - - if sample.isna().sum().sum() > (self.sample_n * self.sample_tol): - failed = True - - if not failed: - status = "OK" - else: - status = "Failed" - - print(self._tabularize_output(f"Sampling ({self.sf})", status)) - - def test_predict(self): - failed = False - - if self.type == "discrete": - cols = self.discrete_cols - elif self.type == "continuous": - cols = self.cont_cols - elif self.type == "hybrid": - cols = self.hybrid_cols - else: - raise Exception("Inner error") - - preds = self.bn.predict( - test=pd.read_csv(self.directory)[cols[:2]].dropna(), - 
progress_bar=False, - parall_count=2, - ) - - # with open(f"{self.base}/hack_predict.json", "r") as f: - # p = json.load(f) - - if self.type == "continuous": - # cols: ['Porosity', 'Permeability', 'Depth'] - for node in preds.keys(): - right_val = json.load(open(f"{self.base}/hack_predict.json"))[self.sf][ - node - ] - test_val = np.mean([mx for mx in preds[node] if not np.isnan(mx)]) - assert np.all( - np.isclose(test_val, right_val, rtol=0.4) - ), f"Predict failed: {node, right_val, test_val}" - elif self.type == "discrete": - # cols: ['Lithology', 'Structural setting'] - for node in preds.keys(): - test_vals = pd.Series(preds[node]).value_counts().to_dict() - for category, right_val in json.load( - open(f"{self.base}/hack_predict.json") - )[self.sf][node].items(): - try: - assert np.all( - np.isclose(test_vals[category], right_val, atol=5) - ), f"Predict failed: {node, test_vals[category], right_val}" - except KeyError as ex: - print("Unknown preds category: ", ex.args[0]) - continue - elif self.type == "hybrid": - cont_nodes = [ - node - for node in self.bn.nodes_names - if self.info["types"][node] == "cont" - ] - for node in preds.keys(): - if node in cont_nodes: - right_val = json.load(open(f"{self.base}/hack_predict.json"))[ - self.sf - ][node] - test_val = np.mean([mx for mx in preds[node] if not np.isnan(mx)]) - # p[self.sf][node] = test_val - s = [right_val, test_val] - assert np.all( - np.isclose(min(s), max(s), atol=5, rtol=0.6) - ), f"Predict failed: {node, test_val, right_val}" - else: - test_vals = pd.Series(preds[node]).value_counts().to_dict() - # p[self.sf][node] = test_vals - for category, right_val in json.load( - open(f"{self.base}/hack_predict.json") - )[self.sf][node].items(): - try: - assert np.all( - np.isclose( - min(test_vals[category], right_val), - max(right_val, test_vals[category]), - atol=100, - rtol=0.5, - ) - ), f"Predict failed: {node, test_vals[category], right_val}" - except KeyError as ex: - print("Unknown preds category: ", ex.args[0]) - continue - - if not failed: - status = "OK" - else: - status = "Failed" - - print(self._tabularize_output(f"Predict ({self.sf})", status)) - - # with open(f"{self.base}/hack_predict.json", "w") as f: - # json.dump(p, f) - - def apply(self): - pass - - @staticmethod - def use_rules(*args, **kwargs): - for rule in args: - rule(**kwargs) - - -class TestDiscreteBN(NetworkTest): - def __init__(self, **kwargs): - super(TestDiscreteBN, self).__init__(**kwargs) - self.type = "discrete" - - def test_structure_learning(self): - failed = False - - bn = Networks.DiscreteBN() - bn.add_nodes(descriptor=self.info) - - try: - assert bn.nodes_names == [ - "Tectonic regime", - "Period", - "Lithology", - "Structural setting", - ] - except AssertionError: - failed = True - self.verboseprint( - self._tabularize_output( - "ERROR", "first stage failed (wrong init nodes)." 
- ) - ) - - bn.add_edges(self.data, (self.sf,), progress_bar=False) - - try: - assert bn.edges == json.load(open(f"{self.base}/hack_edges.json"))[self.sf] - except AssertionError: - failed = True - self.verboseprint(f"Stage 2 failed with {self.sf}.") - - if not failed: - self.bn = bn - status = "OK" - else: - self.bn = None - status = "Failed" - - print(self._tabularize_output(f"Structure ({self.sf})", status)) - - def test_parameters_learning(self): - failed = False - - self.bn.fit_parameters(pd.read_csv(self.directory)[self.discrete_cols]) - - try: - assert ( - self.bn.distributions - == json.load(open(f"{self.base}/hack_params.json"))[self.sf] - ) - except AssertionError: - failed = True - self.verboseprint( - self._tabularize_output(f"Parameters ({self.sf})", "bad distributions") - ) - - if not failed: - status = "OK" - else: - status = "Failed" - - print(self._tabularize_output(f"Parameters ({self.sf})", status)) - - def apply(self): - print(f"Executing {self.type} BN tests.") - self.test_preprocess() - t0 = time.time() - for sf in ["MI", "K2", "BIC"]: - self.sf = sf - t1 = time.time() - self.test_structure_learning() - if not self.bn: - print(self._tabularize_output(f"Error on {sf}", "No structure")) - print("-" * 8) - continue - self.test_parameters_learning() - self.test_sampling() - self.test_predict() - t2 = time.time() - print("-" * 8, f"Elapsed time: {t2 - t1}", "-" * 8, "\n") - t3 = time.time() - print(f"\nElapsed time: {t3 - t0}") - - -class TestContinuousBN(NetworkTest): - def __init__(self, **kwargs): - super(TestContinuousBN, self).__init__(**kwargs) - self.type = "continuous" - - def test_setters(self): - failed = False - - bn = Networks.ContinuousBN() - ns = [] - for d in [Nodes.GaussianNode(name="Node" + str(id)) for id in range(0, 4)]: - ns.append(d) - - bn.set_structure(nodes=ns) - bn.set_classifiers( - classifiers={ - "Node0": DecisionTreeClassifier(), - "Node1": RandomForestClassifier(), - "Node2": KNeighborsClassifier(n_neighbors=2), - } - ) - - assert [str(bn[node].classifier) for node in ["Node0", "Node1", "Node2"]] == [ - "DecisionTreeClassifier()", - "RandomForestClassifier()", - "KNeighborsClassifier(n_neighbors=2)", - ], "Setter | Classifiers are wrong." - - if not failed: - status = "OK" - else: - status = "Failed" - - print(self._tabularize_output("Setters", status)) - - def test_structure_learning(self, use_mixture: bool = False): - self.use_mixture = use_mixture - failed = False - - bn = Networks.ContinuousBN(use_mixture=use_mixture) - bn.add_nodes(descriptor=self.info) - - try: - assert ( - bn.nodes_names - == json.load(open(f"{self.base}/hack_nodes.json"))[ - f"use_mixture={use_mixture}" - ][self.sf] - ) - except AssertionError: - failed = True - self.verboseprint( - self._tabularize_output( - "ERROR", "first stage failed (wrong init nodes)." 
- ) - ) - - bn.add_edges(self.data, (self.sf,), progress_bar=False) - - try: - assert ( - bn.edges - == json.load(open(f"{self.base}/hack_edges.json"))[ - f"use_mixture={use_mixture}" - ][self.sf] - ) - except AssertionError: - failed = True - self.verboseprint(f"Stage 2 failed with {self.sf}.") - - if not failed: - self.bn = bn - status = "OK" - else: - status = "Failed" - - print( - self._tabularize_output( - f"Structure ({self.sf}, use_mixture={self.use_mixture})", status - ) - ) - - def test_parameters_learning(self): - failed = False - - self.bn.fit_parameters(pd.read_csv(self.directory)[self.cont_cols]) - try: - if self.use_mixture: - empty_data = {"mean": [], "covars": [], "coef": []} - for k, v in self.bn.distributions.items(): - assert all( - [v[obj] != empty for obj, empty in empty_data.items()] - ), f"Empty data in {k}." - assert ( - 0.9 <= sum(v["coef"]) <= 1.1 - ), f"{sum(v['coef'])} || {k}'s: coefs are wrong." - else: - assert ( - self.bn.distributions - == json.load(open(f"{self.base}/hack_params.json"))[ - "use_mixture=False" - ][self.sf] - ), "Bad distributions." - except AssertionError as ex: - failed = True - self.verboseprint( - self._tabularize_output( - f"Parameters ({self.sf}, use_mixture={self.use_mixture})", - ex.args[0], - ) - ) - - if not failed: - status = "OK" - else: - status = "Failed" - - print( - self._tabularize_output( - f"Parameters ({self.sf}, use_mixture={self.use_mixture})", status - ) - ) - - def apply(self): - print(f"Executing {self.type} BN tests.") - self.test_preprocess() - # Auskommentieren mich - # self._test_setters() - t0 = time.time() - for use_mixture in [True, False]: - for sf in ["MI", "K2", "BIC"]: - self.sf = sf - t1 = time.time() - self.test_structure_learning(use_mixture=use_mixture) - if not self.bn: - print(self._tabularize_output(f"Error on {sf}", "No structure")) - print("-" * 8) - continue - self.test_parameters_learning() - self.test_sampling() - self.test_predict() - t2 = time.time() - print("-" * 8, f"Elapsed time: {t2 - t1}", "-" * 8) - t3 = time.time() - print(f"\nElapsed time: {t3 - t0}") - - -class TestHybridBN(NetworkTest): - def __init__(self, **kwargs): - super(TestHybridBN, self).__init__(**kwargs) - self.type = "hybrid" - - def test_setters(self): - failed = False - - bn = Networks.HybridBN(has_logit=True) - ns = [] - for d, g in zip( - [Nodes.GaussianNode(name="Node" + str(id)) for id in range(0, 3)], - [Nodes.DiscreteNode(name="Node" + str(id)) for id in range(3, 6)], - ): - ns.append(d) - ns.append(g) - edges = [ - ("Node0", "Node3"), - ("Node3", "Node1"), - ("Node1", "Node4"), - ("Node4", "Node2"), - ("Node2", "Node5"), - ] - test_info = { - "types": { - "Node0": "cont", - "Node1": "cont", - "Node2": "cont", - "Node3": "disc", - "Node4": "disc", - "Node5": "disc", - }, - "signs": {"Node0": "pos", "Node1": "pos", "Node2": "pos"}, - } - - # Structure setter - bn.set_structure(info=test_info, nodes=ns, edges=edges) - - assert [ - "Gaussian (LinearRegression)", - "Logit (LogisticRegression)", - "ConditionalGaussian (LinearRegression)", - "Logit (LogisticRegression)", - "ConditionalGaussian (LinearRegression)", - "Logit (LogisticRegression)", - ] == [node.type for node in bn.nodes], "Setter | Nodes are not the same." - assert edges == bn.edges, "Setter | Edges are not the same." 
- - # Classifiers setters - - bn.set_classifiers( - classifiers={ - "Node3": DecisionTreeClassifier(), - "Node4": RandomForestClassifier(), - "Node5": KNeighborsClassifier(n_neighbors=2), - } - ) - - assert [str(bn[node].classifier) for node in ["Node3", "Node4", "Node5"]] == [ - "DecisionTreeClassifier()", - "RandomForestClassifier()", - "KNeighborsClassifier(n_neighbors=2)", - ], "Setter | Classifiers are wrong." - - # Parameters setters - - with open("test_params.json") as f: - p = json.load(f) - - bn.set_parameters(p) - assert bn.distributions == p, "Setter | Parameters are not the same." - - if not failed: - status = "OK" - else: - status = "Failed" - - print(self._tabularize_output("Setters", status)) - - def test_structure_learning( - self, use_mixture: bool = False, has_logit: bool = False - ): - self.use_mixture = use_mixture - self.has_logit = has_logit - failed = False - - bn = Networks.HybridBN(use_mixture=use_mixture, has_logit=has_logit) - bn.add_nodes(descriptor=self.info) - - try: - assert ( - bn.nodes_names - == json.load(open(f"{self.base}/hack_nodes.json"))[ - f"use_mixture={use_mixture}" - ][f"has_logit={has_logit}"][self.sf] - ) - except AssertionError: - failed = True - self.verboseprint( - self._tabularize_output( - "ERROR", "first stage failed (wrong init nodes)." - ) - ) - - bn.add_edges(self.data, (self.sf,), progress_bar=False) - - try: - assert ( - bn.edges - == json.load(open(f"{self.base}/hack_edges.json"))[ - f"use_mixture={use_mixture}" - ][f"has_logit={has_logit}"][self.sf] - ) - except AssertionError: - failed = True - self.verboseprint(f"Stage 2 failed with {self.sf}.") - - if not failed: - self.bn = bn - status = "OK" - else: - status = "Failed" - - print( - self._tabularize_output( - f"Structure ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", - status, - ) - ) - - @staticmethod - def non_empty_gaussian_nodes(name, node_params): - empty_data = {"mean": [], "covars": [], "coef": []} - assert all( - [node_params[obj] != empty for obj, empty in empty_data.items()] - ), f"Empty data in {name}." - - @staticmethod - def non_empty_logit_nodes(name, node_params): - empty_data = {"classes": [], "classifier_obj": None} - assert all( - [node_params[obj] != empty for obj, empty in empty_data.items()] - ), f"Empty data in {name}." - - @staticmethod - def sum_equals_to_1(name, node_params): - assert 0.9 <= sum(node_params["coef"]) <= 1.1, f"{name}'s: coefs are wrong." 
- - def _validate_node(self, name, type, node_params, true_vals): - try: - if type == "MixtureGaussian": - self.use_rules( - self.non_empty_gaussian_nodes, - self.sum_equals_to_1, - name=name, - node_params=node_params, - ) - elif type == "ConditionalMixtureGaussian": - for comb, data in node_params["hybcprob"].items(): - self.use_rules( - self.non_empty_gaussian_nodes, - self.sum_equals_to_1, - name=name, - node_params=data, - ) - elif type.startswith("Logit"): - self.use_rules( - self.non_empty_logit_nodes, name=name, node_params=node_params - ) - elif type.startswith("ConditionalLogit"): - for comb, data in node_params["hybcprob"].items(): - self.use_rules( - self.non_empty_logit_nodes, name=name, node_params=data - ) - else: - assert node_params == true_vals, f"Parameters error on {name}, {type}" - except AssertionError as ex: - self.verboseprint( - self._tabularize_output( - f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", - ex.args[0], - ) - ) - - def test_parameters_learning(self): - failed = False - - self.bn.fit_parameters(pd.read_csv(self.directory)[self.hybrid_cols]) - try: - true_params = json.load(open(f"{self.base}/hack_params.json"))[ - f"use_mixture={self.use_mixture}" - ][f"has_logit={self.has_logit}"][self.sf] - - node_type_dict = {node.name: node.type for node in self.bn.nodes} - for name, type in node_type_dict.items(): - node_params = self.bn.distributions[name] - self._validate_node(name, type, node_params, true_params[name]) - except AssertionError as ex: - failed = True - self.verboseprint( - self._tabularize_output( - f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", - ex.args[0], - ) - ) - - if not failed: - status = "OK" - else: - status = "Failed" - - print( - self._tabularize_output( - f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", - status, - ) - ) - - def apply(self): - print(f"Executing {self.type} BN tests.") - self.test_preprocess() - self.test_setters() - t0 = time.time() - - for use_mixture, has_logit in itertools.product([True, False], repeat=2): - for sf in ["MI", "K2", "BIC"]: - self.sf = sf - t1 = time.time() - self.test_structure_learning( - use_mixture=use_mixture, has_logit=has_logit - ) - self.test_parameters_learning() - if not self.bn: - print(self._tabularize_output(f"Error on {sf}", "No structure")) - print("-" * 8) - continue - self.test_sampling() - self.test_predict() - t2 = time.time() - print("-" * 8, f"Elapsed time: {t2 - t1}", "-" * 8) - - t3 = time.time() - print(f"\nElapsed time: {t3 - t0}") diff --git a/tests/SaveBN.py b/tests/SaveBN.py deleted file mode 100644 index 0a210ea..0000000 --- a/tests/SaveBN.py +++ /dev/null @@ -1,49 +0,0 @@ -import pandas as pd -from sklearn import preprocessing as pp - -import bamt.networks as Networks -from bamt.preprocessors import Preprocessor - -# import json - -hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[ - [ - "Tectonic regime", - "Period", - "Lithology", - "Structural setting", - "Gross", - "Netpay", - "Porosity", - "Permeability", - "Depth", - ] -] - -encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform") - -p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) - -discretized_data, est = p.apply(hack_data) - -bn = Networks.HybridBN(use_mixture=True, has_logit=True) -info = p.info - -bn.add_nodes(info) - -structure = [ - ("Tectonic regime", "Structural setting"), - ("Gross", "Netpay"), - 
("Lithology", "Permeability"), -] - -bn.set_structure(edges=structure) - -bn.get_info(as_df=False) - -bn.fit_parameters(data=hack_data) -print(bn.sample(4)) -bn.save_params("hack_p.json") -# bn.save_structure("hack_s.json") -bn.save("hack.json") diff --git a/tests/main.py b/tests/main.py deleted file mode 100644 index 37083e0..0000000 --- a/tests/main.py +++ /dev/null @@ -1,27 +0,0 @@ -import logging -import time -import traceback - -from NetworksTest import TestHybridBN, TestContinuousBN, TestDiscreteBN - -# Print only errors -logging.getLogger("preprocessor").setLevel(logging.ERROR) - -if __name__ == "__main__": - t0 = time.time() - dir = r"../data/real data/hack_processed_with_rf.csv" - - tests = [ - TestHybridBN(directory=dir), - TestDiscreteBN(directory=dir), - TestContinuousBN(directory=dir), - ] - - for test in tests: - try: - test.apply() - except Exception as ex: - traceback.print_exc() - continue - - print(f"Total time: {time.time() - t0}") diff --git a/tests/sendingClassifiersLogit.py b/tests/sendingClassifiersLogit.py deleted file mode 100644 index ce312f0..0000000 --- a/tests/sendingClassifiersLogit.py +++ /dev/null @@ -1,61 +0,0 @@ -# import json - -import pandas as pd -from sklearn import preprocessing as pp -from sklearn.ensemble import RandomForestClassifier -from sklearn.neighbors import KNeighborsClassifier -from sklearn.tree import DecisionTreeClassifier - -import bamt.networks as Nets -import bamt.preprocessors as preprocessors - -hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv")[ - [ - "Tectonic regime", - "Period", - "Lithology", - "Structural setting", - "Gross", - "Netpay", - "Porosity", - "Permeability", - "Depth", - ] -] - -encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile") - -p = preprocessors.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) - -discretized_data, est = p.apply(hack_data) - -bn = Nets.HybridBN(has_logit=True) -info = p.info - -# with open(r"C:\Users\Roman\Desktop\mymodels\mynet.json") as f: -# net_data = json.load(f) - -# bn.add_nodes(net_data["info"]) -# bn.set_structure(edges=net_data["edges"]) -# bn.set_parameters(net_data["parameters"]) -# -# print(bn.sample(10, models_dir=r"")) - -bn.add_nodes(info) - -bn.add_edges(discretized_data, scoring_function=("BIC",), progress_bar=False) -bn.set_classifiers( - classifiers={ - "Structural setting": DecisionTreeClassifier(), - "Lithology": RandomForestClassifier(), - "Period": KNeighborsClassifier(n_neighbors=2), - } -) - -bn.fit_parameters(hack_data) - -# bn.save("mynet.json") -print(bn.sample(500).shape) - -bn.get_info(as_df=False) diff --git a/tests/sendingRegressors.py b/tests/sendingRegressors.py deleted file mode 100644 index b0c79ae..0000000 --- a/tests/sendingRegressors.py +++ /dev/null @@ -1,62 +0,0 @@ -# import json - -import pandas as pd -from catboost import CatBoostRegressor -from sklearn import preprocessing as pp -from sklearn.ensemble import RandomForestRegressor - -# from sklearn.linear_model import ElasticNet -from sklearn.tree import DecisionTreeRegressor - -import bamt.preprocessors as preprocessors -from bamt.networks import HybridBN - -hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv")[ - [ - "Tectonic regime", - "Period", - "Lithology", - "Structural setting", - "Gross", - "Netpay", - "Porosity", - "Permeability", - "Depth", - ] -].dropna() - -encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile") - -p = 
preprocessors.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) - -discretized_data, est = p.apply(hack_data) - -bn = HybridBN(has_logit=True) -info = p.info - -# with open(r"C:\Users\Roman\Desktop\mymodels\mynet.json") as f: -# net_data = json.load(f) - -# bn.add_nodes(net_data["info"]) -# bn.set_structure(edges=net_data["edges"]) -# bn.set_parameters(net_data["parameters"]) - -bn.add_nodes(info) - -bn.add_edges(discretized_data, scoring_function=("BIC",), progress_bar=False) - -bn.set_regressor( - regressors={ - "Depth": CatBoostRegressor(logging_level="Silent", allow_writing_files=False), - "Gross": RandomForestRegressor(), - "Porosity": DecisionTreeRegressor(), - } -) - -bn.fit_parameters(hack_data) - -# bn.save("mynet") - -print(bn.sample(100).shape) -bn.get_info(as_df=False) diff --git a/tests/test_builders.py b/tests/test_builders.py deleted file mode 100644 index 530a2b4..0000000 --- a/tests/test_builders.py +++ /dev/null @@ -1,728 +0,0 @@ -import itertools -import logging -import unittest - -import pandas as pd - -from bamt.builders.builders_base import StructureBuilder, VerticesDefiner -from bamt.builders.evo_builder import EvoStructureBuilder -from bamt.builders.hc_builder import HillClimbDefiner -from bamt.nodes.discrete_node import DiscreteNode -from bamt.nodes.gaussian_node import GaussianNode -from bamt.utils.MathUtils import precision_recall - -logging.getLogger("builder").setLevel(logging.CRITICAL) - - -class TestStructureBuilder(unittest.TestCase): - def setUp(self): - self.data = pd.DataFrame(columns=["Node0", "Node1", "Node2"]) - self.descriptor = { - "types": {"Node0": "cont", "Node1": "disc", "Node2": "disc_num"}, - "signs": {"Node0": "pos"}, - } - self.SB = StructureBuilder(descriptor=self.descriptor) - - def test_restrict(self): - self.SB.has_logit = True - - self.SB.restrict(data=self.data, init_nodes=None, bl_add=None) - - self.assertEqual(self.SB.black_list, [], msg="Restrict wrong edges.") - - # --------- - self.SB.has_logit = False - - self.SB.restrict(data=self.data, init_nodes=None, bl_add=None) - - self.assertEqual( - self.SB.black_list, - [("Node0", "Node1"), ("Node0", "Node2")], - msg="Restricted edges are allowed.", - ) - - def test_get_family(self): - self.assertIsNone(self.SB.get_family()) - - self.SB.skeleton["V"] = [ - GaussianNode(name="Node0"), - DiscreteNode(name="Node1"), - DiscreteNode(name="Node2"), - ] - self.assertIsNone(self.SB.get_family()) - # Note that the method get_family is not supposed to be used by user (only developer), - # so we don't cover a case with restricted edges here (we did this in - # the previous test). 
- self.SB.skeleton["E"] = [ - ("Node1", "Node0"), - ("Node2", "Node1"), - ("Node2", "Node0"), - ] - self.SB.get_family() - - # Node: [[cont_parents], [disc_parents], [children]] - data = [ - [[], [], ["Node1", "Node0"]], - [[], ["Node2"], ["Node0"]], - [[], ["Node1", "Node2"], []], - ] - for node_nummer in range(3): - self.assertEqual( - self.SB.skeleton["V"][node_nummer].cont_parents, data[node_nummer][0] - ) - self.assertEqual( - self.SB.skeleton["V"][node_nummer].disc_parents, data[node_nummer][1] - ) - self.assertEqual( - self.SB.skeleton["V"][node_nummer].children, data[node_nummer][2] - ) - - -class TestVerticesDefiner(unittest.TestCase): - def setUp(self): - self.descriptor = { - "types": { - "Node0": "cont", - "Node1": "cont", - "Node2": "cont", - "Node3": "cont", - "Node4": "disc", - "Node5": "disc", - "Node6": "disc_num", - "Node7": "disc_num", - }, - "signs": {"Node0": "pos", "Node1": "neg"}, - } - - self.VD = VerticesDefiner(descriptor=self.descriptor, regressor=None) - - def test_first_level(self): - self.assertEqual( - self.VD.vertices, - [ - GaussianNode(name="Node0"), - GaussianNode(name="Node1"), - GaussianNode(name="Node2"), - GaussianNode(name="Node3"), - DiscreteNode(name="Node4"), - DiscreteNode(name="Node5"), - DiscreteNode(name="Node6"), - DiscreteNode(name="Node7"), - ], - ) - - def test_overwrite_vetrex(self): - self.assertEqual(self.VD.skeleton, {"V": [], "E": []}) - - def reload(): - self.VD.skeleton["V"] = self.VD.vertices - self.VD.skeleton["E"] = [ - ("Node0", "Node7"), - ("Node0", "Node1"), - ("Node0", "Node2"), - ("Node0", "Node5"), - ("Node4", "Node2"), - ("Node4", "Node5"), - ("Node4", "Node6"), - ("Node4", "Node3"), - ] - self.VD.get_family() - - data = { - "True, True": { - "Node0": "MixtureGaussian", - "Node4": "Discrete", - "Node7": "Logit (LogisticRegression)", - "Node1": "MixtureGaussian", - "Node2": "ConditionalMixtureGaussian", - "Node5": "ConditionalLogit (LogisticRegression)", - "Node6": "Discrete", - "Node3": "ConditionalMixtureGaussian", - }, - "True, False": { - "Node0": "MixtureGaussian", - "Node4": "Discrete", - "Node7": "Discrete", - "Node1": "MixtureGaussian", - "Node2": "ConditionalMixtureGaussian", - "Node5": "Discrete", - "Node6": "Discrete", - "Node3": "ConditionalMixtureGaussian", - }, - "False, True": { - "Node0": "Gaussian (LinearRegression)", - "Node4": "Discrete", - "Node7": "Logit (LogisticRegression)", - "Node1": "Gaussian (LinearRegression)", - "Node2": "ConditionalGaussian (LinearRegression)", - "Node5": "ConditionalLogit (LogisticRegression)", - "Node6": "Discrete", - "Node3": "ConditionalGaussian (LinearRegression)", - }, - "False, False": { - "Node0": "Gaussian (LinearRegression)", - "Node4": "Discrete", - "Node7": "Discrete", - "Node1": "Gaussian (LinearRegression)", - "Node2": "ConditionalGaussian (LinearRegression)", - "Node5": "Discrete", - "Node6": "Discrete", - "Node3": "ConditionalGaussian (LinearRegression)", - }, - } - - for use_mixture, has_logit in itertools.product([True, False], repeat=2): - reload() - self.VD.overwrite_vertex( - has_logit=has_logit, - use_mixture=use_mixture, - classifier=None, - regressor=None, - ) - self.assertEqual( - {node.name: node.type for node in self.VD.skeleton["V"]}, - data[f"{use_mixture}, {has_logit}"], - msg=f"failed on use_mixture={use_mixture} and has_logit={has_logit}", - ) - - -class TestHillClimbDefiner(unittest.TestCase): - def setUp(self): - self.descriptor = { - "signs": { - "Depth": "pos", - "Gross": "pos", - "Netpay": "pos", - "Permeability": "pos", - "Porosity": "pos", 
- }, - "types": { - "Depth": "cont", - "Gross": "cont", - "Lithology": "disc", - "Netpay": "cont", - "Period": "disc", - "Permeability": "cont", - "Porosity": "cont", - "Structural setting": "disc", - "Tectonic regime": "disc", - }, - } - self.data = { - "Tectonic regime": [ - 0, - 1, - 4, - 4, - 0, - 2, - 0, - 0, - 0, - 0, - 3, - 1, - 0, - 3, - 0, - 1, - 4, - 0, - 4, - 3, - 4, - 0, - 1, - 1, - 1, - 0, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 3, - 2, - 3, - 2, - 3, - 3, - 3, - 0, - ], - "Period": [ - 3, - 1, - 4, - 4, - 1, - 1, - 0, - 0, - 3, - 5, - 3, - 9, - 0, - 5, - 0, - 3, - 5, - 3, - 2, - 4, - 4, - 1, - 5, - 7, - 7, - 7, - 1, - 1, - 1, - 1, - 4, - 6, - 8, - 4, - 4, - 5, - 4, - 7, - 5, - 5, - 0, - ], - "Lithology": [ - 2, - 4, - 6, - 4, - 2, - 2, - 2, - 2, - 4, - 4, - 4, - 4, - 1, - 4, - 1, - 4, - 4, - 4, - 5, - 3, - 2, - 2, - 2, - 4, - 1, - 1, - 3, - 4, - 4, - 4, - 4, - 2, - 0, - 3, - 4, - 4, - 4, - 4, - 4, - 4, - 2, - ], - "Structural setting": [ - 2, - 6, - 10, - 10, - 7, - 5, - 8, - 8, - 2, - 2, - 6, - 6, - 3, - 7, - 3, - 6, - 10, - 9, - 3, - 0, - 0, - 7, - 6, - 6, - 6, - 7, - 6, - 6, - 6, - 6, - 8, - 2, - 9, - 4, - 7, - 6, - 1, - 8, - 4, - 4, - 3, - ], - "Gross": [ - 1, - 3, - 1, - 3, - 1, - 0, - 2, - 3, - 0, - 4, - 4, - 4, - 0, - 3, - 0, - 0, - 3, - 4, - 0, - 4, - 3, - 2, - 2, - 4, - 0, - 4, - 1, - 2, - 2, - 4, - 2, - 4, - 3, - 1, - 1, - 1, - 2, - 3, - 0, - 2, - 1, - ], - "Netpay": [ - 3, - 2, - 1, - 4, - 2, - 0, - 2, - 2, - 1, - 4, - 3, - 4, - 0, - 3, - 1, - 1, - 0, - 4, - 1, - 3, - 4, - 3, - 3, - 4, - 0, - 4, - 0, - 1, - 2, - 4, - 2, - 3, - 2, - 1, - 2, - 0, - 2, - 4, - 1, - 3, - 0, - ], - "Porosity": [ - 3, - 0, - 4, - 3, - 3, - 1, - 0, - 0, - 3, - 0, - 2, - 1, - 2, - 3, - 0, - 2, - 3, - 0, - 0, - 4, - 2, - 4, - 2, - 2, - 1, - 1, - 1, - 3, - 3, - 2, - 4, - 3, - 1, - 4, - 4, - 4, - 3, - 1, - 4, - 4, - 0, - ], - "Permeability": [ - 4, - 0, - 3, - 3, - 2, - 1, - 1, - 1, - 1, - 0, - 4, - 4, - 1, - 3, - 1, - 4, - 3, - 0, - 0, - 3, - 0, - 1, - 2, - 0, - 2, - 2, - 1, - 2, - 3, - 4, - 3, - 2, - 2, - 2, - 4, - 4, - 3, - 0, - 4, - 4, - 0, - ], - "Depth": [ - 1, - 4, - 3, - 4, - 1, - 3, - 1, - 3, - 1, - 4, - 3, - 4, - 1, - 2, - 1, - 4, - 0, - 4, - 0, - 0, - 3, - 2, - 3, - 2, - 2, - 3, - 4, - 2, - 2, - 4, - 1, - 0, - 2, - 0, - 4, - 0, - 1, - 2, - 0, - 0, - 3, - ], - } - - def test_apply_K2(self): - hcd = HillClimbDefiner( - data=pd.DataFrame(self.data), - descriptor=self.descriptor, - scoring_function=("K2",), - ) - - hcd.apply_K2( - data=pd.DataFrame(self.data), - init_edges=None, - progress_bar=False, - remove_init_edges=False, - white_list=None, - ) - - right_edges = [ - ["Tectonic regime", "Structural setting"], - ["Tectonic regime", "Depth"], - ["Tectonic regime", "Netpay"], - ["Period", "Porosity"], - ["Period", "Tectonic regime"], - ["Period", "Netpay"], - ["Lithology", "Permeability"], - ["Lithology", "Period"], - ["Lithology", "Tectonic regime"], - ["Structural setting", "Netpay"], - ["Netpay", "Gross"], - ["Porosity", "Permeability"], - ["Porosity", "Depth"], - ["Porosity", "Netpay"], - ["Permeability", "Netpay"], - ] - - self.assertEqual(hcd.skeleton["E"], right_edges) - - def test_apply_group1(self): - hcd = HillClimbDefiner( - data=pd.DataFrame(self.data), - descriptor=self.descriptor, - scoring_function=("MI",), - ) - - hcd.restrict(data=pd.DataFrame(self.data), bl_add=None, init_nodes=None) - hcd.apply_group1( - data=pd.DataFrame(self.data), - progress_bar=False, - init_edges=None, - remove_init_edges=False, - white_list=None, - ) - - right_edges = [ - ["Lithology", "Depth"], - 
["Period", "Gross"], - ["Netpay", "Gross"], - ["Period", "Netpay"], - ["Depth", "Period"], - ["Depth", "Permeability"], - ["Netpay", "Permeability"], - ["Period", "Porosity"], - ["Netpay", "Porosity"], - ["Permeability", "Structural setting"], - ["Netpay", "Structural setting"], - ["Period", "Tectonic regime"], - ["Netpay", "Tectonic regime"], - ] - - self.assertEqual(hcd.skeleton["E"], right_edges) - - -class TestEvoStructureBuilder(unittest.TestCase): - def setUp(self): - self.data = pd.read_csv(r"data/benchmark/asia.csv", index_col=0) - self.descriptor = { - "types": { - "asia": "disc", - "tub": "disc", - "smoke": "disc", - "lung": "disc", - "bronc": "disc", - "either": "disc", - "xray": "disc", - "dysp": "disc", - }, - "signs": {}, - } - self.evo_builder = EvoStructureBuilder( - data=self.data, - descriptor=self.descriptor, - regressor=None, - has_logit=True, - use_mixture=True, - ) - # Replace this with your actual reference DAG - self.reference_dag = [ - ("asia", "tub"), - ("tub", "either"), - ("smoke", "lung"), - ("smoke", "bronc"), - ("lung", "either"), - ("bronc", "dysp"), - ("either", "xray"), - ("either", "dysp"), - ] - - def test_build(self): - # placeholder kwargs - kwargs = {} - self.evo_builder.build( - data=self.data, classifier=None, regressor=None, verbose=False, **kwargs - ) - - obtained_dag = self.evo_builder.skeleton["E"] - num_edges = len(obtained_dag) - self.assertGreaterEqual( - num_edges, 1, msg="Obtained graph should have at least one edge." - ) - - dist = precision_recall(obtained_dag, self.reference_dag)["SHD"] - self.assertLess( - dist, - 15, - msg=f"Structural Hamming Distance should be less than 15, obtained SHD = {dist}", - ) - - -if __name__ == "__main__": - unittest.main(verbosity=2) diff --git a/tests/test_graph_analyzer.py b/tests/test_graph_analyzer.py deleted file mode 100644 index a87bcb2..0000000 --- a/tests/test_graph_analyzer.py +++ /dev/null @@ -1,98 +0,0 @@ -import logging -import unittest - -from bamt.builders.builders_base import VerticesDefiner -from bamt.networks.discrete_bn import DiscreteBN -from bamt.nodes.discrete_node import DiscreteNode -from bamt.utils import GraphUtils - -logging.getLogger("builder").setLevel(logging.CRITICAL) - - -class TestGraphAnalyzer(unittest.TestCase): - def setUp(self): - self.bn = DiscreteBN() - definer = VerticesDefiner( - descriptor={ - "types": { - "Node0": "disc", - "Node1": "disc", - "Node2": "disc", - "Node3": "disc_num", - "Node4": "disc", - "Node5": "disc", - "Node6": "disc", - "Node7": "disc", - "Node8": "disc", - "Node9": "disc", - }, - "signs": {}, - }, - regressor=None, - ) - - definer.skeleton["V"] = [DiscreteNode(name=f"Node{i}") for i in range(10)] - definer.skeleton["E"] = [ - ("Node0", "Node1"), - ("Node0", "Node2"), - ("Node2", "Node3"), - ("Node4", "Node7"), - ("Node1", "Node5"), - ("Node5", "Node6"), - ("Node7", "Node0"), - ("Node8", "Node1"), - ("Node9", "Node2"), - ] - definer.get_family() - - self.bn.nodes = definer.skeleton["V"] - self.bn.edges = definer.skeleton["E"] - - self.analyzer = GraphUtils.GraphAnalyzer(self.bn) - - def test_markov_blanket(self): - result = self.analyzer.markov_blanket("Node0") - result["nodes"] = sorted(result["nodes"]) - self.assertEqual( - { - "edges": [ - ("Node0", "Node1"), - ("Node0", "Node2"), - ("Node7", "Node0"), - ("Node8", "Node1"), - ("Node9", "Node2"), - ], - "nodes": sorted(["Node0", "Node1", "Node2", "Node7", "Node8", "Node9"]), - }, - result, - ) - - def test_find_family(self): - without_parents = self.analyzer.find_family("Node0", 0, 2, None) - 
without_parents["nodes"] = sorted(without_parents["nodes"]) - self.assertEqual( - { - "nodes": sorted(["Node3", "Node2", "Node1", "Node0", "Node5"]), - "edges": [ - ("Node0", "Node1"), - ("Node0", "Node2"), - ("Node2", "Node3"), - ("Node1", "Node5"), - ], - }, - without_parents, - ) - - without_children = self.analyzer.find_family("Node0", 2, 0, None) - without_children["nodes"] = sorted(without_children["nodes"]) - self.assertEqual( - { - "nodes": sorted(["Node4", "Node7", "Node0"]), - "edges": [("Node4", "Node7"), ("Node7", "Node0")], - }, - without_children, - ) - - -if __name__ == "__main__": - unittest.main(verbosity=2) diff --git a/tests/test_networks.py b/tests/test_networks.py deleted file mode 100644 index b2df174..0000000 --- a/tests/test_networks.py +++ /dev/null @@ -1,1192 +0,0 @@ -import json -import logging -import pathlib as pl -import unittest - -import pandas as pd -from catboost import CatBoostRegressor -from sklearn import preprocessing as pp -from sklearn.ensemble import RandomForestRegressor -from sklearn.tree import DecisionTreeRegressor - -import bamt.preprocessors as bp -from bamt.networks.composite_bn import CompositeBN -from bamt.networks.hybrid_bn import BaseNetwork, HybridBN -from bamt.nodes.discrete_node import DiscreteNode -from bamt.nodes.gaussian_node import GaussianNode -from bamt.utils.MathUtils import precision_recall -from bamt.utils.composite_utils.CompositeGeneticOperators import ( - custom_mutation_add_model, - custom_crossover_all_model, -) -from bamt.utils.composite_utils.CompositeModel import CompositeModel, CompositeNode - -logging.getLogger("network").setLevel(logging.CRITICAL) - - -class TestCaseBase(unittest.TestCase): - def assertIsFile(self, path): - if not pl.Path(path).resolve().is_file(): - raise AssertionError("File does not exist: %s" % str(path)) - - def assertIsDir(self, path): - if not pl.Path(path).resolve().is_dir(): - raise AssertionError("Direction does not exist: %s" % str(path)) - - def prepare_bn_and_data(self): - # prepare bn where models were set by set_model - hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[ - [ - "Tectonic regime", - "Period", - "Lithology", - "Structural setting", - "Gross", - "Netpay", - "Porosity", - "Permeability", - "Depth", - ] - ] - - encoder = pp.LabelEncoder() - discretizer = pp.KBinsDiscretizer( - n_bins=5, encode="ordinal", strategy="quantile" - ) - - p = bp.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) - - hack_data.dropna(inplace=True) - hack_data.reset_index(inplace=True, drop=True) - - discretized_data, est = p.apply(hack_data) - - self.bn = HybridBN(has_logit=True) - info = p.info - - self.bn.add_nodes(info) - - self.bn.add_edges( - discretized_data, scoring_function=("BIC",), progress_bar=False - ) - - self.bn.set_regressor( - regressors={ - "Depth": CatBoostRegressor( - logging_level="Silent", allow_writing_files=False - ), - "Gross": RandomForestRegressor(), - "Porosity": DecisionTreeRegressor(), - } - ) - return hack_data - - -class TestBaseNetwork(TestCaseBase): - def setUp(self): - self.bn = BaseNetwork() - - self.nodes = [ - GaussianNode(name="Node0"), - DiscreteNode(name="Node1"), - GaussianNode(name="Node2"), - ] - - self.edges = [("Node0", "Node1"), ("Node1", "Node2")] - self.descriptor = { - "types": {"Node0": "cont", "Node1": "disc", "Node2": "cont"}, - "signs": {"Node0": "pos", "Node1": "neg"}, - } - - def test_validate(self): - descriptor_t = {"types": {"Node0": "Abstract", "Node1": "Abstract"}} - descriptor_f = {"types": {"Node0": 
"Abstract", "Node1": "cont"}} - - self.assertFalse(self.bn.validate(descriptor_f)) - self.assertTrue(self.bn.validate(descriptor_t)) - - def test_update_descriptor(self): - self.bn.descriptor = self.descriptor - # Nodes out - self.bn.nodes = [GaussianNode(name="Node0")] - self.bn.update_descriptor() - self.assertEqual({"Node0": "cont"}, self.bn.descriptor["types"]) - - # It uses only Vertices Definer, test of this is in builders tests. - def test_add_nodes(self): - pass - - def test_add_edges(self): - # It uses builders - pass - - def test_calculate_weights(self): - pass - - def test_set_nodes(self): - class MyNode: - def __init__(self, name): - self.name = name - - # set without mapping - self.assertIsNone(self.bn.set_nodes(nodes=[GaussianNode(name="Node0")])) - - map = { - "types": {"Node0": "cont", "Node1": "disc", "Node2": "cont"}, - "signs": {}, - } - - self.bn.set_nodes(nodes=self.nodes, info=map) - self.assertEqual(self.bn.nodes, self.nodes) - - self.bn.set_nodes( - nodes=[MyNode(name="Node-1"), MyNode("Node-2")], - info={"types": {"Node-1": "cont", "Node-2": "disc"}, "signs": {}}, - ) - self.assertEqual(self.bn.nodes, []) - - def test_set_edges(self): - self.edges.extend([(0, 1), (pd.NA, "1")]) - - self.bn.has_logit = False - self.bn.set_nodes(nodes=self.nodes, info=self.descriptor) - self.bn.set_edges(edges=self.edges) - - self.assertEqual([("Node1", "Node2")], self.bn.edges) - - # The test consists of 2 previous methods that are tested, - # plus methods of builders, they are tested as well. - def test_set_structure(self): - pass - - # Node testing. - def test_param_validation(self): - pass - - def test_set_parameters(self): - pass - - def test_save_params(self): - self.bn.distributions = {"AAA": "BBB"} - with self.assertRaises(TypeError): - self.bn.save_params("out.txt") - - self.assertTrue(self.bn.save_params("out.json")) - - self.assertIsFile("out.json") - - self.assertEqual(json.load(open("out.json")), self.bn.distributions) - pl.Path("out.json").unlink() - - def test_save_structure(self): - self.bn.edges = self.edges - - with self.assertRaises(TypeError): - self.bn.save_structure("out.txt") - - self.assertTrue(self.bn.save_structure("out.json")) - self.assertIsFile("out.json") - f = json.load(open("out.json")) - for i in range(len(f)): - self.assertEqual(list(self.bn.edges[i]), f[i]) - pl.Path("out.json").unlink() - - # Covers by prev tests. - def test_save(self): - pass - - def test_fit_parameters(self): - """ - General test, the full one is in the tests of each node. - It is frozen for a while. 
- """ - pass - # bn = BaseNetwork() - # shape = 500 - # data = pd.DataFrame({"Node0": np.random.poisson(5, shape), - # "Node1": np.random.choice(["cat1", "cat2", "cat3"], shape), - # "Node2": np.random.normal(1.5, 4, shape)}) - # - # - # bn.set_structure(info=self.descriptor, nodes=self.nodes, edges=self.edges) - # bn.get_info(as_df=False) - - def test_joblib_pathsave(self): - hack_data = self.prepare_bn_and_data() - self.bn.fit_parameters(hack_data) - - self.assertGreater( - self.bn.sample(100, progress_bar=False).size, 0, "Sampling is broken" - ) - - combination_package = self.bn.distributions["Gross"]["hybcprob"][ - "['COMPRESSION']" - ] - regressor_obj = combination_package["regressor_obj"] - - if combination_package["serialization"] == "joblib": - self.assertIsFile(regressor_obj) - - def test_sample(self): - data = { - "Tectonic regime": [ - 0, - 1, - 4, - 4, - 0, - 2, - 0, - 0, - 0, - 0, - 3, - 1, - 0, - 3, - 0, - 1, - 4, - 0, - 4, - 3, - 4, - 0, - 1, - 1, - 1, - 0, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 3, - 2, - 3, - 2, - 3, - 3, - 3, - 0, - ], - "Period": [ - 3, - 1, - 4, - 4, - 1, - 1, - 0, - 0, - 3, - 5, - 3, - 9, - 0, - 5, - 0, - 3, - 5, - 3, - 2, - 4, - 4, - 1, - 5, - 7, - 7, - 7, - 1, - 1, - 1, - 1, - 4, - 6, - 8, - 4, - 4, - 5, - 4, - 7, - 5, - 5, - 0, - ], - "Lithology": [ - 2, - 4, - 6, - 4, - 2, - 2, - 2, - 2, - 4, - 4, - 4, - 4, - 1, - 4, - 1, - 4, - 4, - 4, - 5, - 3, - 2, - 2, - 2, - 4, - 1, - 1, - 3, - 4, - 4, - 4, - 4, - 2, - 0, - 3, - 4, - 4, - 4, - 4, - 4, - 4, - 2, - ], - "Structural setting": [ - 2, - 6, - 10, - 10, - 7, - 5, - 8, - 8, - 2, - 2, - 6, - 6, - 3, - 7, - 3, - 6, - 10, - 9, - 3, - 0, - 0, - 7, - 6, - 6, - 6, - 7, - 6, - 6, - 6, - 6, - 8, - 2, - 9, - 4, - 7, - 6, - 1, - 8, - 4, - 4, - 3, - ], - "Gross": [ - 1, - 3, - 1, - 3, - 1, - 0, - 2, - 3, - 0, - 4, - 4, - 4, - 0, - 3, - 0, - 0, - 3, - 4, - 0, - 4, - 3, - 2, - 2, - 4, - 0, - 4, - 1, - 2, - 2, - 4, - 2, - 4, - 3, - 1, - 1, - 1, - 2, - 3, - 0, - 2, - 1, - ], - "Netpay": [ - 3, - 2, - 1, - 4, - 2, - 0, - 2, - 2, - 1, - 4, - 3, - 4, - 0, - 3, - 1, - 1, - 0, - 4, - 1, - 3, - 4, - 3, - 3, - 4, - 0, - 4, - 0, - 1, - 2, - 4, - 2, - 3, - 2, - 1, - 2, - 0, - 2, - 4, - 1, - 3, - 0, - ], - "Porosity": [ - 3, - 0, - 4, - 3, - 3, - 1, - 0, - 0, - 3, - 0, - 2, - 1, - 2, - 3, - 0, - 2, - 3, - 0, - 0, - 4, - 2, - 4, - 2, - 2, - 1, - 1, - 1, - 3, - 3, - 2, - 4, - 3, - 1, - 4, - 4, - 4, - 3, - 1, - 4, - 4, - 0, - ], - "Permeability": [ - 4, - 0, - 3, - 3, - 2, - 1, - 1, - 1, - 1, - 0, - 4, - 4, - 1, - 3, - 1, - 4, - 3, - 0, - 0, - 3, - 0, - 1, - 2, - 0, - 2, - 2, - 1, - 2, - 3, - 4, - 3, - 2, - 2, - 2, - 4, - 4, - 3, - 0, - 4, - 4, - 0, - ], - "Depth": [ - 1, - 4, - 3, - 4, - 1, - 3, - 1, - 3, - 1, - 4, - 3, - 4, - 1, - 2, - 1, - 4, - 0, - 4, - 0, - 0, - 3, - 2, - 3, - 2, - 2, - 3, - 4, - 2, - 2, - 4, - 1, - 0, - 2, - 0, - 4, - 0, - 1, - 2, - 0, - 0, - 3, - ], - } - nodes = [ - DiscreteNode(name="Tectonic regime"), - DiscreteNode(name="Period"), - DiscreteNode(name="Lithology"), - DiscreteNode(name="Structural setting"), - DiscreteNode(name="Gross"), - DiscreteNode(name="Netpay"), - DiscreteNode(name="Porosity"), - DiscreteNode(name="Permeability"), - DiscreteNode(name="Depth"), - ] - - self.bn.set_nodes(nodes, info={"types": {k.name: "disc" for k in nodes}}) - self.bn.set_edges( - [ - ["Tectonic regime", "Period"], - ["Structural setting", "Period"], - ["Tectonic regime", "Lithology"], - ["Lithology", "Structural setting"], - ] - ) - self.bn.fit_parameters(pd.DataFrame.from_records(data)) - 
self.assertIsNotNone(self.bn.sample(50, as_df=False, progress_bar=False)) - - def test_predict(self): - seq = { - "Tectonic regime": [ - 0, - 1, - 4, - 4, - 0, - 2, - 0, - 0, - 0, - 0, - 3, - 1, - 0, - 3, - 0, - 1, - 4, - 0, - 4, - 3, - 4, - 0, - 1, - 1, - 1, - 0, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 3, - 2, - 3, - 2, - 3, - 3, - 3, - 0, - ], - "Period": [ - 3, - 1, - 4, - 4, - 1, - 1, - 0, - 0, - 3, - 5, - 3, - 9, - 0, - 5, - 0, - 3, - 5, - 3, - 2, - 4, - 4, - 1, - 5, - 7, - 7, - 7, - 1, - 1, - 1, - 1, - 4, - 6, - 8, - 4, - 4, - 5, - 4, - 7, - 5, - 5, - 0, - ], - "Lithology": [ - 2, - 4, - 6, - 4, - 2, - 2, - 2, - 2, - 4, - 4, - 4, - 4, - 1, - 4, - 1, - 4, - 4, - 4, - 5, - 3, - 2, - 2, - 2, - 4, - 1, - 1, - 3, - 4, - 4, - 4, - 4, - 2, - 0, - 3, - 4, - 4, - 4, - 4, - 4, - 4, - 2, - ], - "Structural setting": [ - 2, - 6, - 10, - 10, - 7, - 5, - 8, - 8, - 2, - 2, - 6, - 6, - 3, - 7, - 3, - 6, - 10, - 9, - 3, - 0, - 0, - 7, - 6, - 6, - 6, - 7, - 6, - 6, - 6, - 6, - 8, - 2, - 9, - 4, - 7, - 6, - 1, - 8, - 4, - 4, - 3, - ], - "Gross": [ - 1, - 3, - 1, - 3, - 1, - 0, - 2, - 3, - 0, - 4, - 4, - 4, - 0, - 3, - 0, - 0, - 3, - 4, - 0, - 4, - 3, - 2, - 2, - 4, - 0, - 4, - 1, - 2, - 2, - 4, - 2, - 4, - 3, - 1, - 1, - 1, - 2, - 3, - 0, - 2, - 1, - ], - "Netpay": [ - 3, - 2, - 1, - 4, - 2, - 0, - 2, - 2, - 1, - 4, - 3, - 4, - 0, - 3, - 1, - 1, - 0, - 4, - 1, - 3, - 4, - 3, - 3, - 4, - 0, - 4, - 0, - 1, - 2, - 4, - 2, - 3, - 2, - 1, - 2, - 0, - 2, - 4, - 1, - 3, - 0, - ], - "Porosity": [ - 3, - 0, - 4, - 3, - 3, - 1, - 0, - 0, - 3, - 0, - 2, - 1, - 2, - 3, - 0, - 2, - 3, - 0, - 0, - 4, - 2, - 4, - 2, - 2, - 1, - 1, - 1, - 3, - 3, - 2, - 4, - 3, - 1, - 4, - 4, - 4, - 3, - 1, - 4, - 4, - 0, - ], - "Permeability": [ - 4, - 0, - 3, - 3, - 2, - 1, - 1, - 1, - 1, - 0, - 4, - 4, - 1, - 3, - 1, - 4, - 3, - 0, - 0, - 3, - 0, - 1, - 2, - 0, - 2, - 2, - 1, - 2, - 3, - 4, - 3, - 2, - 2, - 2, - 4, - 4, - 3, - 0, - 4, - 4, - 0, - ], - "Depth": [ - 1, - 4, - 3, - 4, - 1, - 3, - 1, - 3, - 1, - 4, - 3, - 4, - 1, - 2, - 1, - 4, - 0, - 4, - 0, - 0, - 3, - 2, - 3, - 2, - 2, - 3, - 4, - 2, - 2, - 4, - 1, - 0, - 2, - 0, - 4, - 0, - 1, - 2, - 0, - 0, - 3, - ], - } - data = pd.DataFrame.from_records(seq) - nodes = [ - DiscreteNode(name="Tectonic regime"), - DiscreteNode(name="Period"), - DiscreteNode(name="Lithology"), - DiscreteNode(name="Structural setting"), - ] - - self.bn.set_nodes(nodes, info={"types": {k.name: "disc" for k in nodes}}) - self.bn.set_edges( - [ - ["Tectonic regime", "Period"], - ["Structural setting", "Period"], - ["Tectonic regime", "Lithology"], - ["Lithology", "Structural setting"], - ] - ) - self.bn.fit_parameters(data) - - result = self.bn.predict(data.iloc[:, :3], parall_count=2, progress_bar=False) - self.assertNotEqual(result, {}) - - for v in result.values(): - for item in v: - self.assertFalse(pd.isna(item)) - - -class TestBigBraveBN(unittest.SkipTest): - pass - - -class TestCompositeNetwork(unittest.TestCase): - def setUp(self): - self.data = pd.read_csv(r"data/benchmark/healthcare.csv", index_col=0) - self.descriptor = { - "types": { - "A": "disc", - "C": "disc", - "D": "cont", - "H": "disc", - "I": "cont", - "O": "cont", - "T": "cont", - }, - "signs": {"D": "pos", "I": "neg", "O": "pos", "T": "pos"}, - } - self.reference_dag = [ - ("A", "C"), - ("A", "D"), - ("A", "H"), - ("A", "O"), - ("C", "I"), - ("D", "I"), - ("H", "D"), - ("I", "T"), - ("O", "T"), - ] - - self.comparative_dag = [("I", "T"), ("O", "T")] - - def test_learning(self): - bn, _ = self._get_starter_bn(self.data) - - 
bn.add_edges(self.data, verbose=False) - - bn.fit_parameters(self.data) - - obtained_dag = bn.edges - num_edges = len(obtained_dag) - self.assertGreaterEqual( - num_edges, 1, msg="Obtained graph should have at least one edge." - ) - - dist = precision_recall(obtained_dag, self.reference_dag)["SHD"] - self.assertLess( - dist, - 25, - msg=f"Structural Hamming Distance should be less than 15, obtained SHD = {dist}", - ) - - for node in bn.nodes: - if type(node).__name__ == "CompositeContinuousNode": - self.assertIsNotNone( - node.regressor, - msg="CompositeContinuousNode does not have regressor", - ) - if type(node).__name__ == "CompositeDiscreteNode": - self.assertIsNotNone( - node.classifier, - msg="CompositeDiscreteNode does not have classifier", - ) - - def test_learning_models(self): - bn, p = self._get_starter_bn(self.data[["I", "O", "T"]]) - - parent_node_a = CompositeNode( - nodes_from=None, - content={ - "name": "I", - "type": p.nodes_types["I"], - "parent_model": None, - }, - ) - - parent_node_h = CompositeNode( - nodes_from=None, - content={ - "name": "O", - "type": p.nodes_types["O"], - "parent_model": None, - }, - ) - - child_node = CompositeNode( - nodes_from=[parent_node_a, parent_node_h], - content={ - "name": "T", - "type": p.nodes_types["T"], - "parent_model": "CatBoostRegressor", - }, - ) - - comp_model = CompositeModel(nodes=[parent_node_a, parent_node_h, child_node]) - - bn.add_edges( - self.data[["I", "O", "T"]], - verbose=False, - custom_mutations=[custom_mutation_add_model], - custom_crossovers=[custom_crossover_all_model], - custom_initial_structure=[comp_model], - ) - - output_structure = [ - tuple([str(item) for item in inner_list]) for inner_list in bn.edges - ] - - self.assertEqual( - output_structure, - self.comparative_dag, - msg="Obtained BN should have reference structure", - ) - - @staticmethod - def _get_starter_bn(data): - encoder = pp.LabelEncoder() - p = bp.Preprocessor([("encoder", encoder)]) - - _, _ = p.apply(data) - - info = p.info - - bn = CompositeBN() - bn.add_nodes(info) - - return bn, p - - -if __name__ == "__main__": - unittest.main(verbosity=3) diff --git a/tests/test_nodes.py b/tests/test_nodes.py deleted file mode 100644 index 878d736..0000000 --- a/tests/test_nodes.py +++ /dev/null @@ -1,464 +0,0 @@ -import logging -import unittest - -import numpy as np -import pandas as pd - -from bamt.networks.hybrid_bn import HybridBN -from bamt.nodes import * - -logging.getLogger("nodes").setLevel(logging.CRITICAL) - - -class MyTest(unittest.TestCase): - unittest.skip("This is an assertion.") - - def assertDist(self, dist, node_type): - if node_type in ("disc", "disc_num"): - return self.assertAlmostEqual(sum(dist), 1) - else: - return self.assertEqual(len(dist), 2, msg=f"Error on {dist}") - - def assertDistMixture(self, dist): - return self.assertEqual(len(dist), 3, msg=f"Error on {dist}") - - -class TestBaseNode(MyTest): - def setUp(self): - np.random.seed(510) - - hybrid_bn = HybridBN(has_logit=True) - - info = { - "types": { - "Node0": "cont", - "Node1": "cont", - "Node2": "cont", - "Node3": "cont", - "Node4": "disc", - "Node5": "disc", - "Node6": "disc_num", - "Node7": "disc_num", - }, - "signs": {"Node0": "pos", "Node1": "neg", "Node2": "neg", "Node3": "neg"}, - } - - data = pd.DataFrame( - { - "Node0": np.random.normal(0, 4, 30), - "Node1": np.random.normal(0, 0.1, 30), - "Node2": np.random.normal(0, 0.3, 30), - "Node3": np.random.normal(0, 0.3, 30), - "Node4": np.random.choice(["cat1", "cat2", "cat3"], 30), - "Node5": np.random.choice(["cat4", 
"cat5", "cat6"], 30), - "Node6": np.random.choice(["cat7", "cat8", "cat9"], 30), - "Node7": np.random.choice(["cat7", "cat8", "cat9"], 30), - } - ) - - nodes = [ - gaussian_node.GaussianNode(name="Node0"), - gaussian_node.GaussianNode(name="Node1"), - gaussian_node.GaussianNode(name="Node2"), - gaussian_node.GaussianNode(name="Node3"), - discrete_node.DiscreteNode(name="Node4"), - discrete_node.DiscreteNode(name="Node5"), - discrete_node.DiscreteNode(name="Node6"), - discrete_node.DiscreteNode(name="Node7"), - ] - - edges = [ - ("Node0", "Node7"), - ("Node0", "Node1"), - ("Node0", "Node2"), - ("Node0", "Node5"), - ("Node4", "Node2"), - ("Node4", "Node5"), - ("Node4", "Node6"), - ("Node4", "Node3"), - ] - - hybrid_bn.set_structure(info, nodes=nodes, edges=edges) - hybrid_bn.fit_parameters(data) - self.bn = hybrid_bn - self.data = data - self.info = info - - def test_equality(self): - test = base.BaseNode(name="node0") - first = base.BaseNode(name="node1") - - test_clone = test - - # different name - self.assertFalse(test == first) - self.assertTrue(test == test_clone) - - # different type - test_clone.type = "gaussian" - test.type = "gaussian" - - first.type = "discrete" - - self.assertFalse(test == first) - self.assertTrue(test == test_clone) - - # different disc_parents - disc_parents = [f"node{i}" for i in range(2, 6)] - test.disc_parents, test_clone.disc_parents = disc_parents, disc_parents - first = disc_parents[:3] - - self.assertFalse(test == first) - self.assertTrue(test == test_clone) - - # different cont_parents - cont_parents = [f"node{i}" for i in range(6, 10)] - test.disc_parents, test_clone.disc_parents = cont_parents, cont_parents - first = cont_parents[:3] - - self.assertFalse(test == first) - self.assertTrue(test == test_clone) - - # different children - children = [f"node{i}" for i in range(5)] - test.disc_parents, test_clone.disc_parents = children, children - first = children[:3] - - self.assertFalse(test == first) - self.assertTrue(test == test_clone) - - def test_get_dist_mixture(self): - hybrid_bn = HybridBN(use_mixture=True, has_logit=True) - - hybrid_bn.set_structure(self.info, self.bn.nodes, self.bn.edges) - hybrid_bn.fit_parameters(self.data) - - mixture_gauss = hybrid_bn["Node1"] - cond_mixture_gauss = hybrid_bn["Node2"] - - for i in range(-2, 0, 2): - dist = hybrid_bn.get_dist(mixture_gauss.name, pvals={"Node0": i}) - self.assertDistMixture(dist) - - for i in range(-2, 0, 2): - for j in self.data[cond_mixture_gauss.disc_parents[0]].unique().tolist(): - dist = hybrid_bn.get_dist( - cond_mixture_gauss.name, pvals={"Node0": float(i), "Node4": j} - ) - self.assertDistMixture(dist) - - def test_get_dist(self): - for node in self.bn.nodes: - if not node.cont_parents + node.disc_parents: - dist = self.bn.get_dist(node.name) - if "mean" in dist.keys(): - self.assertTrue(isinstance(dist["mean"], float)) - self.assertTrue(isinstance(dist["variance"], float)) - else: - self.assertAlmostEqual(sum(dist["cprob"]), 1) - continue - - if len(node.cont_parents + node.disc_parents) == 1: - if node.disc_parents: - pvals = self.data[node.disc_parents[0]].unique().tolist() - parent = node.disc_parents[0] - else: - pvals = range(-5, 5, 1) - parent = node.cont_parents[0] - - for pval in pvals: - dist = self.bn.get_dist(node.name, {parent: pval}) - self.assertDist(dist, self.info["types"][node.name]) - else: - for i in self.data[node.disc_parents[0]].unique().tolist(): - for j in range(-5, 5, 1): - dist = self.bn.get_dist( - node.name, - {node.cont_parents[0]: float(j), 
node.disc_parents[0]: i}, - ) - self.assertDist(dist, self.info["types"][node.name]) - - # ??? - def test_choose_serialization(self): - pass - - -class TestDiscreteNode(unittest.TestCase): - def setUp(self): - self.node = discrete_node.DiscreteNode(name="test") - self.data_dict = { - "node0": np.random.normal(0, 4, 30), - "node1": np.random.normal(0, 0.1, 30), - "node2": np.random.normal(0, 0.3, 30), - "test": np.random.choice(["cat1", "cat2", "cat3"], 30), - "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), - "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), - "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), - } - - self.node.disc_parents = ["node4", "node5"] - # self.node.cont_parents = ["node0", "node1"] - self.node.children = ["node6"] - - def test_fit_parameters(self): - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - self.assertIsNotNone(params["vals"]) - self.assertNotEqual(params["vals"], []) - - for comb, probas in params["cprob"].items(): - self.assertAlmostEqual(sum(probas), 1, delta=1e-5) - - def test_choose(self): - pvals = ["cat4", "cat7"] - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - - self.assertTrue([self.node.choose(params, pvals) in params["vals"]]) - - def test_predict(self): - pvals = ["cat4", "cat7"] - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - - self.assertTrue([self.node.predict(params, pvals) in params["vals"]]) - self.assertRaises(KeyError, self.node.predict, params, ["bad", "values"]) - - -class TestGaussianNode(unittest.TestCase): - def setUp(self): - self.node = gaussian_node.GaussianNode(name="test") - self.data_dict = { - "node0": np.random.normal(1, 4, 30), - "node1": np.random.normal(2, 0.1, 30), - "foster-son": np.random.normal(2.5, 0.2, 30), - "test": np.random.normal(3, 0.3, 30), - "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), - "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), - "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), - "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), - } - - # self.node.disc_parents = ["node4", "node5"] - self.node.cont_parents = ["node0", "node1"] - self.node.children = ["node6"] - - def test_fit_parameters(self): - node_without_parents = gaussian_node.GaussianNode(name="foster-son") - node_without_parents.children = ["node6", "node5"] - - params_parents = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict) - ) - params_foster = node_without_parents.fit_parameters( - pd.DataFrame.from_records(self.data_dict) - ) - - self.assertIsNotNone(params_parents["regressor_obj"]) - self.assertTrue(pd.isna(params_parents["mean"])) - self.assertIsNotNone(params_parents["variance"]) - - self.assertIsNone(params_foster["regressor_obj"]) - self.assertFalse(pd.isna(params_foster["mean"])) - self.assertIsNotNone(params_foster["variance"]) - - def test_choose(self): - pvals = [1.05, 1.95] - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - - self.assertTrue(isinstance(self.node.choose(params, pvals), float)) - - def test_predict(self): - pvals = [1.05, 1.95] - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - self.assertTrue(isinstance(self.node.predict(params, pvals), float)) - self.assertRaises(ValueError, self.node.predict, params, ["bad", "values"]) - - -class TestConditionalGaussianNode(unittest.TestCase): - def setUp(self): - self.node = conditional_gaussian_node.ConditionalGaussianNode(name="test") - 
self.data_dict = { - "node0": np.random.normal(1, 4, 30), - "node1": np.random.normal(2, 0.1, 30), - "foster-son": np.random.normal(2.5, 0.2, 30), - "test": np.random.normal(3, 0.3, 30), - "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), - "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), - "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), - "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), - } - - self.node.disc_parents = ["node4", "node5"] - self.node.cont_parents = ["node0", "node1"] - self.node.children = ["node6"] - - def fit_parameters(self, regressor=None): - if regressor is not None: - self.node.regressor = regressor - self.node.type = "ConditionalGaussian" + f" ({type(regressor).__name__})" - - node_without_parents = conditional_gaussian_node.ConditionalGaussianNode( - name="foster-son" - ) - node_without_parents.children = ["node6", "node5"] - - params_parents = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict) - )["hybcprob"] - params_foster = node_without_parents.fit_parameters( - pd.DataFrame.from_records(self.data_dict) - )["hybcprob"]["[]"] - - self.assertIsNone(params_foster["regressor_obj"]) - self.assertIsNotNone(params_foster["mean"]) - - # Since there can be empty dictionaries, - # we have to count them and if a fraction is greater than the threshold - # test failed. - report = [] - for comb, data in params_parents.items(): - if all(pd.isna(x) for x in data.values()): - report.append(1) - continue - else: - report.append(0) - - if pd.isna(data["mean"]): - self.assertIsNotNone(data["regressor_obj"]) - self.assertIsNotNone(data["variance"]) - else: - self.assertIsNone(data["regressor_obj"]) - self.assertIsNotNone(data["mean"]) - - self.assertLess(sum(report) / len(report), 0.3) - # print(params_parents, params_foster, sep="\n\n") - - # for k, v in params_parents.items(): - # print(k, True if v["regressor_obj"] else False, v["regressor"]) - - # print(sum(report) / len(report), node_without_parents.regressor) - - def test_choose(self): - pvals = [1.05, 1.95, "cat4", "cat7"] - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - self.assertTrue(isinstance(self.node.choose(params, pvals), float)) - - def test_predict(self): - pvals = [1.05, 1.95, "cat4", "cat7"] - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - - self.assertTrue(isinstance(self.node.predict(params, pvals), float)) - self.assertRaises(KeyError, self.node.predict, params, ["bad", "values"]) - - -class TestMixtureGaussianNode(unittest.TestCase): - def setUp(self): - self.node = mixture_gaussian_node.MixtureGaussianNode(name="test") - self.data_dict = { - "node0": np.random.normal(1, 4, 30), - "node1": np.random.normal(2, 0.1, 30), - "test": np.random.normal(3, 0.3, 30), - "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), - "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), - "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), - "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), - } - - # self.node.disc_parents = ["node4", "node5"] - self.node.cont_parents = ["node0", "node1"] - self.node.children = ["node6"] - - def test_fit_parameters(self): - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - self.assertAlmostEqual(sum(params["coef"]), 1, delta=1e-5) - - def test_choose(self): - pvals = [1.05, 1.95] - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - self.assertTrue(isinstance(self.node.choose(params, pvals), float)) - - def 
test_predict(self): - pvals = [1.05, 1.95] - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - - self.assertTrue(isinstance(self.node.predict(params, pvals), float)) - - -class TestConditionalMixtureGaussianNode(unittest.TestCase): - def setUp(self): - self.node = conditional_mixture_gaussian_node.ConditionalMixtureGaussianNode( - name="test" - ) - self.data_dict = { - "node0": np.random.normal(1, 4, 30), - "node1": np.random.normal(2, 0.1, 30), - "test": np.random.normal(3, 0.3, 30), - "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), - "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), - "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), - "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), - } - - self.node.disc_parents = ["node4", "node5"] - self.node.cont_parents = ["node0", "node1"] - self.node.children = ["node6"] - - def test_fit_parameters(self): - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict))[ - "hybcprob" - ] - report = [] - # sometimes combination's data can be empty, so we set the percent of - # empty combinations - for comb, data in params.items(): - if np.isclose(sum(data["coef"]), 1, atol=1e-5): - report.append(0) - else: - report.append(1) - self.assertLess(sum(report) / len(report), 0.3) - - def test_choose(self): - pvals = [1.05, 1.95, "cat4", "cat7"] - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - self.assertTrue(isinstance(self.node.choose(params, pvals), float)) - - def test_predict(self): - pvals = [1.05, 1.95, "cat4", "cat7"] - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - - self.assertTrue(isinstance(self.node.predict(params, pvals), float)) - - -class TestLogitNode(unittest.TestCase): - def setUp(self): - self.node = logit_node.LogitNode(name="test") - self.data_dict = { - "node0": np.random.normal(1, 4, 30), - "node1": np.random.normal(2, 0.1, 30), - "node2": np.random.normal(3, 0.3, 30), - "test": np.random.choice(["cat1", "cat2", "cat3"], 30), - "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), - "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), - "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), - } - - # self.node.disc_parents = ["node4", "node5"] - self.node.cont_parents = ["node0", "node1"] - self.node.children = ["node6"] - - def test_fit_parameters(self): - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - self.assertIsNotNone(params["classifier_obj"]) - - def test_choose(self): - pvals = [1.05, 1.95] - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - self.assertTrue(self.node.choose(params, pvals) in params["classes"]) - - def test_predict(self): - pvals = [1.05, 1.95] - params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - - self.assertTrue([self.node.predict(params, pvals) in params["classes"]]) - - -if __name__ == "__main__": - unittest.main(verbosity=2) diff --git a/tests/test_params.json b/tests/test_params.json deleted file mode 100644 index e3ebc03..0000000 --- a/tests/test_params.json +++ /dev/null @@ -1 +0,0 @@ -{"Node0": {"mean": 56.7183257918552, "coef": [], "variance": 7149.17326145042}, "Node3": {"classes": ["COMPRESSION", "EVAPORITE", "EXTENSION", "GRAVITY", "INVERSION", "STRIKE-SLIP", "TRANSPRESSION", "TRANSTENSION", "UPLIFT"], "classifier_obj": 
"\u0080\u0004\u0095\u00b1\u0003\u0000\u0000\u0000\u0000\u0000\u0000\u008c\u001esklearn.linear_model._logistic\u0094\u008c\u0012LogisticRegression\u0094\u0093\u0094)\u0081\u0094}\u0094(\u008c\u0007penalty\u0094\u008c\u0002l2\u0094\u008c\u0004dual\u0094\u0089\u008c\u0003tol\u0094G?\u001a6\u00e2\u00eb\u001cC-\u008c\u0001C\u0094G?\u00f0\u0000\u0000\u0000\u0000\u0000\u0000\u008c\rfit_intercept\u0094\u0088\u008c\u0011intercept_scaling\u0094K\u0001\u008c\fclass_weight\u0094N\u008c\frandom_state\u0094N\u008c\u0006solver\u0094\u008c\tnewton-cg\u0094\u008c\bmax_iter\u0094Kd\u008c\u000bmulti_class\u0094\u008c\u000bmultinomial\u0094\u008c\u0007verbose\u0094K\u0000\u008c\nwarm_start\u0094\u0089\u008c\u0006n_jobs\u0094N\u008c\bl1_ratio\u0094N\u008c\u000en_features_in_\u0094K\u0001\u008c\bclasses_\u0094\u008c\u0015numpy.core.multiarray\u0094\u008c\f_reconstruct\u0094\u0093\u0094\u008c\u0005numpy\u0094\u008c\u0007ndarray\u0094\u0093\u0094K\u0000\u0085\u0094C\u0001b\u0094\u0087\u0094R\u0094(K\u0001K\t\u0085\u0094h\u001c\u008c\u0005dtype\u0094\u0093\u0094\u008c\u0002O8\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003\u008c\u0001|\u0094NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK?t\u0094b\u0089]\u0094(\u008c\u000bCOMPRESSION\u0094\u008c\tEVAPORITE\u0094\u008c\tEXTENSION\u0094\u008c\u0007GRAVITY\u0094\u008c\tINVERSION\u0094\u008c\u000bSTRIKE-SLIP\u0094\u008c\rTRANSPRESSION\u0094\u008c\fTRANSTENSION\u0094\u008c\u0006UPLIFT\u0094et\u0094b\u008c\u0007n_iter_\u0094h\u001bh\u001eK\u0000\u0085\u0094h \u0087\u0094R\u0094(K\u0001K\u0001\u0085\u0094h%\u008c\u0002i4\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003\u008c\u0001<\u0094NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK\u0000t\u0094b\u0089C\u0004+\u0000\u0000\u0000\u0094t\u0094b\u008c\u0005coef_\u0094h\u001bh\u001eK\u0000\u0085\u0094h \u0087\u0094R\u0094(K\u0001K\tK\u0001\u0086\u0094h%\u008c\u0002f8\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003h>NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK\u0000t\u0094b\u0089CH\u00a9\u0093\n\u00a1b\u001ak?\u00fc\u00a5\u00ad\u00e1\u00a2\u0001P\u00bf\u00f4G\u00ab\u00dd\u00f4\u00cc{?e.QC-\u00bau?\u001f\u00bd\u00acb&>t?j\u0092\u00f9\u0092\u00f0aw?1\u00cb\u00967N\u00dc\u0092\u00bfk\u0000\u00ac\u00d1\u00a4`\u007f?\u00a1\u0096\u0006\u00d1\u00b6Q\u008d\u00bf\u0094t\u0094b\u008c\nintercept_\u0094h\u001bh\u001eK\u0000\u0085\u0094h \u0087\u0094R\u0094(K\u0001K\t\u0085\u0094hI\u0089CH'\u009cX\u008cG&\u0007@\u009a\u0011\u00e6\u00e9\u00feR\u0002\u00c0\u008f\u0085\u00f3\u00df\u00d5\u00da\u00fa?\u00df\u00ee\u00a1\u00e4\u0001#\u00f7?\u00e5\u00e1\b\u00b5\u009c\u000b\u00fb?x\u000b1Q\u0088\u00c8\u00d5?`r\u001f\u0006\u00b2\u00c2\u00f1\u00bf%B\u0006\u00c9=\u0002\u0006\u00c0\u00e0\u00b6\u00a3:\u00faZ\u00fe\u00bf\u0094t\u0094b\u008c\u0010_sklearn_version\u0094\u008c\u00051.0.2\u0094ub.", "classifier": "LogisticRegression", "serialization": "pickle"}, "Node1": {"hybcprob": {"['COMPRESSION']": {"variance": 53.87084286837278, "mean": 15.428971962616822, "coef": []}, "['EVAPORITE']": {"variance": 0.0, "mean": 27.0, "coef": []}, "['EXTENSION']": {"variance": 49.697271531886926, "mean": 18.712820512820514, "coef": []}, "['GRAVITY']": {"variance": 45.05489795918368, "mean": 21.12857142857143, "coef": []}, "['INVERSION']": {"variance": 46.653795918367344, "mean": 21.314285714285713, "coef": []}, "['STRIKE-SLIP']": {"variance": 46.55263157894737, "mean": 20.5, "coef": []}, "['TRANSPRESSION']": {"variance": 2.25, "mean": 18.5, "coef": []}, "['TRANSTENSION']": {"variance": 0.0, "mean": 25.0, "coef": []}, "['UPLIFT']": {"variance": 0.0, "mean": 
10.0, "coef": []}}}, "Node4": {"classes": ["ARCHEAN", "CAMBRIAN", "CAMBRIAN-ORDOVICIAN", "CAMBRIAN-ORDOVICIAN/CARBONIFEROUS", "CARBONIFEROUS", "CARBONIFEROUS-CRETACEOUS", "CARBONIFEROUS-PERMIAN", "CRETACEOUS", "CRETACEOUS-PALEOGENE", "DEVONIAN", "DEVONIAN-CARBONIFEROUS", "DEVONIAN-PERMIAN", "JURASSIC", "JURASSIC-CRETACEOUS", "MESOZOIC", "NEOGENE", "ORDOVICIAN", "PALEOGENE", "PALEOGENE-NEOGENE", "PALEOZOIC", "PALEOZOIC-CRETACEOUS", "PERMIAN", "PERMIAN-TRIASSIC", "PROTEROZOIC", "PROTEROZOIC-CAMBRIAN", "SILURIAN", "TRIASSIC", "TRIASSIC-JURASSIC"], "classifier_obj": "\u0080\u0004\u0095N\u0006\u0000\u0000\u0000\u0000\u0000\u0000\u008c\u001esklearn.linear_model._logistic\u0094\u008c\u0012LogisticRegression\u0094\u0093\u0094)\u0081\u0094}\u0094(\u008c\u0007penalty\u0094\u008c\u0002l2\u0094\u008c\u0004dual\u0094\u0089\u008c\u0003tol\u0094G?\u001a6\u00e2\u00eb\u001cC-\u008c\u0001C\u0094G?\u00f0\u0000\u0000\u0000\u0000\u0000\u0000\u008c\rfit_intercept\u0094\u0088\u008c\u0011intercept_scaling\u0094K\u0001\u008c\fclass_weight\u0094N\u008c\frandom_state\u0094N\u008c\u0006solver\u0094\u008c\tnewton-cg\u0094\u008c\bmax_iter\u0094Kd\u008c\u000bmulti_class\u0094\u008c\u000bmultinomial\u0094\u008c\u0007verbose\u0094K\u0000\u008c\nwarm_start\u0094\u0089\u008c\u0006n_jobs\u0094N\u008c\bl1_ratio\u0094N\u008c\u000en_features_in_\u0094K\u0001\u008c\bclasses_\u0094\u008c\u0015numpy.core.multiarray\u0094\u008c\f_reconstruct\u0094\u0093\u0094\u008c\u0005numpy\u0094\u008c\u0007ndarray\u0094\u0093\u0094K\u0000\u0085\u0094C\u0001b\u0094\u0087\u0094R\u0094(K\u0001K\u001c\u0085\u0094h\u001c\u008c\u0005dtype\u0094\u0093\u0094\u008c\u0002O8\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003\u008c\u0001|\u0094NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK?t\u0094b\u0089]\u0094(\u008c\u0007ARCHEAN\u0094\u008c\bCAMBRIAN\u0094\u008c\u0013CAMBRIAN-ORDOVICIAN\u0094\u008c!CAMBRIAN-ORDOVICIAN/CARBONIFEROUS\u0094\u008c\rCARBONIFEROUS\u0094\u008c\u0018CARBONIFEROUS-CRETACEOUS\u0094\u008c\u0015CARBONIFEROUS-PERMIAN\u0094\u008c\nCRETACEOUS\u0094\u008c\u0014CRETACEOUS-PALEOGENE\u0094\u008c\bDEVONIAN\u0094\u008c\u0016DEVONIAN-CARBONIFEROUS\u0094\u008c\u0010DEVONIAN-PERMIAN\u0094\u008c\bJURASSIC\u0094\u008c\u0013JURASSIC-CRETACEOUS\u0094\u008c\bMESOZOIC\u0094\u008c\u0007NEOGENE\u0094\u008c\nORDOVICIAN\u0094\u008c\tPALEOGENE\u0094\u008c\u0011PALEOGENE-NEOGENE\u0094\u008c\tPALEOZOIC\u0094\u008c\u0014PALEOZOIC-CRETACEOUS\u0094\u008c\u0007PERMIAN\u0094\u008c\u0010PERMIAN-TRIASSIC\u0094\u008c\u000bPROTEROZOIC\u0094\u008c\u0014PROTEROZOIC-CAMBRIAN\u0094\u008c\bSILURIAN\u0094\u008c\bTRIASSIC\u0094\u008c\u0011TRIASSIC-JURASSIC\u0094et\u0094b\u008c\u0007n_iter_\u0094h\u001bh\u001eK\u0000\u0085\u0094h \u0087\u0094R\u0094(K\u0001K\u0001\u0085\u0094h%\u008c\u0002i4\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003\u008c\u0001<\u0094NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK\u0000t\u0094b\u0089C\u0004!\u0000\u0000\u0000\u0094t\u0094b\u008c\u0005coef_\u0094h\u001bh\u001eK\u0000\u0085\u0094h 
\u0087\u0094R\u0094(K\u0001K\u001cK\u0001\u0086\u0094h%\u008c\u0002f8\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003hQNNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK\u0000t\u0094b\u0089C\u00e0LS1\u00b9zE\u00ec\u00bf\u00d1\u00c7\u00bf\u0092Y\u00c5\u00c2\u00bf\u00a6.\u0085\u00aen\u0001\u00dd\u00bf\u0016\u0099\u00f0\u00e0'\u0002\u00d2?\u00c3o\u00ed\u000e\u00f0Z\u00b3?\u00daFH.\u0082\u00c7\u0091?\u00bds\u0099\u00fe\u00a5#\u00bb?\u0093?\u00b0\u00aflW\u00d2?\u00a6\u008a.\u0096\u007f\u00aa\u00d6?\u00bf7|:\u00be\u0095\u00ab\u00bf\u00eb\u0017\u00de\u00edN&\u00e1\u00bf\u00d3S\u00e6 \u00d8H\u00b6\u00bf}\u001b\u0088VKS\u00ce?z\u00b8\u0090;\u00f74\u0092?oK\u001f\u00c4v\u0012\u00a9?\u008f\u00a9\u00f8s\u00b4\u00a3\u00da?\u00ffS)+v\u0092\u00d6\u00bf]\u00ea\u00af&\u001a~\u00d3?\u0094\u00f2\u00e1M\u00c1\u0092\u00d2?\u00fb\u00de\u00da?\u001a\u0090\u00c1?z\u0092\u009d\u00c3\u00e2\u0096\u00c9?\u00e5m\u0098T\u00ed(\u00a6?\u00e26 Date: Fri, 5 Apr 2024 15:09:43 +0300 Subject: [PATCH 02/18] major changes to initial structure --- bamt/{graph => core}/__init__.py | 0 bamt/{nodes => core/graph}/__init__.py | 0 bamt/{ => core}/graph/dag.py | 0 bamt/{ => core}/graph/graph.py | 0 .../node_models}/__init__.py | 0 .../node_models/continuous_distribution.py | 22 +++++++++++++++++++ bamt/core/node_models/distribution.py | 14 ++++++++++++ .../node_models/empirical_distribution.py | 17 ++++++++++++++ .../node_models => core/nodes}/__init__.py | 0 .../nodes/child_nodes}/__init__.py | 0 .../nodes/child_nodes/child_node.py | 2 +- .../conditional_continuous_node.py | 0 .../child_nodes/conditional_discrete_node.py | 0 .../child_nodes/node_models/__init__.py} | 0 bamt/{ => core}/nodes/node.py | 0 bamt/core/nodes/root_nodes/__init__.py | 0 .../nodes/root_nodes/continuous_node.py | 0 bamt/core/nodes/root_nodes/discrete_node.py | 0 bamt/{ => core}/nodes/root_nodes/root_node.py | 2 +- .../bayesian_network.py | 3 ++- 20 files changed, 57 insertions(+), 3 deletions(-) rename bamt/{graph => core}/__init__.py (100%) rename bamt/{nodes => core/graph}/__init__.py (100%) rename bamt/{ => core}/graph/dag.py (100%) rename bamt/{ => core}/graph/graph.py (100%) rename bamt/{nodes/child_nodes => core/node_models}/__init__.py (100%) create mode 100644 bamt/core/node_models/continuous_distribution.py create mode 100644 bamt/core/node_models/distribution.py create mode 100644 bamt/core/node_models/empirical_distribution.py rename bamt/{nodes/child_nodes/node_models => core/nodes}/__init__.py (100%) rename bamt/{nodes/root_nodes => core/nodes/child_nodes}/__init__.py (100%) rename bamt/{ => core}/nodes/child_nodes/child_node.py (66%) rename bamt/{ => core}/nodes/child_nodes/conditional_continuous_node.py (100%) rename bamt/{ => core}/nodes/child_nodes/conditional_discrete_node.py (100%) rename bamt/{nodes/root_nodes/discrete_node.py => core/nodes/child_nodes/node_models/__init__.py} (100%) rename bamt/{ => core}/nodes/node.py (100%) create mode 100644 bamt/core/nodes/root_nodes/__init__.py rename bamt/{ => core}/nodes/root_nodes/continuous_node.py (100%) create mode 100644 bamt/core/nodes/root_nodes/discrete_node.py rename bamt/{ => core}/nodes/root_nodes/root_node.py (66%) diff --git a/bamt/graph/__init__.py b/bamt/core/__init__.py similarity index 100% rename from bamt/graph/__init__.py rename to bamt/core/__init__.py diff --git a/bamt/nodes/__init__.py b/bamt/core/graph/__init__.py similarity index 100% rename from bamt/nodes/__init__.py rename to bamt/core/graph/__init__.py diff --git a/bamt/graph/dag.py b/bamt/core/graph/dag.py similarity 
index 100% rename from bamt/graph/dag.py rename to bamt/core/graph/dag.py diff --git a/bamt/graph/graph.py b/bamt/core/graph/graph.py similarity index 100% rename from bamt/graph/graph.py rename to bamt/core/graph/graph.py diff --git a/bamt/nodes/child_nodes/__init__.py b/bamt/core/node_models/__init__.py similarity index 100% rename from bamt/nodes/child_nodes/__init__.py rename to bamt/core/node_models/__init__.py diff --git a/bamt/core/node_models/continuous_distribution.py b/bamt/core/node_models/continuous_distribution.py new file mode 100644 index 0000000..9d31e21 --- /dev/null +++ b/bamt/core/node_models/continuous_distribution.py @@ -0,0 +1,22 @@ +import numpy as np + +from .distribution import Distribution + + +class ContinuousDistribution(Distribution): + def __init__(self, distribution_model=None, **parameters): + self._distribution = distribution_model + self._parameters = parameters + + def fit(self, X: np.ndarray) -> None: + if self._distribution is None: + # TODO: implement an algorithm that finds a distribution and fits it with chosen parameters + pass + else: + pass + + def sample(self, num_samples: int) -> np.ndarray: + pass + + def __str__(self): + return str(self._distribution.name) + " continuous distribution" diff --git a/bamt/core/node_models/distribution.py b/bamt/core/node_models/distribution.py new file mode 100644 index 0000000..ed3a747 --- /dev/null +++ b/bamt/core/node_models/distribution.py @@ -0,0 +1,14 @@ +from abc import ABC, abstractmethod + +import numpy as np + + +class Distribution(ABC): + + @abstractmethod + def fit(self, X: np.ndarray) -> None: + pass + + @abstractmethod + def sample(self, num_samples: int) -> np.ndarray: + pass diff --git a/bamt/core/node_models/empirical_distribution.py b/bamt/core/node_models/empirical_distribution.py new file mode 100644 index 0000000..ec63673 --- /dev/null +++ b/bamt/core/node_models/empirical_distribution.py @@ -0,0 +1,17 @@ +import numpy as np + +from .distribution import Distribution + + +class EmpiricalDistribution(Distribution): + def __init__(self): + pass + + def fit(self, X: np.ndarray) -> None: + pass + + def sample(self, num_samples: int) -> np.ndarray: + pass + + def __str__(self): + return "Empirical Distribution" diff --git a/bamt/nodes/child_nodes/node_models/__init__.py b/bamt/core/nodes/__init__.py similarity index 100% rename from bamt/nodes/child_nodes/node_models/__init__.py rename to bamt/core/nodes/__init__.py diff --git a/bamt/nodes/root_nodes/__init__.py b/bamt/core/nodes/child_nodes/__init__.py similarity index 100% rename from bamt/nodes/root_nodes/__init__.py rename to bamt/core/nodes/child_nodes/__init__.py diff --git a/bamt/nodes/child_nodes/child_node.py b/bamt/core/nodes/child_nodes/child_node.py similarity index 66% rename from bamt/nodes/child_nodes/child_node.py rename to bamt/core/nodes/child_nodes/child_node.py index d99d45d..fc1636d 100644 --- a/bamt/nodes/child_nodes/child_node.py +++ b/bamt/core/nodes/child_nodes/child_node.py @@ -1,4 +1,4 @@ -from bamt.nodes.node import Node +from bamt.core.nodes.node import Node class ChildNode(Node): diff --git a/bamt/nodes/child_nodes/conditional_continuous_node.py b/bamt/core/nodes/child_nodes/conditional_continuous_node.py similarity index 100% rename from bamt/nodes/child_nodes/conditional_continuous_node.py rename to bamt/core/nodes/child_nodes/conditional_continuous_node.py diff --git a/bamt/nodes/child_nodes/conditional_discrete_node.py b/bamt/core/nodes/child_nodes/conditional_discrete_node.py similarity index 100% rename from 
bamt/nodes/child_nodes/conditional_discrete_node.py rename to bamt/core/nodes/child_nodes/conditional_discrete_node.py diff --git a/bamt/nodes/root_nodes/discrete_node.py b/bamt/core/nodes/child_nodes/node_models/__init__.py similarity index 100% rename from bamt/nodes/root_nodes/discrete_node.py rename to bamt/core/nodes/child_nodes/node_models/__init__.py diff --git a/bamt/nodes/node.py b/bamt/core/nodes/node.py similarity index 100% rename from bamt/nodes/node.py rename to bamt/core/nodes/node.py diff --git a/bamt/core/nodes/root_nodes/__init__.py b/bamt/core/nodes/root_nodes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bamt/nodes/root_nodes/continuous_node.py b/bamt/core/nodes/root_nodes/continuous_node.py similarity index 100% rename from bamt/nodes/root_nodes/continuous_node.py rename to bamt/core/nodes/root_nodes/continuous_node.py diff --git a/bamt/core/nodes/root_nodes/discrete_node.py b/bamt/core/nodes/root_nodes/discrete_node.py new file mode 100644 index 0000000..e69de29 diff --git a/bamt/nodes/root_nodes/root_node.py b/bamt/core/nodes/root_nodes/root_node.py similarity index 66% rename from bamt/nodes/root_nodes/root_node.py rename to bamt/core/nodes/root_nodes/root_node.py index 030e05a..faf7ff1 100644 --- a/bamt/nodes/root_nodes/root_node.py +++ b/bamt/core/nodes/root_nodes/root_node.py @@ -1,4 +1,4 @@ -from bamt.nodes.node import Node +from bamt.core.nodes.node import Node class RootNode(Node): diff --git a/bamt/models/probabilistic_structural_models/bayesian_network.py b/bamt/models/probabilistic_structural_models/bayesian_network.py index 36f6f2f..e9cf91b 100644 --- a/bamt/models/probabilistic_structural_models/bayesian_network.py +++ b/bamt/models/probabilistic_structural_models/bayesian_network.py @@ -1,6 +1,7 @@ -from .probabilistic_structural_model import ProbabilisticStructuralModel from abc import abstractmethod +from .probabilistic_structural_model import ProbabilisticStructuralModel + class BayesianNetwork(ProbabilisticStructuralModel): def __init__(self): From 6b1462da211d9b8fd5f10b46d37a13494451e2e6 Mon Sep 17 00:00:00 2001 From: jrzkaminski Date: Fri, 5 Apr 2024 15:51:50 +0300 Subject: [PATCH 03/18] some more changes to structure --- bamt/core/node_models/classifier.py | 25 +++++++++++++++++++ bamt/core/node_models/prediction_model.py | 20 +++++++++++++++ bamt/core/node_models/regressor.py | 25 +++++++++++++++++++ bamt/core/nodes/root_nodes/continuous_node.py | 4 +++ bamt/core/nodes/root_nodes/discrete_node.py | 6 +++++ .../composite_bayesian_network.py | 5 +++- .../continuous_bayesian_network.py | 3 +++ .../discrete_bayesian_network.py | 3 +++ .../hybrid_bayesian_network.py | 3 +++ 9 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 bamt/core/node_models/classifier.py create mode 100644 bamt/core/node_models/prediction_model.py create mode 100644 bamt/core/node_models/regressor.py diff --git a/bamt/core/node_models/classifier.py b/bamt/core/node_models/classifier.py new file mode 100644 index 0000000..afe9df4 --- /dev/null +++ b/bamt/core/node_models/classifier.py @@ -0,0 +1,25 @@ +from .prediction_model import PredictionModel + +import numpy as np + + +class Classifier(PredictionModel): + def __init__(self, classifier=None, **parameters): + self._classifier = classifier + self._parameters = parameters + + def fit(self, X: np.ndarray, y: np.ndarray) -> None: + if self._classifier is None: + # TODO: implement an algorithm that finds a classifier and fits it with chosen parameters + pass + else: + self._classifier.fit(X, 
y) + + def predict(self, X: np.ndarray) -> np.ndarray: + return self._classifier.predict(X) + + def predict_proba(self, X: np.ndarray) -> np.ndarray: + return self._classifier.predict_proba(X) + + def __str__(self): + return str(self._classifier) diff --git a/bamt/core/node_models/prediction_model.py b/bamt/core/node_models/prediction_model.py new file mode 100644 index 0000000..c2b5df7 --- /dev/null +++ b/bamt/core/node_models/prediction_model.py @@ -0,0 +1,20 @@ +from abc import ABC, abstractmethod + +import numpy as np + + +class PredictionModel(ABC): + """Represents general prediction model implementations. Each prediction model should provide a fit and a predict + method.""" + + @abstractmethod + def fit(self, X: np.ndarray, Y: np.ndarray) -> None: + raise NotImplementedError + + @abstractmethod + def predict(self, X: np.ndarray) -> np.ndarray: + raise NotImplementedError + + @abstractmethod + def predict_proba(self, X: np.ndarray) -> np.ndarray: + raise NotImplementedError diff --git a/bamt/core/node_models/regressor.py b/bamt/core/node_models/regressor.py new file mode 100644 index 0000000..d69941a --- /dev/null +++ b/bamt/core/node_models/regressor.py @@ -0,0 +1,25 @@ +from .prediction_model import PredictionModel + +import numpy as np + + +class Regressor(PredictionModel): + def __init__(self, regressor=None, **parameters): + self._regressor = regressor + self._parameters = parameters + + def fit(self, X: np.ndarray, y: np.ndarray) -> None: + if self._regressor is None: + # TODO: implement an algorithm that finds a regressor and fits it with chosen parameters + pass + else: + self._regressor.fit(X, y) + + def predict(self, X: np.ndarray) -> np.ndarray: + return self._regressor.predict(X) + + def predict_proba(self, X: np.ndarray) -> np.ndarray: + return self._regressor.predict_proba(X) + + def __str__(self): + return str(self._regressor) diff --git a/bamt/core/nodes/root_nodes/continuous_node.py b/bamt/core/nodes/root_nodes/continuous_node.py index 67e551a..f725000 100644 --- a/bamt/core/nodes/root_nodes/continuous_node.py +++ b/bamt/core/nodes/root_nodes/continuous_node.py @@ -4,3 +4,7 @@ class ContinuousNode(RootNode): def __init__(self): super().__init__() + self._distribution = None + + def __str__(self): + return "Continuous Node with " + str(self._distribution) diff --git a/bamt/core/nodes/root_nodes/discrete_node.py b/bamt/core/nodes/root_nodes/discrete_node.py index e69de29..d9b04c8 100644 --- a/bamt/core/nodes/root_nodes/discrete_node.py +++ b/bamt/core/nodes/root_nodes/discrete_node.py @@ -0,0 +1,6 @@ +from .root_node import RootNode + + +class DiscreteNode(RootNode): + def __init__(self): + super().__init__() diff --git a/bamt/models/probabilistic_structural_models/composite_bayesian_network.py b/bamt/models/probabilistic_structural_models/composite_bayesian_network.py index d8c2f86..486eaf7 100644 --- a/bamt/models/probabilistic_structural_models/composite_bayesian_network.py +++ b/bamt/models/probabilistic_structural_models/composite_bayesian_network.py @@ -1,7 +1,7 @@ from .bayesian_network import BayesianNetwork -class ContinuousBayesianNetwork(BayesianNetwork): +class CompositeBayesianNetwork(BayesianNetwork): def __init__(self): super().__init__() @@ -13,3 +13,6 @@ def predict(self): def sample(self): pass + + def __str__(self): + return "Composite Bayesian Network" diff --git a/bamt/models/probabilistic_structural_models/continuous_bayesian_network.py b/bamt/models/probabilistic_structural_models/continuous_bayesian_network.py index d8c2f86..618c4dc 100644 --- 
a/bamt/models/probabilistic_structural_models/continuous_bayesian_network.py +++ b/bamt/models/probabilistic_structural_models/continuous_bayesian_network.py @@ -13,3 +13,6 @@ def predict(self): def sample(self): pass + + def __str__(self): + return "Continuous Bayesian Network" diff --git a/bamt/models/probabilistic_structural_models/discrete_bayesian_network.py b/bamt/models/probabilistic_structural_models/discrete_bayesian_network.py index fcec53e..b7654a6 100644 --- a/bamt/models/probabilistic_structural_models/discrete_bayesian_network.py +++ b/bamt/models/probabilistic_structural_models/discrete_bayesian_network.py @@ -13,3 +13,6 @@ def predict(self): def sample(self): pass + + def __str__(self): + return "Discrete Bayesian Network" diff --git a/bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py b/bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py index 8dcf844..c4256df 100644 --- a/bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py +++ b/bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py @@ -13,3 +13,6 @@ def predict(self): def sample(self): pass + + def __str__(self): + return "Hybrid Bayesian Network" From 4362fa9abaab2692ec4490782d4bcb6976e5f5dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jerzy=20Kami=C5=84ski?= <86363785+jrzkaminski@users.noreply.github.com> Date: Sat, 27 Apr 2024 22:52:03 +0300 Subject: [PATCH 04/18] requirements update (#109) --- .../nodes/child_nodes/node_models/__init__.py | 0 pyproject.toml | 23 ++++++------- requirements.txt | 34 +++++++++++-------- 3 files changed, 31 insertions(+), 26 deletions(-) delete mode 100644 bamt/core/nodes/child_nodes/node_models/__init__.py diff --git a/bamt/core/nodes/child_nodes/node_models/__init__.py b/bamt/core/nodes/child_nodes/node_models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/pyproject.toml b/pyproject.toml index fab78d1..8000b61 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "BAMT" -version = "1.2.01" +version = "2.0.0" description = "data modeling and analysis tool based on Bayesian networks" authors = ["Roman Netrogolov ", "Irina Deeva ", @@ -22,20 +22,19 @@ packages = [ ] [tool.poetry.dependencies] -python = ">=3.9,<3.11" -setuptools = "65.6.3" -numpy = ">=1.24.2" -matplotlib = "3.6.2" -pandas = "2.0.3" +python = "^3.10" +numpy = "^1.26.0" +matplotlib = "^3.8.0" +pandas = "^2.2.0" gmr = "1.6.2" -scikit-learn = "^1.2.0" -scipy = "^1.8.0" -pyvis = ">=0.2.1" +scikit-learn = "^1.4.2" +scipy = "^1.13.0" +pyvis = "^0.3.1" missingno = "^0.5.1" -pgmpy = "0.1.20" -thegolem = ">=0.3.3" +pgmpy = "^0.1.20" +thegolem = "^0.3.3" xgboost = ">=1.7.6" -catboost = ">=1.0.6" +catboost = ">=2.0.0" lightgbm = {version = ">=3.3.5", optional = true } [tool.poetry.extras] diff --git a/requirements.txt b/requirements.txt index 4ffdb37..374a8a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,21 @@ -numpy>=1.24.2 -matplotlib==3.6.2 -pandas>=2.0.0 +# Data Manipulation +numpy>=1.26.0 +pandas>=2.2.0 + +# Graph manipulation +networkx>=3.3 +thegolem>=0.4.0 + +# ML modeling frameworks +scikit-learn>=1.4.2 +scipy>=1.13.0 +catboost>=1.2.1 +xgboost>=2.0.0 + +# visualization +matplotlib>=3.8.0 +pyvis>=0.3.1 + +# TODO: exclude these libraries gmr==1.6.2 -scikit-learn>=1.2.0 -scipy>=1.9.3 -pyvis>=0.2.1 -pgmpy==0.1.20 -catboost>=1.0.6 -joblib>=1.1.1 -networkx>=3.1 -tqdm>=4.65.0 -thegolem>=0.3.3 -typing>=3.7.4.3 -xgboost>=1.7.6 +pgmpy==0.1.20 \ No newline at end of file From 
27ff9faa3295e618e6043186eaedfad38e3c46c1 Mon Sep 17 00:00:00 2001 From: Pavel <95717191+PabloKarpacho@users.noreply.github.com> Date: Sat, 15 Jun 2024 13:42:35 +0300 Subject: [PATCH 05/18] Integrational tests (#105) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add files via upload 2 benchmarks for tests added * Add files via upload International test added * Update tests/test_Integrational.py Co-authored-by: Jerzy Kamiński <86363785+jrzkaminski@users.noreply.github.com> * Update test_Integrational.py * Add files via upload * Add files via upload --------- Co-authored-by: Jerzy Kamiński <86363785+jrzkaminski@users.noreply.github.com> --- data/benchmark/auto_price.csv | 160 +++++++++++++++++++ data/benchmark/new_thyroid.csv | 216 +++++++++++++++++++++++++ tests/test_Integrational.py | 277 +++++++++++++++++++++++++++++++++ 3 files changed, 653 insertions(+) create mode 100644 data/benchmark/auto_price.csv create mode 100644 data/benchmark/new_thyroid.csv create mode 100644 tests/test_Integrational.py diff --git a/data/benchmark/auto_price.csv b/data/benchmark/auto_price.csv new file mode 100644 index 0000000..4e073ad --- /dev/null +++ b/data/benchmark/auto_price.csv @@ -0,0 +1,160 @@ +,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,target +0,2.0,164.0,99.8000030517578,176.60000610351562,66.19999694824219,54.29999923706055,2337.0,109.0,3.190000057220459,3.400000095367432,10.0,102.0,5500.0,24.0,30.0,13950.0 +1,2.0,164.0,99.4000015258789,176.60000610351562,66.4000015258789,54.29999923706055,2824.0,136.0,3.190000057220459,3.400000095367432,8.0,115.0,5500.0,18.0,22.0,17450.0 +2,1.0,158.0,105.8000030517578,192.6999969482422,71.4000015258789,55.70000076293945,2844.0,136.0,3.190000057220459,3.400000095367432,8.5,110.0,5500.0,19.0,25.0,17710.0 +3,1.0,158.0,105.8000030517578,192.6999969482422,71.4000015258789,55.900001525878906,3086.0,131.0,3.130000114440918,3.400000095367432,8.300000190734863,140.0,5500.0,17.0,20.0,23875.0 +4,2.0,192.0,101.1999969482422,176.8000030517578,64.80000305175781,54.29999923706055,2395.0,108.0,3.5,2.799999952316284,8.800000190734863,101.0,5800.0,23.0,29.0,16430.0 +5,0.0,192.0,101.1999969482422,176.8000030517578,64.80000305175781,54.29999923706055,2395.0,108.0,3.5,2.799999952316284,8.800000190734863,101.0,5800.0,23.0,29.0,16925.0 +6,0.0,188.0,101.1999969482422,176.8000030517578,64.80000305175781,54.29999923706055,2710.0,164.0,3.309999942779541,3.190000057220459,9.0,121.0,4250.0,21.0,28.0,20970.0 +7,0.0,188.0,101.1999969482422,176.8000030517578,64.80000305175781,54.29999923706055,2765.0,164.0,3.309999942779541,3.190000057220459,9.0,121.0,4250.0,21.0,28.0,21105.0 +8,2.0,121.0,88.4000015258789,141.10000610351562,60.29999923706055,53.20000076293945,1488.0,61.0,2.9100000858306885,3.0299999713897705,9.5,48.0,5100.0,47.0,53.0,5151.0 +9,1.0,98.0,94.5,155.89999389648438,63.59999847412109,52.0,1874.0,90.0,3.0299999713897705,3.109999895095825,9.600000381469728,70.0,5400.0,38.0,43.0,6295.0 +10,0.0,81.0,94.5,158.8000030517578,63.59999847412109,52.0,1909.0,90.0,3.0299999713897705,3.109999895095825,9.600000381469728,70.0,5400.0,38.0,43.0,6575.0 +11,1.0,118.0,93.6999969482422,157.3000030517578,63.79999923706055,50.79999923706055,1876.0,90.0,2.970000028610229,3.2300000190734863,9.40999984741211,68.0,5500.0,37.0,41.0,5572.0 
+12,1.0,118.0,93.6999969482422,157.3000030517578,63.79999923706055,50.79999923706055,1876.0,90.0,2.970000028610229,3.2300000190734863,9.399999618530272,68.0,5500.0,31.0,38.0,6377.0 +13,1.0,118.0,93.6999969482422,157.3000030517578,63.79999923706055,50.79999923706055,2128.0,98.0,3.0299999713897705,3.390000104904175,7.599999904632568,102.0,5500.0,24.0,30.0,7957.0 +14,1.0,148.0,93.6999969482422,157.3000030517578,63.79999923706055,50.59999847412109,1967.0,90.0,2.970000028610229,3.2300000190734863,9.399999618530272,68.0,5500.0,31.0,38.0,6229.0 +15,1.0,148.0,93.6999969482422,157.3000030517578,63.79999923706055,50.59999847412109,1989.0,90.0,2.970000028610229,3.2300000190734863,9.399999618530272,68.0,5500.0,31.0,38.0,6692.0 +16,1.0,148.0,93.6999969482422,157.3000030517578,63.79999923706055,50.59999847412109,1989.0,90.0,2.970000028610229,3.2300000190734863,9.399999618530272,68.0,5500.0,31.0,38.0,7609.0 +17,-1.0,110.0,103.3000030517578,174.60000610351562,64.5999984741211,59.79999923706055,2535.0,122.0,3.3399999141693115,3.4600000381469727,8.5,88.0,5000.0,24.0,30.0,8921.0 +18,3.0,145.0,95.9000015258789,173.1999969482422,66.30000305175781,50.20000076293945,2811.0,156.0,3.5999999046325684,3.900000095367432,7.0,145.0,5000.0,19.0,24.0,12964.0 +19,2.0,137.0,86.5999984741211,144.60000610351562,63.900001525878906,50.79999923706055,1713.0,92.0,2.9100000858306885,3.4100000858306885,9.600000381469728,58.0,4800.0,49.0,54.0,6479.0 +20,2.0,137.0,86.5999984741211,144.60000610351562,63.900001525878906,50.79999923706055,1819.0,92.0,2.9100000858306885,3.4100000858306885,9.199999809265137,76.0,6000.0,31.0,38.0,6855.0 +21,1.0,101.0,93.6999969482422,150.0,64.0,52.59999847412109,1837.0,79.0,2.9100000858306885,3.069999933242798,10.100000381469728,60.0,5500.0,38.0,42.0,5399.0 +22,1.0,101.0,93.6999969482422,150.0,64.0,52.59999847412109,1940.0,92.0,2.9100000858306885,3.4100000858306885,9.199999809265137,76.0,6000.0,30.0,34.0,6529.0 +23,1.0,101.0,93.6999969482422,150.0,64.0,52.59999847412109,1956.0,92.0,2.9100000858306885,3.4100000858306885,9.199999809265137,76.0,6000.0,30.0,34.0,7129.0 +24,0.0,110.0,96.5,163.39999389648438,64.0,54.5,2010.0,92.0,2.9100000858306885,3.4100000858306885,9.199999809265137,76.0,6000.0,30.0,34.0,7295.0 +25,0.0,78.0,96.5,157.10000610351562,63.900001525878906,58.29999923706055,2024.0,92.0,2.9200000762939453,3.4100000858306885,9.199999809265137,76.0,6000.0,30.0,34.0,7295.0 +26,0.0,106.0,96.5,167.5,65.19999694824219,53.29999923706055,2236.0,110.0,3.150000095367432,3.5799999237060547,9.0,86.0,5800.0,27.0,33.0,7895.0 +27,0.0,106.0,96.5,167.5,65.19999694824219,53.29999923706055,2289.0,110.0,3.150000095367432,3.5799999237060547,9.0,86.0,5800.0,27.0,33.0,9095.0 +28,0.0,85.0,96.5,175.39999389648438,65.19999694824219,54.09999847412109,2304.0,110.0,3.150000095367432,3.5799999237060547,9.0,86.0,5800.0,27.0,33.0,8845.0 +29,0.0,85.0,96.5,175.39999389648438,62.5,54.09999847412109,2372.0,110.0,3.150000095367432,3.5799999237060547,9.0,86.0,5800.0,27.0,33.0,10295.0 +30,0.0,85.0,96.5,175.39999389648438,65.19999694824219,54.09999847412109,2465.0,110.0,3.150000095367432,3.5799999237060547,9.0,101.0,5800.0,24.0,28.0,12945.0 +31,1.0,107.0,96.5,169.10000610351562,66.0,51.0,2293.0,110.0,3.150000095367432,3.5799999237060547,9.100000381469728,100.0,5500.0,25.0,31.0,10345.0 +32,0.0,145.0,113.0,199.6000061035156,69.5999984741211,52.79999923706055,4066.0,258.0,3.630000114440918,4.170000076293945,8.100000381469727,176.0,4750.0,15.0,19.0,32250.0 
+33,1.0,104.0,93.0999984741211,159.10000610351562,64.19999694824219,54.09999847412109,1890.0,91.0,3.0299999713897705,3.150000095367432,9.0,68.0,5000.0,30.0,31.0,5195.0 +34,1.0,104.0,93.0999984741211,159.10000610351562,64.19999694824219,54.09999847412109,1900.0,91.0,3.0299999713897705,3.150000095367432,9.0,68.0,5000.0,31.0,38.0,6095.0 +35,1.0,104.0,93.0999984741211,159.10000610351562,64.19999694824219,54.09999847412109,1905.0,91.0,3.0299999713897705,3.150000095367432,9.0,68.0,5000.0,31.0,38.0,6795.0 +36,1.0,113.0,93.0999984741211,166.8000030517578,64.19999694824219,54.09999847412109,1945.0,91.0,3.0299999713897705,3.150000095367432,9.0,68.0,5000.0,31.0,38.0,6695.0 +37,1.0,113.0,93.0999984741211,166.8000030517578,64.19999694824219,54.09999847412109,1950.0,91.0,3.0799999237060547,3.150000095367432,9.0,68.0,5000.0,31.0,38.0,7395.0 +38,1.0,129.0,98.8000030517578,177.8000030517578,66.5,53.70000076293945,2385.0,122.0,3.390000104904175,3.390000104904175,8.600000381469727,84.0,4800.0,26.0,32.0,8845.0 +39,0.0,115.0,98.8000030517578,177.8000030517578,66.5,55.5,2410.0,122.0,3.390000104904175,3.390000104904175,8.600000381469727,84.0,4800.0,26.0,32.0,8495.0 +40,1.0,129.0,98.8000030517578,177.8000030517578,66.5,53.70000076293945,2385.0,122.0,3.390000104904175,3.390000104904175,8.600000381469727,84.0,4800.0,26.0,32.0,10595.0 +41,0.0,115.0,98.8000030517578,177.8000030517578,66.5,55.5,2410.0,122.0,3.390000104904175,3.390000104904175,8.600000381469727,84.0,4800.0,26.0,32.0,10245.0 +42,0.0,115.0,98.8000030517578,177.8000030517578,66.5,55.5,2425.0,122.0,3.390000104904175,3.390000104904175,8.600000381469727,84.0,4800.0,26.0,32.0,11245.0 +43,0.0,118.0,104.9000015258789,175.0,66.0999984741211,54.400001525878906,2670.0,140.0,3.759999990463257,3.1600000858306885,8.0,120.0,5000.0,19.0,27.0,18280.0 +44,-1.0,93.0,110.0,190.8999938964844,70.30000305175781,56.5,3515.0,183.0,3.5799999237060547,3.640000104904175,21.5,123.0,4350.0,22.0,25.0,25552.0 +45,-1.0,93.0,110.0,190.8999938964844,70.30000305175781,58.70000076293945,3750.0,183.0,3.5799999237060547,3.640000104904175,21.5,123.0,4350.0,22.0,25.0,28248.0 +46,0.0,93.0,106.6999969482422,187.5,70.30000305175781,54.900001525878906,3495.0,183.0,3.5799999237060547,3.640000104904175,21.5,123.0,4350.0,22.0,25.0,28176.0 +47,-1.0,93.0,115.5999984741211,202.6000061035156,71.69999694824219,56.29999923706055,3770.0,183.0,3.5799999237060547,3.640000104904175,21.5,123.0,4350.0,22.0,25.0,31600.0 +48,3.0,142.0,96.5999984741211,180.3000030517578,70.5,50.79999923706055,3685.0,234.0,3.4600000381469727,3.0999999046325684,8.300000190734863,155.0,4750.0,16.0,18.0,35056.0 +49,2.0,161.0,93.6999969482422,157.3000030517578,64.4000015258789,50.79999923706055,1918.0,92.0,2.970000028610229,3.2300000190734863,9.399999618530272,68.0,5500.0,37.0,41.0,5389.0 +50,2.0,161.0,93.6999969482422,157.3000030517578,64.4000015258789,50.79999923706055,1944.0,92.0,2.970000028610229,3.2300000190734863,9.399999618530272,68.0,5500.0,31.0,38.0,6189.0 +51,2.0,161.0,93.6999969482422,157.3000030517578,64.4000015258789,50.79999923706055,2004.0,92.0,2.970000028610229,3.2300000190734863,9.399999618530272,68.0,5500.0,31.0,38.0,6669.0 +52,1.0,161.0,93.0,157.3000030517578,63.79999923706055,50.79999923706055,2145.0,98.0,3.0299999713897705,3.390000104904175,7.599999904632568,102.0,5500.0,24.0,30.0,7689.0 +53,3.0,153.0,96.3000030517578,173.0,65.4000015258789,49.400001525878906,2370.0,110.0,3.1700000762939453,3.4600000381469727,7.5,116.0,5500.0,23.0,30.0,9959.0 
+54,3.0,153.0,96.3000030517578,173.0,65.4000015258789,49.400001525878906,2328.0,122.0,3.3499999046325684,3.4600000381469727,8.5,88.0,5000.0,25.0,32.0,8499.0 +55,1.0,125.0,96.3000030517578,172.39999389648438,65.4000015258789,51.59999847412109,2365.0,122.0,3.3499999046325684,3.4600000381469727,8.5,88.0,5000.0,25.0,32.0,6989.0 +56,1.0,125.0,96.3000030517578,172.39999389648438,65.4000015258789,51.59999847412109,2405.0,122.0,3.3499999046325684,3.4600000381469727,8.5,88.0,5000.0,25.0,32.0,8189.0 +57,1.0,125.0,96.3000030517578,172.39999389648438,65.4000015258789,51.59999847412109,2403.0,110.0,3.1700000762939453,3.4600000381469727,7.5,116.0,5500.0,23.0,30.0,9279.0 +58,-1.0,137.0,96.3000030517578,172.39999389648438,65.4000015258789,51.59999847412109,2403.0,110.0,3.1700000762939453,3.4600000381469727,7.5,116.0,5500.0,23.0,30.0,9279.0 +59,1.0,128.0,94.5,165.3000030517578,63.79999923706055,54.5,1889.0,97.0,3.150000095367432,3.289999961853028,9.399999618530272,69.0,5200.0,31.0,37.0,5499.0 +60,1.0,128.0,94.5,165.3000030517578,63.79999923706055,54.5,2017.0,103.0,2.990000009536743,3.470000028610229,21.899999618530277,55.0,4800.0,45.0,50.0,7099.0 +61,1.0,128.0,94.5,165.3000030517578,63.79999923706055,54.5,1918.0,97.0,3.150000095367432,3.289999961853028,9.399999618530272,69.0,5200.0,31.0,37.0,6649.0 +62,1.0,122.0,94.5,165.3000030517578,63.79999923706055,54.5,1938.0,97.0,3.150000095367432,3.289999961853028,9.399999618530272,69.0,5200.0,31.0,37.0,6849.0 +63,1.0,103.0,94.5,170.1999969482422,63.79999923706055,53.5,2024.0,97.0,3.150000095367432,3.289999961853028,9.399999618530272,69.0,5200.0,31.0,37.0,7349.0 +64,1.0,128.0,94.5,165.3000030517578,63.79999923706055,54.5,1951.0,97.0,3.150000095367432,3.289999961853028,9.399999618530272,69.0,5200.0,31.0,37.0,7299.0 +65,1.0,128.0,94.5,165.60000610351562,63.79999923706055,53.29999923706055,2028.0,97.0,3.150000095367432,3.289999961853028,9.399999618530272,69.0,5200.0,31.0,37.0,7799.0 +66,1.0,122.0,94.5,165.3000030517578,63.79999923706055,54.5,1971.0,97.0,3.150000095367432,3.289999961853028,9.399999618530272,69.0,5200.0,31.0,37.0,7499.0 +67,1.0,103.0,94.5,170.1999969482422,63.79999923706055,53.5,2037.0,97.0,3.150000095367432,3.289999961853028,9.399999618530272,69.0,5200.0,31.0,37.0,7999.0 +68,2.0,168.0,95.0999984741211,162.39999389648438,63.79999923706055,53.29999923706055,2008.0,97.0,3.150000095367432,3.289999961853028,9.399999618530272,69.0,5200.0,31.0,37.0,8249.0 +69,0.0,106.0,97.1999969482422,173.39999389648438,65.19999694824219,54.70000076293945,2324.0,120.0,3.3299999237060547,3.470000028610229,8.5,97.0,5200.0,27.0,34.0,8949.0 +70,0.0,106.0,97.1999969482422,173.39999389648438,65.19999694824219,54.70000076293945,2302.0,120.0,3.3299999237060547,3.470000028610229,8.5,97.0,5200.0,27.0,34.0,9549.0 +71,0.0,128.0,100.4000015258789,181.6999969482422,66.5,55.09999847412109,3095.0,181.0,3.430000066757202,3.2699999809265137,9.0,152.0,5200.0,17.0,22.0,13499.0 +72,0.0,108.0,100.4000015258789,184.6000061035156,66.5,56.09999847412109,3296.0,181.0,3.430000066757202,3.2699999809265137,9.0,152.0,5200.0,17.0,22.0,14399.0 +73,0.0,108.0,100.4000015258789,184.6000061035156,66.5,55.09999847412109,3060.0,181.0,3.430000066757202,3.2699999809265137,9.0,152.0,5200.0,19.0,25.0,13499.0 +74,3.0,194.0,91.3000030517578,170.6999969482422,67.9000015258789,49.70000076293945,3071.0,181.0,3.430000066757202,3.2699999809265137,9.0,160.0,5200.0,19.0,25.0,17199.0 
+75,3.0,194.0,91.3000030517578,170.6999969482422,67.9000015258789,49.70000076293945,3139.0,181.0,3.430000066757202,3.2699999809265137,7.800000190734863,200.0,5200.0,17.0,23.0,19699.0 +76,1.0,231.0,99.1999969482422,178.5,67.9000015258789,49.70000076293945,3139.0,181.0,3.430000066757202,3.2699999809265137,9.0,160.0,5200.0,19.0,25.0,18399.0 +77,0.0,161.0,107.9000015258789,186.6999969482422,68.4000015258789,56.70000076293945,3020.0,120.0,3.4600000381469727,3.190000057220459,8.399999618530273,97.0,5000.0,19.0,24.0,11900.0 +78,0.0,161.0,107.9000015258789,186.6999969482422,68.4000015258789,56.70000076293945,3197.0,152.0,3.700000047683716,3.5199999809265137,21.0,95.0,4150.0,28.0,33.0,13200.0 +79,0.0,161.0,107.9000015258789,186.6999969482422,68.4000015258789,56.70000076293945,3075.0,120.0,3.4600000381469727,2.190000057220459,8.399999618530273,95.0,5000.0,19.0,24.0,15580.0 +80,0.0,161.0,107.9000015258789,186.6999969482422,68.4000015258789,56.70000076293945,3252.0,152.0,3.700000047683716,3.5199999809265137,21.0,95.0,4150.0,28.0,33.0,16900.0 +81,0.0,161.0,107.9000015258789,186.6999969482422,68.4000015258789,56.70000076293945,3075.0,120.0,3.4600000381469727,3.190000057220459,8.399999618530273,97.0,5000.0,19.0,24.0,16630.0 +82,0.0,161.0,107.9000015258789,186.6999969482422,68.4000015258789,56.70000076293945,3252.0,152.0,3.700000047683716,3.5199999809265137,21.0,95.0,4150.0,28.0,33.0,17950.0 +83,0.0,161.0,108.0,186.6999969482422,68.30000305175781,56.0,3130.0,134.0,3.609999895095825,3.2100000381469727,7.0,142.0,5600.0,18.0,24.0,18150.0 +84,1.0,119.0,93.6999969482422,157.3000030517578,63.79999923706055,50.79999923706055,1918.0,90.0,2.970000028610229,3.2300000190734863,9.399999618530272,68.0,5500.0,37.0,41.0,5572.0 +85,1.0,119.0,93.6999969482422,157.3000030517578,63.79999923706055,50.79999923706055,2128.0,98.0,3.0299999713897705,3.390000104904175,7.599999904632568,102.0,5500.0,24.0,30.0,7957.0 +86,1.0,154.0,93.6999969482422,157.3000030517578,63.79999923706055,50.59999847412109,1967.0,90.0,2.970000028610229,3.2300000190734863,9.399999618530272,68.0,5500.0,31.0,38.0,6229.0 +87,1.0,154.0,93.6999969482422,167.3000030517578,63.79999923706055,50.79999923706055,1989.0,90.0,2.970000028610229,3.2300000190734863,9.399999618530272,68.0,5500.0,31.0,38.0,6692.0 +88,1.0,154.0,93.6999969482422,167.3000030517578,63.79999923706055,50.79999923706055,2191.0,98.0,2.970000028610229,3.2300000190734863,9.399999618530272,68.0,5500.0,31.0,38.0,7609.0 +89,-1.0,74.0,103.3000030517578,174.60000610351562,64.5999984741211,59.79999923706055,2535.0,122.0,3.3499999046325684,3.4600000381469727,8.5,88.0,5000.0,24.0,30.0,8921.0 +90,3.0,186.0,94.5,168.89999389648438,68.30000305175781,50.20000076293945,2778.0,151.0,3.940000057220459,3.109999895095825,9.5,143.0,5500.0,19.0,27.0,22018.0 +91,3.0,150.0,99.0999984741211,186.6000061035156,66.5,56.09999847412109,2658.0,121.0,3.539999961853028,3.069999933242798,9.3100004196167,110.0,5250.0,21.0,28.0,11850.0 +92,2.0,104.0,99.0999984741211,186.6000061035156,66.5,56.09999847412109,2695.0,121.0,3.539999961853028,3.069999933242798,9.300000190734863,110.0,5250.0,21.0,28.0,12170.0 +93,3.0,150.0,99.0999984741211,186.6000061035156,66.5,56.09999847412109,2707.0,121.0,2.539999961853028,2.069999933242798,9.300000190734863,110.0,5250.0,21.0,28.0,15040.0 +94,2.0,104.0,99.0999984741211,186.6000061035156,66.5,56.09999847412109,2758.0,121.0,3.539999961853028,3.069999933242798,9.300000190734863,110.0,5250.0,21.0,28.0,15510.0 
+95,3.0,150.0,99.0999984741211,186.6000061035156,66.5,56.09999847412109,2808.0,121.0,3.539999961853028,3.069999933242798,9.0,160.0,5500.0,19.0,26.0,18150.0 +96,2.0,104.0,99.0999984741211,186.6000061035156,66.5,56.09999847412109,2847.0,121.0,3.539999961853028,3.069999933242798,9.0,160.0,5500.0,19.0,26.0,18620.0 +97,2.0,83.0,93.6999969482422,156.89999389648438,63.400001525878906,53.70000076293945,2050.0,97.0,3.619999885559082,2.359999895095825,9.0,69.0,4900.0,31.0,36.0,5118.0 +98,2.0,83.0,93.6999969482422,157.89999389648438,63.59999847412109,53.70000076293945,2120.0,108.0,3.619999885559082,2.640000104904175,8.699999809265137,73.0,4400.0,26.0,31.0,7053.0 +99,2.0,83.0,93.3000030517578,157.3000030517578,63.79999923706055,55.70000076293945,2240.0,108.0,3.619999885559082,2.640000104904175,8.699999809265137,73.0,4400.0,26.0,31.0,7603.0 +100,0.0,102.0,97.1999969482422,172.0,65.4000015258789,52.5,2145.0,108.0,3.619999885559082,2.640000104904175,9.5,82.0,4800.0,32.0,37.0,7126.0 +101,0.0,102.0,97.1999969482422,172.0,65.4000015258789,52.5,2190.0,108.0,3.619999885559082,2.640000104904175,9.5,82.0,4400.0,28.0,33.0,7775.0 +102,0.0,102.0,97.1999969482422,172.0,65.4000015258789,52.5,2340.0,108.0,3.619999885559082,2.640000104904175,9.0,94.0,5200.0,26.0,32.0,9960.0 +103,0.0,102.0,97.0,172.0,65.4000015258789,54.29999923706055,2385.0,108.0,3.619999885559082,2.640000104904175,9.0,82.0,4800.0,24.0,25.0,9233.0 +104,0.0,102.0,97.0,172.0,65.4000015258789,54.29999923706055,2510.0,108.0,3.619999885559082,2.640000104904175,7.699999809265137,111.0,4800.0,24.0,29.0,11259.0 +105,0.0,89.0,97.0,173.5,65.4000015258789,53.0,2290.0,108.0,3.619999885559082,2.640000104904175,9.0,82.0,4800.0,28.0,32.0,7463.0 +106,0.0,89.0,97.0,173.5,65.4000015258789,53.0,2455.0,108.0,3.619999885559082,2.640000104904175,9.0,94.0,5200.0,25.0,31.0,10198.0 +107,0.0,85.0,96.9000015258789,173.60000610351562,65.4000015258789,54.900001525878906,2420.0,108.0,3.619999885559082,2.640000104904175,9.0,82.0,4800.0,23.0,29.0,8013.0 +108,0.0,85.0,96.9000015258789,173.60000610351562,65.4000015258789,54.900001525878906,2650.0,108.0,3.619999885559082,2.640000104904175,7.699999809265137,111.0,4800.0,23.0,23.0,11694.0 +109,1.0,87.0,95.6999969482422,158.6999969482422,63.59999847412109,54.5,1985.0,92.0,3.049999952316284,3.0299999713897705,9.0,62.0,4800.0,35.0,39.0,5348.0 +110,1.0,87.0,95.6999969482422,158.6999969482422,63.59999847412109,54.5,2040.0,92.0,3.049999952316284,3.0299999713897705,9.0,62.0,4800.0,31.0,38.0,6338.0 +111,1.0,74.0,95.6999969482422,158.6999969482422,63.59999847412109,54.5,2015.0,92.0,3.049999952316284,3.0299999713897705,9.0,62.0,4800.0,31.0,38.0,6488.0 +112,0.0,77.0,95.6999969482422,169.6999969482422,63.59999847412109,59.09999847412109,2280.0,92.0,3.049999952316284,3.0299999713897705,9.0,62.0,4800.0,31.0,37.0,6918.0 +113,0.0,81.0,95.6999969482422,169.6999969482422,63.59999847412109,59.09999847412109,2290.0,92.0,3.049999952316284,3.0299999713897705,9.0,62.0,4800.0,27.0,32.0,7898.0 +114,0.0,91.0,95.6999969482422,169.6999969482422,63.59999847412109,59.09999847412109,3110.0,92.0,3.049999952316284,3.0299999713897705,9.0,62.0,4800.0,27.0,32.0,8778.0 +115,0.0,91.0,95.6999969482422,166.3000030517578,64.4000015258789,53.0,2081.0,98.0,3.190000057220459,3.0299999713897705,9.0,70.0,4800.0,30.0,37.0,6938.0 +116,0.0,91.0,95.6999969482422,166.3000030517578,64.4000015258789,52.79999923706055,2109.0,98.0,3.190000057220459,3.0299999713897705,9.0,70.0,4800.0,30.0,37.0,7198.0 
+117,0.0,91.0,95.6999969482422,166.3000030517578,64.4000015258789,53.0,2275.0,110.0,3.2699999809265137,3.3499999046325684,22.5,56.0,4500.0,34.0,36.0,7898.0 +118,0.0,91.0,95.6999969482422,166.3000030517578,64.4000015258789,52.79999923706055,2275.0,110.0,3.2699999809265137,3.3499999046325684,22.5,56.0,4500.0,38.0,47.0,7788.0 +119,0.0,91.0,95.6999969482422,166.3000030517578,64.4000015258789,53.0,2094.0,98.0,3.190000057220459,3.0299999713897705,9.0,70.0,4800.0,38.0,47.0,7738.0 +120,0.0,91.0,95.6999969482422,166.3000030517578,64.4000015258789,52.79999923706055,2122.0,98.0,3.190000057220459,3.0299999713897705,9.0,70.0,4800.0,28.0,34.0,8358.0 +121,0.0,91.0,95.6999969482422,166.3000030517578,64.4000015258789,52.79999923706055,2140.0,98.0,3.190000057220459,3.0299999713897705,9.0,70.0,4800.0,28.0,34.0,9258.0 +122,1.0,168.0,94.5,168.6999969482422,64.0,52.59999847412109,2169.0,98.0,3.190000057220459,3.0299999713897705,9.0,70.0,4800.0,29.0,34.0,8058.0 +123,1.0,168.0,94.5,168.6999969482422,64.0,52.59999847412109,2204.0,98.0,3.190000057220459,3.0299999713897705,9.0,70.0,4800.0,29.0,34.0,8238.0 +124,1.0,168.0,94.5,168.6999969482422,64.0,52.59999847412109,2265.0,98.0,3.240000009536743,3.0799999237060547,9.399999618530272,112.0,6600.0,26.0,29.0,9298.0 +125,1.0,168.0,94.5,168.6999969482422,64.0,52.59999847412109,2300.0,98.0,3.240000009536743,3.0799999237060547,9.399999618530272,112.0,6600.0,26.0,29.0,9538.0 +126,2.0,134.0,98.4000015258789,176.1999969482422,65.5999984741211,52.0,2540.0,146.0,3.619999885559082,3.5,9.300000190734863,116.0,4800.0,24.0,30.0,8449.0 +127,2.0,134.0,98.4000015258789,176.1999969482422,65.5999984741211,52.0,2536.0,146.0,3.619999885559082,3.5,9.300000190734863,116.0,4800.0,24.0,30.0,9639.0 +128,2.0,134.0,98.4000015258789,176.1999969482422,65.5999984741211,52.0,2551.0,146.0,3.619999885559082,3.5,9.300000190734863,116.0,4800.0,24.0,30.0,9989.0 +129,2.0,134.0,98.4000015258789,176.1999969482422,65.5999984741211,52.0,2679.0,146.0,3.619999885559082,3.5,9.300000190734863,116.0,4800.0,24.0,30.0,11199.0 +130,2.0,134.0,98.4000015258789,176.1999969482422,65.5999984741211,52.0,2714.0,146.0,3.619999885559082,3.5,9.300000190734863,116.0,4800.0,24.0,30.0,11549.0 +131,2.0,134.0,98.4000015258789,176.1999969482422,65.5999984741211,53.0,2975.0,146.0,3.619999885559082,3.5,9.300000190734863,116.0,4800.0,24.0,30.0,17669.0 +132,-1.0,65.0,102.4000015258789,175.60000610351562,66.5,54.900001525878906,2326.0,122.0,3.309999942779541,3.539999961853028,8.699999809265137,92.0,4200.0,29.0,34.0,8948.0 +133,-1.0,65.0,102.4000015258789,175.60000610351562,66.5,54.900001525878906,2480.0,110.0,3.2699999809265137,3.3499999046325684,22.5,73.0,4500.0,30.0,33.0,10698.0 +134,-1.0,65.0,102.4000015258789,175.60000610351562,66.5,53.900001525878906,2414.0,122.0,3.309999942779541,3.539999961853028,8.699999809265137,92.0,4200.0,27.0,32.0,9988.0 +135,-1.0,65.0,102.4000015258789,175.60000610351562,66.5,54.900001525878906,2414.0,122.0,3.309999942779541,3.539999961853028,8.699999809265137,92.0,4200.0,27.0,32.0,10898.0 +136,-1.0,65.0,102.4000015258789,175.60000610351562,66.5,53.900001525878906,2458.0,122.0,3.309999942779541,3.539999961853028,8.699999809265137,92.0,4200.0,27.0,32.0,11248.0 +137,3.0,197.0,102.9000015258789,183.5,67.69999694824219,52.0,2976.0,171.0,3.2699999809265137,3.3499999046325684,9.300000190734863,161.0,5200.0,20.0,24.0,16558.0 +138,3.0,197.0,102.9000015258789,183.5,67.69999694824219,52.0,3016.0,171.0,3.2699999809265137,3.3499999046325684,9.300000190734863,161.0,5200.0,19.0,24.0,15998.0 
+139,-1.0,90.0,104.5,187.8000030517578,66.5,54.09999847412109,3131.0,171.0,3.2699999809265137,3.3499999046325684,9.199999809265137,156.0,5200.0,20.0,24.0,15690.0 +140,2.0,122.0,97.3000030517578,171.6999969482422,65.5,55.70000076293945,2261.0,97.0,3.009999990463257,3.400000095367432,23.0,52.0,4800.0,37.0,46.0,7775.0 +141,2.0,122.0,97.3000030517578,171.6999969482422,65.5,55.70000076293945,2209.0,109.0,3.190000057220459,3.400000095367432,9.0,85.0,5250.0,27.0,34.0,7975.0 +142,2.0,94.0,97.3000030517578,171.6999969482422,65.5,55.70000076293945,2264.0,97.0,3.009999990463257,3.400000095367432,23.0,52.0,4800.0,37.0,46.0,7995.0 +143,2.0,94.0,97.3000030517578,171.6999969482422,65.5,55.70000076293945,2212.0,109.0,3.190000057220459,3.400000095367432,9.0,85.0,5250.0,27.0,34.0,8195.0 +144,2.0,94.0,97.3000030517578,171.6999969482422,65.5,55.70000076293945,2275.0,109.0,3.190000057220459,3.400000095367432,9.0,85.0,5250.0,27.0,34.0,8495.0 +145,2.0,94.0,97.3000030517578,171.6999969482422,65.5,55.70000076293945,2319.0,97.0,3.009999990463257,3.400000095367432,23.0,68.0,4500.0,37.0,42.0,9495.0 +146,2.0,94.0,97.3000030517578,171.6999969482422,65.5,55.70000076293945,2300.0,109.0,3.190000057220459,3.400000095367432,10.0,100.0,5500.0,26.0,32.0,9995.0 +147,3.0,256.0,94.5,165.6999969482422,64.0,51.400001525878906,2221.0,109.0,3.190000057220459,3.400000095367432,8.5,90.0,5500.0,24.0,29.0,9980.0 +148,-2.0,103.0,104.3000030517578,188.8000030517578,67.19999694824219,56.20000076293945,2912.0,141.0,3.7799999713897705,3.150000095367432,9.5,114.0,5400.0,23.0,28.0,12940.0 +149,-1.0,74.0,104.3000030517578,188.8000030517578,67.19999694824219,57.5,3034.0,141.0,3.7799999713897705,3.150000095367432,9.5,114.0,5400.0,23.0,28.0,13415.0 +150,-2.0,103.0,104.3000030517578,188.8000030517578,67.19999694824219,56.20000076293945,2935.0,141.0,3.7799999713897705,3.150000095367432,9.5,114.0,5400.0,24.0,28.0,15985.0 +151,-1.0,74.0,104.3000030517578,188.8000030517578,67.19999694824219,57.5,3042.0,141.0,3.7799999713897705,3.150000095367432,9.5,114.0,5400.0,24.0,28.0,16515.0 +152,-2.0,103.0,104.3000030517578,188.8000030517578,67.19999694824219,56.20000076293945,3045.0,130.0,3.619999885559082,3.150000095367432,7.5,162.0,5100.0,17.0,22.0,18420.0 +153,-1.0,74.0,104.3000030517578,188.8000030517578,67.19999694824219,57.5,3157.0,130.0,3.619999885559082,3.150000095367432,7.5,162.0,5100.0,17.0,22.0,18950.0 +154,-1.0,95.0,109.0999984741211,188.8000030517578,68.9000015258789,55.5,2952.0,141.0,3.7799999713897705,3.150000095367432,9.5,114.0,5400.0,23.0,28.0,16845.0 +155,-1.0,95.0,109.0999984741211,188.8000030517578,68.80000305175781,55.5,3049.0,141.0,3.7799999713897705,3.150000095367432,8.699999809265137,160.0,5300.0,19.0,25.0,19045.0 +156,-1.0,95.0,109.0999984741211,188.8000030517578,68.9000015258789,55.5,3012.0,173.0,3.5799999237060547,2.869999885559082,8.800000190734863,134.0,5500.0,18.0,23.0,21485.0 +157,-1.0,95.0,109.0999984741211,188.8000030517578,68.9000015258789,55.5,3217.0,145.0,3.009999990463257,3.400000095367432,23.0,106.0,4800.0,26.0,27.0,22470.0 +158,-1.0,95.0,109.0999984741211,188.8000030517578,68.9000015258789,55.5,3062.0,141.0,3.7799999713897705,3.150000095367432,9.5,114.0,5400.0,19.0,25.0,22625.0 diff --git a/data/benchmark/new_thyroid.csv b/data/benchmark/new_thyroid.csv new file mode 100644 index 0000000..2c6a086 --- /dev/null +++ b/data/benchmark/new_thyroid.csv @@ -0,0 +1,216 @@ +,2,3,4,5,6,target +0,107.0,10.1,2.2,0.9,2.7,1 +1,113.0,9.9,3.1,2.0,5.9,1 +2,127.0,12.9,2.4,1.4,0.6,1 +3,109.0,5.3,1.6,1.4,1.5,1 +4,105.0,7.3,1.5,1.5,-0.1,1 
+5,105.0,6.1,2.1,1.4,7.0,1 +6,110.0,10.4,1.6,1.6,2.7,1 +7,114.0,9.9,2.4,1.5,5.7,1 +8,106.0,9.4,2.2,1.5,0.0,1 +9,107.0,13.0,1.1,0.9,3.1,1 +10,106.0,4.2,1.2,1.6,1.4,1 +11,110.0,11.3,2.3,0.9,3.3,1 +12,116.0,9.2,2.7,1.0,4.2,1 +13,112.0,8.1,1.9,3.7,2.0,1 +14,122.0,9.7,1.6,0.9,2.2,1 +15,109.0,8.4,2.1,1.1,3.6,1 +16,111.0,8.4,1.5,0.8,1.2,1 +17,114.0,6.7,1.5,1.0,3.5,1 +18,119.0,10.6,2.1,1.3,1.1,1 +19,115.0,7.1,1.3,1.3,2.0,1 +20,101.0,7.8,1.2,1.0,1.7,1 +21,103.0,10.1,1.3,0.7,0.1,1 +22,109.0,10.4,1.9,0.4,-0.1,1 +23,102.0,7.6,1.8,2.0,2.5,1 +24,121.0,10.1,1.7,1.3,0.1,1 +25,100.0,6.1,2.4,1.8,3.8,1 +26,106.0,9.6,2.4,1.0,1.3,1 +27,116.0,10.1,2.2,1.6,0.8,1 +28,105.0,11.1,2.0,1.0,1.0,1 +29,110.0,10.4,1.8,1.0,2.3,1 +30,120.0,8.4,1.1,1.4,1.4,1 +31,116.0,11.1,2.0,1.2,2.3,1 +32,110.0,7.8,1.9,2.1,6.4,1 +33,90.0,8.1,1.6,1.4,1.1,1 +34,117.0,12.2,1.9,1.2,3.9,1 +35,117.0,11.0,1.4,1.5,2.1,1 +36,113.0,9.0,2.0,1.8,1.6,1 +37,106.0,9.4,1.5,0.8,0.5,1 +38,130.0,9.5,1.7,0.4,3.2,1 +39,100.0,10.5,2.4,0.9,1.9,1 +40,121.0,10.1,2.4,0.8,3.0,1 +41,110.0,9.2,1.6,1.5,0.3,1 +42,129.0,11.9,2.7,1.2,3.5,1 +43,121.0,13.5,1.5,1.6,0.5,1 +44,123.0,8.1,2.3,1.0,5.1,1 +45,107.0,8.4,1.8,1.5,0.8,1 +46,109.0,10.0,1.3,1.8,4.3,1 +47,120.0,6.8,1.9,1.3,1.9,1 +48,100.0,9.5,2.5,1.3,-0.2,1 +49,118.0,8.1,1.9,1.5,13.7,1 +50,100.0,11.3,2.5,0.7,-0.3,1 +51,103.0,12.2,1.2,1.3,2.7,1 +52,115.0,8.1,1.7,0.6,2.2,1 +53,119.0,8.0,2.0,0.6,3.2,1 +54,106.0,9.4,1.7,0.9,3.1,1 +55,114.0,10.9,2.1,0.3,1.4,1 +56,93.0,8.9,1.5,0.8,2.7,1 +57,120.0,10.4,2.1,1.1,1.8,1 +58,106.0,11.3,1.8,0.9,1.0,1 +59,110.0,8.7,1.9,1.6,4.4,1 +60,103.0,8.1,1.4,0.5,3.8,1 +61,101.0,7.1,2.2,0.8,2.2,1 +62,115.0,10.4,1.8,1.6,2.0,1 +63,116.0,10.0,1.7,1.5,4.3,1 +64,117.0,9.2,1.9,1.5,6.8,1 +65,106.0,6.7,1.5,1.2,3.9,1 +66,118.0,10.5,2.1,0.7,3.5,1 +67,97.0,7.8,1.3,1.2,0.9,1 +68,113.0,11.1,1.7,0.8,2.3,1 +69,104.0,6.3,2.0,1.2,4.0,1 +70,96.0,9.4,1.5,1.0,3.1,1 +71,120.0,12.4,2.4,0.8,1.9,1 +72,133.0,9.7,2.9,0.8,1.9,1 +73,126.0,9.4,2.3,1.0,4.0,1 +74,113.0,8.5,1.8,0.8,0.5,1 +75,109.0,9.7,1.4,1.1,2.1,1 +76,119.0,12.9,1.5,1.3,3.6,1 +77,101.0,7.1,1.6,1.5,1.6,1 +78,108.0,10.4,2.1,1.3,2.4,1 +79,117.0,6.7,2.2,1.8,6.7,1 +80,115.0,15.3,2.3,2.0,2.0,1 +81,91.0,8.0,1.7,2.1,4.6,1 +82,103.0,8.5,1.8,1.9,1.1,1 +83,98.0,9.1,1.4,1.9,-0.3,1 +84,111.0,7.8,2.0,1.8,4.1,1 +85,107.0,13.0,1.5,2.8,1.7,1 +86,119.0,11.4,2.3,2.2,1.6,1 +87,122.0,11.8,2.7,1.7,2.3,1 +88,105.0,8.1,2.0,1.9,-0.5,1 +89,109.0,7.6,1.3,2.2,1.9,1 +90,105.0,9.5,1.8,1.6,3.6,1 +91,112.0,5.9,1.7,2.0,1.3,1 +92,112.0,9.5,2.0,1.2,0.7,1 +93,98.0,8.6,1.6,1.6,6.0,1 +94,109.0,12.4,2.3,1.7,0.8,1 +95,114.0,9.1,2.6,1.5,1.5,1 +96,114.0,11.1,2.4,2.0,-0.3,1 +97,110.0,8.4,1.4,1.0,1.9,1 +98,120.0,7.1,1.2,1.5,4.3,1 +99,108.0,10.9,1.2,1.9,1.0,1 +100,108.0,8.7,1.2,2.2,2.5,1 +101,116.0,11.9,1.8,1.9,1.5,1 +102,113.0,11.5,1.5,1.9,2.9,1 +103,105.0,7.0,1.5,2.7,4.3,1 +104,114.0,8.4,1.6,1.6,-0.2,1 +105,114.0,8.1,1.6,1.6,0.5,1 +106,105.0,11.1,1.1,0.8,1.2,1 +107,107.0,13.8,1.5,1.0,1.9,1 +108,116.0,11.5,1.8,1.4,5.4,1 +109,102.0,9.5,1.4,1.1,1.6,1 +110,116.0,16.1,0.9,1.3,1.5,1 +111,118.0,10.6,1.8,1.4,3.0,1 +112,109.0,8.9,1.7,1.0,0.9,1 +113,110.0,7.0,1.0,1.6,4.3,1 +114,104.0,9.6,1.1,1.3,0.8,1 +115,105.0,8.7,1.5,1.1,1.5,1 +116,102.0,8.5,1.2,1.3,1.4,1 +117,112.0,6.8,1.7,1.4,3.3,1 +118,111.0,8.5,1.6,1.1,3.9,1 +119,111.0,8.5,1.6,1.2,7.7,1 +120,103.0,7.3,1.0,0.7,0.5,1 +121,98.0,10.4,1.6,2.3,-0.7,1 +122,117.0,7.8,2.0,1.0,3.9,1 +123,111.0,9.1,1.7,1.2,4.1,1 +124,101.0,6.3,1.5,0.9,2.9,1 +125,106.0,8.9,0.7,1.0,2.3,1 +126,102.0,8.4,1.5,0.8,2.4,1 +127,115.0,10.6,0.8,2.1,4.6,1 +128,130.0,10.0,1.6,0.9,4.6,1 
+129,101.0,6.7,1.3,1.0,5.7,1 +130,110.0,6.3,1.0,0.8,1.0,1 +131,103.0,9.5,2.9,1.4,-0.1,1 +132,113.0,7.8,2.0,1.1,3.0,1 +133,112.0,10.6,1.6,0.9,-0.1,1 +134,118.0,6.5,1.2,1.2,1.7,1 +135,109.0,9.2,1.8,1.1,4.4,1 +136,116.0,7.8,1.4,1.1,3.7,1 +137,127.0,7.7,1.8,1.9,6.4,1 +138,108.0,6.5,1.0,0.9,1.5,1 +139,108.0,7.1,1.3,1.6,2.2,1 +140,105.0,5.7,1.0,0.9,0.9,1 +141,98.0,5.7,0.4,1.3,2.8,1 +142,112.0,6.5,1.2,1.2,2.0,1 +143,118.0,12.2,1.5,1.0,2.3,1 +144,94.0,7.5,1.2,1.3,4.4,1 +145,126.0,10.4,1.7,1.2,3.5,1 +146,114.0,7.5,1.1,1.6,4.4,1 +147,111.0,11.9,2.3,0.9,3.8,1 +148,104.0,6.1,1.8,0.5,0.8,1 +149,102.0,6.6,1.2,1.4,1.3,1 +150,139.0,16.4,3.8,1.1,-0.2,2 +151,111.0,16.0,2.1,0.9,-0.1,2 +152,113.0,17.2,1.8,1.0,0.0,2 +153,65.0,25.3,5.8,1.3,0.2,2 +154,88.0,24.1,5.5,0.8,0.1,2 +155,65.0,18.2,10.0,1.3,0.1,2 +156,134.0,16.4,4.8,0.6,0.1,2 +157,110.0,20.3,3.7,0.6,0.2,2 +158,67.0,23.3,7.4,1.8,-0.6,2 +159,95.0,11.1,2.7,1.6,-0.3,2 +160,89.0,14.3,4.1,0.5,0.2,2 +161,89.0,23.8,5.4,0.5,0.1,2 +162,88.0,12.9,2.7,0.1,0.2,2 +163,105.0,17.4,1.6,0.3,0.4,2 +164,89.0,20.1,7.3,1.1,-0.2,2 +165,99.0,13.0,3.6,0.7,-0.1,2 +166,80.0,23.0,10.0,0.9,-0.1,2 +167,89.0,21.8,7.1,0.7,-0.1,2 +168,99.0,13.0,3.1,0.5,-0.1,2 +169,68.0,14.7,7.8,0.6,-0.2,2 +170,97.0,14.2,3.6,1.5,0.3,2 +171,84.0,21.5,2.7,1.1,-0.6,2 +172,84.0,18.5,4.4,1.1,-0.3,2 +173,98.0,16.7,4.3,1.7,0.2,2 +174,94.0,20.5,1.8,1.4,-0.5,2 +175,99.0,17.5,1.9,1.4,0.3,2 +176,76.0,25.3,4.5,1.2,-0.1,2 +177,110.0,15.2,1.9,0.7,-0.2,2 +178,144.0,22.3,3.3,1.3,0.6,2 +179,105.0,12.0,3.3,1.1,0.0,2 +180,88.0,16.5,4.9,0.8,0.1,2 +181,97.0,15.1,1.8,1.2,-0.2,2 +182,106.0,13.4,3.0,1.1,0.0,2 +183,79.0,19.0,5.5,0.9,0.3,2 +184,92.0,11.1,2.0,0.7,-0.2,2 +185,125.0,2.3,0.9,16.5,9.5,3 +186,120.0,6.8,2.1,10.4,38.6,3 +187,108.0,3.5,0.6,1.7,1.4,3 +188,120.0,3.0,2.5,1.2,4.5,3 +189,119.0,3.8,1.1,23.0,5.7,3 +190,141.0,5.6,1.8,9.2,14.4,3 +191,129.0,1.5,0.6,12.5,2.9,3 +192,118.0,3.6,1.5,11.6,48.8,3 +193,120.0,1.9,0.7,18.5,24.0,3 +194,119.0,0.8,0.7,56.4,21.6,3 +195,123.0,5.6,1.1,13.7,56.3,3 +196,115.0,6.3,1.2,4.7,14.4,3 +197,126.0,0.5,0.2,12.2,8.8,3 +198,121.0,4.7,1.8,11.2,53.0,3 +199,131.0,2.7,0.8,9.9,4.7,3 +200,134.0,2.0,0.5,12.2,2.2,3 +201,141.0,2.5,1.3,8.5,7.5,3 +202,113.0,5.1,0.7,5.8,19.6,3 +203,136.0,1.4,0.3,32.6,8.4,3 +204,120.0,3.4,1.8,7.5,21.5,3 +205,125.0,3.7,1.1,8.5,25.9,3 +206,123.0,1.9,0.3,22.8,22.2,3 +207,112.0,2.6,0.7,41.0,19.0,3 +208,134.0,1.9,0.6,18.4,8.2,3 +209,119.0,5.1,1.1,7.0,40.8,3 +210,118.0,6.5,1.3,1.7,11.5,3 +211,139.0,4.2,0.7,4.3,6.3,3 +212,103.0,5.1,1.4,1.2,5.0,3 +213,97.0,4.7,1.1,2.1,12.6,3 +214,102.0,5.3,1.4,1.3,6.7,3 diff --git a/tests/test_Integrational.py b/tests/test_Integrational.py new file mode 100644 index 0000000..013da0c --- /dev/null +++ b/tests/test_Integrational.py @@ -0,0 +1,277 @@ +import pytest +import itertools +import bamt.networks as networks +import bamt.preprocessors as pp +from pgmpy.estimators import K2Score +import pandas as pd +import numpy as np +from pandas.testing import assert_frame_equal +from sklearn import preprocessing +from sklearn.model_selection import train_test_split + + +class Builder: + def __init__(self): + self.data_paths = { + "Continuous": "data/benchmark/auto_price.csv", + "Discrete": "tests/hack_discrete/hack_data.csv", + "Hybrid": "data/benchmark/new_thyroid.csv", + } + + self.tail = { + "Continuous": ["Continuous", "target"], + "Discrete": ["Discrete", "Tectonic regime"], + "Hybrid": ["Hybrid", "target"], + } + + self.scoring = [("K2", K2Score), "BIC", "MI"] + self.optimizer = ["HC"] + self.use_mixture = [False, True] + self.has_logit = [False, 
True] + + self.static = {} + + self.dynamic = { + "Continuous": [self.use_mixture, [False], self.optimizer, self.scoring], + "Discrete": [[False], [False], self.optimizer, self.scoring], + "Hybrid": [self.use_mixture, self.has_logit, self.optimizer, self.scoring], + } + + def create_from_config(self): + """Method to collect data from config""" + self.static = dict( + Discrete=[self.data_paths["Discrete"], *self.tail["Discrete"]], + Continuous=[self.data_paths["Continuous"], *self.tail["Continuous"]], + Hybrid=[self.data_paths["Hybrid"], *self.tail["Hybrid"]], + evo=[False, False, "Evo", self.scoring[0]], + ) + + def create_evo_item(self, net_type): + evo_item = self.static["evo"][:] + evo_item.insert(0, self.data_paths[net_type]) + evo_item.extend(self.tail[net_type]) + return evo_item + + @staticmethod + def insert_list(loc, what, to): + new = to[:] + new[loc:loc] = what + return new + + def create_net_items(self, net_type): + static = self.static[net_type][:] + dynamic_part = map(list, itertools.product(*self.dynamic[net_type])) + return list(map(lambda x: self.insert_list(1, x, static), dynamic_part)) + + def get_params(self): + self.create_from_config() + params = [] + for net_type in ["Discrete", "Continuous", "Hybrid"]: + params.extend( + self.create_net_items(net_type) + [self.create_evo_item(net_type)] + ) + return params + + +params = Builder().get_params() + + +def initialize_bn(bn_type, use_mixture, has_logit): + if bn_type == "Discrete": + bn = networks.DiscreteBN() + elif bn_type == "Continuous": + bn = networks.ContinuousBN(use_mixture=use_mixture) + elif bn_type == "Hybrid": + bn = networks.HybridBN(has_logit=has_logit, use_mixture=use_mixture) + elif bn_type == "Composite": + bn = networks.CompositeBN() + return bn + + +def prepare_data(directory): + data = pd.read_csv(directory, index_col=0) + train, test = train_test_split(data, test_size=0.33, random_state=42) + + encoder = preprocessing.LabelEncoder() + discretizer = preprocessing.KBinsDiscretizer( + n_bins=5, encode="ordinal", strategy="quantile" + ) + + p = pp.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + discretized_data, est = p.apply(train) + info = p.info + return info, discretized_data, train, test + + +class TestNetwork: + # Checking the equality of predictions (trained and loaded network) before and after saving + @pytest.mark.parametrize( + "directory, use_mixture, has_logit, optimizer, scoring, bn_type, target", params + ) + def test_1( + self, directory, use_mixture, has_logit, optimizer, scoring, bn_type, target + ): + test_id = "test_1" + + bn = initialize_bn(bn_type, use_mixture, has_logit) + info, discretized_data, train, test = prepare_data(directory) + bn.add_nodes(info) + if bn_type != "Composite": + bn.add_edges( + discretized_data, + optimizer=optimizer, + scoring_function=scoring, + progress_bar=False, + ) + else: + bn.add_edges(train) + + bn.fit_parameters(train) + predict = bn.predict( + test[[x for x in test.columns if x != target]], progress_bar=False + ) + bn.save("bn") + + bn = initialize_bn(bn_type, use_mixture, has_logit) + bn.load("bn.json") + predict_loaded = bn.predict( + test[[x for x in test.columns if x != target]], progress_bar=False + ) + + try: + assert_frame_equal(pd.DataFrame(predict), pd.DataFrame(predict_loaded)) + print(f"{test_id} ran successfully") + except AssertionError: + print( + f"params: {dict(zip(['use_mixture', 'has_logit', 'optimizer', 'scoring', 'bn_type'], [use_mixture, has_logit, optimizer, scoring, bn_type]))}" + ) + raise + + # Checking
the prediction algorithm (trained network) before and after saving + @pytest.mark.parametrize( + "directory, use_mixture, has_logit, optimizer, scoring, bn_type, target", params + ) + def test_2( + self, directory, use_mixture, has_logit, optimizer, scoring, bn_type, target + ): + test_id = "test_2" + + bn = initialize_bn(bn_type, use_mixture, has_logit) + info, discretized_data, train, test = prepare_data(directory) + bn.add_nodes(info) + if bn_type != "Composite": + bn.add_edges( + discretized_data, + optimizer=optimizer, + scoring_function=scoring, + progress_bar=False, + ) + else: + bn.add_edges(train) + + bn.fit_parameters(train) + predict = bn.predict( + test[[x for x in test.columns if x != target]], progress_bar=False + ) + bn.save("bn") + + predict2 = bn.predict( + test[[x for x in test.columns if x != target]], progress_bar=False + ) + + try: + assert_frame_equal(pd.DataFrame(predict), pd.DataFrame(predict2)) + print(f"{test_id} ran successfully") + except AssertionError: + print( + f"params: {dict(zip(['use_mixture', 'has_logit', 'optimizer', 'scoring', 'bn_type'], [use_mixture, has_logit, optimizer, scoring, bn_type]))}" + ) + raise + + # Checking network predictions without edges + @pytest.mark.parametrize( + "directory, use_mixture, has_logit, optimizer, scoring, bn_type, target", params + ) + def test_3( + self, directory, use_mixture, has_logit, optimizer, scoring, bn_type, target + ): + test_id = "test_3" + + bn = initialize_bn(bn_type, use_mixture, has_logit) + info, discretized_data, train, test = prepare_data(directory) + bn.add_nodes(info) + bn.fit_parameters(train) + + predict = bn.predict( + test[[x for x in test.columns if x != target]], progress_bar=False + ) + + try: + if info["types"][target] == "cont": + if use_mixture: + mean = bn.distributions[target]["mean"] + w = bn.distributions[target]["coef"] + sample = 0 + for ind, wi in enumerate(w): + sample += wi * mean[ind][0] + else: + sample = train[target].mean() + + assert np.all(np.array(predict[target]) == sample) + + elif info["types"][target] == "disc_num": + most_frequent = train[target].value_counts().index[0] + assert np.all(np.array(predict[target]) == most_frequent) + + print(f"{test_id} ran successfully") + except AssertionError: + print( + f"params: {dict(zip(['use_mixture', 'has_logit', 'optimizer', 'scoring', 'bn_type'], [use_mixture, has_logit, optimizer, scoring, bn_type]))}" + ) + raise + + # Checking the network trained on a single sample + @pytest.mark.parametrize( + "directory, use_mixture, has_logit, optimizer, scoring, bn_type, target", params + ) + def test_4( + self, directory, use_mixture, has_logit, optimizer, scoring, bn_type, target + ): + test_id = "test_4" + + if use_mixture == False: + + bn = initialize_bn(bn_type, use_mixture, has_logit) + info, discretized_data, train, test = prepare_data(directory) + + bn.add_nodes(info) + + train_data_1 = pd.DataFrame(train.iloc[0].to_dict(), index=[0]) + disc_data_1 = pd.DataFrame(discretized_data.iloc[0].to_dict(), index=[0]) + + if bn_type != "Composite": + bn.add_edges( + disc_data_1, + optimizer=optimizer, + scoring_function=scoring, + progress_bar=False, + ) + else: + bn.add_edges(train_data_1) + + bn.fit_parameters(train_data_1) + + predict = bn.predict( + test[[x for x in test.columns if x != target]], progress_bar=False + ) + + try: + assert np.all(np.array(predict[target]) == train_data_1[target][0]) + print(f"{test_id} ran successfully") + except AssertionError: + print( + f"params: {dict(zip(['use_mixture', 'has_logit', 'optimizer',
'scoring', 'bn_type'], [use_mixture, has_logit, optimizer, scoring, bn_type]))}" + ) + raise + else: + pass
From faaa9a4488af1768bf32ec5d068da6de3742954b Mon Sep 17 00:00:00 2001 From: anton-golubkov Date: Sat, 15 Jun 2024 16:50:14 +0600 Subject: [PATCH 06/18] Add seed parameter to BaseNetwork sample function (#110) Allows generating replicable samples --- bamt/networks/base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/bamt/networks/base.py b/bamt/networks/base.py index 3d8af8d..8090918 100644 --- a/bamt/networks/base.py +++ b/bamt/networks/base.py @@ -638,6 +638,7 @@ def sample( predict: bool = False, parall_count: int = 1, filter_neg: bool = True, + seed: Optional[int] = None, ) -> Union[None, pd.DataFrame, List[Dict[str, Union[str, int, float]]]]: """ Sampling from Bayesian Network @@ -645,10 +646,13 @@ def sample( evidence: values for nodes from user parall_count: number of threads. Defaults to 1. filter_neg: either filter negative vals or not. + seed: seed value to use for random number generator """ from joblib import Parallel, delayed - random.seed() + random.seed(seed) + np.random.seed(seed) + if not self.distributions.items(): logger_network.error( "Parameter learning wasn't done. Call fit_parameters method"
From 8cffbde5b9aaa9592c537e55a71ac730eb92f58c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jerzy=20Kami=C5=84ski?= <86363785+jrzkaminski@users.noreply.github.com> Date: Sat, 15 Jun 2024 14:24:58 +0300 Subject: [PATCH 07/18] Delete poetry.lock, update gitignore (#111) * Delete poetry.lock * Update .gitignore --- .gitignore | 163 +++++++- poetry.lock | 1139 --------------------------------------------------- 2 files changed, 162 insertions(+), 1140 deletions(-) delete mode 100644 poetry.lock
diff --git a/.gitignore b/.gitignore index 8875bf3..e49349e 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,165 @@ tutorials/bamt /temp.ipynb /bamt_house_price_example_feature_analysis.ipynb /bamt_house_price_example_data_sampling.ipynb -/tmp.ipynb +/tmp.ipynb + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ diff --git a/poetry.lock b/poetry.lock deleted file mode 100644 index 8ea5c7b..0000000 --- a/poetry.lock +++ /dev/null @@ -1,1139 +0,0 @@ -[[package]] -name = "appnope" -version = "0.1.3" -description = "Disable App Nap on macOS >= 10.9" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "asttokens" -version = "2.2.1" -description = "Annotate AST trees with source code positions" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -six = "*" - -[package.extras] -test = ["astroid", "pytest"] - -[[package]] -name = "attrs" -version = "22.2.0" -description = "Classes Without Boilerplate" -category = "dev" -optional = false -python-versions = ">=3.6" - -[package.extras] -cov = ["attrs", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] -dev = ["attrs"] -docs = ["furo", "sphinx", "myst-parser", "zope.interface", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier"] -tests = ["attrs", "zope.interface"] -tests-no-zope = ["hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist", "cloudpickle", "mypy (>=0.971,<0.990)", "pytest-mypy-plugins"] -tests_no_zope = ["hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist", "cloudpickle", "mypy (>=0.971,<0.990)", "pytest-mypy-plugins"] - -[[package]] -name = "backcall" -version = "0.2.0" -description = "Specifications for callback functions passed in to an API" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "colorama" -version = "0.4.6" -description = "Cross-platform colored terminal text." 
-category = "main" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" - -[[package]] -name = "contourpy" -version = "1.0.6" -description = "Python library for calculating contours of 2D quadrilateral grids" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -numpy = ">=1.16" - -[package.extras] -bokeh = ["bokeh", "selenium"] -docs = ["docutils (<0.18)", "sphinx (<=5.2.0)", "sphinx-rtd-theme"] -test = ["pytest", "matplotlib", "pillow", "flake8", "isort"] -test-minimal = ["pytest"] -test-no-codebase = ["pytest", "matplotlib", "pillow"] - -[[package]] -name = "cycler" -version = "0.11.0" -description = "Composable style cycles" -category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "decorator" -version = "5.1.1" -description = "Decorators for Humans" -category = "main" -optional = false -python-versions = ">=3.5" - -[[package]] -name = "executing" -version = "1.2.0" -description = "Get the currently executing AST node of a frame, and other information" -category = "main" -optional = false -python-versions = "*" - -[package.extras] -tests = ["asttokens", "pytest", "littleutils", "rich"] - -[[package]] -name = "fonttools" -version = "4.38.0" -description = "Tools to manipulate font files" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.extras] -all = ["fs (>=2.2.0,<3)", "lxml (>=4.0,<5)", "zopfli (>=0.1.4)", "lz4 (>=1.7.4.2)", "matplotlib", "sympy", "skia-pathops (>=0.5.0)", "uharfbuzz (>=0.23.0)", "brotlicffi (>=0.8.0)", "scipy", "brotli (>=1.0.1)", "munkres", "unicodedata2 (>=14.0.0)", "xattr"] -graphite = ["lz4 (>=1.7.4.2)"] -interpolatable = ["scipy", "munkres"] -lxml = ["lxml (>=4.0,<5)"] -pathops = ["skia-pathops (>=0.5.0)"] -plot = ["matplotlib"] -repacker = ["uharfbuzz (>=0.23.0)"] -symfont = ["sympy"] -type1 = ["xattr"] -ufo = ["fs (>=2.2.0,<3)"] -unicode = ["unicodedata2 (>=14.0.0)"] -woff = ["zopfli (>=0.1.4)", "brotlicffi (>=0.8.0)", "brotli (>=1.0.1)"] - -[[package]] -name = "future" -version = "0.18.2" -description = "Clean single-source support for Python 3 and 2" -category = "main" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" - -[[package]] -name = "gmr" -version = "1.6.2" -description = "Gaussian Mixture Regression" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -numpy = "*" -scipy = "*" - -[package.extras] -all = ["matplotlib", "scikit-learn", "svgpathtools"] -doc = ["pdoc3"] -test = ["nose", "coverage"] - -[[package]] -name = "iniconfig" -version = "1.1.1" -description = "iniconfig: brain-dead simple config-ini parsing" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "ipython" -version = "8.7.0" -description = "IPython: Productive Interactive Computing" -category = "main" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -appnope = {version = "*", markers = "sys_platform == \"darwin\""} -backcall = "*" -colorama = {version = "*", markers = "sys_platform == \"win32\""} -decorator = "*" -jedi = ">=0.16" -matplotlib-inline = "*" -pexpect = {version = ">4.3", markers = "sys_platform != \"win32\""} -pickleshare = "*" -prompt-toolkit = ">=3.0.11,<3.1.0" -pygments = ">=2.4.0" -stack-data = "*" -traitlets = ">=5" - -[package.extras] -all = ["black", "ipykernel", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "docrepr", "matplotlib", "stack-data", "pytest (<7)", "typing-extensions", "pytest (<7.1)", 
"pytest-asyncio", "testpath", "nbconvert", "nbformat", "ipywidgets", "notebook", "ipyparallel", "qtconsole", "curio", "matplotlib (!=3.2.0)", "numpy (>=1.20)", "pandas", "trio"] -black = ["black"] -doc = ["ipykernel", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "docrepr", "matplotlib", "stack-data", "pytest (<7)", "typing-extensions", "pytest (<7.1)", "pytest-asyncio", "testpath"] -kernel = ["ipykernel"] -nbconvert = ["nbconvert"] -nbformat = ["nbformat"] -notebook = ["ipywidgets", "notebook"] -parallel = ["ipyparallel"] -qtconsole = ["qtconsole"] -test = ["pytest (<7.1)", "pytest-asyncio", "testpath"] -test_extra = ["pytest (<7.1)", "pytest-asyncio", "testpath", "curio", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.20)", "pandas", "trio"] - -[[package]] -name = "jedi" -version = "0.18.2" -description = "An autocompletion tool for Python that can be used for text editors." -category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -parso = ">=0.8.0,<0.9.0" - -[package.extras] -docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alabaster (==0.7.12)", "babel (==2.9.1)", "chardet (==4.0.0)", "commonmark (==0.8.1)", "docutils (==0.17.1)", "future (==0.18.2)", "idna (==2.10)", "imagesize (==1.2.0)", "mock (==1.0.1)", "packaging (==20.9)", "pyparsing (==2.4.7)", "pytz (==2021.1)", "readthedocs-sphinx-ext (==2.1.4)", "recommonmark (==0.5.0)", "requests (==2.25.1)", "six (==1.15.0)", "snowballstemmer (==2.1.0)", "sphinx-rtd-theme (==0.4.3)", "sphinx (==1.8.5)", "sphinxcontrib-serializinghtml (==1.1.4)", "sphinxcontrib-websupport (==1.2.4)", "urllib3 (==1.26.4)"] -qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] -testing = ["Django (<3.1)", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] - -[[package]] -name = "jinja2" -version = "3.1.2" -description = "A very fast and expressive template engine." -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -MarkupSafe = ">=2.0" - -[package.extras] -i18n = ["Babel (>=2.7)"] - -[[package]] -name = "joblib" -version = "1.2.0" -description = "Lightweight pipelining with Python functions" -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "jsonpickle" -version = "3.0.1" -description = "Python library for serializing any arbitrary object graph into JSON" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.extras] -docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] -testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8 (>=1.1.1)", "pytest-black-multipy", "pytest-cov", "ecdsa", "feedparser", "gmpy2", "numpy", "pandas", "pymongo", "scikit-learn", "sqlalchemy"] -"testing.libs" = ["simplejson", "ujson"] - -[[package]] -name = "kiwisolver" -version = "1.4.4" -description = "A fast implementation of the Cassowary constraint solver" -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "markupsafe" -version = "2.1.1" -description = "Safely add untrusted strings to HTML/XML markup." 
-category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "matplotlib" -version = "3.6.2" -description = "Python plotting package" -category = "main" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -contourpy = ">=1.0.1" -cycler = ">=0.10" -fonttools = ">=4.22.0" -kiwisolver = ">=1.0.1" -numpy = ">=1.19" -packaging = ">=20.0" -pillow = ">=6.2.0" -pyparsing = ">=2.2.1" -python-dateutil = ">=2.7" -setuptools_scm = ">=7" - -[[package]] -name = "matplotlib-inline" -version = "0.1.6" -description = "Inline Matplotlib backend for Jupyter" -category = "main" -optional = false -python-versions = ">=3.5" - -[package.dependencies] -traitlets = "*" - -[[package]] -name = "missingno" -version = "0.5.1" -description = "Missing data visualization module for Python." -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -matplotlib = "*" -numpy = "*" -scipy = "*" -seaborn = "*" - -[package.extras] -tests = ["pytest", "pytest-mpl"] - -[[package]] -name = "networkx" -version = "2.8.8" -description = "Python package for creating and manipulating graphs and networks" -category = "main" -optional = false -python-versions = ">=3.8" - -[package.extras] -default = ["numpy (>=1.19)", "scipy (>=1.8)", "matplotlib (>=3.4)", "pandas (>=1.3)"] -developer = ["pre-commit (>=2.20)", "mypy (>=0.982)"] -doc = ["sphinx (>=5.2)", "pydata-sphinx-theme (>=0.11)", "sphinx-gallery (>=0.11)", "numpydoc (>=1.5)", "pillow (>=9.2)", "nb2plots (>=0.6)", "texext (>=0.6.6)"] -extra = ["lxml (>=4.6)", "pygraphviz (>=1.9)", "pydot (>=1.4.2)", "sympy (>=1.10)"] -test = ["pytest (>=7.2)", "pytest-cov (>=4.0)", "codecov (>=2.1)"] - -[[package]] -name = "numpy" -version = "1.24.0" -description = "Fundamental package for array computing in Python" -category = "main" -optional = false -python-versions = ">=3.8" - -[[package]] -name = "nvidia-cublas-cu11" -version = "11.10.3.66" -description = "CUBLAS native runtime libraries" -category = "main" -optional = false -python-versions = ">=3" - -[[package]] -name = "nvidia-cuda-nvrtc-cu11" -version = "11.7.99" -description = "NVRTC native runtime libraries" -category = "main" -optional = false -python-versions = ">=3" - -[[package]] -name = "nvidia-cuda-runtime-cu11" -version = "11.7.99" -description = "CUDA Runtime native Libraries" -category = "main" -optional = false -python-versions = ">=3" - -[[package]] -name = "nvidia-cudnn-cu11" -version = "8.5.0.96" -description = "cuDNN runtime libraries" -category = "main" -optional = false -python-versions = ">=3" - -[[package]] -name = "opt-einsum" -version = "3.3.0" -description = "Optimizing numpys einsum function" -category = "main" -optional = false -python-versions = ">=3.5" - -[package.dependencies] -numpy = ">=1.7" - -[package.extras] -docs = ["sphinx (==1.2.3)", "sphinxcontrib-napoleon", "sphinx-rtd-theme", "numpydoc"] -tests = ["pytest", "pytest-cov", "pytest-pep8"] - -[[package]] -name = "packaging" -version = "22.0" -description = "Core utilities for Python packages" -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "pandas" -version = "1.5.2" -description = "Powerful data structures for data analysis, time series, and statistics" -category = "main" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -numpy = [ - {version = ">=1.20.3", markers = "python_version < \"3.10\""}, - {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, -] -python-dateutil = ">=2.8.1" -pytz = ">=2020.1" - 
-[package.extras] -test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] - -[[package]] -name = "parso" -version = "0.8.3" -description = "A Python Parser" -category = "main" -optional = false -python-versions = ">=3.6" - -[package.extras] -qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] -testing = ["docopt", "pytest (<6.0.0)"] - -[[package]] -name = "patsy" -version = "0.5.3" -description = "A Python package for describing statistical models and for building design matrices." -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -numpy = ">=1.4" -six = "*" - -[package.extras] -test = ["pytest", "pytest-cov", "scipy"] - -[[package]] -name = "pexpect" -version = "4.8.0" -description = "Pexpect allows easy control of interactive console applications." -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -ptyprocess = ">=0.5" - -[[package]] -name = "pgmpy" -version = "0.1.20" -description = "A library for Probabilistic Graphical Models" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -joblib = "*" -networkx = "*" -numpy = "*" -opt-einsum = "*" -pandas = "*" -pyparsing = "*" -scikit-learn = "*" -scipy = "*" -statsmodels = "*" -torch = "*" -tqdm = "*" - -[package.extras] -all = ["networkx", "numpy", "scipy", "scikit-learn", "pandas", "pyparsing", "torch", "statsmodels", "tqdm", "joblib", "opt-einsum", "xdoctest", "pytest", "pytest-cov", "coverage", "codecov", "mock", "black", "daft"] -tests = ["xdoctest", "pytest", "pytest-cov", "coverage", "codecov", "mock", "black"] - -[[package]] -name = "pickleshare" -version = "0.7.5" -description = "Tiny 'shelve'-like database with concurrency support" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "pillow" -version = "9.3.0" -description = "Python Imaging Library (Fork)" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.extras] -docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-issues (>=3.0.1)", "sphinx-removed-in", "sphinxext-opengraph"] -tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] - -[[package]] -name = "pluggy" -version = "1.0.0" -description = "plugin and hook calling mechanisms for python" -category = "dev" -optional = false -python-versions = ">=3.6" - -[package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] - -[[package]] -name = "pomegranate" -version = "0.14.8" -description = "Pomegranate is a graphical models library for Python, implemented in Cython for speed." 
-category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -joblib = ">=0.9.0b4" -networkx = ">=2.4" -numpy = ">=1.20.0" -pyyaml = "*" -scipy = ">=0.17.0" - -[package.extras] -gpu = ["cupy"] -plotting = ["pygraphviz", "matplotlib"] - -[[package]] -name = "prompt-toolkit" -version = "3.0.36" -description = "Library for building powerful interactive command lines in Python" -category = "main" -optional = false -python-versions = ">=3.6.2" - -[package.dependencies] -wcwidth = "*" - -[[package]] -name = "ptyprocess" -version = "0.7.0" -description = "Run a subprocess in a pseudo terminal" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "pure-eval" -version = "0.2.2" -description = "Safely evaluate AST nodes without side effects" -category = "main" -optional = false -python-versions = "*" - -[package.extras] -tests = ["pytest"] - -[[package]] -name = "py" -version = "1.11.0" -description = "library with cross-python path, ini-parsing, io, code, log facilities" -category = "dev" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - -[[package]] -name = "pygments" -version = "2.13.0" -description = "Pygments is a syntax highlighting package written in Python." -category = "main" -optional = false -python-versions = ">=3.6" - -[package.extras] -plugins = ["importlib-metadata"] - -[[package]] -name = "pyitlib" -version = "0.2.2" -description = "A library of information-theoretic methods" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -future = ">=0.16.0" -pandas = ">=0.20.2numpy" -scikit-learn = ">=0.16.0" -scipy = ">=1.0.1" - -[[package]] -name = "pyparsing" -version = "3.0.9" -description = "pyparsing module - Classes and methods to define and execute parsing grammars" -category = "main" -optional = false -python-versions = ">=3.6.8" - -[package.extras] -diagrams = ["railroad-diagrams", "jinja2"] - -[[package]] -name = "pytest" -version = "7.1.3" -description = "pytest: simple powerful testing with Python" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -attrs = ">=19.2.0" -colorama = {version = "*", markers = "sys_platform == \"win32\""} -iniconfig = "*" -packaging = "*" -pluggy = ">=0.12,<2.0" -py = ">=1.8.2" -tomli = ">=1.0.0" - -[package.extras] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] - -[[package]] -name = "python-dateutil" -version = "2.8.2" -description = "Extensions to the standard Python datetime module" -category = "main" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" - -[package.dependencies] -six = ">=1.5" - -[[package]] -name = "pytz" -version = "2022.7" -description = "World timezone definitions, modern and historical" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "pyvis" -version = "0.3.1" -description = "A Python network graph visualization library" -category = "main" -optional = false -python-versions = ">3.6" - -[package.dependencies] -ipython = ">=5.3.0" -jinja2 = ">=2.9.6" -jsonpickle = ">=1.4.1" -networkx = ">=1.11" - -[[package]] -name = "pyyaml" -version = "6.0" -description = "YAML parser and emitter for Python" -category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "scikit-learn" -version = "1.2.0" -description = "A set of python modules for machine learning and data mining" -category = "main" -optional = false -python-versions = 
">=3.8" - -[package.dependencies] -joblib = ">=1.1.1" -numpy = ">=1.17.3" -scipy = ">=1.3.2" -threadpoolctl = ">=2.0.0" - -[package.extras] -benchmark = ["matplotlib (>=3.1.3)", "pandas (>=1.0.5)", "memory-profiler (>=0.57.0)"] -docs = ["matplotlib (>=3.1.3)", "scikit-image (>=0.16.2)", "pandas (>=1.0.5)", "seaborn (>=0.9.0)", "memory-profiler (>=0.57.0)", "sphinx (>=4.0.1)", "sphinx-gallery (>=0.7.0)", "numpydoc (>=1.2.0)", "Pillow (>=7.1.2)", "pooch (>=1.6.0)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)", "plotly (>=5.10.0)"] -examples = ["matplotlib (>=3.1.3)", "scikit-image (>=0.16.2)", "pandas (>=1.0.5)", "seaborn (>=0.9.0)", "pooch (>=1.6.0)", "plotly (>=5.10.0)"] -tests = ["matplotlib (>=3.1.3)", "scikit-image (>=0.16.2)", "pandas (>=1.0.5)", "pytest (>=5.3.1)", "pytest-cov (>=2.9.0)", "flake8 (>=3.8.2)", "black (>=22.3.0)", "mypy (>=0.961)", "pyamg (>=4.0.0)", "numpydoc (>=1.2.0)", "pooch (>=1.6.0)"] - -[[package]] -name = "scipy" -version = "1.9.3" -description = "Fundamental algorithms for scientific computing in Python" -category = "main" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -numpy = ">=1.18.5,<1.26.0" - -[package.extras] -test = ["pytest", "pytest-cov", "pytest-xdist", "asv", "mpmath", "gmpy2", "threadpoolctl", "scikit-umfpack"] -doc = ["sphinx (!=4.1.0)", "pydata-sphinx-theme (==0.9.0)", "sphinx-panels (>=0.5.2)", "matplotlib (>2)", "numpydoc", "sphinx-tabs"] -dev = ["mypy", "typing-extensions", "pycodestyle", "flake8"] - -[[package]] -name = "seaborn" -version = "0.12.1" -description = "Statistical data visualization" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -matplotlib = ">=3.1,<3.6.1 || >3.6.1" -numpy = ">=1.17" -pandas = ">=0.25" - -[package.extras] -dev = ["pytest", "pytest-cov", "pytest-xdist", "flake8", "mypy", "pandas-stubs", "pre-commit"] -docs = ["numpydoc", "nbconvert", "ipykernel", "sphinx-copybutton", "sphinx-issues", "sphinx-design", "pyyaml", "pydata_sphinx_theme (==0.10.0rc2)"] -stats = ["scipy (>=1.3)", "statsmodels (>=0.10)"] - -[[package]] -name = "setuptools-scm" -version = "7.1.0" -description = "the blessed package to manage your versions by scm tags" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -packaging = ">=20.0" -tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} -typing-extensions = "*" - -[package.extras] -test = ["pytest (>=6.2)", "virtualenv (>20)"] -toml = ["setuptools (>=42)"] - -[[package]] -name = "six" -version = "1.16.0" -description = "Python 2 and 3 compatibility utilities" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" - -[[package]] -name = "stack-data" -version = "0.6.2" -description = "Extract data from python stack frames and tracebacks for informative displays" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -asttokens = ">=2.1.0" -executing = ">=1.2.0" -pure-eval = "*" - -[package.extras] -tests = ["pytest", "typeguard", "pygments", "littleutils", "cython"] - -[[package]] -name = "statsmodels" -version = "0.13.2" -description = "Statistical computations and models for Python" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -numpy = ">=1.17" -packaging = ">=21.3" -pandas = ">=0.25" -patsy = ">=0.5.2" -scipy = ">=1.3" - -[package.extras] -build = ["cython (>=0.29.26)"] -develop = ["cython (>=0.29.26)"] -docs = ["sphinx", "nbconvert", "jupyter-client", 
"ipykernel", "matplotlib", "nbformat", "numpydoc", "pandas-datareader"] - -[[package]] -name = "statsmodels" -version = "0.13.3" -description = "Statistical computations and models for Python" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -numpy = [ - {version = ">=1.17", markers = "python_version != \"3.10\" or platform_system != \"Windows\" or platform_python_implementation == \"PyPy\""}, - {version = ">=1.22.3", markers = "python_version == \"3.10\" and platform_system == \"Windows\" and platform_python_implementation != \"PyPy\""}, -] -packaging = ">=21.3" -pandas = ">=0.25" -patsy = ">=0.5.2" -scipy = {version = ">=1.3", markers = "(python_version > \"3.7\" or platform_system != \"Windows\" or platform_machine != \"x86\") and python_version < \"3.12\""} - -[package.extras] -build = ["cython (>=0.29.32)"] -develop = ["cython (>=0.29.32)", "cython (>=0.29.32,<3.0.0)", "setuptools_scm[toml] (>=7.0.0,<7.1.0)", "oldest-supported-numpy (>=2022.4.18)", "matplotlib (>=3)", "colorama", "joblib", "jinja2", "pytest (>=7.0.1,<7.1.0)", "pytest-randomly", "pytest-xdist", "flake8", "isort", "pywinpty"] -docs = ["sphinx", "nbconvert", "jupyter-client", "ipykernel", "matplotlib", "nbformat", "numpydoc", "pandas-datareader"] - -[[package]] -name = "statsmodels" -version = "0.13.4" -description = "Statistical computations and models for Python" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -numpy = [ - {version = ">=1.17", markers = "python_version != \"3.10\" or platform_system != \"Windows\" or platform_python_implementation == \"PyPy\""}, - {version = ">=1.22.3", markers = "python_version == \"3.10\" and platform_system == \"Windows\" and platform_python_implementation != \"PyPy\""}, -] -packaging = ">=21.3" -pandas = ">=0.25" -patsy = ">=0.5.2" -scipy = {version = ">=1.3", markers = "(python_version > \"3.9\" or platform_system != \"Windows\" or platform_machine != \"x86\") and python_version < \"3.12\""} - -[package.extras] -build = ["cython (>=0.29.32)"] -develop = ["cython (>=0.29.32)", "cython (>=0.29.32,<3.0.0)", "setuptools_scm[toml] (>=7.0.0,<7.1.0)", "oldest-supported-numpy (>=2022.4.18)", "matplotlib (>=3)", "colorama", "joblib", "jinja2", "pytest (>=7.0.1,<7.1.0)", "pytest-randomly", "pytest-xdist", "flake8", "isort", "pywinpty"] -docs = ["sphinx", "nbconvert", "jupyter-client", "ipykernel", "matplotlib", "nbformat", "numpydoc", "pandas-datareader"] - -[[package]] -name = "statsmodels" -version = "0.13.5" -description = "Statistical computations and models for Python" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -numpy = [ - {version = ">=1.17", markers = "python_version != \"3.10\" or platform_system != \"Windows\" or platform_python_implementation == \"PyPy\""}, - {version = ">=1.22.3", markers = "python_version == \"3.10\" and platform_system == \"Windows\" and platform_python_implementation != \"PyPy\""}, -] -packaging = ">=21.3" -pandas = ">=0.25" -patsy = ">=0.5.2" -scipy = {version = ">=1.3", markers = "(python_version > \"3.9\" or platform_system != \"Windows\" or platform_machine != \"x86\") and python_version < \"3.12\""} - -[package.extras] -build = ["cython (>=0.29.32)"] -develop = ["cython (>=0.29.32)", "cython (>=0.29.32,<3.0.0)", "setuptools-scm[toml] (>=7.0.0,<7.1.0)", "oldest-supported-numpy (>=2022.4.18)", "matplotlib (>=3)", "colorama", "joblib", "jinja2", "pytest (>=7.0.1,<7.1.0)", "pytest-randomly", "pytest-xdist", "flake8", "isort", 
"pywinpty"] -docs = ["sphinx", "nbconvert", "jupyter-client", "ipykernel", "matplotlib", "nbformat", "numpydoc", "pandas-datareader"] - -[[package]] -name = "threadpoolctl" -version = "3.1.0" -description = "threadpoolctl" -category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "tomli" -version = "2.0.1" -description = "A lil' TOML parser" -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "torch" -version = "1.13.1" -description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" -category = "main" -optional = false -python-versions = ">=3.7.0" - -[package.dependencies] -nvidia-cublas-cu11 = {version = "11.10.3.66", markers = "platform_system == \"Linux\""} -nvidia-cuda-nvrtc-cu11 = {version = "11.7.99", markers = "platform_system == \"Linux\""} -nvidia-cuda-runtime-cu11 = {version = "11.7.99", markers = "platform_system == \"Linux\""} -nvidia-cudnn-cu11 = {version = "8.5.0.96", markers = "platform_system == \"Linux\""} -typing-extensions = "*" - -[package.extras] -opt-einsum = ["opt-einsum (>=3.3)"] - -[[package]] -name = "tqdm" -version = "4.64.1" -description = "Fast, Extensible Progress Meter" -category = "main" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" - -[package.dependencies] -colorama = {version = "*", markers = "platform_system == \"Windows\""} - -[package.extras] -dev = ["py-make (>=0.1.0)", "twine", "wheel"] -notebook = ["ipywidgets (>=6)"] -slack = ["slack-sdk"] -telegram = ["requests"] - -[[package]] -name = "traitlets" -version = "5.8.0" -description = "Traitlets Python configuration system" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.extras] -docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] -test = ["argcomplete (>=2.0)", "pre-commit", "pytest", "pytest-mock"] - -[[package]] -name = "typing-extensions" -version = "4.4.0" -description = "Backported and Experimental Type Hints for Python 3.7+" -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "wcwidth" -version = "0.2.5" -description = "Measures the displayed width of unicode strings in a terminal" -category = "main" -optional = false -python-versions = "*" - -[metadata] -lock-version = "1.1" -python-versions = ">=3.9,<3.11" -content-hash = "de813fd55bcec86ac2ea53f5f60b9f804a293af5c56de030b703aef0bf36614f" - -[metadata.files] -appnope = [] -asttokens = [] -attrs = [] -backcall = [ - {file = "backcall-0.2.0-py2.py3-none-any.whl", hash = "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255"}, - {file = "backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e"}, -] -colorama = [] -contourpy = [] -cycler = [ - {file = "cycler-0.11.0-py3-none-any.whl", hash = "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3"}, - {file = "cycler-0.11.0.tar.gz", hash = "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"}, -] -decorator = [ - {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, - {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, -] -executing = [] -fonttools = [] -future = [] -gmr = [ - {file = "gmr-1.6.2.tar.gz", hash = "sha256:953e3f350ac94557612a1832cba0c319389d4f857fe0cf8cd51a1706c3935e6d"}, -] -iniconfig = [] -ipython = [] -jedi = [] -jinja2 = [] -joblib = [] -jsonpickle = [] 
-kiwisolver = [] -markupsafe = [ - {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-win32.whl", hash = "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-win32.whl", hash = "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1"}, - {file = 
"MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-win32.whl", hash = "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-win32.whl", hash = "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247"}, - {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"}, -] -matplotlib = [] -matplotlib-inline = [] -missingno = [ - {file = "missingno-0.5.1-py3-none-any.whl", hash = "sha256:74e8fa1ea68c9482479a9429009856ae6cc64725f085092429950e30a8d78f55"}, - {file = "missingno-0.5.1.tar.gz", hash = "sha256:22e1735a9213df7425e76123ebcc627d11a2608a1d725b90c6a2d7329db718db"}, -] -networkx = [] -numpy = [] -nvidia-cublas-cu11 = [] -nvidia-cuda-nvrtc-cu11 = [] -nvidia-cuda-runtime-cu11 = [] -nvidia-cudnn-cu11 = [] -opt-einsum = [] -packaging = [] -pandas = [] -parso = [ - {file = "parso-0.8.3-py2.py3-none-any.whl", hash = "sha256:c001d4636cd3aecdaf33cbb40aebb59b094be2a74c556778ef5576c175e19e75"}, - {file = "parso-0.8.3.tar.gz", hash = "sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0"}, -] -patsy = [] -pexpect = [ - {file = 
"pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, - {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, -] -pgmpy = [] -pickleshare = [ - {file = "pickleshare-0.7.5-py2.py3-none-any.whl", hash = "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"}, - {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, -] -pillow = [] -pluggy = [] -pomegranate = [ - {file = "pomegranate-0.14.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d5f1d83eb44d924640fbe952c38e904d8aa97882ffc85fb16e120a40e0463e3c"}, - {file = "pomegranate-0.14.8-cp37-cp37m-win32.whl", hash = "sha256:741bc1a5c2fd483b4713a58feac81ac6a5bf4354b326bfe5e320d193a08fbfc2"}, - {file = "pomegranate-0.14.8-cp37-cp37m-win_amd64.whl", hash = "sha256:967e06272526accf54418cd3dbce96cdf62083690142a5f124cba05256663939"}, - {file = "pomegranate-0.14.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6808b8bc990e6755d8160ee660472b463ece4f2662dea17dd2184a426903027d"}, - {file = "pomegranate-0.14.8-cp38-cp38-win32.whl", hash = "sha256:61911a91dfb3e7158329707890c844da71919c0dc9b5974cc5941781021c0ee3"}, - {file = "pomegranate-0.14.8-cp38-cp38-win_amd64.whl", hash = "sha256:22038290a34d98c19bf13f1a89df3fe5f1799aa4ba5ac5edd6efc1fbf68ae564"}, - {file = "pomegranate-0.14.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1932f8e91bfb9bc3ae710e3e73c88daed161d25e969a5531e67918db4bb6b99"}, - {file = "pomegranate-0.14.8-cp39-cp39-win32.whl", hash = "sha256:f0f5d9ba93e4ac542a2a260957ff6e1d4e52e714c3993a7af5cf3a0e42e14253"}, - {file = "pomegranate-0.14.8-cp39-cp39-win_amd64.whl", hash = "sha256:8774b66c7882d15a64b47d91cdb333a6c6711f4d96dfa83bf285a9bcf5c0cd97"}, - {file = "pomegranate-0.14.8.tar.gz", hash = "sha256:2296651290482dd53204ffaaaea267ceee057ce1b3ef1f9d9793febe66d6693d"}, -] -prompt-toolkit = [] -ptyprocess = [ - {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, - {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, -] -pure-eval = [ - {file = "pure_eval-0.2.2-py3-none-any.whl", hash = "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350"}, - {file = "pure_eval-0.2.2.tar.gz", hash = "sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3"}, -] -py = [] -pygments = [] -pyitlib = [] -pyparsing = [] -pytest = [] -python-dateutil = [ - {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, - {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, -] -pytz = [] -pyvis = [] -pyyaml = [ - {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, - {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"}, - {file = 
"PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, - {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, - {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, - {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"}, - {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"}, - {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"}, - {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"}, - {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"}, - {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"}, - {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"}, - {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"}, - {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"}, - {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"}, - {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"}, - {file = 
"PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"}, - {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"}, - {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, - {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, -] -scikit-learn = [] -scipy = [] -seaborn = [] -setuptools-scm = [] -six = [ - {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, - {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, -] -stack-data = [] -statsmodels = [ - {file = "statsmodels-0.13.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3e7ca5b7e678c0bb7a24f5c735d58ac104a50eb61b17c484cce0e221a095560f"}, - {file = "statsmodels-0.13.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:066a75d5585378b2df972f81a90b9a3da5e567b7d4833300c1597438c1a35e29"}, - {file = "statsmodels-0.13.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f15f38dfc9c5c091662cb619e12322047368c67aef449c7554d9b324a15f7a94"}, - {file = "statsmodels-0.13.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c4ccc6b4744613367e8a233bd952c8a838db8f528f9fe033bda25aa13fc7d08"}, - {file = "statsmodels-0.13.2-cp310-cp310-win_amd64.whl", hash = "sha256:855b1cc2a91ab140b9bcf304b1731705805ce73223bf500b988804968554c0ed"}, - {file = "statsmodels-0.13.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b69c9af7606325095f7c40c581957bad9f28775653d41537c1ec4cd1b185ff5b"}, - {file = "statsmodels-0.13.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ab31bac0f72b83bca1f217a12ec6f309a56485a50c4a705fbdd63112213d4da4"}, - {file = "statsmodels-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d680b910b57fc0aa87472662cdfe09aae0e21db4bdf19ccd6420fd4dffda892"}, - {file = "statsmodels-0.13.2-cp37-cp37m-win32.whl", hash = "sha256:9e9a3f661d372431850d55157d049e079493c97fc06f550d23d8c8c70805cc48"}, - {file = "statsmodels-0.13.2-cp37-cp37m-win_amd64.whl", hash = "sha256:c9f6326870c095ef688f072cd476b932aff0906d60193eaa08e93ec23b29ca83"}, - {file = "statsmodels-0.13.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5bc050f25f1ba1221efef9ea01b751c60935ad787fcd4259f4ece986f2da9141"}, - {file = "statsmodels-0.13.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:426b1c8ea3918d3d27dbfa38f2bee36cabf41d32163e2cbb3adfb0178b24626a"}, - {file = "statsmodels-0.13.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45b80fac4a63308b1e93fa9dc27a8598930fd5dfd77c850ca077bb850254c6d7"}, - {file = "statsmodels-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78ee69ec0e0f79f627245c65f8a495b8581c2ea19084aac63941815feb15dcf3"}, - {file = "statsmodels-0.13.2-cp38-cp38-win32.whl", hash = 
"sha256:20483cc30e11aa072b30d307bb80470f86a23ae8fffa51439ca54509d7aa9b05"}, - {file = "statsmodels-0.13.2-cp38-cp38-win_amd64.whl", hash = "sha256:bf43051a92231ccb9de95e4b6d22d3b15e499ee5ee9bff0a20e6b6ad293e34cb"}, - {file = "statsmodels-0.13.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6bf0dfed5f5edb59b5922b295392cd276463b10a5e730f7e57ee4ff2d8e9a87e"}, - {file = "statsmodels-0.13.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a403b559c5586dab7ac0fc9e754c737b017c96cce0ddd66ff9094764cdaf293d"}, - {file = "statsmodels-0.13.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f23554dd025ea354ce072ba32bfaa840d2b856372e5734290e181d27a1f9e0c"}, - {file = "statsmodels-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:815f4df713e3eb6f40ae175c71f2a70d32f9219b5b4d23d4e0faab1171ba93ba"}, - {file = "statsmodels-0.13.2-cp39-cp39-win32.whl", hash = "sha256:461c82ab2265fa8457b96afc23ef3ca19f42eb070436e0241b57e58a38863901"}, - {file = "statsmodels-0.13.2-cp39-cp39-win_amd64.whl", hash = "sha256:39daab5a8a9332c8ea83d6464d065080c9ba65f236daf6a64aa18f64ef776fad"}, - {file = "statsmodels-0.13.2.tar.gz", hash = "sha256:77dc292c9939c036a476f1770f9d08976b05437daa229928da73231147cde7d4"}, -] -threadpoolctl = [ - {file = "threadpoolctl-3.1.0-py3-none-any.whl", hash = "sha256:8b99adda265feb6773280df41eece7b2e6561b772d21ffd52e372f999024907b"}, - {file = "threadpoolctl-3.1.0.tar.gz", hash = "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380"}, -] -tomli = [ - {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, - {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, -] -torch = [] -tqdm = [] -traitlets = [] -typing-extensions = [] -wcwidth = [ - {file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"}, - {file = "wcwidth-0.2.5.tar.gz", hash = "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"}, -] From 85408e02422e4f41ddcf89eb80d1f2bfeb5527f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jerzy=20Kami=C5=84ski?= <86363785+jrzkaminski@users.noreply.github.com> Date: Sat, 15 Jun 2024 15:02:12 +0300 Subject: [PATCH 08/18] New README.md (#112) * Create README.md * Delete README.rst * Update README.md --- README.md | 160 ++++++++++++++++++++++++++++++++++ README.rst | 248 ----------------------------------------------------- 2 files changed, 160 insertions(+), 248 deletions(-) create mode 100644 README.md delete mode 100644 README.rst diff --git a/README.md b/README.md new file mode 100644 index 0000000..3268555 --- /dev/null +++ b/README.md @@ -0,0 +1,160 @@ +![BAMT framework logo](docs/images/BAMT_white_bg.png) + +# BAMT - Bayesian Analytical and Modelling Toolkit + +Repository of a data modeling and analysis tool based on Bayesian networks. 
+ +## Badges + +| team | ![ITMO](https://raw.githubusercontent.com/ITMO-NSS-team/open-source-ops/cd771018e80e9164f7b661bd2191061ab58f94de/badges/ITMO_badge.svg) ![NCCR](https://raw.githubusercontent.com/ITMO-NSS-team/open-source-ops/cd771018e80e9164f7b661bd2191061ab58f94de/badges/NCCR_badge.svg) | +|------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| package | ![pypi](https://badge.fury.io/py/bamt.svg) ![Supported Python Versions](https://img.shields.io/badge/python_3.9-passing-success) ![Supported Python Versions](https://img.shields.io/badge/python_3.10-passing-success) | +| tests | ![Build](https://github.com/ITMO-NSS-team/BAMT/actions/workflows/bamtcodecov.yml/badge.svg) ![coverage](https://codecov.io/github/aimclub/BAMT/branch/master/graph/badge.svg?token=fA4qsxGqTC) | +| docs | ![Documentation Status](https://readthedocs.org/projects/bamt/badge/?version=latest) | +| license | ![license](https://img.shields.io/github/license/ITMO-NSS-team/BAMT) | +| stats | ![downloads](https://static.pepy.tech/personalized-badge/bamt?period=total&units=international_system&left_color=grey&right_color=blue&left_text=downloads) ![downloads/month](https://static.pepy.tech/personalized-badge/bamt?period=month&units=international_system&left_color=grey&right_color=blue&left_text=downloads/month) ![downloads/week](https://static.pepy.tech/personalized-badge/bamt?period=week&units=international_system&left_color=grey&right_color=blue&left_text=downloads/week) | +| style | ![Black](https://img.shields.io/badge/code%20style-black-000000.svg) | + +## Introduction + +BAMT - Bayesian Analytical and Modelling Toolkit. This repository contains a data modeling and analysis tool based on Bayesian networks. It can be divided into two main parts - algorithms for constructing and training Bayesian networks on data and algorithms for applying Bayesian networks for filling gaps, generating synthetic data, assessing edge strength, etc. + +![bamt readme scheme](docs/images/bamt_readme_scheme.png) + +## Installation + +BAMT package is available via PyPi: + +```bash +pip install bamt +``` + +## BAMT Features + +The following algorithms for Bayesian Networks learning are implemented: + +- Building the structure of a Bayesian network based on expert knowledge by directly specifying the structure of the network. +- Building the structure of a Bayesian network on data using three algorithms - Hill Climbing, evolutionary, and PC (PC is currently under development). For Hill Climbing, the following score functions are implemented - MI, K2, BIC, AIC. The algorithms work on both discrete and mixed data. +- Learning the parameters of distributions in the nodes of the network based on Gaussian distribution and Mixture Gaussian distribution with automatic selection of the number of components. +- Non-parametric learning of distributions at nodes using classification and regression models. +- BigBraveBN - algorithm for structural learning of Bayesian networks with a large number of nodes. Tested on networks with up to 500 nodes. + +### Difference from existing implementations: + +- Algorithms work on mixed data. +- Structural learning implements score-functions for mixed data. +- Parametric learning implements the use of a mixture of Gaussian distributions to approximate continuous distributions. 
+- Non-parametric learning of distributions with various user-specified regression and classification models.
+- The algorithm for structural learning of large Bayesian networks (> 10 nodes) is based on local training of small networks with their subsequent algorithmic connection.
+
+![bn example gif](img/BN_gif.gif)
+
+For example, for data analysis and modeling using Bayesian networks, a pipeline has been implemented to generate synthetic data by sampling from Bayesian networks.
+
+![synthetics generation](img/synth_gen.png)
+
+## How to use
+
+First, the necessary classes are imported from the library:
+
+```python
+from bamt.networks.hybrid_bn import HybridBN
+```
+
+Next, a network instance is created and training (structure and parameters) is performed:
+
+```python
+bn = HybridBN(has_logit=False, use_mixture=True)
+bn.add_edges(preprocessed_data)
+bn.fit_parameters(data)
+```
+
+## Examples & Tutorials
+
+More examples can be found in [Documentation](https://bamt.readthedocs.io/en/latest/examples/learn_save.html).
+
+## Publications about BAMT
+
+We have published several articles about BAMT:
+
+- [Advanced Approach for Distributions Parameters Learning in Bayesian Networks with Gaussian Mixture Models and Discriminative Models](https://www.mdpi.com/2227-7390/11/2/343) (2023)
+- [BigBraveBN: algorithm of structural learning for bayesian networks with a large number of nodes](https://www.sciencedirect.com/science/article/pii/S1877050922016945) (2022)
+- [MIxBN: Library for learning Bayesian networks from mixed data](https://www.sciencedirect.com/science/article/pii/S1877050921020925) (2021)
+- [Oil and Gas Reservoirs Parameters Analysis Using Mixed Learning of Bayesian Networks](https://link.springer.com/chapter/10.1007/978-3-030-77961-0_33) (2021)
+- [Bayesian Networks-based personal data synthesis](https://dl.acm.org/doi/abs/10.1145/3411170.3411243) (2020)
+
+## Project structure
+
+The latest stable version of the library is available in the master branch.
+
+It includes the following modules and directories:
+
+- [bamt](https://github.com/ITMO-NSS-team/BAMT/tree/master/bamt) - directory with the framework code:
+  - Preprocessing - module for data preprocessing
+  - Networks - module for building and training Bayesian networks
+  - Nodes - module for nodes support of Bayesian networks
+  - Utilities - module for mathematical and graph utilities
+- [data](https://github.com/ITMO-NSS-team/BAMT/tree/master/data) - directory with data for experiments and tests
+- [tests](https://github.com/ITMO-NSS-team/BAMT/tree/master/tests) - directory with unit and integration tests
+- [tutorials](https://github.com/ITMO-NSS-team/BAMT/tree/master/tutorials) - directory with tutorials
+- [docs](https://github.com/ITMO-NSS-team/BAMT/tree/master/docs) - directory with RTD documentation
+
+### Preprocessing
+
+The Preprocessor module allows users to transform data according to a pipeline (similar to the pipeline in scikit-learn).
+
+### Networks
+
+Three types of networks are implemented:
+
+- HybridBN - Bayesian network with mixed data
+- DiscreteBN - Bayesian network with discrete data
+- ContinuousBN - Bayesian network with continuous data
+
+They are inherited from the abstract class BaseNetwork.
+
+### Nodes
+
+Contains classes for nodes of Bayesian networks.
+
+### Utilities
+
+Utilities module contains mathematical and graph utilities to support the main functionality of the library.
+
+## Web-BAMT
+
+A web interface for BAMT is currently under development.
The repository is available at [web-BAMT](https://github.com/aimclub/Web-BAMT). + +## Contacts + +If you have questions or suggestions, you can contact us at the following address: ideeva@itmo.ru (Irina Deeva) + +Our resources: + +- [Natural Systems Simulation Team](https://itmo-nss-team.github.io/) +- [NSS team Telegram channel](https://t.me/NSS_group) +- [NSS lab YouTube channel](https://www.youtube.com/@nsslab/videos) + +## Citation + +```bibtex +@misc{BAMT, + author={BAMT}, + title = {Repository experiments and data}, + year = {2021}, + publisher = {GitHub}, + journal = {GitHub repository}, + howpublished = {\url{https://github.com/ITMO-NSS-team/BAMT.git}}, + url = {https://github.com/ITMO-NSS-team/BAMT.git} +} + +@article{deeva2023advanced, + title={Advanced Approach for Distributions Parameters Learning in Bayesian Networks with Gaussian Mixture Models and Discriminative Models}, + author={Deeva, Irina and Bubnova, Anna and Kalyuzhnaya, Anna V}, + journal={Mathematics}, + volume={11}, + number={2}, + pages={343}, + year={2023}, +} +``` diff --git a/README.rst b/README.rst deleted file mode 100644 index 8f5b3ff..0000000 --- a/README.rst +++ /dev/null @@ -1,248 +0,0 @@ -.. image:: /docs/images/BAMT_white_bg.png - :align: center - :alt: BAMT framework logo - -.. start-badges -.. list-table:: - :stub-columns: 1 - - * - team - - | |ITMO| |NCCR| - * - package - - | |pypi| |py_9| |py_10| - * - tests - - | |Build| |coverage| - * - docs - - |docs| - * - license - - | |license| - * - stats - - | |downloads_stats| |downloads_monthly| |downloads_weekly| - * - style - - | |Black| - -Repository of a data modeling and analysis tool based on Bayesian networks - -BAMT - Bayesian Analytical and Modelling Toolkit. This repository contains a data modeling and analysis tool based on Bayesian networks. It can be divided into two main parts - algorithms for constructing and training Bayesian networks on data and algorithms for applying Bayesian networks for filling gaps, generating synthetic data, assessing edges strength e.t.c. - -.. image:: docs/images/bamt_readme_scheme.png - :target: docs/images/bamt_readme_scheme.png - :align: center - :alt: bamt readme scheme - -Installation -^^^^^^^^^^^^ - -BAMT package is available via PyPi: - -.. code-block:: bash - - pip install bamt - -BAMT Features -^^^^^^^^^^^^^ - -The following algorithms for Bayesian Networks learning are implemented: - - -* Building the structure of a Bayesian network based on expert knowledge by directly specifying the structure of the network; -* Building the structure of a Bayesian network on data using three algorithms - Hill Climbing, evolutionary and PC (PC is currently under development). For Hill Climbing, the following score functions are implemented - MI, K2, BIC, AIC. The algorithms work on both discrete and mixed data. -* Learning the parameters of distributions in the nodes of the network based on Gaussian distribution and Mixture Gaussian distribution with automatic selection of the number of components. -* Non-parametric learning of distributions at nodes using classification and regression models. -* BigBraveBN - algorithm for structural learning of Bayesian networks with a large number of nodes. Tested on networks with up to 500 nodes. 
- -Difference from existing implementations: - - -* Algorithms work on mixed data; -* Structural learning implements score-functions for mixed data; -* Parametric learning implements the use of a mixture of Gaussian distributions to approximate continuous distributions; -* Non-parametric learning of distributions with various user-specified regression and classification models; -* The algorithm for structural training of large Bayesian networks (> 10 nodes) is based on local training of small networks with their subsequent algorithmic connection. - -.. image:: img/BN_gif.gif - :target: img/BN_gif.gif - :align: center - :alt: bn example gif - -For example, in terms of data analysis and modeling using Bayesian networks, a pipeline has been implemented to generate synthetic data by sampling from Bayesian networks. - - - -.. image:: img/synth_gen.png - :target: img/synth_gen.png - :align: center - :height: 300px - :width: 600px - :alt: synthetics generation - - -How to use -^^^^^^^^^^ - -Then the necessary classes are imported from the library: - -.. code-block:: python - - from bamt.networks.hybrid_bn import HybridBN - -Next, a network instance is created and training (structure and parameters) is performed: - -.. code-block:: python - - bn = HybridBN(has_logit=False, use_mixture=True) - bn.add_edges(preprocessed_data) - bn.fit_parameters(data) - - - -Examples & Tutorials -^^^^^^^^^^^^^^^^^^^^^^ - -More examples can be found in `Documentation `__. - -Publications about BAMT -^^^^^^^^^^^^^^^^^^^^^^^ - -We have published several articles about BAMT: - -* `Advanced Approach for Distributions Parameters Learning in Bayesian Networks with Gaussian Mixture Models and Discriminative Models `__ (2023) -* `BigBraveBN: algorithm of structural learning for bayesian networks with a large number of nodes `__ (2022) -* `MIxBN: Library for learning Bayesian networks from mixed data `__ (2021) -* `Oil and Gas Reservoirs Parameters Analysis Using Mixed Learning of Bayesian Networks `__ (2021) -* `Bayesian Networks-based personal data synthesis `__ (2020) - - -Project structure -^^^^^^^^^^^^^^^^^ - -The latest stable version of the library is available in the master branch. - -It includes the following modules and direcotries: - -* `bamt `__ - directory with the framework code: - * Preprocessing - module for data preprocessing - * Networks - module for building and training Bayesian networks - * Nodes - module for nodes support of Bayesian networks - * Utilities - module for mathematical and graph utilities -* `data `__ - directory with data for experiments and tests -* `tests `__ - directory with unit and integration tests -* `tutorials `__ - directory with tutorials -* `docs `__ - directory with RTD documentation - -Preprocessing -============= - -Preprocessor module allows user to transform data according pipeline (similar to pipeline in scikit-learn). - -Networks -======== - -Three types of networks are implemented: - -* HybridBN - Bayesian network with mixed data -* DiscreteBN - Bayesian network with discrete data -* ContinuousBN - Bayesian network with continuous data - -They are inherited from the abstract class BaseNetwork. - -Nodes -===== - -Contains classes for nodes of Bayesian networks. - -Utilities -========= - -Utilities module contains mathematical and graph utilities to support the main functionality of the library. - - -Web-BAMT -^^^^^^^^ - -A web interface for BAMT is currently under development. 
-The repository is available at `web-BAMT `__ - -Contacts -^^^^^^^^ - -If you have questions or suggestions, you can contact us at the following address: ideeva@itmo.ru (Irina Deeva) - -Our resources: - -* `Natural Systems Simulation Team `__ -* `NSS team Telegram channel `__ -* `NSS lab YouTube channel `__ - - -Citation -^^^^^^^^ - -@misc{BAMT, - author={BAMT}, - title = {Repository experiments and data}, - year = {2021}, - publisher = {GitHub}, - journal = {GitHub repository}, - howpublished = {\url{https://github.com/ITMO-NSS-team/BAMT.git}}, - url = {https://github.com/ITMO-NSS-team/BAMT.git} -} - -@article{deeva2023advanced, - title={Advanced Approach for Distributions Parameters Learning in Bayesian Networks with Gaussian Mixture Models and Discriminative Models}, - author={Deeva, Irina and Bubnova, Anna and Kalyuzhnaya, Anna V}, - journal={Mathematics}, - volume={11}, - number={2}, - pages={343}, - year={2023}, - publisher={MDPI} -} - - - - -.. |docs| image:: https://readthedocs.org/projects/bamt/badge/?version=latest - :target: https://bamt.readthedocs.io/en/latest/?badge=latest - :alt: Documentation Status - -.. |ITMO| image:: https://raw.githubusercontent.com/ITMO-NSS-team/open-source-ops/cd771018e80e9164f7b661bd2191061ab58f94de/badges/ITMO_badge.svg - -.. |NCCR| image:: https://raw.githubusercontent.com/ITMO-NSS-team/open-source-ops/cd771018e80e9164f7b661bd2191061ab58f94de/badges/NCCR_badge.svg - -.. |pypi| image:: https://badge.fury.io/py/bamt.svg - :target: https://badge.fury.io/py/bamt - -.. |py_10| image:: https://img.shields.io/badge/python_3.10-passing-success - :alt: Supported Python Versions - :target: https://img.shields.io/badge/python_3.10-passing-success - -.. |py_8| image:: https://img.shields.io/badge/python_3.8-passing-success - :alt: Supported Python Versions - :target: https://img.shields.io/badge/python_3.8-passing-success - -.. |py_9| image:: https://img.shields.io/badge/python_3.9-passing-success - :alt: Supported Python Versions - :target: https://img.shields.io/badge/python_3.9-passing-success - -.. |license| image:: https://img.shields.io/github/license/ITMO-NSS-team/BAMT - :alt: Supported Python Versions - :target: https://github.com/ITMO-NSS-team/BAMT/blob/master/LICENCE - -.. |downloads_stats| image:: https://static.pepy.tech/personalized-badge/bamt?period=total&units=international_system&left_color=grey&right_color=blue&left_text=downloads - :target: https://pepy.tech/project/bamt - -.. |downloads_monthly| image:: https://static.pepy.tech/personalized-badge/bamt?period=month&units=international_system&left_color=grey&right_color=blue&left_text=downloads/month - :target: https://pepy.tech/project/bamt - -.. |downloads_weekly| image:: https://static.pepy.tech/personalized-badge/bamt?period=week&units=international_system&left_color=grey&right_color=blue&left_text=downloads/week - :target: https://pepy.tech/project/bamt - -.. |Build| image:: https://github.com/ITMO-NSS-team/BAMT/actions/workflows/bamtcodecov.yml/badge.svg - :target: https://github.com/ITMO-NSS-team/BAMT/actions/workflows/bamtcodecov.yml - -.. |coverage| image:: https://codecov.io/github/aimclub/BAMT/branch/master/graph/badge.svg?token=fA4qsxGqTC - :target: https://codecov.io/github/aimclub/BAMT - -.. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg -.. 
_Black: https://github.com/psf/black From 0f298975ffe759d9f161dba5ccfc9ba3b556b9f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jerzy=20Kami=C5=84ski?= <86363785+jrzkaminski@users.noreply.github.com> Date: Wed, 19 Jun 2024 15:10:03 +0300 Subject: [PATCH 09/18] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4ffdb37..b012db2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -numpy>=1.24.2 +numpy>=1.24.2, <2.0.0 matplotlib==3.6.2 pandas>=2.0.0 gmr==1.6.2 From e499ec84e4627862c055ad41e094bcbe6261e06c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jerzy=20Kami=C5=84ski?= <86363785+jrzkaminski@users.noreply.github.com> Date: Wed, 19 Jun 2024 15:10:26 +0300 Subject: [PATCH 10/18] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b012db2..7757748 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ gmr==1.6.2 scikit-learn>=1.2.0 scipy>=1.9.3 pyvis>=0.2.1 -pgmpy==0.1.20 +pgmpy==0.1.25 catboost>=1.0.6 joblib>=1.1.1 networkx>=3.1 From 4e7caab3b5c0b43a7b0ef18b35fe384bddab72b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jerzy=20Kami=C5=84ski?= <86363785+jrzkaminski@users.noreply.github.com> Date: Wed, 19 Jun 2024 15:26:39 +0300 Subject: [PATCH 11/18] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7757748..a1e6583 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -numpy>=1.24.2, <2.0.0 +numpy==1.26.4 matplotlib==3.6.2 pandas>=2.0.0 gmr==1.6.2 From 4ce6c0d254e1493b0d3468d4d4ab33c2279c17ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jerzy=20Kami=C5=84ski?= <86363785+jrzkaminski@users.noreply.github.com> Date: Wed, 19 Jun 2024 15:28:38 +0300 Subject: [PATCH 12/18] Update requirements.txt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index a1e6583..bbaa779 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -numpy==1.26.4 +numpy>=2.0.0 matplotlib==3.6.2 -pandas>=2.0.0 +pandas>=2.2.2 gmr==1.6.2 scikit-learn>=1.2.0 scipy>=1.9.3 From cce19bc9131bb7605f98fed37c5461f70feb6c00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jerzy=20Kami=C5=84ski?= <86363785+jrzkaminski@users.noreply.github.com> Date: Wed, 19 Jun 2024 15:44:41 +0300 Subject: [PATCH 13/18] Update bamtcodecov.yml --- .github/workflows/bamtcodecov.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bamtcodecov.yml b/.github/workflows/bamtcodecov.yml index d5b12a3..3c08a9b 100644 --- a/.github/workflows/bamtcodecov.yml +++ b/.github/workflows/bamtcodecov.yml @@ -10,7 +10,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: '3.9' + python-version: '3.11' - name: Update pip run: python -m pip install --upgrade pip - name: Install requirements From a19c435fbc316951f368bb1c34ae0d565b66c673 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jerzy=20Kami=C5=84ski?= <86363785+jrzkaminski@users.noreply.github.com> Date: Wed, 19 Jun 2024 15:51:47 +0300 Subject: [PATCH 14/18] Update requirements.txt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index bbaa779..dad16dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -numpy>=2.0.0 +numpy==1.26.4 
matplotlib==3.6.2 -pandas>=2.2.2 +pandas>=2.0.3 gmr==1.6.2 scikit-learn>=1.2.0 scipy>=1.9.3 From 826126277d8bc036d4f2d09f88d06f86ae098ba4 Mon Sep 17 00:00:00 2001 From: jrzkaminski Date: Fri, 5 Jul 2024 15:33:21 +0300 Subject: [PATCH 15/18] black autoformat yml --- .github/workflows/black.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/workflows/black.yml diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 0000000..af032f6 --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,11 @@ +name: black-action +on: [push, pull_request] +jobs: + linter_name: + name: Run black formatter + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: rickstaa/action-black@v1.3.3 + with: + black_args: "." \ No newline at end of file From 5c838d6f8f7b837a9212da90b2dbb9c06a0ad2d0 Mon Sep 17 00:00:00 2001 From: jrzkaminski Date: Fri, 5 Jul 2024 15:34:51 +0300 Subject: [PATCH 16/18] mkdocs migration --- .github/workflows/ci.yml | 29 +++++++++++++++ .readthedocs.yml | 39 -------------------- mkdocs.yml | 77 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 39 deletions(-) create mode 100644 .github/workflows/ci.yml delete mode 100644 .readthedocs.yml create mode 100644 mkdocs.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..64749c8 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,29 @@ +name: ci +on: + push: + branches: + - master + - main +permissions: + contents: write +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Configure Git Credentials + run: | + git config user.name github-actions[bot] + git config user.email 41898282+github-actions[bot]@users.noreply.github.com + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV + - uses: actions/cache@v4 + with: + key: mkdocs-material-${{ env.cache_id }} + path: .cache + restore-keys: | + mkdocs-material- + - run: pip install mkdocs-material 'mkdocstrings[python]' + - run: mkdocs gh-deploy --force \ No newline at end of file diff --git a/.readthedocs.yml b/.readthedocs.yml deleted file mode 100644 index 2efbd82..0000000 --- a/.readthedocs.yml +++ /dev/null @@ -1,39 +0,0 @@ -# Read the Docs configuration file for Sphinx projects -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -# Required -version: 2 - -# Set the OS, Python version and other tools you might need -build: - os: ubuntu-22.04 - tools: - python: "3.11" - # You can also specify other tool versions: - # nodejs: "20" - # rust: "1.70" - # golang: "1.20" - -# Build documentation in the "docs/" directory with Sphinx -sphinx: - configuration: docs/source/conf.py - # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs - # builder: "dirhtml" - # Fail on all warnings to avoid broken references - # fail_on_warning: true - -# Optionally build your docs in additional formats such as PDF and ePub -# formats: -# - pdf -# - epub - -# Optional but recommended, declare the Python requirements required -# to build your documentation -# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html -# python: -# install: -# - requirements: docs/requirements.txt - -python: - install: - - requirements: other_requirements/readthedocs.txt diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..0363f66 --- /dev/null +++ 
b/mkdocs.yml @@ -0,0 +1,77 @@ +site_name: BAMT +repo_name: aimclub/BAMT +repo_url: https://github.com/aimclub/BAMT +theme: + name: material + locale: en + features: + - announce.dismiss + - content.action.edit + - content.action.view + - content.code.annotate + - content.code.copy + # - content.code.select + # - content.footnote.tooltips + # - content.tabs.link + - content.tooltips + # - header.autohide + # - navigation.expand + - navigation.footer + - navigation.indexes + - navigation.instant + # - navigation.instant.prefetch + - navigation.instant.progress + # - navigation.prune + - navigation.sections + - navigation.tabs + # - navigation.tabs.sticky + - navigation.top + - navigation.tracking + - search.highlight + - search.share + - search.suggest + - toc.follow + # - toc.integrate + palette: + - media: "(prefers-color-scheme)" + toggle: + icon: material/link + name: Switch to light mode + - media: "(prefers-color-scheme: light)" + scheme: default + primary: indigo + accent: indigo + toggle: + icon: material/toggle-switch + name: Switch to dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: black + accent: indigo + toggle: + icon: material/toggle-switch-off + name: Switch to system preference + font: + text: Roboto + code: Roboto Mono + +nav: + - Home: index.md + - Getting Started: getting-started.md + - Installation: installation.md + - API Reference: + - logger: reference/logger.md +#extra: +# social: +# - icon: fontawesome/brands/github +# link: https://github.com/myusername/myproject +plugins: + - search + - mkdocstrings +markdown_extensions: + - admonition + - codehilite + - footnotes + - meta + - toc: + permalink: True From abcab5442a66bcd8c21be0b46ed8f4080d09674e Mon Sep 17 00:00:00 2001 From: jrzkaminski Date: Fri, 5 Jul 2024 15:57:13 +0300 Subject: [PATCH 17/18] Revert "merging master into 2.0.0" This reverts commit 785ad136f6847c41e27db014016ca7071cac03e3, reversing changes made to a19c435fbc316951f368bb1c34ae0d565b66c673. 
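For reference, the mkdocs.yml added above (and removed again by this revert) registers the mkdocstrings plugin and an API Reference page at reference/logger.md. As a minimal sketch of what such a page documents, with a hypothetical module and function that are not taken from this patch series, mkdocstrings renders Google-style docstrings like the one below; the Markdown page itself then typically holds little more than an "::: bamt.log"-style directive, which the ci.yml workflow above builds and publishes with mkdocs gh-deploy.

    """Hypothetical module, used only to illustrate what mkdocstrings renders."""

    import logging


    def configure_logger(name: str, level: str = "INFO") -> logging.Logger:
        """Configure and return a named logger.

        Args:
            name: Name of the logger to configure.
            level: Logging level name, e.g. "INFO" or "DEBUG".

        Returns:
            The configured logging.Logger instance.
        """
        logger = logging.getLogger(name)
        logger.setLevel(level)
        return logger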
--- .github/workflows/black.yml | 11 - .github/workflows/ci.yml | 29 - .readthedocs.yml | 39 + bamt/builders/__init__.py | 1 + bamt/builders/builders_base.py | 222 +++ bamt/builders/composite_builder.py | 293 ++++ bamt/builders/evo_builder.py | 231 ++++ bamt/builders/hc_builder.py | 208 +++ bamt/core/graph/dag.py | 24 - bamt/core/graph/graph.py | 30 - bamt/core/node_models/classifier.py | 25 - .../node_models/continuous_distribution.py | 22 - bamt/core/node_models/distribution.py | 14 - .../node_models/empirical_distribution.py | 17 - bamt/core/node_models/prediction_model.py | 20 - bamt/core/node_models/regressor.py | 25 - bamt/core/nodes/child_nodes/child_node.py | 6 - .../conditional_continuous_node.py | 7 - .../child_nodes/conditional_discrete_node.py | 7 - bamt/core/nodes/node.py | 6 - bamt/core/nodes/root_nodes/continuous_node.py | 10 - bamt/core/nodes/root_nodes/discrete_node.py | 6 - bamt/core/nodes/root_nodes/root_node.py | 6 - bamt/dag_optimizers/constraint/__init__.py | 0 .../constraint/constraint_dag_optimizer.py | 0 bamt/dag_optimizers/dag_optimizer.py | 10 - bamt/dag_optimizers/hybrid/__init__.py | 0 .../hybrid/hybrid_dag_optimizer.py | 0 bamt/dag_optimizers/score/__init__.py | 0 bamt/dag_optimizers/score/bigbravebn.py | 0 bamt/dag_optimizers/score/golem_genetic.py | 0 bamt/dag_optimizers/score/hill_climbing.py | 0 bamt/dag_optimizers/score/lsevobn.py | 0 .../score/score_dag_optimizer.py | 9 - bamt/display/__init__.py | 9 + bamt/display/display.py | 175 +++ bamt/{core => external}/__init__.py | 0 bamt/external/pyBN/__init__.py | 2 + .../pyBN/classes}/__init__.py | 0 .../pyBN/classes/_tests}/__init__.py | 0 .../pyBN/classes/_tests/test_bayesnet.py | 184 +++ bamt/external/pyBN/classes/bayesnet.py | 393 ++++++ bamt/external/pyBN/utils/__init__.py | 4 + .../pyBN/utils/_tests}/__init__.py | 0 .../utils/_tests/test_independence_tests.py | 56 + .../pyBN/utils/_tests/test_markov_blanket.py | 36 + .../pyBN/utils/_tests/test_orient_edges.py | 38 + .../pyBN/utils/_tests/test_random_sample.py | 33 + bamt/external/pyBN/utils/class_equivalence.py | 33 + bamt/external/pyBN/utils/data.py | 24 + bamt/external/pyBN/utils/graph.py | 54 + .../external/pyBN/utils/independence_tests.py | 181 +++ .../pyitlib/DiscreteRandomVariableUtils.py | 852 ++++++++++++ .../pyitlib}/__init__.py | 0 bamt/log.py | 67 + bamt/logging.conf | 51 + bamt/mi_entropy_gauss.py | 310 +++++ bamt/models/__init__.py | 0 .../__init__.py | 0 .../bayesian_network.py | 20 - .../composite_bayesian_network.py | 18 - .../continuous_bayesian_network.py | 18 - .../discrete_bayesian_network.py | 18 - .../hybrid_bayesian_network.py | 18 - .../probabilistic_structural_model.py | 14 - bamt/networks/__init__.py | 6 + bamt/networks/base.py | 1002 ++++++++++++++ bamt/networks/big_brave_bn.py | 105 ++ bamt/networks/composite_bn.py | 114 ++ bamt/networks/continuous_bn.py | 15 + bamt/networks/discrete_bn.py | 15 + bamt/networks/hybrid_bn.py | 27 + bamt/nodes/__init__.py | 10 + bamt/nodes/base.py | 59 + bamt/nodes/composite_continuous_node.py | 17 + bamt/nodes/composite_discrete_node.py | 20 + bamt/nodes/conditional_gaussian_node.py | 173 +++ bamt/nodes/conditional_logit_node.py | 172 +++ .../conditional_mixture_gaussian_node.py | 227 ++++ bamt/nodes/discrete_node.py | 113 ++ bamt/nodes/gaussian_node.py | 96 ++ bamt/nodes/logit_node.py | 91 ++ bamt/nodes/mixture_gaussian_node.py | 154 +++ bamt/nodes/schema.py | 47 + bamt/parameter_estimators/__init__.py | 0 .../maximum_likelihood_estimator.py | 9 - .../parameters_estimator.py | 10 - 
.../root_nodes => preprocess}/__init__.py | 0 bamt/preprocess/discretization.py | 165 +++ bamt/preprocess/graph.py | 42 + bamt/preprocess/numpy_pandas.py | 57 + bamt/preprocessors.py | 126 ++ bamt/redef_HC.py | 316 +++++ bamt/redef_info_scores.py | 179 +++ bamt/score_functions/__init__.py | 0 bamt/score_functions/k2_score.py | 9 - .../mutual_information_score.py | 9 - bamt/score_functions/score_function.py | 10 - bamt/utils/EvoUtils.py | 116 ++ bamt/utils/GraphUtils.py | 158 +++ bamt/utils/MathUtils.py | 161 +++ bamt/{dag_optimizers => utils}/__init__.py | 0 bamt/utils/check_utils.py | 9 + .../CompositeGeneticOperators.py | 189 +++ bamt/utils/composite_utils/CompositeModel.py | 17 + bamt/utils/composite_utils/MLUtils.py | 126 ++ bamt/utils/composite_utils/lgbm_params.json | 11 + bamt/utils/composite_utils/models_repo.json | 447 +++++++ bamt/utils/serialization_utils.py | 150 +++ mkdocs.yml | 77 -- pyproject.toml | 23 +- requirements.txt | 34 +- tests/BigbraveBNTest.py | 59 + tests/LoadBN.py | 54 + tests/MainTest.py | 137 ++ tests/MetricsTest.py | 61 + tests/NetworksTest.py | 722 ++++++++++ tests/SaveBN.py | 49 + tests/main.py | 27 + tests/sendingClassifiersLogit.py | 61 + tests/sendingRegressors.py | 62 + tests/test_builders.py | 728 ++++++++++ tests/test_graph_analyzer.py | 98 ++ tests/test_networks.py | 1192 +++++++++++++++++ tests/test_nodes.py | 464 +++++++ tests/test_params.json | 1 + 126 files changed, 11969 insertions(+), 545 deletions(-) delete mode 100644 .github/workflows/black.yml delete mode 100644 .github/workflows/ci.yml create mode 100644 .readthedocs.yml create mode 100644 bamt/builders/__init__.py create mode 100644 bamt/builders/builders_base.py create mode 100644 bamt/builders/composite_builder.py create mode 100644 bamt/builders/evo_builder.py create mode 100644 bamt/builders/hc_builder.py delete mode 100644 bamt/core/graph/dag.py delete mode 100644 bamt/core/graph/graph.py delete mode 100644 bamt/core/node_models/classifier.py delete mode 100644 bamt/core/node_models/continuous_distribution.py delete mode 100644 bamt/core/node_models/distribution.py delete mode 100644 bamt/core/node_models/empirical_distribution.py delete mode 100644 bamt/core/node_models/prediction_model.py delete mode 100644 bamt/core/node_models/regressor.py delete mode 100644 bamt/core/nodes/child_nodes/child_node.py delete mode 100644 bamt/core/nodes/child_nodes/conditional_continuous_node.py delete mode 100644 bamt/core/nodes/child_nodes/conditional_discrete_node.py delete mode 100644 bamt/core/nodes/node.py delete mode 100644 bamt/core/nodes/root_nodes/continuous_node.py delete mode 100644 bamt/core/nodes/root_nodes/discrete_node.py delete mode 100644 bamt/core/nodes/root_nodes/root_node.py delete mode 100644 bamt/dag_optimizers/constraint/__init__.py delete mode 100644 bamt/dag_optimizers/constraint/constraint_dag_optimizer.py delete mode 100644 bamt/dag_optimizers/dag_optimizer.py delete mode 100644 bamt/dag_optimizers/hybrid/__init__.py delete mode 100644 bamt/dag_optimizers/hybrid/hybrid_dag_optimizer.py delete mode 100644 bamt/dag_optimizers/score/__init__.py delete mode 100644 bamt/dag_optimizers/score/bigbravebn.py delete mode 100644 bamt/dag_optimizers/score/golem_genetic.py delete mode 100644 bamt/dag_optimizers/score/hill_climbing.py delete mode 100644 bamt/dag_optimizers/score/lsevobn.py delete mode 100644 bamt/dag_optimizers/score/score_dag_optimizer.py create mode 100644 bamt/display/__init__.py create mode 100644 bamt/display/display.py rename bamt/{core => external}/__init__.py 
(100%) create mode 100644 bamt/external/pyBN/__init__.py rename bamt/{core/graph => external/pyBN/classes}/__init__.py (100%) rename bamt/{core/node_models => external/pyBN/classes/_tests}/__init__.py (100%) create mode 100644 bamt/external/pyBN/classes/_tests/test_bayesnet.py create mode 100644 bamt/external/pyBN/classes/bayesnet.py create mode 100644 bamt/external/pyBN/utils/__init__.py rename bamt/{core/nodes => external/pyBN/utils/_tests}/__init__.py (100%) create mode 100644 bamt/external/pyBN/utils/_tests/test_independence_tests.py create mode 100644 bamt/external/pyBN/utils/_tests/test_markov_blanket.py create mode 100644 bamt/external/pyBN/utils/_tests/test_orient_edges.py create mode 100644 bamt/external/pyBN/utils/_tests/test_random_sample.py create mode 100644 bamt/external/pyBN/utils/class_equivalence.py create mode 100644 bamt/external/pyBN/utils/data.py create mode 100644 bamt/external/pyBN/utils/graph.py create mode 100644 bamt/external/pyBN/utils/independence_tests.py create mode 100644 bamt/external/pyitlib/DiscreteRandomVariableUtils.py rename bamt/{core/nodes/child_nodes => external/pyitlib}/__init__.py (100%) create mode 100644 bamt/log.py create mode 100644 bamt/logging.conf create mode 100644 bamt/mi_entropy_gauss.py delete mode 100644 bamt/models/__init__.py delete mode 100644 bamt/models/probabilistic_structural_models/__init__.py delete mode 100644 bamt/models/probabilistic_structural_models/bayesian_network.py delete mode 100644 bamt/models/probabilistic_structural_models/composite_bayesian_network.py delete mode 100644 bamt/models/probabilistic_structural_models/continuous_bayesian_network.py delete mode 100644 bamt/models/probabilistic_structural_models/discrete_bayesian_network.py delete mode 100644 bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py delete mode 100644 bamt/models/probabilistic_structural_models/probabilistic_structural_model.py create mode 100644 bamt/networks/__init__.py create mode 100644 bamt/networks/big_brave_bn.py create mode 100644 bamt/networks/composite_bn.py create mode 100644 bamt/networks/continuous_bn.py create mode 100644 bamt/networks/discrete_bn.py create mode 100644 bamt/networks/hybrid_bn.py create mode 100644 bamt/nodes/__init__.py create mode 100644 bamt/nodes/base.py create mode 100644 bamt/nodes/composite_continuous_node.py create mode 100644 bamt/nodes/composite_discrete_node.py create mode 100644 bamt/nodes/conditional_gaussian_node.py create mode 100644 bamt/nodes/conditional_logit_node.py create mode 100644 bamt/nodes/conditional_mixture_gaussian_node.py create mode 100644 bamt/nodes/discrete_node.py create mode 100644 bamt/nodes/gaussian_node.py create mode 100644 bamt/nodes/logit_node.py create mode 100644 bamt/nodes/mixture_gaussian_node.py create mode 100644 bamt/nodes/schema.py delete mode 100644 bamt/parameter_estimators/__init__.py delete mode 100644 bamt/parameter_estimators/maximum_likelihood_estimator.py delete mode 100644 bamt/parameter_estimators/parameters_estimator.py rename bamt/{core/nodes/root_nodes => preprocess}/__init__.py (100%) create mode 100644 bamt/preprocess/discretization.py create mode 100644 bamt/preprocess/graph.py create mode 100644 bamt/preprocess/numpy_pandas.py create mode 100644 bamt/preprocessors.py create mode 100644 bamt/redef_HC.py create mode 100644 bamt/redef_info_scores.py delete mode 100644 bamt/score_functions/__init__.py delete mode 100644 bamt/score_functions/k2_score.py delete mode 100644 bamt/score_functions/mutual_information_score.py delete mode 
100644 bamt/score_functions/score_function.py create mode 100644 bamt/utils/EvoUtils.py create mode 100644 bamt/utils/GraphUtils.py create mode 100644 bamt/utils/MathUtils.py rename bamt/{dag_optimizers => utils}/__init__.py (100%) create mode 100644 bamt/utils/check_utils.py create mode 100644 bamt/utils/composite_utils/CompositeGeneticOperators.py create mode 100644 bamt/utils/composite_utils/CompositeModel.py create mode 100644 bamt/utils/composite_utils/MLUtils.py create mode 100644 bamt/utils/composite_utils/lgbm_params.json create mode 100644 bamt/utils/composite_utils/models_repo.json create mode 100644 bamt/utils/serialization_utils.py delete mode 100644 mkdocs.yml create mode 100644 tests/BigbraveBNTest.py create mode 100644 tests/LoadBN.py create mode 100644 tests/MainTest.py create mode 100644 tests/MetricsTest.py create mode 100644 tests/NetworksTest.py create mode 100644 tests/SaveBN.py create mode 100644 tests/main.py create mode 100644 tests/sendingClassifiersLogit.py create mode 100644 tests/sendingRegressors.py create mode 100644 tests/test_builders.py create mode 100644 tests/test_graph_analyzer.py create mode 100644 tests/test_networks.py create mode 100644 tests/test_nodes.py create mode 100644 tests/test_params.json diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml deleted file mode 100644 index af032f6..0000000 --- a/.github/workflows/black.yml +++ /dev/null @@ -1,11 +0,0 @@ -name: black-action -on: [push, pull_request] -jobs: - linter_name: - name: Run black formatter - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: rickstaa/action-black@v1.3.3 - with: - black_args: "." \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 64749c8..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: ci -on: - push: - branches: - - master - - main -permissions: - contents: write -jobs: - deploy: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Configure Git Credentials - run: | - git config user.name github-actions[bot] - git config user.email 41898282+github-actions[bot]@users.noreply.github.com - - uses: actions/setup-python@v5 - with: - python-version: 3.x - - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV - - uses: actions/cache@v4 - with: - key: mkdocs-material-${{ env.cache_id }} - path: .cache - restore-keys: | - mkdocs-material- - - run: pip install mkdocs-material 'mkdocstrings[python]' - - run: mkdocs gh-deploy --force \ No newline at end of file diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..2efbd82 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,39 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.11" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/source/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but 
recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +# python: +# install: +# - requirements: docs/requirements.txt + +python: + install: + - requirements: other_requirements/readthedocs.txt diff --git a/bamt/builders/__init__.py b/bamt/builders/__init__.py new file mode 100644 index 0000000..d886ad6 --- /dev/null +++ b/bamt/builders/__init__.py @@ -0,0 +1 @@ +__all__ = ["builders_base", "evo_builder", "hc_builder"] diff --git a/bamt/builders/builders_base.py b/bamt/builders/builders_base.py new file mode 100644 index 0000000..e04bbff --- /dev/null +++ b/bamt/builders/builders_base.py @@ -0,0 +1,222 @@ +import itertools +from typing import Dict, List, Optional, Tuple, Callable, TypedDict, Sequence, Union + +from pandas import DataFrame + +from bamt.log import logger_builder +from bamt.nodes.conditional_gaussian_node import ConditionalGaussianNode +from bamt.nodes.conditional_logit_node import ConditionalLogitNode +from bamt.nodes.conditional_mixture_gaussian_node import ConditionalMixtureGaussianNode +from bamt.nodes.discrete_node import DiscreteNode +from bamt.nodes.gaussian_node import GaussianNode +from bamt.nodes.logit_node import LogitNode +from bamt.nodes.mixture_gaussian_node import MixtureGaussianNode +from bamt.utils import GraphUtils as gru + + +class ParamDict(TypedDict, total=False): + init_edges: Optional[Sequence[str]] + init_nodes: Optional[List[str]] + remove_init_edges: bool + white_list: Optional[Tuple[str, str]] + bl_add: Optional[List[str]] + + +class StructureBuilder(object): + """ + Base Class for Structure Builder. + It can restrict nodes defined by RESTRICTIONS + """ + + def __init__(self, descriptor: Dict[str, Dict[str, str]]): + """ + :param descriptor: a dict with types and signs of nodes + Attributes: + black_list: a list with restricted connections; + """ + self.skeleton = {"V": [], "E": []} + self.descriptor = descriptor + + self.has_logit = bool + + self.black_list = None + + def restrict( + self, + data: DataFrame, + init_nodes: Optional[List[str]], + bl_add: Optional[List[str]], + ): + """ + :param data: data to deal with + :param init_nodes: nodes to begin with (thus they have no parents) + :param bl_add: additional vertices + """ + node_type = self.descriptor["types"] + blacklist = [] + datacol = data.columns.to_list() + + if not self.has_logit: + # Has_logit flag allows BN building edges between cont and disc + RESTRICTIONS = [("cont", "disc"), ("cont", "disc_num")] + for x, y in itertools.product(datacol, repeat=2): + if x != y: + if (node_type[x], node_type[y]) in RESTRICTIONS: + blacklist.append((x, y)) + else: + self.black_list = [] + if init_nodes: + blacklist += [(x, y) for x in datacol for y in init_nodes if x != y] + if bl_add: + blacklist = blacklist + bl_add + self.black_list = blacklist + + def get_family(self): + """ + A function that updates a skeleton; + """ + if not self.skeleton["V"]: + logger_builder.error("Vertex list is None") + return None + if not self.skeleton["E"]: + logger_builder.error("Edges list is None") + return None + for node_instance in self.skeleton["V"]: + node = node_instance.name + children = [] + parents = [] + for edge in self.skeleton["E"]: + if node in edge: + if edge.index(node) == 0: + children.append(edge[1]) + if edge.index(node) == 1: + parents.append(edge[0]) + + disc_parents = [] + cont_parents = [] + for parent in parents: + if self.descriptor["types"][parent] in ["disc", "disc_num"]: + 
disc_parents.append(parent) + else: + cont_parents.append(parent) + + id = self.skeleton["V"].index(node_instance) + self.skeleton["V"][id].disc_parents = disc_parents + self.skeleton["V"][id].cont_parents = cont_parents + self.skeleton["V"][id].children = children + + ordered = gru.toporder(self.skeleton["V"], self.skeleton["E"]) + not_ordered = [node.name for node in self.skeleton["V"]] + mask = [not_ordered.index(name) for name in ordered] + self.skeleton["V"] = [self.skeleton["V"][i] for i in mask] + + +class VerticesDefiner(StructureBuilder): + """ + Main class for defining vertices + """ + + def __init__( + self, descriptor: Dict[str, Dict[str, str]], regressor: Optional[object] + ): + """ + Automatically creates a list of nodes + """ + super(VerticesDefiner, self).__init__(descriptor=descriptor) + # Notice that vertices are used only by Builders + self.vertices = [] + + node = None + # LEVEL 1: Define a general type of node: Discrete or Gaussian + for vertex, type in self.descriptor["types"].items(): + if type in ["disc_num", "disc"]: + node = DiscreteNode(name=vertex) + elif type == "cont": + node = GaussianNode(name=vertex, regressor=regressor) + else: + msg = f"""First stage of automatic vertex detection failed on {vertex} due TypeError ({type}). + Set vertex manually (by calling set_nodes()) or investigate the error.""" + logger_builder.error(msg) + continue + + self.vertices.append(node) + + def overwrite_vertex( + self, + has_logit: bool, + use_mixture: bool, + classifier: Optional[Callable], + regressor: Optional[Callable], + ): + """ + Level 2: Redefined nodes according structure (parents) + :param classifier: an object to pass into logit, condLogit nodes + :param regressor: an object to pass into gaussian nodes + :param has_logit allows edges from cont to disc nodes + :param use_mixture allows using Mixture + """ + for node_instance in self.vertices: + node = node_instance + if has_logit: + if "Discrete" in node_instance.type: + if node_instance.cont_parents: + if not node_instance.disc_parents: + node = LogitNode( + name=node_instance.name, classifier=classifier + ) + + elif node_instance.disc_parents: + node = ConditionalLogitNode( + name=node_instance.name, classifier=classifier + ) + + if use_mixture: + if "Gaussian" in node_instance.type: + if not node_instance.disc_parents: + node = MixtureGaussianNode(name=node_instance.name) + elif node_instance.disc_parents: + node = ConditionalMixtureGaussianNode(name=node_instance.name) + else: + continue + else: + if "Gaussian" in node_instance.type: + if node_instance.disc_parents: + node = ConditionalGaussianNode( + name=node_instance.name, regressor=regressor + ) + else: + continue + + if node_instance == node: + continue + + id = self.skeleton["V"].index(node_instance) + node.disc_parents = node_instance.disc_parents + node.cont_parents = node_instance.cont_parents + node.children = node_instance.children + self.skeleton["V"][id] = node + + +class EdgesDefiner(StructureBuilder): + def __init__(self, descriptor: Dict[str, Dict[str, str]]): + super(EdgesDefiner, self).__init__(descriptor) + + +class BaseDefiner(VerticesDefiner, EdgesDefiner): + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + scoring_function: Union[Tuple[str, Callable], Tuple[str]], + regressor: Optional[object] = None, + ): + self.scoring_function = scoring_function + self.params = { + "init_edges": None, + "init_nodes": None, + "remove_init_edges": True, + "white_list": None, + "bl_add": None, + } + 
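# A minimal sketch of the branching implemented by overwrite_vertex above,
# restated as a hypothetical standalone helper (pick_node_class is not part
# of the patch); class names are returned as plain strings for illustration.
def pick_node_class(kind, disc_parents, cont_parents, has_logit, use_mixture):
    """Mirror the has_logit / use_mixture node-class choice for one vertex."""
    if kind == "discrete":
        if has_logit and cont_parents:
            return "ConditionalLogitNode" if disc_parents else "LogitNode"
        return "DiscreteNode"
    if use_mixture:
        return "ConditionalMixtureGaussianNode" if disc_parents else "MixtureGaussianNode"
    return "ConditionalGaussianNode" if disc_parents else "GaussianNode"

# Example: a continuous vertex with one discrete parent and no mixture model.
print(pick_node_class("continuous", ["sex"], [], has_logit=True, use_mixture=False))
# prints: ConditionalGaussianNode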
super().__init__(descriptor, regressor=regressor) + self.optimizer = None # will be defined in subclasses diff --git a/bamt/builders/composite_builder.py b/bamt/builders/composite_builder.py new file mode 100644 index 0000000..764363a --- /dev/null +++ b/bamt/builders/composite_builder.py @@ -0,0 +1,293 @@ +from datetime import timedelta +from typing import Dict, Optional, List, Tuple, Callable + +from golem.core.adapter import DirectAdapter +from golem.core.dag.verification_rules import has_no_cycle, has_no_self_cycled_nodes +from golem.core.log import Log +from golem.core.optimisers.genetic.gp_optimizer import EvoGraphOptimizer +from golem.core.optimisers.genetic.gp_params import GPAlgorithmParameters +from golem.core.optimisers.genetic.operators.crossover import CrossoverTypesEnum +from golem.core.optimisers.genetic.operators.inheritance import GeneticSchemeTypesEnum +from golem.core.optimisers.genetic.operators.selection import SelectionTypesEnum +from golem.core.optimisers.objective import Objective, ObjectiveEvaluate +from golem.core.optimisers.optimization_parameters import GraphRequirements +from golem.core.optimisers.optimizer import GraphGenerationParams +from pandas import DataFrame +from sklearn import preprocessing + +import bamt.preprocessors as pp +from bamt.builders.builders_base import VerticesDefiner, EdgesDefiner +from bamt.log import logger_builder +from bamt.nodes.composite_continuous_node import CompositeContinuousNode +from bamt.nodes.composite_discrete_node import CompositeDiscreteNode +from bamt.nodes.discrete_node import DiscreteNode +from bamt.nodes.gaussian_node import GaussianNode +from bamt.utils import EvoUtils as evo +from bamt.utils.composite_utils import CompositeGeneticOperators +from bamt.utils.composite_utils.CompositeModel import CompositeModel, CompositeNode + + +class CompositeDefiner(VerticesDefiner, EdgesDefiner): + """ + Object that might take additional methods to decompose structure builder class + """ + + def __init__( + self, + descriptor: Dict[str, Dict[str, str]], + regressor: Optional[object] = None, + ): + super().__init__(descriptor, regressor) + + # Notice that vertices are used only by Builders + self.vertices = [] + + # LEVEL 1: Define a general type of node: Discrete or Сontinuous + for vertex, type in self.descriptor["types"].items(): + if type in ["disc_num", "disc"]: + node = CompositeDiscreteNode(name=vertex) + elif type == "cont": + node = CompositeContinuousNode(name=vertex, regressor=regressor) + else: + msg = f"""First stage of automatic vertex detection failed on {vertex} due TypeError ({type}). + Set vertex manually (by calling set_nodes()) or investigate the error.""" + logger_builder.error(msg) + continue + + self.vertices.append(node) + + +class CompositeStructureBuilder(CompositeDefiner): + """ + This class uses an evolutionary algorithm based on GOLEM to generate a Directed Acyclic Graph (DAG) that represents + the structure of a Composite Bayesian Network. + + Attributes: + data (DataFrame): Input data used to build the structure. + descriptor (dict): Descriptor describing node types and signs. 
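# A minimal sketch of the descriptor these builders consume, assuming the
# two-key shape implied by the docstrings above ("types" plus "signs"); the
# column names and sign values are illustrative only.
descriptor = {
    "types": {"age": "cont", "height": "cont", "sex": "disc", "children": "disc_num"},
    "signs": {"age": "pos", "height": "pos"},
}

# First-stage vertex detection, as in CompositeDefiner.__init__ above:
# "disc"/"disc_num" columns become discrete vertices, "cont" continuous ones.
for name, node_type in descriptor["types"].items():
    kind = "discrete" if node_type in ("disc", "disc_num") else "continuous"
    print(f"{name}: {kind}")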
+ """ + + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + regressor: Optional[object], + ): + super(CompositeStructureBuilder, self).__init__( + descriptor=descriptor, regressor=regressor + ) + self.data = data + self.parent_models_dict = {} + self.descriptor = descriptor + self.regressor = regressor + self.params = { + "init_edges": None, + "init_nodes": None, + "remove_init_edges": True, + "white_list": None, + "bl_add": None, + } + self.default_n_jobs = -1 + self.default_pop_size = 15 + self.default_crossover_prob = 0.9 + self.default_mutation_prob = 0.8 + self.default_max_arity = 100 + self.default_max_depth = 100 + self.default_timeout = 180 + self.default_num_of_generations = 50 + self.default_early_stopping_iterations = 50 + self.objective_metric = CompositeGeneticOperators.composite_metric + self.default_crossovers = [ + CrossoverTypesEnum.exchange_edges, + CrossoverTypesEnum.exchange_parents_one, + CrossoverTypesEnum.exchange_parents_both, + CompositeGeneticOperators.custom_crossover_all_model, + ] + self.default_mutations = [ + CompositeGeneticOperators.custom_mutation_add_structure, + CompositeGeneticOperators.custom_mutation_delete_structure, + CompositeGeneticOperators.custom_mutation_reverse_structure, + CompositeGeneticOperators.custom_mutation_add_model, + ] + self.default_selection = [SelectionTypesEnum.tournament] + self.default_constraints = [ + has_no_self_cycled_nodes, + has_no_cycle, + evo.has_no_duplicates, + ] + self.verbose = True + self.logging_level = 50 + + def overwrite_vertex( + self, + regressor: Optional[Callable], + ): + for node_instance in self.vertices: + node = node_instance + if ( + len(node_instance.cont_parents + node_instance.disc_parents) < 1 + and type(node_instance).__name__ == "CompositeContinuousNode" + ): + node = GaussianNode(name=node_instance.name, regressor=regressor) + elif ( + len(node_instance.cont_parents + node_instance.disc_parents) < 1 + and type(node_instance).__name__ == "CompositeDiscreteNode" + ): + node = DiscreteNode(name=node_instance.name) + else: + continue + id_node = self.skeleton["V"].index(node_instance) + node.disc_parents = node_instance.disc_parents + node.cont_parents = node_instance.cont_parents + node.children = node_instance.children + self.skeleton["V"][id_node] = node + + def build( + self, + data: DataFrame, + classifier: Optional[object], + regressor: Optional[object], + **kwargs, + ): + """ + Calls the search method to execute all the evolutionary computations. + + Args: + data (DataFrame): The data from which to build the structure. + classifier (Optional[object]): A classification model for discrete nodes. + regressor (Optional[object]): A regression model for continuous nodes. + """ + best_graph_edge_list, parent_models = self.search(data, **kwargs) + + # Convert the best graph to the format used by the Bayesian Network + self.skeleton["V"] = self.vertices + self.skeleton["E"] = best_graph_edge_list + self.parent_models_dict = parent_models + + self.get_family() + self.overwrite_vertex(regressor=regressor) + + def search(self, data: DataFrame, **kwargs) -> [List[Tuple[str, str]], Dict]: + """ + Executes all the evolutionary computations and returns the best graph's edge list. + + Args: + data (DataFrame): The data from which to build the structure. + + Returns: + best_graph_edge_list (List[Tuple[str, str]]): The edge list of the best graph found by the search. 
+ """ + # Get the list of node names + vertices = list(data.columns) + + encoder = preprocessing.LabelEncoder() + p = pp.Preprocessor([("encoder", encoder)]) + preprocessed_data, _ = p.apply(data) + + # Create the initial population + initial = kwargs.get( + "custom_initial_structure", + [ + CompositeModel( + nodes=[ + CompositeNode( + nodes_from=None, + content={ + "name": vertex, + "type": p.nodes_types[vertex], + "parent_model": None, + }, + ) + for vertex in vertices + ] + ) + ], + ) + + # Define the requirements for the evolutionary algorithm + requirements = GraphRequirements( + max_arity=kwargs.get("max_arity", self.default_max_arity), + max_depth=kwargs.get("max_depth", self.default_max_depth), + num_of_generations=kwargs.get( + "num_of_generations", self.default_num_of_generations + ), + timeout=timedelta(minutes=kwargs.get("timeout", self.default_timeout)), + early_stopping_iterations=kwargs.get( + "early_stopping_iterations", self.default_early_stopping_iterations + ), + n_jobs=kwargs.get("n_jobs", self.default_n_jobs), + ) + + # Set the parameters for the evolutionary algorithm + optimizer_parameters = GPAlgorithmParameters( + pop_size=kwargs.get("pop_size", self.default_pop_size), + crossover_prob=kwargs.get("crossover_prob", self.default_crossover_prob), + mutation_prob=kwargs.get("mutation_prob", self.default_mutation_prob), + genetic_scheme_type=GeneticSchemeTypesEnum.steady_state, + mutation_types=kwargs.get("custom_mutations", self.default_mutations), + crossover_types=kwargs.get("custom_crossovers", self.default_crossovers), + selection_types=kwargs.get("selection_type", self.default_selection), + ) + + # Set the adapter for the conversion between the graph and the data + # structures used by the optimizer + adapter = DirectAdapter( + base_graph_class=CompositeModel, base_node_class=CompositeNode + ) + + # Set the constraints for the graph + + constraints = kwargs.get("custom_constraints", []) + + constraints.extend(self.default_constraints) + + if kwargs.get("blacklist", None) is not None: + constraints.append(evo.has_no_blacklist_edges) + if kwargs.get("whitelist", None) is not None: + constraints.append(evo.has_only_whitelist_edges) + + graph_generation_params = GraphGenerationParams( + adapter=adapter, rules_for_constraint=constraints + ) + + # Define the objective function to optimize + objective = Objective( + {"custom": kwargs.get("custom_metric", self.objective_metric)} + ) + + # Initialize the optimizer + optimizer = EvoGraphOptimizer( + objective=objective, + initial_graphs=initial, + requirements=requirements, + graph_generation_params=graph_generation_params, + graph_optimizer_params=optimizer_parameters, + ) + + # Define the function to evaluate the objective function + objective_eval = ObjectiveEvaluate(objective, data=preprocessed_data) + + if not kwargs.get("verbose", self.verbose): + Log().reset_logging_level(logging_level=50) + + # Run the optimization + optimized_graph = optimizer.optimise(objective_eval)[0] + + parent_models = self._get_parent_models(optimized_graph) + + # Get the best graph + best_graph_edge_list = optimized_graph.operator.get_edges() + best_graph_edge_list = self._convert_to_strings(best_graph_edge_list) + + return best_graph_edge_list, parent_models + + @staticmethod + def _convert_to_strings(nested_list): + return [[str(item) for item in inner_list] for inner_list in nested_list] + + @staticmethod + def _get_parent_models(graph): + parent_models = {} + for node in graph.nodes: + parent_models[node.content["name"]] = 
node.content["parent_model"] + return parent_models diff --git a/bamt/builders/evo_builder.py b/bamt/builders/evo_builder.py new file mode 100644 index 0000000..2921347 --- /dev/null +++ b/bamt/builders/evo_builder.py @@ -0,0 +1,231 @@ +from datetime import timedelta +from typing import Dict, Optional, List, Tuple + +from golem.core.adapter import DirectAdapter +from golem.core.dag.verification_rules import has_no_cycle, has_no_self_cycled_nodes +from golem.core.log import Log +from golem.core.optimisers.genetic.gp_optimizer import EvoGraphOptimizer +from golem.core.optimisers.genetic.gp_params import GPAlgorithmParameters +from golem.core.optimisers.genetic.operators.crossover import CrossoverTypesEnum +from golem.core.optimisers.genetic.operators.inheritance import GeneticSchemeTypesEnum +from golem.core.optimisers.genetic.operators.selection import SelectionTypesEnum +from golem.core.optimisers.objective import Objective, ObjectiveEvaluate +from golem.core.optimisers.optimization_parameters import GraphRequirements +from golem.core.optimisers.optimizer import GraphGenerationParams +from pandas import DataFrame + +from bamt.builders.builders_base import BaseDefiner +from bamt.utils import EvoUtils as evo + + +class EvoDefiner(BaseDefiner): + """ + Object that might take additional methods to decompose structure builder class + """ + + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + regressor: Optional[object] = None, + ): + super().__init__(data, descriptor, regressor) + + +class EvoStructureBuilder(EvoDefiner): + """ + This class uses an evolutionary algorithm based on GOLEM to generate a Directed Acyclic Graph (DAG) that represents + the structure of a Bayesian Network. + + Attributes: + data (DataFrame): Input data used to build the structure. + descriptor (dict): Descriptor describing node types and signs. + regressor (object): A regression model for continuous nodes. + has_logit (bool): Indicates whether a logit link function should be used. + use_mixture (bool): Indicates whether a mixture model should be used. 
+ """ + + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + regressor: Optional[object], + has_logit: bool, + use_mixture: bool, + ): + super(EvoStructureBuilder, self).__init__( + data=data, descriptor=descriptor, regressor=regressor + ) + self.data = data + self.descriptor = descriptor + self.has_logit = has_logit + self.use_mixture = use_mixture + self.regressor = regressor + self.params = { + "init_edges": None, + "init_nodes": None, + "remove_init_edges": True, + "white_list": None, + "bl_add": None, + } + self.default_n_jobs = -1 + self.default_pop_size = 15 + self.default_crossover_prob = 0.9 + self.default_mutation_prob = 0.8 + self.default_max_arity = 100 + self.default_max_depth = 100 + self.default_timeout = 180 + self.default_num_of_generations = 50 + self.default_early_stopping_iterations = 50 + self.logging_level = 50 + self.objective_metric = evo.K2_metric + self.default_crossovers = [ + CrossoverTypesEnum.exchange_edges, + CrossoverTypesEnum.exchange_parents_one, + CrossoverTypesEnum.exchange_parents_both, + ] + self.default_mutations = [ + evo.custom_mutation_add, + evo.custom_mutation_delete, + evo.custom_mutation_reverse, + ] + self.default_selection = [SelectionTypesEnum.tournament] + self.default_constraints = [ + has_no_self_cycled_nodes, + has_no_cycle, + evo.has_no_duplicates, + ] + self.verbose = True + + def build( + self, + data: DataFrame, + classifier: Optional[object], + regressor: Optional[object], + **kwargs + ): + """ + Calls the search method to execute all the evolutionary computations. + + Args: + data (DataFrame): The data from which to build the structure. + classifier (Optional[object]): A classification model for discrete nodes. + regressor (Optional[object]): A regression model for continuous nodes. + """ + best_graph_edge_list = self.search(data, **kwargs) + + # Convert the best graph to the format used by the Bayesian Network + self.skeleton["V"] = self.vertices + self.skeleton["E"] = best_graph_edge_list + + self.get_family() + self.overwrite_vertex( + has_logit=self.has_logit, + use_mixture=self.use_mixture, + classifier=classifier, + regressor=regressor, + ) + + def search(self, data: DataFrame, **kwargs) -> List[Tuple[str, str]]: + """ + Executes all the evolutionary computations and returns the best graph's edge list. + + Args: + data (DataFrame): The data from which to build the structure. + + Returns: + best_graph_edge_list (List[Tuple[str, str]]): The edge list of the best graph found by the search. 
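# A sketch of how callers can steer the evolutionary search: the keyword
# names below are the ones search() reads via kwargs.get(...), while the
# example values and the builder variable are illustrative only. Any keyword
# that is omitted falls back to the matching default_* attribute set above.
overrides = {
    "pop_size": 30,              # default_pop_size = 15
    "timeout": 10,               # minutes, wrapped in a timedelta; default_timeout = 180
    "num_of_generations": 20,    # default_num_of_generations = 50
    "early_stopping_iterations": 10,
    "n_jobs": 1,                 # default_n_jobs = -1 (all available cores)
}
# builder.build(data, classifier=None, regressor=None, **overrides)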
+ """ + # Get the list of node names + nodes_types = data.columns.to_list() + + # Create the initial population + initial = [ + evo.CustomGraphModel( + nodes=kwargs.get( + "init_nodes", + [evo.CustomGraphNode(node_type) for node_type in nodes_types], + ) + ) + ] + + # Define the requirements for the evolutionary algorithm + requirements = GraphRequirements( + max_arity=kwargs.get("max_arity", self.default_max_arity), + max_depth=kwargs.get("max_depth", self.default_max_depth), + num_of_generations=kwargs.get( + "num_of_generations", self.default_num_of_generations + ), + timeout=timedelta(minutes=kwargs.get("timeout", self.default_timeout)), + early_stopping_iterations=kwargs.get( + "early_stopping_iterations", self.default_early_stopping_iterations + ), + n_jobs=kwargs.get("n_jobs", self.default_n_jobs), + ) + + # Set the parameters for the evolutionary algorithm + optimizer_parameters = GPAlgorithmParameters( + pop_size=kwargs.get("pop_size", self.default_pop_size), + crossover_prob=kwargs.get("crossover_prob", self.default_crossover_prob), + mutation_prob=kwargs.get("mutation_prob", self.default_mutation_prob), + genetic_scheme_type=GeneticSchemeTypesEnum.steady_state, + mutation_types=kwargs.get("custom_mutations", self.default_mutations), + crossover_types=kwargs.get("custom_crossovers", self.default_crossovers), + selection_types=kwargs.get("selection_type", self.default_selection), + ) + + # Set the adapter for the conversion between the graph and the data + # structures used by the optimizer + adapter = DirectAdapter( + base_graph_class=evo.CustomGraphModel, base_node_class=evo.CustomGraphNode + ) + + # Set the constraints for the graph + + constraints = kwargs.get("custom_constraints", []) + + constraints.extend(self.default_constraints) + + if kwargs.get("blacklist", None) is not None: + constraints.append(evo.has_no_blacklist_edges) + if kwargs.get("whitelist", None) is not None: + constraints.append(evo.has_only_whitelist_edges) + + graph_generation_params = GraphGenerationParams( + adapter=adapter, + rules_for_constraint=constraints, + available_node_types=nodes_types, + ) + + # Define the objective function to optimize + objective = Objective( + {"custom": kwargs.get("custom_metric", self.objective_metric)} + ) + + # Initialize the optimizer + optimizer = EvoGraphOptimizer( + objective=objective, + initial_graphs=initial, + requirements=requirements, + graph_generation_params=graph_generation_params, + graph_optimizer_params=optimizer_parameters, + ) + + # Define the function to evaluate the objective function + objective_eval = ObjectiveEvaluate(objective, data=data) + + if not kwargs.get("verbose", self.verbose): + Log().reset_logging_level(logging_level=50) + + # Run the optimization + optimized_graph = optimizer.optimise(objective_eval)[0] + + # Get the best graph + best_graph_edge_list = optimized_graph.operator.get_edges() + best_graph_edge_list = self._convert_to_strings(best_graph_edge_list) + + return best_graph_edge_list + + @staticmethod + def _convert_to_strings(nested_list): + return [tuple([str(item) for item in inner_list]) for inner_list in nested_list] diff --git a/bamt/builders/hc_builder.py b/bamt/builders/hc_builder.py new file mode 100644 index 0000000..3f92e14 --- /dev/null +++ b/bamt/builders/hc_builder.py @@ -0,0 +1,208 @@ +from typing import Dict, List, Optional, Tuple, Callable, Union + +from pandas import DataFrame +from pgmpy.base import DAG +from pgmpy.estimators import HillClimbSearch + +from bamt.builders.builders_base import ParamDict, 
BaseDefiner +from bamt.log import logger_builder +from bamt.redef_HC import hc as hc_method +from bamt.utils import GraphUtils as gru + + +class HillClimbDefiner(BaseDefiner): + """ + Object to define structure and pass it into skeleton + """ + + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + scoring_function: Union[Tuple[str, Callable], Tuple[str]], + regressor: Optional[object] = None, + ): + super().__init__(data, descriptor, scoring_function, regressor) + self.optimizer = HillClimbSearch(data) + + def apply_K2( + self, + data: DataFrame, + init_edges: Optional[List[Tuple[str, str]]], + progress_bar: bool, + remove_init_edges: bool, + white_list: Optional[List[Tuple[str, str]]], + ): + """ + :param init_edges: list of tuples, a graph to start learning with + :param remove_init_edges: allows changes in a model defined by user + :param data: user's data + :param progress_bar: verbose regime + :param white_list: list of allowed edges + """ + if not all([i in ["disc", "disc_num"] for i in gru.nodes_types(data).values()]): + logger_builder.error( + f"K2 deals only with discrete data. Continuous data: {[col for col, type in gru.nodes_types(data).items() if type not in ['disc', 'disc_num']]}" + ) + return None + + if len(self.scoring_function) != 2: + from pgmpy.estimators import K2Score + + scoring_function = K2Score + else: + scoring_function = self.scoring_function[1] + + if not init_edges: + best_model = self.optimizer.estimate( + scoring_method=scoring_function(data), + black_list=self.black_list, + white_list=white_list, + show_progress=progress_bar, + ) + else: + if remove_init_edges: + startdag = DAG() + nodes = [str(v) for v in self.vertices] + startdag.add_nodes_from(nodes=nodes) + startdag.add_edges_from(ebunch=init_edges) + best_model = self.optimizer.estimate( + black_list=self.black_list, + white_list=white_list, + start_dag=startdag, + show_progress=False, + ) + else: + best_model = self.optimizer.estimate( + black_list=self.black_list, + white_list=white_list, + fixed_edges=init_edges, + show_progress=False, + ) + + structure = [list(x) for x in list(best_model.edges())] + self.skeleton["E"] = structure + + def apply_group1( + self, + data: DataFrame, + progress_bar: bool, + init_edges: Optional[List[Tuple[str, str]]], + remove_init_edges: bool, + white_list: Optional[List[Tuple[str, str]]], + ): + """ + This method implements the group of scoring functions. + Group: + "MI" - Mutual Information, + "LL" - Log Likelihood, + "BIC" - Bayesian Information Criteria, + "AIC" - Akaike information Criteria. 
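# A minimal, self-contained sketch of the pgmpy call pattern that apply_K2
# above relies on (HillClimbSearch with a K2Score over purely discrete data);
# the toy frame and its column names are illustrative only.
import pandas as pd
from pgmpy.estimators import HillClimbSearch, K2Score

toy = pd.DataFrame(
    {
        "A": [0, 1, 0, 1, 1, 0, 1, 0],
        "B": [1, 1, 0, 0, 1, 0, 1, 0],
        "C": [0, 1, 1, 1, 0, 0, 1, 0],
    }
)
best_model = HillClimbSearch(toy).estimate(
    scoring_method=K2Score(toy), show_progress=False
)
print(list(best_model.edges()))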
+ """ + column_name_dict = dict([(n.name, i) for i, n in enumerate(self.vertices)]) + blacklist_new = [] + for pair in self.black_list: + blacklist_new.append((column_name_dict[pair[0]], column_name_dict[pair[1]])) + if white_list: + white_list_old = white_list[:] + white_list = [] + for pair in white_list_old: + white_list.append( + (column_name_dict[pair[0]], column_name_dict[pair[1]]) + ) + if init_edges: + init_edges_old = init_edges[:] + init_edges = [] + for pair in init_edges_old: + init_edges.append( + (column_name_dict[pair[0]], column_name_dict[pair[1]]) + ) + + bn = hc_method( + data, + metric=self.scoring_function[0], + restriction=white_list, + init_edges=init_edges, + remove_geo_edges=remove_init_edges, + black_list=blacklist_new, + debug=progress_bar, + ) + structure = [] + nodes = sorted(list(bn.nodes())) + for rv in nodes: + for pa in bn.F[rv]["parents"]: + structure.append( + [ + list(column_name_dict.keys())[ + list(column_name_dict.values()).index(pa) + ], + list(column_name_dict.keys())[ + list(column_name_dict.values()).index(rv) + ], + ] + ) + self.skeleton["E"] = structure + + +class HCStructureBuilder(HillClimbDefiner): + """ + Final object with build method + """ + + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + scoring_function: Tuple[str, Callable], + regressor: Optional[object], + has_logit: bool, + use_mixture: bool, + ): + """ + :param data: train data + :param descriptor: map for data + """ + + super(HCStructureBuilder, self).__init__( + descriptor=descriptor, + data=data, + scoring_function=scoring_function, + regressor=regressor, + ) + self.use_mixture = use_mixture + self.has_logit = has_logit + + def build( + self, + data: DataFrame, + progress_bar: bool, + classifier: Optional[object], + regressor: Optional[object], + params: Optional[ParamDict] = None, + **kwargs, + ): + if params: + for param, value in params.items(): + self.params[param] = value + + init_nodes = self.params.pop("init_nodes") + bl_add = self.params.pop("bl_add") + + # Level 1 + self.skeleton["V"] = self.vertices + + self.restrict(data, init_nodes, bl_add) + if self.scoring_function[0] == "K2": + self.apply_K2(data=data, progress_bar=progress_bar, **self.params) + elif self.scoring_function[0] in ["MI", "LL", "BIC", "AIC"]: + self.apply_group1(data=data, progress_bar=progress_bar, **self.params) + + # Level 2 + + self.get_family() + self.overwrite_vertex( + has_logit=self.has_logit, + use_mixture=self.use_mixture, + classifier=classifier, + regressor=regressor, + ) diff --git a/bamt/core/graph/dag.py b/bamt/core/graph/dag.py deleted file mode 100644 index 03a1770..0000000 --- a/bamt/core/graph/dag.py +++ /dev/null @@ -1,24 +0,0 @@ -from .graph import Graph - - -class DirectedAcyclicGraph(Graph): - def __init__(self): - super().__init__() - - def add_edge(self): - pass - - def remove_edge(self): - pass - - def get_parents(self): - pass - - def get_children(self): - pass - - def get_edges(self): - pass - - def get_nodes(self): - pass diff --git a/bamt/core/graph/graph.py b/bamt/core/graph/graph.py deleted file mode 100644 index c2628fc..0000000 --- a/bamt/core/graph/graph.py +++ /dev/null @@ -1,30 +0,0 @@ -from abc import ABC, abstractmethod - - -class Graph(ABC): - def __init__(self): - pass - - @abstractmethod - def add_edge(self): - pass - - @abstractmethod - def remove_edge(self): - pass - - @abstractmethod - def get_parents(self): - pass - - @abstractmethod - def get_children(self): - pass - - @abstractmethod - def get_edges(self): - pass - - 
@abstractmethod - def get_nodes(self): - pass diff --git a/bamt/core/node_models/classifier.py b/bamt/core/node_models/classifier.py deleted file mode 100644 index afe9df4..0000000 --- a/bamt/core/node_models/classifier.py +++ /dev/null @@ -1,25 +0,0 @@ -from .prediction_model import PredictionModel - -import numpy as np - - -class Classifier(PredictionModel): - def __init__(self, classifier=None, **parameters): - self._classifier = classifier - self._parameters = parameters - - def fit(self, X: np.ndarray, y: np.ndarray) -> None: - if self._classifier is None: - # TODO: implement an algorithm that finds a classifier and fits it with chosen parameters - pass - else: - self._classifier.fit(X, y) - - def predict(self, X: np.ndarray) -> np.ndarray: - return self._classifier.predict(X) - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - return self._classifier.predict_proba(X) - - def __str__(self): - return str(self._classifier) diff --git a/bamt/core/node_models/continuous_distribution.py b/bamt/core/node_models/continuous_distribution.py deleted file mode 100644 index 9d31e21..0000000 --- a/bamt/core/node_models/continuous_distribution.py +++ /dev/null @@ -1,22 +0,0 @@ -import numpy as np - -from .distribution import Distribution - - -class ContinuousDistribution(Distribution): - def __init__(self, distribution_model=None, **parameters): - self._distribution = distribution_model - self._parameters = parameters - - def fit(self, X: np.ndarray) -> None: - if self._distribution is None: - # TODO: implement an algorithm that finds a distribution and fits it with chosen parameters - pass - else: - pass - - def sample(self, num_samples: int) -> np.ndarray: - pass - - def __str__(self): - return str(self._distribution.name) + " continuous distribution" diff --git a/bamt/core/node_models/distribution.py b/bamt/core/node_models/distribution.py deleted file mode 100644 index ed3a747..0000000 --- a/bamt/core/node_models/distribution.py +++ /dev/null @@ -1,14 +0,0 @@ -from abc import ABC, abstractmethod - -import numpy as np - - -class Distribution(ABC): - - @abstractmethod - def fit(self, X: np.ndarray) -> None: - pass - - @abstractmethod - def sample(self, num_samples: int) -> np.ndarray: - pass diff --git a/bamt/core/node_models/empirical_distribution.py b/bamt/core/node_models/empirical_distribution.py deleted file mode 100644 index ec63673..0000000 --- a/bamt/core/node_models/empirical_distribution.py +++ /dev/null @@ -1,17 +0,0 @@ -import numpy as np - -from .distribution import Distribution - - -class EmpiricalDistribution(Distribution): - def __init__(self): - pass - - def fit(self, X: np.ndarray) -> None: - pass - - def sample(self, num_samples: int) -> np.ndarray: - pass - - def __str__(self): - return "Empirical Distribution" diff --git a/bamt/core/node_models/prediction_model.py b/bamt/core/node_models/prediction_model.py deleted file mode 100644 index c2b5df7..0000000 --- a/bamt/core/node_models/prediction_model.py +++ /dev/null @@ -1,20 +0,0 @@ -from abc import ABC, abstractmethod - -import numpy as np - - -class PredictionModel(ABC): - """Represents general prediction model implementations. 
Each prediction model should provide a fit and a predict - method.""" - - @abstractmethod - def fit(self, X: np.ndarray, Y: np.ndarray) -> None: - raise NotImplementedError - - @abstractmethod - def predict(self, X: np.ndarray) -> np.ndarray: - raise NotImplementedError - - @abstractmethod - def predict_proba(self, X: np.ndarray) -> np.ndarray: - raise NotImplementedError diff --git a/bamt/core/node_models/regressor.py b/bamt/core/node_models/regressor.py deleted file mode 100644 index d69941a..0000000 --- a/bamt/core/node_models/regressor.py +++ /dev/null @@ -1,25 +0,0 @@ -from .prediction_model import PredictionModel - -import numpy as np - - -class Regressor(PredictionModel): - def __init__(self, regressor=None, **parameters): - self._regressor = regressor - self._parameters = parameters - - def fit(self, X: np.ndarray, y: np.ndarray) -> None: - if self._regressor is None: - # TODO: implement an algorithm that finds a regressor and fits it with chosen parameters - pass - else: - self._regressor.fit(X, y) - - def predict(self, X: np.ndarray) -> np.ndarray: - return self._regressor.predict(X) - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - return self._regressor.predict_proba(X) - - def __str__(self): - return str(self._regressor) diff --git a/bamt/core/nodes/child_nodes/child_node.py b/bamt/core/nodes/child_nodes/child_node.py deleted file mode 100644 index fc1636d..0000000 --- a/bamt/core/nodes/child_nodes/child_node.py +++ /dev/null @@ -1,6 +0,0 @@ -from bamt.core.nodes.node import Node - - -class ChildNode(Node): - def __init__(self): - super().__init__() diff --git a/bamt/core/nodes/child_nodes/conditional_continuous_node.py b/bamt/core/nodes/child_nodes/conditional_continuous_node.py deleted file mode 100644 index 171b4a8..0000000 --- a/bamt/core/nodes/child_nodes/conditional_continuous_node.py +++ /dev/null @@ -1,7 +0,0 @@ -from .child_node import ChildNode - - -class ConditionalContinuousNode(ChildNode): - def __init__(self): - super().__init__() - self._model = None diff --git a/bamt/core/nodes/child_nodes/conditional_discrete_node.py b/bamt/core/nodes/child_nodes/conditional_discrete_node.py deleted file mode 100644 index 0a01b00..0000000 --- a/bamt/core/nodes/child_nodes/conditional_discrete_node.py +++ /dev/null @@ -1,7 +0,0 @@ -from .child_node import ChildNode - - -class ConditionalDiscreteNode(ChildNode): - def __init__(self): - super().__init__() - self._model = None diff --git a/bamt/core/nodes/node.py b/bamt/core/nodes/node.py deleted file mode 100644 index 6dc4140..0000000 --- a/bamt/core/nodes/node.py +++ /dev/null @@ -1,6 +0,0 @@ -from abc import ABC - - -class Node(ABC): - def __init__(self): - pass diff --git a/bamt/core/nodes/root_nodes/continuous_node.py b/bamt/core/nodes/root_nodes/continuous_node.py deleted file mode 100644 index f725000..0000000 --- a/bamt/core/nodes/root_nodes/continuous_node.py +++ /dev/null @@ -1,10 +0,0 @@ -from .root_node import RootNode - - -class ContinuousNode(RootNode): - def __init__(self): - super().__init__() - self._distribution = None - - def __str__(self): - return "Continuous Node with " + str(self._distribution) diff --git a/bamt/core/nodes/root_nodes/discrete_node.py b/bamt/core/nodes/root_nodes/discrete_node.py deleted file mode 100644 index d9b04c8..0000000 --- a/bamt/core/nodes/root_nodes/discrete_node.py +++ /dev/null @@ -1,6 +0,0 @@ -from .root_node import RootNode - - -class DiscreteNode(RootNode): - def __init__(self): - super().__init__() diff --git a/bamt/core/nodes/root_nodes/root_node.py 
b/bamt/core/nodes/root_nodes/root_node.py deleted file mode 100644 index faf7ff1..0000000 --- a/bamt/core/nodes/root_nodes/root_node.py +++ /dev/null @@ -1,6 +0,0 @@ -from bamt.core.nodes.node import Node - - -class RootNode(Node): - def __init__(self): - super().__init__() diff --git a/bamt/dag_optimizers/constraint/__init__.py b/bamt/dag_optimizers/constraint/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/constraint/constraint_dag_optimizer.py b/bamt/dag_optimizers/constraint/constraint_dag_optimizer.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/dag_optimizer.py b/bamt/dag_optimizers/dag_optimizer.py deleted file mode 100644 index b9c992f..0000000 --- a/bamt/dag_optimizers/dag_optimizer.py +++ /dev/null @@ -1,10 +0,0 @@ -from abc import ABC, abstractmethod - - -class DAGOptimizer(ABC): - def __init__(self): - pass - - @abstractmethod - def optimize(self): - pass diff --git a/bamt/dag_optimizers/hybrid/__init__.py b/bamt/dag_optimizers/hybrid/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/hybrid/hybrid_dag_optimizer.py b/bamt/dag_optimizers/hybrid/hybrid_dag_optimizer.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/score/__init__.py b/bamt/dag_optimizers/score/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/score/bigbravebn.py b/bamt/dag_optimizers/score/bigbravebn.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/score/golem_genetic.py b/bamt/dag_optimizers/score/golem_genetic.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/score/hill_climbing.py b/bamt/dag_optimizers/score/hill_climbing.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/score/lsevobn.py b/bamt/dag_optimizers/score/lsevobn.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/score/score_dag_optimizer.py b/bamt/dag_optimizers/score/score_dag_optimizer.py deleted file mode 100644 index a9ae0fb..0000000 --- a/bamt/dag_optimizers/score/score_dag_optimizer.py +++ /dev/null @@ -1,9 +0,0 @@ -from bamt.dag_optimizers.dag_optimizer import DAGOptimizer - - -class ScoreDAGOptimizer(DAGOptimizer): - def __init__(self): - super().__init__() - - def optimize(self): - pass diff --git a/bamt/display/__init__.py b/bamt/display/__init__.py new file mode 100644 index 0000000..a22b2c3 --- /dev/null +++ b/bamt/display/__init__.py @@ -0,0 +1,9 @@ +from .display import Display + + +def plot_(output, *args): + return Display(output).build(*args) + + +def get_info_(bn, as_df): + return Display(output=None).get_info(bn, as_df) diff --git a/bamt/display/display.py b/bamt/display/display.py new file mode 100644 index 0000000..2568fb7 --- /dev/null +++ b/bamt/display/display.py @@ -0,0 +1,175 @@ +import random +from typing import Dict + +import matplotlib +import matplotlib.pyplot as plt +import networkx as nx +import numpy as np +from pandas import DataFrame +from pyvis.network import Network + +from bamt.log import logger_display + + +class Display(object): + """Display object""" + + def __init__(self, output): + """ + :param output: output file name + """ + self.output = output + self.class2color = Dict + self.name2class = Dict + + @staticmethod + def _validate_dir(output): + """Format validation""" + if not output.endswith(".html"): + logger_display.error("This version allows only html format.") + return 
None + return True + + @staticmethod + def _make_network(nodes, edges, **kwargs): + """Make network and graph + :param nodes: nodes + :param edges: edges + :param kwargs: a dict passed to pyvis.network.Network + """ + g = nx.DiGraph() + network_params = dict( + height=kwargs.get("height", "800px"), + width=kwargs.get("width", "100%"), + notebook=kwargs.get("notebook", True), + directed=kwargs.get("directed", nx.is_directed(g)), + layout=kwargs.get("layout", "hierarchical"), + ) + + g.add_nodes_from(nodes) + g.add_edges_from(edges) + + network = Network(**network_params) + return network, g + + def _init_colors(self, nodes): + # Qualitative class of colormaps + q_classes = [ + "Pastel1", + "Pastel2", + "Paired", + "Accent", + "Dark2", + "Set1", + "Set2", + "Set3", + "tab10", + "tab20", + "tab20b", + "tab20c", + ] + + hex_colors = [] + for cls in q_classes: + rgb_colors = plt.get_cmap(cls).colors + hex_colors.extend( + [matplotlib.colors.rgb2hex(rgb_color) for rgb_color in rgb_colors] + ) + + class_number = len(set([node.type for node in nodes])) + hex_colors_indexes = [ + random.randint(0, len(hex_colors) - 1) for _ in range(class_number) + ] + + hex_colors = np.array(hex_colors) + + hex_colors_picked = hex_colors[hex_colors_indexes] + class2color = { + cls: color + for cls, color in zip(set([node.type for node in nodes]), hex_colors_picked) + } + name2class = {node.name: node.type for node in nodes} + self.class2color = class2color + self.name2class = name2class + + def build(self, nodes, edges, **kwargs): + """Build and show network + + :param nodes: nodes + :param edges: edges + :param kwargs: a dict passed to pyvis.network.Network.add_node + """ + if not self._validate_dir(self.output): + logger_display.error(f"Output error: {self.output}") + return None + + if not kwargs: + node_params = dict(font={"size": 36}, size=45) + else: + node_params = kwargs + + self._init_colors(nodes) + + nodes_names = [node.name for node in nodes] + + network, g = self._make_network(nodes_names, edges) + nodes_sorted = np.array(list(nx.topological_generations(g)), dtype=object) + + for level in range(len(nodes_sorted)): + for node_i in range(len(nodes_sorted[level])): + name = nodes_sorted[level][node_i] + cls = self.name2class[name] + color = self.class2color[cls] + network.add_node( + name, + label=name, + color=color, + level=level, + title=f"{name} ({cls})", + **node_params, + ) + + for edge in g.edges: + network.add_edge(edge[0], edge[1]) + + network.hrepulsion(node_distance=300, central_gravity=0.5) + + return network.show(self.output) + + @staticmethod + def get_info(bn, as_df): + if as_df: + names = [] + types_n = [] + types_d = [] + parents = [] + parents_types = [] + for n in bn.nodes: + names.append(n) + types_n.append(n.type) + types_d.append(bn.descriptor["types"][n.name]) + parents_types.append( + [ + bn.descriptor["types"][name] + for name in n.cont_parents + n.disc_parents + ] + ) + parents.append([name for name in n.cont_parents + n.disc_parents]) + return DataFrame( + { + "name": names, + "node_type": types_n, + "data_type": types_d, + "parents": parents, + "parents_types": parents_types, + } + ) + else: + for n in bn.nodes: + print( + f"{n.name: <20} | " + f"{n.type: <50} | " + f"{bn.descriptor['types'][n.name]: <10} | " + f"{str([bn.descriptor['types'][name] for name in n.cont_parents + n.disc_parents]): <50} | " + f"{str([name for name in n.cont_parents + n.disc_parents])}" + ) diff --git a/bamt/core/__init__.py b/bamt/external/__init__.py similarity index 100% rename from 
bamt/core/__init__.py rename to bamt/external/__init__.py diff --git a/bamt/external/pyBN/__init__.py b/bamt/external/pyBN/__init__.py new file mode 100644 index 0000000..321c5ec --- /dev/null +++ b/bamt/external/pyBN/__init__.py @@ -0,0 +1,2 @@ +from bamt.external.pyBN.classes import * +from bamt.external.pyBN.utils import * diff --git a/bamt/core/graph/__init__.py b/bamt/external/pyBN/classes/__init__.py similarity index 100% rename from bamt/core/graph/__init__.py rename to bamt/external/pyBN/classes/__init__.py diff --git a/bamt/core/node_models/__init__.py b/bamt/external/pyBN/classes/_tests/__init__.py similarity index 100% rename from bamt/core/node_models/__init__.py rename to bamt/external/pyBN/classes/_tests/__init__.py diff --git a/bamt/external/pyBN/classes/_tests/test_bayesnet.py b/bamt/external/pyBN/classes/_tests/test_bayesnet.py new file mode 100644 index 0000000..8beca5c --- /dev/null +++ b/bamt/external/pyBN/classes/_tests/test_bayesnet.py @@ -0,0 +1,184 @@ +""" +******** +UnitTest +BayesNet +******** + +Method Checks that +assertEqual(a, b) a == b +assertNotEqual(a, b) a != b +assertTrue(x) bool(x) is True +assertFalse(x) bool(x) is False +assertIs(a, b) a is b +assertIsNot(a, b) a is not b +assertIsNone(x) x is None +assertIsNotNone(x) x is not None +assertIn(a, b) a in b +assertNotIn(a, b) a not in b +assertIsInstance(a, b) isinstance(a, b) +assertNotIsInstance(a, b) not isinstance(a, b) + +assertAlmostEqual(a, b) round(a-b, 7) == 0 +assertNotAlmostEqual(a, b) round(a-b, 7) != 0 +assertGreater(a, b) a > b +assertGreaterEqual(a, b) a >= b +assertLess(a, b) a < b +assertLessEqual(a, b) a <= b + +assertListEqual(a, b) lists +assertTupleEqual(a, b) tuples +assertSetEqual(a, b) sets or frozensets +assertDictEqual(a, b) dicts + +""" +__author__ = """Nicholas Cullen """ + +import os +import unittest +from os.path import dirname + +from external.pyBN.classes.bayesnet import BayesNet +from external.pyBN.readwrite.read import read_bn + + +class BayesNetTestCase(unittest.TestCase): + def setUp(self): + self.bn = BayesNet() + self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + self.bn_bif = read_bn(os.path.join(self.dpath, "cancer.bif")) + self.bn_bn = read_bn(os.path.join(self.dpath, "cmu.bn")) + + def tearDown(self): + pass + + def test_isinstance(self): + self.assertIsInstance(self.bn, BayesNet) + + def test_V_bif(self): + self.assertListEqual( + self.bn_bif.V, ["Smoker", "Pollution", "Cancer", "Xray", "Dyspnoea"] + ) + + def test_E_bif(self): + self.assertDictEqual( + self.bn_bif.E, + { + "Cancer": ["Xray", "Dyspnoea"], + "Dyspnoea": [], + "Pollution": ["Cancer"], + "Smoker": ["Cancer"], + "Xray": [], + }, + ) + + def test_F_bif(self): + self.assertDictEqual( + self.bn_bif.F, + { + "Cancer": { + "cpt": [0.03, 0.97, 0.05, 0.95, 0.001, 0.999, 0.02, 0.98], + "parents": ["Pollution", "Smoker"], + "values": ["True", "False"], + }, + "Dyspnoea": { + "cpt": [0.65, 0.35, 0.3, 0.7], + "parents": ["Cancer"], + "values": ["True", "False"], + }, + "Pollution": { + "cpt": [0.9, 0.1], + "parents": [], + "values": ["low", "high"], + }, + "Smoker": { + "cpt": [0.3, 0.7], + "parents": [], + "values": ["True", "False"], + }, + "Xray": { + "cpt": [0.9, 0.1, 0.2, 0.8], + "parents": ["Cancer"], + "values": ["positive", "negative"], + }, + }, + ) + + def test_V_bn(self): + self.assertListEqual( + self.bn_bn.V, ["Burglary", "Earthquake", "Alarm", "JohnCalls", "MaryCalls"] + ) + + def test_E_bn(self): + self.assertDictEqual( + self.bn_bn.E, + { + "Alarm": ["JohnCalls", 
"MaryCalls"], + "Burglary": ["Alarm"], + "Earthquake": ["Alarm"], + "JohnCalls": [], + "MaryCalls": [], + }, + ) + + def test_F_bn(self): + self.assertDictEqual( + self.bn_bn.F, + { + "Alarm": { + "cpt": [0.999, 0.001, 0.71, 0.29, 0.06, 0.94, 0.05, 0.95], + "parents": ["Earthquake", "Burglary"], + "values": ["No", "Yes"], + }, + "Burglary": { + "cpt": [0.999, 0.001], + "parents": [], + "values": ["No", "Yes"], + }, + "Earthquake": { + "cpt": [0.998, 0.002], + "parents": [], + "values": ["No", "Yes"], + }, + "JohnCalls": { + "cpt": [0.95, 0.05, 0.1, 0.9], + "parents": ["Alarm"], + "values": ["No", "Yes"], + }, + "MaryCalls": { + "cpt": [0.99, 0.01, 0.3, 0.7], + "parents": ["Alarm"], + "values": ["No", "Yes"], + }, + }, + ) + + def test_nodes(self): + n = list(self.bn_bn.nodes()) + self.assertListEqual( + n, ["Burglary", "Earthquake", "Alarm", "JohnCalls", "MaryCalls"] + ) + + def test_cpt(self): + cpt = list(self.bn_bn.cpt("Alarm")) + self.assertListEqual(cpt, [0.999, 0.001, 0.71, 0.29, 0.06, 0.94, 0.05, 0.95]) + + def test_card(self): + self.assertEqual(self.bn_bn.card("Alarm"), 2) + + def test_scope(self): + self.assertListEqual( + self.bn_bn.scope("Alarm"), ["Alarm", "Earthquake", "Burglary"] + ) + + def test_parents(self): + self.assertListEqual(self.bn_bn.parents("Alarm"), ["Earthquake", "Burglary"]) + + def test_values(self): + self.assertListEqual(self.bn_bn.values("Alarm"), ["No", "Yes"]) + + def test_values_idx(self): + self.assertEqual(self.bn_bn.values("Alarm")[1], "Yes") + + +if __name__ == "__main__": + unittest.main(exit=False) diff --git a/bamt/external/pyBN/classes/bayesnet.py b/bamt/external/pyBN/classes/bayesnet.py new file mode 100644 index 0000000..88c6893 --- /dev/null +++ b/bamt/external/pyBN/classes/bayesnet.py @@ -0,0 +1,393 @@ +""" +************** +BayesNet Class +************** + +Overarching class for Discrete Bayesian Networks. + + +Design Specs +------------ + +- Bayesian Network - + + - F - + key: + - rv - + values: + - children - + - parents - + - values - + - cpt - + + - V - + - list of rvs - + + - E - + key: + - rv - + values: + - list of rv's children - +Notes +----- +- Edges can be inferred from Factorization, but Vertex values + must be specified. +""" + +__author__ = """Nicholas Cullen """ + +from copy import deepcopy + +import numpy as np + +from bamt.external.pyBN.utils.class_equivalence import are_class_equivalent +from bamt.external.pyBN.utils.graph import topsort + + +class BayesNet(object): + """ + Overarching class for Bayesian Networks + + """ + + def __init__(self, E=None, value_dict=None, file=None): + """ + Initialize the BayesNet class. + + Arguments + --------- + *V* : a list of strings - vertices in topsort order + *E* : a dict, where key = vertex, val = list of its children + *F* : a dict, + where key = rv, + val = another dict with + keys = + 'parents', + 'values', + 'cpt' + + *V* : a dict + + Notes + ----- + + """ + if file is not None: + import pyBN.io.read as ior + + bn = ior.read_bn(file) + self.V = bn.V + self.E = bn.E + self.F = bn.F + else: + if E is not None: + # assert (value_dict is not None), 'Must set values if E is set.' + self.set_structure(E, value_dict) + else: + self.V = [] + self.E = {} + self.F = {} + + def __eq__(self, y): + """ + Tests whether two Bayesian Networks are + equivalent - i.e. they contain the same + node/edge structure, and equality of + conditional probabilities. + """ + return are_class_equivalent(self, y) + + def __hash__(self): + """ + Allows BayesNet objects to be used + as keys in a dictionary (i.e. 
hashable) + """ + return hash((str(self.V), str(self.E))) + + def copy(self): + V = deepcopy(self.V) + E = deepcopy(self.E) + F = {} + for v in V: + F[v] = {} + F[v]["cpt"] = deepcopy(self.F[v]["cpt"]) + F[v]["parents"] = deepcopy(self.F[v]["parents"]) + F[v]["values"] = deepcopy(self.F[v]["values"]) + bn = BayesNet() + bn.V = V + bn.E = E + bn.F = F + + return bn + + def add_node(self, rv, cpt=[], parents=[], values=[]): + self.V.append(rv) + self.F[rv] = {"cpt": cpt, "parents": parents, "values": values} + + def add_edge(self, u, v): + if not self.has_node(u): + self.add_node(u) + if not self.has_node(v): + self.add_node(v) + if self.has_edge(u, v): + print("Edge already exists") + else: + self.E[u].append(v) + self.F[v]["parents"].append(u) + # self.V = topsort(self.E) + # HOW DO I RECALCULATE CPT? + + def remove_edge(self, u, v): + self.E[u].remove(v) + self.F[v]["parents"].remove(u) + + def reverse_arc(self, u, v): + if self.has_edge(u, v): + self.E[u].remove(v) + self.E[v].append(u) + + def set_data(self, rv, data): + assert isinstance(data, dict), "data must be dictionary" + self.F[rv] = data + + def set_cpt(self, rv, cpt): + self.F[rv]["cpt"] = cpt + + def set_parents(self, rv, parents): + self.F[rv]["parents"] = parents + + def set_values(self, rv, values): + self.F[rv]["values"] = values + + def nodes(self): + for v in self.V: + yield v + + def node_idx(self, rv): + try: + return self.V.index(rv) + except ValueError: + return -1 + + def has_node(self, rv): + return rv in self.V + + def has_edge(self, u, v): + return v in self.E[u] + + def edges(self): + for u in self.nodes(): + for v in self.E[u]: + yield (u, v) + + def num_edges(self): + num = 0 + for u in self.nodes(): + num += len(self.E[u]) + return num + + def num_params(self): + num = 0 + for u in self.nodes(): + num += len(self.F[u]["cpt"]) + return num + + def scope_size(self, rv): + return len(self.F[rv]["parents"]) + 1 + + def num_nodes(self): + return len(self.V) + + def cpt(self, rv): + return self.F[rv]["cpt"] + + def card(self, rv): + return len(self.F[rv]["values"]) + + def scope(self, rv): + scope = [rv] + scope.extend(self.F[rv]["parents"]) + return scope + + def parents(self, rv): + return self.F[rv]["parents"] + + def children(self, rv): + return self.E[rv] + + def degree(self, rv): + return len(self.parents(rv)) + len(self.children(rv)) + + def values(self, rv): + return self.F[rv]["values"] + + def value_idx(self, rv, val): + try: + return self.F[rv]["values"].index(val) + except ValueError: + print("Value Index Error") + return -1 + + def stride(self, rv, n): + if n == rv: + return 1 + else: + card_list = [self.card(rv)] + card_list.extend([self.card(p) for p in self.parents(rv)]) + n_idx = self.parents(rv).index(n) + 1 + return int(np.prod(card_list[0:n_idx])) + + def flat_cpt(self, by_var=False, by_parents=False): + """ + Return all cpt values in the BN as a flattened + numpy array ordered by bn.nodes() - i.e. topsort + """ + if by_var: + cpt = np.array([sum(self.cpt(rv)) for rv in self.nodes()]) + elif by_parents: + cpt = np.array( + [ + sum(self.cpt(rv)[i : (i + self.card(rv))]) + for rv in self.nodes() + for i in range(len(self.cpt(rv)) / self.card(rv)) + ] + ) + else: + cpt = np.array([val for rv in self.nodes() for val in self.cpt(rv)]) + return cpt + + def cpt_indices(self, target, val_dict): + """ + Get the index of the CPT which corresponds + to a dictionary of rv=val sets. 
This can be + used for parameter learning to increment the + appropriate cpt frequency value based on + observations in the data. + + There is definitely a fast way to do this. + -- check if (idx - rv_stride*value_idx) % (rv_card*rv_stride) == 0 + + Arguments + --------- + *target* : a string + Main RV + + *val_dict* : a dictionary, where + key=rv,val=rv value + + """ + stride = dict([(n, self.stride(target, n)) for n in self.scope(target)]) + # if len(val_dict)==len(self.parents(target)): + # idx = sum([self.value_idx(rv,val)*stride[rv] \ + # for rv,val in val_dict.items()]) + # else: + card = dict([(n, self.card(n)) for n in self.scope(target)]) + idx = set(range(len(self.cpt(target)))) + for rv, val in val_dict.items(): + val_idx = self.value_idx(rv, val) + rv_idx = [] + s_idx = val_idx * stride[rv] + while s_idx < len(self.cpt(target)): + rv_idx.extend(range(s_idx, (s_idx + stride[rv]))) + s_idx += stride[rv] * card[rv] + idx = idx.intersection(set(rv_idx)) + + return list(idx) + + def cpt_str_idx(self, rv, idx): + """ + Return string representation of RV=VAL and + Parents=Val for the given idx of the given rv's cpt. + """ + rv_val = self.values(rv)[idx % self.card(rv)] + s = str(rv) + "=" + str(rv_val) + "|" + _idx = 1 + for parent in self.parents(rv): + for val in self.values(parent): + if idx in self.cpt_indices(rv, {rv: rv_val, parent: val}): + s += str(parent) + "=" + str(val) + if _idx < len(self.parents(rv)): + s += "," + _idx += 1 + return s + + def set_structure(self, edge_dict, value_dict=None): + """ + Set the structure of a BayesNet object. This + function is mostly used to instantiate a BN + skeleton after structure learning algorithms. + + See "structure_learn" folder & algorithms + + Arguments + --------- + *edge_dict* : a dictionary, + where key = rv, + value = list of rv's children + NOTE: THIS MUST BE DIRECTED ALREADY! + + *value_dict* : a dictionary, + where key = rv, + value = list of rv's possible values + + Returns + ------- + None + + Effects + ------- + - sets self.V in topsort order from edge_dict + - sets self.E + - creates self.F structure and sets the parents + + Notes + ----- + + """ + + self.V = topsort(edge_dict) + self.E = edge_dict + self.F = dict([(rv, {}) for rv in self.nodes()]) + for rv in self.nodes(): + self.F[rv] = { + "parents": [p for p in self.nodes() if rv in self.children(p)], + "cpt": [], + "values": [], + } + if value_dict is not None: + self.F[rv]["values"] = value_dict[rv] + + def adj_list(self): + """ + Returns adjacency list of lists, where + each list element is a vertex, and each sub-list is + a list of that vertex's neighbors. + """ + adj_list = [[] for _ in self.V] + vi_map = dict((self.V[i], i) for i in range(len(self.V))) + for u, v in self.edges(): + adj_list[vi_map[u]].append(vi_map[v]) + return adj_list + + def moralized_edges(self): + """ + Moralized graph is the original graph PLUS + an edge between every set of common effect + structures - + i.e. all parents of a node are connected. + + This function has be validated. + + Returns + ------- + *e* : a python list of parent-child tuples. 
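An illustrative sketch of this moralization rule on a plain child-list edge dict (keep every parent-child edge, then connect every pair of co-parents and drop direction). The helper below is not part of the class and its name is chosen only for the example:

def moralize(edge_dict):
    """Return sorted undirected edges of the moral graph for a child-list edge dict."""
    parents = {v: [] for v in edge_dict}
    for u, children in edge_dict.items():
        for v in children:
            parents[v].append(u)
    edges = set()
    for v, ps in parents.items():
        for p in ps:
            edges.add(tuple(sorted((p, v))))               # original parent-child edge
        for i in range(len(ps)):
            for j in range(i + 1, len(ps)):
                edges.add(tuple(sorted((ps[i], ps[j]))))   # "marry" co-parents
    return sorted(edges)

# moralize({"A": ["C"], "B": ["C"], "C": []}) -> [("A", "B"), ("A", "C"), ("B", "C")]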
+ + """ + e = set() + for u in self.nodes(): + for p1 in self.parents(u): + e.add((p1, u)) + for p2 in self.parents(u): + if p1 != p2 and (p2, p1) not in e: + e.add((p1, p2)) + return list(e) diff --git a/bamt/external/pyBN/utils/__init__.py b/bamt/external/pyBN/utils/__init__.py new file mode 100644 index 0000000..1de46d8 --- /dev/null +++ b/bamt/external/pyBN/utils/__init__.py @@ -0,0 +1,4 @@ +from bamt.external.pyBN.utils.class_equivalence import * +from bamt.external.pyBN.utils.data import * +from bamt.external.pyBN.utils.graph import * +from bamt.external.pyBN.utils.independence_tests import * diff --git a/bamt/core/nodes/__init__.py b/bamt/external/pyBN/utils/_tests/__init__.py similarity index 100% rename from bamt/core/nodes/__init__.py rename to bamt/external/pyBN/utils/_tests/__init__.py diff --git a/bamt/external/pyBN/utils/_tests/test_independence_tests.py b/bamt/external/pyBN/utils/_tests/test_independence_tests.py new file mode 100644 index 0000000..fefc12c --- /dev/null +++ b/bamt/external/pyBN/utils/_tests/test_independence_tests.py @@ -0,0 +1,56 @@ +""" +**************** +UnitTest +Constraint Tests +**************** + +""" +__author__ = """Nicholas Cullen """ + +import os +import unittest +from os.path import dirname + +import numpy as np +from external.pyBN.independence.constraint_tests import mi_test + + +class ConstraintTestsTestCase(unittest.TestCase): + def setUp(self): + self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + self.data = np.loadtxt( + os.path.join(self.dpath, "lizards.csv"), + delimiter=",", + dtype="int32", + skiprows=1, + ) + + def tearDown(self): + pass + + def test_mi_two_vars_value_a(self): + self.assertEqual(mi_test(self.data[:, (0, 1)]), 0.0004) + + def test_mi_two_vars_value_b(self): + self.assertEqual(mi_test(self.data[:, (0, 2)]), 0.0014) + + def test_mi_two_vars_symmetry(self): + self.assertEqual(mi_test(self.data[:, (1, 0)]), mi_test(self.data[:, (0, 1)])) + + def test_mi_three_vars_value_a(self): + self.assertEqual(mi_test(self.data), 0.0009) + + def test_mi_three_vars_symmetry(self): + self.assertEqual( + mi_test(self.data[:, (0, 1, 2)]), mi_test(self.data[:, (1, 0, 2)]) + ) + + def test_mi_random_three(self): + np.random.seed(3636) + self.data = np.random.randint(1, 10, size=((10000, 3))) + self.assertEqual(mi_test(self.data), 0.0211) + + def test_mi_random_four(self): + np.random.seed(3636) + self.data = np.random.randint(1, 10, size=((10000, 4))) + self.assertEqual(mi_test(self.data), 0.0071) diff --git a/bamt/external/pyBN/utils/_tests/test_markov_blanket.py b/bamt/external/pyBN/utils/_tests/test_markov_blanket.py new file mode 100644 index 0000000..af97eb1 --- /dev/null +++ b/bamt/external/pyBN/utils/_tests/test_markov_blanket.py @@ -0,0 +1,36 @@ +""" +************** +Markov Blanket +Unit Test +************** +""" + +__author__ = """Nicholas Cullen """ + +import os +import unittest +from os.path import dirname + +from external.pyBN.independence.markov_blanket import markov_blanket +from external.pyBN.readwrite.read import read_bn + + +class ConstraintTestsTestCase(unittest.TestCase): + def setUp(self): + self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + self.bn = read_bn(os.path.join(self.dpath, "cmu.bn")) + + def tearDown(self): + pass + + def test_markov_blanket(self): + self.assertDictEqual( + markov_blanket(self.bn), + { + "Alarm": ["Earthquake", "Burglary", "JohnCalls", "MaryCalls"], + "Burglary": ["Alarm", "Earthquake"], + "Earthquake": ["Alarm", "Burglary"], + 
"JohnCalls": ["Alarm"], + "MaryCalls": ["Alarm"], + }, + ) diff --git a/bamt/external/pyBN/utils/_tests/test_orient_edges.py b/bamt/external/pyBN/utils/_tests/test_orient_edges.py new file mode 100644 index 0000000..4242fc1 --- /dev/null +++ b/bamt/external/pyBN/utils/_tests/test_orient_edges.py @@ -0,0 +1,38 @@ +""" +******** +UnitTest +OrientEdges +******** + +""" +__author__ = """Nicholas Cullen """ + +import os +import unittest +from os.path import dirname + +import numpy as np +from external.pyBN.structure_learn.orient_edges import orient_edges_gs, orient_edges_pc + + +class OrientEdgesTestCase(unittest.TestCase): + def setUp(self): + pass + + def tearDown(self): + pass + + def test_orient_edges_pc(self): + e = {0: [1, 2], 1: [0], 2: [0]} + b = {0: [], 1: {2: (0,)}, 2: {1: (0,)}} + self.assertDictEqual(orient_edges_pc(e, b), {0: [1, 2], 1: [], 2: []}) + + def test_orient_edges_gs(self): + e = {0: [1, 2], 1: [0], 2: [0]} + b = {0: [1, 2], 1: [0], 2: [0]} + dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + path = os.path.join(dpath, "lizards.csv") + data = np.loadtxt(path, dtype="int32", skiprows=1, delimiter=",") + self.assertDictEqual( + orient_edges_gs(e, b, data, 0.05), {0: [1, 2], 1: [], 2: []} + ) diff --git a/bamt/external/pyBN/utils/_tests/test_random_sample.py b/bamt/external/pyBN/utils/_tests/test_random_sample.py new file mode 100644 index 0000000..bf3e92d --- /dev/null +++ b/bamt/external/pyBN/utils/_tests/test_random_sample.py @@ -0,0 +1,33 @@ +""" +******** +UnitTest +Misc +******** + +""" +__author__ = """Nicholas Cullen """ + +import os +import unittest +from os.path import dirname + +import numpy as np +from external.pyBN.readwrite.read import read_bn +from external.pyBN.utils.random_sample import random_sample + + +class RandomSampleTestCase(unittest.TestCase): + def setUp(self): + self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + self.bn = read_bn(os.path.join(self.dpath, "cancer.bif")) + + def tearDown(self): + pass + + def test_random_sample(self): + np.random.seed(3636) + sample = random_sample(self.bn, 5) + self.assertListEqual( + list(sample.ravel()), + [0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0], + ) diff --git a/bamt/external/pyBN/utils/class_equivalence.py b/bamt/external/pyBN/utils/class_equivalence.py new file mode 100644 index 0000000..9e284b3 --- /dev/null +++ b/bamt/external/pyBN/utils/class_equivalence.py @@ -0,0 +1,33 @@ +""" +**************** +Equivalence Code +**************** + +This code is for testing whether or not +two Bayesian networks belong to the same +equivalence class - i.e. they have the same +edges when viewed as undirected graphs. + +Also, this code is for actually generating +equivalent Bayesian networks from a given BN. + +""" + + +def are_class_equivalent(x, y): + """ + Check whether two Bayesian networks belong + to the same equivalence class. 
+ """ + are_equivalent = True + + if set(list(x.nodes())) != set(list(y.nodes())): + are_equivalent = False + else: + for rv in x.nodes(): + rv_x_neighbors = set(x.parents(rv)) + set(y.children(rv)) + rv_y_neighbors = set(y.parents(rv)) + set(y.children(rv)) + if rv_x_neighbors != rv_y_neighbors: + are_equivalent = False + break + return are_equivalent diff --git a/bamt/external/pyBN/utils/data.py b/bamt/external/pyBN/utils/data.py new file mode 100644 index 0000000..ad4781a --- /dev/null +++ b/bamt/external/pyBN/utils/data.py @@ -0,0 +1,24 @@ +""" +These are functions for dealing with datasets +that have strings as values - as those values +must be converted to integers in order to be +used in many structure learning functions, for +example. + +There is, of course, the issue of how to get those +string values back into the underlying representation +after the learning occurs.. +""" +import numpy as np + + +def unique_bins(data): + """ + Get the unique values for each column in a dataset. + """ + bins = np.empty(len(data.T), dtype=np.int32) + i = 0 + for col in data.T: + bins[i] = len(np.unique(col)) + i += 1 + return bins diff --git a/bamt/external/pyBN/utils/graph.py b/bamt/external/pyBN/utils/graph.py new file mode 100644 index 0000000..92daf4d --- /dev/null +++ b/bamt/external/pyBN/utils/graph.py @@ -0,0 +1,54 @@ +""" +Collection of Graph Algorithms. + +Any networkx dependencies should exist here only, but +there are currently a few exceptions which will +eventually be corrected. + +""" + +import networkx as nx + + +def would_cause_cycle(e, u, v, reverse=False): + """ + Test if adding the edge u -> v to the BayesNet + object would create a DIRECTED (i.e. illegal) cycle. + """ + G = nx.DiGraph(e) + if reverse: + G.remove_edge(v, u) + G.add_edge(u, v) + try: + nx.find_cycle(G, source=u) + return True + except BaseException: + return False + + +def topsort(edge_dict, root=None): + """ + List of nodes in topological sort order from edge dict + where key = rv and value = list of rv's children + """ + queue = [] + if root is not None: + queue = [root] + else: + for rv in edge_dict.keys(): + prior = True + for p in edge_dict.keys(): + if rv in edge_dict[p]: + prior = False + if prior: + queue.append(rv) + + visited = [] + while queue: + vertex = queue.pop(0) + if vertex not in visited: + visited.append(vertex) + for nbr in edge_dict[vertex]: + queue.append(nbr) + # queue.extend(edge_dict[vertex]) # add all vertex's children + return visited diff --git a/bamt/external/pyBN/utils/independence_tests.py b/bamt/external/pyBN/utils/independence_tests.py new file mode 100644 index 0000000..cc803fe --- /dev/null +++ b/bamt/external/pyBN/utils/independence_tests.py @@ -0,0 +1,181 @@ +""" +****************************** +Conditional Independence Tests +for Constraint-based Learning +****************************** + +Implemented Constraint-based Tests +---------------------------------- +- mutual information +- Pearson's X^2 + +I may consider putting this code into its own class structure. The +main benefit I could see from doing this would be the ability to +cache joint/marginal/conditional probabilities for expedited tests. 
+ +""" + +__author__ = """Nicholas Cullen """ + +import numpy as np + +from bamt.external.pyBN.utils.data import unique_bins + + +def mutual_information(data, conditional=False): + # bins = np.amax(data, axis=0)+1 # read levels for each variable + bins = unique_bins(data) + if len(bins) == 1: + hist, _ = np.histogramdd(data, bins=(bins)) # frequency counts + Px = (hist / hist.sum()) + (1e-7) + MI = -1 * np.sum(Px * np.log(Px)) + return round(MI, 4) + + if len(bins) == 2: + hist, _ = np.histogramdd(data, bins=bins[0:2]) # frequency counts + + Pxy = hist / hist.sum() # joint probability distribution over X,Y,Z + Px = np.sum(Pxy, axis=1) # P(X,Z) + Py = np.sum(Pxy, axis=0) # P(Y,Z) + + PxPy = np.outer(Px, Py) + Pxy += 1e-7 + PxPy += 1e-7 + MI = np.sum(Pxy * np.log(Pxy / (PxPy))) + return round(MI, 4) + elif len(bins) > 2 and conditional: + # CHECK FOR > 3 COLUMNS -> concatenate Z into one column + if len(bins) > 3: + data = data.astype("str") + ncols = len(bins) + for i in range(len(data)): + data[i, 2] = "".join(data[i, 2:ncols]) + data = data.astype(np.int64)[:, 0:3] + + bins = np.amax(data, axis=0) + hist, _ = np.histogramdd(data, bins=bins) # frequency counts + + Pxyz = hist / hist.sum() # joint probability distribution over X,Y,Z + Pz = np.sum(Pxyz, axis=(0, 1)) # P(Z) + Pxz = np.sum(Pxyz, axis=1) # P(X,Z) + Pyz = np.sum(Pxyz, axis=0) # P(Y,Z) + + Pxy_z = Pxyz / (Pz + 1e-7) # P(X,Y | Z) = P(X,Y,Z) / P(Z) + Px_z = Pxz / (Pz + 1e-7) # P(X | Z) = P(X,Z) / P(Z) + Py_z = Pyz / (Pz + 1e-7) # P(Y | Z) = P(Y,Z) / P(Z) + + Px_y_z = np.empty((Pxy_z.shape)) # P(X|Z)P(Y|Z) + for i in range(bins[0]): + for j in range(bins[1]): + for k in range(bins[2]): + Px_y_z[i][j][k] = Px_z[i][k] * Py_z[j][k] + Pxyz += 1e-7 + Pxy_z += 1e-7 + Px_y_z += 1e-7 + MI = np.sum(Pxyz * np.log(Pxy_z / (Px_y_z))) + + return round(MI, 4) + elif len(bins) > 2 and conditional == False: + data = data.astype("str") + ncols = len(bins) + for i in range(len(data)): + data[i, 1] = "".join(data[i, 1:ncols]) + data = data.astype(np.int64)[:, 0:2] + + hist, _ = np.histogramdd(data, bins=bins[0:2]) # frequency counts + + Pxy = hist / hist.sum() # joint probability distribution over X,Y,Z + Px = np.sum(Pxy, axis=1) # P(X,Z) + Py = np.sum(Pxy, axis=0) # P(Y,Z) + + PxPy = np.outer(Px, Py) + Pxy += 1e-7 + PxPy += 1e-7 + MI = np.sum(Pxy * np.log(Pxy / (PxPy))) + return round(MI, 4) + + +def entropy(data): + """ + In the context of structure learning, and more specifically + in constraint-based algorithms which rely on the mutual information + test for conditional independence, it has been proven that the variable + X in a set which MAXIMIZES mutual information is also the variable which + MINIMIZES entropy. This fact can be used to reduce the computational + requirements of tests based on the following relationship: + + Entropy is related to marginal mutual information as follows: + MI(X;Y) = H(X) - H(X|Y) + + Entropy is related to conditional mutual information as follows: + MI(X;Y|Z) = H(X|Z) - H(X|Y,Z) + + For one varibale, H(X) is equal to the following: + -1 * sum of p(x) * log(p(x)) + + For two variables H(X|Y) is equal to the following: + sum over x,y of p(x,y)*log(p(y)/p(x,y)) + + For three variables, H(X|Y,Z) is equal to the following: + -1 * sum of p(x,y,z) * log(p(x|y,z)), + where p(x|y,z) = p(x,y,z)/p(y)*p(z) + Arguments + ---------- + *data* : a nested numpy array + The data from which to learn - must have at least three + variables. All conditioned variables (i.e. Z) are compressed + into one variable. 
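The single-variable formula above can be checked directly from frequency counts; a small standalone sketch (natural log, matching np.log as used in this module):

import numpy as np

def entropy_from_counts(counts):
    """H(X) = -sum p(x) * log p(x), computed from raw frequency counts."""
    p = np.asarray(counts, dtype=float)
    p = p / p.sum()
    p = p[p > 0]                     # skip empty bins
    return float(-np.sum(p * np.log(p)))

# A fair coin: entropy_from_counts([50, 50]) -> log(2) ~= 0.6931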
+ + Returns + ------- + *H* : entropy value + + """ + try: + cols = data.shape[1] + except IndexError: + cols = 1 + + bins = np.amax(data, axis=0) + if isinstance(bins, np.ndarray): + for i in range(len(bins)): + if bins[i] == 0: + bins[i] = 1 + else: + bins = 1 + # bins = unique_bins(data) + + if cols == 1: + hist, _ = np.histogramdd(data, bins=(bins)) # frequency counts + Px = hist / hist.sum() + (1e-7) + H = -1 * np.sum(Px * np.log(Px)) + + elif cols == 2: # two variables -> assume X then Y + hist, _ = np.histogramdd(data, bins=bins[0:2]) # frequency counts + + Pxy = hist / hist.sum() # joint probability distribution over X,Y,Z + Py = np.sum(Pxy, axis=0) # P(Y) + Py += 1e-7 + Pxy += 1e-7 + H = np.sum(Pxy * np.log(Py / Pxy)) + + else: + # CHECK FOR > 3 COLUMNS -> concatenate Z into one column + if cols > 3: + data = data.astype("str") + ncols = len(bins) + for i in range(len(data)): + data[i, 2] = "".join(data[i, 2:ncols]) + data = data.astype(np.int64)[:, 0:3] + + bins = np.amax(data, axis=0) + hist, _ = np.histogramdd(data, bins=bins) # frequency counts + + Pxyz = hist / hist.sum() # joint probability distribution over X,Y,Z + Pyz = np.sum(Pxyz, axis=0) + + Pxyz += 1e-7 # for log -inf + Pyz += 1e-7 + H = -1 * np.sum(Pxyz * np.log(Pxyz)) + np.sum(Pyz * np.log(Pyz)) + + return round(H, 4) diff --git a/bamt/external/pyitlib/DiscreteRandomVariableUtils.py b/bamt/external/pyitlib/DiscreteRandomVariableUtils.py new file mode 100644 index 0000000..004a237 --- /dev/null +++ b/bamt/external/pyitlib/DiscreteRandomVariableUtils.py @@ -0,0 +1,852 @@ +import warnings + +import numpy as np +import pandas as pd +import sklearn.preprocessing + +NONE_REPLACEMENT = -32768 + + +def information_mutual_conditional( + x, + y, + z, + cartesian_product=False, + base=2, + fill_value=-1, + estimator="ML", + alphabet_x=None, + alphabet_y=None, + Alphabet_Z=None, + keep_dims=False, +): + x, fill_value_X = _sanitise_array_input(x, fill_value) + y, fill_value_Y = _sanitise_array_input(y, fill_value) + z, fill_value_Z = _sanitise_array_input(z, fill_value) + if alphabet_x is not None: + alphabet_x, fill_value_Alphabet_X = _sanitise_array_input( + alphabet_x, fill_value + ) + alphabet_x, _ = _autocreate_alphabet(alphabet_x, fill_value_Alphabet_X) + else: + alphabet_x, fill_value_Alphabet_X = _autocreate_alphabet(x, fill_value_X) + if alphabet_y is not None: + alphabet_y, fill_value_Alphabet_Y = _sanitise_array_input( + alphabet_y, fill_value + ) + alphabet_y, _ = _autocreate_alphabet(alphabet_y, fill_value_Alphabet_Y) + else: + alphabet_y, fill_value_Alphabet_Y = _autocreate_alphabet(y, fill_value_Y) + if Alphabet_Z is not None: + Alphabet_Z, fill_value_Alphabet_Z = _sanitise_array_input( + Alphabet_Z, fill_value + ) + Alphabet_Z, _ = _autocreate_alphabet(Alphabet_Z, fill_value_Alphabet_Z) + else: + Alphabet_Z, fill_value_Alphabet_Z = _autocreate_alphabet(z, fill_value_Z) + + if x.size == 0: + raise ValueError("arg X contains no elements") + if y.size == 0: + raise ValueError("arg Y contains no elements") + if z.size == 0: + raise ValueError("arg Z contains no elements") + if np.any(_isnan(x)): + raise ValueError("arg X contains NaN values") + if np.any(_isnan(y)): + raise ValueError("arg Y contains NaN values") + if np.any(_isnan(z)): + raise ValueError("arg Z contains NaN values") + if alphabet_x.size == 0: + raise ValueError("arg Alphabet_X contains no elements") + if np.any(_isnan(alphabet_x)): + raise ValueError("arg Alphabet_X contains NaN values") + if alphabet_y.size == 0: + raise ValueError("arg Alphabet_Y 
contains no elements") + if np.any(_isnan(alphabet_y)): + raise ValueError("arg Alphabet_Y contains NaN values") + if Alphabet_Z.size == 0: + raise ValueError("arg Alphabet_Z contains no elements") + if np.any(_isnan(Alphabet_Z)): + raise ValueError("arg Alphabet_Z contains NaN values") + if _isnan(fill_value_X): + raise ValueError("fill value for arg X is NaN") + if _isnan(fill_value_Y): + raise ValueError("fill value for arg Y is NaN") + if _isnan(fill_value_Z): + raise ValueError("fill value for arg Z is NaN") + if x.shape[:-1] != alphabet_x.shape[:-1]: + raise ValueError("leading dimensions of args X and Alphabet_X do not " "match") + if y.shape[:-1] != alphabet_y.shape[:-1]: + raise ValueError("leading dimensions of args Y and Alphabet_Y do not " "match") + if z.shape[:-1] != Alphabet_Z.shape[:-1]: + raise ValueError("leading dimensions of args Z and Alphabet_Z do not " "match") + if not cartesian_product and (x.shape != y.shape or x.shape != z.shape): + raise ValueError("dimensions of args X, Y, Z do not match") + if cartesian_product and (x.shape[-1] != y.shape[-1] or x.shape[-1] != z.shape[-1]): + raise ValueError("trailing dimensions of args X, Y, Z do not match") + if not (np.isscalar(base) and np.isreal(base) and base > 0): + raise ValueError("arg base not a positive real-valued scalar") + + S, fill_value = _map_observations_to_integers( + (x, alphabet_x, y, alphabet_y, z, Alphabet_Z), + ( + fill_value_X, + fill_value_Alphabet_X, + fill_value_Y, + fill_value_Alphabet_Y, + fill_value_Z, + fill_value_Alphabet_Z, + ), + ) + x, alphabet_x, y, alphabet_y, z, Alphabet_Z = S + + if not cartesian_product: + I = np.empty(x.shape[:-1]) + if I.ndim > 0: + I[:] = np.NaN + else: + I = np.float64(np.NaN) + else: + shapeI_Z = z.shape[:-1] + z = np.reshape(z, (-1, z.shape[-1])) + Alphabet_Z = np.reshape(Alphabet_Z, (-1, Alphabet_Z.shape[-1])) + I = [] + for i in range(z.shape[0]): + def f(X, Y, Alphabet_X, Alphabet_Y): + return information_mutual_conditional( + X, + Y, + z[i], + False, + base, + fill_value, + estimator, + Alphabet_X, + Alphabet_Y, + Alphabet_Z[i], + ) + + I.append(_cartesian_product_apply(x, y, f, alphabet_x, alphabet_y)) + shapeI_XY = I[0].shape + if len(shapeI_Z) == 0: + I = np.array(I)[0].reshape(shapeI_XY) + else: + I = np.array(I) + I = np.rollaxis(I, 0, len(I.shape)) + I = I.reshape(np.append(shapeI_XY, shapeI_Z).astype("int")) + return I + + # Re-shape H, X,Y,Z so that we may handle multi-dimensional arrays + # equivalently and iterate across 0th axis + x = np.reshape(x, (-1, x.shape[-1])) + y = np.reshape(y, (-1, y.shape[-1])) + z = np.reshape(z, (-1, z.shape[-1])) + alphabet_x = np.reshape(alphabet_x, (-1, alphabet_x.shape[-1])) + alphabet_y = np.reshape(alphabet_y, (-1, alphabet_y.shape[-1])) + Alphabet_Z = np.reshape(Alphabet_Z, (-1, Alphabet_Z.shape[-1])) + orig_shape_I = I.shape + I = np.reshape(I, (-1, 1)) + + for i in range(x.shape[0]): + I_ = ( + entropy_joint( + np.vstack((x[i], z[i])), + base, + fill_value, + estimator, + _vstack_pad((alphabet_x[i], Alphabet_Z[i]), fill_value), + ) + + entropy_joint( + np.vstack((y[i], z[i])), + base, + fill_value, + estimator, + _vstack_pad((alphabet_y[i], Alphabet_Z[i]), fill_value), + ) + - entropy_joint( + np.vstack((x[i], y[i], z[i])), + base, + fill_value, + estimator, + _vstack_pad((alphabet_x[i], alphabet_y[i], Alphabet_Z[i]), fill_value), + ) + - entropy_joint(z[i], base, fill_value, estimator, Alphabet_Z[i]) + ) + I[i] = I_ + + # Reverse re-shaping + I = np.reshape(I, orig_shape_I) + + if keep_dims and not 
cartesian_product: + I = I[..., np.newaxis] + + return I + + +def information_mutual( + X, + Y=None, + cartesian_product=False, + base=2, + fill_value=-1, + estimator="ML", + Alphabet_X=None, + Alphabet_Y=None, + keep_dims=False, +): + H_conditional = entropy_conditional( + X, Y, cartesian_product, base, fill_value, estimator, Alphabet_X, Alphabet_Y + ) + H = entropy(X, base, fill_value, estimator, Alphabet_X) + + H = np.reshape( + H, np.append(H.shape, np.ones(H_conditional.ndim - H.ndim)).astype("int") + ) + + H = H - H_conditional + + if keep_dims and not cartesian_product: + H = H[..., np.newaxis] + + return H + + +def entropy_pmf(P, base=2, require_valid_pmf=True, keep_dims=False): + P, _ = _sanitise_array_input(P) + + if P.size == 0: + raise ValueError("arg P contains no elements") + if np.any(_isnan(P)): + raise ValueError("arg P contains NaN values") + if np.any(np.logical_or(P < 0, P > 1)): + raise ValueError("arg P contains values outside unit interval") + if require_valid_pmf and not np.allclose(np.sum(P, axis=-1), 1): + raise ValueError("arg P does not sum to unity across last axis") + if not (np.isscalar(base) and np.isreal(base) and base > 0): + raise ValueError("arg base not a positive real-valued scalar") + + H = -np.sum(P * np.log2(P + np.spacing(0)), axis=-1) + H = H / np.log2(base) + + if keep_dims: + H = H[..., np.newaxis] + + return H + + +def entropy_joint( + X, base=2, fill_value=-1, estimator="ML", Alphabet_X=None, keep_dims=False +): + X, fill_value_X = _sanitise_array_input(X, fill_value) + if Alphabet_X is not None: + Alphabet_X, fill_value_Alphabet_X = _sanitise_array_input( + Alphabet_X, fill_value + ) + Alphabet_X, _ = _autocreate_alphabet(Alphabet_X, fill_value_Alphabet_X) + else: + Alphabet_X, fill_value_Alphabet_X = _autocreate_alphabet(X, fill_value_X) + + if X.size == 0: + raise ValueError("arg X contains no elements") + if np.any(_isnan(X)): + raise ValueError("arg X contains NaN values") + if Alphabet_X.size == 0: + raise ValueError("arg Alphabet_X contains no elements") + if np.any(_isnan(Alphabet_X)): + raise ValueError("arg Alphabet_X contains NaN values") + if _isnan(fill_value_X): + raise ValueError("fill value for arg X is NaN") + if X.shape[:-1] != Alphabet_X.shape[:-1]: + raise ValueError("leading dimensions of args X and Alphabet_X do not " "match") + if not (np.isscalar(base) and np.isreal(base) and base > 0): + raise ValueError("arg base not a positive real-valued scalar") + + S, fill_value = _map_observations_to_integers( + (X, Alphabet_X), (fill_value_X, fill_value_Alphabet_X) + ) + X, Alphabet_X = S + + X = np.reshape(X, (-1, X.shape[-1])) + Alphabet_X = np.reshape(Alphabet_X, (-1, Alphabet_X.shape[-1])) + + _verify_alphabet_sufficiently_large(X, Alphabet_X, fill_value) + + for i in range(X.shape[0]): + X = X[:, X[i].argsort(kind="mergesort")] + + B = np.any(X[:, 1:] != X[:, :-1], axis=0) + I = np.append(np.where(B), X.shape[1] - 1) + L = np.diff(np.append(-1, I)) + + alphabet_X = X[:, I] + if estimator != "ML": + n_additional_empty_bins = _determine_number_additional_empty_bins( + L, alphabet_X, Alphabet_X, fill_value + ) + else: + n_additional_empty_bins = 0 + L, _ = _remove_counts_at_fill_value(L, alphabet_X, fill_value) + if not np.any(L): + return np.float64(np.NaN) + + # P_0 is the probability mass assigned to each additional empty bin + P, P_0 = _estimate_probabilities(L, estimator, n_additional_empty_bins) + H_0 = n_additional_empty_bins * P_0 * -np.log2(P_0 + np.spacing(0)) / np.log2(base) + H = entropy_pmf(P, base, 
require_valid_pmf=False) + H_0 + + if keep_dims: + H = H[..., np.newaxis] + + return H + + +def entropy_conditional( + X, + Y=None, + cartesian_product=False, + base=2, + fill_value=-1, + estimator="ML", + Alphabet_X=None, + Alphabet_Y=None, + keep_dims=False, +): + if Y is None: + Y = X + cartesian_product = True + Alphabet_Y = Alphabet_X + + X, fill_value_X = _sanitise_array_input(X, fill_value) + Y, fill_value_Y = _sanitise_array_input(Y, fill_value) + if Alphabet_X is not None: + Alphabet_X, fill_value_Alphabet_X = _sanitise_array_input( + Alphabet_X, fill_value + ) + Alphabet_X, _ = _autocreate_alphabet(Alphabet_X, fill_value_Alphabet_X) + else: + Alphabet_X, fill_value_Alphabet_X = _autocreate_alphabet(X, fill_value_X) + if Alphabet_Y is not None: + Alphabet_Y, fill_value_Alphabet_Y = _sanitise_array_input( + Alphabet_Y, fill_value + ) + Alphabet_Y, _ = _autocreate_alphabet(Alphabet_Y, fill_value_Alphabet_Y) + else: + Alphabet_Y, fill_value_Alphabet_Y = _autocreate_alphabet(Y, fill_value_Y) + + if X.size == 0: + raise ValueError("arg X contains no elements") + if Y.size == 0: + raise ValueError("arg Y contains no elements") + if np.any(_isnan(X)): + raise ValueError("arg X contains NaN values") + if np.any(_isnan(Y)): + raise ValueError("arg Y contains NaN values") + if Alphabet_X.size == 0: + raise ValueError("arg Alphabet_X contains no elements") + if np.any(_isnan(Alphabet_X)): + raise ValueError("arg Alphabet_X contains NaN values") + if Alphabet_Y.size == 0: + raise ValueError("arg Alphabet_Y contains no elements") + if np.any(_isnan(Alphabet_Y)): + raise ValueError("arg Alphabet_Y contains NaN values") + if _isnan(fill_value_X): + raise ValueError("fill value for arg X is NaN") + if _isnan(fill_value_Y): + raise ValueError("fill value for arg Y is NaN") + if X.shape[:-1] != Alphabet_X.shape[:-1]: + raise ValueError("leading dimensions of args X and Alphabet_X do not " "match") + if Y.shape[:-1] != Alphabet_Y.shape[:-1]: + raise ValueError("leading dimensions of args Y and Alphabet_Y do not " "match") + if not cartesian_product and X.shape != Y.shape: + raise ValueError("dimensions of args X and Y do not match") + if cartesian_product and X.shape[-1] != Y.shape[-1]: + raise ValueError("trailing dimensions of args X and Y do not match") + if not (np.isscalar(base) and np.isreal(base) and base > 0): + raise ValueError("arg base not a positive real-valued scalar") + + S, fill_value = _map_observations_to_integers( + (X, Alphabet_X, Y, Alphabet_Y), + (fill_value_X, fill_value_Alphabet_X, fill_value_Y, fill_value_Alphabet_Y), + ) + X, Alphabet_X, Y, Alphabet_Y = S + + if not cartesian_product: + H = np.empty(X.shape[:-1]) + if H.ndim > 0: + H[:] = np.NaN + else: + H = np.float64(np.NaN) + else: + + def f(X, Y, Alphabet_X, Alphabet_Y): + return entropy_conditional( + X, Y, False, base, fill_value, estimator, Alphabet_X, Alphabet_Y + ) + + return _cartesian_product_apply(X, Y, f, Alphabet_X, Alphabet_Y) + + # Re-shape H, X and Y, so that we may handle multi-dimensional arrays + # equivalently and iterate across 0th axis + X = np.reshape(X, (-1, X.shape[-1])) + Y = np.reshape(Y, (-1, Y.shape[-1])) + Alphabet_X = np.reshape(Alphabet_X, (-1, Alphabet_X.shape[-1])) + Alphabet_Y = np.reshape(Alphabet_Y, (-1, Alphabet_Y.shape[-1])) + orig_shape_H = H.shape + H = np.reshape(H, (-1, 1)) + + for i in range(X.shape[0]): + H[i] = entropy_joint( + np.vstack((X[i], Y[i])), + base, + fill_value, + estimator, + _vstack_pad((Alphabet_X[i], Alphabet_Y[i]), fill_value), + ) - entropy(Y[i], base, 
fill_value, estimator, Alphabet_Y[i]) + + # Reverse re-shaping + H = np.reshape(H, orig_shape_H) + + if keep_dims and not cartesian_product: + H = H[..., np.newaxis] + + return H + + +def entropy(X, base=2, fill_value=-1, estimator="ML", Alphabet_X=None, keep_dims=False): + X, fill_value_X = _sanitise_array_input(X, fill_value) + if Alphabet_X is not None: + Alphabet_X, fill_value_Alphabet_X = _sanitise_array_input( + Alphabet_X, fill_value + ) + Alphabet_X, _ = _autocreate_alphabet(Alphabet_X, fill_value_Alphabet_X) + else: + Alphabet_X, fill_value_Alphabet_X = _autocreate_alphabet(X, fill_value_X) + + if X.size == 0: + raise ValueError("arg X contains no elements") + if np.any(_isnan(X)): + raise ValueError("arg X contains NaN values") + if Alphabet_X.size == 0: + raise ValueError("arg Alphabet_X contains no elements") + if np.any(_isnan(Alphabet_X)): + raise ValueError("arg Alphabet_X contains NaN values") + if _isnan(fill_value_X): + raise ValueError("fill value for arg X is NaN") + if X.shape[:-1] != Alphabet_X.shape[:-1]: + raise ValueError("leading dimensions of args X and Alphabet_X do not " "match") + if not (np.isscalar(base) and np.isreal(base) and base > 0): + raise ValueError("arg base not a positive real-valued scalar") + + S, fill_value = _map_observations_to_integers( + (X, Alphabet_X), (fill_value_X, fill_value_Alphabet_X) + ) + X, Alphabet_X = S + + H = np.empty(X.shape[:-1]) + if H.ndim > 0: + H[:] = np.NaN + else: + H = np.float64(np.NaN) + + # Re-shape H and X, so that we may handle multi-dimensional arrays + # equivalently and iterate across 0th axis + X = np.reshape(X, (-1, X.shape[-1])) + Alphabet_X = np.reshape(Alphabet_X, (-1, Alphabet_X.shape[-1])) + orig_shape_H = H.shape + H = np.reshape(H, (-1, 1)) + + _verify_alphabet_sufficiently_large(X, Alphabet_X, fill_value) + + # NB: This is not joint entropy. 
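# A standalone sketch of the counting trick applied per row just below: sort the row,
# locate change points, and difference them to obtain per-symbol frequencies. The sample
# values are invented for the illustration.

import numpy as np

row = np.array([2, 0, 2, 1, 0, 2])
x_sorted = np.sort(row)                                            # [0, 0, 1, 2, 2, 2]
last = np.append(np.where(x_sorted[1:] != x_sorted[:-1]), x_sorted.size - 1)
counts = np.diff(np.append(-1, last))                              # run lengths [2, 1, 3]
symbols = x_sorted[last]                                           # [0, 1, 2]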
Elements in each row are sorted + # independently + X = np.sort(X, axis=1) + + # Compute symbol run-lengths + # Compute symbol change indicators + B = X[:, 1:] != X[:, :-1] + for i in range(X.shape[0]): + # Obtain symbol change positions + I = np.append(np.where(B[i]), X.shape[1] - 1) + # Compute run lengths + L = np.diff(np.append(-1, I)) + + alphabet_X = X[i, I] + if estimator != "ML": + n_additional_empty_bins = _determine_number_additional_empty_bins( + L, alphabet_X, Alphabet_X[i], fill_value + ) + else: + n_additional_empty_bins = 0 + L, _ = _remove_counts_at_fill_value(L, alphabet_X, fill_value) + if not np.any(L): + continue + + # P_0 is the probability mass assigned to each additional empty bin + P, P_0 = _estimate_probabilities(L, estimator, n_additional_empty_bins) + H_0 = ( + n_additional_empty_bins + * P_0 + * -np.log2(P_0 + np.spacing(0)) + / np.log2(base) + ) + H[i] = entropy_pmf(P, base, require_valid_pmf=False) + H_0 + + # Reverse re-shaping + H = np.reshape(H, orig_shape_H) + + if keep_dims: + H = H[..., np.newaxis] + + return H + + +def _autocreate_alphabet(X, fill_value): + Lengths = np.apply_along_axis(lambda x: np.unique(x).size, axis=-1, arr=X) + max_length = np.max(Lengths) + + def pad_with_fillvalue(x): + return np.append(x, np.tile(fill_value, int(max_length - x.size))) + + Alphabet = np.apply_along_axis( + lambda x: pad_with_fillvalue(np.unique(x)), axis=-1, arr=X + ) + return (Alphabet, fill_value) + + +def _sanitise_array_input(x, fill_value=-1): + # Avoid Python 3 issues with numpy arrays containing None elements + if np.any(np.equal(x, None)) or fill_value is None: + x = np.array(x, copy=False) + assert np.all(x != NONE_REPLACEMENT) + m = np.equal(x, None) + x = np.where(m, NONE_REPLACEMENT, x) + if fill_value is None: + x = np.array(x, copy=False) + fill_value = NONE_REPLACEMENT + + if isinstance(x, (pd.core.frame.DataFrame, pd.core.series.Series)): + # Create masked array, honouring Dataframe/Series missing entries + # NB: We transpose for convenience, so that quantities are computed for + # each column + x = np.ma.MaskedArray(x, x.isnull()) + + if isinstance(x, np.ma.MaskedArray): + fill_value = x.fill_value + + if np.any(x == fill_value): + warnings.warn("Masked array contains data equal to fill value") + + if x.dtype.kind in ("S", "U"): + kind = x.dtype.kind + current_dtype_len = int(x.dtype.str.split(kind)[1]) + if current_dtype_len < len(fill_value): + # Fix numpy's broken array string type behaviour which causes + # X.filled() placeholder entries to be no longer than + # non-placeholder entries + warnings.warn( + "Changing numpy array dtype internally to " + "accommodate fill_value string length" + ) + m = x.mask + x = np.array(x.filled(), dtype=kind + str(len(fill_value))) + x[m] = fill_value + else: + x = x.filled() + else: + x = x.filled() + else: + x = np.array(x, copy=False) + + if x.dtype.kind not in "biufcmMOSUV": + raise TypeError("Unsupported array dtype") + + if x.size == 1 and x.ndim == 0: + x = np.array((x,)) + + return x, np.array(fill_value) + + +def _map_observations_to_integers(Symbol_matrices, Fill_values): + assert len(Symbol_matrices) == len(Fill_values) + FILL_VALUE = -1 + if np.any([A.dtype != "int" for A in Symbol_matrices]) or np.any( + np.array(Fill_values) != FILL_VALUE + ): + L = sklearn.preprocessing.LabelEncoder() + F = [np.atleast_1d(v) for v in Fill_values] + L.fit(np.concatenate([A.ravel() for A in Symbol_matrices] + F)) + Symbol_matrices = [ + L.transform(A.ravel()).reshape(A.shape) for A in Symbol_matrices + ] + 
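# The label-encoding step above can be exercised in isolation; a small sketch of the same
# idea (the symbols and values are invented for the example):

import numpy as np
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
raw = np.array(["low", "high", "high", "low", "mid"])
encoded = le.fit_transform(raw)            # classes sorted -> array([1, 0, 0, 1, 2])
restored = le.inverse_transform(encoded)   # round-trips back to the original symbols
assert list(restored) == list(raw)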
Fill_values = [L.transform(np.atleast_1d(f)) for f in Fill_values] + + for A, f in zip(Symbol_matrices, Fill_values): + assert not np.any(A == FILL_VALUE) + A[A == f] = FILL_VALUE + + assert np.all([A.dtype == "int" for A in Symbol_matrices]) + return Symbol_matrices, FILL_VALUE + + +def _isnan(X): + X = np.array(X, copy=False) + if X.dtype in ("int", "float"): + return np.isnan(X) + else: + f = np.vectorize(_isnan_element) + return f(X) + + +def _isnan_element(x): + if isinstance(x, type(np.nan)): + return np.isnan(x) + else: + return False + + +def _determine_number_additional_empty_bins( + Counts, Alphabet, Full_Alphabet, fill_value +): + alphabet_sizes = np.sum(np.atleast_2d(Full_Alphabet) != fill_value, axis=-1) + if np.any(alphabet_sizes != fill_value): + joint_alphabet_size = np.prod(alphabet_sizes[alphabet_sizes > 0]) + if joint_alphabet_size <= 0: + raise ValueError( + "Numerical overflow detected. Joint alphabet " "size too large." + ) + else: + joint_alphabet_size = 0 + return joint_alphabet_size - np.sum( + np.all(np.atleast_2d(Alphabet) != fill_value, axis=0) + ) + + +def _estimate_probabilities(Counts, estimator, n_additional_empty_bins=0): + # TODO Documentation should present the following guidelines: + # 1) Good-Turing may be used if slope requirement satisfied and if + # unobserved symbols have been defined (TODO Clarify what the requirement + # is) + # 2) James-Stein approach may be used as an alternative + # 3) Dirichlet prior may be used in all other cases + + assert (np.sum(Counts) > 0) + assert (np.all(Counts.astype('int') == Counts)) + assert (n_additional_empty_bins >= 0) + Counts = Counts.astype('int') + + if isinstance(estimator, str): + estimator = estimator.upper().replace(' ', '') + + if np.isreal(estimator) or estimator in ('ML', 'PERKS', 'MINIMAX'): + if np.isreal(estimator): + alpha = estimator + elif estimator == 'PERKS': + alpha = 1.0 / (Counts.size + n_additional_empty_bins) + elif estimator == 'MINIMAX': + alpha = np.sqrt(np.sum(Counts)) / \ + (Counts.size + n_additional_empty_bins) + else: + alpha = 0 + Theta = (Counts + alpha) / \ + (1.0 * np.sum(Counts) + alpha * (Counts.size + n_additional_empty_bins)) + # Theta_0 is the probability mass assigned to each additional empty bin + if n_additional_empty_bins > 0: + Theta_0 = alpha / (1.0 * np.sum(Counts) + + alpha * (Counts.size + n_additional_empty_bins)) + else: + Theta_0 = 0 + + elif estimator == 'GOOD-TURING': + # TODO We could also add a Chen-Chao vocabulary size estimator (See + # Bhat Suma's thesis) + + # The following notation is based on Gale and Sampson (1995) + # Determine histogram of counts N_r (index r denotes count) + X = np.sort(Counts) + B = X[1:] != X[:-1] # Compute symbol change indicators + I = np.append(np.where(B), X.size - 1) # Obtain symbol change positions + N_r = np.zeros(X[I[-1]] + 1) + N_r[X[I]] = np.diff(np.append(-1, I)) # Compute run lengths + N_r[0] = 0 # Ensures that unobserved symbols do not interfere + + # Compute Z_r, a locally averaged version of N_r + R = np.where(N_r)[0] + Q = np.append(0, R[:-1]) + T = np.append(R[1:], 2 * R[-1] - Q[-1]) + Z_r = np.zeros_like(N_r) + Z_r[R] = N_r[R] / (0.5 * (T - Q)) + + # Fit least squares regression line to plot of log(Z_r) versus log(r) + x = np.log10(np.arange(1, Z_r.size)) + with np.errstate(invalid='ignore', divide='ignore'): + y = np.log10(Z_r[1:]) + x = x[np.isfinite(y)] + y = y[np.isfinite(y)] + m, c = np.linalg.lstsq(np.vstack([x, np.ones(x.size)]).T, y, + rcond=None)[0] + if m >= -1: + warnings.warn("Regression slope < -1 
requirement in linear " + "Good-Turing estimate not satisfied") + # Compute smoothed value of N_r based on interpolation + # We need to refer to SmoothedN_{r+1} for all observed values of r + SmoothedN_r = np.zeros(N_r.size + 1) + SmoothedN_r[1:] = 10 ** (np.log10(np.arange(1, SmoothedN_r.size)) * + m + c) + + # Determine threshold value of r at which to use smoothed values of N_r + # (SmoothedN_r), as apposed to straightforward N_r. + # Variance of Turing estimate + with np.errstate(invalid='ignore', divide='ignore'): + VARr_T = (np.arange(N_r.size) + 1) ** 2 * \ + (1.0 * np.append(N_r[1:], 0) / (N_r ** 2)) * \ + (1 + np.append(N_r[1:], 0) / N_r) + x = (np.arange(N_r.size) + 1) * 1.0 * np.append(N_r[1:], 0) / N_r + y = (np.arange(N_r.size) + 1) * \ + 1.0 * SmoothedN_r[1:] / (SmoothedN_r[:-1]) + assert (np.isinf(VARr_T[0]) or np.isnan(VARr_T[0])) + turing_is_sig_diff = np.abs(x - y) > 1.96 * np.sqrt(VARr_T) + assert (turing_is_sig_diff[0] == np.array(False)) + # NB: 0th element can be safely ignored, since always 0 + T = np.where(turing_is_sig_diff == np.array(False))[0] + if T.size > 1: + thresh_r = T[1] + # Use smoothed estimates from the first non-significant + # np.abs(SmoothedN_r-N_r) position onwards + SmoothedN_r[:thresh_r] = N_r[:thresh_r] + else: + # Use only non-smoothed estimates (except for SmoothedN_r[-1]) + SmoothedN_r[:-1] = N_r + + # Estimate probability of encountering one particular symbol among the + # objects observed r times, r>0 + p_r = np.zeros(N_r.size) + N = np.sum(Counts) + p_r[1:] = (np.arange(1, N_r.size) + 1) * \ + 1.0 * SmoothedN_r[2:] / (SmoothedN_r[1:-1] * N) + # Estimate probability of observing any unseen symbol + p_r[0] = 1.0 * N_r[1] / N + + # Assign probabilities to observed symbols + Theta = np.array([p_r[r] for r in Counts]) + Theta[Counts == 0] = 0 + + # Normalise probabilities for observed symbols, so that they sum to one + if np.any(Counts == 0) or n_additional_empty_bins > 0: + Theta = (1 - p_r[0]) * Theta / np.sum(Theta) + else: + warnings.warn("No unobserved outcomes specified. 
Disregarding the " + "probability mass allocated to any unobserved " + "outcomes.") + Theta = Theta / np.sum(Theta) + + # Divide p_0 among unobserved symbols + with np.errstate(invalid='ignore', divide='ignore'): + p_emptybin = p_r[0] / (np.sum(Counts == 0) + + n_additional_empty_bins) + Theta[Counts == 0] = p_emptybin + # Theta_0 is the probability mass assigned to each additional empty bin + if n_additional_empty_bins > 0: + Theta_0 = p_emptybin + else: + Theta_0 = 0 + + elif estimator == 'JAMES-STEIN': + Theta, _ = _estimate_probabilities(Counts, 'ML') + p_uniform = 1.0 / (Counts.size + n_additional_empty_bins) + with np.errstate(invalid='ignore', divide='ignore'): + Lambda = (1 - np.sum(Theta ** 2)) / \ + ((np.sum(Counts) - 1) * + (np.sum((p_uniform - Theta) ** 2) + + n_additional_empty_bins * p_uniform ** 2)) + + if Lambda > 1: + Lambda = 1 + elif Lambda < 0: + Lambda = 0 + elif np.isnan(Lambda): + Lambda = 1 + + Theta = Lambda * p_uniform + (1 - Lambda) * Theta + # Theta_0 is the probability mass assigned to each additional empty bin + if n_additional_empty_bins > 0: + Theta_0 = Lambda * p_uniform + else: + Theta_0 = 0 + else: + raise ValueError("invalid value specified for estimator") + + return Theta, Theta_0 + + +def _remove_counts_at_fill_value(Counts, Alphabet, fill_value): + I = np.any(np.atleast_2d(Alphabet) == fill_value, axis=0) + if np.any(I): + Counts = Counts[~I] + Alphabet = Alphabet.T[~I].T + return (Counts, Alphabet) + + +def _cartesian_product_apply(X, Y, function, Alphabet_X=None, Alphabet_Y=None): + assert X.ndim > 0 and Y.ndim > 0 + assert X.size > 0 and Y.size > 0 + if Alphabet_X is not None or Alphabet_Y is not None: + assert Alphabet_X.ndim > 0 and Alphabet_Y.ndim > 0 + assert Alphabet_X.size > 0 and Alphabet_Y.size > 0 + + H = np.empty(np.append(X.shape[:-1], Y.shape[:-1]).astype("int")) + if H.ndim > 0: + H[:] = np.NaN + else: + H = np.float64(np.NaN) + + X = np.reshape(X, (-1, X.shape[-1])) + Y = np.reshape(Y, (-1, Y.shape[-1])) + if Alphabet_X is not None or Alphabet_Y is not None: + Alphabet_X = np.reshape(Alphabet_X, (-1, Alphabet_X.shape[-1])) + Alphabet_Y = np.reshape(Alphabet_Y, (-1, Alphabet_Y.shape[-1])) + orig_shape_H = H.shape + H = np.reshape(H, (-1, 1)) + + n = 0 + for i in range(X.shape[0]): + for j in range(Y.shape[0]): + if Alphabet_X is not None or Alphabet_Y is not None: + H[n] = function(X[i], Y[j], Alphabet_X[i], Alphabet_Y[j]) + else: + H[n] = function(X[i], Y[j]) + n = n + 1 + + # Reverse re-shaping + H = np.reshape(H, orig_shape_H) + + return H + + +def _verify_alphabet_sufficiently_large(X, Alphabet, fill_value): + assert not np.any(X == np.array(None)) + assert not np.any(Alphabet == np.array(None)) + for i in range(X.shape[0]): + I = X[i] != fill_value + J = Alphabet[i] != fill_value + # NB: This causes issues when both arguments contain None. But it is + # always called after observations have all been mapped to integers. 
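The James-Stein branch above shrinks the maximum-likelihood frequencies toward the uniform distribution. A minimal standalone sketch of that shrinkage (using hypothetical example counts and ignoring the additional-empty-bin bookkeeping of the patch code) is:

import numpy as np

def james_stein_probabilities(counts):
    """Shrink ML frequency estimates toward the uniform distribution
    (illustrative sketch of the estimator used above)."""
    counts = np.asarray(counts, dtype=float)
    n = counts.sum()
    theta_ml = counts / n                 # maximum-likelihood estimate
    target = 1.0 / counts.size            # uniform target distribution
    # shrinkage intensity, clipped to [0, 1]
    lam = (1.0 - np.sum(theta_ml ** 2)) / ((n - 1.0) * np.sum((target - theta_ml) ** 2))
    lam = 1.0 if np.isnan(lam) else min(max(lam, 0.0), 1.0)
    return lam * target + (1.0 - lam) * theta_ml

# e.g. james_stein_probabilities([8, 2, 0, 0]) returns a smoothed distribution
# that still sums to one.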
+ if np.setdiff1d(X[i, I], Alphabet[i, J]).size > 0: + raise ValueError( + "provided alphabet does not contain all observed " "outcomes" + ) + + +def _vstack_pad(arrays, fill_value): + max_length = max([A.shape[-1] for A in arrays]) + arrays = [ + np.append( + A, + np.tile( + fill_value, + np.append(A.shape[:-1], max_length - A.shape[-1]).astype(int), + ), + ) + for A in arrays + ] + return np.vstack(arrays) diff --git a/bamt/core/nodes/child_nodes/__init__.py b/bamt/external/pyitlib/__init__.py similarity index 100% rename from bamt/core/nodes/child_nodes/__init__.py rename to bamt/external/pyitlib/__init__.py diff --git a/bamt/log.py b/bamt/log.py new file mode 100644 index 0000000..74d65fc --- /dev/null +++ b/bamt/log.py @@ -0,0 +1,67 @@ +import logging +import logging.config +import os + + +class BamtLogger: + def __init__(self): + self.file_handler = False + self.enabled = True + self.base = os.path.join(os.path.dirname(os.path.abspath(__file__))) + self.log_file_path = os.path.join(self.base, "logging.conf") + logging.config.fileConfig(self.log_file_path) + + self.loggers = dict( + logger_builder=logging.getLogger("builder"), + logger_network=logging.getLogger("network"), + logger_preprocessor=logging.getLogger("preprocessor"), + logger_nodes=logging.getLogger("nodes"), + logger_display=logging.getLogger("display"), + ) + + def has_handler(self, logger, handler_type): + """Check if a logger has a handler of a specific type.""" + return any(isinstance(handler, handler_type) for handler in logger.handlers) + + def remove_handler_type(self, logger, handler_type): + """Remove all handlers of a specific type from a logger.""" + for handler in logger.handlers[:]: + if isinstance(handler, handler_type): + logger.removeHandler(handler) + + def switch_console_out(self, value: bool): + """ + Turn off console output from logger. 
+ """ + assert isinstance(value, bool) + handler_class = logging.StreamHandler if not value else logging.NullHandler + + for logger in self.loggers.values(): + if self.has_handler(logger, handler_class): + self.remove_handler_type(logger, handler_class) + logger.addHandler(logging.NullHandler() if not value else logging.root.handlers[0]) + + def switch_file_out(self, value: bool, log_file: str): + """ + Send all messages in file + """ + assert isinstance(value, bool) + + for logger in self.loggers.values(): + if value: + file_handler = logging.FileHandler(log_file, mode="a") + file_handler.setFormatter(logging.root.handlers[0].formatter) + logger.addHandler(file_handler) + else: + self.remove_handler_type(logger, logging.FileHandler) + + +bamt_logger = BamtLogger() + +( + logger_builder, + logger_network, + logger_preprocessor, + logger_nodes, + logger_display, +) = list(bamt_logger.loggers.values()) diff --git a/bamt/logging.conf b/bamt/logging.conf new file mode 100644 index 0000000..87663ef --- /dev/null +++ b/bamt/logging.conf @@ -0,0 +1,51 @@ +[loggers] +keys=root, preprocessor, builder, nodes, network, display + +[handlers] +keys=consoleHandler + +[formatters] +keys=simpleFormatter + +[logger_root] +level=INFO +handlers=consoleHandler + +[logger_preprocessor] +level=INFO +qualname=preprocessor +handlers=consoleHandler +propagate=0 + +[logger_network] +level=INFO +qualname=network +handlers=consoleHandler +propagate=0 + +[logger_builder] +level=INFO +qualname=builder +handlers=consoleHandler +propagate=0 + +[logger_nodes] +level=INFO +qualname=nodes +handlers=consoleHandler +propagate=0 + +[logger_display] +level=INFO +qualname=display +handlers=consoleHandler +propagate=0 + +[handler_consoleHandler] +class=StreamHandler +level=INFO +formatter=simpleFormatter +args=(sys.stdout,) + +[formatter_simpleFormatter] +format=%(asctime)s | %(levelname)-8s | %(filename)s-%(funcName)s-%(lineno)04d | %(message)s diff --git a/bamt/mi_entropy_gauss.py b/bamt/mi_entropy_gauss.py new file mode 100644 index 0000000..48b1ef9 --- /dev/null +++ b/bamt/mi_entropy_gauss.py @@ -0,0 +1,310 @@ +import math +import sys +from copy import copy +from typing import List + +import numpy as np +import pandas as pd + +from bamt.external.pyBN.utils.independence_tests import mutual_information, entropy +from bamt.preprocess.discretization import get_nodes_type +from bamt.preprocess.graph import edges_to_dict +from bamt.preprocess.numpy_pandas import loc_to_DataFrame + + +def query_filter(data: pd.DataFrame, columns: List, values: List): + """ + Filters the data according to the column-value list + Arguments + ---------- + *data* : pandas.DataFrame + Returns + ------- + *data_trim* : pandas.DataFrame + Filtered data. + Effects + ------- + None + """ + data_copy = copy(data) + filter_str = "`" + str(columns[0]) + "`" + " == " + str(values[0]) + if len(columns) == 1: + return data_copy.query(filter_str) + else: + for i in range(1, len(columns)): + filter_str += " & " + "`" + str(columns[i]) + "`" + " == " + str(values[i]) + data_trim = data_copy.query(filter_str) + return data_trim + + +def entropy_gauss(pd_data): + """ + Calculate entropy for Gaussian multivariate distributions. + Arguments + ---------- + *data* : pd.DataFrame + Returns + ------- + *entropy* : a float + The entropy for Gaussian multivariate distributions. 
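A short usage sketch for the logger switches defined above; the log file name is an arbitrary example, and the import path follows the bamt/log.py module introduced in this patch:

from bamt.log import bamt_logger, logger_network

# silence console output for all bamt loggers and write messages to a file instead
bamt_logger.switch_console_out(False)
bamt_logger.switch_file_out(True, "bamt_run.log")

logger_network.info("this message goes to bamt_run.log only")

# stop writing to the file again
bamt_logger.switch_file_out(False, "bamt_run.log")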
+ Effects + ------- + None + """ + if not isinstance(pd_data, pd.Series): + data = copy(pd_data).values.T + else: + data = np.array(copy(pd_data)).T + if data.size == 0: + return 0.0 + flag_row = False + flag_col = False + + if isinstance(data[0], np.float64): + flag_row = True + elif (len(data[0]) < 2) | (data.ndim < 2): + flag_row = True + elif data.shape[0] < 2: + flag_row = True + if isinstance(copy(data).T[0], np.float64): + flag_col = True + elif (len(copy(data).T) < 2) | (copy(data).T.ndim < 2): + flag_col = True + elif data.shape[1] < 2: + flag_col = True + + if flag_row & flag_col: + return sys.float_info.max + elif flag_row | flag_col: + var = np.var(data) + if var > 1e-16: + return 0.5 * (1 + math.log(var * 2 * math.pi)) + else: + return sys.float_info.min + else: + var = np.linalg.det(np.cov(data)) + N = var.ndim + if var > 1e-16: + return 0.5 * (N * (1 + math.log(2 * math.pi)) + math.log(var)) + else: + return sys.float_info.min + + +def entropy_all(data, method="MI"): + """ + For one varibale, H(X) is equal to the following: + -1 * sum of p(x) * log(p(x)) + For two variables H(X|Y) is equal to the following: + sum over x,y of p(x,y)*log(p(y)/p(x,y)) + For three variables, H(X|Y,Z) is equal to the following: + -1 * sum of p(x,y,z) * log(p(x|y,z)), + where p(x|y,z) = p(x,y,z)/p(y)*p(z) + Arguments + ---------- + *data* : pd.DataFrame + Returns + ------- + *H* : entropy value""" + if isinstance(data, np.ndarray): + return entropy_all(loc_to_DataFrame(data), method=method) + elif isinstance(data, pd.Series): + return entropy_all(pd.DataFrame(data), method) + elif isinstance(data, pd.DataFrame): + nodes_type = get_nodes_type(data) + column_disc = [] + for key in nodes_type: + if nodes_type[key] == "disc": + column_disc.append(key) + column_cont = [] + for key in nodes_type: + if nodes_type[key] == "cont": + column_cont.append(key) + data_disc = data[column_disc] + data_cont = data[column_cont] + + if len(column_cont) == 0: + return entropy(data_disc.values) + elif len(column_disc) == 0: + return entropy_gauss(data_cont) + else: + H_disc = entropy(data_disc.values) + dict_comb = {} + comb_prob = {} + for i in range(len(data_disc)): + row = data_disc.iloc[i] + comb = "" + for _, val in row.items(): + comb = comb + str(val) + ", " + if comb not in dict_comb: + dict_comb[comb] = row + comb_prob[comb] = 1 + else: + comb_prob[comb] += 1 + + H_cond = 0.0 + for key in list(dict_comb.keys()): + filtered_data = query_filter(data, column_disc, list(dict_comb[key])) + filtered_data = filtered_data[column_cont] + if comb_prob[key] == 1: + if (method == "BIC") | (method == "AIC"): + H_cond += ( + comb_prob[key] + / len(data_disc) + * entropy_gauss(data[column_cont]) + ) + else: + H_cond += comb_prob[key] / len(data_disc) * sys.float_info.max + else: + H_cond += ( + comb_prob[key] / len(data_disc) * entropy_gauss(filtered_data) + ) + if (method == "BIC") | (method == "AIC"): + if H_cond > entropy_gauss(data[column_cont]): + H_cond = entropy_gauss(data[column_cont]) + + return H_disc + H_cond + + +def entropy_cond(data, column_cont, column_disc, method): + data_cont = data[column_cont] + data_disc = data[column_disc] + H_gauss = entropy_gauss(data_cont) + H_cond = 0.0 + + dict_comb = {} + comb_prob = {} + for i in range(len(data_disc)): + row = data_disc.iloc[i] + comb = "" + for _, val in row.items(): + comb = comb + str(val) + ", " + if comb not in dict_comb: + dict_comb[comb] = row + comb_prob[comb] = 1 + else: + comb_prob[comb] += 1 + + for key in list(dict_comb.keys()): + filtered_data = 
query_filter(data, column_disc, list(dict_comb[key])) + filtered_data = filtered_data[column_cont] + if comb_prob[key] == 1: + if (method == "BIC") | (method == "AIC"): + H_cond += comb_prob[key] / len(data_disc) * H_gauss + else: + H_cond += comb_prob[key] / len(data_disc) * sys.float_info.max + else: + H_cond += comb_prob[key] / len(data_disc) * entropy_gauss(filtered_data) + if (method == "BIC") | (method == "AIC"): + if H_cond > H_gauss: + return H_gauss + else: + return H_cond + return H_cond + + +def mi_gauss(data, method="MI", conditional=False): + """ + Calculate Mutual Information based on entropy. + In the case of continuous uses entropy for Gaussian multivariate distributions. + Arguments + ---------- + *data* : pandas.DataFrame + Returns + ------- + *MI* : a float + The Mutual Information + Effects + ------- + None + Notes + ----- + - Need to preprocess data with code_categories + """ + if isinstance(data, np.ndarray): + return mi_gauss(loc_to_DataFrame(data), method, conditional) + elif isinstance(data, pd.Series): + return mi_gauss(pd.DataFrame(data)) + elif isinstance(data, pd.DataFrame): + nodes_type = get_nodes_type(data) + if conditional: + # Hill-Climbing does not use conditional MI, but other algorithms may require it + # At the moment it counts on condition of the last row in the list + # of columns + print("Warning: conditional == True") + nodes_type_trim = copy(nodes_type) + data_trim = copy(data) + list_keys = list(nodes_type_trim.keys) + del nodes_type_trim[list_keys[-1]] + del data_trim[list_keys[-1]] + return mi_gauss(data, nodes_type, method) - mi_gauss( + data_trim, nodes_type, method + ) + else: + column_disc = [] + for key in nodes_type: + if nodes_type[key] == "disc": + column_disc.append(key) + column_cont = [] + for key in nodes_type: + if nodes_type[key] == "cont": + column_cont.append(key) + data_disc = data[column_disc] + data_cont = data[column_cont] + + H_gauss = 0.0 + H_cond = 0.0 + + if len(column_cont) == 0: + return mutual_information(data_disc.values, conditional=False) + elif len(column_disc) == 0: + if len(column_cont) == 1: + return entropy_gauss(data_cont) + else: + data_last = data_cont[[column_cont[-1]]] + column_cont_trim = copy(column_cont) + del column_cont_trim[-1] + data_cont_trim = data[column_cont_trim] + + H_gauss = ( + entropy_gauss(data_last) + + entropy_gauss(data_cont_trim) + - entropy_gauss(data_cont) + ) + H_gauss = min( + H_gauss, entropy_gauss(data_last), entropy_gauss(data_cont_trim) + ) + # H_gauss = entropy_gauss(data_cont) + H_cond = 0.0 + else: + H_gauss = entropy_gauss(data_cont) + H_cond = entropy_cond(data, column_cont, column_disc, method) + + return H_gauss - H_cond + + +def mi(edges: list, data: pd.DataFrame, method="MI"): + """ + Bypasses all nodes and summarizes scores, + taking into account the parent-child relationship. 
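entropy_gauss above relies on the closed form for the differential entropy of a (multivariate) normal distribution, H = 0.5 * ln((2*pi*e)^k * det(Sigma)). A minimal reference implementation of that formula, independent of the helper logic above, is:

import numpy as np

def gaussian_entropy(samples_2d):
    """Differential entropy 0.5 * ln((2*pi*e)^k * det(Sigma)) estimated
    from samples of shape (n_samples, k)."""
    cov = np.atleast_2d(np.cov(samples_2d, rowvar=False))
    k = cov.shape[0]
    _, logdet = np.linalg.slogdet(cov)
    return 0.5 * (k * (1.0 + np.log(2.0 * np.pi)) + logdet)

rng = np.random.default_rng(0)
x = rng.normal(size=(1000, 2))
print(gaussian_entropy(x))  # close to 1 + ln(2*pi) for an identity covariance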
+ Arguments + ---------- + *edges* : list + *data* : pd.DataFrame + Returns + ------- + *sum_score* : float + Effects + ------- + None + """ + parents_dict = edges_to_dict(edges) + sum_score = 0.0 + nodes_with_edges = parents_dict.keys() + for var in nodes_with_edges: + child_parents = [var] + child_parents.extend(parents_dict[var]) + sum_score += mi_gauss(copy(data[child_parents]), method) + nodes_without_edges = list(set(data.columns).difference(set(nodes_with_edges))) + for var in nodes_without_edges: + sum_score += mi_gauss(copy(data[var]), method) + return sum_score diff --git a/bamt/models/__init__.py b/bamt/models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/models/probabilistic_structural_models/__init__.py b/bamt/models/probabilistic_structural_models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/models/probabilistic_structural_models/bayesian_network.py b/bamt/models/probabilistic_structural_models/bayesian_network.py deleted file mode 100644 index e9cf91b..0000000 --- a/bamt/models/probabilistic_structural_models/bayesian_network.py +++ /dev/null @@ -1,20 +0,0 @@ -from abc import abstractmethod - -from .probabilistic_structural_model import ProbabilisticStructuralModel - - -class BayesianNetwork(ProbabilisticStructuralModel): - def __init__(self): - super().__init__() - - @abstractmethod - def fit(self): - pass - - @abstractmethod - def predict(self): - pass - - @abstractmethod - def sample(self): - pass diff --git a/bamt/models/probabilistic_structural_models/composite_bayesian_network.py b/bamt/models/probabilistic_structural_models/composite_bayesian_network.py deleted file mode 100644 index 486eaf7..0000000 --- a/bamt/models/probabilistic_structural_models/composite_bayesian_network.py +++ /dev/null @@ -1,18 +0,0 @@ -from .bayesian_network import BayesianNetwork - - -class CompositeBayesianNetwork(BayesianNetwork): - def __init__(self): - super().__init__() - - def fit(self): - pass - - def predict(self): - pass - - def sample(self): - pass - - def __str__(self): - return "Composite Bayesian Network" diff --git a/bamt/models/probabilistic_structural_models/continuous_bayesian_network.py b/bamt/models/probabilistic_structural_models/continuous_bayesian_network.py deleted file mode 100644 index 618c4dc..0000000 --- a/bamt/models/probabilistic_structural_models/continuous_bayesian_network.py +++ /dev/null @@ -1,18 +0,0 @@ -from .bayesian_network import BayesianNetwork - - -class ContinuousBayesianNetwork(BayesianNetwork): - def __init__(self): - super().__init__() - - def fit(self): - pass - - def predict(self): - pass - - def sample(self): - pass - - def __str__(self): - return "Continuous Bayesian Network" diff --git a/bamt/models/probabilistic_structural_models/discrete_bayesian_network.py b/bamt/models/probabilistic_structural_models/discrete_bayesian_network.py deleted file mode 100644 index b7654a6..0000000 --- a/bamt/models/probabilistic_structural_models/discrete_bayesian_network.py +++ /dev/null @@ -1,18 +0,0 @@ -from .bayesian_network import BayesianNetwork - - -class DiscreteBayesianNetwork(BayesianNetwork): - def __init__(self): - super().__init__() - - def fit(self): - pass - - def predict(self): - pass - - def sample(self): - pass - - def __str__(self): - return "Discrete Bayesian Network" diff --git a/bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py b/bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py deleted file mode 100644 index c4256df..0000000 --- 
a/bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py +++ /dev/null @@ -1,18 +0,0 @@ -from .bayesian_network import BayesianNetwork - - -class HybridBayesianNetwork(BayesianNetwork): - def __init__(self): - super().__init__() - - def fit(self): - pass - - def predict(self): - pass - - def sample(self): - pass - - def __str__(self): - return "Hybrid Bayesian Network" diff --git a/bamt/models/probabilistic_structural_models/probabilistic_structural_model.py b/bamt/models/probabilistic_structural_models/probabilistic_structural_model.py deleted file mode 100644 index b2b3b56..0000000 --- a/bamt/models/probabilistic_structural_models/probabilistic_structural_model.py +++ /dev/null @@ -1,14 +0,0 @@ -from abc import ABC, abstractmethod - - -class ProbabilisticStructuralModel(ABC): - def __init__(self): - pass - - @abstractmethod - def fit(self): - pass - - @abstractmethod - def predict(self): - pass diff --git a/bamt/networks/__init__.py b/bamt/networks/__init__.py new file mode 100644 index 0000000..7f8df8f --- /dev/null +++ b/bamt/networks/__init__.py @@ -0,0 +1,6 @@ +from bamt.networks.base import BaseNetwork +from bamt.networks.big_brave_bn import BigBraveBN +from bamt.networks.composite_bn import CompositeBN +from bamt.networks.continuous_bn import ContinuousBN +from bamt.networks.discrete_bn import DiscreteBN +from bamt.networks.hybrid_bn import HybridBN diff --git a/bamt/networks/base.py b/bamt/networks/base.py index e69de29..8090918 100644 --- a/bamt/networks/base.py +++ b/bamt/networks/base.py @@ -0,0 +1,1002 @@ +import json +import os.path as path +import random +import re +from copy import deepcopy +from typing import Dict, Tuple, List, Callable, Optional, Type, Union, Any, Sequence + +import numpy as np +import pandas as pd +from joblib import Parallel, delayed +from pgmpy.estimators import K2Score +from sklearn.preprocessing import LabelEncoder +from tqdm import tqdm + +import bamt.builders as builders +from bamt.builders.builders_base import ParamDict +from bamt.builders.evo_builder import EvoStructureBuilder +from bamt.builders.hc_builder import HCStructureBuilder +from bamt.display import plot_, get_info_ +from bamt.external.pyitlib.DiscreteRandomVariableUtils import ( + entropy, + information_mutual, + information_mutual_conditional, + entropy_conditional, +) +from bamt.log import logger_network +from bamt.nodes.base import BaseNode +from bamt.utils import GraphUtils, serialization_utils, check_utils + + +class BaseNetwork(object): + """ + Base class for Bayesian Network + """ + + def __init__(self): + """ + nodes: a list of nodes instances + edges: a list of edges + distributions: dict + """ + self.sf_name = None + self.type = "Abstract" + self._allowed_dtypes = ["Abstract"] + self.nodes = [] + self.edges = [] + self.weights = {} + self.descriptor = {"types": {}, "signs": {}} + self.distributions = {} + self.has_logit = False + self.use_mixture = False + self.encoders = {} + + @property + def nodes_names(self) -> List[str]: + return [node.name for node in self.nodes] + + def __getitem__(self, node_name: str) -> Type[BaseNode]: + if node_name in self.nodes_names: + index = self.nodes_names.index(node_name) + return self.nodes[index] + + def validate(self, descriptor: Dict[str, Dict[str, str]]) -> bool: + types = descriptor["types"] + return ( + True if all([a in self._allowed_dtypes for a in types.values()]) else False + ) + + def update_descriptor(self): + new_nodes_names = [node.name for node in self.nodes] + self.descriptor["types"] = { + node: type + for 
node, type in self.descriptor["types"].items() + if node in new_nodes_names + } + if "cont" in self.descriptor["types"].values(): + self.descriptor["signs"] = { + node: sign + for node, sign in self.descriptor["signs"].items() + if node in new_nodes_names + } + + def add_nodes(self, descriptor: Dict[str, Dict[str, str]]): + """ + Function for initializing nodes in Bayesian Network + descriptor: dict with types and signs of nodes + """ + if not self.validate(descriptor=descriptor): + if not self.type == "Hybrid" or not self.type == "Composite": + logger_network.error( + f"{self.type} BN does not support {'discrete' if self.type == 'Continuous' else 'continuous'} data" + ) + return + else: + logger_network.error( + f"Descriptor validation failed due to wrong type of column(s)." + ) + return + elif ["Abstract"] in self._allowed_dtypes: + return None + self.descriptor = descriptor + worker_1 = builders.builders_base.VerticesDefiner(descriptor, regressor=None) + + # first stage + worker_1.skeleton["V"] = worker_1.vertices + # second stage + worker_1.overwrite_vertex( + has_logit=False, + use_mixture=self.use_mixture, + classifier=None, + regressor=None, + ) + self.nodes = worker_1.vertices + + def add_edges( + self, + data: pd.DataFrame, + scoring_function: Union[Tuple[str, Callable], Tuple[str]] = ("K2", K2Score), + progress_bar: bool = True, + classifier: Optional[object] = None, + regressor: Optional[object] = None, + params: Optional[ParamDict] = None, + optimizer: str = "HC", + **kwargs, + ): + """ + Base function for Structure learning + scoring_function: tuple with the following format (NAME, scoring_function) or (NAME, ) + Params: + init_edges: list of tuples, a graph to start learning with + remove_init_edges: allows changes in a model defined by user + white_list: list of allowed edges + """ + if not self.has_logit and check_utils.is_model(classifier): + logger_network.error("Classifiers dict with use_logit=False is forbidden.") + return None + + # params validation + if params: + # init_edges validation + if not self.has_logit and "init_edges" in params.keys(): + type_map = np.array( + [ + [ + self.descriptor["types"][node1], + self.descriptor["types"][node2], + ] + for node1, node2 in params["init_edges"] + ] + ) + failed = (type_map[:, 0] == "cont") & ( + (type_map[:, 1] == "disc") | (type_map[:, 1] == "disc_num") + ) + if sum(failed): + logger_network.warning( + f"Edges between continuous nodes and disc nodes are forbidden (has_logit = {self.has_logit}), " + f"they will be ignored. 
Indexes: {np.where(failed)[0]}" + ) + params["init_edges"] = [ + params["init_edges"][i] + for i in range(len(params["init_edges"])) + if i not in np.where(failed)[0] + ] + + if not self.validate(descriptor=self.descriptor): + logger_network.error( + f"{self.type} BN does not support {'discrete' if self.type == 'Continuous' else 'continuous'} data" + ) + return None + if optimizer == "HC": + worker = HCStructureBuilder( + data=data, + descriptor=self.descriptor, + scoring_function=scoring_function, + has_logit=self.has_logit, + use_mixture=self.use_mixture, + regressor=regressor, + ) + elif optimizer == "Evo": + worker = EvoStructureBuilder( + data=data, + descriptor=self.descriptor, + has_logit=self.has_logit, + use_mixture=self.use_mixture, + regressor=regressor, + ) + else: + logger_network.error(f"Optimizer {optimizer} is not supported") + return None + + self.sf_name = scoring_function[0] + + worker.build( + data=data, + params=params, + classifier=classifier, + regressor=regressor, + progress_bar=progress_bar, + **kwargs, + ) + + # update family + self.nodes = worker.skeleton["V"] + self.edges = worker.skeleton["E"] + + def calculate_weights(self, discretized_data: pd.DataFrame): + """ + Provide calculation of link strength according mutual information between node and its parent(-s) values. + """ + import bamt.utils.GraphUtils as gru + + data_descriptor = gru.nodes_types(discretized_data) + if not all([i in ["disc", "disc_num"] for i in data_descriptor.values()]): + logger_network.error( + f"calculate_weghts() method deals only with discrete data. Continuous data: " + + f"{[col for col, type in data_descriptor.items() if type not in ['disc', 'disc_num']]}" + ) + if not self.edges: + logger_network.error( + "Bayesian Network hasn't fitted yet. Please add edges with add_edges() method" + ) + if not self.nodes: + logger_network.error( + "Bayesian Network hasn't fitted yet. Please add nodes with add_nodes() method" + ) + weights = dict() + + for node in self.nodes: + parents = node.cont_parents + node.disc_parents + if parents is None: + continue + y = discretized_data[node.name].values + if len(parents) == 1: + x = discretized_data[parents[0]].values + ls_true = information_mutual(X=y, Y=x) + entropy_value = entropy(X=y) + weight = ls_true / entropy_value + weights[(parents[0], node.name)] = weight + else: + for parent_node in parents: + x = discretized_data[parent_node].values + other_parents = [tmp for tmp in parents if tmp != parent_node] + z = list() + for other_parent in other_parents: + z.append(list(discretized_data[other_parent].values)) + ls_true = np.average( + information_mutual_conditional( + x=y, y=x, z=z, cartesian_product=True + ) + ) + entropy_value = ( + np.average( + entropy_conditional(X=y, Y=z, cartesian_product=True) + ) + + 1e-8 + ) + weight = ls_true / entropy_value + weights[(parent_node, node.name)] = weight + self.weights = weights + + def set_nodes(self, nodes: List, info: Optional[Dict] = None): + """ + additional function to set nodes manually. User should be aware that + nodes must be a subclass of BaseNode. + Params: + nodes: dict with name and node (if a lot of nodes should be added) + info: descriptor + """ + if not info and not self.descriptor["types"]: + logger_network.error( + "In case of manual setting nodes user should set map for them as well." 
+ ) + return + self.nodes = [] + for node in nodes: + if issubclass(type(node), BaseNode): + self.nodes.append(node) + else: + logger_network.error(f"{node} is not an instance of {BaseNode}") + + if info: + self.descriptor = info + + def set_edges(self, edges: Optional[List[Sequence[str]]] = None): + """ + additional function to set edges manually. User should be aware that + nodes must be a subclass of BaseNode. + param: edges dict with name and node (if a lot of nodes should be added) + """ + + if not self.nodes: + logger_network.error("Graph without nodes") + self.edges = [] + for node1, node2 in edges: + if isinstance(node1, str) and isinstance(node2, str): + if self[node1] and self[node2]: + if ( + not self.has_logit + and self.descriptor["types"][node1] == "cont" + and self.descriptor["types"][node2] == "disc" + ): + logger_network.warning( + f"Restricted edge detected (has_logit=False) : [{node1}, {node2}]" + ) + continue + else: + self.edges.append((node1, node2)) + else: + logger_network.error(f"Unknown nodes : [{node1}, {node2}]") + continue + else: + logger_network.error( + f"Unknown node(s) type: [{node1.__class__}, {node2.__class__}]" + ) + continue + self.update_descriptor() + + def set_structure( + self, + info: Optional[Dict] = None, + nodes: Optional[List] = None, + edges: Optional[List[Sequence[str]]] = None, + ): + """ + Function to set structure manually + info: Descriptor + nodes, edges: + """ + if nodes and (info or (self.descriptor["types"] and self.descriptor["signs"])): + if ( + any("mixture" in node.type.lower() for node in nodes) + and not self.use_mixture + ): + logger_network.error("Compatibility error: use mixture.") + return + if ( + any("logit" in node.type.lower() for node in nodes) + and not self.has_logit + ): + logger_network.error("Compatibility error: has logit.") + return + self.set_nodes(nodes=nodes, info=info) + if isinstance(edges, list): + if not self.nodes: + logger_network.error("Nodes/info detection failed.") + return + + builder = builders.builders_base.VerticesDefiner( + descriptor=self.descriptor, regressor=None + ) # init worker + builder.skeleton["V"] = builder.vertices # 1 stage + if len(edges) != 0: + # set edges and register members + self.set_edges(edges=edges) + builder.skeleton["E"] = self.edges + builder.get_family() + + builder.overwrite_vertex( + has_logit=self.has_logit, + use_mixture=self.use_mixture, + classifier=None, + regressor=None, + ) + + self.set_nodes(nodes=builder.skeleton["V"]) + + def _param_validation(self, params: Dict[str, Any]) -> bool: + if all(self[i] for i in params.keys()): + for name, info in params.items(): + try: + self[name].choose(node_info=info, pvals=[]) + except Exception as ex: + logger_network.error("Validation failed", exc_info=ex) + return False + return True + else: + logger_network.error("Param validation failed due to unknown nodes.") + return False + + def set_parameters(self, parameters: Dict): + if not self.nodes: + logger_network.error("Failed on search of BN's nodes.") + + self.distributions = parameters + + for node, data in self.distributions.items(): + if "hybcprob" in data.keys(): + if "mixture" in self[node].type.lower(): + continue + else: + if "gaussian" in self[node].type.lower(): + model_type = "regressor" + else: + model_type = "classifier" + + model = None + for v in data["hybcprob"].values(): + if v[model_type]: + model = v[model_type] + break + else: + continue + if not model: + logger_network.warning( + f"Classifier/regressor for {node} hadn't been used." 
+ ) + + self[node].type = re.sub( + r"\([\s\S]*\)", f"({model})", self[node].type + ) + else: + if data.get("serialization", False): + regressor = data.get("regressor", None) + classifier = data.get("classifier", None) + self[node].type = re.sub( + r"\([\s\S]*\)", f"({regressor or classifier})", self[node].type + ) + else: + self[node].type = re.sub( + r"\([\s\S]*\)", f"({None})", self[node].type + ) + + @staticmethod + def _save_to_file(outdir: str, data: Union[dict, list]): + """ + Function to save data to json file + :param outdir: output directory + :param data: dictionary to be saved + """ + if not outdir.endswith(".json"): + raise TypeError( + f"Unappropriated file format. Expected: .json. Got: {path.splitext(outdir)[-1]}" + ) + with open(outdir, "w+") as out: + json.dump(data, out) + return True + + def save_params(self, outdir: str): + """ + Function to save BN params to json file + outdir: output directory + """ + return self._save_to_file(outdir, self.distributions) + + def save_structure(self, outdir: str): + """ + Function to save BN edges to json file + outdir: output directory + """ + return self._save_to_file(outdir, self.edges) + + def save(self, bn_name, models_dir: str = "models_dir"): + """ + Function to save the whole BN to json file. + + :param bn_name: unique name of bn user want to save. It will be used as file name (e.g. bn_name.json). + :param models_dir: if picklization is broken, joblib will serialize models in compressed files + in models directory. + + :return: saving status. + """ + distributions = deepcopy(self.distributions) + new_weights = {str(key): self.weights[key] for key in self.weights} + + to_serialize = {} + # separate logit and gaussian nodes from distributions to serialize bn's models + for node_name in distributions.keys(): + if "Mixture" in self[node_name].type: + continue + if self[node_name].type.startswith("Gaussian"): + if not distributions[node_name]["regressor"]: + continue + if ( + "Gaussian" in self[node_name].type + or "Logit" in self[node_name].type + or "ConditionalLogit" in self[node_name].type + ): + to_serialize[node_name] = [ + self[node_name].type, + distributions[node_name], + ] + + serializer = serialization_utils.ModelsSerializer( + bn_name=bn_name, models_dir=models_dir + ) + serialized_dist = serializer.serialize(to_serialize) + + for serialized_node in serialized_dist.keys(): + distributions[serialized_node] = serialized_dist[serialized_node] + + outdict = { + "info": self.descriptor, + "edges": self.edges, + "parameters": distributions, + "weights": new_weights, + } + return self._save_to_file(f"{bn_name}.json", outdict) + + def load(self, + input_data: Union[str, Dict], + models_dir: str = "/"): + """ + Function to load the whole BN from json file. + :param input_data: input path to json file with bn. + :param models_dir: directory with models. + + :return: loading status. + """ + if isinstance(input_data, str): + with open(input_data) as f: + input_dict = json.load(f) + elif isinstance(input_data, dict): + input_dict = deepcopy(input_data) + else: + logger_network.error(f"Unknown input type: {type(input_data)}") + return + + self.add_nodes(input_dict["info"]) + self.set_structure(edges=input_dict["edges"]) + + # check compatibility with father network. + if not self.use_mixture: + for node_data in input_dict["parameters"].values(): + if "hybcprob" not in node_data.keys(): + continue + else: + # Since we don't have information about types of nodes, we + # should derive it from parameters. 
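A usage sketch of the persistence methods above, as a save/load round trip; "my_bn" and "models_dir" are example names, and `bn` is assumed to be an already fitted network:

from bamt.networks.hybrid_bn import HybridBN

bn.save(bn_name="my_bn", models_dir="models_dir")        # writes my_bn.json

restored = HybridBN(has_logit=True, use_mixture=False)   # flags should match the saved network
restored.load("my_bn.json", models_dir="models_dir")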
+ if any( + list(node_keys.keys()) == ["covars", "mean", "coef"] + for node_keys in node_data["hybcprob"].values() + ): + logger_network.error( + f"This crucial parameter is not the same as father's parameter: use_mixture." + ) + return + + # check if edges before and after are the same.They can be different in + # the case when user sets forbidden edges. + if not self.has_logit: + if not all( + edges_before == [edges_after[0], edges_after[1]] + for edges_before, edges_after in zip(input_dict["edges"], self.edges) + ): + logger_network.error( + f"This crucial parameter is not the same as father's parameter: has_logit." + ) + return + + deserializer = serialization_utils.Deserializer(models_dir) + + to_deserialize = {} + # separate logit and gaussian nodes from distributions to deserialize bn's models + for node_name in input_dict["parameters"].keys(): + if "Mixture" in self[node_name].type: + continue + if self[node_name].type.startswith("Gaussian"): + if not input_dict["parameters"][node_name]["regressor"]: + continue + + if ( + "Gaussian" in self[node_name].type + or "Logit" in self[node_name].type + or "ConditionalLogit" in self[node_name].type + ): + if input_dict["parameters"][node_name].get("serialization", False): + to_deserialize[node_name] = [ + self[node_name].type, + input_dict["parameters"][node_name], + ] + elif "hybcprob" in input_dict["parameters"][node_name].keys(): + to_deserialize[node_name] = [ + self[node_name].type, + input_dict["parameters"][node_name], + ] + else: + continue + + deserialized_parameters = deserializer.apply(to_deserialize) + distributions = input_dict["parameters"].copy() + + for serialized_node in deserialized_parameters.keys(): + distributions[serialized_node] = deserialized_parameters[serialized_node] + + self.set_parameters(parameters=distributions) + + if input_dict.get("weights", False): + str_keys = list(input_dict["weights"].keys()) + tuple_keys = [eval(key) for key in str_keys] + weights = {} + for tuple_key in tuple_keys: + weights[tuple_key] = input_dict["weights"][str(tuple_key)] + self.weights = weights + return True + + def fit_parameters(self, data: pd.DataFrame, n_jobs: int = 1): + """ + Base function for parameter learning + """ + if data.isnull().values.any(): + logger_network.error("Dataframe contains NaNs.") + return + + if type(self).__name__ == "CompositeBN": + data = self._encode_categorical_data(data) + + # Turn all discrete values to str for learning algorithm + if "disc_num" in self.descriptor["types"].values(): + columns_names = [ + name + for name, t in self.descriptor["types"].items() + if t in ["disc_num"] + ] + data[columns_names] = data.loc[:, columns_names].astype("str") + + def worker(node): + return node.fit_parameters(data) + + results = Parallel(n_jobs=n_jobs)(delayed(worker)(node) for node in self.nodes) + + # code for debugging, do not remove + # results = [worker(node) for node in self.nodes] + + for result, node in zip(results, self.nodes): + self.distributions[node.name] = result + + def get_info(self, as_df: bool = True) -> Optional[pd.DataFrame]: + """Return a table with name, type, parents_type, parents_names""" + return get_info_(self, as_df) + + def sample( + self, + n: int, + models_dir: Optional[str] = None, + progress_bar: bool = True, + evidence: Optional[Dict[str, Union[str, int, float]]] = None, + as_df: bool = True, + predict: bool = False, + parall_count: int = 1, + filter_neg: bool = True, + seed: Optional[int] = None, + ) -> Union[None, pd.DataFrame, List[Dict[str, Union[str, int, float]]]]: + 
""" + Sampling from Bayesian Network + n: int number of samples + evidence: values for nodes from user + parall_count: number of threads. Defaults to 1. + filter_neg: either filter negative vals or not. + seed: seed value to use for random number generator + """ + from joblib import Parallel, delayed + + random.seed(seed) + np.random.seed(seed) + + if not self.distributions.items(): + logger_network.error( + "Parameter learning wasn't done. Call fit_parameters method" + ) + return None + if evidence: + for node in self.nodes: + if (node.type == "Discrete") & (node.name in evidence.keys()): + if not (isinstance(evidence[node.name], str)): + evidence[node.name] = str(int(evidence[node.name])) + + def wrapper(): + output = {} + for node in self.nodes: + parents = node.cont_parents + node.disc_parents + if evidence and node.name in evidence.keys(): + output[node.name] = evidence[node.name] + else: + if not parents: + pvals = None + else: + if self.type == "Discrete": + pvals = [str(output[t]) for t in parents] + else: + pvals = [output[t] for t in parents] + + # If any nan from parents, sampling from node blocked. + if any(pd.isnull(pvalue) for pvalue in pvals): + output[node.name] = np.nan + continue + node_data = self.distributions[node.name] + if models_dir and ("hybcprob" in node_data.keys()): + for obj, obj_data in node_data["hybcprob"].items(): + if "serialization" in obj_data.keys(): + if "gaussian" in node.type.lower(): + model_type = "regressor" + else: + model_type = "classifier" + if ( + obj_data["serialization"] == "joblib" + and obj_data[f"{model_type}_obj"] + ): + new_path = ( + models_dir + + f"\\{node.name.replace(' ', '_')}\\{obj}.joblib.compressed" + ) + node_data["hybcprob"][obj][ + f"{model_type}_obj" + ] = new_path + if predict: + output[node.name] = node.predict(node_data, pvals=pvals) + else: + output[node.name] = node.choose(node_data, pvals=pvals) + return output + + if predict: + seq = [] + for _ in tqdm(range(n), position=0, leave=True): + result = wrapper() + seq.append(result) + elif progress_bar: + seq = Parallel(n_jobs=parall_count)( + delayed(wrapper)() for _ in tqdm(range(n), position=0, leave=True) + ) + else: + seq = Parallel(n_jobs=parall_count)(delayed(wrapper)() for _ in range(n)) + + # code for debugging, don't remove + # seq = [] + # for _ in tqdm(range(n), position=0, leave=True): + # result = wrapper() + # seq.append(result) + + seq_df = pd.DataFrame.from_dict(seq, orient="columns") + seq_df.dropna(inplace=True) + cont_nodes = [ + c.name + for c in self.nodes + if type(c).__name__ + not in ( + "DiscreteNode", + "LogitNode", + "CompositeDiscreteNode", + "ConditionalLogitNode", + ) + ] + positive_columns = [ + c for c in cont_nodes if self.descriptor["signs"][c] == "pos" + ] + if filter_neg: + seq_df = seq_df[(seq_df[positive_columns] >= 0).all(axis=1)] + seq_df.reset_index(inplace=True, drop=True) + + seq = seq_df.to_dict("records") + sample_output = pd.DataFrame.from_dict(seq, orient="columns") + + if as_df: + if type(self).__name__ == "CompositeBN": + sample_output = self._decode_categorical_data(sample_output) + return sample_output + else: + return seq + + def predict( + self, + test: pd.DataFrame, + parall_count: int = 1, + progress_bar: bool = True, + models_dir: Optional[str] = None, + ) -> Dict[str, Union[List[str], List[int], List[float]]]: + """ + Function to predict columns from given data. + Note that train data and test data must have different columns. + Both train and test datasets must be cleaned from NaNs. 
+ + Args: + test (pd.DataFrame): test dataset + parall_count (int, optional):number of threads. Defaults to 1. + progress_bar: verbose mode. + + Returns: + predicted data (dict): dict with column as key and predicted data as value + """ + if test.isnull().any().any(): + logger_network.error("Test data contains NaN values.") + return {} + + from joblib import Parallel, delayed + + def wrapper(bn, test: pd.DataFrame, columns: List[str], models_dir: str): + preds = {column_name: list() for column_name in columns} + + if len(test) == 1: + for i in range(test.shape[0]): + test_row = dict(test.iloc[i, :]) + for n, key in enumerate(columns): + try: + sample = bn.sample( + 1, + evidence=test_row, + predict=True, + progress_bar=False, + models_dir=models_dir, + ) + if sample.empty: + preds[key].append(np.nan) + continue + if bn.descriptor["types"][key] == "cont": + if (bn.descriptor["signs"][key] == "pos") & ( + sample.loc[0, key] < 0 + ): + # preds[key].append(np.nan) + preds[key].append(0) + else: + preds[key].append(sample.loc[0, key]) + else: + preds[key].append(sample.loc[0, key]) + except Exception as ex: + logger_network.error(ex) + preds[key].append(np.nan) + return preds + else: + logger_network.error("Wrapper for one row from pandas.DataFrame") + return {} + + columns = list(set(self.nodes_names) - set(test.columns.to_list())) + if not columns: + logger_network.error("Test data is the same as train.") + return {} + + preds = {column_name: list() for column_name in columns} + + if progress_bar: + processed_list = Parallel(n_jobs=parall_count)( + delayed(wrapper)(self, test.loc[[i]], columns, models_dir) + for i in tqdm(test.index, position=0, leave=True) + ) + else: + processed_list = Parallel(n_jobs=parall_count)( + delayed(wrapper)(self, test.loc[[i]], columns, models_dir) + for i in test.index + ) + + for i in range(test.shape[0]): + curr_pred = processed_list[i] + for n, key in enumerate(columns): + preds[key].append(curr_pred[key][0]) + + return preds + + def set_classifiers(self, classifiers: Dict[str, object]): + """ + Set classifiers for logit nodes. + classifiers: dict with node_name and Classifier + """ + if not self.has_logit: + logger_network.error("Logit nodes are forbidden.") + return None + + for node in self.nodes: + if "Logit" in node.type: + if node.name in classifiers.keys(): + node.classifier = classifiers[node.name] + node.type = re.sub( + r"\([\s\S]*\)", f"({type(node.classifier).__name__})", node.type + ) + else: + continue + + def set_regressor(self, regressors: Dict[str, object]): + """ + Set classifiers for gaussian nodes. + classifiers: dict with node_name and Classifier + """ + + for node in self.nodes: + if "Gaussian" in node.type: + if node.name in regressors.keys(): + node.regressor = regressors[node.name] + node.type = re.sub( + r"\([\s\S]*\)", f"({type(node.regressor).__name__})", node.type + ) + else: + continue + + def plot(self, output: str): + """ + Visualize a Bayesian Network. Result will be saved + in the parent directory in folder visualization_result. + output: str name of output file + """ + return plot_(output, self.nodes, self.edges) + + def markov_blanket(self, node_name, plot_to: Optional[str] = None): + """ + A method to get markov blanket of a node. + :param node_name: name of node + :param plot_to: directory to plot graph, the file must have html extension. 
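A sketch of combining the per-node model hooks and prediction defined above; the sklearn models, node names, and train_df/test_df DataFrames are illustrative placeholders:

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge

# `bn` is assumed to be a HybridBN(has_logit=True) with a learned structure
bn.set_classifiers({"discrete_target": RandomForestClassifier(n_estimators=100)})
bn.set_regressor({"continuous_target": Ridge()})
bn.fit_parameters(train_df)

# test_df must omit the columns being predicted and contain no NaNs
predictions = bn.predict(test_df, parall_count=2, progress_bar=False)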
+
+        :return structure: json with {"nodes": [...], "edges": [...]}
+        """
+        structure = GraphUtils.GraphAnalyzer(self).markov_blanket(node_name)
+        if plot_to:
+            plot_(
+                plot_to, [self[name] for name in structure["nodes"]], structure["edges"]
+            )
+        return structure
+
+    def find_family(
+        self,
+        node_name: str,
+        height: int = 1,
+        depth: int = 1,
+        with_nodes: Optional[List] = None,
+        plot_to: Optional[str] = None,
+    ):
+        """
+        A method to get the family of a node (its ancestors up to the given height
+        and its descendants down to the given depth).
+        :param node_name: name of node
+        :param height: a number of layers up to node (its parents) that will be taken
+        :param depth: a number of layers down to node (its children) that will be taken
+        :param with_nodes: include nodes in return
+        :param plot_to: directory to plot graph, the file must have html extension.
+
+        :return structure: json with {"nodes": [...], "edges": [...]}
+        """
+        structure = GraphUtils.GraphAnalyzer(self).find_family(
+            node_name, height, depth, with_nodes
+        )
+
+        if plot_to:
+            plot_(
+                plot_to, [self[name] for name in structure["nodes"]], structure["edges"]
+            )
+        return structure
+
+    def fill_gaps(self, df: pd.DataFrame, **kwargs):
+        """
+        Fill NaNs with sampled values.
+
+        :param df: dataframe with NaNs
+        :param kwargs: the same params as bn.predict
+
+        :return df, failed: filled DataFrame and list of failed rows (sometimes predict can return np.nan)
+        """
+        if not self.distributions:
+            logger_network.error("To call this method you must train parameters.")
+
+        # create a mimic row to get a dataframe from iloc method
+        mimic_row = [np.nan for _ in range(df.shape[1])]
+        df.loc["mimic"] = mimic_row
+
+        def fill_row(df, i):
+            row = df.iloc[[i, -1], :].drop(["mimic"], axis=0)
+
+            evidences = row.dropna(axis=1)
+
+            return row.index[0], self.predict(evidences, progress_bar=False, **kwargs)
+
+        failed = []
+        for index in range(df.shape[0] - 1):
+            if df.iloc[index].isna().any():
+                true_pos, result = fill_row(df, index)
+                if any(pd.isna(v[0]) for v in result.values()):
+                    failed.append(true_pos)
+                    continue
+                else:
+                    for column, value in result.items():
+                        df.loc[true_pos, column] = value[0]
+            else:
+                continue
+        df.drop(failed, inplace=True)
+        return df.drop(["mimic"]), failed
+
+    def get_dist(self, node_name: str, pvals: Optional[dict] = None):
+        """
+        Get a distribution from node with known parent values (conditional distribution).
+
+        :param node_name: name of node
+        :param pvals: parent values
+        """
+        if not self.distributions:
+            logger_network.error("Empty parameters. Call fit_params first.")
+            return
+        node = self[node_name]
+
+        parents = node.cont_parents + node.disc_parents
+        if not parents:
+            return self.distributions[node_name]
+
+        pvals = [pvals[parent] for parent in parents]
+
+        return node.get_dist(node_info=self.distributions[node_name], pvals=pvals)
+
+    def _encode_categorical_data(self, data):
+        for column in data.select_dtypes(include=["object", "string"]).columns:
+            encoder = LabelEncoder()
+            data[column] = encoder.fit_transform(data[column])
+            self.encoders[column] = encoder
+        return data
+
+    def _decode_categorical_data(self, data):
+        data = data.apply(
+            lambda col: pd.to_numeric(col).astype(int) if col.dtype == "object" else col
+        )
+        for column, encoder in self.encoders.items():
+            data[column] = encoder.inverse_transform(data[column])
+        return data
diff --git a/bamt/networks/big_brave_bn.py b/bamt/networks/big_brave_bn.py
new file mode 100644
index 0000000..e4adc9a
--- /dev/null
+++ b/bamt/networks/big_brave_bn.py
@@ -0,0 +1,105 @@
+import math
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics import mutual_info_score
+from sklearn.preprocessing import OrdinalEncoder
+
+
+class BigBraveBN:
+    def __init__(self):
+        self.possible_edges = []
+
+    def set_possible_edges_by_brave(
+        self,
+        df: pd.DataFrame,
+        n_nearest: int = 5,
+        threshold: float = 0.3,
+        proximity_metric: str = "MI",
+    ) -> list:
+        """Return the list of possible edges for structure learning and store it in the possible_edges attribute.
+
+        Args:
+            df (pd.DataFrame): Data.
+            n_nearest (int): Number of nearest neighbors to consider. Default is 5.
+            threshold (float): Threshold for selecting edges. Default is 0.3.
+            proximity_metric (str): Metric used to calculate proximity. Default is "MI".
+
+        Returns:
+            list: The selected candidate edges (also stored in the possible_edges attribute).
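A sketch of using this preselector to restrict structure search; discretized_df is a placeholder DataFrame, and white_list is the allowed-edges parameter documented for add_edges() above:

from bamt.networks.big_brave_bn import BigBraveBN

brave = BigBraveBN()
possible_edges = brave.set_possible_edges_by_brave(
    discretized_df, n_nearest=5, threshold=0.3, proximity_metric="MI"
)

# feed the preselection into structure learning as a white list
bn.add_edges(discretized_df, params={"white_list": possible_edges})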
+ """ + df_copy = df.copy(deep=True) + proximity_matrix = self._get_proximity_matrix(df_copy, proximity_metric) + brave_matrix = self._get_brave_matrix(df_copy.columns, proximity_matrix, n_nearest) + + threshold_value = brave_matrix.max(numeric_only=True).max() * threshold + filtered_brave_matrix = brave_matrix[brave_matrix > threshold_value].stack() + self.possible_edges = filtered_brave_matrix.index.tolist() + return self.possible_edges + + @staticmethod + def _get_n_nearest( + data: pd.DataFrame, columns: list, corr: bool = False, number_close: int = 5 + ) -> list: + """Returns N nearest neighbors for every column of dataframe.""" + groups = [] + for c in columns: + close_ind = data[c].sort_values(ascending=not corr).index.tolist() + groups.append(close_ind[: number_close + 1]) + return groups + + @staticmethod + def _get_proximity_matrix(df: pd.DataFrame, proximity_metric: str) -> pd.DataFrame: + """Returns matrix of proximity for the dataframe.""" + encoder = OrdinalEncoder() + df_coded = df.copy() + columns_to_encode = list(df_coded.select_dtypes(include=["category", "object"])) + df_coded[columns_to_encode] = encoder.fit_transform(df_coded[columns_to_encode]) + + if proximity_metric == "MI": + df_distance = pd.DataFrame( + np.zeros((len(df.columns), len(df.columns))), + columns=df.columns, + index=df.columns, + ) + for c1 in df.columns: + for c2 in df.columns: + dist = mutual_info_score(df_coded[c1].values, df_coded[c2].values) + df_distance.loc[c1, c2] = dist + return df_distance + + elif proximity_metric == "pearson": + return df_coded.corr(method="pearson") + + def _get_brave_matrix( + self, df_columns: pd.Index, proximity_matrix: pd.DataFrame, n_nearest: int = 5 + ) -> pd.DataFrame: + """Returns matrix of Brave coefficients for the DataFrame.""" + brave_matrix = pd.DataFrame( + np.zeros((len(df_columns), len(df_columns))), + columns=df_columns, + index=df_columns, + ) + groups = self._get_n_nearest( + proximity_matrix, df_columns.tolist(), corr=True, number_close=n_nearest + ) + + for c1 in df_columns: + for c2 in df_columns: + a = b = c = d = 0.0 + if c1 != c2: + for g in groups: + a += (c1 in g) & (c2 in g) + b += (c1 in g) & (c2 not in g) + c += (c1 not in g) & (c2 in g) + d += (c1 not in g) & (c2 not in g) + + divisor = (math.sqrt((a + c) * (b + d))) * ( + math.sqrt((a + b) * (c + d)) + ) + br = (a * len(groups) + (a + c) * (a + b)) / ( + divisor if divisor != 0 else 0.0000000001 + ) + brave_matrix.loc[c1, c2] = br + + return brave_matrix diff --git a/bamt/networks/composite_bn.py b/bamt/networks/composite_bn.py new file mode 100644 index 0000000..e3739aa --- /dev/null +++ b/bamt/networks/composite_bn.py @@ -0,0 +1,114 @@ +import re +from typing import Optional, Dict + +import pandas as pd + +from bamt.builders.composite_builder import CompositeStructureBuilder, CompositeDefiner +from bamt.log import logger_network +from bamt.networks.base import BaseNetwork +from bamt.utils.composite_utils.MLUtils import MlModels + + +class CompositeBN(BaseNetwork): + """ + Composite Bayesian Network with Machine Learning Models support + """ + + def __init__(self): + super(CompositeBN, self).__init__() + self._allowed_dtypes = ["cont", "disc", "disc_num"] + self.type = "Composite" + self.parent_models = {} + + def add_nodes(self, descriptor: Dict[str, Dict[str, str]]): + """ + Function for initializing nodes in Bayesian Network + descriptor: dict with types and signs of nodes + """ + self.descriptor = descriptor + + worker_1 = CompositeDefiner(descriptor=descriptor, regressor=None) + 
self.nodes = worker_1.vertices + + def add_edges( + self, + data: pd.DataFrame, + progress_bar: bool = True, + classifier: Optional[object] = None, + regressor: Optional[object] = None, + **kwargs, + ): + worker = CompositeStructureBuilder( + data=data, descriptor=self.descriptor, regressor=regressor + ) + + worker.build( + data=data, + classifier=classifier, + regressor=regressor, + progress_bar=progress_bar, + **kwargs, + ) + + # update family + self.nodes = worker.skeleton["V"] + self.edges = worker.skeleton["E"] + self.parent_models = worker.parent_models_dict + self.set_models(self.parent_models) + + def set_models(self, parent_models): + ml_models = MlModels() + ml_models_dict = ml_models.dict_models + for node in self.nodes: + if ( + type(node).__name__ == "CompositeDiscreteNode" + and parent_models[node.name] is not None + and len(node.cont_parents + node.disc_parents) > 0 + ): + self.set_classifiers( + {node.name: ml_models_dict[parent_models[node.name]]()} + ) + logger_network.info( + msg=f"{ml_models_dict[parent_models[node.name]]} classifier has been set for {node.name}" + ) + elif ( + type(node).__name__ == "CompositeContinuousNode" + and parent_models[node.name] is not None + and len(node.cont_parents + node.disc_parents) > 0 + ): + self.set_regressor( + {node.name: ml_models_dict[parent_models[node.name]]()} + ) + logger_network.info( + msg=f"{ml_models_dict[parent_models[node.name]]} regressor has been set for {node.name}" + ) + else: + pass + + def set_classifiers(self, classifiers: Dict[str, object]): + """ + Set classifiers for logit nodes. + classifiers: dict with node_name and Classifier + """ + for node in self.nodes: + if node.name in classifiers.keys(): + node.classifier = classifiers[node.name] + node.type = re.sub( + r"\([\s\S]*\)", f"({type(node.classifier).__name__})", node.type + ) + else: + continue + + def set_regressor(self, regressors: Dict[str, object]): + """ + Set regressor for gaussian nodes. 
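A usage sketch for the composite network above; the dataset, column names, and descriptor are illustrative placeholders:

import pandas as pd

from bamt.networks.composite_bn import CompositeBN

df = pd.read_csv("data.csv")                            # placeholder dataset
descriptor = {"types": {"age": "cont", "sex": "disc"},  # illustrative columns
              "signs": {"age": "pos"}}

cbn = CompositeBN()
cbn.add_nodes(descriptor)
cbn.add_edges(df, progress_bar=False)   # set_models() assigns per-node ML models here
cbn.fit_parameters(df)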
+ classifiers: dict with node_name and regressors + """ + for node in self.nodes: + if node.name in regressors.keys(): + node.regressor = regressors[node.name] + node.type = re.sub( + r"\([\s\S]*\)", f"({type(node.regressor).__name__})", node.type + ) + else: + continue diff --git a/bamt/networks/continuous_bn.py b/bamt/networks/continuous_bn.py new file mode 100644 index 0000000..2c2f327 --- /dev/null +++ b/bamt/networks/continuous_bn.py @@ -0,0 +1,15 @@ +from .base import BaseNetwork + + +class ContinuousBN(BaseNetwork): + """ + Bayesian Network with Continuous Types of Nodes + """ + + def __init__(self, use_mixture: bool = False): + super(ContinuousBN, self).__init__() + self.type = "Continuous" + self._allowed_dtypes = ["cont"] + self.has_logit = None + self.use_mixture = use_mixture + self.scoring_function = "" diff --git a/bamt/networks/discrete_bn.py b/bamt/networks/discrete_bn.py new file mode 100644 index 0000000..90a0d07 --- /dev/null +++ b/bamt/networks/discrete_bn.py @@ -0,0 +1,15 @@ +from .base import BaseNetwork + + +class DiscreteBN(BaseNetwork): + """ + Bayesian Network with Discrete Types of Nodes + """ + + def __init__(self): + super(DiscreteBN, self).__init__() + self.type = "Discrete" + self.scoring_function = "" + self._allowed_dtypes = ["disc", "disc_num"] + self.has_logit = None + self.use_mixture = None diff --git a/bamt/networks/hybrid_bn.py b/bamt/networks/hybrid_bn.py new file mode 100644 index 0000000..2942da8 --- /dev/null +++ b/bamt/networks/hybrid_bn.py @@ -0,0 +1,27 @@ +from typing import Dict + +from .base import BaseNetwork + + +class HybridBN(BaseNetwork): + """ + Bayesian Network with Mixed Types of Nodes + """ + + def __init__(self, has_logit: bool = False, use_mixture: bool = False): + super(HybridBN, self).__init__() + self._allowed_dtypes = ["cont", "disc", "disc_num"] + self.type = "Hybrid" + self.has_logit = has_logit + self.use_mixture = use_mixture + + def validate(self, descriptor: Dict[str, Dict[str, str]]) -> bool: + types = descriptor["types"] + s = set(types.values()) + return ( + True + if ({"cont", "disc", "disc_num"} == s) + or ({"cont", "disc"} == s) + or ({"cont", "disc_num"} == s) + else False + ) diff --git a/bamt/nodes/__init__.py b/bamt/nodes/__init__.py new file mode 100644 index 0000000..f896bc3 --- /dev/null +++ b/bamt/nodes/__init__.py @@ -0,0 +1,10 @@ +__all__ = [ + "base", + "conditional_gaussian_node", + "conditional_logit_node", + "conditional_mixture_gaussian_node", + "discrete_node", + "logit_node", + "gaussian_node", + "mixture_gaussian_node", +] diff --git a/bamt/nodes/base.py b/bamt/nodes/base.py new file mode 100644 index 0000000..f42b0a3 --- /dev/null +++ b/bamt/nodes/base.py @@ -0,0 +1,59 @@ +import pickle +from typing import Union + + +class BaseNode(object): + """ + Base class for nodes. 
+ """ + + def __init__(self, name: str): + """ + :param name: name for node (taken from column name) + type: node type + disc_parents: list with discrete parents + cont_parents: list with continuous parents + children: node's children + """ + self.name = name + self.type = "abstract" + + self.disc_parents = [] + self.cont_parents = [] + self.children = [] + + def __repr__(self): + return f"{self.name}" + + def __eq__(self, other): + if not isinstance(other, BaseNode): + # don't attempt to compare against unrelated types + return NotImplemented + + return ( + self.name == other.name + and self.type == other.type + and self.disc_parents == other.disc_parents + and self.cont_parents == other.cont_parents + and self.children == other.children + ) + + @staticmethod + def choose_serialization(model) -> Union[str, Exception]: + try: + ex_b = pickle.dumps(model, protocol=4) + model_ser = ex_b.decode("latin1").replace("'", '"') + + if type(model).__name__ == "CatBoostRegressor": + a = model_ser.encode("latin1") + else: + a = model_ser.replace('"', "'").encode("latin1") + + classifier_body = pickle.loads(a) + return "pickle" + except Exception as ex: + return ex + + @staticmethod + def get_dist(node_info, pvals): + pass diff --git a/bamt/nodes/composite_continuous_node.py b/bamt/nodes/composite_continuous_node.py new file mode 100644 index 0000000..b446402 --- /dev/null +++ b/bamt/nodes/composite_continuous_node.py @@ -0,0 +1,17 @@ +from typing import Optional, Union + +from sklearn import linear_model + +from .gaussian_node import GaussianNode +from .schema import GaussianParams, HybcprobParams + +NodeInfo = Union[GaussianParams, HybcprobParams] + + +class CompositeContinuousNode(GaussianNode): + def __init__(self, name, regressor: Optional[object] = None): + super(CompositeContinuousNode, self).__init__(name) + if regressor is None: + regressor = linear_model.LinearRegression() + self.regressor = regressor + self.type = "CompositeContinuous" + f" ({type(self.regressor).__name__})" diff --git a/bamt/nodes/composite_discrete_node.py b/bamt/nodes/composite_discrete_node.py new file mode 100644 index 0000000..6d82db5 --- /dev/null +++ b/bamt/nodes/composite_discrete_node.py @@ -0,0 +1,20 @@ +from typing import Optional + +from sklearn import linear_model + +from .logit_node import LogitNode + + +class CompositeDiscreteNode(LogitNode): + """ + Class for composite discrete node. 
+ """ + + def __init__(self, name, classifier: Optional[object] = None): + super(CompositeDiscreteNode, self).__init__(name) + if classifier is None: + classifier = linear_model.LogisticRegression( + multi_class="multinomial", solver="newton-cg", max_iter=100 + ) + self.classifier = classifier + self.type = "CompositeDiscrete" + f" ({type(self.classifier).__name__})" diff --git a/bamt/nodes/conditional_gaussian_node.py b/bamt/nodes/conditional_gaussian_node.py new file mode 100644 index 0000000..58801b6 --- /dev/null +++ b/bamt/nodes/conditional_gaussian_node.py @@ -0,0 +1,173 @@ +import itertools +import math +import random +from typing import Dict, Optional, List, Union + +import numpy as np +from pandas import DataFrame +from sklearn import linear_model +from sklearn.base import clone +from sklearn.metrics import mean_squared_error as mse + +from .base import BaseNode +from .schema import CondGaussParams + + +class ConditionalGaussianNode(BaseNode): + """ + Main class for Conditional Gaussian Node + """ + + def __init__(self, name, regressor: Optional[object] = None): + super(ConditionalGaussianNode, self).__init__(name) + if regressor is None: + regressor = linear_model.LinearRegression() + self.regressor = regressor + self.type = "ConditionalGaussian" + f" ({type(self.regressor).__name__})" + + def fit_parameters(self, data: DataFrame) -> Dict[str, Dict[str, CondGaussParams]]: + """ + Train params for Conditional Gaussian Node. + Return: + {"hybcprob": { : CondGaussParams}} + """ + hycprob = dict() + values = [] + combinations = [] + for d_p in self.disc_parents: + values.append(np.unique(data[d_p].values)) + for xs in itertools.product(*values): + combinations.append(list(xs)) + for comb in combinations: + mask = np.full(len(data), True) + for col, val in zip(self.disc_parents, comb): + mask = (mask) & (data[col] == val) + new_data = data[mask] + key_comb = [str(x) for x in comb] + if new_data.shape[0] > 0: + if self.cont_parents: + model = clone(self.regressor) + model.fit( + new_data[self.cont_parents].values, new_data[self.name].values + ) + predicted_value = model.predict(new_data[self.cont_parents].values) + variance = mse( + new_data[self.name].values, predicted_value, squared=False + ) + hycprob[str(key_comb)] = { + "variance": variance, + "mean": np.nan, + "regressor_obj": model, + "regressor": type(self.regressor).__name__, + "serialization": None, + } + else: + mean_base = np.mean(new_data[self.name].values) + variance = np.var(new_data[self.name].values) + hycprob[str(key_comb)] = { + "variance": variance, + "mean": mean_base, + "regressor_obj": None, + "regressor": None, + "serialization": None, + } + else: + hycprob[str(key_comb)] = { + "variance": np.nan, + "regressor": None, + "regressor_obj": None, + "serialization": None, + "mean": np.nan, + } + return {"hybcprob": hycprob} + + def get_dist(self, node_info, pvals): + dispvals = [] + lgpvals = [] + for pval in pvals: + if isinstance(pval, str): + dispvals.append(pval) + else: + lgpvals.append(pval) + + lgdistribution = node_info["hybcprob"][str(dispvals)] + + # JOBLIB + + if self.cont_parents: + flag = False + for el in lgpvals: + if str(el) == "nan": + flag = True + break + if flag: + return np.nan, np.nan + else: + if lgdistribution["regressor"]: + model = lgdistribution["regressor_obj"] + + cond_mean = model.predict(np.array(lgpvals).reshape(1, -1))[0] + variance = lgdistribution["variance"] + return cond_mean, variance + else: + return np.nan, np.nan + + else: + return lgdistribution["mean"], 
math.sqrt(lgdistribution["variance"]) + + def choose( + self, + node_info: Dict[str, Dict[str, CondGaussParams]], + pvals: List[Union[str, float]], + ) -> float: + """ + Return value from ConditionalLogit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + cond_mean, variance = self.get_dist(node_info, pvals) + if np.isnan(cond_mean) or np.isnan(variance): + return np.nan + + return random.gauss(cond_mean, variance) + + def predict( + self, + node_info: Dict[str, Dict[str, CondGaussParams]], + pvals: List[Union[str, float]], + ) -> float: + """ + Return value from ConditionalLogit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + dispvals = [] + lgpvals = [] + for pval in pvals: + if isinstance(pval, str): + dispvals.append(pval) + else: + lgpvals.append(pval) + + lgdistribution = node_info["hybcprob"][str(dispvals)] + + if self.cont_parents: + flag = False + for el in lgpvals: + if str(el) == "nan": + flag = True + break + if flag: + return np.nan + else: + if lgdistribution["regressor"]: + model = lgdistribution["regressor_obj"] + return model.predict(np.array(lgpvals).reshape(1, -1))[0] + else: + return np.nan + else: + return lgdistribution["mean"] diff --git a/bamt/nodes/conditional_logit_node.py b/bamt/nodes/conditional_logit_node.py new file mode 100644 index 0000000..08010c0 --- /dev/null +++ b/bamt/nodes/conditional_logit_node.py @@ -0,0 +1,172 @@ +import itertools +import random +from typing import Optional, List, Union, Dict + +import numpy as np +from pandas import DataFrame +from sklearn import linear_model +from sklearn.base import clone + +from .base import BaseNode +from .schema import LogitParams + + +class ConditionalLogitNode(BaseNode): + """ + Main class for Conditional Logit Node + """ + + def __init__(self, name: str, classifier: Optional[object] = None): + super(ConditionalLogitNode, self).__init__(name) + if classifier is None: + classifier = linear_model.LogisticRegression( + multi_class="multinomial", solver="newton-cg", max_iter=100 + ) + self.classifier = classifier + self.type = "ConditionalLogit" + f" ({type(self.classifier).__name__})" + + def fit_parameters(self, data: DataFrame) -> Dict[str, Dict[str, LogitParams]]: + """ + Train params on data + Return: + {"hybcprob": { : LogitParams}} + """ + hycprob = dict() + values = [] + combinations = [] + for d_p in self.disc_parents: + values.append(np.unique(data[d_p].values)) + for xs in itertools.product(*values): + combinations.append(list(xs)) + for comb in combinations: + mask = np.full(len(data), True) + for col, val in zip(self.disc_parents, comb): + mask = (mask) & (data[col] == val) + new_data = data[mask] + # mean_base = [np.nan] + classes = [np.nan] + key_comb = [str(x) for x in comb] + if new_data.shape[0] != 0: + model = clone(self.classifier) + values = set(new_data[self.name]) + if len(values) > 1: + model.fit( + X=new_data[self.cont_parents].values, + y=new_data[self.name].values, + ) + classes = list(model.classes_) + hycprob[str(key_comb)] = { + "classes": classes, + "classifier_obj": model, + "classifier": type(self.classifier).__name__, + "serialization": None, + } + else: + classes = list(values) + hycprob[str(key_comb)] = { + "classes": classes, + "classifier": type(self.classifier).__name__, + "classifier_obj": None, + "serialization": None, + } + + else: + hycprob[str(key_comb)] = { + "classes": list(classes), + "classifier": type(self.classifier).__name__, + "classifier_obj": None, + "serialization": None, + } 
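+        # Keys of hycprob are str(list-of-strings) built from each combination of
+        # discrete parent values; combinations with a single observed class (or no
+        # matching rows) store classifier_obj=None and fall back to that class at
+        # inference time.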
+ return {"hybcprob": hycprob} + + @staticmethod + def get_dist(node_info, pvals, **kwargs): + dispvals = [] + lgpvals = [] + for pval in pvals: + if isinstance(pval, str): + dispvals.append(pval) + else: + lgpvals.append(pval) + + if any(parent_value == "nan" for parent_value in dispvals): + return np.nan + + lgdistribution = node_info["hybcprob"][str(dispvals)] + + # JOBLIB + if len(lgdistribution["classes"]) > 1: + model = lgdistribution["classifier_obj"] + distribution = model.predict_proba(np.array(lgpvals).reshape(1, -1))[0] + + if not kwargs.get("inner", False): + return distribution + else: + return distribution, lgdistribution + else: + if not kwargs.get("inner", False): + return np.array([1.0]) + else: + return np.array([1.0]), lgdistribution + + def choose( + self, + node_info: Dict[str, Dict[str, LogitParams]], + pvals: List[Union[str, float]], + ) -> str: + """ + Return value from ConditionalLogit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + distribution, lgdistribution = self.get_dist(node_info, pvals, inner=True) + + # JOBLIB + if len(lgdistribution["classes"]) > 1: + rand = random.random() + rindex = 0 + lbound = 0 + ubound = 0 + for interval in range(len(lgdistribution["classes"])): + ubound += distribution[interval] + if lbound <= rand < ubound: + rindex = interval + break + else: + lbound = ubound + return str(lgdistribution["classes"][rindex]) + else: + return str(lgdistribution["classes"][0]) + + @staticmethod + def predict( + node_info: Dict[str, Dict[str, LogitParams]], pvals: List[Union[str, float]] + ) -> str: + """ + Return value from ConditionalLogit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + dispvals = [] + lgpvals = [] + for pval in pvals: + if isinstance(pval, str): + dispvals.append(pval) + else: + lgpvals.append(pval) + + lgdistribution = node_info["hybcprob"][str(dispvals)] + + # JOBLIB + if len(lgdistribution["classes"]) > 1: + model = lgdistribution["classifier_obj"] + pred = model.predict(np.array(lgpvals).reshape(1, -1))[0] + + return str(pred) + + else: + return str(lgdistribution["classes"][0]) diff --git a/bamt/nodes/conditional_mixture_gaussian_node.py b/bamt/nodes/conditional_mixture_gaussian_node.py new file mode 100644 index 0000000..bdd3263 --- /dev/null +++ b/bamt/nodes/conditional_mixture_gaussian_node.py @@ -0,0 +1,227 @@ +import itertools +from typing import Union, List, Optional, Dict + +import numpy as np +from gmr import GMM +from pandas import DataFrame + +from bamt.utils.MathUtils import component +from .base import BaseNode +from .schema import CondMixtureGaussParams + + +class ConditionalMixtureGaussianNode(BaseNode): + """ + Main class for Conditional Mixture Gaussian Node + """ + + def __init__(self, name): + super(ConditionalMixtureGaussianNode, self).__init__(name) + self.type = "ConditionalMixtureGaussian" + + def fit_parameters( + self, data: DataFrame + ) -> Dict[str, Dict[str, CondMixtureGaussParams]]: + """ + Train params for Conditional Mixture Gaussian Node. 
+ Return: + {"hybcprob": { : CondMixtureGaussParams}} + """ + hycprob = dict() + values = [] + combinations = [] + for d_p in self.disc_parents: + values.append(np.unique(data[d_p].values)) + for xs in itertools.product(*values): + combinations.append(list(xs)) + for comb in combinations: + mask = np.full(len(data), True) + for col, val in zip(self.disc_parents, comb): + mask = (mask) & (data[col] == val) + new_data = data[mask] + new_data.reset_index(inplace=True, drop=True) + key_comb = [str(x) for x in comb] + nodes = [self.name] + self.cont_parents + if new_data.shape[0] > 5: + if self.cont_parents: + # component(new_data, nodes, + # 'LRTS')#int((component(new_data, nodes, 'aic') + + # component(new_data, nodes, 'bic')) / 2) + n_comp = int( + ( + component(new_data, nodes, "aic") + + component(new_data, nodes, "bic") + ) + / 2 + ) + # n_comp = 3 + gmm = GMM(n_components=n_comp).from_samples( + new_data[nodes].values, n_iter=500, init_params="kmeans++" + ) + else: + # component(new_data, [node], + # 'LRTS')#int((component(new_data, [node], 'aic') + + # component(new_data, [node], 'bic')) / 2) + n_comp = int( + ( + component(new_data, [self.name], "aic") + + component(new_data, [self.name], "bic") + ) + / 2 + ) + # n_comp = 3 + gmm = GMM(n_components=n_comp).from_samples( + np.transpose([new_data[self.name].values]), + n_iter=500, + init_params="kmeans++", + ) + means = gmm.means.tolist() + cov = gmm.covariances.tolist() + # weigts = np.transpose(gmm.to_responsibilities(np.transpose([new_data[node].values]))) + w = gmm.priors.tolist() # [] + # for row in weigts: + # w.append(np.mean(row)) + hycprob[str(key_comb)] = {"covars": cov, "mean": means, "coef": w} + elif new_data.shape[0] != 0: + n_comp = 1 + gmm = GMM(n_components=n_comp) + if self.cont_parents: + gmm.from_samples(new_data[nodes].values) + else: + gmm.from_samples(np.transpose([new_data[self.name].values])) + means = gmm.means.tolist() + cov = gmm.covariances.tolist() + # weigts = np.transpose(gmm.to_responsibilities(np.transpose([new_data[node].values]))) + w = gmm.priors.tolist() # [] + # for row in weigts: + # w.append(np.mean(row)) + hycprob[str(key_comb)] = {"covars": cov, "mean": means, "coef": w} + else: + if self.cont_parents: + hycprob[str(key_comb)] = { + "covars": np.nan, + "mean": np.nan, + "coef": [], + } + else: + hycprob[str(key_comb)] = { + "covars": np.nan, + "mean": np.nan, + "coef": [], + } + return {"hybcprob": hycprob} + + @staticmethod + def get_dist(node_info, pvals): + lgpvals = [] + dispvals = [] + + for pval in pvals: + if (isinstance(pval, str)) | (isinstance(pval, int)): + dispvals.append(pval) + else: + lgpvals.append(pval) + + lgdistribution = node_info["hybcprob"][str(dispvals)] + mean = lgdistribution["mean"] + covariance = lgdistribution["covars"] + w = lgdistribution["coef"] + + if len(w) != 0: + if len(lgpvals) != 0: + indexes = [i for i in range(1, (len(lgpvals) + 1), 1)] + if not np.isnan(np.array(lgpvals)).all(): + n_comp = len(w) + gmm = GMM( + n_components=n_comp, + priors=w, + means=mean, + covariances=covariance, + ) + cond_gmm = gmm.condition(indexes, [lgpvals]) + return cond_gmm.means, cond_gmm.covariances, cond_gmm.priors + else: + return np.nan, np.nan, np.nan + else: + n_comp = len(w) + gmm = GMM( + n_components=n_comp, priors=w, means=mean, covariances=covariance + ) + return gmm.means, gmm.covariances, gmm.priors + else: + return np.nan, np.nan, np.nan + + def choose( + self, + node_info: Dict[str, Dict[str, CondMixtureGaussParams]], + pvals: List[Union[str, float]], + ) -> 
Optional[float]: + """ + Function to get value from ConditionalMixtureGaussian node + params: + node_info: nodes info from distributions + pvals: parent values + """ + mean, covariance, w = self.get_dist(node_info, pvals) + + # check if w is nan or list of weights + if not isinstance(w, np.ndarray): + return np.nan + + n_comp = len(w) + + gmm = GMM( + n_components=n_comp, + priors=w, + means=mean, + covariances=covariance, + ) + sample = gmm.sample(1)[0][0] + return sample + + @staticmethod + def predict( + node_info: Dict[str, Dict[str, CondMixtureGaussParams]], + pvals: List[Union[str, float]], + ) -> Optional[float]: + """ + Function to get prediction from ConditionalMixtureGaussian node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + dispvals = [] + lgpvals = [] + for pval in pvals: + if (isinstance(pval, str)) | (isinstance(pval, int)): + dispvals.append(pval) + else: + lgpvals.append(pval) + lgdistribution = node_info["hybcprob"][str(dispvals)] + mean = lgdistribution["mean"] + covariance = lgdistribution["covars"] + w = lgdistribution["coef"] + if len(w) != 0: + if len(lgpvals) != 0: + indexes = [i for i in range(1, (len(lgpvals) + 1), 1)] + if not np.isnan(np.array(lgpvals)).all(): + n_comp = len(w) + gmm = GMM( + n_components=n_comp, + priors=w, + means=mean, + covariances=covariance, + ) + sample = gmm.predict(indexes, [lgpvals])[0][0] + else: + sample = np.nan + else: + # n_comp = len(w) + # gmm = GMM(n_components=n_comp, priors=w, means=mean, covariances=covariance) + sample = 0 + for ind, wi in enumerate(w): + sample += wi * mean[ind][0] + else: + sample = np.nan + return sample diff --git a/bamt/nodes/discrete_node.py b/bamt/nodes/discrete_node.py new file mode 100644 index 0000000..705f1ee --- /dev/null +++ b/bamt/nodes/discrete_node.py @@ -0,0 +1,113 @@ +import random +from itertools import product +from typing import Type, Dict, Union, List + +import numpy as np +from pandas import DataFrame, crosstab + +from .base import BaseNode +from .schema import DiscreteParams + + +class DiscreteNode(BaseNode): + """ + Main class of Discrete Node + """ + + def __init__(self, name): + super(DiscreteNode, self).__init__(name) + self.type = "Discrete" + + def fit_parameters(self, data: DataFrame, num_workers: int = 1): + """ + Train params for Discrete Node + data: DataFrame to train on + num_workers: number of Parallel Workers + Method returns probas dict with the following format {[: value]} + and vals, list of appeared values in combinations + """ + + def worker(node: Type[BaseNode]) -> DiscreteParams: + parents = node.disc_parents + node.cont_parents + dist = data[node.name].value_counts(normalize=True).sort_index() + vals = [str(i) for i in dist.index.to_list()] + + if not parents: + cprob = dist.to_list() + else: + cprob = { + str([str(i) for i in comb]): [1 / len(vals) for _ in vals] + for comb in product(*[data[p].unique() for p in parents]) + } + + conditional_dist = crosstab( + data[node.name].to_list(), + [data[p] for p in parents], + normalize="columns", + ).T + tight_form = conditional_dist.to_dict("tight") + + for comb, probs in zip(tight_form["index"], tight_form["data"]): + if len(parents) > 1: + cprob[str([str(i) for i in comb])] = probs + else: + cprob[f"['{comb}']"] = probs + return {"cprob": cprob, "vals": vals} + + # pool = ThreadPoolExecutor(num_workers) + # future = pool.submit(worker, self) + result = worker(self) + return result + + @staticmethod + def get_dist(node_info, pvals): + if not pvals: + return node_info["cprob"] + 
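+        # With parents present, cprob is keyed by str([str(v), ...]) of the
+        # parent values, matching the keys built in fit_parameters above.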
else: + # noinspection PyTypeChecker + return node_info["cprob"][str(pvals)] + + def choose(self, node_info: Dict[str, Union[float, str]], pvals: List[str]) -> str: + """ + Return value from discrete node + params: + node_info: nodes info from distributions + pvals: parent values + """ + vals = node_info["vals"] + dist = np.array(self.get_dist(node_info, pvals)) + + cumulative_dist = np.cumsum(dist) + + rand = np.random.random() + rindex = np.searchsorted(cumulative_dist, rand) + + return vals[rindex] + + @staticmethod + def predict(node_info: Dict[str, Union[float, str]], pvals: List[str]) -> str: + """function for prediction based on evidence values in discrete node + + Args: + node_info (Dict[str, Union[float, str]]): parameters of node + pvals (List[str]): values in parents nodes + + Returns: + str: prediction + """ + + vals = node_info["vals"] + disct = [] + if not pvals: + dist = node_info["cprob"] + else: + # noinspection PyTypeChecker + dist = node_info["cprob"][str(pvals)] + max_value = max(dist) + indices = [index for index, value in enumerate(dist) if value == max_value] + max_ind = 0 + if len(indices) == 1: + max_ind = indices[0] + else: + max_ind = random.choice(indices) + return vals[max_ind] diff --git a/bamt/nodes/gaussian_node.py b/bamt/nodes/gaussian_node.py new file mode 100644 index 0000000..15144a8 --- /dev/null +++ b/bamt/nodes/gaussian_node.py @@ -0,0 +1,96 @@ +import math +import random +from typing import Optional, List + +import numpy as np +from pandas import DataFrame +from sklearn import linear_model +from sklearn.metrics import mean_squared_error as mse + +from .base import BaseNode +from .schema import GaussianParams + + +class GaussianNode(BaseNode): + """ + Main class for Gaussian Node + """ + + def __init__(self, name, regressor: Optional[object] = None): + super(GaussianNode, self).__init__(name) + if regressor is None: + regressor = linear_model.LinearRegression() + self.regressor = regressor + self.type = "Gaussian" + f" ({type(self.regressor).__name__})" + + def fit_parameters(self, data: DataFrame, **kwargs) -> GaussianParams: + parents = self.cont_parents + if type(self).__name__ == "CompositeContinuousNode": + parents = parents + self.disc_parents + if parents: + self.regressor.fit(data[parents].values, data[self.name].values, **kwargs) + predicted_value = self.regressor.predict(data[parents].values) + variance = mse(data[self.name].values, predicted_value, squared=False) + return { + "mean": np.nan, + "regressor_obj": self.regressor, + "regressor": type(self.regressor).__name__, + "variance": variance, + "serialization": None, + } + else: + mean_base = np.mean(data[self.name].values) + variance = np.var(data[self.name].values) + return { + "mean": mean_base, + "regressor_obj": None, + "regressor": None, + "variance": variance, + "serialization": None, + } + + def get_dist(self, node_info, pvals): + var = node_info["variance"] + if pvals: + for el in pvals: + if str(el) == "nan": + return np.nan + model = node_info["regressor_obj"] + + if type(self).__name__ == "CompositeContinuousNode": + pvals = [int(item) if isinstance(item, str) else item for item in pvals] + + cond_mean = model.predict(np.array(pvals).reshape(1, -1))[0] + return cond_mean, var + else: + return node_info["mean"], math.sqrt(var) + + def choose(self, node_info: GaussianParams, pvals: List[float]) -> float: + """ + Return value from Logit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + cond_mean, var = self.get_dist(node_info, pvals) + 
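+        # Despite the key name, the second value returned by get_dist is a
+        # standard deviation: fit_parameters stores the RMSE
+        # (mse(..., squared=False)) under "variance" when there are parents,
+        # and get_dist returns sqrt(variance) otherwise, which is what
+        # random.gauss expects as sigma.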
return random.gauss(cond_mean, var) + + @staticmethod + def predict(node_info: GaussianParams, pvals: List[float]) -> float: + """ + Return prediction from Logit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + if pvals: + for el in pvals: + if str(el) == "nan": + return np.nan + model = node_info["regressor_obj"] + pred = model.predict(np.array(pvals).reshape(1, -1))[0] + return pred + else: + return node_info["mean"] diff --git a/bamt/nodes/logit_node.py b/bamt/nodes/logit_node.py new file mode 100644 index 0000000..9b16004 --- /dev/null +++ b/bamt/nodes/logit_node.py @@ -0,0 +1,91 @@ +import random +from typing import Optional, List, Union + +import numpy as np +from pandas import DataFrame +from sklearn import linear_model + +from .base import BaseNode +from .schema import LogitParams + + +class LogitNode(BaseNode): + """ + Main class for logit node + """ + + def __init__(self, name, classifier: Optional[object] = None): + super(LogitNode, self).__init__(name) + if classifier is None: + classifier = linear_model.LogisticRegression( + multi_class="multinomial", solver="newton-cg", max_iter=100 + ) + self.classifier = classifier + self.type = "Logit" + f" ({type(self.classifier).__name__})" + + def fit_parameters(self, data: DataFrame, **kwargs) -> LogitParams: + parents = self.cont_parents + self.disc_parents + self.classifier.fit(X=data[parents].values, y=data[self.name].values, **kwargs) + + return { + "classes": list(self.classifier.classes_), + "classifier_obj": self.classifier, + "classifier": type(self.classifier).__name__, + "serialization": None, + } + + def get_dist(self, node_info, pvals): + if len(node_info["classes"]) > 1: + model = node_info["classifier_obj"] + if type(self).__name__ == "CompositeDiscreteNode": + pvals = [int(item) if isinstance(item, str) else item for item in pvals] + + return model.predict_proba(np.array(pvals).reshape(1, -1))[0] + else: + return np.array([1.0]) + + def choose(self, node_info: LogitParams, pvals: List[Union[float]]) -> str: + """ + Return value from Logit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + rindex = 0 + + distribution = self.get_dist(node_info, pvals) + + if len(node_info["classes"]) > 1: + rand = random.random() + lbound = 0 + ubound = 0 + for interval in range(len(node_info["classes"])): + ubound += distribution[interval] + if lbound <= rand < ubound: + rindex = interval + break + else: + lbound = ubound + + return str(node_info["classes"][rindex]) + else: + return str(node_info["classes"][0]) + + @staticmethod + def predict(node_info: LogitParams, pvals: List[Union[float]]) -> str: + """ + Return prediction from Logit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + if len(node_info["classes"]) > 1: + model = node_info["classifier_obj"] + pred = model.predict(np.array(pvals).reshape(1, -1))[0] + + return str(pred) + + else: + return str(node_info["classes"][0]) diff --git a/bamt/nodes/mixture_gaussian_node.py b/bamt/nodes/mixture_gaussian_node.py new file mode 100644 index 0000000..22cd2b3 --- /dev/null +++ b/bamt/nodes/mixture_gaussian_node.py @@ -0,0 +1,154 @@ +from typing import Union, List, Optional + +import numpy as np +from gmr import GMM +from pandas import DataFrame + +from bamt.utils.MathUtils import component +from .base import BaseNode +from .schema import MixtureGaussianParams + + +class MixtureGaussianNode(BaseNode): + """ + Main class for Mixture Gaussian Node + """ + + def 
__init__(self, name): + super(MixtureGaussianNode, self).__init__(name) + self.type = "MixtureGaussian" + + def fit_parameters(self, data: DataFrame) -> MixtureGaussianParams: + """ + Train params for Mixture Gaussian Node + """ + parents = self.disc_parents + self.cont_parents + if not parents: + n_comp = int( + ( + component(data, [self.name], "aic") + + component(data, [self.name], "bic") + ) + / 2 + ) # component(data, [node], 'LRTS')# + # n_comp = 3 + gmm = GMM(n_components=n_comp).from_samples( + np.transpose([data[self.name].values]), + n_iter=500, + init_params="kmeans++", + ) + means = gmm.means.tolist() + cov = gmm.covariances.tolist() + # weigts = np.transpose(gmm.to_responsibilities(np.transpose([data[node].values]))) + w = gmm.priors.tolist() # [] + # for row in weigts: + # w.append(np.mean(row)) + return {"mean": means, "coef": w, "covars": cov} + if parents: + if not self.disc_parents and self.cont_parents: + nodes = [self.name] + self.cont_parents + new_data = data[nodes] + new_data.reset_index(inplace=True, drop=True) + n_comp = int( + ( + component(new_data, nodes, "aic") + + component(new_data, nodes, "bic") + ) + / 2 + ) # component(new_data, nodes, 'LRTS')# + # n_comp = 3 + gmm = GMM(n_components=n_comp).from_samples( + new_data[nodes].values, n_iter=500, init_params="kmeans++" + ) + means = gmm.means.tolist() + cov = gmm.covariances.tolist() + # weigts = np.transpose(gmm.to_responsibilities(new_data[nodes].values)) + w = gmm.priors.tolist() # [] + # for row in weigts: + # w.append(np.mean(row)) + return {"mean": means, "coef": w, "covars": cov} + + @staticmethod + def get_dist(node_info, pvals): + mean = node_info["mean"] + covariance = node_info["covars"] + w = node_info["coef"] + n_comp = len(node_info["coef"]) + if n_comp != 0: + if pvals: + indexes = [i for i in range(1, len(pvals) + 1)] + if not np.isnan(np.array(pvals)).all(): + gmm = GMM( + n_components=n_comp, + priors=w, + means=mean, + covariances=covariance, + ) + cond_gmm = gmm.condition(indexes, [pvals]) + return cond_gmm.means, cond_gmm.covariances, cond_gmm.priors + else: + return np.nan, np.nan, np.nan + else: + gmm = GMM( + n_components=n_comp, priors=w, means=mean, covariances=covariance + ) + return gmm.means, gmm.covariances, gmm.priors + else: + return np.nan, np.nan, np.nan + + def choose( + self, node_info: MixtureGaussianParams, pvals: List[Union[str, float]] + ) -> Optional[float]: + """ + Func to get value from current node + node_info: nodes info from distributions + pvals: parent values + Return value from MixtureGaussian node + """ + mean, covariance, w = self.get_dist(node_info, pvals) + + n_comp = len(w) + + gmm = GMM( + n_components=n_comp, + priors=w, + means=mean, + covariances=covariance, + ) + return gmm.sample(1)[0][0] + + @staticmethod + def predict( + node_info: MixtureGaussianParams, pvals: List[Union[str, float]] + ) -> Optional[float]: + """ + Func to get prediction from current node + node_info: nodes info from distributions + pvals: parent values + Return value from MixtureGaussian node + """ + mean = node_info["mean"] + covariance = node_info["covars"] + w = node_info["coef"] + n_comp = len(node_info["coef"]) + if n_comp != 0: + if pvals: + indexes = [i for i in range(1, len(pvals) + 1)] + if not np.isnan(np.array(pvals)).all(): + gmm = GMM( + n_components=n_comp, + priors=w, + means=mean, + covariances=covariance, + ) + sample = gmm.predict(indexes, [pvals])[0][0] + else: + sample = np.nan + else: + # gmm = GMM(n_components=n_comp, priors=w, means=mean, 
covariances=covariance) + sample = 0 + for ind, wi in enumerate(w): + sample += wi * mean[ind][0] + else: + sample = np.nan + return sample diff --git a/bamt/nodes/schema.py b/bamt/nodes/schema.py new file mode 100644 index 0000000..2db2642 --- /dev/null +++ b/bamt/nodes/schema.py @@ -0,0 +1,47 @@ +from typing import Dict, List, Any, Union, TypedDict, Optional + +from numpy import ndarray + + +class DiscreteParams(TypedDict): + cprob: Union[List[Union[list, Any]], Dict[str, list]] + vals: List[str] + + +class MixtureGaussianParams(TypedDict): + mean: List[float] + coef: List[float] + covars: List[float] + + +class GaussianParams(TypedDict): + regressor: str + regressor_obj: Optional[object] + variance: Union[ndarray, float] + mean: Union[ndarray, float] + serialization: Optional[str] + + +class CondGaussParams(TypedDict): + regressor: str + regressor_obj: Optional[object] + variance: Union[ndarray, float] + mean: Union[ndarray, float] + serialization: Optional[str] + + +class CondMixtureGaussParams(TypedDict): + mean: Optional[List[float]] + coef: List[float] + covars: Optional[List[float]] + + +class LogitParams(TypedDict): + classes: List[int] + classifier: str + classifier_obj: Optional[object] + serialization: Optional[str] + + +class HybcprobParams(TypedDict): + hybcprob: Dict[str, CondGaussParams] diff --git a/bamt/parameter_estimators/__init__.py b/bamt/parameter_estimators/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/parameter_estimators/maximum_likelihood_estimator.py b/bamt/parameter_estimators/maximum_likelihood_estimator.py deleted file mode 100644 index 63783b5..0000000 --- a/bamt/parameter_estimators/maximum_likelihood_estimator.py +++ /dev/null @@ -1,9 +0,0 @@ -from .parameters_estimator import ParametersEstimator - - -class MaximumLikelihoodEstimator(ParametersEstimator): - def __init__(self): - pass - - def estimate(self): - pass diff --git a/bamt/parameter_estimators/parameters_estimator.py b/bamt/parameter_estimators/parameters_estimator.py deleted file mode 100644 index b93734c..0000000 --- a/bamt/parameter_estimators/parameters_estimator.py +++ /dev/null @@ -1,10 +0,0 @@ -from abc import ABC, abstractmethod - - -class ParametersEstimator(ABC): - def __init__(self): - pass - - @abstractmethod - def estimate(self): - pass diff --git a/bamt/core/nodes/root_nodes/__init__.py b/bamt/preprocess/__init__.py similarity index 100% rename from bamt/core/nodes/root_nodes/__init__.py rename to bamt/preprocess/__init__.py diff --git a/bamt/preprocess/discretization.py b/bamt/preprocess/discretization.py new file mode 100644 index 0000000..9a36f86 --- /dev/null +++ b/bamt/preprocess/discretization.py @@ -0,0 +1,165 @@ +from copy import copy +from typing import Tuple + +import pandas as pd +from sklearn import preprocessing +from sklearn.preprocessing import KBinsDiscretizer + + +def get_nodes_sign(data: pd.DataFrame) -> dict: + """Function to define sign of the node + neg - if node has negative values + pos - if node has only positive values + + Args: + data (pd.DataFrame): input dataset + + Returns: + dict: output dictionary where 'key' - node name and 'value' - sign of data + """ + nodes_types = get_nodes_type(data) + columns_sign = dict() + for c in data.columns.to_list(): + if nodes_types[c] == "cont": + if (data[c] < 0).any(): + columns_sign[c] = "neg" + else: + columns_sign[c] = "pos" + return columns_sign + + +def get_nodes_type(data: pd.DataFrame) -> dict: + """Function to define the type of the node + disc - discrete node + cont - continuous 
+ Args: + data (pd.DataFrame): input dataset + + Returns: + dict: output dictionary where 'key' - node name and 'value' - node type + """ + column_type = dict() + for c in data.columns.to_list(): + if (data[c].dtypes == "float64") | (data[c].dtypes == "float32"): + column_type[c] = "cont" + if ( + (data[c].dtypes == "str") + | (data[c].dtypes == "O") + | (data[c].dtypes == "b") + ): + column_type[c] = "disc" + if (data[c].dtypes == "int64") | (data[c].dtypes == "int32"): + column_type[c] = "disc" + return column_type + + +def discretization( + data: pd.DataFrame, method: str, columns: list, bins: int = 5 +) -> Tuple[pd.DataFrame, KBinsDiscretizer]: + """Discretization of continuous parameters + + Args: + data (pd.DataFrame): input dataset + method (str): discretization approach (equal_intervals, equal_frequency, kmeans) + columns (list): name of columns for discretization + bins (int, optional): number of bins. Defaults to 5. + + Returns: + pd.DataFrame: output dataset with discretized parameters + KBinsDiscretizer: fitted exemplar of discretization class + """ + data = data.dropna() + data.reset_index(inplace=True, drop=True) + d_data = copy(data) + est = KBinsDiscretizer(n_bins=bins, encode="ordinal") + strategy_dict = { + "equal_intervals": "uniform", + "equal_frequency": "quantile", + "kmeans": "kmeans", + } + if method in strategy_dict: + est.strategy = strategy_dict[method] + data_discrete = est.fit_transform(d_data.loc[:, columns].values) + d_data[columns] = data_discrete.astype("int") + else: + raise Exception("This discretization method is not supported") + + return d_data, est + + +def label_encoding(data, columns): + d_data = copy(data) + encoder_dict = dict() + for column in columns: + le = preprocessing.LabelEncoder() + d_data[column] = le.fit_transform(d_data[column].values) + mapping = dict(zip(le.classes_, range(len(le.classes_)))) + encoder_dict[column] = mapping + return d_data, encoder_dict + + +def onehot_encoding(data, columns): + d_data = pd.get_dummies(data, columns=columns) + return d_data, None + + +def code_categories( + data: pd.DataFrame, method: str, columns: list +) -> Tuple[pd.DataFrame, dict]: + """Encoding categorical parameters + + Args: + data (pd.DataFrame): input dataset + method (str): method of encoding (label or onehot) + columns (list): name of categorical columns + + Returns: + pd.DataFrame: output dataset with encoded parameters + dict: dictionary with values and codes + """ + data = data.dropna() + data.reset_index(inplace=True, drop=True) + encoding_func_dict = {"label": label_encoding, "onehot": onehot_encoding} + if method in encoding_func_dict: + d_data, encoder_dict = encoding_func_dict[method](data, columns) + else: + raise Exception("This encoding method is not supported") + + return d_data, encoder_dict + + +def inverse_discretization( + data: pd.DataFrame, columns: list, discretizer: KBinsDiscretizer +) -> pd.DataFrame: + """Inverse discretization for numeric params + + Args: + data (pd.DataFrame): input dataset with discrete values + columns (list): colums for inverse_discretization + discretizer (KBinsDiscretizer): fitted exemplar of discretization class + + Returns: + pd.DataFrame: output dataset with continuous values + """ + new_data = copy(data) + new_data[columns] = discretizer.inverse_transform(new_data[columns].values) + + return new_data + + +def decode(data: pd.DataFrame, columns: list, encoder_dict: dict) -> pd.DataFrame: + """Decoding categorical params to initial labels + + Args: + data (pd.DataFrame): input dataset with 
encoded params + columns (list): columns for decoding + encoder_dict (dict): dictionary with values and codes + Returns: + pd.DataFrame: output dataset with decoded params + """ + for column in columns: + dict_parameter = encoder_dict[column] + inv_map = {v: k for k, v in dict_parameter.items()} + data[column] = data[column].apply(lambda x: inv_map(x)) + + return data diff --git a/bamt/preprocess/graph.py b/bamt/preprocess/graph.py new file mode 100644 index 0000000..87e688a --- /dev/null +++ b/bamt/preprocess/graph.py @@ -0,0 +1,42 @@ +from collections import defaultdict + + +def nodes_from_edges(edges: list): + """ + Retrieves all nodes from the list of edges. + Arguments + ---------- + *edges* : list + + Returns + ------- + *nodes* : list + + Effects + ------- + None + """ + return set().union(edges) + + +def edges_to_dict(edges: list): + """ + Transfers the list of edges to the dictionary of parents. + Arguments + ---------- + *edges* : list + + Returns + ------- + *parents_dict* : dict + + Effects + ------- + None + """ + nodes = nodes_from_edges(edges) + parents_dict = defaultdict(list, {node: [] for node in nodes}) + parents_dict.update( + {child: parents_dict[child] + [parent] for parent, child in edges} + ) + return parents_dict diff --git a/bamt/preprocess/numpy_pandas.py b/bamt/preprocess/numpy_pandas.py new file mode 100644 index 0000000..32242df --- /dev/null +++ b/bamt/preprocess/numpy_pandas.py @@ -0,0 +1,57 @@ +# currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +# parentdir = os.path.dirname(currentdir) +# sys.path.insert(0,parentdir) +import numpy as np +import pandas as pd + + +def loc_to_DataFrame(data: np.array): + """Function to convert array to DataFrame + Args: + data (np.array): input array + + Returns: + data (pd.DataFrame): with string columns for filtering + """ + nodes_type = get_type_numpy(data) + if data.T.ndim == 1: + data = data.T + nodes_type = {0: nodes_type[0]} + dtype = { + key: "int64" if value == "disc" else "float64" + for key, value in nodes_type.items() + if value in ["disc", "cont"] + } + df = pd.DataFrame(data).astype(dtype) + df.columns = df.columns.map(str) + return df + + +def get_type_numpy(data: np.array): + """Function to define the type of the columns of array + disc - discrete node + cont - continuous + Args: + data (np.array): input array + + Returns: + dict: output dictionary where 'key' - node name and 'value' - node type + Notes: + -- You may have problems with confusing rows and columns + """ + arr = data.T + + column_type = {} + for i, row in enumerate(arr): + if row.ndim == 0 or row.T.ndim == 0: + row_is_integer = np.issubdtype(row, np.integer) or row.is_integer() + column_type[i] = "disc" if row_is_integer else "cont" + else: + all_row_is_integer = all( + np.issubdtype(x, np.integer) or x.is_integer() for x in row + ) + column_type[i] = "disc" if all_row_is_integer else "cont" + if column_type[i] not in ["disc", "cont"]: + print("get_type_numpy: Incorrect type of row") + print(row) + return column_type diff --git a/bamt/preprocessors.py b/bamt/preprocessors.py new file mode 100644 index 0000000..847ec9c --- /dev/null +++ b/bamt/preprocessors.py @@ -0,0 +1,126 @@ +from typing import Tuple, Dict + +from pandas import DataFrame + +from bamt.log import logger_preprocessor +from bamt.utils import GraphUtils as gru + + +class BasePreprocessor(object): + """ + Base for Preprocessor + """ + + def __init__(self): + self.nodes_signs = {} + self.nodes_types = {} + + @staticmethod + def 
get_nodes_types(data): + return gru.nodes_types(data=data) + + def get_nodes_signs(self, data): + return gru.nodes_signs(nodes_types=self.nodes_types, data=data) + + def code_categories( + self, data: DataFrame, encoder + ) -> Tuple[DataFrame, Dict[str, Dict]]: + """Encoding categorical parameters + + Args: + data (DataFrame): input dataset + encoder: any object with fit_transform method + + Returns: + pd.DataFrame: output dataset with encoded parameters + dict: dictionary with values and codes + """ + columns = [ + col for col in data.columns.to_list() if self.nodes_types[col] == "disc" + ] + df = data.copy() # INPUT DF. Debugging SettingWithCopyWarning + if not columns: + return df, None + data = df[columns] # DATA TO CATEGORIZE + encoder_dict = dict() + + for col_name, column in data.items(): + # Iterate over (column name, Series) pairs. + try: + df[col_name] = encoder.fit_transform(column.values) + except TypeError as exc: + logger_preprocessor.error( + f"Wrond data types on {col_name} ({df[col_name].dtypes}). Message: {exc}" + ) + try: + mapping = dict(zip(encoder.classes_, range(len(encoder.classes_)))) + encoder_dict[col_name] = mapping + except BaseException: + pass + return df, encoder_dict + + def discretize(self, data: DataFrame, discretizer) -> tuple: + columns = [ + col for col in data.columns.to_list() if self.nodes_types[col] == "cont" + ] + df = data.copy() + if not columns: + return df, None + data = df[columns] + + data_discrete = discretizer.fit_transform(data.values) + df[columns] = data_discrete.astype("int") + + return df, discretizer + + def decode(self): + pass + + +class Preprocessor(BasePreprocessor): + def __init__(self, pipeline: list): + super().__init__() + assert isinstance(pipeline, list), "pipeline must be list" + self.pipeline = pipeline + self.coder = None + + @property + def info(self): + return {"types": self.nodes_types, "signs": self.nodes_signs} + + def scan(self, data: DataFrame): + """ + Function to scan data. If something is wrong, it will be send to log file + """ + columns_cont = [ + col for col in data.columns.to_list() if self.nodes_types[col] == "cont" + ] + if not columns_cont: + logger_preprocessor.info("No one column is continuous") + + columns_disc = [ + col + for col in data.columns.to_list() + if self.nodes_types[col] in ["disc", "disc_num"] + ] + if not columns_disc: + logger_preprocessor.info("No one column is discrete") + + def apply(self, data: DataFrame) -> Tuple[DataFrame, Dict]: + """ + Apply pipeline + data: data to apply on + """ + df = data.copy() + self.nodes_types = self.get_nodes_types(data) + if list(self.nodes_types.keys()) != data.columns.to_list(): + logger_preprocessor.error("Nodes_types dictionary are not full.") + return None, None + self.nodes_signs = self.get_nodes_signs(data) + self.scan(df) + for name, instrument in self.pipeline: + if name == "encoder": + df, self.coder = self.code_categories(data=data, encoder=instrument) + if name == "discretizer": + df, est = self.discretize(data=df, discretizer=instrument) + return df, self.coder diff --git a/bamt/redef_HC.py b/bamt/redef_HC.py new file mode 100644 index 0000000..bab3119 --- /dev/null +++ b/bamt/redef_HC.py @@ -0,0 +1,316 @@ +""" +********************** +Greedy Hill-Climbing +for Structure Learning +********************** + +Code for Searching through the space of +possible Bayesian Network structures. + +Various optimization procedures are employed, +from greedy search to simulated annealing, and +so on - mostly using scipy.optimize. 
+
+Local search - possible moves:
+- Add edge
+- Delete edge
+- Invert edge
+
+Strategies to improve Greedy Hill-Climbing:
+- Random Restarts
+    - when we get stuck, take some number of
+      random steps and then start climbing again.
+- Tabu List
+    - keep a list of the K steps most recently taken,
+      and say that the search cannot reverse (undo) any
+      of these steps.
+"""
+
+from bamt.external.pyBN.classes.bayesnet import BayesNet
+from bamt.external.pyBN.utils.graph import would_cause_cycle
+from bamt.mi_entropy_gauss import mi_gauss
+from bamt.redef_info_scores import log_lik_local, BIC_local, AIC_local
+
+
+def hc(
+    data,
+    metric="MI",
+    max_iter=200,
+    debug=False,
+    init_nodes=None,
+    restriction=None,
+    init_edges=None,
+    remove_geo_edges=True,
+    black_list=None,
+):
+    """
+    Greedy Hill Climbing search proceeds by choosing the move
+    which maximizes the increase in fitness of the
+    network at the current step. It continues until
+    it reaches a point where there does not exist any
+    feasible single move that increases the network fitness.
+
+    It is called "greedy" because it simply does what is
+    best at the current iteration only, and thus does not
+    look ahead to what may be better later on in the search.
+
+    For computational savings, a priority queue (Python's heapq)
+    can be used to maintain the best operators and reduce the
+    complexity of picking the best operator from O(n^2) to O(n log n).
+    This works by maintaining the heapq of operators sorted by their
+    delta score, and each time a move is made, we only have to recompute
+    the O(n) delta-scores which were affected by the move. The rest of
+    the operator delta-scores are not affected.
+
+    For additional computational efficiency, we can cache the
+    sufficient statistics for various families of distributions -
+    therefore, computing the mutual information for a given family
+    only needs to happen once.
+
+    The possible moves are the following:
+        - add edge
+        - delete edge
+        - invert edge
+
+    Arguments
+    ---------
+    *data* : pd.DataFrame
+        The data from which the Bayesian network
+        structure will be learned.
+
+    *metric* : a string
+        Which score metric to use.
+        Options:
+            - AIC
+            - BIC / MDL
+            - LL (log-likelihood)
+
+    *max_iter* : an integer
+        The maximum number of iterations of the
+        hill-climbing algorithm to run. Note that
+        the algorithm will terminate on its own if no
+        improvement is made in a given iteration.
+
+    *debug* : boolean
+        Whether to print the scores/moves of the
+        algorithm as it's happening.
+
+    *init_nodes* : a list of initial nodes (given by their index in the dataset)
+
+    *restriction* : a list of 2-tuples
+        For the MMHC algorithm, the list of allowable edge additions.
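+
+    Example
+    -------
+    A minimal usage sketch, assuming `df` is a discretized pandas DataFrame
+    (columns are addressed by their integer position internally):
+
+        from bamt.redef_HC import hc
+
+        bn = hc(df, metric="BIC", max_iter=100)  # returns a pyBN BayesNet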
+ + Returns + ------- + *bn* : a BayesNet object + + """ + nrow = data.shape[0] + ncol = data.shape[1] + + names = range(ncol) + + # INITIALIZE NETWORK W/ NO EDGES + # maintain children and parents dict for fast lookups + c_dict = dict([(n, []) for n in names]) + p_dict = dict([(n, []) for n in names]) + if init_edges: + for edge in init_edges: + c_dict[edge[0]].append(edge[1]) + p_dict[edge[1]].append(edge[0]) + + bn = BayesNet(c_dict) + + mutual_information = mi_gauss + if metric == "BIC": + mutual_information = BIC_local + if metric == "AIC": + mutual_information = AIC_local + if metric == "LL": + mutual_information = log_lik_local + + data = data.values + + cache = dict() + + _iter = 0 + improvement = True + + while improvement: + improvement = False + max_delta = 0 + + if debug: + print("ITERATION: ", _iter) + + ### TEST ARC ADDITIONS ### + for u in bn.nodes(): + for v in bn.nodes(): + if ( + v not in c_dict[u] + and u != v + and not would_cause_cycle(c_dict, u, v) + and len(p_dict[v]) != 3 + ): + # FOR MMHC ALGORITHM -> Edge Restrictions + if ( + (init_nodes is None or not (v in init_nodes)) + and (restriction is None or (u, v) in restriction) + and (black_list is None or not ((u, v) in black_list)) + ): + # SCORE FOR 'V' -> gaining a parent + # without 'u' as parent + old_cols = (v,) + tuple(p_dict[v]) + if old_cols not in cache: + cache[old_cols] = mutual_information(data[:, old_cols]) + mi_old = cache[old_cols] + new_cols = old_cols + (u,) # with'u' as parent + if new_cols not in cache: + cache[new_cols] = mutual_information(data[:, new_cols]) + mi_new = cache[new_cols] + delta_score = nrow * (mi_old - mi_new) + + if delta_score > max_delta: + if debug: + print("Improved Arc Addition: ", (u, v)) + print("Delta Score: ", delta_score) + max_delta = delta_score + max_operation = "Addition" + max_arc = (u, v) + + # ### TEST ARC DELETIONS ### + for u in bn.nodes(): + for v in bn.nodes(): + if v in c_dict[u]: + # SCORE FOR 'V' -> losing a parent + old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent + if old_cols not in cache: + cache[old_cols] = mutual_information(data[:, old_cols]) + mi_old = cache[old_cols] + new_cols = tuple([i for i in old_cols if i != u]) + if new_cols not in cache: + cache[new_cols] = mutual_information(data[:, new_cols]) + mi_new = cache[new_cols] + delta_score = nrow * (mi_old - mi_new) + + if delta_score > max_delta: + if init_edges is None: + if debug: + print("Improved Arc Deletion: ", (u, v)) + print("Delta Score: ", delta_score) + max_delta = delta_score + max_operation = "Deletion" + max_arc = (u, v) + else: + if (u, v) in init_edges: + if remove_geo_edges: + if debug: + print("Improved Arc Deletion: ", (u, v)) + print("Delta Score: ", delta_score) + max_delta = delta_score + max_operation = "Deletion" + max_arc = (u, v) + else: + if debug: + print("Improved Arc Deletion: ", (u, v)) + print("Delta Score: ", delta_score) + max_delta = delta_score + max_operation = "Deletion" + max_arc = (u, v) + + # ### TEST ARC REVERSALS ### + for u in bn.nodes(): + for v in bn.nodes(): + if ( + v in c_dict[u] + and not would_cause_cycle(c_dict, v, u, reverse=True) + and len(p_dict[u]) != 3 + and (init_nodes is None or not (u in init_nodes)) + and (restriction is None or (v, u) in restriction) + and (black_list is None or not ((v, u) in black_list)) + ): + old_cols = (u,) + tuple(p_dict[v]) # without 'v' as parent + if old_cols not in cache: + cache[old_cols] = mutual_information(data[:, old_cols]) + mi_old = cache[old_cols] + new_cols = old_cols + (v,) # with 'v' as 
parent + if new_cols not in cache: + cache[new_cols] = mutual_information(data[:, new_cols]) + mi_new = cache[new_cols] + delta1 = -1 * nrow * (mi_old - mi_new) + # SCORE FOR 'V' -> losing 'u' as parent + old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent + if old_cols not in cache: + cache[old_cols] = mutual_information(data[:, old_cols]) + mi_old = cache[old_cols] + # without 'u' as parent + new_cols = tuple([u for i in old_cols if i != u]) + if new_cols not in cache: + cache[new_cols] = mutual_information(data[:, new_cols]) + mi_new = cache[new_cols] + delta2 = nrow * (mi_old - mi_new) + # COMBINED DELTA-SCORES + delta_score = delta1 + delta2 + + if delta_score > max_delta: + if init_edges is None: + if debug: + print("Improved Arc Reversal: ", (u, v)) + print("Delta Score: ", delta_score) + max_delta = delta_score + max_operation = "Reversal" + max_arc = (u, v) + else: + if (u, v) in init_edges: + if remove_geo_edges: + if debug: + print("Improved Arc Reversal: ", (u, v)) + print("Delta Score: ", delta_score) + max_delta = delta_score + max_operation = "Reversal" + max_arc = (u, v) + else: + if debug: + print("Improved Arc Reversal: ", (u, v)) + print("Delta Score: ", delta_score) + max_delta = delta_score + max_operation = "Reversal" + max_arc = (u, v) + + if max_delta != 0: + improvement = True + u, v = max_arc + if max_operation == "Addition": + if debug: + print("ADDING: ", max_arc, "\n") + c_dict[u].append(v) + p_dict[v].append(u) + + elif max_operation == "Deletion": + if debug: + print("DELETING: ", max_arc, "\n") + c_dict[u].remove(v) + p_dict[v].remove(u) + + elif max_operation == "Reversal": + if debug: + print("REVERSING: ", max_arc, "\n") + c_dict[u].remove(v) + p_dict[v].remove(u) + c_dict[v].append(u) + p_dict[u].append(v) + + else: + if debug: + print("No Improvement on Iter: ", _iter) + + ### TEST FOR MAX ITERATION ### + _iter += 1 + if _iter > max_iter: + if debug: + print("Max Iteration Reached") + break + + bn = BayesNet(c_dict) + + return bn diff --git a/bamt/redef_info_scores.py b/bamt/redef_info_scores.py new file mode 100644 index 0000000..0890550 --- /dev/null +++ b/bamt/redef_info_scores.py @@ -0,0 +1,179 @@ +import sys +import warnings +from copy import copy + +import numpy as np +import pandas as pd + +from bamt.mi_entropy_gauss import mi_gauss as mutual_information, entropy_all as entropy +from bamt.preprocess.graph import edges_to_dict +from bamt.preprocess.numpy_pandas import get_type_numpy + + +def info_score(edges: list, data: pd.DataFrame, method="LL"): + score_funcs = {"LL": log_lik_local, "BIC": BIC_local, "AIC": AIC_local} + score = score_funcs.get(method.upper(), BIC_local) + + parents_dict = edges_to_dict(edges) + nodes_with_edges = parents_dict.keys() + scores = [ + score(data[child_parents].copy(), method) + for var in nodes_with_edges + for child_parents in ([var] + parents_dict[var],) + ] + scores += [ + score(data[[var]].copy(), method) + for var in set(data.columns).difference(set(nodes_with_edges)) + ] + return sum(scores) + + +##### INFORMATION-THEORETIC SCORING FUNCTIONS ##### + + +def log_likelihood(bn, data, method="LL"): + """ + Determining log-likelihood of the parameters + of a Bayesian Network. This is a quite simple + score/calculation, but it is useful as a straight-forward + structure learning score. 
+ + Semantically, this can be considered as the evaluation + of the log-likelihood of the data, given the structure + and parameters of the BN: + - log( P( D | Theta_G, G ) ) + where Theta_G are the parameters and G is the structure. + + However, for computational reasons it is best to take + advantage of the decomposability of the log-likelihood score. + + As an example, if you add an edge from A->B, then you simply + need to calculate LOG(P'(B|A)) - Log(P(B)), and if the value + is positive then the edge improves the fitness score and should + therefore be included. + + Even more, you can expand and manipulate terms to calculate the + difference between the new graph and the original graph as follows: + Score(G') - Score(G) = M * I(X,Y), + where M is the number of data points and I(X,Y) is + the marginal mutual information calculated using + the empirical distribution over the data. + + In general, the likelihood score decomposes as follows: + LL(D | Theta_G, G) = + M * Sum over Variables ( I ( X , Parents(X) ) ) - + M * Sum over Variables ( H( X ) ), + where 'I' is mutual information and 'H' is the entropy, + and M is the number of data points + + Moreover, it is clear to see that H(X) is independent of the choice + of graph structure (G). Thus, we must only determine the difference + in the mutual information score of the original graph which had a given + node and its original parents, and the new graph which has a given node + and new parents. + + NOTE: This assumes the parameters have already + been learned for the BN's given structure. + + LL = LL - f(N)*|B|, where f(N) = 0 + + Arguments + --------- + *bn* : a BayesNet object + Must have both structure and parameters + instantiated. + Notes + ----- + NROW = data.shape[0] + mi_score = 0 + ent_score = 0 + for rv in bn.nodes(): + cols = tuple([bn.V.index(rv)].extend([bn.V.index(p) for p in bn.parents(rv)])) + mi_score += mutual_information(data[:,cols]) + ent_score += entropy(data[:,bn.V.index(rv)]) + + return NROW * (mi_score - ent_score) + """ + + NROW = data.shape[0] + mi_scores = [ + mutual_information( + data[:, (bn.V.index(rv),) + tuple([bn.V.index(p) for p in bn.parents(rv)])], + method=method, + ) + for rv in bn.nodes() + ] + ent_scores = [entropy(data[:, bn.V.index(rv)], method=method) for rv in bn.nodes()] + return NROW * (sum(mi_scores) - sum(ent_scores)) + + +def log_lik_local(data, method="LL"): + NROW = data.shape[0] + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + if isinstance(data, pd.DataFrame): + return NROW * ( + mutual_information(data, method=method) + - entropy(data.iloc[:, 0], method=method) + ) + elif isinstance(data, pd.Series): + return 0.0 + elif isinstance(data, np.ndarray): + return NROW * ( + mutual_information(data, method=method) + - entropy(data[:, 0], method=method) + ) + + +def BIC_local(data, method="BIC"): + NROW = data.shape[0] + log_score = log_lik_local(data, method=method) + try: + penalty = 0.5 * num_params(data) * np.log(NROW) + except OverflowError as err: + penalty = sys.float_info.max + return log_score - penalty + + +def num_params(data): + # Convert pandas DataFrame to numpy array + if isinstance(data, pd.DataFrame): + data = data.values + # Convert pandas Series to numpy array + if isinstance(data, pd.Series): + data = np.array(copy(data)) + + # Calculate number of parameters for numpy array + if isinstance(data, np.ndarray): + node_type = get_type_numpy(data) + columns_for_discrete = [ + param for param, node in node_type.items() if node == "cont" + ] + 
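+        # NOTE: despite its name, columns_for_discrete collects the continuous
+        # ("cont") columns; it only contributes a factor of its length to the
+        # parameter count computed below.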
columns_for_code = [ + param for param, node in node_type.items() if node == "disc" + ] + + prod = 1 + for var in columns_for_code: + prod *= ( + len(np.unique(data[:, var])) if data.ndim != 1 else len(np.unique(data)) + ) + if columns_for_discrete: + prod *= len(columns_for_discrete) + + # Handle overflow error + try: + return prod + except OverflowError: + return sys.float_info.max + + # Raise an error if data type is unexpected + print("Num_params: Unexpected data type") + print(data) + return None + + +def AIC_local(data, method="AIC"): + log_score = log_lik_local(data, method=method) + penalty = num_params(data) + return log_score - penalty diff --git a/bamt/score_functions/__init__.py b/bamt/score_functions/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/score_functions/k2_score.py b/bamt/score_functions/k2_score.py deleted file mode 100644 index 4a46b6a..0000000 --- a/bamt/score_functions/k2_score.py +++ /dev/null @@ -1,9 +0,0 @@ -from .score_function import ScoreFunction - - -class K2Score(ScoreFunction): - def __init__(self): - super().__init__() - - def estimate(self): - pass diff --git a/bamt/score_functions/mutual_information_score.py b/bamt/score_functions/mutual_information_score.py deleted file mode 100644 index 0bd2b7e..0000000 --- a/bamt/score_functions/mutual_information_score.py +++ /dev/null @@ -1,9 +0,0 @@ -from .score_function import ScoreFunction - - -class MutualInformationScore(ScoreFunction): - def __init__(self): - super().__init__() - - def estimate(self): - pass diff --git a/bamt/score_functions/score_function.py b/bamt/score_functions/score_function.py deleted file mode 100644 index e3ec70f..0000000 --- a/bamt/score_functions/score_function.py +++ /dev/null @@ -1,10 +0,0 @@ -from abc import ABC, abstractmethod - - -class ScoreFunction(ABC): - def __init__(self): - pass - - @abstractmethod - def estimate(self): - pass diff --git a/bamt/utils/EvoUtils.py b/bamt/utils/EvoUtils.py new file mode 100644 index 0000000..5266a75 --- /dev/null +++ b/bamt/utils/EvoUtils.py @@ -0,0 +1,116 @@ +import random + +import pandas as pd +from golem.core.dag.convert import graph_structure_as_nx_graph +from golem.core.dag.graph_utils import ordered_subnodes_hierarchy +from golem.core.optimisers.graph import OptGraph, OptNode +from pgmpy.estimators import K2Score +from pgmpy.models import BayesianNetwork + + +class CustomGraphModel(OptGraph): + def evaluate(self, data: pd.DataFrame): + nodes = data.columns.to_list() + _, labels = graph_structure_as_nx_graph(self) + return len(nodes) + + +class CustomGraphNode(OptNode): + def __str__(self): + return f'{self.content["name"]}' + + +def K2_metric(graph: CustomGraphModel, data: pd.DataFrame): + graph_nx, labels = graph_structure_as_nx_graph(graph) + struct = [] + for meta_edge in graph_nx.edges(): + l1 = str(labels[meta_edge[0]]) + l2 = str(labels[meta_edge[1]]) + struct.append([l1, l2]) + + bn_model = BayesianNetwork(struct) + bn_model.add_nodes_from(data.columns) + + score = K2Score(data).score(bn_model) + return -score + + +def custom_mutation_add(graph: CustomGraphModel, **kwargs): + num_mut = 100 + try: + for _ in range(num_mut): + rid = random.choice(range(len(graph.nodes))) + random_node = graph.nodes[rid] + other_random_node = graph.nodes[random.choice(range(len(graph.nodes)))] + nodes_not_cycling = random_node.descriptive_id not in [ + n.descriptive_id for n in ordered_subnodes_hierarchy(other_random_node) + ] and other_random_node.descriptive_id not in [ + n.descriptive_id for n in 
ordered_subnodes_hierarchy(random_node) + ] + if nodes_not_cycling: + random_node.nodes_from.append(other_random_node) + break + + except Exception as ex: + print(f"Incorrect connection: {ex}") + return graph + + +def custom_mutation_delete(graph: OptGraph, **kwargs): + num_mut = 100 + try: + for _ in range(num_mut): + rid = random.choice(range(len(graph.nodes))) + random_node = graph.nodes[rid] + other_random_node = graph.nodes[random.choice(range(len(graph.nodes)))] + if ( + random_node.nodes_from is not None + and other_random_node in random_node.nodes_from + ): + random_node.nodes_from.remove(other_random_node) + break + except Exception as ex: + print(ex) + return graph + + +def custom_mutation_reverse(graph: OptGraph, **kwargs): + num_mut = 100 + try: + for _ in range(num_mut): + rid = random.choice(range(len(graph.nodes))) + random_node = graph.nodes[rid] + other_random_node = graph.nodes[random.choice(range(len(graph.nodes)))] + if ( + random_node.nodes_from is not None + and other_random_node in random_node.nodes_from + ): + random_node.nodes_from.remove(other_random_node) + other_random_node.nodes_from.append(random_node) + break + except Exception as ex: + print(ex) + return graph + + +def has_no_duplicates(graph): + _, labels = graph_structure_as_nx_graph(graph) + if len(labels.values()) != len(set(labels.values())): + raise ValueError("Custom graph has duplicates") + return True + + +def has_no_blacklist_edges(graph, blacklist): + nx_graph, _ = graph_structure_as_nx_graph(graph) + for edge in nx_graph.edges(): + if edge in blacklist: + raise ValueError("Graph contains blacklisted edges") + return True + + +def has_only_whitelist_edges(graph, whitelist): + nx_graph, _ = graph_structure_as_nx_graph(graph) + for edge in nx_graph.edges(): + if edge not in whitelist: + raise ValueError("Graph contains non-whitelisted edges") + return True diff --git a/bamt/utils/GraphUtils.py b/bamt/utils/GraphUtils.py new file mode 100644 index 0000000..ccffae1 --- /dev/null +++ b/bamt/utils/GraphUtils.py @@ -0,0 +1,158 @@ +from typing import Dict, List, Tuple, Type + +import networkx as nx +from pandas import DataFrame + +from bamt.log import logger_preprocessor +from bamt.nodes.base import BaseNode + + +def nodes_types(data: DataFrame) -> Dict[str, str]: + """ + Function to define the type of the node + disc - discrete node + cont - continuous + Args: + data: input dataset + + Returns: + dict: output dictionary where 'key' - node name and 'value' - node type + """ + + column_type = dict() + for c in data.columns.to_list(): + disc = ["str", "O", "b", "categorical", "object", "bool"] + disc_numerical = ["int32", "int64"] + cont = ["float32", "float64"] + if data[c].dtype.name in disc: + column_type[c] = "disc" + elif data[c].dtype.name in cont: + column_type[c] = "cont" + elif data[c].dtype.name in disc_numerical: + column_type[c] = "disc_num" + else: + logger_preprocessor.error(f"Unsupported data type. 
Dtype: {data[c].dtypes}") + + return column_type + + +def nodes_signs(nodes_types: dict, data: DataFrame) -> Dict[str, str]: + """Function to define sign of the node + neg - if node has negative values + pos - if node has only positive values + + Args: + data (pd.DataFrame): input dataset + nodes_types (dict): dict with nodes_types + + Returns: + dict: output dictionary where 'key' - node name and 'value' - sign of data + """ + if list(nodes_types.keys()) != data.columns.to_list(): + logger_preprocessor.error("Nodes_types dictionary is not full.") + return + columns_sign = dict() + for c in data.columns.to_list(): + if nodes_types[c] == "cont": + if (data[c] < 0).any(): + columns_sign[c] = "neg" + else: + columns_sign[c] = "pos" + return columns_sign + + +def get_descriptor(data) -> Dict[str, Dict[str, str]]: + return {"types": nodes_types(data), "signs": nodes_signs(nodes_types(data), data)} + + +def toporder(nodes: List[Type[BaseNode]], edges: List[Tuple]) -> List[List[str]]: + """ + Function for topological sorting + """ + G = nx.DiGraph() + G.add_nodes_from([node.name for node in nodes]) + G.add_edges_from(edges) + return list(nx.topological_sort(G)) + + +class GraphAnalyzer(object): + """ + Object to analyze DAG. + """ + + def __init__(self, bn): + self.bn = bn + + def _isolate_structure(self, nodes): + isolated_edges = [] + for edge in self.bn.edges: + if edge[0] in nodes and edge[1] in nodes: + isolated_edges.append(edge) + return isolated_edges + + def markov_blanket(self, node_name: str): + node = self.bn[node_name] + + parents = node.cont_parents + node.disc_parents + children = node.children + fremd_eltern = [] + + for child in node.children: + all_parents = self.bn[child].cont_parents + self.bn[child].disc_parents + + if all_parents == [node_name]: + continue + else: + new = all_parents + fremd_eltern.extend(new) + + nodes = parents + children + fremd_eltern + [node_name] + + edges = self._isolate_structure(nodes) + return {"nodes": list(set(nodes)), "edges": edges} + + def _collect_height(self, node_name, height): + nodes = [] + node = self.bn[node_name] + if height <= 0: + return [] + + if height == 1: + return node.disc_parents + node.cont_parents + + for parent in node.cont_parents + node.disc_parents: + nodes.append(parent) + nodes.extend(self._collect_height(parent, height=height - 1)) + return nodes + + def _collect_depth(self, node_name, depth): + nodes = [] + node = self.bn[node_name] + + if depth <= 0: + return [] + + if depth == 1: + return node.children + + for child in node.children: + nodes.append(child) + nodes.extend(self._collect_depth(child, depth=depth - 1)) + + return nodes + + def find_family(self, *args): + node_name, height, depth, with_nodes = args + if not with_nodes: + with_nodes = [] + else: + with_nodes = list(with_nodes) + nodes = ( + self._collect_depth(node_name, depth) + + self._collect_height(node_name, height) + + [node_name] + ) + + nodes = list(set(nodes + with_nodes)) + + return {"nodes": nodes, "edges": self._isolate_structure(nodes + with_nodes)} diff --git a/bamt/utils/MathUtils.py b/bamt/utils/MathUtils.py new file mode 100644 index 0000000..ed08105 --- /dev/null +++ b/bamt/utils/MathUtils.py @@ -0,0 +1,161 @@ +import math + +import numpy as np +from scipy import stats +from scipy.stats.distributions import chi2 +from sklearn.mixture import GaussianMixture + + +def lrts_comp(data): + n = 0 + biggets_p = -1 * np.infty + comp_biggest = 0 + max_comp = 10 + if len(data) < max_comp: + max_comp = len(data) + for i in range(1, max_comp + 1, 1): 
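+        # Editorial note: each pass fits mixtures with i and i + 1 components,
+        # forms LR = 2 * (ll2 - ll1) from the average log-likelihoods and tests it
+        # against chi2 with one degree of freedom, keeping the component count
+        # whose test gives the largest p-value.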
+ gm1 = GaussianMixture(n_components=i, random_state=0) + gm2 = GaussianMixture(n_components=i + 1, random_state=0) + gm1.fit(data) + ll1 = np.mean(gm1.score_samples(data)) + gm2.fit(data) + ll2 = np.mean(gm2.score_samples(data)) + LR = 2 * (ll2 - ll1) + p = chi2.sf(LR, 1) + if p > biggets_p: + biggets_p = p + comp_biggest = i + n = comp_biggest + return n + + +def mix_norm_cdf(x, weights, means, covars): + mcdf = 0.0 + for i in range(len(weights)): + mcdf += weights[i] * stats.norm.cdf(x, loc=means[i][0], scale=covars[i][0][0]) + return mcdf + + +def theoretical_quantile(data, n_comp): + model = GaussianMixture(n_components=n_comp, random_state=0) + model.fit(data) + q = [] + x = [] + # step = ((np.max(model.sample(100000)[0])) - (np.min(model.sample(100000)[0])))/1000 + step = (np.max(data) - np.min(data)) / 1000 + d = np.arange(np.min(data), np.max(data), step) + for i in d: + x.append(i) + q.append(mix_norm_cdf(i, model.weights_, model.means_, model.covariances_)) + return x, q + + +def quantile_mix(p, vals, q): + ind = q.index(min(q, key=lambda x: abs(x - p))) + return vals[ind] + + +def probability_mix(val, vals, q): + ind = vals.index(min(vals, key=lambda x: abs(x - val))) + return q[ind] + + +def sum_dist(data, vals, q): + percs = np.linspace(1, 100, 10) + x = np.quantile(data, percs / 100) + y = [] + for p in percs: + y.append(quantile_mix(p / 100, vals, q)) + dist = 0 + for xi, yi in zip(x, y): + dist = dist + (abs(-1 * xi + yi)) / math.sqrt(2) + return dist + + +def component(data, columns, method): + n = 1 + max_comp = 10 + x = [] + if data.shape[0] < max_comp: + max_comp = data.shape[0] + if len(columns) == 1: + x = np.transpose([data[columns[0]].values]) + else: + x = data[columns].values + if method == "aic": + lowest_aic = np.infty + comp_lowest = 0 + for i in range(1, max_comp + 1, 1): + gm1 = GaussianMixture(n_components=i, random_state=0) + gm1.fit(x) + aic1 = gm1.aic(x) + if aic1 < lowest_aic: + lowest_aic = aic1 + comp_lowest = i + n = comp_lowest + + if method == "bic": + lowest_bic = np.infty + comp_lowest = 0 + for i in range(1, max_comp + 1, 1): + gm1 = GaussianMixture(n_components=i, random_state=0) + gm1.fit(x) + bic1 = gm1.bic(x) + if bic1 < lowest_bic: + lowest_bic = bic1 + comp_lowest = i + n = comp_lowest + + if method == "LRTS": + n = lrts_comp(x) + if method == "quantile": + biggest_p = -1 * np.infty + comp_biggest = 0 + for i in range(1, max_comp, 1): + vals, q = theoretical_quantile(x, i) + dist = sum_dist(x, vals, q) + p = probability_mix(dist, vals, q) + if p > biggest_p: + biggest_p = p + comp_biggest = i + n = comp_biggest + return n + + +def _child_dict(net: list): + res_dict = dict() + for e0, e1 in net: + if e1 in res_dict: + res_dict[e1].append(e0) + else: + res_dict[e1] = [e0] + return res_dict + + +def precision_recall(pred_net: list, true_net: list, decimal=4): + pred_dict = _child_dict(pred_net) + true_dict = _child_dict(true_net) + corr_undirected = 0 + corr_dir = 0 + for e0, e1 in pred_net: + flag = True + if e1 in true_dict: + if e0 in true_dict[e1]: + corr_undirected += 1 + corr_dir += 1 + flag = False + if (e0 in true_dict) and flag: + if e1 in true_dict[e0]: + corr_undirected += 1 + pred_len = len(pred_net) + true_len = len(true_net) + shd = pred_len + true_len - corr_undirected - corr_dir + return { + "AP": round(corr_undirected / pred_len, decimal), + "AR": round(corr_undirected / true_len, decimal), + # 'F1_undir': round(2 * (corr_undirected / pred_len) * (corr_undirected / true_len) / (corr_undirected / pred_len + corr_undirected / 
true_len), decimal), + "AHP": round(corr_dir / pred_len, decimal), + "AHR": round(corr_dir / true_len, decimal), + # 'F1_directed': round(2*(corr_dir/pred_len)*(corr_dir/true_len)/(corr_dir/pred_len+corr_dir/true_len), decimal), + "SHD": shd, + } diff --git a/bamt/dag_optimizers/__init__.py b/bamt/utils/__init__.py similarity index 100% rename from bamt/dag_optimizers/__init__.py rename to bamt/utils/__init__.py diff --git a/bamt/utils/check_utils.py b/bamt/utils/check_utils.py new file mode 100644 index 0000000..ec44a33 --- /dev/null +++ b/bamt/utils/check_utils.py @@ -0,0 +1,9 @@ +def is_model(model): + try: + methods = dir(model) + if "fit" in methods: + return True + else: + return False + except Exception: + return False diff --git a/bamt/utils/composite_utils/CompositeGeneticOperators.py b/bamt/utils/composite_utils/CompositeGeneticOperators.py new file mode 100644 index 0000000..f42493a --- /dev/null +++ b/bamt/utils/composite_utils/CompositeGeneticOperators.py @@ -0,0 +1,189 @@ +from math import log10 +from random import choice + +import pandas as pd +from golem.core.dag.graph_utils import ordered_subnodes_hierarchy +from numpy import std, mean, log +from scipy.stats import norm +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import train_test_split + +from .CompositeModel import CompositeModel +from .MLUtils import MlModels + + +def custom_crossover_all_model( + graph_first: CompositeModel, graph_second: CompositeModel, max_depth +): + num_cros = 100 + try: + for _ in range(num_cros): + selected_node1 = choice(graph_first.nodes) + if selected_node1.nodes_from is None or selected_node1.nodes_from == []: + continue + + selected_node2 = graph_second.get_nodes_by_name(str(selected_node1))[0] + if selected_node2.nodes_from is None or selected_node2.nodes_from == []: + continue + + model1 = selected_node1.content["parent_model"] + model2 = selected_node2.content["parent_model"] + + selected_node1.content["parent_model"] = model2 + selected_node2.content["parent_model"] = model1 + + break + + except Exception as ex: + print(ex) + return graph_first, graph_second + + +def custom_mutation_add_structure(graph: CompositeModel, **kwargs): + num_mut = 100 + try: + for _ in range(num_mut): + rid = choice(range(len(graph.nodes))) + random_node = graph.nodes[rid] + other_random_node = graph.nodes[choice(range(len(graph.nodes)))] + nodes_not_cycling = random_node.descriptive_id not in [ + n.descriptive_id for n in ordered_subnodes_hierarchy(other_random_node) + ] and other_random_node.descriptive_id not in [ + n.descriptive_id for n in ordered_subnodes_hierarchy(random_node) + ] + if nodes_not_cycling: + other_random_node.nodes_from.append(random_node) + ml_models = MlModels() + other_random_node.content[ + "parent_model" + ] = ml_models.get_model_by_children_type(other_random_node) + break + + except Exception as ex: + graph.log.warn(f"Incorrect connection: {ex}") + return graph + + +def custom_mutation_delete_structure(graph: CompositeModel, **kwargs): + num_mut = 100 + try: + for _ in range(num_mut): + rid = choice(range(len(graph.nodes))) + random_node = graph.nodes[rid] + other_random_node = graph.nodes[choice(range(len(graph.nodes)))] + if ( + random_node.nodes_from is not None + and other_random_node in random_node.nodes_from + ): + random_node.nodes_from.remove(other_random_node) + if not random_node.nodes_from: + random_node.content["parent_model"] = None + break + except Exception as ex: + print(ex) + return graph + + +def 
custom_mutation_reverse_structure(graph: CompositeModel, **kwargs): + num_mut = 100 + try: + for _ in range(num_mut): + rid = choice(range(len(graph.nodes))) + random_node = graph.nodes[rid] + other_random_node = graph.nodes[choice(range(len(graph.nodes)))] + if ( + random_node.nodes_from is not None + and other_random_node in random_node.nodes_from + ): + random_node.nodes_from.remove(other_random_node) + if not random_node.nodes_from: + random_node.content["parent_model"] = None + other_random_node.nodes_from.append(random_node) + ml_models = MlModels() + other_random_node.content[ + "parent_model" + ] = ml_models.get_model_by_children_type(other_random_node) + break + except Exception as ex: + print(ex) + return graph + + +def custom_mutation_add_model(graph: CompositeModel, **kwargs): + try: + all_nodes = graph.nodes + nodes_with_parents = [ + node + for node in all_nodes + if (node.nodes_from != [] and node.nodes_from is not None) + ] + if not nodes_with_parents: + return graph + node = choice(nodes_with_parents) + ml_models = MlModels() + node.content["parent_model"] = ml_models.get_model_by_children_type(node) + except Exception as ex: + print(ex) + return graph + + +def composite_metric(graph: CompositeModel, data: pd.DataFrame, percent=0.02): + data_all = data + data_train, data_test = train_test_split(data_all, train_size=0.8, random_state=42) + score, len_data = 0, len(data_train) + for node in graph.nodes: + data_of_node_train = data_train[node.content["name"]] + data_of_node_test = data_test[node.content["name"]] + if node.nodes_from is None or node.nodes_from == []: + if node.content["type"] == "cont": + mu, sigma = mean(data_of_node_train), std(data_of_node_train) + score += norm.logpdf( + data_of_node_test.values, loc=mu, scale=sigma + ).sum() + else: + count = data_of_node_train.value_counts() + frequency = log(count / len_data) + index = frequency.index.tolist() + for value in data_of_node_test: + if value in index: + score += frequency[value] + else: + model, columns, target, idx = ( + MlModels().dict_models[node.content["parent_model"]](), + [n.content["name"] for n in node.nodes_from], + data_of_node_train.to_numpy(), + data_train.index.to_numpy(), + ) + setattr(model, "max_iter", 100000) + features = data_train[columns].to_numpy() + if len(set(target)) == 1: + continue + fitted_model = model.fit(features, target) + + features = data_test[columns].to_numpy() + target = data_of_node_test.to_numpy() + if node.content["type"] == "cont": + predict = fitted_model.predict(features) + mse = mean_squared_error(target, predict, squared=False) + 0.0000001 + a = norm.logpdf(target, loc=predict, scale=mse) + score += a.sum() + else: + predict_proba = fitted_model.predict_proba(features) + idx = pd.array(list(range(len(target)))) + li = [] + + for i in idx: + a = predict_proba[i] + try: + b = a[target[i]] + except BaseException: + b = 0.0000001 + if b < 0.0000001: + b = 0.0000001 + li.append(log(b)) + score += sum(li) + + edges_count = len(graph.get_edges()) + score -= (edges_count * percent) * log10(len_data) * edges_count + + return -score diff --git a/bamt/utils/composite_utils/CompositeModel.py b/bamt/utils/composite_utils/CompositeModel.py new file mode 100644 index 0000000..344a7c8 --- /dev/null +++ b/bamt/utils/composite_utils/CompositeModel.py @@ -0,0 +1,17 @@ +from typing import Optional, Union, List + +from golem.core.dag.graph_delegate import GraphDelegate +from golem.core.dag.linked_graph_node import LinkedGraphNode + + +class CompositeNode(LinkedGraphNode): + def 
__str__(self): + return self.content["name"] + + +class CompositeModel(GraphDelegate): + def __init__( + self, nodes: Optional[Union[LinkedGraphNode, List[LinkedGraphNode]]] = None + ): + super().__init__(nodes) + self.unique_pipeline_id = 1 diff --git a/bamt/utils/composite_utils/MLUtils.py b/bamt/utils/composite_utils/MLUtils.py new file mode 100644 index 0000000..9c3cefb --- /dev/null +++ b/bamt/utils/composite_utils/MLUtils.py @@ -0,0 +1,126 @@ +import json +from random import choice + +import pkg_resources +from typing import Union + +from catboost import CatBoostClassifier, CatBoostRegressor +from golem.core.dag.graph_node import GraphNode +from sklearn.cluster import KMeans +from sklearn.ensemble import ( + AdaBoostRegressor, + ExtraTreesRegressor, + GradientBoostingRegressor, + RandomForestClassifier, + RandomForestRegressor, +) +from sklearn.linear_model import ( + Lasso, + LinearRegression, + LogisticRegression, + Ridge, + SGDRegressor, +) +from sklearn.naive_bayes import BernoulliNB, MultinomialNB +from sklearn.neural_network import MLPClassifier +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from xgboost import XGBClassifier, XGBRegressor + +from .CompositeModel import CompositeNode + +# Try to import LGBMRegressor and LGBMClassifier from lightgbm, if not available set to None +try: + from lightgbm.sklearn import LGBMRegressor, LGBMClassifier +except ModuleNotFoundError: + LGBMRegressor = None + LGBMClassifier = None + +lgbm_params = "lgbm_params.json" +models_repo = "models_repo.json" +lgbm_params_path = pkg_resources.resource_filename(__name__, lgbm_params) +models_repo_path = pkg_resources.resource_filename(__name__, models_repo) + + +class MlModels: + def __init__(self): + self.operations_by_types = { + "xgbreg": "XGBRegressor", + "adareg": "AdaBoostRegressor", + "gbr": "GradientBoostingRegressor", + "dtreg": "DecisionTreeRegressor", + "treg": "ExtraTreesRegressor", + "rfr": "RandomForestRegressor", + "linear": "LinearRegression", + "ridge": "Ridge", + "lasso": "Lasso", + "sgdr": "SGDRegressor", + "lgbmreg": "LGBMRegressor", + "catboostreg": "CatBoostRegressor", + "xgboost": "XGBClassifier", + "logit": "LogisticRegression", + "bernb": "BernoulliNB", + "multinb": "MultinomialNB", + "dt": "DecisionTreeClassifier", + "rf": "RandomForestClassifier", + "mlp": "MLPClassifier", + "catboost": "CatBoostClassifier", + "kmeans": "KMeans", + } + + self.dict_models = { + "XGBRegressor": XGBRegressor, + "AdaBoostRegressor": AdaBoostRegressor, + "GradientBoostingRegressor": GradientBoostingRegressor, + "DecisionTreeRegressor": DecisionTreeRegressor, + "ExtraTreesRegressor": ExtraTreesRegressor, + "RandomForestRegressor": RandomForestRegressor, + "LinearRegression": LinearRegression, + "Ridge": Ridge, + "Lasso": Lasso, + "SGDRegressor": SGDRegressor, + "LGBMRegressor": LGBMRegressor, + "CatBoostRegressor": CatBoostRegressor, + "XGBClassifier": XGBClassifier, + "LogisticRegression": LogisticRegression, + "BernoulliNB": BernoulliNB, + "MultinomialNB": MultinomialNB, + "DecisionTreeClassifier": DecisionTreeClassifier, + "RandomForestClassifier": RandomForestClassifier, + "MLPClassifier": MLPClassifier, + "LGBMClassifier": LGBMClassifier, + "CatBoostClassifier": CatBoostClassifier, + "KMeans": KMeans, + } + + # Include LGBMRegressor and LGBMClassifier if they were imported successfully + if LGBMRegressor is not None: + self.dict_models["LGBMRegressor"] = LGBMRegressor + self.operations_by_types["lgbmreg"] = "LGBMRegressor" + if LGBMClassifier is not None: + 
self.dict_models["LGBMClassifier"] = LGBMClassifier + self.operations_by_types["lgbm"] = "LGBMClassifier" + + if LGBMClassifier and LGBMRegressor is not None: + with open(lgbm_params_path) as file: + self.lgbm_dict = json.load(file) + + def get_model_by_children_type(self, node: Union[GraphNode, CompositeNode]): + candidates = [] + if node.content["type"] == "cont": + type_model = "regr" + else: + type_model = "class" + forbidden_tags = ["non-default", "expensive"] + with open(models_repo_path, "r") as f: + models_json = json.load(f) + models = models_json["operations"] + if LGBMClassifier and LGBMRegressor is not None: + models = models | self.lgbm_dict + for model, value in models.items(): + if ( + model not in ["knnreg", "knn", "qda"] + and list(set(value["tags"]).intersection(forbidden_tags)) == [] + and type_model in value["meta"] + ): + candidates.append(model) + return self.operations_by_types[choice(candidates)] diff --git a/bamt/utils/composite_utils/lgbm_params.json b/bamt/utils/composite_utils/lgbm_params.json new file mode 100644 index 0000000..5c5fa9d --- /dev/null +++ b/bamt/utils/composite_utils/lgbm_params.json @@ -0,0 +1,11 @@ +{ + "lgbm": { + "meta": "sklearn_class", + "tags": ["boosting", "tree", "non_linear", "mix"] + }, + "lgbmreg": { + "meta": "sklearn_regr", + "presets": ["*tree"], + "tags": ["boosting", "tree", "non_multi", "non_linear", "mix"] + } +} \ No newline at end of file diff --git a/bamt/utils/composite_utils/models_repo.json b/bamt/utils/composite_utils/models_repo.json new file mode 100644 index 0000000..38a4d20 --- /dev/null +++ b/bamt/utils/composite_utils/models_repo.json @@ -0,0 +1,447 @@ +{ + "metadata": { + "custom_class": { + "accepted_node_types": [ + "any" + ], + "description": "Implementations of the custom classification models", + "forbidden_node_types": "[]", + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.classification", + "FedotClassificationStrategy" + ], + "tags": [ + "ml", + "custom" + ], + "tasks": "[TaskTypesEnum.classification]" + }, + "custom_regr": { + "accepted_node_types": [ + "any" + ], + "description": "Implementations of the custom regression models", + "forbidden_node_types": "[]", + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.regression", + "FedotRegressionStrategy" + ], + "tags": [ + "ml", + "custom" + ], + "tasks": "[TaskTypesEnum.regression]" + }, + "sklearn_class": { + "accepted_node_types": [ + "any" + ], + "description": "Implementations of the classification models from scikit-learn framework", + "forbidden_node_types": "[]", + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.classification", + "SkLearnClassificationStrategy" + ], + "tags": [ + "ml", + "sklearn" + ], + "tasks": "[TaskTypesEnum.classification]" + }, + "sklearn_clust": { + "accepted_node_types": [ + "any" + ], + "description": "Implementations of the clustering models from scikit-learn framework", + "forbidden_node_types": "[]", + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.clustering", + "SkLearnClusteringStrategy" + ], + "tags": [ + "ml", + "sklearn" + ], + "tasks": "[TaskTypesEnum.clustering]" + }, + "sklearn_regr": { + "accepted_node_types": [ + "any" + ], + "description": "Implementations of the 
regression models from scikit-learn framework", + "forbidden_node_types": "[]", + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.regression", + "SkLearnRegressionStrategy" + ], + "tags": [ + "ml", + "sklearn", + "composition" + ], + "tasks": "[TaskTypesEnum.regression, TaskTypesEnum.ts_forecasting]" + }, + "ts_model": { + "description": "Implementations of the time series models", + "input_type": "[DataTypesEnum.ts, DataTypesEnum.multi_ts]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.time_series", + "FedotTsForecastingStrategy" + ], + "tags": [ + "time_series" + ], + "tasks": "[TaskTypesEnum.ts_forecasting]" + }, + "custom_model": { + "description": "Implementations of the models specified by user with external code source", + "input_type": "[DataTypesEnum.ts, DataTypesEnum.table, DataTypesEnum.text]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.custom", + "CustomModelStrategy" + ], + "tags": [ + "non-default" + ], + "tasks": "[TaskTypesEnum.regression, TaskTypesEnum.ts_forecasting, TaskTypesEnum.classification, TaskTypesEnum.clustering]" + } + }, + "operations": { + "adareg": { + "meta": "sklearn_regr", + "presets": ["fast_train", "ts", "*tree"], + "tags": [ + "boosting", + "non_multi", + "non_linear", + "cont" + ] + }, + "ar": { + "meta": "ts_model", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", + "interpretable", + "non_lagged", + "linear", + "correct_params" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "arima": { + "meta": "ts_model", + "presets": ["ts"], + "tags": [ + "simple", + "interpretable", + "non_lagged", + "linear", + "new_data_refit" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "clstm": { + "meta": "ts_model", + "tags": [ + "non_lagged", + "non_linear" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "bernb": { + "meta": "sklearn_class", + "presets": ["fast_train"], + "tags": [ + "bayesian", "non_multi", "linear", "disc" + ] + }, + "catboost": { + "meta": "sklearn_class", + "presets": ["*tree"], + "tags": [ + "boosting", "non-default", "non_linear", "mix" + ] + }, + "catboostreg": { + "meta": "sklearn_regr", + "presets": ["*tree"], + "tags": [ + "boosting", "non_multi", "non-default", "non_linear", "mix" + ] + }, + "dt": { + "meta": "sklearn_class", + "presets": ["fast_train", "*tree"], + "tags": [ + "tree", + "interpretable", + "non_linear", + "cont" + ] + }, + "dtreg": { + "meta": "sklearn_regr", + "presets": ["fast_train", "ts", "*tree"], + "tags": [ + "tree", + "interpretable", + "non_linear", + "cont" + ] + }, + "gbr": { + "meta": "sklearn_regr", + "presets": ["*tree"], + "tags": [ + "boosting", + "non_multi", + "non_linear", + "cont" + ] + }, + "kmeans": { + "meta": "sklearn_clust", + "presets": ["fast_train"], + "tags": ["linear"] + }, + "knn": { + "meta": "custom_class", + "presets": ["fast_train"], + "tags": [ + "simple", + "correct_params", + "non_linear", + "cont" + ] + }, + "knnreg": { + "meta": "custom_regr", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", + "correct_params", + "non_linear", + "cont" + ] + }, + "lasso": { + "meta": "sklearn_regr", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", + "linear", + "interpretable", + "cont" + ] + }, + "lda": { + "meta": "custom_class", + "presets": ["fast_train"], + "tags": [ + "discriminant", "linear", "correct_params", "non-default", "cont" + ] + }, + "linear": { + "meta": 
"sklearn_regr", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", "linear", "interpretable", "cont" + ] + }, + "logit": { + "meta": "sklearn_class", + "presets": ["fast_train"], + "tags": [ + "simple", + "linear", + "interpretable", + "non_multi", + "cont" + ] + }, + "mlp": { + "meta": "sklearn_class", + "tags": [ + "neural", + "non_linear", + "cont" + ] + }, + "multinb": { + "meta": "sklearn_class", + "presets": ["fast_train"], + "tags": [ + "non-default", + "bayesian", + "non_multi", + "linear", + "disc" + ] + }, + "qda": { + "meta": "custom_class", + "presets": ["fast_train"], + "tags": [ + "discriminant", + "quadratic", + "non_linear", + "cont" + ] + }, + "rf": { + "meta": "sklearn_class", + "presets": ["fast_train", "*tree"], + "tags": ["tree", "non_linear", "cont"] + }, + "rfr": { + "meta": "sklearn_regr", + "presets": ["fast_train", "*tree"], + "tags": ["tree", "non_linear", "cont"] + }, + "ridge": { + "meta": "sklearn_regr", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", + "linear", + "interpretable", + "cont" + ] + }, + "polyfit": { + "meta": "ts_model", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", + "non_lagged", + "non_linear", + "interpretable", + "correct_params" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "sgdr": { + "meta": "sklearn_regr", + "presets": ["fast_train", "ts"], + "tags": [ + "non_multi", "non_linear", "cont" + ] + }, + "stl_arima": { + "meta": "ts_model", + "presets": ["ts"], + "tags": [ + "simple", + "interpretable", + "non_lagged", + "linear", + "new_data_refit" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "glm": { + "meta": "ts_model", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", + "interpretable", + "non_lagged", + "correct_params", + "non_linear" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "ets": { + "meta": "ts_model", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", + "interpretable", + "non_lagged", + "non_linear" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "locf": { + "meta": "ts_model", + "presets": ["fast_train", "ts"], + "tags": [ + "non_linear", + "simple", + "interpretable", + "non_lagged" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "ts_naive_average": { + "meta": "ts_model", + "presets": ["fast_train", "ts"], + "tags": [ + "non_linear", + "simple", + "interpretable", + "non_lagged" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "svc": { + "meta": "custom_class", + "tags": [ + "no_prob", + "expensive", + "non_linear" + ] + }, + "treg": { + "meta": "sklearn_regr", + "presets": ["*tree"], + "tags": [ + "tree", + "non_linear", + "cont" + ] + }, + "xgboost": { + "meta": "sklearn_class", + "presets": ["*tree"], + "tags": [ + "boosting", "tree", "non-default", "non_linear", "mix" + ] + }, + "xgbreg": { + "meta": "sklearn_regr", + "presets": ["*tree"], + "tags": [ + "boosting", "tree", "non_multi", "non-default", "non_linear", "cont" + ] + }, + "cnn": { + "meta": "custom_class", + "tags": [ + "deep", "non-default", "non_linear" + ], + "input_type": "[DataTypesEnum.image]", + "output_type": "[DataTypesEnum.table]" + }, + "custom": { + "meta": "custom_model", + "tags": [ + "custom_model", + "non-default" + ] + } + } +} diff --git a/bamt/utils/serialization_utils.py b/bamt/utils/serialization_utils.py new file mode 100644 index 0000000..1e4ff74 --- /dev/null +++ b/bamt/utils/serialization_utils.py @@ -0,0 +1,150 @@ +import os +import pickle +from typing import Union, Tuple + +import joblib + +import bamt.utils.check_utils as check_utils +from bamt.log import logger_nodes + + 
+class ModelsSerializer: + def __init__(self, bn_name, models_dir): + self.bn_name = bn_name + self.models_dir = models_dir + self.serialization = None + + if os.path.isdir(models_dir): + if bn_name in os.listdir(models_dir): + raise AssertionError(f"Name must be unique. | {os.listdir(models_dir)}") + + @staticmethod + def choose_serialization(model) -> Tuple[str, Union[Exception, int]]: + try: + ex_b = pickle.dumps(model, protocol=4) + model_ser = ex_b.decode("latin1").replace("'", '"') + + if type(model).__name__ == "CatBoostRegressor": + a = model_ser.encode("latin1") + else: + a = model_ser.replace('"', "'").encode("latin1") + + classifier_body = pickle.loads(a) + return "pickle", 1 + except Exception: + return "joblib", -1 + + def get_path_joblib(self, models_dir, node_name: str) -> str: + """ + Args: + node_name: name of node + specific: more specific unique name for node. + For example, combination. + + Return: + Path to node. + """ + path = os.path.join(models_dir, self.bn_name, f"{node_name.replace(' ', '_')}") + + if not os.path.isdir(path): + os.makedirs( + os.path.join(models_dir, self.bn_name, f"{node_name.replace(' ', '_')}") + ) + return path + + def serialize_instance(self, instance: dict, model_type, node_name, specific=False): + """Every distribution contains a dict with params with models""" + model_ser = None + + model = instance[f"{model_type}_obj"] + if not check_utils.is_model(model): + return instance + + serialization, status = self.choose_serialization(model) + if status == -1: + logger_nodes.warning( + f"{node_name}:{'' if not specific else specific}::Pickle failed. BAMT will use Joblib." + ) + if serialization == "pickle": + ex_b = pickle.dumps(model, protocol=4) + model_ser = ex_b.decode("latin1") + elif serialization == "joblib": + path = self.get_path_joblib( + models_dir=self.models_dir, node_name=node_name.replace(" ", "_") + ) + if not specific: + destination = f"{node_name.replace(' ', '_')}.joblib.compressed" + else: + destination = f"{specific}.joblib.compressed" + path = os.path.abspath(os.path.join(path, destination)) + joblib.dump(model, path, compress=True, protocol=4) + else: + logger_nodes.error("Serialization detection failed.") + return + instance["serialization"] = serialization + instance[f"{model_type}_obj"] = model_ser or path + return instance + + def serialize(self, distributions): + result = {} + for node_name, [node_type, dist] in distributions.items(): + result[node_name] = {} + model_type = "regressor" if "Gaussian" in node_type else "classifier" + if "Conditional" in node_type: + result[node_name]["hybcprob"] = {} + for combination, dist_nested in dist["hybcprob"].items(): + instance_serialized = self.serialize_instance( + instance=dist_nested, + model_type=model_type, + node_name=node_name, + specific=combination, + ) + result[node_name]["hybcprob"][combination] = instance_serialized + else: + instance_serialized = self.serialize_instance( + instance=dist, model_type=model_type, node_name=node_name + ) + result[node_name] = instance_serialized + return result + + +class Deserializer: + def __init__(self, models_dir): + self.models_dir = models_dir + + @staticmethod + def deserialize_instance(instance: dict, model_type): + model_repr = instance[f"{model_type}_obj"] + if model_repr is None: + return instance + + serialization = instance["serialization"] + + if serialization == "pickle": + bytes_model = model_repr.encode("latin1") + model = pickle.loads(bytes_model) + else: + model = joblib.load(model_repr) + + 
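+        # Editorial note: this mirrors ModelsSerializer above. A sketch of the two
+        # paths, assuming `payload` stands for the stored representation:
+        #     pickle : model = pickle.loads(payload.encode("latin1"))   # latin1 str -> bytes -> object
+        #     joblib : model = joblib.load(payload)                     # payload is a file path
+        # The restored object is then written back into the distribution dict below.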
instance[f"{model_type}_obj"] = model + return instance + + def apply(self, distributions): + result = {} + for node_name, [node_type, dist] in distributions.items(): + model_type = "regressor" if "Gaussian" in node_type else "classifier" + result[node_name] = {} + if "Conditional" in node_type: + result[node_name]["hybcprob"] = {} + for combination, dist_nested in dist["hybcprob"].items(): + instance_deserialized = self.deserialize_instance( + instance=dist_nested, + model_type=model_type, + ) + result[node_name]["hybcprob"][combination] = instance_deserialized + else: + instance_deserialized = self.deserialize_instance( + instance=dist, model_type=model_type + ) + result[node_name] = instance_deserialized + return result diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index 0363f66..0000000 --- a/mkdocs.yml +++ /dev/null @@ -1,77 +0,0 @@ -site_name: BAMT -repo_name: aimclub/BAMT -repo_url: https://github.com/aimclub/BAMT -theme: - name: material - locale: en - features: - - announce.dismiss - - content.action.edit - - content.action.view - - content.code.annotate - - content.code.copy - # - content.code.select - # - content.footnote.tooltips - # - content.tabs.link - - content.tooltips - # - header.autohide - # - navigation.expand - - navigation.footer - - navigation.indexes - - navigation.instant - # - navigation.instant.prefetch - - navigation.instant.progress - # - navigation.prune - - navigation.sections - - navigation.tabs - # - navigation.tabs.sticky - - navigation.top - - navigation.tracking - - search.highlight - - search.share - - search.suggest - - toc.follow - # - toc.integrate - palette: - - media: "(prefers-color-scheme)" - toggle: - icon: material/link - name: Switch to light mode - - media: "(prefers-color-scheme: light)" - scheme: default - primary: indigo - accent: indigo - toggle: - icon: material/toggle-switch - name: Switch to dark mode - - media: "(prefers-color-scheme: dark)" - scheme: slate - primary: black - accent: indigo - toggle: - icon: material/toggle-switch-off - name: Switch to system preference - font: - text: Roboto - code: Roboto Mono - -nav: - - Home: index.md - - Getting Started: getting-started.md - - Installation: installation.md - - API Reference: - - logger: reference/logger.md -#extra: -# social: -# - icon: fontawesome/brands/github -# link: https://github.com/myusername/myproject -plugins: - - search - - mkdocstrings -markdown_extensions: - - admonition - - codehilite - - footnotes - - meta - - toc: - permalink: True diff --git a/pyproject.toml b/pyproject.toml index 8000b61..fab78d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "BAMT" -version = "2.0.0" +version = "1.2.01" description = "data modeling and analysis tool based on Bayesian networks" authors = ["Roman Netrogolov ", "Irina Deeva ", @@ -22,19 +22,20 @@ packages = [ ] [tool.poetry.dependencies] -python = "^3.10" -numpy = "^1.26.0" -matplotlib = "^3.8.0" -pandas = "^2.2.0" +python = ">=3.9,<3.11" +setuptools = "65.6.3" +numpy = ">=1.24.2" +matplotlib = "3.6.2" +pandas = "2.0.3" gmr = "1.6.2" -scikit-learn = "^1.4.2" -scipy = "^1.13.0" -pyvis = "^0.3.1" +scikit-learn = "^1.2.0" +scipy = "^1.8.0" +pyvis = ">=0.2.1" missingno = "^0.5.1" -pgmpy = "^0.1.20" -thegolem = "^0.3.3" +pgmpy = "0.1.20" +thegolem = ">=0.3.3" xgboost = ">=1.7.6" -catboost = ">=2.0.0" +catboost = ">=1.0.6" lightgbm = {version = ">=3.3.5", optional = true } [tool.poetry.extras] diff --git a/requirements.txt b/requirements.txt index 374a8a4..dad16dc 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -1,21 +1,15 @@ -# Data Manipulation -numpy>=1.26.0 -pandas>=2.2.0 - -# Graph manipulation -networkx>=3.3 -thegolem>=0.4.0 - -# ML modeling frameworks -scikit-learn>=1.4.2 -scipy>=1.13.0 -catboost>=1.2.1 -xgboost>=2.0.0 - -# visualization -matplotlib>=3.8.0 -pyvis>=0.3.1 - -# TODO: exclude these libraries +numpy==1.26.4 +matplotlib==3.6.2 +pandas>=2.0.3 gmr==1.6.2 -pgmpy==0.1.20 \ No newline at end of file +scikit-learn>=1.2.0 +scipy>=1.9.3 +pyvis>=0.2.1 +pgmpy==0.1.25 +catboost>=1.0.6 +joblib>=1.1.1 +networkx>=3.1 +tqdm>=4.65.0 +thegolem>=0.3.3 +typing>=3.7.4.3 +xgboost>=1.7.6 diff --git a/tests/BigbraveBNTest.py b/tests/BigbraveBNTest.py new file mode 100644 index 0000000..d21d215 --- /dev/null +++ b/tests/BigbraveBNTest.py @@ -0,0 +1,59 @@ +import pandas as pd +from pgmpy.estimators import K2Score +from sklearn import preprocessing + +import bamt.preprocessors as pp +from bamt.networks.big_brave_bn import BigBraveBN +from bamt.networks.continuous_bn import ContinuousBN +from bamt.networks.discrete_bn import DiscreteBN + +data_discrete = pd.read_csv(r"../Data/benchmark/pigs.csv") +data_continuous = pd.read_csv(r"../Data/benchmark/arth150.csv") + +encoder = preprocessing.LabelEncoder() +discretizer = preprocessing.KBinsDiscretizer( + n_bins=5, encode="ordinal", strategy="uniform" +) + +p = pp.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) +discretized_data, est = p.apply(data_discrete) + +info = p.info + +space_restrictor = BigBraveBN() + +space_restrictor.set_possible_edges_by_brave(df=data_discrete) + +ps = space_restrictor.possible_edges + +bn_discrete = DiscreteBN() + +bn_discrete.add_nodes(descriptor=info) + +params = {"white_list": ps} +bn_discrete.add_edges(discretized_data, scoring_function=("K2", K2Score), params=params) + +encoder = preprocessing.LabelEncoder() +discretizer = preprocessing.KBinsDiscretizer( + n_bins=5, encode="ordinal", strategy="uniform" +) + +p = pp.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) +discretized_data, est = p.apply(data_continuous) + +info = p.info + +space_restrictor = BigBraveBN() + +space_restrictor.set_possible_edges_by_brave(df=data_continuous) + +ps = space_restrictor.possible_edges + +bn_continuous = ContinuousBN() + +bn_continuous.add_nodes(descriptor=info) + +params = {"white_list": ps} +bn_continuous.add_edges( + discretized_data, scoring_function=("K2", K2Score), params=params +) diff --git a/tests/LoadBN.py b/tests/LoadBN.py new file mode 100644 index 0000000..6335bb2 --- /dev/null +++ b/tests/LoadBN.py @@ -0,0 +1,54 @@ +import json + +import pandas as pd +from sklearn import preprocessing as pp + +import bamt.networks as Networks +from bamt.preprocessors import Preprocessor + +hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[ + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +] + +encoder = pp.LabelEncoder() +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform") + +p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + +discretized_data, est = p.apply(hack_data) + +bn = Networks.HybridBN(use_mixture=True, has_logit=True) +info = p.info + +bn.add_nodes(info) + +structure = [ + ("Tectonic regime", "Structural setting"), + ("Gross", "Netpay"), + ("Lithology", "Permeability"), +] + +bn.set_structure(edges=structure) + +bn.get_info(as_df=False) + +with open("hack_p.json") as params: + params = 
json.load(params) + bn.set_parameters(params) + +bn.plot("gg3.html") + +bn2 = Networks.HybridBN(use_mixture=True, has_logit=True) +bn2.load("hack.json") + +print(bn2.sample(50)) diff --git a/tests/MainTest.py b/tests/MainTest.py new file mode 100644 index 0000000..4796fe6 --- /dev/null +++ b/tests/MainTest.py @@ -0,0 +1,137 @@ +import pandas as pd +from pgmpy.estimators import K2Score +from sklearn import preprocessing as pp + +import bamt.networks as Networks +from bamt.preprocessors import Preprocessor + +""" +Optional: +You can also uncomment print() that you need. +""" + +hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv") +cont_data = hack_data[["Gross", "Netpay", "Porosity", "Permeability", "Depth"]].dropna() +disc_data = hack_data[ + ["Tectonic regime", "Period", "Lithology", "Structural setting"] +].dropna() +hybrid_data = hack_data[ + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +].dropna() + +cont_test_data = cont_data[cont_data.columns[:-1]] +cont_target = cont_data[cont_data.columns[-1]] +disc_test_data = disc_data[disc_data.columns[:-1]] +disc_target = disc_data[disc_data.columns[-1]] +hybrid_test_data = hybrid_data[hybrid_data.columns[:-1]] +hybrid_target = hybrid_data[hybrid_data.columns[-1]] + +encoder = pp.LabelEncoder() +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform") +p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + +# Discrete pipeline +discretized_data, _ = p.apply(disc_data) +disc_bn = Networks.DiscreteBN() +info = p.info +disc_bn.add_nodes(info) +disc_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) +disc_bn.fit_parameters(data=disc_data) +disc_bn.calculate_weights(discretized_data) +disc_predicted_values = disc_bn.predict(test=disc_test_data) +disc_predicted_values = pd.DataFrame.from_dict(disc_predicted_values, orient="columns") +synth_disc_data = disc_bn.sample(50) + +disc_bn.save("./disc_bn.json") +disc_bn2 = Networks.DiscreteBN() +disc_bn2.load("./disc_bn.json") +synth_disc_data2 = disc_bn2.sample(50) +# print(disc_bn.weights) +# print(disc_bn2.weights) +# print(disc_bn.get_info()) +# print(disc_bn2.get_info()) +# print(synth_disc_data) +# print(synth_disc_data2) + +# Continuous pipeline +discretized_data, _ = p.apply(cont_data) +cont_bn = Networks.ContinuousBN(use_mixture=True) +info = p.info +cont_bn.add_nodes(info) +cont_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) +cont_bn.fit_parameters(data=cont_data) +cont_bn.calculate_weights(discretized_data) +cont_predicted_values = cont_bn.predict(test=cont_test_data) +cont_predicted_values = pd.DataFrame.from_dict(cont_predicted_values, orient="columns") +synth_cont_data = cont_bn.sample(50) + +cont_bn.save("./cont_bn.json") +cont_bn2 = Networks.ContinuousBN(use_mixture=True) +cont_bn2.load("./cont_bn.json") +synth_cont_data2 = cont_bn2.sample(50) +# print(cont_bn.weights) +# print(cont_bn2.weights) +# print('RMSE on predicted values with continuous data: ' + +# f'{mse(cont_target, cont_predicted_values, squared=False)}') +# print(cont_bn.get_info()) +# print(cont_bn2.get_info()) +# print(synth_cont_data) +# print(synth_cont_data2) + +# Hybrid pipeline +discretized_data, _ = p.apply(hybrid_data) +hybrid_bn = Networks.HybridBN(use_mixture=True) +hybrid_bn2 = Networks.HybridBN(use_mixture=True, has_logit=True) +info = p.info +hybrid_bn.add_nodes(info) +hybrid_bn2.add_nodes(info) 
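+# Learn structure and parameters for both hybrid networks, then weight, sample,
+# predict and save/load them; the commented prints below can be used to inspect
+# the results.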
+hybrid_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) +hybrid_bn2.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) +hybrid_bn.fit_parameters(data=hybrid_data) +hybrid_bn2.fit_parameters(data=hybrid_data) +hybrid_bn.calculate_weights(discretized_data) +hybrid_bn2.calculate_weights(discretized_data) +hybrid_predicted_values = hybrid_bn.predict(test=hybrid_test_data) +hybrid_predicted_values = pd.DataFrame.from_dict( + hybrid_predicted_values, orient="columns" +) +synth_hybrid_data = hybrid_bn.sample(50) +synth_hybrid_data2 = hybrid_bn2.sample(50) + +hybrid_bn.save("./hybrid_bn.json") +hybrid_bn3 = Networks.HybridBN(use_mixture=True) +hybrid_bn3.load("./hybrid_bn.json") +synth_hybrid_data3 = hybrid_bn3.sample(50) +# print(hybrid_bn.weights) +# print(hybrid_bn2.weights) +# print(hybrid_bn3.weights) +# print('RMSE on predicted values with hybrid data: ' + +# f'{mse(hybrid_target, hybrid_predicted_values, squared=False)}') +# print(hybrid_bn.get_info()) +# print(hybrid_bn2.get_info()) +# print(hybrid_bn3.get_info()) +# print(synth_hybrid_data) +# print(synth_hybrid_data2) +# print(synth_hybrid_data3) + +# Save and load BN without weights +discretized_data, _ = p.apply(hybrid_data) +hybrid_bn = Networks.HybridBN(use_mixture=True) +info = p.info +hybrid_bn.add_nodes(info) +hybrid_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) +hybrid_bn.fit_parameters(data=hybrid_data) +hybrid_bn.save("./hybrid_bn_without_weights.json") +hybrid_bn2 = Networks.HybridBN(use_mixture=True) +hybrid_bn2.load("./hybrid_bn_without_weights.json") +# print(hybrid_bn2.weights) diff --git a/tests/MetricsTest.py b/tests/MetricsTest.py new file mode 100644 index 0000000..5f04deb --- /dev/null +++ b/tests/MetricsTest.py @@ -0,0 +1,61 @@ +import time + +import pandas as pd +from sklearn import preprocessing as pp + +import bamt.networks as Networks +from bamt.preprocessors import Preprocessor + +start = time.time() + + +p1 = time.time() +print(f"Time elapsed for importing: {p1 - start}") + +h = pd.read_csv("data/real data/hack_processed_with_rf.csv") +cols = [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", +] +h = h[cols] + +print(h.describe()) +print("-----") +p2 = time.time() +print(f"Time elapsed for preparing data: {p2 - p1}") + +encoder = pp.LabelEncoder() +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile") + +p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + +# ----------- +discrete_data, est = p.apply(h) +info = p.info + +bn = Networks.HybridBN(has_logit=True) # all may vary +bn.add_nodes(descriptor=info) +bn.add_edges(data=discrete_data, optimizer="HC", scoring_function=("MI",)) + +bn.get_info(as_df=False) +t1 = time.time() +bn.fit_parameters(data=h) +t2 = time.time() +print(f"PL elapsed: {t2 - t1}") + +columns = ["Lithology", "Structural setting", "Porosity", "Depth"] +validY = h[columns].dropna() +validX = h.drop(columns, axis=1).dropna() + +time_1 = time.time() +pred_param = bn.predict(validX, parall_count=3) +time_2 = time.time() +print(pred_param) +print(f"Predict elapsed: {time_2 - time_1}") diff --git a/tests/NetworksTest.py b/tests/NetworksTest.py new file mode 100644 index 0000000..607bdf9 --- /dev/null +++ b/tests/NetworksTest.py @@ -0,0 +1,722 @@ +import itertools +import json +import time + +import numpy as np +import pandas as pd +from pandas.testing import assert_frame_equal +from sklearn import 
preprocessing as pp +from sklearn.ensemble import RandomForestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.tree import DecisionTreeClassifier + +import bamt.networks as Networks +import bamt.nodes as Nodes +from bamt.preprocessors import Preprocessor + + +# import abc + + +class NetworkTest(object): + def __init__( + self, + directory: str, + verbose: bool = False, + case_id: int = 0, + sample_n: int = 500, + sample_tol: float = 0.6, + ): + """ + sample_n: number of rows in sample + sample_tol: precent of acceptable number of nans. + If number of nan more than sample_int * sample_tol, then sample test failed. + """ + self.bn = Networks.BaseNetwork() + self.sf = "" + self.type = "abstract" + self.case_id = case_id + self.sample_n = sample_n + self.sample_tol = sample_tol + self.verboseprint = print if verbose else lambda *a: None + self.directory = directory + + @staticmethod + def _tabularize_output(message1: str, message2: str): + # message1 - usually class of output (error, success) + # message2 - usually description of message 1 + return f"{message1: <52} | {message2: >3}" + + def test_preprocess(self): + failed = False + + if self.case_id == 0: + self.discrete_cols = [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + ] + self.cont_cols = ["Gross", "Netpay", "Porosity", "Permeability", "Depth"] + self.hybrid_cols = [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] + # Base of standards + self.base = "hack_" + self.type + else: + self.discrete_cols = [] + self.cont_cols = [] + self.hybrid_cols = [] + + if self.type == "discrete": + data = pd.read_csv(self.directory)[self.discrete_cols] + elif self.type == "continuous": + data = pd.read_csv(self.directory)[self.cont_cols] + else: + data = pd.read_csv(self.directory)[self.hybrid_cols] + + encoder = pp.LabelEncoder() + discretizer = pp.KBinsDiscretizer( + n_bins=5, encode="ordinal", strategy="uniform" + ) + + p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + + discretized_data, est = p.apply(data) + info = p.info + + try: + assert info == json.load(open(f"{self.base}/hack_info.json")) + except AssertionError: + failed = True + self.verboseprint(self._tabularize_output("ERROR", "Bad descriptor")) + + try: + assert_frame_equal( + discretized_data, + pd.read_csv(f"{self.base}/hack_data.csv", index_col=0), + check_dtype=False, + ) + except Exception as ex: + failed = True + self.verboseprint(self._tabularize_output("ERROR", str(ex))) + + if not failed: + status = "OK" + else: + status = "Failed" + print(self._tabularize_output("Preprocess", status)) + + self.info = info + self.data = discretized_data + + def test_structure_learning(self): + pass + + def test_parameters_learning(self): + pass + + def test_sampling(self): + failed = False + + sample = self.bn.sample(n=self.sample_n, progress_bar=False) + + if sample.empty: + self.verboseprint("Sampling", "Dataframe is empty") + return + + if sample.isna().sum().sum() > (self.sample_n * self.sample_tol): + failed = True + + if not failed: + status = "OK" + else: + status = "Failed" + + print(self._tabularize_output(f"Sampling ({self.sf})", status)) + + def test_predict(self): + failed = False + + if self.type == "discrete": + cols = self.discrete_cols + elif self.type == "continuous": + cols = self.cont_cols + elif self.type == "hybrid": + cols = self.hybrid_cols + else: + raise Exception("Inner error") + + preds = 
self.bn.predict( + test=pd.read_csv(self.directory)[cols[:2]].dropna(), + progress_bar=False, + parall_count=2, + ) + + # with open(f"{self.base}/hack_predict.json", "r") as f: + # p = json.load(f) + + if self.type == "continuous": + # cols: ['Porosity', 'Permeability', 'Depth'] + for node in preds.keys(): + right_val = json.load(open(f"{self.base}/hack_predict.json"))[self.sf][ + node + ] + test_val = np.mean([mx for mx in preds[node] if not np.isnan(mx)]) + assert np.all( + np.isclose(test_val, right_val, rtol=0.4) + ), f"Predict failed: {node, right_val, test_val}" + elif self.type == "discrete": + # cols: ['Lithology', 'Structural setting'] + for node in preds.keys(): + test_vals = pd.Series(preds[node]).value_counts().to_dict() + for category, right_val in json.load( + open(f"{self.base}/hack_predict.json") + )[self.sf][node].items(): + try: + assert np.all( + np.isclose(test_vals[category], right_val, atol=5) + ), f"Predict failed: {node, test_vals[category], right_val}" + except KeyError as ex: + print("Unknown preds category: ", ex.args[0]) + continue + elif self.type == "hybrid": + cont_nodes = [ + node + for node in self.bn.nodes_names + if self.info["types"][node] == "cont" + ] + for node in preds.keys(): + if node in cont_nodes: + right_val = json.load(open(f"{self.base}/hack_predict.json"))[ + self.sf + ][node] + test_val = np.mean([mx for mx in preds[node] if not np.isnan(mx)]) + # p[self.sf][node] = test_val + s = [right_val, test_val] + assert np.all( + np.isclose(min(s), max(s), atol=5, rtol=0.6) + ), f"Predict failed: {node, test_val, right_val}" + else: + test_vals = pd.Series(preds[node]).value_counts().to_dict() + # p[self.sf][node] = test_vals + for category, right_val in json.load( + open(f"{self.base}/hack_predict.json") + )[self.sf][node].items(): + try: + assert np.all( + np.isclose( + min(test_vals[category], right_val), + max(right_val, test_vals[category]), + atol=100, + rtol=0.5, + ) + ), f"Predict failed: {node, test_vals[category], right_val}" + except KeyError as ex: + print("Unknown preds category: ", ex.args[0]) + continue + + if not failed: + status = "OK" + else: + status = "Failed" + + print(self._tabularize_output(f"Predict ({self.sf})", status)) + + # with open(f"{self.base}/hack_predict.json", "w") as f: + # json.dump(p, f) + + def apply(self): + pass + + @staticmethod + def use_rules(*args, **kwargs): + for rule in args: + rule(**kwargs) + + +class TestDiscreteBN(NetworkTest): + def __init__(self, **kwargs): + super(TestDiscreteBN, self).__init__(**kwargs) + self.type = "discrete" + + def test_structure_learning(self): + failed = False + + bn = Networks.DiscreteBN() + bn.add_nodes(descriptor=self.info) + + try: + assert bn.nodes_names == [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + ] + except AssertionError: + failed = True + self.verboseprint( + self._tabularize_output( + "ERROR", "first stage failed (wrong init nodes)." 
+ ) + ) + + bn.add_edges(self.data, (self.sf,), progress_bar=False) + + try: + assert bn.edges == json.load(open(f"{self.base}/hack_edges.json"))[self.sf] + except AssertionError: + failed = True + self.verboseprint(f"Stage 2 failed with {self.sf}.") + + if not failed: + self.bn = bn + status = "OK" + else: + self.bn = None + status = "Failed" + + print(self._tabularize_output(f"Structure ({self.sf})", status)) + + def test_parameters_learning(self): + failed = False + + self.bn.fit_parameters(pd.read_csv(self.directory)[self.discrete_cols]) + + try: + assert ( + self.bn.distributions + == json.load(open(f"{self.base}/hack_params.json"))[self.sf] + ) + except AssertionError: + failed = True + self.verboseprint( + self._tabularize_output(f"Parameters ({self.sf})", "bad distributions") + ) + + if not failed: + status = "OK" + else: + status = "Failed" + + print(self._tabularize_output(f"Parameters ({self.sf})", status)) + + def apply(self): + print(f"Executing {self.type} BN tests.") + self.test_preprocess() + t0 = time.time() + for sf in ["MI", "K2", "BIC"]: + self.sf = sf + t1 = time.time() + self.test_structure_learning() + if not self.bn: + print(self._tabularize_output(f"Error on {sf}", "No structure")) + print("-" * 8) + continue + self.test_parameters_learning() + self.test_sampling() + self.test_predict() + t2 = time.time() + print("-" * 8, f"Elapsed time: {t2 - t1}", "-" * 8, "\n") + t3 = time.time() + print(f"\nElapsed time: {t3 - t0}") + + +class TestContinuousBN(NetworkTest): + def __init__(self, **kwargs): + super(TestContinuousBN, self).__init__(**kwargs) + self.type = "continuous" + + def test_setters(self): + failed = False + + bn = Networks.ContinuousBN() + ns = [] + for d in [Nodes.GaussianNode(name="Node" + str(id)) for id in range(0, 4)]: + ns.append(d) + + bn.set_structure(nodes=ns) + bn.set_classifiers( + classifiers={ + "Node0": DecisionTreeClassifier(), + "Node1": RandomForestClassifier(), + "Node2": KNeighborsClassifier(n_neighbors=2), + } + ) + + assert [str(bn[node].classifier) for node in ["Node0", "Node1", "Node2"]] == [ + "DecisionTreeClassifier()", + "RandomForestClassifier()", + "KNeighborsClassifier(n_neighbors=2)", + ], "Setter | Classifiers are wrong." + + if not failed: + status = "OK" + else: + status = "Failed" + + print(self._tabularize_output("Setters", status)) + + def test_structure_learning(self, use_mixture: bool = False): + self.use_mixture = use_mixture + failed = False + + bn = Networks.ContinuousBN(use_mixture=use_mixture) + bn.add_nodes(descriptor=self.info) + + try: + assert ( + bn.nodes_names + == json.load(open(f"{self.base}/hack_nodes.json"))[ + f"use_mixture={use_mixture}" + ][self.sf] + ) + except AssertionError: + failed = True + self.verboseprint( + self._tabularize_output( + "ERROR", "first stage failed (wrong init nodes)." 
+                )
+            )
+
+        bn.add_edges(self.data, (self.sf,), progress_bar=False)
+
+        try:
+            assert (
+                bn.edges
+                == json.load(open(f"{self.base}/hack_edges.json"))[
+                    f"use_mixture={use_mixture}"
+                ][self.sf]
+            )
+        except AssertionError:
+            failed = True
+            self.verboseprint(f"Stage 2 failed with {self.sf}.")
+
+        if not failed:
+            self.bn = bn
+            status = "OK"
+        else:
+            status = "Failed"
+
+        print(
+            self._tabularize_output(
+                f"Structure ({self.sf}, use_mixture={self.use_mixture})", status
+            )
+        )
+
+    def test_parameters_learning(self):
+        failed = False
+
+        self.bn.fit_parameters(pd.read_csv(self.directory)[self.cont_cols])
+        try:
+            if self.use_mixture:
+                empty_data = {"mean": [], "covars": [], "coef": []}
+                for k, v in self.bn.distributions.items():
+                    assert all(
+                        [v[obj] != empty for obj, empty in empty_data.items()]
+                    ), f"Empty data in {k}."
+                    assert (
+                        0.9 <= sum(v["coef"]) <= 1.1
+                    ), f"{sum(v['coef'])} || {k}'s: coefs are wrong."
+            else:
+                assert (
+                    self.bn.distributions
+                    == json.load(open(f"{self.base}/hack_params.json"))[
+                        "use_mixture=False"
+                    ][self.sf]
+                ), "Bad distributions."
+        except AssertionError as ex:
+            failed = True
+            self.verboseprint(
+                self._tabularize_output(
+                    f"Parameters ({self.sf}, use_mixture={self.use_mixture})",
+                    ex.args[0],
+                )
+            )
+
+        if not failed:
+            status = "OK"
+        else:
+            status = "Failed"
+
+        print(
+            self._tabularize_output(
+                f"Parameters ({self.sf}, use_mixture={self.use_mixture})", status
+            )
+        )
+
+    def apply(self):
+        print(f"Executing {self.type} BN tests.")
+        self.test_preprocess()
+        # Comment me out
+        # self._test_setters()
+        t0 = time.time()
+        for use_mixture in [True, False]:
+            for sf in ["MI", "K2", "BIC"]:
+                self.sf = sf
+                t1 = time.time()
+                self.test_structure_learning(use_mixture=use_mixture)
+                if not self.bn:
+                    print(self._tabularize_output(f"Error on {sf}", "No structure"))
+                    print("-" * 8)
+                    continue
+                self.test_parameters_learning()
+                self.test_sampling()
+                self.test_predict()
+                t2 = time.time()
+                print("-" * 8, f"Elapsed time: {t2 - t1}", "-" * 8)
+        t3 = time.time()
+        print(f"\nElapsed time: {t3 - t0}")
+
+
+class TestHybridBN(NetworkTest):
+    def __init__(self, **kwargs):
+        super(TestHybridBN, self).__init__(**kwargs)
+        self.type = "hybrid"
+
+    def test_setters(self):
+        failed = False
+
+        bn = Networks.HybridBN(has_logit=True)
+        ns = []
+        for d, g in zip(
+            [Nodes.GaussianNode(name="Node" + str(id)) for id in range(0, 3)],
+            [Nodes.DiscreteNode(name="Node" + str(id)) for id in range(3, 6)],
+        ):
+            ns.append(d)
+            ns.append(g)
+        edges = [
+            ("Node0", "Node3"),
+            ("Node3", "Node1"),
+            ("Node1", "Node4"),
+            ("Node4", "Node2"),
+            ("Node2", "Node5"),
+        ]
+        test_info = {
+            "types": {
+                "Node0": "cont",
+                "Node1": "cont",
+                "Node2": "cont",
+                "Node3": "disc",
+                "Node4": "disc",
+                "Node5": "disc",
+            },
+            "signs": {"Node0": "pos", "Node1": "pos", "Node2": "pos"},
+        }
+
+        # Structure setter
+        bn.set_structure(info=test_info, nodes=ns, edges=edges)
+
+        assert [
+            "Gaussian (LinearRegression)",
+            "Logit (LogisticRegression)",
+            "ConditionalGaussian (LinearRegression)",
+            "Logit (LogisticRegression)",
+            "ConditionalGaussian (LinearRegression)",
+            "Logit (LogisticRegression)",
+        ] == [node.type for node in bn.nodes], "Setter | Nodes are not the same."
+        assert edges == bn.edges, "Setter | Edges are not the same."
+ + # Classifiers setters + + bn.set_classifiers( + classifiers={ + "Node3": DecisionTreeClassifier(), + "Node4": RandomForestClassifier(), + "Node5": KNeighborsClassifier(n_neighbors=2), + } + ) + + assert [str(bn[node].classifier) for node in ["Node3", "Node4", "Node5"]] == [ + "DecisionTreeClassifier()", + "RandomForestClassifier()", + "KNeighborsClassifier(n_neighbors=2)", + ], "Setter | Classifiers are wrong." + + # Parameters setters + + with open("test_params.json") as f: + p = json.load(f) + + bn.set_parameters(p) + assert bn.distributions == p, "Setter | Parameters are not the same." + + if not failed: + status = "OK" + else: + status = "Failed" + + print(self._tabularize_output("Setters", status)) + + def test_structure_learning( + self, use_mixture: bool = False, has_logit: bool = False + ): + self.use_mixture = use_mixture + self.has_logit = has_logit + failed = False + + bn = Networks.HybridBN(use_mixture=use_mixture, has_logit=has_logit) + bn.add_nodes(descriptor=self.info) + + try: + assert ( + bn.nodes_names + == json.load(open(f"{self.base}/hack_nodes.json"))[ + f"use_mixture={use_mixture}" + ][f"has_logit={has_logit}"][self.sf] + ) + except AssertionError: + failed = True + self.verboseprint( + self._tabularize_output( + "ERROR", "first stage failed (wrong init nodes)." + ) + ) + + bn.add_edges(self.data, (self.sf,), progress_bar=False) + + try: + assert ( + bn.edges + == json.load(open(f"{self.base}/hack_edges.json"))[ + f"use_mixture={use_mixture}" + ][f"has_logit={has_logit}"][self.sf] + ) + except AssertionError: + failed = True + self.verboseprint(f"Stage 2 failed with {self.sf}.") + + if not failed: + self.bn = bn + status = "OK" + else: + status = "Failed" + + print( + self._tabularize_output( + f"Structure ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", + status, + ) + ) + + @staticmethod + def non_empty_gaussian_nodes(name, node_params): + empty_data = {"mean": [], "covars": [], "coef": []} + assert all( + [node_params[obj] != empty for obj, empty in empty_data.items()] + ), f"Empty data in {name}." + + @staticmethod + def non_empty_logit_nodes(name, node_params): + empty_data = {"classes": [], "classifier_obj": None} + assert all( + [node_params[obj] != empty for obj, empty in empty_data.items()] + ), f"Empty data in {name}." + + @staticmethod + def sum_equals_to_1(name, node_params): + assert 0.9 <= sum(node_params["coef"]) <= 1.1, f"{name}'s: coefs are wrong." 
+ + def _validate_node(self, name, type, node_params, true_vals): + try: + if type == "MixtureGaussian": + self.use_rules( + self.non_empty_gaussian_nodes, + self.sum_equals_to_1, + name=name, + node_params=node_params, + ) + elif type == "ConditionalMixtureGaussian": + for comb, data in node_params["hybcprob"].items(): + self.use_rules( + self.non_empty_gaussian_nodes, + self.sum_equals_to_1, + name=name, + node_params=data, + ) + elif type.startswith("Logit"): + self.use_rules( + self.non_empty_logit_nodes, name=name, node_params=node_params + ) + elif type.startswith("ConditionalLogit"): + for comb, data in node_params["hybcprob"].items(): + self.use_rules( + self.non_empty_logit_nodes, name=name, node_params=data + ) + else: + assert node_params == true_vals, f"Parameters error on {name}, {type}" + except AssertionError as ex: + self.verboseprint( + self._tabularize_output( + f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", + ex.args[0], + ) + ) + + def test_parameters_learning(self): + failed = False + + self.bn.fit_parameters(pd.read_csv(self.directory)[self.hybrid_cols]) + try: + true_params = json.load(open(f"{self.base}/hack_params.json"))[ + f"use_mixture={self.use_mixture}" + ][f"has_logit={self.has_logit}"][self.sf] + + node_type_dict = {node.name: node.type for node in self.bn.nodes} + for name, type in node_type_dict.items(): + node_params = self.bn.distributions[name] + self._validate_node(name, type, node_params, true_params[name]) + except AssertionError as ex: + failed = True + self.verboseprint( + self._tabularize_output( + f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", + ex.args[0], + ) + ) + + if not failed: + status = "OK" + else: + status = "Failed" + + print( + self._tabularize_output( + f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", + status, + ) + ) + + def apply(self): + print(f"Executing {self.type} BN tests.") + self.test_preprocess() + self.test_setters() + t0 = time.time() + + for use_mixture, has_logit in itertools.product([True, False], repeat=2): + for sf in ["MI", "K2", "BIC"]: + self.sf = sf + t1 = time.time() + self.test_structure_learning( + use_mixture=use_mixture, has_logit=has_logit + ) + self.test_parameters_learning() + if not self.bn: + print(self._tabularize_output(f"Error on {sf}", "No structure")) + print("-" * 8) + continue + self.test_sampling() + self.test_predict() + t2 = time.time() + print("-" * 8, f"Elapsed time: {t2 - t1}", "-" * 8) + + t3 = time.time() + print(f"\nElapsed time: {t3 - t0}") diff --git a/tests/SaveBN.py b/tests/SaveBN.py new file mode 100644 index 0000000..0a210ea --- /dev/null +++ b/tests/SaveBN.py @@ -0,0 +1,49 @@ +import pandas as pd +from sklearn import preprocessing as pp + +import bamt.networks as Networks +from bamt.preprocessors import Preprocessor + +# import json + +hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[ + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +] + +encoder = pp.LabelEncoder() +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform") + +p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + +discretized_data, est = p.apply(hack_data) + +bn = Networks.HybridBN(use_mixture=True, has_logit=True) +info = p.info + +bn.add_nodes(info) + +structure = [ + ("Tectonic regime", "Structural setting"), + ("Gross", "Netpay"), + 
("Lithology", "Permeability"), +] + +bn.set_structure(edges=structure) + +bn.get_info(as_df=False) + +bn.fit_parameters(data=hack_data) +print(bn.sample(4)) +bn.save_params("hack_p.json") +# bn.save_structure("hack_s.json") +bn.save("hack.json") diff --git a/tests/main.py b/tests/main.py new file mode 100644 index 0000000..37083e0 --- /dev/null +++ b/tests/main.py @@ -0,0 +1,27 @@ +import logging +import time +import traceback + +from NetworksTest import TestHybridBN, TestContinuousBN, TestDiscreteBN + +# Print only errors +logging.getLogger("preprocessor").setLevel(logging.ERROR) + +if __name__ == "__main__": + t0 = time.time() + dir = r"../data/real data/hack_processed_with_rf.csv" + + tests = [ + TestHybridBN(directory=dir), + TestDiscreteBN(directory=dir), + TestContinuousBN(directory=dir), + ] + + for test in tests: + try: + test.apply() + except Exception as ex: + traceback.print_exc() + continue + + print(f"Total time: {time.time() - t0}") diff --git a/tests/sendingClassifiersLogit.py b/tests/sendingClassifiersLogit.py new file mode 100644 index 0000000..ce312f0 --- /dev/null +++ b/tests/sendingClassifiersLogit.py @@ -0,0 +1,61 @@ +# import json + +import pandas as pd +from sklearn import preprocessing as pp +from sklearn.ensemble import RandomForestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.tree import DecisionTreeClassifier + +import bamt.networks as Nets +import bamt.preprocessors as preprocessors + +hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv")[ + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +] + +encoder = pp.LabelEncoder() +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile") + +p = preprocessors.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + +discretized_data, est = p.apply(hack_data) + +bn = Nets.HybridBN(has_logit=True) +info = p.info + +# with open(r"C:\Users\Roman\Desktop\mymodels\mynet.json") as f: +# net_data = json.load(f) + +# bn.add_nodes(net_data["info"]) +# bn.set_structure(edges=net_data["edges"]) +# bn.set_parameters(net_data["parameters"]) +# +# print(bn.sample(10, models_dir=r"")) + +bn.add_nodes(info) + +bn.add_edges(discretized_data, scoring_function=("BIC",), progress_bar=False) +bn.set_classifiers( + classifiers={ + "Structural setting": DecisionTreeClassifier(), + "Lithology": RandomForestClassifier(), + "Period": KNeighborsClassifier(n_neighbors=2), + } +) + +bn.fit_parameters(hack_data) + +# bn.save("mynet.json") +print(bn.sample(500).shape) + +bn.get_info(as_df=False) diff --git a/tests/sendingRegressors.py b/tests/sendingRegressors.py new file mode 100644 index 0000000..b0c79ae --- /dev/null +++ b/tests/sendingRegressors.py @@ -0,0 +1,62 @@ +# import json + +import pandas as pd +from catboost import CatBoostRegressor +from sklearn import preprocessing as pp +from sklearn.ensemble import RandomForestRegressor + +# from sklearn.linear_model import ElasticNet +from sklearn.tree import DecisionTreeRegressor + +import bamt.preprocessors as preprocessors +from bamt.networks import HybridBN + +hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv")[ + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +].dropna() + +encoder = pp.LabelEncoder() +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile") + +p = 
preprocessors.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + +discretized_data, est = p.apply(hack_data) + +bn = HybridBN(has_logit=True) +info = p.info + +# with open(r"C:\Users\Roman\Desktop\mymodels\mynet.json") as f: +# net_data = json.load(f) + +# bn.add_nodes(net_data["info"]) +# bn.set_structure(edges=net_data["edges"]) +# bn.set_parameters(net_data["parameters"]) + +bn.add_nodes(info) + +bn.add_edges(discretized_data, scoring_function=("BIC",), progress_bar=False) + +bn.set_regressor( + regressors={ + "Depth": CatBoostRegressor(logging_level="Silent", allow_writing_files=False), + "Gross": RandomForestRegressor(), + "Porosity": DecisionTreeRegressor(), + } +) + +bn.fit_parameters(hack_data) + +# bn.save("mynet") + +print(bn.sample(100).shape) +bn.get_info(as_df=False) diff --git a/tests/test_builders.py b/tests/test_builders.py new file mode 100644 index 0000000..530a2b4 --- /dev/null +++ b/tests/test_builders.py @@ -0,0 +1,728 @@ +import itertools +import logging +import unittest + +import pandas as pd + +from bamt.builders.builders_base import StructureBuilder, VerticesDefiner +from bamt.builders.evo_builder import EvoStructureBuilder +from bamt.builders.hc_builder import HillClimbDefiner +from bamt.nodes.discrete_node import DiscreteNode +from bamt.nodes.gaussian_node import GaussianNode +from bamt.utils.MathUtils import precision_recall + +logging.getLogger("builder").setLevel(logging.CRITICAL) + + +class TestStructureBuilder(unittest.TestCase): + def setUp(self): + self.data = pd.DataFrame(columns=["Node0", "Node1", "Node2"]) + self.descriptor = { + "types": {"Node0": "cont", "Node1": "disc", "Node2": "disc_num"}, + "signs": {"Node0": "pos"}, + } + self.SB = StructureBuilder(descriptor=self.descriptor) + + def test_restrict(self): + self.SB.has_logit = True + + self.SB.restrict(data=self.data, init_nodes=None, bl_add=None) + + self.assertEqual(self.SB.black_list, [], msg="Restrict wrong edges.") + + # --------- + self.SB.has_logit = False + + self.SB.restrict(data=self.data, init_nodes=None, bl_add=None) + + self.assertEqual( + self.SB.black_list, + [("Node0", "Node1"), ("Node0", "Node2")], + msg="Restricted edges are allowed.", + ) + + def test_get_family(self): + self.assertIsNone(self.SB.get_family()) + + self.SB.skeleton["V"] = [ + GaussianNode(name="Node0"), + DiscreteNode(name="Node1"), + DiscreteNode(name="Node2"), + ] + self.assertIsNone(self.SB.get_family()) + # Note that the method get_family is not supposed to be used by user (only developer), + # so we don't cover a case with restricted edges here (we did this in + # the previous test). 
+ self.SB.skeleton["E"] = [ + ("Node1", "Node0"), + ("Node2", "Node1"), + ("Node2", "Node0"), + ] + self.SB.get_family() + + # Node: [[cont_parents], [disc_parents], [children]] + data = [ + [[], [], ["Node1", "Node0"]], + [[], ["Node2"], ["Node0"]], + [[], ["Node1", "Node2"], []], + ] + for node_nummer in range(3): + self.assertEqual( + self.SB.skeleton["V"][node_nummer].cont_parents, data[node_nummer][0] + ) + self.assertEqual( + self.SB.skeleton["V"][node_nummer].disc_parents, data[node_nummer][1] + ) + self.assertEqual( + self.SB.skeleton["V"][node_nummer].children, data[node_nummer][2] + ) + + +class TestVerticesDefiner(unittest.TestCase): + def setUp(self): + self.descriptor = { + "types": { + "Node0": "cont", + "Node1": "cont", + "Node2": "cont", + "Node3": "cont", + "Node4": "disc", + "Node5": "disc", + "Node6": "disc_num", + "Node7": "disc_num", + }, + "signs": {"Node0": "pos", "Node1": "neg"}, + } + + self.VD = VerticesDefiner(descriptor=self.descriptor, regressor=None) + + def test_first_level(self): + self.assertEqual( + self.VD.vertices, + [ + GaussianNode(name="Node0"), + GaussianNode(name="Node1"), + GaussianNode(name="Node2"), + GaussianNode(name="Node3"), + DiscreteNode(name="Node4"), + DiscreteNode(name="Node5"), + DiscreteNode(name="Node6"), + DiscreteNode(name="Node7"), + ], + ) + + def test_overwrite_vetrex(self): + self.assertEqual(self.VD.skeleton, {"V": [], "E": []}) + + def reload(): + self.VD.skeleton["V"] = self.VD.vertices + self.VD.skeleton["E"] = [ + ("Node0", "Node7"), + ("Node0", "Node1"), + ("Node0", "Node2"), + ("Node0", "Node5"), + ("Node4", "Node2"), + ("Node4", "Node5"), + ("Node4", "Node6"), + ("Node4", "Node3"), + ] + self.VD.get_family() + + data = { + "True, True": { + "Node0": "MixtureGaussian", + "Node4": "Discrete", + "Node7": "Logit (LogisticRegression)", + "Node1": "MixtureGaussian", + "Node2": "ConditionalMixtureGaussian", + "Node5": "ConditionalLogit (LogisticRegression)", + "Node6": "Discrete", + "Node3": "ConditionalMixtureGaussian", + }, + "True, False": { + "Node0": "MixtureGaussian", + "Node4": "Discrete", + "Node7": "Discrete", + "Node1": "MixtureGaussian", + "Node2": "ConditionalMixtureGaussian", + "Node5": "Discrete", + "Node6": "Discrete", + "Node3": "ConditionalMixtureGaussian", + }, + "False, True": { + "Node0": "Gaussian (LinearRegression)", + "Node4": "Discrete", + "Node7": "Logit (LogisticRegression)", + "Node1": "Gaussian (LinearRegression)", + "Node2": "ConditionalGaussian (LinearRegression)", + "Node5": "ConditionalLogit (LogisticRegression)", + "Node6": "Discrete", + "Node3": "ConditionalGaussian (LinearRegression)", + }, + "False, False": { + "Node0": "Gaussian (LinearRegression)", + "Node4": "Discrete", + "Node7": "Discrete", + "Node1": "Gaussian (LinearRegression)", + "Node2": "ConditionalGaussian (LinearRegression)", + "Node5": "Discrete", + "Node6": "Discrete", + "Node3": "ConditionalGaussian (LinearRegression)", + }, + } + + for use_mixture, has_logit in itertools.product([True, False], repeat=2): + reload() + self.VD.overwrite_vertex( + has_logit=has_logit, + use_mixture=use_mixture, + classifier=None, + regressor=None, + ) + self.assertEqual( + {node.name: node.type for node in self.VD.skeleton["V"]}, + data[f"{use_mixture}, {has_logit}"], + msg=f"failed on use_mixture={use_mixture} and has_logit={has_logit}", + ) + + +class TestHillClimbDefiner(unittest.TestCase): + def setUp(self): + self.descriptor = { + "signs": { + "Depth": "pos", + "Gross": "pos", + "Netpay": "pos", + "Permeability": "pos", + "Porosity": "pos", 
+ }, + "types": { + "Depth": "cont", + "Gross": "cont", + "Lithology": "disc", + "Netpay": "cont", + "Period": "disc", + "Permeability": "cont", + "Porosity": "cont", + "Structural setting": "disc", + "Tectonic regime": "disc", + }, + } + self.data = { + "Tectonic regime": [ + 0, + 1, + 4, + 4, + 0, + 2, + 0, + 0, + 0, + 0, + 3, + 1, + 0, + 3, + 0, + 1, + 4, + 0, + 4, + 3, + 4, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 3, + 2, + 3, + 2, + 3, + 3, + 3, + 0, + ], + "Period": [ + 3, + 1, + 4, + 4, + 1, + 1, + 0, + 0, + 3, + 5, + 3, + 9, + 0, + 5, + 0, + 3, + 5, + 3, + 2, + 4, + 4, + 1, + 5, + 7, + 7, + 7, + 1, + 1, + 1, + 1, + 4, + 6, + 8, + 4, + 4, + 5, + 4, + 7, + 5, + 5, + 0, + ], + "Lithology": [ + 2, + 4, + 6, + 4, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + 4, + 1, + 4, + 1, + 4, + 4, + 4, + 5, + 3, + 2, + 2, + 2, + 4, + 1, + 1, + 3, + 4, + 4, + 4, + 4, + 2, + 0, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 2, + ], + "Structural setting": [ + 2, + 6, + 10, + 10, + 7, + 5, + 8, + 8, + 2, + 2, + 6, + 6, + 3, + 7, + 3, + 6, + 10, + 9, + 3, + 0, + 0, + 7, + 6, + 6, + 6, + 7, + 6, + 6, + 6, + 6, + 8, + 2, + 9, + 4, + 7, + 6, + 1, + 8, + 4, + 4, + 3, + ], + "Gross": [ + 1, + 3, + 1, + 3, + 1, + 0, + 2, + 3, + 0, + 4, + 4, + 4, + 0, + 3, + 0, + 0, + 3, + 4, + 0, + 4, + 3, + 2, + 2, + 4, + 0, + 4, + 1, + 2, + 2, + 4, + 2, + 4, + 3, + 1, + 1, + 1, + 2, + 3, + 0, + 2, + 1, + ], + "Netpay": [ + 3, + 2, + 1, + 4, + 2, + 0, + 2, + 2, + 1, + 4, + 3, + 4, + 0, + 3, + 1, + 1, + 0, + 4, + 1, + 3, + 4, + 3, + 3, + 4, + 0, + 4, + 0, + 1, + 2, + 4, + 2, + 3, + 2, + 1, + 2, + 0, + 2, + 4, + 1, + 3, + 0, + ], + "Porosity": [ + 3, + 0, + 4, + 3, + 3, + 1, + 0, + 0, + 3, + 0, + 2, + 1, + 2, + 3, + 0, + 2, + 3, + 0, + 0, + 4, + 2, + 4, + 2, + 2, + 1, + 1, + 1, + 3, + 3, + 2, + 4, + 3, + 1, + 4, + 4, + 4, + 3, + 1, + 4, + 4, + 0, + ], + "Permeability": [ + 4, + 0, + 3, + 3, + 2, + 1, + 1, + 1, + 1, + 0, + 4, + 4, + 1, + 3, + 1, + 4, + 3, + 0, + 0, + 3, + 0, + 1, + 2, + 0, + 2, + 2, + 1, + 2, + 3, + 4, + 3, + 2, + 2, + 2, + 4, + 4, + 3, + 0, + 4, + 4, + 0, + ], + "Depth": [ + 1, + 4, + 3, + 4, + 1, + 3, + 1, + 3, + 1, + 4, + 3, + 4, + 1, + 2, + 1, + 4, + 0, + 4, + 0, + 0, + 3, + 2, + 3, + 2, + 2, + 3, + 4, + 2, + 2, + 4, + 1, + 0, + 2, + 0, + 4, + 0, + 1, + 2, + 0, + 0, + 3, + ], + } + + def test_apply_K2(self): + hcd = HillClimbDefiner( + data=pd.DataFrame(self.data), + descriptor=self.descriptor, + scoring_function=("K2",), + ) + + hcd.apply_K2( + data=pd.DataFrame(self.data), + init_edges=None, + progress_bar=False, + remove_init_edges=False, + white_list=None, + ) + + right_edges = [ + ["Tectonic regime", "Structural setting"], + ["Tectonic regime", "Depth"], + ["Tectonic regime", "Netpay"], + ["Period", "Porosity"], + ["Period", "Tectonic regime"], + ["Period", "Netpay"], + ["Lithology", "Permeability"], + ["Lithology", "Period"], + ["Lithology", "Tectonic regime"], + ["Structural setting", "Netpay"], + ["Netpay", "Gross"], + ["Porosity", "Permeability"], + ["Porosity", "Depth"], + ["Porosity", "Netpay"], + ["Permeability", "Netpay"], + ] + + self.assertEqual(hcd.skeleton["E"], right_edges) + + def test_apply_group1(self): + hcd = HillClimbDefiner( + data=pd.DataFrame(self.data), + descriptor=self.descriptor, + scoring_function=("MI",), + ) + + hcd.restrict(data=pd.DataFrame(self.data), bl_add=None, init_nodes=None) + hcd.apply_group1( + data=pd.DataFrame(self.data), + progress_bar=False, + init_edges=None, + remove_init_edges=False, + white_list=None, + ) + + right_edges = [ + ["Lithology", "Depth"], + 
["Period", "Gross"], + ["Netpay", "Gross"], + ["Period", "Netpay"], + ["Depth", "Period"], + ["Depth", "Permeability"], + ["Netpay", "Permeability"], + ["Period", "Porosity"], + ["Netpay", "Porosity"], + ["Permeability", "Structural setting"], + ["Netpay", "Structural setting"], + ["Period", "Tectonic regime"], + ["Netpay", "Tectonic regime"], + ] + + self.assertEqual(hcd.skeleton["E"], right_edges) + + +class TestEvoStructureBuilder(unittest.TestCase): + def setUp(self): + self.data = pd.read_csv(r"data/benchmark/asia.csv", index_col=0) + self.descriptor = { + "types": { + "asia": "disc", + "tub": "disc", + "smoke": "disc", + "lung": "disc", + "bronc": "disc", + "either": "disc", + "xray": "disc", + "dysp": "disc", + }, + "signs": {}, + } + self.evo_builder = EvoStructureBuilder( + data=self.data, + descriptor=self.descriptor, + regressor=None, + has_logit=True, + use_mixture=True, + ) + # Replace this with your actual reference DAG + self.reference_dag = [ + ("asia", "tub"), + ("tub", "either"), + ("smoke", "lung"), + ("smoke", "bronc"), + ("lung", "either"), + ("bronc", "dysp"), + ("either", "xray"), + ("either", "dysp"), + ] + + def test_build(self): + # placeholder kwargs + kwargs = {} + self.evo_builder.build( + data=self.data, classifier=None, regressor=None, verbose=False, **kwargs + ) + + obtained_dag = self.evo_builder.skeleton["E"] + num_edges = len(obtained_dag) + self.assertGreaterEqual( + num_edges, 1, msg="Obtained graph should have at least one edge." + ) + + dist = precision_recall(obtained_dag, self.reference_dag)["SHD"] + self.assertLess( + dist, + 15, + msg=f"Structural Hamming Distance should be less than 15, obtained SHD = {dist}", + ) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_graph_analyzer.py b/tests/test_graph_analyzer.py new file mode 100644 index 0000000..a87bcb2 --- /dev/null +++ b/tests/test_graph_analyzer.py @@ -0,0 +1,98 @@ +import logging +import unittest + +from bamt.builders.builders_base import VerticesDefiner +from bamt.networks.discrete_bn import DiscreteBN +from bamt.nodes.discrete_node import DiscreteNode +from bamt.utils import GraphUtils + +logging.getLogger("builder").setLevel(logging.CRITICAL) + + +class TestGraphAnalyzer(unittest.TestCase): + def setUp(self): + self.bn = DiscreteBN() + definer = VerticesDefiner( + descriptor={ + "types": { + "Node0": "disc", + "Node1": "disc", + "Node2": "disc", + "Node3": "disc_num", + "Node4": "disc", + "Node5": "disc", + "Node6": "disc", + "Node7": "disc", + "Node8": "disc", + "Node9": "disc", + }, + "signs": {}, + }, + regressor=None, + ) + + definer.skeleton["V"] = [DiscreteNode(name=f"Node{i}") for i in range(10)] + definer.skeleton["E"] = [ + ("Node0", "Node1"), + ("Node0", "Node2"), + ("Node2", "Node3"), + ("Node4", "Node7"), + ("Node1", "Node5"), + ("Node5", "Node6"), + ("Node7", "Node0"), + ("Node8", "Node1"), + ("Node9", "Node2"), + ] + definer.get_family() + + self.bn.nodes = definer.skeleton["V"] + self.bn.edges = definer.skeleton["E"] + + self.analyzer = GraphUtils.GraphAnalyzer(self.bn) + + def test_markov_blanket(self): + result = self.analyzer.markov_blanket("Node0") + result["nodes"] = sorted(result["nodes"]) + self.assertEqual( + { + "edges": [ + ("Node0", "Node1"), + ("Node0", "Node2"), + ("Node7", "Node0"), + ("Node8", "Node1"), + ("Node9", "Node2"), + ], + "nodes": sorted(["Node0", "Node1", "Node2", "Node7", "Node8", "Node9"]), + }, + result, + ) + + def test_find_family(self): + without_parents = self.analyzer.find_family("Node0", 0, 2, None) + 
without_parents["nodes"] = sorted(without_parents["nodes"]) + self.assertEqual( + { + "nodes": sorted(["Node3", "Node2", "Node1", "Node0", "Node5"]), + "edges": [ + ("Node0", "Node1"), + ("Node0", "Node2"), + ("Node2", "Node3"), + ("Node1", "Node5"), + ], + }, + without_parents, + ) + + without_children = self.analyzer.find_family("Node0", 2, 0, None) + without_children["nodes"] = sorted(without_children["nodes"]) + self.assertEqual( + { + "nodes": sorted(["Node4", "Node7", "Node0"]), + "edges": [("Node4", "Node7"), ("Node7", "Node0")], + }, + without_children, + ) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_networks.py b/tests/test_networks.py new file mode 100644 index 0000000..b2df174 --- /dev/null +++ b/tests/test_networks.py @@ -0,0 +1,1192 @@ +import json +import logging +import pathlib as pl +import unittest + +import pandas as pd +from catboost import CatBoostRegressor +from sklearn import preprocessing as pp +from sklearn.ensemble import RandomForestRegressor +from sklearn.tree import DecisionTreeRegressor + +import bamt.preprocessors as bp +from bamt.networks.composite_bn import CompositeBN +from bamt.networks.hybrid_bn import BaseNetwork, HybridBN +from bamt.nodes.discrete_node import DiscreteNode +from bamt.nodes.gaussian_node import GaussianNode +from bamt.utils.MathUtils import precision_recall +from bamt.utils.composite_utils.CompositeGeneticOperators import ( + custom_mutation_add_model, + custom_crossover_all_model, +) +from bamt.utils.composite_utils.CompositeModel import CompositeModel, CompositeNode + +logging.getLogger("network").setLevel(logging.CRITICAL) + + +class TestCaseBase(unittest.TestCase): + def assertIsFile(self, path): + if not pl.Path(path).resolve().is_file(): + raise AssertionError("File does not exist: %s" % str(path)) + + def assertIsDir(self, path): + if not pl.Path(path).resolve().is_dir(): + raise AssertionError("Direction does not exist: %s" % str(path)) + + def prepare_bn_and_data(self): + # prepare bn where models were set by set_model + hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[ + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] + ] + + encoder = pp.LabelEncoder() + discretizer = pp.KBinsDiscretizer( + n_bins=5, encode="ordinal", strategy="quantile" + ) + + p = bp.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + + hack_data.dropna(inplace=True) + hack_data.reset_index(inplace=True, drop=True) + + discretized_data, est = p.apply(hack_data) + + self.bn = HybridBN(has_logit=True) + info = p.info + + self.bn.add_nodes(info) + + self.bn.add_edges( + discretized_data, scoring_function=("BIC",), progress_bar=False + ) + + self.bn.set_regressor( + regressors={ + "Depth": CatBoostRegressor( + logging_level="Silent", allow_writing_files=False + ), + "Gross": RandomForestRegressor(), + "Porosity": DecisionTreeRegressor(), + } + ) + return hack_data + + +class TestBaseNetwork(TestCaseBase): + def setUp(self): + self.bn = BaseNetwork() + + self.nodes = [ + GaussianNode(name="Node0"), + DiscreteNode(name="Node1"), + GaussianNode(name="Node2"), + ] + + self.edges = [("Node0", "Node1"), ("Node1", "Node2")] + self.descriptor = { + "types": {"Node0": "cont", "Node1": "disc", "Node2": "cont"}, + "signs": {"Node0": "pos", "Node1": "neg"}, + } + + def test_validate(self): + descriptor_t = {"types": {"Node0": "Abstract", "Node1": "Abstract"}} + descriptor_f = {"types": {"Node0": 
"Abstract", "Node1": "cont"}} + + self.assertFalse(self.bn.validate(descriptor_f)) + self.assertTrue(self.bn.validate(descriptor_t)) + + def test_update_descriptor(self): + self.bn.descriptor = self.descriptor + # Nodes out + self.bn.nodes = [GaussianNode(name="Node0")] + self.bn.update_descriptor() + self.assertEqual({"Node0": "cont"}, self.bn.descriptor["types"]) + + # It uses only Vertices Definer, test of this is in builders tests. + def test_add_nodes(self): + pass + + def test_add_edges(self): + # It uses builders + pass + + def test_calculate_weights(self): + pass + + def test_set_nodes(self): + class MyNode: + def __init__(self, name): + self.name = name + + # set without mapping + self.assertIsNone(self.bn.set_nodes(nodes=[GaussianNode(name="Node0")])) + + map = { + "types": {"Node0": "cont", "Node1": "disc", "Node2": "cont"}, + "signs": {}, + } + + self.bn.set_nodes(nodes=self.nodes, info=map) + self.assertEqual(self.bn.nodes, self.nodes) + + self.bn.set_nodes( + nodes=[MyNode(name="Node-1"), MyNode("Node-2")], + info={"types": {"Node-1": "cont", "Node-2": "disc"}, "signs": {}}, + ) + self.assertEqual(self.bn.nodes, []) + + def test_set_edges(self): + self.edges.extend([(0, 1), (pd.NA, "1")]) + + self.bn.has_logit = False + self.bn.set_nodes(nodes=self.nodes, info=self.descriptor) + self.bn.set_edges(edges=self.edges) + + self.assertEqual([("Node1", "Node2")], self.bn.edges) + + # The test consists of 2 previous methods that are tested, + # plus methods of builders, they are tested as well. + def test_set_structure(self): + pass + + # Node testing. + def test_param_validation(self): + pass + + def test_set_parameters(self): + pass + + def test_save_params(self): + self.bn.distributions = {"AAA": "BBB"} + with self.assertRaises(TypeError): + self.bn.save_params("out.txt") + + self.assertTrue(self.bn.save_params("out.json")) + + self.assertIsFile("out.json") + + self.assertEqual(json.load(open("out.json")), self.bn.distributions) + pl.Path("out.json").unlink() + + def test_save_structure(self): + self.bn.edges = self.edges + + with self.assertRaises(TypeError): + self.bn.save_structure("out.txt") + + self.assertTrue(self.bn.save_structure("out.json")) + self.assertIsFile("out.json") + f = json.load(open("out.json")) + for i in range(len(f)): + self.assertEqual(list(self.bn.edges[i]), f[i]) + pl.Path("out.json").unlink() + + # Covers by prev tests. + def test_save(self): + pass + + def test_fit_parameters(self): + """ + General test, the full one is in the tests of each node. + It is frozen for a while. 
+ """ + pass + # bn = BaseNetwork() + # shape = 500 + # data = pd.DataFrame({"Node0": np.random.poisson(5, shape), + # "Node1": np.random.choice(["cat1", "cat2", "cat3"], shape), + # "Node2": np.random.normal(1.5, 4, shape)}) + # + # + # bn.set_structure(info=self.descriptor, nodes=self.nodes, edges=self.edges) + # bn.get_info(as_df=False) + + def test_joblib_pathsave(self): + hack_data = self.prepare_bn_and_data() + self.bn.fit_parameters(hack_data) + + self.assertGreater( + self.bn.sample(100, progress_bar=False).size, 0, "Sampling is broken" + ) + + combination_package = self.bn.distributions["Gross"]["hybcprob"][ + "['COMPRESSION']" + ] + regressor_obj = combination_package["regressor_obj"] + + if combination_package["serialization"] == "joblib": + self.assertIsFile(regressor_obj) + + def test_sample(self): + data = { + "Tectonic regime": [ + 0, + 1, + 4, + 4, + 0, + 2, + 0, + 0, + 0, + 0, + 3, + 1, + 0, + 3, + 0, + 1, + 4, + 0, + 4, + 3, + 4, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 3, + 2, + 3, + 2, + 3, + 3, + 3, + 0, + ], + "Period": [ + 3, + 1, + 4, + 4, + 1, + 1, + 0, + 0, + 3, + 5, + 3, + 9, + 0, + 5, + 0, + 3, + 5, + 3, + 2, + 4, + 4, + 1, + 5, + 7, + 7, + 7, + 1, + 1, + 1, + 1, + 4, + 6, + 8, + 4, + 4, + 5, + 4, + 7, + 5, + 5, + 0, + ], + "Lithology": [ + 2, + 4, + 6, + 4, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + 4, + 1, + 4, + 1, + 4, + 4, + 4, + 5, + 3, + 2, + 2, + 2, + 4, + 1, + 1, + 3, + 4, + 4, + 4, + 4, + 2, + 0, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 2, + ], + "Structural setting": [ + 2, + 6, + 10, + 10, + 7, + 5, + 8, + 8, + 2, + 2, + 6, + 6, + 3, + 7, + 3, + 6, + 10, + 9, + 3, + 0, + 0, + 7, + 6, + 6, + 6, + 7, + 6, + 6, + 6, + 6, + 8, + 2, + 9, + 4, + 7, + 6, + 1, + 8, + 4, + 4, + 3, + ], + "Gross": [ + 1, + 3, + 1, + 3, + 1, + 0, + 2, + 3, + 0, + 4, + 4, + 4, + 0, + 3, + 0, + 0, + 3, + 4, + 0, + 4, + 3, + 2, + 2, + 4, + 0, + 4, + 1, + 2, + 2, + 4, + 2, + 4, + 3, + 1, + 1, + 1, + 2, + 3, + 0, + 2, + 1, + ], + "Netpay": [ + 3, + 2, + 1, + 4, + 2, + 0, + 2, + 2, + 1, + 4, + 3, + 4, + 0, + 3, + 1, + 1, + 0, + 4, + 1, + 3, + 4, + 3, + 3, + 4, + 0, + 4, + 0, + 1, + 2, + 4, + 2, + 3, + 2, + 1, + 2, + 0, + 2, + 4, + 1, + 3, + 0, + ], + "Porosity": [ + 3, + 0, + 4, + 3, + 3, + 1, + 0, + 0, + 3, + 0, + 2, + 1, + 2, + 3, + 0, + 2, + 3, + 0, + 0, + 4, + 2, + 4, + 2, + 2, + 1, + 1, + 1, + 3, + 3, + 2, + 4, + 3, + 1, + 4, + 4, + 4, + 3, + 1, + 4, + 4, + 0, + ], + "Permeability": [ + 4, + 0, + 3, + 3, + 2, + 1, + 1, + 1, + 1, + 0, + 4, + 4, + 1, + 3, + 1, + 4, + 3, + 0, + 0, + 3, + 0, + 1, + 2, + 0, + 2, + 2, + 1, + 2, + 3, + 4, + 3, + 2, + 2, + 2, + 4, + 4, + 3, + 0, + 4, + 4, + 0, + ], + "Depth": [ + 1, + 4, + 3, + 4, + 1, + 3, + 1, + 3, + 1, + 4, + 3, + 4, + 1, + 2, + 1, + 4, + 0, + 4, + 0, + 0, + 3, + 2, + 3, + 2, + 2, + 3, + 4, + 2, + 2, + 4, + 1, + 0, + 2, + 0, + 4, + 0, + 1, + 2, + 0, + 0, + 3, + ], + } + nodes = [ + DiscreteNode(name="Tectonic regime"), + DiscreteNode(name="Period"), + DiscreteNode(name="Lithology"), + DiscreteNode(name="Structural setting"), + DiscreteNode(name="Gross"), + DiscreteNode(name="Netpay"), + DiscreteNode(name="Porosity"), + DiscreteNode(name="Permeability"), + DiscreteNode(name="Depth"), + ] + + self.bn.set_nodes(nodes, info={"types": {k.name: "disc" for k in nodes}}) + self.bn.set_edges( + [ + ["Tectonic regime", "Period"], + ["Structural setting", "Period"], + ["Tectonic regime", "Lithology"], + ["Lithology", "Structural setting"], + ] + ) + self.bn.fit_parameters(pd.DataFrame.from_records(data)) + 
self.assertIsNotNone(self.bn.sample(50, as_df=False, progress_bar=False)) + + def test_predict(self): + seq = { + "Tectonic regime": [ + 0, + 1, + 4, + 4, + 0, + 2, + 0, + 0, + 0, + 0, + 3, + 1, + 0, + 3, + 0, + 1, + 4, + 0, + 4, + 3, + 4, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 3, + 2, + 3, + 2, + 3, + 3, + 3, + 0, + ], + "Period": [ + 3, + 1, + 4, + 4, + 1, + 1, + 0, + 0, + 3, + 5, + 3, + 9, + 0, + 5, + 0, + 3, + 5, + 3, + 2, + 4, + 4, + 1, + 5, + 7, + 7, + 7, + 1, + 1, + 1, + 1, + 4, + 6, + 8, + 4, + 4, + 5, + 4, + 7, + 5, + 5, + 0, + ], + "Lithology": [ + 2, + 4, + 6, + 4, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + 4, + 1, + 4, + 1, + 4, + 4, + 4, + 5, + 3, + 2, + 2, + 2, + 4, + 1, + 1, + 3, + 4, + 4, + 4, + 4, + 2, + 0, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 2, + ], + "Structural setting": [ + 2, + 6, + 10, + 10, + 7, + 5, + 8, + 8, + 2, + 2, + 6, + 6, + 3, + 7, + 3, + 6, + 10, + 9, + 3, + 0, + 0, + 7, + 6, + 6, + 6, + 7, + 6, + 6, + 6, + 6, + 8, + 2, + 9, + 4, + 7, + 6, + 1, + 8, + 4, + 4, + 3, + ], + "Gross": [ + 1, + 3, + 1, + 3, + 1, + 0, + 2, + 3, + 0, + 4, + 4, + 4, + 0, + 3, + 0, + 0, + 3, + 4, + 0, + 4, + 3, + 2, + 2, + 4, + 0, + 4, + 1, + 2, + 2, + 4, + 2, + 4, + 3, + 1, + 1, + 1, + 2, + 3, + 0, + 2, + 1, + ], + "Netpay": [ + 3, + 2, + 1, + 4, + 2, + 0, + 2, + 2, + 1, + 4, + 3, + 4, + 0, + 3, + 1, + 1, + 0, + 4, + 1, + 3, + 4, + 3, + 3, + 4, + 0, + 4, + 0, + 1, + 2, + 4, + 2, + 3, + 2, + 1, + 2, + 0, + 2, + 4, + 1, + 3, + 0, + ], + "Porosity": [ + 3, + 0, + 4, + 3, + 3, + 1, + 0, + 0, + 3, + 0, + 2, + 1, + 2, + 3, + 0, + 2, + 3, + 0, + 0, + 4, + 2, + 4, + 2, + 2, + 1, + 1, + 1, + 3, + 3, + 2, + 4, + 3, + 1, + 4, + 4, + 4, + 3, + 1, + 4, + 4, + 0, + ], + "Permeability": [ + 4, + 0, + 3, + 3, + 2, + 1, + 1, + 1, + 1, + 0, + 4, + 4, + 1, + 3, + 1, + 4, + 3, + 0, + 0, + 3, + 0, + 1, + 2, + 0, + 2, + 2, + 1, + 2, + 3, + 4, + 3, + 2, + 2, + 2, + 4, + 4, + 3, + 0, + 4, + 4, + 0, + ], + "Depth": [ + 1, + 4, + 3, + 4, + 1, + 3, + 1, + 3, + 1, + 4, + 3, + 4, + 1, + 2, + 1, + 4, + 0, + 4, + 0, + 0, + 3, + 2, + 3, + 2, + 2, + 3, + 4, + 2, + 2, + 4, + 1, + 0, + 2, + 0, + 4, + 0, + 1, + 2, + 0, + 0, + 3, + ], + } + data = pd.DataFrame.from_records(seq) + nodes = [ + DiscreteNode(name="Tectonic regime"), + DiscreteNode(name="Period"), + DiscreteNode(name="Lithology"), + DiscreteNode(name="Structural setting"), + ] + + self.bn.set_nodes(nodes, info={"types": {k.name: "disc" for k in nodes}}) + self.bn.set_edges( + [ + ["Tectonic regime", "Period"], + ["Structural setting", "Period"], + ["Tectonic regime", "Lithology"], + ["Lithology", "Structural setting"], + ] + ) + self.bn.fit_parameters(data) + + result = self.bn.predict(data.iloc[:, :3], parall_count=2, progress_bar=False) + self.assertNotEqual(result, {}) + + for v in result.values(): + for item in v: + self.assertFalse(pd.isna(item)) + + +class TestBigBraveBN(unittest.SkipTest): + pass + + +class TestCompositeNetwork(unittest.TestCase): + def setUp(self): + self.data = pd.read_csv(r"data/benchmark/healthcare.csv", index_col=0) + self.descriptor = { + "types": { + "A": "disc", + "C": "disc", + "D": "cont", + "H": "disc", + "I": "cont", + "O": "cont", + "T": "cont", + }, + "signs": {"D": "pos", "I": "neg", "O": "pos", "T": "pos"}, + } + self.reference_dag = [ + ("A", "C"), + ("A", "D"), + ("A", "H"), + ("A", "O"), + ("C", "I"), + ("D", "I"), + ("H", "D"), + ("I", "T"), + ("O", "T"), + ] + + self.comparative_dag = [("I", "T"), ("O", "T")] + + def test_learning(self): + bn, _ = self._get_starter_bn(self.data) + + 
bn.add_edges(self.data, verbose=False) + + bn.fit_parameters(self.data) + + obtained_dag = bn.edges + num_edges = len(obtained_dag) + self.assertGreaterEqual( + num_edges, 1, msg="Obtained graph should have at least one edge." + ) + + dist = precision_recall(obtained_dag, self.reference_dag)["SHD"] + self.assertLess( + dist, + 25, + msg=f"Structural Hamming Distance should be less than 15, obtained SHD = {dist}", + ) + + for node in bn.nodes: + if type(node).__name__ == "CompositeContinuousNode": + self.assertIsNotNone( + node.regressor, + msg="CompositeContinuousNode does not have regressor", + ) + if type(node).__name__ == "CompositeDiscreteNode": + self.assertIsNotNone( + node.classifier, + msg="CompositeDiscreteNode does not have classifier", + ) + + def test_learning_models(self): + bn, p = self._get_starter_bn(self.data[["I", "O", "T"]]) + + parent_node_a = CompositeNode( + nodes_from=None, + content={ + "name": "I", + "type": p.nodes_types["I"], + "parent_model": None, + }, + ) + + parent_node_h = CompositeNode( + nodes_from=None, + content={ + "name": "O", + "type": p.nodes_types["O"], + "parent_model": None, + }, + ) + + child_node = CompositeNode( + nodes_from=[parent_node_a, parent_node_h], + content={ + "name": "T", + "type": p.nodes_types["T"], + "parent_model": "CatBoostRegressor", + }, + ) + + comp_model = CompositeModel(nodes=[parent_node_a, parent_node_h, child_node]) + + bn.add_edges( + self.data[["I", "O", "T"]], + verbose=False, + custom_mutations=[custom_mutation_add_model], + custom_crossovers=[custom_crossover_all_model], + custom_initial_structure=[comp_model], + ) + + output_structure = [ + tuple([str(item) for item in inner_list]) for inner_list in bn.edges + ] + + self.assertEqual( + output_structure, + self.comparative_dag, + msg="Obtained BN should have reference structure", + ) + + @staticmethod + def _get_starter_bn(data): + encoder = pp.LabelEncoder() + p = bp.Preprocessor([("encoder", encoder)]) + + _, _ = p.apply(data) + + info = p.info + + bn = CompositeBN() + bn.add_nodes(info) + + return bn, p + + +if __name__ == "__main__": + unittest.main(verbosity=3) diff --git a/tests/test_nodes.py b/tests/test_nodes.py new file mode 100644 index 0000000..878d736 --- /dev/null +++ b/tests/test_nodes.py @@ -0,0 +1,464 @@ +import logging +import unittest + +import numpy as np +import pandas as pd + +from bamt.networks.hybrid_bn import HybridBN +from bamt.nodes import * + +logging.getLogger("nodes").setLevel(logging.CRITICAL) + + +class MyTest(unittest.TestCase): + unittest.skip("This is an assertion.") + + def assertDist(self, dist, node_type): + if node_type in ("disc", "disc_num"): + return self.assertAlmostEqual(sum(dist), 1) + else: + return self.assertEqual(len(dist), 2, msg=f"Error on {dist}") + + def assertDistMixture(self, dist): + return self.assertEqual(len(dist), 3, msg=f"Error on {dist}") + + +class TestBaseNode(MyTest): + def setUp(self): + np.random.seed(510) + + hybrid_bn = HybridBN(has_logit=True) + + info = { + "types": { + "Node0": "cont", + "Node1": "cont", + "Node2": "cont", + "Node3": "cont", + "Node4": "disc", + "Node5": "disc", + "Node6": "disc_num", + "Node7": "disc_num", + }, + "signs": {"Node0": "pos", "Node1": "neg", "Node2": "neg", "Node3": "neg"}, + } + + data = pd.DataFrame( + { + "Node0": np.random.normal(0, 4, 30), + "Node1": np.random.normal(0, 0.1, 30), + "Node2": np.random.normal(0, 0.3, 30), + "Node3": np.random.normal(0, 0.3, 30), + "Node4": np.random.choice(["cat1", "cat2", "cat3"], 30), + "Node5": np.random.choice(["cat4", 
"cat5", "cat6"], 30), + "Node6": np.random.choice(["cat7", "cat8", "cat9"], 30), + "Node7": np.random.choice(["cat7", "cat8", "cat9"], 30), + } + ) + + nodes = [ + gaussian_node.GaussianNode(name="Node0"), + gaussian_node.GaussianNode(name="Node1"), + gaussian_node.GaussianNode(name="Node2"), + gaussian_node.GaussianNode(name="Node3"), + discrete_node.DiscreteNode(name="Node4"), + discrete_node.DiscreteNode(name="Node5"), + discrete_node.DiscreteNode(name="Node6"), + discrete_node.DiscreteNode(name="Node7"), + ] + + edges = [ + ("Node0", "Node7"), + ("Node0", "Node1"), + ("Node0", "Node2"), + ("Node0", "Node5"), + ("Node4", "Node2"), + ("Node4", "Node5"), + ("Node4", "Node6"), + ("Node4", "Node3"), + ] + + hybrid_bn.set_structure(info, nodes=nodes, edges=edges) + hybrid_bn.fit_parameters(data) + self.bn = hybrid_bn + self.data = data + self.info = info + + def test_equality(self): + test = base.BaseNode(name="node0") + first = base.BaseNode(name="node1") + + test_clone = test + + # different name + self.assertFalse(test == first) + self.assertTrue(test == test_clone) + + # different type + test_clone.type = "gaussian" + test.type = "gaussian" + + first.type = "discrete" + + self.assertFalse(test == first) + self.assertTrue(test == test_clone) + + # different disc_parents + disc_parents = [f"node{i}" for i in range(2, 6)] + test.disc_parents, test_clone.disc_parents = disc_parents, disc_parents + first = disc_parents[:3] + + self.assertFalse(test == first) + self.assertTrue(test == test_clone) + + # different cont_parents + cont_parents = [f"node{i}" for i in range(6, 10)] + test.disc_parents, test_clone.disc_parents = cont_parents, cont_parents + first = cont_parents[:3] + + self.assertFalse(test == first) + self.assertTrue(test == test_clone) + + # different children + children = [f"node{i}" for i in range(5)] + test.disc_parents, test_clone.disc_parents = children, children + first = children[:3] + + self.assertFalse(test == first) + self.assertTrue(test == test_clone) + + def test_get_dist_mixture(self): + hybrid_bn = HybridBN(use_mixture=True, has_logit=True) + + hybrid_bn.set_structure(self.info, self.bn.nodes, self.bn.edges) + hybrid_bn.fit_parameters(self.data) + + mixture_gauss = hybrid_bn["Node1"] + cond_mixture_gauss = hybrid_bn["Node2"] + + for i in range(-2, 0, 2): + dist = hybrid_bn.get_dist(mixture_gauss.name, pvals={"Node0": i}) + self.assertDistMixture(dist) + + for i in range(-2, 0, 2): + for j in self.data[cond_mixture_gauss.disc_parents[0]].unique().tolist(): + dist = hybrid_bn.get_dist( + cond_mixture_gauss.name, pvals={"Node0": float(i), "Node4": j} + ) + self.assertDistMixture(dist) + + def test_get_dist(self): + for node in self.bn.nodes: + if not node.cont_parents + node.disc_parents: + dist = self.bn.get_dist(node.name) + if "mean" in dist.keys(): + self.assertTrue(isinstance(dist["mean"], float)) + self.assertTrue(isinstance(dist["variance"], float)) + else: + self.assertAlmostEqual(sum(dist["cprob"]), 1) + continue + + if len(node.cont_parents + node.disc_parents) == 1: + if node.disc_parents: + pvals = self.data[node.disc_parents[0]].unique().tolist() + parent = node.disc_parents[0] + else: + pvals = range(-5, 5, 1) + parent = node.cont_parents[0] + + for pval in pvals: + dist = self.bn.get_dist(node.name, {parent: pval}) + self.assertDist(dist, self.info["types"][node.name]) + else: + for i in self.data[node.disc_parents[0]].unique().tolist(): + for j in range(-5, 5, 1): + dist = self.bn.get_dist( + node.name, + {node.cont_parents[0]: float(j), 
node.disc_parents[0]: i}, + ) + self.assertDist(dist, self.info["types"][node.name]) + + # ??? + def test_choose_serialization(self): + pass + + +class TestDiscreteNode(unittest.TestCase): + def setUp(self): + self.node = discrete_node.DiscreteNode(name="test") + self.data_dict = { + "node0": np.random.normal(0, 4, 30), + "node1": np.random.normal(0, 0.1, 30), + "node2": np.random.normal(0, 0.3, 30), + "test": np.random.choice(["cat1", "cat2", "cat3"], 30), + "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), + "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), + "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), + } + + self.node.disc_parents = ["node4", "node5"] + # self.node.cont_parents = ["node0", "node1"] + self.node.children = ["node6"] + + def test_fit_parameters(self): + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertIsNotNone(params["vals"]) + self.assertNotEqual(params["vals"], []) + + for comb, probas in params["cprob"].items(): + self.assertAlmostEqual(sum(probas), 1, delta=1e-5) + + def test_choose(self): + pvals = ["cat4", "cat7"] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + + self.assertTrue([self.node.choose(params, pvals) in params["vals"]]) + + def test_predict(self): + pvals = ["cat4", "cat7"] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + + self.assertTrue([self.node.predict(params, pvals) in params["vals"]]) + self.assertRaises(KeyError, self.node.predict, params, ["bad", "values"]) + + +class TestGaussianNode(unittest.TestCase): + def setUp(self): + self.node = gaussian_node.GaussianNode(name="test") + self.data_dict = { + "node0": np.random.normal(1, 4, 30), + "node1": np.random.normal(2, 0.1, 30), + "foster-son": np.random.normal(2.5, 0.2, 30), + "test": np.random.normal(3, 0.3, 30), + "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), + "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), + "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), + "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), + } + + # self.node.disc_parents = ["node4", "node5"] + self.node.cont_parents = ["node0", "node1"] + self.node.children = ["node6"] + + def test_fit_parameters(self): + node_without_parents = gaussian_node.GaussianNode(name="foster-son") + node_without_parents.children = ["node6", "node5"] + + params_parents = self.node.fit_parameters( + pd.DataFrame.from_records(self.data_dict) + ) + params_foster = node_without_parents.fit_parameters( + pd.DataFrame.from_records(self.data_dict) + ) + + self.assertIsNotNone(params_parents["regressor_obj"]) + self.assertTrue(pd.isna(params_parents["mean"])) + self.assertIsNotNone(params_parents["variance"]) + + self.assertIsNone(params_foster["regressor_obj"]) + self.assertFalse(pd.isna(params_foster["mean"])) + self.assertIsNotNone(params_foster["variance"]) + + def test_choose(self): + pvals = [1.05, 1.95] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + + self.assertTrue(isinstance(self.node.choose(params, pvals), float)) + + def test_predict(self): + pvals = [1.05, 1.95] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertTrue(isinstance(self.node.predict(params, pvals), float)) + self.assertRaises(ValueError, self.node.predict, params, ["bad", "values"]) + + +class TestConditionalGaussianNode(unittest.TestCase): + def setUp(self): + self.node = conditional_gaussian_node.ConditionalGaussianNode(name="test") + 
self.data_dict = { + "node0": np.random.normal(1, 4, 30), + "node1": np.random.normal(2, 0.1, 30), + "foster-son": np.random.normal(2.5, 0.2, 30), + "test": np.random.normal(3, 0.3, 30), + "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), + "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), + "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), + "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), + } + + self.node.disc_parents = ["node4", "node5"] + self.node.cont_parents = ["node0", "node1"] + self.node.children = ["node6"] + + def fit_parameters(self, regressor=None): + if regressor is not None: + self.node.regressor = regressor + self.node.type = "ConditionalGaussian" + f" ({type(regressor).__name__})" + + node_without_parents = conditional_gaussian_node.ConditionalGaussianNode( + name="foster-son" + ) + node_without_parents.children = ["node6", "node5"] + + params_parents = self.node.fit_parameters( + pd.DataFrame.from_records(self.data_dict) + )["hybcprob"] + params_foster = node_without_parents.fit_parameters( + pd.DataFrame.from_records(self.data_dict) + )["hybcprob"]["[]"] + + self.assertIsNone(params_foster["regressor_obj"]) + self.assertIsNotNone(params_foster["mean"]) + + # Since there can be empty dictionaries, + # we have to count them and if a fraction is greater than the threshold + # test failed. + report = [] + for comb, data in params_parents.items(): + if all(pd.isna(x) for x in data.values()): + report.append(1) + continue + else: + report.append(0) + + if pd.isna(data["mean"]): + self.assertIsNotNone(data["regressor_obj"]) + self.assertIsNotNone(data["variance"]) + else: + self.assertIsNone(data["regressor_obj"]) + self.assertIsNotNone(data["mean"]) + + self.assertLess(sum(report) / len(report), 0.3) + # print(params_parents, params_foster, sep="\n\n") + + # for k, v in params_parents.items(): + # print(k, True if v["regressor_obj"] else False, v["regressor"]) + + # print(sum(report) / len(report), node_without_parents.regressor) + + def test_choose(self): + pvals = [1.05, 1.95, "cat4", "cat7"] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertTrue(isinstance(self.node.choose(params, pvals), float)) + + def test_predict(self): + pvals = [1.05, 1.95, "cat4", "cat7"] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + + self.assertTrue(isinstance(self.node.predict(params, pvals), float)) + self.assertRaises(KeyError, self.node.predict, params, ["bad", "values"]) + + +class TestMixtureGaussianNode(unittest.TestCase): + def setUp(self): + self.node = mixture_gaussian_node.MixtureGaussianNode(name="test") + self.data_dict = { + "node0": np.random.normal(1, 4, 30), + "node1": np.random.normal(2, 0.1, 30), + "test": np.random.normal(3, 0.3, 30), + "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), + "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), + "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), + "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), + } + + # self.node.disc_parents = ["node4", "node5"] + self.node.cont_parents = ["node0", "node1"] + self.node.children = ["node6"] + + def test_fit_parameters(self): + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertAlmostEqual(sum(params["coef"]), 1, delta=1e-5) + + def test_choose(self): + pvals = [1.05, 1.95] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertTrue(isinstance(self.node.choose(params, pvals), float)) + + def 
test_predict(self): + pvals = [1.05, 1.95] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + + self.assertTrue(isinstance(self.node.predict(params, pvals), float)) + + +class TestConditionalMixtureGaussianNode(unittest.TestCase): + def setUp(self): + self.node = conditional_mixture_gaussian_node.ConditionalMixtureGaussianNode( + name="test" + ) + self.data_dict = { + "node0": np.random.normal(1, 4, 30), + "node1": np.random.normal(2, 0.1, 30), + "test": np.random.normal(3, 0.3, 30), + "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), + "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), + "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), + "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), + } + + self.node.disc_parents = ["node4", "node5"] + self.node.cont_parents = ["node0", "node1"] + self.node.children = ["node6"] + + def test_fit_parameters(self): + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict))[ + "hybcprob" + ] + report = [] + # sometimes combination's data can be empty, so we set the percent of + # empty combinations + for comb, data in params.items(): + if np.isclose(sum(data["coef"]), 1, atol=1e-5): + report.append(0) + else: + report.append(1) + self.assertLess(sum(report) / len(report), 0.3) + + def test_choose(self): + pvals = [1.05, 1.95, "cat4", "cat7"] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertTrue(isinstance(self.node.choose(params, pvals), float)) + + def test_predict(self): + pvals = [1.05, 1.95, "cat4", "cat7"] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + + self.assertTrue(isinstance(self.node.predict(params, pvals), float)) + + +class TestLogitNode(unittest.TestCase): + def setUp(self): + self.node = logit_node.LogitNode(name="test") + self.data_dict = { + "node0": np.random.normal(1, 4, 30), + "node1": np.random.normal(2, 0.1, 30), + "node2": np.random.normal(3, 0.3, 30), + "test": np.random.choice(["cat1", "cat2", "cat3"], 30), + "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), + "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), + "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), + } + + # self.node.disc_parents = ["node4", "node5"] + self.node.cont_parents = ["node0", "node1"] + self.node.children = ["node6"] + + def test_fit_parameters(self): + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertIsNotNone(params["classifier_obj"]) + + def test_choose(self): + pvals = [1.05, 1.95] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertTrue(self.node.choose(params, pvals) in params["classes"]) + + def test_predict(self): + pvals = [1.05, 1.95] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + + self.assertTrue([self.node.predict(params, pvals) in params["classes"]]) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_params.json b/tests/test_params.json new file mode 100644 index 0000000..e3ebc03 --- /dev/null +++ b/tests/test_params.json @@ -0,0 +1 @@ +{"Node0": {"mean": 56.7183257918552, "coef": [], "variance": 7149.17326145042}, "Node3": {"classes": ["COMPRESSION", "EVAPORITE", "EXTENSION", "GRAVITY", "INVERSION", "STRIKE-SLIP", "TRANSPRESSION", "TRANSTENSION", "UPLIFT"], "classifier_obj": 
"\u0080\u0004\u0095\u00b1\u0003\u0000\u0000\u0000\u0000\u0000\u0000\u008c\u001esklearn.linear_model._logistic\u0094\u008c\u0012LogisticRegression\u0094\u0093\u0094)\u0081\u0094}\u0094(\u008c\u0007penalty\u0094\u008c\u0002l2\u0094\u008c\u0004dual\u0094\u0089\u008c\u0003tol\u0094G?\u001a6\u00e2\u00eb\u001cC-\u008c\u0001C\u0094G?\u00f0\u0000\u0000\u0000\u0000\u0000\u0000\u008c\rfit_intercept\u0094\u0088\u008c\u0011intercept_scaling\u0094K\u0001\u008c\fclass_weight\u0094N\u008c\frandom_state\u0094N\u008c\u0006solver\u0094\u008c\tnewton-cg\u0094\u008c\bmax_iter\u0094Kd\u008c\u000bmulti_class\u0094\u008c\u000bmultinomial\u0094\u008c\u0007verbose\u0094K\u0000\u008c\nwarm_start\u0094\u0089\u008c\u0006n_jobs\u0094N\u008c\bl1_ratio\u0094N\u008c\u000en_features_in_\u0094K\u0001\u008c\bclasses_\u0094\u008c\u0015numpy.core.multiarray\u0094\u008c\f_reconstruct\u0094\u0093\u0094\u008c\u0005numpy\u0094\u008c\u0007ndarray\u0094\u0093\u0094K\u0000\u0085\u0094C\u0001b\u0094\u0087\u0094R\u0094(K\u0001K\t\u0085\u0094h\u001c\u008c\u0005dtype\u0094\u0093\u0094\u008c\u0002O8\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003\u008c\u0001|\u0094NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK?t\u0094b\u0089]\u0094(\u008c\u000bCOMPRESSION\u0094\u008c\tEVAPORITE\u0094\u008c\tEXTENSION\u0094\u008c\u0007GRAVITY\u0094\u008c\tINVERSION\u0094\u008c\u000bSTRIKE-SLIP\u0094\u008c\rTRANSPRESSION\u0094\u008c\fTRANSTENSION\u0094\u008c\u0006UPLIFT\u0094et\u0094b\u008c\u0007n_iter_\u0094h\u001bh\u001eK\u0000\u0085\u0094h \u0087\u0094R\u0094(K\u0001K\u0001\u0085\u0094h%\u008c\u0002i4\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003\u008c\u0001<\u0094NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK\u0000t\u0094b\u0089C\u0004+\u0000\u0000\u0000\u0094t\u0094b\u008c\u0005coef_\u0094h\u001bh\u001eK\u0000\u0085\u0094h \u0087\u0094R\u0094(K\u0001K\tK\u0001\u0086\u0094h%\u008c\u0002f8\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003h>NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK\u0000t\u0094b\u0089CH\u00a9\u0093\n\u00a1b\u001ak?\u00fc\u00a5\u00ad\u00e1\u00a2\u0001P\u00bf\u00f4G\u00ab\u00dd\u00f4\u00cc{?e.QC-\u00bau?\u001f\u00bd\u00acb&>t?j\u0092\u00f9\u0092\u00f0aw?1\u00cb\u00967N\u00dc\u0092\u00bfk\u0000\u00ac\u00d1\u00a4`\u007f?\u00a1\u0096\u0006\u00d1\u00b6Q\u008d\u00bf\u0094t\u0094b\u008c\nintercept_\u0094h\u001bh\u001eK\u0000\u0085\u0094h \u0087\u0094R\u0094(K\u0001K\t\u0085\u0094hI\u0089CH'\u009cX\u008cG&\u0007@\u009a\u0011\u00e6\u00e9\u00feR\u0002\u00c0\u008f\u0085\u00f3\u00df\u00d5\u00da\u00fa?\u00df\u00ee\u00a1\u00e4\u0001#\u00f7?\u00e5\u00e1\b\u00b5\u009c\u000b\u00fb?x\u000b1Q\u0088\u00c8\u00d5?`r\u001f\u0006\u00b2\u00c2\u00f1\u00bf%B\u0006\u00c9=\u0002\u0006\u00c0\u00e0\u00b6\u00a3:\u00faZ\u00fe\u00bf\u0094t\u0094b\u008c\u0010_sklearn_version\u0094\u008c\u00051.0.2\u0094ub.", "classifier": "LogisticRegression", "serialization": "pickle"}, "Node1": {"hybcprob": {"['COMPRESSION']": {"variance": 53.87084286837278, "mean": 15.428971962616822, "coef": []}, "['EVAPORITE']": {"variance": 0.0, "mean": 27.0, "coef": []}, "['EXTENSION']": {"variance": 49.697271531886926, "mean": 18.712820512820514, "coef": []}, "['GRAVITY']": {"variance": 45.05489795918368, "mean": 21.12857142857143, "coef": []}, "['INVERSION']": {"variance": 46.653795918367344, "mean": 21.314285714285713, "coef": []}, "['STRIKE-SLIP']": {"variance": 46.55263157894737, "mean": 20.5, "coef": []}, "['TRANSPRESSION']": {"variance": 2.25, "mean": 18.5, "coef": []}, "['TRANSTENSION']": {"variance": 0.0, "mean": 25.0, "coef": []}, "['UPLIFT']": {"variance": 0.0, "mean": 
10.0, "coef": []}}}, "Node4": {"classes": ["ARCHEAN", "CAMBRIAN", "CAMBRIAN-ORDOVICIAN", "CAMBRIAN-ORDOVICIAN/CARBONIFEROUS", "CARBONIFEROUS", "CARBONIFEROUS-CRETACEOUS", "CARBONIFEROUS-PERMIAN", "CRETACEOUS", "CRETACEOUS-PALEOGENE", "DEVONIAN", "DEVONIAN-CARBONIFEROUS", "DEVONIAN-PERMIAN", "JURASSIC", "JURASSIC-CRETACEOUS", "MESOZOIC", "NEOGENE", "ORDOVICIAN", "PALEOGENE", "PALEOGENE-NEOGENE", "PALEOZOIC", "PALEOZOIC-CRETACEOUS", "PERMIAN", "PERMIAN-TRIASSIC", "PROTEROZOIC", "PROTEROZOIC-CAMBRIAN", "SILURIAN", "TRIASSIC", "TRIASSIC-JURASSIC"], "classifier_obj": "\u0080\u0004\u0095N\u0006\u0000\u0000\u0000\u0000\u0000\u0000\u008c\u001esklearn.linear_model._logistic\u0094\u008c\u0012LogisticRegression\u0094\u0093\u0094)\u0081\u0094}\u0094(\u008c\u0007penalty\u0094\u008c\u0002l2\u0094\u008c\u0004dual\u0094\u0089\u008c\u0003tol\u0094G?\u001a6\u00e2\u00eb\u001cC-\u008c\u0001C\u0094G?\u00f0\u0000\u0000\u0000\u0000\u0000\u0000\u008c\rfit_intercept\u0094\u0088\u008c\u0011intercept_scaling\u0094K\u0001\u008c\fclass_weight\u0094N\u008c\frandom_state\u0094N\u008c\u0006solver\u0094\u008c\tnewton-cg\u0094\u008c\bmax_iter\u0094Kd\u008c\u000bmulti_class\u0094\u008c\u000bmultinomial\u0094\u008c\u0007verbose\u0094K\u0000\u008c\nwarm_start\u0094\u0089\u008c\u0006n_jobs\u0094N\u008c\bl1_ratio\u0094N\u008c\u000en_features_in_\u0094K\u0001\u008c\bclasses_\u0094\u008c\u0015numpy.core.multiarray\u0094\u008c\f_reconstruct\u0094\u0093\u0094\u008c\u0005numpy\u0094\u008c\u0007ndarray\u0094\u0093\u0094K\u0000\u0085\u0094C\u0001b\u0094\u0087\u0094R\u0094(K\u0001K\u001c\u0085\u0094h\u001c\u008c\u0005dtype\u0094\u0093\u0094\u008c\u0002O8\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003\u008c\u0001|\u0094NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK?t\u0094b\u0089]\u0094(\u008c\u0007ARCHEAN\u0094\u008c\bCAMBRIAN\u0094\u008c\u0013CAMBRIAN-ORDOVICIAN\u0094\u008c!CAMBRIAN-ORDOVICIAN/CARBONIFEROUS\u0094\u008c\rCARBONIFEROUS\u0094\u008c\u0018CARBONIFEROUS-CRETACEOUS\u0094\u008c\u0015CARBONIFEROUS-PERMIAN\u0094\u008c\nCRETACEOUS\u0094\u008c\u0014CRETACEOUS-PALEOGENE\u0094\u008c\bDEVONIAN\u0094\u008c\u0016DEVONIAN-CARBONIFEROUS\u0094\u008c\u0010DEVONIAN-PERMIAN\u0094\u008c\bJURASSIC\u0094\u008c\u0013JURASSIC-CRETACEOUS\u0094\u008c\bMESOZOIC\u0094\u008c\u0007NEOGENE\u0094\u008c\nORDOVICIAN\u0094\u008c\tPALEOGENE\u0094\u008c\u0011PALEOGENE-NEOGENE\u0094\u008c\tPALEOZOIC\u0094\u008c\u0014PALEOZOIC-CRETACEOUS\u0094\u008c\u0007PERMIAN\u0094\u008c\u0010PERMIAN-TRIASSIC\u0094\u008c\u000bPROTEROZOIC\u0094\u008c\u0014PROTEROZOIC-CAMBRIAN\u0094\u008c\bSILURIAN\u0094\u008c\bTRIASSIC\u0094\u008c\u0011TRIASSIC-JURASSIC\u0094et\u0094b\u008c\u0007n_iter_\u0094h\u001bh\u001eK\u0000\u0085\u0094h \u0087\u0094R\u0094(K\u0001K\u0001\u0085\u0094h%\u008c\u0002i4\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003\u008c\u0001<\u0094NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK\u0000t\u0094b\u0089C\u0004!\u0000\u0000\u0000\u0094t\u0094b\u008c\u0005coef_\u0094h\u001bh\u001eK\u0000\u0085\u0094h 
\u0087\u0094R\u0094(K\u0001K\u001cK\u0001\u0086\u0094h%\u008c\u0002f8\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003hQNNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK\u0000t\u0094b\u0089C\u00e0LS1\u00b9zE\u00ec\u00bf\u00d1\u00c7\u00bf\u0092Y\u00c5\u00c2\u00bf\u00a6.\u0085\u00aen\u0001\u00dd\u00bf\u0016\u0099\u00f0\u00e0'\u0002\u00d2?\u00c3o\u00ed\u000e\u00f0Z\u00b3?\u00daFH.\u0082\u00c7\u0091?\u00bds\u0099\u00fe\u00a5#\u00bb?\u0093?\u00b0\u00aflW\u00d2?\u00a6\u008a.\u0096\u007f\u00aa\u00d6?\u00bf7|:\u00be\u0095\u00ab\u00bf\u00eb\u0017\u00de\u00edN&\u00e1\u00bf\u00d3S\u00e6 \u00d8H\u00b6\u00bf}\u001b\u0088VKS\u00ce?z\u00b8\u0090;\u00f74\u0092?oK\u001f\u00c4v\u0012\u00a9?\u008f\u00a9\u00f8s\u00b4\u00a3\u00da?\u00ffS)+v\u0092\u00d6\u00bf]\u00ea\u00af&\u001a~\u00d3?\u0094\u00f2\u00e1M\u00c1\u0092\u00d2?\u00fb\u00de\u00da?\u001a\u0090\u00c1?z\u0092\u009d\u00c3\u00e2\u0096\u00c9?\u00e5m\u0098T\u00ed(\u00a6?\u00e26
Date: Sat, 13 Jul 2024 14:02:11 +0300
Subject: [PATCH 18/18] fix issue with in-place changes to dataframes (#113)

Making a deep copy of the dataframe before performing any changes on it.
---
 bamt/networks/base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bamt/networks/base.py b/bamt/networks/base.py
index 8090918..eaee66b 100644
--- a/bamt/networks/base.py
+++ b/bamt/networks/base.py
@@ -597,6 +597,7 @@ def fit_parameters(self, data: pd.DataFrame, n_jobs: int = 1):
         """
         Base function for parameter learning
        """
+        data = data.copy()
         if data.isnull().values.any():
             logger_network.error("Dataframe contains NaNs.")
             return
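Editorial note on the hunk above: fit_parameters now works on data.copy() so that preprocessing performed inside the method cannot leak back into the caller's DataFrame. A minimal, self-contained sketch of the failure mode the copy guards against; the column name and the astype transformation are purely illustrative and are not taken from bamt:

    import pandas as pd

    def fit_parameters_unsafe(data: pd.DataFrame) -> None:
        # Without an explicit copy, this column assignment mutates the caller's frame.
        data["node0"] = data["node0"].astype(str)

    def fit_parameters_safe(data: pd.DataFrame) -> None:
        data = data.copy()  # same guard as the patch: work on a private copy
        data["node0"] = data["node0"].astype(str)

    df = pd.DataFrame({"node0": [1.0, 2.0, 3.0]})
    fit_parameters_unsafe(df)
    print(df["node0"].dtype)  # object: the caller's frame was modified in place

    df = pd.DataFrame({"node0": [1.0, 2.0, 3.0]})
    fit_parameters_safe(df)
    print(df["node0"].dtype)  # float64: the caller's frame is untouched

Since pandas DataFrame.copy() defaults to deep=True, the one-line change is enough to isolate the caller's data from whatever transformations parameter learning applies internally.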