diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml deleted file mode 100644 index af032f6..0000000 --- a/.github/workflows/black.yml +++ /dev/null @@ -1,11 +0,0 @@ -name: black-action -on: [push, pull_request] -jobs: - linter_name: - name: Run black formatter - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: rickstaa/action-black@v1.3.3 - with: - black_args: "." \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 64749c8..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: ci -on: - push: - branches: - - master - - main -permissions: - contents: write -jobs: - deploy: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Configure Git Credentials - run: | - git config user.name github-actions[bot] - git config user.email 41898282+github-actions[bot]@users.noreply.github.com - - uses: actions/setup-python@v5 - with: - python-version: 3.x - - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV - - uses: actions/cache@v4 - with: - key: mkdocs-material-${{ env.cache_id }} - path: .cache - restore-keys: | - mkdocs-material- - - run: pip install mkdocs-material 'mkdocstrings[python]' - - run: mkdocs gh-deploy --force \ No newline at end of file diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..2efbd82 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,39 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.11" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/source/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +# python: +# install: +# - requirements: docs/requirements.txt + +python: + install: + - requirements: other_requirements/readthedocs.txt diff --git a/bamt/builders/__init__.py b/bamt/builders/__init__.py new file mode 100644 index 0000000..d886ad6 --- /dev/null +++ b/bamt/builders/__init__.py @@ -0,0 +1 @@ +__all__ = ["builders_base", "evo_builder", "hc_builder"] diff --git a/bamt/builders/builders_base.py b/bamt/builders/builders_base.py new file mode 100644 index 0000000..e04bbff --- /dev/null +++ b/bamt/builders/builders_base.py @@ -0,0 +1,222 @@ +import itertools +from typing import Dict, List, Optional, Tuple, Callable, TypedDict, Sequence, Union + +from pandas import DataFrame + +from bamt.log import logger_builder +from bamt.nodes.conditional_gaussian_node import ConditionalGaussianNode +from bamt.nodes.conditional_logit_node import ConditionalLogitNode +from bamt.nodes.conditional_mixture_gaussian_node import ConditionalMixtureGaussianNode +from bamt.nodes.discrete_node import DiscreteNode +from bamt.nodes.gaussian_node import GaussianNode +from bamt.nodes.logit_node import 
LogitNode +from bamt.nodes.mixture_gaussian_node import MixtureGaussianNode +from bamt.utils import GraphUtils as gru + + +class ParamDict(TypedDict, total=False): + init_edges: Optional[Sequence[str]] + init_nodes: Optional[List[str]] + remove_init_edges: bool + white_list: Optional[Tuple[str, str]] + bl_add: Optional[List[str]] + + +class StructureBuilder(object): + """ + Base Class for Structure Builder. + It can restrict nodes defined by RESTRICTIONS + """ + + def __init__(self, descriptor: Dict[str, Dict[str, str]]): + """ + :param descriptor: a dict with types and signs of nodes + Attributes: + black_list: a list with restricted connections; + """ + self.skeleton = {"V": [], "E": []} + self.descriptor = descriptor + + self.has_logit = bool + + self.black_list = None + + def restrict( + self, + data: DataFrame, + init_nodes: Optional[List[str]], + bl_add: Optional[List[str]], + ): + """ + :param data: data to deal with + :param init_nodes: nodes to begin with (thus they have no parents) + :param bl_add: additional vertices + """ + node_type = self.descriptor["types"] + blacklist = [] + datacol = data.columns.to_list() + + if not self.has_logit: + # Has_logit flag allows BN building edges between cont and disc + RESTRICTIONS = [("cont", "disc"), ("cont", "disc_num")] + for x, y in itertools.product(datacol, repeat=2): + if x != y: + if (node_type[x], node_type[y]) in RESTRICTIONS: + blacklist.append((x, y)) + else: + self.black_list = [] + if init_nodes: + blacklist += [(x, y) for x in datacol for y in init_nodes if x != y] + if bl_add: + blacklist = blacklist + bl_add + self.black_list = blacklist + + def get_family(self): + """ + A function that updates a skeleton; + """ + if not self.skeleton["V"]: + logger_builder.error("Vertex list is None") + return None + if not self.skeleton["E"]: + logger_builder.error("Edges list is None") + return None + for node_instance in self.skeleton["V"]: + node = node_instance.name + children = [] + parents = [] + for edge in self.skeleton["E"]: + if node in edge: + if edge.index(node) == 0: + children.append(edge[1]) + if edge.index(node) == 1: + parents.append(edge[0]) + + disc_parents = [] + cont_parents = [] + for parent in parents: + if self.descriptor["types"][parent] in ["disc", "disc_num"]: + disc_parents.append(parent) + else: + cont_parents.append(parent) + + id = self.skeleton["V"].index(node_instance) + self.skeleton["V"][id].disc_parents = disc_parents + self.skeleton["V"][id].cont_parents = cont_parents + self.skeleton["V"][id].children = children + + ordered = gru.toporder(self.skeleton["V"], self.skeleton["E"]) + not_ordered = [node.name for node in self.skeleton["V"]] + mask = [not_ordered.index(name) for name in ordered] + self.skeleton["V"] = [self.skeleton["V"][i] for i in mask] + + +class VerticesDefiner(StructureBuilder): + """ + Main class for defining vertices + """ + + def __init__( + self, descriptor: Dict[str, Dict[str, str]], regressor: Optional[object] + ): + """ + Automatically creates a list of nodes + """ + super(VerticesDefiner, self).__init__(descriptor=descriptor) + # Notice that vertices are used only by Builders + self.vertices = [] + + node = None + # LEVEL 1: Define a general type of node: Discrete or Gaussian + for vertex, type in self.descriptor["types"].items(): + if type in ["disc_num", "disc"]: + node = DiscreteNode(name=vertex) + elif type == "cont": + node = GaussianNode(name=vertex, regressor=regressor) + else: + msg = f"""First stage of automatic vertex detection failed on {vertex} due TypeError 
({type}). + Set vertex manually (by calling set_nodes()) or investigate the error.""" + logger_builder.error(msg) + continue + + self.vertices.append(node) + + def overwrite_vertex( + self, + has_logit: bool, + use_mixture: bool, + classifier: Optional[Callable], + regressor: Optional[Callable], + ): + """ + Level 2: Redefined nodes according structure (parents) + :param classifier: an object to pass into logit, condLogit nodes + :param regressor: an object to pass into gaussian nodes + :param has_logit allows edges from cont to disc nodes + :param use_mixture allows using Mixture + """ + for node_instance in self.vertices: + node = node_instance + if has_logit: + if "Discrete" in node_instance.type: + if node_instance.cont_parents: + if not node_instance.disc_parents: + node = LogitNode( + name=node_instance.name, classifier=classifier + ) + + elif node_instance.disc_parents: + node = ConditionalLogitNode( + name=node_instance.name, classifier=classifier + ) + + if use_mixture: + if "Gaussian" in node_instance.type: + if not node_instance.disc_parents: + node = MixtureGaussianNode(name=node_instance.name) + elif node_instance.disc_parents: + node = ConditionalMixtureGaussianNode(name=node_instance.name) + else: + continue + else: + if "Gaussian" in node_instance.type: + if node_instance.disc_parents: + node = ConditionalGaussianNode( + name=node_instance.name, regressor=regressor + ) + else: + continue + + if node_instance == node: + continue + + id = self.skeleton["V"].index(node_instance) + node.disc_parents = node_instance.disc_parents + node.cont_parents = node_instance.cont_parents + node.children = node_instance.children + self.skeleton["V"][id] = node + + +class EdgesDefiner(StructureBuilder): + def __init__(self, descriptor: Dict[str, Dict[str, str]]): + super(EdgesDefiner, self).__init__(descriptor) + + +class BaseDefiner(VerticesDefiner, EdgesDefiner): + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + scoring_function: Union[Tuple[str, Callable], Tuple[str]], + regressor: Optional[object] = None, + ): + self.scoring_function = scoring_function + self.params = { + "init_edges": None, + "init_nodes": None, + "remove_init_edges": True, + "white_list": None, + "bl_add": None, + } + super().__init__(descriptor, regressor=regressor) + self.optimizer = None # will be defined in subclasses diff --git a/bamt/builders/composite_builder.py b/bamt/builders/composite_builder.py new file mode 100644 index 0000000..764363a --- /dev/null +++ b/bamt/builders/composite_builder.py @@ -0,0 +1,293 @@ +from datetime import timedelta +from typing import Dict, Optional, List, Tuple, Callable + +from golem.core.adapter import DirectAdapter +from golem.core.dag.verification_rules import has_no_cycle, has_no_self_cycled_nodes +from golem.core.log import Log +from golem.core.optimisers.genetic.gp_optimizer import EvoGraphOptimizer +from golem.core.optimisers.genetic.gp_params import GPAlgorithmParameters +from golem.core.optimisers.genetic.operators.crossover import CrossoverTypesEnum +from golem.core.optimisers.genetic.operators.inheritance import GeneticSchemeTypesEnum +from golem.core.optimisers.genetic.operators.selection import SelectionTypesEnum +from golem.core.optimisers.objective import Objective, ObjectiveEvaluate +from golem.core.optimisers.optimization_parameters import GraphRequirements +from golem.core.optimisers.optimizer import GraphGenerationParams +from pandas import DataFrame +from sklearn import preprocessing + +import bamt.preprocessors as pp +from 
bamt.builders.builders_base import VerticesDefiner, EdgesDefiner +from bamt.log import logger_builder +from bamt.nodes.composite_continuous_node import CompositeContinuousNode +from bamt.nodes.composite_discrete_node import CompositeDiscreteNode +from bamt.nodes.discrete_node import DiscreteNode +from bamt.nodes.gaussian_node import GaussianNode +from bamt.utils import EvoUtils as evo +from bamt.utils.composite_utils import CompositeGeneticOperators +from bamt.utils.composite_utils.CompositeModel import CompositeModel, CompositeNode + + +class CompositeDefiner(VerticesDefiner, EdgesDefiner): + """ + Object that might take additional methods to decompose structure builder class + """ + + def __init__( + self, + descriptor: Dict[str, Dict[str, str]], + regressor: Optional[object] = None, + ): + super().__init__(descriptor, regressor) + + # Notice that vertices are used only by Builders + self.vertices = [] + + # LEVEL 1: Define a general type of node: Discrete or Сontinuous + for vertex, type in self.descriptor["types"].items(): + if type in ["disc_num", "disc"]: + node = CompositeDiscreteNode(name=vertex) + elif type == "cont": + node = CompositeContinuousNode(name=vertex, regressor=regressor) + else: + msg = f"""First stage of automatic vertex detection failed on {vertex} due TypeError ({type}). + Set vertex manually (by calling set_nodes()) or investigate the error.""" + logger_builder.error(msg) + continue + + self.vertices.append(node) + + +class CompositeStructureBuilder(CompositeDefiner): + """ + This class uses an evolutionary algorithm based on GOLEM to generate a Directed Acyclic Graph (DAG) that represents + the structure of a Composite Bayesian Network. + + Attributes: + data (DataFrame): Input data used to build the structure. + descriptor (dict): Descriptor describing node types and signs. 
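+        regressor (object): A regression model for continuous nodes.
+
+    Example:
+        A minimal usage sketch; ``df`` and ``descriptor`` are assumed to be a
+        prepared DataFrame and its type descriptor (neither is defined in this
+        module):
+
+            builder = CompositeStructureBuilder(data=df, descriptor=descriptor, regressor=None)
+            builder.build(data=df, classifier=None, regressor=None)
+            edges = builder.skeleton["E"]  # learned edge list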
+ """ + + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + regressor: Optional[object], + ): + super(CompositeStructureBuilder, self).__init__( + descriptor=descriptor, regressor=regressor + ) + self.data = data + self.parent_models_dict = {} + self.descriptor = descriptor + self.regressor = regressor + self.params = { + "init_edges": None, + "init_nodes": None, + "remove_init_edges": True, + "white_list": None, + "bl_add": None, + } + self.default_n_jobs = -1 + self.default_pop_size = 15 + self.default_crossover_prob = 0.9 + self.default_mutation_prob = 0.8 + self.default_max_arity = 100 + self.default_max_depth = 100 + self.default_timeout = 180 + self.default_num_of_generations = 50 + self.default_early_stopping_iterations = 50 + self.objective_metric = CompositeGeneticOperators.composite_metric + self.default_crossovers = [ + CrossoverTypesEnum.exchange_edges, + CrossoverTypesEnum.exchange_parents_one, + CrossoverTypesEnum.exchange_parents_both, + CompositeGeneticOperators.custom_crossover_all_model, + ] + self.default_mutations = [ + CompositeGeneticOperators.custom_mutation_add_structure, + CompositeGeneticOperators.custom_mutation_delete_structure, + CompositeGeneticOperators.custom_mutation_reverse_structure, + CompositeGeneticOperators.custom_mutation_add_model, + ] + self.default_selection = [SelectionTypesEnum.tournament] + self.default_constraints = [ + has_no_self_cycled_nodes, + has_no_cycle, + evo.has_no_duplicates, + ] + self.verbose = True + self.logging_level = 50 + + def overwrite_vertex( + self, + regressor: Optional[Callable], + ): + for node_instance in self.vertices: + node = node_instance + if ( + len(node_instance.cont_parents + node_instance.disc_parents) < 1 + and type(node_instance).__name__ == "CompositeContinuousNode" + ): + node = GaussianNode(name=node_instance.name, regressor=regressor) + elif ( + len(node_instance.cont_parents + node_instance.disc_parents) < 1 + and type(node_instance).__name__ == "CompositeDiscreteNode" + ): + node = DiscreteNode(name=node_instance.name) + else: + continue + id_node = self.skeleton["V"].index(node_instance) + node.disc_parents = node_instance.disc_parents + node.cont_parents = node_instance.cont_parents + node.children = node_instance.children + self.skeleton["V"][id_node] = node + + def build( + self, + data: DataFrame, + classifier: Optional[object], + regressor: Optional[object], + **kwargs, + ): + """ + Calls the search method to execute all the evolutionary computations. + + Args: + data (DataFrame): The data from which to build the structure. + classifier (Optional[object]): A classification model for discrete nodes. + regressor (Optional[object]): A regression model for continuous nodes. + """ + best_graph_edge_list, parent_models = self.search(data, **kwargs) + + # Convert the best graph to the format used by the Bayesian Network + self.skeleton["V"] = self.vertices + self.skeleton["E"] = best_graph_edge_list + self.parent_models_dict = parent_models + + self.get_family() + self.overwrite_vertex(regressor=regressor) + + def search(self, data: DataFrame, **kwargs) -> [List[Tuple[str, str]], Dict]: + """ + Executes all the evolutionary computations and returns the best graph's edge list. + + Args: + data (DataFrame): The data from which to build the structure. + + Returns: + best_graph_edge_list (List[Tuple[str, str]]): The edge list of the best graph found by the search. 
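+            parent_models (Dict): Mapping from node name to the parent model
+                assigned to that node in the best graph.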
+ """ + # Get the list of node names + vertices = list(data.columns) + + encoder = preprocessing.LabelEncoder() + p = pp.Preprocessor([("encoder", encoder)]) + preprocessed_data, _ = p.apply(data) + + # Create the initial population + initial = kwargs.get( + "custom_initial_structure", + [ + CompositeModel( + nodes=[ + CompositeNode( + nodes_from=None, + content={ + "name": vertex, + "type": p.nodes_types[vertex], + "parent_model": None, + }, + ) + for vertex in vertices + ] + ) + ], + ) + + # Define the requirements for the evolutionary algorithm + requirements = GraphRequirements( + max_arity=kwargs.get("max_arity", self.default_max_arity), + max_depth=kwargs.get("max_depth", self.default_max_depth), + num_of_generations=kwargs.get( + "num_of_generations", self.default_num_of_generations + ), + timeout=timedelta(minutes=kwargs.get("timeout", self.default_timeout)), + early_stopping_iterations=kwargs.get( + "early_stopping_iterations", self.default_early_stopping_iterations + ), + n_jobs=kwargs.get("n_jobs", self.default_n_jobs), + ) + + # Set the parameters for the evolutionary algorithm + optimizer_parameters = GPAlgorithmParameters( + pop_size=kwargs.get("pop_size", self.default_pop_size), + crossover_prob=kwargs.get("crossover_prob", self.default_crossover_prob), + mutation_prob=kwargs.get("mutation_prob", self.default_mutation_prob), + genetic_scheme_type=GeneticSchemeTypesEnum.steady_state, + mutation_types=kwargs.get("custom_mutations", self.default_mutations), + crossover_types=kwargs.get("custom_crossovers", self.default_crossovers), + selection_types=kwargs.get("selection_type", self.default_selection), + ) + + # Set the adapter for the conversion between the graph and the data + # structures used by the optimizer + adapter = DirectAdapter( + base_graph_class=CompositeModel, base_node_class=CompositeNode + ) + + # Set the constraints for the graph + + constraints = kwargs.get("custom_constraints", []) + + constraints.extend(self.default_constraints) + + if kwargs.get("blacklist", None) is not None: + constraints.append(evo.has_no_blacklist_edges) + if kwargs.get("whitelist", None) is not None: + constraints.append(evo.has_only_whitelist_edges) + + graph_generation_params = GraphGenerationParams( + adapter=adapter, rules_for_constraint=constraints + ) + + # Define the objective function to optimize + objective = Objective( + {"custom": kwargs.get("custom_metric", self.objective_metric)} + ) + + # Initialize the optimizer + optimizer = EvoGraphOptimizer( + objective=objective, + initial_graphs=initial, + requirements=requirements, + graph_generation_params=graph_generation_params, + graph_optimizer_params=optimizer_parameters, + ) + + # Define the function to evaluate the objective function + objective_eval = ObjectiveEvaluate(objective, data=preprocessed_data) + + if not kwargs.get("verbose", self.verbose): + Log().reset_logging_level(logging_level=50) + + # Run the optimization + optimized_graph = optimizer.optimise(objective_eval)[0] + + parent_models = self._get_parent_models(optimized_graph) + + # Get the best graph + best_graph_edge_list = optimized_graph.operator.get_edges() + best_graph_edge_list = self._convert_to_strings(best_graph_edge_list) + + return best_graph_edge_list, parent_models + + @staticmethod + def _convert_to_strings(nested_list): + return [[str(item) for item in inner_list] for inner_list in nested_list] + + @staticmethod + def _get_parent_models(graph): + parent_models = {} + for node in graph.nodes: + parent_models[node.content["name"]] = 
node.content["parent_model"] + return parent_models diff --git a/bamt/builders/evo_builder.py b/bamt/builders/evo_builder.py new file mode 100644 index 0000000..2921347 --- /dev/null +++ b/bamt/builders/evo_builder.py @@ -0,0 +1,231 @@ +from datetime import timedelta +from typing import Dict, Optional, List, Tuple + +from golem.core.adapter import DirectAdapter +from golem.core.dag.verification_rules import has_no_cycle, has_no_self_cycled_nodes +from golem.core.log import Log +from golem.core.optimisers.genetic.gp_optimizer import EvoGraphOptimizer +from golem.core.optimisers.genetic.gp_params import GPAlgorithmParameters +from golem.core.optimisers.genetic.operators.crossover import CrossoverTypesEnum +from golem.core.optimisers.genetic.operators.inheritance import GeneticSchemeTypesEnum +from golem.core.optimisers.genetic.operators.selection import SelectionTypesEnum +from golem.core.optimisers.objective import Objective, ObjectiveEvaluate +from golem.core.optimisers.optimization_parameters import GraphRequirements +from golem.core.optimisers.optimizer import GraphGenerationParams +from pandas import DataFrame + +from bamt.builders.builders_base import BaseDefiner +from bamt.utils import EvoUtils as evo + + +class EvoDefiner(BaseDefiner): + """ + Object that might take additional methods to decompose structure builder class + """ + + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + regressor: Optional[object] = None, + ): + super().__init__(data, descriptor, regressor) + + +class EvoStructureBuilder(EvoDefiner): + """ + This class uses an evolutionary algorithm based on GOLEM to generate a Directed Acyclic Graph (DAG) that represents + the structure of a Bayesian Network. + + Attributes: + data (DataFrame): Input data used to build the structure. + descriptor (dict): Descriptor describing node types and signs. + regressor (object): A regression model for continuous nodes. + has_logit (bool): Indicates whether a logit link function should be used. + use_mixture (bool): Indicates whether a mixture model should be used. 
+ """ + + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + regressor: Optional[object], + has_logit: bool, + use_mixture: bool, + ): + super(EvoStructureBuilder, self).__init__( + data=data, descriptor=descriptor, regressor=regressor + ) + self.data = data + self.descriptor = descriptor + self.has_logit = has_logit + self.use_mixture = use_mixture + self.regressor = regressor + self.params = { + "init_edges": None, + "init_nodes": None, + "remove_init_edges": True, + "white_list": None, + "bl_add": None, + } + self.default_n_jobs = -1 + self.default_pop_size = 15 + self.default_crossover_prob = 0.9 + self.default_mutation_prob = 0.8 + self.default_max_arity = 100 + self.default_max_depth = 100 + self.default_timeout = 180 + self.default_num_of_generations = 50 + self.default_early_stopping_iterations = 50 + self.logging_level = 50 + self.objective_metric = evo.K2_metric + self.default_crossovers = [ + CrossoverTypesEnum.exchange_edges, + CrossoverTypesEnum.exchange_parents_one, + CrossoverTypesEnum.exchange_parents_both, + ] + self.default_mutations = [ + evo.custom_mutation_add, + evo.custom_mutation_delete, + evo.custom_mutation_reverse, + ] + self.default_selection = [SelectionTypesEnum.tournament] + self.default_constraints = [ + has_no_self_cycled_nodes, + has_no_cycle, + evo.has_no_duplicates, + ] + self.verbose = True + + def build( + self, + data: DataFrame, + classifier: Optional[object], + regressor: Optional[object], + **kwargs + ): + """ + Calls the search method to execute all the evolutionary computations. + + Args: + data (DataFrame): The data from which to build the structure. + classifier (Optional[object]): A classification model for discrete nodes. + regressor (Optional[object]): A regression model for continuous nodes. + """ + best_graph_edge_list = self.search(data, **kwargs) + + # Convert the best graph to the format used by the Bayesian Network + self.skeleton["V"] = self.vertices + self.skeleton["E"] = best_graph_edge_list + + self.get_family() + self.overwrite_vertex( + has_logit=self.has_logit, + use_mixture=self.use_mixture, + classifier=classifier, + regressor=regressor, + ) + + def search(self, data: DataFrame, **kwargs) -> List[Tuple[str, str]]: + """ + Executes all the evolutionary computations and returns the best graph's edge list. + + Args: + data (DataFrame): The data from which to build the structure. + + Returns: + best_graph_edge_list (List[Tuple[str, str]]): The edge list of the best graph found by the search. 
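+
+        Note:
+            Optional keyword arguments read by this method include ``pop_size``,
+            ``timeout`` (in minutes), ``num_of_generations``,
+            ``early_stopping_iterations``, ``custom_mutations``,
+            ``custom_crossovers``, ``custom_metric``, ``blacklist``,
+            ``whitelist`` and ``verbose``; unspecified values fall back to the
+            defaults set in ``__init__``.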
+ """ + # Get the list of node names + nodes_types = data.columns.to_list() + + # Create the initial population + initial = [ + evo.CustomGraphModel( + nodes=kwargs.get( + "init_nodes", + [evo.CustomGraphNode(node_type) for node_type in nodes_types], + ) + ) + ] + + # Define the requirements for the evolutionary algorithm + requirements = GraphRequirements( + max_arity=kwargs.get("max_arity", self.default_max_arity), + max_depth=kwargs.get("max_depth", self.default_max_depth), + num_of_generations=kwargs.get( + "num_of_generations", self.default_num_of_generations + ), + timeout=timedelta(minutes=kwargs.get("timeout", self.default_timeout)), + early_stopping_iterations=kwargs.get( + "early_stopping_iterations", self.default_early_stopping_iterations + ), + n_jobs=kwargs.get("n_jobs", self.default_n_jobs), + ) + + # Set the parameters for the evolutionary algorithm + optimizer_parameters = GPAlgorithmParameters( + pop_size=kwargs.get("pop_size", self.default_pop_size), + crossover_prob=kwargs.get("crossover_prob", self.default_crossover_prob), + mutation_prob=kwargs.get("mutation_prob", self.default_mutation_prob), + genetic_scheme_type=GeneticSchemeTypesEnum.steady_state, + mutation_types=kwargs.get("custom_mutations", self.default_mutations), + crossover_types=kwargs.get("custom_crossovers", self.default_crossovers), + selection_types=kwargs.get("selection_type", self.default_selection), + ) + + # Set the adapter for the conversion between the graph and the data + # structures used by the optimizer + adapter = DirectAdapter( + base_graph_class=evo.CustomGraphModel, base_node_class=evo.CustomGraphNode + ) + + # Set the constraints for the graph + + constraints = kwargs.get("custom_constraints", []) + + constraints.extend(self.default_constraints) + + if kwargs.get("blacklist", None) is not None: + constraints.append(evo.has_no_blacklist_edges) + if kwargs.get("whitelist", None) is not None: + constraints.append(evo.has_only_whitelist_edges) + + graph_generation_params = GraphGenerationParams( + adapter=adapter, + rules_for_constraint=constraints, + available_node_types=nodes_types, + ) + + # Define the objective function to optimize + objective = Objective( + {"custom": kwargs.get("custom_metric", self.objective_metric)} + ) + + # Initialize the optimizer + optimizer = EvoGraphOptimizer( + objective=objective, + initial_graphs=initial, + requirements=requirements, + graph_generation_params=graph_generation_params, + graph_optimizer_params=optimizer_parameters, + ) + + # Define the function to evaluate the objective function + objective_eval = ObjectiveEvaluate(objective, data=data) + + if not kwargs.get("verbose", self.verbose): + Log().reset_logging_level(logging_level=50) + + # Run the optimization + optimized_graph = optimizer.optimise(objective_eval)[0] + + # Get the best graph + best_graph_edge_list = optimized_graph.operator.get_edges() + best_graph_edge_list = self._convert_to_strings(best_graph_edge_list) + + return best_graph_edge_list + + @staticmethod + def _convert_to_strings(nested_list): + return [tuple([str(item) for item in inner_list]) for inner_list in nested_list] diff --git a/bamt/builders/hc_builder.py b/bamt/builders/hc_builder.py new file mode 100644 index 0000000..3f92e14 --- /dev/null +++ b/bamt/builders/hc_builder.py @@ -0,0 +1,208 @@ +from typing import Dict, List, Optional, Tuple, Callable, Union + +from pandas import DataFrame +from pgmpy.base import DAG +from pgmpy.estimators import HillClimbSearch + +from bamt.builders.builders_base import ParamDict, 
BaseDefiner +from bamt.log import logger_builder +from bamt.redef_HC import hc as hc_method +from bamt.utils import GraphUtils as gru + + +class HillClimbDefiner(BaseDefiner): + """ + Object to define structure and pass it into skeleton + """ + + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + scoring_function: Union[Tuple[str, Callable], Tuple[str]], + regressor: Optional[object] = None, + ): + super().__init__(data, descriptor, scoring_function, regressor) + self.optimizer = HillClimbSearch(data) + + def apply_K2( + self, + data: DataFrame, + init_edges: Optional[List[Tuple[str, str]]], + progress_bar: bool, + remove_init_edges: bool, + white_list: Optional[List[Tuple[str, str]]], + ): + """ + :param init_edges: list of tuples, a graph to start learning with + :param remove_init_edges: allows changes in a model defined by user + :param data: user's data + :param progress_bar: verbose regime + :param white_list: list of allowed edges + """ + if not all([i in ["disc", "disc_num"] for i in gru.nodes_types(data).values()]): + logger_builder.error( + f"K2 deals only with discrete data. Continuous data: {[col for col, type in gru.nodes_types(data).items() if type not in ['disc', 'disc_num']]}" + ) + return None + + if len(self.scoring_function) != 2: + from pgmpy.estimators import K2Score + + scoring_function = K2Score + else: + scoring_function = self.scoring_function[1] + + if not init_edges: + best_model = self.optimizer.estimate( + scoring_method=scoring_function(data), + black_list=self.black_list, + white_list=white_list, + show_progress=progress_bar, + ) + else: + if remove_init_edges: + startdag = DAG() + nodes = [str(v) for v in self.vertices] + startdag.add_nodes_from(nodes=nodes) + startdag.add_edges_from(ebunch=init_edges) + best_model = self.optimizer.estimate( + black_list=self.black_list, + white_list=white_list, + start_dag=startdag, + show_progress=False, + ) + else: + best_model = self.optimizer.estimate( + black_list=self.black_list, + white_list=white_list, + fixed_edges=init_edges, + show_progress=False, + ) + + structure = [list(x) for x in list(best_model.edges())] + self.skeleton["E"] = structure + + def apply_group1( + self, + data: DataFrame, + progress_bar: bool, + init_edges: Optional[List[Tuple[str, str]]], + remove_init_edges: bool, + white_list: Optional[List[Tuple[str, str]]], + ): + """ + This method implements the group of scoring functions. + Group: + "MI" - Mutual Information, + "LL" - Log Likelihood, + "BIC" - Bayesian Information Criteria, + "AIC" - Akaike information Criteria. 
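+
+        :param data: user's data
+        :param progress_bar: verbose regime
+        :param init_edges: list of tuples, a graph to start learning with
+        :param remove_init_edges: allows changes in a model defined by user
+        :param white_list: list of allowed edges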
+ """ + column_name_dict = dict([(n.name, i) for i, n in enumerate(self.vertices)]) + blacklist_new = [] + for pair in self.black_list: + blacklist_new.append((column_name_dict[pair[0]], column_name_dict[pair[1]])) + if white_list: + white_list_old = white_list[:] + white_list = [] + for pair in white_list_old: + white_list.append( + (column_name_dict[pair[0]], column_name_dict[pair[1]]) + ) + if init_edges: + init_edges_old = init_edges[:] + init_edges = [] + for pair in init_edges_old: + init_edges.append( + (column_name_dict[pair[0]], column_name_dict[pair[1]]) + ) + + bn = hc_method( + data, + metric=self.scoring_function[0], + restriction=white_list, + init_edges=init_edges, + remove_geo_edges=remove_init_edges, + black_list=blacklist_new, + debug=progress_bar, + ) + structure = [] + nodes = sorted(list(bn.nodes())) + for rv in nodes: + for pa in bn.F[rv]["parents"]: + structure.append( + [ + list(column_name_dict.keys())[ + list(column_name_dict.values()).index(pa) + ], + list(column_name_dict.keys())[ + list(column_name_dict.values()).index(rv) + ], + ] + ) + self.skeleton["E"] = structure + + +class HCStructureBuilder(HillClimbDefiner): + """ + Final object with build method + """ + + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + scoring_function: Tuple[str, Callable], + regressor: Optional[object], + has_logit: bool, + use_mixture: bool, + ): + """ + :param data: train data + :param descriptor: map for data + """ + + super(HCStructureBuilder, self).__init__( + descriptor=descriptor, + data=data, + scoring_function=scoring_function, + regressor=regressor, + ) + self.use_mixture = use_mixture + self.has_logit = has_logit + + def build( + self, + data: DataFrame, + progress_bar: bool, + classifier: Optional[object], + regressor: Optional[object], + params: Optional[ParamDict] = None, + **kwargs, + ): + if params: + for param, value in params.items(): + self.params[param] = value + + init_nodes = self.params.pop("init_nodes") + bl_add = self.params.pop("bl_add") + + # Level 1 + self.skeleton["V"] = self.vertices + + self.restrict(data, init_nodes, bl_add) + if self.scoring_function[0] == "K2": + self.apply_K2(data=data, progress_bar=progress_bar, **self.params) + elif self.scoring_function[0] in ["MI", "LL", "BIC", "AIC"]: + self.apply_group1(data=data, progress_bar=progress_bar, **self.params) + + # Level 2 + + self.get_family() + self.overwrite_vertex( + has_logit=self.has_logit, + use_mixture=self.use_mixture, + classifier=classifier, + regressor=regressor, + ) diff --git a/bamt/core/graph/dag.py b/bamt/core/graph/dag.py deleted file mode 100644 index 03a1770..0000000 --- a/bamt/core/graph/dag.py +++ /dev/null @@ -1,24 +0,0 @@ -from .graph import Graph - - -class DirectedAcyclicGraph(Graph): - def __init__(self): - super().__init__() - - def add_edge(self): - pass - - def remove_edge(self): - pass - - def get_parents(self): - pass - - def get_children(self): - pass - - def get_edges(self): - pass - - def get_nodes(self): - pass diff --git a/bamt/core/graph/graph.py b/bamt/core/graph/graph.py deleted file mode 100644 index c2628fc..0000000 --- a/bamt/core/graph/graph.py +++ /dev/null @@ -1,30 +0,0 @@ -from abc import ABC, abstractmethod - - -class Graph(ABC): - def __init__(self): - pass - - @abstractmethod - def add_edge(self): - pass - - @abstractmethod - def remove_edge(self): - pass - - @abstractmethod - def get_parents(self): - pass - - @abstractmethod - def get_children(self): - pass - - @abstractmethod - def get_edges(self): - pass - - 
@abstractmethod - def get_nodes(self): - pass diff --git a/bamt/core/node_models/classifier.py b/bamt/core/node_models/classifier.py deleted file mode 100644 index afe9df4..0000000 --- a/bamt/core/node_models/classifier.py +++ /dev/null @@ -1,25 +0,0 @@ -from .prediction_model import PredictionModel - -import numpy as np - - -class Classifier(PredictionModel): - def __init__(self, classifier=None, **parameters): - self._classifier = classifier - self._parameters = parameters - - def fit(self, X: np.ndarray, y: np.ndarray) -> None: - if self._classifier is None: - # TODO: implement an algorithm that finds a classifier and fits it with chosen parameters - pass - else: - self._classifier.fit(X, y) - - def predict(self, X: np.ndarray) -> np.ndarray: - return self._classifier.predict(X) - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - return self._classifier.predict_proba(X) - - def __str__(self): - return str(self._classifier) diff --git a/bamt/core/node_models/continuous_distribution.py b/bamt/core/node_models/continuous_distribution.py deleted file mode 100644 index 9d31e21..0000000 --- a/bamt/core/node_models/continuous_distribution.py +++ /dev/null @@ -1,22 +0,0 @@ -import numpy as np - -from .distribution import Distribution - - -class ContinuousDistribution(Distribution): - def __init__(self, distribution_model=None, **parameters): - self._distribution = distribution_model - self._parameters = parameters - - def fit(self, X: np.ndarray) -> None: - if self._distribution is None: - # TODO: implement an algorithm that finds a distribution and fits it with chosen parameters - pass - else: - pass - - def sample(self, num_samples: int) -> np.ndarray: - pass - - def __str__(self): - return str(self._distribution.name) + " continuous distribution" diff --git a/bamt/core/node_models/distribution.py b/bamt/core/node_models/distribution.py deleted file mode 100644 index ed3a747..0000000 --- a/bamt/core/node_models/distribution.py +++ /dev/null @@ -1,14 +0,0 @@ -from abc import ABC, abstractmethod - -import numpy as np - - -class Distribution(ABC): - - @abstractmethod - def fit(self, X: np.ndarray) -> None: - pass - - @abstractmethod - def sample(self, num_samples: int) -> np.ndarray: - pass diff --git a/bamt/core/node_models/empirical_distribution.py b/bamt/core/node_models/empirical_distribution.py deleted file mode 100644 index ec63673..0000000 --- a/bamt/core/node_models/empirical_distribution.py +++ /dev/null @@ -1,17 +0,0 @@ -import numpy as np - -from .distribution import Distribution - - -class EmpiricalDistribution(Distribution): - def __init__(self): - pass - - def fit(self, X: np.ndarray) -> None: - pass - - def sample(self, num_samples: int) -> np.ndarray: - pass - - def __str__(self): - return "Empirical Distribution" diff --git a/bamt/core/node_models/prediction_model.py b/bamt/core/node_models/prediction_model.py deleted file mode 100644 index c2b5df7..0000000 --- a/bamt/core/node_models/prediction_model.py +++ /dev/null @@ -1,20 +0,0 @@ -from abc import ABC, abstractmethod - -import numpy as np - - -class PredictionModel(ABC): - """Represents general prediction model implementations. 
Each prediction model should provide a fit and a predict - method.""" - - @abstractmethod - def fit(self, X: np.ndarray, Y: np.ndarray) -> None: - raise NotImplementedError - - @abstractmethod - def predict(self, X: np.ndarray) -> np.ndarray: - raise NotImplementedError - - @abstractmethod - def predict_proba(self, X: np.ndarray) -> np.ndarray: - raise NotImplementedError diff --git a/bamt/core/node_models/regressor.py b/bamt/core/node_models/regressor.py deleted file mode 100644 index d69941a..0000000 --- a/bamt/core/node_models/regressor.py +++ /dev/null @@ -1,25 +0,0 @@ -from .prediction_model import PredictionModel - -import numpy as np - - -class Regressor(PredictionModel): - def __init__(self, regressor=None, **parameters): - self._regressor = regressor - self._parameters = parameters - - def fit(self, X: np.ndarray, y: np.ndarray) -> None: - if self._regressor is None: - # TODO: implement an algorithm that finds a regressor and fits it with chosen parameters - pass - else: - self._regressor.fit(X, y) - - def predict(self, X: np.ndarray) -> np.ndarray: - return self._regressor.predict(X) - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - return self._regressor.predict_proba(X) - - def __str__(self): - return str(self._regressor) diff --git a/bamt/core/nodes/child_nodes/child_node.py b/bamt/core/nodes/child_nodes/child_node.py deleted file mode 100644 index fc1636d..0000000 --- a/bamt/core/nodes/child_nodes/child_node.py +++ /dev/null @@ -1,6 +0,0 @@ -from bamt.core.nodes.node import Node - - -class ChildNode(Node): - def __init__(self): - super().__init__() diff --git a/bamt/core/nodes/child_nodes/conditional_continuous_node.py b/bamt/core/nodes/child_nodes/conditional_continuous_node.py deleted file mode 100644 index 171b4a8..0000000 --- a/bamt/core/nodes/child_nodes/conditional_continuous_node.py +++ /dev/null @@ -1,7 +0,0 @@ -from .child_node import ChildNode - - -class ConditionalContinuousNode(ChildNode): - def __init__(self): - super().__init__() - self._model = None diff --git a/bamt/core/nodes/child_nodes/conditional_discrete_node.py b/bamt/core/nodes/child_nodes/conditional_discrete_node.py deleted file mode 100644 index 0a01b00..0000000 --- a/bamt/core/nodes/child_nodes/conditional_discrete_node.py +++ /dev/null @@ -1,7 +0,0 @@ -from .child_node import ChildNode - - -class ConditionalDiscreteNode(ChildNode): - def __init__(self): - super().__init__() - self._model = None diff --git a/bamt/core/nodes/node.py b/bamt/core/nodes/node.py deleted file mode 100644 index 6dc4140..0000000 --- a/bamt/core/nodes/node.py +++ /dev/null @@ -1,6 +0,0 @@ -from abc import ABC - - -class Node(ABC): - def __init__(self): - pass diff --git a/bamt/core/nodes/root_nodes/continuous_node.py b/bamt/core/nodes/root_nodes/continuous_node.py deleted file mode 100644 index f725000..0000000 --- a/bamt/core/nodes/root_nodes/continuous_node.py +++ /dev/null @@ -1,10 +0,0 @@ -from .root_node import RootNode - - -class ContinuousNode(RootNode): - def __init__(self): - super().__init__() - self._distribution = None - - def __str__(self): - return "Continuous Node with " + str(self._distribution) diff --git a/bamt/core/nodes/root_nodes/discrete_node.py b/bamt/core/nodes/root_nodes/discrete_node.py deleted file mode 100644 index d9b04c8..0000000 --- a/bamt/core/nodes/root_nodes/discrete_node.py +++ /dev/null @@ -1,6 +0,0 @@ -from .root_node import RootNode - - -class DiscreteNode(RootNode): - def __init__(self): - super().__init__() diff --git a/bamt/core/nodes/root_nodes/root_node.py 
b/bamt/core/nodes/root_nodes/root_node.py deleted file mode 100644 index faf7ff1..0000000 --- a/bamt/core/nodes/root_nodes/root_node.py +++ /dev/null @@ -1,6 +0,0 @@ -from bamt.core.nodes.node import Node - - -class RootNode(Node): - def __init__(self): - super().__init__() diff --git a/bamt/dag_optimizers/constraint/__init__.py b/bamt/dag_optimizers/constraint/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/constraint/constraint_dag_optimizer.py b/bamt/dag_optimizers/constraint/constraint_dag_optimizer.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/dag_optimizer.py b/bamt/dag_optimizers/dag_optimizer.py deleted file mode 100644 index b9c992f..0000000 --- a/bamt/dag_optimizers/dag_optimizer.py +++ /dev/null @@ -1,10 +0,0 @@ -from abc import ABC, abstractmethod - - -class DAGOptimizer(ABC): - def __init__(self): - pass - - @abstractmethod - def optimize(self): - pass diff --git a/bamt/dag_optimizers/hybrid/__init__.py b/bamt/dag_optimizers/hybrid/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/hybrid/hybrid_dag_optimizer.py b/bamt/dag_optimizers/hybrid/hybrid_dag_optimizer.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/score/__init__.py b/bamt/dag_optimizers/score/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/score/bigbravebn.py b/bamt/dag_optimizers/score/bigbravebn.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/score/golem_genetic.py b/bamt/dag_optimizers/score/golem_genetic.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/score/hill_climbing.py b/bamt/dag_optimizers/score/hill_climbing.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/score/lsevobn.py b/bamt/dag_optimizers/score/lsevobn.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/dag_optimizers/score/score_dag_optimizer.py b/bamt/dag_optimizers/score/score_dag_optimizer.py deleted file mode 100644 index a9ae0fb..0000000 --- a/bamt/dag_optimizers/score/score_dag_optimizer.py +++ /dev/null @@ -1,9 +0,0 @@ -from bamt.dag_optimizers.dag_optimizer import DAGOptimizer - - -class ScoreDAGOptimizer(DAGOptimizer): - def __init__(self): - super().__init__() - - def optimize(self): - pass diff --git a/bamt/display/__init__.py b/bamt/display/__init__.py new file mode 100644 index 0000000..a22b2c3 --- /dev/null +++ b/bamt/display/__init__.py @@ -0,0 +1,9 @@ +from .display import Display + + +def plot_(output, *args): + return Display(output).build(*args) + + +def get_info_(bn, as_df): + return Display(output=None).get_info(bn, as_df) diff --git a/bamt/display/display.py b/bamt/display/display.py new file mode 100644 index 0000000..2568fb7 --- /dev/null +++ b/bamt/display/display.py @@ -0,0 +1,175 @@ +import random +from typing import Dict + +import matplotlib +import matplotlib.pyplot as plt +import networkx as nx +import numpy as np +from pandas import DataFrame +from pyvis.network import Network + +from bamt.log import logger_display + + +class Display(object): + """Display object""" + + def __init__(self, output): + """ + :param output: output file name + """ + self.output = output + self.class2color = Dict + self.name2class = Dict + + @staticmethod + def _validate_dir(output): + """Format validation""" + if not output.endswith(".html"): + logger_display.error("This version allows only html format.") + return 
None + return True + + @staticmethod + def _make_network(nodes, edges, **kwargs): + """Make network and graph + :param nodes: nodes + :param edges: edges + :param kwargs: a dict passed to pyvis.network.Network + """ + g = nx.DiGraph() + network_params = dict( + height=kwargs.get("height", "800px"), + width=kwargs.get("width", "100%"), + notebook=kwargs.get("notebook", True), + directed=kwargs.get("directed", nx.is_directed(g)), + layout=kwargs.get("layout", "hierarchical"), + ) + + g.add_nodes_from(nodes) + g.add_edges_from(edges) + + network = Network(**network_params) + return network, g + + def _init_colors(self, nodes): + # Qualitative class of colormaps + q_classes = [ + "Pastel1", + "Pastel2", + "Paired", + "Accent", + "Dark2", + "Set1", + "Set2", + "Set3", + "tab10", + "tab20", + "tab20b", + "tab20c", + ] + + hex_colors = [] + for cls in q_classes: + rgb_colors = plt.get_cmap(cls).colors + hex_colors.extend( + [matplotlib.colors.rgb2hex(rgb_color) for rgb_color in rgb_colors] + ) + + class_number = len(set([node.type for node in nodes])) + hex_colors_indexes = [ + random.randint(0, len(hex_colors) - 1) for _ in range(class_number) + ] + + hex_colors = np.array(hex_colors) + + hex_colors_picked = hex_colors[hex_colors_indexes] + class2color = { + cls: color + for cls, color in zip(set([node.type for node in nodes]), hex_colors_picked) + } + name2class = {node.name: node.type for node in nodes} + self.class2color = class2color + self.name2class = name2class + + def build(self, nodes, edges, **kwargs): + """Build and show network + + :param nodes: nodes + :param edges: edges + :param kwargs: a dict passed to pyvis.network.Network.add_node + """ + if not self._validate_dir(self.output): + logger_display.error(f"Output error: {self.output}") + return None + + if not kwargs: + node_params = dict(font={"size": 36}, size=45) + else: + node_params = kwargs + + self._init_colors(nodes) + + nodes_names = [node.name for node in nodes] + + network, g = self._make_network(nodes_names, edges) + nodes_sorted = np.array(list(nx.topological_generations(g)), dtype=object) + + for level in range(len(nodes_sorted)): + for node_i in range(len(nodes_sorted[level])): + name = nodes_sorted[level][node_i] + cls = self.name2class[name] + color = self.class2color[cls] + network.add_node( + name, + label=name, + color=color, + level=level, + title=f"{name} ({cls})", + **node_params, + ) + + for edge in g.edges: + network.add_edge(edge[0], edge[1]) + + network.hrepulsion(node_distance=300, central_gravity=0.5) + + return network.show(self.output) + + @staticmethod + def get_info(bn, as_df): + if as_df: + names = [] + types_n = [] + types_d = [] + parents = [] + parents_types = [] + for n in bn.nodes: + names.append(n) + types_n.append(n.type) + types_d.append(bn.descriptor["types"][n.name]) + parents_types.append( + [ + bn.descriptor["types"][name] + for name in n.cont_parents + n.disc_parents + ] + ) + parents.append([name for name in n.cont_parents + n.disc_parents]) + return DataFrame( + { + "name": names, + "node_type": types_n, + "data_type": types_d, + "parents": parents, + "parents_types": parents_types, + } + ) + else: + for n in bn.nodes: + print( + f"{n.name: <20} | " + f"{n.type: <50} | " + f"{bn.descriptor['types'][n.name]: <10} | " + f"{str([bn.descriptor['types'][name] for name in n.cont_parents + n.disc_parents]): <50} | " + f"{str([name for name in n.cont_parents + n.disc_parents])}" + ) diff --git a/bamt/core/__init__.py b/bamt/external/__init__.py similarity index 100% rename from 
bamt/core/__init__.py rename to bamt/external/__init__.py diff --git a/bamt/external/pyBN/__init__.py b/bamt/external/pyBN/__init__.py new file mode 100644 index 0000000..321c5ec --- /dev/null +++ b/bamt/external/pyBN/__init__.py @@ -0,0 +1,2 @@ +from bamt.external.pyBN.classes import * +from bamt.external.pyBN.utils import * diff --git a/bamt/core/graph/__init__.py b/bamt/external/pyBN/classes/__init__.py similarity index 100% rename from bamt/core/graph/__init__.py rename to bamt/external/pyBN/classes/__init__.py diff --git a/bamt/core/node_models/__init__.py b/bamt/external/pyBN/classes/_tests/__init__.py similarity index 100% rename from bamt/core/node_models/__init__.py rename to bamt/external/pyBN/classes/_tests/__init__.py diff --git a/bamt/external/pyBN/classes/_tests/test_bayesnet.py b/bamt/external/pyBN/classes/_tests/test_bayesnet.py new file mode 100644 index 0000000..8beca5c --- /dev/null +++ b/bamt/external/pyBN/classes/_tests/test_bayesnet.py @@ -0,0 +1,184 @@ +""" +******** +UnitTest +BayesNet +******** + +Method Checks that +assertEqual(a, b) a == b +assertNotEqual(a, b) a != b +assertTrue(x) bool(x) is True +assertFalse(x) bool(x) is False +assertIs(a, b) a is b +assertIsNot(a, b) a is not b +assertIsNone(x) x is None +assertIsNotNone(x) x is not None +assertIn(a, b) a in b +assertNotIn(a, b) a not in b +assertIsInstance(a, b) isinstance(a, b) +assertNotIsInstance(a, b) not isinstance(a, b) + +assertAlmostEqual(a, b) round(a-b, 7) == 0 +assertNotAlmostEqual(a, b) round(a-b, 7) != 0 +assertGreater(a, b) a > b +assertGreaterEqual(a, b) a >= b +assertLess(a, b) a < b +assertLessEqual(a, b) a <= b + +assertListEqual(a, b) lists +assertTupleEqual(a, b) tuples +assertSetEqual(a, b) sets or frozensets +assertDictEqual(a, b) dicts + +""" +__author__ = """Nicholas Cullen """ + +import os +import unittest +from os.path import dirname + +from external.pyBN.classes.bayesnet import BayesNet +from external.pyBN.readwrite.read import read_bn + + +class BayesNetTestCase(unittest.TestCase): + def setUp(self): + self.bn = BayesNet() + self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + self.bn_bif = read_bn(os.path.join(self.dpath, "cancer.bif")) + self.bn_bn = read_bn(os.path.join(self.dpath, "cmu.bn")) + + def tearDown(self): + pass + + def test_isinstance(self): + self.assertIsInstance(self.bn, BayesNet) + + def test_V_bif(self): + self.assertListEqual( + self.bn_bif.V, ["Smoker", "Pollution", "Cancer", "Xray", "Dyspnoea"] + ) + + def test_E_bif(self): + self.assertDictEqual( + self.bn_bif.E, + { + "Cancer": ["Xray", "Dyspnoea"], + "Dyspnoea": [], + "Pollution": ["Cancer"], + "Smoker": ["Cancer"], + "Xray": [], + }, + ) + + def test_F_bif(self): + self.assertDictEqual( + self.bn_bif.F, + { + "Cancer": { + "cpt": [0.03, 0.97, 0.05, 0.95, 0.001, 0.999, 0.02, 0.98], + "parents": ["Pollution", "Smoker"], + "values": ["True", "False"], + }, + "Dyspnoea": { + "cpt": [0.65, 0.35, 0.3, 0.7], + "parents": ["Cancer"], + "values": ["True", "False"], + }, + "Pollution": { + "cpt": [0.9, 0.1], + "parents": [], + "values": ["low", "high"], + }, + "Smoker": { + "cpt": [0.3, 0.7], + "parents": [], + "values": ["True", "False"], + }, + "Xray": { + "cpt": [0.9, 0.1, 0.2, 0.8], + "parents": ["Cancer"], + "values": ["positive", "negative"], + }, + }, + ) + + def test_V_bn(self): + self.assertListEqual( + self.bn_bn.V, ["Burglary", "Earthquake", "Alarm", "JohnCalls", "MaryCalls"] + ) + + def test_E_bn(self): + self.assertDictEqual( + self.bn_bn.E, + { + "Alarm": ["JohnCalls", 
"MaryCalls"], + "Burglary": ["Alarm"], + "Earthquake": ["Alarm"], + "JohnCalls": [], + "MaryCalls": [], + }, + ) + + def test_F_bn(self): + self.assertDictEqual( + self.bn_bn.F, + { + "Alarm": { + "cpt": [0.999, 0.001, 0.71, 0.29, 0.06, 0.94, 0.05, 0.95], + "parents": ["Earthquake", "Burglary"], + "values": ["No", "Yes"], + }, + "Burglary": { + "cpt": [0.999, 0.001], + "parents": [], + "values": ["No", "Yes"], + }, + "Earthquake": { + "cpt": [0.998, 0.002], + "parents": [], + "values": ["No", "Yes"], + }, + "JohnCalls": { + "cpt": [0.95, 0.05, 0.1, 0.9], + "parents": ["Alarm"], + "values": ["No", "Yes"], + }, + "MaryCalls": { + "cpt": [0.99, 0.01, 0.3, 0.7], + "parents": ["Alarm"], + "values": ["No", "Yes"], + }, + }, + ) + + def test_nodes(self): + n = list(self.bn_bn.nodes()) + self.assertListEqual( + n, ["Burglary", "Earthquake", "Alarm", "JohnCalls", "MaryCalls"] + ) + + def test_cpt(self): + cpt = list(self.bn_bn.cpt("Alarm")) + self.assertListEqual(cpt, [0.999, 0.001, 0.71, 0.29, 0.06, 0.94, 0.05, 0.95]) + + def test_card(self): + self.assertEqual(self.bn_bn.card("Alarm"), 2) + + def test_scope(self): + self.assertListEqual( + self.bn_bn.scope("Alarm"), ["Alarm", "Earthquake", "Burglary"] + ) + + def test_parents(self): + self.assertListEqual(self.bn_bn.parents("Alarm"), ["Earthquake", "Burglary"]) + + def test_values(self): + self.assertListEqual(self.bn_bn.values("Alarm"), ["No", "Yes"]) + + def test_values_idx(self): + self.assertEqual(self.bn_bn.values("Alarm")[1], "Yes") + + +if __name__ == "__main__": + unittest.main(exit=False) diff --git a/bamt/external/pyBN/classes/bayesnet.py b/bamt/external/pyBN/classes/bayesnet.py new file mode 100644 index 0000000..88c6893 --- /dev/null +++ b/bamt/external/pyBN/classes/bayesnet.py @@ -0,0 +1,393 @@ +""" +************** +BayesNet Class +************** + +Overarching class for Discrete Bayesian Networks. + + +Design Specs +------------ + +- Bayesian Network - + + - F - + key: + - rv - + values: + - children - + - parents - + - values - + - cpt - + + - V - + - list of rvs - + + - E - + key: + - rv - + values: + - list of rv's children - +Notes +----- +- Edges can be inferred from Factorization, but Vertex values + must be specified. +""" + +__author__ = """Nicholas Cullen """ + +from copy import deepcopy + +import numpy as np + +from bamt.external.pyBN.utils.class_equivalence import are_class_equivalent +from bamt.external.pyBN.utils.graph import topsort + + +class BayesNet(object): + """ + Overarching class for Bayesian Networks + + """ + + def __init__(self, E=None, value_dict=None, file=None): + """ + Initialize the BayesNet class. + + Arguments + --------- + *V* : a list of strings - vertices in topsort order + *E* : a dict, where key = vertex, val = list of its children + *F* : a dict, + where key = rv, + val = another dict with + keys = + 'parents', + 'values', + 'cpt' + + *V* : a dict + + Notes + ----- + + """ + if file is not None: + import pyBN.io.read as ior + + bn = ior.read_bn(file) + self.V = bn.V + self.E = bn.E + self.F = bn.F + else: + if E is not None: + # assert (value_dict is not None), 'Must set values if E is set.' + self.set_structure(E, value_dict) + else: + self.V = [] + self.E = {} + self.F = {} + + def __eq__(self, y): + """ + Tests whether two Bayesian Networks are + equivalent - i.e. they contain the same + node/edge structure, and equality of + conditional probabilities. + """ + return are_class_equivalent(self, y) + + def __hash__(self): + """ + Allows BayesNet objects to be used + as keys in a dictionary (i.e. 
hashable) + """ + return hash((str(self.V), str(self.E))) + + def copy(self): + V = deepcopy(self.V) + E = deepcopy(self.E) + F = {} + for v in V: + F[v] = {} + F[v]["cpt"] = deepcopy(self.F[v]["cpt"]) + F[v]["parents"] = deepcopy(self.F[v]["parents"]) + F[v]["values"] = deepcopy(self.F[v]["values"]) + bn = BayesNet() + bn.V = V + bn.E = E + bn.F = F + + return bn + + def add_node(self, rv, cpt=[], parents=[], values=[]): + self.V.append(rv) + self.F[rv] = {"cpt": cpt, "parents": parents, "values": values} + + def add_edge(self, u, v): + if not self.has_node(u): + self.add_node(u) + if not self.has_node(v): + self.add_node(v) + if self.has_edge(u, v): + print("Edge already exists") + else: + self.E[u].append(v) + self.F[v]["parents"].append(u) + # self.V = topsort(self.E) + # HOW DO I RECALCULATE CPT? + + def remove_edge(self, u, v): + self.E[u].remove(v) + self.F[v]["parents"].remove(u) + + def reverse_arc(self, u, v): + if self.has_edge(u, v): + self.E[u].remove(v) + self.E[v].append(u) + + def set_data(self, rv, data): + assert isinstance(data, dict), "data must be dictionary" + self.F[rv] = data + + def set_cpt(self, rv, cpt): + self.F[rv]["cpt"] = cpt + + def set_parents(self, rv, parents): + self.F[rv]["parents"] = parents + + def set_values(self, rv, values): + self.F[rv]["values"] = values + + def nodes(self): + for v in self.V: + yield v + + def node_idx(self, rv): + try: + return self.V.index(rv) + except ValueError: + return -1 + + def has_node(self, rv): + return rv in self.V + + def has_edge(self, u, v): + return v in self.E[u] + + def edges(self): + for u in self.nodes(): + for v in self.E[u]: + yield (u, v) + + def num_edges(self): + num = 0 + for u in self.nodes(): + num += len(self.E[u]) + return num + + def num_params(self): + num = 0 + for u in self.nodes(): + num += len(self.F[u]["cpt"]) + return num + + def scope_size(self, rv): + return len(self.F[rv]["parents"]) + 1 + + def num_nodes(self): + return len(self.V) + + def cpt(self, rv): + return self.F[rv]["cpt"] + + def card(self, rv): + return len(self.F[rv]["values"]) + + def scope(self, rv): + scope = [rv] + scope.extend(self.F[rv]["parents"]) + return scope + + def parents(self, rv): + return self.F[rv]["parents"] + + def children(self, rv): + return self.E[rv] + + def degree(self, rv): + return len(self.parents(rv)) + len(self.children(rv)) + + def values(self, rv): + return self.F[rv]["values"] + + def value_idx(self, rv, val): + try: + return self.F[rv]["values"].index(val) + except ValueError: + print("Value Index Error") + return -1 + + def stride(self, rv, n): + if n == rv: + return 1 + else: + card_list = [self.card(rv)] + card_list.extend([self.card(p) for p in self.parents(rv)]) + n_idx = self.parents(rv).index(n) + 1 + return int(np.prod(card_list[0:n_idx])) + + def flat_cpt(self, by_var=False, by_parents=False): + """ + Return all cpt values in the BN as a flattened + numpy array ordered by bn.nodes() - i.e. topsort + """ + if by_var: + cpt = np.array([sum(self.cpt(rv)) for rv in self.nodes()]) + elif by_parents: + cpt = np.array( + [ + sum(self.cpt(rv)[i : (i + self.card(rv))]) + for rv in self.nodes() + for i in range(len(self.cpt(rv)) / self.card(rv)) + ] + ) + else: + cpt = np.array([val for rv in self.nodes() for val in self.cpt(rv)]) + return cpt + + def cpt_indices(self, target, val_dict): + """ + Get the index of the CPT which corresponds + to a dictionary of rv=val sets. 
This can be + used for parameter learning to increment the + appropriate cpt frequency value based on + observations in the data. + + There is definitely a fast way to do this. + -- check if (idx - rv_stride*value_idx) % (rv_card*rv_stride) == 0 + + Arguments + --------- + *target* : a string + Main RV + + *val_dict* : a dictionary, where + key=rv,val=rv value + + """ + stride = dict([(n, self.stride(target, n)) for n in self.scope(target)]) + # if len(val_dict)==len(self.parents(target)): + # idx = sum([self.value_idx(rv,val)*stride[rv] \ + # for rv,val in val_dict.items()]) + # else: + card = dict([(n, self.card(n)) for n in self.scope(target)]) + idx = set(range(len(self.cpt(target)))) + for rv, val in val_dict.items(): + val_idx = self.value_idx(rv, val) + rv_idx = [] + s_idx = val_idx * stride[rv] + while s_idx < len(self.cpt(target)): + rv_idx.extend(range(s_idx, (s_idx + stride[rv]))) + s_idx += stride[rv] * card[rv] + idx = idx.intersection(set(rv_idx)) + + return list(idx) + + def cpt_str_idx(self, rv, idx): + """ + Return string representation of RV=VAL and + Parents=Val for the given idx of the given rv's cpt. + """ + rv_val = self.values(rv)[idx % self.card(rv)] + s = str(rv) + "=" + str(rv_val) + "|" + _idx = 1 + for parent in self.parents(rv): + for val in self.values(parent): + if idx in self.cpt_indices(rv, {rv: rv_val, parent: val}): + s += str(parent) + "=" + str(val) + if _idx < len(self.parents(rv)): + s += "," + _idx += 1 + return s + + def set_structure(self, edge_dict, value_dict=None): + """ + Set the structure of a BayesNet object. This + function is mostly used to instantiate a BN + skeleton after structure learning algorithms. + + See "structure_learn" folder & algorithms + + Arguments + --------- + *edge_dict* : a dictionary, + where key = rv, + value = list of rv's children + NOTE: THIS MUST BE DIRECTED ALREADY! + + *value_dict* : a dictionary, + where key = rv, + value = list of rv's possible values + + Returns + ------- + None + + Effects + ------- + - sets self.V in topsort order from edge_dict + - sets self.E + - creates self.F structure and sets the parents + + Notes + ----- + + """ + + self.V = topsort(edge_dict) + self.E = edge_dict + self.F = dict([(rv, {}) for rv in self.nodes()]) + for rv in self.nodes(): + self.F[rv] = { + "parents": [p for p in self.nodes() if rv in self.children(p)], + "cpt": [], + "values": [], + } + if value_dict is not None: + self.F[rv]["values"] = value_dict[rv] + + def adj_list(self): + """ + Returns adjacency list of lists, where + each list element is a vertex, and each sub-list is + a list of that vertex's neighbors. + """ + adj_list = [[] for _ in self.V] + vi_map = dict((self.V[i], i) for i in range(len(self.V))) + for u, v in self.edges(): + adj_list[vi_map[u]].append(vi_map[v]) + return adj_list + + def moralized_edges(self): + """ + Moralized graph is the original graph PLUS + an edge between every set of common effect + structures - + i.e. all parents of a node are connected. + + This function has be validated. + + Returns + ------- + *e* : a python list of parent-child tuples. 
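A rough standalone sketch of the moralization rule described here, assuming the graph is given as a child -> parents mapping (the helper itself is hypothetical; the names reuse the Alarm example from the tests above):

    def moralize(parents):
        """parents: dict mapping each child to its list of parents."""
        edges = set()
        for child, pars in parents.items():
            for p in pars:
                edges.add(frozenset((p, child)))      # keep every original arc, undirected
            for a in pars:
                for b in pars:
                    if a != b:
                        edges.add(frozenset((a, b)))  # "marry" co-parents of the same child
        return edges

    parents = {"Alarm": ["Burglary", "Earthquake"], "JohnCalls": ["Alarm"], "MaryCalls": ["Alarm"]}
    # Burglary-Earthquake appears only because both are parents of Alarm.
    assert frozenset(("Burglary", "Earthquake")) in moralize(parents)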
+ + """ + e = set() + for u in self.nodes(): + for p1 in self.parents(u): + e.add((p1, u)) + for p2 in self.parents(u): + if p1 != p2 and (p2, p1) not in e: + e.add((p1, p2)) + return list(e) diff --git a/bamt/external/pyBN/utils/__init__.py b/bamt/external/pyBN/utils/__init__.py new file mode 100644 index 0000000..1de46d8 --- /dev/null +++ b/bamt/external/pyBN/utils/__init__.py @@ -0,0 +1,4 @@ +from bamt.external.pyBN.utils.class_equivalence import * +from bamt.external.pyBN.utils.data import * +from bamt.external.pyBN.utils.graph import * +from bamt.external.pyBN.utils.independence_tests import * diff --git a/bamt/core/nodes/__init__.py b/bamt/external/pyBN/utils/_tests/__init__.py similarity index 100% rename from bamt/core/nodes/__init__.py rename to bamt/external/pyBN/utils/_tests/__init__.py diff --git a/bamt/external/pyBN/utils/_tests/test_independence_tests.py b/bamt/external/pyBN/utils/_tests/test_independence_tests.py new file mode 100644 index 0000000..fefc12c --- /dev/null +++ b/bamt/external/pyBN/utils/_tests/test_independence_tests.py @@ -0,0 +1,56 @@ +""" +**************** +UnitTest +Constraint Tests +**************** + +""" +__author__ = """Nicholas Cullen """ + +import os +import unittest +from os.path import dirname + +import numpy as np +from external.pyBN.independence.constraint_tests import mi_test + + +class ConstraintTestsTestCase(unittest.TestCase): + def setUp(self): + self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + self.data = np.loadtxt( + os.path.join(self.dpath, "lizards.csv"), + delimiter=",", + dtype="int32", + skiprows=1, + ) + + def tearDown(self): + pass + + def test_mi_two_vars_value_a(self): + self.assertEqual(mi_test(self.data[:, (0, 1)]), 0.0004) + + def test_mi_two_vars_value_b(self): + self.assertEqual(mi_test(self.data[:, (0, 2)]), 0.0014) + + def test_mi_two_vars_symmetry(self): + self.assertEqual(mi_test(self.data[:, (1, 0)]), mi_test(self.data[:, (0, 1)])) + + def test_mi_three_vars_value_a(self): + self.assertEqual(mi_test(self.data), 0.0009) + + def test_mi_three_vars_symmetry(self): + self.assertEqual( + mi_test(self.data[:, (0, 1, 2)]), mi_test(self.data[:, (1, 0, 2)]) + ) + + def test_mi_random_three(self): + np.random.seed(3636) + self.data = np.random.randint(1, 10, size=((10000, 3))) + self.assertEqual(mi_test(self.data), 0.0211) + + def test_mi_random_four(self): + np.random.seed(3636) + self.data = np.random.randint(1, 10, size=((10000, 4))) + self.assertEqual(mi_test(self.data), 0.0071) diff --git a/bamt/external/pyBN/utils/_tests/test_markov_blanket.py b/bamt/external/pyBN/utils/_tests/test_markov_blanket.py new file mode 100644 index 0000000..af97eb1 --- /dev/null +++ b/bamt/external/pyBN/utils/_tests/test_markov_blanket.py @@ -0,0 +1,36 @@ +""" +************** +Markov Blanket +Unit Test +************** +""" + +__author__ = """Nicholas Cullen """ + +import os +import unittest +from os.path import dirname + +from external.pyBN.independence.markov_blanket import markov_blanket +from external.pyBN.readwrite.read import read_bn + + +class ConstraintTestsTestCase(unittest.TestCase): + def setUp(self): + self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + self.bn = read_bn(os.path.join(self.dpath, "cmu.bn")) + + def tearDown(self): + pass + + def test_markov_blanket(self): + self.assertDictEqual( + markov_blanket(self.bn), + { + "Alarm": ["Earthquake", "Burglary", "JohnCalls", "MaryCalls"], + "Burglary": ["Alarm", "Earthquake"], + "Earthquake": ["Alarm", "Burglary"], + 
"JohnCalls": ["Alarm"], + "MaryCalls": ["Alarm"], + }, + ) diff --git a/bamt/external/pyBN/utils/_tests/test_orient_edges.py b/bamt/external/pyBN/utils/_tests/test_orient_edges.py new file mode 100644 index 0000000..4242fc1 --- /dev/null +++ b/bamt/external/pyBN/utils/_tests/test_orient_edges.py @@ -0,0 +1,38 @@ +""" +******** +UnitTest +OrientEdges +******** + +""" +__author__ = """Nicholas Cullen """ + +import os +import unittest +from os.path import dirname + +import numpy as np +from external.pyBN.structure_learn.orient_edges import orient_edges_gs, orient_edges_pc + + +class OrientEdgesTestCase(unittest.TestCase): + def setUp(self): + pass + + def tearDown(self): + pass + + def test_orient_edges_pc(self): + e = {0: [1, 2], 1: [0], 2: [0]} + b = {0: [], 1: {2: (0,)}, 2: {1: (0,)}} + self.assertDictEqual(orient_edges_pc(e, b), {0: [1, 2], 1: [], 2: []}) + + def test_orient_edges_gs(self): + e = {0: [1, 2], 1: [0], 2: [0]} + b = {0: [1, 2], 1: [0], 2: [0]} + dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + path = os.path.join(dpath, "lizards.csv") + data = np.loadtxt(path, dtype="int32", skiprows=1, delimiter=",") + self.assertDictEqual( + orient_edges_gs(e, b, data, 0.05), {0: [1, 2], 1: [], 2: []} + ) diff --git a/bamt/external/pyBN/utils/_tests/test_random_sample.py b/bamt/external/pyBN/utils/_tests/test_random_sample.py new file mode 100644 index 0000000..bf3e92d --- /dev/null +++ b/bamt/external/pyBN/utils/_tests/test_random_sample.py @@ -0,0 +1,33 @@ +""" +******** +UnitTest +Misc +******** + +""" +__author__ = """Nicholas Cullen """ + +import os +import unittest +from os.path import dirname + +import numpy as np +from external.pyBN.readwrite.read import read_bn +from external.pyBN.utils.random_sample import random_sample + + +class RandomSampleTestCase(unittest.TestCase): + def setUp(self): + self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + self.bn = read_bn(os.path.join(self.dpath, "cancer.bif")) + + def tearDown(self): + pass + + def test_random_sample(self): + np.random.seed(3636) + sample = random_sample(self.bn, 5) + self.assertListEqual( + list(sample.ravel()), + [0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0], + ) diff --git a/bamt/external/pyBN/utils/class_equivalence.py b/bamt/external/pyBN/utils/class_equivalence.py new file mode 100644 index 0000000..9e284b3 --- /dev/null +++ b/bamt/external/pyBN/utils/class_equivalence.py @@ -0,0 +1,33 @@ +""" +**************** +Equivalence Code +**************** + +This code is for testing whether or not +two Bayesian networks belong to the same +equivalence class - i.e. they have the same +edges when viewed as undirected graphs. + +Also, this code is for actually generating +equivalent Bayesian networks from a given BN. + +""" + + +def are_class_equivalent(x, y): + """ + Check whether two Bayesian networks belong + to the same equivalence class. 
+ """ + are_equivalent = True + + if set(list(x.nodes())) != set(list(y.nodes())): + are_equivalent = False + else: + for rv in x.nodes(): + rv_x_neighbors = set(x.parents(rv)) + set(y.children(rv)) + rv_y_neighbors = set(y.parents(rv)) + set(y.children(rv)) + if rv_x_neighbors != rv_y_neighbors: + are_equivalent = False + break + return are_equivalent diff --git a/bamt/external/pyBN/utils/data.py b/bamt/external/pyBN/utils/data.py new file mode 100644 index 0000000..ad4781a --- /dev/null +++ b/bamt/external/pyBN/utils/data.py @@ -0,0 +1,24 @@ +""" +These are functions for dealing with datasets +that have strings as values - as those values +must be converted to integers in order to be +used in many structure learning functions, for +example. + +There is, of course, the issue of how to get those +string values back into the underlying representation +after the learning occurs.. +""" +import numpy as np + + +def unique_bins(data): + """ + Get the unique values for each column in a dataset. + """ + bins = np.empty(len(data.T), dtype=np.int32) + i = 0 + for col in data.T: + bins[i] = len(np.unique(col)) + i += 1 + return bins diff --git a/bamt/external/pyBN/utils/graph.py b/bamt/external/pyBN/utils/graph.py new file mode 100644 index 0000000..92daf4d --- /dev/null +++ b/bamt/external/pyBN/utils/graph.py @@ -0,0 +1,54 @@ +""" +Collection of Graph Algorithms. + +Any networkx dependencies should exist here only, but +there are currently a few exceptions which will +eventually be corrected. + +""" + +import networkx as nx + + +def would_cause_cycle(e, u, v, reverse=False): + """ + Test if adding the edge u -> v to the BayesNet + object would create a DIRECTED (i.e. illegal) cycle. + """ + G = nx.DiGraph(e) + if reverse: + G.remove_edge(v, u) + G.add_edge(u, v) + try: + nx.find_cycle(G, source=u) + return True + except BaseException: + return False + + +def topsort(edge_dict, root=None): + """ + List of nodes in topological sort order from edge dict + where key = rv and value = list of rv's children + """ + queue = [] + if root is not None: + queue = [root] + else: + for rv in edge_dict.keys(): + prior = True + for p in edge_dict.keys(): + if rv in edge_dict[p]: + prior = False + if prior: + queue.append(rv) + + visited = [] + while queue: + vertex = queue.pop(0) + if vertex not in visited: + visited.append(vertex) + for nbr in edge_dict[vertex]: + queue.append(nbr) + # queue.extend(edge_dict[vertex]) # add all vertex's children + return visited diff --git a/bamt/external/pyBN/utils/independence_tests.py b/bamt/external/pyBN/utils/independence_tests.py new file mode 100644 index 0000000..cc803fe --- /dev/null +++ b/bamt/external/pyBN/utils/independence_tests.py @@ -0,0 +1,181 @@ +""" +****************************** +Conditional Independence Tests +for Constraint-based Learning +****************************** + +Implemented Constraint-based Tests +---------------------------------- +- mutual information +- Pearson's X^2 + +I may consider putting this code into its own class structure. The +main benefit I could see from doing this would be the ability to +cache joint/marginal/conditional probabilities for expedited tests. 
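A minimal numeric sketch of the marginal mutual-information test implemented in this module (the toy data is made up; the implementation below additionally adds a 1e-7 smoothing term and rounds to four decimals):

    import numpy as np

    data = np.array([[0, 0], [0, 0], [1, 1], [1, 1], [0, 1], [1, 0]])
    hist, _ = np.histogramdd(data, bins=(2, 2))        # 2x2 contingency table
    pxy = hist / hist.sum()                            # joint distribution of the two columns
    px, py = pxy.sum(axis=1), pxy.sum(axis=0)          # marginals
    mi = np.sum(pxy * np.log(pxy / np.outer(px, py)))  # ~0.0566 here; 0.0 iff the columns are independent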
+ +""" + +__author__ = """Nicholas Cullen """ + +import numpy as np + +from bamt.external.pyBN.utils.data import unique_bins + + +def mutual_information(data, conditional=False): + # bins = np.amax(data, axis=0)+1 # read levels for each variable + bins = unique_bins(data) + if len(bins) == 1: + hist, _ = np.histogramdd(data, bins=(bins)) # frequency counts + Px = (hist / hist.sum()) + (1e-7) + MI = -1 * np.sum(Px * np.log(Px)) + return round(MI, 4) + + if len(bins) == 2: + hist, _ = np.histogramdd(data, bins=bins[0:2]) # frequency counts + + Pxy = hist / hist.sum() # joint probability distribution over X,Y,Z + Px = np.sum(Pxy, axis=1) # P(X,Z) + Py = np.sum(Pxy, axis=0) # P(Y,Z) + + PxPy = np.outer(Px, Py) + Pxy += 1e-7 + PxPy += 1e-7 + MI = np.sum(Pxy * np.log(Pxy / (PxPy))) + return round(MI, 4) + elif len(bins) > 2 and conditional: + # CHECK FOR > 3 COLUMNS -> concatenate Z into one column + if len(bins) > 3: + data = data.astype("str") + ncols = len(bins) + for i in range(len(data)): + data[i, 2] = "".join(data[i, 2:ncols]) + data = data.astype(np.int64)[:, 0:3] + + bins = np.amax(data, axis=0) + hist, _ = np.histogramdd(data, bins=bins) # frequency counts + + Pxyz = hist / hist.sum() # joint probability distribution over X,Y,Z + Pz = np.sum(Pxyz, axis=(0, 1)) # P(Z) + Pxz = np.sum(Pxyz, axis=1) # P(X,Z) + Pyz = np.sum(Pxyz, axis=0) # P(Y,Z) + + Pxy_z = Pxyz / (Pz + 1e-7) # P(X,Y | Z) = P(X,Y,Z) / P(Z) + Px_z = Pxz / (Pz + 1e-7) # P(X | Z) = P(X,Z) / P(Z) + Py_z = Pyz / (Pz + 1e-7) # P(Y | Z) = P(Y,Z) / P(Z) + + Px_y_z = np.empty((Pxy_z.shape)) # P(X|Z)P(Y|Z) + for i in range(bins[0]): + for j in range(bins[1]): + for k in range(bins[2]): + Px_y_z[i][j][k] = Px_z[i][k] * Py_z[j][k] + Pxyz += 1e-7 + Pxy_z += 1e-7 + Px_y_z += 1e-7 + MI = np.sum(Pxyz * np.log(Pxy_z / (Px_y_z))) + + return round(MI, 4) + elif len(bins) > 2 and conditional == False: + data = data.astype("str") + ncols = len(bins) + for i in range(len(data)): + data[i, 1] = "".join(data[i, 1:ncols]) + data = data.astype(np.int64)[:, 0:2] + + hist, _ = np.histogramdd(data, bins=bins[0:2]) # frequency counts + + Pxy = hist / hist.sum() # joint probability distribution over X,Y,Z + Px = np.sum(Pxy, axis=1) # P(X,Z) + Py = np.sum(Pxy, axis=0) # P(Y,Z) + + PxPy = np.outer(Px, Py) + Pxy += 1e-7 + PxPy += 1e-7 + MI = np.sum(Pxy * np.log(Pxy / (PxPy))) + return round(MI, 4) + + +def entropy(data): + """ + In the context of structure learning, and more specifically + in constraint-based algorithms which rely on the mutual information + test for conditional independence, it has been proven that the variable + X in a set which MAXIMIZES mutual information is also the variable which + MINIMIZES entropy. This fact can be used to reduce the computational + requirements of tests based on the following relationship: + + Entropy is related to marginal mutual information as follows: + MI(X;Y) = H(X) - H(X|Y) + + Entropy is related to conditional mutual information as follows: + MI(X;Y|Z) = H(X|Z) - H(X|Y,Z) + + For one varibale, H(X) is equal to the following: + -1 * sum of p(x) * log(p(x)) + + For two variables H(X|Y) is equal to the following: + sum over x,y of p(x,y)*log(p(y)/p(x,y)) + + For three variables, H(X|Y,Z) is equal to the following: + -1 * sum of p(x,y,z) * log(p(x|y,z)), + where p(x|y,z) = p(x,y,z)/p(y)*p(z) + Arguments + ---------- + *data* : a nested numpy array + The data from which to learn - must have at least three + variables. All conditioned variables (i.e. Z) are compressed + into one variable. 
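A quick numeric check of the identity MI(X;Y) = H(X) - H(X|Y) quoted above, on a made-up 2x2 joint distribution (natural logarithm; the base only rescales both sides):

    import numpy as np

    pxy = np.array([[0.3, 0.2], [0.1, 0.4]])
    px, py = pxy.sum(axis=1), pxy.sum(axis=0)
    h_x = -np.sum(px * np.log(px))
    h_x_given_y = -np.sum(pxy * np.log(pxy / py))          # H(X|Y) = -sum p(x,y) log p(x|y)
    mi = np.sum(pxy * np.log(pxy / np.outer(px, py)))
    assert np.isclose(mi, h_x - h_x_given_y)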
+ + Returns + ------- + *H* : entropy value + + """ + try: + cols = data.shape[1] + except IndexError: + cols = 1 + + bins = np.amax(data, axis=0) + if isinstance(bins, np.ndarray): + for i in range(len(bins)): + if bins[i] == 0: + bins[i] = 1 + else: + bins = 1 + # bins = unique_bins(data) + + if cols == 1: + hist, _ = np.histogramdd(data, bins=(bins)) # frequency counts + Px = hist / hist.sum() + (1e-7) + H = -1 * np.sum(Px * np.log(Px)) + + elif cols == 2: # two variables -> assume X then Y + hist, _ = np.histogramdd(data, bins=bins[0:2]) # frequency counts + + Pxy = hist / hist.sum() # joint probability distribution over X,Y,Z + Py = np.sum(Pxy, axis=0) # P(Y) + Py += 1e-7 + Pxy += 1e-7 + H = np.sum(Pxy * np.log(Py / Pxy)) + + else: + # CHECK FOR > 3 COLUMNS -> concatenate Z into one column + if cols > 3: + data = data.astype("str") + ncols = len(bins) + for i in range(len(data)): + data[i, 2] = "".join(data[i, 2:ncols]) + data = data.astype(np.int64)[:, 0:3] + + bins = np.amax(data, axis=0) + hist, _ = np.histogramdd(data, bins=bins) # frequency counts + + Pxyz = hist / hist.sum() # joint probability distribution over X,Y,Z + Pyz = np.sum(Pxyz, axis=0) + + Pxyz += 1e-7 # for log -inf + Pyz += 1e-7 + H = -1 * np.sum(Pxyz * np.log(Pxyz)) + np.sum(Pyz * np.log(Pyz)) + + return round(H, 4) diff --git a/bamt/external/pyitlib/DiscreteRandomVariableUtils.py b/bamt/external/pyitlib/DiscreteRandomVariableUtils.py new file mode 100644 index 0000000..004a237 --- /dev/null +++ b/bamt/external/pyitlib/DiscreteRandomVariableUtils.py @@ -0,0 +1,852 @@ +import warnings + +import numpy as np +import pandas as pd +import sklearn.preprocessing + +NONE_REPLACEMENT = -32768 + + +def information_mutual_conditional( + x, + y, + z, + cartesian_product=False, + base=2, + fill_value=-1, + estimator="ML", + alphabet_x=None, + alphabet_y=None, + Alphabet_Z=None, + keep_dims=False, +): + x, fill_value_X = _sanitise_array_input(x, fill_value) + y, fill_value_Y = _sanitise_array_input(y, fill_value) + z, fill_value_Z = _sanitise_array_input(z, fill_value) + if alphabet_x is not None: + alphabet_x, fill_value_Alphabet_X = _sanitise_array_input( + alphabet_x, fill_value + ) + alphabet_x, _ = _autocreate_alphabet(alphabet_x, fill_value_Alphabet_X) + else: + alphabet_x, fill_value_Alphabet_X = _autocreate_alphabet(x, fill_value_X) + if alphabet_y is not None: + alphabet_y, fill_value_Alphabet_Y = _sanitise_array_input( + alphabet_y, fill_value + ) + alphabet_y, _ = _autocreate_alphabet(alphabet_y, fill_value_Alphabet_Y) + else: + alphabet_y, fill_value_Alphabet_Y = _autocreate_alphabet(y, fill_value_Y) + if Alphabet_Z is not None: + Alphabet_Z, fill_value_Alphabet_Z = _sanitise_array_input( + Alphabet_Z, fill_value + ) + Alphabet_Z, _ = _autocreate_alphabet(Alphabet_Z, fill_value_Alphabet_Z) + else: + Alphabet_Z, fill_value_Alphabet_Z = _autocreate_alphabet(z, fill_value_Z) + + if x.size == 0: + raise ValueError("arg X contains no elements") + if y.size == 0: + raise ValueError("arg Y contains no elements") + if z.size == 0: + raise ValueError("arg Z contains no elements") + if np.any(_isnan(x)): + raise ValueError("arg X contains NaN values") + if np.any(_isnan(y)): + raise ValueError("arg Y contains NaN values") + if np.any(_isnan(z)): + raise ValueError("arg Z contains NaN values") + if alphabet_x.size == 0: + raise ValueError("arg Alphabet_X contains no elements") + if np.any(_isnan(alphabet_x)): + raise ValueError("arg Alphabet_X contains NaN values") + if alphabet_y.size == 0: + raise ValueError("arg Alphabet_Y 
contains no elements") + if np.any(_isnan(alphabet_y)): + raise ValueError("arg Alphabet_Y contains NaN values") + if Alphabet_Z.size == 0: + raise ValueError("arg Alphabet_Z contains no elements") + if np.any(_isnan(Alphabet_Z)): + raise ValueError("arg Alphabet_Z contains NaN values") + if _isnan(fill_value_X): + raise ValueError("fill value for arg X is NaN") + if _isnan(fill_value_Y): + raise ValueError("fill value for arg Y is NaN") + if _isnan(fill_value_Z): + raise ValueError("fill value for arg Z is NaN") + if x.shape[:-1] != alphabet_x.shape[:-1]: + raise ValueError("leading dimensions of args X and Alphabet_X do not " "match") + if y.shape[:-1] != alphabet_y.shape[:-1]: + raise ValueError("leading dimensions of args Y and Alphabet_Y do not " "match") + if z.shape[:-1] != Alphabet_Z.shape[:-1]: + raise ValueError("leading dimensions of args Z and Alphabet_Z do not " "match") + if not cartesian_product and (x.shape != y.shape or x.shape != z.shape): + raise ValueError("dimensions of args X, Y, Z do not match") + if cartesian_product and (x.shape[-1] != y.shape[-1] or x.shape[-1] != z.shape[-1]): + raise ValueError("trailing dimensions of args X, Y, Z do not match") + if not (np.isscalar(base) and np.isreal(base) and base > 0): + raise ValueError("arg base not a positive real-valued scalar") + + S, fill_value = _map_observations_to_integers( + (x, alphabet_x, y, alphabet_y, z, Alphabet_Z), + ( + fill_value_X, + fill_value_Alphabet_X, + fill_value_Y, + fill_value_Alphabet_Y, + fill_value_Z, + fill_value_Alphabet_Z, + ), + ) + x, alphabet_x, y, alphabet_y, z, Alphabet_Z = S + + if not cartesian_product: + I = np.empty(x.shape[:-1]) + if I.ndim > 0: + I[:] = np.NaN + else: + I = np.float64(np.NaN) + else: + shapeI_Z = z.shape[:-1] + z = np.reshape(z, (-1, z.shape[-1])) + Alphabet_Z = np.reshape(Alphabet_Z, (-1, Alphabet_Z.shape[-1])) + I = [] + for i in range(z.shape[0]): + def f(X, Y, Alphabet_X, Alphabet_Y): + return information_mutual_conditional( + X, + Y, + z[i], + False, + base, + fill_value, + estimator, + Alphabet_X, + Alphabet_Y, + Alphabet_Z[i], + ) + + I.append(_cartesian_product_apply(x, y, f, alphabet_x, alphabet_y)) + shapeI_XY = I[0].shape + if len(shapeI_Z) == 0: + I = np.array(I)[0].reshape(shapeI_XY) + else: + I = np.array(I) + I = np.rollaxis(I, 0, len(I.shape)) + I = I.reshape(np.append(shapeI_XY, shapeI_Z).astype("int")) + return I + + # Re-shape H, X,Y,Z so that we may handle multi-dimensional arrays + # equivalently and iterate across 0th axis + x = np.reshape(x, (-1, x.shape[-1])) + y = np.reshape(y, (-1, y.shape[-1])) + z = np.reshape(z, (-1, z.shape[-1])) + alphabet_x = np.reshape(alphabet_x, (-1, alphabet_x.shape[-1])) + alphabet_y = np.reshape(alphabet_y, (-1, alphabet_y.shape[-1])) + Alphabet_Z = np.reshape(Alphabet_Z, (-1, Alphabet_Z.shape[-1])) + orig_shape_I = I.shape + I = np.reshape(I, (-1, 1)) + + for i in range(x.shape[0]): + I_ = ( + entropy_joint( + np.vstack((x[i], z[i])), + base, + fill_value, + estimator, + _vstack_pad((alphabet_x[i], Alphabet_Z[i]), fill_value), + ) + + entropy_joint( + np.vstack((y[i], z[i])), + base, + fill_value, + estimator, + _vstack_pad((alphabet_y[i], Alphabet_Z[i]), fill_value), + ) + - entropy_joint( + np.vstack((x[i], y[i], z[i])), + base, + fill_value, + estimator, + _vstack_pad((alphabet_x[i], alphabet_y[i], Alphabet_Z[i]), fill_value), + ) + - entropy_joint(z[i], base, fill_value, estimator, Alphabet_Z[i]) + ) + I[i] = I_ + + # Reverse re-shaping + I = np.reshape(I, orig_shape_I) + + if keep_dims and not 
cartesian_product: + I = I[..., np.newaxis] + + return I + + +def information_mutual( + X, + Y=None, + cartesian_product=False, + base=2, + fill_value=-1, + estimator="ML", + Alphabet_X=None, + Alphabet_Y=None, + keep_dims=False, +): + H_conditional = entropy_conditional( + X, Y, cartesian_product, base, fill_value, estimator, Alphabet_X, Alphabet_Y + ) + H = entropy(X, base, fill_value, estimator, Alphabet_X) + + H = np.reshape( + H, np.append(H.shape, np.ones(H_conditional.ndim - H.ndim)).astype("int") + ) + + H = H - H_conditional + + if keep_dims and not cartesian_product: + H = H[..., np.newaxis] + + return H + + +def entropy_pmf(P, base=2, require_valid_pmf=True, keep_dims=False): + P, _ = _sanitise_array_input(P) + + if P.size == 0: + raise ValueError("arg P contains no elements") + if np.any(_isnan(P)): + raise ValueError("arg P contains NaN values") + if np.any(np.logical_or(P < 0, P > 1)): + raise ValueError("arg P contains values outside unit interval") + if require_valid_pmf and not np.allclose(np.sum(P, axis=-1), 1): + raise ValueError("arg P does not sum to unity across last axis") + if not (np.isscalar(base) and np.isreal(base) and base > 0): + raise ValueError("arg base not a positive real-valued scalar") + + H = -np.sum(P * np.log2(P + np.spacing(0)), axis=-1) + H = H / np.log2(base) + + if keep_dims: + H = H[..., np.newaxis] + + return H + + +def entropy_joint( + X, base=2, fill_value=-1, estimator="ML", Alphabet_X=None, keep_dims=False +): + X, fill_value_X = _sanitise_array_input(X, fill_value) + if Alphabet_X is not None: + Alphabet_X, fill_value_Alphabet_X = _sanitise_array_input( + Alphabet_X, fill_value + ) + Alphabet_X, _ = _autocreate_alphabet(Alphabet_X, fill_value_Alphabet_X) + else: + Alphabet_X, fill_value_Alphabet_X = _autocreate_alphabet(X, fill_value_X) + + if X.size == 0: + raise ValueError("arg X contains no elements") + if np.any(_isnan(X)): + raise ValueError("arg X contains NaN values") + if Alphabet_X.size == 0: + raise ValueError("arg Alphabet_X contains no elements") + if np.any(_isnan(Alphabet_X)): + raise ValueError("arg Alphabet_X contains NaN values") + if _isnan(fill_value_X): + raise ValueError("fill value for arg X is NaN") + if X.shape[:-1] != Alphabet_X.shape[:-1]: + raise ValueError("leading dimensions of args X and Alphabet_X do not " "match") + if not (np.isscalar(base) and np.isreal(base) and base > 0): + raise ValueError("arg base not a positive real-valued scalar") + + S, fill_value = _map_observations_to_integers( + (X, Alphabet_X), (fill_value_X, fill_value_Alphabet_X) + ) + X, Alphabet_X = S + + X = np.reshape(X, (-1, X.shape[-1])) + Alphabet_X = np.reshape(Alphabet_X, (-1, Alphabet_X.shape[-1])) + + _verify_alphabet_sufficiently_large(X, Alphabet_X, fill_value) + + for i in range(X.shape[0]): + X = X[:, X[i].argsort(kind="mergesort")] + + B = np.any(X[:, 1:] != X[:, :-1], axis=0) + I = np.append(np.where(B), X.shape[1] - 1) + L = np.diff(np.append(-1, I)) + + alphabet_X = X[:, I] + if estimator != "ML": + n_additional_empty_bins = _determine_number_additional_empty_bins( + L, alphabet_X, Alphabet_X, fill_value + ) + else: + n_additional_empty_bins = 0 + L, _ = _remove_counts_at_fill_value(L, alphabet_X, fill_value) + if not np.any(L): + return np.float64(np.NaN) + + # P_0 is the probability mass assigned to each additional empty bin + P, P_0 = _estimate_probabilities(L, estimator, n_additional_empty_bins) + H_0 = n_additional_empty_bins * P_0 * -np.log2(P_0 + np.spacing(0)) / np.log2(base) + H = entropy_pmf(P, base, 
require_valid_pmf=False) + H_0 + + if keep_dims: + H = H[..., np.newaxis] + + return H + + +def entropy_conditional( + X, + Y=None, + cartesian_product=False, + base=2, + fill_value=-1, + estimator="ML", + Alphabet_X=None, + Alphabet_Y=None, + keep_dims=False, +): + if Y is None: + Y = X + cartesian_product = True + Alphabet_Y = Alphabet_X + + X, fill_value_X = _sanitise_array_input(X, fill_value) + Y, fill_value_Y = _sanitise_array_input(Y, fill_value) + if Alphabet_X is not None: + Alphabet_X, fill_value_Alphabet_X = _sanitise_array_input( + Alphabet_X, fill_value + ) + Alphabet_X, _ = _autocreate_alphabet(Alphabet_X, fill_value_Alphabet_X) + else: + Alphabet_X, fill_value_Alphabet_X = _autocreate_alphabet(X, fill_value_X) + if Alphabet_Y is not None: + Alphabet_Y, fill_value_Alphabet_Y = _sanitise_array_input( + Alphabet_Y, fill_value + ) + Alphabet_Y, _ = _autocreate_alphabet(Alphabet_Y, fill_value_Alphabet_Y) + else: + Alphabet_Y, fill_value_Alphabet_Y = _autocreate_alphabet(Y, fill_value_Y) + + if X.size == 0: + raise ValueError("arg X contains no elements") + if Y.size == 0: + raise ValueError("arg Y contains no elements") + if np.any(_isnan(X)): + raise ValueError("arg X contains NaN values") + if np.any(_isnan(Y)): + raise ValueError("arg Y contains NaN values") + if Alphabet_X.size == 0: + raise ValueError("arg Alphabet_X contains no elements") + if np.any(_isnan(Alphabet_X)): + raise ValueError("arg Alphabet_X contains NaN values") + if Alphabet_Y.size == 0: + raise ValueError("arg Alphabet_Y contains no elements") + if np.any(_isnan(Alphabet_Y)): + raise ValueError("arg Alphabet_Y contains NaN values") + if _isnan(fill_value_X): + raise ValueError("fill value for arg X is NaN") + if _isnan(fill_value_Y): + raise ValueError("fill value for arg Y is NaN") + if X.shape[:-1] != Alphabet_X.shape[:-1]: + raise ValueError("leading dimensions of args X and Alphabet_X do not " "match") + if Y.shape[:-1] != Alphabet_Y.shape[:-1]: + raise ValueError("leading dimensions of args Y and Alphabet_Y do not " "match") + if not cartesian_product and X.shape != Y.shape: + raise ValueError("dimensions of args X and Y do not match") + if cartesian_product and X.shape[-1] != Y.shape[-1]: + raise ValueError("trailing dimensions of args X and Y do not match") + if not (np.isscalar(base) and np.isreal(base) and base > 0): + raise ValueError("arg base not a positive real-valued scalar") + + S, fill_value = _map_observations_to_integers( + (X, Alphabet_X, Y, Alphabet_Y), + (fill_value_X, fill_value_Alphabet_X, fill_value_Y, fill_value_Alphabet_Y), + ) + X, Alphabet_X, Y, Alphabet_Y = S + + if not cartesian_product: + H = np.empty(X.shape[:-1]) + if H.ndim > 0: + H[:] = np.NaN + else: + H = np.float64(np.NaN) + else: + + def f(X, Y, Alphabet_X, Alphabet_Y): + return entropy_conditional( + X, Y, False, base, fill_value, estimator, Alphabet_X, Alphabet_Y + ) + + return _cartesian_product_apply(X, Y, f, Alphabet_X, Alphabet_Y) + + # Re-shape H, X and Y, so that we may handle multi-dimensional arrays + # equivalently and iterate across 0th axis + X = np.reshape(X, (-1, X.shape[-1])) + Y = np.reshape(Y, (-1, Y.shape[-1])) + Alphabet_X = np.reshape(Alphabet_X, (-1, Alphabet_X.shape[-1])) + Alphabet_Y = np.reshape(Alphabet_Y, (-1, Alphabet_Y.shape[-1])) + orig_shape_H = H.shape + H = np.reshape(H, (-1, 1)) + + for i in range(X.shape[0]): + H[i] = entropy_joint( + np.vstack((X[i], Y[i])), + base, + fill_value, + estimator, + _vstack_pad((Alphabet_X[i], Alphabet_Y[i]), fill_value), + ) - entropy(Y[i], base, 
fill_value, estimator, Alphabet_Y[i]) + + # Reverse re-shaping + H = np.reshape(H, orig_shape_H) + + if keep_dims and not cartesian_product: + H = H[..., np.newaxis] + + return H + + +def entropy(X, base=2, fill_value=-1, estimator="ML", Alphabet_X=None, keep_dims=False): + X, fill_value_X = _sanitise_array_input(X, fill_value) + if Alphabet_X is not None: + Alphabet_X, fill_value_Alphabet_X = _sanitise_array_input( + Alphabet_X, fill_value + ) + Alphabet_X, _ = _autocreate_alphabet(Alphabet_X, fill_value_Alphabet_X) + else: + Alphabet_X, fill_value_Alphabet_X = _autocreate_alphabet(X, fill_value_X) + + if X.size == 0: + raise ValueError("arg X contains no elements") + if np.any(_isnan(X)): + raise ValueError("arg X contains NaN values") + if Alphabet_X.size == 0: + raise ValueError("arg Alphabet_X contains no elements") + if np.any(_isnan(Alphabet_X)): + raise ValueError("arg Alphabet_X contains NaN values") + if _isnan(fill_value_X): + raise ValueError("fill value for arg X is NaN") + if X.shape[:-1] != Alphabet_X.shape[:-1]: + raise ValueError("leading dimensions of args X and Alphabet_X do not " "match") + if not (np.isscalar(base) and np.isreal(base) and base > 0): + raise ValueError("arg base not a positive real-valued scalar") + + S, fill_value = _map_observations_to_integers( + (X, Alphabet_X), (fill_value_X, fill_value_Alphabet_X) + ) + X, Alphabet_X = S + + H = np.empty(X.shape[:-1]) + if H.ndim > 0: + H[:] = np.NaN + else: + H = np.float64(np.NaN) + + # Re-shape H and X, so that we may handle multi-dimensional arrays + # equivalently and iterate across 0th axis + X = np.reshape(X, (-1, X.shape[-1])) + Alphabet_X = np.reshape(Alphabet_X, (-1, Alphabet_X.shape[-1])) + orig_shape_H = H.shape + H = np.reshape(H, (-1, 1)) + + _verify_alphabet_sufficiently_large(X, Alphabet_X, fill_value) + + # NB: This is not joint entropy. 
Elements in each row are sorted + # independently + X = np.sort(X, axis=1) + + # Compute symbol run-lengths + # Compute symbol change indicators + B = X[:, 1:] != X[:, :-1] + for i in range(X.shape[0]): + # Obtain symbol change positions + I = np.append(np.where(B[i]), X.shape[1] - 1) + # Compute run lengths + L = np.diff(np.append(-1, I)) + + alphabet_X = X[i, I] + if estimator != "ML": + n_additional_empty_bins = _determine_number_additional_empty_bins( + L, alphabet_X, Alphabet_X[i], fill_value + ) + else: + n_additional_empty_bins = 0 + L, _ = _remove_counts_at_fill_value(L, alphabet_X, fill_value) + if not np.any(L): + continue + + # P_0 is the probability mass assigned to each additional empty bin + P, P_0 = _estimate_probabilities(L, estimator, n_additional_empty_bins) + H_0 = ( + n_additional_empty_bins + * P_0 + * -np.log2(P_0 + np.spacing(0)) + / np.log2(base) + ) + H[i] = entropy_pmf(P, base, require_valid_pmf=False) + H_0 + + # Reverse re-shaping + H = np.reshape(H, orig_shape_H) + + if keep_dims: + H = H[..., np.newaxis] + + return H + + +def _autocreate_alphabet(X, fill_value): + Lengths = np.apply_along_axis(lambda x: np.unique(x).size, axis=-1, arr=X) + max_length = np.max(Lengths) + + def pad_with_fillvalue(x): + return np.append(x, np.tile(fill_value, int(max_length - x.size))) + + Alphabet = np.apply_along_axis( + lambda x: pad_with_fillvalue(np.unique(x)), axis=-1, arr=X + ) + return (Alphabet, fill_value) + + +def _sanitise_array_input(x, fill_value=-1): + # Avoid Python 3 issues with numpy arrays containing None elements + if np.any(np.equal(x, None)) or fill_value is None: + x = np.array(x, copy=False) + assert np.all(x != NONE_REPLACEMENT) + m = np.equal(x, None) + x = np.where(m, NONE_REPLACEMENT, x) + if fill_value is None: + x = np.array(x, copy=False) + fill_value = NONE_REPLACEMENT + + if isinstance(x, (pd.core.frame.DataFrame, pd.core.series.Series)): + # Create masked array, honouring Dataframe/Series missing entries + # NB: We transpose for convenience, so that quantities are computed for + # each column + x = np.ma.MaskedArray(x, x.isnull()) + + if isinstance(x, np.ma.MaskedArray): + fill_value = x.fill_value + + if np.any(x == fill_value): + warnings.warn("Masked array contains data equal to fill value") + + if x.dtype.kind in ("S", "U"): + kind = x.dtype.kind + current_dtype_len = int(x.dtype.str.split(kind)[1]) + if current_dtype_len < len(fill_value): + # Fix numpy's broken array string type behaviour which causes + # X.filled() placeholder entries to be no longer than + # non-placeholder entries + warnings.warn( + "Changing numpy array dtype internally to " + "accommodate fill_value string length" + ) + m = x.mask + x = np.array(x.filled(), dtype=kind + str(len(fill_value))) + x[m] = fill_value + else: + x = x.filled() + else: + x = x.filled() + else: + x = np.array(x, copy=False) + + if x.dtype.kind not in "biufcmMOSUV": + raise TypeError("Unsupported array dtype") + + if x.size == 1 and x.ndim == 0: + x = np.array((x,)) + + return x, np.array(fill_value) + + +def _map_observations_to_integers(Symbol_matrices, Fill_values): + assert len(Symbol_matrices) == len(Fill_values) + FILL_VALUE = -1 + if np.any([A.dtype != "int" for A in Symbol_matrices]) or np.any( + np.array(Fill_values) != FILL_VALUE + ): + L = sklearn.preprocessing.LabelEncoder() + F = [np.atleast_1d(v) for v in Fill_values] + L.fit(np.concatenate([A.ravel() for A in Symbol_matrices] + F)) + Symbol_matrices = [ + L.transform(A.ravel()).reshape(A.shape) for A in Symbol_matrices + ] + 
Fill_values = [L.transform(np.atleast_1d(f)) for f in Fill_values] + + for A, f in zip(Symbol_matrices, Fill_values): + assert not np.any(A == FILL_VALUE) + A[A == f] = FILL_VALUE + + assert np.all([A.dtype == "int" for A in Symbol_matrices]) + return Symbol_matrices, FILL_VALUE + + +def _isnan(X): + X = np.array(X, copy=False) + if X.dtype in ("int", "float"): + return np.isnan(X) + else: + f = np.vectorize(_isnan_element) + return f(X) + + +def _isnan_element(x): + if isinstance(x, type(np.nan)): + return np.isnan(x) + else: + return False + + +def _determine_number_additional_empty_bins( + Counts, Alphabet, Full_Alphabet, fill_value +): + alphabet_sizes = np.sum(np.atleast_2d(Full_Alphabet) != fill_value, axis=-1) + if np.any(alphabet_sizes != fill_value): + joint_alphabet_size = np.prod(alphabet_sizes[alphabet_sizes > 0]) + if joint_alphabet_size <= 0: + raise ValueError( + "Numerical overflow detected. Joint alphabet " "size too large." + ) + else: + joint_alphabet_size = 0 + return joint_alphabet_size - np.sum( + np.all(np.atleast_2d(Alphabet) != fill_value, axis=0) + ) + + +def _estimate_probabilities(Counts, estimator, n_additional_empty_bins=0): + # TODO Documentation should present the following guidelines: + # 1) Good-Turing may be used if slope requirement satisfied and if + # unobserved symbols have been defined (TODO Clarify what the requirement + # is) + # 2) James-Stein approach may be used as an alternative + # 3) Dirichlet prior may be used in all other cases + + assert (np.sum(Counts) > 0) + assert (np.all(Counts.astype('int') == Counts)) + assert (n_additional_empty_bins >= 0) + Counts = Counts.astype('int') + + if isinstance(estimator, str): + estimator = estimator.upper().replace(' ', '') + + if np.isreal(estimator) or estimator in ('ML', 'PERKS', 'MINIMAX'): + if np.isreal(estimator): + alpha = estimator + elif estimator == 'PERKS': + alpha = 1.0 / (Counts.size + n_additional_empty_bins) + elif estimator == 'MINIMAX': + alpha = np.sqrt(np.sum(Counts)) / \ + (Counts.size + n_additional_empty_bins) + else: + alpha = 0 + Theta = (Counts + alpha) / \ + (1.0 * np.sum(Counts) + alpha * (Counts.size + n_additional_empty_bins)) + # Theta_0 is the probability mass assigned to each additional empty bin + if n_additional_empty_bins > 0: + Theta_0 = alpha / (1.0 * np.sum(Counts) + + alpha * (Counts.size + n_additional_empty_bins)) + else: + Theta_0 = 0 + + elif estimator == 'GOOD-TURING': + # TODO We could also add a Chen-Chao vocabulary size estimator (See + # Bhat Suma's thesis) + + # The following notation is based on Gale and Sampson (1995) + # Determine histogram of counts N_r (index r denotes count) + X = np.sort(Counts) + B = X[1:] != X[:-1] # Compute symbol change indicators + I = np.append(np.where(B), X.size - 1) # Obtain symbol change positions + N_r = np.zeros(X[I[-1]] + 1) + N_r[X[I]] = np.diff(np.append(-1, I)) # Compute run lengths + N_r[0] = 0 # Ensures that unobserved symbols do not interfere + + # Compute Z_r, a locally averaged version of N_r + R = np.where(N_r)[0] + Q = np.append(0, R[:-1]) + T = np.append(R[1:], 2 * R[-1] - Q[-1]) + Z_r = np.zeros_like(N_r) + Z_r[R] = N_r[R] / (0.5 * (T - Q)) + + # Fit least squares regression line to plot of log(Z_r) versus log(r) + x = np.log10(np.arange(1, Z_r.size)) + with np.errstate(invalid='ignore', divide='ignore'): + y = np.log10(Z_r[1:]) + x = x[np.isfinite(y)] + y = y[np.isfinite(y)] + m, c = np.linalg.lstsq(np.vstack([x, np.ones(x.size)]).T, y, + rcond=None)[0] + if m >= -1: + warnings.warn("Regression slope < -1 
requirement in linear " + "Good-Turing estimate not satisfied") + # Compute smoothed value of N_r based on interpolation + # We need to refer to SmoothedN_{r+1} for all observed values of r + SmoothedN_r = np.zeros(N_r.size + 1) + SmoothedN_r[1:] = 10 ** (np.log10(np.arange(1, SmoothedN_r.size)) * + m + c) + + # Determine threshold value of r at which to use smoothed values of N_r + # (SmoothedN_r), as apposed to straightforward N_r. + # Variance of Turing estimate + with np.errstate(invalid='ignore', divide='ignore'): + VARr_T = (np.arange(N_r.size) + 1) ** 2 * \ + (1.0 * np.append(N_r[1:], 0) / (N_r ** 2)) * \ + (1 + np.append(N_r[1:], 0) / N_r) + x = (np.arange(N_r.size) + 1) * 1.0 * np.append(N_r[1:], 0) / N_r + y = (np.arange(N_r.size) + 1) * \ + 1.0 * SmoothedN_r[1:] / (SmoothedN_r[:-1]) + assert (np.isinf(VARr_T[0]) or np.isnan(VARr_T[0])) + turing_is_sig_diff = np.abs(x - y) > 1.96 * np.sqrt(VARr_T) + assert (turing_is_sig_diff[0] == np.array(False)) + # NB: 0th element can be safely ignored, since always 0 + T = np.where(turing_is_sig_diff == np.array(False))[0] + if T.size > 1: + thresh_r = T[1] + # Use smoothed estimates from the first non-significant + # np.abs(SmoothedN_r-N_r) position onwards + SmoothedN_r[:thresh_r] = N_r[:thresh_r] + else: + # Use only non-smoothed estimates (except for SmoothedN_r[-1]) + SmoothedN_r[:-1] = N_r + + # Estimate probability of encountering one particular symbol among the + # objects observed r times, r>0 + p_r = np.zeros(N_r.size) + N = np.sum(Counts) + p_r[1:] = (np.arange(1, N_r.size) + 1) * \ + 1.0 * SmoothedN_r[2:] / (SmoothedN_r[1:-1] * N) + # Estimate probability of observing any unseen symbol + p_r[0] = 1.0 * N_r[1] / N + + # Assign probabilities to observed symbols + Theta = np.array([p_r[r] for r in Counts]) + Theta[Counts == 0] = 0 + + # Normalise probabilities for observed symbols, so that they sum to one + if np.any(Counts == 0) or n_additional_empty_bins > 0: + Theta = (1 - p_r[0]) * Theta / np.sum(Theta) + else: + warnings.warn("No unobserved outcomes specified. 
Disregarding the " + "probability mass allocated to any unobserved " + "outcomes.") + Theta = Theta / np.sum(Theta) + + # Divide p_0 among unobserved symbols + with np.errstate(invalid='ignore', divide='ignore'): + p_emptybin = p_r[0] / (np.sum(Counts == 0) + + n_additional_empty_bins) + Theta[Counts == 0] = p_emptybin + # Theta_0 is the probability mass assigned to each additional empty bin + if n_additional_empty_bins > 0: + Theta_0 = p_emptybin + else: + Theta_0 = 0 + + elif estimator == 'JAMES-STEIN': + Theta, _ = _estimate_probabilities(Counts, 'ML') + p_uniform = 1.0 / (Counts.size + n_additional_empty_bins) + with np.errstate(invalid='ignore', divide='ignore'): + Lambda = (1 - np.sum(Theta ** 2)) / \ + ((np.sum(Counts) - 1) * + (np.sum((p_uniform - Theta) ** 2) + + n_additional_empty_bins * p_uniform ** 2)) + + if Lambda > 1: + Lambda = 1 + elif Lambda < 0: + Lambda = 0 + elif np.isnan(Lambda): + Lambda = 1 + + Theta = Lambda * p_uniform + (1 - Lambda) * Theta + # Theta_0 is the probability mass assigned to each additional empty bin + if n_additional_empty_bins > 0: + Theta_0 = Lambda * p_uniform + else: + Theta_0 = 0 + else: + raise ValueError("invalid value specified for estimator") + + return Theta, Theta_0 + + +def _remove_counts_at_fill_value(Counts, Alphabet, fill_value): + I = np.any(np.atleast_2d(Alphabet) == fill_value, axis=0) + if np.any(I): + Counts = Counts[~I] + Alphabet = Alphabet.T[~I].T + return (Counts, Alphabet) + + +def _cartesian_product_apply(X, Y, function, Alphabet_X=None, Alphabet_Y=None): + assert X.ndim > 0 and Y.ndim > 0 + assert X.size > 0 and Y.size > 0 + if Alphabet_X is not None or Alphabet_Y is not None: + assert Alphabet_X.ndim > 0 and Alphabet_Y.ndim > 0 + assert Alphabet_X.size > 0 and Alphabet_Y.size > 0 + + H = np.empty(np.append(X.shape[:-1], Y.shape[:-1]).astype("int")) + if H.ndim > 0: + H[:] = np.NaN + else: + H = np.float64(np.NaN) + + X = np.reshape(X, (-1, X.shape[-1])) + Y = np.reshape(Y, (-1, Y.shape[-1])) + if Alphabet_X is not None or Alphabet_Y is not None: + Alphabet_X = np.reshape(Alphabet_X, (-1, Alphabet_X.shape[-1])) + Alphabet_Y = np.reshape(Alphabet_Y, (-1, Alphabet_Y.shape[-1])) + orig_shape_H = H.shape + H = np.reshape(H, (-1, 1)) + + n = 0 + for i in range(X.shape[0]): + for j in range(Y.shape[0]): + if Alphabet_X is not None or Alphabet_Y is not None: + H[n] = function(X[i], Y[j], Alphabet_X[i], Alphabet_Y[j]) + else: + H[n] = function(X[i], Y[j]) + n = n + 1 + + # Reverse re-shaping + H = np.reshape(H, orig_shape_H) + + return H + + +def _verify_alphabet_sufficiently_large(X, Alphabet, fill_value): + assert not np.any(X == np.array(None)) + assert not np.any(Alphabet == np.array(None)) + for i in range(X.shape[0]): + I = X[i] != fill_value + J = Alphabet[i] != fill_value + # NB: This causes issues when both arguments contain None. But it is + # always called after observations have all been mapped to integers. 
+ if np.setdiff1d(X[i, I], Alphabet[i, J]).size > 0: + raise ValueError( + "provided alphabet does not contain all observed " "outcomes" + ) + + +def _vstack_pad(arrays, fill_value): + max_length = max([A.shape[-1] for A in arrays]) + arrays = [ + np.append( + A, + np.tile( + fill_value, + np.append(A.shape[:-1], max_length - A.shape[-1]).astype(int), + ), + ) + for A in arrays + ] + return np.vstack(arrays) diff --git a/bamt/core/nodes/child_nodes/__init__.py b/bamt/external/pyitlib/__init__.py similarity index 100% rename from bamt/core/nodes/child_nodes/__init__.py rename to bamt/external/pyitlib/__init__.py diff --git a/bamt/log.py b/bamt/log.py new file mode 100644 index 0000000..74d65fc --- /dev/null +++ b/bamt/log.py @@ -0,0 +1,67 @@ +import logging +import logging.config +import os + + +class BamtLogger: + def __init__(self): + self.file_handler = False + self.enabled = True + self.base = os.path.join(os.path.dirname(os.path.abspath(__file__))) + self.log_file_path = os.path.join(self.base, "logging.conf") + logging.config.fileConfig(self.log_file_path) + + self.loggers = dict( + logger_builder=logging.getLogger("builder"), + logger_network=logging.getLogger("network"), + logger_preprocessor=logging.getLogger("preprocessor"), + logger_nodes=logging.getLogger("nodes"), + logger_display=logging.getLogger("display"), + ) + + def has_handler(self, logger, handler_type): + """Check if a logger has a handler of a specific type.""" + return any(isinstance(handler, handler_type) for handler in logger.handlers) + + def remove_handler_type(self, logger, handler_type): + """Remove all handlers of a specific type from a logger.""" + for handler in logger.handlers[:]: + if isinstance(handler, handler_type): + logger.removeHandler(handler) + + def switch_console_out(self, value: bool): + """ + Turn off console output from logger. 
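Assumed usage of the switches defined on BamtLogger, via the module-level bamt_logger instance created at the end of this file (the log file name is illustrative):

    from bamt.log import bamt_logger

    bamt_logger.switch_console_out(False)                # silence console handlers
    bamt_logger.switch_file_out(True, "bamt_run.log")    # append all messages to a file instead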
+ """ + assert isinstance(value, bool) + handler_class = logging.StreamHandler if not value else logging.NullHandler + + for logger in self.loggers.values(): + if self.has_handler(logger, handler_class): + self.remove_handler_type(logger, handler_class) + logger.addHandler(logging.NullHandler() if not value else logging.root.handlers[0]) + + def switch_file_out(self, value: bool, log_file: str): + """ + Send all messages in file + """ + assert isinstance(value, bool) + + for logger in self.loggers.values(): + if value: + file_handler = logging.FileHandler(log_file, mode="a") + file_handler.setFormatter(logging.root.handlers[0].formatter) + logger.addHandler(file_handler) + else: + self.remove_handler_type(logger, logging.FileHandler) + + +bamt_logger = BamtLogger() + +( + logger_builder, + logger_network, + logger_preprocessor, + logger_nodes, + logger_display, +) = list(bamt_logger.loggers.values()) diff --git a/bamt/logging.conf b/bamt/logging.conf new file mode 100644 index 0000000..87663ef --- /dev/null +++ b/bamt/logging.conf @@ -0,0 +1,51 @@ +[loggers] +keys=root, preprocessor, builder, nodes, network, display + +[handlers] +keys=consoleHandler + +[formatters] +keys=simpleFormatter + +[logger_root] +level=INFO +handlers=consoleHandler + +[logger_preprocessor] +level=INFO +qualname=preprocessor +handlers=consoleHandler +propagate=0 + +[logger_network] +level=INFO +qualname=network +handlers=consoleHandler +propagate=0 + +[logger_builder] +level=INFO +qualname=builder +handlers=consoleHandler +propagate=0 + +[logger_nodes] +level=INFO +qualname=nodes +handlers=consoleHandler +propagate=0 + +[logger_display] +level=INFO +qualname=display +handlers=consoleHandler +propagate=0 + +[handler_consoleHandler] +class=StreamHandler +level=INFO +formatter=simpleFormatter +args=(sys.stdout,) + +[formatter_simpleFormatter] +format=%(asctime)s | %(levelname)-8s | %(filename)s-%(funcName)s-%(lineno)04d | %(message)s diff --git a/bamt/mi_entropy_gauss.py b/bamt/mi_entropy_gauss.py new file mode 100644 index 0000000..48b1ef9 --- /dev/null +++ b/bamt/mi_entropy_gauss.py @@ -0,0 +1,310 @@ +import math +import sys +from copy import copy +from typing import List + +import numpy as np +import pandas as pd + +from bamt.external.pyBN.utils.independence_tests import mutual_information, entropy +from bamt.preprocess.discretization import get_nodes_type +from bamt.preprocess.graph import edges_to_dict +from bamt.preprocess.numpy_pandas import loc_to_DataFrame + + +def query_filter(data: pd.DataFrame, columns: List, values: List): + """ + Filters the data according to the column-value list + Arguments + ---------- + *data* : pandas.DataFrame + Returns + ------- + *data_trim* : pandas.DataFrame + Filtered data. + Effects + ------- + None + """ + data_copy = copy(data) + filter_str = "`" + str(columns[0]) + "`" + " == " + str(values[0]) + if len(columns) == 1: + return data_copy.query(filter_str) + else: + for i in range(1, len(columns)): + filter_str += " & " + "`" + str(columns[i]) + "`" + " == " + str(values[i]) + data_trim = data_copy.query(filter_str) + return data_trim + + +def entropy_gauss(pd_data): + """ + Calculate entropy for Gaussian multivariate distributions. + Arguments + ---------- + *data* : pd.DataFrame + Returns + ------- + *entropy* : a float + The entropy for Gaussian multivariate distributions. 
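The closed form this function targets is the differential entropy of an N-dimensional Gaussian, H = 0.5 * ln((2*pi*e)^N * det(Sigma)). A quick self-contained check on synthetic data (sample size and seed are arbitrary):

    import math
    import numpy as np

    rng = np.random.default_rng(0)
    sample = rng.normal(size=(10_000, 2))      # two roughly independent unit-variance columns
    cov = np.cov(sample.T)
    n = cov.shape[0]
    h = 0.5 * (n * (1 + math.log(2 * math.pi)) + math.log(np.linalg.det(cov)))
    # h is close to 1 + ln(2*pi) ~= 2.838, the value for an identity covariance.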
+ Effects + ------- + None + """ + if not isinstance(pd_data, pd.Series): + data = copy(pd_data).values.T + else: + data = np.array(copy(pd_data)).T + if data.size == 0: + return 0.0 + flag_row = False + flag_col = False + + if isinstance(data[0], np.float64): + flag_row = True + elif (len(data[0]) < 2) | (data.ndim < 2): + flag_row = True + elif data.shape[0] < 2: + flag_row = True + if isinstance(copy(data).T[0], np.float64): + flag_col = True + elif (len(copy(data).T) < 2) | (copy(data).T.ndim < 2): + flag_col = True + elif data.shape[1] < 2: + flag_col = True + + if flag_row & flag_col: + return sys.float_info.max + elif flag_row | flag_col: + var = np.var(data) + if var > 1e-16: + return 0.5 * (1 + math.log(var * 2 * math.pi)) + else: + return sys.float_info.min + else: + var = np.linalg.det(np.cov(data)) + N = var.ndim + if var > 1e-16: + return 0.5 * (N * (1 + math.log(2 * math.pi)) + math.log(var)) + else: + return sys.float_info.min + + +def entropy_all(data, method="MI"): + """ + For one varibale, H(X) is equal to the following: + -1 * sum of p(x) * log(p(x)) + For two variables H(X|Y) is equal to the following: + sum over x,y of p(x,y)*log(p(y)/p(x,y)) + For three variables, H(X|Y,Z) is equal to the following: + -1 * sum of p(x,y,z) * log(p(x|y,z)), + where p(x|y,z) = p(x,y,z)/p(y)*p(z) + Arguments + ---------- + *data* : pd.DataFrame + Returns + ------- + *H* : entropy value""" + if isinstance(data, np.ndarray): + return entropy_all(loc_to_DataFrame(data), method=method) + elif isinstance(data, pd.Series): + return entropy_all(pd.DataFrame(data), method) + elif isinstance(data, pd.DataFrame): + nodes_type = get_nodes_type(data) + column_disc = [] + for key in nodes_type: + if nodes_type[key] == "disc": + column_disc.append(key) + column_cont = [] + for key in nodes_type: + if nodes_type[key] == "cont": + column_cont.append(key) + data_disc = data[column_disc] + data_cont = data[column_cont] + + if len(column_cont) == 0: + return entropy(data_disc.values) + elif len(column_disc) == 0: + return entropy_gauss(data_cont) + else: + H_disc = entropy(data_disc.values) + dict_comb = {} + comb_prob = {} + for i in range(len(data_disc)): + row = data_disc.iloc[i] + comb = "" + for _, val in row.items(): + comb = comb + str(val) + ", " + if comb not in dict_comb: + dict_comb[comb] = row + comb_prob[comb] = 1 + else: + comb_prob[comb] += 1 + + H_cond = 0.0 + for key in list(dict_comb.keys()): + filtered_data = query_filter(data, column_disc, list(dict_comb[key])) + filtered_data = filtered_data[column_cont] + if comb_prob[key] == 1: + if (method == "BIC") | (method == "AIC"): + H_cond += ( + comb_prob[key] + / len(data_disc) + * entropy_gauss(data[column_cont]) + ) + else: + H_cond += comb_prob[key] / len(data_disc) * sys.float_info.max + else: + H_cond += ( + comb_prob[key] / len(data_disc) * entropy_gauss(filtered_data) + ) + if (method == "BIC") | (method == "AIC"): + if H_cond > entropy_gauss(data[column_cont]): + H_cond = entropy_gauss(data[column_cont]) + + return H_disc + H_cond + + +def entropy_cond(data, column_cont, column_disc, method): + data_cont = data[column_cont] + data_disc = data[column_disc] + H_gauss = entropy_gauss(data_cont) + H_cond = 0.0 + + dict_comb = {} + comb_prob = {} + for i in range(len(data_disc)): + row = data_disc.iloc[i] + comb = "" + for _, val in row.items(): + comb = comb + str(val) + ", " + if comb not in dict_comb: + dict_comb[comb] = row + comb_prob[comb] = 1 + else: + comb_prob[comb] += 1 + + for key in list(dict_comb.keys()): + filtered_data = 
query_filter(data, column_disc, list(dict_comb[key])) + filtered_data = filtered_data[column_cont] + if comb_prob[key] == 1: + if (method == "BIC") | (method == "AIC"): + H_cond += comb_prob[key] / len(data_disc) * H_gauss + else: + H_cond += comb_prob[key] / len(data_disc) * sys.float_info.max + else: + H_cond += comb_prob[key] / len(data_disc) * entropy_gauss(filtered_data) + if (method == "BIC") | (method == "AIC"): + if H_cond > H_gauss: + return H_gauss + else: + return H_cond + return H_cond + + +def mi_gauss(data, method="MI", conditional=False): + """ + Calculate Mutual Information based on entropy. + In the case of continuous uses entropy for Gaussian multivariate distributions. + Arguments + ---------- + *data* : pandas.DataFrame + Returns + ------- + *MI* : a float + The Mutual Information + Effects + ------- + None + Notes + ----- + - Need to preprocess data with code_categories + """ + if isinstance(data, np.ndarray): + return mi_gauss(loc_to_DataFrame(data), method, conditional) + elif isinstance(data, pd.Series): + return mi_gauss(pd.DataFrame(data)) + elif isinstance(data, pd.DataFrame): + nodes_type = get_nodes_type(data) + if conditional: + # Hill-Climbing does not use conditional MI, but other algorithms may require it + # At the moment it counts on condition of the last row in the list + # of columns + print("Warning: conditional == True") + nodes_type_trim = copy(nodes_type) + data_trim = copy(data) + list_keys = list(nodes_type_trim.keys) + del nodes_type_trim[list_keys[-1]] + del data_trim[list_keys[-1]] + return mi_gauss(data, nodes_type, method) - mi_gauss( + data_trim, nodes_type, method + ) + else: + column_disc = [] + for key in nodes_type: + if nodes_type[key] == "disc": + column_disc.append(key) + column_cont = [] + for key in nodes_type: + if nodes_type[key] == "cont": + column_cont.append(key) + data_disc = data[column_disc] + data_cont = data[column_cont] + + H_gauss = 0.0 + H_cond = 0.0 + + if len(column_cont) == 0: + return mutual_information(data_disc.values, conditional=False) + elif len(column_disc) == 0: + if len(column_cont) == 1: + return entropy_gauss(data_cont) + else: + data_last = data_cont[[column_cont[-1]]] + column_cont_trim = copy(column_cont) + del column_cont_trim[-1] + data_cont_trim = data[column_cont_trim] + + H_gauss = ( + entropy_gauss(data_last) + + entropy_gauss(data_cont_trim) + - entropy_gauss(data_cont) + ) + H_gauss = min( + H_gauss, entropy_gauss(data_last), entropy_gauss(data_cont_trim) + ) + # H_gauss = entropy_gauss(data_cont) + H_cond = 0.0 + else: + H_gauss = entropy_gauss(data_cont) + H_cond = entropy_cond(data, column_cont, column_disc, method) + + return H_gauss - H_cond + + +def mi(edges: list, data: pd.DataFrame, method="MI"): + """ + Bypasses all nodes and summarizes scores, + taking into account the parent-child relationship. 
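A condensed sketch of the decomposition this function implements, assuming edges are (parent, child) tuples, data is a pandas DataFrame, and score_fn is a per-family score such as mi_gauss (the helper name is made up):

    def family_score(data, score_fn, edges):
        parents = {}
        for parent, child in edges:
            parents.setdefault(child, []).append(parent)
        total = 0.0
        for child, pars in parents.items():
            total += score_fn(data[[child] + pars])        # one term per child-plus-parents family
        for col in set(data.columns) - set(parents):
            total += score_fn(data[[col]])                 # one term per node with no incoming edge
        return total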
+ Arguments + ---------- + *edges* : list + *data* : pd.DataFrame + Returns + ------- + *sum_score* : float + Effects + ------- + None + """ + parents_dict = edges_to_dict(edges) + sum_score = 0.0 + nodes_with_edges = parents_dict.keys() + for var in nodes_with_edges: + child_parents = [var] + child_parents.extend(parents_dict[var]) + sum_score += mi_gauss(copy(data[child_parents]), method) + nodes_without_edges = list(set(data.columns).difference(set(nodes_with_edges))) + for var in nodes_without_edges: + sum_score += mi_gauss(copy(data[var]), method) + return sum_score diff --git a/bamt/models/__init__.py b/bamt/models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/models/probabilistic_structural_models/__init__.py b/bamt/models/probabilistic_structural_models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/models/probabilistic_structural_models/bayesian_network.py b/bamt/models/probabilistic_structural_models/bayesian_network.py deleted file mode 100644 index e9cf91b..0000000 --- a/bamt/models/probabilistic_structural_models/bayesian_network.py +++ /dev/null @@ -1,20 +0,0 @@ -from abc import abstractmethod - -from .probabilistic_structural_model import ProbabilisticStructuralModel - - -class BayesianNetwork(ProbabilisticStructuralModel): - def __init__(self): - super().__init__() - - @abstractmethod - def fit(self): - pass - - @abstractmethod - def predict(self): - pass - - @abstractmethod - def sample(self): - pass diff --git a/bamt/models/probabilistic_structural_models/composite_bayesian_network.py b/bamt/models/probabilistic_structural_models/composite_bayesian_network.py deleted file mode 100644 index 486eaf7..0000000 --- a/bamt/models/probabilistic_structural_models/composite_bayesian_network.py +++ /dev/null @@ -1,18 +0,0 @@ -from .bayesian_network import BayesianNetwork - - -class CompositeBayesianNetwork(BayesianNetwork): - def __init__(self): - super().__init__() - - def fit(self): - pass - - def predict(self): - pass - - def sample(self): - pass - - def __str__(self): - return "Composite Bayesian Network" diff --git a/bamt/models/probabilistic_structural_models/continuous_bayesian_network.py b/bamt/models/probabilistic_structural_models/continuous_bayesian_network.py deleted file mode 100644 index 618c4dc..0000000 --- a/bamt/models/probabilistic_structural_models/continuous_bayesian_network.py +++ /dev/null @@ -1,18 +0,0 @@ -from .bayesian_network import BayesianNetwork - - -class ContinuousBayesianNetwork(BayesianNetwork): - def __init__(self): - super().__init__() - - def fit(self): - pass - - def predict(self): - pass - - def sample(self): - pass - - def __str__(self): - return "Continuous Bayesian Network" diff --git a/bamt/models/probabilistic_structural_models/discrete_bayesian_network.py b/bamt/models/probabilistic_structural_models/discrete_bayesian_network.py deleted file mode 100644 index b7654a6..0000000 --- a/bamt/models/probabilistic_structural_models/discrete_bayesian_network.py +++ /dev/null @@ -1,18 +0,0 @@ -from .bayesian_network import BayesianNetwork - - -class DiscreteBayesianNetwork(BayesianNetwork): - def __init__(self): - super().__init__() - - def fit(self): - pass - - def predict(self): - pass - - def sample(self): - pass - - def __str__(self): - return "Discrete Bayesian Network" diff --git a/bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py b/bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py deleted file mode 100644 index c4256df..0000000 --- 
a/bamt/models/probabilistic_structural_models/hybrid_bayesian_network.py +++ /dev/null @@ -1,18 +0,0 @@ -from .bayesian_network import BayesianNetwork - - -class HybridBayesianNetwork(BayesianNetwork): - def __init__(self): - super().__init__() - - def fit(self): - pass - - def predict(self): - pass - - def sample(self): - pass - - def __str__(self): - return "Hybrid Bayesian Network" diff --git a/bamt/models/probabilistic_structural_models/probabilistic_structural_model.py b/bamt/models/probabilistic_structural_models/probabilistic_structural_model.py deleted file mode 100644 index b2b3b56..0000000 --- a/bamt/models/probabilistic_structural_models/probabilistic_structural_model.py +++ /dev/null @@ -1,14 +0,0 @@ -from abc import ABC, abstractmethod - - -class ProbabilisticStructuralModel(ABC): - def __init__(self): - pass - - @abstractmethod - def fit(self): - pass - - @abstractmethod - def predict(self): - pass diff --git a/bamt/networks/__init__.py b/bamt/networks/__init__.py new file mode 100644 index 0000000..7f8df8f --- /dev/null +++ b/bamt/networks/__init__.py @@ -0,0 +1,6 @@ +from bamt.networks.base import BaseNetwork +from bamt.networks.big_brave_bn import BigBraveBN +from bamt.networks.composite_bn import CompositeBN +from bamt.networks.continuous_bn import ContinuousBN +from bamt.networks.discrete_bn import DiscreteBN +from bamt.networks.hybrid_bn import HybridBN diff --git a/bamt/networks/base.py b/bamt/networks/base.py index e69de29..8090918 100644 --- a/bamt/networks/base.py +++ b/bamt/networks/base.py @@ -0,0 +1,1002 @@ +import json +import os.path as path +import random +import re +from copy import deepcopy +from typing import Dict, Tuple, List, Callable, Optional, Type, Union, Any, Sequence + +import numpy as np +import pandas as pd +from joblib import Parallel, delayed +from pgmpy.estimators import K2Score +from sklearn.preprocessing import LabelEncoder +from tqdm import tqdm + +import bamt.builders as builders +from bamt.builders.builders_base import ParamDict +from bamt.builders.evo_builder import EvoStructureBuilder +from bamt.builders.hc_builder import HCStructureBuilder +from bamt.display import plot_, get_info_ +from bamt.external.pyitlib.DiscreteRandomVariableUtils import ( + entropy, + information_mutual, + information_mutual_conditional, + entropy_conditional, +) +from bamt.log import logger_network +from bamt.nodes.base import BaseNode +from bamt.utils import GraphUtils, serialization_utils, check_utils + + +class BaseNetwork(object): + """ + Base class for Bayesian Network + """ + + def __init__(self): + """ + nodes: a list of nodes instances + edges: a list of edges + distributions: dict + """ + self.sf_name = None + self.type = "Abstract" + self._allowed_dtypes = ["Abstract"] + self.nodes = [] + self.edges = [] + self.weights = {} + self.descriptor = {"types": {}, "signs": {}} + self.distributions = {} + self.has_logit = False + self.use_mixture = False + self.encoders = {} + + @property + def nodes_names(self) -> List[str]: + return [node.name for node in self.nodes] + + def __getitem__(self, node_name: str) -> Type[BaseNode]: + if node_name in self.nodes_names: + index = self.nodes_names.index(node_name) + return self.nodes[index] + + def validate(self, descriptor: Dict[str, Dict[str, str]]) -> bool: + types = descriptor["types"] + return ( + True if all([a in self._allowed_dtypes for a in types.values()]) else False + ) + + def update_descriptor(self): + new_nodes_names = [node.name for node in self.nodes] + self.descriptor["types"] = { + node: type + for 
node, type in self.descriptor["types"].items() + if node in new_nodes_names + } + if "cont" in self.descriptor["types"].values(): + self.descriptor["signs"] = { + node: sign + for node, sign in self.descriptor["signs"].items() + if node in new_nodes_names + } + + def add_nodes(self, descriptor: Dict[str, Dict[str, str]]): + """ + Function for initializing nodes in Bayesian Network + descriptor: dict with types and signs of nodes + """ + if not self.validate(descriptor=descriptor): + if not self.type == "Hybrid" or not self.type == "Composite": + logger_network.error( + f"{self.type} BN does not support {'discrete' if self.type == 'Continuous' else 'continuous'} data" + ) + return + else: + logger_network.error( + f"Descriptor validation failed due to wrong type of column(s)." + ) + return + elif ["Abstract"] in self._allowed_dtypes: + return None + self.descriptor = descriptor + worker_1 = builders.builders_base.VerticesDefiner(descriptor, regressor=None) + + # first stage + worker_1.skeleton["V"] = worker_1.vertices + # second stage + worker_1.overwrite_vertex( + has_logit=False, + use_mixture=self.use_mixture, + classifier=None, + regressor=None, + ) + self.nodes = worker_1.vertices + + def add_edges( + self, + data: pd.DataFrame, + scoring_function: Union[Tuple[str, Callable], Tuple[str]] = ("K2", K2Score), + progress_bar: bool = True, + classifier: Optional[object] = None, + regressor: Optional[object] = None, + params: Optional[ParamDict] = None, + optimizer: str = "HC", + **kwargs, + ): + """ + Base function for Structure learning + scoring_function: tuple with the following format (NAME, scoring_function) or (NAME, ) + Params: + init_edges: list of tuples, a graph to start learning with + remove_init_edges: allows changes in a model defined by user + white_list: list of allowed edges + """ + if not self.has_logit and check_utils.is_model(classifier): + logger_network.error("Classifiers dict with use_logit=False is forbidden.") + return None + + # params validation + if params: + # init_edges validation + if not self.has_logit and "init_edges" in params.keys(): + type_map = np.array( + [ + [ + self.descriptor["types"][node1], + self.descriptor["types"][node2], + ] + for node1, node2 in params["init_edges"] + ] + ) + failed = (type_map[:, 0] == "cont") & ( + (type_map[:, 1] == "disc") | (type_map[:, 1] == "disc_num") + ) + if sum(failed): + logger_network.warning( + f"Edges between continuous nodes and disc nodes are forbidden (has_logit = {self.has_logit}), " + f"they will be ignored. 
Indexes: {np.where(failed)[0]}" + ) + params["init_edges"] = [ + params["init_edges"][i] + for i in range(len(params["init_edges"])) + if i not in np.where(failed)[0] + ] + + if not self.validate(descriptor=self.descriptor): + logger_network.error( + f"{self.type} BN does not support {'discrete' if self.type == 'Continuous' else 'continuous'} data" + ) + return None + if optimizer == "HC": + worker = HCStructureBuilder( + data=data, + descriptor=self.descriptor, + scoring_function=scoring_function, + has_logit=self.has_logit, + use_mixture=self.use_mixture, + regressor=regressor, + ) + elif optimizer == "Evo": + worker = EvoStructureBuilder( + data=data, + descriptor=self.descriptor, + has_logit=self.has_logit, + use_mixture=self.use_mixture, + regressor=regressor, + ) + else: + logger_network.error(f"Optimizer {optimizer} is not supported") + return None + + self.sf_name = scoring_function[0] + + worker.build( + data=data, + params=params, + classifier=classifier, + regressor=regressor, + progress_bar=progress_bar, + **kwargs, + ) + + # update family + self.nodes = worker.skeleton["V"] + self.edges = worker.skeleton["E"] + + def calculate_weights(self, discretized_data: pd.DataFrame): + """ + Provide calculation of link strength according mutual information between node and its parent(-s) values. + """ + import bamt.utils.GraphUtils as gru + + data_descriptor = gru.nodes_types(discretized_data) + if not all([i in ["disc", "disc_num"] for i in data_descriptor.values()]): + logger_network.error( + f"calculate_weghts() method deals only with discrete data. Continuous data: " + + f"{[col for col, type in data_descriptor.items() if type not in ['disc', 'disc_num']]}" + ) + if not self.edges: + logger_network.error( + "Bayesian Network hasn't fitted yet. Please add edges with add_edges() method" + ) + if not self.nodes: + logger_network.error( + "Bayesian Network hasn't fitted yet. Please add nodes with add_nodes() method" + ) + weights = dict() + + for node in self.nodes: + parents = node.cont_parents + node.disc_parents + if parents is None: + continue + y = discretized_data[node.name].values + if len(parents) == 1: + x = discretized_data[parents[0]].values + ls_true = information_mutual(X=y, Y=x) + entropy_value = entropy(X=y) + weight = ls_true / entropy_value + weights[(parents[0], node.name)] = weight + else: + for parent_node in parents: + x = discretized_data[parent_node].values + other_parents = [tmp for tmp in parents if tmp != parent_node] + z = list() + for other_parent in other_parents: + z.append(list(discretized_data[other_parent].values)) + ls_true = np.average( + information_mutual_conditional( + x=y, y=x, z=z, cartesian_product=True + ) + ) + entropy_value = ( + np.average( + entropy_conditional(X=y, Y=z, cartesian_product=True) + ) + + 1e-8 + ) + weight = ls_true / entropy_value + weights[(parent_node, node.name)] = weight + self.weights = weights + + def set_nodes(self, nodes: List, info: Optional[Dict] = None): + """ + additional function to set nodes manually. User should be aware that + nodes must be a subclass of BaseNode. + Params: + nodes: dict with name and node (if a lot of nodes should be added) + info: descriptor + """ + if not info and not self.descriptor["types"]: + logger_network.error( + "In case of manual setting nodes user should set map for them as well." 
+ ) + return + self.nodes = [] + for node in nodes: + if issubclass(type(node), BaseNode): + self.nodes.append(node) + else: + logger_network.error(f"{node} is not an instance of {BaseNode}") + + if info: + self.descriptor = info + + def set_edges(self, edges: Optional[List[Sequence[str]]] = None): + """ + additional function to set edges manually. User should be aware that + nodes must be a subclass of BaseNode. + param: edges dict with name and node (if a lot of nodes should be added) + """ + + if not self.nodes: + logger_network.error("Graph without nodes") + self.edges = [] + for node1, node2 in edges: + if isinstance(node1, str) and isinstance(node2, str): + if self[node1] and self[node2]: + if ( + not self.has_logit + and self.descriptor["types"][node1] == "cont" + and self.descriptor["types"][node2] == "disc" + ): + logger_network.warning( + f"Restricted edge detected (has_logit=False) : [{node1}, {node2}]" + ) + continue + else: + self.edges.append((node1, node2)) + else: + logger_network.error(f"Unknown nodes : [{node1}, {node2}]") + continue + else: + logger_network.error( + f"Unknown node(s) type: [{node1.__class__}, {node2.__class__}]" + ) + continue + self.update_descriptor() + + def set_structure( + self, + info: Optional[Dict] = None, + nodes: Optional[List] = None, + edges: Optional[List[Sequence[str]]] = None, + ): + """ + Function to set structure manually + info: Descriptor + nodes, edges: + """ + if nodes and (info or (self.descriptor["types"] and self.descriptor["signs"])): + if ( + any("mixture" in node.type.lower() for node in nodes) + and not self.use_mixture + ): + logger_network.error("Compatibility error: use mixture.") + return + if ( + any("logit" in node.type.lower() for node in nodes) + and not self.has_logit + ): + logger_network.error("Compatibility error: has logit.") + return + self.set_nodes(nodes=nodes, info=info) + if isinstance(edges, list): + if not self.nodes: + logger_network.error("Nodes/info detection failed.") + return + + builder = builders.builders_base.VerticesDefiner( + descriptor=self.descriptor, regressor=None + ) # init worker + builder.skeleton["V"] = builder.vertices # 1 stage + if len(edges) != 0: + # set edges and register members + self.set_edges(edges=edges) + builder.skeleton["E"] = self.edges + builder.get_family() + + builder.overwrite_vertex( + has_logit=self.has_logit, + use_mixture=self.use_mixture, + classifier=None, + regressor=None, + ) + + self.set_nodes(nodes=builder.skeleton["V"]) + + def _param_validation(self, params: Dict[str, Any]) -> bool: + if all(self[i] for i in params.keys()): + for name, info in params.items(): + try: + self[name].choose(node_info=info, pvals=[]) + except Exception as ex: + logger_network.error("Validation failed", exc_info=ex) + return False + return True + else: + logger_network.error("Param validation failed due to unknown nodes.") + return False + + def set_parameters(self, parameters: Dict): + if not self.nodes: + logger_network.error("Failed on search of BN's nodes.") + + self.distributions = parameters + + for node, data in self.distributions.items(): + if "hybcprob" in data.keys(): + if "mixture" in self[node].type.lower(): + continue + else: + if "gaussian" in self[node].type.lower(): + model_type = "regressor" + else: + model_type = "classifier" + + model = None + for v in data["hybcprob"].values(): + if v[model_type]: + model = v[model_type] + break + else: + continue + if not model: + logger_network.warning( + f"Classifier/regressor for {node} hadn't been used." 
+ ) + + self[node].type = re.sub( + r"\([\s\S]*\)", f"({model})", self[node].type + ) + else: + if data.get("serialization", False): + regressor = data.get("regressor", None) + classifier = data.get("classifier", None) + self[node].type = re.sub( + r"\([\s\S]*\)", f"({regressor or classifier})", self[node].type + ) + else: + self[node].type = re.sub( + r"\([\s\S]*\)", f"({None})", self[node].type + ) + + @staticmethod + def _save_to_file(outdir: str, data: Union[dict, list]): + """ + Function to save data to json file + :param outdir: output directory + :param data: dictionary to be saved + """ + if not outdir.endswith(".json"): + raise TypeError( + f"Unappropriated file format. Expected: .json. Got: {path.splitext(outdir)[-1]}" + ) + with open(outdir, "w+") as out: + json.dump(data, out) + return True + + def save_params(self, outdir: str): + """ + Function to save BN params to json file + outdir: output directory + """ + return self._save_to_file(outdir, self.distributions) + + def save_structure(self, outdir: str): + """ + Function to save BN edges to json file + outdir: output directory + """ + return self._save_to_file(outdir, self.edges) + + def save(self, bn_name, models_dir: str = "models_dir"): + """ + Function to save the whole BN to json file. + + :param bn_name: unique name of bn user want to save. It will be used as file name (e.g. bn_name.json). + :param models_dir: if picklization is broken, joblib will serialize models in compressed files + in models directory. + + :return: saving status. + """ + distributions = deepcopy(self.distributions) + new_weights = {str(key): self.weights[key] for key in self.weights} + + to_serialize = {} + # separate logit and gaussian nodes from distributions to serialize bn's models + for node_name in distributions.keys(): + if "Mixture" in self[node_name].type: + continue + if self[node_name].type.startswith("Gaussian"): + if not distributions[node_name]["regressor"]: + continue + if ( + "Gaussian" in self[node_name].type + or "Logit" in self[node_name].type + or "ConditionalLogit" in self[node_name].type + ): + to_serialize[node_name] = [ + self[node_name].type, + distributions[node_name], + ] + + serializer = serialization_utils.ModelsSerializer( + bn_name=bn_name, models_dir=models_dir + ) + serialized_dist = serializer.serialize(to_serialize) + + for serialized_node in serialized_dist.keys(): + distributions[serialized_node] = serialized_dist[serialized_node] + + outdict = { + "info": self.descriptor, + "edges": self.edges, + "parameters": distributions, + "weights": new_weights, + } + return self._save_to_file(f"{bn_name}.json", outdict) + + def load(self, + input_data: Union[str, Dict], + models_dir: str = "/"): + """ + Function to load the whole BN from json file. + :param input_data: input path to json file with bn. + :param models_dir: directory with models. + + :return: loading status. + """ + if isinstance(input_data, str): + with open(input_data) as f: + input_dict = json.load(f) + elif isinstance(input_data, dict): + input_dict = deepcopy(input_data) + else: + logger_network.error(f"Unknown input type: {type(input_data)}") + return + + self.add_nodes(input_dict["info"]) + self.set_structure(edges=input_dict["edges"]) + + # check compatibility with father network. + if not self.use_mixture: + for node_data in input_dict["parameters"].values(): + if "hybcprob" not in node_data.keys(): + continue + else: + # Since we don't have information about types of nodes, we + # should derive it from parameters. 
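# For orientation while reading load(): a file written by save() above has roughly
# this layout (node names and values are illustrative only):
# {
#     "info": {"types": {"A": "disc", "B": "cont"}, "signs": {"B": "pos"}},
#     "edges": [["A", "B"]],
#     "parameters": {"A": {...}, "B": {...}},
#     "weights": {"('A', 'B')": 0.42}
# }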
+ if any( + list(node_keys.keys()) == ["covars", "mean", "coef"] + for node_keys in node_data["hybcprob"].values() + ): + logger_network.error( + f"This crucial parameter is not the same as father's parameter: use_mixture." + ) + return + + # check if edges before and after are the same.They can be different in + # the case when user sets forbidden edges. + if not self.has_logit: + if not all( + edges_before == [edges_after[0], edges_after[1]] + for edges_before, edges_after in zip(input_dict["edges"], self.edges) + ): + logger_network.error( + f"This crucial parameter is not the same as father's parameter: has_logit." + ) + return + + deserializer = serialization_utils.Deserializer(models_dir) + + to_deserialize = {} + # separate logit and gaussian nodes from distributions to deserialize bn's models + for node_name in input_dict["parameters"].keys(): + if "Mixture" in self[node_name].type: + continue + if self[node_name].type.startswith("Gaussian"): + if not input_dict["parameters"][node_name]["regressor"]: + continue + + if ( + "Gaussian" in self[node_name].type + or "Logit" in self[node_name].type + or "ConditionalLogit" in self[node_name].type + ): + if input_dict["parameters"][node_name].get("serialization", False): + to_deserialize[node_name] = [ + self[node_name].type, + input_dict["parameters"][node_name], + ] + elif "hybcprob" in input_dict["parameters"][node_name].keys(): + to_deserialize[node_name] = [ + self[node_name].type, + input_dict["parameters"][node_name], + ] + else: + continue + + deserialized_parameters = deserializer.apply(to_deserialize) + distributions = input_dict["parameters"].copy() + + for serialized_node in deserialized_parameters.keys(): + distributions[serialized_node] = deserialized_parameters[serialized_node] + + self.set_parameters(parameters=distributions) + + if input_dict.get("weights", False): + str_keys = list(input_dict["weights"].keys()) + tuple_keys = [eval(key) for key in str_keys] + weights = {} + for tuple_key in tuple_keys: + weights[tuple_key] = input_dict["weights"][str(tuple_key)] + self.weights = weights + return True + + def fit_parameters(self, data: pd.DataFrame, n_jobs: int = 1): + """ + Base function for parameter learning + """ + if data.isnull().values.any(): + logger_network.error("Dataframe contains NaNs.") + return + + if type(self).__name__ == "CompositeBN": + data = self._encode_categorical_data(data) + + # Turn all discrete values to str for learning algorithm + if "disc_num" in self.descriptor["types"].values(): + columns_names = [ + name + for name, t in self.descriptor["types"].items() + if t in ["disc_num"] + ] + data[columns_names] = data.loc[:, columns_names].astype("str") + + def worker(node): + return node.fit_parameters(data) + + results = Parallel(n_jobs=n_jobs)(delayed(worker)(node) for node in self.nodes) + + # code for debugging, do not remove + # results = [worker(node) for node in self.nodes] + + for result, node in zip(results, self.nodes): + self.distributions[node.name] = result + + def get_info(self, as_df: bool = True) -> Optional[pd.DataFrame]: + """Return a table with name, type, parents_type, parents_names""" + return get_info_(self, as_df) + + def sample( + self, + n: int, + models_dir: Optional[str] = None, + progress_bar: bool = True, + evidence: Optional[Dict[str, Union[str, int, float]]] = None, + as_df: bool = True, + predict: bool = False, + parall_count: int = 1, + filter_neg: bool = True, + seed: Optional[int] = None, + ) -> Union[None, pd.DataFrame, List[Dict[str, Union[str, int, float]]]]: + 
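# Minimal usage sketch for sample() (hypothetical node name and value; assumes `bn`
# is an instance of this class on which fit_parameters() has already been called):
# synthetic = bn.sample(1000, evidence={"Lithology": "sandstone"}, seed=7, as_df=True)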
""" + Sampling from Bayesian Network + n: int number of samples + evidence: values for nodes from user + parall_count: number of threads. Defaults to 1. + filter_neg: either filter negative vals or not. + seed: seed value to use for random number generator + """ + from joblib import Parallel, delayed + + random.seed(seed) + np.random.seed(seed) + + if not self.distributions.items(): + logger_network.error( + "Parameter learning wasn't done. Call fit_parameters method" + ) + return None + if evidence: + for node in self.nodes: + if (node.type == "Discrete") & (node.name in evidence.keys()): + if not (isinstance(evidence[node.name], str)): + evidence[node.name] = str(int(evidence[node.name])) + + def wrapper(): + output = {} + for node in self.nodes: + parents = node.cont_parents + node.disc_parents + if evidence and node.name in evidence.keys(): + output[node.name] = evidence[node.name] + else: + if not parents: + pvals = None + else: + if self.type == "Discrete": + pvals = [str(output[t]) for t in parents] + else: + pvals = [output[t] for t in parents] + + # If any nan from parents, sampling from node blocked. + if any(pd.isnull(pvalue) for pvalue in pvals): + output[node.name] = np.nan + continue + node_data = self.distributions[node.name] + if models_dir and ("hybcprob" in node_data.keys()): + for obj, obj_data in node_data["hybcprob"].items(): + if "serialization" in obj_data.keys(): + if "gaussian" in node.type.lower(): + model_type = "regressor" + else: + model_type = "classifier" + if ( + obj_data["serialization"] == "joblib" + and obj_data[f"{model_type}_obj"] + ): + new_path = ( + models_dir + + f"\\{node.name.replace(' ', '_')}\\{obj}.joblib.compressed" + ) + node_data["hybcprob"][obj][ + f"{model_type}_obj" + ] = new_path + if predict: + output[node.name] = node.predict(node_data, pvals=pvals) + else: + output[node.name] = node.choose(node_data, pvals=pvals) + return output + + if predict: + seq = [] + for _ in tqdm(range(n), position=0, leave=True): + result = wrapper() + seq.append(result) + elif progress_bar: + seq = Parallel(n_jobs=parall_count)( + delayed(wrapper)() for _ in tqdm(range(n), position=0, leave=True) + ) + else: + seq = Parallel(n_jobs=parall_count)(delayed(wrapper)() for _ in range(n)) + + # code for debugging, don't remove + # seq = [] + # for _ in tqdm(range(n), position=0, leave=True): + # result = wrapper() + # seq.append(result) + + seq_df = pd.DataFrame.from_dict(seq, orient="columns") + seq_df.dropna(inplace=True) + cont_nodes = [ + c.name + for c in self.nodes + if type(c).__name__ + not in ( + "DiscreteNode", + "LogitNode", + "CompositeDiscreteNode", + "ConditionalLogitNode", + ) + ] + positive_columns = [ + c for c in cont_nodes if self.descriptor["signs"][c] == "pos" + ] + if filter_neg: + seq_df = seq_df[(seq_df[positive_columns] >= 0).all(axis=1)] + seq_df.reset_index(inplace=True, drop=True) + + seq = seq_df.to_dict("records") + sample_output = pd.DataFrame.from_dict(seq, orient="columns") + + if as_df: + if type(self).__name__ == "CompositeBN": + sample_output = self._decode_categorical_data(sample_output) + return sample_output + else: + return seq + + def predict( + self, + test: pd.DataFrame, + parall_count: int = 1, + progress_bar: bool = True, + models_dir: Optional[str] = None, + ) -> Dict[str, Union[List[str], List[int], List[float]]]: + """ + Function to predict columns from given data. + Note that train data and test data must have different columns. + Both train and test datasets must be cleaned from NaNs. 
+ + Args: + test (pd.DataFrame): test dataset + parall_count (int, optional):number of threads. Defaults to 1. + progress_bar: verbose mode. + + Returns: + predicted data (dict): dict with column as key and predicted data as value + """ + if test.isnull().any().any(): + logger_network.error("Test data contains NaN values.") + return {} + + from joblib import Parallel, delayed + + def wrapper(bn, test: pd.DataFrame, columns: List[str], models_dir: str): + preds = {column_name: list() for column_name in columns} + + if len(test) == 1: + for i in range(test.shape[0]): + test_row = dict(test.iloc[i, :]) + for n, key in enumerate(columns): + try: + sample = bn.sample( + 1, + evidence=test_row, + predict=True, + progress_bar=False, + models_dir=models_dir, + ) + if sample.empty: + preds[key].append(np.nan) + continue + if bn.descriptor["types"][key] == "cont": + if (bn.descriptor["signs"][key] == "pos") & ( + sample.loc[0, key] < 0 + ): + # preds[key].append(np.nan) + preds[key].append(0) + else: + preds[key].append(sample.loc[0, key]) + else: + preds[key].append(sample.loc[0, key]) + except Exception as ex: + logger_network.error(ex) + preds[key].append(np.nan) + return preds + else: + logger_network.error("Wrapper for one row from pandas.DataFrame") + return {} + + columns = list(set(self.nodes_names) - set(test.columns.to_list())) + if not columns: + logger_network.error("Test data is the same as train.") + return {} + + preds = {column_name: list() for column_name in columns} + + if progress_bar: + processed_list = Parallel(n_jobs=parall_count)( + delayed(wrapper)(self, test.loc[[i]], columns, models_dir) + for i in tqdm(test.index, position=0, leave=True) + ) + else: + processed_list = Parallel(n_jobs=parall_count)( + delayed(wrapper)(self, test.loc[[i]], columns, models_dir) + for i in test.index + ) + + for i in range(test.shape[0]): + curr_pred = processed_list[i] + for n, key in enumerate(columns): + preds[key].append(curr_pred[key][0]) + + return preds + + def set_classifiers(self, classifiers: Dict[str, object]): + """ + Set classifiers for logit nodes. + classifiers: dict with node_name and Classifier + """ + if not self.has_logit: + logger_network.error("Logit nodes are forbidden.") + return None + + for node in self.nodes: + if "Logit" in node.type: + if node.name in classifiers.keys(): + node.classifier = classifiers[node.name] + node.type = re.sub( + r"\([\s\S]*\)", f"({type(node.classifier).__name__})", node.type + ) + else: + continue + + def set_regressor(self, regressors: Dict[str, object]): + """ + Set classifiers for gaussian nodes. + classifiers: dict with node_name and Classifier + """ + + for node in self.nodes: + if "Gaussian" in node.type: + if node.name in regressors.keys(): + node.regressor = regressors[node.name] + node.type = re.sub( + r"\([\s\S]*\)", f"({type(node.regressor).__name__})", node.type + ) + else: + continue + + def plot(self, output: str): + """ + Visualize a Bayesian Network. Result will be saved + in the parent directory in folder visualization_result. + output: str name of output file + """ + return plot_(output, self.nodes, self.edges) + + def markov_blanket(self, node_name, plot_to: Optional[str] = None): + """ + A method to get markov blanket of a node. + :param node_name: name of node + :param plot_to: directory to plot graph, the file must have html extension. 
+ + :return structure: json with {"nodes": [...], "edges": [...]} + """ + structure = GraphUtils.GraphAnalyzer(self).markov_blanket(node_name) + if plot_to: + plot_( + plot_to, [self[name] for name in structure["nodes"]], structure["edges"] + ) + return structure + + def find_family( + self, + node_name: str, + height: int = 1, + depth: int = 1, + with_nodes: Optional[List] = None, + plot_to: Optional[str] = None, + ): + """ + A method to get markov blanket of a node. + :param node_name: name of node + :param height: a number of layers up to node (its parents) that will be taken + :param depth: a number of layers down to node (its children) that will be taken + :param with_nodes: include nodes in return + :param plot_to: directory to plot graph, the file must have html extension. + + :return structure: json with {"nodes": [...], "edges": [...]} + """ + structure = GraphUtils.GraphAnalyzer(self).find_family( + node_name, height, depth, with_nodes + ) + + if plot_to: + plot_( + plot_to, [self[name] for name in structure["nodes"]], structure["edges"] + ) + + def fill_gaps(self, df: pd.DataFrame, **kwargs): + """ + Fill NaNs with sampled values. + + :param df: dataframe with NaNs + :param kwargs: the same params as bn.predict + + :return df, failed: filled DataFrame and list of failed rows (sometimes predict can return np.nan) + """ + if not self.distributions: + logger_network.error("To call this method you must train parameters.") + + # create a mimic row to get a dataframe from iloc method + list = [np.nan for _ in range(df.shape[1])] + df.loc["mimic"] = list + + def fill_row(df, i): + row = df.iloc[[i, -1], :].drop(["mimic"], axis=0) + + evidences = row.dropna(axis=1) + + return row.index[0], self.predict(evidences, progress_bar=False, **kwargs) + + failed = [] + for index in range(df.shape[0] - 1): + if df.iloc[index].isna().any(): + true_pos, result = fill_row(df, index) + if any(pd.isna(v[0]) for v in result.values()): + failed.append(true_pos) + continue + else: + for column, value in result.items(): + df.loc[true_pos, column] = value[0] + else: + continue + df.drop(failed, inplace=True) + return df.drop(["mimic"]), failed + + def get_dist(self, node_name: str, pvals: Optional[dict] = None): + """ + Get a distribution from node with known parent values (conditional distribution). + + :param node_name: name of node + :param pvals: parent values + """ + if not self.distributions: + logger_network.error("Empty parameters. 
Call fit_params first.") + return + node = self[node_name] + + parents = node.cont_parents + node.disc_parents + if not parents: + return self.distributions[node_name] + + pvals = [pvals[parent] for parent in parents] + + return node.get_dist(node_info=self.distributions[node_name], pvals=pvals) + + def _encode_categorical_data(self, data): + for column in data.select_dtypes(include=["object", "string"]).columns: + encoder = LabelEncoder() + data[column] = encoder.fit_transform(data[column]) + self.encoders[column] = encoder + return data + + def _decode_categorical_data(self, data): + data = data.apply( + lambda col: pd.to_numeric(col).astype(int) if col.dtype == "object" else col + ) + for column, encoder in self.encoders.items(): + data[column] = encoder.inverse_transform(data[column]) + return data diff --git a/bamt/networks/big_brave_bn.py b/bamt/networks/big_brave_bn.py new file mode 100644 index 0000000..e4adc9a --- /dev/null +++ b/bamt/networks/big_brave_bn.py @@ -0,0 +1,105 @@ +import math + +import numpy as np +import pandas as pd +from sklearn.metrics import mutual_info_score +from sklearn.preprocessing import OrdinalEncoder + + +class BigBraveBN: + def __init__(self): + self.possible_edges = [] + + def set_possible_edges_by_brave( + self, + df: pd.DataFrame, + n_nearest: int = 5, + threshold: float = 0.3, + proximity_metric: str = "MI", + ) -> list: + """Returns list of possible edges for structure learning and sets it into attribute + + Args: + df (pd.DataFrame): Data. + n_nearest (int): Number of nearest neighbors to consider. Default is 5. + threshold (float): Threshold for selecting edges. Default is 0.3. + proximity_metric (str): Metric used to calculate proximity. Default is "MI". + + Returns: + None: Modifies the object's possible_edges attribute. 
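        Example (a sketch; passing the result to add_edges via the white_list
        parameter is an assumption based on ParamDict, not shown in this class):
            space_restrictor = BigBraveBN()
            possible_edges = space_restrictor.set_possible_edges_by_brave(df, n_nearest=5)
            bn.add_edges(discretized_df, params={"white_list": possible_edges})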
+ """ + df_copy = df.copy(deep=True) + proximity_matrix = self._get_proximity_matrix(df_copy, proximity_metric) + brave_matrix = self._get_brave_matrix(df_copy.columns, proximity_matrix, n_nearest) + + threshold_value = brave_matrix.max(numeric_only=True).max() * threshold + filtered_brave_matrix = brave_matrix[brave_matrix > threshold_value].stack() + self.possible_edges = filtered_brave_matrix.index.tolist() + return self.possible_edges + + @staticmethod + def _get_n_nearest( + data: pd.DataFrame, columns: list, corr: bool = False, number_close: int = 5 + ) -> list: + """Returns N nearest neighbors for every column of dataframe.""" + groups = [] + for c in columns: + close_ind = data[c].sort_values(ascending=not corr).index.tolist() + groups.append(close_ind[: number_close + 1]) + return groups + + @staticmethod + def _get_proximity_matrix(df: pd.DataFrame, proximity_metric: str) -> pd.DataFrame: + """Returns matrix of proximity for the dataframe.""" + encoder = OrdinalEncoder() + df_coded = df.copy() + columns_to_encode = list(df_coded.select_dtypes(include=["category", "object"])) + df_coded[columns_to_encode] = encoder.fit_transform(df_coded[columns_to_encode]) + + if proximity_metric == "MI": + df_distance = pd.DataFrame( + np.zeros((len(df.columns), len(df.columns))), + columns=df.columns, + index=df.columns, + ) + for c1 in df.columns: + for c2 in df.columns: + dist = mutual_info_score(df_coded[c1].values, df_coded[c2].values) + df_distance.loc[c1, c2] = dist + return df_distance + + elif proximity_metric == "pearson": + return df_coded.corr(method="pearson") + + def _get_brave_matrix( + self, df_columns: pd.Index, proximity_matrix: pd.DataFrame, n_nearest: int = 5 + ) -> pd.DataFrame: + """Returns matrix of Brave coefficients for the DataFrame.""" + brave_matrix = pd.DataFrame( + np.zeros((len(df_columns), len(df_columns))), + columns=df_columns, + index=df_columns, + ) + groups = self._get_n_nearest( + proximity_matrix, df_columns.tolist(), corr=True, number_close=n_nearest + ) + + for c1 in df_columns: + for c2 in df_columns: + a = b = c = d = 0.0 + if c1 != c2: + for g in groups: + a += (c1 in g) & (c2 in g) + b += (c1 in g) & (c2 not in g) + c += (c1 not in g) & (c2 in g) + d += (c1 not in g) & (c2 not in g) + + divisor = (math.sqrt((a + c) * (b + d))) * ( + math.sqrt((a + b) * (c + d)) + ) + br = (a * len(groups) + (a + c) * (a + b)) / ( + divisor if divisor != 0 else 0.0000000001 + ) + brave_matrix.loc[c1, c2] = br + + return brave_matrix diff --git a/bamt/networks/composite_bn.py b/bamt/networks/composite_bn.py new file mode 100644 index 0000000..e3739aa --- /dev/null +++ b/bamt/networks/composite_bn.py @@ -0,0 +1,114 @@ +import re +from typing import Optional, Dict + +import pandas as pd + +from bamt.builders.composite_builder import CompositeStructureBuilder, CompositeDefiner +from bamt.log import logger_network +from bamt.networks.base import BaseNetwork +from bamt.utils.composite_utils.MLUtils import MlModels + + +class CompositeBN(BaseNetwork): + """ + Composite Bayesian Network with Machine Learning Models support + """ + + def __init__(self): + super(CompositeBN, self).__init__() + self._allowed_dtypes = ["cont", "disc", "disc_num"] + self.type = "Composite" + self.parent_models = {} + + def add_nodes(self, descriptor: Dict[str, Dict[str, str]]): + """ + Function for initializing nodes in Bayesian Network + descriptor: dict with types and signs of nodes + """ + self.descriptor = descriptor + + worker_1 = CompositeDefiner(descriptor=descriptor, regressor=None) + 
self.nodes = worker_1.vertices + + def add_edges( + self, + data: pd.DataFrame, + progress_bar: bool = True, + classifier: Optional[object] = None, + regressor: Optional[object] = None, + **kwargs, + ): + worker = CompositeStructureBuilder( + data=data, descriptor=self.descriptor, regressor=regressor + ) + + worker.build( + data=data, + classifier=classifier, + regressor=regressor, + progress_bar=progress_bar, + **kwargs, + ) + + # update family + self.nodes = worker.skeleton["V"] + self.edges = worker.skeleton["E"] + self.parent_models = worker.parent_models_dict + self.set_models(self.parent_models) + + def set_models(self, parent_models): + ml_models = MlModels() + ml_models_dict = ml_models.dict_models + for node in self.nodes: + if ( + type(node).__name__ == "CompositeDiscreteNode" + and parent_models[node.name] is not None + and len(node.cont_parents + node.disc_parents) > 0 + ): + self.set_classifiers( + {node.name: ml_models_dict[parent_models[node.name]]()} + ) + logger_network.info( + msg=f"{ml_models_dict[parent_models[node.name]]} classifier has been set for {node.name}" + ) + elif ( + type(node).__name__ == "CompositeContinuousNode" + and parent_models[node.name] is not None + and len(node.cont_parents + node.disc_parents) > 0 + ): + self.set_regressor( + {node.name: ml_models_dict[parent_models[node.name]]()} + ) + logger_network.info( + msg=f"{ml_models_dict[parent_models[node.name]]} regressor has been set for {node.name}" + ) + else: + pass + + def set_classifiers(self, classifiers: Dict[str, object]): + """ + Set classifiers for logit nodes. + classifiers: dict with node_name and Classifier + """ + for node in self.nodes: + if node.name in classifiers.keys(): + node.classifier = classifiers[node.name] + node.type = re.sub( + r"\([\s\S]*\)", f"({type(node.classifier).__name__})", node.type + ) + else: + continue + + def set_regressor(self, regressors: Dict[str, object]): + """ + Set regressor for gaussian nodes. 
+ classifiers: dict with node_name and regressors + """ + for node in self.nodes: + if node.name in regressors.keys(): + node.regressor = regressors[node.name] + node.type = re.sub( + r"\([\s\S]*\)", f"({type(node.regressor).__name__})", node.type + ) + else: + continue diff --git a/bamt/networks/continuous_bn.py b/bamt/networks/continuous_bn.py new file mode 100644 index 0000000..2c2f327 --- /dev/null +++ b/bamt/networks/continuous_bn.py @@ -0,0 +1,15 @@ +from .base import BaseNetwork + + +class ContinuousBN(BaseNetwork): + """ + Bayesian Network with Continuous Types of Nodes + """ + + def __init__(self, use_mixture: bool = False): + super(ContinuousBN, self).__init__() + self.type = "Continuous" + self._allowed_dtypes = ["cont"] + self.has_logit = None + self.use_mixture = use_mixture + self.scoring_function = "" diff --git a/bamt/networks/discrete_bn.py b/bamt/networks/discrete_bn.py new file mode 100644 index 0000000..90a0d07 --- /dev/null +++ b/bamt/networks/discrete_bn.py @@ -0,0 +1,15 @@ +from .base import BaseNetwork + + +class DiscreteBN(BaseNetwork): + """ + Bayesian Network with Discrete Types of Nodes + """ + + def __init__(self): + super(DiscreteBN, self).__init__() + self.type = "Discrete" + self.scoring_function = "" + self._allowed_dtypes = ["disc", "disc_num"] + self.has_logit = None + self.use_mixture = None diff --git a/bamt/networks/hybrid_bn.py b/bamt/networks/hybrid_bn.py new file mode 100644 index 0000000..2942da8 --- /dev/null +++ b/bamt/networks/hybrid_bn.py @@ -0,0 +1,27 @@ +from typing import Dict + +from .base import BaseNetwork + + +class HybridBN(BaseNetwork): + """ + Bayesian Network with Mixed Types of Nodes + """ + + def __init__(self, has_logit: bool = False, use_mixture: bool = False): + super(HybridBN, self).__init__() + self._allowed_dtypes = ["cont", "disc", "disc_num"] + self.type = "Hybrid" + self.has_logit = has_logit + self.use_mixture = use_mixture + + def validate(self, descriptor: Dict[str, Dict[str, str]]) -> bool: + types = descriptor["types"] + s = set(types.values()) + return ( + True + if ({"cont", "disc", "disc_num"} == s) + or ({"cont", "disc"} == s) + or ({"cont", "disc_num"} == s) + else False + ) diff --git a/bamt/nodes/__init__.py b/bamt/nodes/__init__.py new file mode 100644 index 0000000..f896bc3 --- /dev/null +++ b/bamt/nodes/__init__.py @@ -0,0 +1,10 @@ +__all__ = [ + "base", + "conditional_gaussian_node", + "conditional_logit_node", + "conditional_mixture_gaussian_node", + "discrete_node", + "logit_node", + "gaussian_node", + "mixture_gaussian_node", +] diff --git a/bamt/nodes/base.py b/bamt/nodes/base.py new file mode 100644 index 0000000..f42b0a3 --- /dev/null +++ b/bamt/nodes/base.py @@ -0,0 +1,59 @@ +import pickle +from typing import Union + + +class BaseNode(object): + """ + Base class for nodes. 
+ """ + + def __init__(self, name: str): + """ + :param name: name for node (taken from column name) + type: node type + disc_parents: list with discrete parents + cont_parents: list with continuous parents + children: node's children + """ + self.name = name + self.type = "abstract" + + self.disc_parents = [] + self.cont_parents = [] + self.children = [] + + def __repr__(self): + return f"{self.name}" + + def __eq__(self, other): + if not isinstance(other, BaseNode): + # don't attempt to compare against unrelated types + return NotImplemented + + return ( + self.name == other.name + and self.type == other.type + and self.disc_parents == other.disc_parents + and self.cont_parents == other.cont_parents + and self.children == other.children + ) + + @staticmethod + def choose_serialization(model) -> Union[str, Exception]: + try: + ex_b = pickle.dumps(model, protocol=4) + model_ser = ex_b.decode("latin1").replace("'", '"') + + if type(model).__name__ == "CatBoostRegressor": + a = model_ser.encode("latin1") + else: + a = model_ser.replace('"', "'").encode("latin1") + + classifier_body = pickle.loads(a) + return "pickle" + except Exception as ex: + return ex + + @staticmethod + def get_dist(node_info, pvals): + pass diff --git a/bamt/nodes/composite_continuous_node.py b/bamt/nodes/composite_continuous_node.py new file mode 100644 index 0000000..b446402 --- /dev/null +++ b/bamt/nodes/composite_continuous_node.py @@ -0,0 +1,17 @@ +from typing import Optional, Union + +from sklearn import linear_model + +from .gaussian_node import GaussianNode +from .schema import GaussianParams, HybcprobParams + +NodeInfo = Union[GaussianParams, HybcprobParams] + + +class CompositeContinuousNode(GaussianNode): + def __init__(self, name, regressor: Optional[object] = None): + super(CompositeContinuousNode, self).__init__(name) + if regressor is None: + regressor = linear_model.LinearRegression() + self.regressor = regressor + self.type = "CompositeContinuous" + f" ({type(self.regressor).__name__})" diff --git a/bamt/nodes/composite_discrete_node.py b/bamt/nodes/composite_discrete_node.py new file mode 100644 index 0000000..6d82db5 --- /dev/null +++ b/bamt/nodes/composite_discrete_node.py @@ -0,0 +1,20 @@ +from typing import Optional + +from sklearn import linear_model + +from .logit_node import LogitNode + + +class CompositeDiscreteNode(LogitNode): + """ + Class for composite discrete node. 
+ """ + + def __init__(self, name, classifier: Optional[object] = None): + super(CompositeDiscreteNode, self).__init__(name) + if classifier is None: + classifier = linear_model.LogisticRegression( + multi_class="multinomial", solver="newton-cg", max_iter=100 + ) + self.classifier = classifier + self.type = "CompositeDiscrete" + f" ({type(self.classifier).__name__})" diff --git a/bamt/nodes/conditional_gaussian_node.py b/bamt/nodes/conditional_gaussian_node.py new file mode 100644 index 0000000..58801b6 --- /dev/null +++ b/bamt/nodes/conditional_gaussian_node.py @@ -0,0 +1,173 @@ +import itertools +import math +import random +from typing import Dict, Optional, List, Union + +import numpy as np +from pandas import DataFrame +from sklearn import linear_model +from sklearn.base import clone +from sklearn.metrics import mean_squared_error as mse + +from .base import BaseNode +from .schema import CondGaussParams + + +class ConditionalGaussianNode(BaseNode): + """ + Main class for Conditional Gaussian Node + """ + + def __init__(self, name, regressor: Optional[object] = None): + super(ConditionalGaussianNode, self).__init__(name) + if regressor is None: + regressor = linear_model.LinearRegression() + self.regressor = regressor + self.type = "ConditionalGaussian" + f" ({type(self.regressor).__name__})" + + def fit_parameters(self, data: DataFrame) -> Dict[str, Dict[str, CondGaussParams]]: + """ + Train params for Conditional Gaussian Node. + Return: + {"hybcprob": { : CondGaussParams}} + """ + hycprob = dict() + values = [] + combinations = [] + for d_p in self.disc_parents: + values.append(np.unique(data[d_p].values)) + for xs in itertools.product(*values): + combinations.append(list(xs)) + for comb in combinations: + mask = np.full(len(data), True) + for col, val in zip(self.disc_parents, comb): + mask = (mask) & (data[col] == val) + new_data = data[mask] + key_comb = [str(x) for x in comb] + if new_data.shape[0] > 0: + if self.cont_parents: + model = clone(self.regressor) + model.fit( + new_data[self.cont_parents].values, new_data[self.name].values + ) + predicted_value = model.predict(new_data[self.cont_parents].values) + variance = mse( + new_data[self.name].values, predicted_value, squared=False + ) + hycprob[str(key_comb)] = { + "variance": variance, + "mean": np.nan, + "regressor_obj": model, + "regressor": type(self.regressor).__name__, + "serialization": None, + } + else: + mean_base = np.mean(new_data[self.name].values) + variance = np.var(new_data[self.name].values) + hycprob[str(key_comb)] = { + "variance": variance, + "mean": mean_base, + "regressor_obj": None, + "regressor": None, + "serialization": None, + } + else: + hycprob[str(key_comb)] = { + "variance": np.nan, + "regressor": None, + "regressor_obj": None, + "serialization": None, + "mean": np.nan, + } + return {"hybcprob": hycprob} + + def get_dist(self, node_info, pvals): + dispvals = [] + lgpvals = [] + for pval in pvals: + if isinstance(pval, str): + dispvals.append(pval) + else: + lgpvals.append(pval) + + lgdistribution = node_info["hybcprob"][str(dispvals)] + + # JOBLIB + + if self.cont_parents: + flag = False + for el in lgpvals: + if str(el) == "nan": + flag = True + break + if flag: + return np.nan, np.nan + else: + if lgdistribution["regressor"]: + model = lgdistribution["regressor_obj"] + + cond_mean = model.predict(np.array(lgpvals).reshape(1, -1))[0] + variance = lgdistribution["variance"] + return cond_mean, variance + else: + return np.nan, np.nan + + else: + return lgdistribution["mean"], 
math.sqrt(lgdistribution["variance"]) + + def choose( + self, + node_info: Dict[str, Dict[str, CondGaussParams]], + pvals: List[Union[str, float]], + ) -> float: + """ + Return value from ConditionalLogit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + cond_mean, variance = self.get_dist(node_info, pvals) + if np.isnan(cond_mean) or np.isnan(variance): + return np.nan + + return random.gauss(cond_mean, variance) + + def predict( + self, + node_info: Dict[str, Dict[str, CondGaussParams]], + pvals: List[Union[str, float]], + ) -> float: + """ + Return value from ConditionalLogit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + dispvals = [] + lgpvals = [] + for pval in pvals: + if isinstance(pval, str): + dispvals.append(pval) + else: + lgpvals.append(pval) + + lgdistribution = node_info["hybcprob"][str(dispvals)] + + if self.cont_parents: + flag = False + for el in lgpvals: + if str(el) == "nan": + flag = True + break + if flag: + return np.nan + else: + if lgdistribution["regressor"]: + model = lgdistribution["regressor_obj"] + return model.predict(np.array(lgpvals).reshape(1, -1))[0] + else: + return np.nan + else: + return lgdistribution["mean"] diff --git a/bamt/nodes/conditional_logit_node.py b/bamt/nodes/conditional_logit_node.py new file mode 100644 index 0000000..08010c0 --- /dev/null +++ b/bamt/nodes/conditional_logit_node.py @@ -0,0 +1,172 @@ +import itertools +import random +from typing import Optional, List, Union, Dict + +import numpy as np +from pandas import DataFrame +from sklearn import linear_model +from sklearn.base import clone + +from .base import BaseNode +from .schema import LogitParams + + +class ConditionalLogitNode(BaseNode): + """ + Main class for Conditional Logit Node + """ + + def __init__(self, name: str, classifier: Optional[object] = None): + super(ConditionalLogitNode, self).__init__(name) + if classifier is None: + classifier = linear_model.LogisticRegression( + multi_class="multinomial", solver="newton-cg", max_iter=100 + ) + self.classifier = classifier + self.type = "ConditionalLogit" + f" ({type(self.classifier).__name__})" + + def fit_parameters(self, data: DataFrame) -> Dict[str, Dict[str, LogitParams]]: + """ + Train params on data + Return: + {"hybcprob": { : LogitParams}} + """ + hycprob = dict() + values = [] + combinations = [] + for d_p in self.disc_parents: + values.append(np.unique(data[d_p].values)) + for xs in itertools.product(*values): + combinations.append(list(xs)) + for comb in combinations: + mask = np.full(len(data), True) + for col, val in zip(self.disc_parents, comb): + mask = (mask) & (data[col] == val) + new_data = data[mask] + # mean_base = [np.nan] + classes = [np.nan] + key_comb = [str(x) for x in comb] + if new_data.shape[0] != 0: + model = clone(self.classifier) + values = set(new_data[self.name]) + if len(values) > 1: + model.fit( + X=new_data[self.cont_parents].values, + y=new_data[self.name].values, + ) + classes = list(model.classes_) + hycprob[str(key_comb)] = { + "classes": classes, + "classifier_obj": model, + "classifier": type(self.classifier).__name__, + "serialization": None, + } + else: + classes = list(values) + hycprob[str(key_comb)] = { + "classes": classes, + "classifier": type(self.classifier).__name__, + "classifier_obj": None, + "serialization": None, + } + + else: + hycprob[str(key_comb)] = { + "classes": list(classes), + "classifier": type(self.classifier).__name__, + "classifier_obj": None, + "serialization": None, + } 
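# Resulting parameter layout (the parent-combination key shown is illustrative):
# {"hybcprob": {"['low', 'north']": {"classes": [...],
#                                    "classifier": "LogisticRegression",
#                                    "classifier_obj": <fitted model or None>,
#                                    "serialization": None}}}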
+ return {"hybcprob": hycprob} + + @staticmethod + def get_dist(node_info, pvals, **kwargs): + dispvals = [] + lgpvals = [] + for pval in pvals: + if isinstance(pval, str): + dispvals.append(pval) + else: + lgpvals.append(pval) + + if any(parent_value == "nan" for parent_value in dispvals): + return np.nan + + lgdistribution = node_info["hybcprob"][str(dispvals)] + + # JOBLIB + if len(lgdistribution["classes"]) > 1: + model = lgdistribution["classifier_obj"] + distribution = model.predict_proba(np.array(lgpvals).reshape(1, -1))[0] + + if not kwargs.get("inner", False): + return distribution + else: + return distribution, lgdistribution + else: + if not kwargs.get("inner", False): + return np.array([1.0]) + else: + return np.array([1.0]), lgdistribution + + def choose( + self, + node_info: Dict[str, Dict[str, LogitParams]], + pvals: List[Union[str, float]], + ) -> str: + """ + Return value from ConditionalLogit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + distribution, lgdistribution = self.get_dist(node_info, pvals, inner=True) + + # JOBLIB + if len(lgdistribution["classes"]) > 1: + rand = random.random() + rindex = 0 + lbound = 0 + ubound = 0 + for interval in range(len(lgdistribution["classes"])): + ubound += distribution[interval] + if lbound <= rand < ubound: + rindex = interval + break + else: + lbound = ubound + return str(lgdistribution["classes"][rindex]) + else: + return str(lgdistribution["classes"][0]) + + @staticmethod + def predict( + node_info: Dict[str, Dict[str, LogitParams]], pvals: List[Union[str, float]] + ) -> str: + """ + Return value from ConditionalLogit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + dispvals = [] + lgpvals = [] + for pval in pvals: + if isinstance(pval, str): + dispvals.append(pval) + else: + lgpvals.append(pval) + + lgdistribution = node_info["hybcprob"][str(dispvals)] + + # JOBLIB + if len(lgdistribution["classes"]) > 1: + model = lgdistribution["classifier_obj"] + pred = model.predict(np.array(lgpvals).reshape(1, -1))[0] + + return str(pred) + + else: + return str(lgdistribution["classes"][0]) diff --git a/bamt/nodes/conditional_mixture_gaussian_node.py b/bamt/nodes/conditional_mixture_gaussian_node.py new file mode 100644 index 0000000..bdd3263 --- /dev/null +++ b/bamt/nodes/conditional_mixture_gaussian_node.py @@ -0,0 +1,227 @@ +import itertools +from typing import Union, List, Optional, Dict + +import numpy as np +from gmr import GMM +from pandas import DataFrame + +from bamt.utils.MathUtils import component +from .base import BaseNode +from .schema import CondMixtureGaussParams + + +class ConditionalMixtureGaussianNode(BaseNode): + """ + Main class for Conditional Mixture Gaussian Node + """ + + def __init__(self, name): + super(ConditionalMixtureGaussianNode, self).__init__(name) + self.type = "ConditionalMixtureGaussian" + + def fit_parameters( + self, data: DataFrame + ) -> Dict[str, Dict[str, CondMixtureGaussParams]]: + """ + Train params for Conditional Mixture Gaussian Node. 
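        Each value stored per discrete-parent combination is a dict with keys
        "mean", "covars" and "coef" (GMM component means, covariances and weights).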
+ Return: + {"hybcprob": { : CondMixtureGaussParams}} + """ + hycprob = dict() + values = [] + combinations = [] + for d_p in self.disc_parents: + values.append(np.unique(data[d_p].values)) + for xs in itertools.product(*values): + combinations.append(list(xs)) + for comb in combinations: + mask = np.full(len(data), True) + for col, val in zip(self.disc_parents, comb): + mask = (mask) & (data[col] == val) + new_data = data[mask] + new_data.reset_index(inplace=True, drop=True) + key_comb = [str(x) for x in comb] + nodes = [self.name] + self.cont_parents + if new_data.shape[0] > 5: + if self.cont_parents: + # component(new_data, nodes, + # 'LRTS')#int((component(new_data, nodes, 'aic') + + # component(new_data, nodes, 'bic')) / 2) + n_comp = int( + ( + component(new_data, nodes, "aic") + + component(new_data, nodes, "bic") + ) + / 2 + ) + # n_comp = 3 + gmm = GMM(n_components=n_comp).from_samples( + new_data[nodes].values, n_iter=500, init_params="kmeans++" + ) + else: + # component(new_data, [node], + # 'LRTS')#int((component(new_data, [node], 'aic') + + # component(new_data, [node], 'bic')) / 2) + n_comp = int( + ( + component(new_data, [self.name], "aic") + + component(new_data, [self.name], "bic") + ) + / 2 + ) + # n_comp = 3 + gmm = GMM(n_components=n_comp).from_samples( + np.transpose([new_data[self.name].values]), + n_iter=500, + init_params="kmeans++", + ) + means = gmm.means.tolist() + cov = gmm.covariances.tolist() + # weigts = np.transpose(gmm.to_responsibilities(np.transpose([new_data[node].values]))) + w = gmm.priors.tolist() # [] + # for row in weigts: + # w.append(np.mean(row)) + hycprob[str(key_comb)] = {"covars": cov, "mean": means, "coef": w} + elif new_data.shape[0] != 0: + n_comp = 1 + gmm = GMM(n_components=n_comp) + if self.cont_parents: + gmm.from_samples(new_data[nodes].values) + else: + gmm.from_samples(np.transpose([new_data[self.name].values])) + means = gmm.means.tolist() + cov = gmm.covariances.tolist() + # weigts = np.transpose(gmm.to_responsibilities(np.transpose([new_data[node].values]))) + w = gmm.priors.tolist() # [] + # for row in weigts: + # w.append(np.mean(row)) + hycprob[str(key_comb)] = {"covars": cov, "mean": means, "coef": w} + else: + if self.cont_parents: + hycprob[str(key_comb)] = { + "covars": np.nan, + "mean": np.nan, + "coef": [], + } + else: + hycprob[str(key_comb)] = { + "covars": np.nan, + "mean": np.nan, + "coef": [], + } + return {"hybcprob": hycprob} + + @staticmethod + def get_dist(node_info, pvals): + lgpvals = [] + dispvals = [] + + for pval in pvals: + if (isinstance(pval, str)) | (isinstance(pval, int)): + dispvals.append(pval) + else: + lgpvals.append(pval) + + lgdistribution = node_info["hybcprob"][str(dispvals)] + mean = lgdistribution["mean"] + covariance = lgdistribution["covars"] + w = lgdistribution["coef"] + + if len(w) != 0: + if len(lgpvals) != 0: + indexes = [i for i in range(1, (len(lgpvals) + 1), 1)] + if not np.isnan(np.array(lgpvals)).all(): + n_comp = len(w) + gmm = GMM( + n_components=n_comp, + priors=w, + means=mean, + covariances=covariance, + ) + cond_gmm = gmm.condition(indexes, [lgpvals]) + return cond_gmm.means, cond_gmm.covariances, cond_gmm.priors + else: + return np.nan, np.nan, np.nan + else: + n_comp = len(w) + gmm = GMM( + n_components=n_comp, priors=w, means=mean, covariances=covariance + ) + return gmm.means, gmm.covariances, gmm.priors + else: + return np.nan, np.nan, np.nan + + def choose( + self, + node_info: Dict[str, Dict[str, CondMixtureGaussParams]], + pvals: List[Union[str, float]], + ) -> 
Optional[float]: + """ + Function to get value from ConditionalMixtureGaussian node + params: + node_info: nodes info from distributions + pvals: parent values + """ + mean, covariance, w = self.get_dist(node_info, pvals) + + # check if w is nan or list of weights + if not isinstance(w, np.ndarray): + return np.nan + + n_comp = len(w) + + gmm = GMM( + n_components=n_comp, + priors=w, + means=mean, + covariances=covariance, + ) + sample = gmm.sample(1)[0][0] + return sample + + @staticmethod + def predict( + node_info: Dict[str, Dict[str, CondMixtureGaussParams]], + pvals: List[Union[str, float]], + ) -> Optional[float]: + """ + Function to get prediction from ConditionalMixtureGaussian node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + dispvals = [] + lgpvals = [] + for pval in pvals: + if (isinstance(pval, str)) | (isinstance(pval, int)): + dispvals.append(pval) + else: + lgpvals.append(pval) + lgdistribution = node_info["hybcprob"][str(dispvals)] + mean = lgdistribution["mean"] + covariance = lgdistribution["covars"] + w = lgdistribution["coef"] + if len(w) != 0: + if len(lgpvals) != 0: + indexes = [i for i in range(1, (len(lgpvals) + 1), 1)] + if not np.isnan(np.array(lgpvals)).all(): + n_comp = len(w) + gmm = GMM( + n_components=n_comp, + priors=w, + means=mean, + covariances=covariance, + ) + sample = gmm.predict(indexes, [lgpvals])[0][0] + else: + sample = np.nan + else: + # n_comp = len(w) + # gmm = GMM(n_components=n_comp, priors=w, means=mean, covariances=covariance) + sample = 0 + for ind, wi in enumerate(w): + sample += wi * mean[ind][0] + else: + sample = np.nan + return sample diff --git a/bamt/nodes/discrete_node.py b/bamt/nodes/discrete_node.py new file mode 100644 index 0000000..705f1ee --- /dev/null +++ b/bamt/nodes/discrete_node.py @@ -0,0 +1,113 @@ +import random +from itertools import product +from typing import Type, Dict, Union, List + +import numpy as np +from pandas import DataFrame, crosstab + +from .base import BaseNode +from .schema import DiscreteParams + + +class DiscreteNode(BaseNode): + """ + Main class of Discrete Node + """ + + def __init__(self, name): + super(DiscreteNode, self).__init__(name) + self.type = "Discrete" + + def fit_parameters(self, data: DataFrame, num_workers: int = 1): + """ + Train params for Discrete Node + data: DataFrame to train on + num_workers: number of Parallel Workers + Method returns probas dict with the following format {[: value]} + and vals, list of appeared values in combinations + """ + + def worker(node: Type[BaseNode]) -> DiscreteParams: + parents = node.disc_parents + node.cont_parents + dist = data[node.name].value_counts(normalize=True).sort_index() + vals = [str(i) for i in dist.index.to_list()] + + if not parents: + cprob = dist.to_list() + else: + cprob = { + str([str(i) for i in comb]): [1 / len(vals) for _ in vals] + for comb in product(*[data[p].unique() for p in parents]) + } + + conditional_dist = crosstab( + data[node.name].to_list(), + [data[p] for p in parents], + normalize="columns", + ).T + tight_form = conditional_dist.to_dict("tight") + + for comb, probs in zip(tight_form["index"], tight_form["data"]): + if len(parents) > 1: + cprob[str([str(i) for i in comb])] = probs + else: + cprob[f"['{comb}']"] = probs + return {"cprob": cprob, "vals": vals} + + # pool = ThreadPoolExecutor(num_workers) + # future = pool.submit(worker, self) + result = worker(self) + return result + + @staticmethod + def get_dist(node_info, pvals): + if not pvals: + return node_info["cprob"] + 
else: + # noinspection PyTypeChecker + return node_info["cprob"][str(pvals)] + + def choose(self, node_info: Dict[str, Union[float, str]], pvals: List[str]) -> str: + """ + Return value from discrete node + params: + node_info: nodes info from distributions + pvals: parent values + """ + vals = node_info["vals"] + dist = np.array(self.get_dist(node_info, pvals)) + + cumulative_dist = np.cumsum(dist) + + rand = np.random.random() + rindex = np.searchsorted(cumulative_dist, rand) + + return vals[rindex] + + @staticmethod + def predict(node_info: Dict[str, Union[float, str]], pvals: List[str]) -> str: + """function for prediction based on evidence values in discrete node + + Args: + node_info (Dict[str, Union[float, str]]): parameters of node + pvals (List[str]): values in parents nodes + + Returns: + str: prediction + """ + + vals = node_info["vals"] + disct = [] + if not pvals: + dist = node_info["cprob"] + else: + # noinspection PyTypeChecker + dist = node_info["cprob"][str(pvals)] + max_value = max(dist) + indices = [index for index, value in enumerate(dist) if value == max_value] + max_ind = 0 + if len(indices) == 1: + max_ind = indices[0] + else: + max_ind = random.choice(indices) + return vals[max_ind] diff --git a/bamt/nodes/gaussian_node.py b/bamt/nodes/gaussian_node.py new file mode 100644 index 0000000..15144a8 --- /dev/null +++ b/bamt/nodes/gaussian_node.py @@ -0,0 +1,96 @@ +import math +import random +from typing import Optional, List + +import numpy as np +from pandas import DataFrame +from sklearn import linear_model +from sklearn.metrics import mean_squared_error as mse + +from .base import BaseNode +from .schema import GaussianParams + + +class GaussianNode(BaseNode): + """ + Main class for Gaussian Node + """ + + def __init__(self, name, regressor: Optional[object] = None): + super(GaussianNode, self).__init__(name) + if regressor is None: + regressor = linear_model.LinearRegression() + self.regressor = regressor + self.type = "Gaussian" + f" ({type(self.regressor).__name__})" + + def fit_parameters(self, data: DataFrame, **kwargs) -> GaussianParams: + parents = self.cont_parents + if type(self).__name__ == "CompositeContinuousNode": + parents = parents + self.disc_parents + if parents: + self.regressor.fit(data[parents].values, data[self.name].values, **kwargs) + predicted_value = self.regressor.predict(data[parents].values) + variance = mse(data[self.name].values, predicted_value, squared=False) + return { + "mean": np.nan, + "regressor_obj": self.regressor, + "regressor": type(self.regressor).__name__, + "variance": variance, + "serialization": None, + } + else: + mean_base = np.mean(data[self.name].values) + variance = np.var(data[self.name].values) + return { + "mean": mean_base, + "regressor_obj": None, + "regressor": None, + "variance": variance, + "serialization": None, + } + + def get_dist(self, node_info, pvals): + var = node_info["variance"] + if pvals: + for el in pvals: + if str(el) == "nan": + return np.nan + model = node_info["regressor_obj"] + + if type(self).__name__ == "CompositeContinuousNode": + pvals = [int(item) if isinstance(item, str) else item for item in pvals] + + cond_mean = model.predict(np.array(pvals).reshape(1, -1))[0] + return cond_mean, var + else: + return node_info["mean"], math.sqrt(var) + + def choose(self, node_info: GaussianParams, pvals: List[float]) -> float: + """ + Return value from Logit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + cond_mean, var = self.get_dist(node_info, pvals) + 
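For orientation only, and not part of the change set itself: a minimal sketch of using the DiscreteNode defined above on its own. The toy column name and the manually assigned parent lists are assumptions made for illustration; in normal use the structure builders populate those attributes.

import pandas as pd
from bamt.nodes.discrete_node import DiscreteNode

df = pd.DataFrame({"city": ["A", "B", "A", "C", "B", "A"]})

node = DiscreteNode(name="city")
# Parent/child lists are normally filled in by the structure builders.
node.disc_parents, node.cont_parents, node.children = [], [], []

params = node.fit_parameters(df)        # {"cprob": [...], "vals": ["A", "B", "C"]}
sampled = node.choose(params, pvals=[])  # random draw from the fitted distribution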
return random.gauss(cond_mean, var) + + @staticmethod + def predict(node_info: GaussianParams, pvals: List[float]) -> float: + """ + Return prediction from Logit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + if pvals: + for el in pvals: + if str(el) == "nan": + return np.nan + model = node_info["regressor_obj"] + pred = model.predict(np.array(pvals).reshape(1, -1))[0] + return pred + else: + return node_info["mean"] diff --git a/bamt/nodes/logit_node.py b/bamt/nodes/logit_node.py new file mode 100644 index 0000000..9b16004 --- /dev/null +++ b/bamt/nodes/logit_node.py @@ -0,0 +1,91 @@ +import random +from typing import Optional, List, Union + +import numpy as np +from pandas import DataFrame +from sklearn import linear_model + +from .base import BaseNode +from .schema import LogitParams + + +class LogitNode(BaseNode): + """ + Main class for logit node + """ + + def __init__(self, name, classifier: Optional[object] = None): + super(LogitNode, self).__init__(name) + if classifier is None: + classifier = linear_model.LogisticRegression( + multi_class="multinomial", solver="newton-cg", max_iter=100 + ) + self.classifier = classifier + self.type = "Logit" + f" ({type(self.classifier).__name__})" + + def fit_parameters(self, data: DataFrame, **kwargs) -> LogitParams: + parents = self.cont_parents + self.disc_parents + self.classifier.fit(X=data[parents].values, y=data[self.name].values, **kwargs) + + return { + "classes": list(self.classifier.classes_), + "classifier_obj": self.classifier, + "classifier": type(self.classifier).__name__, + "serialization": None, + } + + def get_dist(self, node_info, pvals): + if len(node_info["classes"]) > 1: + model = node_info["classifier_obj"] + if type(self).__name__ == "CompositeDiscreteNode": + pvals = [int(item) if isinstance(item, str) else item for item in pvals] + + return model.predict_proba(np.array(pvals).reshape(1, -1))[0] + else: + return np.array([1.0]) + + def choose(self, node_info: LogitParams, pvals: List[Union[float]]) -> str: + """ + Return value from Logit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + rindex = 0 + + distribution = self.get_dist(node_info, pvals) + + if len(node_info["classes"]) > 1: + rand = random.random() + lbound = 0 + ubound = 0 + for interval in range(len(node_info["classes"])): + ubound += distribution[interval] + if lbound <= rand < ubound: + rindex = interval + break + else: + lbound = ubound + + return str(node_info["classes"][rindex]) + else: + return str(node_info["classes"][0]) + + @staticmethod + def predict(node_info: LogitParams, pvals: List[Union[float]]) -> str: + """ + Return prediction from Logit node + params: + node_info: nodes info from distributions + pvals: parent values + """ + + if len(node_info["classes"]) > 1: + model = node_info["classifier_obj"] + pred = model.predict(np.array(pvals).reshape(1, -1))[0] + + return str(pred) + + else: + return str(node_info["classes"][0]) diff --git a/bamt/nodes/mixture_gaussian_node.py b/bamt/nodes/mixture_gaussian_node.py new file mode 100644 index 0000000..22cd2b3 --- /dev/null +++ b/bamt/nodes/mixture_gaussian_node.py @@ -0,0 +1,154 @@ +from typing import Union, List, Optional + +import numpy as np +from gmr import GMM +from pandas import DataFrame + +from bamt.utils.MathUtils import component +from .base import BaseNode +from .schema import MixtureGaussianParams + + +class MixtureGaussianNode(BaseNode): + """ + Main class for Mixture Gaussian Node + """ + + def 
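Likewise a hedged, illustrative sketch rather than part of the diff: fitting the GaussianNode and LogitNode added above on synthetic data. The column names, the generated data and the manual parent assignment are all assumptions.

import numpy as np
import pandas as pd
from bamt.nodes.gaussian_node import GaussianNode
from bamt.nodes.logit_node import LogitNode

rng = np.random.default_rng(0)
df = pd.DataFrame({"x": rng.normal(size=200)})
df["y"] = 2.0 * df["x"] + rng.normal(scale=0.1, size=200)
df["label"] = np.where(df["x"] > 0, "pos", "neg")

gauss = GaussianNode(name="y")        # linear regression + residual spread by default
gauss.cont_parents, gauss.disc_parents, gauss.children = ["x"], [], []
g_params = gauss.fit_parameters(df)
gauss.choose(g_params, pvals=[1.5])   # conditional draw given x = 1.5

logit = LogitNode(name="label")       # multinomial logistic regression by default
logit.cont_parents, logit.disc_parents, logit.children = ["x"], [], []
l_params = logit.fit_parameters(df)
logit.predict(l_params, pvals=[1.5])  # most probable class, returned as str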
__init__(self, name): + super(MixtureGaussianNode, self).__init__(name) + self.type = "MixtureGaussian" + + def fit_parameters(self, data: DataFrame) -> MixtureGaussianParams: + """ + Train params for Mixture Gaussian Node + """ + parents = self.disc_parents + self.cont_parents + if not parents: + n_comp = int( + ( + component(data, [self.name], "aic") + + component(data, [self.name], "bic") + ) + / 2 + ) # component(data, [node], 'LRTS')# + # n_comp = 3 + gmm = GMM(n_components=n_comp).from_samples( + np.transpose([data[self.name].values]), + n_iter=500, + init_params="kmeans++", + ) + means = gmm.means.tolist() + cov = gmm.covariances.tolist() + # weigts = np.transpose(gmm.to_responsibilities(np.transpose([data[node].values]))) + w = gmm.priors.tolist() # [] + # for row in weigts: + # w.append(np.mean(row)) + return {"mean": means, "coef": w, "covars": cov} + if parents: + if not self.disc_parents and self.cont_parents: + nodes = [self.name] + self.cont_parents + new_data = data[nodes] + new_data.reset_index(inplace=True, drop=True) + n_comp = int( + ( + component(new_data, nodes, "aic") + + component(new_data, nodes, "bic") + ) + / 2 + ) # component(new_data, nodes, 'LRTS')# + # n_comp = 3 + gmm = GMM(n_components=n_comp).from_samples( + new_data[nodes].values, n_iter=500, init_params="kmeans++" + ) + means = gmm.means.tolist() + cov = gmm.covariances.tolist() + # weigts = np.transpose(gmm.to_responsibilities(new_data[nodes].values)) + w = gmm.priors.tolist() # [] + # for row in weigts: + # w.append(np.mean(row)) + return {"mean": means, "coef": w, "covars": cov} + + @staticmethod + def get_dist(node_info, pvals): + mean = node_info["mean"] + covariance = node_info["covars"] + w = node_info["coef"] + n_comp = len(node_info["coef"]) + if n_comp != 0: + if pvals: + indexes = [i for i in range(1, len(pvals) + 1)] + if not np.isnan(np.array(pvals)).all(): + gmm = GMM( + n_components=n_comp, + priors=w, + means=mean, + covariances=covariance, + ) + cond_gmm = gmm.condition(indexes, [pvals]) + return cond_gmm.means, cond_gmm.covariances, cond_gmm.priors + else: + return np.nan, np.nan, np.nan + else: + gmm = GMM( + n_components=n_comp, priors=w, means=mean, covariances=covariance + ) + return gmm.means, gmm.covariances, gmm.priors + else: + return np.nan, np.nan, np.nan + + def choose( + self, node_info: MixtureGaussianParams, pvals: List[Union[str, float]] + ) -> Optional[float]: + """ + Func to get value from current node + node_info: nodes info from distributions + pvals: parent values + Return value from MixtureGaussian node + """ + mean, covariance, w = self.get_dist(node_info, pvals) + + n_comp = len(w) + + gmm = GMM( + n_components=n_comp, + priors=w, + means=mean, + covariances=covariance, + ) + return gmm.sample(1)[0][0] + + @staticmethod + def predict( + node_info: MixtureGaussianParams, pvals: List[Union[str, float]] + ) -> Optional[float]: + """ + Func to get prediction from current node + node_info: nodes info from distributions + pvals: parent values + Return value from MixtureGaussian node + """ + mean = node_info["mean"] + covariance = node_info["covars"] + w = node_info["coef"] + n_comp = len(node_info["coef"]) + if n_comp != 0: + if pvals: + indexes = [i for i in range(1, len(pvals) + 1)] + if not np.isnan(np.array(pvals)).all(): + gmm = GMM( + n_components=n_comp, + priors=w, + means=mean, + covariances=covariance, + ) + sample = gmm.predict(indexes, [pvals])[0][0] + else: + sample = np.nan + else: + # gmm = GMM(n_components=n_comp, priors=w, means=mean, 
covariances=covariance) + sample = 0 + for ind, wi in enumerate(w): + sample += wi * mean[ind][0] + else: + sample = np.nan + return sample diff --git a/bamt/nodes/schema.py b/bamt/nodes/schema.py new file mode 100644 index 0000000..2db2642 --- /dev/null +++ b/bamt/nodes/schema.py @@ -0,0 +1,47 @@ +from typing import Dict, List, Any, Union, TypedDict, Optional + +from numpy import ndarray + + +class DiscreteParams(TypedDict): + cprob: Union[List[Union[list, Any]], Dict[str, list]] + vals: List[str] + + +class MixtureGaussianParams(TypedDict): + mean: List[float] + coef: List[float] + covars: List[float] + + +class GaussianParams(TypedDict): + regressor: str + regressor_obj: Optional[object] + variance: Union[ndarray, float] + mean: Union[ndarray, float] + serialization: Optional[str] + + +class CondGaussParams(TypedDict): + regressor: str + regressor_obj: Optional[object] + variance: Union[ndarray, float] + mean: Union[ndarray, float] + serialization: Optional[str] + + +class CondMixtureGaussParams(TypedDict): + mean: Optional[List[float]] + coef: List[float] + covars: Optional[List[float]] + + +class LogitParams(TypedDict): + classes: List[int] + classifier: str + classifier_obj: Optional[object] + serialization: Optional[str] + + +class HybcprobParams(TypedDict): + hybcprob: Dict[str, CondGaussParams] diff --git a/bamt/parameter_estimators/__init__.py b/bamt/parameter_estimators/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/parameter_estimators/maximum_likelihood_estimator.py b/bamt/parameter_estimators/maximum_likelihood_estimator.py deleted file mode 100644 index 63783b5..0000000 --- a/bamt/parameter_estimators/maximum_likelihood_estimator.py +++ /dev/null @@ -1,9 +0,0 @@ -from .parameters_estimator import ParametersEstimator - - -class MaximumLikelihoodEstimator(ParametersEstimator): - def __init__(self): - pass - - def estimate(self): - pass diff --git a/bamt/parameter_estimators/parameters_estimator.py b/bamt/parameter_estimators/parameters_estimator.py deleted file mode 100644 index b93734c..0000000 --- a/bamt/parameter_estimators/parameters_estimator.py +++ /dev/null @@ -1,10 +0,0 @@ -from abc import ABC, abstractmethod - - -class ParametersEstimator(ABC): - def __init__(self): - pass - - @abstractmethod - def estimate(self): - pass diff --git a/bamt/core/nodes/root_nodes/__init__.py b/bamt/preprocess/__init__.py similarity index 100% rename from bamt/core/nodes/root_nodes/__init__.py rename to bamt/preprocess/__init__.py diff --git a/bamt/preprocess/discretization.py b/bamt/preprocess/discretization.py new file mode 100644 index 0000000..9a36f86 --- /dev/null +++ b/bamt/preprocess/discretization.py @@ -0,0 +1,165 @@ +from copy import copy +from typing import Tuple + +import pandas as pd +from sklearn import preprocessing +from sklearn.preprocessing import KBinsDiscretizer + + +def get_nodes_sign(data: pd.DataFrame) -> dict: + """Function to define sign of the node + neg - if node has negative values + pos - if node has only positive values + + Args: + data (pd.DataFrame): input dataset + + Returns: + dict: output dictionary where 'key' - node name and 'value' - sign of data + """ + nodes_types = get_nodes_type(data) + columns_sign = dict() + for c in data.columns.to_list(): + if nodes_types[c] == "cont": + if (data[c] < 0).any(): + columns_sign[c] = "neg" + else: + columns_sign[c] = "pos" + return columns_sign + + +def get_nodes_type(data: pd.DataFrame) -> dict: + """Function to define the type of the node + disc - discrete node + cont - continuous 
+ Args: + data (pd.DataFrame): input dataset + + Returns: + dict: output dictionary where 'key' - node name and 'value' - node type + """ + column_type = dict() + for c in data.columns.to_list(): + if (data[c].dtypes == "float64") | (data[c].dtypes == "float32"): + column_type[c] = "cont" + if ( + (data[c].dtypes == "str") + | (data[c].dtypes == "O") + | (data[c].dtypes == "b") + ): + column_type[c] = "disc" + if (data[c].dtypes == "int64") | (data[c].dtypes == "int32"): + column_type[c] = "disc" + return column_type + + +def discretization( + data: pd.DataFrame, method: str, columns: list, bins: int = 5 +) -> Tuple[pd.DataFrame, KBinsDiscretizer]: + """Discretization of continuous parameters + + Args: + data (pd.DataFrame): input dataset + method (str): discretization approach (equal_intervals, equal_frequency, kmeans) + columns (list): name of columns for discretization + bins (int, optional): number of bins. Defaults to 5. + + Returns: + pd.DataFrame: output dataset with discretized parameters + KBinsDiscretizer: fitted exemplar of discretization class + """ + data = data.dropna() + data.reset_index(inplace=True, drop=True) + d_data = copy(data) + est = KBinsDiscretizer(n_bins=bins, encode="ordinal") + strategy_dict = { + "equal_intervals": "uniform", + "equal_frequency": "quantile", + "kmeans": "kmeans", + } + if method in strategy_dict: + est.strategy = strategy_dict[method] + data_discrete = est.fit_transform(d_data.loc[:, columns].values) + d_data[columns] = data_discrete.astype("int") + else: + raise Exception("This discretization method is not supported") + + return d_data, est + + +def label_encoding(data, columns): + d_data = copy(data) + encoder_dict = dict() + for column in columns: + le = preprocessing.LabelEncoder() + d_data[column] = le.fit_transform(d_data[column].values) + mapping = dict(zip(le.classes_, range(len(le.classes_)))) + encoder_dict[column] = mapping + return d_data, encoder_dict + + +def onehot_encoding(data, columns): + d_data = pd.get_dummies(data, columns=columns) + return d_data, None + + +def code_categories( + data: pd.DataFrame, method: str, columns: list +) -> Tuple[pd.DataFrame, dict]: + """Encoding categorical parameters + + Args: + data (pd.DataFrame): input dataset + method (str): method of encoding (label or onehot) + columns (list): name of categorical columns + + Returns: + pd.DataFrame: output dataset with encoded parameters + dict: dictionary with values and codes + """ + data = data.dropna() + data.reset_index(inplace=True, drop=True) + encoding_func_dict = {"label": label_encoding, "onehot": onehot_encoding} + if method in encoding_func_dict: + d_data, encoder_dict = encoding_func_dict[method](data, columns) + else: + raise Exception("This encoding method is not supported") + + return d_data, encoder_dict + + +def inverse_discretization( + data: pd.DataFrame, columns: list, discretizer: KBinsDiscretizer +) -> pd.DataFrame: + """Inverse discretization for numeric params + + Args: + data (pd.DataFrame): input dataset with discrete values + columns (list): colums for inverse_discretization + discretizer (KBinsDiscretizer): fitted exemplar of discretization class + + Returns: + pd.DataFrame: output dataset with continuous values + """ + new_data = copy(data) + new_data[columns] = discretizer.inverse_transform(new_data[columns].values) + + return new_data + + +def decode(data: pd.DataFrame, columns: list, encoder_dict: dict) -> pd.DataFrame: + """Decoding categorical params to initial labels + + Args: + data (pd.DataFrame): input dataset with 
encoded params + columns (list): columns for decoding + encoder_dict (dict): dictionary with values and codes + Returns: + pd.DataFrame: output dataset with decoded params + """ + for column in columns: + dict_parameter = encoder_dict[column] + inv_map = {v: k for k, v in dict_parameter.items()} + data[column] = data[column].apply(lambda x: inv_map(x)) + + return data diff --git a/bamt/preprocess/graph.py b/bamt/preprocess/graph.py new file mode 100644 index 0000000..87e688a --- /dev/null +++ b/bamt/preprocess/graph.py @@ -0,0 +1,42 @@ +from collections import defaultdict + + +def nodes_from_edges(edges: list): + """ + Retrieves all nodes from the list of edges. + Arguments + ---------- + *edges* : list + + Returns + ------- + *nodes* : list + + Effects + ------- + None + """ + return set().union(edges) + + +def edges_to_dict(edges: list): + """ + Transfers the list of edges to the dictionary of parents. + Arguments + ---------- + *edges* : list + + Returns + ------- + *parents_dict* : dict + + Effects + ------- + None + """ + nodes = nodes_from_edges(edges) + parents_dict = defaultdict(list, {node: [] for node in nodes}) + parents_dict.update( + {child: parents_dict[child] + [parent] for parent, child in edges} + ) + return parents_dict diff --git a/bamt/preprocess/numpy_pandas.py b/bamt/preprocess/numpy_pandas.py new file mode 100644 index 0000000..32242df --- /dev/null +++ b/bamt/preprocess/numpy_pandas.py @@ -0,0 +1,57 @@ +# currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +# parentdir = os.path.dirname(currentdir) +# sys.path.insert(0,parentdir) +import numpy as np +import pandas as pd + + +def loc_to_DataFrame(data: np.array): + """Function to convert array to DataFrame + Args: + data (np.array): input array + + Returns: + data (pd.DataFrame): with string columns for filtering + """ + nodes_type = get_type_numpy(data) + if data.T.ndim == 1: + data = data.T + nodes_type = {0: nodes_type[0]} + dtype = { + key: "int64" if value == "disc" else "float64" + for key, value in nodes_type.items() + if value in ["disc", "cont"] + } + df = pd.DataFrame(data).astype(dtype) + df.columns = df.columns.map(str) + return df + + +def get_type_numpy(data: np.array): + """Function to define the type of the columns of array + disc - discrete node + cont - continuous + Args: + data (np.array): input array + + Returns: + dict: output dictionary where 'key' - node name and 'value' - node type + Notes: + -- You may have problems with confusing rows and columns + """ + arr = data.T + + column_type = {} + for i, row in enumerate(arr): + if row.ndim == 0 or row.T.ndim == 0: + row_is_integer = np.issubdtype(row, np.integer) or row.is_integer() + column_type[i] = "disc" if row_is_integer else "cont" + else: + all_row_is_integer = all( + np.issubdtype(x, np.integer) or x.is_integer() for x in row + ) + column_type[i] = "disc" if all_row_is_integer else "cont" + if column_type[i] not in ["disc", "cont"]: + print("get_type_numpy: Incorrect type of row") + print(row) + return column_type diff --git a/bamt/preprocessors.py b/bamt/preprocessors.py new file mode 100644 index 0000000..847ec9c --- /dev/null +++ b/bamt/preprocessors.py @@ -0,0 +1,126 @@ +from typing import Tuple, Dict + +from pandas import DataFrame + +from bamt.log import logger_preprocessor +from bamt.utils import GraphUtils as gru + + +class BasePreprocessor(object): + """ + Base for Preprocessor + """ + + def __init__(self): + self.nodes_signs = {} + self.nodes_types = {} + + @staticmethod + def 
get_nodes_types(data): + return gru.nodes_types(data=data) + + def get_nodes_signs(self, data): + return gru.nodes_signs(nodes_types=self.nodes_types, data=data) + + def code_categories( + self, data: DataFrame, encoder + ) -> Tuple[DataFrame, Dict[str, Dict]]: + """Encoding categorical parameters + + Args: + data (DataFrame): input dataset + encoder: any object with fit_transform method + + Returns: + pd.DataFrame: output dataset with encoded parameters + dict: dictionary with values and codes + """ + columns = [ + col for col in data.columns.to_list() if self.nodes_types[col] == "disc" + ] + df = data.copy() # INPUT DF. Debugging SettingWithCopyWarning + if not columns: + return df, None + data = df[columns] # DATA TO CATEGORIZE + encoder_dict = dict() + + for col_name, column in data.items(): + # Iterate over (column name, Series) pairs. + try: + df[col_name] = encoder.fit_transform(column.values) + except TypeError as exc: + logger_preprocessor.error( + f"Wrond data types on {col_name} ({df[col_name].dtypes}). Message: {exc}" + ) + try: + mapping = dict(zip(encoder.classes_, range(len(encoder.classes_)))) + encoder_dict[col_name] = mapping + except BaseException: + pass + return df, encoder_dict + + def discretize(self, data: DataFrame, discretizer) -> tuple: + columns = [ + col for col in data.columns.to_list() if self.nodes_types[col] == "cont" + ] + df = data.copy() + if not columns: + return df, None + data = df[columns] + + data_discrete = discretizer.fit_transform(data.values) + df[columns] = data_discrete.astype("int") + + return df, discretizer + + def decode(self): + pass + + +class Preprocessor(BasePreprocessor): + def __init__(self, pipeline: list): + super().__init__() + assert isinstance(pipeline, list), "pipeline must be list" + self.pipeline = pipeline + self.coder = None + + @property + def info(self): + return {"types": self.nodes_types, "signs": self.nodes_signs} + + def scan(self, data: DataFrame): + """ + Function to scan data. If something is wrong, it will be send to log file + """ + columns_cont = [ + col for col in data.columns.to_list() if self.nodes_types[col] == "cont" + ] + if not columns_cont: + logger_preprocessor.info("No one column is continuous") + + columns_disc = [ + col + for col in data.columns.to_list() + if self.nodes_types[col] in ["disc", "disc_num"] + ] + if not columns_disc: + logger_preprocessor.info("No one column is discrete") + + def apply(self, data: DataFrame) -> Tuple[DataFrame, Dict]: + """ + Apply pipeline + data: data to apply on + """ + df = data.copy() + self.nodes_types = self.get_nodes_types(data) + if list(self.nodes_types.keys()) != data.columns.to_list(): + logger_preprocessor.error("Nodes_types dictionary are not full.") + return None, None + self.nodes_signs = self.get_nodes_signs(data) + self.scan(df) + for name, instrument in self.pipeline: + if name == "encoder": + df, self.coder = self.code_categories(data=data, encoder=instrument) + if name == "discretizer": + df, est = self.discretize(data=df, discretizer=instrument) + return df, self.coder diff --git a/bamt/redef_HC.py b/bamt/redef_HC.py new file mode 100644 index 0000000..bab3119 --- /dev/null +++ b/bamt/redef_HC.py @@ -0,0 +1,316 @@ +""" +********************** +Greedy Hill-Climbing +for Structure Learning +********************** + +Code for Searching through the space of +possible Bayesian Network structures. + +Various optimization procedures are employed, +from greedy search to simulated annealing, and +so on - mostly using scipy.optimize. 
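Stepping back briefly before the hill-climbing internals: a hedged sketch, not part of the diff, of how the Preprocessor pipeline defined above is typically wired up. The choice of LabelEncoder and KBinsDiscretizer here is an assumption; any objects exposing a compatible fit_transform interface should work.

import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import KBinsDiscretizer
from bamt.preprocessors import Preprocessor

data = pd.DataFrame({
    "temp": [20.1, 21.3, 19.8, 23.4, 22.0, 18.7],
    "city": ["A", "B", "A", "C", "B", "A"],
})

encoder = preprocessing.LabelEncoder()
discretizer = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy="quantile")

p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)])
discretized_data, coder = p.apply(data)  # encoded + discretized copy, category codes
descriptor = p.info                      # {"types": {...}, "signs": {...}}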
+ +Local search - possible moves: +- Add edge +- Delete edge +- Invert edge + +Strategies to improve Greedy Hill-Climbing: +- Random Restarts + - when we get stuck, take some number of + random steps and then start climbing again. +- Tabu List + - keep a list of the K steps most recently taken, + and say that the search cannt reverse (undo) any + of these steps. +""" + +from bamt.external.pyBN.classes.bayesnet import BayesNet +from bamt.external.pyBN.utils.graph import would_cause_cycle +from bamt.mi_entropy_gauss import mi_gauss +from bamt.redef_info_scores import log_lik_local, BIC_local, AIC_local + + +def hc( + data, + metric="MI", + max_iter=200, + debug=False, + init_nodes=None, + restriction=None, + init_edges=None, + remove_geo_edges=True, + black_list=None, +): + """ + Greedy Hill Climbing search proceeds by choosing the move + which maximizes the increase in fitness of the + network at the current step. It continues until + it reaches a point where there does not exist any + feasible single move that increases the network fitness. + + It is called "greedy" because it simply does what is + best at the current iteration only, and thus does not + look ahead to what may be better later on in the search. + + For computational saving, a Priority Queue (python's heapq) + can be used to maintain the best operators and reduce the + complexity of picking the best operator from O(n^2) to O(nlogn). + This works by maintaining the heapq of operators sorted by their + delta score, and each time a move is made, we only have to recompute + the O(n) delta-scores which were affected by the move. The rest of + the operator delta-scores are not affected. + + For additional computational efficiency, we can cache the + sufficient statistics for various families of distributions - + therefore, computing the mutual information for a given family + only needs to happen once. + + The possible moves are the following: + - add edge + - delete edge + - invert edge + + Arguments + --------- + *data* : pd.DataFrame + The data from which the Bayesian network + structure will be learned. + + *metric* : a string + Which score metric to use. + Options: + - AIC + - BIC / MDL + - LL (log-likelihood) + + *max_iter* : an integer + The maximum number of iterations of the + hill-climbing algorithm to run. Note that + the algorithm will terminate on its own if no + improvement is made in a given iteration. + + *debug* : boolean + Whether to print(the scores/moves of the) + algorithm as its happening. + + *init_nodes* : a list of initialize nodes (number of nodes according to the dataset) + + *restriction* : a list of 2-tuples + For MMHC algorithm, the list of allowable edge additions. 
+ + Returns + ------- + *bn* : a BayesNet object + + """ + nrow = data.shape[0] + ncol = data.shape[1] + + names = range(ncol) + + # INITIALIZE NETWORK W/ NO EDGES + # maintain children and parents dict for fast lookups + c_dict = dict([(n, []) for n in names]) + p_dict = dict([(n, []) for n in names]) + if init_edges: + for edge in init_edges: + c_dict[edge[0]].append(edge[1]) + p_dict[edge[1]].append(edge[0]) + + bn = BayesNet(c_dict) + + mutual_information = mi_gauss + if metric == "BIC": + mutual_information = BIC_local + if metric == "AIC": + mutual_information = AIC_local + if metric == "LL": + mutual_information = log_lik_local + + data = data.values + + cache = dict() + + _iter = 0 + improvement = True + + while improvement: + improvement = False + max_delta = 0 + + if debug: + print("ITERATION: ", _iter) + + ### TEST ARC ADDITIONS ### + for u in bn.nodes(): + for v in bn.nodes(): + if ( + v not in c_dict[u] + and u != v + and not would_cause_cycle(c_dict, u, v) + and len(p_dict[v]) != 3 + ): + # FOR MMHC ALGORITHM -> Edge Restrictions + if ( + (init_nodes is None or not (v in init_nodes)) + and (restriction is None or (u, v) in restriction) + and (black_list is None or not ((u, v) in black_list)) + ): + # SCORE FOR 'V' -> gaining a parent + # without 'u' as parent + old_cols = (v,) + tuple(p_dict[v]) + if old_cols not in cache: + cache[old_cols] = mutual_information(data[:, old_cols]) + mi_old = cache[old_cols] + new_cols = old_cols + (u,) # with'u' as parent + if new_cols not in cache: + cache[new_cols] = mutual_information(data[:, new_cols]) + mi_new = cache[new_cols] + delta_score = nrow * (mi_old - mi_new) + + if delta_score > max_delta: + if debug: + print("Improved Arc Addition: ", (u, v)) + print("Delta Score: ", delta_score) + max_delta = delta_score + max_operation = "Addition" + max_arc = (u, v) + + # ### TEST ARC DELETIONS ### + for u in bn.nodes(): + for v in bn.nodes(): + if v in c_dict[u]: + # SCORE FOR 'V' -> losing a parent + old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent + if old_cols not in cache: + cache[old_cols] = mutual_information(data[:, old_cols]) + mi_old = cache[old_cols] + new_cols = tuple([i for i in old_cols if i != u]) + if new_cols not in cache: + cache[new_cols] = mutual_information(data[:, new_cols]) + mi_new = cache[new_cols] + delta_score = nrow * (mi_old - mi_new) + + if delta_score > max_delta: + if init_edges is None: + if debug: + print("Improved Arc Deletion: ", (u, v)) + print("Delta Score: ", delta_score) + max_delta = delta_score + max_operation = "Deletion" + max_arc = (u, v) + else: + if (u, v) in init_edges: + if remove_geo_edges: + if debug: + print("Improved Arc Deletion: ", (u, v)) + print("Delta Score: ", delta_score) + max_delta = delta_score + max_operation = "Deletion" + max_arc = (u, v) + else: + if debug: + print("Improved Arc Deletion: ", (u, v)) + print("Delta Score: ", delta_score) + max_delta = delta_score + max_operation = "Deletion" + max_arc = (u, v) + + # ### TEST ARC REVERSALS ### + for u in bn.nodes(): + for v in bn.nodes(): + if ( + v in c_dict[u] + and not would_cause_cycle(c_dict, v, u, reverse=True) + and len(p_dict[u]) != 3 + and (init_nodes is None or not (u in init_nodes)) + and (restriction is None or (v, u) in restriction) + and (black_list is None or not ((v, u) in black_list)) + ): + old_cols = (u,) + tuple(p_dict[v]) # without 'v' as parent + if old_cols not in cache: + cache[old_cols] = mutual_information(data[:, old_cols]) + mi_old = cache[old_cols] + new_cols = old_cols + (v,) # with 'v' as 
parent + if new_cols not in cache: + cache[new_cols] = mutual_information(data[:, new_cols]) + mi_new = cache[new_cols] + delta1 = -1 * nrow * (mi_old - mi_new) + # SCORE FOR 'V' -> losing 'u' as parent + old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent + if old_cols not in cache: + cache[old_cols] = mutual_information(data[:, old_cols]) + mi_old = cache[old_cols] + # without 'u' as parent + new_cols = tuple([u for i in old_cols if i != u]) + if new_cols not in cache: + cache[new_cols] = mutual_information(data[:, new_cols]) + mi_new = cache[new_cols] + delta2 = nrow * (mi_old - mi_new) + # COMBINED DELTA-SCORES + delta_score = delta1 + delta2 + + if delta_score > max_delta: + if init_edges is None: + if debug: + print("Improved Arc Reversal: ", (u, v)) + print("Delta Score: ", delta_score) + max_delta = delta_score + max_operation = "Reversal" + max_arc = (u, v) + else: + if (u, v) in init_edges: + if remove_geo_edges: + if debug: + print("Improved Arc Reversal: ", (u, v)) + print("Delta Score: ", delta_score) + max_delta = delta_score + max_operation = "Reversal" + max_arc = (u, v) + else: + if debug: + print("Improved Arc Reversal: ", (u, v)) + print("Delta Score: ", delta_score) + max_delta = delta_score + max_operation = "Reversal" + max_arc = (u, v) + + if max_delta != 0: + improvement = True + u, v = max_arc + if max_operation == "Addition": + if debug: + print("ADDING: ", max_arc, "\n") + c_dict[u].append(v) + p_dict[v].append(u) + + elif max_operation == "Deletion": + if debug: + print("DELETING: ", max_arc, "\n") + c_dict[u].remove(v) + p_dict[v].remove(u) + + elif max_operation == "Reversal": + if debug: + print("REVERSING: ", max_arc, "\n") + c_dict[u].remove(v) + p_dict[v].remove(u) + c_dict[v].append(u) + p_dict[u].append(v) + + else: + if debug: + print("No Improvement on Iter: ", _iter) + + ### TEST FOR MAX ITERATION ### + _iter += 1 + if _iter > max_iter: + if debug: + print("Max Iteration Reached") + break + + bn = BayesNet(c_dict) + + return bn diff --git a/bamt/redef_info_scores.py b/bamt/redef_info_scores.py new file mode 100644 index 0000000..0890550 --- /dev/null +++ b/bamt/redef_info_scores.py @@ -0,0 +1,179 @@ +import sys +import warnings +from copy import copy + +import numpy as np +import pandas as pd + +from bamt.mi_entropy_gauss import mi_gauss as mutual_information, entropy_all as entropy +from bamt.preprocess.graph import edges_to_dict +from bamt.preprocess.numpy_pandas import get_type_numpy + + +def info_score(edges: list, data: pd.DataFrame, method="LL"): + score_funcs = {"LL": log_lik_local, "BIC": BIC_local, "AIC": AIC_local} + score = score_funcs.get(method.upper(), BIC_local) + + parents_dict = edges_to_dict(edges) + nodes_with_edges = parents_dict.keys() + scores = [ + score(data[child_parents].copy(), method) + for var in nodes_with_edges + for child_parents in ([var] + parents_dict[var],) + ] + scores += [ + score(data[[var]].copy(), method) + for var in set(data.columns).difference(set(nodes_with_edges)) + ] + return sum(scores) + + +##### INFORMATION-THEORETIC SCORING FUNCTIONS ##### + + +def log_likelihood(bn, data, method="LL"): + """ + Determining log-likelihood of the parameters + of a Bayesian Network. This is a quite simple + score/calculation, but it is useful as a straight-forward + structure learning score. 
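For context, a hedged usage sketch (not part of the diff) of the hc search defined in redef_HC.py above. It works on column positions, so the returned pyBN BayesNet indexes its vertices by position rather than by name; the toy DataFrame is an assumption.

import numpy as np
import pandas as pd
from bamt.redef_HC import hc

rng = np.random.default_rng(0)
x = rng.normal(size=300)
df = pd.DataFrame({
    "a": x,
    "b": 0.8 * x + rng.normal(scale=0.2, size=300),
    "c": rng.normal(size=300),
})

bn = hc(df, metric="BIC", max_iter=50)
# bn is a BayesNet whose nodes are the column positions 0, 1, 2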
+ + Semantically, this can be considered as the evaluation + of the log-likelihood of the data, given the structure + and parameters of the BN: + - log( P( D | Theta_G, G ) ) + where Theta_G are the parameters and G is the structure. + + However, for computational reasons it is best to take + advantage of the decomposability of the log-likelihood score. + + As an example, if you add an edge from A->B, then you simply + need to calculate LOG(P'(B|A)) - Log(P(B)), and if the value + is positive then the edge improves the fitness score and should + therefore be included. + + Even more, you can expand and manipulate terms to calculate the + difference between the new graph and the original graph as follows: + Score(G') - Score(G) = M * I(X,Y), + where M is the number of data points and I(X,Y) is + the marginal mutual information calculated using + the empirical distribution over the data. + + In general, the likelihood score decomposes as follows: + LL(D | Theta_G, G) = + M * Sum over Variables ( I ( X , Parents(X) ) ) - + M * Sum over Variables ( H( X ) ), + where 'I' is mutual information and 'H' is the entropy, + and M is the number of data points + + Moreover, it is clear to see that H(X) is independent of the choice + of graph structure (G). Thus, we must only determine the difference + in the mutual information score of the original graph which had a given + node and its original parents, and the new graph which has a given node + and new parents. + + NOTE: This assumes the parameters have already + been learned for the BN's given structure. + + LL = LL - f(N)*|B|, where f(N) = 0 + + Arguments + --------- + *bn* : a BayesNet object + Must have both structure and parameters + instantiated. + Notes + ----- + NROW = data.shape[0] + mi_score = 0 + ent_score = 0 + for rv in bn.nodes(): + cols = tuple([bn.V.index(rv)].extend([bn.V.index(p) for p in bn.parents(rv)])) + mi_score += mutual_information(data[:,cols]) + ent_score += entropy(data[:,bn.V.index(rv)]) + + return NROW * (mi_score - ent_score) + """ + + NROW = data.shape[0] + mi_scores = [ + mutual_information( + data[:, (bn.V.index(rv),) + tuple([bn.V.index(p) for p in bn.parents(rv)])], + method=method, + ) + for rv in bn.nodes() + ] + ent_scores = [entropy(data[:, bn.V.index(rv)], method=method) for rv in bn.nodes()] + return NROW * (sum(mi_scores) - sum(ent_scores)) + + +def log_lik_local(data, method="LL"): + NROW = data.shape[0] + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + if isinstance(data, pd.DataFrame): + return NROW * ( + mutual_information(data, method=method) + - entropy(data.iloc[:, 0], method=method) + ) + elif isinstance(data, pd.Series): + return 0.0 + elif isinstance(data, np.ndarray): + return NROW * ( + mutual_information(data, method=method) + - entropy(data[:, 0], method=method) + ) + + +def BIC_local(data, method="BIC"): + NROW = data.shape[0] + log_score = log_lik_local(data, method=method) + try: + penalty = 0.5 * num_params(data) * np.log(NROW) + except OverflowError as err: + penalty = sys.float_info.max + return log_score - penalty + + +def num_params(data): + # Convert pandas DataFrame to numpy array + if isinstance(data, pd.DataFrame): + data = data.values + # Convert pandas Series to numpy array + if isinstance(data, pd.Series): + data = np.array(copy(data)) + + # Calculate number of parameters for numpy array + if isinstance(data, np.ndarray): + node_type = get_type_numpy(data) + columns_for_discrete = [ + param for param, node in node_type.items() if node == "cont" + ] + 
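A small illustrative call, not part of the diff, of the local scores defined above. By the convention used in info_score, the first column of the frame is the child and the remaining columns are its parents; the column names and data here are assumptions.

import numpy as np
import pandas as pd
from bamt.redef_info_scores import log_lik_local, BIC_local

rng = np.random.default_rng(0)
x = rng.normal(size=500)
df = pd.DataFrame({"child": 0.7 * x + rng.normal(scale=0.3, size=500), "parent": x})

ll = log_lik_local(df)   # N * (MI(child; parent) - H(child))
bic = BIC_local(df)      # same quantity minus the 0.5 * k * log(N) penalty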
columns_for_code = [ + param for param, node in node_type.items() if node == "disc" + ] + + prod = 1 + for var in columns_for_code: + prod *= ( + len(np.unique(data[:, var])) if data.ndim != 1 else len(np.unique(data)) + ) + if columns_for_discrete: + prod *= len(columns_for_discrete) + + # Handle overflow error + try: + return prod + except OverflowError: + return sys.float_info.max + + # Raise an error if data type is unexpected + print("Num_params: Unexpected data type") + print(data) + return None + + +def AIC_local(data, method="AIC"): + log_score = log_lik_local(data, method=method) + penalty = num_params(data) + return log_score - penalty diff --git a/bamt/score_functions/__init__.py b/bamt/score_functions/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bamt/score_functions/k2_score.py b/bamt/score_functions/k2_score.py deleted file mode 100644 index 4a46b6a..0000000 --- a/bamt/score_functions/k2_score.py +++ /dev/null @@ -1,9 +0,0 @@ -from .score_function import ScoreFunction - - -class K2Score(ScoreFunction): - def __init__(self): - super().__init__() - - def estimate(self): - pass diff --git a/bamt/score_functions/mutual_information_score.py b/bamt/score_functions/mutual_information_score.py deleted file mode 100644 index 0bd2b7e..0000000 --- a/bamt/score_functions/mutual_information_score.py +++ /dev/null @@ -1,9 +0,0 @@ -from .score_function import ScoreFunction - - -class MutualInformationScore(ScoreFunction): - def __init__(self): - super().__init__() - - def estimate(self): - pass diff --git a/bamt/score_functions/score_function.py b/bamt/score_functions/score_function.py deleted file mode 100644 index e3ec70f..0000000 --- a/bamt/score_functions/score_function.py +++ /dev/null @@ -1,10 +0,0 @@ -from abc import ABC, abstractmethod - - -class ScoreFunction(ABC): - def __init__(self): - pass - - @abstractmethod - def estimate(self): - pass diff --git a/bamt/utils/EvoUtils.py b/bamt/utils/EvoUtils.py new file mode 100644 index 0000000..5266a75 --- /dev/null +++ b/bamt/utils/EvoUtils.py @@ -0,0 +1,116 @@ +import random + +import pandas as pd +from golem.core.dag.convert import graph_structure_as_nx_graph +from golem.core.dag.graph_utils import ordered_subnodes_hierarchy +from golem.core.optimisers.graph import OptGraph, OptNode +from pgmpy.estimators import K2Score +from pgmpy.models import BayesianNetwork + + +class CustomGraphModel(OptGraph): + def evaluate(self, data: pd.DataFrame): + nodes = data.columns.to_list() + _, labels = graph_structure_as_nx_graph(self) + return len(nodes) + + +class CustomGraphNode(OptNode): + def __str__(self): + return f'{self.content["name"]}' + + +def K2_metric(graph: CustomGraphModel, data: pd.DataFrame): + graph_nx, labels = graph_structure_as_nx_graph(graph) + struct = [] + for meta_edge in graph_nx.edges(): + l1 = str(labels[meta_edge[0]]) + l2 = str(labels[meta_edge[1]]) + struct.append([l1, l2]) + + bn_model = BayesianNetwork(struct) + bn_model.add_nodes_from(data.columns) + + score = K2Score(data).score(bn_model) + return -score + + +def custom_mutation_add(graph: CustomGraphModel, **kwargs): + num_mut = 100 + try: + for _ in range(num_mut): + rid = random.choice(range(len(graph.nodes))) + random_node = graph.nodes[rid] + other_random_node = graph.nodes[random.choice(range(len(graph.nodes)))] + nodes_not_cycling = random_node.descriptive_id not in [ + n.descriptive_id for n in ordered_subnodes_hierarchy(other_random_node) + ] and other_random_node.descriptive_id not in [ + n.descriptive_id for n in 
ordered_subnodes_hierarchy(random_node) + ] + if nodes_not_cycling: + random_node.nodes_from.append(other_random_node) + break + + except Exception as ex: + print(f"Incorrect connection: {ex}") + return graph + + +def custom_mutation_delete(graph: OptGraph, **kwargs): + num_mut = 100 + try: + for _ in range(num_mut): + rid = random.choice(range(len(graph.nodes))) + random_node = graph.nodes[rid] + other_random_node = graph.nodes[random.choice(range(len(graph.nodes)))] + if ( + random_node.nodes_from is not None + and other_random_node in random_node.nodes_from + ): + random_node.nodes_from.remove(other_random_node) + break + except Exception as ex: + print(ex) + return graph + + +def custom_mutation_reverse(graph: OptGraph, **kwargs): + num_mut = 100 + try: + for _ in range(num_mut): + rid = random.choice(range(len(graph.nodes))) + random_node = graph.nodes[rid] + other_random_node = graph.nodes[random.choice(range(len(graph.nodes)))] + if ( + random_node.nodes_from is not None + and other_random_node in random_node.nodes_from + ): + random_node.nodes_from.remove(other_random_node) + other_random_node.nodes_from.append(random_node) + break + except Exception as ex: + print(ex) + return graph + + +def has_no_duplicates(graph): + _, labels = graph_structure_as_nx_graph(graph) + if len(labels.values()) != len(set(labels.values())): + raise ValueError("Custom graph has duplicates") + return True + + +def has_no_blacklist_edges(graph, blacklist): + nx_graph, _ = graph_structure_as_nx_graph(graph) + for edge in nx_graph.edges(): + if edge in blacklist: + raise ValueError("Graph contains blacklisted edges") + return True + + +def has_only_whitelist_edges(graph, whitelist): + nx_graph, _ = graph_structure_as_nx_graph(graph) + for edge in nx_graph.edges(): + if edge not in whitelist: + raise ValueError("Graph contains non-whitelisted edges") + return True diff --git a/bamt/utils/GraphUtils.py b/bamt/utils/GraphUtils.py new file mode 100644 index 0000000..ccffae1 --- /dev/null +++ b/bamt/utils/GraphUtils.py @@ -0,0 +1,158 @@ +from typing import Dict, List, Tuple, Type + +import networkx as nx +from pandas import DataFrame + +from bamt.log import logger_preprocessor +from bamt.nodes.base import BaseNode + + +def nodes_types(data: DataFrame) -> Dict[str, str]: + """ + Function to define the type of the node + disc - discrete node + cont - continuous + Args: + data: input dataset + + Returns: + dict: output dictionary where 'key' - node name and 'value' - node type + """ + + column_type = dict() + for c in data.columns.to_list(): + disc = ["str", "O", "b", "categorical", "object", "bool"] + disc_numerical = ["int32", "int64"] + cont = ["float32", "float64"] + if data[c].dtype.name in disc: + column_type[c] = "disc" + elif data[c].dtype.name in cont: + column_type[c] = "cont" + elif data[c].dtype.name in disc_numerical: + column_type[c] = "disc_num" + else: + logger_preprocessor.error(f"Unsupported data type. 
Dtype: {data[c].dtypes}") + + return column_type + + +def nodes_signs(nodes_types: dict, data: DataFrame) -> Dict[str, str]: + """Function to define sign of the node + neg - if node has negative values + pos - if node has only positive values + + Args: + data (pd.DataFrame): input dataset + nodes_types (dict): dict with nodes_types + + Returns: + dict: output dictionary where 'key' - node name and 'value' - sign of data + """ + if list(nodes_types.keys()) != data.columns.to_list(): + logger_preprocessor.error("Nodes_types dictionary is not full.") + return + columns_sign = dict() + for c in data.columns.to_list(): + if nodes_types[c] == "cont": + if (data[c] < 0).any(): + columns_sign[c] = "neg" + else: + columns_sign[c] = "pos" + return columns_sign + + +def get_descriptor(data) -> Dict[str, Dict[str, str]]: + return {"types": nodes_types(data), "signs": nodes_signs(nodes_types(data), data)} + + +def toporder(nodes: List[Type[BaseNode]], edges: List[Tuple]) -> List[List[str]]: + """ + Function for topological sorting + """ + G = nx.DiGraph() + G.add_nodes_from([node.name for node in nodes]) + G.add_edges_from(edges) + return list(nx.topological_sort(G)) + + +class GraphAnalyzer(object): + """ + Object to analyze DAG. + """ + + def __init__(self, bn): + self.bn = bn + + def _isolate_structure(self, nodes): + isolated_edges = [] + for edge in self.bn.edges: + if edge[0] in nodes and edge[1] in nodes: + isolated_edges.append(edge) + return isolated_edges + + def markov_blanket(self, node_name: str): + node = self.bn[node_name] + + parents = node.cont_parents + node.disc_parents + children = node.children + fremd_eltern = [] + + for child in node.children: + all_parents = self.bn[child].cont_parents + self.bn[child].disc_parents + + if all_parents == [node_name]: + continue + else: + new = all_parents + fremd_eltern.extend(new) + + nodes = parents + children + fremd_eltern + [node_name] + + edges = self._isolate_structure(nodes) + return {"nodes": list(set(nodes)), "edges": edges} + + def _collect_height(self, node_name, height): + nodes = [] + node = self.bn[node_name] + if height <= 0: + return [] + + if height == 1: + return node.disc_parents + node.cont_parents + + for parent in node.cont_parents + node.disc_parents: + nodes.append(parent) + nodes.extend(self._collect_height(parent, height=height - 1)) + return nodes + + def _collect_depth(self, node_name, depth): + nodes = [] + node = self.bn[node_name] + + if depth <= 0: + return [] + + if depth == 1: + return node.children + + for child in node.children: + nodes.append(child) + nodes.extend(self._collect_depth(child, depth=depth - 1)) + + return nodes + + def find_family(self, *args): + node_name, height, depth, with_nodes = args + if not with_nodes: + with_nodes = [] + else: + with_nodes = list(with_nodes) + nodes = ( + self._collect_depth(node_name, depth) + + self._collect_height(node_name, height) + + [node_name] + ) + + nodes = list(set(nodes + with_nodes)) + + return {"nodes": nodes, "edges": self._isolate_structure(nodes + with_nodes)} diff --git a/bamt/utils/MathUtils.py b/bamt/utils/MathUtils.py new file mode 100644 index 0000000..ed08105 --- /dev/null +++ b/bamt/utils/MathUtils.py @@ -0,0 +1,161 @@ +import math + +import numpy as np +from scipy import stats +from scipy.stats.distributions import chi2 +from sklearn.mixture import GaussianMixture + + +def lrts_comp(data): + n = 0 + biggets_p = -1 * np.infty + comp_biggest = 0 + max_comp = 10 + if len(data) < max_comp: + max_comp = len(data) + for i in range(1, max_comp + 1, 1): 
+ gm1 = GaussianMixture(n_components=i, random_state=0) + gm2 = GaussianMixture(n_components=i + 1, random_state=0) + gm1.fit(data) + ll1 = np.mean(gm1.score_samples(data)) + gm2.fit(data) + ll2 = np.mean(gm2.score_samples(data)) + LR = 2 * (ll2 - ll1) + p = chi2.sf(LR, 1) + if p > biggets_p: + biggets_p = p + comp_biggest = i + n = comp_biggest + return n + + +def mix_norm_cdf(x, weights, means, covars): + mcdf = 0.0 + for i in range(len(weights)): + mcdf += weights[i] * stats.norm.cdf(x, loc=means[i][0], scale=covars[i][0][0]) + return mcdf + + +def theoretical_quantile(data, n_comp): + model = GaussianMixture(n_components=n_comp, random_state=0) + model.fit(data) + q = [] + x = [] + # step = ((np.max(model.sample(100000)[0])) - (np.min(model.sample(100000)[0])))/1000 + step = (np.max(data) - np.min(data)) / 1000 + d = np.arange(np.min(data), np.max(data), step) + for i in d: + x.append(i) + q.append(mix_norm_cdf(i, model.weights_, model.means_, model.covariances_)) + return x, q + + +def quantile_mix(p, vals, q): + ind = q.index(min(q, key=lambda x: abs(x - p))) + return vals[ind] + + +def probability_mix(val, vals, q): + ind = vals.index(min(vals, key=lambda x: abs(x - val))) + return q[ind] + + +def sum_dist(data, vals, q): + percs = np.linspace(1, 100, 10) + x = np.quantile(data, percs / 100) + y = [] + for p in percs: + y.append(quantile_mix(p / 100, vals, q)) + dist = 0 + for xi, yi in zip(x, y): + dist = dist + (abs(-1 * xi + yi)) / math.sqrt(2) + return dist + + +def component(data, columns, method): + n = 1 + max_comp = 10 + x = [] + if data.shape[0] < max_comp: + max_comp = data.shape[0] + if len(columns) == 1: + x = np.transpose([data[columns[0]].values]) + else: + x = data[columns].values + if method == "aic": + lowest_aic = np.infty + comp_lowest = 0 + for i in range(1, max_comp + 1, 1): + gm1 = GaussianMixture(n_components=i, random_state=0) + gm1.fit(x) + aic1 = gm1.aic(x) + if aic1 < lowest_aic: + lowest_aic = aic1 + comp_lowest = i + n = comp_lowest + + if method == "bic": + lowest_bic = np.infty + comp_lowest = 0 + for i in range(1, max_comp + 1, 1): + gm1 = GaussianMixture(n_components=i, random_state=0) + gm1.fit(x) + bic1 = gm1.bic(x) + if bic1 < lowest_bic: + lowest_bic = bic1 + comp_lowest = i + n = comp_lowest + + if method == "LRTS": + n = lrts_comp(x) + if method == "quantile": + biggest_p = -1 * np.infty + comp_biggest = 0 + for i in range(1, max_comp, 1): + vals, q = theoretical_quantile(x, i) + dist = sum_dist(x, vals, q) + p = probability_mix(dist, vals, q) + if p > biggest_p: + biggest_p = p + comp_biggest = i + n = comp_biggest + return n + + +def _child_dict(net: list): + res_dict = dict() + for e0, e1 in net: + if e1 in res_dict: + res_dict[e1].append(e0) + else: + res_dict[e1] = [e0] + return res_dict + + +def precision_recall(pred_net: list, true_net: list, decimal=4): + pred_dict = _child_dict(pred_net) + true_dict = _child_dict(true_net) + corr_undirected = 0 + corr_dir = 0 + for e0, e1 in pred_net: + flag = True + if e1 in true_dict: + if e0 in true_dict[e1]: + corr_undirected += 1 + corr_dir += 1 + flag = False + if (e0 in true_dict) and flag: + if e1 in true_dict[e0]: + corr_undirected += 1 + pred_len = len(pred_net) + true_len = len(true_net) + shd = pred_len + true_len - corr_undirected - corr_dir + return { + "AP": round(corr_undirected / pred_len, decimal), + "AR": round(corr_undirected / true_len, decimal), + # 'F1_undir': round(2 * (corr_undirected / pred_len) * (corr_undirected / true_len) / (corr_undirected / pred_len + corr_undirected / 
true_len), decimal), + "AHP": round(corr_dir / pred_len, decimal), + "AHR": round(corr_dir / true_len, decimal), + # 'F1_directed': round(2*(corr_dir/pred_len)*(corr_dir/true_len)/(corr_dir/pred_len+corr_dir/true_len), decimal), + "SHD": shd, + } diff --git a/bamt/dag_optimizers/__init__.py b/bamt/utils/__init__.py similarity index 100% rename from bamt/dag_optimizers/__init__.py rename to bamt/utils/__init__.py diff --git a/bamt/utils/check_utils.py b/bamt/utils/check_utils.py new file mode 100644 index 0000000..ec44a33 --- /dev/null +++ b/bamt/utils/check_utils.py @@ -0,0 +1,9 @@ +def is_model(model): + try: + methods = dir(model) + if "fit" in methods: + return True + else: + return False + except Exception: + return False diff --git a/bamt/utils/composite_utils/CompositeGeneticOperators.py b/bamt/utils/composite_utils/CompositeGeneticOperators.py new file mode 100644 index 0000000..f42493a --- /dev/null +++ b/bamt/utils/composite_utils/CompositeGeneticOperators.py @@ -0,0 +1,189 @@ +from math import log10 +from random import choice + +import pandas as pd +from golem.core.dag.graph_utils import ordered_subnodes_hierarchy +from numpy import std, mean, log +from scipy.stats import norm +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import train_test_split + +from .CompositeModel import CompositeModel +from .MLUtils import MlModels + + +def custom_crossover_all_model( + graph_first: CompositeModel, graph_second: CompositeModel, max_depth +): + num_cros = 100 + try: + for _ in range(num_cros): + selected_node1 = choice(graph_first.nodes) + if selected_node1.nodes_from is None or selected_node1.nodes_from == []: + continue + + selected_node2 = graph_second.get_nodes_by_name(str(selected_node1))[0] + if selected_node2.nodes_from is None or selected_node2.nodes_from == []: + continue + + model1 = selected_node1.content["parent_model"] + model2 = selected_node2.content["parent_model"] + + selected_node1.content["parent_model"] = model2 + selected_node2.content["parent_model"] = model1 + + break + + except Exception as ex: + print(ex) + return graph_first, graph_second + + +def custom_mutation_add_structure(graph: CompositeModel, **kwargs): + num_mut = 100 + try: + for _ in range(num_mut): + rid = choice(range(len(graph.nodes))) + random_node = graph.nodes[rid] + other_random_node = graph.nodes[choice(range(len(graph.nodes)))] + nodes_not_cycling = random_node.descriptive_id not in [ + n.descriptive_id for n in ordered_subnodes_hierarchy(other_random_node) + ] and other_random_node.descriptive_id not in [ + n.descriptive_id for n in ordered_subnodes_hierarchy(random_node) + ] + if nodes_not_cycling: + other_random_node.nodes_from.append(random_node) + ml_models = MlModels() + other_random_node.content[ + "parent_model" + ] = ml_models.get_model_by_children_type(other_random_node) + break + + except Exception as ex: + graph.log.warn(f"Incorrect connection: {ex}") + return graph + + +def custom_mutation_delete_structure(graph: CompositeModel, **kwargs): + num_mut = 100 + try: + for _ in range(num_mut): + rid = choice(range(len(graph.nodes))) + random_node = graph.nodes[rid] + other_random_node = graph.nodes[choice(range(len(graph.nodes)))] + if ( + random_node.nodes_from is not None + and other_random_node in random_node.nodes_from + ): + random_node.nodes_from.remove(other_random_node) + if not random_node.nodes_from: + random_node.content["parent_model"] = None + break + except Exception as ex: + print(ex) + return graph + + +def 
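A quick worked example, not part of the diff, of the structure-comparison metrics above; the two edge lists are made up. With one edge recovered with the correct orientation and one recovered reversed, the undirected precision and recall are 1.0, the directed ones 0.5, and the SHD is 1.

from bamt.utils.MathUtils import precision_recall

pred_edges = [("A", "B"), ("B", "C")]
true_edges = [("A", "B"), ("C", "B")]

print(precision_recall(pred_edges, true_edges))
# {'AP': 1.0, 'AR': 1.0, 'AHP': 0.5, 'AHR': 0.5, 'SHD': 1}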
custom_mutation_reverse_structure(graph: CompositeModel, **kwargs): + num_mut = 100 + try: + for _ in range(num_mut): + rid = choice(range(len(graph.nodes))) + random_node = graph.nodes[rid] + other_random_node = graph.nodes[choice(range(len(graph.nodes)))] + if ( + random_node.nodes_from is not None + and other_random_node in random_node.nodes_from + ): + random_node.nodes_from.remove(other_random_node) + if not random_node.nodes_from: + random_node.content["parent_model"] = None + other_random_node.nodes_from.append(random_node) + ml_models = MlModels() + other_random_node.content[ + "parent_model" + ] = ml_models.get_model_by_children_type(other_random_node) + break + except Exception as ex: + print(ex) + return graph + + +def custom_mutation_add_model(graph: CompositeModel, **kwargs): + try: + all_nodes = graph.nodes + nodes_with_parents = [ + node + for node in all_nodes + if (node.nodes_from != [] and node.nodes_from is not None) + ] + if not nodes_with_parents: + return graph + node = choice(nodes_with_parents) + ml_models = MlModels() + node.content["parent_model"] = ml_models.get_model_by_children_type(node) + except Exception as ex: + print(ex) + return graph + + +def composite_metric(graph: CompositeModel, data: pd.DataFrame, percent=0.02): + data_all = data + data_train, data_test = train_test_split(data_all, train_size=0.8, random_state=42) + score, len_data = 0, len(data_train) + for node in graph.nodes: + data_of_node_train = data_train[node.content["name"]] + data_of_node_test = data_test[node.content["name"]] + if node.nodes_from is None or node.nodes_from == []: + if node.content["type"] == "cont": + mu, sigma = mean(data_of_node_train), std(data_of_node_train) + score += norm.logpdf( + data_of_node_test.values, loc=mu, scale=sigma + ).sum() + else: + count = data_of_node_train.value_counts() + frequency = log(count / len_data) + index = frequency.index.tolist() + for value in data_of_node_test: + if value in index: + score += frequency[value] + else: + model, columns, target, idx = ( + MlModels().dict_models[node.content["parent_model"]](), + [n.content["name"] for n in node.nodes_from], + data_of_node_train.to_numpy(), + data_train.index.to_numpy(), + ) + setattr(model, "max_iter", 100000) + features = data_train[columns].to_numpy() + if len(set(target)) == 1: + continue + fitted_model = model.fit(features, target) + + features = data_test[columns].to_numpy() + target = data_of_node_test.to_numpy() + if node.content["type"] == "cont": + predict = fitted_model.predict(features) + mse = mean_squared_error(target, predict, squared=False) + 0.0000001 + a = norm.logpdf(target, loc=predict, scale=mse) + score += a.sum() + else: + predict_proba = fitted_model.predict_proba(features) + idx = pd.array(list(range(len(target)))) + li = [] + + for i in idx: + a = predict_proba[i] + try: + b = a[target[i]] + except BaseException: + b = 0.0000001 + if b < 0.0000001: + b = 0.0000001 + li.append(log(b)) + score += sum(li) + + edges_count = len(graph.get_edges()) + score -= (edges_count * percent) * log10(len_data) * edges_count + + return -score diff --git a/bamt/utils/composite_utils/CompositeModel.py b/bamt/utils/composite_utils/CompositeModel.py new file mode 100644 index 0000000..344a7c8 --- /dev/null +++ b/bamt/utils/composite_utils/CompositeModel.py @@ -0,0 +1,17 @@ +from typing import Optional, Union, List + +from golem.core.dag.graph_delegate import GraphDelegate +from golem.core.dag.linked_graph_node import LinkedGraphNode + + +class CompositeNode(LinkedGraphNode): + def 
__str__(self): + return self.content["name"] + + +class CompositeModel(GraphDelegate): + def __init__( + self, nodes: Optional[Union[LinkedGraphNode, List[LinkedGraphNode]]] = None + ): + super().__init__(nodes) + self.unique_pipeline_id = 1 diff --git a/bamt/utils/composite_utils/MLUtils.py b/bamt/utils/composite_utils/MLUtils.py new file mode 100644 index 0000000..9c3cefb --- /dev/null +++ b/bamt/utils/composite_utils/MLUtils.py @@ -0,0 +1,126 @@ +import json +from random import choice + +import pkg_resources +from typing import Union + +from catboost import CatBoostClassifier, CatBoostRegressor +from golem.core.dag.graph_node import GraphNode +from sklearn.cluster import KMeans +from sklearn.ensemble import ( + AdaBoostRegressor, + ExtraTreesRegressor, + GradientBoostingRegressor, + RandomForestClassifier, + RandomForestRegressor, +) +from sklearn.linear_model import ( + Lasso, + LinearRegression, + LogisticRegression, + Ridge, + SGDRegressor, +) +from sklearn.naive_bayes import BernoulliNB, MultinomialNB +from sklearn.neural_network import MLPClassifier +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from xgboost import XGBClassifier, XGBRegressor + +from .CompositeModel import CompositeNode + +# Try to import LGBMRegressor and LGBMClassifier from lightgbm, if not available set to None +try: + from lightgbm.sklearn import LGBMRegressor, LGBMClassifier +except ModuleNotFoundError: + LGBMRegressor = None + LGBMClassifier = None + +lgbm_params = "lgbm_params.json" +models_repo = "models_repo.json" +lgbm_params_path = pkg_resources.resource_filename(__name__, lgbm_params) +models_repo_path = pkg_resources.resource_filename(__name__, models_repo) + + +class MlModels: + def __init__(self): + self.operations_by_types = { + "xgbreg": "XGBRegressor", + "adareg": "AdaBoostRegressor", + "gbr": "GradientBoostingRegressor", + "dtreg": "DecisionTreeRegressor", + "treg": "ExtraTreesRegressor", + "rfr": "RandomForestRegressor", + "linear": "LinearRegression", + "ridge": "Ridge", + "lasso": "Lasso", + "sgdr": "SGDRegressor", + "lgbmreg": "LGBMRegressor", + "catboostreg": "CatBoostRegressor", + "xgboost": "XGBClassifier", + "logit": "LogisticRegression", + "bernb": "BernoulliNB", + "multinb": "MultinomialNB", + "dt": "DecisionTreeClassifier", + "rf": "RandomForestClassifier", + "mlp": "MLPClassifier", + "catboost": "CatBoostClassifier", + "kmeans": "KMeans", + } + + self.dict_models = { + "XGBRegressor": XGBRegressor, + "AdaBoostRegressor": AdaBoostRegressor, + "GradientBoostingRegressor": GradientBoostingRegressor, + "DecisionTreeRegressor": DecisionTreeRegressor, + "ExtraTreesRegressor": ExtraTreesRegressor, + "RandomForestRegressor": RandomForestRegressor, + "LinearRegression": LinearRegression, + "Ridge": Ridge, + "Lasso": Lasso, + "SGDRegressor": SGDRegressor, + "LGBMRegressor": LGBMRegressor, + "CatBoostRegressor": CatBoostRegressor, + "XGBClassifier": XGBClassifier, + "LogisticRegression": LogisticRegression, + "BernoulliNB": BernoulliNB, + "MultinomialNB": MultinomialNB, + "DecisionTreeClassifier": DecisionTreeClassifier, + "RandomForestClassifier": RandomForestClassifier, + "MLPClassifier": MLPClassifier, + "LGBMClassifier": LGBMClassifier, + "CatBoostClassifier": CatBoostClassifier, + "KMeans": KMeans, + } + + # Include LGBMRegressor and LGBMClassifier if they were imported successfully + if LGBMRegressor is not None: + self.dict_models["LGBMRegressor"] = LGBMRegressor + self.operations_by_types["lgbmreg"] = "LGBMRegressor" + if LGBMClassifier is not None: + 
self.dict_models["LGBMClassifier"] = LGBMClassifier + self.operations_by_types["lgbm"] = "LGBMClassifier" + + if LGBMClassifier and LGBMRegressor is not None: + with open(lgbm_params_path) as file: + self.lgbm_dict = json.load(file) + + def get_model_by_children_type(self, node: Union[GraphNode, CompositeNode]): + candidates = [] + if node.content["type"] == "cont": + type_model = "regr" + else: + type_model = "class" + forbidden_tags = ["non-default", "expensive"] + with open(models_repo_path, "r") as f: + models_json = json.load(f) + models = models_json["operations"] + if LGBMClassifier and LGBMRegressor is not None: + models = models | self.lgbm_dict + for model, value in models.items(): + if ( + model not in ["knnreg", "knn", "qda"] + and list(set(value["tags"]).intersection(forbidden_tags)) == [] + and type_model in value["meta"] + ): + candidates.append(model) + return self.operations_by_types[choice(candidates)] diff --git a/bamt/utils/composite_utils/lgbm_params.json b/bamt/utils/composite_utils/lgbm_params.json new file mode 100644 index 0000000..5c5fa9d --- /dev/null +++ b/bamt/utils/composite_utils/lgbm_params.json @@ -0,0 +1,11 @@ +{ + "lgbm": { + "meta": "sklearn_class", + "tags": ["boosting", "tree", "non_linear", "mix"] + }, + "lgbmreg": { + "meta": "sklearn_regr", + "presets": ["*tree"], + "tags": ["boosting", "tree", "non_multi", "non_linear", "mix"] + } +} \ No newline at end of file diff --git a/bamt/utils/composite_utils/models_repo.json b/bamt/utils/composite_utils/models_repo.json new file mode 100644 index 0000000..38a4d20 --- /dev/null +++ b/bamt/utils/composite_utils/models_repo.json @@ -0,0 +1,447 @@ +{ + "metadata": { + "custom_class": { + "accepted_node_types": [ + "any" + ], + "description": "Implementations of the custom classification models", + "forbidden_node_types": "[]", + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.classification", + "FedotClassificationStrategy" + ], + "tags": [ + "ml", + "custom" + ], + "tasks": "[TaskTypesEnum.classification]" + }, + "custom_regr": { + "accepted_node_types": [ + "any" + ], + "description": "Implementations of the custom regression models", + "forbidden_node_types": "[]", + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.regression", + "FedotRegressionStrategy" + ], + "tags": [ + "ml", + "custom" + ], + "tasks": "[TaskTypesEnum.regression]" + }, + "sklearn_class": { + "accepted_node_types": [ + "any" + ], + "description": "Implementations of the classification models from scikit-learn framework", + "forbidden_node_types": "[]", + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.classification", + "SkLearnClassificationStrategy" + ], + "tags": [ + "ml", + "sklearn" + ], + "tasks": "[TaskTypesEnum.classification]" + }, + "sklearn_clust": { + "accepted_node_types": [ + "any" + ], + "description": "Implementations of the clustering models from scikit-learn framework", + "forbidden_node_types": "[]", + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.clustering", + "SkLearnClusteringStrategy" + ], + "tags": [ + "ml", + "sklearn" + ], + "tasks": "[TaskTypesEnum.clustering]" + }, + "sklearn_regr": { + "accepted_node_types": [ + "any" + ], + "description": "Implementations of the 
regression models from scikit-learn framework", + "forbidden_node_types": "[]", + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.regression", + "SkLearnRegressionStrategy" + ], + "tags": [ + "ml", + "sklearn", + "composition" + ], + "tasks": "[TaskTypesEnum.regression, TaskTypesEnum.ts_forecasting]" + }, + "ts_model": { + "description": "Implementations of the time series models", + "input_type": "[DataTypesEnum.ts, DataTypesEnum.multi_ts]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.time_series", + "FedotTsForecastingStrategy" + ], + "tags": [ + "time_series" + ], + "tasks": "[TaskTypesEnum.ts_forecasting]" + }, + "custom_model": { + "description": "Implementations of the models specified by user with external code source", + "input_type": "[DataTypesEnum.ts, DataTypesEnum.table, DataTypesEnum.text]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.custom", + "CustomModelStrategy" + ], + "tags": [ + "non-default" + ], + "tasks": "[TaskTypesEnum.regression, TaskTypesEnum.ts_forecasting, TaskTypesEnum.classification, TaskTypesEnum.clustering]" + } + }, + "operations": { + "adareg": { + "meta": "sklearn_regr", + "presets": ["fast_train", "ts", "*tree"], + "tags": [ + "boosting", + "non_multi", + "non_linear", + "cont" + ] + }, + "ar": { + "meta": "ts_model", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", + "interpretable", + "non_lagged", + "linear", + "correct_params" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "arima": { + "meta": "ts_model", + "presets": ["ts"], + "tags": [ + "simple", + "interpretable", + "non_lagged", + "linear", + "new_data_refit" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "clstm": { + "meta": "ts_model", + "tags": [ + "non_lagged", + "non_linear" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "bernb": { + "meta": "sklearn_class", + "presets": ["fast_train"], + "tags": [ + "bayesian", "non_multi", "linear", "disc" + ] + }, + "catboost": { + "meta": "sklearn_class", + "presets": ["*tree"], + "tags": [ + "boosting", "non-default", "non_linear", "mix" + ] + }, + "catboostreg": { + "meta": "sklearn_regr", + "presets": ["*tree"], + "tags": [ + "boosting", "non_multi", "non-default", "non_linear", "mix" + ] + }, + "dt": { + "meta": "sklearn_class", + "presets": ["fast_train", "*tree"], + "tags": [ + "tree", + "interpretable", + "non_linear", + "cont" + ] + }, + "dtreg": { + "meta": "sklearn_regr", + "presets": ["fast_train", "ts", "*tree"], + "tags": [ + "tree", + "interpretable", + "non_linear", + "cont" + ] + }, + "gbr": { + "meta": "sklearn_regr", + "presets": ["*tree"], + "tags": [ + "boosting", + "non_multi", + "non_linear", + "cont" + ] + }, + "kmeans": { + "meta": "sklearn_clust", + "presets": ["fast_train"], + "tags": ["linear"] + }, + "knn": { + "meta": "custom_class", + "presets": ["fast_train"], + "tags": [ + "simple", + "correct_params", + "non_linear", + "cont" + ] + }, + "knnreg": { + "meta": "custom_regr", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", + "correct_params", + "non_linear", + "cont" + ] + }, + "lasso": { + "meta": "sklearn_regr", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", + "linear", + "interpretable", + "cont" + ] + }, + "lda": { + "meta": "custom_class", + "presets": ["fast_train"], + "tags": [ + "discriminant", "linear", "correct_params", "non-default", "cont" + ] + }, + "linear": { + "meta": 
"sklearn_regr", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", "linear", "interpretable", "cont" + ] + }, + "logit": { + "meta": "sklearn_class", + "presets": ["fast_train"], + "tags": [ + "simple", + "linear", + "interpretable", + "non_multi", + "cont" + ] + }, + "mlp": { + "meta": "sklearn_class", + "tags": [ + "neural", + "non_linear", + "cont" + ] + }, + "multinb": { + "meta": "sklearn_class", + "presets": ["fast_train"], + "tags": [ + "non-default", + "bayesian", + "non_multi", + "linear", + "disc" + ] + }, + "qda": { + "meta": "custom_class", + "presets": ["fast_train"], + "tags": [ + "discriminant", + "quadratic", + "non_linear", + "cont" + ] + }, + "rf": { + "meta": "sklearn_class", + "presets": ["fast_train", "*tree"], + "tags": ["tree", "non_linear", "cont"] + }, + "rfr": { + "meta": "sklearn_regr", + "presets": ["fast_train", "*tree"], + "tags": ["tree", "non_linear", "cont"] + }, + "ridge": { + "meta": "sklearn_regr", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", + "linear", + "interpretable", + "cont" + ] + }, + "polyfit": { + "meta": "ts_model", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", + "non_lagged", + "non_linear", + "interpretable", + "correct_params" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "sgdr": { + "meta": "sklearn_regr", + "presets": ["fast_train", "ts"], + "tags": [ + "non_multi", "non_linear", "cont" + ] + }, + "stl_arima": { + "meta": "ts_model", + "presets": ["ts"], + "tags": [ + "simple", + "interpretable", + "non_lagged", + "linear", + "new_data_refit" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "glm": { + "meta": "ts_model", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", + "interpretable", + "non_lagged", + "correct_params", + "non_linear" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "ets": { + "meta": "ts_model", + "presets": ["fast_train", "ts"], + "tags": [ + "simple", + "interpretable", + "non_lagged", + "non_linear" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "locf": { + "meta": "ts_model", + "presets": ["fast_train", "ts"], + "tags": [ + "non_linear", + "simple", + "interpretable", + "non_lagged" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "ts_naive_average": { + "meta": "ts_model", + "presets": ["fast_train", "ts"], + "tags": [ + "non_linear", + "simple", + "interpretable", + "non_lagged" + ], + "input_type": "[DataTypesEnum.ts]" + }, + "svc": { + "meta": "custom_class", + "tags": [ + "no_prob", + "expensive", + "non_linear" + ] + }, + "treg": { + "meta": "sklearn_regr", + "presets": ["*tree"], + "tags": [ + "tree", + "non_linear", + "cont" + ] + }, + "xgboost": { + "meta": "sklearn_class", + "presets": ["*tree"], + "tags": [ + "boosting", "tree", "non-default", "non_linear", "mix" + ] + }, + "xgbreg": { + "meta": "sklearn_regr", + "presets": ["*tree"], + "tags": [ + "boosting", "tree", "non_multi", "non-default", "non_linear", "cont" + ] + }, + "cnn": { + "meta": "custom_class", + "tags": [ + "deep", "non-default", "non_linear" + ], + "input_type": "[DataTypesEnum.image]", + "output_type": "[DataTypesEnum.table]" + }, + "custom": { + "meta": "custom_model", + "tags": [ + "custom_model", + "non-default" + ] + } + } +} diff --git a/bamt/utils/serialization_utils.py b/bamt/utils/serialization_utils.py new file mode 100644 index 0000000..1e4ff74 --- /dev/null +++ b/bamt/utils/serialization_utils.py @@ -0,0 +1,150 @@ +import os +import pickle +from typing import Union, Tuple + +import joblib + +import bamt.utils.check_utils as check_utils +from bamt.log import logger_nodes + + 
+class ModelsSerializer: + def __init__(self, bn_name, models_dir): + self.bn_name = bn_name + self.models_dir = models_dir + self.serialization = None + + if os.path.isdir(models_dir): + if bn_name in os.listdir(models_dir): + raise AssertionError(f"Name must be unique. | {os.listdir(models_dir)}") + + @staticmethod + def choose_serialization(model) -> Tuple[str, Union[Exception, int]]: + try: + ex_b = pickle.dumps(model, protocol=4) + model_ser = ex_b.decode("latin1").replace("'", '"') + + if type(model).__name__ == "CatBoostRegressor": + a = model_ser.encode("latin1") + else: + a = model_ser.replace('"', "'").encode("latin1") + + classifier_body = pickle.loads(a) + return "pickle", 1 + except Exception: + return "joblib", -1 + + def get_path_joblib(self, models_dir, node_name: str) -> str: + """ + Args: + node_name: name of node + specific: more specific unique name for node. + For example, combination. + + Return: + Path to node. + """ + path = os.path.join(models_dir, self.bn_name, f"{node_name.replace(' ', '_')}") + + if not os.path.isdir(path): + os.makedirs( + os.path.join(models_dir, self.bn_name, f"{node_name.replace(' ', '_')}") + ) + return path + + def serialize_instance(self, instance: dict, model_type, node_name, specific=False): + """Every distribution contains a dict with params with models""" + model_ser = None + + model = instance[f"{model_type}_obj"] + if not check_utils.is_model(model): + return instance + + serialization, status = self.choose_serialization(model) + if status == -1: + logger_nodes.warning( + f"{node_name}:{'' if not specific else specific}::Pickle failed. BAMT will use Joblib." + ) + if serialization == "pickle": + ex_b = pickle.dumps(model, protocol=4) + model_ser = ex_b.decode("latin1") + elif serialization == "joblib": + path = self.get_path_joblib( + models_dir=self.models_dir, node_name=node_name.replace(" ", "_") + ) + if not specific: + destination = f"{node_name.replace(' ', '_')}.joblib.compressed" + else: + destination = f"{specific}.joblib.compressed" + path = os.path.abspath(os.path.join(path, destination)) + joblib.dump(model, path, compress=True, protocol=4) + else: + logger_nodes.error("Serialization detection failed.") + return + instance["serialization"] = serialization + instance[f"{model_type}_obj"] = model_ser or path + return instance + + def serialize(self, distributions): + result = {} + for node_name, [node_type, dist] in distributions.items(): + result[node_name] = {} + model_type = "regressor" if "Gaussian" in node_type else "classifier" + if "Conditional" in node_type: + result[node_name]["hybcprob"] = {} + for combination, dist_nested in dist["hybcprob"].items(): + instance_serialized = self.serialize_instance( + instance=dist_nested, + model_type=model_type, + node_name=node_name, + specific=combination, + ) + result[node_name]["hybcprob"][combination] = instance_serialized + else: + instance_serialized = self.serialize_instance( + instance=dist, model_type=model_type, node_name=node_name + ) + result[node_name] = instance_serialized + return result + + +class Deserializer: + def __init__(self, models_dir): + self.models_dir = models_dir + + @staticmethod + def deserialize_instance(instance: dict, model_type): + model_repr = instance[f"{model_type}_obj"] + if model_repr is None: + return instance + + serialization = instance["serialization"] + + if serialization == "pickle": + bytes_model = model_repr.encode("latin1") + model = pickle.loads(bytes_model) + else: + model = joblib.load(model_repr) + + 
instance[f"{model_type}_obj"] = model + return instance + + def apply(self, distributions): + result = {} + for node_name, [node_type, dist] in distributions.items(): + model_type = "regressor" if "Gaussian" in node_type else "classifier" + result[node_name] = {} + if "Conditional" in node_type: + result[node_name]["hybcprob"] = {} + for combination, dist_nested in dist["hybcprob"].items(): + instance_deserialized = self.deserialize_instance( + instance=dist_nested, + model_type=model_type, + ) + result[node_name]["hybcprob"][combination] = instance_deserialized + else: + instance_deserialized = self.deserialize_instance( + instance=dist, model_type=model_type + ) + result[node_name] = instance_deserialized + return result diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index 0363f66..0000000 --- a/mkdocs.yml +++ /dev/null @@ -1,77 +0,0 @@ -site_name: BAMT -repo_name: aimclub/BAMT -repo_url: https://github.com/aimclub/BAMT -theme: - name: material - locale: en - features: - - announce.dismiss - - content.action.edit - - content.action.view - - content.code.annotate - - content.code.copy - # - content.code.select - # - content.footnote.tooltips - # - content.tabs.link - - content.tooltips - # - header.autohide - # - navigation.expand - - navigation.footer - - navigation.indexes - - navigation.instant - # - navigation.instant.prefetch - - navigation.instant.progress - # - navigation.prune - - navigation.sections - - navigation.tabs - # - navigation.tabs.sticky - - navigation.top - - navigation.tracking - - search.highlight - - search.share - - search.suggest - - toc.follow - # - toc.integrate - palette: - - media: "(prefers-color-scheme)" - toggle: - icon: material/link - name: Switch to light mode - - media: "(prefers-color-scheme: light)" - scheme: default - primary: indigo - accent: indigo - toggle: - icon: material/toggle-switch - name: Switch to dark mode - - media: "(prefers-color-scheme: dark)" - scheme: slate - primary: black - accent: indigo - toggle: - icon: material/toggle-switch-off - name: Switch to system preference - font: - text: Roboto - code: Roboto Mono - -nav: - - Home: index.md - - Getting Started: getting-started.md - - Installation: installation.md - - API Reference: - - logger: reference/logger.md -#extra: -# social: -# - icon: fontawesome/brands/github -# link: https://github.com/myusername/myproject -plugins: - - search - - mkdocstrings -markdown_extensions: - - admonition - - codehilite - - footnotes - - meta - - toc: - permalink: True diff --git a/pyproject.toml b/pyproject.toml index 8000b61..fab78d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "BAMT" -version = "2.0.0" +version = "1.2.01" description = "data modeling and analysis tool based on Bayesian networks" authors = ["Roman Netrogolov ", "Irina Deeva ", @@ -22,19 +22,20 @@ packages = [ ] [tool.poetry.dependencies] -python = "^3.10" -numpy = "^1.26.0" -matplotlib = "^3.8.0" -pandas = "^2.2.0" +python = ">=3.9,<3.11" +setuptools = "65.6.3" +numpy = ">=1.24.2" +matplotlib = "3.6.2" +pandas = "2.0.3" gmr = "1.6.2" -scikit-learn = "^1.4.2" -scipy = "^1.13.0" -pyvis = "^0.3.1" +scikit-learn = "^1.2.0" +scipy = "^1.8.0" +pyvis = ">=0.2.1" missingno = "^0.5.1" -pgmpy = "^0.1.20" -thegolem = "^0.3.3" +pgmpy = "0.1.20" +thegolem = ">=0.3.3" xgboost = ">=1.7.6" -catboost = ">=2.0.0" +catboost = ">=1.0.6" lightgbm = {version = ">=3.3.5", optional = true } [tool.poetry.extras] diff --git a/requirements.txt b/requirements.txt index 374a8a4..dad16dc 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -1,21 +1,15 @@ -# Data Manipulation -numpy>=1.26.0 -pandas>=2.2.0 - -# Graph manipulation -networkx>=3.3 -thegolem>=0.4.0 - -# ML modeling frameworks -scikit-learn>=1.4.2 -scipy>=1.13.0 -catboost>=1.2.1 -xgboost>=2.0.0 - -# visualization -matplotlib>=3.8.0 -pyvis>=0.3.1 - -# TODO: exclude these libraries +numpy==1.26.4 +matplotlib==3.6.2 +pandas>=2.0.3 gmr==1.6.2 -pgmpy==0.1.20 \ No newline at end of file +scikit-learn>=1.2.0 +scipy>=1.9.3 +pyvis>=0.2.1 +pgmpy==0.1.25 +catboost>=1.0.6 +joblib>=1.1.1 +networkx>=3.1 +tqdm>=4.65.0 +thegolem>=0.3.3 +typing>=3.7.4.3 +xgboost>=1.7.6 diff --git a/tests/BigbraveBNTest.py b/tests/BigbraveBNTest.py new file mode 100644 index 0000000..d21d215 --- /dev/null +++ b/tests/BigbraveBNTest.py @@ -0,0 +1,59 @@ +import pandas as pd +from pgmpy.estimators import K2Score +from sklearn import preprocessing + +import bamt.preprocessors as pp +from bamt.networks.big_brave_bn import BigBraveBN +from bamt.networks.continuous_bn import ContinuousBN +from bamt.networks.discrete_bn import DiscreteBN + +data_discrete = pd.read_csv(r"../Data/benchmark/pigs.csv") +data_continuous = pd.read_csv(r"../Data/benchmark/arth150.csv") + +encoder = preprocessing.LabelEncoder() +discretizer = preprocessing.KBinsDiscretizer( + n_bins=5, encode="ordinal", strategy="uniform" +) + +p = pp.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) +discretized_data, est = p.apply(data_discrete) + +info = p.info + +space_restrictor = BigBraveBN() + +space_restrictor.set_possible_edges_by_brave(df=data_discrete) + +ps = space_restrictor.possible_edges + +bn_discrete = DiscreteBN() + +bn_discrete.add_nodes(descriptor=info) + +params = {"white_list": ps} +bn_discrete.add_edges(discretized_data, scoring_function=("K2", K2Score), params=params) + +encoder = preprocessing.LabelEncoder() +discretizer = preprocessing.KBinsDiscretizer( + n_bins=5, encode="ordinal", strategy="uniform" +) + +p = pp.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) +discretized_data, est = p.apply(data_continuous) + +info = p.info + +space_restrictor = BigBraveBN() + +space_restrictor.set_possible_edges_by_brave(df=data_continuous) + +ps = space_restrictor.possible_edges + +bn_continuous = ContinuousBN() + +bn_continuous.add_nodes(descriptor=info) + +params = {"white_list": ps} +bn_continuous.add_edges( + discretized_data, scoring_function=("K2", K2Score), params=params +) diff --git a/tests/LoadBN.py b/tests/LoadBN.py new file mode 100644 index 0000000..6335bb2 --- /dev/null +++ b/tests/LoadBN.py @@ -0,0 +1,54 @@ +import json + +import pandas as pd +from sklearn import preprocessing as pp + +import bamt.networks as Networks +from bamt.preprocessors import Preprocessor + +hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[ + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +] + +encoder = pp.LabelEncoder() +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform") + +p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + +discretized_data, est = p.apply(hack_data) + +bn = Networks.HybridBN(use_mixture=True, has_logit=True) +info = p.info + +bn.add_nodes(info) + +structure = [ + ("Tectonic regime", "Structural setting"), + ("Gross", "Netpay"), + ("Lithology", "Permeability"), +] + +bn.set_structure(edges=structure) + +bn.get_info(as_df=False) + +with open("hack_p.json") as params: + params = 
json.load(params) + bn.set_parameters(params) + +bn.plot("gg3.html") + +bn2 = Networks.HybridBN(use_mixture=True, has_logit=True) +bn2.load("hack.json") + +print(bn2.sample(50)) diff --git a/tests/MainTest.py b/tests/MainTest.py new file mode 100644 index 0000000..4796fe6 --- /dev/null +++ b/tests/MainTest.py @@ -0,0 +1,137 @@ +import pandas as pd +from pgmpy.estimators import K2Score +from sklearn import preprocessing as pp + +import bamt.networks as Networks +from bamt.preprocessors import Preprocessor + +""" +Optional: +You can also uncomment print() that you need. +""" + +hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv") +cont_data = hack_data[["Gross", "Netpay", "Porosity", "Permeability", "Depth"]].dropna() +disc_data = hack_data[ + ["Tectonic regime", "Period", "Lithology", "Structural setting"] +].dropna() +hybrid_data = hack_data[ + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +].dropna() + +cont_test_data = cont_data[cont_data.columns[:-1]] +cont_target = cont_data[cont_data.columns[-1]] +disc_test_data = disc_data[disc_data.columns[:-1]] +disc_target = disc_data[disc_data.columns[-1]] +hybrid_test_data = hybrid_data[hybrid_data.columns[:-1]] +hybrid_target = hybrid_data[hybrid_data.columns[-1]] + +encoder = pp.LabelEncoder() +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform") +p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + +# Discrete pipeline +discretized_data, _ = p.apply(disc_data) +disc_bn = Networks.DiscreteBN() +info = p.info +disc_bn.add_nodes(info) +disc_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) +disc_bn.fit_parameters(data=disc_data) +disc_bn.calculate_weights(discretized_data) +disc_predicted_values = disc_bn.predict(test=disc_test_data) +disc_predicted_values = pd.DataFrame.from_dict(disc_predicted_values, orient="columns") +synth_disc_data = disc_bn.sample(50) + +disc_bn.save("./disc_bn.json") +disc_bn2 = Networks.DiscreteBN() +disc_bn2.load("./disc_bn.json") +synth_disc_data2 = disc_bn2.sample(50) +# print(disc_bn.weights) +# print(disc_bn2.weights) +# print(disc_bn.get_info()) +# print(disc_bn2.get_info()) +# print(synth_disc_data) +# print(synth_disc_data2) + +# Continuous pipeline +discretized_data, _ = p.apply(cont_data) +cont_bn = Networks.ContinuousBN(use_mixture=True) +info = p.info +cont_bn.add_nodes(info) +cont_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) +cont_bn.fit_parameters(data=cont_data) +cont_bn.calculate_weights(discretized_data) +cont_predicted_values = cont_bn.predict(test=cont_test_data) +cont_predicted_values = pd.DataFrame.from_dict(cont_predicted_values, orient="columns") +synth_cont_data = cont_bn.sample(50) + +cont_bn.save("./cont_bn.json") +cont_bn2 = Networks.ContinuousBN(use_mixture=True) +cont_bn2.load("./cont_bn.json") +synth_cont_data2 = cont_bn2.sample(50) +# print(cont_bn.weights) +# print(cont_bn2.weights) +# print('RMSE on predicted values with continuous data: ' + +# f'{mse(cont_target, cont_predicted_values, squared=False)}') +# print(cont_bn.get_info()) +# print(cont_bn2.get_info()) +# print(synth_cont_data) +# print(synth_cont_data2) + +# Hybrid pipeline +discretized_data, _ = p.apply(hybrid_data) +hybrid_bn = Networks.HybridBN(use_mixture=True) +hybrid_bn2 = Networks.HybridBN(use_mixture=True, has_logit=True) +info = p.info +hybrid_bn.add_nodes(info) +hybrid_bn2.add_nodes(info) 
+hybrid_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) +hybrid_bn2.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) +hybrid_bn.fit_parameters(data=hybrid_data) +hybrid_bn2.fit_parameters(data=hybrid_data) +hybrid_bn.calculate_weights(discretized_data) +hybrid_bn2.calculate_weights(discretized_data) +hybrid_predicted_values = hybrid_bn.predict(test=hybrid_test_data) +hybrid_predicted_values = pd.DataFrame.from_dict( + hybrid_predicted_values, orient="columns" +) +synth_hybrid_data = hybrid_bn.sample(50) +synth_hybrid_data2 = hybrid_bn2.sample(50) + +hybrid_bn.save("./hybrid_bn.json") +hybrid_bn3 = Networks.HybridBN(use_mixture=True) +hybrid_bn3.load("./hybrid_bn.json") +synth_hybrid_data3 = hybrid_bn3.sample(50) +# print(hybrid_bn.weights) +# print(hybrid_bn2.weights) +# print(hybrid_bn3.weights) +# print('RMSE on predicted values with hybrid data: ' + +# f'{mse(hybrid_target, hybrid_predicted_values, squared=False)}') +# print(hybrid_bn.get_info()) +# print(hybrid_bn2.get_info()) +# print(hybrid_bn3.get_info()) +# print(synth_hybrid_data) +# print(synth_hybrid_data2) +# print(synth_hybrid_data3) + +# Save and load BN without weights +discretized_data, _ = p.apply(hybrid_data) +hybrid_bn = Networks.HybridBN(use_mixture=True) +info = p.info +hybrid_bn.add_nodes(info) +hybrid_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) +hybrid_bn.fit_parameters(data=hybrid_data) +hybrid_bn.save("./hybrid_bn_without_weights.json") +hybrid_bn2 = Networks.HybridBN(use_mixture=True) +hybrid_bn2.load("./hybrid_bn_without_weights.json") +# print(hybrid_bn2.weights) diff --git a/tests/MetricsTest.py b/tests/MetricsTest.py new file mode 100644 index 0000000..5f04deb --- /dev/null +++ b/tests/MetricsTest.py @@ -0,0 +1,61 @@ +import time + +import pandas as pd +from sklearn import preprocessing as pp + +import bamt.networks as Networks +from bamt.preprocessors import Preprocessor + +start = time.time() + + +p1 = time.time() +print(f"Time elapsed for importing: {p1 - start}") + +h = pd.read_csv("data/real data/hack_processed_with_rf.csv") +cols = [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", +] +h = h[cols] + +print(h.describe()) +print("-----") +p2 = time.time() +print(f"Time elapsed for preparing data: {p2 - p1}") + +encoder = pp.LabelEncoder() +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile") + +p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + +# ----------- +discrete_data, est = p.apply(h) +info = p.info + +bn = Networks.HybridBN(has_logit=True) # all may vary +bn.add_nodes(descriptor=info) +bn.add_edges(data=discrete_data, optimizer="HC", scoring_function=("MI",)) + +bn.get_info(as_df=False) +t1 = time.time() +bn.fit_parameters(data=h) +t2 = time.time() +print(f"PL elapsed: {t2 - t1}") + +columns = ["Lithology", "Structural setting", "Porosity", "Depth"] +validY = h[columns].dropna() +validX = h.drop(columns, axis=1).dropna() + +time_1 = time.time() +pred_param = bn.predict(validX, parall_count=3) +time_2 = time.time() +print(pred_param) +print(f"Predict elapsed: {time_2 - time_1}") diff --git a/tests/NetworksTest.py b/tests/NetworksTest.py new file mode 100644 index 0000000..607bdf9 --- /dev/null +++ b/tests/NetworksTest.py @@ -0,0 +1,722 @@ +import itertools +import json +import time + +import numpy as np +import pandas as pd +from pandas.testing import assert_frame_equal +from sklearn import 
preprocessing as pp +from sklearn.ensemble import RandomForestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.tree import DecisionTreeClassifier + +import bamt.networks as Networks +import bamt.nodes as Nodes +from bamt.preprocessors import Preprocessor + + +# import abc + + +class NetworkTest(object): + def __init__( + self, + directory: str, + verbose: bool = False, + case_id: int = 0, + sample_n: int = 500, + sample_tol: float = 0.6, + ): + """ + sample_n: number of rows in sample + sample_tol: precent of acceptable number of nans. + If number of nan more than sample_int * sample_tol, then sample test failed. + """ + self.bn = Networks.BaseNetwork() + self.sf = "" + self.type = "abstract" + self.case_id = case_id + self.sample_n = sample_n + self.sample_tol = sample_tol + self.verboseprint = print if verbose else lambda *a: None + self.directory = directory + + @staticmethod + def _tabularize_output(message1: str, message2: str): + # message1 - usually class of output (error, success) + # message2 - usually description of message 1 + return f"{message1: <52} | {message2: >3}" + + def test_preprocess(self): + failed = False + + if self.case_id == 0: + self.discrete_cols = [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + ] + self.cont_cols = ["Gross", "Netpay", "Porosity", "Permeability", "Depth"] + self.hybrid_cols = [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] + # Base of standards + self.base = "hack_" + self.type + else: + self.discrete_cols = [] + self.cont_cols = [] + self.hybrid_cols = [] + + if self.type == "discrete": + data = pd.read_csv(self.directory)[self.discrete_cols] + elif self.type == "continuous": + data = pd.read_csv(self.directory)[self.cont_cols] + else: + data = pd.read_csv(self.directory)[self.hybrid_cols] + + encoder = pp.LabelEncoder() + discretizer = pp.KBinsDiscretizer( + n_bins=5, encode="ordinal", strategy="uniform" + ) + + p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + + discretized_data, est = p.apply(data) + info = p.info + + try: + assert info == json.load(open(f"{self.base}/hack_info.json")) + except AssertionError: + failed = True + self.verboseprint(self._tabularize_output("ERROR", "Bad descriptor")) + + try: + assert_frame_equal( + discretized_data, + pd.read_csv(f"{self.base}/hack_data.csv", index_col=0), + check_dtype=False, + ) + except Exception as ex: + failed = True + self.verboseprint(self._tabularize_output("ERROR", str(ex))) + + if not failed: + status = "OK" + else: + status = "Failed" + print(self._tabularize_output("Preprocess", status)) + + self.info = info + self.data = discretized_data + + def test_structure_learning(self): + pass + + def test_parameters_learning(self): + pass + + def test_sampling(self): + failed = False + + sample = self.bn.sample(n=self.sample_n, progress_bar=False) + + if sample.empty: + self.verboseprint("Sampling", "Dataframe is empty") + return + + if sample.isna().sum().sum() > (self.sample_n * self.sample_tol): + failed = True + + if not failed: + status = "OK" + else: + status = "Failed" + + print(self._tabularize_output(f"Sampling ({self.sf})", status)) + + def test_predict(self): + failed = False + + if self.type == "discrete": + cols = self.discrete_cols + elif self.type == "continuous": + cols = self.cont_cols + elif self.type == "hybrid": + cols = self.hybrid_cols + else: + raise Exception("Inner error") + + preds = 
self.bn.predict( + test=pd.read_csv(self.directory)[cols[:2]].dropna(), + progress_bar=False, + parall_count=2, + ) + + # with open(f"{self.base}/hack_predict.json", "r") as f: + # p = json.load(f) + + if self.type == "continuous": + # cols: ['Porosity', 'Permeability', 'Depth'] + for node in preds.keys(): + right_val = json.load(open(f"{self.base}/hack_predict.json"))[self.sf][ + node + ] + test_val = np.mean([mx for mx in preds[node] if not np.isnan(mx)]) + assert np.all( + np.isclose(test_val, right_val, rtol=0.4) + ), f"Predict failed: {node, right_val, test_val}" + elif self.type == "discrete": + # cols: ['Lithology', 'Structural setting'] + for node in preds.keys(): + test_vals = pd.Series(preds[node]).value_counts().to_dict() + for category, right_val in json.load( + open(f"{self.base}/hack_predict.json") + )[self.sf][node].items(): + try: + assert np.all( + np.isclose(test_vals[category], right_val, atol=5) + ), f"Predict failed: {node, test_vals[category], right_val}" + except KeyError as ex: + print("Unknown preds category: ", ex.args[0]) + continue + elif self.type == "hybrid": + cont_nodes = [ + node + for node in self.bn.nodes_names + if self.info["types"][node] == "cont" + ] + for node in preds.keys(): + if node in cont_nodes: + right_val = json.load(open(f"{self.base}/hack_predict.json"))[ + self.sf + ][node] + test_val = np.mean([mx for mx in preds[node] if not np.isnan(mx)]) + # p[self.sf][node] = test_val + s = [right_val, test_val] + assert np.all( + np.isclose(min(s), max(s), atol=5, rtol=0.6) + ), f"Predict failed: {node, test_val, right_val}" + else: + test_vals = pd.Series(preds[node]).value_counts().to_dict() + # p[self.sf][node] = test_vals + for category, right_val in json.load( + open(f"{self.base}/hack_predict.json") + )[self.sf][node].items(): + try: + assert np.all( + np.isclose( + min(test_vals[category], right_val), + max(right_val, test_vals[category]), + atol=100, + rtol=0.5, + ) + ), f"Predict failed: {node, test_vals[category], right_val}" + except KeyError as ex: + print("Unknown preds category: ", ex.args[0]) + continue + + if not failed: + status = "OK" + else: + status = "Failed" + + print(self._tabularize_output(f"Predict ({self.sf})", status)) + + # with open(f"{self.base}/hack_predict.json", "w") as f: + # json.dump(p, f) + + def apply(self): + pass + + @staticmethod + def use_rules(*args, **kwargs): + for rule in args: + rule(**kwargs) + + +class TestDiscreteBN(NetworkTest): + def __init__(self, **kwargs): + super(TestDiscreteBN, self).__init__(**kwargs) + self.type = "discrete" + + def test_structure_learning(self): + failed = False + + bn = Networks.DiscreteBN() + bn.add_nodes(descriptor=self.info) + + try: + assert bn.nodes_names == [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + ] + except AssertionError: + failed = True + self.verboseprint( + self._tabularize_output( + "ERROR", "first stage failed (wrong init nodes)." 
+ ) + ) + + bn.add_edges(self.data, (self.sf,), progress_bar=False) + + try: + assert bn.edges == json.load(open(f"{self.base}/hack_edges.json"))[self.sf] + except AssertionError: + failed = True + self.verboseprint(f"Stage 2 failed with {self.sf}.") + + if not failed: + self.bn = bn + status = "OK" + else: + self.bn = None + status = "Failed" + + print(self._tabularize_output(f"Structure ({self.sf})", status)) + + def test_parameters_learning(self): + failed = False + + self.bn.fit_parameters(pd.read_csv(self.directory)[self.discrete_cols]) + + try: + assert ( + self.bn.distributions + == json.load(open(f"{self.base}/hack_params.json"))[self.sf] + ) + except AssertionError: + failed = True + self.verboseprint( + self._tabularize_output(f"Parameters ({self.sf})", "bad distributions") + ) + + if not failed: + status = "OK" + else: + status = "Failed" + + print(self._tabularize_output(f"Parameters ({self.sf})", status)) + + def apply(self): + print(f"Executing {self.type} BN tests.") + self.test_preprocess() + t0 = time.time() + for sf in ["MI", "K2", "BIC"]: + self.sf = sf + t1 = time.time() + self.test_structure_learning() + if not self.bn: + print(self._tabularize_output(f"Error on {sf}", "No structure")) + print("-" * 8) + continue + self.test_parameters_learning() + self.test_sampling() + self.test_predict() + t2 = time.time() + print("-" * 8, f"Elapsed time: {t2 - t1}", "-" * 8, "\n") + t3 = time.time() + print(f"\nElapsed time: {t3 - t0}") + + +class TestContinuousBN(NetworkTest): + def __init__(self, **kwargs): + super(TestContinuousBN, self).__init__(**kwargs) + self.type = "continuous" + + def test_setters(self): + failed = False + + bn = Networks.ContinuousBN() + ns = [] + for d in [Nodes.GaussianNode(name="Node" + str(id)) for id in range(0, 4)]: + ns.append(d) + + bn.set_structure(nodes=ns) + bn.set_classifiers( + classifiers={ + "Node0": DecisionTreeClassifier(), + "Node1": RandomForestClassifier(), + "Node2": KNeighborsClassifier(n_neighbors=2), + } + ) + + assert [str(bn[node].classifier) for node in ["Node0", "Node1", "Node2"]] == [ + "DecisionTreeClassifier()", + "RandomForestClassifier()", + "KNeighborsClassifier(n_neighbors=2)", + ], "Setter | Classifiers are wrong." + + if not failed: + status = "OK" + else: + status = "Failed" + + print(self._tabularize_output("Setters", status)) + + def test_structure_learning(self, use_mixture: bool = False): + self.use_mixture = use_mixture + failed = False + + bn = Networks.ContinuousBN(use_mixture=use_mixture) + bn.add_nodes(descriptor=self.info) + + try: + assert ( + bn.nodes_names + == json.load(open(f"{self.base}/hack_nodes.json"))[ + f"use_mixture={use_mixture}" + ][self.sf] + ) + except AssertionError: + failed = True + self.verboseprint( + self._tabularize_output( + "ERROR", "first stage failed (wrong init nodes)." 
+ ) + ) + + bn.add_edges(self.data, (self.sf,), progress_bar=False) + + try: + assert ( + bn.edges + == json.load(open(f"{self.base}/hack_edges.json"))[ + f"use_mixture={use_mixture}" + ][self.sf] + ) + except AssertionError: + failed = True + self.verboseprint(f"Stage 2 failed with {self.sf}.") + + if not failed: + self.bn = bn + status = "OK" + else: + status = "Failed" + + print( + self._tabularize_output( + f"Structure ({self.sf}, use_mixture={self.use_mixture})", status + ) + ) + + def test_parameters_learning(self): + failed = False + + self.bn.fit_parameters(pd.read_csv(self.directory)[self.cont_cols]) + try: + if self.use_mixture: + empty_data = {"mean": [], "covars": [], "coef": []} + for k, v in self.bn.distributions.items(): + assert all( + [v[obj] != empty for obj, empty in empty_data.items()] + ), f"Empty data in {k}." + assert ( + 0.9 <= sum(v["coef"]) <= 1.1 + ), f"{sum(v['coef'])} || {k}'s: coefs are wrong." + else: + assert ( + self.bn.distributions + == json.load(open(f"{self.base}/hack_params.json"))[ + "use_mixture=False" + ][self.sf] + ), "Bad distributions." + except AssertionError as ex: + failed = True + self.verboseprint( + self._tabularize_output( + f"Parameters ({self.sf}, use_mixture={self.use_mixture})", + ex.args[0], + ) + ) + + if not failed: + status = "OK" + else: + status = "Failed" + + print( + self._tabularize_output( + f"Parameters ({self.sf}, use_mixture={self.use_mixture})", status + ) + ) + + def apply(self): + print(f"Executing {self.type} BN tests.") + self.test_preprocess() + # Auskommentieren mich + # self._test_setters() + t0 = time.time() + for use_mixture in [True, False]: + for sf in ["MI", "K2", "BIC"]: + self.sf = sf + t1 = time.time() + self.test_structure_learning(use_mixture=use_mixture) + if not self.bn: + print(self._tabularize_output(f"Error on {sf}", "No structure")) + print("-" * 8) + continue + self.test_parameters_learning() + self.test_sampling() + self.test_predict() + t2 = time.time() + print("-" * 8, f"Elapsed time: {t2 - t1}", "-" * 8) + t3 = time.time() + print(f"\nElapsed time: {t3 - t0}") + + +class TestHybridBN(NetworkTest): + def __init__(self, **kwargs): + super(TestHybridBN, self).__init__(**kwargs) + self.type = "hybrid" + + def test_setters(self): + failed = False + + bn = Networks.HybridBN(has_logit=True) + ns = [] + for d, g in zip( + [Nodes.GaussianNode(name="Node" + str(id)) for id in range(0, 3)], + [Nodes.DiscreteNode(name="Node" + str(id)) for id in range(3, 6)], + ): + ns.append(d) + ns.append(g) + edges = [ + ("Node0", "Node3"), + ("Node3", "Node1"), + ("Node1", "Node4"), + ("Node4", "Node2"), + ("Node2", "Node5"), + ] + test_info = { + "types": { + "Node0": "cont", + "Node1": "cont", + "Node2": "cont", + "Node3": "disc", + "Node4": "disc", + "Node5": "disc", + }, + "signs": {"Node0": "pos", "Node1": "pos", "Node2": "pos"}, + } + + # Structure setter + bn.set_structure(info=test_info, nodes=ns, edges=edges) + + assert [ + "Gaussian (LinearRegression)", + "Logit (LogisticRegression)", + "ConditionalGaussian (LinearRegression)", + "Logit (LogisticRegression)", + "ConditionalGaussian (LinearRegression)", + "Logit (LogisticRegression)", + ] == [node.type for node in bn.nodes], "Setter | Nodes are not the same." + assert edges == bn.edges, "Setter | Edges are not the same." 
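The expected node types asserted above follow from has_logit=True in this alternating chain: a discrete node whose parent is continuous is modelled as a Logit node, while a continuous node whose parent is discrete becomes a ConditionalGaussian node.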
+ + # Classifiers setters + + bn.set_classifiers( + classifiers={ + "Node3": DecisionTreeClassifier(), + "Node4": RandomForestClassifier(), + "Node5": KNeighborsClassifier(n_neighbors=2), + } + ) + + assert [str(bn[node].classifier) for node in ["Node3", "Node4", "Node5"]] == [ + "DecisionTreeClassifier()", + "RandomForestClassifier()", + "KNeighborsClassifier(n_neighbors=2)", + ], "Setter | Classifiers are wrong." + + # Parameters setters + + with open("test_params.json") as f: + p = json.load(f) + + bn.set_parameters(p) + assert bn.distributions == p, "Setter | Parameters are not the same." + + if not failed: + status = "OK" + else: + status = "Failed" + + print(self._tabularize_output("Setters", status)) + + def test_structure_learning( + self, use_mixture: bool = False, has_logit: bool = False + ): + self.use_mixture = use_mixture + self.has_logit = has_logit + failed = False + + bn = Networks.HybridBN(use_mixture=use_mixture, has_logit=has_logit) + bn.add_nodes(descriptor=self.info) + + try: + assert ( + bn.nodes_names + == json.load(open(f"{self.base}/hack_nodes.json"))[ + f"use_mixture={use_mixture}" + ][f"has_logit={has_logit}"][self.sf] + ) + except AssertionError: + failed = True + self.verboseprint( + self._tabularize_output( + "ERROR", "first stage failed (wrong init nodes)." + ) + ) + + bn.add_edges(self.data, (self.sf,), progress_bar=False) + + try: + assert ( + bn.edges + == json.load(open(f"{self.base}/hack_edges.json"))[ + f"use_mixture={use_mixture}" + ][f"has_logit={has_logit}"][self.sf] + ) + except AssertionError: + failed = True + self.verboseprint(f"Stage 2 failed with {self.sf}.") + + if not failed: + self.bn = bn + status = "OK" + else: + status = "Failed" + + print( + self._tabularize_output( + f"Structure ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", + status, + ) + ) + + @staticmethod + def non_empty_gaussian_nodes(name, node_params): + empty_data = {"mean": [], "covars": [], "coef": []} + assert all( + [node_params[obj] != empty for obj, empty in empty_data.items()] + ), f"Empty data in {name}." + + @staticmethod + def non_empty_logit_nodes(name, node_params): + empty_data = {"classes": [], "classifier_obj": None} + assert all( + [node_params[obj] != empty for obj, empty in empty_data.items()] + ), f"Empty data in {name}." + + @staticmethod + def sum_equals_to_1(name, node_params): + assert 0.9 <= sum(node_params["coef"]) <= 1.1, f"{name}'s: coefs are wrong." 
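The rule functions above encode two properties of a fitted (conditional) mixture node: none of the mean, covars or coef entries may be empty, and the mixture weights in coef should sum to roughly 1. A hypothetical parameter dict that would satisfy both checks:

    node_params = {
        "mean": [[0.1], [2.3]],        # one mean vector per mixture component
        "covars": [[[1.0]], [[0.5]]],  # one covariance matrix per component
        "coef": [0.4, 0.6],            # mixture weights, summing to 1
    }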
+ + def _validate_node(self, name, type, node_params, true_vals): + try: + if type == "MixtureGaussian": + self.use_rules( + self.non_empty_gaussian_nodes, + self.sum_equals_to_1, + name=name, + node_params=node_params, + ) + elif type == "ConditionalMixtureGaussian": + for comb, data in node_params["hybcprob"].items(): + self.use_rules( + self.non_empty_gaussian_nodes, + self.sum_equals_to_1, + name=name, + node_params=data, + ) + elif type.startswith("Logit"): + self.use_rules( + self.non_empty_logit_nodes, name=name, node_params=node_params + ) + elif type.startswith("ConditionalLogit"): + for comb, data in node_params["hybcprob"].items(): + self.use_rules( + self.non_empty_logit_nodes, name=name, node_params=data + ) + else: + assert node_params == true_vals, f"Parameters error on {name}, {type}" + except AssertionError as ex: + self.verboseprint( + self._tabularize_output( + f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", + ex.args[0], + ) + ) + + def test_parameters_learning(self): + failed = False + + self.bn.fit_parameters(pd.read_csv(self.directory)[self.hybrid_cols]) + try: + true_params = json.load(open(f"{self.base}/hack_params.json"))[ + f"use_mixture={self.use_mixture}" + ][f"has_logit={self.has_logit}"][self.sf] + + node_type_dict = {node.name: node.type for node in self.bn.nodes} + for name, type in node_type_dict.items(): + node_params = self.bn.distributions[name] + self._validate_node(name, type, node_params, true_params[name]) + except AssertionError as ex: + failed = True + self.verboseprint( + self._tabularize_output( + f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", + ex.args[0], + ) + ) + + if not failed: + status = "OK" + else: + status = "Failed" + + print( + self._tabularize_output( + f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", + status, + ) + ) + + def apply(self): + print(f"Executing {self.type} BN tests.") + self.test_preprocess() + self.test_setters() + t0 = time.time() + + for use_mixture, has_logit in itertools.product([True, False], repeat=2): + for sf in ["MI", "K2", "BIC"]: + self.sf = sf + t1 = time.time() + self.test_structure_learning( + use_mixture=use_mixture, has_logit=has_logit + ) + self.test_parameters_learning() + if not self.bn: + print(self._tabularize_output(f"Error on {sf}", "No structure")) + print("-" * 8) + continue + self.test_sampling() + self.test_predict() + t2 = time.time() + print("-" * 8, f"Elapsed time: {t2 - t1}", "-" * 8) + + t3 = time.time() + print(f"\nElapsed time: {t3 - t0}") diff --git a/tests/SaveBN.py b/tests/SaveBN.py new file mode 100644 index 0000000..0a210ea --- /dev/null +++ b/tests/SaveBN.py @@ -0,0 +1,49 @@ +import pandas as pd +from sklearn import preprocessing as pp + +import bamt.networks as Networks +from bamt.preprocessors import Preprocessor + +# import json + +hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[ + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +] + +encoder = pp.LabelEncoder() +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform") + +p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + +discretized_data, est = p.apply(hack_data) + +bn = Networks.HybridBN(use_mixture=True, has_logit=True) +info = p.info + +bn.add_nodes(info) + +structure = [ + ("Tectonic regime", "Structural setting"), + ("Gross", "Netpay"), + 
("Lithology", "Permeability"), +] + +bn.set_structure(edges=structure) + +bn.get_info(as_df=False) + +bn.fit_parameters(data=hack_data) +print(bn.sample(4)) +bn.save_params("hack_p.json") +# bn.save_structure("hack_s.json") +bn.save("hack.json") diff --git a/tests/main.py b/tests/main.py new file mode 100644 index 0000000..37083e0 --- /dev/null +++ b/tests/main.py @@ -0,0 +1,27 @@ +import logging +import time +import traceback + +from NetworksTest import TestHybridBN, TestContinuousBN, TestDiscreteBN + +# Print only errors +logging.getLogger("preprocessor").setLevel(logging.ERROR) + +if __name__ == "__main__": + t0 = time.time() + dir = r"../data/real data/hack_processed_with_rf.csv" + + tests = [ + TestHybridBN(directory=dir), + TestDiscreteBN(directory=dir), + TestContinuousBN(directory=dir), + ] + + for test in tests: + try: + test.apply() + except Exception as ex: + traceback.print_exc() + continue + + print(f"Total time: {time.time() - t0}") diff --git a/tests/sendingClassifiersLogit.py b/tests/sendingClassifiersLogit.py new file mode 100644 index 0000000..ce312f0 --- /dev/null +++ b/tests/sendingClassifiersLogit.py @@ -0,0 +1,61 @@ +# import json + +import pandas as pd +from sklearn import preprocessing as pp +from sklearn.ensemble import RandomForestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.tree import DecisionTreeClassifier + +import bamt.networks as Nets +import bamt.preprocessors as preprocessors + +hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv")[ + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +] + +encoder = pp.LabelEncoder() +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile") + +p = preprocessors.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + +discretized_data, est = p.apply(hack_data) + +bn = Nets.HybridBN(has_logit=True) +info = p.info + +# with open(r"C:\Users\Roman\Desktop\mymodels\mynet.json") as f: +# net_data = json.load(f) + +# bn.add_nodes(net_data["info"]) +# bn.set_structure(edges=net_data["edges"]) +# bn.set_parameters(net_data["parameters"]) +# +# print(bn.sample(10, models_dir=r"")) + +bn.add_nodes(info) + +bn.add_edges(discretized_data, scoring_function=("BIC",), progress_bar=False) +bn.set_classifiers( + classifiers={ + "Structural setting": DecisionTreeClassifier(), + "Lithology": RandomForestClassifier(), + "Period": KNeighborsClassifier(n_neighbors=2), + } +) + +bn.fit_parameters(hack_data) + +# bn.save("mynet.json") +print(bn.sample(500).shape) + +bn.get_info(as_df=False) diff --git a/tests/sendingRegressors.py b/tests/sendingRegressors.py new file mode 100644 index 0000000..b0c79ae --- /dev/null +++ b/tests/sendingRegressors.py @@ -0,0 +1,62 @@ +# import json + +import pandas as pd +from catboost import CatBoostRegressor +from sklearn import preprocessing as pp +from sklearn.ensemble import RandomForestRegressor + +# from sklearn.linear_model import ElasticNet +from sklearn.tree import DecisionTreeRegressor + +import bamt.preprocessors as preprocessors +from bamt.networks import HybridBN + +hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv")[ + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +].dropna() + +encoder = pp.LabelEncoder() +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile") + +p = 
preprocessors.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + +discretized_data, est = p.apply(hack_data) + +bn = HybridBN(has_logit=True) +info = p.info + +# with open(r"C:\Users\Roman\Desktop\mymodels\mynet.json") as f: +# net_data = json.load(f) + +# bn.add_nodes(net_data["info"]) +# bn.set_structure(edges=net_data["edges"]) +# bn.set_parameters(net_data["parameters"]) + +bn.add_nodes(info) + +bn.add_edges(discretized_data, scoring_function=("BIC",), progress_bar=False) + +bn.set_regressor( + regressors={ + "Depth": CatBoostRegressor(logging_level="Silent", allow_writing_files=False), + "Gross": RandomForestRegressor(), + "Porosity": DecisionTreeRegressor(), + } +) + +bn.fit_parameters(hack_data) + +# bn.save("mynet") + +print(bn.sample(100).shape) +bn.get_info(as_df=False) diff --git a/tests/test_builders.py b/tests/test_builders.py new file mode 100644 index 0000000..530a2b4 --- /dev/null +++ b/tests/test_builders.py @@ -0,0 +1,728 @@ +import itertools +import logging +import unittest + +import pandas as pd + +from bamt.builders.builders_base import StructureBuilder, VerticesDefiner +from bamt.builders.evo_builder import EvoStructureBuilder +from bamt.builders.hc_builder import HillClimbDefiner +from bamt.nodes.discrete_node import DiscreteNode +from bamt.nodes.gaussian_node import GaussianNode +from bamt.utils.MathUtils import precision_recall + +logging.getLogger("builder").setLevel(logging.CRITICAL) + + +class TestStructureBuilder(unittest.TestCase): + def setUp(self): + self.data = pd.DataFrame(columns=["Node0", "Node1", "Node2"]) + self.descriptor = { + "types": {"Node0": "cont", "Node1": "disc", "Node2": "disc_num"}, + "signs": {"Node0": "pos"}, + } + self.SB = StructureBuilder(descriptor=self.descriptor) + + def test_restrict(self): + self.SB.has_logit = True + + self.SB.restrict(data=self.data, init_nodes=None, bl_add=None) + + self.assertEqual(self.SB.black_list, [], msg="Restrict wrong edges.") + + # --------- + self.SB.has_logit = False + + self.SB.restrict(data=self.data, init_nodes=None, bl_add=None) + + self.assertEqual( + self.SB.black_list, + [("Node0", "Node1"), ("Node0", "Node2")], + msg="Restricted edges are allowed.", + ) + + def test_get_family(self): + self.assertIsNone(self.SB.get_family()) + + self.SB.skeleton["V"] = [ + GaussianNode(name="Node0"), + DiscreteNode(name="Node1"), + DiscreteNode(name="Node2"), + ] + self.assertIsNone(self.SB.get_family()) + # Note that the method get_family is not supposed to be used by user (only developer), + # so we don't cover a case with restricted edges here (we did this in + # the previous test). 
+ self.SB.skeleton["E"] = [ + ("Node1", "Node0"), + ("Node2", "Node1"), + ("Node2", "Node0"), + ] + self.SB.get_family() + + # Node: [[cont_parents], [disc_parents], [children]] + data = [ + [[], [], ["Node1", "Node0"]], + [[], ["Node2"], ["Node0"]], + [[], ["Node1", "Node2"], []], + ] + for node_nummer in range(3): + self.assertEqual( + self.SB.skeleton["V"][node_nummer].cont_parents, data[node_nummer][0] + ) + self.assertEqual( + self.SB.skeleton["V"][node_nummer].disc_parents, data[node_nummer][1] + ) + self.assertEqual( + self.SB.skeleton["V"][node_nummer].children, data[node_nummer][2] + ) + + +class TestVerticesDefiner(unittest.TestCase): + def setUp(self): + self.descriptor = { + "types": { + "Node0": "cont", + "Node1": "cont", + "Node2": "cont", + "Node3": "cont", + "Node4": "disc", + "Node5": "disc", + "Node6": "disc_num", + "Node7": "disc_num", + }, + "signs": {"Node0": "pos", "Node1": "neg"}, + } + + self.VD = VerticesDefiner(descriptor=self.descriptor, regressor=None) + + def test_first_level(self): + self.assertEqual( + self.VD.vertices, + [ + GaussianNode(name="Node0"), + GaussianNode(name="Node1"), + GaussianNode(name="Node2"), + GaussianNode(name="Node3"), + DiscreteNode(name="Node4"), + DiscreteNode(name="Node5"), + DiscreteNode(name="Node6"), + DiscreteNode(name="Node7"), + ], + ) + + def test_overwrite_vetrex(self): + self.assertEqual(self.VD.skeleton, {"V": [], "E": []}) + + def reload(): + self.VD.skeleton["V"] = self.VD.vertices + self.VD.skeleton["E"] = [ + ("Node0", "Node7"), + ("Node0", "Node1"), + ("Node0", "Node2"), + ("Node0", "Node5"), + ("Node4", "Node2"), + ("Node4", "Node5"), + ("Node4", "Node6"), + ("Node4", "Node3"), + ] + self.VD.get_family() + + data = { + "True, True": { + "Node0": "MixtureGaussian", + "Node4": "Discrete", + "Node7": "Logit (LogisticRegression)", + "Node1": "MixtureGaussian", + "Node2": "ConditionalMixtureGaussian", + "Node5": "ConditionalLogit (LogisticRegression)", + "Node6": "Discrete", + "Node3": "ConditionalMixtureGaussian", + }, + "True, False": { + "Node0": "MixtureGaussian", + "Node4": "Discrete", + "Node7": "Discrete", + "Node1": "MixtureGaussian", + "Node2": "ConditionalMixtureGaussian", + "Node5": "Discrete", + "Node6": "Discrete", + "Node3": "ConditionalMixtureGaussian", + }, + "False, True": { + "Node0": "Gaussian (LinearRegression)", + "Node4": "Discrete", + "Node7": "Logit (LogisticRegression)", + "Node1": "Gaussian (LinearRegression)", + "Node2": "ConditionalGaussian (LinearRegression)", + "Node5": "ConditionalLogit (LogisticRegression)", + "Node6": "Discrete", + "Node3": "ConditionalGaussian (LinearRegression)", + }, + "False, False": { + "Node0": "Gaussian (LinearRegression)", + "Node4": "Discrete", + "Node7": "Discrete", + "Node1": "Gaussian (LinearRegression)", + "Node2": "ConditionalGaussian (LinearRegression)", + "Node5": "Discrete", + "Node6": "Discrete", + "Node3": "ConditionalGaussian (LinearRegression)", + }, + } + + for use_mixture, has_logit in itertools.product([True, False], repeat=2): + reload() + self.VD.overwrite_vertex( + has_logit=has_logit, + use_mixture=use_mixture, + classifier=None, + regressor=None, + ) + self.assertEqual( + {node.name: node.type for node in self.VD.skeleton["V"]}, + data[f"{use_mixture}, {has_logit}"], + msg=f"failed on use_mixture={use_mixture} and has_logit={has_logit}", + ) + + +class TestHillClimbDefiner(unittest.TestCase): + def setUp(self): + self.descriptor = { + "signs": { + "Depth": "pos", + "Gross": "pos", + "Netpay": "pos", + "Permeability": "pos", + "Porosity": "pos", 
+ }, + "types": { + "Depth": "cont", + "Gross": "cont", + "Lithology": "disc", + "Netpay": "cont", + "Period": "disc", + "Permeability": "cont", + "Porosity": "cont", + "Structural setting": "disc", + "Tectonic regime": "disc", + }, + } + self.data = { + "Tectonic regime": [ + 0, + 1, + 4, + 4, + 0, + 2, + 0, + 0, + 0, + 0, + 3, + 1, + 0, + 3, + 0, + 1, + 4, + 0, + 4, + 3, + 4, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 3, + 2, + 3, + 2, + 3, + 3, + 3, + 0, + ], + "Period": [ + 3, + 1, + 4, + 4, + 1, + 1, + 0, + 0, + 3, + 5, + 3, + 9, + 0, + 5, + 0, + 3, + 5, + 3, + 2, + 4, + 4, + 1, + 5, + 7, + 7, + 7, + 1, + 1, + 1, + 1, + 4, + 6, + 8, + 4, + 4, + 5, + 4, + 7, + 5, + 5, + 0, + ], + "Lithology": [ + 2, + 4, + 6, + 4, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + 4, + 1, + 4, + 1, + 4, + 4, + 4, + 5, + 3, + 2, + 2, + 2, + 4, + 1, + 1, + 3, + 4, + 4, + 4, + 4, + 2, + 0, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 2, + ], + "Structural setting": [ + 2, + 6, + 10, + 10, + 7, + 5, + 8, + 8, + 2, + 2, + 6, + 6, + 3, + 7, + 3, + 6, + 10, + 9, + 3, + 0, + 0, + 7, + 6, + 6, + 6, + 7, + 6, + 6, + 6, + 6, + 8, + 2, + 9, + 4, + 7, + 6, + 1, + 8, + 4, + 4, + 3, + ], + "Gross": [ + 1, + 3, + 1, + 3, + 1, + 0, + 2, + 3, + 0, + 4, + 4, + 4, + 0, + 3, + 0, + 0, + 3, + 4, + 0, + 4, + 3, + 2, + 2, + 4, + 0, + 4, + 1, + 2, + 2, + 4, + 2, + 4, + 3, + 1, + 1, + 1, + 2, + 3, + 0, + 2, + 1, + ], + "Netpay": [ + 3, + 2, + 1, + 4, + 2, + 0, + 2, + 2, + 1, + 4, + 3, + 4, + 0, + 3, + 1, + 1, + 0, + 4, + 1, + 3, + 4, + 3, + 3, + 4, + 0, + 4, + 0, + 1, + 2, + 4, + 2, + 3, + 2, + 1, + 2, + 0, + 2, + 4, + 1, + 3, + 0, + ], + "Porosity": [ + 3, + 0, + 4, + 3, + 3, + 1, + 0, + 0, + 3, + 0, + 2, + 1, + 2, + 3, + 0, + 2, + 3, + 0, + 0, + 4, + 2, + 4, + 2, + 2, + 1, + 1, + 1, + 3, + 3, + 2, + 4, + 3, + 1, + 4, + 4, + 4, + 3, + 1, + 4, + 4, + 0, + ], + "Permeability": [ + 4, + 0, + 3, + 3, + 2, + 1, + 1, + 1, + 1, + 0, + 4, + 4, + 1, + 3, + 1, + 4, + 3, + 0, + 0, + 3, + 0, + 1, + 2, + 0, + 2, + 2, + 1, + 2, + 3, + 4, + 3, + 2, + 2, + 2, + 4, + 4, + 3, + 0, + 4, + 4, + 0, + ], + "Depth": [ + 1, + 4, + 3, + 4, + 1, + 3, + 1, + 3, + 1, + 4, + 3, + 4, + 1, + 2, + 1, + 4, + 0, + 4, + 0, + 0, + 3, + 2, + 3, + 2, + 2, + 3, + 4, + 2, + 2, + 4, + 1, + 0, + 2, + 0, + 4, + 0, + 1, + 2, + 0, + 0, + 3, + ], + } + + def test_apply_K2(self): + hcd = HillClimbDefiner( + data=pd.DataFrame(self.data), + descriptor=self.descriptor, + scoring_function=("K2",), + ) + + hcd.apply_K2( + data=pd.DataFrame(self.data), + init_edges=None, + progress_bar=False, + remove_init_edges=False, + white_list=None, + ) + + right_edges = [ + ["Tectonic regime", "Structural setting"], + ["Tectonic regime", "Depth"], + ["Tectonic regime", "Netpay"], + ["Period", "Porosity"], + ["Period", "Tectonic regime"], + ["Period", "Netpay"], + ["Lithology", "Permeability"], + ["Lithology", "Period"], + ["Lithology", "Tectonic regime"], + ["Structural setting", "Netpay"], + ["Netpay", "Gross"], + ["Porosity", "Permeability"], + ["Porosity", "Depth"], + ["Porosity", "Netpay"], + ["Permeability", "Netpay"], + ] + + self.assertEqual(hcd.skeleton["E"], right_edges) + + def test_apply_group1(self): + hcd = HillClimbDefiner( + data=pd.DataFrame(self.data), + descriptor=self.descriptor, + scoring_function=("MI",), + ) + + hcd.restrict(data=pd.DataFrame(self.data), bl_add=None, init_nodes=None) + hcd.apply_group1( + data=pd.DataFrame(self.data), + progress_bar=False, + init_edges=None, + remove_init_edges=False, + white_list=None, + ) + + right_edges = [ + ["Lithology", "Depth"], + 
["Period", "Gross"], + ["Netpay", "Gross"], + ["Period", "Netpay"], + ["Depth", "Period"], + ["Depth", "Permeability"], + ["Netpay", "Permeability"], + ["Period", "Porosity"], + ["Netpay", "Porosity"], + ["Permeability", "Structural setting"], + ["Netpay", "Structural setting"], + ["Period", "Tectonic regime"], + ["Netpay", "Tectonic regime"], + ] + + self.assertEqual(hcd.skeleton["E"], right_edges) + + +class TestEvoStructureBuilder(unittest.TestCase): + def setUp(self): + self.data = pd.read_csv(r"data/benchmark/asia.csv", index_col=0) + self.descriptor = { + "types": { + "asia": "disc", + "tub": "disc", + "smoke": "disc", + "lung": "disc", + "bronc": "disc", + "either": "disc", + "xray": "disc", + "dysp": "disc", + }, + "signs": {}, + } + self.evo_builder = EvoStructureBuilder( + data=self.data, + descriptor=self.descriptor, + regressor=None, + has_logit=True, + use_mixture=True, + ) + # Replace this with your actual reference DAG + self.reference_dag = [ + ("asia", "tub"), + ("tub", "either"), + ("smoke", "lung"), + ("smoke", "bronc"), + ("lung", "either"), + ("bronc", "dysp"), + ("either", "xray"), + ("either", "dysp"), + ] + + def test_build(self): + # placeholder kwargs + kwargs = {} + self.evo_builder.build( + data=self.data, classifier=None, regressor=None, verbose=False, **kwargs + ) + + obtained_dag = self.evo_builder.skeleton["E"] + num_edges = len(obtained_dag) + self.assertGreaterEqual( + num_edges, 1, msg="Obtained graph should have at least one edge." + ) + + dist = precision_recall(obtained_dag, self.reference_dag)["SHD"] + self.assertLess( + dist, + 15, + msg=f"Structural Hamming Distance should be less than 15, obtained SHD = {dist}", + ) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_graph_analyzer.py b/tests/test_graph_analyzer.py new file mode 100644 index 0000000..a87bcb2 --- /dev/null +++ b/tests/test_graph_analyzer.py @@ -0,0 +1,98 @@ +import logging +import unittest + +from bamt.builders.builders_base import VerticesDefiner +from bamt.networks.discrete_bn import DiscreteBN +from bamt.nodes.discrete_node import DiscreteNode +from bamt.utils import GraphUtils + +logging.getLogger("builder").setLevel(logging.CRITICAL) + + +class TestGraphAnalyzer(unittest.TestCase): + def setUp(self): + self.bn = DiscreteBN() + definer = VerticesDefiner( + descriptor={ + "types": { + "Node0": "disc", + "Node1": "disc", + "Node2": "disc", + "Node3": "disc_num", + "Node4": "disc", + "Node5": "disc", + "Node6": "disc", + "Node7": "disc", + "Node8": "disc", + "Node9": "disc", + }, + "signs": {}, + }, + regressor=None, + ) + + definer.skeleton["V"] = [DiscreteNode(name=f"Node{i}") for i in range(10)] + definer.skeleton["E"] = [ + ("Node0", "Node1"), + ("Node0", "Node2"), + ("Node2", "Node3"), + ("Node4", "Node7"), + ("Node1", "Node5"), + ("Node5", "Node6"), + ("Node7", "Node0"), + ("Node8", "Node1"), + ("Node9", "Node2"), + ] + definer.get_family() + + self.bn.nodes = definer.skeleton["V"] + self.bn.edges = definer.skeleton["E"] + + self.analyzer = GraphUtils.GraphAnalyzer(self.bn) + + def test_markov_blanket(self): + result = self.analyzer.markov_blanket("Node0") + result["nodes"] = sorted(result["nodes"]) + self.assertEqual( + { + "edges": [ + ("Node0", "Node1"), + ("Node0", "Node2"), + ("Node7", "Node0"), + ("Node8", "Node1"), + ("Node9", "Node2"), + ], + "nodes": sorted(["Node0", "Node1", "Node2", "Node7", "Node8", "Node9"]), + }, + result, + ) + + def test_find_family(self): + without_parents = self.analyzer.find_family("Node0", 0, 2, None) + 
without_parents["nodes"] = sorted(without_parents["nodes"]) + self.assertEqual( + { + "nodes": sorted(["Node3", "Node2", "Node1", "Node0", "Node5"]), + "edges": [ + ("Node0", "Node1"), + ("Node0", "Node2"), + ("Node2", "Node3"), + ("Node1", "Node5"), + ], + }, + without_parents, + ) + + without_children = self.analyzer.find_family("Node0", 2, 0, None) + without_children["nodes"] = sorted(without_children["nodes"]) + self.assertEqual( + { + "nodes": sorted(["Node4", "Node7", "Node0"]), + "edges": [("Node4", "Node7"), ("Node7", "Node0")], + }, + without_children, + ) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_networks.py b/tests/test_networks.py new file mode 100644 index 0000000..b2df174 --- /dev/null +++ b/tests/test_networks.py @@ -0,0 +1,1192 @@ +import json +import logging +import pathlib as pl +import unittest + +import pandas as pd +from catboost import CatBoostRegressor +from sklearn import preprocessing as pp +from sklearn.ensemble import RandomForestRegressor +from sklearn.tree import DecisionTreeRegressor + +import bamt.preprocessors as bp +from bamt.networks.composite_bn import CompositeBN +from bamt.networks.hybrid_bn import BaseNetwork, HybridBN +from bamt.nodes.discrete_node import DiscreteNode +from bamt.nodes.gaussian_node import GaussianNode +from bamt.utils.MathUtils import precision_recall +from bamt.utils.composite_utils.CompositeGeneticOperators import ( + custom_mutation_add_model, + custom_crossover_all_model, +) +from bamt.utils.composite_utils.CompositeModel import CompositeModel, CompositeNode + +logging.getLogger("network").setLevel(logging.CRITICAL) + + +class TestCaseBase(unittest.TestCase): + def assertIsFile(self, path): + if not pl.Path(path).resolve().is_file(): + raise AssertionError("File does not exist: %s" % str(path)) + + def assertIsDir(self, path): + if not pl.Path(path).resolve().is_dir(): + raise AssertionError("Direction does not exist: %s" % str(path)) + + def prepare_bn_and_data(self): + # prepare bn where models were set by set_model + hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[ + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] + ] + + encoder = pp.LabelEncoder() + discretizer = pp.KBinsDiscretizer( + n_bins=5, encode="ordinal", strategy="quantile" + ) + + p = bp.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) + + hack_data.dropna(inplace=True) + hack_data.reset_index(inplace=True, drop=True) + + discretized_data, est = p.apply(hack_data) + + self.bn = HybridBN(has_logit=True) + info = p.info + + self.bn.add_nodes(info) + + self.bn.add_edges( + discretized_data, scoring_function=("BIC",), progress_bar=False + ) + + self.bn.set_regressor( + regressors={ + "Depth": CatBoostRegressor( + logging_level="Silent", allow_writing_files=False + ), + "Gross": RandomForestRegressor(), + "Porosity": DecisionTreeRegressor(), + } + ) + return hack_data + + +class TestBaseNetwork(TestCaseBase): + def setUp(self): + self.bn = BaseNetwork() + + self.nodes = [ + GaussianNode(name="Node0"), + DiscreteNode(name="Node1"), + GaussianNode(name="Node2"), + ] + + self.edges = [("Node0", "Node1"), ("Node1", "Node2")] + self.descriptor = { + "types": {"Node0": "cont", "Node1": "disc", "Node2": "cont"}, + "signs": {"Node0": "pos", "Node1": "neg"}, + } + + def test_validate(self): + descriptor_t = {"types": {"Node0": "Abstract", "Node1": "Abstract"}} + descriptor_f = {"types": {"Node0": 
"Abstract", "Node1": "cont"}} + + self.assertFalse(self.bn.validate(descriptor_f)) + self.assertTrue(self.bn.validate(descriptor_t)) + + def test_update_descriptor(self): + self.bn.descriptor = self.descriptor + # Nodes out + self.bn.nodes = [GaussianNode(name="Node0")] + self.bn.update_descriptor() + self.assertEqual({"Node0": "cont"}, self.bn.descriptor["types"]) + + # It uses only Vertices Definer, test of this is in builders tests. + def test_add_nodes(self): + pass + + def test_add_edges(self): + # It uses builders + pass + + def test_calculate_weights(self): + pass + + def test_set_nodes(self): + class MyNode: + def __init__(self, name): + self.name = name + + # set without mapping + self.assertIsNone(self.bn.set_nodes(nodes=[GaussianNode(name="Node0")])) + + map = { + "types": {"Node0": "cont", "Node1": "disc", "Node2": "cont"}, + "signs": {}, + } + + self.bn.set_nodes(nodes=self.nodes, info=map) + self.assertEqual(self.bn.nodes, self.nodes) + + self.bn.set_nodes( + nodes=[MyNode(name="Node-1"), MyNode("Node-2")], + info={"types": {"Node-1": "cont", "Node-2": "disc"}, "signs": {}}, + ) + self.assertEqual(self.bn.nodes, []) + + def test_set_edges(self): + self.edges.extend([(0, 1), (pd.NA, "1")]) + + self.bn.has_logit = False + self.bn.set_nodes(nodes=self.nodes, info=self.descriptor) + self.bn.set_edges(edges=self.edges) + + self.assertEqual([("Node1", "Node2")], self.bn.edges) + + # The test consists of 2 previous methods that are tested, + # plus methods of builders, they are tested as well. + def test_set_structure(self): + pass + + # Node testing. + def test_param_validation(self): + pass + + def test_set_parameters(self): + pass + + def test_save_params(self): + self.bn.distributions = {"AAA": "BBB"} + with self.assertRaises(TypeError): + self.bn.save_params("out.txt") + + self.assertTrue(self.bn.save_params("out.json")) + + self.assertIsFile("out.json") + + self.assertEqual(json.load(open("out.json")), self.bn.distributions) + pl.Path("out.json").unlink() + + def test_save_structure(self): + self.bn.edges = self.edges + + with self.assertRaises(TypeError): + self.bn.save_structure("out.txt") + + self.assertTrue(self.bn.save_structure("out.json")) + self.assertIsFile("out.json") + f = json.load(open("out.json")) + for i in range(len(f)): + self.assertEqual(list(self.bn.edges[i]), f[i]) + pl.Path("out.json").unlink() + + # Covers by prev tests. + def test_save(self): + pass + + def test_fit_parameters(self): + """ + General test, the full one is in the tests of each node. + It is frozen for a while. 
+ """ + pass + # bn = BaseNetwork() + # shape = 500 + # data = pd.DataFrame({"Node0": np.random.poisson(5, shape), + # "Node1": np.random.choice(["cat1", "cat2", "cat3"], shape), + # "Node2": np.random.normal(1.5, 4, shape)}) + # + # + # bn.set_structure(info=self.descriptor, nodes=self.nodes, edges=self.edges) + # bn.get_info(as_df=False) + + def test_joblib_pathsave(self): + hack_data = self.prepare_bn_and_data() + self.bn.fit_parameters(hack_data) + + self.assertGreater( + self.bn.sample(100, progress_bar=False).size, 0, "Sampling is broken" + ) + + combination_package = self.bn.distributions["Gross"]["hybcprob"][ + "['COMPRESSION']" + ] + regressor_obj = combination_package["regressor_obj"] + + if combination_package["serialization"] == "joblib": + self.assertIsFile(regressor_obj) + + def test_sample(self): + data = { + "Tectonic regime": [ + 0, + 1, + 4, + 4, + 0, + 2, + 0, + 0, + 0, + 0, + 3, + 1, + 0, + 3, + 0, + 1, + 4, + 0, + 4, + 3, + 4, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 3, + 2, + 3, + 2, + 3, + 3, + 3, + 0, + ], + "Period": [ + 3, + 1, + 4, + 4, + 1, + 1, + 0, + 0, + 3, + 5, + 3, + 9, + 0, + 5, + 0, + 3, + 5, + 3, + 2, + 4, + 4, + 1, + 5, + 7, + 7, + 7, + 1, + 1, + 1, + 1, + 4, + 6, + 8, + 4, + 4, + 5, + 4, + 7, + 5, + 5, + 0, + ], + "Lithology": [ + 2, + 4, + 6, + 4, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + 4, + 1, + 4, + 1, + 4, + 4, + 4, + 5, + 3, + 2, + 2, + 2, + 4, + 1, + 1, + 3, + 4, + 4, + 4, + 4, + 2, + 0, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 2, + ], + "Structural setting": [ + 2, + 6, + 10, + 10, + 7, + 5, + 8, + 8, + 2, + 2, + 6, + 6, + 3, + 7, + 3, + 6, + 10, + 9, + 3, + 0, + 0, + 7, + 6, + 6, + 6, + 7, + 6, + 6, + 6, + 6, + 8, + 2, + 9, + 4, + 7, + 6, + 1, + 8, + 4, + 4, + 3, + ], + "Gross": [ + 1, + 3, + 1, + 3, + 1, + 0, + 2, + 3, + 0, + 4, + 4, + 4, + 0, + 3, + 0, + 0, + 3, + 4, + 0, + 4, + 3, + 2, + 2, + 4, + 0, + 4, + 1, + 2, + 2, + 4, + 2, + 4, + 3, + 1, + 1, + 1, + 2, + 3, + 0, + 2, + 1, + ], + "Netpay": [ + 3, + 2, + 1, + 4, + 2, + 0, + 2, + 2, + 1, + 4, + 3, + 4, + 0, + 3, + 1, + 1, + 0, + 4, + 1, + 3, + 4, + 3, + 3, + 4, + 0, + 4, + 0, + 1, + 2, + 4, + 2, + 3, + 2, + 1, + 2, + 0, + 2, + 4, + 1, + 3, + 0, + ], + "Porosity": [ + 3, + 0, + 4, + 3, + 3, + 1, + 0, + 0, + 3, + 0, + 2, + 1, + 2, + 3, + 0, + 2, + 3, + 0, + 0, + 4, + 2, + 4, + 2, + 2, + 1, + 1, + 1, + 3, + 3, + 2, + 4, + 3, + 1, + 4, + 4, + 4, + 3, + 1, + 4, + 4, + 0, + ], + "Permeability": [ + 4, + 0, + 3, + 3, + 2, + 1, + 1, + 1, + 1, + 0, + 4, + 4, + 1, + 3, + 1, + 4, + 3, + 0, + 0, + 3, + 0, + 1, + 2, + 0, + 2, + 2, + 1, + 2, + 3, + 4, + 3, + 2, + 2, + 2, + 4, + 4, + 3, + 0, + 4, + 4, + 0, + ], + "Depth": [ + 1, + 4, + 3, + 4, + 1, + 3, + 1, + 3, + 1, + 4, + 3, + 4, + 1, + 2, + 1, + 4, + 0, + 4, + 0, + 0, + 3, + 2, + 3, + 2, + 2, + 3, + 4, + 2, + 2, + 4, + 1, + 0, + 2, + 0, + 4, + 0, + 1, + 2, + 0, + 0, + 3, + ], + } + nodes = [ + DiscreteNode(name="Tectonic regime"), + DiscreteNode(name="Period"), + DiscreteNode(name="Lithology"), + DiscreteNode(name="Structural setting"), + DiscreteNode(name="Gross"), + DiscreteNode(name="Netpay"), + DiscreteNode(name="Porosity"), + DiscreteNode(name="Permeability"), + DiscreteNode(name="Depth"), + ] + + self.bn.set_nodes(nodes, info={"types": {k.name: "disc" for k in nodes}}) + self.bn.set_edges( + [ + ["Tectonic regime", "Period"], + ["Structural setting", "Period"], + ["Tectonic regime", "Lithology"], + ["Lithology", "Structural setting"], + ] + ) + self.bn.fit_parameters(pd.DataFrame.from_records(data)) + 
self.assertIsNotNone(self.bn.sample(50, as_df=False, progress_bar=False)) + + def test_predict(self): + seq = { + "Tectonic regime": [ + 0, + 1, + 4, + 4, + 0, + 2, + 0, + 0, + 0, + 0, + 3, + 1, + 0, + 3, + 0, + 1, + 4, + 0, + 4, + 3, + 4, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 3, + 2, + 3, + 2, + 3, + 3, + 3, + 0, + ], + "Period": [ + 3, + 1, + 4, + 4, + 1, + 1, + 0, + 0, + 3, + 5, + 3, + 9, + 0, + 5, + 0, + 3, + 5, + 3, + 2, + 4, + 4, + 1, + 5, + 7, + 7, + 7, + 1, + 1, + 1, + 1, + 4, + 6, + 8, + 4, + 4, + 5, + 4, + 7, + 5, + 5, + 0, + ], + "Lithology": [ + 2, + 4, + 6, + 4, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + 4, + 1, + 4, + 1, + 4, + 4, + 4, + 5, + 3, + 2, + 2, + 2, + 4, + 1, + 1, + 3, + 4, + 4, + 4, + 4, + 2, + 0, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 2, + ], + "Structural setting": [ + 2, + 6, + 10, + 10, + 7, + 5, + 8, + 8, + 2, + 2, + 6, + 6, + 3, + 7, + 3, + 6, + 10, + 9, + 3, + 0, + 0, + 7, + 6, + 6, + 6, + 7, + 6, + 6, + 6, + 6, + 8, + 2, + 9, + 4, + 7, + 6, + 1, + 8, + 4, + 4, + 3, + ], + "Gross": [ + 1, + 3, + 1, + 3, + 1, + 0, + 2, + 3, + 0, + 4, + 4, + 4, + 0, + 3, + 0, + 0, + 3, + 4, + 0, + 4, + 3, + 2, + 2, + 4, + 0, + 4, + 1, + 2, + 2, + 4, + 2, + 4, + 3, + 1, + 1, + 1, + 2, + 3, + 0, + 2, + 1, + ], + "Netpay": [ + 3, + 2, + 1, + 4, + 2, + 0, + 2, + 2, + 1, + 4, + 3, + 4, + 0, + 3, + 1, + 1, + 0, + 4, + 1, + 3, + 4, + 3, + 3, + 4, + 0, + 4, + 0, + 1, + 2, + 4, + 2, + 3, + 2, + 1, + 2, + 0, + 2, + 4, + 1, + 3, + 0, + ], + "Porosity": [ + 3, + 0, + 4, + 3, + 3, + 1, + 0, + 0, + 3, + 0, + 2, + 1, + 2, + 3, + 0, + 2, + 3, + 0, + 0, + 4, + 2, + 4, + 2, + 2, + 1, + 1, + 1, + 3, + 3, + 2, + 4, + 3, + 1, + 4, + 4, + 4, + 3, + 1, + 4, + 4, + 0, + ], + "Permeability": [ + 4, + 0, + 3, + 3, + 2, + 1, + 1, + 1, + 1, + 0, + 4, + 4, + 1, + 3, + 1, + 4, + 3, + 0, + 0, + 3, + 0, + 1, + 2, + 0, + 2, + 2, + 1, + 2, + 3, + 4, + 3, + 2, + 2, + 2, + 4, + 4, + 3, + 0, + 4, + 4, + 0, + ], + "Depth": [ + 1, + 4, + 3, + 4, + 1, + 3, + 1, + 3, + 1, + 4, + 3, + 4, + 1, + 2, + 1, + 4, + 0, + 4, + 0, + 0, + 3, + 2, + 3, + 2, + 2, + 3, + 4, + 2, + 2, + 4, + 1, + 0, + 2, + 0, + 4, + 0, + 1, + 2, + 0, + 0, + 3, + ], + } + data = pd.DataFrame.from_records(seq) + nodes = [ + DiscreteNode(name="Tectonic regime"), + DiscreteNode(name="Period"), + DiscreteNode(name="Lithology"), + DiscreteNode(name="Structural setting"), + ] + + self.bn.set_nodes(nodes, info={"types": {k.name: "disc" for k in nodes}}) + self.bn.set_edges( + [ + ["Tectonic regime", "Period"], + ["Structural setting", "Period"], + ["Tectonic regime", "Lithology"], + ["Lithology", "Structural setting"], + ] + ) + self.bn.fit_parameters(data) + + result = self.bn.predict(data.iloc[:, :3], parall_count=2, progress_bar=False) + self.assertNotEqual(result, {}) + + for v in result.values(): + for item in v: + self.assertFalse(pd.isna(item)) + + +class TestBigBraveBN(unittest.SkipTest): + pass + + +class TestCompositeNetwork(unittest.TestCase): + def setUp(self): + self.data = pd.read_csv(r"data/benchmark/healthcare.csv", index_col=0) + self.descriptor = { + "types": { + "A": "disc", + "C": "disc", + "D": "cont", + "H": "disc", + "I": "cont", + "O": "cont", + "T": "cont", + }, + "signs": {"D": "pos", "I": "neg", "O": "pos", "T": "pos"}, + } + self.reference_dag = [ + ("A", "C"), + ("A", "D"), + ("A", "H"), + ("A", "O"), + ("C", "I"), + ("D", "I"), + ("H", "D"), + ("I", "T"), + ("O", "T"), + ] + + self.comparative_dag = [("I", "T"), ("O", "T")] + + def test_learning(self): + bn, _ = self._get_starter_bn(self.data) + + 
        bn.add_edges(self.data, verbose=False)
+
+        bn.fit_parameters(self.data)
+
+        obtained_dag = bn.edges
+        num_edges = len(obtained_dag)
+        self.assertGreaterEqual(
+            num_edges, 1, msg="Obtained graph should have at least one edge."
+        )
+
+        dist = precision_recall(obtained_dag, self.reference_dag)["SHD"]
+        self.assertLess(
+            dist,
+            25,
+            msg=f"Structural Hamming Distance should be less than 25, obtained SHD = {dist}",
+        )
+
+        for node in bn.nodes:
+            if type(node).__name__ == "CompositeContinuousNode":
+                self.assertIsNotNone(
+                    node.regressor,
+                    msg="CompositeContinuousNode does not have regressor",
+                )
+            if type(node).__name__ == "CompositeDiscreteNode":
+                self.assertIsNotNone(
+                    node.classifier,
+                    msg="CompositeDiscreteNode does not have classifier",
+                )
+
+    def test_learning_models(self):
+        bn, p = self._get_starter_bn(self.data[["I", "O", "T"]])
+
+        parent_node_a = CompositeNode(
+            nodes_from=None,
+            content={
+                "name": "I",
+                "type": p.nodes_types["I"],
+                "parent_model": None,
+            },
+        )
+
+        parent_node_h = CompositeNode(
+            nodes_from=None,
+            content={
+                "name": "O",
+                "type": p.nodes_types["O"],
+                "parent_model": None,
+            },
+        )
+
+        child_node = CompositeNode(
+            nodes_from=[parent_node_a, parent_node_h],
+            content={
+                "name": "T",
+                "type": p.nodes_types["T"],
+                "parent_model": "CatBoostRegressor",
+            },
+        )
+
+        comp_model = CompositeModel(nodes=[parent_node_a, parent_node_h, child_node])
+
+        bn.add_edges(
+            self.data[["I", "O", "T"]],
+            verbose=False,
+            custom_mutations=[custom_mutation_add_model],
+            custom_crossovers=[custom_crossover_all_model],
+            custom_initial_structure=[comp_model],
+        )
+
+        output_structure = [
+            tuple([str(item) for item in inner_list]) for inner_list in bn.edges
+        ]
+
+        self.assertEqual(
+            output_structure,
+            self.comparative_dag,
+            msg="Obtained BN should have reference structure",
+        )
+
+    @staticmethod
+    def _get_starter_bn(data):
+        encoder = pp.LabelEncoder()
+        p = bp.Preprocessor([("encoder", encoder)])
+
+        _, _ = p.apply(data)
+
+        info = p.info
+
+        bn = CompositeBN()
+        bn.add_nodes(info)
+
+        return bn, p
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=3)
diff --git a/tests/test_nodes.py b/tests/test_nodes.py
new file mode 100644
index 0000000..878d736
--- /dev/null
+++ b/tests/test_nodes.py
@@ -0,0 +1,464 @@
+import logging
+import unittest
+
+import numpy as np
+import pandas as pd
+
+from bamt.networks.hybrid_bn import HybridBN
+from bamt.nodes import *
+
+logging.getLogger("nodes").setLevel(logging.CRITICAL)
+
+
+class MyTest(unittest.TestCase):
+    unittest.skip("This is an assertion.")
+
+    def assertDist(self, dist, node_type):
+        if node_type in ("disc", "disc_num"):
+            return self.assertAlmostEqual(sum(dist), 1)
+        else:
+            return self.assertEqual(len(dist), 2, msg=f"Error on {dist}")
+
+    def assertDistMixture(self, dist):
+        return self.assertEqual(len(dist), 3, msg=f"Error on {dist}")
+
+
+class TestBaseNode(MyTest):
+    def setUp(self):
+        np.random.seed(510)
+
+        hybrid_bn = HybridBN(has_logit=True)
+
+        info = {
+            "types": {
+                "Node0": "cont",
+                "Node1": "cont",
+                "Node2": "cont",
+                "Node3": "cont",
+                "Node4": "disc",
+                "Node5": "disc",
+                "Node6": "disc_num",
+                "Node7": "disc_num",
+            },
+            "signs": {"Node0": "pos", "Node1": "neg", "Node2": "neg", "Node3": "neg"},
+        }
+
+        data = pd.DataFrame(
+            {
+                "Node0": np.random.normal(0, 4, 30),
+                "Node1": np.random.normal(0, 0.1, 30),
+                "Node2": np.random.normal(0, 0.3, 30),
+                "Node3": np.random.normal(0, 0.3, 30),
+                "Node4": np.random.choice(["cat1", "cat2", "cat3"], 30),
+                "Node5": np.random.choice(["cat4",
"cat5", "cat6"], 30), + "Node6": np.random.choice(["cat7", "cat8", "cat9"], 30), + "Node7": np.random.choice(["cat7", "cat8", "cat9"], 30), + } + ) + + nodes = [ + gaussian_node.GaussianNode(name="Node0"), + gaussian_node.GaussianNode(name="Node1"), + gaussian_node.GaussianNode(name="Node2"), + gaussian_node.GaussianNode(name="Node3"), + discrete_node.DiscreteNode(name="Node4"), + discrete_node.DiscreteNode(name="Node5"), + discrete_node.DiscreteNode(name="Node6"), + discrete_node.DiscreteNode(name="Node7"), + ] + + edges = [ + ("Node0", "Node7"), + ("Node0", "Node1"), + ("Node0", "Node2"), + ("Node0", "Node5"), + ("Node4", "Node2"), + ("Node4", "Node5"), + ("Node4", "Node6"), + ("Node4", "Node3"), + ] + + hybrid_bn.set_structure(info, nodes=nodes, edges=edges) + hybrid_bn.fit_parameters(data) + self.bn = hybrid_bn + self.data = data + self.info = info + + def test_equality(self): + test = base.BaseNode(name="node0") + first = base.BaseNode(name="node1") + + test_clone = test + + # different name + self.assertFalse(test == first) + self.assertTrue(test == test_clone) + + # different type + test_clone.type = "gaussian" + test.type = "gaussian" + + first.type = "discrete" + + self.assertFalse(test == first) + self.assertTrue(test == test_clone) + + # different disc_parents + disc_parents = [f"node{i}" for i in range(2, 6)] + test.disc_parents, test_clone.disc_parents = disc_parents, disc_parents + first = disc_parents[:3] + + self.assertFalse(test == first) + self.assertTrue(test == test_clone) + + # different cont_parents + cont_parents = [f"node{i}" for i in range(6, 10)] + test.disc_parents, test_clone.disc_parents = cont_parents, cont_parents + first = cont_parents[:3] + + self.assertFalse(test == first) + self.assertTrue(test == test_clone) + + # different children + children = [f"node{i}" for i in range(5)] + test.disc_parents, test_clone.disc_parents = children, children + first = children[:3] + + self.assertFalse(test == first) + self.assertTrue(test == test_clone) + + def test_get_dist_mixture(self): + hybrid_bn = HybridBN(use_mixture=True, has_logit=True) + + hybrid_bn.set_structure(self.info, self.bn.nodes, self.bn.edges) + hybrid_bn.fit_parameters(self.data) + + mixture_gauss = hybrid_bn["Node1"] + cond_mixture_gauss = hybrid_bn["Node2"] + + for i in range(-2, 0, 2): + dist = hybrid_bn.get_dist(mixture_gauss.name, pvals={"Node0": i}) + self.assertDistMixture(dist) + + for i in range(-2, 0, 2): + for j in self.data[cond_mixture_gauss.disc_parents[0]].unique().tolist(): + dist = hybrid_bn.get_dist( + cond_mixture_gauss.name, pvals={"Node0": float(i), "Node4": j} + ) + self.assertDistMixture(dist) + + def test_get_dist(self): + for node in self.bn.nodes: + if not node.cont_parents + node.disc_parents: + dist = self.bn.get_dist(node.name) + if "mean" in dist.keys(): + self.assertTrue(isinstance(dist["mean"], float)) + self.assertTrue(isinstance(dist["variance"], float)) + else: + self.assertAlmostEqual(sum(dist["cprob"]), 1) + continue + + if len(node.cont_parents + node.disc_parents) == 1: + if node.disc_parents: + pvals = self.data[node.disc_parents[0]].unique().tolist() + parent = node.disc_parents[0] + else: + pvals = range(-5, 5, 1) + parent = node.cont_parents[0] + + for pval in pvals: + dist = self.bn.get_dist(node.name, {parent: pval}) + self.assertDist(dist, self.info["types"][node.name]) + else: + for i in self.data[node.disc_parents[0]].unique().tolist(): + for j in range(-5, 5, 1): + dist = self.bn.get_dist( + node.name, + {node.cont_parents[0]: float(j), 
node.disc_parents[0]: i}, + ) + self.assertDist(dist, self.info["types"][node.name]) + + # ??? + def test_choose_serialization(self): + pass + + +class TestDiscreteNode(unittest.TestCase): + def setUp(self): + self.node = discrete_node.DiscreteNode(name="test") + self.data_dict = { + "node0": np.random.normal(0, 4, 30), + "node1": np.random.normal(0, 0.1, 30), + "node2": np.random.normal(0, 0.3, 30), + "test": np.random.choice(["cat1", "cat2", "cat3"], 30), + "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), + "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), + "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), + } + + self.node.disc_parents = ["node4", "node5"] + # self.node.cont_parents = ["node0", "node1"] + self.node.children = ["node6"] + + def test_fit_parameters(self): + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertIsNotNone(params["vals"]) + self.assertNotEqual(params["vals"], []) + + for comb, probas in params["cprob"].items(): + self.assertAlmostEqual(sum(probas), 1, delta=1e-5) + + def test_choose(self): + pvals = ["cat4", "cat7"] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + + self.assertTrue([self.node.choose(params, pvals) in params["vals"]]) + + def test_predict(self): + pvals = ["cat4", "cat7"] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + + self.assertTrue([self.node.predict(params, pvals) in params["vals"]]) + self.assertRaises(KeyError, self.node.predict, params, ["bad", "values"]) + + +class TestGaussianNode(unittest.TestCase): + def setUp(self): + self.node = gaussian_node.GaussianNode(name="test") + self.data_dict = { + "node0": np.random.normal(1, 4, 30), + "node1": np.random.normal(2, 0.1, 30), + "foster-son": np.random.normal(2.5, 0.2, 30), + "test": np.random.normal(3, 0.3, 30), + "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), + "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), + "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), + "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), + } + + # self.node.disc_parents = ["node4", "node5"] + self.node.cont_parents = ["node0", "node1"] + self.node.children = ["node6"] + + def test_fit_parameters(self): + node_without_parents = gaussian_node.GaussianNode(name="foster-son") + node_without_parents.children = ["node6", "node5"] + + params_parents = self.node.fit_parameters( + pd.DataFrame.from_records(self.data_dict) + ) + params_foster = node_without_parents.fit_parameters( + pd.DataFrame.from_records(self.data_dict) + ) + + self.assertIsNotNone(params_parents["regressor_obj"]) + self.assertTrue(pd.isna(params_parents["mean"])) + self.assertIsNotNone(params_parents["variance"]) + + self.assertIsNone(params_foster["regressor_obj"]) + self.assertFalse(pd.isna(params_foster["mean"])) + self.assertIsNotNone(params_foster["variance"]) + + def test_choose(self): + pvals = [1.05, 1.95] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + + self.assertTrue(isinstance(self.node.choose(params, pvals), float)) + + def test_predict(self): + pvals = [1.05, 1.95] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertTrue(isinstance(self.node.predict(params, pvals), float)) + self.assertRaises(ValueError, self.node.predict, params, ["bad", "values"]) + + +class TestConditionalGaussianNode(unittest.TestCase): + def setUp(self): + self.node = conditional_gaussian_node.ConditionalGaussianNode(name="test") + 
self.data_dict = { + "node0": np.random.normal(1, 4, 30), + "node1": np.random.normal(2, 0.1, 30), + "foster-son": np.random.normal(2.5, 0.2, 30), + "test": np.random.normal(3, 0.3, 30), + "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), + "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), + "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), + "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), + } + + self.node.disc_parents = ["node4", "node5"] + self.node.cont_parents = ["node0", "node1"] + self.node.children = ["node6"] + + def fit_parameters(self, regressor=None): + if regressor is not None: + self.node.regressor = regressor + self.node.type = "ConditionalGaussian" + f" ({type(regressor).__name__})" + + node_without_parents = conditional_gaussian_node.ConditionalGaussianNode( + name="foster-son" + ) + node_without_parents.children = ["node6", "node5"] + + params_parents = self.node.fit_parameters( + pd.DataFrame.from_records(self.data_dict) + )["hybcprob"] + params_foster = node_without_parents.fit_parameters( + pd.DataFrame.from_records(self.data_dict) + )["hybcprob"]["[]"] + + self.assertIsNone(params_foster["regressor_obj"]) + self.assertIsNotNone(params_foster["mean"]) + + # Since there can be empty dictionaries, + # we have to count them and if a fraction is greater than the threshold + # test failed. + report = [] + for comb, data in params_parents.items(): + if all(pd.isna(x) for x in data.values()): + report.append(1) + continue + else: + report.append(0) + + if pd.isna(data["mean"]): + self.assertIsNotNone(data["regressor_obj"]) + self.assertIsNotNone(data["variance"]) + else: + self.assertIsNone(data["regressor_obj"]) + self.assertIsNotNone(data["mean"]) + + self.assertLess(sum(report) / len(report), 0.3) + # print(params_parents, params_foster, sep="\n\n") + + # for k, v in params_parents.items(): + # print(k, True if v["regressor_obj"] else False, v["regressor"]) + + # print(sum(report) / len(report), node_without_parents.regressor) + + def test_choose(self): + pvals = [1.05, 1.95, "cat4", "cat7"] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertTrue(isinstance(self.node.choose(params, pvals), float)) + + def test_predict(self): + pvals = [1.05, 1.95, "cat4", "cat7"] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + + self.assertTrue(isinstance(self.node.predict(params, pvals), float)) + self.assertRaises(KeyError, self.node.predict, params, ["bad", "values"]) + + +class TestMixtureGaussianNode(unittest.TestCase): + def setUp(self): + self.node = mixture_gaussian_node.MixtureGaussianNode(name="test") + self.data_dict = { + "node0": np.random.normal(1, 4, 30), + "node1": np.random.normal(2, 0.1, 30), + "test": np.random.normal(3, 0.3, 30), + "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), + "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), + "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), + "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), + } + + # self.node.disc_parents = ["node4", "node5"] + self.node.cont_parents = ["node0", "node1"] + self.node.children = ["node6"] + + def test_fit_parameters(self): + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertAlmostEqual(sum(params["coef"]), 1, delta=1e-5) + + def test_choose(self): + pvals = [1.05, 1.95] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertTrue(isinstance(self.node.choose(params, pvals), float)) + + def 
test_predict(self): + pvals = [1.05, 1.95] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + + self.assertTrue(isinstance(self.node.predict(params, pvals), float)) + + +class TestConditionalMixtureGaussianNode(unittest.TestCase): + def setUp(self): + self.node = conditional_mixture_gaussian_node.ConditionalMixtureGaussianNode( + name="test" + ) + self.data_dict = { + "node0": np.random.normal(1, 4, 30), + "node1": np.random.normal(2, 0.1, 30), + "test": np.random.normal(3, 0.3, 30), + "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), + "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), + "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), + "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), + } + + self.node.disc_parents = ["node4", "node5"] + self.node.cont_parents = ["node0", "node1"] + self.node.children = ["node6"] + + def test_fit_parameters(self): + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict))[ + "hybcprob" + ] + report = [] + # sometimes combination's data can be empty, so we set the percent of + # empty combinations + for comb, data in params.items(): + if np.isclose(sum(data["coef"]), 1, atol=1e-5): + report.append(0) + else: + report.append(1) + self.assertLess(sum(report) / len(report), 0.3) + + def test_choose(self): + pvals = [1.05, 1.95, "cat4", "cat7"] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertTrue(isinstance(self.node.choose(params, pvals), float)) + + def test_predict(self): + pvals = [1.05, 1.95, "cat4", "cat7"] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + + self.assertTrue(isinstance(self.node.predict(params, pvals), float)) + + +class TestLogitNode(unittest.TestCase): + def setUp(self): + self.node = logit_node.LogitNode(name="test") + self.data_dict = { + "node0": np.random.normal(1, 4, 30), + "node1": np.random.normal(2, 0.1, 30), + "node2": np.random.normal(3, 0.3, 30), + "test": np.random.choice(["cat1", "cat2", "cat3"], 30), + "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), + "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), + "node6": np.random.choice(["cat7", "cat8", "cat9"], 30), + } + + # self.node.disc_parents = ["node4", "node5"] + self.node.cont_parents = ["node0", "node1"] + self.node.children = ["node6"] + + def test_fit_parameters(self): + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertIsNotNone(params["classifier_obj"]) + + def test_choose(self): + pvals = [1.05, 1.95] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + self.assertTrue(self.node.choose(params, pvals) in params["classes"]) + + def test_predict(self): + pvals = [1.05, 1.95] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) + + self.assertTrue([self.node.predict(params, pvals) in params["classes"]]) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_params.json b/tests/test_params.json new file mode 100644 index 0000000..e3ebc03 --- /dev/null +++ b/tests/test_params.json @@ -0,0 +1 @@ +{"Node0": {"mean": 56.7183257918552, "coef": [], "variance": 7149.17326145042}, "Node3": {"classes": ["COMPRESSION", "EVAPORITE", "EXTENSION", "GRAVITY", "INVERSION", "STRIKE-SLIP", "TRANSPRESSION", "TRANSTENSION", "UPLIFT"], "classifier_obj": 
"\u0080\u0004\u0095\u00b1\u0003\u0000\u0000\u0000\u0000\u0000\u0000\u008c\u001esklearn.linear_model._logistic\u0094\u008c\u0012LogisticRegression\u0094\u0093\u0094)\u0081\u0094}\u0094(\u008c\u0007penalty\u0094\u008c\u0002l2\u0094\u008c\u0004dual\u0094\u0089\u008c\u0003tol\u0094G?\u001a6\u00e2\u00eb\u001cC-\u008c\u0001C\u0094G?\u00f0\u0000\u0000\u0000\u0000\u0000\u0000\u008c\rfit_intercept\u0094\u0088\u008c\u0011intercept_scaling\u0094K\u0001\u008c\fclass_weight\u0094N\u008c\frandom_state\u0094N\u008c\u0006solver\u0094\u008c\tnewton-cg\u0094\u008c\bmax_iter\u0094Kd\u008c\u000bmulti_class\u0094\u008c\u000bmultinomial\u0094\u008c\u0007verbose\u0094K\u0000\u008c\nwarm_start\u0094\u0089\u008c\u0006n_jobs\u0094N\u008c\bl1_ratio\u0094N\u008c\u000en_features_in_\u0094K\u0001\u008c\bclasses_\u0094\u008c\u0015numpy.core.multiarray\u0094\u008c\f_reconstruct\u0094\u0093\u0094\u008c\u0005numpy\u0094\u008c\u0007ndarray\u0094\u0093\u0094K\u0000\u0085\u0094C\u0001b\u0094\u0087\u0094R\u0094(K\u0001K\t\u0085\u0094h\u001c\u008c\u0005dtype\u0094\u0093\u0094\u008c\u0002O8\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003\u008c\u0001|\u0094NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK?t\u0094b\u0089]\u0094(\u008c\u000bCOMPRESSION\u0094\u008c\tEVAPORITE\u0094\u008c\tEXTENSION\u0094\u008c\u0007GRAVITY\u0094\u008c\tINVERSION\u0094\u008c\u000bSTRIKE-SLIP\u0094\u008c\rTRANSPRESSION\u0094\u008c\fTRANSTENSION\u0094\u008c\u0006UPLIFT\u0094et\u0094b\u008c\u0007n_iter_\u0094h\u001bh\u001eK\u0000\u0085\u0094h \u0087\u0094R\u0094(K\u0001K\u0001\u0085\u0094h%\u008c\u0002i4\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003\u008c\u0001<\u0094NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK\u0000t\u0094b\u0089C\u0004+\u0000\u0000\u0000\u0094t\u0094b\u008c\u0005coef_\u0094h\u001bh\u001eK\u0000\u0085\u0094h \u0087\u0094R\u0094(K\u0001K\tK\u0001\u0086\u0094h%\u008c\u0002f8\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003h>NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK\u0000t\u0094b\u0089CH\u00a9\u0093\n\u00a1b\u001ak?\u00fc\u00a5\u00ad\u00e1\u00a2\u0001P\u00bf\u00f4G\u00ab\u00dd\u00f4\u00cc{?e.QC-\u00bau?\u001f\u00bd\u00acb&>t?j\u0092\u00f9\u0092\u00f0aw?1\u00cb\u00967N\u00dc\u0092\u00bfk\u0000\u00ac\u00d1\u00a4`\u007f?\u00a1\u0096\u0006\u00d1\u00b6Q\u008d\u00bf\u0094t\u0094b\u008c\nintercept_\u0094h\u001bh\u001eK\u0000\u0085\u0094h \u0087\u0094R\u0094(K\u0001K\t\u0085\u0094hI\u0089CH'\u009cX\u008cG&\u0007@\u009a\u0011\u00e6\u00e9\u00feR\u0002\u00c0\u008f\u0085\u00f3\u00df\u00d5\u00da\u00fa?\u00df\u00ee\u00a1\u00e4\u0001#\u00f7?\u00e5\u00e1\b\u00b5\u009c\u000b\u00fb?x\u000b1Q\u0088\u00c8\u00d5?`r\u001f\u0006\u00b2\u00c2\u00f1\u00bf%B\u0006\u00c9=\u0002\u0006\u00c0\u00e0\u00b6\u00a3:\u00faZ\u00fe\u00bf\u0094t\u0094b\u008c\u0010_sklearn_version\u0094\u008c\u00051.0.2\u0094ub.", "classifier": "LogisticRegression", "serialization": "pickle"}, "Node1": {"hybcprob": {"['COMPRESSION']": {"variance": 53.87084286837278, "mean": 15.428971962616822, "coef": []}, "['EVAPORITE']": {"variance": 0.0, "mean": 27.0, "coef": []}, "['EXTENSION']": {"variance": 49.697271531886926, "mean": 18.712820512820514, "coef": []}, "['GRAVITY']": {"variance": 45.05489795918368, "mean": 21.12857142857143, "coef": []}, "['INVERSION']": {"variance": 46.653795918367344, "mean": 21.314285714285713, "coef": []}, "['STRIKE-SLIP']": {"variance": 46.55263157894737, "mean": 20.5, "coef": []}, "['TRANSPRESSION']": {"variance": 2.25, "mean": 18.5, "coef": []}, "['TRANSTENSION']": {"variance": 0.0, "mean": 25.0, "coef": []}, "['UPLIFT']": {"variance": 0.0, "mean": 
10.0, "coef": []}}}, "Node4": {"classes": ["ARCHEAN", "CAMBRIAN", "CAMBRIAN-ORDOVICIAN", "CAMBRIAN-ORDOVICIAN/CARBONIFEROUS", "CARBONIFEROUS", "CARBONIFEROUS-CRETACEOUS", "CARBONIFEROUS-PERMIAN", "CRETACEOUS", "CRETACEOUS-PALEOGENE", "DEVONIAN", "DEVONIAN-CARBONIFEROUS", "DEVONIAN-PERMIAN", "JURASSIC", "JURASSIC-CRETACEOUS", "MESOZOIC", "NEOGENE", "ORDOVICIAN", "PALEOGENE", "PALEOGENE-NEOGENE", "PALEOZOIC", "PALEOZOIC-CRETACEOUS", "PERMIAN", "PERMIAN-TRIASSIC", "PROTEROZOIC", "PROTEROZOIC-CAMBRIAN", "SILURIAN", "TRIASSIC", "TRIASSIC-JURASSIC"], "classifier_obj": "\u0080\u0004\u0095N\u0006\u0000\u0000\u0000\u0000\u0000\u0000\u008c\u001esklearn.linear_model._logistic\u0094\u008c\u0012LogisticRegression\u0094\u0093\u0094)\u0081\u0094}\u0094(\u008c\u0007penalty\u0094\u008c\u0002l2\u0094\u008c\u0004dual\u0094\u0089\u008c\u0003tol\u0094G?\u001a6\u00e2\u00eb\u001cC-\u008c\u0001C\u0094G?\u00f0\u0000\u0000\u0000\u0000\u0000\u0000\u008c\rfit_intercept\u0094\u0088\u008c\u0011intercept_scaling\u0094K\u0001\u008c\fclass_weight\u0094N\u008c\frandom_state\u0094N\u008c\u0006solver\u0094\u008c\tnewton-cg\u0094\u008c\bmax_iter\u0094Kd\u008c\u000bmulti_class\u0094\u008c\u000bmultinomial\u0094\u008c\u0007verbose\u0094K\u0000\u008c\nwarm_start\u0094\u0089\u008c\u0006n_jobs\u0094N\u008c\bl1_ratio\u0094N\u008c\u000en_features_in_\u0094K\u0001\u008c\bclasses_\u0094\u008c\u0015numpy.core.multiarray\u0094\u008c\f_reconstruct\u0094\u0093\u0094\u008c\u0005numpy\u0094\u008c\u0007ndarray\u0094\u0093\u0094K\u0000\u0085\u0094C\u0001b\u0094\u0087\u0094R\u0094(K\u0001K\u001c\u0085\u0094h\u001c\u008c\u0005dtype\u0094\u0093\u0094\u008c\u0002O8\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003\u008c\u0001|\u0094NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK?t\u0094b\u0089]\u0094(\u008c\u0007ARCHEAN\u0094\u008c\bCAMBRIAN\u0094\u008c\u0013CAMBRIAN-ORDOVICIAN\u0094\u008c!CAMBRIAN-ORDOVICIAN/CARBONIFEROUS\u0094\u008c\rCARBONIFEROUS\u0094\u008c\u0018CARBONIFEROUS-CRETACEOUS\u0094\u008c\u0015CARBONIFEROUS-PERMIAN\u0094\u008c\nCRETACEOUS\u0094\u008c\u0014CRETACEOUS-PALEOGENE\u0094\u008c\bDEVONIAN\u0094\u008c\u0016DEVONIAN-CARBONIFEROUS\u0094\u008c\u0010DEVONIAN-PERMIAN\u0094\u008c\bJURASSIC\u0094\u008c\u0013JURASSIC-CRETACEOUS\u0094\u008c\bMESOZOIC\u0094\u008c\u0007NEOGENE\u0094\u008c\nORDOVICIAN\u0094\u008c\tPALEOGENE\u0094\u008c\u0011PALEOGENE-NEOGENE\u0094\u008c\tPALEOZOIC\u0094\u008c\u0014PALEOZOIC-CRETACEOUS\u0094\u008c\u0007PERMIAN\u0094\u008c\u0010PERMIAN-TRIASSIC\u0094\u008c\u000bPROTEROZOIC\u0094\u008c\u0014PROTEROZOIC-CAMBRIAN\u0094\u008c\bSILURIAN\u0094\u008c\bTRIASSIC\u0094\u008c\u0011TRIASSIC-JURASSIC\u0094et\u0094b\u008c\u0007n_iter_\u0094h\u001bh\u001eK\u0000\u0085\u0094h \u0087\u0094R\u0094(K\u0001K\u0001\u0085\u0094h%\u008c\u0002i4\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003\u008c\u0001<\u0094NNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK\u0000t\u0094b\u0089C\u0004!\u0000\u0000\u0000\u0094t\u0094b\u008c\u0005coef_\u0094h\u001bh\u001eK\u0000\u0085\u0094h 
\u0087\u0094R\u0094(K\u0001K\u001cK\u0001\u0086\u0094h%\u008c\u0002f8\u0094\u0089\u0088\u0087\u0094R\u0094(K\u0003hQNNNJ\u00ff\u00ff\u00ff\u00ffJ\u00ff\u00ff\u00ff\u00ffK\u0000t\u0094b\u0089C\u00e0LS1\u00b9zE\u00ec\u00bf\u00d1\u00c7\u00bf\u0092Y\u00c5\u00c2\u00bf\u00a6.\u0085\u00aen\u0001\u00dd\u00bf\u0016\u0099\u00f0\u00e0'\u0002\u00d2?\u00c3o\u00ed\u000e\u00f0Z\u00b3?\u00daFH.\u0082\u00c7\u0091?\u00bds\u0099\u00fe\u00a5#\u00bb?\u0093?\u00b0\u00aflW\u00d2?\u00a6\u008a.\u0096\u007f\u00aa\u00d6?\u00bf7|:\u00be\u0095\u00ab\u00bf\u00eb\u0017\u00de\u00edN&\u00e1\u00bf\u00d3S\u00e6 \u00d8H\u00b6\u00bf}\u001b\u0088VKS\u00ce?z\u00b8\u0090;\u00f74\u0092?oK\u001f\u00c4v\u0012\u00a9?\u008f\u00a9\u00f8s\u00b4\u00a3\u00da?\u00ffS)+v\u0092\u00d6\u00bf]\u00ea\u00af&\u001a~\u00d3?\u0094\u00f2\u00e1M\u00c1\u0092\u00d2?\u00fb\u00de\u00da?\u001a\u0090\u00c1?z\u0092\u009d\u00c3\u00e2\u0096\u00c9?\u00e5m\u0098T\u00ed(\u00a6?\u00e26