From 33ad9be7be21d9c621390c894bfd9ccbdb4856ac Mon Sep 17 00:00:00 2001 From: jrzkaminski <86363785+jrzkaminski@users.noreply.github.com> Date: Tue, 11 Jul 2023 13:27:08 +0300 Subject: [PATCH] Code reformatting to black style --- README.rst | 5 + bamt/builders/__init__.py | 5 +- bamt/builders/builders_base.py | 113 +- bamt/builders/evo_builder.py | 162 +-- bamt/builders/hc_builder.py | 155 +-- bamt/config.py | 22 +- .../pyBN/classes/_tests/test_bayesnet.py | 160 +-- bamt/external/pyBN/classes/bayesnet.py | 68 +- .../utils/_tests/test_independence_tests.py | 26 +- .../pyBN/utils/_tests/test_markov_blanket.py | 26 +- .../pyBN/utils/_tests/test_orient_edges.py | 20 +- .../pyBN/utils/_tests/test_random_sample.py | 17 +- .../external/pyBN/utils/independence_tests.py | 12 +- bamt/log.py | 27 +- bamt/mi_entropy_gauss.py | 100 +- bamt/networks/__init__.py | 11 +- bamt/networks/base.py | 505 +++++---- bamt/networks/big_brave_bn.py | 15 +- bamt/networks/continuous_bn.py | 4 +- bamt/networks/discrete_bn.py | 4 +- bamt/networks/hybrid_bn.py | 15 +- bamt/nodes/__init__.py | 16 +- bamt/nodes/base.py | 41 +- bamt/nodes/conditional_gaussian_node.py | 124 ++- bamt/nodes/conditional_logit_node.py | 84 +- .../conditional_mixture_gaussian_node.py | 90 +- bamt/nodes/discrete_node.py | 43 +- bamt/nodes/gaussian_node.py | 77 +- bamt/nodes/logit_node.py | 47 +- bamt/nodes/mixture_gaussian_node.py | 74 +- bamt/preprocess/discretization.py | 60 +- bamt/preprocess/graph.py | 3 +- bamt/preprocess/numpy_pandas.py | 20 +- bamt/preprocessors.py | 59 +- bamt/redef_HC.py | 130 +-- bamt/redef_info_scores.py | 70 +- bamt/utils/EvoUtils.py | 36 +- bamt/utils/GraphUtils.py | 41 +- bamt/utils/MathUtils.py | 77 +- tests/BigbraveBNTest.py | 26 +- tests/LoadBN.py | 27 +- tests/MainTest.py | 70 +- tests/MetricsTest.py | 34 +- tests/NetworksTest.py | 405 ++++---- tests/SaveBN.py | 28 +- tests/main.py | 10 +- tests/sendingClassifiersLogit.py | 32 +- tests/sendingRegressors.py | 32 +- tests/test_builders.py | 837 +++++++++++---- tests/test_networks.py | 973 ++++++++++++++++-- tests/test_nodes.py | 125 +-- 51 files changed, 3279 insertions(+), 1884 deletions(-) diff --git a/README.rst b/README.rst index cd777fa..ab0ccd6 100644 --- a/README.rst +++ b/README.rst @@ -18,6 +18,8 @@ - | |license| * - stats - | |downloads_stats| |downloads_monthly| |downloads_weekly| + * - style + - | |Black| Repository of a data modeling and analysis tool based on Bayesian networks @@ -241,3 +243,6 @@ Citation .. |coverage| image:: https://codecov.io/github/aimclub/BAMT/branch/master/graph/badge.svg?token=fA4qsxGqTC :target: https://codecov.io/github/aimclub/BAMT + +.. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg +.. 
_Black: https://github.com/psf/black diff --git a/bamt/builders/__init__.py b/bamt/builders/__init__.py index a05b821..d886ad6 100644 --- a/bamt/builders/__init__.py +++ b/bamt/builders/__init__.py @@ -1,4 +1 @@ -__all__ = ["builders_base", - "evo_builder", - "hc_builder" - ] +__all__ = ["builders_base", "evo_builder", "hc_builder"] diff --git a/bamt/builders/builders_base.py b/bamt/builders/builders_base.py index 8b61b97..510324e 100644 --- a/bamt/builders/builders_base.py +++ b/bamt/builders/builders_base.py @@ -35,29 +35,31 @@ def __init__(self, descriptor: Dict[str, Dict[str, str]]): Attributes: black_list: a list with restricted connections; """ - self.skeleton = {'V': [], - 'E': []} + self.skeleton = {"V": [], "E": []} self.descriptor = descriptor self.has_logit = bool self.black_list = None - def restrict(self, data: DataFrame, - init_nodes: Optional[List[str]], - bl_add: Optional[List[str]]): + def restrict( + self, + data: DataFrame, + init_nodes: Optional[List[str]], + bl_add: Optional[List[str]], + ): """ :param data: data to deal with :param init_nodes: nodes to begin with (thus they have no parents) :param bl_add: additional vertices """ - node_type = self.descriptor['types'] + node_type = self.descriptor["types"] blacklist = [] datacol = data.columns.to_list() if not self.has_logit: # Has_logit flag allows BN building edges between cont and disc - RESTRICTIONS = [('cont', 'disc'), ('cont', 'disc_num')] + RESTRICTIONS = [("cont", "disc"), ("cont", "disc_num")] for x, y in itertools.product(datacol, repeat=2): if x != y: if (node_type[x], node_type[y]) in RESTRICTIONS: @@ -65,8 +67,7 @@ def restrict(self, data: DataFrame, else: self.black_list = [] if init_nodes: - blacklist += [(x, y) - for x in datacol for y in init_nodes if x != y] + blacklist += [(x, y) for x in datacol for y in init_nodes if x != y] if bl_add: blacklist = blacklist + bl_add self.black_list = blacklist @@ -75,17 +76,17 @@ def get_family(self): """ A function that updates a skeleton; """ - if not self.skeleton['V']: + if not self.skeleton["V"]: logger_builder.error("Vertex list is None") return None - if not self.skeleton['E']: + if not self.skeleton["E"]: logger_builder.error("Edges list is None") return None - for node_instance in self.skeleton['V']: + for node_instance in self.skeleton["V"]: node = node_instance.name children = [] parents = [] - for edge in self.skeleton['E']: + for edge in self.skeleton["E"]: if node in edge: if edge.index(node) == 0: children.append(edge[1]) @@ -95,20 +96,20 @@ def get_family(self): disc_parents = [] cont_parents = [] for parent in parents: - if self.descriptor['types'][parent] in ['disc', 'disc_num']: + if self.descriptor["types"][parent] in ["disc", "disc_num"]: disc_parents.append(parent) else: cont_parents.append(parent) - id = self.skeleton['V'].index(node_instance) - self.skeleton['V'][id].disc_parents = disc_parents - self.skeleton['V'][id].cont_parents = cont_parents - self.skeleton['V'][id].children = children + id = self.skeleton["V"].index(node_instance) + self.skeleton["V"][id].disc_parents = disc_parents + self.skeleton["V"][id].cont_parents = cont_parents + self.skeleton["V"][id].children = children - ordered = gru.toporder(self.skeleton['V'], self.skeleton['E']) - not_ordered = [node.name for node in self.skeleton['V']] + ordered = gru.toporder(self.skeleton["V"], self.skeleton["E"]) + not_ordered = [node.name for node in self.skeleton["V"]] mask = [not_ordered.index(name) for name in ordered] - self.skeleton['V'] = [self.skeleton['V'][i] for i in mask] + 
self.skeleton["V"] = [self.skeleton["V"][i] for i in mask] class VerticesDefiner(StructureBuilder): @@ -116,8 +117,9 @@ class VerticesDefiner(StructureBuilder): Main class for defining vertices """ - def __init__(self, descriptor: Dict[str, Dict[str, str]], - regressor: Optional[object]): + def __init__( + self, descriptor: Dict[str, Dict[str, str]], regressor: Optional[object] + ): """ Automatically creates a list of nodes """ @@ -127,10 +129,10 @@ def __init__(self, descriptor: Dict[str, Dict[str, str]], node = None # LEVEL 1: Define a general type of node: Discrete or Gaussian - for vertex, type in self.descriptor['types'].items(): - if type in ['disc_num', 'disc']: + for vertex, type in self.descriptor["types"].items(): + if type in ["disc_num", "disc"]: node = DiscreteNode(name=vertex) - elif type == 'cont': + elif type == "cont": node = GaussianNode(name=vertex, regressor=regressor) else: msg = f"""First stage of automatic vertex detection failed on {vertex} due TypeError ({type}). @@ -141,11 +143,12 @@ def __init__(self, descriptor: Dict[str, Dict[str, str]], self.vertices.append(node) def overwrite_vertex( - self, - has_logit: bool, - use_mixture: bool, - classifier: Optional[Callable], - regressor: Optional[Callable]): + self, + has_logit: bool, + use_mixture: bool, + classifier: Optional[Callable], + regressor: Optional[Callable], + ): """ Level 2: Redefined nodes according structure (parents) :param classifier: an object to pass into logit, condLogit nodes @@ -156,42 +159,43 @@ def overwrite_vertex( for node_instance in self.vertices: node = node_instance if has_logit: - if 'Discrete' in node_instance.type: + if "Discrete" in node_instance.type: if node_instance.cont_parents: if not node_instance.disc_parents: node = LogitNode( - name=node_instance.name, classifier=classifier) + name=node_instance.name, classifier=classifier + ) elif node_instance.disc_parents: node = ConditionalLogitNode( - name=node_instance.name, classifier=classifier) + name=node_instance.name, classifier=classifier + ) if use_mixture: - if 'Gaussian' in node_instance.type: + if "Gaussian" in node_instance.type: if not node_instance.disc_parents: - node = MixtureGaussianNode( - name=node_instance.name) + node = MixtureGaussianNode(name=node_instance.name) elif node_instance.disc_parents: - node = ConditionalMixtureGaussianNode( - name=node_instance.name) + node = ConditionalMixtureGaussianNode(name=node_instance.name) else: continue else: - if 'Gaussian' in node_instance.type: + if "Gaussian" in node_instance.type: if node_instance.disc_parents: node = ConditionalGaussianNode( - name=node_instance.name, regressor=regressor) + name=node_instance.name, regressor=regressor + ) else: continue if node_instance == node: continue - id = self.skeleton['V'].index(node_instance) + id = self.skeleton["V"].index(node_instance) node.disc_parents = node_instance.disc_parents node.cont_parents = node_instance.cont_parents node.children = node_instance.children - self.skeleton['V'][id] = node + self.skeleton["V"][id] = node class EdgesDefiner(StructureBuilder): @@ -200,15 +204,20 @@ def __init__(self, descriptor: Dict[str, Dict[str, str]]): class BaseDefiner(VerticesDefiner, EdgesDefiner): - def __init__(self, data: DataFrame, descriptor: Dict[str, Dict[str, str]], - scoring_function: Union[Tuple[str, Callable], Tuple[str]], - regressor: Optional[object] = None): - + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + scoring_function: Union[Tuple[str, Callable], Tuple[str]], + regressor: 
Optional[object] = None, + ): self.scoring_function = scoring_function - self.params = {'init_edges': None, - 'init_nodes': None, - 'remove_init_edges': True, - 'white_list': None, - 'bl_add': None} + self.params = { + "init_edges": None, + "init_nodes": None, + "remove_init_edges": True, + "white_list": None, + "bl_add": None, + } super().__init__(descriptor, regressor=regressor) self.optimizer = None # will be defined in subclasses diff --git a/bamt/builders/evo_builder.py b/bamt/builders/evo_builder.py index 93dc2e9..e4b1f4b 100644 --- a/bamt/builders/evo_builder.py +++ b/bamt/builders/evo_builder.py @@ -23,9 +23,13 @@ class EvoDefiner(BaseDefiner): """ Object that might take additional methods to decompose structure builder class """ - def __init__(self, data: DataFrame, descriptor: Dict[str, Dict[str, str]], - regressor: Optional[object] = None): + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + regressor: Optional[object] = None, + ): super().__init__(data, descriptor, regressor) @@ -41,29 +45,30 @@ class EvoStructureBuilder(EvoDefiner): has_logit (bool): Indicates whether a logit link function should be used. use_mixture (bool): Indicates whether a mixture model should be used. """ + def __init__( - self, - data: DataFrame, - descriptor: Dict[str, Dict[str, str]], - regressor: Optional[object], - has_logit: bool, - use_mixture: bool): - super( - EvoStructureBuilder, - self).__init__( - data=data, - descriptor=descriptor, - regressor=regressor) + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + regressor: Optional[object], + has_logit: bool, + use_mixture: bool, + ): + super(EvoStructureBuilder, self).__init__( + data=data, descriptor=descriptor, regressor=regressor + ) self.data = data self.descriptor = descriptor self.has_logit = has_logit self.use_mixture = use_mixture self.regressor = regressor - self.params = {'init_edges': None, - 'init_nodes': None, - 'remove_init_edges': True, - 'white_list': None, - 'bl_add': None} + self.params = { + "init_edges": None, + "init_nodes": None, + "remove_init_edges": True, + "white_list": None, + "bl_add": None, + } self.default_n_jobs = -1 self.default_pop_size = 15 self.default_crossover_prob = 0.9 @@ -72,37 +77,43 @@ def __init__( self.default_max_depth = 100 self.default_timeout = 180 self.objective_metric = evo.K2_metric - self.default_crossovers = [CrossoverTypesEnum.exchange_edges, - CrossoverTypesEnum.exchange_parents_one, - CrossoverTypesEnum.exchange_parents_both] + self.default_crossovers = [ + CrossoverTypesEnum.exchange_edges, + CrossoverTypesEnum.exchange_parents_one, + CrossoverTypesEnum.exchange_parents_both, + ] self.default_mutations = [ evo.custom_mutation_add, evo.custom_mutation_delete, - evo.custom_mutation_reverse] + evo.custom_mutation_reverse, + ] self.default_selection = [SelectionTypesEnum.tournament] self.default_constraints = [ has_no_self_cycled_nodes, has_no_cycle, - evo.has_no_duplicates] - - def build(self, - data: DataFrame, - classifier: Optional[object], - regressor: Optional[object], - **kwargs): + evo.has_no_duplicates, + ] + + def build( + self, + data: DataFrame, + classifier: Optional[object], + regressor: Optional[object], + **kwargs + ): """ - Builds the structure of a Bayesian network from the given data using an evolutionary algorithm. + Builds the structure of a Bayesian network from the given data using an evolutionary algorithm. - Args: - data (DataFrame): The data from which to build the structure. 
- classifier (Optional[object]): A classification model for discrete nodes. - regressor (Optional[object]): A regression model for continuous nodes. + Args: + data (DataFrame): The data from which to build the structure. + classifier (Optional[object]): A classification model for discrete nodes. + regressor (Optional[object]): A regression model for continuous nodes. - Additional optional parameters can be provided as keyword arguments (kwargs) to customize the evolutionary - algorithm used to generate the structure. These include parameters to control the size of the population, - the probabilities of crossover and mutation, constraints on the structure of the graph, and many others. + Additional optional parameters can be provided as keyword arguments (kwargs) to customize the evolutionary + algorithm used to generate the structure. These include parameters to control the size of the population, + the probabilities of crossover and mutation, constraints on the structure of the graph, and many others. - The resulting structure is stored in the `skeleton` attribute of the `EvoStructureBuilder` object. + The resulting structure is stored in the `skeleton` attribute of the `EvoStructureBuilder` object. """ # Get the list of node names nodes_types = data.columns.to_list() @@ -111,54 +122,58 @@ def build(self, initial = [ evo.CustomGraphModel( nodes=kwargs.get( - 'init_nodes', [ - evo.CustomGraphNode(node_type) for node_type in nodes_types]))] + "init_nodes", + [evo.CustomGraphNode(node_type) for node_type in nodes_types], + ) + ) + ] # Define the requirements for the evolutionary algorithm requirements = GraphRequirements( - max_arity=kwargs.get( - 'max_arity', self.default_max_arity), max_depth=kwargs.get( - 'max_depth', self.default_max_depth), - timeout=timedelta( - minutes=kwargs.get( - 'timeout', self.default_timeout)), - n_jobs=kwargs.get('n_jobs', self.default_n_jobs)) + max_arity=kwargs.get("max_arity", self.default_max_arity), + max_depth=kwargs.get("max_depth", self.default_max_depth), + timeout=timedelta(minutes=kwargs.get("timeout", self.default_timeout)), + n_jobs=kwargs.get("n_jobs", self.default_n_jobs), + ) # Set the parameters for the evolutionary algorithm optimizer_parameters = GPAlgorithmParameters( - pop_size=kwargs.get('pop_size', self.default_pop_size), - crossover_prob=kwargs.get('crossover_prob', self.default_crossover_prob), - mutation_prob=kwargs.get('mutation_prob', self.default_mutation_prob), + pop_size=kwargs.get("pop_size", self.default_pop_size), + crossover_prob=kwargs.get("crossover_prob", self.default_crossover_prob), + mutation_prob=kwargs.get("mutation_prob", self.default_mutation_prob), genetic_scheme_type=GeneticSchemeTypesEnum.steady_state, - mutation_types=kwargs.get('custom_mutations', self.default_mutations), - crossover_types=kwargs.get('custom_crossovers', self.default_crossovers), - selection_types=kwargs.get('selection_type', self.default_selection)) + mutation_types=kwargs.get("custom_mutations", self.default_mutations), + crossover_types=kwargs.get("custom_crossovers", self.default_crossovers), + selection_types=kwargs.get("selection_type", self.default_selection), + ) # Set the adapter for the conversion between the graph and the data # structures used by the optimizer adapter = DirectAdapter( - base_graph_class=evo.CustomGraphModel, - base_node_class=evo.CustomGraphNode) + base_graph_class=evo.CustomGraphModel, base_node_class=evo.CustomGraphNode + ) # Set the constraints for the graph - constraints = kwargs.get('custom_constraints', []) + 
constraints = kwargs.get("custom_constraints", []) constraints.extend(self.default_constraints) - if kwargs.get('blacklist', None) is not None: + if kwargs.get("blacklist", None) is not None: constraints.append(evo.has_no_blacklist_edges) - if kwargs.get('whitelist', None) is not None: + if kwargs.get("whitelist", None) is not None: constraints.append(evo.has_only_whitelist_edges) graph_generation_params = GraphGenerationParams( adapter=adapter, rules_for_constraint=constraints, - available_node_types=nodes_types) + available_node_types=nodes_types, + ) # Define the objective function to optimize - objective = Objective({'custom': kwargs.get( - 'custom_metric', self.objective_metric)}) + objective = Objective( + {"custom": kwargs.get("custom_metric", self.objective_metric)} + ) # Initialize the optimizer optimizer = EvoGraphOptimizer( @@ -166,11 +181,11 @@ def build(self, initial_graphs=initial, requirements=requirements, graph_generation_params=graph_generation_params, - graph_optimizer_params=optimizer_parameters) + graph_optimizer_params=optimizer_parameters, + ) # Define the function to evaluate the objective function - objective_eval = ObjectiveEvaluate( - objective, data=data) + objective_eval = ObjectiveEvaluate(objective, data=data) # Run the optimization optimized_graph = optimizer.optimise(objective_eval)[0] @@ -180,16 +195,17 @@ def build(self, best_graph_edge_list = self._convert_to_strings(best_graph_edge_list) # Convert the best graph to the format used by the Bayesian Network - self.skeleton['V'] = self.vertices - self.skeleton['E'] = best_graph_edge_list + self.skeleton["V"] = self.vertices + self.skeleton["E"] = best_graph_edge_list self.get_family() - self.overwrite_vertex(has_logit=self.has_logit, - use_mixture=self.use_mixture, - classifier=classifier, - regressor=regressor) + self.overwrite_vertex( + has_logit=self.has_logit, + use_mixture=self.use_mixture, + classifier=classifier, + regressor=regressor, + ) @staticmethod def _convert_to_strings(nested_list): - return [[str(item) for item in inner_list] - for inner_list in nested_list] + return [[str(item) for item in inner_list] for inner_list in nested_list] diff --git a/bamt/builders/hc_builder.py b/bamt/builders/hc_builder.py index 0a5e80e..f4fdbb2 100644 --- a/bamt/builders/hc_builder.py +++ b/bamt/builders/hc_builder.py @@ -16,21 +16,24 @@ class HillClimbDefiner(BaseDefiner): Object to define structure and pass it into skeleton """ - def __init__(self, data: DataFrame, descriptor: Dict[str, Dict[str, str]], - scoring_function: Union[Tuple[str, Callable], Tuple[str]], - regressor: Optional[object] = None): - + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + scoring_function: Union[Tuple[str, Callable], Tuple[str]], + regressor: Optional[object] = None, + ): super().__init__(data, descriptor, scoring_function, regressor) self.optimizer = HillClimbSearch(data) - def apply_K2(self, - data: DataFrame, - init_edges: Optional[List[Tuple[str, - str]]], - progress_bar: bool, - remove_init_edges: bool, - white_list: Optional[List[Tuple[str, - str]]]): + def apply_K2( + self, + data: DataFrame, + init_edges: Optional[List[Tuple[str, str]]], + progress_bar: bool, + remove_init_edges: bool, + white_list: Optional[List[Tuple[str, str]]], + ): """ :param init_edges: list of tuples, a graph to start learning with :param remove_init_edges: allows changes in a model defined by user @@ -38,14 +41,15 @@ def apply_K2(self, :param progress_bar: verbose regime :param white_list: list of allowed edges 
""" - if not all([i in ['disc', 'disc_num'] - for i in gru.nodes_types(data).values()]): + if not all([i in ["disc", "disc_num"] for i in gru.nodes_types(data).values()]): logger_builder.error( - f"K2 deals only with discrete data. Continuous data: {[col for col, type in gru.nodes_types(data).items() if type not in ['disc', 'disc_num']]}") + f"K2 deals only with discrete data. Continuous data: {[col for col, type in gru.nodes_types(data).items() if type not in ['disc', 'disc_num']]}" + ) return None if len(self.scoring_function) != 2: from pgmpy.estimators import K2Score + scoring_function = K2Score else: scoring_function = self.scoring_function[1] @@ -55,10 +59,9 @@ def apply_K2(self, scoring_method=scoring_function(data), black_list=self.black_list, white_list=white_list, - show_progress=progress_bar + show_progress=progress_bar, ) else: - if remove_init_edges: startdag = DAG() nodes = [str(v) for v in self.vertices] @@ -68,25 +71,27 @@ def apply_K2(self, black_list=self.black_list, white_list=white_list, start_dag=startdag, - show_progress=False) + show_progress=False, + ) else: best_model = self.optimizer.estimate( black_list=self.black_list, white_list=white_list, fixed_edges=init_edges, - show_progress=False) + show_progress=False, + ) structure = [list(x) for x in list(best_model.edges())] - self.skeleton['E'] = structure - - def apply_group1(self, - data: DataFrame, - progress_bar: bool, - init_edges: Optional[List[Tuple[str, - str]]], - remove_init_edges: bool, - white_list: Optional[List[Tuple[str, - str]]]): + self.skeleton["E"] = structure + + def apply_group1( + self, + data: DataFrame, + progress_bar: bool, + init_edges: Optional[List[Tuple[str, str]]], + remove_init_edges: bool, + white_list: Optional[List[Tuple[str, str]]], + ): """ This method implements the group of scoring functions. Group: @@ -95,24 +100,24 @@ def apply_group1(self, "BIC" - Bayesian Information Criteria, "AIC" - Akaike information Criteria. 
""" - column_name_dict = dict([(n.name, i) - for i, n in enumerate(self.vertices)]) + column_name_dict = dict([(n.name, i) for i, n in enumerate(self.vertices)]) blacklist_new = [] for pair in self.black_list: - blacklist_new.append( - (column_name_dict[pair[0]], column_name_dict[pair[1]])) + blacklist_new.append((column_name_dict[pair[0]], column_name_dict[pair[1]])) if white_list: white_list_old = white_list[:] white_list = [] for pair in white_list_old: white_list.append( - (column_name_dict[pair[0]], column_name_dict[pair[1]])) + (column_name_dict[pair[0]], column_name_dict[pair[1]]) + ) if init_edges: init_edges_old = init_edges[:] init_edges = [] for pair in init_edges_old: init_edges.append( - (column_name_dict[pair[0]], column_name_dict[pair[1]])) + (column_name_dict[pair[0]], column_name_dict[pair[1]]) + ) bn = hc_method( data, @@ -121,14 +126,23 @@ def apply_group1(self, init_edges=init_edges, remove_geo_edges=remove_init_edges, black_list=blacklist_new, - debug=progress_bar) + debug=progress_bar, + ) structure = [] nodes = sorted(list(bn.nodes())) for rv in nodes: - for pa in bn.F[rv]['parents']: - structure.append([list(column_name_dict.keys())[list(column_name_dict.values()).index( - pa)], list(column_name_dict.keys())[list(column_name_dict.values()).index(rv)]]) - self.skeleton['E'] = structure + for pa in bn.F[rv]["parents"]: + structure.append( + [ + list(column_name_dict.keys())[ + list(column_name_dict.values()).index(pa) + ], + list(column_name_dict.keys())[ + list(column_name_dict.values()).index(rv) + ], + ] + ) + self.skeleton["E"] = structure class HCStructureBuilder(HillClimbDefiner): @@ -136,55 +150,60 @@ class HCStructureBuilder(HillClimbDefiner): Final object with build method """ - def __init__(self, data: DataFrame, - descriptor: Dict[str, Dict[str, str]], - scoring_function: Tuple[str, Callable], - regressor: Optional[object], - has_logit: bool, use_mixture: bool): + def __init__( + self, + data: DataFrame, + descriptor: Dict[str, Dict[str, str]], + scoring_function: Tuple[str, Callable], + regressor: Optional[object], + has_logit: bool, + use_mixture: bool, + ): """ :param data: train data :param descriptor: map for data """ - super( - HCStructureBuilder, - self).__init__( + super(HCStructureBuilder, self).__init__( descriptor=descriptor, data=data, scoring_function=scoring_function, - regressor=regressor) + regressor=regressor, + ) self.use_mixture = use_mixture self.has_logit = has_logit - def build(self, data: DataFrame, - progress_bar: bool, - classifier: Optional[object], - regressor: Optional[object], - params: Optional[ParamDict] = None, - **kwargs): + def build( + self, + data: DataFrame, + progress_bar: bool, + classifier: Optional[object], + regressor: Optional[object], + params: Optional[ParamDict] = None, + **kwargs, + ): if params: for param, value in params.items(): self.params[param] = value - init_nodes = self.params.pop('init_nodes') - bl_add = self.params.pop('bl_add') + init_nodes = self.params.pop("init_nodes") + bl_add = self.params.pop("bl_add") # Level 1 - self.skeleton['V'] = self.vertices + self.skeleton["V"] = self.vertices self.restrict(data, init_nodes, bl_add) - if self.scoring_function[0] == 'K2': + if self.scoring_function[0] == "K2": self.apply_K2(data=data, progress_bar=progress_bar, **self.params) - elif self.scoring_function[0] in ['MI', 'LL', 'BIC', 'AIC']: - self.apply_group1( - data=data, - progress_bar=progress_bar, - **self.params) + elif self.scoring_function[0] in ["MI", "LL", "BIC", "AIC"]: + 
self.apply_group1(data=data, progress_bar=progress_bar, **self.params) # Level 2 self.get_family() - self.overwrite_vertex(has_logit=self.has_logit, - use_mixture=self.use_mixture, - classifier=classifier, - regressor=regressor) + self.overwrite_vertex( + has_logit=self.has_logit, + use_mixture=self.use_mixture, + classifier=classifier, + regressor=regressor, + ) diff --git a/bamt/config.py b/bamt/config.py index 586f7ed..61e3860 100644 --- a/bamt/config.py +++ b/bamt/config.py @@ -3,21 +3,17 @@ config = configparser.ConfigParser() -CONFIGFILE = path.join(path.dirname(path.abspath(__file__)), 'selbst.ini') +CONFIGFILE = path.join(path.dirname(path.abspath(__file__)), "selbst.ini") if path.isfile(CONFIGFILE) and stat(CONFIGFILE).st_size != 0: config.read(CONFIGFILE) else: - open(CONFIGFILE, 'a').close() - config['NODES'] = { - 'models_storage': path.join( - path.expanduser("~"), - 'BAMT', - 'Nodes_data')} - config['LOG'] = { - "log_conf_loc": path.join( - path.dirname( - path.abspath(__file__)), - 'logging.conf')} - with open(CONFIGFILE, 'w') as configfile: + open(CONFIGFILE, "a").close() + config["NODES"] = { + "models_storage": path.join(path.expanduser("~"), "BAMT", "Nodes_data") + } + config["LOG"] = { + "log_conf_loc": path.join(path.dirname(path.abspath(__file__)), "logging.conf") + } + with open(CONFIGFILE, "w") as configfile: config.write(configfile) diff --git a/bamt/external/pyBN/classes/_tests/test_bayesnet.py b/bamt/external/pyBN/classes/_tests/test_bayesnet.py index 766a3ed..4946f8a 100644 --- a/bamt/external/pyBN/classes/_tests/test_bayesnet.py +++ b/bamt/external/pyBN/classes/_tests/test_bayesnet.py @@ -42,17 +42,11 @@ class BayesNetTestCase(unittest.TestCase): - def setUp(self): self.bn = BayesNet() - self.dpath = os.path.join( - dirname( - dirname( - dirname( - dirname(__file__)))), - 'data') - self.bn_bif = read_bn(os.path.join(self.dpath, 'cancer.bif')) - self.bn_bn = read_bn(os.path.join(self.dpath, 'cmu.bn')) + self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + self.bn_bif = read_bn(os.path.join(self.dpath, "cancer.bif")) + self.bn_bn = read_bn(os.path.join(self.dpath, "cmu.bn")) def tearDown(self): pass @@ -62,85 +56,129 @@ def test_isinstance(self): def test_V_bif(self): self.assertListEqual( - self.bn_bif.V, [ - 'Smoker', 'Pollution', 'Cancer', 'Xray', 'Dyspnoea']) + self.bn_bif.V, ["Smoker", "Pollution", "Cancer", "Xray", "Dyspnoea"] + ) def test_E_bif(self): - self.assertDictEqual(self.bn_bif.E, - {'Cancer': ['Xray', 'Dyspnoea'], - 'Dyspnoea': [], - 'Pollution': ['Cancer'], - 'Smoker': ['Cancer'], - 'Xray': []}) + self.assertDictEqual( + self.bn_bif.E, + { + "Cancer": ["Xray", "Dyspnoea"], + "Dyspnoea": [], + "Pollution": ["Cancer"], + "Smoker": ["Cancer"], + "Xray": [], + }, + ) def test_F_bif(self): - self.assertDictEqual(self.bn_bif.F, - {'Cancer': {'cpt': [0.03, 0.97, 0.05, 0.95, 0.001, 0.999, 0.02, 0.98], - 'parents': ['Pollution', 'Smoker'], - 'values': ['True', 'False']}, - 'Dyspnoea': {'cpt': [0.65, 0.35, 0.3, 0.7], - 'parents': ['Cancer'], - 'values': ['True', 'False']}, - 'Pollution': {'cpt': [0.9, 0.1], 'parents': [], 'values': ['low', 'high']}, - 'Smoker': {'cpt': [0.3, 0.7], 'parents': [], 'values': ['True', 'False']}, - 'Xray': {'cpt': [0.9, 0.1, 0.2, 0.8], - 'parents': ['Cancer'], - 'values': ['positive', 'negative']}}) + self.assertDictEqual( + self.bn_bif.F, + { + "Cancer": { + "cpt": [0.03, 0.97, 0.05, 0.95, 0.001, 0.999, 0.02, 0.98], + "parents": ["Pollution", "Smoker"], + "values": ["True", "False"], + }, + 
"Dyspnoea": { + "cpt": [0.65, 0.35, 0.3, 0.7], + "parents": ["Cancer"], + "values": ["True", "False"], + }, + "Pollution": { + "cpt": [0.9, 0.1], + "parents": [], + "values": ["low", "high"], + }, + "Smoker": { + "cpt": [0.3, 0.7], + "parents": [], + "values": ["True", "False"], + }, + "Xray": { + "cpt": [0.9, 0.1, 0.2, 0.8], + "parents": ["Cancer"], + "values": ["positive", "negative"], + }, + }, + ) def test_V_bn(self): self.assertListEqual( - self.bn_bn.V, [ - 'Burglary', 'Earthquake', 'Alarm', 'JohnCalls', 'MaryCalls']) + self.bn_bn.V, ["Burglary", "Earthquake", "Alarm", "JohnCalls", "MaryCalls"] + ) def test_E_bn(self): - self.assertDictEqual(self.bn_bn.E, - {'Alarm': ['JohnCalls', 'MaryCalls'], - 'Burglary': ['Alarm'], - 'Earthquake': ['Alarm'], - 'JohnCalls': [], - 'MaryCalls': []}) + self.assertDictEqual( + self.bn_bn.E, + { + "Alarm": ["JohnCalls", "MaryCalls"], + "Burglary": ["Alarm"], + "Earthquake": ["Alarm"], + "JohnCalls": [], + "MaryCalls": [], + }, + ) def test_F_bn(self): - self.assertDictEqual(self.bn_bn.F, - {'Alarm': {'cpt': [0.999, 0.001, 0.71, 0.29, 0.06, 0.94, 0.05, 0.95], - 'parents': ['Earthquake', 'Burglary'], - 'values': ['No', 'Yes']}, - 'Burglary': {'cpt': [0.999, 0.001], 'parents': [], 'values': ['No', 'Yes']}, - 'Earthquake': {'cpt': [0.998, 0.002], 'parents': [], 'values': ['No', 'Yes']}, - 'JohnCalls': {'cpt': [0.95, 0.05, 0.1, 0.9], - 'parents': ['Alarm'], - 'values': ['No', 'Yes']}, - 'MaryCalls': {'cpt': [0.99, 0.01, 0.3, 0.7], - 'parents': ['Alarm'], - 'values': ['No', 'Yes']}}) + self.assertDictEqual( + self.bn_bn.F, + { + "Alarm": { + "cpt": [0.999, 0.001, 0.71, 0.29, 0.06, 0.94, 0.05, 0.95], + "parents": ["Earthquake", "Burglary"], + "values": ["No", "Yes"], + }, + "Burglary": { + "cpt": [0.999, 0.001], + "parents": [], + "values": ["No", "Yes"], + }, + "Earthquake": { + "cpt": [0.998, 0.002], + "parents": [], + "values": ["No", "Yes"], + }, + "JohnCalls": { + "cpt": [0.95, 0.05, 0.1, 0.9], + "parents": ["Alarm"], + "values": ["No", "Yes"], + }, + "MaryCalls": { + "cpt": [0.99, 0.01, 0.3, 0.7], + "parents": ["Alarm"], + "values": ["No", "Yes"], + }, + }, + ) def test_nodes(self): n = list(self.bn_bn.nodes()) self.assertListEqual( - n, ['Burglary', 'Earthquake', 'Alarm', 'JohnCalls', 'MaryCalls']) + n, ["Burglary", "Earthquake", "Alarm", "JohnCalls", "MaryCalls"] + ) def test_cpt(self): - cpt = list(self.bn_bn.cpt('Alarm')) - self.assertListEqual( - cpt, [0.999, 0.001, 0.71, 0.29, 0.06, 0.94, 0.05, 0.95]) + cpt = list(self.bn_bn.cpt("Alarm")) + self.assertListEqual(cpt, [0.999, 0.001, 0.71, 0.29, 0.06, 0.94, 0.05, 0.95]) def test_card(self): - self.assertEqual(self.bn_bn.card('Alarm'), 2) + self.assertEqual(self.bn_bn.card("Alarm"), 2) def test_scope(self): - self.assertListEqual(self.bn_bn.scope('Alarm'), - ['Alarm', 'Earthquake', 'Burglary']) + self.assertListEqual( + self.bn_bn.scope("Alarm"), ["Alarm", "Earthquake", "Burglary"] + ) def test_parents(self): - self.assertListEqual(self.bn_bn.parents('Alarm'), - ['Earthquake', 'Burglary']) + self.assertListEqual(self.bn_bn.parents("Alarm"), ["Earthquake", "Burglary"]) def test_values(self): - self.assertListEqual(self.bn_bn.values('Alarm'), ['No', 'Yes']) + self.assertListEqual(self.bn_bn.values("Alarm"), ["No", "Yes"]) def test_values_idx(self): - self.assertEqual(self.bn_bn.values('Alarm')[1], 'Yes') + self.assertEqual(self.bn_bn.values("Alarm")[1], "Yes") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main(exit=False) diff --git a/bamt/external/pyBN/classes/bayesnet.py 
b/bamt/external/pyBN/classes/bayesnet.py index 654b21b..7c48a70 100644 --- a/bamt/external/pyBN/classes/bayesnet.py +++ b/bamt/external/pyBN/classes/bayesnet.py @@ -73,6 +73,7 @@ def __init__(self, E=None, value_dict=None, file=None): """ if file is not None: import pyBN.io.read as ior + bn = ior.read_bn(file) self.V = bn.V self.E = bn.E @@ -108,9 +109,9 @@ def copy(self): F = {} for v in V: F[v] = {} - F[v]['cpt'] = deepcopy(self.F[v]['cpt']) - F[v]['parents'] = deepcopy(self.F[v]['parents']) - F[v]['values'] = deepcopy(self.F[v]['values']) + F[v]["cpt"] = deepcopy(self.F[v]["cpt"]) + F[v]["parents"] = deepcopy(self.F[v]["parents"]) + F[v]["values"] = deepcopy(self.F[v]["values"]) bn = BayesNet() bn.V = V bn.E = E @@ -120,7 +121,7 @@ def copy(self): def add_node(self, rv, cpt=[], parents=[], values=[]): self.V.append(rv) - self.F[rv] = {'cpt': cpt, 'parents': parents, 'values': values} + self.F[rv] = {"cpt": cpt, "parents": parents, "values": values} def add_edge(self, u, v): if not self.has_node(u): @@ -128,16 +129,16 @@ def add_edge(self, u, v): if not self.has_node(v): self.add_node(v) if self.has_edge(u, v): - print('Edge already exists') + print("Edge already exists") else: self.E[u].append(v) - self.F[v]['parents'].append(u) + self.F[v]["parents"].append(u) # self.V = topsort(self.E) # HOW DO I RECALCULATE CPT? def remove_edge(self, u, v): self.E[u].remove(v) - self.F[v]['parents'].remove(u) + self.F[v]["parents"].remove(u) def reverse_arc(self, u, v): if self.has_edge(u, v): @@ -145,17 +146,17 @@ def reverse_arc(self, u, v): self.E[v].append(u) def set_data(self, rv, data): - assert (isinstance(data, dict)), 'data must be dictionary' + assert isinstance(data, dict), "data must be dictionary" self.F[rv] = data def set_cpt(self, rv, cpt): - self.F[rv]['cpt'] = cpt + self.F[rv]["cpt"] = cpt def set_parents(self, rv, parents): - self.F[rv]['parents'] = parents + self.F[rv]["parents"] = parents def set_values(self, rv, values): - self.F[rv]['values'] = values + self.F[rv]["values"] = values def nodes(self): for v in self.V: @@ -187,28 +188,28 @@ def num_edges(self): def num_params(self): num = 0 for u in self.nodes(): - num += len(self.F[u]['cpt']) + num += len(self.F[u]["cpt"]) return num def scope_size(self, rv): - return len(self.F[rv]['parents']) + 1 + return len(self.F[rv]["parents"]) + 1 def num_nodes(self): return len(self.V) def cpt(self, rv): - return self.F[rv]['cpt'] + return self.F[rv]["cpt"] def card(self, rv): - return len(self.F[rv]['values']) + return len(self.F[rv]["values"]) def scope(self, rv): scope = [rv] - scope.extend(self.F[rv]['parents']) + scope.extend(self.F[rv]["parents"]) return scope def parents(self, rv): - return self.F[rv]['parents'] + return self.F[rv]["parents"] def children(self, rv): return self.E[rv] @@ -217,11 +218,11 @@ def degree(self, rv): return len(self.parents(rv)) + len(self.children(rv)) def values(self, rv): - return self.F[rv]['values'] + return self.F[rv]["values"] def value_idx(self, rv, val): try: - return self.F[rv]['values'].index(val) + return self.F[rv]["values"].index(val) except ValueError: print("Value Index Error") return -1 @@ -243,11 +244,15 @@ def flat_cpt(self, by_var=False, by_parents=False): if by_var: cpt = np.array([sum(self.cpt(rv)) for rv in self.nodes()]) elif by_parents: - cpt = np.array([sum(self.cpt(rv)[i:(i + self.card(rv))]) - for rv in self.nodes() for i in range(len(self.cpt(rv)) / self.card(rv))]) + cpt = np.array( + [ + sum(self.cpt(rv)[i : (i + self.card(rv))]) + for rv in self.nodes() + for i in 
range(len(self.cpt(rv)) / self.card(rv)) + ] + ) else: - cpt = np.array([val for rv in self.nodes() - for val in self.cpt(rv)]) + cpt = np.array([val for rv in self.nodes() for val in self.cpt(rv)]) return cpt def cpt_indices(self, target, val_dict): @@ -270,8 +275,7 @@ def cpt_indices(self, target, val_dict): key=rv,val=rv value """ - stride = dict([(n, self.stride(target, n)) - for n in self.scope(target)]) + stride = dict([(n, self.stride(target, n)) for n in self.scope(target)]) # if len(val_dict)==len(self.parents(target)): # idx = sum([self.value_idx(rv,val)*stride[rv] \ # for rv,val in val_dict.items()]) @@ -295,14 +299,14 @@ def cpt_str_idx(self, rv, idx): Parents=Val for the given idx of the given rv's cpt. """ rv_val = self.values(rv)[idx % self.card(rv)] - s = str(rv) + '=' + str(rv_val) + '|' + s = str(rv) + "=" + str(rv_val) + "|" _idx = 1 for parent in self.parents(rv): for val in self.values(parent): if idx in self.cpt_indices(rv, {rv: rv_val, parent: val}): - s += str(parent) + '=' + str(val) + s += str(parent) + "=" + str(val) if _idx < len(self.parents(rv)): - s += ',' + s += "," _idx += 1 return s @@ -345,12 +349,12 @@ def set_structure(self, edge_dict, value_dict=None): self.F = dict([(rv, {}) for rv in self.nodes()]) for rv in self.nodes(): self.F[rv] = { - 'parents': [p for p in self.nodes() if rv in self.children(p)], - 'cpt': [], - 'values': [] + "parents": [p for p in self.nodes() if rv in self.children(p)], + "cpt": [], + "values": [], } if value_dict is not None: - self.F[rv]['values'] = value_dict[rv] + self.F[rv]["values"] = value_dict[rv] def adj_list(self): """ diff --git a/bamt/external/pyBN/utils/_tests/test_independence_tests.py b/bamt/external/pyBN/utils/_tests/test_independence_tests.py index fea6d43..5d34a61 100644 --- a/bamt/external/pyBN/utils/_tests/test_independence_tests.py +++ b/bamt/external/pyBN/utils/_tests/test_independence_tests.py @@ -16,18 +16,14 @@ class ConstraintTestsTestCase(unittest.TestCase): - def setUp(self): - self.dpath = os.path.join( - dirname( - dirname( - dirname( - dirname(__file__)))), - 'data') - self.data = np.loadtxt(os.path.join(self.dpath, 'lizards.csv'), - delimiter=',', - dtype='int32', - skiprows=1) + self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + self.data = np.loadtxt( + os.path.join(self.dpath, "lizards.csv"), + delimiter=",", + dtype="int32", + skiprows=1, + ) def tearDown(self): pass @@ -39,15 +35,15 @@ def test_mi_two_vars_value_b(self): self.assertEqual(mi_test(self.data[:, (0, 2)]), 0.0014) def test_mi_two_vars_symmetry(self): - self.assertEqual( - mi_test(self.data[:, (1, 0)]), mi_test(self.data[:, (0, 1)])) + self.assertEqual(mi_test(self.data[:, (1, 0)]), mi_test(self.data[:, (0, 1)])) def test_mi_three_vars_value_a(self): self.assertEqual(mi_test(self.data), 0.0009) def test_mi_three_vars_symmetry(self): - self.assertEqual(mi_test(self.data[:, (0, 1, 2)]), mi_test( - self.data[:, (1, 0, 2)])) + self.assertEqual( + mi_test(self.data[:, (0, 1, 2)]), mi_test(self.data[:, (1, 0, 2)]) + ) def test_mi_random_three(self): np.random.seed(3636) diff --git a/bamt/external/pyBN/utils/_tests/test_markov_blanket.py b/bamt/external/pyBN/utils/_tests/test_markov_blanket.py index 27c02ac..206b89c 100644 --- a/bamt/external/pyBN/utils/_tests/test_markov_blanket.py +++ b/bamt/external/pyBN/utils/_tests/test_markov_blanket.py @@ -18,23 +18,21 @@ class ConstraintTestsTestCase(unittest.TestCase): - def setUp(self): - self.dpath = os.path.join( - dirname( - dirname( - dirname( - 
dirname(__file__)))), - 'data') - self.bn = read_bn(os.path.join(self.dpath, 'cmu.bn')) + self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + self.bn = read_bn(os.path.join(self.dpath, "cmu.bn")) def tearDown(self): pass def test_markov_blanket(self): - self.assertDictEqual(markov_blanket(self.bn), - {'Alarm': ['Earthquake', 'Burglary', 'JohnCalls', 'MaryCalls'], - 'Burglary': ['Alarm', 'Earthquake'], - 'Earthquake': ['Alarm', 'Burglary'], - 'JohnCalls': ['Alarm'], - 'MaryCalls': ['Alarm']}) + self.assertDictEqual( + markov_blanket(self.bn), + { + "Alarm": ["Earthquake", "Burglary", "JohnCalls", "MaryCalls"], + "Burglary": ["Alarm", "Earthquake"], + "Earthquake": ["Alarm", "Burglary"], + "JohnCalls": ["Alarm"], + "MaryCalls": ["Alarm"], + }, + ) diff --git a/bamt/external/pyBN/utils/_tests/test_orient_edges.py b/bamt/external/pyBN/utils/_tests/test_orient_edges.py index 457cd64..943cec9 100644 --- a/bamt/external/pyBN/utils/_tests/test_orient_edges.py +++ b/bamt/external/pyBN/utils/_tests/test_orient_edges.py @@ -16,7 +16,6 @@ class OrientEdgesTestCase(unittest.TestCase): - def setUp(self): pass @@ -26,19 +25,14 @@ def tearDown(self): def test_orient_edges_pc(self): e = {0: [1, 2], 1: [0], 2: [0]} b = {0: [], 1: {2: (0,)}, 2: {1: (0,)}} - self.assertDictEqual(orient_edges_pc(e, b), - {0: [1, 2], 1: [], 2: []}) + self.assertDictEqual(orient_edges_pc(e, b), {0: [1, 2], 1: [], 2: []}) def test_orient_edges_gs(self): e = {0: [1, 2], 1: [0], 2: [0]} b = {0: [1, 2], 1: [0], 2: [0]} - dpath = os.path.join( - dirname( - dirname( - dirname( - dirname(__file__)))), - 'data') - path = (os.path.join(dpath, 'lizards.csv')) - data = np.loadtxt(path, dtype='int32', skiprows=1, delimiter=',') - self.assertDictEqual(orient_edges_gs(e, b, data, 0.05), - {0: [1, 2], 1: [], 2: []}) + dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + path = os.path.join(dpath, "lizards.csv") + data = np.loadtxt(path, dtype="int32", skiprows=1, delimiter=",") + self.assertDictEqual( + orient_edges_gs(e, b, data, 0.05), {0: [1, 2], 1: [], 2: []} + ) diff --git a/bamt/external/pyBN/utils/_tests/test_random_sample.py b/bamt/external/pyBN/utils/_tests/test_random_sample.py index 030d3f9..7830b57 100644 --- a/bamt/external/pyBN/utils/_tests/test_random_sample.py +++ b/bamt/external/pyBN/utils/_tests/test_random_sample.py @@ -17,15 +17,9 @@ class RandomSampleTestCase(unittest.TestCase): - def setUp(self): - self.dpath = os.path.join( - dirname( - dirname( - dirname( - dirname(__file__)))), - 'data') - self.bn = read_bn(os.path.join(self.dpath, 'cancer.bif')) + self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), "data") + self.bn = read_bn(os.path.join(self.dpath, "cancer.bif")) def tearDown(self): pass @@ -33,6 +27,7 @@ def tearDown(self): def test_random_sample(self): np.random.seed(3636) sample = random_sample(self.bn, 5) - self.assertListEqual(list(sample.ravel()), - [0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, - 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0]) + self.assertListEqual( + list(sample.ravel()), + [0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0], + ) diff --git a/bamt/external/pyBN/utils/independence_tests.py b/bamt/external/pyBN/utils/independence_tests.py index c4f9efa..cc803fe 100644 --- a/bamt/external/pyBN/utils/independence_tests.py +++ b/bamt/external/pyBN/utils/independence_tests.py @@ -46,10 +46,10 @@ def mutual_information(data, conditional=False): elif len(bins) > 2 and conditional: # CHECK FOR > 3 COLUMNS -> concatenate 
Z into one column if len(bins) > 3: - data = data.astype('str') + data = data.astype("str") ncols = len(bins) for i in range(len(data)): - data[i, 2] = ''.join(data[i, 2:ncols]) + data[i, 2] = "".join(data[i, 2:ncols]) data = data.astype(np.int64)[:, 0:3] bins = np.amax(data, axis=0) @@ -76,10 +76,10 @@ def mutual_information(data, conditional=False): return round(MI, 4) elif len(bins) > 2 and conditional == False: - data = data.astype('str') + data = data.astype("str") ncols = len(bins) for i in range(len(data)): - data[i, 1] = ''.join(data[i, 1:ncols]) + data[i, 1] = "".join(data[i, 1:ncols]) data = data.astype(np.int64)[:, 0:2] hist, _ = np.histogramdd(data, bins=bins[0:2]) # frequency counts @@ -162,10 +162,10 @@ def entropy(data): else: # CHECK FOR > 3 COLUMNS -> concatenate Z into one column if cols > 3: - data = data.astype('str') + data = data.astype("str") ncols = len(bins) for i in range(len(data)): - data[i, 2] = ''.join(data[i, 2:ncols]) + data[i, 2] = "".join(data[i, 2:ncols]) data = data.astype(np.int64)[:, 0:3] bins = np.amax(data, axis=0) diff --git a/bamt/log.py b/bamt/log.py index 3f86b1f..7a44516 100644 --- a/bamt/log.py +++ b/bamt/log.py @@ -4,28 +4,27 @@ import warnings log_file_path = config.get( - 'LOG', - 'log_conf_loc', - fallback='log_conf_path is not defined') + "LOG", "log_conf_loc", fallback="log_conf_path is not defined" +) -if not os.path.isdir(os.path.join(os.path.expanduser("~"), 'BAMT')): - os.mkdir(os.path.join(os.path.expanduser("~"), 'BAMT')) +if not os.path.isdir(os.path.join(os.path.expanduser("~"), "BAMT")): + os.mkdir(os.path.join(os.path.expanduser("~"), "BAMT")) try: logging.config.fileConfig(log_file_path) except BaseException: log_file_path = os.path.join( - os.path.dirname( - os.path.abspath(__file__)), - 'logging.conf') + os.path.dirname(os.path.abspath(__file__)), "logging.conf" + ) logging.config.fileConfig(log_file_path) warnings.warn( - "Reading log path location from config file failed. Default location will be used instead.") + "Reading log path location from config file failed. Default location will be used instead." 
+ ) -logger_builder = logging.getLogger('builder') -logger_network = logging.getLogger('network') -logger_preprocessor = logging.getLogger('preprocessor') -logger_nodes = logging.getLogger('nodes') +logger_builder = logging.getLogger("builder") +logger_network = logging.getLogger("network") +logger_preprocessor = logging.getLogger("preprocessor") +logger_nodes = logging.getLogger("nodes") logging.captureWarnings(True) -logger_warnings = logging.getLogger('py.warnings') +logger_warnings = logging.getLogger("py.warnings") diff --git a/bamt/mi_entropy_gauss.py b/bamt/mi_entropy_gauss.py index 2677baa..decef55 100644 --- a/bamt/mi_entropy_gauss.py +++ b/bamt/mi_entropy_gauss.py @@ -29,13 +29,12 @@ def query_filter(data: pd.DataFrame, columns: List, values: List): None """ data_copy = copy(data) - filter_str = '`' + str(columns[0]) + '`' + ' == ' + str(values[0]) + filter_str = "`" + str(columns[0]) + "`" + " == " + str(values[0]) if len(columns) == 1: return data_copy.query(filter_str) else: for i in range(1, len(columns)): - filter_str += ' & ' + '`' + \ - str(columns[i]) + '`' + ' == ' + str(values[i]) + filter_str += " & " + "`" + str(columns[i]) + "`" + " == " + str(values[i]) data_trim = data_copy.query(filter_str) return data_trim @@ -68,14 +67,12 @@ def entropy_gauss(pd_data): elif (len(data[0]) < 2) | (data.ndim < 2): flag_row = True elif data.shape[0] < 2: - flag_row = True if isinstance(copy(data).T[0], np.float64): flag_col = True elif (len(copy(data).T) < 2) | (copy(data).T.ndim < 2): flag_col = True elif data.shape[1] < 2: - flag_col = True if flag_row & flag_col: @@ -95,7 +92,7 @@ def entropy_gauss(pd_data): return sys.float_info.min -def entropy_all(data, method='MI'): +def entropy_all(data, method="MI"): """ For one varibale, H(X) is equal to the following: -1 * sum of p(x) * log(p(x)) @@ -114,15 +111,15 @@ def entropy_all(data, method='MI'): return entropy_all(loc_to_DataFrame(data), method=method) elif isinstance(data, pd.Series): return entropy_all(pd.DataFrame(data), method) - elif (isinstance(data, pd.DataFrame)): + elif isinstance(data, pd.DataFrame): nodes_type = get_nodes_type(data) column_disc = [] for key in nodes_type: - if nodes_type[key] == 'disc': + if nodes_type[key] == "disc": column_disc.append(key) column_cont = [] for key in nodes_type: - if nodes_type[key] == 'cont': + if nodes_type[key] == "cont": column_cont.append(key) data_disc = data[column_disc] data_cont = data[column_cont] @@ -137,9 +134,9 @@ def entropy_all(data, method='MI'): comb_prob = {} for i in range(len(data_disc)): row = data_disc.iloc[i] - comb = '' + comb = "" for _, val in row.items(): - comb = comb + str(val) + ', ' + comb = comb + str(val) + ", " if comb not in dict_comb: dict_comb[comb] = row comb_prob[comb] = 1 @@ -148,24 +145,26 @@ def entropy_all(data, method='MI'): H_cond = 0.0 for key in list(dict_comb.keys()): - filtered_data = query_filter( - data, column_disc, list(dict_comb[key])) + filtered_data = query_filter(data, column_disc, list(dict_comb[key])) filtered_data = filtered_data[column_cont] if comb_prob[key] == 1: - if (method == 'BIC') | (method == 'AIC'): - H_cond += comb_prob[key] / \ - len(data_disc) * entropy_gauss(data[column_cont]) + if (method == "BIC") | (method == "AIC"): + H_cond += ( + comb_prob[key] + / len(data_disc) + * entropy_gauss(data[column_cont]) + ) else: - H_cond += comb_prob[key] / \ - len(data_disc) * sys.float_info.max + H_cond += comb_prob[key] / len(data_disc) * sys.float_info.max else: - H_cond += comb_prob[key] / \ - len(data_disc) * 
entropy_gauss(filtered_data) - if (method == 'BIC') | (method == 'AIC'): + H_cond += ( + comb_prob[key] / len(data_disc) * entropy_gauss(filtered_data) + ) + if (method == "BIC") | (method == "AIC"): if H_cond > entropy_gauss(data[column_cont]): H_cond = entropy_gauss(data[column_cont]) - return (H_disc + H_cond) + return H_disc + H_cond def entropy_cond(data, column_cont, column_disc, method): @@ -178,9 +177,9 @@ def entropy_cond(data, column_cont, column_disc, method): comb_prob = {} for i in range(len(data_disc)): row = data_disc.iloc[i] - comb = '' + comb = "" for _, val in row.items(): - comb = comb + str(val) + ', ' + comb = comb + str(val) + ", " if comb not in dict_comb: dict_comb[comb] = row comb_prob[comb] = 1 @@ -191,14 +190,13 @@ def entropy_cond(data, column_cont, column_disc, method): filtered_data = query_filter(data, column_disc, list(dict_comb[key])) filtered_data = filtered_data[column_cont] if comb_prob[key] == 1: - if (method == 'BIC') | (method == 'AIC'): + if (method == "BIC") | (method == "AIC"): H_cond += comb_prob[key] / len(data_disc) * H_gauss else: H_cond += comb_prob[key] / len(data_disc) * sys.float_info.max else: - H_cond += comb_prob[key] / \ - len(data_disc) * entropy_gauss(filtered_data) - if (method == 'BIC') | (method == 'AIC'): + H_cond += comb_prob[key] / len(data_disc) * entropy_gauss(filtered_data) + if (method == "BIC") | (method == "AIC"): if H_cond > H_gauss: return H_gauss else: @@ -206,7 +204,7 @@ def entropy_cond(data, column_cont, column_disc, method): return H_cond -def mi_gauss(data, method='MI', conditional=False): +def mi_gauss(data, method="MI", conditional=False): """ Calculate Mutual Information based on entropy. In the case of continuous uses entropy for Gaussian multivariate distributions. @@ -227,36 +225,30 @@ def mi_gauss(data, method='MI', conditional=False): if isinstance(data, np.ndarray): return mi_gauss(loc_to_DataFrame(data), method, conditional) elif isinstance(data, pd.Series): - return (mi_gauss(pd.DataFrame(data))) + return mi_gauss(pd.DataFrame(data)) elif isinstance(data, pd.DataFrame): nodes_type = get_nodes_type(data) if conditional: # Hill-Climbing does not use conditional MI, but other algorithms may require it # At the moment it counts on condition of the last row in the list # of columns - print('Warning: conditional == True') + print("Warning: conditional == True") nodes_type_trim = copy(nodes_type) data_trim = copy(data) list_keys = list(nodes_type_trim.keys) del nodes_type_trim[list_keys[-1]] del data_trim[list_keys[-1]] - return ( - mi_gauss( - data, - nodes_type, - method) - - mi_gauss( - data_trim, - nodes_type, - method)) + return mi_gauss(data, nodes_type, method) - mi_gauss( + data_trim, nodes_type, method + ) else: column_disc = [] for key in nodes_type: - if nodes_type[key] == 'disc': + if nodes_type[key] == "disc": column_disc.append(key) column_cont = [] for key in nodes_type: - if nodes_type[key] == 'cont': + if nodes_type[key] == "cont": column_cont.append(key) data_disc = data[column_disc] data_cont = data[column_cont] @@ -265,10 +257,7 @@ def mi_gauss(data, method='MI', conditional=False): H_cond = 0.0 if len(column_cont) == 0: - return ( - mutual_information( - data_disc.values, - conditional=False)) + return mutual_information(data_disc.values, conditional=False) elif len(column_disc) == 0: if len(column_cont) == 1: return entropy_gauss(data_cont) @@ -278,22 +267,24 @@ def mi_gauss(data, method='MI', conditional=False): del column_cont_trim[-1] data_cont_trim = data[column_cont_trim] - H_gauss = 
entropy_gauss( - data_last) + entropy_gauss(data_cont_trim) - entropy_gauss(data_cont) + H_gauss = ( + entropy_gauss(data_last) + + entropy_gauss(data_cont_trim) + - entropy_gauss(data_cont) + ) H_gauss = min( - H_gauss, - entropy_gauss(data_last), - entropy_gauss(data_cont_trim)) + H_gauss, entropy_gauss(data_last), entropy_gauss(data_cont_trim) + ) # H_gauss = entropy_gauss(data_cont) H_cond = 0.0 else: H_gauss = entropy_gauss(data_cont) H_cond = entropy_cond(data, column_cont, column_disc, method) - return (H_gauss - H_cond) + return H_gauss - H_cond -def mi(edges: list, data: pd.DataFrame, method='MI'): +def mi(edges: list, data: pd.DataFrame, method="MI"): """ Bypasses all nodes and summarizes scores, taking into account the parent-child relationship. @@ -315,8 +306,7 @@ def mi(edges: list, data: pd.DataFrame, method='MI'): child_parents = [var] child_parents.extend(parents_dict[var]) sum_score += mi_gauss(copy(data[child_parents]), method) - nodes_without_edges = list( - set(data.columns).difference(set(nodes_with_edges))) + nodes_without_edges = list(set(data.columns).difference(set(nodes_with_edges))) for var in nodes_without_edges: sum_score += mi_gauss(copy(data[var]), method) return sum_score diff --git a/bamt/networks/__init__.py b/bamt/networks/__init__.py index 3de3c48..3943690 100644 --- a/bamt/networks/__init__.py +++ b/bamt/networks/__init__.py @@ -1,4 +1,7 @@ -__all__ = ["base", "hybrid_bn", - "continuous_bn", "discrete_bn", - "big_brave_bn", - ] +__all__ = [ + "base", + "hybrid_bn", + "continuous_bn", + "discrete_bn", + "big_brave_bn", +] diff --git a/bamt/networks/base.py b/bamt/networks/base.py index c090b5e..6e16283 100644 --- a/bamt/networks/base.py +++ b/bamt/networks/base.py @@ -26,8 +26,9 @@ from typing import Dict, Tuple, List, Callable, Optional, Type, Union, Any, Sequence -STORAGE = config.get('NODES', 'models_storage', - fallback='models_storage is not defined') +STORAGE = config.get( + "NODES", "models_storage", fallback="models_storage is not defined" +) class BaseNetwork(object): @@ -41,13 +42,12 @@ def __init__(self): edges: a list of edges distributions: dict """ - self.type = 'Abstract' - self._allowed_dtypes = ['Abstract'] + self.type = "Abstract" + self._allowed_dtypes = ["Abstract"] self.nodes = [] self.edges = [] self.weights = {} - self.descriptor = {"types": {}, - "signs": {}} + self.descriptor = {"types": {}, "signs": {}} self.distributions = {} self.has_logit = False self.use_mixture = False @@ -62,19 +62,24 @@ def __getitem__(self, node_name: str) -> Type[BaseNode]: return self.nodes[index] def validate(self, descriptor: Dict[str, Dict[str, str]]) -> bool: - types = descriptor['types'] - return True if all( - [a in self._allowed_dtypes for a in types.values()]) else False + types = descriptor["types"] + return ( + True if all([a in self._allowed_dtypes for a in types.values()]) else False + ) def update_descriptor(self): new_nodes_names = [node.name for node in self.nodes] - self.descriptor['types'] = { - node: type for node, - type in self.descriptor['types'].items() if node in new_nodes_names} + self.descriptor["types"] = { + node: type + for node, type in self.descriptor["types"].items() + if node in new_nodes_names + } if "cont" in self.descriptor["types"].values(): - self.descriptor['signs'] = { - node: sign for node, - sign in self.descriptor['signs'].items() if node in new_nodes_names} + self.descriptor["signs"] = { + node: sign + for node, sign in self.descriptor["signs"].items() + if node in new_nodes_names + } def add_nodes(self, 
descriptor: Dict[str, Dict[str, str]]): """ @@ -82,33 +87,34 @@ def add_nodes(self, descriptor: Dict[str, Dict[str, str]]): descriptor: dict with types and signs of nodes """ if not self.validate(descriptor=descriptor): - if not self.type == 'Hybrid': + if not self.type == "Hybrid": logger_network.error( - f"{self.type} BN does not support {'discrete' if self.type == 'Continuous' else 'continuous'} data") + f"{self.type} BN does not support {'discrete' if self.type == 'Continuous' else 'continuous'} data" + ) return else: logger_network.error( - f"Descriptor validation failed due to wrong type of column(s).") + f"Descriptor validation failed due to wrong type of column(s)." + ) return - elif ['Abstract'] in self._allowed_dtypes: + elif ["Abstract"] in self._allowed_dtypes: return None self.descriptor = descriptor # LEVEL 1 - worker_1 = builders.builders_base.VerticesDefiner( - descriptor, regressor=None) + worker_1 = builders.builders_base.VerticesDefiner(descriptor, regressor=None) self.nodes = worker_1.vertices - def add_edges(self, - data: pd.DataFrame, - scoring_function: Union[Tuple[str, - Callable], - Tuple[str]] = ('K2', K2Score), - progress_bar: bool = True, - classifier: Optional[object] = None, - regressor: Optional[object] = None, - params: Optional[ParamDict] = None, - optimizer: str = 'HC', - **kwargs): + def add_edges( + self, + data: pd.DataFrame, + scoring_function: Union[Tuple[str, Callable], Tuple[str]] = ("K2", K2Score), + progress_bar: bool = True, + classifier: Optional[object] = None, + regressor: Optional[object] = None, + params: Optional[ParamDict] = None, + optimizer: str = "HC", + **kwargs, + ): """ Base function for Structure learning scoring_function: tuple with the following format (NAME, scoring_function) or (NAME,) @@ -119,48 +125,59 @@ def add_edges(self, """ if not self.has_logit and classifier: logger_network.error( - "Classifiers dict will be ignored since logit nodes are forbidden.") + "Classifiers dict will be ignored since logit nodes are forbidden." + ) return None # params validation if params: # init_edges validation if not self.has_logit and "init_edges" in params.keys(): - type_map = np.array([ - [self.descriptor["types"][node1], self.descriptor["types"][node2]] for node1, node2 in - params["init_edges"]] + type_map = np.array( + [ + [ + self.descriptor["types"][node1], + self.descriptor["types"][node2], + ] + for node1, node2 in params["init_edges"] + ] ) - failed = ( - (type_map[:, 0] == "cont") & - ((type_map[:, 1] == "disc") | - (type_map[:, 1] == "disc_num")) + failed = (type_map[:, 0] == "cont") & ( + (type_map[:, 1] == "disc") | (type_map[:, 1] == "disc_num") ) if sum(failed): logger_network.warning( f"Edges between continuous nodes and disc nodes are forbidden (has_logit = {self.has_logit}), " - f"they will be ignored. Indexes: {np.where(failed)[0]}") - params["init_edges"] = [params["init_edges"][i] for i in range( - len(params["init_edges"])) if i not in np.where(failed)[0]] + f"they will be ignored. 
Indexes: {np.where(failed)[0]}" + ) + params["init_edges"] = [ + params["init_edges"][i] + for i in range(len(params["init_edges"])) + if i not in np.where(failed)[0] + ] if not self.validate(descriptor=self.descriptor): logger_network.error( - f"{self.type} BN does not support {'discrete' if self.type == 'Continuous' else 'continuous'} data") + f"{self.type} BN does not support {'discrete' if self.type == 'Continuous' else 'continuous'} data" + ) return None - if optimizer == 'HC': + if optimizer == "HC": worker = HCStructureBuilder( data=data, descriptor=self.descriptor, scoring_function=scoring_function, has_logit=self.has_logit, use_mixture=self.use_mixture, - regressor=regressor) - elif optimizer == 'Evo': + regressor=regressor, + ) + elif optimizer == "Evo": worker = EvoStructureBuilder( data=data, descriptor=self.descriptor, has_logit=self.has_logit, use_mixture=self.use_mixture, - regressor=regressor) + regressor=regressor, + ) self.sf_name = scoring_function[0] @@ -170,29 +187,33 @@ def add_edges(self, classifier=classifier, regressor=regressor, progress_bar=progress_bar, - **kwargs) + **kwargs, + ) # update family - self.nodes = worker.skeleton['V'] - self.edges = worker.skeleton['E'] + self.nodes = worker.skeleton["V"] + self.edges = worker.skeleton["E"] def calculate_weights(self, discretized_data: pd.DataFrame): """ Provide calculation of link strength according mutual information between node and its parent(-s) values. """ import bamt.utils.GraphUtils as gru + data_descriptor = gru.nodes_types(discretized_data) - if not all([i in ['disc', 'disc_num'] - for i in data_descriptor.values()]): + if not all([i in ["disc", "disc_num"] for i in data_descriptor.values()]): logger_network.error( - f"calculate_weghts() method deals only with discrete data. Continuous data: " + - f"{[col for col, type in data_descriptor.items() if type not in ['disc', 'disc_num']]}") + f"calculate_weghts() method deals only with discrete data. Continuous data: " + + f"{[col for col, type in data_descriptor.items() if type not in ['disc', 'disc_num']]}" + ) if not self.edges: logger_network.error( - "Bayesian Network hasn't fitted yet. Please add edges with add_edges() method") + "Bayesian Network hasn't fitted yet. Please add edges with add_edges() method" + ) if not self.nodes: logger_network.error( - "Bayesian Network hasn't fitted yet. Please add nodes with add_nodes() method") + "Bayesian Network hasn't fitted yet. 
Please add nodes with add_nodes() method" + ) weights = dict() for node in self.nodes: @@ -209,15 +230,21 @@ def calculate_weights(self, discretized_data: pd.DataFrame): else: for parent_node in parents: x = discretized_data[parent_node].values - other_parents = [ - tmp for tmp in parents if tmp != parent_node] + other_parents = [tmp for tmp in parents if tmp != parent_node] z = list() for other_parent in other_parents: z.append(list(discretized_data[other_parent].values)) - ls_true = np.average(drv.information_mutual_conditional( - X=y, Y=x, Z=z, cartesian_product=True)) - entropy = np.average(drv.entropy_conditional( - X=y, Y=z, cartesian_product=True)) + 1e-8 + ls_true = np.average( + drv.information_mutual_conditional( + X=y, Y=x, Z=z, cartesian_product=True + ) + ) + entropy = ( + np.average( + drv.entropy_conditional(X=y, Y=z, cartesian_product=True) + ) + + 1e-8 + ) weight = ls_true / entropy weights[(parent_node, node.name)] = weight self.weights = weights @@ -232,15 +259,15 @@ def set_nodes(self, nodes: List, info: Optional[Dict] = None): """ if not info and not self.descriptor["types"]: logger_network.error( - "In case of manual setting nodes user should set map for them as well.") + "In case of manual setting nodes user should set map for them as well." + ) return self.nodes = [] for node in nodes: if issubclass(type(node), BaseNode): self.nodes.append(node) else: - logger_network.error( - f"{node} is not an instance of {BaseNode}") + logger_network.error(f"{node} is not an instance of {BaseNode}") if info: self.descriptor = info @@ -258,11 +285,14 @@ def set_edges(self, edges: Optional[List[Sequence[str]]] = None): for node1, node2 in edges: if isinstance(node1, str) and isinstance(node2, str): if self[node1] and self[node2]: - if not self.has_logit and \ - self.descriptor["types"][node1] == "cont" and \ - self.descriptor["types"][node2] == "disc": + if ( + not self.has_logit + and self.descriptor["types"][node1] == "cont" + and self.descriptor["types"][node2] == "disc" + ): logger_network.warning( - f"Restricted edge detected (has_logit=False) : [{node1}, {node2}]") + f"Restricted edge detected (has_logit=False) : [{node1}, {node2}]" + ) continue else: self.edges.append((node1, node2)) @@ -271,40 +301,43 @@ def set_edges(self, edges: Optional[List[Sequence[str]]] = None): continue else: logger_network.error( - f"Unknown node(s) type: [{node1.__class__}, {node2.__class__}]") + f"Unknown node(s) type: [{node1.__class__}, {node2.__class__}]" + ) continue self.update_descriptor() - def set_structure(self, - info: Optional[Dict] = None, - nodes: Optional[List] = None, - edges: Optional[List[Sequence[str]]] = None, - overwrite: bool = True): + def set_structure( + self, + info: Optional[Dict] = None, + nodes: Optional[List] = None, + edges: Optional[List[Sequence[str]]] = None, + overwrite: bool = True, + ): """ Function to set structure manually info: Descriptor nodes, edges: overwrite: use 2nd stage of defining or not """ - if nodes and ( - info or ( - self.descriptor["types"] and self.descriptor["signs"])): + if nodes and (info or (self.descriptor["types"] and self.descriptor["signs"])): self.set_nodes(nodes=nodes, info=info) if edges: self.set_edges(edges=edges) if overwrite: builder = builders.builders_base.VerticesDefiner( - descriptor=self.descriptor, regressor=None) # init worker - builder.skeleton['V'] = builder.vertices # 1 stage - builder.skeleton['E'] = self.edges + descriptor=self.descriptor, regressor=None + ) # init worker + builder.skeleton["V"] = builder.vertices # 
1 stage + builder.skeleton["E"] = self.edges builder.get_family() if self.edges: builder.overwrite_vertex( has_logit=self.has_logit, use_mixture=self.use_mixture, classifier=None, - regressor=None) - self.set_nodes(nodes=builder.skeleton['V']) + regressor=None, + ) + self.set_nodes(nodes=builder.skeleton["V"]) else: logger_network.error("Empty set of edges") @@ -318,8 +351,7 @@ def _param_validation(self, params: Dict[str, Any]) -> bool: return False return True else: - logger_network.error( - "Param validation failed due to unknown nodes.") + logger_network.error("Param validation failed due to unknown nodes.") return False def set_parameters(self, parameters: Dict): @@ -347,10 +379,12 @@ def set_parameters(self, parameters: Dict): continue if not model: logger_network.warning( - f"Classifier/regressor for {node} hadn't been used.") + f"Classifier/regressor for {node} hadn't been used." + ) self[node].type = re.sub( - r"\([\s\S]*\)", f"({model})", self[node].type) + r"\([\s\S]*\)", f"({model})", self[node].type + ) def save_to_file(self, outdir: str, data: dict): """ @@ -358,9 +392,9 @@ def save_to_file(self, outdir: str, data: dict): :param outdir: output directory :param data: dictionary to be saved """ - if not outdir.endswith('.json'): + if not outdir.endswith(".json"): return None - with open(outdir, 'w+') as out: + with open(outdir, "w+") as out: json.dump(data, out) return True @@ -385,10 +419,10 @@ def save(self, outdir: str): """ new_weights = {str(key): self.weights[key] for key in self.weights} outdict = { - 'info': self.descriptor, - 'edges': self.edges, - 'parameters': self.distributions, - 'weights': new_weights + "info": self.descriptor, + "edges": self.edges, + "parameters": self.distributions, + "weights": new_weights, } return self.save_to_file(outdir, outdict) @@ -405,51 +439,47 @@ def __init__(self, type: str): type: either use_mixture or has_logit is wrong """ super().__init__( - f"This parameter is not the same as father's parameter: {type}") + f"This parameter is not the same as father's parameter: {type}" + ) with open(input_dir) as f: input_dict = json.load(f) - self.add_nodes(input_dict['info']) - self.set_structure(edges=input_dict['edges']) + self.add_nodes(input_dict["info"]) + self.set_structure(edges=input_dict["edges"]) # check compatibility with father network. if not self.use_mixture: - for node_data in input_dict['parameters'].values(): - if 'hybcprob' not in node_data.keys(): + for node_data in input_dict["parameters"].values(): + if "hybcprob" not in node_data.keys(): continue else: # Since we don't have information about types of nodes, we # should derive it from parameters. - if any(list(node_keys.keys()) == ["covars", "mean", "coef"] - for node_keys in node_data['hybcprob'].values()): + if any( + list(node_keys.keys()) == ["covars", "mean", "coef"] + for node_keys in node_data["hybcprob"].values() + ): raise CompatibilityError("use_mixture") # check if edges before and after are the same.They can be different in # the case when user sets forbidden edges. 
if not self.has_logit: if not all( - edges_before == [ - edges_after[0], - edges_after[1]] for edges_before, - edges_after in zip( - input_dict['edges'], - self.edges)): + edges_before == [edges_after[0], edges_after[1]] + for edges_before, edges_after in zip(input_dict["edges"], self.edges) + ): raise CompatibilityError("has_logit") - self.set_parameters(parameters=input_dict['parameters']) - str_keys = list(input_dict['weights'].keys()) + self.set_parameters(parameters=input_dict["parameters"]) + str_keys = list(input_dict["weights"].keys()) tuple_keys = [eval(key) for key in str_keys] weights = {} for tuple_key in tuple_keys: - weights[tuple_key] = input_dict['weights'][str(tuple_key)] + weights[tuple_key] = input_dict["weights"][str(tuple_key)] self.weights = weights - def fit_parameters( - self, - data: pd.DataFrame, - dropna: bool = True, - n_jobs: int = -1): + def fit_parameters(self, data: pd.DataFrame, dropna: bool = True, n_jobs: int = -1): """ Base function for parameter learning """ @@ -464,22 +494,22 @@ def fit_parameters( if not os.listdir(STORAGE): os.makedirs(os.path.join(STORAGE, "0")) - index = sorted( - [int(id) for id in os.listdir(STORAGE)] - )[-1] + 1 + index = sorted([int(id) for id in os.listdir(STORAGE)])[-1] + 1 os.makedirs(os.path.join(STORAGE, str(index))) # Turn all discrete values to str for learning algorithm - if 'disc_num' in self.descriptor['types'].values(): - columns_names = [name for name, t in self.descriptor['types'].items() if t in [ - 'disc_num']] - data[columns_names] = data.loc[:, columns_names].astype('str') + if "disc_num" in self.descriptor["types"].values(): + columns_names = [ + name + for name, t in self.descriptor["types"].items() + if t in ["disc_num"] + ] + data[columns_names] = data.loc[:, columns_names].astype("str") def worker(node): return node.fit_parameters(data) - results = Parallel(n_jobs=n_jobs)(delayed(worker)(node) - for node in self.nodes) + results = Parallel(n_jobs=n_jobs)(delayed(worker)(node) for node in self.nodes) for result, node in zip(results, self.nodes): self.distributions[node.name] = result @@ -495,28 +525,39 @@ def get_info(self, as_df: bool = True) -> Optional[pd.DataFrame]: for n in self.nodes: names.append(n) types_n.append(n.type) - types_d.append(self.descriptor['types'][n.name]) - parents_types.append([self.descriptor['types'][name] - for name in n.cont_parents + n.disc_parents]) - parents.append( - [name for name in n.cont_parents + n.disc_parents]) - return pd.DataFrame({'name': names, 'node_type': types_n, - 'data_type': types_d, 'parents': parents, - 'parents_types': parents_types}) + types_d.append(self.descriptor["types"][n.name]) + parents_types.append( + [ + self.descriptor["types"][name] + for name in n.cont_parents + n.disc_parents + ] + ) + parents.append([name for name in n.cont_parents + n.disc_parents]) + return pd.DataFrame( + { + "name": names, + "node_type": types_n, + "data_type": types_d, + "parents": parents, + "parents_types": parents_types, + } + ) else: for n in self.nodes: print( - f"{n.name: <20} | {n.type: <50} | {self.descriptor['types'][n.name]: <10} | {str([self.descriptor['types'][name] for name in n.cont_parents + n.disc_parents]): <50} | {str([name for name in n.cont_parents + n.disc_parents])}") - - def sample(self, - n: int, - models_dir: Optional[str] = None, - progress_bar: bool = True, - evidence: Optional[Dict[str, Union[str, int, float]]] = None, - as_df: bool = True, - predict: bool = False, - parall_count: int = 1) -> \ - Union[None, pd.DataFrame, List[Dict[str, 
Union[str, int, float]]]]: + f"{n.name: <20} | {n.type: <50} | {self.descriptor['types'][n.name]: <10} | {str([self.descriptor['types'][name] for name in n.cont_parents + n.disc_parents]): <50} | {str([name for name in n.cont_parents + n.disc_parents])}" + ) + + def sample( + self, + n: int, + models_dir: Optional[str] = None, + progress_bar: bool = True, + evidence: Optional[Dict[str, Union[str, int, float]]] = None, + as_df: bool = True, + predict: bool = False, + parall_count: int = 1, + ) -> Union[None, pd.DataFrame, List[Dict[str, Union[str, int, float]]]]: """ Sampling from Bayesian Network n: int number of samples @@ -528,11 +569,12 @@ def sample(self, random.seed() if not self.distributions.items(): logger_network.error( - "Parameter learning wasn't done. Call fit_parameters method") + "Parameter learning wasn't done. Call fit_parameters method" + ) return None if evidence: for node in self.nodes: - if (node.type == 'Discrete') & (node.name in evidence.keys()): + if (node.type == "Discrete") & (node.name in evidence.keys()): if not (isinstance(evidence[node.name], str)): evidence[node.name] = str(int(evidence[node.name])) @@ -546,7 +588,7 @@ def wrapper(): if not parents: pvals = None else: - if self.type == 'Discrete': + if self.type == "Discrete": pvals = [str(output[t]) for t in parents] else: pvals = [output[t] for t in parents] @@ -564,54 +606,54 @@ def wrapper(): model_type = "regressor" else: model_type = "classifier" - if obj_data["serialization"] == 'joblib' and obj_data[ - f"{model_type}_obj"]: - new_path = models_dir + \ - f"\\{node.name.replace(' ', '_')}\\{obj}.joblib.compressed" - node_data["hybcprob"][obj][f"{model_type}_obj"] = new_path + if ( + obj_data["serialization"] == "joblib" + and obj_data[f"{model_type}_obj"] + ): + new_path = ( + models_dir + + f"\\{node.name.replace(' ', '_')}\\{obj}.joblib.compressed" + ) + node_data["hybcprob"][obj][ + f"{model_type}_obj" + ] = new_path if predict: - output[node.name] = \ - node.predict(node_data, pvals=pvals) + output[node.name] = node.predict(node_data, pvals=pvals) else: - output[node.name] = \ - node.choose(node_data, pvals=pvals) + output[node.name] = node.choose(node_data, pvals=pvals) return output if progress_bar: - seq = Parallel( - n_jobs=parall_count)( - delayed(wrapper)() for _ in tqdm( - range(n), - position=0, - leave=True)) + seq = Parallel(n_jobs=parall_count)( + delayed(wrapper)() for _ in tqdm(range(n), position=0, leave=True) + ) else: - seq = Parallel( - n_jobs=parall_count)( - delayed(wrapper)() for _ in range(n)) - seq_df = pd.DataFrame.from_dict(seq, orient='columns') + seq = Parallel(n_jobs=parall_count)(delayed(wrapper)() for _ in range(n)) + seq_df = pd.DataFrame.from_dict(seq, orient="columns") seq_df.dropna(inplace=True) - cont_nodes = [c.name for c in self.nodes if c.type != - 'Discrete' and 'Logit' not in c.type] + cont_nodes = [ + c.name for c in self.nodes if c.type != "Discrete" and "Logit" not in c.type + ] positive_columns = [ - c for c in cont_nodes if self.descriptor['signs'][c] == 'pos'] + c for c in cont_nodes if self.descriptor["signs"][c] == "pos" + ] seq_df = seq_df[(seq_df[positive_columns] >= 0).all(axis=1)] seq_df.reset_index(inplace=True, drop=True) - seq = seq_df.to_dict('records') + seq = seq_df.to_dict("records") if as_df: - return pd.DataFrame.from_dict(seq, orient='columns') + return pd.DataFrame.from_dict(seq, orient="columns") else: return seq - def predict(self, - test: pd.DataFrame, - parall_count: int = 1, - progress_bar: bool = True, - models_dir: Optional[str] = 
None) -> Dict[str, - Union[List[str], - List[int], - List[float]]]: + def predict( + self, + test: pd.DataFrame, + parall_count: int = 1, + progress_bar: bool = True, + models_dir: Optional[str] = None, + ) -> Dict[str, Union[List[str], List[int], List[float]]]: """ Function to predict columns from given data. Note that train data and test data must have different columns. @@ -640,13 +682,19 @@ def wrapper(bn, test: pd.DataFrame, columns: List[str], models_dir: str): for n, key in enumerate(columns): try: sample = bn.sample( - 1, evidence=test_row, predict=True, progress_bar=False, models_dir=models_dir) + 1, + evidence=test_row, + predict=True, + progress_bar=False, + models_dir=models_dir, + ) if sample.empty: preds[key].append(np.nan) continue - if bn.descriptor['types'][key] == 'cont': - if (bn.descriptor['signs'][key] == 'pos') & ( - sample.loc[0, key] < 0): + if bn.descriptor["types"][key] == "cont": + if (bn.descriptor["signs"][key] == "pos") & ( + sample.loc[0, key] < 0 + ): # preds[key].append(np.nan) preds[key].append(0) else: @@ -658,8 +706,7 @@ def wrapper(bn, test: pd.DataFrame, columns: List[str], models_dir: str): preds[key].append(np.nan) return preds else: - logger_network.error( - 'Wrapper for one row from pandas.DataFrame') + logger_network.error("Wrapper for one row from pandas.DataFrame") return {} columns = list(set(self.nodes_names) - set(test.columns.to_list())) @@ -670,11 +717,15 @@ def wrapper(bn, test: pd.DataFrame, columns: List[str], models_dir: str): preds = {column_name: list() for column_name in columns} if progress_bar: - processed_list = Parallel(n_jobs=parall_count)(delayed(wrapper)( - self, test.loc[[i]], columns, models_dir) for i in tqdm(test.index, position=0, leave=True)) + processed_list = Parallel(n_jobs=parall_count)( + delayed(wrapper)(self, test.loc[[i]], columns, models_dir) + for i in tqdm(test.index, position=0, leave=True) + ) else: processed_list = Parallel(n_jobs=parall_count)( - delayed(wrapper)(self, test.loc[[i]], columns, models_dir) for i in test.index) + delayed(wrapper)(self, test.loc[[i]], columns, models_dir) + for i in test.index + ) for i in range(test.shape[0]): curr_pred = processed_list[i] @@ -697,9 +748,8 @@ def set_classifiers(self, classifiers: Dict[str, object]): if node.name in classifiers.keys(): node.classifier = classifiers[node.name] node.type = re.sub( - r"\([\s\S]*\)", - f"({type(node.classifier).__name__})", - node.type) + r"\([\s\S]*\)", f"({type(node.classifier).__name__})", node.type + ) else: continue @@ -714,9 +764,8 @@ def set_regressor(self, regressors: Dict[str, object]): if node.name in regressors.keys(): node.regressor = regressors[node.name] node.type = re.sub( - r"\([\s\S]*\)", - f"({type(node.regressor).__name__})", - node.type) + r"\([\s\S]*\)", f"({type(node.regressor).__name__})", node.type + ) else: continue @@ -726,7 +775,7 @@ def plot(self, output: str): in the parent directory in folder visualization_result. 
output: str name of output file """ - if not output.endswith('.html'): + if not output.endswith(".html"): logger_network.error("This version allows only html format.") return None @@ -740,44 +789,49 @@ def plot(self, output: str): width="100%", notebook=True, directed=nx.is_directed(G), - layout='hierarchical') + layout="hierarchical", + ) - nodes_sorted = np.array( - list(nx.topological_generations(G)), dtype=object) + nodes_sorted = np.array(list(nx.topological_generations(G)), dtype=object) # Qualitative class of colormaps q_classes = [ - 'Pastel1', - 'Pastel2', - 'Paired', - 'Accent', - 'Dark2', - 'Set1', - 'Set2', - 'Set3', - 'tab10', - 'tab20', - 'tab20b', - 'tab20c'] + "Pastel1", + "Pastel2", + "Paired", + "Accent", + "Dark2", + "Set1", + "Set2", + "Set3", + "tab10", + "tab20", + "tab20b", + "tab20c", + ] hex_colors = [] for cls in q_classes: rgb_colors = plt.get_cmap(cls).colors - hex_colors.extend([matplotlib.colors.rgb2hex(rgb_color) - for rgb_color in rgb_colors]) + hex_colors.extend( + [matplotlib.colors.rgb2hex(rgb_color) for rgb_color in rgb_colors] + ) hex_colors = np.array(hex_colors) # Number_of_colors in matplotlib in Qualitative class = 144 - class_number = len( - set([node.type for node in self.nodes]) - ) - hex_colors_indexes = [random.randint( - 0, len(hex_colors) - 1) for _ in range(class_number)] + class_number = len(set([node.type for node in self.nodes])) + hex_colors_indexes = [ + random.randint(0, len(hex_colors) - 1) for _ in range(class_number) + ] hex_colors_picked = hex_colors[hex_colors_indexes] - class2color = {cls: color for cls, color in zip( - set([node.type for node in self.nodes]), hex_colors_picked)} + class2color = { + cls: color + for cls, color in zip( + set([node.type for node in self.nodes]), hex_colors_picked + ) + } name2class = {node.name: node.type for node in self.nodes} for level in range(len(nodes_sorted)): @@ -785,15 +839,22 @@ def plot(self, output: str): name = nodes_sorted[level][node_i] cls = name2class[name] color = class2color[cls] - network.add_node(name, label=name, color=color, size=45, level=level, font={ - 'size': 36}, title=f'Узел байесовской сети {name} ({cls})') + network.add_node( + name, + label=name, + color=color, + size=45, + level=level, + font={"size": 36}, + title=f"Узел байесовской сети {name} ({cls})", + ) for edge in G.edges: network.add_edge(edge[0], edge[1]) network.hrepulsion(node_distance=300, central_gravity=0.5) - if not (os.path.exists('visualization_result')): + if not (os.path.exists("visualization_result")): os.mkdir("visualization_result") - return network.show(f'visualization_result/' + output) + return network.show(f"visualization_result/" + output) diff --git a/bamt/networks/big_brave_bn.py b/bamt/networks/big_brave_bn.py index 731c9a9..f137ec1 100644 --- a/bamt/networks/big_brave_bn.py +++ b/bamt/networks/big_brave_bn.py @@ -2,8 +2,7 @@ class BigBraveBN: - - def __init__(self, n_nearest=5, threshold=.3, proximity_metric='MI'): + def __init__(self, n_nearest=5, threshold=0.3, proximity_metric="MI"): self.n_nearest = n_nearest self.threshold = threshold self.proximity_metric = proximity_metric @@ -25,16 +24,18 @@ def set_possible_edges_by_brave(self, df): """ proximity_matrix = get_proximity_matrix( - df, proximity_metric=self.proximity_metric) - brave_matrix = get_brave_matrix( - df.columns, proximity_matrix, self.n_nearest) + df, proximity_metric=self.proximity_metric + ) + brave_matrix = get_brave_matrix(df.columns, proximity_matrix, self.n_nearest) possible_edges_list = [] for c1 in df.columns: 
for c2 in df.columns: - if brave_matrix.loc[c1, c2] > brave_matrix.max( - numeric_only='true').max() * self.threshold: + if ( + brave_matrix.loc[c1, c2] + > brave_matrix.max(numeric_only="true").max() * self.threshold + ): possible_edges_list.append((c1, c2)) self.possible_edges = possible_edges_list diff --git a/bamt/networks/continuous_bn.py b/bamt/networks/continuous_bn.py index 0bc2687..2c2f327 100644 --- a/bamt/networks/continuous_bn.py +++ b/bamt/networks/continuous_bn.py @@ -8,8 +8,8 @@ class ContinuousBN(BaseNetwork): def __init__(self, use_mixture: bool = False): super(ContinuousBN, self).__init__() - self.type = 'Continuous' - self._allowed_dtypes = ['cont'] + self.type = "Continuous" + self._allowed_dtypes = ["cont"] self.has_logit = None self.use_mixture = use_mixture self.scoring_function = "" diff --git a/bamt/networks/discrete_bn.py b/bamt/networks/discrete_bn.py index 9d6d32c..90a0d07 100644 --- a/bamt/networks/discrete_bn.py +++ b/bamt/networks/discrete_bn.py @@ -8,8 +8,8 @@ class DiscreteBN(BaseNetwork): def __init__(self): super(DiscreteBN, self).__init__() - self.type = 'Discrete' + self.type = "Discrete" self.scoring_function = "" - self._allowed_dtypes = ['disc', 'disc_num'] + self._allowed_dtypes = ["disc", "disc_num"] self.has_logit = None self.use_mixture = None diff --git a/bamt/networks/hybrid_bn.py b/bamt/networks/hybrid_bn.py index fc68402..6856da6 100644 --- a/bamt/networks/hybrid_bn.py +++ b/bamt/networks/hybrid_bn.py @@ -10,13 +10,18 @@ class HybridBN(BaseNetwork): def __init__(self, has_logit: bool = False, use_mixture: bool = False): super(HybridBN, self).__init__() - self._allowed_dtypes = ['cont', 'disc', 'disc_num'] - self.type = 'Hybrid' + self._allowed_dtypes = ["cont", "disc", "disc_num"] + self.type = "Hybrid" self.has_logit = has_logit self.use_mixture = use_mixture def validate(self, descriptor: Dict[str, Dict[str, str]]) -> bool: - types = descriptor['types'] + types = descriptor["types"] s = set(types.values()) - return True if ({'cont', 'disc', 'disc_num'} == s) or ( - {'cont', 'disc'} == s) or ({'cont', 'disc_num'} == s) else False + return ( + True + if ({"cont", "disc", "disc_num"} == s) + or ({"cont", "disc"} == s) + or ({"cont", "disc_num"} == s) + else False + ) diff --git a/bamt/nodes/__init__.py b/bamt/nodes/__init__.py index 78c7b68..f896bc3 100644 --- a/bamt/nodes/__init__.py +++ b/bamt/nodes/__init__.py @@ -1,10 +1,10 @@ __all__ = [ - 'base', - 'conditional_gaussian_node', - 'conditional_logit_node', - 'conditional_mixture_gaussian_node', - 'discrete_node', - 'logit_node', - 'gaussian_node', - 'mixture_gaussian_node', + "base", + "conditional_gaussian_node", + "conditional_logit_node", + "conditional_mixture_gaussian_node", + "discrete_node", + "logit_node", + "gaussian_node", + "mixture_gaussian_node", ] diff --git a/bamt/nodes/base.py b/bamt/nodes/base.py index 8d96255..1c5c2c8 100644 --- a/bamt/nodes/base.py +++ b/bamt/nodes/base.py @@ -6,9 +6,8 @@ import os STORAGE = config.get( - 'NODES', - 'models_storage', - fallback='models_storage is not defined') + "NODES", "models_storage", fallback="models_storage is not defined" +) class BaseNode(object): @@ -25,7 +24,7 @@ def __init__(self, name: str): children: node's children """ self.name = name - self.type = 'abstract' + self.type = "abstract" self.disc_parents = [] self.cont_parents = [] @@ -39,25 +38,27 @@ def __eq__(self, other): # don't attempt to compare against unrelated types return NotImplemented - return self.name == other.name and \ - self.type == other.type and \ - 
self.disc_parents == other.disc_parents and \ - self.cont_parents == other.cont_parents and \ - self.children == other.children + return ( + self.name == other.name + and self.type == other.type + and self.disc_parents == other.disc_parents + and self.cont_parents == other.cont_parents + and self.children == other.children + ) @staticmethod def choose_serialization(model) -> Union[str, Exception]: try: ex_b = pickle.dumps(model, protocol=4) - model_ser = ex_b.decode('latin1').replace('\'', '\"') + model_ser = ex_b.decode("latin1").replace("'", '"') if type(model).__name__ == "CatBoostRegressor": - a = model_ser.encode('latin1') + a = model_ser.encode("latin1") else: - a = model_ser.replace('\"', '\'').encode('latin1') + a = model_ser.replace('"', "'").encode("latin1") classifier_body = pickle.loads(a) - return 'pickle' + return "pickle" except Exception as ex: return ex @@ -76,18 +77,12 @@ def get_path_joblib(node_name: str, specific: str = "") -> str: specific = str(specific) index = str(int(os.listdir(STORAGE)[-1])) - path_to_check = os.path.join( - STORAGE, - index, - f"{node_name.replace(' ', '_')}") + path_to_check = os.path.join(STORAGE, index, f"{node_name.replace(' ', '_')}") if not os.path.isdir(path_to_check): - os.makedirs( - os.path.join( - STORAGE, - index, - f"{node_name.replace(' ', '_')}")) + os.makedirs(os.path.join(STORAGE, index, f"{node_name.replace(' ', '_')}")) path = os.path.abspath( - os.path.join(path_to_check, f"{specific}.joblib.compressed")) + os.path.join(path_to_check, f"{specific}.joblib.compressed") + ) return path diff --git a/bamt/nodes/conditional_gaussian_node.py b/bamt/nodes/conditional_gaussian_node.py index 6bc750b..dcdd166 100644 --- a/bamt/nodes/conditional_gaussian_node.py +++ b/bamt/nodes/conditional_gaussian_node.py @@ -27,11 +27,9 @@ def __init__(self, name, regressor: Optional[object] = None): if regressor is None: regressor = linear_model.LinearRegression() self.regressor = regressor - self.type = 'ConditionalGaussian' + \ - f" ({type(self.regressor).__name__})" + self.type = "ConditionalGaussian" + f" ({type(self.regressor).__name__})" - def fit_parameters( - self, data: DataFrame) -> Dict[str, Dict[str, CondGaussParams]]: + def fit_parameters(self, data: DataFrame) -> Dict[str, Dict[str, CondGaussParams]]: """ Train params for Conditional Gaussian Node. 
Return: @@ -53,56 +51,69 @@ def fit_parameters( if new_data.shape[0] > 1: if self.cont_parents: model = self.regressor - model.fit(new_data[self.cont_parents].values, - new_data[self.name].values) - predicted_value = model.predict( - new_data[self.cont_parents].values) + model.fit( + new_data[self.cont_parents].values, new_data[self.name].values + ) + predicted_value = model.predict(new_data[self.cont_parents].values) variance = mse( - new_data[self.name].values, predicted_value, squared=False) + new_data[self.name].values, predicted_value, squared=False + ) serialization = self.choose_serialization(model) - if serialization == 'pickle': + if serialization == "pickle": ex_b = pickle.dumps(self.regressor, protocol=4) - model_ser = ex_b.decode('latin1') + model_ser = ex_b.decode("latin1") # model_ser = pickle.dumps(self.classifier, protocol=4) - hycprob[str(key_comb)] = {'variance': variance, - 'mean': np.nan, - 'regressor_obj': model_ser, - 'regressor': type(self.regressor).__name__, - 'serialization': 'pickle'} + hycprob[str(key_comb)] = { + "variance": variance, + "mean": np.nan, + "regressor_obj": model_ser, + "regressor": type(self.regressor).__name__, + "serialization": "pickle", + } else: logger_nodes.warning( - f"{self.name} {comb}::Pickle failed. BAMT will use Joblib. | " + str(serialization.args[0])) + f"{self.name} {comb}::Pickle failed. BAMT will use Joblib. | " + + str(serialization.args[0]) + ) path = self.get_path_joblib( - node_name=self.name.replace( - ' ', '_'), specific=comb) + node_name=self.name.replace(" ", "_"), specific=comb + ) joblib.dump(model, path, compress=True, protocol=4) - hycprob[str(key_comb)] = {'variance': variance, - 'mean': np.nan, - 'regressor_obj': path, - 'regressor': type(self.regressor).__name__, - 'serialization': 'joblib'} + hycprob[str(key_comb)] = { + "variance": variance, + "mean": np.nan, + "regressor_obj": path, + "regressor": type(self.regressor).__name__, + "serialization": "joblib", + } else: mean_base = np.mean(new_data[self.name].values) variance = np.var(new_data[self.name].values) - hycprob[str(key_comb)] = {'variance': variance, - 'mean': mean_base, - 'regressor_obj': None, - 'regressor': None, - 'serialization': None} + hycprob[str(key_comb)] = { + "variance": variance, + "mean": mean_base, + "regressor_obj": None, + "regressor": None, + "serialization": None, + } else: - hycprob[str(key_comb)] = {'variance': np.nan, - 'regressor': None, - 'regressor_obj': None, - 'serialization': None, - 'mean': np.nan} + hycprob[str(key_comb)] = { + "variance": np.nan, + "regressor": None, + "regressor_obj": None, + "serialization": None, + "mean": np.nan, + } return {"hybcprob": hycprob} - def choose(self, - node_info: Dict[str, Dict[str, CondGaussParams]], - pvals: List[Union[str, float]]) -> float: + def choose( + self, + node_info: Dict[str, Dict[str, CondGaussParams]], + pvals: List[Union[str, float]], + ) -> float: """ Return value from ConditionalLogit node params: @@ -125,36 +136,36 @@ def choose(self, if self.cont_parents: flag = False for el in lgpvals: - if str(el) == 'nan': + if str(el) == "nan": flag = True break if flag: return np.nan else: - if lgdistribution['regressor']: - if lgdistribution["serialization"] == 'joblib': + if lgdistribution["regressor"]: + if lgdistribution["serialization"] == "joblib": model = joblib.load(lgdistribution["regressor_obj"]) else: # str_model = lgdistribution["classifier_obj"].decode('latin1').replace('\'', '\"') - bytes_model = lgdistribution["regressor_obj"].encode( - 'latin1') + bytes_model = 
lgdistribution["regressor_obj"].encode("latin1") model = pickle.loads(bytes_model) - cond_mean = model.predict( - np.array(lgpvals).reshape(1, -1))[0] - variance = lgdistribution['variance'] + cond_mean = model.predict(np.array(lgpvals).reshape(1, -1))[0] + variance = lgdistribution["variance"] return random.gauss(cond_mean, variance) else: return np.nan else: return random.gauss( - lgdistribution['mean'], math.sqrt( - lgdistribution['variance'])) - - def predict(self, - node_info: Dict[str, Dict[str, CondGaussParams]], - pvals: List[Union[str, float]]) -> float: + lgdistribution["mean"], math.sqrt(lgdistribution["variance"]) + ) + + def predict( + self, + node_info: Dict[str, Dict[str, CondGaussParams]], + pvals: List[Union[str, float]], + ) -> float: """ Return value from ConditionalLogit node params: @@ -175,22 +186,21 @@ def predict(self, if self.cont_parents: flag = False for el in lgpvals: - if str(el) == 'nan': + if str(el) == "nan": flag = True break if flag: return np.nan else: - if lgdistribution['regressor']: - if lgdistribution["serialization"] == 'joblib': + if lgdistribution["regressor"]: + if lgdistribution["serialization"] == "joblib": model = joblib.load(lgdistribution["regressor_obj"]) else: # str_model = lgdistribution["classifier_obj"].decode('latin1').replace('\'', '\"') - bytes_model = lgdistribution["regressor_obj"].encode( - 'latin1') + bytes_model = lgdistribution["regressor_obj"].encode("latin1") model = pickle.loads(bytes_model) return model.predict(np.array(lgpvals).reshape(1, -1))[0] else: return np.nan else: - return lgdistribution['mean'] + return lgdistribution["mean"] diff --git a/bamt/nodes/conditional_logit_node.py b/bamt/nodes/conditional_logit_node.py index 90e3731..3a1b1a0 100644 --- a/bamt/nodes/conditional_logit_node.py +++ b/bamt/nodes/conditional_logit_node.py @@ -23,12 +23,12 @@ def __init__(self, name: str, classifier: Optional[object] = None): super(ConditionalLogitNode, self).__init__(name) if classifier is None: classifier = linear_model.LogisticRegression( - multi_class='multinomial', solver='newton-cg', max_iter=100) + multi_class="multinomial", solver="newton-cg", max_iter=100 + ) self.classifier = classifier - self.type = 'ConditionalLogit' + f" ({type(self.classifier).__name__})" + self.type = "ConditionalLogit" + f" ({type(self.classifier).__name__})" - def fit_parameters( - self, data: DataFrame) -> Dict[str, Dict[str, LogitParams]]: + def fit_parameters(self, data: DataFrame) -> Dict[str, Dict[str, LogitParams]]: """ Train params on data Return: @@ -53,46 +53,62 @@ def fit_parameters( model = self.classifier values = set(new_data[self.name]) if len(values) > 1: - model.fit(new_data[self.cont_parents].values, - new_data[self.name].values) + model.fit( + new_data[self.cont_parents].values, new_data[self.name].values + ) classes = list(model.classes_) serialization = self.choose_serialization(model) - if serialization == 'pickle': + if serialization == "pickle": ex_b = pickle.dumps(self.classifier, protocol=4) - model_ser = ex_b.decode('latin1') + model_ser = ex_b.decode("latin1") # model_ser = pickle.dumps(self.classifier, protocol=4) - hycprob[str(key_comb)] = {'classes': classes, - 'classifier_obj': model_ser, - 'classifier': type(self.classifier).__name__, - 'serialization': 'pickle'} + hycprob[str(key_comb)] = { + "classes": classes, + "classifier_obj": model_ser, + "classifier": type(self.classifier).__name__, + "serialization": "pickle", + } else: logger_nodes.warning( - f"{self.name} {comb}::Pickle failed. BAMT will use Joblib. 
| " + str(serialization.args[0])) + f"{self.name} {comb}::Pickle failed. BAMT will use Joblib. | " + + str(serialization.args[0]) + ) path = self.get_path_joblib( - node_name=self.name.replace( - ' ', '_'), specific=comb) + node_name=self.name.replace(" ", "_"), specific=comb + ) joblib.dump(model, path, compress=True, protocol=4) - hycprob[str(key_comb)] = {'classes': classes, - 'classifier_obj': path, - 'classifier': type(self.classifier).__name__, - 'serialization': 'joblib'} + hycprob[str(key_comb)] = { + "classes": classes, + "classifier_obj": path, + "classifier": type(self.classifier).__name__, + "serialization": "joblib", + } else: classes = list(values) - hycprob[str(key_comb)] = {'classes': classes, 'classifier': type( - self.classifier).__name__, 'classifier_obj': None, 'serialization': None} + hycprob[str(key_comb)] = { + "classes": classes, + "classifier": type(self.classifier).__name__, + "classifier_obj": None, + "serialization": None, + } else: - hycprob[str(key_comb)] = {'classes': list(classes), 'classifier': type( - self.classifier).__name__, 'classifier_obj': None, 'serialization': None} + hycprob[str(key_comb)] = { + "classes": list(classes), + "classifier": type(self.classifier).__name__, + "classifier_obj": None, + "serialization": None, + } return {"hybcprob": hycprob} @staticmethod - def choose(node_info: Dict[str, Dict[str, LogitParams]], - pvals: List[Union[str, float]]) -> str: + def choose( + node_info: Dict[str, Dict[str, LogitParams]], pvals: List[Union[str, float]] + ) -> str: """ Return value from ConditionalLogit node params: @@ -108,22 +124,21 @@ def choose(node_info: Dict[str, Dict[str, LogitParams]], else: lgpvals.append(pval) - if any(parent_value == 'nan' for parent_value in dispvals): + if any(parent_value == "nan" for parent_value in dispvals): return np.nan lgdistribution = node_info["hybcprob"][str(dispvals)] # JOBLIB if len(lgdistribution["classes"]) > 1: - if lgdistribution["serialization"] == 'joblib': + if lgdistribution["serialization"] == "joblib": model = joblib.load(lgdistribution["classifier_obj"]) else: # str_model = lgdistribution["classifier_obj"].decode('latin1').replace('\'', '\"') - bytes_model = lgdistribution["classifier_obj"].encode('latin1') + bytes_model = lgdistribution["classifier_obj"].encode("latin1") model = pickle.loads(bytes_model) - distribution = model.predict_proba( - np.array(lgpvals).reshape(1, -1))[0] + distribution = model.predict_proba(np.array(lgpvals).reshape(1, -1))[0] rand = random.random() rindex = 0 @@ -142,8 +157,9 @@ def choose(node_info: Dict[str, Dict[str, LogitParams]], return str(lgdistribution["classes"][0]) @staticmethod - def predict(node_info: Dict[str, Dict[str, LogitParams]], - pvals: List[Union[str, float]]) -> str: + def predict( + node_info: Dict[str, Dict[str, LogitParams]], pvals: List[Union[str, float]] + ) -> str: """ Return value from ConditionalLogit node params: @@ -163,11 +179,11 @@ def predict(node_info: Dict[str, Dict[str, LogitParams]], # JOBLIB if len(lgdistribution["classes"]) > 1: - if lgdistribution["serialization"] == 'joblib': + if lgdistribution["serialization"] == "joblib": model = joblib.load(lgdistribution["classifier_obj"]) else: # str_model = lgdistribution["classifier_obj"].decode('latin1').replace('\'', '\"') - bytes_model = lgdistribution["classifier_obj"].encode('latin1') + bytes_model = lgdistribution["classifier_obj"].encode("latin1") model = pickle.loads(bytes_model) pred = model.predict(np.array(lgpvals).reshape(1, -1))[0] diff --git 
a/bamt/nodes/conditional_mixture_gaussian_node.py b/bamt/nodes/conditional_mixture_gaussian_node.py index 2723eaf..280aada 100644 --- a/bamt/nodes/conditional_mixture_gaussian_node.py +++ b/bamt/nodes/conditional_mixture_gaussian_node.py @@ -18,10 +18,11 @@ class ConditionalMixtureGaussianNode(BaseNode): def __init__(self, name): super(ConditionalMixtureGaussianNode, self).__init__(name) - self.type = 'ConditionalMixtureGaussian' + self.type = "ConditionalMixtureGaussian" def fit_parameters( - self, data: DataFrame) -> Dict[str, Dict[str, CondMixtureGaussParams]]: + self, data: DataFrame + ) -> Dict[str, Dict[str, CondMixtureGaussParams]]: """ Train params for Conditional Mixture Gaussian Node. Return: @@ -48,69 +49,74 @@ def fit_parameters( # 'LRTS')#int((component(new_data, nodes, 'aic') + # component(new_data, nodes, 'bic')) / 2) n_comp = int( - (component( - new_data, - nodes, - 'aic') + - component( - new_data, - nodes, - 'bic')) / - 2) + ( + component(new_data, nodes, "aic") + + component(new_data, nodes, "bic") + ) + / 2 + ) # n_comp = 3 - gmm = GMM( - n_components=n_comp).from_samples( - new_data[nodes].values, - n_iter=500, - init_params='kmeans++') + gmm = GMM(n_components=n_comp).from_samples( + new_data[nodes].values, n_iter=500, init_params="kmeans++" + ) else: # component(new_data, [node], # 'LRTS')#int((component(new_data, [node], 'aic') + # component(new_data, [node], 'bic')) / 2) - n_comp = int((component(new_data, - [self.name], - 'aic') + component(new_data, - [self.name], - 'bic')) / 2) + n_comp = int( + ( + component(new_data, [self.name], "aic") + + component(new_data, [self.name], "bic") + ) + / 2 + ) # n_comp = 3 - gmm = GMM(n_components=n_comp).from_samples(np.transpose( - [new_data[self.name].values]), n_iter=500, init_params='kmeans++') + gmm = GMM(n_components=n_comp).from_samples( + np.transpose([new_data[self.name].values]), + n_iter=500, + init_params="kmeans++", + ) means = gmm.means.tolist() cov = gmm.covariances.tolist() # weigts = np.transpose(gmm.to_responsibilities(np.transpose([new_data[node].values]))) w = gmm.priors.tolist() # [] # for row in weigts: # w.append(np.mean(row)) - hycprob[str(key_comb)] = { - 'covars': cov, 'mean': means, 'coef': w} + hycprob[str(key_comb)] = {"covars": cov, "mean": means, "coef": w} elif new_data.shape[0] != 0: n_comp = 1 gmm = GMM(n_components=n_comp) if self.cont_parents: gmm.from_samples(new_data[nodes].values) else: - gmm.from_samples(np.transpose( - [new_data[self.name].values])) + gmm.from_samples(np.transpose([new_data[self.name].values])) means = gmm.means.tolist() cov = gmm.covariances.tolist() # weigts = np.transpose(gmm.to_responsibilities(np.transpose([new_data[node].values]))) w = gmm.priors.tolist() # [] # for row in weigts: # w.append(np.mean(row)) - hycprob[str(key_comb)] = { - 'covars': cov, 'mean': means, 'coef': w} + hycprob[str(key_comb)] = {"covars": cov, "mean": means, "coef": w} else: if self.cont_parents: hycprob[str(key_comb)] = { - 'covars': np.nan, 'mean': np.nan, 'coef': []} + "covars": np.nan, + "mean": np.nan, + "coef": [], + } else: hycprob[str(key_comb)] = { - 'covars': np.nan, 'mean': np.nan, 'coef': []} + "covars": np.nan, + "mean": np.nan, + "coef": [], + } return {"hybcprob": hycprob} @staticmethod - def choose(node_info: Dict[str, Dict[str, CondMixtureGaussParams]], - pvals: List[Union[str, float]]) -> Optional[float]: + def choose( + node_info: Dict[str, Dict[str, CondMixtureGaussParams]], + pvals: List[Union[str, float]], + ) -> Optional[float]: """ Function to get value from 
ConditionalMixtureGaussian node params: @@ -137,7 +143,8 @@ def choose(node_info: Dict[str, Dict[str, CondMixtureGaussParams]], n_components=n_comp, priors=w, means=mean, - covariances=covariance) + covariances=covariance, + ) cond_gmm = gmm.condition(indexes, [lgpvals]) sample = cond_gmm.sample(1)[0][0] else: @@ -145,18 +152,18 @@ def choose(node_info: Dict[str, Dict[str, CondMixtureGaussParams]], else: n_comp = len(w) gmm = GMM( - n_components=n_comp, - priors=w, - means=mean, - covariances=covariance) + n_components=n_comp, priors=w, means=mean, covariances=covariance + ) sample = gmm.sample(1)[0][0] else: sample = np.nan return sample @staticmethod - def predict(node_info: Dict[str, Dict[str, CondMixtureGaussParams]], - pvals: List[Union[str, float]]) -> Optional[float]: + def predict( + node_info: Dict[str, Dict[str, CondMixtureGaussParams]], + pvals: List[Union[str, float]], + ) -> Optional[float]: """ Function to get prediction from ConditionalMixtureGaussian node params: @@ -184,7 +191,8 @@ def predict(node_info: Dict[str, Dict[str, CondMixtureGaussParams]], n_components=n_comp, priors=w, means=mean, - covariances=covariance) + covariances=covariance, + ) sample = gmm.predict(indexes, [lgpvals])[0][0] else: sample = np.nan diff --git a/bamt/nodes/discrete_node.py b/bamt/nodes/discrete_node.py index 364c106..8d2cbc0 100644 --- a/bamt/nodes/discrete_node.py +++ b/bamt/nodes/discrete_node.py @@ -17,7 +17,7 @@ class DiscreteNode(BaseNode): def __init__(self, name): super(DiscreteNode, self).__init__(name) - self.type = 'Discrete' + self.type = "Discrete" def fit_parameters(self, data: DataFrame, num_workers: int = 1): """ @@ -31,35 +31,31 @@ def fit_parameters(self, data: DataFrame, num_workers: int = 1): def worker(node: Type[BaseNode]) -> DiscreteParams: parents = node.disc_parents + node.cont_parents if not parents: - dist = DiscreteDistribution.from_samples( - data[node.name].values) + dist = DiscreteDistribution.from_samples(data[node.name].values) cprob = list(dict(sorted(dist.items())).values()) - vals = sorted([str(x) - for x in list(dist.parameters[0].keys())]) + vals = sorted([str(x) for x in list(dist.parameters[0].keys())]) else: - dist = DiscreteDistribution.from_samples( - data[node.name].values) - vals = sorted([str(x) - for x in list(dist.parameters[0].keys())]) + dist = DiscreteDistribution.from_samples(data[node.name].values) + vals = sorted([str(x) for x in list(dist.parameters[0].keys())]) dist = ConditionalProbabilityTable.from_samples( - data[parents + [node.name]].values) + data[parents + [node.name]].values + ) params = dist.parameters[0] cprob = dict() for i in range(0, len(params), len(vals)): probs = [] for j in range(i, (i + len(vals))): probs.append(params[j][-1]) - combination = [str(x) for x in params[i][0:len(parents)]] + combination = [str(x) for x in params[i][0 : len(parents)]] cprob[str(combination)] = probs - return {"cprob": cprob, 'vals': vals} + return {"cprob": cprob, "vals": vals} pool = ThreadPoolExecutor(num_workers) future = pool.submit(worker, self) return future.result() @staticmethod - def choose(node_info: Dict[str, Union[float, str]], - pvals: List[str]) -> str: + def choose(node_info: Dict[str, Union[float, str]], pvals: List[str]) -> str: """ Return value from discrete node params: @@ -68,12 +64,12 @@ def choose(node_info: Dict[str, Union[float, str]], """ rindex = 0 random.seed() - vals = node_info['vals'] + vals = node_info["vals"] if not pvals: - dist = node_info['cprob'] + dist = node_info["cprob"] else: # noinspection 
PyTypeChecker - dist = node_info['cprob'][str(pvals)] + dist = node_info["cprob"][str(pvals)] lbound = 0 ubound = 0 rand = random.random() @@ -88,8 +84,7 @@ def choose(node_info: Dict[str, Union[float, str]], return vals[rindex] @staticmethod - def predict(node_info: Dict[str, Union[float, str]], - pvals: List[str]) -> str: + def predict(node_info: Dict[str, Union[float, str]], pvals: List[str]) -> str: """function for prediction based on evidence values in discrete node Args: @@ -100,17 +95,15 @@ def predict(node_info: Dict[str, Union[float, str]], str: prediction """ - vals = node_info['vals'] + vals = node_info["vals"] disct = [] if not pvals: - dist = node_info['cprob'] + dist = node_info["cprob"] else: # noinspection PyTypeChecker - dist = node_info['cprob'][str(pvals)] + dist = node_info["cprob"][str(pvals)] max_value = max(dist) - indices = [ - index for index, - value in enumerate(dist) if value == max_value] + indices = [index for index, value in enumerate(dist) if value == max_value] max_ind = 0 if len(indices) == 1: max_ind = indices[0] diff --git a/bamt/nodes/gaussian_node.py b/bamt/nodes/gaussian_node.py index 294fc09..f72dcf3 100644 --- a/bamt/nodes/gaussian_node.py +++ b/bamt/nodes/gaussian_node.py @@ -27,46 +27,55 @@ def __init__(self, name, regressor: Optional[object] = None): if regressor is None: regressor = linear_model.LinearRegression() self.regressor = regressor - self.type = 'Gaussian' + f" ({type(self.regressor).__name__})" + self.type = "Gaussian" + f" ({type(self.regressor).__name__})" def fit_parameters(self, data: DataFrame) -> GaussianParams: parents = self.cont_parents if parents: self.regressor.fit(data[parents].values, data[self.name].values) predicted_value = self.regressor.predict(data[parents].values) - variance = mse(data[self.name].values, - predicted_value, squared=False) + variance = mse(data[self.name].values, predicted_value, squared=False) serialization = self.choose_serialization(self.regressor) - if serialization == 'pickle': + if serialization == "pickle": ex_b = pickle.dumps(self.regressor, protocol=4) # model_ser = ex_b.decode('latin1').replace('\'', '\"') - model_ser = ex_b.decode('latin1') - return {'mean': np.nan, - 'regressor_obj': model_ser, - 'regressor': type(self.regressor).__name__, - 'variance': variance, - 'serialization': 'pickle'} + model_ser = ex_b.decode("latin1") + return { + "mean": np.nan, + "regressor_obj": model_ser, + "regressor": type(self.regressor).__name__, + "variance": variance, + "serialization": "pickle", + } else: logger_nodes.warning( - f"{self.name}::Pickle failed. BAMT will use Joblib. | " + str(serialization.args[0])) - - path = self.get_path_joblib(node_name=self.name.replace( - ' ', '_'), specific=f"{self.name.replace(' ', '_')}") + f"{self.name}::Pickle failed. BAMT will use Joblib. 
| " + + str(serialization.args[0]) + ) + + path = self.get_path_joblib( + node_name=self.name.replace(" ", "_"), + specific=f"{self.name.replace(' ', '_')}", + ) joblib.dump(self.regressor, path, compress=True, protocol=4) - return {'mean': np.nan, - 'regressor_obj': path, - 'regressor': type(self.regressor).__name__, - 'variance': variance, - 'serialization': 'joblib'} + return { + "mean": np.nan, + "regressor_obj": path, + "regressor": type(self.regressor).__name__, + "variance": variance, + "serialization": "joblib", + } else: mean_base = np.mean(data[self.name].values) variance = np.var(data[self.name].values) - return {'mean': mean_base, - 'regressor_obj': None, - 'regressor': None, - 'variance': variance, - 'serialization': None} + return { + "mean": mean_base, + "regressor_obj": None, + "regressor": None, + "variance": variance, + "serialization": None, + } @staticmethod def choose(node_info: GaussianParams, pvals: List[float]) -> float: @@ -78,21 +87,19 @@ def choose(node_info: GaussianParams, pvals: List[float]) -> float: """ if pvals: for el in pvals: - if str(el) == 'nan': + if str(el) == "nan": return np.nan - if node_info["serialization"] == 'joblib': + if node_info["serialization"] == "joblib": model = joblib.load(node_info["regressor_obj"]) else: - a = node_info["regressor_obj"].encode('latin1') + a = node_info["regressor_obj"].encode("latin1") model = pickle.loads(a) cond_mean = model.predict(np.array(pvals).reshape(1, -1))[0] - var = node_info['variance'] + var = node_info["variance"] return random.gauss(cond_mean, var) else: - return random.gauss( - node_info['mean'], math.sqrt( - node_info['variance'])) + return random.gauss(node_info["mean"], math.sqrt(node_info["variance"])) @staticmethod def predict(node_info: GaussianParams, pvals: List[float]) -> float: @@ -105,15 +112,15 @@ def predict(node_info: GaussianParams, pvals: List[float]) -> float: if pvals: for el in pvals: - if str(el) == 'nan': + if str(el) == "nan": return np.nan - if node_info["serialization"] == 'joblib': + if node_info["serialization"] == "joblib": model = joblib.load(node_info["regressor_obj"]) else: - a = node_info["regressor_obj"].encode('latin1') + a = node_info["regressor_obj"].encode("latin1") model = pickle.loads(a) pred = model.predict(np.array(pvals).reshape(1, -1))[0] return pred else: - return node_info['mean'] + return node_info["mean"] diff --git a/bamt/nodes/logit_node.py b/bamt/nodes/logit_node.py index 27fb5df..5f770eb 100644 --- a/bamt/nodes/logit_node.py +++ b/bamt/nodes/logit_node.py @@ -22,9 +22,10 @@ def __init__(self, name, classifier: Optional[object] = None): super(LogitNode, self).__init__(name) if classifier is None: classifier = linear_model.LogisticRegression( - multi_class='multinomial', solver='newton-cg', max_iter=100) + multi_class="multinomial", solver="newton-cg", max_iter=100 + ) self.classifier = classifier - self.type = 'Logit' + f" ({type(self.classifier).__name__})" + self.type = "Logit" + f" ({type(self.classifier).__name__})" def fit_parameters(self, data: DataFrame) -> LogitParams: model_ser = None @@ -34,25 +35,27 @@ def fit_parameters(self, data: DataFrame) -> LogitParams: self.classifier.fit(data[parents].values, data[self.name].values) serialization = self.choose_serialization(self.classifier) - if serialization == 'pickle': + if serialization == "pickle": ex_b = pickle.dumps(self.classifier, protocol=4) # model_ser = ex_b.decode('latin1').replace('\'', '\"') - model_ser = ex_b.decode('latin1') - serialization_name = 'pickle' + model_ser = 
ex_b.decode("latin1") + serialization_name = "pickle" else: logger_nodes.warning( - f"{self.name}::Pickle failed. BAMT will use Joblib. | " + str(serialization.args[0])) + f"{self.name}::Pickle failed. BAMT will use Joblib. | " + + str(serialization.args[0]) + ) - path = self.get_path_joblib( - self.name, specific=self.name.replace( - ' ', '_')) + path = self.get_path_joblib(self.name, specific=self.name.replace(" ", "_")) joblib.dump(self.classifier, path, compress=True, protocol=4) - serialization_name = 'joblib' - return {'classes': list(self.classifier.classes_), - 'classifier_obj': path or model_ser, - 'classifier': type(self.classifier).__name__, - 'serialization': serialization_name} + serialization_name = "joblib" + return { + "classes": list(self.classifier.classes_), + "classifier_obj": path or model_ser, + "classifier": type(self.classifier).__name__, + "serialization": serialization_name, + } def choose(self, node_info: LogitParams, pvals: List[Union[float]]) -> str: """ @@ -65,14 +68,13 @@ def choose(self, node_info: LogitParams, pvals: List[Union[float]]) -> str: rindex = 0 if len(node_info["classes"]) > 1: - if node_info["serialization"] == 'joblib': + if node_info["serialization"] == "joblib": model = joblib.load(node_info["classifier_obj"]) else: # str_model = node_info["classifier_obj"].decode('latin1').replace('\'', '\"') - a = node_info["classifier_obj"].encode('latin1') + a = node_info["classifier_obj"].encode("latin1") model = pickle.loads(a) - distribution = model.predict_proba( - np.array(pvals).reshape(1, -1))[0] + distribution = model.predict_proba(np.array(pvals).reshape(1, -1))[0] # choose rand = random.random() @@ -80,7 +82,7 @@ def choose(self, node_info: LogitParams, pvals: List[Union[float]]) -> str: ubound = 0 for interval in range(len(node_info["classes"])): ubound += distribution[interval] - if (lbound <= rand and rand < ubound): + if lbound <= rand and rand < ubound: rindex = interval break else: @@ -91,8 +93,7 @@ def choose(self, node_info: LogitParams, pvals: List[Union[float]]) -> str: else: return str(node_info["classes"][0]) - def predict(self, node_info: LogitParams, - pvals: List[Union[float]]) -> str: + def predict(self, node_info: LogitParams, pvals: List[Union[float]]) -> str: """ Return prediction from Logit node params: @@ -101,11 +102,11 @@ def predict(self, node_info: LogitParams, """ if len(node_info["classes"]) > 1: - if node_info["serialization"] == 'joblib': + if node_info["serialization"] == "joblib": model = joblib.load(node_info["classifier_obj"]) else: # str_model = node_info["classifier_obj"].decode('latin1').replace('\'', '\"') - a = node_info["classifier_obj"].encode('latin1') + a = node_info["classifier_obj"].encode("latin1") model = pickle.loads(a) pred = model.predict(np.array(pvals).reshape(1, -1))[0] diff --git a/bamt/nodes/mixture_gaussian_node.py b/bamt/nodes/mixture_gaussian_node.py index 74804fc..93dc724 100644 --- a/bamt/nodes/mixture_gaussian_node.py +++ b/bamt/nodes/mixture_gaussian_node.py @@ -17,7 +17,7 @@ class MixtureGaussianNode(BaseNode): def __init__(self, name): super(MixtureGaussianNode, self).__init__(name) - self.type = 'MixtureGaussian' + self.type = "MixtureGaussian" def fit_parameters(self, data: DataFrame) -> MixtureGaussianParams: """ @@ -25,14 +25,19 @@ def fit_parameters(self, data: DataFrame) -> MixtureGaussianParams: """ parents = self.disc_parents + self.cont_parents if not parents: - n_comp = int((component(data, - [self.name], - 'aic') + component(data, - [self.name], - 'bic')) / 2) # 
component(data, [node], 'LRTS')# + n_comp = int( + ( + component(data, [self.name], "aic") + + component(data, [self.name], "bic") + ) + / 2 + ) # component(data, [node], 'LRTS')# # n_comp = 3 - gmm = GMM(n_components=n_comp).from_samples(np.transpose( - [data[self.name].values]), n_iter=500, init_params='kmeans++') + gmm = GMM(n_components=n_comp).from_samples( + np.transpose([data[self.name].values]), + n_iter=500, + init_params="kmeans++", + ) means = gmm.means.tolist() cov = gmm.covariances.tolist() # weigts = np.transpose(gmm.to_responsibilities(np.transpose([data[node].values]))) @@ -46,34 +51,28 @@ def fit_parameters(self, data: DataFrame) -> MixtureGaussianParams: new_data = data[nodes] new_data.reset_index(inplace=True, drop=True) n_comp = int( - (component( - new_data, - nodes, - 'aic') + - component( - new_data, - nodes, - 'bic')) / - 2) # component(new_data, nodes, 'LRTS')# + ( + component(new_data, nodes, "aic") + + component(new_data, nodes, "bic") + ) + / 2 + ) # component(new_data, nodes, 'LRTS')# # n_comp = 3 - gmm = GMM( - n_components=n_comp).from_samples( - new_data[nodes].values, - n_iter=500, - init_params='kmeans++') + gmm = GMM(n_components=n_comp).from_samples( + new_data[nodes].values, n_iter=500, init_params="kmeans++" + ) means = gmm.means.tolist() cov = gmm.covariances.tolist() # weigts = np.transpose(gmm.to_responsibilities(new_data[nodes].values)) w = gmm.priors.tolist() # [] # for row in weigts: # w.append(np.mean(row)) - return {"mean": means, - "coef": w, - "covars": cov} + return {"mean": means, "coef": w, "covars": cov} @staticmethod - def choose(node_info: MixtureGaussianParams, - pvals: List[Union[str, float]]) -> Optional[float]: + def choose( + node_info: MixtureGaussianParams, pvals: List[Union[str, float]] + ) -> Optional[float]: """ Func to get value from current node node_info: nodes info from distributions @@ -83,7 +82,7 @@ def choose(node_info: MixtureGaussianParams, mean = node_info["mean"] covariance = node_info["covars"] w = node_info["coef"] - n_comp = len(node_info['coef']) + n_comp = len(node_info["coef"]) if n_comp != 0: if pvals: indexes = [i for i in range(1, len(pvals) + 1)] @@ -92,25 +91,25 @@ def choose(node_info: MixtureGaussianParams, n_components=n_comp, priors=w, means=mean, - covariances=covariance) + covariances=covariance, + ) cond_gmm = gmm.condition(indexes, [pvals]) sample = cond_gmm.sample(1)[0][0] else: sample = np.nan else: gmm = GMM( - n_components=n_comp, - priors=w, - means=mean, - covariances=covariance) + n_components=n_comp, priors=w, means=mean, covariances=covariance + ) sample = gmm.sample(1)[0][0] else: sample = np.nan return sample @staticmethod - def predict(node_info: MixtureGaussianParams, - pvals: List[Union[str, float]]) -> Optional[float]: + def predict( + node_info: MixtureGaussianParams, pvals: List[Union[str, float]] + ) -> Optional[float]: """ Func to get prediction from current node node_info: nodes info from distributions @@ -120,7 +119,7 @@ def predict(node_info: MixtureGaussianParams, mean = node_info["mean"] covariance = node_info["covars"] w = node_info["coef"] - n_comp = len(node_info['coef']) + n_comp = len(node_info["coef"]) if n_comp != 0: if pvals: indexes = [i for i in range(1, len(pvals) + 1)] @@ -129,7 +128,8 @@ def predict(node_info: MixtureGaussianParams, n_components=n_comp, priors=w, means=mean, - covariances=covariance) + covariances=covariance, + ) sample = gmm.predict(indexes, [pvals])[0][0] else: sample = np.nan diff --git a/bamt/preprocess/discretization.py 
b/bamt/preprocess/discretization.py index 234081c..43010f7 100644 --- a/bamt/preprocess/discretization.py +++ b/bamt/preprocess/discretization.py @@ -19,11 +19,11 @@ def get_nodes_sign(data: pd.DataFrame) -> dict: nodes_types = get_nodes_type(data) columns_sign = dict() for c in data.columns.to_list(): - if nodes_types[c] == 'cont': + if nodes_types[c] == "cont": if (data[c] < 0).any(): - columns_sign[c] = 'neg' + columns_sign[c] = "neg" else: - columns_sign[c] = 'pos' + columns_sign[c] = "pos" return columns_sign @@ -39,18 +39,22 @@ def get_nodes_type(data: pd.DataFrame) -> dict: """ column_type = dict() for c in data.columns.to_list(): - if (data[c].dtypes == 'float64') | (data[c].dtypes == 'float32'): - column_type[c] = 'cont' - if (data[c].dtypes == 'str') | ( - data[c].dtypes == 'O') | (data[c].dtypes == 'b'): - column_type[c] = 'disc' - if ((data[c].dtypes == 'int64') | (data[c].dtypes == 'int32')): - column_type[c] = 'disc' + if (data[c].dtypes == "float64") | (data[c].dtypes == "float32"): + column_type[c] = "cont" + if ( + (data[c].dtypes == "str") + | (data[c].dtypes == "O") + | (data[c].dtypes == "b") + ): + column_type[c] = "disc" + if (data[c].dtypes == "int64") | (data[c].dtypes == "int32"): + column_type[c] = "disc" return column_type -def discretization(data: pd.DataFrame, method: str, columns: list, - bins: int = 5) -> Tuple[pd.DataFrame, KBinsDiscretizer]: +def discretization( + data: pd.DataFrame, method: str, columns: list, bins: int = 5 +) -> Tuple[pd.DataFrame, KBinsDiscretizer]: """Discretization of continuous parameters Args: @@ -66,18 +70,18 @@ def discretization(data: pd.DataFrame, method: str, columns: list, data = data.dropna() data.reset_index(inplace=True, drop=True) d_data = copy(data) - est = KBinsDiscretizer(n_bins=bins, encode='ordinal') + est = KBinsDiscretizer(n_bins=bins, encode="ordinal") strategy_dict = { - 'equal_intervals': 'uniform', - 'equal_frequency': 'quantile', - 'kmeans': 'kmeans' + "equal_intervals": "uniform", + "equal_frequency": "quantile", + "kmeans": "kmeans", } if method in strategy_dict: est.strategy = strategy_dict[method] data_discrete = est.fit_transform(d_data.loc[:, columns].values) - d_data[columns] = data_discrete.astype('int') + d_data[columns] = data_discrete.astype("int") else: - raise Exception('This discretization method is not supported') + raise Exception("This discretization method is not supported") return d_data, est @@ -98,8 +102,9 @@ def onehot_encoding(data, columns): return d_data, None -def code_categories(data: pd.DataFrame, method: str, - columns: list) -> Tuple[pd.DataFrame, dict]: +def code_categories( + data: pd.DataFrame, method: str, columns: list +) -> Tuple[pd.DataFrame, dict]: """Encoding categorical parameters Args: @@ -113,22 +118,18 @@ def code_categories(data: pd.DataFrame, method: str, """ data = data.dropna() data.reset_index(inplace=True, drop=True) - encoding_func_dict = { - 'label': label_encoding, - 'onehot': onehot_encoding - } + encoding_func_dict = {"label": label_encoding, "onehot": onehot_encoding} if method in encoding_func_dict: d_data, encoder_dict = encoding_func_dict[method](data, columns) else: - raise Exception('This encoding method is not supported') + raise Exception("This encoding method is not supported") return d_data, encoder_dict def inverse_discretization( - data: pd.DataFrame, - columns: list, - discretizer: KBinsDiscretizer) -> pd.DataFrame: + data: pd.DataFrame, columns: list, discretizer: KBinsDiscretizer +) -> pd.DataFrame: """Inverse discretization for numeric params 
Args: @@ -145,8 +146,7 @@ def inverse_discretization( return new_data -def decode(data: pd.DataFrame, columns: list, - encoder_dict: dict) -> pd.DataFrame: +def decode(data: pd.DataFrame, columns: list, encoder_dict: dict) -> pd.DataFrame: """Decoding categorical params to initial labels Args: diff --git a/bamt/preprocess/graph.py b/bamt/preprocess/graph.py index 0ae0516..f598468 100644 --- a/bamt/preprocess/graph.py +++ b/bamt/preprocess/graph.py @@ -39,5 +39,6 @@ def edges_to_dict(edges: list): nodes = nodes_from_edges(edges) parents_dict = defaultdict(list, {node: [] for node in nodes}) parents_dict.update( - {child: parents_dict[child] + [parent] for parent, child in edges}) + {child: parents_dict[child] + [parent] for parent, child in edges} + ) return parents_dict diff --git a/bamt/preprocess/numpy_pandas.py b/bamt/preprocess/numpy_pandas.py index 2b32a8e..94b174e 100644 --- a/bamt/preprocess/numpy_pandas.py +++ b/bamt/preprocess/numpy_pandas.py @@ -1,6 +1,7 @@ import os import sys import inspect + # currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) # parentdir = os.path.dirname(currentdir) # sys.path.insert(0,parentdir) @@ -21,8 +22,11 @@ def loc_to_DataFrame(data: np.array): if data.T.ndim == 1: data = data.T nodes_type = {0: nodes_type[0]} - dtype = {key: 'int64' if value == 'disc' else 'float64' for key, - value in nodes_type.items() if value in ['disc', 'cont']} + dtype = { + key: "int64" if value == "disc" else "float64" + for key, value in nodes_type.items() + if value in ["disc", "cont"] + } df = pd.DataFrame(data).astype(dtype) df.columns = df.columns.map(str) return df @@ -46,13 +50,13 @@ def get_type_numpy(data: np.array): for i, row in enumerate(arr): if row.ndim == 0 or row.T.ndim == 0: row_is_integer = np.issubdtype(row, np.integer) or row.is_integer() - column_type[i] = 'disc' if row_is_integer else 'cont' + column_type[i] = "disc" if row_is_integer else "cont" else: all_row_is_integer = all( - np.issubdtype( - x, np.integer) or x.is_integer() for x in row) - column_type[i] = 'disc' if all_row_is_integer else 'cont' - if column_type[i] not in ['disc', 'cont']: - print('get_type_numpy: Incorrect type of row') + np.issubdtype(x, np.integer) or x.is_integer() for x in row + ) + column_type[i] = "disc" if all_row_is_integer else "cont" + if column_type[i] not in ["disc", "cont"]: + print("get_type_numpy: Incorrect type of row") print(row) return column_type diff --git a/bamt/preprocessors.py b/bamt/preprocessors.py index 4ba3c02..4ab0e6a 100644 --- a/bamt/preprocessors.py +++ b/bamt/preprocessors.py @@ -21,20 +21,22 @@ def get_nodes_types(data): def get_nodes_signs(self, data): return gru.nodes_signs(nodes_types=self.nodes_types, data=data) - def code_categories(self, data: DataFrame, - encoder) -> Tuple[DataFrame, Dict[str, Dict]]: + def code_categories( + self, data: DataFrame, encoder + ) -> Tuple[DataFrame, Dict[str, Dict]]: """Encoding categorical parameters - Args: - data (DataFrame): input dataset - encoder: any object with fit_transform method + Args: + data (DataFrame): input dataset + encoder: any object with fit_transform method - Returns: - pd.DataFrame: output dataset with encoded parameters - dict: dictionary with values and codes - """ - columns = [col for col in data.columns.to_list( - ) if self.nodes_types[col] == 'disc'] + Returns: + pd.DataFrame: output dataset with encoded parameters + dict: dictionary with values and codes + """ + columns = [ + col for col in data.columns.to_list() if self.nodes_types[col] == "disc" 
+ ] df = data.copy() # INPUT DF. Debugging SettingWithCopyWarning if not columns: return df, None @@ -47,25 +49,26 @@ def code_categories(self, data: DataFrame, df[col_name] = encoder.fit_transform(column.values) except TypeError as exc: logger_preprocessor.error( - f"Wrond data types on {col_name} ({df[col_name].dtypes}). Message: {exc}") + f"Wrond data types on {col_name} ({df[col_name].dtypes}). Message: {exc}" + ) try: - mapping = dict( - zip(encoder.classes_, range(len(encoder.classes_)))) + mapping = dict(zip(encoder.classes_, range(len(encoder.classes_)))) encoder_dict[col_name] = mapping except BaseException: pass return df, encoder_dict def discretize(self, data: DataFrame, discretizer) -> tuple: - columns = [col for col in data.columns.to_list( - ) if self.nodes_types[col] == 'cont'] + columns = [ + col for col in data.columns.to_list() if self.nodes_types[col] == "cont" + ] df = data.copy() if not columns: return df, None data = df[columns] data_discrete = discretizer.fit_transform(data.values) - df[columns] = data_discrete.astype('int') + df[columns] = data_discrete.astype("int") return df, discretizer @@ -82,24 +85,27 @@ def __init__(self, pipeline: list): @property def info(self): - return {'types': self.nodes_types, 'signs': self.nodes_signs} + return {"types": self.nodes_types, "signs": self.nodes_signs} def scan(self, data: DataFrame): """ Function to scan data. If something is wrong, it will be send to log file """ columns_cont = [ - col for col in data.columns.to_list() if self.nodes_types[col] == 'cont'] + col for col in data.columns.to_list() if self.nodes_types[col] == "cont" + ] if not columns_cont: logger_preprocessor.info("No one column is continuous") - columns_disc = [col for col in data.columns.to_list( - ) if self.nodes_types[col] in ['disc', 'disc_num']] + columns_disc = [ + col + for col in data.columns.to_list() + if self.nodes_types[col] in ["disc", "disc_num"] + ] if not columns_disc: logger_preprocessor.info("No one column is discrete") - def apply(self, data: DataFrame, - dropna: bool = True) -> Tuple[DataFrame, Dict]: + def apply(self, data: DataFrame, dropna: bool = True) -> Tuple[DataFrame, Dict]: """ Apply pipeline data: data to apply on @@ -116,9 +122,8 @@ def apply(self, data: DataFrame, self.nodes_signs = self.get_nodes_signs(data) self.scan(df) for name, instrument in self.pipeline: - if name == 'encoder': - df, self.coder = self.code_categories( - data=data, encoder=instrument) - if name == 'discretizer': + if name == "encoder": + df, self.coder = self.code_categories(data=data, encoder=instrument) + if name == "discretizer": df, est = self.discretize(data=df, discretizer=instrument) return df, self.coder diff --git a/bamt/redef_HC.py b/bamt/redef_HC.py index 8ac8c85..a523981 100644 --- a/bamt/redef_HC.py +++ b/bamt/redef_HC.py @@ -33,15 +33,16 @@ def hc( - data, - metric='MI', - max_iter=200, - debug=False, - init_nodes=None, - restriction=None, - init_edges=None, - remove_geo_edges=True, - black_list=None): + data, + metric="MI", + max_iter=200, + debug=False, + init_nodes=None, + restriction=None, + init_edges=None, + remove_geo_edges=True, + black_list=None, +): """ Greedy Hill Climbing search proceeds by choosing the move which maximizes the increase in fitness of the @@ -121,11 +122,11 @@ def hc( bn = BayesNet(c_dict) mutual_information = mi_gauss - if metric == 'BIC': + if metric == "BIC": mutual_information = BIC_local - if metric == 'AIC': + if metric == "AIC": mutual_information = AIC_local - if metric == 'LL': + if metric == "LL": 
mutual_information = log_lik_local data = data.values @@ -140,37 +141,41 @@ def hc( max_delta = 0 if debug: - print('ITERATION: ', _iter) + print("ITERATION: ", _iter) ### TEST ARC ADDITIONS ### for u in bn.nodes(): for v in bn.nodes(): - if v not in c_dict[u] and u != v and not would_cause_cycle( - c_dict, u, v) and len(p_dict[v]) != 3: + if ( + v not in c_dict[u] + and u != v + and not would_cause_cycle(c_dict, u, v) + and len(p_dict[v]) != 3 + ): # FOR MMHC ALGORITHM -> Edge Restrictions - if (init_nodes is None or not (v in init_nodes)) and ( - restriction is None or (u, v) in restriction) and ( - black_list is None or not ((u, v) in black_list)): + if ( + (init_nodes is None or not (v in init_nodes)) + and (restriction is None or (u, v) in restriction) + and (black_list is None or not ((u, v) in black_list)) + ): # SCORE FOR 'V' -> gaining a parent # without 'u' as parent old_cols = (v,) + tuple(p_dict[v]) if old_cols not in cache: - cache[old_cols] = mutual_information( - data[:, old_cols]) + cache[old_cols] = mutual_information(data[:, old_cols]) mi_old = cache[old_cols] new_cols = old_cols + (u,) # with'u' as parent if new_cols not in cache: - cache[new_cols] = mutual_information( - data[:, new_cols]) + cache[new_cols] = mutual_information(data[:, new_cols]) mi_new = cache[new_cols] delta_score = nrow * (mi_old - mi_new) if delta_score > max_delta: if debug: - print('Improved Arc Addition: ', (u, v)) - print('Delta Score: ', delta_score) + print("Improved Arc Addition: ", (u, v)) + print("Delta Score: ", delta_score) max_delta = delta_score - max_operation = 'Addition' + max_operation = "Addition" max_arc = (u, v) # ### TEST ARC DELETIONS ### @@ -188,44 +193,42 @@ def hc( mi_new = cache[new_cols] delta_score = nrow * (mi_old - mi_new) - if (delta_score > max_delta): + if delta_score > max_delta: if init_edges is None: if debug: - print('Improved Arc Deletion: ', (u, v)) - print('Delta Score: ', delta_score) + print("Improved Arc Deletion: ", (u, v)) + print("Delta Score: ", delta_score) max_delta = delta_score - max_operation = 'Deletion' + max_operation = "Deletion" max_arc = (u, v) else: if (u, v) in init_edges: if remove_geo_edges: if debug: - print( - 'Improved Arc Deletion: ', (u, v)) - print('Delta Score: ', delta_score) + print("Improved Arc Deletion: ", (u, v)) + print("Delta Score: ", delta_score) max_delta = delta_score - max_operation = 'Deletion' + max_operation = "Deletion" max_arc = (u, v) else: if debug: - print('Improved Arc Deletion: ', (u, v)) - print('Delta Score: ', delta_score) + print("Improved Arc Deletion: ", (u, v)) + print("Delta Score: ", delta_score) max_delta = delta_score - max_operation = 'Deletion' + max_operation = "Deletion" max_arc = (u, v) # ### TEST ARC REVERSALS ### for u in bn.nodes(): for v in bn.nodes(): - if v in c_dict[u] and not would_cause_cycle( - c_dict, v, u, reverse=True) and len( - p_dict[u]) != 3 and ( - init_nodes is None or not ( - u in init_nodes)) and ( - restriction is None or ( - v, u) in restriction) and ( - black_list is None or not ( - (v, u) in black_list)): + if ( + v in c_dict[u] + and not would_cause_cycle(c_dict, v, u, reverse=True) + and len(p_dict[u]) != 3 + and (init_nodes is None or not (u in init_nodes)) + and (restriction is None or (v, u) in restriction) + and (black_list is None or not ((v, u) in black_list)) + ): old_cols = (u,) + tuple(p_dict[v]) # without 'v' as parent if old_cols not in cache: cache[old_cols] = mutual_information(data[:, old_cols]) @@ -249,50 +252,49 @@ def hc( # COMBINED DELTA-SCORES 
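# Reversing an arc changes the parent sets of both endpoints, so the combined
# delta-score below is the sum of the two per-node deltas (delta1 and delta2)
# computed above from the cached local scores.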
delta_score = delta1 + delta2 - if (delta_score > max_delta): + if delta_score > max_delta: if init_edges is None: if debug: - print('Improved Arc Reversal: ', (u, v)) - print('Delta Score: ', delta_score) + print("Improved Arc Reversal: ", (u, v)) + print("Delta Score: ", delta_score) max_delta = delta_score - max_operation = 'Reversal' + max_operation = "Reversal" max_arc = (u, v) else: if (u, v) in init_edges: if remove_geo_edges: if debug: - print( - 'Improved Arc Reversal: ', (u, v)) - print('Delta Score: ', delta_score) + print("Improved Arc Reversal: ", (u, v)) + print("Delta Score: ", delta_score) max_delta = delta_score - max_operation = 'Reversal' + max_operation = "Reversal" max_arc = (u, v) else: if debug: - print('Improved Arc Reversal: ', (u, v)) - print('Delta Score: ', delta_score) + print("Improved Arc Reversal: ", (u, v)) + print("Delta Score: ", delta_score) max_delta = delta_score - max_operation = 'Reversal' + max_operation = "Reversal" max_arc = (u, v) if max_delta != 0: improvement = True u, v = max_arc - if max_operation == 'Addition': + if max_operation == "Addition": if debug: - print('ADDING: ', max_arc, '\n') + print("ADDING: ", max_arc, "\n") c_dict[u].append(v) p_dict[v].append(u) - elif max_operation == 'Deletion': + elif max_operation == "Deletion": if debug: - print('DELETING: ', max_arc, '\n') + print("DELETING: ", max_arc, "\n") c_dict[u].remove(v) p_dict[v].remove(u) - elif max_operation == 'Reversal': + elif max_operation == "Reversal": if debug: - print('REVERSING: ', max_arc, '\n') + print("REVERSING: ", max_arc, "\n") c_dict[u].remove(v) p_dict[v].remove(u) c_dict[v].append(u) @@ -300,13 +302,13 @@ def hc( else: if debug: - print('No Improvement on Iter: ', _iter) + print("No Improvement on Iter: ", _iter) ### TEST FOR MAX ITERATION ### _iter += 1 if _iter > max_iter: if debug: - print('Max Iteration Reached') + print("Max Iteration Reached") break bn = BayesNet(c_dict) diff --git a/bamt/redef_info_scores.py b/bamt/redef_info_scores.py index fb52f70..c4ce4ee 100644 --- a/bamt/redef_info_scores.py +++ b/bamt/redef_info_scores.py @@ -8,23 +8,28 @@ from bamt.preprocess.graph import edges_to_dict -def info_score(edges: list, data: pd.DataFrame, method='LL'): - score_funcs = {'LL': log_lik_local, 'BIC': BIC_local, 'AIC': AIC_local} +def info_score(edges: list, data: pd.DataFrame, method="LL"): + score_funcs = {"LL": log_lik_local, "BIC": BIC_local, "AIC": AIC_local} score = score_funcs.get(method.upper(), BIC_local) parents_dict = edges_to_dict(edges) nodes_with_edges = parents_dict.keys() - scores = [score(data[child_parents].copy(), method) - for var in nodes_with_edges - for child_parents in ([var] + parents_dict[var],)] - scores += [score(data[[var]].copy(), method) for var in - set(data.columns).difference(set(nodes_with_edges))] + scores = [ + score(data[child_parents].copy(), method) + for var in nodes_with_edges + for child_parents in ([var] + parents_dict[var],) + ] + scores += [ + score(data[[var]].copy(), method) + for var in set(data.columns).difference(set(nodes_with_edges)) + ] return sum(scores) ##### INFORMATION-THEORETIC SCORING FUNCTIONS ##### -def log_likelihood(bn, data, method='LL'): + +def log_likelihood(bn, data, method="LL"): """ Determining log-likelihood of the parameters of a Bayesian Network. 
This is a quite simple @@ -89,30 +94,36 @@ def log_likelihood(bn, data, method='LL'): """ NROW = data.shape[0] - mi_scores = [mutual_information(data[:, - (bn.V.index(rv), - ) + tuple([bn.V.index(p) for p in bn.parents(rv)])], - method=method) for rv in bn.nodes()] - ent_scores = [entropy(data[:, bn.V.index(rv)], method=method) - for rv in bn.nodes()] + mi_scores = [ + mutual_information( + data[:, (bn.V.index(rv),) + tuple([bn.V.index(p) for p in bn.parents(rv)])], + method=method, + ) + for rv in bn.nodes() + ] + ent_scores = [entropy(data[:, bn.V.index(rv)], method=method) for rv in bn.nodes()] return NROW * (sum(mi_scores) - sum(ent_scores)) -def log_lik_local(data, method='LL'): +def log_lik_local(data, method="LL"): NROW = data.shape[0] with warnings.catch_warnings(): warnings.simplefilter("ignore") if isinstance(data, pd.DataFrame): - return (NROW * (mutual_information(data, method=method) - - entropy(data.iloc[:, 0], method=method))) + return NROW * ( + mutual_information(data, method=method) + - entropy(data.iloc[:, 0], method=method) + ) elif isinstance(data, pd.Series): return 0.0 elif isinstance(data, np.ndarray): - return (NROW * (mutual_information(data, method=method) - - entropy(data[:, 0], method=method))) + return NROW * ( + mutual_information(data, method=method) + - entropy(data[:, 0], method=method) + ) -def BIC_local(data, method='BIC'): +def BIC_local(data, method="BIC"): NROW = data.shape[0] log_score = log_lik_local(data, method=method) try: @@ -133,15 +144,18 @@ def num_params(data): # Calculate number of parameters for numpy array if isinstance(data, np.ndarray): node_type = get_type_numpy(data) - columns_for_discrete = [param for param, - node in node_type.items() if node == 'cont'] - columns_for_code = [param for param, - node in node_type.items() if node == 'disc'] + columns_for_discrete = [ + param for param, node in node_type.items() if node == "cont" + ] + columns_for_code = [ + param for param, node in node_type.items() if node == "disc" + ] prod = 1 for var in columns_for_code: - prod *= len(np.unique(data[:, var]) - ) if data.ndim != 1 else len(np.unique(data)) + prod *= ( + len(np.unique(data[:, var])) if data.ndim != 1 else len(np.unique(data)) + ) if columns_for_discrete: prod *= len(columns_for_discrete) @@ -152,12 +166,12 @@ def num_params(data): return sys.float_info.max # Raise an error if data type is unexpected - print('Num_params: Unexpected data type') + print("Num_params: Unexpected data type") print(data) return None -def AIC_local(data, method='AIC'): +def AIC_local(data, method="AIC"): log_score = log_lik_local(data, method=method) penalty = num_params(data) return log_score - penalty diff --git a/bamt/utils/EvoUtils.py b/bamt/utils/EvoUtils.py index 488912e..741fe74 100644 --- a/bamt/utils/EvoUtils.py +++ b/bamt/utils/EvoUtils.py @@ -42,18 +42,18 @@ def custom_mutation_add(graph: CustomGraphModel, **kwargs): for _ in range(num_mut): rid = random.choice(range(len(graph.nodes))) random_node = graph.nodes[rid] - other_random_node = graph.nodes[random.choice( - range(len(graph.nodes)))] - nodes_not_cycling = (random_node.descriptive_id not in - [n.descriptive_id for n in ordered_subnodes_hierarchy(other_random_node)] and - other_random_node.descriptive_id not in - [n.descriptive_id for n in ordered_subnodes_hierarchy(random_node)]) + other_random_node = graph.nodes[random.choice(range(len(graph.nodes)))] + nodes_not_cycling = random_node.descriptive_id not in [ + n.descriptive_id for n in ordered_subnodes_hierarchy(other_random_node) + ] and 
other_random_node.descriptive_id not in [ + n.descriptive_id for n in ordered_subnodes_hierarchy(random_node) + ] if nodes_not_cycling: random_node.nodes_from.append(other_random_node) break except Exception as ex: - print(f'Incorrect connection: {ex}') + print(f"Incorrect connection: {ex}") return graph @@ -63,9 +63,11 @@ def custom_mutation_delete(graph: OptGraph, **kwargs): for _ in range(num_mut): rid = random.choice(range(len(graph.nodes))) random_node = graph.nodes[rid] - other_random_node = graph.nodes[random.choice( - range(len(graph.nodes)))] - if random_node.nodes_from is not None and other_random_node in random_node.nodes_from: + other_random_node = graph.nodes[random.choice(range(len(graph.nodes)))] + if ( + random_node.nodes_from is not None + and other_random_node in random_node.nodes_from + ): random_node.nodes_from.remove(other_random_node) break except Exception as ex: @@ -79,9 +81,11 @@ def custom_mutation_reverse(graph: OptGraph, **kwargs): for _ in range(num_mut): rid = random.choice(range(len(graph.nodes))) random_node = graph.nodes[rid] - other_random_node = graph.nodes[random.choice( - range(len(graph.nodes)))] - if random_node.nodes_from is not None and other_random_node in random_node.nodes_from: + other_random_node = graph.nodes[random.choice(range(len(graph.nodes)))] + if ( + random_node.nodes_from is not None + and other_random_node in random_node.nodes_from + ): random_node.nodes_from.remove(other_random_node) other_random_node.nodes_from.append(random_node) break @@ -93,7 +97,7 @@ def custom_mutation_reverse(graph: OptGraph, **kwargs): def has_no_duplicates(graph): _, labels = graph_structure_as_nx_graph(graph) if len(labels.values()) != len(set(labels.values())): - raise ValueError('Custom graph has duplicates') + raise ValueError("Custom graph has duplicates") return True @@ -101,7 +105,7 @@ def has_no_blacklist_edges(graph, blacklist): nx_graph, _ = graph_structure_as_nx_graph(graph) for edge in nx_graph.edges(): if edge in blacklist: - raise ValueError('Graph contains blacklisted edges') + raise ValueError("Graph contains blacklisted edges") return True @@ -109,5 +113,5 @@ def has_only_whitelist_edges(graph, whitelist): nx_graph, _ = graph_structure_as_nx_graph(graph) for edge in nx_graph.edges(): if edge not in whitelist: - raise ValueError('Graph contains non-whitelisted edges') + raise ValueError("Graph contains non-whitelisted edges") return True diff --git a/bamt/utils/GraphUtils.py b/bamt/utils/GraphUtils.py index 1146393..4353e38 100644 --- a/bamt/utils/GraphUtils.py +++ b/bamt/utils/GraphUtils.py @@ -20,53 +20,50 @@ def nodes_types(data: DataFrame) -> Dict[str, str]: column_type = dict() for c in data.columns.to_list(): - disc = ['str', 'O', 'b', 'categorical', 'object', 'bool'] - disc_numerical = ['int32', 'int64'] - cont = ['float32', 'float64'] + disc = ["str", "O", "b", "categorical", "object", "bool"] + disc_numerical = ["int32", "int64"] + cont = ["float32", "float64"] if data[c].dtype.name in disc: - column_type[c] = 'disc' + column_type[c] = "disc" elif data[c].dtype.name in cont: - column_type[c] = 'cont' + column_type[c] = "cont" elif data[c].dtype.name in disc_numerical: - column_type[c] = 'disc_num' + column_type[c] = "disc_num" else: - logger_preprocessor.error( - f'Unsupported data type. Dtype: {data[c].dtypes}') + logger_preprocessor.error(f"Unsupported data type. 
Dtype: {data[c].dtypes}") return column_type def nodes_signs(nodes_types: dict, data: DataFrame) -> Dict[str, str]: """Function to define sign of the node - neg - if node has negative values - pos - if node has only positive values + neg - if node has negative values + pos - if node has only positive values - Args: - data (pd.DataFrame): input dataset + Args: + data (pd.DataFrame): input dataset - Returns: - dict: output dictionary where 'key' - node name and 'value' - sign of data - """ + Returns: + dict: output dictionary where 'key' - node name and 'value' - sign of data + """ if list(nodes_types.keys()) != data.columns.to_list(): logger_preprocessor.error("Nodes_types dictionary is not full.") return columns_sign = dict() for c in data.columns.to_list(): - if nodes_types[c] == 'cont': + if nodes_types[c] == "cont": if (data[c] < 0).any(): - columns_sign[c] = 'neg' + columns_sign[c] = "neg" else: - columns_sign[c] = 'pos' + columns_sign[c] = "pos" return columns_sign def get_descriptor(data) -> Dict[str, Dict[str, str]]: - return {'types': nodes_types(data), - 'signs': nodes_signs(nodes_types(data), data)} + return {"types": nodes_types(data), "signs": nodes_signs(nodes_types(data), data)} -def toporder(nodes: List[Type[BaseNode]], - edges: List[Tuple]) -> List[List[str]]: +def toporder(nodes: List[Type[BaseNode]], edges: List[Tuple]) -> List[List[str]]: """ Function for topological sorting """ diff --git a/bamt/utils/MathUtils.py b/bamt/utils/MathUtils.py index 62073ad..813575d 100644 --- a/bamt/utils/MathUtils.py +++ b/bamt/utils/MathUtils.py @@ -34,8 +34,7 @@ def lrts_comp(data): def mix_norm_cdf(x, weights, means, covars): mcdf = 0.0 for i in range(len(weights)): - mcdf += weights[i] * \ - stats.norm.cdf(x, loc=means[i][0], scale=covars[i][0][0]) + mcdf += weights[i] * stats.norm.cdf(x, loc=means[i][0], scale=covars[i][0][0]) return mcdf @@ -49,8 +48,7 @@ def theoretical_quantile(data, n_comp): d = np.arange(np.min(data), np.max(data), step) for i in d: x.append(i) - q.append(mix_norm_cdf(i, model.weights_, - model.means_, model.covariances_)) + q.append(mix_norm_cdf(i, model.weights_, model.means_, model.covariances_)) return x, q @@ -61,7 +59,7 @@ def quantile_mix(p, vals, q): def probability_mix(val, vals, q): ind = vals.index(min(vals, key=lambda x: abs(x - val))) - return (q[ind]) + return q[ind] def sum_dist(data, vals, q): @@ -86,7 +84,7 @@ def component(data, columns, method): x = np.transpose([data[columns[0]].values]) else: x = data[columns].values - if method == 'aic': + if method == "aic": lowest_aic = np.infty comp_lowest = 0 for i in range(1, max_comp + 1, 1): @@ -98,7 +96,7 @@ def component(data, columns, method): comp_lowest = i n = comp_lowest - if method == 'bic': + if method == "bic": lowest_bic = np.infty comp_lowest = 0 for i in range(1, max_comp + 1, 1): @@ -110,9 +108,9 @@ def component(data, columns, method): comp_lowest = i n = comp_lowest - if method == 'LRTS': + if method == "LRTS": n = lrts_comp(x) - if method == 'quantile': + if method == "quantile": biggest_p = -1 * np.infty comp_biggest = 0 for i in range(1, max_comp, 1): @@ -144,7 +142,7 @@ def get_n_nearest(data, columns, corr=False, number_close=5): close_ind = data[c].sort_values(ascending=False).index.tolist() else: close_ind = data[c].sort_values().index.tolist() - groups.append(close_ind[0:number_close + 1]) + groups.append(close_ind[0 : number_close + 1]) return groups @@ -164,33 +162,28 @@ def get_proximity_matrix(df, proximity_metric) -> pd.DataFrame: encoder = OrdinalEncoder() df_coded = df 
- columnsToEncode = list(df_coded.select_dtypes( - include=['category', 'object'])) + columnsToEncode = list(df_coded.select_dtypes(include=["category", "object"])) - df_coded[columnsToEncode] = encoder.fit_transform( - df_coded[columnsToEncode]) + df_coded[columnsToEncode] = encoder.fit_transform(df_coded[columnsToEncode]) - df_distance = pd.DataFrame(data=np.zeros( - (len(df.columns), len(df.columns))), columns=df.columns) + df_distance = pd.DataFrame( + data=np.zeros((len(df.columns), len(df.columns))), columns=df.columns + ) df_distance.index = df.columns - if proximity_metric == 'MI': + if proximity_metric == "MI": for c1 in df.columns: for c2 in df.columns: - dist = mutual_info_score( - df_coded[c1].values, df_coded[c2].values) + dist = mutual_info_score(df_coded[c1].values, df_coded[c2].values) df_distance.loc[c1, c2] = dist - elif proximity_metric == 'corr': - df_distance = df_coded.corr(method='pearson') + elif proximity_metric == "corr": + df_distance = df_coded.corr(method="pearson") return df_distance -def get_brave_matrix( - df_columns, - proximity_matrix, - n_nearest=5) -> pd.DataFrame: +def get_brave_matrix(df_columns, proximity_matrix, n_nearest=5) -> pd.DataFrame: """Returns matrix Brave coeffitients of the DataFrame, requires proximity measure to be calculated Args: @@ -203,14 +196,16 @@ def get_brave_matrix( brave_matrix: DataFrame of Brave coefficients """ - brave_matrix = pd.DataFrame(data=np.zeros( - (len(df_columns), len(df_columns))), columns=df_columns) + brave_matrix = pd.DataFrame( + data=np.zeros((len(df_columns), len(df_columns))), columns=df_columns + ) brave_matrix.index = df_columns - groups = get_n_nearest(proximity_matrix, df_columns.tolist(), - corr=True, number_close=n_nearest) + groups = get_n_nearest( + proximity_matrix, df_columns.tolist(), corr=True, number_close=n_nearest + ) - counter_zeroer = .0 + counter_zeroer = 0.0 for c1 in df_columns: for c2 in df_columns: @@ -230,9 +225,9 @@ def get_brave_matrix( d += 1 if (a + c) * (b + d) != 0 and (a + b) * (c + d) != 0: - - br = (a * len(groups) + (a + c) * (a + b)) / \ - ((math.sqrt((a + c) * (b + d))) * (math.sqrt((a + b) * (c + d)))) + br = (a * len(groups) + (a + c) * (a + b)) / ( + (math.sqrt((a + c) * (b + d))) * (math.sqrt((a + b) * (c + d))) + ) else: br = (a * len(groups) + (a + c) * (a + b)) / 0.0000000001 brave_matrix.loc[c1, c2] = br @@ -268,10 +263,12 @@ def precision_recall(pred_net: list, true_net: list, decimal=4): pred_len = len(pred_net) true_len = len(true_net) shd = pred_len + true_len - corr_undirected - corr_dir - return {'AP': round(corr_undirected / pred_len, decimal), - 'AR': round(corr_undirected / true_len, decimal), - # 'F1_undir': round(2 * (corr_undirected / pred_len) * (corr_undirected / true_len) / (corr_undirected / pred_len + corr_undirected / true_len), decimal), - 'AHP': round(corr_dir / pred_len, decimal), - 'AHR': round(corr_dir / true_len, decimal), - # 'F1_directed': round(2*(corr_dir/pred_len)*(corr_dir/true_len)/(corr_dir/pred_len+corr_dir/true_len), decimal), - 'SHD': shd} + return { + "AP": round(corr_undirected / pred_len, decimal), + "AR": round(corr_undirected / true_len, decimal), + # 'F1_undir': round(2 * (corr_undirected / pred_len) * (corr_undirected / true_len) / (corr_undirected / pred_len + corr_undirected / true_len), decimal), + "AHP": round(corr_dir / pred_len, decimal), + "AHR": round(corr_dir / true_len, decimal), + # 'F1_directed': round(2*(corr_dir/pred_len)*(corr_dir/true_len)/(corr_dir/pred_len+corr_dir/true_len), decimal), + "SHD": shd, + } 
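For reference, the precision_recall helper reformatted above reports five structural metrics for a learned graph: AP and AR (precision and recall with edge orientation ignored), AHP and AHR (orientation required), and SHD. Below is a minimal, self-contained sketch of those definitions for illustration only; the match counting via set intersections is an assumption, since the hunk above shows only the final formulas, and none of the names below belong to BAMT's API.

def structural_metrics(pred_net: list, true_net: list, decimal: int = 4) -> dict:
    # Directed matches: the edge exists with the same orientation.
    corr_dir = len(set(pred_net) & set(true_net))
    # Undirected matches: the edge exists in either orientation.
    corr_undirected = len(
        {frozenset(e) for e in pred_net} & {frozenset(e) for e in true_net}
    )
    pred_len, true_len = len(pred_net), len(true_net)
    # Structural Hamming distance, mirroring the formula in the hunk above.
    shd = pred_len + true_len - corr_undirected - corr_dir
    return {
        "AP": round(corr_undirected / pred_len, decimal),
        "AR": round(corr_undirected / true_len, decimal),
        "AHP": round(corr_dir / pred_len, decimal),
        "AHR": round(corr_dir / true_len, decimal),
        "SHD": shd,
    }

if __name__ == "__main__":
    true_edges = [("A", "B"), ("B", "C"), ("C", "D")]
    pred_edges = [("A", "B"), ("C", "B"), ("D", "E")]
    # Prints {'AP': 0.6667, 'AR': 0.6667, 'AHP': 0.3333, 'AHR': 0.3333, 'SHD': 3}
    print(structural_metrics(pred_edges, true_edges))

With the edge lists above, orientation-agnostic matching credits both ("A", "B") and ("C", "B"), while strict matching credits only ("A", "B"), which is why AP/AR exceed AHP/AHR in the example output.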
diff --git a/tests/BigbraveBNTest.py b/tests/BigbraveBNTest.py index f19a966..21bb808 100644 --- a/tests/BigbraveBNTest.py +++ b/tests/BigbraveBNTest.py @@ -13,17 +13,17 @@ encoder = preprocessing.LabelEncoder() discretizer = preprocessing.KBinsDiscretizer( - n_bins=5, encode='ordinal', strategy='uniform') + n_bins=5, encode="ordinal", strategy="uniform" +) -p = pp.Preprocessor([('encoder', encoder), ('discretizer', discretizer)]) +p = pp.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) discretized_data, est = p.apply(data_discrete) info = p.info space_restrictor = BigBraveBN() -space_restrictor.set_possible_edges_by_brave( - df=data_discrete) +space_restrictor.set_possible_edges_by_brave(df=data_discrete) ps = space_restrictor.possible_edges @@ -31,23 +31,22 @@ bn_discrete.add_nodes(descriptor=info) -params = {'white_list': ps} -bn_discrete.add_edges(discretized_data, scoring_function=( - 'K2', K2Score), params=params) +params = {"white_list": ps} +bn_discrete.add_edges(discretized_data, scoring_function=("K2", K2Score), params=params) encoder = preprocessing.LabelEncoder() discretizer = preprocessing.KBinsDiscretizer( - n_bins=5, encode='ordinal', strategy='uniform') + n_bins=5, encode="ordinal", strategy="uniform" +) -p = pp.Preprocessor([('encoder', encoder), ('discretizer', discretizer)]) +p = pp.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) discretized_data, est = p.apply(data_continuous) info = p.info space_restrictor = BigBraveBN() -space_restrictor.set_possible_edges_by_brave( - df=data_continuous) +space_restrictor.set_possible_edges_by_brave(df=data_continuous) ps = space_restrictor.possible_edges @@ -55,6 +54,7 @@ bn_continuous.add_nodes(descriptor=info) -params = {'white_list': ps} +params = {"white_list": ps} bn_continuous.add_edges( - discretized_data, scoring_function=('K2', K2Score), params=params) + discretized_data, scoring_function=("K2", K2Score), params=params +) diff --git a/tests/LoadBN.py b/tests/LoadBN.py index f7dac4c..7c7a44f 100644 --- a/tests/LoadBN.py +++ b/tests/LoadBN.py @@ -5,14 +5,23 @@ import json hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[ - ['Tectonic regime', 'Period', 'Lithology', 'Structural setting', - 'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth']] + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +] encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer( - n_bins=5, encode='ordinal', strategy='uniform') +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform") -p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)]) +p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) discretized_data, est = p.apply(hack_data) @@ -21,9 +30,11 @@ bn.add_nodes(info) -structure = [("Tectonic regime", "Structural setting"), - ("Gross", "Netpay"), - ("Lithology", "Permeability")] +structure = [ + ("Tectonic regime", "Structural setting"), + ("Gross", "Netpay"), + ("Lithology", "Permeability"), +] bn.set_structure(edges=structure) diff --git a/tests/MainTest.py b/tests/MainTest.py index 91d1db2..ff7a26b 100644 --- a/tests/MainTest.py +++ b/tests/MainTest.py @@ -6,20 +6,29 @@ from bamt.preprocessors import Preprocessor import bamt.networks as Networks -''' +""" Optional: You can also uncomment print() that you need. 
-''' +""" hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv") -cont_data = hack_data[['Gross', 'Netpay', 'Porosity', - 'Permeability', 'Depth']].dropna() -disc_data = hack_data[['Tectonic regime', 'Period', - 'Lithology', 'Structural setting']].dropna() -hybrid_data = hack_data[['Tectonic regime', 'Period', - 'Lithology', 'Structural setting', - 'Gross', 'Netpay', 'Porosity', - 'Permeability', 'Depth']].dropna() +cont_data = hack_data[["Gross", "Netpay", "Porosity", "Permeability", "Depth"]].dropna() +disc_data = hack_data[ + ["Tectonic regime", "Period", "Lithology", "Structural setting"] +].dropna() +hybrid_data = hack_data[ + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +].dropna() cont_test_data = cont_data[cont_data.columns[:-1]] cont_target = cont_data[cont_data.columns[-1]] @@ -29,27 +38,24 @@ hybrid_target = hybrid_data[hybrid_data.columns[-1]] encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer(n_bins=5, - encode='ordinal', - strategy='uniform') -p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)]) +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform") +p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) # Discrete pipeline discretized_data, _ = p.apply(disc_data) disc_bn = Networks.DiscreteBN() info = p.info disc_bn.add_nodes(info) -disc_bn.add_edges(data=discretized_data, scoring_function=('K2', K2Score)) +disc_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) disc_bn.fit_parameters(data=disc_data) disc_bn.calculate_weights(discretized_data) disc_predicted_values = disc_bn.predict(test=disc_test_data) -disc_predicted_values = pd.DataFrame.from_dict( - disc_predicted_values, orient='columns') +disc_predicted_values = pd.DataFrame.from_dict(disc_predicted_values, orient="columns") synth_disc_data = disc_bn.sample(50) -disc_bn.save('./disc_bn.json') +disc_bn.save("./disc_bn.json") disc_bn2 = Networks.DiscreteBN() -disc_bn2.load('./disc_bn.json') +disc_bn2.load("./disc_bn.json") synth_disc_data2 = disc_bn2.sample(50) # print(disc_bn.weights) # print(disc_bn2.weights) @@ -63,17 +69,16 @@ cont_bn = Networks.ContinuousBN(use_mixture=True) info = p.info cont_bn.add_nodes(info) -cont_bn.add_edges(data=discretized_data, scoring_function=('K2', K2Score)) +cont_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) cont_bn.fit_parameters(data=cont_data) cont_bn.calculate_weights(discretized_data) cont_predicted_values = cont_bn.predict(test=cont_test_data) -cont_predicted_values = pd.DataFrame.from_dict( - cont_predicted_values, orient='columns') +cont_predicted_values = pd.DataFrame.from_dict(cont_predicted_values, orient="columns") synth_cont_data = cont_bn.sample(50) -cont_bn.save('./cont_bn.json') +cont_bn.save("./cont_bn.json") cont_bn2 = Networks.ContinuousBN(use_mixture=True) -cont_bn2.load('./cont_bn.json') +cont_bn2.load("./cont_bn.json") synth_cont_data2 = cont_bn2.sample(50) # print(cont_bn.weights) # print(cont_bn2.weights) @@ -91,21 +96,22 @@ info = p.info hybrid_bn.add_nodes(info) hybrid_bn2.add_nodes(info) -hybrid_bn.add_edges(data=discretized_data, scoring_function=('K2', K2Score)) -hybrid_bn2.add_edges(data=discretized_data, scoring_function=('K2', K2Score)) +hybrid_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) +hybrid_bn2.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) 
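# hybrid_bn and hybrid_bn2 are built with the same data and scoring function;
# below, both are fitted and weighted, hybrid_bn is saved to JSON, and the copy
# reloaded as hybrid_bn3 is sampled alongside them.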
hybrid_bn.fit_parameters(data=hybrid_data) hybrid_bn2.fit_parameters(data=hybrid_data) hybrid_bn.calculate_weights(discretized_data) hybrid_bn2.calculate_weights(discretized_data) hybrid_predicted_values = hybrid_bn.predict(test=hybrid_test_data) hybrid_predicted_values = pd.DataFrame.from_dict( - hybrid_predicted_values, orient='columns') + hybrid_predicted_values, orient="columns" +) synth_hybrid_data = hybrid_bn.sample(50) synth_hybrid_data2 = hybrid_bn2.sample(50) -hybrid_bn.save('./hybrid_bn.json') +hybrid_bn.save("./hybrid_bn.json") hybrid_bn3 = Networks.HybridBN(use_mixture=True) -hybrid_bn3.load('./hybrid_bn.json') +hybrid_bn3.load("./hybrid_bn.json") synth_hybrid_data3 = hybrid_bn3.sample(50) # print(hybrid_bn.weights) # print(hybrid_bn2.weights) @@ -124,9 +130,9 @@ hybrid_bn = Networks.HybridBN(use_mixture=True) info = p.info hybrid_bn.add_nodes(info) -hybrid_bn.add_edges(data=discretized_data, scoring_function=('K2', K2Score)) +hybrid_bn.add_edges(data=discretized_data, scoring_function=("K2", K2Score)) hybrid_bn.fit_parameters(data=hybrid_data) -hybrid_bn.save('./hybrid_bn_without_weights.json') +hybrid_bn.save("./hybrid_bn_without_weights.json") hybrid_bn2 = Networks.HybridBN(use_mixture=True) -hybrid_bn2.load('./hybrid_bn_without_weights.json') +hybrid_bn2.load("./hybrid_bn_without_weights.json") # print(hybrid_bn2.weights) diff --git a/tests/MetricsTest.py b/tests/MetricsTest.py index 74997ca..05dc87f 100644 --- a/tests/MetricsTest.py +++ b/tests/MetricsTest.py @@ -12,15 +12,16 @@ h = pd.read_csv("data/real data/hack_processed_with_rf.csv") cols = [ - 'Tectonic regime', - 'Period', - 'Lithology', - 'Structural setting', - 'Gross', - 'Netpay', - 'Porosity', - 'Permeability', - 'Depth'] + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", +] h = h[cols] print(h.describe()) @@ -29,12 +30,9 @@ print(f"Time elapsed for preparing data: {p2 - p1}") encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer( - n_bins=5, - encode='ordinal', - strategy='quantile') +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile") -p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)]) +p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) # ----------- discrete_data, est = p.apply(h) @@ -42,15 +40,15 @@ bn = Networks.HybridBN(has_logit=True) # all may vary bn.add_nodes(descriptor=info) -bn.add_edges(data=discrete_data, optimizer='HC', scoring_function=('MI',)) +bn.add_edges(data=discrete_data, optimizer="HC", scoring_function=("MI",)) bn.get_info(as_df=False) t1 = time.time() bn.fit_parameters(data=h) t2 = time.time() -print(f'PL elapsed: {t2 - t1}') +print(f"PL elapsed: {t2 - t1}") -columns = ['Lithology', 'Structural setting', 'Porosity', 'Depth'] +columns = ["Lithology", "Structural setting", "Porosity", "Depth"] validY = h[columns].dropna() validX = h.drop(columns, axis=1).dropna() @@ -58,4 +56,4 @@ pred_param = bn.predict(validX, parall_count=3) time_2 = time.time() print(pred_param) -print(f'Predict elapsed: {time_2 - time_1}') +print(f"Predict elapsed: {time_2 - time_1}") diff --git a/tests/NetworksTest.py b/tests/NetworksTest.py index 7d9b7f4..c665b44 100644 --- a/tests/NetworksTest.py +++ b/tests/NetworksTest.py @@ -1,6 +1,7 @@ import json import time import itertools + # import abc import pandas as pd @@ -18,12 +19,14 @@ class NetworkTest(object): - def __init__(self, - directory: str, - verbose: bool = False, - case_id: int = 0, - 
sample_n: int = 500, - sample_tol: float = .6): + def __init__( + self, + directory: str, + verbose: bool = False, + case_id: int = 0, + sample_n: int = 500, + sample_tol: float = 0.6, + ): """ sample_n: number of rows in sample sample_tol: precent of acceptable number of nans. @@ -49,26 +52,23 @@ def test_preprocess(self): if self.case_id == 0: self.discrete_cols = [ - 'Tectonic regime', - 'Period', - 'Lithology', - 'Structural setting'] - self.cont_cols = [ - 'Gross', - 'Netpay', - 'Porosity', - 'Permeability', - 'Depth'] + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + ] + self.cont_cols = ["Gross", "Netpay", "Porosity", "Permeability", "Depth"] self.hybrid_cols = [ - 'Tectonic regime', - 'Period', - 'Lithology', - 'Structural setting', - 'Gross', - 'Netpay', - 'Porosity', - 'Permeability', - 'Depth'] + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] # Base of standards self.base = "hack_" + self.type else: @@ -85,9 +85,10 @@ def test_preprocess(self): encoder = pp.LabelEncoder() discretizer = pp.KBinsDiscretizer( - n_bins=5, encode='ordinal', strategy='uniform') + n_bins=5, encode="ordinal", strategy="uniform" + ) - p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)]) + p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) discretized_data, est = p.apply(data) info = p.info @@ -96,17 +97,14 @@ def test_preprocess(self): assert info == json.load(open(f"{self.base}/hack_info.json")) except AssertionError: failed = True - self.verboseprint( - self._tabularize_output("ERROR", "Bad descriptor") - ) + self.verboseprint(self._tabularize_output("ERROR", "Bad descriptor")) try: assert_frame_equal( discretized_data, - pd.read_csv( - f"{self.base}/hack_data.csv", - index_col=0), - check_dtype=False) + pd.read_csv(f"{self.base}/hack_data.csv", index_col=0), + check_dtype=False, + ) except Exception as ex: failed = True self.verboseprint(self._tabularize_output("ERROR", str(ex))) @@ -157,8 +155,11 @@ def test_predict(self): else: raise Exception("Inner error") - preds = self.bn.predict(test=pd.read_csv(self.directory)[cols[:2]].dropna(), - progress_bar=False, parall_count=2) + preds = self.bn.predict( + test=pd.read_csv(self.directory)[cols[:2]].dropna(), + progress_bar=False, + parall_count=2, + ) # with open(f"{self.base}/hack_predict.json", "r") as f: # p = json.load(f) @@ -166,47 +167,59 @@ def test_predict(self): if self.type == "continuous": # cols: ['Porosity', 'Permeability', 'Depth'] for node in preds.keys(): - right_val = json.load( - open(f"{self.base}/hack_predict.json"))[self.sf][node] - test_val = np.mean( - [mx for mx in preds[node] if not np.isnan(mx)]) - assert np.all(np.isclose(test_val, right_val, rtol=.4) - ), f"Predict failed: {node, right_val, test_val}" + right_val = json.load(open(f"{self.base}/hack_predict.json"))[self.sf][ + node + ] + test_val = np.mean([mx for mx in preds[node] if not np.isnan(mx)]) + assert np.all( + np.isclose(test_val, right_val, rtol=0.4) + ), f"Predict failed: {node, right_val, test_val}" elif self.type == "discrete": # cols: ['Lithology', 'Structural setting'] for node in preds.keys(): test_vals = pd.Series(preds[node]).value_counts().to_dict() for category, right_val in json.load( - open(f"{self.base}/hack_predict.json"))[self.sf][node].items(): + open(f"{self.base}/hack_predict.json") + )[self.sf][node].items(): try: - assert np.all(np.isclose(test_vals[category], right_val, atol=5)), \ - 
f"Predict failed: {node, test_vals[category], right_val}" + assert np.all( + np.isclose(test_vals[category], right_val, atol=5) + ), f"Predict failed: {node, test_vals[category], right_val}" except KeyError as ex: print("Unknown preds category: ", ex.args[0]) continue elif self.type == "hybrid": cont_nodes = [ - node for node in self.bn.nodes_names if self.info["types"][node] == "cont"] + node + for node in self.bn.nodes_names + if self.info["types"][node] == "cont" + ] for node in preds.keys(): if node in cont_nodes: - right_val = json.load( - open(f"{self.base}/hack_predict.json"))[self.sf][node] - test_val = np.mean( - [mx for mx in preds[node] if not np.isnan(mx)]) + right_val = json.load(open(f"{self.base}/hack_predict.json"))[ + self.sf + ][node] + test_val = np.mean([mx for mx in preds[node] if not np.isnan(mx)]) # p[self.sf][node] = test_val s = [right_val, test_val] - assert np.all(np.isclose(min(s), max(s), atol=5, rtol=.6)), \ - f"Predict failed: {node, test_val, right_val}" + assert np.all( + np.isclose(min(s), max(s), atol=5, rtol=0.6) + ), f"Predict failed: {node, test_val, right_val}" else: test_vals = pd.Series(preds[node]).value_counts().to_dict() # p[self.sf][node] = test_vals for category, right_val in json.load( - open(f"{self.base}/hack_predict.json"))[self.sf][node].items(): + open(f"{self.base}/hack_predict.json") + )[self.sf][node].items(): try: - assert np.all(np.isclose(min(test_vals[category], right_val), - max(right_val, test_vals[category]), - atol=100, rtol=.5)), \ - f"Predict failed: {node, test_vals[category], right_val}" + assert np.all( + np.isclose( + min(test_vals[category], right_val), + max(right_val, test_vals[category]), + atol=100, + rtol=0.5, + ) + ), f"Predict failed: {node, test_vals[category], right_val}" except KeyError as ex: print("Unknown preds category: ", ex.args[0]) continue @@ -233,7 +246,7 @@ def use_rules(*args, **kwargs): class TestDiscreteBN(NetworkTest): def __init__(self, **kwargs): super(TestDiscreteBN, self).__init__(**kwargs) - self.type = 'discrete' + self.type = "discrete" def test_structure_learning(self): failed = False @@ -243,22 +256,23 @@ def test_structure_learning(self): try: assert bn.nodes_names == [ - 'Tectonic regime', - 'Period', - 'Lithology', - 'Structural setting'] + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + ] except AssertionError: failed = True self.verboseprint( self._tabularize_output( - "ERROR", - "first stage failed (wrong init nodes).")) + "ERROR", "first stage failed (wrong init nodes)." 
+ ) + ) bn.add_edges(self.data, (self.sf,), progress_bar=False) try: - assert bn.edges == json.load( - open(f"{self.base}/hack_edges.json"))[self.sf] + assert bn.edges == json.load(open(f"{self.base}/hack_edges.json"))[self.sf] except AssertionError: failed = True self.verboseprint(f"Stage 2 failed with {self.sf}.") @@ -278,14 +292,15 @@ def test_parameters_learning(self): self.bn.fit_parameters(pd.read_csv(self.directory)[self.discrete_cols]) try: - assert self.bn.distributions == json.load( - open(f"{self.base}/hack_params.json"))[self.sf] + assert ( + self.bn.distributions + == json.load(open(f"{self.base}/hack_params.json"))[self.sf] + ) except AssertionError: failed = True self.verboseprint( - self._tabularize_output( - f"Parameters ({self.sf})", - "bad distributions")) + self._tabularize_output(f"Parameters ({self.sf})", "bad distributions") + ) if not failed: status = "OK" @@ -303,10 +318,7 @@ def apply(self): t1 = time.time() self.test_structure_learning() if not self.bn: - print( - self._tabularize_output( - f"Error on {sf}", - "No structure")) + print(self._tabularize_output(f"Error on {sf}", "No structure")) print("-" * 8) continue self.test_parameters_learning() @@ -321,32 +333,30 @@ def apply(self): class TestContinuousBN(NetworkTest): def __init__(self, **kwargs): super(TestContinuousBN, self).__init__(**kwargs) - self.type = 'continuous' + self.type = "continuous" def test_setters(self): failed = False bn = Networks.ContinuousBN() ns = [] - for d in [ - Nodes.GaussianNode( - name="Node" + - str(id)) for id in range( - 0, - 4)]: + for d in [Nodes.GaussianNode(name="Node" + str(id)) for id in range(0, 4)]: ns.append(d) bn.set_structure(nodes=ns) bn.set_classifiers( classifiers={ - 'Node0': DecisionTreeClassifier(), - 'Node1': RandomForestClassifier(), - 'Node2': KNeighborsClassifier( - n_neighbors=2)}) - - assert [str(bn[node].classifier) for node in ["Node0", "Node1", "Node2"]] == \ - ["DecisionTreeClassifier()", "RandomForestClassifier()", - "KNeighborsClassifier(n_neighbors=2)"], "Setter | Classifiers are wrong." + "Node0": DecisionTreeClassifier(), + "Node1": RandomForestClassifier(), + "Node2": KNeighborsClassifier(n_neighbors=2), + } + ) + + assert [str(bn[node].classifier) for node in ["Node0", "Node1", "Node2"]] == [ + "DecisionTreeClassifier()", + "RandomForestClassifier()", + "KNeighborsClassifier(n_neighbors=2)", + ], "Setter | Classifiers are wrong." if not failed: status = "OK" @@ -363,20 +373,29 @@ def test_structure_learning(self, use_mixture: bool = False): bn.add_nodes(descriptor=self.info) try: - assert bn.nodes_names == json.load( - open(f"{self.base}/hack_nodes.json"))[f"use_mixture={use_mixture}"][self.sf] + assert ( + bn.nodes_names + == json.load(open(f"{self.base}/hack_nodes.json"))[ + f"use_mixture={use_mixture}" + ][self.sf] + ) except AssertionError: failed = True self.verboseprint( self._tabularize_output( - "ERROR", - "first stage failed (wrong init nodes).")) + "ERROR", "first stage failed (wrong init nodes)." 
+ ) + ) bn.add_edges(self.data, (self.sf,), progress_bar=False) try: - assert bn.edges == json.load( - open(f"{self.base}/hack_edges.json"))[f"use_mixture={use_mixture}"][self.sf] + assert ( + bn.edges + == json.load(open(f"{self.base}/hack_edges.json"))[ + f"use_mixture={use_mixture}" + ][self.sf] + ) except AssertionError: failed = True self.verboseprint(f"Stage 2 failed with {self.sf}.") @@ -389,8 +408,9 @@ def test_structure_learning(self, use_mixture: bool = False): print( self._tabularize_output( - f"Structure ({self.sf}, use_mixture={self.use_mixture})", - status)) + f"Structure ({self.sf}, use_mixture={self.use_mixture})", status + ) + ) def test_parameters_learning(self): failed = False @@ -401,18 +421,26 @@ def test_parameters_learning(self): empty_data = {"mean": [], "covars": [], "coef": []} for k, v in self.bn.distributions.items(): assert all( - [v[obj] != empty for obj, empty in empty_data.items()]), f"Empty data in {k}." - assert .9 <= sum( - v["coef"]) <= 1.1, f"{sum(v['coef'])} || {k}'s: coefs are wrong." + [v[obj] != empty for obj, empty in empty_data.items()] + ), f"Empty data in {k}." + assert ( + 0.9 <= sum(v["coef"]) <= 1.1 + ), f"{sum(v['coef'])} || {k}'s: coefs are wrong." else: - assert self.bn.distributions == json.load(open( - f"{self.base}/hack_params.json"))["use_mixture=False"][self.sf], "Bad distributions." + assert ( + self.bn.distributions + == json.load(open(f"{self.base}/hack_params.json"))[ + "use_mixture=False" + ][self.sf] + ), "Bad distributions." except AssertionError as ex: failed = True self.verboseprint( self._tabularize_output( f"Parameters ({self.sf}, use_mixture={self.use_mixture})", - ex.args[0])) + ex.args[0], + ) + ) if not failed: status = "OK" @@ -421,8 +449,9 @@ def test_parameters_learning(self): print( self._tabularize_output( - f"Parameters ({self.sf}, use_mixture={self.use_mixture})", - status)) + f"Parameters ({self.sf}, use_mixture={self.use_mixture})", status + ) + ) def apply(self): print(f"Executing {self.type} BN tests.") @@ -436,10 +465,7 @@ def apply(self): t1 = time.time() self.test_structure_learning(use_mixture=use_mixture) if not self.bn: - print( - self._tabularize_output( - f"Error on {sf}", - "No structure")) + print(self._tabularize_output(f"Error on {sf}", "No structure")) print("-" * 8) continue self.test_parameters_learning() @@ -454,7 +480,7 @@ def apply(self): class TestHybridBN(NetworkTest): def __init__(self, **kwargs): super(TestHybridBN, self).__init__(**kwargs) - self.type = 'hybrid' + self.type = "hybrid" def test_setters(self): failed = False @@ -462,48 +488,58 @@ def test_setters(self): bn = Networks.HybridBN(has_logit=True) ns = [] for d, g in zip( - [Nodes.GaussianNode(name="Node" + str(id)) for id in range(0, 3)], - [Nodes.DiscreteNode(name="Node" + str(id)) for id in range(3, 6)]): + [Nodes.GaussianNode(name="Node" + str(id)) for id in range(0, 3)], + [Nodes.DiscreteNode(name="Node" + str(id)) for id in range(3, 6)], + ): ns.append(d) ns.append(g) - edges = [('Node0', 'Node3'), ('Node3', 'Node1'), - ('Node1', 'Node4'), ('Node4', 'Node2'), ('Node2', 'Node5')] + edges = [ + ("Node0", "Node3"), + ("Node3", "Node1"), + ("Node1", "Node4"), + ("Node4", "Node2"), + ("Node2", "Node5"), + ] test_info = { - 'types': { - 'Node0': 'cont', - 'Node1': 'cont', - 'Node2': 'cont', - 'Node3': 'disc', - 'Node4': 'disc', - 'Node5': 'disc'}, - 'signs': { - 'Node0': 'pos', - 'Node1': 'pos', - 'Node2': 'pos'}} + "types": { + "Node0": "cont", + "Node1": "cont", + "Node2": "cont", + "Node3": "disc", + "Node4": "disc", + 
"Node5": "disc", + }, + "signs": {"Node0": "pos", "Node1": "pos", "Node2": "pos"}, + } # Structure setter - bn.set_structure(info=test_info, - nodes=ns, - edges=edges) - - assert ['Gaussian (LinearRegression)', 'Logit (LogisticRegression)', 'ConditionalGaussian (LinearRegression)', - 'Logit (LogisticRegression)', 'ConditionalGaussian (LinearRegression)', - 'Logit (LogisticRegression)'] == \ - [node.type for node in bn.nodes], "Setter | Nodes are not the same." + bn.set_structure(info=test_info, nodes=ns, edges=edges) + + assert [ + "Gaussian (LinearRegression)", + "Logit (LogisticRegression)", + "ConditionalGaussian (LinearRegression)", + "Logit (LogisticRegression)", + "ConditionalGaussian (LinearRegression)", + "Logit (LogisticRegression)", + ] == [node.type for node in bn.nodes], "Setter | Nodes are not the same." assert edges == bn.edges, "Setter | Edges are not the same." # Classifiers setters bn.set_classifiers( classifiers={ - 'Node3': DecisionTreeClassifier(), - 'Node4': RandomForestClassifier(), - 'Node5': KNeighborsClassifier( - n_neighbors=2)}) - - assert [str(bn[node].classifier) for node in ["Node3", "Node4", "Node5"]] == \ - ["DecisionTreeClassifier()", "RandomForestClassifier()", - "KNeighborsClassifier(n_neighbors=2)"], "Setter | Classifiers are wrong." + "Node3": DecisionTreeClassifier(), + "Node4": RandomForestClassifier(), + "Node5": KNeighborsClassifier(n_neighbors=2), + } + ) + + assert [str(bn[node].classifier) for node in ["Node3", "Node4", "Node5"]] == [ + "DecisionTreeClassifier()", + "RandomForestClassifier()", + "KNeighborsClassifier(n_neighbors=2)", + ], "Setter | Classifiers are wrong." # Parameters setters @@ -521,9 +557,8 @@ def test_setters(self): print(self._tabularize_output("Setters", status)) def test_structure_learning( - self, - use_mixture: bool = False, - has_logit: bool = False): + self, use_mixture: bool = False, has_logit: bool = False + ): self.use_mixture = use_mixture self.has_logit = has_logit failed = False @@ -532,20 +567,29 @@ def test_structure_learning( bn.add_nodes(descriptor=self.info) try: - assert bn.nodes_names == json.load(open(f"{self.base}/hack_nodes.json"))[ - f"use_mixture={use_mixture}"][f"has_logit={has_logit}"][self.sf] + assert ( + bn.nodes_names + == json.load(open(f"{self.base}/hack_nodes.json"))[ + f"use_mixture={use_mixture}" + ][f"has_logit={has_logit}"][self.sf] + ) except AssertionError: failed = True self.verboseprint( self._tabularize_output( - "ERROR", - "first stage failed (wrong init nodes).")) + "ERROR", "first stage failed (wrong init nodes)." + ) + ) bn.add_edges(self.data, (self.sf,), progress_bar=False) try: - assert bn.edges == json.load(open(f"{self.base}/hack_edges.json"))[ - f"use_mixture={use_mixture}"][f"has_logit={has_logit}"][self.sf] + assert ( + bn.edges + == json.load(open(f"{self.base}/hack_edges.json"))[ + f"use_mixture={use_mixture}" + ][f"has_logit={has_logit}"][self.sf] + ) except AssertionError: failed = True self.verboseprint(f"Stage 2 failed with {self.sf}.") @@ -559,24 +603,27 @@ def test_structure_learning( print( self._tabularize_output( f"Structure ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", - status)) + status, + ) + ) @staticmethod def non_empty_gaussian_nodes(name, node_params): empty_data = {"mean": [], "covars": [], "coef": []} - assert all([node_params[obj] != empty for obj, - empty in empty_data.items()]), f"Empty data in {name}." + assert all( + [node_params[obj] != empty for obj, empty in empty_data.items()] + ), f"Empty data in {name}." 
@staticmethod def non_empty_logit_nodes(name, node_params): empty_data = {"classes": [], "classifier_obj": None} - assert all([node_params[obj] != empty for obj, - empty in empty_data.items()]), f"Empty data in {name}." + assert all( + [node_params[obj] != empty for obj, empty in empty_data.items()] + ), f"Empty data in {name}." @staticmethod def sum_equals_to_1(name, node_params): - assert .9 <= sum( - node_params["coef"]) <= 1.1, f"{name}'s: coefs are wrong." + assert 0.9 <= sum(node_params["coef"]) <= 1.1, f"{name}'s: coefs are wrong." def _validate_node(self, name, type, node_params, true_vals): try: @@ -585,32 +632,34 @@ def _validate_node(self, name, type, node_params, true_vals): self.non_empty_gaussian_nodes, self.sum_equals_to_1, name=name, - node_params=node_params) + node_params=node_params, + ) elif type == "ConditionalMixtureGaussian": for comb, data in node_params["hybcprob"].items(): self.use_rules( self.non_empty_gaussian_nodes, self.sum_equals_to_1, name=name, - node_params=data) + node_params=data, + ) elif type.startswith("Logit"): self.use_rules( - self.non_empty_logit_nodes, - name=name, - node_params=node_params) + self.non_empty_logit_nodes, name=name, node_params=node_params + ) elif type.startswith("ConditionalLogit"): for comb, data in node_params["hybcprob"].items(): self.use_rules( - self.non_empty_logit_nodes, - name=name, - node_params=data) + self.non_empty_logit_nodes, name=name, node_params=data + ) else: assert node_params == true_vals, f"Parameters error on {name}, {type}" except AssertionError as ex: self.verboseprint( self._tabularize_output( f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", - ex.args[0])) + ex.args[0], + ) + ) def test_parameters_learning(self): failed = False @@ -618,7 +667,8 @@ def test_parameters_learning(self): self.bn.fit_parameters(pd.read_csv(self.directory)[self.hybrid_cols]) try: true_params = json.load(open(f"{self.base}/hack_params.json"))[ - f"use_mixture={self.use_mixture}"][f"has_logit={self.has_logit}"][self.sf] + f"use_mixture={self.use_mixture}" + ][f"has_logit={self.has_logit}"][self.sf] node_type_dict = {node.name: node.type for node in self.bn.nodes} for name, type in node_type_dict.items(): @@ -629,7 +679,9 @@ def test_parameters_learning(self): self.verboseprint( self._tabularize_output( f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", - ex.args[0])) + ex.args[0], + ) + ) if not failed: status = "OK" @@ -639,7 +691,9 @@ def test_parameters_learning(self): print( self._tabularize_output( f"Parameters ({self.sf}, use_mixture={self.use_mixture}, has_logit={self.has_logit})", - status)) + status, + ) + ) def apply(self): print(f"Executing {self.type} BN tests.") @@ -647,19 +701,16 @@ def apply(self): self.test_setters() t0 = time.time() - for use_mixture, has_logit in itertools.product( - [True, False], repeat=2): + for use_mixture, has_logit in itertools.product([True, False], repeat=2): for sf in ["MI", "K2", "BIC"]: self.sf = sf t1 = time.time() self.test_structure_learning( - use_mixture=use_mixture, has_logit=has_logit) + use_mixture=use_mixture, has_logit=has_logit + ) self.test_parameters_learning() if not self.bn: - print( - self._tabularize_output( - f"Error on {sf}", - "No structure")) + print(self._tabularize_output(f"Error on {sf}", "No structure")) print("-" * 8) continue self.test_sampling() diff --git a/tests/SaveBN.py b/tests/SaveBN.py index 04b3042..8fb2401 100644 --- a/tests/SaveBN.py +++ b/tests/SaveBN.py @@ -2,17 +2,27 @@ 
import pandas as pd from sklearn import preprocessing as pp import bamt.networks as Networks + # import json hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[ - ['Tectonic regime', 'Period', 'Lithology', 'Structural setting', - 'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth']] + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +] encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer( - n_bins=5, encode='ordinal', strategy='uniform') +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform") -p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)]) +p = Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) discretized_data, est = p.apply(hack_data) @@ -21,9 +31,11 @@ bn.add_nodes(info) -structure = [("Tectonic regime", "Structural setting"), - ("Gross", "Netpay"), - ("Lithology", "Permeability")] +structure = [ + ("Tectonic regime", "Structural setting"), + ("Gross", "Netpay"), + ("Lithology", "Permeability"), +] bn.set_structure(edges=structure) diff --git a/tests/main.py b/tests/main.py index 520b62e..bc1904a 100644 --- a/tests/main.py +++ b/tests/main.py @@ -5,15 +5,17 @@ import traceback # Print only errors -logging.getLogger('preprocessor').setLevel(logging.ERROR) +logging.getLogger("preprocessor").setLevel(logging.ERROR) if __name__ == "__main__": t0 = time.time() dir = r"../data/real data/hack_processed_with_rf.csv" - tests = [TestHybridBN(directory=dir), - TestDiscreteBN(directory=dir), - TestContinuousBN(directory=dir)] + tests = [ + TestHybridBN(directory=dir), + TestDiscreteBN(directory=dir), + TestContinuousBN(directory=dir), + ] for test in tests: try: diff --git a/tests/sendingClassifiersLogit.py b/tests/sendingClassifiersLogit.py index 5045077..a0c3613 100644 --- a/tests/sendingClassifiersLogit.py +++ b/tests/sendingClassifiersLogit.py @@ -11,17 +11,23 @@ import pandas as pd hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv")[ - ['Tectonic regime', 'Period', 'Lithology', 'Structural setting', - 'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth']] + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +] encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer( - n_bins=5, - encode='ordinal', - strategy='quantile') +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile") -p = preprocessors.Preprocessor( - [('encoder', encoder), ('discretizer', discretizer)]) +p = preprocessors.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) discretized_data, est = p.apply(hack_data) @@ -40,9 +46,13 @@ bn.add_nodes(info) bn.add_edges(discretized_data, scoring_function=("BIC",), progress_bar=False) -bn.set_classifiers(classifiers={'Structural setting': DecisionTreeClassifier(), - 'Lithology': RandomForestClassifier(), - 'Period': KNeighborsClassifier(n_neighbors=2)}) +bn.set_classifiers( + classifiers={ + "Structural setting": DecisionTreeClassifier(), + "Lithology": RandomForestClassifier(), + "Period": KNeighborsClassifier(n_neighbors=2), + } +) bn.fit_parameters(hack_data) diff --git a/tests/sendingRegressors.py b/tests/sendingRegressors.py index 0168661..35f23a1 100644 --- a/tests/sendingRegressors.py +++ b/tests/sendingRegressors.py @@ -12,17 +12,23 @@ import pandas as pd hack_data = pd.read_csv("../data/real data/hack_processed_with_rf.csv")[ - 
['Tectonic regime', 'Period', 'Lithology', 'Structural setting', - 'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth']] + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] +] encoder = pp.LabelEncoder() -discretizer = pp.KBinsDiscretizer( - n_bins=5, - encode='ordinal', - strategy='quantile') +discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile") -p = preprocessors.Preprocessor( - [('encoder', encoder), ('discretizer', discretizer)]) +p = preprocessors.Preprocessor([("encoder", encoder), ("discretizer", discretizer)]) discretized_data, est = p.apply(hack_data) @@ -42,11 +48,11 @@ bn.set_regressor( regressors={ - 'Depth': CatBoostRegressor( - logging_level="Silent", - allow_writing_files=False), - 'Gross': RandomForestRegressor(), - 'Porosity': DecisionTreeRegressor()}) + "Depth": CatBoostRegressor(logging_level="Silent", allow_writing_files=False), + "Gross": RandomForestRegressor(), + "Porosity": DecisionTreeRegressor(), + } +) bn.fit_parameters(hack_data) diff --git a/tests/test_builders.py b/tests/test_builders.py index d117e31..491a9ca 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -21,322 +21,713 @@ class TestStructureBuilder(unittest.TestCase): - def setUp(self): self.data = pd.DataFrame(columns=["Node0", "Node1", "Node2"]) - self.descriptor = {"types": {"Node0": "cont", - "Node1": "disc", - "Node2": "disc_num"}, - "signs": {"Node0": "pos"}} + self.descriptor = { + "types": {"Node0": "cont", "Node1": "disc", "Node2": "disc_num"}, + "signs": {"Node0": "pos"}, + } self.SB = StructureBuilder(descriptor=self.descriptor) def test_restrict(self): self.SB.has_logit = True - self.SB.restrict(data=self.data, - init_nodes=None, - bl_add=None) + self.SB.restrict(data=self.data, init_nodes=None, bl_add=None) self.assertEqual(self.SB.black_list, [], msg="Restrict wrong edges.") # --------- self.SB.has_logit = False - self.SB.restrict(data=self.data, - init_nodes=None, - bl_add=None) + self.SB.restrict(data=self.data, init_nodes=None, bl_add=None) self.assertEqual( - self.SB.black_list, [ - ('Node0', 'Node1'), ('Node0', 'Node2')], msg="Restricted edges are allowed.") + self.SB.black_list, + [("Node0", "Node1"), ("Node0", "Node2")], + msg="Restricted edges are allowed.", + ) def test_get_family(self): self.assertIsNone(self.SB.get_family()) - self.SB.skeleton['V'] = [GaussianNode(name="Node0"), - DiscreteNode(name="Node1"), - DiscreteNode(name="Node2")] + self.SB.skeleton["V"] = [ + GaussianNode(name="Node0"), + DiscreteNode(name="Node1"), + DiscreteNode(name="Node2"), + ] self.assertIsNone(self.SB.get_family()) # Note that the method get_family is not supposed to be used by user (only developer), # so we don't cover a case with restricted edges here (we did this in # the previous test). 
- self.SB.skeleton['E'] = [("Node1", "Node0"), ("Node2", "Node1"), - ("Node2", "Node0")] + self.SB.skeleton["E"] = [ + ("Node1", "Node0"), + ("Node2", "Node1"), + ("Node2", "Node0"), + ] self.SB.get_family() # Node: [[cont_parents], [disc_parents], [children]] data = [ - [[], [], ['Node1', 'Node0']], - [[], ['Node2'], ['Node0']], - [[], ['Node1', 'Node2'], []] + [[], [], ["Node1", "Node0"]], + [[], ["Node2"], ["Node0"]], + [[], ["Node1", "Node2"], []], ] for node_nummer in range(3): - self.assertEqual(self.SB.skeleton["V"][node_nummer].cont_parents, - data[node_nummer][0]) - self.assertEqual(self.SB.skeleton["V"][node_nummer].disc_parents, - data[node_nummer][1]) - self.assertEqual(self.SB.skeleton["V"][node_nummer].children, - data[node_nummer][2]) + self.assertEqual( + self.SB.skeleton["V"][node_nummer].cont_parents, data[node_nummer][0] + ) + self.assertEqual( + self.SB.skeleton["V"][node_nummer].disc_parents, data[node_nummer][1] + ) + self.assertEqual( + self.SB.skeleton["V"][node_nummer].children, data[node_nummer][2] + ) class TestVerticesDefiner(unittest.TestCase): - def setUp(self): - self.descriptor = {"types": {"Node0": "cont", - "Node1": "cont", - "Node2": "cont", - "Node3": "cont", - "Node4": "disc", - "Node5": "disc", - "Node6": "disc_num", - "Node7": "disc_num"}, - "signs": {"Node0": "pos", "Node1": "neg"}} - - self.VD = VerticesDefiner( - descriptor=self.descriptor, regressor=None) + self.descriptor = { + "types": { + "Node0": "cont", + "Node1": "cont", + "Node2": "cont", + "Node3": "cont", + "Node4": "disc", + "Node5": "disc", + "Node6": "disc_num", + "Node7": "disc_num", + }, + "signs": {"Node0": "pos", "Node1": "neg"}, + } + + self.VD = VerticesDefiner(descriptor=self.descriptor, regressor=None) def test_first_level(self): self.assertEqual( - self.VD.vertices, [ - GaussianNode( - name="Node0"), GaussianNode( - name="Node1"), GaussianNode( - name="Node2"), GaussianNode( - name="Node3"), DiscreteNode( - name="Node4"), DiscreteNode( - name="Node5"), DiscreteNode( - name="Node6"), DiscreteNode( - name="Node7")]) + self.VD.vertices, + [ + GaussianNode(name="Node0"), + GaussianNode(name="Node1"), + GaussianNode(name="Node2"), + GaussianNode(name="Node3"), + DiscreteNode(name="Node4"), + DiscreteNode(name="Node5"), + DiscreteNode(name="Node6"), + DiscreteNode(name="Node7"), + ], + ) def test_overwrite_vetrex(self): - self.assertEqual(self.VD.skeleton, {'V': [], 'E': []}) + self.assertEqual(self.VD.skeleton, {"V": [], "E": []}) def reload(): - self.VD.skeleton['V'] = self.VD.vertices - self.VD.skeleton['E'] = [ - ("Node0", - "Node7"), - ("Node0", - "Node1"), - ("Node0", - "Node2"), - ("Node0", - "Node5"), - ("Node4", - "Node2"), - ("Node4", - "Node5"), - ("Node4", - "Node6"), - ("Node4", - "Node3")] + self.VD.skeleton["V"] = self.VD.vertices + self.VD.skeleton["E"] = [ + ("Node0", "Node7"), + ("Node0", "Node1"), + ("Node0", "Node2"), + ("Node0", "Node5"), + ("Node4", "Node2"), + ("Node4", "Node5"), + ("Node4", "Node6"), + ("Node4", "Node3"), + ] self.VD.get_family() data = { "True, True": { - 'Node0': 'MixtureGaussian', - 'Node4': 'Discrete', - 'Node7': 'Logit (LogisticRegression)', - 'Node1': 'MixtureGaussian', - 'Node2': 'ConditionalMixtureGaussian', - 'Node5': 'ConditionalLogit (LogisticRegression)', - 'Node6': 'Discrete', - 'Node3': 'ConditionalMixtureGaussian'}, + "Node0": "MixtureGaussian", + "Node4": "Discrete", + "Node7": "Logit (LogisticRegression)", + "Node1": "MixtureGaussian", + "Node2": "ConditionalMixtureGaussian", + "Node5": "ConditionalLogit 
(LogisticRegression)", + "Node6": "Discrete", + "Node3": "ConditionalMixtureGaussian", + }, "True, False": { - 'Node0': 'MixtureGaussian', - 'Node4': 'Discrete', - 'Node7': 'Discrete', - 'Node1': 'MixtureGaussian', - 'Node2': 'ConditionalMixtureGaussian', - 'Node5': 'Discrete', - 'Node6': 'Discrete', - 'Node3': 'ConditionalMixtureGaussian'}, + "Node0": "MixtureGaussian", + "Node4": "Discrete", + "Node7": "Discrete", + "Node1": "MixtureGaussian", + "Node2": "ConditionalMixtureGaussian", + "Node5": "Discrete", + "Node6": "Discrete", + "Node3": "ConditionalMixtureGaussian", + }, "False, True": { - 'Node0': 'Gaussian (LinearRegression)', - 'Node4': 'Discrete', - 'Node7': 'Logit (LogisticRegression)', - 'Node1': 'Gaussian (LinearRegression)', - 'Node2': 'ConditionalGaussian (LinearRegression)', - 'Node5': 'ConditionalLogit (LogisticRegression)', - 'Node6': 'Discrete', - 'Node3': 'ConditionalGaussian (LinearRegression)'}, + "Node0": "Gaussian (LinearRegression)", + "Node4": "Discrete", + "Node7": "Logit (LogisticRegression)", + "Node1": "Gaussian (LinearRegression)", + "Node2": "ConditionalGaussian (LinearRegression)", + "Node5": "ConditionalLogit (LogisticRegression)", + "Node6": "Discrete", + "Node3": "ConditionalGaussian (LinearRegression)", + }, "False, False": { - 'Node0': 'Gaussian (LinearRegression)', - 'Node4': 'Discrete', - 'Node7': 'Discrete', - 'Node1': 'Gaussian (LinearRegression)', - 'Node2': 'ConditionalGaussian (LinearRegression)', - 'Node5': 'Discrete', - 'Node6': 'Discrete', - 'Node3': 'ConditionalGaussian (LinearRegression)'}} - - for use_mixture, has_logit in itertools.product( - [True, False], repeat=2): + "Node0": "Gaussian (LinearRegression)", + "Node4": "Discrete", + "Node7": "Discrete", + "Node1": "Gaussian (LinearRegression)", + "Node2": "ConditionalGaussian (LinearRegression)", + "Node5": "Discrete", + "Node6": "Discrete", + "Node3": "ConditionalGaussian (LinearRegression)", + }, + } + + for use_mixture, has_logit in itertools.product([True, False], repeat=2): reload() self.VD.overwrite_vertex( has_logit=has_logit, use_mixture=use_mixture, classifier=None, - regressor=None) + regressor=None, + ) self.assertEqual( - { - node.name: node.type for node in self.VD.skeleton["V"]}, + {node.name: node.type for node in self.VD.skeleton["V"]}, data[f"{use_mixture}, {has_logit}"], - msg=f"failed on use_mixture={use_mixture} and has_logit={has_logit}") + msg=f"failed on use_mixture={use_mixture} and has_logit={has_logit}", + ) class TestHillClimbDefiner(unittest.TestCase): def setUp(self): - self.descriptor = {'signs': {'Depth': 'pos', - 'Gross': 'pos', - 'Netpay': 'pos', - 'Permeability': 'pos', - 'Porosity': 'pos'}, - 'types': {'Depth': 'cont', - 'Gross': 'cont', - 'Lithology': 'disc', - 'Netpay': 'cont', - 'Period': 'disc', - 'Permeability': 'cont', - 'Porosity': 'cont', - 'Structural setting': 'disc', - 'Tectonic regime': 'disc'}} + self.descriptor = { + "signs": { + "Depth": "pos", + "Gross": "pos", + "Netpay": "pos", + "Permeability": "pos", + "Porosity": "pos", + }, + "types": { + "Depth": "cont", + "Gross": "cont", + "Lithology": "disc", + "Netpay": "cont", + "Period": "disc", + "Permeability": "cont", + "Porosity": "cont", + "Structural setting": "disc", + "Tectonic regime": "disc", + }, + } self.data = { - 'Tectonic regime': [0, 1, 4, 4, 0, 2, 0, 0, 0, 0, 3, 1, 0, 3, 0, 1, 4, 0, 4, 3, 4, 0, 1, 1, 1, 0, 1, 1, 1, - 1, 1, 0, 0, 3, 2, 3, 2, 3, 3, 3, 0], - 'Period': [3, 1, 4, 4, 1, 1, 0, 0, 3, 5, 3, 9, 0, 5, 0, 3, 5, 3, 2, 4, 4, 1, 5, 7, 7, 7, 1, 1, 1, 1, 4, 6, - 8, 4, 4, 5, 
4, 7, 5, 5, 0], - 'Lithology': [2, 4, 6, 4, 2, 2, 2, 2, 4, 4, 4, 4, 1, 4, 1, 4, 4, 4, 5, 3, 2, 2, 2, 4, 1, 1, 3, 4, 4, 4, 4, - 2, 0, 3, 4, 4, 4, 4, 4, 4, 2], - 'Structural setting': [2, 6, 10, 10, 7, 5, 8, 8, 2, 2, 6, 6, 3, 7, 3, 6, 10, 9, 3, 0, 0, 7, 6, 6, 6, 7, 6, - 6, 6, 6, 8, 2, 9, 4, 7, 6, 1, 8, 4, 4, 3], - 'Gross': [1, 3, 1, 3, 1, 0, 2, 3, 0, 4, 4, 4, 0, 3, 0, 0, 3, 4, 0, 4, 3, 2, 2, 4, 0, 4, 1, 2, 2, 4, 2, 4, 3, - 1, 1, 1, 2, 3, 0, 2, 1], - 'Netpay': [3, 2, 1, 4, 2, 0, 2, 2, 1, 4, 3, 4, 0, 3, 1, 1, 0, 4, 1, 3, 4, 3, 3, 4, 0, 4, 0, 1, 2, 4, 2, 3, - 2, 1, 2, 0, 2, 4, 1, 3, 0], - 'Porosity': [3, 0, 4, 3, 3, 1, 0, 0, 3, 0, 2, 1, 2, 3, 0, 2, 3, 0, 0, 4, 2, 4, 2, 2, 1, 1, 1, 3, 3, 2, 4, 3, - 1, 4, 4, 4, 3, 1, 4, 4, 0], - 'Permeability': [4, 0, 3, 3, 2, 1, 1, 1, 1, 0, 4, 4, 1, 3, 1, 4, 3, 0, 0, 3, 0, 1, 2, 0, 2, 2, 1, 2, 3, 4, - 3, 2, 2, 2, 4, 4, 3, 0, 4, 4, 0], - 'Depth': [1, 4, 3, 4, 1, 3, 1, 3, 1, 4, 3, 4, 1, 2, 1, 4, 0, 4, 0, 0, 3, 2, 3, 2, 2, 3, 4, 2, 2, 4, 1, 0, 2, - 0, 4, 0, 1, 2, 0, 0, 3]} + "Tectonic regime": [ + 0, + 1, + 4, + 4, + 0, + 2, + 0, + 0, + 0, + 0, + 3, + 1, + 0, + 3, + 0, + 1, + 4, + 0, + 4, + 3, + 4, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 3, + 2, + 3, + 2, + 3, + 3, + 3, + 0, + ], + "Period": [ + 3, + 1, + 4, + 4, + 1, + 1, + 0, + 0, + 3, + 5, + 3, + 9, + 0, + 5, + 0, + 3, + 5, + 3, + 2, + 4, + 4, + 1, + 5, + 7, + 7, + 7, + 1, + 1, + 1, + 1, + 4, + 6, + 8, + 4, + 4, + 5, + 4, + 7, + 5, + 5, + 0, + ], + "Lithology": [ + 2, + 4, + 6, + 4, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + 4, + 1, + 4, + 1, + 4, + 4, + 4, + 5, + 3, + 2, + 2, + 2, + 4, + 1, + 1, + 3, + 4, + 4, + 4, + 4, + 2, + 0, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 2, + ], + "Structural setting": [ + 2, + 6, + 10, + 10, + 7, + 5, + 8, + 8, + 2, + 2, + 6, + 6, + 3, + 7, + 3, + 6, + 10, + 9, + 3, + 0, + 0, + 7, + 6, + 6, + 6, + 7, + 6, + 6, + 6, + 6, + 8, + 2, + 9, + 4, + 7, + 6, + 1, + 8, + 4, + 4, + 3, + ], + "Gross": [ + 1, + 3, + 1, + 3, + 1, + 0, + 2, + 3, + 0, + 4, + 4, + 4, + 0, + 3, + 0, + 0, + 3, + 4, + 0, + 4, + 3, + 2, + 2, + 4, + 0, + 4, + 1, + 2, + 2, + 4, + 2, + 4, + 3, + 1, + 1, + 1, + 2, + 3, + 0, + 2, + 1, + ], + "Netpay": [ + 3, + 2, + 1, + 4, + 2, + 0, + 2, + 2, + 1, + 4, + 3, + 4, + 0, + 3, + 1, + 1, + 0, + 4, + 1, + 3, + 4, + 3, + 3, + 4, + 0, + 4, + 0, + 1, + 2, + 4, + 2, + 3, + 2, + 1, + 2, + 0, + 2, + 4, + 1, + 3, + 0, + ], + "Porosity": [ + 3, + 0, + 4, + 3, + 3, + 1, + 0, + 0, + 3, + 0, + 2, + 1, + 2, + 3, + 0, + 2, + 3, + 0, + 0, + 4, + 2, + 4, + 2, + 2, + 1, + 1, + 1, + 3, + 3, + 2, + 4, + 3, + 1, + 4, + 4, + 4, + 3, + 1, + 4, + 4, + 0, + ], + "Permeability": [ + 4, + 0, + 3, + 3, + 2, + 1, + 1, + 1, + 1, + 0, + 4, + 4, + 1, + 3, + 1, + 4, + 3, + 0, + 0, + 3, + 0, + 1, + 2, + 0, + 2, + 2, + 1, + 2, + 3, + 4, + 3, + 2, + 2, + 2, + 4, + 4, + 3, + 0, + 4, + 4, + 0, + ], + "Depth": [ + 1, + 4, + 3, + 4, + 1, + 3, + 1, + 3, + 1, + 4, + 3, + 4, + 1, + 2, + 1, + 4, + 0, + 4, + 0, + 0, + 3, + 2, + 3, + 2, + 2, + 3, + 4, + 2, + 2, + 4, + 1, + 0, + 2, + 0, + 4, + 0, + 1, + 2, + 0, + 0, + 3, + ], + } def test_apply_K2(self): - hcd = HillClimbDefiner(data=pd.DataFrame(self.data), - descriptor=self.descriptor, - scoring_function=("K2",)) - - hcd.apply_K2(data=pd.DataFrame(self.data), - init_edges=None, - progress_bar=False, - remove_init_edges=False, - white_list=None) + hcd = HillClimbDefiner( + data=pd.DataFrame(self.data), + descriptor=self.descriptor, + scoring_function=("K2",), + ) + + hcd.apply_K2( + data=pd.DataFrame(self.data), + init_edges=None, + progress_bar=False, + 
remove_init_edges=False, + white_list=None, + ) right_edges = [ - [ - 'Tectonic regime', 'Structural setting'], [ - 'Tectonic regime', 'Depth'], [ - 'Tectonic regime', 'Netpay'], [ - 'Period', 'Porosity'], [ - 'Period', 'Tectonic regime'], [ - 'Period', 'Netpay'], [ - 'Lithology', 'Permeability'], [ - 'Lithology', 'Period'], [ - 'Lithology', 'Tectonic regime'], [ - 'Structural setting', 'Netpay'], [ - 'Netpay', 'Gross'], [ - 'Porosity', 'Permeability'], [ - 'Porosity', 'Depth'], [ - 'Porosity', 'Netpay'], [ - 'Permeability', 'Netpay']] + ["Tectonic regime", "Structural setting"], + ["Tectonic regime", "Depth"], + ["Tectonic regime", "Netpay"], + ["Period", "Porosity"], + ["Period", "Tectonic regime"], + ["Period", "Netpay"], + ["Lithology", "Permeability"], + ["Lithology", "Period"], + ["Lithology", "Tectonic regime"], + ["Structural setting", "Netpay"], + ["Netpay", "Gross"], + ["Porosity", "Permeability"], + ["Porosity", "Depth"], + ["Porosity", "Netpay"], + ["Permeability", "Netpay"], + ] self.assertEqual(hcd.skeleton["E"], right_edges) def test_apply_group1(self): - hcd = HillClimbDefiner(data=pd.DataFrame(self.data), - descriptor=self.descriptor, - scoring_function=("MI",)) - - hcd.restrict( - data=pd.DataFrame( - self.data), - bl_add=None, - init_nodes=None) + hcd = HillClimbDefiner( + data=pd.DataFrame(self.data), + descriptor=self.descriptor, + scoring_function=("MI",), + ) + + hcd.restrict(data=pd.DataFrame(self.data), bl_add=None, init_nodes=None) hcd.apply_group1( - data=pd.DataFrame( - self.data), + data=pd.DataFrame(self.data), progress_bar=False, init_edges=None, remove_init_edges=False, - white_list=None) + white_list=None, + ) right_edges = [ - [ - 'Lithology', 'Depth'], [ - 'Period', 'Gross'], [ - 'Netpay', 'Gross'], [ - 'Period', 'Netpay'], [ - 'Depth', 'Period'], [ - 'Depth', 'Permeability'], [ - 'Netpay', 'Permeability'], [ - 'Period', 'Porosity'], [ - 'Netpay', 'Porosity'], [ - 'Permeability', 'Structural setting'], [ - 'Netpay', 'Structural setting'], [ - 'Period', 'Tectonic regime'], [ - 'Netpay', 'Tectonic regime']] + ["Lithology", "Depth"], + ["Period", "Gross"], + ["Netpay", "Gross"], + ["Period", "Netpay"], + ["Depth", "Period"], + ["Depth", "Permeability"], + ["Netpay", "Permeability"], + ["Period", "Porosity"], + ["Netpay", "Porosity"], + ["Permeability", "Structural setting"], + ["Netpay", "Structural setting"], + ["Period", "Tectonic regime"], + ["Netpay", "Tectonic regime"], + ] self.assertEqual(hcd.skeleton["E"], right_edges) class TestEvoStructureBuilder(unittest.TestCase): - def setUp(self): self.data = pd.read_csv(r"data/benchmark/asia.csv", index_col=0) - self.descriptor = {'types': {'asia': 'disc', - 'tub': 'disc', - 'smoke': 'disc', - 'lung': 'disc', - 'bronc': 'disc', - 'either': 'disc', - 'xray': 'disc', - 'dysp': 'disc'}, - 'signs': {}} - self.evo_builder = EvoStructureBuilder(data=self.data, - descriptor=self.descriptor, - regressor=None, - has_logit=True, - use_mixture=True) + self.descriptor = { + "types": { + "asia": "disc", + "tub": "disc", + "smoke": "disc", + "lung": "disc", + "bronc": "disc", + "either": "disc", + "xray": "disc", + "dysp": "disc", + }, + "signs": {}, + } + self.evo_builder = EvoStructureBuilder( + data=self.data, + descriptor=self.descriptor, + regressor=None, + has_logit=True, + use_mixture=True, + ) # Replace this with your actual reference DAG self.reference_dag = [ - ('asia', 'tub'), - ('tub', 'either'), - ('smoke', 'lung'), - ('smoke', 'bronc'), - ('lung', 'either'), - ('bronc', 'dysp'), - ('either', 'xray'), - 
('either', 'dysp') - ] + ("asia", "tub"), + ("tub", "either"), + ("smoke", "lung"), + ("smoke", "bronc"), + ("lung", "either"), + ("bronc", "dysp"), + ("either", "xray"), + ("either", "dysp"), + ] def test_build(self): # placeholder kwargs kwargs = {} self.evo_builder.build( - data=self.data, - classifier=None, - regressor=None, - **kwargs) + data=self.data, classifier=None, regressor=None, **kwargs + ) - obtained_dag = self.evo_builder.skeleton['E'] + obtained_dag = self.evo_builder.skeleton["E"] num_edges = len(obtained_dag) - self.assertGreaterEqual(num_edges, 1, msg="Obtained graph should have at least one edge.") + self.assertGreaterEqual( + num_edges, 1, msg="Obtained graph should have at least one edge." + ) - dist = precision_recall(obtained_dag, self.reference_dag)['SHD'] + dist = precision_recall(obtained_dag, self.reference_dag)["SHD"] self.assertLess( dist, 15, - msg=f"Structural Hamming Distance should be less than 15, obtained SHD = {dist}") + msg=f"Structural Hamming Distance should be less than 15, obtained SHD = {dist}", + ) if __name__ == "__main__": diff --git a/tests/test_networks.py b/tests/test_networks.py index 8eb367d..fc1c590 100644 --- a/tests/test_networks.py +++ b/tests/test_networks.py @@ -34,17 +34,27 @@ def assertIsDir(self, path): def prepare_bn_and_data(self): # prepare bn where models were set by set_model hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[ - ['Tectonic regime', 'Period', 'Lithology', 'Structural setting', - 'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth']] + [ + "Tectonic regime", + "Period", + "Lithology", + "Structural setting", + "Gross", + "Netpay", + "Porosity", + "Permeability", + "Depth", + ] + ] encoder = pp.LabelEncoder() discretizer = pp.KBinsDiscretizer( - n_bins=5, - encode='ordinal', - strategy='quantile') + n_bins=5, encode="ordinal", strategy="quantile" + ) p = preprocessors.Preprocessor( - [('encoder', encoder), ('discretizer', discretizer)]) + [("encoder", encoder), ("discretizer", discretizer)] + ) discretized_data, est = p.apply(hack_data) @@ -54,31 +64,36 @@ def prepare_bn_and_data(self): self.bn.add_nodes(info) self.bn.add_edges( - discretized_data, scoring_function=( - "BIC",), progress_bar=False) + discretized_data, scoring_function=("BIC",), progress_bar=False + ) self.bn.set_regressor( regressors={ - 'Depth': CatBoostRegressor( - logging_level="Silent", - allow_writing_files=False), - 'Gross': RandomForestRegressor(), - 'Porosity': DecisionTreeRegressor()}) + "Depth": CatBoostRegressor( + logging_level="Silent", allow_writing_files=False + ), + "Gross": RandomForestRegressor(), + "Porosity": DecisionTreeRegressor(), + } + ) return hack_data class TestBaseNetwork(TestCaseBase): - def setUp(self): self.bn = BaseNetwork() - self.nodes = [GaussianNode(name="Node0"), DiscreteNode(name="Node1"), - GaussianNode(name="Node2")] + self.nodes = [ + GaussianNode(name="Node0"), + DiscreteNode(name="Node1"), + GaussianNode(name="Node2"), + ] self.edges = [("Node0", "Node1"), ("Node1", "Node2")] - self.descriptor = {"types": {"Node0": "cont", "Node1": "disc", - "Node2": "cont"}, - "signs": {"Node0": "pos", "Node1": "neg"}} + self.descriptor = { + "types": {"Node0": "cont", "Node1": "disc", "Node2": "cont"}, + "signs": {"Node0": "pos", "Node1": "neg"}, + } def test_validate(self): descriptor_t = {"types": {"Node0": "Abstract", "Node1": "Abstract"}} @@ -92,7 +107,7 @@ def test_update_descriptor(self): # Nodes out self.bn.nodes = [GaussianNode(name="Node0")] self.bn.update_descriptor() - 
self.assertEqual({'Node0': 'cont'}, self.bn.descriptor["types"]) + self.assertEqual({"Node0": "cont"}, self.bn.descriptor["types"]) # It uses only Vertices Definer, test of this is in builders tests. def test_add_nodes(self): @@ -111,21 +126,20 @@ def __init__(self, name): self.name = name # set without mapping - self.assertIsNone( - self.bn.set_nodes( - nodes=[ - GaussianNode( - name="Node0")])) + self.assertIsNone(self.bn.set_nodes(nodes=[GaussianNode(name="Node0")])) - map = {"types": {"Node0": "cont", "Node1": "disc", "Node2": "cont"}, - "signs": {}} + map = { + "types": {"Node0": "cont", "Node1": "disc", "Node2": "cont"}, + "signs": {}, + } self.bn.set_nodes(nodes=self.nodes, info=map) self.assertEqual(self.bn.nodes, self.nodes) - self.bn.set_nodes(nodes=[MyNode(name="Node-1"), MyNode("Node-2")], - info={"types": {"Node-1": "cont", "Node-2": "disc"}, - "signs": {}}) + self.bn.set_nodes( + nodes=[MyNode(name="Node-1"), MyNode("Node-2")], + info={"types": {"Node-1": "cont", "Node-2": "disc"}, "signs": {}}, + ) self.assertEqual(self.bn.nodes, []) def test_set_edges(self): @@ -135,7 +149,7 @@ def test_set_edges(self): self.bn.set_nodes(nodes=self.nodes, info=self.descriptor) self.bn.set_edges(edges=self.edges) - self.assertEqual([('Node1', 'Node2')], self.bn.edges) + self.assertEqual([("Node1", "Node2")], self.bn.edges) # The test consists of 2 previous methods that are tested, # plus methods of builders, they are tested as well. @@ -176,6 +190,7 @@ def test_save(self): def test_fit_parameters(self): from bamt.networks.base import STORAGE + # here we test only initialization of the folder self.bn.has_logit = True self.bn.nodes = [LogitNode(name="Node0")] @@ -193,103 +208,839 @@ def test_joblib_pathsave(self): self.bn.fit_parameters(hack_data) self.assertGreater( - self.bn.sample( - 100, - progress_bar=False).size, - 0, - "Sampling is broken") + self.bn.sample(100, progress_bar=False).size, 0, "Sampling is broken" + ) - saveloc = self.bn.distributions["Gross"]['hybcprob']["['COMPRESSION']"]['regressor_obj'] + saveloc = self.bn.distributions["Gross"]["hybcprob"]["['COMPRESSION']"][ + "regressor_obj" + ] self.assertIsFile(saveloc) def test_sample(self): data = { - 'Tectonic regime': [0, 1, 4, 4, 0, 2, 0, 0, 0, 0, 3, 1, 0, 3, 0, 1, 4, 0, 4, 3, 4, 0, 1, 1, 1, 0, 1, 1, 1, - 1, 1, 0, 0, 3, 2, 3, 2, 3, 3, 3, 0], - 'Period': [3, 1, 4, 4, 1, 1, 0, 0, 3, 5, 3, 9, 0, 5, 0, 3, 5, 3, 2, 4, 4, 1, 5, 7, 7, 7, 1, 1, 1, 1, 4, 6, - 8, 4, 4, 5, 4, 7, 5, 5, 0], - 'Lithology': [2, 4, 6, 4, 2, 2, 2, 2, 4, 4, 4, 4, 1, 4, 1, 4, 4, 4, 5, 3, 2, 2, 2, 4, 1, 1, 3, 4, 4, 4, 4, - 2, 0, 3, 4, 4, 4, 4, 4, 4, 2], - 'Structural setting': [2, 6, 10, 10, 7, 5, 8, 8, 2, 2, 6, 6, 3, 7, 3, 6, 10, 9, 3, 0, 0, 7, 6, 6, 6, 7, 6, - 6, 6, 6, 8, 2, 9, 4, 7, 6, 1, 8, 4, 4, 3], - 'Gross': [1, 3, 1, 3, 1, 0, 2, 3, 0, 4, 4, 4, 0, 3, 0, 0, 3, 4, 0, 4, 3, 2, 2, 4, 0, 4, 1, 2, 2, 4, 2, 4, 3, - 1, 1, 1, 2, 3, 0, 2, 1], - 'Netpay': [3, 2, 1, 4, 2, 0, 2, 2, 1, 4, 3, 4, 0, 3, 1, 1, 0, 4, 1, 3, 4, 3, 3, 4, 0, 4, 0, 1, 2, 4, 2, 3, - 2, 1, 2, 0, 2, 4, 1, 3, 0], - 'Porosity': [3, 0, 4, 3, 3, 1, 0, 0, 3, 0, 2, 1, 2, 3, 0, 2, 3, 0, 0, 4, 2, 4, 2, 2, 1, 1, 1, 3, 3, 2, 4, 3, - 1, 4, 4, 4, 3, 1, 4, 4, 0], - 'Permeability': [4, 0, 3, 3, 2, 1, 1, 1, 1, 0, 4, 4, 1, 3, 1, 4, 3, 0, 0, 3, 0, 1, 2, 0, 2, 2, 1, 2, 3, 4, - 3, 2, 2, 2, 4, 4, 3, 0, 4, 4, 0], - 'Depth': [1, 4, 3, 4, 1, 3, 1, 3, 1, 4, 3, 4, 1, 2, 1, 4, 0, 4, 0, 0, 3, 2, 3, 2, 2, 3, 4, 2, 2, 4, 1, 0, 2, - 0, 4, 0, 1, 2, 0, 0, 3]} + "Tectonic regime": [ + 0, + 1, + 4, + 4, + 0, + 2, + 0, + 0, + 0, + 
0, + 3, + 1, + 0, + 3, + 0, + 1, + 4, + 0, + 4, + 3, + 4, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 3, + 2, + 3, + 2, + 3, + 3, + 3, + 0, + ], + "Period": [ + 3, + 1, + 4, + 4, + 1, + 1, + 0, + 0, + 3, + 5, + 3, + 9, + 0, + 5, + 0, + 3, + 5, + 3, + 2, + 4, + 4, + 1, + 5, + 7, + 7, + 7, + 1, + 1, + 1, + 1, + 4, + 6, + 8, + 4, + 4, + 5, + 4, + 7, + 5, + 5, + 0, + ], + "Lithology": [ + 2, + 4, + 6, + 4, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + 4, + 1, + 4, + 1, + 4, + 4, + 4, + 5, + 3, + 2, + 2, + 2, + 4, + 1, + 1, + 3, + 4, + 4, + 4, + 4, + 2, + 0, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 2, + ], + "Structural setting": [ + 2, + 6, + 10, + 10, + 7, + 5, + 8, + 8, + 2, + 2, + 6, + 6, + 3, + 7, + 3, + 6, + 10, + 9, + 3, + 0, + 0, + 7, + 6, + 6, + 6, + 7, + 6, + 6, + 6, + 6, + 8, + 2, + 9, + 4, + 7, + 6, + 1, + 8, + 4, + 4, + 3, + ], + "Gross": [ + 1, + 3, + 1, + 3, + 1, + 0, + 2, + 3, + 0, + 4, + 4, + 4, + 0, + 3, + 0, + 0, + 3, + 4, + 0, + 4, + 3, + 2, + 2, + 4, + 0, + 4, + 1, + 2, + 2, + 4, + 2, + 4, + 3, + 1, + 1, + 1, + 2, + 3, + 0, + 2, + 1, + ], + "Netpay": [ + 3, + 2, + 1, + 4, + 2, + 0, + 2, + 2, + 1, + 4, + 3, + 4, + 0, + 3, + 1, + 1, + 0, + 4, + 1, + 3, + 4, + 3, + 3, + 4, + 0, + 4, + 0, + 1, + 2, + 4, + 2, + 3, + 2, + 1, + 2, + 0, + 2, + 4, + 1, + 3, + 0, + ], + "Porosity": [ + 3, + 0, + 4, + 3, + 3, + 1, + 0, + 0, + 3, + 0, + 2, + 1, + 2, + 3, + 0, + 2, + 3, + 0, + 0, + 4, + 2, + 4, + 2, + 2, + 1, + 1, + 1, + 3, + 3, + 2, + 4, + 3, + 1, + 4, + 4, + 4, + 3, + 1, + 4, + 4, + 0, + ], + "Permeability": [ + 4, + 0, + 3, + 3, + 2, + 1, + 1, + 1, + 1, + 0, + 4, + 4, + 1, + 3, + 1, + 4, + 3, + 0, + 0, + 3, + 0, + 1, + 2, + 0, + 2, + 2, + 1, + 2, + 3, + 4, + 3, + 2, + 2, + 2, + 4, + 4, + 3, + 0, + 4, + 4, + 0, + ], + "Depth": [ + 1, + 4, + 3, + 4, + 1, + 3, + 1, + 3, + 1, + 4, + 3, + 4, + 1, + 2, + 1, + 4, + 0, + 4, + 0, + 0, + 3, + 2, + 3, + 2, + 2, + 3, + 4, + 2, + 2, + 4, + 1, + 0, + 2, + 0, + 4, + 0, + 1, + 2, + 0, + 0, + 3, + ], + } nodes = [ - DiscreteNode( - name='Tectonic regime'), DiscreteNode( - name='Period'), DiscreteNode( - name='Lithology'), DiscreteNode( - name='Structural setting'), DiscreteNode( - name='Gross'), DiscreteNode( - name='Netpay'), DiscreteNode( - name='Porosity'), DiscreteNode( - name='Permeability'), DiscreteNode( - name='Depth')] + DiscreteNode(name="Tectonic regime"), + DiscreteNode(name="Period"), + DiscreteNode(name="Lithology"), + DiscreteNode(name="Structural setting"), + DiscreteNode(name="Gross"), + DiscreteNode(name="Netpay"), + DiscreteNode(name="Porosity"), + DiscreteNode(name="Permeability"), + DiscreteNode(name="Depth"), + ] - self.bn.set_nodes( - nodes, info={ - "types": { - k.name: "disc" for k in nodes}}) - self.bn.set_edges([["Tectonic regime", "Period"], - ["Structural setting", "Period"], - ["Tectonic regime", "Lithology"], - ["Lithology", "Structural setting"]]) + self.bn.set_nodes(nodes, info={"types": {k.name: "disc" for k in nodes}}) + self.bn.set_edges( + [ + ["Tectonic regime", "Period"], + ["Structural setting", "Period"], + ["Tectonic regime", "Lithology"], + ["Lithology", "Structural setting"], + ] + ) self.bn.fit_parameters(pd.DataFrame.from_records(data)) - self.assertIsNotNone( - self.bn.sample( - 50, - as_df=False, - progress_bar=False)) + self.assertIsNotNone(self.bn.sample(50, as_df=False, progress_bar=False)) def test_predict(self): seq = { - 'Tectonic regime': [0, 1, 4, 4, 0, 2, 0, 0, 0, 0, 3, 1, 0, 3, 0, 1, 4, 0, 4, 3, 4, 0, 1, 1, 1, 0, 1, 1, 1, - 1, 1, 0, 0, 3, 2, 3, 2, 3, 3, 3, 0], - 'Period': [3, 1, 4, 4, 1, 
1, 0, 0, 3, 5, 3, 9, 0, 5, 0, 3, 5, 3, 2, 4, 4, 1, 5, 7, 7, 7, 1, 1, 1, 1, 4, 6, - 8, 4, 4, 5, 4, 7, 5, 5, 0], - 'Lithology': [2, 4, 6, 4, 2, 2, 2, 2, 4, 4, 4, 4, 1, 4, 1, 4, 4, 4, 5, 3, 2, 2, 2, 4, 1, 1, 3, 4, 4, 4, 4, - 2, 0, 3, 4, 4, 4, 4, 4, 4, 2], - 'Structural setting': [2, 6, 10, 10, 7, 5, 8, 8, 2, 2, 6, 6, 3, 7, 3, 6, 10, 9, 3, 0, 0, 7, 6, 6, 6, 7, 6, - 6, 6, 6, 8, 2, 9, 4, 7, 6, 1, 8, 4, 4, 3], - 'Gross': [1, 3, 1, 3, 1, 0, 2, 3, 0, 4, 4, 4, 0, 3, 0, 0, 3, 4, 0, 4, 3, 2, 2, 4, 0, 4, 1, 2, 2, 4, 2, 4, 3, - 1, 1, 1, 2, 3, 0, 2, 1], - 'Netpay': [3, 2, 1, 4, 2, 0, 2, 2, 1, 4, 3, 4, 0, 3, 1, 1, 0, 4, 1, 3, 4, 3, 3, 4, 0, 4, 0, 1, 2, 4, 2, 3, - 2, 1, 2, 0, 2, 4, 1, 3, 0], - 'Porosity': [3, 0, 4, 3, 3, 1, 0, 0, 3, 0, 2, 1, 2, 3, 0, 2, 3, 0, 0, 4, 2, 4, 2, 2, 1, 1, 1, 3, 3, 2, 4, 3, - 1, 4, 4, 4, 3, 1, 4, 4, 0], - 'Permeability': [4, 0, 3, 3, 2, 1, 1, 1, 1, 0, 4, 4, 1, 3, 1, 4, 3, 0, 0, 3, 0, 1, 2, 0, 2, 2, 1, 2, 3, 4, - 3, 2, 2, 2, 4, 4, 3, 0, 4, 4, 0], - 'Depth': [1, 4, 3, 4, 1, 3, 1, 3, 1, 4, 3, 4, 1, 2, 1, 4, 0, 4, 0, 0, 3, 2, 3, 2, 2, 3, 4, 2, 2, 4, 1, 0, 2, - 0, 4, 0, 1, 2, 0, 0, 3]} + "Tectonic regime": [ + 0, + 1, + 4, + 4, + 0, + 2, + 0, + 0, + 0, + 0, + 3, + 1, + 0, + 3, + 0, + 1, + 4, + 0, + 4, + 3, + 4, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 3, + 2, + 3, + 2, + 3, + 3, + 3, + 0, + ], + "Period": [ + 3, + 1, + 4, + 4, + 1, + 1, + 0, + 0, + 3, + 5, + 3, + 9, + 0, + 5, + 0, + 3, + 5, + 3, + 2, + 4, + 4, + 1, + 5, + 7, + 7, + 7, + 1, + 1, + 1, + 1, + 4, + 6, + 8, + 4, + 4, + 5, + 4, + 7, + 5, + 5, + 0, + ], + "Lithology": [ + 2, + 4, + 6, + 4, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + 4, + 1, + 4, + 1, + 4, + 4, + 4, + 5, + 3, + 2, + 2, + 2, + 4, + 1, + 1, + 3, + 4, + 4, + 4, + 4, + 2, + 0, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 2, + ], + "Structural setting": [ + 2, + 6, + 10, + 10, + 7, + 5, + 8, + 8, + 2, + 2, + 6, + 6, + 3, + 7, + 3, + 6, + 10, + 9, + 3, + 0, + 0, + 7, + 6, + 6, + 6, + 7, + 6, + 6, + 6, + 6, + 8, + 2, + 9, + 4, + 7, + 6, + 1, + 8, + 4, + 4, + 3, + ], + "Gross": [ + 1, + 3, + 1, + 3, + 1, + 0, + 2, + 3, + 0, + 4, + 4, + 4, + 0, + 3, + 0, + 0, + 3, + 4, + 0, + 4, + 3, + 2, + 2, + 4, + 0, + 4, + 1, + 2, + 2, + 4, + 2, + 4, + 3, + 1, + 1, + 1, + 2, + 3, + 0, + 2, + 1, + ], + "Netpay": [ + 3, + 2, + 1, + 4, + 2, + 0, + 2, + 2, + 1, + 4, + 3, + 4, + 0, + 3, + 1, + 1, + 0, + 4, + 1, + 3, + 4, + 3, + 3, + 4, + 0, + 4, + 0, + 1, + 2, + 4, + 2, + 3, + 2, + 1, + 2, + 0, + 2, + 4, + 1, + 3, + 0, + ], + "Porosity": [ + 3, + 0, + 4, + 3, + 3, + 1, + 0, + 0, + 3, + 0, + 2, + 1, + 2, + 3, + 0, + 2, + 3, + 0, + 0, + 4, + 2, + 4, + 2, + 2, + 1, + 1, + 1, + 3, + 3, + 2, + 4, + 3, + 1, + 4, + 4, + 4, + 3, + 1, + 4, + 4, + 0, + ], + "Permeability": [ + 4, + 0, + 3, + 3, + 2, + 1, + 1, + 1, + 1, + 0, + 4, + 4, + 1, + 3, + 1, + 4, + 3, + 0, + 0, + 3, + 0, + 1, + 2, + 0, + 2, + 2, + 1, + 2, + 3, + 4, + 3, + 2, + 2, + 2, + 4, + 4, + 3, + 0, + 4, + 4, + 0, + ], + "Depth": [ + 1, + 4, + 3, + 4, + 1, + 3, + 1, + 3, + 1, + 4, + 3, + 4, + 1, + 2, + 1, + 4, + 0, + 4, + 0, + 0, + 3, + 2, + 3, + 2, + 2, + 3, + 4, + 2, + 2, + 4, + 1, + 0, + 2, + 0, + 4, + 0, + 1, + 2, + 0, + 0, + 3, + ], + } data = pd.DataFrame.from_records(seq) nodes = [ - DiscreteNode( - name='Tectonic regime'), DiscreteNode( - name='Period'), DiscreteNode( - name='Lithology'), DiscreteNode( - name='Structural setting')] + DiscreteNode(name="Tectonic regime"), + DiscreteNode(name="Period"), + DiscreteNode(name="Lithology"), + DiscreteNode(name="Structural setting"), + ] - self.bn.set_nodes( - nodes, info={ - 
"types": { - k.name: "disc" for k in nodes}}) - self.bn.set_edges([["Tectonic regime", "Period"], - ["Structural setting", "Period"], - ["Tectonic regime", "Lithology"], - ["Lithology", "Structural setting"]]) + self.bn.set_nodes(nodes, info={"types": {k.name: "disc" for k in nodes}}) + self.bn.set_edges( + [ + ["Tectonic regime", "Period"], + ["Structural setting", "Period"], + ["Tectonic regime", "Lithology"], + ["Lithology", "Structural setting"], + ] + ) self.bn.fit_parameters(data) - result = self.bn.predict( - data.iloc[:, :3], parall_count=2, progress_bar=False) + result = self.bn.predict(data.iloc[:, :3], parall_count=2, progress_bar=False) self.assertNotEqual(result, {}) for v in result.values(): diff --git a/tests/test_nodes.py b/tests/test_nodes.py index f437725..87d2695 100644 --- a/tests/test_nodes.py +++ b/tests/test_nodes.py @@ -4,6 +4,7 @@ import numpy as np from bamt.nodes import * + logging.getLogger("nodes").setLevel(logging.CRITICAL) @@ -57,13 +58,12 @@ def test_choose_serialization(self): class TestDiscreteNode(unittest.TestCase): - def setUp(self): self.node = discrete_node.DiscreteNode(name="test") self.data_dict = { "node0": np.random.normal(0, 4, 30), - "node1": np.random.normal(0, .1, 30), - "node2": np.random.normal(0, .3, 30), + "node1": np.random.normal(0, 0.1, 30), + "node2": np.random.normal(0, 0.3, 30), "test": np.random.choice(["cat1", "cat2", "cat3"], 30), "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), @@ -75,8 +75,7 @@ def setUp(self): self.node.children = ["node6"] def test_fit_parameters(self): - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertIsNotNone(params["vals"]) self.assertNotEqual(params["vals"], []) @@ -85,31 +84,26 @@ def test_fit_parameters(self): def test_choose(self): pvals = ["cat4", "cat7"] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue([self.node.choose(params, pvals) in params["vals"]]) def test_predict(self): pvals = ["cat4", "cat7"] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue([self.node.predict(params, pvals) in params["vals"]]) - self.assertRaises( - KeyError, self.node.predict, params, [ - "bad", "values"]) + self.assertRaises(KeyError, self.node.predict, params, ["bad", "values"]) class TestGaussianNode(unittest.TestCase): - def setUp(self): self.node = gaussian_node.GaussianNode(name="test") self.data_dict = { "node0": np.random.normal(1, 4, 30), - "node1": np.random.normal(2, .1, 30), - "foster-son": np.random.normal(2.5, .2, 30), - "test": np.random.normal(3, .3, 30), + "node1": np.random.normal(2, 0.1, 30), + "foster-son": np.random.normal(2.5, 0.2, 30), + "test": np.random.normal(3, 0.3, 30), "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), @@ -125,9 +119,11 @@ def test_fit_parameters(self): node_without_parents.children = ["node6", "node5"] params_parents = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + pd.DataFrame.from_records(self.data_dict) + ) params_foster = node_without_parents.fit_parameters( - 
pd.DataFrame.from_records(self.data_dict)) + pd.DataFrame.from_records(self.data_dict) + ) self.assertIsNotNone(params_parents["regressor_obj"]) self.assertTrue(pd.isna(params_parents["mean"])) @@ -139,31 +135,25 @@ def test_fit_parameters(self): def test_choose(self): pvals = [1.05, 1.95] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue(isinstance(self.node.choose(params, pvals), float)) def test_predict(self): pvals = [1.05, 1.95] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue(isinstance(self.node.predict(params, pvals), float)) - self.assertRaises( - ValueError, self.node.predict, params, [ - "bad", "values"]) + self.assertRaises(ValueError, self.node.predict, params, ["bad", "values"]) class TestConditionalGaussianNode(unittest.TestCase): - def setUp(self): - self.node = conditional_gaussian_node.ConditionalGaussianNode( - name="test") + self.node = conditional_gaussian_node.ConditionalGaussianNode(name="test") self.data_dict = { "node0": np.random.normal(1, 4, 30), - "node1": np.random.normal(2, .1, 30), - "foster-son": np.random.normal(2.5, .2, 30), - "test": np.random.normal(3, .3, 30), + "node1": np.random.normal(2, 0.1, 30), + "foster-son": np.random.normal(2.5, 0.2, 30), + "test": np.random.normal(3, 0.3, 30), "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), @@ -177,17 +167,19 @@ def setUp(self): def fit_parameters(self, regressor=None): if regressor is not None: self.node.regressor = regressor - self.node.type = 'ConditionalGaussian' + \ - f" ({type(regressor).__name__})" + self.node.type = "ConditionalGaussian" + f" ({type(regressor).__name__})" node_without_parents = conditional_gaussian_node.ConditionalGaussianNode( - name="foster-son") + name="foster-son" + ) node_without_parents.children = ["node6", "node5"] params_parents = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict))["hybcprob"] + pd.DataFrame.from_records(self.data_dict) + )["hybcprob"] params_foster = node_without_parents.fit_parameters( - pd.DataFrame.from_records(self.data_dict))["hybcprob"]['[]'] + pd.DataFrame.from_records(self.data_dict) + )["hybcprob"]["[]"] self.assertIsNone(params_foster["regressor_obj"]) self.assertIsNotNone(params_foster["mean"]) @@ -220,29 +212,24 @@ def fit_parameters(self, regressor=None): def test_choose(self): pvals = [1.05, 1.95, "cat4", "cat7"] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue(isinstance(self.node.choose(params, pvals), float)) def test_predict(self): pvals = [1.05, 1.95, "cat4", "cat7"] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue(isinstance(self.node.predict(params, pvals), float)) - self.assertRaises( - KeyError, self.node.predict, params, [ - "bad", "values"]) + self.assertRaises(KeyError, self.node.predict, params, ["bad", "values"]) class TestMixtureGaussianNode(unittest.TestCase): - def setUp(self): self.node = mixture_gaussian_node.MixtureGaussianNode(name="test") self.data_dict = { "node0": 
np.random.normal(1, 4, 30), - "node1": np.random.normal(2, .1, 30), - "test": np.random.normal(3, .3, 30), + "node1": np.random.normal(2, 0.1, 30), + "test": np.random.normal(3, 0.3, 30), "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), @@ -254,33 +241,30 @@ def setUp(self): self.node.children = ["node6"] def test_fit_parameters(self): - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertAlmostEqual(sum(params["coef"]), 1, delta=1e-5) def test_choose(self): pvals = [1.05, 1.95] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue(isinstance(self.node.choose(params, pvals), float)) def test_predict(self): pvals = [1.05, 1.95] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue(isinstance(self.node.predict(params, pvals), float)) class TestConditionalMixtureGaussianNode(unittest.TestCase): - def setUp(self): self.node = conditional_mixture_gaussian_node.ConditionalMixtureGaussianNode( - name="test") + name="test" + ) self.data_dict = { "node0": np.random.normal(1, 4, 30), - "node1": np.random.normal(2, .1, 30), - "test": np.random.normal(3, .3, 30), + "node1": np.random.normal(2, 0.1, 30), + "test": np.random.normal(3, 0.3, 30), "node2": np.random.choice(["cat1", "cat2", "cat3"], 30), "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), @@ -292,9 +276,9 @@ def setUp(self): self.node.children = ["node6"] def test_fit_parameters(self): - params = self.node.fit_parameters( - pd.DataFrame.from_records( - self.data_dict))["hybcprob"] + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict))[ + "hybcprob" + ] report = [] # sometimes combination's data can be empty, so we set the percent of # empty combinations @@ -307,26 +291,23 @@ def test_fit_parameters(self): def test_choose(self): pvals = [1.05, 1.95, "cat4", "cat7"] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue(isinstance(self.node.choose(params, pvals), float)) def test_predict(self): pvals = [1.05, 1.95, "cat4", "cat7"] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue(isinstance(self.node.predict(params, pvals), float)) class TestLogitNode(unittest.TestCase): - def setUp(self): self.node = logit_node.LogitNode(name="test") self.data_dict = { "node0": np.random.normal(1, 4, 30), - "node1": np.random.normal(2, .1, 30), - "node2": np.random.normal(3, .3, 30), + "node1": np.random.normal(2, 0.1, 30), + "node2": np.random.normal(3, 0.3, 30), "test": np.random.choice(["cat1", "cat2", "cat3"], 30), "node4": np.random.choice(["cat4", "cat5", "cat6"], 30), "node5": np.random.choice(["cat7", "cat8", "cat9"], 30), @@ -338,23 +319,19 @@ def setUp(self): self.node.children = ["node6"] def test_fit_parameters(self): - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = 
self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertIsNotNone(params["classifier_obj"]) def test_choose(self): pvals = [1.05, 1.95] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) self.assertTrue(self.node.choose(params, pvals) in params["classes"]) def test_predict(self): pvals = [1.05, 1.95] - params = self.node.fit_parameters( - pd.DataFrame.from_records(self.data_dict)) + params = self.node.fit_parameters(pd.DataFrame.from_records(self.data_dict)) - self.assertTrue( - [self.node.predict(params, pvals) in params["classes"]]) + self.assertTrue([self.node.predict(params, pvals) in params["classes"]]) if __name__ == "__main__":