Skip to content

Commit

Permalink
Evolutionary algorithm for structure learning, builders decomposing and refactoring (#62)
Browse files Browse the repository at this point in the history
  • Loading branch information
jrzkaminski authored Jun 5, 2023
1 parent b326e4f commit 25ba073
Show file tree
Hide file tree
Showing 25 changed files with 786 additions and 258 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ tests/bamt
tutorials/Test.ipynb
tutorials/bamt
.DS_Store
/example_socio.ipynb
4 changes: 4 additions & 0 deletions bamt/builders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Public submodules of the builders package.
__all__ = ["builders_base", "evo_builder", "hc_builder"]
191 changes: 7 additions & 184 deletions bamt/builders.py → bamt/builders/builders_base.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
import itertools

from pgmpy.base import DAG
from pgmpy.estimators import HillClimbSearch
from bamt.redef_HC import hc as hc_method

from bamt.nodes.discrete_node import DiscreteNode
from bamt.nodes.gaussian_node import GaussianNode
from bamt.nodes.conditional_logit_node import ConditionalLogitNode
Expand All @@ -16,7 +12,7 @@
from pandas import DataFrame
from bamt.utils import GraphUtils as gru

from typing import Dict, List, Optional, Tuple, Callable, TypedDict, Union, Sequence
from typing import Dict, List, Optional, Tuple, Callable, TypedDict, Sequence, Union


class ParamDict(TypedDict, total=False):
Expand Down Expand Up @@ -110,8 +106,8 @@ def get_family(self):
self.skeleton['V'][id].children = children

ordered = gru.toporder(self.skeleton['V'], self.skeleton['E'])
notOrdered = [node.name for node in self.skeleton['V']]
mask = [notOrdered.index(name) for name in ordered]
not_ordered = [node.name for node in self.skeleton['V']]
mask = [not_ordered.index(name) for name in ordered]
self.skeleton['V'] = [self.skeleton['V'][i] for i in mask]


Expand Down Expand Up @@ -153,7 +149,7 @@ def overwrite_vertex(
"""
Level 2: Redefined nodes according structure (parents)
:param classifier: an object to pass into logit, condLogit nodes
:param regressor: an object to pass into gaussianish nodes
:param regressor: an object to pass into gaussian nodes
:param has_logit allows edges from cont to disc nodes
:param use_mixture allows using Mixture
"""
Expand Down Expand Up @@ -203,189 +199,16 @@ def __init__(self, descriptor: Dict[str, Dict[str, str]]):
super(EdgesDefiner, self).__init__(descriptor)


class HillClimbDefiner(VerticesDefiner, EdgesDefiner):
"""
Object to define structure and pass it into skeleton
"""

class BaseDefiner(VerticesDefiner, EdgesDefiner):
def __init__(self, data: DataFrame, descriptor: Dict[str, Dict[str, str]],
scoring_function: Union[Tuple[str, Callable], Tuple[str]],
regressor: Optional[object] = None):
"""
:param scoring_function: a tuple with following format (Name, scoring_function)
"""

self.scoring_function = scoring_function
self.optimizer = HillClimbSearch(data)
self.params = {'init_edges': None,
'init_nodes': None,
'remove_init_edges': True,
'white_list': None,
'bl_add': None}
super(HillClimbDefiner, self).__init__(descriptor, regressor=regressor)

def apply_K2(self,
             data: DataFrame,
             init_edges: Optional[List[Tuple[str, str]]],
             progress_bar: bool,
             remove_init_edges: bool,
             white_list: Optional[List[Tuple[str, str]]]):
    """
    Learn the edge set with pgmpy Hill-Climb search under the K2 score
    (discrete data only) and store it in ``self.skeleton['E']``.

    :param data: user's data
    :param init_edges: list of tuples, a graph to start learning with
    :param progress_bar: verbose regime (only honoured when no init_edges given)
    :param remove_init_edges: allows changes in a model defined by user
    :param white_list: list of allowed edges
    """
    import bamt.utils.GraphUtils as gru

    # K2 is only defined for discrete variables; compute the type map once
    # and report the offending continuous columns before bailing out.
    node_types = gru.nodes_types(data)
    if not all(t in ('disc', 'disc_num') for t in node_types.values()):
        # 'node_type' instead of 'type' — avoid shadowing the builtin.
        continuous = [col for col, node_type in node_types.items()
                      if node_type not in ('disc', 'disc_num')]
        logger_builder.error(
            f"K2 deals only with discrete data. Continuous data: {continuous}")
        return None

    if len(self.scoring_function) != 2:
        # No callable supplied — fall back to pgmpy's stock K2Score.
        from pgmpy.estimators import K2Score
        scoring_function = K2Score
    else:
        scoring_function = self.scoring_function[1]

    if not init_edges:
        best_model = self.optimizer.estimate(
            scoring_method=scoring_function(data),
            black_list=self.black_list,
            white_list=white_list,
            show_progress=progress_bar)
    elif remove_init_edges:
        # Seed the search with the user's DAG; the optimizer may edit it.
        startdag = DAG()
        startdag.add_nodes_from(nodes=[str(v) for v in self.vertices])
        startdag.add_edges_from(ebunch=init_edges)
        best_model = self.optimizer.estimate(
            black_list=self.black_list,
            white_list=white_list,
            start_dag=startdag,
            show_progress=False)
    else:
        # User-supplied edges are frozen into the final structure.
        best_model = self.optimizer.estimate(
            black_list=self.black_list,
            white_list=white_list,
            fixed_edges=init_edges,
            show_progress=False)

    self.skeleton['E'] = [list(edge) for edge in best_model.edges()]

def apply_group1(self,
                 data: DataFrame,
                 progress_bar: bool,
                 init_edges: Optional[List[Tuple[str, str]]],
                 remove_init_edges: bool,
                 white_list: Optional[List[Tuple[str, str]]]):
    """
    Learn the edge set with the in-house hill-climb (``hc_method``) under one
    of the information-theoretic scores and store it in ``self.skeleton['E']``.

    Group:
    "MI" - Mutual Information,
    "LL" - Log Likelihood,
    "BIC" - Bayes Information Criteria,
    "AIC" - Akaike Information Criteria.

    :param data: user's data
    :param progress_bar: verbose regime (passed as ``debug``)
    :param init_edges: list of tuples, a graph to start learning with
    :param remove_init_edges: allows changes in a model defined by user
    :param white_list: list of allowed edges
    """
    # hc_method works on integer column indices, so translate every edge
    # list from names to indices here and back to names at the end.
    name_to_index = {node.name: i for i, node in enumerate(self.vertices)}
    index_to_name = {i: name for name, i in name_to_index.items()}

    def _to_indices(pairs):
        # Map [(parent_name, child_name), ...] -> [(parent_idx, child_idx), ...]
        return [(name_to_index[a], name_to_index[b]) for a, b in pairs]

    blacklist_new = _to_indices(self.black_list)
    if white_list:
        white_list = _to_indices(white_list)
    if init_edges:
        init_edges = _to_indices(init_edges)

    bn = hc_method(
        data,
        metric=self.scoring_function[0],
        restriction=white_list,
        init_edges=init_edges,
        remove_geo_edges=remove_init_edges,
        black_list=blacklist_new,
        debug=progress_bar)

    # Rebuild [parent, child] name pairs; the inverse dict replaces the
    # original O(n) list(...).index reverse search per edge.
    structure = []
    for rv in sorted(bn.nodes()):
        for pa in bn.F[rv]['parents']:
            structure.append([index_to_name[pa], index_to_name[rv]])
    self.skeleton['E'] = structure


class HCStructureBuilder(HillClimbDefiner):
    """
    Final builder: runs hill-climb structure search and assembles the skeleton.
    """

    def __init__(self, data: DataFrame,
                 descriptor: Dict[str, Dict[str, str]],
                 scoring_function: Tuple[str, Callable],
                 regressor: Optional[object],
                 has_logit: bool, use_mixture: bool):
        """
        :param data: train data
        :param descriptor: map for data
        :param scoring_function: tuple ("Name", callable) selecting the score
        :param regressor: an object to pass into gaussian nodes
        :param has_logit: allows edges from cont to disc nodes
        :param use_mixture: allows using Mixture
        """
        super(
            HCStructureBuilder,
            self).__init__(
            descriptor=descriptor,
            data=data,
            scoring_function=scoring_function,
            regressor=regressor)
        self.use_mixture = use_mixture
        self.has_logit = has_logit

    def build(self, data: DataFrame,
              progress_bar: bool,
              classifier: Optional[object],
              regressor: Optional[object],
              params: Optional[ParamDict] = None):
        """
        Run the full two-level build: structure search, then node assembly.

        :param data: train data
        :param progress_bar: verbose regime
        :param classifier: an object to pass into logit, condLogit nodes
        :param regressor: an object to pass into gaussian nodes
        :param params: optional overrides merged into ``self.params``
        """
        if params:
            for param, value in params.items():
                self.params[param] = value

        # BUG FIX: pop from a local copy — the original popped from
        # self.params itself, so a second build() call raised KeyError.
        search_params = dict(self.params)
        init_nodes = search_params.pop('init_nodes')
        bl_add = search_params.pop('bl_add')

        # Level 1: nodes, restrictions, then edge search by score family.
        self.skeleton['V'] = self.vertices

        self.restrict(data, init_nodes, bl_add)
        if self.scoring_function[0] == 'K2':
            self.apply_K2(data=data, progress_bar=progress_bar,
                          **search_params)
        elif self.scoring_function[0] in ['MI', 'LL', 'BIC', 'AIC']:
            self.apply_group1(data=data, progress_bar=progress_bar,
                              **search_params)

        # Level 2: attach families and specialize node types.
        self.get_family()
        self.overwrite_vertex(has_logit=self.has_logit,
                              use_mixture=self.use_mixture,
                              classifier=classifier,
                              regressor=regressor)
super().__init__(descriptor, regressor=regressor)
self.optimizer = None # will be defined in subclasses
Loading

0 comments on commit 25ba073

Please sign in to comment.