diff --git a/bamt/external/pyitlib/DiscreteRandomVariableUtils.py b/bamt/external/pyitlib/DiscreteRandomVariableUtils.py
index 004a237..d2b4848 100644
--- a/bamt/external/pyitlib/DiscreteRandomVariableUtils.py
+++ b/bamt/external/pyitlib/DiscreteRandomVariableUtils.py
@@ -8,17 +8,17 @@
 def information_mutual_conditional(
-        x,
-        y,
-        z,
-        cartesian_product=False,
-        base=2,
-        fill_value=-1,
-        estimator="ML",
-        alphabet_x=None,
-        alphabet_y=None,
-        Alphabet_Z=None,
-        keep_dims=False,
+    x,
+    y,
+    z,
+    cartesian_product=False,
+    base=2,
+    fill_value=-1,
+    estimator="ML",
+    alphabet_x=None,
+    alphabet_y=None,
+    Alphabet_Z=None,
+    keep_dims=False,
 ):
     x, fill_value_X = _sanitise_array_input(x, fill_value)
     y, fill_value_Y = _sanitise_array_input(y, fill_value)
@@ -113,6 +113,7 @@ def information_mutual_conditional(
         Alphabet_Z = np.reshape(Alphabet_Z, (-1, Alphabet_Z.shape[-1]))
         I = []
         for i in range(z.shape[0]):
+
             def f(X, Y, Alphabet_X, Alphabet_Y):
                 return information_mutual_conditional(
                     X,
@@ -150,28 +151,28 @@ def f(X, Y, Alphabet_X, Alphabet_Y):
     for i in range(x.shape[0]):
         I_ = (
-                entropy_joint(
-                    np.vstack((x[i], z[i])),
-                    base,
-                    fill_value,
-                    estimator,
-                    _vstack_pad((alphabet_x[i], Alphabet_Z[i]), fill_value),
-                )
-                + entropy_joint(
-                    np.vstack((y[i], z[i])),
-                    base,
-                    fill_value,
-                    estimator,
-                    _vstack_pad((alphabet_y[i], Alphabet_Z[i]), fill_value),
-                )
-                - entropy_joint(
-                    np.vstack((x[i], y[i], z[i])),
-                    base,
-                    fill_value,
-                    estimator,
-                    _vstack_pad((alphabet_x[i], alphabet_y[i], Alphabet_Z[i]), fill_value),
-                )
-                - entropy_joint(z[i], base, fill_value, estimator, Alphabet_Z[i])
+            entropy_joint(
+                np.vstack((x[i], z[i])),
+                base,
+                fill_value,
+                estimator,
+                _vstack_pad((alphabet_x[i], Alphabet_Z[i]), fill_value),
+            )
+            + entropy_joint(
+                np.vstack((y[i], z[i])),
+                base,
+                fill_value,
+                estimator,
+                _vstack_pad((alphabet_y[i], Alphabet_Z[i]), fill_value),
+            )
+            - entropy_joint(
+                np.vstack((x[i], y[i], z[i])),
+                base,
+                fill_value,
+                estimator,
+                _vstack_pad((alphabet_x[i], alphabet_y[i], Alphabet_Z[i]), fill_value),
+            )
+            - entropy_joint(z[i], base, fill_value, estimator, Alphabet_Z[i])
         )
         I[i] = I_
@@ -185,15 +186,15 @@ def f(X, Y, Alphabet_X, Alphabet_Y):
 
 def information_mutual(
-        X,
-        Y=None,
-        cartesian_product=False,
-        base=2,
-        fill_value=-1,
-        estimator="ML",
-        Alphabet_X=None,
-        Alphabet_Y=None,
-        keep_dims=False,
+    X,
+    Y=None,
+    cartesian_product=False,
+    base=2,
+    fill_value=-1,
+    estimator="ML",
+    Alphabet_X=None,
+    Alphabet_Y=None,
+    keep_dims=False,
 ):
     H_conditional = entropy_conditional(
         X, Y, cartesian_product, base, fill_value, estimator, Alphabet_X, Alphabet_Y
@@ -236,7 +237,7 @@ def entropy_pmf(P, base=2, require_valid_pmf=True, keep_dims=False):
 
 def entropy_joint(
-        X, base=2, fill_value=-1, estimator="ML", Alphabet_X=None, keep_dims=False
+    X, base=2, fill_value=-1, estimator="ML", Alphabet_X=None, keep_dims=False
 ):
     X, fill_value_X = _sanitise_array_input(X, fill_value)
     if Alphabet_X is not None:
@@ -302,15 +303,15 @@
 
 def entropy_conditional(
-        X,
-        Y=None,
-        cartesian_product=False,
-        base=2,
-        fill_value=-1,
-        estimator="ML",
-        Alphabet_X=None,
-        Alphabet_Y=None,
-        keep_dims=False,
+    X,
+    Y=None,
+    cartesian_product=False,
+    base=2,
+    fill_value=-1,
+    estimator="ML",
+    Alphabet_X=None,
+    Alphabet_Y=None,
+    keep_dims=False,
 ):
     if Y is None:
         Y = X
@@ -485,10 +486,10 @@ def entropy(X, base=2, fill_value=-1, estimator="ML", Alphabet_X=None, keep_dims
         # P_0 is the probability mass assigned to each additional empty bin
         P, P_0 = _estimate_probabilities(L, estimator, n_additional_empty_bins)
         H_0 = (
-                n_additional_empty_bins
-                * P_0
-                * -np.log2(P_0 + np.spacing(0))
-                / np.log2(base)
+            n_additional_empty_bins
+            * P_0
+            * -np.log2(P_0 + np.spacing(0))
+            / np.log2(base)
         )
         H[i] = entropy_pmf(P, base, require_valid_pmf=False) + H_0
@@ -571,7 +572,7 @@ def _map_observations_to_integers(Symbol_matrices, Fill_values):
     assert len(Symbol_matrices) == len(Fill_values)
     FILL_VALUE = -1
     if np.any([A.dtype != "int" for A in Symbol_matrices]) or np.any(
-            np.array(Fill_values) != FILL_VALUE
+        np.array(Fill_values) != FILL_VALUE
     ):
         L = sklearn.preprocessing.LabelEncoder()
         F = [np.atleast_1d(v) for v in Fill_values]
@@ -606,7 +607,7 @@ def _isnan_element(x):
 
 def _determine_number_additional_empty_bins(
-        Counts, Alphabet, Full_Alphabet, fill_value
+    Counts, Alphabet, Full_Alphabet, fill_value
 ):
     alphabet_sizes = np.sum(np.atleast_2d(Full_Alphabet) != fill_value, axis=-1)
     if np.any(alphabet_sizes != fill_value):
@@ -630,34 +631,35 @@ def _estimate_probabilities(Counts, estimator, n_additional_empty_bins=0):
     # 2) James-Stein approach may be used as an alternative
     # 3) Dirichlet prior may be used in all other cases
 
-    assert (np.sum(Counts) > 0)
-    assert (np.all(Counts.astype('int') == Counts))
-    assert (n_additional_empty_bins >= 0)
-    Counts = Counts.astype('int')
+    assert np.sum(Counts) > 0
+    assert np.all(Counts.astype("int") == Counts)
+    assert n_additional_empty_bins >= 0
+    Counts = Counts.astype("int")
 
     if isinstance(estimator, str):
-        estimator = estimator.upper().replace(' ', '')
+        estimator = estimator.upper().replace(" ", "")
 
-    if np.isreal(estimator) or estimator in ('ML', 'PERKS', 'MINIMAX'):
+    if np.isreal(estimator) or estimator in ("ML", "PERKS", "MINIMAX"):
         if np.isreal(estimator):
             alpha = estimator
-        elif estimator == 'PERKS':
+        elif estimator == "PERKS":
             alpha = 1.0 / (Counts.size + n_additional_empty_bins)
-        elif estimator == 'MINIMAX':
-            alpha = np.sqrt(np.sum(Counts)) / \
-                (Counts.size + n_additional_empty_bins)
+        elif estimator == "MINIMAX":
+            alpha = np.sqrt(np.sum(Counts)) / (Counts.size + n_additional_empty_bins)
         else:
            alpha = 0
-        Theta = (Counts + alpha) / \
-            (1.0 * np.sum(Counts) + alpha * (Counts.size + n_additional_empty_bins))
+        Theta = (Counts + alpha) / (
+            1.0 * np.sum(Counts) + alpha * (Counts.size + n_additional_empty_bins)
+        )
         # Theta_0 is the probability mass assigned to each additional empty bin
         if n_additional_empty_bins > 0:
-            Theta_0 = alpha / (1.0 * np.sum(Counts) +
-                               alpha * (Counts.size + n_additional_empty_bins))
+            Theta_0 = alpha / (
+                1.0 * np.sum(Counts) + alpha * (Counts.size + n_additional_empty_bins)
+            )
         else:
             Theta_0 = 0
-    elif estimator == 'GOOD-TURING':
+    elif estimator == "GOOD-TURING":
 
         # TODO We could also add a Chen-Chao vocabulary size estimator (See
         # Bhat Suma's thesis)
@@ -679,34 +681,35 @@ def _estimate_probabilities(Counts, estimator, n_additional_empty_bins=0):
         # Fit least squares regression line to plot of log(Z_r) versus log(r)
         x = np.log10(np.arange(1, Z_r.size))
-        with np.errstate(invalid='ignore', divide='ignore'):
+        with np.errstate(invalid="ignore", divide="ignore"):
             y = np.log10(Z_r[1:])
         x = x[np.isfinite(y)]
         y = y[np.isfinite(y)]
-        m, c = np.linalg.lstsq(np.vstack([x, np.ones(x.size)]).T, y,
-                               rcond=None)[0]
+        m, c = np.linalg.lstsq(np.vstack([x, np.ones(x.size)]).T, y, rcond=None)[0]
         if m >= -1:
-            warnings.warn("Regression slope < -1 requirement in linear "
-                          "Good-Turing estimate not satisfied")
+            warnings.warn(
+                "Regression slope < -1 requirement in linear "
+                "Good-Turing estimate not satisfied"
+            )
         # Compute smoothed value of N_r based on interpolation
         # We need to refer to SmoothedN_{r+1} for all observed values of r
         SmoothedN_r = np.zeros(N_r.size + 1)
-        SmoothedN_r[1:] = 10 ** (np.log10(np.arange(1, SmoothedN_r.size)) *
-                                 m + c)
+        SmoothedN_r[1:] = 10 ** (np.log10(np.arange(1, SmoothedN_r.size)) * m + c)
         # Determine threshold value of r at which to use smoothed values of N_r
        # (SmoothedN_r), as apposed to straightforward N_r.
         # Variance of Turing estimate
-        with np.errstate(invalid='ignore', divide='ignore'):
-            VARr_T = (np.arange(N_r.size) + 1) ** 2 * \
-                (1.0 * np.append(N_r[1:], 0) / (N_r ** 2)) * \
-                (1 + np.append(N_r[1:], 0) / N_r)
+        with np.errstate(invalid="ignore", divide="ignore"):
+            VARr_T = (
+                (np.arange(N_r.size) + 1) ** 2
+                * (1.0 * np.append(N_r[1:], 0) / (N_r**2))
+                * (1 + np.append(N_r[1:], 0) / N_r)
+            )
         x = (np.arange(N_r.size) + 1) * 1.0 * np.append(N_r[1:], 0) / N_r
-        y = (np.arange(N_r.size) + 1) * \
-            1.0 * SmoothedN_r[1:] / (SmoothedN_r[:-1])
-        assert (np.isinf(VARr_T[0]) or np.isnan(VARr_T[0]))
+        y = (np.arange(N_r.size) + 1) * 1.0 * SmoothedN_r[1:] / (SmoothedN_r[:-1])
+        assert np.isinf(VARr_T[0]) or np.isnan(VARr_T[0])
         turing_is_sig_diff = np.abs(x - y) > 1.96 * np.sqrt(VARr_T)
-        assert (turing_is_sig_diff[0] == np.array(False))
+        assert turing_is_sig_diff[0] == np.array(False)
         # NB: 0th element can be safely ignored, since always 0
         T = np.where(turing_is_sig_diff == np.array(False))[0]
         if T.size > 1:
@@ -722,8 +725,12 @@ def _estimate_probabilities(Counts, estimator, n_additional_empty_bins=0):
         # objects observed r times, r>0
         p_r = np.zeros(N_r.size)
         N = np.sum(Counts)
-        p_r[1:] = (np.arange(1, N_r.size) + 1) * \
-            1.0 * SmoothedN_r[2:] / (SmoothedN_r[1:-1] * N)
+        p_r[1:] = (
+            (np.arange(1, N_r.size) + 1)
+            * 1.0
+            * SmoothedN_r[2:]
+            / (SmoothedN_r[1:-1] * N)
+        )
         # Estimate probability of observing any unseen symbol
         p_r[0] = 1.0 * N_r[1] / N
@@ -735,15 +742,16 @@ def _estimate_probabilities(Counts, estimator, n_additional_empty_bins=0):
         if np.any(Counts == 0) or n_additional_empty_bins > 0:
             Theta = (1 - p_r[0]) * Theta / np.sum(Theta)
         else:
-            warnings.warn("No unobserved outcomes specified. Disregarding the "
-                          "probability mass allocated to any unobserved "
-                          "outcomes.")
+            warnings.warn(
+                "No unobserved outcomes specified. Disregarding the "
+                "probability mass allocated to any unobserved "
+                "outcomes."
+            )
             Theta = Theta / np.sum(Theta)
         # Divide p_0 among unobserved symbols
-        with np.errstate(invalid='ignore', divide='ignore'):
-            p_emptybin = p_r[0] / (np.sum(Counts == 0) +
-                                   n_additional_empty_bins)
+        with np.errstate(invalid="ignore", divide="ignore"):
+            p_emptybin = p_r[0] / (np.sum(Counts == 0) + n_additional_empty_bins)
         Theta[Counts == 0] = p_emptybin
         # Theta_0 is the probability mass assigned to each additional empty bin
         if n_additional_empty_bins > 0:
@@ -751,14 +759,17 @@ def _estimate_probabilities(Counts, estimator, n_additional_empty_bins=0):
         else:
             Theta_0 = 0
 
-    elif estimator == 'JAMES-STEIN':
-        Theta, _ = _estimate_probabilities(Counts, 'ML')
+    elif estimator == "JAMES-STEIN":
+        Theta, _ = _estimate_probabilities(Counts, "ML")
         p_uniform = 1.0 / (Counts.size + n_additional_empty_bins)
-        with np.errstate(invalid='ignore', divide='ignore'):
-            Lambda = (1 - np.sum(Theta ** 2)) / \
-                ((np.sum(Counts) - 1) *
-                 (np.sum((p_uniform - Theta) ** 2) +
-                  n_additional_empty_bins * p_uniform ** 2))
+        with np.errstate(invalid="ignore", divide="ignore"):
+            Lambda = (1 - np.sum(Theta**2)) / (
+                (np.sum(Counts) - 1)
+                * (
+                    np.sum((p_uniform - Theta) ** 2)
+                    + n_additional_empty_bins * p_uniform**2
+                )
+            )
         if Lambda > 1:
             Lambda = 1
diff --git a/bamt/log.py b/bamt/log.py
index 74d65fc..2935da9 100644
--- a/bamt/log.py
+++ b/bamt/log.py
@@ -39,7 +39,9 @@ def switch_console_out(self, value: bool):
         for logger in self.loggers.values():
             if self.has_handler(logger, handler_class):
                 self.remove_handler_type(logger, handler_class)
-            logger.addHandler(logging.NullHandler() if not value else logging.root.handlers[0])
+            logger.addHandler(
+                logging.NullHandler() if not value else logging.root.handlers[0]
+            )
 
     def switch_file_out(self, value: bool, log_file: str):
         """
diff --git a/bamt/nodes/conditional_mixture_gaussian_node.py b/bamt/nodes/conditional_mixture_gaussian_node.py
index bdd3263..db6fa0f 100644
--- a/bamt/nodes/conditional_mixture_gaussian_node.py
+++ b/bamt/nodes/conditional_mixture_gaussian_node.py
@@ -165,11 +165,11 @@ def choose(
         mean, covariance, w = self.get_dist(node_info, pvals)
 
         # check if w is nan or list of weights
-        if not isinstance(w, np.ndarray):
+        if not isinstance(w, np.ndarray):
             return np.nan
-
+
         n_comp = len(w)
-
+
         gmm = GMM(
             n_components=n_comp,
             priors=w,
diff --git a/bamt/utils/composite_utils/MLUtils.py b/bamt/utils/composite_utils/MLUtils.py
index 9c3cefb..2dd3558 100644
--- a/bamt/utils/composite_utils/MLUtils.py
+++ b/bamt/utils/composite_utils/MLUtils.py
@@ -1,9 +1,8 @@
 import json
 from random import choice
-
-import pkg_resources
 from typing import Union
 
+import pkg_resources
 from catboost import CatBoostClassifier, CatBoostRegressor
 from golem.core.dag.graph_node import GraphNode
 from sklearn.cluster import KMeans
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 92a1e5f..436a7d8 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -9,7 +9,6 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 
-import datetime
 import sys
 from pathlib import Path
diff --git a/tests/sendingRegressors.py b/tests/sendingRegressors.py
index b0c79ae..346e795 100644
--- a/tests/sendingRegressors.py
+++ b/tests/sendingRegressors.py
@@ -4,7 +4,6 @@
 from catboost import CatBoostRegressor
 from sklearn import preprocessing as pp
 from sklearn.ensemble import RandomForestRegressor
-
 # from sklearn.linear_model import ElasticNet
 from sklearn.tree import DecisionTreeRegressor