diff --git a/.gitignore b/.gitignore
index aaee16c..46aa550 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ experiments/
 *__pycache__
 .DS_Store
 .vscode
+logs
\ No newline at end of file
diff --git a/lkmeans/examples/data/experiment_data.py b/lkmeans/examples/data/experiment_data.py
index e9b759d..d9cc562 100644
--- a/lkmeans/examples/data/experiment_data.py
+++ b/lkmeans/examples/data/experiment_data.py
@@ -11,21 +11,16 @@ def get_experiment_data(num_clusters: int, dimension: int) -> tuple[int, float,
     Function for generation the synthetic data for experiments by dimension and number of clusters
     '''
-    n_clusters: int = 0
     prob: float = 0.
     mu_prefix: list[list[float | int]] = [[]]
     sigma_list: list[float | int] = []
 
     if num_clusters == 2:
-        print('Experiment with 2 clusters')
-        n_clusters = 2
         sigma_list = [1, 1]
         prob = 0.5
         mu_prefix = [[-4, 0], [4, 0]]
 
     elif num_clusters == 3:
-        print('Experiment with 3 clusters')
-        n_clusters = 3
         sigma_list = [1, 1, 1]
         prob = 1/3
         mu_prefix = [[4, 0, 0], [0, 4, 0], [0, 0, 4]]
 
@@ -35,4 +30,4 @@ def get_experiment_data(num_clusters: int, dimension: int) -> tuple[int, float,
     mu_list = [np.array([x + [0] * (dimension - len(x))]) for x in mu_prefix]
     cov_matrix = [get_covariance_matrix(sigma, dimension) for sigma in sigma_list]
 
-    return n_clusters, prob, mu_list, cov_matrix
+    return num_clusters, prob, mu_list, cov_matrix
diff --git a/lkmeans/examples/experiment.py b/lkmeans/examples/experiment.py
deleted file mode 100644
index 2dda889..0000000
--- a/lkmeans/examples/experiment.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import time
-from pathlib import Path
-
-from matplotlib import pyplot as plt
-from numpy.typing import NDArray
-from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score
-
-from lkmeans import LKMeans
-from lkmeans.examples.data.points_generator import generate_mix_distribution
-from lkmeans.examples.decomposition import get_tsne_clusters
-from lkmeans.examples.metric_meter import GraphicMeter, MetricMeter, MetricTable, insert_hline
-
-
-# pylint: disable=too-many-arguments, too-many-locals
-def repeat_iteration(
-        repeats: int,
-        n_clusters: int,
-        n_points: int,
-        prob: float,
-        cov_matrices: list[NDArray],
-        t: float,
-        mu_list: list[NDArray],
-        p: float | int,
-        makes_plot: bool,
-        output_path: Path
-
-):
-    repeat_metric_meter = MetricMeter()
-    for _ in range(repeats):
-
-        clusters, labels, centroids = generate_mix_distribution(
-            probability=prob,
-            mu_list=mu_list,
-            cov_matrices=cov_matrices,
-            n_samples=n_points,
-            t=t
-        )
-
-        experiment_time = time.perf_counter()
-        lkmeans = LKMeans(n_clusters=n_clusters, p=p)
-        generated_labels = lkmeans.fit_predict(clusters)
-        experiment_time = time.perf_counter() - experiment_time
-
-        repeat_metric_meter.add_combination(
-            ari=adjusted_rand_score(labels, generated_labels),
-            ami=float(adjusted_mutual_info_score(labels, generated_labels)),
-            inertia=lkmeans.inertia_,
-            time=experiment_time
-        )
-        if makes_plot:
-            figure_name = f'factor_{t:.1f}'.replace('.', '_')
-            fig = get_tsne_clusters(clusters, labels, centroids)
-            fig.savefig(
-                str(output_path / f'{figure_name}.png'), dpi=300, bbox_inches='tight')
-            plt.close(fig)
-    average_ari, average_ami, average_inertia, average_time = repeat_metric_meter.get_average()
-    return average_ari, average_ami, average_inertia, average_time
-
-
-# pylint: disable= too-many-arguments, too-many-locals
-def run_experiment(
-        n_clusters: int,
-        distance_parameters: list[float],
-        minkowski_parameters: list[float | int],
-        repeats: int,
-        n_points: int,
-        cov_matrices: list[NDArray],
-        prob: float,
-        mu_list: list[NDArray],
-        experiment_name: str,
-        output_path: Path,
-        makes_plot: bool = False) -> None:
-    '''Function for evaluation experiment'''
-
-    output_path.mkdir(exist_ok=True, parents=True)
-
-    table = MetricTable()
-
-    generator = [GraphicMeter(distance_parameters, 't')
-                 for _ in minkowski_parameters]
-    graphic_t_metrics_dict = dict(zip(minkowski_parameters, generator))
-
-    for t in distance_parameters:
-
-        graphic_p_metrics = GraphicMeter(minkowski_parameters, 'p')
-        for p in minkowski_parameters:
-
-            average_ari, average_ami, average_inertia, average_time = repeat_iteration(
-                repeats, n_clusters, n_points, prob,
-                cov_matrices, t, mu_list, p, makes_plot, output_path
-            )
-
-            table.add_to_frame(
-                ari=average_ari,
-                ami=average_ami,
-                inertia=average_inertia,
-                time=average_time,
-                name=f'{experiment_name}, T:{t:.1f}, P:{p}'
-            )
-
-            graphic_p_metrics.add_combination(
-                ari=average_ari,
-                ami=average_ami,
-                inertia=average_inertia,
-                time=average_time
-            )
-
-            graphic_t_metrics_dict[p].add_combination(
-                ari=average_ari,
-                ami=average_ami,
-                inertia=average_inertia,
-                time=average_time
-            )
-
-        for metric_graph in ['ARI', 'AMI', 'Inertia', 'Time']:
-            figure_name = f'factor_{t:.1f}_{metric_graph}'.replace('.', '_')
-            fig = graphic_p_metrics.get_graph(metric_graph)
-            fig.savefig(
-                str(output_path / f'{figure_name}.png'), dpi=300, bbox_inches='tight')
-            plt.close(fig)
-
-    print(table.get_table())
-
-    for p, graph_t_meter in graphic_t_metrics_dict.items():
-        for metric in ['ARI', 'AMI', 'Inertia', 'Time']:
-            figure_name = f'{metric}_by_t_with_p_{p}'.replace('.', '_')
-            fig = graph_t_meter.get_graph(metric)
-            fig.savefig(
-                str(output_path / f'{figure_name}.png'), dpi=300, bbox_inches='tight')
-            plt.close(fig)
-
-    table_name = 'experiment 1'
-    table = table.get_latex_table(caption='Experiment 1')
-    table = insert_hline(table)
-
-    latex_logs = output_path / f'{table_name.replace(" ", "_")}.tex'
-    with latex_logs.open('w') as f:
-        f.write(table)
diff --git a/lkmeans/examples/main.py b/lkmeans/examples/main.py
index 9432eba..d175adc 100644
--- a/lkmeans/examples/main.py
+++ b/lkmeans/examples/main.py
@@ -1,54 +1,96 @@
-from argparse import ArgumentParser
-from pathlib import Path
+import time
+from collections import defaultdict
+from enum import Enum
+from typing import Dict, Type
+import numpy as np
+from numpy.typing import NDArray
+from sklearn.metrics import accuracy_score, adjusted_mutual_info_score, adjusted_rand_score
+from tap import Tap
+
+from lkmeans.clustering import HardSSLKMeans, LKMeans, SoftSSLKMeans
+from lkmeans.clustering.base import Clustering
+from lkmeans.clustering.supervised.utils import select_supervisor_targets
 from lkmeans.examples.data.experiment_data import get_experiment_data
-from lkmeans.examples.experiment import run_experiment
-
-parser = ArgumentParser()
-
-parser.add_argument(
-    '--path',
-    type=Path,
-    default=Path('experiments'),
-    help='Path to save results'
-)
-
-parser.add_argument(
-    '--num-clusters',
-    type=int,
-    default=2
-)
-
-
-def main():
-    args = parser.parse_args()
-    experiments_path = args.path
-
-    minkowski_parameter = [0.2, 0.6, 1, 1.5, 2, 3, 5]
-    T_parameter = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
-    repeats = 100
-    n_points = [100, 500, 1000]
-
-    dimension = 20
-    n_clusters, prob, mu_list, cov_matrices = get_experiment_data(
-        num_clusters=args.num_clusters, dimension=dimension)
-
-    for points in n_points:
-        experiment_name = f'Clusters:{n_clusters}, points:{points}'
-        output_path = experiments_path / f'exp_{args.num_clusters}_points_{points}'
-
-        run_experiment(
-            n_clusters=n_clusters,
-            distance_parameters=T_parameter,
-            minkowski_parameters=minkowski_parameter,
-            repeats=repeats,
-            n_points=points,
-            cov_matrices=cov_matrices,
-            prob=prob,
+from lkmeans.examples.data.points_generator import generate_mix_distribution
+
+
+class ClusteringAlgorithmType(Enum):
+    LKMEANS = 'lkmeans'
+    SOFT_SS_LKMEANS = 'soft_ss_lkmeans'
+    HARD_SS_LKMEANS = 'hard_ss_lkmeans'
+
+
+class ExperimentArguments(Tap):
+    minkowski_parameter: float
+    t_parameter: float
+    n_points: int
+    clustering_algorithm: ClusteringAlgorithmType = ClusteringAlgorithmType.LKMEANS
+
+    num_clusters: int = 2
+    dimension: int = 20
+    repeats: int = 10
+    supervision_ratio: float = 0
+
+
+def get_clustering_algorithm(clustering_type: ClusteringAlgorithmType) -> Type[Clustering]:
+    clustering_map: Dict[ClusteringAlgorithmType, Type[Clustering]] = {
+        ClusteringAlgorithmType.LKMEANS: LKMeans,
+        ClusteringAlgorithmType.SOFT_SS_LKMEANS: SoftSSLKMeans,
+        ClusteringAlgorithmType.HARD_SS_LKMEANS: HardSSLKMeans
+    }
+    return clustering_map[clustering_type]
+
+
+def calculate_metrics(labels: NDArray, generated_labels: NDArray) -> Dict[str, float]:
+    return {
+        'ari': float(adjusted_rand_score(labels, generated_labels)),
+        'ami': float(adjusted_mutual_info_score(labels, generated_labels)),
+        'accuracy': float(accuracy_score(labels, generated_labels)),
+    }
+
+
+def main() -> None:
+    args = ExperimentArguments(underscores_to_dashes=True).parse_args()
+
+    _, prob, mu_list, cov_matrices = get_experiment_data(args.num_clusters, args.dimension)
+
+    clustering = get_clustering_algorithm(args.clustering_algorithm)
+
+    average_result = defaultdict(list)
+
+    for _ in range(args.repeats):
+
+        clusters, labels, _ = generate_mix_distribution(
+            probability=prob,
             mu_list=mu_list,
-            experiment_name=experiment_name,
-            output_path=output_path
+            cov_matrices=cov_matrices,
+            n_samples=args.n_points,
+            t=args.t_parameter
+        )
+
+        lkmeans = clustering(n_clusters=args.num_clusters, p=args.minkowski_parameter)
+
+        if args.clustering_algorithm is ClusteringAlgorithmType.LKMEANS:
+
+            experiment_time = time.perf_counter()
+            generated_labels = lkmeans.fit_predict(clusters)
+        else:
+            experiment_time = time.perf_counter()
+            supervisor_targets = select_supervisor_targets(labels, args.supervision_ratio)
+            generated_labels = lkmeans.fit_predict(clusters, supervisor_targets)
+        experiment_time = time.perf_counter() - experiment_time
+
+        metrics_dict = calculate_metrics(
+            labels=labels,
+            generated_labels=generated_labels,
         )
+        result = {**metrics_dict, 'time': experiment_time, 'inertia': lkmeans.inertia_}
+        for key, value in result.items():
+            average_result[key].append(value)
+    for key, value in average_result.items():
+        average_result[key] = np.mean(value)
+    print(dict(average_result))
 
 
 if __name__ == '__main__':
diff --git a/lkmeans/examples/metric_meter.py b/lkmeans/examples/metric_meter.py
deleted file mode 100644
index dd11f70..0000000
--- a/lkmeans/examples/metric_meter.py
+++ /dev/null
@@ -1,113 +0,0 @@
-from typing import Optional
-
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-
-
-class MetricTable:
-    def __init__(self) -> None:
-        self.frames = []
-
-    def add_frame(self, frame: pd.DataFrame) -> None:
-        self.frames.append(frame)
-
-    def add_to_frame(self,
-                     ari: float,
-                     ami: float,
-                     inertia: float,
-                     time: float,
-                     name: Optional[str] = 'Experiment'
-                     ) -> None:
-        data = {'ARI': f'{ari:.2f}', 'AMI': f'{ami:.2f}',
-                'Inertia': f'{inertia:.2f}', 'Time': f'{time:.2f}'}
-        frame = pd.DataFrame(data, [name])
-        self.frames.append(frame)
-
-    def add_empty_frame(self, time: bool) -> None:
-        empty = 'N/A'
-        data = {'ARI': empty, 'AMI': empty}
-        if time:
-            data['Time'] = empty
-        frame = pd.DataFrame(data, [empty])
-        self.frames.append(frame)
-
-    def get_table(self) -> pd.DataFrame:
-        return pd.concat(self.frames, join="inner")
-
-    def get_latex_table(self, caption: str = '') -> str:
-        table = self.get_table()
-        return table.style.to_latex(caption=caption)
-
-
-def insert_hline(latex_str: str) -> str:
-    lines_strings = latex_str.splitlines()
-    result = []
-
-    for line in lines_strings:
-        if 'N/A' in line:
-            result.append('\\midrule')
-        else:
-            result.append(line)
-    result = '\n'.join(result)
-    return result
-
-
-class MetricMeter:
-    def __init__(self) -> None:
-        self.ari = []
-        self.ami = []
-        self.inertia = []
-        self.time = []
-
-    def add_ari(self, value: float) -> None:
-        self.ari.append(value)
-
-    def add_ami(self, value: float) -> None:
-        self.ami.append(value)
-
-    def add_inertia(self, value: float) -> None:
-        self.inertia.append(value)
-
-    def add_time(self, value: float) -> None:
-        self.time.append(value)
-
-    def add_combination(self, ari: float, ami: float, inertia: float, time: float) -> None:
-        self.add_ari(abs(ari))
-        self.add_ami(abs(ami))
-        self.add_inertia(inertia)
-        self.add_time(time)
-
-    def get_average(self) -> tuple[float, float, float, float]:
-        return float(np.mean(self.ari)), float(np.mean(self.ami)), \
-            float(np.mean(self.inertia)), float(np.mean(self.time))
-
-
-class GraphicMeter(MetricMeter):
-    def __init__(self, base: list, base_name: str) -> None:
-        super().__init__()
-        self.base = base
-        self.base_name = base_name
-
-    def get_graph(self, key: str):
-        values = {'ARI': self.ari, 'AMI': self.ami,
-                  'Inertia': self.inertia, 'Time': self.time}
-
-        fig, ax = plt.subplots(figsize=(5, 4))
-        param = values[key]
-        ax.plot(self.base, param, '-o')
-        ax.grid(True, color='gray', linestyle='--', linewidth=0.5)
-
-        if self.base_name == 'p':
-            ax.set_xticks(self.base)
-        else:
-            ax.set_xticks(np.linspace(0, 1, 11))
-        ax.set_xlabel(self.base_name)
-
-        if key in ('ARI', 'AMI'):
-            ax.set_yticks(np.arange(0, 1.1, 0.1))
-        else:
-            ax.set_yticks(np.linspace(np.min(param), np.max(param), 10))
-        ax.set_ylabel(key)
-        # ax.set_title(f'{key} vs. {self.base_name}')
-        return fig
diff --git a/lkmeans/examples/scripts/runner_supervised.sh b/lkmeans/examples/scripts/runner_supervised.sh
new file mode 100644
index 0000000..4ab156f
--- /dev/null
+++ b/lkmeans/examples/scripts/runner_supervised.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+LOGDIR=logs
+set -ex
+
+mkdir -p ${LOGDIR}
+
+# VALUES
+MINKOWSKI_VALUES=(0.5 1 2 5)
+T_VALUES=(0 0.2 0.4 0.6 0.8)
+N_POINTS_VALUES=(100 500 1000)
+CLUSTERINGS_VALUES=(soft_ss_lkmeans hard_ss_lkmeans)
+SUPERVISION_RATIO_VALUES=(0.1 0.15 0.2)
+DIMENSION_VALUES=(20)
+NUM_CLUSTERS_VALUES=(2 3)
+
+# Constants
+REPEATS=10
+
+source .env
+
+for NUM_CLUSTERS in "${NUM_CLUSTERS_VALUES[@]}";do
+    for MINKOWSKI in "${MINKOWSKI_VALUES[@]}";do
+        for T in "${T_VALUES[@]}";do
+            for N_POINTS in "${N_POINTS_VALUES[@]}";do
+                for DIMENSION in "${DIMENSION_VALUES[@]}";do
+                    for CLUSTERING in "${CLUSTERINGS_VALUES[@]}";do
+                        for SUPERVISION_RATIO in "${SUPERVISION_RATIO_VALUES[@]}";do
+
+
+NAME="${CLUSTERING}_|_supervision_${SUPERVISION_RATIO}_|_num-clusters_${NUM_CLUSTERS}_|_minkowski_${MINKOWSKI}_|_t_${T}_|_n-points_${N_POINTS}_|_dimension_${DIMENSION}_|_repeats_${REPEATS}.log"
+
+    echo ${NAME}
+    PARAMETERS="
+    --num-clusters ${NUM_CLUSTERS} \
+    --minkowski-parameter ${MINKOWSKI} \
+    --t-parameter ${T} \
+    --n-points ${N_POINTS} \
+    --dimension ${DIMENSION} \
+    --clustering-algorithm ${CLUSTERING} \
+    --supervision-ratio ${SUPERVISION_RATIO} \
+    --repeats ${REPEATS} \
+    "
+    python lkmeans/examples/main.py ${PARAMETERS} &> ${LOGDIR}/${NAME}
+                        done
+                    done
+                done
+            done
+        done
+    done
+done
+
diff --git a/lkmeans/examples/scripts/runner_unsupervised.sh b/lkmeans/examples/scripts/runner_unsupervised.sh
new file mode 100644
index 0000000..139b201
--- /dev/null
+++ b/lkmeans/examples/scripts/runner_unsupervised.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+LOGDIR=logs
+set -ex
+
+mkdir -p ${LOGDIR}
+
+# VALUES
+MINKOWSKI_VALUES=(0.5 1 2 5)
+T_VALUES=(0 0.2 0.4 0.6 0.8)
+N_POINTS_VALUES=(100 500 1000)
+DIMENSION_VALUES=(20)
+NUM_CLUSTERS_VALUES=(2 3)
+
+# Constants
+CLUSTERING=lkmeans
+REPEATS=10
+
+source .env
+
+for NUM_CLUSTERS in "${NUM_CLUSTERS_VALUES[@]}";do
+    for MINKOWSKI in "${MINKOWSKI_VALUES[@]}";do
+        for T in "${T_VALUES[@]}";do
+            for N_POINTS in "${N_POINTS_VALUES[@]}";do
+                for DIMENSION in "${DIMENSION_VALUES[@]}";do
+
+NAME="${CLUSTERING}_|_num-clusters_${NUM_CLUSTERS}_|_minkowski_${MINKOWSKI}_|_t_${T}_|_n-points_${N_POINTS}_|_dimension_${DIMENSION}_|_repeats_${REPEATS}.log"
+
+    echo ${NAME}
+    PARAMETERS="
+    --num-clusters ${NUM_CLUSTERS} \
+    --minkowski-parameter ${MINKOWSKI} \
+    --t-parameter ${T} \
+    --n-points ${N_POINTS} \
+    --dimension ${DIMENSION} \
+    --clustering-algorithm ${CLUSTERING} \
+    --repeats ${REPEATS} \
+    "
+    python lkmeans/examples/main.py ${PARAMETERS} &> ${LOGDIR}/${NAME}
+                done
+            done
+        done
+    done
+done
+
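Note: outside the runner scripts, a single run of the new entrypoint might look like the sketch below. The flags mirror the ExperimentArguments fields (with underscores_to_dashes=True); all parameter values here are illustrative, not taken from the patch.

    # run one supervised configuration and print the averaged metrics to stdout
    python lkmeans/examples/main.py \
        --num-clusters 2 \
        --minkowski-parameter 2 \
        --t-parameter 0.2 \
        --n-points 500 \
        --dimension 20 \
        --clustering-algorithm soft_ss_lkmeans \
        --supervision-ratio 0.1 \
        --repeats 10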