From 050719c3b5abfc7356e04b13a66fcfcde5e30a1e Mon Sep 17 00:00:00 2001
From: Trutnev Aleksei
Date: Sat, 25 May 2024 22:37:25 +0300
Subject: [PATCH] init

---
 .gitignore                                    |   1 +
 lkmeans/examples/data/experiment_data.py      |   7 +-
 lkmeans/examples/experiment.py                | 138 ------------------
 lkmeans/examples/main.py                      | 136 +++++++++++------
 lkmeans/examples/metric_meter.py              | 113 --------------
 lkmeans/examples/scripts/runner_supervised.sh |  52 +++++++
 .../examples/scripts/runner_unsupervised.sh   |  45 ++++++
 7 files changed, 188 insertions(+), 304 deletions(-)
 delete mode 100644 lkmeans/examples/experiment.py
 delete mode 100644 lkmeans/examples/metric_meter.py
 create mode 100644 lkmeans/examples/scripts/runner_supervised.sh
 create mode 100644 lkmeans/examples/scripts/runner_unsupervised.sh

diff --git a/.gitignore b/.gitignore
index aaee16c..46aa550 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ experiments/
 *__pycache__
 .DS_Store
 .vscode
+logs
\ No newline at end of file
diff --git a/lkmeans/examples/data/experiment_data.py b/lkmeans/examples/data/experiment_data.py
index e9b759d..d9cc562 100644
--- a/lkmeans/examples/data/experiment_data.py
+++ b/lkmeans/examples/data/experiment_data.py
@@ -11,21 +11,16 @@ def get_experiment_data(num_clusters: int, dimension: int) -> tuple[int, float,
     Function for generation the synthetic data for experiments by dimension and number of clusters
     '''
 
-    n_clusters: int = 0
     prob: float = 0.
     mu_prefix: list[list[float | int]] = [[]]
     sigma_list: list[float | int] = []
 
     if num_clusters == 2:
-        print('Experiment with 2 clusters')
-        n_clusters = 2
         sigma_list = [1, 1]
         prob = 0.5
         mu_prefix = [[-4, 0], [4, 0]]
     elif num_clusters == 3:
-        print('Experiment with 3 clusters')
-        n_clusters = 3
         sigma_list = [1, 1, 1]
         prob = 1/3
         mu_prefix = [[4, 0, 0], [0, 4, 0], [0, 0, 4]]
@@ -35,4 +30,4 @@ def get_experiment_data(num_clusters: int, dimension: int) -> tuple[int, float,
     mu_list = [np.array([x + [0] * (dimension - len(x))]) for x in mu_prefix]
     cov_matrix = [get_covariance_matrix(sigma, dimension) for sigma in sigma_list]
 
-    return n_clusters, prob, mu_list, cov_matrix
+    return num_clusters, prob, mu_list, cov_matrix
diff --git a/lkmeans/examples/experiment.py b/lkmeans/examples/experiment.py
deleted file mode 100644
index 2dda889..0000000
--- a/lkmeans/examples/experiment.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import time
-from pathlib import Path
-
-from matplotlib import pyplot as plt
-from numpy.typing import NDArray
-from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score
-
-from lkmeans import LKMeans
-from lkmeans.examples.data.points_generator import generate_mix_distribution
-from lkmeans.examples.decomposition import get_tsne_clusters
-from lkmeans.examples.metric_meter import GraphicMeter, MetricMeter, MetricTable, insert_hline
-
-
-# pylint: disable=too-many-arguments, too-many-locals
-def repeat_iteration(
-    repeats: int,
-    n_clusters: int,
-    n_points: int,
-    prob: float,
-    cov_matrices: list[NDArray],
-    t: float,
-    mu_list: list[NDArray],
-    p: float | int,
-    makes_plot: bool,
-    output_path: Path
-
-):
-    repeat_metric_meter = MetricMeter()
-    for _ in range(repeats):
-
-        clusters, labels, centroids = generate_mix_distribution(
-            probability=prob,
-            mu_list=mu_list,
-            cov_matrices=cov_matrices,
-            n_samples=n_points,
-            t=t
-        )
-
-        experiment_time = time.perf_counter()
-        lkmeans = LKMeans(n_clusters=n_clusters, p=p)
-        generated_labels = lkmeans.fit_predict(clusters)
-        experiment_time = time.perf_counter() - experiment_time
-
-        repeat_metric_meter.add_combination(
-            ari=adjusted_rand_score(labels, generated_labels),
-            ami=float(adjusted_mutual_info_score(labels, generated_labels)),
-            inertia=lkmeans.inertia_,
-            time=experiment_time
-        )
-        if makes_plot:
-            figure_name = f'factor_{t:.1f}'.replace('.', '_')
-            fig = get_tsne_clusters(clusters, labels, centroids)
-            fig.savefig(
-                str(output_path / f'{figure_name}.png'), dpi=300, bbox_inches='tight')
-            plt.close(fig)
-    average_ari, average_ami, average_inertia, average_time = repeat_metric_meter.get_average()
-    return average_ari, average_ami, average_inertia, average_time
-
-
-# pylint: disable= too-many-arguments, too-many-locals
-def run_experiment(
-    n_clusters: int,
-    distance_parameters: list[float],
-    minkowski_parameters: list[float | int],
-    repeats: int,
-    n_points: int,
-    cov_matrices: list[NDArray],
-    prob: float,
-    mu_list: list[NDArray],
-    experiment_name: str,
-    output_path: Path,
-    makes_plot: bool = False) -> None:
-    '''Function for evaluation experiment'''
-
-    output_path.mkdir(exist_ok=True, parents=True)
-
-    table = MetricTable()
-
-    generator = [GraphicMeter(distance_parameters, 't')
-                 for _ in minkowski_parameters]
-    graphic_t_metrics_dict = dict(zip(minkowski_parameters, generator))
-
-    for t in distance_parameters:
-
-        graphic_p_metrics = GraphicMeter(minkowski_parameters, 'p')
-        for p in minkowski_parameters:
-
-            average_ari, average_ami, average_inertia, average_time = repeat_iteration(
-                repeats, n_clusters, n_points, prob,
-                cov_matrices, t, mu_list, p, makes_plot, output_path
-            )
-
-            table.add_to_frame(
-                ari=average_ari,
-                ami=average_ami,
-                inertia=average_inertia,
-                time=average_time,
-                name=f'{experiment_name}, T:{t:.1f}, P:{p}'
-            )
-
-            graphic_p_metrics.add_combination(
-                ari=average_ari,
-                ami=average_ami,
-                inertia=average_inertia,
-                time=average_time
-            )
-
-            graphic_t_metrics_dict[p].add_combination(
-                ari=average_ari,
-                ami=average_ami,
-                inertia=average_inertia,
-                time=average_time
-            )
-
-        for metric_graph in ['ARI', 'AMI', 'Inertia', 'Time']:
-            figure_name = f'factor_{t:.1f}_{metric_graph}'.replace('.', '_')
-            fig = graphic_p_metrics.get_graph(metric_graph)
-            fig.savefig(
-                str(output_path / f'{figure_name}.png'), dpi=300, bbox_inches='tight')
-            plt.close(fig)
-
-    print(table.get_table())
-
-    for p, graph_t_meter in graphic_t_metrics_dict.items():
-        for metric in ['ARI', 'AMI', 'Inertia', 'Time']:
-            figure_name = f'{metric}_by_t_with_p_{p}'.replace('.', '_')
-            fig = graph_t_meter.get_graph(metric)
-            fig.savefig(
-                str(output_path / f'{figure_name}.png'), dpi=300, bbox_inches='tight')
-            plt.close(fig)
-
-    table_name = 'experiment 1'
-    table = table.get_latex_table(caption='Experiment 1')
-    table = insert_hline(table)
-
-    latex_logs = output_path / f'{table_name.replace(" ", "_")}.tex'
-    with latex_logs.open('w') as f:
-        f.write(table)
diff --git a/lkmeans/examples/main.py b/lkmeans/examples/main.py
index 9432eba..d175adc 100644
--- a/lkmeans/examples/main.py
+++ b/lkmeans/examples/main.py
@@ -1,54 +1,96 @@
-from argparse import ArgumentParser
-from pathlib import Path
+import time
+from collections import defaultdict
+from enum import Enum
+from typing import Dict
+
+import numpy as np
+from numpy.typing import NDArray
+from sklearn.metrics import accuracy_score, adjusted_mutual_info_score, adjusted_rand_score
+from tap import Tap
+
+from lkmeans.clustering import HardSSLKMeans, LKMeans, SoftSSLKMeans
+from lkmeans.clustering.base import Clustering
+from lkmeans.clustering.supervised.utils import select_supervisor_targets
 from lkmeans.examples.data.experiment_data import get_experiment_data
-from lkmeans.examples.experiment import run_experiment
-
-parser = ArgumentParser()
-
-parser.add_argument(
-    '--path',
-    type=Path,
-    default=Path('experiments'),
-    help='Path to save results'
-)
-
-parser.add_argument(
-    '--num-clusters',
-    type=int,
-    default=2
-)
-
-
-def main():
-    args = parser.parse_args()
-    experiments_path = args.path
-
-    minkowski_parameter = [0.2, 0.6, 1, 1.5, 2, 3, 5]
-    T_parameter = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
-    repeats = 100
-    n_points = [100, 500, 1000]
-
-    dimension = 20
-    n_clusters, prob, mu_list, cov_matrices = get_experiment_data(
-        num_clusters=args.num_clusters, dimension=dimension)
-
-    for points in n_points:
-        experiment_name = f'Clusters:{n_clusters}, points:{points}'
-        output_path = experiments_path / f'exp_{args.num_clusters}_points_{points}'
-
-        run_experiment(
-            n_clusters=n_clusters,
-            distance_parameters=T_parameter,
-            minkowski_parameters=minkowski_parameter,
-            repeats=repeats,
-            n_points=points,
-            cov_matrices=cov_matrices,
-            prob=prob,
+from lkmeans.examples.data.points_generator import generate_mix_distribution
+
+
+class ClusteringAlgorithmType(Enum):
+    LKMEANS = 'lkmeans'
+    SOFT_SS_LKMEANS = 'soft_ss_lkmeans'
+    HARD_SS_LKMEANS = 'hard_ss_lkmeans'
+
+
+class ExperimentArguments(Tap):
+    minkowski_parameter: float
+    t_parameter: float
+    n_points: int
+    clustering_algorithm: ClusteringAlgorithmType = ClusteringAlgorithmType.LKMEANS
+
+    num_clusters: int = 2
+    dimension: int = 20
+    repeats: int = 10
+    supervision_ratio: float = 0
+
+
+def get_clustering_algorithm(clustering_type: ClusteringAlgorithmType) -> type[Clustering]:
+    clustering_map: Dict[ClusteringAlgorithmType, type[Clustering]] = {
+        ClusteringAlgorithmType.LKMEANS: LKMeans,
+        ClusteringAlgorithmType.SOFT_SS_LKMEANS: SoftSSLKMeans,
+        ClusteringAlgorithmType.HARD_SS_LKMEANS: HardSSLKMeans
+    }
+    return clustering_map[clustering_type]
+
+
+def calculate_metrics(labels: NDArray, generated_labels: NDArray) -> Dict[str, float]:
+    return {
+        'ari': float(adjusted_rand_score(labels, generated_labels)),
+        'ami': float(adjusted_mutual_info_score(labels, generated_labels)),
+        'accuracy': float(accuracy_score(labels, generated_labels)),
+    }
+
+
+def main() -> None:
+    args = ExperimentArguments(underscores_to_dashes=True).parse_args()
+
+    _, prob, mu_list, cov_matrices = get_experiment_data(args.num_clusters, args.dimension)
+
+    clustering = get_clustering_algorithm(args.clustering_algorithm)
+
+    average_result = defaultdict(list)
+
+    for _ in range(args.repeats):
+
+        clusters, labels, _ = generate_mix_distribution(
+            probability=prob,
             mu_list=mu_list,
-            experiment_name=experiment_name,
-            output_path=output_path
+            cov_matrices=cov_matrices,
+            n_samples=args.n_points,
+            t=args.t_parameter
+        )
+
+        lkmeans = clustering(n_clusters=args.num_clusters, p=args.minkowski_parameter)
+
+        if args.clustering_algorithm is ClusteringAlgorithmType.LKMEANS:
+
+            experiment_time = time.perf_counter()
+            generated_labels = lkmeans.fit_predict(clusters)
+        else:
+            experiment_time = time.perf_counter()
+            supervisor_targets = select_supervisor_targets(labels, args.supervision_ratio)
+            generated_labels = lkmeans.fit_predict(clusters, supervisor_targets)
+        experiment_time = time.perf_counter() - experiment_time
+
+        metrics_dict = calculate_metrics(
+            labels=labels,
+            generated_labels=generated_labels,
         )
+        result = {**metrics_dict, 'time': experiment_time, 'inertia': lkmeans.inertia_}
+        for key, value in result.items():
+            average_result[key].append(value)
+    for key, values in average_result.items():
+        average_result[key] = float(np.mean(values))
+    print(dict(average_result))
 
 
 if __name__ == '__main__':
diff --git a/lkmeans/examples/metric_meter.py b/lkmeans/examples/metric_meter.py
deleted file mode 100644
index dd11f70..0000000
--- a/lkmeans/examples/metric_meter.py
+++ /dev/null
@@ -1,113 +0,0 @@
-from typing import Optional
-
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-
-
-class MetricTable:
-    def __init__(self) -> None:
-        self.frames = []
-
-    def add_frame(self, frame: pd.DataFrame) -> None:
-        self.frames.append(frame)
-
-    def add_to_frame(self,
-                     ari: float,
-                     ami: float,
-                     inertia: float,
-                     time: float,
-                     name: Optional[str] = 'Experiment'
-                     ) -> None:
-        data = {'ARI': f'{ari:.2f}', 'AMI': f'{ami:.2f}',
-                'Inertia': f'{inertia:.2f}', 'Time': f'{time:.2f}'}
-        frame = pd.DataFrame(data, [name])
-        self.frames.append(frame)
-
-    def add_empty_frame(self, time: bool) -> None:
-        empty = 'N/A'
-        data = {'ARI': empty, 'AMI': empty}
-        if time:
-            data['Time'] = empty
-        frame = pd.DataFrame(data, [empty])
-        self.frames.append(frame)
-
-    def get_table(self) -> pd.DataFrame:
-        return pd.concat(self.frames, join="inner")
-
-    def get_latex_table(self, caption: str = '') -> str:
-        table = self.get_table()
-        return table.style.to_latex(caption=caption)
-
-
-def insert_hline(latex_str: str) -> str:
-    lines_strings = latex_str.splitlines()
-    result = []
-
-    for line in lines_strings:
-        if 'N/A' in line:
-            result.append('\\midrule')
-        else:
-            result.append(line)
-    result = '\n'.join(result)
-    return result
-
-
-class MetricMeter:
-    def __init__(self) -> None:
-        self.ari = []
-        self.ami = []
-        self.inertia = []
-        self.time = []
-
-    def add_ari(self, value: float) -> None:
-        self.ari.append(value)
-
-    def add_ami(self, value: float) -> None:
-        self.ami.append(value)
-
-    def add_inertia(self, value: float) -> None:
-        self.inertia.append(value)
-
-    def add_time(self, value: float) -> None:
-        self.time.append(value)
-
-    def add_combination(self, ari: float, ami: float, inertia: float, time: float) -> None:
-        self.add_ari(abs(ari))
-        self.add_ami(abs(ami))
-        self.add_inertia(inertia)
-        self.add_time(time)
-
-    def get_average(self) -> tuple[float, float, float, float]:
-        return float(np.mean(self.ari)), float(np.mean(self.ami)), \
-            float(np.mean(self.inertia)), float(np.mean(self.time))
-
-
-class GraphicMeter(MetricMeter):
-    def __init__(self, base: list, base_name: str) -> None:
-        super().__init__()
-        self.base = base
-        self.base_name = base_name
-
-    def get_graph(self, key: str):
-        values = {'ARI': self.ari, 'AMI': self.ami,
-                  'Inertia': self.inertia, 'Time': self.time}
-
-        fig, ax = plt.subplots(figsize=(5, 4))
-        param = values[key]
-        ax.plot(self.base, param, '-o')
-        ax.grid(True, color='gray', linestyle='--', linewidth=0.5)
-
-        if self.base_name == 'p':
-            ax.set_xticks(self.base)
-        else:
-            ax.set_xticks(np.linspace(0, 1, 11))
-        ax.set_xlabel(self.base_name)
-
-        if key in ('ARI', 'AMI'):
-            ax.set_yticks(np.arange(0, 1.1, 0.1))
-        else:
-            ax.set_yticks(np.linspace(np.min(param), np.max(param), 10))
-        ax.set_ylabel(key)
-        # ax.set_title(f'{key} vs. {self.base_name}')
-        return fig
diff --git a/lkmeans/examples/scripts/runner_supervised.sh b/lkmeans/examples/scripts/runner_supervised.sh
new file mode 100644
index 0000000..4ab156f
--- /dev/null
+++ b/lkmeans/examples/scripts/runner_supervised.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+LOGDIR=logs
+set -ex
+
+mkdir -p ${LOGDIR}
+
+# VALUES
+MINKOWSKI_VALUES=(0.5 1 2 5)
+T_VALUES=(0 0.2 0.4 0.6 0.8)
+N_POINTS_VALUES=(100 500 1000)
+CLUSTERINGS_VALUES=(soft_ss_lkmeans hard_ss_lkmeans)
+SUPERVISION_RATIO_VALUES=(0.1 0.15 0.2)
+DIMENSION_VALUES=(20)
+NUM_CLUSTERS_VALUES=(2 3)
+
+# Constants
+REPEATS=10
+
+source .env
+
+for NUM_CLUSTERS in "${NUM_CLUSTERS_VALUES[@]}";do
+    for MINKOWSKI in "${MINKOWSKI_VALUES[@]}";do
+        for T in "${T_VALUES[@]}";do
+            for N_POINTS in "${N_POINTS_VALUES[@]}";do
+                for DIMENSION in "${DIMENSION_VALUES[@]}";do
+                    for CLUSTERING in "${CLUSTERINGS_VALUES[@]}";do
+                        for SUPERVISION_RATIO in "${SUPERVISION_RATIO_VALUES[@]}";do
+
+
+NAME="${CLUSTERING}_|_supervision_${SUPERVISION_RATIO}_|_num-clusters_${NUM_CLUSTERS}_|_minkowski_${MINKOWSKI}_|_t_${T}_|_n-points_${N_POINTS}_|_dimension_${DIMENSION}_|_repeats_${REPEATS}.log"
+
+    echo ${NAME}
+    PARAMETERS="
+    --num-clusters ${NUM_CLUSTERS} \
+    --minkowski-parameter ${MINKOWSKI} \
+    --t-parameter ${T} \
+    --n-points ${N_POINTS} \
+    --dimension ${DIMENSION} \
+    --clustering-algorithm ${CLUSTERING} \
+    --supervision-ratio ${SUPERVISION_RATIO} \
+    --repeats ${REPEATS} \
+    "
+    python lkmeans/examples/main.py ${PARAMETERS} &> ${LOGDIR}/${NAME}
+                        done
+                    done
+                done
+            done
+        done
+    done
+done
+
diff --git a/lkmeans/examples/scripts/runner_unsupervised.sh b/lkmeans/examples/scripts/runner_unsupervised.sh
new file mode 100644
index 0000000..139b201
--- /dev/null
+++ b/lkmeans/examples/scripts/runner_unsupervised.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+LOGDIR=logs
+set -ex
+
+mkdir -p ${LOGDIR}
+
+# VALUES
+MINKOWSKI_VALUES=(0.5 1 2 5)
+T_VALUES=(0 0.2 0.4 0.6 0.8)
+N_POINTS_VALUES=(100 500 1000)
+DIMENSION_VALUES=(20)
+NUM_CLUSTERS_VALUES=(2 3)
+
+# Constants
+CLUSTERING=lkmeans
+REPEATS=10
+
+source .env
+
+for NUM_CLUSTERS in "${NUM_CLUSTERS_VALUES[@]}";do
+    for MINKOWSKI in "${MINKOWSKI_VALUES[@]}";do
+        for T in "${T_VALUES[@]}";do
+            for N_POINTS in "${N_POINTS_VALUES[@]}";do
+                for DIMENSION in "${DIMENSION_VALUES[@]}";do
+
+NAME="${CLUSTERING}_|_num-clusters_${NUM_CLUSTERS}_|_minkowski_${MINKOWSKI}_|_t_${T}_|_n-points_${N_POINTS}_|_dimension_${DIMENSION}_|_repeats_${REPEATS}.log"
+
+    echo ${NAME}
+    PARAMETERS="
+    --num-clusters ${NUM_CLUSTERS} \
+    --minkowski-parameter ${MINKOWSKI} \
+    --t-parameter ${T} \
+    --n-points ${N_POINTS} \
+    --dimension ${DIMENSION} \
+    --clustering-algorithm ${CLUSTERING} \
+    --repeats ${REPEATS} \
+    "
+    python lkmeans/examples/main.py ${PARAMETERS} &> ${LOGDIR}/${NAME}
+                done
+            done
+        done
+    done
+done
+