cluster.py
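"""
Cluster CIFAR-10 test examples by the agreement pattern of five model
predictions (ViT, ConvNext, Swin, ViT16Lora, ResNet): build a pairwise
distance matrix on the GPU, cluster it with K-Medoids, and plot per-cluster
accuracy for each model and for human annotators (CIFAR-10H).
"""
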
import os
import cupy as cp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn_extra.cluster import KMedoids
from tqdm import tqdm


def load_vectors(fname="model_eval_results.csv"):
    """
    Loads prediction vectors from file. Returns a matrix of shape
    (10000, 5) - (num_examples, num_models).
    """
    if not os.path.exists(fname):
        raise FileNotFoundError(f"File {fname} does not exist")

    df = pd.read_csv(fname)
    # Extract the per-model label columns and convert to a NumPy array directly
    vectors = df[
        ["ViT Label", "ConvNext Label", "Swin Label", "ViT16Lora Label", "ResNet Label"]
    ].to_numpy(dtype=int)
    return vectors
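
# A minimal sketch of the CSV schema this script assumes, inferred from the
# columns referenced in this file ("True label" and "filename" are used later
# in plot_per_cluster_accuracy); the real model_eval_results.csv may carry
# additional columns:
#
#     filename, True label, ViT Label, ConvNext Label, Swin Label,
#     ViT16Lora Label, ResNet Label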


def multiclass_distance_matrix(
    vectors, metric="chebyshev", cache_file_prefix="dist_matrix"
):
    """
    Compute the pairwise multiclass distance matrix using GPU acceleration.
    Supports 'chebyshev' and 'hamming' metrics. Caches the result to disk
    to avoid recomputation.
    """
    # Determine the cache file name based on the metric
    cache_file = f"{cache_file_prefix}_{metric}.npy"

    # Check if the cached file exists
    if os.path.exists(cache_file):
        print(f"Loading distance matrix from {cache_file}")
        return np.load(cache_file)

    # Transfer vectors to GPU
    vectors_gpu = cp.array(vectors)

    # Define the metric functions. Labels are categorical, so both metrics
    # reduce to functions of the elementwise disagreement mask (v1 != v2):
    # 'chebyshev' is 1 if any model disagrees, 'hamming' is the fraction of
    # models that disagree.
    def chebyshev_metric_gpu(v1, v2):
        return cp.max(v1 != v2, axis=-1).astype(int)

    def hamming_metric_gpu(v1, v2):
        return cp.mean(v1 != v2, axis=-1).astype(float)

    # Select the appropriate metric function
    if metric == "chebyshev":
        metric_function = chebyshev_metric_gpu
    elif metric == "hamming":
        metric_function = hamming_metric_gpu
    else:
        raise ValueError("Unsupported metric. Choose 'chebyshev' or 'hamming'.")

    # Compute pairwise distances on GPU, one row at a time: row i holds the
    # distance from example i to every example (the matrix is symmetric)
    dist_matrix_gpu = cp.zeros(
        (vectors_gpu.shape[0], vectors_gpu.shape[0]), dtype=float
    )
    for i in tqdm(range(vectors_gpu.shape[0])):
        dist_matrix_gpu[i] = metric_function(vectors_gpu[i], vectors_gpu)

    # Transfer the result back to CPU
    dist_matrix = cp.asnumpy(dist_matrix_gpu)

    # Save the computed distance matrix to disk
    np.save(cache_file, dist_matrix)
    print(f"Distance matrix saved to {cache_file}")
    return dist_matrix
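
# A minimal fully-vectorized alternative to the row-by-row loop above, sketched
# in plain NumPy on the CPU (the names below are illustrative, not part of the
# pipeline). For 10000x5 int vectors the broadcast mask needs roughly
# 10000 * 10000 * 5 bytes (~500 MB), so this trades memory for the tqdm loop:
#
#     disagree = vectors[:, None, :] != vectors[None, :, :]  # (N, N, 5) bool
#     chebyshev = disagree.any(axis=-1).astype(float)        # 1 if any model differs
#     hamming = disagree.mean(axis=-1)                       # fraction that differ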


def cluster_with_kmedoids(dist_matrix, n_clusters=2):
    """
    Cluster the data using K-Medoids with the precomputed distance matrix.
    """
    print("Clustering with K-Medoids...")
    kmedoids = KMedoids(
        n_clusters=n_clusters, metric="precomputed", random_state=42, init="k-medoids++"
    )
    cluster_labels = kmedoids.fit_predict(dist_matrix)
    print(f"Cluster centers (medoid indices): {kmedoids.medoid_indices_}")
    # np.bincount over the labels gives the number of examples per cluster
    print(f"Cluster sizes: {np.bincount(cluster_labels)}")
    return cluster_labels, kmedoids
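
# A minimal sketch for scoring a clustering against the same precomputed
# distance matrix, e.g. to compare the n_clusters values swept in main().
# silhouette_score comes from scikit-learn; this helper is an illustration,
# not part of the original pipeline:
#
#     from sklearn.metrics import silhouette_score
#
#     def clustering_quality(dist_matrix, cluster_labels):
#         # Mean silhouette coefficient: higher is better, in [-1, 1]
#         return silhouette_score(dist_matrix, cluster_labels, metric="precomputed")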


def save_clustered_dataframe(
    *,
    vectors,  # accepted but unused; labels are joined onto the original CSV instead
    cluster_labels,
    original_file="model_eval_results.csv",
    output_file="model_eval_results_with_clusters.csv",
):
    """
    Save the original dataframe with an additional column for cluster labels.
    """
    # Load the original dataframe
    df = pd.read_csv(original_file)

    # Add the cluster labels to the dataframe
    df["Cluster Label"] = cluster_labels

    # Save the new dataframe with cluster labels
    df.to_csv(output_file, index=False)
    print(f"Dataframe with cluster labels saved to {output_file}")


def plot_per_cluster_accuracy(
    clustered_file="model_eval_results_with_clusters.csv",
    human_file="cifar10h-raw.csv",
    metric="chebyshev",
):
    """
    Plot the accuracy of each model and of humans in each cluster,
    together with the size of each cluster.
    """
    # Ensure the output directory exists
    output_dir = "clustering_results"
    os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv(clustered_file)
    human_df = pd.read_csv(human_file)
    cluster_indices = sorted(df["Cluster Label"].unique())

    # accuracies[cluster_index][model_name] = accuracy
    accuracies = {}
    cluster_sizes = {}
    for cluster in cluster_indices:
        accuracies[cluster] = {}
        cluster_df = df[df["Cluster Label"] == cluster]
        cluster_sizes[cluster] = len(cluster_df)

        # Select rows from human_df whose image_filename matches a filename
        # in this cluster
        matching_human_df = human_df[
            human_df["image_filename"].isin(cluster_df["filename"])
        ]

        for model in ["ViT", "ConvNext", "Swin", "ViT16Lora", "ResNet"]:
            model_df = cluster_df[f"{model} Label"]
            accuracy = (model_df == cluster_df["True label"]).mean()
            accuracies[cluster][model] = accuracy

        human_accuracy = matching_human_df["correct_guess"].mean()
        accuracies[cluster]["Human"] = human_accuracy

    # Create a figure with two vertically stacked subplots
    fig, axes = plt.subplots(2, 1, figsize=(10, 12))

    # Plot the accuracies, jittering the points within each cluster so the
    # models do not overlap
    models = ["ViT", "ConvNext", "Swin", "ViT16Lora", "ResNet", "Human"]
    cluster_labels = [f"Cluster {cluster}" for cluster in cluster_indices]
    offset = 0.1  # Horizontal offset between models within each cluster
    for i, model in enumerate(models):
        cluster_values = [accuracies[cluster][model] for cluster in cluster_indices]
        x_positions = [
            cluster + (i - len(models) / 2) * offset for cluster in cluster_indices
        ]
        axes[0].scatter(x_positions, cluster_values, label=model)
    axes[0].set_xlabel("Cluster")
    axes[0].set_ylabel("Accuracy")
    axes[0].set_title(
        f"Accuracy of each model in each cluster, num clusters = {len(cluster_indices)}, metric = {metric}"
    )
    axes[0].set_xticks(ticks=range(len(cluster_indices)))
    axes[0].set_xticklabels(cluster_labels, rotation=45)
    axes[0].legend()
    axes[0].grid(True)

    # Plot the cluster sizes
    cluster_sizes_values = [cluster_sizes[cluster] for cluster in cluster_indices]
    axes[1].bar(cluster_labels, cluster_sizes_values, color="skyblue")
    axes[1].set_xlabel("Cluster")
    axes[1].set_ylabel("Size")
    axes[1].set_title("Size of each cluster")
    # Fix the tick positions before relabeling to avoid a matplotlib warning
    axes[1].set_xticks(range(len(cluster_labels)))
    axes[1].set_xticklabels(cluster_labels, rotation=45)
    axes[1].grid(True)

    plt.tight_layout()
    plt.savefig(
        os.path.join(
            output_dir,
            f"per_cluster_accuracy_and_size_num_clusters_{len(cluster_indices)}_metric_{metric}.png",
        )
    )
    plt.close()


def generate_result(n_clusters: int, metric: str):
    # Load vectors and compute (or load the cached) distance matrix
    vectors = load_vectors()
    print("Vectors loaded")
    dist_matrix = multiclass_distance_matrix(vectors, metric=metric)
    print("Distance matrix loaded")

    # Cluster the data using K-Medoids
    cluster_labels, kmedoids = cluster_with_kmedoids(dist_matrix, n_clusters=n_clusters)
    print("Clustering completed")
    print(f"Number of examples in each cluster: {np.bincount(cluster_labels)}")

    # Save the clustered dataframe
    save_clustered_dataframe(
        vectors=vectors,
        cluster_labels=cluster_labels,
    )

    # Plot the accuracy of each model in each cluster
    plot_per_cluster_accuracy(metric=metric)


def main():
    # Sweep both metrics and a range of cluster counts
    for metric in ["chebyshev", "hamming"]:
        for n_clusters in range(2, 21):
            generate_result(n_clusters, metric)


if __name__ == "__main__":
    main()