Commit

updating site on Thu Jul 11 16:13:55 CDT 2024
hongtaoh committed Jul 11, 2024
1 parent c4b6ad5 commit 2099d14
Showing 15 changed files with 922 additions and 37 deletions.
314 changes: 310 additions & 4 deletions content/cn/blog/2024-07-05-k-means.md
@@ -1,7 +1,7 @@
---

title: "K-means Algorithm Explained"
-date: 2024-07-06
+date: 2024-07-05
author: 郝鸿涛
slug: k-means
draft: false
@@ -187,7 +187,7 @@ def kmeans_clustering(data, centroids):
    # for each data point
    for d in data:
        # calculate distance from each centroid to this data point
-        distances = [np.linalg.norm(d - centroids[c]) for c in range(k)]
+        distances = np.linalg.norm(d - centroids, axis = 1)
        # the index of centroid that is the closest to this data point
        min_distance_idx = np.argmin(distances)
        # add this data point to the associated cluster
@@ -273,6 +273,24 @@ while np.max(np.linalg.norm(updated_centroids - centroids, axis=1)) > threshold:





![png](/cn/blog/2024-07-05-k-means_files/2024-07-05-k-means_14_4.png)

![png](/cn/blog/2024-07-05-k-means_files/2024-07-05-k-means_14_5.png)

![png](/cn/blog/2024-07-05-k-means_files/2024-07-05-k-means_14_6.png)



## Making an Animation

With the code above, we can plot every step; now let's try turning it into an animation.
@@ -333,10 +351,298 @@ def update(frame):
anim = FuncAnimation(fig, update, frames=np.arange(20), interval=1000, repeat=False)

writer = FFMpegWriter(fps=1) # Adjust fps as needed
anim.save("/cn/blog/2024-07-05-k-means_files/kmeans_clustering.mp4", writer=writer)
anim.save("img/kmeans_clustering.mp4", writer=writer)

# Show the animation
# plt.show()
```

{{< video "/cn/blog/2024-07-05-k-means_files/kmeans_clustering.mp4" >}}


![png](/cn/blog/2024-07-05-k-means_files/2024-07-05-k-means_16_0.png)



The animation video above is available [here](/cn/blog/2024-07-05-kmeans_files/kmeans_clustering.mp4).
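
Since the diff above only shows part of the animation code, here is a minimal, self-contained sketch (not the post's exact code) of how the k-means iterations might be animated with `FuncAnimation`; the synthetic data and the names `history`, `fig`, `ax`, and `update` are assumptions for illustration.

```python
# A hedged sketch: animate k-means by recording the centroids at every iteration
# and drawing one iteration per frame. Synthetic data; the empty-cluster edge
# case is ignored for brevity.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

rng = np.random.default_rng(0)
data = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(5, 1, (50, 2))])

# Run k-means first, saving the centroids after each iteration.
centroids = data[rng.choice(len(data), 2, replace=False)]
history = [centroids]
for _ in range(10):
    labels = np.argmin(np.linalg.norm(data[:, None] - centroids, axis=2), axis=1)
    centroids = np.array([data[labels == i].mean(axis=0) for i in range(2)])
    history.append(centroids)

fig, ax = plt.subplots()

def update(frame):
    ax.clear()
    c = history[frame]
    labels = np.argmin(np.linalg.norm(data[:, None] - c, axis=2), axis=1)
    ax.scatter(data[:, 0], data[:, 1], c=labels, cmap="coolwarm", s=15)
    ax.scatter(c[:, 0], c[:, 1], c="black", marker="x", s=100)
    ax.set_title(f"Iteration {frame}")

anim = FuncAnimation(fig, update, frames=len(history), interval=1000, repeat=False)
# anim.save("kmeans_clustering.mp4", writer="ffmpeg", fps=1)  # requires ffmpeg
```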

## Evaluating the Results

Our focus now is not on visualization; we mainly want to check whether our code is correct.


```python
def get_final_centroids_and_clusters(data, n_clusters, threshold, seed=None):
    """
    Perform K-means clustering on the provided data.
    Parameters:
        data (numpy.ndarray): Data points to cluster.
        n_clusters (int): Number of clusters.
        threshold (float): Convergence threshold.
        seed (int, optional): Random seed for reproducibility.
    Returns:
        updated_centroids (numpy.ndarray): Final centroids after clustering.
        clusters (list): Data points classified into clusters.
    """
    if seed is not None:
        np.random.seed(seed)

    data_size = data.shape[0]
    # Initialize centroids by randomly sampling from the data
    initial_centroids = data[np.random.choice(data_size, n_clusters, replace=False)]
    updated_centroids, clusters = kmeans_clustering(data, initial_centroids)
    centroids = initial_centroids

    while np.max(np.linalg.norm(updated_centroids - centroids, axis=1)) > threshold:
        centroids = updated_centroids
        updated_centroids, clusters = kmeans_clustering(data, centroids)

    return updated_centroids, clusters

def kmeans_clustering(data, centroids):
    """
    Perform one iteration of K-means clustering.
    Parameters:
        data (numpy.ndarray): Data points to cluster.
        centroids (numpy.ndarray): Current centroids.
    Returns:
        updated_centroids (numpy.ndarray): Updated centroids.
        clusters (list): Data points classified into clusters.
    """
    k = len(centroids)
    clusters = [[] for _ in range(k)]

    for d in data:
        # Calculate distances from the data point to each centroid
        distances = np.linalg.norm(d - centroids, axis=1)
        # Find the index of the closest centroid
        min_distance_idx = np.argmin(distances)
        # Assign the data point to the closest centroid's cluster
        clusters[min_distance_idx].append(d)

    updated_centroids = []
    for cluster in clusters:
        if cluster:
            updated_centroids.append(np.mean(cluster, axis=0))
        else:
            # Handle empty cluster by reinitializing the centroid randomly from the data
            updated_centroids.append(data[np.random.choice(data.shape[0])])

    return np.array(updated_centroids), clusters
```
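
The commit diff near the top of this page replaces a per-centroid list comprehension with a single vectorized `np.linalg.norm` call. As a small illustration (added here for clarity; the numbers are made up), both forms give the same distances for one point against a `(k, 2)` array of centroids:

```python
import numpy as np

d = np.array([160.0, 60.0])           # a made-up data point
centroids = np.array([[150.0, 59.0],  # made-up centroids, k = 2
                      [171.0, 80.0]])

loop_version = [np.linalg.norm(d - centroids[c]) for c in range(len(centroids))]
vectorized = np.linalg.norm(d - centroids, axis=1)
print(np.allclose(loop_version, vectorized))  # True
```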


```python
height_weight_df = pd.read_csv("data/height_weight_data.csv")
height_weight_data = height_weight_df[['Height', 'Weight']].values
final_centroids, clusters = get_final_centroids_and_clusters(
    height_weight_data, n_clusters=2, threshold=1e-3, seed=0
)
final_centroids
```




array([[150.04743075,  59.32777344],
       [171.59570541,  80.42337816]])




```python
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(height_weight_data)
kmeans.cluster_centers_
```




array([[150.04743075,  59.32777344],
       [171.59570541,  80.42337816]])



We can see that our result matches `sklearn`'s, which suggests our implementation is fine. But one feature is still missing: prediction. For example, given an arbitrary data point such as `[130, 66]`, does it belong to cluster 1 or cluster 2?

Let's implement that feature next.


```python
def predict(data, final_centroids):
    """
    Predict the closest cluster each data point in the data belongs to.
    Parameters:
        data (numpy.ndarray): Data points to predict.
        final_centroids (numpy.ndarray): Final centroids from clustering.
    Returns:
        labels (numpy.ndarray): Cluster labels for each data point.
    """
    labels = []

    for d in data:
        # Calculate distances from the data point to each centroid
        distances = np.linalg.norm(d - final_centroids, axis=1)
        # Find the index of the closest centroid
        min_distance_idx = np.argmin(distances)
        # That index is the label
        labels.append(min_distance_idx)
    return np.array(labels)
```


```python
predict([[130, 66], [144, 55]], final_centroids), kmeans.predict([[130, 66], [144, 55]])
```




(array([0, 0]), array([0, 0], dtype=int32))



We can see that the function we wrote produces the same result as `sklearn`.
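
As a quick numeric check (added here, not in the original post), we can plug `[130, 66]` into the centroids printed above and see that it is indeed closer to the first one:

```python
import numpy as np

# Centroids copied from the output above.
final_centroids = np.array([[150.04743075, 59.32777344],
                            [171.59570541, 80.42337816]])
point = np.array([130, 66])

distances = np.linalg.norm(point - final_centroids, axis=1)
print(distances)             # roughly [21.1, 44.0]
print(np.argmin(distances))  # 0, i.e. cluster 0, matching predict() above
```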

## Going Further

Next, we will write a Python class instead of three separate `def`s.


```python
class KMeans:
    def __init__(self, n_clusters, threshold, seed=None):
        """
        Initialize the KMeans class with the number of clusters, convergence threshold, and optional random seed.
        Parameters:
            n_clusters (int): Number of clusters.
            threshold (float): Convergence threshold.
            seed (int, optional): Random seed for reproducibility.
        """
        self.n_clusters = n_clusters
        self.threshold = threshold
        self.seed = seed
        self.centroids = None

    def fit(self, data):
        """
        Fit the K-means algorithm to the data.
        Parameters:
            data (numpy.ndarray): Data points to cluster.
        Returns:
            self (KMeans): Fitted KMeans instance.
        """
        if self.seed is not None:
            np.random.seed(self.seed)
        data_size = data.shape[0]
        # Initialize centroids by randomly sampling from the data
        initial_centroids = data[np.random.choice(data_size, self.n_clusters, replace=False)]
        updated_centroids, clusters = self._kmeans_clustering(data, initial_centroids)
        centroids = initial_centroids

        while np.max(np.linalg.norm(updated_centroids - centroids, axis=1)) > self.threshold:
            centroids = updated_centroids
            updated_centroids, clusters = self._kmeans_clustering(data, centroids)

        self.centroids = updated_centroids
        self.clusters = clusters

        return self

    def _kmeans_clustering(self, data, centroids):
        """
        Perform one iteration of K-means clustering.
        Parameters:
            data (numpy.ndarray): Data points to cluster.
            centroids (numpy.ndarray): Current centroids.
        Returns:
            updated_centroids (numpy.ndarray): Updated centroids.
            clusters (list): Data points classified into clusters.
        """
        k = len(centroids)
        clusters = [[] for _ in range(k)]

        for d in data:
            # Calculate distances from the data point to each centroid
            distances = np.linalg.norm(d - centroids, axis=1)
            # Find the index of the closest centroid
            min_distance_idx = np.argmin(distances)
            # Assign the data point to the closest centroid's cluster
            clusters[min_distance_idx].append(d)

        updated_centroids = []
        for cluster in clusters:
            if cluster:
                updated_centroids.append(np.mean(cluster, axis=0))
            else:
                # Handle empty cluster by reinitializing the centroid randomly from the data
                updated_centroids.append(data[np.random.choice(data.shape[0])])

        return np.array(updated_centroids), clusters

    def predict(self, data):
        """
        Predict the closest cluster each data point in the data belongs to.
        Parameters:
            data (numpy.ndarray): Data points to predict.
        Returns:
            labels (numpy.ndarray): Cluster labels for each data point.
        """
        labels = []

        for d in data:
            # Calculate distances from the data point to each centroid
            distances = np.linalg.norm(d - self.centroids, axis=1)
            # Find the index of the closest centroid
            min_distance_idx = np.argmin(distances)
            # That index is the label
            labels.append(min_distance_idx)
        return np.array(labels)

# Example usage:
# kmeans = KMeans(n_clusters=3, threshold=0.001, seed=42)
# kmeans.fit(data)
# labels = kmeans.predict(data)
```


```python
kmeans = KMeans(n_clusters=2, threshold=0.001, seed=0)
kmeans_result = kmeans.fit(height_weight_data)
kmeans_result.centroids
```




array([[150.04743075,  59.32777344],
       [171.59570541,  80.42337816]])




```python
kmeans.predict([[130, 66], [144, 55]])
```




array([0, 0])
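
As a final sanity check (added here, not part of the original post), the class-based version can be compared against `sklearn` the same way the function version was earlier; this assumes `height_weight_data` is still in scope, and uses the alias `SKKMeans` so sklearn's estimator does not clash with our own `KMeans` class:

```python
from sklearn.cluster import KMeans as SKKMeans

sk = SKKMeans(n_clusters=2, random_state=0, n_init="auto").fit(height_weight_data)
print(sk.cluster_centers_)                 # should match kmeans_result.centroids
print(sk.predict([[130, 66], [144, 55]]))  # should match our array([0, 0])
```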


406 changes: 373 additions & 33 deletions notebooks/2024-07-05-k-means.ipynb

Large diffs are not rendered by default.
