Commit

updating site on Thu Jul 11 16:13:55 CDT 2024
hongtaoh committed Jul 11, 2024
1 parent c4b6ad5 commit 2099d14
Showing 15 changed files with 922 additions and 37 deletions.
314 changes: 310 additions & 4 deletions content/cn/blog/2024-07-05-k-means.md
@@ -1,7 +1,7 @@
---

title: "K-means Algorithm Explained"
-date: 2024-07-06
+date: 2024-07-05
author: 郝鸿涛
slug: k-means
draft: false
@@ -187,7 +187,7 @@ def kmeans_clustering(data, centroids):
    # for each data point
    for d in data:
        # calculate distance from each centroid to this data point
-        distances = [np.linalg.norm(d - centroids[c]) for c in range(k)]
+        distances = np.linalg.norm(d - centroids, axis = 1)
        # the index of centroid that is the closest to this data point
        min_distance_idx = np.argmin(distances)
        # add this data point to the associated cluster
@@ -273,6 +273,24 @@ while np.max(np.linalg.norm(updated_centroids - centroids, axis=1)) > threshold:





![png](/cn/blog/2024-07-05-k-means_files/2024-07-05-k-means_14_4.png)

![png](/cn/blog/2024-07-05-k-means_files/2024-07-05-k-means_14_5.png)

![png](/cn/blog/2024-07-05-k-means_files/2024-07-05-k-means_14_6.png)



## Making an Animation

With the code above, we can plot every step; now let's try turning it into an animation.
@@ -333,10 +351,298 @@ def update(frame):
anim = FuncAnimation(fig, update, frames=np.arange(20), interval=1000, repeat=False)

writer = FFMpegWriter(fps=1) # Adjust fps as needed
anim.save("/cn/blog/2024-07-05-k-means_files/kmeans_clustering.mp4", writer=writer)
anim.save("img/kmeans_clustering.mp4", writer=writer)

# Show the animation
# plt.show()
```

{{< video "/cn/blog/2024-07-05-k-means_files/kmeans_clustering.mp4" >}}


![png](/cn/blog/2024-07-05-k-means_files/2024-07-05-k-means_16_0.png)



The animation video above is available [here](/cn/blog/2024-07-05-kmeans_files/kmeans_clustering.mp4).
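
Since the diff above only shows part of the animation code, here is a minimal, self-contained sketch (not the post's exact code) of how the k-means iterations might be animated with `FuncAnimation`; the synthetic data and the names `history`, `fig`, `ax`, and `update` are assumptions for illustration.

```python
# A hedged sketch: animate k-means by recording the centroids at every iteration
# and drawing one iteration per frame. Synthetic data; the empty-cluster edge
# case is ignored for brevity.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

rng = np.random.default_rng(0)
data = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(5, 1, (50, 2))])

# Run k-means first, saving the centroids after each iteration.
centroids = data[rng.choice(len(data), 2, replace=False)]
history = [centroids]
for _ in range(10):
    labels = np.argmin(np.linalg.norm(data[:, None] - centroids, axis=2), axis=1)
    centroids = np.array([data[labels == i].mean(axis=0) for i in range(2)])
    history.append(centroids)

fig, ax = plt.subplots()

def update(frame):
    ax.clear()
    c = history[frame]
    labels = np.argmin(np.linalg.norm(data[:, None] - c, axis=2), axis=1)
    ax.scatter(data[:, 0], data[:, 1], c=labels, cmap="coolwarm", s=15)
    ax.scatter(c[:, 0], c[:, 1], c="black", marker="x", s=100)
    ax.set_title(f"Iteration {frame}")

anim = FuncAnimation(fig, update, frames=len(history), interval=1000, repeat=False)
# anim.save("kmeans_clustering.mp4", writer="ffmpeg", fps=1)  # requires ffmpeg
```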

## Evaluating the Results

Our focus now is not on visualization; we mainly want to check whether our code is correct.


```python
def get_final_centroids_and_clusters(data, n_clusters, threshold, seed=None):
    """
    Perform K-means clustering on the provided data.
    Parameters:
        data (numpy.ndarray): Data points to cluster.
        n_clusters (int): Number of clusters.
        threshold (float): Convergence threshold.
        seed (int, optional): Random seed for reproducibility.
    Returns:
        updated_centroids (numpy.ndarray): Final centroids after clustering.
        clusters (list): Data points classified into clusters.
    """
    if seed is not None:
        np.random.seed(seed)

    data_size = data.shape[0]
    # Initialize centroids by randomly sampling from the data
    initial_centroids = data[np.random.choice(data_size, n_clusters, replace=False)]
    updated_centroids, clusters = kmeans_clustering(data, initial_centroids)
    centroids = initial_centroids

    while np.max(np.linalg.norm(updated_centroids - centroids, axis=1)) > threshold:
        centroids = updated_centroids
        updated_centroids, clusters = kmeans_clustering(data, centroids)

    return updated_centroids, clusters

def kmeans_clustering(data, centroids):
    """
    Perform one iteration of K-means clustering.
    Parameters:
        data (numpy.ndarray): Data points to cluster.
        centroids (numpy.ndarray): Current centroids.
    Returns:
        updated_centroids (numpy.ndarray): Updated centroids.
        clusters (list): Data points classified into clusters.
    """
    k = len(centroids)
    clusters = [[] for _ in range(k)]

    for d in data:
        # Calculate distances from the data point to each centroid
        distances = np.linalg.norm(d - centroids, axis=1)
        # Find the index of the closest centroid
        min_distance_idx = np.argmin(distances)
        # Assign the data point to the closest centroid's cluster
        clusters[min_distance_idx].append(d)

    updated_centroids = []
    for cluster in clusters:
        if cluster:
            updated_centroids.append(np.mean(cluster, axis=0))
        else:
            # Handle empty cluster by reinitializing the centroid randomly from the data
            updated_centroids.append(data[np.random.choice(data.shape[0])])

    return np.array(updated_centroids), clusters
```
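
The commit diff near the top of this page replaces a per-centroid list comprehension with a single vectorized `np.linalg.norm` call. As a small illustration (added here for clarity; the numbers are made up), both forms give the same distances for one point against a `(k, 2)` array of centroids:

```python
import numpy as np

d = np.array([160.0, 60.0])           # a made-up data point
centroids = np.array([[150.0, 59.0],  # made-up centroids, k = 2
                      [171.0, 80.0]])

loop_version = [np.linalg.norm(d - centroids[c]) for c in range(len(centroids))]
vectorized = np.linalg.norm(d - centroids, axis=1)
print(np.allclose(loop_version, vectorized))  # True
```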


```python
height_weight_df = pd.read_csv("data/height_weight_data.csv")
height_weight_data = height_weight_df[['Height', 'Weight']].values
final_centroids, clusters = get_final_centroids_and_clusters(
    height_weight_data, n_clusters=2, threshold=1e-3, seed=0
)
final_centroids
```




array([[150.04743075,  59.32777344],
       [171.59570541,  80.42337816]])




```python
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(height_weight_data)
kmeans.cluster_centers_
```




array([[150.04743075,  59.32777344],
       [171.59570541,  80.42337816]])



We can see that our result matches `sklearn`'s, which suggests our implementation is fine. But one feature is still missing: prediction. For example, given an arbitrary data point such as `[130, 66]`, does it belong to cluster 1 or cluster 2?

Let's implement that feature next.


```python
def predict(data, final_centroids):
    """
    Predict the closest cluster each data point in the data belongs to.
    Parameters:
        data (numpy.ndarray): Data points to predict.
        final_centroids (numpy.ndarray): Final centroids from clustering.
    Returns:
        labels (numpy.ndarray): Cluster labels for each data point.
    """
    labels = []

    for d in data:
        # Calculate distances from the data point to each centroid
        distances = np.linalg.norm(d - final_centroids, axis=1)
        # Find the index of the closest centroid
        min_distance_idx = np.argmin(distances)
        # That index is the label
        labels.append(min_distance_idx)
    return np.array(labels)
```


```python
predict([[130, 66], [144, 55]], final_centroids), kmeans.predict([[130, 66], [144, 55]])
```




(array([0, 0]), array([0, 0], dtype=int32))



We can see that the function we wrote produces the same result as `sklearn`.
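
As a quick numeric check (added here, not in the original post), we can plug `[130, 66]` into the centroids printed above and see that it is indeed closer to the first one:

```python
import numpy as np

# Centroids copied from the output above.
final_centroids = np.array([[150.04743075, 59.32777344],
                            [171.59570541, 80.42337816]])
point = np.array([130, 66])

distances = np.linalg.norm(point - final_centroids, axis=1)
print(distances)             # roughly [21.1, 44.0]
print(np.argmin(distances))  # 0, i.e. cluster 0, matching predict() above
```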

## Going Further

Next, we will write a Python class instead of three separate `def`s.


```python
class KMeans:
    def __init__(self, n_clusters, threshold, seed=None):
        """
        Initialize the KMeans class with the number of clusters, convergence threshold, and optional random seed.
        Parameters:
            n_clusters (int): Number of clusters.
            threshold (float): Convergence threshold.
            seed (int, optional): Random seed for reproducibility.
        """
        self.n_clusters = n_clusters
        self.threshold = threshold
        self.seed = seed
        self.centroids = None

    def fit(self, data):
        """
        Fit the K-means algorithm to the data.
        Parameters:
            data (numpy.ndarray): Data points to cluster.
        Returns:
            self (KMeans): Fitted KMeans instance.
        """
        if self.seed is not None:
            np.random.seed(self.seed)
        data_size = data.shape[0]
        # Initialize centroids by randomly sampling from the data
        initial_centroids = data[np.random.choice(data_size, self.n_clusters, replace=False)]
        updated_centroids, clusters = self._kmeans_clustering(data, initial_centroids)
        centroids = initial_centroids

        while np.max(np.linalg.norm(updated_centroids - centroids, axis=1)) > self.threshold:
            centroids = updated_centroids
            updated_centroids, clusters = self._kmeans_clustering(data, centroids)

        self.centroids = updated_centroids
        self.clusters = clusters

        return self

    def _kmeans_clustering(self, data, centroids):
        """
        Perform one iteration of K-means clustering.
        Parameters:
            data (numpy.ndarray): Data points to cluster.
            centroids (numpy.ndarray): Current centroids.
        Returns:
            updated_centroids (numpy.ndarray): Updated centroids.
            clusters (list): Data points classified into clusters.
        """
        k = len(centroids)
        clusters = [[] for _ in range(k)]

        for d in data:
            # Calculate distances from the data point to each centroid
            distances = np.linalg.norm(d - centroids, axis=1)
            # Find the index of the closest centroid
            min_distance_idx = np.argmin(distances)
            # Assign the data point to the closest centroid's cluster
            clusters[min_distance_idx].append(d)

        updated_centroids = []
        for cluster in clusters:
            if cluster:
                updated_centroids.append(np.mean(cluster, axis=0))
            else:
                # Handle empty cluster by reinitializing the centroid randomly from the data
                updated_centroids.append(data[np.random.choice(data.shape[0])])

        return np.array(updated_centroids), clusters

    def predict(self, data):
        """
        Predict the closest cluster each data point in the data belongs to.
        Parameters:
            data (numpy.ndarray): Data points to predict.
        Returns:
            labels (numpy.ndarray): Cluster labels for each data point.
        """
        labels = []

        for d in data:
            # Calculate distances from the data point to each centroid
            distances = np.linalg.norm(d - self.centroids, axis=1)
            # Find the index of the closest centroid
            min_distance_idx = np.argmin(distances)
            # That index is the label
            labels.append(min_distance_idx)
        return np.array(labels)

# Example usage:
# kmeans = KMeans(n_clusters=3, threshold=0.001, seed=42)
# kmeans.fit(data)
# labels = kmeans.predict(data)
```


```python
kmeans = KMeans(n_clusters=2, threshold=0.001, seed=0)
kmeans_result = kmeans.fit(height_weight_data)
kmeans_result.centroids
```




array([[150.04743075,  59.32777344],
       [171.59570541,  80.42337816]])




```python
kmeans.predict([[130, 66], [144, 55]])
```




array([0, 0])
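
As a final sanity check (added here, not part of the original post), the class-based version can be compared against `sklearn` the same way the function version was earlier; this assumes `height_weight_data` is still in scope, and uses the alias `SKKMeans` so sklearn's estimator does not clash with our own `KMeans` class:

```python
from sklearn.cluster import KMeans as SKKMeans

sk = SKKMeans(n_clusters=2, random_state=0, n_init="auto").fit(height_weight_data)
print(sk.cluster_centers_)                 # should match kmeans_result.centroids
print(sk.predict([[130, 66], [144, 55]]))  # should match our array([0, 0])
```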


406 changes: 373 additions & 33 deletions notebooks/2024-07-05-k-means.ipynb

Large diffs are not rendered by default.
