first commit

nanounanue · Jan 25, 2010 · 0082ae9 · 0082ae9
commit 0082ae9
Show file tree

Hide file tree

Showing 27 changed files with 8,538 additions and 0 deletions.
diff --git a/K-Means_clustering/README b/K-Means_clustering/README
@@ -0,0 +1,11 @@
+
+Links:
+
+http://blogs.sun.com/yongsun/entry/k_means_and_k_means
+http://www.solcoproject.net/?secid=6&pid=30
+http://sourceforge.net/search/?type_of_search=soft&words=k-means&search=Search
+http://www.inb.mu-luebeck.de/biosoft/biopython/api/Bio/Tools/Clustering/kMeans.py.html
+http://www.inb.mu-luebeck.de/biosoft/biopython/api/index.html
+http://www.phpandme.net/2009/03/calculating-distance-between-two-documents/
+http://www.phpandme.net/2009/03/bayesian-text-classification/
+http://people.revoledu.com/kardi/tutorial/kMean/Resources.htm
diff --git a/K-Means_clustering/cluster.php b/K-Means_clustering/cluster.php
@@ -0,0 +1,132 @@
+<?php
+/**
+ * K-means clustering with randomly initialised centroids and optional
+ * normalisation of the initial centroid values.
+ *
+ * @see   http://phpir.com/clustering
+ */
+
+// Whether freshly generated centroids are scaled so their components sum to 1.
+// The constant name must be a quoted string: a bare WITH_NORMALIZE here is a
+// reference to a constant that does not exist yet (a notice on PHP 5, a fatal
+// Error on PHP 8).
+define('WITH_NORMALIZE', false);
+
+// Sample two-dimensional documents; the two components of each row sum to 1.
+$data = array( 
+	array(0.05, 0.95),
+	array(0.1, 0.9),
+	array(0.2, 0.8),
+	array(0.25, 0.75),
+	array(0.45, 0.55),
+	array(0.5, 0.5),
+	array(0.55, 0.45), 
+	array(0.85, 0.15),
+	array(0.9, 0.1),
+	array(0.95, 0.05)
+);
+
+
+// Split the sample into three clusters and dump the resulting structure.
+var_dump(kMeans($data, 3, WITH_NORMALIZE));
+
+/**
+ * Create $k random starting centroids inside the bounding box of the data.
+ *
+ * @param array $data      List of documents; each document maps dimension => value.
+ * @param int   $k         Number of centroids to create.
+ * @param bool  $normalize Passed through unchanged to initialiseCentroid().
+ * @return array Centroids indexed 0 .. $k-1.
+ */
+function initialiseCentroids(array $data, $k, $normalize = false) {
+	// The first document decides how many dimensions every centroid gets.
+	$size = count($data[0]);
+	$lowest = array();
+	$highest = array();
+
+	// Record the smallest and largest value seen in each dimension.
+	foreach($data as $row) {
+		foreach($row as $axis => $number) {
+			$keepHigh = isset($highest[$axis]) && !($number > $highest[$axis]);
+			$highest[$axis] = $keepHigh ? $highest[$axis] : $number;
+
+			$keepLow = isset($lowest[$axis]) && !($number < $lowest[$axis]);
+			$lowest[$axis] = $keepLow ? $lowest[$axis] : $number;
+		}
+	}
+
+	$result = array();
+	$index = 0;
+	while($index < $k) {
+		$result[$index] = initialiseCentroid($size, $highest, $lowest, $normalize);
+		$index++;
+	}
+	return $result;
+}
+
+/**
+ * Build one random centroid whose components lie within the per-dimension
+ * bounds of the data.
+ *
+ * rand() only produces integers, so the bounds are multiplied by 1000 to get
+ * three decimals of resolution and the drawn value is divided by 1000 again.
+ * Without that division the centroid ends up roughly 1000 times outside the
+ * data range whenever normalisation is switched off.
+ *
+ * @param int   $dimensions Number of components to generate (indexes 0 .. $dimensions-1).
+ * @param array $dimmax     Largest data value per dimension.
+ * @param array $dimmin     Smallest data value per dimension.
+ * @param bool  $normalize  Anything other than boolean false scales the
+ *                          components so that they sum to 1.
+ * @return array The centroid as a list of $dimensions numbers.
+ */
+function initialiseCentroid($dimensions, $dimmax, $dimmin, $normalize = false) {
+	$scale = 1000;
+	$total = 0;
+	$centroid = array();
+	for($j = 0; $j < $dimensions; $j++) {
+		// rand() expects integer bounds; round explicitly instead of relying
+		// on the implicit (and lossy) float-to-int conversion.
+		$low = (int) round($dimmin[$j] * $scale);
+		$high = (int) round($dimmax[$j] * $scale);
+		$centroid[$j] = rand($low, $high) / $scale;
+		$total += $centroid[$j];
+	}
+
+	// Nothing to normalise when it was not requested, or when the components
+	// sum to zero (dividing by that total would fail).
+	if(false === $normalize || $total == 0) {
+		return $centroid;
+	}
+
+	return normaliseValue($centroid, $total);
+}
+
+/**
+ * Cluster the documents into $k groups with the k-means algorithm.
+ *
+ * Starting from random centroids, documents are repeatedly assigned to their
+ * nearest centroid and the centroids are recomputed, until a full pass leaves
+ * every assignment unchanged.
+ *
+ * @param array $data      List of documents; each document maps dimension => value.
+ * @param int   $k         Number of clusters.
+ * @param bool  $normalize Forwarded to initialiseCentroids().
+ * @return array Result of formatResults(): the final centroids under the
+ *               'centroids' key plus one list of documents per centroid ID.
+ */
+function kMeans($data, $k, $normalize = false) {
+	// Forward the caller's flag; writing "$normalize = false" in the call
+	// would overwrite it and always disable normalisation.
+	$centroids = initialiseCentroids($data, $k, $normalize);
+	$mapping = array();
+
+	while(true) {
+		$new_mapping = assignCentroids($data, $centroids);
+
+		// Converged only when the assignment of every document is unchanged,
+		// not just that of the first one.
+		if($new_mapping === $mapping) {
+			return formatResults($mapping, $data, $centroids);
+		}
+
+		$mapping = $new_mapping;
+		$centroids = updateCentroids($mapping, $data, $k);
+	}
+}
+
+/**
+ * Arrange the clustering outcome for output.
+ *
+ * @param array $mapping   Document ID => centroid ID.
+ * @param array $data      Documents, indexed by document ID.
+ * @param array $centroids Final centroids.
+ * @return array The centroids under the 'centroids' key, followed by one entry
+ *               per centroid ID holding its documents as comma-joined strings.
+ */
+function formatResults($mapping, $data, $centroids) {
+	$result = array('centroids' => $centroids);
+
+	foreach(array_keys($mapping) as $documentID) {
+		$cluster = $mapping[$documentID];
+		$line = implode(',', $data[$documentID]);
+		$result[$cluster][] = $line;
+	}
+
+	return $result;
+}
+
+/**
+ * Assign every document to its nearest centroid.
+ *
+ * Distance is the sum of absolute per-dimension differences (Manhattan
+ * distance), measured over the dimensions present in the centroid.
+ *
+ * @param array $data      Documents, indexed by document ID.
+ * @param array $centroids Centroids, indexed by centroid ID.
+ * @return array Document ID => ID of the closest centroid (the first one wins
+ *               on ties); null only when $centroids is empty.
+ */
+function assignCentroids($data, $centroids) {
+	$mapping = array();
+
+	foreach($data as $documentID => $document) {
+		// Start without a candidate rather than with a fixed threshold: a
+		// hard-coded starting distance leaves documents unassigned whenever
+		// every centroid is farther away than that threshold.
+		$minDist = null;
+		$minCentroid = null;
+		foreach($centroids as $centroidID => $centroid) {
+			$dist = 0;
+			foreach($centroid as $dim => $value) {
+				$dist += abs($value - $document[$dim]);
+			}
+			if($minDist === null || $dist < $minDist) {
+				$minDist = $dist;
+				$minCentroid = $centroidID;
+			}
+		}
+		$mapping[$documentID] = $minCentroid;
+	}
+
+	return $mapping;
+}
+
+/**
+ * Recompute each centroid as the mean of the documents assigned to it.
+ *
+ * Centroid IDs are kept stable: a centroid that received no documents is
+ * replaced by a fresh random one under the same ID, so the returned array
+ * always describes $k centroids.
+ *
+ * @param array $mapping Document ID => centroid ID (null entries are skipped).
+ * @param array $data    Documents, indexed by document ID.
+ * @param int   $k       Number of centroids expected.
+ * @return array Centroid ID => centroid (dimension => mean value).
+ */
+function updateCentroids($mapping, $data, $k) {
+	// Count members per centroid by hand: array_count_values() emits a
+	// warning for null entries, which stand for unassigned documents.
+	$counts = array();
+	foreach($mapping as $centroidID) {
+		if($centroidID === null) {
+			continue;
+		}
+		$counts[$centroidID] = isset($counts[$centroidID]) ? $counts[$centroidID] + 1 : 1;
+	}
+
+	$centroids = array();
+	foreach($mapping as $documentID => $centroidID) {
+		if($centroidID === null) {
+			continue;
+		}
+		foreach($data[$documentID] as $dim => $value) {
+			// Initialise the accumulator explicitly; "+=" on a missing index
+			// raises an undefined-index notice/warning.
+			if(!isset($centroids[$centroidID][$dim])) {
+				$centroids[$centroidID][$dim] = 0;
+			}
+			$centroids[$centroidID][$dim] += ($value/$counts[$centroidID]);
+		}
+	}
+
+	$missing = $k - count($centroids);
+	if($missing > 0) {
+		// Re-seed empty clusters under their own IDs. array_merge() would
+		// renumber the surviving centroids and shift their IDs.
+		$fresh = initialiseCentroids($data, $missing);
+		for($id = 0; $id < $k; $id++) {
+			if(!isset($centroids[$id]) && count($fresh) > 0) {
+				$centroids[$id] = array_shift($fresh);
+			}
+		}
+		ksort($centroids);
+	}
+
+	return $centroids;
+}
+
+/**
+ * Divide every component of a vector by the given total.
+ *
+ * @param array $vector Components to scale; keys are preserved.
+ * @param float $total  Divisor applied to each component.
+ * @return array The scaled copy of $vector.
+ */
+function normaliseValue(array $vector, $total) {
+	$scaled = array();
+	foreach($vector as $key => $component) {
+		$scaled[$key] = $component / $total;
+	}
+	return $scaled;
+}
+
diff --git a/K-Means_clustering/clustering.py b/K-Means_clustering/clustering.py
@@ -0,0 +1,132 @@
+
+# clustering.py contains classes and functions that cluster data points
+import sys, math, random
+
+
+# -- The Point class represents points in n-dimensional space
+class Point:
+    # Instance variables
+    # self.coords is a list of coordinates for this Point
+    # self.n is the number of dimensions this Point lives in (ie, its space)
+    # self.reference is an object bound to this Point
+    # Initialize new Points
+    def __init__(self, coords, reference=None):
+        self.coords = coords
+        self.n = len(coords)
+        self.reference = reference
+    # Return a string representation of this Point
+    def __repr__(self):
+        return str(self.coords)
+
+
+# -- The Cluster class represents clusters of points in n-dimensional space
+class Cluster:
+    # Instance variables
+    # self.points is a list of Points associated with this Cluster
+    # self.n is the number of dimensions this Cluster's Points live in
+    # self.centroid is the sample mean Point of this Cluster
+    def __init__(self, points):
+        # We forbid empty Clusters (they don't make mathematical sense!)
+        if len(points) == 0: raise Exception("ILLEGAL: EMPTY CLUSTER")
+        self.points = points
+        self.n = points[0].n
+        # We also forbid Clusters containing Points in different spaces
+        # Ie, no Clusters with 2D Points and 3D Points
+        for p in points:
+            if p.n != self.n: raise Exception("ILLEGAL: MULTISPACE CLUSTER")
+        # Figure out what the centroid of this Cluster should be
+        self.centroid = self.calculateCentroid()
+    # Return a string representation of this Cluster
+    def __repr__(self):
+        return str(self.points)
+    # Update function for the K-means algorithm
+    # Assigns a new list of Points to this Cluster, returns centroid difference
+    def update(self, points):
+        old_centroid = self.centroid
+        self.points = points
+        self.centroid = self.calculateCentroid()
+        return getDistance(old_centroid, self.centroid)
+    # Calculates the centroid Point - the centroid is the sample mean Point
+    # (in plain English, the average of all the Points in the Cluster)
+    def calculateCentroid(self):
+        centroid_coords = []
+        # For each coordinate:
+        for i in range(self.n):
+            # Take the average across all Points
+            centroid_coords.append(0.0)
+            for p in self.points:
+                centroid_coords[i] = centroid_coords[i]+p.coords[i]
+            centroid_coords[i] = centroid_coords[i]/len(self.points)
+        # Return a Point object using the average coordinates
+        return Point(centroid_coords)
+
+
+# -- Return Clusters of Points formed by K-means clustering
+def kmeans(points, k, cutoff):
+    # Randomly sample k Points from the points list, build Clusters around them
+    initial = random.sample(points, k)
+    clusters = []
+    for p in initial: clusters.append(Cluster([p]))
+    # Enter the program loop
+    while True:
+        # Make a list for each Cluster
+        lists = []
+        for c in clusters: lists.append([])
+        # For each Point:
+        for p in points:
+            # Figure out which Cluster's centroid is the nearest
+            smallest_distance = getDistance(p, clusters[0].centroid)
+            index = 0
+            for i in range(len(clusters[1:])):
+                distance = getDistance(p, clusters[i+1].centroid)
+                if distance < smallest_distance:
+                    smallest_distance = distance
+                    index = i+1
+            # Add this Point to that Cluster's corresponding list
+            lists[index].append(p)
+        # Update each Cluster with the corresponding list
+        # Record the biggest centroid shift for any Cluster
+        biggest_shift = 0.0
+        for i in range(len(clusters)):
+            shift = clusters[i].update(lists[i])
+            biggest_shift = max(biggest_shift, shift)
+        # If the biggest centroid shift is less than the cutoff, stop
+        if biggest_shift < cutoff: break
+    # Return the list of Clusters
+    return clusters
+
+
+# -- Get the Euclidean distance between two Points
+def getDistance(a, b):
+    # Forbid measurements between Points in different spaces
+    if a.n != b.n: raise Exception("ILLEGAL: NON-COMPARABLE POINTS")
+    # Euclidean distance between a and b is sqrt(sum((a[i]-b[i])^2) for all i)
+    ret = 0.0
+    for i in range(a.n):
+        ret = ret+pow((a.coords[i]-b.coords[i]), 2)
+    return math.sqrt(ret)
+
+# -- Create a random Point in n-dimensional space
+def makeRandomPoint(n, lower, upper):
+    coords = []
+    for i in range(n): coords.append(random.uniform(lower, upper))
+    return Point(coords)
+
+# -- Main function
+def main(args):
+    num_points, n, k, cutoff, lower, upper = 10, 2, 3, 0.5, -200, 200
+    # Create num_points random Points in n-dimensional space
+    points = []
+    for i in range(num_points): points.append(makeRandomPoint(n, lower, upper))
+    # Cluster the points using the K-means algorithm
+    clusters = kmeans(points, k, cutoff)
+    # Print the results
+    print "\nPOINTS:"
+    for p in points: print "P:", p
+    print "\nCLUSTERS:"
+    for c in clusters: print "C:", c
+
+# -- The following code executes upon command-line invocation
+if __name__ == "__main__":
+    # Run the demo only when executed as a script, not when imported;
+    # main() receives the raw argument list but currently ignores it.
+    main(sys.argv)
+
diff --git a/K-Means_clustering/masterThesis-VR.pdf b/K-Means_clustering/masterThesis-VR.pdf