Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Marcin Bielak authored and Marcin Bielak committed Jan 25, 2010
0 parents commit 0082ae9
Show file tree
Hide file tree
Showing 27 changed files with 8,538 additions and 0 deletions.
11 changes: 11 additions & 0 deletions K-Means_clustering/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

Links:

http://blogs.sun.com/yongsun/entry/k_means_and_k_means
http://www.solcoproject.net/?secid=6&pid=30
http://sourceforge.net/search/?type_of_search=soft&words=k-means&search=Search
http://www.inb.mu-luebeck.de/biosoft/biopython/api/Bio/Tools/Clustering/kMeans.py.html
http://www.inb.mu-luebeck.de/biosoft/biopython/api/index.html
http://www.phpandme.net/2009/03/calculating-distance-between-two-documents/
http://www.phpandme.net/2009/03/bayesian-text-classification/
http://people.revoledu.com/kardi/tutorial/kMean/Resources.htm
132 changes: 132 additions & 0 deletions K-Means_clustering/cluster.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
<?php
/**
* K-means clustering with randomly initialised centroids and optional centroid normalisation
*
* @see http://phpir.com/clustering
* @see
*/

// Whether randomly generated centroids are rescaled so their coordinates sum to 1.
// The constant name must be a quoted string: a bare WITH_NORMALIZE is an
// undefined-constant notice on old PHP and a fatal Error on PHP 8.
define('WITH_NORMALIZE', false);

// Ten two-dimensional sample documents; the two coordinates of each one sum to 1.
$data = array(
    array(0.05, 0.95),
    array(0.1, 0.9),
    array(0.2, 0.8),
    array(0.25, 0.75),
    array(0.45, 0.55),
    array(0.5, 0.5),
    array(0.55, 0.45),
    array(0.85, 0.15),
    array(0.9, 0.1),
    array(0.95, 0.05)
);


// Cluster the sample into three groups and dump the resulting structure.
var_dump(kMeans($data, 3, WITH_NORMALIZE));

/**
 * Create $k random starting centroids inside the bounding box of $data.
 *
 * @param array $data      list of documents, each an array of coordinates keyed by dimension
 * @param int   $k         number of centroids to generate
 * @param bool  $normalize forwarded unchanged to initialiseCentroid()
 * @return array list of $k centroids, keyed 0..$k-1
 */
function initialiseCentroids(array $data, $k, $normalize = false) {
    $upper = array();
    $lower = array();

    // Track the largest and smallest coordinate seen on every axis.
    foreach ($data as $row) {
        foreach ($row as $axis => $coordinate) {
            if (!isset($upper[$axis]) || $coordinate > $upper[$axis]) {
                $upper[$axis] = $coordinate;
            }
            if (!isset($lower[$axis]) || $coordinate < $lower[$axis]) {
                $lower[$axis] = $coordinate;
            }
        }
    }

    // The first document decides how many dimensions a centroid has.
    $width = count($data[0]);

    $seeds = array();
    $made = 0;
    while ($made < $k) {
        $seeds[$made] = initialiseCentroid($width, $upper, $lower, $normalize);
        $made++;
    }

    return $seeds;
}

/**
 * Create one random centroid whose coordinates lie between the per-dimension bounds.
 *
 * Coordinates are drawn with a resolution of 1/1000: the bounds are scaled up
 * to integers for rand() and the drawn value is scaled back down, so the
 * centroid lives in the same range as the data.
 *
 * @param int   $dimensions number of coordinates to generate (indices 0..$dimensions-1)
 * @param array $dimmax     upper bound for each dimension
 * @param array $dimmin     lower bound for each dimension
 * @param bool  $normalize  anything other than boolean false rescales the
 *                          centroid so that its coordinates sum to 1
 * @return array the centroid's coordinates
 */
function initialiseCentroid($dimensions, $dimmax, $dimmin, $normalize = false) {
    $total = 0;
    $centroid = array();

    for ($j = 0; $j < $dimensions; $j++) {
        // rand() takes integers; round explicitly instead of relying on
        // implicit float-to-int conversion of e.g. 0.55 * 1000.
        $low = (int) round($dimmin[$j] * 1000);
        $high = (int) round($dimmax[$j] * 1000);

        // Scale back by 1000 so the value is comparable with the data.
        $centroid[$j] = rand($low, $high) / 1000;
        $total += $centroid[$j];
    }

    // A zero total cannot be normalised (division by zero); return it as drawn.
    if (false === $normalize || $total == 0) {
        return $centroid;
    }

    return normaliseValue($centroid, $total);
}

/**
 * Partition $data into $k clusters with the k-means algorithm.
 *
 * Starting from random centroids, documents are repeatedly assigned to their
 * nearest centroid and the centroids are recomputed, until one full pass
 * leaves every assignment unchanged.
 *
 * @param array $data      list of documents, each an array of coordinates
 * @param int   $k         number of clusters
 * @param bool  $normalize forwarded to initialiseCentroids()
 * @return array result of formatResults(): the final centroids under the key
 *               'centroids', plus the member documents of each cluster
 */
function kMeans($data, $k, $normalize = false) {
    // Forward the caller's flag as given (no assignment inside the call).
    $centroids = initialiseCentroids($data, $k, $normalize);
    $mapping = array();

    while (true) {
        $new_mapping = assignCentroids($data, $centroids);

        // Converged only when *every* document kept its centroid. Both arrays
        // are built by iterating $data in the same order, so a strict array
        // comparison checks all assignments at once (and copes with nulls).
        if ($new_mapping === $mapping) {
            return formatResults($mapping, $data, $centroids);
        }

        $mapping = $new_mapping;
        $centroids = updateCentroids($mapping, $data, $k);
    }
}

/**
 * Turn a document-to-centroid mapping into a printable result structure.
 *
 * @param array $mapping   document ID => centroid ID
 * @param array $data      document ID => array of coordinates
 * @param array $centroids stored unchanged under the key 'centroids'
 * @return array 'centroids' plus, for each centroid ID in use, a list of its
 *               documents rendered as comma-separated coordinate strings
 */
function formatResults($mapping, $data, $centroids) {
    $grouped = array('centroids' => $centroids);

    foreach (array_keys($mapping) as $docKey) {
        $cluster = $mapping[$docKey];
        $label = implode(',', $data[$docKey]);
        $grouped[$cluster][] = $label;
    }

    return $grouped;
}

/**
 * Assign every document to its nearest centroid.
 *
 * Distance is the sum of absolute coordinate differences (Manhattan distance).
 * When two centroids are equally near, the one iterated first wins.
 *
 * @param array $data      document ID => array of coordinates
 * @param array $centroids centroid ID => array of coordinates
 * @return array document ID => centroid ID (null only when $centroids is empty)
 */
function assignCentroids($data, $centroids) {
    $mapping = array();

    foreach ($data as $documentID => $document) {
        // null means "no candidate yet"; a fixed starting distance would leave
        // documents that are farther away than that value unassigned.
        $minDist = null;
        $minCentroid = null;

        foreach ($centroids as $centroidID => $centroid) {
            $dist = 0;
            foreach ($centroid as $dim => $value) {
                $dist += abs($value - $document[$dim]);
            }

            if ($minDist === null || $dist < $minDist) {
                $minDist = $dist;
                $minCentroid = $centroidID;
            }
        }

        $mapping[$documentID] = $minCentroid;
    }

    return $mapping;
}

/**
 * Recompute each centroid as the mean of the documents mapped to it.
 *
 * Centroid IDs are preserved. Any ID in 0..$k-1 that received no documents is
 * given a fresh random centroid from initialiseCentroids(), so the result
 * always describes $k clusters under stable IDs.
 *
 * @param array $mapping document ID => centroid ID (null entries are skipped)
 * @param array $data    document ID => array of coordinates
 * @param int   $k       number of clusters
 * @return array centroid ID => array of coordinates, sorted by centroid ID
 */
function updateCentroids($mapping, $data, $k) {
    $sums = array();
    $counts = array();

    // Accumulate per-centroid coordinate sums and member counts, creating the
    // accumulators explicitly instead of using += on undefined keys.
    foreach ($mapping as $documentID => $centroidID) {
        if ($centroidID === null) {
            continue; // unassigned document contributes to no centroid
        }
        if (!isset($counts[$centroidID])) {
            $counts[$centroidID] = 0;
            $sums[$centroidID] = array();
        }
        $counts[$centroidID]++;
        foreach ($data[$documentID] as $dim => $value) {
            if (!isset($sums[$centroidID][$dim])) {
                $sums[$centroidID][$dim] = 0;
            }
            $sums[$centroidID][$dim] += $value;
        }
    }

    $centroids = array();
    foreach ($sums as $centroidID => $sum) {
        $centroids[$centroidID] = array();
        foreach ($sum as $dim => $total) {
            $centroids[$centroidID][$dim] = $total / $counts[$centroidID];
        }
    }

    // Re-seed empty clusters in place; array_merge() would renumber the
    // integer keys and silently change which ID belongs to which centroid.
    $missing = array();
    for ($id = 0; $id < $k; $id++) {
        if (!isset($centroids[$id])) {
            $missing[] = $id;
        }
    }
    if (count($missing) > 0) {
        $fresh = initialiseCentroids($data, count($missing));
        foreach ($missing as $i => $id) {
            $centroids[$id] = $fresh[$i];
        }
    }

    ksort($centroids);

    return $centroids;
}

/**
 * Divide every component of $vector by $total.
 *
 * @param array $vector components to scale; keys are preserved
 * @param mixed $total  divisor applied to each component
 * @return array the scaled copy of $vector
 */
function normaliseValue(array $vector, $total) {
    $scaled = array();

    foreach ($vector as $index => $component) {
        $scaled[$index] = $component / $total;
    }

    return $scaled;
}

132 changes: 132 additions & 0 deletions K-Means_clustering/clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@

# clustering.py contains classes and functions that cluster data points
import sys, math, random


# -- The Point class represents points in n-dimensional space
class Point:
    """A location in n-dimensional space with an optional attached object."""

    # Instance variables
    #   self.coords    -- the coordinates exactly as passed in
    #   self.n         -- how many coordinates there are (the dimensionality)
    #   self.reference -- arbitrary object bound to this Point, or None

    def __init__(self, coords, reference=None):
        """Store the coordinates, derive the dimensionality, keep the reference."""
        self.reference = reference
        self.coords = coords
        self.n = len(coords)

    def __repr__(self):
        """Represent the Point by the string form of its coordinates."""
        text = str(self.coords)
        return text


# -- The Cluster class represents clusters of points in n-dimensional space
class Cluster:
    """A non-empty group of Points that all live in the same n-dimensional space.

    Instance variables:
      self.points   -- list of Points associated with this Cluster
      self.n        -- number of dimensions this Cluster's Points live in
      self.centroid -- the sample mean Point of this Cluster
    """

    def __init__(self, points):
        """Build a Cluster around ``points`` and compute its centroid.

        Raises Exception if ``points`` is empty or mixes dimensionalities.
        """
        self.n = self._checkPoints(points)
        self.points = points
        # Figure out what the centroid of this Cluster should be
        self.centroid = self.calculateCentroid()

    def _checkPoints(self, points, n=None):
        """Enforce the Cluster invariants on ``points`` and return their dimension.

        Empty lists are rejected (an empty Cluster has no mean), and so are
        lists mixing dimensionalities. When ``n`` is None the dimension of the
        first Point is used as the reference.
        """
        if len(points) == 0:
            raise Exception("ILLEGAL: EMPTY CLUSTER")
        if n is None:
            n = points[0].n
        for p in points:
            if p.n != n:
                raise Exception("ILLEGAL: MULTISPACE CLUSTER")
        return n

    def __repr__(self):
        """Return the string representation of the member Points."""
        return str(self.points)

    def update(self, points):
        """Replace the member Points and return how far the centroid moved.

        This is the update step of the K-means algorithm. ``points`` is
        validated *before* anything is changed, so a rejected list leaves the
        Cluster exactly as it was.

        Raises Exception if ``points`` is empty or contains a Point whose
        dimensionality differs from this Cluster's.
        """
        self._checkPoints(points, self.n)
        old_centroid = self.centroid
        self.points = points
        self.centroid = self.calculateCentroid()
        return getDistance(old_centroid, self.centroid)

    def calculateCentroid(self):
        """Return a new Point at the average of all member Points."""
        count = len(self.points)
        # Start each sum from 0.0 so the division is a float division even
        # when the coordinates are integers.
        centroid_coords = [
            sum((p.coords[i] for p in self.points), 0.0) / count
            for i in range(self.n)
        ]
        return Point(centroid_coords)


# -- Return Clusters of Points formed by K-means clustering
def kmeans(points, k, cutoff):
    """Group ``points`` into ``k`` Clusters using the K-means algorithm.

    ``k`` distinct Points are sampled as the initial single-Point Clusters.
    Each pass assigns every Point to the Cluster with the nearest centroid
    and updates the Clusters; the loop stops once the largest centroid shift
    of a pass is smaller than ``cutoff`` (so ``cutoff`` should be positive).

    A Cluster that attracts no Points in a pass is handed the Point lying
    farthest from its own centroid, so no Cluster is ever updated with an
    empty list.

    Raises ValueError (from random.sample) if ``k`` exceeds len(points).
    Returns the list of Clusters.
    """
    # Randomly sample k Points from the points list, build Clusters around them
    initial = random.sample(points, k)
    clusters = [Cluster([p]) for p in initial]

    while True:
        # One membership list per Cluster, filled by nearest centroid
        lists = [[] for _ in clusters]
        for p in points:
            lists[_nearestClusterIndex(p, clusters)].append(p)

        _refillEmptyLists(lists, clusters)

        # Update each Cluster and record the biggest centroid shift
        biggest_shift = 0.0
        for cluster, members in zip(clusters, lists):
            biggest_shift = max(biggest_shift, cluster.update(members))

        # If the biggest centroid shift is less than the cutoff, stop
        if biggest_shift < cutoff:
            break

    return clusters


def _nearestClusterIndex(p, clusters):
    """Return the index of the Cluster whose centroid is closest to ``p``.

    The first Cluster wins ties because only a strictly smaller distance
    replaces the current best.
    """
    index = 0
    smallest_distance = getDistance(p, clusters[0].centroid)
    for i in range(1, len(clusters)):
        distance = getDistance(p, clusters[i].centroid)
        if distance < smallest_distance:
            smallest_distance = distance
            index = i
    return index


def _refillEmptyLists(lists, clusters):
    """Move one Point into every empty membership list, in place.

    The Point taken is the one farthest from the centroid it was assigned to,
    drawn only from lists that keep at least one Point afterwards. Such a
    donor always exists: there are at least as many Points as Clusters, so
    while some list is empty another must hold two or more.
    """
    for members in lists:
        if members:
            continue
        farthest = None
        for j, donor in enumerate(lists):
            if len(donor) < 2:
                continue
            for p in donor:
                distance = getDistance(p, clusters[j].centroid)
                if farthest is None or distance > farthest[0]:
                    farthest = (distance, j, p)
        _, j, p = farthest
        lists[j].remove(p)
        members.append(p)


# -- Get the Euclidean distance between two Points
def getDistance(a, b):
    """Return the Euclidean distance between Points ``a`` and ``b``.

    Raises Exception when the two Points have different dimensionalities.
    """
    dims = a.n
    if dims != b.n:
        raise Exception("ILLEGAL: NON-COMPARABLE POINTS")
    # sqrt of the sum of squared per-coordinate differences; the 0.0 start
    # value keeps the accumulator a float
    squared = sum(((a.coords[i] - b.coords[i]) ** 2 for i in range(dims)), 0.0)
    return math.sqrt(squared)

# -- Create a random Point in n-dimensional space
def makeRandomPoint(n, lower, upper):
    """Return a Point with ``n`` coordinates, each drawn by random.uniform(lower, upper)."""
    return Point([random.uniform(lower, upper) for _ in range(n)])

# -- Main function
def main(args):
    """Run a small demo: cluster random Points and print the outcome.

    ``args`` is accepted so the function can be handed ``sys.argv``, but it
    is not used; every setting is fixed below.
    """
    num_points, n, k, cutoff, lower, upper = 10, 2, 3, 0.5, -200, 200
    # Create num_points random Points in n-dimensional space
    points = [makeRandomPoint(n, lower, upper) for _ in range(num_points)]
    # Cluster the points using the K-means algorithm
    clusters = kmeans(points, k, cutoff)
    # Print the results. A single pre-formatted argument inside print(...)
    # produces the same text on Python 2 and Python 3, whereas the
    # print-statement form is a SyntaxError on Python 3.
    print("\nPOINTS:")
    for p in points:
        print("P: %s" % (p,))
    print("\nCLUSTERS:")
    for c in clusters:
        print("C: %s" % (c,))

# -- The following code executes upon command-line invocation
if "__main__" == __name__:
    # Hand over the raw argument vector; main() does not read it.
    main(sys.argv)

Binary file added K-Means_clustering/masterThesis-VR.pdf
Binary file not shown.
Loading

0 comments on commit 0082ae9

Please sign in to comment.