forked from bieli/data_mining_experiments
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Marcin Bielak
authored and
Marcin Bielak
committed
Jan 25, 2010
0 parents
commit 0082ae9
Showing
27 changed files
with
8,538 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
|
||
Links:

http://blogs.sun.com/yongsun/entry/k_means_and_k_means
http://www.solcoproject.net/?secid=6&pid=30
http://sourceforge.net/search/?type_of_search=soft&words=k-means&search=Search
http://www.inb.mu-luebeck.de/biosoft/biopython/api/Bio/Tools/Clustering/kMeans.py.html
http://www.inb.mu-luebeck.de/biosoft/biopython/api/index.html
http://www.phpandme.net/2009/03/calculating-distance-between-two-documents/
http://www.phpandme.net/2009/03/bayesian-text-classification/
http://people.revoledu.com/kardi/tutorial/kMean/Resources.htm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
<?php
/**
 * K-means clustering with centroid and normalize value
 *
 * Demo script: clusters the ten two-dimensional sample points below into
 * three groups and dumps the result.
 *
 * @see http://phpir.com/clustering
 */

// The constant name has to be a quoted string: a bare WITH_NORMALIZE is read
// as an (undefined) constant, which is an Error on PHP 8.
define('WITH_NORMALIZE', false);

// Sample documents; in every row the two coordinates add up to 1.
$data = array(
    array(0.05, 0.95),
    array(0.1, 0.9),
    array(0.2, 0.8),
    array(0.25, 0.75),
    array(0.45, 0.55),
    array(0.5, 0.5),
    array(0.55, 0.45),
    array(0.85, 0.15),
    array(0.9, 0.1),
    array(0.95, 0.05)
);

var_dump(kMeans($data, 3, WITH_NORMALIZE));
|
||
/**
 * Build $k random starting centroids inside the bounding box of $data.
 *
 * @param array $data      list of documents, each an array of coordinates;
 *                         the dimension count is taken from $data[0]
 * @param int   $k         number of centroids to create
 * @param bool  $normalize forwarded unchanged to initialiseCentroid()
 *
 * @return array centroids keyed 0 .. $k-1
 */
function initialiseCentroids(array $data, $k, $normalize = false) {
    $dimensions = count($data[0]);
    $lower = array();
    $upper = array();

    // Track the smallest and largest value seen on every axis.
    foreach ($data as $point) {
        foreach ($point as $axis => $coord) {
            if (!isset($lower[$axis]) || $coord < $lower[$axis]) {
                $lower[$axis] = $coord;
            }
            if (!isset($upper[$axis]) || $coord > $upper[$axis]) {
                $upper[$axis] = $coord;
            }
        }
    }

    $centroids = array();
    $index = 0;
    while ($index < $k) {
        $centroids[$index] = initialiseCentroid($dimensions, $upper, $lower, $normalize);
        $index++;
    }

    return $centroids;
}
|
||
/**
 * Draw one random centroid between the per-dimension bounds.
 *
 * rand() only produces integers, so the bounds are multiplied by 1000 before
 * the draw. The result is then brought back to the data's scale: divided by
 * 1000 when $normalize is exactly false, otherwise divided by the sum of the
 * drawn values so that the coordinates add up to 1.
 *
 * @param int   $dimensions number of coordinates to generate
 * @param array $dimmax     upper bound per dimension, keyed 0 .. $dimensions-1
 * @param array $dimmin     lower bound per dimension, keyed 0 .. $dimensions-1
 * @param bool  $normalize  anything other than boolean false selects the
 *                          sum-to-one normalisation
 *
 * @return array the centroid's coordinates, keyed 0 .. $dimensions-1
 */
function initialiseCentroid($dimensions, $dimmax, $dimmin, $normalize = false) {
    $scale = 1000;
    $total = 0;
    $centroid = array();

    for ($j = 0; $j < $dimensions; $j++) {
        // rand() wants integers; rounding avoids float noise such as
        // 0.55 * 1000 = 550.0000000000001.
        $low = (int) round($dimmin[$j] * $scale);
        $high = (int) round($dimmax[$j] * $scale);
        $centroid[$j] = rand($low, $high);
        $total += $centroid[$j];
    }

    // Without normalisation the draw must be scaled back, otherwise the
    // centroid sits about 1000 times outside the data range. The same path is
    // the fallback when the sum is 0 and cannot serve as a divisor.
    if (false === $normalize || 0 == $total) {
        foreach ($centroid as $j => $value) {
            $centroid[$j] = $value / $scale;
        }
        return $centroid;
    }

    return normaliseValue($centroid, $total);
}
|
||
/**
 * Cluster $data into $k groups with the k-means procedure.
 *
 * Repeats "assign every document to its closest centroid, then move each
 * centroid to the mean of its documents" until a full pass leaves every
 * assignment unchanged.
 *
 * @param array $data      list of documents, each an array of coordinates
 * @param int   $k         number of clusters
 * @param bool  $normalize forwarded to initialiseCentroids() for the
 *                         starting centroids
 *
 * @return array result of formatResults(): the final centroids under the
 *               key 'centroids' plus one list of documents per centroid ID
 */
function kMeans($data, $k, $normalize = false) {
    // Pass the caller's flag through; writing "$normalize = false" in the
    // argument list would overwrite it and always disable normalisation.
    $centroids = initialiseCentroids($data, $k, $normalize);
    $mapping = array();

    while (true) {
        $new_mapping = assignCentroids($data, $centroids);

        // Converged only when the assignment of EVERY document is unchanged,
        // not just the first one.
        if ($new_mapping === $mapping) {
            return formatResults($mapping, $data, $centroids);
        }

        $mapping = $new_mapping;
        $centroids = updateCentroids($mapping, $data, $k);
    }
}
|
||
/**
 * Arrange the clustering outcome for display.
 *
 * @param array $mapping   document ID => centroid ID
 * @param array $data      documents, keyed by document ID
 * @param array $centroids final centroids
 *
 * @return array 'centroids' => $centroids, plus for every centroid ID a list
 *               of its documents, each rendered as a comma-joined string
 */
function formatResults($mapping, $data, $centroids) {
    $result = array('centroids' => $centroids);

    foreach ($mapping as $documentID => $centroidID) {
        $line = implode(',', $data[$documentID]);
        $result[$centroidID][] = $line;
    }

    return $result;
}
|
||
/**
 * Map every document to its closest centroid.
 *
 * Closeness is the sum of absolute coordinate differences (Manhattan
 * distance), measured over the centroid's dimensions. On a tie the centroid
 * that comes first in $centroids wins.
 *
 * @param array $data      documents, keyed by document ID
 * @param array $centroids centroids, keyed by centroid ID
 *
 * @return array document ID => centroid ID (null only when $centroids is empty)
 */
function assignCentroids($data, $centroids) {
    $mapping = array();

    foreach ($data as $documentID => $document) {
        // Start with "no distance yet" rather than a fixed threshold, so that
        // documents far away from every centroid are still assigned.
        $minDist = null;
        $minCentroid = null;

        foreach ($centroids as $centroidID => $centroid) {
            $dist = 0;
            foreach ($centroid as $dim => $value) {
                $dist += abs($value - $document[$dim]);
            }
            if (null === $minDist || $dist < $minDist) {
                $minDist = $dist;
                $minCentroid = $centroidID;
            }
        }

        $mapping[$documentID] = $minCentroid;
    }

    return $mapping;
}
|
||
/**
 * Move every centroid to the mean of the documents assigned to it.
 *
 * Centroid IDs in 0 .. $k-1 that received no document get a fresh random
 * centroid from initialiseCentroids(). Existing IDs are kept as they are, so
 * the returned array stays aligned with the IDs used in $mapping.
 *
 * @param array $mapping document ID => centroid ID (null = unassigned)
 * @param array $data    documents, keyed by document ID
 * @param int   $k       number of centroids expected
 *
 * @return array centroids keyed by centroid ID, sorted by ID
 */
function updateCentroids($mapping, $data, $k) {
    $centroids = array();

    // Count members per centroid by hand: array_count_values() warns on the
    // null entries an unassigned document carries.
    $counts = array();
    foreach ($mapping as $centroidID) {
        if (null === $centroidID) {
            continue;
        }
        $counts[$centroidID] = isset($counts[$centroidID]) ? $counts[$centroidID] + 1 : 1;
    }

    foreach ($mapping as $documentID => $centroidID) {
        if (null === $centroidID) {
            continue; // unassigned documents do not pull on any centroid
        }
        foreach ($data[$documentID] as $dim => $value) {
            if (!isset($centroids[$centroidID][$dim])) {
                $centroids[$centroidID][$dim] = 0;
            }
            // Adding value/count for each member accumulates the mean.
            $centroids[$centroidID][$dim] += ($value / $counts[$centroidID]);
        }
    }

    // Refill exactly the IDs that ended up empty; array_merge() would
    // renumber the surviving centroids instead.
    $missingIDs = array();
    for ($id = 0; $id < $k; $id++) {
        if (!isset($centroids[$id])) {
            $missingIDs[] = $id;
        }
    }
    if (count($missingIDs) > 0) {
        $fresh = initialiseCentroids($data, count($missingIDs));
        foreach ($missingIDs as $position => $id) {
            $centroids[$id] = $fresh[$position];
        }
    }

    ksort($centroids);

    return $centroids;
}
|
||
/**
 * Divide every element of $vector by $total.
 *
 * @param array     $vector values to scale; keys are preserved
 * @param int|float $total  divisor, typically the sum of $vector
 *
 * @return array the scaled copy; $vector is returned unchanged when $total
 *               is 0, because there is nothing meaningful to divide by
 */
function normaliseValue(array $vector, $total) {
    if (0 == $total) {
        return $vector;
    }

    // Write back by key; a by-reference loop variable is not needed here.
    foreach ($vector as $key => $value) {
        $vector[$key] = $value / $total;
    }

    return $vector;
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
|
||
# clustering.py contains classes and functions that cluster data points | ||
import sys, math, random | ||
|
||
|
||
# -- The Point class represents points in n-dimensional space
class Point(object):
    """A point in n-dimensional space.

    Instance variables
      self.coords    -- list of coordinates for this Point
      self.n         -- number of dimensions this Point lives in (its space)
      self.reference -- an object bound to this Point
    """

    def __init__(self, coords, reference=None):
        """Store the coordinates as given and derive n from their count."""
        self.coords = coords
        self.n = len(coords)
        self.reference = reference

    def __repr__(self):
        """Return the coordinate list rendered as a string."""
        return str(self.coords)
|
||
|
||
# -- The Cluster class represents clusters of points in n-dimensional space
class Cluster:
    """A group of Points that share one space, together with their centroid.

    Instance variables
      self.points   -- list of Points associated with this Cluster
      self.n        -- number of dimensions this Cluster's Points live in
      self.centroid -- the sample mean Point of this Cluster
    """

    def __init__(self, points):
        """Build a Cluster around a non-empty list of Points.

        Raises Exception when the list is empty or when the Points do not
        all have the same number of dimensions.
        """
        # We forbid constructing empty Clusters (no mean can be computed)
        if len(points) == 0:
            raise Exception("ILLEGAL: EMPTY CLUSTER")
        self.points = points
        self.n = points[0].n
        # We also forbid Clusters containing Points in different spaces
        # Ie, no Clusters with 2D Points and 3D Points
        for p in points:
            if p.n != self.n:
                raise Exception("ILLEGAL: MULTISPACE CLUSTER")
        # Figure out what the centroid of this Cluster should be
        self.centroid = self.calculateCentroid()

    def __repr__(self):
        """Return the list of member Points rendered as a string."""
        return str(self.points)

    def update(self, points):
        """Replace the member Points and return how far the centroid moved.

        An empty list is accepted: the Cluster then keeps its current
        centroid and the reported shift is 0.0.
        """
        old_centroid = self.centroid
        self.points = points
        self.centroid = self.calculateCentroid()
        return getDistance(old_centroid, self.centroid)

    def calculateCentroid(self):
        """Return the sample mean Point of the current members.

        With no members there is nothing to average (and dividing by the
        member count would fail), so the existing centroid is returned.
        """
        if not self.points:
            return self.centroid
        # float() keeps the division exact on Python 2 even for int coords
        count = float(len(self.points))
        centroid_coords = [
            sum(p.coords[i] for p in self.points) / count
            for i in range(self.n)
        ]
        return Point(centroid_coords)
|
||
|
||
# -- Return Clusters of Points formed by K-means clustering
def kmeans(points, k, cutoff):
    """Partition points into k Clusters with the K-means algorithm.

    points -- list of Point objects living in the same space
    k      -- number of Clusters to build; random.sample raises ValueError
              when k is larger than len(points)
    cutoff -- the loop stops after a pass in which every centroid moved
              by less than this distance

    Returns the list of Cluster objects.
    """
    # Randomly sample k Points and build a one-Point Cluster around each
    clusters = [Cluster([seed]) for seed in random.sample(points, k)]
    while True:
        # One bucket of Points per Cluster for this pass
        buckets = [[] for _ in clusters]
        for p in points:
            # min() keeps the first minimum, so ties go to the lowest index
            nearest = min(range(len(clusters)),
                          key=lambda i: getDistance(p, clusters[i].centroid))
            buckets[nearest].append(p)
        # A Cluster may lose every Point; hand it one before updating, since
        # a centroid cannot be computed from an empty list
        _reseedEmptyBuckets(buckets, clusters)
        # Update each Cluster and record the biggest centroid shift
        biggest_shift = 0.0
        for cluster, bucket in zip(clusters, buckets):
            if not bucket:
                # Defensive: nothing could be spared for this Cluster
                continue
            biggest_shift = max(biggest_shift, cluster.update(bucket))
        # If the biggest centroid shift is less than the cutoff, stop
        if biggest_shift < cutoff:
            break
    return clusters


# -- Give each empty bucket the farthest outlier of the fullest bucket
def _reseedEmptyBuckets(buckets, clusters):
    """Move one Point into every empty bucket, modifying buckets in place.

    The Point taken is the one farthest from the centroid of the bucket that
    currently holds the most Points; buckets with fewer than two Points are
    never used as donors.
    """
    for bucket in buckets:
        if bucket:
            continue
        donor = max(range(len(buckets)), key=lambda i: len(buckets[i]))
        source = buckets[donor]
        if len(source) < 2:
            continue
        centroid = clusters[donor].centroid
        # Select by position so no equality test between Points is needed
        outlier = max(range(len(source)),
                      key=lambda j: getDistance(source[j], centroid))
        bucket.append(source.pop(outlier))
|
||
|
||
# -- Get the Euclidean distance between two Points
def getDistance(a, b):
    """Return the Euclidean distance between Points a and b.

    Both arguments need an ``n`` attribute (dimension count) and a
    ``coords`` sequence. Raises Exception when the dimension counts differ.
    """
    # Forbid measurements between Points in different spaces
    if a.n != b.n:
        raise Exception("ILLEGAL: NON-COMPARABLE POINTS")
    # Euclidean distance between a and b is sqrt(sum((a[i]-b[i])^2) for all i)
    squares = ((x - y) ** 2 for x, y in zip(a.coords, b.coords))
    return math.sqrt(sum(squares, 0.0))
|
||
# -- Create a random Point in n-dimensional space
def makeRandomPoint(n, lower, upper):
    """Return a Point with n coordinates, each from random.uniform(lower, upper)."""
    return Point([random.uniform(lower, upper) for _ in range(n)])
|
||
# -- Main function
def main(args):
    """Cluster ten random 2-D Points into three Clusters and print both.

    args -- command-line argument list; accepted but not read
    """
    num_points, n, k, cutoff, lower, upper = 10, 2, 3, 0.5, -200, 200
    # Create num_points random Points in n-dimensional space
    points = [makeRandomPoint(n, lower, upper) for _ in range(num_points)]
    # Cluster the points using the K-means algorithm
    clusters = kmeans(points, k, cutoff)
    # Print the results. print(...) around one pre-formatted string parses
    # the same as a Python 2 statement and as a Python 3 function call.
    print("\nPOINTS:")
    for p in points:
        print("P: %s" % (p,))
    print("\nCLUSTERS:")
    for c in clusters:
        print("C: %s" % (c,))


# -- The following code executes upon command-line invocation
if __name__ == "__main__":
    main(sys.argv)
|
Binary file not shown.
Oops, something went wrong.