From 4b2ca49d4f899693cd761ab52072a6ab1e74edfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 10 Jul 2023 17:07:57 +0200 Subject: [PATCH 01/82] first commit --- calinskiharabasz.jl | 59 +++++++++++++++++++++++++++++++++++++++++++++ daviesbouldin.jl | 21 ++++++++++++++++ dunn.jl | 20 +++++++++++++++ mytest.jl | 35 +++++++++++++++++++++++++++ qualityindices.jl | 44 +++++++++++++++++++++++++++++++++ xiebeni.jl | 51 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 230 insertions(+) create mode 100644 calinskiharabasz.jl create mode 100644 daviesbouldin.jl create mode 100644 dunn.jl create mode 100644 mytest.jl create mode 100644 qualityindices.jl create mode 100644 xiebeni.jl diff --git a/calinskiharabasz.jl b/calinskiharabasz.jl new file mode 100644 index 00000000..1abf38ee --- /dev/null +++ b/calinskiharabasz.jl @@ -0,0 +1,59 @@ +using Distances +# 1-based indexing + +function calinski_harabasz( + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:AbstractFloat}, + assignments::AbstractVector{<:Integer}, + distance::SemiMetric=SqEuclidean() + ) + + _check_qualityindex_argument(X,centers,assignments) + + n, k = size(X, 2), size(centers,2) + + innerInertia = sum( + sum(colwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k + ) + + counts = [count(==(j)assignment) for j in 1:k] + globalCenter = mean(X, dims=2) + outerInertia = sum( + counts[j] * distance(centers[:, j], globalCenter) for j in 1:k + ) + + return (outerInertia / (k - 1)) / (innerInertia / (n - k)) +end + +calinski_harabasz(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = + calinski_harabasz(X, R.centers, R.assignments, distance) + + + +function calinski_harabasz( + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:AbstractFloat}, + weights::AbstractMatrix{<:AbstractFloat}, + fuzziness::Real, + distance::SemiMetric=SqEuclidean() + ) + + _check_qualityindex_argument(X, centers, weights, fuzziness) + + n, k = size(X, 2), size(centers,2) + + innerInertia = sum( + weights[i,j]^fuzziness * distance(X[:,i],centers[:,j]) for i in 1:n, j in 1:k + ) + + globalCenter = mean(X, dims=2)[:] + centerDistances = colwise(distance, centers, globalCenter) + outerInertia = sum( + weights[i,j]^fuzziness * centerDistances[j] for i in 1:n, j in 1:k + ) + + return (outerInertia / (k - 1)) / (innerInertia / (n - k)) +end + +calinski_harabasz(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean()) = + calinski_harabasz(X, R.centers, R.weights, fuzziness, distance) diff --git a/daviesbouldin.jl b/daviesbouldin.jl new file mode 100644 index 00000000..9aa327a2 --- /dev/null +++ b/daviesbouldin.jl @@ -0,0 +1,21 @@ + + +function davies_bouldin( + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:AbstractFloat}, + assignments::AbstractVector{<:Integer}, + distance::SemiMetric=SqEuclidean() + ) + + _check_qualityindex_argument(X, centers, assignments) + + n, k = size(X, 2), size(centers,2) + + centerDiameters = [mean(colwise(distance,view(X, :, assignments .== j), centers[:,j])) for j in 1:k ] + centerDistances = pairwise(distance,centers) + + return maximum( (centerDiameters[j₁] + centerDiameters[j₂]) / centerDistances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k ) / k +end + +davies_bouldin(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = + davies_bouldin(X, R.centers, R.assignments, distance) \ No newline at end of file diff --git 
a/dunn.jl b/dunn.jl new file mode 100644 index 00000000..24fbbbc5 --- /dev/null +++ b/dunn.jl @@ -0,0 +1,20 @@ + + + + +function dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) + + _check_qualityindex_argument(assignments, dist) + + n = size(dist, 1) + k = maximum(assignments) +end + + +dunn(X::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean()) = + dunn(assignments, pairwise(distance,X)) + +dunn(X::AbstractMatrix{<:Real}, R::ClusteringResult, distance::SemiMetric=SqEuclidean()) = + dunn(X, R.assignments, distance) + +dunn(R::ClusteringResult, dist::AbstractMatrix{<:Real}) = dunn(R.assignments, dist) \ No newline at end of file diff --git a/mytest.jl b/mytest.jl new file mode 100644 index 00000000..9d1d5e02 --- /dev/null +++ b/mytest.jl @@ -0,0 +1,35 @@ +using Plots, Clustering, Statistics + +X = Matrix{Float64}(undef,2,20) + +for k in 1:10 + X[:,k] = [4,5] .+ 0.2randn(2) +end +for k in 11:15 + X[:,k] = [9,-5] .+ 0.2randn(2) +end +for k in 15:20 + X[:,k] = [-4,-9] .+ 0.5randn(2) +end + + +scatter(X[1,:],X[2,:], + label = nothing, +) + +## +resf = fuzzy_cmeans(X,3,2) + +res = kmeans(X,3) + +q = [calinski_harabasz(X,kmeans(X,k)) for k in 2:5] +q = [xie_beni(X,kmeans(X,k)) for k in 2:5] +q = [davies_bouldin(X,kmeans(X,k)) for k in 2:5] + +qf = [calinski_harabasz(X,fuzzy_cmeans(X,k,2), 2) for k in 2:5] +qf = [xie_beni(X,fuzzy_cmeans(X,k,2), 2) for k in 2:5] + +plot(2:5,q) + + +calinski_harabasz(X,res) \ No newline at end of file diff --git a/qualityindices.jl b/qualityindices.jl new file mode 100644 index 00000000..b1c0a195 --- /dev/null +++ b/qualityindices.jl @@ -0,0 +1,44 @@ + + + +function _check_qualityindex_argument( + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:AbstractFloat}, + assignments::AbstractVector{<:Integer}, + ) + d, n = size(X) + dc, k = size(centers) + + d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `X` and `centers`.")) + (1 <= k <= n) || throw(ArgumentError("Cluster number k must be from 1:n (n=$n), k=$k given.")) + k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) + n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) + for j = 1:n + (1 <= assignments[j] <= k) || throw(ArgumentError("Bad assignments[$j]=$(assignments[j]): should be in 1:$k range.")) + end +end + +function _check_qualityindex_argument( + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:AbstractFloat}, + weights::AbstractMatrix{<:AbstractFloat}, + fuzziness::Real, + ) + d, n = size(X) + dc, k = size(centers) + nw, kw = size(weights) + + d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `X` and `centers`.")) + n == nw || throw(DimensionMismatch("Inconsistent data length for `X` and `weights`.")) + k == kw || throw(DimensionMismatch("Inconsistent number of clusters for `centers` and `weights`.")) + (1 <= k <= n) || throw(ArgumentError("Cluster number k must be from 1:n (n=$n), k=$k given.")) + k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) + n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) + all(>=(zero(eltype(weights))), weights) || throw(ArgumentError("All weights must be larger or equal 0.")) + 1 < fuzziness || throw(ArgumentError("Fuzziness must be greater than 1 ($fuzziness 
given)")) +end + +function _check_qualityindex_argument(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) + n, m = size(dist) + n == m || throw(ArgumentError("Distance matrix must be square.")) +end \ No newline at end of file diff --git a/xiebeni.jl b/xiebeni.jl new file mode 100644 index 00000000..8ca63253 --- /dev/null +++ b/xiebeni.jl @@ -0,0 +1,51 @@ +using Distances +# 1-based indexing + +function xie_beni( + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:AbstractFloat}, + assignments::AbstractVector{<:Integer}, + distance::SemiMetric=SqEuclidean() + ) + + _check_qualityindex_argument(X, centers, assignments) + + n, k = size(X, 2), size(centers,2) + + innerInertia = sum( + sum(colwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k + ) + + centerDistances = pairwise(distance,centers) + minOuterDistance = minimum(centerDistances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) + return innerInertia / (n * minOuterDistance) +end + +xie_beni(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = + xie_beni(X, R.centers, R.assignments, distance) + + + +function xie_beni( + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:AbstractFloat}, + weights::AbstractMatrix{<:AbstractFloat}, + fuzziness::Real, + distance::SemiMetric=SqEuclidean() + ) + + _check_qualityindex_argument(X, centers, weights, fuzziness) + + n, k = size(X, 2), size(centers,2) + + innerInertia = sum( + weights[i,j]^fuzziness * distance(X[:,i],centers[:,j]) for i in 1:n, j in 1:k + ) + + centerDistances = pairwise(distance,centers) + minOuterDistance = minimum(centerDistances[i,j] for i in 1:k for j in i+1:k) + return innerInertia / (n * minOuterDistance) +end + +xie_beni(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean()) = + xie_beni(X, R.centers, R.weights, fuzziness, distance) From 19e12c78aa8a7b7ad02add521cd20f167ec3fec5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 10 Jul 2023 18:39:28 +0200 Subject: [PATCH 02/82] Update qualityindices.jl --- qualityindices.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qualityindices.jl b/qualityindices.jl index b1c0a195..d69a17a1 100644 --- a/qualityindices.jl +++ b/qualityindices.jl @@ -34,11 +34,13 @@ function _check_qualityindex_argument( (1 <= k <= n) || throw(ArgumentError("Cluster number k must be from 1:n (n=$n), k=$k given.")) k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) - all(>=(zero(eltype(weights))), weights) || throw(ArgumentError("All weights must be larger or equal 0.")) + all(>=(0), weights) || throw(ArgumentError("All weights must be larger or equal 0.")) 1 < fuzziness || throw(ArgumentError("Fuzziness must be greater than 1 ($fuzziness given)")) end function _check_qualityindex_argument(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) n, m = size(dist) + na = length(assignments) n == m || throw(ArgumentError("Distance matrix must be square.")) + n == na || throw(DimensionMismatch("Inconsistent array dimensions for distance matrix and assignments.")) end \ No newline at end of file From 2549d107d219247d9abda3bea6749eee17be7375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= 
<128084860+jaksle@users.noreply.github.com> Date: Mon, 10 Jul 2023 18:42:11 +0200 Subject: [PATCH 03/82] . --- calinskiharabasz.jl | 6 ++---- daviesbouldin.jl | 1 - dunn.jl | 1 - xiebeni.jl | 2 -- 4 files changed, 2 insertions(+), 8 deletions(-) diff --git a/calinskiharabasz.jl b/calinskiharabasz.jl index 1abf38ee..f5b2e4bb 100644 --- a/calinskiharabasz.jl +++ b/calinskiharabasz.jl @@ -7,8 +7,7 @@ function calinski_harabasz( assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean() ) - - _check_qualityindex_argument(X,centers,assignments) + _check_qualityindex_argument(X, centers, assignments) n, k = size(X, 2), size(centers,2) @@ -16,7 +15,7 @@ function calinski_harabasz( sum(colwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k ) - counts = [count(==(j)assignment) for j in 1:k] + counts = [count(==(j), assignments) for j in 1:k] globalCenter = mean(X, dims=2) outerInertia = sum( counts[j] * distance(centers[:, j], globalCenter) for j in 1:k @@ -37,7 +36,6 @@ function calinski_harabasz( fuzziness::Real, distance::SemiMetric=SqEuclidean() ) - _check_qualityindex_argument(X, centers, weights, fuzziness) n, k = size(X, 2), size(centers,2) diff --git a/daviesbouldin.jl b/daviesbouldin.jl index 9aa327a2..648ae3b1 100644 --- a/daviesbouldin.jl +++ b/daviesbouldin.jl @@ -6,7 +6,6 @@ function davies_bouldin( assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean() ) - _check_qualityindex_argument(X, centers, assignments) n, k = size(X, 2), size(centers,2) diff --git a/dunn.jl b/dunn.jl index 24fbbbc5..5cc60b84 100644 --- a/dunn.jl +++ b/dunn.jl @@ -3,7 +3,6 @@ function dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) - _check_qualityindex_argument(assignments, dist) n = size(dist, 1) diff --git a/xiebeni.jl b/xiebeni.jl index 8ca63253..b77d8e25 100644 --- a/xiebeni.jl +++ b/xiebeni.jl @@ -7,7 +7,6 @@ function xie_beni( assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean() ) - _check_qualityindex_argument(X, centers, assignments) n, k = size(X, 2), size(centers,2) @@ -33,7 +32,6 @@ function xie_beni( fuzziness::Real, distance::SemiMetric=SqEuclidean() ) - _check_qualityindex_argument(X, centers, weights, fuzziness) n, k = size(X, 2), size(centers,2) From 6909fcd7d428800755261e31e57f0f888851be60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 10 Jul 2023 18:42:48 +0200 Subject: [PATCH 04/82] Update mytest.jl --- mytest.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mytest.jl b/mytest.jl index 9d1d5e02..a5af67f0 100644 --- a/mytest.jl +++ b/mytest.jl @@ -26,8 +26,8 @@ q = [calinski_harabasz(X,kmeans(X,k)) for k in 2:5] q = [xie_beni(X,kmeans(X,k)) for k in 2:5] q = [davies_bouldin(X,kmeans(X,k)) for k in 2:5] -qf = [calinski_harabasz(X,fuzzy_cmeans(X,k,2), 2) for k in 2:5] -qf = [xie_beni(X,fuzzy_cmeans(X,k,2), 2) for k in 2:5] +q = [calinski_harabasz(X,fuzzy_cmeans(X,k,2), 2) for k in 2:5] +q = [xie_beni(X,fuzzy_cmeans(X,k,2), 2) for k in 2:5] plot(2:5,q) From 1caf2abd538f27cb435182d052e296a67acf2e4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 11 Jul 2023 13:29:13 +0200 Subject: [PATCH 05/82] refactor --- .vscode/settings.json | 1 + calinskiharabasz.jl | 57 ------------- daviesbouldin.jl | 20 ----- dunn.jl | 19 ----- qualityindices.jl | 194 +++++++++++++++++++++++++++++++++++++++--- xiebeni.jl | 49 ----------- 6 files 
changed, 185 insertions(+), 155 deletions(-) create mode 100644 .vscode/settings.json delete mode 100644 calinskiharabasz.jl delete mode 100644 daviesbouldin.jl delete mode 100644 dunn.jl delete mode 100644 xiebeni.jl diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..9e26dfee --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/calinskiharabasz.jl b/calinskiharabasz.jl deleted file mode 100644 index f5b2e4bb..00000000 --- a/calinskiharabasz.jl +++ /dev/null @@ -1,57 +0,0 @@ -using Distances -# 1-based indexing - -function calinski_harabasz( - X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:AbstractFloat}, - assignments::AbstractVector{<:Integer}, - distance::SemiMetric=SqEuclidean() - ) - _check_qualityindex_argument(X, centers, assignments) - - n, k = size(X, 2), size(centers,2) - - innerInertia = sum( - sum(colwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k - ) - - counts = [count(==(j), assignments) for j in 1:k] - globalCenter = mean(X, dims=2) - outerInertia = sum( - counts[j] * distance(centers[:, j], globalCenter) for j in 1:k - ) - - return (outerInertia / (k - 1)) / (innerInertia / (n - k)) -end - -calinski_harabasz(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = - calinski_harabasz(X, R.centers, R.assignments, distance) - - - -function calinski_harabasz( - X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:AbstractFloat}, - weights::AbstractMatrix{<:AbstractFloat}, - fuzziness::Real, - distance::SemiMetric=SqEuclidean() - ) - _check_qualityindex_argument(X, centers, weights, fuzziness) - - n, k = size(X, 2), size(centers,2) - - innerInertia = sum( - weights[i,j]^fuzziness * distance(X[:,i],centers[:,j]) for i in 1:n, j in 1:k - ) - - globalCenter = mean(X, dims=2)[:] - centerDistances = colwise(distance, centers, globalCenter) - outerInertia = sum( - weights[i,j]^fuzziness * centerDistances[j] for i in 1:n, j in 1:k - ) - - return (outerInertia / (k - 1)) / (innerInertia / (n - k)) -end - -calinski_harabasz(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean()) = - calinski_harabasz(X, R.centers, R.weights, fuzziness, distance) diff --git a/daviesbouldin.jl b/daviesbouldin.jl deleted file mode 100644 index 648ae3b1..00000000 --- a/daviesbouldin.jl +++ /dev/null @@ -1,20 +0,0 @@ - - -function davies_bouldin( - X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:AbstractFloat}, - assignments::AbstractVector{<:Integer}, - distance::SemiMetric=SqEuclidean() - ) - _check_qualityindex_argument(X, centers, assignments) - - n, k = size(X, 2), size(centers,2) - - centerDiameters = [mean(colwise(distance,view(X, :, assignments .== j), centers[:,j])) for j in 1:k ] - centerDistances = pairwise(distance,centers) - - return maximum( (centerDiameters[j₁] + centerDiameters[j₂]) / centerDistances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k ) / k -end - -davies_bouldin(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = - davies_bouldin(X, R.centers, R.assignments, distance) \ No newline at end of file diff --git a/dunn.jl b/dunn.jl deleted file mode 100644 index 5cc60b84..00000000 --- a/dunn.jl +++ /dev/null @@ -1,19 +0,0 @@ - - - - -function dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) - _check_qualityindex_argument(assignments, dist) - - n = size(dist, 1) - k = maximum(assignments) -end - - -dunn(X::AbstractMatrix{<:Real}, 
assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean()) = - dunn(assignments, pairwise(distance,X)) - -dunn(X::AbstractMatrix{<:Real}, R::ClusteringResult, distance::SemiMetric=SqEuclidean()) = - dunn(X, R.assignments, distance) - -dunn(R::ClusteringResult, dist::AbstractMatrix{<:Real}) = dunn(R.assignments, dist) \ No newline at end of file diff --git a/qualityindices.jl b/qualityindices.jl index d69a17a1..b7d8f59f 100644 --- a/qualityindices.jl +++ b/qualityindices.jl @@ -1,10 +1,12 @@ +using Distances +# 1-based indexing - +# argument checking function _check_qualityindex_argument( - X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:AbstractFloat}, - assignments::AbstractVector{<:Integer}, + X::AbstractMatrix{<:Real}, # data matrix (d x n) + centers::AbstractMatrix{<:AbstractFloat}, # cluster centers (d x k) + assignments::AbstractVector{<:Integer}, # assignments (n) ) d, n = size(X) dc, k = size(centers) @@ -19,10 +21,10 @@ function _check_qualityindex_argument( end function _check_qualityindex_argument( - X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:AbstractFloat}, - weights::AbstractMatrix{<:AbstractFloat}, - fuzziness::Real, + X::AbstractMatrix{<:Real}, # data matrix (d x n) + centers::AbstractMatrix{<:AbstractFloat}, # cluster centers (d x k) + weights::AbstractMatrix{<:AbstractFloat}, # assigned weights (n x C) + fuzziness::Real, # cluster fuzziness ) d, n = size(X) dc, k = size(centers) @@ -38,9 +40,181 @@ function _check_qualityindex_argument( 1 < fuzziness || throw(ArgumentError("Fuzziness must be greater than 1 ($fuzziness given)")) end -function _check_qualityindex_argument(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) +function _check_qualityindex_argument( + assignments::AbstractVector{<:Integer}, # assignments (n) + dist::AbstractMatrix{<:Real} # data distance matrix (n x n) + ) n, m = size(dist) na = length(assignments) n == m || throw(ArgumentError("Distance matrix must be square.")) n == na || throw(DimensionMismatch("Inconsistent array dimensions for distance matrix and assignments.")) -end \ No newline at end of file +end + +# Calinski-Harabasz index + +function calinski_harabasz( + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:AbstractFloat}, + assignments::AbstractVector{<:Integer}, + distance::SemiMetric=SqEuclidean() +) +_check_qualityindex_argument(X, centers, assignments) + +n, k = size(X, 2), size(centers,2) + +innerInertia = sum( + sum(colwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k +) + +counts = [count(==(j), assignments) for j in 1:k] +globalCenter = mean(X, dims=2) +outerInertia = sum( + counts[j] * distance(centers[:, j], globalCenter) for j in 1:k +) + +return (outerInertia / (k - 1)) / (innerInertia / (n - k)) +end + +calinski_harabasz(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = +calinski_harabasz(X, R.centers, R.assignments, distance) + + +function calinski_harabasz( + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:AbstractFloat}, + weights::AbstractMatrix{<:AbstractFloat}, + fuzziness::Real, + distance::SemiMetric=SqEuclidean() +) +_check_qualityindex_argument(X, centers, weights, fuzziness) + +n, k = size(X, 2), size(centers,2) + +innerInertia = sum( + weights[i,j]^fuzziness * distance(X[:,i],centers[:,j]) for i in 1:n, j in 1:k +) + +globalCenter = mean(X, dims=2)[:] +centerDistances = colwise(distance, centers, globalCenter) +outerInertia = sum( + weights[i,j]^fuzziness * centerDistances[j] for i in 1:n, j 
in 1:k +) + +return (outerInertia / (k - 1)) / (innerInertia / (n - k)) +end + +calinski_harabasz(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean()) = +calinski_harabasz(X, R.centers, R.weights, fuzziness, distance) + + +# Davies-Bouldin idex + +function davies_bouldin( + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:AbstractFloat}, + assignments::AbstractVector{<:Integer}, + distance::SemiMetric=SqEuclidean() +) +_check_qualityindex_argument(X, centers, assignments) + +n, k = size(X, 2), size(centers,2) + +centerDiameters = [mean(colwise(distance,view(X, :, assignments .== j), centers[:,j])) for j in 1:k ] +centerDistances = pairwise(distance,centers) + +return maximum( (centerDiameters[j₁] + centerDiameters[j₂]) / centerDistances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k ) / k +end + +davies_bouldin(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = +davies_bouldin(X, R.centers, R.assignments, distance) + + +# Xie-Beni index + +function xie_beni( + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:AbstractFloat}, + assignments::AbstractVector{<:Integer}, + distance::SemiMetric=SqEuclidean() + ) + _check_qualityindex_argument(X, centers, assignments) + + n, k = size(X, 2), size(centers,2) + + innerInertia = sum( + sum(colwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k + ) + + centerDistances = pairwise(distance,centers) + minOuterDistance = minimum(centerDistances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) + return innerInertia / (n * minOuterDistance) +end + +xie_beni(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = + xie_beni(X, R.centers, R.assignments, distance) + + +function xie_beni( + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:AbstractFloat}, + weights::AbstractMatrix{<:AbstractFloat}, + fuzziness::Real, + distance::SemiMetric=SqEuclidean() + ) + _check_qualityindex_argument(X, centers, weights, fuzziness) + + n, k = size(X, 2), size(centers,2) + + innerInertia = sum( + weights[i,j]^fuzziness * distance(X[:,i],centers[:,j]) for i in 1:n, j in 1:k + ) + + centerDistances = pairwise(distance,centers) + minOuterDistance = minimum(centerDistances[i,j] for i in 1:k for j in i+1:k) + return innerInertia / (n * minOuterDistance) +end + +xie_beni(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean()) = + xie_beni(X, R.centers, R.weights, fuzziness, distance) + + +# Dunn index + +function dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) + _check_qualityindex_argument(assignments, dist) + + n = size(dist, 1) + k = maximum(assignments) + + minOuterDistance = eltype(dist)(Inf) + + for j₁ in 1:k, j₂ in j₁+1:k + # δ is min distance between points from clusters j₁ and j₂ + δ = minimum(dist[i₁,i₂] for i₁ in findall(==(j₁), assignments), i₂ in findall(==(j₂), assignments)) + + if δ < minOuterDistance + minOuterDistance = δ + end + end + + maxInnerDistance = eltype(dist)(-Inf) + + for j in 1:k + # Δ is max distance between points in cluster j + Δ = maximum(dist[i₁,i₂] for i₁ in findall(==(j), assignments), i₂ in findall(==(j), assignments)) + + if Δ > maxInnerDistance + maxInnerDistance = Δ + end + end + + return minOuterDistance / maxInnerDistance +end + +dunn(X::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean()) = + dunn(assignments, pairwise(distance,X)) + +dunn(X::AbstractMatrix{<:Real}, R::ClusteringResult, 
distance::SemiMetric=SqEuclidean()) = + dunn(X, R.assignments, distance) + +dunn(R::ClusteringResult, dist::AbstractMatrix{<:Real}) = dunn(R.assignments, dist) \ No newline at end of file diff --git a/xiebeni.jl b/xiebeni.jl deleted file mode 100644 index b77d8e25..00000000 --- a/xiebeni.jl +++ /dev/null @@ -1,49 +0,0 @@ -using Distances -# 1-based indexing - -function xie_beni( - X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:AbstractFloat}, - assignments::AbstractVector{<:Integer}, - distance::SemiMetric=SqEuclidean() - ) - _check_qualityindex_argument(X, centers, assignments) - - n, k = size(X, 2), size(centers,2) - - innerInertia = sum( - sum(colwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k - ) - - centerDistances = pairwise(distance,centers) - minOuterDistance = minimum(centerDistances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) - return innerInertia / (n * minOuterDistance) -end - -xie_beni(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = - xie_beni(X, R.centers, R.assignments, distance) - - - -function xie_beni( - X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:AbstractFloat}, - weights::AbstractMatrix{<:AbstractFloat}, - fuzziness::Real, - distance::SemiMetric=SqEuclidean() - ) - _check_qualityindex_argument(X, centers, weights, fuzziness) - - n, k = size(X, 2), size(centers,2) - - innerInertia = sum( - weights[i,j]^fuzziness * distance(X[:,i],centers[:,j]) for i in 1:n, j in 1:k - ) - - centerDistances = pairwise(distance,centers) - minOuterDistance = minimum(centerDistances[i,j] for i in 1:k for j in i+1:k) - return innerInertia / (n * minOuterDistance) -end - -xie_beni(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean()) = - xie_beni(X, R.centers, R.weights, fuzziness, distance) From 326137fe99d26df86caf6ba7aa943fc6875ff288 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 11 Jul 2023 13:39:23 +0200 Subject: [PATCH 06/82] move --- qualityindices.jl => src/qualityindices.jl | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename qualityindices.jl => src/qualityindices.jl (100%) diff --git a/qualityindices.jl b/src/qualityindices.jl similarity index 100% rename from qualityindices.jl rename to src/qualityindices.jl From 6e849794d8ffacaed11b3f382218609f5e897558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 11 Jul 2023 16:38:08 +0200 Subject: [PATCH 07/82] last corrections --- mytest.jl | 47 ++++++++++++++++++++++++++++-- src/Clustering.jl | 7 +++-- src/qualityindices.jl | 66 ++++++++++++++++++++++--------------------- 3 files changed, 84 insertions(+), 36 deletions(-) diff --git a/mytest.jl b/mytest.jl index 9d1d5e02..758cb559 100644 --- a/mytest.jl +++ b/mytest.jl @@ -1,4 +1,4 @@ -using Plots, Clustering, Statistics +using Plots, Clustering, Distances, Statistics X = Matrix{Float64}(undef,2,20) @@ -25,11 +25,54 @@ res = kmeans(X,3) q = [calinski_harabasz(X,kmeans(X,k)) for k in 2:5] q = [xie_beni(X,kmeans(X,k)) for k in 2:5] q = [davies_bouldin(X,kmeans(X,k)) for k in 2:5] +q = [dunn(X,kmeans(X,k),SqEuclidean()) for k in 2:5] qf = [calinski_harabasz(X,fuzzy_cmeans(X,k,2), 2) for k in 2:5] qf = [xie_beni(X,fuzzy_cmeans(X,k,2), 2) for k in 2:5] plot(2:5,q) +## test data -calinski_harabasz(X,res) \ No newline at end of file +Y = [-2 4; 2 4; 2 1; 3 0; 2 -1; 1 0; 2 -4; -2 -4; -2 1; -1 0; -2 -1; -3 0] +C = [0 4; 2 0; 
0 -4; -2 0]
+A = [1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4]
+W = [
+    1 0 0 0
+    1 0 0 0
+    0 1 0 0
+    0 1 0 0
+    0 1 0 0
+    0 1 0 0
+    0 0 1 0
+    0 0 1 0
+    0 0 0 1
+    0 0 0 1
+    0 0 0 1
+    0 0 0 1
+]
+scatter(Y[:,1],Y[:,2],
+    axisratio = :equal,
+    #seriescolor = palette(default)[A],
+)
+scatter!(C[:,1],C[:,2],
+    marker = :square
+)
+
+## tests
+using Test
+
+@test_throws ArgumentError Clustering._check_qualityindex_argument(zeros(2,2), zeros(2,3), [1, 2])
+@test_throws DimensionMismatch Clustering._check_qualityindex_argument(zeros(2,2),zeros(3,2), [1, 2])
+@test_throws ArgumentError Clustering._check_qualityindex_argument(zeros(2,2),zeros(2,1), [1, ])
+@test_throws ArgumentError Clustering._check_qualityindex_argument(zeros(2,2),zeros(2,2), [1, 2])
+
+@test calinski_harabasz(Y',C',A,Euclidean()) ≈ (32/3) / (16/8)
+@test calinski_harabasz(Y',C',W,2,Euclidean()) ≈ (32/3) / (16/8)
+
+@test davies_bouldin(Y',C',A,Euclidean()) ≈ 3/2√5
+
+@test xie_beni(Y',C',A,Euclidean()) ≈ 1/3
+@test xie_beni(Y',C',W,2,Euclidean()) ≈ 1/3
+
+@test dunn(Y',A,Euclidean()) ≈ 1/2
\ No newline at end of file
diff --git a/src/Clustering.jl b/src/Clustering.jl
index bb7f44cd..3a0f29d0 100644
--- a/src/Clustering.jl
+++ b/src/Clustering.jl
@@ -68,7 +68,10 @@ module Clustering
     mcl, MCLResult,
 
     # pair confusion matrix
-    confusion
+    confusion,
+
+    # quality indices
+    calinski_harabasz, davies_bouldin, xie_beni, dunn
 
     ## source files
@@ -89,7 +92,7 @@ module Clustering
     include("vmeasure.jl")
     include("mutualinfo.jl")
     include("confusion.jl")
-
+    include("qualityindices.jl")
     include("hclust.jl")
     include("deprecate.jl")
diff --git a/src/qualityindices.jl b/src/qualityindices.jl
index b7d8f59f..c528fccd 100644
--- a/src/qualityindices.jl
+++ b/src/qualityindices.jl
@@ -1,18 +1,14 @@
-using Distances
-# 1-based indexing
-
-# argument checking
 function _check_qualityindex_argument(
     X::AbstractMatrix{<:Real},                # data matrix (d x n)
-    centers::AbstractMatrix{<:AbstractFloat}, # cluster centers (d x k)
+    centers::AbstractMatrix{<:Real},          # cluster centers (d x k)
     assignments::AbstractVector{<:Integer},   # assignments (n)
     )
     d, n = size(X)
     dc, k = size(centers)
 
     d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `X` and `centers`."))
-    (1 <= k
<= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) all(>=(0), weights) || throw(ArgumentError("All weights must be larger or equal 0.")) @@ -54,7 +50,7 @@ end function calinski_harabasz( X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:AbstractFloat}, + centers::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean() ) @@ -62,14 +58,15 @@ _check_qualityindex_argument(X, centers, assignments) n, k = size(X, 2), size(centers,2) -innerInertia = sum( - sum(colwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k -) - counts = [count(==(j), assignments) for j in 1:k] -globalCenter = mean(X, dims=2) +globalCenter = mean(X, dims=2)[:] +centerDistances = colwise(distance, centers, globalCenter) outerInertia = sum( - counts[j] * distance(centers[:, j], globalCenter) for j in 1:k + counts[j] * centerDistances[j] for j in 1:k +) + +innerInertia = sum( + sum(colwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k ) return (outerInertia / (k - 1)) / (innerInertia / (n - k)) @@ -81,8 +78,8 @@ calinski_harabasz(X, R.centers, R.assignments, distance) function calinski_harabasz( X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:AbstractFloat}, - weights::AbstractMatrix{<:AbstractFloat}, + centers::AbstractMatrix{<:Real}, + weights::AbstractMatrix{<:Real}, fuzziness::Real, distance::SemiMetric=SqEuclidean() ) @@ -90,16 +87,16 @@ _check_qualityindex_argument(X, centers, weights, fuzziness) n, k = size(X, 2), size(centers,2) -innerInertia = sum( - weights[i,j]^fuzziness * distance(X[:,i],centers[:,j]) for i in 1:n, j in 1:k -) - globalCenter = mean(X, dims=2)[:] centerDistances = colwise(distance, centers, globalCenter) outerInertia = sum( weights[i,j]^fuzziness * centerDistances[j] for i in 1:n, j in 1:k ) +innerInertia = sum( + weights[i,j]^fuzziness * distance(X[:,i],centers[:,j]) for i in 1:n, j in 1:k +) + return (outerInertia / (k - 1)) / (innerInertia / (n - k)) end @@ -111,18 +108,23 @@ calinski_harabasz(X, R.centers, R.weights, fuzziness, distance) function davies_bouldin( X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:AbstractFloat}, + centers::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean() -) -_check_qualityindex_argument(X, centers, assignments) + ) + _check_qualityindex_argument(X, centers, assignments) -n, k = size(X, 2), size(centers,2) + n, k = size(X, 2), size(centers,2) -centerDiameters = [mean(colwise(distance,view(X, :, assignments .== j), centers[:,j])) for j in 1:k ] -centerDistances = pairwise(distance,centers) + clusterDiameters = [mean(colwise(distance,view(X, :, assignments .== j), centers[:,j])) for j in 1:k ] + centerDistances = pairwise(distance,centers) + + DB = mean( + maximum( (clusterDiameters[j₁] + clusterDiameters[j₂]) / centerDistances[j₁,j₂] for j₂ in 1:k if j₂ ≠ j₁) + for j₁ in 1:k + ) -return maximum( (centerDiameters[j₁] + centerDiameters[j₂]) / centerDistances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k ) / k + return DB end davies_bouldin(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = @@ -133,7 +135,7 @@ davies_bouldin(X, R.centers, R.assignments, distance) function xie_beni( X::AbstractMatrix{<:Real}, - 
centers::AbstractMatrix{<:AbstractFloat}, + centers::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean() ) @@ -156,8 +158,8 @@ xie_beni(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEucl function xie_beni( X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:AbstractFloat}, - weights::AbstractMatrix{<:AbstractFloat}, + centers::AbstractMatrix{<:Real}, + weights::AbstractMatrix{<:Real}, fuzziness::Real, distance::SemiMetric=SqEuclidean() ) From b2c644c2d6068f6d702e38d966a8973253f3d556 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 11 Jul 2023 16:42:58 +0200 Subject: [PATCH 08/82] _arguments --- src/qualityindices.jl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index c528fccd..8bbbd7b3 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -1,5 +1,5 @@ -function _check_qualityindex_argument( +function _check_qualityindex_arguments( X::AbstractMatrix{<:Real}, # data matrix (d x n) centers::AbstractMatrix{<:Real}, # cluster centers (d x k) assignments::AbstractVector{<:Integer}, # assignments (n) @@ -16,7 +16,7 @@ function _check_qualityindex_argument( end end -function _check_qualityindex_argument( +function _check_qualityindex_arguments( X::AbstractMatrix{<:Real}, # data matrix (d x n) centers::AbstractMatrix{<:Real}, # cluster centers (d x k) weights::AbstractMatrix{<:Real}, # assigned weights (n x C) @@ -36,7 +36,7 @@ function _check_qualityindex_argument( 1 < fuzziness || throw(ArgumentError("Fuzziness must be greater than 1 ($fuzziness given)")) end -function _check_qualityindex_argument( +function _check_qualityindex_arguments( assignments::AbstractVector{<:Integer}, # assignments (n) dist::AbstractMatrix{<:Real} # data distance matrix (n x n) ) @@ -54,7 +54,7 @@ function calinski_harabasz( assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean() ) -_check_qualityindex_argument(X, centers, assignments) +_check_qualityindex_arguments(X, centers, assignments) n, k = size(X, 2), size(centers,2) @@ -83,7 +83,7 @@ function calinski_harabasz( fuzziness::Real, distance::SemiMetric=SqEuclidean() ) -_check_qualityindex_argument(X, centers, weights, fuzziness) +_check_qualityindex_arguments(X, centers, weights, fuzziness) n, k = size(X, 2), size(centers,2) @@ -112,7 +112,7 @@ function davies_bouldin( assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean() ) - _check_qualityindex_argument(X, centers, assignments) + _check_qualityindex_arguments(X, centers, assignments) n, k = size(X, 2), size(centers,2) @@ -128,7 +128,7 @@ function davies_bouldin( end davies_bouldin(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = -davies_bouldin(X, R.centers, R.assignments, distance) + davies_bouldin(X, R.centers, R.assignments, distance) # Xie-Beni index @@ -139,7 +139,7 @@ function xie_beni( assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean() ) - _check_qualityindex_argument(X, centers, assignments) + _check_qualityindex_arguments(X, centers, assignments) n, k = size(X, 2), size(centers,2) @@ -163,7 +163,7 @@ function xie_beni( fuzziness::Real, distance::SemiMetric=SqEuclidean() ) - _check_qualityindex_argument(X, centers, weights, fuzziness) + _check_qualityindex_arguments(X, centers, weights, fuzziness) n, k = size(X, 2), size(centers,2) @@ -183,7 +183,7 @@ 
xie_beni(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, dista # Dunn index function dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) - _check_qualityindex_argument(assignments, dist) + _check_qualityindex_arguments(assignments, dist) n = size(dist, 1) k = maximum(assignments) From 1ad7e677b135b87e982087f071d421e5d415bbda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 11 Jul 2023 16:45:56 +0200 Subject: [PATCH 09/82] Update qualityindices.jl --- src/qualityindices.jl | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index 8bbbd7b3..c4947215 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -77,27 +77,27 @@ calinski_harabasz(X, R.centers, R.assignments, distance) function calinski_harabasz( - X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:Real}, - weights::AbstractMatrix{<:Real}, - fuzziness::Real, - distance::SemiMetric=SqEuclidean() -) -_check_qualityindex_arguments(X, centers, weights, fuzziness) + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:Real}, + weights::AbstractMatrix{<:Real}, + fuzziness::Real, + distance::SemiMetric=SqEuclidean() + ) + _check_qualityindex_arguments(X, centers, weights, fuzziness) -n, k = size(X, 2), size(centers,2) + n, k = size(X, 2), size(centers,2) -globalCenter = mean(X, dims=2)[:] -centerDistances = colwise(distance, centers, globalCenter) -outerInertia = sum( - weights[i,j]^fuzziness * centerDistances[j] for i in 1:n, j in 1:k -) + globalCenter = mean(X, dims=2)[:] + centerDistances = colwise(distance, centers, globalCenter) + outerInertia = sum( + weights[i,j]^fuzziness * centerDistances[j] for i in 1:n, j in 1:k + ) -innerInertia = sum( - weights[i,j]^fuzziness * distance(X[:,i],centers[:,j]) for i in 1:n, j in 1:k -) + innerInertia = sum( + weights[i,j]^fuzziness * distance(X[:,i],centers[:,j]) for i in 1:n, j in 1:k + ) -return (outerInertia / (k - 1)) / (innerInertia / (n - k)) + return (outerInertia / (k - 1)) / (innerInertia / (n - k)) end calinski_harabasz(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean()) = @@ -107,10 +107,10 @@ calinski_harabasz(X, R.centers, R.weights, fuzziness, distance) # Davies-Bouldin idex function davies_bouldin( - X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:Real}, - assignments::AbstractVector{<:Integer}, - distance::SemiMetric=SqEuclidean() + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:Real}, + assignments::AbstractVector{<:Integer}, + distance::SemiMetric=SqEuclidean() ) _check_qualityindex_arguments(X, centers, assignments) From 67cc0052d70618ee642c69ad2f814acc4707af2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 11 Jul 2023 16:49:24 +0200 Subject: [PATCH 10/82] Update qualityindices.jl --- src/qualityindices.jl | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index c4947215..10c1d6cb 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -1,8 +1,8 @@ function _check_qualityindex_arguments( - X::AbstractMatrix{<:Real}, # data matrix (d x n) - centers::AbstractMatrix{<:Real}, # cluster centers (d x k) - assignments::AbstractVector{<:Integer}, # assignments (n) + X::AbstractMatrix{<:Real}, # data matrix (d x n) + 
centers::AbstractMatrix{<:Real}, # cluster centers (d x k) + assignments::AbstractVector{<:Integer}, # assignments (n) ) d, n = size(X) dc, k = size(centers) @@ -17,10 +17,10 @@ function _check_qualityindex_arguments( end function _check_qualityindex_arguments( - X::AbstractMatrix{<:Real}, # data matrix (d x n) + X::AbstractMatrix{<:Real}, # data matrix (d x n) centers::AbstractMatrix{<:Real}, # cluster centers (d x k) - weights::AbstractMatrix{<:Real}, # assigned weights (n x C) - fuzziness::Real, # cluster fuzziness + weights::AbstractMatrix{<:Real}, # assigned weights (n x k) + fuzziness::Real, # cluster fuzziness ) d, n = size(X) dc, k = size(centers) @@ -185,7 +185,6 @@ xie_beni(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, dista function dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) _check_qualityindex_arguments(assignments, dist) - n = size(dist, 1) k = maximum(assignments) minOuterDistance = eltype(dist)(Inf) From b23b998b2b3a47bb2816b776f5282945d4f5d05e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 11 Jul 2023 17:06:57 +0200 Subject: [PATCH 11/82] Update qualityindices.jl --- src/qualityindices.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index 10c1d6cb..920ed810 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -101,7 +101,7 @@ function calinski_harabasz( end calinski_harabasz(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean()) = -calinski_harabasz(X, R.centers, R.weights, fuzziness, distance) + calinski_harabasz(X, R.centers, R.weights, fuzziness, distance) # Davies-Bouldin idex From 713d01b85c7c6263baf950c909847b02a7968cfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 11 Jul 2023 17:08:27 +0200 Subject: [PATCH 12/82] Update qualityindices.jl --- src/qualityindices.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index 920ed810..605a046c 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -114,7 +114,7 @@ function davies_bouldin( ) _check_qualityindex_arguments(X, centers, assignments) - n, k = size(X, 2), size(centers,2) + k = size(centers,2) clusterDiameters = [mean(colwise(distance,view(X, :, assignments .== j), centers[:,j])) for j in 1:k ] centerDistances = pairwise(distance,centers) From a5c083103e5bd85d61bc035f86c4a5a30e05eb40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 11 Jul 2023 17:16:38 +0200 Subject: [PATCH 13/82] Update qualityindices.jl --- src/qualityindices.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index 605a046c..8c7ccfb6 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -187,7 +187,7 @@ function dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Rea k = maximum(assignments) - minOuterDistance = eltype(dist)(Inf) + minOuterDistance = typemax(eltype(dist)) for j₁ in 1:k, j₂ in j₁+1:k # δ is min distance between points from clusters j₁ and j₂ @@ -198,7 +198,7 @@ function dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Rea end end - maxInnerDistance = eltype(dist)(-Inf) + maxInnerDistance = typemin(eltype(dist)) for j in 1:k # Δ is max distance 
between points in cluster j From fa7a877cb44eb77c7efc09656594bb38b53652c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 11 Jul 2023 17:19:43 +0200 Subject: [PATCH 14/82] Update qualityindices.jl --- src/qualityindices.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index 8c7ccfb6..02a0caa8 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -11,7 +11,7 @@ function _check_qualityindex_arguments( (1 <= k <= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) - for j = 1:n + for j in 1:n (1 <= assignments[j] <= k) || throw(ArgumentError("Bad assignments[$j]=$(assignments[j]): should be in 1:$k range.")) end end From e5485db97c63425920a7cfe51e6a07feb12722c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 11 Jul 2023 19:35:28 +0200 Subject: [PATCH 15/82] Update qualityindices.jl --- src/qualityindices.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index 02a0caa8..8d73ebac 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -149,6 +149,7 @@ function xie_beni( centerDistances = pairwise(distance,centers) minOuterDistance = minimum(centerDistances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) + return innerInertia / (n * minOuterDistance) end @@ -172,7 +173,8 @@ function xie_beni( ) centerDistances = pairwise(distance,centers) - minOuterDistance = minimum(centerDistances[i,j] for i in 1:k for j in i+1:k) + minOuterDistance = minimum(centerDistances[i,j] for j₁ in 1:k for j₂ in j₁+1:k) + return innerInertia / (n * minOuterDistance) end From 439647fec3bdc117c032a60a636da8845e3e417a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Wed, 12 Jul 2023 13:03:23 +0200 Subject: [PATCH 16/82] indices correction --- mytest.jl | 8 ++++---- src/qualityindices.jl | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/mytest.jl b/mytest.jl index 758cb559..e7d61161 100644 --- a/mytest.jl +++ b/mytest.jl @@ -62,10 +62,10 @@ scatter!(C[:,1],C[:,2], ## tests using Test -@test_throws ArgumentError Clustering._check_qualityindex_argument(zeros(2,2), zeros(2,3), [1, 2]) -@test_throws DimensionMismatch Clustering._check_qualityindex_argument(zeros(2,2),zeros(3,2), [1, 2]) -@test_throws ArgumentError Clustering._check_qualityindex_argument(zeros(2,2),zeros(2,1), [1, ]) -@test_throws ArgumentError Clustering._check_qualityindex_argument(zeros(2,2),zeros(2,2), [1, 2]) +@test_throws ArgumentError Clustering._check_qualityindex_arguments(zeros(2,2), zeros(2,3), [1, 2]) +@test_throws DimensionMismatch Clustering._check_qualityindex_arguments(zeros(2,2),zeros(3,2), [1, 2]) +@test_throws ArgumentError Clustering._check_qualityindex_arguments(zeros(2,2),zeros(2,1), [1, ]) +@test_throws ArgumentError Clustering._check_qualityindex_arguments(zeros(2,2),zeros(2,2), [1, 2]) @test calinski_harabasz(Y',C',A,Euclidean()) ≈ (32/3) / (16/8) @test calinski_harabasz(Y',C',W,2,Euclidean()) ≈ (32/3) / (16/8) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index 
8d73ebac..4b91abe9 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -93,8 +93,9 @@ function calinski_harabasz( weights[i,j]^fuzziness * centerDistances[j] for i in 1:n, j in 1:k ) + pointCentreDistances = pairwise(distance,X,centers) innerInertia = sum( - weights[i,j]^fuzziness * distance(X[:,i],centers[:,j]) for i in 1:n, j in 1:k + weights[i,j]^fuzziness * pointCentreDistances[i,j] for i in 1:n, j in 1:k ) return (outerInertia / (k - 1)) / (innerInertia / (n - k)) @@ -173,7 +174,7 @@ function xie_beni( ) centerDistances = pairwise(distance,centers) - minOuterDistance = minimum(centerDistances[i,j] for j₁ in 1:k for j₂ in j₁+1:k) + minOuterDistance = minimum(centerDistances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) return innerInertia / (n * minOuterDistance) end From 6f2ec3e84eb46eb6442dfcbf65be32a65480c714 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Fri, 14 Jul 2023 13:35:58 +0200 Subject: [PATCH 17/82] small corrections --- .vscode/settings.json | 1 - src/qualityindices.jl | 23 +++++++++++------------ 2 files changed, 11 insertions(+), 13 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 9e26dfee..00000000 --- a/.vscode/settings.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/src/qualityindices.jl b/src/qualityindices.jl index 4b91abe9..ed549227 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -59,14 +59,12 @@ _check_qualityindex_arguments(X, centers, assignments) n, k = size(X, 2), size(centers,2) counts = [count(==(j), assignments) for j in 1:k] -globalCenter = mean(X, dims=2)[:] -centerDistances = colwise(distance, centers, globalCenter) -outerInertia = sum( - counts[j] * centerDistances[j] for j in 1:k -) +globalCenter = vec(mean(X, dims=2)) +centerDistances = pairwise(distance, centers, globalCenter) +outerInertia = counts ⋅ centerDistances innerInertia = sum( - sum(colwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k + sum(pairwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k ) return (outerInertia / (k - 1)) / (innerInertia / (n - k)) @@ -87,13 +85,13 @@ function calinski_harabasz( n, k = size(X, 2), size(centers,2) - globalCenter = mean(X, dims=2)[:] - centerDistances = colwise(distance, centers, globalCenter) + globalCenter = vec(mean(X, dims=2)) + centerDistances = pairwise(distance, centers, globalCenter) outerInertia = sum( weights[i,j]^fuzziness * centerDistances[j] for i in 1:n, j in 1:k ) - pointCentreDistances = pairwise(distance,X,centers) + pointCentreDistances = pairwise(distance, X, centers) innerInertia = sum( weights[i,j]^fuzziness * pointCentreDistances[i,j] for i in 1:n, j in 1:k ) @@ -117,7 +115,7 @@ function davies_bouldin( k = size(centers,2) - clusterDiameters = [mean(colwise(distance,view(X, :, assignments .== j), centers[:,j])) for j in 1:k ] + clusterDiameters = [mean(pairwise(distance,view(X, :, assignments .== j), centers[:,j])) for j in 1:k ] centerDistances = pairwise(distance,centers) DB = mean( @@ -145,7 +143,7 @@ function xie_beni( n, k = size(X, 2), size(centers,2) innerInertia = sum( - sum(colwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k + sum(pairwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k ) centerDistances = pairwise(distance,centers) @@ -169,8 +167,9 @@ function xie_beni( n, k = size(X, 2), size(centers,2) + 
pointCentreDistances = pairwise(distance, X, centers) innerInertia = sum( - weights[i,j]^fuzziness * distance(X[:,i],centers[:,j]) for i in 1:n, j in 1:k + weights[i,j]^fuzziness * pointCentreDistances[i,j] for i in 1:n, j in 1:k ) centerDistances = pairwise(distance,centers) From 6ac56fc745c0fb3db01d6dcfb5332c986856a55d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Fri, 14 Jul 2023 13:38:37 +0200 Subject: [PATCH 18/82] Update src/qualityindices.jl Co-authored-by: Alexey Stukalov --- src/qualityindices.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index ed549227..bfcb3130 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -67,7 +67,7 @@ innerInertia = sum( sum(pairwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k ) -return (outerInertia / (k - 1)) / (innerInertia / (n - k)) + return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end calinski_harabasz(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = From 7f4787185af5ce6e5c11891d6e872b4433d2268d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Fri, 14 Jul 2023 22:51:21 +0200 Subject: [PATCH 19/82] Update mytest.jl Co-authored-by: Alexey Stukalov --- mytest.jl | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mytest.jl b/mytest.jl index e7d61161..9329b071 100644 --- a/mytest.jl +++ b/mytest.jl @@ -1,16 +1,8 @@ using Plots, Clustering, Distances, Statistics -X = Matrix{Float64}(undef,2,20) - -for k in 1:10 - X[:,k] = [4,5] .+ 0.2randn(2) -end -for k in 11:15 - X[:,k] = [9,-5] .+ 0.2randn(2) -end -for k in 15:20 - X[:,k] = [-4,-9] .+ 0.5randn(2) -end +X = hcat([4., 5.] .+ 0.2 * randn(2, 10), + [9., -5.] .+ 0.2 * randn(2, 5), + [-4., -9.] 
.+ 0.5 * randn(2, 5)) scatter(X[1,:],X[2,:], From fad8349eb88d944c5ba747fe390c4a558cab5c13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Fri, 14 Jul 2023 23:04:45 +0200 Subject: [PATCH 20/82] Update qualityindices.jl --- src/qualityindices.jl | 56 +++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index bfcb3130..64d0900a 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -59,11 +59,11 @@ _check_qualityindex_arguments(X, centers, assignments) n, k = size(X, 2), size(centers,2) counts = [count(==(j), assignments) for j in 1:k] -globalCenter = vec(mean(X, dims=2)) -centerDistances = pairwise(distance, centers, globalCenter) -outerInertia = counts ⋅ centerDistances +global_center = vec(mean(X, dims=2)) +center_distances = pairwise(distance, centers, global_center) +outer_intertia = counts ⋅ center_distances -innerInertia = sum( +inner_intertia = sum( sum(pairwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k ) @@ -85,18 +85,18 @@ function calinski_harabasz( n, k = size(X, 2), size(centers,2) - globalCenter = vec(mean(X, dims=2)) - centerDistances = pairwise(distance, centers, globalCenter) - outerInertia = sum( - weights[i,j]^fuzziness * centerDistances[j] for i in 1:n, j in 1:k + global_center = vec(mean(X, dims=2)) + center_distances = pairwise(distance, centers, global_center) + outer_intertia = sum( + weights[i,j]^fuzziness * center_distances[j] for i in 1:n, j in 1:k ) pointCentreDistances = pairwise(distance, X, centers) - innerInertia = sum( + inner_intertia = sum( weights[i,j]^fuzziness * pointCentreDistances[i,j] for i in 1:n, j in 1:k ) - return (outerInertia / (k - 1)) / (innerInertia / (n - k)) + return (outer_intertia / (k - 1)) / (inner_intertia / (n - k)) end calinski_harabasz(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean()) = @@ -115,11 +115,11 @@ function davies_bouldin( k = size(centers,2) - clusterDiameters = [mean(pairwise(distance,view(X, :, assignments .== j), centers[:,j])) for j in 1:k ] - centerDistances = pairwise(distance,centers) + cluster_diameters = [mean(pairwise(distance,view(X, :, assignments .== j), centers[:,j])) for j in 1:k ] + center_distances = pairwise(distance,centers) DB = mean( - maximum( (clusterDiameters[j₁] + clusterDiameters[j₂]) / centerDistances[j₁,j₂] for j₂ in 1:k if j₂ ≠ j₁) + maximum( (cluster_diameters[j₁] + cluster_diameters[j₂]) / center_distances[j₁,j₂] for j₂ in 1:k if j₂ ≠ j₁) for j₁ in 1:k ) @@ -142,14 +142,14 @@ function xie_beni( n, k = size(X, 2), size(centers,2) - innerInertia = sum( + inner_intertia = sum( sum(pairwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k ) - centerDistances = pairwise(distance,centers) - minOuterDistance = minimum(centerDistances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) + center_distances = pairwise(distance,centers) + min_outer_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) - return innerInertia / (n * minOuterDistance) + return inner_intertia / (n * min_outer_distance) end xie_beni(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = @@ -168,14 +168,14 @@ function xie_beni( n, k = size(X, 2), size(centers,2) pointCentreDistances = pairwise(distance, X, centers) - innerInertia = sum( + inner_intertia = sum( weights[i,j]^fuzziness * pointCentreDistances[i,j] for i 
in 1:n, j in 1:k ) - centerDistances = pairwise(distance,centers) - minOuterDistance = minimum(centerDistances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) + center_distances = pairwise(distance,centers) + min_outer_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) - return innerInertia / (n * minOuterDistance) + return inner_intertia / (n * min_outer_distance) end xie_beni(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean()) = @@ -189,29 +189,29 @@ function dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Rea k = maximum(assignments) - minOuterDistance = typemax(eltype(dist)) + min_outer_distance = typemax(eltype(dist)) for j₁ in 1:k, j₂ in j₁+1:k # δ is min distance between points from clusters j₁ and j₂ δ = minimum(dist[i₁,i₂] for i₁ in findall(==(j₁), assignments), i₂ in findall(==(j₂), assignments)) - if δ < minOuterDistance - minOuterDistance = δ + if δ < min_outer_distance + min_outer_distance = δ end end - maxInnerDistance = typemin(eltype(dist)) + max_inner_distance = typemin(eltype(dist)) for j in 1:k # Δ is max distance between points in cluster j Δ = maximum(dist[i₁,i₂] for i₁ in findall(==(j), assignments), i₂ in findall(==(j), assignments)) - if Δ > maxInnerDistance - maxInnerDistance = Δ + if Δ > max_inner_distance + max_inner_distance = Δ end end - return minOuterDistance / maxInnerDistance + return min_outer_distance / max_inner_distance end dunn(X::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean()) = From 3ecacd28eafd8f64533e3f21750330231a4f12f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Fri, 14 Jul 2023 23:05:15 +0200 Subject: [PATCH 21/82] Update src/qualityindices.jl Co-authored-by: Alexey Stukalov --- src/qualityindices.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index bfcb3130..c5ef5553 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -58,7 +58,11 @@ _check_qualityindex_arguments(X, centers, assignments) n, k = size(X, 2), size(centers,2) -counts = [count(==(j), assignments) for j in 1:k] + clu_samples = [Vector{Int}() for _ in 1:k] + for (i, a) in enumerate(assignments) + push!(clu_samples[a], i) + end + clu_sizes = length.(clu_samples) globalCenter = vec(mean(X, dims=2)) centerDistances = pairwise(distance, centers, globalCenter) outerInertia = counts ⋅ centerDistances From 567832a6819b773237f1183becc382a5af2ec8b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Sun, 16 Jul 2023 00:41:44 +0200 Subject: [PATCH 22/82] indxing independence --- mytest.jl | 2 + src/qualityindices.jl | 123 ++++++++++++++++++++++-------------------- 2 files changed, 67 insertions(+), 58 deletions(-) diff --git a/mytest.jl b/mytest.jl index 9aa12cf8..87298b8b 100644 --- a/mytest.jl +++ b/mytest.jl @@ -1,4 +1,5 @@ using Plots, Clustering, Distances, Statistics +using LinearAlgebra # for testing X = hcat([4., 5.] .+ 0.2 * randn(2, 10), [9., -5.] 
.+ 0.2 * randn(2, 5), @@ -52,6 +53,7 @@ scatter!(C[:,1],C[:,2], ) ## tests + using Test @test_throws ArgumentError Clustering._check_qualityindex_arguments(zeros(2,2), zeros(2,3), [1, 2]) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index 25171323..8ec9b140 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -5,14 +5,15 @@ function _check_qualityindex_arguments( assignments::AbstractVector{<:Integer}, # assignments (n) ) d, n = size(X) + _, data_idx = axes(X) dc, k = size(centers) d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `X` and `centers`.")) (1 <= k <= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) - for j in 1:n - (1 <= assignments[j] <= k) || throw(ArgumentError("Bad assignments[$j]=$(assignments[j]): should be in 1:$k range.")) + for i in eachindex(assignments) + (assignments[i] in data_idx) || throw(ArgumentError("Bad assignments[$i]=$(assignments[i]) is not a valid index for `X`.")) end end @@ -46,6 +47,34 @@ function _check_qualityindex_arguments( n == na || throw(DimensionMismatch("Inconsistent array dimensions for distance matrix and assignments.")) end + +function _gather_samples(assignments, k) + cluster_samples = [Int[] for _ in 1:k] + for (i, a) in enumerate(assignments) + push!(cluster_samples[a], i) + end + return cluster_samples +end + + +function _inner_inertia(X, centers, cluster_samples, distance) # shared between hard clustering calinski_harabasz and xie_beni + inner_inertia = sum( + sum(colwise(distance, view(X, :, samples), center)) + for (center, samples) in zip(eachcol(centers), cluster_samples) + ) + return inner_inertia +end + +function _inner_inertia(X, centers, weights, fuzziness, distance) # shared between soft clustering calinski_harabasz and xie_beni + n, k = size(X, 2), size(centers, 2) + w_idx1, w_idx2 = axes(weights) + pointCentreDistances = pairwise(distance, eachcol(X), eachcol(centers)) + inner_inertia = sum( + weights[i₁,j₁]^fuzziness * pointCentreDistances[i₂,j₂] for (i₁,i₂) in zip(w_idx1,1:n), (j₁,j₂) in zip(w_idx2, 1:k) + ) + return inner_inertia +end + # Calinski-Harabasz index function calinski_harabasz( @@ -58,19 +87,12 @@ function calinski_harabasz( n, k = size(X, 2), size(centers, 2) - clu_samples = [Int[] for _ in 1:k] - for (i, a) in enumerate(assignments) - push!(clu_samples[a], i) - end - clu_sizes = length.(clu_samples) + cluster_samples = _gather_samples(assignments, k) global_center = vec(mean(X, dims=2)) - center_distances = pairwise(distance, centers, global_center) - outer_inertia = counts ⋅ center_distances + center_distances = colwise(distance, centers, global_center) + outer_inertia = length.(cluster_samples) ⋅ center_distances - inner_inertia = sum( - sum(pairwise(distance, view(X, :, samples), clu_center)) - for (clu_center, samples) in zip(eachcol(centers), clu_samples) - ) + inner_inertia = _inner_inertia(X, centers, cluster_samples, distance) return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end @@ -89,17 +111,15 @@ function calinski_harabasz( _check_qualityindex_arguments(X, centers, weights, fuzziness) n, k = size(X, 2), size(centers,2) + w_idx1, w_idx2 = axes(weights) global_center = vec(mean(X, dims=2)) - center_distances = pairwise(distance, centers, global_center) + 
center_distances = colwise(distance, centers, global_center) outer_intertia = sum( - weights[i,j]^fuzziness * center_distances[j] for i in 1:n, j in 1:k + weights[i,j₁]^fuzziness * center_distances[j₂] for i in w_idx1, (j₁,j₂) in zip(w_idx2, 1:k) ) - pointCentreDistances = pairwise(distance, X, centers) - inner_intertia = sum( - weights[i,j]^fuzziness * pointCentreDistances[i,j] for i in 1:n, j in 1:k - ) + inner_intertia = _inner_inertia(X, centers, weights, fuzziness, distance) return (outer_intertia / (k - 1)) / (inner_intertia / (n - k)) end @@ -118,16 +138,18 @@ function davies_bouldin( ) _check_qualityindex_arguments(X, centers, assignments) - k = size(centers,2) + k = size(centers, 2) + c_idx = axes(centers,2) + + cluster_samples = _gather_samples(assignments, k) - cluster_diameters = [mean(pairwise(distance,view(X, :, assignments .== j), centers[:,j])) for j in 1:k ] + cluster_diameters = [mean(colwise(distance,view(X, :, sample), centers[:,j])) for (j, sample) in zip(c_idx, cluster_samples) ] center_distances = pairwise(distance,centers) DB = mean( - maximum( (cluster_diameters[j₁] + cluster_diameters[j₂]) / center_distances[j₁,j₂] for j₂ in 1:k if j₂ ≠ j₁) - for j₁ in 1:k + maximum( (cluster_diameters[j₁] + cluster_diameters[j₂]) / center_distances[j₁,j₂] for j₂ in c_idx if j₂ ≠ j₁) + for j₁ in c_idx ) - return DB end @@ -147,14 +169,13 @@ function xie_beni( n, k = size(X, 2), size(centers,2) - inner_intertia = sum( - sum(pairwise(distance, view(X, :, assignments .== j), centers[:, j])) for j in 1:k - ) + cluster_samples = _gather_samples(assignments, k) + inner_intertia = _inner_inertia(X, centers, cluster_samples, distance) center_distances = pairwise(distance,centers) - min_outer_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) + min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) - return inner_intertia / (n * min_outer_distance) + return inner_intertia / (n * min_center_distance) end xie_beni(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = @@ -170,17 +191,14 @@ function xie_beni( ) _check_qualityindex_arguments(X, centers, weights, fuzziness) - n, k = size(X, 2), size(centers,2) + n, k = size(X, 2), size(centers, 2) - pointCentreDistances = pairwise(distance, X, centers) - inner_intertia = sum( - weights[i,j]^fuzziness * pointCentreDistances[i,j] for i in 1:n, j in 1:k - ) + inner_intertia = _inner_inertia(X, centers, weights, fuzziness, distance) - center_distances = pairwise(distance,centers) - min_outer_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) + center_distances = pairwise(distance, eachcol(centers)) + min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) - return inner_intertia / (n * min_outer_distance) + return inner_intertia / (n * min_center_distance) end xie_beni(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean()) = @@ -193,34 +211,23 @@ function dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Rea _check_qualityindex_arguments(assignments, dist) k = maximum(assignments) + cluster_samples = _gather_samples(assignments, k) - min_outer_distance = typemax(eltype(dist)) - - for j₁ in 1:k, j₂ in j₁+1:k - # δ is min distance between points from clusters j₁ and j₂ - δ = minimum(dist[i₁,i₂] for i₁ in findall(==(j₁), assignments), i₂ in findall(==(j₂), assignments)) - - if δ < min_outer_distance - min_outer_distance = δ - end - end - - 
max_inner_distance = typemin(eltype(dist)) - - for j in 1:k - # Δ is max distance between points in cluster j - Δ = maximum(dist[i₁,i₂] for i₁ in findall(==(j), assignments), i₂ in findall(==(j), assignments)) + min_outer_distance = minimum( + minimum(view(dist, cluster_samples[j₁], cluster_samples[j₂])) + for j₁ in 1:k for j₂ in j₁+1:k + ) - if Δ > max_inner_distance - max_inner_distance = Δ - end - end + max_inner_distance = maximum( + maximum(dist[i₁,i₂] for i₁ in sample, i₂ in sample) + for sample in cluster_samples + ) return min_outer_distance / max_inner_distance end dunn(X::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean()) = - dunn(assignments, pairwise(distance,X)) + dunn(assignments, pairwise(distance,eachcol(X))) dunn(X::AbstractMatrix{<:Real}, R::ClusteringResult, distance::SemiMetric=SqEuclidean()) = dunn(X, R.assignments, distance) From d1f9f9d6d03e8342a8bcd8efaf5a4960dc482884 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Sun, 16 Jul 2023 00:47:08 +0200 Subject: [PATCH 23/82] Update mytest.jl Co-authored-by: Alexey Stukalov --- mytest.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mytest.jl b/mytest.jl index 87298b8b..957d67a1 100644 --- a/mytest.jl +++ b/mytest.jl @@ -15,10 +15,9 @@ resf = fuzzy_cmeans(X,3,2) res = kmeans(X,3) -q = [calinski_harabasz(X,kmeans(X,k)) for k in 2:5] -q = [xie_beni(X,kmeans(X,k)) for k in 2:5] -q = [davies_bouldin(X,kmeans(X,k)) for k in 2:5] -q = [dunn(X,kmeans(X,k),SqEuclidean()) for k in 2:5] +clusterings = kmeans.(Ref(X), 2:5) +kmeans_quality = Dict(qmetric => clustering_quality.(Ref(X), clusterings, quality_metric=qmetric) + for qmetric in [:silhouette, :calinski_harabasz, :xie_beni, :davies_bouldin, :dunn]) q = [calinski_harabasz(X,fuzzy_cmeans(X,k,2), 2) for k in 2:5] q = [xie_beni(X,fuzzy_cmeans(X,k,2), 2) for k in 2:5] From 4b64473d1078c727568434f822278cbec0975178 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Sun, 16 Jul 2023 18:48:28 +0200 Subject: [PATCH 24/82] Update src/qualityindices.jl Co-authored-by: Alexey Stukalov --- src/qualityindices.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index 8ec9b140..a5db019a 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -97,8 +97,6 @@ function calinski_harabasz( return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end -calinski_harabasz(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = -calinski_harabasz(X, R.centers, R.assignments, distance) function calinski_harabasz( From 79a370937e9950d2bda39201e6c17710799803f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Sun, 16 Jul 2023 18:49:50 +0200 Subject: [PATCH 25/82] Update qualityindices.jl --- src/qualityindices.jl | 85 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/src/qualityindices.jl b/src/qualityindices.jl index 8ec9b140..3c2d6e98 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -1,4 +1,89 @@ + + +function clustering_quality( + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:Real}, + assignments::AbstractVector{<:Integer}, + distance::SemiMetric=SqEuclidean(); + quality_index::Symbol + ) + d, n = size(X) + _, data_idx = axes(X) + dc, k = size(centers) + + d == dc || 
throw(DimensionMismatch("Inconsistent array dimensions for `X` and `centers`.")) + (1 <= k <= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) + k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) + n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) + for i in eachindex(assignments) + (assignments[i] in data_idx) || throw(ArgumentError("Bad assignments[$i]=$(assignments[i]) is not a valid index for `X`.")) + end + + + if quality_index ∈ (:silhouettes, :silhouette, :s) + elseif quality_index ∈ (:calinski_harabasz, :Calinski-Harabasz, :ch) + elseif quality_index ∈ (:xie_beni, :Xie-Beni, :xb) + elseif quality_index ∈ (:davies_bouldin, :Davies-Bouldin, :db) + elseif quality_index ∈ (:dunn, Dunn, :d) + else + error(ArgumentError("Quality index $quality_index not available.")) + end +end + +clustering_quality(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean(); quality_index::Symbol) = + clustering_quality(X, R.centers, R.assignments, distance; quality_index = quality_index) + + +function clustering_quality( + X::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:Real}, + weights::AbstractMatrix{<:Real}, + fuzziness::Real, + distance::SemiMetric=SqEuclidean(); + quality_index::Symbol + ) + d, n = size(X) + dc, k = size(centers) + nw, kw = size(weights) + + d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `X` and `centers`.")) + n == nw || throw(DimensionMismatch("Inconsistent data length for `X` and `weights`.")) + k == kw || throw(DimensionMismatch("Inconsistent number of clusters for `centers` and `weights`.")) + (1 <= k <= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) + k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) + n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) + all(>=(0), weights) || throw(ArgumentError("All weights must be larger or equal 0.")) + 1 < fuzziness || throw(ArgumentError("Fuzziness must be greater than 1 ($fuzziness given)")) + + +end + +clustering_quality(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean(); quality_index::Symbol) = + clustering_quality(X, R.centers, R.weights, fuzziness, distance; quality_index) + +function clustering_quality( + assignments::AbstractVector{<:Integer}, + dist::AbstractMatrix{<:Real}; + quality_index::Symbol = :dunn + ) + n, m = size(dist) + na = length(assignments) + n == m || throw(ArgumentError("Distance matrix must be square.")) + n == na || throw(DimensionMismatch("Inconsistent array dimensions for distance matrix and assignments.")) + +end + + +clustering_quality(X::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean(); quality_index::Symbol = :dunn) = + clustering_quality(assignments, pairwise(distance,eachcol(X)); quality_index) + +clustering_quality(X::AbstractMatrix{<:Real}, R::ClusteringResult, distance::SemiMetric=SqEuclidean(); quality_index::Symbol = :dunn) = + clustering_quality(X, R.assignments, distance; quality_index) + +clustering_quality(R::ClusteringResult, dist::AbstractMatrix{<:Real}; quality_index::Symbol = :dunn) = + clustering_quality(R.assignments, dist; quality_index) + function 
_check_qualityindex_arguments( X::AbstractMatrix{<:Real}, # data matrix (d x n) centers::AbstractMatrix{<:Real}, # cluster centers (d x k) From e14257cf15d56a372a3a02c35a3adb14eb6bb10a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 17 Jul 2023 18:50:22 +0200 Subject: [PATCH 26/82] add clustering_quality --- src/Clustering.jl | 9 +- src/qualityindices.jl | 228 ++++++++++++++++++++++-------------------- 2 files changed, 126 insertions(+), 111 deletions(-) diff --git a/src/Clustering.jl b/src/Clustering.jl index 3a0f29d0..671fc4f5 100644 --- a/src/Clustering.jl +++ b/src/Clustering.jl @@ -49,6 +49,9 @@ module Clustering # silhouette silhouettes, + # quality indices + clustering_quality, + # varinfo varinfo, @@ -68,10 +71,8 @@ module Clustering mcl, MCLResult, # pair confusion matrix - confusion, + confusion - # quality indices - calinski_harabasz, davies_bouldin, xie_beni, dunn ## source files @@ -87,12 +88,12 @@ module Clustering include("counts.jl") include("silhouette.jl") + include("qualityindices.jl") include("randindex.jl") include("varinfo.jl") include("vmeasure.jl") include("mutualinfo.jl") include("confusion.jl") - include("qualityindices.jl") include("hclust.jl") include("deprecate.jl") diff --git a/src/qualityindices.jl b/src/qualityindices.jl index 1bc4cd9f..45c4e865 100644 --- a/src/qualityindices.jl +++ b/src/qualityindices.jl @@ -1,6 +1,32 @@ - +""" + clustering_quality(X, centers, assignments, [distance;] quality_index) + clustering_quality(X, kmeans_clustering, [distance;] quality_index) + +Compute chosen quality index value for a hard clustering. + +# Arguments + - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point + - `centers::AbstractMatrix`: ``d×k`` matrix with cluster centers represented as columns + - `assignments::AbstractVector{Int}`: ``n`` vector of point assignments (cluster indices) + - `kmeans_clustering::KmeansResult`: the output of kmeans method + - `distance::SemiMetric=SqEuclidean()`: : `SemiMetric` object that defines the distance between the data points + - `quality_index::Symbol`: chosen quality index + +# Available quality indices: +Depending on the index higher (↑) or lower (↓) value suggests better clustering quality. + +- `:silhouettes`: average silhouette index (↑), for all silhouettes use `silhouettes` method instead +- `:calinski_harabasz`: Calinski-Harabsz index (↑) returns corrected ratio between inertia between cluster centers and inertia within clusters +- `:xie_beni`: Xie-Beni index (↓) returns ratio betwen inertia within clusters and minimal distance between cluster centers +- `:davies_bouldin`: Davies-Bouldin index (↑) returns average similarity between each cluster and its most similar one, averaged over all the clusters + +# References +> Peter J. Rousseeuw (1987). *Silhouettes: a Graphical Aid to the +> Interpretation and Validation of Cluster Analysis*. Computational and +> Applied Mathematics. 20: 53–65. 
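+
+# Example
+
+A minimal illustrative sketch of calling this method (the data matrix `X` below is hypothetical, only the call pattern is given by this function):
+
+```julia
+X = rand(2, 100)                 # hypothetical 2×100 data matrix
+R = kmeans(X, 3)                 # hard clustering into 3 clusters
+clustering_quality(X, R, quality_index = :calinski_harabasz)
+```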
+""" function clustering_quality( X::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, @@ -9,7 +35,6 @@ function clustering_quality( quality_index::Symbol ) d, n = size(X) - _, data_idx = axes(X) dc, k = size(centers) d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `X` and `centers`.")) @@ -17,15 +42,15 @@ function clustering_quality( k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) for i in eachindex(assignments) - (assignments[i] in data_idx) || throw(ArgumentError("Bad assignments[$i]=$(assignments[i]) is not a valid index for `X`.")) + (assignments[i] in axes(X, 2)) || throw(ArgumentError("Bad assignments[$i]=$(assignments[i]) is not a valid index for `X`.")) end - - if quality_index ∈ (:silhouettes, :silhouette, :s) - elseif quality_index ∈ (:calinski_harabasz, :Calinski-Harabasz, :ch) - elseif quality_index ∈ (:xie_beni, :Xie-Beni, :xb) - elseif quality_index ∈ (:davies_bouldin, :Davies-Bouldin, :db) - elseif quality_index ∈ (:dunn, Dunn, :d) + if quality_index ∈ (:calinski_harabasz, :Calinski_Harabasz, :ch) + _cluquality_calinski_harabasz(X, centers, assignments, distance) + elseif quality_index ∈ (:xie_beni, :Xie_Beni, :xb) + _cluquality_xie_beni(X, centers, assignments, distance) + elseif quality_index ∈ (:davies_bouldin, :Davies_Bouldin, :db) + _cluquality_davies_bouldin(X, centers, assignments, distance) else error(ArgumentError("Quality index $quality_index not available.")) end @@ -34,7 +59,32 @@ end clustering_quality(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean(); quality_index::Symbol) = clustering_quality(X, R.centers, R.assignments, distance; quality_index = quality_index) - +""" + clustering_quality(X, centers, weights, fuzziness, [distance;] quality_index) + clustering_quality(X, fuzzy_cmeans_clustering, fuzziness, [distance;] quality_index) + +Compute chosen quality index value for a soft (fuzzy) clustering + +# Arguments + - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point + - `centers::AbstractMatrix`: ``d×k`` matrix with cluster centers represented as columns + - `weights::AbstractMatrix`: ``n×k`` matrix with fuzzy clustering weights, `weights[i,j]` is the degree of membership of ``i``-th data point to ``j``-th cluster + - `fuzziness::Real`: clustering fuzziness > 1 + - `fuzzy_cmeans_clustering::FuzzyCMeansResult`: the output of fuzzy_cmeans method + - `distance::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the distance between the data points + - `quality_index::Symbol`: chosen quality index + + # Available quality indices: + Depending on the index higher (↑) or lower (↓) value suggests better clustering quality. + + - `:calinski_harabasz`: Calinski-Harabsz index (↑) returns corrected ratio between inertia between cluster centers and inertia within clusters + - `:xie_beni`: Xie-Beni index (↓) returns ratio betwen inertia within clusters and minimal distance between cluster centers + +# References +> Peter J. Rousseeuw (1987). *Silhouettes: a Graphical Aid to the +> Interpretation and Validation of Cluster Analysis*. Computational and +> Applied Mathematics. 20: 53–65. 
+""" function clustering_quality( X::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, @@ -56,84 +106,77 @@ function clustering_quality( all(>=(0), weights) || throw(ArgumentError("All weights must be larger or equal 0.")) 1 < fuzziness || throw(ArgumentError("Fuzziness must be greater than 1 ($fuzziness given)")) - + if quality_index ∈ (:calinski_harabasz, :Calinski_Harabasz, :ch) + _cluquality_calinski_harabasz(X, centers, weights, fuzziness, distance) + elseif quality_index ∈ (:xie_beni, :Xie_Beni, :xb) + _cluquality_xie_beni(X, centers, weights, fuzziness, distance) + else + error(ArgumentError("Quality index $quality_index not available.")) + end end clustering_quality(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean(); quality_index::Symbol) = clustering_quality(X, R.centers, R.weights, fuzziness, distance; quality_index) +""" + + clustering_quality(assignments, dist; quality_index) + clustering_quality(clustering, dist; quality_index) + clustering_quality(data, assignments, [distance;] quality_index) + clustering_quality(data, clustering, [distance;] quality_index) + +Compute chosen quality index value for a clustering in a case cluster centres may be not known. + +# Arguments + - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point + - `assignments::AbstractVector{Int}`: the vector of point assignments (cluster indices) + - `dist::AbstractMatrix`: a ``n×n`` pairwise distance matrix; ``dist_{ij}`` is the distance between ``i``-th and ``j``-th points. + - `distance::SemiMetric=SqEuclidean()`: : `SemiMetric` object that defines the distance between the data points + - `clustering::ClusteringResult`: the output of some clustering method + - `quality_index::Symbol`: chosen quality index + +# Available quality indices: +Depending on the index higher (↑) or lower (↓) value suggests better clustering quality. + +- `:silhouettes`: average silhouette index (↑), to obtain all silhouettes use `silhouettes` function instead +- `:dunn`: Dunn index (↑) returns ratio between minimal distance between clusters and maximal cluster diameter + +# References +> Peter J. Rousseeuw (1987). *Silhouettes: a Graphical Aid to the +> Interpretation and Validation of Cluster Analysis*. Computational and +> Applied Mathematics. 20: 53–65. 
+""" function clustering_quality( assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}; - quality_index::Symbol = :dunn + quality_index::Symbol ) n, m = size(dist) na = length(assignments) n == m || throw(ArgumentError("Distance matrix must be square.")) n == na || throw(DimensionMismatch("Inconsistent array dimensions for distance matrix and assignments.")) -end - - -clustering_quality(X::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean(); quality_index::Symbol = :dunn) = - clustering_quality(assignments, pairwise(distance,eachcol(X)); quality_index) - -clustering_quality(X::AbstractMatrix{<:Real}, R::ClusteringResult, distance::SemiMetric=SqEuclidean(); quality_index::Symbol = :dunn) = - clustering_quality(X, R.assignments, distance; quality_index) - -clustering_quality(R::ClusteringResult, dist::AbstractMatrix{<:Real}; quality_index::Symbol = :dunn) = - clustering_quality(R.assignments, dist; quality_index) - -function _check_qualityindex_arguments( - X::AbstractMatrix{<:Real}, # data matrix (d x n) - centers::AbstractMatrix{<:Real}, # cluster centers (d x k) - assignments::AbstractVector{<:Integer}, # assignments (n) - ) - d, n = size(X) - _, data_idx = axes(X) - dc, k = size(centers) - - d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `X` and `centers`.")) - (1 <= k <= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) - k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) - n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) - for i in eachindex(assignments) - (assignments[i] in data_idx) || throw(ArgumentError("Bad assignments[$i]=$(assignments[i]) is not a valid index for `X`.")) + if quality_index ∈ (:silhouettes, :silhouette, :s) + mean(silhouettes(assignments, dist)) + elseif quality_index ∈ (:dunn, :Dunn, :d) + _cluquality_dunn(assignments, dist) + else + error(ArgumentError("Quality index $quality_index not available.")) end end -function _check_qualityindex_arguments( - X::AbstractMatrix{<:Real}, # data matrix (d x n) - centers::AbstractMatrix{<:Real}, # cluster centers (d x k) - weights::AbstractMatrix{<:Real}, # assigned weights (n x k) - fuzziness::Real, # cluster fuzziness - ) - d, n = size(X) - dc, k = size(centers) - nw, kw = size(weights) - d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `X` and `centers`.")) - n == nw || throw(DimensionMismatch("Inconsistent data length for `X` and `weights`.")) - k == kw || throw(DimensionMismatch("Inconsistent number of clusters for `centers` and `weights`.")) - (1 <= k <= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) - k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) - n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) - all(>=(0), weights) || throw(ArgumentError("All weights must be larger or equal 0.")) - 1 < fuzziness || throw(ArgumentError("Fuzziness must be greater than 1 ($fuzziness given)")) -end +clustering_quality(X::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean(); quality_index::Symbol) = + clustering_quality(assignments, pairwise(distance,eachcol(X)); quality_index = quality_index) -function 
_check_qualityindex_arguments( - assignments::AbstractVector{<:Integer}, # assignments (n) - dist::AbstractMatrix{<:Real} # data distance matrix (n x n) - ) - n, m = size(dist) - na = length(assignments) - n == m || throw(ArgumentError("Distance matrix must be square.")) - n == na || throw(DimensionMismatch("Inconsistent array dimensions for distance matrix and assignments.")) -end +clustering_quality(X::AbstractMatrix{<:Real}, R::ClusteringResult, distance::SemiMetric=SqEuclidean(); quality_index::Symbol) = + clustering_quality(R.assignments, pairwise(distance,eachcol(X)); quality_index = quality_index) + +clustering_quality(R::ClusteringResult, dist::AbstractMatrix{<:Real}; quality_index::Symbol) = + clustering_quality(R.assignments, dist; quality_index = quality_index) -function _gather_samples(assignments, k) +function _gather_samples(assignments, k) # cluster_samples[j]: indices of points in cluster j cluster_samples = [Int[] for _ in 1:k] for (i, a) in enumerate(assignments) push!(cluster_samples[a], i) @@ -142,7 +185,7 @@ function _gather_samples(assignments, k) end -function _inner_inertia(X, centers, cluster_samples, distance) # shared between hard clustering calinski_harabasz and xie_beni +function _inner_inertia(X, centers, cluster_samples, distance) # hard clustering, shared between hard clustering calinski_harabasz and xie_beni inner_inertia = sum( sum(colwise(distance, view(X, :, samples), center)) for (center, samples) in zip(eachcol(centers), cluster_samples) @@ -150,7 +193,7 @@ function _inner_inertia(X, centers, cluster_samples, distance) # shared between return inner_inertia end -function _inner_inertia(X, centers, weights, fuzziness, distance) # shared between soft clustering calinski_harabasz and xie_beni +function _inner_inertia(X, centers, weights, fuzziness, distance) # soft clustering, shared between soft clustering calinski_harabasz and xie_beni n, k = size(X, 2), size(centers, 2) w_idx1, w_idx2 = axes(weights) pointCentreDistances = pairwise(distance, eachcol(X), eachcol(centers)) @@ -162,13 +205,12 @@ end # Calinski-Harabasz index -function calinski_harabasz( +function _cluquality_calinski_harabasz( X::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean() ) - _check_qualityindex_arguments(X, centers, assignments) n, k = size(X, 2), size(centers, 2) @@ -182,16 +224,13 @@ function calinski_harabasz( return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end - - -function calinski_harabasz( +function _cluquality_calinski_harabasz( X::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, weights::AbstractMatrix{<:Real}, fuzziness::Real, distance::SemiMetric=SqEuclidean() ) - _check_qualityindex_arguments(X, centers, weights, fuzziness) n, k = size(X, 2), size(centers,2) w_idx1, w_idx2 = axes(weights) @@ -207,19 +246,14 @@ function calinski_harabasz( return (outer_intertia / (k - 1)) / (inner_intertia / (n - k)) end -calinski_harabasz(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean()) = - calinski_harabasz(X, R.centers, R.weights, fuzziness, distance) - - # Davies-Bouldin idex -function davies_bouldin( +function _cluquality_davies_bouldin( X::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean() ) - _check_qualityindex_arguments(X, centers, assignments) k = size(centers, 2) c_idx = axes(centers,2) @@ -236,19 +270,15 @@ function davies_bouldin( return DB end 
-davies_bouldin(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = - davies_bouldin(X, R.centers, R.assignments, distance) - # Xie-Beni index -function xie_beni( +function _cluquality_xie_beni( X::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean() ) - _check_qualityindex_arguments(X, centers, assignments) n, k = size(X, 2), size(centers,2) @@ -261,18 +291,13 @@ function xie_beni( return inner_intertia / (n * min_center_distance) end -xie_beni(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean()) = - xie_beni(X, R.centers, R.assignments, distance) - - -function xie_beni( +function _cluquality_xie_beni( X::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, weights::AbstractMatrix{<:Real}, fuzziness::Real, distance::SemiMetric=SqEuclidean() ) - _check_qualityindex_arguments(X, centers, weights, fuzziness) n, k = size(X, 2), size(centers, 2) @@ -284,35 +309,24 @@ function xie_beni( return inner_intertia / (n * min_center_distance) end -xie_beni(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean()) = - xie_beni(X, R.centers, R.weights, fuzziness, distance) - # Dunn index -function dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) - _check_qualityindex_arguments(assignments, dist) +function _cluquality_dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) k = maximum(assignments) + cluster_samples = _gather_samples(assignments, k) min_outer_distance = minimum( - minimum(view(dist, cluster_samples[j₁], cluster_samples[j₂])) + minimum(view(dist, cluster_samples[j₁], cluster_samples[j₂]), init = typemax(eltype(dist))) for j₁ in 1:k for j₂ in j₁+1:k ) max_inner_distance = maximum( - maximum(dist[i₁,i₂] for i₁ in sample, i₂ in sample) + maximum(dist[i₁,i₂] for i₁ in sample, i₂ in sample, init = typemin(eltype(dist))) for sample in cluster_samples ) return min_outer_distance / max_inner_distance end - -dunn(X::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean()) = - dunn(assignments, pairwise(distance,eachcol(X))) - -dunn(X::AbstractMatrix{<:Real}, R::ClusteringResult, distance::SemiMetric=SqEuclidean()) = - dunn(X, R.assignments, distance) - -dunn(R::ClusteringResult, dist::AbstractMatrix{<:Real}) = dunn(R.assignments, dist) \ No newline at end of file From 39a55990c0405ffce928a052c84fcbab74b42570 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 17 Jul 2023 18:51:50 +0200 Subject: [PATCH 27/82] Update Clustering.jl --- src/Clustering.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Clustering.jl b/src/Clustering.jl index 671fc4f5..a9469eb6 100644 --- a/src/Clustering.jl +++ b/src/Clustering.jl @@ -87,8 +87,10 @@ module Clustering include("fuzzycmeans.jl") include("counts.jl") + include("silhouette.jl") include("qualityindices.jl") + include("randindex.jl") include("varinfo.jl") include("vmeasure.jl") From c003b631c8baa462b92e78e0e93d97d969cc3416 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 17 Jul 2023 21:15:08 +0200 Subject: [PATCH 28/82] Update validate.md --- docs/source/validate.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/source/validate.md b/docs/source/validate.md index be57af62..a7ba50a7 100644 --- 
a/docs/source/validate.md +++ b/docs/source/validate.md @@ -54,6 +54,28 @@ Higher values indicate better separation of clusters w.r.t. point distances. silhouettes ``` +## Clustering quality indices + +A group of clustering evaluation metrics which are intrinsic and depend only on the clustering itself. They can be used to compare different clustering algorithms and choose the optimal number of clusters. + +Available methods are: + +### Average silhouette index + +The average over all silhouettes in the data set, see section **Silhouettes** + +### Calinski-Harabasz index + +### Xie-Beni index + +### Davies-Bouldin index + +### Dunn index + + +```@docs +clustering_quality +``` ## Variation of Information From 1e1cd6f0a19866bbe2cf008386d6fc731395ecbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 18 Jul 2023 17:59:32 +0200 Subject: [PATCH 29/82] docs + tests --- docs/source/validate.md | 57 ++++++++++++--- examples/clustering_quality.jl | 68 ++++++++++++++++++ mytest.jl | 71 ------------------- ...ualityindices.jl => clustering_quality.jl} | 38 +++++----- src/mutualinfo.jl | 4 +- test/clustering_quality.jl | 58 +++++++++++++++ 6 files changed, 194 insertions(+), 102 deletions(-) create mode 100644 examples/clustering_quality.jl delete mode 100644 mytest.jl rename src/{qualityindices.jl => clustering_quality.jl} (90%) create mode 100644 test/clustering_quality.jl diff --git a/docs/source/validate.md b/docs/source/validate.md index a7ba50a7..620fe9de 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -56,21 +56,62 @@ silhouettes ## Clustering quality indices -A group of clustering evaluation metrics which are intrinsic and depend only on the clustering itself. They can be used to compare different clustering algorithms and choose the optimal number of clusters. +A group of clustering evaluation metrics which are intrinsic, i.e. depend only on the clustering itself. They can be used to compare different clustering algorithms or choose the optimal number of clusters. -Available methods are: +The data points are denoted by ``x_1,x_2,\ldots, x_n``, clusters by ``C_1,C_2,\ldots,C_k`` and their centers by ``c_j``; ``c`` is the global center of the dataset, ``d`` is a given similarity (distance) function. For soft (fuzzy) clustering ``w_{ij}`` are weights measuring membership of point ``x_i`` to cluster ``C_j`` and ``m`` is the fuzziness parameter. Arrows up (↑) and down (↓) indicate whether higher or lower index values correspond to better quality. -### Average silhouette index +Given this notation, available indices and their definitions are: -### Calinski-Harabasz index +### Average silhouette index (↑) -### Xie-Beni index +Option `:silhouettes`. The average over all silhouettes in the data set, see section **Silhouettes** for a more detailed description of the method. -### Davies-Bouldin index +### Calinski-Harabasz index (↑) +Option `:calinski_harabasz`. Measures the corrected ratio of the inertia between the cluster centers to the summed internal inertia of the clusters. For hard and soft (fuzzy) clustering it is defined as + +```math + +\frac{n-k}{k-1}\frac{\sum_{C_j}|C_j|d(c_j,c)}{\sum\limits_{C_j}\sum\limits_{x_i\in C_j} d(x_i,c_j)} \quad \text{and}\quad +\frac{n-k}{k-1} \frac{\sum_{C_j}\sum_{x_i} w_{ij}^m d(c_j,c)}{\sum\limits_{C_j}\sum\limits_{x_i}w_{ij}^m d(x_i,c_j)} +``` +respectively. 
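+
+For example, a quick sketch using the `clustering_quality` function documented below (here `X` stands for any ``d×n`` data matrix):
+
+```julia
+using Clustering
+clustering_quality(X, kmeans(X, 3), quality_index = :calinski_harabasz)              # hard clustering
+clustering_quality(X, fuzzy_cmeans(X, 3, 2), 2, quality_index = :calinski_harabasz)  # fuzzy clustering, fuzziness 2
+```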
+ + +### Xie-Beni index (↓) +Option `:xie_beni`. Measures the ratio between the summed inertia of clusters and the minimum distance between cluster centers. For hard and soft (fuzzy) clustering it is defined as +```math +\frac{\sum_{C_j}\sum_{x_i\in C_j}d(x_i,c_j)}{n\min\limits_{c_{j_1}\neq c_{j_2}} d(c_{j_1},c_{j_2}) } +\quad \text{and}\quad +\frac{\sum_{C_j}\sum_{x_i} w_{ij}^md(x_i,c_j)}{n\min\limits_{c_{j_1}\neq c_{j_2}} d(c_{j_1},c_{j_2}) } +``` +respectively. +### [Davies-Bouldin index](https://en.wikipedia.org/wiki/Davies%E2%80%93Bouldin_index) (↓) +Option `:davies_bouldin`. It measures the average cohesion based on the cluster diameters and the distances between cluster centers. It is defined as + +```math +\frac{1}{k}\sum_{C_{j_1}}\max_{c_{j_2}\neq c_{j_1}}\frac{S(C_{j_1})+S(C_{j_2})}{d(c_{j_1},c_{j_2})} +``` +where +```math +S(C_j) = \frac{1}{|C_j|}\sum_{x_i\in C_j}d(x_i,c_j). +``` +### [Dunn index](https://en.wikipedia.org/wiki/Dunn_index) (↑) +Option `:dunn`. A more computationally demanding index which can also be used when cluster centers are not known. It measures the ratio between the minimal distance between clusters and the maximal cluster diameter. It is defined as +```math +\frac{\min\limits_{ C_{j_1}\neq C_{j_2}} \delta(C_{j_1},C_{j_2})}{\max\limits_{C_j}\Delta(C_j)} +``` + +where +```math +\delta(C_{j_1},C_{j_2}) = \min\limits_{x_{i_1}\in C_{j_1},x_{i_2}\in C_{j_2}} d(x_{i_1},x_{i_2}),\quad \Delta(C_j) = \max\limits_{x_{i_1},x_{i_2}\in C_j} d(x_{i_1},x_{i_2}). +``` + +### References +> Olatz Arbelaitz *et al.* (2013). *An extensive comparative study of cluster validity indices*. Pattern Recognition. 46(1): 243-256. [doi:10.1016/j.patcog.2012.07.021](https://doi.org/10.1016/j.patcog.2012.07.021) + +> Aybüke Öztürk, Stéphane Lallich, Jérôme Darmont. (2018). *A Visual Quality Index for Fuzzy C-Means*. 14th International Conference on Artificial Intelligence Applications and Innovations (AIAI 2018). 546-555. [doi:10.1007/978-3-319-92007-8_46](https://doi.org/10.1007/978-3-319-92007-8_46). -### Dunn index ```@docs diff --git a/examples/clustering_quality.jl b/examples/clustering_quality.jl new file mode 100644 index 00000000..461e5ea2 --- /dev/null +++ b/examples/clustering_quality.jl @@ -0,0 +1,68 @@ +using Plots, Clustering, Distances, Statistics +using LinearAlgebra + +## visualisation of the exemplary data +## there are 3 real clusters + +X = hcat([4., 5.] .+ 0.4 * randn(2, 10), + [9., -5.] .+ 0.4 * randn(2, 5), + [-4., -9.] 
.+ 1 * randn(2, 5)) + + +scatter(X[1,:],X[2,:], + label = "exemplary data points", + xlabel = "x", + ylabel = "y", +) + +## hard clustering quality for number of clusters in 2:5 + +clusterings = kmeans.(Ref(X), 2:5) +hard_indices = [:silhouette, :calinski_harabasz, :xie_beni, :davies_bouldin, :dunn] + +kmeans_quality = + Dict(qidx => clustering_quality.(Ref(X), clusterings, quality_index = qidx) + for qidx in hard_indices + ) + + +p = [ + plot(2:5, [kmeans_quality[qidx] ], + marker = :circle, + title = string.(qidx), + label = nothing, + ) + for qidx in hard_indices +] +plot(p..., + layout = (3,2), + plot_title = "Quality indices for various number of clusters" +) + +## soft clustering quality for number of clusters in 2:5 + +fuzziness = 2 +soft_indices = [:calinski_harabasz, :xie_beni] +fuzzy_clusterings = fuzzy_cmeans.(Ref(X), 2:5, fuzziness) + +fuzzy_cmeans_quality = + Dict(qidx => clustering_quality.(Ref(X), fuzzy_clusterings, fuzziness, quality_index = qidx) + for qidx in soft_indices + ) + + +p = [ + plot(2:5, [kmeans_quality[qidx] ], + marker = :circle, + title = string.(qidx), + label = nothing, + ) + for qidx in soft_indices +] +plot(p..., + layout = (2,1), + plot_title = "Quality indices for various number of clusters" +) + + + diff --git a/mytest.jl b/mytest.jl deleted file mode 100644 index 957d67a1..00000000 --- a/mytest.jl +++ /dev/null @@ -1,71 +0,0 @@ -using Plots, Clustering, Distances, Statistics -using LinearAlgebra # for testing - -X = hcat([4., 5.] .+ 0.2 * randn(2, 10), - [9., -5.] .+ 0.2 * randn(2, 5), - [-4., -9.] .+ 0.5 * randn(2, 5)) - - -scatter(X[1,:],X[2,:], - label = nothing, -) - -## -resf = fuzzy_cmeans(X,3,2) - -res = kmeans(X,3) - -clusterings = kmeans.(Ref(X), 2:5) -kmeans_quality = Dict(qmetric => clustering_quality.(Ref(X), clusterings, quality_metric=qmetric) - for qmetric in [:silhouette, :calinski_harabasz, :xie_beni, :davies_bouldin, :dunn]) - -q = [calinski_harabasz(X,fuzzy_cmeans(X,k,2), 2) for k in 2:5] -q = [xie_beni(X,fuzzy_cmeans(X,k,2), 2) for k in 2:5] - -plot(2:5,q) - -## test data - -Y = [-2 4; 2 4; 2 1; 3 0; 2 -1; 1 0; 2 -4; -2 -4; -2 1; -1 0; -2 -1; -3 0] -C = [0 4; 2 0; 0 -4; -2 0] -A = [1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4] -W = [ - 1 0 0 0 - 1 0 0 0 - 0 1 0 0 - 0 1 0 0 - 0 1 0 0 - 0 1 0 0 - 0 0 1 0 - 0 0 1 0 - 0 0 0 1 - 0 0 0 1 - 0 0 0 1 - 0 0 0 1 -] -scatter(Y[:,1],Y[:,2], - axisratio = :equal, - #seriescolor = palette(default)[A], -) -scatter!(C[:,1],C[:,2], - marker = :square -) - -## tests - -using Test - -@test_throws ArgumentError Clustering._check_qualityindex_arguments(zeros(2,2), zeros(2,3), [1, 2]) -@test_throws DimensionMismatch Clustering._check_qualityindex_arguments(zeros(2,2),zeros(3,2), [1, 2]) -@test_throws ArgumentError Clustering._check_qualityindex_arguments(zeros(2,2),zeros(2,1), [1, ]) -@test_throws ArgumentError Clustering._check_qualityindex_arguments(zeros(2,2),zeros(2,2), [1, 2]) - -@test calinski_harabasz(Y',C',A,Euclidean()) ≈ (32/3) / (16/8) -@test calinski_harabasz(Y',C',W,2,Euclidean()) ≈ (32/3) / (16/8) - -@test davies_bouldin(Y',C',A,Euclidean()) ≈ 3/2√5 - -@test xie_beni(Y',C',A,Euclidean()) ≈ 1/3 -@test xie_beni(Y',C',W,2,Euclidean()) ≈ 1/3 - -@test dunn(Y',A,Euclidean()) ≈ 1/2 \ No newline at end of file diff --git a/src/qualityindices.jl b/src/clustering_quality.jl similarity index 90% rename from src/qualityindices.jl rename to src/clustering_quality.jl index 45c4e865..f26b5a3d 100644 --- a/src/qualityindices.jl +++ b/src/clustering_quality.jl @@ -20,12 +20,10 @@ Depending on the index higher (↑) or 
lower (↓) value suggests better cluster - `:silhouettes`: average silhouette index (↑), for all silhouettes use `silhouettes` method instead - `:calinski_harabasz`: Calinski-Harabsz index (↑) returns corrected ratio between inertia between cluster centers and inertia within clusters - `:xie_beni`: Xie-Beni index (↓) returns ratio betwen inertia within clusters and minimal distance between cluster centers -- `:davies_bouldin`: Davies-Bouldin index (↑) returns average similarity between each cluster and its most similar one, averaged over all the clusters +- `:davies_bouldin`: Davies-Bouldin index (↓) returns average similarity between each cluster and its most similar one, averaged over all the clusters +- `:silhouettes`: average silhouette index (↑), to obtain all silhouettes use `silhouettes` function instead, it does not make use of `centers` argument +- `:dunn`: Dunn index (↑) returns ratio between minimal distance between clusters and maximal cluster diameter, it does not make use of `centers` argument -# References -> Peter J. Rousseeuw (1987). *Silhouettes: a Graphical Aid to the -> Interpretation and Validation of Cluster Analysis*. Computational and -> Applied Mathematics. 20: 53–65. """ function clustering_quality( X::AbstractMatrix{<:Real}, @@ -42,7 +40,7 @@ function clustering_quality( k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) for i in eachindex(assignments) - (assignments[i] in axes(X, 2)) || throw(ArgumentError("Bad assignments[$i]=$(assignments[i]) is not a valid index for `X`.")) + (assignments[i] in axes(centers, 2)) || throw(ArgumentError("Bad assignments[$i]=$(assignments[i]) is not a valid index for `X`.")) end if quality_index ∈ (:calinski_harabasz, :Calinski_Harabasz, :ch) @@ -51,10 +49,16 @@ function clustering_quality( _cluquality_xie_beni(X, centers, assignments, distance) elseif quality_index ∈ (:davies_bouldin, :Davies_Bouldin, :db) _cluquality_davies_bouldin(X, centers, assignments, distance) + else quality_index ∈ (:davies_bouldin, :Davies_Bouldin, :db) + if quality_index ∈ (:silhouettes, :silhouette, :s) + mean(silhouettes(assignments, pairwise(distance, eachcol(X)))) + elseif quality_index ∈ (:dunn, :Dunn, :d) + _cluquality_dunn(assignments, pairwise(distance, eachcol(X))) else error(ArgumentError("Quality index $quality_index not available.")) end end +end clustering_quality(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean(); quality_index::Symbol) = clustering_quality(X, R.centers, R.assignments, distance; quality_index = quality_index) @@ -80,10 +84,6 @@ Compute chosen quality index value for a soft (fuzzy) clustering - `:calinski_harabasz`: Calinski-Harabsz index (↑) returns corrected ratio between inertia between cluster centers and inertia within clusters - `:xie_beni`: Xie-Beni index (↓) returns ratio betwen inertia within clusters and minimal distance between cluster centers -# References -> Peter J. Rousseeuw (1987). *Silhouettes: a Graphical Aid to the -> Interpretation and Validation of Cluster Analysis*. Computational and -> Applied Mathematics. 20: 53–65. 
""" function clustering_quality( X::AbstractMatrix{<:Real}, @@ -120,8 +120,8 @@ clustering_quality(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::R """ - clustering_quality(assignments, dist; quality_index) - clustering_quality(clustering, dist; quality_index) + clustering_quality(assignments, dist_matrix; quality_index) + clustering_quality(clustering, dist_matrix; quality_index) clustering_quality(data, assignments, [distance;] quality_index) clustering_quality(data, clustering, [distance;] quality_index) @@ -130,8 +130,8 @@ Compute chosen quality index value for a clustering in a case cluster centres ma # Arguments - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point - `assignments::AbstractVector{Int}`: the vector of point assignments (cluster indices) - - `dist::AbstractMatrix`: a ``n×n`` pairwise distance matrix; ``dist_{ij}`` is the distance between ``i``-th and ``j``-th points. - - `distance::SemiMetric=SqEuclidean()`: : `SemiMetric` object that defines the distance between the data points + - `dist_matrix::AbstractMatrix`: a ``n×n`` pairwise distance matrix; `dist_matrix[i,j]` is the distance between ``i``-th and ``j``-th points. + - `distance::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the distance between the data points - `clustering::ClusteringResult`: the output of some clustering method - `quality_index::Symbol`: chosen quality index @@ -141,10 +141,6 @@ Depending on the index higher (↑) or lower (↓) value suggests better cluster - `:silhouettes`: average silhouette index (↑), to obtain all silhouettes use `silhouettes` function instead - `:dunn`: Dunn index (↑) returns ratio between minimal distance between clusters and maximal cluster diameter -# References -> Peter J. Rousseeuw (1987). *Silhouettes: a Graphical Aid to the -> Interpretation and Validation of Cluster Analysis*. Computational and -> Applied Mathematics. 20: 53–65. 
""" function clustering_quality( assignments::AbstractVector{<:Integer}, @@ -178,14 +174,14 @@ clustering_quality(R::ClusteringResult, dist::AbstractMatrix{<:Real}; quality_in function _gather_samples(assignments, k) # cluster_samples[j]: indices of points in cluster j cluster_samples = [Int[] for _ in 1:k] - for (i, a) in enumerate(assignments) + for (i, a) in zip(eachindex(assignments), assignments) push!(cluster_samples[a], i) end return cluster_samples end -function _inner_inertia(X, centers, cluster_samples, distance) # hard clustering, shared between hard clustering calinski_harabasz and xie_beni +function _inner_inertia(X, centers, cluster_samples, distance) # shared between hard clustering calinski_harabasz and xie_beni inner_inertia = sum( sum(colwise(distance, view(X, :, samples), center)) for (center, samples) in zip(eachcol(centers), cluster_samples) @@ -193,7 +189,7 @@ function _inner_inertia(X, centers, cluster_samples, distance) # hard clustering return inner_inertia end -function _inner_inertia(X, centers, weights, fuzziness, distance) # soft clustering, shared between soft clustering calinski_harabasz and xie_beni +function _inner_inertia(X, centers, weights, fuzziness, distance) # shared between soft clustering calinski_harabasz and xie_beni n, k = size(X, 2), size(centers, 2) w_idx1, w_idx2 = axes(weights) pointCentreDistances = pairwise(distance, eachcol(X), eachcol(centers)) diff --git a/src/mutualinfo.jl b/src/mutualinfo.jl index f50a7e4f..65b0a527 100644 --- a/src/mutualinfo.jl +++ b/src/mutualinfo.jl @@ -35,7 +35,7 @@ If `normed` parameter is `true` the return value is the normalized mutual inform see "Data Mining Practical Machine Tools and Techniques", Witten & Frank 2005. # References -> Vinh, Epps, and Bailey, (2009). “Information theoretic measures for clusterings comparison”. -Proceedings of the 26th Annual International Conference on Machine Learning - ICML ‘09. +> Vinh, Epps, and Bailey, (2009). *Information theoretic measures for clusterings comparison*. +> Proceedings of the 26th Annual International Conference on Machine Learning - ICML ‘09. 
""" mutualinfo(a, b; normed::Bool=true) = _mutualinfo(counts(a, b), normed) diff --git a/test/clustering_quality.jl b/test/clustering_quality.jl new file mode 100644 index 00000000..53df5554 --- /dev/null +++ b/test/clustering_quality.jl @@ -0,0 +1,58 @@ +using Test +using Clustering, Distances +using OffsetArrays + +@testset "clustering_quality()" begin + + # test data with 4 clusters + + Y = [-2 4; 2 4; 2 1; 3 0; 2 -1; 1 0; 2 -4; -2 -4; -2 1; -1 0; -2 -1; -3 0] + C = [0 4; 2 0; 0 -4; -2 0] + A = [1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4] + W = [ + 1 0 0 0 + 1 0 0 0 + 0 1 0 0 + 0 1 0 0 + 0 1 0 0 + 0 1 0 0 + 0 0 1 0 + 0 0 1 0 + 0 0 0 1 + 0 0 0 1 + 0 0 0 1 + 0 0 0 1 + ] + + # visualisation of the data + # using Plots + # scatter(Y[:,1],Y[:,2], + # axisratio = :equal, + # #seriescolor = palette(default)[A], + # ) + # scatter!(C[:,1],C[:,2], + # marker = :square, + # label = "cluster centers", + # ) + + @testset "input checks" begin + @test_throws ArgumentError clustering_quality(zeros(2,2), zeros(2,3), [1, 2], quality_index = :calinski_harabasz) + @test_throws DimensionMismatch clustering_quality(zeros(2,2),zeros(3,2), [1, 2], quality_index = :calinski_harabasz) + @test_throws ArgumentError clustering_quality(zeros(2,2),zeros(2,1), [1, ], quality_index = :calinski_harabasz) + @test_throws ArgumentError clustering_quality(zeros(2,2),zeros(2,2), [1, 2], quality_index = :calinski_harabasz) + @test_throws DimensionMismatch clustering_quality([1,2,3], zeros(2,2), quality_index = :dunn) + end + + @testset "correct index values" begin + @test clustering_quality(Y', C', A, Euclidean(), quality_index = :calinski_harabasz) ≈ (32/3) / (16/8) + @test clustering_quality(Y', C', W, 2, Euclidean(), quality_index = :calinski_harabasz) ≈ (32/3) / (16/8) + + @test clustering_quality(Y', C', A, Euclidean(), quality_index = :davies_bouldin) ≈ 3/2√5 + + @test clustering_quality(Y', C', A, Euclidean(), quality_index = :xie_beni) ≈ 1/3 + @test clustering_quality(Y', C', W, 2, Euclidean(), quality_index = :xie_beni) ≈ 1/3 + + @test clustering_quality(Y', A, Euclidean(), quality_index = :dunn) ≈ 1/2 + end + +end \ No newline at end of file From c78bfeef3d95812b287b0d1cec1c5d1ca48e58d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 18 Jul 2023 18:05:46 +0200 Subject: [PATCH 30/82] Update Clustering.jl --- src/Clustering.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Clustering.jl b/src/Clustering.jl index a9469eb6..0a03429a 100644 --- a/src/Clustering.jl +++ b/src/Clustering.jl @@ -89,7 +89,7 @@ module Clustering include("counts.jl") include("silhouette.jl") - include("qualityindices.jl") + include("clustering_quality.jl") include("randindex.jl") include("varinfo.jl") From 6579f8ebb70a1476534987c1579253b3ad6bd959 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 18 Jul 2023 18:45:05 +0200 Subject: [PATCH 31/82] docs update --- docs/source/validate.md | 78 ++++++++++++++++++++++++++++++++++ examples/clustering_quality.jl | 6 +-- 2 files changed, 81 insertions(+), 3 deletions(-) diff --git a/docs/source/validate.md b/docs/source/validate.md index 620fe9de..8dc85500 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -112,7 +112,85 @@ where > Aybükë Oztürk, Stéphane Lallich, Jérôme Darmont. (2018). *A Visual Quality Index for Fuzzy C-Means*. 
14th International Conference on Artificial Intelligence Applications and Innovations (AIAI 2018). 546-555. [doi:10.1007/978-3-319-92007-8_46](https://doi.org/10.1007/978-3-319-92007-8_46). +### Examples + +Exemplary data with 3 clusters. +```@example +using Plots, Clustering +X = hcat([4., 5.] .+ 0.4 * randn(2, 10), + [9., -5.] .+ 0.4 * randn(2, 5), + [-4., -9.] .+ 1 * randn(2, 5)) + + +scatter(X[1,:],X[2,:], + label = "exemplary data points", + xlabel = "x", + ylabel = "y", + legend = :right, +) +``` + +Hard clustering quality for number of clusters in `2:5` + +```@example +using Plots, Clustering +X = hcat([4., 5.] .+ 0.4 * randn(2, 10), + [9., -5.] .+ 0.4 * randn(2, 5), + [-4., -9.] .+ 1 * randn(2, 5)) + +clusterings = kmeans.(Ref(X), 2:5) +hard_indices = [:silhouette, :calinski_harabasz, :xie_beni, :davies_bouldin, :dunn] + +kmeans_quality = + Dict(qidx => clustering_quality.(Ref(X), clusterings, quality_index = qidx) + for qidx in hard_indices + ) + +p = [ + plot(2:5, kmeans_quality[qidx], + marker = :circle, + title = string.(qidx), + label = nothing, + ) + for qidx in hard_indices +] +plot(p..., + layout = (3,2), + plot_title = "Quality indices for various number of clusters" +) +``` + +Soft clustering quality for number of clusters in `2:5` +```@example +using Plots, Clustering +X = hcat([4., 5.] .+ 0.4 * randn(2, 10), + [9., -5.] .+ 0.4 * randn(2, 5), + [-4., -9.] .+ 1 * randn(2, 5)) + +fuzziness = 2 +soft_indices = [:calinski_harabasz, :xie_beni] +fuzzy_clusterings = fuzzy_cmeans.(Ref(X), 2:5, fuzziness) + +fuzzy_cmeans_quality = + Dict(qidx => clustering_quality.(Ref(X), fuzzy_clusterings, fuzziness, quality_index = qidx) + for qidx in soft_indices + ) + + +p = [ + plot(2:5, fuzzy_cmeans_quality[qidx], + marker = :circle, + title = string.(qidx), + label = nothing, + ) + for qidx in soft_indices +] +plot(p..., + layout = (2,1), + plot_title = "Quality indices for various number of clusters" +) +``` ```@docs clustering_quality diff --git a/examples/clustering_quality.jl b/examples/clustering_quality.jl index 461e5ea2..5a3f73af 100644 --- a/examples/clustering_quality.jl +++ b/examples/clustering_quality.jl @@ -1,5 +1,4 @@ -using Plots, Clustering, Distances, Statistics -using LinearAlgebra +using Plots, Clustering ## visualisation of the exemplary data ## there are 3 real clusters @@ -13,6 +12,7 @@ scatter(X[1,:],X[2,:], label = "exemplary data points", xlabel = "x", ylabel = "y", + legend = :right, ) ## hard clustering quality for number of clusters in 2:5 @@ -52,7 +52,7 @@ fuzzy_cmeans_quality = p = [ - plot(2:5, [kmeans_quality[qidx] ], + plot(2:5, fuzzy_cmeans_quality[qidx], marker = :circle, title = string.(qidx), label = nothing, From 3b6c9e800b173ccd1d8a062349f283f59fe63379 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Sat, 29 Jul 2023 16:25:55 +0200 Subject: [PATCH 32/82] Update clustering_quality.jl --- examples/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/clustering_quality.jl b/examples/clustering_quality.jl index 5a3f73af..73a73220 100644 --- a/examples/clustering_quality.jl +++ b/examples/clustering_quality.jl @@ -27,7 +27,7 @@ kmeans_quality = p = [ - plot(2:5, [kmeans_quality[qidx] ], + plot(2:5, kmeans_quality[qidx], marker = :circle, title = string.(qidx), label = nothing, From b535f367cf166753addbf226f5415cfef6076057 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Sun, 8 
Oct 2023 00:06:59 +0200 Subject: [PATCH 33/82] Update clustering_quality.jl --- src/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index f26b5a3d..a7b8b3a8 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -242,7 +242,7 @@ function _cluquality_calinski_harabasz( return (outer_intertia / (k - 1)) / (inner_intertia / (n - k)) end -# Davies-Bouldin idex +# Davies-Bouldin index function _cluquality_davies_bouldin( X::AbstractMatrix{<:Real}, From 9da97270fd08bbbad6059a8ee170e64da35c278a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 9 Oct 2023 18:14:57 +0200 Subject: [PATCH 34/82] Update src/clustering_quality.jl Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index f26b5a3d..fa5d944a 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -252,7 +252,7 @@ function _cluquality_davies_bouldin( ) k = size(centers, 2) - c_idx = axes(centers,2) + c_idx = axes(centers, 2) cluster_samples = _gather_samples(assignments, k) From 10c74aba6f40b6771d3e10355f453f4afcfcfb08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 9 Oct 2023 18:16:58 +0200 Subject: [PATCH 35/82] Update src/clustering_quality.jl Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index fa5d944a..9ea149d7 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -11,7 +11,7 @@ Compute chosen quality index value for a hard clustering. - `centers::AbstractMatrix`: ``d×k`` matrix with cluster centers represented as columns - `assignments::AbstractVector{Int}`: ``n`` vector of point assignments (cluster indices) - `kmeans_clustering::KmeansResult`: the output of kmeans method - - `distance::SemiMetric=SqEuclidean()`: : `SemiMetric` object that defines the distance between the data points + - `distance::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the distance between the data points - `quality_index::Symbol`: chosen quality index # Available quality indices: From f9217a7d6d7f0765e6df5b0487db7464727764fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 9 Oct 2023 18:19:26 +0200 Subject: [PATCH 36/82] Update src/clustering_quality.jl Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 9ea149d7..12364aa2 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -12,7 +12,7 @@ Compute chosen quality index value for a hard clustering. - `assignments::AbstractVector{Int}`: ``n`` vector of point assignments (cluster indices) - `kmeans_clustering::KmeansResult`: the output of kmeans method - `distance::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the distance between the data points - - `quality_index::Symbol`: chosen quality index + - `quality_index::Symbol`: quality index to calculate; see below for the supported options # Available quality indices: Depending on the index higher (↑) or lower (↓) value suggests better clustering quality. 
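A minimal usage sketch of the hard-clustering entry point documented in the hunks above, on hypothetical data (not part of the patch). It assumes the API as it stands at this point in the series, with the distance passed positionally and the index chosen via the `quality_index` keyword; later commits in this series switch to a `metric` keyword.

```julia
using Clustering, Distances

X = rand(2, 100)   # hypothetical 2×100 data matrix, one point per column
R = kmeans(X, 3)   # hard clustering into 3 clusters

# higher Calinski-Harabasz values suggest better-separated clusters
clustering_quality(X, R.centers, R.assignments, SqEuclidean();
                   quality_index = :calinski_harabasz)

# equivalent convenience form taking the KmeansResult directly
clustering_quality(X, R, SqEuclidean(); quality_index = :calinski_harabasz)
```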
From 8c2a451543f60a33cd73da2e0765bdcb16bf9d09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 9 Oct 2023 18:20:02 +0200 Subject: [PATCH 37/82] Update src/clustering_quality.jl Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 12364aa2..8ff3ec75 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -10,7 +10,7 @@ Compute chosen quality index value for a hard clustering. - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point - `centers::AbstractMatrix`: ``d×k`` matrix with cluster centers represented as columns - `assignments::AbstractVector{Int}`: ``n`` vector of point assignments (cluster indices) - - `kmeans_clustering::KmeansResult`: the output of kmeans method + - `clustering::ClusteringResult`: the output of the clustering method - `distance::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the distance between the data points - `quality_index::Symbol`: quality index to calculate; see below for the supported options From 9c33d000d873573665f343ccaf721025cd016ef8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 9 Oct 2023 18:21:46 +0200 Subject: [PATCH 38/82] Update src/clustering_quality.jl Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 8ff3ec75..9e45fe45 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -55,7 +55,7 @@ function clustering_quality( elseif quality_index ∈ (:dunn, :Dunn, :d) _cluquality_dunn(assignments, pairwise(distance, eachcol(X))) else - error(ArgumentError("Quality index $quality_index not available.")) + throw(ArgumentError("Quality index $quality_index not supported.")) end end end From de6aee0a331077d0792c9ebdf6e148960a032279 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 9 Oct 2023 18:23:45 +0200 Subject: [PATCH 39/82] Update src/clustering_quality.jl Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 9e45fe45..b8545567 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -4,7 +4,7 @@ clustering_quality(X, centers, assignments, [distance;] quality_index) clustering_quality(X, kmeans_clustering, [distance;] quality_index) -Compute chosen quality index value for a hard clustering. +Compute the clustering quality index for a given clustering. 
# Arguments - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point From 673b73398d8c96d71b3f628ed0f3494ce7601036 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 9 Oct 2023 18:24:12 +0200 Subject: [PATCH 40/82] Update src/clustering_quality.jl Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index b8545567..b1828d9d 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -111,7 +111,7 @@ function clustering_quality( elseif quality_index ∈ (:xie_beni, :Xie_Beni, :xb) _cluquality_xie_beni(X, centers, weights, fuzziness, distance) else - error(ArgumentError("Quality index $quality_index not available.")) + throw(ArgumentError("Quality index $quality_index not supported.")) end end From 8e33d535924530c32725fb77fb05fda398026943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 9 Oct 2023 19:10:52 +0200 Subject: [PATCH 41/82] Update src/clustering_quality.jl Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index b1828d9d..ac8b4516 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -239,7 +239,7 @@ function _cluquality_calinski_harabasz( inner_intertia = _inner_inertia(X, centers, weights, fuzziness, distance) - return (outer_intertia / (k - 1)) / (inner_intertia / (n - k)) + return (outer_intertia / inner_inertia) * (n - k) / (k - 1) end # Davies-Bouldin idex From e87e2b49a20c14726d0fdc8f16bc83384236c1fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 9 Oct 2023 22:42:50 +0200 Subject: [PATCH 42/82] Apply suggestions from code review Co-authored-by: Alexey Stukalov --- docs/source/validate.md | 1 - src/clustering_quality.jl | 17 +++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/validate.md b/docs/source/validate.md index 8dc85500..868797a8 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -60,7 +60,6 @@ A group of clustering evaluation metrics which are intrinsic, i.e. depend only o The data points are denoted by ``x_1,x_2,\ldots, x_n``, clusters by ``C_1,C_2,\ldots,C_k`` and their centers by ``c_j``, ``c`` is global center of the dataset, ``d`` is a given similarity (distance) function. For soft (fuzzy) clustering ``w_{ij}`` are weights measuring membership of point ``x_i`` to cluster ``C_j`` and ``m`` is the fuzziness parameter. Arrow up (↑) or down (↓) indicate if higher or lower index values indicate better quality. -Given this notation, available indices and their definitions are: ### Average silhouette index (↑) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index ac8b4516..ba86843e 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -14,11 +14,12 @@ Compute the clustering quality index for a given clustering. 
- `distance::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the distance between the data points - `quality_index::Symbol`: quality index to calculate; see below for the supported options -# Available quality indices: -Depending on the index higher (↑) or lower (↓) value suggests better clustering quality. +# Supported quality indices + +Please refer to the [documentation](@ref clustering_quality) for the extended description of the quality indices. -- `:silhouettes`: average silhouette index (↑), for all silhouettes use `silhouettes` method instead -- `:calinski_harabasz`: Calinski-Harabsz index (↑) returns corrected ratio between inertia between cluster centers and inertia within clusters +- `:silhouettes`: average silhouette index, for all silhouettes use [`silhouettes`](@ref) method instead +- `:calinski_harabasz`: Calinski-Harabsz index, the corrected ratio of inertia between cluster centers and within-clusters inertia - `:xie_beni`: Xie-Beni index (↓) returns ratio betwen inertia within clusters and minimal distance between cluster centers - `:davies_bouldin`: Davies-Bouldin index (↓) returns average similarity between each cluster and its most similar one, averaged over all the clusters - `:silhouettes`: average silhouette index (↑), to obtain all silhouettes use `silhouettes` function instead, it does not make use of `centers` argument @@ -43,7 +44,7 @@ function clustering_quality( (assignments[i] in axes(centers, 2)) || throw(ArgumentError("Bad assignments[$i]=$(assignments[i]) is not a valid index for `X`.")) end - if quality_index ∈ (:calinski_harabasz, :Calinski_Harabasz, :ch) + if quality_index == :calinski_harabasz _cluquality_calinski_harabasz(X, centers, assignments, distance) elseif quality_index ∈ (:xie_beni, :Xie_Beni, :xb) _cluquality_xie_beni(X, centers, assignments, distance) @@ -106,7 +107,7 @@ function clustering_quality( all(>=(0), weights) || throw(ArgumentError("All weights must be larger or equal 0.")) 1 < fuzziness || throw(ArgumentError("Fuzziness must be greater than 1 ($fuzziness given)")) - if quality_index ∈ (:calinski_harabasz, :Calinski_Harabasz, :ch) + if quality_index == :calinski_harabasz _cluquality_calinski_harabasz(X, centers, weights, fuzziness, distance) elseif quality_index ∈ (:xie_beni, :Xie_Beni, :xb) _cluquality_xie_beni(X, centers, weights, fuzziness, distance) @@ -152,7 +153,7 @@ function clustering_quality( n == m || throw(ArgumentError("Distance matrix must be square.")) n == na || throw(DimensionMismatch("Inconsistent array dimensions for distance matrix and assignments.")) - if quality_index ∈ (:silhouettes, :silhouette, :s) + if quality_index == :silhouettes mean(silhouettes(assignments, dist)) elseif quality_index ∈ (:dunn, :Dunn, :d) _cluquality_dunn(assignments, dist) @@ -228,7 +229,7 @@ function _cluquality_calinski_harabasz( distance::SemiMetric=SqEuclidean() ) - n, k = size(X, 2), size(centers,2) + n, k = size(X, 2), size(centers, 2) w_idx1, w_idx2 = axes(weights) global_center = vec(mean(X, dims=2)) From d6164622d7bcc6f8c74a0f790aef5a360580bde1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 10 Oct 2023 00:02:48 +0200 Subject: [PATCH 43/82] removing aliases --- src/clustering_quality.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index ba86843e..c0cdc06c 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -46,14 +46,14 @@ function 
clustering_quality( if quality_index == :calinski_harabasz _cluquality_calinski_harabasz(X, centers, assignments, distance) - elseif quality_index ∈ (:xie_beni, :Xie_Beni, :xb) + elseif quality_index == :xie_beni _cluquality_xie_beni(X, centers, assignments, distance) - elseif quality_index ∈ (:davies_bouldin, :Davies_Bouldin, :db) + elseif quality_index == :davies_bouldin _cluquality_davies_bouldin(X, centers, assignments, distance) - else quality_index ∈ (:davies_bouldin, :Davies_Bouldin, :db) - if quality_index ∈ (:silhouettes, :silhouette, :s) + else quality_index == :davies_bouldin + if quality_index == :silhouettes mean(silhouettes(assignments, pairwise(distance, eachcol(X)))) - elseif quality_index ∈ (:dunn, :Dunn, :d) + elseif quality_index == :dunn _cluquality_dunn(assignments, pairwise(distance, eachcol(X))) else throw(ArgumentError("Quality index $quality_index not supported.")) @@ -155,7 +155,7 @@ function clustering_quality( if quality_index == :silhouettes mean(silhouettes(assignments, dist)) - elseif quality_index ∈ (:dunn, :Dunn, :d) + elseif quality_index == :dunn _cluquality_dunn(assignments, dist) else error(ArgumentError("Quality index $quality_index not available.")) From df61f017c95fb4eb4436ca687f7f8c1da7497e54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 10 Oct 2023 00:34:44 +0200 Subject: [PATCH 44/82] Update examples --- examples/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/clustering_quality.jl b/examples/clustering_quality.jl index 73a73220..6b2f8294 100644 --- a/examples/clustering_quality.jl +++ b/examples/clustering_quality.jl @@ -18,7 +18,7 @@ scatter(X[1,:],X[2,:], ## hard clustering quality for number of clusters in 2:5 clusterings = kmeans.(Ref(X), 2:5) -hard_indices = [:silhouette, :calinski_harabasz, :xie_beni, :davies_bouldin, :dunn] +hard_indices = [:silhouettes, :calinski_harabasz, :xie_beni, :davies_bouldin, :dunn] kmeans_quality = Dict(qidx => clustering_quality.(Ref(X), clusterings, quality_index = qidx) From 07713097e49d2c6a3ef0f35ca3f88f5088d66855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 10 Oct 2023 13:39:37 +0200 Subject: [PATCH 45/82] Apply suggestions from code review Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index c0cdc06c..f072bb3d 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -1,13 +1,13 @@ """ - clustering_quality(X, centers, assignments, [distance;] quality_index) - clustering_quality(X, kmeans_clustering, [distance;] quality_index) + clustering_quality(X, centers, assignments; quality_index, [metric]) + clustering_quality(data, clustering; quality_index, [metric]) Compute the clustering quality index for a given clustering. 
# Arguments - - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point + - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point if `metric` is provided; otherwise ``n×n`` matrix of distances between the points - `centers::AbstractMatrix`: ``d×k`` matrix with cluster centers represented as columns - `assignments::AbstractVector{Int}`: ``n`` vector of point assignments (cluster indices) - `clustering::ClusteringResult`: the output of the clustering method @@ -65,8 +65,8 @@ clustering_quality(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMet clustering_quality(X, R.centers, R.assignments, distance; quality_index = quality_index) """ - clustering_quality(X, centers, weights, fuzziness, [distance;] quality_index) - clustering_quality(X, fuzzy_cmeans_clustering, fuzziness, [distance;] quality_index) + clustering_quality(data, centers, weights; quality_index, fuzziness, [metric]) + clustering_quality(data, clustering; quality_index, fuzziness, [metric]) Compute chosen quality index value for a soft (fuzzy) clustering From 4ea03e6a375679daca6422907fde96b5866741d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 10 Oct 2023 13:42:35 +0200 Subject: [PATCH 46/82] Apply suggestions from code review Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index f072bb3d..e8d623f6 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -123,8 +123,8 @@ clustering_quality(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::R clustering_quality(assignments, dist_matrix; quality_index) clustering_quality(clustering, dist_matrix; quality_index) - clustering_quality(data, assignments, [distance;] quality_index) - clustering_quality(data, clustering, [distance;] quality_index) + clustering_quality(data, assignments; quality_index, [metric]) + clustering_quality(data, clustering; quality_index, [metric]) Compute chosen quality index value for a clustering in a case cluster centres may be not known. From 734df67c8eb47ee226307cac6784fc5d5fbf5e43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 10 Oct 2023 17:19:59 +0200 Subject: [PATCH 47/82] distance->metric --- .gitignore | 1 + src/clustering_quality.jl | 84 +++++++++++++++++++------------------- test/clustering_quality.jl | 12 +++--- 3 files changed, 49 insertions(+), 48 deletions(-) diff --git a/.gitignore b/.gitignore index 0b53ec64..15ddac13 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ doc/build Manifest.toml +*.swp diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index e8d623f6..bc6e95ca 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -11,7 +11,7 @@ Compute the clustering quality index for a given clustering. 
- `centers::AbstractMatrix`: ``d×k`` matrix with cluster centers represented as columns - `assignments::AbstractVector{Int}`: ``n`` vector of point assignments (cluster indices) - `clustering::ClusteringResult`: the output of the clustering method - - `distance::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the distance between the data points + - `metric::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the distance between the data points - `quality_index::Symbol`: quality index to calculate; see below for the supported options # Supported quality indices @@ -29,9 +29,9 @@ Please refer to the [documentation](@ref clustering_quality) for the extended de function clustering_quality( X::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, - assignments::AbstractVector{<:Integer}, - distance::SemiMetric=SqEuclidean(); - quality_index::Symbol + assignments::AbstractVector{<:Integer}; + quality_index::Symbol, + metric::SemiMetric=SqEuclidean() ) d, n = size(X) dc, k = size(centers) @@ -45,24 +45,24 @@ function clustering_quality( end if quality_index == :calinski_harabasz - _cluquality_calinski_harabasz(X, centers, assignments, distance) + _cluquality_calinski_harabasz(X, centers, assignments, metric) elseif quality_index == :xie_beni - _cluquality_xie_beni(X, centers, assignments, distance) + _cluquality_xie_beni(X, centers, assignments, metric) elseif quality_index == :davies_bouldin - _cluquality_davies_bouldin(X, centers, assignments, distance) + _cluquality_davies_bouldin(X, centers, assignments, metric) else quality_index == :davies_bouldin if quality_index == :silhouettes - mean(silhouettes(assignments, pairwise(distance, eachcol(X)))) + mean(silhouettes(assignments, pairwise(metric, eachcol(X)))) elseif quality_index == :dunn - _cluquality_dunn(assignments, pairwise(distance, eachcol(X))) + _cluquality_dunn(assignments, pairwise(metric, eachcol(X))) else throw(ArgumentError("Quality index $quality_index not supported.")) end end end -clustering_quality(X::AbstractMatrix{<:Real}, R::KmeansResult, distance::SemiMetric=SqEuclidean(); quality_index::Symbol) = - clustering_quality(X, R.centers, R.assignments, distance; quality_index = quality_index) +clustering_quality(X::AbstractMatrix{<:Real}, R::KmeansResult; quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = + clustering_quality(X, R.centers, R.assignments; quality_index = quality_index, metric = metric) """ clustering_quality(data, centers, weights; quality_index, fuzziness, [metric]) @@ -89,10 +89,10 @@ Compute chosen quality index value for a soft (fuzzy) clustering function clustering_quality( X::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, - weights::AbstractMatrix{<:Real}, + weights::AbstractMatrix{<:Real}; + quality_index::Symbol, fuzziness::Real, - distance::SemiMetric=SqEuclidean(); - quality_index::Symbol + metric::SemiMetric=SqEuclidean() ) d, n = size(X) dc, k = size(centers) @@ -108,16 +108,16 @@ function clustering_quality( 1 < fuzziness || throw(ArgumentError("Fuzziness must be greater than 1 ($fuzziness given)")) if quality_index == :calinski_harabasz - _cluquality_calinski_harabasz(X, centers, weights, fuzziness, distance) + _cluquality_calinski_harabasz(X, centers, weights, fuzziness, metric) elseif quality_index ∈ (:xie_beni, :Xie_Beni, :xb) - _cluquality_xie_beni(X, centers, weights, fuzziness, distance) + _cluquality_xie_beni(X, centers, weights, fuzziness, metric) else throw(ArgumentError("Quality index $quality_index not supported.")) end end 
-clustering_quality(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult, fuzziness::Real, distance::SemiMetric=SqEuclidean(); quality_index::Symbol) = - clustering_quality(X, R.centers, R.weights, fuzziness, distance; quality_index) +clustering_quality(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult; quality_index::Symbol, fuzziness::Real, metric::SemiMetric=SqEuclidean()) = + clustering_quality(X, R.centers, R.weights; quality_index = quality_index, fuzziness = fuzziness, metric = metric) """ @@ -132,7 +132,7 @@ Compute chosen quality index value for a clustering in a case cluster centres ma - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point - `assignments::AbstractVector{Int}`: the vector of point assignments (cluster indices) - `dist_matrix::AbstractMatrix`: a ``n×n`` pairwise distance matrix; `dist_matrix[i,j]` is the distance between ``i``-th and ``j``-th points. - - `distance::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the distance between the data points + - `metric::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the distance between the data points - `clustering::ClusteringResult`: the output of some clustering method - `quality_index::Symbol`: chosen quality index @@ -163,11 +163,11 @@ function clustering_quality( end -clustering_quality(X::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, distance::SemiMetric=SqEuclidean(); quality_index::Symbol) = - clustering_quality(assignments, pairwise(distance,eachcol(X)); quality_index = quality_index) +clustering_quality(X::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}; quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = + clustering_quality(assignments, pairwise(metric,eachcol(X)); quality_index = quality_index) -clustering_quality(X::AbstractMatrix{<:Real}, R::ClusteringResult, distance::SemiMetric=SqEuclidean(); quality_index::Symbol) = - clustering_quality(R.assignments, pairwise(distance,eachcol(X)); quality_index = quality_index) +clustering_quality(X::AbstractMatrix{<:Real}, R::ClusteringResult; quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = + clustering_quality(R.assignments, pairwise(metric,eachcol(X)); quality_index = quality_index) clustering_quality(R::ClusteringResult, dist::AbstractMatrix{<:Real}; quality_index::Symbol) = clustering_quality(R.assignments, dist; quality_index = quality_index) @@ -182,18 +182,18 @@ function _gather_samples(assignments, k) # cluster_samples[j]: indices of points end -function _inner_inertia(X, centers, cluster_samples, distance) # shared between hard clustering calinski_harabasz and xie_beni +function _inner_inertia(X, centers, cluster_samples, metric) # shared between hard clustering calinski_harabasz and xie_beni inner_inertia = sum( - sum(colwise(distance, view(X, :, samples), center)) + sum(colwise(metric, view(X, :, samples), center)) for (center, samples) in zip(eachcol(centers), cluster_samples) ) return inner_inertia end -function _inner_inertia(X, centers, weights, fuzziness, distance) # shared between soft clustering calinski_harabasz and xie_beni +function _inner_inertia(X, centers, weights, fuzziness, metric) # shared between soft clustering calinski_harabasz and xie_beni n, k = size(X, 2), size(centers, 2) w_idx1, w_idx2 = axes(weights) - pointCentreDistances = pairwise(distance, eachcol(X), eachcol(centers)) + pointCentreDistances = pairwise(metric, eachcol(X), eachcol(centers)) inner_inertia = sum( weights[i₁,j₁]^fuzziness * 
pointCentreDistances[i₂,j₂] for (i₁,i₂) in zip(w_idx1,1:n), (j₁,j₂) in zip(w_idx2, 1:k) ) @@ -206,17 +206,17 @@ function _cluquality_calinski_harabasz( X::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, - distance::SemiMetric=SqEuclidean() + metric::SemiMetric=SqEuclidean() ) n, k = size(X, 2), size(centers, 2) cluster_samples = _gather_samples(assignments, k) global_center = vec(mean(X, dims=2)) - center_distances = colwise(distance, centers, global_center) + center_distances = colwise(metric, centers, global_center) outer_inertia = length.(cluster_samples) ⋅ center_distances - inner_inertia = _inner_inertia(X, centers, cluster_samples, distance) + inner_inertia = _inner_inertia(X, centers, cluster_samples, metric) return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end @@ -226,19 +226,19 @@ function _cluquality_calinski_harabasz( centers::AbstractMatrix{<:Real}, weights::AbstractMatrix{<:Real}, fuzziness::Real, - distance::SemiMetric=SqEuclidean() + metric::SemiMetric=SqEuclidean() ) n, k = size(X, 2), size(centers, 2) w_idx1, w_idx2 = axes(weights) global_center = vec(mean(X, dims=2)) - center_distances = colwise(distance, centers, global_center) + center_distances = colwise(metric, centers, global_center) outer_intertia = sum( weights[i,j₁]^fuzziness * center_distances[j₂] for i in w_idx1, (j₁,j₂) in zip(w_idx2, 1:k) ) - inner_intertia = _inner_inertia(X, centers, weights, fuzziness, distance) + inner_inertia = _inner_inertia(X, centers, weights, fuzziness, metric) return (outer_intertia / inner_inertia) * (n - k) / (k - 1) end @@ -249,7 +249,7 @@ function _cluquality_davies_bouldin( X::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, - distance::SemiMetric=SqEuclidean() + metric::SemiMetric=SqEuclidean() ) k = size(centers, 2) @@ -257,8 +257,8 @@ function _cluquality_davies_bouldin( cluster_samples = _gather_samples(assignments, k) - cluster_diameters = [mean(colwise(distance,view(X, :, sample), centers[:,j])) for (j, sample) in zip(c_idx, cluster_samples) ] - center_distances = pairwise(distance,centers) + cluster_diameters = [mean(colwise(metric,view(X, :, sample), centers[:,j])) for (j, sample) in zip(c_idx, cluster_samples) ] + center_distances = pairwise(metric,centers) DB = mean( maximum( (cluster_diameters[j₁] + cluster_diameters[j₂]) / center_distances[j₁,j₂] for j₂ in c_idx if j₂ ≠ j₁) @@ -274,15 +274,15 @@ function _cluquality_xie_beni( X::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, - distance::SemiMetric=SqEuclidean() + metric::SemiMetric=SqEuclidean() ) n, k = size(X, 2), size(centers,2) cluster_samples = _gather_samples(assignments, k) - inner_intertia = _inner_inertia(X, centers, cluster_samples, distance) + inner_intertia = _inner_inertia(X, centers, cluster_samples, metric) - center_distances = pairwise(distance,centers) + center_distances = pairwise(metric,centers) min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) return inner_intertia / (n * min_center_distance) @@ -293,14 +293,14 @@ function _cluquality_xie_beni( centers::AbstractMatrix{<:Real}, weights::AbstractMatrix{<:Real}, fuzziness::Real, - distance::SemiMetric=SqEuclidean() + metric::SemiMetric=SqEuclidean() ) n, k = size(X, 2), size(centers, 2) - inner_intertia = _inner_inertia(X, centers, weights, fuzziness, distance) + inner_intertia = _inner_inertia(X, centers, weights, fuzziness, metric) - center_distances = 
pairwise(distance, eachcol(centers)) + center_distances = pairwise(metric, eachcol(centers)) min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) return inner_intertia / (n * min_center_distance) diff --git a/test/clustering_quality.jl b/test/clustering_quality.jl index 53df5554..6afc62f9 100644 --- a/test/clustering_quality.jl +++ b/test/clustering_quality.jl @@ -44,15 +44,15 @@ using OffsetArrays end @testset "correct index values" begin - @test clustering_quality(Y', C', A, Euclidean(), quality_index = :calinski_harabasz) ≈ (32/3) / (16/8) - @test clustering_quality(Y', C', W, 2, Euclidean(), quality_index = :calinski_harabasz) ≈ (32/3) / (16/8) + @test clustering_quality(Y', C', A; quality_index = :calinski_harabasz, metric = Euclidean()) ≈ (32/3) / (16/8) + @test clustering_quality(Y', C', W; quality_index = :calinski_harabasz, fuzziness = 2, metric = Euclidean()) ≈ (32/3) / (16/8) - @test clustering_quality(Y', C', A, Euclidean(), quality_index = :davies_bouldin) ≈ 3/2√5 + @test clustering_quality(Y', C', A; quality_index = :davies_bouldin, metric = Euclidean()) ≈ 3/2√5 - @test clustering_quality(Y', C', A, Euclidean(), quality_index = :xie_beni) ≈ 1/3 - @test clustering_quality(Y', C', W, 2, Euclidean(), quality_index = :xie_beni) ≈ 1/3 + @test clustering_quality(Y', C', A; quality_index = :xie_beni, metric = Euclidean()) ≈ 1/3 + @test clustering_quality(Y', C', W; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ 1/3 - @test clustering_quality(Y', A, Euclidean(), quality_index = :dunn) ≈ 1/2 + @test clustering_quality(Y', A; quality_index = :dunn, metric = Euclidean()) ≈ 1/2 end end \ No newline at end of file From bc65829c78cae00891b6bbcde15a3e32878a780c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Fri, 27 Oct 2023 17:48:42 +0200 Subject: [PATCH 48/82] docs --- .gitignore | 5 + Project.toml | 2 + docs/source/validate.md | 80 ++++++++----- examples/clustering_quality.jl | 2 +- src/clustering_quality.jl | 199 +++++++++++++++------------------ 5 files changed, 150 insertions(+), 138 deletions(-) diff --git a/.gitignore b/.gitignore index 15ddac13..21255bf2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ doc/build Manifest.toml *.swp +.vscode/settings.json +docs/build/ +docs/build/assets/ +Project.toml +*.html diff --git a/Project.toml b/Project.toml index ec189ee3..a976b9a2 100644 --- a/Project.toml +++ b/Project.toml @@ -4,9 +4,11 @@ version = "0.15.4" [deps] Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/docs/source/validate.md b/docs/source/validate.md index 868797a8..0c969372 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -46,8 +46,8 @@ s_i = \frac{b_i - a_i}{\max(a_i, b_i)}, \ \text{where} from the ``i``-th point to the points in the ``k``-th cluster. Note that ``s_i \le 1``, and that ``s_i`` is close to ``1`` when the ``i``-th -point lies well within its own cluster. This property allows using -`mean(silhouettes(assignments, counts, X))` as a measure of clustering quality. 
+point lies well within its own cluster. This property allows using average silhouette value +`mean(silhouettes(assignments, counts, X))` as a measure of clustering quality; it is also available using `clustering_quality(...; quality_index = :silhouettes)` method. Higher values indicate better separation of clusters w.r.t. point distances. ```@docs @@ -58,35 +58,52 @@ silhouettes A group of clustering evaluation metrics which are intrinsic, i.e. depend only on the clustering itself. They can be used to compare different clustering algorithms or choose the optimal number of clusters. -The data points are denoted by ``x_1,x_2,\ldots, x_n``, clusters by ``C_1,C_2,\ldots,C_k`` and their centers by ``c_j``, ``c`` is global center of the dataset, ``d`` is a given similarity (distance) function. For soft (fuzzy) clustering ``w_{ij}`` are weights measuring membership of point ``x_i`` to cluster ``C_j`` and ``m`` is the fuzziness parameter. Arrow up (↑) or down (↓) indicate if higher or lower index values indicate better quality. -### Average silhouette index (↑) +| **index name** | **quality_index** | **type** | **direction** | **cluster centers** | +|:-----------------:|:--------------------:|:----------:|:-------------:|:-------------------:| +| Calinski-Harabasz | `:calinsky_harabasz` | hard/fuzzy | up | required | +| Xie-Beni | `:xie_beni` | hard/fuzzy | down | required | +| Davis-Bouldin | `:davis_bouldin` | hard | down | required | +| Dunn | `:dunn` | hard | up | not required | +| silhouettes | `:silhouettes` | hard | up | not required | -Option `:silhouettes`. The average over all silhouettes in the data set, see section **Silhouettes** for a more detailed description of the method. -### Calinski-Harabasz index (↑) +```@docs +Clustering.clustering_quality +``` + +Notation for the index definitions below: +- ``x_1,x_2,\ldots, x_n``: data points, +- ``C_1,C_2,\ldots,C_k``: clusters, +- ``c_j`` and ``c``: cluster centers and global dataset center, +- ``d``: a similarity (distance) function, +- ``w_{ij}``: weights measuring membership of a point ``x_i`` to a cluster ``C_j``, +- ``\alpha``: a fuzziness parameter. + +### Calinski-Harabasz index -Option `:calinski_harabasz`. Measures corrected ratio between the summed internal inertia of clusters divided by global inertia of the cluster centers. For hard clustering and soft (fuzzy) it is defined as +Option `:calinski_harabasz`. Higher values indicate better quality. Measures corrected ratio between global inertia of the cluster centers and the summed internal inertias of clusters. For hard and fuzzy (soft) clustering it is defined as ```math \frac{n-k}{k-1}\frac{\sum_{C_j}|C_j|d(c_j,c)}{\sum\limits_{C_j}\sum\limits_{x_i\in C_j} d(x_i,c_j)} \quad \text{and}\quad -\frac{n-k}{k-1} \frac{\sum_{C_j}\sum_{x_i} w_{ik}^md(x_i,c_j)}{\sum\limits_{C_j}\sum\limits_{x_i}w_{ij}^m d(c_j,c)} +\frac{n-k}{k-1} \frac{\sum_{C_j} \sum_{x_i} w_{ik}^\alpha d(x_i,c_j)}{\sum\limits_{C_j}\sum\limits_{x_i}w_{ij}^\alpha d(c_j,c)} ``` respectively. -### Xie-Beni index (↓) -Option `:xie_beni`. Measures ratio between summed inertia of clusters and minimum distance between cluster centres. For hard clustering and soft (fuzzy) clustering. It is defined as +### Xie-Beni index +Option `:xie_beni`. Lower values indicate better quality. Measures ratio between summed inertia of clusters and minimum distance between cluster centres. For hard clustering and fuzzy (soft) clustering. 
It is defined as ```math \frac{\sum_{C_j}\sum_{x_i\in C_j}d(x_i,c_j)}{n\min\limits_{c_{j_1}\neq c_{j_2}} d(c_{j_1},c_{j_2}) } \quad \text{and}\quad -\frac{\sum_{C_j}\sum_{x_i} w_{ij}^md(x_i,c_j)}{n\min\limits_{c_{j_1}\neq c_{j_2}} d(c_{j_1},c_{j_2}) } +\frac{\sum_{C_j}\sum_{x_i} w_{ij}^\alpha d(x_i,c_j)}{n\min\limits_{c_{j_1}\neq c_{j_2}} d(c_{j_1},c_{j_2}) } ``` respectively. -### [Davis-Bouldin index](https://en.wikipedia.org/wiki/Davies%E2%80%93Bouldin_index) (↓) -Option `:davis_bouldin`. It measures average cohesion based on the cluster diameters and distances between cluster centers. It is defined as + +### [Davis-Bouldin index](https://en.wikipedia.org/wiki/Davies%E2%80%93Bouldin_index) +Option `:davis_bouldin`. Lower values indicate better quality. It measures average cohesion based on the cluster diameters and distances between cluster centers. It is defined as ```math \frac{1}{k}\sum_{C_{j_1}}\max_{c_{j_2}\neq c_{j_1}}\frac{S(C_{j_1})+S(C_{j_2})}{d(c_{j_1},c_{j_2})} @@ -95,17 +112,22 @@ where ```math S(C_j) = \frac{1}{|C_j|}\sum_{x_i\in C_j}d(x_i,c_j). ``` -### [Dunn index](https://en.wikipedia.org/wiki/Dunn_index) (↑) -Option `:dunn`. More computationally demanding index which can be used when the centres are not known. It measures ratio between the nearest neighbour distance divided by the maximum cluster diameter. It is defined as +### [Dunn index](https://en.wikipedia.org/wiki/Dunn_index) +Option `:dunn`. Higher values indicate better quality. More computationally demanding index which can be used when the centres are not known. It measures ratio between the nearest neighbour distance divided by the maximum cluster diameter. It is defined as ```math -\frac{\min\limits_{ C_{j_1}\neq C_{j_2}} \delta(C_{j_1},C_{j_2})}{\max\limits_{C_j}\Delta(C_j)} +\frac{\min\limits_{ C_{j_1}\neq C_{j_2}} \mathrm{dist}(C_{j_1},C_{j_2})}{\max\limits_{C_j}\mathrm{diam}(C_j)} ``` - where ```math -\delta(C_{j_1},C_{j_2}) = \min\limits_{x_{i_1}\in C_{j_1},x_{i_2}\in C_{j_2}} d(x_{i_1},x_{i_2}),\quad \Delta(C_j) = \max\limits_{x_{i_1},x_{i_2}\in C_j} d(x_{i_1},x_{i_2}). +\mathrm{dist}(C_{j_1},C_{j_2}) = \min\limits_{x_{i_1}\in C_{j_1},x_{i_2}\in C_{j_2}} d(x_{i_1},x_{i_2}),\quad \mathrm{diam}(C_j) = \max\limits_{x_{i_1},x_{i_2}\in C_j} d(x_{i_1},x_{i_2}). ``` + +### Average silhouette index + +Option `:silhouettes`. Higher values indicate better quality. It returns the average over silhouette values in the whole data set. See section [Silhouettes](#silhouettes) for a more detailed description of the method. + + ### References > Olatz Arbelaitz *et al.* (2013). *An extensive comparative study of cluster validity indices*. Pattern Recognition. 46 1: 243-256. [doi:10.1016/j.patcog.2012.07.021](https://doi.org/10.1016/j.patcog.2012.07.021) @@ -113,7 +135,7 @@ where ### Examples -Exemplary data with 3 clusters. +Exemplary data with 3 real clusters. ```@example using Plots, Clustering X = hcat([4., 5.] .+ 0.4 * randn(2, 10), @@ -129,7 +151,7 @@ scatter(X[1,:],X[2,:], ) ``` -Hard clustering quality for number of clusters in `2:5` +Hard clustering quality for K-means method number of clusters in `2:5` ```@example using Plots, Clustering @@ -138,7 +160,7 @@ X = hcat([4., 5.] .+ 0.4 * randn(2, 10), [-4., -9.] 
.+ 1 * randn(2, 5)) clusterings = kmeans.(Ref(X), 2:5) -hard_indices = [:silhouette, :calinski_harabasz, :xie_beni, :davies_bouldin, :dunn] +hard_indices = [:silhouettes, :calinski_harabasz, :xie_beni, :davies_bouldin, :dunn] kmeans_quality = Dict(qidx => clustering_quality.(Ref(X), clusterings, quality_index = qidx) @@ -159,7 +181,7 @@ plot(p..., ) ``` -Soft clustering quality for number of clusters in `2:5` +Fuzzy clustering quality for fuzzy C-means method with number of clusters in `2:5` ```@example using Plots, Clustering X = hcat([4., 5.] .+ 0.4 * randn(2, 10), @@ -167,12 +189,12 @@ X = hcat([4., 5.] .+ 0.4 * randn(2, 10), [-4., -9.] .+ 1 * randn(2, 5)) fuzziness = 2 -soft_indices = [:calinski_harabasz, :xie_beni] +fuzzy_indices = [:calinski_harabasz, :xie_beni] fuzzy_clusterings = fuzzy_cmeans.(Ref(X), 2:5, fuzziness) fuzzy_cmeans_quality = - Dict(qidx => clustering_quality.(Ref(X), fuzzy_clusterings, fuzziness, quality_index = qidx) - for qidx in soft_indices + Dict(qidx => clustering_quality.(Ref(X), fuzzy_clusterings, fuzziness = fuzziness, quality_index = qidx) + for qidx in fuzzy_indices ) @@ -182,7 +204,7 @@ p = [ title = string.(qidx), label = nothing, ) - for qidx in soft_indices + for qidx in fuzzy_indices ] plot(p..., layout = (2,1), @@ -191,10 +213,6 @@ plot(p..., ``` -```@docs -clustering_quality -``` - ## Variation of Information [Variation of information](http://en.wikipedia.org/wiki/Variation_of_information) @@ -204,7 +222,7 @@ information*, but it is a true metric, *i.e.* it is symmetric and satisfies the triangle inequality. ```@docs -varinfo +Clustering.varinfo ``` diff --git a/examples/clustering_quality.jl b/examples/clustering_quality.jl index 6b2f8294..4a93b33f 100644 --- a/examples/clustering_quality.jl +++ b/examples/clustering_quality.jl @@ -46,7 +46,7 @@ soft_indices = [:calinski_harabasz, :xie_beni] fuzzy_clusterings = fuzzy_cmeans.(Ref(X), 2:5, fuzziness) fuzzy_cmeans_quality = - Dict(qidx => clustering_quality.(Ref(X), fuzzy_clusterings, fuzziness, quality_index = qidx) + Dict(qidx => clustering_quality.(Ref(X), fuzzy_clusterings, fuzziness = fuzziness, quality_index = qidx) for qidx in soft_indices ) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index bc6e95ca..56d459fb 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -1,105 +1,111 @@ - +# hard clustering indices interface + general docs """ - clustering_quality(X, centers, assignments; quality_index, [metric]) +For hard clustering: + + clustering_quality(data, centers, assignments; quality_index, [metric]) + clustering_quality(data, clustering; quality_index, [metric]) + +For fuzzy clustering: + + clustering_quality(data, centers, weights; quality_index, fuzziness, [metric]) + clustering_quality(data, clustering; quality_index, fuzziness, [metric]) + +For hard clustering without cluster centers known: + + clustering_quality(assignments, dist_matrix; quality_index) + clustering_quality(clustering, dist_matrix; quality_index) + clustering_quality(data, assignments; quality_index, [metric]) clustering_quality(data, clustering; quality_index, [metric]) Compute the clustering quality index for a given clustering. +Returns a real number which is the value of the chosen quality index type of the given clustering. 
+ # Arguments - - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point if `metric` is provided; otherwise ``n×n`` matrix of distances between the points + - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point - `centers::AbstractMatrix`: ``d×k`` matrix with cluster centers represented as columns - `assignments::AbstractVector{Int}`: ``n`` vector of point assignments (cluster indices) - - `clustering::ClusteringResult`: the output of the clustering method - - `metric::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the distance between the data points + - `weights::AbstractMatrix`: ``n×k`` matrix with fuzzy clustering weights, `weights[i,j]` is the degree of membership of ``i``-th data point to ``j``-th cluster + - `clustering::Union{ClusteringResult, FuzzyCMeansResult}`: the output of the clustering method - `quality_index::Symbol`: quality index to calculate; see below for the supported options + - `dist_matrix::AbstractMatrix`: a ``n×n`` pairwise distance matrix; `dist_matrix[i,j]` is the distance between ``i``-th and ``j``-th points -# Supported quality indices + # Keyword arguments + - `quality_index::Symbol`: quality index to calculate; see below for the supported options + - `fuzziness::Real`: clustering fuzziness > 1 + - `metric::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the metric/distance/similarity function + +When calling `clustering_quality` one can give `centers`, `assignments` or `weights` arguments by hand or provide a single `clustering` argument from which the necessary data will be read automatically. -Please refer to the [documentation](@ref clustering_quality) for the extended description of the quality indices. +For clustering without known cluster centers the datapoints are not required, only `dist_matrix` is necessary. If given, `data` and `metric` will be used to calculate distance matrix instead. -- `:silhouettes`: average silhouette index, for all silhouettes use [`silhouettes`](@ref) method instead -- `:calinski_harabasz`: Calinski-Harabsz index, the corrected ratio of inertia between cluster centers and within-clusters inertia -- `:xie_beni`: Xie-Beni index (↓) returns ratio betwen inertia within clusters and minimal distance between cluster centers +# Supported quality indices + +Symbols ↑/↓ are quality direction. 
+- `:calinski_harabasz`: hard or fuzzy Calinski-Harabsz index (↑) returns the corrected ratio of between cluster centers inertia and within-clusters inertia +- `:xie_beni`: hard or fuzzy Xie-Beni index (↓) returns ratio betwen inertia within clusters and minimal distance between cluster centers - `:davies_bouldin`: Davies-Bouldin index (↓) returns average similarity between each cluster and its most similar one, averaged over all the clusters -- `:silhouettes`: average silhouette index (↑), to obtain all silhouettes use `silhouettes` function instead, it does not make use of `centers` argument -- `:dunn`: Dunn index (↑) returns ratio between minimal distance between clusters and maximal cluster diameter, it does not make use of `centers` argument +- `:dunn`: Dunn index (↑) returns ratio between minimal distance between clusters and maximal cluster diameter; it does not make use of `centers` argument +- `:silhouettes`: average silhouette index (↑), for all silhouettes use [`silhouettes`](@ref) method instead; it does not make use of `centers` argument +Please refer to the [documentation](@ref clustering_quality) for the definitions and usage descriptions of the supported quality indices. """ function clustering_quality( - X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:Real}, - assignments::AbstractVector{<:Integer}; + data::AbstractMatrix{<:Real}, # d×n matrix + centers::AbstractMatrix{<:Real}, # d×k matrix + assignments::AbstractVector{<:Integer}; # n vector quality_index::Symbol, metric::SemiMetric=SqEuclidean() ) - d, n = size(X) + d, n = size(data) dc, k = size(centers) - d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `X` and `centers`.")) + d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `data` and `centers`.")) (1 <= k <= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) for i in eachindex(assignments) - (assignments[i] in axes(centers, 2)) || throw(ArgumentError("Bad assignments[$i]=$(assignments[i]) is not a valid index for `X`.")) + (assignments[i] in axes(centers, 2)) || throw(ArgumentError("Bad assignments[$i]=$(assignments[i]) is not a valid index for `data`.")) end if quality_index == :calinski_harabasz - _cluquality_calinski_harabasz(X, centers, assignments, metric) + _cluquality_calinski_harabasz(data, centers, assignments, metric) elseif quality_index == :xie_beni - _cluquality_xie_beni(X, centers, assignments, metric) + _cluquality_xie_beni(data, centers, assignments, metric) elseif quality_index == :davies_bouldin - _cluquality_davies_bouldin(X, centers, assignments, metric) + _cluquality_davies_bouldin(data, centers, assignments, metric) else quality_index == :davies_bouldin if quality_index == :silhouettes - mean(silhouettes(assignments, pairwise(metric, eachcol(X)))) + mean(silhouettes(assignments, pairwise(metric, eachcol(data)))) elseif quality_index == :dunn - _cluquality_dunn(assignments, pairwise(metric, eachcol(X))) + _cluquality_dunn(assignments, pairwise(metric, eachcol(data))) else throw(ArgumentError("Quality index $quality_index not supported.")) end end end -clustering_quality(X::AbstractMatrix{<:Real}, R::KmeansResult; quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = - clustering_quality(X, R.centers, R.assignments; 
quality_index = quality_index, metric = metric) +clustering_quality(data::AbstractMatrix{<:Real}, R::KmeansResult; quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = + clustering_quality(data, R.centers, R.assignments; quality_index = quality_index, metric = metric) -""" - clustering_quality(data, centers, weights; quality_index, fuzziness, [metric]) - clustering_quality(data, clustering; quality_index, fuzziness, [metric]) -Compute chosen quality index value for a soft (fuzzy) clustering +# fuzzy clustering indices interface -# Arguments - - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point - - `centers::AbstractMatrix`: ``d×k`` matrix with cluster centers represented as columns - - `weights::AbstractMatrix`: ``n×k`` matrix with fuzzy clustering weights, `weights[i,j]` is the degree of membership of ``i``-th data point to ``j``-th cluster - - `fuzziness::Real`: clustering fuzziness > 1 - - `fuzzy_cmeans_clustering::FuzzyCMeansResult`: the output of fuzzy_cmeans method - - `distance::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the distance between the data points - - `quality_index::Symbol`: chosen quality index - - # Available quality indices: - Depending on the index higher (↑) or lower (↓) value suggests better clustering quality. - - - `:calinski_harabasz`: Calinski-Harabsz index (↑) returns corrected ratio between inertia between cluster centers and inertia within clusters - - `:xie_beni`: Xie-Beni index (↓) returns ratio betwen inertia within clusters and minimal distance between cluster centers - -""" function clustering_quality( - X::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:Real}, - weights::AbstractMatrix{<:Real}; + data::AbstractMatrix{<:Real}, # d×n matrix + centers::AbstractMatrix{<:Real}, # d×k matrix + weights::AbstractMatrix{<:Real}; # n×k matrix quality_index::Symbol, fuzziness::Real, metric::SemiMetric=SqEuclidean() ) - d, n = size(X) + d, n = size(data) dc, k = size(centers) nw, kw = size(weights) - d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `X` and `centers`.")) - n == nw || throw(DimensionMismatch("Inconsistent data length for `X` and `weights`.")) + d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `data` and `centers`.")) + n == nw || throw(DimensionMismatch("Inconsistent data length for `data` and `weights`.")) k == kw || throw(DimensionMismatch("Inconsistent number of clusters for `centers` and `weights`.")) (1 <= k <= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) @@ -108,44 +114,23 @@ function clustering_quality( 1 < fuzziness || throw(ArgumentError("Fuzziness must be greater than 1 ($fuzziness given)")) if quality_index == :calinski_harabasz - _cluquality_calinski_harabasz(X, centers, weights, fuzziness, metric) + _cluquality_calinski_harabasz(data, centers, weights, fuzziness, metric) elseif quality_index ∈ (:xie_beni, :Xie_Beni, :xb) - _cluquality_xie_beni(X, centers, weights, fuzziness, metric) + _cluquality_xie_beni(data, centers, weights, fuzziness, metric) else throw(ArgumentError("Quality index $quality_index not supported.")) end end -clustering_quality(X::AbstractMatrix{<:Real}, R::FuzzyCMeansResult; quality_index::Symbol, fuzziness::Real, metric::SemiMetric=SqEuclidean()) = - clustering_quality(X, R.centers, R.weights; quality_index = quality_index, fuzziness = 
fuzziness, metric = metric) - -""" - - clustering_quality(assignments, dist_matrix; quality_index) - clustering_quality(clustering, dist_matrix; quality_index) - clustering_quality(data, assignments; quality_index, [metric]) - clustering_quality(data, clustering; quality_index, [metric]) - -Compute chosen quality index value for a clustering in a case cluster centres may be not known. - -# Arguments - - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point - - `assignments::AbstractVector{Int}`: the vector of point assignments (cluster indices) - - `dist_matrix::AbstractMatrix`: a ``n×n`` pairwise distance matrix; `dist_matrix[i,j]` is the distance between ``i``-th and ``j``-th points. - - `metric::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the distance between the data points - - `clustering::ClusteringResult`: the output of some clustering method - - `quality_index::Symbol`: chosen quality index +clustering_quality(data::AbstractMatrix{<:Real}, R::FuzzyCMeansResult; quality_index::Symbol, fuzziness::Real, metric::SemiMetric=SqEuclidean()) = + clustering_quality(data, R.centers, R.weights; quality_index = quality_index, fuzziness = fuzziness, metric = metric) -# Available quality indices: -Depending on the index higher (↑) or lower (↓) value suggests better clustering quality. -- `:silhouettes`: average silhouette index (↑), to obtain all silhouettes use `silhouettes` function instead -- `:dunn`: Dunn index (↑) returns ratio between minimal distance between clusters and maximal cluster diameter +# clustering indices with cluster centres not known interface -""" -function clustering_quality( - assignments::AbstractVector{<:Integer}, - dist::AbstractMatrix{<:Real}; +function clustering_quality( + assignments::AbstractVector{<:Integer}, # n vector + dist::AbstractMatrix{<:Real}; # n×n matrix quality_index::Symbol ) n, m = size(dist) @@ -163,16 +148,18 @@ function clustering_quality( end -clustering_quality(X::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}; quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = - clustering_quality(assignments, pairwise(metric,eachcol(X)); quality_index = quality_index) +clustering_quality(data::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}; quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = + clustering_quality(assignments, pairwise(metric,eachcol(data)); quality_index = quality_index) -clustering_quality(X::AbstractMatrix{<:Real}, R::ClusteringResult; quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = - clustering_quality(R.assignments, pairwise(metric,eachcol(X)); quality_index = quality_index) +clustering_quality(data::AbstractMatrix{<:Real}, R::ClusteringResult; quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = + clustering_quality(R.assignments, pairwise(metric,eachcol(data)); quality_index = quality_index) clustering_quality(R::ClusteringResult, dist::AbstractMatrix{<:Real}; quality_index::Symbol) = clustering_quality(R.assignments, dist; quality_index = quality_index) +# utility functions + function _gather_samples(assignments, k) # cluster_samples[j]: indices of points in cluster j cluster_samples = [Int[] for _ in 1:k] for (i, a) in zip(eachindex(assignments), assignments) @@ -182,18 +169,18 @@ function _gather_samples(assignments, k) # cluster_samples[j]: indices of points end -function _inner_inertia(X, centers, cluster_samples, metric) # shared between hard clustering calinski_harabasz and xie_beni +function 
_inner_inertia(data, centers, cluster_samples, metric) # shared between hard clustering calinski_harabasz and xie_beni inner_inertia = sum( - sum(colwise(metric, view(X, :, samples), center)) + sum(colwise(metric, view(data, :, samples), center)) for (center, samples) in zip(eachcol(centers), cluster_samples) ) return inner_inertia end -function _inner_inertia(X, centers, weights, fuzziness, metric) # shared between soft clustering calinski_harabasz and xie_beni - n, k = size(X, 2), size(centers, 2) +function _inner_inertia(data, centers, weights, fuzziness, metric) # shared between soft clustering calinski_harabasz and xie_beni + n, k = size(data, 2), size(centers, 2) w_idx1, w_idx2 = axes(weights) - pointCentreDistances = pairwise(metric, eachcol(X), eachcol(centers)) + pointCentreDistances = pairwise(metric, eachcol(data), eachcol(centers)) inner_inertia = sum( weights[i₁,j₁]^fuzziness * pointCentreDistances[i₂,j₂] for (i₁,i₂) in zip(w_idx1,1:n), (j₁,j₂) in zip(w_idx2, 1:k) ) @@ -203,50 +190,50 @@ end # Calinski-Harabasz index function _cluquality_calinski_harabasz( - X::AbstractMatrix{<:Real}, + data::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, metric::SemiMetric=SqEuclidean() ) - n, k = size(X, 2), size(centers, 2) + n, k = size(data, 2), size(centers, 2) cluster_samples = _gather_samples(assignments, k) - global_center = vec(mean(X, dims=2)) + global_center = vec(mean(data, dims=2)) center_distances = colwise(metric, centers, global_center) outer_inertia = length.(cluster_samples) ⋅ center_distances - inner_inertia = _inner_inertia(X, centers, cluster_samples, metric) + inner_inertia = _inner_inertia(data, centers, cluster_samples, metric) return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end function _cluquality_calinski_harabasz( - X::AbstractMatrix{<:Real}, + data::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, weights::AbstractMatrix{<:Real}, fuzziness::Real, metric::SemiMetric=SqEuclidean() ) - n, k = size(X, 2), size(centers, 2) + n, k = size(data, 2), size(centers, 2) w_idx1, w_idx2 = axes(weights) - global_center = vec(mean(X, dims=2)) + global_center = vec(mean(data, dims=2)) center_distances = colwise(metric, centers, global_center) outer_intertia = sum( weights[i,j₁]^fuzziness * center_distances[j₂] for i in w_idx1, (j₁,j₂) in zip(w_idx2, 1:k) ) - inner_inertia = _inner_inertia(X, centers, weights, fuzziness, metric) + inner_inertia = _inner_inertia(data, centers, weights, fuzziness, metric) return (outer_intertia / inner_inertia) * (n - k) / (k - 1) end -# Davies-Bouldin idex +# Davies-Bouldin index function _cluquality_davies_bouldin( - X::AbstractMatrix{<:Real}, + data::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, metric::SemiMetric=SqEuclidean() @@ -257,7 +244,7 @@ function _cluquality_davies_bouldin( cluster_samples = _gather_samples(assignments, k) - cluster_diameters = [mean(colwise(metric,view(X, :, sample), centers[:,j])) for (j, sample) in zip(c_idx, cluster_samples) ] + cluster_diameters = [mean(colwise(metric,view(data, :, sample), centers[:,j])) for (j, sample) in zip(c_idx, cluster_samples) ] center_distances = pairwise(metric,centers) DB = mean( @@ -271,16 +258,16 @@ end # Xie-Beni index function _cluquality_xie_beni( - X::AbstractMatrix{<:Real}, + data::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, metric::SemiMetric=SqEuclidean() ) - n, k = size(X, 2), size(centers,2) + n, k = 
size(data, 2), size(centers,2) cluster_samples = _gather_samples(assignments, k) - inner_intertia = _inner_inertia(X, centers, cluster_samples, metric) + inner_intertia = _inner_inertia(data, centers, cluster_samples, metric) center_distances = pairwise(metric,centers) min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) @@ -289,16 +276,16 @@ function _cluquality_xie_beni( end function _cluquality_xie_beni( - X::AbstractMatrix{<:Real}, + data::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, weights::AbstractMatrix{<:Real}, fuzziness::Real, metric::SemiMetric=SqEuclidean() ) - n, k = size(X, 2), size(centers, 2) + n, k = size(data, 2), size(centers, 2) - inner_intertia = _inner_inertia(X, centers, weights, fuzziness, metric) + inner_intertia = _inner_inertia(data, centers, weights, fuzziness, metric) center_distances = pairwise(metric, eachcol(centers)) min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) From 96040796f3205bc3ecdd50cb5406a624618eed52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Sat, 28 Oct 2023 00:33:17 +0200 Subject: [PATCH 49/82] Update .gitignore Co-authored-by: Alexey Stukalov --- .gitignore | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 21255bf2..b25d56b3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,5 @@ doc/build Manifest.toml *.swp -.vscode/settings.json +.vscode docs/build/ -docs/build/assets/ -Project.toml -*.html From 58f592f77821f5622fa0c32967a0dd549e770d30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Sat, 28 Oct 2023 00:35:14 +0200 Subject: [PATCH 50/82] Update Project.toml Co-authored-by: Alexey Stukalov --- Project.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Project.toml b/Project.toml index a976b9a2..ec189ee3 100644 --- a/Project.toml +++ b/Project.toml @@ -4,11 +4,9 @@ version = "0.15.4" [deps] Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" -Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" -RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" From c1d787d94f9b916863ff2c0d9f80c8a495cadb3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Sun, 29 Oct 2023 02:08:19 +0200 Subject: [PATCH 51/82] alorithm changes + small corrs --- docs/source/validate.md | 2 +- src/clustering_quality.jl | 35 ++++++++++++++++++----------------- test/runtests.jl | 1 + 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/docs/source/validate.md b/docs/source/validate.md index 0c969372..f37602a2 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -88,7 +88,7 @@ Option `:calinski_harabasz`. Higher values indicate better quality. 
Measures cor ```math \frac{n-k}{k-1}\frac{\sum_{C_j}|C_j|d(c_j,c)}{\sum\limits_{C_j}\sum\limits_{x_i\in C_j} d(x_i,c_j)} \quad \text{and}\quad -\frac{n-k}{k-1} \frac{\sum_{C_j} \sum_{x_i} w_{ik}^\alpha d(x_i,c_j)}{\sum\limits_{C_j}\sum\limits_{x_i}w_{ij}^\alpha d(c_j,c)} +\frac{n-k}{k-1} \frac{\sum\limits_{C_j}\left(\sum\limits_{x_i}w_{ij}^\alpha\right) d(c_j,c)}{\sum_{C_j} \sum_{x_i} w_{ij}^\alpha d(x_i,c_j)} ``` respectively. diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 56d459fb..ca0dfdca 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -177,13 +177,14 @@ function _inner_inertia(data, centers, cluster_samples, metric) # shared between return inner_inertia end -function _inner_inertia(data, centers, weights, fuzziness, metric) # shared between soft clustering calinski_harabasz and xie_beni - n, k = size(data, 2), size(centers, 2) - w_idx1, w_idx2 = axes(weights) +function _inner_inertia(data, centers, weights, fuzziness, metric) # shared between fuzzy clustering calinski_harabasz and xie_beni + pointCentreDistances = pairwise(metric, eachcol(data), eachcol(centers)) + inner_inertia = sum( - weights[i₁,j₁]^fuzziness * pointCentreDistances[i₂,j₂] for (i₁,i₂) in zip(w_idx1,1:n), (j₁,j₂) in zip(w_idx2, 1:k) + w^fuzziness * d for (w, d) in zip(weights, pointCentreDistances) ) + return inner_inertia end @@ -298,19 +299,19 @@ end function _cluquality_dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) - k = maximum(assignments) - - cluster_samples = _gather_samples(assignments, k) - - min_outer_distance = minimum( - minimum(view(dist, cluster_samples[j₁], cluster_samples[j₂]), init = typemax(eltype(dist))) - for j₁ in 1:k for j₂ in j₁+1:k - ) - - max_inner_distance = maximum( - maximum(dist[i₁,i₂] for i₁ in sample, i₂ in sample, init = typemin(eltype(dist))) - for sample in cluster_samples - ) + max_inner_distance, min_outer_distance = typemin(eltype(dist)), typemax(eltype(dist)) + for i in eachindex(assignments), j in (i + 1):lastindex(assignments) + d = dist[i,j] + if assignments[i] == assignments[j] + if max_inner_distance < d + max_inner_distance = d + end + else + if min_outer_distance > d + min_outer_distance = d + end + end + end return min_outer_distance / max_inner_distance end diff --git a/test/runtests.jl b/test/runtests.jl index 9eaca2ed..42301653 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,6 +14,7 @@ tests = ["seeding", "fuzzycmeans", "counts", "silhouette", + "clustering_quality", "varinfo", "randindex", "hclust", From e8f95387e70696c01b4b285cb2e361fc1d9b8b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Sun, 29 Oct 2023 02:57:59 +0200 Subject: [PATCH 52/82] Update clustering_quality.jl --- src/clustering_quality.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index ca0dfdca..132fef10 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -218,17 +218,17 @@ function _cluquality_calinski_harabasz( ) n, k = size(data, 2), size(centers, 2) - w_idx1, w_idx2 = axes(weights) global_center = vec(mean(data, dims=2)) center_distances = colwise(metric, centers, global_center) - outer_intertia = sum( - weights[i,j₁]^fuzziness * center_distances[j₂] for i in w_idx1, (j₁,j₂) in zip(w_idx2, 1:k) - ) + outer_inertia = + sum(sum(w^fuzziness for w in w_col) * d + for (w_col, d) in zip(eachcol(weights), center_distances) + ) 
inner_inertia = _inner_inertia(data, centers, weights, fuzziness, metric) - return (outer_intertia / inner_inertia) * (n - k) / (k - 1) + return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end # Davies-Bouldin index From 4716c840bb998ab6761de9d052ae6b1f8226c78d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Sun, 29 Oct 2023 02:04:16 +0100 Subject: [PATCH 53/82] Update clustering_quality.jl --- test/clustering_quality.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/clustering_quality.jl b/test/clustering_quality.jl index 6afc62f9..9468cf21 100644 --- a/test/clustering_quality.jl +++ b/test/clustering_quality.jl @@ -1,6 +1,5 @@ using Test using Clustering, Distances -using OffsetArrays @testset "clustering_quality()" begin From 55f4cf8db9d431f185a5e5b9900fa7fc140eff0e Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 29 Oct 2023 13:51:48 -0700 Subject: [PATCH 54/82] fix quality_index matching --- src/clustering_quality.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 132fef10..d80746b5 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -75,8 +75,7 @@ function clustering_quality( _cluquality_xie_beni(data, centers, assignments, metric) elseif quality_index == :davies_bouldin _cluquality_davies_bouldin(data, centers, assignments, metric) - else quality_index == :davies_bouldin - if quality_index == :silhouettes + elseif quality_index == :silhouettes mean(silhouettes(assignments, pairwise(metric, eachcol(data)))) elseif quality_index == :dunn _cluquality_dunn(assignments, pairwise(metric, eachcol(data))) @@ -84,7 +83,6 @@ function clustering_quality( throw(ArgumentError("Quality index $quality_index not supported.")) end end -end clustering_quality(data::AbstractMatrix{<:Real}, R::KmeansResult; quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = clustering_quality(data, R.centers, R.assignments; quality_index = quality_index, metric = metric) From fede9ddcdf1bcc1b1e1b8b79e72b46085fafdc69 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 29 Oct 2023 13:53:10 -0700 Subject: [PATCH 55/82] remove :xie_beni aliases --- src/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index d80746b5..2c11989f 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -113,7 +113,7 @@ function clustering_quality( if quality_index == :calinski_harabasz _cluquality_calinski_harabasz(data, centers, weights, fuzziness, metric) - elseif quality_index ∈ (:xie_beni, :Xie_Beni, :xb) + elseif quality_index == :xie_beni _cluquality_xie_beni(data, centers, weights, fuzziness, metric) else throw(ArgumentError("Quality index $quality_index not supported.")) From 946bcd1487810445dc8c948892ea0c766f90fbfa Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 29 Oct 2023 13:53:55 -0700 Subject: [PATCH 56/82] try to fix tests on 1.0 --- test/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/clustering_quality.jl b/test/clustering_quality.jl index 9468cf21..385694e7 100644 --- a/test/clustering_quality.jl +++ b/test/clustering_quality.jl @@ -46,7 +46,7 @@ using Clustering, Distances @test clustering_quality(Y', C', A; quality_index = :calinski_harabasz, metric = Euclidean()) ≈ (32/3) / (16/8) @test clustering_quality(Y', C', W; quality_index = :calinski_harabasz, 
fuzziness = 2, metric = Euclidean()) ≈ (32/3) / (16/8) - @test clustering_quality(Y', C', A; quality_index = :davies_bouldin, metric = Euclidean()) ≈ 3/2√5 + @test clustering_quality(Y', C', A; quality_index = :davies_bouldin, metric = Euclidean()) ≈ 3/2 sqrt(5) @test clustering_quality(Y', C', A; quality_index = :xie_beni, metric = Euclidean()) ≈ 1/3 @test clustering_quality(Y', C', W; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ 1/3 From 07ae425a27c472bf75e590bcc90db7c8f6ea99c1 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 29 Oct 2023 13:59:45 -0700 Subject: [PATCH 57/82] try to fix test, take #2 --- test/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/clustering_quality.jl b/test/clustering_quality.jl index 385694e7..f7d6b313 100644 --- a/test/clustering_quality.jl +++ b/test/clustering_quality.jl @@ -46,7 +46,7 @@ using Clustering, Distances @test clustering_quality(Y', C', A; quality_index = :calinski_harabasz, metric = Euclidean()) ≈ (32/3) / (16/8) @test clustering_quality(Y', C', W; quality_index = :calinski_harabasz, fuzziness = 2, metric = Euclidean()) ≈ (32/3) / (16/8) - @test clustering_quality(Y', C', A; quality_index = :davies_bouldin, metric = Euclidean()) ≈ 3/2 sqrt(5) + @test clustering_quality(Y', C', A; quality_index = :davies_bouldin, metric = Euclidean()) ≈ 3/sqrt(20) @test clustering_quality(Y', C', A; quality_index = :xie_beni, metric = Euclidean()) ≈ 1/3 @test clustering_quality(Y', C', W; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ 1/3 From 2e7b159415fb700c2dc6b2f09cf99e17b01f352a Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 29 Oct 2023 14:17:43 -0700 Subject: [PATCH 58/82] fix some eachcol() uses for 1.0 --- src/clustering_quality.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 2c11989f..760a8801 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -76,9 +76,9 @@ function clustering_quality( elseif quality_index == :davies_bouldin _cluquality_davies_bouldin(data, centers, assignments, metric) elseif quality_index == :silhouettes - mean(silhouettes(assignments, pairwise(metric, eachcol(data)))) + mean(silhouettes(assignments, pairwise(metric, data, dims=2))) elseif quality_index == :dunn - _cluquality_dunn(assignments, pairwise(metric, eachcol(data))) + _cluquality_dunn(assignments, pairwise(metric, data, dims=2)) else throw(ArgumentError("Quality index $quality_index not supported.")) end @@ -147,10 +147,10 @@ end clustering_quality(data::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}; quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = - clustering_quality(assignments, pairwise(metric,eachcol(data)); quality_index = quality_index) + clustering_quality(assignments, pairwise(metric, data, dims=2); quality_index = quality_index) clustering_quality(data::AbstractMatrix{<:Real}, R::ClusteringResult; quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = - clustering_quality(R.assignments, pairwise(metric,eachcol(data)); quality_index = quality_index) + clustering_quality(R.assignments, pairwise(metric, data, dims=2); quality_index = quality_index) clustering_quality(R::ClusteringResult, dist::AbstractMatrix{<:Real}; quality_index::Symbol) = clustering_quality(R.assignments, dist; quality_index = quality_index) @@ -177,7 +177,7 @@ end function _inner_inertia(data, centers, weights, fuzziness, metric) # shared between 
fuzzy clustering calinski_harabasz and xie_beni - pointCentreDistances = pairwise(metric, eachcol(data), eachcol(centers)) + pointCentreDistances = pairwise(metric, data, centers, dims=2) inner_inertia = sum( w^fuzziness * d for (w, d) in zip(weights, pointCentreDistances) @@ -286,7 +286,7 @@ function _cluquality_xie_beni( inner_intertia = _inner_inertia(data, centers, weights, fuzziness, metric) - center_distances = pairwise(metric, eachcol(centers)) + center_distances = pairwise(metric, centers, dims=2) min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) return inner_intertia / (n * min_center_distance) From d074e52a9f8e0880503dbefecb6627972e44283b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Sun, 29 Oct 2023 22:26:17 +0100 Subject: [PATCH 59/82] sqrt 5 --- test/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/clustering_quality.jl b/test/clustering_quality.jl index 9468cf21..209b5a9d 100644 --- a/test/clustering_quality.jl +++ b/test/clustering_quality.jl @@ -46,7 +46,7 @@ using Clustering, Distances @test clustering_quality(Y', C', A; quality_index = :calinski_harabasz, metric = Euclidean()) ≈ (32/3) / (16/8) @test clustering_quality(Y', C', W; quality_index = :calinski_harabasz, fuzziness = 2, metric = Euclidean()) ≈ (32/3) / (16/8) - @test clustering_quality(Y', C', A; quality_index = :davies_bouldin, metric = Euclidean()) ≈ 3/2√5 + @test clustering_quality(Y', C', A; quality_index = :davies_bouldin, metric = Euclidean()) ≈ 3/2sqrt(5) @test clustering_quality(Y', C', A; quality_index = :xie_beni, metric = Euclidean()) ≈ 1/3 @test clustering_quality(Y', C', W; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ 1/3 From e255fe4a1229edb00c6d3faad9486d178ecfb5be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Sun, 29 Oct 2023 22:30:05 +0100 Subject: [PATCH 60/82] Update src/clustering_quality.jl Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 760a8801..3957cbad 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -175,14 +175,13 @@ function _inner_inertia(data, centers, cluster_samples, metric) # shared between return inner_inertia end -function _inner_inertia(data, centers, weights, fuzziness, metric) # shared between fuzzy clustering calinski_harabasz and xie_beni - +# shared between fuzzy clustering calinski_harabasz and xie_beni (fuzzy version) +function _inner_inertia(metric::SemiMetric, data::AbstractMatrix, centers::AbstractMatrix, + weights::AbstractMatrix, fuzziness::Real) pointCentreDistances = pairwise(metric, data, centers, dims=2) - inner_inertia = sum( w^fuzziness * d for (w, d) in zip(weights, pointCentreDistances) ) - return inner_inertia end From 23a2216f82e56b1934e8d460efca345a81f1babe Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 29 Oct 2023 14:56:19 -0700 Subject: [PATCH 61/82] inner_inertia: use assignments directly --- src/clustering_quality.jl | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 3957cbad..e1268898 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -167,13 +167,10 @@ function _gather_samples(assignments, k) # 
cluster_samples[j]: indices of points end -function _inner_inertia(data, centers, cluster_samples, metric) # shared between hard clustering calinski_harabasz and xie_beni - inner_inertia = sum( - sum(colwise(metric, view(data, :, samples), center)) - for (center, samples) in zip(eachcol(centers), cluster_samples) - ) - return inner_inertia -end +# shared between calinski_harabasz and xie_beni (hard version) +_inner_inertia(metric::SemiMetric, data::AbstractMatrix, centers::AbstractMatrix, + assignments::AbstractVector{<:Integer}) = + sum(metric(view(data, :, i), view(center, :, clu)) for (i, clu) in enumerate(assignments)) # shared between fuzzy clustering calinski_harabasz and xie_beni (fuzzy version) function _inner_inertia(metric::SemiMetric, data::AbstractMatrix, centers::AbstractMatrix, @@ -196,12 +193,10 @@ function _cluquality_calinski_harabasz( n, k = size(data, 2), size(centers, 2) - cluster_samples = _gather_samples(assignments, k) global_center = vec(mean(data, dims=2)) center_distances = colwise(metric, centers, global_center) - outer_inertia = length.(cluster_samples) ⋅ center_distances - - inner_inertia = _inner_inertia(data, centers, cluster_samples, metric) + outer_inertia = sum(center_distances[clu] for clu in assignments) + inner_inertia = _inner_inertia(metric, data, centers, assignments) return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end @@ -219,11 +214,9 @@ function _cluquality_calinski_harabasz( global_center = vec(mean(data, dims=2)) center_distances = colwise(metric, centers, global_center) - outer_inertia = - sum(sum(w^fuzziness for w in w_col) * d - for (w_col, d) in zip(eachcol(weights), center_distances) - ) - inner_inertia = _inner_inertia(data, centers, weights, fuzziness, metric) + outer_inertia = sum(sum(w^fuzziness for w in view(weights, :, clu)) * d + for (clu, d) in enumerate(center_distances)) + inner_inertia = _inner_inertia(metric, data, centers, weights, fuzziness) return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end @@ -264,10 +257,8 @@ function _cluquality_xie_beni( n, k = size(data, 2), size(centers,2) - cluster_samples = _gather_samples(assignments, k) - inner_intertia = _inner_inertia(data, centers, cluster_samples, metric) - - center_distances = pairwise(metric,centers) + inner_intertia = _inner_inertia(metric, data, centers, assignments) + center_distances = pairwise(metric, centers, dims=2) min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) return inner_intertia / (n * min_center_distance) @@ -283,8 +274,7 @@ function _cluquality_xie_beni( n, k = size(data, 2), size(centers, 2) - inner_intertia = _inner_inertia(data, centers, weights, fuzziness, metric) - + inner_intertia = _inner_inertia(metric, data, centers, weights, fuzziness) center_distances = pairwise(metric, centers, dims=2) min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) From a2a6e805071ed6e51d94057d1cf0713543593a90 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 29 Oct 2023 15:00:12 -0700 Subject: [PATCH 62/82] fix typo --- src/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index e1268898..9717251e 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -170,7 +170,7 @@ end # shared between calinski_harabasz and xie_beni (hard version) _inner_inertia(metric::SemiMetric, data::AbstractMatrix, centers::AbstractMatrix, assignments::AbstractVector{<:Integer}) = - 
sum(metric(view(data, :, i), view(center, :, clu)) for (i, clu) in enumerate(assignments)) + sum(metric(view(data, :, i), view(centers, :, clu)) for (i, clu) in enumerate(assignments)) # shared between fuzzy clustering calinski_harabasz and xie_beni (fuzzy version) function _inner_inertia(metric::SemiMetric, data::AbstractMatrix, centers::AbstractMatrix, From 58259bcd82c2683e16b8df6f3324eae16067b804 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 29 Oct 2023 15:22:30 -0700 Subject: [PATCH 63/82] unify hard and fuzzy calinski_harabasz and xie_beni --- src/clustering_quality.jl | 88 +++++++++++++-------------------------- 1 file changed, 29 insertions(+), 59 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 9717251e..a2d20546 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -70,9 +70,9 @@ function clustering_quality( end if quality_index == :calinski_harabasz - _cluquality_calinski_harabasz(data, centers, assignments, metric) + _cluquality_calinski_harabasz(metric, data, centers, assignments, nothing) elseif quality_index == :xie_beni - _cluquality_xie_beni(data, centers, assignments, metric) + _cluquality_xie_beni(metric, data, centers, assignments, nothing) elseif quality_index == :davies_bouldin _cluquality_davies_bouldin(data, centers, assignments, metric) elseif quality_index == :silhouettes @@ -112,9 +112,9 @@ function clustering_quality( 1 < fuzziness || throw(ArgumentError("Fuzziness must be greater than 1 ($fuzziness given)")) if quality_index == :calinski_harabasz - _cluquality_calinski_harabasz(data, centers, weights, fuzziness, metric) + _cluquality_calinski_harabasz(metric, data, centers, weights, fuzziness) elseif quality_index == :xie_beni - _cluquality_xie_beni(data, centers, weights, fuzziness, metric) + _cluquality_xie_beni(metric, data, centers, weights, fuzziness) else throw(ArgumentError("Quality index $quality_index not supported.")) end @@ -169,7 +169,7 @@ end # shared between calinski_harabasz and xie_beni (hard version) _inner_inertia(metric::SemiMetric, data::AbstractMatrix, centers::AbstractMatrix, - assignments::AbstractVector{<:Integer}) = + assignments::AbstractVector{<:Integer}, fuzziness::Nothing) = sum(metric(view(data, :, i), view(centers, :, clu)) for (i, clu) in enumerate(assignments)) # shared between fuzzy clustering calinski_harabasz and xie_beni (fuzzy version) @@ -182,42 +182,33 @@ function _inner_inertia(metric::SemiMetric, data::AbstractMatrix, centers::Abstr return inner_inertia end -# Calinski-Harabasz index - -function _cluquality_calinski_harabasz( - data::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:Real}, - assignments::AbstractVector{<:Integer}, - metric::SemiMetric=SqEuclidean() - ) - - n, k = size(data, 2), size(centers, 2) - +# "hard" version +function _outer_inertia(metric::SemiMetric, data::AbstractMatrix, centers::AbstractMatrix, + assignments::AbstractVector{<:Integer}, fuzziness::Nothing) global_center = vec(mean(data, dims=2)) center_distances = colwise(metric, centers, global_center) - outer_inertia = sum(center_distances[clu] for clu in assignments) - inner_inertia = _inner_inertia(metric, data, centers, assignments) + return sum(center_distances[clu] for clu in assignments) +end - return (outer_inertia / inner_inertia) * (n - k) / (k - 1) +# "fuzzy" version +function _outer_inertia(metric::SemiMetric, data::AbstractMatrix, centers::AbstractMatrix, + weights::AbstractMatrix, fuzziness::Real) + global_center = vec(mean(data, dims=2)) + center_distances = 
colwise(metric, centers, global_center) + return sum(sum(w^fuzziness for w in view(weights, :, clu)) * d + for (clu, d) in enumerate(center_distances)) end -function _cluquality_calinski_harabasz( +function _cluquality_calinski_harabasz( + metric::SemiMetric, data::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, - weights::AbstractMatrix{<:Real}, - fuzziness::Real, - metric::SemiMetric=SqEuclidean() - ) - + assignments::Union{AbstractVector{<:Integer}, AbstractMatrix{<:Real}}, + fuzziness::Union{Real, Nothing} +) n, k = size(data, 2), size(centers, 2) - - global_center = vec(mean(data, dims=2)) - center_distances = colwise(metric, centers, global_center) - - outer_inertia = sum(sum(w^fuzziness for w in view(weights, :, clu)) * d - for (clu, d) in enumerate(center_distances)) - inner_inertia = _inner_inertia(metric, data, centers, weights, fuzziness) - + outer_inertia = _outer_inertia(metric, data, centers, assignments, fuzziness) + inner_inertia = _inner_inertia(metric, data, centers, assignments, fuzziness) return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end @@ -246,38 +237,17 @@ function _cluquality_davies_bouldin( end -# Xie-Beni index - function _cluquality_xie_beni( + metric::SemiMetric, data::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, - assignments::AbstractVector{<:Integer}, - metric::SemiMetric=SqEuclidean() - ) - + assignments::Union{AbstractVector{<:Integer}, AbstractMatrix{<:Real}}, + fuzziness::Union{Real, Nothing} +) n, k = size(data, 2), size(centers,2) - - inner_intertia = _inner_inertia(metric, data, centers, assignments) + inner_intertia = _inner_inertia(metric, data, centers, assignments, fuzziness) center_distances = pairwise(metric, centers, dims=2) min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) - - return inner_intertia / (n * min_center_distance) -end - -function _cluquality_xie_beni( - data::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:Real}, - weights::AbstractMatrix{<:Real}, - fuzziness::Real, - metric::SemiMetric=SqEuclidean() - ) - - n, k = size(data, 2), size(centers, 2) - - inner_intertia = _inner_inertia(metric, data, centers, weights, fuzziness) - center_distances = pairwise(metric, centers, dims=2) - min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) - return inner_intertia / (n * min_center_distance) end From 6fa67143c6c238c57784f42751d8a26ee3f849af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 30 Oct 2023 01:17:18 +0100 Subject: [PATCH 64/82] _inner_inertia --- src/clustering_quality.jl | 31 ++++++++++++++++++++----------- test/clustering_quality.jl | 4 ++++ 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 3957cbad..ae227ded 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -141,7 +141,7 @@ function clustering_quality( elseif quality_index == :dunn _cluquality_dunn(assignments, dist) else - error(ArgumentError("Quality index $quality_index not available.")) + throw(ArgumentError("Quality index $quality_index not supported.")) end end @@ -166,18 +166,28 @@ function _gather_samples(assignments, k) # cluster_samples[j]: indices of points return cluster_samples end - -function _inner_inertia(data, centers, cluster_samples, metric) # shared between hard clustering calinski_harabasz and xie_beni +# shared between hard clustering calinski_harabasz and xie_beni +function 
_inner_inertia( + metric::SemiMetric, + data::AbstractMatrix, + centers::AbstractMatrix, + assignments::AbstractVector{<:Integer} + ) inner_inertia = sum( sum(colwise(metric, view(data, :, samples), center)) - for (center, samples) in zip(eachcol(centers), cluster_samples) + for (center, samples) in zip(eachcol(centers), _gather_samples(assignments, size(centers)[2])) ) return inner_inertia end # shared between fuzzy clustering calinski_harabasz and xie_beni (fuzzy version) -function _inner_inertia(metric::SemiMetric, data::AbstractMatrix, centers::AbstractMatrix, - weights::AbstractMatrix, fuzziness::Real) +function _inner_inertia( + metric::SemiMetric, + data::AbstractMatrix, + centers::AbstractMatrix, + weights::AbstractMatrix, + fuzziness::Real + ) pointCentreDistances = pairwise(metric, data, centers, dims=2) inner_inertia = sum( w^fuzziness * d for (w, d) in zip(weights, pointCentreDistances) @@ -201,7 +211,7 @@ function _cluquality_calinski_harabasz( center_distances = colwise(metric, centers, global_center) outer_inertia = length.(cluster_samples) ⋅ center_distances - inner_inertia = _inner_inertia(data, centers, cluster_samples, metric) + inner_inertia = _inner_inertia(metric, data, centers, assignments) return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end @@ -223,7 +233,7 @@ function _cluquality_calinski_harabasz( sum(sum(w^fuzziness for w in w_col) * d for (w_col, d) in zip(eachcol(weights), center_distances) ) - inner_inertia = _inner_inertia(data, centers, weights, fuzziness, metric) + inner_inertia = _inner_inertia(metric, data, centers, weights, fuzziness) return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end @@ -264,8 +274,7 @@ function _cluquality_xie_beni( n, k = size(data, 2), size(centers,2) - cluster_samples = _gather_samples(assignments, k) - inner_intertia = _inner_inertia(data, centers, cluster_samples, metric) + inner_intertia = _inner_inertia(metric, data, centers, assignments) center_distances = pairwise(metric,centers) min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) @@ -283,7 +292,7 @@ function _cluquality_xie_beni( n, k = size(data, 2), size(centers, 2) - inner_intertia = _inner_inertia(data, centers, weights, fuzziness, metric) + inner_intertia = _inner_inertia(metric, data, centers, weights, fuzziness) center_distances = pairwise(metric, centers, dims=2) min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) diff --git a/test/clustering_quality.jl b/test/clustering_quality.jl index 6111cdd7..9b4be531 100644 --- a/test/clustering_quality.jl +++ b/test/clustering_quality.jl @@ -40,6 +40,9 @@ using Clustering, Distances @test_throws ArgumentError clustering_quality(zeros(2,2),zeros(2,1), [1, ], quality_index = :calinski_harabasz) @test_throws ArgumentError clustering_quality(zeros(2,2),zeros(2,2), [1, 2], quality_index = :calinski_harabasz) @test_throws DimensionMismatch clustering_quality([1,2,3], zeros(2,2), quality_index = :dunn) + @test_throws ArgumentError clustering_quality(Y', C', A; quality_index = :nonexistent_index) + @test_throws ArgumentError clustering_quality(Y', C', W; quality_index = :nonexistent_index, fuzziness = 2) + @test_throws ArgumentError clustering_quality(Y', A; quality_index = :nonexistent_index) end @testset "correct index values" begin @@ -50,6 +53,7 @@ using Clustering, Distances @test clustering_quality(Y', C', A; quality_index = :xie_beni, metric = Euclidean()) ≈ 1/3 @test clustering_quality(Y', C', W; quality_index = :xie_beni, fuzziness = 2, 
metric = Euclidean()) ≈ 1/3 + @test clustering_quality(Y', A; quality_index = :dunn, metric = Euclidean()) ≈ 1/2 end From b184d9e275c3d482666b922d63734cbffc6ea556 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 29 Oct 2023 17:18:33 -0700 Subject: [PATCH 65/82] cluquality_dunn(): inbounds access to dist --- src/clustering_quality.jl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index a2d20546..49b2fd11 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -252,14 +252,12 @@ function _cluquality_xie_beni( end -# Dunn index - function _cluquality_dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) + T = eltype(dist) + max_inner_distance, min_outer_distance = typemin(T), typemax(T) - max_inner_distance, min_outer_distance = typemin(eltype(dist)), typemax(eltype(dist)) - for i in eachindex(assignments), j in (i + 1):lastindex(assignments) - d = dist[i,j] + @inbounds d = dist[i, j] if assignments[i] == assignments[j] if max_inner_distance < d max_inner_distance = d From f681112a325ece6167c225dd59bcf780ef0639c1 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 29 Oct 2023 17:32:17 -0700 Subject: [PATCH 66/82] tweak davies_bouldin --- src/clustering_quality.jl | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 49b2fd11..8bf5bb0d 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -74,7 +74,7 @@ function clustering_quality( elseif quality_index == :xie_beni _cluquality_xie_beni(metric, data, centers, assignments, nothing) elseif quality_index == :davies_bouldin - _cluquality_davies_bouldin(data, centers, assignments, metric) + _cluquality_davies_bouldin(metric, data, centers, assignments) elseif quality_index == :silhouettes mean(silhouettes(assignments, pairwise(metric, data, dims=2))) elseif quality_index == :dunn @@ -212,28 +212,21 @@ function _cluquality_calinski_harabasz( return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end -# Davies-Bouldin index - function _cluquality_davies_bouldin( + metric::SemiMetric, data::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, - metric::SemiMetric=SqEuclidean() - ) - - k = size(centers, 2) - c_idx = axes(centers, 2) - - cluster_samples = _gather_samples(assignments, k) - - cluster_diameters = [mean(colwise(metric,view(data, :, sample), centers[:,j])) for (j, sample) in zip(c_idx, cluster_samples) ] - center_distances = pairwise(metric,centers) - - DB = mean( - maximum( (cluster_diameters[j₁] + cluster_diameters[j₂]) / center_distances[j₁,j₂] for j₂ in c_idx if j₂ ≠ j₁) - for j₁ in c_idx - ) - return DB +) + clu_idx = axes(centers, 2) + clu_samples = _gather_samples(assignments, length(clu_idx)) + clu_diams = [mean(colwise(metric, view(data, :, samples), view(centers, :, clu))) + for (clu, samples) in zip(clu_idx, clu_samples)] + center_dists = pairwise(metric, centers, dims=2) + + return mean(maximum((clu_diams[j₁] + clu_diams[j₂]) / center_dists[j₁, j₂] + for j₂ in clu_idx if j₂ ≠ j₁) + for j₁ in clu_idx) end From d00acba3b53d901144775fbf9a55b559f1924d7e Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 29 Oct 2023 17:40:36 -0700 Subject: [PATCH 67/82] davies_bouldin: use inbounds --- src/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clustering_quality.jl 
b/src/clustering_quality.jl index 8bf5bb0d..8c07f7cf 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -224,7 +224,7 @@ function _cluquality_davies_bouldin( for (clu, samples) in zip(clu_idx, clu_samples)] center_dists = pairwise(metric, centers, dims=2) - return mean(maximum((clu_diams[j₁] + clu_diams[j₂]) / center_dists[j₁, j₂] + return mean(maximum(@inbounds (clu_diams[j₁] + clu_diams[j₂]) / center_dists[j₁, j₂] for j₂ in clu_idx if j₂ ≠ j₁) for j₁ in clu_idx) end From ab46a36e750492d27861471fbedbb04303c7dedd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 30 Oct 2023 19:33:36 +0100 Subject: [PATCH 68/82] add tests + inbounds --- src/clustering_quality.jl | 13 +++++++---- test/clustering_quality.jl | 48 ++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index ae227ded..c7908b77 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -1,4 +1,4 @@ -# hard clustering indices interface + general docs +# interface of hard clustering indices + general docs """ For hard clustering: @@ -68,6 +68,9 @@ function clustering_quality( for i in eachindex(assignments) (assignments[i] in axes(centers, 2)) || throw(ArgumentError("Bad assignments[$i]=$(assignments[i]) is not a valid index for `data`.")) end + for i in 1:k + i ∉ assignments && @warn "Cluster number $(i) is empty. Clustering quality calculation may not be reliable." + end if quality_index == :calinski_harabasz _cluquality_calinski_harabasz(data, centers, assignments, metric) @@ -88,7 +91,7 @@ clustering_quality(data::AbstractMatrix{<:Real}, R::KmeansResult; quality_index: clustering_quality(data, R.centers, R.assignments; quality_index = quality_index, metric = metric) -# fuzzy clustering indices interface +# interface of fuzzy clustering indices function clustering_quality( data::AbstractMatrix{<:Real}, # d×n matrix @@ -124,7 +127,7 @@ clustering_quality(data::AbstractMatrix{<:Real}, R::FuzzyCMeansResult; quality_i clustering_quality(data, R.centers, R.weights; quality_index = quality_index, fuzziness = fuzziness, metric = metric) -# clustering indices with cluster centres not known interface +# interface of clustering indices with cluster centres not known function clustering_quality( assignments::AbstractVector{<:Integer}, # n vector @@ -256,7 +259,7 @@ function _cluquality_davies_bouldin( center_distances = pairwise(metric,centers) DB = mean( - maximum( (cluster_diameters[j₁] + cluster_diameters[j₂]) / center_distances[j₁,j₂] for j₂ in c_idx if j₂ ≠ j₁) + maximum(@inbounds (cluster_diameters[j₁] + cluster_diameters[j₂]) / center_distances[j₁,j₂] for j₂ in c_idx if j₂ ≠ j₁) for j₁ in c_idx ) return DB @@ -307,7 +310,7 @@ function _cluquality_dunn(assignments::AbstractVector{<:Integer}, dist::Abstract max_inner_distance, min_outer_distance = typemin(eltype(dist)), typemax(eltype(dist)) - for i in eachindex(assignments), j in (i + 1):lastindex(assignments) + @inbounds for i in eachindex(assignments), j in (i + 1):lastindex(assignments) d = dist[i,j] if assignments[i] == assignments[j] if max_inner_distance < d diff --git a/test/clustering_quality.jl b/test/clustering_quality.jl index 9b4be531..71b9f2b5 100644 --- a/test/clustering_quality.jl +++ b/test/clustering_quality.jl @@ -4,7 +4,6 @@ using Clustering, Distances @testset "clustering_quality()" begin # test data with 4 clusters - Y = [-2 4; 2 4; 2 1; 3 0; 2 -1; 1 0; 
2 -4; -2 -4; -2 1; -1 0; -2 -1; -3 0] C = [0 4; 2 0; 0 -4; -2 0] A = [1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4] @@ -22,23 +21,28 @@ using Clustering, Distances 0 0 0 1 0 0 0 1 ] - - # visualisation of the data - # using Plots - # scatter(Y[:,1],Y[:,2], - # axisratio = :equal, - # #seriescolor = palette(default)[A], - # ) - # scatter!(C[:,1],C[:,2], - # marker = :square, - # label = "cluster centers", - # ) + W2 = [ + 1 0 0 0 + 1 0 0 0 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 0 1 0 + 0 0 1 0 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + ] @testset "input checks" begin @test_throws ArgumentError clustering_quality(zeros(2,2), zeros(2,3), [1, 2], quality_index = :calinski_harabasz) @test_throws DimensionMismatch clustering_quality(zeros(2,2),zeros(3,2), [1, 2], quality_index = :calinski_harabasz) @test_throws ArgumentError clustering_quality(zeros(2,2),zeros(2,1), [1, ], quality_index = :calinski_harabasz) @test_throws ArgumentError clustering_quality(zeros(2,2),zeros(2,2), [1, 2], quality_index = :calinski_harabasz) + @test_throws ArgumentError clustering_quality(zeros(0,0),zeros(0,0), zeros(Int,0); quality_index = :calinski_harabasz) + @test_throws ArgumentError clustering_quality(zeros(0,0), zeros(0,0),zeros(0,0); quality_index = :calinski_harabasz, fuzziness = 2) @test_throws DimensionMismatch clustering_quality([1,2,3], zeros(2,2), quality_index = :dunn) @test_throws ArgumentError clustering_quality(Y', C', A; quality_index = :nonexistent_index) @test_throws ArgumentError clustering_quality(Y', C', W; quality_index = :nonexistent_index, fuzziness = 2) @@ -48,13 +52,33 @@ using Clustering, Distances @testset "correct index values" begin @test clustering_quality(Y', C', A; quality_index = :calinski_harabasz, metric = Euclidean()) ≈ (32/3) / (16/8) @test clustering_quality(Y', C', W; quality_index = :calinski_harabasz, fuzziness = 2, metric = Euclidean()) ≈ (32/3) / (16/8) + @test clustering_quality(Y', C', W2; quality_index = :calinski_harabasz, fuzziness = 2, metric = Euclidean()) ≈ 8/3 * ( 24 ) / (14+sqrt(17)) @test clustering_quality(Y', C', A; quality_index = :davies_bouldin, metric = Euclidean()) ≈ 3/sqrt(20) @test clustering_quality(Y', C', A; quality_index = :xie_beni, metric = Euclidean()) ≈ 1/3 @test clustering_quality(Y', C', W; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ 1/3 + @test clustering_quality(Y', C', W2; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ (14+sqrt(17)) / (12 * 4) @test clustering_quality(Y', A; quality_index = :dunn, metric = Euclidean()) ≈ 1/2 end + @testset "alternate arguments" begin + # mock hard and fuzzy clusterings for testing interface; only C, W and A arguments are actually used + hardClustering = KmeansResult(Float64.(C'), A, ones(12), [4, 4, 4], ones(4), 42., 42, true) + fuzzyClustering = FuzzyCMeansResult(Float64.(C'), Float64.(W), 42, true) + + @test clustering_quality(Y', hardClustering; quality_index = :calinski_harabasz, metric = Euclidean()) ≈ (32/3) / (16/8) + @test clustering_quality(Y', fuzzyClustering; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ 1/3 + @test clustering_quality(hardClustering, pairwise(Euclidean(), Y', dims=2); quality_index = :dunn) ≈ 1/2 + end + + @testset "empty clusters" begin + # degenerated clustering + degC = [0 4; 2 0; 0 -4; -2 0; -2 0] + degA = [1, 1, 2, 2, 2, 2, 3, 3, 5, 5, 5, 5] # no 4th cluster + + @test_logs (:warn, "Cluster number 4 is empty. 
Clustering quality calculation may not be reliable.") clustering_quality(Y', degC', degA; quality_index = :calinski_harabasz) + end + end \ No newline at end of file From ef377d1ec4655c6b4dc084cc21be9be596bfed79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 30 Oct 2023 20:41:53 +0100 Subject: [PATCH 69/82] Update src/clustering_quality.jl Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index babdcc12..f63e1159 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -174,7 +174,8 @@ function _inner_inertia( metric::SemiMetric, data::AbstractMatrix, centers::AbstractMatrix, - assignments::AbstractVector{<:Integer} + assignments::AbstractVector{<:Integer}, + fuzziness::Nothing ) inner_inertia = sum( sum(colwise(metric, view(data, :, samples), center)) From 18930afb361eea5d1cf25f689cc5a1aaf3b5a20e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 30 Oct 2023 20:42:06 +0100 Subject: [PATCH 70/82] Update src/clustering_quality.jl Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index f63e1159..772f1fb3 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -231,11 +231,8 @@ function _cluquality_calinski_harabasz( global_center = vec(mean(data, dims=2)) center_distances = colwise(metric, centers, global_center) - outer_inertia = - sum(sum(w^fuzziness for w in w_col) * d - for (w_col, d) in zip(eachcol(weights), center_distances) - ) - inner_inertia = _inner_inertia(metric, data, centers, weights, fuzziness) + outer_inertia = _outer_inertia(metric, data, centers, assignments, fuzziness) + inner_inertia = _inner_inertia(metric, data, centers, assignments, fuzziness) return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end From f29e3026be8581fe375789643573a281cb5c4843 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 30 Oct 2023 21:39:36 +0100 Subject: [PATCH 71/82] Update src/clustering_quality.jl Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 772f1fb3..cd0e0965 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -65,8 +65,8 @@ function clustering_quality( (1 <= k <= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) - for i in eachindex(assignments) - (assignments[i] in axes(centers, 2)) || throw(ArgumentError("Bad assignments[$i]=$(assignments[i]) is not a valid index for `data`.")) + for (i, clu) in enumerate(assignments) + (clu in axes(centers, 2)) || throw(ArgumentError("Invalid cluster index: assignments[$i]=$(clu).")) end for i in 1:k i ∉ assignments && @warn "Cluster number $(i) is empty. Clustering quality calculation may not be reliable." 
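Editor's note: the patches around this point (69-72) converge on a single code path for the hard and fuzzy variants of the Calinski-Harabasz and Xie-Beni indices by dispatching on the `fuzziness` argument (`Nothing` for hard clustering, `Real` for fuzzy clustering). A minimal, self-contained sketch of that dispatch pattern is shown below; the function name `inertia_term` is hypothetical and not part of the package code.

```julia
# Hard clustering code path: `fuzziness` is `nothing`, the membership weight is ignored.
inertia_term(d::Real, w::Real, fuzziness::Nothing) = d

# Fuzzy clustering code path: the distance is weighted by membership^fuzziness.
inertia_term(d::Real, w::Real, fuzziness::Real) = w^fuzziness * d

inertia_term(2.0, 0.5, nothing)   # 2.0  -> hard variant selected by dispatch
inertia_term(2.0, 0.5, 2.0)       # 0.5  -> fuzzy variant (0.5^2 * 2.0)
```
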
From 6d807b3516d9690f89f7d733318edc0aa889bc18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 30 Oct 2023 23:22:32 +0100 Subject: [PATCH 72/82] unbroke previous merge --- src/clustering_quality.jl | 83 ++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index cd0e0965..ea2676fb 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -199,24 +199,33 @@ function _inner_inertia( return inner_inertia end -# "hard" version -function _outer_inertia(metric::SemiMetric, data::AbstractMatrix, centers::AbstractMatrix, - assignments::AbstractVector{<:Integer}, fuzziness::Nothing) +# hard outer inertia for calinski_harabasz +function _outer_inertia( + metric::SemiMetric, + data::AbstractMatrix, + centers::AbstractMatrix, + assignments::AbstractVector{<:Integer}, + fuzziness::Nothing + ) global_center = vec(mean(data, dims=2)) center_distances = colwise(metric, centers, global_center) return sum(center_distances[clu] for clu in assignments) end -# "fuzzy" version -function _outer_inertia(metric::SemiMetric, data::AbstractMatrix, centers::AbstractMatrix, - weights::AbstractMatrix, fuzziness::Real) +# fuzzy outer inertia for calinski_harabasz +function _outer_inertia( + metric::SemiMetric, + data::AbstractMatrix, + centers::AbstractMatrix, + weights::AbstractMatrix, + fuzziness::Real + ) + global_center = vec(mean(data, dims=2)) center_distances = colwise(metric, centers, global_center) - outer_inertia = length.(cluster_samples) ⋅ center_distances - - inner_inertia = _inner_inertia(metric, data, centers, assignments) - - return (outer_inertia / inner_inertia) * (n - k) / (k - 1) + return sum(sum(w^fuzziness for w in view(weights, :, clu)) * d + for (clu, d) in enumerate(center_distances) + ) end function _cluquality_calinski_harabasz( @@ -225,12 +234,8 @@ function _cluquality_calinski_harabasz( centers::AbstractMatrix{<:Real}, assignments::Union{AbstractVector{<:Integer}, AbstractMatrix{<:Real}}, fuzziness::Union{Real, Nothing} -) + ) n, k = size(data, 2), size(centers, 2) - - global_center = vec(mean(data, dims=2)) - center_distances = colwise(metric, centers, global_center) - outer_inertia = _outer_inertia(metric, data, centers, assignments, fuzziness) inner_inertia = _inner_inertia(metric, data, centers, assignments, fuzziness) @@ -242,7 +247,7 @@ function _cluquality_davies_bouldin( data::AbstractMatrix{<:Real}, centers::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}, -) + ) clu_idx = axes(centers, 2) clu_samples = _gather_samples(assignments, length(clu_idx)) clu_diams = [mean(colwise(metric, view(data, :, samples), view(centers, :, clu))) @@ -250,8 +255,8 @@ function _cluquality_davies_bouldin( center_dists = pairwise(metric, centers, dims=2) DB = mean( - maximum(@inbounds (cluster_diameters[j₁] + cluster_diameters[j₂]) / center_distances[j₁,j₂] for j₂ in c_idx if j₂ ≠ j₁) - for j₁ in c_idx + maximum(@inbounds (clu_diams[j₁] + clu_diams[j₂]) / center_dists[j₁,j₂] for j₂ in clu_idx if j₂ ≠ j₁) + for j₁ in clu_idx ) return DB end @@ -265,41 +270,39 @@ function _cluquality_xie_beni( centers::AbstractMatrix{<:Real}, assignments::Union{AbstractVector{<:Integer}, AbstractMatrix{<:Real}}, fuzziness::Union{Real, Nothing} -) + ) n, k = size(data, 2), size(centers,2) - - inner_intertia = _inner_inertia(metric, data, centers, assignments) - - center_distances = pairwise(metric,centers) + 
inner_intertia = _inner_inertia(metric, data, centers, assignments, fuzziness) + center_distances = pairwise(metric, centers, dims=2) min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) - + return inner_intertia / (n * min_center_distance) end -function _cluquality_xie_beni( - data::AbstractMatrix{<:Real}, - centers::AbstractMatrix{<:Real}, - weights::AbstractMatrix{<:Real}, - fuzziness::Real, - metric::SemiMetric=SqEuclidean() - ) +# function _cluquality_xie_beni( +# data::AbstractMatrix{<:Real}, +# centers::AbstractMatrix{<:Real}, +# weights::AbstractMatrix{<:Real}, +# fuzziness::Real, +# metric::SemiMetric=SqEuclidean() +# ) - n, k = size(data, 2), size(centers, 2) +# n, k = size(data, 2), size(centers, 2) - inner_intertia = _inner_inertia(metric, data, centers, weights, fuzziness) +# inner_intertia = _inner_inertia(metric, data, centers, weights, fuzziness) - center_distances = pairwise(metric, centers, dims=2) - min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) - return inner_intertia / (n * min_center_distance) -end +# center_distances = pairwise(metric, centers, dims=2) +# min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) +# return inner_intertia / (n * min_center_distance) +# end function _cluquality_dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) max_inner_distance, min_outer_distance = typemin(eltype(dist)), typemax(eltype(dist)) - @inbounds for i in eachindex(assignments), j in (i + 1):lastindex(assignments) - d = dist[i,j] + for i in eachindex(assignments), j in (i + 1):lastindex(assignments) + @inbounds d = dist[i,j] if assignments[i] == assignments[j] if max_inner_distance < d max_inner_distance = d From c185b80a123ffce211cf98d68c0132ae67f171c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 30 Oct 2023 23:31:15 +0100 Subject: [PATCH 73/82] formatting --- src/clustering_quality.jl | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index ea2676fb..12bb75cb 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -228,6 +228,7 @@ function _outer_inertia( ) end +# Calinsk-Harabasz index function _cluquality_calinski_harabasz( metric::SemiMetric, data::AbstractMatrix{<:Real}, @@ -242,6 +243,8 @@ function _cluquality_calinski_harabasz( return (outer_inertia / inner_inertia) * (n - k) / (k - 1) end + +# Davies Bouldin index function _cluquality_davies_bouldin( metric::SemiMetric, data::AbstractMatrix{<:Real}, @@ -263,7 +266,6 @@ end # Xie-Beni index - function _cluquality_xie_beni( metric::SemiMetric, data::AbstractMatrix{<:Real}, @@ -279,24 +281,7 @@ function _cluquality_xie_beni( return inner_intertia / (n * min_center_distance) end -# function _cluquality_xie_beni( -# data::AbstractMatrix{<:Real}, -# centers::AbstractMatrix{<:Real}, -# weights::AbstractMatrix{<:Real}, -# fuzziness::Real, -# metric::SemiMetric=SqEuclidean() -# ) - -# n, k = size(data, 2), size(centers, 2) - -# inner_intertia = _inner_inertia(metric, data, centers, weights, fuzziness) - -# center_distances = pairwise(metric, centers, dims=2) -# min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) -# return inner_intertia / (n * min_center_distance) -# end - - +# Dunn index function _cluquality_dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) 
max_inner_distance, min_outer_distance = typemin(eltype(dist)), typemax(eltype(dist)) From f7de43e8728063527f1287ec2b0a974cbb2548b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 31 Oct 2023 00:01:19 +0100 Subject: [PATCH 74/82] for Julia 1.0: eachol -> view --- src/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 12bb75cb..104e6dab 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -179,7 +179,7 @@ function _inner_inertia( ) inner_inertia = sum( sum(colwise(metric, view(data, :, samples), center)) - for (center, samples) in zip(eachcol(centers), _gather_samples(assignments, size(centers)[2])) + for (center, samples) in zip((view(centers,:,j) for j in axes(centers)[2]), _gather_samples(assignments, size(centers)[2])) ) return inner_inertia end From 381ea6cbbf7d4bf57a7cf1f254fb2a358a21970f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 31 Oct 2023 00:25:18 +0100 Subject: [PATCH 75/82] Update src/clustering_quality.jl Co-authored-by: Alexey Stukalov --- src/clustering_quality.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 104e6dab..d850aa56 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -65,11 +65,14 @@ function clustering_quality( (1 <= k <= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) + seen_clusters = falses(k) for (i, clu) in enumerate(assignments) (clu in axes(centers, 2)) || throw(ArgumentError("Invalid cluster index: assignments[$i]=$(clu).")) + seen_clusters[clu] = true end - for i in 1:k - i ∉ assignments && @warn "Cluster number $(i) is empty. Clustering quality calculation may not be reliable." + if !all(seen_clusters) + empty_clu_ixs = findall(~seen_clusters) + @warn "Empty cluster(s) detected: $(join(String.(empty_clu_ixs), ", ")). clustering_quality() results might be incorrect." end if quality_index == :calinski_harabasz From 88b699f22debd5b23d6d21585988f5902397f50f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 31 Oct 2023 00:33:37 +0100 Subject: [PATCH 76/82] fix seen_clusters --- src/clustering_quality.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index d850aa56..7f384753 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -71,8 +71,8 @@ function clustering_quality( seen_clusters[clu] = true end if !all(seen_clusters) - empty_clu_ixs = findall(~seen_clusters) - @warn "Empty cluster(s) detected: $(join(String.(empty_clu_ixs), ", ")). clustering_quality() results might be incorrect." + empty_clu_ixs = findall(!, seen_clusters) + @warn "Detected empty cluster(s) no.: $(join(string.(empty_clu_ixs), ", ")). clustering_quality() results might be incorrect." 
end if quality_index == :calinski_harabasz @@ -182,7 +182,7 @@ function _inner_inertia( ) inner_inertia = sum( sum(colwise(metric, view(data, :, samples), center)) - for (center, samples) in zip((view(centers,:,j) for j in axes(centers)[2]), _gather_samples(assignments, size(centers)[2])) + for (center, samples) in zip((view(centers,:,j) for j in axes(centers, 2)), _gather_samples(assignments, size(centers, 2))) ) return inner_inertia end From f3ae01c3659c4fc3e364f4492670321f28f38d2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 31 Oct 2023 00:56:22 +0100 Subject: [PATCH 77/82] Update clustering_quality.jl --- test/clustering_quality.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/clustering_quality.jl b/test/clustering_quality.jl index 71b9f2b5..acfe20d8 100644 --- a/test/clustering_quality.jl +++ b/test/clustering_quality.jl @@ -78,7 +78,7 @@ using Clustering, Distances degC = [0 4; 2 0; 0 -4; -2 0; -2 0] degA = [1, 1, 2, 2, 2, 2, 3, 3, 5, 5, 5, 5] # no 4th cluster - @test_logs (:warn, "Cluster number 4 is empty. Clustering quality calculation may not be reliable.") clustering_quality(Y', degC', degA; quality_index = :calinski_harabasz) + @test_logs (:warn, "Detected empty cluster(s) no.: 4. clustering_quality() results might be incorrect.") clustering_quality(Y', degC', degA; quality_index = :calinski_harabasz) end end \ No newline at end of file From 714f689e4028ec46dd25e65ba9747f730e1b3840 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Tue, 31 Oct 2023 12:56:47 +0100 Subject: [PATCH 78/82] Update validate.md --- docs/source/validate.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/validate.md b/docs/source/validate.md index 0c969372..d846b1d4 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -151,7 +151,7 @@ scatter(X[1,:],X[2,:], ) ``` -Hard clustering quality for K-means method number of clusters in `2:5` +Hard clustering quality for K-means method with number of clusters in `2:5`. ```@example using Plots, Clustering @@ -170,7 +170,7 @@ kmeans_quality = p = [ plot(2:5, kmeans_quality[qidx], marker = :circle, - title = string.(qidx), + title = ":"*string.(qidx), label = nothing, ) for qidx in hard_indices @@ -181,7 +181,7 @@ plot(p..., ) ``` -Fuzzy clustering quality for fuzzy C-means method with number of clusters in `2:5` +Fuzzy clustering quality for fuzzy C-means method with number of clusters in `2:5`. ```@example using Plots, Clustering X = hcat([4., 5.] 
.+ 0.4 * randn(2, 10), @@ -201,7 +201,7 @@ fuzzy_cmeans_quality = p = [ plot(2:5, fuzzy_cmeans_quality[qidx], marker = :circle, - title = string.(qidx), + title = ":"*string.(qidx), label = nothing, ) for qidx in fuzzy_indices From 92c7e7f08dde6b1d424de797407c51fcd1279090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Mon, 15 Jan 2024 20:26:02 -0800 Subject: [PATCH 79/82] add clustering_quality() --- .gitignore | 3 + docs/source/validate.md | 150 +++++++++++++++- examples/clustering_quality.jl | 55 ++++++ src/Clustering.jl | 8 +- src/clustering_quality.jl | 314 +++++++++++++++++++++++++++++++++ src/mutualinfo.jl | 4 +- test/clustering_quality.jl | 122 +++++++++++++ test/runtests.jl | 1 + 8 files changed, 649 insertions(+), 8 deletions(-) create mode 100644 examples/clustering_quality.jl create mode 100644 src/clustering_quality.jl create mode 100644 test/clustering_quality.jl diff --git a/.gitignore b/.gitignore index 0b53ec64..b25d56b3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ doc/build Manifest.toml +*.swp +.vscode +docs/build/ diff --git a/docs/source/validate.md b/docs/source/validate.md index 11485389..8b1fdbe6 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -16,7 +16,6 @@ It shows how similar are the two clusterings on a cluster level. counts(a::ClusteringResult, b::ClusteringResult) ``` - ## Rand index [Rand index](http://en.wikipedia.org/wiki/Rand_index) is a measure of @@ -28,7 +27,6 @@ even when the original class labels are not used. randindex ``` - ## Silhouettes [Silhouettes](http://en.wikipedia.org/wiki/Silhouette_(clustering)) is @@ -46,14 +44,156 @@ s_i = \frac{b_i - a_i}{\max(a_i, b_i)}, \ \text{where} from the ``i``-th point to the points in the ``k``-th cluster. Note that ``s_i \le 1``, and that ``s_i`` is close to ``1`` when the ``i``-th -point lies well within its own cluster. This property allows using -`mean(silhouettes(assignments, counts, X))` as a measure of clustering quality. +point lies well within its own cluster. This property allows using average silhouette value +`mean(silhouettes(assignments, counts, X))` as a measure of clustering quality; it is also available using `clustering_quality(...; quality_index = :silhouettes)` method. Higher values indicate better separation of clusters w.r.t. point distances. ```@docs silhouettes ``` +## Clustering quality indices + +A group of clustering evaluation metrics which are intrinsic, i.e. depend only on the clustering itself. They can be used to compare different clustering algorithms or choose the optimal number of clusters. 
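Before the individual definitions, here is a minimal sketch of how such an index might be used to pick the number of clusters (illustrative only; it assumes the `clustering_quality` interface documented in this section and a hypothetical data matrix `X`):

```julia
using Clustering

X = rand(2, 100)                 # hypothetical data, one column per point
ks = 2:6
# Calinski-Harabasz: higher values indicate a better clustering
ch = [clustering_quality(X, kmeans(X, k); quality_index = :calinski_harabasz) for k in ks]
best_k = ks[argmax(ch)]
```

The supported indices and their properties are summarized in the table below.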
+ + + +| **index name** | **quality_index** | **type** | **direction** | **cluster centers** | +|:-----------------:|:--------------------:|:----------:|:-------------:|:-------------------:| +| Calinski-Harabasz | `:calinsky_harabasz` | hard/fuzzy | up | required | +| Xie-Beni | `:xie_beni` | hard/fuzzy | down | required | +| Davis-Bouldin | `:davis_bouldin` | hard | down | required | +| Dunn | `:dunn` | hard | up | not required | +| silhouettes | `:silhouettes` | hard | up | not required | + + +```@docs +Clustering.clustering_quality +``` + +Notation for the index definitions below: +- ``x_1, x_2, \ldots, x_n``: data points, +- ``C_1, C_2, \ldots, C_k``: clusters, +- ``c_j`` and ``c``: cluster centers and global dataset center, +- ``d``: a similarity (distance) function, +- ``w_{ij}``: weights measuring membership of a point ``x_i`` to a cluster ``C_j``, +- ``\alpha``: a fuzziness parameter. + +### Calinski-Harabasz index + +Option `:calinski_harabasz`. Higher values indicate better quality. Measures corrected ratio between global inertia of the cluster centers and the summed internal inertias of clusters. For hard and fuzzy (soft) clustering it is defined as + +```math + +\frac{n-k}{k-1}\frac{\sum_{C_j}|C_j|d(c_j,c)}{\sum\limits_{C_j}\sum\limits_{x_i\in C_j} d(x_i,c_j)} \quad \text{and}\quad +\frac{n-k}{k-1} \frac{\sum\limits_{C_j}\left(\sum\limits_{x_i}w_{ij}^\alpha\right) d(c_j,c)}{\sum_{C_j} \sum_{x_i} w_{ij}^\alpha d(x_i,c_j)} +``` +respectively. + + +### Xie-Beni index +Option `:xie_beni`. Lower values indicate better quality. Measures ratio between summed inertia of clusters and minimum distance between cluster centres. For hard clustering and fuzzy (soft) clustering. It is defined as +```math +\frac{\sum_{C_j}\sum_{x_i\in C_j}d(x_i,c_j)}{n\min\limits_{c_{j_1}\neq c_{j_2}} d(c_{j_1},c_{j_2}) } +\quad \text{and}\quad +\frac{\sum_{C_j}\sum_{x_i} w_{ij}^\alpha d(x_i,c_j)}{n\min\limits_{c_{j_1}\neq c_{j_2}} d(c_{j_1},c_{j_2}) } +``` +respectively. + +### [Davis-Bouldin index](https://en.wikipedia.org/wiki/Davies%E2%80%93Bouldin_index) +Option `:davis_bouldin`. Lower values indicate better quality. It measures average cohesion based on the cluster diameters and distances between cluster centers. It is defined as + +```math +\frac{1}{k}\sum_{C_{j_1}}\max_{c_{j_2}\neq c_{j_1}}\frac{S(C_{j_1})+S(C_{j_2})}{d(c_{j_1},c_{j_2})} +``` +where +```math +S(C_j) = \frac{1}{|C_j|}\sum_{x_i\in C_j}d(x_i,c_j). +``` +### [Dunn index](https://en.wikipedia.org/wiki/Dunn_index) +Option `:dunn`. Higher values indicate better quality. More computationally demanding index which can be used when the centres are not known. It measures ratio between the nearest neighbour distance divided by the maximum cluster diameter. It is defined as +```math +\frac{\min\limits_{ C_{j_1}\neq C_{j_2}} \mathrm{dist}(C_{j_1},C_{j_2})}{\max\limits_{C_j}\mathrm{diam}(C_j)} +``` +where +```math +\mathrm{dist}(C_{j_1},C_{j_2}) = \min\limits_{x_{i_1}\in C_{j_1},x_{i_2}\in C_{j_2}} d(x_{i_1},x_{i_2}),\quad \mathrm{diam}(C_j) = \max\limits_{x_{i_1},x_{i_2}\in C_j} d(x_{i_1},x_{i_2}). +``` + +### Average silhouette index + +Option `:silhouettes`. Higher values indicate better quality. It returns the average over silhouette values in the whole data set. See section [Silhouettes](#silhouettes) for a more detailed description of the method. + + +### References +> Olatz Arbelaitz *et al.* (2013). *An extensive comparative study of cluster validity indices*. Pattern Recognition. 46 1: 243-256. 
[doi:10.1016/j.patcog.2012.07.021](https://doi.org/10.1016/j.patcog.2012.07.021) + +> Aybükë Oztürk, Stéphane Lallich, Jérôme Darmont. (2018). *A Visual Quality Index for Fuzzy C-Means*. 14th International Conference on Artificial Intelligence Applications and Innovations (AIAI 2018). 546-555. [doi:10.1007/978-3-319-92007-8_46](https://doi.org/10.1007/978-3-319-92007-8_46). + +### Examples + +Exemplary data with 3 real clusters. +```@example +using Plots, Clustering +X = hcat([4., 5.] .+ 0.4 * randn(2, 10), + [9., -5.] .+ 0.4 * randn(2, 5), + [-4., -9.] .+ 1 * randn(2, 5)) + + +scatter(view(X, 1, :), view(X, 2, :), + label = "data points", + xlabel = "x", + ylabel = "y", + legend = :right, +) +``` + +Hard clustering quality for K-means method with 2 to 5 clusters: + +```@example +using Plots, Clustering +X = hcat([4., 5.] .+ 0.4 * randn(2, 10), + [9., -5.] .+ 0.4 * randn(2, 5), + [-4., -9.] .+ 1 * randn(2, 5)) + +nclusters = 2:5 +clusterings = kmeans.(Ref(X), nclusters) + +plot(( + plot(nclusters, + clustering_quality.(Ref(X), clusterings, quality_index = qidx), + marker = :circle, + title = ":$qidx", label = nothing, + ) for qidx in [:silhouettes, :calinski_harabasz, :xie_beni, :davies_bouldin, :dunn])..., + layout = (3, 2), + xaxis = "N clusters", + plot_title = "\"Hard\" clustering quality indices" +) +``` + +Fuzzy clustering quality for fuzzy C-means method with 2 to 5 clusters: +```@example +using Plots, Clustering +X = hcat([4., 5.] .+ 0.4 * randn(2, 10), + [9., -5.] .+ 0.4 * randn(2, 5), + [-4., -9.] .+ 1 * randn(2, 5)) + +fuzziness = 2 +fuzzy_nclusters = 2:5 +fuzzy_clusterings = fuzzy_cmeans.(Ref(X), fuzzy_nclusters, fuzziness) + +plot(( + plot(fuzzy_nclusters, + clustering_quality.(Ref(X), fuzzy_clusterings, + fuzziness = fuzziness, quality_index = qidx), + marker = :circle, + title = ":$qidx", label = nothing, + ) for qidx in [:calinski_harabasz, :xie_beni])..., + layout = (2, 1), + xaxis = "N clusters", + plot_title = "\"Soft\" clustering quality indices" +) +``` ## Variation of Information @@ -64,7 +204,7 @@ information*, but it is a true metric, *i.e.* it is symmetric and satisfies the triangle inequality. ```@docs -varinfo +Clustering.varinfo ``` diff --git a/examples/clustering_quality.jl b/examples/clustering_quality.jl new file mode 100644 index 00000000..0c902707 --- /dev/null +++ b/examples/clustering_quality.jl @@ -0,0 +1,55 @@ +using Plots, Clustering + +## test data with 3 clusters +X = hcat([4., 5.] .+ 0.4 * randn(2, 10), + [9., -5.] .+ 0.4 * randn(2, 5), + [-4., -9.] 
.+ 1 * randn(2, 5)) + +## visualisation of the exemplary data +scatter(X[1,:], X[2,:], + label = "data points", + xlabel = "x", + ylabel = "y", + legend = :right, +) + +nclusters = 2:5 + +## hard clustering quality +clusterings = kmeans.(Ref(X), nclusters) +hard_indices = [:silhouettes, :calinski_harabasz, :xie_beni, :davies_bouldin, :dunn] + +kmeans_quality = Dict( + qidx => clustering_quality.(Ref(X), clusterings, quality_index = qidx) + for qidx in hard_indices) + +plot(( + plot(nclusters, kmeans_quality[qidx], + marker = :circle, + title = qidx, + label = nothing, + ) for qidx in hard_indices)..., + layout = (3, 2), + xaxis = "N clusters", + plot_title = "\"Hard\" clustering quality indices" +) + +## soft clustering quality +fuzziness = 2 +fuzzy_clusterings = fuzzy_cmeans.(Ref(X), nclusters, fuzziness) +soft_indices = [:calinski_harabasz, :xie_beni] + +fuzzy_cmeans_quality = Dict( + qidx => clustering_quality.(Ref(X), fuzzy_clusterings, fuzziness = fuzziness, quality_index = qidx) + for qidx in soft_indices) + +plot(( + plot(nclusters, fuzzy_cmeans_quality[qidx], + marker = :circle, + title = qidx, + label = nothing, + ) for qidx in soft_indices)..., + layout = (2, 1), + xaxis = "N clusters", + plot_title = "\"Soft\" clustering quality indices" +) diff --git a/src/Clustering.jl b/src/Clustering.jl index 808b37b2..fae9ee82 100644 --- a/src/Clustering.jl +++ b/src/Clustering.jl @@ -49,6 +49,9 @@ module Clustering # silhouette silhouettes, + # quality indices + clustering_quality, + # varinfo varinfo, @@ -70,6 +73,7 @@ module Clustering # pair confusion matrix confusion + ## source files include("utils.jl") @@ -84,13 +88,15 @@ module Clustering include("counts.jl") include("cluster_distances.jl") + include("silhouette.jl") + include("clustering_quality.jl") + include("randindex.jl") include("varinfo.jl") include("vmeasure.jl") include("mutualinfo.jl") include("confusion.jl") - include("hclust.jl") include("deprecate.jl") diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl new file mode 100644 index 00000000..8acadc19 --- /dev/null +++ b/src/clustering_quality.jl @@ -0,0 +1,314 @@ +# main method for hard clustering indices + docs +""" +For hard clustering: + + clustering_quality(data, centers, assignments; quality_index, [metric]) + clustering_quality(data, clustering; quality_index, [metric]) + +For fuzzy clustering: + + clustering_quality(data, centers, weights; quality_index, fuzziness, [metric]) + clustering_quality(data, clustering; quality_index, fuzziness, [metric]) + +For hard clustering without cluster centers known: + + clustering_quality(assignments, dist_matrix; quality_index) + clustering_quality(clustering, dist_matrix; quality_index) + clustering_quality(data, assignments; quality_index, [metric]) + clustering_quality(data, clustering; quality_index, [metric]) + +Compute the clustering quality index for a given clustering. + +Returns a real number which is the value of the chosen quality index type of the given clustering. 
+ +# Arguments + - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point + - `centers::AbstractMatrix`: ``d×k`` matrix with cluster centers represented as columns + - `assignments::AbstractVector{Int}`: ``n`` vector of point assignments (cluster indices) + - `weights::AbstractMatrix`: ``n×k`` matrix with fuzzy clustering weights, `weights[i,j]` is the degree of membership of ``i``-th data point to ``j``-th cluster + - `clustering::Union{ClusteringResult, FuzzyCMeansResult}`: the output of the clustering method + - `quality_index::Symbol`: quality index to calculate; see below for the supported options + - `dist_matrix::AbstractMatrix`: a ``n×n`` pairwise distance matrix; `dist_matrix[i,j]` is the distance between ``i``-th and ``j``-th points + + # Keyword arguments + - `quality_index::Symbol`: quality index to calculate; see below for the supported options + - `fuzziness::Real`: clustering fuzziness > 1 + - `metric::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the metric/distance/similarity function + +When calling `clustering_quality` one can give `centers`, `assignments` or `weights` arguments by hand or provide a single `clustering` argument from which the necessary data will be read automatically. + +For clustering without known cluster centers the datapoints are not required, only `dist_matrix` is necessary. If given, `data` and `metric` will be used to calculate distance matrix instead. + +# Supported quality indices + +Symbols ↑/↓ are quality direction. +- `:calinski_harabasz`: hard or fuzzy Calinski-Harabsz index (↑) returns the corrected ratio of between cluster centers inertia and within-clusters inertia +- `:xie_beni`: hard or fuzzy Xie-Beni index (↓) returns ratio betwen inertia within clusters and minimal distance between cluster centers +- `:davies_bouldin`: Davies-Bouldin index (↓) returns average similarity between each cluster and its most similar one, averaged over all the clusters +- `:dunn`: Dunn index (↑) returns ratio between minimal distance between clusters and maximal cluster diameter; it does not make use of `centers` argument +- `:silhouettes`: average silhouette index (↑), for all silhouettes use [`silhouettes`](@ref) method instead; it does not make use of `centers` argument +Please refer to the [documentation](@ref clustering_quality) for the definitions and usage descriptions of the supported quality indices. + +""" +function clustering_quality( + data::AbstractMatrix{<:Real}, # d×n matrix + centers::AbstractMatrix{<:Real}, # d×k matrix + assignments::AbstractVector{<:Integer}; # n vector + quality_index::Symbol, + metric::SemiMetric=SqEuclidean() +) + d, n = size(data) + dc, k = size(centers) + + d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `data` and `centers`.")) + (1 <= k <= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) + k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) + n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) + seen_clusters = falses(k) + for (i, clu) in enumerate(assignments) + (clu in axes(centers, 2)) || throw(ArgumentError("Invalid cluster index: assignments[$i]=$(clu).")) + seen_clusters[clu] = true + end + if !all(seen_clusters) + empty_clu_ixs = findall(!, seen_clusters) + @warn "Detected empty cluster(s): $(join(string.("#", empty_clu_ixs), ", ")). 
clustering_quality() results might be incorrect." + + newClusterIndices = cumsum(seen_clusters) + centers = view(centers, :, seen_clusters) + assignments = newClusterIndices[assignments] + end + + if quality_index == :calinski_harabasz + _cluquality_calinski_harabasz(metric, data, centers, assignments, nothing) + elseif quality_index == :xie_beni + _cluquality_xie_beni(metric, data, centers, assignments, nothing) + elseif quality_index == :davies_bouldin + _cluquality_davies_bouldin(metric, data, centers, assignments) + elseif quality_index == :silhouettes + mean(silhouettes(assignments, pairwise(metric, data, dims=2))) + elseif quality_index == :dunn + _cluquality_dunn(assignments, pairwise(metric, data, dims=2)) + else + throw(ArgumentError("quality_index=:$quality_index not supported.")) + end +end + +clustering_quality(data::AbstractMatrix{<:Real}, R::ClusteringResult; + quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = + clustering_quality(data, R.centers, R.assignments; + quality_index = quality_index, metric = metric) + + +# main method for fuzzy clustering indices +function clustering_quality( + data::AbstractMatrix{<:Real}, # d×n matrix + centers::AbstractMatrix{<:Real}, # d×k matrix + weights::AbstractMatrix{<:Real}; # n×k matrix + quality_index::Symbol, + fuzziness::Real, + metric::SemiMetric=SqEuclidean() +) + d, n = size(data) + dc, k = size(centers) + nw, kw = size(weights) + + d == dc || throw(DimensionMismatch("Inconsistent array dimensions for `data` and `centers`.")) + n == nw || throw(DimensionMismatch("Inconsistent data length for `data` and `weights`.")) + k == kw || throw(DimensionMismatch("Inconsistent number of clusters for `centers` and `weights`.")) + (1 <= k <= n) || throw(ArgumentError("Number of clusters k must be from 1:n (n=$n), k=$k given.")) + k >= 2 || throw(ArgumentError("Quality index not defined for the degenerated clustering with a single cluster.")) + n == k && throw(ArgumentError("Quality index not defined for the degenerated clustering where each data point is its own cluster.")) + all(>=(0), weights) || throw(ArgumentError("All weights must be larger or equal 0.")) + 1 < fuzziness || throw(ArgumentError("Fuzziness must be greater than 1 ($fuzziness given)")) + + if quality_index == :calinski_harabasz + _cluquality_calinski_harabasz(metric, data, centers, weights, fuzziness) + elseif quality_index == :xie_beni + _cluquality_xie_beni(metric, data, centers, weights, fuzziness) + elseif quality_index in [:davies_bouldin, :silhouettes, :dunn] + throw(ArgumentError("quality_index=:$quality_index does not support fuzzy clusterings.")) + else + throw(ArgumentError("quality_index=:$quality_index not supported.")) + end +end + +clustering_quality(data::AbstractMatrix{<:Real}, R::FuzzyCMeansResult; + quality_index::Symbol, fuzziness::Real, metric::SemiMetric=SqEuclidean()) = + clustering_quality(data, R.centers, R.weights; + quality_index = quality_index, + fuzziness = fuzziness, metric = metric) + + +# main method for clustering indices when cluster centres not known +function clustering_quality( + assignments::AbstractVector{<:Integer}, # n vector + dist::AbstractMatrix{<:Real}; # n×n matrix + quality_index::Symbol +) + n, m = size(dist) + na = length(assignments) + n == m || throw(ArgumentError("Distance matrix must be square.")) + n == na || throw(DimensionMismatch("Inconsistent array dimensions for distance matrix and assignments.")) + + if quality_index == :silhouettes + mean(silhouettes(assignments, dist)) + elseif quality_index == :dunn + 
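        # Dunn is computed directly from the precomputed point-to-point distance matrix, no centers needed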
_cluquality_dunn(assignments, dist) + elseif quality_index ∈ [:calinski_harabasz, :xie_beni, :davies_bouldin] + throw(ArgumentError("quality_index=:$quality_index requires cluster centers.")) + else + throw(ArgumentError("quality_index=:$quality_index not supported.")) + end +end + + +clustering_quality(data::AbstractMatrix{<:Real}, assignments::AbstractVector{<:Integer}; + quality_index::Symbol, metric::SemiMetric=SqEuclidean()) = + clustering_quality(assignments, pairwise(metric, data, dims=2); + quality_index = quality_index) + +clustering_quality(R::ClusteringResult, dist::AbstractMatrix{<:Real}; + quality_index::Symbol) = + clustering_quality(R.assignments, dist; + quality_index = quality_index) + + +# utility functions + +# convert assignments into a vector of vectors of data point indices for each cluster +function _gather_samples(assignments, k) + cluster_samples = [Int[] for _ in 1:k] + for (i, a) in zip(eachindex(assignments), assignments) + push!(cluster_samples[a], i) + end + return cluster_samples +end + +# shared between hard clustering calinski_harabasz and xie_beni +function _inner_inertia( + metric::SemiMetric, + data::AbstractMatrix, + centers::AbstractMatrix, + assignments::AbstractVector{<:Integer}, + fuzziness::Nothing +) + inner_inertia = sum( + sum(colwise(metric, view(data, :, samples), center)) + for (center, samples) in zip((view(centers, :, j) for j in axes(centers, 2)), + _gather_samples(assignments, size(centers, 2))) + ) + return inner_inertia +end + +# shared between fuzzy clustering calinski_harabasz and xie_beni (fuzzy version) +function _inner_inertia( + metric::SemiMetric, + data::AbstractMatrix, + centers::AbstractMatrix, + weights::AbstractMatrix, + fuzziness::Real +) + data_to_center_dists = pairwise(metric, data, centers, dims=2) + inner_inertia = sum( + w^fuzziness * d for (w, d) in zip(weights, data_to_center_dists) + ) + return inner_inertia +end + +# hard outer inertia for calinski_harabasz +function _outer_inertia( + metric::SemiMetric, + data::AbstractMatrix, + centers::AbstractMatrix, + assignments::AbstractVector{<:Integer}, + fuzziness::Nothing +) + global_center = vec(mean(data, dims=2)) + center_distances = colwise(metric, centers, global_center) + return sum(center_distances[clu] for clu in assignments) +end + +# fuzzy outer inertia for calinski_harabasz +function _outer_inertia( + metric::SemiMetric, + data::AbstractMatrix, + centers::AbstractMatrix, + weights::AbstractMatrix, + fuzziness::Real +) + global_center = vec(mean(data, dims=2)) + center_distances = colwise(metric, centers, global_center) + return sum(sum(w^fuzziness for w in view(weights, :, clu)) * d + for (clu, d) in enumerate(center_distances)) +end + +# Calinsk-Harabasz index +function _cluquality_calinski_harabasz( + metric::SemiMetric, + data::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:Real}, + assignments::Union{AbstractVector{<:Integer}, AbstractMatrix{<:Real}}, + fuzziness::Union{Real, Nothing} +) + n, k = size(data, 2), size(centers, 2) + outer_inertia = _outer_inertia(metric, data, centers, assignments, fuzziness) + inner_inertia = _inner_inertia(metric, data, centers, assignments, fuzziness) + return (outer_inertia / inner_inertia) * (n - k) / (k - 1) +end + + +# Davies Bouldin index +function _cluquality_davies_bouldin( + metric::SemiMetric, + data::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:Real}, + assignments::AbstractVector{<:Integer}, +) + clu_idx = axes(centers, 2) + clu_samples = _gather_samples(assignments, length(clu_idx)) + clu_diams = 
[mean(colwise(metric, view(data, :, samples), view(centers, :, clu))) + for (clu, samples) in zip(clu_idx, clu_samples)] + center_dists = pairwise(metric, centers, dims=2) + + DB = mean( + maximum(@inbounds (clu_diams[j₁] + clu_diams[j₂]) / center_dists[j₁, j₂] + for j₂ in clu_idx if j₂ ≠ j₁) + for j₁ in clu_idx) + return DB +end + + +# Xie-Beni index +function _cluquality_xie_beni( + metric::SemiMetric, + data::AbstractMatrix{<:Real}, + centers::AbstractMatrix{<:Real}, + assignments::Union{AbstractVector{<:Integer}, AbstractMatrix{<:Real}}, + fuzziness::Union{Real, Nothing} +) + n, k = size(data, 2), size(centers, 2) + inner_intertia = _inner_inertia(metric, data, centers, assignments, fuzziness) + center_distances = pairwise(metric, centers, dims=2) + min_center_distance = minimum(center_distances[j₁,j₂] for j₁ in 1:k for j₂ in j₁+1:k) + + return inner_intertia / (n * min_center_distance) +end + +# Dunn index +function _cluquality_dunn(assignments::AbstractVector{<:Integer}, dist::AbstractMatrix{<:Real}) + max_inner_distance, min_outer_distance = typemin(eltype(dist)), typemax(eltype(dist)) + + for i in eachindex(assignments), j in (i + 1):lastindex(assignments) + @inbounds d = dist[i, j] + if assignments[i] == assignments[j] + if max_inner_distance < d + max_inner_distance = d + end + else + if min_outer_distance > d + min_outer_distance = d + end + end + end + return min_outer_distance / max_inner_distance +end diff --git a/src/mutualinfo.jl b/src/mutualinfo.jl index f50a7e4f..65b0a527 100644 --- a/src/mutualinfo.jl +++ b/src/mutualinfo.jl @@ -35,7 +35,7 @@ If `normed` parameter is `true` the return value is the normalized mutual inform see "Data Mining Practical Machine Tools and Techniques", Witten & Frank 2005. # References -> Vinh, Epps, and Bailey, (2009). “Information theoretic measures for clusterings comparison”. -Proceedings of the 26th Annual International Conference on Machine Learning - ICML ‘09. +> Vinh, Epps, and Bailey, (2009). *Information theoretic measures for clusterings comparison*. +> Proceedings of the 26th Annual International Conference on Machine Learning - ICML ‘09. 
""" mutualinfo(a, b; normed::Bool=true) = _mutualinfo(counts(a, b), normed) diff --git a/test/clustering_quality.jl b/test/clustering_quality.jl new file mode 100644 index 00000000..4e7c1d3a --- /dev/null +++ b/test/clustering_quality.jl @@ -0,0 +1,122 @@ +using Test +using Clustering, Distances + +@testset "clustering_quality()" begin + + # test data with 12 2D points and 4 clusters + Y = [-2 2 2 3 2 1 2 -2 -2 -1 -2 -3 + 4 4 1 0 -1 0 -4 -4 1 0 -1 0] + # cluster centers + C = [0 2 0 -2 + 4 0 -4 0] + # point-to-cluster assignments + A = [1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4] + # convert A to fuzzy clusters weights + W = zeros(Int, (size(Y, 2), size(C, 2))) + for (i, c) in enumerate(A) + W[i, c] = 1 + end + # fuzzy clustering with 4 clusters + W2 = [ + 1 0 0 0 + 1 0 0 0 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 0 1 0 + 0 0 1 0 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + 0 1/2 0 1/2 + ] + # mock hard and fuzzy clusterings for testing interface; only C, W and A arguments are actually used + A_kmeans = KmeansResult(Float64.(C), A, ones(12), [4, 4, 4], ones(4), 42., 42, true) + W_cmeans = FuzzyCMeansResult(Float64.(C), Float64.(W), 42, true) + W2_cmeans = FuzzyCMeansResult(Float64.(C), Float64.(W2), 42, true) + + @testset "input checks" begin + @test_throws ArgumentError clustering_quality(zeros(2,2), zeros(2,3), [1, 2], quality_index = :calinski_harabasz) + @test_throws DimensionMismatch clustering_quality(zeros(2,2), zeros(3,2), [1, 2], quality_index = :calinski_harabasz) + @test_throws ArgumentError clustering_quality(zeros(2,2), zeros(2,1), [1, ], quality_index = :calinski_harabasz) + @test_throws ArgumentError clustering_quality(zeros(2,2), zeros(2,2), [1, 2], quality_index = :calinski_harabasz) + @test_throws ArgumentError clustering_quality(zeros(0,0), zeros(0,0), zeros(Int,0); quality_index = :calinski_harabasz) + @test_throws ArgumentError clustering_quality(zeros(0,0), zeros(0,0), zeros(0,0); quality_index = :calinski_harabasz, fuzziness = 2) + @test_throws DimensionMismatch clustering_quality([1,2,3], zeros(2,2), quality_index = :dunn) + # wrong quality index + @test_throws ArgumentError clustering_quality(Y, C, A; quality_index = :nonexistent_index) + @test_throws ArgumentError clustering_quality(Y, C, W; quality_index = :nonexistent_index, fuzziness = 2) + @test_throws ArgumentError clustering_quality(Y, A; quality_index = :nonexistent_index) + end + + @testset "correct quality index values" begin + @testset "calinski_harabasz" begin + @test clustering_quality(Y, C, A; quality_index = :calinski_harabasz, metric = Euclidean()) ≈ (32/3) / (16/8) + @test clustering_quality(Y, A_kmeans; quality_index = :calinski_harabasz, metric = Euclidean()) ≈ (32/3) / (16/8) + # requires centers + @test_throws ArgumentError clustering_quality(A_kmeans, pairwise(Euclidean(), Y, dims=2); quality_index = :calinski_harabasz) + + @test clustering_quality(Y, C, W; quality_index = :calinski_harabasz, fuzziness = 2, metric = Euclidean()) ≈ (32/3) / (16/8) + @test clustering_quality(Y, W_cmeans; quality_index = :calinski_harabasz, fuzziness = 2, metric = Euclidean()) ≈ (32/3) / (16/8) + @test_throws MethodError clustering_quality(W_cmeans, pairwise(Euclidean(), Y, dims=2); quality_index = :calinski_harabasz, fuzziness = 2) ≈ (32/3) / (16/8) + + @test clustering_quality(Y, C, W2; quality_index = :calinski_harabasz, fuzziness = 2, metric = Euclidean()) ≈ 8/3 * ( 24 ) / (14+sqrt(17)) + @test clustering_quality(Y, W2_cmeans; quality_index = :calinski_harabasz, fuzziness = 2, metric = Euclidean()) 
≈ 8/3 * ( 24 ) / (14+sqrt(17)) + @test_throws MethodError clustering_quality(W2_cmeans, pairwise(Euclidean(), Y, dims=2); quality_index = :calinski_harabasz, fuzziness = 2) + end + + @testset "davies_bouldin" begin + @test clustering_quality(Y, C, A; quality_index = :davies_bouldin, metric = Euclidean()) ≈ 3/sqrt(20) + @test clustering_quality(Y, A_kmeans; quality_index = :davies_bouldin, metric = Euclidean()) ≈ 3/sqrt(20) + # requires centers + @test_throws ArgumentError clustering_quality(A_kmeans, pairwise(Euclidean(), Y, dims=2); quality_index = :davies_bouldin) ≈ 3/sqrt(20) + # fuzziness not supported + @test_throws ArgumentError clustering_quality(Y, W_cmeans; quality_index = :davies_bouldin, fuzziness = 2) + end + + @testset "dunn" begin + @test clustering_quality(Y, C, A; quality_index = :dunn, metric = Euclidean()) ≈ 1/2 + @test clustering_quality(Y, A_kmeans; quality_index = :dunn, metric = Euclidean()) ≈ 1/2 + @test clustering_quality(A_kmeans, pairwise(Euclidean(), Y, dims=2); quality_index = :dunn) ≈ 1/2 + # fuzziness not supported + @test_throws ArgumentError clustering_quality(Y, W_cmeans; quality_index = :dunn, fuzziness = 2) + end + + @testset "xie_beni" begin + @test clustering_quality(Y, C, A; quality_index = :xie_beni, metric = Euclidean()) ≈ 1/3 + + @test clustering_quality(Y, C, W; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ 1/3 + @test clustering_quality(Y, W_cmeans; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ 1/3 + + @test clustering_quality(Y, C, W2; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ (14+sqrt(17)) / (12 * 4) + @test clustering_quality(Y, W2_cmeans; quality_index = :xie_beni, fuzziness = 2, metric = Euclidean()) ≈ (14+sqrt(17)) / (12 * 4) + end + + @testset "silhouettes" begin + avg_silh = 1 - 1/12*( # average over silhouettes 1 - a_i * 1/b_i + + 4 * 16 /(3+2sqrt(17)+5) # 4 points in clusters 1 and 3 + + 4 * (2sqrt(2)+2)/3 * 1/4 # 4 points in clusters 2 and 4, top + bottom + + 2 * (2sqrt(2)+2)/3 * 4/(4+2sqrt(26)+6) # 2 points clusters 2 and 4, left and right + + 2 * (2sqrt(2)+2)/3 * 4/(2+2sqrt(10)+4) # 2 points clusters 2 and 4, center + ) + @test clustering_quality(Y, A; quality_index = :silhouettes, metric = Euclidean()) ≈ avg_silh + @test clustering_quality(Y, A_kmeans; quality_index = :silhouettes, metric = Euclidean()) ≈ avg_silh + @test clustering_quality(A_kmeans, pairwise(Euclidean(), Y, dims=2); quality_index = :silhouettes) ≈ avg_silh + # fuzziness not supported + @test_throws ArgumentError clustering_quality(Y, W_cmeans; quality_index = :silhouettes, fuzziness = 2) + end + end + + @testset "empty clusters" begin + # degenerated clustering, no 4th cluster + degenC = [0 2 0 -2 -2 + 4 0 -4 0 0] + degenA = [1, 1, 2, 2, 2, 2, 3, 3, 5, 5, 5, 5] + + @test_logs((:warn, "Detected empty cluster(s): #4. 
clustering_quality() results might be incorrect."), + clustering_quality(Y, degenC, degenA; quality_index = :calinski_harabasz)) + @test clustering_quality(Y, degenC, degenA; quality_index = :calinski_harabasz, metric = Euclidean()) ≈ (32/3) / (16/8) + end + +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 9eaca2ed..42301653 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,6 +14,7 @@ tests = ["seeding", "fuzzycmeans", "counts", "silhouette", + "clustering_quality", "varinfo", "randindex", "hclust", From 6b446dbde18ab9775c7cf7888c5144471dbf24d7 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Mon, 15 Jan 2024 15:21:26 -0800 Subject: [PATCH 80/82] docs: refactor clustering validation --- docs/source/index.md | 8 +- docs/source/validate.md | 212 +++++++++++++++++++------------------- src/clustering_quality.jl | 48 +++++---- src/confusion.jl | 5 + src/counts.jl | 5 + 5 files changed, 149 insertions(+), 129 deletions(-) diff --git a/docs/source/index.md b/docs/source/index.md index b0c25ed1..21c4b949 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -3,7 +3,7 @@ *Clustering.jl* is a Julia package for data clustering. It covers the two aspects of data clustering: - - [Clustering Algorithms](@ref clu_algo_basics), *e.g.* K-means, K-medoids, Affinity - propagation, and DBSCAN, etc. - - [Clustering Evaluation](@ref clu_validate), *e.g.* Silhouettes and variational - information. + - [Clustering Algorithms](@ref clu_algo_basics): K-means, K-medoids, Affinity + propagation, DBSCAN etc. + - [Clustering Comparison & Evaluation](@ref clu_validate): cross-tabulation, variational + and mutual information, intrinsic clustering quality indices, such as *silhouettes*, etc. diff --git a/docs/source/validate.md b/docs/source/validate.md index 8b1fdbe6..5b3640be 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -1,10 +1,13 @@ # [Evaluation & Validation](@id clu_validate) -*Clustering.jl* package provides a number of methods to evaluate the results of -a clustering algorithm and/or to validate its correctness. +*Clustering.jl* package provides a number of methods to compare different clusterings, +evaluate clustering quality or validate its correctness. +## Clustering comparison -## Cross tabulation +Methods to compare two clusterings and measure their similarity. + +### Cross tabulation [Cross tabulation](https://en.wikipedia.org/wiki/Contingency_table), or *contingency matrix*, is a basis for many clustering quality measures. @@ -13,10 +16,20 @@ It shows how similar are the two clusterings on a cluster level. *Clustering.jl* extends `StatsBase.counts()` with methods that accept [`ClusteringResult`](@ref) arguments: ```@docs -counts(a::ClusteringResult, b::ClusteringResult) +counts(::ClusteringResult, ::ClusteringResult) +``` + +### Confusion matrix + +[Confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix) +for the two clusterings is a 2×2 contingency table that counts +how frequently the pair of data points are in the same or different clusters. + +```@docs +confusion ``` -## Rand index +### Rand index [Rand index](http://en.wikipedia.org/wiki/Rand_index) is a measure of the similarity between the two data clusterings. From a mathematical @@ -27,51 +40,68 @@ even when the original class labels are not used. randindex ``` -## Silhouettes +### Variation of Information -[Silhouettes](http://en.wikipedia.org/wiki/Silhouette_(clustering)) is -a method for evaluating the quality of clustering. 
Particularly, it provides a -quantitative way to measure how well each point lies within its cluster in -comparison to the other clusters. +[Variation of information](http://en.wikipedia.org/wiki/Variation_of_information) +(also known as *shared information distance*) is a measure of the +distance between the two clusterings. It is devised from the *mutual +information*, but it is a true metric, *i.e.* it is symmetric and satisfies +the triangle inequality. -The *Silhouette* value for the ``i``-th data point is: -```math -s_i = \frac{b_i - a_i}{\max(a_i, b_i)}, \ \text{where} +```@docs +Clustering.varinfo ``` - - ``a_i`` is the average distance from the ``i``-th point to the other points in - the same cluster ``z_i``, - - ``b_i ≝ \min_{k \ne z_i} b_{ik}``, where ``b_{ik}`` is the average distance - from the ``i``-th point to the points in the ``k``-th cluster. +### V-measure -Note that ``s_i \le 1``, and that ``s_i`` is close to ``1`` when the ``i``-th -point lies well within its own cluster. This property allows using average silhouette value -`mean(silhouettes(assignments, counts, X))` as a measure of clustering quality; it is also available using `clustering_quality(...; quality_index = :silhouettes)` method. -Higher values indicate better separation of clusters w.r.t. point distances. +*V*-measure can be used to compare the clustering results with the +existing class labels of data points or with the alternative clustering. +It is defined as the harmonic mean of homogeneity (``h``) and completeness +(``c``) of the clustering: +```math +V_{\beta} = (1+\beta)\frac{h \cdot c}{\beta \cdot h + c}. +``` +Both ``h`` and ``c`` can be expressed in terms of the mutual information and +entropy measures from the information theory. Homogeneity (``h``) is maximized +when each cluster contains elements of as few different classes as possible. +Completeness (``c``) aims to put all elements of each class in single clusters. +The ``\beta`` parameter (``\beta > 0``) could used to control the weights of +``h`` and ``c`` in the final measure. If ``\beta > 1``, *completeness* has more +weight, and when ``\beta < 1`` it's *homogeneity*. ```@docs -silhouettes +vmeasure ``` -## Clustering quality indices +### Mutual information -A group of clustering evaluation metrics which are intrinsic, i.e. depend only on the clustering itself. They can be used to compare different clustering algorithms or choose the optimal number of clusters. +[Mutual information](https://en.wikipedia.org/wiki/Mutual_information) +quantifies the "amount of information" obtained about one random variable +through observing the other random variable. It is used in determining +the similarity of two different clusterings of a dataset. +```@docs +mutualinfo +``` +## Clustering quality indices -| **index name** | **quality_index** | **type** | **direction** | **cluster centers** | -|:-----------------:|:--------------------:|:----------:|:-------------:|:-------------------:| -| Calinski-Harabasz | `:calinsky_harabasz` | hard/fuzzy | up | required | -| Xie-Beni | `:xie_beni` | hard/fuzzy | down | required | -| Davis-Bouldin | `:davis_bouldin` | hard | down | required | -| Dunn | `:dunn` | hard | up | not required | -| silhouettes | `:silhouettes` | hard | up | not required | +[`clustering_quality()`][@ref clustering_quality] methods allow computing *intrinsic* clustering quality indices, +i.e. the metrics that depend only on the clustering itself and do not use the external knowledge. 
+These metrics can be used to compare different clustering algorithms or choose the optimal number of clusters. + +| **quality index** | **`quality_index` option** | **clustering type** | **better quality** | **cluster centers** | +|:-------------------------------------------:|:--------------------:|:----------:|:-------------:|:-------------------:| +| [Calinski-Harabasz](@ref calinsky_harabasz) | `:calinski_harabasz` | hard/fuzzy | *higher* values | required | +| [Xie-Beni](@ref xie_beni) | `:xie_beni` | hard/fuzzy | *lower* values | required | +| [Davies-Bouldin](@ref davis_bouldin) | `:davies_bouldin` | hard | *lower* values | required | +| [Dunn](@ref dunn) | `:dunn` | hard | *higher* values | not required | +| [silhouettes](@ref silhouettes_index) | `:silhouettes` | hard | *higher* values | not required | ```@docs -Clustering.clustering_quality +clustering_quality ``` -Notation for the index definitions below: +The clustering quality index definitions use the following notation: - ``x_1, x_2, \ldots, x_n``: data points, - ``C_1, C_2, \ldots, C_k``: clusters, - ``c_j`` and ``c``: cluster centers and global dataset center, - ``d``: a similarity (distance) function, - ``w_{ij}``: weights measuring membership of a point ``x_i`` to a cluster ``C_j``, - ``\alpha``: a fuzziness parameter. -### Calinski-Harabasz index - -Option `:calinski_harabasz`. Higher values indicate better quality. Measures corrected ratio between global inertia of the cluster centers and the summed internal inertias of clusters. For hard and fuzzy (soft) clustering it is defined as +### [Calinski-Harabasz index](@id calinsky_harabasz) +[*Calinski-Harabasz* index](https://en.wikipedia.org/wiki/Calinski%E2%80%93Harabasz_index) (option `:calinski_harabasz`) +measures corrected ratio between global inertia of the cluster centers and the summed internal inertias of clusters: ```math - \frac{n-k}{k-1}\frac{\sum_{C_j}|C_j|d(c_j,c)}{\sum\limits_{C_j}\sum\limits_{x_i\in C_j} d(x_i,c_j)} \quad \text{and}\quad \frac{n-k}{k-1} \frac{\sum\limits_{C_j}\left(\sum\limits_{x_i}w_{ij}^\alpha\right) d(c_j,c)}{\sum_{C_j} \sum_{x_i} w_{ij}^\alpha d(x_i,c_j)} ``` -respectively. +for hard and fuzzy (soft) clusterings, respectively. +*Higher* values indicate better quality. +### [Xie-Beni index](@id xie_beni) -### Xie-Beni index -Option `:xie_beni`. Lower values indicate better quality. Measures ratio between summed inertia of clusters and minimum distance between cluster centres. For hard clustering and fuzzy (soft) clustering. It is defined as +*Xie-Beni* index (option `:xie_beni`) measures ratio between summed inertia of clusters +and the minimum distance between cluster centres: ```math \frac{\sum_{C_j}\sum_{x_i\in C_j}d(x_i,c_j)}{n\min\limits_{c_{j_1}\neq c_{j_2}} d(c_{j_1},c_{j_2}) } \quad \text{and}\quad \frac{\sum_{C_j}\sum_{x_i} w_{ij}^\alpha d(x_i,c_j)}{n\min\limits_{c_{j_1}\neq c_{j_2}} d(c_{j_1},c_{j_2}) } ``` -respectively. - -### [Davis-Bouldin index](https://en.wikipedia.org/wiki/Davies%E2%80%93Bouldin_index) -Option `:davis_bouldin`. Lower values indicate better quality. It measures average cohesion based on the cluster diameters and distances between cluster centers. It is defined as +for hard and fuzzy (soft) clusterings, respectively. +*Lower* values indicate better quality.
+### [Davies-Bouldin index](@id davis_bouldin) +[*Davies-Bouldin* index](https://en.wikipedia.org/wiki/Davies%E2%80%93Bouldin_index) +(option `:davies_bouldin`) measures average cohesion based on the cluster diameters and distances between cluster centers: ```math \frac{1}{k}\sum_{C_{j_1}}\max_{c_{j_2}\neq c_{j_1}}\frac{S(C_{j_1})+S(C_{j_2})}{d(c_{j_1},c_{j_2})} ``` where ```math S(C_j) = \frac{1}{|C_j|}\sum_{x_i\in C_j}d(x_i,c_j). ``` +*Lower* values indicate better quality. -### [Dunn index](https://en.wikipedia.org/wiki/Dunn_index) -Option `:dunn`. Higher values indicate better quality. More computationally demanding index which can be used when the centres are not known. It measures ratio between the nearest neighbour distance divided by the maximum cluster diameter. It is defined as +### [Dunn index](@id dunn) +[*Dunn* index](https://en.wikipedia.org/wiki/Dunn_index) (option `:dunn`) +measures the ratio between the nearest neighbour distance divided by the maximum cluster diameter: ```math \frac{\min\limits_{ C_{j_1}\neq C_{j_2}} \mathrm{dist}(C_{j_1},C_{j_2})}{\max\limits_{C_j}\mathrm{diam}(C_j)} ``` where ```math \mathrm{dist}(C_{j_1},C_{j_2}) = \min\limits_{x_{i_1}\in C_{j_1},x_{i_2}\in C_{j_2}} d(x_{i_1},x_{i_2}),\quad \mathrm{diam}(C_j) = \max\limits_{x_{i_1},x_{i_2}\in C_j} d(x_{i_1},x_{i_2}). ``` +It is a more computationally demanding quality index, which can be used when the centres are not known. *Higher* values indicate better quality. -### Average silhouette index -Option `:silhouettes`. Higher values indicate better quality. It returns the average over silhouette values in the whole data set. See section [Silhouettes](#silhouettes) for a more detailed description of the method. +### [Silhouettes](@id silhouettes_index) +[*Silhouettes* metric](http://en.wikipedia.org/wiki/Silhouette_(clustering)) quantifies the correctness of point-to-cluster assignment by +comparing the distance of the point to its cluster and to the other clusters. +The *Silhouette* value for the ``i``-th data point is: +```math +s_i = \frac{b_i - a_i}{\max(a_i, b_i)}, \ \text{where} +``` + - ``a_i`` is the average distance from the ``i``-th point to the other points in + the *same* cluster ``z_i``, + - ``b_i ≝ \min_{k \ne z_i} b_{ik}``, where ``b_{ik}`` is the average distance + from the ``i``-th point to the points in the ``k``-th cluster. + +Note that ``s_i \le 1``, and that ``s_i`` is close to ``1`` when the ``i``-th +point lies well within its own cluster. This property allows using average silhouette value +`mean(silhouettes(assignments, counts, X))` as a measure of clustering quality; +it is also available using [`clustering_quality(...; quality_index = :silhouettes)`](@ref clustering_quality) method. +Higher values indicate better separation of clusters w.r.t. point distances. ```@docs silhouettes ``` +[`clustering_quality(..., quality_index=:silhouettes)`][@ref clustering_quality] +provides mean silhouette metric for the datapoints. Higher values indicate better quality. -### References +## References > Olatz Arbelaitz *et al.* (2013). *An extensive comparative study of cluster validity indices*. Pattern Recognition. 46 1: 243-256. [doi:10.1016/j.patcog.2012.07.021](https://doi.org/10.1016/j.patcog.2012.07.021) > Aybükë Oztürk, Stéphane Lallich, Jérôme Darmont. (2018). *A Visual Quality Index for Fuzzy C-Means*. 14th International Conference on Artificial Intelligence Applications and Innovations (AIAI 2018). 546-555.
[doi:10.1007/978-3-319-92007-8_46](https://doi.org/10.1007/978-3-319-92007-8_46). @@ -195,62 +253,8 @@ plot(( ) ``` -## Variation of Information - -[Variation of information](http://en.wikipedia.org/wiki/Variation_of_information) -(also known as *shared information distance*) is a measure of the -distance between the two clusterings. It is devised from the *mutual -information*, but it is a true metric, *i.e.* it is symmetric and satisfies -the triangle inequality. - -```@docs -Clustering.varinfo -``` - - -## V-measure - -*V*-measure can be used to compare the clustering results with the -existing class labels of data points or with the alternative clustering. -It is defined as the harmonic mean of homogeneity (``h``) and completeness -(``c``) of the clustering: -```math -V_{\beta} = (1+\beta)\frac{h \cdot c}{\beta \cdot h + c}. -``` -Both ``h`` and ``c`` can be expressed in terms of the mutual information and -entropy measures from the information theory. Homogeneity (``h``) is maximized -when each cluster contains elements of as few different classes as possible. -Completeness (``c``) aims to put all elements of each class in single clusters. -The ``\beta`` parameter (``\beta > 0``) could used to control the weights of -``h`` and ``c`` in the final measure. If ``\beta > 1``, *completeness* has more -weight, and when ``\beta < 1`` it's *homogeneity*. - -```@docs -vmeasure -``` - -## Mutual information - -[Mutual information](https://en.wikipedia.org/wiki/Mutual_information) -quantifies the "amount of information" obtained about one random variable -through observing the other random variable. It is used in determining -the similarity of two different clusterings of a dataset. - -```@docs -mutualinfo -``` - -## Confusion matrix - -Pair [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix) -arising from two clusterings is a 2×2 contingency table representation of -the partition co-occurrence, see [`counts`](@ref). - -```@docs -confusion -``` ## Other packages * [ClusteringBenchmarks.jl](https://github.com/HolyLab/ClusteringBenchmarks.jl) provides - benchmark datasets and implements additional methods for evaluating clustering performance. \ No newline at end of file + benchmark datasets and implements additional methods for evaluating clustering performance. diff --git a/src/clustering_quality.jl b/src/clustering_quality.jl index 8acadc19..2d21859a 100644 --- a/src/clustering_quality.jl +++ b/src/clustering_quality.jl @@ -1,27 +1,30 @@ # main method for hard clustering indices + docs """ -For hard clustering: +For "hard" clustering: clustering_quality(data, centers, assignments; quality_index, [metric]) clustering_quality(data, clustering; quality_index, [metric]) -For fuzzy clustering: +For fuzzy ("soft") clustering: clustering_quality(data, centers, weights; quality_index, fuzziness, [metric]) clustering_quality(data, clustering; quality_index, fuzziness, [metric]) -For hard clustering without cluster centers known: +For "hard" clustering without specifying cluster centers: - clustering_quality(assignments, dist_matrix; quality_index) - clustering_quality(clustering, dist_matrix; quality_index) clustering_quality(data, assignments; quality_index, [metric]) clustering_quality(data, clustering; quality_index, [metric]) -Compute the clustering quality index for a given clustering. +For "hard" clustering without specifying data points and cluster centers: -Returns a real number which is the value of the chosen quality index type of the given clustering. 
+ clustering_quality(assignments, dist_matrix; quality_index) + clustering_quality(clustering, dist_matrix; quality_index) -# Arguments +Compute the *quality index* for a given clustering. + +Returns a quality index (real value). + +## Arguments - `data::AbstractMatrix`: ``d×n`` data matrix with each column representing one ``d``-dimensional data point - `centers::AbstractMatrix`: ``d×k`` matrix with cluster centers represented as columns - `assignments::AbstractVector{Int}`: ``n`` vector of point assignments (cluster indices) @@ -30,25 +33,28 @@ Returns a real number which is the value of the chosen quality index type of the - `quality_index::Symbol`: quality index to calculate; see below for the supported options - `dist_matrix::AbstractMatrix`: a ``n×n`` pairwise distance matrix; `dist_matrix[i,j]` is the distance between ``i``-th and ``j``-th points - # Keyword arguments - - `quality_index::Symbol`: quality index to calculate; see below for the supported options - - `fuzziness::Real`: clustering fuzziness > 1 + ## Keyword arguments + - `quality_index::Symbol`: clustering *quality index* to calculate; see below for the supported options + - `fuzziness::Real`: clustering *fuzziness* > 1 - `metric::SemiMetric=SqEuclidean()`: `SemiMetric` object that defines the metric/distance/similarity function -When calling `clustering_quality` one can give `centers`, `assignments` or `weights` arguments by hand or provide a single `clustering` argument from which the necessary data will be read automatically. +When calling `clustering_quality`, one can explicitly specify `centers`, `assignments`, and `weights`, +or provide `ClusteringResult` via `clustering`, from which the necessary data will be read automatically. -For clustering without known cluster centers the datapoints are not required, only `dist_matrix` is necessary. If given, `data` and `metric` will be used to calculate distance matrix instead. +For clustering without known cluster centers the `data` points are not required. +`dist_matrix` could be provided explicitly, otherwise it would be calculated from the `data` points +using the specified `metric`. -# Supported quality indices +## Supported quality indices -Symbols ↑/↓ are quality direction. -- `:calinski_harabasz`: hard or fuzzy Calinski-Harabsz index (↑) returns the corrected ratio of between cluster centers inertia and within-clusters inertia -- `:xie_beni`: hard or fuzzy Xie-Beni index (↓) returns ratio betwen inertia within clusters and minimal distance between cluster centers -- `:davies_bouldin`: Davies-Bouldin index (↓) returns average similarity between each cluster and its most similar one, averaged over all the clusters -- `:dunn`: Dunn index (↑) returns ratio between minimal distance between clusters and maximal cluster diameter; it does not make use of `centers` argument -- `:silhouettes`: average silhouette index (↑), for all silhouettes use [`silhouettes`](@ref) method instead; it does not make use of `centers` argument -Please refer to the [documentation](@ref clustering_quality) for the definitions and usage descriptions of the supported quality indices. 
+- `:calinski_harabasz`: hard or fuzzy Calinski-Harabsz index (↑), the corrected ratio of between cluster centers inertia and within-clusters inertia +- `:xie_beni`: hard or fuzzy Xie-Beni index (↓), the ratio betwen inertia within clusters and minimal distance between the cluster centers +- `:davies_bouldin`: Davies-Bouldin index (↓), the similarity between the cluster and the other most similar one, averaged over all clusters +- `:dunn`: Dunn index (↑), the ratio of the minimal distance between clusters and the maximal cluster diameter +- `:silhouettes`: the average silhouette index (↑), see [`silhouettes`](@ref) +The arrows ↑ or ↓ specify the direction of the incresing clustering quality. +Please refer to the [documentation](@ref clustering_quality) for more details on the clustering quality indices. """ function clustering_quality( data::AbstractMatrix{<:Real}, # d×n matrix diff --git a/src/confusion.jl b/src/confusion.jl index 89cc42d7..6556b814 100644 --- a/src/confusion.jl +++ b/src/confusion.jl @@ -19,6 +19,11 @@ true negatives is `C₂₂`: |:--:|:-:|:-:| |Positive|C₁₁|C₁₂| |Negative|C₂₁|C₂₂| + +## See also + +[`counts(a::ClusteringResult, a::ClusteringResult)`](@ref counts) for full *contingency matrix*. + """ function confusion(::Type{T}, a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer}) where T<:Union{Integer, AbstractFloat} cc = counts(a, b) diff --git a/src/counts.jl b/src/counts.jl index 1eef56fb..7e5a303c 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -29,6 +29,11 @@ from `b`. The clusterings could be specified either as [`ClusteringResult`](@ref) instances or as vectors of data point assignments. + +## See also + +[`confusion(a::ClusteringResult, a::ClusteringResult)`](@ref confusion) for 2×2 *confusion matrix*. + """ counts(a::ClusteringResult, b::ClusteringResult) = _counts(assignments(a), assignments(b)) From 40256b56205dc175b40d3081457d93d48623b2bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Fri, 19 Jan 2024 13:40:10 +0100 Subject: [PATCH 81/82] Update validate.md docs links correction --- docs/source/validate.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/validate.md b/docs/source/validate.md index 5b3640be..b05613e8 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -85,7 +85,7 @@ mutualinfo ## Clustering quality indices -[`clustering_quality()`][@ref clustering_quality] methods allow computing *intrinsic* clustering quality indices, +[`clustering_quality()`](@ref clustering_quality) methods allow computing *intrinsic* clustering quality indices, i.e. the metrics that depend only on the clustering itself and do not use the external knowledge. These metrics can be used to compare different clustering algorithms or choose the optimal number of clusters. @@ -180,7 +180,7 @@ Higher values indicate better separation of clusters w.r.t. point distances. silhouettes ``` -[`clustering_quality(..., quality_index=:silhouettes)`][@ref clustering_quality] +[`clustering_quality(..., quality_index=:silhouettes)`](@ref clustering_quality) provides mean silhouette metric for the datapoints. Higher values indicate better quality. 
## References From d50223aa11424d9d19d28ad4643491bac0de251b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=9Al=C4=99zak?= <128084860+jaksle@users.noreply.github.com> Date: Fri, 19 Jan 2024 13:51:42 +0100 Subject: [PATCH 82/82] correcting links in docs validate --- docs/source/validate.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/validate.md b/docs/source/validate.md index 5b3640be..b05613e8 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -85,7 +85,7 @@ mutualinfo ## Clustering quality indices -[`clustering_quality()`][@ref clustering_quality] methods allow computing *intrinsic* clustering quality indices, +[`clustering_quality()`](@ref clustering_quality) methods allow computing *intrinsic* clustering quality indices, i.e. the metrics that depend only on the clustering itself and do not use the external knowledge. These metrics can be used to compare different clustering algorithms or choose the optimal number of clusters. @@ -180,7 +180,7 @@ Higher values indicate better separation of clusters w.r.t. point distances. silhouettes ``` -[`clustering_quality(..., quality_index=:silhouettes)`][@ref clustering_quality] +[`clustering_quality(..., quality_index=:silhouettes)`](@ref clustering_quality) provides mean silhouette metric for the datapoints. Higher values indicate better quality. ## References
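As a closing illustration of the centre-free call forms added by this series, a minimal sketch using a precomputed distance matrix (illustrative only; the data are hypothetical, and it assumes the `clustering_quality`, `hclust` and `cutree` functions exported by Clustering.jl):

```julia
using Clustering, Distances

X = rand(2, 50)                          # hypothetical data, one column per point
D = pairwise(Euclidean(), X, dims = 2)   # n×n pairwise distance matrix

# hierarchical clustering yields no cluster centers, so only the
# distance-based indices (:dunn, :silhouettes) are applicable here
assignments = cutree(hclust(D, linkage = :average), k = 3)
dunn_val = clustering_quality(assignments, D; quality_index = :dunn)
silh_val = clustering_quality(assignments, D; quality_index = :silhouettes)
```

For clusterings with known centers (e.g. k-means or fuzzy C-means results), the center-based indices from the table in the documentation apply as well.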