CCMMR implements convex clustering using the minimization algorithm presented in the paper Convex Clustering through MM: An Efficient Algorithm to Perform Hierarchical Clustering by D.J.W. Touw, P.J.F. Groenen, and Y. Terada. For issues, please use GitHub Issues.
There is also a Python package available.
CCMMR has the following dependencies:

- r2r
- RANN
- Rcpp
- RcppEigen
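These dependencies are regular CRAN packages, so if they are not yet installed, one way to obtain them beforehand is with install.packages():

# Install the dependencies from CRAN (if they are not already installed)
install.packages(c("r2r", "RANN", "Rcpp", "RcppEigen"))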
To install CCMMR, clone the repository, open CCMMR.Rproj in RStudio, and click Install in the Build panel. Alternatively, use devtools to install the package from GitHub:
library(devtools)
install_github("djwtouw/CCMMR")
The following examples are also part of the package documentation. Start by loading the package.
library(CCMMR)
After loading the data, a sparse weight matrix is constructed based on the k = 5 nearest neighbors. This means that nonzero weights are computed only for pairs of objects that are k nearest neighbors of each other. By default, the weight matrix is constructed so that every observation is (in)directly connected to all other observations via nonzero weights, which ensures that the minimum number of clusters is one. To turn off this behavior, set connected = FALSE.
# Load data
data(two_half_moons)
data = as.matrix(two_half_moons)
X = data[, -3]
y = data[, 3]

# Get sparse weights in dictionary of keys format with k = 5 and phi = 8
W = sparse_weights(X, 5, 8.0)

# Set a sequence for lambda
lambdas = seq(0, 2400, 1)

# Compute clusterpath
res = convex_clusterpath(X, W, lambdas)

# Get cluster labels for two clusters
labels = clusters(res, 2)
# Plot the clusterpath with colors based on the cluster labels
plot(res, col = labels)
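The connected = FALSE option mentioned above can be illustrated with a small variation on this example. The sketch below assumes that connected is passed directly to sparse_weights(), as the description above suggests, and reuses X and lambdas from the example; without the connectedness guarantee, the clusterpath is not guaranteed to end in a single cluster.

# Sketch: weights based only on the k nearest neighbors, without forcing the
# weight graph to be connected (reuses X and lambdas from the example above)
W_knn = sparse_weights(X, 5, 8.0, connected = FALSE)

# The resulting clusterpath may terminate in more than one cluster
res_knn = convex_clusterpath(X, W_knn, lambdas)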
In the previous example, the choice of \(\lambda\) determined the number of clusters. However, it can be difficult to guess in advance which value of \(\lambda\) corresponds to a particular number of clusters. The following code looks for clusterings in a specified range. If no upper bound is specified, only a single clustering (with the number of clusters equal to target_low) is searched for.
# Load data
data(two_half_moons)
data = as.matrix(two_half_moons)
X = data[, -3]
y = data[, 3]

# Get sparse weights in dictionary of keys format with k = 5 and phi = 8
W = sparse_weights(X, 5, 8.0)

# Perform convex clustering with a target number of clusters
res1 = convex_clustering(X, W, target_low = 2, target_high = 5)
# Plot the clustering for 2 to 5 clusters
par(mfrow=c(2, 2))
plot(X, col = clusters(res1, 2), main = "2 clusters", pch = 19,
xlab = expression(X[1]), ylab = expression(X[2]))
plot(X, col = clusters(res1, 3), main = "3 clusters", pch = 19,
xlab = expression(X[1]), ylab = expression(X[2]))
plot(X, col = clusters(res1, 4), main = "4 clusters", pch = 19,
xlab = expression(X[1]), ylab = expression(X[2]))
plot(X, col = clusters(res1, 5), main = "5 clusters", pch = 19,
xlab = expression(X[1]), ylab = expression(X[2]))
# A more generalized approach to plotting the results of a range of clusters
res2 = convex_clustering(X, W, target_low = 2, target_high = 7)

# Plot the clusterings
k = length(res2$num_clusters)
par(mfrow=c(ceiling(k / ceiling(sqrt(k))), ceiling(sqrt(k))))

for (i in 1:k) {
    labels = clusters(res2, res2$num_clusters[k + 1 - i])
    c = length(unique(labels))

    plot(X, col = labels, main = paste(c, "clusters"), pch = 19,
         xlab = expression(X[1]), ylab = expression(X[2]))
}
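Depending on the data and the weights, it is possible that not every cluster count in the requested range lies on the clusterpath. The num_clusters element used in the loop above records which counts were actually found, so it can be inspected before requesting a specific solution from clusters(); the short check below is a sketch based on that element.

# Check which cluster counts were found within the requested range
print(res2$num_clusters)

# Labels for the smallest number of clusters that was found
labels_low = clusters(res2, min(res2$num_clusters))
table(labels_low)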
As an alternative to the clusterpath, convex clustering results can also be visualized using a dendrogram. In the following example, convex clustering is applied to a small generated data set, after which the as.hclust() function transforms the output into an hclust object. The standard plot() function can then be used to draw a dendrogram. Note that hclust objects require the clusterpath to terminate in a single cluster.
# Demonstration of converting a clusterpath into a dendrogram, first generate
# data
set.seed(6)
X = matrix(rnorm(14), ncol = 2)
y = rep(1, nrow(X))

# Get sparse distances in dictionary of keys format with k = 3
W = sparse_weights(X, 3, 4.0)

# Sequence for lambda
lambdas = seq(0, 45, 0.02)

# Compute results
res = convex_clusterpath(X, W, lambdas)

# Generate hclust object
hcl = as.hclust(res)
hcl$height = sqrt(hcl$height)
# Plot clusterpath (left) and dendrogram (right)
par(mfrow=c(1, 2))
plot(res, y, label = c(1:7))
plot(hcl, ylab = expression(sqrt(lambda)), xlab = NA, sub = NA, main = NA,
hang = -1)
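Because as.hclust() returns a standard hclust object, base R utilities such as cutree() also apply to it. The sketch below cuts the dendrogram from the example above into two groups; the resulting partition should correspond to clusters(res, 2), up to a relabeling of the cluster indices.

# Sketch: extract two clusters from the dendrogram with base R's cutree()
dendro_labels = cutree(hcl, k = 2)

# Compare with the labels obtained directly from the clusterpath; the two
# partitions should agree up to a permutation of the label values
table(dendro_labels, clusters(res, 2))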