# A short clustering example, using k-means and partitioning around medoids
# (PAM) clustering.
#
# Some useful references:
#   https://en.wikipedia.org/wiki/K-means_clustering
#   https://en.wikipedia.org/wiki/K-medoids
#   https://www.cs.umb.edu/cs738/pam1.pdf

# Load libraries ----
library(cluster)
library(apcluster)
library(MASS)
library(cluster.datasets)
library(clValid)
library(clusterCrit)

# Load and prepare data ----
data(UScereal)
df <- UScereal
print(head(df))

# Drop columns 1, 9 and 11 (per the MASS documentation these are mfr, shelf
# and vitamins, i.e. not continuous measurements), then standardise the
# remaining columns so that no single variable dominates the distances.
ndf <- df[, c(-1, -9, -11)]
ndf <- scale(ndf)

# Number of clusters requested from both algorithms.
n_clusters <- 3

# Visualize data ----
# dev.new() opens a new window on the platform's default graphics device, so
# the script is not tied to macOS the way quartz() is.
dev.new()
parcoord(ndf)

# Cluster data using k-means ----
# k-means starts from randomly chosen centres; fix the seed so that the
# clustering (and therefore the comparison below) is reproducible.
set.seed(42)
c1km <- kmeans(ndf, n_clusters)
print(c1km$cluster)

# Cluster data using PAM ----
c2p <- pam(ndf, n_clusters)
print(c2p$cluster)

# Visualize both clusterings, colouring each observation by its cluster ----
dev.new()
parcoord(ndf, col = c1km$cluster)
dev.new()
parcoord(ndf, col = c2p$cluster)

# Validate the resulting clusters ----
# Three different internal evaluation metrics are generated:
#   C_index:           lower is better
#   Dunn:              higher is better
#   Calinski_Harabasz: higher is better
criteria <- c("C_index", "Calinski_Harabasz", "Dunn")

c1kmeval <- intCriteria(ndf, c1km$cluster, criteria)
print(c1kmeval)

c2peval <- intCriteria(ndf, c2p$cluster, criteria)
print(c2peval)

# Background reading on the criteria; only opened in an interactive session
# because it launches a document viewer.
if (interactive()) {
  vignette("clusterCrit")
}

# Set up labels; bestCriterion() returns an index into the vector of values,
# so the order here must match the order used when building those vectors.
cluster_methods <- c("kmeans", "pam")

# Compare the results for each of the three metrics ----
c_index_vals <- c(c1kmeval$c_index, c2peval$c_index)
best <- bestCriterion(c_index_vals, "C_index")
print(paste0(
  "According to C_index, the best cluster option is ",
  cluster_methods[best]
))

c_h_vals <- c(c1kmeval$calinski_harabasz, c2peval$calinski_harabasz)
best <- bestCriterion(c_h_vals, "Calinski_Harabasz")
print(paste0(
  "According to Calinski_Harabasz, the best cluster option is ",
  cluster_methods[best]
))

dunn_vals <- c(c1kmeval$dunn, c2peval$dunn)
best <- bestCriterion(dunn_vals, "Dunn")
print(paste0(
  "According to Dunn, the best cluster option is ",
  cluster_methods[best]
))