#A short clustering example, using k-means and partitioning around medoids (PAM) clustering
#Some useful references:
#https://en.wikipedia.org/wiki/K-means_clustering
#https://en.wikipedia.org/wiki/K-medoids
#https://www.cs.umb.edu/cs738/pam1.pdf

#load libraries
library(cluster)
library(apcluster)
library(MASS)
library(cluster.datasets)
library(clValid)
library(clusterCrit)

#Load the UScereal data set and keep a working copy under a shorter name.
data(UScereal)
df <- UScereal
head(df)

#Drop columns 1, 9 and 11 by position and standardise what is left in one step
#(scale() centres each column and divides it by its standard deviation).
#NOTE(review): per the MASS documentation these positions should be mfr, shelf
#and vitamins - confirm with names(df).
ndf <- scale(df[, -c(1, 9, 11)])

#Open a new graphics window. dev.new() opens the default graphics device of the
#current platform, so the script is not tied to macOS (quartz() is only
#available there and stops the script elsewhere).
dev.new()

#visualize data: parallel coordinates plot with one line per row of ndf
parcoord(ndf)

#kmeans() chooses its starting centres at random. Fixing the seed makes the
#partition - and every comparison built on it further down - reproducible.
set.seed(42)

#cluster data using kmeans (3 clusters). nstart repeats the algorithm from
#several random starts and keeps the best solution, which guards against a
#poor local optimum from a single unlucky initialisation.
c1km <- kmeans(ndf, centers = 3, nstart = 25)
c1km$cluster

#cluster data using PAM (3 clusters)
c2p <- pam(ndf, k = 3)
#pam objects store the assignment vector in $clustering; the name is spelled
#out in full rather than relying on partial matching of `$`.
c2p$clustering

#visualize both partitions, each in its own graphics window, colouring every
#line of the parallel coordinates plot by its cluster number.
#dev.new() is used instead of quartz() so this also runs outside macOS.
dev.new()
parcoord(ndf, col = c1km$cluster)
dev.new()
#$clustering is the full name of the pam assignment vector (no partial matching)
parcoord(ndf, col = c2p$clustering)

#validate the resulting clusters by generating three different evaluation metrics

#C_index: Lower is better
#Dunn: Higher is better
#Calinski_Harabasz: Higher is better

#internal validation indices for the k-means partition
c1kmeval <- intCriteria(
  ndf, c1km$cluster,
  c("C_index", "Calinski_Harabasz", "Dunn")
)
c1kmeval

#internal validation indices for the PAM partition
#($clustering is the full name of the pam assignment vector)
c2peval <- intCriteria(
  ndf, c2p$clustering,
  c("C_index", "Calinski_Harabasz", "Dunn")
)
#show the PAM results (not the k-means results a second time)
c2peval

#open the clusterCrit vignette, which documents the available criteria
vignette("clusterCrit")

#set up labels; their order must match the order in which the values of the
#two methods are combined below (kmeans first, pam second)
cluster_methods <- c("kmeans", "pam")

#compare the results for each of the three metrics.
#bestCriterion() returns the position of the best value for the named
#criterion, which is then used to look up the method label.

#C_index
c_index_vals <- c(c1kmeval$c_index, c2peval$c_index)
best <- bestCriterion(c_index_vals, "C_index")
print(paste0("According to C_index, the best cluster option is ", cluster_methods[best]))

#Calinski_Harabasz
c_h_vals <- c(c1kmeval$calinski_harabasz, c2peval$calinski_harabasz)
best <- bestCriterion(c_h_vals, "Calinski_Harabasz")
print(paste0("According to Calinski_Harabasz, the best cluster option is ", cluster_methods[best]))

#Dunn
dunn_vals <- c(c1kmeval$dunn, c2peval$dunn)
best <- bestCriterion(dunn_vals, "Dunn")
print(paste0("According to Dunn, the best cluster option is ", cluster_methods[best]))