## Cluster analysis page, with R commands for doing a variety of cluster
## analyses. Strongly recommended reading - it has a lot of useful R commands
## to help with your research or HW:
## http://stackoverflow.com/questions/15376075/cluster-analysis-in-r-determine-the-optimal-number-of-clusters
## Also see the commands in today's presentation - James - lecture 1.
## Another source with a lot of good information is Wikipedia:
## http://en.wikipedia.org/wiki/K-medoids

### K-means clustering ----
library(maps)
library(mapdata)

nk <- 20 # number of clusters to consider

### x is the data matrix to cluster: columns are the attributes to cluster on,
### rows are the records/observations to cluster. x is NOT created in this
### section - define it before running the code below.

# Within-groups sum of squares (WSS) for k = 1, ..., nk.
wss <- numeric(nk)
# k = 1: a single cluster, so WSS is the total sum of squares about the
# column means, i.e. (n - 1) times the summed column variances.
wss[1] <- (nrow(x) - 1) * sum(apply(x, 2, var))
for (i in seq_len(nk)[-1]) {
  wss[i] <- sum(kmeans(x, centers = i)$withinss)
}

# Plot against 1..nk so the x-axis always has the same length as wss
# (a hard-coded 1:15 fails with "'x' and 'y' lengths differ" when nk = 20).
plot(seq_len(nk), wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares"
)

## Select the K at the minimum WSS, or the K beyond which the drop in WSS is
## small (the "elbow") - similar to reading an eigenvalue spectrum.
## kbest = the best number of clusters; set it by hand from the plot above
## before running the lines below.

### If x is lon, lat and precip, for example (lon and lat must already exist
### as vectors with one entry per row of x):
zclust <- kmeans(x, centers = kbest)
plot(lon, lat, col = c(zclust$cluster), xlab = "", ylab = "")
# Overlay the US state boundaries. The original called US(), which neither
# maps nor mapdata defines (it appears to be the 'fields' package wrapper
# around map("state", ...)), so call map() from the loaded maps package.
map("state", add = TRUE, col = "grey", lwd = 2, xlim = range(-125, -100))

### Use BIC to get the best number of clusters ----
### Example:
library(mclust)

# Run Mclust to see how many clusters it finds to be optimal; G = 1:20 makes
# it search over models with 1 up to 20 mixture components.
d_clust <- Mclust(as.matrix(x), G = 1:20)
# z has one column per component of the selected model, so its column count
# is the chosen number of clusters.
m.best <- dim(d_clust$z)[2]
cat("model-based optimal number of clusters:", m.best, "\n")
plot(d_clust)

######
### Partitioning around medoids (PAM) - similar to K-means, but each cluster is
### represented by a medoid (one of the actual observations) rather than by
### the cluster mean.
## Example using ggplot2 - feel free to replace it with your data.
library(cluster)
library(ggplot2)

# Synthetic example data: 10 points scattered around (0, 0) and 15 points
# scattered around (5, 5). Replace with your own numeric matrix.
x <- rbind(
  cbind(rnorm(10, 0, 0.5), rnorm(10, 0, 0.5)),
  cbind(rnorm(15, 5, 0.5), rnorm(15, 5, 0.5))
)

# Average silhouette width for each candidate number of clusters 2..max_k.
max_k <- 5
sil <- numeric(length(2:max_k))
for (i in 2:max_k) {
  p <- pam(x, i, stand = TRUE)
  # silhouette() returns a matrix with columns cluster, neighbor and
  # sil_width; average only sil_width. Taking mean() of the whole object
  # would mix the integer cluster/neighbor ids into the score.
  sil[i - 1] <- mean(silhouette(p)[, "sil_width"])
}

# Larger average silhouette width indicates a better-separated clustering.
# ggplot() is used instead of qplot(), which is deprecated in ggplot2 >= 3.4.0.
ggplot(data.frame(k = 2:max_k, sil = sil), aes(x = k, y = sil)) +
  geom_line() +
  theme_bw()

# Then, once you know the optimal number of clusters:
k <- 2
clusters <- pam(x, k, stand = TRUE, cluster.only = TRUE)
ggplot(
  data.frame(x1 = x[, 1], x2 = x[, 2], cluster = factor(clusters)),
  aes(x = x1, y = x2, color = cluster)
) +
  geom_point() +
  theme_bw()
##############