## Cluster analysis page, with R-commands to doing a variety of cluster analysis ### strongly recommend you to go over this - you can get a lot of cool R commands ## to help for your research or HW.. http://stackoverflow.com/questions/15376075/cluster-analysis-in-r-determine-the-optimal-number-of-clusters ## Also commands in James' presentation - James - lecture 1 ## another source on wikipedia There is a lot of good information here http://en.wikipedia.org/wiki/K-medoids. ### K-means cluster.. library(maps) library(mapdata) nk = 20 #number of clusters to consider ### x is the data matrix to cluster - columns are the attributes to cluster on ### rows are records/observations to cluster ### in your case it will be lat, long and precipitation over India ### You might want to scale the data before clustering. ### e.g., x = scale(x) #### GEt the Average summer rainfall over India ### Read in Rajeevan Data (summer JJAS average) 1951 - 2009 nrows=35 ncols=33 ntime = 59 #JJAS 1951 - JJAS 2009 ntimep = 59 # JJAS 1951 - JJAS 2009 N = nrows*ncols ### create the Lat - Long grid for India ygrid=seq(6.5,38.5,by=1) ny=length(ygrid) xgrid=seq(66.5,100.5,by=1) nx=length(xgrid) xygrid=matrix(0,nrow=nx*ny,ncol=2) i=0 for(iy in 1:ny){ for(ix in 1:nx){ i=i+1 xygrid[i,1]=ygrid[iy] xygrid[i,2]=xgrid[ix] } } # REad Rajeevan seasonal average data.. data=readBin("Rajeevan-AISMR-JJAS1951-JJAS2009.r4",what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap") data <- array(data = data, dim=c( nrows, ncols, ntime ) ) data1=data[,,1] # Missing value is NaN, change it to a negative number. data1[data1 == "NaN"]=-99999. # the lat -long data grid.. index=1:(nx*ny) index1=index[data1 >= 0] # only non-missing data. xygrid1=xygrid[index1,] x1=xygrid1[,2] nsites=length(index1) # locations with data -i.e. global locations data2=data1[index1] ### data matrix - rows are years and columns are locations on the globe with data raindata=matrix(NA,nrow=ntimep, ncol=nsites) for(i in 1:ntimep){ data1=data[,,i] data1[data1 == "NaN"]=-99999. index1=index[data1 >= 0] data2=data1[index1] raindata[i,]=data2 } indexgrid = index1 rm("data") #remove the object data to clear up space ## write out the grid locations if you need.. #write(t(xygrid1),file="rajeevan-aismr-locs.txt",ncol=2) ######### raindata contains the summer season average rainfall ### at each grid point covering India for the period 1951 - 2009, 59 years ## get the average rain at each grid point over all the years avgrain = apply(raindata,2,mean) ############ seasonal maximum rainfall ###### x = cbind(xygrid1[,2],xygrid1[,1],avgrain) ## lon, lat and avg rain lon = xygrid1[,2] lat = xygrid1[,1] #### cluster on x == select best number from WSS nk = 12 ## number of clusters to consider wss=1:nk wss <- (nrow(x)-1)*sum(apply(x,2,var)) for (i in 2:nk) wss[i] <- sum(kmeans(x,centers=i)$withinss) plot(1:15, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares") ## select the K corresponding to the the minimum WSS or WSS beyond with the ### drop is small. Similar to the Eigen spectrum ## kbest = the best number of clusters. ### if x is lon, lat and precip, for example zclust = kmeans(x,centers=kbest) plot(lon,lat,col=c(zclust$cluster),xlab="",ylab="") world(add=TRUE,shift=TRUE) ### Use BIC to get best number of clusters.. ### example.. library(mclust) # Run the function to see how many clusters # it finds to be optimal, set it to search for # at least 1 model and up 20. d_clust <- Mclust(as.matrix(x), G=1:20) m.best <- dim(d_clust$z)[2] cat("model-based optimal number of clusters:", m.best, "\n") # 4 clusters plot(d_clust) ###### ### partition using mediods - similar to K-means but using the cluster median.. ## example using ggplot2 - feel free to replace it with your data. library(cluster) library(ggplot2) ### replace the example data below with the precip data x, above x <- rbind(cbind(rnorm(10,0,0.5), rnorm(10,0,0.5)), cbind(rnorm(15,5,0.5), rnorm(15,5,0.5))) max_k <- 12 sil <- numeric(length(2:max_k)) for(i in 2:max_k) { p <- pam(x, i, stand=TRUE) sil[i-1] <- mean(silhouette(p)) } qplot(2:max_k,sil,geom='line')+theme_bw() Then once you know the optimal number of clusters: k <- 2 clusters <- pam(x, k, stand=TRUE, cluster.only=T) qplot(x[,1],x[,2],color=factor(clusters))+theme_bw() ############# El Nino and La Nina years ensoindex = scan("http://iridl.ldeo.columbia.edu/expert/SOURCES/.Indices/.nino/.EXTENDED/.NINO4/T/%28Jun%201951%29/%28Dec%202009%29/RANGE/T/4/boxAverage/T/12/STEP/data.ch") ensoindex = scale(ensoindex) indexy = 1:59 elninoyears = indexy[ensoindex >= 0.75] laninayears = indexy[ensoindex <= -0.75] elninorain = avgrain[elninoyears,] laninarain = avgrain[laninayears,] ### now cluster separately elninorain and laninarain #### compare the spatial patterns with that of the avgrain