## Cluster analysis page, with R-commands to doing a variety of cluster analysis
### strongly recommend you to go over this - you can get a lot of cool R commands
## to help for your research or HW..
http://stackoverflow.com/questions/15376075/cluster-analysis-in-r-determine-the-optimal-number-of-clusters

## Also commands in James' presentation - James - lecture 1

## another source on wikipedia
There is a lot of good information here http://en.wikipedia.org/wiki/K-medoids. 

### K-means cluster..

library(maps)
library(mapdata)

nk = 20  #number of clusters to consider
### x is the data matrix to cluster - columns are the attributes to cluster on
### rows are records/observations to cluster
### in your case it will be lat, long  and precipitation over India
### You might want to scale the data before clustering.
### e.g., x = scale(x)

#### GEt the Average summer rainfall over India

### Read in Rajeevan Data (summer JJAS average) 1951 - 2009 

nrows=35
ncols=33
ntime = 59    #JJAS 1951 - JJAS 2009
ntimep = 59   # JJAS 1951 - JJAS 2009
N = nrows*ncols


### create the Lat - Long grid for India

ygrid=seq(6.5,38.5,by=1)
ny=length(ygrid)

xgrid=seq(66.5,100.5,by=1)
nx=length(xgrid)

xygrid=matrix(0,nrow=nx*ny,ncol=2)

i=0
for(iy in 1:ny){
for(ix in 1:nx){
i=i+1
xygrid[i,1]=ygrid[iy]
xygrid[i,2]=xgrid[ix]
}

}

# REad Rajeevan seasonal average data..

data=readBin("Rajeevan-AISMR-JJAS1951-JJAS2009.r4",what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap")


data <- array(data = data, dim=c( nrows, ncols, ntime ) )


data1=data[,,1]
# Missing value is NaN, change it to a negative number.
data1[data1 == "NaN"]=-99999.
# the lat -long data grid..

index=1:(nx*ny)

index1=index[data1 >= 0]	# only non-missing data.
xygrid1=xygrid[index1,]
x1=xygrid1[,2]

nsites=length(index1)	# locations with data -i.e. global locations
data2=data1[index1]

### data matrix - rows are years and columns are locations on the globe with data
raindata=matrix(NA,nrow=ntimep, ncol=nsites)


for(i in 1:ntimep){
data1=data[,,i]
data1[data1 == "NaN"]=-99999.
index1=index[data1 >= 0]
data2=data1[index1]
raindata[i,]=data2
}

indexgrid = index1

rm("data")	#remove the object data to clear up space

## write out the grid locations if you need..
#write(t(xygrid1),file="rajeevan-aismr-locs.txt",ncol=2)

######### raindata contains the summer season average rainfall
### at each grid point covering India for the period 1951 - 2009, 59 years

## get the average rain at each grid point over all the years 
 avgrain = apply(raindata,2,mean)
 
 
 ############ seasonal maximum rainfall ######
 
 

 x = cbind(xygrid1[,2],xygrid1[,1],avgrain)   ## lon, lat and avg rain

 lon = xygrid1[,2]
 lat = xygrid1[,1]
#### cluster on x  == select best number from WSS

nk = 12    ## number of clusters to consider
wss=1:nk

wss <- (nrow(x)-1)*sum(apply(x,2,var))
for (i in 2:nk) wss[i] <- sum(kmeans(x,centers=i)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",
     ylab="Within groups sum of squares") 

## select the K corresponding to the the minimum WSS or WSS beyond with the 
### drop is small. Similar to the Eigen spectrum
## kbest = the best number of clusters.

### if x is lon, lat and precip, for example
zclust = kmeans(x,centers=kbest)
plot(lon,lat,col=c(zclust$cluster),xlab="",ylab="")
world(add=TRUE,shift=TRUE)


### Use BIC to get best number of clusters..
### example..

library(mclust)
# Run the function to see how many clusters
# it finds to be optimal, set it to search for
# at least 1 model and up 20.
d_clust <- Mclust(as.matrix(x), G=1:20)
m.best <- dim(d_clust$z)[2]
cat("model-based optimal number of clusters:", m.best, "\n")
# 4 clusters
plot(d_clust)

######

###  partition using mediods - similar to K-means but using the cluster median..
## example using ggplot2 - feel free to replace it with your data.

library(cluster)
library(ggplot2)

### replace the example data below with the precip data x, above
x <- rbind(cbind(rnorm(10,0,0.5), rnorm(10,0,0.5)),
           cbind(rnorm(15,5,0.5), rnorm(15,5,0.5)))

max_k <- 12
sil <- numeric(length(2:max_k))

for(i in 2:max_k) {
    p <- pam(x, i, stand=TRUE)
    sil[i-1] <- mean(silhouette(p))
}
qplot(2:max_k,sil,geom='line')+theme_bw()

Then once you know the optimal number of clusters:

k <- 2
clusters <- pam(x, k, stand=TRUE, cluster.only=T)
qplot(x[,1],x[,2],color=factor(clusters))+theme_bw()

############# El Nino and La Nina years

ensoindex = scan("http://iridl.ldeo.columbia.edu/expert/SOURCES/.Indices/.nino/.EXTENDED/.NINO4/T/%28Jun%201951%29/%28Dec%202009%29/RANGE/T/4/boxAverage/T/12/STEP/data.ch")

ensoindex = scale(ensoindex)

indexy = 1:59

elninoyears = indexy[ensoindex >= 0.75]
laninayears = indexy[ensoindex <= -0.75]

elninorain = avgrain[elninoyears,]


laninarain = avgrain[laninayears,]


### now cluster separately elninorain and laninarain
#### compare the spatial patterns with that of the avgrain