10. Perform clustering of the summer season precipitation

using the method proposed in Bernard et al. (2013) and used in Bracken et al. (2015) – these papers are linked on the class page. Comment on the cluster patterns in comparison with those from problem 9.

#### Clear Memory and Set Options
rm(list=ls())
options(warn=-1)
save <- FALSE  # Change to TRUE if you want to save the plot

# Set working directory (Modify if necessary)
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(script_dir)

source("HW2_Library.R")
Read data
nrows=68
ncols=65

ntime <- 67   #Jun-Sep 1950 - 2016

nyrs <- ntime 

nglobe <- nrows*ncols
N <- nrows*ncols


### Lat - Long grid..
#ygrid=seq(6.5,38.5,by=0.25)
ygrid=seq(6.5,38.5,by=0.5)
ny=length(ygrid)

#xgrid=seq(66.5,100,by=0.25)
xgrid=seq(66.5,100,by=0.5)
nx=length(xgrid)

xygrid=matrix(0,nrow=nx*ny,ncol=2)

i=0
for(iy in 1:ny){
for(ix in 1:nx){
i=i+1
xygrid[i,1]=ygrid[iy]
xygrid[i,2]=xgrid[ix]
}

}

# Read India Gridded Rainfall data..

data=readBin("data/India-Rain-JJAS-05deg-1950-2016.r4",what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap")

#data=readBin("data.r4",what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap")

data <- array(data = data, dim=c( nrows, ncols, ntime ) )

data1=data[,,1]
    
# the lat -long data grid..
index=1:(nx*ny)

## pull only data and the locations with non-missing data
index1=index[data1 != "NaN"]    # only non-missing data.
xygrid1=xygrid[index1,]

nsites=length(index1)

data2=data1[index1]

### Rain data matrix - rows are seasonal (i.e. one value per year)
## and columns are locations
raindata=matrix(NA,nrow=nyrs, ncol=nsites)

for(i in 1:nyrs){
data1=data[,,i]
index1=index[ data1 != "NaN"]
data2=data1[index1]
raindata[i,]=data2
}

index = 1:dim(raindata)[2]

xx = apply(raindata,2,mean)
index2 = index1[xx > 0]
index3 = index[xx > 0]

xygrid1=xygrid[index2,]
rainavg = raindata[,index3]
indexgrid = index2
rm("data")  #remove the object data to clear up space

seasonal_avg <- colMeans(rainavg)

X1 <- cbind(xygrid1, seasonal_avg) colnames(X1) <- c(“Latitude”, “Longitude”, “SeasonalAvg”)

distance_data <- read.table(“data/Rajgrid-lon-lat-elev-dista-distb.txt”, header=FALSE) names(distance_data) <- c(“Longitude”, “Latitude”, “Elevation”, “Dist_ArabianSea”, “Dist_BayOfBengal”)

Merge the two datasets

X <- merge(X1, distance_data, by=c(“Longitude”, “Latitude”)) X <- X[, c(“Longitude”, “Latitude”, “Elevation”, “Dist_ArabianSea”, “Dist_BayOfBengal”, “SeasonalAvg”)] prec_obs <- X$SeasonalAvg # Extract precipitation column

Perform clustering of the Extremes using the method used in Bracken et al. (2015)
lat <- xygrid1[,1]
lon <- xygrid1[,2]

loc <- xygrid1[, c(2, 1)]

ks = 2:20
# Do the clustering for all k values
cluster_list = list()
for(k in ks){
  cluster_list[[which(ks == k)]] = pam_fmado_ll(rainavg,k,loc[,1:2])
}

# Find the average silhouette value (smaller is better)
sil = sapply(cluster_list,function(x)x$silinfo$avg.width)

# Plot the average silhouette value versus k, look for a maximum
g1 = ggplot() +
  geom_line(mapping = aes(ks,sil)) +
  geom_point(mapping = aes(ks,sil), shape = 21, size = 3, color = "gray30", fill ="cadetblue1") +
  labs(title = "Silhouette width",x = 'Number of clusters',y = 'Silhouette width')+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(g1)

# Find the optimal number of clusters
optimal_k = ks[which.max(sil)]
print(sprintf('The optimal number of clusters is %s', optimal_k))
## [1] "The optimal number of clusters is 3"
Display the clusters spatially
#clusters = cbind(loc, cluster = cluster_list[[which(ks==optimal_k)]]$clustering)
colnames(loc) <- c("lon", "lat")
clusters = data.frame(loc, Cluster = factor(cluster_list[[which(ks == optimal_k)]]$clustering))



# Plot clusters over India
n_clusters <- length(unique(clusters$Cluster))
cluster_colors <- rainbow(n_clusters)
cluster_numeric <- as.numeric(clusters$Cluster)



plot(clusters$lon, clusters$lat,
     col = cluster_colors[cluster_numeric],
     pch = 21, bg = cluster_colors[cluster_numeric],cex = 0.3,
     xlim = c(65, 100), ylim = c(5, 40),
     xlab = "Longitude", ylab = "Latitude",
     main = "Clustering of Seasonal Precipitation over India")
maps::map("world", add = TRUE, wrap = c(0, 360), lwd = 1.5)
grid()

cluster_means <- aggregate(clusters[, c("lon", "lat")], by = list(clusters$Cluster), FUN = mean)

points(cluster_means$lon, cluster_means$lat,
       pch = 2,                     # triangle point
       cex = 1,                   # size
       lwd = 2,                     # stroke thickness
       col = "black")              # outline color


text(cluster_means$lon, cluster_means$lat, labels = cluster_means$Group.1,
     pos = 3, cex = 0.8)

legend("topright", legend = paste("Cluster", 1:n_clusters),
       fill = cluster_colors[1:n_clusters], border = "black",
       title = "Cluster", bg = "white", box.lwd = 0.5)

Comment on the cluster patterns in comparison with those from problem 9.

This method grouped locations with similar seasonal precipitation behavior, not just elevation or geographic closeness. Cluster patterns were more consistent regionally.