using the method proposed in Bernard et al. (2013) and used in Bracken et al. (2015) – these papers are linked on the class page. Comment on the cluster patterns in comparison with those from problem 9.
#### Clear Memory and Set Options
rm(list=ls())
options(warn=-1)
save <- FALSE # Change to TRUE if you want to save the plot
# Set working directory (Modify if necessary)
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(script_dir)
source("HW2_Library.R")
nrows=68
ncols=65
ntime <- 67 #Jun-Sep 1950 - 2016
nyrs <- ntime
nglobe <- nrows*ncols
N <- nrows*ncols
### Lat - Long grid..
#ygrid=seq(6.5,38.5,by=0.25)
ygrid=seq(6.5,38.5,by=0.5)
ny=length(ygrid)
#xgrid=seq(66.5,100,by=0.25)
xgrid=seq(66.5,100,by=0.5)
nx=length(xgrid)
xygrid=matrix(0,nrow=nx*ny,ncol=2)
i=0
for(iy in 1:ny){
for(ix in 1:nx){
i=i+1
xygrid[i,1]=ygrid[iy]
xygrid[i,2]=xgrid[ix]
}
}
# Read India Gridded Rainfall data..
data=readBin("data/India-Rain-JJAS-05deg-1950-2016.r4",what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap")
#data=readBin("data.r4",what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap")
data <- array(data = data, dim=c( nrows, ncols, ntime ) )
data1=data[,,1]
# the lat -long data grid..
index=1:(nx*ny)
## pull only data and the locations with non-missing data
index1=index[data1 != "NaN"] # only non-missing data.
xygrid1=xygrid[index1,]
nsites=length(index1)
data2=data1[index1]
### Rain data matrix - rows are seasonal (i.e. one value per year)
## and columns are locations
raindata=matrix(NA,nrow=nyrs, ncol=nsites)
for(i in 1:nyrs){
data1=data[,,i]
index1=index[ data1 != "NaN"]
data2=data1[index1]
raindata[i,]=data2
}
index = 1:dim(raindata)[2]
xx = apply(raindata,2,mean)
index2 = index1[xx > 0]
index3 = index[xx > 0]
xygrid1=xygrid[index2,]
rainavg = raindata[,index3]
indexgrid = index2
rm("data") #remove the object data to clear up space
seasonal_avg <- colMeans(rainavg)
X1 <- cbind(xygrid1, seasonal_avg) colnames(X1) <- c(“Latitude”, “Longitude”, “SeasonalAvg”)
distance_data <- read.table(“data/Rajgrid-lon-lat-elev-dista-distb.txt”, header=FALSE) names(distance_data) <- c(“Longitude”, “Latitude”, “Elevation”, “Dist_ArabianSea”, “Dist_BayOfBengal”)
X <- merge(X1, distance_data, by=c(“Longitude”, “Latitude”)) X <- X[, c(“Longitude”, “Latitude”, “Elevation”, “Dist_ArabianSea”, “Dist_BayOfBengal”, “SeasonalAvg”)] prec_obs <- X$SeasonalAvg # Extract precipitation column
lat <- xygrid1[,1]
lon <- xygrid1[,2]
loc <- xygrid1[, c(2, 1)]
ks = 2:20
# Do the clustering for all k values
cluster_list = list()
for(k in ks){
cluster_list[[which(ks == k)]] = pam_fmado_ll(rainavg,k,loc[,1:2])
}
# Find the average silhouette value (smaller is better)
sil = sapply(cluster_list,function(x)x$silinfo$avg.width)
# Plot the average silhouette value versus k, look for a maximum
g1 = ggplot() +
geom_line(mapping = aes(ks,sil)) +
geom_point(mapping = aes(ks,sil), shape = 21, size = 3, color = "gray30", fill ="cadetblue1") +
labs(title = "Silhouette width",x = 'Number of clusters',y = 'Silhouette width')+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(g1)
# Find the optimal number of clusters
optimal_k = ks[which.max(sil)]
print(sprintf('The optimal number of clusters is %s', optimal_k))
## [1] "The optimal number of clusters is 3"
#clusters = cbind(loc, cluster = cluster_list[[which(ks==optimal_k)]]$clustering)
colnames(loc) <- c("lon", "lat")
clusters = data.frame(loc, Cluster = factor(cluster_list[[which(ks == optimal_k)]]$clustering))
# Plot clusters over India
n_clusters <- length(unique(clusters$Cluster))
cluster_colors <- rainbow(n_clusters)
cluster_numeric <- as.numeric(clusters$Cluster)
plot(clusters$lon, clusters$lat,
col = cluster_colors[cluster_numeric],
pch = 21, bg = cluster_colors[cluster_numeric],cex = 0.3,
xlim = c(65, 100), ylim = c(5, 40),
xlab = "Longitude", ylab = "Latitude",
main = "Clustering of Seasonal Precipitation over India")
maps::map("world", add = TRUE, wrap = c(0, 360), lwd = 1.5)
grid()
cluster_means <- aggregate(clusters[, c("lon", "lat")], by = list(clusters$Cluster), FUN = mean)
points(cluster_means$lon, cluster_means$lat,
pch = 2, # triangle point
cex = 1, # size
lwd = 2, # stroke thickness
col = "black") # outline color
text(cluster_means$lon, cluster_means$lat, labels = cluster_means$Group.1,
pos = 3, cex = 0.8)
legend("topright", legend = paste("Cluster", 1:n_clusters),
fill = cluster_colors[1:n_clusters], border = "black",
title = "Cluster", bg = "white", box.lwd = 0.5)
Comment on the cluster patterns in comparison with those from problem 9.
This method grouped locations with similar seasonal precipitation behavior, not just elevation or geographic closeness. Cluster patterns were more consistent regionally.