--- title: "hw2_prob10" author: "Haiying Peng" date: "2025-04-04" output: html_document: default --- #### 10. Perform clustering of the summer season precipitation using the method proposed in Bernard et al. (2013) and used in Bracken et al. (2015) – these papers are linked on the class page. Comment on the cluster patterns in comparison with those from problem 9. ```{r,message=FALSE,warning=FALSE} #### Clear Memory and Set Options rm(list=ls()) options(warn=-1) save <- FALSE # Change to TRUE if you want to save the plot # Set working directory (Modify if necessary) script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path) setwd(script_dir) source("HW2_Library.R") ``` ##### Read data ```{r} nrows=68 ncols=65 ntime <- 67 #Jun-Sep 1950 - 2016 nyrs <- ntime nglobe <- nrows*ncols N <- nrows*ncols ### Lat - Long grid.. #ygrid=seq(6.5,38.5,by=0.25) ygrid=seq(6.5,38.5,by=0.5) ny=length(ygrid) #xgrid=seq(66.5,100,by=0.25) xgrid=seq(66.5,100,by=0.5) nx=length(xgrid) xygrid=matrix(0,nrow=nx*ny,ncol=2) i=0 for(iy in 1:ny){ for(ix in 1:nx){ i=i+1 xygrid[i,1]=ygrid[iy] xygrid[i,2]=xgrid[ix] } } # Read India Gridded Rainfall data.. data=readBin("data/India-Rain-JJAS-05deg-1950-2016.r4",what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap") #data=readBin("data.r4",what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap") data <- array(data = data, dim=c( nrows, ncols, ntime ) ) data1=data[,,1] # the lat -long data grid.. index=1:(nx*ny) ## pull only data and the locations with non-missing data index1=index[data1 != "NaN"] # only non-missing data. xygrid1=xygrid[index1,] nsites=length(index1) data2=data1[index1] ### Rain data matrix - rows are seasonal (i.e. one value per year) ## and columns are locations raindata=matrix(NA,nrow=nyrs, ncol=nsites) for(i in 1:nyrs){ data1=data[,,i] index1=index[ data1 != "NaN"] data2=data1[index1] raindata[i,]=data2 } index = 1:dim(raindata)[2] xx = apply(raindata,2,mean) index2 = index1[xx > 0] index3 = index[xx > 0] xygrid1=xygrid[index2,] rainavg = raindata[,index3] indexgrid = index2 rm("data") #remove the object data to clear up space ``` seasonal_avg <- colMeans(rainavg) X1 <- cbind(xygrid1, seasonal_avg) colnames(X1) <- c("Latitude", "Longitude", "SeasonalAvg") distance_data <- read.table("data/Rajgrid-lon-lat-elev-dista-distb.txt", header=FALSE) names(distance_data) <- c("Longitude", "Latitude", "Elevation", "Dist_ArabianSea", "Dist_BayOfBengal") #### Merge the two datasets X <- merge(X1, distance_data, by=c("Longitude", "Latitude")) X <- X[, c("Longitude", "Latitude", "Elevation", "Dist_ArabianSea", "Dist_BayOfBengal", "SeasonalAvg")] prec_obs <- X$SeasonalAvg # Extract precipitation column ##### Perform clustering of the Extremes using the method used in Bracken et al. (2015) ```{r} lat <- xygrid1[,1] lon <- xygrid1[,2] loc <- xygrid1[, c(2, 1)] ks = 2:20 # Do the clustering for all k values cluster_list = list() for(k in ks){ cluster_list[[which(ks == k)]] = pam_fmado_ll(rainavg,k,loc[,1:2]) } # Find the average silhouette value (smaller is better) sil = sapply(cluster_list,function(x)x$silinfo$avg.width) # Plot the average silhouette value versus k, look for a maximum g1 = ggplot() + geom_line(mapping = aes(ks,sil)) + geom_point(mapping = aes(ks,sil), shape = 21, size = 3, color = "gray30", fill ="cadetblue1") + labs(title = "Silhouette width",x = 'Number of clusters',y = 'Silhouette width')+ theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5)) show(g1) # Find the optimal number of clusters optimal_k = ks[which.max(sil)] print(sprintf('The optimal number of clusters is %s', optimal_k)) ``` ##### Display the clusters spatially ```{r} #clusters = cbind(loc, cluster = cluster_list[[which(ks==optimal_k)]]$clustering) colnames(loc) <- c("lon", "lat") clusters = data.frame(loc, Cluster = factor(cluster_list[[which(ks == optimal_k)]]$clustering)) # Plot clusters over India n_clusters <- length(unique(clusters$Cluster)) cluster_colors <- rainbow(n_clusters) cluster_numeric <- as.numeric(clusters$Cluster) plot(clusters$lon, clusters$lat, col = cluster_colors[cluster_numeric], pch = 21, bg = cluster_colors[cluster_numeric],cex = 0.3, xlim = c(65, 100), ylim = c(5, 40), xlab = "Longitude", ylab = "Latitude", main = "Clustering of Seasonal Precipitation over India") maps::map("world", add = TRUE, wrap = c(0, 360), lwd = 1.5) grid() cluster_means <- aggregate(clusters[, c("lon", "lat")], by = list(clusters$Cluster), FUN = mean) points(cluster_means$lon, cluster_means$lat, pch = 2, # triangle point cex = 1, # size lwd = 2, # stroke thickness col = "black") # outline color text(cluster_means$lon, cluster_means$lat, labels = cluster_means$Group.1, pos = 3, cex = 0.8) legend("topright", legend = paste("Cluster", 1:n_clusters), fill = cluster_colors[1:n_clusters], border = "black", title = "Cluster", bg = "white", box.lwd = 0.5) ``` #### Comment on the cluster patterns in comparison with those from problem 9. This method grouped locations with similar seasonal precipitation behavior, not just elevation or geographic closeness. Cluster patterns were more consistent regionally.