Catalina Jerez
catalina.jerez@colorado.edu
Cluster the seasonal average precipitation using latitude, longitude and elevation (over average value at each location) The clustering involves:
Identify the best number of clusters (Kbest).
Select a desired number of clusters, j;
Cluster the data and computer the WSS (within-cluster sum of squares);
Repeat for j=1,10;
Plot j versus WSS; and
Select the best number of clusters Kbest, where the WSS starts to saturate.
Cluster the data into Kbest clusters and display them spatially.
Employ any other clustering method – K-medoid clustering and/or Hierarchical Clustering.
Cluster the same using latitude, longitude, elevation, distance to Bay of Bengal and distance to Arabian Sea.
Perform clustering of the summer season precipitation using the method proposed in Bernard et al. (2013) and used in Bracken et al. (2015).
Uses F-madogram, a distance metric tailored to extreme events rather than mean values.
Includes spatial weighting (distance between stations).
Final clustering via Partitioning Around Medoids (PAM).
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 543249 29.1 1201998 64.2 NA 700282 37.4
## Vcells 1004958 7.7 8388608 64.0 36864 1963597 15.0
rm(list = ls())
library(cluster)
library(factoextra)
# Load necessary libraries
spatialAnalysis.Lib = c("maps", "rnaturalearth", "sf", "fields")
statisAnalysis.Lib = c("caret", "DescTools")
dataManipulation.Lib = c("dplyr", "reshape2", "tidyr")
dataVisualization.Lib = c("ggplot2", "ggrepel", "ggpubr", "RColorBrewer",
"scales")
modeling.Lib = c("hydroGOF")
list.packages = unique(c(spatialAnalysis.Lib, statisAnalysis.Lib,
modeling.Lib,
dataManipulation.Lib, dataVisualization.Lib))
# Load all required packages
sapply(list.packages, require, character.only = TRUE)
## maps rnaturalearth sf fields caret
## TRUE TRUE TRUE TRUE TRUE
## DescTools hydroGOF dplyr reshape2 tidyr
## TRUE TRUE TRUE TRUE TRUE
## ggplot2 ggrepel ggpubr RColorBrewer scales
## TRUE TRUE TRUE TRUE TRUE
# interpolate function -------------------------------------------------------------
interp.fields = function(varname, topo, coords) {
surface = list(
x = topo$lon,
y = topo$lat,
z = topo[[varname]]
)
interp.surface(surface, loc = coords)
}
# Cluster function -------------------------------------------------------------
cluster.kbest = function(clust.data, vars, max.k = 10){
# prepare the data
clust.scale = clust.data %>% dplyr::select(all_of(vars)) %>% scale()
# find best number of clusters, Kbest ----------------------------------------
# we compute WSS for k = 1 to max.k
wss.kbest = sapply(1:max.k, function(k) {
kmeans(clust.scale, centers = k, nstart = 50)$tot.withinss
})
# calculate WSS reduction and find the elbow
wss.df = data.frame(k = 1:max.k, wss = wss.kbest)
wss.df$wss.diff = c(NA, diff(wss.df$wss))
wss.df$wss.drop = c(NA, -diff(wss.df$wss))
wss.df$drop.ratio = c(NA, diff(wss.df$wss) / wss.df$wss[-max.k])
Kbest = which.min(diff(diff(wss.kbest))) + 1
# WSS vs k
wss.plot =
ggplot(wss.df, aes(x = k, y = wss)) +
# k best
geom_vline(xintercept = Kbest, linetype = "dashed", linewidth = 1.1,
color = "#41b6c4") +
# data
geom_line(color = "#67000d") +
geom_point(shape = 21, size = 2.5, color = "#67000d", fill = "#fc9272") +
annotate("text", x = Kbest, y = max(wss.kbest), label = paste("Kbest =", Kbest), hjust = -0.2) +
# scales
scale_x_continuous(breaks = pretty_breaks(n = 4)) +
# labs
labs(title = "Elbow method for optimal k",
x = "Number of clusters (k)",
y = "Within-Cluster Sum of Squares") +
# theme
customize.bw
# K-Means Clustering ---------------------------------------------------------
kmeans.kbest = kmeans(clust.scale, centers = Kbest, nstart = 1)
clust.data$kmeans = as.factor(kmeans.kbest$cluster)
# K-medoids (with Partitioning Around Medoids) -------------------------------
pam.kbest = pam(clust.scale, k = Kbest)
clust.data$kmedois = as.factor(pam.kbest$cluster)
# Hierarquical Clustering ----------------------------------------------------
hc.kbest = hclust(dist(clust.scale), method = "ward.D2")
clust.data$hc = as.factor(cutree(hc.kbest, k = Kbest))
# Spatial Clust --------------------------------------------------------------
kmeans.plot = plot.spatialClust(clust.data, "kmeans")
kmedois.plot = plot.spatialClust(clust.data, "kmedois")
hc.plot = plot.spatialClust(clust.data, "hc")
return(list(clust.data = clust.data,
Kbest = Kbest,
wss.plot = wss.plot,
kmeans.plot = kmeans.plot,
kmedois.plot = kmedois.plot,
hc.plot = hc.plot))
}
# Extreme cluster function -----------------------------------------------------
# proposed in Bernard et al. (2013) and used in Bracken et al. (2015)
pam_fmado_ll = function (x, k, ll) {
# x - Design matrix, typically colums are stations, rows are time
# k - Number of clusters
# ll - two column matrix of lat long points (or preferably projected) with N rows
ndata = ncol(x) # number of stations
ntime = nrow(x) # number of time points
# compute the F-madogram distance
vF = array(NaN, dim = c(ntime, ndata))
for (p in 1:ndata) {
x.vec = as.vector(x[, p])
Femp = ecdf(x.vec)(x.vec)
vF[, p] = Femp
}
DD = dist(t(vF), method = "manhattan", diag = TRUE, upper = TRUE)/(2 * ntime)
# weight by physical distance
DDll = dist(ll, method = 'manhattan')
DDw = as.matrix(DD) + t(t(as.matrix(DDll))/apply(as.matrix(DDll),2,max))*max(as.matrix(DD))
# do the clustering
output = pam(DDw, k, diss = TRUE, medoids = NULL)
return(output)
}
# functions for graphs ----------------------------------------------------------
# constants
text.size = 15
customize.bw = theme_bw(base_family = "Times") +
theme(
axis.text.x = element_text(size = text.size, vjust = 0.5),
axis.text.y = element_text(size = text.size),
axis.title = element_text(size = text.size, face = "bold"),
plot.title = element_text(size = text.size, face = "bold"),
plot.subtitle = element_text(size = text.size-1, face = "italic"),
legend.position = "bottom",
legend.title = element_text(size = text.size, face = "bold"),
legend.text = element_text(size = text.size),
panel.border = element_rect(color = "#000000", fill = NA, linewidth = 0.5),
panel.grid.minor.x = element_line(color = "#d9d9d9", linetype = "dotted"),
panel.grid.minor.y = element_line(color = "#d9d9d9", linetype = "dotted"),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank()
)
# world map
world.sf = ne_countries(scale = "medium", returnclass = "sf") %>%
st_wrap_dateline(options = c("WRAPDATELINE=YES", "DATELINEOFFSET=180")) %>%
st_transform("+proj=longlat +datum=WGS84")
# considering max.k = 10 cluster. update if required more clusters.
scale.color = c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99",
"#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a")
# plot spatial cluster
plot.spatialClust = function(data, clust){
# data = clust.data
# clust = "kmeans"
# filter data ----------------------------------------------------------------
data = data.frame(
lon = data$lon,
lat = data$lat,
clust = data[[clust]] )
# calculate centroids and counts ---------------------------------------------
centroids = data %>%
dplyr::group_by(clust) %>%
dplyr::summarise(
lon = mean(lon, na.rm = TRUE),
lat = mean(lat, na.rm = TRUE),
count = n(),
.groups = "drop"
)
# plot ----------------------------------------------------------------------
ggplot(data, aes(x = lon, y = lat, color = clust)) +
# cluster
geom_point(size = 2) +
geom_point(data = centroids, aes(x = lon, y = lat, fill = clust), color = "#000",
shape = 21, size = 11, stroke = 1, alpha = .8, inherit.aes = FALSE) +
geom_text(data = centroids, aes(x = lon, y = lat, label = count),
color = "#000", size = 4.5, fontface = "bold", vjust = .5) +
# regions
annotate("rect", xmin = 66, xmax = 100, ymin = 6, ymax = 39,
color = "#41ab5d", fill = NA, linewidth = 1) +
geom_sf(data = world.sf, fill = NA, color = "black",
linewidth = 0.2, inherit.aes = FALSE) +
# scales
coord_sf(xlim = c(65, 101), ylim = c(5, 40), expand = FALSE, crs = "EPSG:4326") +
scale_color_manual("Cluster, K", values = scale.color) +
scale_fill_manual("Cluster, K" , values = scale.color) +
guides(color = guide_legend(nrow = 1, override.aes = list(size = 4))) +
scale_y_continuous(breaks = seq(10, 40 , 10)) +
scale_x_continuous(breaks = seq(60, 100, 10)) +
labs(title = "Clustering of Seasonal Rainfall",
subtitle = clust, y = "Latitude", x= "Longitude") +
# theme
customize.bw
}
Rainfall data for JJAS from 1950 to 2016 is read and reshaped into a matrix where each row represents one season (year) and each column a spatial location. Only valid (non-missing) data are retained.
path.data = "https://civil.colorado.edu/~balajir/CVEN6833/HWs/HW-2/Spring-2025-data-commands/"
# grid
nrows = 68
ncols = 65
ntime = 67
# read India location valid points
rain.locs = read.table(paste0(path.data, "India-rain-locs.txt"))
colnames(rain.locs) = c("lat", "lon", "index")
# read data binary
data.raw = readBin(paste0(path.data, "India-Rain-JJAS-05deg-1950-2016.r4"),
what = "numeric",
n = (nrows * ncols * ntime),
size = 4,
endian = "swap")
# reshape to (ny, nx, ntime)
data.array = array(data.raw, dim = c(nrows, ncols, ntime))
rain.indices = rain.locs[, 3]
nsites = length(rain.indices)
# construct rain.avg (ntime rows × valid rain sites columns)
rain.avg = matrix(NA, nrow = ntime, ncol = nsites)
for (t in 1:ntime) {
rain.slice = data.array[, , t]
rain.avg[t, ] = rain.slice[rain.indices]
}
nsites = length(rain.indices)
rain.mean.temp = rowMeans(rain.avg, na.rm = TRUE)
rain.mean.site = colMeans(rain.avg, na.rm = TRUE)
raj.topo = read.table(paste0(path.data, "Rajgrid-lon-lat-elev-dista-distb.txt"))
colnames(raj.topo) = c("lon", "lat", "elev", "distArabian", "distBengal")
# dim(rain.avg)
# dim(raj.topo)
rain.coords = rain.locs[, c("lon", "lat")]
rain.elev = interp.fields("elev", topo = raj.topo, coords = rain.coords)
rain.distArabian = interp.fields("distArabian", topo = raj.topo, coords = rain.coords)
rain.distBengal = interp.fields("distBengal" , topo = raj.topo, coords = rain.coords)
# final data frame for clustering
clust.df = data.frame(
lon = rain.coords$lon,
lat = rain.coords$lat,
elev = rain.elev,
distArabian = rain.distArabian,
distBengal = rain.distBengal,
precip = rain.mean.site) %>%
drop_na()
# Check for NAs
summary(clust.df)
## lon lat elev distArabian
## Min. :68.50 Min. : 8.50 Min. : 4.0 Min. : 1.000
## 1st Qu.:75.50 1st Qu.:20.00 1st Qu.: 86.0 1st Qu.: 5.481
## Median :78.00 Median :24.00 Median : 239.5 Median : 9.525
## Mean :79.46 Mean :23.59 Mean : 466.6 Mean : 9.902
## 3rd Qu.:82.50 3rd Qu.:27.50 3rd Qu.: 486.3 3rd Qu.:13.944
## Max. :97.00 Max. :36.50 Max. :5785.0 Max. :24.964
## distBengal precip
## Min. : 0.000 Min. : 60.14
## 1st Qu.: 2.927 1st Qu.: 424.15
## Median : 5.759 Median : 777.63
## Mean : 6.438 Mean : 853.71
## 3rd Qu.: 9.604 3rd Qu.:1084.32
## Max. :18.358 Max. :4719.14
cluster.topo = cluster.kbest(clust.data = clust.df,
vars = c("lat", "lon", "elev", "precip"),
max.k = 10)
cluster.topo$wss.plot
* The WSS curve flattens at 8 clusters, showing minimal gain beyond
that.
cluster.topo$kmeans.plot
cluster.topo$kmedois.plot
cluster.topo$hc.plot
KMeans clusters are relatively well-distributed across India’s geography and precipitation zones.
KMedoids handles outliers better than KMeans, and produces more geographically coherent clusters compared to KMeans.
Hierarchical Clustering shows clear regional patterns, but sensitive to outliers and tends to form some small-sized clusters.
cluster.dist = cluster.kbest(clust.data = clust.df,
vars = c("lat", "lon", "elev", "precip",
"distArabian", "distBengal"),
max.k = 10)
cluster.dist$wss.plot
cluster.dist$kmeans.plot
cluster.dist$kmedois.plot
cluster.dist$hc.plot
Added distance features increased cluster sharpness, especially in coastal vs inland contrast.
KMeans and KMedoids both reflect rainfall influence from proximity to oceans.
ks = 2:10
cluster.ext = list()
for(k in ks){
cluster.ext[[which(ks == k)]] = pam_fmado_ll(x = rain.avg,
k = k,
ll = rain.locs[, c("lon", "lat")])
}
# Find the average silhouette value (smaller is better)
sil.avg = sapply(cluster.ext, function(x)x$silinfo$avg.width)
optimal.k = ks[which.max(sil.avg)]
# Plot the average silhouette value versus k, look for a maximum
ggplot() +
# k best
geom_vline(xintercept = optimal.k, linetype = "dashed", linewidth = 1.1,
color = "#41b6c4") +
annotate("text", x = optimal.k, y = max(sil.avg),
label = paste("Kbest =", optimal.k), hjust = -0.2) +
# data
geom_line(mapping = aes(ks, sil.avg), color = "#67000d") +
geom_point(mapping = aes(ks, sil.avg),
shape = 21, size = 2.5, color = "#67000d", fill = "#fc9272") +
# labs
labs(title = "Silhouette width method for optimal k",
x = 'Number of clusters (k)',y = 'Silhouette width')+
# theme
customize.bw
clust.fmado = rain.locs[, c("lon", "lat")] # lon, lat for 1250 valid grid points
clust.fmado$fmado = as.factor(cluster.ext[[which(ks == optimal.k)]]$clustering)
plot.spatialClust(clust.fmado, "fmado")
Silhouette Analysis selected Kbest = 3
Spatial clusters are broad regional partitioning across India. Western, Central, and Eastern zones aligns with monsoon variability extremes.
If the goal is study overall climate regions, go with KMeans without added geographical features (Section 3).
If the focus is extreme rainfall events, use F-madogram clustering (Section 5) for disaster planning or hydrological design.
| Aspect | KMeans/Medoids | Extreme Clustering |
|---|---|---|
| Metric basis | Mean values (e.g., average rainfall) | Extreme event frequency/distribution |
| Best K | 8 (without distance), 9 (with distance) | 3 |
| Sensitivity to elevation/coasts | High (with added features) | Moderate |
| Clarity of coastal influence | Strong in K = 9 (Section 4) | Moderate but evident |
| Cluster shapes | Geometrically cohesive | Broad regions; clusters shaped by extremes |
| Application focus | General climatological structure | Extreme rainfall-focused planning |
| Model used | KMeans, KMedoids, HC | PAM + F-Madogram |