Catalina Jerez
catalina.jerez@colorado.edu
Apply self-organizing maps (SOM) to the summer season rainfall over India. Try 3 x 3 or 4 x 4 node configuration. For each node composite the rainfall and the SSTs and show the maps.
Methodology:
Step 1: Randomly position the grid’s neurons in the data space.
Step 2: Select one data point, either randomly or systematically cycling through the dataset in order.
Step 3: Find the neuron that is closest to the chosen data point. This neuron is called the Best Matching Unit (BMU).
Step 4: Move the BMU closer to that data point. The distance moved by the BMU is determined by a learning rate, which decreases after each iteration.
Step 5: Move the BMU’s neighbors closer to that data point as well, with farther away neighbors moving less. Neighbors are identified using a radius around the BMU, and the value for this radius decreases after each iteration.
Step 6: Update the learning rate and BMU radius, before repeating Steps 1 to 4. Iterate these steps until positions of neurons have been stabilized.
Goal: provide insights into the relationship between the SSTs and their signature on the rainfall extremes.
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 543028 29.1 1201367 64.2 NA 700282 37.4
## Vcells 1003651 7.7 8388608 64.0 36864 1963597 15.0
rm(list = ls())
# Load necessary libraries
spatialAnalysis.Lib = c("maps", "rnaturalearth", "sf", "fields", "raster",
"rasterVis")
statisAnalysis.Lib = c("DescTools")
dataManipulation.Lib = c("dplyr", "reshape2", "tidyr")
dataVisualization.Lib = c("ggplot2", "ggrepel", "ggpubr", "RColorBrewer")
modeling.Lib = c("kohonen")
list.packages = unique(c(spatialAnalysis.Lib, statisAnalysis.Lib,
modeling.Lib,
dataManipulation.Lib, dataVisualization.Lib))
# Load all required packages
sapply(list.packages, require, character.only = TRUE)
## maps rnaturalearth sf fields raster
## TRUE TRUE TRUE TRUE TRUE
## rasterVis DescTools kohonen dplyr reshape2
## TRUE TRUE TRUE TRUE TRUE
## tidyr ggplot2 ggrepel ggpubr RColorBrewer
## TRUE TRUE TRUE TRUE TRUE
# functions for graphs ----------------------------------------------------------
# constants
text.size = 15
customize.bw = theme_bw(base_family = "Times") +
theme(
axis.text.x = element_text(size = text.size, vjust = 0.5),
axis.text.y = element_text(size = text.size),
axis.title = element_text(size = text.size, face = "bold"),
plot.title = element_text(size = text.size, face = "bold"),
plot.subtitle = element_text(size = text.size-1, face = "italic"),
# legend
legend.position = "bottom",
legend.title = element_text(size = text.size, face = "bold"),
legend.text = element_text(size = text.size),
# strip
strip.background = element_blank(),
panel.border = element_rect(color = "#000000", fill = NA, linewidth = 0.5),
panel.grid.minor.x = element_line(color = "#d9d9d9", linetype = "dotted"),
panel.grid.minor.y = element_line(color = "#d9d9d9", linetype = "dotted"),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank()
)
# world map
world.sf = ne_countries(scale = "medium", returnclass = "sf") %>%
st_wrap_dateline(options = c("WRAPDATELINE=YES", "DATELINEOFFSET=180")) %>%
st_transform("+proj=longlat +datum=WGS84")
# considering max.k = 10 cluster. update if required more clusters.
scale.color = c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99",
"#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a")
prec.colors = colorRampPalette(c("#0000FF", "#00FFFF", "#00FF00", "#FFFF00",
"#FFA500", "#FF0000", "#FF00FF"))(1000)
sst.colors = colorRampPalette(c('#681165', "#073B7A", "#166ABF", "#2A95CC",
"#2BB8B3", "#2FCF8B", "#FFF300", "#F7B000",
"#E84F10", '#99000d','#000000'))(1000)
# plot spatial cluster
plot.spatialSOM = function(data){
# data = rain.nodes
ggplot(data, aes(x = lon, y = lat)) +
# correlation
geom_tile(aes(fill = value)) +
geom_contour(aes(z = value), color = "#000", bins = 6, linewidth = 0.5, na.rm = TRUE) +
# regions
annotate("rect", xmin = 66, xmax = 100, ymin = 6, ymax = 39,
color = "#41ab5d", fill = NA, linewidth = 1) +
geom_sf(data = world.sf, fill = NA, color = "black",
linewidth = 0.2, inherit.aes = FALSE) +
# labs
labs(title = "SOM of Seasonal Rainfall",
y = "Latitude", x= "Longitude") +
facet_wrap(~ variable, ncol = 4) +
# scales
scale_fill_gradientn("Rainfall (mm)", colors = prec.colors) +
guides(fill = guide_colorbar(barwidth = 15, barheight = 2) ) +
coord_sf(xlim = c(65, 101), ylim = c(5, 40), expand = FALSE, crs = "EPSG:4326") +
scale_y_continuous(breaks = seq(10, 40 , 10)) +
scale_x_continuous(breaks = seq(60, 100, 20)) +
#theme
customize.bw +
theme(strip.text = element_text(face = "italic", size = text.size))
}
# plot spatial cluster
plot.spatialSOM.sst = function(data){
# data = rain.nodes
data <- data %>%
mutate(lon = ifelse(lon > 180, lon - 360, lon)) # shift 0–360 to -180–180
ggplot(data, aes(x = lon, y = lat)) +
# correlation
geom_tile(aes(fill = value)) +
geom_contour(aes(z = value), color = "#000", bins = 6, linewidth = 0.5, na.rm = TRUE) +
# regions
annotate("rect", xmin = 66, xmax = 100, ymin = 6, ymax = 39,
color = "#bdbdbd", fill = NA, linewidth = 1) +
geom_sf(data = world.sf, fill = NA, color = "black",
linewidth = 0.2, inherit.aes = FALSE) +
# labs
labs(title = "SOM of Seasonal SST",
y = "Latitude", x= "Longitude") +
facet_wrap(~ variable, ncol = 2) +
# scales
scale_fill_gradientn("SST", colors = sst.colors) +
guides(fill = guide_colorbar(barwidth = 15, barheight = 2) ) +
coord_sf(ylim = c(-20, 20), expand = FALSE, crs = "EPSG:4326") +
scale_y_continuous(breaks = seq(-40, 40, 20))+
#theme
customize.bw +
theme(strip.text = element_text(face = "italic", size = text.size))
}
Rainfall data for JJAS from 1950 to 2016 is read and reshaped into a matrix where each row represents one season (year) and each column a spatial location. Only valid (non-missing) data are retained.
path.data = "https://civil.colorado.edu/~balajir/CVEN6833/HWs/HW-2/Spring-2025-data-commands/"
# Global grid
# deg grid
nrows = 68
ncols = 65
ntime = 67 # Jun-Sep 1950 - 2016
# Lat - Long grid
ygrid = seq(6.5 , 38.5, by=0.5)
xgrid = seq(66.5, 100 , by=0.5)
ny = length(ygrid)
nx = length(xgrid)
xygrid= matrix(0, nrow = nx*ny, ncol=2)
i = 0
for(iy in 1:ny){
for(ix in 1:nx){
i = i+1
xygrid[i,1] = ygrid[iy]
xygrid[i,2] = xgrid[ix]
}
}
# read India location valid points
rain.locs = read.table(paste0(path.data, "India-rain-locs.txt"))
colnames(rain.locs) = c("lat", "lon", "index")
# read data binary
data.raw = readBin(paste0(path.data, "India-Rain-JJAS-05deg-1950-2016.r4"),
what = "numeric",
n = (nx * ny * ntime),
size = 4,
endian = "swap")
# reshape to (ny, nx, ntime)
rain.array = array(data.raw, dim = c(ny, nx, ntime))
rain.indices = rain.locs[, 3]
nsites = length(rain.indices)
# construct rain.avg (ntime rows × valid rain sites columns)
rain.avg = matrix(NA, nrow = ntime, ncol = nsites)
for (t in 1:ntime) {
rain.slice = rain.array[, , t]
rain.avg[t, ] = rain.slice[rain.indices]
}
rain.mean.temp = rowMeans(rain.avg, na.rm = TRUE)
rain.mean.site = colMeans(rain.avg, na.rm = TRUE)
SST data for the tropical Pacific is loaded and reshaped. Only valid SST points are retained based on the provided locations.
path.data = "https://civil.colorado.edu/~balajir/CVEN6833/HWs/HW-2/Spring-2025-data-commands/"
# grid SST (NOAA tropical)
ntime = 67 # Jun-Sep 1950 - 2016
# ygrid.sst = seq(-16, 16 , by = 2)
ygrid.sst = seq(-20, 20 , by = 2)
xgrid.sst = seq(0 , 358, by = 2)
ny.sst = length(ygrid.sst)
nx.sst = length(xgrid.sst)
# read SST valid points
sst.locs = read.table(paste0(path.data, "NOAA-trop-sst-locs.txt"))
colnames(sst.locs) = c("lat", "lon", "index")
sst.indices = sst.locs[, 3]
nsites = length(sst.indices)
# read SST binary
data.raw = readBin(paste0(path.data, "NOAA-Trop-JJAS-SST-1950-2016.r4"),
what = "numeric",
n = (nx.sst * ny.sst * ntime),
size = 4,
endian = "swap")
# reshape to (ny, nx, ntime)
sst.array = array(data.raw, dim = c(ny.sst, nx.sst, ntime))
# construct sst.avg (ntime rows × valid SST sites columns)
sst.avg = matrix(NA, nrow = ntime, ncol = nsites)
for (t in 1:ntime) {
sst.slice = sst.array[, , t]
sst.avg[t, ] = sst.slice[sst.indices]
}
set.seed(250403)
# Train SOM (e.g., 3x3 or 4x4 grid)
n.nodes = 4
times.node = n.nodes * n.nodes
som.grid = somgrid(xdim = n.nodes, ydim = n.nodes, topo = "rectangular")
som.model = som(scale(rain.avg), grid = som.grid, rlen = 200)
plot(som.model, type = "codes")
plot(som.model, type = "changes")
plot(som.model, type = "counts")
plot(som.model, type = "mapping")
# Array: Location x Node
prec.nodes = as.data.frame(array(NA, dim = c(dim(rain.avg)[2], times.node) ))
for (i in 1:times.node) {
index = which(som.model$unit.classif == i) # which years belong to which node
tmp.df = rain.avg[index,]
if (length(index) > 1) {
prec.nodes[,i] = colMeans(tmp.df, na.rm = T)
}
else {
prec.nodes[,i] = tmp.df
}
}
colnames(prec.nodes) = paste0("Node ", seq(1, times.node))
prec.nodes = prec.nodes %>%
dplyr::mutate(across(
everything(), ~ Winsorize(., val = quantile(., probs = c(0.1, 0.9), na.rm = T) ) ))
rain.nodes = rain.locs[, c("lon", "lat")]
rain.nodes = cbind(rain.nodes, prec.nodes)
rain.nodes = melt(rain.nodes, id = c("lon", "lat"))
plot.spatialSOM(rain.nodes)
Node 1–4: generally lower rainfall, some dry western regions.
Node 5–8: moderate rainfall in central India.
Node 9–12: high rainfall over northeast, central zones.
Node 13–16: extreme rainfall patterns, with intense values in coastal & northeast zones.
sst.nodes = as.data.frame(array(NA, dim = c(dim(sst.avg)[2], times.node)))
for(i in 1:times.node){
index = which(som.model$unit.classif==i)
tmp.df = sst.avg[index,]
if(length(index)>1){
sst.nodes[,i] = colMeans(tmp.df, na.rm=T)
}else{
sst.nodes[,i] = tmp.df
}
}
colnames(sst.nodes) = paste0("Node ", seq(1, times.node))
sst.nodes = sst.nodes %>%
dplyr::mutate(across(
everything(), ~ Winsorize(., val = quantile(., probs = c(0.05, 0.95), na.rm = T) ) ))
sst.nodes = cbind(sst.locs[, c("lon", "lat")], sst.nodes)
sst.nodes = melt(sst.nodes, id = c("lon", "lat"))
plot.spatialSOM.sst(sst.nodes)
The strongest El Niño pattern appears in Nodes 7, 8, 12, with SST warming in central/eastern Pacific.
La Niña (cool central/eastern Pacific) correspond to wet monsoon years in Nodes 13–16, which aligning with stronger rainfall signals in central/northeastern India.
The 4×4 SOM grid captures diverse rainfall patterns, many linked to known SST anomalies like El Niño and La Niña.
The approach enhances monsoon forecasting by bridging local rainfall fields with large-scale ocean-atmosphere signals, allowing us identify:
| SOM Node Range | Rainfall Pattern | SST Signature | Likely Teleconnection |
|---|---|---|---|
| 1–4 | Dry | El Niño (warm East Pac) | Weak monsoon |
| 5–8 | Moderate | Neutral/Central SSTs | Transition phase |
| 9–12 | Wet | Slightly cool SSTs | Mild La Niña influence |
| 13–16 | Very Wet | La Niña (cool East Pac) | Strong monsoon |