Catalina Jerez
catalina.jerez@colorado.edu
To investigate spatio-temporal patterns in Indian monsoon rainfall (JJAS: June to September), we perform:
The aim is to identify the dominant modes of rainfall variability and explore their association with key ocean-atmosphere processes (e.g., ENSO and the Indian Ocean Dipole).
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 543202 29.1 1201865 64.2 NA 700280 37.4
## Vcells 1005101 7.7 8388608 64.0 36864 1963567 15.0
rm(list = ls())
# Load necessary libraries
spatialAnalysis.Lib = c("fields", "maps", "rnaturalearth", "sf")
statisAnalysis.Lib = c("psych")
dataManipulation.Lib = c("dplyr", "reshape2", "tidyr", "tidyverse")
dataVisualization.Lib = c("ggplot2", "ggrepel", "ggpubr", "RColorBrewer")
list.packages = unique(c(spatialAnalysis.Lib, statisAnalysis.Lib,
dataManipulation.Lib, dataVisualization.Lib))
# Load all required packages
sapply(list.packages, require, character.only = TRUE)
## fields maps rnaturalearth sf psych
## TRUE TRUE TRUE TRUE TRUE
## dplyr reshape2 tidyr tidyverse ggplot2
## TRUE TRUE TRUE TRUE TRUE
## ggrepel ggpubr RColorBrewer
## TRUE TRUE TRUE
Below we define functions for plotting the eigenvalue spectrum, spatial patterns, temporal PC series, and SST fields.
# constants
text.size = 15
customize.bw = theme_bw(base_family = "Times") +
theme(
axis.text.x = element_text(size = text.size, vjust = 0.5),
axis.text.y = element_text(size = text.size),
axis.title = element_text(size = text.size + 2, face = "bold"),
plot.title = element_text(size = text.size + 2, face = "bold"),
legend.position = "bottom",
legend.title = element_text(size = text.size, face = "bold"),
legend.text = element_text(size = text.size),
panel.border = element_rect(color = "#000000", fill = NA, linewidth = 0.5),
panel.grid.minor.x = element_line(color = "#d9d9d9", linetype = "dotted"),
panel.grid.minor.y = element_line(color = "#d9d9d9", linetype = "dotted"),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank()
)
# color palettes
palette.rdbu = colorRampPalette(rev(brewer.pal(9, "RdBu")))(200)
# world map
world.sf = ne_countries(scale = "medium", returnclass = "sf") %>%
st_wrap_dateline(options = c("WRAPDATELINE=YES", "DATELINEOFFSET=180")) %>%
st_transform("+proj=longlat +datum=WGS84")
# plot for PCA
plot.eigenSpectrum = function(pca_result, n = 15) {
eigvals = pca_result$sdev^2
frac_var = eigvals / sum(eigvals)
df = data.frame(Mode = 1:length(frac_var), Variance = frac_var)
breaks = seq(0, 15, 3); breaks[1] = 1
ggplot(df[1:n, ], aes(x = Mode, y = Variance)) +
geom_line(color = "#636363") +
geom_point(color = "#000", fill = "#bdbdbd", shape = 21, size = 3) +
scale_x_continuous(breaks = breaks)+
labs(y = "Fraction Variance", x = "Mode",title = "PCA Eigenvalue Spectrum")+
theme_bw(base_size = 12, base_family = "Times") +
customize.bw
}
# plot spatial patterns
plot.spatialPattern = function(eigenvector, grid, title, fill_limits = c(-0.1, 0.1) ) {
df = bind_cols(grid, value = eigenvector)
colnames(df)[c(1,2)] = c("lat", "lon")
df = df %>% # shift data from 0–360 to -180–180
mutate(lon = ifelse(lon > 180, lon - 360, lon))
ggplot(df, aes(x = lon, y = lat, fill = value)) +
geom_tile() +
# regions
geom_rect(aes(xmin = 66, xmax = 100, ymin = 6, ymax = 39),
inherit.aes = FALSE, color = "#41ab5d", fill = NA, linewidth = 1) +
geom_sf(data = world.sf, fill = NA, color = "black",
linewidth = 0.2, inherit.aes = FALSE) +
# labs
labs(title = title, x = "Longitude", y = "Latitude") +
# scales
# coord_sf(xlim = c(-180, 180), ylim = c(-20, 20), expand = FALSE, crs = "EPSG:4326") +
coord_sf(xlim = c(65, 101), ylim = c(5, 40), expand = FALSE, crs = "EPSG:4326") +
scale_fill_gradientn(colors = colorRampPalette(rev(brewer.pal(9, "RdBu")))(200),
limits = fill_limits,
guide = guide_colorbar(barwidth = 15, barheight = 2) ) +
scale_y_continuous(breaks = seq(10, 40 , 10))+
scale_x_continuous(breaks = seq(60, 100, 10))+
# theme
customize.bw
}
# plot temporal PC series + ENSO indices
plot.temporalPC = function(pc, indices, title) {
df = data.frame(Year = 1950:2016, PC = scale(pc))
df = cbind(df, indices)
# calculate correlation
cors = apply(df[, -c(1, 2)], 2, function(x) round(cor(df$PC, x, use = "complete.obs"), 2) )
cor.tx = paste0(paste(names(cors), cors, sep = " = "), collapse = ", ")
df.melt = reshape2::melt(df, id.vars = c("Year", "PC"),
variable.name = "Index",
value.name = "value")
df.melt = df.melt %>%
mutate(phase = case_when(value > 0.5 ~ "Warm phase",
value < -0.5 ~ "Cool phase",
TRUE ~ "Neutral phase"))
levels(df.melt$phase) = c("Cool phase", "Neutral phase","Warm phase")
# warm phase, cold phase, and neutral phase
scale.colors = c("#053061", "#00441b", "#b2182b")
scale.fill = c("#92c5de", "#a6dba0", "#f4a582")
ggplot(df.melt, aes(x = Year)) +
geom_hline(yintercept = 0, linetype = "solid", color = "#e0e0e0") +
geom_hline(yintercept = 2, linetype = "solid", color = "#67001f") +
geom_hline(yintercept =-2, linetype = "solid", color = "#053061") +
# ENSO indices
geom_col(aes(y = value, color = phase, fill = phase)) +
geom_line(aes(y = value), linewidth = .3, linetype = 1, color = "#67001f") +
# PC
geom_line(aes(y = PC), color = "#000", linewidth = .9) +
geom_point(aes(y = PC), color = "#000", fill = "#bdbdbd", shape = 21, size = 3) +
# labs
labs(title = paste0(title, "\n", cor.tx), y = "PC (scaled) / ENSO Index", x = "Year") +
facet_wrap(~ Index, ncol = 1, scales = "free_y") +
# scales
scale_x_continuous(limits = c(1950, 2015), breaks = seq(1950, 2015, 10)) +
scale_color_manual("Indice", values = scale.colors) +
scale_fill_manual("Indice", values = scale.fill) +
customize.bw +
theme(strip.text = element_text(size = text.size),
strip.background = element_blank())
}
# plot fields
plot.fields = function(pc, grd.avg, title){
r = apply(grd.avg, 2, function(x) cor(pc, x, use = "complete.obs"))
# significance (2-sided t-test)
n = nrow(grd.avg)
t = r * sqrt((n - 2) / (1 - r^2))
p = 2 * pt(-abs(t), df = n - 2)
sig = ifelse(p < 0.05, TRUE, FALSE)
# complete the grd
zcor = rep(NaN , nx.sst * ny.sst)
zsig = rep(FALSE, nx.sst * ny.sst)
zcor[sst.locs[, 3]] = r
zsig[sst.locs[, 3]] = sig
# grid for plot
df = expand.grid(lon = xgrid.sst, lat = ygrid.sst)
df$cor = zcor
df$sig = zsig
# pc.maps[[i]] =
ggplot(df, aes(x = lon, y = lat)) +
# correlation
geom_tile(aes(fill = cor)) +
geom_contour(aes(z = cor), color = "#000", bins = 6, linewidth = 0.5, na.rm = TRUE) +
# significance points
geom_point(data = df[df$sig,], aes(x = lon, y = lat), color = "#000", shape = 21, size = 0.6, stroke = 0.2) +
# regions
geom_rect(aes(xmin = 66, xmax = 100, ymin = 6, ymax = 39),
inherit.aes = FALSE, color = "#41ab5d", fill = NA, linewidth = 1) +
# borders("world", colour = "#000") +
geom_sf(data = world.sf, fill = NA, color = "black",
linewidth = 0.2, inherit.aes = FALSE) +
# labs
labs(title = title, x = "Longitude", y = "Latitude")+
# scales
scale_fill_gradientn(colors = colorRampPalette(rev(brewer.pal(9, "RdBu")))(200),
limits = c(-0.5, 0.5),
guide = guide_colorbar(barwidth = 15, barheight = 2) ) +
coord_sf(xlim = c(0, 358), ylim = c(-20, 20), expand = FALSE, crs = "EPSG:4326") +
scale_y_continuous(breaks = seq(-20, 20, 10))+
#theme
customize.bw
}
A 0.25° latitude-longitude grid is created over the study area, and the grid points are stored in xygrid.
# deg grid
nrows = 68
ncols = 65
ntime = 67 # Jun-Sep 1950 - 2016
nyrs = ntime # 1902 - 2016
nglobe = nrows*ncols
N = nrows*ncols
# Lat - Long grid
ygrid = seq(6.5 , 38.5, by=0.5)
xgrid = seq(66.5, 100 , by=0.5)
ny = length(ygrid)
nx = length(xgrid)
xygrid= matrix(0, nrow = nx*ny, ncol=2)
i = 0
for(iy in 1:ny){
for(ix in 1:nx){
i = i+1
xygrid[i,1] = ygrid[iy]
xygrid[i,2] = xgrid[ix]
}
}
Rainfall data for JJAS from 1950 to 2016 is read and reshaped into a matrix where each row represents one season (year) and each column a spatial location. Only valid (non-missing) data are retained.
path.data = "https://civil.colorado.edu/~balajir/CVEN6833/HWs/HW-2/Spring-2025-data-commands/"
data = readBin(paste0(path.data, "India-Rain-JJAS-05deg-1950-2016.r4"),
what = "numeric",
n = (nrows * ncols * ntime),
size = 4,
endian = "swap")
# Grid points over India
data = array(data = data, dim=c( nrows, ncols, ntime ) )
data1 = data[,,1]
index = 1:(nx*ny)
# pull only data and the locations with non-missing data
index1 = index[data1 != "NaN"]
xygrid1 = xygrid[index1,]
nsites = length(index1)
data2 = data1[index1]
# Rain data matrix - rows are seasonal (i.e. one value per year)
# and columns are locations
raindata = matrix(NA, nrow=nyrs, ncol=nsites)
for(i in 1:nyrs){
data1 = data[,,i]
index1 = index[ data1 != "NaN"]
data2 = data1[index1]
raindata[i,] = data2
}
index = 1:dim(raindata)[2]
xx = apply(raindata,2,mean)
index2 = index1[xx > 0]
index3 = index[xx > 0]
xygrid1 = xygrid[index2,]
rainavg = raindata[,index3]
indexgrid = index2
rm("data") #remove the object data to clear up space
SST data for the tropical Pacific is loaded and reshaped. Only valid SST points are retained based on the provided locations.
# grid SST (NOAA tropical)
ygrid.sst = seq(-16, 16 , by = 2)
xgrid.sst = seq(0 , 358, by = 2)
ny.sst = length(ygrid.sst)
nx.sst = length(xgrid.sst)
# read SST valid points
sst.locs = read.table(paste0(path.data, "NOAA-trop-sst-locs.txt"))
# read SST binary
data.raw = readBin(paste0(path.data, "NOAA-Trop-JJAS-SST-1950-2016.r4"),
what = "numeric",
n = (nx.sst * ny.sst * ntime),
size = 4,
endian = "swap")
# reshape to (ny, nx, ntime)
sst.array = array(data.raw, dim = c(ny.sst, nx.sst, ntime))
sst.indices = sst.locs[, 3]
nsites = length(sst.indices)
# construct sst.avg (ntime rows × valid SST sites columns)
sst.avg = matrix(NA, nrow = ntime, ncol = nsites)
for (t in 1:ntime) {
sst.slice = sst.array[, , t]
sst.avg[t, ] = sst.slice[sst.indices]
}
nino.ind = c("NINO12","NINO3", "NINO34", "NINO4")
base.url = "http://iridl.ldeo.columbia.edu/SOURCES/.Indices/.nino/.EXTENDED"
suffix.url = "/T/(Jan%201950)/(Dec%202016)/RANGE/T/(Jun-Sep)/seasonalAverage/data.ch"
enso.list = lapply(nino.ind, function(ind){
scan(paste0(base.url, "/.", ind, suffix.url))
})
names(enso.list) = tolower(nino.ind)
enso.indices = as.data.frame(enso.list)
The rainfall data is scaled and PCA is performed to extract the
dominant modes of variability.
The eigenvalue spectrum helps us
understand how much variance each mode explains.
# perform PCA
rainscale = scale(rainavg) # scale the data
rain.pca = prcomp(rainscale, center = TRUE, scale. = TRUE)
# plot eigenvalue spectrum (first 15 modes)
plot.eigenSpectrum(rain.pca)
\(\to\) The eigenvalue spectrum plot
shows that the first few modes (especially mode 1)
explain a significant fraction of the total variance.
\(\to\) Thus, the dominant patterns
of rainfall variability are captured by the leading principal
components.
Plotting the Leading 4 Spatial Patterns (EOFs)
# plot leading 4 spatial patterns (EOFs)
eof.p = list()
for (i in 1:4) {
eof.p[[i]] = plot.spatialPattern(rain.pca$rotation[, i], xygrid1, paste("EOF", i))
}
ggarrange(eof.p[[1]], eof.p[[2]], eof.p[[3]], eof.p[[4]],
common.legend = TRUE,
legend = "bottom",
ncol = 2, nrow = 2)
\(\to\) EOF1:
displays a large-scale coherent pattern that likely represents the
overall monsoon signal across India.
\(\to\) EOF2–EOF4: highlight regional anomalies, capturing gradients (east-west or north-south) in the rainfall field. These patterns point to localized modes of variability.
Plotting the Leading 4 Temporal Patterns (PCs)
# plot leading 4 temporal patterns (PCs)
pc.temp = list()
for (i in 1:4) {
pc.temp[[i]] = plot.temporalPC(pc = rain.pca$x[, i],
indices = enso.indices,
title = paste("Principal Component", i)
)
}
pc.temp[[1]]
pc.temp[[2]]
pc.temp[[3]]
pc.temp[[4]]
# ggarrange(pc.temp[[1]], pc.temp[[2]], pc.temp[[3]], pc.temp[[4]],
# ncol = 1)
\(\to\) The temporal plots show interannual variability. PC1, for instance, may reflect the overall strength of the monsoon, while PCs 2–4 capture more nuanced, regional fluctuations over time.
Rotation is applied to enhance the physical interpretability of the spatial patterns.
The varimax rotation redistributes the explained variance so that the resulting patterns (rotated EOFs) are more localized.
pc.data = rain.pca$x[, 1:6]
rain.rot = principal(pc.data, nfactors = 6, rotate = "varimax", scores = TRUE)
Plotting Rotated Spatial Patterns
rot.eof = list()
for (i in 1:4) {
# project back spatially: rotated spatial = rotation * loading
rotated_spatial = as.matrix(rain.pca$rotation[, 1:6]) %*% as.matrix(rain.rot$loadings[, i])
rot.eof[[i]] = plot.spatialPattern(rotated_spatial, xygrid1, paste("Rotated EOF", i))
}
ggarrange(rot.eof[[1]], rot.eof[[2]],
rot.eof[[3]], rot.eof[[4]],
common.legend = TRUE,
legend = "bottom",
ncol = 2, nrow = 2)
Plotting Rotated Temporal Patterns
rot.pc = list()
for (i in 1:4) {
rot.pc[[i]] = plot.temporalPC(pc = rain.rot$scores[, i],
indices = enso.indices,
title = paste("Rotated Principal Component", i)
)
}
rot.pc[[1]]
rot.pc[[2]]
rot.pc[[3]]
rot.pc[[4]]
# ggarrange(rot.pc[[1]], rot.pc[[2]], rot.pc[[3]], rot.pc[[4]],
# ncol = 1)
The rotated EOFs are more localized compared to their unrotated counterparts, making it easier to associate them with specific regional rainfall patterns.
The temporal evolution of the rotated PCs remains similar to the unrotated PCs, ensuring that the major modes of variability are preserved but are now more physically interpretable.
PCA Results: the eigenvalue spectrum and EOF maps indicate that the first few modes capture most of the variability. PC1 represents the large-scale monsoon signal, whereas PCs 2–4 represent more regional variations.
Rotated PCA: varimax rotation enhances the physical interpretability of the spatial patterns by localizing them, which is useful for identifying region-specific rainfall behaviors.
SST Correlations: significant correlations between rainfall PCs and SST fields provide evidence of ocean-atmosphere coupling. In particular, the strong association between PC1 and Pacific SSTs reinforces the role of ENSO in modulating monsoon variability.