Catalina Jerez
catalina.jerez@colorado.edu


To investigate spatio-temporal patterns in Indian monsoon rainfall (JJAS: June to September), we perform:

The aim is to identify the dominant modes of rainfall variability and explore their association with key ocean-atmosphere processes (e.g., ENSO and the Indian Ocean Dipole).

1 Library & functions

1.1 Libraries

gc()
##           used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells  543202 29.1    1201865 64.2         NA   700280 37.4
## Vcells 1005101  7.7    8388608 64.0      36864  1963567 15.0
rm(list = ls())

# Load necessary libraries
spatialAnalysis.Lib     = c("fields", "maps", "rnaturalearth", "sf")
statisAnalysis.Lib      = c("psych")
dataManipulation.Lib    = c("dplyr", "reshape2", "tidyr", "tidyverse") 
dataVisualization.Lib   = c("ggplot2", "ggrepel", "ggpubr", "RColorBrewer")
list.packages           = unique(c(spatialAnalysis.Lib, statisAnalysis.Lib,
                                   dataManipulation.Lib, dataVisualization.Lib))
# Load all required packages
sapply(list.packages, require, character.only = TRUE)
##        fields          maps rnaturalearth            sf         psych 
##          TRUE          TRUE          TRUE          TRUE          TRUE 
##         dplyr      reshape2         tidyr     tidyverse       ggplot2 
##          TRUE          TRUE          TRUE          TRUE          TRUE 
##       ggrepel        ggpubr  RColorBrewer 
##          TRUE          TRUE          TRUE

1.2 Functions

Below we define functions for plotting the eigenvalue spectrum, spatial patterns, temporal PC series, and SST fields.

# constants 
text.size    = 15
customize.bw = theme_bw(base_family = "Times") +
  theme(
    axis.text.x        = element_text(size = text.size, vjust = 0.5),
    axis.text.y        = element_text(size = text.size),
    axis.title         = element_text(size = text.size + 2, face = "bold"),
    plot.title         = element_text(size = text.size + 2, face = "bold"),
    
    legend.position    = "bottom",
    legend.title       = element_text(size = text.size, face = "bold"),
    legend.text        = element_text(size = text.size),
    
    panel.border       = element_rect(color = "#000000", fill = NA, linewidth = 0.5),
    panel.grid.minor.x = element_line(color = "#d9d9d9", linetype = "dotted"),
    panel.grid.minor.y = element_line(color = "#d9d9d9", linetype = "dotted"),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_blank()
  )
# color palettes
palette.rdbu = colorRampPalette(rev(brewer.pal(9, "RdBu")))(200)
# world map
world.sf     = ne_countries(scale = "medium", returnclass = "sf") %>%
    st_wrap_dateline(options = c("WRAPDATELINE=YES", "DATELINEOFFSET=180")) %>%
    st_transform("+proj=longlat +datum=WGS84")


#  plot for PCA
plot.eigenSpectrum = function(pca_result, n = 15) {
  eigvals  = pca_result$sdev^2
  frac_var = eigvals / sum(eigvals)
  df       = data.frame(Mode = 1:length(frac_var), Variance = frac_var)
  breaks   = seq(0, 15, 3); breaks[1] = 1
  ggplot(df[1:n, ], aes(x = Mode, y = Variance)) +
    geom_line(color  = "#636363") + 
    geom_point(color = "#000", fill = "#bdbdbd", shape = 21, size = 3) +
    scale_x_continuous(breaks = breaks)+
    labs(y = "Fraction Variance", x = "Mode",title = "PCA Eigenvalue Spectrum")+
    theme_bw(base_size = 12, base_family = "Times") + 
    customize.bw
}

# plot spatial patterns
plot.spatialPattern = function(eigenvector, grid, title, fill_limits = c(-0.1, 0.1) ) {
  df                   = bind_cols(grid, value = eigenvector)
  colnames(df)[c(1,2)] = c("lat", "lon")
  df                   = df %>% # shift data from 0–360 to -180–180
    mutate(lon = ifelse(lon > 180, lon - 360, lon))
  
  ggplot(df, aes(x = lon, y = lat, fill = value)) +
    geom_tile() +
   # regions
    geom_rect(aes(xmin = 66, xmax = 100, ymin = 6, ymax = 39),
              inherit.aes = FALSE, color = "#41ab5d", fill = NA, linewidth = 1) +
    geom_sf(data = world.sf, fill = NA, color = "black", 
            linewidth = 0.2, inherit.aes = FALSE) +
    # labs
    labs(title = title, x = "Longitude", y = "Latitude") +
    # scales
    # coord_sf(xlim = c(-180, 180), ylim = c(-20, 20), expand = FALSE, crs = "EPSG:4326") +
    coord_sf(xlim = c(65, 101), ylim = c(5, 40), expand = FALSE, crs = "EPSG:4326") +
    scale_fill_gradientn(colors = colorRampPalette(rev(brewer.pal(9, "RdBu")))(200), 
                         limits = fill_limits,
                         guide  = guide_colorbar(barwidth = 15, barheight = 2) ) +
    scale_y_continuous(breaks = seq(10, 40 , 10))+
    scale_x_continuous(breaks = seq(60, 100, 10))+
    # theme 
    customize.bw
}


# plot temporal PC series + ENSO indices
plot.temporalPC = function(pc, indices, title) {
  df      = data.frame(Year = 1950:2016, PC = scale(pc))
  df      = cbind(df, indices)
  # calculate correlation
  cors    = apply(df[, -c(1, 2)], 2, function(x) round(cor(df$PC, x, use = "complete.obs"), 2) )
  cor.tx  = paste0(paste(names(cors), cors, sep = " = "), collapse = ", ")
  df.melt = reshape2::melt(df, id.vars  = c("Year", "PC"), 
                           variable.name = "Index", 
                           value.name    = "value")
  df.melt = df.melt %>%
    mutate(phase = case_when(value > 0.5  ~ "Warm phase",
                             value < -0.5 ~ "Cool phase",
                             TRUE         ~ "Neutral phase"))
  
  levels(df.melt$phase) = c("Cool phase", "Neutral phase","Warm phase")
  # warm phase, cold phase, and neutral phase
  scale.colors = c("#053061", "#00441b", "#b2182b")
  scale.fill   = c("#92c5de", "#a6dba0", "#f4a582")
  
  ggplot(df.melt, aes(x = Year)) +
    geom_hline(yintercept = 0, linetype = "solid", color = "#e0e0e0") +
    geom_hline(yintercept = 2, linetype = "solid", color = "#67001f") +
    geom_hline(yintercept =-2, linetype = "solid", color = "#053061") +
    
    # ENSO indices
    geom_col(aes(y = value, color = phase, fill = phase)) + 
    geom_line(aes(y = value), linewidth = .3, linetype = 1, color = "#67001f") +
    # PC 
    geom_line(aes(y = PC), color = "#000", linewidth = .9) +
    geom_point(aes(y = PC), color = "#000", fill = "#bdbdbd", shape = 21, size = 3) +
    
    # labs 
    labs(title = paste0(title, "\n", cor.tx), y = "PC (scaled) / ENSO Index", x = "Year") +
    facet_wrap(~ Index, ncol = 1, scales = "free_y") +
    # scales
    scale_x_continuous(limits = c(1950, 2015), breaks = seq(1950, 2015, 10)) + 
    scale_color_manual("Indice", values = scale.colors) + 
    scale_fill_manual("Indice", values = scale.fill) + 
    customize.bw + 
    theme(strip.text = element_text(size = text.size),
          strip.background = element_blank())
}

# plot fields 
plot.fields = function(pc, grd.avg, title){
  r   = apply(grd.avg, 2, function(x) cor(pc, x, use = "complete.obs"))
  
  # significance (2-sided t-test)
  n   = nrow(grd.avg)
  t   = r * sqrt((n - 2) / (1 - r^2))
  p   = 2 * pt(-abs(t), df = n - 2)
  sig = ifelse(p < 0.05, TRUE, FALSE)
  
  # complete the grd
  zcor     = rep(NaN  , nx.sst * ny.sst)
  zsig     = rep(FALSE, nx.sst * ny.sst)
  zcor[sst.locs[, 3]] = r
  zsig[sst.locs[, 3]] = sig
  
  # grid for plot
  df     = expand.grid(lon = xgrid.sst, lat = ygrid.sst)
  df$cor = zcor
  df$sig = zsig
  
  # pc.maps[[i]] = 
  ggplot(df, aes(x = lon, y = lat)) +
    # correlation 
    geom_tile(aes(fill = cor)) +
    geom_contour(aes(z = cor), color = "#000", bins = 6, linewidth = 0.5, na.rm = TRUE) +
    # significance points 
    geom_point(data = df[df$sig,], aes(x = lon, y = lat), color = "#000", shape = 21, size = 0.6, stroke = 0.2) +
    # regions
    geom_rect(aes(xmin = 66, xmax = 100, ymin = 6, ymax = 39),
              inherit.aes = FALSE, color = "#41ab5d", fill = NA, linewidth = 1) +
    # borders("world", colour = "#000") +
    geom_sf(data = world.sf, fill = NA, color = "black", 
            linewidth = 0.2, inherit.aes = FALSE) +
    # labs
    labs(title = title, x = "Longitude", y = "Latitude")+
    # scales
    scale_fill_gradientn(colors = colorRampPalette(rev(brewer.pal(9, "RdBu")))(200), 
                         limits = c(-0.5, 0.5),
                         guide  = guide_colorbar(barwidth = 15, barheight = 2) ) +
    coord_sf(xlim = c(0, 358), ylim = c(-20, 20), expand = FALSE, crs = "EPSG:4326") +
    scale_y_continuous(breaks = seq(-20, 20, 10))+
    #theme
    customize.bw
}

2 Load data and pre-process

2.1 Global grid setup

A 0.25° latitude-longitude grid is created over the study area, and the grid points are stored in xygrid.

# deg grid
nrows = 68
ncols = 65
ntime = 67    # Jun-Sep 1950 - 2016
nyrs  = ntime # 1902 - 2016
nglobe = nrows*ncols
N      = nrows*ncols

# Lat - Long grid
ygrid = seq(6.5 , 38.5, by=0.5)
xgrid = seq(66.5, 100 , by=0.5)
ny    = length(ygrid)
nx    = length(xgrid)
xygrid= matrix(0, nrow = nx*ny, ncol=2)
i = 0
for(iy in 1:ny){
  for(ix in 1:nx){
  i           = i+1
  xygrid[i,1] = ygrid[iy]
  xygrid[i,2] = xgrid[ix]
  }
}

2.2 Rajeevan Gridded rainfall

Rainfall data for JJAS from 1950 to 2016 is read and reshaped into a matrix where each row represents one season (year) and each column a spatial location. Only valid (non-missing) data are retained.

path.data = "https://civil.colorado.edu/~balajir/CVEN6833/HWs/HW-2/Spring-2025-data-commands/"
data      = readBin(paste0(path.data, "India-Rain-JJAS-05deg-1950-2016.r4"), 
                    what   = "numeric", 
                    n      = (nrows * ncols * ntime), 
                    size   = 4,
                    endian = "swap")
# Grid points over India
data  = array(data = data, dim=c( nrows, ncols, ntime ) )
data1 = data[,,1]
index = 1:(nx*ny)

# pull only data and the locations with non-missing data
index1  = index[data1 != "NaN"]
xygrid1 = xygrid[index1,]
nsites  = length(index1)
data2   = data1[index1]

# Rain data matrix - rows are seasonal (i.e. one value per year)
# and columns are locations
raindata     = matrix(NA, nrow=nyrs, ncol=nsites)
for(i in 1:nyrs){
data1        = data[,,i]
index1       = index[ data1 != "NaN"]
data2        = data1[index1]
raindata[i,] = data2
}

index     = 1:dim(raindata)[2]
xx        = apply(raindata,2,mean)
index2    = index1[xx > 0]
index3    = index[xx > 0]
xygrid1   = xygrid[index2,]
rainavg   = raindata[,index3]
indexgrid = index2
rm("data") #remove the object data to clear up space

2.3 Sea Surface Temperature fields

SST data for the tropical Pacific is loaded and reshaped. Only valid SST points are retained based on the provided locations.

# grid SST (NOAA tropical)
ygrid.sst = seq(-16, 16 , by = 2)
xgrid.sst = seq(0  , 358, by = 2)
ny.sst    = length(ygrid.sst)
nx.sst    = length(xgrid.sst)

# read SST valid points
sst.locs = read.table(paste0(path.data, "NOAA-trop-sst-locs.txt"))
# read SST binary
data.raw = readBin(paste0(path.data, "NOAA-Trop-JJAS-SST-1950-2016.r4"),
                       what   = "numeric",
                       n      = (nx.sst * ny.sst * ntime),
                       size   = 4,
                       endian = "swap")
# reshape to (ny, nx, ntime)
sst.array   = array(data.raw, dim = c(ny.sst, nx.sst, ntime))
sst.indices = sst.locs[, 3]
nsites      = length(sst.indices)

# construct sst.avg (ntime rows × valid SST sites columns)
sst.avg     = matrix(NA, nrow = ntime, ncol = nsites)
for (t in 1:ntime) {
  sst.slice    = sst.array[, , t]
  sst.avg[t, ] = sst.slice[sst.indices]
}

2.4 Indices ENSO EXTENDED

nino.ind   = c("NINO12","NINO3", "NINO34", "NINO4")
base.url   = "http://iridl.ldeo.columbia.edu/SOURCES/.Indices/.nino/.EXTENDED"
suffix.url = "/T/(Jan%201950)/(Dec%202016)/RANGE/T/(Jun-Sep)/seasonalAverage/data.ch"
enso.list  = lapply(nino.ind, function(ind){
  scan(paste0(base.url, "/.", ind, suffix.url))
})

names(enso.list) = tolower(nino.ind)
enso.indices     = as.data.frame(enso.list)

3 Principal Component Analysis (PCA)

The rainfall data is scaled and PCA is performed to extract the dominant modes of variability.
The eigenvalue spectrum helps us understand how much variance each mode explains.

# perform PCA
rainscale = scale(rainavg) # scale the data
rain.pca  = prcomp(rainscale, center = TRUE, scale. = TRUE)
# plot eigenvalue spectrum (first 15 modes)
plot.eigenSpectrum(rain.pca)

\(\to\) The eigenvalue spectrum plot shows that the first few modes (especially mode 1) explain a significant fraction of the total variance.

\(\to\) Thus, the dominant patterns of rainfall variability are captured by the leading principal components.

Plotting the Leading 4 Spatial Patterns (EOFs)

# plot leading 4 spatial patterns (EOFs)
eof.p   = list()
for (i in 1:4) {
 eof.p[[i]] = plot.spatialPattern(rain.pca$rotation[, i], xygrid1, paste("EOF", i))
}
ggarrange(eof.p[[1]], eof.p[[2]], eof.p[[3]], eof.p[[4]], 
          common.legend = TRUE,
          legend        = "bottom",
          ncol = 2, nrow = 2)

\(\to\) EOF1: displays a large-scale coherent pattern that likely represents the overall monsoon signal across India.

\(\to\) EOF2–EOF4: highlight regional anomalies, capturing gradients (east-west or north-south) in the rainfall field. These patterns point to localized modes of variability.

Plotting the Leading 4 Temporal Patterns (PCs)

# plot leading 4 temporal patterns (PCs)
pc.temp = list()
for (i in 1:4) {
  pc.temp[[i]] = plot.temporalPC(pc      = rain.pca$x[, i], 
                                 indices = enso.indices,
                                 title   = paste("Principal Component", i)
                                 )
}
pc.temp[[1]]

pc.temp[[2]]

pc.temp[[3]]

pc.temp[[4]]

# ggarrange(pc.temp[[1]], pc.temp[[2]], pc.temp[[3]], pc.temp[[4]], 
          # ncol = 1)

\(\to\) The temporal plots show interannual variability. PC1, for instance, may reflect the overall strength of the monsoon, while PCs 2–4 capture more nuanced, regional fluctuations over time.

4 Rotated PCA (Varimax)

Rotation is applied to enhance the physical interpretability of the spatial patterns.

The varimax rotation redistributes the explained variance so that the resulting patterns (rotated EOFs) are more localized.

pc.data  = rain.pca$x[, 1:6]
rain.rot = principal(pc.data, nfactors = 6, rotate = "varimax", scores = TRUE)

Plotting Rotated Spatial Patterns

rot.eof  = list()
for (i in 1:4) {
  # project back spatially: rotated spatial = rotation * loading
  rotated_spatial = as.matrix(rain.pca$rotation[, 1:6]) %*% as.matrix(rain.rot$loadings[, i])
  rot.eof[[i]]    = plot.spatialPattern(rotated_spatial, xygrid1, paste("Rotated EOF", i))
}
ggarrange(rot.eof[[1]], rot.eof[[2]], 
          rot.eof[[3]], rot.eof[[4]], 
          common.legend = TRUE,
          legend = "bottom",
          ncol = 2, nrow = 2)

Plotting Rotated Temporal Patterns

rot.pc = list()
for (i in 1:4) {
  rot.pc[[i]] = plot.temporalPC(pc      = rain.rot$scores[, i], 
                                indices = enso.indices,
                                title   = paste("Rotated Principal Component", i)
                                 )
}
rot.pc[[1]]

rot.pc[[2]]

rot.pc[[3]]

rot.pc[[4]]

# ggarrange(rot.pc[[1]], rot.pc[[2]], rot.pc[[3]], rot.pc[[4]], 
#           ncol = 1)

4.1 Discussion on Rotated PCA

  • The rotated EOFs are more localized compared to their unrotated counterparts, making it easier to associate them with specific regional rainfall patterns.

  • The temporal evolution of the rotated PCs remains similar to the unrotated PCs, ensuring that the major modes of variability are preserved but are now more physically interpretable.

5 Correlated PCs with SST fields

Finally, we examine how the rainfall modes are linked to SST anomalies. For each of the first four PCs, we plot the correlation with SST and mark statistically significant regions (p < 0.05).

sst.maps = list()
for (i in 1:4) {
  pc.i          = rain.pca$x[, i]
  pc.title      = paste("Rainfall PC", i, "vs SST (r & p < 0.05)")
  sst.maps[[i]] = plot.fields(pc      = pc.i, 
                              grd.avg = sst.avg,
                              title   = pc.title )
}

ggarrange(sst.maps[[1]], sst.maps[[2]], sst.maps[[3]], sst.maps[[4]], 
          ncol          = 1,
          common.legend = TRUE,
          legend        = "bottom")

SST correlation analysis

\(\to\) PC1: the map shows a strong negative correlation with SST in the central/eastern Pacific, suggesting a link with ENSO events (El Niño/La Niña).

\(\to\) PC2: reveals SST anomalies in the Indian Ocean, hinting at an influence of the Indian Ocean Dipole (IOD).

\(\to\) PC3 & PC4: display mixed and weaker signals, implying that multiple or regional oceanic processes might be at play.

6 Final remarks

  1. PCA Results: the eigenvalue spectrum and EOF maps indicate that the first few modes capture most of the variability. PC1 represents the large-scale monsoon signal, whereas PCs 2–4 represent more regional variations.

  2. Rotated PCA: varimax rotation enhances the physical interpretability of the spatial patterns by localizing them, which is useful for identifying region-specific rainfall behaviors.

  3. SST Correlations: significant correlations between rainfall PCs and SST fields provide evidence of ocean-atmosphere coupling. In particular, the strong association between PC1 and Pacific SSTs reinforces the role of ENSO in modulating monsoon variability.