7. For the all India Summer Rainfall with the tropical SST PCs as covariates, fit these models

  1. CART
  2. Gradient Boosting Tree
  3. Random Forest

Infer the outputs and compare their relative performances on RMSE, plots of historical vs model rainfall.

Read data
# Load libraries
library(maps)
library(akima)
library(fields)
library(RColorBrewer)
library(gbm)

#### Clear Memory and Set Options
rm(list=ls())
options(warn=-1)
save <- FALSE  # Change to TRUE if you want to save the plot

# Set working directory (Modify if necessary)
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(script_dir)

source("HW2_Library.R")

# Color palette
myPalette <- colorRampPalette(rev(brewer.pal(9, "RdBu")), space="Lab")
Read data
### india rainfall data
nrows=68
ncols=65

ntime <- 67   #Jun-Sep 1950 - 2016

nyrs <- ntime 

nglobe <- nrows*ncols

### Lat - Long grid..
#ygrid=seq(6.5,38.5,by=0.25)
ygrid=seq(6.5,38.5,by=0.5)
ny=length(ygrid)

#xgrid=seq(66.5,100,by=0.25)
xgrid=seq(66.5,100,by=0.5)
nx=length(xgrid)

xygrid=matrix(0,nrow=nx*ny,ncol=2)

i=0
for(iy in 1:ny){
for(ix in 1:nx){
i=i+1
xygrid[i,1]=ygrid[iy]
xygrid[i,2]=xgrid[ix]
}

}

# Read India Gridded Rainfall data..

data=readBin("data/India-Rain-JJAS-05deg-1950-2016.r4",what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap")


data <- array(data = data, dim=c( nrows, ncols, ntime ) )

data1=data[,,1]
    
# the lat -long data grid..
index=1:(nx*ny)

## pull only data and the locations with non-missing data
index1=index[data1 != "NaN"]    # only non-missing data.
xygrid1=xygrid[index1,]

nsites=length(index1)

data2=data1[index1]

### Rain data matrix - rows are seasonal (i.e. one value per year)
## and columns are locations
raindata=matrix(NA,nrow=nyrs, ncol=nsites)

for(i in 1:nyrs){
data1=data[,,i]
index1=index[ data1 != "NaN"]
data2=data1[index1]
raindata[i,]=data2
}


index = 1:dim(raindata)[2]

xx = apply(raindata,2,mean)
index2 = index1[xx > 0]
index3 = index[xx > 0]

xygrid1=xygrid[index2,]
rainavg = raindata[,index3]

indexgrid = index2
rm("data")  #remove the object data to clear up space
#### SST data
## read SST data
nrows_sst <- 180
ncols_sst <- 17
ntime = 67   #Jun-Sep 1950 - 2016

nglobe_sst = nrows_sst * ncols_sst

### Lat - Long grid..
ygrid_sst = seq(-16,16,by=2)
ny_sst =length(ygrid_sst)

xgrid_sst=seq(0,358,by=2)
nx_sst=length(xgrid_sst)

xygrid_sst=matrix(0,nrow=nx_sst * ny_sst,ncol=2)

i=0
for(iy_sst in 1:ny_sst){
for(ix_sst in 1:nx_sst){
i=i+1
xygrid_sst[i,1]=ygrid_sst[iy_sst]
xygrid_sst[i,2]=xgrid_sst[ix_sst]
}

}
data_sst=readBin("data/NOAA-Trop-JJAS-SST-1950-2016.r4",what="numeric", n=( nrows_sst * ncols_sst * ntime), size=4,endian="swap")

data_sst <- array(data_sst, dim=c( nrows_sst, ncols_sst, ntime ) )

data1_sst=data_sst[,,1]
    

# the lat -long data grid..
index_sst=1:(nx_sst*ny_sst)

## pull only data and the locations with non-missing data
index1_sst=index_sst[data1_sst < 20 & data1_sst != "NaN"]   # only non-missing data.
xygrid1_sst=xygrid_sst[index1_sst,]

nsites_sst=length(index1_sst)

data2_sst=data1_sst[index1_sst]

### SSTdata matrix - rows are seasonal (i.e. one value per year)
## and columns are locations
sstdata=matrix(NA,nrow=ntime, ncol=nsites_sst)


for(i in 1:ntime){
data1_sst=data_sst[,,i]
index1_sst=index_sst[data1_sst < 20 & data1_sst != "NaN"]
data2_sst=data1_sst[index1_sst]
sstdata[i,]=data2_sst
}

sstannavg = sstdata
indexgrid_sst = index1_sst
rm("data_sst")  #remove the object data to clear up space
Perform PCA
###################### PCA on SST
##

#get variance matrix..
## scale the data
sstscale = scale(sstannavg)
zs_sst=var(sstscale)

#do an Eigen decomposition..
zsvd_sst=svd(zs_sst)

#Principal Components...
sstpcs=t(t(zsvd_sst$u) %*% t(sstscale))
(a)CART
N = nrow(rainavg)
ngrid = ncol(rainavg)

treePred_grid = matrix(rep(0,N),ncol=ngrid,nrow=N)
for (j in 1:ngrid) {
  rain_grid = rainavg[,j] # rainfall at one grid point over time

  data_j = as.data.frame(cbind(rain_grid,sstpcs[,1:4]))
  names(data_j) = c("prec", "PC1", "PC2", "PC3", "PC4")
  
  #Fit CART model
  tree.precip = tree(prec ~ PC1+PC2+PC3+PC4, data = data_j, model = T)
  
  treePred_grid[,j] <- predict(tree.precip)

}

## correlate the forecasted rainfall with the historical precipitation
xcor_tree = diag(cor(treePred_grid,rainavg))
xrmse_tree = sqrt(colMeans((treePred_grid - rainavg)^2))

# plot of R^2 between the observed and predicted summer precipitation at each grid point
rainlocs = read.table("data/India-rain-locs.txt")
xlong = xgrid
ylat = ygrid

  zfull_cor = rep(NaN,nx*ny)
  zfull_rmse = rep(NaN,nx*ny)
  zfull_cor[rainlocs[,3]]=xcor_tree
  zfull_rmse[rainlocs[,3]]=xrmse_tree
  zmat_cor = matrix(zfull_cor,nrow=nx,ncol=ny)
  zmat_rmse = matrix(zfull_rmse,nrow=nx,ncol=ny)
 
  image.plot(xlong,ylat,zmat_cor,ylim=range(5,40),col=myPalette(200),zlim=c(0.5,0.8))
  contour(xlong,ylat,(zmat_cor),ylim=range(5,40),add=TRUE,nlev=6,lwd=2)
  title(main = bquote(R^2 ~ "between the Observed and Predicted Precipitation (CART)"))
  maps::map('world',wrap=c(0,360),add=TRUE,resolution=0,lwd=2)
 grid()

 image.plot(xlong, ylat, zmat_rmse, ylim = range(5, 40), col = myPalette(200), zlim = c(50, 350))
contour(xlong, ylat, zmat_rmse, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2)
title(main = bquote(RMSE ~ "between the Observed and Predicted Precipitation (CART)"))
maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2)
grid()

 # Boxplot correlation and RMSE across the grids
par(mfrow = c(1, 2))
# Boxplot of correlation
boxplot(xcor_tree,
        main = "Correlation (R2)",
        col = "skyblue",
        border = "blue")

# Boxplot of RMSE
boxplot(xrmse_tree,
        main = "RMSE",
        col = "salmon",
        border = "red")

(b) Fit Gradient Boosting Model
gbmPred_grid = matrix(rep(0,N), ncol=ngrid, nrow=N)

for (j in 1:ngrid) {
  rain_grid = rainavg[,j]  # rainfall at one grid point over time

  data_j = as.data.frame(cbind(rain_grid, sstpcs[,1:4]))
  names(data_j) = c("prec", "PC1", "PC2", "PC3", "PC4")
  
  # Fit Gradient Boosting model
  gbm.precip = gbm(
    formula = prec ~ PC1 + PC2 + PC3 + PC4,
    data = data_j,
    distribution = "gaussian",
    n.trees = 100,         # number of boosting iterations
    interaction.depth = 2, # tree depth
    shrinkage = 0.05,      # learning rate
    n.minobsinnode = 5,    # minimum observations per node
    verbose = FALSE
  )
  
  # Predict using the fitted GBM model
  gbmPred_grid[,j] <- predict(gbm.precip, newdata = data_j, n.trees = 100)
}
## correlate the forecasted rainfall with the historical precipitation
xcor_gbm = diag(cor(gbmPred_grid, rainavg))
xrmse_gbm = sqrt(colMeans((gbmPred_grid - rainavg)^2))

## Spatial plot of R² and RMSE
zfull_cor = rep(NaN, nx * ny)
zfull_rmse = rep(NaN, nx * ny)
zfull_cor[rainlocs[,3]] = xcor_gbm
zfull_rmse[rainlocs[,3]] = xrmse_gbm
zmat_cor = matrix(zfull_cor, nrow = nx, ncol = ny)
zmat_rmse = matrix(zfull_rmse, nrow = nx, ncol = ny)

image.plot(xlong, ylat, zmat_cor, ylim = range(5, 40), col = myPalette(200), zlim = c(0.7, 0.9))
contour(xlong, ylat, zmat_cor, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2)
title(main = bquote(R^2 ~ "between the Observed and Predicted Precipitation (GBM)"))
maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2)
grid()

image.plot(xlong, ylat, zmat_rmse, ylim = range(5, 40), col = myPalette(200), zlim = c(50, 350))
contour(xlong, ylat, zmat_rmse, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2)
title(main = bquote(RMSE ~ "between the Observed and Predicted Precipitation (GBM)"))
maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2)
grid()

## Boxplots of correlation and RMSE
par(mfrow = c(1, 2))
# Boxplot of correlation
boxplot(xcor_gbm,
        main = "Correlation (R2)",
        col = "skyblue",
        border = "blue")

# Boxplot of RMSE
boxplot(xrmse_gbm,
        main = "RMSE",
        col = "salmon",
        border = "red")

(c) Fit Random Forest
RFPred_grid = matrix(rep(0,N), ncol=ngrid, nrow=N)

for (j in 1:ngrid) {
  rain_grid = rainavg[,j]  # rainfall at one grid point over time

  data_j = as.data.frame(cbind(rain_grid, sstpcs[,1:4]))
  names(data_j) = c("prec", "PC1", "PC2", "PC3", "PC4")
  
  # Fit Random forest model
  forest.precip = randomForest(
    formula = prec ~ PC1 + PC2 + PC3 + PC4,
    data = data_j)
  
  # Predict using the fitted GBM model
  RFPred_grid[,j] <- predict(forest.precip, newdata = data_j, n.trees = 100)
}
## correlate the forecasted rainfall with the historical precipitation
xcor_rf = diag(cor(RFPred_grid, rainavg))
xrmse_rf = sqrt(colMeans((RFPred_grid - rainavg)^2))

## Spatial plot of R² and RMSE
zfull_cor = rep(NaN, nx * ny)
zfull_rmse = rep(NaN, nx * ny)
zfull_cor[rainlocs[,3]] = xcor_rf
zfull_rmse[rainlocs[,3]] = xrmse_rf
zmat_cor = matrix(zfull_cor, nrow = nx, ncol = ny)
zmat_rmse = matrix(zfull_rmse, nrow = nx, ncol = ny)

image.plot(xlong, ylat, zmat_cor, ylim = range(5, 40), col = myPalette(200), zlim = c(0.9, 1.0))
contour(xlong, ylat, zmat_cor, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2)
title(main = bquote(R^2 ~ "between the Observed and Predicted Precipitation (Random Forest)"))
maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2)
grid()

image.plot(xlong, ylat, zmat_rmse, ylim = range(5, 40), col = myPalette(200), zlim = c(50, 300))
contour(xlong, ylat, zmat_rmse, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2)
title(main = bquote(RMSE ~ "between the Observed and Predicted Precipitation (Random Forest)"))
maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2)
grid()

## Boxplots of correlation and RMSE
par(mfrow = c(1, 2))
# Boxplot of correlation
boxplot(xcor_gbm,
        main = "Correlation (R2)",
        col = "skyblue",
        border = "blue")

# Boxplot of RMSE
boxplot(xrmse_gbm,
        main = "RMSE",
        col = "salmon",
        border = "red")

# historical vs model rainfall
obs <- as.vector(rainavg)
pred_tree <- as.vector(treePred_grid)
pred_gbm <- as.vector(gbmPred_grid)
pred_rf <- as.vector(RFPred_grid)

r2_tree <- round(cor(obs, pred_tree), 2)
r2_gbm <- round(cor(obs, pred_gbm), 2)
r2_rf <- round(cor(obs, pred_rf), 2)

max_val <- max(c(obs, pred_gbm,pred_rf), na.rm = TRUE)

par(pin = c(2.9, 2.9))

plot(obs, pred_tree,
     xlab = "Observed Rainfall (mm)",
     ylab = "Predicted Rainfall (mm)",
     main = "Predicted vs Observed Rainfall_CART",
     xlim = range(0, max_val),
     ylim = range(0, max_val),
     pch = 1, cex=0.4)

abline(a=0, b=1, lty=1)
text(x = max(obs) -1000, y = min(pred_tree) + 100,
     labels = bquote(R^2 == .(r2_tree)),
     cex = 1)

plot(obs, pred_gbm,
     xlab = "Observed Rainfall (mm)",
     ylab = "Predicted Rainfall (mm)",
     main = "Predicted vs Observed Rainfall_GBM",
     xlim = c(0, max_val),
     ylim = c(0, max_val),
     pch = 1, cex=0.4)

abline(a=0, b=1, lty=1)
text(x = max(obs) -1000, y = min(pred_tree) + 100,
     labels = bquote(R^2 == .(r2_gbm)),
     cex = 1)

plot(obs, pred_rf,
     xlab = "Observed Rainfall (mm)",
     ylab = "Predicted Rainfall (mm)",
     main = "Predicted vs Observed Rainfall_RandomForest",
     xlim = c(0, max_val),
     ylim = c(0, max_val),
     pch = 1, cex=0.4)

abline(a=0, b=1, lty=1)
text(x = max(obs) -1000, y = min(pred_tree) + 100,
     labels = bquote(R^2 == .(r2_rf)),
     cex = 1)

# Summary statistics

rmse_stats <- data.frame(
  Model = c("CART", "GBM", "Random Forest"),
  Min = c(min(xrmse_tree), min(xrmse_gbm), min(xrmse_rf)),
  Q1 = c(quantile(xrmse_tree, 0.25), quantile(xrmse_gbm, 0.25), quantile(xrmse_rf, 0.25)),
  Mean = c(mean(xrmse_tree), mean(xrmse_gbm), mean(xrmse_rf)),
  Median = c(median(xrmse_tree), median(xrmse_gbm), median(xrmse_rf)),
  Q3 = c(quantile(xrmse_tree, 0.75), quantile(xrmse_gbm, 0.75), quantile(xrmse_rf, 0.75)),
  Max = c(max(xrmse_tree), max(xrmse_gbm), max(xrmse_rf)),
  SDE = c(sd(xrmse_tree), sd(xrmse_gbm), sd(xrmse_rf))
)
print(rmse_stats)
##           Model      Min        Q1     Mean   Median       Q3      Max
## 1          CART 52.83186 133.86084 205.4350 176.2781 223.5459 1603.128
## 2           GBM 56.14803 128.00595 196.1523 169.1798 211.9266 1549.896
## 3 Random Forest 40.36190  93.90475 144.2085 124.3787 155.0375 1130.963
##         SDE
## 1 133.01727
## 2 126.71570
## 3  93.74521

Conclusion: Relative Performance Based on RMSE

Random Forest ≈ GBM > CART CART has a wide RMSE range with several high-error grids.The RMSE spatial map shows large areas with errors exceeding 250 mm, indicating poor model performance in many regions.The CART model struggles to capture complex non-linear relationships in the rainfall–SST linkage, leading to greater prediction error.

GBM and Ranfom Forest offer a major accuracy improvement over CART. The lower RMSE and tighter error spread make them strong models for spatial rainfall prediction, particularly in capturing non-linear dependencies in SST–rainfall relationships.