Infer the outputs and compare their relative performances on RMSE, plots of historical vs model rainfall.
# Load libraries
library(maps)
library(akima)
library(fields)
library(RColorBrewer)
library(gbm)
#### Clear Memory and Set Options
rm(list=ls())
options(warn=-1)
save <- FALSE # Change to TRUE if you want to save the plot
# Set working directory (Modify if necessary)
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(script_dir)
source("HW2_Library.R")
# Color palette
myPalette <- colorRampPalette(rev(brewer.pal(9, "RdBu")), space="Lab")
### india rainfall data
nrows=68
ncols=65
ntime <- 67 #Jun-Sep 1950 - 2016
nyrs <- ntime
nglobe <- nrows*ncols
### Lat - Long grid..
#ygrid=seq(6.5,38.5,by=0.25)
ygrid=seq(6.5,38.5,by=0.5)
ny=length(ygrid)
#xgrid=seq(66.5,100,by=0.25)
xgrid=seq(66.5,100,by=0.5)
nx=length(xgrid)
xygrid=matrix(0,nrow=nx*ny,ncol=2)
i=0
for(iy in 1:ny){
for(ix in 1:nx){
i=i+1
xygrid[i,1]=ygrid[iy]
xygrid[i,2]=xgrid[ix]
}
}
# Read India Gridded Rainfall data..
data=readBin("data/India-Rain-JJAS-05deg-1950-2016.r4",what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap")
data <- array(data = data, dim=c( nrows, ncols, ntime ) )
data1=data[,,1]
# the lat -long data grid..
index=1:(nx*ny)
## pull only data and the locations with non-missing data
index1=index[data1 != "NaN"] # only non-missing data.
xygrid1=xygrid[index1,]
nsites=length(index1)
data2=data1[index1]
### Rain data matrix - rows are seasonal (i.e. one value per year)
## and columns are locations
raindata=matrix(NA,nrow=nyrs, ncol=nsites)
for(i in 1:nyrs){
data1=data[,,i]
index1=index[ data1 != "NaN"]
data2=data1[index1]
raindata[i,]=data2
}
index = 1:dim(raindata)[2]
xx = apply(raindata,2,mean)
index2 = index1[xx > 0]
index3 = index[xx > 0]
xygrid1=xygrid[index2,]
rainavg = raindata[,index3]
indexgrid = index2
rm("data") #remove the object data to clear up space
#### SST data
## read SST data
nrows_sst <- 180
ncols_sst <- 17
ntime = 67 #Jun-Sep 1950 - 2016
nglobe_sst = nrows_sst * ncols_sst
### Lat - Long grid..
ygrid_sst = seq(-16,16,by=2)
ny_sst =length(ygrid_sst)
xgrid_sst=seq(0,358,by=2)
nx_sst=length(xgrid_sst)
xygrid_sst=matrix(0,nrow=nx_sst * ny_sst,ncol=2)
i=0
for(iy_sst in 1:ny_sst){
for(ix_sst in 1:nx_sst){
i=i+1
xygrid_sst[i,1]=ygrid_sst[iy_sst]
xygrid_sst[i,2]=xgrid_sst[ix_sst]
}
}
data_sst=readBin("data/NOAA-Trop-JJAS-SST-1950-2016.r4",what="numeric", n=( nrows_sst * ncols_sst * ntime), size=4,endian="swap")
data_sst <- array(data_sst, dim=c( nrows_sst, ncols_sst, ntime ) )
data1_sst=data_sst[,,1]
# the lat -long data grid..
index_sst=1:(nx_sst*ny_sst)
## pull only data and the locations with non-missing data
index1_sst=index_sst[data1_sst < 20 & data1_sst != "NaN"] # only non-missing data.
xygrid1_sst=xygrid_sst[index1_sst,]
nsites_sst=length(index1_sst)
data2_sst=data1_sst[index1_sst]
### SSTdata matrix - rows are seasonal (i.e. one value per year)
## and columns are locations
sstdata=matrix(NA,nrow=ntime, ncol=nsites_sst)
for(i in 1:ntime){
data1_sst=data_sst[,,i]
index1_sst=index_sst[data1_sst < 20 & data1_sst != "NaN"]
data2_sst=data1_sst[index1_sst]
sstdata[i,]=data2_sst
}
sstannavg = sstdata
indexgrid_sst = index1_sst
rm("data_sst") #remove the object data to clear up space
###################### PCA on SST
##
#get variance matrix..
## scale the data
sstscale = scale(sstannavg)
zs_sst=var(sstscale)
#do an Eigen decomposition..
zsvd_sst=svd(zs_sst)
#Principal Components...
sstpcs=t(t(zsvd_sst$u) %*% t(sstscale))
N = nrow(rainavg)
ngrid = ncol(rainavg)
treePred_grid = matrix(rep(0,N),ncol=ngrid,nrow=N)
for (j in 1:ngrid) {
rain_grid = rainavg[,j] # rainfall at one grid point over time
data_j = as.data.frame(cbind(rain_grid,sstpcs[,1:4]))
names(data_j) = c("prec", "PC1", "PC2", "PC3", "PC4")
#Fit CART model
tree.precip = tree(prec ~ PC1+PC2+PC3+PC4, data = data_j, model = T)
treePred_grid[,j] <- predict(tree.precip)
}
## correlate the forecasted rainfall with the historical precipitation
xcor_tree = diag(cor(treePred_grid,rainavg))
xrmse_tree = sqrt(colMeans((treePred_grid - rainavg)^2))
# plot of R^2 between the observed and predicted summer precipitation at each grid point
rainlocs = read.table("data/India-rain-locs.txt")
xlong = xgrid
ylat = ygrid
zfull_cor = rep(NaN,nx*ny)
zfull_rmse = rep(NaN,nx*ny)
zfull_cor[rainlocs[,3]]=xcor_tree
zfull_rmse[rainlocs[,3]]=xrmse_tree
zmat_cor = matrix(zfull_cor,nrow=nx,ncol=ny)
zmat_rmse = matrix(zfull_rmse,nrow=nx,ncol=ny)
image.plot(xlong,ylat,zmat_cor,ylim=range(5,40),col=myPalette(200),zlim=c(0.5,0.8))
contour(xlong,ylat,(zmat_cor),ylim=range(5,40),add=TRUE,nlev=6,lwd=2)
title(main = bquote(R^2 ~ "between the Observed and Predicted Precipitation (CART)"))
maps::map('world',wrap=c(0,360),add=TRUE,resolution=0,lwd=2)
grid()
image.plot(xlong, ylat, zmat_rmse, ylim = range(5, 40), col = myPalette(200), zlim = c(50, 350))
contour(xlong, ylat, zmat_rmse, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2)
title(main = bquote(RMSE ~ "between the Observed and Predicted Precipitation (CART)"))
maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2)
grid()
# Boxplot correlation and RMSE across the grids
par(mfrow = c(1, 2))
# Boxplot of correlation
boxplot(xcor_tree,
main = "Correlation (R2)",
col = "skyblue",
border = "blue")
# Boxplot of RMSE
boxplot(xrmse_tree,
main = "RMSE",
col = "salmon",
border = "red")
gbmPred_grid = matrix(rep(0,N), ncol=ngrid, nrow=N)
for (j in 1:ngrid) {
rain_grid = rainavg[,j] # rainfall at one grid point over time
data_j = as.data.frame(cbind(rain_grid, sstpcs[,1:4]))
names(data_j) = c("prec", "PC1", "PC2", "PC3", "PC4")
# Fit Gradient Boosting model
gbm.precip = gbm(
formula = prec ~ PC1 + PC2 + PC3 + PC4,
data = data_j,
distribution = "gaussian",
n.trees = 100, # number of boosting iterations
interaction.depth = 2, # tree depth
shrinkage = 0.05, # learning rate
n.minobsinnode = 5, # minimum observations per node
verbose = FALSE
)
# Predict using the fitted GBM model
gbmPred_grid[,j] <- predict(gbm.precip, newdata = data_j, n.trees = 100)
}
## correlate the forecasted rainfall with the historical precipitation
xcor_gbm = diag(cor(gbmPred_grid, rainavg))
xrmse_gbm = sqrt(colMeans((gbmPred_grid - rainavg)^2))
## Spatial plot of R² and RMSE
zfull_cor = rep(NaN, nx * ny)
zfull_rmse = rep(NaN, nx * ny)
zfull_cor[rainlocs[,3]] = xcor_gbm
zfull_rmse[rainlocs[,3]] = xrmse_gbm
zmat_cor = matrix(zfull_cor, nrow = nx, ncol = ny)
zmat_rmse = matrix(zfull_rmse, nrow = nx, ncol = ny)
image.plot(xlong, ylat, zmat_cor, ylim = range(5, 40), col = myPalette(200), zlim = c(0.7, 0.9))
contour(xlong, ylat, zmat_cor, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2)
title(main = bquote(R^2 ~ "between the Observed and Predicted Precipitation (GBM)"))
maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2)
grid()
image.plot(xlong, ylat, zmat_rmse, ylim = range(5, 40), col = myPalette(200), zlim = c(50, 350))
contour(xlong, ylat, zmat_rmse, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2)
title(main = bquote(RMSE ~ "between the Observed and Predicted Precipitation (GBM)"))
maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2)
grid()
## Boxplots of correlation and RMSE
par(mfrow = c(1, 2))
# Boxplot of correlation
boxplot(xcor_gbm,
main = "Correlation (R2)",
col = "skyblue",
border = "blue")
# Boxplot of RMSE
boxplot(xrmse_gbm,
main = "RMSE",
col = "salmon",
border = "red")
RFPred_grid = matrix(rep(0,N), ncol=ngrid, nrow=N)
for (j in 1:ngrid) {
rain_grid = rainavg[,j] # rainfall at one grid point over time
data_j = as.data.frame(cbind(rain_grid, sstpcs[,1:4]))
names(data_j) = c("prec", "PC1", "PC2", "PC3", "PC4")
# Fit Random forest model
forest.precip = randomForest(
formula = prec ~ PC1 + PC2 + PC3 + PC4,
data = data_j)
# Predict using the fitted GBM model
RFPred_grid[,j] <- predict(forest.precip, newdata = data_j, n.trees = 100)
}
## correlate the forecasted rainfall with the historical precipitation
xcor_rf = diag(cor(RFPred_grid, rainavg))
xrmse_rf = sqrt(colMeans((RFPred_grid - rainavg)^2))
## Spatial plot of R² and RMSE
zfull_cor = rep(NaN, nx * ny)
zfull_rmse = rep(NaN, nx * ny)
zfull_cor[rainlocs[,3]] = xcor_rf
zfull_rmse[rainlocs[,3]] = xrmse_rf
zmat_cor = matrix(zfull_cor, nrow = nx, ncol = ny)
zmat_rmse = matrix(zfull_rmse, nrow = nx, ncol = ny)
image.plot(xlong, ylat, zmat_cor, ylim = range(5, 40), col = myPalette(200), zlim = c(0.9, 1.0))
contour(xlong, ylat, zmat_cor, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2)
title(main = bquote(R^2 ~ "between the Observed and Predicted Precipitation (Random Forest)"))
maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2)
grid()
image.plot(xlong, ylat, zmat_rmse, ylim = range(5, 40), col = myPalette(200), zlim = c(50, 300))
contour(xlong, ylat, zmat_rmse, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2)
title(main = bquote(RMSE ~ "between the Observed and Predicted Precipitation (Random Forest)"))
maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2)
grid()
## Boxplots of correlation and RMSE
par(mfrow = c(1, 2))
# Boxplot of correlation
boxplot(xcor_gbm,
main = "Correlation (R2)",
col = "skyblue",
border = "blue")
# Boxplot of RMSE
boxplot(xrmse_gbm,
main = "RMSE",
col = "salmon",
border = "red")
# historical vs model rainfall
obs <- as.vector(rainavg)
pred_tree <- as.vector(treePred_grid)
pred_gbm <- as.vector(gbmPred_grid)
pred_rf <- as.vector(RFPred_grid)
r2_tree <- round(cor(obs, pred_tree), 2)
r2_gbm <- round(cor(obs, pred_gbm), 2)
r2_rf <- round(cor(obs, pred_rf), 2)
max_val <- max(c(obs, pred_gbm,pred_rf), na.rm = TRUE)
par(pin = c(2.9, 2.9))
plot(obs, pred_tree,
xlab = "Observed Rainfall (mm)",
ylab = "Predicted Rainfall (mm)",
main = "Predicted vs Observed Rainfall_CART",
xlim = range(0, max_val),
ylim = range(0, max_val),
pch = 1, cex=0.4)
abline(a=0, b=1, lty=1)
text(x = max(obs) -1000, y = min(pred_tree) + 100,
labels = bquote(R^2 == .(r2_tree)),
cex = 1)
plot(obs, pred_gbm,
xlab = "Observed Rainfall (mm)",
ylab = "Predicted Rainfall (mm)",
main = "Predicted vs Observed Rainfall_GBM",
xlim = c(0, max_val),
ylim = c(0, max_val),
pch = 1, cex=0.4)
abline(a=0, b=1, lty=1)
text(x = max(obs) -1000, y = min(pred_tree) + 100,
labels = bquote(R^2 == .(r2_gbm)),
cex = 1)
plot(obs, pred_rf,
xlab = "Observed Rainfall (mm)",
ylab = "Predicted Rainfall (mm)",
main = "Predicted vs Observed Rainfall_RandomForest",
xlim = c(0, max_val),
ylim = c(0, max_val),
pch = 1, cex=0.4)
abline(a=0, b=1, lty=1)
text(x = max(obs) -1000, y = min(pred_tree) + 100,
labels = bquote(R^2 == .(r2_rf)),
cex = 1)
# Summary statistics
rmse_stats <- data.frame(
Model = c("CART", "GBM", "Random Forest"),
Min = c(min(xrmse_tree), min(xrmse_gbm), min(xrmse_rf)),
Q1 = c(quantile(xrmse_tree, 0.25), quantile(xrmse_gbm, 0.25), quantile(xrmse_rf, 0.25)),
Mean = c(mean(xrmse_tree), mean(xrmse_gbm), mean(xrmse_rf)),
Median = c(median(xrmse_tree), median(xrmse_gbm), median(xrmse_rf)),
Q3 = c(quantile(xrmse_tree, 0.75), quantile(xrmse_gbm, 0.75), quantile(xrmse_rf, 0.75)),
Max = c(max(xrmse_tree), max(xrmse_gbm), max(xrmse_rf)),
SDE = c(sd(xrmse_tree), sd(xrmse_gbm), sd(xrmse_rf))
)
print(rmse_stats)
## Model Min Q1 Mean Median Q3 Max
## 1 CART 52.83186 133.86084 205.4350 176.2781 223.5459 1603.128
## 2 GBM 56.14803 128.00595 196.1523 169.1798 211.9266 1549.896
## 3 Random Forest 40.36190 93.90475 144.2085 124.3787 155.0375 1130.963
## SDE
## 1 133.01727
## 2 126.71570
## 3 93.74521
Random Forest ≈ GBM > CART CART has a wide RMSE range with several high-error grids.The RMSE spatial map shows large areas with errors exceeding 250 mm, indicating poor model performance in many regions.The CART model struggles to capture complex non-linear relationships in the rainfall–SST linkage, leading to greater prediction error.
GBM and Ranfom Forest offer a major accuracy improvement over CART. The lower RMSE and tighter error spread make them strong models for spatial rainfall prediction, particularly in capturing non-linear dependencies in SST–rainfall relationships.