--- title: "hw2_prob7" author: "Haiying Peng" date: "2025-03-25" output: html_document: default pdf_document: default --- #### 7. For the all India Summer Rainfall with the tropical SST PCs as covariates, fit these models (a) CART (b) Gradient Boosting Tree (c) Random Forest Infer the outputs and compare their relative performances on RMSE, plots of historical vs model rainfall. ##### Read data ```{r,message=FALSE, warning=FALSE} # Load libraries library(maps) library(akima) library(fields) library(RColorBrewer) library(gbm) #### Clear Memory and Set Options rm(list=ls()) options(warn=-1) save <- FALSE # Change to TRUE if you want to save the plot # Set working directory (Modify if necessary) script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path) setwd(script_dir) source("HW2_Library.R") # Color palette myPalette <- colorRampPalette(rev(brewer.pal(9, "RdBu")), space="Lab") ``` ##### Read data ```{r} ### india rainfall data nrows=68 ncols=65 ntime <- 67 #Jun-Sep 1950 - 2016 nyrs <- ntime nglobe <- nrows*ncols ### Lat - Long grid.. #ygrid=seq(6.5,38.5,by=0.25) ygrid=seq(6.5,38.5,by=0.5) ny=length(ygrid) #xgrid=seq(66.5,100,by=0.25) xgrid=seq(66.5,100,by=0.5) nx=length(xgrid) xygrid=matrix(0,nrow=nx*ny,ncol=2) i=0 for(iy in 1:ny){ for(ix in 1:nx){ i=i+1 xygrid[i,1]=ygrid[iy] xygrid[i,2]=xgrid[ix] } } # Read India Gridded Rainfall data.. data=readBin("data/India-Rain-JJAS-05deg-1950-2016.r4",what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap") data <- array(data = data, dim=c( nrows, ncols, ntime ) ) data1=data[,,1] # the lat -long data grid.. index=1:(nx*ny) ## pull only data and the locations with non-missing data index1=index[data1 != "NaN"] # only non-missing data. xygrid1=xygrid[index1,] nsites=length(index1) data2=data1[index1] ### Rain data matrix - rows are seasonal (i.e. one value per year) ## and columns are locations raindata=matrix(NA,nrow=nyrs, ncol=nsites) for(i in 1:nyrs){ data1=data[,,i] index1=index[ data1 != "NaN"] data2=data1[index1] raindata[i,]=data2 } index = 1:dim(raindata)[2] xx = apply(raindata,2,mean) index2 = index1[xx > 0] index3 = index[xx > 0] xygrid1=xygrid[index2,] rainavg = raindata[,index3] indexgrid = index2 rm("data") #remove the object data to clear up space ``` ```{r} #### SST data ## read SST data nrows_sst <- 180 ncols_sst <- 17 ntime = 67 #Jun-Sep 1950 - 2016 nglobe_sst = nrows_sst * ncols_sst ### Lat - Long grid.. ygrid_sst = seq(-16,16,by=2) ny_sst =length(ygrid_sst) xgrid_sst=seq(0,358,by=2) nx_sst=length(xgrid_sst) xygrid_sst=matrix(0,nrow=nx_sst * ny_sst,ncol=2) i=0 for(iy_sst in 1:ny_sst){ for(ix_sst in 1:nx_sst){ i=i+1 xygrid_sst[i,1]=ygrid_sst[iy_sst] xygrid_sst[i,2]=xgrid_sst[ix_sst] } } data_sst=readBin("data/NOAA-Trop-JJAS-SST-1950-2016.r4",what="numeric", n=( nrows_sst * ncols_sst * ntime), size=4,endian="swap") data_sst <- array(data_sst, dim=c( nrows_sst, ncols_sst, ntime ) ) data1_sst=data_sst[,,1] # the lat -long data grid.. index_sst=1:(nx_sst*ny_sst) ## pull only data and the locations with non-missing data index1_sst=index_sst[data1_sst < 20 & data1_sst != "NaN"] # only non-missing data. xygrid1_sst=xygrid_sst[index1_sst,] nsites_sst=length(index1_sst) data2_sst=data1_sst[index1_sst] ### SSTdata matrix - rows are seasonal (i.e. one value per year) ## and columns are locations sstdata=matrix(NA,nrow=ntime, ncol=nsites_sst) for(i in 1:ntime){ data1_sst=data_sst[,,i] index1_sst=index_sst[data1_sst < 20 & data1_sst != "NaN"] data2_sst=data1_sst[index1_sst] sstdata[i,]=data2_sst } sstannavg = sstdata indexgrid_sst = index1_sst rm("data_sst") #remove the object data to clear up space ``` ##### Perform PCA ```{r} ###################### PCA on SST ## #get variance matrix.. ## scale the data sstscale = scale(sstannavg) zs_sst=var(sstscale) #do an Eigen decomposition.. zsvd_sst=svd(zs_sst) #Principal Components... sstpcs=t(t(zsvd_sst$u) %*% t(sstscale)) ``` ##### (a)CART ```{r,message=FALSE, warning=FALSE} N = nrow(rainavg) ngrid = ncol(rainavg) treePred_grid = matrix(rep(0,N),ncol=ngrid,nrow=N) for (j in 1:ngrid) { rain_grid = rainavg[,j] # rainfall at one grid point over time data_j = as.data.frame(cbind(rain_grid,sstpcs[,1:4])) names(data_j) = c("prec", "PC1", "PC2", "PC3", "PC4") #Fit CART model tree.precip = tree(prec ~ PC1+PC2+PC3+PC4, data = data_j, model = T) treePred_grid[,j] <- predict(tree.precip) } ## correlate the forecasted rainfall with the historical precipitation xcor_tree = diag(cor(treePred_grid,rainavg)) xrmse_tree = sqrt(colMeans((treePred_grid - rainavg)^2)) # plot of R^2 between the observed and predicted summer precipitation at each grid point rainlocs = read.table("data/India-rain-locs.txt") xlong = xgrid ylat = ygrid zfull_cor = rep(NaN,nx*ny) zfull_rmse = rep(NaN,nx*ny) zfull_cor[rainlocs[,3]]=xcor_tree zfull_rmse[rainlocs[,3]]=xrmse_tree zmat_cor = matrix(zfull_cor,nrow=nx,ncol=ny) zmat_rmse = matrix(zfull_rmse,nrow=nx,ncol=ny) image.plot(xlong,ylat,zmat_cor,ylim=range(5,40),col=myPalette(200),zlim=c(0.5,0.8)) contour(xlong,ylat,(zmat_cor),ylim=range(5,40),add=TRUE,nlev=6,lwd=2) title(main = bquote(R^2 ~ "between the Observed and Predicted Precipitation (CART)")) maps::map('world',wrap=c(0,360),add=TRUE,resolution=0,lwd=2) grid() image.plot(xlong, ylat, zmat_rmse, ylim = range(5, 40), col = myPalette(200), zlim = c(50, 350)) contour(xlong, ylat, zmat_rmse, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2) title(main = bquote(RMSE ~ "between the Observed and Predicted Precipitation (CART)")) maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2) grid() # Boxplot correlation and RMSE across the grids par(mfrow = c(1, 2)) # Boxplot of correlation boxplot(xcor_tree, main = "Correlation (R2)", col = "skyblue", border = "blue") # Boxplot of RMSE boxplot(xrmse_tree, main = "RMSE", col = "salmon", border = "red") ``` ##### (b) Fit Gradient Boosting Model ```{r} gbmPred_grid = matrix(rep(0,N), ncol=ngrid, nrow=N) for (j in 1:ngrid) { rain_grid = rainavg[,j] # rainfall at one grid point over time data_j = as.data.frame(cbind(rain_grid, sstpcs[,1:4])) names(data_j) = c("prec", "PC1", "PC2", "PC3", "PC4") # Fit Gradient Boosting model gbm.precip = gbm( formula = prec ~ PC1 + PC2 + PC3 + PC4, data = data_j, distribution = "gaussian", n.trees = 100, # number of boosting iterations interaction.depth = 2, # tree depth shrinkage = 0.05, # learning rate n.minobsinnode = 5, # minimum observations per node verbose = FALSE ) # Predict using the fitted GBM model gbmPred_grid[,j] <- predict(gbm.precip, newdata = data_j, n.trees = 100) } ## correlate the forecasted rainfall with the historical precipitation xcor_gbm = diag(cor(gbmPred_grid, rainavg)) xrmse_gbm = sqrt(colMeans((gbmPred_grid - rainavg)^2)) ## Spatial plot of R² and RMSE zfull_cor = rep(NaN, nx * ny) zfull_rmse = rep(NaN, nx * ny) zfull_cor[rainlocs[,3]] = xcor_gbm zfull_rmse[rainlocs[,3]] = xrmse_gbm zmat_cor = matrix(zfull_cor, nrow = nx, ncol = ny) zmat_rmse = matrix(zfull_rmse, nrow = nx, ncol = ny) image.plot(xlong, ylat, zmat_cor, ylim = range(5, 40), col = myPalette(200), zlim = c(0.7, 0.9)) contour(xlong, ylat, zmat_cor, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2) title(main = bquote(R^2 ~ "between the Observed and Predicted Precipitation (GBM)")) maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2) grid() image.plot(xlong, ylat, zmat_rmse, ylim = range(5, 40), col = myPalette(200), zlim = c(50, 350)) contour(xlong, ylat, zmat_rmse, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2) title(main = bquote(RMSE ~ "between the Observed and Predicted Precipitation (GBM)")) maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2) grid() ## Boxplots of correlation and RMSE par(mfrow = c(1, 2)) # Boxplot of correlation boxplot(xcor_gbm, main = "Correlation (R2)", col = "skyblue", border = "blue") # Boxplot of RMSE boxplot(xrmse_gbm, main = "RMSE", col = "salmon", border = "red") ``` ##### (c) Fit Random Forest ```{r,warning=FALSE} RFPred_grid = matrix(rep(0,N), ncol=ngrid, nrow=N) for (j in 1:ngrid) { rain_grid = rainavg[,j] # rainfall at one grid point over time data_j = as.data.frame(cbind(rain_grid, sstpcs[,1:4])) names(data_j) = c("prec", "PC1", "PC2", "PC3", "PC4") # Fit Random forest model forest.precip = randomForest( formula = prec ~ PC1 + PC2 + PC3 + PC4, data = data_j) # Predict using the fitted GBM model RFPred_grid[,j] <- predict(forest.precip, newdata = data_j, n.trees = 100) } ## correlate the forecasted rainfall with the historical precipitation xcor_rf = diag(cor(RFPred_grid, rainavg)) xrmse_rf = sqrt(colMeans((RFPred_grid - rainavg)^2)) ## Spatial plot of R² and RMSE zfull_cor = rep(NaN, nx * ny) zfull_rmse = rep(NaN, nx * ny) zfull_cor[rainlocs[,3]] = xcor_rf zfull_rmse[rainlocs[,3]] = xrmse_rf zmat_cor = matrix(zfull_cor, nrow = nx, ncol = ny) zmat_rmse = matrix(zfull_rmse, nrow = nx, ncol = ny) image.plot(xlong, ylat, zmat_cor, ylim = range(5, 40), col = myPalette(200), zlim = c(0.9, 1.0)) contour(xlong, ylat, zmat_cor, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2) title(main = bquote(R^2 ~ "between the Observed and Predicted Precipitation (Random Forest)")) maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2) grid() image.plot(xlong, ylat, zmat_rmse, ylim = range(5, 40), col = myPalette(200), zlim = c(50, 300)) contour(xlong, ylat, zmat_rmse, ylim = range(5, 40), add = TRUE, nlev = 6, lwd = 2) title(main = bquote(RMSE ~ "between the Observed and Predicted Precipitation (Random Forest)")) maps::map('world', wrap = c(0, 360), add = TRUE, resolution = 0, lwd = 2) grid() ## Boxplots of correlation and RMSE par(mfrow = c(1, 2)) # Boxplot of correlation boxplot(xcor_gbm, main = "Correlation (R2)", col = "skyblue", border = "blue") # Boxplot of RMSE boxplot(xrmse_gbm, main = "RMSE", col = "salmon", border = "red") ``` ```{r} # historical vs model rainfall obs <- as.vector(rainavg) pred_tree <- as.vector(treePred_grid) pred_gbm <- as.vector(gbmPred_grid) pred_rf <- as.vector(RFPred_grid) r2_tree <- round(cor(obs, pred_tree), 2) r2_gbm <- round(cor(obs, pred_gbm), 2) r2_rf <- round(cor(obs, pred_rf), 2) max_val <- max(c(obs, pred_gbm,pred_rf), na.rm = TRUE) par(pin = c(2.9, 2.9)) plot(obs, pred_tree, xlab = "Observed Rainfall (mm)", ylab = "Predicted Rainfall (mm)", main = "Predicted vs Observed Rainfall_CART", xlim = range(0, max_val), ylim = range(0, max_val), pch = 1, cex=0.4) abline(a=0, b=1, lty=1) text(x = max(obs) -1000, y = min(pred_tree) + 100, labels = bquote(R^2 == .(r2_tree)), cex = 1) plot(obs, pred_gbm, xlab = "Observed Rainfall (mm)", ylab = "Predicted Rainfall (mm)", main = "Predicted vs Observed Rainfall_GBM", xlim = c(0, max_val), ylim = c(0, max_val), pch = 1, cex=0.4) abline(a=0, b=1, lty=1) text(x = max(obs) -1000, y = min(pred_tree) + 100, labels = bquote(R^2 == .(r2_gbm)), cex = 1) plot(obs, pred_rf, xlab = "Observed Rainfall (mm)", ylab = "Predicted Rainfall (mm)", main = "Predicted vs Observed Rainfall_RandomForest", xlim = c(0, max_val), ylim = c(0, max_val), pch = 1, cex=0.4) abline(a=0, b=1, lty=1) text(x = max(obs) -1000, y = min(pred_tree) + 100, labels = bquote(R^2 == .(r2_rf)), cex = 1) ``` ```{r} # Summary statistics rmse_stats <- data.frame( Model = c("CART", "GBM", "Random Forest"), Min = c(min(xrmse_tree), min(xrmse_gbm), min(xrmse_rf)), Q1 = c(quantile(xrmse_tree, 0.25), quantile(xrmse_gbm, 0.25), quantile(xrmse_rf, 0.25)), Mean = c(mean(xrmse_tree), mean(xrmse_gbm), mean(xrmse_rf)), Median = c(median(xrmse_tree), median(xrmse_gbm), median(xrmse_rf)), Q3 = c(quantile(xrmse_tree, 0.75), quantile(xrmse_gbm, 0.75), quantile(xrmse_rf, 0.75)), Max = c(max(xrmse_tree), max(xrmse_gbm), max(xrmse_rf)), SDE = c(sd(xrmse_tree), sd(xrmse_gbm), sd(xrmse_rf)) ) print(rmse_stats) ``` #### Conclusion: Relative Performance Based on RMSE Random Forest ≈ GBM > CART CART has a wide RMSE range with several high-error grids.The RMSE spatial map shows large areas with errors exceeding 250 mm, indicating poor model performance in many regions.The CART model struggles to capture complex non-linear relationships in the rainfall–SST linkage, leading to greater prediction error. GBM and Ranfom Forest offer a major accuracy improvement over CART. The lower RMSE and tighter error spread make them strong models for spatial rainfall prediction, particularly in capturing non-linear dependencies in SST–rainfall relationships.