--- title: "hw2_prob8" author: "Haiying Peng" date: "2025-03-25" output: html_document: default pdf_document: default --- #### CART and Random Forest – Multivariate Forecasting 8. Fit a CART model to the leading 4 PCs of summer season rainfall over India, using the 4 - 5 leading PCs of summer global tropical SSTs as covariates. -You will fit a CART model for each PC separately -with the CART estimates of the PCs, estimate the ‘model precipitation’ at all the locations by multiplying the PCs estimates with Eigen Vectors (a) Compare the CART model precipitation with historic. Boxplot correlation and RMSE across the grids (b) Repeat (a) above with a random forest model Compare and summarize your findings. ```{r} # Load libraries library(maps) library(akima) library(fields) library(RColorBrewer) #### Clear Memory and Set Options rm(list=ls()) options(warn=-1) save <- FALSE # Change to TRUE if you want to save the plot # Set working directory (Modify if necessary) script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path) setwd(script_dir) source("HW2_Library.R") # Color palette myPalette <- colorRampPalette(rev(brewer.pal(9, "RdBu")), space="Lab") ``` ##### Read data ```{r} ### india rainfall data nrows=68 ncols=65 ntime <- 67 #Jun-Sep 1950 - 2016 nyrs <- ntime nglobe <- nrows*ncols ### Lat - Long grid.. #ygrid=seq(6.5,38.5,by=0.25) ygrid=seq(6.5,38.5,by=0.5) ny=length(ygrid) #xgrid=seq(66.5,100,by=0.25) xgrid=seq(66.5,100,by=0.5) nx=length(xgrid) xygrid=matrix(0,nrow=nx*ny,ncol=2) i=0 for(iy in 1:ny){ for(ix in 1:nx){ i=i+1 xygrid[i,1]=ygrid[iy] xygrid[i,2]=xgrid[ix] } } # Read India Gridded Rainfall data.. data=readBin("data/India-Rain-JJAS-05deg-1950-2016.r4",what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap") data <- array(data = data, dim=c( nrows, ncols, ntime ) ) data1=data[,,1] # the lat -long data grid.. index=1:(nx*ny) ## pull only data and the locations with non-missing data index1=index[data1 != "NaN"] # only non-missing data. xygrid1=xygrid[index1,] nsites=length(index1) data2=data1[index1] ### Rain data matrix - rows are seasonal (i.e. one value per year) ## and columns are locations raindata=matrix(NA,nrow=nyrs, ncol=nsites) for(i in 1:nyrs){ data1=data[,,i] index1=index[ data1 != "NaN"] data2=data1[index1] raindata[i,]=data2 } index = 1:dim(raindata)[2] xx = apply(raindata,2,mean) index2 = index1[xx > 0] index3 = index[xx > 0] xygrid1=xygrid[index2,] rainavg = raindata[,index3] indexgrid = index2 rm("data") #remove the object data to clear up space ``` ```{r} #### SST data ## read SST data nrows_sst <- 180 ncols_sst <- 17 ntime = 67 #Jun-Sep 1950 - 2016 nglobe_sst = nrows_sst * ncols_sst ### Lat - Long grid.. ygrid_sst = seq(-16,16,by=2) ny_sst =length(ygrid_sst) xgrid_sst=seq(0,358,by=2) nx_sst=length(xgrid_sst) xygrid_sst=matrix(0,nrow=nx_sst * ny_sst,ncol=2) i=0 for(iy_sst in 1:ny_sst){ for(ix_sst in 1:nx_sst){ i=i+1 xygrid_sst[i,1]=ygrid_sst[iy_sst] xygrid_sst[i,2]=xgrid_sst[ix_sst] } } data_sst=readBin("data/NOAA-Trop-JJAS-SST-1950-2016.r4",what="numeric", n=( nrows_sst * ncols_sst * ntime), size=4,endian="swap") data_sst <- array(data_sst, dim=c( nrows_sst, ncols_sst, ntime ) ) data1_sst=data_sst[,,1] # the lat -long data grid.. index_sst=1:(nx_sst*ny_sst) ## pull only data and the locations with non-missing data index1_sst=index_sst[data1_sst < 20 & data1_sst != "NaN"] # only non-missing data. xygrid1_sst=xygrid_sst[index1_sst,] nsites_sst=length(index1_sst) data2_sst=data1_sst[index1_sst] ### SSTdata matrix - rows are seasonal (i.e. one value per year) ## and columns are locations sstdata=matrix(NA,nrow=ntime, ncol=nsites_sst) for(i in 1:ntime){ data1_sst=data_sst[,,i] index1_sst=index_sst[data1_sst < 20 & data1_sst != "NaN"] data2_sst=data1_sst[index1_sst] sstdata[i,]=data2_sst } sstannavg = sstdata indexgrid_sst = index1_sst rm("data_sst") #remove the object data to clear up space ``` ##### Perform PCA ```{r} ###################### PCA on india rainfall ## # Standardize and perform PCA rainscale = scale(rainavg) zs=var(rainscale) #do an Eigen decomposition.. zsvd=svd(zs) #Principal Components... rainpcs=t(t(zsvd$u) %*% t(rainscale)) #Eigen Vectors prec_eof=zsvd$u ###################### PCA on SST ## #get variance matrix.. ## scale the data sstscale = scale(sstannavg) zs_sst=var(sstscale) #do an Eigen decomposition.. zsvd_sst=svd(zs_sst) #Principal Components... sstpcs=t(t(zsvd_sst$u) %*% t(sstscale)) ``` ##### Fit a CART model to the leading 4 PCs of summer season rainfall over India, using the 4 - 5 leading PCs of summer global tropical SSTs as covariates ```{r,message=FALSE, warning=FALSE} npc = 4 N = nrow(rainpcs) treePred = matrix(rep(0,N),ncol=npc,nrow=N) for (i in 1:npc) { data1 = as.data.frame(cbind(rainpcs[,i],sstpcs[,1:4])) names(data1) = c("prec", "PC1", "PC2", "PC3", "PC4") tree.precip = tree(prec ~ PC1+PC2+PC3+PC4, data = data1, model = T) treeFit = treeFun(tree.precip, toPlot = T, title = paste("PC", i, sep = " ")) treePred[,i] <- predict(treeFit) # Drop 10% cross-validation and boxplot correlation and RMSE myRange <- range(treePred, rainpcs[,i]) skill_tree = c(sqrt(mean((rainpcs[,i]-treePred)^2)),cor(rainpcs[,i], treePred)) names(skill_tree)=c("rmse","Correlation") Skill = Drop_10_pred(data1,treeFit$besTree,"tree") f1=ggplot() + geom_boxplot(mapping = aes(x = "Correlation", Skill[,2])) + geom_point(mapping = aes(x = "Correlation", y = skill_tree[2]), color = "red", size = 3)+ labs(title = "", x = "",y = "")+ theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5)) f2=ggplot() + geom_boxplot(mapping = aes(x = "RMSE", Skill[,1])) + geom_point(mapping = aes(x = "RMSE", y = skill_tree[1]), color = "red", size = 3)+ labs(title="",x = "",y = "") multiplot(f1, f2, cols = 2) } ``` ##### With the CART estimates of the PCs, estimate the ‘model precipitation’ at all locations by multiplying the PC estimates with Eigen Vectors ```{r,message=FALSE, warning=FALSE} N1 = ncol(rainpcs) - npc prec_pcs_pred = cbind(treePred,matrix(rep(0,N),ncol=N1,nrow=N)) prec_eof <- zsvd$u #Eigen Vectors ### Keep only the first npc Eigen Vectors and set rest to zero E = matrix(0,nrow=dim(rainscale)[2],ncol=dim(rainscale)[2]) E[,1:npc]=prec_eof[,1:npc] ## back transform to get the winter precipitation field prec_pred = prec_pcs_pred %*% t(E) ### rescale the winter precipitation precMean = apply(raindata,2,mean) precSd = apply(raindata,2,sd) prec_pred = t(t(prec_pred)*precSd + precMean) ``` #### (a) Compare the CART model precipitation with historic. Boxplot correlation and RMSE across the grids ```{r} ## correlate the forecasted rainfall with the historical precipitation xcor = diag(cor(prec_pred,rainavg)) xrmse = sqrt(colMeans((prec_pred - rainavg)^2)) # plot of R^2 between the observed and predicted summer precipitation at each grid point rainlocs = read.table("data/India-rain-locs.txt") xlong = xgrid ylat = ygrid zfull = rep(NaN,nx*ny) # zfull[rainlocs[,3]]=xcor zmat = matrix(zfull,nrow=nx,ncol=ny) image.plot(xlong,ylat,zmat,ylim=range(5,40),col=myPalette(200),zlim=c(-0.5,0.5)) contour(xlong,ylat,(zmat),ylim=range(5,40),add=TRUE,nlev=3,lwd=2) title(main = bquote(R^2 ~ "between the Observed and Predicted Precipitation")) maps::map('world',wrap=c(0,360),add=TRUE,resolution=0,lwd=2) grid() # Boxplot correlation and RMSE across the grids par(mfrow = c(1, 2)) # Boxplot of correlation boxplot(xcor, main = "Correlation (R2)", col = "skyblue", border = "blue") # Boxplot of RMSE boxplot(xrmse, main = "RMSE", col = "salmon", border = "red") ``` #### (b) Repeat (a) above with a random forest model ```{r,message=FALSE, warning=FALSE} forestPred = matrix(rep(0,N),ncol=npc,nrow=N) for (i in 1:npc) { data1 = as.data.frame(cbind(rainpcs[,i],sstpcs[,1:4])) names(data1) = c("prec", "PC1", "PC2", "PC3", "PC4") forest.precip = randomForest(prec ~ PC1+PC2+PC3+PC4, data = data1) plot(forest.precip, main = paste("Random Forest for PC", i, sep = " ")) forestPred[,i] <- predict(forest.precip) # Drop 10% cross-validation and boxplot correlation and RMSE myRange <- range(forestPred, rainpcs[,i]) skill_tree = c(sqrt(mean((rainpcs[,i]-forestPred)^2)),cor(rainpcs[,i], forestPred)) names(skill_tree)=c("rmse","Correlation") Skill = Drop_10_pred(data1,treeFit$besTree,"tree") f1=ggplot() + geom_boxplot(mapping = aes(x = "Correlation", Skill[,2])) + geom_point(mapping = aes(x = "Correlation", y = skill_tree[2]), color = "red", size = 3)+ labs(title = "", x = "",y = "")+ theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5)) f2=ggplot() + geom_boxplot(mapping = aes(x = "RMSE", Skill[,1])) + geom_point(mapping = aes(x = "RMSE", y = skill_tree[1]), color = "red", size = 3)+ labs(title="",x = "",y = "") multiplot(f1, f2, cols = 2) } ``` ##### With the Random Forest estimates of the PCs, estimate the ‘model precipitation’ at all locations by multiplying the PC estimates with Eigen Vectors ```{r,message=FALSE, warning=FALSE} prec_pcs_pred_forest = cbind(forestPred,matrix(rep(0,N),ncol=N1,nrow=N)) ## back transform to get the winter precipitation field prec_pred_forest = prec_pcs_pred_forest %*% t(E) ### rescale the winter precipitation prec_pred_forest = t(t(prec_pred_forest)*precSd + precMean) ``` ##### Compare the Random Forest model precipitation with observed historic precipitation ```{r} ## correlate the forecasted rainfall with the historical precipitation xcor_forest = diag(cor(prec_pred_forest,rainavg)) xrmse_forest = sqrt(colMeans((prec_pred_forest - rainavg)^2)) # plot of R^2 between the observed and predicted summer precipitation at each grid point xlong = xgrid ylat = ygrid zfull = rep(NaN,nx*ny) # zfull[rainlocs[,3]]=xcor_forest zmat = matrix(zfull,nrow=nx,ncol=ny) image.plot(xlong,ylat,zmat,ylim=range(5,40),col=myPalette(200),zlim=c(-0.5,0.5)) contour(xlong,ylat,(zmat),ylim=range(5,40),add=TRUE,nlev=3,lwd=2) title(main = bquote(R^2 ~ "between the Observed and Predicted Precipitation")) maps::map('world',wrap=c(0,360),add=TRUE,resolution=0,lwd=2) grid() # Boxplot correlation and RMSE across the grids par(mfrow = c(1, 2)) # Boxplot of correlation boxplot(xcor_forest, main = "Correlation (R2)", col = "skyblue", border = "blue") # Boxplot of RMSE boxplot(xrmse_forest, main = "RMSE", col = "salmon", border = "red") ``` #### Conclusions Random Forest generally achieved higher R² values across most grid points compared to CART. The boxplot of correlation shows a tighter and higher distribution for Random Forest, indicating more consistent and accurate predictions. The spatial plots of R² between observed and predicted precipitation:Random Forest showed wider areas of positive correlation, indicating better spatial performance.CART had more regions with near-zero or negative correlations, showing poorer fit in those areas.