# Load libraries
library(maps)
library(akima)
## Warning: package 'akima' was built under R version 4.4.3
library(fields)
## Loading required package: spam
## Spam version 2.11-1 (2025-01-20) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## Loading required package: viridisLite
##
## Try help(fields) to get started.
library(RColorBrewer)
#### Clear Memory and Set Options
rm(list=ls())
options(warn=-1)
save <- FALSE # Change to TRUE if you want to save the plot
# Set working directory (Modify if necessary)
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(script_dir)
source("HW2_Library.R")
## Package 'sm', version 2.2-6.0: type help(sm) for summary information
## Loading required package: lattice
## Loading required package: KernSmooth
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## Loading required package: randomForest
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## locfit 1.5-9.11 2025-01-27
##
## Attaching package: 'MASS'
## The following object is masked from 'package:MPV':
##
## cement
## The following object is masked from 'package:sm':
##
## muscle
## Loading required package: stats4
##
## Attaching package: 'stats4'
## The following object is masked from 'package:spam':
##
## mle
## Loading required package: splines
## Loading required package: boot
##
## Attaching package: 'boot'
## The following objects are masked from 'package:VGAM':
##
## logit, simplex
## The following object is masked from 'package:MPV':
##
## motor
## The following object is masked from 'package:lattice':
##
## melanoma
## The following object is masked from 'package:sm':
##
## dogs
## Loading required package: CircStats
##
## Attaching package: 'CircStats'
## The following objects are masked from 'package:VGAM':
##
## dcard, rcard
## Loading required package: dtw
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following object is masked from 'package:spam':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loaded dtw v1.23-1. See ?dtw for help, citation("dtw") for use in publication.
##
## Attaching package: 'verification'
## The following object is masked from 'package:VGAM':
##
## exponential
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
## --------------------------------------------------------------
## Analysis of Geostatistical Data
## For an Introduction to geoR go to http://www.leg.ufpr.br/geoR
## geoR version 1.9-4 (built on 2024-02-14) is now loaded
## --------------------------------------------------------------
## Package 'mclust' version 6.1.1
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
## The following object is masked from 'package:maps':
##
## map
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:fields':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, units
##
## Attaching package: 'cluster'
## The following object is masked from 'package:maps':
##
## votes.repub
##
## Attaching package: 'kohonen'
## The following object is masked from 'package:mclust':
##
## map
## The following object is masked from 'package:maps':
##
## map
# Color palette
myPalette <- colorRampPalette(rev(brewer.pal(9, "RdBu")), space="Lab")
### india rainfall data
nrows=68
ncols=65
ntime <- 67 #Jun-Sep 1950 - 2016
nyrs <- ntime
nglobe <- nrows*ncols
### Lat - Long grid..
#ygrid=seq(6.5,38.5,by=0.25)
ygrid=seq(6.5,38.5,by=0.5)
ny=length(ygrid)
#xgrid=seq(66.5,100,by=0.25)
xgrid=seq(66.5,100,by=0.5)
nx=length(xgrid)
xygrid=matrix(0,nrow=nx*ny,ncol=2)
i=0
for(iy in 1:ny){
for(ix in 1:nx){
i=i+1
xygrid[i,1]=ygrid[iy]
xygrid[i,2]=xgrid[ix]
}
}
# Read India Gridded Rainfall data..
data=readBin("data/India-Rain-JJAS-05deg-1950-2016.r4",what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap")
data <- array(data = data, dim=c( nrows, ncols, ntime ) )
data1=data[,,1]
# the lat -long data grid..
index=1:(nx*ny)
## pull only data and the locations with non-missing data
index1=index[data1 != "NaN"] # only non-missing data.
xygrid1=xygrid[index1,]
nsites=length(index1)
data2=data1[index1]
### Rain data matrix - rows are seasonal (i.e. one value per year)
## and columns are locations
raindata=matrix(NA,nrow=nyrs, ncol=nsites)
for(i in 1:nyrs){
data1=data[,,i]
index1=index[ data1 != "NaN"]
data2=data1[index1]
raindata[i,]=data2
}
index = 1:dim(raindata)[2]
xx = apply(raindata,2,mean)
index2 = index1[xx > 0]
index3 = index[xx > 0]
xygrid1=xygrid[index2,]
rainavg = raindata[,index3]
indexgrid = index2
rm("data") #remove the object data to clear up space
#### SST data
## read SST data
nrows_sst <- 180
ncols_sst <- 17
ntime = 67 #Jun-Sep 1950 - 2016
nglobe_sst = nrows_sst * ncols_sst
### Lat - Long grid..
ygrid_sst = seq(-16,16,by=2)
ny_sst =length(ygrid_sst)
xgrid_sst=seq(0,358,by=2)
nx_sst=length(xgrid_sst)
xygrid_sst=matrix(0,nrow=nx_sst * ny_sst,ncol=2)
i=0
for(iy_sst in 1:ny_sst){
for(ix_sst in 1:nx_sst){
i=i+1
xygrid_sst[i,1]=ygrid_sst[iy_sst]
xygrid_sst[i,2]=xgrid_sst[ix_sst]
}
}
data_sst=readBin("data/NOAA-Trop-JJAS-SST-1950-2016.r4",what="numeric", n=( nrows_sst * ncols_sst * ntime), size=4,endian="swap")
data_sst <- array(data_sst, dim=c( nrows_sst, ncols_sst, ntime ) )
data1_sst=data_sst[,,1]
# the lat -long data grid..
index_sst=1:(nx_sst*ny_sst)
## pull only data and the locations with non-missing data
index1_sst=index_sst[data1_sst < 20 & data1_sst != "NaN"] # only non-missing data.
xygrid1_sst=xygrid_sst[index1_sst,]
nsites_sst=length(index1_sst)
data2_sst=data1_sst[index1_sst]
### SSTdata matrix - rows are seasonal (i.e. one value per year)
## and columns are locations
sstdata=matrix(NA,nrow=ntime, ncol=nsites_sst)
for(i in 1:ntime){
data1_sst=data_sst[,,i]
index1_sst=index_sst[data1_sst < 20 & data1_sst != "NaN"]
data2_sst=data1_sst[index1_sst]
sstdata[i,]=data2_sst
}
sstannavg = sstdata
indexgrid_sst = index1_sst
rm("data_sst") #remove the object data to clear up space
###################### PCA on india rainfall
##
# Standardize and perform PCA
rainscale = scale(rainavg)
zs=var(rainscale)
#do an Eigen decomposition..
zsvd=svd(zs)
#Principal Components...
rainpcs=t(t(zsvd$u) %*% t(rainscale))
#Eigen Vectors
prec_eof=zsvd$u
###################### PCA on SST
##
#get variance matrix..
## scale the data
sstscale = scale(sstannavg)
zs_sst=var(sstscale)
#do an Eigen decomposition..
zsvd_sst=svd(zs_sst)
#Principal Components...
sstpcs=t(t(zsvd_sst$u) %*% t(sstscale))
npc = 4
N = nrow(rainpcs)
treePred = matrix(rep(0,N),ncol=npc,nrow=N)
for (i in 1:npc) {
data1 = as.data.frame(cbind(rainpcs[,i],sstpcs[,1:4]))
names(data1) = c("prec", "PC1", "PC2", "PC3", "PC4")
tree.precip = tree(prec ~ PC1+PC2+PC3+PC4, data = data1, model = T)
treeFit = treeFun(tree.precip, toPlot = T, title = paste("PC", i, sep = " "))
treePred[,i] <- predict(treeFit)
# Drop 10% cross-validation and boxplot correlation and RMSE
myRange <- range(treePred, rainpcs[,i])
skill_tree = c(sqrt(mean((rainpcs[,i]-treePred)^2)),cor(rainpcs[,i], treePred))
names(skill_tree)=c("rmse","Correlation")
Skill = Drop_10_pred(data1,treeFit$besTree,"tree")
f1=ggplot() +
geom_boxplot(mapping = aes(x = "Correlation", Skill[,2])) +
geom_point(mapping = aes(x = "Correlation", y = skill_tree[2]),
color = "red", size = 3)+
labs(title = "", x = "",y = "")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
f2=ggplot() +
geom_boxplot(mapping = aes(x = "RMSE", Skill[,1])) +
geom_point(mapping = aes(x = "RMSE", y = skill_tree[1]),
color = "red", size = 3)+
labs(title="",x = "",y = "")
multiplot(f1, f2, cols = 2)
}
N1 = ncol(rainpcs) - npc
prec_pcs_pred = cbind(treePred,matrix(rep(0,N),ncol=N1,nrow=N))
prec_eof <- zsvd$u #Eigen Vectors
### Keep only the first npc Eigen Vectors and set rest to zero
E = matrix(0,nrow=dim(rainscale)[2],ncol=dim(rainscale)[2])
E[,1:npc]=prec_eof[,1:npc]
## back transform to get the winter precipitation field
prec_pred = prec_pcs_pred %*% t(E)
### rescale the winter precipitation
precMean = apply(raindata,2,mean)
precSd = apply(raindata,2,sd)
prec_pred = t(t(prec_pred)*precSd + precMean)
Boxplot correlation and RMSE across the grids
## correlate the forecasted rainfall with the historical precipitation
xcor = diag(cor(prec_pred,rainavg))
xrmse = sqrt(colMeans((prec_pred - rainavg)^2))
# plot of R^2 between the observed and predicted summer precipitation at each grid point
rainlocs = read.table("data/India-rain-locs.txt")
xlong = xgrid
ylat = ygrid
zfull = rep(NaN,nx*ny) #
zfull[rainlocs[,3]]=xcor
zmat = matrix(zfull,nrow=nx,ncol=ny)
image.plot(xlong,ylat,zmat,ylim=range(5,40),col=myPalette(200),zlim=c(-0.5,0.5))
contour(xlong,ylat,(zmat),ylim=range(5,40),add=TRUE,nlev=3,lwd=2)
title(main = bquote(R^2 ~ "between the Observed and Predicted Precipitation"))
maps::map('world',wrap=c(0,360),add=TRUE,resolution=0,lwd=2)
grid()
# Boxplot correlation and RMSE across the grids
par(mfrow = c(1, 2))
# Boxplot of correlation
boxplot(xcor,
main = "Correlation (R2)",
col = "skyblue",
border = "blue")
# Boxplot of RMSE
boxplot(xrmse,
main = "RMSE",
col = "salmon",
border = "red")
#### (b) Repeat (a) above with a random forest model
forestPred = matrix(rep(0,N),ncol=npc,nrow=N)
for (i in 1:npc) {
data1 = as.data.frame(cbind(rainpcs[,i],sstpcs[,1:4]))
names(data1) = c("prec", "PC1", "PC2", "PC3", "PC4")
forest.precip = randomForest(prec ~ PC1+PC2+PC3+PC4, data = data1)
plot(forest.precip, main = paste("Random Forest for PC", i, sep = " "))
forestPred[,i] <- predict(forest.precip)
# Drop 10% cross-validation and boxplot correlation and RMSE
myRange <- range(forestPred, rainpcs[,i])
skill_tree = c(sqrt(mean((rainpcs[,i]-forestPred)^2)),cor(rainpcs[,i], forestPred))
names(skill_tree)=c("rmse","Correlation")
Skill = Drop_10_pred(data1,treeFit$besTree,"tree")
f1=ggplot() +
geom_boxplot(mapping = aes(x = "Correlation", Skill[,2])) +
geom_point(mapping = aes(x = "Correlation", y = skill_tree[2]),
color = "red", size = 3)+
labs(title = "", x = "",y = "")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
f2=ggplot() +
geom_boxplot(mapping = aes(x = "RMSE", Skill[,1])) +
geom_point(mapping = aes(x = "RMSE", y = skill_tree[1]),
color = "red", size = 3)+
labs(title="",x = "",y = "")
multiplot(f1, f2, cols = 2)
}
prec_pcs_pred_forest = cbind(forestPred,matrix(rep(0,N),ncol=N1,nrow=N))
## back transform to get the winter precipitation field
prec_pred_forest = prec_pcs_pred_forest %*% t(E)
### rescale the winter precipitation
prec_pred_forest = t(t(prec_pred_forest)*precSd + precMean)
## correlate the forecasted rainfall with the historical precipitation
xcor_forest = diag(cor(prec_pred_forest,rainavg))
xrmse_forest = sqrt(colMeans((prec_pred_forest - rainavg)^2))
# plot of R^2 between the observed and predicted summer precipitation at each grid point
xlong = xgrid
ylat = ygrid
zfull = rep(NaN,nx*ny) #
zfull[rainlocs[,3]]=xcor_forest
zmat = matrix(zfull,nrow=nx,ncol=ny)
image.plot(xlong,ylat,zmat,ylim=range(5,40),col=myPalette(200),zlim=c(-0.5,0.5))
contour(xlong,ylat,(zmat),ylim=range(5,40),add=TRUE,nlev=3,lwd=2)
title(main = bquote(R^2 ~ "between the Observed and Predicted Precipitation"))
maps::map('world',wrap=c(0,360),add=TRUE,resolution=0,lwd=2)
grid()
# Boxplot correlation and RMSE across the grids
par(mfrow = c(1, 2))
# Boxplot of correlation
boxplot(xcor_forest,
main = "Correlation (R2)",
col = "skyblue",
border = "blue")
# Boxplot of RMSE
boxplot(xrmse_forest,
main = "RMSE",
col = "salmon",
border = "red")
Random Forest generally achieved higher R² values across most grid points compared to CART. The boxplot of correlation shows a tighter and higher distribution for Random Forest, indicating more consistent and accurate predictions. The spatial plots of R² between observed and predicted precipitation:Random Forest showed wider areas of positive correlation, indicating better spatial performance.CART had more regions with near-zero or negative correlations, showing poorer fit in those areas.