HW2_Prob6

6. CART - Multivariate Forecasting
Fit a CART model to the leading 4 PCs of winter 3-day precipitation extreme, using the four leading PCs of winter SSTs as covariates.
- You will fit a CART model for each PC separately
- with the CART estimates of the PCs, estimate the ‘model precipitation’ at all locations by multiplying the PCs estimates with eigenvectors
(a) Compare the CART model precipitation with historic. Also perform a drop-10% cross validationg and boxplot correlation and RMSE
(b) Repeat (a) above with a random forest model
Compare and summarize your findings

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
#### Initialize Graphics
graphics.off()
.pardefault <- par()

#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 02/"
setwd(mainDir)
suppressPackageStartupMessages(source("hw2_library.R"))
source("hw2_library.R")
# Unload packages that create a conflict with map("world2", add = T)
detach(package:mclust)
detach(package:sm)
detach(package:kohonen)

nrows=72
ncols=36
ntime = 118    #Nov-Mar 1901 - Nov-Mar 2018
nyrs = 55       # Nov-Mar 1964 - Nov-Mar 2018

nglobe = nrows*ncols
N = nrows*ncols

### Lat - Long grid..
locs=matrix(scan("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session2/files-4HW2/sst-lat-long.txt"), ncol=2, byrow=T)
ygrid=seq(-87.5,87.5,by=5)
ny=length(ygrid)

xgrid=seq(27.5,382.5,by=5)
#xgrid[xgrid > 180]=xgrid[xgrid > 180]-360  #longitude on 0-360 grid if needed
xgrid[xgrid > 180]=xgrid[xgrid > 180]
nx=length(xgrid)

xygrid=matrix(0,nrow=nx*ny,ncol=2)

i=0
for(iy in 1:ny){
  for(ix in 1:nx){
    i=i+1
    xygrid[i,1]=ygrid[iy]
    xygrid[i,2]=xgrid[ix]
  }
  
}

Read Kaplan SST data

data=readBin(paste(mainDir, "data/prob4/data.r4", sep = ""),what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap")
data <- array(data = data, dim=c( nrows, ncols, ntime ) )
data = data[,,64:ntime]

data1=data[,,1]

# the lat -long data grid..

index=1:(nx*ny)

index1=index[data1 < 20 & data1 != "NaN"]   # only non-missing data.
xygrid1=xygrid[index1,]

x1=xygrid1[,2]
#x1[x1 < 0]= x1[x1 < 0] + 360
#xygrid1[,2]=x1

nsites=length(index1)
data2=data1[index1]

### SSTdata matrix - rows are seasonal (i.e. one value per year)
## and columns are locations
sstdata=matrix(NA,nrow=nyrs, ncol=nsites)

for(i in 1:nyrs){
  data1=data[,,i]
  index1=index[data1 < 20 & data1 != "NaN"]
  data2=data1[index1]
  sstdata[i,]=data2
}

sstannavg = sstdata
indexgrid = index1
rm("data")  #remove the object data to clear up space

## write out the grid locations..
write(t(xygrid1),file="kaplan-sst-locs.txt",ncol=2)

Perform a PCA on the winter global SST anomalies

#get variance matrix..
## scale the data
sstscale = scale(sstannavg)
zs=var(sstscale)

#do an Eigen decomposition..
zsvd=svd(zs)

#Principal Components...
sst_pcs=t(t(zsvd$u) %*% t(sstscale))

#Eigen Values.. - fraction variance 
lambdas=(zsvd$d/sum(zsvd$d))

Perform a PCA on the winter extreme 3-day precipitation

data = read.table(paste(mainDir, "data/Winter_temporal/Max_Winter_Seas_Prec.txt", sep = ""), header = T)
loc = read.table(paste(mainDir, "data/Precipitaton_data.txt", sep = ""), header = T)
lon = loc[,1]
lat = loc[,2]

precip = data[,2:ncol(data)]
precMean = apply(precip, 2, mean)
precSd = apply(precip, 2, sd)

## Perform PCA
# Get Variance
#get variance matrix..
## scale the data
precip_scale = scale(precip)
zs=var(precip_scale)

#do an Eigen decomposition..
zsvd=svd(zs)

#Principal Components...
precip_pcs = t(t(zsvd$u) %*% t(precip_scale))

#Eigen Vectors
prec_eof=zsvd$u

Fit a CART model to the leading 4 PCs of winter 3-day precipitation extreme, using the four leading PCs of winter SSTs as covariates

npc = 4
N = nrow(precip_pcs)
treePred = matrix(rep(0,N),ncol=npc,nrow=N)
for (i in 1:npc) {
  data1 = as.data.frame(cbind(precip_pcs[,i],sst_pcs[,1:4]))
  names(data1) = c("prec", "PC1", "PC2", "PC3", "PC4")
  tree.precip = tree(prec ~ PC1+PC2+PC3+PC4, data = data1, model = T)
  treeFit = treeFun(tree.precip, toPlot = T, title = paste("PC", i, sep = " "))
  treePred[,i] <- predict(treeFit)
  # Drop 10% cross-validation and boxplot correlation and RMSE
  myRange <- range(treePred, precip_pcs[,i])
  skill_tree = c(sqrt(mean((precip_pcs[,i]-treePred)^2)),cor(precip_pcs[,i], treePred))
  names(skill_tree)=c("rmse","Correlation")
  Skill = Drop_10_pred(data1,treeFit$besTree,"tree")
    f1=ggplot() +
      geom_boxplot(mapping = aes(x = "Correlation", Skill[,2])) +
      geom_point(mapping = aes(x = "Correlation", y = skill_tree[2]),
                 color = "red", size = 3)+
      labs(title = "", x = "",y = "")+
      theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
    f2=ggplot() +
      geom_boxplot(mapping = aes(x = "RMSE", Skill[,1])) +
      geom_point(mapping = aes(x = "RMSE", y = skill_tree[1]),
                 color = "red", size = 3)+
      labs(title="",x = "",y = "")
    multiplot(f1, f2, cols = 2)
}

With the CART estimates of the PCs, estimate the ‘model precipitation’ at all locations by multiplying the PC estimates with Eigen Vectors

N1 = ncol(precip_pcs) - npc
prec_pcs_pred = cbind(treePred,matrix(rep(0,N),ncol=N1,nrow=N))

###  Keep only the first npc Eigen Vectors and set rest to zero
E = matrix(0,nrow=dim(precip_scale)[2],ncol=dim(precip_scale)[2])
E[,1:npc]=prec_eof[,1:npc]

## back transform to get the winter precipitation field
prec_pred =  prec_pcs_pred %*% t(E)
### rescale the winter precipitation
prec_pred = t(t(prec_pred)*precSd + precMean)

(a) Compare the CART model precipitation with observed historic precipitation

## correlate the forecasted rainfall with the historical winter precipitation
preccor = diag(cor(prec_pred,precip))

# plot of R^2 between the observed and predicted winter precipitation at each grid point
nlevel = 11
coltab=rev(brewer.pal(nlevel-1,'RdBu'))
zr=range(preccor)
break_a=round(seq(zr[1],zr[2],by=((zr[2]-zr[1])/(nlevel-1)))*100)/100
quilt.plot(lon, lat, preccor, xlim = range(-115,-101.3), ylim = range(29.5,42.5), nx=30, ny=30,col=coltab,zlim=zr,lab.breaks=break_a, expand = F)
mtext(bquote(~ R^2 ~ "between the Observed and Predicted Southwest U.S. Winter Precipitation"), side = 3, line = 0.2, cex = 0.8)
grid(col = "gray70", lty = 2)
US(add = T, col = "gray40", lwd = 1, xlim = range(-115,-101.3), ylim = range(29.5,42.5), expand = F)
box()

(b) Repeat (a) with a random forest model

forestPred = matrix(rep(0,N),ncol=npc,nrow=N)
for (i in 1:npc) {
  data1 = as.data.frame(cbind(precip_pcs[,i],sst_pcs[,1:4]))
  names(data1) = c("prec", "PC1", "PC2", "PC3", "PC4")
  forest.precip = randomForest(prec ~ PC1+PC2+PC3+PC4, data = data1)
  plot(forest.precip, main = paste("Random Forest for PC", i, sep = " "))
  forestPred[,i] <- predict(forest.precip)
  # Drop 10% cross-validation and boxplot correlation and RMSE
  myRange <- range(forestPred, precip_pcs[,i])
  skill_tree = c(sqrt(mean((precip_pcs[,i]-forestPred)^2)),cor(precip_pcs[,i], forestPred))
  names(skill_tree)=c("rmse","Correlation")
  Skill = Drop_10_pred(data1,treeFit$besTree,"tree")
  f1=ggplot() +
    geom_boxplot(mapping = aes(x = "Correlation", Skill[,2])) +
    geom_point(mapping = aes(x = "Correlation", y = skill_tree[2]),
               color = "red", size = 3)+
    labs(title = "", x = "",y = "")+
    theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
  f2=ggplot() +
    geom_boxplot(mapping = aes(x = "RMSE", Skill[,1])) +
    geom_point(mapping = aes(x = "RMSE", y = skill_tree[1]),
               color = "red", size = 3)+
    labs(title="",x = "",y = "")
  multiplot(f1, f2, cols = 2)
}

With the Random Forest estimates of the PCs, estimate the ‘model precipitation’ at all locations by multiplying the PC estimates with Eigen Vectors

prec_pcs_pred = cbind(forestPred,matrix(rep(0,N),ncol=N1,nrow=N))

## back transform to get the winter precipitation field
prec_pred =  prec_pcs_pred %*% t(E)
### rescale the winter precipitation
prec_pred = t(t(prec_pred)*precSd + precMean)

(a) Compare the CART model precipitation with observed historic precipitation

## correlate the forecasted rainfall with the historical winter precipitation
preccor = diag(cor(prec_pred,precip))

# plot of R^2 between the observed and predicted winter precipitation at each grid point
nlevel = 11
coltab=rev(brewer.pal(nlevel-1,'RdBu'))
zr=range(preccor)
break_a=round(seq(zr[1],zr[2],by=((zr[2]-zr[1])/(nlevel-1)))*100)/100
quilt.plot(lon, lat, preccor, xlim = range(-115,-101.3), ylim = range(29.5,42.5), nx=30, ny=30,col=coltab,zlim=zr,lab.breaks=break_a)
mtext(bquote(~ R^2 ~ "between the Observed and Predicted Southwest U.S. Winter Precipitation"), side = 3, line = 0.2, cex = 0.8)
grid(col = "gray70", lty = 2)
US(add = T, col = "gray40", lwd = 1, xlim = range(-115,-101.3), ylim = range(29.5,42.5))
box()

Compare and summarize findings

In cases where the minimum cross-validation error occurred at 1 node for the CART model, the unpruned tree was used
For PC 1 of precipitation, SST PC 3 > PC 2 were identified as the most important covariates. For precipitation PC 2, SST PC 2 > PC 4 > PC 1 & 3 were the most important covariates. For precipitation PC 3, SST PC 3 > PC 2 > PC 1 were the most important covariates. For precipitation PC 4, SST PC 4 > PC 3 > PC 2 were the most important covariates.
The CART model performed better at predicting precipitation compared to the Random Forest model at most locations
Drop-10% cross validation shows that the CART model performed best for PC 1 of the winter 3-day maximum precipitation data

HW2_Prob6

Anna Starodubtseva

November 16, 2021

Read Kaplan SST data

Perform a PCA on the winter global SST anomalies

Perform a PCA on the winter extreme 3-day precipitation

Fit a CART model to the leading 4 PCs of winter 3-day precipitation extreme, using the four leading PCs of winter SSTs as covariates

With the CART estimates of the PCs, estimate the ‘model precipitation’ at all locations by multiplying the PC estimates with Eigen Vectors

(a) Compare the CART model precipitation with observed historic precipitation

(b) Repeat (a) with a random forest model

With the Random Forest estimates of the PCs, estimate the ‘model precipitation’ at all locations by multiplying the PC estimates with Eigen Vectors

(a) Compare the CART model precipitation with observed historic precipitation

Compare and summarize findings