6. CART - Multivariate Forecasting
Fit a CART model to the leading 4 PCs of winter 3-day precipitation extreme, using the four leading PCs of winter SSTs as covariates.
- You will fit a CART model for each PC separately
- with the CART estimates of the PCs, estimate the ‘model precipitation’ at all locations by multiplying the PCs estimates with eigenvectors
(a) Compare the CART model precipitation with historic. Also perform a drop-10% cross validationg and boxplot correlation and RMSE
(b) Repeat (a) above with a random forest model
Compare and summarize your findings
#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
#### Initialize Graphics
graphics.off()
.pardefault <- par()
#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 02/"
setwd(mainDir)
suppressPackageStartupMessages(source("hw2_library.R"))
source("hw2_library.R")
# Unload packages that create a conflict with map("world2", add = T)
detach(package:mclust)
detach(package:sm)
detach(package:kohonen)
nrows=72
ncols=36
ntime = 118 #Nov-Mar 1901 - Nov-Mar 2018
nyrs = 55 # Nov-Mar 1964 - Nov-Mar 2018
nglobe = nrows*ncols
N = nrows*ncols
### Lat - Long grid..
locs=matrix(scan("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session2/files-4HW2/sst-lat-long.txt"), ncol=2, byrow=T)
ygrid=seq(-87.5,87.5,by=5)
ny=length(ygrid)
xgrid=seq(27.5,382.5,by=5)
#xgrid[xgrid > 180]=xgrid[xgrid > 180]-360 #longitude on 0-360 grid if needed
xgrid[xgrid > 180]=xgrid[xgrid > 180]
nx=length(xgrid)
xygrid=matrix(0,nrow=nx*ny,ncol=2)
i=0
for(iy in 1:ny){
for(ix in 1:nx){
i=i+1
xygrid[i,1]=ygrid[iy]
xygrid[i,2]=xgrid[ix]
}
}
data=readBin(paste(mainDir, "data/prob4/data.r4", sep = ""),what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap")
data <- array(data = data, dim=c( nrows, ncols, ntime ) )
data = data[,,64:ntime]
data1=data[,,1]
# the lat -long data grid..
index=1:(nx*ny)
index1=index[data1 < 20 & data1 != "NaN"] # only non-missing data.
xygrid1=xygrid[index1,]
x1=xygrid1[,2]
#x1[x1 < 0]= x1[x1 < 0] + 360
#xygrid1[,2]=x1
nsites=length(index1)
data2=data1[index1]
### SSTdata matrix - rows are seasonal (i.e. one value per year)
## and columns are locations
sstdata=matrix(NA,nrow=nyrs, ncol=nsites)
for(i in 1:nyrs){
data1=data[,,i]
index1=index[data1 < 20 & data1 != "NaN"]
data2=data1[index1]
sstdata[i,]=data2
}
sstannavg = sstdata
indexgrid = index1
rm("data") #remove the object data to clear up space
## write out the grid locations..
write(t(xygrid1),file="kaplan-sst-locs.txt",ncol=2)
#get variance matrix..
## scale the data
sstscale = scale(sstannavg)
zs=var(sstscale)
#do an Eigen decomposition..
zsvd=svd(zs)
#Principal Components...
sst_pcs=t(t(zsvd$u) %*% t(sstscale))
#Eigen Values.. - fraction variance
lambdas=(zsvd$d/sum(zsvd$d))
data = read.table(paste(mainDir, "data/Winter_temporal/Max_Winter_Seas_Prec.txt", sep = ""), header = T)
loc = read.table(paste(mainDir, "data/Precipitaton_data.txt", sep = ""), header = T)
lon = loc[,1]
lat = loc[,2]
precip = data[,2:ncol(data)]
precMean = apply(precip, 2, mean)
precSd = apply(precip, 2, sd)
## Perform PCA
# Get Variance
#get variance matrix..
## scale the data
precip_scale = scale(precip)
zs=var(precip_scale)
#do an Eigen decomposition..
zsvd=svd(zs)
#Principal Components...
precip_pcs = t(t(zsvd$u) %*% t(precip_scale))
#Eigen Vectors
prec_eof=zsvd$u
npc = 4
N = nrow(precip_pcs)
treePred = matrix(rep(0,N),ncol=npc,nrow=N)
for (i in 1:npc) {
data1 = as.data.frame(cbind(precip_pcs[,i],sst_pcs[,1:4]))
names(data1) = c("prec", "PC1", "PC2", "PC3", "PC4")
tree.precip = tree(prec ~ PC1+PC2+PC3+PC4, data = data1, model = T)
treeFit = treeFun(tree.precip, toPlot = T, title = paste("PC", i, sep = " "))
treePred[,i] <- predict(treeFit)
# Drop 10% cross-validation and boxplot correlation and RMSE
myRange <- range(treePred, precip_pcs[,i])
skill_tree = c(sqrt(mean((precip_pcs[,i]-treePred)^2)),cor(precip_pcs[,i], treePred))
names(skill_tree)=c("rmse","Correlation")
Skill = Drop_10_pred(data1,treeFit$besTree,"tree")
f1=ggplot() +
geom_boxplot(mapping = aes(x = "Correlation", Skill[,2])) +
geom_point(mapping = aes(x = "Correlation", y = skill_tree[2]),
color = "red", size = 3)+
labs(title = "", x = "",y = "")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
f2=ggplot() +
geom_boxplot(mapping = aes(x = "RMSE", Skill[,1])) +
geom_point(mapping = aes(x = "RMSE", y = skill_tree[1]),
color = "red", size = 3)+
labs(title="",x = "",y = "")
multiplot(f1, f2, cols = 2)
}
N1 = ncol(precip_pcs) - npc
prec_pcs_pred = cbind(treePred,matrix(rep(0,N),ncol=N1,nrow=N))
### Keep only the first npc Eigen Vectors and set rest to zero
E = matrix(0,nrow=dim(precip_scale)[2],ncol=dim(precip_scale)[2])
E[,1:npc]=prec_eof[,1:npc]
## back transform to get the winter precipitation field
prec_pred = prec_pcs_pred %*% t(E)
### rescale the winter precipitation
prec_pred = t(t(prec_pred)*precSd + precMean)
## correlate the forecasted rainfall with the historical winter precipitation
preccor = diag(cor(prec_pred,precip))
# plot of R^2 between the observed and predicted winter precipitation at each grid point
nlevel = 11
coltab=rev(brewer.pal(nlevel-1,'RdBu'))
zr=range(preccor)
break_a=round(seq(zr[1],zr[2],by=((zr[2]-zr[1])/(nlevel-1)))*100)/100
quilt.plot(lon, lat, preccor, xlim = range(-115,-101.3), ylim = range(29.5,42.5), nx=30, ny=30,col=coltab,zlim=zr,lab.breaks=break_a, expand = F)
mtext(bquote(~ R^2 ~ "between the Observed and Predicted Southwest U.S. Winter Precipitation"), side = 3, line = 0.2, cex = 0.8)
grid(col = "gray70", lty = 2)
US(add = T, col = "gray40", lwd = 1, xlim = range(-115,-101.3), ylim = range(29.5,42.5), expand = F)
box()
forestPred = matrix(rep(0,N),ncol=npc,nrow=N)
for (i in 1:npc) {
data1 = as.data.frame(cbind(precip_pcs[,i],sst_pcs[,1:4]))
names(data1) = c("prec", "PC1", "PC2", "PC3", "PC4")
forest.precip = randomForest(prec ~ PC1+PC2+PC3+PC4, data = data1)
plot(forest.precip, main = paste("Random Forest for PC", i, sep = " "))
forestPred[,i] <- predict(forest.precip)
# Drop 10% cross-validation and boxplot correlation and RMSE
myRange <- range(forestPred, precip_pcs[,i])
skill_tree = c(sqrt(mean((precip_pcs[,i]-forestPred)^2)),cor(precip_pcs[,i], forestPred))
names(skill_tree)=c("rmse","Correlation")
Skill = Drop_10_pred(data1,treeFit$besTree,"tree")
f1=ggplot() +
geom_boxplot(mapping = aes(x = "Correlation", Skill[,2])) +
geom_point(mapping = aes(x = "Correlation", y = skill_tree[2]),
color = "red", size = 3)+
labs(title = "", x = "",y = "")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
f2=ggplot() +
geom_boxplot(mapping = aes(x = "RMSE", Skill[,1])) +
geom_point(mapping = aes(x = "RMSE", y = skill_tree[1]),
color = "red", size = 3)+
labs(title="",x = "",y = "")
multiplot(f1, f2, cols = 2)
}
prec_pcs_pred = cbind(forestPred,matrix(rep(0,N),ncol=N1,nrow=N))
## back transform to get the winter precipitation field
prec_pred = prec_pcs_pred %*% t(E)
### rescale the winter precipitation
prec_pred = t(t(prec_pred)*precSd + precMean)
## correlate the forecasted rainfall with the historical winter precipitation
preccor = diag(cor(prec_pred,precip))
# plot of R^2 between the observed and predicted winter precipitation at each grid point
nlevel = 11
coltab=rev(brewer.pal(nlevel-1,'RdBu'))
zr=range(preccor)
break_a=round(seq(zr[1],zr[2],by=((zr[2]-zr[1])/(nlevel-1)))*100)/100
quilt.plot(lon, lat, preccor, xlim = range(-115,-101.3), ylim = range(29.5,42.5), nx=30, ny=30,col=coltab,zlim=zr,lab.breaks=break_a)
mtext(bquote(~ R^2 ~ "between the Observed and Predicted Southwest U.S. Winter Precipitation"), side = 3, line = 0.2, cex = 0.8)
grid(col = "gray70", lty = 2)
US(add = T, col = "gray40", lwd = 1, xlim = range(-115,-101.3), ylim = range(29.5,42.5))
box()