10. Joint Spatial Analysis & Field Forecasting using CCA
One of the objectives of multivariate analysis is to enable multivariate forecasting.
Here we wish to predict the winter 3-day maximum precipitation over Southwest U.S. as a function of winter SSTs. We use CCA for this and the simplified steps are described below.
(i) Take the leading ~4 PCs of the winter SSTs (which you can obtain following the procedure in problem 4) and perform CCA with the leading ~4 PCs of precipitation.
(ii) Fit a regression for each canonical variate - canonical variate of precipitation related to canonical variate of SSTs. Use these regressions to predict the flow variates and back transform them to the original space - i.e., the precipitation space.
(iii) Evaluate the performance by computing R2 between the observed and predicted summer precipitation at each grid point.
(iv) Do a leave-one-out cross validation Compare with results from problem 6.

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
#### Initialize Graphics
graphics.off()
.pardefault <- par()

#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 02/"
setwd(mainDir)
suppressPackageStartupMessages(source("hw2_library.R"))
source("hw2_library.R")
# Unload packages that create a conflict with map("world2", add = T)
detach(package:mclust)
detach(package:sm)
detach(package:kohonen)

nrows=72
ncols=36
ntime = 118    #Nov-Mar 1901 - Nov-Mar 2018
nyrs = 55       # Nov-Mar 1964 - Nov-Mar 2018

nglobe = nrows*ncols
N = nrows*ncols

### Lat - Long grid..
locs=matrix(scan("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session2/files-4HW2/sst-lat-long.txt"), ncol=2, byrow=T)
ygrid=seq(-87.5,87.5,by=5)
ny=length(ygrid)

xgrid=seq(27.5,382.5,by=5)
#xgrid[xgrid > 180]=xgrid[xgrid > 180]-360  #longitude on 0-360 grid if needed
xgrid[xgrid > 180]=xgrid[xgrid > 180]
nx=length(xgrid)

xygrid=matrix(0,nrow=nx*ny,ncol=2)

i=0
for(iy in 1:ny){
  for(ix in 1:nx){
    i=i+1
    xygrid[i,1]=ygrid[iy]
    xygrid[i,2]=xgrid[ix]
  }
  
}

Read Kaplan SST data

data=readBin(paste(mainDir, "data/prob4/data.r4", sep = ""),what="numeric", n=( nrows * ncols * ntime), size=4,endian="swap")
data <- array(data = data, dim=c( nrows, ncols, ntime ) )
data = data[,,64:ntime]

data1=data[,,1]

# the lat -long data grid..

index=1:(nx*ny)

index1=index[data1 < 20 & data1 != "NaN"]   # only non-missing data.
xygrid1=xygrid[index1,]

x1=xygrid1[,2]
#x1[x1 < 0]= x1[x1 < 0] + 360
#xygrid1[,2]=x1

nsites=length(index1)
data2=data1[index1]

### SSTdata matrix - rows are seasonal (i.e. one value per year)
## and columns are locations
sstdata=matrix(NA,nrow=nyrs, ncol=nsites)

for(i in 1:nyrs){
  data1=data[,,i]
  index1=index[data1 < 20 & data1 != "NaN"]
  data2=data1[index1]
  sstdata[i,]=data2
}

sstannavg = sstdata
indexgrid = index1
rm("data")  #remove the object data to clear up space

## write out the grid locations..
write(t(xygrid1),file="kaplan-sst-locs.txt",ncol=2)

Perform a PCA on the winter global SST anomalies

#get variance matrix..
## scale the data
sstscale = scale(sstannavg)
zs=var(sstscale)

#do an Eigen decomposition..
zsvd=svd(zs)

#Principal Components...
pcs=t(t(zsvd$u) %*% t(sstscale))

#Eigen Values.. - fraction variance 
lambdas=(zsvd$d/sum(zsvd$d))

npc = 4     # Take leading 4 PCs
sst_pcs = pcs[,1:npc]
sst_full = pcs

Perform a PCA on the winter extreme 3-day precipitation

data = read.table(paste(mainDir, "data/Winter_temporal/Max_Winter_Seas_Prec.txt", sep = ""), header = T)
loc = read.table(paste(mainDir, "data/Precipitaton_data.txt", sep = ""), header = T)
lon = loc[,1]
lat = loc[,2]

precip = data[,2:ncol(data)]
precMean = apply(precip, 2, mean)
precSd = apply(precip, 2, sd)

## Perform PCA
# Get Variance
#get variance matrix..
## scale the data
precip_scale = scale(precip)
zs=var(precip_scale)

#do an Eigen decomposition..
zsvd=svd(zs)

#Principal Components...
pcs = t(t(zsvd$u) %*% t(precip_scale))

prec_eof = zsvd$u

prec_pcs = pcs[,1:npc]
prec_full = pcs

(i) Take the leading ~4 PCs of the winter SSTs and perform CCA with the leading ~4 PCs of precipitation

M=dim(sst_pcs)[2]
J=dim(prec_pcs)[2]
J=min(M,J)
N = length(prec_pcs[,1])
Qx1 = qr.Q(qr(sst_pcs))
Qy1 = qr.Q(qr(prec_pcs))
T11 = qr.R(qr(sst_pcs))
T22 = qr.R(qr(prec_pcs))
VV=t(Qx1) %*% Qy1
BB = solve(T22) %*% svd(VV)$v * sqrt(N-1)
wm1 = prec_pcs %*% BB
AA = solve(T11) %*% svd(VV)$u * sqrt(N-1)
vm1 = sst_pcs %*% AA

(iii) Evaluate the performance by computing R2 between the observed and predicted summer precipitation at each grid point

## correlate the forecasted rainfall with the historical winter precipitation
preccor = diag(cor(prec_pred,precip))

# plot of R^2 between the observed and predicted winter precipitation at each grid point
nlevel = 11
coltab=rev(brewer.pal(nlevel-1,'RdBu'))
zr=range(preccor)
break_a=round(seq(zr[1],zr[2],by=((zr[2]-zr[1])/(nlevel-1)))*100)/100
quilt.plot(lon, lat,preccor, xlim = range(-115,-101.3), ylim = range(29.5,42.5), nx=30, ny=30,col=coltab,zlim=zr,lab.breaks=break_a)
mtext(bquote(~ R^2 ~ "between the Observed and Predicted Southwest U.S. Winter Precipitation"), side = 3, line = 0.2, cex = 0.8)
grid(col = "gray70", lty = 2)
US(add = T, col = "gray40", lwd = 1, xlim = range(-115,-101.3), ylim = range(29.5,42.5))
box()

(iv) Do a leave-one-out cross validation

prec_loocv = matrix(0,nrow=nrow(precip_scale),ncol=ncol(precip_scale))
## If you wish to remove a component from the data, say first component.
for (i in 1:nrow(precip_scale)) {
  for (j in 1:ncol(precip_scale)) {
    prec_rem = precip_scale
    prec_rem[i,j] = 0
    # Now perform PCA on prec_rem
    zs=var(prec_rem)
    # do an Eigen decomposition..
    zsvd=svd(zs)
    # Principal Components...
    pcs = t(t(zsvd$u) %*% t(prec_rem))
    prec_eof = zsvd$u
    prec_pcs = pcs[,1:npc]
    prec_full = pcs
    # Take the leading ~4 PCs of the winter SSTs and perform CCA with the leading ~4 PCs of precipitation
    M=dim(sst_pcs)[2]
    J=dim(prec_pcs)[2]
    J=min(M,J)
    N = length(prec_pcs[,1])
    Qx1 = qr.Q(qr(sst_pcs))
    Qy1 = qr.Q(qr(prec_pcs))
    T11 = qr.R(qr(sst_pcs))
    T22 = qr.R(qr(prec_pcs))
    VV=t(Qx1) %*% Qy1
    BB = solve(T22) %*% svd(VV)$v * sqrt(N-1)
    wm1 = prec_pcs %*% BB
    AA = solve(T11) %*% svd(VV)$u * sqrt(N-1)
    vm1 = sst_pcs %*% AA
    # Fit a regression for each canonical variate - canonical variate of precipitation related to canonical variate of SSTs
    # Predict the winter precipitation PCS
    betahat = solve(t(AA) %*% t(sst_pcs)%*% sst_pcs %*% AA) %*% t(AA) %*% t(sst_pcs) %*% prec_pcs
    ypred = sst_pcs %*% AA %*% betahat
    # first npc PCs from the PC forecast above and the remaining PCs are set to
    # their means - i.e., 0
    N1 = dim(prec_rem)[2]-(npc)
    prec_pcs_pred = cbind(ypred,matrix(rep(0,N),ncol=N1,nrow=N))
    #  Keep only the first npc Eigen Vectors and set rest to zero
    E = matrix(0,nrow=dim(prec_rem)[2],ncol=dim(prec_rem)[2])
    E[,1:npc]=prec_eof[,1:npc]
    # back transform to get the winter precipitation field
    prec_pred =  prec_pcs_pred %*% t(E)
    # rescale the winter precipitation
    prec_pred = t(t(prec_pred)*precSd + precMean)
    # Replace missing data
    prec_loocv[i,j] = prec_pred[i,j]
  }
}

Plot of R2 between observed and predicted precipitation from LOOCV

## correlate the forecasted rainfall with the historical winter precipitation
loocvcor = diag(cor(prec_loocv,precip))

# plot of R^2 between the observed and predicted winter precipitation at each grid point
nlevel = 11
coltab=rev(brewer.pal(nlevel-1,'RdBu'))
zr=range(loocvcor)
break_a=round(seq(zr[1],zr[2],by=((zr[2]-zr[1])/(nlevel-1)))*100)/100
quilt.plot(lon, lat, loocvcor, xlim = range(-115,-101.3), ylim = range(29.5,42.5), nx=30, ny=30,col=coltab,zlim=zr,lab.breaks=break_a)
mtext(bquote(~ R^2 ~ "between the Observed and Predicted Southwest U.S. Winter Precipitation"), side = 3, line = 0.2, cex = 0.8)
grid(col = "gray70", lty = 2)
US(add = T, col = "gray40", lwd = 1, xlim = range(-115,-101.3), ylim = range(29.5,42.5))
box()

Compare with results from problem 6

  • The model performs better for predicted precipitation in Arizona and New Mexico and worse for most locations in Utah
  • The R2 between observed and predicted precipitation from leave-one-out cross validation shows that out-of-sample prediction performance is generally better in the same areas where the original model performs well
  • For most locations, the CART model performed better (higher R2) than the CCA model, which generally outperformed the Random Forest model. The CCA model performed better in similar locations as the CART model