library(locfit)

swe=matrix(scan("Colorado-April1SWE-1949-99.txt"),ncol=46, byrow=T)
#swe=scale(swe)

flows=matrix(scan("AMJJFLOW-49-99.txt"),ncol=6,byrow=T)
flows=flows/(10^6)			#Million cubic meter

flowmeans=colMeans(flows)
flowsds=apply(flows,2,sd)
#flows=scale(flows)

zswe=prcomp(swe)
(zswe$sd^2) / sum(zswe$sd^2)

zflows=prcomp(flows)
(zflows$sd^2) / sum(zflows$sd^2)
flowpcs = zflows$x    #this is a 6 column matrix - first col. is first PC etc.
floweofs = zflows$u   #6 X 6 matrix - first column is first EOF etc.

#-----  SVD ----
C=var(flows,swe)  #note that the variable with fewer columns should be first
zz=svd(C)

#f1 and g1 are the matrix of time coefficients just like the
#PCS
f1=flows %*% zz$u
g1=swe %*% zz$v

# f1 %*% t(zz$u) will result in the flows.
# and of course, t(zz$u) %*% zz$u will result in an identfy matrix I
# similarly for g1

#f1[,1] and g1[,1] will have the 'highest' joint covariance and
# so on.. Also means, that f1[,1] is highly related to g1[,1]
# here it is similar to CCA.

#hetrogenous correlation pattern. First time
#coefficient of swe is correlated with the flows data and viceversa

J=min(length(swe[1,]), length(flows[1,]))

plot(1:J, ((zz$d)^2)/sum((zz$d)^2), xlab="Modes", ylab="Sq. Covariance",
col="red")

# Notice that the first mode captures 'almost' everything  of the joint
# covariance. AS to be expected..

corrpats1=cor(f1[,1],swe)  

corrpatf1=cor(g1[,1],flows)

#You can do a timeseries plot of the corrpatf1 and corrpats1 to get
#the hetrogeneous correlation maps..
par(mfrow=c(2,1))
plot(1:length(corrpatf1), corrpatf1, xlab="Flow Locs.", ylab="Correlation")
title(main="First TC of SWE correlated with Flows")

plot(1:length(corrpats1), corrpats1, xlab="SWE Locs.", ylab="Correlation")
title(main="First TC of Flow correlated with SWE")


#--------------------

#To predict -  we see that only the first mode is the significant one
# the rest is pretty much noise..
# using  linear regression..
# in a cross validated mode. that is, drop a year and predict it..

# say we want to predict the first year.

N=length(flows[,1])

index=1:N
nsim=100

ypred=array(0,c(J,nsim,N))

yy=1:J

for(i in 1:N){


#drop a point..
index1=index[index != i]

#get the rest of the data..
swecv=swe[index1,]
flowscv=flows[index1,]

#perform SVD
C=var(flowscv,swecv)  #note that the variable with fewer columns should be first
zz=svd(C)

#f1 and g1 are the matrix of time coefficients just like the
#PCS
f1=flowscv %*% zz$u
g1=swecv %*% zz$v
zmodel=lsfit(g1[,1],f1[,1])

stderror1= sum((zmodel$resid)^2) / (length(f1[,1]) - 2)

#prediction error..
Sxx = sum((g1[,1] - mean(g1[,1]))^2)


#the TCs of the dropped point..
xp=swe[i,] %*% zz$v

#generate an ensemble..

for(isim in 1:nsim){

stderror = stderror1*(1 + (1/length(f1[,1])) + ((xp[1] - mean(g1[,1]))^2)/Sxx)
stderror = sqrt(stderror)

yy[1]=zmodel$coef[1] + zmodel$coef[2]*xp[1] + rnorm(1,0,stderror)

#grab random TCs for the remaining TCs
for(jj in 2:J){

jj1 = round(runif(1,1,(N-1)))
yy[jj]=f1[jj1,jj]}


ypred[1:J,isim,i]=yy %*% t(zz$u)

}

}


# Unscale the data and the predictions......

#for(i in 1:6){
#ypred[i,,]=ypred[i,,]*flowsds[i] + flowmeans[i]
#zz=flows[,i]*flowsds[i] + flowmeans[i]
#flows[,i]=zz}


#plot the ensemble forecast along with the observed flows at the locations..

#say we plot at the first three locations..

par(mfrow=c(3,1))

for(i in 1:3){

#set the negative predicted values to 0..
ypred[i,,][ypred[i,,] < 0]=0

xs=1:N

xb=as.data.frame(t(ypred[i,,]))
zz=boxplot(split(xb,xs), plot=F, cex=1.0)
zz$names=rep("",length(zz$names))
z1=bxp(zz,ylim=range(ypred[i,,], flows[,i]), xlab="", ylab="Flow anomalies", cex=1.0)
axis(1,at=z1, labels=c(1949:1999), cex=1.0)
lines(z1, flows[,i], col="red", cex=1.0)
abline(h=median(flows[,i]), lty=2)
abline(h=quantile(flows[,i], c(0.9)), lty=2)
abline(h=quantile(flows[,i], c(0.1)), lty=2)
box()
title(main=c("Boxplot of cross-validated forecasts at Station ",i))

}

#### RPSSS

for(i in 1:J){

thresh=quantile(flows[,i], c(0.33, 0.66))

# climatological forecast..

climo = c(1/3, 1/3, 1/3)
climo=cumsum(climo)

rpss = 1:N
for(jj in 1:N){

# forecast categorical probabilities
fcastprob=1:3
yypred=ypred[i,,jj]
fcastprob[1] = length(yypred[yypred <= thresh[1]]) / nsim
fcastprob[2] = length(yypred[yypred > thresh[1] & yypred < thresh[2]]) / nsim
fcastprob[3] = length(yypred[yypred >= thresh[2]]) / nsim

fcastprob=cumsum(fcastprob)

# actual..
actual=rep(0,3)
if(flows[jj,i] <= thresh[1])actual[1]=1
if(flows[jj,i] > thresh[1] & flows[jj,i] < thresh[2])actual[2]=1
if(flows[jj,i] >= thresh[2])actual[3]=1


rpsclimo = sum((climo-actual)^2)
rpsfcast = sum((fcastprob - actual)^2)
rpss[jj] = 1 - (rpsfcast/rpsclimo)

}

#print out the median RPSS
print(c("RPSS of station ",i, median(rpss)))

}


#you can also plot the mean forecast with the observed flows at
#the six locations..

for(i in 1:J){

zz=apply(ypred[i,,],2,mean)
plot(flows[,i],zz, xlab="Observed flows", xlim=range(zz,flows[,i]), ylim=range(zz,flows[,i]), ylab="Mean Predicted flows")
lines(flows[,i],flows[,i])
title(main= c("Station ",i))

print(c("Correlation of Observed and Predicted Flows - Stn. ",i, cor(zz, flows[,i])))

}