library(locfit) swe=matrix(scan("Colorado-April1SWE-1949-99.txt"),ncol=46, byrow=T) #swe=scale(swe) flows=matrix(scan("AMJJFLOW-49-99.txt"),ncol=6,byrow=T) flows=flows/(10^6) #Million cubic meter flowmeans=colMeans(flows) flowsds=apply(flows,2,sd) #flows=scale(flows) zswe=prcomp(swe) (zswe$sd^2) / sum(zswe$sd^2) zflows=prcomp(flows) (zflows$sd^2) / sum(zflows$sd^2) flowpcs = zflows$x #this is a 6 column matrix - first col. is first PC etc. floweofs = zflows$u #6 X 6 matrix - first column is first EOF etc. #----- SVD ---- C=var(flows,swe) #note that the variable with fewer columns should be first zz=svd(C) #f1 and g1 are the matrix of time coefficients just like the #PCS f1=flows %*% zz$u g1=swe %*% zz$v # f1 %*% t(zz$u) will result in the flows. # and of course, t(zz$u) %*% zz$u will result in an identfy matrix I # similarly for g1 #f1[,1] and g1[,1] will have the 'highest' joint covariance and # so on.. Also means, that f1[,1] is highly related to g1[,1] # here it is similar to CCA. #hetrogenous correlation pattern. First time #coefficient of swe is correlated with the flows data and viceversa J=min(length(swe[1,]), length(flows[1,])) plot(1:J, ((zz$d)^2)/sum((zz$d)^2), xlab="Modes", ylab="Sq. Covariance", col="red") # Notice that the first mode captures 'almost' everything of the joint # covariance. AS to be expected.. corrpats1=cor(f1[,1],swe) corrpatf1=cor(g1[,1],flows) #You can do a timeseries plot of the corrpatf1 and corrpats1 to get #the hetrogeneous correlation maps.. par(mfrow=c(2,1)) plot(1:length(corrpatf1), corrpatf1, xlab="Flow Locs.", ylab="Correlation") title(main="First TC of SWE correlated with Flows") plot(1:length(corrpats1), corrpats1, xlab="SWE Locs.", ylab="Correlation") title(main="First TC of Flow correlated with SWE") #-------------------- #To predict - we see that only the first mode is the significant one # the rest is pretty much noise.. # using linear regression.. # in a cross validated mode. that is, drop a year and predict it.. # say we want to predict the first year. N=length(flows[,1]) index=1:N nsim=100 ypred=array(0,c(J,nsim,N)) yy=1:J for(i in 1:N){ #drop a point.. index1=index[index != i] #get the rest of the data.. swecv=swe[index1,] flowscv=flows[index1,] #perform SVD C=var(flowscv,swecv) #note that the variable with fewer columns should be first zz=svd(C) #f1 and g1 are the matrix of time coefficients just like the #PCS f1=flowscv %*% zz$u g1=swecv %*% zz$v zmodel=lsfit(g1[,1],f1[,1]) stderror1= sum((zmodel$resid)^2) / (length(f1[,1]) - 2) #prediction error.. Sxx = sum((g1[,1] - mean(g1[,1]))^2) #the TCs of the dropped point.. xp=swe[i,] %*% zz$v #generate an ensemble.. for(isim in 1:nsim){ stderror = stderror1*(1 + (1/length(f1[,1])) + ((xp[1] - mean(g1[,1]))^2)/Sxx) stderror = sqrt(stderror) yy[1]=zmodel$coef[1] + zmodel$coef[2]*xp[1] + rnorm(1,0,stderror) #grab random TCs for the remaining TCs for(jj in 2:J){ jj1 = round(runif(1,1,(N-1))) yy[jj]=f1[jj1,jj]} ypred[1:J,isim,i]=yy %*% t(zz$u) } } # Unscale the data and the predictions...... #for(i in 1:6){ #ypred[i,,]=ypred[i,,]*flowsds[i] + flowmeans[i] #zz=flows[,i]*flowsds[i] + flowmeans[i] #flows[,i]=zz} #plot the ensemble forecast along with the observed flows at the locations.. #say we plot at the first three locations.. par(mfrow=c(3,1)) for(i in 1:3){ #set the negative predicted values to 0.. ypred[i,,][ypred[i,,] < 0]=0 xs=1:N xb=as.data.frame(t(ypred[i,,])) zz=boxplot(split(xb,xs), plot=F, cex=1.0) zz$names=rep("",length(zz$names)) z1=bxp(zz,ylim=range(ypred[i,,], flows[,i]), xlab="", ylab="Flow anomalies", cex=1.0) axis(1,at=z1, labels=c(1949:1999), cex=1.0) lines(z1, flows[,i], col="red", cex=1.0) abline(h=median(flows[,i]), lty=2) abline(h=quantile(flows[,i], c(0.9)), lty=2) abline(h=quantile(flows[,i], c(0.1)), lty=2) box() title(main=c("Boxplot of cross-validated forecasts at Station ",i)) } #### RPSSS for(i in 1:J){ thresh=quantile(flows[,i], c(0.33, 0.66)) # climatological forecast.. climo = c(1/3, 1/3, 1/3) climo=cumsum(climo) rpss = 1:N for(jj in 1:N){ # forecast categorical probabilities fcastprob=1:3 yypred=ypred[i,,jj] fcastprob[1] = length(yypred[yypred <= thresh[1]]) / nsim fcastprob[2] = length(yypred[yypred > thresh[1] & yypred < thresh[2]]) / nsim fcastprob[3] = length(yypred[yypred >= thresh[2]]) / nsim fcastprob=cumsum(fcastprob) # actual.. actual=rep(0,3) if(flows[jj,i] <= thresh[1])actual[1]=1 if(flows[jj,i] > thresh[1] & flows[jj,i] < thresh[2])actual[2]=1 if(flows[jj,i] >= thresh[2])actual[3]=1 rpsclimo = sum((climo-actual)^2) rpsfcast = sum((fcastprob - actual)^2) rpss[jj] = 1 - (rpsfcast/rpsclimo) } #print out the median RPSS print(c("RPSS of station ",i, median(rpss))) } #you can also plot the mean forecast with the observed flows at #the six locations.. for(i in 1:J){ zz=apply(ypred[i,,],2,mean) plot(flows[,i],zz, xlab="Observed flows", xlim=range(zz,flows[,i]), ylim=range(zz,flows[,i]), ylab="Mean Predicted flows") lines(flows[,i],flows[,i]) title(main= c("Station ",i)) print(c("Correlation of Observed and Predicted Flows - Stn. ",i, cor(zz, flows[,i]))) }