{
library(locfit)
library(sm)

#----------------------------------------------
# MODIFIED K-NN METHOD-- FORECAST ALL YEARS
#----------------------------------------------
# 1. Use y-hat from the locfit model
#    (fpredicted.loc$fit and cpredicted.loc$fit)
# 2. The residuals from the locfit model will be used
#    (residuals(locf.model), residuals(locc.model))
# 3. Get the distance between the predicted point x values
#    and the x values for all points in the model (will get the
#    Euclidean distance of the normalized values)
# 4. Pick the K (based on a heuristic scheme) nearest neighbors
# 5. Weight the K nearest neighbors
# 6. The weighted K-NN residuals will then be used in a bootstrap

# SET UP PARAMETERS
preds=matrix(scan("predictors_knn_all.txt"),byrow=T,ncol=3) # all 3 possible predictors
x=preds[30:51,2:3]
sf=matrix(scan("seasonal_rainfall.txt"),byrow=T,ncol=2) # y-values (rainfall 1951-2001)
yfVol=sf[30:51,2]
mean=mean(yfVol) # climatological mean (shadows base mean(); used for xboot below)
sd=sd(yfVol)     # climatological sd (shadows base sd())
const=1979 # the year before the data starts
x=cbind(x[,1],x[,2]) # the two predictors used here (of the 3 possible:
                     # MAM SAT, AMJ SLP, MAM SST)
p=length(x[1,]) # p is the number of predictors
# ** NOTE: if p is not 2, the number of terms in the
# ** distance calculation below MUST be changed to match
xs=(1980-const) # the starting position of the data set (change as needed)
xe=(2001-const) # the ending position of the data set (change as needed)
yp=xe-xs+1 # yp is the number of years to predict-- forecast each year in turn
xx=scale(x) # normalize the data (so higher-magnitude variables
            # don't get more weight in the distance calculation)

# set up matrices to hold the ensemble forecast
ensemblef=matrix(nrow=250,ncol=yp)
xboot=matrix(nrow=250,ncol=yp)

## Need to make the forecast for every year, so put it all into one big loop
for(i in 1:yp)
{
  # p1 is the position of the year we're predicting
  p1=i
  xpred=x[p1,]
  yfpred=yfVol[p1]
  # drop the year being predicted, so the fit is leave-one-out
  if(p1 == xs)
  {
    xmodel=x[(xs+1):xe,]
    yfmodel=yfVol[(xs+1):xe]
  }
  if(p1 == xe)
  {
    xmodel=x[xs:(xe-1),]
    yfmodel=yfVol[xs:(xe-1)]
  }
  if(p1 != xe && p1 != xs)
  {
    xmodel=rbind(x[xs:(p1-1),],x[(p1+1):xe,])
    yfmodel=c(yfVol[xs:(p1-1)],yfVol[(p1+1):xe])
  }
  ym=length(xmodel[,1])

  # DISTANCE CALCULATION
  # calculate the distance between the (predictors of the) point we're predicting
  # and all other points-- use scaled data
  xdist=scale(xmodel)
  distance=1:ym
  for(j in 1:ym)
  {
    distance[j]=sqrt(((xx[p1,1]-xdist[j,1])^2)+((xx[p1,2]-xdist[j,2])^2))
  }

  # RANK the distances
  drank=rank(distance) # here rank 1 is the nearest neighbor
                       # (i.e. the smallest distance)

  # DETERMINE K and weight the neighbors
  n=length(distance)
  kk=sqrt(n)  # heuristic scheme: K = sqrt(n)
  kk=round(kk)
  #kk=5
  W=1:kk
  W=1/W       # the k-th nearest neighbor gets weight proportional to 1/k
  W=W/sum(W)  # normalize the weights to sum to 1
  W=cumsum(W) # cumulative weights form the sampling CDF

  # Find the alpha for the locfit model-- take the alpha with the lowest GCV
  alphaf=seq(0.35,1,by=0.05)
  xxf=gcvplot(yfmodel~xmodel,alpha=alphaf,deg=1,kern="bisq",ev="data")
  zxf=xxf$values
  zzf=order(zxf)
  alphaf=alphaf[zzf[1]]

  # Do the LOCFIT and get the expected value for each of the p points
  locf.model=locfit(yfmodel~xmodel, alpha=alphaf, deg=1, kern="bisq")
  fit=locf.model

  # Make the mean prediction
  # hack fix so predict.locfit will work: make xpred a matrix with
  # the real xpred as the first row, and take the first predicted point.
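  # Aside (an untested sketch, not part of the original script): predict()
  # should also accept the new point directly as a one-row matrix, e.g.
  #   predict(locf.model, matrix(xpred,nrow=1), se.fit=T, band="global")
  # but the rbind workaround below is kept as written.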
  xpred=rbind(xpred,xmodel)
  fpredicted.loc=predict.locfit(locf.model, xpred, se.fit=T, band="global")

  # now weight the neighbors and pick one at random (using the weights)
  # do this 250 times
  residualsf=residuals(locf.model) # get the residuals of the locfit model
  for(k in 1:250) # inner loop, 250 times (bootstrap the residuals)
  {
    rannum=runif(1,0,1)
    xy=c(rannum,W) # adds a random number (between 0 and 1) to the
                   # weight function (CDF)
    rankW=rank(xy) # assigns a rank to the random number
                   # (and the W vector)
    pos=order(drank)[rankW[1]] # gives the position in the distance matrix (and
                               # correspondingly in the y matrix) of the selected
                               # neighbor
    resids=residualsf[pos] # once we have a neighbor, find the
                           # residual associated with it
    ensemblef[k,i]=fpredicted.loc$fit[1]+resids # add that residual to the y-hat
                                                # from the locfit model
  }
  xboot[,i]=rnorm(250,mean=mean,sd=sd) # climatological normal ensemble, for reference
}

meanensemblef=matrix(apply(ensemblef,2,mean),ncol=1)
write(t(meanensemblef),file="knn_rainfall_post.out",ncol=1,append=FALSE)

#======================================================================================================
# Log Likelihood
# hist is the vector of historical values
# obsval is the observed value in any given year
# il is the upper limit for the lower category
# iu is the lower limit for the upper category
# isim is the vector of simulated values
# ncat is the number of categories (we can set it to 3 for now)

loglikf=1:yp
for(j in 1:yp)
{
  isim=ensemblef[,j]
  obsval=yfVol[j]
  # set the divisions at the 33rd and 67th percentiles of the historical data
  # (minus the year we're in)
  if(j==1) xxf=yfVol[2:yp]
  if(j==yp) xxf=yfVol[1:(yp-1)]
  if(j>1 && j<yp) xxf=c(yfVol[1:(j-1)],yfVol[(j+1):yp])
  il=quantile(xxf,1/3)
  iu=quantile(xxf,2/3)
  pclim=c(1/3,1/3,1/3) # climatological probability of each category
  # assign each simulated value to a category
  simcat=rep(0,length(isim))
  simcat[isim >= iu]=3
  simcat[isim <= il]=1
  simcat[isim > il & isim < iu]=2
  # forecast probability of each category
  probs=c(sum(simcat==1),sum(simcat==2),sum(simcat==3))/length(isim)
  # category of the observed value
  if(obsval >= iu) obscat=3
  if(obsval > il & obsval < iu) obscat=2
  if(obsval <= il) obscat=1
  loglikf[j]=probs[obscat]/pclim[obscat]
}

#======================================================================================================
# Now boxplot the log likelihood
par(mfrow=c(1,3))
boxplot(loglikf,main="Rainfall Log Likelihood",
  ylim=c(0,3),col="light grey",ylab="Score")
quf=quantile(yfVol,.90)
zzf1=loglikf[yfVol>quf]
boxplot(zzf1,main="Rainfall Log Likelihood (High)",
  ylim=c(0,3),col="light grey",ylab="Score")
title(sub="SCORE 1-3: BETTER SKILL")
qlf=quantile(yfVol,.10)
zzf2=loglikf[yfVol<qlf]
boxplot(zzf2,main="Rainfall Log Likelihood (Low)",
  ylim=c(0,3),col="light grey",ylab="Score")
title(sub="SCORE 1-3: BETTER SKILL")

#======================================================================================================
# RPSS (Ranked Probability Skill Score)
rpssf=1:yp
for(j in 1:yp)
{
  isim=ensemblef[,j]
  obsval=yfVol[j]
  if(j==1) xxf=yfVol[2:yp]
  if(j==yp) xxf=yfVol[1:(yp-1)]
  if(j>1 && j<yp) xxf=c(yfVol[1:(j-1)],yfVol[(j+1):yp])
  il=quantile(xxf,1/3)
  iu=quantile(xxf,2/3)
  pclim=cumsum(c(1/3,1/3,1/3)) # cumulative climatological probabilities
  simcat=rep(0,length(isim))
  simcat[isim >= iu]=3
  simcat[isim <= il]=1
  simcat[isim > il & isim < iu]=2
  # cumulative forecast probabilities
  probs=cumsum(c(sum(simcat==1),sum(simcat==2),sum(simcat==3))/length(isim))
  # indicator vector for the observed category, made cumulative
  obscat=c(0,0,0)
  if(obsval >= iu) obscat[3]=1
  if(obsval > il & obsval < iu) obscat[2]=1
  if(obsval <= il) obscat[1]=1
  obscat=cumsum(obscat)
  rpssf[j]=1 - (sum((obscat-probs)^2) / sum((obscat-pclim)^2))
}

#======================================================================================================
# Now boxplot the RPSS
par(mfrow=c(1,3))
boxplot(rpssf,main="Rainfall RPSS",
  ylim=c(-2,1),col="light grey",ylab="Score")
quf=quantile(yfVol,.90)
zzf3=rpssf[yfVol>quf]
boxplot(zzf3,main="Rainfall RPSS (High)",
  ylim=c(-2,1),col="light grey",ylab="Score")
title(sub="HIGHER SCORE: BETTER SKILL")
qlf=quantile(yfVol,.10)
zzf4=rpssf[yfVol<qlf]
boxplot(zzf4,main="Rainfall RPSS (Low)",
  ylim=c(-2,1),col="light grey",ylab="Score")
title(sub="HIGHER SCORE: BETTER SKILL")
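# Illustrative check of the RPSS formula above (a sketch; the demo_ names
# are not part of the analysis): a forecast that puts all of its probability
# in the observed tercile scores exactly 1, while a forecast identical to
# climatology scores 0.
demo_obs=cumsum(c(0,1,0))        # observation falls in the middle tercile
demo_fcst=cumsum(c(0,1,0))       # "perfect" forecast probabilities, cumulative
demo_clim=cumsum(c(1/3,1/3,1/3)) # climatological probabilities, cumulative
print(1 - sum((demo_obs-demo_fcst)^2)/sum((demo_obs-demo_clim)^2)) # = 1
print(1 - sum((demo_obs-demo_clim)^2)/sum((demo_obs-demo_clim)^2)) # = 0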
#======================================================================================================
# EXCEEDANCE PROBABILITY-- integrate a kernel density estimate of each
# year's ensemble between xl and xu with Gauss-Legendre quadrature
exceed=1:yp
for(k in 1:yp)
{
  # set up the density sample and the integration limits
  # (assumed here: the ensemble for year k, a normal-reference bandwidth,
  # and the observed value as the upper limit)
  data=ensemblef[,k]            # density is fit to the year-k ensemble
  n1=length(data)
  band=1.06*sd(data)*n1^(-0.2)  # normal-reference bandwidth (assumed)
  xl=min(data)-band             # lower integration limit (assumed)
  xu=yfVol[k]                   # upper integration limit (assumed)

  # Epanechnikov kernel density estimate at a point x
  integrandep=function(x)
  {
    zz=(x-data)/band
    zz[zz <= -1]=-1. # points farther than one bandwidth away
    zz[zz >= 1]=1.   # contribute zero density
    zz=0.75*(1-zz*zz)
    # For Normal Kernel
    #zz=exp(-zz*zz)
    #zz = (1/sqrt(2*pi)) * zz
    zz=sum(zz)/(n1*band)
    zz
  }

  # 10-point Gauss-Legendre quadrature of integrandep over [a,b]
  qgauscdf=function(a,b)
  {
    w=1:5
    xxy=1:5
    w[1]=0.2955242247
    w[2]=0.2692667193
    w[3]=0.2190863625
    w[4]=0.1494513491
    w[5]=0.0666713443
    xxy[1]=0.1488743389
    xxy[2]=0.4333953941
    xxy[3]=0.6794095682
    xxy[4]=0.8650633666
    xxy[5]=0.9739065285
    xm=0.5*(b+a)
    xr=0.5*(b-a)
    ss=0
    for (j in 1:5)
    {
      dx=xr*xxy[j]
      xx1=xm+dx
      xx2=xm-dx
      ss=ss+w[j]*(integrandep(xx1)+integrandep(xx2))
    }
    ss=xr*ss
    ss
  }

  exceed[k]=1-qgauscdf(xl,xu) # probability of exceeding xu
}

#======================================================================================================
# PDF of observed vs. forecast rainfall
hist(yfVol,probability=T,xlab="ASO Rainfall (mm)",ylab="PDF",
  main="PDF of Rainfall (Post 1980)",col="light gray",ylim=range(0,0.005))
meansim=apply(ensemblef,2,mean)
sm.density(meansim,col="red",add=T)
sm.density(yfVol,add=T)
y=c(mean(meansim),mean(yfVol))
legend(700,0.005,c("Observed Rainfall","Forecast Rainfall"),lty=c(1,1),
  col=c("black","red"),cex=0.85)
title(sub="MJJ SLP, MAM SST",cex.sub=0.85)
abline(v=y[1],lty=2,col="red")
abline(v=y[2],lty=2)
points(yfVol,rep(0,yp),pch=20)
par(mfrow=c(1,1))
}
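
#======================================================================================================
# Illustrative sketch (not part of the original analysis): steps 4-6 of the
# method above, distilled into a standalone helper. Given a locfit mean
# forecast, the leave-one-out residuals, and the predictor distances, it
# draws one bootstrap ensemble for a single year. The name knn_resample and
# its arguments are hypothetical; the 1/k weighting matches the scheme above.
knn_resample=function(yhat, resid, distance, nboot=250)
{
  kk=round(sqrt(length(distance)))          # heuristic K = sqrt(n), as above
  W=(1/(1:kk))/sum(1/(1:kk))                # k-th nearest neighbor gets weight 1/k
  nbrs=order(distance)[1:kk]                # indices of the K nearest neighbors
  pick=sample(nbrs,nboot,replace=T,prob=W)  # weighted resampling of neighbors
  yhat + resid[pick]                        # ensemble = y-hat + resampled residuals
}
# e.g. knn_resample(fpredicted.loc$fit[1], residualsf, distance) should
# reproduce one column of ensemblef up to sampling noise.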