{
library(locfit)
library(sm)

#----------------------------------------------
# MODIFIED K-NN METHOD-- FORECAST ALL YEARS
#----------------------------------------------
# 1. Use y-hat from the locfit model
#    (fpredicted.loc$fit and cpredicted.loc$fit)
# 2. The residuals from the locfit model will be used
#    (residuals(locf.model), residuals(locc.model))
# 3. Get the distance between the predicted point x values
#    and the x values for all points in the model (will get the
#    Euclidean distance of the normalized values)
# 4. Pick the K (based on a heuristic scheme) nearest neighbors
# 5. Weight the K nearest neighbors
# 6. The weighted K-NN residuals will then be used in a bootstrap

# SET UP PARAMETERS
preds=matrix(scan("predictors_knn_all.txt"),byrow=T,ncol=3) # all 3 possible predictors
x=preds[30:51,2:3]
sf=matrix(scan("seasonal_rainfall.txt"),byrow=T,ncol=2) # y-values (rainfall 1951-2001)
yfVol=sf[30:51,2]
mean=mean(yfVol) # climatological mean (shadows base mean(); used for xboot below)
sd=sd(yfVol)     # climatological sd (shadows base sd())
const=1979 # the year before the data starts
x=cbind(x[,1],x[,2]) # the two predictors used here (of the 3 possible:
                     # MAM SAT, AMJ SLP, MAM SST)
p=length(x[1,]) # p is the number of predictors
# ** NOTE: if p is not 2, the number of terms in the
# ** distance calculation below MUST be changed to match
xs=(1980-const) # the starting position of the data set (change as needed)
xe=(2001-const) # the ending position of the data set (change as needed)
yp=xe-xs+1 # yp is the number of years to predict-- forecast each year in turn
xx=scale(x) # normalize the data (so higher-magnitude variables
            # don't get more weight in the distance calculation)

# set up matrices to hold the ensemble forecast
ensemblef=matrix(nrow=250,ncol=yp)
xboot=matrix(nrow=250,ncol=yp)

## Need to make the forecast for every year, so put it all into one big loop
for(i in 1:yp)
{
  # p1 is the position of the year we're predicting
  p1=i
  xpred=x[p1,]
  yfpred=yfVol[p1]
  # drop the year being predicted, so the fit is leave-one-out
  if(p1 == xs)
  {
    xmodel=x[(xs+1):xe,]
    yfmodel=yfVol[(xs+1):xe]
  }
  if(p1 == xe)
  {
    xmodel=x[xs:(xe-1),]
    yfmodel=yfVol[xs:(xe-1)]
  }
  if(p1 != xe && p1 != xs)
  {
    xmodel=rbind(x[xs:(p1-1),],x[(p1+1):xe,])
    yfmodel=c(yfVol[xs:(p1-1)],yfVol[(p1+1):xe])
  }
  ym=length(xmodel[,1])

  # DISTANCE CALCULATION
  # calculate the distance between the (predictors of the) point we're predicting
  # and all other points-- use scaled data
  xdist=scale(xmodel)
  distance=1:ym
  for(j in 1:ym)
  {
    distance[j]=sqrt(((xx[p1,1]-xdist[j,1])^2)+((xx[p1,2]-xdist[j,2])^2))
  }

  # RANK the distances
  drank=rank(distance) # here rank 1 is the nearest neighbor
                       # (i.e. the smallest distance)

  # DETERMINE K and weight the neighbors
  n=length(distance)
  kk=sqrt(n)  # heuristic scheme: K = sqrt(n)
  kk=round(kk)
  #kk=5
  W=1:kk
  W=1/W       # the k-th nearest neighbor gets weight proportional to 1/k
  W=W/sum(W)  # normalize the weights to sum to 1
  W=cumsum(W) # cumulative weights form the sampling CDF

  # Find the alpha for the locfit model-- take the alpha with the lowest GCV
  alphaf=seq(0.35,1,by=0.05)
  xxf=gcvplot(yfmodel~xmodel,alpha=alphaf,deg=1,kern="bisq",ev="data")
  zxf=xxf$values
  zzf=order(zxf)
  alphaf=alphaf[zzf[1]]

  # Do the LOCFIT and get the expected value for each of the p points
  locf.model=locfit(yfmodel~xmodel, alpha=alphaf, deg=1, kern="bisq")
  fit=locf.model

  # Make the mean prediction
  # hack fix so predict.locfit will work: make xpred a matrix with
  # the real xpred as the first row, and take the first predicted point.
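  # Aside (an untested sketch, not part of the original script): predict()
  # should also accept the new point directly as a one-row matrix, e.g.
  #   predict(locf.model, matrix(xpred,nrow=1), se.fit=T, band="global")
  # but the rbind workaround below is kept as written.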
  xpred=rbind(xpred,xmodel)
  fpredicted.loc=predict.locfit(locf.model, xpred, se.fit=T, band="global")

  # now weight the neighbors and pick one at random (using the weights)
  # do this 250 times
  residualsf=residuals(locf.model) # get the residuals of the locfit model
  for(k in 1:250) # inner loop, 250 times (bootstrap the residuals)
  {
    rannum=runif(1,0,1)
    xy=c(rannum,W) # adds a random number (between 0 and 1) to the
                   # weight function (CDF)
    rankW=rank(xy) # assigns a rank to the random number
                   # (and the W vector)
    pos=order(drank)[rankW[1]] # gives the position in the distance matrix (and
                               # correspondingly in the y matrix) of the selected
                               # neighbor
    resids=residualsf[pos] # once we have a neighbor, find the
                           # residual associated with it
    ensemblef[k,i]=fpredicted.loc$fit[1]+resids # add that residual to the y-hat
                                                # from the locfit model
  }
  xboot[,i]=rnorm(250,mean=mean,sd=sd) # climatological normal ensemble, for reference
}

meanensemblef=matrix(apply(ensemblef,2,mean),ncol=1)
write(t(meanensemblef),file="knn_rainfall_post.out",ncol=1,append=FALSE)

#======================================================================================================
# Log Likelihood
# hist is the vector of historical values
# obsval is the observed value in any given year
# il is the upper limit for the lower category
# iu is the lower limit for the upper category
# isim is the vector of simulated values
# ncat is the number of categories (we can set it to 3 for now)

loglikf=1:yp
for(j in 1:yp)
{
  isim=ensemblef[,j]
  obsval=yfVol[j]
  # set the divisions at the 33rd and 67th percentiles of the historical data
  # (minus the year we're in)
  if(j==1) xxf=yfVol[2:yp]
  if(j==yp) xxf=yfVol[1:(yp-1)]
  if(j>1 && j<yp) xxf=c(yfVol[1:(j-1)],yfVol[(j+1):yp])
  il=quantile(xxf,1/3)
  iu=quantile(xxf,2/3)
  pclim=c(1/3,1/3,1/3) # climatological probability of each category
  # assign each simulated value to a category
  simcat=rep(0,length(isim))
  simcat[isim >= iu]=3
  simcat[isim <= il]=1
  simcat[isim > il & isim < iu]=2
  # forecast probability of each category
  probs=c(sum(simcat==1),sum(simcat==2),sum(simcat==3))/length(isim)
  # category of the observed value
  if(obsval >= iu) obscat=3
  if(obsval > il & obsval < iu) obscat=2
  if(obsval <= il) obscat=1
  loglikf[j]=probs[obscat]/pclim[obscat]
}

#======================================================================================================
# Now boxplot the log likelihood
par(mfrow=c(1,3))
boxplot(loglikf,main="Rainfall Log Likelihood",
  ylim=c(0,3),col="light grey",ylab="Score")
quf=quantile(yfVol,.90)
zzf1=loglikf[yfVol>quf]
boxplot(zzf1,main="Rainfall Log Likelihood (High)",
  ylim=c(0,3),col="light grey",ylab="Score")
title(sub="SCORE 1-3: BETTER SKILL")
qlf=quantile(yfVol,.10)
zzf2=loglikf[yfVol<qlf]
boxplot(zzf2,main="Rainfall Log Likelihood (Low)",
  ylim=c(0,3),col="light grey",ylab="Score")
title(sub="SCORE 1-3: BETTER SKILL")

#======================================================================================================
# RPSS (Ranked Probability Skill Score)
rpssf=1:yp
for(j in 1:yp)
{
  isim=ensemblef[,j]
  obsval=yfVol[j]
  if(j==1) xxf=yfVol[2:yp]
  if(j==yp) xxf=yfVol[1:(yp-1)]
  if(j>1 && j<yp) xxf=c(yfVol[1:(j-1)],yfVol[(j+1):yp])
  il=quantile(xxf,1/3)
  iu=quantile(xxf,2/3)
  pclim=cumsum(c(1/3,1/3,1/3)) # cumulative climatological probabilities
  simcat=rep(0,length(isim))
  simcat[isim >= iu]=3
  simcat[isim <= il]=1
  simcat[isim > il & isim < iu]=2
  # cumulative forecast probabilities
  probs=cumsum(c(sum(simcat==1),sum(simcat==2),sum(simcat==3))/length(isim))
  # indicator vector for the observed category, made cumulative
  obscat=c(0,0,0)
  if(obsval >= iu) obscat[3]=1
  if(obsval > il & obsval < iu) obscat[2]=1
  if(obsval <= il) obscat[1]=1
  obscat=cumsum(obscat)
  rpssf[j]=1 - (sum((obscat-probs)^2) / sum((obscat-pclim)^2))
}

#======================================================================================================
# Now boxplot the RPSS
par(mfrow=c(1,3))
boxplot(rpssf,main="Rainfall RPSS",
  ylim=c(-2,1),col="light grey",ylab="Score")
quf=quantile(yfVol,.90)
zzf3=rpssf[yfVol>quf]
boxplot(zzf3,main="Rainfall RPSS (High)",
  ylim=c(-2,1),col="light grey",ylab="Score")
title(sub="HIGHER SCORE: BETTER SKILL")
qlf=quantile(yfVol,.10)
zzf4=rpssf[yfVol<qlf]
boxplot(zzf4,main="Rainfall RPSS (Low)",
  ylim=c(-2,1),col="light grey",ylab="Score")
title(sub="HIGHER SCORE: BETTER SKILL")
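# Illustrative check of the RPSS formula above (a sketch; the demo_ names
# are not part of the analysis): a forecast that puts all of its probability
# in the observed tercile scores exactly 1, while a forecast identical to
# climatology scores 0.
demo_obs=cumsum(c(0,1,0))        # observation falls in the middle tercile
demo_fcst=cumsum(c(0,1,0))       # "perfect" forecast probabilities, cumulative
demo_clim=cumsum(c(1/3,1/3,1/3)) # climatological probabilities, cumulative
print(1 - sum((demo_obs-demo_fcst)^2)/sum((demo_obs-demo_clim)^2)) # = 1
print(1 - sum((demo_obs-demo_clim)^2)/sum((demo_obs-demo_clim)^2)) # = 0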
#======================================================================================================
# EXCEEDANCE PROBABILITY-- integrate a kernel density estimate of each
# year's ensemble between xl and xu with Gauss-Legendre quadrature
exceed=1:yp
for(k in 1:yp)
{
  # set up the density sample and the integration limits
  # (assumed here: the ensemble for year k, a normal-reference bandwidth,
  # and the observed value as the upper limit)
  data=ensemblef[,k]            # density is fit to the year-k ensemble
  n1=length(data)
  band=1.06*sd(data)*n1^(-0.2)  # normal-reference bandwidth (assumed)
  xl=min(data)-band             # lower integration limit (assumed)
  xu=yfVol[k]                   # upper integration limit (assumed)

  # Epanechnikov kernel density estimate at a point x
  integrandep=function(x)
  {
    zz=(x-data)/band
    zz[zz <= -1]=-1. # points farther than one bandwidth away
    zz[zz >= 1]=1.   # contribute zero density
    zz=0.75*(1-zz*zz)
    # For Normal Kernel
    #zz=exp(-zz*zz)
    #zz = (1/sqrt(2*pi)) * zz
    zz=sum(zz)/(n1*band)
    zz
  }

  # 10-point Gauss-Legendre quadrature of integrandep over [a,b]
  qgauscdf=function(a,b)
  {
    w=1:5
    xxy=1:5
    w[1]=0.2955242247
    w[2]=0.2692667193
    w[3]=0.2190863625
    w[4]=0.1494513491
    w[5]=0.0666713443
    xxy[1]=0.1488743389
    xxy[2]=0.4333953941
    xxy[3]=0.6794095682
    xxy[4]=0.8650633666
    xxy[5]=0.9739065285
    xm=0.5*(b+a)
    xr=0.5*(b-a)
    ss=0
    for (j in 1:5)
    {
      dx=xr*xxy[j]
      xx1=xm+dx
      xx2=xm-dx
      ss=ss+w[j]*(integrandep(xx1)+integrandep(xx2))
    }
    ss=xr*ss
    ss
  }

  exceed[k]=1-qgauscdf(xl,xu) # probability of exceeding xu
}

#======================================================================================================
# PDF of observed vs. forecast rainfall
hist(yfVol,probability=T,xlab="ASO Rainfall (mm)",ylab="PDF",
  main="PDF of Rainfall (Post 1980)",col="light gray",ylim=range(0,0.005))
meansim=apply(ensemblef,2,mean)
sm.density(meansim,col="red",add=T)
sm.density(yfVol,add=T)
y=c(mean(meansim),mean(yfVol))
legend(700,0.005,c("Observed Rainfall","Forecast Rainfall"),lty=c(1,1),
  col=c("black","red"),cex=0.85)
title(sub="MJJ SLP, MAM SST",cex.sub=0.85)
abline(v=y[1],lty=2,col="red")
abline(v=y[2],lty=2)
points(yfVol,rep(0,yp),pch=20)
par(mfrow=c(1,1))
}
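
#======================================================================================================
# Illustrative sketch (not part of the original analysis): steps 4-6 of the
# method above, distilled into a standalone helper. Given a locfit mean
# forecast, the leave-one-out residuals, and the predictor distances, it
# draws one bootstrap ensemble for a single year. The name knn_resample and
# its arguments are hypothetical; the 1/k weighting matches the scheme above.
knn_resample=function(yhat, resid, distance, nboot=250)
{
  kk=round(sqrt(length(distance)))          # heuristic K = sqrt(n), as above
  W=(1/(1:kk))/sum(1/(1:kk))                # k-th nearest neighbor gets weight 1/k
  nbrs=order(distance)[1:kk]                # indices of the K nearest neighbors
  pick=sample(nbrs,nboot,replace=T,prob=W)  # weighted resampling of neighbors
  yhat + resid[pick]                        # ensemble = y-hat + resampled residuals
}
# e.g. knn_resample(fpredicted.loc$fit[1], residualsf, distance) should
# reproduce one column of ensemblef up to sampling noise.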