# Remove every (non-hidden) object from the current environment before running
rm(list = ls())

library(dplyr)

# Forecast table: comma-separated text file with a header row
apr1Fcst <- read.table(
  "K:/My Drive/Phd Research/CRB Midterm Temperature Perturbed Predictions/Code/Phase 1/bdlm/Apr1Forecast.txt",
  header = TRUE,
  sep = ","
)

# The hold-out split and the two functions below could be looped so that the
# procedure is repeated over the entire period of record

# Hold out the year 2000: every other row is used for training
df.train <- subset(apr1Fcst, wyears != 2000) # training dataset
preds <- subset(apr1Fcst, wyears == 2000) # year to be predicted

# Rank the rows of a training set by Euclidean distance to a target row.
#
# Columns 1 and 2 of both inputs are skipped (year and flow in this script);
# every remaining column of df.train is treated as a predictor and is matched
# to the column in the same position of preds.
#
# Args:
#   df.train: data frame of candidate neighbors, one row per year, with at
#             least one predictor column after the first two columns.
#   preds:    data frame laid out like df.train; only its first row is used.
#   k:        number of neighbors to keep. Defaults to every training row
#             (useful if bootstrapping later); ceiling(sqrt(nrow(df.train)))
#             is the usual choice if not bootstrapping. Values larger than
#             nrow(df.train) are capped.
#
# Returns:
#   df.train with an added `dist` column, reduced to the k closest rows and
#   sorted by ascending distance. Tied distances keep their original row
#   order, exactly k rows are returned, and row names are reset to 1..k.
#
# NOTE(review): predictors are used as-is (no standardization), so columns
# with larger magnitudes dominate the distance -- confirm this is intended.
knn <- function(df.train, preds, k = nrow(df.train)) {

  # number of predictors (everything after the year and flow columns)
  np <- ncol(df.train) - 2
  if (np < 1) {
    stop("df.train needs at least one predictor column after the first two",
         call. = FALSE)
  }
  if (ncol(preds) < ncol(df.train)) {
    stop("preds must have at least as many columns as df.train", call. = FALSE)
  }
  if (nrow(preds) < 1) {
    stop("preds must contain at least one row", call. = FALSE)
  }
  if (!is.numeric(k) || length(k) != 1 || is.na(k) || k < 0) {
    stop("k must be a single non-negative number", call. = FALSE)
  }

  # drop = FALSE keeps the data-frame shape even with a single predictor;
  # without it the selection collapses to a vector and 2-d indexing fails
  pred.cols <- 2 + seq_len(np)
  train.x <- as.matrix(df.train[, pred.cols, drop = FALSE])
  target <- unlist(preds[1, pred.cols, drop = FALSE], use.names = FALSE)

  if (!(is.numeric(train.x) || is.logical(train.x)) ||
      !(is.numeric(target) || is.logical(target))) {
    stop("predictor columns must be numeric", call. = FALSE)
  }

  # squared difference per predictor, then Euclidean distance per row (year)
  sq.diff <- sweep(train.x, 2, target, "-")^2
  df.train$dist <- unname(sqrt(rowSums(sq.diff)))

  # keep the k smallest distances; order() is stable, so ties cannot inflate
  # the result beyond k rows and are resolved by original row order
  k <- as.integer(min(k, nrow(df.train)))
  neighbs <- df.train[order(df.train$dist)[seq_len(k)], , drop = FALSE]
  rownames(neighbs) <- NULL

  neighbs
}

# neighbors of the held-out year, taken from the training rows
neighbs <- knn(df.train, preds)


#bootstrapping function (feed with neighbors and desired ensemble size)
# ensemble size, i.e. how many flows are drawn in the bootstrap
nsim <- 250

# Bootstrap an ensemble of flows from a set of neighbors.
#
# Each neighbor is drawn with probability proportional to 1 / rank, where
# rank is its row position in `neighbs` (row 1 gets the largest weight).
# The weights depend only on row order, not on the distance values, so
# `neighbs` must already be sorted from nearest to farthest.
#
# Args:
#   neighbs: data frame with a `q_maf` column and at least one row, sorted by
#            ascending distance.
#   nsim:    number of flows to draw (ensemble size).
#
# Returns:
#   Vector of length nsim sampled with replacement from neighbs$q_maf.
neighbor_bootstrapping <- function(neighbs, nsim) {

  flows <- neighbs[["q_maf"]]
  if (is.null(flows)) {
    stop("neighbs must contain a q_maf column", call. = FALSE)
  }

  k <- nrow(neighbs)
  if (k < 1) {
    stop("neighbs must contain at least one row", call. = FALSE)
  }

  # inverse-rank weights for the k neighbors, normalized to sum to 1
  rank.wts <- 1 / seq_len(k)
  knn.wts <- rank.wts / sum(rank.wts)

  # sample row indices rather than the values themselves: sample(x, ...)
  # treats a single number x >= 1 as 1:x, which breaks the one-neighbor case
  picks <- sample.int(k, nsim, replace = TRUE, prob = knn.wts)

  flows[picks]
}

# draw the bootstrapped ensemble of flows from the neighbors
sims <- neighbor_bootstrapping(neighbs, nsim)


# boxplot of the ensemble, with the held-out year's q_maf overlaid in red
boxplot(sims, ylab = "Spring Flow (MAF)")
points(preds$q_maf, pch = 16, col = "red")