# K-nearest-neighbour (KNN) resampling forecast of spring flow.
#
# Workflow: hold one water year out of the April-1 forecast table, rank the
# remaining years by Euclidean distance in covariate space, then bootstrap an
# ensemble of flows from those neighbours with rank-based weights.
#
# NOTE: the original script began with `rm(list = ls())`. It was removed because
# every object used below is defined in this script; restart the R session if a
# clean workspace is needed.

library(dplyr)

apr1Fcst <- read.table(
  "K:/My Drive/Phd Research/CRB Midterm Temperature Perturbed Predictions/Code/Phase 1/bdlm/Apr1Forecast.txt",
  header = TRUE,
  sep = ","
)

# The split below could be looped over every year in the period of record
# (leave-one-year-out); here a single year is held out.
holdout_year <- 2000
df.train <- subset(apr1Fcst, wyears != holdout_year) # training dataset
preds <- subset(apr1Fcst, wyears == holdout_year)    # year to be predicted

# Find the k nearest neighbours of the year to be predicted.
#
# Arguments:
#   df.train  data frame of training years. The first two columns (year and
#             flow, per the original comments) are excluded from the distance;
#             every remaining column is used as a covariate.
#   preds     data frame laid out like df.train; only its first row is used.
#   k         number of neighbours to return. The default, nrow(df.train),
#             keeps every training year (useful when bootstrapping afterwards);
#             ceiling(sqrt(nrow(df.train))) is the usual choice otherwise.
#
# Returns df.train with an added `dist` column (Euclidean distance to the
# prediction year), restricted to the k closest rows and ordered nearest first.
#
# NOTE(review): covariates enter the distance unscaled - confirm that they are
# on comparable scales (or standardised upstream).
knn <- function(df.train, preds, k = nrow(df.train)) {
  np <- ncol(df.train) - 2 # number of predictors
  if (np < 1) {
    stop("df.train must contain at least one covariate column", call. = FALSE)
  }
  if (nrow(preds) < 1) {
    stop("preds must contain the year to be predicted", call. = FALSE)
  }
  if (!is.numeric(k) || length(k) != 1L || is.na(k) || k < 1) {
    stop("k must be a single number >= 1", call. = FALSE)
  }

  # drop = FALSE keeps the data frame / matrix shape when there is only one
  # covariate; without it the subset collapses to a plain vector.
  train_x <- as.matrix(df.train[, -(1:2), drop = FALSE])
  target <- unlist(preds[1, -(1:2), drop = FALSE])

  # Squared difference per covariate (columns), then the Euclidean distance
  # for each training year (rows).
  sq_diff <- sweep(train_x, 2, target)^2
  df.train$dist <- sqrt(rowSums(sq_diff))

  # with_ties = FALSE guarantees exactly k rows even when distances tie.
  slice_min(df.train, order_by = dist, n = k, with_ties = FALSE)
}

neighbs <- knn(df.train, preds)

# Bootstrap an ensemble of flows from the neighbours.
#
# Arguments:
#   neighbs  data frame of neighbours with a `q_maf` column, ordered nearest
#            first (as returned by knn()).
#   nsim     desired ensemble size.
#
# Returns a vector of nsim flows drawn with replacement. The j-th nearest
# neighbour is drawn with probability proportional to 1 / j, i.e. the weights
# depend on the neighbour's rank, not on the distance itself.
neighbor_bootstrapping <- function(neighbs, nsim) {
  k <- nrow(neighbs)
  if (k < 1) {
    stop("neighbs must contain at least one neighbour", call. = FALSE)
  }

  ranks <- seq_len(k)
  knn.wts <- (1 / ranks) / sum(1 / ranks)

  # Sample row indices rather than the flows themselves: sample(x, ...) with a
  # length-one numeric x draws from 1:x, which breaks when k == 1.
  picks <- sample.int(k, size = nsim, replace = TRUE, prob = knn.wts)
  neighbs$q_maf[picks]
}

nsim <- 250
sims <- neighbor_bootstrapping(neighbs, nsim)

# Ensemble spread, with the observed flow of the held-out year in red.
boxplot(sims, ylab = "Spring Flow (MAF)")
points(preds$q_maf, col = "red", pch = 16)