HW3_Library

Load Packages

library(extRemes) # to fit stationary and nonstationary GEV and GEV diagnostics
library(tidyverse) # GGplot2 and dplyr to organize and pipe raw data
library(reshape2) # I use this for melt command
library(fields)   # spatial model
library(sm)
library(dplyr)
library(maps)          #for state borders
library(ggplot2)      # for plotting spatial map
library(extRemes)     #for fit the GEV
library(fExtremes)    # for getting the GEV quantiles
library(geoR)         # for use as.geodata
library(spBayes)     # for bayesian kriging
library(latex2exp)    #  LaTeX math formulas to R's
library(RColorBrewer) # Color schemes for maps
library(data.table)   #Fast aggregation of large data
library(gstat)        # to Create gstat objects and variogram()
library(sp)           # for using coordinates
library(verification) # compute RPSS
library(gcmr)
library(HiddenMarkov)
library(MASS)
library(ggthemes)
library(data.table)
library(leaps) # to provide combinations
library(MPV) # for calculating PRESS

KNN-Bootstrapping

knn = function(df.train, preds, bootstrap = T){
  
  #remove year and flow from predictor input so just covariates remain
  preds = preds[,-c(1:2)]
  
  #choose k to either be the length of full dataset (useful if bootstrapping later) or the optimal sqrt(n) (use if not bootstrapping)
  #could also choose some other user defined value for k
  if (bootstrap == T ){
    k = nrow(df.train)
  } else {
    k = ceiling(sqrt(nrow(df.train)))
  }
  
  #number of predictors
  np = ncol(df.train) - 2
  
  #initialize distance matrix
  dist.mat = matrix(NA, nrow = nrow(df.train), ncol = np)
  
  #calc distance for each neighbor (loop thru each predictor)
  for(i in 1:np){
    dist.mat[,i] = (preds[1,i] - df.train[,i+2])^2
  }
  
  #calc Euclidean distance for each row (year) by summing distances of each predictor
  df.train$dist = sqrt(apply(dist.mat, 1, sum))
  
  #select the k smallest distances to return k nearest neighbors
  neighbs = slice_min(df.train, order_by = dist, n = k)
  
  return(neighbs)
  
}    

neighbor_bootstrapping = function(neighbs, nsim){
  
  # calc inverse distance weights for k-neighbors
  k = nrow(neighbs)
  knn.wts = (1/(1:k))/sum(1/(1:k))
  
  # bootstrap simulated flows from neighbors based on IDW scheme
  sims = sample(neighbs$q_maf, nsim, replace = T, prob = knn.wts)
  
  return(sims)
}

HMM

# dthmm
dthmm <- 
  function (x, Pi, delta, distn, pm, pn = NULL, discrete = NULL, 
            nonstat = TRUE) 
  {
    if (is.null(discrete)) {
      if (distn == "beta" | distn == "exp" | distn == "gamma" | 
          distn == "lnorm" | distn == "logis" | distn == "norm") 
        discrete <- FALSE
      else if (distn == "binom" | distn == "pois") 
        discrete <- TRUE
      else stop("parameter discrete must be used when applying user distributions")
    }
    y <- c(list(x = x, Pi = Pi, delta = delta, distn = distn, 
                pm = pm, pn = pn, discrete = discrete, nonstat = nonstat))
    class(y) <- "dthmm"
    return(y)
  }

get.named.parlist <- function(x,m,dist,ic,...){
  require(MASS)
  fit <- fitdistr(x,dist,...)
  np <- length(fit$estimate)
  pars <- vector('list',np)
  names(pars) <- names(fit$estimate)
  
  init <- lapply(fit$estimate,max)
  names(init) <- names(fit$estimate)
  
  
  for(j in 1:m){
    #print(j)
    #browser()
    
    #browser()
    this.fit <- fitdistr(x[ntile.ts(x,m) == (j-1)],dist,init,...)
    #for(k in 1:np)
    # pars[[k]][j] <- this.fit$estimate[k]
    for(k in 1:np)
      pars[[k]][j] <- fit$estimate[k]
    if(dist == 'normal'){
      if(ic == 'same.both'){
        pars[[k]][j] <- mean(x)
        pars[[k]][j] <- sd(x)
      } else if( ic == 'same.sd'){
        pars[[k]][j] <- mean(x[ntile.ts(x,m) == (j-1)])
        pars[[k]][j] <- sd(x)
      }else{
        pars[[k]][j] <- mean(x[ntile.ts(x,m) == (j-1)])
        pars[[k]][j] <- sd(x[ntile.ts(x,m) == (j-1)])
      }
    }
  }
  pars
}

Pi_init <- function(n,type='uniform'){
  matrix(rep(1/n,n^2),n)
}

delta_init <- function(n, type='uniform'){
  d <- rnorm(n)^2
  d/sum(d)
}

ntile.ts <- 
  function(x, n, limit.type = 'prob', tie = 1, altobs = NULL ){
    # returns an integer vector corresponding to n states broken by equal 
    # probability or equal distance
    #
    limit <- 
      if(limit.type == 'prob') 
        quantile(x,seq(0,1,1/n))
    else if(limit.type == 'equal')
      seq(min(x),max(x),by=diff(range(x))/n)
    
    if(!is.null(altobs)) limit <- quantile(altobs,seq(0,1,1/n))
    
    b <- integer(length(x))
    
    for(i in 1:(n+1)){
      filter <- 
        if(tie == 1) 
          x >= limit[i] & x <= limit[i+1]
      else 
        x > limit[i] & x <= limit[i+1]
      
      #only need to set the 1's because b is already 0's
      b[filter] <- as.integer(i-1)
    }
    
    if(class(x) == 'ts') 
      return(ts(b,start=start(x),end=end(x))) 
    else 
      return(b)
  }

# AIC (or BIC) for HMM
AIC_hmm=function(hmm, k=2){
  LL=hmm$LL
  n=length(hmm$x)
  
  n_pdf=length(hmm$pm) # number of parameters to fit one marginal distribution
  n_states=nrow(hmm$Pi) # number of states
  n_transition=nrow(hmm$Pi)*ncol(hmm$Pi) # number of transition probabilities
  npar=n_states*n_pdf+n_transition
  
  return(-2*LL+k*npar)
}

ggplot_stationary_hmm <- function(x,binwidth=NULL,res=1000,cols=NULL,...){
  
  m <- length(x$delta)
  dens <- matrix(0,nrow=m+1,ncol=res)
  r <- extendrange(x$x,f=.05)
  xrange <- seq(r[1],r[2],len=res)
  delta <- statdist(x$Pi)
  if(is.null(binwidth)) binwidth <- diff(range(x$x))/8
  for(i in 1:m){
    
    if(x$distn == 'gamma'){
      dens[i,] <- delta[i]*dgamma(xrange,shape=x$pm$shape[i],rate=x$pm$rate[i])
    }else if(x$distn == 'norm'){
      dens[i,] <- delta[i]*dnorm(xrange,mean=x$pm$mean[i],sd=x$pm$sd[i])
    }else{
      stop('Distribution not supported')
    }
    
    dens[m+1,] <- dens[m+1,] + dens[i,]
  }
  
  p <- ggplot()+
    geom_histogram(data=data.frame(x=as.vector(x$x)),aes(x=x,y=..density..),
                   binwidth=binwidth,fill='white',color='black')+
    theme_bw()
  
  dt <- data.table(x=numeric(0),y=numeric(0), state=integer(0))
  for(i in 1:m)
    dt <- rbind(dt, data.table(x=xrange,y=dens[i,], state=i))
  dt$state <- factor(dt$state)
  
  p <- p + geom_line(data=dt,aes(x=x,y=y,color=state)) +
    geom_line(data=data.frame(x=xrange,y=dens[m+1,]),aes(x=x,y=y),color='black',size=1) +
    scale_color_tableau() +
    scale_x_continuous(limits=r)
  p
  
}

statdist <- function(tpm){
  
  m <- nrow(tpm)
  ones <- rbind(rep(1,m))
  I <- diag(rep(1,m))
  U <- matrix(rep(1,m^2),m)
  as.vector(ones %*% solve(I - tpm + U))
  
}

# Find best GLM
GLM_fit = function(Pm, X, family, month) {
  
  if (family == "gamma") {
    links = c("log", "inverse","identity")
    
    # clean data and remove zeros
    Pm = ifelse(Pm <=0, runif(1, 0.0001, 0.001), Pm)
    
  } else if (family == "binomial"){
    links = c("logit", "probit", "cauchit")
  } else if (family == "gaussian"){
    links = c("identity")
  }
  
  N = length(Pm)
  
  combs = leaps(X,Pm, nbest=25) # GEt upto 25 combinations for each
  # number of predictors
  combos = combs$which
  ncombos = length(combos[,1])
  glm_press=vector(length = length(links))
  best_comb=vector(length = length(links))
  xpress=1:ncombos
  xmse = 1:ncombos
  
  for(j in 1:length(links)) {
    aux_var=1 # allow to fit glm
    ##remove small values for gamma distribution
    if (family == "gamma"){
      if (links[j]=="identity"){
        if( month==1){
          Pm[Pm<=0.005]=0.005
        }else{
          Pm[Pm<=2.5]=2.5
        }
        
      }else if (links[j]=="inverse" & month==1){
        Pm[Pm<=25]=25 # it is necessary to modific significantly the small values,
        #so, "inverse" is not fitted for January
        aux_var=0
      }
    }
    if (aux_var==1)
    {for(i in 1:ncombos) {
      xx = X[,combos[i,]]
      xx=as.data.frame(xx)
      if (family == "gamma"){
        zz=glm(Pm ~ ., data=xx, family = Gamma(link=links[j]), maxit=500)
      }else if (family == "binomial"){
        zz=glm(Pm ~ ., data=xx, family = binomial(link=links[j]), maxit=500)
      }else if (family == "gaussian"){
        zz=glm(Pm ~ ., data=xx, family = gaussian(link=links[j]), maxit=500)
      }
      xpress[i]=AIC(zz)
      xmse[i] = sum((zz$res)^2) / (N - length(zz$coef))
      #print(xpress[i])
    }}
    if (aux_var==1){
      # Test using AIC objective function
      glm_press[j]=min(xpress)
      best_comb[j]=which.min(xpress)
    }else{
      # Test using AIC objective function
      glm_press[j]=200000
      best_comb[j]=which.min(xpress)
    }
  }
  
  press_df = data.frame(glm_press)
  rownames(press_df) = links[1:length(links)]
  
  print("Results of AIC for bestfit GLM")
  print(press_df)
  print(best_comb[which.min(glm_press)])
  
  print(sprintf("Choosing the GLM which minimizes AIC: %s family and %s link function.", family,
          links[which.min(glm_press)]))
  xx = X[,combos[best_comb[which.min(glm_press)],]]
  xx=as.data.frame(xx)
  if (family == "gamma") {
    bestmod = glm(Pm ~ ., data = xx, family = Gamma(link=links[which.min(glm_press)]))
  } else if (family == "binomial") {
    bestmod = glm(Pm ~ ., data = xx, family = binomial(link=links[which.min(glm_press)]))
  } else if (family == "gaussian") {
    bestmod = glm(Pm ~ ., data = xx, family = gaussian(link=links[which.min(glm_press)]))
  } else {
    print("Error!")
  }
  bestmod$Call$LINK=links[which.min(glm_press)]
  bestmod$Call$AIC=AIC(bestmod)
  bestmod$Call$combo=combos[best_comb[which.min(glm_press)],]
  return(bestmod)
}

HW3_Library

Anna Starodubtseva

December 14, 2021

Load Packages

KNN-Bootstrapping

HMM

Plot GEV coefficients over grid