hw1_lib.R

# Load Packages

library(MASS)                   # for stepAIC
library(sm)                     # for sm.density in diagnostics

## Package 'sm', version 2.2-5.4: type help(sm) for summary information

## 
## Attaching package: 'sm'

## The following object is masked from 'package:MASS':
## 
##     muscle

library(MPV)          # for calculating PRESS

## 
## Attaching package: 'MPV'

## The following object is masked from 'package:MASS':
## 
##     cement

## The following object is masked from 'package:datasets':
## 
##     stackloss

library(FNN)                    # for k nearest neighbor to snap grids together
library(akima)              # for interp to smooth for plotting
library(fields)             # for surface to plot surface plot

## Loading required package: spam

## Loading required package: grid

## Spam version 1.4-0 (2016-08-29) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.

## 
## Attaching package: 'spam'

## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve

## Loading required package: maps

library(leaps)        # to provide combinations
library(prodlim)      # for row.match()
library(locfit)       # for fitting local polynomial

## locfit 1.5-9.1    2013-03-22

library(mgcv)         # for GAM

## Loading required package: nlme

## This is mgcv 1.8-12. For overview type 'help("mgcv-package")'.

plot_india <- function(precip_dt, india_grid, raj_grid, bestmod) {
  
    ### Quilt plotting code for India grid
    par(mfrow=c(1,1))
    
    xv1=data.frame(india_grid[,1:2])
    xv2=data.frame(raj_grid[,1:2])
    
    zzint=row.match(xv2,xv1)
    indexnotna = which(!is.na(zzint))
    indexna = which( is.na(zzint) )
    
    ## create the new rajeevan grid
    rajeevgridel = cbind(raj_grid[indexnotna,1:2],india_grid[zzint[indexnotna],3])  
    
    X=precip_dt[,1:3]
    Y=precip_dt[,4]
    
    X = as.data.frame(rajeevgridel)
    names(X)=c("lon","lat","elev")
    
    if (class(bestmod)[1]=="glm") {
      
      if (bestmod$family[1]=="binomial") {
        p75 = quantile(precip_dt[,4], 0.75)
        exceed75 = precip_dt[,4] >= p75
        Y = exceed75
        ypred = predict(bestmod,newdata=X,se.fit=T, type="response")
      }
      else {
        ypred <- predict(bestmod,newdata=X,se.fit=T)
      }
    }
    else {
    ypred <- predict(bestmod,newdata=X,se.fit=T)
    }
    
    ############ Simple plotting..
    if (class(bestmod)[1]=="glm") {
      if (bestmod$family[1]=="binomial") {
        quilt.plot(X[,1],X[,2],ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Probabiliy for the 75th Percentile \nPrecipitation Exceedance on DEM Grid')
        world(add=T,lwd=1)
        quilt.plot(X[,1],X[,2],ypred$se.fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Standard Error for the 75th Percentile \nPrecipitation Exceedance on DEM Grid')
        world(add=T,lwd=1)
      } 
      else {
        quilt.plot(X[,1],X[,2],ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Precipitation on DEM Grid (mm)')
        world(add=T,lwd=1)
        quilt.plot(X[,1],X[,2],ypred$se.fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Standard Error on DEM Grid (mm)')
        world(add=T,lwd=1)
      }
    }  
    else {
    quilt.plot(X[,1],X[,2],ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Precipitation on DEM Grid (mm)')
    world(add=T,lwd=1)
    quilt.plot(X[,1],X[,2],ypred$se.fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Standard Error on DEM Grid (mm)')
    world(add=T,lwd=1)
    }
}


choose_best_model <- function(mod1, obj_fun=NULL, family=NULL){
  
  # Different cases for different problems
  if (class(bestmod)[1]=='lm') {
    
    # Determine 'best' model based on AIC criterion:
    bestAIC = stepAIC(mod1, trace=FALSE)                                        
    # Determine 'best' model based on BIC criterion:
    bestBIC = stepAIC(mod1, k=log(length(precip_dt$pmm)), trace=FALSE)          # best model is "precip ~ lon + lat" is "best" - agrees with AIC.
    
    # Compare to see if AIC and BIC agree:
    # If so, display the agreed-upon model
    if (bestAIC$call == bestBIC$call){
      model_agreement = print("The AIC and BIC criterion agree on best model. AIC and BIC result in equivalent models and either can be used to represent the best model.")
    }
    else {
      model_agreement = print("Warning! The AIC and BIC criterion do not agree on the best model.You are advised to conduct additional objective functions such as PRESS, leaps, or GCV for additional information before continuing. If no further action is taken, the AIC model will be used.")
    }
    
    # Choose which model to output as best model 
    if (obj_fun == "AIC"){
      bestmod = bestAIC # choose the AIC model
      print("Action: Choosing AIC model...")
    }
    else if (obj_fun == "BIC"){
      bestmod = bestBIC # choose the BIC model
      print("Choosing BIC model.")
    }
    else {
      bestmod = bestAIC # choose the AIC model
      print("No model preference stated between AIC or BIC or incorrect string input to obj_fun. Choosing AIC model as best model.")
    }
    
    print("Best model information shown below:")
    
  } else if (class(bestmod)[1]=='glm') {
    print("Not working!")
  }
  
    return(bestmod)
  
}  

best_model_agree <- function(mod1, obj_fun){
  
  # Determine 'best' model based on AIC criterion:
  bestAIC = stepAIC(mod1, trace=FALSE)                                      
  # Determine 'best' model based on BIC criterion:
  bestBIC = stepAIC(mod1, k=log(length(precip_dt$pmm)), trace=FALSE)            # best model is "precip ~ lon + lat" is "best" - agrees with AIC.
  
  # Compare to see if AIC and BIC agree:
  # If so, display the agreed-upon model
  if (bestAIC$call == bestBIC$call){
    model_agreement = print("The AIC and BIC criterion agree on best model. AIC and BIC result in equivalent models and either can be used to represent the best model.")
  }
  else {
    model_agreement = print("Warning! The AIC and BIC criterion do not agree on the best model.You are advised to conduct additional objective functions such as PRESS, leaps, or GCV for additional information before continuing. If no further action is taken, the AIC model will be used.")
  }
  
  # Choose which model to output as best model 
  if (obj_fun == "AIC"){
    bestmod = bestAIC # choose the AIC model
    print("Action: Choosing AIC model...")
  }
  else if (obj_fun == "BIC"){
    bestmod = bestBIC # choose the BIC model
    print("Choosing BIC model.")
  }
  else {
    bestmod = bestAIC # choose the AIC model
    print("No model preference stated between AIC or BIC or incorrect string input to obj_fun. Choosing AIC model as best model.")
  }
  
  print("Best model information shown below:")
  
  return(bestmod)
}

mod_diagnostics <- function(y, yhat, nvar, CD){
  
  #par(mfrow=c(3,2))
  e <- y - yhat
  
  nobs <- length(y)
  coef <- nvar+1
  
  ## Testing if errors (residuals) are normal and iid
  # 1. Normality histogram
  hist(e,xlab="Residuals",ylab="Density",probability=T,main="Distribution of Residuals")
  lines(sort(e),dnorm(sort(e),mean=0,sd=sd(e)),col="red")
  sm.density(e,add=T,col="blue")
  legend("topright", c("Normal Fit", "Non-parametric Fit"), lty=c(1,1), lwd=c(2.5,2.5), col=c("red", "blue")) 
  
  # 2. Normal QQplot 
  qqnorm(e,main="Normal Q-Q Plot of Residuals")
  qqline(e)
  
  # 3. Observed versus estimates
  plot(y,yhat,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precip", xlim=c(min(y),max(y)), ylim=c(min(y), max(y)))
  abline(a=0,b=1)
  
  ## Testing for heteroskedasticity (which is constant variance of residuals)
  # 4. Residuals versus estimates
  plot(yhat,e,xlab="Estimated Precip",ylab="Residuals",main="Residuals vs Estimated Precip")
  abline(0,0)
  
  ## Testing for autocorrelation. If errors are uncorrelated they fall between the dotted lines.
  # 5. Autocorrelation plot
  cor=acf(e,main="Autocorrelation Plot")
}

loocv_func = function(bestmod) {
  
  # Select only the data from the best model (not full model)
  mod_data = bestmod$model
  
  # Initialize leave one out cross validation vector
  loocv = vector("numeric", nrow(mod_data))
  
  # slightly different cases for different problems
  if (class(bestmod)[1]=='lm') {
    for (i in 1:nrow(mod_data)) {
      drop_data = mod_data[-i,]   # drop one precip value
      drop_mod = lm(pmm ~ ., data = drop_data)
      x_pred = mod_data[i,]
      loocv[i] = predict(drop_mod, x_pred)
    }
    } else if (class(bestmod)[1]=='glm') {
    for (i in 1:nrow(mod_data)) {
      drop_data = mod_data[-i,]   # drop one precip value
      drop_mod = lm(pmm ~ ., data = drop_data, family = bestmod$family)
      x_pred = mod_data[i,]
      loocv[i] = predict(drop_mod, x_pred, type = 'response')
    } 
    }  else if (class(bestmod)[1] == "gam") {
      for(i in 1:nrow(mod_data)){
        drop_data = mod_data[-i,]   # drop one precip value
        # names(drop_data) = c("lat","lon","elev")
        drop_mod = gam(bestmod$formula, data = drop_data)
        x_pred = mod_data[i,]
        loocv[i] = predict(drop_mod, x_pred)
      }
    }
  
  # Plot LOOCV against observations
  
  lim = range(mod_data$pmm, loocv)
  plot(mod_data$pmm, loocv, xlim = lim, ylim=lim, xlab="Observed", ylab="X-validated Estimate", main= "LOOCV" )
  lines(mod_data$pmm, mod_data$pmm, col = "red")
}

#### GLM Fitting ###########

### Find best GLM 

GLM_fit = function(pmm, X, family) {
  
  if (family == "gamma") {
    links = c("log", "inverse", "identity")   
    
    # clean data and remove zeros
    precip_dt = na.omit(precip_dt)
    rownames(precip_dt) = NULL
    precip_dt$pmm = ifelse(precip_dt$pmm <=0, runif(1, 0.0001, 0.001), precip_dt$pmm)
    
  } else if (family == "binomial"){
    links = c("logit", "probit", "cauchit")
  }
  
  glm_aic = vector(length = length(links))
  N = length(pmm)
  
  combs = leaps(X,pmm, nbest=25)     #  GEt upto 25 combinations for each
  # number of predictors
  combos = combs$which
  
  ncombos = length(combos[,1])
  
  xpress=1:ncombos
  xmse = 1:ncombos
  
  for(j in 1:length(links)) {
    
    for(i in 1:ncombos) {
      
      xx = X[,combos[i,]]
      xx=as.data.frame(xx)
      if (family == "gamma") {
         zz=glm(pmm ~ ., data=xx, family = Gamma(link=links[j]), maxit=500)
      } else if (family == "binomial"){
         zz=glm(pmm ~ ., data=xx, family = binomial(link=links[j]), maxit=500)
      }
      
      xpress[i]=PRESS(zz)
      
      xmse[i] = sum((zz$res)^2) / (N - length(zz$coef))
    }
    
    # Test using AIC objective function
    zms=stepAIC(zz, scope=list(upper = ~., lower = ~1), trace=FALSE)
    glm_aic[j] = zms$aic
  } 
  
  aic_df = data.frame(glm_aic)
  rownames(aic_df) = links[1:3]
  
  print("Results of AIC for bestfit GLM")
  print(aic_df)
  
  sprintf("Choosing the GLM which minimizes AIC: %s family and %s link function.", family, links[which.min(glm_aic)])
  # bestmod = glm(pmm ~ ., data = X, family = Gamma(link=links[which.min(glm_aic)]))
  
  if (family == "gamma") {
    bestmod = glm(pmm ~ ., data = X, family = Gamma(link=links[which.min(glm_aic)]))
  } else if (family == "binomial") {
    bestmod = glm(pmm ~ ., data = X, family = binomial(link=links[which.min(glm_aic)]))
  } else { 
    print("Error!")
  }
  
  return(bestmod)
}

##### Local Polynomail Fitting ######

#search for best alpha over a range of alpha values between 0 and 1

locpoly_fit = function(pmm, X, family="gaussian", link="identity", glm=FALSE) {

  pmm = precip_dt[,4]     # indepedent variable
  X = precip_dt[,1:3]       # all the predictor set
  X = as.matrix(X)
  
  nvar=length(X[1,])    #number of variables
  N=length(pmm) #number ofdata points
  
  print("checking inputs")
  print(family)
  print(link)
  
  bestdeg=1
  minalpha= 0.3
  if(glm==TRUE) minalpha = 0.7
  alpha1=seq(minalpha,1.0,by=0.05)
  n=length(alpha1)
  
  #get the GCV values for all the alpha values in alpha for order of
  # polynomial = 1. kern="bisq" argument is to use the bisquare kernel
  # in computing the weights of the neighbors, which are then used in
  # the weighted least squares..
  
  z0=gcvplot(pmm ~ X, alpha=alpha1,deg=bestdeg,kern="bisq",ev=dat(),scale=TRUE,family=family,link=link, maxk=5000)
  
  bestalpha = z0$alpha[which.min(z0$alpha)]
  
  # Now fit the LOCFIT model using the best alpha and degree obtained from
  #above..
  X = data.frame(X)
  bestmod=locfit(pmm~lon+lat+elev, data=X, alpha=bestalpha, deg=bestdeg, kern="bisq", scale=TRUE, family=family, link=link, maxk=5000)  # WJR: removed ev=dat() and plotting was successfull
  
  bestmod$call$alpha = bestalpha
  bestmod$call$deg = bestdeg
  bestmod$call$family = family
  bestmod$call$link = link
  
  return(bestmod)
  
}

### F-test for Local Polynomial Fitting ###

loc_Ftest = function(precip_dt, bestmod) {
  
  pmm = precip_dt$pmm
  N=length(pmm) #number ofdata points
  X = precip_dt[,3]
  
  RSS1 = sum(residuals(bestmod)^2)
  nu1 = bestmod$dp[6]
  nu2 = bestmod$dp[7]
  nu11 = N-2*nu1 + nu2
  
  #linear regression
  X=as.matrix(X)
  zzLin=lm(pmm~X)
  
  XX = cbind(rep(1,N), X)
  # Compute the Hat matrix
  hatm = XX %*% solve(t(XX) %*% XX) %*% t(XX)
  
  II = diag(N)
  delta0 = t(II-hatm)%*%(II-hatm)    #Equation 9.2
  nu00 = sum(diag(delta0))
  
  RSS0 = sum(residuals(zzLin)^2)
  
  nu0 = length(zzLin$coef)      #number of model coefficients
  
  Fdata = (RSS0 - RSS1)/(nu00 - nu11)
  Fdata = (Fdata / (RSS1 / nu11))
  Ftheor = qf(0.95,(nu00-nu11), nu11)    #95% confidence level..
  
  ## Fdata > Ftheor   - reject null - i.e., data is otherwise (local polynomial fit is better)
  if (Fdata > Ftheor) {
    print("F-test:")
    sprintf("Reject the Null because F(local poly) = %0.2f > %0.2f = F(linear model).", Fdata, Ftheor)
  }
  
}

hw1_lib.R

Billy

Wed Oct 12 23:57:32 2016