HW1_Library

# Load Packages
library(sm)           # for sm.density in diagnostics

## Package 'sm', version 2.2-5.5: type help(sm) for summary information

library(leaps)        # to provide combinations
library(MPV)          # for calculating PRESS
library(sm)           # for sm.density in diagnostics
library(akima)        # for interp to smooth for plotting
library(fields)       # for surface to plot surface plot

## Loading required package: spam

## Loading required package: dotCall64

## Loading required package: grid

## Spam version 2.2-0 (2018-06-19) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.

## 
## Attaching package: 'spam'

## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve

## Loading required package: maps

## See www.image.ucar.edu/~nychka/Fields for
##  a vignette and other supplements.

library(mgcv)         # for GAM

## Loading required package: nlme

## This is mgcv 1.8-24. For overview type 'help("mgcv-package")'.

library(locfit)       # for fitting local polynomial

## locfit 1.5-9.1    2013-03-22

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:MPV':
## 
##     cement

## The following object is masked from 'package:sm':
## 
##     muscle

library(prodlim)
library(rgdal)        # for converting projection of coordinates

## Loading required package: sp

## rgdal: version: 1.3-4, (SVN revision 766)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.2.3, released 2017/11/20
##  Path to GDAL shared files: C:/Users/DELL/Documents/R/win-library/3.5/rgdal/gdal
##  GDAL binary built with GEOS: TRUE 
##  Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
##  Path to PROJ.4 shared files: C:/Users/DELL/Documents/R/win-library/3.5/rgdal/proj
##  Linking to sp version: 1.3-1

library(ggplot2)      # for plotting spatial map
library(reshape2)     # for using the melt function (Convert an object into a molten data frame)
library(scales)      # for Visualization spatial map

##################################################
########### graphic functions    ################# 
##################################################

##### Quilt plotting code
Quilt_plotting=function(lon, lat, ypred, lon1, lat1, yob,save,type=""){
  if (save==TRUE){
    if (type=="binary"){
      pdf("Predicted Binary Precipitation DEM Grid.pdf", width = 6, height = 8) # save figure
      par(mfrow=c(2,1))
      quilt.plot(lon, lat,ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Binary Precipitation on DEM Grid')
      US( add=TRUE, col="black", lwd=2)
      
      quilt.plot(lon1, lat1,yob,xlab="Longitude (m)",ylab="Latitude (m)",main='Actual Binary Precipitation ')
      US( add=TRUE, col="black", lwd=2)
      dev.off()
      
      pdf("Predicted Standard Error DEM Grid.pdf", width = 8, height = 6) # save figure
      quilt.plot(lon, lat,ypred$se.fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Standard Error on DEM Grid')
      US( add=TRUE, col="black", lwd=2)
      dev.off()
    }else{
      pdf("Predicted Precipitation DEM Grid.pdf", width = 6, height = 8) # save figure
      par(mfrow=c(2,1))
      quilt.plot(lon, lat,ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Precipitation on DEM Grid (mm)')
      US( add=TRUE, col="black", lwd=2)
      
      quilt.plot(lon1, lat1,yob,xlab="Longitude (m)",ylab="Latitude (m)",main='Actual Precipitation (mm)')
      US( add=TRUE, col="black", lwd=2)
      dev.off()
      
      pdf("Predicted Standard Error DEM Grid.pdf", width = 8, height = 6) # save figure
      quilt.plot(lon, lat,ypred$se.fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Standard Error on DEM Grid (mm)')
      US( add=TRUE, col="black", lwd=2)
      dev.off()
    }
    
  }else{
    if (type=="binary"){
      par(mfrow=c(2,1))
      quilt.plot(lon, lat,ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Binary Precipitation on DEM Grid')
      US( add=TRUE, col="black", lwd=2)
      
      quilt.plot(lon1, lat1,yob,xlab="Longitude (m)",ylab="Latitude (m)",main='Actual Binary Precipitation')
      US( add=TRUE, col="black", lwd=2)
      
      quilt.plot(lon, lat,ypred$se.fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Standard Error on DEM Grid')
      US( add=TRUE, col="black", lwd=2)
    } else{
      par(mfrow=c(2,1))
      quilt.plot(lon, lat,ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Precipitation on DEM Grid (mm)')
      US( add=TRUE, col="black", lwd=2)
      
      quilt.plot(lon1, lat1,yob,xlab="Longitude (m)",ylab="Latitude (m)",main='Actual Precipitation (mm)')
      US( add=TRUE, col="black", lwd=2)
      
      quilt.plot(lon, lat,ypred$se.fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Standard Error on DEM Grid (mm)')
      US( add=TRUE, col="black", lwd=2)
    }
  }

}

##### surface plotting code
plot_surface =function(lon, lat, ypred, lon1, lat1, yob,save,type=""){
  
  zz = interp(lon,lat,ypred$fit, duplicate = "strip")
  zz2 = interp(lon1,lat1,yob, duplicate = "strip")
  if (save==TRUE){
    if (type=="binary"){
      pdf("Predicted Binary Precipitation DEM Grid1.pdf", width = 6, height = 8) # save figure
      par(mfrow=c(2,1))
      surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Binary Precipitation on DEM Grid'
              ,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob), xlim=range(lon,lon1),ylim=range(lat,lat1))
      grid(col= "grey50", lty = "dashed")
      US( add=TRUE, col="gray30", lwd=2)
      
      surface(zz2,xlab="Longitude",ylab ="Latitude",main='Actual Binary Precipitation'
              ,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob),xlim=range(lon,lon1),ylim=range(lat,lat1))
      grid(col= "grey50", lty = "dashed")
      US( add=TRUE, col="gray30", lwd=2)
      dev.off()
      
      zz = interp(lon,lat,ypred$se, duplicate = "strip")
      
      pdf("Predicted Standard Error DEM Grid1.pdf", width = 8, height = 6) # save figure
      surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Standard Error on DEM Grid'
              ,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$se),xlim=range(lon,lon1),ylim=range(lat,lat1))
      grid(col= "grey50", lty = "dashed")
      US( add=TRUE, col="gray30", lwd=2)
      dev.off()
    }else{
      pdf("Predicted Precipitation DEM Grid1.pdf", width = 6, height = 8) # save figure
      par(mfrow=c(2,1))
      surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Precipitation on DEM Grid (mm)'
              ,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob), xlim=range(lon,lon1),ylim=range(lat,lat1))
      grid(col= "grey50", lty = "dashed")
      US( add=TRUE, col="gray30", lwd=2)
      
      surface(zz2,xlab="Longitude",ylab ="Latitude",main='Actual Precipitation (mm)'
              ,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob),xlim=range(lon,lon1),ylim=range(lat,lat1))
      grid(col= "grey50", lty = "dashed")
      US( add=TRUE, col="gray30", lwd=2)
      dev.off()
      
      zz = interp(lon,lat,ypred$se, duplicate = "strip")
      
      pdf("Predicted Standard Error DEM Grid1.pdf", width = 8, height = 6) # save figure
      surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Standard Error on DEM Grid (mm)'
              ,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$se),xlim=range(lon,lon1),ylim=range(lat,lat1))
      grid(col= "grey50", lty = "dashed")
      US( add=TRUE, col="gray30", lwd=2)
      dev.off()
    }
    
  }else{
    if (type=="binary"){
      par(mfrow=c(3,1))
      surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Binary Precipitation on DEM Grid'
              ,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob), xlim=range(lon,lon1),ylim=range(lat,lat1))
      grid(col= "grey50", lty = "dashed")
      US( add=TRUE, col="gray30", lwd=2)
      
      surface(zz2,xlab="Longitude",ylab ="Latitude",main='Actual Binary Precipitation'
              ,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob),xlim=range(lon,lon1),ylim=range(lat,lat1))
      grid(col= "grey50", lty = "dashed")
      US( add=TRUE, col="gray30", lwd=2)
      
      
      zz = interp(lon,lat,ypred$se, duplicate = "strip")
      
      
      surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Standard Error on DEM Grid'
              ,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$se),xlim=range(lon,lon1),ylim=range(lat,lat1))
      grid(col= "grey50", lty = "dashed")
      US( add=TRUE, col="gray30", lwd=2)
    } else{
      par(mfrow=c(3,1))
      surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Precipitation on DEM Grid (mm)'
              ,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob), xlim=range(lon,lon1),ylim=range(lat,lat1))
      grid(col= "grey50", lty = "dashed")
      US( add=TRUE, col="gray30", lwd=2)
      
      surface(zz2,xlab="Longitude",ylab ="Latitude",main='Actual Precipitation (mm)'
              ,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob),xlim=range(lon,lon1),ylim=range(lat,lat1))
      grid(col= "grey50", lty = "dashed")
      US( add=TRUE, col="gray30", lwd=2)
      
      
      zz = interp(lon,lat,ypred$se, duplicate = "strip")
      
      
      surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Standard Error on DEM Grid (mm)'
              ,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$se),xlim=range(lon,lon1),ylim=range(lat,lat1))
      grid(col= "grey50", lty = "dashed")
      US( add=TRUE, col="gray30", lwd=2)
    }


  }

}

##### Model dagnostics code
mod_diagnostics = function(y, yhat, nvar,save){
  
  if (save==TRUE){
    pdf("MODEL DIAGNOSTICS.pdf", width = 10, height = 8) # save figure
  }
  par(mfrow=c(2,2))
  e <- y - yhat
  
  nobs <- length(y)
  coef <- nvar+1
  
  ## Testing if errors (residuals) are normal and iid
  # 1. Normality histogram
  
  hist(e,xlab="Residuals",ylab="Density",probability=T,main="Distribution of Residuals")
  lines(sort(e),dnorm(sort(e),mean=0,sd=sd(e)),col="red")
  sm.density(e,add=T,col="blue")
  legend("topright", c("Normal Fit", "Non-parametric Fit"), lty=c(1,1), lwd=c(2.5,2.5), col=c("red", "blue")) 
 
  
  # 2. Normal QQplot 
  qqnorm(e,main="Normal Q-Q Plot of Residuals")
  qqline(e)
  
  ## Testing for heteroskedasticity (which is constant variance of residuals)
  # 3. Residuals versus estimates
  plot(yhat,e,xlab="Estimated Precip",ylab="Residuals",main="Residuals vs Estimated Precip")
  abline(0,0)
  ## Testing for autocorrelation. If errors are uncorrelated they fall between the dotted lines.
  # 4. Autocorrelation plot
  cor=acf(e,main="Autocorrelation Plot")
  if (save==TRUE){
    dev.off() 
  }
}

#######################################
############analysis function ########
######################################

loocv_func = function(bestmod,X,Pm,save) {
  
  # Select only the data from the best model (not full model)
  if (class(bestmod)[1]=="locfit"){
    mod_data = X
    N = length(Pm)
  }else{
    mod_data = bestmod$model
    N = length(mod_data$Pm)
  }
  
  # Initialize leave one out cross validation vector
  loocv = vector("numeric", nrow(mod_data))
  
  # slightly different cases for different problems
  if (class(bestmod)[1]=='lm') {
    for (i in 1:nrow(mod_data)) {
      drop_data = mod_data[-i,]   # drop one precip value
      drop_mod = lm(Pm ~ ., data = drop_data)
      x_pred = mod_data[i,]
      loocv[i] = predict(drop_mod, x_pred)
    }
  } else if (class(bestmod)[1]=='glm') {
    for (i in 1:nrow(mod_data)) {
      drop_data = mod_data[-i,]   # drop one precip value
      drop_mod = glm(Pm ~ ., data = drop_data, family = bestmod$family)
      x_pred = mod_data[i,]
      loocv[i]=predict.glm(drop_mod, newdata=x_pred, se.fit=F,type="response")
    } 
  }  else if (class(bestmod)[1] == "gam") {
    for(i in 1:nrow(mod_data)){
      drop_data = mod_data[-i,]   # drop one precip value
      # names(drop_data) = c("lat","lon","elev")
      drop_mod = gam(bestmod$formula, data = drop_data)
      x_pred = mod_data[i,]
      loocv[i] = predict.gam(drop_mod, newdata=x_pred, se.fit=F,type="response")
    }
  } else if (class(bestmod)[1]=="locfit"){
    for(i in 1:nrow(mod_data)){
      drop_data = mod_data[-i,]   # drop one precip value
      Pm1=Pm[-i]
      # names(drop_data) = c("lat","lon","elev")
      x_pred = mod_data[i,]
      drop_mod=locfit(Pm1 ~., data = drop_data, alpha=bestmod$call$alpha, maxk = 10000, deg=bestmod$call$deg,kern="bisq"
                     , scale = T, family=bestmod$call$family, link=bestmod$call$link)
      loocv[i]=predict(drop_mod,newdata=x_pred)
    }
  } 
  
  # Plot LOOCV against observations
  if (save==TRUE){
    pdf("LOOCV.pdf", width = 8, height = 6) # save figure
    lim = range(Pm, loocv)
    plot(Pm, loocv, xlim = lim, ylim=lim, xlab="Observed", ylab="X-validated Estimate", main= "LOOCV" )
    #lines(mod_data$Pm, mod_data$Pm, col = "red")
    abline(a=0,b=1,col = "red")
    dev.off()
  }else{
    lim = range(Pm, loocv)
    plot(Pm, loocv, xlim = lim, ylim=lim, xlab="Observed", ylab="X-validated Estimate", main= "LOOCV" )
    #lines(mod_data$Pm, mod_data$Pm, col = "red")
    abline(a=0,b=1,col = "red")
  }
}

Drop_10_pred = function(bestmod,X,Pm,save,family="") {
  
  # Select only the data from the best model (not full model)
  if (class(bestmod)[1]=="locfit"){
    mod_data = X
    N = length(Pm)
  }else{
    mod_data = bestmod$model
    N = length(mod_data$Pm)
  }
  
  
  
  drop = 10
  nsample = 500
  i_full = 1:N
  # initialize skill score vectors
  skill_rmse = vector(mode="numeric", length=nsample)
  skill_cor = vector(mode="numeric", length=nsample)
  for (i in 1:nsample){
    i_drop = sample(i_full,N*drop/100)            # can add argument replace=TRUE
    
    drop_dt = mod_data[-i_drop,] # drop 10% precip value
    # slightly different cases for different problems
    if (class(bestmod)[1]=='lm') {
      fit_drop = lm(Pm ~ ., data = drop_dt)
      drop_pred = predict(fit_drop, newdata=mod_data[i_drop,])
    } else if (class(bestmod)[1]=='glm') {
      drop_mod = glm(Pm ~ ., data = drop_dt, family = bestmod$family)
      drop_pred =predict.glm(drop_mod, newdata=mod_data[i_drop,], se.fit=F,type="response")
    }  else if (class(bestmod)[1] == "gam") {
      fit_drop = gam(bestmod$formula, data = drop_dt)
      drop_pred = predict.gam(fit_drop, newdata=bestmod$model[i_drop,], se.fit=F,type="response")
    } else if (class(bestmod)[1]=="locfit"){
      if (family=="gaussian" | family=="gamma"){
        Pm1=Pm[-i_drop]
        ##remove small values 
        Pm1[Pm1<=0.1]=0.1
        drop_mod=locfit(Pm1 ~., data = drop_dt, alpha=bestmod$call$alpha, maxk = 10000, deg=bestmod$call$deg,kern="bisq"
                        , scale = T, family=bestmod$call$family, link=bestmod$call$link)
        drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
      } else{
        Pm1=Pm[-i_drop]
        drop_mod=locfit(Pm1 ~., data = drop_dt, alpha=bestmod$call$alpha, maxk = 10000, deg=bestmod$call$deg,kern="bisq")
        drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
      }
      
    }
    drop_actual = Pm[i_drop]
    skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
    skill_cor[i] = cor(drop_actual,drop_pred)
  }
  
  # Plot skill of model based on Drop 10% method
  if (save==TRUE){
    pdf("boxplots.pdf", width = 8, height = 6) # save figure
    par(mfrow=c(1,2))
    boxplot(skill_rmse, main = "RMSE-Skill", ylim = range(skill_rmse))
    boxplot(skill_cor, main = "Cor-Skill", ylim=range(skill_cor))
    dev.off()
  }else{
    par(mfrow=c(1,2))
    boxplot(skill_rmse, main = "RMSE-Skill", ylim = range(skill_rmse))
    boxplot(skill_cor, main = "Cor-Skill", ylim=range(skill_cor))
  }

}

##################################################
########### model fitting functions    ################# 
##################################################

#### GLM Fitting ###########

### Find best GLM 
GLM_fit = function(Pm, X, family, month) {
  
  if (family == "gamma") {
    links = c("log", "inverse","identity")   
    
    # clean data and remove zeros
    Pm = ifelse(Pm <=0, runif(1, 0.0001, 0.001), Pm)
    
  } else if (family == "binomial"){
    links = c("logit", "probit", "cauchit")
  } else if (family == "gaussian"){
    links = c("identity")
  }
  
  N = length(Pm)
  
  combs = leaps(X,Pm, nbest=25)     #  GEt upto 25 combinations for each
  # number of predictors
  combos = combs$which
  ncombos = length(combos[,1])
  glm_press=vector(length = length(links))
  best_comb=vector(length = length(links))
  xpress=1:ncombos
  xmse = 1:ncombos
  
  for(j in 1:length(links)) {
    aux_var=1 # allow to fit glm
    ##remove small values for gamma distribution
    if (family == "gamma"){
      if (links[j]=="identity"){
        if( month==1){
          Pm[Pm<=0.005]=0.005
        }else{
          Pm[Pm<=2.5]=2.5
        }
        
      }else if (links[j]=="inverse" & month==1){
        Pm[Pm<=25]=25 # it is necessary to modific significantly the small values, 
        #so, "inverse" is not fitted for January
        aux_var=0
      }
    }
    if (aux_var==1)
      {for(i in 1:ncombos) {
        xx = X[,combos[i,]]
        xx=as.data.frame(xx)
        if (family == "gamma"){
          zz=glm(Pm ~ ., data=xx, family = Gamma(link=links[j]), maxit=500)
        }else if (family == "binomial"){
          zz=glm(Pm ~ ., data=xx, family = binomial(link=links[j]), maxit=500)
        }else if (family == "gaussian"){
          zz=glm(Pm ~ ., data=xx, family = gaussian(link=links[j]), maxit=500)
        }
        xpress[i]=PRESS(zz)
        xmse[i] = sum((zz$res)^2) / (N - length(zz$coef))
        #print(xpress[i])
      }}
    if (aux_var==1){
      # Test using PRESS objective function
      glm_press[j]=min(xpress)
      best_comb[j]=which.min(xpress)
    }else{
      # Test using PRESS objective function
      glm_press[j]=200000
      best_comb[j]=which.min(xpress)
    }
  }
  
  press_df = data.frame(glm_press)
  rownames(press_df) = links[1:length(links)]
  
  print("Results of PRESS for bestfit GLM")
  print(press_df)
  print(best_comb[which.min(glm_press)])
  
  sprintf("Choosing the GLM which minimizes PRESS: %s family and %s link function.", family, links[which.min(glm_press)])
  xx = X[,combos[best_comb[which.min(glm_press)],]]
  xx=as.data.frame(xx)
  if (family == "gamma") {
    bestmod = glm(Pm ~ ., data = xx, family = Gamma(link=links[which.min(glm_press)]))
  } else if (family == "binomial") {
    bestmod = glm(Pm ~ ., data = xx, family = binomial(link=links[which.min(glm_press)]))
  }  else if (family == "gaussian") {
    bestmod = glm(Pm ~ ., data = xx, family = gaussian(link=links[which.min(glm_press)]))
  } else { 
    print("Error!")
  }
  bestmod$Call$LINK=links[which.min(glm_press)]
  bestmod$Call$PRESS=PRESS(bestmod)
  bestmod$Call$combo=combos[best_comb[which.min(glm_press)],]
  return(bestmod)
}

##### Local Polynomail Fitting ######

#search for best alpha over a range of alpha values between 0 and 1

locpoly_fit = function(Pm, X, family="", link="", glm=FALSE,plot=FALSE) {
  
  
  nvar=length(X[1,])    #number of variables
  N=length(Pm) #number ofdata points
  
  
  
  if(glm==TRUE) {
    if (family == "gamma") {
      Pm = ifelse(Pm <=0, runif(1, 0.0001, 0.001), Pm)
      }
    # print("checking inputs")
    # print(family)
    # print(link)
    porder=1
    minalpha=0.6
    alpha_grid=seq(minalpha,1.0,by=0.05)
    n=length(alpha_grid)
    
    porder=2
    minalpha=0.6
    alpha1_grid=seq(minalpha,1.0,by=0.05)
    alpha=alpha2_grid=c(alpha_grid,alpha1_grid)
    
    #get the GCV values for all the alpha values in alpha for order of
    # polynomial = 1 and 2. kern="bisq" argument is to use the bisquare kernel
    # in computing the weights of the neighbors, which are then used in
    # the weighted least squares..
    gcv_deg1=gcvplot(Pm ~ ., data=X, maxk = 100000, alpha=alpha_grid,deg=1,kern="bisq", ev=dat(),scale=TRUE,family=family,link=link)
    gcv_deg2=gcvplot(Pm ~ ., data=X, maxk = 100000, alpha=alpha1_grid,deg=2,kern="bisq",ev=dat(),scale=TRUE,family=family,link=link)
    # pick the best alpha and the degree of the polynomial that
    # gives the least GCV
    z2=order(c(gcv_deg1$values,gcv_deg2$values))
    bestdeg=1
    if(z2[1] > n)bestdeg=2
    best_alpha = alpha2_grid[z2[1]]
    best_gcv = c(gcv_deg1$values,gcv_deg2$values)[z2[1]]
    output=c(bestdeg, best_alpha, best_gcv)    #the best parameter set
    
    # Now fit the LOCFIT model using the best alpha and degree obtained from above..
    if (plot==FALSE){
      bestmod=locfit(Pm ~., data=X, alpha=best_alpha, maxk = 10000, deg=bestdeg,kern="bisq"
                     , scale = T, family=family, link=link,ev=dat())
    }else {
      bestmod=locfit(Pm ~., data=X, alpha=best_alpha, maxk = 10000, deg=bestdeg,kern="bisq"
                     , scale = T, family=family, link=link)
    }
    
    bestmod$call$alpha = best_alpha
    bestmod$call$deg = bestdeg
    bestmod$call$family = family
    bestmod$call$link = link
    bestmod$call$gcv = best_gcv
  }
  else{
    porder=1
    minalpha=2*(nvar*porder+1)/N
    alpha_grid=seq(minalpha,1.0,by=0.05)
    n=length(alpha_grid)
    
    porder=2
    minalpha=2*(nvar*porder+1)/N
    alpha1_grid=seq(minalpha,1.0,by=0.05)
    alpha=alpha2_grid=c(alpha_grid,alpha1_grid)
    
    #get the GCV values for all the alpha values in alpha for order of
    # polynomial = 1 and 2. kern="bisq" argument is to use the bisquare kernel
    # in computing the weights of the neighbors, which are then used in
    # the weighted least squares..
    gcv_deg1=gcvplot(Pm ~ ., data=X, maxk = 100000, alpha=alpha_grid,deg=1,kern="bisq", ev=dat(),scale=TRUE)
    gcv_deg2=gcvplot(Pm ~ ., data=X, maxk = 100000, alpha=alpha1_grid,deg=2,kern="bisq",ev=dat(),scale=TRUE)
    # pick the best alpha and the degree of the polynomial that
    # gives the least GCV
    z2=order(c(gcv_deg1$values,gcv_deg2$values))
    bestdeg=1
    if(z2[1] > n)bestdeg=2
    best_alpha = alpha2_grid[z2[1]]
    best_gcv = c(gcv_deg1$values,gcv_deg2$values)[z2[1]]
    output=c(bestdeg, best_alpha, best_gcv)    #the best parameter set
    
    # Now fit the LOCFIT model using the best alpha and degree obtained from above..
    if (plot==FALSE){
      bestmod=locfit(Pm ~., data=X, alpha=best_alpha, maxk = 10000, deg=bestdeg,kern="bisq",ev=dat())
    }else{
      bestmod=locfit(Pm ~., data=X, alpha=best_alpha, maxk = 10000, deg=bestdeg,kern="bisq")
    }
    
    bestmod$call$alpha = best_alpha
    bestmod$call$deg = bestdeg
    bestmod$call$gcv = best_gcv
    
  }
  
  return(bestmod)
  
}

### F-test for Local Polynomial Fitting ###

loc_Ftest = function(Pm,Pmhat,X, bestmod) {
  
  N=length(Pm) #number ofdata points
  RSS1 = sum((Pm-Pmhat$fit)^2)
  
  nu1 = bestmod$dp[6] # trace(L) [lk]
  nu2 = bestmod$dp[7] # trace(L^T L) [df1]
  
  nu11 = N-2*nu1 + nu2
  
  #linear regression ###
  # #linear regression
  X=as.matrix(X)
  zzLin=lm(Pm~X)
  
  XX = cbind(rep(1,N), X)
  # Compute the Hat matrix
  hatm = XX %*% solve(t(XX) %*% XX) %*% t(XX)
  
  II = diag(N)
  delta0 = t(II-hatm)%*%(II-hatm)    #Equation 9.2
  nu00 = sum(diag(delta0))
  
  RSS0 = sum(residuals(zzLin)^2)
  
  
  Fdata = (RSS0 - RSS1)/(nu00 - nu11)
  Fdata = (Fdata / (RSS1 / nu11))
  Ftheor = qf(0.95,(nu00-nu11), nu11)    #95% confidence level..
  
  ## Fdata > Ftheor   - reject null - i.e., data is otherwise (local polynomial)
  if (Fdata > Ftheor) {
    print("F-test:")
    sprintf("Reject the Null because F(local poly) = %0.2f > %0.2f = F(linear model).", Fdata, Ftheor)
  }
}

HW1_Library

Alvaro Ossandon

October 8, 2018