# Load Packages
library(MASS) # for stepAIC
library(sm) # for sm.density in diagnostics
## Package 'sm', version 2.2-5.4: type help(sm) for summary information
##
## Attaching package: 'sm'
## The following object is masked from 'package:MASS':
##
## muscle
library(MPV) # for calculating PRESS
##
## Attaching package: 'MPV'
## The following object is masked from 'package:MASS':
##
## cement
## The following object is masked from 'package:datasets':
##
## stackloss
library(FNN) # for k nearest neighbor to snap grids together
library(akima) # for interp to smooth for plotting
library(fields) # for surface to plot surface plot
## Loading required package: spam
## Loading required package: grid
## Spam version 1.4-0 (2016-08-29) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## Loading required package: maps
library(leaps) # to provide combinations
library(prodlim) # for row.match()
library(locfit) # for fitting local polynomial
## locfit 1.5-9.1 2013-03-22
library(mgcv) # for GAM
## Loading required package: nlme
## This is mgcv 1.8-12. For overview type 'help("mgcv-package")'.
plot_india <- function(precip_dt, india_grid, raj_grid, bestmod) {
### Quilt plotting code for India grid
par(mfrow=c(1,1))
xv1=data.frame(india_grid[,1:2])
xv2=data.frame(raj_grid[,1:2])
zzint=row.match(xv2,xv1)
indexnotna = which(!is.na(zzint))
indexna = which( is.na(zzint) )
## create the new rajeevan grid
rajeevgridel = cbind(raj_grid[indexnotna,1:2],india_grid[zzint[indexnotna],3])
X=precip_dt[,1:3]
Y=precip_dt[,4]
X = as.data.frame(rajeevgridel)
names(X)=c("lon","lat","elev")
if (class(bestmod)[1]=="glm") {
if (bestmod$family[1]=="binomial") {
p75 = quantile(precip_dt[,4], 0.75)
exceed75 = precip_dt[,4] >= p75
Y = exceed75
ypred = predict(bestmod,newdata=X,se.fit=T, type="response")
}
else {
ypred <- predict(bestmod,newdata=X,se.fit=T)
}
}
else {
ypred <- predict(bestmod,newdata=X,se.fit=T)
}
############ Simple plotting..
if (class(bestmod)[1]=="glm") {
if (bestmod$family[1]=="binomial") {
quilt.plot(X[,1],X[,2],ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Probabiliy for the 75th Percentile \nPrecipitation Exceedance on DEM Grid')
world(add=T,lwd=1)
quilt.plot(X[,1],X[,2],ypred$se.fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Standard Error for the 75th Percentile \nPrecipitation Exceedance on DEM Grid')
world(add=T,lwd=1)
}
else {
quilt.plot(X[,1],X[,2],ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Precipitation on DEM Grid (mm)')
world(add=T,lwd=1)
quilt.plot(X[,1],X[,2],ypred$se.fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Standard Error on DEM Grid (mm)')
world(add=T,lwd=1)
}
}
else {
quilt.plot(X[,1],X[,2],ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Precipitation on DEM Grid (mm)')
world(add=T,lwd=1)
quilt.plot(X[,1],X[,2],ypred$se.fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Standard Error on DEM Grid (mm)')
world(add=T,lwd=1)
}
}
choose_best_model <- function(mod1, obj_fun=NULL, family=NULL){
# Different cases for different problems
if (class(bestmod)[1]=='lm') {
# Determine 'best' model based on AIC criterion:
bestAIC = stepAIC(mod1, trace=FALSE)
# Determine 'best' model based on BIC criterion:
bestBIC = stepAIC(mod1, k=log(length(precip_dt$pmm)), trace=FALSE) # best model is "precip ~ lon + lat" is "best" - agrees with AIC.
# Compare to see if AIC and BIC agree:
# If so, display the agreed-upon model
if (bestAIC$call == bestBIC$call){
model_agreement = print("The AIC and BIC criterion agree on best model. AIC and BIC result in equivalent models and either can be used to represent the best model.")
}
else {
model_agreement = print("Warning! The AIC and BIC criterion do not agree on the best model.You are advised to conduct additional objective functions such as PRESS, leaps, or GCV for additional information before continuing. If no further action is taken, the AIC model will be used.")
}
# Choose which model to output as best model
if (obj_fun == "AIC"){
bestmod = bestAIC # choose the AIC model
print("Action: Choosing AIC model...")
}
else if (obj_fun == "BIC"){
bestmod = bestBIC # choose the BIC model
print("Choosing BIC model.")
}
else {
bestmod = bestAIC # choose the AIC model
print("No model preference stated between AIC or BIC or incorrect string input to obj_fun. Choosing AIC model as best model.")
}
print("Best model information shown below:")
} else if (class(bestmod)[1]=='glm') {
print("Not working!")
}
return(bestmod)
}
best_model_agree <- function(mod1, obj_fun){
# Determine 'best' model based on AIC criterion:
bestAIC = stepAIC(mod1, trace=FALSE)
# Determine 'best' model based on BIC criterion:
bestBIC = stepAIC(mod1, k=log(length(precip_dt$pmm)), trace=FALSE) # best model is "precip ~ lon + lat" is "best" - agrees with AIC.
# Compare to see if AIC and BIC agree:
# If so, display the agreed-upon model
if (bestAIC$call == bestBIC$call){
model_agreement = print("The AIC and BIC criterion agree on best model. AIC and BIC result in equivalent models and either can be used to represent the best model.")
}
else {
model_agreement = print("Warning! The AIC and BIC criterion do not agree on the best model.You are advised to conduct additional objective functions such as PRESS, leaps, or GCV for additional information before continuing. If no further action is taken, the AIC model will be used.")
}
# Choose which model to output as best model
if (obj_fun == "AIC"){
bestmod = bestAIC # choose the AIC model
print("Action: Choosing AIC model...")
}
else if (obj_fun == "BIC"){
bestmod = bestBIC # choose the BIC model
print("Choosing BIC model.")
}
else {
bestmod = bestAIC # choose the AIC model
print("No model preference stated between AIC or BIC or incorrect string input to obj_fun. Choosing AIC model as best model.")
}
print("Best model information shown below:")
return(bestmod)
}
mod_diagnostics <- function(y, yhat, nvar, CD){
#par(mfrow=c(3,2))
e <- y - yhat
nobs <- length(y)
coef <- nvar+1
## Testing if errors (residuals) are normal and iid
# 1. Normality histogram
hist(e,xlab="Residuals",ylab="Density",probability=T,main="Distribution of Residuals")
lines(sort(e),dnorm(sort(e),mean=0,sd=sd(e)),col="red")
sm.density(e,add=T,col="blue")
legend("topright", c("Normal Fit", "Non-parametric Fit"), lty=c(1,1), lwd=c(2.5,2.5), col=c("red", "blue"))
# 2. Normal QQplot
qqnorm(e,main="Normal Q-Q Plot of Residuals")
qqline(e)
# 3. Observed versus estimates
plot(y,yhat,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precip", xlim=c(min(y),max(y)), ylim=c(min(y), max(y)))
abline(a=0,b=1)
## Testing for heteroskedasticity (which is constant variance of residuals)
# 4. Residuals versus estimates
plot(yhat,e,xlab="Estimated Precip",ylab="Residuals",main="Residuals vs Estimated Precip")
abline(0,0)
## Testing for autocorrelation. If errors are uncorrelated they fall between the dotted lines.
# 5. Autocorrelation plot
cor=acf(e,main="Autocorrelation Plot")
}
loocv_func = function(bestmod) {
# Select only the data from the best model (not full model)
mod_data = bestmod$model
# Initialize leave one out cross validation vector
loocv = vector("numeric", nrow(mod_data))
# slightly different cases for different problems
if (class(bestmod)[1]=='lm') {
for (i in 1:nrow(mod_data)) {
drop_data = mod_data[-i,] # drop one precip value
drop_mod = lm(pmm ~ ., data = drop_data)
x_pred = mod_data[i,]
loocv[i] = predict(drop_mod, x_pred)
}
} else if (class(bestmod)[1]=='glm') {
for (i in 1:nrow(mod_data)) {
drop_data = mod_data[-i,] # drop one precip value
drop_mod = lm(pmm ~ ., data = drop_data, family = bestmod$family)
x_pred = mod_data[i,]
loocv[i] = predict(drop_mod, x_pred, type = 'response')
}
} else if (class(bestmod)[1] == "gam") {
for(i in 1:nrow(mod_data)){
drop_data = mod_data[-i,] # drop one precip value
# names(drop_data) = c("lat","lon","elev")
drop_mod = gam(bestmod$formula, data = drop_data)
x_pred = mod_data[i,]
loocv[i] = predict(drop_mod, x_pred)
}
}
# Plot LOOCV against observations
lim = range(mod_data$pmm, loocv)
plot(mod_data$pmm, loocv, xlim = lim, ylim=lim, xlab="Observed", ylab="X-validated Estimate", main= "LOOCV" )
lines(mod_data$pmm, mod_data$pmm, col = "red")
}
#### GLM Fitting ###########
### Find best GLM
GLM_fit = function(pmm, X, family) {
if (family == "gamma") {
links = c("log", "inverse", "identity")
# clean data and remove zeros
precip_dt = na.omit(precip_dt)
rownames(precip_dt) = NULL
precip_dt$pmm = ifelse(precip_dt$pmm <=0, runif(1, 0.0001, 0.001), precip_dt$pmm)
} else if (family == "binomial"){
links = c("logit", "probit", "cauchit")
}
glm_aic = vector(length = length(links))
N = length(pmm)
combs = leaps(X,pmm, nbest=25) # GEt upto 25 combinations for each
# number of predictors
combos = combs$which
ncombos = length(combos[,1])
xpress=1:ncombos
xmse = 1:ncombos
for(j in 1:length(links)) {
for(i in 1:ncombos) {
xx = X[,combos[i,]]
xx=as.data.frame(xx)
if (family == "gamma") {
zz=glm(pmm ~ ., data=xx, family = Gamma(link=links[j]), maxit=500)
} else if (family == "binomial"){
zz=glm(pmm ~ ., data=xx, family = binomial(link=links[j]), maxit=500)
}
xpress[i]=PRESS(zz)
xmse[i] = sum((zz$res)^2) / (N - length(zz$coef))
}
# Test using AIC objective function
zms=stepAIC(zz, scope=list(upper = ~., lower = ~1), trace=FALSE)
glm_aic[j] = zms$aic
}
aic_df = data.frame(glm_aic)
rownames(aic_df) = links[1:3]
print("Results of AIC for bestfit GLM")
print(aic_df)
sprintf("Choosing the GLM which minimizes AIC: %s family and %s link function.", family, links[which.min(glm_aic)])
# bestmod = glm(pmm ~ ., data = X, family = Gamma(link=links[which.min(glm_aic)]))
if (family == "gamma") {
bestmod = glm(pmm ~ ., data = X, family = Gamma(link=links[which.min(glm_aic)]))
} else if (family == "binomial") {
bestmod = glm(pmm ~ ., data = X, family = binomial(link=links[which.min(glm_aic)]))
} else {
print("Error!")
}
return(bestmod)
}
##### Local Polynomail Fitting ######
#search for best alpha over a range of alpha values between 0 and 1
locpoly_fit = function(pmm, X, family="gaussian", link="identity", glm=FALSE) {
pmm = precip_dt[,4] # indepedent variable
X = precip_dt[,1:3] # all the predictor set
X = as.matrix(X)
nvar=length(X[1,]) #number of variables
N=length(pmm) #number ofdata points
print("checking inputs")
print(family)
print(link)
bestdeg=1
minalpha= 0.3
if(glm==TRUE) minalpha = 0.7
alpha1=seq(minalpha,1.0,by=0.05)
n=length(alpha1)
#get the GCV values for all the alpha values in alpha for order of
# polynomial = 1. kern="bisq" argument is to use the bisquare kernel
# in computing the weights of the neighbors, which are then used in
# the weighted least squares..
z0=gcvplot(pmm ~ X, alpha=alpha1,deg=bestdeg,kern="bisq",ev=dat(),scale=TRUE,family=family,link=link, maxk=5000)
bestalpha = z0$alpha[which.min(z0$alpha)]
# Now fit the LOCFIT model using the best alpha and degree obtained from
#above..
X = data.frame(X)
bestmod=locfit(pmm~lon+lat+elev, data=X, alpha=bestalpha, deg=bestdeg, kern="bisq", scale=TRUE, family=family, link=link, maxk=5000) # WJR: removed ev=dat() and plotting was successfull
bestmod$call$alpha = bestalpha
bestmod$call$deg = bestdeg
bestmod$call$family = family
bestmod$call$link = link
return(bestmod)
}
### F-test for Local Polynomial Fitting ###
loc_Ftest = function(precip_dt, bestmod) {
pmm = precip_dt$pmm
N=length(pmm) #number ofdata points
X = precip_dt[,3]
RSS1 = sum(residuals(bestmod)^2)
nu1 = bestmod$dp[6]
nu2 = bestmod$dp[7]
nu11 = N-2*nu1 + nu2
#linear regression
X=as.matrix(X)
zzLin=lm(pmm~X)
XX = cbind(rep(1,N), X)
# Compute the Hat matrix
hatm = XX %*% solve(t(XX) %*% XX) %*% t(XX)
II = diag(N)
delta0 = t(II-hatm)%*%(II-hatm) #Equation 9.2
nu00 = sum(diag(delta0))
RSS0 = sum(residuals(zzLin)^2)
nu0 = length(zzLin$coef) #number of model coefficients
Fdata = (RSS0 - RSS1)/(nu00 - nu11)
Fdata = (Fdata / (RSS1 / nu11))
Ftheor = qf(0.95,(nu00-nu11), nu11) #95% confidence level..
## Fdata > Ftheor - reject null - i.e., data is otherwise (local polynomial fit is better)
if (Fdata > Ftheor) {
print("F-test:")
sprintf("Reject the Null because F(local poly) = %0.2f > %0.2f = F(linear model).", Fdata, Ftheor)
}
}