#### Problem 10 - Homework 1 for CVEN6833 - Fitting a best linear model as in problem 1 and perform Kriging on the residuals
#### Author: Alvaro Ossandon

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)

#data required: 
Month=1 # 1: Monthly precipitation of January; 2: Monthly precipitation of July
save=FALSE # True for saving figure as pdf file;

#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir="C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW1"
setwd(mainDir)
suppressPackageStartupMessages(source("Lib_HW1.R"))
source("Lib_HW1.R")

#### create a folder to save the figures
if (Month==1){
  diraux=paste(mainDir,"/Montly_Precep_January",sep = "")  
}else{
  diraux=paste(mainDir,"/Montly_Precep_July",sep = "")
}
setwd(diraux)
subDir="Hierarchical Spatial Model"
if (file.exists(subDir)){
  setwd(file.path(diraux, subDir))
} else {
  dir.create(file.path(diraux, subDir))
  setwd(file.path(diraux, subDir))
}

## Read data as table (data frame)
test = read.table(paste("http://cires1.colorado.edu/~aslater/CVEN_6833/colo_monthly_precip_0",Month,".dat",sep=""))
#assign names for columns/variables:
names(test)=c("lat","long","elev","Pm")
test1=test
#filtering the -999 values
non_values<- which(test[,4]<0)
test[non_values,4]=NaN
test=na.omit(test)
Pm = test[,4]/10     #Dependent Variable
X = test[,c(2,1,3)]# Independent variable.
lon = X[,1]
lat = X[,2]
elev = X[,3]

#######################################################################################
################## I Fit a 'best' linear model and get residuals..  ###################
#######################################################################################
# fit LM
N = length(Pm)
combs = leaps(X,Pm, nbest=25)     #  GEt upto 25 combinations for each
# number of predictors
combos = combs$which
ncombos = length(combos[,1])
xpress=1:ncombos
xmse = 1:ncombos

for(i in 1:ncombos) {
  xx = X[,combos[i,]]
  xx=as.data.frame(xx)
  zz= lm(Pm ~ ., data = xx)
  xpress[i]=PRESS(zz)
  xmse[i] = sum((zz$res)^2) / (N - length(zz$coef))
}
# Test using PRESS objective function
bestmod= lm(Pm ~ ., data = X[,combos[which.min(xpress),]])
bestmod$Call$Press=PRESS(bestmod)
bestmod$combo=combos[which.min(xpress),]
summary(bestmod)
## 
## Call:
## lm(formula = Pm ~ ., data = X[, combos[which.min(xpress), ]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8218 -1.5031 -0.1744  1.0509 18.5446 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -39.373736   9.014136  -4.368 1.56e-05 ***
## long         -0.257466   0.075525  -3.409 0.000711 ***
## lat           0.222873   0.101401   2.198 0.028468 *  
## elev          0.003066   0.000196  15.638  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.424 on 445 degrees of freedom
## Multiple R-squared:  0.485,  Adjusted R-squared:  0.4816 
## F-statistic: 139.7 on 3 and 445 DF,  p-value: < 2.2e-16
bestmod$Call$Press
## [1] 2659.412
#######################################################################################
######################################   Kriging on the residuals #####################
#######################################################################################
xe = cbind(lon,lat,elev)
# get residuals..
yHW1 = bestmod$residuals
xs = cbind(lon,lat)
nobs = length(yHW1)
## Compute empirical variogram and plot ...
if (save==TRUE){
  pdf("Variogram_1.pdf", width = 8, height = 6) # save figure 
}
par(mar=c(4,4,1,1))
look<- vgram(xs, yHW1, N=15, lon.lat=FALSE)
bplot.xy(look$d, look$vgram, breaks= look$breaks,
         outline=FALSE,
         xlab="distance (degrees)", ylab="variogram")
points( look$centers, look$stats[2,], col="blue")

# fit of exponential by nonlinear least squares
xd<- look$centers
ym<- look$stats[2,]
sigma<- 0.5# 
nls.fit<- nls( ym ~ sigma^2 + rho*( 1- exp(-xd/theta)),
               start= list(rho=max(look$stats[2,]), theta=quantile(xd,0.1)), control=list( maxiter=200) )
pars<- coefficients(nls.fit)
rho<-pars[1]
theta<-pars[2]
xr=round(max(xd)+sd(xd))
dgrid<- seq(0,xr,length.out=400)
lines(dgrid, sigma^2 + rho*(1 - exp(-1*dgrid/theta)), col="blue", lwd=3)

if (save==TRUE){
  dev.off()
}
## Predict at observation points.. is sigma = 0 then it will be exact.
zz=Krig(xs,yHW1,rho=rho,theta=theta,m=1,sigma2=sigma)
yp = predict(bestmod) + predict.Krig(zz)
pdf("Precipitation_Scatterplot_1.pdf", width = 8, height = 6) # save figure
# Observed versus estimates
par(mfrow=c(1,1))
lim=range(Pm*10,yp*10)
plot(Pm*10,yp*10,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
abline(a=0,b=1)
dev.off()
## png 
##   2
###################################################################
######## III. model diagnostics of kriging
###################################################################

### MODEL DIAGNOSTICS:
Pmest = yp*10  # model's predicted values of Pm 
nvar = 2                   # number of variables 
mod_diagnostics(Pm*10, Pmest, nvar,save)

###############################################################################################
#### V. Drop 10% of obs, fit best model, predict dropped points. Compute RMSE and R2. Boxplot.
## validation nsample times (dropping a new 10% each time), and outputs boxplots of RMSE and R2.
###############################################################################################
mod_data = xs
N = length(Pm)
drop = 10
nsample = 500
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
  i_drop = sample(i_full,N*drop/100)            # can add argument replace=TRUE
  
  drop_dt = mod_data[-i_drop,] # drop 10% precip value
  drop_dt1 = X[-i_drop,bestmod$combo]
  # slightly different cases for different probl
  Pm1=Pm[-i_drop]
  nobs1 = length(Pm1)
  fit_drop = lm(Pm1 ~ ., data = drop_dt1)
  drop_mod=Krig(drop_dt,Pm1,rho=rho,theta=theta,m=1,sigma2=sigma)
  drop_pred=predict.Krig(drop_mod,x=mod_data[i_drop,],drop.Z=TRUE)+
    predict(fit_drop, newdata=X[i_drop,bestmod$combo])
  drop_actual =Pm[i_drop]*10
  skill_rmse[i] = sqrt(mean((drop_actual - drop_pred*10)^2))
  skill_cor[i] = cor(drop_actual,drop_pred*10)
}
# Plot skill of model based on Drop 10% method
if(save==TRUE){
  pdf("boxplots.pdf", width = 8, height = 6) # save figure
  par(mfrow=c(1,2))
  boxplot(skill_rmse, main = "RMSE-Skill", ylim = range(skill_rmse))
  boxplot(skill_cor, main = "Cor-Skill", ylim=range(skill_cor))
  dev.off()
}else{
  par(mfrow=c(1,2))
  boxplot(skill_rmse, main = "RMSE-Skill", ylim = range(skill_rmse))
  boxplot(skill_cor, main = "Cor-Skill", ylim=range(skill_cor))
}

###################################################################
## VI. Spatially map the model estimates and the standard error ####
#### ## Predict on the grid.. and standard error..  ###############
###################################################################
#  read the topography map
predpts = read.table("http://cires1.colorado.edu/~aslater/CVEN_6833/colo_dem.dat")
lat=predpts$V1
lon=predpts$V2
elev=predpts$V3
xe = cbind(lon,lat,elev)
Xe=as.data.frame(xe)
names(Xe)=c("long","lat","elev")
xps=cbind(lon,lat) 
gp = predict.Krig(zz,x=xps,drop.Z=TRUE)
kse = predictSE.Krig(zz, x = xps)
yp = gp + predict(bestmod,newdata=Xe)
kse1=data.matrix(kse)
ypred1=cbind(yp,kse1)*10
# ypred1=cbind(ghat,kse)
ypred=as.data.frame(ypred1)
names(ypred)=c("fit","se")
plot_surface(xps[,1],xps[,2],ypred,xs[,1],xs[,2],Pm*10,save)