#### Problem 5 - Homework 1 for CVEN6833 - Fitting a local GLM model for July
#### Author: Alvaro Ossandon

#### CLEAR MEMORY
rm(list=ls())

#### Clear the R console
cat("\f")

#### Prevent Warnings
options(warn=-1)

#data required: 
Month=7 # 1: Monthly precipitation of January; 7: Monthly precipitation of July
save=FALSE # True for saving figure as pdf file

#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir="C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW1"
setwd(mainDir)
suppressPackageStartupMessages(source("Lib_HW1.R"))
source("Lib_HW1.R")

#### create a folder to save the figures
if (Month==1){
  diraux=paste(mainDir,"/Montly_Precep_January",sep = "")  
}else{
  diraux=paste(mainDir,"/Montly_Precep_July",sep = "")
}
setwd(diraux)
subDir="LOC_GLM_FIT"
if (file.exists(subDir)){
  setwd(file.path(diraux, subDir))
} else {
  dir.create(file.path(diraux, subDir))
  setwd(file.path(diraux, subDir))
}

## Read data as table (data frame)
test = read.table(paste("http://cires1.colorado.edu/~aslater/CVEN_6833/colo_monthly_precip_0",Month,".dat",sep=""))
#assign names for columns/variables:
names(test)=c("Lat","Long","Elev","Pm")
test1=test
#filtering the -999 values
non_values<- which(test[,4]<0)
test[non_values,4]=NaN
test=na.omit(test)
Pm = test[,4]     #Dependent Variable
X = test[,1:3]# Independent variable.
lon = X[,1]
lat = X[,2]
elev = X[,3]
precip =Pm

# Basic Scatterplot Matrix
if(save==TRUE){
  pdf("Simple Scatterplot Matrix.pdf") # save figure
  pairs(~lon+lat+elev+precip,data=test, 
        main="Simple Scatterplot Matrix")
  dev.off() 
}else{
  pairs(~lon+lat+elev+precip,data=test, 
        main="Simple Scatterplot Matrix")
}

##############################################
########  II Local GLM method   ##############
################################################
##gamma distribution
links = c("log", "inverse","identity") 
N = length(Pm)
combs = leaps(X,Pm, nbest=25)     #  GEt upto 25 combinations for each
# number of predictors
combos = combs$which
ncombos = length(combos[,1])
gcvs_m=1:ncombos
glm_gcv=vector(length = length(links))
best_comb=vector(length = length(links))
for(j in 1:length(links)) { sprintf("j",j)
  
  for(i in 1:ncombos) {
    xx = X[,combos[i,]]
    xx=as.data.frame(xx)
    zz=locpoly_fit(Pm, xx,family="gamma", link=links[j], glm=TRUE,plot=TRUE)
    gcvs_m[i] = zz$call$gcv
  }
  # Test using GCV objective function
  glm_gcv[j]=min(gcvs_m)
  best_comb[j]=which.min(gcvs_m)
}
bestmod1=locpoly_fit(Pm, X[,combos[best_comb[which.min(glm_gcv)],]],family="gamma",
                     link=links[which.min(glm_gcv)], glm=TRUE,plot=TRUE)
bestmod1$combo=combos[best_comb[which.min(glm_gcv)],]
###  gaussian distribution
gcvs_m=1:ncombos
for(i in 1:ncombos) {
  xx = X[,combos[i,]]
  xx=as.data.frame(xx)
  zz=locpoly_fit(Pm, xx,family="gaussian", link="identity", glm=TRUE,plot=TRUE)
  gcvs_m[i] = zz$call$gcv
}
bestmod2=locpoly_fit(Pm,X[,combos[which.min(gcvs_m),]],family="gaussian", link="identity", glm=TRUE,plot=TRUE)
bestmod2$combo=combos[which.min(gcvs_m),]
if (bestmod2$call$gcv>bestmod1$call$gcv){
  bestmod=bestmod1
}else{
  bestmod=bestmod2
}
X=X[,bestmod$combo]

summary(bestmod)
## Estimation type: Local Likelihood - Gamma 
## 
## Call:
## locfit(formula = Pm ~ ., data = X, alpha = 0.6, maxk = 10000, 
##     deg = 2, kern = "bisq", scale = T, family = "gamma", link = "identity", 
##     gcv = 0.066441842309433)
## 
## Number of data points:  446 
## Independent variables:  Lat Long Elev 
## Evaluation structure: Rectangular Tree 
## Number of evaluation points:  123 
## Degree of fit:  2 
## Fitted Degrees of Freedom:  25.155
Pmhat=predict(bestmod,X,se.fit=T)

#Estimate the value of the function at the observed locations..
if(save==TRUE){
  pdf("Precipitation_Scatterplot.pdf", width = 8, height = 6) # save figure
  # Observed versus estimates
  par(mfrow=c(1,1))
  lim=range(Pm,Pmhat$fit)
  plot(Pm,Pmhat$fit,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
  abline(a=0,b=1)
  dev.off() 
}else{
  # Observed versus estimates
  par(mfrow=c(1,1))
  lim=range(Pm,Pmhat$fit)
  plot(Pm,Pmhat$fit,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
  abline(a=0,b=1)
}

###################################################################
######## III. Model diagnostics of your best model
###################################################################

### MODEL DIAGNOSTICS:
#yy = precip_dt[,4]         # true value of y based on data
Pmest = Pmhat$fit        # model's predicted values of Pm
nvar = 2                   # number of variables
mod_diagnostics(Pm, Pmest, nvar,save)

###################################################################
#### IV. Compute cross-validated and fitted estmiates at each ####
####      observation. Plot them against the observed values.  ####
###################################################################

loocv_func(bestmod,X,Pm,save)

###############################################################################################
#### V. Drop 10% of obs, fit best model, predict dropped points. Compute RMSE and R2. Boxplot.
## validation nsample times (dropping a new 10% each time), and outputs boxplots of RMSE and R2.
###############################################################################################
#Drop_10_pred(bestmod,X,Pm,bestmod$call$family)
mod_data = X
N = length(Pm)
drop = 10
nsample = 500
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
family=bestmod$call$family
for (i in 1:nsample){
  i_drop = sample(i_full,N*drop/100)            # can add argument replace=TRUE
  
  drop_dt = mod_data[-i_drop,] # drop 10% precip value
  # slightly different cases for different probl
  Pm1=Pm[-i_drop]
  ##remove small values
  Pm1[Pm1<=0.01]=0.01
  drop_mod=locfit(Pm1 ~., data = drop_dt, alpha=bestmod$call$alpha, maxk = 10000, deg=bestmod$call$deg,kern="bisq"
                  , scale = T, family=bestmod$call$family, link=bestmod$call$link)
  drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
  
  drop_actual = Pm[i_drop]
  skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
  skill_cor[i] = cor(drop_actual,drop_pred)
}
# Plot skill of model based on Drop 10% method
if(save==TRUE){
  pdf("boxplots.pdf", width = 8, height = 6) # save figure
  par(mfrow=c(1,2))
  boxplot(skill_rmse, main = "RMSE-Skill", ylim = range(skill_rmse))
  boxplot(skill_cor, main = "Cor-Skill", ylim=range(skill_cor))
  dev.off()
}else{
  par(mfrow=c(1,2))
  boxplot(skill_rmse, main = "RMSE-Skill", ylim = range(skill_rmse))
  boxplot(skill_cor, main = "Cor-Skill", ylim=range(skill_cor))
}

###################################################################
## VI. Spatially map the model estimates and the standard error ####
###################################################################
#  read the topography map
x1 = read.table("http://cires1.colorado.edu/~aslater/CVEN_6833/colo_dem.dat")
names(x1)=c("Lat","Long","Elev")
ypred <- predict(bestmod,newdata=x1,se.fit=T)
#Quilt_plotting(x1[,2],x1[,1],ypred)
plot_surface(x1[,2],x1[,1],ypred,X[,2],X[,1],Pm,save)