#### Problem 6 - Homework 1 for CVEN6833 - Fitting a GAM model for July
#### Author: Alvaro Ossandon

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)

#data required: 
Month=7 # 1: Monthly precipitation of January; 2: Monthly precipitation of July
save=FALSE # True for saving figure as pdf file;


#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir="C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW1"
setwd(mainDir)
suppressPackageStartupMessages(source("Lib_HW1.R"))
source("Lib_HW1.R")

#### create a folder to save the figures
if (Month==1){
  diraux=paste(mainDir,"/Montly_Precep_January",sep = "")  
}else{
  diraux=paste(mainDir,"/Montly_Precep_July",sep = "")
}
setwd(diraux)
subDir="GAM_FIT"
if (file.exists(subDir)){
  setwd(file.path(diraux, subDir))
} else {
  dir.create(file.path(diraux, subDir))
  setwd(file.path(diraux, subDir))
}

## Read data as table (data frame)
test = read.table(paste("http://cires1.colorado.edu/~aslater/CVEN_6833/colo_monthly_precip_0",Month,".dat",sep=""))
#assign names for columns/variables:
names(test)=c("Lat","Long","Elev","Pm")
test1=test
#filtering the -999 values
non_values<- which(test[,4]<0)
test[non_values,4]=NaN
test=na.omit(test)
test$Elev2=test[,3]^2
Pm = test[,4]     #Dependent Variable
X = test[,c(1,2,3,5)]# Independent variable.
lon = X[,1]
lat = X[,2]
elev = X[,3]
precip =Pm

# Basic Scatterplot Matrix
if (save==TRUE){
  pdf("Simple Scatterplot Matrix.pdf") # save figure
  pairs(~lon+lat+elev+precip,data=test, 
        main="Simple Scatterplot Matrix")
  dev.off() 
}else{
  pairs(~lon+lat+elev+precip,data=test, 
        main="Simple Scatterplot Matrix")
}

#####################################################
###### Identify the best additive model  ############
#####################################################
# 9 models will be tested, each model will considered three variables: Lont, Lat, Elev
gcvs=vector(mode="numeric", length=9)
#model 1: spline for each variable
bestmod1 = gam(Pm~s(Long)+s(Lat)+s(Elev), data=test)
gcvs[1]=bestmod1$gcv.ubre
#model 2: spline for Long and Elev, and polinomial order 1 for Lat
bestmod2 = gam(Pm~s(Long)+Lat+s(Elev), data=test)
gcvs[2]=bestmod2$gcv.ubre
#model 3: spline for Lat and Elev, and polinomial order 1 for Long
bestmod3 = gam(Pm~Long+s(Lat)+s(Elev), data=test)
gcvs[3]=bestmod3$gcv.ubre
#model 4: spline for Lat and Long, and polinomial order 1 for Elev
bestmod4 = gam(Pm~s(Long)+s(Lat)+Elev, data=test)
gcvs[4]=bestmod4$gcv.ubre
#model 5: spline for Elev, and polinomial order 1 for Lat and Long
bestmod5 = gam(Pm~Long+Lat+s(Elev), data=test)
gcvs[5]=bestmod5$gcv.ubre
#model 6: spline for Lat, Long, and polinomial order 2 for Elev
bestmod6 = gam(Pm~s(Long)+s(Lat)+Elev2, data=test)
gcvs[6]=bestmod6$gcv.ubre
#model 7: spline for Long, polinomial order 1 for Lat, and polinomial order 2 for Elev
bestmod7 = gam(Pm~s(Long)+Lat+Elev2, data=test)
gcvs[7]=bestmod7$gcv.ubre
#model 8: spline for Lat, polinomial order 1 for Long, and polinomial order 2 for Elev
bestmod8 = gam(Pm~Long+s(Lat)+Elev2, data=test)
gcvs[8]=bestmod8$gcv.ubre
#model 9: polinomial order 1 for Lat, Long, and polinomial order 2 for Elev
bestmod9 = gam(Pm~Long+Lat+Elev2, data=test)
gcvs[9]=bestmod9$gcv.ubre

# best model is which has the minimum gcv
if (which.min(gcvs)==1){
  bestmod=bestmod1
}else if(which.min(gcvs)==2){
  bestmod=bestmod2
}else if (which.min(gcvs)==3){
  bestmod=bestmod3
}else if (which.min(gcvs)==4){
  bestmod=bestmod4
}else if (which.min(gcvs)==5){
  bestmod=bestmod5
}else if (which.min(gcvs)==6){
  bestmod=bestmod6
}else if (which.min(gcvs)==7){
  bestmod=bestmod7
}else if (which.min(gcvs)==8){
  bestmod=bestmod8
}else{
  bestmod=bestmod9
}
 summary(bestmod)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Pm ~ s(Long) + s(Lat) + s(Elev)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  55.0931     0.6257   88.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df      F p-value    
## s(Long) 7.512  8.445 46.027  <2e-16 ***
## s(Lat)  5.934  7.102  3.274  0.0019 ** 
## s(Elev) 2.003  2.538 86.147  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.552   Deviance explained = 56.8%
## GCV = 181.27  Scale est. = 174.59    n = 446
Pmhat=predict(bestmod)
if (save==TRUE){
  pdf("Precipitation_Scatterplot.pdf", width = 8, height = 6) # save figure
  # Observed versus estimates
  par(mfrow=c(1,1))
  lim=range(Pm,Pmhat)
  plot(Pm,Pmhat,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
  abline(a=0,b=1)
  dev.off() 
}else {
  # Observed versus estimates
  par(mfrow=c(1,1))
  lim=range(Pm,Pmhat)
  plot(Pm,Pmhat,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
  abline(a=0,b=1)
}

###################################################################
######## III. Anova and model diagnostics of your best model
###################################################################
### ANOVA:
anova(bestmod)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Pm ~ s(Long) + s(Lat) + s(Elev)
## 
## Approximate significance of smooth terms:
##           edf Ref.df      F p-value
## s(Long) 7.512  8.445 46.027  <2e-16
## s(Lat)  5.934  7.102  3.274  0.0019
## s(Elev) 2.003  2.538 86.147  <2e-16
### MODEL DIAGNOSTICS:
#yy = precip_dt[,4]         # true value of y based on data
Pmest = Pmhat  # model's predicted values of Pm 
nvar = 2                   # number of variables 
mod_diagnostics(Pm, Pmest, nvar,save)

###################################################################
#### IV. Compute cross-validated and fitted estmiates at each ####
####      observation. Plot them against the observed values.  ####
###################################################################

loocv_func(bestmod,X,Pm,save)

###################################################################
#### V. Compute cross-validated and fitted estmiates at each ####
####      observation. Plot them against the observed values.  ####
###################################################################

Drop_10_pred(bestmod,X,Pm,save)

###################################################################
## VI. Spatially map the model estimates and the standard error ####
###################################################################
#  read the topography map
x1 = read.table("http://cires1.colorado.edu/~aslater/CVEN_6833/colo_dem.dat")
names(x1)=c("Lat","Long","Elev")

ypred =predict.gam(bestmod, newdata=x1[,1:3], se.fit=T,type="response")
#Quilt_plotting(x1[,2],x1[,1],ypred)
plot_surface(x1[,2],x1[,1],ypred,X[,2],X[,1],Pm,save)