#### Anna Starodubtseva
#### CVEN 6833, HW 01
#### Problem 6 - GAM

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)

# Other
save = FALSE

#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 01/R"
setwd(mainDir)
suppressPackageStartupMessages(source("hw1_library.R"))
source("hw1_library.R")

## Read data
# Load winter precipitation data
data = read.table(paste(mainDir, "/data/Winter_temporal/Max_Winter_Seas_Prec.txt", sep = ""), header = T)
# Omit extreme values
non_values <- which(data[,-1] > 1000, arr.ind = T)
data[non_values] = NaN
data = na.omit(data)
prec_obs = colMeans(data[,-1])  # Mean of location columns
# Load location data
loc = read.table(paste(mainDir, "/data/Precipitaton_data.txt", sep = ""), header = T)

## Create design matrix
X = loc[,1:3]
names(X) = c("Long","Lat","Elev")
lon = X[,1]
lat = X[,2]
elev = X[,3]
# Add higher order terms
X$Elev2 = X[,3]^2
## Scatterplot to show variable relationships
precip = prec_obs;
if(save==TRUE){
  pdf("Simple Scatterplot Matrix.pdf") # save figure
  dev.new()
  pairs(~lon+lat+elev+precip,data=X, 
        main="Simple Scatterplot Matrix")
  dev.off() 
}else{
  pairs(~lon+lat+elev+precip,data=X, 
        main="Simple Scatterplot Matrix")
}

#########################################################
###################### (i) GAM ##########################
#########################################################
gcvs = vector(mode = "numeric", length = 9)
mod = vector(mode = "list", length = 9)

for (i in 1:length(mod)) {
    if (i == 1) {           # Model 1: Spline - Long, Lat, Elev
        mod[[i]] = gam(prec_obs ~ s(Long) + s(Lat) + s(Elev), data = X)
    } else if (i == 2) {    # Model 2: Spline - Long & Elev, 1st Order Polynomial - Lat 
        mod[[i]] = gam(prec_obs ~ s(Long) + Lat + s(Elev), data = X)
    } else if (i == 3) {    # Model 3: Spline - Lat & Elev, 1st Order Polynomial - Long 
        mod[[i]] = gam(prec_obs ~ Long + s(Lat) + s(Elev), data = X)
    } else if (i == 4) {    # Model 4: Spline - Lat & Long, 1st Order Polynomial - Elev
        mod[[i]] = gam(prec_obs ~ s(Long) + s(Lat) + Elev, data = X)
    } else if (i == 5) {    # Model 5: Spline - Elev, 1st Order Polynomial - Long & Lat
        mod[[i]] = gam(prec_obs ~ Long + Lat + s(Elev), data = X)
    } else if (i == 6) {    # Model 6: Spline - Long & Lat, 2nd Order Polynomial - Elev
        mod[[i]] = gam(prec_obs ~ s(Long) + s(Lat) + Elev2, data = X)
    } else if (i == 7) {    # Model 7: Spline - Long, 1st Order Poly. - Lat, 2nd Order Poly. - Elev
        mod[[i]] = gam(prec_obs ~ s(Long) + Lat + Elev2, data = X)
    } else if (i == 8) {    # Model 8: Spline - Lat, 1st Order Poly. - Long, 2nd Order Poly. - Elev
        mod[[i]] = gam(prec_obs ~ Long + s(Lat) + Elev2, data = X)
    } else if (i == 9) {    # Model 9: 1st Order Poly. - Lat & Long, 2nd Order Poly. - Elev
        mod[[i]] = gam(prec_obs ~ Long + Lat + Elev2, data = X)
    }
    gcvs[i] = mod[[i]]$gcv.ubre
}
bestmod = mod[[which.min(gcvs)]]
print(summary(bestmod))
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## prec_obs ~ s(Long) + s(Lat) + Elev
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.304962   3.470312   4.698 1.42e-05 ***
## Elev         0.002558   0.002131   1.200    0.234    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F  p-value    
## s(Long) 5.655  6.787 6.370 1.54e-05 ***
## s(Lat)  1.000  1.000 0.196    0.659    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.359   Deviance explained = 42.7%
## GCV = 57.372  Scale est. = 50.57     n = 73
#########################################################
###### (ii) Show the scatterplot of observed and modeled precipitation along with the 1:1 line
#########################################################
prec_hat = predict.gam(bestmod, X, se.fit = F, type = "response")

if (save==TRUE){
  pdf("Precipitation_Scatterplot.pdf", width = 8, height = 6) # save figure
  # Observed versus estimates
  par(mfrow=c(1,1))
  lim=range(prec_obs,prec_hat)
  dev.new()
  plot(prec_obs,prec_hat,xlab="Actual Precipitation (mm)",ylab="Estimated Precipitation (mm)",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
  abline(a=0,b=1)
  dev.off() 
}else {
  # Observed versus estimates
  par(mfrow=c(1,1))
  lim=range(prec_obs,prec_hat)
  plot(prec_obs,prec_hat,xlab="Actual Precipitation (mm)",ylab="Estimated Precipitation (mm)",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
  abline(a=0,b=1)
}

#########################################################
####### (iii) Perform ANOVA and model diagnostics #######
#########################################################
anova(bestmod)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## prec_obs ~ s(Long) + s(Lat) + Elev
## 
## Parametric Terms:
##      df     F p-value
## Elev  1 1.441   0.234
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F  p-value
## s(Long) 5.655  6.787 6.370 1.54e-05
## s(Lat)  1.000  1.000 0.196    0.659
mod_diagnostics(prec_obs,prec_hat,2,save)

#########################################################
####### (iv) Compute drop-one cross-validated estimates from the best model and scatterplot them against the observed values
#########################################################
loocv_func(bestmod, X, prec_obs, save)

#########################################################
####### (v) Drop 10% of observations, fit the model to the rest of the data and predict the dropped points. Compute RMSE and correlation and show them as boxplots #######
#########################################################
Drop_10_pred(bestmod,X,prec_obs, save)

#########################################################
####### (vi) Spatially map the model estimates and standard error from the best model on the high-resolution grid
#########################################################
elev_grid=read.delim(paste(mainDir, "/data/Elevation_grid_1deg.txt", sep = ""), header = TRUE, sep = "\t", dec = ".")
names(elev_grid) = c("Long","Lat","Elev")
# Predict precipitation based on best model
prec_pred <- predict.gam(bestmod, elev_grid, se.fit = T, type = "response")

# Plot surface
plot_surface(elev_grid[,1],elev_grid[,2],prec_pred,X[,1],X[,2],prec_obs,save)