#### Problem 5 - Homework 1 for CVEN6833
#### Author: Billy Raseman based on the work of Emily Gill and Adam McCurdy 

#### CLEAR MEMORY
rm(list=ls())

#### Prevent Warnings
options(warn=-1)

#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
setwd("C:/Users/Billy/Google Drive/CU Boulder Courses (CVEN)/CVEN 6833 Advanced Data Analysis/Homework/Homework 1")
suppressPackageStartupMessages(source("hw1_lib.R"))
source("hw1_lib.R")

#### READ IN DATA

## Read data as table from local directory 
precip = read.table("data_files/climatol_ann.txt")      # ann avg precip 246 x 4
grid = read.table("data_files/india_grid_topo.txt")     # high res DEM 3384 x 3
rajeevan = read.table("data_files/rajeevan_grid.txt")   # rajeevan grid 357 x 2

## Organize data into a data frame with names for columns/variables:
precip_dt = data.frame(precip)                          # ann avg precip at 246 locations
colnames(precip_dt) = c("lon", "lat", "elev", "pmm")
india_grid = data.frame(grid)                           # high resolution DEM (more than just India)
colnames(india_grid) = c("lon", "lat", "elev")
raj_grid = data.frame(rajeevan[,2], rajeevan[,1])       # lat/lon for just India
colnames(raj_grid) = c("lon","lat")
# (Now that these are data frames you can call out single variables using $$ operator...ex: precip_dt$pmm, india_grid$lon)
# For example: precip_dt$pmm  raj_grid$lat
# To view a data frame or matrix quickly: head(precip_dt)

# Name variables of interest

lon = precip_dt[,1]
lat = precip_dt[,2]
elev = precip_dt[,3]
precip = precip_dt[,4]

# Basic Scatterplot Matrix
pairs(~lon+lat+elev+precip,data=precip_dt, 
      main="Simple Scatterplot Matrix")

### Identify additive models based on scatterplots 

bestmod = gam(pmm~s(lon)+s(lat)+te(elev), data=precip_dt)

######## 2. Anova and model diagnostics of your best model
### ANOVA:
# F test on best model:
anova.gam(bestmod)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## pmm ~ s(lon) + s(lat) + te(elev)
## 
## Approximate significance of smooth terms:
##            edf Ref.df      F  p-value
## s(lon)   6.693  7.842 16.543  < 2e-16
## s(lat)   1.000  1.000 37.563 3.38e-09
## te(elev) 2.053  2.359  5.154    0.004
### MODEL DIAGNOSTICS: *** Challenge = make your own diagnostics function that you can use on any type of model.
yy = precip_dt[,4]         # true value of y based on data
yest = predict(bestmod)    # model's predicted values of y 
nvar = 2                   # number of variables 
mod_diagnostics(yy, yest, nvar)

###################################################################
#### iii. Compute cross-validated and fitted estmiates at each ####
####      observation. Plot them against the observed values.  ####
###################################################################

loocv_func(bestmod)

#### 4. Drop 10% of obs, fit best model, predict dropped points. Compute RMSE and R2. Boxplot.
## **** Challenge: The loop below does this five times. Turn these commands into a external FUNCTION that loops through this 
## validation nsample times (dropping a new 10% each time), and outputs boxplots of RMSE and R2.

N = length(precip_dt$pmm)
drop = 10
nsample = 500
i_full = 1:N 

# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)    

for (i in 1:nsample){
  i_drop = sample(i_full,N*drop/100)            # can add argument replace=TRUE
  
  drop_dt = precip_dt[-i_drop,]
  fit_drop = gam(bestmod$formula, data = drop_dt)
  
  drop_pred = predict(fit_drop, newdata=precip_dt[i_drop,])     
  # notice that newdata has all four columns, but the model knows to pull "pmm" col because of the way the model was initially specified.
  
  drop_actual = precip_dt[i_drop,]$pmm 
  
  skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
  skill_cor[i] = cor(drop_actual,drop_pred)
}

# Plot skill of model based on Drop 10% method
par(mfrow=c(1,2))
boxplot(skill_rmse, main = "RMSE-Skill", ylim = range(skill_rmse))
boxplot(skill_cor, main = "Cor-Skill", ylim=range(skill_cor))

#### 5. Estimate precip and standard errors on high resolution DEM grid and show as a 3-D plot.
# Use the best fit model with India DEM to predict rainfall using lat and lon from DEM

### Quilt plotting code for India grid
par(mfrow=c(1,1))

xv1=data.frame(india_grid[,1:2])
xv2=data.frame(raj_grid[,1:2])

zzint=row.match(xv2,xv1)
indexnotna = which(!is.na(zzint))
indexna = which( is.na(zzint) )

## create the new rajeevan grid
rajeevgridel = cbind(raj_grid[indexnotna,1:2],india_grid[zzint[indexnotna],3])  

X=precip_dt[,1:3]
Y=precip_dt[,4]

X = as.data.frame(rajeevgridel)
names(X)=c("lon","lat","elev")

  ypred <- predict(bestmod,newdata=X,se.fit=T)
  
  quilt.plot(X[,1],X[,2],ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Precipitation on DEM Grid (mm)')
  world(add=T,lwd=1)

  quilt.plot(X[,1],X[,2],ypred$se.fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Predicted Standard Error on DEM Grid (mm)')
  world(add=T,lwd=1)