#### Anna Starodubtseva
#### CVEN 6833, HW 01
#### Problem 5

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)

# Other
save = FALSE

#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 01/R"
setwd(mainDir)
suppressPackageStartupMessages(source("hw1_library.R"))
source("hw1_library.R")

## Read data
# Load winter precipitation data
data = read.table(paste(mainDir, "/data/Winter_temporal/Max_Winter_Seas_Prec.txt", sep = ""), header = T)
# Omit extreme values
non_values <- which(data[,-1] > 1000, arr.ind = T)
data[non_values] = NaN
data = na.omit(data)
prec_obs = colMeans(data[,-1])  # Mean of location columns
# Load location data
loc = read.table(paste(mainDir, "/data/Precipitaton_data.txt", sep = ""), header = T)

## Create design matrix
X = loc[,1:3]
names(X) = c("Long","Lat","Elev")
lon = X[,1]
lat = X[,2]
elev = X[,3]

## Scatterplot to show variable relationships
precip = prec_obs;
if(save==TRUE){
  pdf("Simple Scatterplot Matrix.pdf") # save figure
  dev.new()
  pairs(~lon+lat+elev+precip,data=X, 
        main="Simple Scatterplot Matrix")
  dev.off() 
}else{
  pairs(~lon+lat+elev+precip,data=X, 
        main="Simple Scatterplot Matrix")
}

#########################################################
####### (i) Local GLM Method #######
#########################################################
# Create best model assuming gamma distribution
links = c("log", "inverse","identity") 
N = length(prec_obs)
combs = leaps(X,prec_obs,nbest=25)     #  Get up to 25 combinations for each
combos = combs$which                # number of predictors
ncombos = length(combos[,1])
glm_val=vector(length = length(links))
bestcombo=vector(length = length(links))
obj_func=1:ncombos

for (j in 1:length(links)) { sprintf("j",j)
    for(i in 1:ncombos) {
        xx = X[,combos[i,]]
        xx=as.data.frame(xx)
        bestmod=locpoly_fit(prec_obs, xx, family = "gamma", link = links[j], glm=T,plot=T)
        obj_func[i] = bestmod$call$gcv
    }
    # Test using GCV objective function
    glm_val[j] = min(obj_func)
    bestcombo[j] = which.min(obj_func)
}
X_best = data.frame(X[,combos[bestcombo[which.min(glm_val)],]])
names(X_best) = names(X[combos[bestcombo[which.min(glm_val)],]])
bestmod_gam = locpoly_fit(prec_obs, X_best, family = "gamma", link = links[which.min(glm_val)], glm=T ,plot=T)
bestmod_gam$combo = combos[bestcombo[which.min(glm_val)],]

# Create best model assuming gaussian distribution
obj_func=1:ncombos
for (j in 1:length(links)) { sprintf("j",j)
    for(i in 1:ncombos) {
        xx = X[,combos[i,]]
        xx=as.data.frame(xx)
        bestmod=locpoly_fit(prec_obs, xx, family = "gaussian", link = "identity", glm=T,plot=T)
        obj_func[i] = bestmod$call$gcv
    }
    # Test using GCV objective function
    glm_val[j] = min(obj_func)
    bestcombo[j] = which.min(obj_func)
}
X_best = data.frame(X[,combos[bestcombo[which.min(glm_val)],]])
names(X_best) = names(X[combos[bestcombo[which.min(glm_val)],]])
bestmod_gau = locpoly_fit(prec_obs,X_best, family = "gaussian", link = links[which.min(glm_val)], glm=T ,plot=T)
bestmod_gau$combo = combos[bestcombo[which.min(glm_val)],]

# Determine best model
if (bestmod_gau$call$gcv > bestmod_gam$call$gcv){
  bestmod=bestmod_gam
  bestmod$family = "gamma"
}else{
  bestmod=bestmod_gau
  bestmod$family = "gaussian"
}

print(summary(bestmod))
## Estimation type: Local Likelihood - Gamma 
## 
## Call:
## locfit(formula = prec_obs ~ ., data = X, alpha = 0.6, maxk = 10000, 
##     deg = 2, kern = "bisq", scale = T, family = "gamma", link = "inverse", 
##     gcv = 0.101765551538974)
## 
## Number of data points:  73 
## Independent variables:  Long 
## Evaluation structure: Rectangular Tree 
## Number of evaluation points:  7 
## Degree of fit:  2 
## Fitted Degrees of Freedom:  5.736
#########################################################
###### (ii) Show the scatterplot of observed and modeled precipitation along with the 1:1 line
#########################################################
prec_hat=predict(bestmod,X_best,se.fit=T)

if (save==TRUE){
  pdf("Precipitation_Scatterplot.pdf", width = 8, height = 6) # save figure
  # Observed versus estimates
  par(mfrow=c(1,1))
  lim=range(prec_obs,prec_hat$fit)
  dev.new()
  plot(prec_obs,prec_hat$fit,xlab="Actual Precipitation (mm)",ylab="Estimated Precipitation (mm)",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
  abline(a=0,b=1)
  dev.off() 
}else {
  # Observed versus estimates
  par(mfrow=c(1,1))
  lim=range(prec_obs,prec_hat$fit)
  plot(prec_obs,prec_hat$fit,xlab="Actual Precipitation (mm)",ylab="Estimated Precipitation (mm)",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
  abline(a=0,b=1)
}

#########################################################
####### (iii) Perform ANOVA and model diagnostics #######
#########################################################
loc_Ftest(prec_obs,prec_hat,X_best,bestmod)
## [1] "F-test:"
## [1] "Reject the Null because F(local poly) = 2.60 > 2.36 = F(linear model)."
mod_diagnostics(prec_obs,prec_hat$fit,2,save)

#########################################################
####### (iv) Compute drop-one cross-validated estimates from the best model and scatterplot them against the observed values
#########################################################
loocv_func(bestmod, X_best, prec_obs, save)

#########################################################
####### (v) Drop 10% of observations, fit the model to the rest of the data and predict the dropped points. Compute RMSE and correlation and show them as boxplots #######
#########################################################
Drop_10_pred(bestmod,X_best,prec_obs, save, bestmod$family)

#########################################################
####### (vi) Spatially map the model estimates and standard error from the best model on the high-resolution grid
#########################################################
elev_grid=read.delim(paste(mainDir, "/data/Elevation_grid_1deg.txt", sep = ""), header = TRUE, sep = "\t", dec = ".")
names(elev_grid) = c("Long","Lat","Elev")
# Predict precipitation based on best model
prec_pred <- predict(bestmod, elev_grid, se.fit = T)

# Plot surface
plot_surface(elev_grid[,1],elev_grid[,2],prec_pred,X[,1],X[,2],prec_obs,save)

Compare to linear regression: The local polynomial method offers greater potential to capture detailed spatial behavior but are less interpretable than linear models