#### Anna Starodubtseva
#### CVEN 6833, HW 01
#### Problem 2(b) - Linear regression

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)

# Other
save = FALSE

#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 01/R"
setwd(mainDir)
suppressPackageStartupMessages(source("hw1_library.R"))
source("hw1_library.R")

## Read data
# Load winter precipitation data
data = read.table(paste(mainDir, "/data/Winter_temporal/Max_Winter_Seas_Prec.txt", sep = ""), header = T)
# Omit extreme values
non_values <- which(data[,-1] > 1000, arr.ind = T)
data[non_values] = NaN
data = na.omit(data)
prec_obs = colMeans(data[,-1])  # Mean of location columns
prec_obs = colMeans(data[,2:ncol(data)])    # Mean of location columns
# Load location data
loc = read.table(paste(mainDir, "/data/Precipitaton_data.txt", sep = ""), header = T)

## Create design matrix
X = loc[,1:3]
names(X) = c("Long","Lat","Elev")
lon = X[,1]
lat = X[,2]
elev = X[,3]
# Add interaction terms
X$LongLat = X[,1]*X[,2]
X$LongElev = X[,1]*X[,3]
X$LatElev = X[,2]*X[,3]

## Scatterplot to show variable relationships
precip = prec_obs;
if(save==TRUE){
  pdf("Simple Scatterplot Matrix.pdf") # save figure
  #dev.new()
  pairs(~lon+lat+elev+precip,data=X, 
        main="Simple Scatterplot Matrix")
}else{
    #dev.new()
  pairs(~lon+lat+elev+precip,data=X, 
        main="Simple Scatterplot Matrix")
}

#########################################################
####### (i) Fit a 'best' linear regression model ########
#########################################################
# Determine possible combinations of variables
N = length(prec_obs)
combs = leaps(X,prec_obs,nbest=25)     #  Get up to 25 combinations for each
combos = combs$which                # number of predictors
ncombos = length(combos[,1])
obj_func = 1:ncombos
xmse = 1:ncombos

# Determine best combination of parameters utilizing AIC objective function
for(i in 1:ncombos) {
  x_i = X[,combos[i,]]
  x_i=as.data.frame(x_i)
  mod_i = lm(prec_obs ~ ., data = x_i)
  obj_func[i]=AIC(mod_i)
  xmse[i] = sum((mod_i$res)^2) / (N - length(mod_i$coef))
}
# Compute best model
bestcombo = combos[which.min(obj_func),]
coef = data.frame(X[,bestcombo])
names(coef) = names(X[bestcombo])
bestmod = lm(prec_obs ~ ., coef)
bestmod$Call$AIC=AIC(bestmod)
bestmod$Call$combo=bestcombo
print(summary(bestmod))
## 
## Call:
## lm(formula = prec_obs ~ ., data = coef)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.8351  -4.2677  -0.8141   2.9958  25.5307 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 21.5129810  2.7260522   7.892 2.89e-11 ***
## Elev        -0.0981481  0.0191692  -5.120 2.57e-06 ***
## LongElev    -0.0009075  0.0001795  -5.055 3.30e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.672 on 70 degrees of freedom
## Multiple R-squared:  0.2748, Adjusted R-squared:  0.254 
## F-statistic: 13.26 on 2 and 70 DF,  p-value: 1.309e-05
#########################################################
###### (ii) Show the scatterplot of observed and modeled precipitation along with the 1:1 line
#########################################################
prec_hat=bestmod$fitted.values

if (save==TRUE){
  pdf("Precipitation_Scatterplot.pdf", width = 8, height = 6) # save figure
  # Observed versus estimates
  par(mfrow=c(1,1))
  lim=range(prec_obs,prec_hat)
  #dev.new()
  plot(prec_obs,prec_hat,xlab="Actual Precipitation (mm)",ylab="Estimated Precipitation (mm)",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
  abline(a=0,b=1)
  dev.off() 
}else {
  # Observed versus estimates
  par(mfrow=c(1,1))
  lim=range(prec_obs,prec_hat)
  #dev.new()
  plot(prec_obs,prec_hat,xlab="Actual Precipitation (mm)",ylab="Estimated Precipitation (mm)",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
  abline(a=0,b=1)
}

#########################################################
####### (iii) Perform ANOVA and model diagnostics #######
#########################################################
print(anova(bestmod))
## Analysis of Variance Table
## 
## Response: prec_obs
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## Elev       1   56.7   56.72  0.9636  0.3297    
## LongElev   1 1504.1 1504.09 25.5554 3.3e-06 ***
## Residuals 70 4119.9   58.86                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod_diagnostics(prec_obs,prec_hat,2,save)

#########################################################
####### (iv) Compute drop-one cross-validated estimates from the best model and scatterplot them against the observed values
#########################################################
loocv_func(bestmod,X,prec_obs,save)

#########################################################
####### (v) Drop 10% of observations, fit the model to the rest of the data and predict the dropped points. Compute RMSE and correlation and show them as boxplots #######
#########################################################
Drop_10_pred(bestmod,X,prec_obs,save)

#########################################################
####### (vi) Spatially map the model estimates and standard error from the best model on the high-resolution grid
#########################################################
elev_grid=read.delim(paste(mainDir, "/data/Elevation_grid_1deg.txt", sep = ""), header = TRUE, sep = "\t", dec = ".")
names(elev_grid) = c("Long","Lat","Elev")
# Add interaction terms
elev_grid$LongLat = elev_grid[,1]*elev_grid[,2]
elev_grid$LongElev = elev_grid[,1]*elev_grid[,3]
elev_grid$LatElev = elev_grid[,2]*elev_grid[,3]
# Predict precipitation based on best model
prec_pred = predict(bestmod, elev_grid[,bestcombo], se.fit = T)

# Plot surface
plot_surface(elev_grid[,1],elev_grid[,2],prec_pred,X[,1],X[,2],prec_obs,save)