#### Problem 2 - Homework 1 for CVEN6833 - Fitting a linear model for July
#### Author: Alvaro Ossandon

#### CLEAR MEMORY
rm(list=ls())
#### Clear the R console
cat("\f")

#### Prevent Warnings
options(warn=-1)

#data required:
Month=7 # 1: Monthly precipitation of January; 2: Monthly precipitation of July
save=FALSE # True for saving figure as pdf file;

#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir="C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW1"
setwd(mainDir)
suppressPackageStartupMessages(source("Lib_HW1.R"))
source("Lib_HW1.R")

#### create a folder to save the figures
if (Month==1){
  month2="January"
  diraux=paste(mainDir,"/Montly_Precep_January",sep = "")  
}else{
  month2="July"
  diraux=paste(mainDir,"/Montly_Precep_July",sep = "")
}
setwd(diraux)
subDir="LM_FIT"
if (file.exists(subDir)){
  setwd(file.path(diraux, subDir))
} else {
  dir.create(file.path(diraux, subDir))
  setwd(file.path(diraux, subDir))
}

## Read data as table (data frame)
test = read.table(paste("http://cires1.colorado.edu/~aslater/CVEN_6833/colo_monthly_precip_0",Month,".dat",sep=""))
#assign names for columns/variables:
names(test)=c("Lat","Long","Elev","Pm")
test1=test
#filtering the -999 values
non_values<- which(test[,4]<0)
test[non_values,4]=NaN

test=na.omit(test)
Pm = test[,4]     #Dependent Variable
#Try up to order 2
test$Lat2=test[,1]^2
test$Long2=test[,2]^2
test$Elev2=test[,3]^2
X = test[,c(1,2,3,5,6,7)]# Independent variable.
#X = test[,1:3]# Independent variable.
lon = X[,1]
lat = X[,2]
elev = X[,3]
precip =Pm

# Basic Scatterplot Matrix
if(save==TRUE){
  pdf("Simple Scatterplot Matrix.pdf") # save figure
  pairs(~lon+lat+elev+precip,data=test, 
        main="Simple Scatterplot Matrix")
  dev.off() 
}else{
  pairs(~lon+lat+elev+precip,data=test, 
        main="Simple Scatterplot Matrix")
}

###################################################################
####### Fit a 'best' linear regression mode  ####
###################################################################
# fit LM
N = length(Pm)
combs = leaps(X,Pm, nbest=25)     #  GEt upto 25 combinations for each
# number of predictors
combos = combs$which
ncombos = length(combos[,1])
xpress=1:ncombos
xmse = 1:ncombos

for(i in 1:ncombos) {
  xx = X[,combos[i,]]
  xx=as.data.frame(xx)
  zz= lm(Pm ~ ., data = xx)
  xpress[i]=PRESS(zz)
  xmse[i] = sum((zz$res)^2) / (N - length(zz$coef))
}
# Test using PRESS objective function
bestmod= lm(Pm ~ ., data = X[,combos[which.min(xpress),]])
bestmod$Call$Press=PRESS(bestmod)
bestmod$combo=combos[which.min(xpress),]
summary(bestmod)
## 
## Call:
## lm(formula = Pm ~ ., data = X[, combos[which.min(xpress), ]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.826  -9.123  -1.793   7.065  76.434 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.790e+04  3.506e+03   5.106 4.90e-07 ***
## Long         3.286e+02  6.628e+01   4.959 1.02e-06 ***
## Elev         3.509e-02  1.179e-02   2.977  0.00307 ** 
## Lat2        -5.031e-02  8.080e-03  -6.227 1.11e-09 ***
## Long2        1.513e+00  3.121e-01   4.847 1.74e-06 ***
## Elev2       -3.532e-06  2.350e-06  -1.503  0.13361    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.78 on 440 degrees of freedom
## Multiple R-squared:  0.4459, Adjusted R-squared:  0.4396 
## F-statistic: 70.81 on 5 and 440 DF,  p-value: < 2.2e-16
bestmod$Call$Press
## [1] 98708.26
#Pmhat=predict(bestmod)
Pmhat=bestmod$fitted.values

if (save==TRUE){
  pdf("Precipitation_Scatterplot.pdf", width = 8, height = 6) # save figure
  # Observed versus estimates
  par(mfrow=c(1,1))
  lim=range(Pm,Pmhat)
  plot(Pm,Pmhat,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
  abline(a=0,b=1)
  dev.off() 
}else {
  # Observed versus estimates
  par(mfrow=c(1,1))
  lim=range(Pm,Pmhat)
  plot(Pm,Pmhat,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
  abline(a=0,b=1)
}

###################################################################
######## III. Anova and model diagnostics of your best model
###################################################################
### ANOVA:
anova(bestmod)
## Analysis of Variance Table
## 
## Response: Pm
##            Df Sum Sq Mean Sq  F value    Pr(>F)    
## Long        1  33459   33459 153.1795 < 2.2e-16 ***
## Elev        1  26660   26660 122.0531 < 2.2e-16 ***
## Lat2        1  11489   11489  52.5972 1.859e-12 ***
## Long2       1   5238    5238  23.9810 1.369e-06 ***
## Elev2       1    493     493   2.2584    0.1336    
## Residuals 440  96109     218                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### MODEL DIAGNOSTICS:
#yy = precip_dt[,4]         # true value of y based on data
Pmest = Pmhat  # model's predicted values of Pm 
nvar = 2                   # number of variables 
mod_diagnostics(Pm, Pmest, nvar,save)

###################################################################
#### IV. Compute cross-validated and fitted estmiates at each ####
####      observation. Plot them against the observed values.  ####
###################################################################

loocv_func(bestmod,X,Pm,save)

###############################################################################################
#### V. Drop 10% of obs, fit best model, predict dropped points. Compute RMSE and R2. Boxplot.
## validation nsample times (dropping a new 10% each time), and outputs boxplots of RMSE and R2.
###############################################################################################
Drop_10_pred(bestmod,X,Pm,save)

###################################################################
## VI. Spatially map the model estimates and the standard error ####
###################################################################

#  read the topography map
x1 = read.table("http://cires1.colorado.edu/~aslater/CVEN_6833/colo_dem.dat")
names(x1)=c("Lat","Long","Elev")
#Try up to order 2
x1$Lat2=x1[,1]^2
x1$Long2=x1[,2]^2
x1$Elev2=x1[,3]^2
ypred =predict(bestmod, newdata=x1[,bestmod$combo], se.fit=T)

#Quilt_plotting(x1[,2],x1[,1],ypred)
plot_surface(x1[,2],x1[,1],ypred,X[,2],X[,1],Pm,save)