#### Problem 2 - Homework 1 for CVEN6833 - Fitting a linear model for January
#### Author: Alvaro Ossandon

#### CLEAR MEMORY
rm(list=ls())
#### Clear the R console
cat("\f")

#### Prevent Warnings
options(warn=-1)

#data required:
Month=1 # 1: Monthly precipitation of January; 2: Monthly precipitation of July
save=FALSE # True for saving figure as pdf file;

#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir="C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW1"
setwd(mainDir)
suppressPackageStartupMessages(source("Lib_HW1.R"))
source("Lib_HW1.R")

#### create a folder to save the figures
if (Month==1){
  month2="January"
  diraux=paste(mainDir,"/Montly_Precep_January",sep = "")  
}else{
  month2="July"
  diraux=paste(mainDir,"/Montly_Precep_July",sep = "")
}
setwd(diraux)
subDir="LM_FIT"
if (file.exists(subDir)){
  setwd(file.path(diraux, subDir))
} else {
  dir.create(file.path(diraux, subDir))
  setwd(file.path(diraux, subDir))
}

## Read data as table (data frame)
test = read.table(paste("http://cires1.colorado.edu/~aslater/CVEN_6833/colo_monthly_precip_0",Month,".dat",sep=""))
#assign names for columns/variables:
names(test)=c("Lat","Long","Elev","Pm")
test1=test
#filtering the -999 values
non_values<- which(test[,4]<0)
test[non_values,4]=NaN

test=na.omit(test)
Pm = test[,4]     #Dependent Variable
#Try up to order 2
test$Lat2=test[,1]^2
test$Long2=test[,2]^2
test$Elev2=test[,3]^2
X = test[,c(1,2,3,5,6,7)]# Independent variable.
#X = test[,1:3]# Independent variable.
lon = X[,1]
lat = X[,2]
elev = X[,3]
precip =Pm

# Basic Scatterplot Matrix
if(save==TRUE){
  pdf("Simple Scatterplot Matrix.pdf") # save figure
  pairs(~lon+lat+elev+precip,data=test, 
        main="Simple Scatterplot Matrix")
  dev.off() 
}else{
  pairs(~lon+lat+elev+precip,data=test, 
        main="Simple Scatterplot Matrix")
}

###################################################################
####### Fit a 'best' linear regression mode  ####
###################################################################
# fit LM
N = length(Pm)
combs = leaps(X,Pm, nbest=25)     #  GEt upto 25 combinations for each
# number of predictors
combos = combs$which
ncombos = length(combos[,1])
xpress=1:ncombos
xmse = 1:ncombos

for(i in 1:ncombos) {
  xx = X[,combos[i,]]
  xx=as.data.frame(xx)
  zz= lm(Pm ~ ., data = xx)
  xpress[i]=PRESS(zz)
  xmse[i] = sum((zz$res)^2) / (N - length(zz$coef))
}
# Test using PRESS objective function
bestmod= lm(Pm ~ ., data = X[,combos[which.min(xpress),]])
bestmod$Call$Press=PRESS(bestmod)
bestmod$combo=combos[which.min(xpress),]
summary(bestmod)
## 
## Call:
## lm(formula = Pm ~ ., data = X[, combos[which.min(xpress), ]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.939 -12.309   0.103   8.143 182.667 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.133e+04  5.260e+03   4.055 5.93e-05 ***
## Lat         -3.184e+02  6.729e+01  -4.732 3.00e-06 ***
## Long         2.908e+02  9.802e+01   2.966  0.00318 ** 
## Elev        -5.064e-02  1.742e-02  -2.907  0.00383 ** 
## Lat2         4.121e+00  8.626e-01   4.777 2.43e-06 ***
## Long2        1.396e+00  4.615e-01   3.024  0.00264 ** 
## Elev2        1.832e-05  3.472e-06   5.276 2.07e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.87 on 442 degrees of freedom
## Multiple R-squared:  0.5837, Adjusted R-squared:  0.578 
## F-statistic: 103.3 on 6 and 442 DF,  p-value: < 2.2e-16
bestmod$Call$Press
## [1] 218133
#Pmhat=predict(bestmod)
Pmhat=bestmod$fitted.values

if (save==TRUE){
  pdf("Precipitation_Scatterplot.pdf", width = 8, height = 6) # save figure
  # Observed versus estimates
  par(mfrow=c(1,1))
  lim=range(Pm,Pmhat)
  plot(Pm,Pmhat,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
  abline(a=0,b=1)
  dev.off() 
}else {
  # Observed versus estimates
  par(mfrow=c(1,1))
  lim=range(Pm,Pmhat)
  plot(Pm,Pmhat,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
  abline(a=0,b=1)
}

###################################################################
######## III. Anova and model diagnostics of your best model
###################################################################
### ANOVA:
anova(bestmod)
## Analysis of Variance Table
## 
## Response: Pm
##            Df Sum Sq Mean Sq  F value    Pr(>F)    
## Lat         1    232     232   0.4845    0.4867    
## Long        1 102327  102327 214.0366 < 2.2e-16 ***
## Elev        1 143638  143638 300.4488 < 2.2e-16 ***
## Lat2        1   9482    9482  19.8328 1.072e-05 ***
## Long2       1  27286   27286  57.0750 2.434e-13 ***
## Elev2       1  13310   13310  27.8414 2.067e-07 ***
## Residuals 442 211311     478                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### MODEL DIAGNOSTICS:
#yy = precip_dt[,4]         # true value of y based on data
Pmest = Pmhat  # model's predicted values of Pm 
nvar = 2                   # number of variables 
mod_diagnostics(Pm, Pmest, nvar,save)

###################################################################
#### IV. Compute cross-validated and fitted estmiates at each ####
####      observation. Plot them against the observed values.  ####
###################################################################

loocv_func(bestmod,X,Pm,save)

###############################################################################################
#### V. Drop 10% of obs, fit best model, predict dropped points. Compute RMSE and R2. Boxplot.
## validation nsample times (dropping a new 10% each time), and outputs boxplots of RMSE and R2.
###############################################################################################
Drop_10_pred(bestmod,X,Pm,save)

###################################################################
## VI. Spatially map the model estimates and the standard error ####
###################################################################

#  read the topography map
x1 = read.table("http://cires1.colorado.edu/~aslater/CVEN_6833/colo_dem.dat")
names(x1)=c("Lat","Long","Elev")
#Try up to order 2
x1$Lat2=x1[,1]^2
x1$Long2=x1[,2]^2
x1$Elev2=x1[,3]^2
ypred =predict(bestmod, newdata=x1[,bestmod$combo], se.fit=T)

#Quilt_plotting(x1[,2],x1[,1],ypred)
plot_surface(x1[,2],x1[,1],ypred,X[,2],X[,1],Pm,save)