#### Problem 2 - Homework 1 for CVEN6833 - Fitting a linear model for July
#### Author: Alvaro Ossandon
#### CLEAR MEMORY
rm(list=ls())
#### Clear the R console
cat("\f")
#### Prevent Warnings
options(warn=-1)
#data required:
Month=7 # 1: Monthly precipitation of January; 2: Monthly precipitation of July
save=FALSE # True for saving figure as pdf file;
#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir="C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW1"
setwd(mainDir)
suppressPackageStartupMessages(source("Lib_HW1.R"))
source("Lib_HW1.R")
#### create a folder to save the figures
if (Month==1){
month2="January"
diraux=paste(mainDir,"/Montly_Precep_January",sep = "")
}else{
month2="July"
diraux=paste(mainDir,"/Montly_Precep_July",sep = "")
}
setwd(diraux)
subDir="LM_FIT"
if (file.exists(subDir)){
setwd(file.path(diraux, subDir))
} else {
dir.create(file.path(diraux, subDir))
setwd(file.path(diraux, subDir))
}
## Read data as table (data frame)
test = read.table(paste("http://cires1.colorado.edu/~aslater/CVEN_6833/colo_monthly_precip_0",Month,".dat",sep=""))
#assign names for columns/variables:
names(test)=c("Lat","Long","Elev","Pm")
test1=test
#filtering the -999 values
non_values<- which(test[,4]<0)
test[non_values,4]=NaN
test=na.omit(test)
Pm = test[,4] #Dependent Variable
#Try up to order 2
test$Lat2=test[,1]^2
test$Long2=test[,2]^2
test$Elev2=test[,3]^2
X = test[,c(1,2,3,5,6,7)]# Independent variable.
#X = test[,1:3]# Independent variable.
lon = X[,1]
lat = X[,2]
elev = X[,3]
precip =Pm
# Basic Scatterplot Matrix
if(save==TRUE){
pdf("Simple Scatterplot Matrix.pdf") # save figure
pairs(~lon+lat+elev+precip,data=test,
main="Simple Scatterplot Matrix")
dev.off()
}else{
pairs(~lon+lat+elev+precip,data=test,
main="Simple Scatterplot Matrix")
}

###################################################################
####### Fit a 'best' linear regression mode ####
###################################################################
# fit LM
N = length(Pm)
combs = leaps(X,Pm, nbest=25) # GEt upto 25 combinations for each
# number of predictors
combos = combs$which
ncombos = length(combos[,1])
xpress=1:ncombos
xmse = 1:ncombos
for(i in 1:ncombos) {
xx = X[,combos[i,]]
xx=as.data.frame(xx)
zz= lm(Pm ~ ., data = xx)
xpress[i]=PRESS(zz)
xmse[i] = sum((zz$res)^2) / (N - length(zz$coef))
}
# Test using PRESS objective function
bestmod= lm(Pm ~ ., data = X[,combos[which.min(xpress),]])
bestmod$Call$Press=PRESS(bestmod)
bestmod$combo=combos[which.min(xpress),]
summary(bestmod)
##
## Call:
## lm(formula = Pm ~ ., data = X[, combos[which.min(xpress), ]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.826 -9.123 -1.793 7.065 76.434
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.790e+04 3.506e+03 5.106 4.90e-07 ***
## Long 3.286e+02 6.628e+01 4.959 1.02e-06 ***
## Elev 3.509e-02 1.179e-02 2.977 0.00307 **
## Lat2 -5.031e-02 8.080e-03 -6.227 1.11e-09 ***
## Long2 1.513e+00 3.121e-01 4.847 1.74e-06 ***
## Elev2 -3.532e-06 2.350e-06 -1.503 0.13361
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.78 on 440 degrees of freedom
## Multiple R-squared: 0.4459, Adjusted R-squared: 0.4396
## F-statistic: 70.81 on 5 and 440 DF, p-value: < 2.2e-16
bestmod$Call$Press
## [1] 98708.26
#Pmhat=predict(bestmod)
Pmhat=bestmod$fitted.values
if (save==TRUE){
pdf("Precipitation_Scatterplot.pdf", width = 8, height = 6) # save figure
# Observed versus estimates
par(mfrow=c(1,1))
lim=range(Pm,Pmhat)
plot(Pm,Pmhat,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
abline(a=0,b=1)
dev.off()
}else {
# Observed versus estimates
par(mfrow=c(1,1))
lim=range(Pm,Pmhat)
plot(Pm,Pmhat,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
abline(a=0,b=1)
}

###################################################################
######## III. Anova and model diagnostics of your best model
###################################################################
### ANOVA:
anova(bestmod)
## Analysis of Variance Table
##
## Response: Pm
## Df Sum Sq Mean Sq F value Pr(>F)
## Long 1 33459 33459 153.1795 < 2.2e-16 ***
## Elev 1 26660 26660 122.0531 < 2.2e-16 ***
## Lat2 1 11489 11489 52.5972 1.859e-12 ***
## Long2 1 5238 5238 23.9810 1.369e-06 ***
## Elev2 1 493 493 2.2584 0.1336
## Residuals 440 96109 218
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### MODEL DIAGNOSTICS:
#yy = precip_dt[,4] # true value of y based on data
Pmest = Pmhat # model's predicted values of Pm
nvar = 2 # number of variables
mod_diagnostics(Pm, Pmest, nvar,save)

###################################################################
#### IV. Compute cross-validated and fitted estmiates at each ####
#### observation. Plot them against the observed values. ####
###################################################################
loocv_func(bestmod,X,Pm,save)

###############################################################################################
#### V. Drop 10% of obs, fit best model, predict dropped points. Compute RMSE and R2. Boxplot.
## validation nsample times (dropping a new 10% each time), and outputs boxplots of RMSE and R2.
###############################################################################################
Drop_10_pred(bestmod,X,Pm,save)

###################################################################
## VI. Spatially map the model estimates and the standard error ####
###################################################################
# read the topography map
x1 = read.table("http://cires1.colorado.edu/~aslater/CVEN_6833/colo_dem.dat")
names(x1)=c("Lat","Long","Elev")
#Try up to order 2
x1$Lat2=x1[,1]^2
x1$Long2=x1[,2]^2
x1$Elev2=x1[,3]^2
ypred =predict(bestmod, newdata=x1[,bestmod$combo], se.fit=T)
#Quilt_plotting(x1[,2],x1[,1],ypred)
plot_surface(x1[,2],x1[,1],ypred,X[,2],X[,1],Pm,save)
