#### Anna Starodubtseva
#### CVEN 6833, HW 01
#### Problem 4 - Local polynomial
#### CLEAR MEMORY
rm(list=ls())
#graphics.off()
#### Prevent Warnings
options(warn=-1)
# Other
save = FALSE
#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 01/R"
setwd(mainDir)
suppressPackageStartupMessages(source("hw1_library.R"))
source("hw1_library.R")
## Read data
# Load winter precipitation data
data = read.table(paste(mainDir, "/data/Winter_temporal/Max_Winter_Seas_Prec.txt", sep = ""), header = T)
# Omit extreme values
non_values <- which(data[,-1] > 1000, arr.ind = T)
data[non_values] = NaN
data = na.omit(data)
prec_obs = colMeans(data[,-1]) # Mean of location columns
# Load location data
loc = read.table(paste(mainDir, "/data/Precipitaton_data.txt", sep = ""), header = T)
## Create design matrix
X = loc[,1:3]
names(X) = c("Long","Lat","Elev")
lon = X[,1]
lat = X[,2]
elev = X[,3]
## Scatterplot to show variable relationships
precip = prec_obs;
if(save==TRUE){
pdf("Simple Scatterplot Matrix.pdf") # save figure
dev.new()
pairs(~lon+lat+elev+precip,data=X,
main="Simple Scatterplot Matrix")
dev.off()
}else{
pairs(~lon+lat+elev+precip,data=X,
main="Simple Scatterplot Matrix")
}

#########################################################
####### (i) Local Polynomial Method #######
#########################################################
N = length(prec_obs)
combs = leaps(X,prec_obs,nbest=25) # Get up to 25 combinations for each
combos = combs$which # number of predictors
ncombos = length(combos[,1])
obj_func = 1:ncombos
for(i in 1:ncombos) {
xx = X[,combos[i,]]
xx=as.data.frame(xx)
bestmod=locpoly_fit(prec_obs, xx, glm=FALSE,plot=FALSE)
obj_func[i] = bestmod$call$gcv
}
bestcombo = combos[which.min(obj_func),]
X_new = data.frame(X[,bestcombo])
names(X_new) = names(X[bestcombo])
bestmod=locpoly_fit(prec_obs,X_new, glm=FALSE,plot=FALSE)
summary(bestmod)
## Estimation type: Local Regression
##
## Call:
## locfit(formula = prec_obs ~ ., data = X, alpha = 0.254794520547945,
## maxk = 10000, deg = 1, kern = "bisq", ev = dat(), gcv = 55.6144049900545)
##
## Number of data points: 73
## Independent variables: Long
## Evaluation structure: Data
## Number of evaluation points: 73
## Degree of fit: 1
## Fitted Degrees of Freedom: 7.597
x=as.matrix(X_new)
L1 = matrix(0,ncol=N,nrow=N)
for(i in 1:N){L1[i,]=locfit(prec_obs ~ x,alpha=bestmod$call$alpha,deg=bestmod$call$deg,ev=x[i,],
kern="bisq", geth=1)}
Pest1=L1%*%prec_obs
#compute the GCV for this alpha..
gcvalpha=(N*sum((prec_obs-Pest1)^2)) / ((N-sum(diag(L1)))^2)
print(c('alpha, gcv, gcv from gcvplot', bestmod$call$alpha, gcvalpha, bestmod$call$gcv))
## [1] "alpha, gcv, gcv from gcvplot" "0.254794520547945"
## [3] "55.6144049900545" "55.6144049900545"
bestmod=locpoly_fit(prec_obs,X_new, glm=FALSE,plot=TRUE)
bestmod$combo=combos[which.min(obj_func),]
prec_hat=predict(bestmod,X_new,se.fit=T)
# compare the predicted values by model and by L matrix
if(save==TRUE){
pdf("Estimated_Precipitation_Locfit_Vs_LMatrix.pdf", width = 8, height = 6) # save figure
par(mfrow=c(1,1))
lim=range(Pest1,prec_hat$fit)
plot(Pest1,prec_hat$fit,xlab="Estimated Precipitation",ylab="Estimated Precipitation with L matrix",main="Estimated Precipitation vs Estimated Precipitation with L", xlim=lim, ylim=lim)
abline(a=0,b=1)
dev.off()
}else{
par(mfrow=c(1,1))
lim=range(Pest1,prec_hat$fit)
plot(Pest1,prec_hat$fit,xlab="Estimated Precipitation",ylab="Estimated Precipitation with L matrix",main="Estimated Precipitation vs Estimated Precipitation with L", xlim=lim, ylim=lim)
abline(a=0,b=1)
}

#########################################################
###### (ii) Show the scatterplot of observed and modeled precipitation along with the 1:1 line
#########################################################
if (save==TRUE){
pdf("Precipitation_Scatterplot.pdf", width = 8, height = 6) # save figure
# Observed versus estimates
par(mfrow=c(1,1))
lim=range(prec_obs,prec_hat$fit)
dev.new()
plot(prec_obs,prec_hat$fit,xlab="Actual Precipitation (mm)",ylab="Estimated Precipitation (mm)",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
abline(a=0,b=1)
dev.off()
}else {
# Observed versus estimates
par(mfrow=c(1,1))
lim=range(prec_obs,prec_hat$fit)
plot(prec_obs,prec_hat$fit,xlab="Actual Precipitation (mm)",ylab="Estimated Precipitation (mm)",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
abline(a=0,b=1)
}

#########################################################
####### (iii) Perform ANOVA and model diagnostics #######
#########################################################
loc_Ftest(prec_obs,prec_hat,X_new,bestmod)
## [1] "F-test:"
## [1] "Reject the Null because F(local poly) = 2.47 > 2.04 = F(linear model)."
mod_diagnostics(prec_obs,prec_hat$fit,2,save)

#########################################################
####### (iv) Compute drop-one cross-validated estimates from the best model and scatterplot them against the observed values
#########################################################
#do a x-validated prediction - i.e., drop a point and obtain its estimate using the rest.
zcv=locfit(prec_obs ~., data=X_new, alpha=bestmod$call$alpha, deg=bestmod$call$deg, kern="bisq", ev=dat(cv=TRUE), scale=TRUE)
#cross validated estimates..
Pcv = predict(zcv)
# Plot LOOCV against observations
if (save==TRUE){
pdf("LOOCV.pdf", width = 8, height = 6) # save figure
lim = range(Pm, Pcv)
plot(prec_obs, Pcv, xlim = lim, ylim=lim, xlab="Observed", ylab="X-validated Estimate", main= "LOOCV")
abline(a=0,b=1,col = "red")
dev.off()
}else{
par(mfrow=c(1,1))
lim = range(prec_obs, Pcv)
plot(prec_obs, Pcv, xlim = lim, ylim=lim, xlab="Observed", ylab="X-validated Estimate", main= "LOOCV")
abline(a=0,b=1,col = "red")
}

#########################################################
####### (v) Drop 10% of observations, fit the model to the rest of the data and predict the dropped points. Compute RMSE and correlation and show them as boxplots #######
#########################################################
Drop_10_pred(bestmod,X,prec_obs,save)

#########################################################
####### (vi) Spatially map the model estimates and standard error from the best model on the high-resolution grid
#########################################################
elev_grid=read.delim(paste(mainDir, "/data/Elevation_grid_1deg.txt", sep = ""), header = TRUE, sep = "\t", dec = ".")
names(elev_grid) = c("Long","Lat","Elev")
# Predict precipitation based on best model
prec_pred <- predict(bestmod, elev_grid, se.fit = T)
# Plot surface
plot_surface(elev_grid[,1],elev_grid[,2],prec_pred,X[,1],X[,2],prec_obs,save)


- The local polynomial method using GCV and with no interaction terms captures some longitudinal variation
- Residuals increase with increasing precipitation
- Greatest standard error concentrated at west-most longitude