2. Logistic Regression
Compute the 75th percentile of the average winter 3-day maximum precipitation across all the stations and using this as the threshold, categrize the annual precipitation at each station to a binary variable 0 if annual precipitation is less than the threshold, 1 otherwise.
(i) Fit a ‘best’ GLM (i.e. logistic regression) with the appropriate link function using one of the objective functions. Test the model goodness using ANOVA
(ii) Estimate the function on the grid and plot the surface. Also plot the standard error.
(iii) Repeat (i) and (ii) with Local GLM
#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
graphics.off()
#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 02"
setwd(mainDir)
suppressPackageStartupMessages(source("hw2_library.R"))
source("hw2_library.R")
# Load winter precipitation data
data = read.table(paste(mainDir, "/data/Winter_temporal/Max_Winter_Seas_Prec.txt", sep = ""), header = T)
# Omit extreme values
non_values <- which(data[,-1] > 1000, arr.ind = T)
data[non_values] = NaN
data = na.omit(data)
prec_obs = colMeans(data[,-1]) # Mean of location columns
# Load location data
loc = read.table(paste(mainDir, "/data/Precipitaton_data.txt", sep = ""), header = T)
## Create design matrix
X = loc[,1:3]
names(X) = c("Long","Lat","Elev")
# Add interaction terms
# X$LongLat = X[,1]*X[,2]
# X$LongElev = X[,1]*X[,3]
# X$LatElev = X[,2]*X[,3]
## Compute the 75th percentile and categorize precipitation to a binary variable
q = quantile(prec_obs, c(0.75))
prec_bin = rep(1, length(prec_obs))
for (i in 1:length(prec_obs)) {
if (prec_obs[i] < q) {
prec_bin[i] = 0
}
}
glmfit = GLM_fit(prec_bin, X, family = "binomial")
## [1] "Results of AIC for bestfit GLM"
## glm_val
## logit 60.9066
print(summary(glmfit))
##
## Call:
## glm(formula = prec_obs ~ ., family = binomial(link = links[which.min(glm_val)]),
## data = xx)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8316 -0.4823 -0.2814 0.8056 2.2909
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -54.4745 13.2884 -4.099 4.14e-05 ***
## Long -0.4906 0.1208 -4.061 4.90e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 83.708 on 72 degrees of freedom
## Residual deviance: 56.907 on 71 degrees of freedom
## AIC: 60.907
##
## Number of Fisher Scoring iterations: 5
prec_pred = glmfit$fitted.values
par(mfrow=c(1,1))
lim=range(prec_bin,prec_pred)
plot(prec_bin,prec_pred,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
anova(glmfit)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: prec_obs
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 72 83.708
## Long 1 26.802 71 56.907
elev_grid=read.delim(paste(mainDir, "/data/Elevation_grid_1deg.txt", sep = ""), header = TRUE, sep = "\t", dec = ".")
names(elev_grid) = c("Long","Lat","Elev")
lon = elev_grid[,1]
lat = elev_grid[,2]
elev_hr = elev_grid[,3]
xps = cbind(lon,lat)
ypred = predict(glmfit, newdata = elev_grid, se.fit = T, type = "response")
Quilt_plotting(xps[,1], xps[,2], ypred, X[,1], X[,2], prec_bin, type = "binary")
links = c("logit")
N = length(prec_bin)
combs = leaps(X, prec_bin, nbest=25) # Get up to 25 combinations for each
# number of predictors
combos = combs$which
ncombos = length(combos[,1])
gcvs_m=1:ncombos
glm_gcv=vector(length = length(links))
bestcombo=vector(length = length(links))
for(j in 1:length(links)) { sprintf("j",j)
for(i in 1:ncombos) {
xx = X[,combos[i,]]
xx=as.data.frame(xx)
zz=locpoly_fit(prec_bin, xx,family="binomial", link=links[j], glm=TRUE,plot=TRUE)
gcvs_m[i] = zz$call$gcv
}
# Test using GCV objective function
glm_gcv[j]=min(gcvs_m)
bestcombo[j]=which.min(gcvs_m)
}
X_best = data.frame(X[,combos[bestcombo[which.min(glm_gcv)],]])
names(X_best) = names(X[combos[bestcombo[which.min(glm_gcv)],]])
locfit = locpoly_fit(prec_bin, X_best,family="binomial", link=links[which.min(glm_gcv)], glm=TRUE,plot=TRUE)
locfit$combo=combos[bestcombo[which.min(glm_gcv)],]
Xnew=X[,locfit$combo]
summary(locfit)
## Estimation type: Local Likelihood - Binomial
##
## Call:
## locfit(formula = prec_obs ~ ., data = X, alpha = 0.6, maxk = 10000,
## deg = 2, kern = "bisq", scale = T, family = "binomial", link = "logit",
## gcv = 0.683653679585743)
##
## Number of data points: 73
## Independent variables: Long
## Evaluation structure: Rectangular Tree
## Number of evaluation points: 7
## Degree of fit: 2
## Fitted Degrees of Freedom: 5.206
prec_pred = predict(locfit, Xnew, se.fit = T)
par(mfrow=c(1,1))
lim=range(prec_bin,prec_pred$fit)
plot(prec_bin,prec_pred$fit,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)
loc_Ftest(prec_bin,prec_pred, Xnew, locfit)
## [1] "F-test:"
## [1] "Reject the Null because F(local poly) = 4.90 > 2.52 = F(linear model)."
ypred = predict(locfit, newdata = elev_grid, se.fit = T, type = "response")
Quilt_plotting(xps[,1], xps[,2], ypred, X[,1], X[,2], prec_bin, type = "binary")
Comments