HW2_Prob3

3. Bayesian Hierarchical Spatial Model - Binomial
Develop a Bayesian Hierachical Spatial Model for problem 2. For this the first level is a Binomial distribution.
You can use the steps/code from Andrew Verdin at http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/bayes/spatial-bayes/\ And/or write your own code in RJAGS

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
graphics.off()
.pardefault <- par()
par(ask = F)

#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 02"
setwd(mainDir)
suppressPackageStartupMessages(source("hw2_library.R"))
source("hw2_library.R")

Read data

# Load winter precipitation data
data = read.table(paste(mainDir, "/data/Winter_temporal/Max_Winter_Seas_Prec.txt", sep = ""), header = T)
# Omit extreme values
non_values <- which(data[,-1] > 1000, arr.ind = T)
data[non_values] = NaN
data = na.omit(data)
prec_obs = colMeans(data[,-1])  # Mean of location columns
# Load location data
loc = read.table(paste(mainDir, "/data/Precipitaton_data.txt", sep = ""), header = T)

## Create design matrix
X = loc[,1:3]
names(X) = c("Long","Lat","Elev")

## Compute the 75th percentile and categorize precipitation to a binary variable
q = quantile(prec_obs, c(0.75))
prec_bin = rep(1, length(prec_obs))

for (i in 1:length(prec_obs)) {
  if (prec_obs[i] < q) {
    prec_bin[i] = 0
  }
}

Fit GLM

glmfit = GLM_fit(prec_bin, X, family = "binomial")

## [1] "Results of AIC for bestfit GLM"
##       glm_val
## logit 60.9066

print(summary(glmfit))

## 
## Call:
## glm(formula = prec_obs ~ ., family = binomial(link = links[which.min(glm_val)]), 
##     data = xx)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8316  -0.4823  -0.2814   0.8056   2.2909  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -54.4745    13.2884  -4.099 4.14e-05 ***
## Long         -0.4906     0.1208  -4.061 4.90e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.708  on 72  degrees of freedom
## Residual deviance: 56.907  on 71  degrees of freedom
## AIC: 60.907
## 
## Number of Fisher Scoring iterations: 5

Fit variogram and perform MCMC

geod = as.geodata(cbind(glmfit$residuals, X$Long, X$Lat), data.col = 1)
vg = variog(geod,breaks = seq(50,10000,50))

## variog: computing omnidirectional variogram

sigma.sq = median(vg$v)
sigma.sq.lo = 0.25*sigma.sq
sigma.sq.hi = 1.75*sigma.sq
phi.val = median(vg$u)
phi.lo = 0.25*phi.val
phi.hi = 1.75*phi.val
# Input number of samples
n.samples = 1000
# Coordinates of observations
coords = cbind(X$Long, X$Lat)
# Prior Distributions Defined
priors = list("beta.flat", "phi.unif" = c(phi.lo,phi.hi),"sigma.sq.ig"=c(sigma.sq.lo,sigma.sq.hi),"tau.sq.IG"=c(0.01,0.001))
# Starting values for MCMC resampling
starting = list("phi" = phi.val, "sigma.sq" = sigma.sq, "tau.sq" = 1)
# Adjustment factor for MCMC sampling routine
tuning = list("phi" = 1, "sigma.sq" = 1, "tau.sq" = 1)

# Perform Monte Carlo Markov Chain Analysis
bat.fm = spLM(prec_bin ~ ., data = X, coords = as.matrix(X[,1:2]), priors = priors, tuning = tuning, starting = starting, cov.model = "exponential", n.samples = n.samples, verbose = F, n.report = 50)
# Add some small amount to duplicated coordinates
dup = duplicated(bat.fm$coords); bat.fm$coords[dup] <- bat.fm$coords[dup] + 1e-3
# Burn-In Samples
bat.fm = spRecover(bat.fm, start = ((1/3)*n.samples)+1, thin = 1)

## -------------------------------------------------
##      Recovering beta and w
## -------------------------------------------------
## Sampled: 99 of 666, 14.86%
## Sampled: 199 of 666, 29.88%
## Sampled: 299 of 666, 44.89%
## Sampled: 399 of 666, 59.91%
## Sampled: 499 of 666, 74.92%
## Sampled: 599 of 666, 89.94%

Plot posterior PDFs

beta = bat.fm$p.beta.recover.samples
theta = bat.fm$p.theta.recover.samples
plot(beta)

See attached PDF for plot of beta

plot(theta)

print(summary(window(bat.fm$p.beta.recover.samples)))

## 
## Iterations = 1:666
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 666 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                   Mean        SD  Naive SE Time-series SE
## (Intercept) -7.491e+00 2.8292409 1.096e-01      1.096e-01
## Long        -7.010e-02 0.0254592 9.865e-04      9.865e-04
## Lat          6.709e-03 0.0332811 1.290e-03      1.290e-03
## Elev        -2.501e-05 0.0001761 6.824e-06      6.824e-06
## 
## 2. Quantiles for each variable:
## 
##                   2.5%        25%        50%        75%      97.5%
## (Intercept) -1.316e+01 -9.3676628 -7.615e+00 -5.614e+00 -1.8202449
## Long        -1.187e-01 -0.0871249 -7.105e-02 -5.287e-02 -0.0183743
## Lat         -5.801e-02 -0.0155504  6.424e-03  2.722e-02  0.0748346
## Elev        -3.729e-04 -0.0001377 -2.997e-05  7.975e-05  0.0003364

Plot precipitation posterior PDF

elev_grid=read.delim(paste(mainDir, "/data/Elevation_grid_1deg.txt", sep = ""), header = TRUE, sep = "\t", dec = ".")
names(elev_grid) = c("Long","Lat","Elev")
lon = elev_grid[,1]
lat = elev_grid[,2]
elev_hr = elev_grid[,3]
xps = cbind(lon,lat) 

y = matrix(nrow = length(elev_grid[,1]), ncol = nrow(beta))
yse = matrix(nrow = length(elev_grid[,1]), ncol = nrow(beta))

for (i in 1:nrow(beta)) {
  zz = Krig(X[,1:2], glmfit$residuals, sigma = theta[i,1], theta = theta[i,3], m = 1, tau2 = theta[i,2])
  y2 = predict(zz, x = xps, drop.Z = T)
  yse[,i] = predictSE(zz, x = xps, drop.Z = T)
  y1 = beta[i,1] + beta[i,2]*lon + beta[i,3]*lat + beta[i,4]*elev_hr
  y[,i] = y1 + y2
}
y=logit2prob(y) #to obtain the real value
yse=logit2prob(yse) #to obtain the real value
mean.y = apply(y, 1, FUN = mean)    # Take the mean and convert from % to fraction
mean.yse = apply(yse, 1, FUN = mean)  # Take the mean and convert from % to fraction

g1 = ggplot() + geom_histogram(aes(as.vector(y)), fill = "gray50", col = "black") +
  labs(title = "Histogram of Precipitation Posterior", x = "Predictions") +
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(g1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Spatially map precipitation estimate and standard error on high-resolution grid

pred = cbind(mean.y,mean.yse)
ypred = data.frame(pred)
names(ypred) = c("fit","se.fit")
par(.pardefault)
Quilt_plotting(xps[,1],xps[,2],ypred,X[,1],X[,2],prec_bin, type = "binary")

Comments

The predicted precipitation shows good agreement in areas where the observed binary precipitation is 1 (average precipitation greater than 75% percentile), which corresponds to areas with lower standard error
The predicted standard error tends to be greater in regions where observed precipitation data is sparse
The Bayesian Hierarchical Spatial Model has greater standard error than that of the logistic regression model from Problem 2