HW2_Prob1

1. Bayesian Hierarchical Spatial Model - Problem 9 from HW1
In the Bayesian models, plot the posterior histogram/PDF of the parameters; spatial maps of posterior mean and standard error. See Verdin et al. 2015 for tips

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
graphics.off()

#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 02"
setwd(mainDir)
suppressPackageStartupMessages(source("hw2_library.R"))
source("hw2_library.R")

Read data

# Load winter precipitation data
data = read.table(paste(mainDir, "/data/Winter_temporal/Max_Winter_Seas_Prec.txt", sep = ""), header = T)
# Omit extreme values
non_values <- which(data[,-1] > 1000, arr.ind = T)
data[non_values] = NaN
data = na.omit(data)
prec_obs = colMeans(data[,-1])  # Mean of location columns
# Load location data
loc = read.table(paste(mainDir, "/data/Precipitaton_data.txt", sep = ""), header = T)

## Create design matrix
X = loc[,1:3]
names(X) = c("Long","Lat","Elev")
lon = X[,1]
lat = X[,2]
elev = X[,3]
precip = prec_obs   # Convert precipitation to cm

xs = cbind(lon, lat)

Fit GLM

glmfit = glm(prec_obs ~ ., data = X, family = Gamma(link = "log"))
print(summary(glmfit))

## 
## Call:
## glm(formula = prec_obs ~ ., family = Gamma(link = "log"), data = X)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.79244  -0.24519  -0.05654   0.15930   0.83631  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.346e+00  1.343e+00  -2.492   0.0151 *  
## Long        -5.956e-02  1.218e-02  -4.889 6.36e-06 ***
## Lat         -2.404e-03  1.559e-02  -0.154   0.8779    
## Elev         9.517e-06  8.691e-05   0.110   0.9131    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.122277)
## 
##     Null deviance: 11.0656  on 72  degrees of freedom
## Residual deviance:  7.7545  on 69  degrees of freedom
## AIC: 483.61
## 
## Number of Fisher Scoring iterations: 6

Fit variogram and perform MCMC

geod = as.geodata(cbind(glmfit$residuals, lon, lat), data.col = 1)
vg = variog(geod,breaks = seq(50,10000,50))

## variog: computing omnidirectional variogram

sigma.sq = median(vg$v)
sigma.sq.lo = 0.25*sigma.sq
sigma.sq.hi = 1.75*sigma.sq
phi.val = median(vg$u)
phi.lo = 0.25*phi.val
phi.hi = 1.75*phi.val
# Input number of samples
n.samples = 5000
# Coordinates of observations
coords = cbind(lon, lat)
# Prior Distributions Defined
priors = list("beta.flat", "phi.unif" = c(phi.lo,phi.hi),"sigma.sq.ig"=c(sigma.sq.lo,sigma.sq.hi),"tau.sq.IG"=c(1,0.01))
# Starting values for MCMC resampling
starting = list("phi" = phi.val, "sigma.sq" = sigma.sq, "tau.sq" = 1)
# Adjustment factor for MCMC sampling routine
tuning = list("phi" = 1, "sigma.sq" = 1, "tau.sq" = 1)

# Perform Monte Carlo Markov Chain Analysis
bat.fm = spLM(prec_obs ~ ., data = X, coords = as.matrix(X[,1:2]), priors = priors, tuning = tuning, starting = starting, cov.model = "exponential", n.samples = n.samples, verbose = F, n.report = 50)
# Add some small amount to duplicated coordinates
dup = duplicated(bat.fm$coords); bat.fm$coords[dup] <- bat.fm$coords[dup] + 1e-3
# Burn-In Samples
bat.fm = spRecover(bat.fm, start = ((1/3)*n.samples)+1, thin = 1, verbose = T)

## -------------------------------------------------
##      Recovering beta and w
## -------------------------------------------------
## Sampled: 99 of 3333, 2.97%
## Sampled: 199 of 3333, 5.97%
## Sampled: 299 of 3333, 8.97%
## Sampled: 399 of 3333, 11.97%
## Sampled: 499 of 3333, 14.97%
## Sampled: 599 of 3333, 17.97%
## Sampled: 699 of 3333, 20.97%
## Sampled: 799 of 3333, 23.97%
## Sampled: 899 of 3333, 26.97%
## Sampled: 999 of 3333, 29.97%
## Sampled: 1099 of 3333, 32.97%
## Sampled: 1199 of 3333, 35.97%
## Sampled: 1299 of 3333, 38.97%
## Sampled: 1399 of 3333, 41.97%
## Sampled: 1499 of 3333, 44.97%
## Sampled: 1599 of 3333, 47.97%
## Sampled: 1699 of 3333, 50.98%
## Sampled: 1799 of 3333, 53.98%
## Sampled: 1899 of 3333, 56.98%
## Sampled: 1999 of 3333, 59.98%
## Sampled: 2099 of 3333, 62.98%
## Sampled: 2199 of 3333, 65.98%
## Sampled: 2299 of 3333, 68.98%
## Sampled: 2399 of 3333, 71.98%
## Sampled: 2499 of 3333, 74.98%
## Sampled: 2599 of 3333, 77.98%
## Sampled: 2699 of 3333, 80.98%
## Sampled: 2799 of 3333, 83.98%
## Sampled: 2899 of 3333, 86.98%
## Sampled: 2999 of 3333, 89.98%
## Sampled: 3099 of 3333, 92.98%
## Sampled: 3199 of 3333, 95.98%
## Sampled: 3299 of 3333, 98.98%

Plot Posterior PDFs

beta = bat.fm$p.beta.recover.samples
theta = bat.fm$p.theta.recover.samples
plot(beta)

See attached PDF for plot of beta

plot(theta)

print(summary(window(bat.fm$p.beta.recover.samples)))

## 
## Iterations = 1:3333
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 3333 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                   Mean       SD  Naive SE Time-series SE
## (Intercept) -1.174e+02 31.64771 5.482e-01      5.146e-01
## Long        -1.293e+00  0.28601 4.954e-03      4.496e-03
## Lat         -6.019e-02  0.35728 6.189e-03      6.383e-03
## Elev         4.471e-04  0.00202 3.498e-05      3.498e-05
## 
## 2. Quantiles for each variable:
## 
##                   2.5%        25%        50%        75%      97.5%
## (Intercept) -179.78916 -1.384e+02 -1.173e+02 -95.905302 -56.544084
## Long          -1.85249 -1.485e+00 -1.292e+00  -1.103976  -0.743526
## Lat           -0.76261 -3.011e-01 -6.689e-02   0.187306   0.632244
## Elev          -0.00354 -8.970e-04  4.184e-04   0.001794   0.004514

Plot precipitation posterior PDF

elev_grid=read.delim(paste(mainDir, "/data/Elevation_grid_1deg.txt", sep = ""), header = TRUE, sep = "\t", dec = ".")
names(elev_grid) = c("Long","Lat","Elev")
lon = elev_grid[,1]
lat = elev_grid[,2]
elev_hr = elev_grid[,3]
xps = cbind(lon,lat) 

y = matrix(nrow = length(elev_grid[,1]), ncol = nrow(beta))
yse = matrix(nrow = length(elev_grid[,1]), ncol = nrow(beta))

for (i in 1:nrow(beta)) {
  zz = Krig(xs, glmfit$residuals, sigma = theta[i,1], theta = theta[i,3], m = 1, tau2 = theta[i,2])
  y2 = predict(zz, x = xps, drop.Z = T)
  yse[,i] = predictSE(zz, x = xps, drop.Z = T)
  y1 = beta[i,1] + beta[i,2]*lon + beta[i,3]*lat + beta[i,4]*elev_hr
  y[,i] = y1 + y2
}
mean.y = apply(y, 1, FUN = mean)    # Take the mean and convert from % to fraction
mean.yse = apply(yse, 1, FUN = mean)  # Take the mean and convert from % to fraction

g1 = ggplot() + geom_histogram(aes(as.vector(y)), fill = "gray50", col = "black") +
  labs(title = "Histogram of Precipitation Posterior", x = "Predictions") +
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(g1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Spatially map precipitation estimate and standard error on high-resolution grid

pred = cbind(mean.y,mean.yse)
ypred = data.frame(pred)
names(ypred) = c("fit","se.fit")
Quilt_plotting(xps[,1],xps[,2],ypred,X[,1],X[,2],prec_obs)

Comments

The posterior density of tau shows that its value is effectively 0
The predicted precipitation on the high-resolution grid increases primarily with longitude, which can be seen in the fitted GLM
The predicted standard error is generally low but is larger in areas where there is less observational data