Problem 12 of HW2. Develop a Bayesian Hierarchical Spatial Model for problem 7 of HW1. For this, the first level is a Binomial distribution (i.e., logistic regression). You can use the steps/code from Andrew Verdin at http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/bayes/spatial-bayes/ and/or write your own code in RJAGS.

Data and commands for the problems below will be at http://civil.colorado.edu/~balajir/CVEN6833/HWs/HW-2-2018

Solution

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)

#data required: 
day=11 # day of January 1997 

#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir="C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW2"
setwd(mainDir)
suppressPackageStartupMessages(source("Lib_HW2.R"))

Read in data

#  Read the topography data map
x0 = read.table("./data/colo_dem.dat")
names(x0)=c("lat","long","elev")
x1 = x0[,1:3]
## Read data as table (data frame)
test = read.table(paste("C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW2/data/colo_pcp_daily_1997_01_",day,".dat",sep=""))
#assign names for columns/variables:
names(test)=c("lat","long","elev","Pm")
test1=test
test[test == -999.999] <- NA
test = test[complete.cases(test), ] 
#delete duplicate locations
dup_loc <- dup.coords(test[,1:2]) 
test=test[-as.numeric(dup_loc[1,]), ] 

Fit GLM

# convert the precipitation to binary data 
Pm = test[,4] > 0   # Dependent variable: precipitation occurrence (TRUE/FALSE)
X = test[,1:3]      # Independent variables: lat, long, elev
glmfit=GLM_fit(as.data.frame(cbind(X,Pm)),"binomial")
## [1] "Results of PRESS for bestfit GLM"
##        glm_press
## logit   342.9802
## probit  343.2803
sprintf("Choosing the GLM which minimizes PRESS: %s family and %s link function.", glmfit$Call$FAMILY, glmfit$Call$LINK)
## [1] "Choosing the GLM which minimizes PRESS: binomial family and logit link function."
summary(glmfit)
## 
## Call:
## glm(formula = Y ~ ., family = binomial(link = links[zc_bic]), 
##     data = xx)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1776  -1.1158   0.6256   0.8550   1.4837  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.3313552 10.0578990  -0.132 0.894692    
## lat          0.5192660  0.1194255   4.348 1.37e-05 ***
## long         0.1884544  0.0862533   2.185 0.028897 *  
## elev         0.0007859  0.0002328   3.376 0.000735 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 363.80  on 287  degrees of freedom
## Residual deviance: 333.11  on 284  degrees of freedom
## AIC: 341.11
## 
## Number of Fisher Scoring iterations: 4
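
GLM_fit() above is a helper defined in Lib_HW2.R and is not listed here. As a rough, hypothetical sketch of the PRESS (predicted residual sum of squares) comparison it performs between the logit and probit links, a leave-one-out version could be written as follows; the actual Lib_HW2.R implementation may differ (for example, it may work with deviance rather than response residuals).

# Illustrative leave-one-out PRESS for a binomial GLM (hypothetical; not the Lib_HW2.R code)
glm_press <- function(dat, link) {
  press <- 0
  for (i in seq_len(nrow(dat))) {
    fit_i  <- glm(Pm ~ ., data = dat[-i, ], family = binomial(link = link))
    pred_i <- predict(fit_i, newdata = dat[i, , drop = FALSE], type = "response")
    press  <- press + (as.numeric(dat$Pm[i]) - pred_i)^2
  }
  press
}
# Compare the two candidate links on the data used above (slow: n leave-one-out fits per link)
# sapply(c("logit", "probit"), function(l) glm_press(as.data.frame(cbind(X, Pm)), l))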

Fit variogram and perform MCMC

geod = as.geodata(cbind(glmfit$residuals,X$long,X$lat),data.col=1)
## as.geodata: 2 replicated data locations found. 
##  Consider using jitterDupCoords() for jittering replicated locations. 
## WARNING: there are data at coincident or very closed locations, some of the geoR's functions may not work.
##  Use function dup.coords() to locate duplicated coordinates.
##  Consider using jitterDupCoords() for jittering replicated locations
vg = variog(geod,breaks=seq(50,10000,50))
## variog: computing omnidirectional variogram
## variog: co-locatted data found, adding one bin at the origin
# bounds for a uniform prior on the spatial decay/range parameter phi,
# centered on the median variogram lag distance
phi.val = median(vg$u)
phi.lo = 0.25*phi.val
phi.hi = 1.75*phi.val
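
As an optional cross-check (not part of the original workflow), geoR's variofit can fit an exponential variogram model to the empirical variogram; its estimated range can then be compared against the uniform prior bounds chosen above.

# Optional: fit an exponential variogram model and inspect the estimated (partial sill, range)
vfit <- variofit(vg, ini.cov.pars = c(var(glmfit$residuals), phi.val),
                 cov.model = "exponential")
vfit$cov.pars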

# Input number of samples
n.samples <- 1500
# coordinates of observations
coords <- cbind(X$long,X$lat) 
# prior distributions (only a limited set of choices is available; see help(spLM))
priors <- list("beta.flat", "phi.unif"=c(phi.lo,phi.hi),
               "sigma.sq.ig"=c(100, 100), "tau.sq.ig"=c(0.01, 0.001))
# starting values for MCMC resampling
starting <- list("phi"=phi.val, "sigma.sq"=500,"tau.sq"=0.01)
# adjustment factor for MCMC sampling routine
tuning <- list("phi"=1, "sigma.sq"=1, "tau.sq"=1) # variances of the Metropolis normal proposals (simple defaults of 1)

# Perform Monte Carlo Markov Chain Analysis
bat.fm <- spLM(Pm ~ ., data=X, coords=as.matrix(X[,1:2]), priors=priors, tuning=tuning,
               starting=starting, cov.model = "exponential", n.samples = n.samples,
               verbose = FALSE, n.report = 50)
# add a small offset to duplicated coordinates so the covariance matrix is not singular in spRecover
dup <- duplicated(bat.fm$coords); bat.fm$coords[dup] <- bat.fm$coords[dup] + 1e-3
# discard the first third of the samples as burn-in and recover beta and the spatial effects w
bat.fm <- spRecover(bat.fm, start=((1/3)*n.samples)+1, thin=1)
## -------------------------------------------------
##      Recovering beta and w
## -------------------------------------------------
## Sampled: 99 of 1000, 9.90%
## Sampled: 199 of 1000, 19.90%
## Sampled: 299 of 1000, 29.90%
## Sampled: 399 of 1000, 39.90%
## Sampled: 499 of 1000, 49.90%
## Sampled: 599 of 1000, 59.90%
## Sampled: 699 of 1000, 69.90%
## Sampled: 799 of 1000, 79.90%
## Sampled: 899 of 1000, 89.90%
## Sampled: 999 of 1000, 99.90%
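
Before interpreting the posteriors, a quick convergence check on the retained samples is worthwhile. The recovered samples are coda mcmc objects, so (assuming the coda package is available) effective sample sizes and Geweke diagnostics can be computed directly; this check is an addition here, not part of the original workflow.

library(coda)
# Optional convergence diagnostics on the post-burn-in samples
round(effectiveSize(bat.fm$p.beta.recover.samples))    # effective sample sizes, beta
geweke.diag(bat.fm$p.beta.recover.samples)             # Geweke z-scores, beta
round(effectiveSize(bat.fm$p.theta.recover.samples))   # effective sample sizes, sigma.sq/tau.sq/phi
geweke.diag(bat.fm$p.theta.recover.samples)            # Geweke z-scores, sigma.sq/tau.sq/phi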

Plot posterior pdfs

# plot posterior pdfs (and trace plots) of the recovered samples
beta = bat.fm$p.beta.recover.samples    # regression coefficients (Intercept, lat, long, elev)
theta = bat.fm$p.theta.recover.samples  # covariance parameters (sigma.sq, tau.sq, phi)
plot(beta)

plot(theta)

summary(window(bat.fm$p.beta.recover.samples))
## 
## Iterations = 1:1000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 1000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                  Mean        SD  Naive SE Time-series SE
## (Intercept) 0.6825185 6.6031084 2.088e-01      2.088e-01
## lat         0.0813582 0.0853898 2.700e-03      2.525e-03
## long        0.0336542 0.0550883 1.742e-03      1.742e-03
## elev        0.0001581 0.0001013 3.202e-06      3.202e-06
## 
## 2. Quantiles for each variable:
## 
##                   2.5%        25%       50%       75%     97.5%
## (Intercept) -1.242e+01 -3.633e+00 0.6513587 5.3272268 1.395e+01
## lat         -8.707e-02  2.893e-02 0.0769258 0.1426181 2.420e-01
## long        -8.076e-02 -2.383e-03 0.0357933 0.0726966 1.401e-01
## elev        -4.586e-05  8.871e-05 0.0001561 0.0002264 3.572e-04
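
A complementary summary (not in the original write-up) is the posterior probability that each coefficient is positive, which can be read directly from the samples:

# Posterior probability that each regression coefficient is positive
apply(bat.fm$p.beta.recover.samples, 2, function(b) mean(b > 0))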

Plot precipitation posterior pdf, predicted precipitation, and standard error

n.samples2 = floor(n.samples*(2/3))                   # number of retained (post burn-in) samples
ylog = matrix(nrow=length(x1[,1]), ncol=n.samples2)   # predictions on the logit scale
yselog = matrix(nrow=length(x1[,1]), ncol=n.samples2) # standard errors on the logit scale

for (i in 1:n.samples2){
  # krige the GLM residuals using the i-th posterior draw of (sigma.sq, tau.sq, phi)
  zz = Krig(X[,1:2], glmfit$residuals, rho=theta[i,1], theta=theta[i,3], m=1, sigma2=theta[i,2])
  y2 = predict.Krig(zz, x=x1[,1:2], drop.Z=TRUE)        # spatial component at the DEM grid
  yselog[,i] = predictSE(zz, x=x1[,1:2], drop.Z=TRUE)   # kriging standard error
  # regression component from the i-th posterior draw of beta
  y1 = beta[i,1] + beta[i,2]*x1[,1] + beta[i,3]*x1[,2] + beta[i,4]*x1[,3]
  ylog[,i] = y1 + y2
}
y = logit2prob(ylog)      # back-transform from the logit scale to probabilities
yse = logit2prob(yselog)  # back-transform the standard errors
mean.y = apply(y, 1, FUN = mean)      # posterior mean probability at each grid cell
mean.yse = apply(yse, 1, FUN = mean)  # posterior mean standard error at each grid cell
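
logit2prob() is defined in Lib_HW2.R and is not listed here; a minimal version of the inverse-logit transformation it is assumed to perform is:

# Assumed behaviour of Lib_HW2.R's logit2prob(): inverse-logit,
# mapping values on the logit scale to probabilities in (0, 1)
logit2prob <- function(x) 1 / (1 + exp(-x))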

ggplot() +
 geom_histogram(aes(as.vector(y)),fill="gray50",col='black') +
  labs(title = "Histogram of Precipitation Posterior",
       x = "Predictions")+
  #theme_light()+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

###################################################################
## II. Spatially map the model estimates and the standard error ####
###################################################################
Pred=cbind(mean.y,mean.yse)
ypred=data.frame(Pred)
names(ypred)=c("fit","se")
Quilt_plotting(x1[,2],x1[,1],ypred,X[,2],X[,1],Pm,type="binary")
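
Quilt_plotting() is another Lib_HW2.R helper. A rough stand-in using fields::quilt.plot (fields is already loaded for Krig above; titles and layout are guesses at what the helper produces) would be:

# Approximate equivalent of Quilt_plotting(): map the posterior mean probability
# of precipitation and its standard error on the DEM grid, with station locations overlaid
par(mfrow = c(1, 2))
quilt.plot(x1$long, x1$lat, ypred$fit,
           main = "Predicted probability of precipitation",
           xlab = "Longitude", ylab = "Latitude")
points(X$long, X$lat, pch = ifelse(Pm, 19, 1), cex = 0.5)  # filled = wet, open = dry stations
quilt.plot(x1$long, x1$lat, ypred$se,
           main = "Standard error",
           xlab = "Longitude", ylab = "Latitude")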

Final comments

  • The nugget effect could not be set to 0 because doing so produced an error during code execution. However, the posterior distribution shows that this parameter (tau.sq) is close to 0.
  • An inverse-logit transformation was applied to the posterior predictions from the Bayesian Hierarchical Spatial Model, since the binomial first level uses a logit link and the model therefore operates on the logit scale.
  • Predictions are good: probabilities above 0.8 are obtained in zones where the observed precipitation is greater than 0.
  • Although the standard error tends to increase in zones where observation data are sparse, its range of variation is low, which is good. However, its values are still significant considering that the predicted variable is a probability (bounded between 0 and 1).
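
Finally, the problem statement also allows writing the model directly in RJAGS. That route was not taken here, but a rough, untested sketch of the same hierarchical structure (Bernoulli first level with a logit link, plus a spatial random effect with an exponential covariance; data names, priors, and settings are illustrative only) could look like:

library(rjags)

# Illustrative JAGS model (an alternative to the spBayes::spLM workflow used above).
# y: 0/1 precipitation occurrence; D: inter-station distance matrix; n: number of stations.
model_string <- "
model {
  for (i in 1:n) {
    y[i] ~ dbern(p[i])
    logit(p[i]) <- beta0 + beta1*lat[i] + beta2*long[i] + beta3*elev[i] + w[i]
  }
  # spatially correlated random effects with an exponential covariance
  for (i in 1:n) {
    mu.w[i] <- 0
    for (j in 1:n) {
      Sigma[i, j] <- sigma2 * exp(-phi * D[i, j])
    }
  }
  Omega[1:n, 1:n] <- inverse(Sigma[1:n, 1:n])
  w[1:n] ~ dmnorm(mu.w[1:n], Omega[1:n, 1:n])
  # weakly informative priors (illustrative)
  beta0 ~ dnorm(0, 1.0E-3)
  beta1 ~ dnorm(0, 1.0E-3)
  beta2 ~ dnorm(0, 1.0E-3)
  beta3 ~ dnorm(0, 1.0E-3)
  sigma2 ~ dunif(0, 100)
  phi ~ dunif(0.01, 10)
}
"
# jm <- jags.model(textConnection(model_string),
#                  data = list(y = as.numeric(Pm), lat = X$lat, long = X$long, elev = X$elev,
#                              D = as.matrix(dist(X[,1:2])), n = nrow(X)), n.chains = 1)
# samples <- coda.samples(jm, c("beta0","beta1","beta2","beta3","sigma2","phi"), n.iter = 5000)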