For this problem, the first level of the hierarchical model is a Binomial distribution (i.e., logistic regression).
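Concretely, with $p_i$ the probability that site $i$ exceeds the precipitation threshold, the first level is $Y_i \mid p_i \sim \mathrm{Bernoulli}(p_i)$ with $\mathrm{logit}(p_i) = \mathbf{x}_i^{\top}\boldsymbol{\beta} + w(\mathbf{s}_i)$; the second level places a mean-zero Gaussian-process prior with exponential covariance on the spatial effect $w(\cdot)$.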
#### Clear Memory and Set Options
rm(list=ls())
options(warn=-1)
save <- FALSE # Change to TRUE if you want to save the plot
# Set working directory
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(script_dir)
source("HW2_Library.R")
# Packages used below (HW2_Library.R may already attach these, in which case
# the calls are harmless no-ops)
library(geoR)
library(spBayes)
library(fields)
library(ggplot2)
climatol_data <- read.table("data/climatol_ann.txt", header=FALSE)
names(climatol_data) <- c("Longitude", "Latitude", "Elevation", "AnnualPrecipitation")
distance_data <- read.table("data/2b-climatol_ann_with_distances.txt", header=TRUE)
#### Rename columns in distance_data to match climatol_data
names(distance_data) <- c("Longitude", "Latitude", "Elevation", "Dist_ArabianSea", "Dist_BayOfBengal")
#### Merge the two datasets
merged_data <- merge(climatol_data, distance_data, by=c("Longitude", "Latitude", "Elevation"))
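A quick defensive check, not in the original script: if the distance file covers every climatology site, the inner join should not drop any rows.
stopifnot(nrow(merged_data) == nrow(climatol_data)) # merge should keep all sites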
prec_obs <- merged_data$AnnualPrecipitation # Extract precipitation column
#### Create Design Matrix
X <- merged_data[, c("Longitude", "Latitude", "Elevation", "Dist_ArabianSea", "Dist_BayOfBengal")]
names(X) <- c("Long", "Lat", "Elev", "DistArabian", "DistBay")
lon <- X[,1]
lat <- X[,2]
ele <- X[,3]
dista <- X[,4]
distb <- X[,5]
precip <- prec_obs/10 # convert from mm to cm
xs = cbind(lon, lat)
## Categorize precipitation into a binary variable
q = 150 # threshold (cm): 1 if annual precipitation >= q, else 0
prec_bin = as.numeric(precip >= q)
glmfit = GLM_fit(prec_bin, X, family = "binomial")
## [1] "Results of AIC for bestfit GLM"
## glm_val
## logit 167.5007
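GLM_fit is defined in HW2_Library.R, which is not reproduced here. Judging from the printed AIC table and the `binomial(link = links[which.min(glm_val)])` call in the summary below, it appears to compare candidate link functions by AIC and to drop weak covariates (Lat and Elev are absent from the coefficients). A minimal sketch under those assumptions; the function name, candidate links, and the use of step() are guesses, and the real helper may differ:
# Hypothetical reconstruction of GLM_fit: stepwise-AIC fit under each
# candidate link, returning the best-link, best-subset model
GLM_fit_sketch <- function(y, X, family = "binomial") {
  xx <- data.frame(prec_obs = y, X)
  links <- c("logit", "probit", "cloglog")
  fits <- lapply(links, function(l)
    step(glm(prec_obs ~ ., family = binomial(link = l), data = xx), trace = 0))
  glm_val <- sapply(fits, AIC)
  names(glm_val) <- links
  print("Results of AIC for bestfit GLM")
  print(cbind(glm_val))
  fits[[which.min(glm_val)]]
}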
if (save) {
output_file <- "3-glmfit_summary.txt"
sink(output_file)
}
print(summary(glmfit))
##
## Call:
## glm(formula = prec_obs ~ ., family = binomial(link = links[which.min(glm_val)]),
## data = xx)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -96.6420 27.8467 -3.471 0.000519 ***
## Long 1.2479 0.3627 3.440 0.000581 ***
## DistArabian -0.8949 0.2744 -3.261 0.001111 **
## DistBay 0.4106 0.2038 2.015 0.043908 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 204.82 on 245 degrees of freedom
## Residual deviance: 159.50 on 242 degrees of freedom
## AIC: 167.5
##
## Number of Fisher Scoring iterations: 6
if (save) {
sink()
}
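As a quick in-sample sanity check, not part of the original script (and assuming GLM_fit returns an ordinary fitted glm object, as the summary above suggests), the fitted exceedance probabilities can be tabulated against the binary observations:
# Confusion table at a 0.5 cutoff; in-sample, so this is optimistic
table(observed = prec_bin, predicted = as.numeric(fitted(glmfit) > 0.5))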
# GLM residuals as a geodata object: data in column 1, coordinates in columns 2:3
geod = as.geodata(cbind(glmfit$residuals, lon, lat), coords.col = 2:3, data.col = 1)
vg = variog(geod, breaks = seq(50, 10000, 50))
## variog: computing omnidirectional variogram
# Rough hyperparameter guesses from the empirical variogram: the median
# semivariance as a sill (sigma.sq) guess and the median lag distance as a
# range (phi) guess, with wide bounds for the priors below
sigma.sq = median(vg$v)
sigma.sq.lo = 0.25*sigma.sq
sigma.sq.hi = 1.75*sigma.sq
phi.val = median(vg$u)
phi.lo = 0.25*phi.val
phi.hi = 1.75*phi.val
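An optional visual check, not in the original script: plot the empirical semivariogram to confirm that the sill and range guesses above are sensible.
plot(vg, main = "Empirical semivariogram of GLM residuals")
abline(h = sigma.sq, lty = 2) # sill guess (sigma.sq)
abline(v = phi.val, lty = 3)  # range guess (phi.val)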
# Number of MCMC samples to draw
n.samples = 1500
# Coordinates of observations
coords = cbind(lon, lat)
# Prior Distributions Defined
priors = list("beta.Flat",
              "phi.Unif" = c(phi.lo, phi.hi),
              "sigma.sq.IG" = c(sigma.sq.lo, sigma.sq.hi),
              "tau.sq.IG" = c(0.01, 0.001))
# Starting values for MCMC resampling
starting = list("phi" = phi.val, "sigma.sq" = sigma.sq, "tau.sq" = 1)
# Adjustment factor for MCMC sampling routine
tuning = list("phi" = 1, "sigma.sq" = 1, "tau.sq" = 1)
# Fit the Bayesian spatial linear model by Markov chain Monte Carlo (MCMC)
bat.fm = spLM(prec_bin ~ ., data = X, coords = as.matrix(X[, 1:2]),
              priors = priors, tuning = tuning, starting = starting,
              cov.model = "exponential", n.samples = n.samples,
              verbose = FALSE, n.report = 50)
# Jitter duplicated coordinates slightly so the spatial covariance matrix
# used in recovery is non-singular
dup = duplicated(bat.fm$coords); bat.fm$coords[dup] <- bat.fm$coords[dup] + 1e-3
# Recover beta and w, discarding the first third of the samples as burn-in
bat.fm = spRecover(bat.fm, start = ((1/3)*n.samples)+1, thin = 1, verbose = TRUE)
## -------------------------------------------------
## Recovering beta and w
## -------------------------------------------------
## Sampled: 99 of 1000, 9.90%
## Sampled: 199 of 1000, 19.90%
## Sampled: 299 of 1000, 29.90%
## Sampled: 399 of 1000, 39.90%
## Sampled: 499 of 1000, 49.90%
## Sampled: 599 of 1000, 59.90%
## Sampled: 699 of 1000, 69.90%
## Sampled: 799 of 1000, 79.90%
## Sampled: 899 of 1000, 89.90%
## Sampled: 999 of 1000, 99.90%
beta = bat.fm$p.beta.recover.samples   # posterior draws of the regression coefficients
theta = bat.fm$p.theta.recover.samples # posterior draws of (sigma.sq, tau.sq, phi)
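Because spRecover returns coda mcmc objects, standard MCMC diagnostics apply directly. A quick check, assuming the coda package (a dependency of spBayes) is available:
library(coda)
effectiveSize(theta) # effective sample size per covariance parameter
geweke.diag(beta)    # Geweke z-scores; |z| well above 2 flags non-convergence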
if (save) pdf("3-posterior_plots.pdf")
# Adjust plotting layout
par(mfrow=c(2,2), mar=c(4,4,2,1))
for (i in 1:6) {
dens <- density(beta[,i])
plot.ts(beta[,i], main=paste("Trace of", colnames(beta)[i]), ylab="Value", xlab="Iterations")
plot(dens, main=paste("Density of", colnames(beta)[i]), xlab=paste("N=", length(beta[,i]), "Bandwidth=", round(dens$bw, 6)))
}
par(mfrow=c(3,2), mar=c(4,4,2,1))
for (i in 1:3) {
dens <- density(theta[,i])
plot.ts(theta[,i], main=paste("Trace of", colnames(theta)[i]), ylab="Value", xlab="Iterations")
plot(dens, main=paste("Density of", colnames(theta)[i]), xlab=paste("N=", length(theta[,i]), "Bandwidth=", round(dens$bw, 4)))
}
if (save) dev.off()
summary(window(bat.fm$p.beta.recover.samples))
##
## Iterations = 1:1000
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 1000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## (Intercept) -6.034e+00 5.84933 1.850e-01 1.966e-01
## Long 8.769e-02 0.07389 2.337e-03 2.487e-03
## Lat -5.352e-02 0.03095 9.787e-04 8.755e-04
## Elev -8.258e-05 0.00011 3.478e-06 3.478e-06
## DistArabian -1.435e-02 0.07343 2.322e-03 2.263e-03
## DistBay 8.342e-02 0.03549 1.122e-03 1.188e-03
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## (Intercept) -1.723e+01 -1.027e+01 -5.883e+00 -2.1493055 5.6202159
## Long -6.001e-02 3.830e-02 8.530e-02 0.1403668 0.2295232
## Lat -1.149e-01 -7.391e-02 -5.357e-02 -0.0324948 0.0063597
## Elev -3.005e-04 -1.534e-04 -8.192e-05 -0.0000059 0.0001253
## DistArabian -1.572e-01 -6.664e-02 -1.326e-02 0.0360340 0.1316674
## DistBay 1.330e-02 6.022e-02 8.401e-02 0.1064842 0.1501446
#### Precipitation Posterior PDF
library(parallel)
# Read the high-resolution elevation grid
grid_data <- read.table("data/india-grid-topo.txt", header=FALSE)
colnames(grid_data) <- c("Longitude", "Latitude", "Elevation")
lon_hr <- grid_data[,1]
lat_hr <- grid_data[,2]
elev_hr <- grid_data[,3]
xps <- cbind(lon_hr, lat_hr)
n_beta <- nrow(beta) # number of retained posterior draws
y <- matrix(NA, nrow = nrow(grid_data), ncol = n_beta)   # predictions (logit scale)
yse <- matrix(NA, nrow = nrow(grid_data), ncol = n_beta) # prediction standard errors
# Set up parallel cluster
n.cores <- max(1, detectCores() - 2)
cl <- makeCluster(n.cores)
# Load fields package and export variables to workers
invisible(clusterEvalQ(cl, library(fields)))
clusterExport(cl, varlist = c("xs", "xps", "theta", "beta", "glmfit", "lon_hr", "lat_hr", "elev_hr", "n_beta"), envir = environment())
# Run parallel Kriging and prediction
results <- parLapply(cl, 1:n_beta, function(i) {
# Krige the GLM residuals using the i-th posterior draw of the covariance
# parameters; theta columns are (sigma.sq, tau.sq, phi) from spRecover
zz <- Krig(xs, glmfit$residuals, sigma = theta[i,1], theta = theta[i,3], m = 1, tau2 = theta[i,2])
y2 <- predict(zz, x = xps, drop.Z = TRUE)
yse_i <- predictSE(zz, x = xps, drop.Z = TRUE)
# Trend from the i-th posterior draw of beta; the distance covariates are
# omitted because the high-resolution grid provides only lon/lat/elevation
y1 <- beta[i,1] + beta[i,2]*lon_hr + beta[i,3]*lat_hr + beta[i,4]*elev_hr
y_i <- y1 + y2
list(y = y_i, yse = yse_i)
})
# Stop cluster
stopCluster(cl)
# Combine results
for (i in 1:n_beta) {
y[,i] <- results[[i]]$y
yse[,i] <- results[[i]]$yse
}
y = logit2prob(y)     # map predictions from the logit scale to probabilities
yse = logit2prob(yse) # map standard errors through the same transform
mean.y <- apply(y, 1, FUN = mean)     # posterior mean exceedance probability per grid point
mean.yse <- apply(yse, 1, FUN = mean) # posterior mean standard error per grid point
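logit2prob comes from HW2_Library.R and is presumably the standard inverse-logit, equivalent to base R's plogis; a sketch under that assumption:
# Presumed definition of logit2prob; the HW2_Library.R version may differ
logit2prob <- function(logit) {
  odds <- exp(logit)
  odds / (1 + odds)
}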
# Plot histogram of posterior predictions
g1 <- ggplot() + geom_histogram(aes(as.vector(y)), fill = "gray50", col = "black") +
labs(title = "3-Histogram of Precipitation Posterior", x = "Predicted exceedance probability") +
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
print(g1)
# grid_data (the high-resolution elevation grid) was already read above
# Read the Rajeevan grid defining India's area
rajeevan_grid = read.table("data/2b-Rajeevan_grid_with_distances.txt", header=TRUE)
colnames(rajeevan_grid)[1:3] <- c("Longitude", "Latitude", "Elevation")
# Perform an inner join to retain only points within India's defined region
india_grid = merge(rajeevan_grid, grid_data, by = c("Longitude", "Latitude", "Elevation"))
# Extract the index of points from the full prediction grid that are in the Rajeevan grid
# (assuming the order of grid_data was used in predictions)
india_index <- match(paste(india_grid$Longitude, india_grid$Latitude),
paste(grid_data$Longitude, grid_data$Latitude))
# Add predicted posterior mean and standard error from MCMC-based Kriging
india_grid$PredictedPrecip <- mean.y[india_index]
india_grid$StandardError <- mean.yse[india_index]
# Plot estimates and standard error on high-resolution grid
#### Define Color Palettes
prec_palette <- hcl.colors(10, palette = "Zissou 1")
se_palette <- hcl.colors(10, palette = "Inferno")
#### Plot 1: Predicted Exceedance Probability Map
g1 <- ggplot(india_grid) +
geom_tile(aes(x = Longitude, y = Latitude, fill = PredictedPrecip)) +
scale_fill_gradientn(name = "P(precip >= 150 cm)", colours = prec_palette, na.value = "white") +
theme_bw() +
coord_fixed() +
ggtitle("Spatial Distribution of Predicted Exceedance Probability")
#### Plot 2: Standard Error Map
g2 <- ggplot(india_grid) +
geom_tile(aes(x = Longitude, y = Latitude, fill = StandardError)) +
scale_fill_gradientn(name = "Std. Error", colours = se_palette, na.value = "white") +
theme_bw() +
coord_fixed() +
ggtitle("Spatial Distribution of Prediction Standard Error")
#### Save Plots
if (save) {
ggsave("3-Predicted_Precipitation_Map.pdf", g1, width=8, height=6)
ggsave("3-Standard_Error_Map.pdf", g2, width=8, height=6)
}
#### Display Plots
print(g1)
print(g2)