3. Develop a Bayesian Hierarchical Spatial Model for problem 2 above.

For this model, the first level is a Binomial distribution (i.e., logistic regression); the second level adds a spatially correlated random effect.
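
Written out, the intended hierarchy is (a sketch; the covariance form and priors match the spLM call below):

y_i | p_i   ~ Bernoulli(p_i)
logit(p_i)  = x_i' * beta + w(s_i)
w(.)        ~ GP(0, sigma^2 * exp(-phi * d)), plus a nugget tau^2
beta ~ flat,  phi ~ Uniform,  sigma^2 and tau^2 ~ Inverse-Gamma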

#### Clear Memory and Set Options
rm(list=ls())
options(warn=-1)
save <- FALSE  # Change to TRUE if you want to save the plot

# Set working directory
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(script_dir)

source("HW2_Library.R")
#### Read data
climatol_data <- read.table("data/climatol_ann.txt", header=FALSE)
names(climatol_data) <- c("Longitude", "Latitude", "Elevation", "AnnualPrecipitation")

distance_data <- read.table("data/2b-climatol_ann_with_distances.txt", header=TRUE)

#### Rename columns in distance_data to match climatol_data
names(distance_data) <- c("Longitude", "Latitude", "Elevation", "Dist_ArabianSea", "Dist_BayOfBengal")

#### Merge the two datasets
merged_data <- merge(climatol_data, distance_data, by=c("Longitude", "Latitude", "Elevation"))
prec_obs <- merged_data$AnnualPrecipitation  # Extract precipitation column

#### Create Design Matrix
X <- merged_data[, c("Longitude", "Latitude", "Elevation", "Dist_ArabianSea", "Dist_BayOfBengal")]
names(X) <- c("Long", "Lat", "Elev", "DistArabian", "DistBay")

lon <- X[,1]
lat <- X[,2]
ele <- X[,3]
dista <- X[,4]
distb <- X[,5]

precip <- prec_obs/10  # Convert to cm
xs = cbind(lon, lat)

## Categorize precipitation into a binary variable: 1 if >= 150 cm, else 0
q <- 150
prec_bin <- as.integer(precip >= q)
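
A quick look at the resulting class balance (an optional check, not in the original script):

# Optional: how many observations fall below (0) and at/above (1) the 150 cm threshold
table(prec_bin)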

Fit GLM

glmfit = GLM_fit(prec_bin, X, family = "binomial")
## [1] "Results of AIC for bestfit GLM"
##        glm_val
## logit 167.5007
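
GLM_fit() is defined in HW2_Library.R. Judging from the printed output, it fits binomial GLMs under several link functions and keeps the AIC-best one (the dropped Lat and Elev terms in the summary below suggest it also performs some variable selection). A minimal sketch of the link comparison, under those assumptions only:

# Hypothetical sketch of GLM_fit's link comparison; the actual helper in
# HW2_Library.R may differ (e.g., it appears to also drop weak predictors).
GLM_fit_sketch <- function(y, X) {
  links <- c("logit", "probit", "cloglog")
  xx <- data.frame(prec_obs = y, X)
  fits <- lapply(links, function(l)
    glm(prec_obs ~ ., family = binomial(link = l), data = xx))
  glm_val <- setNames(sapply(fits, AIC), links)
  print(glm_val)
  fits[[which.min(glm_val)]]  # keep the AIC-best link
}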
if (save) {
  output_file <- "3-glmfit_summary.txt"
  sink(output_file)
}

print(summary(glmfit))
## 
## Call:
## glm(formula = prec_obs ~ ., family = binomial(link = links[which.min(glm_val)]), 
##     data = xx)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -96.6420    27.8467  -3.471 0.000519 ***
## Long          1.2479     0.3627   3.440 0.000581 ***
## DistArabian  -0.8949     0.2744  -3.261 0.001111 ** 
## DistBay       0.4106     0.2038   2.015 0.043908 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 204.82  on 245  degrees of freedom
## Residual deviance: 159.50  on 242  degrees of freedom
## AIC: 167.5
## 
## Number of Fisher Scoring iterations: 6
if (save) {
  sink()
}

Fit variogram and perform MCMC

geod = as.geodata(cbind(glmfit$residuals, lon, lat), data.col = 1)
vg = variog(geod,breaks = seq(50,10000,50))
## variog: computing omnidirectional variogram
sigma.sq = median(vg$v)
sigma.sq.lo = 0.25*sigma.sq
sigma.sq.hi = 1.75*sigma.sq
phi.val = median(vg$u)
phi.lo = 0.25*phi.val
phi.hi = 1.75*phi.val
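
A quick plot of the empirical variogram (optional, not in the original script) helps check visually whether these sill and range guesses are reasonable:

# Optional visual check of the variogram-derived starting values
plot(vg)
abline(h = sigma.sq, lty = 2)  # sill guess (median semivariance)
abline(v = phi.val, lty = 3)   # range-scale guess (median lag distance)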
# Input number of samples
n.samples = 1500
# Coordinates of observations
coords = cbind(lon, lat)
# Prior Distributions Defined
priors = list("beta.flat",
              "phi.unif"    = c(phi.lo, phi.hi),
              "sigma.sq.ig" = c(sigma.sq.lo, sigma.sq.hi),
              "tau.sq.ig"   = c(0.01, 0.001))
# Starting values for MCMC resampling
starting = list("phi" = phi.val, "sigma.sq" = sigma.sq, "tau.sq" = 1)
# Adjustment factor for MCMC sampling routine
tuning = list("phi" = 1, "sigma.sq" = 1, "tau.sq" = 1)

# Perform Markov chain Monte Carlo (MCMC) sampling
bat.fm = spLM(prec_bin ~ ., data = X, coords = as.matrix(X[, 1:2]),
              priors = priors, tuning = tuning, starting = starting,
              cov.model = "exponential", n.samples = n.samples,
              verbose = F, n.report = 50)
# Jitter duplicated coordinates slightly to avoid a singular spatial covariance
dup = duplicated(bat.fm$coords); bat.fm$coords[dup] <- bat.fm$coords[dup] + 1e-3
# Recover beta and w, discarding the first third of the samples as burn-in
bat.fm = spRecover(bat.fm, start = ((1/3)*n.samples) + 1, thin = 1, verbose = T)
## -------------------------------------------------
##      Recovering beta and w
## -------------------------------------------------
## Sampled: 99 of 1000, 9.90%
## Sampled: 199 of 1000, 19.90%
## Sampled: 299 of 1000, 29.90%
## Sampled: 399 of 1000, 39.90%
## Sampled: 499 of 1000, 49.90%
## Sampled: 599 of 1000, 59.90%
## Sampled: 699 of 1000, 69.90%
## Sampled: 799 of 1000, 79.90%
## Sampled: 899 of 1000, 89.90%
## Sampled: 999 of 1000, 99.90%
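
Mixing of the recovered chains can be checked with coda (loaded as a dependency of spBayes); an optional diagnostic, not part of the original script:

# Optional convergence diagnostics for the recovered beta samples
library(coda)
effectiveSize(bat.fm$p.beta.recover.samples)  # effective sample size per coefficient
geweke.diag(bat.fm$p.beta.recover.samples)    # Geweke z-scores for mean stationarity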

Plot Posterior PDFs

beta = bat.fm$p.beta.recover.samples
theta = bat.fm$p.theta.recover.samples

if (save) pdf("3-posterior_plots.pdf")

# Adjust plotting layout
par(mfrow=c(2,2), mar=c(4,4,2,1))
for (i in 1:6) {
    dens <- density(beta[, i])
    plot.ts(beta[, i], main = paste("Trace of", colnames(beta)[i]),
            ylab = "Value", xlab = "Iterations")
    plot(dens, main = paste("Density of", colnames(beta)[i]),
         xlab = paste("N =", nrow(beta), "Bandwidth =", round(dens$bw, 6)))
}

par(mfrow=c(3,2), mar=c(4,4,2,1))
for (i in 1:3) {
    dens <- density(theta[, i])
    plot.ts(theta[, i], main = paste("Trace of", colnames(theta)[i]),
            ylab = "Value", xlab = "Iterations")
    plot(dens, main = paste("Density of", colnames(theta)[i]),
         xlab = paste("N =", nrow(theta), "Bandwidth =", round(dens$bw, 4)))
}

if (save) dev.off()
summary(window(bat.fm$p.beta.recover.samples))
## 
## Iterations = 1:1000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 1000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                   Mean      SD  Naive SE Time-series SE
## (Intercept) -6.034e+00 5.84933 1.850e-01      1.966e-01
## Long         8.769e-02 0.07389 2.337e-03      2.487e-03
## Lat         -5.352e-02 0.03095 9.787e-04      8.755e-04
## Elev        -8.258e-05 0.00011 3.478e-06      3.478e-06
## DistArabian -1.435e-02 0.07343 2.322e-03      2.263e-03
## DistBay      8.342e-02 0.03549 1.122e-03      1.188e-03
## 
## 2. Quantiles for each variable:
## 
##                   2.5%        25%        50%        75%     97.5%
## (Intercept) -1.723e+01 -1.027e+01 -5.883e+00 -2.1493055 5.6202159
## Long        -6.001e-02  3.830e-02  8.530e-02  0.1403668 0.2295232
## Lat         -1.149e-01 -7.391e-02 -5.357e-02 -0.0324948 0.0063597
## Elev        -3.005e-04 -1.534e-04 -8.192e-05 -0.0000059 0.0001253
## DistArabian -1.572e-01 -6.664e-02 -1.326e-02  0.0360340 0.1316674
## DistBay      1.330e-02  6.022e-02  8.401e-02  0.1064842 0.1501446

Plot the precipitation posterior PDF, the predicted precipitation, and its standard error

## Precipitation posterior PDF
library(parallel)

# Read the high-resolution elevation grid
grid_data <- read.table("data/india-grid-topo.txt", header=FALSE)
colnames(grid_data) <- c("Longitude", "Latitude", "Elevation")

lon_hr <- grid_data[,1]
lat_hr <- grid_data[,2]
elev_hr <- grid_data[,3]
xps <- cbind(lon_hr, lat_hr)

n_beta <- nrow(beta)
y <- matrix(NA, nrow = nrow(grid_data), ncol = n_beta)
yse <- matrix(NA, nrow = nrow(grid_data), ncol = n_beta)

# Set up parallel cluster
n.cores <- max(1, detectCores() - 2)
cl <- makeCluster(n.cores)
# Load the fields package on the workers and export the needed variables
invisible(clusterEvalQ(cl, library(fields)))
clusterExport(cl, varlist = c("xs", "xps", "theta", "beta", "glmfit",
                              "lon_hr", "lat_hr", "elev_hr", "n_beta"),
              envir = environment())

# Run parallel Kriging and prediction
results <- parLapply(cl, 1:n_beta, function(i) {
  # Krige the GLM residuals using the i-th posterior draw of (sigma.sq, tau.sq, phi)
  zz <- Krig(xs, glmfit$residuals, sigma = theta[i,1], theta = theta[i,3], m = 1, tau2 = theta[i,2])
  y2 <- predict(zz, x = xps, drop.Z = TRUE)
  yse_i <- predictSE(zz, x = xps, drop.Z = TRUE)
  # Regression part on the logit scale; only Intercept, Long, Lat, and Elev are
  # used, since the high-resolution topo grid lacks the distance covariates
  y1 <- beta[i,1] + beta[i,2]*lon_hr + beta[i,3]*lat_hr + beta[i,4]*elev_hr
  y_i <- y1 + y2
  list(y = y_i, yse = yse_i)
})
  
# Stop the cluster
stopCluster(cl)

# Combine results
for (i in 1:n_beta) {
  y[,i] <- results[[i]]$y
  yse[,i] <- results[[i]]$yse
}

y = logit2prob(y)      # Back-transform logits to probabilities
yse = logit2prob(yse)  # Back-transform the standard errors the same way
mean.y <- apply(y, 1, FUN = mean)      # Posterior mean at each grid point
mean.yse <- apply(yse, 1, FUN = mean)  # Posterior mean standard error at each grid point
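
logit2prob() comes from HW2_Library.R and presumably applies the inverse-logit; a one-line equivalent, assuming that definition:

# Assumed definition (equivalent to plogis); the actual helper is in HW2_Library.R
logit2prob <- function(l) 1 / (1 + exp(-l))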

# Plot histogram of posterior predictions
g1 <- ggplot() +
  geom_histogram(aes(as.vector(y)), fill = "gray50", col = "black") +
  labs(title = "3-Histogram of Precipitation Posterior", x = "Predictions") +
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
print(g1)

Spatially map the model estimates and standard error on the high-resolution grid

# Read the high-resolution elevation grid
grid_data <- read.table("data/india-grid-topo.txt", header=FALSE)
colnames(grid_data) <- c("Longitude", "Latitude", "Elevation")

# Read the Rajeevan grid defining India's area
rajeevan_grid = read.table("data/2b-Rajeevan_grid_with_distances.txt", header=TRUE)
colnames(rajeevan_grid)[1:3] <- c("Longitude", "Latitude", "Elevation")


# Perform an inner join to retain only points within India's defined region
india_grid = merge(rajeevan_grid, grid_data, by = c("Longitude", "Latitude", "Elevation"))
# Extract the index of points from the full prediction grid that are in the Rajeevan grid
# (assuming the order of grid_data was used in predictions)
india_index <- match(paste(india_grid$Longitude, india_grid$Latitude),
                     paste(grid_data$Longitude, grid_data$Latitude))
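
Because the match relies on exact coordinate equality between the two files, a guard against unmatched points may be worth adding (optional, not in the original):

# Optional: stop if any Rajeevan grid point is missing from the prediction grid
stopifnot(!anyNA(india_index))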

# Add predicted posterior mean and standard error from MCMC-based Kriging
india_grid$PredictedPrecip <- mean.y[india_index]
india_grid$StandardError <- mean.yse[india_index]

# Plot estimates and standard error on the high-resolution grid

#### Define Color Palettes
prec_palette <- hcl.colors(10, palette = "Zissou 1")
se_palette <- hcl.colors(10, palette = "Inferno")

#### Plot 1: Predicted Precipitation Map
g1 <- ggplot(india_grid) +
  geom_tile(aes(x = Longitude, y = Latitude, fill = PredictedPrecip)) +
  scale_fill_gradientn(name = "Precipitation", colours = prec_palette, na.value = "white") +
  theme_bw() +
  coord_fixed() +
  ggtitle("Spatial Distribution of Predicted Precipitation")

#### Plot 2: Standard Error Map
g2 <- ggplot(india_grid) +
  geom_tile(aes(x = Longitude, y = Latitude, fill = StandardError)) +
  scale_fill_gradientn(name = "Std. Error", colours = se_palette, na.value = "white") +
  theme_bw() +
  coord_fixed() +
  ggtitle("Spatial Distribution of Prediction Standard Error")

#### Save Plots
if (save) {
  ggsave("3-Predicted_Precipitation_Map.pdf", g1, width=8, height=6)
  ggsave("3-Standard_Error_Map.pdf", g2, width=8, height=6)
}

#### Display Plots
print(g1)
print(g2)