1. Problem 7 from HW1 with a Bayesian Hierarchical Spatial Model

[In the Bayesian models, plot the posterior histogram/PDF of the parameters and spatial maps of the posterior mean and standard error.]

#### Clear Memory and Set Options
rm(list=ls())
options(warn=-1)
save <- FALSE  # Change to TRUE to save the plots

# Set working directory (Modify if necessary)
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(script_dir)

source("HW2_Library.R")
#### Read Data
climatol_data <- read.table("data/climatol_ann.txt", header=FALSE)
names(climatol_data) <- c("Longitude", "Latitude", "Elevation", "AnnualPrecipitation")

distance_data <- read.table("data/2b-climatol_ann_with_distances.txt", header=TRUE)

#### Rename columns in distance_data to match climatol_data
names(distance_data) <- c("Longitude", "Latitude", "Elevation", "Dist_ArabianSea", "Dist_BayOfBengal")

#### Merge the two datasets
merged_data <- merge(climatol_data, distance_data, by=c("Longitude", "Latitude", "Elevation"))
prec_obs <- merged_data$AnnualPrecipitation  # Extract precipitation column

#### Create Design Matrix
X <- merged_data[, c("Longitude", "Latitude", "Elevation", "Dist_ArabianSea", "Dist_BayOfBengal")]
names(X) <- c("Long", "Lat", "Elev", "DistArabian", "DistBay")

lon <- X[,1]
lat <- X[,2]
ele <- X[,3]
dista <- X[,4]
distb <- X[,5]

precip <- prec_obs/10  # Convert from mm to cm
xs = cbind(lon, lat)
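
# Optional consistency check (a sketch, not part of the assignment): the GLM
# summary below reports 245 null degrees of freedom, i.e. 246 stations should
# survive the merge.
cat("Stations after merge:", nrow(merged_data), "\n")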

Fit GLM

glmfit = glm(prec_obs ~ ., data = X, family = Gamma(link = "log"))

if (save) {
  output_file <- "1-glmfit_summary.txt"
  sink(output_file)
}

print(summary(glmfit))
## 
## Call:
## glm(formula = prec_obs ~ ., family = Gamma(link = "log"), data = X)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -9.431e+00  5.134e+00  -1.837 0.067422 .  
## Long         2.175e-01  6.490e-02   3.351 0.000937 ***
## Lat          8.008e-03  2.726e-02   0.294 0.769193    
## Elev        -1.032e-04  9.631e-05  -1.072 0.284810    
## DistArabian -1.578e-01  6.420e-02  -2.459 0.014650 *  
## DistBay      5.025e-02  3.161e-02   1.590 0.113210    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.2055282)
## 
##     Null deviance: 61.286  on 245  degrees of freedom
## Residual deviance: 36.151  on 240  degrees of freedom
## AIC: 3646.9
## 
## Number of Fisher Scoring iterations: 14
if (save) {
  sink()
}

Fit variogram and perform MCMC

geod = as.geodata(cbind(glmfit$residuals, lon, lat), data.col = 1)
vg = variog(geod, breaks = seq(50, 10000, 50))
## variog: computing omnidirectional variogram
sigma.sq = median(vg$v)
sigma.sq.lo = 0.25*sigma.sq
sigma.sq.hi = 1.75*sigma.sq
phi.val = median(vg$u)
phi.lo = 0.25*phi.val
phi.hi = 1.75*phi.val
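
# Optional visual check of these heuristics (a sketch, not required for the
# fit): plot the empirical semivariogram and mark the starting sill and range.
plot(vg, main = "Empirical semivariogram of GLM residuals")
abline(h = sigma.sq, lty = 2)  # starting sill
abline(v = phi.val, lty = 3)   # starting range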
# Number of MCMC samples
n.samples = 1500
# Coordinates of observations
coords = cbind(lon, lat)
# Prior distributions (spBayes naming: flat prior on beta, uniform on phi,
# inverse-gamma on sigma.sq and tau.sq)
priors = list("beta.Flat", "phi.Unif" = c(phi.lo, phi.hi),
              "sigma.sq.IG" = c(sigma.sq.lo, sigma.sq.hi),
              "tau.sq.IG" = c(1, 0.01))
# Starting values for the MCMC sampler
starting = list("phi" = phi.val, "sigma.sq" = sigma.sq, "tau.sq" = 1)
# Metropolis proposal (tuning) variances for the covariance parameters
tuning = list("phi" = 1, "sigma.sq" = 1, "tau.sq" = 1)

# Fit the Bayesian hierarchical spatial model via Markov chain Monte Carlo (MCMC)
bat.fm = spLM(prec_obs ~ ., data = X, coords = as.matrix(X[, 1:2]),
              priors = priors, tuning = tuning, starting = starting,
              cov.model = "exponential", n.samples = n.samples,
              verbose = F, n.report = 50)
# Jitter duplicated coordinates slightly so spRecover does not fail on exact ties
dup = duplicated(bat.fm$coords); bat.fm$coords[dup] <- bat.fm$coords[dup] + 1e-3
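
# Optional (a sketch): Metropolis acceptance percentages reported by spLM;
# rates far from roughly 25-45% suggest revisiting the 'tuning' variances above.
print(bat.fm$acceptance)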
# Discard the first third of the samples as burn-in and recover beta and w
bat.fm = spRecover(bat.fm, start = ((1/3)*n.samples) + 1, thin = 1, verbose = T)
## -------------------------------------------------
##      Recovering beta and w
## -------------------------------------------------
## Sampled: 99 of 1000, 9.90%
## Sampled: 199 of 1000, 19.90%
## Sampled: 299 of 1000, 29.90%
## Sampled: 399 of 1000, 39.90%
## Sampled: 499 of 1000, 49.90%
## Sampled: 599 of 1000, 59.90%
## Sampled: 699 of 1000, 69.90%
## Sampled: 799 of 1000, 79.90%
## Sampled: 899 of 1000, 89.90%
## Sampled: 999 of 1000, 99.90%
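
Before plotting, an optional mixing check: the recovered samples are coda mcmc objects (coda is a dependency of spBayes), so standard diagnostics apply directly. A minimal sketch:

# Effective sample sizes and Geweke z-scores for the covariance parameters
print(effectiveSize(bat.fm$p.theta.recover.samples))
print(geweke.diag(bat.fm$p.theta.recover.samples))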

Plot Posterior PDFs

beta = bat.fm$p.beta.recover.samples    # posterior draws of the regression coefficients
theta = bat.fm$p.theta.recover.samples  # posterior draws of sigma.sq, tau.sq, phi


if (save) pdf("1-posterior_plots.pdf")

# Adjust plotting layout
par(mfrow = c(2, 2), mar = c(4, 4, 2, 1))
for (i in 1:6) {
    dens <- density(beta[, i])
    plot.ts(beta[, i], main = paste("Trace of", colnames(beta)[i]),
            ylab = "Value", xlab = "Iterations")
    plot(dens, main = paste("Density of", colnames(beta)[i]),
         xlab = paste("N =", length(beta[, i]), "Bandwidth =", round(dens$bw, 2)))
}

par(mfrow = c(3, 2), mar = c(4, 4, 2, 1))
for (i in 1:3) {
    dens <- density(theta[, i])
    plot.ts(theta[, i], main = paste("Trace of", colnames(theta)[i]),
            ylab = "Value", xlab = "Iterations")
    plot(dens, main = paste("Density of", colnames(theta)[i]),
         xlab = paste("N =", length(theta[, i]), "Bandwidth =", round(dens$bw, 2)))
}

if (save) dev.off()
summary(window(bat.fm$p.beta.recover.samples))
## 
## Iterations = 1:1000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 1000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                   Mean        SD  Naive SE Time-series SE
## (Intercept) -1.474e+04 5855.7716 1.852e+02      1.852e+02
## Long         2.134e+02   74.0773 2.343e+00      2.343e+00
## Lat         -2.640e+01   30.8640 9.760e-01      7.978e-01
## Elev        -1.318e-01    0.1115 3.526e-03      3.252e-03
## DistArabian -1.189e+02   72.4791 2.292e+00      2.055e+00
## DistBay      8.476e+01   37.3607 1.181e+00      1.181e+00
## 
## 2. Quantiles for each variable:
## 
##                   2.5%        25%        50%        75%      97.5%
## (Intercept) -2.623e+04 -1.860e+04 -1.453e+04 -1.086e+04 -3.137e+03
## Long         6.646e+01  1.648e+02  2.109e+02  2.627e+02  3.588e+02
## Lat         -8.530e+01 -4.656e+01 -2.616e+01 -5.374e+00  3.352e+01
## Elev        -3.412e-01 -2.104e-01 -1.342e-01 -5.424e-02  8.596e-02
## DistArabian -2.618e+02 -1.684e+02 -1.184e+02 -6.977e+01  2.473e+01
## DistBay      9.971e+00  6.105e+01  8.561e+01  1.106e+02  1.565e+02
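
The quantiles above are equal-tailed; as an optional complement, 95% highest posterior density intervals are available through coda. A minimal sketch:

# 95% HPD intervals for the regression coefficients
print(HPDinterval(bat.fm$p.beta.recover.samples, prob = 0.95))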

Plot the precipitation posterior PDF, the predicted precipitation, and its standard error

## Precipitation posterior PDF
library(parallel)

# Read the high-resolution elevation grid
grid_data <- read.table("data/india-grid-topo.txt", header=FALSE)
colnames(grid_data) <- c("Longitude", "Latitude", "Elevation")

lon_hr <- grid_data[,1]
lat_hr <- grid_data[,2]
elev_hr <- grid_data[,3]
xps <- cbind(lon_hr, lat_hr)

n_beta <- nrow(beta)
y <- matrix(NA, nrow = nrow(grid_data), ncol = n_beta)
yse <- matrix(NA, nrow = nrow(grid_data), ncol = n_beta)

# Set up parallel cluster
n.cores <- max(1, detectCores() - 2)
cl <- makeCluster(n.cores)
# Load fields package and export variables to workers
invisible(clusterEvalQ(cl, library(fields)))
clusterExport(cl, varlist = c("xs", "xps", "theta", "beta", "glmfit",
                              "lon_hr", "lat_hr", "elev_hr", "n_beta"),
              envir = environment())

# Run parallel Kriging and prediction for each posterior draw
# (theta columns: 1 = sigma.sq, 2 = tau.sq, 3 = phi)
results <- parLapply(cl, 1:n_beta, function(i) {
  # Krige the GLM residuals using the i-th draw of the covariance parameters
  zz <- Krig(xs, glmfit$residuals, sigma = theta[i,1], theta = theta[i,3], m = 1, tau2 = theta[i,2])
  y2 <- predict(zz, x = xps, drop.Z = TRUE)
  yse_i <- predictSE(zz, x = xps, drop.Z = TRUE)
  # Trend from the regression coefficients; the distance covariates are omitted
  # because the high-resolution grid carries only longitude, latitude, and elevation
  y1 <- beta[i,1] + beta[i,2]*lon_hr + beta[i,3]*lat_hr + beta[i,4]*elev_hr
  y_i <- y1 + y2
  list(y = y_i, yse = yse_i)
})
  
# Stop the cluster now that all posterior draws have been processed
stopCluster(cl)

# Combine worker results into the prediction and standard-error matrices
for (i in 1:n_beta) {
  y[,i] <- results[[i]]$y
  yse[,i] <- results[[i]]$yse
}
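
# Optional sanity check (a sketch): the assembled matrices should be
# (grid points) x (retained draws) with no NAs left over from the workers.
stopifnot(all(dim(y) == c(nrow(grid_data), n_beta)), !anyNA(y), !anyNA(yse))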


mean.y <- apply(y, 1, FUN = mean)      # Posterior mean prediction at each grid point
mean.yse <- apply(yse, 1, FUN = mean)  # Posterior mean standard error at each grid point

# Plot histogram of posterior predictions
g1 <- ggplot() + geom_histogram(aes(as.vector(y)), fill = "gray50", col = "black") +
  labs(title = "1-Histogram of Precipitation Posterior", x = "Predictions") +
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
print(g1)

Spatially map the model estimates and standard error on the high-resolution grid

# Read the Rajeevan grid defining India's area
rajeevan_grid = read.table("data/2b-Rajeevan_grid_with_distances.txt", header=TRUE)
colnames(rajeevan_grid)[1:3] <- c("Longitude", "Latitude", "Elevation")


# Perform an inner join to retain only points within India's defined region
india_grid = merge(rajeevan_grid, grid_data, by = c("Longitude", "Latitude", "Elevation"))

# Use only the rows in 'grid_data' that match the India mask
matched_idx <- match(
  paste(india_grid$Longitude, india_grid$Latitude, india_grid$Elevation),
  paste(grid_data$Longitude, grid_data$Latitude, grid_data$Elevation)
)

# Add predicted results from the parallel Kriging
india_grid$PredictedPrecip <- mean.y[matched_idx] * 10   # Convert to mm
india_grid$StandardError   <- mean.yse[matched_idx] * 10  # Convert to mm
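
# Optional check (a sketch): every masked grid point should have matched a row
# of the high-resolution grid; any NAs here indicate unmatched coordinates.
cat("Unmatched grid points:", sum(is.na(matched_idx)), "\n")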


# Plot estimates and standard error on high-resolution grid
#### Define Color Palettes
prec_palette <- hcl.colors(10, palette = "Zissou 1")
se_palette <- hcl.colors(10, palette = "Inferno")

#### Plot 1: Predicted Precipitation Map
g1 <- ggplot(india_grid) +
  geom_tile(aes(x = Longitude, y = Latitude, fill = PredictedPrecip)) +
  scale_fill_gradientn(name = "Precipitation (mm)", colours = prec_palette, na.value = "white") +
  theme_bw() +
  coord_fixed() +
  ggtitle("Spatial Distribution of Predicted Precipitation")

#### Plot 2: Standard Error Map
g2 <- ggplot(india_grid) +
  geom_tile(aes(x = Longitude, y = Latitude, fill = StandardError)) +
  scale_fill_gradientn(name = "Std. Error (mm)", colours = se_palette, na.value = "white") +
  theme_bw() +
  coord_fixed() +
  ggtitle("Spatial Distribution of Prediction Standard Error")

#### Save Plots
if (save) {
  ggsave("1-Predicted_Precipitation_Map.pdf", g1, width=8, height=6)
  ggsave("1-Standard_Error_Map.pdf", g2, width=8, height=6)
}

#### Display Plots
print(g1)
print(g2)