1. Problem 7 from HW1 with a Bayesian Hierarchical Spatial Model

[In the Bayesian models, plot the posterior histogram/PDF of the parameters and spatial maps of the posterior mean and standard error.]

#### Clear Memory and Set Options
rm(list=ls())
options(warn=-1)
save <- FALSE  # Change to TRUE to save the plots

# Set working directory (Modify if necessary)
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(script_dir)

source("HW2_Library.R")
#### Read Data
climatol_data <- read.table("data/climatol_ann.txt", header=FALSE)
names(climatol_data) <- c("Longitude", "Latitude", "Elevation", "AnnualPrecipitation")

distance_data <- read.table("data/2b-climatol_ann_with_distances.txt", header=TRUE)

#### Rename columns in distance_data to match climatol_data
names(distance_data) <- c("Longitude", "Latitude", "Elevation", "Dist_ArabianSea", "Dist_BayOfBengal")

#### Merge the two datasets
merged_data <- merge(climatol_data, distance_data, by=c("Longitude", "Latitude", "Elevation"))
prec_obs <- merged_data$AnnualPrecipitation  # Extract precipitation column

#### Create Design Matrix
X <- merged_data[, c("Longitude", "Latitude", "Elevation", "Dist_ArabianSea", "Dist_BayOfBengal")]
names(X) <- c("Long", "Lat", "Elev", "DistArabian", "DistBay")

lon <- X[,1]
lat <- X[,2]
ele <- X[,3]
dista <- X[,4]
distb <- X[,5]

precip <- prec_obs/10  # Convert from mm to cm
xs = cbind(lon, lat)
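
# Optional consistency check (a sketch, not part of the assignment): the GLM
# summary below reports 245 null degrees of freedom, i.e. 246 stations should
# survive the merge.
cat("Stations after merge:", nrow(merged_data), "\n")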

Fit GLM

glmfit = glm(prec_obs ~ ., data = X, family = Gamma(link = "log"))

if (save) {
  output_file <- "1-glmfit_summary.txt"
  sink(output_file)
}

print(summary(glmfit))
## 
## Call:
## glm(formula = prec_obs ~ ., family = Gamma(link = "log"), data = X)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -9.431e+00  5.134e+00  -1.837 0.067422 .  
## Long         2.175e-01  6.490e-02   3.351 0.000937 ***
## Lat          8.008e-03  2.726e-02   0.294 0.769193    
## Elev        -1.032e-04  9.631e-05  -1.072 0.284810    
## DistArabian -1.578e-01  6.420e-02  -2.459 0.014650 *  
## DistBay      5.025e-02  3.161e-02   1.590 0.113210    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.2055282)
## 
##     Null deviance: 61.286  on 245  degrees of freedom
## Residual deviance: 36.151  on 240  degrees of freedom
## AIC: 3646.9
## 
## Number of Fisher Scoring iterations: 14
if (save) {
  sink()
}

Fit variogram and perform MCMC

geod = as.geodata(cbind(glmfit$residuals, lon, lat), data.col = 1)
vg = variog(geod, breaks = seq(50, 10000, 50))
## variog: computing omnidirectional variogram
sigma.sq = median(vg$v)
sigma.sq.lo = 0.25*sigma.sq
sigma.sq.hi = 1.75*sigma.sq
phi.val = median(vg$u)
phi.lo = 0.25*phi.val
phi.hi = 1.75*phi.val
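
# Optional visual check of these heuristics (a sketch, not required for the
# fit): plot the empirical semivariogram and mark the starting sill and range.
plot(vg, main = "Empirical semivariogram of GLM residuals")
abline(h = sigma.sq, lty = 2)  # starting sill
abline(v = phi.val, lty = 3)   # starting range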
# Number of MCMC samples
n.samples = 1500
# Coordinates of observations
coords = cbind(lon, lat)
# Prior distributions (spBayes naming: flat prior on beta, uniform on phi,
# inverse-gamma on sigma.sq and tau.sq)
priors = list("beta.Flat", "phi.Unif" = c(phi.lo, phi.hi),
              "sigma.sq.IG" = c(sigma.sq.lo, sigma.sq.hi),
              "tau.sq.IG" = c(1, 0.01))
# Starting values for the MCMC sampler
starting = list("phi" = phi.val, "sigma.sq" = sigma.sq, "tau.sq" = 1)
# Metropolis proposal (tuning) variances for the covariance parameters
tuning = list("phi" = 1, "sigma.sq" = 1, "tau.sq" = 1)

# Fit the Bayesian hierarchical spatial model via Markov chain Monte Carlo (MCMC)
bat.fm = spLM(prec_obs ~ ., data = X, coords = as.matrix(X[, 1:2]),
              priors = priors, tuning = tuning, starting = starting,
              cov.model = "exponential", n.samples = n.samples,
              verbose = F, n.report = 50)
# Jitter duplicated coordinates slightly so spRecover does not fail on exact ties
dup = duplicated(bat.fm$coords); bat.fm$coords[dup] <- bat.fm$coords[dup] + 1e-3
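
# Optional (a sketch): Metropolis acceptance percentages reported by spLM;
# rates far from roughly 25-45% suggest revisiting the 'tuning' variances above.
print(bat.fm$acceptance)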
# Discard the first third of the samples as burn-in and recover beta and w
bat.fm = spRecover(bat.fm, start = ((1/3)*n.samples) + 1, thin = 1, verbose = T)
## -------------------------------------------------
##      Recovering beta and w
## -------------------------------------------------
## Sampled: 99 of 1000, 9.90%
## Sampled: 199 of 1000, 19.90%
## Sampled: 299 of 1000, 29.90%
## Sampled: 399 of 1000, 39.90%
## Sampled: 499 of 1000, 49.90%
## Sampled: 599 of 1000, 59.90%
## Sampled: 699 of 1000, 69.90%
## Sampled: 799 of 1000, 79.90%
## Sampled: 899 of 1000, 89.90%
## Sampled: 999 of 1000, 99.90%
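
Before plotting, an optional mixing check: the recovered samples are coda mcmc objects (coda is a dependency of spBayes), so standard diagnostics apply directly. A minimal sketch:

# Effective sample sizes and Geweke z-scores for the covariance parameters
print(effectiveSize(bat.fm$p.theta.recover.samples))
print(geweke.diag(bat.fm$p.theta.recover.samples))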

Plot Posterior PDFs

beta = bat.fm$p.beta.recover.samples    # posterior draws of the regression coefficients
theta = bat.fm$p.theta.recover.samples  # posterior draws of sigma.sq, tau.sq, phi


if (save) pdf("1-posterior_plots.pdf")

# Adjust plotting layout
par(mfrow = c(2, 2), mar = c(4, 4, 2, 1))
for (i in 1:6) {
    dens <- density(beta[, i])
    plot.ts(beta[, i], main = paste("Trace of", colnames(beta)[i]),
            ylab = "Value", xlab = "Iterations")
    plot(dens, main = paste("Density of", colnames(beta)[i]),
         xlab = paste("N =", length(beta[, i]), "Bandwidth =", round(dens$bw, 2)))
}

par(mfrow = c(3, 2), mar = c(4, 4, 2, 1))
for (i in 1:3) {
    dens <- density(theta[, i])
    plot.ts(theta[, i], main = paste("Trace of", colnames(theta)[i]),
            ylab = "Value", xlab = "Iterations")
    plot(dens, main = paste("Density of", colnames(theta)[i]),
         xlab = paste("N =", length(theta[, i]), "Bandwidth =", round(dens$bw, 2)))
}

if (save) dev.off()
summary(window(bat.fm$p.beta.recover.samples))
## 
## Iterations = 1:1000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 1000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                   Mean        SD  Naive SE Time-series SE
## (Intercept) -1.474e+04 5855.7716 1.852e+02      1.852e+02
## Long         2.134e+02   74.0773 2.343e+00      2.343e+00
## Lat         -2.640e+01   30.8640 9.760e-01      7.978e-01
## Elev        -1.318e-01    0.1115 3.526e-03      3.252e-03
## DistArabian -1.189e+02   72.4791 2.292e+00      2.055e+00
## DistBay      8.476e+01   37.3607 1.181e+00      1.181e+00
## 
## 2. Quantiles for each variable:
## 
##                   2.5%        25%        50%        75%      97.5%
## (Intercept) -2.623e+04 -1.860e+04 -1.453e+04 -1.086e+04 -3.137e+03
## Long         6.646e+01  1.648e+02  2.109e+02  2.627e+02  3.588e+02
## Lat         -8.530e+01 -4.656e+01 -2.616e+01 -5.374e+00  3.352e+01
## Elev        -3.412e-01 -2.104e-01 -1.342e-01 -5.424e-02  8.596e-02
## DistArabian -2.618e+02 -1.684e+02 -1.184e+02 -6.977e+01  2.473e+01
## DistBay      9.971e+00  6.105e+01  8.561e+01  1.106e+02  1.565e+02
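
The quantiles above are equal-tailed; as an optional complement, 95% highest posterior density intervals are available through coda. A minimal sketch:

# 95% HPD intervals for the regression coefficients
print(HPDinterval(bat.fm$p.beta.recover.samples, prob = 0.95))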

Plot the precipitation posterior PDF, the predicted precipitation, and its standard error

## Precipitation posterior PDF
library(parallel)

# Read the high-resolution elevation grid
grid_data <- read.table("data/india-grid-topo.txt", header=FALSE)
colnames(grid_data) <- c("Longitude", "Latitude", "Elevation")

lon_hr <- grid_data[,1]
lat_hr <- grid_data[,2]
elev_hr <- grid_data[,3]
xps <- cbind(lon_hr, lat_hr)

n_beta <- nrow(beta)
y <- matrix(NA, nrow = nrow(grid_data), ncol = n_beta)
yse <- matrix(NA, nrow = nrow(grid_data), ncol = n_beta)

# Set up parallel cluster
n.cores <- max(1, detectCores() - 2)
cl <- makeCluster(n.cores)
# Load fields package and export variables to workers
invisible(clusterEvalQ(cl, library(fields)))
clusterExport(cl, varlist = c("xs", "xps", "theta", "beta", "glmfit",
                              "lon_hr", "lat_hr", "elev_hr", "n_beta"),
              envir = environment())

# Run parallel Kriging and prediction for each posterior draw
# (theta columns: 1 = sigma.sq, 2 = tau.sq, 3 = phi)
results <- parLapply(cl, 1:n_beta, function(i) {
  # Krige the GLM residuals using the i-th draw of the covariance parameters
  zz <- Krig(xs, glmfit$residuals, sigma = theta[i,1], theta = theta[i,3], m = 1, tau2 = theta[i,2])
  y2 <- predict(zz, x = xps, drop.Z = TRUE)
  yse_i <- predictSE(zz, x = xps, drop.Z = TRUE)
  # Trend from the regression coefficients; the distance covariates are omitted
  # because the high-resolution grid carries only longitude, latitude, and elevation
  y1 <- beta[i,1] + beta[i,2]*lon_hr + beta[i,3]*lat_hr + beta[i,4]*elev_hr
  y_i <- y1 + y2
  list(y = y_i, yse = yse_i)
})
  
# Stop the cluster now that all posterior draws have been processed
stopCluster(cl)

# Combine worker results into the prediction and standard-error matrices
for (i in 1:n_beta) {
  y[,i] <- results[[i]]$y
  yse[,i] <- results[[i]]$yse
}
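
# Optional sanity check (a sketch): the assembled matrices should be
# (grid points) x (retained draws) with no NAs left over from the workers.
stopifnot(all(dim(y) == c(nrow(grid_data), n_beta)), !anyNA(y), !anyNA(yse))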


mean.y <- apply(y, 1, FUN = mean)      # Posterior mean prediction at each grid point
mean.yse <- apply(yse, 1, FUN = mean)  # Posterior mean standard error at each grid point

# Plot histogram of posterior predictions
g1 <- ggplot() + geom_histogram(aes(as.vector(y)), fill = "gray50", col = "black") +
  labs(title = "1-Histogram of Precipitation Posterior", x = "Predictions") +
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
print(g1)

Spatially map the model estimates and standard error on the high-resolution grid

# Read the Rajeevan grid defining India's area
rajeevan_grid = read.table("data/2b-Rajeevan_grid_with_distances.txt", header=TRUE)
colnames(rajeevan_grid)[1:3] <- c("Longitude", "Latitude", "Elevation")


# Perform an inner join to retain only points within India's defined region
india_grid = merge(rajeevan_grid, grid_data, by = c("Longitude", "Latitude", "Elevation"))

# Use only the rows in 'grid_data' that match the India mask
matched_idx <- match(
  paste(india_grid$Longitude, india_grid$Latitude, india_grid$Elevation),
  paste(grid_data$Longitude, grid_data$Latitude, grid_data$Elevation)
)

# Add predicted results from the parallel Kriging
india_grid$PredictedPrecip <- mean.y[matched_idx] * 10   # Convert to mm
india_grid$StandardError   <- mean.yse[matched_idx] * 10  # Convert to mm
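
# Optional check (a sketch): every masked grid point should have matched a row
# of the high-resolution grid; any NAs here indicate unmatched coordinates.
cat("Unmatched grid points:", sum(is.na(matched_idx)), "\n")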


# Plot estimates and standard error on high-resolution grid
#### Define Color Palettes
prec_palette <- hcl.colors(10, palette = "Zissou 1")
se_palette <- hcl.colors(10, palette = "Inferno")

#### Plot 1: Predicted Precipitation Map
g1 <- ggplot(india_grid) +
  geom_tile(aes(x = Longitude, y = Latitude, fill = PredictedPrecip)) +
  scale_fill_gradientn(name = "Precipitation (mm)", colours = prec_palette, na.value = "white") +
  theme_bw() +
  coord_fixed() +
  ggtitle("Spatial Distribution of Predicted Precipitation")

#### Plot 2: Standard Error Map
g2 <- ggplot(india_grid) +
  geom_tile(aes(x = Longitude, y = Latitude, fill = StandardError)) +
  scale_fill_gradientn(name = "Std. Error (mm)", colours = se_palette, na.value = "white") +
  theme_bw() +
  coord_fixed() +
  ggtitle("Spatial Distribution of Prediction Standard Error")

#### Save Plots
if (save) {
  ggsave("1-Predicted_Precipitation_Map.pdf", g1, width=8, height=6)
  ggsave("1-Standard_Error_Map.pdf", g2, width=8, height=6)
}

#### Display Plots
print(g1)
print(g2)