For this problem, the first level of the hierarchical model is a Binomial distribution (i.e., logistic regression).
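Concretely, with $p_i$ the probability that site $i$ exceeds the precipitation threshold, the first level is $Y_i \mid p_i \sim \mathrm{Bernoulli}(p_i)$ with $\mathrm{logit}(p_i) = \mathbf{x}_i^{\top}\boldsymbol{\beta} + w(\mathbf{s}_i)$; the second level places a mean-zero Gaussian-process prior with exponential covariance on the spatial effect $w(\cdot)$.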
#### Clear Memory and Set Options
rm(list=ls())
options(warn=-1)
save <- FALSE # Change to TRUE if you want to save the plot
# Set working directory
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(script_dir)
source("HW2_Library.R")
# Packages used below (HW2_Library.R may already attach these, in which case
# the calls are harmless no-ops)
library(geoR)
library(spBayes)
library(fields)
library(ggplot2)
climatol_data <- read.table("data/climatol_ann.txt", header=FALSE)
names(climatol_data) <- c("Longitude", "Latitude", "Elevation", "AnnualPrecipitation")
distance_data <- read.table("data/2b-climatol_ann_with_distances.txt", header=TRUE)
#### Rename columns in distance_data to match climatol_data
names(distance_data) <- c("Longitude", "Latitude", "Elevation", "Dist_ArabianSea", "Dist_BayOfBengal")
#### Merge the two datasets
merged_data <- merge(climatol_data, distance_data, by=c("Longitude", "Latitude", "Elevation"))
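A quick defensive check, not in the original script: if the distance file covers every climatology site, the inner join should not drop any rows.
stopifnot(nrow(merged_data) == nrow(climatol_data)) # merge should keep all sites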
prec_obs <- merged_data$AnnualPrecipitation # Extract precipitation column
#### Create Design Matrix
X <- merged_data[, c("Longitude", "Latitude", "Elevation", "Dist_ArabianSea", "Dist_BayOfBengal")]
names(X) <- c("Long", "Lat", "Elev", "DistArabian", "DistBay")
lon <- X[,1]
lat <- X[,2]
ele <- X[,3]
dista <- X[,4]
distb <- X[,5]
precip <- prec_obs/10 # convert from mm to cm
xs = cbind(lon, lat)
## Categorize precipitation into a binary variable
q = 150 # threshold (cm): 1 if annual precipitation >= q, else 0
prec_bin = as.numeric(precip >= q)
glmfit = GLM_fit(prec_bin, X, family = "binomial")
## [1] "Results of AIC for bestfit GLM"
## glm_val
## logit 167.5007
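GLM_fit is defined in HW2_Library.R, which is not reproduced here. Judging from the printed AIC table and the `binomial(link = links[which.min(glm_val)])` call in the summary below, it appears to compare candidate link functions by AIC and to drop weak covariates (Lat and Elev are absent from the coefficients). A minimal sketch under those assumptions; the function name, candidate links, and the use of step() are guesses, and the real helper may differ:
# Hypothetical reconstruction of GLM_fit: stepwise-AIC fit under each
# candidate link, returning the best-link, best-subset model
GLM_fit_sketch <- function(y, X, family = "binomial") {
  xx <- data.frame(prec_obs = y, X)
  links <- c("logit", "probit", "cloglog")
  fits <- lapply(links, function(l)
    step(glm(prec_obs ~ ., family = binomial(link = l), data = xx), trace = 0))
  glm_val <- sapply(fits, AIC)
  names(glm_val) <- links
  print("Results of AIC for bestfit GLM")
  print(cbind(glm_val))
  fits[[which.min(glm_val)]]
}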
if (save) {
output_file <- "3-glmfit_summary.txt"
sink(output_file)
}
print(summary(glmfit))
##
## Call:
## glm(formula = prec_obs ~ ., family = binomial(link = links[which.min(glm_val)]),
## data = xx)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -96.6420 27.8467 -3.471 0.000519 ***
## Long 1.2479 0.3627 3.440 0.000581 ***
## DistArabian -0.8949 0.2744 -3.261 0.001111 **
## DistBay 0.4106 0.2038 2.015 0.043908 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 204.82 on 245 degrees of freedom
## Residual deviance: 159.50 on 242 degrees of freedom
## AIC: 167.5
##
## Number of Fisher Scoring iterations: 6
if (save) {
sink()
}
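As a quick in-sample sanity check, not part of the original script (and assuming GLM_fit returns an ordinary fitted glm object, as the summary above suggests), the fitted exceedance probabilities can be tabulated against the binary observations:
# Confusion table at a 0.5 cutoff; in-sample, so this is optimistic
table(observed = prec_bin, predicted = as.numeric(fitted(glmfit) > 0.5))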
# GLM residuals as a geodata object: data in column 1, coordinates in columns 2:3
geod = as.geodata(cbind(glmfit$residuals, lon, lat), coords.col = 2:3, data.col = 1)
vg = variog(geod, breaks = seq(50, 10000, 50))
## variog: computing omnidirectional variogram
# Rough hyperparameter guesses from the empirical variogram: the median
# semivariance as a sill (sigma.sq) guess and the median lag distance as a
# range (phi) guess, with wide bounds for the priors below
sigma.sq = median(vg$v)
sigma.sq.lo = 0.25*sigma.sq
sigma.sq.hi = 1.75*sigma.sq
phi.val = median(vg$u)
phi.lo = 0.25*phi.val
phi.hi = 1.75*phi.val
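An optional visual check, not in the original script: plot the empirical semivariogram to confirm that the sill and range guesses above are sensible.
plot(vg, main = "Empirical semivariogram of GLM residuals")
abline(h = sigma.sq, lty = 2) # sill guess (sigma.sq)
abline(v = phi.val, lty = 3)  # range guess (phi.val)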
# Number of MCMC samples to draw
n.samples = 1500
# Coordinates of observations
coords = cbind(lon, lat)
# Prior Distributions Defined
priors = list("beta.Flat",
              "phi.Unif" = c(phi.lo, phi.hi),
              "sigma.sq.IG" = c(sigma.sq.lo, sigma.sq.hi),
              "tau.sq.IG" = c(0.01, 0.001))
# Starting values for MCMC resampling
starting = list("phi" = phi.val, "sigma.sq" = sigma.sq, "tau.sq" = 1)
# Adjustment factor for MCMC sampling routine
tuning = list("phi" = 1, "sigma.sq" = 1, "tau.sq" = 1)
# Fit the Bayesian spatial linear model by Markov chain Monte Carlo (MCMC)
bat.fm = spLM(prec_bin ~ ., data = X, coords = as.matrix(X[, 1:2]),
              priors = priors, tuning = tuning, starting = starting,
              cov.model = "exponential", n.samples = n.samples,
              verbose = FALSE, n.report = 50)
# Jitter duplicated coordinates slightly so the spatial covariance matrix
# used in recovery is non-singular
dup = duplicated(bat.fm$coords); bat.fm$coords[dup] <- bat.fm$coords[dup] + 1e-3
# Recover beta and w, discarding the first third of the samples as burn-in
bat.fm = spRecover(bat.fm, start = ((1/3)*n.samples)+1, thin = 1, verbose = TRUE)
## -------------------------------------------------
## Recovering beta and w
## -------------------------------------------------
## Sampled: 99 of 1000, 9.90%
## Sampled: 199 of 1000, 19.90%
## Sampled: 299 of 1000, 29.90%
## Sampled: 399 of 1000, 39.90%
## Sampled: 499 of 1000, 49.90%
## Sampled: 599 of 1000, 59.90%
## Sampled: 699 of 1000, 69.90%
## Sampled: 799 of 1000, 79.90%
## Sampled: 899 of 1000, 89.90%
## Sampled: 999 of 1000, 99.90%
beta = bat.fm$p.beta.recover.samples   # posterior draws of the regression coefficients
theta = bat.fm$p.theta.recover.samples # posterior draws of (sigma.sq, tau.sq, phi)
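Because spRecover returns coda mcmc objects, standard MCMC diagnostics apply directly. A quick check, assuming the coda package (a dependency of spBayes) is available:
library(coda)
effectiveSize(theta) # effective sample size per covariance parameter
geweke.diag(beta)    # Geweke z-scores; |z| well above 2 flags non-convergence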
if (save) pdf("3-posterior_plots.pdf")
# Adjust plotting layout
par(mfrow=c(2,2), mar=c(4,4,2,1))
for (i in 1:6) {
dens <- density(beta[,i])
plot.ts(beta[,i], main=paste("Trace of", colnames(beta)[i]), ylab="Value", xlab="Iterations")
plot(dens, main=paste("Density of", colnames(beta)[i]), xlab=paste("N=", length(beta[,i]), "Bandwidth=", round(dens$bw, 6)))
}
par(mfrow=c(3,2), mar=c(4,4,2,1))
for (i in 1:3) {
dens <- density(theta[,i])
plot.ts(theta[,i], main=paste("Trace of", colnames(theta)[i]), ylab="Value", xlab="Iterations")
plot(dens, main=paste("Density of", colnames(theta)[i]), xlab=paste("N=", length(theta[,i]), "Bandwidth=", round(dens$bw, 4)))
}
if (save) dev.off()
summary(window(bat.fm$p.beta.recover.samples))
##
## Iterations = 1:1000
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 1000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## (Intercept) -6.034e+00 5.84933 1.850e-01 1.966e-01
## Long 8.769e-02 0.07389 2.337e-03 2.487e-03
## Lat -5.352e-02 0.03095 9.787e-04 8.755e-04
## Elev -8.258e-05 0.00011 3.478e-06 3.478e-06
## DistArabian -1.435e-02 0.07343 2.322e-03 2.263e-03
## DistBay 8.342e-02 0.03549 1.122e-03 1.188e-03
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## (Intercept) -1.723e+01 -1.027e+01 -5.883e+00 -2.1493055 5.6202159
## Long -6.001e-02 3.830e-02 8.530e-02 0.1403668 0.2295232
## Lat -1.149e-01 -7.391e-02 -5.357e-02 -0.0324948 0.0063597
## Elev -3.005e-04 -1.534e-04 -8.192e-05 -0.0000059 0.0001253
## DistArabian -1.572e-01 -6.664e-02 -1.326e-02 0.0360340 0.1316674
## DistBay 1.330e-02 6.022e-02 8.401e-02 0.1064842 0.1501446
#### Precipitation Posterior PDF
library(parallel)
# Read the high-resolution elevation grid
grid_data <- read.table("data/india-grid-topo.txt", header=FALSE)
colnames(grid_data) <- c("Longitude", "Latitude", "Elevation")
lon_hr <- grid_data[,1]
lat_hr <- grid_data[,2]
elev_hr <- grid_data[,3]
xps <- cbind(lon_hr, lat_hr)
n_beta <- nrow(beta) # number of retained posterior draws
y <- matrix(NA, nrow = nrow(grid_data), ncol = n_beta)   # predictions (logit scale)
yse <- matrix(NA, nrow = nrow(grid_data), ncol = n_beta) # prediction standard errors
# Set up parallel cluster
n.cores <- max(1, detectCores() - 2)
cl <- makeCluster(n.cores)
# Load fields package and export variables to workers
invisible(clusterEvalQ(cl, library(fields)))
clusterExport(cl, varlist = c("xs", "xps", "theta", "beta", "glmfit", "lon_hr", "lat_hr", "elev_hr", "n_beta"), envir = environment())
# Run parallel Kriging and prediction
results <- parLapply(cl, 1:n_beta, function(i) {
# Krige the GLM residuals using the i-th posterior draw of the covariance
# parameters; theta columns are (sigma.sq, tau.sq, phi) from spRecover
zz <- Krig(xs, glmfit$residuals, sigma = theta[i,1], theta = theta[i,3], m = 1, tau2 = theta[i,2])
y2 <- predict(zz, x = xps, drop.Z = TRUE)
yse_i <- predictSE(zz, x = xps, drop.Z = TRUE)
# Trend from the i-th posterior draw of beta; the distance covariates are
# omitted because the high-resolution grid provides only lon/lat/elevation
y1 <- beta[i,1] + beta[i,2]*lon_hr + beta[i,3]*lat_hr + beta[i,4]*elev_hr
y_i <- y1 + y2
list(y = y_i, yse = yse_i)
})
# Stop cluster
stopCluster(cl)
# Combine results
for (i in 1:n_beta) {
y[,i] <- results[[i]]$y
yse[,i] <- results[[i]]$yse
}
y = logit2prob(y)     # map predictions from the logit scale to probabilities
yse = logit2prob(yse) # map standard errors through the same transform
mean.y <- apply(y, 1, FUN = mean)     # posterior mean exceedance probability per grid point
mean.yse <- apply(yse, 1, FUN = mean) # posterior mean standard error per grid point
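logit2prob comes from HW2_Library.R and is presumably the standard inverse-logit, equivalent to base R's plogis; a sketch under that assumption:
# Presumed definition of logit2prob; the HW2_Library.R version may differ
logit2prob <- function(logit) {
  odds <- exp(logit)
  odds / (1 + odds)
}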
# Plot histogram of posterior predictions
g1 <- ggplot() + geom_histogram(aes(as.vector(y)), fill = "gray50", col = "black") +
labs(title = "3-Histogram of Precipitation Posterior", x = "Predicted exceedance probability") +
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
print(g1)
# grid_data (the high-resolution elevation grid) was already read above
# Read the Rajeevan grid defining India's area
rajeevan_grid = read.table("data/2b-Rajeevan_grid_with_distances.txt", header=TRUE)
colnames(rajeevan_grid)[1:3] <- c("Longitude", "Latitude", "Elevation")
# Perform an inner join to retain only points within India's defined region
india_grid = merge(rajeevan_grid, grid_data, by = c("Longitude", "Latitude", "Elevation"))
# Extract the index of points from the full prediction grid that are in the Rajeevan grid
# (assuming the order of grid_data was used in predictions)
india_index <- match(paste(india_grid$Longitude, india_grid$Latitude),
paste(grid_data$Longitude, grid_data$Latitude))
# Add predicted posterior mean and standard error from MCMC-based Kriging
india_grid$PredictedPrecip <- mean.y[india_index]
india_grid$StandardError <- mean.yse[india_index]
# Plot estimates and standard error on high-resolution grid
#### Define Color Palettes
prec_palette <- hcl.colors(10, palette = "Zissou 1")
se_palette <- hcl.colors(10, palette = "Inferno")
#### Plot 1: Predicted Exceedance Probability Map
g1 <- ggplot(india_grid) +
geom_tile(aes(x = Longitude, y = Latitude, fill = PredictedPrecip)) +
scale_fill_gradientn(name = "P(precip >= 150 cm)", colours = prec_palette, na.value = "white") +
theme_bw() +
coord_fixed() +
ggtitle("Spatial Distribution of Predicted Exceedance Probability")
#### Plot 2: Standard Error Map
g2 <- ggplot(india_grid) +
geom_tile(aes(x = Longitude, y = Latitude, fill = StandardError)) +
scale_fill_gradientn(name = "Std. Error", colours = se_palette, na.value = "white") +
theme_bw() +
coord_fixed() +
ggtitle("Spatial Distribution of Prediction Standard Error")
#### Save Plots
if (save) {
ggsave("3-Predicted_Precipitation_Map.pdf", g1, width=8, height=6)
ggsave("3-Standard_Error_Map.pdf", g2, width=8, height=6)
}
#### Display Plots
print(g1)
print(g2)