HW2_Prob2

2.Use a precipitation threshold of 150 cm across all the stations and categorize the precipitation at each station to a binary variable – 0 if it is less than the threshold, 1, otherwise. Use the suite of predictors from HW1.

Fit a ‘best’ GLM (i.e. logistic regression) with the appropriate link function using one of the objective functions. Test the model goodness using ANOVA
Estimate the function on the grid and plot the surface. Also plot the standard error.
Repeat (i) and (ii) with Local GLM

#### Clear Memory and Set Options
rm(list=ls())
options(warn=-1)
save <- TRUE  # Change to TRUE if you want to save the plot

# Set working directory (Modify if necessary)
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(script_dir)

source("HW2_Library.R")

Read data

climatol_data <- read.table("data/climatol_ann.txt", header=FALSE)
names(climatol_data) <- c("Longitude", "Latitude", "Elevation", "AnnualPrecipitation")

X <- climatol_data[,1:3] 
names(X) <- c("Long", "Lat", "Elev")

prec_obs <- climatol_data$AnnualPrecipitation  # Extract precipitation column
precip <- prec_obs/10 #convert to cm

## Categorize precipitation to a binary variable
q = 150
prec_bin = rep(1, length(precip))

for (i in 1:length(precip)) {
  if (precip[i] < q) {
    prec_bin[i] = 0
  }
}

(i) Fit a ‘best’ GLM (i.e. logistic regression) with the appropriate link function using one of the objective functions. Test the model goodness using ANOVA

Fit GLM

glmfit = GLM_fit(prec_bin, X, family = "binomial")

## [1] "Results of AIC for bestfit GLM"
##       glm_val
## logit 170.369

print(summary(glmfit))

## 
## Call:
## glm(formula = prec_obs ~ ., family = binomial(link = links[which.min(glm_val)]), 
##     data = xx)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -17.37216    3.22415  -5.388 7.12e-08 ***
## Long          0.23332    0.04455   5.238 1.63e-07 ***
## Lat          -0.15704    0.04352  -3.609 0.000308 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 204.82  on 245  degrees of freedom
## Residual deviance: 164.37  on 243  degrees of freedom
## AIC: 170.37
## 
## Number of Fisher Scoring iterations: 6

Plot observed vs. predicted precipitation

prec_pred = glmfit$fitted.values
par(mfrow=c(1,1))
lim=range(prec_bin,prec_pred)
plot(prec_bin,prec_pred,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)

Test model goodness using ANOVA

anova(glmfit)

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: prec_obs
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                   245     204.82              
## Long  1   25.797       244     179.03 3.793e-07 ***
## Lat   1   14.659       243     164.37 0.0001289 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(ii) Estimate the function on the grid and plot the surface. Also plot the standard error

# Read the high-resolution elevation grid

  grid_data <- read.table("data/india-grid-topo.txt", header=FALSE)
  colnames(grid_data) <- c("Longitude", "Latitude", "Elevation")

# Read the Rajeevan grid defining India's area
rajeevan_grid = read.table("data/2b-Rajeevan_grid_with_distances.txt", header=TRUE)
colnames(rajeevan_grid)[1:3] <- c("Longitude", "Latitude", "Elevation")


# Perform an inner join to retain only points within India's defined region
india_grid = merge(rajeevan_grid, grid_data, by = c("Longitude", "Latitude", "Elevation"))
covariate_names <- c("Longitude", "Latitude", "Elevation")
X_grid <- india_grid[, covariate_names, drop=FALSE]
colnames(X_grid) <- c("Long","Lat","Elev")



# Predict precipitation probabilities using the fitted model
india_grid$PredictedPrecip <- predict(glmfit, newdata = X_grid, type = "response")

# Estimate standard error of predictions
pred_se <- predict(glmfit, newdata = X_grid, se.fit = TRUE, type = "response")
india_grid$StandardError <- pred_se$se.fit

# Plot estimates and standard error on high-resolution grid


#### Define Color Palettes
  prec_palette <- hcl.colors(10, palette = "Zissou 1")  
  se_palette <- hcl.colors(10, palette = "Inferno")  

  #### Plot 1: Predicted Precipitation Map
  g1 <- ggplot(india_grid) +
    geom_tile(aes(x = Longitude, y = Latitude, fill = PredictedPrecip)) +  # Use "Long" and "Lat"
    scale_fill_gradientn(name = "Precipitation", colours = prec_palette, na.value = "white") +
    theme_bw() +
    coord_fixed() +
    ggtitle("Spatial Distribution of Predicted Precipitation")

  #### Plot 2: Standard Error Map
  g2 <- ggplot(india_grid) +
    geom_tile(aes(x = Longitude, y = Latitude, fill = StandardError)) +  # Use "Long" and "Lat"
    scale_fill_gradientn(name = "Std. Error", colours = se_palette, na.value = "white") +
    theme_bw() +
    coord_fixed() +
    ggtitle("Spatial Distribution of Prediction Standard Error")

  #### Save Plots
  if (save) {
    ggsave("6-Predicted_Precipitation_Map.pdf", g1, width=8, height=6)
    ggsave("6-Standard_Error_Map.pdf", g2, width=8, height=6)
  }

  #### Display Plots
  print(g1)

  print(g2)

(iii) Repeat (i) and (ii) with Local GLM

Fit a local polynomial using the appropriate link function (i.e. Local GLM)

links = c("logit")
N = length(prec_bin)
combs = leaps(X, prec_bin, nbest=25)     #  Get up to 25 combinations for each
# number of predictors
combos = combs$which
ncombos = length(combos[,1])
gcvs_m=1:ncombos
glm_gcv=vector(length = length(links))
bestcombo=vector(length = length(links))
for(j in 1:length(links)) { sprintf("j",j)
  
  for(i in 1:ncombos) {
    xx = X[,combos[i,]]
    xx=as.data.frame(xx)
    zz=locpoly_fit(prec_bin, xx,family="binomial", link=links[j], glm=TRUE,plot=TRUE)
    gcvs_m[i] = zz$call$gcv
  }
  # Test using GCV objective function
  glm_gcv[j]=min(gcvs_m)
  bestcombo[j]=which.min(gcvs_m)
}
X_best = data.frame(X[,combos[bestcombo[which.min(glm_gcv)],]])
names(X_best) = names(X[combos[bestcombo[which.min(glm_gcv)],]])
locfit = locpoly_fit(prec_bin, X_best,family="binomial", link=links[which.min(glm_gcv)], glm=TRUE,plot=TRUE)
locfit$combo=combos[bestcombo[which.min(glm_gcv)],]

Xnew=X[,locfit$combo]
summary(locfit)

## Estimation type: Local Likelihood - Binomial 
## 
## Call:
## locfit(formula = prec_obs ~ ., data = X, alpha = 0.75, maxk = 10000, 
##     deg = 2, kern = "bisq", scale = T, family = "binomial", link = "logit", 
##     gcv = 0.190198530525885)
## 
## Number of data points:  246 
## Independent variables:  Long Lat Elev 
## Evaluation structure: Rectangular Tree 
## Number of evaluation points:  99 
## Degree of fit:  2 
## Fitted Degrees of Freedom:  28.626

Plot observed vs. predicted precipitation

prec_pred = predict(locfit, Xnew, se.fit = T)
par(mfrow=c(1,1))
lim=range(prec_bin,prec_pred$fit)
plot(prec_bin,prec_pred$fit,xlab="Actual Precipitation",ylab="Estimated Precipitation",main="Actual vs Estimated Precipitation", xlim=lim, ylim=lim)

Test model goodness using ANOVA

loc_Ftest(prec_bin,prec_pred, Xnew, locfit)

## [1] "F-test:"
## [1] "Reject the Null because F(local poly) = 4.07 > 1.46 = F(linear model)."

Estimate the function on the grid and plot the surface. Also plot the standard error

# Read the high-resolution elevation grid

  grid_data <- read.table("data/india-grid-topo.txt", header=FALSE)
  colnames(grid_data) <- c("Longitude", "Latitude", "Elevation")

# Read the Rajeevan grid defining India's area
rajeevan_grid = read.table("data/2b-Rajeevan_grid_with_distances.txt", header=TRUE)
colnames(rajeevan_grid)[1:3] <- c("Longitude", "Latitude", "Elevation")


# Perform an inner join to retain only points within India's defined region
india_grid = merge(rajeevan_grid, grid_data, by = c("Longitude", "Latitude", "Elevation"))
covariate_names <- c("Longitude", "Latitude", "Elevation")
X_grid <- india_grid[, covariate_names, drop=FALSE]
colnames(X_grid) <- c("Long","Lat","Elev")



# Predict precipitation probabilities using the fitted model
india_grid$PredictedPrecip <- predict(locfit, newdata = X_grid, type = "response")

# Estimate standard error of predictions
pred_se <- predict(locfit, newdata = X_grid, se.fit = TRUE, type = "response")
india_grid$StandardError <- pred_se$se.fit

# Plot estimates and standard error on high-resolution grid


#### Define Color Palettes
  prec_palette <- hcl.colors(10, palette = "Zissou 1")  
  se_palette <- hcl.colors(10, palette = "Inferno")  

  #### Plot 1: Predicted Precipitation Map
  g1 <- ggplot(india_grid) +
    geom_tile(aes(x = Longitude, y = Latitude, fill = PredictedPrecip)) +  # Use "Long" and "Lat"
    scale_fill_gradientn(name = "Precipitation", colours = prec_palette, na.value = "white") +
    theme_bw() +
    coord_fixed() +
    ggtitle("Spatial Distribution of Predicted Precipitation")

  #### Plot 2: Standard Error Map
  g2 <- ggplot(india_grid) +
    geom_tile(aes(x = Longitude, y = Latitude, fill = StandardError)) +  # Use "Long" and "Lat"
    scale_fill_gradientn(name = "Std. Error", colours = se_palette, na.value = "white") +
    theme_bw() +
    coord_fixed() +
    ggtitle("Spatial Distribution of Prediction Standard Error")

  #### Save Plots
  if (save) {
    ggsave("2-Predicted_Precipitation_Map.pdf", g1, width=8, height=6)
    ggsave("2-Standard_Error_Map.pdf", g2, width=8, height=6)
  }

  #### Display Plots
  print(g1)

  print(g2)

HW2_Prob2

Haiying Peng

2025-03-12

2.Use a precipitation threshold of 150 cm across all the stations and categorize the precipitation at each station to a binary variable – 0 if it is less than the threshold, 1, otherwise. Use the suite of predictors from HW1.

Read data

(i) Fit a ‘best’ GLM (i.e. logistic regression) with the appropriate link function using one of the objective functions. Test the model goodness using ANOVA

Plot observed vs. predicted precipitation

Test model goodness using ANOVA

(ii) Estimate the function on the grid and plot the surface. Also plot the standard error

(iii) Repeat (i) and (ii) with Local GLM

Fit a local polynomial using the appropriate link function (i.e. Local GLM)

Plot observed vs. predicted precipitation

Test model goodness using ANOVA

Estimate the function on the grid and plot the surface. Also plot the standard error