HW3_Prob4

4. Modeling Nonstationary Time Series - HMM & Forecast and simulation
Another way to model/simulate a time series is using HMM. For the spring streamflow in problem 1 for April \(1^{st}\) lead time fit an HMM and make the forecast. The steps are as follows:

Fit a best HMM model for the spring streamflow
Fit a best GLM (mostly logistic regression) to the state sequence as a function of predictors for April 1 and the state from the previous year - i.e., \[ S_t = f(S_t - 1, x_t) \]
Using this best GLM, for each year, t, based on the predictor vector obtain the probabilities of the states (i.e. the distribution of the HMM)
Using these state probabilities, simulate flow from the corresponding state PDFs - to obtain an ensemble
Plot the historic flow vs ensemble mean forecast along with the 1:1 line for visual comparison. Compute skill scores - RPSS and correlation between the historic flow and the mean of the ensemble forecast.
Repeat for March \(1^{st}\) lead time

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
#### Initialize Graphics
graphics.off()
.pardefault <- par()
set.seed(5)

#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 03/"
setwd(mainDir)
suppressPackageStartupMessages(source("hw3_library.R"))
source("hw3_library.R")

Read data

swe = as.matrix(readRDS("./data/prob1/springSWE_inches.Rds"))   # Snow water equivalent @ Mar 1 & Apr 1
clim = as.matrix(readRDS("./data/prob1/winterMeanClimateIndices.Rds")) # AMO, PDO, ENSO
data = as.matrix(readRDS("./data/prob1/springFlows.Rds"))    # Spring flow @ Lees Ferry
years = swe[,1]
clim = clim[-which(clim[,1] < years[1]),]     # only include years with full dataset
data = data[-which(data[,1] < years[1]),]     # only include years with full dataset

Compute terciles of observed historical spring flow

obs = as.data.frame(data)
q_maf = obs$q_maf
terc = quantile(q_maf, c(1/3, 2/3))
terc_obs = rep(NA, length(q_maf))
for (i in 1:length(q_maf)){
  if ( q_maf[i] < terc[1]){
    terc_obs[i]=1
  } else if ( q_maf[i] >= terc[1] & q_maf[i] < terc[2] ) {
    terc_obs[i]=2
  } else {
    terc_obs[i]=3
  }
}
climatology=c(1/3, 1/3, 1/3)

Fit a best HMM for the spring streamflow

family = "gamma"  # underlying distribution for hmm
stationary <- F   # use a stationary distribution of mixtures
discrete <- F
ic = "same.sd"  #c("same.sd","same.both","both.diff")
fd.name = ifelse(family == "norm", "normal", family)

x = data[,2]
aic1=c()

Obtain AIC for 1 to 3 states and choose best order

for(imodel in 1:3){
  sink(nullfile())
  m <- imodel # model order to fit
  # T.P.M.
  Pi <- Pi_init(m)
  delta <- delta_init(m)
  
  pars <- get.named.parlist(x, m, fd.name, lower=.0, ic)#,start=list(shape1=2,shape2=2))
  
  # set up the model
  hmm <- dthmm(x, Pi=Pi, delta=delta, family, pars, nonstat=!stationary, discrete = discrete)
  
  if(imodel < 2){
    hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000,posdiff=TRUE,converge =
                                      expression(diff > tol)))
  } else {
    hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000, tol = 1e-08))
  }
  sink()
  
  # get the hidden states from the fitted model
  # Global decoding
  # To get the probability of being in a state: hmm$u
  decoding <- Viterbi(hmm)
  
  # get AIC
  aic <- AIC_hmm(hmm, k=2)
  aic1 = c(aic1,aic)
  sink()
}

## Get the best order
bestorder = order(aic1)[1]
if (bestorder == 1){
  bestorder = 2
}
print(sprintf("The best HMM has %s states", bestorder))

## [1] "The best HMM has 2 states"

Fit best HMM

m <- bestorder
# T.P.M.
Pi <- Pi_init(m)
delta <- delta_init(m)

pars <- get.named.parlist(x, m, fd.name, lower=.0, ic)#,start=list(shape1=2,shape2=2))

sink(nullfile())
# set up the model
hmm <- dthmm(x, Pi=Pi, delta=delta, family, pars, nonstat=!stationary, discrete = discrete)
hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000, tol = 1e-08))
sink()

decoding <- Viterbi(hmm)
p <- ggplot_stationary_hmm(hmm,.5)
print(p)

Fit a best GLM to the state sequence as a function of predictors for April 1st and March 1st

nsim = 250
nyrs = length(years)

allY=ifelse(decoding==1, 0, 1) # need 0 or 1 for GLM_fit function. 0 means state 1, 1 means state 2
S_1=allY[-length(allY)] # previous state
Y=allY[-1]
Q_mean = matrix(NA, nrow = nyrs-1, ncol = 2)
rpss = vector("numeric", length = 2)
corr = vector("numeric", length = 2)
for (k in 3:2){
  ######### Fit Best Logistic Regression Model #######
  X = data.frame(S_1, cbind(clim[-1,-1],swe[-1,k]))
  glm_mod = GLM_fit(Y, X, family='binomial')
  print(summary(glm_mod))
  
  ######### Using best GLM, obtain probabilities of the states (i.e. distribution of the HMM) #######
  state_2prob = predict.glm(glm_mod, type='response') # recall that 0 is state 1, 1 is state 2
  state_1prob = 1-state_2prob
  state_probs = data.frame(s1=state_1prob, s2=state_2prob)
  
  ######### Simulate flow from corresponding state PDFs to obtain ensemble #######
  terc_mat=matrix(NA, nrow=nrow(state_probs), ncol=3)
  for (i in 1:nrow(state_probs)){
    sim_vec=rep(NA, nsim)
    for (s in 1:nsim){
      state=sample(c(1,2), size=1, prob=state_probs[i,])
      if (state==1){
        sim_vec[s]= rgamma(1, shape=hmm$pm$shape[1], rate = hmm$pm$rate[1]) #simulate from distribution for state 1 with params in hmm object
      } else { # state 2
        sim_vec[s]= rgamma(1, shape=hmm$pm$shape[2], rate = hmm$pm$rate[2]) #simulate from distribution for state 2 with params in hmm object
      }
    }
    p1 = sum(sim_vec < terc[1])/length(sim_vec)
    p3 = sum(sim_vec > terc[2])/length(sim_vec)
    p2 = 1-(p3+p1)
    p = c(p1,p2,p3)
    
    terc_mat[i,] = p
    Q_mean[i,k-1] = mean(sim_vec)
  }
  ######### Compute RPSS and correlation #######
  rpss[k-1]=rps(obs=terc_obs[-1], pred=terc_mat, baseline=climatology)$rpss
  corr[k-1]=cor(Q_mean[,k-1], data[-1,2])
}

## [1] "Results of AIC for bestfit GLM"
##         glm_press
## logit          10
## probit         10
## cauchit        10
## [1] 30
## [1] "Choosing the GLM which minimizes AIC: binomial family and probit link function."
## 
## Call:
## glm(formula = Pm ~ ., family = binomial(link = links[which.min(glm_press)]), 
##     data = xx)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -1.654e-04  -2.100e-08  -2.100e-08  -2.100e-08   1.520e-04  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -583.45   52454.65  -0.011    0.991
## AMO.DJF      -399.83   35493.55  -0.011    0.991
## PDO.DJF        69.69    6356.45   0.011    0.991
## ENSO.DJF       24.47    2266.43   0.011    0.991
## V4             14.36    1305.85   0.011    0.991
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2.8362e+01  on 52  degrees of freedom
## Residual deviance: 9.6438e-08  on 48  degrees of freedom
## AIC: 10
## 
## Number of Fisher Scoring iterations: 25
## 
## [1] "Results of AIC for bestfit GLM"
##         glm_press
## logit          10
## probit         10
## cauchit        10
## [1] 30
## [1] "Choosing the GLM which minimizes AIC: binomial family and logit link function."
## 
## Call:
## glm(formula = Pm ~ ., family = binomial(link = links[which.min(glm_press)]), 
##     data = xx)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -1.135e-04  -2.100e-08  -2.100e-08  -2.100e-08   1.262e-04  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -1453.51  285171.00  -0.005    0.996
## AMO.DJF       -825.42  159818.43  -0.005    0.996
## PDO.DJF        212.62   41303.34   0.005    0.996
## ENSO.DJF        57.36   11829.42   0.005    0.996
## V4              61.79   12240.80   0.005    0.996
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2.8362e+01  on 52  degrees of freedom
## Residual deviance: 5.6578e-08  on 48  degrees of freedom
## AIC: 10
## 
## Number of Fisher Scoring iterations: 25

Plot the historical flow vs ensemble mean forecast

par(mfrow = c(1,2), mar = c(4,4,2,1))
plot(Q_mean[,2], data[-1,2], xlab = "Ensemble Mean", ylab = "Observed", main = expression(paste("April ", 1^{"st"}, " Lead Time")))
abline(a=0, b=1, col='red')
legend('bottomright', legend=c(paste('cor=',round(corr[2],2), sep=''),
                               paste('rpss=', round(rpss[2],2), sep='')), cex=1.1)
plot(Q_mean[,1], data[-1,2], xlab = "Ensemble Mean", ylab = "", main = expression(paste("March ", 1^{"st"}, " Lead Time")))
abline(a=0, b=1, col='red')
legend('bottomright', legend=c(paste('cor=',round(corr[1],2), sep=''),
                               paste('rpss=', round(rpss[1],2), sep='')), cex=1.1)

Comments

The best HMM was obtained by calculating AIC values for an HMM with 1 to 3 states. The AIC value for 1 and 2 states was close (295 vs. 298) with the 3 state model having a larger AIC, so a 2-state model was chosen.
The first state is more prevalent while the second state primarily captures upper extreme values of spring flow
The “best” GLM for both lead times considered all covariates (AMO, PDO, ENSO, SWE) except for the state from the previous year
The “best” GLM in both cases provided a poor fit (high standard error, p-value ~ 1) regardless of chosen objective funciton (AIC, BIC, PRESS)
The scatterplots of observed flow vs. ensemble mean show a stratification of the forecasted flow into two groups, with the majority of the values falling in the lower group corresponding to State 1
As expected, the model considering a shorter lead time (April \(1^{st}\)) performs slightly better than the longer lead time (March \(1^{st}\)), with a higher correlation and RPSS between historical flow and the mean of the ensemble forecast