4. Modeling Nonstationary Time Series - HMM & Forecast and simulation
Another way to model/simulate a time series is using HMM. For the spring streamflow in problem 1 for April \(1^{st}\) lead time fit an HMM and make the forecast. The steps are as follows:

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
#### Initialize Graphics
graphics.off()
.pardefault <- par()
set.seed(5)

#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 03/"
setwd(mainDir)
suppressPackageStartupMessages(source("hw3_library.R"))
source("hw3_library.R")

Read data

swe = as.matrix(readRDS("./data/prob1/springSWE_inches.Rds"))   # Snow water equivalent @ Mar 1 & Apr 1
clim = as.matrix(readRDS("./data/prob1/winterMeanClimateIndices.Rds")) # AMO, PDO, ENSO
data = as.matrix(readRDS("./data/prob1/springFlows.Rds"))    # Spring flow @ Lees Ferry
years = swe[,1]
clim = clim[-which(clim[,1] < years[1]),]     # only include years with full dataset
data = data[-which(data[,1] < years[1]),]     # only include years with full dataset
Compute terciles of observed historical spring flow
obs = as.data.frame(data)
q_maf = obs$q_maf
terc = quantile(q_maf, c(1/3, 2/3))
terc_obs = rep(NA, length(q_maf))
for (i in 1:length(q_maf)){
  if ( q_maf[i] < terc[1]){
    terc_obs[i]=1
  } else if ( q_maf[i] >= terc[1] & q_maf[i] < terc[2] ) {
    terc_obs[i]=2
  } else {
    terc_obs[i]=3
  }
}
climatology=c(1/3, 1/3, 1/3)

Fit a best HMM for the spring streamflow

family = "gamma"  # underlying distribution for hmm
stationary <- F   # use a stationary distribution of mixtures
discrete <- F
ic = "same.sd"  #c("same.sd","same.both","both.diff")
fd.name = ifelse(family == "norm", "normal", family)

x = data[,2]
aic1=c()
Obtain AIC for 1 to 3 states and choose best order
for(imodel in 1:3){
  sink(nullfile())
  m <- imodel # model order to fit
  # T.P.M.
  Pi <- Pi_init(m)
  delta <- delta_init(m)
  
  pars <- get.named.parlist(x, m, fd.name, lower=.0, ic)#,start=list(shape1=2,shape2=2))
  
  # set up the model
  hmm <- dthmm(x, Pi=Pi, delta=delta, family, pars, nonstat=!stationary, discrete = discrete)
  
  if(imodel < 2){
    hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000,posdiff=TRUE,converge =
                                      expression(diff > tol)))
  } else {
    hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000, tol = 1e-08))
  }
  sink()
  
  # get the hidden states from the fitted model
  # Global decoding
  # To get the probability of being in a state: hmm$u
  decoding <- Viterbi(hmm)
  
  # get AIC
  aic <- AIC_hmm(hmm, k=2)
  aic1 = c(aic1,aic)
  sink()
}

## Get the best order
bestorder = order(aic1)[1]
if (bestorder == 1){
  bestorder = 2
}
print(sprintf("The best HMM has %s states", bestorder))
## [1] "The best HMM has 2 states"
Fit best HMM
m <- bestorder
# T.P.M.
Pi <- Pi_init(m)
delta <- delta_init(m)

pars <- get.named.parlist(x, m, fd.name, lower=.0, ic)#,start=list(shape1=2,shape2=2))
sink(nullfile())
# set up the model
hmm <- dthmm(x, Pi=Pi, delta=delta, family, pars, nonstat=!stationary, discrete = discrete)
hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000, tol = 1e-08))
sink()
decoding <- Viterbi(hmm)
p <- ggplot_stationary_hmm(hmm,.5)
print(p)

Fit a best GLM to the state sequence as a function of predictors for April 1st and March 1st

nsim = 250
nyrs = length(years)

allY=ifelse(decoding==1, 0, 1) # need 0 or 1 for GLM_fit function. 0 means state 1, 1 means state 2
S_1=allY[-length(allY)] # previous state
Y=allY[-1]
Q_mean = matrix(NA, nrow = nyrs-1, ncol = 2)
rpss = vector("numeric", length = 2)
corr = vector("numeric", length = 2)
for (k in 3:2){
  ######### Fit Best Logistic Regression Model #######
  X = data.frame(S_1, cbind(clim[-1,-1],swe[-1,k]))
  glm_mod = GLM_fit(Y, X, family='binomial')
  print(summary(glm_mod))
  
  ######### Using best GLM, obtain probabilities of the states (i.e. distribution of the HMM) #######
  state_2prob = predict.glm(glm_mod, type='response') # recall that 0 is state 1, 1 is state 2
  state_1prob = 1-state_2prob
  state_probs = data.frame(s1=state_1prob, s2=state_2prob)
  
  ######### Simulate flow from corresponding state PDFs to obtain ensemble #######
  terc_mat=matrix(NA, nrow=nrow(state_probs), ncol=3)
  for (i in 1:nrow(state_probs)){
    sim_vec=rep(NA, nsim)
    for (s in 1:nsim){
      state=sample(c(1,2), size=1, prob=state_probs[i,])
      if (state==1){
        sim_vec[s]= rgamma(1, shape=hmm$pm$shape[1], rate = hmm$pm$rate[1]) #simulate from distribution for state 1 with params in hmm object
      } else { # state 2
        sim_vec[s]= rgamma(1, shape=hmm$pm$shape[2], rate = hmm$pm$rate[2]) #simulate from distribution for state 2 with params in hmm object
      }
    }
    p1 = sum(sim_vec < terc[1])/length(sim_vec)
    p3 = sum(sim_vec > terc[2])/length(sim_vec)
    p2 = 1-(p3+p1)
    p = c(p1,p2,p3)
    
    terc_mat[i,] = p
    Q_mean[i,k-1] = mean(sim_vec)
  }
  ######### Compute RPSS and correlation #######
  rpss[k-1]=rps(obs=terc_obs[-1], pred=terc_mat, baseline=climatology)$rpss
  corr[k-1]=cor(Q_mean[,k-1], data[-1,2])
}
## [1] "Results of AIC for bestfit GLM"
##         glm_press
## logit          10
## probit         10
## cauchit        10
## [1] 30
## [1] "Choosing the GLM which minimizes AIC: binomial family and probit link function."
## 
## Call:
## glm(formula = Pm ~ ., family = binomial(link = links[which.min(glm_press)]), 
##     data = xx)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -1.654e-04  -2.100e-08  -2.100e-08  -2.100e-08   1.520e-04  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -583.45   52454.65  -0.011    0.991
## AMO.DJF      -399.83   35493.55  -0.011    0.991
## PDO.DJF        69.69    6356.45   0.011    0.991
## ENSO.DJF       24.47    2266.43   0.011    0.991
## V4             14.36    1305.85   0.011    0.991
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2.8362e+01  on 52  degrees of freedom
## Residual deviance: 9.6438e-08  on 48  degrees of freedom
## AIC: 10
## 
## Number of Fisher Scoring iterations: 25
## 
## [1] "Results of AIC for bestfit GLM"
##         glm_press
## logit          10
## probit         10
## cauchit        10
## [1] 30
## [1] "Choosing the GLM which minimizes AIC: binomial family and logit link function."
## 
## Call:
## glm(formula = Pm ~ ., family = binomial(link = links[which.min(glm_press)]), 
##     data = xx)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -1.135e-04  -2.100e-08  -2.100e-08  -2.100e-08   1.262e-04  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -1453.51  285171.00  -0.005    0.996
## AMO.DJF       -825.42  159818.43  -0.005    0.996
## PDO.DJF        212.62   41303.34   0.005    0.996
## ENSO.DJF        57.36   11829.42   0.005    0.996
## V4              61.79   12240.80   0.005    0.996
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2.8362e+01  on 52  degrees of freedom
## Residual deviance: 5.6578e-08  on 48  degrees of freedom
## AIC: 10
## 
## Number of Fisher Scoring iterations: 25

Plot the historical flow vs ensemble mean forecast

par(mfrow = c(1,2), mar = c(4,4,2,1))
plot(Q_mean[,2], data[-1,2], xlab = "Ensemble Mean", ylab = "Observed", main = expression(paste("April ", 1^{"st"}, " Lead Time")))
abline(a=0, b=1, col='red')
legend('bottomright', legend=c(paste('cor=',round(corr[2],2), sep=''),
                               paste('rpss=', round(rpss[2],2), sep='')), cex=1.1)
plot(Q_mean[,1], data[-1,2], xlab = "Ensemble Mean", ylab = "", main = expression(paste("March ", 1^{"st"}, " Lead Time")))
abline(a=0, b=1, col='red')
legend('bottomright', legend=c(paste('cor=',round(corr[1],2), sep=''),
                               paste('rpss=', round(rpss[1],2), sep='')), cex=1.1)

Comments