HMM & Forecast

Another way to simulate a time series is using Hidden Markov Model (Markov Chain + resampling) the steps are as follows:

\(S_t = f(S_t-1, ENSO_t, AMO_t, PDO_t) + eps\)

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)

#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir="C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW3"
setwd(mainDir)

suppressPackageStartupMessages(source("Lib_HW3.R"))
source("Lib_HW3.R")

Read data

data=matrix(scan("./data/LeesFerry-monflows-1906-2016.txt"),ncol=13,byrow=T)
WYmonflow=data[,2:13]
N = dim(data)[1]#number of years of data for each month
nd=dim(WYmonflow)[2]
x=WYmonflow[,5]/10^6             # convert to MAF May Flow
DATA=as.data.frame(cbind(data[,1],x))
names(DATA)=c("year","may")

#Read covariates
nino34=scan("./data/NINO34-winter-1906-2016.txt")#Nino 34
pdo=scan("./data/PDO-winter-1906-2016.txt")#PDO
amo=scan("./data/AMO-winter-1906-2016.txt")#AMO

Define the best number of PDFs

family = "gamma"  # underlying distribution for hmm
aic1=c() # AIC vector

for(imodel in 1:6){
  m = imodel#model order to fit
  stationary = F# use a stationary distribution of mixtures
  ic="same.sd"#c("same.sd","same.both","both.diff")
  fd.name=ifelse(family == "norm", "normal", family)
  Pi=Pi_init(m)
  delta=delta_init(m)
  pars=get.named.parlist(x, m, fd.name, lower=.0, ic)#,start=list(shape1=2,shape2=2))
  # set up the model
  hmm <- dthmm(x, Pi=Pi, delta=delta, family, pars, nonstat=!stationary, discrete = F)

  if(imodel < 2){
    hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000,posdiff=TRUE,converge = expression(diff > tol)))
  } else {
    hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000, tol = 1e-08))
  }
  
  # get the hidden states from the fitted model 
  # Global decoding
  # To get the probability of being in a state: hmm$u
  decoding <- Viterbi(hmm)
  
  # get AIC
  aic <- AIC(hmm)
  aic1=c(aic1,aic)
}
knitr::opts_chunk$set(echo = T,
                      results = T)
## Get the best order
bestorder = order(aic1)[1]
## Fit the model for this best order
## if the best order is 1, 2 will be considered
if(bestorder==1){
  m=order(aic1)[2] #model order to fit
} else{
  m = bestorder #model order to fit
}
sprintf("Model order: %s",m)
## [1] "Model order: 2"

Quasi-stationary approach

stationary= F   # use a stationary distribution of mixtures
ic="same.sd"#c("same.sd","same.both","both.diff")
fd.name=ifelse(family == "norm", "normal", family)
# T.P.M.
Pi= Pi_init(m)
delta=delta_init(m)
pars=get.named.parlist(x, m, fd.name, lower=.0, ic)#,start=list(shape1=2,shape2=2))

# set up the model
hmm=dthmm(x, Pi=Pi, delta=delta, family, pars, nonstat=!stationary, discrete =F)

hmm=BaumWelch(hmm, bwcontrol(maxiter = 1000, tol = 1e-08))
##the vector decoding gives the best estimate of the hidden state
## for each observation
decoding=Viterbi(hmm)
binwidth=0.5
p=ggplot_stationary_hmm(hmm,decoding,binwidth=binwidth,res=N)
print(p)

knitr::opts_chunk$set(echo = T,
                      results = T)

Get state vector

# Add most likely state column to 'DATA' dataframe
DATA$State =decoding
#create a binay state vector
DATA$State_B=DATA$State
DATA$State_B[which(DATA$State==2)]=1
DATA$State_B[which(DATA$State==1)]=0
ggplot() +
    geom_line(aes(x=DATA$year,y=DATA$may),color="Black") +
    geom_point(aes(x=DATA$year[which(DATA$State==1)],y=DATA$may[which(DATA$State==1)]),size=2,color="Red") +
    geom_point(aes(x=DATA$year[which(DATA$State==2)],y=DATA$may[which(DATA$State==2)]),size=2,color="Blue") +
    geom_hline(yintercept=qgamma(0.5,shape=hmm$pm$shape[1],rate=hmm$pm$rate[1]),
                linetype="dashed", 
                color = "red", size=1) +
    geom_hline(yintercept=qgamma(0.5,shape=hmm$pm$shape[2],rate=hmm$pm$rate[2]),
                linetype="dashed", 
                color = "blue", size=1) +
    xlab("Year") +
    ylab("Flow (MAF)")

Quasi-Stationary Simulation

simlength = length(DATA$may)
nsims1 = 250
predictions_plot <- as.data.frame(matrix(0,nrow=simlength,ncol=nsims1))
for(i in 1:nsims1){
  predictions <- simulate(hmm,nsim=simlength)$x
  predictions_plot[,i] <- predictions
}

Get May statistics and PDF

#initialize the matrices that store the statistics
may_stat=data.frame(matrix(0,nsims1,6))
names(may_stat)=c("Mean","Std. Dev.","Skew","Lag-1 correlation","Max","Min")
evalPoints=seq(min(x)-0.25*sd(x),max(x)+0.25*sd(x),length=N)
pdf <- as.data.frame(matrix(nrow = length(evalPoints), ncol = ncol(predictions_plot)))
for(i in 1:ncol(predictions_plot)){
  pdf[,i] <- sm.density(predictions_plot[,i], eval.points = evalPoints, display="none")$estimate
  flowsim=as.numeric(t(predictions_plot[,i]))
  may_stat[i,1]=mean(flowsim)
  may_stat[i,2]=sd(flowsim)
  may_stat[i,3]=skew(flowsim)
  may_stat[i,4]=cor(flowsim[1:(N-1)],flowsim[2:N])
  may_stat[i,5]=max(flowsim)
  may_stat[i,6]=min(flowsim)
}
pdf <- pdf[, which(colSums(pdf) != 0)]
origDen = sm.density(DATA$may, eval.points = evalPoints, display="none")$estimate

Nonstationary approach

Fit a Logistic Regression Model

covariates=as.data.frame(cbind(DATA$State_B[2:N],DATA$State_B[1:(N-1)],nino34[2:N],pdo[2:N],amo[2:N]))
names(covariates)=c("S","S_lag1","Nino34","PDO","AMO")
bestmodel = glm(S ~., data = covariates, family = binomial(link = "logit"))

Nontationary Simulation

# Simulate and plot
nsims3=250
sim_preds <- cbind(DATA$year,c(DATA$State[1],predict(bestmodel,type="response")))
predictions <- as.data.frame(matrix(0,nrow=length(DATA$year),ncol=nsims3))

for (j in 1:length(DATA$year)){
   predictions[j,1] <- DATA$year[j]
  for (i in 1:nsims3){
      rr <- runif(1,min=0,max=1)
      if(rr >= sim_preds[j,2]){
        sim_state=1
        if (hmm$distn=="gamma"){
          predictions[j,i]=rgamma(1,shape=hmm$pm$shape[1],rate=hmm$pm$rate[1])
        }else {
          predictions[j,i]=revd(1,loc=hmm_fit$pm$loc[1],scale=hmm_fit$pm$scale[1],shape=hmm_fit$pm$shape[1])
        }
      }
      if(rr < sim_preds[j,2]){
        sim_state=2
        if (hmm$distn=="gamma"){
          predictions[j,i]=rgamma(1,shape=hmm$pm$shape[2],rate=hmm$pm$rate[2])
        }else {
          predictions[j,i]=revd(1,loc=hmm_fit$pm$loc[2],scale=hmm_fit$pm$scale[2],shape=hmm_fit$pm$shape[2])
        }        
      }
    }
}

Get MAy PDF

#initialize the matrices that store the statistics
may_stat1=data.frame(matrix(0,nsims1,6))
names(may_stat1)=c("Mean","Std. Dev.","Skew","Lag-1 correlation","Max","Min")
evalPoints=seq(min(x)-0.25*sd(x),max(x)+0.25*sd(x),length=N)
pdf1 <- as.data.frame(matrix(nrow = length(evalPoints), ncol = ncol(predictions)))
for(i in 1:ncol(predictions)){
  pdf1[,i] <- sm.density(predictions[,i], eval.points = evalPoints, display="none")$estimate
  flowsim=as.numeric(t(predictions[,i]))
  may_stat1[i,1]=mean(flowsim)
  may_stat1[i,2]=sd(flowsim)
  may_stat1[i,3]=skew(flowsim)
  may_stat1[i,4]=cor(flowsim[1:(N-1)],flowsim[2:N])
  may_stat1[i,5]=max(flowsim)
  may_stat1[i,6]=min(flowsim)
}
pdf1 <- pdf1[, which(colSums(pdf1) != 0)]

Plot MAy PDF

ylim1=c(-0.02,0.45)#range(pdf,pdf1)
pd1=plot_density(pdf,origDen,evalPoints,title = "May Quasi-stationary Approach",ratio=0.9)
pd1=pd1+coord_map(ylim=ylim1)
pd2=plot_density(pdf1,origDen,evalPoints,title = "May Nonstationary Approach",ratio=0.9)
pd2=pd2+coord_map(ylim=ylim1)
multiplot(pd1,pd2, cols = 2)

Plot of simulated vs. observed values

May statistics of Colorado River Flow at Lees Ferry

g1=boxplot_sts(cbind(may_stat[,1],may_stat1[,1]),mean(x),names(may_stat1)[1],lab=c("Quasi-stationary","Nonstationary"))
g2=boxplot_sts(cbind(may_stat[,2],may_stat1[,2]),sd(x),names(may_stat1)[2],lab=c("Quasi-stationary","Nonstationary"))
g3=boxplot_sts(cbind(may_stat[,3],may_stat1[,3]),skew(x),names(may_stat1)[3],lab=c("Quasi-stationary","Nonstationary"))
g4=boxplot_sts(cbind(may_stat[,4],may_stat1[,4]),cor(x[1:(N-1)],x[2:N]),names(may_stat1)[4],lab=c("Quasi-stationary","Nonstationary"))
g5=boxplot_sts(cbind(may_stat[,5],may_stat1[,5]),max(x),names(may_stat1)[5],lab=c("Quasi-stationary","Nonstationary"))
g6=boxplot_sts(cbind(may_stat[,6],may_stat1[,6]),min(x),names(may_stat1)[6],lab=c("Quasi-stationary","Nonstationary"))

multiplot(g1,g4,g2,g5,g3,g6, cols = 3)

Final comments

  • From PDF plots it is possible to see that the fit to the observation PDF is similar for both methods, but the nonstationary approach exhibits less variation for the high values (dots).

  • the quasi-stationary approach time series exhibits a low variation of median values while in the case of nonstationary approach, median values show more temporal variation and they try to reproduce the temporal variation of the observation data.

  • From the statistics boxplot, match between median values of the all statistics for quasi-stationary approach and observation data are better than the case of Nonstationary approach.

  • It is seen that Nonstationary approach is not able to capture the autocorrelation.

  • Finally, the nonstationary approach shows less variation of high values prediction (boxplots of maximum values).