HW3_P4

HMM & Forecast

Another way to simulate a time series is using Hidden Markov Model (Markov Chain + resampling) the steps are as follows:

Generate 250 simulations each of same length as the historical data.
For the May (one flow value per year) Lees Ferry streamflow time series fit a best HMM. This will result in the state transition probability matrix, and the parameters of the component distributions. It also results in the state sequence of the historical time series.
Generate 250 simulations from the fitted HMM. This involves generating the state sequence from the transition probability matrix and resampling flows from the corresponding component distribution.
Boxplot the mean, variance, skew, lag-1 correlation, minimum, maximum and PDF from the simulations with the corresponding values from the historical data plotted on them.
Fit an GLM for the state series -

\(S_t = f(S_t-1, ENSO_t, AMO_t, PDO_t) + eps\)

Simulate state sequence from the GLM and flows from the component distribution.
Generate 250 simulations each of same length as the historic data.
Compare and discuss what you find from both approaches.

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)

#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir="C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW3"
setwd(mainDir)

suppressPackageStartupMessages(source("Lib_HW3.R"))
source("Lib_HW3.R")

Read data

data=matrix(scan("./data/LeesFerry-monflows-1906-2016.txt"),ncol=13,byrow=T)
WYmonflow=data[,2:13]
N = dim(data)[1]#number of years of data for each month
nd=dim(WYmonflow)[2]
x=WYmonflow[,5]/10^6             # convert to MAF May Flow
DATA=as.data.frame(cbind(data[,1],x))
names(DATA)=c("year","may")

#Read covariates
nino34=scan("./data/NINO34-winter-1906-2016.txt")#Nino 34
pdo=scan("./data/PDO-winter-1906-2016.txt")#PDO
amo=scan("./data/AMO-winter-1906-2016.txt")#AMO

Define the best number of PDFs

Fit HMM models of orders 2 through 6. Obtain the AIC for each
They are stored in arrays aic1 and aic1
Spit it out and the best order is the one with the least value of AIC.

family = "gamma"  # underlying distribution for hmm
aic1=c() # AIC vector

for(imodel in 1:6){
  m = imodel#model order to fit
  stationary = F# use a stationary distribution of mixtures
  ic="same.sd"#c("same.sd","same.both","both.diff")
  fd.name=ifelse(family == "norm", "normal", family)
  Pi=Pi_init(m)
  delta=delta_init(m)
  pars=get.named.parlist(x, m, fd.name, lower=.0, ic)#,start=list(shape1=2,shape2=2))
  # set up the model
  hmm <- dthmm(x, Pi=Pi, delta=delta, family, pars, nonstat=!stationary, discrete = F)

  if(imodel < 2){
    hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000,posdiff=TRUE,converge = expression(diff > tol)))
  } else {
    hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000, tol = 1e-08))
  }
  
  # get the hidden states from the fitted model 
  # Global decoding
  # To get the probability of being in a state: hmm$u
  decoding <- Viterbi(hmm)
  
  # get AIC
  aic <- AIC(hmm)
  aic1=c(aic1,aic)
}
knitr::opts_chunk$set(echo = T,
                      results = T)

## Get the best order
bestorder = order(aic1)[1]
## Fit the model for this best order
## if the best order is 1, 2 will be considered
if(bestorder==1){
  m=order(aic1)[2] #model order to fit
} else{
  m = bestorder #model order to fit
}
sprintf("Model order: %s",m)

## [1] "Model order: 2"

Quasi-stationary approach

stationary= F   # use a stationary distribution of mixtures
ic="same.sd"#c("same.sd","same.both","both.diff")
fd.name=ifelse(family == "norm", "normal", family)
# T.P.M.
Pi= Pi_init(m)
delta=delta_init(m)
pars=get.named.parlist(x, m, fd.name, lower=.0, ic)#,start=list(shape1=2,shape2=2))

# set up the model
hmm=dthmm(x, Pi=Pi, delta=delta, family, pars, nonstat=!stationary, discrete =F)

hmm=BaumWelch(hmm, bwcontrol(maxiter = 1000, tol = 1e-08))
##the vector decoding gives the best estimate of the hidden state
## for each observation
decoding=Viterbi(hmm)
binwidth=0.5
p=ggplot_stationary_hmm(hmm,decoding,binwidth=binwidth,res=N)
print(p)

knitr::opts_chunk$set(echo = T,
                      results = T)

Get state vector

# Add most likely state column to 'DATA' dataframe
DATA$State =decoding
#create a binay state vector
DATA$State_B=DATA$State
DATA$State_B[which(DATA$State==2)]=1
DATA$State_B[which(DATA$State==1)]=0
ggplot() +
    geom_line(aes(x=DATA$year,y=DATA$may),color="Black") +
    geom_point(aes(x=DATA$year[which(DATA$State==1)],y=DATA$may[which(DATA$State==1)]),size=2,color="Red") +
    geom_point(aes(x=DATA$year[which(DATA$State==2)],y=DATA$may[which(DATA$State==2)]),size=2,color="Blue") +
    geom_hline(yintercept=qgamma(0.5,shape=hmm$pm$shape[1],rate=hmm$pm$rate[1]),
                linetype="dashed", 
                color = "red", size=1) +
    geom_hline(yintercept=qgamma(0.5,shape=hmm$pm$shape[2],rate=hmm$pm$rate[2]),
                linetype="dashed", 
                color = "blue", size=1) +
    xlab("Year") +
    ylab("Flow (MAF)")

Quasi-Stationary Simulation

simlength = length(DATA$may)
nsims1 = 250
predictions_plot <- as.data.frame(matrix(0,nrow=simlength,ncol=nsims1))
for(i in 1:nsims1){
  predictions <- simulate(hmm,nsim=simlength)$x
  predictions_plot[,i] <- predictions
}

Get May statistics and PDF

#initialize the matrices that store the statistics
may_stat=data.frame(matrix(0,nsims1,6))
names(may_stat)=c("Mean","Std. Dev.","Skew","Lag-1 correlation","Max","Min")
evalPoints=seq(min(x)-0.25*sd(x),max(x)+0.25*sd(x),length=N)
pdf <- as.data.frame(matrix(nrow = length(evalPoints), ncol = ncol(predictions_plot)))
for(i in 1:ncol(predictions_plot)){
  pdf[,i] <- sm.density(predictions_plot[,i], eval.points = evalPoints, display="none")$estimate
  flowsim=as.numeric(t(predictions_plot[,i]))
  may_stat[i,1]=mean(flowsim)
  may_stat[i,2]=sd(flowsim)
  may_stat[i,3]=skew(flowsim)
  may_stat[i,4]=cor(flowsim[1:(N-1)],flowsim[2:N])
  may_stat[i,5]=max(flowsim)
  may_stat[i,6]=min(flowsim)
}
pdf <- pdf[, which(colSums(pdf) != 0)]
origDen = sm.density(DATA$may, eval.points = evalPoints, display="none")$estimate

Nonstationary approach

Fit a Logistic Regression Model

covariates=as.data.frame(cbind(DATA$State_B[2:N],DATA$State_B[1:(N-1)],nino34[2:N],pdo[2:N],amo[2:N]))
names(covariates)=c("S","S_lag1","Nino34","PDO","AMO")
bestmodel = glm(S ~., data = covariates, family = binomial(link = "logit"))

Nontationary Simulation

# Simulate and plot
nsims3=250
sim_preds <- cbind(DATA$year,c(DATA$State[1],predict(bestmodel,type="response")))
predictions <- as.data.frame(matrix(0,nrow=length(DATA$year),ncol=nsims3))

for (j in 1:length(DATA$year)){
   predictions[j,1] <- DATA$year[j]
  for (i in 1:nsims3){
      rr <- runif(1,min=0,max=1)
      if(rr >= sim_preds[j,2]){
        sim_state=1
        if (hmm$distn=="gamma"){
          predictions[j,i]=rgamma(1,shape=hmm$pm$shape[1],rate=hmm$pm$rate[1])
        }else {
          predictions[j,i]=revd(1,loc=hmm_fit$pm$loc[1],scale=hmm_fit$pm$scale[1],shape=hmm_fit$pm$shape[1])
        }
      }
      if(rr < sim_preds[j,2]){
        sim_state=2
        if (hmm$distn=="gamma"){
          predictions[j,i]=rgamma(1,shape=hmm$pm$shape[2],rate=hmm$pm$rate[2])
        }else {
          predictions[j,i]=revd(1,loc=hmm_fit$pm$loc[2],scale=hmm_fit$pm$scale[2],shape=hmm_fit$pm$shape[2])
        }        
      }
    }
}

Get MAy PDF

#initialize the matrices that store the statistics
may_stat1=data.frame(matrix(0,nsims1,6))
names(may_stat1)=c("Mean","Std. Dev.","Skew","Lag-1 correlation","Max","Min")
evalPoints=seq(min(x)-0.25*sd(x),max(x)+0.25*sd(x),length=N)
pdf1 <- as.data.frame(matrix(nrow = length(evalPoints), ncol = ncol(predictions)))
for(i in 1:ncol(predictions)){
  pdf1[,i] <- sm.density(predictions[,i], eval.points = evalPoints, display="none")$estimate
  flowsim=as.numeric(t(predictions[,i]))
  may_stat1[i,1]=mean(flowsim)
  may_stat1[i,2]=sd(flowsim)
  may_stat1[i,3]=skew(flowsim)
  may_stat1[i,4]=cor(flowsim[1:(N-1)],flowsim[2:N])
  may_stat1[i,5]=max(flowsim)
  may_stat1[i,6]=min(flowsim)
}
pdf1 <- pdf1[, which(colSums(pdf1) != 0)]

Plot MAy PDF

ylim1=c(-0.02,0.45)#range(pdf,pdf1)
pd1=plot_density(pdf,origDen,evalPoints,title = "May Quasi-stationary Approach",ratio=0.9)
pd1=pd1+coord_map(ylim=ylim1)
pd2=plot_density(pdf1,origDen,evalPoints,title = "May Nonstationary Approach",ratio=0.9)
pd2=pd2+coord_map(ylim=ylim1)
multiplot(pd1,pd2, cols = 2)

Plot of simulated vs. observed values

May statistics of Colorado River Flow at Lees Ferry

g1=boxplot_sts(cbind(may_stat[,1],may_stat1[,1]),mean(x),names(may_stat1)[1],lab=c("Quasi-stationary","Nonstationary"))
g2=boxplot_sts(cbind(may_stat[,2],may_stat1[,2]),sd(x),names(may_stat1)[2],lab=c("Quasi-stationary","Nonstationary"))
g3=boxplot_sts(cbind(may_stat[,3],may_stat1[,3]),skew(x),names(may_stat1)[3],lab=c("Quasi-stationary","Nonstationary"))
g4=boxplot_sts(cbind(may_stat[,4],may_stat1[,4]),cor(x[1:(N-1)],x[2:N]),names(may_stat1)[4],lab=c("Quasi-stationary","Nonstationary"))
g5=boxplot_sts(cbind(may_stat[,5],may_stat1[,5]),max(x),names(may_stat1)[5],lab=c("Quasi-stationary","Nonstationary"))
g6=boxplot_sts(cbind(may_stat[,6],may_stat1[,6]),min(x),names(may_stat1)[6],lab=c("Quasi-stationary","Nonstationary"))

multiplot(g1,g4,g2,g5,g3,g6, cols = 3)

Final comments

From PDF plots it is possible to see that the fit to the observation PDF is similar for both methods, but the nonstationary approach exhibits less variation for the high values (dots).
the quasi-stationary approach time series exhibits a low variation of median values while in the case of nonstationary approach, median values show more temporal variation and they try to reproduce the temporal variation of the observation data.
From the statistics boxplot, match between median values of the all statistics for quasi-stationary approach and observation data are better than the case of Nonstationary approach.
It is seen that Nonstationary approach is not able to capture the autocorrelation.
Finally, the nonstationary approach shows less variation of high values prediction (boxplots of maximum values).