HMM & Forecast
Another way to simulate a time series is using Hidden Markov Model (Markov Chain + resampling) the steps are as follows:
Generate 250 simulations each of same length as the historical data.
For the May (one flow value per year) Lees Ferry streamflow time series, fit the best HMM. This will result in the state transition probability matrix and the parameters of the component distributions. It also yields the state sequence of the historical time series.
Generate 250 simulations from the fitted HMM. This involves generating the state sequence from the transition probability matrix and resampling flows from the corresponding component distribution.
Boxplot the mean, variance, skew, lag-1 correlation, minimum, maximum and PDF from the simulations with the corresponding values from the historical data plotted on them.
Fit a GLM for the state series:
\(S_t = f(S_{t-1}, \mathrm{ENSO}_t, \mathrm{AMO}_t, \mathrm{PDO}_t) + \varepsilon_t\)
Simulate state sequence from the GLM and flows from the component distribution.
Generate 250 simulations each of same length as the historic data.
Compare and discuss what you find from both approaches.
#### CLEAR MEMORY
# NOTE(review): rm(list = ls()) and setwd() in a script are generally
# discouraged (they clobber the caller's session); kept to preserve the
# original homework workflow.
rm(list=ls())
#### Prevent warnings from printing in the rendered document
options(warn=-1)
#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir <- "C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW3"
setwd(mainDir)
# Lib_HW3.R supplies the HMM helpers used below (Pi_init, delta_init,
# get.named.parlist, plotting utilities, ...).  The original script sourced
# it twice in a row; sourcing once is sufficient.
suppressPackageStartupMessages(source("Lib_HW3.R"))
# Lees Ferry monthly flows 1906-2016: column 1 = year, columns 2:13 = the
# 12 monthly flows of that water year.
data <- matrix(scan("./data/LeesFerry-monflows-1906-2016.txt"), ncol = 13, byrow = TRUE)
WYmonflow <- data[, 2:13]
N <- dim(data)[1]          # number of years of record
nd <- dim(WYmonflow)[2]    # number of months (12)
x <- WYmonflow[, 5] / 10^6 # May flow, converted to MAF
DATA <- as.data.frame(cbind(data[, 1], x))
names(DATA) <- c("year", "may")
# Read winter climate covariates (one value per year, 1906-2016)
nino34 <- scan("./data/NINO34-winter-1906-2016.txt") # Nino 3.4
pdo <- scan("./data/PDO-winter-1906-2016.txt")       # PDO
amo <- scan("./data/AMO-winter-1906-2016.txt")       # AMO
Determine the best number of component PDFs (i.e., the HMM model order)
# ---- Select the HMM model order (number of hidden states) by AIC ----
family <- "gamma"          # component-distribution family for the HMM
n_orders <- 6              # candidate model orders 1..6
aic1 <- numeric(n_orders)  # preallocated AIC vector, one entry per order
for (imodel in seq_len(n_orders)) {
  m <- imodel              # model order to fit
  stationary <- FALSE      # do not force a stationary distribution of mixtures
  ic <- "same.sd"          # initial-condition scheme: c("same.sd","same.both","both.diff")
  fd.name <- ifelse(family == "norm", "normal", family)
  Pi <- Pi_init(m)         # initial transition probability matrix
  delta <- delta_init(m)   # initial state distribution
  pars <- get.named.parlist(x, m, fd.name, lower = .0, ic)#,start=list(shape1=2,shape2=2))
  # set up the model and fit via Baum-Welch (EM)
  hmm <- dthmm(x, Pi = Pi, delta = delta, family, pars,
               nonstat = !stationary, discrete = FALSE)
  if (imodel < 2) {
    # order 1: use the relative-difference convergence criterion
    hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000, posdiff = TRUE,
                                    converge = expression(diff > tol)))
  } else {
    hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000, tol = 1e-08))
  }
  # Record AIC for this order.  (The original also ran Viterbi() here and
  # discarded the result each iteration; decoding of the winning order is
  # recomputed after the best order is chosen, so it is dropped here.)
  aic1[imodel] <- AIC(hmm)
}
# Chunk options for knitr rendering (echo code and show results).
# NOTE(review): knitr's `results` option normally takes a character value
# ("markup", "hide", ...); the logical value is kept as in the original.
knitr::opts_chunk$set(echo = TRUE,
                      results = TRUE)
## ---- Refit the HMM at the best order ----
## The best order minimizes AIC; if that is order 1 (no hidden-state
## structure), fall back to the second-best order.
bestorder <- which.min(aic1)
if (bestorder == 1) {
  m <- order(aic1)[2]  # second-best model order to fit
} else {
  m <- bestorder       # model order to fit
}
sprintf("Model order: %s", m)
## [1] "Model order: 2"
stationary <- FALSE  # do not force a stationary distribution of mixtures
ic <- "same.sd"      # c("same.sd","same.both","both.diff")
fd.name <- ifelse(family == "norm", "normal", family)
# T.P.M. (transition probability matrix) and initial state distribution
Pi <- Pi_init(m)
delta <- delta_init(m)
pars <- get.named.parlist(x, m, fd.name, lower = .0, ic)#,start=list(shape1=2,shape2=2))
# set up and fit the model
hmm <- dthmm(x, Pi = Pi, delta = delta, family, pars,
             nonstat = !stationary, discrete = FALSE)
hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000, tol = 1e-08))
## Global (Viterbi) decoding: the vector `decoding` gives the best
## estimate of the hidden state for each observation
decoding <- Viterbi(hmm)
binwidth <- 0.5
p <- ggplot_stationary_hmm(hmm, decoding, binwidth = binwidth, res = N)
print(p)
# Chunk options for knitr rendering (a repeat of the earlier call so this
# chunk renders identically when run on its own).
knitr::opts_chunk$set(echo = TRUE,
                      results = TRUE)
Get state vector
# Attach the Viterbi-decoded state to DATA and derive a 0/1 version of it
# (state 2 -> 1, state 1 -> 0) for the logistic regression further below.
DATA$State <- decoding
DATA$State_B <- ifelse(DATA$State == 2, 1,
                       ifelse(DATA$State == 1, 0, DATA$State))
# Time series of May flow colored by decoded state, with the median of each
# state's fitted gamma component drawn as a dashed horizontal line.
idx_state1 <- which(DATA$State == 1)
idx_state2 <- which(DATA$State == 2)
ggplot() +
  geom_line(aes(x = DATA$year, y = DATA$may), color = "Black") +
  geom_point(aes(x = DATA$year[idx_state1], y = DATA$may[idx_state1]),
             size = 2, color = "Red") +
  geom_point(aes(x = DATA$year[idx_state2], y = DATA$may[idx_state2]),
             size = 2, color = "Blue") +
  geom_hline(yintercept = qgamma(0.5, shape = hmm$pm$shape[1], rate = hmm$pm$rate[1]),
             linetype = "dashed",
             color = "red", size = 1) +
  geom_hline(yintercept = qgamma(0.5, shape = hmm$pm$shape[2], rate = hmm$pm$rate[2]),
             linetype = "dashed",
             color = "blue", size = 1) +
  xlab("Year") +
  ylab("Flow (MAF)")
Quasi-Stationary Simulation
# ---- Quasi-stationary simulation ----
# Draw nsims1 synthetic May-flow series from the fitted HMM, each the same
# length as the historical record; one simulation per column.
simlength <- length(DATA$may)
nsims1 <- 250
predictions_plot <- as.data.frame(
  replicate(nsims1, simulate(hmm, nsim = simlength)$x)
)
Get May statistics and PDF
# ---- Statistics and PDFs of the quasi-stationary simulations ----
# One row of `may_stat` per simulation; one column of `pdf` per simulated
# density, all evaluated on a common grid slightly wider than the data range.
may_stat <- data.frame(matrix(0, nsims1, 6))
names(may_stat) <- c("Mean", "Std. Dev.", "Skew", "Lag-1 correlation", "Max", "Min")
evalPoints <- seq(min(x) - 0.25 * sd(x), max(x) + 0.25 * sd(x), length = N)
pdf <- as.data.frame(matrix(nrow = length(evalPoints), ncol = ncol(predictions_plot)))
for (isim in seq_len(ncol(predictions_plot))) {
  flowsim <- as.numeric(predictions_plot[, isim])
  pdf[, isim] <- sm.density(flowsim, eval.points = evalPoints, display = "none")$estimate
  may_stat[isim, 1] <- mean(flowsim)
  may_stat[isim, 2] <- sd(flowsim)
  may_stat[isim, 3] <- skew(flowsim)
  may_stat[isim, 4] <- cor(flowsim[1:(N - 1)], flowsim[2:N])
  may_stat[isim, 5] <- max(flowsim)
  may_stat[isim, 6] <- min(flowsim)
}
# Keep only the non-degenerate (non-all-zero) density columns
pdf <- pdf[, which(colSums(pdf) != 0)]
# Density of the observed May flows on the same grid, for overplotting
origDen <- sm.density(DATA$may, eval.points = evalPoints, display = "none")$estimate
Fit a Logistic Regression Model
# ---- Logistic regression for the binary state series ----
# S_t (0/1) regressed on its lag-1 value and the winter climate covariates.
covariates <- data.frame(
  S      = DATA$State_B[2:N],
  S_lag1 = DATA$State_B[1:(N - 1)],
  Nino34 = nino34[2:N],
  PDO    = pdo[2:N],
  AMO    = amo[2:N]
)
bestmodel <- glm(S ~ ., data = covariates, family = binomial(link = "logit"))
Nonstationary Simulation
# ---- Nonstationary simulation ----
# For each year, draw the state using the GLM-fitted probability of being in
# state 2, then sample a flow from that state's fitted component distribution.
# nsims3 simulations, one per column of `predictions`.
nsims3 <- 250
# Fitted P(state 2) per year.  The first year has no lag-1 predictor, so its
# decoded state is used as a stand-in value, as in the original analysis.
sim_preds <- cbind(DATA$year, c(DATA$State[1], predict(bestmodel, type = "response")))
predictions <- as.data.frame(matrix(0, nrow = length(DATA$year), ncol = nsims3))
# (The original first wrote the year into column 1 of each row, but that
# value was always overwritten by the i = 1 simulation, so it is dropped.)
for (j in seq_along(DATA$year)) {
  for (i in seq_len(nsims3)) {
    # Uniform draw decides the state: below the fitted probability -> state 2
    rr <- runif(1, min = 0, max = 1)
    if (rr >= sim_preds[j, 2]) {
      # state 1 component
      if (hmm$distn == "gamma") {
        predictions[j, i] <- rgamma(1, shape = hmm$pm$shape[1], rate = hmm$pm$rate[1])
      } else {
        # GEV component.  Bug fix: the original referenced an undefined
        # object `hmm_fit` here instead of `hmm`.
        predictions[j, i] <- revd(1, loc = hmm$pm$loc[1],
                                  scale = hmm$pm$scale[1], shape = hmm$pm$shape[1])
      }
    } else {
      # state 2 component
      if (hmm$distn == "gamma") {
        predictions[j, i] <- rgamma(1, shape = hmm$pm$shape[2], rate = hmm$pm$rate[2])
      } else {
        predictions[j, i] <- revd(1, loc = hmm$pm$loc[2],
                                  scale = hmm$pm$scale[2], shape = hmm$pm$shape[2])
      }
    }
  }
}
Get May statistics and PDF
# ---- Statistics and PDFs of the nonstationary simulations ----
# Mirrors the quasi-stationary summary.  Consistency fix: sized from
# `predictions` itself (the original used nsims1 here, which only matched
# the number of nonstationary simulations by coincidence).
nsim_ns <- ncol(predictions)
may_stat1 <- data.frame(matrix(0, nsim_ns, 6))
names(may_stat1) <- c("Mean", "Std. Dev.", "Skew", "Lag-1 correlation", "Max", "Min")
evalPoints <- seq(min(x) - 0.25 * sd(x), max(x) + 0.25 * sd(x), length = N)
pdf1 <- as.data.frame(matrix(nrow = length(evalPoints), ncol = nsim_ns))
for (i in seq_len(nsim_ns)) {
  flowsim <- as.numeric(predictions[, i])
  pdf1[, i] <- sm.density(flowsim, eval.points = evalPoints, display = "none")$estimate
  may_stat1[i, 1] <- mean(flowsim)
  may_stat1[i, 2] <- sd(flowsim)
  may_stat1[i, 3] <- skew(flowsim)
  may_stat1[i, 4] <- cor(flowsim[1:(N - 1)], flowsim[2:N])
  may_stat1[i, 5] <- max(flowsim)
  may_stat1[i, 6] <- min(flowsim)
}
# Keep only the non-degenerate (non-all-zero) density columns
pdf1 <- pdf1[, which(colSums(pdf1) != 0)]
# ---- Compare the two approaches ----
# Common y-axis limits so the two density panels are directly comparable.
ylim1 <- c(-0.02, 0.45)  # alternatively: range(pdf, pdf1)
pd1 <- plot_density(pdf, origDen, evalPoints,
                    title = "May Quasi-stationary Approach", ratio = 0.9)
# Bug fix: coord_map() is a map-projection coordinate system; for zooming
# the y axis of an ordinary plot, coord_cartesian() is the correct tool.
pd1 <- pd1 + coord_cartesian(ylim = ylim1)
pd2 <- plot_density(pdf1, origDen, evalPoints,
                    title = "May Nonstationary Approach", ratio = 0.9)
pd2 <- pd2 + coord_cartesian(ylim = ylim1)
multiplot(pd1, pd2, cols = 2)
# Boxplots of each simulated statistic (both approaches side by side) with
# the corresponding historical value overplotted.
g1 <- boxplot_sts(cbind(may_stat[, 1], may_stat1[, 1]), mean(x),
                  names(may_stat1)[1], lab = c("Quasi-stationary", "Nonstationary"))
g2 <- boxplot_sts(cbind(may_stat[, 2], may_stat1[, 2]), sd(x),
                  names(may_stat1)[2], lab = c("Quasi-stationary", "Nonstationary"))
g3 <- boxplot_sts(cbind(may_stat[, 3], may_stat1[, 3]), skew(x),
                  names(may_stat1)[3], lab = c("Quasi-stationary", "Nonstationary"))
g4 <- boxplot_sts(cbind(may_stat[, 4], may_stat1[, 4]), cor(x[1:(N - 1)], x[2:N]),
                  names(may_stat1)[4], lab = c("Quasi-stationary", "Nonstationary"))
g5 <- boxplot_sts(cbind(may_stat[, 5], may_stat1[, 5]), max(x),
                  names(may_stat1)[5], lab = c("Quasi-stationary", "Nonstationary"))
g6 <- boxplot_sts(cbind(may_stat[, 6], may_stat1[, 6]), min(x),
                  names(may_stat1)[6], lab = c("Quasi-stationary", "Nonstationary"))
multiplot(g1, g4, g2, g5, g3, g6, cols = 3)
From PDF plots it is possible to see that the fit to the observation PDF is similar for both methods, but the nonstationary approach exhibits less variation for the high values (dots).
The quasi-stationary time series exhibits little variation in the median values, while in the nonstationary approach the medians show more temporal variation and better track the temporal variation of the observed data.
From the statistics boxplots, the match between the median values of all statistics and the observed data is better for the quasi-stationary approach than for the nonstationary approach.
It is also seen that the nonstationary approach is not able to capture the lag-1 autocorrelation.
Finally, the nonstationary approach shows less variation of high values prediction (boxplots of maximum values).