4. Modeling Nonstationary Time Series - HMM & Forecast and simulation
Another way to model/simulate a time series is using HMM. For the spring streamflow in problem 1 for April \(1^{st}\) lead time fit an HMM and make the forecast. The steps are as follows:
#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
#### Initialize Graphics
graphics.off()
.pardefault <- par()
set.seed(5)
#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 03/"
setwd(mainDir)
suppressPackageStartupMessages(source("hw3_library.R"))
source("hw3_library.R")
swe = as.matrix(readRDS("./data/prob1/springSWE_inches.Rds")) # Snow water equivalent @ Mar 1 & Apr 1
clim = as.matrix(readRDS("./data/prob1/winterMeanClimateIndices.Rds")) # AMO, PDO, ENSO
data = as.matrix(readRDS("./data/prob1/springFlows.Rds")) # Spring flow @ Lees Ferry
years = swe[,1]
clim = clim[-which(clim[,1] < years[1]),] # only include years with full dataset
data = data[-which(data[,1] < years[1]),] # only include years with full dataset
obs = as.data.frame(data)
q_maf = obs$q_maf
terc = quantile(q_maf, c(1/3, 2/3))
terc_obs = rep(NA, length(q_maf))
for (i in 1:length(q_maf)){
if ( q_maf[i] < terc[1]){
terc_obs[i]=1
} else if ( q_maf[i] >= terc[1] & q_maf[i] < terc[2] ) {
terc_obs[i]=2
} else {
terc_obs[i]=3
}
}
climatology=c(1/3, 1/3, 1/3)
family = "gamma" # underlying distribution for hmm
stationary <- F # use a stationary distribution of mixtures
discrete <- F
ic = "same.sd" #c("same.sd","same.both","both.diff")
fd.name = ifelse(family == "norm", "normal", family)
x = data[,2]
aic1=c()
for(imodel in 1:3){
sink(nullfile())
m <- imodel # model order to fit
# T.P.M.
Pi <- Pi_init(m)
delta <- delta_init(m)
pars <- get.named.parlist(x, m, fd.name, lower=.0, ic)#,start=list(shape1=2,shape2=2))
# set up the model
hmm <- dthmm(x, Pi=Pi, delta=delta, family, pars, nonstat=!stationary, discrete = discrete)
if(imodel < 2){
hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000,posdiff=TRUE,converge =
expression(diff > tol)))
} else {
hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000, tol = 1e-08))
}
sink()
# get the hidden states from the fitted model
# Global decoding
# To get the probability of being in a state: hmm$u
decoding <- Viterbi(hmm)
# get AIC
aic <- AIC_hmm(hmm, k=2)
aic1 = c(aic1,aic)
sink()
}
## Get the best order
bestorder = order(aic1)[1]
if (bestorder == 1){
bestorder = 2
}
print(sprintf("The best HMM has %s states", bestorder))
## [1] "The best HMM has 2 states"
m <- bestorder
# T.P.M.
Pi <- Pi_init(m)
delta <- delta_init(m)
pars <- get.named.parlist(x, m, fd.name, lower=.0, ic)#,start=list(shape1=2,shape2=2))
sink(nullfile())
# set up the model
hmm <- dthmm(x, Pi=Pi, delta=delta, family, pars, nonstat=!stationary, discrete = discrete)
hmm <- BaumWelch(hmm, bwcontrol(maxiter = 1000, tol = 1e-08))
sink()
decoding <- Viterbi(hmm)
p <- ggplot_stationary_hmm(hmm,.5)
print(p)
nsim = 250
nyrs = length(years)
allY=ifelse(decoding==1, 0, 1) # need 0 or 1 for GLM_fit function. 0 means state 1, 1 means state 2
S_1=allY[-length(allY)] # previous state
Y=allY[-1]
Q_mean = matrix(NA, nrow = nyrs-1, ncol = 2)
rpss = vector("numeric", length = 2)
corr = vector("numeric", length = 2)
for (k in 3:2){
######### Fit Best Logistic Regression Model #######
X = data.frame(S_1, cbind(clim[-1,-1],swe[-1,k]))
glm_mod = GLM_fit(Y, X, family='binomial')
print(summary(glm_mod))
######### Using best GLM, obtain probabilities of the states (i.e. distribution of the HMM) #######
state_2prob = predict.glm(glm_mod, type='response') # recall that 0 is state 1, 1 is state 2
state_1prob = 1-state_2prob
state_probs = data.frame(s1=state_1prob, s2=state_2prob)
######### Simulate flow from corresponding state PDFs to obtain ensemble #######
terc_mat=matrix(NA, nrow=nrow(state_probs), ncol=3)
for (i in 1:nrow(state_probs)){
sim_vec=rep(NA, nsim)
for (s in 1:nsim){
state=sample(c(1,2), size=1, prob=state_probs[i,])
if (state==1){
sim_vec[s]= rgamma(1, shape=hmm$pm$shape[1], rate = hmm$pm$rate[1]) #simulate from distribution for state 1 with params in hmm object
} else { # state 2
sim_vec[s]= rgamma(1, shape=hmm$pm$shape[2], rate = hmm$pm$rate[2]) #simulate from distribution for state 2 with params in hmm object
}
}
p1 = sum(sim_vec < terc[1])/length(sim_vec)
p3 = sum(sim_vec > terc[2])/length(sim_vec)
p2 = 1-(p3+p1)
p = c(p1,p2,p3)
terc_mat[i,] = p
Q_mean[i,k-1] = mean(sim_vec)
}
######### Compute RPSS and correlation #######
rpss[k-1]=rps(obs=terc_obs[-1], pred=terc_mat, baseline=climatology)$rpss
corr[k-1]=cor(Q_mean[,k-1], data[-1,2])
}
## [1] "Results of AIC for bestfit GLM"
## glm_press
## logit 10
## probit 10
## cauchit 10
## [1] 30
## [1] "Choosing the GLM which minimizes AIC: binomial family and probit link function."
##
## Call:
## glm(formula = Pm ~ ., family = binomial(link = links[which.min(glm_press)]),
## data = xx)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.654e-04 -2.100e-08 -2.100e-08 -2.100e-08 1.520e-04
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -583.45 52454.65 -0.011 0.991
## AMO.DJF -399.83 35493.55 -0.011 0.991
## PDO.DJF 69.69 6356.45 0.011 0.991
## ENSO.DJF 24.47 2266.43 0.011 0.991
## V4 14.36 1305.85 0.011 0.991
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2.8362e+01 on 52 degrees of freedom
## Residual deviance: 9.6438e-08 on 48 degrees of freedom
## AIC: 10
##
## Number of Fisher Scoring iterations: 25
##
## [1] "Results of AIC for bestfit GLM"
## glm_press
## logit 10
## probit 10
## cauchit 10
## [1] 30
## [1] "Choosing the GLM which minimizes AIC: binomial family and logit link function."
##
## Call:
## glm(formula = Pm ~ ., family = binomial(link = links[which.min(glm_press)]),
## data = xx)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.135e-04 -2.100e-08 -2.100e-08 -2.100e-08 1.262e-04
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1453.51 285171.00 -0.005 0.996
## AMO.DJF -825.42 159818.43 -0.005 0.996
## PDO.DJF 212.62 41303.34 0.005 0.996
## ENSO.DJF 57.36 11829.42 0.005 0.996
## V4 61.79 12240.80 0.005 0.996
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2.8362e+01 on 52 degrees of freedom
## Residual deviance: 5.6578e-08 on 48 degrees of freedom
## AIC: 10
##
## Number of Fisher Scoring iterations: 25
par(mfrow = c(1,2), mar = c(4,4,2,1))
plot(Q_mean[,2], data[-1,2], xlab = "Ensemble Mean", ylab = "Observed", main = expression(paste("April ", 1^{"st"}, " Lead Time")))
abline(a=0, b=1, col='red')
legend('bottomright', legend=c(paste('cor=',round(corr[2],2), sep=''),
paste('rpss=', round(rpss[2],2), sep='')), cex=1.1)
plot(Q_mean[,1], data[-1,2], xlab = "Ensemble Mean", ylab = "", main = expression(paste("March ", 1^{"st"}, " Lead Time")))
abline(a=0, b=1, col='red')
legend('bottomright', legend=c(paste('cor=',round(corr[1],2), sep=''),
paste('rpss=', round(rpss[1],2), sep='')), cex=1.1)
Comments