Total Organic Content (TOC) in surface waters is dependent on a suite of climate and land surface variables. The goal is develop a robust functional relationship between TOC and the best suite of predictors. To this end, the following steps need to be performed.
#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir="C:/Users/DELL/Dropbox/Boulder/Courses/CVEN5454/Project"
setwd(mainDir)
suppressPackageStartupMessages(source("Lib_Project.R"))
source("Lib_Project.R")
data=read.delim("OH_TOC_predictors.txt")
g=ggpairs(data,columns = 1:6, lower = list(continuous = my_fn),progress = ggmatrix_progress(clear = FALSE))+
theme(axis.text = element_text(size = 12),
strip.text.x = element_text(size = 12, colour = "black", angle = 0),
strip.text.y = element_text(size = 12, colour = "black", angle = 90))
print(g,progress = F)
g1=ggpairs(data,columns = c(1,7:11), lower = list(continuous = my_fn),progress = ggmatrix_progress(clear = FALSE))+
theme(axis.text = element_text(size = 12),
strip.text.x = element_text(size = 12, colour = "black", angle = 0),
strip.text.y = element_text(size = 12, colour = "black", angle = 90))
print(g1,progress = F)
g2=ggpairs(data,columns = c(1,12:16), lower = list(continuous = my_fn),progress = ggmatrix_progress(clear = FALSE))+
theme(axis.text = element_text(size = 12),
strip.text.x = element_text(size = 12, colour = "black", angle = 0),
strip.text.y = element_text(size = 12, colour = "black", angle = 90))
print(g2,progress = F)
g3=ggpairs(data,columns = c(1,17:20), lower = list(continuous = my_fn),progress = ggmatrix_progress(clear = FALSE))+
theme(axis.text = element_text(size = 12),
strip.text.x = element_text(size = 12, colour = "black", angle = 0),
strip.text.y = element_text(size = 12, colour = "black", angle = 90))
print(g3,progress = F)
Obtain the best linear model using PRESS
#Try up to order 2 for the first 6 variables based on the scatter plot
data$T7D2=data$T7D^2
data$T15D2=data$T15D^2
data$T30D2=data$T30D^2
data$T30D1M2=data$T30D1M^2
data$T30D2M2=data$T30D2M^2
data$T30D3M2=data$T30D3M^2
# fit LM
N = length(data$TOC)
X=data[,2:26]# Independent variable
combs = leaps(X,data$TOC, nbest=40) # GEt upto 30 combinations for each
# number of predictors
combos = combs$which
ncombos = length(combos[,1])
xpress=1:ncombos
for(i in 1:ncombos) {
xx = X[,combos[i,]]
xx=as.data.frame(xx)
zz= lm(data$TOC ~ ., data = xx)
xpress[i]=PRESS(zz)
}
# Test using PRESS objective function
bestmod= lm(data$TOC ~ ., data = X[,combos[which.min(xpress),]])
bestmod$Call$Press=PRESS(bestmod)
bestmod$combo=combos[which.min(xpress),]
summary(bestmod)
##
## Call:
## lm(formula = data$TOC ~ ., data = X[, combos[which.min(xpress),
## ]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.18269 -0.34040 -0.06865 0.25984 1.60415
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.244800 0.157263 7.915 2.45e-14 ***
## T30D3M 0.007046 0.001585 4.445 1.14e-05 ***
## P15D 0.117994 0.020813 5.669 2.75e-08 ***
## ddmonth 0.011659 0.006335 1.840 0.066455 .
## NDVI1M 1.039848 0.185855 5.595 4.10e-08 ***
## PDSI1M 0.104800 0.026939 3.890 0.000117 ***
## PDSI2M -0.074899 0.026645 -2.811 0.005182 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4731 on 400 degrees of freedom
## Multiple R-squared: 0.2645, Adjusted R-squared: 0.2535
## F-statistic: 23.98 on 6 and 400 DF, p-value: < 2.2e-16
Model diagnostics
mod_diagnostics(data$TOC, bestmod$fitted.values)
ANOVA
anova(bestmod)
## Analysis of Variance Table
##
## Response: data$TOC
## Df Sum Sq Mean Sq F value Pr(>F)
## T30D3M 1 10.115 10.1146 45.1822 6.205e-11 ***
## P15D 1 9.308 9.3078 41.5782 3.269e-10 ***
## ddmonth 1 0.686 0.6863 3.0658 0.080723 .
## NDVI1M 1 8.175 8.1753 36.5194 3.464e-09 ***
## PDSI1M 1 2.154 2.1541 9.6224 0.002059 **
## PDSI2M 1 1.769 1.7689 7.9017 0.005182 **
## Residuals 400 89.545 0.2239
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "-----SST: 121.752"
Plot the historical and modeled values with 95% confidence interval
yhat=as.data.frame(predict(bestmod, interval = "confidence",level = 0.95))
range1=range(data$TOC, yhat$fit)
par(mar = c(4, 4, 1.5, 0.5))
plot(data$TOC, yhat$fit,xlab="Historical TOC", ylab="Modeled TOC",
xlim = range1,ylim = range1)
abline(0, 1, col = "red",lw=2)
arrows(data$TOC, yhat$lwr,
data$TOC, yhat$upr,
length = 0.05, angle = 90, code = 3)
Drop_Xperc_pred(bestmod,X[,combos[which.min(xpress),]],data$TOC,per=10)
Drop_Xperc_pred(bestmod1,data1,TOC,per=10)
histogram of TOC
linew=2
par(mfrow=c(1,1))
par(mar = c(4, 4, 1.5, 1))
hist(data$TOC,probability=T, main="TOC Histogram", xlab = "TOC",ylab = "PDF",col="gray80")
points(data$TOC,rep(0,length(data$TOC)),pch = 21,bg="white",cex=1)
sm.density(data$TOC,add=T,col="darkblue",lwd=linew,positive=TRUE,probability=T)
Scatterplot of log TOC
data1=cbind(log(data$TOC),data[,2:20])
colnames(data1)=c("LogTOC",colnames(data)[2:20])
g=ggpairs(data1,columns = 1:6, lower = list(continuous = my_fn),progress = ggmatrix_progress(clear = FALSE))+
theme(axis.text = element_text(size = 12),
strip.text.x = element_text(size = 12, colour = "black", angle = 0),
strip.text.y = element_text(size = 12, colour = "black", angle = 90))
print(g,progress = F)
g1=ggpairs(data1,columns = c(1,7:11), lower = list(continuous = my_fn),progress = ggmatrix_progress(clear = FALSE))+
theme(axis.text = element_text(size = 12),
strip.text.x = element_text(size = 12, colour = "black", angle = 0),
strip.text.y = element_text(size = 12, colour = "black", angle = 90))
print(g1,progress = F)
g2=ggpairs(data1,columns = c(1,12:16), lower = list(continuous = my_fn),progress = ggmatrix_progress(clear = FALSE))+
theme(axis.text = element_text(size = 12),
strip.text.x = element_text(size = 12, colour = "black", angle = 0),
strip.text.y = element_text(size = 12, colour = "black", angle = 90))
print(g2,progress = F)
g3=ggpairs(data1,columns = c(1,17:20), lower = list(continuous = my_fn),progress = ggmatrix_progress(clear = FALSE))+
theme(axis.text = element_text(size = 12),
strip.text.x = element_text(size = 12, colour = "black", angle = 0),
strip.text.y = element_text(size = 12, colour = "black", angle = 90))
print(g3,progress = F)
Obtain the best linear model using PRESS
# fit LM
LTOC=log(data$TOC)
X=data[,2:26]# Independent variable
combs = leaps(X,LTOC, nbest=40) # GEt upto 30 combinations for each
# number of predictors
combos = combs$which
ncombos = length(combos[,1])
xpress=1:ncombos
for(i in 1:ncombos) {
xx = X[,combos[i,]]
xx=as.data.frame(xx)
zz= lm(LTOC ~ ., data = xx)
xpress[i]=PRESS(zz)
}
#Test using PRESS objective function
bestmod2= lm(LTOC ~ ., data = X[,combos[which.min(xpress),]])
bestmod2$Call$Press=PRESS(bestmod2)
bestmod2$combo=combos[which.min(xpress),]
summary(bestmod2)
##
## Call:
## lm(formula = LTOC ~ ., data = X[, combos[which.min(xpress), ]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63251 -0.12435 -0.00865 0.10379 0.50020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.451324 0.058201 7.755 7.50e-14 ***
## T7D 0.003213 0.001795 1.790 0.07414 .
## T30D -0.004956 0.002496 -1.986 0.04774 *
## T30D2M 0.004098 0.000843 4.861 1.68e-06 ***
## P15D 0.043859 0.007746 5.662 2.87e-08 ***
## ddmonth 0.004317 0.002349 1.838 0.06685 .
## NDVI1M 0.380692 0.191690 1.986 0.04772 *
## PDSI1M 0.041224 0.010221 4.033 6.59e-05 ***
## PDSI2M -0.031510 0.009977 -3.158 0.00171 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1759 on 398 degrees of freedom
## Multiple R-squared: 0.2945, Adjusted R-squared: 0.2803
## F-statistic: 20.77 on 8 and 398 DF, p-value: < 2.2e-16
Model diagnostics
mod_diagnostics(LTOC, bestmod2$fitted.values)
ANOVA
anova(bestmod2)
## Analysis of Variance Table
##
## Response: LTOC
## Df Sum Sq Mean Sq F value Pr(>F)
## T7D 1 0.5762 0.57620 18.6183 2.017e-05 ***
## T30D 1 0.5597 0.55971 18.0857 2.635e-05 ***
## T30D2M 1 1.8283 1.82825 59.0753 1.197e-13 ***
## P15D 1 1.3184 1.31843 42.6017 2.046e-10 ***
## ddmonth 1 0.0351 0.03505 1.1327 0.287853
## NDVI1M 1 0.2875 0.28755 9.2913 0.002456 **
## PDSI1M 1 0.2274 0.22744 7.3491 0.007000 **
## PDSI2M 1 0.3087 0.30868 9.9743 0.001708 **
## Residuals 398 12.3173 0.03095
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "-----SST: 17.459"
Plot the historical and modeled values with 95% confidence interval
yhat=as.data.frame(predict(bestmod2, interval = "confidence",level = 0.95))
range1=range(data$TOC, exp(yhat$fit))
par(mar = c(4, 4, 1.5, 0.5))
plot(data$TOC, exp(yhat$fit),xlab="Historical TOC", ylab="Modeled TOC",
xlim = range1,ylim = range1)
abline(0, 1, col = "red",lw=2)
arrows(data$TOC, exp(yhat$lwr),
data$TOC, exp(yhat$upr),
length = 0.05, angle = 90, code = 3)
Skill boxplots for logarithm of TOC model
Drop_Xperc_pred(bestmod2,X[,bestmod2$combo],LTOC,per=10,var_type="log")
Obtain the best linear model using PRESS
# fit LM
X=data[,2:26]# Independent variable
combs = leaps(X,data$TOC, nbest=40) # GEt upto 30 combinations for each
# number of predictors
combos = combs$which
ncombos = length(combos[,1])
xpress=1:ncombos
for(i in 1:ncombos) {
xx = X[,combos[i,]]
xx=as.data.frame(xx)
zz= glm(data$TOC ~ .,data = xx, family = Gamma(link="log"))
xpress[i]=PRESS(zz)
}
#Test using PRESS objective function
bestmod3= glm(data$TOC ~ .,data = X[,combos[which.min(xpress),]], family = Gamma(link="log"))
bestmod3$Call$Press=PRESS(bestmod2)
bestmod3$combo=combos[which.min(xpress),]
summary(bestmod3)
##
## Call:
## glm(formula = data$TOC ~ ., family = Gamma(link = "log"), data = X[,
## combos[which.min(xpress), ]])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.60004 -0.13185 -0.02404 0.09481 0.55330
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4060631 0.0625539 6.491 2.53e-10 ***
## T7D 0.0032542 0.0018473 1.762 0.07891 .
## T30D -0.0042306 0.0025312 -1.671 0.09543 .
## T30D3M 0.0031454 0.0007204 4.366 1.61e-05 ***
## P15D 0.0445842 0.0079354 5.618 3.63e-08 ***
## ddmonth 0.0043020 0.0024235 1.775 0.07664 .
## NDVI1M 0.5080360 0.1863678 2.726 0.00669 **
## PDSI1M 0.0416311 0.0104783 3.973 8.42e-05 ***
## PDSI2M -0.0318309 0.0102216 -3.114 0.00198 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.03251939)
##
## Null deviance: 17.214 on 406 degrees of freedom
## Residual deviance: 12.465 on 398 degrees of freedom
## AIC: 539.36
##
## Number of Fisher Scoring iterations: 4
Model diagnostics
mod_diagnostics(data$TOC, bestmod3$fitted.values)
ANOVA
anova(bestmod3)
## Analysis of Deviance Table
##
## Model: Gamma, link: log
##
## Response: data$TOC
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 406 17.214
## T7D 1 0.50596 405 16.708
## T30D 1 0.49048 404 16.217
## T30D3M 1 1.29183 403 14.925
## P15D 1 1.42361 402 13.502
## ddmonth 1 0.03026 401 13.471
## NDVI1M 1 0.48092 400 12.991
## PDSI1M 1 0.22697 399 12.764
## PDSI2M 1 0.29839 398 12.465
Plot the historical and modeled values with 95% confidence interval
yhat=predict.glm(bestmod3, type = "response")
Y_hat_CI = predict.glm(bestmod3, type = "response", se.fit = T)$se.fit
yhat_lwr=yhat-Y_hat_CI
yhat_upr=yhat+Y_hat_CI
range1=range(data$TOC, yhat)
par(mar = c(4, 4, 1.5, 0.5))
plot(data$TOC, yhat,xlab="Historical TOC", ylab="Modeled TOC",
xlim = range1,ylim = range1)
abline(0, 1, col = "red",lw=2)
arrows(data$TOC, yhat_lwr,
data$TOC, yhat_upr,
length = 0.05, angle = 90, code = 3)
Skill boxplots for GLM model with Gamma
Drop_Xperc_pred(bestmod3,X[,bestmod3$combo],data$TOC,per=10)
Fit a Bayesian Regression for problem 10 using the best predictors obtained from 10(b). Plot the posterior distribution of the coefficients; the estimate of TOC along with 95% confidence intervals. Compare the posterior distributions with that from 10.
df=cbind(data$TOC,X[,bestmod$combo])#The best predictors obtained from 10(b)
colnames(df)=c("TOC",colnames(X[,bestmod$combo]))
n = nrow(df)
K = ncol(df)
### TOC as a function of T30D3M, P15D, ddmonth, NDVI1M, PDSI1M and PDSI2M
initf = function()list(
alpha = coefficients(bestmod)[1],
betas = coefficients(bestmod)[-1],
sigmap = sd(residuals(bestmod)))
### JAGS model
model_string <- "model{
# Likelihood
for(i in 1:n){
Y[i] ~ dnorm(mu[i],sigma)
mu[i] <- beta[1] + beta[2]*T30D3M[i] + beta[3]*P15D[i] + beta[4]*ddmonth[i] + beta[5]*NDVI1M[i] + beta[6]*PDSI1M[i] + beta[7]*PDSI2M[i]
}
# Prior for beta
for(j in 1:7){
beta[j] ~ dnorm(0,0.0001)
}
# Prior for the inverse variance
sigma <- 1/sqrt(inv.var)
inv.var ~ dgamma(0.01, 0.01)
}"
model1.spec<-textConnection(model_string)
model <- jags.model(model1.spec, data = list(Y = df$TOC,n=n,T30D3M=df$T30D3M, P15D=df$P15D, ddmonth=df$ddmonth, NDVI1M=df$NDVI1M,PDSI1M=df$PDSI1M,PDSI2M=df$PDSI2M), n.chains = 3, n.adapt= 10000)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 407
## Unobserved stochastic nodes: 8
## Total graph size: 4540
##
## Initializing model
update(model, 10000); # Burnin for 10000 samples
mcmc_samples <- coda.samples(model, variable.names=c("beta","sigma","mu"), n.iter=20000)
zz=summary(mcmc_samples)
samples <- jags.samples(model, c('beta', 'sigma','mu'), 1000)
### Plot the posterior
par(mfcol=c(2,2))
beta1 = mcmc_samples[[1]][,"beta[1]"]
beta2 = mcmc_samples[[1]][,"beta[2]"]
beta3 = mcmc_samples[[1]][,"beta[3]"]
beta4 = mcmc_samples[[1]][,"beta[4]"]
beta5 = mcmc_samples[[1]][,"beta[5]"]
beta6 = mcmc_samples[[1]][,"beta[6]"]
beta7 = mcmc_samples[[1]][,"beta[7]"]
sigmasim = mcmc_samples[[1]][,"sigma"]
hist(beta1,probability=T)
sm.density(beta1,add=TRUE)
abline(v=bestmod$coef[1],col="red")
hist(beta2,probability=T)
sm.density(beta2,add=TRUE)
abline(v=bestmod$coef[2],col="red")
hist(beta3,probability=T)
sm.density(beta3,add=TRUE)
abline(v=bestmod$coef[3],col="red")
hist(beta4,probability=T)
sm.density(beta4,add=TRUE)
abline(v=bestmod$coef[4],col="red")
hist(beta5,probability=T)
sm.density(beta5,add=TRUE)
abline(v=bestmod$coef[5],col="red")
hist(beta5,probability=T)
sm.density(beta5,add=TRUE)
abline(v=bestmod$coef[5],col="red")
hist(beta6,probability=T)
sm.density(beta6,add=TRUE)
abline(v=bestmod$coef[6],col="red")
hist(beta7,probability=T)
sm.density(beta7,add=TRUE)
abline(v=bestmod$coef[7],col="red")
par(mfrow=c(1,1))
hist(sigmasim,probability=T,xlim = range(sigmasim,sd(residuals(bestmod))))
sm.density(sigmasim,add=TRUE)
abline(v=sd(residuals(bestmod)),col="red")
xx=samples$mu[,,1]
yy=t(apply(xx,1,quantile, probs=c(0.025,0.5, 0.975), na.rm=TRUE))#95% CI and median
ymed=yy[,2]
yhatl=yy[,1]
yhatu=yy[,3]
range1=range(data$TOC,yy)
par(mar = c(4, 4, 1.5, 0.5))
plot(data$TOC, ymed,xlab="Historical TOC", ylab="Modeled TOC",
xlim = range1,ylim = range1)
abline(0, 1, col = "red",lw=2)
arrows(data$TOC, yhatl,
data$TOC, yhatu,
length = 0.05, angle = 90, code = 3)
Comments
Problem 10
Part a,b, and c
The model fails the normality, the homoscedasticity test, and independence of the residuals.
ANOVA test shows that the overall performance of the model is poor because the model can only 26% of the total variance.
The above is confirmed by Plot the historical and modeled values with 95% confidence interval and boxplots of skill measures since the median RMSE-Skill and COR skill are roughly 0.47 and 0.5, respectively.
Skills reveals that model in the prediction mode is similar to the fitted model performance (median skills match with red points).
Part d and e: including the lag-1 value
Part f
By taking logarithm of TOC, only the problem with the normality of residuals is fixed, but there is not an increase of the performance of both fitted model and prediction mode.
By fitting a GLM with a Gamma family, there is not an increase of the performance of both fitted model and prediction mode (median RMSE-Skill and COR skill are roughly 0.48 and 0.63, respectively).
Problem 11
The posterior distribution of the parameters obtained in 10(b) shows that mots of the parameters are significant since their histograms do not contain the zero. The only exception beta4 (associated to ddmonth) whose histogram contains the zero, but the density associated to this value is low.
In the plot the historical and modeled values with 95% confidence interval it is seen that the median estimate is similar to the estimates obtained in 10 (b).