graphic functions
# #function to add linear smother -----------------------------------------
my_fn <- function(data, mapping, ...){
p <- ggplot(data = data, mapping = mapping) +
geom_point(colour ="gray20",shape=21, fill="gray60") +
geom_smooth(method="auto", fill="skyblue4", color="dodgerblue4", ...)
p
}
# ##### Model dagnostics code ---------------------------------------------
mod_diagnostics = function(y, yhat){
par(mfrow=c(2,2))
e <- y - yhat
nobs <- length(y)
## Testing if errors (residuals) are normal and iid
# 1. Normality histogram
b=hist(e,probability=T,plot=FALSE)
rang=range(b$density,dnorm(sort(e),mean=0,sd=sd(e)))
hist(e,xlab="Residuals",ylab="Density",probability=T,main="Distribution of Residuals",ylim = rang)
lines(sort(e),dnorm(sort(e),mean=0,sd=sd(e)),col="red")
sm.density(e,add=T,col="blue")
legend("topright", c("Normal Fit", "Non-par Fit"), lty=c(1,1), lwd=c(2.5,2.5), col=c("red", "blue"),cex=0.78)
# 2. Normal QQplot
qqnorm(e,main="Normal Q-Q Plot of Residuals")
qqline(e)
## Testing for heteroskedasticity (which is constant variance of residuals)
# 3. Residuals versus estimates
plot(yhat,e,xlab="Estimated Precip",ylab="Residuals",main="Residuals vs Estimated Precip")
abline(0,0)
## Testing for autocorrelation. If errors are uncorrelated they fall between the dotted lines.
# 4. Autocorrelation plot
cor=acf(e,main="Autocorrelation Plot")
}
# Conditional PDF plot ----------------------------------------------------
Conditional_PDF_plot = function(yeval,fycond_nor,Y1, fycond_ker,simY1,val1,loc1,limi=c(0,0)){
linew=2
col_vals=c("blue","red","purple")
name=c("Bivariate Normal","Kernel density estimator","Frank Copula")
# dev.new()
# a=sm.density(simY1,positive=TRUE,probability=T)$estimate
# dev.off()
#par(mfrow=c(1,1))
if(limi[2]==0){
plot(yeval, fycond_nor, col=col_vals[1],lwd=linew,type="l",main=paste("Inter-eruption time=",val1," (min)",sep = ""),xlab="Eruption duration (min)", ylab="Conditional PDF",pch=14,
ylim=range(fycond_nor,fycond_ker))
}else{
plot(yeval, fycond_nor, col=col_vals[1],lwd=linew,type="l",main=paste("Inter-eruption time=",val1," (min)",sep = ""),xlab="Eruption duration (min)", ylab="Conditional PDF",pch=14,
ylim=range(fycond_nor,fycond_ker,limi))
}
lines(Y1, fycond_ker, col=col_vals[2],lwd=linew)
b=sm.density(simY1,add=T,col=col_vals[3],lwd=linew)
legend(loc1,inset=.0,name,lty=c(1,1,1),lwd=linew,cex=1,col=col_vals,bty="n")
box()
}
# Boxplots ----------------------------------------------------------------
Boxplots= function(data,obs,lab,laby){
name=1
p1=ggplot()+
geom_boxplot(mapping = aes(x = name, y=data))+
geom_point(mapping = aes(name,obs), size = 2, color = "red")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5),
panel.background = element_rect( colour = "gray30"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour = "grey80", linetype = "dashed"))+
scale_x_continuous(breaks=name,limits = c(0.5,1.5),labels =lab)+
theme(axis.text.x=element_text(color = "gray20", size=10, angle=0, vjust=.8, hjust=0.5),
axis.text.y=element_text(color = "gray20", size=10, angle=0, vjust=.8, hjust=0.8))+
labs(title = lab,x = "",y = laby,color="")
p1
}
# Q-Q plots ---------------------------------------------------------------
Q_Q_plots= function(data, name,Gamma_par,Weibull_par){
sampquant = sort(data)
sizes=c(14,9,11)
N = length(sampquant)
sampprob = c(1:N)/(N+1)
norquant = qnorm(sampprob, mean=mean(sampquant), sd=sd(sampquant))
lognorquant = qlnorm(sampprob, meanlog=mean(log(sampquant)), sdlog=sd(log(sampquant)))
gammaquant = qgamma(sampprob, shape=Gamma_par[1],scale=1/Gamma_par[2])
Weibullquant = qweibull(sampprob, shape=Weibull_par[1],scale=Weibull_par[2])
plotdataframe2=as.data.frame(sampquant)
plotdataframe2$Normal=norquant
plotdataframe2$Lognormal=lognorquant
plotdataframe2$Gamma=gammaquant
plotdataframe2$Weibull=Weibullquant
colnames(plotdataframe2)=c("Sample","Normal","Log Normal","Gamma","Weibull")
df1 <- melt(plotdataframe2 , id.vars = "Sample", variable.name = "Quantiles")
P=ggplot(df1, aes(x=value,y=Sample,group=Quantiles))+
geom_line(aes(x=Sample, y=Sample),colour ="black",size=0.5)+
geom_point(aes(fill=factor(Quantiles)),shape=21, colour ="black",size=2)+
#coord_cartesian(xlim = lims, ylim = lims) +
facet_wrap(Quantiles ~ .,scales = "free_y", ncol=2)+
labs(title = paste("Q-Q plots",name,sep = " "), x = "Model quantiles", y ="Sample quantiles")+
theme(legend.position="")+
theme(plot.title = element_text(hjust = 0.5,size = sizes[1],face = "bold"),
#panel.background = element_rect( colour = "grey20"),
axis.text = element_text(size = sizes[2]),
axis.title.x = element_text(size = sizes[3], vjust = -0.5,face = "bold"),
axis.title.y = element_text(size = sizes[3], vjust = 0.2,face = "bold"),
strip.text.x = element_text(size = sizes[3], colour = "black", angle = 0))
P
}
Q_Q_plots_month= function(data, month,Gamma_par,Weibull_par){
sampquant = sort(data[,month])
sizes=c(14,9,11)
N = length(sampquant)
sampprob = c(1:N)/(N+1)
norquant = qnorm(sampprob, mean=mean(sampquant), sd=sd(sampquant))
lognorquant = qlnorm(sampprob, meanlog=mean(log(sampquant)), sdlog=sd(log(sampquant)))
gammaquant = qgamma(sampprob, shape=Gamma_par[which(rownames(Gamma_par)==month),1],
scale=1/Gamma_par[which(rownames(Gamma_par)==month),2])
Weibullquant = qweibull(sampprob, shape=Weibull_par[which(rownames(Weibull_par)==month),1],
scale=Weibull_par[which(rownames(Weibull_par)==month),2])
plotdataframe2=as.data.frame(sampquant)
plotdataframe2$Normal=norquant
plotdataframe2$Lognormal=lognorquant
plotdataframe2$Gamma=gammaquant
plotdataframe2$Weibull=Weibullquant
colnames(plotdataframe2)=c("Sample","Normal","Log Normal","Gamma","Weibull")
df1 <- melt(plotdataframe2 , id.vars = "Sample", variable.name = "Quantiles")
P=ggplot(df1, aes(x=value,y=Sample,group=Quantiles))+
geom_line(aes(x=Sample, y=Sample),colour ="black",size=0.5)+
geom_point(aes(fill=factor(Quantiles)),shape=21, colour ="black",size=2)+
facet_wrap(Quantiles ~ .,scales = "free_y", ncol=2)+
labs(title = paste("Q-Q plots",month,sep = " "), x = "Model quantiles (cms)", y ="Sample quantiles (cms)")+
theme(legend.position="")+
theme(plot.title = element_text(hjust = 0.5,size = sizes[1],face = "bold"),
#panel.background = element_rect( colour = "grey20"),
axis.text = element_text(size = sizes[2]),
axis.title.x = element_text(size = sizes[3], vjust = -0.5,face = "bold"),
axis.title.y = element_text(size = sizes[3], vjust = 0.2,face = "bold"),
strip.text.x = element_text(size = sizes[3], colour = "black", angle = 0))
P
}
Analysis functions
loocv_func = function(bestmod,X,Y) {
# Select only the data from the best model (not full model)
mod_data = X
N = length(Y)
# Initialize leave one out cross validation vector
loocv = vector("numeric", nrow(mod_data))
if (class(bestmod)[1]=='lm') {
for (i in 1:nrow(mod_data)) {
drop_data = mod_data[-i,] # drop one precip value
drop_mod = lm(Y ~ ., data = drop_data)
x_pred = mod_data[i,]
loocv[i] = predict(drop_mod, x_pred)
}
} else if (class(bestmod)[1]=='glm') {
for (i in 1:nrow(mod_data)) {
drop_data = mod_data[-i,] # drop one precip value
drop_mod = glm(Y ~ ., data = drop_data, family = bestmod$family)
x_pred = mod_data[i,]
loocv[i]=predict.glm(drop_mod, newdata=x_pred, se.fit=F,type="response")
}
} else if (class(bestmod)[1] == "gam") {
for(i in 1:nrow(mod_data)){
drop_data = mod_data[-i,] # drop one precip value
# names(drop_data) = c("lat","lon","elev")
drop_mod = gam(bestmod$formula, data = drop_data)
x_pred = mod_data[i,]
loocv[i] = predict.gam(drop_mod, newdata=x_pred, se.fit=F,type="response")
}
}
ypred=bestmod$fitted.values
dat=as.data.frame(cbind(Y,ypred,probxval))
names(dat) <- c("Actual", "Fitted", "X-validated")
dat.melt <- melt(dat, id.vars = "Actual", variable.name ="type",
value.name = "Estimated")
p=ggplot(dat.melt, aes(x = Actual, y = Estimated, shape = type, color = type))+
geom_point(size=4) +
theme(plot.title = element_text(hjust = 0.5,size = 15,face = "bold"),
panel.background = element_rect( colour = "grey20"),
axis.text = element_text(size = 11),legend.position=c(0.5,0.88),
legend.text = element_text(colour="black", size = 11, face = "bold"),legend.title=element_blank(),
legend.background = element_rect(colour = "transparent",fill="gray94", size=.5, linetype="dotted"),
axis.title.x = element_text(size = 12, vjust = -0.5,face = "bold"),
axis.title.y = element_text(size = 12, vjust = 0.2,face = "bold"))
if (class(bestmod)[1]=='lm') {
p=p+geom_abline(slope = 1,intercept = 0,size=1)
}
p
}
########### Plot skill of model based on Drop X% method ----------------------------
Drop_Xperc_pred = function(bestmod,X,Y,per,family="",var_type="") {
# Select only the data from the best model (not full model)
mod_data = data_mod=cbind(X,Y)
N = length(Y)
drop = per
nsample = 500
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
i_drop = sample(i_full,round(N*drop/100)) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% data value
if (class(bestmod)[1]=='lm') {
fit_drop = lm(Y ~ ., data = drop_dt)
drop_pred = predict(fit_drop, newdata=mod_data[i_drop,])
}else{
fit_drop = glm(Y ~ ., data = drop_dt, family = bestmod$family)
drop_pred =predict.glm(fit_drop, newdata=mod_data[i_drop,], se.fit=F,type="response")
}
drop_actual = Y[i_drop]
if (var_type=="log"){
skill_rmse[i] = sqrt(mean((exp(drop_actual) - exp(drop_pred))^2))
skill_cor[i] = cor(exp(drop_actual),exp(drop_pred))
} else{
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
if (var_type=="log"){
var1=sqrt(mean((exp(Y) - exp(bestmod$fitted.values))^2))
var2=cor(exp(Y), exp(bestmod$fitted.values))
}else {
var1=sqrt(mean((Y - bestmod$fitted.values)^2))
var2=cor(Y, bestmod$fitted.values)
}
name=1
lab=c("RMSE-Skill","COR-Skill")
p1=ggplot()+
geom_boxplot(mapping = aes(x = name, y=skill_rmse))+
geom_point(mapping = aes(x = name, y =var1 ),
color = "red", size = 2)+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5),
panel.background = element_rect( colour = "gray30"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour = "grey80", linetype = "dashed"))+
#scale_x_continuous(breaks=name,limits = c(0.6,1.4),labels =lab)+
theme(axis.text.x=element_text(color = "gray20", size=10, angle=0, vjust=.8, hjust=0.5),
axis.text.y=element_text(color = "gray20", size=10, angle=0, vjust=.8, hjust=0.8))+
labs(title =lab[1],x = "RMSE",y = lab[1])
p2=ggplot()+
geom_boxplot(mapping = aes(x = name, y=skill_cor))+
geom_point(mapping = aes(x = name, y = var2),
color = "red", size = 2)+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5),
panel.background = element_rect( colour = "gray30"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour = "grey80", linetype = "dashed"))+
#scale_x_continuous(breaks=name,limits = c(0.6,1.4),labels =lab)+
theme(axis.text.x=element_text(color = "gray20", size=10, angle=0, vjust=.8, hjust=0.5),
axis.text.y=element_text(color = "gray20", size=10, angle=0, vjust=.8, hjust=0.8))+
labs(title =lab[2],x = "Correlation",y = lab[2])
multiplot(p1,p2, cols = 2)
}
# K-S tests ----------------------------------------------------------------
Ks_test = function(data,Gamma_par,Weibull_par) {
Ks_stat=data.frame(matrix(data=0,nrow = 4,ncol=2))*0#to save p-value y KS statistic
colnames(Ks_stat)=c("p.value","statistic")
rownames(Ks_stat)=c("Normal","LogNormal","Gamma","Weibull")
ksnor=ks.test(data, "pnorm",mean=mean(data), sd=sd(data))
kslognor=ks.test(data, "plnorm", meanlog=mean(log(data)), sdlog=sd(log(data)))
ksgamma=ks.test(data, "pgamma", shape=Gamma_par[1], rate=Gamma_par[2])
ksweibull=ks.test(data, "pweibull", shape=Weibull_par[1],scale = Weibull_par[2])
Ks_stat[1,]=c(ksnor$p.value, ksnor$statistic)
Ks_stat[2,]=c(kslognor$p.value, kslognor$statistic)
Ks_stat[3,]=c(ksgamma$p.value, ksgamma$statistic)
Ks_stat[4,]=c(ksweibull$p.value, ksweibull$statistic)
print("#############K-S test of goodness of fit October:")
print(Ks_stat)
print(paste("-----Best fit:",rownames(Ks_stat)[which.min(Ks_stat[,2])],"Distribution with D=",
round(min(Ks_stat[,2]),digits = 4),"and P-value=",
round(Ks_stat[which.min(Ks_stat[,2]),1],digits = 4),sep = " "))
}
Ks_test_month = function(data,Gamma_par,Weibull_par) {
Ks_Mar_stat=data.frame(matrix(data=0,nrow = 4,ncol=2))*0#to save Mar p-value y KS statistic
colnames(Ks_Mar_stat)=c("p.value","statistic")
rownames(Ks_Mar_stat)=c("Normal","LogNormal","Gamma","Weibull")
Ks_Jun_stat=data.frame(matrix(data=0,nrow = 4,ncol=2))*0#to save Jun p-value y KS statistic
colnames(Ks_Jun_stat)=c("p.value","statistic")
rownames(Ks_Jun_stat)=c("Normal","LogNormal","Gamma","Weibull")
Ks_Oct_stat=data.frame(matrix(data=0,nrow = 4,ncol=2))*0#to save Oct p-value y KS statistic
colnames(Ks_Oct_stat)=c("p.value","statistic")
rownames(Ks_Oct_stat)=c("Normal","LogNormal","Gamma","Weibull")
labs=c("Mar","Jun","Oct")
for (i in 1:3) {
Flow=data[,labs[i]]
ksnor=ks.test(Flow, "pnorm",mean=mean(Flow), sd=sd(Flow))
kslognor=ks.test(Flow, "plnorm", meanlog=mean(log(Flow)), sdlog=sd(log(Flow)))
if (i==1){
ksgamma=ks.test(Flow, "pgamma", shape=Gamma_par[3,1], rate=Gamma_par[3,2])
ksweibull=ks.test(Flow, "pweibull", shape=Weibull_par[3,1],scale = Weibull_par[3,2])
Ks_Mar_stat[1,]=c(ksnor$p.value, ksnor$statistic)
Ks_Mar_stat[2,]=c(kslognor$p.value, kslognor$statistic)
Ks_Mar_stat[3,]=c(ksgamma$p.value, ksgamma$statistic)
Ks_Mar_stat[4,]=c(ksweibull$p.value, ksweibull$statistic)
print("#############K-S test of goodness of fit March:")
print(Ks_Mar_stat)
print(paste("-----Best fit:",rownames(Ks_Mar_stat)[which.min(Ks_Mar_stat[,2])],"Distribution with D=",
round(min(Ks_Mar_stat[,2]),digits = 4),"and P-value=",
round(Ks_Mar_stat[which.min(Ks_Mar_stat[,2]),1],digits = 4),sep = " "))
}else if(i==2){
ksgamma=ks.test(Flow, "pgamma", shape=Gamma_par[6,1], rate=Gamma_par[6,2])
ksweibull=ks.test(Flow, "pweibull", shape=Weibull_par[6,1],scale = Weibull_par[6,2])
Ks_Jun_stat[1,]=c(ksnor$p.value, ksnor$statistic)
Ks_Jun_stat[2,]=c(kslognor$p.value, kslognor$statistic)
Ks_Jun_stat[3,]=c(ksgamma$p.value, ksgamma$statistic)
Ks_Jun_stat[4,]=c(ksweibull$p.value, ksweibull$statistic)
print("#############K-S test of goodness of fit June:")
print(Ks_Jun_stat)
print(paste("-----Best fit:",rownames(Ks_Jun_stat)[which.min(Ks_Jun_stat[,2])],"Distribution with D=",
round(min(Ks_Jun_stat[,2]),digits = 4),"and P-value=",
round(Ks_Jun_stat[which.min(Ks_Jun_stat[,2]),1],digits = 4),sep = " "))
}else {
ksgamma=ks.test(Flow, "pgamma", shape=Gamma_par[10,1], rate=Gamma_par[10,2])
ksweibull=ks.test(Flow, "pweibull", shape=Weibull_par[10,1],scale = Weibull_par[10,2])
Ks_Oct_stat[1,]=c(ksnor$p.value, ksnor$statistic)
Ks_Oct_stat[2,]=c(kslognor$p.value, kslognor$statistic)
Ks_Oct_stat[3,]=c(ksgamma$p.value, ksgamma$statistic)
Ks_Oct_stat[4,]=c(ksweibull$p.value, ksweibull$statistic)
print("#############K-S test of goodness of fit October:")
print(Ks_Oct_stat)
print(paste("-----Best fit:",rownames(Ks_Oct_stat)[which.min(Ks_Oct_stat[,2])],"Distribution with D=",
round(min(Ks_Oct_stat[,2]),digits = 4),"and P-value=",
round(Ks_Oct_stat[which.min(Ks_Oct_stat[,2]),1],digits = 4),sep = " "))
}
}
}
# ############mutual information ------------------------------------------
minfconf=function(x,y){
#this will spit out the quantiles of the MI from the bootstrap..
air3 = cbind(x,y)
result.12 = sm.density(air3, eval.points = air3, display = "none",verbose=0)
result.12 = diag(result.12$estimate)
result.1 = sm.density(x, eval.points = x, display = "none",verbose=0)$estimate
result.2 = sm.density(y, eval.points = y, display = "none",verbose=0)$estimate
#Mutual Information from the hand out..
#tobs = mean(log(result.12$estimate /(result.1$estimate * result.2$estimate)))
#Average Mutual Information .. Moon et al. (1995)
tobs = sum(result.12 * log2(result.12 /(result.1 * result.2)))
print(c(tobs))
n=length(x)
nsim = 500
#500 bootstrap samples..
#vector of MI for the bootstrap samples..
simval = rep(0,nsim)
#vector of correlations of the bootstrap samples..
simcor=rep(0,nsim)
for (i in 1:nsim) {
#sample y but not x. This disturbs the joint occurrence of x and y
xy=sample(y,replace=TRUE)
xx = sample(x,replace=TRUE)
samp = cbind(xx,xy)
#to obtain confidence interval of MI both x and y are undisturbed.
#xn=sample(1:n, replace=TRUE)
# samp = cbind(y1 = x[xn], y2 = y[xn])
result.12 = sm.density(samp, eval.points = samp, display = "none", verbose=0)
result.12 = diag(result.12$estimate)
result.1 = sm.density(xx, eval.points = xx, display = "none", verbose=0)$estimate
result.2 = sm.density(xy, eval.points = xy, display = "none", verbose=0)$estimate
#Mutual Information from the hand out..
#simval[i] = mean(log(result.12$estimate /(result.1$estimate * result.2$estimate)))
#Average Mutual Information .. Moon et al. (1995)
simval[i] = sum(result.12 * log2(result.12 /(result.1 * result.2)))
simcor[i]=cor(samp[,1],samp[,2])
}
print(c('MI range',quantile(simval,c(0.05,0.1,.50,.90,.95))))
print(c('Cor range ',quantile(simcor,c(0.05,0.1,.50,.90,.95))))
name=1
lab=c("Correlation","MI")
p1=ggplot()+
geom_boxplot(mapping = aes(x = name, y=simcor))+
geom_point(mapping = aes(x = name, y = cor(x,y)),
color = "red", size = 2)+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5),
panel.background = element_rect( colour = "gray30"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour = "grey80", linetype = "dashed"))+
theme(axis.text.x=element_text(color = "gray20", size=10, angle=0, vjust=.8, hjust=0.5),
axis.text.y=element_text(color = "gray20", size=10, angle=0, vjust=.8, hjust=0.8))+
labs(title ="Boxplot of bootstrap correlations",x = "",y = lab[1])
p2=ggplot()+
geom_boxplot(mapping = aes(x = name, y=simval))+
geom_point(mapping = aes(x = name, y = tobs),
color = "red", size = 2)+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5),
panel.background = element_rect( colour = "gray30"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour = "grey80", linetype = "dashed"))+
theme(axis.text.x=element_text(color = "gray20", size=10, angle=0, vjust=.8, hjust=0.5),
axis.text.y=element_text(color = "gray20", size=10, angle=0, vjust=.8, hjust=0.8))+
labs(title ="Boxplot of bootstrap MI",x = "",y = lab[2])
dev.new()
multiplot(p1,p2, cols = 2)
}
# #########multiplot function ---------------------------------------------
multiplot = function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}