library(geoR)     # for as.geodata and variog
library(spBayes)  # for spLM and spRecover
library(fields)   # for Krig, quilt.plot, surface, US
library(leaps)    # for best-subset predictor search
library(MPV)      # for calculating PRESS
library(ggplot2)  # for plotting
library(rgdal)    # for converting projection of coordinates
library(reshape2) # for the melt function (convert an object into a molten data frame)
library(scales)
library(akima)    # for interp
library(corrplot)
library(gridExtra)
library(glmnet)
library(ggmap)
library(sm)
library(locfit)
library(prodlim)
# Winter (Dec-Mar) seasonal maxima
Win_df = read.table("Max_Winter_Seas_Prec.txt", header = T)
# Summer (Jun-Sep) seasonal maxima
#Sum_df = read.table("Max_summer_Seas_Prec.txt", header = T)
# Station-level precipitation file; supplies the spatial covariates (Lon/Lat/Elev)
Precep_df = read.table("Precipitaton_data.txt", header = T)
Elev_grid = read.table('Elevation_grid_1deg (1).txt', header = T)
merge_data <- function(df1, df2) {
  # Transpose so that rows are locations and columns are years
  df1 = setNames(data.frame(t(df1[,-1])), df1[,1])
  # Mean seasonal rainfall per location
  df1['Mean_Rainfall'] = rowMeans(df1)
  # Reset the index so that location is a column rather than a row name
  df1 <- cbind(Location = rownames(df1), df1)
  rownames(df1) <- 1:nrow(df1)
  # Merge to attach the Lon/Lat/Elev covariates
  df3 = merge(df1, df2, by = "row.names", all.x = TRUE)
  df3 <- df3[, -c(1, 3:57)]
  return(df3)
}
win_data = merge_data(Win_df, Precep_df)
y = win_data[,2]    # mean winter rainfall per location
X = win_data[,3:5]  # Lon, Lat, Elev
# Binary indicator: 1 if mean rainfall is at or above the cutoff (22.80909,
# presumably the sample median of Mean_Rainfall), 0 otherwise
df = data.frame(win_data[2], (win_data[2] >= 22.80909) * 1)
win_data$rainfall_bin = df$Mean_Rainfall.1
y = unlist(win_data[7], use.names = FALSE)  # binary response
X = win_data[,3:5]
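# A minimal sketch (an assumption, not part of the original pipeline): if the
# hard-coded cutoff above really is the sample median, it can be derived from
# the data instead of hard-coded. cutoff_med and rainfall_bin_chk are
# hypothetical names introduced here for illustration only.
cutoff_med = median(win_data$Mean_Rainfall)
rainfall_bin_chk = as.integer(win_data$Mean_Rainfall >= cutoff_med)
# table(rainfall_bin_chk, win_data$rainfall_bin)  # should agree if the cutoff is the median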
set.seed(1)
GLM_fit = function(Pm, X, family, month = 1) {
  # month is only used by the gamma branch below to decide how aggressively
  # small values are floored; it was an undefined free variable in the
  # original code, so it is now an explicit argument with a default
  if (family == "gamma") {
    links = c("log", "inverse", "identity")
    # clean data: replace zeros/negatives with small positive noise
    Pm = ifelse(Pm <= 0, runif(1, 0.0001, 0.001), Pm)
  } else if (family == "binomial") {
    links = c("logit", "probit", "cauchit")
  } else if (family == "gaussian") {
    links = c("identity")
  }
  N = length(Pm)
  combs = leaps(X, Pm, nbest = 25)  # up to 25 subsets per number of predictors
  combos = combs$which
  ncombos = length(combos[,1])
  glm_press = vector(length = length(links))
  best_comb = vector(length = length(links))
  xpress = 1:ncombos
  xmse = 1:ncombos
  for (j in 1:length(links)) {
    aux_var = 1  # flag: 1 = fit this link, 0 = skip it
    # floor small values for the gamma distribution
    if (family == "gamma") {
      if (links[j] == "identity") {
        if (month == 1) {
          Pm[Pm <= 0.005] = 0.005
        } else {
          Pm[Pm <= 2.5] = 2.5
        }
      } else if (links[j] == "inverse" & month == 1) {
        # the small values would have to be modified too drastically,
        # so the inverse link is not fitted for January
        Pm[Pm <= 25] = 25
        aux_var = 0
      }
    }
    if (aux_var == 1) {
      for (i in 1:ncombos) {
        xx = X[, combos[i,]]
        xx = as.data.frame(xx)
        if (family == "gamma") {
          zz = glm(Pm ~ ., data = xx, family = Gamma(link = links[j]), maxit = 500)
        } else if (family == "binomial") {
          zz = glm(Pm ~ ., data = xx, family = binomial(link = links[j]), maxit = 500)
        } else if (family == "gaussian") {
          zz = glm(Pm ~ ., data = xx, family = gaussian(link = links[j]), maxit = 500)
        }
        xpress[i] = PRESS(zz)
        xmse[i] = sum((zz$res)^2) / (N - length(zz$coef))
      }
    }
    if (aux_var == 1) {
      # score this link by the minimum PRESS over all subsets
      glm_press[j] = min(xpress)
      best_comb[j] = which.min(xpress)
    } else {
      # skipped link: assign a large penalty so it is never selected
      glm_press[j] = 200000
      best_comb[j] = which.min(xpress)
    }
  }
  press_df = data.frame(glm_press)
  rownames(press_df) = links[1:length(links)]
  print("Results of PRESS for bestfit GLM")
  print(press_df)
  print(best_comb[which.min(glm_press)])
  sprintf("Choosing the GLM which minimizes PRESS: %s family and %s link function.", family, links[which.min(glm_press)])
  # refit the winning subset/link combination on the full data
  xx = X[, combos[best_comb[which.min(glm_press)],]]
  xx = as.data.frame(xx)
  if (family == "gamma") {
    bestmod = glm(Pm ~ ., data = xx, family = Gamma(link = links[which.min(glm_press)]))
  } else if (family == "binomial") {
    bestmod = glm(Pm ~ ., data = xx, family = binomial(link = links[which.min(glm_press)]))
  } else if (family == "gaussian") {
    bestmod = glm(Pm ~ ., data = xx, family = gaussian(link = links[which.min(glm_press)]))
  } else {
    print("Error!")
  }
  # stash the winning link, its PRESS, and the chosen predictor subset on the fit
  bestmod$Call$LINK = links[which.min(glm_press)]
  bestmod$Call$PRESS = PRESS(bestmod)
  bestmod$Call$combo = combos[best_comb[which.min(glm_press)],]
  return(bestmod)
}
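# For reference (a sketch, not run in this analysis): the same search can be
# pointed at the continuous seasonal means with the gamma family. month = 1
# is an assumption about which small-value flooring rule should apply.
# gamma_mod = GLM_fit(win_data$Mean_Rainfall, X, "gamma", month = 1)
# summary(gamma_mod)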
bestmod=GLM_fit(y, X,"binomial")
summary(bestmod)
[1] "Results of PRESS for bestfit GLM"
glm_press
logit 66.95058
probit 66.72346
cauchit 66.60423
[1] 1
Call:
glm(formula = Pm ~ ., family = binomial(link = links[which.min(glm_press)]),
data = xx)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.9077 -0.4983 -0.3842 0.7281 2.2594
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -72.2264 29.2726 -2.467 0.0136 *
xx -0.6491 0.2630 -2.468 0.0136 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 83.708 on 72 degrees of freedom
Residual deviance: 62.313 on 71 degrees of freedom
AIC: 66.313
Number of Fisher Scoring iterations: 8
bestmod$Call$LINK
# residuals as geodata: column 1 is the data, columns 2:3 are the coordinates
geod = as.geodata(cbind(bestmod$residuals, X$Lon, X$Lat), coords.col = 2:3, data.col = 1)
vg = variog(geod)                          # binned empirical variogram
vg.cloud = variog(geod, option = "cloud")  # variogram cloud
plot(vg, main = "binned variogram")
plot(vg.cloud, main = "variogram cloud")
variog: computing omnidirectional variogram
variog: computing omnidirectional variogram
# Rough hyperparameter guesses taken from the empirical variogram
sigma.sq = median(vg$v)
sigma.sq.lo = 0.25*sigma.sq
sigma.sq.hi = 1.75*sigma.sq
phi.val = median(vg$u)
phi.lo = 0.25*phi.val
phi.hi = 1.75*phi.val
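# Optional sanity check (a sketch, not part of the original workflow): geoR
# can fit an exponential model to the binned variogram, giving an independent
# read on sigma.sq and phi before they are handed to spLM as prior guesses.
vfit = variofit(vg, ini.cov.pars = c(sigma.sq, phi.val), cov.model = "exponential")
summary(vfit)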
# Input number of samples
n.samples = 5000
# coordinates of observations
coords = cbind(X$Lon, X$Lat)
# prior distributions (only a limited set is available; see help(spLM)):
# flat prior on beta, Uniform on the spatial decay phi, and
# inverse-Gamma on sigma^2 and tau^2
priors = list("beta.Flat", "phi.Unif" = c(phi.lo, phi.hi),
              "sigma.sq.IG" = c(sigma.sq.lo, sigma.sq.hi), "tau.sq.IG" = c(1, 0.01))
#The hyperparameters of the inverse-Gamma are passed as a vector of length two, with the first and second elements corresponding
#to the shape and scale, respectively. The hyperparameters of the Uniform are also passed as a vector of length two with the
#first and second elements corresponding to the lower and upper support, respectively. If the regression coefficients, i.e.,
#beta vector, are assumed to follow a multivariate Normal distribution then pass the hyperparameters as a list of length two
#with the first and second elements corresponding to the mean vector and positive definite covariance matrix, respectively.
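# A minimal sketch of that multivariate-Normal option (an illustration only,
# not the prior used in this analysis): a vague N(0, 1000*I) prior on the
# four regression coefficients (intercept, Lon, Lat, Elev).
priors.norm = list("beta.Norm" = list(rep(0, 4), diag(1000, 4)),
                   "phi.Unif" = c(phi.lo, phi.hi),
                   "sigma.sq.IG" = c(sigma.sq.lo, sigma.sq.hi),
                   "tau.sq.IG" = c(1, 0.01))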
# starting values for the MCMC sampler
starting = list("phi" = phi.val, "sigma.sq" = sigma.sq, "tau.sq" = 1)
# Metropolis tuning (proposal) variances for the sampler
tuning = list("phi" = 1, "sigma.sq" = 1, "tau.sq" = 1)
# Fit the Bayesian spatial linear model by Markov chain Monte Carlo
bat.fm = spLM(y ~ ., data = X, coords = as.matrix(X[,1:2]), priors = priors,
              tuning = tuning, starting = starting, cov.model = "exponential",
              n.samples = n.samples, verbose = FALSE, n.report = 50)
# jitter duplicated coordinates slightly so the covariance matrix used by
# spRecover is non-singular
dup = duplicated(bat.fm$coords); bat.fm$coords[dup] <- bat.fm$coords[dup] + 1e-3
# discard the first third of the chain as burn-in and recover beta and w
bat.fm = spRecover(bat.fm, start = ((1/3)*n.samples) + 1, thin = 1, verbose = T)
------------------------------------------------- Recovering beta and w -------------------------------------------------
beta = bat.fm$p.beta.recover.samples    # posterior samples of the regression coefficients
theta = bat.fm$p.theta.recover.samples  # posterior samples of (sigma.sq, tau.sq, phi)
plot(beta)
plot(theta)
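# A quick convergence check (a sketch; spBayes returns coda mcmc objects, so
# the standard coda diagnostics apply directly).
library(coda)
effectiveSize(theta)  # effective sample sizes for sigma.sq, tau.sq, phi
geweke.diag(theta)    # Geweke z-scores; large |z| suggests non-convergence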
summary(window(bat.fm$p.beta.recover.samples))
Iterations = 1:3333
Thinning interval = 1
Number of chains = 1
Sample size per chain = 3333
1. Empirical mean and standard deviation for each variable,
plus standard error of the mean:
Mean SD Naive SE Time-series SE
(Intercept) -6.5637780 8.8427775 1.532e-01 1.532e-01
Lon -0.0593996 0.0797245 1.381e-03 1.381e-03
Lat 0.0138093 0.1045074 1.810e-03 1.810e-03
Elev -0.0000498 0.0005828 1.009e-05 1.009e-05
2. Quantiles for each variable:
2.5% 25% 50% 75% 97.5%
(Intercept) -24.008494 -1.258e+01 -6.4286854 -0.7799282 10.788643
Lon -0.216121 -1.119e-01 -0.0592635 -0.0091485 0.099278
Lat -0.188431 -5.800e-02 0.0142163 0.0850108 0.220891
Elev -0.001197 -4.536e-04 -0.0000441 0.0003351 0.001094
logit2prob = function(logit) {
  odds <- exp(logit)
  prob <- odds / (1 + odds)
  return(prob)
}
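# Illustrative sanity check (not in the original script): the inverse-logit
# should map 0 to 0.5 and stay bounded in (0, 1).
stopifnot(abs(logit2prob(0) - 0.5) < 1e-12)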
n.samples2 = floor(n.samples*(2/3))  # number of retained (post-burn-in) samples
lat = Elev_grid$Lat
lon = Elev_grid$Lon
elev = Elev_grid$Elev
x1 = cbind(lat, lon, elev)  # NOTE: column order is (lat, lon, elev)
x1 = as.data.frame(x1)
ylog = matrix(nrow = nrow(x1), ncol = n.samples2)
yselog = matrix(nrow = nrow(x1), ncol = n.samples2)
for (i in 1:n.samples2) {
  # condition a kriging surface on the GLM residuals using the i-th posterior
  # draw of (sigma.sq, tau.sq, phi); the Krig fit uses (Lon, Lat) coordinates
  zz = Krig(X[,1:2], bestmod$residuals, sigma = theta[i,1], theta = theta[i,3], m = 1, tau2 = theta[i,2])
  # x1 stores (lat, lon), so swap to (lon, lat) to match the fitted coordinates
  y2 = predict(zz, x = x1[, 2:1], drop.Z = TRUE)
  yselog[,i] = predictSE(zz, x = x1[, 2:1], drop.Z = TRUE)
  # regression part: beta columns are (Intercept, Lon, Lat, Elev)
  y1 = beta[i,1] + beta[i,2]*x1[,2] + beta[i,3]*x1[,1] + beta[i,4]*x1[,3]
  ylog[,i] = y1 + y2
}
y = logit2prob(ylog)      # back-transform the linear predictor to probabilities
yse = logit2prob(yselog)  # back-transform the standard errors (approximate)
mean.y = apply(y, 1, FUN = mean)      # posterior mean probability per grid cell
mean.yse = apply(yse, 1, FUN = mean)  # posterior mean of the transformed SE per grid cell
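# Posterior spread is also available directly from the sample matrix (a
# sketch): pointwise 95% credible bounds for the exceedance probability.
# ci.y is a hypothetical name introduced here for illustration.
ci.y = apply(y, 1, quantile, probs = c(0.025, 0.975))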
ggplot() +
  geom_histogram(aes(as.vector(y)), fill = "gray50", col = 'black') +
  labs(title = "Histogram of Posterior Exceedance Probabilities",
       x = "Predicted probability") +
  #theme_light() +
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Quilt_plotting=function(lon, lat, ypred, lon1, lat1, yob,type=""){
if (type=="binary"){
par(mfrow=c(3,1))
quilt.plot(lon, lat,ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Posterior Binary Precipitation on DEM Grid',zlim=range(ypred$fit,yob))
grid(col="gray70",lty=2)
US(add=TRUE, col="gray50", lwd=2, xlim=range(-125,-100))
box()
quilt.plot(lon1, lat1,yob,xlab="Longitude (m)",ylab="Latitude (m)",main='Observed Binary Precipitation',)
grid(col="gray70",lty=2)
US(add=TRUE, col="gray50", lwd=2, xlim=range(-125,-100))
box()
quilt.plot(lon, lat,ypred$se,xlab="Longitude (m)",ylab="Latitude (m)",main='Standard Error on DEM Grid')
grid(col="gray70",lty=2)
US(add=TRUE, col="gray50", lwd=2, xlim=range(-125,-100))
box()
} else{
par(mfrow=c(3,1))
quilt.plot(lon, lat,ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Posterior Precipitation on DEM Grid (mm)',zlim=range(ypred$fit,yob))
grid(col="gray70",lty=2)
US(add=TRUE, col="gray50", lwd=2, xlim=range(-125,-100))
box()
quilt.plot(lon1, lat1,yob,xlab="Longitude (m)",ylab="Latitude (m)",main='Observed Precipitation (mm)',zlim=range(ypred$fit,yob))
grid(col="gray70",lty=2)
US(add=TRUE, col="gray50", lwd=2, xlim=range(-125,-100))
box()
quilt.plot(lon, lat,ypred$se,xlab="Longitude (m)",ylab="Latitude (m)",main='Standard Error on DEM Grid (mm)')
grid(col="gray70",lty=2)
US(add=TRUE, col="gray50", lwd=2, xlim=range(-125,-100))
box()
}
}
plot_surface =function(lon, lat, ypred, lon1, lat1, yob,save,type=""){
zz = interp(lon,lat,ypred$fit, duplicate = "strip")
zz2 = interp(lon1,lat1,yob, duplicate = "strip")
if (type=="binary"){
par(mfrow=c(3,1))
surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Binary Precipitation on DEM Grid'
,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob), xlim=range(lon,lon1),ylim=range(lat,lat1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
surface(zz2,xlab="Longitude",ylab ="Latitude",main='Actual Binary Precipitation'
,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob),xlim=range(lon,lon1),ylim=range(lat,lat1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
zz = interp(lon,lat,ypred$se, duplicate = "strip")
surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Standard Error on DEM Grid'
,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$se),xlim=range(lon,lon1),ylim=range(lat,lat1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
} else{
par(mfrow=c(3,1))
surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Precipitation on DEM Grid (mm)'
,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob), xlim=range(lon,lon1),ylim=range(lat,lat1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
surface(zz2,xlab="Longitude",ylab ="Latitude",main='Actual Precipitation (mm)'
,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob),xlim=range(lon,lon1),ylim=range(lat,lat1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
zz = interp(lon,lat,ypred$se, duplicate = "strip")
surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Standard Error on DEM Grid (mm)'
,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$se),xlim=range(lon,lon1),ylim=range(lat,lat1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
}
}
Pm = win_data[,7]  # observed binary exceedance indicator at the stations
Pred=cbind(mean.y,mean.yse)
ypred=data.frame(Pred)
names(ypred)=c("fit","se")
#Quilt_plotting(x1[,2],x1[,1],ypred)
Quilt_plotting(x1[,2],x1[,1],ypred,X[,1],X[,2],Pm, type="binary")
plot_surface(x1[,2],x1[,1],ypred,X[,1],X[,2],Pm, type = "binary")