library(geoR)     # for as.geodata and variog
library(spBayes)  # for spLM and spRecover
library(fields)   # for Krig, quilt.plot, surface, US
library(leaps)    # for best-subset predictor search
library(MPV)      # for calculating PRESS
library(ggplot2)  # for plotting
library(rgdal)    # for converting projection of coordinates
library(reshape2) # for the melt function (convert an object into a molten data frame)
library(scales)
library(akima)    # for interp
library(corrplot)
library(gridExtra)
library(glmnet)
library(ggmap)
library(sm)
library(locfit)
library(prodlim)
# Winter (Dec-Mar) seasonal maxima
Win_df = read.table("Max_Winter_Seas_Prec.txt", header = T)
# Summer (Jun-Sep) seasonal maxima
#Sum_df = read.table("Max_summer_Seas_Prec.txt", header = T)
# Station-level precipitation file; supplies the spatial covariates (Lon/Lat/Elev)
Precep_df = read.table("Precipitaton_data.txt", header = T)
Elev_grid = read.table('Elevation_grid_1deg (1).txt', header = T)
merge_data <- function(df1, df2) {
  # Transpose so that rows are locations and columns are years
  df1 = setNames(data.frame(t(df1[,-1])), df1[,1])
  # Mean seasonal rainfall per location
  df1['Mean_Rainfall'] = rowMeans(df1)
  # Reset the index so that location is a column rather than a row name
  df1 <- cbind(Location = rownames(df1), df1)
  rownames(df1) <- 1:nrow(df1)
  # Merge to attach the Lon/Lat/Elev covariates
  df3 = merge(df1, df2, by = "row.names", all.x = TRUE)
  df3 <- df3[, -c(1, 3:57)]
  return(df3)
}
win_data = merge_data(Win_df, Precep_df)
y = win_data[,2]    # mean winter rainfall per location
X = win_data[,3:5]  # Lon, Lat, Elev
# Binary indicator: 1 if mean rainfall is at or above the cutoff (22.80909,
# presumably the sample median of Mean_Rainfall), 0 otherwise
df = data.frame(win_data[2], (win_data[2] >= 22.80909) * 1)
win_data$rainfall_bin = df$Mean_Rainfall.1
y = unlist(win_data[7], use.names = FALSE)  # binary response
X = win_data[,3:5]
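# A minimal sketch (an assumption, not part of the original pipeline): if the
# hard-coded cutoff above really is the sample median, it can be derived from
# the data instead of hard-coded. cutoff_med and rainfall_bin_chk are
# hypothetical names introduced here for illustration only.
cutoff_med = median(win_data$Mean_Rainfall)
rainfall_bin_chk = as.integer(win_data$Mean_Rainfall >= cutoff_med)
# table(rainfall_bin_chk, win_data$rainfall_bin)  # should agree if the cutoff is the median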
set.seed(1)
GLM_fit = function(Pm, X, family, month = 1) {
  # month is only used by the gamma branch below to decide how aggressively
  # small values are floored; it was an undefined free variable in the
  # original code, so it is now an explicit argument with a default
  if (family == "gamma") {
    links = c("log", "inverse", "identity")
    # clean data: replace zeros/negatives with small positive noise
    Pm = ifelse(Pm <= 0, runif(1, 0.0001, 0.001), Pm)
  } else if (family == "binomial") {
    links = c("logit", "probit", "cauchit")
  } else if (family == "gaussian") {
    links = c("identity")
  }
  N = length(Pm)
  combs = leaps(X, Pm, nbest = 25)  # up to 25 subsets per number of predictors
  combos = combs$which
  ncombos = length(combos[,1])
  glm_press = vector(length = length(links))
  best_comb = vector(length = length(links))
  xpress = 1:ncombos
  xmse = 1:ncombos
  for (j in 1:length(links)) {
    aux_var = 1  # flag: 1 = fit this link, 0 = skip it
    # floor small values for the gamma distribution
    if (family == "gamma") {
      if (links[j] == "identity") {
        if (month == 1) {
          Pm[Pm <= 0.005] = 0.005
        } else {
          Pm[Pm <= 2.5] = 2.5
        }
      } else if (links[j] == "inverse" & month == 1) {
        # the small values would have to be modified too drastically,
        # so the inverse link is not fitted for January
        Pm[Pm <= 25] = 25
        aux_var = 0
      }
    }
    if (aux_var == 1) {
      for (i in 1:ncombos) {
        xx = X[, combos[i,]]
        xx = as.data.frame(xx)
        if (family == "gamma") {
          zz = glm(Pm ~ ., data = xx, family = Gamma(link = links[j]), maxit = 500)
        } else if (family == "binomial") {
          zz = glm(Pm ~ ., data = xx, family = binomial(link = links[j]), maxit = 500)
        } else if (family == "gaussian") {
          zz = glm(Pm ~ ., data = xx, family = gaussian(link = links[j]), maxit = 500)
        }
        xpress[i] = PRESS(zz)
        xmse[i] = sum((zz$res)^2) / (N - length(zz$coef))
      }
    }
    if (aux_var == 1) {
      # score this link by the minimum PRESS over all subsets
      glm_press[j] = min(xpress)
      best_comb[j] = which.min(xpress)
    } else {
      # skipped link: assign a large penalty so it is never selected
      glm_press[j] = 200000
      best_comb[j] = which.min(xpress)
    }
  }
  press_df = data.frame(glm_press)
  rownames(press_df) = links[1:length(links)]
  print("Results of PRESS for bestfit GLM")
  print(press_df)
  print(best_comb[which.min(glm_press)])
  sprintf("Choosing the GLM which minimizes PRESS: %s family and %s link function.", family, links[which.min(glm_press)])
  # refit the winning subset/link combination on the full data
  xx = X[, combos[best_comb[which.min(glm_press)],]]
  xx = as.data.frame(xx)
  if (family == "gamma") {
    bestmod = glm(Pm ~ ., data = xx, family = Gamma(link = links[which.min(glm_press)]))
  } else if (family == "binomial") {
    bestmod = glm(Pm ~ ., data = xx, family = binomial(link = links[which.min(glm_press)]))
  } else if (family == "gaussian") {
    bestmod = glm(Pm ~ ., data = xx, family = gaussian(link = links[which.min(glm_press)]))
  } else {
    print("Error!")
  }
  # stash the winning link, its PRESS, and the chosen predictor subset on the fit
  bestmod$Call$LINK = links[which.min(glm_press)]
  bestmod$Call$PRESS = PRESS(bestmod)
  bestmod$Call$combo = combos[best_comb[which.min(glm_press)],]
  return(bestmod)
}
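# For reference (a sketch, not run in this analysis): the same search can be
# pointed at the continuous seasonal means with the gamma family. month = 1
# is an assumption about which small-value flooring rule should apply.
# gamma_mod = GLM_fit(win_data$Mean_Rainfall, X, "gamma", month = 1)
# summary(gamma_mod)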
bestmod=GLM_fit(y, X,"binomial")
summary(bestmod)
[1] "Results of PRESS for bestfit GLM"
glm_press
logit 66.95058
probit 66.72346
cauchit 66.60423
[1] 1
Call:
glm(formula = Pm ~ ., family = binomial(link = links[which.min(glm_press)]),
data = xx)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.9077 -0.4983 -0.3842 0.7281 2.2594
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -72.2264 29.2726 -2.467 0.0136 *
xx -0.6491 0.2630 -2.468 0.0136 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 83.708 on 72 degrees of freedom
Residual deviance: 62.313 on 71 degrees of freedom
AIC: 66.313
Number of Fisher Scoring iterations: 8
bestmod$Call$LINK
# residuals as geodata: column 1 is the data, columns 2:3 are the coordinates
geod = as.geodata(cbind(bestmod$residuals, X$Lon, X$Lat), coords.col = 2:3, data.col = 1)
vg = variog(geod)                          # binned empirical variogram
vg.cloud = variog(geod, option = "cloud")  # variogram cloud
plot(vg, main = "binned variogram")
plot(vg.cloud, main = "variogram cloud")
variog: computing omnidirectional variogram
variog: computing omnidirectional variogram
# Rough hyperparameter guesses taken from the empirical variogram
sigma.sq = median(vg$v)
sigma.sq.lo = 0.25*sigma.sq
sigma.sq.hi = 1.75*sigma.sq
phi.val = median(vg$u)
phi.lo = 0.25*phi.val
phi.hi = 1.75*phi.val
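# Optional sanity check (a sketch, not part of the original workflow): geoR
# can fit an exponential model to the binned variogram, giving an independent
# read on sigma.sq and phi before they are handed to spLM as prior guesses.
vfit = variofit(vg, ini.cov.pars = c(sigma.sq, phi.val), cov.model = "exponential")
summary(vfit)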
# Input number of samples
n.samples = 5000
# coordinates of observations
coords = cbind(X$Lon, X$Lat)
# prior distributions (only a limited set is available; see help(spLM)):
# flat prior on beta, Uniform on the spatial decay phi, and
# inverse-Gamma on sigma^2 and tau^2
priors = list("beta.Flat", "phi.Unif" = c(phi.lo, phi.hi),
              "sigma.sq.IG" = c(sigma.sq.lo, sigma.sq.hi), "tau.sq.IG" = c(1, 0.01))
#The hyperparameters of the inverse-Gamma are passed as a vector of length two, with the first and second elements corresponding
#to the shape and scale, respectively. The hyperparameters of the Uniform are also passed as a vector of length two with the
#first and second elements corresponding to the lower and upper support, respectively. If the regression coefficients, i.e.,
#beta vector, are assumed to follow a multivariate Normal distribution then pass the hyperparameters as a list of length two
#with the first and second elements corresponding to the mean vector and positive definite covariance matrix, respectively.
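# A minimal sketch of that multivariate-Normal option (an illustration only,
# not the prior used in this analysis): a vague N(0, 1000*I) prior on the
# four regression coefficients (intercept, Lon, Lat, Elev).
priors.norm = list("beta.Norm" = list(rep(0, 4), diag(1000, 4)),
                   "phi.Unif" = c(phi.lo, phi.hi),
                   "sigma.sq.IG" = c(sigma.sq.lo, sigma.sq.hi),
                   "tau.sq.IG" = c(1, 0.01))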
# starting values for the MCMC sampler
starting = list("phi" = phi.val, "sigma.sq" = sigma.sq, "tau.sq" = 1)
# Metropolis tuning (proposal) variances for the sampler
tuning = list("phi" = 1, "sigma.sq" = 1, "tau.sq" = 1)
# Fit the Bayesian spatial linear model by Markov chain Monte Carlo
bat.fm = spLM(y ~ ., data = X, coords = as.matrix(X[,1:2]), priors = priors,
              tuning = tuning, starting = starting, cov.model = "exponential",
              n.samples = n.samples, verbose = FALSE, n.report = 50)
# jitter duplicated coordinates slightly so the covariance matrix used by
# spRecover is non-singular
dup = duplicated(bat.fm$coords); bat.fm$coords[dup] <- bat.fm$coords[dup] + 1e-3
# discard the first third of the chain as burn-in and recover beta and w
bat.fm = spRecover(bat.fm, start = ((1/3)*n.samples) + 1, thin = 1, verbose = T)
------------------------------------------------- Recovering beta and w -------------------------------------------------
beta = bat.fm$p.beta.recover.samples    # posterior samples of the regression coefficients
theta = bat.fm$p.theta.recover.samples  # posterior samples of (sigma.sq, tau.sq, phi)
plot(beta)
plot(theta)
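# A quick convergence check (a sketch; spBayes returns coda mcmc objects, so
# the standard coda diagnostics apply directly).
library(coda)
effectiveSize(theta)  # effective sample sizes for sigma.sq, tau.sq, phi
geweke.diag(theta)    # Geweke z-scores; large |z| suggests non-convergence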
summary(window(bat.fm$p.beta.recover.samples))
Iterations = 1:3333
Thinning interval = 1
Number of chains = 1
Sample size per chain = 3333
1. Empirical mean and standard deviation for each variable,
plus standard error of the mean:
Mean SD Naive SE Time-series SE
(Intercept) -6.5637780 8.8427775 1.532e-01 1.532e-01
Lon -0.0593996 0.0797245 1.381e-03 1.381e-03
Lat 0.0138093 0.1045074 1.810e-03 1.810e-03
Elev -0.0000498 0.0005828 1.009e-05 1.009e-05
2. Quantiles for each variable:
2.5% 25% 50% 75% 97.5%
(Intercept) -24.008494 -1.258e+01 -6.4286854 -0.7799282 10.788643
Lon -0.216121 -1.119e-01 -0.0592635 -0.0091485 0.099278
Lat -0.188431 -5.800e-02 0.0142163 0.0850108 0.220891
Elev -0.001197 -4.536e-04 -0.0000441 0.0003351 0.001094
logit2prob = function(logit) {
  odds <- exp(logit)
  prob <- odds / (1 + odds)
  return(prob)
}
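# Illustrative sanity check (not in the original script): the inverse-logit
# should map 0 to 0.5 and stay bounded in (0, 1).
stopifnot(abs(logit2prob(0) - 0.5) < 1e-12)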
n.samples2 = floor(n.samples*(2/3))  # number of retained (post-burn-in) samples
lat = Elev_grid$Lat
lon = Elev_grid$Lon
elev = Elev_grid$Elev
x1 = cbind(lat, lon, elev)  # NOTE: column order is (lat, lon, elev)
x1 = as.data.frame(x1)
ylog = matrix(nrow = nrow(x1), ncol = n.samples2)
yselog = matrix(nrow = nrow(x1), ncol = n.samples2)
for (i in 1:n.samples2) {
  # condition a kriging surface on the GLM residuals using the i-th posterior
  # draw of (sigma.sq, tau.sq, phi); the Krig fit uses (Lon, Lat) coordinates
  zz = Krig(X[,1:2], bestmod$residuals, sigma = theta[i,1], theta = theta[i,3], m = 1, tau2 = theta[i,2])
  # x1 stores (lat, lon), so swap to (lon, lat) to match the fitted coordinates
  y2 = predict(zz, x = x1[, 2:1], drop.Z = TRUE)
  yselog[,i] = predictSE(zz, x = x1[, 2:1], drop.Z = TRUE)
  # regression part: beta columns are (Intercept, Lon, Lat, Elev)
  y1 = beta[i,1] + beta[i,2]*x1[,2] + beta[i,3]*x1[,1] + beta[i,4]*x1[,3]
  ylog[,i] = y1 + y2
}
y = logit2prob(ylog)      # back-transform the linear predictor to probabilities
yse = logit2prob(yselog)  # back-transform the standard errors (approximate)
mean.y = apply(y, 1, FUN = mean)      # posterior mean probability per grid cell
mean.yse = apply(yse, 1, FUN = mean)  # posterior mean of the transformed SE per grid cell
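# Posterior spread is also available directly from the sample matrix (a
# sketch): pointwise 95% credible bounds for the exceedance probability.
# ci.y is a hypothetical name introduced here for illustration.
ci.y = apply(y, 1, quantile, probs = c(0.025, 0.975))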
ggplot() +
  geom_histogram(aes(as.vector(y)), fill = "gray50", col = 'black') +
  labs(title = "Histogram of Posterior Exceedance Probabilities",
       x = "Predicted probability") +
  #theme_light() +
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Quilt_plotting=function(lon, lat, ypred, lon1, lat1, yob,type=""){
if (type=="binary"){
par(mfrow=c(3,1))
quilt.plot(lon, lat,ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Posterior Binary Precipitation on DEM Grid',zlim=range(ypred$fit,yob))
grid(col="gray70",lty=2)
US(add=TRUE, col="gray50", lwd=2, xlim=range(-125,-100))
box()
quilt.plot(lon1, lat1,yob,xlab="Longitude (m)",ylab="Latitude (m)",main='Observed Binary Precipitation',)
grid(col="gray70",lty=2)
US(add=TRUE, col="gray50", lwd=2, xlim=range(-125,-100))
box()
quilt.plot(lon, lat,ypred$se,xlab="Longitude (m)",ylab="Latitude (m)",main='Standard Error on DEM Grid')
grid(col="gray70",lty=2)
US(add=TRUE, col="gray50", lwd=2, xlim=range(-125,-100))
box()
} else{
par(mfrow=c(3,1))
quilt.plot(lon, lat,ypred$fit,xlab="Longitude (m)",ylab="Latitude (m)",main='Posterior Precipitation on DEM Grid (mm)',zlim=range(ypred$fit,yob))
grid(col="gray70",lty=2)
US(add=TRUE, col="gray50", lwd=2, xlim=range(-125,-100))
box()
quilt.plot(lon1, lat1,yob,xlab="Longitude (m)",ylab="Latitude (m)",main='Observed Precipitation (mm)',zlim=range(ypred$fit,yob))
grid(col="gray70",lty=2)
US(add=TRUE, col="gray50", lwd=2, xlim=range(-125,-100))
box()
quilt.plot(lon, lat,ypred$se,xlab="Longitude (m)",ylab="Latitude (m)",main='Standard Error on DEM Grid (mm)')
grid(col="gray70",lty=2)
US(add=TRUE, col="gray50", lwd=2, xlim=range(-125,-100))
box()
}
}
plot_surface =function(lon, lat, ypred, lon1, lat1, yob,save,type=""){
zz = interp(lon,lat,ypred$fit, duplicate = "strip")
zz2 = interp(lon1,lat1,yob, duplicate = "strip")
if (type=="binary"){
par(mfrow=c(3,1))
surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Binary Precipitation on DEM Grid'
,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob), xlim=range(lon,lon1),ylim=range(lat,lat1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
surface(zz2,xlab="Longitude",ylab ="Latitude",main='Actual Binary Precipitation'
,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob),xlim=range(lon,lon1),ylim=range(lat,lat1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
zz = interp(lon,lat,ypred$se, duplicate = "strip")
surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Standard Error on DEM Grid'
,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$se),xlim=range(lon,lon1),ylim=range(lat,lat1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
} else{
par(mfrow=c(3,1))
surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Precipitation on DEM Grid (mm)'
,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob), xlim=range(lon,lon1),ylim=range(lat,lat1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
surface(zz2,xlab="Longitude",ylab ="Latitude",main='Actual Precipitation (mm)'
,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$fit,yob),xlim=range(lon,lon1),ylim=range(lat,lat1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
zz = interp(lon,lat,ypred$se, duplicate = "strip")
surface(zz,xlab="Longitude",ylab ="Latitude",main='Predicted Standard Error on DEM Grid (mm)'
,horizontal = FALSE,legend.shrink = 0.9, zlim=range(ypred$se),xlim=range(lon,lon1),ylim=range(lat,lat1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
}
}
Pm = win_data[,7]  # observed binary exceedance indicator at the stations
Pred=cbind(mean.y,mean.yse)
ypred=data.frame(Pred)
names(ypred)=c("fit","se")
#Quilt_plotting(x1[,2],x1[,1],ypred)
Quilt_plotting(x1[,2],x1[,1],ypred,X[,1],X[,2],Pm, type="binary")
plot_surface(x1[,2],x1[,1],ypred,X[,1],X[,2],Pm, type = "binary")