#Winter (Dec-Mar)
Win_df = read.table("Max_Winter_Seas_Prec.txt", header = T)
#Summer (Jun-Sep)
#Sum_df = read.table("Max_summer_Seas_Prec.txt", header = T)
#this is summer data/ need for spatial elements
Precep_df = read.table("Precipitaton_data.txt", header = T)
Elev_grid = read.table('Elevation_grid_1deg (1).txt',header = T)
merge_data <- function(df1, df2) {
#Transpose data
df1 = setNames(data.frame(t(df1[,-1])), df1[,1])
#Create the mean column per location
df1['Mean_Rainfall'] = rowMeans(df1)
#resetting the index so that location is a column rather than an index
df1 <- cbind(Location = rownames(df1), df1)
rownames(df1) <- 1:nrow(df1)
# merge to have long/lat data
df3 = merge(df1, df2, by="row.names", all.x=TRUE)
df3 <- df3[, -c(1, 3:57)]
return(df3)
}
win_data = merge_data(Win_df, Precep_df)
y = win_data[,2]
X = win_data[,3:5]
#create binary col. based on the 75th percentile
df = data.frame(win_data[2], (win_data[2] >= 22.80909) * 1)
win_data$rainfall_bin = df$Mean_Rainfall.1
y = unlist(win_data[7], use.names = FALSE)
X = win_data[,3:5]
#investigate the logit link function
glmfit= glm(y ~ ., data = X, family = binomial(link='logit'))
summary(glmfit)
##
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = X)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7437 -0.6743 -0.3595 0.8778 2.1986
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.902e+01 1.364e+01 -3.595 0.000324 ***
## Lon -4.142e-01 1.199e-01 -3.455 0.000551 ***
## Lat 7.964e-02 1.287e-01 0.619 0.536095
## Elev 1.331e-05 6.876e-04 0.019 0.984554
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 83.708 on 72 degrees of freedom
## Residual deviance: 62.163 on 69 degrees of freedom
## AIC: 70.163
##
## Number of Fisher Scoring iterations: 5
anova(glmfit)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: y
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 72 83.708
## Lon 1 20.9843 71 62.724
## Lat 1 0.5607 70 62.163
## Elev 1 0.0004 69 62.163
#investigate the probit link function
glmfit= glm(y ~ ., data = X, family = binomial(link='probit'))
summary(glmfit)
##
## Call:
## glm(formula = y ~ ., family = binomial(link = "probit"), data = X)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7308 -0.6735 -0.3473 0.8854 2.1953
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.896e+01 7.465e+00 -3.879 0.000105 ***
## Lon -2.425e-01 6.519e-02 -3.720 0.000199 ***
## Lat 5.280e-02 7.444e-02 0.709 0.478169
## Elev 2.647e-05 3.953e-04 0.067 0.946623
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 83.708 on 72 degrees of freedom
## Residual deviance: 61.860 on 69 degrees of freedom
## AIC: 69.86
##
## Number of Fisher Scoring iterations: 6
anova(glmfit)
## Analysis of Deviance Table
##
## Model: binomial, link: probit
##
## Response: y
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 72 83.708
## Lon 1 21.0926 71 62.616
## Lat 1 0.7508 70 61.865
## Elev 1 0.0045 69 61.860
The models using the logit and probit link functions preform very similarly, but the cauchit ultimately performs the best.
GLM_fit = function(Pm, X, family, month) {
if (family == "gamma") {
links = c("log", "inverse","identity")
# clean data and remove zeros
Pm = ifelse(Pm <=0, runif(1, 0.0001, 0.001), Pm)
} else if (family == "binomial"){
links = c("logit", "probit", "cauchit")
} else if (family == "gaussian"){
links = c("identity")
}
N = length(Pm)
combs = leaps(X,Pm, nbest=25) # GEt upto 25 combinations for each
# number of predictors
combos = combs$which
ncombos = length(combos[,1])
glm_press=vector(length = length(links))
best_comb=vector(length = length(links))
xpress=1:ncombos
xmse = 1:ncombos
for(j in 1:length(links)) {
aux_var=1 # allow to fit glm
##remove small values for gamma distribution
if (family == "gamma"){
if (links[j]=="identity"){
if( month==1){
Pm[Pm<=0.005]=0.005
}else{
Pm[Pm<=2.5]=2.5
}
}else if (links[j]=="inverse"){
Pm[Pm<=25]=25 # it is necessary to modific significantly the small values,
#so, "inverse" is not fitted for January
aux_var=0
}
}
if (aux_var==1)
{for(i in 1:ncombos) {
xx = X[,combos[i,]]
xx=as.data.frame(xx)
if (family == "gamma"){
zz=glm(Pm ~ ., data=xx, family = Gamma(link=links[j]), maxit=500)
}else if (family == "binomial"){
zz=glm(Pm ~ ., data=xx, family = binomial(link=links[j]), maxit=500)
}else if (family == "gaussian"){
zz=glm(Pm ~ ., data=xx, family = gaussian(link=links[j]), maxit=500)
}
xpress[i]=PRESS(zz)
xmse[i] = sum((zz$res)^2) / (N - length(zz$coef))
#print(xpress[i])
}}
if (aux_var==1){
# Test using PRESS objective function
glm_press[j]=min(xpress)
best_comb[j]=which.min(xpress)
}else{
# Test using PRESS objective function
glm_press[j]=200000
best_comb[j]=which.min(xpress)
}
}
press_df = data.frame(glm_press)
rownames(press_df) = links[1:length(links)]
print("Results of PRESS for bestfit GLM")
print(press_df)
print(best_comb[which.min(glm_press)])
sprintf("Choosing the GLM which minimizes PRESS: %s family and %s link function.", family, links[which.min(glm_press)])
xx = X[,combos[best_comb[which.min(glm_press)],]]
xx=as.data.frame(xx)
if (family == "gamma") {
bestmod = glm(Pm ~ ., data = xx, family = Gamma(link=links[which.min(glm_press)]))
} else if (family == "binomial") {
bestmod = glm(Pm ~ ., data = xx, family = binomial(link=links[which.min(glm_press)]))
} else if (family == "gaussian") {
bestmod = glm(Pm ~ ., data = xx, family = gaussian(link=links[which.min(glm_press)]))
} else {
print("Error!")
}
bestmod$Call$LINK=links[which.min(glm_press)]
bestmod$Call$PRESS=PRESS(bestmod)
bestmod$Call$combo=combos[best_comb[which.min(glm_press)],]
print(bestmod$Call$LINK)
return(bestmod)
}
bestmod = GLM_fit(y, X, 'binomial')
## [1] "Results of PRESS for bestfit GLM"
## glm_press
## logit 66.95058
## probit 66.72346
## cauchit 66.60423
## [1] 1
## [1] "cauchit"
anova(bestmod)
## Analysis of Deviance Table
##
## Model: binomial, link: cauchit
##
## Response: Pm
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 72 83.708
## xx 1 21.395 71 62.313
bestmod= glm(y ~ Lon, data = X, family = binomial(link='cauchit'))
summary(bestmod)
##
## Call:
## glm(formula = y ~ Lon, family = binomial(link = "cauchit"), data = X)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9077 -0.4983 -0.3842 0.7281 2.2594
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -72.2264 29.2726 -2.467 0.0136 *
## Lon -0.6491 0.2630 -2.468 0.0136 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 83.708 on 72 degrees of freedom
## Residual deviance: 62.313 on 71 degrees of freedom
## AIC: 66.313
##
## Number of Fisher Scoring iterations: 8
anova(bestmod)
## Analysis of Deviance Table
##
## Model: binomial, link: cauchit
##
## Response: y
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 72 83.708
## Lon 1 21.395 71 62.313
X1= X
plot_surface =function(lon, lat, ypred, lon1, lat1, yob,save,type=""){
zz = interp(lat,lon,ypred$fit)
zz2 = interp(lat1,lon1,yob)
par(mfrow=c(1,3))
surface(zz,xlab="Latitude",ylab ="Longitude",main='Predicted Precipitation (mm)'
,horizontal = TRUE, zlim=range(ypred$fit,yob), xlim=range(lat,lat1),ylim=range(lon,lon1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
surface(zz2,xlab="Latitude",ylab ="Longitude",main='Actual Precipitation (mm)'
,horizontal = TRUE, zlim=range(ypred$fit,yob), xlim=range(lat,lat1),ylim=range(lon,lon1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
zz = interp(lat,lon,ypred$se)
surface(zz,xlab="Latitude",ylab ="Longitude",main='Predicted Standard Error (mm)'
,horizontal = TRUE, zlim=range(ypred$se), xlim=range(lat,lat1),ylim=range(lon,lon1))
grid(col= "grey50", lty = "dashed")
US( add=TRUE, col="gray30", lwd=2)
}
ypred =predict(bestmod, newdata=Elev_grid, se.fit=T)
plot_surface(Elev_grid[,1], Elev_grid[,2], ypred,X[,1], X[,2],y, save)
Pm = y
links = c("logit", "probit", 'cauchit')
N = length(Pm)
combs = leaps(X,Pm, nbest=25) # GEt upto 25 combinations for each
# number of predictors
combos = combs$which
ncombos = length(combos[,1])
gcvs_m=1:ncombos
glm_gcv=vector(length = length(links))
best_comb=vector(length = length(links))
for(j in 1:length(links)) { sprintf("j",j)
for(i in 1:ncombos) {
xx = X[,combos[i,]]
xx=as.data.frame(xx)
zz=locpoly_fit(Pm, xx,family="binomial", link=links[j], glm=TRUE,plot=TRUE)
gcvs_m[i] = zz$call$gcv
}
# Test using GCV objective function
glm_gcv[j]=min(gcvs_m)
best_comb[j]=which.min(gcvs_m)
}
bestmod1=locpoly_fit(Pm, X[combos[best_comb[which.min(glm_gcv)],]],family="binomial",
link=links[which.min(glm_gcv)], glm=TRUE,plot=TRUE)
bestmod1$combo=combos[best_comb[which.min(glm_gcv)],]
### gaussian distribution
gcvs_m=1:ncombos
for(i in 1:ncombos) {
xx = X[,combos[i,]]
xx=as.data.frame(xx)
zz=locpoly_fit(Pm, xx,family="binomial", link="logit", glm=TRUE,plot=TRUE)
gcvs_m[i] = zz$call$gcv
}
bestmod2=locpoly_fit(Pm,X[combos[which.min(gcvs_m),]],family="binomial", link="logit", glm=TRUE,plot=TRUE)
bestmod2$combo=combos[which.min(gcvs_m),]
if (bestmod2$call$gcv>bestmod1$call$gcv){
bestmod=bestmod1
}else{
bestmod=bestmod2
}
X=X[,bestmod$combo]
summary(bestmod)
## Estimation type: Local Likelihood - Binomial
##
## Call:
## locfit(formula = Pm ~ ., data = X, alpha = 0.6, maxk = 10000,
## deg = 2, kern = "bisq", scale = T, family = "binomial", link = "probit",
## gcv = 0.80530628550884)
##
## Number of data points: 73
## Independent variables: Lon
## Evaluation structure: Rectangular Tree
## Number of evaluation points: 7
## Degree of fit: 2
## Fitted Degrees of Freedom: 5.475
res.aov <- aov(Mean_Rainfall ~Lon^2,data=win_data)
print(res.aov)
## Call:
## aov(formula = Mean_Rainfall ~ Lon^2, data = win_data)
##
## Terms:
## Lon Residuals
## Sum of Squares 1684.722 6350.392
## Deg. of Freedom 1 71
##
## Residual standard error: 9.457386
## Estimated effects may be unbalanced
ypred =predict(bestmod, newdata=Elev_grid, se.fit=T)
plot_surface(Elev_grid[,1], Elev_grid[,2], ypred,X1[,1], X1[,2],y, save)