Homework

#Winter (Dec-Mar)
Win_df = read.table("Max_Winter_Seas_Prec.txt", header = T)

#Summer (Jun-Sep)
#Sum_df = read.table("Max_summer_Seas_Prec.txt", header = T)

#this is summer data/ need for spatial elements
Precep_df = read.table("Precipitaton_data.txt", header = T)


Elev_grid = read.table('Elevation_grid_1deg (1).txt',header = T)

merge_data <- function(df1, df2) {
  #Transpose data
  df1 = setNames(data.frame(t(df1[,-1])), df1[,1])

  #Create the mean column per location 
  df1['Mean_Rainfall'] = rowMeans(df1)
  
  #resetting the index so that location is a column rather than an index
  df1 <- cbind(Location = rownames(df1), df1)
  rownames(df1) <- 1:nrow(df1)
  
  # merge to have long/lat data
  df3 = merge(df1, df2, by="row.names", all.x=TRUE) 
  
  df3 <- df3[, -c(1, 3:57)]

  return(df3)
}

win_data = merge_data(Win_df, Precep_df)

y = win_data[,2]
X = win_data[,3:5]

#create binary col. based on the 75th percentile
df = data.frame(win_data[2], (win_data[2] >= 22.80909) * 1)
win_data$rainfall_bin = df$Mean_Rainfall.1

y = unlist(win_data[7], use.names = FALSE)
X = win_data[,3:5]

The GLM model

#investigate the logit link function
glmfit= glm(y ~ ., data = X, family = binomial(link='logit'))
summary(glmfit)

## 
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = X)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7437  -0.6743  -0.3595   0.8778   2.1986  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.902e+01  1.364e+01  -3.595 0.000324 ***
## Lon         -4.142e-01  1.199e-01  -3.455 0.000551 ***
## Lat          7.964e-02  1.287e-01   0.619 0.536095    
## Elev         1.331e-05  6.876e-04   0.019 0.984554    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.708  on 72  degrees of freedom
## Residual deviance: 62.163  on 69  degrees of freedom
## AIC: 70.163
## 
## Number of Fisher Scoring iterations: 5

anova(glmfit)

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: y
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev
## NULL                    72     83.708
## Lon   1  20.9843        71     62.724
## Lat   1   0.5607        70     62.163
## Elev  1   0.0004        69     62.163

#investigate the probit link function
glmfit= glm(y ~ ., data = X, family = binomial(link='probit'))
summary(glmfit)

## 
## Call:
## glm(formula = y ~ ., family = binomial(link = "probit"), data = X)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7308  -0.6735  -0.3473   0.8854   2.1953  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.896e+01  7.465e+00  -3.879 0.000105 ***
## Lon         -2.425e-01  6.519e-02  -3.720 0.000199 ***
## Lat          5.280e-02  7.444e-02   0.709 0.478169    
## Elev         2.647e-05  3.953e-04   0.067 0.946623    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.708  on 72  degrees of freedom
## Residual deviance: 61.860  on 69  degrees of freedom
## AIC: 69.86
## 
## Number of Fisher Scoring iterations: 6

anova(glmfit)

## Analysis of Deviance Table
## 
## Model: binomial, link: probit
## 
## Response: y
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev
## NULL                    72     83.708
## Lon   1  21.0926        71     62.616
## Lat   1   0.7508        70     61.865
## Elev  1   0.0045        69     61.860

The models using the logit and probit link functions preform very similarly, but the cauchit ultimately performs the best.

GLM_fit = function(Pm, X, family, month) {
  
  if (family == "gamma") {
    links = c("log", "inverse","identity")   
    
    # clean data and remove zeros
    Pm = ifelse(Pm <=0, runif(1, 0.0001, 0.001), Pm)
    
  } else if (family == "binomial"){
    links = c("logit", "probit", "cauchit")
  } else if (family == "gaussian"){
    links = c("identity")
  }
  
  N = length(Pm)
  
  combs = leaps(X,Pm, nbest=25)     #  GEt upto 25 combinations for each
  # number of predictors
  combos = combs$which
  ncombos = length(combos[,1])
  glm_press=vector(length = length(links))
  best_comb=vector(length = length(links))
  xpress=1:ncombos
  xmse = 1:ncombos
  
  for(j in 1:length(links)) {
    aux_var=1 # allow to fit glm
    ##remove small values for gamma distribution
    if (family == "gamma"){
      if (links[j]=="identity"){
        if( month==1){
          Pm[Pm<=0.005]=0.005
        }else{
          Pm[Pm<=2.5]=2.5
        }
        
      }else if (links[j]=="inverse"){
        Pm[Pm<=25]=25 # it is necessary to modific significantly the small values, 
        #so, "inverse" is not fitted for January
        aux_var=0
      }
    }
    if (aux_var==1)
      {for(i in 1:ncombos) {
        xx = X[,combos[i,]]
        xx=as.data.frame(xx)
        if (family == "gamma"){
          zz=glm(Pm ~ ., data=xx, family = Gamma(link=links[j]), maxit=500)
        }else if (family == "binomial"){
          zz=glm(Pm ~ ., data=xx, family = binomial(link=links[j]), maxit=500)
        }else if (family == "gaussian"){
          zz=glm(Pm ~ ., data=xx, family = gaussian(link=links[j]), maxit=500)
        }
        xpress[i]=PRESS(zz)
        xmse[i] = sum((zz$res)^2) / (N - length(zz$coef))
        #print(xpress[i])
      }}
    if (aux_var==1){
      # Test using PRESS objective function
      glm_press[j]=min(xpress)
      best_comb[j]=which.min(xpress)
    }else{
      # Test using PRESS objective function
      glm_press[j]=200000
      best_comb[j]=which.min(xpress)
    }
  }
  
  press_df = data.frame(glm_press)
  rownames(press_df) = links[1:length(links)]
  
  print("Results of PRESS for bestfit GLM")
  print(press_df)
  print(best_comb[which.min(glm_press)])
  
  sprintf("Choosing the GLM which minimizes PRESS: %s family and %s link function.", family, links[which.min(glm_press)])
  xx = X[,combos[best_comb[which.min(glm_press)],]]
  xx=as.data.frame(xx)
  if (family == "gamma") {
    bestmod = glm(Pm ~ ., data = xx, family = Gamma(link=links[which.min(glm_press)]))
  } else if (family == "binomial") {
    bestmod = glm(Pm ~ ., data = xx, family = binomial(link=links[which.min(glm_press)]))
  }  else if (family == "gaussian") {
    bestmod = glm(Pm ~ ., data = xx, family = gaussian(link=links[which.min(glm_press)]))
  } else { 
    print("Error!")
  }
  bestmod$Call$LINK=links[which.min(glm_press)]
  bestmod$Call$PRESS=PRESS(bestmod)
  bestmod$Call$combo=combos[best_comb[which.min(glm_press)],]
  print(bestmod$Call$LINK)
  return(bestmod)
}

bestmod = GLM_fit(y, X, 'binomial')

## [1] "Results of PRESS for bestfit GLM"
##         glm_press
## logit    66.95058
## probit   66.72346
## cauchit  66.60423
## [1] 1
## [1] "cauchit"

anova(bestmod)

## Analysis of Deviance Table
## 
## Model: binomial, link: cauchit
## 
## Response: Pm
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev
## NULL                    72     83.708
## xx    1   21.395        71     62.313

bestmod= glm(y ~ Lon, data = X, family = binomial(link='cauchit'))
summary(bestmod)

## 
## Call:
## glm(formula = y ~ Lon, family = binomial(link = "cauchit"), data = X)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9077  -0.4983  -0.3842   0.7281   2.2594  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -72.2264    29.2726  -2.467   0.0136 *
## Lon          -0.6491     0.2630  -2.468   0.0136 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.708  on 72  degrees of freedom
## Residual deviance: 62.313  on 71  degrees of freedom
## AIC: 66.313
## 
## Number of Fisher Scoring iterations: 8

anova(bestmod)

## Analysis of Deviance Table
## 
## Model: binomial, link: cauchit
## 
## Response: y
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev
## NULL                    72     83.708
## Lon   1   21.395        71     62.313

X1= X

plot_surface =function(lon, lat, ypred, lon1, lat1, yob,save,type=""){
      zz = interp(lat,lon,ypred$fit)
      zz2 = interp(lat1,lon1,yob)
  
        par(mfrow=c(1,3))
        surface(zz,xlab="Latitude",ylab ="Longitude",main='Predicted Precipitation (mm)'
                ,horizontal = TRUE, zlim=range(ypred$fit,yob), xlim=range(lat,lat1),ylim=range(lon,lon1))
        grid(col= "grey50", lty = "dashed")
        US( add=TRUE, col="gray30", lwd=2)
        
        surface(zz2,xlab="Latitude",ylab ="Longitude",main='Actual Precipitation (mm)'
                ,horizontal = TRUE, zlim=range(ypred$fit,yob), xlim=range(lat,lat1),ylim=range(lon,lon1))
        grid(col= "grey50", lty = "dashed")
        US( add=TRUE, col="gray30", lwd=2)
        
        
        zz = interp(lat,lon,ypred$se)
        
        
        surface(zz,xlab="Latitude",ylab ="Longitude",main='Predicted Standard Error (mm)'
                ,horizontal = TRUE, zlim=range(ypred$se), xlim=range(lat,lat1),ylim=range(lon,lon1))
        grid(col= "grey50", lty = "dashed")
        US( add=TRUE, col="gray30", lwd=2)
  
}


ypred =predict(bestmod, newdata=Elev_grid, se.fit=T)
plot_surface(Elev_grid[,1], Elev_grid[,2], ypred,X[,1], X[,2],y, save)

Local GLM

Pm = y
links = c("logit", "probit", 'cauchit') 
N = length(Pm)
combs = leaps(X,Pm, nbest=25)     #  GEt upto 25 combinations for each
# number of predictors
combos = combs$which
ncombos = length(combos[,1])
gcvs_m=1:ncombos
glm_gcv=vector(length = length(links))
best_comb=vector(length = length(links))
for(j in 1:length(links)) { sprintf("j",j)
  
  for(i in 1:ncombos) {
    xx = X[,combos[i,]]
    xx=as.data.frame(xx)
    zz=locpoly_fit(Pm, xx,family="binomial", link=links[j], glm=TRUE,plot=TRUE)
    gcvs_m[i] = zz$call$gcv
  }
  # Test using GCV objective function
  glm_gcv[j]=min(gcvs_m)
  best_comb[j]=which.min(gcvs_m)
}
bestmod1=locpoly_fit(Pm, X[combos[best_comb[which.min(glm_gcv)],]],family="binomial",
                     link=links[which.min(glm_gcv)], glm=TRUE,plot=TRUE)
bestmod1$combo=combos[best_comb[which.min(glm_gcv)],]
###  gaussian distribution
gcvs_m=1:ncombos
for(i in 1:ncombos) {
  xx = X[,combos[i,]]
  xx=as.data.frame(xx)
  zz=locpoly_fit(Pm, xx,family="binomial", link="logit", glm=TRUE,plot=TRUE)
  gcvs_m[i] = zz$call$gcv
}
bestmod2=locpoly_fit(Pm,X[combos[which.min(gcvs_m),]],family="binomial", link="logit", glm=TRUE,plot=TRUE)
bestmod2$combo=combos[which.min(gcvs_m),]
if (bestmod2$call$gcv>bestmod1$call$gcv){
  bestmod=bestmod1
}else{
  bestmod=bestmod2
}
X=X[,bestmod$combo]

summary(bestmod)

## Estimation type: Local Likelihood - Binomial 
## 
## Call:
## locfit(formula = Pm ~ ., data = X, alpha = 0.6, maxk = 10000, 
##     deg = 2, kern = "bisq", scale = T, family = "binomial", link = "probit", 
##     gcv = 0.80530628550884)
## 
## Number of data points:  73 
## Independent variables:  Lon 
## Evaluation structure: Rectangular Tree 
## Number of evaluation points:  7 
## Degree of fit:  2 
## Fitted Degrees of Freedom:  5.475

res.aov <- aov(Mean_Rainfall ~Lon^2,data=win_data)

print(res.aov)

## Call:
##    aov(formula = Mean_Rainfall ~ Lon^2, data = win_data)
## 
## Terms:
##                      Lon Residuals
## Sum of Squares  1684.722  6350.392
## Deg. of Freedom        1        71
## 
## Residual standard error: 9.457386
## Estimated effects may be unbalanced

ypred =predict(bestmod, newdata=Elev_grid, se.fit=T)
plot_surface(Elev_grid[,1], Elev_grid[,2], ypred,X1[,1], X1[,2],y, save)

Homework_2

Jasmine Garland

10/28/2021

The GLM model

Local GLM