# Logistic Regression
# For problem 15-1 from the Helsel book

# Read the data locally:
# test = matrix(scan("prob15-1.txt", skip=1), ncol=5, byrow=T)
# OR from the course website:
test = matrix(scan("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess5/prob-15-1.txt", skip=1), ncol=5, byrow=T)

Y = test[,5]    # response variable - contamination, 0 or 1
X = test[,1:4]  # predictor variables

logcalib = glm(Y ~ X, family="binomial")
summary(logcalib)

Call:
glm(formula = Y ~ X, family = "binomial")

Deviance Residuals:
     Min        1Q    Median        3Q       Max
-2.09450  -0.33494  -0.08567  -0.04624   2.13239

Coefficients:
             Estimate Std. Error z value Pr(>|z|)
(Intercept) -13.20539    3.55814  -3.711 0.000206 ***
X1            0.51527    0.15094   3.414 0.000641 ***
X2            0.42909    0.27487   1.561 0.118506
X3            0.03035    0.32461   0.093 0.925515
X4            1.08951    0.29863   3.648 0.000264 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 91.474  on 123  degrees of freedom
Residual deviance: 49.704  on 119  degrees of freedom
AIC: 59.704

Number of Fisher Scoring iterations: 7

####################################################
# Variables X2 and X3 have high p-values, i.e., they are insignificant.
# We refit the model without these two variables.
# (A likelihood-ratio cross-check of this choice is sketched at the
# end of this script.)

X = cbind(test[,1], test[,4])  # drop the X2 and X3 predictor variables
logcalib = glm(Y ~ X, family="binomial")
summary(logcalib)

Call:
glm(formula = Y ~ X, family = "binomial")

Deviance Residuals:
     Min        1Q    Median        3Q       Max
-2.07135  -0.28760  -0.11266  -0.06592   2.05460

Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept) -10.8904     2.4344  -4.474  7.7e-06 ***
X1            0.4636     0.1358   3.415 0.000638 ***
X2            1.0740     0.2830   3.795 0.000148 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 91.474  on 123  degrees of freedom
Residual deviance: 52.545  on 121  degrees of freedom
AIC: 58.545

Number of Fisher Scoring iterations: 7

# Note: X1 and X2 above are just the column labels of the new
# two-column matrix X; the second column is the original X4.

#################################################
# Notice that the AIC has also improved relative to the full model
# (58.545 vs. 59.704).

# A stepwise search by AIC selects a slightly different model: it keeps
# V2 along with V1 and V4, because dropping V2 would raise the AIC
# (57.713 below vs. 58.545 above), even though V2's p-value is high.

library(MASS)  # for stepAIC

X = test[,1:4]
X = as.data.frame(X)

# Full model
fullm = glm(Y ~ ., data=X, family="binomial")

# Do a stepwise selection by AIC
zstep = stepAIC(fullm)

# Output of the best model
summary(zstep)

Call:
glm(formula = Y ~ V1 + V2 + V4, family = "binomial", data = X)

Deviance Residuals:
     Min        1Q    Median        3Q       Max
-2.08079  -0.33644  -0.08457  -0.04780   2.14428

Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept) -13.0457     3.0983  -4.211 2.55e-05 ***
V1            0.5161     0.1508   3.422 0.000622 ***
V2            0.4257     0.2712   1.569 0.116541
V4            1.0859     0.2952   3.679 0.000234 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 91.474  on 123  degrees of freedom
Residual deviance: 49.713  on 120  degrees of freedom
AIC: 57.713

Number of Fisher Scoring iterations: 7

#############################################################################
# Estimate using the best model
logpred = predict.glm(zstep, newdata=X, se.fit=T, type="response")

# logpred$fit contains the estimates (fitted probabilities) from the
# logistic regression; logpred$se.fit contains their standard errors.

# Note: unlike predict.lm(), predict.glm() does not accept an
# interval= argument, so interval="predict" would be silently ignored.
# An approximate confidence interval for the predicted probabilities
# can instead be built on the link scale -- see the sketch below.
##################################################################
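# A minimal sketch (an addition, not part of the original session) of
# how one could get approximate 95% confidence intervals for the
# predicted probabilities: predict on the link (logit) scale, where the
# normal approximation for the estimate is more defensible, then
# back-transform with plogis(). The object names zstep and X follow the
# session above; the 1.96 multiplier assumes normality on the link scale.

linkpred = predict.glm(zstep, newdata=X, se.fit=TRUE, type="link")

# Interval endpoints on the logit scale, mapped back to probabilities
ci.lower = plogis(linkpred$fit - 1.96 * linkpred$se.fit)
ci.upper = plogis(linkpred$fit + 1.96 * linkpred$se.fit)

# Fitted probabilities with their approximate interval bounds
cbind(p.hat = plogis(linkpred$fit), lower = ci.lower, upper = ci.upper)
##################################################################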
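# A sketch (again an addition, not in the original session) of a
# likelihood-ratio (deviance) test as a cross-check on dropping X2 and
# X3 by their p-values: compare the reduced model against the full
# model with anova(..., test="Chisq"). V1..V4 are the default
# data.frame column labels used above.

X = as.data.frame(test[,1:4])
fullm = glm(Y ~ V1 + V2 + V3 + V4, data=X, family="binomial")
redm  = glm(Y ~ V1 + V4,           data=X, family="binomial")

# A large p-value here would indicate that the dropped terms (V2, V3)
# do not significantly improve the fit
anova(redm, fullm, test="Chisq")
##################################################################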