# Commands for Multiple Regression
# You can use the same for Simple Regression as well..


#Suppose the best set of predictor variables or independent variables are in matrix X
# You have this from the subset selection procedure..

# The dependent variable is in vector Y

#For Example,
test=matrix(scan("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess5/Avg-SpringFlows-April1SWE-49-04.txt",skip=1), ncol=2, byrow=T)

Y=test[,1]		#Spring river flows
X=test[,2]		#April 1 Snow water equivalent
X = as.matrix(X)	#if X is a single column it will make it a matrix 

# do a linear regression..
zz=lsfit(X,Y)

N=length(Y)	# of data points..
X1=cbind(rep(1,n),X)	#add the first column of 1s

betas=solve(t(X1) %*% X1)%*%t(X1)%*%Y	#betas = (X^TX)^-1 X^T Y

#command solve gives the inverse of a matrix
#t(X) gives the transpose of a matrix
#%*% is the command for matrix multiplication.


bestmodel = lsfit(X,Y)		#Linear regress fit

ls.print(bestmodel)			# spits out the F-test, R2, variable significance
				# F-test provides significance of the model

##### quantities for ANOVA

SST = sum((Y - mean(Y))^2) 

Yhat = Y - bestmodel$res    #(Y - Yhat = residuals)

SSR = sum((Yhat - mean(Y))^2) 


SSE = sum((bestmodel$res)^2) 

MSR = SSR / ncol(X)            

MSE = SSE/(N - length(bestmodel$coef))    


Source of Variation				Sum of Squares		DOF
Mean Square

Regression					    SSR ncol(X)           SSR/ncol(X)

Error or REsidual				    SSE		N - (ncol(X) +1)  SSE / (N - ncol(X) -1)

Fo = MSR/MSE 

(Same as the F-statistic from the output of ls.print(bestmodelp) above).

>pvalue  =  1 - pf(Fo,ncol(X),(N-length(bestmodelp$coef)))     

#Regression is sifnfiicant if p-value less than alpha..

#################################################################


N = length(Y)
XX = cbind(rep(1,N), X)

##########################3 Hat matrix and influence..
# Compute the Hat matrix
hatm = XX %*% solve(t(XX) %*% XX) %*% t(XX)

# You can also get the hat matrix from...

library(MPV)
zz = lm(Y ~ X)
hatm = hatvalues(zz)

#studentized residuals - ri  - equation 12-42
ri = bestmodel$res/sqrt((1 - diag(hatm)) * MSE)

#Compute Cook's Distance Equation 12-44
Di = ri*ri * diag(hatm) / ((1-diag(hatm)) * length(bestmodel$coef))

#If Dis are greater than 1 then there are points that have undue influence on the fit.

##############

# Model Diagnostics on the residuals can be done by 
# Use the commands in the file
http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess5/lsfit-diag-crossval-commands_elts.txt

# Normality of residuals
# Independence of residuals
# Homoskedasticity - Yhat versus residuals should not show any structure

##########################################################################