# Commands for Multiple Regression # You can use the same for Simple Regression as well.. #Suppose the best set of predictor variables or independent variables are in matrix X # You have this from the subset selection procedure.. # The dependent variable is in vector Y #For Example, test=matrix(scan("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess5/Avg-SpringFlows-April1SWE-49-04.txt",skip=1), ncol=2, byrow=T) Y=test[,1] #Spring river flows X=test[,2] #April 1 Snow water equivalent X = as.matrix(X) #if X is a single column it will make it a matrix # do a linear regression.. zz=lsfit(X,Y) N=length(Y) # of data points.. X1=cbind(rep(1,n),X) #add the first column of 1s betas=solve(t(X1) %*% X1)%*%t(X1)%*%Y #betas = (X^TX)^-1 X^T Y #command solve gives the inverse of a matrix #t(X) gives the transpose of a matrix #%*% is the command for matrix multiplication. bestmodel = lsfit(X,Y) #Linear regress fit ls.print(bestmodel) # spits out the F-test, R2, variable significance # F-test provides significance of the model ##### quantities for ANOVA SST = sum((Y - mean(Y))^2) Yhat = Y - bestmodel$res #(Y - Yhat = residuals) SSR = sum((Yhat - mean(Y))^2) SSE = sum((bestmodel$res)^2) MSR = SSR / ncol(X) MSE = SSE/(N - length(bestmodel$coef)) Source of Variation Sum of Squares DOF Mean Square Regression SSR ncol(X) SSR/ncol(X) Error or REsidual SSE N - (ncol(X) +1) SSE / (N - ncol(X) -1) Fo = MSR/MSE (Same as the F-statistic from the output of ls.print(bestmodelp) above). >pvalue = 1 - pf(Fo,ncol(X),(N-length(bestmodelp$coef))) #Regression is sifnfiicant if p-value less than alpha.. ################################################################# N = length(Y) XX = cbind(rep(1,N), X) ##########################3 Hat matrix and influence.. # Compute the Hat matrix hatm = XX %*% solve(t(XX) %*% XX) %*% t(XX) # You can also get the hat matrix from... library(MPV) zz = lm(Y ~ X) hatm = hatvalues(zz) #studentized residuals - ri - equation 12-42 ri = bestmodel$res/sqrt((1 - diag(hatm)) * MSE) #Compute Cook's Distance Equation 12-44 Di = ri*ri * diag(hatm) / ((1-diag(hatm)) * length(bestmodel$coef)) #If Dis are greater than 1 then there are points that have undue influence on the fit. ############## # Model Diagnostics on the residuals can be done by # Use the commands in the file http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess5/lsfit-diag-crossval-commands_elts.txt # Normality of residuals # Independence of residuals # Homoskedasticity - Yhat versus residuals should not show any structure ##########################################################################