library(VGAM) library(nnet) library(MASS) library(verification) ################### ### Perform PCA ### ################### #get variance matrix.. zs=var(Xpredictors) #do an Eigen decomposition.. zsvd=svd(zs) #Principal Components... pcs=t(t(zsvd$u) %*% t(Xpredictors)) # eigenvector * X matrix #Eigen Values.. - fraction variance lambdas=(zsvd$d/sum(zsvd$d)) ## Eigen Spectrum plot(1:20, lambdas[1:20], type="l", xlab="Modes", ylab="Frac. Var. explained") points(1:20, lambdas[1:20], col="red") ## Eigen Vectors ### plot zsvd$u[,1:4] ### against the variables ### Identify the variables that have a strong influence ### on the injuries. ## part(b) ### fit Multinomial regression ### create binary vector bpartbin=as.numeric(bpart) pcs=cbind(bpartbin,pcs) pcs = data.frame(pcs) ## model with first 5 PCS zz = multinom(bpartbin ~ V2+V3+V4+V5+V6, data=pcs) ## or model with all the PCS zz = multinom(bpartbin ~ ., data=pcs) N = log(length(bpart)) ## BIC zbest=stepAIC(zz,k=N) #################### RPSS ypred = predict(zz, type = "probs") # RPSS calculations acc2 = as.numeric(bpart) N = length(acc2) ### using verification package you get the same results.. # Empirical probabilities p1 <- length(acc2[acc2 == 1])/N p2 <- length(acc2[acc2 == 2])/N p3 <- length(acc2[acc2 == 3])/N p4 <- length(acc2[acc2 == 4])/N p5 <- length(acc2[acc2 == 5])/N climo <- c(p1, p2, p3,p4,p5) rps(acc2, ypred, baseline=climo )$rpss ########### part (c) # retain top 20 variables to.retain=20 imp.vars.energy=as.character(read.csv("gbm.importance.50.energy.csv",header=TRUE)[1:to.retain,"var"]) imp.vars.inj.type=as.character(read.csv("gbm.importance.50.code.csv",header=TRUE)[1:to.retain,"var"]) imp.vars.body=as.character(read.csv("gbm.importance.50.body.csv",header=TRUE)[1:to.retain,"var"]) imp.vars.severity=c("crane","heavy.material.tool","stairs","unpowered.tool","exiting.transitioning","heavy.vehicle","uneven.surface","drill","ladder","object.at.height","improper.procedure.inattention","unpowered.transporter","machinery","improper.security.of.materials","working.at.height","no.improper.PPE","small.particle","steel.steel.sections","stripping","scaffold") # find column numbers corresponding to the important variables (where binary is some attribute and outcome data set) ## Read Body part data and select the important variables data = read.csv("data.worst.case.severity.csv") index=which(colnames(data)%in%imp.vars.body) Xpredictors = data[,index] sev=as.character(data[,1]) # Divide body parts into five classes sev[sev == "1st Aid"]=1 sev[sev == "Medical Case"]=2 sev[sev == "Lost Work Time"]=3 sev[sev == "Permanent Disablement"]=4 sev[sev == "Fatality"]=5 ### create binary vector sevbin=as.numeric(sev) ### repeat steps from above.