5. PCA + Multinomial Regression
Construction safety outcomes data have been compiled from accident reports and described in Tixier et al. (2016). The data, attribute information and relevant R- commands can be found at http://civil.colorado.edu/~balajir/CVEN6833/const-data For the safety outcome of body part, with five categories (see Table 2 of Tixier et al., 2016).  Twenty leading attributes, all binary (1 or 0 – i.e., present or absent) are provided for each accident record.
(a) Perform a PCA on the attribute data - show the Eigen spectrum and the leading four Eigen vectors
(b) Fit a best multinomial regression with the principal components as predictors to model the categorical probability of injuries to the five category of body parts and compute the RPSS
(c) Repeat (b) for the Injury Severity.  Tixier et al. (2016) modeled these using Random Forests and Stochastic Boosting. They could not get good skill for Injury Severity. Compare your results with theirs.
(d) Apply CART and compare the results

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
#### Initialize Graphics
graphics.off()
.pardefault <- par()

#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 02/"
setwd(mainDir)
suppressPackageStartupMessages(source("hw2_library.R"))
source("hw2_library.R")

(a) Perform a PCA on the attribute data - show the Eigen spectrum and the leading four Eigen vectors

## Read Data
# Retain top 20 variables
to.retain = 20
imp.vars.body = as.character(read.csv("./data/prob5/gbm.importance.50.body.csv", header = T)[1:to.retain,"var"])
imp.vars.severity = as.character(read.csv("./data/prob5/gbm.importance.50.code.csv", header = T)[1:to.retain,"var"])

## Read Body Part data and select the important variables
data = read.csv("./data/prob5/data.body.part.csv")
colnames(data) <- as.character(colnames(data))
index = which(colnames(data)%in%imp.vars.body)
Xpredictors = data[,index]
bpart = data[,1]

bpart=as.character(bpart)
bpart[bpart == "head"]=1
bpart[bpart == "neck"]=2
bpart[bpart == "trunk"]=3
bpart[bpart == "upper extremities"]=4
bpart[bpart == "lower extremities"]=5

###################
### Perform PCA ###
###################

#get variance matrix..
zs=var(Xpredictors)

#do an Eigen decomposition..
zsvd=svd(zs)

#Principal Components...
pcs=t(t(zsvd$u) %*% t(Xpredictors))  # eigenvector * X matrix 

#Eigen Values.. - fraction variance 
lambdas=(zsvd$d/sum(zsvd$d))

## Eigen Spectrum

plot(1:20, lambdas[1:20], type="l", xlab="Modes", ylab="Frac. Var. Explained")
points(1:20, lambdas[1:20], col="red")

print(sprintf("The first 4 PCs explain %s%% of the variance", round(sum(lambdas[1:4])*100)))
## [1] "The first 4 PCs explain 46% of the variance"
print(sprintf("The first 6 PCs explain %s%% of the variance", round(sum(lambdas[1:6])*100)))
## [1] "The first 6 PCs explain 57% of the variance"
## Eigen Vectors
### plot zsvd$u[,1:4]
### against the variables
### Identify the variables that have a strong influence
### on the injuries.
tu = expand.grid(Eofs = gl(4, 1, labels = c("EOF no.1","EOF no.2","EOF no.3","EOF no.4")),
                 variable = imp.vars.body)
a=vector(length = 80)
a[1:4]=zsvd$u[1,1:4]
for (i in 2:20) {
  ind1=4*(i-1)+1
  ind=ind1+3
  a[ind1:ind]=zsvd$u[i,1:4]
  
}
tu$Eof= a
show(ggplot(tu, aes(x = variable, y = Eof, color = Eofs, group = Eofs)) + 
  geom_point(size=3) + geom_line()+
  labs(title = "Eigen vectors injury precursors",x = "variable",y = "Influence",color="")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))+
  theme(axis.text.x=element_text(color = "black", size=11, angle=90, vjust=.8, hjust=0.8))+
  coord_flip())

(b) Fit a best multinomial regression with the principal components as predictors to model the categorical probability of injuries to the five category of body parts and compute the RPSS

### create binary vector
bpartbin = as.numeric(bpart)
pcs=cbind(bpartbin,pcs)
pcs = data.frame(pcs)

## model with first 5 PCs
#zz = multinom(bpart ~ V2+V3+V4+V5+V6, data=pcs, MaxNWts = 10000)
## or model with all the PCS
zz = multinom(bpartbin ~ ., data=pcs)
## # weights:  110 (84 variable)
## initial  value 6830.454500 
## iter  10 value 4802.845600
## iter  20 value 4681.045845
## iter  30 value 4667.484616
## iter  40 value 4665.052067
## iter  50 value 4664.133716
## iter  60 value 4663.837152
## iter  70 value 4663.749986
## iter  80 value 4663.668413
## final  value 4663.655477 
## converged
N = log(length(bpartbin))
print(summary(zbest))
## Call:
## multinom(formula = bpartbin ~ V2 + V3 + V4 + V6 + V7 + V8 + V9 + 
##     V12 + V14 + V15 + V17 + V20 + V21, data = pcs)
## 
## Coefficients:
##   (Intercept)        V2         V3         V4        V6        V7          V8
## 2  -1.8715403 -2.117657  0.7444662  2.2291403 2.7728561 -2.859202 -5.06051710
## 3  -0.2512364 -3.945427  7.8531051  1.0534055 0.6796384  1.412624  0.09345112
## 4   1.0090588 -1.829775  4.9939418 -0.6960140 0.6249908  3.050353 -1.07466342
## 5   0.2795602 -3.909752 12.7226743  0.6675859 2.2823349  1.776654  1.00568870
##            V9        V12        V14        V15        V17         V20
## 2  4.54543505  9.5003324  9.2493738 -11.271684 21.3996690 -174.901741
## 3 -0.07983149 -0.2164937 -2.2052052  -1.017061 -0.7406517   -4.155083
## 4 -0.83864614  0.2290801  0.4808086   1.357942 -0.9804030    2.700105
## 5  0.58910532 -1.1178187 -1.0133073   1.302331 -1.3559690    1.332043
##           V21
## 2 223.4749927
## 3  -0.4797571
## 4  -3.2937500
## 5  -1.2893075
## 
## Std. Errors:
##   (Intercept)        V2        V3        V4        V6        V7        V8
## 2  0.14450319 0.6873848 1.6222056 0.8975402 1.0366390 1.7731184 1.9411282
## 3  0.07671925 0.6121483 1.8827514 0.4086169 0.4716272 0.3400028 0.4634082
## 4  0.05908701 0.1822436 0.3584732 0.2217303 0.3923408 0.2956986 0.3686182
## 5  0.06766681 0.5863416 1.7828655 0.2888557 0.4100698 0.3204268 0.4177351
##          V9       V12       V14       V15       V17       V20       V21
## 2 1.8188829 2.3524723 3.2689684 3.4802727 5.9370087 54.749495 68.312069
## 3 0.4305687 0.6170746 0.7693763 0.3503601 0.6167398  3.912979  1.861419
## 4 0.3404493 0.2998880 0.3229168 0.3660987 0.3358988  1.041735  1.135893
## 5 0.3914876 0.4917368 0.4553335 0.4936806 0.5059824  1.109333  1.303356
## 
## Residual Deviance: 9390.197 
## AIC: 9502.197
#################### RPSS
ypred = predict(zbest, type = "probs")
# RPSS calculations
acc2 = as.numeric(bpart)
N = length(acc2)
acc2_name =""
#identify the numeric index
for (i in 1:5) {
  ind=which(acc2==i)
  acc2_name[i]=as.character(data$body.part[ind[1]])
}
### using verification package you get the same results..
# Empirical probabilities
p1 <- length(acc2[acc2 == 1])/N
p2 <- length(acc2[acc2 == 2])/N
p3 <- length(acc2[acc2 == 3])/N
p4 <- length(acc2[acc2 == 4])/N
p5 <- length(acc2[acc2 == 5])/N

prob = c(p1, p2, p3,p4, p5)
RPSS=rps(acc2, ypred, baseline=prob)$rpss
prob=as.data.frame(prob)
rownames(prob)=acc2_name
print("Empirical probabilities are:")
## [1] "Empirical probabilities are:"
print(prob)
##                         prob
## head              0.23963242
## neck              0.02403393
## trunk             0.12111216
## upper extremities 0.42035815
## lower extremities 0.19486334
print(sprintf('Body Part Multinomial Regression RPSS: %s', RPSS))
## [1] "Body Part Multinomial Regression RPSS: 0.281666586947724"

Comments

(c) Repeat (b) for the Injury Severity

# find column numbers corresponding to the important variables (where binary is some attribute and outcome data set)

## Read Injury Severity data and select the important variables
data1 = read.csv("./data/prob5/data.worst.case.severity.csv")
index=which(colnames(data1)%in%imp.vars.severity)
Xpredictors = data1[,index]
sev = data1[,1]

# Divide body parts into five classes
sev[sev == "1st Aid"]=1
sev[sev == "Medical Case"]=2
sev[sev == "Lost Work Time"]=3
sev[sev == "Permanent Disablement"]=4
sev[sev == "Fatality"]=5

### repeat steps from above.
#get variance matrix..
zs=var(Xpredictors)

#do an Eigen decomposition..
zsvd=svd(zs)

#Principal Components...
pcs1 = t(t(zsvd$u) %*% t(Xpredictors))  # eigenvector * X matrix 

#Eigen Values.. - fraction variance 
lambdas=(zsvd$d/sum(zsvd$d))

## Eigen Spectrum

plot(1:20, lambdas[1:20], type="l", xlab="Modes", ylab="Frac. Var. Explained")
points(1:20, lambdas[1:20], col="red")

print(sprintf("The first 4 PCs explain %s%% of the variance", round(sum(lambdas[1:4])*100)))
## [1] "The first 4 PCs explain 47% of the variance"
print(sprintf("The first 6 PCs explain %s%% of the variance", round(sum(lambdas[1:6])*100)))
## [1] "The first 6 PCs explain 60% of the variance"
## Eigen Vectors
### plot zsvd$u[,1:4]
### against the variables
### Identify the variables that have a strong influence
### on the injuries.
tu = expand.grid(Eofs = gl(4, 1, labels = c("EOF no.1","EOF no.2","EOF no.3","EOF no.4")),
                 variable = imp.vars.severity)
a=vector(length = 80)
a[1:4]=zsvd$u[1,1:4]
for (i in 2:20) {
  ind1=4*(i-1)+1
  ind=ind1+3
  a[ind1:ind]=zsvd$u[i,1:4]
  
}
tu$Eof= a
show(ggplot(tu, aes(x = variable, y = Eof, color = Eofs, group = Eofs)) + 
       geom_point(size=3) + geom_line()+
       labs(title = "Eigen vectors injury precursors",x = "variable",y = "Influence",color="")+
       theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))+
       theme(axis.text.x=element_text(color = "black", size=11, angle=90, vjust=.8, hjust=0.8))+
       coord_flip())

## part(b)
### fit Multinomial regression

### create binary vector
sevbin = as.numeric(sev)

pcs1 = cbind(sevbin,pcs1)
pcs1 = data.frame(pcs1)

## model with first 5 PCs
#zz = multinom(bpart ~ V2+V3+V4+V5+V6, data=pcs)
## or model with all the PCS
zz = multinom(sevbin ~ ., data=pcs1)
## # weights:  110 (84 variable)
## initial  value 3033.790465 
## iter  10 value 2259.280906
## iter  20 value 2193.854089
## iter  30 value 2183.696563
## iter  40 value 2181.897827
## iter  50 value 2181.379394
## iter  60 value 2181.332399
## iter  70 value 2181.327237
## final  value 2181.327085 
## converged
N = log(length(sevbin))
print(summary(zbest))
## Call:
## multinom(formula = sevbin ~ V5, data = pcs1)
## 
## Coefficients:
##   (Intercept)         V5
## 2   1.3856859 -0.5023897
## 3   1.1492696  1.0188690
## 4  -0.4902862 -0.1879632
## 5  -1.8079867  1.5778299
## 
## Std. Errors:
##   (Intercept)        V5
## 2  0.08127565 0.2718042
## 3  0.08325535 0.3228468
## 4  0.11794842 0.3901892
## 5  0.19168359 1.0502369
## 
## Residual Deviance: 4541.59 
## AIC: 4557.59
#################### RPSS
ypred = predict(zbest, type = "probs")
# RPSS calculations
acc2 = as.numeric(sev)
N = length(acc2)
acc2_name =""
#identify the numeric index
for (i in 1:4) {
  ind=which(acc2==i)
  acc2_name[i]=as.character(data1$worst.case.severity[ind[1]])
}
### using verification package you get the same results..
# Empirical probabilities
p1 <- length(acc2[acc2 == 1])/N
p2 <- length(acc2[acc2 == 2])/N
p3 <- length(acc2[acc2 == 3])/N
p4 <- length(acc2[acc2 == 4])/N

prob <- c(p1, p2, p3,p4)
RPSS=rps(acc2, ypred, baseline=prob)$rpss
prob=as.data.frame(prob)
rownames(prob)=acc2_name
print("Empirical probabilities are:")
## [1] "Empirical probabilities are:"
print(prob)
##                             prob
## 1st Aid               0.11140584
## Medical Case          0.47108753
## Lost Work Time        0.33103448
## Permanent Disablement 0.06949602
print(sprintf('Injury Severity Multinomial Regression RPSS: %s', RPSS))
## [1] "Injury Severity Multinomial Regression RPSS: 0.357734324492506"

Comments

Compare results with Tixier et al. (2016)

(d) Apply CART and compare the results

Apply CART for body part data

## Create complex (full) tree
data$body.part = factor(data$body.part)
myTree <- tree(body.part ~ ., data = data, model = T, method = 'class')
print(summary(myTree))
## 
## Classification tree:
## tree(formula = body.part ~ ., data = data, method = "class", 
##     model = T)
## Variables actually used in tree construction:
## [1] "small.particle"      "hazardous.substance" "object.on.the.floor"
## Number of terminal nodes:  4 
## Residual mean deviance:  2.397 = 10160 / 4240 
## Misclassification error rate: 0.4618 = 1960 / 4244
## Plot unpruned tree, calculate & plot pruned tree
treeFit = treeFun(myTree, toPlot = T, title = "Body Part Tree")

## Use tree to predict original data
treePred <- predict(treeFit)

# RPSS calculations
acc2 = as.numeric(bpart)
N = length(acc2)
acc2_name =""
#identify the numeric index
for (i in 1:5) {
  ind=which(acc2==i)
  acc2_name[i]=as.character(data$body.part[ind[1]])
}
### using verification package you get the same results..
# Empirical probabilities
p1 <- length(acc2[acc2 == 1])/N
p2 <- length(acc2[acc2 == 2])/N
p3 <- length(acc2[acc2 == 3])/N
p4 <- length(acc2[acc2 == 4])/N
p5 <- length(acc2[acc2 == 5])/N

prob = c(p1, p2, p3,p4, p5)
RPSS=rps(acc2, treePred, baseline=prob)$rpss
prob=as.data.frame(prob)
rownames(prob)=acc2_name
print("Empirical probabilities are:")
## [1] "Empirical probabilities are:"
print(prob)
##                         prob
## head              0.23963242
## neck              0.02403393
## trunk             0.12111216
## upper extremities 0.42035815
## lower extremities 0.19486334
print(sprintf('RPSS for Body Part CART: %s', RPSS))
## [1] "RPSS for Body Part CART: 0.0740356331139008"

Apply CART for injury severity data

## Create complex (full) tree
#myTree <- tree(worst.case.severity ~ ., data = data1, model = T)
data1$worst.case.severity = factor(data1$worst.case.severity)
myTree <- tree(worst.case.severity ~ ., data = data1, model = T, method = 'class')
print(summary(myTree))
## 
## Classification tree:
## tree(formula = worst.case.severity ~ ., data = data1, method = "class", 
##     model = T)
## Variables actually used in tree construction:
## [1] "small.particle"
## Number of terminal nodes:  2 
## Residual mean deviance:  2.412 = 4542 / 1883 
## Misclassification error rate: 0.5289 = 997 / 1885
## Plot unpruned tree, calculate & plot pruned tree
treeFit = treeFun(myTree, toPlot = T, title = "Injury Severity Tree")

## Use tree to predict original data
treePred <- predict(treeFit)
# RPSS calculations
acc2 = as.numeric(sev)
N = length(acc2)
acc2_name =""
#identify the numeric index
for (i in 1:4) {
  ind=which(acc2==i)
  acc2_name[i]=as.character(data1$worst.case.severity[ind[1]])
}
### using verification package you get the same results..
# Empirical probabilities
p1 <- length(acc2[acc2 == 1])/N
p2 <- length(acc2[acc2 == 2])/N
p3 <- length(acc2[acc2 == 3])/N
p4 <- length(acc2[acc2 == 4])/N

prob <- c(p1, p2, p3,p4)
RPSS=rps(acc2, treePred, baseline=prob)$rpss
prob=as.data.frame(prob)
rownames(prob)=acc2_name
print("Empirical probabilities are:")
## [1] "Empirical probabilities are:"
print(prob)
##                             prob
## 1st Aid               0.11140584
## Medical Case          0.47108753
## Lost Work Time        0.33103448
## Permanent Disablement 0.06949602
print(sprintf('RPSS for Injury Severity CART: %s', RPSS))
## [1] "RPSS for Injury Severity CART: -0.276864513386898"

Compare the results