5. PCA + Multinomial Regression
Construction safety outcome data were compiled from accident reports and are described in Tixier et al. (2016). The data, attribute information, and relevant R commands can be found at http://civil.colorado.edu/~balajir/CVEN6833/const-data. The analysis below considers the safety outcome of body part, which has five categories (see Table 2 of Tixier et al., 2016). Twenty leading attributes, all binary (1 or 0, i.e., present or absent), are provided for each accident record.
(a) Perform a PCA on the attribute data; show the eigen spectrum and the leading four eigenvectors.
(b) Fit a best multinomial regression with the principal components as predictors to model the categorical probabilities of injury to the five body-part categories, and compute the RPSS (a sketch of the RPSS calculation follows this list).
(c) Repeat (b) for injury severity. Tixier et al. (2016) modeled these outcomes using random forests and stochastic gradient boosting but could not obtain good skill for injury severity; compare your results with theirs.
(d) Apply CART and compare the results.
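The ranked probability skill score (RPSS) compares the ranked probability score (RPS) of the model's categorical probability forecasts against that of a climatological baseline. Below is a minimal illustrative sketch of the calculation, assuming `obs` is an integer vector of observed categories (1..K) and `pred` is an n x K matrix of forecast probabilities; the function name rps_manual is hypothetical, and the actual analysis uses rps() from the verification package.
rps_manual <- function(obs, pred) {
  K <- ncol(pred)
  obs_mat <- matrix(0, nrow = length(obs), ncol = K)
  obs_mat[cbind(seq_along(obs), obs)] <- 1      # one-hot encode the observations
  cum_f <- t(apply(pred, 1, cumsum))            # forecast cumulative probabilities
  cum_o <- t(apply(obs_mat, 1, cumsum))         # observed cumulative probabilities
  mean(rowSums((cum_f - cum_o)^2) / (K - 1))    # mean RPS over all records
}
# RPSS = 1 - RPS(model) / RPS(climatology), where the climatological forecast
# assigns every record the empirical category frequencies.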
#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
#### Initialize Graphics
graphics.off()
.pardefault <- par(no.readonly = TRUE)
#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 02/"
setwd(mainDir)
suppressPackageStartupMessages(source("hw2_library.R"))
## Read Data
# Retain top 20 variables
to.retain = 20
imp.vars.body = as.character(read.csv("./data/prob5/gbm.importance.50.body.csv", header = T)[1:to.retain,"var"])
imp.vars.severity = as.character(read.csv("./data/prob5/gbm.importance.50.code.csv", header = T)[1:to.retain,"var"])
## Read Body Part data and select the important variables
data = read.csv("./data/prob5/data.body.part.csv")
colnames(data) <- as.character(colnames(data))
index = which(colnames(data)%in%imp.vars.body)
Xpredictors = data[,index]
bpart = data[,1]
bpart=as.character(bpart)
bpart[bpart == "head"]=1
bpart[bpart == "neck"]=2
bpart[bpart == "trunk"]=3
bpart[bpart == "upper extremities"]=4
bpart[bpart == "lower extremities"]=5
###################
### Perform PCA ###
###################
# Covariance matrix of the binary attributes
zs=var(Xpredictors)
# Eigen decomposition (via SVD of the symmetric covariance matrix)
zsvd=svd(zs)
# Principal components: project the data onto the eigenvectors (X %*% U)
pcs=t(t(zsvd$u) %*% t(Xpredictors))
# Eigenvalues expressed as fraction of total variance explained
lambdas=(zsvd$d/sum(zsvd$d))
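## Optional sanity check (a sketch, not part of the required analysis; the object
## name pca.check is introduced here): the eigenvalues and eigenvectors above can
## be cross-checked against prcomp(). The prcomp variances should match zsvd$d,
## and the loadings should match zsvd$u up to sign (barring nearly tied eigenvalues).
pca.check = prcomp(Xpredictors, center = TRUE, scale. = FALSE)
print(all.equal(pca.check$sdev^2, zsvd$d))              # eigenvalues should agree
print(max(abs(abs(pca.check$rotation) - abs(zsvd$u))))  # loadings agree up to sign (~0)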
## Eigen Spectrum
plot(1:20, lambdas[1:20], type="l", xlab="Modes", ylab="Frac. Var. Explained")
points(1:20, lambdas[1:20], col="red")
print(sprintf("The first 4 PCs explain %s%% of the variance", round(sum(lambdas[1:4])*100)))
## [1] "The first 4 PCs explain 46% of the variance"
print(sprintf("The first 6 PCs explain %s%% of the variance", round(sum(lambdas[1:6])*100)))
## [1] "The first 6 PCs explain 57% of the variance"
## Eigen Vectors
### plot zsvd$u[,1:4]
### against the variables
### Identify the variables that have a strong influence
### on the injuries.
tu = expand.grid(Eofs = gl(4, 1, labels = c("EOF no.1","EOF no.2","EOF no.3","EOF no.4")),
variable = imp.vars.body)
# Stack the leading four eigenvector loadings, one set of four per variable
a = as.vector(t(zsvd$u[, 1:4]))
tu$Eof= a
show(ggplot(tu, aes(x = variable, y = Eof, color = Eofs, group = Eofs)) +
geom_point(size=3) + geom_line()+
labs(title = "Eigen vectors injury precursors",x = "variable",y = "Influence",color="")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))+
theme(axis.text.x=element_text(color = "black", size=11, angle=90, vjust=.8, hjust=0.8))+
coord_flip())
### convert the category labels to a numeric (1-5) outcome vector
bpartbin = as.numeric(bpart)
pcs=cbind(bpartbin,pcs)
pcs = data.frame(pcs)
## model with first 5 PCs
#zz = multinom(bpart ~ V2+V3+V4+V5+V6, data=pcs, MaxNWts = 10000)
## or model with all the PCs
zz = multinom(bpartbin ~ ., data=pcs)
## # weights: 110 (84 variable)
## initial value 6830.454500
## iter 10 value 4802.845600
## iter 20 value 4681.045845
## iter 30 value 4667.484616
## iter 40 value 4665.052067
## iter 50 value 4664.133716
## iter 60 value 4663.837152
## iter 70 value 4663.749986
## iter 80 value 4663.668413
## final value 4663.655477
## converged
## BIC penalty for stepwise model selection: k = log(n)
N = log(length(bpartbin))
## Reduce the full model by stepwise selection under BIC; zbest is the selected model
## (stepAIC from MASS is assumed here, consistent with the reduced formula shown below)
zbest = stepAIC(zz, direction = "both", k = N, trace = FALSE)
print(summary(zbest))
## Call:
## multinom(formula = bpartbin ~ V2 + V3 + V4 + V6 + V7 + V8 + V9 +
## V12 + V14 + V15 + V17 + V20 + V21, data = pcs)
##
## Coefficients:
## (Intercept) V2 V3 V4 V6 V7 V8
## 2 -1.8715403 -2.117657 0.7444662 2.2291403 2.7728561 -2.859202 -5.06051710
## 3 -0.2512364 -3.945427 7.8531051 1.0534055 0.6796384 1.412624 0.09345112
## 4 1.0090588 -1.829775 4.9939418 -0.6960140 0.6249908 3.050353 -1.07466342
## 5 0.2795602 -3.909752 12.7226743 0.6675859 2.2823349 1.776654 1.00568870
## V9 V12 V14 V15 V17 V20
## 2 4.54543505 9.5003324 9.2493738 -11.271684 21.3996690 -174.901741
## 3 -0.07983149 -0.2164937 -2.2052052 -1.017061 -0.7406517 -4.155083
## 4 -0.83864614 0.2290801 0.4808086 1.357942 -0.9804030 2.700105
## 5 0.58910532 -1.1178187 -1.0133073 1.302331 -1.3559690 1.332043
## V21
## 2 223.4749927
## 3 -0.4797571
## 4 -3.2937500
## 5 -1.2893075
##
## Std. Errors:
## (Intercept) V2 V3 V4 V6 V7 V8
## 2 0.14450319 0.6873848 1.6222056 0.8975402 1.0366390 1.7731184 1.9411282
## 3 0.07671925 0.6121483 1.8827514 0.4086169 0.4716272 0.3400028 0.4634082
## 4 0.05908701 0.1822436 0.3584732 0.2217303 0.3923408 0.2956986 0.3686182
## 5 0.06766681 0.5863416 1.7828655 0.2888557 0.4100698 0.3204268 0.4177351
## V9 V12 V14 V15 V17 V20 V21
## 2 1.8188829 2.3524723 3.2689684 3.4802727 5.9370087 54.749495 68.312069
## 3 0.4305687 0.6170746 0.7693763 0.3503601 0.6167398 3.912979 1.861419
## 4 0.3404493 0.2998880 0.3229168 0.3660987 0.3358988 1.041735 1.135893
## 5 0.3914876 0.4917368 0.4553335 0.4936806 0.5059824 1.109333 1.303356
##
## Residual Deviance: 9390.197
## AIC: 9502.197
#################### RPSS
ypred = predict(zbest, type = "probs")
# RPSS calculations
acc2 = as.numeric(bpart)
N = length(acc2)
acc2_name =""
#identify the numeric index
for (i in 1:5) {
ind=which(acc2==i)
acc2_name[i]=as.character(data$body.part[ind[1]])
}
# Climatological baseline: the empirical (observed) category probabilities
# (the RPSS below is computed with rps() from the verification package)
p1 <- length(acc2[acc2 == 1])/N
p2 <- length(acc2[acc2 == 2])/N
p3 <- length(acc2[acc2 == 3])/N
p4 <- length(acc2[acc2 == 4])/N
p5 <- length(acc2[acc2 == 5])/N
prob = c(p1, p2, p3,p4, p5)
RPSS=rps(acc2, ypred, baseline=prob)$rpss
prob=as.data.frame(prob)
rownames(prob)=acc2_name
print("Empirical probabilities are:")
## [1] "Empirical probabilities are:"
print(prob)
## prob
## head 0.23963242
## neck 0.02403393
## trunk 0.12111216
## upper extremities 0.42035815
## lower extremities 0.19486334
print(sprintf('Body Part Multinomial Regression RPSS: %s', RPSS))
## [1] "Body Part Multinomial Regression RPSS: 0.281666586947724"
## Part (c): Injury Severity
## Read the Injury Severity data and select the important variables
data1 = read.csv("./data/prob5/data.worst.case.severity.csv")
index=which(colnames(data1)%in%imp.vars.severity)
Xpredictors = data1[,index]
sev = data1[,1]
# Code the five severity classes as 1-5
sev[sev == "1st Aid"]=1
sev[sev == "Medical Case"]=2
sev[sev == "Lost Work Time"]=3
sev[sev == "Permanent Disablement"]=4
sev[sev == "Fatality"]=5
### repeat steps from above.
# Covariance matrix of the binary attributes
zs=var(Xpredictors)
# Eigen decomposition (via SVD of the symmetric covariance matrix)
zsvd=svd(zs)
# Principal components: project the data onto the eigenvectors (X %*% U)
pcs1 = t(t(zsvd$u) %*% t(Xpredictors))
# Eigenvalues expressed as fraction of total variance explained
lambdas=(zsvd$d/sum(zsvd$d))
## Eigen Spectrum
plot(1:20, lambdas[1:20], type="l", xlab="Modes", ylab="Frac. Var. Explained")
points(1:20, lambdas[1:20], col="red")
print(sprintf("The first 4 PCs explain %s%% of the variance", round(sum(lambdas[1:4])*100)))
## [1] "The first 4 PCs explain 47% of the variance"
print(sprintf("The first 6 PCs explain %s%% of the variance", round(sum(lambdas[1:6])*100)))
## [1] "The first 6 PCs explain 60% of the variance"
## Eigen Vectors
### plot zsvd$u[,1:4]
### against the variables
### Identify the variables that have a strong influence
### on the injuries.
tu = expand.grid(Eofs = gl(4, 1, labels = c("EOF no.1","EOF no.2","EOF no.3","EOF no.4")),
variable = imp.vars.severity)
# Stack the leading four eigenvector loadings, one set of four per variable
a = as.vector(t(zsvd$u[, 1:4]))
tu$Eof= a
show(ggplot(tu, aes(x = variable, y = Eof, color = Eofs, group = Eofs)) +
geom_point(size=3) + geom_line()+
labs(title = "Eigen vectors injury precursors",x = "variable",y = "Influence",color="")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))+
theme(axis.text.x=element_text(color = "black", size=11, angle=90, vjust=.8, hjust=0.8))+
coord_flip())
## Part (c) continued: fit a multinomial regression for injury severity (as in part (b))
### convert the severity labels to a numeric (1-5) outcome vector
sevbin = as.numeric(sev)
pcs1 = cbind(sevbin,pcs1)
pcs1 = data.frame(pcs1)
## model with first 5 PCs
#zz = multinom(sevbin ~ V2+V3+V4+V5+V6, data=pcs1)
## or model with all the PCs
zz = multinom(sevbin ~ ., data=pcs1)
## # weights: 110 (84 variable)
## initial value 3033.790465
## iter 10 value 2259.280906
## iter 20 value 2193.854089
## iter 30 value 2183.696563
## iter 40 value 2181.897827
## iter 50 value 2181.379394
## iter 60 value 2181.332399
## iter 70 value 2181.327237
## final value 2181.327085
## converged
## BIC penalty for stepwise model selection: k = log(n)
N = log(length(sevbin))
## Stepwise BIC selection of PCs (stepAIC assumed, as above); zbest is the selected model
zbest = stepAIC(zz, direction = "both", k = N, trace = FALSE)
print(summary(zbest))
## Call:
## multinom(formula = sevbin ~ V5, data = pcs1)
##
## Coefficients:
## (Intercept) V5
## 2 1.3856859 -0.5023897
## 3 1.1492696 1.0188690
## 4 -0.4902862 -0.1879632
## 5 -1.8079867 1.5778299
##
## Std. Errors:
## (Intercept) V5
## 2 0.08127565 0.2718042
## 3 0.08325535 0.3228468
## 4 0.11794842 0.3901892
## 5 0.19168359 1.0502369
##
## Residual Deviance: 4541.59
## AIC: 4557.59
#################### RPSS
ypred = predict(zbest, type = "probs")
# RPSS calculations
acc2 = as.numeric(sev)
N = length(acc2)
acc2_name =""
#identify the numeric index
# (note: if a Fatality class is present in the data, it should be included as a
# fifth category here and in the climatological baseline below)
for (i in 1:4) {
ind=which(acc2==i)
acc2_name[i]=as.character(data1$worst.case.severity[ind[1]])
}
# Climatological baseline: the empirical (observed) category probabilities
# (the RPSS below is computed with rps() from the verification package)
p1 <- length(acc2[acc2 == 1])/N
p2 <- length(acc2[acc2 == 2])/N
p3 <- length(acc2[acc2 == 3])/N
p4 <- length(acc2[acc2 == 4])/N
prob <- c(p1, p2, p3,p4)
RPSS=rps(acc2, ypred, baseline=prob)$rpss
prob=as.data.frame(prob)
rownames(prob)=acc2_name
print("Empirical probabilities are:")
## [1] "Empirical probabilities are:"
print(prob)
## prob
## 1st Aid 0.11140584
## Medical Case 0.47108753
## Lost Work Time 0.33103448
## Permanent Disablement 0.06949602
print(sprintf('Injury Severity Multinomial Regression RPSS: %s', RPSS))
## [1] "Injury Severity Multinomial Regression RPSS: 0.357734324492506"
## Part (d): CART. Create the complex (full) classification tree for body part
data$body.part = factor(data$body.part)
myTree <- tree(body.part ~ ., data = data, model = T, method = 'class')
print(summary(myTree))
##
## Classification tree:
## tree(formula = body.part ~ ., data = data, method = "class",
## model = T)
## Variables actually used in tree construction:
## [1] "small.particle" "hazardous.substance" "object.on.the.floor"
## Number of terminal nodes: 4
## Residual mean deviance: 2.397 = 10160 / 4240
## Misclassification error rate: 0.4618 = 1960 / 4244
## Plot unpruned tree, calculate & plot pruned tree
treeFit = treeFun(myTree, toPlot = T, title = "Body Part Tree")
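## treeFun() is defined in hw2_library.R (not shown here). For reference, the
## function below is a minimal hypothetical sketch of a comparable helper,
## assuming the pruning is done by cross-validation with cv.tree()/prune.misclass();
## the actual implementation in hw2_library.R may differ.
treeFunSketch <- function(fullTree, toPlot = FALSE, title = "") {
  cvRes = cv.tree(fullTree, FUN = prune.misclass)       # CV misclassification vs. tree size
  bestSize = cvRes$size[which.min(cvRes$dev)]           # size with the smallest CV error
  pruned = prune.misclass(fullTree, best = max(bestSize, 2))
  if (toPlot) { plot(pruned); text(pruned, pretty = 0); graphics::title(main = title) }
  pruned
}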
## Use the pruned tree to predict class probabilities for the original data
treePred <- predict(treeFit)
# RPSS calculations
acc2 = as.numeric(bpart)
N = length(acc2)
acc2_name =""
#identify the numeric index
for (i in 1:5) {
ind=which(acc2==i)
acc2_name[i]=as.character(data$body.part[ind[1]])
}
# Climatological baseline: the empirical (observed) category probabilities
# (the RPSS below is computed with rps() from the verification package)
p1 <- length(acc2[acc2 == 1])/N
p2 <- length(acc2[acc2 == 2])/N
p3 <- length(acc2[acc2 == 3])/N
p4 <- length(acc2[acc2 == 4])/N
p5 <- length(acc2[acc2 == 5])/N
prob = c(p1, p2, p3,p4, p5)
RPSS=rps(acc2, treePred, baseline=prob)$rpss
prob=as.data.frame(prob)
rownames(prob)=acc2_name
print("Empirical probabilities are:")
## [1] "Empirical probabilities are:"
print(prob)
## prob
## head 0.23963242
## neck 0.02403393
## trunk 0.12111216
## upper extremities 0.42035815
## lower extremities 0.19486334
print(sprintf('RPSS for Body Part CART: %s', RPSS))
## [1] "RPSS for Body Part CART: 0.0740356331139008"
## Create the complex (full) classification tree for injury severity
#myTree <- tree(worst.case.severity ~ ., data = data1, model = T)
data1$worst.case.severity = factor(data1$worst.case.severity)
myTree <- tree(worst.case.severity ~ ., data = data1, model = T, method = 'class')
print(summary(myTree))
##
## Classification tree:
## tree(formula = worst.case.severity ~ ., data = data1, method = "class",
## model = T)
## Variables actually used in tree construction:
## [1] "small.particle"
## Number of terminal nodes: 2
## Residual mean deviance: 2.412 = 4542 / 1883
## Misclassification error rate: 0.5289 = 997 / 1885
## Plot unpruned tree, calculate & plot pruned tree
treeFit = treeFun(myTree, toPlot = T, title = "Injury Severity Tree")
## Use the pruned tree to predict class probabilities for the original data
treePred <- predict(treeFit)
# RPSS calculations
acc2 = as.numeric(sev)
N = length(acc2)
acc2_name =""
#identify the numeric index
for (i in 1:4) {
ind=which(acc2==i)
acc2_name[i]=as.character(data1$worst.case.severity[ind[1]])
}
# Climatological baseline: the empirical (observed) category probabilities
# (the RPSS below is computed with rps() from the verification package)
p1 <- length(acc2[acc2 == 1])/N
p2 <- length(acc2[acc2 == 2])/N
p3 <- length(acc2[acc2 == 3])/N
p4 <- length(acc2[acc2 == 4])/N
prob <- c(p1, p2, p3,p4)
RPSS=rps(acc2, treePred, baseline=prob)$rpss
prob=as.data.frame(prob)
rownames(prob)=acc2_name
print("Empirical probabilities are:")
## [1] "Empirical probabilities are:"
print(prob)
## prob
## 1st Aid 0.11140584
## Medical Case 0.47108753
## Lost Work Time 0.33103448
## Permanent Disablement 0.06949602
print(sprintf('RPSS for Injury Severity CART: %s', RPSS))
## [1] "RPSS for Injury Severity CART: -0.276864513386898"
Comments