library(ggplot2);
library(nnet); # to fit multinomial regression
library(VGAM);
library(verification);
library(leaps);
library(tree); # classification and regression trees
library(randomForest); #Random Forest
library(glmnet);
#install.packages('tree', repos='http://cran.us.r-project.org')
set.seed(1)
## read data
# retain top 20 variables
to.retain=20
imp.vars.body=as.character(read.csv("gbm.importance.50.body.csv",header=TRUE)[1:to.retain,"var"])
imp.vars.severity = as.character(read.csv("gbm.importance.50.code.csv",header=TRUE)[1:to.retain, "var"])
## Read Body part data and select the important variables
data = read.csv("data.body.part.csv")
index=which(colnames(data)%in%imp.vars.body)
Xpredictors = data[,index]
bpart=data[,1]
## Read the Injury Severity data and select the important variables
data1 = read.csv("data.severity.csv")
#get variance matrix..
zs=var(Xpredictors)
#do an Eigen decomposition..
zsvd=svd(zs)
#Principal Components...
pcs=t(t(zsvd$u) %*% t(Xpredictors)) # eigenvector * X matrix
#Eigen Values.. - fraction variance
lambdas=(zsvd$d/sum(zsvd$d))
#Plot the eigen variance spectrum for the first 20 modes
#Eigen spectrum
ggplot() +
geom_line(mapping = aes(1:20, lambdas[1:20])) +
geom_point(mapping = aes(1:20, lambdas[1:20]), shape = 21, size = 3, color = "gray30", fill ="cadetblue1") +
labs(title = "Eigen Spectrum injury precursors",x = "Modes",y = "Frac. Var. explained")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
sprintf("First 3 PCs explained %s percent of the variance",round(sum(lambdas[1:3])*100))
sprintf("First 4 PCs explained %s percent of the variance",round(sum(lambdas[1:4])*100))
sprintf("First 6 PCs explained %s percent of the variance",round(sum(lambdas[1:6])*100))
tu = expand.grid(Eofs = gl(4, 1, labels = c("EOF no.1","EOF no.2","EOF no.3","EOF no.4")),
variable = imp.vars.body)
a=vector(length = 80)
a[1:4]=zsvd$u[1,1:4]
for (i in 2:20) {
ind1=4*(i-1)+1
ind=ind1+3
a[ind1:ind]=zsvd$u[i,1:4]
}
tu$Eof= a
ggplot(tu, aes(x = variable, y = Eof, color = Eofs, group = Eofs)) +
geom_point(size=3) + geom_line()+
labs(title = "Eigen vectors injury precursors",x = "variable",y = "Influence",color="")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))+
theme(axis.text.x=element_text(color = "black", size=11, angle=90, vjust=.8, hjust=0.8))+
coord_flip()
pcs = data.frame(pcs)
zz = multinom(bpart ~ ., data=pcs)
# weights: 110 (84 variable) initial value 6830.454500 iter 10 value 4802.845600 iter 20 value 4681.045845 iter 30 value 4667.484616 iter 40 value 4665.052067 iter 50 value 4664.133716 iter 60 value 4663.837152 iter 70 value 4663.749986 iter 80 value 4663.668413 final value 4663.655477 converged
N = log(length(bpart))
zbest=stepAIC(zz,k=N)
summary(zbest)
Call:
multinom(formula = bpart ~ X1 + X2 + X3 + X5 + X6 + X7 + X8 +
X11 + X13 + X14 + X16 + X19 + X20, data = pcs)
Coefficients:
(Intercept) X1 X2 X3 X5
lower extremities 0.2795602 -3.909752 12.7226743 0.6675859 2.2823349
neck -1.8715403 -2.117657 0.7444662 2.2291403 2.7728561
trunk -0.2512364 -3.945427 7.8531051 1.0534055 0.6796384
upper extremities 1.0090588 -1.829775 4.9939418 -0.6960140 0.6249908
X6 X7 X8 X11 X13
lower extremities 1.776654 1.00568870 0.58910532 -1.1178187 -1.0133073
neck -2.859202 -5.06051710 4.54543505 9.5003324 9.2493738
trunk 1.412624 0.09345112 -0.07983149 -0.2164937 -2.2052052
upper extremities 3.050353 -1.07466342 -0.83864614 0.2290801 0.4808086
X14 X16 X19 X20
lower extremities 1.302331 -1.3559690 1.332043 -1.2893075
neck -11.271684 21.3996690 -174.901741 223.4749927
trunk -1.017061 -0.7406517 -4.155083 -0.4797571
upper extremities 1.357942 -0.9804030 2.700105 -3.2937500
Std. Errors:
(Intercept) X1 X2 X3 X5 X6
lower extremities 0.06766681 0.5863416 1.7828655 0.2888557 0.4100698 0.3204268
neck 0.14450319 0.6873848 1.6222056 0.8975402 1.0366390 1.7731184
trunk 0.07671925 0.6121483 1.8827514 0.4086169 0.4716272 0.3400028
upper extremities 0.05908701 0.1822436 0.3584732 0.2217303 0.3923408 0.2956986
X7 X8 X11 X13 X14 X16
lower extremities 0.4177351 0.3914876 0.4917368 0.4553335 0.4936806 0.5059824
neck 1.9411282 1.8188829 2.3524723 3.2689684 3.4802727 5.9370087
trunk 0.4634082 0.4305687 0.6170746 0.7693763 0.3503601 0.6167398
upper extremities 0.3686182 0.3404493 0.2998880 0.3229168 0.3660987 0.3358988
X19 X20
lower extremities 1.109333 1.303356
neck 54.749495 68.312069
trunk 3.912979 1.861419
upper extremities 1.041735 1.135893
Residual Deviance: 9390.197
AIC: 9502.197
ypred = predict(zbest, type = "probs")
# RPSS calculations
acc2 = unclass(factor(data$body.part))
N = length(acc2)
acc2_name =""
#identify the numeric index
for (i in 1:5) {
ind=which(acc2==i)
acc2_name[i]=as.character(data$body.part[ind[1]])
}
### using verification package you get the same results..
# Empirical probabilities
p1 <- length(acc2[acc2 == 1])/N
p2 <- length(acc2[acc2 == 2])/N
p3 <- length(acc2[acc2 == 3])/N
p4 <- length(acc2[acc2 == 4])/N
p5 <- length(acc2[acc2 == 5])/N
prob = c(p1, p2, p3,p4,p5)
RPSS=rps(acc2, ypred, baseline=prob)$rpss
prob=as.data.frame(prob)
rownames(prob)=acc2_name
print("Empirical probabilities are:")
print(prob)
[1] "Empirical probabilities are:"
prob
head 0.23963242
lower extremities 0.19486334
neck 0.02403393
trunk 0.12111216
upper extremities 0.42035815
sprintf("The RPSS of the body part multinomial regression is %s",RPSS)
index=which(colnames(data1)%in%imp.vars.severity)
Xpredictors = data1[,index]
sev=data1[,1]
#get variance matrix..
zs=var(Xpredictors)
#do an Eigen decomposition..
zsvd=svd(zs)
#Principal Components...
pcs1=t(t(zsvd$u) %*% t(Xpredictors)) # eigenvector * X matrix
#Eigen Values.. - fraction variance
lambdas=(zsvd$d/sum(zsvd$d))
#Plot the eigen variance spectrum for the first 20 modes
#Eigen spectrum
ggplot() +
geom_line(mapping = aes(1:20, lambdas[1:20])) +
geom_point(mapping = aes(1:20, lambdas[1:20]), shape = 21, size = 3, color = "gray30", fill ="cadetblue1") +
labs(title = "Eigen Spectrum 20 leading attributes",x = "Modes",y = "Frac. Var. explained")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
sprintf("First 3 PCs explained %s percent of the variance",round(sum(lambdas[1:3])*100))
sprintf("First 4 PCs explained %s percent of the variance",round(sum(lambdas[1:4])*100))
sprintf("First 6 PCs explained %s percent of the variance",round(sum(lambdas[1:6])*100))
sprintf("First 10 PCs explained %s percent of the variance",round(sum(lambdas[1:10])*100))
#plot the leading four eigen vectors
tu = expand.grid(Eofs = gl(4, 1, labels = c("EOF no.1","EOF no.2","EOF no.3","EOF no.4")),
variable = imp.vars.severity)
a=vector(length = 80)
a[1:4]=zsvd$u[1,1:4]
for (i in 2:20) {
ind1=4*(i-1)+1
ind=ind1+3
a[ind1:ind]=zsvd$u[i,1:4]
}
tu$Eof= a
ggplot(tu, aes(x = variable, y = Eof, color = Eofs, group = Eofs)) +
geom_point(size=3) + geom_line()+
labs(title = "Eigen vectors of 20 leading attributes",x = "variable",y = "Influence",color="")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))+
theme(axis.text.x=element_text(color = "black", size=11, angle=90, vjust=.8, hjust=0.8))+
coord_flip()
#pcs1=cbind(sev,pcs1)
pcs1 = data.frame(pcs1)
## or model with all the PCS
zz = multinom(sev ~ ., data=pcs1)
# weights: 88 (63 variable) initial value 2470.376552 iter 10 value 1437.098629 iter 20 value 1431.364632 iter 30 value 1430.827448 iter 40 value 1430.770986 iter 50 value 1430.767913 final value 1430.767458 converged
N = log(length(sev))
zbest1=stepAIC(zz,k=N)
summary(zbest1)
Call:
multinom(formula = sev ~ 1, data = pcs1)
Coefficients:
(Intercept)
Lost Work Time -2.627830
Medical Case -1.883903
Pain -2.148727
Std. Errors:
(Intercept)
Lost Work Time 0.10568406
Medical Case 0.07551759
Pain 0.08487577
Residual Deviance: 2977.115
AIC: 2983.115
#################### RPSS
ypred = predict(zbest1, type = "probs")
# RPSS calculations
acc2 = unclass(as.factor(sev))
N = length(acc2)
acc2_name =""
#identify the numeric index
for (i in 1:4) {
ind=which(acc2==i)
acc2_name[i]=as.character(data1$injury.severity[ind[1]])
}
### using verification package you get the same results..
# Empirical probabilities
p1 <- length(acc2[acc2 == 1])/N
p2 <- length(acc2[acc2 == 2])/N
p3 <- length(acc2[acc2 == 3])/N
p4 <- length(acc2[acc2 == 4])/N
prob <- c(p1, p2, p3,p4)
RPSS=rps(acc2, ypred, baseline=prob)$rpss
prob=as.data.frame(prob)
rownames(prob)=acc2_name
print("Empirical probabilities are:")
print(prob)
[1] "Empirical probabilities are:"
prob
1st Aid 0.74579125
Lost Work Time 0.05387205
Medical Case 0.11335578
Pain 0.08698092
sprintf("The RPSS is %s",RPSS)
data$body.part=as.factor(data$body.part)
myTree = tree(body.part ~., data = data, model = T, method = 'class')
summary(myTree)
Classification tree:
tree(formula = body.part ~ ., data = data, method = "class",
model = T)
Variables actually used in tree construction:
[1] "small.particle" "hazardous.substance" "object.on.the.floor"
Number of terminal nodes: 4
Residual mean deviance: 2.397 = 10160 / 4240
Misclassification error rate: 0.4618 = 1960 / 4244
treeFun = function(myTree, toPlot = F, title = ""){
#Perform CV on tree object
cvTree <- cv.tree(myTree)
optTree <- which.min(cvTree$dev)
bestTree <- cvTree$size[optTree]
#prune Tree based on CV results
pruneTree <- prune.tree(myTree, best = bestTree)
#If plotting is selected
if(toPlot){
#Plot unpruned Tree
plot(myTree)
text(myTree, cex = .75)
title(main = paste("Unpruned Tree for", title))
#Plot CV
plot(cvTree$size, cvTree$dev, type = "b",
main = paste("Cross Validation for", title))
#Plot Prunned Tree
plot(pruneTree)
text(pruneTree, cex = .75)
title(main = paste("Pruned Tree for", title))
}
pruneTree$besTree=bestTree
return(pruneTree)
}
treeFit = treeFun(myTree, toPlot=T, title=" Classification Tree")
summary(treeFit)
Classification tree:
tree(formula = body.part ~ ., data = data, method = "class",
model = T)
Variables actually used in tree construction:
[1] "small.particle" "hazardous.substance" "object.on.the.floor"
Number of terminal nodes: 4
Residual mean deviance: 2.397 = 10160 / 4240
Misclassification error rate: 0.4618 = 1960 / 4244
#Use tree to predict original data
treePred <- predict(treeFit)
treeResid <- resid(treeFit)
myRange <- range(treePred, data$body.part)
N = length(data$body.part)
bpart=as.integer(data$body.part)
bpart_name=""
#identify the numeric index
for (i in 1:5) {
ind=which(bpart==i)
bpart_name[i]=as.character(data$body.part[ind[1]])
}
### using verification package you get the same results..
# Empirical probabilities
p1 = length(bpart[bpart== 1])/N
p2 = length(bpart[bpart == 2])/N
p3 = length(bpart[bpart == 3])/N
p4 = length(bpart[bpart == 4])/N
p5 = length(bpart[bpart == 5])/N
prob <- c(p1, p2, p3,p4,p5)
RPSS=rps(bpart, treePred, baseline=prob )$rpss
prob=as.data.frame(prob)
rownames(prob)=bpart_name
print("Empirical probabilities are:")
print(prob)
[1] "Empirical probabilities are:"
prob
head 0.23963242
lower extremities 0.19486334
neck 0.02403393
trunk 0.12111216
upper extremities 0.42035815
sprintf("The RPSS of the body part multinomial regression is %s",RPSS)
data1$injury.severity = as.factor(data1$injury.severity)
myTree = tree(injury.severity ~., data = data1, model = T)
summary(myTree)
print("It is not possible to apply CART to Injury Severity category")
Classification tree: tree(formula = injury.severity ~ ., data = data1, model = T) Variables actually used in tree construction: character(0) Number of terminal nodes: 1 Residual mean deviance: 1.672 = 2977 / 1781 Misclassification error rate: 0.2542 = 453 / 1782
[1] "It is not possible to apply CART to Injury Severity category"