Catalina Jerez
catalina.jerez@colorado.edu


Construction safety outcomes data have been compiled from accident reports and described in Tixier et al. (2016). The data, attribute information and relevant can be found at For the safety outcome of body part, with five categories (see Table 2 of Tixier et al., 2016).

Twenty leading attributes, all binary (1 or 0 – i.e., present or absent) are provided for each accident record.

  1. Perform a PCA on the attribute data - show the Eigen spectrum and the leading four Eigen vectors.

  2. Fit a best multinomial regression with the principal components as predictors to model the categorical probability of injuries to the five category of body parts and compute the RPSS

  3. Repeat b) for the Injury Severity Tixier et al. (2016) modeled these using Random Forests and Stochastic Boosting. They could not get good skill for Injury Severity. Compare your results with theirs.

1 Library & functions

1.1 Libraries

gc()
##           used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells  542916 29.0    1201048 64.2         NA   700280 37.4
## Vcells 1002446  7.7    8388608 64.0      36864  1963567 15.0
rm(list = ls())
# Load necessary libraries
spatialAnalysis.Lib     = c("fields", "maps", "rnaturalearth", "sf")
statisAnalysis.Lib      = c("psych")
dataManipulation.Lib    = c("dplyr", "reshape2", "tidyr") 
dataVisualization.Lib   = c("ggplot2", "ggrepel", "ggpubr", "RColorBrewer")
modeling.Lib            = c("nnet", "MASS", "verification")
list.packages           = unique(c(spatialAnalysis.Lib, statisAnalysis.Lib,
                                   modeling.Lib,
                                   dataManipulation.Lib, dataVisualization.Lib))
# Load all required packages
sapply(list.packages, require, character.only = TRUE)
##        fields          maps rnaturalearth            sf         psych 
##          TRUE          TRUE          TRUE          TRUE          TRUE 
##          nnet          MASS  verification         dplyr      reshape2 
##          TRUE          TRUE          TRUE          TRUE          TRUE 
##         tidyr       ggplot2       ggrepel        ggpubr  RColorBrewer 
##          TRUE          TRUE          TRUE          TRUE          TRUE

1.2 Functions

# constants 
text.size    = 15
customize.bw = theme_bw(base_family = "Times") +
  theme(
    axis.text.x        = element_text(size = text.size, vjust = 0.5),
    axis.text.y        = element_text(size = text.size),
    axis.title         = element_text(size = text.size + 2, face = "bold"),
    plot.title         = element_text(size = text.size + 2, face = "bold"),
    
    legend.position    = "bottom",
    legend.title       = element_text(size = text.size, face = "bold"),
    legend.text        = element_text(size = text.size),
    
    panel.border       = element_rect(color = "#000000", fill = NA, linewidth = 0.5),
    panel.grid.minor.x = element_line(color = "#d9d9d9", linetype = "dotted"),
    panel.grid.minor.y = element_line(color = "#d9d9d9", linetype = "dotted"),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_blank()
  )

# color palettes
palette.rdbu = colorRampPalette(rev(brewer.pal(9, "RdBu")))(200)
# world map
world.sf     = ne_countries(scale = "medium", returnclass = "sf") %>%
    st_wrap_dateline(options = c("WRAPDATELINE=YES", "DATELINEOFFSET=180")) %>%
    st_transform("+proj=longlat +datum=WGS84")

#  plot for PCA
plot.eigenSpectrum = function(pca_result, n = 20) {
  eigvals  = pca_result$sdev^2
  frac_var = eigvals / sum(eigvals)
  df       = data.frame(Mode = 1:length(frac_var), Variance = frac_var)
  breaks   = seq(0, 20, 4); breaks[1] = 1
  ggplot(df[1:n, ], aes(x = Mode, y = Variance)) +
    geom_line(color  = "#636363") + 
    geom_point(color = "#000", fill = "#bdbdbd", shape = 21, size = 3) +
    scale_x_continuous(breaks = breaks)+
    labs(y = "Fraction Variance", x = "Mode",title = "PCA Eigenvalue Spectrum")+
    theme_bw(base_size = 12, base_family = "Times") + 
    customize.bw
}

2 Load data and pre-process

base.path = "https://civil.colorado.edu/~balajir/CVEN6833/const-data/"

# Body part data's
df.body  = read.csv(paste0(base.path, "data.body.part.csv"))
imp.body = read.csv(paste0(base.path, "gbm.importance.50.body.csv"), header=TRUE)

# Injury Severity data's
df.sevr  = read.csv(paste0(base.path, "data.severity.csv"))
imp.sevr = read.csv(paste0(base.path, "gbm.importance.50.code.csv"), header=TRUE)

# Top 20 variables
to.retain    = 20
impVars.body = as.character(imp.body[1:to.retain, "var"])
impVars.sevr = as.character(imp.sevr[1:to.retain, "var"])

# select the important variables
bp.index      = which(colnames(df.body) %in% impVars.body)
bp.predictors = df.body[,bp.index]
bp.value      = df.body$body.part

is.index      = which(colnames(df.sevr) %in% impVars.sevr)
is.predictors = df.sevr[,is.index]
is.value      = df.sevr$injury.severity

3 Principal Component Analysis (PCA)

# perform PCA
bp.pca  = prcomp(bp.predictors, center = TRUE, scale. = TRUE)
# plot eigenvalue spectrum (20 modes)
plot.eigenSpectrum(bp.pca)

3.1 Show leading four Eigen vectors

leading.vect     = as.data.frame( bp.pca$rotation[,1:4] )
leading.vect$var = rownames(leading.vect)
leading.vectLong = gather(leading.vect, key = "PC", value = "Loading", -var)

# plot
ggplot(leading.vectLong, aes(x = reorder(var, abs(Loading)), y = Loading, fill = PC)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  labs(title = "Leading Four Eigenvectors (Loadings)", 
       x = "Variables", y = "Loading") +
  scale_fill_brewer(palette = "Set1") +
  customize.bw

4 Fitting Multinomial Regression for Body Part

# data for regression
pcs.body           = as.data.frame(bp.pca$x)
pcs.body$body.part = as.factor(bp.value)

# fit multinomial regression with first 5 PCs
model.body     = multinom(body.part ~ PC1 + PC2 + PC3 + PC4 + PC5, data = pcs.body)
## # weights:  35 (24 variable)
## initial  value 6830.454500 
## iter  10 value 5030.738100
## iter  20 value 4884.611910
## iter  30 value 4854.568291
## final  value 4854.559561 
## converged
bestModel.body = stepAIC(model.body, k = log(nrow(pcs.body))) # best model via BIC criteria
## Start:  AIC=9909.6
## body.part ~ PC1 + PC2 + PC3 + PC4 + PC5
## 
## # weights:  30 (20 variable)
## initial  value 6830.454500 
## iter  10 value 5416.594308
## iter  20 value 5265.669544
## final  value 5242.496039 
## converged
## # weights:  30 (20 variable)
## initial  value 6830.454500 
## iter  10 value 5115.416155
## iter  20 value 4977.899356
## final  value 4948.834541 
## converged
## # weights:  30 (20 variable)
## initial  value 6830.454500 
## iter  10 value 5138.226446
## iter  20 value 5025.810296
## final  value 5000.686585 
## converged
## # weights:  30 (20 variable)
## initial  value 6830.454500 
## iter  10 value 5048.321172
## iter  20 value 4973.282029
## iter  30 value 4961.197355
## iter  30 value 4961.197349
## iter  30 value 4961.197349
## final  value 4961.197349 
## converged
## # weights:  30 (20 variable)
## initial  value 6830.454500 
## iter  10 value 5089.009095
## iter  20 value 4927.337443
## iter  30 value 4887.101658
## iter  30 value 4887.101653
## iter  30 value 4887.101653
## final  value 4887.101653 
## converged
##        Df     AIC
## <none>     9909.6
## - PC5   4  9941.3
## - PC2   4 10064.7
## - PC4   4 10089.5
## - PC3   4 10168.4
## - PC1   4 10652.1
probs.body     = predict(bestModel.body, type = "probs") # predict class probabilities

# climatological baseline (same shape as probs)
true.labels = factor(bp.value, levels = levels(pcs.body$body.part))
obs.body    = as.numeric(true.labels)  # convert to class indices (1-based)
climo.tab   = prop.table(table(obs.body))
climo.body  = matrix(rep(climo.tab, each = nrow(probs.body)), 
                     nrow = nrow(probs.body), byrow = FALSE)
# compute RPSS
rpss.body = rps(obs.body, probs.body, baseline = climo.body)$rpss
cat(sprintf("RPSS = %.3f", rpss.body ))
## RPSS = 0.627

5 Fitting Multinomial Regression for Injury Severity

# perform PCA
is.pca  = prcomp(is.predictors, center = TRUE, scale. = TRUE)
# plot eigenvalue spectrum (20 modes)
plot.eigenSpectrum(is.pca)

5.1 Show leading four Eigen vectors

leading.vect     = as.data.frame( is.pca$rotation[,1:4] )
leading.vect$var = rownames(leading.vect)
leading.vectLong = gather(leading.vect, key = "PC", value = "Loading", -var)

# plot
ggplot(leading.vectLong, aes(x = reorder(var, abs(Loading)), y = Loading, fill = PC)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  labs(title = "Leading Four Eigenvectors (Loadings)", 
       x = "Variables", y = "Loading") +
  scale_fill_brewer(palette = "Set1") +
  customize.bw

5.2 Fitting Multinomial Regression

# data for regression
pcs.sevr           = as.data.frame(is.pca$x)
pcs.sevr$sevr.part = as.factor(is.value)

# fit multinomial regression with first 5 PCs
model.sevr     = multinom(sevr.part ~ PC1 + PC2 + PC3 + PC4 + PC5, data = pcs.sevr)
## # weights:  28 (18 variable)
## initial  value 2470.376552 
## iter  10 value 1472.530627
## iter  20 value 1471.598266
## final  value 1471.524236 
## converged
bestModel.sevr = stepAIC(model.sevr, k = log(nrow(pcs.sevr))) # best model via BIC criteria
## Start:  AIC=3077.79
## sevr.part ~ PC1 + PC2 + PC3 + PC4 + PC5
## 
## # weights:  24 (15 variable)
## initial  value 2470.376552 
## iter  10 value 1475.783073
## iter  20 value 1475.463913
## iter  20 value 1475.463913
## iter  20 value 1475.463913
## final  value 1475.463913 
## converged
## # weights:  24 (15 variable)
## initial  value 2470.376552 
## iter  10 value 1479.525967
## iter  20 value 1478.876491
## final  value 1478.862661 
## converged
## # weights:  24 (15 variable)
## initial  value 2470.376552 
## iter  10 value 1473.799972
## iter  20 value 1473.587005
## iter  20 value 1473.587000
## iter  20 value 1473.587000
## final  value 1473.587000 
## converged
## # weights:  24 (15 variable)
## initial  value 2470.376552 
## iter  10 value 1472.938898
## iter  20 value 1472.689044
## final  value 1472.688977 
## converged
## # weights:  24 (15 variable)
## initial  value 2470.376552 
## iter  10 value 1475.392168
## iter  20 value 1474.727691
## final  value 1474.726661 
## converged
##        Df    AIC
## - PC4   3 3057.7
## - PC3   3 3059.5
## - PC5   3 3061.7
## - PC1   3 3063.2
## - PC2   3 3070.0
## <none>    3077.8
## # weights:  24 (15 variable)
## initial  value 2470.376552 
## iter  10 value 1472.938898
## iter  20 value 1472.689044
## final  value 1472.688977 
## converged
## 
## Step:  AIC=3057.66
## sevr.part ~ PC1 + PC2 + PC3 + PC5
## 
## # weights:  20 (12 variable)
## initial  value 2470.376552 
## iter  10 value 1477.399576
## final  value 1476.796010 
## converged
## # weights:  20 (12 variable)
## initial  value 2470.376552 
## iter  10 value 1481.097060
## final  value 1479.913365 
## converged
## # weights:  20 (12 variable)
## initial  value 2470.376552 
## iter  10 value 1474.603291
## final  value 1474.435427 
## converged
## # weights:  20 (12 variable)
## initial  value 2470.376552 
## iter  10 value 1476.909086
## final  value 1475.830819 
## converged
##        Df    AIC
## - PC3   3 3038.7
## - PC5   3 3041.5
## - PC1   3 3043.4
## - PC2   3 3049.7
## <none>    3057.7
## # weights:  20 (12 variable)
## initial  value 2470.376552 
## iter  10 value 1474.603291
## final  value 1474.435427 
## converged
## 
## Step:  AIC=3038.7
## sevr.part ~ PC1 + PC2 + PC5
## 
## # weights:  16 (9 variable)
## initial  value 2470.376552 
## iter  10 value 1479.107979
## final  value 1479.006364 
## converged
## # weights:  16 (9 variable)
## initial  value 2470.376552 
## iter  10 value 1481.860575
## final  value 1481.548181 
## converged
## # weights:  16 (9 variable)
## initial  value 2470.376552 
## iter  10 value 1478.790820
## final  value 1477.733336 
## converged
##        Df    AIC
## - PC5   3 3022.8
## - PC1   3 3025.4
## - PC2   3 3030.5
## <none>    3038.7
## # weights:  16 (9 variable)
## initial  value 2470.376552 
## iter  10 value 1478.790820
## final  value 1477.733336 
## converged
## 
## Step:  AIC=3022.84
## sevr.part ~ PC1 + PC2
## 
## # weights:  12 (6 variable)
## initial  value 2470.376552 
## iter  10 value 1481.845961
## iter  10 value 1481.845960
## iter  10 value 1481.845960
## final  value 1481.845960 
## converged
## # weights:  12 (6 variable)
## initial  value 2470.376552 
## iter  10 value 1484.477035
## iter  10 value 1484.477024
## iter  10 value 1484.477024
## final  value 1484.477024 
## converged
##        Df    AIC
## - PC1   3 3008.6
## - PC2   3 3013.9
## <none>    3022.8
## # weights:  12 (6 variable)
## initial  value 2470.376552 
## iter  10 value 1481.845961
## iter  10 value 1481.845960
## iter  10 value 1481.845960
## final  value 1481.845960 
## converged
## 
## Step:  AIC=3008.6
## sevr.part ~ PC2
## 
## # weights:  8 (3 variable)
## initial  value 2470.376552 
## final  value 1488.557695 
## converged
##        Df    AIC
## - PC2   3 2999.6
## <none>    3008.6
## # weights:  8 (3 variable)
## initial  value 2470.376552 
## final  value 1488.557695 
## converged
## 
## Step:  AIC=2999.57
## sevr.part ~ 1
probs.sevr     = predict(bestModel.sevr, type = "probs") # predict class probabilities

# climatological baseline (same shape as probs)
true.labels = factor(is.value, levels = levels(pcs.sevr$sevr.part))
obs.sevr    = as.numeric(true.labels)  # convert to class indices (1-based)
climo.tab   = prop.table(table(obs.sevr))
climo.sevr  = matrix(rep(climo.tab, each = nrow(probs.sevr)), 
                     nrow = nrow(probs.sevr), byrow = FALSE)
# compute RPSS
rpss.sevr = rps(obs.sevr, probs.sevr, baseline = climo.sevr)$rpss
cat(sprintf("RPSS = %.3f", rpss.sevr ))
## RPSS = 0.865

6 Final remarks