Catalina Jerez
catalina.jerez@colorado.edu
Construction safety outcomes data have been compiled from accident reports and described in Tixier et al. (2016). The data, attribute information and relevant can be found at For the safety outcome of body part, with five categories (see Table 2 of Tixier et al., 2016).
Twenty leading attributes, all binary (1 or 0 – i.e., present or absent) are provided for each accident record.
Perform a PCA on the attribute data - show the Eigen spectrum and the leading four Eigen vectors.
Fit a best multinomial regression with the principal components as predictors to model the categorical probability of injuries to the five category of body parts and compute the RPSS
Repeat b) for the Injury Severity Tixier et al. (2016) modeled these using Random Forests and Stochastic Boosting. They could not get good skill for Injury Severity. Compare your results with theirs.
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 542916 29.0 1201048 64.2 NA 700280 37.4
## Vcells 1002446 7.7 8388608 64.0 36864 1963567 15.0
rm(list = ls())
# Load necessary libraries
spatialAnalysis.Lib = c("fields", "maps", "rnaturalearth", "sf")
statisAnalysis.Lib = c("psych")
dataManipulation.Lib = c("dplyr", "reshape2", "tidyr")
dataVisualization.Lib = c("ggplot2", "ggrepel", "ggpubr", "RColorBrewer")
modeling.Lib = c("nnet", "MASS", "verification")
list.packages = unique(c(spatialAnalysis.Lib, statisAnalysis.Lib,
modeling.Lib,
dataManipulation.Lib, dataVisualization.Lib))
# Load all required packages
sapply(list.packages, require, character.only = TRUE)
## fields maps rnaturalearth sf psych
## TRUE TRUE TRUE TRUE TRUE
## nnet MASS verification dplyr reshape2
## TRUE TRUE TRUE TRUE TRUE
## tidyr ggplot2 ggrepel ggpubr RColorBrewer
## TRUE TRUE TRUE TRUE TRUE
# constants
text.size = 15
customize.bw = theme_bw(base_family = "Times") +
theme(
axis.text.x = element_text(size = text.size, vjust = 0.5),
axis.text.y = element_text(size = text.size),
axis.title = element_text(size = text.size + 2, face = "bold"),
plot.title = element_text(size = text.size + 2, face = "bold"),
legend.position = "bottom",
legend.title = element_text(size = text.size, face = "bold"),
legend.text = element_text(size = text.size),
panel.border = element_rect(color = "#000000", fill = NA, linewidth = 0.5),
panel.grid.minor.x = element_line(color = "#d9d9d9", linetype = "dotted"),
panel.grid.minor.y = element_line(color = "#d9d9d9", linetype = "dotted"),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank()
)
# color palettes
palette.rdbu = colorRampPalette(rev(brewer.pal(9, "RdBu")))(200)
# world map
world.sf = ne_countries(scale = "medium", returnclass = "sf") %>%
st_wrap_dateline(options = c("WRAPDATELINE=YES", "DATELINEOFFSET=180")) %>%
st_transform("+proj=longlat +datum=WGS84")
# plot for PCA
plot.eigenSpectrum = function(pca_result, n = 20) {
eigvals = pca_result$sdev^2
frac_var = eigvals / sum(eigvals)
df = data.frame(Mode = 1:length(frac_var), Variance = frac_var)
breaks = seq(0, 20, 4); breaks[1] = 1
ggplot(df[1:n, ], aes(x = Mode, y = Variance)) +
geom_line(color = "#636363") +
geom_point(color = "#000", fill = "#bdbdbd", shape = 21, size = 3) +
scale_x_continuous(breaks = breaks)+
labs(y = "Fraction Variance", x = "Mode",title = "PCA Eigenvalue Spectrum")+
theme_bw(base_size = 12, base_family = "Times") +
customize.bw
}
base.path = "https://civil.colorado.edu/~balajir/CVEN6833/const-data/"
# Body part data's
df.body = read.csv(paste0(base.path, "data.body.part.csv"))
imp.body = read.csv(paste0(base.path, "gbm.importance.50.body.csv"), header=TRUE)
# Injury Severity data's
df.sevr = read.csv(paste0(base.path, "data.severity.csv"))
imp.sevr = read.csv(paste0(base.path, "gbm.importance.50.code.csv"), header=TRUE)
# Top 20 variables
to.retain = 20
impVars.body = as.character(imp.body[1:to.retain, "var"])
impVars.sevr = as.character(imp.sevr[1:to.retain, "var"])
# select the important variables
bp.index = which(colnames(df.body) %in% impVars.body)
bp.predictors = df.body[,bp.index]
bp.value = df.body$body.part
is.index = which(colnames(df.sevr) %in% impVars.sevr)
is.predictors = df.sevr[,is.index]
is.value = df.sevr$injury.severity
# perform PCA
bp.pca = prcomp(bp.predictors, center = TRUE, scale. = TRUE)
# plot eigenvalue spectrum (20 modes)
plot.eigenSpectrum(bp.pca)
leading.vect = as.data.frame( bp.pca$rotation[,1:4] )
leading.vect$var = rownames(leading.vect)
leading.vectLong = gather(leading.vect, key = "PC", value = "Loading", -var)
# plot
ggplot(leading.vectLong, aes(x = reorder(var, abs(Loading)), y = Loading, fill = PC)) +
geom_bar(stat = "identity", position = "dodge") +
coord_flip() +
labs(title = "Leading Four Eigenvectors (Loadings)",
x = "Variables", y = "Loading") +
scale_fill_brewer(palette = "Set1") +
customize.bw
# data for regression
pcs.body = as.data.frame(bp.pca$x)
pcs.body$body.part = as.factor(bp.value)
# fit multinomial regression with first 5 PCs
model.body = multinom(body.part ~ PC1 + PC2 + PC3 + PC4 + PC5, data = pcs.body)
## # weights: 35 (24 variable)
## initial value 6830.454500
## iter 10 value 5030.738100
## iter 20 value 4884.611910
## iter 30 value 4854.568291
## final value 4854.559561
## converged
bestModel.body = stepAIC(model.body, k = log(nrow(pcs.body))) # best model via BIC criteria
## Start: AIC=9909.6
## body.part ~ PC1 + PC2 + PC3 + PC4 + PC5
##
## # weights: 30 (20 variable)
## initial value 6830.454500
## iter 10 value 5416.594308
## iter 20 value 5265.669544
## final value 5242.496039
## converged
## # weights: 30 (20 variable)
## initial value 6830.454500
## iter 10 value 5115.416155
## iter 20 value 4977.899356
## final value 4948.834541
## converged
## # weights: 30 (20 variable)
## initial value 6830.454500
## iter 10 value 5138.226446
## iter 20 value 5025.810296
## final value 5000.686585
## converged
## # weights: 30 (20 variable)
## initial value 6830.454500
## iter 10 value 5048.321172
## iter 20 value 4973.282029
## iter 30 value 4961.197355
## iter 30 value 4961.197349
## iter 30 value 4961.197349
## final value 4961.197349
## converged
## # weights: 30 (20 variable)
## initial value 6830.454500
## iter 10 value 5089.009095
## iter 20 value 4927.337443
## iter 30 value 4887.101658
## iter 30 value 4887.101653
## iter 30 value 4887.101653
## final value 4887.101653
## converged
## Df AIC
## <none> 9909.6
## - PC5 4 9941.3
## - PC2 4 10064.7
## - PC4 4 10089.5
## - PC3 4 10168.4
## - PC1 4 10652.1
probs.body = predict(bestModel.body, type = "probs") # predict class probabilities
# climatological baseline (same shape as probs)
true.labels = factor(bp.value, levels = levels(pcs.body$body.part))
obs.body = as.numeric(true.labels) # convert to class indices (1-based)
climo.tab = prop.table(table(obs.body))
climo.body = matrix(rep(climo.tab, each = nrow(probs.body)),
nrow = nrow(probs.body), byrow = FALSE)
# compute RPSS
rpss.body = rps(obs.body, probs.body, baseline = climo.body)$rpss
cat(sprintf("RPSS = %.3f", rpss.body ))
## RPSS = 0.627
# perform PCA
is.pca = prcomp(is.predictors, center = TRUE, scale. = TRUE)
# plot eigenvalue spectrum (20 modes)
plot.eigenSpectrum(is.pca)
leading.vect = as.data.frame( is.pca$rotation[,1:4] )
leading.vect$var = rownames(leading.vect)
leading.vectLong = gather(leading.vect, key = "PC", value = "Loading", -var)
# plot
ggplot(leading.vectLong, aes(x = reorder(var, abs(Loading)), y = Loading, fill = PC)) +
geom_bar(stat = "identity", position = "dodge") +
coord_flip() +
labs(title = "Leading Four Eigenvectors (Loadings)",
x = "Variables", y = "Loading") +
scale_fill_brewer(palette = "Set1") +
customize.bw
# data for regression
pcs.sevr = as.data.frame(is.pca$x)
pcs.sevr$sevr.part = as.factor(is.value)
# fit multinomial regression with first 5 PCs
model.sevr = multinom(sevr.part ~ PC1 + PC2 + PC3 + PC4 + PC5, data = pcs.sevr)
## # weights: 28 (18 variable)
## initial value 2470.376552
## iter 10 value 1472.530627
## iter 20 value 1471.598266
## final value 1471.524236
## converged
bestModel.sevr = stepAIC(model.sevr, k = log(nrow(pcs.sevr))) # best model via BIC criteria
## Start: AIC=3077.79
## sevr.part ~ PC1 + PC2 + PC3 + PC4 + PC5
##
## # weights: 24 (15 variable)
## initial value 2470.376552
## iter 10 value 1475.783073
## iter 20 value 1475.463913
## iter 20 value 1475.463913
## iter 20 value 1475.463913
## final value 1475.463913
## converged
## # weights: 24 (15 variable)
## initial value 2470.376552
## iter 10 value 1479.525967
## iter 20 value 1478.876491
## final value 1478.862661
## converged
## # weights: 24 (15 variable)
## initial value 2470.376552
## iter 10 value 1473.799972
## iter 20 value 1473.587005
## iter 20 value 1473.587000
## iter 20 value 1473.587000
## final value 1473.587000
## converged
## # weights: 24 (15 variable)
## initial value 2470.376552
## iter 10 value 1472.938898
## iter 20 value 1472.689044
## final value 1472.688977
## converged
## # weights: 24 (15 variable)
## initial value 2470.376552
## iter 10 value 1475.392168
## iter 20 value 1474.727691
## final value 1474.726661
## converged
## Df AIC
## - PC4 3 3057.7
## - PC3 3 3059.5
## - PC5 3 3061.7
## - PC1 3 3063.2
## - PC2 3 3070.0
## <none> 3077.8
## # weights: 24 (15 variable)
## initial value 2470.376552
## iter 10 value 1472.938898
## iter 20 value 1472.689044
## final value 1472.688977
## converged
##
## Step: AIC=3057.66
## sevr.part ~ PC1 + PC2 + PC3 + PC5
##
## # weights: 20 (12 variable)
## initial value 2470.376552
## iter 10 value 1477.399576
## final value 1476.796010
## converged
## # weights: 20 (12 variable)
## initial value 2470.376552
## iter 10 value 1481.097060
## final value 1479.913365
## converged
## # weights: 20 (12 variable)
## initial value 2470.376552
## iter 10 value 1474.603291
## final value 1474.435427
## converged
## # weights: 20 (12 variable)
## initial value 2470.376552
## iter 10 value 1476.909086
## final value 1475.830819
## converged
## Df AIC
## - PC3 3 3038.7
## - PC5 3 3041.5
## - PC1 3 3043.4
## - PC2 3 3049.7
## <none> 3057.7
## # weights: 20 (12 variable)
## initial value 2470.376552
## iter 10 value 1474.603291
## final value 1474.435427
## converged
##
## Step: AIC=3038.7
## sevr.part ~ PC1 + PC2 + PC5
##
## # weights: 16 (9 variable)
## initial value 2470.376552
## iter 10 value 1479.107979
## final value 1479.006364
## converged
## # weights: 16 (9 variable)
## initial value 2470.376552
## iter 10 value 1481.860575
## final value 1481.548181
## converged
## # weights: 16 (9 variable)
## initial value 2470.376552
## iter 10 value 1478.790820
## final value 1477.733336
## converged
## Df AIC
## - PC5 3 3022.8
## - PC1 3 3025.4
## - PC2 3 3030.5
## <none> 3038.7
## # weights: 16 (9 variable)
## initial value 2470.376552
## iter 10 value 1478.790820
## final value 1477.733336
## converged
##
## Step: AIC=3022.84
## sevr.part ~ PC1 + PC2
##
## # weights: 12 (6 variable)
## initial value 2470.376552
## iter 10 value 1481.845961
## iter 10 value 1481.845960
## iter 10 value 1481.845960
## final value 1481.845960
## converged
## # weights: 12 (6 variable)
## initial value 2470.376552
## iter 10 value 1484.477035
## iter 10 value 1484.477024
## iter 10 value 1484.477024
## final value 1484.477024
## converged
## Df AIC
## - PC1 3 3008.6
## - PC2 3 3013.9
## <none> 3022.8
## # weights: 12 (6 variable)
## initial value 2470.376552
## iter 10 value 1481.845961
## iter 10 value 1481.845960
## iter 10 value 1481.845960
## final value 1481.845960
## converged
##
## Step: AIC=3008.6
## sevr.part ~ PC2
##
## # weights: 8 (3 variable)
## initial value 2470.376552
## final value 1488.557695
## converged
## Df AIC
## - PC2 3 2999.6
## <none> 3008.6
## # weights: 8 (3 variable)
## initial value 2470.376552
## final value 1488.557695
## converged
##
## Step: AIC=2999.57
## sevr.part ~ 1
probs.sevr = predict(bestModel.sevr, type = "probs") # predict class probabilities
# climatological baseline (same shape as probs)
true.labels = factor(is.value, levels = levels(pcs.sevr$sevr.part))
obs.sevr = as.numeric(true.labels) # convert to class indices (1-based)
climo.tab = prop.table(table(obs.sevr))
climo.sevr = matrix(rep(climo.tab, each = nrow(probs.sevr)),
nrow = nrow(probs.sevr), byrow = FALSE)
# compute RPSS
rpss.sevr = rps(obs.sevr, probs.sevr, baseline = climo.sevr)$rpss
cat(sprintf("RPSS = %.3f", rpss.sevr ))
## RPSS = 0.865
Multinomial Regression for Body Part: the model significantly outperforms the climatological baseline (random prediction based on prior class frequencies), showing good predictive power for body part injuries.
Multinomial Regression for Injury Severity: contrary to expectations from Tixier et al. (2016), the model shows high predictive skill (RPSS > 0.8). This may be attributed to dimensionality reduction, noise removal, or better class balance after PCA compression.