This R script trains gradient boosting machine (GBM) models to perform the state classification task (i.e., predicting homeostatic vs. intermediate vs. reactive).
Load requisite packages and define directories.
Note that directories are relative to the R project path.
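The library calls themselves are not reproduced in this excerpt; a minimal sketch inferred from the functions used below (the exact set loaded by the original script may differ):
# load packages (inferred from functions used below; original list may differ)
library(data.table)  # as.data.table, fwrite, := syntax
library(magrittr)    # %>% pipe
library(purrr)       # map, imap, walk, set_names
library(ggplot2)     # plotting
library(plotROC)     # geom_roc
library(caret)       # trainControl, preProcess, train, varImp, confusionMatrix
library(gbm)         # backend for caret's method = "gbm"
library(pROC)        # multiclass.roc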
# define cell types
celltypes = c("Astrocyte", "Microglia", "Vessel") %>% purrr::set_names()
# set directories
ddir = file.path("Results", "CNN", "1.2 - State Partition")
dir4 = file.path("Results", "4 - Spectral Clustering")
dir7 = file.path("Results", "7 - Gradient Boosting Machines", "State")
# create directory
if(!dir.exists(dir7)) {dir.create(dir7, recursive = TRUE)}
# set seed
set.seed(1234)
Load processed ROI measurement data from the 4 - Spectral Clustering
directory. Note that this script uses the same train/test/validation split as the convolutional neural network (CNN) by loading the data object created by the CNN/1 - Partition ROIs
script.
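The loading chunk is not reproduced in this excerpt; a minimal sketch of the step it performs, with the file names as assumptions:
# load processed ROI measurements and the CNN train/test/validation split
# (file names are assumptions; point these at the actual objects on disk)
all = readRDS(file.path(dir4, "Processed ROI Data.rds"))
split = readRDS(file.path(ddir, "ROI Partition.rds"))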
Define function to merge ROI measurement data with the predetermined train/test/validation split.
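The merge_data definition is also not reproduced here; a minimal sketch, assuming the split object assigns a Train/Validation/Test label to each ROI identifier (the join key ROI and the column names are assumptions):
# merge ROI measurements with the predetermined partition labels (sketch only)
merge_data = function(dat, split) {
  dat = as.data.table(dat)
  split = as.data.table(split)
  # attach each ROI's Train/Validation/Test label to its measurements
  merged = merge(dat, split[, .(ROI, Partition)], by = "ROI")
  # ensure the outcome is a factor for caret
  merged[, State := factor(State)]
  return(merged)
}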
Map the function over the data objects for cell types with CNN output data.
# merge measurement data with partition labels for each cell type
all = map(celltypes, ~merge_data(all[[.x]], split[[.x]]))
Define function to plot variable importance scores.
# function to plot variable importance
plot_imp = function(imp) {
# convert to data table
imp = as.data.table(imp$importance, keep.rownames = "Marker") %>%
setnames("Overall", "Importance") %>%
.[order(-Importance)] %>%
.[, Marker := factor(Marker, levels = rev(Marker))]
# plot variable importance
p = ggplot(imp, aes(x = Importance, y = Marker, fill = Importance,
label = round(Importance, 2))) +
geom_bar(stat = "identity", width = 0.7, color = "black") +
scale_fill_gradient(low = "#EADA86", high = "#B67977") +
geom_text(size = 3, hjust = 1.2, fontface = "bold") +
scale_x_continuous(limits = c(-0.4, max(imp$Importance)),
expand = expansion(mult = c(0, 0.05))) +
theme_bw() +
theme(plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
axis.title.y = element_blank(),
axis.text.y = element_text(size = 10, color = "black"),
axis.title.x = element_text(size = 12, face = "bold"),
axis.ticks.x = element_line(color = "black"),
axis.ticks.y = element_blank(),
panel.border = element_rect(color = "black", fill = NA, size = 1),
legend.position = "none")
return(p)
}
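plot_imp expects the list-like object returned by caret::varImp(), i.e., an importance data.frame with an Overall column and marker row names. A quick self-contained check with mock values (illustrative only):
# sanity check with a mock importance object in the shape varImp() returns
mock_imp = list(importance = data.frame(Overall = c(2.1, 0.7, 1.4),
                                        row.names = c("GFAP", "VIM", "TSPO")))
print(plot_imp(mock_imp))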
Define a function to train the gradient boosting machine (GBM) model and save its output.
train_gbm = function(dat, lab, mx) {
# create subdirectory if needed
wdir = file.path(dir7, lab)
if(!dir.exists(wdir)) {dir.create(wdir)}
# partition data into training/test
train_dat = dat[Partition %in% c("Train", "Validation"), ]
test_dat = dat[Partition == "Test", ]
# establish 10-fold cross validation to determine the out-of-sample error
tC = trainControl(method = "cv", number = 10, savePredictions = TRUE,
classProbs = TRUE, verboseIter = TRUE)
# estimate pre-processing transformation (centering, scaling, remove zero
# variance) from training data, apply to all data
normalize = preProcess(train_dat[, ..mx],
method = c("center", "scale", "zv"), verbose = TRUE)
train_dat[, (mx) := predict(normalize, train_dat[, mx, with = FALSE])]
test_dat[, (mx) := predict(normalize, test_dat[, mx, with = FALSE])]
# train model on training set (80%)
gbm_model = train(x = train_dat[, ..mx], y = train_dat[, State],
method = "gbm", trControl = tC)
gbm_imp = varImp(gbm_model, scale = FALSE)
# plot variable importance
ggsave(file.path(wdir, "Variable Importance.pdf"), plot_imp(gbm_imp),
width = 8, height = 2.5 + length(mx)/2)
# test model on test set (20%)
gbm_pred = predict(gbm_model, test_dat[, ..mx])
gbm_cm = confusionMatrix(gbm_pred, test_dat[, State])
gbm_prob = data.table(predict(gbm_model, test_dat[, ..mx], type = "prob"),
test_dat[, .(State)])
# write GBM probabilities to file
fwrite(gbm_prob, file.path(wdir, "GBM Probabilities.csv"))
# calculate multiclass AUC
roc_calc = multiclass.roc(response = gbm_prob$State,
                          predictor = gbm_prob[, .(Homeostatic, Intermediate, Reactive)])
# create pairwise subsets for the one-vs-one ROC curves
h_i = copy(gbm_prob)[State %in% c("Homeostatic", "Intermediate")][, State := droplevels(State)]
h_r = copy(gbm_prob)[State %in% c("Homeostatic", "Reactive")][, State := droplevels(State)]
i_r = copy(gbm_prob)[State %in% c("Intermediate", "Reactive")][, State := droplevels(State)]
# plot multiclass AUC curve
roc_plot = ggplot() +
ggtitle("Gradient Boosting Machines ROC") +
geom_abline(aes(intercept = 0, slope = 1), linetype = "dashed", size = 1, color = "gray") +
geom_roc(data = h_i, aes(d = State, m = Intermediate, color = "Homeostatic/Intermediate"), labels = FALSE, pointsize = 0) +
geom_roc(data = i_r, aes(d = State, m = Reactive, color = "Intermediate/Reactive"), labels = FALSE, pointsize = 0) +
geom_roc(data = h_r, aes(d = State, m = Reactive, color = "Reactive/Homeostatic"), labels = FALSE, pointsize = 0) +
scale_colour_manual(values = c("#39B200", "#F0C808", "#960200")) +
labs(x = "1 - Specificity", y = "Sensitivity", color = "Comparison", subtitle = paste("Multi-Class AUC =", round(as.numeric(roc_calc$auc), 4))) + theme_bw() +
theme(plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_text(face = "italic"),
axis.title.x = element_text(size = 14, face = "bold"),
axis.title.y = element_text(size = 14, face = "bold"),
legend.title = element_text(size = 12, face = "bold"),
legend.text = element_text(size = 12),
legend.position = c(0.72, 0.17),
legend.background = element_rect(fill = "white", color = "black"),
panel.border = element_rect(color = "black", fill = NA, size = 1))
# save ROC curve
ggsave(file.path(wdir, "Multi-Class ROC Curve.pdf"), roc_plot, width = 6, height = 6.2)
# send output to external file
logf = file.path(wdir, "GBM Log.txt")
if (file.exists(logf)) { file.remove(logf) }
# establish sink
sink(logf)
gbm_model %>%
list(., .$results, .$finalModel, gbm_imp,
as.matrix(gbm_imp$importance), gbm_cm) %>%
walk(., print)
cat(paste0("\nMulti-Class AUC = ", round(as.numeric(roc_calc$auc), 4)))
sink()
# return full list of output
return(list(Test = test_dat, Train = train_dat, Model = gbm_model,
Prediction = gbm_pred, CM = gbm_cm, Scores = gbm_prob,
ROC = roc_calc, Importance = gbm_imp))
}
Train the GBM model by mapping over the cell-type data objects in all.
# define markers of interest
markers = list(Astrocyte = c("GFAP", "YKL40", "VIM", "TSPO",
"EAAT1", "EAAT2", "GS"),
Microglia = c("MHC2", "CD68", "TMEM119", "TSPO", "FTL"),
Vessel = c("GFAP", "YKL40", "VIM", "TSPO", "EAAT1", "EAAT2", "GS"))
# train GBM model
gbm_results = imap(all, ~train_gbm(.x, .y, markers[[.y]]))
# save output
saveRDS(gbm_results, file.path(dir7, "GBM Results.rds"))
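The saved list can be reloaded in a later session without retraining, for example to inspect a confusion matrix:
# reload saved results and inspect, e.g., the microglia confusion matrix
gbm_results = readRDS(file.path(dir7, "GBM Results.rds"))
print(gbm_results$Microglia$CM)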