Visualization of xgboost ML with 4-fold cross-validation of full dataset.
Plot ROC curves, confusion matrices, and SHAP plots from top n models using xgboost results derived from “KPSZV1 Postvax Highdose PfSPZ Multimodal ML xgb nested fs cv on full set JCI Revision.Rmd”.
library(xgboost)
library(dplyr)
library(tidyverse)
library(pROC)
library(caret)
library(doMC)
library(data.table)
library(xgboost)
library(SHAPforxgboost)
library(mlr)
library(ggpubr)
library(tictoc)
library(Biobase)
library(googledrive)
library(parallel)
library(parallelMap)
parallelStartSocket(cpus = detectCores())
#from google drive
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
as_id("1QGtZ_VpOFgS7fGDeOwCwYg-hBzENwhk6"), path = temp, overwrite = TRUE)
high_dose_features_class <- readRDS(file = dl$local_path)
train_features <- high_dose_features_class %>%
dplyr::select(-c(sex, site))
This is a large file.
#from google drive
temp <- tempfile(fileext = ".Rdata")
dl <- drive_download(
as_id("1Rb9kQ5ZjtRJsZ9hX-n0xSUcuPlqrlTMf"), path = temp, overwrite = TRUE)
all_folds_2500_runs <- load(file = dl$local_path)
temp <- tempfile(fileext = ".xlsx")
dl <- drive_download(
as_id("1ScvZjV0WWXc7H0tPCJoN61FyuM-bUExQ"), path = temp, overwrite = TRUE)
per_run_res_2500_runs <- readxl::read_xlsx(dl$local_path)
temp <- tempfile(fileext = ".xlsx")
dl <- drive_download(
as_id("1SmyKUy6STA5BEvDUF7KJFE3g_E-ibY7K"), path = temp, overwrite = TRUE)
summarized_res_2500_runs <- readxl::read_xlsx(dl$local_path)
smooth <- FALSE
runs_to_test <- summarized_res_2500_runs[summarized_res_2500_runs$mean_cv_accuracy>.66,]$run #select only the high-dose PfSPZ models with accuracy >66%.
#runs_to_test <- c("run 211", "run 2222", "run 671")
best_run <- best_combo <- all_folds_response <- all_folds_predictor <- roc_dat <- auc <- confusion_mat <- response_dichotomous <- c()
across_fold_performance <- logloss_list <- brier_list <- c()
for(j in runs_to_test){
best_run[[j]] <- per_run_res_2500_runs[per_run_res_2500_runs$run==j,] %>%
group_by(run) %>%
slice_max(AUC) %>%
distinct(Features) %>%
pull(var = run)
best_combo[[j]] <- per_run_res_2500_runs[per_run_res_2500_runs$run==j,] %>%
group_by(run) %>%
slice_max(AUC) %>%
distinct(Features) %>%
pull(var = Features)
#calculate predictive accuracy across all 4 folds
all_folds_response[[j]] <- c(fold1[[best_run[[j]]]]$predictor_data$class,
fold2[[best_run[[j]]]]$predictor_data$class,
fold3[[best_run[[j]]]]$predictor_data$class,
fold4[[best_run[[j]]]]$predictor_data$class)
all_folds_predictor[[j]] <- c(fold1[[best_run[[j]]]]$predictor_data$pred.prob,
fold2[[best_run[[j]]]]$predictor_data$pred.prob,
fold3[[best_run[[j]]]]$predictor_data$pred.prob,
fold4[[best_run[[j]]]]$predictor_data$pred.prob)
roc_dat[[j]] <- pROC::roc(response = all_folds_response[[j]],
predictor = all_folds_predictor[[j]],
smooth = smooth)
auc[[j]] <- roc_dat[[j]]$auc %>% as.numeric(.) %>% signif(.,digits=3)
response_dichotomous[[j]] <- ifelse(all_folds_predictor[[j]] > 0.5,1,0)
#remember that since we are using the high-dose model, we are going with the original high-dose designation where infected=0, protected=1
#but we are actually predicting the opposite for placebo, where 1 means infected(not protected), and infected=0
response_dichotomous[[j]] <- factor(ifelse(response_dichotomous[[j]]==1, "protected", "infected"))
confusion_mat[[j]] <- caret::confusionMatrix(response_dichotomous[[j]], all_folds_response[[j]], positive = "protected")
logloss_list[[j]] <- MLmetrics::LogLoss(all_folds_predictor[[j]], as.numeric(all_folds_response[[j]])-1)
brier_list[[j]] <- mlr::measureBrier(all_folds_predictor[[j]], all_folds_response[[j]], "infected", "protected")
across_fold_performance[[j]] <- confusion_mat[[j]]$overall
}
across_fold_performance_df <- bind_rows(across_fold_performance, .id = "run") %>%
arrange(desc(Kappa)) %>%
mutate(Kappa_rank = 1:nrow(.)) %>%
arrange(desc(Accuracy)) %>%
mutate(Accuracy_rank = 1:nrow(.))
auc_df_high_dose <- bind_rows(auc, .id = "run") %>%
pivot_longer(., cols = everything(), names_to = "run", values_to = "AUC") %>%
arrange(desc(AUC)) %>%
mutate(AUC_rank = 1:nrow(.))
logloss_df <- bind_rows(logloss_list, .id = "run") %>%
pivot_longer(., cols = everything(), names_to = "run", values_to = "LogLoss") %>%
arrange(LogLoss) %>%
mutate(LogLoss_rank = 1:nrow(.))
brier_df <- bind_rows(brier_list, .id = "run") %>%
pivot_longer(., cols = everything(), names_to = "run", values_to = "BrierScore") %>%
mutate(BrierScore = signif(BrierScore, 3)) %>%
arrange(BrierScore) %>%
mutate(BrierScore_rank = 1:nrow(.))
across_fold_perform_high_dose <- auc_df_high_dose %>%
left_join(., across_fold_performance_df, by = "run") %>%
left_join(., summarized_res_2500_runs,
by = "run") %>%
left_join(., logloss_df,
by = "run") %>%
left_join(., brier_df,
by = "run") %>%
mutate(AverageRank = (Accuracy_rank + AUC_rank + BrierScore_rank + Kappa_rank + LogLoss_rank)/5) %>%
dplyr::select(run, Features, AverageRank, BrierScore_rank, Kappa_rank, LogLoss_rank, AUC_rank, Accuracy_rank, AUC, BrierScore, LogLoss, Kappa, Accuracy:McnemarPValue, mean_cv_AUC:sd_cv_accuracy) %>%
group_by(run) %>%
slice_head(n=1) %>%
ungroup() %>%
arrange(AverageRank)
#clean up feature names
# across_fold_perform_high_dose <- across_fold_perform_high_dose %>%
# mutate(Features = gsub("CSP_specific_IgG_", "CSP-specific IgG_", Features)) %>%
# mutate(Features = gsub("FACS_|_innate_BTM|_highBTM|_monaco|CytokineObsConc_", "", Features)) %>%
# mutate(Features = gsub("PfSPZ", "Pf", Features)) %>%
# mutate(Features = gsub("_of_live_leukocytes", "_of_live_PBMCs", Features)) %>%
# mutate(Features = gsub("_", " ", Features))
top_n <- 4
across_fold_perform_highdose_rank_by_AverageRank <- across_fold_perform_high_dose %>%
arrange(AverageRank) %>%
drop_na()
select_data <- across_fold_perform_highdose_rank_by_AverageRank[c(1:top_n),]
data.labels <- data.frame("name" = select_data[1:4,]$run,
"features" = select_data[1:4,]$Features,
"brier" = select_data[1:4,]$BrierScore,
"auc" = select_data[1:4,]$AUC,
"kappa" = select_data[1:4,]$Kappa,
"logloss" = select_data[1:4,]$LogLoss)
roc_plot_dat <- roc_dat[select_data[1:4,]$run]
roc_plots <- ggroc(roc_plot_dat, legacy.axes = TRUE, size = 1.5) +
theme_bw() +
scale_color_brewer("", palette = "Set1", type = "qual") +
scale_fill_brewer("", palette = "Set1", type = "qual") +
theme(legend.position = "none") +
geom_text(data = data.labels[1,], aes(0.875, 0.25, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
geom_text(data = data.labels[2,], aes(0.875, 0.2, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
geom_text(data = data.labels[3,], aes(0.875, 0.15, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
geom_text(data = data.labels[4,], aes(0.875, 0.1, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
#geom_text(data = data.labels[5,], aes(0.875, 0.05, label = paste("AUC =", auc)), hjust=1, show_guide=FALSE) +
theme(plot.margin = unit(c(1,1,1,1), "cm")) +
geom_abline(slope = 1, linetype = "dotted", color = "gray")
roc_plots
library(ggplot2)
library(scales)
ggplotConfusionMatrix <- function(m){
mytitle <- paste(
"accuracy", percent_format()(m$overall[1]),
"kappa", percent_format()(m$overall[2]))
mydata <- as.data.frame(m$table) %>%
mutate(Prediction = ifelse(Prediction == "infected", "NP", "P")) %>%
mutate(Reference = ifelse(Reference == "infected", "NP", "P"))
p <- ggplot(data = mydata,
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = log(Freq)), colour = "white") +
scale_fill_gradient(low = "white", high = "steelblue") +
geom_text(aes(x = Reference, y = Prediction, label = Freq, size = 24)) +
ggtitle(mytitle) +
ylab("predicted") +
xlab("actual outcome") +
theme(legend.position = "none",
panel.background = element_blank(),
plot.background = element_blank(),
axis.ticks = element_blank(),
axis.title = element_text(color = "black", size = 10),
axis.text = element_text(color = "black", size = 10),
title = element_text(size = 9),) +
theme(plot.margin = unit(c(0.5,0.45,0.01,0.4), "cm"))
return(p)
}
#my_models <- c("run 1727", "run 671", "run 808", "run 439") #ranked by Brier
my_models <- select_data[1:4,]$run #ranked by average rank
my_plots <- list(ggplotConfusionMatrix(confusion_mat[[my_models[1]]]),
ggplotConfusionMatrix(confusion_mat[[my_models[2]]]),
ggplotConfusionMatrix(confusion_mat[[my_models[3]]]),
ggplotConfusionMatrix(confusion_mat[[my_models[4]]]))
names(my_plots) <- my_models
my_highdose_confus_mat_plot_top <- ggarrange(plotlist = my_plots[1:2],
labels = names(my_plots[1:2]),
font.label = list(size=14, face="plain"),
ncol = 2,
hjust = -0.1)
my_highdose_confus_mat_plot_bottom <- ggarrange(plotlist = my_plots[3:4],
labels = names(my_plots[3:4]),
font.label = list(size=14, face="plain"),
ncol = 2,
hjust = -0.1)
my_highdose_confus_mat_plot_all <- ggarrange(my_highdose_confus_mat_plot_top,
my_highdose_confus_mat_plot_bottom,
nrow=2)
my_highdose_confus_mat_plot_all
addSmallLegend <- function(myPlot, pointSize = 0.75, textSize = 6, spaceLegend = 0.75) {
myPlot +
guides(shape = guide_legend(override.aes = list(size = pointSize)),
color = guide_legend(override.aes = list(size = pointSize))) +
theme(legend.title = element_text(size = textSize),
legend.text = element_text(size = textSize),
legend.key.size = unit(spaceLegend, "lines"))
}
#filter select data by performance metric
select_data <- across_fold_perform_highdose_rank_by_AverageRank %>%
arrange(AverageRank) %>%
slice_head(n=25)
#build shap data from results of each fold
shap_prep_fold1 <- shap_prep_fold2 <- shap_prep_fold3 <- shap_prep_fold4 <- shap_prep_rbind <- shap_plots <- c()
for(k in rev(select_data$run)){
shap_prep_fold1[[k]] <- shap.prep(xgb_model = fold1[[k]]$downselected_xgb_results,
X_train = as.matrix(fold1[[k]]$predictor_data[,fold1[[k]]$downselected_xgb_results$feature_names]))
shap_prep_fold2[[k]] <- shap.prep(xgb_model = fold2[[k]]$downselected_xgb_results,
X_train = as.matrix(fold2[[k]]$predictor_data[,fold2[[k]]$downselected_xgb_results$feature_names]))
shap_prep_fold3[[k]] <- shap.prep(xgb_model = fold3[[k]]$downselected_xgb_results,
X_train = as.matrix(fold3[[k]]$predictor_data[,fold3[[k]]$downselected_xgb_results$feature_names]))
shap_prep_fold4[[k]] <- shap.prep(xgb_model = fold4[[k]]$downselected_xgb_results,
X_train = as.matrix(fold4[[k]]$predictor_data[,fold4[[k]]$downselected_xgb_results$feature_names]))
shap_prep_rbind[[k]] <- rbind(shap_prep_fold1[[k]],
shap_prep_fold2[[k]],
shap_prep_fold3[[k]],
shap_prep_fold4[[k]]) %>%
group_by(variable) %>%
mutate(ID = 1:n()) %>%
ungroup()
levels(shap_prep_rbind[[k]]$variable) <- gsub("FACS_|_innate_BTM|_highBTM|_monaco|CytokineObsConc_", "", levels(shap_prep_rbind[[k]]$variable))
levels(shap_prep_rbind[[k]]$variable) <- gsub("PfSPZ", "Pf", levels(shap_prep_rbind[[k]]$variable))
levels(shap_prep_rbind[[k]]$variable) <- gsub("_of_live_leukocytes", "_of_live_PBMCs", levels(shap_prep_rbind[[k]]$variable))
levels(shap_prep_rbind[[k]]$variable) <- gsub("_", " ", levels(shap_prep_rbind[[k]]$variable))
shap_plots[[k]] <- shap.plot.summary(shap_prep_rbind[[k]], scientific = FALSE)
}
shap_prep_rbind_df <- bind_rows(shap_prep_rbind, .id = "model")
number_models <- sum(summary(factor(unique(shap_prep_rbind_df$model))))
var_appearances <- shap_prep_rbind_df %>%
dplyr::select(model,variable) %>%
distinct() %>%
group_by(variable) %>%
summarize(proportion=n()/number_models, n=n()) %>%
ungroup() %>%
arrange(desc(n))
foo <- shap_long_iris %>%
group_by(variable) %>%
summarize(mean = mean(value))
SHAP_feature_correlations <- shap_prep_rbind_df %>%
dplyr::select(model,variable, value, rfvalue) %>%
distinct() %>%
group_by(model, variable) %>%
summarise(r = cor(value, rfvalue)) %>%
ungroup() %>%
group_by(variable) %>%
summarize(mean_r=mean(r))
## `summarise()` has grouped output by 'model'. You can override using the
## `.groups` argument.
var_appearances <- var_appearances %>%
left_join(., SHAP_feature_correlations,
by = "variable")
feature_importance_plot <- var_appearances %>%
filter(n>1) %>%
ggplot(., aes(x= fct_reorder(variable, n, .desc = FALSE))) +
geom_bar(aes(y=proportion, fill=mean_r), stat="identity") +
scale_fill_gradient2(low = "darkblue", mid = "white",
high = "darkred",
midpoint = 0,limits = c(-1,1)) +
theme_bw() +
coord_flip() +
xlab("features") +
ylab("appearances") +
labs(fill = "R") +
scale_y_continuous(labels = scales::percent, limits = c(0,1)) +
theme(legend.position = "top",
axis.text = element_text(colour = "black"))
feature_importance_plot <- addSmallLegend(feature_importance_plot)
feature_importance_plot
#see vignettes in https://liuyanguu.github.io/post/2019/07/18/visualization-of-shap-for-xgboost/
#note that 0=infected (not protected) and 1 = protected
#filter select data by top n
select_data <- across_fold_perform_highdose_rank_by_AverageRank[c(1:top_n),]
#build shap data from results of each fold
shap_prep_fold1 <- shap_prep_fold2 <- shap_prep_fold3 <- shap_prep_fold4 <- shap_prep_rbind <- shap_plots <- c()
for(k in rev(select_data$run)){
shap_prep_fold1[[k]] <- shap.prep(xgb_model = fold1[[k]]$downselected_xgb_results,
X_train = as.matrix(fold1[[k]]$predictor_data[,fold1[[k]]$downselected_xgb_results$feature_names]))
shap_prep_fold2[[k]] <- shap.prep(xgb_model = fold2[[k]]$downselected_xgb_results,
X_train = as.matrix(fold2[[k]]$predictor_data[,fold2[[k]]$downselected_xgb_results$feature_names]))
shap_prep_fold3[[k]] <- shap.prep(xgb_model = fold3[[k]]$downselected_xgb_results,
X_train = as.matrix(fold3[[k]]$predictor_data[,fold3[[k]]$downselected_xgb_results$feature_names]))
shap_prep_fold4[[k]] <- shap.prep(xgb_model = fold4[[k]]$downselected_xgb_results,
X_train = as.matrix(fold4[[k]]$predictor_data[,fold4[[k]]$downselected_xgb_results$feature_names]))
shap_prep_rbind[[k]] <- rbind(shap_prep_fold1[[k]],
shap_prep_fold2[[k]],
shap_prep_fold3[[k]],
shap_prep_fold4[[k]]) %>%
group_by(variable) %>%
mutate(ID = 1:n()) %>%
ungroup()
levels(shap_prep_rbind[[k]]$variable) <- gsub("FACS_|_innate_BTM|_highBTM|_monaco|CytokineObsConc_", "", levels(shap_prep_rbind[[k]]$variable))
levels(shap_prep_rbind[[k]]$variable) <- gsub("PfSPZ", "Pf", levels(shap_prep_rbind[[k]]$variable))
levels(shap_prep_rbind[[k]]$variable) <- gsub("_of_live_leukocytes", "_of_live_PBMCs", levels(shap_prep_rbind[[k]]$variable))
levels(shap_prep_rbind[[k]]$variable) <- gsub("_", " ", levels(shap_prep_rbind[[k]]$variable))
shap_plots[[k]] <- shap.plot.summary(shap_prep_rbind[[k]], scientific = FALSE)
}
shap_plots_top <- ggarrange(plotlist = shap_plots[4:3], labels = names(shap_plots[4:3]),
heights = c(7,6.5),
nrow=2, align = "hv", common.legend = TRUE,
font.label = list(size = 10, face = "plain", color ="black"),
hjust = -0.5)
shap_plots_bottom <- ggarrange(plotlist = shap_plots[2:1], labels = names(shap_plots[2:1]),
heights = c(7,7),
nrow=2, align = "hv", common.legend = TRUE,
font.label = list(size = 10, face = "plain", color ="black"),
hjust = -0.5,
legend = "none")
shap_plots_all <- ggarrange(shap_plots_top, shap_plots_bottom,
heights = c(1.08,1),
ncol=1, align = "hv", common.legend = TRUE)
shap_plots_all
roc_plot_dat <- roc_dat[select_data[1:4,]$run]
roc_plots <- ggroc(roc_plot_dat, legacy.axes = TRUE, size = 1) +
theme_bw() +
scale_color_brewer("", palette = "Set1", type = "qual") +
scale_fill_brewer("", palette = "Set1", type = "qual") +
theme(legend.position = "none") +
geom_text(data = data.labels[1,], aes(1, 0.55, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
geom_text(data = data.labels[2,], aes(1, 0.40, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
geom_text(data = data.labels[3,], aes(1, 0.25, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
geom_text(data = data.labels[4,], aes(1, 0.1, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
theme(plot.margin = unit(c(0.1,3,0.1,3), "cm"))
figure_5C_plot <- ggarrange(ggarrange(NULL, feature_importance_plot, NULL, nrow = 3,
heights = c(0.2,1,0.2)),
shap_plots_all,
ggarrange(roc_plots,
my_highdose_confus_mat_plot_all,
nrow=2,
heights = c(0.6,1)),
widths = c(1.1,1.2,1),
nrow=1)
figure_5C_plot
shap_prep_rbind_df <- bind_rows(shap_prep_rbind, .id = "model")
var_appearances <- shap_prep_rbind_df %>%
group_by(variable) %>%
mutate(n=n()/63) %>%
distinct(variable, n) %>%
arrange(desc(n))
shap_by_feature_dat <-shap_prep_rbind_df %>%
left_join(., select_data %>%
rename(model = "run") %>%
dplyr::select(model, contains("rank")),
by = "model") %>%
left_join(., var_appearances,
by = "variable") %>%
mutate(model = fct_reorder(model, AverageRank)) %>%
mutate(variable = fct_reorder2(variable, mean_value, n))
shap_by_feature <- shap_by_feature_dat %>%
ggplot(., aes(x = value, y = rfvalue)) +
geom_point(fill = "gray", pch = 21, alpha = 0.3) +
theme_bw() +
ylab("feature value") +
xlab("SHAP value (impact on model output)") +
xlim(c(-2,2)) +
geom_smooth(method='lm', formula= y~x, color = "salmon", alpha = 0.30) +
stat_cor(method = "pearson", "cor.coef.name" = "R", size = 2.5, label.x.npc = "left", label.y.npc = "top",
aes(label = paste(..r.label.., cut(..p..,
breaks = c(-Inf, 0.0001, 0.001, 0.01, 0.05, Inf),
labels = c("'****'", "'***'", "'**'", "'*'", "'ns'")),
sep = "~"))) +
ggh4x::facet_grid2(model~variable, scales = "free_x", labeller = label_wrap_gen(), switch = "y") +
theme(strip.text.x.top = element_text(angle = 90, hjust = 0, vjust = 0.5),
strip.clip = "off",
axis.text = element_blank(),
axis.ticks = element_blank())
shap_by_feature
### Run xgb on full data set using the downselected, cross-validated features
#This can be used to create new models de novo for Shapley plots.
source("/Users/tuantran/Library/CloudStorage/OneDrive-IndianaUniversity/Manuscripts/KSPZV1 Manuscript/kspzv1-systems-analysis/kspzv1 xgboost manual train and cv function.R")
xgb_res <- c()
set.seed(12345)
for(k in select_data$run){
my_features <- fold4[[k]]$downselected_xgb_results$feature_names
xgb_res[[k]] <- xgb_train_on_selected_features(train_features = train_features,
features_to_test = my_features,
maxiterations = 2000)
}
## [1] "training set good to go!"
## [1] "Training set reduced to 65 samples and 389 features."
## [1] "Max iterations for hyperparameter tuning: 2000"
## [1] "Running hyperparameter tuning."
## [1] "Begin training..."
## [1] "downselected training set good to go!"
## [1] "Running hyperparameter tuning..."
## [1] "maxiterations for hyperparameter tuning: 2000"
## [Tune] Started tuning learner classif.xgboost for parameter set:
## Type len Def Constr Req Tunable Trafo
## booster discrete - - gbtree - TRUE -
## gamma integer - - 0 to 3 - TRUE -
## max_depth integer - - 2 to 5 - TRUE -
## eta numeric - - 0.01 to 0.2 - TRUE -
## min_child_weight numeric - - 0 to 4 - TRUE -
## subsample numeric - - 0.75 to 0.9 - TRUE -
## lambda numeric - - 0 to 1 - TRUE -
## alpha numeric - - 0 to 1 - TRUE -
## colsample_bytree numeric - - 0.5 to 1 - TRUE -
## With control class: TuneControlRandom
## Imputation value: -0
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 10; elements = 2000.
## [Tune] Result: booster=gbtree; gamma=1; max_depth=5; eta=0.151; min_child_weight=0.0702; subsample=0.864; lambda=0.95; alpha=0.456; colsample_bytree=0.827 : acc.test.mean=0.7727328
## hyperparameter tuning...: 36.606 sec elapsed
## [1] "Training on best hyperparameters."
## [1] train-error:0.046154
## Will train until train_error hasn't improved in 20 rounds.
##
## [11] train-error:0.000000
## [21] train-error:0.000000
## Stopping. Best iteration:
## [5] train-error:0.000000
##
## [1] "The 7 downselected features used in the model are TBA_(M248); myeloid,_dendritic_cell_activation_via_NFkB_(I)_(M43.0); TBA_(M153); cell_cycle_and_growth_arrest_(M31); metabolism_of_steroids_(M225); FACS_PfSPZ-specific_Vd2_of_T_cells; FACS_IgM+_of_memory"
## training on best hyperparameters: 0.009 sec elapsed
## [1] "training set good to go!"
## [1] "Training set reduced to 65 samples and 389 features."
## [1] "Max iterations for hyperparameter tuning: 2000"
## [1] "Running hyperparameter tuning."
## [1] "Begin training..."
## [1] "downselected training set good to go!"
## [1] "Running hyperparameter tuning..."
## [1] "maxiterations for hyperparameter tuning: 2000"
## [Tune] Started tuning learner classif.xgboost for parameter set:
## Type len Def Constr Req Tunable Trafo
## booster discrete - - gbtree - TRUE -
## gamma integer - - 0 to 3 - TRUE -
## max_depth integer - - 2 to 5 - TRUE -
## eta numeric - - 0.01 to 0.2 - TRUE -
## min_child_weight numeric - - 0 to 4 - TRUE -
## subsample numeric - - 0.75 to 0.9 - TRUE -
## lambda numeric - - 0 to 1 - TRUE -
## alpha numeric - - 0 to 1 - TRUE -
## colsample_bytree numeric - - 0.5 to 1 - TRUE -
## With control class: TuneControlRandom
## Imputation value: -0
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 10; elements = 2000.
## [Tune] Result: booster=gbtree; gamma=0; max_depth=3; eta=0.189; min_child_weight=2.42; subsample=0.802; lambda=0.54; alpha=0.313; colsample_bytree=0.725 : acc.test.mean=0.8161765
## hyperparameter tuning...: 24.354 sec elapsed
## [1] "Training on best hyperparameters."
## [1] train-error:0.230769
## Will train until train_error hasn't improved in 20 rounds.
##
## [11] train-error:0.107692
## [21] train-error:0.092308
## Stopping. Best iteration:
## [9] train-error:0.061538
##
## [1] "The 6 downselected features used in the model are TBA_(M243); FACS_PBs_of_memory; TBA_(M153); NK_cell_surface_signature_(S1); CD4_T_cell_surface_signature_Th1-stimulated_(S6); respiratory_electron_transport_chain_(mitochondrion)_(M238)"
## training on best hyperparameters: 0.005 sec elapsed
## [1] "training set good to go!"
## [1] "Training set reduced to 65 samples and 389 features."
## [1] "Max iterations for hyperparameter tuning: 2000"
## [1] "Running hyperparameter tuning."
## [1] "Begin training..."
## [1] "downselected training set good to go!"
## [1] "Running hyperparameter tuning..."
## [1] "maxiterations for hyperparameter tuning: 2000"
## [Tune] Started tuning learner classif.xgboost for parameter set:
## Type len Def Constr Req Tunable Trafo
## booster discrete - - gbtree - TRUE -
## gamma integer - - 0 to 3 - TRUE -
## max_depth integer - - 2 to 5 - TRUE -
## eta numeric - - 0.01 to 0.2 - TRUE -
## min_child_weight numeric - - 0 to 4 - TRUE -
## subsample numeric - - 0.75 to 0.9 - TRUE -
## lambda numeric - - 0 to 1 - TRUE -
## alpha numeric - - 0 to 1 - TRUE -
## colsample_bytree numeric - - 0.5 to 1 - TRUE -
## With control class: TuneControlRandom
## Imputation value: -0
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 10; elements = 2000.
## [Tune] Result: booster=gbtree; gamma=0; max_depth=4; eta=0.0547; min_child_weight=1.88; subsample=0.837; lambda=0.978; alpha=0.873; colsample_bytree=0.662 : acc.test.mean=0.7715686
## hyperparameter tuning...: 22.715 sec elapsed
## [1] "Training on best hyperparameters."
## [1] train-error:0.246154
## Will train until train_error hasn't improved in 20 rounds.
##
## [11] train-error:0.076923
## [21] train-error:0.061538
## [31] train-error:0.046154
## [41] train-error:0.076923
## [51] train-error:0.061538
## Stopping. Best iteration:
## [37] train-error:0.030769
##
## [1] "The 7 downselected features used in the model are NK_cell_surface_signature_(S1); platelet_activation_&_blood_coagulation_(M199); cell_cycle,_ATP_binding_(M144); TBA_(M153); TBA_(M141); metabolism_of_steroids_(M225); FACS_PBs_of_memory"
## training on best hyperparameters: 0.007 sec elapsed
## [1] "training set good to go!"
## [1] "Training set reduced to 65 samples and 389 features."
## [1] "Max iterations for hyperparameter tuning: 2000"
## [1] "Running hyperparameter tuning."
## [1] "Begin training..."
## [1] "downselected training set good to go!"
## [1] "Running hyperparameter tuning..."
## [1] "maxiterations for hyperparameter tuning: 2000"
## [Tune] Started tuning learner classif.xgboost for parameter set:
## Type len Def Constr Req Tunable Trafo
## booster discrete - - gbtree - TRUE -
## gamma integer - - 0 to 3 - TRUE -
## max_depth integer - - 2 to 5 - TRUE -
## eta numeric - - 0.01 to 0.2 - TRUE -
## min_child_weight numeric - - 0 to 4 - TRUE -
## subsample numeric - - 0.75 to 0.9 - TRUE -
## lambda numeric - - 0 to 1 - TRUE -
## alpha numeric - - 0 to 1 - TRUE -
## colsample_bytree numeric - - 0.5 to 1 - TRUE -
## With control class: TuneControlRandom
## Imputation value: -0
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 10; elements = 2000.
## [Tune] Result: booster=gbtree; gamma=0; max_depth=5; eta=0.194; min_child_weight=1.97; subsample=0.845; lambda=0.442; alpha=0.166; colsample_bytree=0.546 : acc.test.mean=0.7996324
## hyperparameter tuning...: 22.059 sec elapsed
## [1] "Training on best hyperparameters."
## [1] train-error:0.276923
## Will train until train_error hasn't improved in 20 rounds.
##
## [11] train-error:0.092308
## [21] train-error:0.107692
## Stopping. Best iteration:
## [8] train-error:0.092308
##
## [1] "The 7 downselected features used in the model are TBA_(M243); TBA_(M180); FACS_CD3+CD4+_of_live_PBMCs; metabolism_of_steroids_(M225); FACS_CD3+_of_live_PBMCs; cytoskeletal_remodeling_(enriched_for_SRF_targets)_(M34); TBA_(M153)"
## training on best hyperparameters: 0.006 sec elapsed
#see vignettes in https://liuyanguu.github.io/post/2019/07/18/visualization-of-shap-for-xgboost/
#note that 0=infected (not protected) and 1 = protected
shap_plot <- c()
for(k in rev(select_data$run)){
my_features <- xgb_res[[k]]$xgb_results$feature_names
shap_plot[[k]] <- shap.plot.summary.wrap1(xgb_res[[k]]$xgb_results,
X = as.matrix(train_features[, my_features]))
}
shap_plots_top <- ggarrange(plotlist = shap_plot[4:3], labels = names(shap_plot[4:3]),
heights = c(6.4,6),
nrow=2, align = "hv", common.legend = TRUE,
font.label = list(size = 10, face = "plain", color ="black"),
hjust = -0.5)
shap_plots_bottom <- ggarrange(plotlist = shap_plot[2:1], labels = names(shap_plot[2:1]),
heights = c(1,1),
nrow=2, align = "hv", common.legend = TRUE,
font.label = list(size = 10, face = "plain", color ="black"),
hjust = -0.5,
legend = "none")
shap_plots <- ggarrange(shap_plots_top, shap_plots_bottom,
heights = c(1,1),
ncol=1, align = "hv", common.legend = TRUE)
shap_plots
library(ggpubr)
#load the postvax highdose data set
#local filename: "highdose_PfSPZ_postvax_correlation_ML_data_with_missing.rds"
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
as_id("1PXWHbJLUZqqzuHBGA8vidm05iG6pxpoL"), path = temp, overwrite = TRUE)
baseline_placebo_features <- readRDS(file = dl$local_path) %>%
rownames_to_column(var = "PATID") %>%
dplyr::select(PATID, contains("CD3"), contains("IgM"))
#local filename: "KSPZV1 PhenoData TTE 6 months Long Format.rds"
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
as_id("1v2l7G7AzXhgcqObzAabYy6_QJsdB5hFn"), path = temp, overwrite = TRUE)
pheno_data <- readRDS(file = dl$local_path) %>%
filter(Dosegroup == "1.8 x 10^6 PfSPZ") %>%
dplyr::select(PATID, tte.mal.atp.3, Outcome, Dosegroup) %>%
distinct(PATID, tte.mal.atp.3, Outcome, Dosegroup)
facs_dat <- pheno_data %>%
left_join(., baseline_placebo_features, by = "PATID") %>%
drop_na("FACS_CD3+CD4+_of_live_PBMCs")
facs_dat_n <- facs_dat %>%
group_by(Outcome) %>%
mutate(n=n()) %>%
ungroup() %>%
mutate(outcome = ifelse(Outcome == "infected", paste0("NP (", n, ")"),
paste0("P (", n, ")")))
CD4_plot <- facs_dat_n %>%
ggplot(., aes(x = outcome, y = `FACS_CD3+CD4+_of_live_PBMCs`, fill = outcome)) +
geom_boxplot() +
ggbeeswarm::geom_quasirandom(size = 2, width = 0.2) +
ylab("CD3+CD4+ (% of live PBMCs)") +
xlab("outcome (n)") +
scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
stat_compare_means(method = "wilcox", vjust = 0.1, hjust = 0.5) +
theme_bw(base_family = "sans") +
theme(legend.position = "none",
axis.text = element_text(size = 12, colour = "black"),
axis.title = element_text(size = 13))
facs_dat <- pheno_data %>%
left_join(., baseline_placebo_features, by = "PATID") %>%
drop_na("FACS_IgM+_of_memory")
facs_dat_n <- facs_dat %>%
group_by(Outcome) %>%
mutate(n=n()) %>%
ungroup() %>%
mutate(outcome = ifelse(Outcome == "infected", paste0("NP (", n, ")"),
paste0("P (", n, ")")))
IgM_MBC_plot <- facs_dat_n %>%
ggplot(., aes(x = outcome, y = `FACS_IgM+_of_memory`, fill = outcome)) +
geom_boxplot() +
ggbeeswarm::geom_quasirandom(size = 2, width = 0.2) +
ylab("IgM+ (% of memory B cells)") +
xlab("outcome (n)") +
scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
stat_compare_means(method = "wilcox", vjust = 0.1, hjust = 0.5) +
theme_bw(base_family = "sans") +
theme(legend.position = "none",
axis.text = element_text(size = 12, colour = "black"),
axis.title = element_text(size = 13))
CD4_IgM_plot <- ggarrange(CD4_plot, IgM_MBC_plot, ncol = 1)
CD4_IgM_plot