Approach

Plot ROC curves, confusion matrices, and SHAP plots from top n models using xgboost results derived from “KPSZV1 Postvax Highdose PfSPZ Multimodal ML xgb nested fs cv on full set JCI Revision.Rmd”.

library(xgboost)
library(dplyr)
library(tidyverse)
library(pROC)
library(caret)
library(doMC)
library(data.table)
library(xgboost)
library(SHAPforxgboost)
library(mlr)
library(ggpubr)
library(tictoc)
library(Biobase)
library(googledrive)
library(parallel)
library(parallelMap)
parallelStartSocket(cpus = detectCores())

Load feature data

#from google drive
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1QGtZ_VpOFgS7fGDeOwCwYg-hBzENwhk6"), path = temp, overwrite = TRUE)
high_dose_features_class <- readRDS(file = dl$local_path)
train_features <- high_dose_features_class %>%
  dplyr::select(-c(sex, site))

Load ML results

This is a large file.

#from google drive
temp <- tempfile(fileext = ".Rdata")
dl <- drive_download(
  as_id("1Rb9kQ5ZjtRJsZ9hX-n0xSUcuPlqrlTMf"), path = temp, overwrite = TRUE)
all_folds_2500_runs <- load(file = dl$local_path)

temp <- tempfile(fileext = ".xlsx")
dl <- drive_download(
  as_id("1ScvZjV0WWXc7H0tPCJoN61FyuM-bUExQ"), path = temp, overwrite = TRUE)
per_run_res_2500_runs <- readxl::read_xlsx(dl$local_path)

temp <- tempfile(fileext = ".xlsx")
dl <- drive_download(
  as_id("1SmyKUy6STA5BEvDUF7KJFE3g_E-ibY7K"), path = temp, overwrite = TRUE)
summarized_res_2500_runs <- readxl::read_xlsx(dl$local_path)

Determine across-fold performance metrics (accuracy, AUC, Brier score, kappa, logless) select best performing high-dose PfSPZ post-vax models

smooth <- FALSE
runs_to_test <- summarized_res_2500_runs[summarized_res_2500_runs$mean_cv_accuracy>.66,]$run #select only the high-dose PfSPZ models with accuracy >66%.
#runs_to_test <- c("run 211", "run 2222", "run 671")

best_run <- best_combo <- all_folds_response <- all_folds_predictor <- roc_dat <- auc <- confusion_mat <- response_dichotomous <- c()
across_fold_performance <- logloss_list <- brier_list <- c()
for(j in runs_to_test){
  best_run[[j]] <- per_run_res_2500_runs[per_run_res_2500_runs$run==j,] %>%
    group_by(run) %>%
    slice_max(AUC) %>%
    distinct(Features) %>%
    pull(var = run)
  best_combo[[j]] <- per_run_res_2500_runs[per_run_res_2500_runs$run==j,] %>%
    group_by(run) %>%
    slice_max(AUC) %>%
    distinct(Features) %>%
    pull(var = Features)
  #calculate predictive accuracy across all 4 folds
  all_folds_response[[j]] <- c(fold1[[best_run[[j]]]]$predictor_data$class,
                               fold2[[best_run[[j]]]]$predictor_data$class,
                               fold3[[best_run[[j]]]]$predictor_data$class,
                               fold4[[best_run[[j]]]]$predictor_data$class)
  all_folds_predictor[[j]] <- c(fold1[[best_run[[j]]]]$predictor_data$pred.prob,
                            fold2[[best_run[[j]]]]$predictor_data$pred.prob,
                            fold3[[best_run[[j]]]]$predictor_data$pred.prob,
                            fold4[[best_run[[j]]]]$predictor_data$pred.prob)
  roc_dat[[j]] <- pROC::roc(response = all_folds_response[[j]],
                            predictor = all_folds_predictor[[j]],
                            smooth = smooth) 
  auc[[j]] <- roc_dat[[j]]$auc %>% as.numeric(.) %>% signif(.,digits=3)
  response_dichotomous[[j]]  <- ifelse(all_folds_predictor[[j]]  > 0.5,1,0)
  #remember that since we are using the high-dose model, we are going with the original high-dose designation where infected=0, protected=1
  #but we are actually predicting the opposite for placebo, where 1 means infected(not protected), and infected=0
  response_dichotomous[[j]]   <- factor(ifelse(response_dichotomous[[j]]==1, "protected", "infected")) 
  confusion_mat[[j]] <- caret::confusionMatrix(response_dichotomous[[j]], all_folds_response[[j]], positive = "protected")
  logloss_list[[j]] <- MLmetrics::LogLoss(all_folds_predictor[[j]], as.numeric(all_folds_response[[j]])-1)
  brier_list[[j]] <- mlr::measureBrier(all_folds_predictor[[j]], all_folds_response[[j]], "infected", "protected")
  across_fold_performance[[j]] <- confusion_mat[[j]]$overall
} 

across_fold_performance_df <- bind_rows(across_fold_performance, .id = "run") %>%
  arrange(desc(Kappa))  %>%
  mutate(Kappa_rank = 1:nrow(.)) %>%
  arrange(desc(Accuracy)) %>%
  mutate(Accuracy_rank = 1:nrow(.))
auc_df_high_dose <- bind_rows(auc, .id = "run") %>%
  pivot_longer(., cols = everything(), names_to = "run", values_to = "AUC") %>%
  arrange(desc(AUC))  %>%
  mutate(AUC_rank = 1:nrow(.))
logloss_df <- bind_rows(logloss_list, .id = "run") %>%
  pivot_longer(., cols = everything(), names_to = "run", values_to = "LogLoss") %>%
  arrange(LogLoss)  %>%
  mutate(LogLoss_rank = 1:nrow(.))
brier_df <- bind_rows(brier_list, .id = "run") %>%
  pivot_longer(., cols = everything(), names_to = "run", values_to = "BrierScore") %>%
  mutate(BrierScore = signif(BrierScore, 3)) %>%
  arrange(BrierScore) %>%
  mutate(BrierScore_rank = 1:nrow(.))

across_fold_perform_high_dose <- auc_df_high_dose %>%
  left_join(., across_fold_performance_df, by = "run") %>%
  left_join(., summarized_res_2500_runs,
            by = "run") %>%
  left_join(., logloss_df,
            by = "run") %>%
  left_join(., brier_df,
            by = "run") %>%
  mutate(AverageRank = (Accuracy_rank + AUC_rank + BrierScore_rank + Kappa_rank + LogLoss_rank)/5) %>%
  dplyr::select(run, Features, AverageRank, BrierScore_rank, Kappa_rank, LogLoss_rank, AUC_rank, Accuracy_rank, AUC, BrierScore, LogLoss, Kappa, Accuracy:McnemarPValue, mean_cv_AUC:sd_cv_accuracy) %>%
  group_by(run) %>%
  slice_head(n=1) %>%
  ungroup() %>%
  arrange(AverageRank)

#clean up feature names
  # across_fold_perform_high_dose <- across_fold_perform_high_dose %>%
  #   mutate(Features =  gsub("CSP_specific_IgG_", "CSP-specific IgG_", Features)) %>% 
  #   mutate(Features =  gsub("FACS_|_innate_BTM|_highBTM|_monaco|CytokineObsConc_", "", Features)) %>%   
  #   mutate(Features =  gsub("PfSPZ", "Pf", Features)) %>% 
  #   mutate(Features =  gsub("_of_live_leukocytes", "_of_live_PBMCs", Features)) %>% 
  #   mutate(Features =  gsub("_", " ", Features))

Arrange data and plot AUROC curves

top_n <- 4
across_fold_perform_highdose_rank_by_AverageRank <- across_fold_perform_high_dose %>%
  arrange(AverageRank) %>%
  drop_na()
select_data <- across_fold_perform_highdose_rank_by_AverageRank[c(1:top_n),]
data.labels <- data.frame("name" = select_data[1:4,]$run,
                          "features" = select_data[1:4,]$Features,
                          "brier" = select_data[1:4,]$BrierScore,
                          "auc" = select_data[1:4,]$AUC,
                          "kappa" = select_data[1:4,]$Kappa,
                          "logloss" = select_data[1:4,]$LogLoss)
roc_plot_dat <- roc_dat[select_data[1:4,]$run] 
roc_plots <- ggroc(roc_plot_dat, legacy.axes = TRUE, size = 1.5) +
  theme_bw() +
  scale_color_brewer("", palette = "Set1", type = "qual") +
  scale_fill_brewer("", palette = "Set1", type = "qual") +
  theme(legend.position = "none") +
  geom_text(data = data.labels[1,], aes(0.875, 0.25, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
  geom_text(data = data.labels[2,], aes(0.875, 0.2, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
  geom_text(data = data.labels[3,], aes(0.875, 0.15, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
  geom_text(data = data.labels[4,], aes(0.875, 0.1, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
  #geom_text(data = data.labels[5,], aes(0.875, 0.05, label = paste("AUC =", auc)), hjust=1, show_guide=FALSE) +
  theme(plot.margin = unit(c(1,1,1,1), "cm")) +
  geom_abline(slope = 1, linetype = "dotted", color = "gray")

roc_plots

Plot confusion matrices for high-dose postvax models

library(ggplot2)
library(scales)

ggplotConfusionMatrix <- function(m){
  mytitle <- paste(
    "accuracy", percent_format()(m$overall[1]),
                   "kappa", percent_format()(m$overall[2]))
  mydata <- as.data.frame(m$table) %>%
    mutate(Prediction = ifelse(Prediction == "infected", "NP", "P")) %>%
    mutate(Reference = ifelse(Reference == "infected", "NP", "P"))
  p <- ggplot(data = mydata,
           aes(x = Reference, y = Prediction)) +
    geom_tile(aes(fill = log(Freq)), colour = "white") +
    scale_fill_gradient(low = "white", high = "steelblue") +
    geom_text(aes(x = Reference, y = Prediction, label = Freq, size = 24)) +
    ggtitle(mytitle) +
    ylab("predicted") +
    xlab("actual outcome") +
    theme(legend.position = "none",
          panel.background = element_blank(),
          plot.background = element_blank(),
          axis.ticks = element_blank(),
          axis.title = element_text(color = "black", size = 10),
          axis.text = element_text(color = "black", size = 10),
          title = element_text(size = 9),) +
    theme(plot.margin = unit(c(0.5,0.45,0.01,0.4), "cm"))
  return(p)
}

#my_models <- c("run 1727", "run 671", "run 808", "run 439") #ranked by Brier
my_models <- select_data[1:4,]$run #ranked by average rank

my_plots <- list(ggplotConfusionMatrix(confusion_mat[[my_models[1]]]),
              ggplotConfusionMatrix(confusion_mat[[my_models[2]]]),
              ggplotConfusionMatrix(confusion_mat[[my_models[3]]]),
              ggplotConfusionMatrix(confusion_mat[[my_models[4]]]))
names(my_plots) <- my_models

my_highdose_confus_mat_plot_top <- ggarrange(plotlist = my_plots[1:2],
                                         labels = names(my_plots[1:2]),
                                         font.label = list(size=14, face="plain"),
                                         ncol = 2,
                                         hjust = -0.1)
my_highdose_confus_mat_plot_bottom <- ggarrange(plotlist = my_plots[3:4],
                                         labels = names(my_plots[3:4]),
                                         font.label = list(size=14, face="plain"),
                                         ncol = 2,
                                         hjust = -0.1)
my_highdose_confus_mat_plot_all <- ggarrange(my_highdose_confus_mat_plot_top,
                                             my_highdose_confus_mat_plot_bottom,
                                             nrow=2)

my_highdose_confus_mat_plot_all

Feature importance by number of appearances in top %1 models

addSmallLegend <- function(myPlot, pointSize = 0.75, textSize = 6, spaceLegend = 0.75) {
    myPlot +
        guides(shape = guide_legend(override.aes = list(size = pointSize)),
               color = guide_legend(override.aes = list(size = pointSize))) +
        theme(legend.title = element_text(size = textSize), 
              legend.text  = element_text(size = textSize),
              legend.key.size = unit(spaceLegend, "lines"))
}

#filter select data by performance metric
select_data <- across_fold_perform_highdose_rank_by_AverageRank %>%
  arrange(AverageRank) %>%
  slice_head(n=25)

#build shap data from results of each fold
shap_prep_fold1 <- shap_prep_fold2 <- shap_prep_fold3 <- shap_prep_fold4 <- shap_prep_rbind <- shap_plots <- c()
for(k in rev(select_data$run)){
  shap_prep_fold1[[k]] <- shap.prep(xgb_model = fold1[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold1[[k]]$predictor_data[,fold1[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold2[[k]] <- shap.prep(xgb_model = fold2[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold2[[k]]$predictor_data[,fold2[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold3[[k]] <- shap.prep(xgb_model = fold3[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold3[[k]]$predictor_data[,fold3[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold4[[k]] <- shap.prep(xgb_model = fold4[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold4[[k]]$predictor_data[,fold4[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_rbind[[k]] <- rbind(shap_prep_fold1[[k]],
                           shap_prep_fold2[[k]],
                           shap_prep_fold3[[k]],
                           shap_prep_fold4[[k]]) %>%
    group_by(variable) %>%
    mutate(ID = 1:n()) %>%
    ungroup()
  levels(shap_prep_rbind[[k]]$variable) <- gsub("FACS_|_innate_BTM|_highBTM|_monaco|CytokineObsConc_", "", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("PfSPZ", "Pf", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("_of_live_leukocytes", "_of_live_PBMCs", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("_", " ", levels(shap_prep_rbind[[k]]$variable))  
  shap_plots[[k]] <- shap.plot.summary(shap_prep_rbind[[k]], scientific = FALSE)
}
shap_prep_rbind_df <- bind_rows(shap_prep_rbind, .id = "model")

number_models <- sum(summary(factor(unique(shap_prep_rbind_df$model))))

var_appearances <- shap_prep_rbind_df %>%
  dplyr::select(model,variable) %>%
  distinct() %>%
  group_by(variable) %>%
  summarize(proportion=n()/number_models, n=n()) %>%
  ungroup() %>%
  arrange(desc(n))

foo <- shap_long_iris %>%
  group_by(variable) %>%
  summarize(mean = mean(value))


SHAP_feature_correlations <- shap_prep_rbind_df %>%
  dplyr::select(model,variable, value, rfvalue) %>%
  distinct() %>%
  group_by(model, variable) %>% 
  summarise(r = cor(value, rfvalue)) %>%
  ungroup() %>%
  group_by(variable) %>%
  summarize(mean_r=mean(r))

## `summarise()` has grouped output by 'model'. You can override using the
## `.groups` argument.

var_appearances <- var_appearances %>%
  left_join(., SHAP_feature_correlations,
            by = "variable")

feature_importance_plot <- var_appearances %>%
  filter(n>1) %>%
  ggplot(., aes(x= fct_reorder(variable, n, .desc = FALSE))) +
  geom_bar(aes(y=proportion, fill=mean_r), stat="identity") +
  scale_fill_gradient2(low = "darkblue", mid = "white",
                       high = "darkred",
                       midpoint = 0,limits = c(-1,1)) +
  theme_bw() +
  coord_flip() +
  xlab("features") +
  ylab("appearances") + 
  labs(fill = "R") +
  scale_y_continuous(labels = scales::percent, limits = c(0,1)) +
  theme(legend.position = "top",
        axis.text = element_text(colour = "black"))

feature_importance_plot <- addSmallLegend(feature_importance_plot)
feature_importance_plot

#see vignettes in https://liuyanguu.github.io/post/2019/07/18/visualization-of-shap-for-xgboost/
#note that 0=infected (not protected) and 1 = protected

#filter select data by top n
select_data <- across_fold_perform_highdose_rank_by_AverageRank[c(1:top_n),]

#build shap data from results of each fold
shap_prep_fold1 <- shap_prep_fold2 <- shap_prep_fold3 <- shap_prep_fold4 <- shap_prep_rbind <- shap_plots <- c()
for(k in rev(select_data$run)){
  shap_prep_fold1[[k]] <- shap.prep(xgb_model = fold1[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold1[[k]]$predictor_data[,fold1[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold2[[k]] <- shap.prep(xgb_model = fold2[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold2[[k]]$predictor_data[,fold2[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold3[[k]] <- shap.prep(xgb_model = fold3[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold3[[k]]$predictor_data[,fold3[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold4[[k]] <- shap.prep(xgb_model = fold4[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold4[[k]]$predictor_data[,fold4[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_rbind[[k]] <- rbind(shap_prep_fold1[[k]],
                           shap_prep_fold2[[k]],
                           shap_prep_fold3[[k]],
                           shap_prep_fold4[[k]]) %>%
    group_by(variable) %>%
    mutate(ID = 1:n()) %>%
    ungroup()
  levels(shap_prep_rbind[[k]]$variable) <- gsub("FACS_|_innate_BTM|_highBTM|_monaco|CytokineObsConc_", "", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("PfSPZ", "Pf", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("_of_live_leukocytes", "_of_live_PBMCs", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("_", " ", levels(shap_prep_rbind[[k]]$variable))  
  shap_plots[[k]] <- shap.plot.summary(shap_prep_rbind[[k]], scientific = FALSE)
}

shap_plots_top <- ggarrange(plotlist = shap_plots[4:3], labels = names(shap_plots[4:3]),
                        heights = c(7,6.5),
                        nrow=2, align = "hv", common.legend = TRUE,
                        font.label = list(size = 10, face = "plain", color ="black"),
                        hjust = -0.5)

shap_plots_bottom <- ggarrange(plotlist = shap_plots[2:1], labels = names(shap_plots[2:1]),
                        heights = c(7,7),
                        nrow=2, align = "hv", common.legend = TRUE,
                        font.label = list(size = 10, face = "plain", color ="black"),
                        hjust = -0.5,
                        legend = "none")

shap_plots_all <- ggarrange(shap_plots_top, shap_plots_bottom,
                        heights = c(1.08,1),
                        ncol=1, align = "hv", common.legend = TRUE)

shap_plots_all

roc_plot_dat <- roc_dat[select_data[1:4,]$run] 
roc_plots <- ggroc(roc_plot_dat, legacy.axes = TRUE, size = 1) +
  theme_bw() +
  scale_color_brewer("", palette = "Set1", type = "qual") +
  scale_fill_brewer("", palette = "Set1", type = "qual") +
  theme(legend.position = "none") +
  geom_text(data = data.labels[1,], aes(1, 0.55, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
  geom_text(data = data.labels[2,], aes(1, 0.40, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
  geom_text(data = data.labels[3,], aes(1, 0.25, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
  geom_text(data = data.labels[4,], aes(1, 0.1, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
  theme(plot.margin = unit(c(0.1,3,0.1,3), "cm"))

figure_5C_plot <- ggarrange(ggarrange(NULL, feature_importance_plot, NULL, nrow = 3,
                    heights = c(0.2,1,0.2)),
          shap_plots_all,
          ggarrange(roc_plots,
                    my_highdose_confus_mat_plot_all,
                    nrow=2,
                    heights = c(0.6,1)),
          widths = c(1.1,1.2,1),
          nrow=1)
figure_5C_plot

Examine top n models

shap_prep_rbind_df <- bind_rows(shap_prep_rbind, .id = "model")

var_appearances <- shap_prep_rbind_df %>%
  group_by(variable) %>%
  mutate(n=n()/63) %>%
  distinct(variable, n) %>%
  arrange(desc(n))

shap_by_feature_dat <-shap_prep_rbind_df %>%
  left_join(., select_data %>%
              rename(model = "run") %>%
              dplyr::select(model, contains("rank")),
            by = "model") %>%
  left_join(., var_appearances,
            by = "variable") %>%
  mutate(model = fct_reorder(model, AverageRank)) %>%
  mutate(variable = fct_reorder2(variable, mean_value, n))

shap_by_feature <- shap_by_feature_dat %>%
  ggplot(., aes(x = value, y = rfvalue)) +
  geom_point(fill = "gray", pch = 21, alpha = 0.3) +
  theme_bw() + 
  ylab("feature value") +
  xlab("SHAP value (impact on model output)") +
  xlim(c(-2,2)) +
  geom_smooth(method='lm', formula= y~x, color = "salmon", alpha = 0.30) +
  stat_cor(method = "pearson", "cor.coef.name" = "R", size = 2.5, label.x.npc = "left", label.y.npc = "top",
           aes(label = paste(..r.label.., cut(..p.., 
                                          breaks = c(-Inf, 0.0001, 0.001, 0.01, 0.05, Inf),
                                          labels = c("'****'", "'***'", "'**'", "'*'", "'ns'")),
                         sep = "~"))) +
  ggh4x::facet_grid2(model~variable, scales = "free_x", labeller = label_wrap_gen(), switch = "y") +
  theme(strip.text.x.top = element_text(angle = 90, hjust = 0, vjust = 0.5),
        strip.clip = "off",
        axis.text = element_blank(),
        axis.ticks = element_blank())
  
shap_by_feature

### Run xgb on full data set using the downselected, cross-validated features
#This can be used to create new models de novo for Shapley plots.
source("/Users/tuantran/Library/CloudStorage/OneDrive-IndianaUniversity/Manuscripts/KSPZV1 Manuscript/kspzv1-systems-analysis/kspzv1 xgboost manual train and cv function.R")
xgb_res <- c()
set.seed(12345)
for(k in select_data$run){
  my_features <- fold4[[k]]$downselected_xgb_results$feature_names
  xgb_res[[k]] <- xgb_train_on_selected_features(train_features = train_features,
                                                 features_to_test = my_features,
                                                 maxiterations = 2000)
}

## [1] "training set good to go!"
## [1] "Training set reduced to 65 samples and 389 features."
## [1] "Max iterations for hyperparameter tuning: 2000"
## [1] "Running hyperparameter tuning."
## [1] "Begin training..."
## [1] "downselected training set good to go!"
## [1] "Running hyperparameter tuning..."
## [1] "maxiterations for hyperparameter tuning: 2000"

## [Tune] Started tuning learner classif.xgboost for parameter set:

##                      Type len Def      Constr Req Tunable Trafo
## booster          discrete   -   -      gbtree   -    TRUE     -
## gamma             integer   -   -      0 to 3   -    TRUE     -
## max_depth         integer   -   -      2 to 5   -    TRUE     -
## eta               numeric   -   - 0.01 to 0.2   -    TRUE     -
## min_child_weight  numeric   -   -      0 to 4   -    TRUE     -
## subsample         numeric   -   - 0.75 to 0.9   -    TRUE     -
## lambda            numeric   -   -      0 to 1   -    TRUE     -
## alpha             numeric   -   -      0 to 1   -    TRUE     -
## colsample_bytree  numeric   -   -    0.5 to 1   -    TRUE     -

## With control class: TuneControlRandom

## Imputation value: -0

## Exporting objects to slaves for mode socket: .mlr.slave.options

## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 10; elements = 2000.

## [Tune] Result: booster=gbtree; gamma=1; max_depth=5; eta=0.151; min_child_weight=0.0702; subsample=0.864; lambda=0.95; alpha=0.456; colsample_bytree=0.827 : acc.test.mean=0.7727328

## hyperparameter tuning...: 36.606 sec elapsed
## [1] "Training on best hyperparameters."
## [1]  train-error:0.046154 
## Will train until train_error hasn't improved in 20 rounds.
## 
## [11] train-error:0.000000 
## [21] train-error:0.000000 
## Stopping. Best iteration:
## [5]  train-error:0.000000
## 
## [1] "The 7 downselected features used in the model are TBA_(M248); myeloid,_dendritic_cell_activation_via_NFkB_(I)_(M43.0); TBA_(M153); cell_cycle_and_growth_arrest_(M31); metabolism_of_steroids_(M225); FACS_PfSPZ-specific_Vd2_of_T_cells; FACS_IgM+_of_memory"
## training on best hyperparameters: 0.009 sec elapsed
## [1] "training set good to go!"
## [1] "Training set reduced to 65 samples and 389 features."
## [1] "Max iterations for hyperparameter tuning: 2000"
## [1] "Running hyperparameter tuning."
## [1] "Begin training..."
## [1] "downselected training set good to go!"
## [1] "Running hyperparameter tuning..."
## [1] "maxiterations for hyperparameter tuning: 2000"

## [Tune] Started tuning learner classif.xgboost for parameter set:

##                      Type len Def      Constr Req Tunable Trafo
## booster          discrete   -   -      gbtree   -    TRUE     -
## gamma             integer   -   -      0 to 3   -    TRUE     -
## max_depth         integer   -   -      2 to 5   -    TRUE     -
## eta               numeric   -   - 0.01 to 0.2   -    TRUE     -
## min_child_weight  numeric   -   -      0 to 4   -    TRUE     -
## subsample         numeric   -   - 0.75 to 0.9   -    TRUE     -
## lambda            numeric   -   -      0 to 1   -    TRUE     -
## alpha             numeric   -   -      0 to 1   -    TRUE     -
## colsample_bytree  numeric   -   -    0.5 to 1   -    TRUE     -

## With control class: TuneControlRandom

## Imputation value: -0

## Exporting objects to slaves for mode socket: .mlr.slave.options

## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 10; elements = 2000.

## [Tune] Result: booster=gbtree; gamma=0; max_depth=3; eta=0.189; min_child_weight=2.42; subsample=0.802; lambda=0.54; alpha=0.313; colsample_bytree=0.725 : acc.test.mean=0.8161765

## hyperparameter tuning...: 24.354 sec elapsed
## [1] "Training on best hyperparameters."
## [1]  train-error:0.230769 
## Will train until train_error hasn't improved in 20 rounds.
## 
## [11] train-error:0.107692 
## [21] train-error:0.092308 
## Stopping. Best iteration:
## [9]  train-error:0.061538
## 
## [1] "The 6 downselected features used in the model are TBA_(M243); FACS_PBs_of_memory; TBA_(M153); NK_cell_surface_signature_(S1); CD4_T_cell_surface_signature_Th1-stimulated_(S6); respiratory_electron_transport_chain_(mitochondrion)_(M238)"
## training on best hyperparameters: 0.005 sec elapsed
## [1] "training set good to go!"
## [1] "Training set reduced to 65 samples and 389 features."
## [1] "Max iterations for hyperparameter tuning: 2000"
## [1] "Running hyperparameter tuning."
## [1] "Begin training..."
## [1] "downselected training set good to go!"
## [1] "Running hyperparameter tuning..."
## [1] "maxiterations for hyperparameter tuning: 2000"

## [Tune] Started tuning learner classif.xgboost for parameter set:

##                      Type len Def      Constr Req Tunable Trafo
## booster          discrete   -   -      gbtree   -    TRUE     -
## gamma             integer   -   -      0 to 3   -    TRUE     -
## max_depth         integer   -   -      2 to 5   -    TRUE     -
## eta               numeric   -   - 0.01 to 0.2   -    TRUE     -
## min_child_weight  numeric   -   -      0 to 4   -    TRUE     -
## subsample         numeric   -   - 0.75 to 0.9   -    TRUE     -
## lambda            numeric   -   -      0 to 1   -    TRUE     -
## alpha             numeric   -   -      0 to 1   -    TRUE     -
## colsample_bytree  numeric   -   -    0.5 to 1   -    TRUE     -

## With control class: TuneControlRandom

## Imputation value: -0

## Exporting objects to slaves for mode socket: .mlr.slave.options

## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 10; elements = 2000.

## [Tune] Result: booster=gbtree; gamma=0; max_depth=4; eta=0.0547; min_child_weight=1.88; subsample=0.837; lambda=0.978; alpha=0.873; colsample_bytree=0.662 : acc.test.mean=0.7715686

## hyperparameter tuning...: 22.715 sec elapsed
## [1] "Training on best hyperparameters."
## [1]  train-error:0.246154 
## Will train until train_error hasn't improved in 20 rounds.
## 
## [11] train-error:0.076923 
## [21] train-error:0.061538 
## [31] train-error:0.046154 
## [41] train-error:0.076923 
## [51] train-error:0.061538 
## Stopping. Best iteration:
## [37] train-error:0.030769
## 
## [1] "The 7 downselected features used in the model are NK_cell_surface_signature_(S1); platelet_activation_&_blood_coagulation_(M199); cell_cycle,_ATP_binding_(M144); TBA_(M153); TBA_(M141); metabolism_of_steroids_(M225); FACS_PBs_of_memory"
## training on best hyperparameters: 0.007 sec elapsed
## [1] "training set good to go!"
## [1] "Training set reduced to 65 samples and 389 features."
## [1] "Max iterations for hyperparameter tuning: 2000"
## [1] "Running hyperparameter tuning."
## [1] "Begin training..."
## [1] "downselected training set good to go!"
## [1] "Running hyperparameter tuning..."
## [1] "maxiterations for hyperparameter tuning: 2000"

## [Tune] Started tuning learner classif.xgboost for parameter set:

##                      Type len Def      Constr Req Tunable Trafo
## booster          discrete   -   -      gbtree   -    TRUE     -
## gamma             integer   -   -      0 to 3   -    TRUE     -
## max_depth         integer   -   -      2 to 5   -    TRUE     -
## eta               numeric   -   - 0.01 to 0.2   -    TRUE     -
## min_child_weight  numeric   -   -      0 to 4   -    TRUE     -
## subsample         numeric   -   - 0.75 to 0.9   -    TRUE     -
## lambda            numeric   -   -      0 to 1   -    TRUE     -
## alpha             numeric   -   -      0 to 1   -    TRUE     -
## colsample_bytree  numeric   -   -    0.5 to 1   -    TRUE     -

## With control class: TuneControlRandom

## Imputation value: -0

## Exporting objects to slaves for mode socket: .mlr.slave.options

## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 10; elements = 2000.

## [Tune] Result: booster=gbtree; gamma=0; max_depth=5; eta=0.194; min_child_weight=1.97; subsample=0.845; lambda=0.442; alpha=0.166; colsample_bytree=0.546 : acc.test.mean=0.7996324

## hyperparameter tuning...: 22.059 sec elapsed
## [1] "Training on best hyperparameters."
## [1]  train-error:0.276923 
## Will train until train_error hasn't improved in 20 rounds.
## 
## [11] train-error:0.092308 
## [21] train-error:0.107692 
## Stopping. Best iteration:
## [8]  train-error:0.092308
## 
## [1] "The 7 downselected features used in the model are TBA_(M243); TBA_(M180); FACS_CD3+CD4+_of_live_PBMCs; metabolism_of_steroids_(M225); FACS_CD3+_of_live_PBMCs; cytoskeletal_remodeling_(enriched_for_SRF_targets)_(M34); TBA_(M153)"
## training on best hyperparameters: 0.006 sec elapsed

#see vignettes in https://liuyanguu.github.io/post/2019/07/18/visualization-of-shap-for-xgboost/
#note that 0=infected (not protected) and 1 = protected

shap_plot <- c()
for(k in rev(select_data$run)){
  my_features <- xgb_res[[k]]$xgb_results$feature_names
  shap_plot[[k]] <- shap.plot.summary.wrap1(xgb_res[[k]]$xgb_results,
                                           X = as.matrix(train_features[, my_features]))
}
                                           

shap_plots_top <- ggarrange(plotlist = shap_plot[4:3], labels = names(shap_plot[4:3]),
                        heights = c(6.4,6),
                        nrow=2, align = "hv", common.legend = TRUE,
                        font.label = list(size = 10, face = "plain", color ="black"),
                        hjust = -0.5)

shap_plots_bottom <- ggarrange(plotlist = shap_plot[2:1], labels = names(shap_plot[2:1]),
                        heights = c(1,1),
                        nrow=2, align = "hv", common.legend = TRUE,
                        font.label = list(size = 10, face = "plain", color ="black"),
                        hjust = -0.5,
                        legend = "none")

shap_plots <- ggarrange(shap_plots_top, shap_plots_bottom,
                        heights = c(1,1),
                        ncol=1, align = "hv", common.legend = TRUE)

shap_plots

Confirm differences in CD3+CD4+ T cells between NP and P in placebo group at baseline

library(ggpubr)
#load the postvax highdose data set
#local filename: "highdose_PfSPZ_postvax_correlation_ML_data_with_missing.rds"
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1PXWHbJLUZqqzuHBGA8vidm05iG6pxpoL"), path = temp, overwrite = TRUE)
baseline_placebo_features <- readRDS(file = dl$local_path) %>%
  rownames_to_column(var = "PATID") %>%
  dplyr::select(PATID, contains("CD3"), contains("IgM"))

#local filename: "KSPZV1 PhenoData TTE 6 months Long Format.rds"
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1v2l7G7AzXhgcqObzAabYy6_QJsdB5hFn"), path = temp, overwrite = TRUE)
pheno_data <- readRDS(file = dl$local_path) %>%
  filter(Dosegroup == "1.8 x 10^6 PfSPZ") %>%
  dplyr::select(PATID, tte.mal.atp.3, Outcome, Dosegroup) %>%
  distinct(PATID, tte.mal.atp.3, Outcome, Dosegroup)

facs_dat <- pheno_data %>%
  left_join(., baseline_placebo_features, by = "PATID") %>%
  drop_na("FACS_CD3+CD4+_of_live_PBMCs")
  
facs_dat_n <- facs_dat %>% 
  group_by(Outcome) %>%
  mutate(n=n()) %>%
  ungroup() %>%
  mutate(outcome = ifelse(Outcome == "infected", paste0("NP (", n, ")"),
                          paste0("P (", n, ")")))

CD4_plot <- facs_dat_n %>% 
  ggplot(., aes(x = outcome, y = `FACS_CD3+CD4+_of_live_PBMCs`, fill = outcome)) +
  geom_boxplot() +
  ggbeeswarm::geom_quasirandom(size = 2, width = 0.2) +
  ylab("CD3+CD4+ (% of live PBMCs)") +
  xlab("outcome (n)") +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  stat_compare_means(method = "wilcox", vjust = 0.1, hjust = 0.5) +
  theme_bw(base_family = "sans") +
  theme(legend.position = "none",
        axis.text = element_text(size = 12, colour = "black"),
        axis.title = element_text(size = 13))

facs_dat <- pheno_data %>%
  left_join(., baseline_placebo_features, by = "PATID") %>%
  drop_na("FACS_IgM+_of_memory")
  
facs_dat_n <- facs_dat %>% 
  group_by(Outcome) %>%
  mutate(n=n()) %>%
  ungroup() %>%
  mutate(outcome = ifelse(Outcome == "infected", paste0("NP (", n, ")"),
                          paste0("P (", n, ")")))

IgM_MBC_plot <- facs_dat_n %>% 
  ggplot(., aes(x = outcome, y = `FACS_IgM+_of_memory`, fill = outcome)) +
  geom_boxplot() +
  ggbeeswarm::geom_quasirandom(size = 2, width = 0.2) +
  ylab("IgM+ (% of memory B cells)") +
  xlab("outcome (n)") +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  stat_compare_means(method = "wilcox", vjust = 0.1, hjust = 0.5) +
  theme_bw(base_family = "sans") +
  theme(legend.position = "none",
        axis.text = element_text(size = 12, colour = "black"),
        axis.title = element_text(size = 13))

CD4_IgM_plot <- ggarrange(CD4_plot, IgM_MBC_plot, ncol = 1)

CD4_IgM_plot

KSPZV1 postvax high-dose PfSPZ multimodal ML- visualize xgboost ML results, JCI revision (Figure 6C)

Tuan M. Tran

12 December, 2023

Objective