Objective

Visualization of xgboost ML with 4-fold cross-validation of full dataset.

Approach

Plot ROC curves, confusion matrices, and SHAP plots from top n models using xgboost results derived from “KPSZV1 Delta Highdose PfSPZ Multimodal ML xgb nested fs cv on full set JCI Revision.Rmd”.

library(xgboost)
library(dplyr)
library(tidyverse)
library(pROC)
library(caret)
library(doMC)
library(data.table)
library(xgboost)
library(SHAPforxgboost)
library(mlr)
library(ggpubr)
library(tictoc)
library(Biobase)
library(googledrive)
library(parallel)
library(parallelMap)
parallelStartSocket(cpus = detectCores())

Load feature data

#from google drive
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1QI4BIKcW-5oQfHl-o3t7Hj58OaKl8Mn8"), path = temp, overwrite = TRUE)
high_dose_features_class <- readRDS(file = dl$local_path)
train_features <- high_dose_features_class %>%
  dplyr::select(-c(sex, site))

Load ML results

This is a large file.

#from google drive
temp <- tempfile(fileext = ".Rdata")
dl <- drive_download(
  as_id("1R0xB2u_7nMdszP0q2glaouZjb3ihmYPU"), path = temp, overwrite = TRUE)
all_folds_2500_runs <- load(file = dl$local_path)

temp <- tempfile(fileext = ".xlsx")
dl <- drive_download(
  as_id("1SVbsvL-SK4zDjHle-LuGIh0qm4gAZf2x"), path = temp, overwrite = TRUE)
per_run_res_2500_runs <- readxl::read_xlsx(dl$local_path)

temp <- tempfile(fileext = ".xlsx")
dl <- drive_download(
  as_id("1R83aGAo2jfkf48MWyeF37DUbV2LRZDdt"), path = temp, overwrite = TRUE)
summarized_res_2500_runs <- readxl::read_xlsx(dl$local_path)

Determine across-fold performance metrics (accuracy, AUC, Brier score, kappa, logless) select best performing high-dose PfSPZ delta models

smooth <- FALSE
runs_to_test <- summarized_res_2500_runs[summarized_res_2500_runs$mean_cv_accuracy>.66,]$run #select only the high-dose PfSPZ models with accuracy >66%.
#runs_to_test <- c("run 211", "run 2222", "run 671")

best_run <- best_combo <- all_folds_response <- all_folds_predictor <- roc_dat <- auc <- confusion_mat <- response_dichotomous <- c()
across_fold_performance <- logloss_list <- brier_list <- c()
for(j in runs_to_test){
  best_run[[j]] <- per_run_res_2500_runs[per_run_res_2500_runs$run==j,] %>%
    group_by(run) %>%
    slice_max(AUC) %>%
    distinct(Features) %>%
    pull(var = run)
  best_combo[[j]] <- per_run_res_2500_runs[per_run_res_2500_runs$run==j,] %>%
    group_by(run) %>%
    slice_max(AUC) %>%
    distinct(Features) %>%
    pull(var = Features)
  #calculate predictive accuracy across all 4 folds
  all_folds_response[[j]] <- c(fold1[[best_run[[j]]]]$predictor_data$class,
                               fold2[[best_run[[j]]]]$predictor_data$class,
                               fold3[[best_run[[j]]]]$predictor_data$class,
                               fold4[[best_run[[j]]]]$predictor_data$class)
  all_folds_predictor[[j]] <- c(fold1[[best_run[[j]]]]$predictor_data$pred.prob,
                            fold2[[best_run[[j]]]]$predictor_data$pred.prob,
                            fold3[[best_run[[j]]]]$predictor_data$pred.prob,
                            fold4[[best_run[[j]]]]$predictor_data$pred.prob)
  roc_dat[[j]] <- pROC::roc(response = all_folds_response[[j]],
                            predictor = all_folds_predictor[[j]],
                            smooth = smooth) 
  auc[[j]] <- roc_dat[[j]]$auc %>% as.numeric(.) %>% signif(.,digits=3)
  response_dichotomous[[j]]  <- ifelse(all_folds_predictor[[j]]  > 0.5,1,0)
  #remember that since we are using the high-dose model, we are going with the original high-dose designation where infected=0, protected=1
  #but we are actually predicting the opposite for placebo, where 1 means infected(not protected), and infected=0
  response_dichotomous[[j]]   <- factor(ifelse(response_dichotomous[[j]]==1, "protected", "infected")) 
  confusion_mat[[j]] <- caret::confusionMatrix(response_dichotomous[[j]], all_folds_response[[j]], positive = "protected")
  logloss_list[[j]] <- MLmetrics::LogLoss(all_folds_predictor[[j]], as.numeric(all_folds_response[[j]])-1)
  brier_list[[j]] <- mlr::measureBrier(all_folds_predictor[[j]], all_folds_response[[j]], "infected", "protected")
  across_fold_performance[[j]] <- confusion_mat[[j]]$overall
} 

across_fold_performance_df <- bind_rows(across_fold_performance, .id = "run") %>%
  arrange(desc(Kappa))  %>%
  mutate(Kappa_rank = 1:nrow(.)) %>%
  arrange(desc(Accuracy)) %>%
  mutate(Accuracy_rank = 1:nrow(.))
auc_df_high_dose <- bind_rows(auc, .id = "run") %>%
  pivot_longer(., cols = everything(), names_to = "run", values_to = "AUC") %>%
  arrange(desc(AUC))  %>%
  mutate(AUC_rank = 1:nrow(.))
logloss_df <- bind_rows(logloss_list, .id = "run") %>%
  pivot_longer(., cols = everything(), names_to = "run", values_to = "LogLoss") %>%
  arrange(LogLoss)  %>%
  mutate(LogLoss_rank = 1:nrow(.))
brier_df <- bind_rows(brier_list, .id = "run") %>%
  pivot_longer(., cols = everything(), names_to = "run", values_to = "BrierScore") %>%
  mutate(BrierScore = signif(BrierScore, 3)) %>%
  arrange(BrierScore) %>%
  mutate(BrierScore_rank = 1:nrow(.))

across_fold_perform_high_dose <- auc_df_high_dose %>%
  left_join(., across_fold_performance_df, by = "run") %>%
  left_join(., summarized_res_2500_runs,
            by = "run") %>%
  left_join(., logloss_df,
            by = "run") %>%
  left_join(., brier_df,
            by = "run") %>%
  mutate(AverageRank = (Accuracy_rank + AUC_rank + BrierScore_rank + Kappa_rank + LogLoss_rank)/5) %>%
  dplyr::select(run, Features, AverageRank, BrierScore_rank, Kappa_rank, LogLoss_rank, AUC_rank, Accuracy_rank, AUC, BrierScore, LogLoss, Kappa, Accuracy:McnemarPValue, mean_cv_AUC:sd_cv_accuracy) %>%
  group_by(run) %>%
  slice_head(n=1) %>%
  ungroup() %>%
  arrange(AverageRank)

#clean up feature names
  # across_fold_perform_high_dose <- across_fold_perform_high_dose %>%
  #   mutate(Features =  gsub("CSP_specific_IgG_", "CSP-specific IgG_", Features)) %>% 
  #   mutate(Features =  gsub("FACS_|_innate_BTM|_highBTM|_monaco|CytokineObsConc_", "", Features)) %>%   
  #   mutate(Features =  gsub("PfSPZ", "Pf", Features)) %>% 
  #   mutate(Features =  gsub("_of_live_leukocytes", "_of_live_PBMCs", Features)) %>% 
  #   mutate(Features =  gsub("_", " ", Features))

Arrange data and plot AUROC curves

top_n <- 4
across_fold_perform_highdose_rank_by_AverageRank <- across_fold_perform_high_dose %>%
  arrange(AverageRank) %>%
  drop_na()
select_data <- across_fold_perform_highdose_rank_by_AverageRank[c(1:top_n),]
data.labels <- data.frame("name" = select_data[1:4,]$run,
                          "features" = select_data[1:4,]$Features,
                          "brier" = select_data[1:4,]$BrierScore,
                          "auc" = select_data[1:4,]$AUC,
                          "kappa" = select_data[1:4,]$Kappa,
                          "logloss" = select_data[1:4,]$LogLoss)
roc_plot_dat <- roc_dat[select_data[1:4,]$run] 
roc_plots <- ggroc(roc_plot_dat, legacy.axes = TRUE, size = 1.5) +
  theme_bw() +
  scale_color_brewer("", palette = "Set1", type = "qual") +
  scale_fill_brewer("", palette = "Set1", type = "qual") +
  theme(legend.position = "none") +
  geom_text(data = data.labels[1,], aes(0.875, 0.25, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
  geom_text(data = data.labels[2,], aes(0.875, 0.2, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
  geom_text(data = data.labels[3,], aes(0.875, 0.15, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
  geom_text(data = data.labels[4,], aes(0.875, 0.1, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
  #geom_text(data = data.labels[5,], aes(0.875, 0.05, label = paste("AUC =", auc)), hjust=1, show_guide=FALSE) +
  theme(plot.margin = unit(c(1,1,1,1), "cm")) +
  geom_abline(slope = 1, linetype = "dotted", color = "gray")
roc_plots

Plot confusion matrices for high-dose delta models

library(ggplot2)
library(scales)

ggplotConfusionMatrix <- function(m){
  mytitle <- paste(
    "accuracy", percent_format()(m$overall[1]),
                   "kappa", percent_format()(m$overall[2]))
  mydata <- as.data.frame(m$table) %>%
    mutate(Prediction = ifelse(Prediction == "infected", "NP", "P")) %>%
    mutate(Reference = ifelse(Reference == "infected", "NP", "P"))
  p <- ggplot(data = mydata,
           aes(x = Reference, y = Prediction)) +
    geom_tile(aes(fill = log(Freq)), colour = "white") +
    scale_fill_gradient(low = "white", high = "steelblue") +
    geom_text(aes(x = Reference, y = Prediction, label = Freq, size = 24)) +
    ggtitle(mytitle) +
    ylab("predicted") +
    xlab("actual outcome") +
    theme(legend.position = "none",
          panel.background = element_blank(),
          plot.background = element_blank(),
          axis.ticks = element_blank(),
          axis.title = element_text(color = "black", size = 10),
          axis.text = element_text(color = "black", size = 10),
          title = element_text(size = 9),) +
    theme(plot.margin = unit(c(0.5,0.45,0.01,0.4), "cm"))
  return(p)
}
#my_models <- c("run 1727", "run 671", "run 808", "run 439") #ranked by Brier
my_models <- select_data[1:4,]$run #ranked by average rank

my_plots <- list(ggplotConfusionMatrix(confusion_mat[[my_models[1]]]),
              ggplotConfusionMatrix(confusion_mat[[my_models[2]]]),
              ggplotConfusionMatrix(confusion_mat[[my_models[3]]]),
              ggplotConfusionMatrix(confusion_mat[[my_models[4]]]))
names(my_plots) <- my_models

my_highdose_confus_mat_plot_top <- ggarrange(plotlist = my_plots[1:2],
                                         labels = names(my_plots[1:2]),
                                         font.label = list(size=14, face="plain"),
                                         ncol = 2,
                                         hjust = -0.1)
my_highdose_confus_mat_plot_bottom <- ggarrange(plotlist = my_plots[3:4],
                                         labels = names(my_plots[3:4]),
                                         font.label = list(size=14, face="plain"),
                                         ncol = 2,
                                         hjust = -0.1)
my_highdose_confus_mat_plot_all <- ggarrange(my_highdose_confus_mat_plot_top,
                                             my_highdose_confus_mat_plot_bottom,
                                             nrow=2)
my_highdose_confus_mat_plot_all

Feature importance by number of appearances in top %1 models

addSmallLegend <- function(myPlot, pointSize = 0.75, textSize = 6, spaceLegend = 0.75) {
    myPlot +
        guides(shape = guide_legend(override.aes = list(size = pointSize)),
               color = guide_legend(override.aes = list(size = pointSize))) +
        theme(legend.title = element_text(size = textSize), 
              legend.text  = element_text(size = textSize),
              legend.key.size = unit(spaceLegend, "lines"))
}

#filter select data by performance metric
select_data <- across_fold_perform_highdose_rank_by_AverageRank %>%
  arrange(AverageRank) %>%
  slice_head(n=25)

#build shap data from results of each fold
shap_prep_fold1 <- shap_prep_fold2 <- shap_prep_fold3 <- shap_prep_fold4 <- shap_prep_rbind <- shap_plots <- c()
for(k in rev(select_data$run)){
  shap_prep_fold1[[k]] <- shap.prep(xgb_model = fold1[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold1[[k]]$predictor_data[,fold1[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold2[[k]] <- shap.prep(xgb_model = fold2[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold2[[k]]$predictor_data[,fold2[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold3[[k]] <- shap.prep(xgb_model = fold3[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold3[[k]]$predictor_data[,fold3[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold4[[k]] <- shap.prep(xgb_model = fold4[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold4[[k]]$predictor_data[,fold4[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_rbind[[k]] <- rbind(shap_prep_fold1[[k]],
                           shap_prep_fold2[[k]],
                           shap_prep_fold3[[k]],
                           shap_prep_fold4[[k]]) %>%
    group_by(variable) %>%
    mutate(ID = 1:n()) %>%
    ungroup()
  levels(shap_prep_rbind[[k]]$variable) <- gsub("FACS_|_innate_BTM|_highBTM|_monaco|CytokineObsConc_", "", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("PfSPZ", "Pf", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("_of_live_leukocytes", "_of_live_PBMCs", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("_", " ", levels(shap_prep_rbind[[k]]$variable))  
  shap_plots[[k]] <- shap.plot.summary(shap_prep_rbind[[k]], scientific = FALSE)
}
shap_prep_rbind_df <- bind_rows(shap_prep_rbind, .id = "model")

number_models <- sum(summary(factor(unique(shap_prep_rbind_df$model))))

var_appearances <- shap_prep_rbind_df %>%
  dplyr::select(model,variable) %>%
  distinct() %>%
  group_by(variable) %>%
  summarize(proportion=n()/number_models, n=n()) %>%
  ungroup() %>%
  arrange(desc(n))

foo <- shap_long_iris %>%
  group_by(variable) %>%
  summarize(mean = mean(value))


SHAP_feature_correlations <- shap_prep_rbind_df %>%
  dplyr::select(model,variable, value, rfvalue) %>%
  distinct() %>%
  group_by(model, variable) %>% 
  summarise(r = cor(value, rfvalue)) %>%
  ungroup() %>%
  group_by(variable) %>%
  summarize(mean_r=mean(r))
## `summarise()` has grouped output by 'model'. You can override using the
## `.groups` argument.
var_appearances <- var_appearances %>%
  left_join(., SHAP_feature_correlations,
            by = "variable")

feature_importance_plot <- var_appearances %>%
  filter(n>1) %>%
  ggplot(., aes(x= fct_reorder(variable, n, .desc = FALSE))) +
  geom_bar(aes(y=proportion, fill=mean_r), stat="identity") +
  scale_fill_gradient2(low = "darkblue", mid = "white",
                       high = "darkred",
                       midpoint = 0,limits = c(-1,1)) +
  theme_bw() +
  coord_flip() +
  xlab("features") +
  ylab("appearances") + 
  labs(fill = "R") +
  scale_y_continuous(labels = scales::percent, limits = c(0,1)) +
  theme(legend.position = "top",
        axis.text = element_text(colour = "black"))

feature_importance_plot <- addSmallLegend(feature_importance_plot)
feature_importance_plot

#see vignettes in https://liuyanguu.github.io/post/2019/07/18/visualization-of-shap-for-xgboost/
#note that 0=infected (not protected) and 1 = protected

#filter select data by top n
select_data <- across_fold_perform_highdose_rank_by_AverageRank[c(1:top_n),]

#build shap data from results of each fold
shap_prep_fold1 <- shap_prep_fold2 <- shap_prep_fold3 <- shap_prep_fold4 <- shap_prep_rbind <- shap_plots <- c()
for(k in rev(select_data$run)){
  shap_prep_fold1[[k]] <- shap.prep(xgb_model = fold1[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold1[[k]]$predictor_data[,fold1[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold2[[k]] <- shap.prep(xgb_model = fold2[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold2[[k]]$predictor_data[,fold2[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold3[[k]] <- shap.prep(xgb_model = fold3[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold3[[k]]$predictor_data[,fold3[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold4[[k]] <- shap.prep(xgb_model = fold4[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold4[[k]]$predictor_data[,fold4[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_rbind[[k]] <- rbind(shap_prep_fold1[[k]],
                           shap_prep_fold2[[k]],
                           shap_prep_fold3[[k]],
                           shap_prep_fold4[[k]]) %>%
    group_by(variable) %>%
    mutate(ID = 1:n()) %>%
    ungroup()
  levels(shap_prep_rbind[[k]]$variable) <- gsub("FACS_|_innate_BTM|_highBTM|_monaco|CytokineObsConc_", "", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("PfSPZ", "Pf", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("_of_live_leukocytes", "_of_live_PBMCs", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("_", " ", levels(shap_prep_rbind[[k]]$variable))  
  shap_plots[[k]] <- shap.plot.summary(shap_prep_rbind[[k]], scientific = FALSE)
}

shap_plots_top <- ggarrange(plotlist = shap_plots[4:3], labels = names(shap_plots[4:3]),
                        heights = c(4.3,7),
                        nrow=2, align = "hv", common.legend = TRUE,
                        font.label = list(size = 10, face = "plain", color ="black"),
                        hjust = -0.5)

shap_plots_bottom <- ggarrange(plotlist = shap_plots[2:1], labels = names(shap_plots[2:1]),
                        heights = c(6,6),
                        nrow=2, align = "hv", common.legend = TRUE,
                        font.label = list(size = 10, face = "plain", color ="black"),
                        hjust = -0.5,
                        legend = "none")

shap_plots_all <- ggarrange(shap_plots_top, shap_plots_bottom,
                        heights = c(1.025,1),
                        ncol=1, align = "hv", common.legend = TRUE)

shap_plots_all

roc_plot_dat <- roc_dat[select_data[1:4,]$run] 
roc_plots <- ggroc(roc_plot_dat, legacy.axes = TRUE, size = 1) +
  theme_bw() +
  scale_color_brewer("", palette = "Set1", type = "qual") +
  scale_fill_brewer("", palette = "Set1", type = "qual") +
  theme(legend.position = "none") +
  geom_text(data = data.labels[1,], aes(1, 0.55, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
  geom_text(data = data.labels[2,], aes(1, 0.40, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
  geom_text(data = data.labels[3,], aes(1, 0.25, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
  geom_text(data = data.labels[4,], aes(1, 0.1, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
  theme(plot.margin = unit(c(0.1,3,0.1,3), "cm"))
figure_5D_plot <- ggarrange(ggarrange(NULL, feature_importance_plot, NULL, nrow = 3,
                    heights = c(0.2,1,0.2)),
          shap_plots_all,
          ggarrange(roc_plots,
                    my_highdose_confus_mat_plot_all,
                    nrow=2,
                    heights = c(0.6,1)),
          widths = c(1.1,1.2,1),
          nrow=1)
figure_5D_plot

Examine top n models

shap_prep_rbind_df <- bind_rows(shap_prep_rbind, .id = "model")

var_appearances <- shap_prep_rbind_df %>%
  group_by(variable) %>%
  mutate(n=n()/63) %>%
  distinct(variable, n) %>%
  arrange(desc(n))

shap_by_feature_dat <-shap_prep_rbind_df %>%
  left_join(., select_data %>%
              rename(model = "run") %>%
              dplyr::select(model, contains("rank")),
            by = "model") %>%
  left_join(., var_appearances,
            by = "variable") %>%
  mutate(model = fct_reorder(model, AverageRank)) %>%
  mutate(variable = fct_reorder2(variable, mean_value, n))

shap_by_feature <- shap_by_feature_dat %>%
  ggplot(., aes(x = value, y = rfvalue)) +
  geom_point(fill = "gray", pch = 21, alpha = 0.3) +
  theme_bw() + 
  ylab("feature value") +
  xlab("SHAP value (impact on model output)") +
  xlim(c(-2,2)) +
  geom_smooth(method='lm', formula= y~x, color = "salmon", alpha = 0.30) +
  stat_cor(method = "pearson", "cor.coef.name" = "R", size = 2.5, label.x.npc = "left", label.y.npc = "top",
           aes(label = paste(..r.label.., cut(..p.., 
                                          breaks = c(-Inf, 0.0001, 0.001, 0.01, 0.05, Inf),
                                          labels = c("'****'", "'***'", "'**'", "'*'", "'ns'")),
                         sep = "~"))) +
  ggh4x::facet_grid2(model~variable, scales = "free_x", labeller = label_wrap_gen(), switch = "y") +
  theme(strip.text.x.top = element_text(angle = 90, hjust = 0, vjust = 0.5),
        strip.clip = "off",
        axis.text = element_blank(),
        axis.ticks = element_blank())
  
shap_by_feature

### Run xgb on full data set using the downselected, cross-validated features
#This can be used to create new models de novo for Shapley plots.
source("/Users/tuantran/Library/CloudStorage/OneDrive-IndianaUniversity/Manuscripts/KSPZV1 Manuscript/kspzv1-systems-analysis/kspzv1 xgboost manual train and cv function.R")
xgb_res <- c()
set.seed(12345)
#for(k in select_data$run){
for(k in my_models){
  my_features <- fold4[[k]]$downselected_xgb_results$feature_names
  xgb_res[[k]] <- xgb_train_on_selected_features(train_features = train_features,
                                                 features_to_test = my_features,
                                                 maxiterations = 2000)
}
## [1] "training set good to go!"
## [1] "Training set reduced to 61 samples and 389 features."
## [1] "Max iterations for hyperparameter tuning: 2000"
## [1] "Running hyperparameter tuning."
## [1] "Begin training..."
## [1] "downselected training set good to go!"
## [1] "Running hyperparameter tuning..."
## [1] "maxiterations for hyperparameter tuning: 2000"
## [Tune] Started tuning learner classif.xgboost for parameter set:
##                      Type len Def      Constr Req Tunable Trafo
## booster          discrete   -   -      gbtree   -    TRUE     -
## gamma             integer   -   -      0 to 3   -    TRUE     -
## max_depth         integer   -   -      2 to 5   -    TRUE     -
## eta               numeric   -   - 0.01 to 0.2   -    TRUE     -
## min_child_weight  numeric   -   -      0 to 4   -    TRUE     -
## subsample         numeric   -   - 0.75 to 0.9   -    TRUE     -
## lambda            numeric   -   -      0 to 1   -    TRUE     -
## alpha             numeric   -   -      0 to 1   -    TRUE     -
## colsample_bytree  numeric   -   -    0.5 to 1   -    TRUE     -
## With control class: TuneControlRandom
## Imputation value: -0
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 10; elements = 2000.
## [Tune] Result: booster=gbtree; gamma=0; max_depth=3; eta=0.101; min_child_weight=3.04; subsample=0.89; lambda=0.356; alpha=0.607; colsample_bytree=0.602 : acc.test.mean=0.8020833
## hyperparameter tuning...: 35.432 sec elapsed
## [1] "Training on best hyperparameters."
## [1]  train-error:0.295082 
## Will train until train_error hasn't improved in 20 rounds.
## 
## [11] train-error:0.147541 
## [21] train-error:0.131148 
## [31] train-error:0.114754 
## [41] train-error:0.098361 
## [51] train-error:0.098361 
## Stopping. Best iteration:
## [33] train-error:0.081967
## 
## [1] "The 3 downselected features used in the model are TBA_(M72.0); TBA_(M70.0); double_positive_thymocytes_(M126)"
## training on best hyperparameters: 0.008 sec elapsed
## [1] "training set good to go!"
## [1] "Training set reduced to 61 samples and 389 features."
## [1] "Max iterations for hyperparameter tuning: 2000"
## [1] "Running hyperparameter tuning."
## [1] "Begin training..."
## [1] "downselected training set good to go!"
## [1] "Running hyperparameter tuning..."
## [1] "maxiterations for hyperparameter tuning: 2000"
## [Tune] Started tuning learner classif.xgboost for parameter set:
##                      Type len Def      Constr Req Tunable Trafo
## booster          discrete   -   -      gbtree   -    TRUE     -
## gamma             integer   -   -      0 to 3   -    TRUE     -
## max_depth         integer   -   -      2 to 5   -    TRUE     -
## eta               numeric   -   - 0.01 to 0.2   -    TRUE     -
## min_child_weight  numeric   -   -      0 to 4   -    TRUE     -
## subsample         numeric   -   - 0.75 to 0.9   -    TRUE     -
## lambda            numeric   -   -      0 to 1   -    TRUE     -
## alpha             numeric   -   -      0 to 1   -    TRUE     -
## colsample_bytree  numeric   -   -    0.5 to 1   -    TRUE     -
## With control class: TuneControlRandom
## Imputation value: -0
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 10; elements = 2000.
## [Tune] Result: booster=gbtree; gamma=0; max_depth=5; eta=0.118; min_child_weight=2; subsample=0.751; lambda=0.499; alpha=0.0172; colsample_bytree=0.767 : acc.test.mean=0.8197917
## hyperparameter tuning...: 23.601 sec elapsed
## [1] "Training on best hyperparameters."
## [1]  train-error:0.311475 
## Will train until train_error hasn't improved in 20 rounds.
## 
## [11] train-error:0.131148 
## [21] train-error:0.065574 
## [31] train-error:0.049180 
## [41] train-error:0.032787 
## [51] train-error:0.032787 
## Stopping. Best iteration:
## [34] train-error:0.016393
## 
## [1] "The 7 downselected features used in the model are transmembrane_transport_(SLC_cluster)_(M154.1); TBA_(M137); FACS_PBs_of_memory; translation_initiation_factor_3_complex_(M245); TBA_(M180); double_positive_thymocytes_(M126); TBA_(M70.0)"
## training on best hyperparameters: 0.007 sec elapsed
## [1] "training set good to go!"
## [1] "Training set reduced to 61 samples and 389 features."
## [1] "Max iterations for hyperparameter tuning: 2000"
## [1] "Running hyperparameter tuning."
## [1] "Begin training..."
## [1] "downselected training set good to go!"
## [1] "Running hyperparameter tuning..."
## [1] "maxiterations for hyperparameter tuning: 2000"
## [Tune] Started tuning learner classif.xgboost for parameter set:
##                      Type len Def      Constr Req Tunable Trafo
## booster          discrete   -   -      gbtree   -    TRUE     -
## gamma             integer   -   -      0 to 3   -    TRUE     -
## max_depth         integer   -   -      2 to 5   -    TRUE     -
## eta               numeric   -   - 0.01 to 0.2   -    TRUE     -
## min_child_weight  numeric   -   -      0 to 4   -    TRUE     -
## subsample         numeric   -   - 0.75 to 0.9   -    TRUE     -
## lambda            numeric   -   -      0 to 1   -    TRUE     -
## alpha             numeric   -   -      0 to 1   -    TRUE     -
## colsample_bytree  numeric   -   -    0.5 to 1   -    TRUE     -
## With control class: TuneControlRandom
## Imputation value: -0
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 10; elements = 2000.
## [Tune] Result: booster=gbtree; gamma=0; max_depth=4; eta=0.151; min_child_weight=0.512; subsample=0.858; lambda=0.7; alpha=0.403; colsample_bytree=0.753 : acc.test.mean=0.8197917
## hyperparameter tuning...: 21.561 sec elapsed
## [1] "Training on best hyperparameters."
## [1]  train-error:0.147541 
## Will train until train_error hasn't improved in 20 rounds.
## 
## [11] train-error:0.049180 
## [21] train-error:0.016393 
## [31] train-error:0.000000 
## [41] train-error:0.000000 
## Stopping. Best iteration:
## [23] train-error:0.000000
## 
## [1] "The 6 downselected features used in the model are double_positive_thymocytes_(M126); transcription_elongation,_RNA_polymerase_II_(M234); FACS_CD56+CD16+_of_live_PBMCs; plasma_membrane,_cell_junction_(M162.0); proteasome_(M226); TBA_(M218)"
## training on best hyperparameters: 0.007 sec elapsed
## [1] "training set good to go!"
## [1] "Training set reduced to 61 samples and 389 features."
## [1] "Max iterations for hyperparameter tuning: 2000"
## [1] "Running hyperparameter tuning."
## [1] "Begin training..."
## [1] "downselected training set good to go!"
## [1] "Running hyperparameter tuning..."
## [1] "maxiterations for hyperparameter tuning: 2000"
## [Tune] Started tuning learner classif.xgboost for parameter set:
##                      Type len Def      Constr Req Tunable Trafo
## booster          discrete   -   -      gbtree   -    TRUE     -
## gamma             integer   -   -      0 to 3   -    TRUE     -
## max_depth         integer   -   -      2 to 5   -    TRUE     -
## eta               numeric   -   - 0.01 to 0.2   -    TRUE     -
## min_child_weight  numeric   -   -      0 to 4   -    TRUE     -
## subsample         numeric   -   - 0.75 to 0.9   -    TRUE     -
## lambda            numeric   -   -      0 to 1   -    TRUE     -
## alpha             numeric   -   -      0 to 1   -    TRUE     -
## colsample_bytree  numeric   -   -    0.5 to 1   -    TRUE     -
## With control class: TuneControlRandom
## Imputation value: -0
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 10; elements = 2000.
## [Tune] Result: booster=gbtree; gamma=1; max_depth=4; eta=0.115; min_child_weight=1.15; subsample=0.761; lambda=0.339; alpha=0.472; colsample_bytree=0.797 : acc.test.mean=0.8520833
## hyperparameter tuning...: 22.612 sec elapsed
## [1] "Training on best hyperparameters."
## [1]  train-error:0.213115 
## Will train until train_error hasn't improved in 20 rounds.
## 
## [11] train-error:0.081967 
## [21] train-error:0.114754 
## Stopping. Best iteration:
## [10] train-error:0.065574
## 
## [1] "The 6 downselected features used in the model are double_positive_thymocytes_(M126); FACS_Vd1/2-_of_T_cells; TBA_(M185); T_cell_surface_signature_(S0); FACS_PfSPZ-specific_CD3+CD4+_of_live_PBMCs; FACS_atypical_memory_of_B_cells"
## training on best hyperparameters: 0.005 sec elapsed
#see vignettes in https://liuyanguu.github.io/post/2019/07/18/visualization-of-shap-for-xgboost/
#note that 0=infected (not protected) and 1 = protected

shap_plot <- c()
#for(k in rev(select_data$run)){
for(k in rev(my_models)){
  my_features <- xgb_res[[k]]$xgb_results$feature_names
  shap_plot[[k]] <- shap.plot.summary.wrap1(xgb_res[[k]]$xgb_results,
                                           X = as.matrix(train_features[, my_features]))
}
                                           

shap_plots_top <- ggarrange(plotlist = shap_plot[4:3], labels = names(shap_plot[4:3]),
                        heights = c(3,5),
                        nrow=2, align = "hv", common.legend = TRUE,
                        font.label = list(size = 10, face = "plain", color ="black"),
                        hjust = -0.5)

shap_plots_bottom <- ggarrange(plotlist = shap_plot[2:1], labels = names(shap_plot[2:1]),
                        heights = c(1,1),
                        nrow=2, align = "hv", common.legend = TRUE,
                        font.label = list(size = 10, face = "plain", color ="black"),
                        hjust = -0.5,
                        legend = "none")

shap_plots <- ggarrange(shap_plots_top, shap_plots_bottom,
                        heights = c(1,1.07),
                        ncol=1, align = "hv", common.legend = TRUE)

shap_plots

Confirm differences in CD56+CD16+ and Pf−specific CD3+CD4+ cells between ΔNP and ΔP in 1.8x10^6 PfSPZ group

Note that values are log2 fold-change post-vax over baseline.

library(ggpubr)
#load the delta high dose dataset
#local filename: "highdose_PfSPZ_delta_correlation_ML_data_with_missing.rds"
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1Q-VUAyvGxZcmEV9hgshpuz87QifzpzSd"), path = temp, overwrite = TRUE)
delta_highdose_features <- readRDS(file = dl$local_path) %>%
  rownames_to_column(var="PATID") %>%
  dplyr::select(PATID, contains("CD3"), contains("CD56"), contains("atypical"), contains("Vd1/2"))

#local filename: "KSPZV1 PhenoData TTE 6 months Long Format.rds"
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1v2l7G7AzXhgcqObzAabYy6_QJsdB5hFn"), path = temp, overwrite = TRUE)
pheno_data <- readRDS(file = dl$local_path) %>%
  filter(Dosegroup == "1.8 x 10^6 PfSPZ") %>%
  dplyr::select(PATID, tte.mal.atp.3, Outcome, Dosegroup) %>%
  distinct(PATID, tte.mal.atp.3, Outcome, Dosegroup)

facs_dat <- pheno_data %>%
  left_join(., delta_highdose_features, by = "PATID") %>%
  drop_na(`FACS_CD56+CD16+_of_live_PBMCs`) 
facs_dat_n <- facs_dat %>% 
  group_by(Outcome) %>%
  mutate(n=n()) %>%
  ungroup() %>%
  mutate(outcome = ifelse(Outcome == "infected", paste0("ΔNP (", n, ")"),
                          paste0("ΔP (", n, ")")))
CD56_plot <- facs_dat_n %>% 
  ggplot(., aes(x = outcome, y = `FACS_CD56+CD16+_of_live_PBMCs`, fill = outcome)) +
  geom_boxplot() +
  ggbeeswarm::geom_quasirandom(size = 2, width = 0.2) +
  ylab("CD56+CD16+ (% of live PBMCs)") +
  xlab("outcome (n)") +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  stat_compare_means(method = "wilcox", vjust = 0, hjust = 0.5, label.y.npc = "bottom") +
  theme_bw(base_family = "sans") +
  theme(legend.position = "none",
        axis.text = element_text(size = 10, colour = "black"),
        axis.title = element_text(size = 12))

facs_dat <- pheno_data %>%
  left_join(., delta_highdose_features, by = "PATID") %>%
  drop_na(`FACS_PfSPZ-specific_CD3+CD4+_of_live_PBMCs`) 
facs_dat_n <- facs_dat %>% 
  group_by(Outcome) %>%
  mutate(n=n()) %>%
  ungroup() %>%
  mutate(outcome = ifelse(Outcome == "infected", paste0("ΔNP (", n, ")"),
                          paste0("ΔP (", n, ")")))
CD3CD4_plot <- facs_dat_n %>% 
  ggplot(., aes(x = outcome, y = `FACS_PfSPZ-specific_CD3+CD4+_of_live_PBMCs`, fill = outcome)) +
  geom_boxplot() +
  ggbeeswarm::geom_quasirandom(size = 2, width = 0.2) +
  ylab("Pf−specific CD3+CD4+ (% of live PBMCs)") +
  xlab("outcome (n)") +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  stat_compare_means(method = "wilcox", vjust = 0, hjust = 0.5, label.y.npc = "bottom") +
  theme_bw(base_family = "sans") +
  theme(legend.position = "none",
        axis.text = element_text(size = 10, colour = "black"),
        axis.title = element_text(size = 12))

facs_dat <- pheno_data %>%
  left_join(., delta_highdose_features, by = "PATID") %>%
  drop_na(`FACS_atypical_memory_of_B_cells`) 
facs_dat_n <- facs_dat %>% 
  group_by(Outcome) %>%
  mutate(n=n()) %>%
  ungroup() %>%
  mutate(outcome = ifelse(Outcome == "infected", paste0("ΔNP (", n, ")"),
                          paste0("ΔP (", n, ")")))
atBcell_plot <- facs_dat_n %>% 
  ggplot(., aes(x = outcome, y = `FACS_atypical_memory_of_B_cells`, fill = outcome)) +
  geom_boxplot() +
  ggbeeswarm::geom_quasirandom(size = 2, width = 0.2) +
  ylab("atypical memory (% of B cells)") +
  xlab("outcome (n)") +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  stat_compare_means(method = "wilcox", vjust = 0, hjust = 0.5, label.y.npc = "bottom") +
  theme_bw(base_family = "sans") +
  theme(legend.position = "none",
        axis.text = element_text(size = 10, colour = "black"),
        axis.title = element_text(size = 12))

facs_dat <- pheno_data %>%
  left_join(., delta_highdose_features, by = "PATID") %>%
  drop_na(`FACS_Vd1/2-_of_T_cells`) 
facs_dat_n <- facs_dat %>% 
  group_by(Outcome) %>%
  mutate(n=n()) %>%
  ungroup() %>%
  mutate(outcome = ifelse(Outcome == "infected", paste0("ΔNP (", n, ")"),
                          paste0("ΔP (", n, ")")))
vd12_plot <- facs_dat_n %>% 
  ggplot(., aes(x = outcome, y = `FACS_Vd1/2-_of_T_cells`, fill = outcome)) +
  geom_boxplot() +
  ggbeeswarm::geom_quasirandom(size = 2, width = 0.2) +
  ylab("Vδ1/2− (% of T cells)") +
  xlab("outcome (n)") +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  stat_compare_means(method = "wilcox", vjust = 0, hjust = 0.5, label.y.npc = "bottom") +
  theme_bw(base_family = "sans") +
  theme(legend.position = "none",
        axis.text = element_text(size = 10, colour = "black"),
        axis.title = element_text(size = 12))

my_FACS_plot <- ggarrange(CD56_plot, CD3CD4_plot, atBcell_plot, vd12_plot)

my_FACS_plot