Objective

Visualization of xgboost ML with 4-fold cross-validation of full dataset.

Approach

Plot ROC curves, confusion matrices, and SHAP plots from top n models using xgboost results derived from “KPSZV1 Baseline Highdose PfSPZ Multimodal ML xgb nested fs cv on full set JCI Revision.Rmd”.

library(xgboost)
library(dplyr)
library(tidyverse)
library(pROC)
library(caret)
library(doMC)
library(data.table)
library(xgboost)
library(mlr)
library(ggpubr)
library(tictoc)
library(Biobase)
library(googledrive)
library(parallel)
library(parallelMap)
parallelStartSocket(cpus = detectCores())
library(SHAPforxgboost)

Load feature data

#from google drive
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1QXCOZfeSizhJd53NDbnMU28xO14q1d2G"), path = temp, overwrite = TRUE)
high_dose_features_class <- readRDS(file = dl$local_path)
train_features <- high_dose_features_class %>%
  dplyr::select(-c(sex, site))

Load ML results

This is a large file.

#from google drive
temp <- tempfile(fileext = ".Rdata")
dl <- drive_download(
  as_id("1R8FTKq6_DIKWWj-Z_gvsPtEXUfK_WlKe"), path = temp, overwrite = TRUE)
all_folds_2500_runs <- load(file = dl$local_path)

temp <- tempfile(fileext = ".xlsx")
dl <- drive_download(
  as_id("1RhGlVDH7Gbw8H-YUkO7ScX0NR96nTKlB"), path = temp, overwrite = TRUE)
per_run_res_2500_runs <- readxl::read_xlsx(dl$local_path)

temp <- tempfile(fileext = ".xlsx")
dl <- drive_download(
  as_id("1SM2KWomZ_uZaP8gpgdBwuI-3P810vP9v"), path = temp, overwrite = TRUE)
summarized_res_2500_runs <- readxl::read_xlsx(dl$local_path)

Determine across-fold performance metrics (accuracy, AUC, Brier score, kappa, logless) select best performing high-dose PfSPZ baseline models

smooth <- FALSE
runs_to_test <- summarized_res_2500_runs[summarized_res_2500_runs$mean_cv_accuracy>.66,]$run #select only the high-dose PfSPZ models with accuracy >66%.
#runs_to_test <- c("run 211", "run 2222", "run 671")

best_run <- best_combo <- all_folds_response <- all_folds_predictor <- roc_dat <- auc <- confusion_mat <- response_dichotomous <- c()
across_fold_performance <- logloss_list <- brier_list <- c()
for(j in runs_to_test){
  best_run[[j]] <- per_run_res_2500_runs[per_run_res_2500_runs$run==j,] %>%
    group_by(run) %>%
    slice_max(AUC) %>%
    distinct(Features) %>%
    pull(var = run)
  best_combo[[j]] <- per_run_res_2500_runs[per_run_res_2500_runs$run==j,] %>%
    group_by(run) %>%
    slice_max(AUC) %>%
    distinct(Features) %>%
    pull(var = Features)
  #calculate predictive accuracy across all 4 folds
  all_folds_response[[j]] <- c(fold1[[best_run[[j]]]]$predictor_data$class,
                               fold2[[best_run[[j]]]]$predictor_data$class,
                               fold3[[best_run[[j]]]]$predictor_data$class,
                               fold4[[best_run[[j]]]]$predictor_data$class)
  all_folds_predictor[[j]] <- c(fold1[[best_run[[j]]]]$predictor_data$pred.prob,
                            fold2[[best_run[[j]]]]$predictor_data$pred.prob,
                            fold3[[best_run[[j]]]]$predictor_data$pred.prob,
                            fold4[[best_run[[j]]]]$predictor_data$pred.prob)
  roc_dat[[j]] <- pROC::roc(response = all_folds_response[[j]],
                            predictor = all_folds_predictor[[j]],
                            smooth = smooth) 
  auc[[j]] <- roc_dat[[j]]$auc %>% as.numeric(.) %>% signif(.,digits=3)
  response_dichotomous[[j]]  <- ifelse(all_folds_predictor[[j]]  > 0.5,1,0)
  #remember that since we are using the high-dose model, we are going with the original high-dose designation where infected=0, protected=1
  response_dichotomous[[j]]   <- factor(ifelse(response_dichotomous[[j]]==1, "protected", "infected")) 
  confusion_mat[[j]] <- caret::confusionMatrix(response_dichotomous[[j]], all_folds_response[[j]], positive = "protected")
  logloss_list[[j]] <- MLmetrics::LogLoss(all_folds_predictor[[j]], as.numeric(all_folds_response[[j]])-1)
  brier_list[[j]] <- mlr::measureBrier(all_folds_predictor[[j]], all_folds_response[[j]], "infected", "protected")
  across_fold_performance[[j]] <- confusion_mat[[j]]$overall
} 

across_fold_performance_df <- bind_rows(across_fold_performance, .id = "run") %>%
  arrange(desc(Kappa))  %>%
  mutate(Kappa_rank = 1:nrow(.)) %>%
  arrange(desc(Accuracy)) %>%
  mutate(Accuracy_rank = 1:nrow(.))
auc_df_high_dose <- bind_rows(auc, .id = "run") %>%
  pivot_longer(., cols = everything(), names_to = "run", values_to = "AUC") %>%
  arrange(desc(AUC))  %>%
  mutate(AUC_rank = 1:nrow(.))
logloss_df <- bind_rows(logloss_list, .id = "run") %>%
  pivot_longer(., cols = everything(), names_to = "run", values_to = "LogLoss") %>%
  arrange(LogLoss)  %>%
  mutate(LogLoss_rank = 1:nrow(.))
brier_df <- bind_rows(brier_list, .id = "run") %>%
  pivot_longer(., cols = everything(), names_to = "run", values_to = "BrierScore") %>%
  mutate(BrierScore = signif(BrierScore, 3)) %>%
  arrange(BrierScore) %>%
  mutate(BrierScore_rank = 1:nrow(.))

across_fold_perform_high_dose <- auc_df_high_dose %>%
  left_join(., across_fold_performance_df, by = "run") %>%
  left_join(., summarized_res_2500_runs,
            by = "run") %>%
  left_join(., logloss_df,
            by = "run") %>%
  left_join(., brier_df,
            by = "run") %>%
  mutate(AverageRank = (Accuracy_rank + AUC_rank + BrierScore_rank + Kappa_rank + LogLoss_rank)/5) %>%
  dplyr::select(run, Features, AverageRank, BrierScore_rank, Kappa_rank, LogLoss_rank, AUC_rank, Accuracy_rank, AUC, BrierScore, LogLoss, Kappa, Accuracy:McnemarPValue, mean_cv_AUC:sd_cv_accuracy) %>%
  group_by(run) %>%
  slice_head(n=1) %>%
  ungroup() %>%
  arrange(AverageRank)

#clean up feature names
  # across_fold_perform_high_dose <- across_fold_perform_high_dose %>%
  #   mutate(Features =  gsub("CSP_specific_IgG_", "CSP-specific IgG_", Features)) %>% 
  #   mutate(Features =  gsub("FACS_|_innate_BTM|_highBTM|_monaco|CytokineObsConc_", "", Features)) %>%   
  #   mutate(Features =  gsub("PfSPZ", "Pf", Features)) %>% 
  #   mutate(Features =  gsub("_of_live_leukocytes", "_of_live_PBMCs", Features)) %>% 
  #   mutate(Features =  gsub("_", " ", Features))

Arrange data and plot AUROC curves

top_n <- 4
across_fold_perform_highdose_rank_by_AverageRank <- across_fold_perform_high_dose %>%
  arrange(AverageRank) %>%
  drop_na()
select_data <- across_fold_perform_highdose_rank_by_AverageRank[c(1:top_n),]
data.labels <- data.frame("name" = select_data[1:4,]$run,
                          "features" = select_data[1:4,]$Features,
                          "brier" = select_data[1:4,]$BrierScore,
                          "auc" = select_data[1:4,]$AUC,
                          "kappa" = select_data[1:4,]$Kappa,
                          "logloss" = select_data[1:4,]$LogLoss)
roc_plot_dat <- roc_dat[select_data[1:4,]$run] 
roc_plots <- ggroc(roc_plot_dat, legacy.axes = TRUE, size = 1.5) +
  theme_bw() +
  scale_color_brewer("", palette = "Set1", type = "qual") +
  scale_fill_brewer("", palette = "Set1", type = "qual") +
  theme(legend.position = "none") +
  geom_text(data = data.labels[1,], aes(0.875, 0.25, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
  geom_text(data = data.labels[2,], aes(0.875, 0.2, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
  geom_text(data = data.labels[3,], aes(0.875, 0.15, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
  geom_text(data = data.labels[4,], aes(0.875, 0.1, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE) +
  #geom_text(data = data.labels[5,], aes(0.875, 0.05, label = paste("AUC =", auc)), hjust=1, show_guide=FALSE) +
  theme(plot.margin = unit(c(1,1,1,1), "cm")) +
  geom_abline(slope = 1, linetype = "dotted", color = "gray")
roc_plots

Plot confusion matrices for high-dose baseline models

library(ggplot2)
library(scales)

ggplotConfusionMatrix <- function(m){
  mytitle <- paste(
    "accuracy", percent_format()(m$overall[1]),
                   "kappa", percent_format()(m$overall[2]))
  mydata <- as.data.frame(m$table) %>%
    mutate(Prediction = ifelse(Prediction == "infected", "NP", "P")) %>%
    mutate(Reference = ifelse(Reference == "infected", "NP", "P"))
  p <- ggplot(data = mydata,
           aes(x = Reference, y = Prediction)) +
    geom_tile(aes(fill = log(Freq)), colour = "white") +
    scale_fill_gradient(low = "white", high = "steelblue") +
    geom_text(aes(x = Reference, y = Prediction, label = Freq, size = 24)) +
    ggtitle(mytitle) +
    ylab("predicted") +
    xlab("actual outcome") +
    theme(legend.position = "none",
          panel.background = element_blank(),
          plot.background = element_blank(),
          axis.ticks = element_blank(),
          axis.title = element_text(color = "black", size = 10),
          axis.text = element_text(color = "black", size = 10),
          title = element_text(size = 9),) +
    theme(plot.margin = unit(c(0.5,0.45,0.01,0.4), "cm"))
  return(p)
}
#my_models <- c("run 1727", "run 671", "run 808", "run 439") #ranked by Brier
my_models <- select_data[1:4,]$run #ranked by average rank

my_plots <- list(ggplotConfusionMatrix(confusion_mat[[my_models[1]]]),
              ggplotConfusionMatrix(confusion_mat[[my_models[2]]]),
              ggplotConfusionMatrix(confusion_mat[[my_models[3]]]),
              ggplotConfusionMatrix(confusion_mat[[my_models[4]]]))
names(my_plots) <- my_models

my_highdose_confus_mat_plot_top <- ggarrange(plotlist = my_plots[1:2],
                                         labels = names(my_plots[1:2]),
                                         font.label = list(size=14, face="plain"),
                                         ncol = 2,
                                         hjust = -0.1)
my_highdose_confus_mat_plot_bottom <- ggarrange(plotlist = my_plots[3:4],
                                         labels = names(my_plots[3:4]),
                                         font.label = list(size=14, face="plain"),
                                         ncol = 2,
                                         hjust = -0.1)
my_highdose_confus_mat_plot_all <- ggarrange(my_highdose_confus_mat_plot_top,
                                             my_highdose_confus_mat_plot_bottom,
                                             nrow=2)
my_highdose_confus_mat_plot_all

Feature importance by number of appearances in top %1 models

addSmallLegend <- function(myPlot, pointSize = 0.75, textSize = 6, spaceLegend = 0.75) {
    myPlot +
        guides(shape = guide_legend(override.aes = list(size = pointSize)),
               color = guide_legend(override.aes = list(size = pointSize))) +
        theme(legend.title = element_text(size = textSize), 
              legend.text  = element_text(size = textSize),
              legend.key.size = unit(spaceLegend, "lines"))
}

#filter select data by performance metric
select_data <- across_fold_perform_highdose_rank_by_AverageRank %>%
  arrange(AverageRank) %>%
  slice_head(n=25)

#build shap data from results of each fold
shap_prep_fold1 <- shap_prep_fold2 <- shap_prep_fold3 <- shap_prep_fold4 <- shap_prep_rbind <- shap_plots <- c()
for(k in rev(select_data$run)){
  shap_prep_fold1[[k]] <- shap.prep(xgb_model = fold1[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold1[[k]]$predictor_data[,fold1[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold2[[k]] <- shap.prep(xgb_model = fold2[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold2[[k]]$predictor_data[,fold2[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold3[[k]] <- shap.prep(xgb_model = fold3[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold3[[k]]$predictor_data[,fold3[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold4[[k]] <- shap.prep(xgb_model = fold4[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold4[[k]]$predictor_data[,fold4[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_rbind[[k]] <- rbind(shap_prep_fold1[[k]],
                           shap_prep_fold2[[k]],
                           shap_prep_fold3[[k]],
                           shap_prep_fold4[[k]]) %>%
    group_by(variable) %>%
    mutate(ID = 1:n()) %>%
    ungroup()
  levels(shap_prep_rbind[[k]]$variable) <- gsub("FACS_|_innate_BTM|_highBTM|_monaco|CytokineObsConc_", "", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("PfSPZ", "Pf", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("_of_live_leukocytes", "_of_live_PBMCs", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("_", " ", levels(shap_prep_rbind[[k]]$variable))  
  shap_plots[[k]] <- shap.plot.summary(shap_prep_rbind[[k]], scientific = FALSE)
}
shap_prep_rbind_df <- bind_rows(shap_prep_rbind, .id = "model")

number_models <- sum(summary(factor(unique(shap_prep_rbind_df$model))))

var_appearances <- shap_prep_rbind_df %>%
  dplyr::select(model,variable) %>%
  distinct() %>%
  group_by(variable) %>%
  summarize(proportion=n()/number_models, n=n()) %>%
  ungroup() %>%
  arrange(desc(n))

foo <- shap_long_iris %>%
  group_by(variable) %>%
  summarize(mean = mean(value))


SHAP_feature_correlations <- shap_prep_rbind_df %>%
  dplyr::select(model,variable, value, rfvalue) %>%
  distinct() %>%
  group_by(model, variable) %>% 
  summarise(r = cor(value, rfvalue)) %>%
  ungroup() %>%
  group_by(variable) %>%
  summarize(mean_r=mean(r))
## `summarise()` has grouped output by 'model'. You can override using the
## `.groups` argument.
var_appearances <- var_appearances %>%
  left_join(., SHAP_feature_correlations,
            by = "variable")

feature_importance_plot <- var_appearances %>%
  filter(n>1) %>%
  ggplot(., aes(x= fct_reorder(variable, n, .desc = FALSE))) +
  geom_bar(aes(y=proportion, fill=mean_r), stat="identity") +
  scale_fill_gradient2(low = "darkblue", mid = "white",
                       high = "darkred",
                       midpoint = 0,limits = c(-1,1)) +
  theme_bw() +
  coord_flip() +
  xlab("features") +
  ylab("appearances") + 
  labs(fill = "R") +
  scale_y_continuous(labels = scales::percent, limits = c(0,1)) +
  theme(legend.position = "top",
        axis.text = element_text(colour = "black"))

feature_importance_plot <- addSmallLegend(feature_importance_plot)
feature_importance_plot

#see vignettes in https://liuyanguu.github.io/post/2019/07/18/visualization-of-shap-for-xgboost/
#note that 0=infected (not protected) and 1 = protected

#filter select data by top n
select_data <- across_fold_perform_highdose_rank_by_AverageRank[c(1:top_n),]

#build shap data from results of each fold
shap_prep_fold1 <- shap_prep_fold2 <- shap_prep_fold3 <- shap_prep_fold4 <- shap_prep_rbind <- shap_plots <- c()
for(k in rev(select_data$run)){
  shap_prep_fold1[[k]] <- shap.prep(xgb_model = fold1[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold1[[k]]$predictor_data[,fold1[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold2[[k]] <- shap.prep(xgb_model = fold2[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold2[[k]]$predictor_data[,fold2[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold3[[k]] <- shap.prep(xgb_model = fold3[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold3[[k]]$predictor_data[,fold3[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_fold4[[k]] <- shap.prep(xgb_model = fold4[[k]]$downselected_xgb_results,
                                    X_train = as.matrix(fold4[[k]]$predictor_data[,fold4[[k]]$downselected_xgb_results$feature_names]))
  shap_prep_rbind[[k]] <- rbind(shap_prep_fold1[[k]],
                           shap_prep_fold2[[k]],
                           shap_prep_fold3[[k]],
                           shap_prep_fold4[[k]]) %>%
    group_by(variable) %>%
    mutate(ID = 1:n()) %>%
    ungroup()
  levels(shap_prep_rbind[[k]]$variable) <- gsub("FACS_|_innate_BTM|_highBTM|_monaco|CytokineObsConc_", "", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("PfSPZ", "Pf", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("_of_live_leukocytes", "_of_live_PBMCs", levels(shap_prep_rbind[[k]]$variable))
  levels(shap_prep_rbind[[k]]$variable) <- gsub("_", " ", levels(shap_prep_rbind[[k]]$variable))  
  shap_plots[[k]] <- shap.plot.summary(shap_prep_rbind[[k]], scientific = FALSE)
}

shap_plots_top <- ggarrange(plotlist = shap_plots[4:3], labels = names(shap_plots[4:3]),
                        heights = c(7,4.3),
                        nrow=2, align = "hv", common.legend = TRUE,
                        font.label = list(size = 10, face = "plain", color ="black"),
                        hjust = -0.5)

shap_plots_bottom <- ggarrange(plotlist = shap_plots[2:1], labels = names(shap_plots[2:1]),
                        heights = c(7,5.6),
                        nrow=2, align = "hv", common.legend = TRUE,
                        font.label = list(size = 10, face = "plain", color ="black"),
                        hjust = -0.5,
                        legend = "none")

shap_plots_all <- ggarrange(shap_plots_top, shap_plots_bottom,
                        heights = c(1.02,1),
                        ncol=1, align = "hv", common.legend = TRUE)

shap_plots_all

roc_plot_dat <- roc_dat[select_data[1:4,]$run] 
roc_plots <- ggroc(roc_plot_dat, legacy.axes = TRUE, size = 1) +
  theme_bw() +
  scale_color_brewer("", palette = "Set1", type = "qual") +
  scale_fill_brewer("", palette = "Set1", type = "qual") +
  theme(legend.position = "none") +
  geom_text(data = data.labels[1,], aes(1, 0.55, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
  geom_text(data = data.labels[2,], aes(1, 0.40, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
  geom_text(data = data.labels[3,], aes(1, 0.25, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
  geom_text(data = data.labels[4,], aes(1, 0.1, label = paste0(name, ", Brier=", brier, ", AUC=", auc)), hjust=1, show_guide=FALSE, size = 2.75) +
  theme(plot.margin = unit(c(0.1,3,0.1,3), "cm"))
figure_6B_plot <- ggarrange(ggarrange(NULL, feature_importance_plot, NULL, nrow = 3,
                    heights = c(0.25,1,0.25)),
          shap_plots_all,
          ggarrange(roc_plots,
                    my_highdose_confus_mat_plot_all,
                    nrow=2,
                    heights = c(0.6,1)),
          widths = c(1.1,1.2,1),
          nrow=1)
figure_6B_plot

Examine top n models

shap_prep_rbind_df <- bind_rows(shap_prep_rbind, .id = "model")

var_appearances <- shap_prep_rbind_df %>%
  group_by(variable) %>%
  mutate(n=n()/63) %>%
  distinct(variable, n) %>%
  arrange(desc(n))

shap_by_feature_dat <-shap_prep_rbind_df %>%
  left_join(., select_data %>%
              rename(model = "run") %>%
              dplyr::select(model, contains("rank")),
            by = "model") %>%
  left_join(., var_appearances,
            by = "variable") %>%
  mutate(model = fct_reorder(model, AverageRank)) %>%
  mutate(variable = fct_reorder2(variable, mean_value, n))

shap_by_feature <- shap_by_feature_dat %>%
  ggplot(., aes(x = value, y = rfvalue)) +
  geom_point(fill = "gray", pch = 21, alpha = 0.3) +
  theme_bw() + 
  ylab("feature value") +
  xlab("SHAP value (impact on model output)") +
  xlim(c(-2,2)) +
  geom_smooth(method='lm', formula= y~x, color = "salmon", alpha = 0.30) +
  stat_cor(method = "pearson", "cor.coef.name" = "R", size = 2.5, label.x.npc = "left", label.y.npc = "top",
           aes(label = paste(..r.label.., cut(..p.., 
                                          breaks = c(-Inf, 0.0001, 0.001, 0.01, 0.05, Inf),
                                          labels = c("'****'", "'***'", "'**'", "'*'", "'ns'")),
                         sep = "~"))) +
  ggh4x::facet_grid2(model~variable, scales = "free_x", labeller = label_wrap_gen(), switch = "y") +
  theme(strip.text.x.top = element_text(angle = 90, hjust = 0, vjust = 0.5),
        strip.clip = "off",
        axis.text = element_blank(),
        axis.ticks = element_blank())
  
shap_by_feature

Confirm differences in CD11c+ cells between NP and P in 1.8x10^6 PfSPZ group at baseline

library(ggpubr)
#load the baseline high dose dataset
#local filename: "highdose_PfSPZ_baseline_correlation_ML_data_with_missing.rds"
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1PNBLTGLRlcXYuLhJxOWUy6YcLOgaJwDr"), path = temp, overwrite = TRUE)
baseline_highdose_features <- readRDS(file = dl$local_path) %>%
  rownames_to_column(var = "PATID") %>%
  dplyr::select(PATID, contains("CD11c"))

#local filename: "KSPZV1 PhenoData TTE 6 months Long Format.rds"
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1v2l7G7AzXhgcqObzAabYy6_QJsdB5hFn"), path = temp, overwrite = TRUE)
pheno_data <- readRDS(file = dl$local_path) %>%
  filter(Dosegroup == "1.8 x 10^6 PfSPZ") %>%
  dplyr::select(PATID, tte.mal.atp.3, Outcome, Dosegroup) %>%
  distinct(PATID, tte.mal.atp.3, Outcome, Dosegroup)

facs_dat <- pheno_data %>%
  left_join(., baseline_highdose_features, by = "PATID") %>%
  drop_na(`FACS_CD11c+_of_live_PBMCs`) 

facs_dat_n <- facs_dat %>% 
  group_by(Outcome) %>%
  mutate(n=n()) %>%
  ungroup() %>%
  mutate(outcome = ifelse(Outcome == "infected", paste0("NP (", n, ")"),
                          paste0("P (", n, ")")))

CD11c_plot <- facs_dat_n %>% 
  ggplot(., aes(x = outcome, y = `FACS_CD11c+_of_live_PBMCs`, fill = outcome)) +
  geom_boxplot() +
  ggbeeswarm::geom_quasirandom(size = 2, width = 0.2) +
  ylab("CD11c+ (% of live PBMCs)") +
  xlab("outcome (n)") +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  stat_compare_means(method = "wilcox", vjust = 1, hjust = 0.5) +
  theme_bw(base_family = "sans") +
  theme(legend.position = "none",
        axis.text = element_text(size = 12, colour = "black"),
        axis.title = element_text(size = 14))

CD11c_plot