Approach

Perform training using xgboost with hyperparameter tuning with 4-fold cross-validation using random search strategy in 2500 runs on full dataset.
For each run:

manually split the training and validation set 3:1 (47:16) using a random seed
use xgb.train on the training set with hyperparameter tuning
determine feature importance and randomly select 3 to 7 of the top 7-10 features based on gain
validate these selected features on the test set using the caret confusionMatrix function, which provides prediction accuracy, sensitivity, and specificity
use pROC::roc function to get auc (https://stackoverflow.com/questions/44228137/auc-from-averaged-class-probabilities-in-caret-r)
for each run, record the features, accuracy, error rate [(FP+FN)/total or 1-accuracy], sensitivity, specificity, and auc

Repeat step 2 for 2500 runs
Calculate mean test AUC and mean test accuracy for the 4 folds.
Rank the feature combinations by highest test AUC mean and highest test accuracy mean.
In another script, calculate across-folds AUC, kappa, and logloss performance metrics. These will be the final metrics used for the 4-fold CV model.

Notes: This can be run in 500 run chunks and with each chunk saved if memory is an issue. Results can later be concatenated. If using R studio, run code directly in console for better performance.

library(openxlsx)
library(xgboost)
library(dplyr)
library(tidyverse)
library(fgsea)
library(glmnet)
library(pROC)
library(aod)
library(caret)
library(limma)
library(caTools)
library(e1071)
library(robustbase)
library(Biobase)
library(doMC)
library(randomForest)
library(data.table)
library(googledrive)

temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1QI4BIKcW-5oQfHl-o3t7Hj58OaKl8Mn8"), path = temp, overwrite = TRUE)
high_dose_features_class <- readRDS(file = dl$local_path)

Select options

train_on_split_or_full_set <- "full" #options: original_split, split, full

myseed <- sample(1:5000,1)
set.seed(myseed) #set seed for reproducibility
  print(paste0("Will train and cross-validate with nested feature selection on full high dose PfSPZ dataset which consists of ", ncol(high_dose_delta_eset), " samples and ", ncol(high_dose_features_class), " features."))

Use xgboost for both feature selection and training model. Use caret and pROC for cross-validation.

library(mlr)
library(mlr3)
#https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/beginners-tutorial-on-xgboost-parameter-tuning-r/tutorial/

#create learner
lrn <- makeLearner("classif.xgboost",
                   objective="binary:logistic",
                   nrounds=1000,
                   early_stopping_rounds = 100,
                   eval_metric="error",
                   predict.type = "response")

#set parameter space
#https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/beginners-tutorial-on-xgboost-parameter-tuning-r/tutorial/
#https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning
params <- makeParamSet(makeDiscreteParam("booster", values = c("gbtree")),
                       makeIntegerParam("gamma",lower = 0L,upper = 3L),
                       makeIntegerParam("max_depth",lower = 2L,upper = 5L),
                       makeNumericParam("eta",lower = 0.01,upper = 0.2),
                       makeNumericParam("min_child_weight",lower = 0L,upper = 8L),
                       makeNumericParam("subsample",lower = 0.75,upper = 0.9),
                       makeNumericParam("lambda",lower = 0, upper = 1),
                       makeNumericParam("alpha",lower = 0, upper = 1),
                       makeNumericParam("colsample_bytree",lower = 0.5,upper = 1))

#set resampling strategy
rdesc <- makeResampleDesc(method = "CV",
                          predict = "test",
                          iters = 4,
                          stratify = T)

#set parallel backend.  
library(parallel)
library(parallelMap)
library(tictoc)
parallelStartSocket(cpus = detectCores())

#set options
maxiterations <- 500 #number of iterations for each run of hyperparameter tuning
runs <- 2500 #number of runs
n_folds <- 4
n_feat_sampled <- 3:7 #range for number of features sampled
ctrl <- makeTuneControlRandom(maxit = maxiterations)

The following code sources a custom function (“kspzv1 xgboost manual train and cv function.R”) to run feature selection and cross validation x times for every fold of cross-validation.

source("https://raw.githubusercontent.com/TranLab/kspzv1-systems/main/kspzv1%20xgboost%20manual%20train%20and%20cv%20function.R")
fold1 <- fold2 <- fold3 <- fold4 <- c()

tic(msg = paste0("total for ", runs, " total runs"))
for(i in 2100:runs){
  run_seed <- sample(1:1e4,1)
  # Splitting data in training and validation sets
  set.seed(run_seed) #set seed for run
  tic(msg = paste0("first fold in ", i, " run of ,", runs," total runs"))
  #run first fold
  split <- sample.split(colnames(high_dose_delta_eset), SplitRatio = (n_folds-1)/n_folds)
  
  train_eset_fold1 <- high_dose_delta_eset[,split==TRUE]
  train_samples_fold1 <- colnames(train_eset_fold1)
  train_features_fold1 <- high_dose_features_class[split==TRUE,] %>%
    dplyr::select(-c(sex,site))
  
  validation_eset_fold1 <- high_dose_delta_eset[,split==FALSE]
  validation_samples_fold1 <- colnames(validation_eset_fold1)
  validation_features_fold1 <- high_dose_features_class[split==FALSE,] %>%
    dplyr::select(-c(sex,site))
  
  #following if else statement checks to see if split results in single-level outcome in validation set. Does this 2 times (just in case!).
  if(nlevels(validation_features_fold1$class)==1){
    run_new_seed <- sample(1e4:2e4,1)
    # Splitting data in training and validation sets
    set.seed(run_new_seed) #set seed for run
    split <- sample.split(colnames(high_dose_delta_eset), SplitRatio = (n_folds-1)/n_folds)
    train_eset_fold1 <- high_dose_delta_eset[,split==TRUE]
    train_samples_fold1 <- colnames(train_eset_fold1)
    train_features_fold1 <- high_dose_features_class[split==TRUE,] %>%
      dplyr::select(-c(sex,site))
    validation_eset_fold1 <- high_dose_delta_eset[,split==FALSE]
    validation_samples_fold1 <- colnames(validation_eset_fold1)
    validation_features_fold1 <- high_dose_features_class[split==FALSE,] %>%
      dplyr::select(-c(sex,site))
    }
  if(nlevels(validation_features_fold1$class)==1){
    run_newer_seed <- sample(2e4:3e4,1)
    # Splitting data in training and validation sets
    set.seed(run_newer_seed) #set seed for run
    split <- sample.split(colnames(high_dose_delta_eset), SplitRatio = (n_folds-1)/n_folds)
    train_eset_fold1 <- high_dose_delta_eset[,split==TRUE]
    train_samples_fold1 <- colnames(train_eset_fold1)
    train_features_fold1 <- high_dose_features_class[split==TRUE,] %>%
      dplyr::select(-c(sex,site))
    validation_eset_fold1 <- high_dose_delta_eset[,split==FALSE]
    validation_samples_fold1 <- colnames(validation_eset_fold1)
    validation_features_fold1 <- high_dose_features_class[split==FALSE,] %>%
      dplyr::select(-c(sex,site))
    }
  
  fold1[[i]] <- xgb_train_and_cv(train_features = train_features_fold1, validation_features = validation_features_fold1)
  names(fold1)[i] <- paste0("run ", i)
  toc()
  
  #begin second fold of cross-validation
  tic(msg = paste0("second fold in ", i, " run of ", runs," total runs"))
  validation_samples_fold2 <- sample(train_samples_fold1,
                                    size = length(validation_samples_fold1))
  validation_features_fold2 <- high_dose_features_class[validation_samples_fold2,] %>%
      dplyr::select(-c(sex,site))
  train_samples_fold2 <- setdiff(colnames(high_dose_delta_eset), validation_samples_fold2)
  train_features_fold2 <- high_dose_features_class[train_samples_fold2,] %>%
      dplyr::select(-c(sex,site))
  fold2[[i]] <- xgb_train_and_cv_on_selected_features(train_features = train_features_fold2,
                                                      validation_features = validation_features_fold2,
                                                      features_to_test = fold1[[i]]$downselected_xgb_results$feature_names)
  names(fold2)[i] <- paste0("run ", i)
  toc()
  
  #begin third fold of cross-validation
  tic(msg = paste0("third fold in ", i, " run of ", runs," total runs"))
  validation_samples_fold3 <- sample(setdiff(colnames(high_dose_delta_eset), c(validation_samples_fold1,
                                                                                 validation_samples_fold2)),
                                    size = length(validation_samples_fold1))
  validation_features_fold3 <- high_dose_features_class[validation_samples_fold3,] %>%
      dplyr::select(-c(sex,site))
  train_samples_fold3 <- setdiff(colnames(high_dose_delta_eset), validation_samples_fold3)
  train_features_fold3 <- high_dose_features_class[train_samples_fold3,] %>%
      dplyr::select(-c(sex,site))
  fold3[[i]] <- xgb_train_and_cv_on_selected_features(train_features = train_features_fold3,
                                                      validation_features = validation_features_fold3,
                                                      features_to_test = fold1[[i]]$downselected_xgb_results$feature_names)
  names(fold3)[i] <- paste0("run ", i)
  toc()
  
  #begin fourth fold of cross-validation
  tic(msg = paste0("fourth fold in ", i, " run of ", runs," total runs"))
  validation_samples_fold4 <- setdiff(colnames(high_dose_delta_eset), c(validation_samples_fold1,
                                                                          validation_samples_fold2,
                                                                          validation_samples_fold3))
  validation_features_fold4 <- high_dose_features_class[validation_samples_fold4,] %>%
      dplyr::select(-c(sex,site))
  train_samples_fold4 <- setdiff(colnames(high_dose_delta_eset), validation_samples_fold3)
  train_features_fold4 <- high_dose_features_class[train_samples_fold4,] %>%
      dplyr::select(-c(sex,site))
  fold4[[i]] <- xgb_train_and_cv_on_selected_features(train_features = train_features_fold4,
                                                      validation_features = validation_features_fold4,
                                                      features_to_test = fold1[[i]]$downselected_xgb_results$feature_names)
  names(fold4)[i] <- paste0("run ", i)
  toc()
  }
toc()

validation_res_list <- c()
for(i in 1:length(fold4)){
  validation_res_list[[i]] <- bind_rows(fold1[[i]]$validation_results %>%
                                          mutate(fold = 1) %>%
                                          dplyr::select(fold, everything()),
                                        fold2[[i]]$validation_results %>%
                                          mutate(fold = 2) %>%
                                          dplyr::select(fold, everything()),
                                        fold3[[i]]$validation_results %>%
                                          mutate(fold = 3) %>%
                                          dplyr::select(fold, everything()),
                                        fold4[[i]]$validation_results %>%
                                          mutate(fold = 4) %>%
                                          dplyr::select(fold, everything()))
  }
names(validation_res_list) <- paste0("run ", 1:length(fold4))
validation_res_df <- bind_rows(validation_res_list, .id="run")

validation_summarized_res_df <- validation_res_df %>%
  dplyr::select(run, Features, AUC, Accuracy) %>%
  group_by(run, Features) %>%
  dplyr::summarize(mean_cv_AUC = mean(AUC),
            sd_cv_AUC = sd(AUC),
            mean_cv_accuracy = mean(Accuracy),
            sd_cv_accuracy = sd(Accuracy)) %>%
  arrange(desc(mean_cv_AUC))

KSPZV1 delta 1.8x10^6 PfSPZ multimodal ML revised - xgboost nested feature selection and cv FULL DATASET

Tuan M. Tran

19 December, 2023

Objective

Approach

Select options

Use xgboost for both feature selection and training model. Use caret and pROC for cross-validation.