Objective

Performing training and cross-validation on full dataset

Approach

  1. Perform training using xgboost with hyperparameter tuning with 4-fold cross-validation using random search strategy in 2500 runs on full dataset.
  2. For each run:
  1. manually split the training and validation set 3:1 (47:16) using a random seed
  2. use xgb.train on the training set with hyperparameter tuning
  3. determine feature importance and randomly select 3 to 7 of the top 7-10 features based on gain
  4. validate these selected features on the test set using the caret confusionMatrix function, which provides prediction accuracy, sensitivity, and specificity
  5. use pROC::roc function to get auc (https://stackoverflow.com/questions/44228137/auc-from-averaged-class-probabilities-in-caret-r)
  6. for each run, record the features, accuracy, error rate [(FP+FN)/total or 1-accuracy], sensitivity, specificity, and auc
  1. Repeat step 2 for 2500 runs
  2. Calculate mean test AUC and mean test accuracy for the 4 folds.
  3. Rank the feature combinations by highest test AUC mean and highest test accuracy mean.
  4. In another script, calculate across-folds AUC, kappa, and logloss performance metrics. These will be the final metrics used for the 4-fold CV model.

Notes:

This can be run in 500 run chunks and with each chunk saved if memory is an issue. Results can later be concatenated. If using R studio, run code directly in console for better performance.

library(openxlsx)
library(xgboost)
library(dplyr)
library(tidyverse)
library(fgsea)
library(glmnet)
library(pROC)
library(aod)
library(caret)
library(limma)
library(caTools)
library(e1071)
library(robustbase)
library(Biobase)
library(doMC)
library(randomForest)
library(data.table)
library(googledrive)
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1QXNOms5dHbJtjV8HRRzGpjY4ErjghDPi"), path = temp, overwrite = TRUE)
placebo_features_class <- readRDS(file = dl$local_path)

Select options

train_on_split_or_full_set <- "full" #options: original_split, split, full
myseed <- sample(1:5000,1)
set.seed(myseed) #set seed for reproducibility
  print(paste0("Will train and cross-validate with nested feature selection on full placebo PfSPZ dataset which consists of ", ncol(placebo_baseline_eset), " samples and ", ncol(placebo_features_class), " features."))

Use xgboost for both feature selection and training model. Use caret and pROC for cross-validation.

library(mlr)
library(mlr3)
#https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/beginners-tutorial-on-xgboost-parameter-tuning-r/tutorial/

#create learner
lrn <- makeLearner("classif.xgboost",
                   objective="binary:logistic",
                   nrounds=1000,
                   early_stopping_rounds = 100,
                   eval_metric="error",
                   predict.type = "response")

#set parameter space
#https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/beginners-tutorial-on-xgboost-parameter-tuning-r/tutorial/
#https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning
params <- makeParamSet(makeDiscreteParam("booster", values = c("gbtree")),
                       makeIntegerParam("gamma",lower = 0L,upper = 3L),
                       makeIntegerParam("max_depth",lower = 2L,upper = 5L),
                       makeNumericParam("eta",lower = 0.01,upper = 0.2),
                       makeNumericParam("min_child_weight",lower = 0L,upper = 8L),
                       makeNumericParam("subsample",lower = 0.75,upper = 0.9),
                       makeNumericParam("lambda",lower = 0, upper = 1),
                       makeNumericParam("alpha",lower = 0, upper = 1),
                       makeNumericParam("colsample_bytree",lower = 0.5,upper = 1))

#set resampling strategy
rdesc <- makeResampleDesc(method = "CV",
                          predict = "test",
                          iters = 4,
                          stratify = T)
#set parallel backend.  
library(parallel)
library(parallelMap)
library(tictoc)
parallelStartSocket(cpus = detectCores())

#set options
maxiterations <- 500 #number of iterations for each run of hyperparameter tuning
runs <- 2500 #number of runs
n_folds <- 4
n_feat_sampled <- 3:7 #range for number of features sampled
ctrl <- makeTuneControlRandom(maxit = maxiterations)

The following code sources a custom function (“kspzv1 xgboost manual train and cv function.R”) to run feature selection and cross validation x times for every fold of cross-validation.

source("https://raw.githubusercontent.com/TranLab/kspzv1-systems/main/kspzv1%20xgboost%20manual%20train%20and%20cv%20function.R")
fold1 <- fold2 <- fold3 <- fold4 <- c()
tic(msg = paste0("total for ", runs, " total runs"))
for(i in 1001:runs){
  run_seed <- sample(1:1e4,1)
  # Splitting data in training and validation sets
  set.seed(run_seed) #set seed for run
  tic(msg = paste0("first fold in ", i, " run of ,", runs," total runs"))
  #run first fold
  split <- sample.split(colnames(placebo_baseline_eset), SplitRatio = (n_folds-1)/n_folds)
  
  train_eset_fold1 <- placebo_baseline_eset[,split==TRUE]
  train_samples_fold1 <- colnames(train_eset_fold1)
  train_features_fold1 <- placebo_features_class[split==TRUE,] %>%
    dplyr::select(-c(sex,site))
  
  validation_eset_fold1 <- placebo_baseline_eset[,split==FALSE]
  validation_samples_fold1 <- colnames(validation_eset_fold1)
  validation_features_fold1 <- placebo_features_class[split==FALSE,] %>%
    dplyr::select(-c(sex,site))
  
  #following if else statement checks to see if split results in single-level outcome in validation set. Does this 2 times (just in case!).
  if(nlevels(validation_features_fold1$class)==1){
    run_new_seed <- sample(1e4:2e4,1)
    # Splitting data in training and validation sets
    set.seed(run_new_seed) #set seed for run
    split <- sample.split(colnames(placebo_baseline_eset), SplitRatio = (n_folds-1)/n_folds)
    train_eset_fold1 <- placebo_baseline_eset[,split==TRUE]
    train_samples_fold1 <- colnames(train_eset_fold1)
    train_features_fold1 <- placebo_features_class[split==TRUE,] %>%
      dplyr::select(-c(sex,site))
    validation_eset_fold1 <- placebo_baseline_eset[,split==FALSE]
    validation_samples_fold1 <- colnames(validation_eset_fold1)
    validation_features_fold1 <- placebo_features_class[split==FALSE,] %>%
      dplyr::select(-c(sex,site))
    }
  if(nlevels(validation_features_fold1$class)==1){
    run_newer_seed <- sample(2e4:3e4,1)
    # Splitting data in training and validation sets
    set.seed(run_newer_seed) #set seed for run
    split <- sample.split(colnames(placebo_baseline_eset), SplitRatio = (n_folds-1)/n_folds)
    train_eset_fold1 <- placebo_baseline_eset[,split==TRUE]
    train_samples_fold1 <- colnames(train_eset_fold1)
    train_features_fold1 <- placebo_features_class[split==TRUE,] %>%
      dplyr::select(-c(sex,site))
    validation_eset_fold1 <- placebo_baseline_eset[,split==FALSE]
    validation_samples_fold1 <- colnames(validation_eset_fold1)
    validation_features_fold1 <- placebo_features_class[split==FALSE,] %>%
      dplyr::select(-c(sex,site))
    }
  
  fold1[[i]] <- xgb_train_and_cv(train_features = train_features_fold1, validation_features = validation_features_fold1)
  names(fold1)[i] <- paste0("run ", i)
  toc()
  
  #begin second fold of cross-validation
  tic(msg = paste0("second fold in ", i, " run of ", runs," total runs"))
  validation_samples_fold2 <- sample(train_samples_fold1,
                                    size = length(validation_samples_fold1))
  validation_features_fold2 <- placebo_features_class[validation_samples_fold2,] %>%
      dplyr::select(-c(sex,site))
  train_samples_fold2 <- setdiff(colnames(placebo_baseline_eset), validation_samples_fold2)
  train_features_fold2 <- placebo_features_class[train_samples_fold2,] %>%
      dplyr::select(-c(sex,site))
  fold2[[i]] <- xgb_train_and_cv_on_selected_features(train_features = train_features_fold2,
                                                      validation_features = validation_features_fold2,
                                                      features_to_test = fold1[[i]]$downselected_xgb_results$feature_names)
  names(fold2)[i] <- paste0("run ", i)
  toc()
  
  #begin third fold of cross-validation
  tic(msg = paste0("third fold in ", i, " run of ", runs," total runs"))
  validation_samples_fold3 <- sample(setdiff(colnames(placebo_baseline_eset), c(validation_samples_fold1,
                                                                                 validation_samples_fold2)),
                                    size = length(validation_samples_fold1))
  validation_features_fold3 <- placebo_features_class[validation_samples_fold3,] %>%
      dplyr::select(-c(sex,site))
  train_samples_fold3 <- setdiff(colnames(placebo_baseline_eset), validation_samples_fold3)
  train_features_fold3 <- placebo_features_class[train_samples_fold3,] %>%
      dplyr::select(-c(sex,site))
  fold3[[i]] <- xgb_train_and_cv_on_selected_features(train_features = train_features_fold3,
                                                      validation_features = validation_features_fold3,
                                                      features_to_test = fold1[[i]]$downselected_xgb_results$feature_names)
  names(fold3)[i] <- paste0("run ", i)
  toc()
  
  #begin fourth fold of cross-validation
  tic(msg = paste0("fourth fold in ", i, " run of ", runs," total runs"))
  validation_samples_fold4 <- setdiff(colnames(placebo_baseline_eset), c(validation_samples_fold1,
                                                                          validation_samples_fold2,
                                                                          validation_samples_fold3))
  validation_features_fold4 <- placebo_features_class[validation_samples_fold4,] %>%
      dplyr::select(-c(sex,site))
  train_samples_fold4 <- setdiff(colnames(placebo_baseline_eset), validation_samples_fold3)
  train_features_fold4 <- placebo_features_class[train_samples_fold4,] %>%
      dplyr::select(-c(sex,site))
  fold4[[i]] <- xgb_train_and_cv_on_selected_features(train_features = train_features_fold4,
                                                      validation_features = validation_features_fold4,
                                                      features_to_test = fold1[[i]]$downselected_xgb_results$feature_names)
  names(fold4)[i] <- paste0("run ", i)
  toc()
  }
toc()
validation_res_list <- c()
for(i in 1:length(fold4)){
  validation_res_list[[i]] <- bind_rows(fold1[[i]]$validation_results %>%
                                          mutate(fold = 1) %>%
                                          dplyr::select(fold, everything()),
                                        fold2[[i]]$validation_results %>%
                                          mutate(fold = 2) %>%
                                          dplyr::select(fold, everything()),
                                        fold3[[i]]$validation_results %>%
                                          mutate(fold = 3) %>%
                                          dplyr::select(fold, everything()),
                                        fold4[[i]]$validation_results %>%
                                          mutate(fold = 4) %>%
                                          dplyr::select(fold, everything()))
  }
names(validation_res_list) <- paste0("run ", 1:length(fold4))
validation_res_df <- bind_rows(validation_res_list, .id="run")

validation_summarized_res_df <- validation_res_df %>%
  dplyr::select(run, Features, AUC, Accuracy) %>%
  group_by(run, Features) %>%
  dplyr::summarize(mean_cv_AUC = mean(AUC),
            sd_cv_AUC = sd(AUC),
            mean_cv_accuracy = mean(Accuracy),
            sd_cv_accuracy = sd(Accuracy)) %>%
  arrange(desc(mean_cv_AUC))