Performing training and cross-validation on full dataset. The following code is the same as for baseline high-dose PfSPZ except it includes steps for scaling and imputing missing data for delta.
Notes: This can be run in 500 run chunks and with each chunk saved if memory is an issue. Results can later be concatenated. If using R studio, run code directly in console for better performance.
library(openxlsx)
library(xgboost)
library(dplyr)
library(tidyverse)
library(fgsea)
library(glmnet)
library(pROC)
library(aod)
library(caret)
library(limma)
library(caTools)
library(e1071)
library(robustbase)
library(Biobase)
library(doMC)
library(randomForest)
library(data.table)
library(googledrive)
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
as_id("1QI4BIKcW-5oQfHl-o3t7Hj58OaKl8Mn8"), path = temp, overwrite = TRUE)
high_dose_features_class <- readRDS(file = dl$local_path)
train_on_split_or_full_set <- "full" #options: original_split, split, full
myseed <- sample(1:5000,1)
set.seed(myseed) #set seed for reproducibility
print(paste0("Will train and cross-validate with nested feature selection on full high dose PfSPZ dataset which consists of ", ncol(high_dose_delta_eset), " samples and ", ncol(high_dose_features_class), " features."))
library(mlr)
library(mlr3)
#https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/beginners-tutorial-on-xgboost-parameter-tuning-r/tutorial/
#create learner
lrn <- makeLearner("classif.xgboost",
objective="binary:logistic",
nrounds=1000,
early_stopping_rounds = 100,
eval_metric="error",
predict.type = "response")
#set parameter space
#https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/beginners-tutorial-on-xgboost-parameter-tuning-r/tutorial/
#https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning
params <- makeParamSet(makeDiscreteParam("booster", values = c("gbtree")),
makeIntegerParam("gamma",lower = 0L,upper = 3L),
makeIntegerParam("max_depth",lower = 2L,upper = 5L),
makeNumericParam("eta",lower = 0.01,upper = 0.2),
makeNumericParam("min_child_weight",lower = 0L,upper = 8L),
makeNumericParam("subsample",lower = 0.75,upper = 0.9),
makeNumericParam("lambda",lower = 0, upper = 1),
makeNumericParam("alpha",lower = 0, upper = 1),
makeNumericParam("colsample_bytree",lower = 0.5,upper = 1))
#set resampling strategy
rdesc <- makeResampleDesc(method = "CV",
predict = "test",
iters = 4,
stratify = T)
#set parallel backend.
library(parallel)
library(parallelMap)
library(tictoc)
parallelStartSocket(cpus = detectCores())
#set options
maxiterations <- 500 #number of iterations for each run of hyperparameter tuning
runs <- 2500 #number of runs
n_folds <- 4
n_feat_sampled <- 3:7 #range for number of features sampled
ctrl <- makeTuneControlRandom(maxit = maxiterations)
The following code sources a custom function (“kspzv1 xgboost manual train and cv function.R”) to run feature selection and cross validation x times for every fold of cross-validation.
source("https://raw.githubusercontent.com/TranLab/kspzv1-systems/main/kspzv1%20xgboost%20manual%20train%20and%20cv%20function.R")
fold1 <- fold2 <- fold3 <- fold4 <- c()
tic(msg = paste0("total for ", runs, " total runs"))
for(i in 2100:runs){
run_seed <- sample(1:1e4,1)
# Splitting data in training and validation sets
set.seed(run_seed) #set seed for run
tic(msg = paste0("first fold in ", i, " run of ,", runs," total runs"))
#run first fold
split <- sample.split(colnames(high_dose_delta_eset), SplitRatio = (n_folds-1)/n_folds)
train_eset_fold1 <- high_dose_delta_eset[,split==TRUE]
train_samples_fold1 <- colnames(train_eset_fold1)
train_features_fold1 <- high_dose_features_class[split==TRUE,] %>%
dplyr::select(-c(sex,site))
validation_eset_fold1 <- high_dose_delta_eset[,split==FALSE]
validation_samples_fold1 <- colnames(validation_eset_fold1)
validation_features_fold1 <- high_dose_features_class[split==FALSE,] %>%
dplyr::select(-c(sex,site))
#following if else statement checks to see if split results in single-level outcome in validation set. Does this 2 times (just in case!).
if(nlevels(validation_features_fold1$class)==1){
run_new_seed <- sample(1e4:2e4,1)
# Splitting data in training and validation sets
set.seed(run_new_seed) #set seed for run
split <- sample.split(colnames(high_dose_delta_eset), SplitRatio = (n_folds-1)/n_folds)
train_eset_fold1 <- high_dose_delta_eset[,split==TRUE]
train_samples_fold1 <- colnames(train_eset_fold1)
train_features_fold1 <- high_dose_features_class[split==TRUE,] %>%
dplyr::select(-c(sex,site))
validation_eset_fold1 <- high_dose_delta_eset[,split==FALSE]
validation_samples_fold1 <- colnames(validation_eset_fold1)
validation_features_fold1 <- high_dose_features_class[split==FALSE,] %>%
dplyr::select(-c(sex,site))
}
if(nlevels(validation_features_fold1$class)==1){
run_newer_seed <- sample(2e4:3e4,1)
# Splitting data in training and validation sets
set.seed(run_newer_seed) #set seed for run
split <- sample.split(colnames(high_dose_delta_eset), SplitRatio = (n_folds-1)/n_folds)
train_eset_fold1 <- high_dose_delta_eset[,split==TRUE]
train_samples_fold1 <- colnames(train_eset_fold1)
train_features_fold1 <- high_dose_features_class[split==TRUE,] %>%
dplyr::select(-c(sex,site))
validation_eset_fold1 <- high_dose_delta_eset[,split==FALSE]
validation_samples_fold1 <- colnames(validation_eset_fold1)
validation_features_fold1 <- high_dose_features_class[split==FALSE,] %>%
dplyr::select(-c(sex,site))
}
fold1[[i]] <- xgb_train_and_cv(train_features = train_features_fold1, validation_features = validation_features_fold1)
names(fold1)[i] <- paste0("run ", i)
toc()
#begin second fold of cross-validation
tic(msg = paste0("second fold in ", i, " run of ", runs," total runs"))
validation_samples_fold2 <- sample(train_samples_fold1,
size = length(validation_samples_fold1))
validation_features_fold2 <- high_dose_features_class[validation_samples_fold2,] %>%
dplyr::select(-c(sex,site))
train_samples_fold2 <- setdiff(colnames(high_dose_delta_eset), validation_samples_fold2)
train_features_fold2 <- high_dose_features_class[train_samples_fold2,] %>%
dplyr::select(-c(sex,site))
fold2[[i]] <- xgb_train_and_cv_on_selected_features(train_features = train_features_fold2,
validation_features = validation_features_fold2,
features_to_test = fold1[[i]]$downselected_xgb_results$feature_names)
names(fold2)[i] <- paste0("run ", i)
toc()
#begin third fold of cross-validation
tic(msg = paste0("third fold in ", i, " run of ", runs," total runs"))
validation_samples_fold3 <- sample(setdiff(colnames(high_dose_delta_eset), c(validation_samples_fold1,
validation_samples_fold2)),
size = length(validation_samples_fold1))
validation_features_fold3 <- high_dose_features_class[validation_samples_fold3,] %>%
dplyr::select(-c(sex,site))
train_samples_fold3 <- setdiff(colnames(high_dose_delta_eset), validation_samples_fold3)
train_features_fold3 <- high_dose_features_class[train_samples_fold3,] %>%
dplyr::select(-c(sex,site))
fold3[[i]] <- xgb_train_and_cv_on_selected_features(train_features = train_features_fold3,
validation_features = validation_features_fold3,
features_to_test = fold1[[i]]$downselected_xgb_results$feature_names)
names(fold3)[i] <- paste0("run ", i)
toc()
#begin fourth fold of cross-validation
tic(msg = paste0("fourth fold in ", i, " run of ", runs," total runs"))
validation_samples_fold4 <- setdiff(colnames(high_dose_delta_eset), c(validation_samples_fold1,
validation_samples_fold2,
validation_samples_fold3))
validation_features_fold4 <- high_dose_features_class[validation_samples_fold4,] %>%
dplyr::select(-c(sex,site))
train_samples_fold4 <- setdiff(colnames(high_dose_delta_eset), validation_samples_fold3)
train_features_fold4 <- high_dose_features_class[train_samples_fold4,] %>%
dplyr::select(-c(sex,site))
fold4[[i]] <- xgb_train_and_cv_on_selected_features(train_features = train_features_fold4,
validation_features = validation_features_fold4,
features_to_test = fold1[[i]]$downselected_xgb_results$feature_names)
names(fold4)[i] <- paste0("run ", i)
toc()
}
toc()
validation_res_list <- c()
for(i in 1:length(fold4)){
validation_res_list[[i]] <- bind_rows(fold1[[i]]$validation_results %>%
mutate(fold = 1) %>%
dplyr::select(fold, everything()),
fold2[[i]]$validation_results %>%
mutate(fold = 2) %>%
dplyr::select(fold, everything()),
fold3[[i]]$validation_results %>%
mutate(fold = 3) %>%
dplyr::select(fold, everything()),
fold4[[i]]$validation_results %>%
mutate(fold = 4) %>%
dplyr::select(fold, everything()))
}
names(validation_res_list) <- paste0("run ", 1:length(fold4))
validation_res_df <- bind_rows(validation_res_list, .id="run")
validation_summarized_res_df <- validation_res_df %>%
dplyr::select(run, Features, AUC, Accuracy) %>%
group_by(run, Features) %>%
dplyr::summarize(mean_cv_AUC = mean(AUC),
sd_cv_AUC = sd(AUC),
mean_cv_accuracy = mean(Accuracy),
sd_cv_accuracy = sd(Accuracy)) %>%
arrange(desc(mean_cv_AUC))