KSPZV1 Complex Heatmaps Δ (postvax/baseline), JCI Revision

Objective

The following code generates the unsupervised hierarchical clustering heatmap for the KSPZV1 dataset using the ComplexHeatmap package. Clustering analysis is performed to test for statistical differences between Sample Clusters (SC) for outcomes as well as relevant variables (specifically SC2 and SC1,3,4). In addition, row clustering generates Gene Clusters (GC), and GSEA using blood transcription modules is then applied to genes within GC4 ranked by variance.

Final options for delta: allgroups filter pre-seqmonk < 7.5M MADS = 25% clustering: ward.D2 and euclidean, 4 clusters for columns, 5 clusters for genes

This script reproduces Figures 4B, S5A, 4D in the revised manuscript. It includes the following changes:

Add study site (Wagai vs Siaya) to column annotations

For revision, address Reviewer’s concerns that generation of clusters in heatmap is random by generating heatmap using random seed to show that the clustering is deterministic. (Note for the delta heatmap, Ward D2, a hierarchical agglomerative method, was used.)

library(ComplexHeatmap)
library(miscTools)
library(edgeR)
library(Biobase)
library(tidyverse)
library(googledrive)
library(data.table)
library(ggpubr)
library(fgsea)
library(devtools)
library(knitr)
figdir <- "/Volumes/GoogleDrive/My Drive/Manuscripts/KSPZV1 Manuscript/Med Revised Submission 2022/Figures/Revised Figure Panels/"

myBatch <- "both" 
myGroup <- "allGroups" 
myTimepoint <- "delta" 
ClusterCols <- TRUE
col.hclust.method <- "ward.D2" 
col.dist.method <- "euclidean" 
row.hclust.method <- "ward.D2" 
row.dist.method <- "euclidean" 
PCT <- 25 
filter.type <- paste("mads",PCT,"pct",sep="")
noColClust <- 4
noRowClust <- 5

Load ExpressionSet

#local path: "/Volumes/GoogleDrive/My Drive/Tran Lab Shared/Projects/Doris Duke PfSPZ Kenya/Tuan PfSPZ/KenyaPfSPZ/Final Data Files/KSPZV1 logCPM expression sets for visualization/PfSPZ_cpm_ExpressionSet_230x23768_allGroups_bothBatches_delta_rmBatchFX_06082021_TMT.rds"
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("17xx0YaggLiyWdJc9rqA1nkPSE7dj05p0"), path = temp, overwrite = TRUE)
x <- readRDS(file = dl$local_path)
x$treat <- factor(x$treat, levels = c("Placebo", "4.5 x 10^5 PfSPZ", "9.0 x 10^5 PfSPZ", "1.8 x 10^6 PfSPZ"))

Add additional baseline data

#CBC data
#local path: "/Volumes/GoogleDrive/My Drive/Tran Lab Shared/Projects/Doris Duke PfSPZ Kenya/Tuan PfSPZ/KenyaPfSPZ/CBC baseline with days before VAX1.xlsx"
temp <- tempfile(fileext = ".xlsx")
dl <- drive_download(
  as_id("1df0pHJAlFteRIcQuJql3U2IZFe5SpZdB"), path = temp, overwrite = TRUE)
CBCdat <- readxl::read_excel(dl$local_path) %>%
  mutate(DaysPriorVax1 = as.integer(gsub(" \\(Dose 1\\)", "", .$`Days Post Vaccination`))*-1,
         Hemoglobin = as.numeric(Hemoglobin)) %>%
  dplyr::select(PATID, DaysPriorVax1, Hemoglobin, Platelets, WBC, Neutrophils, Percent.Neutrophils)
pData(x) <- pData(x) %>%
  left_join(., CBCdat, by = "PATID")

## weight data
#local path: "/Volumes/GoogleDrive/My Drive/Tran Lab Shared/Projects/Doris Duke PfSPZ Kenya/CDC PfSPZ Trial Results/KSPZV1_anthro_2021-04-12.csv"

temp <- tempfile(fileext = ".csv")
dl <- drive_download(
  as_id("1hvjTuBKWIYKmDrOdiogGqcHPJUUPZM4P"), path = temp, overwrite = TRUE)
WeightDat <- read_csv(dl$local_path) %>%
  dplyr::select(PATID, zwei, zwfl, zbmi) #for definitions: https://cran.r-project.org/web/packages/anthro/anthro.pdf
pData(x) <- pData(x) %>%
  left_join(., WeightDat, by = "PATID") 

#add pre-vax cluster information
#local path: "/Volumes/GoogleDrive/My Drive/Tran Lab Shared/Projects/Doris Duke PfSPZ Kenya/Tuan PfSPZ/KenyaPfSPZ/Final Data Files/PfSPZ_pData_cluster_preSEQMonk75e5cpm_mads100pct_pam_euclidean_Clust_4_Cols_zscored_0_allGroups_244x24197.csv"

temp <- tempfile(fileext = ".csv")
dl <- drive_download(
  as_id("1LtD4PeZZ34dCnghGBikY9wTs9CtsebKo"), path = temp, overwrite = TRUE)
pData(x) <- read_csv(dl$local_path) %>%
  dplyr::select(PATID, Cluster) %>%
  dplyr::rename(PrevaxCluster = Cluster) %>%
  left_join(pData(x), ., by = "PATID")
rownames(pData(x)) <- pData(x)$PATID
all.pDat <- pData(x)

## Filtering features
#for delta, MADS 25% was used
dim(x)

## Features  Samples 
##    23768      230

if(PCT < 100){
  madsno <- as.integer(nrow(x)*(PCT/100))
  mads <- apply(exprs(x), 1, mad)  #mad filtering
  x <- x[mads>sort(mads, decr=TRUE)[madsno],]
}
dim(x)

## Features  Samples 
##     5941      230

Make annotation dataframe and then build heatmap annotation object using HeatMapAnnotation function

For details on how to make annotations, refer to: https://jokergoo.github.io/ComplexHeatmap-reference/book/

#full
annot.df <- as.data.frame(cbind(
  "Site" = as.character(x$site),
  "Gender" = as.character(factor(x$SEX, levels = c("M","F"), labels = c("male","female"))),
  "Age" = as.character(x$age.vax1),
  "Pf at first vax" = as.character(factor(x$mal.vax.1, levels = c(0,1), labels = c("neg","pos"))),
  "# Pf episodes during vax" = as.character(x$mal.dvax.tot),
  "CSP IgG pre-vax" = as.character(log10(x$pfcsp_pre+1)),
  "CSP IgG response (log2 fold change)" = as.character(x$log2FC_CSPAb),
  "Days to first parasitemia post-vax" = as.character(x$tte.mal.atp.6),
  "Outcome" = as.character(factor(x$mal.atp.3, levels = c(0,1), labels = c("protected","unprotected"))),
  "Dose" = as.character(x$treat)
  ))
annot.df <- rev(annot.df)
for(i in c(1:ncol(annot.df))){
  annot.df[,i] <- as.character(annot.df[,i])
  }
annot.df$Age <- as.numeric(annot.df$Age)
annot.df$`Days to first parasitemia post-vax` <- as.numeric(annot.df$`Days to first parasitemia post-vax`)
annot.df$`# Pf episodes during vax` <- as.numeric(annot.df$`# Pf episodes during vax`)
annot.df$`CSP IgG response (log2 fold change)` <- as.numeric(annot.df$`CSP IgG response (log2 fold change)`)
annot.df$`CSP IgG pre-vax` <- as.numeric(annot.df$`CSP IgG pre-vax`)
rownames(annot.df) <- colnames(x)
clab <- list(
  "Site" = c("Wagai" = "#B2DF8A", "Siaya" = "#33A02C"),
  "Gender" = c("male" = "#E5E5E5", "female" = "#191919"),
  "Age" = circlize::colorRamp2(c(min(all.pDat$age.vax1, na.rm=T), max(all.pDat$age.vax1, na.rm=T)), c("#E5E5E5", "#333333")),
  "Pf at first vax" = c("neg" = "#D1D3D3", "pos" = "#b2182b"),
  "# Pf episodes during vax" = circlize::colorRamp2(c(min(all.pDat$mal.dvax.tot, na.rm=T), max(all.pDat$mal.dvax.tot, na.rm=T)),
                                                    c("#fee5d9", "#a50f15")),
  "CSP IgG pre-vax" = circlize::colorRamp2(c(min(log10(all.pDat$pfcsp_pre+1), na.rm=T), max(log10(all.pDat$pfcsp_pre+1), na.rm=T)), c("#f2f0f7", "#54278f")),
  "CSP IgG response (log2 fold change)" = circlize::colorRamp2(c(-max(abs(all.pDat$log2FC_CSPAb), na.rm=T), 0, max(abs(all.pDat$log2FC_CSPAb), na.rm=T)), c("#656566", "white", "#54278f")),
  "Days to first parasitemia post-vax" = circlize::colorRamp2(c(min(all.pDat$tte.mal.atp.6, na.rm=T), max(all.pDat$tte.mal.atp.6, na.rm=T)), c("#d4defc", "#112663")),
  "Outcome" = c("protected" = "#1F78B4", "unprotected" = "#A6CEE3"),
  "Dose" = c("Placebo" = "#808080", "4.5 x 10^5 PfSPZ" = "#fdcc8a", "9.0 x 10^5 PfSPZ" = "#fc8d59", "1.8 x 10^6 PfSPZ" = "#d7301f")
  )
clab <- rev(clab)
ha <- HeatmapAnnotation(df = annot.df, col = clab, na_col = "lavender", annotation_name_gp = gpar(fontsize = 7),
                        simple_anno_size = unit(0.08, "inches"), #this adjusts the height of the column annotations
                        annotation_name_side = "left")

Scaling

colnames(x) <- colnames(exprs(x))
unscaled_logCPM <- x
ScaleData <- FALSE
if(myTimepoint == "delta"){
  myScaleData <- "logFC"
  myHeatlegend.title <- "log2 fold-change over pre-vax"
}

Clustering and Distance options

This chunk takes a while if you run the full gene set so be patient.

#https://jokergoo.github.io/ComplexHeatmap-reference/book/a-single-heatmap.html
#See options for variables
#set parallel backend.  
library(parallel)
library(parallelMap)
parallelStartSocket(cpus = detectCores())
#use random seed to access stability of clustering motivated by reviewer comment
myseed <- sample(1:5000,1) #sample random integer
set.seed(myseed) #set random seed

hclust.col <- cutree(stats::hclust(dist(t(exprs(x)), method = col.dist.method), method = col.hclust.method), k = noColClust)
hclust.row <- cutree(stats::hclust(dist(exprs(x), method = row.dist.method), method = row.hclust.method), k = noRowClust)

Build heatmap object

x$treat <- droplevels(x$treat)
x$dummytreatlevels <- factor(x$treat, levels = c("Placebo",
                                                   "4.5 x 10^5 PfSPZ",
                                                   "9.0 x 10^5 PfSPZ",
                                                   "1.8 x 10^6 PfSPZ"),
                               labels = c(1,2,3,4))
hm <- Heatmap(exprs(x),
              name = "mat",
              column_title = "subjects",
              column_title_side = "bottom",
              show_row_dend = FALSE,
              row_title = paste(filter.type, nrow(x), "genes"),
              row_title_side = "left",
              row_names_side = "left",
              row_names_gp = gpar(fontsize = 8),
              row_names_max_width = unit(9, "cm"),
              cluster_columns = TRUE,
              cluster_rows = TRUE,
              column_split = paste(x$dummytreatlevels, hclust.col, sep = "_"),
              row_split = paste("gene cluster ",hclust.row, sep = "_"),
              show_row_names = FALSE,
              top_annotation = ha,
              heatmap_legend_param = list(title = myHeatlegend.title, color_bar = "continuous", legend_direction = "horizontal"),
              show_column_names=FALSE,
              cluster_column_slices = FALSE,
              cluster_row_slices = FALSE,
              height = unit(3, "inches"),
              width = unit(10, "inches")
              )

Print heatmap (Figure 4A) and extract cluster information

see https://github.com/jokergoo/ComplexHeatmap/issues/136

Note: may have to run this chunk directly in console if you get the following error: Error: The width or height of the raster image is zero, maybe you forget to turn off the previous graphic device or it was corrupted. Run dev.off() to close it.

hm <- draw(hm)

c.dend <- column_dend(hm)  #Extract col dendrogram
ccl.list <- column_order(hm)  #Extract clusters (output is a list)
#lapply(ccl.list, function(x) length(x))  #check/confirm size clusters

#loop to extract samples for each cluster.
for (i in 1:length(column_order(hm))){
  if (i == 1) {
    clu <- t(t(colnames(exprs(x)[,column_order(hm)[[i]], drop = FALSE]))) #one cluster had a single column, so have to add drop = FALSE argument to prevent the matrix from becoming a vector
    out <- cbind(clu, paste("cluster", i, sep=""))
    colnames(out) <- c("SampleID", "Cluster")
    } else {
      clu <- t(t(colnames(exprs(x)[,column_order(hm)[[i]], drop = FALSE])))
      clu <- cbind(clu, paste("cluster", i, sep=""))
      out <- rbind(out, clu)
    }
}

## Merge with pData(x)
pData.cluster <- pData(x) %>%
  rownames_to_column(var = "SampleID") %>%
  left_join(., as_tibble(out), by = "SampleID") %>%
  column_to_rownames(var = "SampleID") %>%
  mutate(Cluster = gsub("cluster", "SC", .$Cluster)) %>%
  mutate(Cluster.pam4.cl16 = Cluster) %>%
  mutate(Cluster = gsub("6", "2", .$Cluster)) %>%
  mutate(Cluster = gsub("10", "2", .$Cluster)) %>%
  mutate(Cluster = gsub("14", "2", .$Cluster)) %>%
  mutate(Cluster = ifelse(.$Cluster != "SC2", "SC1,3,4", .$Cluster)) %>%
  filter(Cluster == "SC2" | Cluster == "SC1,3,4")

pData.cluster <- pData.cluster %>%
  mutate(Cluster2 = ifelse(Cluster.pam4.cl16 == "SC13", "SC1", "TBD")) %>% #clusters 13-16 are SC1-4 in 1.8x10^6 clusters on Figure S5A
  mutate(Cluster2 = ifelse(Cluster.pam4.cl16 == "SC14", "SC2", Cluster2)) %>%
  mutate(Cluster2 = ifelse(Cluster.pam4.cl16 == "SC15", "SC3", Cluster2)) %>%
  mutate(Cluster2 = ifelse(Cluster.pam4.cl16 == "SC16", "SC4", Cluster2)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC1", "SC1.1", "TBD")) %>% #clusters 1-4 are SC1-4 (Placebo) on Figure S2A
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC2", "SC2.1", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC3", "SC3.1", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC4", "SC4.1", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC5", "SC1.2", Cluster3)) %>% #clusters 5-8 are SC1-4 (4.5x10^5) on Figure S2A
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC6", "SC2.2", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC7", "SC3.2", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC8", "SC4.2", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC9", "SC1.3", Cluster3)) %>% #clusters 9-12 are SC1-4 (9.0x10^5) on Figure S2A
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC10", "SC2.3", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC11", "SC3.3", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC12", "SC4.3", Cluster3)) %>%  
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC13", "SC1.4", Cluster3)) %>% #clusters 13-16 are SC1-4 (1.8x10^6) on Figure S2A
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC14", "SC2.4", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC15", "SC3.4", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC16", "SC4.4", Cluster3)) %>%
  mutate(Cluster3 = factor(Cluster3,
                           levels = c("SC1.1", "SC2.1", "SC3.1", "SC4.1",
                                      "SC1.2", "SC2.2", "SC3.2", "SC4.2",
                                      "SC1.3", "SC2.3", "SC3.3", "SC4.3",
                                      "SC1.4", "SC2.4", "SC3.4", "SC4.4"))) %>%
  droplevels()
#sanity checks
table(pData.cluster$Cluster2,pData.cluster$treat)

##      
##       Placebo 4.5 x 10^5 PfSPZ 9.0 x 10^5 PfSPZ 1.8 x 10^6 PfSPZ
##   SC1       0                0                0               28
##   SC2       0                0                0               13
##   SC3       0                0                0               17
##   SC4       0                0                0                3
##   TBD      61               55               53                0

table(pData.cluster$Cluster3,pData.cluster$treat)

##        
##         Placebo 4.5 x 10^5 PfSPZ 9.0 x 10^5 PfSPZ 1.8 x 10^6 PfSPZ
##   SC1.1      35                0                0                0
##   SC2.1      10                0                0                0
##   SC3.1      14                0                0                0
##   SC4.1       2                0                0                0
##   SC1.2       0               36                0                0
##   SC2.2       0                1                0                0
##   SC3.2       0               12                0                0
##   SC4.2       0                6                0                0
##   SC1.3       0                0               25                0
##   SC2.3       0                0                7                0
##   SC3.3       0                0               17                0
##   SC4.3       0                0                4                0
##   SC1.4       0                0                0               28
##   SC2.4       0                0                0               13
##   SC3.4       0                0                0               17
##   SC4.4       0                0                0                3

#setdiff(colnames(x), rownames(pData.cluster))
#setdiff(rownames(pData.cluster), colnames(x))

#sanity checks
table(pData.cluster$Cluster,pData.cluster$treat)

##          
##           Placebo 4.5 x 10^5 PfSPZ 9.0 x 10^5 PfSPZ 1.8 x 10^6 PfSPZ
##   SC1,3,4      51               54               46               48
##   SC2          10                1                7               13

table(pData.cluster$Cluster.pam4.cl16,pData.cluster$treat)

##       
##        Placebo 4.5 x 10^5 PfSPZ 9.0 x 10^5 PfSPZ 1.8 x 10^6 PfSPZ
##   SC1       35                0                0                0
##   SC10       0                0                7                0
##   SC11       0                0               17                0
##   SC12       0                0                4                0
##   SC13       0                0                0               28
##   SC14       0                0                0               13
##   SC15       0                0                0               17
##   SC16       0                0                0                3
##   SC2       10                0                0                0
##   SC3       14                0                0                0
##   SC4        2                0                0                0
##   SC5        0               36                0                0
##   SC6        0                1                0                0
##   SC7        0               12                0                0
##   SC8        0                6                0                0
##   SC9        0                0               25                0

Plot contingency table and stats for Figure S5A

First plot 2x2 contingency for P and NP for Placebo SC2 and 1.8x10^6 PfSPZ SC2 and perform Fisher’s Exact as per original.
To address reviewer concerns, plot 2x4 contingency for P and NP in SC1, SC2, SC3, SC4. Perform chi-square as per reviewer suggestion.

placebo_highdose_SC2 <- pData.cluster %>%
  filter(treat == "1.8 x 10^6 PfSPZ" | treat == "Placebo") %>%
  filter(Cluster == "SC2") %>%
  mutate(Cluster3 = gsub(".1","_placebo", Cluster3)) %>%
  mutate(Cluster3 = gsub(".4","_highdose_PfSPZ", Cluster3)) %>% 
  mutate(Cluster3 = factor(Cluster3, levels = c("SC2_placebo", "SC2_highdose_PfSPZ"))) %>%
  mutate(outcome = ifelse(mal.atp.3 == 1, "NP", "P")) %>% #mal.atp.3 == 1 means malaria infected at least once during 3 months (according to protocol)
  droplevels()
#original Fisher's exact test between SC2 and SC1,2,4
my_tab <- table(placebo_highdose_SC2$outcome, placebo_highdose_SC2$Cluster3)
my_tab

##     
##      SC2_placebo SC2_highdose_PfSPZ
##   NP           7                  2
##   P            3                 11

fisher.test(my_tab)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  my_tab
## p-value = 0.01306
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##    1.27226 167.87678
## sample estimates:
## odds ratio 
##   11.11222

#reviewer suggested chi-square between SC1, SC2, SC3, SC4 and placebo vs high-dose
placebo_highdose_allSCs <- pData.cluster %>%
  filter(treat == "1.8 x 10^6 PfSPZ" | treat == "Placebo") %>%
  mutate(Cluster3 = gsub("\\.1","_placebo", Cluster3)) %>%
  mutate(Cluster3 = gsub("\\.4","_highdose_PfSPZ", Cluster3)) %>% 
  mutate(Cluster3 = factor(Cluster3, levels = c("SC1_placebo", "SC2_placebo", "SC3_placebo", "SC4_placebo",
                                                "SC1_highdose_PfSPZ", "SC2_highdose_PfSPZ", "SC3_highdose_PfSPZ", "SC4_highdose_PfSPZ"))) %>%
  mutate(outcome = ifelse(mal.atp.3 == 1, "NP", "P")) %>% #mal.atp.3 == 1 means malaria infected at least once during 3 months (according to protocol)
  droplevels()
my_tab <- table(placebo_highdose_allSCs$outcome, placebo_highdose_allSCs$Cluster3)
my_tab

##     
##      SC1_placebo SC2_placebo SC3_placebo SC4_placebo SC1_highdose_PfSPZ
##   NP          16           7          11           2                 13
##   P           19           3           3           0                 15
##     
##      SC2_highdose_PfSPZ SC3_highdose_PfSPZ SC4_highdose_PfSPZ
##   NP                  2                  8                  1
##   P                  11                  9                  2

chisq.test(my_tab)

## 
##  Pearson's Chi-squared test
## 
## data:  my_tab
## X-squared = 15.166, df = 7, p-value = 0.03393

Results remain significant.

#perform chi-square for P vs NP distribution across ALL 16 clusters
#not requested by reviewer but did just for fun
foo <- pData.cluster %>%
  mutate(outcome = ifelse(mal.atp.3 == 1, "NP", "P")) %>% #mal.atp.3 == 1 means malaria infected at least once during 3 months (according to protocol)
  droplevels()

my_tab <- table(foo$outcome, foo$Cluster3)
my_tab

##     
##      SC1.1 SC2.1 SC3.1 SC4.1 SC1.2 SC2.2 SC3.2 SC4.2 SC1.3 SC2.3 SC3.3 SC4.3
##   NP    16     7    11     2    20     1     7     0    10     2    13     0
##   P     19     3     3     0    16     0     5     6    15     5     4     4
##     
##      SC1.4 SC2.4 SC3.4 SC4.4
##   NP    13     2     8     1
##   P     15    11     9     2

chisq.test(my_tab)

## 
##  Pearson's Chi-squared test
## 
## data:  my_tab
## X-squared = 33.963, df = 15, p-value = 0.003446

Results are significant.

Plot stats based on clusters

library(tidyverse)
basefont <- "sans"
basefontsize <- 9
pvalfontsize <- 7
symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1), symbols = c("****", "***", "**", "*", "trend",""))
pval_label <- "p.format"
adjust_p <- "adjp"

#OUTCOME
foo1 <- c(1,2,3,4)  
names(foo1) <-  names(summary(pData.cluster$treat))
for(i in names(summary(pData.cluster$treat))){
  pData.cluster.temp <- pData.cluster %>%
    filter(treat == i)
  foo1[i] <- signif(fisher.test(table("Cluster" = pData.cluster.temp$Cluster, "Outcome" = pData.cluster.temp$mal.atp.3))$p.value,2)
    if(pval_label == "p.format" & adjust_p == "adjp"){
      foo1 <- p.adjust(foo1, method = "BH")
    }
    if(pval_label == "p.signif" ){
      foo1[i] <- ifelse(foo1[i] >= 0.05, "",
                        ifelse(foo1[i] >= 0.01 & foo1[i] < 0.05, "*",
                               ifelse(foo1[i] >= 0.001 & foo1[i] < 0.01, "**",
                                      ifelse(foo1[i] >= 0.0001 & foo1[i] < 0.001, "***", "****"))))
    }
    }
dat_text <- data.frame(treat = factor(levels(pData.cluster$treat),
                                      levels = levels(pData.cluster$treat)),
                       label = foo1, Cluster = levels(factor(pData.cluster.temp$Cluster)), x = 1.5, y = 25,
                       Outcome = c("not protected (infected)", "protected (uninfected)"))

Outcome.counts <- pData.cluster %>%
  mutate(Cluster = factor(Cluster)) %>%
  mutate(Outcome = factor(mal.atp.3, levels = c(1,0), labels = c("not protected (infected)", "protected (uninfected)"))) %>%
  ggplot(., aes(x=Cluster, fill = Outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  ylab("Outcome (count)") +
  facet_wrap(.~treat, ncol = 4) +
  theme_classic(base_family = basefont, base_size = basefontsize) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.x=element_blank(),
        strip.background = element_rect(fill="white"),
        legend.position="none"
        ) +
  geom_text(
    data = dat_text,
    mapping = aes(x = x, y = y, label = label))

#Days to first parasitemia
TTE.boxplot.dat <- pData.cluster %>%
  mutate(Outcome = factor(mal.atp.3, levels = c(1,0), labels = c("not protected (infected)", "protected (uninfected)"))) %>%
  dplyr::filter(!(is.na(tte.mal.atp.6) | is.na(Outcome)))

TTE.boxplot.stats <-  TTE.boxplot.dat %>% 
  filter(!(treat == "4.5 x 10^5 PfSPZ" & Cluster == "SC2")) %>%
  group_by(treat, Cluster) %>%
  do(w = wilcox.test(tte.mal.atp.6~Outcome, data=., paired=FALSE, exact = FALSE)) %>%
  summarize(treat, Cluster, Wilcox_pval = w$p.value) %>%
  mutate(padj = p.adjust(Wilcox_pval, method= "BH")) %>%
  mutate_if(is.numeric, signif, 2)

TTE.boxplot <- TTE.boxplot.dat %>%
  ggplot(., aes(Cluster, tte.mal.atp.6, fill = Outcome, color = Outcome)) +
  geom_point(position = position_jitterdodge()) +
  geom_boxplot(alpha = 0.4, color = "grey30", outlier.shape = NA) +
  stat_compare_means(aes(group = Outcome), label = pval_label,
                     method = "wilcox.test", label.x.npc = "center", vjust = 1, symnum.args = symnum.args) +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  scale_color_manual(values = c("#A6CEE3", "#1F78B4")) +
  ylab("Days to first parasitemia") +
  ylim(c(0,180)) +
  facet_wrap(~treat, ncol = 4) +
  theme_classic(base_family = basefont, base_size = basefontsize) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.x=element_blank(),
        strip.background = element_rect(fill="white"),
        legend.position="none"
        )

#CSP-specific IgG at pre-vax
CSPbl.boxplot.dat <- pData.cluster %>%
  mutate(Outcome = factor(mal.atp.3, levels = c(1,0), labels = c("not protected (infected)", "protected (uninfected)"))) %>%
  dplyr::filter(!(is.na(pfcsp_pre) | is.na(Outcome)))

CSPbl.boxplot.stats <- CSPbl.boxplot.dat %>% 
  filter(!(treat == "4.5 x 10^5 PfSPZ" & Cluster == "SC2")) %>%
  group_by(treat, Cluster) %>%
  do(w = wilcox.test(pfcsp_pre~Outcome, data=., paired=FALSE, exact = FALSE)) %>%
  summarize(treat, Cluster, Wilcox_pval = w$p.value) %>%
  mutate(padj = p.adjust(Wilcox_pval, method= "BH")) %>%
  mutate_if(is.numeric, signif, 2)

CSPbl.boxplot <- CSPbl.boxplot.dat %>% 
  ggplot(., aes(Cluster, log10(pfcsp_pre+1), fill = Outcome, color = Outcome)) +
  geom_point(position = position_jitterdodge()) +
  geom_boxplot(alpha = 0.4, color = "grey30", outlier.shape = NA) +
     stat_compare_means(aes(group = Outcome), label = pval_label,
                        method = "wilcox.test", label.x.npc = "center", vjust = 1, symnum.args = symnum.args) +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  scale_color_manual(values = c("#A6CEE3", "#1F78B4")) +
  ylab("CSP-specific IgG at pre-vax") +
  facet_wrap(~treat, ncol = 4) +
  theme_classic(base_family = basefont, base_size = basefontsize) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.x=element_blank(),
        strip.text.x = element_blank(),
        strip.background = element_blank(),
        legend.position="none"
        )

#CSP-specific IgG log2FC
CSPlogfc.boxplot.dat <- pData.cluster %>%
  mutate(Outcome = factor(mal.atp.3, levels = c(1,0), labels = c("not protected (infected)", "protected (uninfected)"))) %>%
  dplyr::filter(!(is.na(log2FC_CSPAb) | is.na(Outcome)))
                
CSPlogfc.boxplot.stats <-  CSPlogfc.boxplot.dat %>% 
  filter(!(treat == "4.5 x 10^5 PfSPZ" & Cluster == "SC2")) %>%
  group_by(treat, Cluster) %>%
  do(w = wilcox.test(log2FC_CSPAb~Outcome, data=., paired=FALSE, exact = FALSE)) %>%
  summarize(treat, Cluster, Wilcox_pval = w$p.value) %>%
  mutate(padj = p.adjust(Wilcox_pval, method= "BH")) %>%
  mutate_if(is.numeric, signif, 2)
                
CSPlogfc.boxplot <- CSPlogfc.boxplot.dat %>% 
  ggplot(., aes(x = Cluster, y = log2FC_CSPAb, fill = Outcome, color = Outcome)) +
  geom_point(position = position_jitterdodge()) +
  geom_boxplot(alpha = 0.4, color = "grey30", outlier.shape = NA) +
  stat_compare_means(aes(group = Outcome), label = pval_label,
                     method = "wilcox.test", label.x.npc = "center", vjust = 1, symnum.args = symnum.args) +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  scale_color_manual(values = c("#A6CEE3", "#1F78B4")) +
  ylab("CSP-specific IgG (log fold change)") +
  ylim(c(-12.5,25)) +
  facet_wrap(~treat, ncol = 4) +
  theme_classic(base_family = basefont, base_size = basefontsize) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.x=element_blank(),
        strip.text.x = element_blank(),
        strip.background = element_blank(),
        legend.position="none"
        )

#Pf at first vax
foo1 <- c(1,2,3,4)  
names(foo1) <-  names(summary(pData.cluster$treat))
pData.cluster <- pData.cluster %>%
  mutate(Cluster = factor(Cluster)) %>%
  mutate(Pf = factor(mal.vax.1, levels = c(1,0), labels = c("pos", "neg")))
for(i in names(summary(pData.cluster$treat))){
  pData.cluster.temp <- pData.cluster %>%
    filter(treat == i)
  foo1[i] <- signif(fisher.test(table("Cluster" = pData.cluster.temp$Cluster, "Pf at Vax1" = pData.cluster.temp$Pf))$p.value,2)
  if(pval_label == "p.format" & adjust_p == "adjp"){
    foo1 <- p.adjust(foo1, method = "BH")
    }
  if(pval_label == "p.signif" ){
    foo1[i] <- ifelse(foo1[i] >= 0.05, "",
                      ifelse(foo1[i] >= 0.01 & foo1[i] < 0.05, "*",
                             ifelse(foo1[i] >= 0.001 & foo1[i] < 0.01, "**",
                                    ifelse(foo1[i] >= 0.0001 & foo1[i] < 0.001, "***", "****"))))
  }
  }

dat_text <- data.frame(treat = factor(levels(pData.cluster$treat), levels = levels(pData.cluster$treat)), label = foo1, Cluster = levels(factor(pData.cluster.temp$Cluster)),
                           x = 1.5, y = 45,
                           Pf = c("pos","neg"))              

Pf.counts <-  pData.cluster %>%
  ggplot(., aes(x=Cluster, fill = Pf)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("#b2182b", "#D1D3D3")) +
  ylab("Pf at first vax (counts)") +
  facet_wrap(~treat, ncol = 4) +
  theme_classic(base_family = basefont, base_size = basefontsize) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.x=element_blank(),
        strip.text.x = element_blank(),
        strip.background = element_blank(),
        legend.position="none"
        ) +
  geom_text(data = dat_text,mapping = aes(x = x, y = y, label = label))

#Pf episodes during vax period
mal.dvax.tot.boxplot.dat <- pData.cluster %>%
  mutate(Outcome = factor(mal.atp.3, levels = c(1,0), labels = c("not protected (infected)", "protected (uninfected)")))

mal.dvax.tot.boxplot.stats <- mal.dvax.tot.boxplot.dat %>% 
  filter(!(treat == "4.5 x 10^5 PfSPZ" & Cluster == "SC2")) %>%
  group_by(treat, Cluster) %>%
  do(w = wilcox.test(mal.dvax.tot~Outcome, data=., paired=FALSE, exact = FALSE)) %>%
  summarize(treat, Cluster, Wilcox_pval = w$p.value) %>%
  mutate(padj = p.adjust(Wilcox_pval, method= "BH")) %>%
  mutate_if(is.numeric, signif, 2)

mal.dvax.tot.boxplot <- mal.dvax.tot.boxplot.dat %>%
  ggplot(., aes(Cluster, mal.dvax.tot, fill = Outcome, color = Outcome)) +
  geom_point(position = position_jitterdodge()) +
  geom_boxplot(alpha = 0.4, color = "grey30", outlier.shape = NA) +
  stat_compare_means(aes(group = Outcome), label = pval_label,
                     method = "wilcox.test", label.x.npc = "center", vjust = 1, symnum.args = symnum.args) +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  scale_color_manual(values = c("#A6CEE3", "#1F78B4")) +
  ylab("# Pf episodes during vax") +
  ylim(c(0,6.5)) +
  facet_wrap(~treat, ncol = 4) +
  theme_classic(base_family = basefont, base_size = basefontsize) +
  theme(axis.ticks.x=element_blank(),
        axis.title.x=element_blank(),
        strip.text.x = element_blank(),
        strip.background = element_blank(),
        legend.position="none"
        )

foo1 <- c(1,2,3,4)  
names(foo1) <-  names(summary(pData.cluster$treat))
pData.cluster <- pData.cluster %>%
  mutate(Site = factor(site))
for(i in names(summary(pData.cluster$treat))){
  pData.cluster.temp <- pData.cluster %>%
    filter(treat == i)
  foo1[i] <- signif(fisher.test(table("Cluster" = pData.cluster.temp$Cluster, "Site" = pData.cluster.temp$Site))$p.value,2)
  if(pval_label == "p.format" & adjust_p == "adjp"){
    foo1 <- p.adjust(foo1, method = "BH")
    }
  if(pval_label == "p.signif" ){
    foo1[i] <- ifelse(foo1[i] >= 0.05, "",
                      ifelse(foo1[i] >= 0.01 & foo1[i] < 0.05, "*",
                             ifelse(foo1[i] >= 0.001 & foo1[i] < 0.01, "**",
                                    ifelse(foo1[i] >= 0.0001 & foo1[i] < 0.001, "***", "****"))))
  }
  }

dat_text <- data.frame(treat = factor(levels(pData.cluster$treat), levels = levels(pData.cluster$treat)), label = foo1, Cluster = levels(factor(pData.cluster.temp$Cluster)),
                           x = 1.5, y = 25,
                           Site = c("Siaya","Wagai"))      

Site.counts <- pData.cluster %>%
  dplyr::filter(!(is.na(Cluster) | is.na(Site))) %>% 
  ggplot(., aes(x= Cluster, fill = Site)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("#33A02C","#B2DF8A")) +
  ylab("Site (counts)") +
  facet_wrap(~treat, ncol = 4) +
  theme_classic(base_family = basefont, base_size = basefontsize) +
  theme(axis.ticks.x=element_blank(),
        axis.title.x=element_blank(),
        strip.text.x = element_blank(),
        strip.background = element_blank(),
        legend.position="none"
        ) +
  geom_text(data = dat_text,mapping = aes(x = x, y = y, label = label, size = basefontsize))

Print column annotation bar and box plots (Figure S5)

Note that the boxplots here show unadjusted p values, whereas the count plots show adjusted p values. Figure S5 in the manuscript shows BH-adjusted p values for all. BH-adjusted p values for the boxplots can be accessed from the objects with the “.stats” suffix.

print(ggarrange(
  Outcome.counts,
  TTE.boxplot,
  CSPbl.boxplot,
  CSPlogfc.boxplot,
  Pf.counts,
  mal.dvax.tot.boxplot,
  Site.counts,
  ncol = 2,
  nrow = 4)
)

Examine SC2 Placebo vs SC2 1.8x10^6 PfSPZ

pval_label <- "p.format"
adjust_p <- "adjp"

sc2_subset <- pData.cluster %>%
  filter(Cluster == "SC2") %>%
  filter(treat == "Placebo" | treat == "1.8 x 10^6 PfSPZ") %>%
  droplevels() %>%
  mutate(treat = factor(treat)) %>%
  mutate(Outcome = factor(mal.atp.3, levels = c(1,0), labels = c("not protected (infected)", "protected (uninfected)")))

#OUTCOME SC2 Placebo vs SC2 1.8x10^6 PfSPZ
foo <- signif(fisher.test(table("Dosegroup" = sc2_subset$treat, "Outcome" = sc2_subset$Outcome))$p.value,2)
if(pval_label == "p.format" & adjust_p == "adjp"){
    foo_text <- paste0("p=",p.adjust(foo, method = "BH"))
    }
if(pval_label == "p.signif" ){
  foo_text <- ifelse(foo >= 0.05, "",
                     ifelse(foo >= 0.01 & foo < 0.05, "*",
                            ifelse(foo >= 0.001 & foo1[i] < 0.01, "**",
                                   ifelse(foo1[i] >= 0.0001 & foo1[i] < 0.001, "***", "****"))))
}
  dat_text <- data.frame(label = foo_text, x = 1.5, y = 9, Outcome = c("not protected (infected)", "protected (uninfected)"))

Outcome.counts <- sc2_subset  %>%
  ggplot(., aes(x=treat, fill = Outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  ylab("Outcome (count)") +
  theme_classic(base_family = "Arial", base_size = 10) +
  theme(#axis.text.x=element_text(angle = 45, vjust = -1, hjust = 0),
        axis.ticks.x=element_blank(),
        axis.title.x=element_blank(),
        strip.background = element_rect(fill="white"),
        legend.position="none"
        ) +
  geom_text(data = dat_text,
    mapping = aes(x = x, y = y, label = label))

Outcome.counts