1 Objective

This markdown doc creates Seq (count) and cpm expression sets for VRC 312 and VRC 314 data, does differential gene expression with edgeR, and outputs to fast GSEA using several different module types (BTMs, MSigDB collections, MonacoMods). Generates fgsea output tables that can be visualized using the “PfSPZ Import VRC 312 314 GSEA IPA for Figures.Rmd” script in “/Volumes/GoogleDrive/My Drive/Tran Lab Shared/Projects/Doris Duke PfSPZ Kenya/Tuan PfSPZ/KenyaPfSPZ/” or on the KSPZV1 github repo.

Here, the VRC312 and VRC314 data is combined with BSPZV1 data from another script.

Notes:

  1. For VRC 314, performed analysis on Group 1 (270K x 3 IV), Group 4 & 5 (270K x 4 IV), and Group 6 (900 x 3IV) as these were the groups with same IV dose repeated.
  2. Used -log10(p) * sign(logFC) as rank metric for GSEA.
  3. Added BSPZV1 study for comparison based on reviewer feedback.
  4. Use FDR<10% across at least 2 studies as final filter for visualization.
  5. Added CAMERA results to complement pre-ranked GSEA results. This was motivated by prior reviewer’s comments that pre-ranked GSEA did not account for inter-gene correlations. Results here show that CAMERA results are similar to pre-ranked GSEA results.

2 Baseline Analysis

Run DGE, Camera, pre-ranked fastGSEA for baseline VRC 312 and VRC 314 datasets.

library(doBy)
library(Biobase)
library(biomaRt)
library(edgeR)
library(genefilter)
library(NMF)
library(EDASeq)
library(tweeDEseqCountData)
library(reshape2)
library(tidyverse)
library(googledrive)
library(Microsoft365R)
library(ggplot2)
library(ggpubr)

#Add small legend helper function
addSmallLegend <- function(myPlot, pointSize = 4, textSize = 6, spaceLegend = 1) {
    myPlot +
        guides(shape = guide_legend(override.aes = list(size = pointSize)),
               color = guide_legend(override.aes = list(size = pointSize))) +
        theme(legend.title = element_text(size = textSize), 
              legend.text  = element_text(size = textSize),
              legend.key.size = unit(spaceLegend, "lines"))
}

2.1 Read in the VRC 312 and VRC 314 expression sets

Download from google drive.

#VRC 312 
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1X139jUTL4lAELEA3eBTZFx2hQgmOPFSZ"), path = temp, overwrite = TRUE)
hdat_312 <- readRDS(file = dl$local_path)
#VRC 314
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1VXTgNSQuWrsA5E-3OgOgJ-AzMZpL4afo"), path = temp, overwrite = TRUE)
hdat_314 <- readRDS(file = dl$local_path)

Set options

myTIMEPOINT <- "Baseline" #"Baseline"

my_VRC312_Group <- "Grp3a3b4a4b4c"
if(my_VRC312_Group == "Grp3a3b4a4b4c"){
  my_VRC312_Groups <- c("3a","3b", "4a","4b", "4c")
  }

my_VRC314_Group <- "Grp1456" #used for final analysis for reasons stated above
PCT <- 100 #used 100 PCT for final analysis

if(my_VRC314_Group == "Grp134567"){
my_VRC314_Groups <- c("Group1","Group3", "Group4","Group5", "Group6", "Group7") # "Group6" #Group2 is IM
}
if(my_VRC314_Group == "Grp567"){
my_VRC314_Groups <- c("Group5", "Group6","Group7")
}
if(my_VRC314_Group == "Grp1456"){
my_VRC314_Groups <- c("Group1", "Group4", "Group5", "Group6")
}

Reduce datasets to relevant samples

hdat_312 <- hdat_312[,which(hdat_312$Dose.group != "EXT")]
hdat_312$Dose.group <- droplevels(hdat_312$Dose.group)

#remove outcome = N/A
hdat_312 <- hdat_312[,!is.na(hdat_312$Outcome.Final)]
hdat_312$Outcome.Final <- droplevels(hdat_312$Outcome.Final)
foo <- hdat_312[,hdat_312$Patient == 331 & grepl("Imm5",hdat_312$Dose.group)] #Remove 331 Imm5 (failed QC)
hdat_312 <- hdat_312[,setdiff(colnames(hdat_312),colnames(foo))]

hdat_312 <- hdat_312[,which(grepl(myTIMEPOINT, hdat_312$Dose.group))]
hdat_312 <- hdat_312[,which(hdat_312$Outcome.Final != "N/A")]
hdat_312$Dose.group <- droplevels(hdat_312$Dose.group)
hdat_312$Outcome.Final <- factor(hdat_312$Outcome.Final)
hdat_312$Outcome.Final <- droplevels(hdat_312$Outcome.Final)

#Filter on Group
hdat_312 <- hdat_312[,which(hdat_312$Group %in% my_VRC312_Groups)]
hdat_312$Group <- factor(hdat_312$Group)
hdat_312$Group <- droplevels(hdat_312$Group)
table(hdat_312$Group,hdat_312$Outcome.Final)
##     
##      Malaria Protected
##   3a       3         0
##   3b       8         1
##   4a       0         2
##   4b       0         4
##   4c       3         6
table(hdat_312$Dose.group,hdat_312$Outcome.Final)
##                
##                 Malaria Protected
##   Baseline. D01      14        13
##         
##          Malaria Protected
##   Group1       1         3
##   Group4       0         5
##   Group5       5         6
##   Group6       5         9
##           
##            Malaria Protected
##   Baseline      11        23

3 VRC 312 Analysis

Make DGElist object VRC 312

# Make DGElist object
y_VRC312   <- DGEList(count = counts(hdat_312), genes= fData(hdat_312), samples = pData(hdat_312), group= hdat_312$Group, remove.zeros=T)

Filter genes VRC 312

filter.type <- "-low_express_filter-"
dim(y_VRC312)
## [1] 18580    27
#Filter low expression tags: keep genes with at least 1 CPM in at least X, where X is the number of samples in the smallest group (exclude controls by using foo.dat above)
keep <- filterByExpr(y_VRC312)
y_VRC312 <- y_VRC312[keep, , keep.lib.sizes=FALSE]
dim(y_VRC312)
## [1] 13713    27

Normalization VRC 312

y_VRC312$samples$lib.size <- colSums(y_VRC312$counts)
y_VRC312 <- calcNormFactors(y_VRC312)           #Normalization, use scale (TMM) here or quantile normalization later with voom, but not both
#reference for this here: https://support.bioconductor.org/p/77664/
dim(y_VRC312)
## [1] 13713    27

3.1 Generalized linear modeling VRC 312

Using edgeR with batch adjustment

# create design matrix according to Limma user guide (Ch 9.7 Multi Level Experiments) 
# http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf and
# http://permalink.gmane.org/gmane.science.biology.informatics.conductor/48432
Group   <-   factor(y_VRC312$samples$Group)
Outcome <-   y_VRC312$sample$Outcome.Final
Timept  <-   y_VRC312$sample$Dose.group
Dose    <-   factor(y_VRC312$sample$Dose)
Batch1  <-   factor(y_VRC312$sample$BatchNumber)
Batch2  <-   factor(y_VRC312$sample$Date.Extracted)
Batch   <-   as.factor(paste(as.character(Batch1), as.character(Batch2), sep="_"))
Subject <-   as.factor(as.character(y_VRC312$sample$Patient))
Gender  <-   as.factor(as.character(y_VRC312$sample$Gender))
Outcome_Timept <- factor(paste(Outcome,Timept,sep="_"))
#design <- model.matrix(~0 + Outcome + Batch1 + Batch2) # batch corrected; baseline only
design <- model.matrix(~Gender + Group + Batch1 + Batch2 + Outcome) # dose adjusted; batch corrected; edgeR
#design <- model.matrix(~0 + Outcome_Timept + Dose + Batch1 + Batch2) # batch corrected; dose adjusted; use limma pairing method
rownames(design) <- Subject
colnames(design) <- gsub(":", "_", colnames(design))
colnames(design) <- gsub(". ", "_", colnames(design))
colnames(design) <- gsub("/", "_", colnames(design))
colnames(design) <- gsub("Outcome_Timept", "", colnames(design))
colnames(design) <- gsub("Outcome", "", colnames(design))
colnames(design) <- gsub("Timept", "", colnames(design))
colnames(design) <- gsub("_Dose", "", colnames(design))
design <- design[,!(colSums(design) == 0)]          #remove columns with all zeros

##################################
#Remove Columns not of Full Rank #
##################################
#Error in glmFit.default(sely, design, offset = seloffset, dispersion = 0.05, : Design matrix not of full rank. The following coefficients not estimable: Batch28_18_14 Batch28_6_14
design <- design[,colnames(design)!="Batch28_18_14"]
design <- design[,colnames(design)!="Batch28_6_14"]
colnames(design)
##  [1] "(Intercept)"   "GenderMale"    "Group3b"       "Group4a"      
##  [5] "Group4b"       "Group4c"       "Batch12"       "Batch13"      
##  [9] "Batch14"       "Batch15"       "Batch16"       "Batch17"      
## [13] "Batch18"       "Batch19"       "Batch27_28_14" "Batch27_30_14"
## [17] "Batch28_11_14" "Batch28_12_14" "Batch28_14_14" "Batch28_4_14" 
## [21] "Protected"

Estimate Dispersions VRC 312

y_VRC312 <- estimateDisp(y_VRC312,design) 

Perform differential gene expression VRC 312

fit <- glmQLFit(y_VRC312, design, robust = TRUE)
qlf <- glmQLFTest(fit, contrast = c(rep(0,(ncol(design)-1)),1))
TopTab_VRC312 <- topTags(qlf, n = nrow(y_VRC312))$table

Examine top 15 DEGs (protected vs not protected) VRC 312

VRC 312 DEGs

TopTab_VRC312 %>%
  arrange(PValue) %>%
  slice_head(., n=15) %>%
  knitr::kable()
hgnc_symbol Gene_length logFC logCPM F PValue FDR
DSP DSP 45143 -6.907299 0.7967660 43.40111 0.0000047 0.0640497
MYOM2 MYOM2 120513 -3.473403 3.9798950 24.48981 0.0003267 0.9991389
MIR3064 MIR3064 66 1.654995 3.9692524 23.00357 0.0004167 0.9991389
KLRC2 KLRC2 15447 2.772390 0.7660329 20.22618 0.0006619 0.9991389
GATA2 GATA2 13759 -2.331792 4.4725489 20.10624 0.0007177 0.9991389
KLF4 KLF4 5631 -1.565611 4.9953269 19.05373 0.0008856 0.9991389
CPA3 CPA3 31941 -2.063480 3.0558447 18.32238 0.0010307 0.9991389
DEFA4 DEFA4 2517 2.182439 2.9653183 17.13226 0.0013157 0.9991389
KLRD1 KLRD1 103543 1.478543 5.9226770 16.85227 0.0014106 0.9991389
ATOH8 ATOH8 36723 -3.987656 1.9543972 15.62164 0.0015321 0.9991389
FLVCR1-AS1 FLVCR1-AS1 5981 -2.345739 1.5393108 16.34732 0.0015905 0.9991389
CRIP2 CRIP2 7209 -2.041907 4.2468983 16.17218 0.0016419 0.9991389
LIMS2 LIMS2 43406 -1.538278 3.2091761 15.26722 0.0020214 0.9991389
CLEC1A CLEC1A 42074 2.208181 1.2225062 15.21198 0.0020652 0.9991389
N4BP2L2-IT2 N4BP2L2-IT2 4890 1.250707 3.7626413 15.15035 0.0020766 0.9991389

3.2 Camera VRC 312

Competitive gene set test accounting for inter-gene correlation between “protected” vs “not protected” VRC 312 Uses Camera: https://academic.oup.com/nar/article/40/17/e133/2411151

This is not in the manuscript, but included here as an alternative approach.

mymodules <- c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4")
cameratab_list <- vector("list", length(mymodules))
names(cameratab_list) <- mymodules
for(i in mymodules){
  url <- paste0("https://github.com/TranLab/ModuleLists/blob/main/", i, ".rds?raw=true")
  temp_list <- readRDS(url(url, method="libcurl"))
  temp_indices <- ids2indices(temp_list, y_VRC312$genes$hgnc_symbol)
  cameratab <- camera(y_VRC312, temp_indices, design, contrast =  c(rep(0,(ncol(design)-1)),1), inter.gene.cor=0.01)
  cameratab_list[[i]] <- cameratab %>%
    rownames_to_column(var = "module")
}
cameratab_bind_VRC312 <- bind_rows(cameratab_list, .id = "module_type") %>%
  mutate(study = "VRC312") %>%
  dplyr::select(study, module_type, module, everything())

Plot CAMERA results VRC 312

#plotting options
basetextsize <- 12 
myfont <- "Helvetica"
bubble_max_size <- 15

myCameraPlotDat_VRC312 <- cameratab_bind_VRC312 %>%
  filter(module_type %in% c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4"))  %>%
  #filter(!grepl("TBA", module)) %>%
  dplyr::select(study, module_type, module, NGenes, Direction, FDR) %>%
  mutate(neglogpadj = -log10(FDR)) %>%
  mutate(fdr_direction = ifelse(Direction == "Up", neglogpadj*1, neglogpadj*-1)) %>%
  mutate(module = gsub("gd", "γδ", module)) %>%
  mutate(module = gsub("Vd2", "Vδ2", module)) %>%
  mutate(module = gsub("Vg", "Vγ", module)) %>%
  mutate(module = gsub("HALLMARK_", "", module)) %>%
  mutate(module = gsub("_", " ", module)) %>%
  mutate(module = sub(".*?\\_", "", module)) %>%
  mutate(module = fct_reorder(module, fdr_direction, .desc = TRUE)) %>%
  filter(!grepl("TBD", module)) %>%
  mutate(module_type = factor(module_type, levels = c("highBTMs", "MonacoModules", "lowBTMs", "MSigDB_Hallmark_v7.4", "BloodGen3Module"))) %>%
  arrange(desc(neglogpadj)) %>% 
  droplevels() %>%
  dplyr::select(study, module_type, module, everything())

#saveRDS(myCameraPlotDat_VRC312, paste0(datadir, "VRC312_Camera_PvNP.rds"))

3.3 Apply GSEA VRC 312

Rank genes by -log10(PValue)*sign(logFC). Run fgsea from fgsea package using NamedGeneRankList2GseaTable helper function.

set.seed(23)
ranks_VRC312 <- TopTab_VRC312 %>% 
  mutate(rankmetric = -log10(.$PValue)*sign(.$logFC)) %>%
  dplyr::select(hgnc_symbol,rankmetric) %>% 
  na.omit() %>% 
  distinct() %>% 
  group_by(hgnc_symbol) %>%  
  dplyr::summarize(rankmetric = mean(rankmetric)) %>%
  arrange(desc(rankmetric)) %>%
  deframe()

#Run fgsea from fgsea package using NamedGeneRankList2GseaTable helper function
devtools::source_url("https://github.com/TranLab/ModuleLists/blob/main/NamedGeneRankList2GseaTable.R?raw=TRUE")
GSEAtab_VRC312 <- NamedGeneRankList2GseaTable(rankedgenes = ranks_VRC312,
                                       geneset = "all",
                                       output_directory = tempdir(),
                                       filename_prefix = "GSEA",
                                       sampleSize = 101,
                                       minSize = 20,
                                       maxSize = Inf,
                                       scoreType = "std") %>%
  as_tibble() %>%
  arrange(desc(NES)) %>% 
  mutate(study = "VRC312") %>%
  dplyr::select(study, module_type, pathway, ES, NES, size, leadingEdge, pval, padj) %>% 
  mutate(leadingEdge = gsub("^c\\(|\\)$", "", leadingEdge)) %>%
  mutate(leadingEdge = gsub('"', "", leadingEdge)) %>%
  arrange(padj)
## [1] "Running fgsea for MonacoModules signatures"
## [1] "Running fgsea for highBTMs signatures"
## [1] "Running fgsea for lowBTMs signatures"
## [1] "Running fgsea for BloodGen3Module signatures"
## [1] "Running fgsea for MSigDB_Hallmark_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_biocarta_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_kegg_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C8_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C7_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_bp_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_mf_v7.4 signatures"
#saveRDS(GSEAtab_VRC312, paste0(datadir, "VRC312_GSEA_PvNP_FDR5.rds"))

4 VRC 314 Analysis

Make DGElist object VRC 314

y_VRC314   <- DGEList(count = counts(hdat_314), genes= fData(hdat_314), samples = pData(hdat_314),group= hdat_314$Outcome.Final, remove.zeros=T)

Filter genes VRC 314

filter.type <- "-low_express_filter-"
dim(y_VRC314)
## [1] 17699    34
#Filter low expression tags: keep genes with at least 1 CPM in at least X, where X is the number of samples in the smallest group (exclude controls by using foo.dat above)
keep <- filterByExpr(y_VRC314, group = hdat_314$Outcome.Final)
y_VRC314 <- y_VRC314[keep, , keep.lib.sizes=FALSE]
dim(y_VRC314)
## [1] 12059    34

Normalization VRC 314

###############
y_VRC314$samples$lib.size <- colSums(y_VRC314$counts)
y_VRC314 <- calcNormFactors(y_VRC314)           #Normalization, use scale (TMM) here or quantile normalization later with voom, but not both
#reference for this here: https://support.bioconductor.org/p/77664/
dim(y_VRC314)
## [1] 12059    34
logcpm_314 <- cpm(y_VRC314, prior.count = 0.25,log=T)

Filter by library size VRC 314

Note that expression set already has been filtered.

y_VRC314 <- y_VRC314[,y_VRC314$samples$lib.size>7.5e6]

4.1 Generalized linear modeling VRC 314

Use edgeR with batch adjustment

# create design matrix according to Limma user guide (Ch 9.7 Multi Level Experiments) 
# http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf and
# http://permalink.gmane.org/gmane.science.biology.informatics.conductor/48432

Outcome <- y_VRC314$samples$Outcome.Final
Timept <- y_VRC314$samples$Dose.group
Sex <- factor(y_VRC314$samples$Gender)
Batch1  <-   factor(y_VRC314$samples$Lib_Batch)
Batch2  <-   factor(y_VRC314$samples$Date)
Batch   <-   as.factor(paste(as.character(Batch1), as.character(Batch2), sep="_"))
Group <- factor(y_VRC314$samples$Group)
Subject <-   y_VRC314$samples$Patient
Outcome_Timept <- factor(paste(Outcome,Timept,sep="_"))
design <- model.matrix(~Group + Sex + Batch + Outcome) # GROUP and BATCH corrected
rownames(design) <- Subject
colnames(design) <- gsub(":", "_", colnames(design))
colnames(design) <- gsub("\\.", "_", colnames(design))
colnames(design) <- gsub(", ", "_", colnames(design))
colnames(design) <- gsub(" \\+ ", "_", colnames(design))
colnames(design) <- gsub("CHMI 1", "CHMI1", colnames(design))
colnames(design) <- gsub(" ", "_", colnames(design))
colnames(design) <- gsub("Outcome_Timept", "", colnames(design))
colnames(design) <- gsub("Outcome", "", colnames(design))
colnames(design) <- gsub("Timept", "", colnames(design))
design <- design[,!(colSums(design) == 0)]          #remove columns with all zeros

# When running myGroup = Grp1456
# Design matrix not of full rank.  The following coefficients not estimable:
#  Batch4_Jun_27_2016 Batch5_March_23_2016 Batch5_Nov_24_2015
#for Grp
if(my_VRC314_Group == "Grp1456"){
  design <- design[,!colnames(design)%in%c("Batch4_Jun_27_2016", "Batch5_March_23_2016", "Batch5_Nov_24_2015")]
}
colnames(design)
##  [1] "(Intercept)"          "GroupGroup4"          "GroupGroup5"         
##  [4] "GroupGroup6"          "SexMale"              "Batch1_Jun_27_2016"  
##  [7] "Batch1_March_23_2016" "Batch2_Dec_14_2015"   "Batch2_Feb_15_2016"  
## [10] "Batch2_Jun_27_2016"   "Batch2_March_22_2016" "Batch2_March_23_2016"
## [13] "Batch3_Dec_14_2015"   "Batch3_Feb_15_2016"   "Batch3_March_22_2016"
## [16] "Batch3_March_23_2016" "Batch4_Feb_15_2016"   "Batch4_March_23_2016"
## [19] "Batch5_Feb_15_2016"   "Batch5_Jan_10_2017"   "Batch5_Nov_23_2015"  
## [22] "Protected"

Estimate Dispersions VRC 314

y_VRC314 <- estimateDisp(y_VRC314, design) 

Perform differential gene expression VRC 314

fit <- glmQLFit(y_VRC314, design, robust = TRUE)
qlf <- glmQLFTest(fit, contrast = c(rep(0,(ncol(design)-1)),1))
TopTab_VRC314 <- topTags(qlf, n = nrow(y_VRC314))$table

Examine top 15 DEGs (protected vs not protected) VRC 314

VRC 314 DEGs

TopTab_VRC314 %>%
  arrange(PValue) %>%
  slice_head(., n=15) %>%
  knitr::kable()
hgnc_symbol Gene_length logFC logCPM F PValue FDR
LSM3 LSM3 22762 1.2447685 2.0450349 21.01062 0.0002565 0.9998583
MRPL22 MRPL22 28342 0.8401138 2.0954516 19.75821 0.0003436 0.9998583
PKMYT1 PKMYT1 12516 1.3669605 0.9853327 18.97528 0.0004217 0.9998583
HMGB2 HMGB2 3431 0.5957184 5.9447479 18.09448 0.0005160 0.9998583
RSL24D1 RSL24D1 16262 0.8829434 4.1146620 18.01766 0.0005265 0.9998583
RPAP2 RPAP2 103092 0.6307832 2.6740872 17.63788 0.0005815 0.9998583
CETN3 CETN3 17526 1.3326930 0.5170825 16.61217 0.0006672 0.9998583
LRRN3 LRRN3 34449 0.5702500 5.0630690 15.88376 0.0009267 0.9998583
TNFRSF17 TNFRSF17 2962 1.0779445 2.0137156 15.80919 0.0009603 0.9998583
TOP2A TOP2A 29435 1.1391423 2.0631915 15.40097 0.0010699 0.9998583
ERH ERH 18505 0.4994195 4.9424513 15.36273 0.0010704 0.9998583
EIF3E EIF3E 234118 0.5335506 5.5085325 14.86408 0.0012317 0.9998583
HAT1 HAT1 61235 0.6950987 3.7125304 14.62087 0.0013206 0.9998583
RPF1 RPF1 18532 0.4463541 4.3677015 14.53896 0.0013517 0.9998583
KIAA0391 KIAA0391 155769 0.4019658 4.9269749 14.40103 0.0014063 0.9998583

4.2 Camera VRC 314

Competitive gene set test accounting for inter-gene correlation between “protected” vs “not protected” VRC 314 Uses Camera: https://academic.oup.com/nar/article/40/17/e133/2411151

This is not in the manuscript, but included here as an alternative approach.

mymodules <- c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4")
cameratab_list <- vector("list", length(mymodules))
names(cameratab_list) <- mymodules
for(i in mymodules){
  url <- paste0("https://github.com/TranLab/ModuleLists/blob/main/", i, ".rds?raw=true")
  temp_list <- readRDS(url(url, method="libcurl"))
  temp_indices <- ids2indices(temp_list, y_VRC314$genes$hgnc_symbol)
  cameratab <- camera(y_VRC314, temp_indices, design, contrast =  c(rep(0,(ncol(design)-1)),1), inter.gene.cor=0.01)
  cameratab_list[[i]] <- cameratab %>%
    rownames_to_column(var = "module")
}
cameratab_bind_VRC314 <- bind_rows(cameratab_list, .id = "module_type") %>%
  mutate(study = "VRC314") %>%
  dplyr::select(study, module_type, module, everything())

Plot CAMERA results VRC 314

#plotting options
basetextsize <- 12 
myfont <- "Helvetica"
bubble_max_size <- 15

myCameraPlotDat_VRC314 <- cameratab_bind_VRC314 %>%
  filter(module_type %in% c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4"))  %>%
  #filter(!grepl("TBA", module)) %>%
  dplyr::select(study, module_type, module, NGenes, Direction, FDR) %>%
  mutate(neglogpadj = -log10(FDR)) %>%
  mutate(fdr_direction = ifelse(Direction == "Up", neglogpadj*1, neglogpadj*-1)) %>%
  mutate(module = gsub("gd", "γδ", module)) %>%
  mutate(module = gsub("Vd2", "Vδ2", module)) %>%
  mutate(module = gsub("Vg", "Vγ", module)) %>%
  mutate(module = gsub("HALLMARK_", "", module)) %>%
  mutate(module = gsub("_", " ", module)) %>%
  mutate(module = sub(".*?\\_", "", module)) %>%
  mutate(module = fct_reorder(module, fdr_direction, .desc = TRUE)) %>%
  filter(!grepl("TBD", module)) %>%
  mutate(module_type = factor(module_type, levels = c("highBTMs", "MonacoModules", "lowBTMs", "MSigDB_Hallmark_v7.4", "BloodGen3Module"))) %>%
  arrange(desc(neglogpadj)) %>% 
  droplevels() %>%
  dplyr::select(study, module_type, module, everything())

#saveRDS(myCameraPlotDat_VRC314, paste0(datadir, "VRC314_Camera_PvNP.rds"))

4.3 Apply GSEA VRC 314

Rank genes by -log10(PValue)*sign(logFC). Run fgsea from fgsea package using NamedGeneRankList2GseaTable helper function.

set.seed(23)
ranks_VRC314 <- TopTab_VRC314 %>% 
  mutate(rankmetric = -log10(.$PValue)*sign(.$logFC)) %>%
  dplyr::select(hgnc_symbol,rankmetric) %>% 
  na.omit() %>% 
  distinct() %>% 
  group_by(hgnc_symbol) %>%  
  dplyr::summarize(rankmetric = mean(rankmetric)) %>%
  arrange(desc(rankmetric)) %>%
  deframe()

#Run fgsea from fgsea package using NamedGeneRankList2GseaTable helper function
devtools::source_url("https://github.com/TranLab/ModuleLists/blob/main/NamedGeneRankList2GseaTable.R?raw=TRUE")
GSEAtab_VRC314 <- NamedGeneRankList2GseaTable(rankedgenes = ranks_VRC314,
                                       geneset = "all",
                                       output_directory = tempdir(),
                                       filename_prefix = "GSEA",
                                       sampleSize = 101,
                                       minSize = 20,
                                       maxSize = Inf,
                                       scoreType = "std") %>%
  as_tibble() %>%
  arrange(desc(NES)) %>%
  mutate(study = "VRC314") %>%
  dplyr::select(study, module_type, pathway, ES, NES, size, leadingEdge, pval, padj) %>% 
  mutate(leadingEdge = gsub("^c\\(|\\)$", "", leadingEdge)) %>%
  mutate(leadingEdge = gsub('"', "", leadingEdge)) %>%
  arrange(padj)
## [1] "Running fgsea for MonacoModules signatures"
## [1] "Running fgsea for highBTMs signatures"
## [1] "Running fgsea for lowBTMs signatures"
## [1] "Running fgsea for BloodGen3Module signatures"
## [1] "Running fgsea for MSigDB_Hallmark_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_biocarta_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_kegg_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C8_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C7_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_bp_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_mf_v7.4 signatures"
#Get BSPZV1 DEG table from googledrive
temp <- tempfile(fileext = ".rds")
dl <- googledrive::drive_download(
   googledrive::as_id("1NR0SKRw37ZUTnKfvdedCPWoyOv3aDECU"), path = temp, overwrite = TRUE)
TopTab_BSPZV1 <- readRDS(file = dl$local_path)
#Bind data from VRC312, VRC314, and BSPZV1 studies
TopTab_BSPZV1_VRC312_VRC314 <- bind_rows(TopTab_BSPZV1 %>%
                                           as_tibble() %>%
                                           dplyr::rename(hgnc_symbol = "gene_symbol") %>%
                                           dplyr::select(-c(ensembl_id)) %>%
                                           mutate(study = "BSPZV1"),
                                         TopTab_VRC312 %>%
                                           as_tibble() %>%
                                           dplyr::select(-c(Gene_length)) %>%
                                           mutate(study = "VRC312"),
                                         TopTab_VRC314 %>%
                                           as_tibble() %>%
                                           dplyr::select(-c(Gene_length)) %>%
                                           mutate(study = "VRC314")) %>%
  dplyr::select(study, everything())

5 Make VRC 312, VRC 314, BSPZV1 tables

study_tab <- data.frame(matrix(c("KPSZV1", "Kenya", "malaria-exposed infants","high dose","1,800,000 PfSPZ x 3", "24", "37",
                                 "VRC 312", "US", "malaria-naive adults","3a", "30,000 PfSPZ x 6", "3", "0",
                                 "", "", "", "3b", "30,000 PfSPZ x 4", "8", "1",
                                 "", "", "", "4a", "135,000 PfSPZ x 5", "0", "2",
                                 "", "", "", "4b", "135,000 PfSPZ x 5", "0", "4",
                                 "", "", "", "4c", "135,000 PfSPZ x 4", "3", "6",
                                 "", "", "", "All", "", "14", "13",
                                 "VRC 314", "US", "malaria-naive adults" , "1", "270,000 PfSPZ x 3", "1", "3",
                                 "", "", "", "4", "270,000 PfSPZ x 4", "0", "5",
                                 "", "", "", "5", "270,000 PfSPZ x 4", "5", "6",
                                 "", "", "", "6", "900,000 PfSPZ x 3", "5", "9",
                                 "", "", "", "All", "", "11", "23",
                                 "BSPZV1", "Tanzania", "malaria-exposed adults", "2", "135,000 PfSPZ x 5", "7", "4", 
                                 "", "", "", "3", "270,000 PfSPZ x 5", "7", "4",
                                 "", "", "", "All", "", "14", "8"),
                               nrow = 15, byrow = T))
colnames(study_tab) <- c("Study", "Country", "Malaria Exposure","Group","Regimen","NP","P")
#for the Neal et al study (PMID 35031601), 3 subjects in BSPZV1 group 1who were smear negative throughout but PCR positive were considered protected

study_tab_plot <- ggtexttable(study_tab,
                                      rows = NULL,
                                      theme = ttheme("classic",
                                                     base_size=8,
                                                     padding = unit(c(8, 2.5), "mm")))

study_tab_plot

6 Combine with BSPZV data and visualize GSEA data as bubble plot.

Filter based on padj < 0.10.

6.1 Code for GSEA bubble plot

#Get BSPZV1 GSEA data from googledrive
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1NF0IIJbFDMH_fjPnqS3yPRHb42ajXZug"), path = temp, overwrite = TRUE)
GSEAtab_BSPZV1 <- readRDS(file = dl$local_path)

#Get KSPZV1 GSEA data from googledrive
temp <- tempfile(fileext = ".xlsx")
dl <- drive_download(
  as_id("1VsKSiDGfyPMrJoVFCNhkiffzkXBb41IE"), path = temp, overwrite = TRUE)
GSEAtab_KSPZV1 <- readxl::read_excel(path = dl$local_path) %>%
  filter(treatment == "1.8 x 10^6 PfSPZ") %>% #use only high-dose group for comparison
  mutate(study = "KSPZV1") %>%
  mutate(neglogpadj = -log10(`BH-adj p value (FDR)`)) %>%
  dplyr::select(colnames(GSEAtab_BSPZV1)) %>%
  mutate(pathway = gsub("HALLMARK_", "", pathway)) %>% #clean up pathways to match other dataframes
  mutate(pathway = gsub("\\_", " ", pathway)) #clean up pathways to match other dataframes  
GSEA_KPSZV1_VRC312_VRC314_BPSZV1 <- bind_rows(GSEAtab_VRC312,
                     GSEAtab_VRC314) %>%
  filter(module_type %in% c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4"))  %>%
  filter(!grepl("TBA", pathway)) %>%
  dplyr::select(study, module_type, pathway, leadingEdge, size, NES, padj) %>%
  mutate(neglogpadj = -log10(padj)) %>%
  dplyr::rename("BH-adj p value (FDR)" = "padj") %>%
  mutate(pathway = gsub("gd", "γδ", pathway)) %>%
  mutate(pathway = gsub("Vd2", "Vδ2", pathway)) %>%
  mutate(pathway = gsub("Vg", "Vγ", pathway)) %>%
  mutate(pathway = gsub("HALLMARK_", "", pathway)) %>%
  mutate(pathway = gsub("_", " ", pathway)) %>%
  mutate(pathway = sub(".*?\\_", "", pathway)) %>%
  filter(!grepl("TBD", pathway)) %>%
  mutate(module_type = factor(module_type, levels = c("highBTMs", "MonacoModules", "lowBTMs", "MSigDB_Hallmark_v7.4", "BloodGen3Module"))) %>%
  droplevels() %>%
  bind_rows(.,GSEAtab_BSPZV1, GSEAtab_KSPZV1) %>%
  mutate(study = factor(study, levels = c("KSPZV1","VRC312", "VRC314", "BSPZV1"))) %>%
  arrange(study, module_type, desc(neglogpadj)) %>%
  group_by(module_type, study) %>%
  mutate(pathway = fct_reorder(pathway, NES, .desc = TRUE)) %>%
  ungroup() %>%
  filter(!grepl("TBD", pathway))
#plotting options
basetextsize <- 8.5  
myfont <- "sans"
bubble_max_size <- 9
fdr_threshold <- 0.1

common_pathways <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
  filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
  group_by(module_type, pathway) %>%
  dplyr::select(pathway) %>%
  dplyr::summarize(n=n()) %>%
  filter(n>1)

GSEA_KPSZV1_VRC312_VRC314_BPSZV1 <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
  filter(pathway %in% common_pathways$pathway)

TopPlot <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
  filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
  filter(module_type %in% c("highBTMs", "MonacoModules"))  %>%
  ggplot(., aes(x = study, y = pathway)) +
  geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
  scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1,5,10,20,30,40), limits = c(1,42)) +
      scale_fill_gradient2(low = "blue",
                           mid = "white",
                           high = "red") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.background = element_blank(),
        strip.text = element_text(size=5.5),
        legend.position = "right",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    theme(plot.margin = unit(c(1, 0.5, 1, 0.5), "cm")) +
facet_wrap(~module_type, scales = "free_y", ncol =2)

TopPlot <- ggarrange(study_tab_plot ,addSmallLegend(TopPlot), widths = c(1,1))

Bottom_Left_Plot <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
  filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
  filter(module_type %in% c("lowBTMs"))  %>%
  ggplot(., aes(x = study, y = pathway)) +
  geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
  scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1,20,40), limits = c(1,42)) +
      scale_fill_gradient2(low = "blue",
                           mid = "white",
                           high = "red") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.text = element_text(size=5.5),
        strip.background = element_blank(),
        legend.position = "right",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    theme(plot.margin = unit(c(1, 0.5, 5, 0.5), "cm")) +
facet_wrap(~module_type, scales = "free_y", nrow =2)

Bottom_Middle_Plot <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
  filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
  filter(module_type %in% c("MSigDB_Hallmark_v7.4"))  %>%
  ggplot(., aes(x = study, y = pathway)) +
  geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
  scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1,20,40), limits = c(1,42)) +
      scale_fill_gradient2(low = "blue",
                           mid = "white",
                           high = "red") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.text = element_text(size=5.5),
        strip.background = element_blank(),
        legend.position = "right",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    theme(plot.margin = unit(c(1, 0.75, 16, 0.75), "cm")) +
facet_wrap(~module_type, scales = "free_y", nrow =2)


Bottom_Right_Plot <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
  filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
  filter(module_type %in% c("BloodGen3Module"))  %>%
  ggplot(., aes(x = study, y = pathway)) +
  geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
  scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1,20,40), limits = c(1,42)) +
      scale_fill_gradient2(low = "blue",
                           mid = "white",
                           high = "red") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.text = element_text(size=5.5),
        strip.background = element_blank(),
        legend.position = "right",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))  +
    theme(plot.margin = unit(c(1, 1.3, 0.5, 1.3), "cm")) +
facet_wrap(~module_type, scales = "free_y", ncol =2)

bottom_panels <- ggarrange(addSmallLegend(Bottom_Left_Plot), addSmallLegend(Bottom_Middle_Plot), addSmallLegend(Bottom_Right_Plot),
                           widths = c(1.1, 1, 1),
                           ncol = 3,
                           common.legend = TRUE)

6.2 Plot GSEA bubbleplots (Figures S4B-C)

Red: enriched in protected/uninfected.
Blue: enriched in not protected/infected.
Only modules with BH-adjusted p value (FDR) < 0.10 in at least two studies are shown.

ggarrange(addSmallLegend(TopPlot), bottom_panels, common.legend = TRUE, nrow = 2, heights = c(1,3))

7 Combine KSPZV1, VRC 312, VRC214, BSPZV data and visualize Camera data as bubble plot.

7.1 Code for Camera bubble plot

#Get BSPZV1 Camera data from googledrive
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1NWQVtjpDP5Qjtee4aezR0fSywXpEn5fR"), path = temp, overwrite = TRUE)
myCameraPlotDat_BSPZV1 <- readRDS(file = dl$local_path)

temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1DI6Xvqp5emAs_u_SNZxyxnuPkcxkDRxq"), path = temp, overwrite = TRUE)
myCameraPlotDat_VRC312 <- readRDS(file = dl$local_path)

temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1DIaX2KI9fBuKUIP-rbdKiOr5gwzZG39D"), path = temp, overwrite = TRUE)
myCameraPlotDat_VRC314 <- readRDS(file = dl$local_path)

temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1DHHoRME8ljljACDmtzyGm8tgt0OajxtS"), path = temp, overwrite = TRUE)
myCameraPlotDat_KSPZV1 <- readRDS(file = dl$local_path)

Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 <- bind_rows(myCameraPlotDat_BSPZV1, myCameraPlotDat_VRC312, myCameraPlotDat_VRC314, myCameraPlotDat_KSPZV1)                         
fdr_threshold <- 0.1

common_pathways <- Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 %>%
  filter(FDR < fdr_threshold) %>%
  group_by(module_type, module) %>%
  dplyr::select(module) %>%
  dplyr::summarize(n=n()) %>%
  filter(n>1)

Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 <- Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 %>%
  filter(module %in% common_pathways$module)

plotDat_Camera <- Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 %>%
  mutate(study = factor(study, levels = c("KSPZV1","VRC312", "VRC314", "BSPZV1"))) %>%
  mutate(enrichment = ifelse(Direction == "Up",
                           "up in protected/uninfected",
                           "up in not protected/infected")) %>%
  filter(FDR < fdr_threshold) %>%
  filter(!grepl("TBA", module)) %>%
  mutate(neglogFDR = -log10(FDR)) %>%
  mutate(module = gsub("gd", "γδ", module)) %>%
  mutate(module = gsub("Vd2", "Vδ2", module)) %>%
  mutate(module = gsub("Vg", "Vγ", module)) %>%
  mutate(module = gsub("HALLMARK_", "", module)) %>%
  mutate(module = gsub("_", " ", module)) %>%
  mutate(module = sub(".*?\\_", "", module)) %>%
  filter(!grepl("TBD", module)) %>%
  arrange(desc(neglogFDR)) %>%
  group_by(study, module_type) %>%
  mutate(module = fct_reorder(module, neglogFDR, .desc = FALSE)) %>%
  ungroup() %>% 
  droplevels()

#plotting options
basetextsize <- 7.5  
myfont <- "Helvetica"
bubble_max_size <- 6

camera_hibtm_monaco_plot <- plotDat_Camera %>%
  filter(module_type %in% c("highBTMs", "MonacoModules")) %>%
  mutate(module_type = factor(module_type,
                              levels = c("highBTMs", "MonacoModules"),
                              labels = c("high BTMs", "Monaco"))) %>%
  ggplot(., aes(x = study, y = module, fill = enrichment)) +
  geom_point(aes(size=neglogFDR), shape=21, stroke = 0.25, alpha = 0.6) +
  scale_size_area(name = expression(-log[10]~FDR), max_size = bubble_max_size) +
  scale_fill_manual(values = c("blue","red")) +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.text = element_text(size = 5.5),
        strip.background = element_blank(),
        legend.position = "none",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  facet_wrap(~module_type, scales = "free_y", nrow = 3)

module_summary_lowbtms <- plotDat_Camera %>%
  group_by(module_type, module) %>%
  dplyr::summarize(n = n()) %>%
  filter(module_type == "lowBTMs" & n>1) %>%
  arrange(n)

camera_lowbtm_plot <- plotDat_Camera %>%
  filter(module_type %in% c("lowBTMs")) %>%
  filter(module %in% module_summary_lowbtms$module) %>%
  ggplot(., aes(x = study, y = module, fill = enrichment)) +
  geom_point(aes(size=neglogFDR), shape=21, stroke = 0.25, alpha = 0.6) +
  scale_size_area(name = expression(-log[10]~FDR), max_size = bubble_max_size) +
  scale_fill_manual(values = c("blue","red")) +
  #ggtitle("low BTMs") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.text = element_text(size = 5.5),
        legend.position = "right",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  facet_wrap(~module_type)

module_summary_BloodGen3Module <- plotDat_Camera %>%
  group_by(module_type, module) %>%
  dplyr::summarize(n = n()) %>%
  filter(module_type == "BloodGen3Module" & n>1) %>%
  arrange(n)

camera_bloodgen3_plot <- plotDat_Camera %>%
  filter(module_type %in% c("BloodGen3Module")) %>%
  filter(module %in% module_summary_BloodGen3Module$module) %>%
  ggplot(., aes(x = study, y = module, fill = enrichment)) +
  geom_point(aes(size=neglogFDR), shape=21, stroke = 0.25, alpha = 0.6) +
  scale_size_area(name = expression(-log[10]~FDR), max_size = bubble_max_size) +
  scale_fill_manual(values = c("blue","red")) +
  #ggtitle("low BTMs") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.text = element_text(size = 5.5),
        legend.position = "right",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  facet_wrap(~module_type)

7.2 Plot Camera bubbleplots

Red: enriched in protected/uninfected Blue: enriched in not protected/infected

Only modules with FDR<10% in at least 2 studies are shown.

first_col_plot <- ggpubr::ggarrange(camera_hibtm_monaco_plot, NULL, nrow = 2, heights = c(2.25,3))
ggpubr::ggarrange(first_col_plot, ggplot() + theme_void(), camera_lowbtm_plot, ggplot() + theme_void(), camera_bloodgen3_plot,
                  ncol = 5, widths = c(0.77, -0.175, 1, -0.175, 0.78), common.legend = TRUE)