1 Objective
2 Baseline Analysis
- 2.1 Read in the VRC 312 and VRC 314 expression sets
3 VRC 312 Analysis
4 VRC 314 Analysis
5 Make VRC 312, VRC 314, BSPZV1 tables
6 Combine with BSPZV data and visualize GSEA data as bubble plot.
- 6.1 Code for GSEA bubble plot
- 6.2 Plot GSEA bubbleplots (Figures S4B-C)
7 Combine KSPZV1, VRC 312, VRC214, BSPZV data and visualize Camera data as bubble plot.
- 7.1 Code for Camera bubble plot
- 7.2 Plot Camera bubbleplots

1 Objective

This markdown doc creates Seq (count) and cpm expression sets for VRC 312 and VRC 314 data, does differential gene expression with edgeR, and outputs to fast GSEA using several different module types (BTMs, MSigDB collections, MonacoMods). Generates fgsea output tables that can be visualized using the “PfSPZ Import VRC 312 314 GSEA IPA for Figures.Rmd” script in “/Volumes/GoogleDrive/My Drive/Tran Lab Shared/Projects/Doris Duke PfSPZ Kenya/Tuan PfSPZ/KenyaPfSPZ/” or on the KSPZV1 github repo.

Here, the VRC312 and VRC314 data is combined with BSPZV1 data from another script.

Notes:

For VRC 314, performed analysis on Group 1 (270K x 3 IV), Group 4 & 5 (270K x 4 IV), and Group 6 (900 x 3IV) as these were the groups with same IV dose repeated.
Used -log10(p) * sign(logFC) as rank metric for GSEA.
Added BSPZV1 study for comparison based on reviewer feedback.
Use FDR<10% across at least 2 studies as final filter for visualization.
Added CAMERA results to complement pre-ranked GSEA results. This was motivated by prior reviewer’s comments that pre-ranked GSEA did not account for inter-gene correlations. Results here show that CAMERA results are similar to pre-ranked GSEA results.

2 Baseline Analysis

Run DGE, Camera, pre-ranked fastGSEA for baseline VRC 312 and VRC 314 datasets.

library(doBy)
library(Biobase)
library(biomaRt)
library(edgeR)
library(genefilter)
library(NMF)
library(EDASeq)
library(tweeDEseqCountData)
library(reshape2)
library(tidyverse)
library(googledrive)
library(Microsoft365R)
library(ggplot2)
library(ggpubr)

#Add small legend helper function
addSmallLegend <- function(myPlot, pointSize = 4, textSize = 6, spaceLegend = 1) {
    myPlot +
        guides(shape = guide_legend(override.aes = list(size = pointSize)),
               color = guide_legend(override.aes = list(size = pointSize))) +
        theme(legend.title = element_text(size = textSize), 
              legend.text  = element_text(size = textSize),
              legend.key.size = unit(spaceLegend, "lines"))
}

2.1 Read in the VRC 312 and VRC 314 expression sets

Download from google drive.

#VRC 312 
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1X139jUTL4lAELEA3eBTZFx2hQgmOPFSZ"), path = temp, overwrite = TRUE)
hdat_312 <- readRDS(file = dl$local_path)
#VRC 314
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1VXTgNSQuWrsA5E-3OgOgJ-AzMZpL4afo"), path = temp, overwrite = TRUE)
hdat_314 <- readRDS(file = dl$local_path)

Set options

myTIMEPOINT <- "Baseline" #"Baseline"

my_VRC312_Group <- "Grp3a3b4a4b4c"
if(my_VRC312_Group == "Grp3a3b4a4b4c"){
  my_VRC312_Groups <- c("3a","3b", "4a","4b", "4c")
  }

my_VRC314_Group <- "Grp1456" #used for final analysis for reasons stated above
PCT <- 100 #used 100 PCT for final analysis

if(my_VRC314_Group == "Grp134567"){
my_VRC314_Groups <- c("Group1","Group3", "Group4","Group5", "Group6", "Group7") # "Group6" #Group2 is IM
}
if(my_VRC314_Group == "Grp567"){
my_VRC314_Groups <- c("Group5", "Group6","Group7")
}
if(my_VRC314_Group == "Grp1456"){
my_VRC314_Groups <- c("Group1", "Group4", "Group5", "Group6")
}

Reduce datasets to relevant samples

hdat_312 <- hdat_312[,which(hdat_312$Dose.group != "EXT")]
hdat_312$Dose.group <- droplevels(hdat_312$Dose.group)

#remove outcome = N/A
hdat_312 <- hdat_312[,!is.na(hdat_312$Outcome.Final)]
hdat_312$Outcome.Final <- droplevels(hdat_312$Outcome.Final)
foo <- hdat_312[,hdat_312$Patient == 331 & grepl("Imm5",hdat_312$Dose.group)] #Remove 331 Imm5 (failed QC)
hdat_312 <- hdat_312[,setdiff(colnames(hdat_312),colnames(foo))]

hdat_312 <- hdat_312[,which(grepl(myTIMEPOINT, hdat_312$Dose.group))]
hdat_312 <- hdat_312[,which(hdat_312$Outcome.Final != "N/A")]
hdat_312$Dose.group <- droplevels(hdat_312$Dose.group)
hdat_312$Outcome.Final <- factor(hdat_312$Outcome.Final)
hdat_312$Outcome.Final <- droplevels(hdat_312$Outcome.Final)

#Filter on Group
hdat_312 <- hdat_312[,which(hdat_312$Group %in% my_VRC312_Groups)]
hdat_312$Group <- factor(hdat_312$Group)
hdat_312$Group <- droplevels(hdat_312$Group)
table(hdat_312$Group,hdat_312$Outcome.Final)

##     
##      Malaria Protected
##   3a       3         0
##   3b       8         1
##   4a       0         2
##   4b       0         4
##   4c       3         6

table(hdat_312$Dose.group,hdat_312$Outcome.Final)

##                
##                 Malaria Protected
##   Baseline. D01      14        13

##         
##          Malaria Protected
##   Group1       1         3
##   Group4       0         5
##   Group5       5         6
##   Group6       5         9

##           
##            Malaria Protected
##   Baseline      11        23

3 VRC 312 Analysis

Make DGElist object VRC 312

# Make DGElist object
y_VRC312   <- DGEList(count = counts(hdat_312), genes= fData(hdat_312), samples = pData(hdat_312), group= hdat_312$Group, remove.zeros=T)

Filter genes VRC 312

filter.type <- "-low_express_filter-"
dim(y_VRC312)

## [1] 18580    27

#Filter low expression tags: keep genes with at least 1 CPM in at least X, where X is the number of samples in the smallest group (exclude controls by using foo.dat above)
keep <- filterByExpr(y_VRC312)
y_VRC312 <- y_VRC312[keep, , keep.lib.sizes=FALSE]
dim(y_VRC312)

## [1] 13713    27

Normalization VRC 312

y_VRC312$samples$lib.size <- colSums(y_VRC312$counts)
y_VRC312 <- calcNormFactors(y_VRC312)           #Normalization, use scale (TMM) here or quantile normalization later with voom, but not both
#reference for this here: https://support.bioconductor.org/p/77664/
dim(y_VRC312)

## [1] 13713    27

3.1 Generalized linear modeling VRC 312

Using edgeR with batch adjustment

# create design matrix according to Limma user guide (Ch 9.7 Multi Level Experiments) 
# http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf and
# http://permalink.gmane.org/gmane.science.biology.informatics.conductor/48432
Group   <-   factor(y_VRC312$samples$Group)
Outcome <-   y_VRC312$sample$Outcome.Final
Timept  <-   y_VRC312$sample$Dose.group
Dose    <-   factor(y_VRC312$sample$Dose)
Batch1  <-   factor(y_VRC312$sample$BatchNumber)
Batch2  <-   factor(y_VRC312$sample$Date.Extracted)
Batch   <-   as.factor(paste(as.character(Batch1), as.character(Batch2), sep="_"))
Subject <-   as.factor(as.character(y_VRC312$sample$Patient))
Gender  <-   as.factor(as.character(y_VRC312$sample$Gender))
Outcome_Timept <- factor(paste(Outcome,Timept,sep="_"))
#design <- model.matrix(~0 + Outcome + Batch1 + Batch2) # batch corrected; baseline only
design <- model.matrix(~Gender + Group + Batch1 + Batch2 + Outcome) # dose adjusted; batch corrected; edgeR
#design <- model.matrix(~0 + Outcome_Timept + Dose + Batch1 + Batch2) # batch corrected; dose adjusted; use limma pairing method
rownames(design) <- Subject
colnames(design) <- gsub(":", "_", colnames(design))
colnames(design) <- gsub(". ", "_", colnames(design))
colnames(design) <- gsub("/", "_", colnames(design))
colnames(design) <- gsub("Outcome_Timept", "", colnames(design))
colnames(design) <- gsub("Outcome", "", colnames(design))
colnames(design) <- gsub("Timept", "", colnames(design))
colnames(design) <- gsub("_Dose", "", colnames(design))
design <- design[,!(colSums(design) == 0)]          #remove columns with all zeros

##################################
#Remove Columns not of Full Rank #
##################################
#Error in glmFit.default(sely, design, offset = seloffset, dispersion = 0.05, : Design matrix not of full rank. The following coefficients not estimable: Batch28_18_14 Batch28_6_14
design <- design[,colnames(design)!="Batch28_18_14"]
design <- design[,colnames(design)!="Batch28_6_14"]
colnames(design)

##  [1] "(Intercept)"   "GenderMale"    "Group3b"       "Group4a"      
##  [5] "Group4b"       "Group4c"       "Batch12"       "Batch13"      
##  [9] "Batch14"       "Batch15"       "Batch16"       "Batch17"      
## [13] "Batch18"       "Batch19"       "Batch27_28_14" "Batch27_30_14"
## [17] "Batch28_11_14" "Batch28_12_14" "Batch28_14_14" "Batch28_4_14" 
## [21] "Protected"

Estimate Dispersions VRC 312

y_VRC312 <- estimateDisp(y_VRC312,design)

Perform differential gene expression VRC 312

fit <- glmQLFit(y_VRC312, design, robust = TRUE)
qlf <- glmQLFTest(fit, contrast = c(rep(0,(ncol(design)-1)),1))
TopTab_VRC312 <- topTags(qlf, n = nrow(y_VRC312))$table

Examine top 15 DEGs (protected vs not protected) VRC 312

VRC 312 DEGs

TopTab_VRC312 %>%
  arrange(PValue) %>%
  slice_head(., n=15) %>%
  knitr::kable()

	hgnc_symbol	Gene_length	logFC	logCPM	F	PValue	FDR
DSP	DSP	45143	-6.907299	0.7967660	43.40111	0.0000047	0.0640497
MYOM2	MYOM2	120513	-3.473403	3.9798950	24.48981	0.0003267	0.9991389
MIR3064	MIR3064	66	1.654995	3.9692524	23.00357	0.0004167	0.9991389
KLRC2	KLRC2	15447	2.772390	0.7660329	20.22618	0.0006619	0.9991389
GATA2	GATA2	13759	-2.331792	4.4725489	20.10624	0.0007177	0.9991389
KLF4	KLF4	5631	-1.565611	4.9953269	19.05373	0.0008856	0.9991389
CPA3	CPA3	31941	-2.063480	3.0558447	18.32238	0.0010307	0.9991389
DEFA4	DEFA4	2517	2.182439	2.9653183	17.13226	0.0013157	0.9991389
KLRD1	KLRD1	103543	1.478543	5.9226770	16.85227	0.0014106	0.9991389
ATOH8	ATOH8	36723	-3.987656	1.9543972	15.62164	0.0015321	0.9991389
FLVCR1-AS1	FLVCR1-AS1	5981	-2.345739	1.5393108	16.34732	0.0015905	0.9991389
CRIP2	CRIP2	7209	-2.041907	4.2468983	16.17218	0.0016419	0.9991389
LIMS2	LIMS2	43406	-1.538278	3.2091761	15.26722	0.0020214	0.9991389
CLEC1A	CLEC1A	42074	2.208181	1.2225062	15.21198	0.0020652	0.9991389
N4BP2L2-IT2	N4BP2L2-IT2	4890	1.250707	3.7626413	15.15035	0.0020766	0.9991389

3.2 Camera VRC 312

Competitive gene set test accounting for inter-gene correlation between “protected” vs “not protected” VRC 312 Uses Camera: https://academic.oup.com/nar/article/40/17/e133/2411151

This is not in the manuscript, but included here as an alternative approach.

mymodules <- c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4")
cameratab_list <- vector("list", length(mymodules))
names(cameratab_list) <- mymodules
for(i in mymodules){
  url <- paste0("https://github.com/TranLab/ModuleLists/blob/main/", i, ".rds?raw=true")
  temp_list <- readRDS(url(url, method="libcurl"))
  temp_indices <- ids2indices(temp_list, y_VRC312$genes$hgnc_symbol)
  cameratab <- camera(y_VRC312, temp_indices, design, contrast =  c(rep(0,(ncol(design)-1)),1), inter.gene.cor=0.01)
  cameratab_list[[i]] <- cameratab %>%
    rownames_to_column(var = "module")
}
cameratab_bind_VRC312 <- bind_rows(cameratab_list, .id = "module_type") %>%
  mutate(study = "VRC312") %>%
  dplyr::select(study, module_type, module, everything())

Plot CAMERA results VRC 312

#plotting options
basetextsize <- 12 
myfont <- "Helvetica"
bubble_max_size <- 15

myCameraPlotDat_VRC312 <- cameratab_bind_VRC312 %>%
  filter(module_type %in% c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4"))  %>%
  #filter(!grepl("TBA", module)) %>%
  dplyr::select(study, module_type, module, NGenes, Direction, FDR) %>%
  mutate(neglogpadj = -log10(FDR)) %>%
  mutate(fdr_direction = ifelse(Direction == "Up", neglogpadj*1, neglogpadj*-1)) %>%
  mutate(module = gsub("gd", "γδ", module)) %>%
  mutate(module = gsub("Vd2", "Vδ2", module)) %>%
  mutate(module = gsub("Vg", "Vγ", module)) %>%
  mutate(module = gsub("HALLMARK_", "", module)) %>%
  mutate(module = gsub("_", " ", module)) %>%
  mutate(module = sub(".*?\\_", "", module)) %>%
  mutate(module = fct_reorder(module, fdr_direction, .desc = TRUE)) %>%
  filter(!grepl("TBD", module)) %>%
  mutate(module_type = factor(module_type, levels = c("highBTMs", "MonacoModules", "lowBTMs", "MSigDB_Hallmark_v7.4", "BloodGen3Module"))) %>%
  arrange(desc(neglogpadj)) %>% 
  droplevels() %>%
  dplyr::select(study, module_type, module, everything())

#saveRDS(myCameraPlotDat_VRC312, paste0(datadir, "VRC312_Camera_PvNP.rds"))

3.3 Apply GSEA VRC 312

Rank genes by -log10(PValue)*sign(logFC). Run fgsea from fgsea package using NamedGeneRankList2GseaTable helper function.

set.seed(23)
ranks_VRC312 <- TopTab_VRC312 %>% 
  mutate(rankmetric = -log10(.$PValue)*sign(.$logFC)) %>%
  dplyr::select(hgnc_symbol,rankmetric) %>% 
  na.omit() %>% 
  distinct() %>% 
  group_by(hgnc_symbol) %>%  
  dplyr::summarize(rankmetric = mean(rankmetric)) %>%
  arrange(desc(rankmetric)) %>%
  deframe()

#Run fgsea from fgsea package using NamedGeneRankList2GseaTable helper function
devtools::source_url("https://github.com/TranLab/ModuleLists/blob/main/NamedGeneRankList2GseaTable.R?raw=TRUE")
GSEAtab_VRC312 <- NamedGeneRankList2GseaTable(rankedgenes = ranks_VRC312,
                                       geneset = "all",
                                       output_directory = tempdir(),
                                       filename_prefix = "GSEA",
                                       sampleSize = 101,
                                       minSize = 20,
                                       maxSize = Inf,
                                       scoreType = "std") %>%
  as_tibble() %>%
  arrange(desc(NES)) %>% 
  mutate(study = "VRC312") %>%
  dplyr::select(study, module_type, pathway, ES, NES, size, leadingEdge, pval, padj) %>% 
  mutate(leadingEdge = gsub("^c\\(|\\)$", "", leadingEdge)) %>%
  mutate(leadingEdge = gsub('"', "", leadingEdge)) %>%
  arrange(padj)

## [1] "Running fgsea for MonacoModules signatures"
## [1] "Running fgsea for highBTMs signatures"
## [1] "Running fgsea for lowBTMs signatures"
## [1] "Running fgsea for BloodGen3Module signatures"
## [1] "Running fgsea for MSigDB_Hallmark_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_biocarta_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_kegg_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C8_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C7_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_bp_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_mf_v7.4 signatures"

#saveRDS(GSEAtab_VRC312, paste0(datadir, "VRC312_GSEA_PvNP_FDR5.rds"))

4 VRC 314 Analysis

Make DGElist object VRC 314

y_VRC314   <- DGEList(count = counts(hdat_314), genes= fData(hdat_314), samples = pData(hdat_314),group= hdat_314$Outcome.Final, remove.zeros=T)

Filter genes VRC 314

filter.type <- "-low_express_filter-"
dim(y_VRC314)

## [1] 17699    34

#Filter low expression tags: keep genes with at least 1 CPM in at least X, where X is the number of samples in the smallest group (exclude controls by using foo.dat above)
keep <- filterByExpr(y_VRC314, group = hdat_314$Outcome.Final)
y_VRC314 <- y_VRC314[keep, , keep.lib.sizes=FALSE]
dim(y_VRC314)

## [1] 12059    34

Normalization VRC 314

###############
y_VRC314$samples$lib.size <- colSums(y_VRC314$counts)
y_VRC314 <- calcNormFactors(y_VRC314)           #Normalization, use scale (TMM) here or quantile normalization later with voom, but not both
#reference for this here: https://support.bioconductor.org/p/77664/
dim(y_VRC314)

## [1] 12059    34

logcpm_314 <- cpm(y_VRC314, prior.count = 0.25,log=T)

Filter by library size VRC 314

Note that expression set already has been filtered.

y_VRC314 <- y_VRC314[,y_VRC314$samples$lib.size>7.5e6]

4.1 Generalized linear modeling VRC 314

Use edgeR with batch adjustment

# create design matrix according to Limma user guide (Ch 9.7 Multi Level Experiments) 
# http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf and
# http://permalink.gmane.org/gmane.science.biology.informatics.conductor/48432

Outcome <- y_VRC314$samples$Outcome.Final
Timept <- y_VRC314$samples$Dose.group
Sex <- factor(y_VRC314$samples$Gender)
Batch1  <-   factor(y_VRC314$samples$Lib_Batch)
Batch2  <-   factor(y_VRC314$samples$Date)
Batch   <-   as.factor(paste(as.character(Batch1), as.character(Batch2), sep="_"))
Group <- factor(y_VRC314$samples$Group)
Subject <-   y_VRC314$samples$Patient
Outcome_Timept <- factor(paste(Outcome,Timept,sep="_"))
design <- model.matrix(~Group + Sex + Batch + Outcome) # GROUP and BATCH corrected
rownames(design) <- Subject
colnames(design) <- gsub(":", "_", colnames(design))
colnames(design) <- gsub("\\.", "_", colnames(design))
colnames(design) <- gsub(", ", "_", colnames(design))
colnames(design) <- gsub(" \\+ ", "_", colnames(design))
colnames(design) <- gsub("CHMI 1", "CHMI1", colnames(design))
colnames(design) <- gsub(" ", "_", colnames(design))
colnames(design) <- gsub("Outcome_Timept", "", colnames(design))
colnames(design) <- gsub("Outcome", "", colnames(design))
colnames(design) <- gsub("Timept", "", colnames(design))
design <- design[,!(colSums(design) == 0)]          #remove columns with all zeros

# When running myGroup = Grp1456
# Design matrix not of full rank.  The following coefficients not estimable:
#  Batch4_Jun_27_2016 Batch5_March_23_2016 Batch5_Nov_24_2015
#for Grp
if(my_VRC314_Group == "Grp1456"){
  design <- design[,!colnames(design)%in%c("Batch4_Jun_27_2016", "Batch5_March_23_2016", "Batch5_Nov_24_2015")]
}
colnames(design)

##  [1] "(Intercept)"          "GroupGroup4"          "GroupGroup5"         
##  [4] "GroupGroup6"          "SexMale"              "Batch1_Jun_27_2016"  
##  [7] "Batch1_March_23_2016" "Batch2_Dec_14_2015"   "Batch2_Feb_15_2016"  
## [10] "Batch2_Jun_27_2016"   "Batch2_March_22_2016" "Batch2_March_23_2016"
## [13] "Batch3_Dec_14_2015"   "Batch3_Feb_15_2016"   "Batch3_March_22_2016"
## [16] "Batch3_March_23_2016" "Batch4_Feb_15_2016"   "Batch4_March_23_2016"
## [19] "Batch5_Feb_15_2016"   "Batch5_Jan_10_2017"   "Batch5_Nov_23_2015"  
## [22] "Protected"

Estimate Dispersions VRC 314

y_VRC314 <- estimateDisp(y_VRC314, design)

Perform differential gene expression VRC 314

fit <- glmQLFit(y_VRC314, design, robust = TRUE)
qlf <- glmQLFTest(fit, contrast = c(rep(0,(ncol(design)-1)),1))
TopTab_VRC314 <- topTags(qlf, n = nrow(y_VRC314))$table

Examine top 15 DEGs (protected vs not protected) VRC 314

VRC 314 DEGs

TopTab_VRC314 %>%
  arrange(PValue) %>%
  slice_head(., n=15) %>%
  knitr::kable()

	hgnc_symbol	Gene_length	logFC	logCPM	F	PValue	FDR
LSM3	LSM3	22762	1.2447685	2.0450349	21.01062	0.0002565	0.9998583
MRPL22	MRPL22	28342	0.8401138	2.0954516	19.75821	0.0003436	0.9998583
PKMYT1	PKMYT1	12516	1.3669605	0.9853327	18.97528	0.0004217	0.9998583
HMGB2	HMGB2	3431	0.5957184	5.9447479	18.09448	0.0005160	0.9998583
RSL24D1	RSL24D1	16262	0.8829434	4.1146620	18.01766	0.0005265	0.9998583
RPAP2	RPAP2	103092	0.6307832	2.6740872	17.63788	0.0005815	0.9998583
CETN3	CETN3	17526	1.3326930	0.5170825	16.61217	0.0006672	0.9998583
LRRN3	LRRN3	34449	0.5702500	5.0630690	15.88376	0.0009267	0.9998583
TNFRSF17	TNFRSF17	2962	1.0779445	2.0137156	15.80919	0.0009603	0.9998583
TOP2A	TOP2A	29435	1.1391423	2.0631915	15.40097	0.0010699	0.9998583
ERH	ERH	18505	0.4994195	4.9424513	15.36273	0.0010704	0.9998583
EIF3E	EIF3E	234118	0.5335506	5.5085325	14.86408	0.0012317	0.9998583
HAT1	HAT1	61235	0.6950987	3.7125304	14.62087	0.0013206	0.9998583
RPF1	RPF1	18532	0.4463541	4.3677015	14.53896	0.0013517	0.9998583
KIAA0391	KIAA0391	155769	0.4019658	4.9269749	14.40103	0.0014063	0.9998583

4.2 Camera VRC 314

Competitive gene set test accounting for inter-gene correlation between “protected” vs “not protected” VRC 314 Uses Camera: https://academic.oup.com/nar/article/40/17/e133/2411151

This is not in the manuscript, but included here as an alternative approach.

mymodules <- c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4")
cameratab_list <- vector("list", length(mymodules))
names(cameratab_list) <- mymodules
for(i in mymodules){
  url <- paste0("https://github.com/TranLab/ModuleLists/blob/main/", i, ".rds?raw=true")
  temp_list <- readRDS(url(url, method="libcurl"))
  temp_indices <- ids2indices(temp_list, y_VRC314$genes$hgnc_symbol)
  cameratab <- camera(y_VRC314, temp_indices, design, contrast =  c(rep(0,(ncol(design)-1)),1), inter.gene.cor=0.01)
  cameratab_list[[i]] <- cameratab %>%
    rownames_to_column(var = "module")
}
cameratab_bind_VRC314 <- bind_rows(cameratab_list, .id = "module_type") %>%
  mutate(study = "VRC314") %>%
  dplyr::select(study, module_type, module, everything())

Plot CAMERA results VRC 314

#plotting options
basetextsize <- 12 
myfont <- "Helvetica"
bubble_max_size <- 15

myCameraPlotDat_VRC314 <- cameratab_bind_VRC314 %>%
  filter(module_type %in% c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4"))  %>%
  #filter(!grepl("TBA", module)) %>%
  dplyr::select(study, module_type, module, NGenes, Direction, FDR) %>%
  mutate(neglogpadj = -log10(FDR)) %>%
  mutate(fdr_direction = ifelse(Direction == "Up", neglogpadj*1, neglogpadj*-1)) %>%
  mutate(module = gsub("gd", "γδ", module)) %>%
  mutate(module = gsub("Vd2", "Vδ2", module)) %>%
  mutate(module = gsub("Vg", "Vγ", module)) %>%
  mutate(module = gsub("HALLMARK_", "", module)) %>%
  mutate(module = gsub("_", " ", module)) %>%
  mutate(module = sub(".*?\\_", "", module)) %>%
  mutate(module = fct_reorder(module, fdr_direction, .desc = TRUE)) %>%
  filter(!grepl("TBD", module)) %>%
  mutate(module_type = factor(module_type, levels = c("highBTMs", "MonacoModules", "lowBTMs", "MSigDB_Hallmark_v7.4", "BloodGen3Module"))) %>%
  arrange(desc(neglogpadj)) %>% 
  droplevels() %>%
  dplyr::select(study, module_type, module, everything())

#saveRDS(myCameraPlotDat_VRC314, paste0(datadir, "VRC314_Camera_PvNP.rds"))

4.3 Apply GSEA VRC 314

Rank genes by -log10(PValue)*sign(logFC). Run fgsea from fgsea package using NamedGeneRankList2GseaTable helper function.

set.seed(23)
ranks_VRC314 <- TopTab_VRC314 %>% 
  mutate(rankmetric = -log10(.$PValue)*sign(.$logFC)) %>%
  dplyr::select(hgnc_symbol,rankmetric) %>% 
  na.omit() %>% 
  distinct() %>% 
  group_by(hgnc_symbol) %>%  
  dplyr::summarize(rankmetric = mean(rankmetric)) %>%
  arrange(desc(rankmetric)) %>%
  deframe()

#Run fgsea from fgsea package using NamedGeneRankList2GseaTable helper function
devtools::source_url("https://github.com/TranLab/ModuleLists/blob/main/NamedGeneRankList2GseaTable.R?raw=TRUE")
GSEAtab_VRC314 <- NamedGeneRankList2GseaTable(rankedgenes = ranks_VRC314,
                                       geneset = "all",
                                       output_directory = tempdir(),
                                       filename_prefix = "GSEA",
                                       sampleSize = 101,
                                       minSize = 20,
                                       maxSize = Inf,
                                       scoreType = "std") %>%
  as_tibble() %>%
  arrange(desc(NES)) %>%
  mutate(study = "VRC314") %>%
  dplyr::select(study, module_type, pathway, ES, NES, size, leadingEdge, pval, padj) %>% 
  mutate(leadingEdge = gsub("^c\\(|\\)$", "", leadingEdge)) %>%
  mutate(leadingEdge = gsub('"', "", leadingEdge)) %>%
  arrange(padj)

## [1] "Running fgsea for MonacoModules signatures"
## [1] "Running fgsea for highBTMs signatures"
## [1] "Running fgsea for lowBTMs signatures"
## [1] "Running fgsea for BloodGen3Module signatures"
## [1] "Running fgsea for MSigDB_Hallmark_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_biocarta_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_kegg_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C8_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C7_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_bp_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_mf_v7.4 signatures"

#Get BSPZV1 DEG table from googledrive
temp <- tempfile(fileext = ".rds")
dl <- googledrive::drive_download(
   googledrive::as_id("1NR0SKRw37ZUTnKfvdedCPWoyOv3aDECU"), path = temp, overwrite = TRUE)
TopTab_BSPZV1 <- readRDS(file = dl$local_path)

#Bind data from VRC312, VRC314, and BSPZV1 studies
TopTab_BSPZV1_VRC312_VRC314 <- bind_rows(TopTab_BSPZV1 %>%
                                           as_tibble() %>%
                                           dplyr::rename(hgnc_symbol = "gene_symbol") %>%
                                           dplyr::select(-c(ensembl_id)) %>%
                                           mutate(study = "BSPZV1"),
                                         TopTab_VRC312 %>%
                                           as_tibble() %>%
                                           dplyr::select(-c(Gene_length)) %>%
                                           mutate(study = "VRC312"),
                                         TopTab_VRC314 %>%
                                           as_tibble() %>%
                                           dplyr::select(-c(Gene_length)) %>%
                                           mutate(study = "VRC314")) %>%
  dplyr::select(study, everything())

5 Make VRC 312, VRC 314, BSPZV1 tables

study_tab <- data.frame(matrix(c("KPSZV1", "Kenya", "malaria-exposed infants","high dose","1,800,000 PfSPZ x 3", "24", "37",
                                 "VRC 312", "US", "malaria-naive adults","3a", "30,000 PfSPZ x 6", "3", "0",
                                 "", "", "", "3b", "30,000 PfSPZ x 4", "8", "1",
                                 "", "", "", "4a", "135,000 PfSPZ x 5", "0", "2",
                                 "", "", "", "4b", "135,000 PfSPZ x 5", "0", "4",
                                 "", "", "", "4c", "135,000 PfSPZ x 4", "3", "6",
                                 "", "", "", "All", "", "14", "13",
                                 "VRC 314", "US", "malaria-naive adults" , "1", "270,000 PfSPZ x 3", "1", "3",
                                 "", "", "", "4", "270,000 PfSPZ x 4", "0", "5",
                                 "", "", "", "5", "270,000 PfSPZ x 4", "5", "6",
                                 "", "", "", "6", "900,000 PfSPZ x 3", "5", "9",
                                 "", "", "", "All", "", "11", "23",
                                 "BSPZV1", "Tanzania", "malaria-exposed adults", "2", "135,000 PfSPZ x 5", "7", "4", 
                                 "", "", "", "3", "270,000 PfSPZ x 5", "7", "4",
                                 "", "", "", "All", "", "14", "8"),
                               nrow = 15, byrow = T))
colnames(study_tab) <- c("Study", "Country", "Malaria Exposure","Group","Regimen","NP","P")
#for the Neal et al study (PMID 35031601), 3 subjects in BSPZV1 group 1who were smear negative throughout but PCR positive were considered protected

study_tab_plot <- ggtexttable(study_tab,
                                      rows = NULL,
                                      theme = ttheme("classic",
                                                     base_size=8,
                                                     padding = unit(c(8, 2.5), "mm")))

study_tab_plot

6 Combine with BSPZV data and visualize GSEA data as bubble plot.

Filter based on padj < 0.10.

6.1 Code for GSEA bubble plot

#Get BSPZV1 GSEA data from googledrive
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1NF0IIJbFDMH_fjPnqS3yPRHb42ajXZug"), path = temp, overwrite = TRUE)
GSEAtab_BSPZV1 <- readRDS(file = dl$local_path)

#Get KSPZV1 GSEA data from googledrive
temp <- tempfile(fileext = ".xlsx")
dl <- drive_download(
  as_id("1VsKSiDGfyPMrJoVFCNhkiffzkXBb41IE"), path = temp, overwrite = TRUE)
GSEAtab_KSPZV1 <- readxl::read_excel(path = dl$local_path) %>%
  filter(treatment == "1.8 x 10^6 PfSPZ") %>% #use only high-dose group for comparison
  mutate(study = "KSPZV1") %>%
  mutate(neglogpadj = -log10(`BH-adj p value (FDR)`)) %>%
  dplyr::select(colnames(GSEAtab_BSPZV1)) %>%
  mutate(pathway = gsub("HALLMARK_", "", pathway)) %>% #clean up pathways to match other dataframes
  mutate(pathway = gsub("\\_", " ", pathway)) #clean up pathways to match other dataframes

GSEA_KPSZV1_VRC312_VRC314_BPSZV1 <- bind_rows(GSEAtab_VRC312,
                     GSEAtab_VRC314) %>%
  filter(module_type %in% c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4"))  %>%
  filter(!grepl("TBA", pathway)) %>%
  dplyr::select(study, module_type, pathway, leadingEdge, size, NES, padj) %>%
  mutate(neglogpadj = -log10(padj)) %>%
  dplyr::rename("BH-adj p value (FDR)" = "padj") %>%
  mutate(pathway = gsub("gd", "γδ", pathway)) %>%
  mutate(pathway = gsub("Vd2", "Vδ2", pathway)) %>%
  mutate(pathway = gsub("Vg", "Vγ", pathway)) %>%
  mutate(pathway = gsub("HALLMARK_", "", pathway)) %>%
  mutate(pathway = gsub("_", " ", pathway)) %>%
  mutate(pathway = sub(".*?\\_", "", pathway)) %>%
  filter(!grepl("TBD", pathway)) %>%
  mutate(module_type = factor(module_type, levels = c("highBTMs", "MonacoModules", "lowBTMs", "MSigDB_Hallmark_v7.4", "BloodGen3Module"))) %>%
  droplevels() %>%
  bind_rows(.,GSEAtab_BSPZV1, GSEAtab_KSPZV1) %>%
  mutate(study = factor(study, levels = c("KSPZV1","VRC312", "VRC314", "BSPZV1"))) %>%
  arrange(study, module_type, desc(neglogpadj)) %>%
  group_by(module_type, study) %>%
  mutate(pathway = fct_reorder(pathway, NES, .desc = TRUE)) %>%
  ungroup() %>%
  filter(!grepl("TBD", pathway))

#plotting options
basetextsize <- 8.5  
myfont <- "sans"
bubble_max_size <- 9
fdr_threshold <- 0.1

common_pathways <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
  filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
  group_by(module_type, pathway) %>%
  dplyr::select(pathway) %>%
  dplyr::summarize(n=n()) %>%
  filter(n>1)

GSEA_KPSZV1_VRC312_VRC314_BPSZV1 <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
  filter(pathway %in% common_pathways$pathway)

TopPlot <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
  filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
  filter(module_type %in% c("highBTMs", "MonacoModules"))  %>%
  ggplot(., aes(x = study, y = pathway)) +
  geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
  scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1,5,10,20,30,40), limits = c(1,42)) +
      scale_fill_gradient2(low = "blue",
                           mid = "white",
                           high = "red") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.background = element_blank(),
        strip.text = element_text(size=5.5),
        legend.position = "right",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    theme(plot.margin = unit(c(1, 0.5, 1, 0.5), "cm")) +
facet_wrap(~module_type, scales = "free_y", ncol =2)

TopPlot <- ggarrange(study_tab_plot ,addSmallLegend(TopPlot), widths = c(1,1))

Bottom_Left_Plot <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
  filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
  filter(module_type %in% c("lowBTMs"))  %>%
  ggplot(., aes(x = study, y = pathway)) +
  geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
  scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1,20,40), limits = c(1,42)) +
      scale_fill_gradient2(low = "blue",
                           mid = "white",
                           high = "red") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.text = element_text(size=5.5),
        strip.background = element_blank(),
        legend.position = "right",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    theme(plot.margin = unit(c(1, 0.5, 5, 0.5), "cm")) +
facet_wrap(~module_type, scales = "free_y", nrow =2)

Bottom_Middle_Plot <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
  filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
  filter(module_type %in% c("MSigDB_Hallmark_v7.4"))  %>%
  ggplot(., aes(x = study, y = pathway)) +
  geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
  scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1,20,40), limits = c(1,42)) +
      scale_fill_gradient2(low = "blue",
                           mid = "white",
                           high = "red") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.text = element_text(size=5.5),
        strip.background = element_blank(),
        legend.position = "right",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    theme(plot.margin = unit(c(1, 0.75, 16, 0.75), "cm")) +
facet_wrap(~module_type, scales = "free_y", nrow =2)


Bottom_Right_Plot <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
  filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
  filter(module_type %in% c("BloodGen3Module"))  %>%
  ggplot(., aes(x = study, y = pathway)) +
  geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
  scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1,20,40), limits = c(1,42)) +
      scale_fill_gradient2(low = "blue",
                           mid = "white",
                           high = "red") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.text = element_text(size=5.5),
        strip.background = element_blank(),
        legend.position = "right",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))  +
    theme(plot.margin = unit(c(1, 1.3, 0.5, 1.3), "cm")) +
facet_wrap(~module_type, scales = "free_y", ncol =2)

bottom_panels <- ggarrange(addSmallLegend(Bottom_Left_Plot), addSmallLegend(Bottom_Middle_Plot), addSmallLegend(Bottom_Right_Plot),
                           widths = c(1.1, 1, 1),
                           ncol = 3,
                           common.legend = TRUE)

6.2 Plot GSEA bubbleplots (Figures S4B-C)

Red: enriched in protected/uninfected.
Blue: enriched in not protected/infected.
Only modules with BH-adjusted p value (FDR) < 0.10 in at least two studies are shown.

ggarrange(addSmallLegend(TopPlot), bottom_panels, common.legend = TRUE, nrow = 2, heights = c(1,3))

7 Combine KSPZV1, VRC 312, VRC214, BSPZV data and visualize Camera data as bubble plot.

7.1 Code for Camera bubble plot

#Get BSPZV1 Camera data from googledrive
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1NWQVtjpDP5Qjtee4aezR0fSywXpEn5fR"), path = temp, overwrite = TRUE)
myCameraPlotDat_BSPZV1 <- readRDS(file = dl$local_path)

temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1DI6Xvqp5emAs_u_SNZxyxnuPkcxkDRxq"), path = temp, overwrite = TRUE)
myCameraPlotDat_VRC312 <- readRDS(file = dl$local_path)

temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1DIaX2KI9fBuKUIP-rbdKiOr5gwzZG39D"), path = temp, overwrite = TRUE)
myCameraPlotDat_VRC314 <- readRDS(file = dl$local_path)

temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1DHHoRME8ljljACDmtzyGm8tgt0OajxtS"), path = temp, overwrite = TRUE)
myCameraPlotDat_KSPZV1 <- readRDS(file = dl$local_path)

Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 <- bind_rows(myCameraPlotDat_BSPZV1, myCameraPlotDat_VRC312, myCameraPlotDat_VRC314, myCameraPlotDat_KSPZV1)

fdr_threshold <- 0.1

common_pathways <- Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 %>%
  filter(FDR < fdr_threshold) %>%
  group_by(module_type, module) %>%
  dplyr::select(module) %>%
  dplyr::summarize(n=n()) %>%
  filter(n>1)

Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 <- Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 %>%
  filter(module %in% common_pathways$module)

plotDat_Camera <- Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 %>%
  mutate(study = factor(study, levels = c("KSPZV1","VRC312", "VRC314", "BSPZV1"))) %>%
  mutate(enrichment = ifelse(Direction == "Up",
                           "up in protected/uninfected",
                           "up in not protected/infected")) %>%
  filter(FDR < fdr_threshold) %>%
  filter(!grepl("TBA", module)) %>%
  mutate(neglogFDR = -log10(FDR)) %>%
  mutate(module = gsub("gd", "γδ", module)) %>%
  mutate(module = gsub("Vd2", "Vδ2", module)) %>%
  mutate(module = gsub("Vg", "Vγ", module)) %>%
  mutate(module = gsub("HALLMARK_", "", module)) %>%
  mutate(module = gsub("_", " ", module)) %>%
  mutate(module = sub(".*?\\_", "", module)) %>%
  filter(!grepl("TBD", module)) %>%
  arrange(desc(neglogFDR)) %>%
  group_by(study, module_type) %>%
  mutate(module = fct_reorder(module, neglogFDR, .desc = FALSE)) %>%
  ungroup() %>% 
  droplevels()

#plotting options
basetextsize <- 7.5  
myfont <- "Helvetica"
bubble_max_size <- 6

camera_hibtm_monaco_plot <- plotDat_Camera %>%
  filter(module_type %in% c("highBTMs", "MonacoModules")) %>%
  mutate(module_type = factor(module_type,
                              levels = c("highBTMs", "MonacoModules"),
                              labels = c("high BTMs", "Monaco"))) %>%
  ggplot(., aes(x = study, y = module, fill = enrichment)) +
  geom_point(aes(size=neglogFDR), shape=21, stroke = 0.25, alpha = 0.6) +
  scale_size_area(name = expression(-log[10]~FDR), max_size = bubble_max_size) +
  scale_fill_manual(values = c("blue","red")) +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.text = element_text(size = 5.5),
        strip.background = element_blank(),
        legend.position = "none",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  facet_wrap(~module_type, scales = "free_y", nrow = 3)

module_summary_lowbtms <- plotDat_Camera %>%
  group_by(module_type, module) %>%
  dplyr::summarize(n = n()) %>%
  filter(module_type == "lowBTMs" & n>1) %>%
  arrange(n)

camera_lowbtm_plot <- plotDat_Camera %>%
  filter(module_type %in% c("lowBTMs")) %>%
  filter(module %in% module_summary_lowbtms$module) %>%
  ggplot(., aes(x = study, y = module, fill = enrichment)) +
  geom_point(aes(size=neglogFDR), shape=21, stroke = 0.25, alpha = 0.6) +
  scale_size_area(name = expression(-log[10]~FDR), max_size = bubble_max_size) +
  scale_fill_manual(values = c("blue","red")) +
  #ggtitle("low BTMs") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.text = element_text(size = 5.5),
        legend.position = "right",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  facet_wrap(~module_type)

module_summary_BloodGen3Module <- plotDat_Camera %>%
  group_by(module_type, module) %>%
  dplyr::summarize(n = n()) %>%
  filter(module_type == "BloodGen3Module" & n>1) %>%
  arrange(n)

camera_bloodgen3_plot <- plotDat_Camera %>%
  filter(module_type %in% c("BloodGen3Module")) %>%
  filter(module %in% module_summary_BloodGen3Module$module) %>%
  ggplot(., aes(x = study, y = module, fill = enrichment)) +
  geom_point(aes(size=neglogFDR), shape=21, stroke = 0.25, alpha = 0.6) +
  scale_size_area(name = expression(-log[10]~FDR), max_size = bubble_max_size) +
  scale_fill_manual(values = c("blue","red")) +
  #ggtitle("low BTMs") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.text = element_text(size = 5.5),
        legend.position = "right",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  facet_wrap(~module_type)

7.2 Plot Camera bubbleplots

Red: enriched in protected/uninfected Blue: enriched in not protected/infected

Only modules with FDR<10% in at least 2 studies are shown.

first_col_plot <- ggpubr::ggarrange(camera_hibtm_monaco_plot, NULL, nrow = 2, heights = c(2.25,3))
ggpubr::ggarrange(first_col_plot, ggplot() + theme_void(), camera_lowbtm_plot, ggplot() + theme_void(), camera_bloodgen3_plot,
                  ncol = 5, widths = c(0.77, -0.175, 1, -0.175, 0.78), common.legend = TRUE)

PfSPZ DGE on VRC 312 & VRC 314 Datasets, then combine with KSPZV1 and BSPZV1 data

Tuan M. Tran

04 September, 2024