This markdown doc creates Seq (count) and cpm expression sets for VRC 312 and VRC 314 data, does differential gene expression with edgeR, and outputs to fast GSEA using several different module types (BTMs, MSigDB collections, MonacoMods). Generates fgsea output tables that can be visualized using the “PfSPZ Import VRC 312 314 GSEA IPA for Figures.Rmd” script in “/Volumes/GoogleDrive/My Drive/Tran Lab Shared/Projects/Doris Duke PfSPZ Kenya/Tuan PfSPZ/KenyaPfSPZ/” or on the KSPZV1 github repo.
Here, the VRC312 and VRC314 data is combined with BSPZV1 data from another script.
Notes:
Run DGE, Camera, pre-ranked fastGSEA for baseline VRC 312 and VRC 314 datasets.
library(doBy)
library(Biobase)
library(biomaRt)
library(edgeR)
library(genefilter)
library(NMF)
library(EDASeq)
library(tweeDEseqCountData)
library(reshape2)
library(tidyverse)
library(googledrive)
library(Microsoft365R)
library(ggplot2)
library(ggpubr)
#Add small legend helper function
addSmallLegend <- function(myPlot, pointSize = 4, textSize = 6, spaceLegend = 1) {
myPlot +
guides(shape = guide_legend(override.aes = list(size = pointSize)),
color = guide_legend(override.aes = list(size = pointSize))) +
theme(legend.title = element_text(size = textSize),
legend.text = element_text(size = textSize),
legend.key.size = unit(spaceLegend, "lines"))
}
Download from google drive.
#VRC 312
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
as_id("1X139jUTL4lAELEA3eBTZFx2hQgmOPFSZ"), path = temp, overwrite = TRUE)
hdat_312 <- readRDS(file = dl$local_path)
#VRC 314
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
as_id("1VXTgNSQuWrsA5E-3OgOgJ-AzMZpL4afo"), path = temp, overwrite = TRUE)
hdat_314 <- readRDS(file = dl$local_path)
Set options
myTIMEPOINT <- "Baseline" #"Baseline"
my_VRC312_Group <- "Grp3a3b4a4b4c"
if(my_VRC312_Group == "Grp3a3b4a4b4c"){
my_VRC312_Groups <- c("3a","3b", "4a","4b", "4c")
}
my_VRC314_Group <- "Grp1456" #used for final analysis for reasons stated above
PCT <- 100 #used 100 PCT for final analysis
if(my_VRC314_Group == "Grp134567"){
my_VRC314_Groups <- c("Group1","Group3", "Group4","Group5", "Group6", "Group7") # "Group6" #Group2 is IM
}
if(my_VRC314_Group == "Grp567"){
my_VRC314_Groups <- c("Group5", "Group6","Group7")
}
if(my_VRC314_Group == "Grp1456"){
my_VRC314_Groups <- c("Group1", "Group4", "Group5", "Group6")
}
Reduce datasets to relevant samples
hdat_312 <- hdat_312[,which(hdat_312$Dose.group != "EXT")]
hdat_312$Dose.group <- droplevels(hdat_312$Dose.group)
#remove outcome = N/A
hdat_312 <- hdat_312[,!is.na(hdat_312$Outcome.Final)]
hdat_312$Outcome.Final <- droplevels(hdat_312$Outcome.Final)
foo <- hdat_312[,hdat_312$Patient == 331 & grepl("Imm5",hdat_312$Dose.group)] #Remove 331 Imm5 (failed QC)
hdat_312 <- hdat_312[,setdiff(colnames(hdat_312),colnames(foo))]
hdat_312 <- hdat_312[,which(grepl(myTIMEPOINT, hdat_312$Dose.group))]
hdat_312 <- hdat_312[,which(hdat_312$Outcome.Final != "N/A")]
hdat_312$Dose.group <- droplevels(hdat_312$Dose.group)
hdat_312$Outcome.Final <- factor(hdat_312$Outcome.Final)
hdat_312$Outcome.Final <- droplevels(hdat_312$Outcome.Final)
#Filter on Group
hdat_312 <- hdat_312[,which(hdat_312$Group %in% my_VRC312_Groups)]
hdat_312$Group <- factor(hdat_312$Group)
hdat_312$Group <- droplevels(hdat_312$Group)
table(hdat_312$Group,hdat_312$Outcome.Final)
##
## Malaria Protected
## 3a 3 0
## 3b 8 1
## 4a 0 2
## 4b 0 4
## 4c 3 6
table(hdat_312$Dose.group,hdat_312$Outcome.Final)
##
## Malaria Protected
## Baseline. D01 14 13
##
## Malaria Protected
## Group1 1 3
## Group4 0 5
## Group5 5 6
## Group6 5 9
##
## Malaria Protected
## Baseline 11 23
Make DGElist object VRC 312
# Make DGElist object
y_VRC312 <- DGEList(count = counts(hdat_312), genes= fData(hdat_312), samples = pData(hdat_312), group= hdat_312$Group, remove.zeros=T)
Filter genes VRC 312
filter.type <- "-low_express_filter-"
dim(y_VRC312)
## [1] 18580 27
#Filter low expression tags: keep genes with at least 1 CPM in at least X, where X is the number of samples in the smallest group (exclude controls by using foo.dat above)
keep <- filterByExpr(y_VRC312)
y_VRC312 <- y_VRC312[keep, , keep.lib.sizes=FALSE]
dim(y_VRC312)
## [1] 13713 27
Normalization VRC 312
y_VRC312$samples$lib.size <- colSums(y_VRC312$counts)
y_VRC312 <- calcNormFactors(y_VRC312) #Normalization, use scale (TMM) here or quantile normalization later with voom, but not both
#reference for this here: https://support.bioconductor.org/p/77664/
dim(y_VRC312)
## [1] 13713 27
Using edgeR with batch adjustment
# create design matrix according to Limma user guide (Ch 9.7 Multi Level Experiments)
# http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf and
# http://permalink.gmane.org/gmane.science.biology.informatics.conductor/48432
Group <- factor(y_VRC312$samples$Group)
Outcome <- y_VRC312$sample$Outcome.Final
Timept <- y_VRC312$sample$Dose.group
Dose <- factor(y_VRC312$sample$Dose)
Batch1 <- factor(y_VRC312$sample$BatchNumber)
Batch2 <- factor(y_VRC312$sample$Date.Extracted)
Batch <- as.factor(paste(as.character(Batch1), as.character(Batch2), sep="_"))
Subject <- as.factor(as.character(y_VRC312$sample$Patient))
Gender <- as.factor(as.character(y_VRC312$sample$Gender))
Outcome_Timept <- factor(paste(Outcome,Timept,sep="_"))
#design <- model.matrix(~0 + Outcome + Batch1 + Batch2) # batch corrected; baseline only
design <- model.matrix(~Gender + Group + Batch1 + Batch2 + Outcome) # dose adjusted; batch corrected; edgeR
#design <- model.matrix(~0 + Outcome_Timept + Dose + Batch1 + Batch2) # batch corrected; dose adjusted; use limma pairing method
rownames(design) <- Subject
colnames(design) <- gsub(":", "_", colnames(design))
colnames(design) <- gsub(". ", "_", colnames(design))
colnames(design) <- gsub("/", "_", colnames(design))
colnames(design) <- gsub("Outcome_Timept", "", colnames(design))
colnames(design) <- gsub("Outcome", "", colnames(design))
colnames(design) <- gsub("Timept", "", colnames(design))
colnames(design) <- gsub("_Dose", "", colnames(design))
design <- design[,!(colSums(design) == 0)] #remove columns with all zeros
##################################
#Remove Columns not of Full Rank #
##################################
#Error in glmFit.default(sely, design, offset = seloffset, dispersion = 0.05, : Design matrix not of full rank. The following coefficients not estimable: Batch28_18_14 Batch28_6_14
design <- design[,colnames(design)!="Batch28_18_14"]
design <- design[,colnames(design)!="Batch28_6_14"]
colnames(design)
## [1] "(Intercept)" "GenderMale" "Group3b" "Group4a"
## [5] "Group4b" "Group4c" "Batch12" "Batch13"
## [9] "Batch14" "Batch15" "Batch16" "Batch17"
## [13] "Batch18" "Batch19" "Batch27_28_14" "Batch27_30_14"
## [17] "Batch28_11_14" "Batch28_12_14" "Batch28_14_14" "Batch28_4_14"
## [21] "Protected"
Estimate Dispersions VRC 312
y_VRC312 <- estimateDisp(y_VRC312,design)
Perform differential gene expression VRC 312
fit <- glmQLFit(y_VRC312, design, robust = TRUE)
qlf <- glmQLFTest(fit, contrast = c(rep(0,(ncol(design)-1)),1))
TopTab_VRC312 <- topTags(qlf, n = nrow(y_VRC312))$table
Examine top 15 DEGs (protected vs not protected) VRC 312
VRC 312 DEGs
TopTab_VRC312 %>%
arrange(PValue) %>%
slice_head(., n=15) %>%
knitr::kable()
| hgnc_symbol | Gene_length | logFC | logCPM | F | PValue | FDR | |
|---|---|---|---|---|---|---|---|
| DSP | DSP | 45143 | -6.907299 | 0.7967660 | 43.40111 | 0.0000047 | 0.0640497 |
| MYOM2 | MYOM2 | 120513 | -3.473403 | 3.9798950 | 24.48981 | 0.0003267 | 0.9991389 |
| MIR3064 | MIR3064 | 66 | 1.654995 | 3.9692524 | 23.00357 | 0.0004167 | 0.9991389 |
| KLRC2 | KLRC2 | 15447 | 2.772390 | 0.7660329 | 20.22618 | 0.0006619 | 0.9991389 |
| GATA2 | GATA2 | 13759 | -2.331792 | 4.4725489 | 20.10624 | 0.0007177 | 0.9991389 |
| KLF4 | KLF4 | 5631 | -1.565611 | 4.9953269 | 19.05373 | 0.0008856 | 0.9991389 |
| CPA3 | CPA3 | 31941 | -2.063480 | 3.0558447 | 18.32238 | 0.0010307 | 0.9991389 |
| DEFA4 | DEFA4 | 2517 | 2.182439 | 2.9653183 | 17.13226 | 0.0013157 | 0.9991389 |
| KLRD1 | KLRD1 | 103543 | 1.478543 | 5.9226770 | 16.85227 | 0.0014106 | 0.9991389 |
| ATOH8 | ATOH8 | 36723 | -3.987656 | 1.9543972 | 15.62164 | 0.0015321 | 0.9991389 |
| FLVCR1-AS1 | FLVCR1-AS1 | 5981 | -2.345739 | 1.5393108 | 16.34732 | 0.0015905 | 0.9991389 |
| CRIP2 | CRIP2 | 7209 | -2.041907 | 4.2468983 | 16.17218 | 0.0016419 | 0.9991389 |
| LIMS2 | LIMS2 | 43406 | -1.538278 | 3.2091761 | 15.26722 | 0.0020214 | 0.9991389 |
| CLEC1A | CLEC1A | 42074 | 2.208181 | 1.2225062 | 15.21198 | 0.0020652 | 0.9991389 |
| N4BP2L2-IT2 | N4BP2L2-IT2 | 4890 | 1.250707 | 3.7626413 | 15.15035 | 0.0020766 | 0.9991389 |
Competitive gene set test accounting for inter-gene correlation between “protected” vs “not protected” VRC 312 Uses Camera: https://academic.oup.com/nar/article/40/17/e133/2411151
This is not in the manuscript, but included here as an alternative approach.
mymodules <- c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4")
cameratab_list <- vector("list", length(mymodules))
names(cameratab_list) <- mymodules
for(i in mymodules){
url <- paste0("https://github.com/TranLab/ModuleLists/blob/main/", i, ".rds?raw=true")
temp_list <- readRDS(url(url, method="libcurl"))
temp_indices <- ids2indices(temp_list, y_VRC312$genes$hgnc_symbol)
cameratab <- camera(y_VRC312, temp_indices, design, contrast = c(rep(0,(ncol(design)-1)),1), inter.gene.cor=0.01)
cameratab_list[[i]] <- cameratab %>%
rownames_to_column(var = "module")
}
cameratab_bind_VRC312 <- bind_rows(cameratab_list, .id = "module_type") %>%
mutate(study = "VRC312") %>%
dplyr::select(study, module_type, module, everything())
Plot CAMERA results VRC 312
#plotting options
basetextsize <- 12
myfont <- "Helvetica"
bubble_max_size <- 15
myCameraPlotDat_VRC312 <- cameratab_bind_VRC312 %>%
filter(module_type %in% c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4")) %>%
#filter(!grepl("TBA", module)) %>%
dplyr::select(study, module_type, module, NGenes, Direction, FDR) %>%
mutate(neglogpadj = -log10(FDR)) %>%
mutate(fdr_direction = ifelse(Direction == "Up", neglogpadj*1, neglogpadj*-1)) %>%
mutate(module = gsub("gd", "γδ", module)) %>%
mutate(module = gsub("Vd2", "Vδ2", module)) %>%
mutate(module = gsub("Vg", "Vγ", module)) %>%
mutate(module = gsub("HALLMARK_", "", module)) %>%
mutate(module = gsub("_", " ", module)) %>%
mutate(module = sub(".*?\\_", "", module)) %>%
mutate(module = fct_reorder(module, fdr_direction, .desc = TRUE)) %>%
filter(!grepl("TBD", module)) %>%
mutate(module_type = factor(module_type, levels = c("highBTMs", "MonacoModules", "lowBTMs", "MSigDB_Hallmark_v7.4", "BloodGen3Module"))) %>%
arrange(desc(neglogpadj)) %>%
droplevels() %>%
dplyr::select(study, module_type, module, everything())
#saveRDS(myCameraPlotDat_VRC312, paste0(datadir, "VRC312_Camera_PvNP.rds"))
Rank genes by -log10(PValue)*sign(logFC). Run fgsea from fgsea package using NamedGeneRankList2GseaTable helper function.
set.seed(23)
ranks_VRC312 <- TopTab_VRC312 %>%
mutate(rankmetric = -log10(.$PValue)*sign(.$logFC)) %>%
dplyr::select(hgnc_symbol,rankmetric) %>%
na.omit() %>%
distinct() %>%
group_by(hgnc_symbol) %>%
dplyr::summarize(rankmetric = mean(rankmetric)) %>%
arrange(desc(rankmetric)) %>%
deframe()
#Run fgsea from fgsea package using NamedGeneRankList2GseaTable helper function
devtools::source_url("https://github.com/TranLab/ModuleLists/blob/main/NamedGeneRankList2GseaTable.R?raw=TRUE")
GSEAtab_VRC312 <- NamedGeneRankList2GseaTable(rankedgenes = ranks_VRC312,
geneset = "all",
output_directory = tempdir(),
filename_prefix = "GSEA",
sampleSize = 101,
minSize = 20,
maxSize = Inf,
scoreType = "std") %>%
as_tibble() %>%
arrange(desc(NES)) %>%
mutate(study = "VRC312") %>%
dplyr::select(study, module_type, pathway, ES, NES, size, leadingEdge, pval, padj) %>%
mutate(leadingEdge = gsub("^c\\(|\\)$", "", leadingEdge)) %>%
mutate(leadingEdge = gsub('"', "", leadingEdge)) %>%
arrange(padj)
## [1] "Running fgsea for MonacoModules signatures"
## [1] "Running fgsea for highBTMs signatures"
## [1] "Running fgsea for lowBTMs signatures"
## [1] "Running fgsea for BloodGen3Module signatures"
## [1] "Running fgsea for MSigDB_Hallmark_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_biocarta_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_kegg_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C8_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C7_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_bp_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_mf_v7.4 signatures"
#saveRDS(GSEAtab_VRC312, paste0(datadir, "VRC312_GSEA_PvNP_FDR5.rds"))
Make DGElist object VRC 314
y_VRC314 <- DGEList(count = counts(hdat_314), genes= fData(hdat_314), samples = pData(hdat_314),group= hdat_314$Outcome.Final, remove.zeros=T)
Filter genes VRC 314
filter.type <- "-low_express_filter-"
dim(y_VRC314)
## [1] 17699 34
#Filter low expression tags: keep genes with at least 1 CPM in at least X, where X is the number of samples in the smallest group (exclude controls by using foo.dat above)
keep <- filterByExpr(y_VRC314, group = hdat_314$Outcome.Final)
y_VRC314 <- y_VRC314[keep, , keep.lib.sizes=FALSE]
dim(y_VRC314)
## [1] 12059 34
Normalization VRC 314
###############
y_VRC314$samples$lib.size <- colSums(y_VRC314$counts)
y_VRC314 <- calcNormFactors(y_VRC314) #Normalization, use scale (TMM) here or quantile normalization later with voom, but not both
#reference for this here: https://support.bioconductor.org/p/77664/
dim(y_VRC314)
## [1] 12059 34
logcpm_314 <- cpm(y_VRC314, prior.count = 0.25,log=T)
Filter by library size VRC 314
Note that expression set already has been filtered.
y_VRC314 <- y_VRC314[,y_VRC314$samples$lib.size>7.5e6]
Use edgeR with batch adjustment
# create design matrix according to Limma user guide (Ch 9.7 Multi Level Experiments)
# http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf and
# http://permalink.gmane.org/gmane.science.biology.informatics.conductor/48432
Outcome <- y_VRC314$samples$Outcome.Final
Timept <- y_VRC314$samples$Dose.group
Sex <- factor(y_VRC314$samples$Gender)
Batch1 <- factor(y_VRC314$samples$Lib_Batch)
Batch2 <- factor(y_VRC314$samples$Date)
Batch <- as.factor(paste(as.character(Batch1), as.character(Batch2), sep="_"))
Group <- factor(y_VRC314$samples$Group)
Subject <- y_VRC314$samples$Patient
Outcome_Timept <- factor(paste(Outcome,Timept,sep="_"))
design <- model.matrix(~Group + Sex + Batch + Outcome) # GROUP and BATCH corrected
rownames(design) <- Subject
colnames(design) <- gsub(":", "_", colnames(design))
colnames(design) <- gsub("\\.", "_", colnames(design))
colnames(design) <- gsub(", ", "_", colnames(design))
colnames(design) <- gsub(" \\+ ", "_", colnames(design))
colnames(design) <- gsub("CHMI 1", "CHMI1", colnames(design))
colnames(design) <- gsub(" ", "_", colnames(design))
colnames(design) <- gsub("Outcome_Timept", "", colnames(design))
colnames(design) <- gsub("Outcome", "", colnames(design))
colnames(design) <- gsub("Timept", "", colnames(design))
design <- design[,!(colSums(design) == 0)] #remove columns with all zeros
# When running myGroup = Grp1456
# Design matrix not of full rank. The following coefficients not estimable:
# Batch4_Jun_27_2016 Batch5_March_23_2016 Batch5_Nov_24_2015
#for Grp
if(my_VRC314_Group == "Grp1456"){
design <- design[,!colnames(design)%in%c("Batch4_Jun_27_2016", "Batch5_March_23_2016", "Batch5_Nov_24_2015")]
}
colnames(design)
## [1] "(Intercept)" "GroupGroup4" "GroupGroup5"
## [4] "GroupGroup6" "SexMale" "Batch1_Jun_27_2016"
## [7] "Batch1_March_23_2016" "Batch2_Dec_14_2015" "Batch2_Feb_15_2016"
## [10] "Batch2_Jun_27_2016" "Batch2_March_22_2016" "Batch2_March_23_2016"
## [13] "Batch3_Dec_14_2015" "Batch3_Feb_15_2016" "Batch3_March_22_2016"
## [16] "Batch3_March_23_2016" "Batch4_Feb_15_2016" "Batch4_March_23_2016"
## [19] "Batch5_Feb_15_2016" "Batch5_Jan_10_2017" "Batch5_Nov_23_2015"
## [22] "Protected"
Estimate Dispersions VRC 314
y_VRC314 <- estimateDisp(y_VRC314, design)
Perform differential gene expression VRC 314
fit <- glmQLFit(y_VRC314, design, robust = TRUE)
qlf <- glmQLFTest(fit, contrast = c(rep(0,(ncol(design)-1)),1))
TopTab_VRC314 <- topTags(qlf, n = nrow(y_VRC314))$table
Examine top 15 DEGs (protected vs not protected) VRC 314
VRC 314 DEGs
TopTab_VRC314 %>%
arrange(PValue) %>%
slice_head(., n=15) %>%
knitr::kable()
| hgnc_symbol | Gene_length | logFC | logCPM | F | PValue | FDR | |
|---|---|---|---|---|---|---|---|
| LSM3 | LSM3 | 22762 | 1.2447685 | 2.0450349 | 21.01062 | 0.0002565 | 0.9998583 |
| MRPL22 | MRPL22 | 28342 | 0.8401138 | 2.0954516 | 19.75821 | 0.0003436 | 0.9998583 |
| PKMYT1 | PKMYT1 | 12516 | 1.3669605 | 0.9853327 | 18.97528 | 0.0004217 | 0.9998583 |
| HMGB2 | HMGB2 | 3431 | 0.5957184 | 5.9447479 | 18.09448 | 0.0005160 | 0.9998583 |
| RSL24D1 | RSL24D1 | 16262 | 0.8829434 | 4.1146620 | 18.01766 | 0.0005265 | 0.9998583 |
| RPAP2 | RPAP2 | 103092 | 0.6307832 | 2.6740872 | 17.63788 | 0.0005815 | 0.9998583 |
| CETN3 | CETN3 | 17526 | 1.3326930 | 0.5170825 | 16.61217 | 0.0006672 | 0.9998583 |
| LRRN3 | LRRN3 | 34449 | 0.5702500 | 5.0630690 | 15.88376 | 0.0009267 | 0.9998583 |
| TNFRSF17 | TNFRSF17 | 2962 | 1.0779445 | 2.0137156 | 15.80919 | 0.0009603 | 0.9998583 |
| TOP2A | TOP2A | 29435 | 1.1391423 | 2.0631915 | 15.40097 | 0.0010699 | 0.9998583 |
| ERH | ERH | 18505 | 0.4994195 | 4.9424513 | 15.36273 | 0.0010704 | 0.9998583 |
| EIF3E | EIF3E | 234118 | 0.5335506 | 5.5085325 | 14.86408 | 0.0012317 | 0.9998583 |
| HAT1 | HAT1 | 61235 | 0.6950987 | 3.7125304 | 14.62087 | 0.0013206 | 0.9998583 |
| RPF1 | RPF1 | 18532 | 0.4463541 | 4.3677015 | 14.53896 | 0.0013517 | 0.9998583 |
| KIAA0391 | KIAA0391 | 155769 | 0.4019658 | 4.9269749 | 14.40103 | 0.0014063 | 0.9998583 |
Competitive gene set test accounting for inter-gene correlation between “protected” vs “not protected” VRC 314 Uses Camera: https://academic.oup.com/nar/article/40/17/e133/2411151
This is not in the manuscript, but included here as an alternative approach.
mymodules <- c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4")
cameratab_list <- vector("list", length(mymodules))
names(cameratab_list) <- mymodules
for(i in mymodules){
url <- paste0("https://github.com/TranLab/ModuleLists/blob/main/", i, ".rds?raw=true")
temp_list <- readRDS(url(url, method="libcurl"))
temp_indices <- ids2indices(temp_list, y_VRC314$genes$hgnc_symbol)
cameratab <- camera(y_VRC314, temp_indices, design, contrast = c(rep(0,(ncol(design)-1)),1), inter.gene.cor=0.01)
cameratab_list[[i]] <- cameratab %>%
rownames_to_column(var = "module")
}
cameratab_bind_VRC314 <- bind_rows(cameratab_list, .id = "module_type") %>%
mutate(study = "VRC314") %>%
dplyr::select(study, module_type, module, everything())
Plot CAMERA results VRC 314
#plotting options
basetextsize <- 12
myfont <- "Helvetica"
bubble_max_size <- 15
myCameraPlotDat_VRC314 <- cameratab_bind_VRC314 %>%
filter(module_type %in% c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4")) %>%
#filter(!grepl("TBA", module)) %>%
dplyr::select(study, module_type, module, NGenes, Direction, FDR) %>%
mutate(neglogpadj = -log10(FDR)) %>%
mutate(fdr_direction = ifelse(Direction == "Up", neglogpadj*1, neglogpadj*-1)) %>%
mutate(module = gsub("gd", "γδ", module)) %>%
mutate(module = gsub("Vd2", "Vδ2", module)) %>%
mutate(module = gsub("Vg", "Vγ", module)) %>%
mutate(module = gsub("HALLMARK_", "", module)) %>%
mutate(module = gsub("_", " ", module)) %>%
mutate(module = sub(".*?\\_", "", module)) %>%
mutate(module = fct_reorder(module, fdr_direction, .desc = TRUE)) %>%
filter(!grepl("TBD", module)) %>%
mutate(module_type = factor(module_type, levels = c("highBTMs", "MonacoModules", "lowBTMs", "MSigDB_Hallmark_v7.4", "BloodGen3Module"))) %>%
arrange(desc(neglogpadj)) %>%
droplevels() %>%
dplyr::select(study, module_type, module, everything())
#saveRDS(myCameraPlotDat_VRC314, paste0(datadir, "VRC314_Camera_PvNP.rds"))
Rank genes by -log10(PValue)*sign(logFC). Run fgsea from fgsea package using NamedGeneRankList2GseaTable helper function.
set.seed(23)
ranks_VRC314 <- TopTab_VRC314 %>%
mutate(rankmetric = -log10(.$PValue)*sign(.$logFC)) %>%
dplyr::select(hgnc_symbol,rankmetric) %>%
na.omit() %>%
distinct() %>%
group_by(hgnc_symbol) %>%
dplyr::summarize(rankmetric = mean(rankmetric)) %>%
arrange(desc(rankmetric)) %>%
deframe()
#Run fgsea from fgsea package using NamedGeneRankList2GseaTable helper function
devtools::source_url("https://github.com/TranLab/ModuleLists/blob/main/NamedGeneRankList2GseaTable.R?raw=TRUE")
GSEAtab_VRC314 <- NamedGeneRankList2GseaTable(rankedgenes = ranks_VRC314,
geneset = "all",
output_directory = tempdir(),
filename_prefix = "GSEA",
sampleSize = 101,
minSize = 20,
maxSize = Inf,
scoreType = "std") %>%
as_tibble() %>%
arrange(desc(NES)) %>%
mutate(study = "VRC314") %>%
dplyr::select(study, module_type, pathway, ES, NES, size, leadingEdge, pval, padj) %>%
mutate(leadingEdge = gsub("^c\\(|\\)$", "", leadingEdge)) %>%
mutate(leadingEdge = gsub('"', "", leadingEdge)) %>%
arrange(padj)
## [1] "Running fgsea for MonacoModules signatures"
## [1] "Running fgsea for highBTMs signatures"
## [1] "Running fgsea for lowBTMs signatures"
## [1] "Running fgsea for BloodGen3Module signatures"
## [1] "Running fgsea for MSigDB_Hallmark_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_biocarta_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_kegg_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C8_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C7_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_bp_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_mf_v7.4 signatures"
#Get BSPZV1 DEG table from googledrive
temp <- tempfile(fileext = ".rds")
dl <- googledrive::drive_download(
googledrive::as_id("1NR0SKRw37ZUTnKfvdedCPWoyOv3aDECU"), path = temp, overwrite = TRUE)
TopTab_BSPZV1 <- readRDS(file = dl$local_path)
#Bind data from VRC312, VRC314, and BSPZV1 studies
TopTab_BSPZV1_VRC312_VRC314 <- bind_rows(TopTab_BSPZV1 %>%
as_tibble() %>%
dplyr::rename(hgnc_symbol = "gene_symbol") %>%
dplyr::select(-c(ensembl_id)) %>%
mutate(study = "BSPZV1"),
TopTab_VRC312 %>%
as_tibble() %>%
dplyr::select(-c(Gene_length)) %>%
mutate(study = "VRC312"),
TopTab_VRC314 %>%
as_tibble() %>%
dplyr::select(-c(Gene_length)) %>%
mutate(study = "VRC314")) %>%
dplyr::select(study, everything())
study_tab <- data.frame(matrix(c("KPSZV1", "Kenya", "malaria-exposed infants","high dose","1,800,000 PfSPZ x 3", "24", "37",
"VRC 312", "US", "malaria-naive adults","3a", "30,000 PfSPZ x 6", "3", "0",
"", "", "", "3b", "30,000 PfSPZ x 4", "8", "1",
"", "", "", "4a", "135,000 PfSPZ x 5", "0", "2",
"", "", "", "4b", "135,000 PfSPZ x 5", "0", "4",
"", "", "", "4c", "135,000 PfSPZ x 4", "3", "6",
"", "", "", "All", "", "14", "13",
"VRC 314", "US", "malaria-naive adults" , "1", "270,000 PfSPZ x 3", "1", "3",
"", "", "", "4", "270,000 PfSPZ x 4", "0", "5",
"", "", "", "5", "270,000 PfSPZ x 4", "5", "6",
"", "", "", "6", "900,000 PfSPZ x 3", "5", "9",
"", "", "", "All", "", "11", "23",
"BSPZV1", "Tanzania", "malaria-exposed adults", "2", "135,000 PfSPZ x 5", "7", "4",
"", "", "", "3", "270,000 PfSPZ x 5", "7", "4",
"", "", "", "All", "", "14", "8"),
nrow = 15, byrow = T))
colnames(study_tab) <- c("Study", "Country", "Malaria Exposure","Group","Regimen","NP","P")
#for the Neal et al study (PMID 35031601), 3 subjects in BSPZV1 group 1who were smear negative throughout but PCR positive were considered protected
study_tab_plot <- ggtexttable(study_tab,
rows = NULL,
theme = ttheme("classic",
base_size=8,
padding = unit(c(8, 2.5), "mm")))
study_tab_plot
Filter based on padj < 0.10.
#Get BSPZV1 GSEA data from googledrive
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
as_id("1NF0IIJbFDMH_fjPnqS3yPRHb42ajXZug"), path = temp, overwrite = TRUE)
GSEAtab_BSPZV1 <- readRDS(file = dl$local_path)
#Get KSPZV1 GSEA data from googledrive
temp <- tempfile(fileext = ".xlsx")
dl <- drive_download(
as_id("1VsKSiDGfyPMrJoVFCNhkiffzkXBb41IE"), path = temp, overwrite = TRUE)
GSEAtab_KSPZV1 <- readxl::read_excel(path = dl$local_path) %>%
filter(treatment == "1.8 x 10^6 PfSPZ") %>% #use only high-dose group for comparison
mutate(study = "KSPZV1") %>%
mutate(neglogpadj = -log10(`BH-adj p value (FDR)`)) %>%
dplyr::select(colnames(GSEAtab_BSPZV1)) %>%
mutate(pathway = gsub("HALLMARK_", "", pathway)) %>% #clean up pathways to match other dataframes
mutate(pathway = gsub("\\_", " ", pathway)) #clean up pathways to match other dataframes
GSEA_KPSZV1_VRC312_VRC314_BPSZV1 <- bind_rows(GSEAtab_VRC312,
GSEAtab_VRC314) %>%
filter(module_type %in% c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4")) %>%
filter(!grepl("TBA", pathway)) %>%
dplyr::select(study, module_type, pathway, leadingEdge, size, NES, padj) %>%
mutate(neglogpadj = -log10(padj)) %>%
dplyr::rename("BH-adj p value (FDR)" = "padj") %>%
mutate(pathway = gsub("gd", "γδ", pathway)) %>%
mutate(pathway = gsub("Vd2", "Vδ2", pathway)) %>%
mutate(pathway = gsub("Vg", "Vγ", pathway)) %>%
mutate(pathway = gsub("HALLMARK_", "", pathway)) %>%
mutate(pathway = gsub("_", " ", pathway)) %>%
mutate(pathway = sub(".*?\\_", "", pathway)) %>%
filter(!grepl("TBD", pathway)) %>%
mutate(module_type = factor(module_type, levels = c("highBTMs", "MonacoModules", "lowBTMs", "MSigDB_Hallmark_v7.4", "BloodGen3Module"))) %>%
droplevels() %>%
bind_rows(.,GSEAtab_BSPZV1, GSEAtab_KSPZV1) %>%
mutate(study = factor(study, levels = c("KSPZV1","VRC312", "VRC314", "BSPZV1"))) %>%
arrange(study, module_type, desc(neglogpadj)) %>%
group_by(module_type, study) %>%
mutate(pathway = fct_reorder(pathway, NES, .desc = TRUE)) %>%
ungroup() %>%
filter(!grepl("TBD", pathway))
#plotting options
basetextsize <- 8.5
myfont <- "sans"
bubble_max_size <- 9
fdr_threshold <- 0.1
common_pathways <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
group_by(module_type, pathway) %>%
dplyr::select(pathway) %>%
dplyr::summarize(n=n()) %>%
filter(n>1)
GSEA_KPSZV1_VRC312_VRC314_BPSZV1 <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
filter(pathway %in% common_pathways$pathway)
TopPlot <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
filter(module_type %in% c("highBTMs", "MonacoModules")) %>%
ggplot(., aes(x = study, y = pathway)) +
geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1,5,10,20,30,40), limits = c(1,42)) +
scale_fill_gradient2(low = "blue",
mid = "white",
high = "red") +
hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
strip.background = element_blank(),
strip.text = element_text(size=5.5),
legend.position = "right",
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.margin = unit(c(1, 0.5, 1, 0.5), "cm")) +
facet_wrap(~module_type, scales = "free_y", ncol =2)
TopPlot <- ggarrange(study_tab_plot ,addSmallLegend(TopPlot), widths = c(1,1))
Bottom_Left_Plot <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
filter(module_type %in% c("lowBTMs")) %>%
ggplot(., aes(x = study, y = pathway)) +
geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1,20,40), limits = c(1,42)) +
scale_fill_gradient2(low = "blue",
mid = "white",
high = "red") +
hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
strip.text = element_text(size=5.5),
strip.background = element_blank(),
legend.position = "right",
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.margin = unit(c(1, 0.5, 5, 0.5), "cm")) +
facet_wrap(~module_type, scales = "free_y", nrow =2)
Bottom_Middle_Plot <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
filter(module_type %in% c("MSigDB_Hallmark_v7.4")) %>%
ggplot(., aes(x = study, y = pathway)) +
geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1,20,40), limits = c(1,42)) +
scale_fill_gradient2(low = "blue",
mid = "white",
high = "red") +
hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
strip.text = element_text(size=5.5),
strip.background = element_blank(),
legend.position = "right",
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.margin = unit(c(1, 0.75, 16, 0.75), "cm")) +
facet_wrap(~module_type, scales = "free_y", nrow =2)
Bottom_Right_Plot <- GSEA_KPSZV1_VRC312_VRC314_BPSZV1 %>%
filter(`BH-adj p value (FDR)`< fdr_threshold) %>%
filter(module_type %in% c("BloodGen3Module")) %>%
ggplot(., aes(x = study, y = pathway)) +
geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1,20,40), limits = c(1,42)) +
scale_fill_gradient2(low = "blue",
mid = "white",
high = "red") +
hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
strip.text = element_text(size=5.5),
strip.background = element_blank(),
legend.position = "right",
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.margin = unit(c(1, 1.3, 0.5, 1.3), "cm")) +
facet_wrap(~module_type, scales = "free_y", ncol =2)
bottom_panels <- ggarrange(addSmallLegend(Bottom_Left_Plot), addSmallLegend(Bottom_Middle_Plot), addSmallLegend(Bottom_Right_Plot),
widths = c(1.1, 1, 1),
ncol = 3,
common.legend = TRUE)
Red: enriched in protected/uninfected.
Blue: enriched in not protected/infected.
Only modules with BH-adjusted p value (FDR) < 0.10 in at least two
studies are shown.
ggarrange(addSmallLegend(TopPlot), bottom_panels, common.legend = TRUE, nrow = 2, heights = c(1,3))
#Get BSPZV1 Camera data from googledrive
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
as_id("1NWQVtjpDP5Qjtee4aezR0fSywXpEn5fR"), path = temp, overwrite = TRUE)
myCameraPlotDat_BSPZV1 <- readRDS(file = dl$local_path)
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
as_id("1DI6Xvqp5emAs_u_SNZxyxnuPkcxkDRxq"), path = temp, overwrite = TRUE)
myCameraPlotDat_VRC312 <- readRDS(file = dl$local_path)
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
as_id("1DIaX2KI9fBuKUIP-rbdKiOr5gwzZG39D"), path = temp, overwrite = TRUE)
myCameraPlotDat_VRC314 <- readRDS(file = dl$local_path)
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
as_id("1DHHoRME8ljljACDmtzyGm8tgt0OajxtS"), path = temp, overwrite = TRUE)
myCameraPlotDat_KSPZV1 <- readRDS(file = dl$local_path)
Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 <- bind_rows(myCameraPlotDat_BSPZV1, myCameraPlotDat_VRC312, myCameraPlotDat_VRC314, myCameraPlotDat_KSPZV1)
fdr_threshold <- 0.1
common_pathways <- Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 %>%
filter(FDR < fdr_threshold) %>%
group_by(module_type, module) %>%
dplyr::select(module) %>%
dplyr::summarize(n=n()) %>%
filter(n>1)
Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 <- Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 %>%
filter(module %in% common_pathways$module)
plotDat_Camera <- Cameratab_KSPZV1_BSPZV1_VRC312_VRC314 %>%
mutate(study = factor(study, levels = c("KSPZV1","VRC312", "VRC314", "BSPZV1"))) %>%
mutate(enrichment = ifelse(Direction == "Up",
"up in protected/uninfected",
"up in not protected/infected")) %>%
filter(FDR < fdr_threshold) %>%
filter(!grepl("TBA", module)) %>%
mutate(neglogFDR = -log10(FDR)) %>%
mutate(module = gsub("gd", "γδ", module)) %>%
mutate(module = gsub("Vd2", "Vδ2", module)) %>%
mutate(module = gsub("Vg", "Vγ", module)) %>%
mutate(module = gsub("HALLMARK_", "", module)) %>%
mutate(module = gsub("_", " ", module)) %>%
mutate(module = sub(".*?\\_", "", module)) %>%
filter(!grepl("TBD", module)) %>%
arrange(desc(neglogFDR)) %>%
group_by(study, module_type) %>%
mutate(module = fct_reorder(module, neglogFDR, .desc = FALSE)) %>%
ungroup() %>%
droplevels()
#plotting options
basetextsize <- 7.5
myfont <- "Helvetica"
bubble_max_size <- 6
camera_hibtm_monaco_plot <- plotDat_Camera %>%
filter(module_type %in% c("highBTMs", "MonacoModules")) %>%
mutate(module_type = factor(module_type,
levels = c("highBTMs", "MonacoModules"),
labels = c("high BTMs", "Monaco"))) %>%
ggplot(., aes(x = study, y = module, fill = enrichment)) +
geom_point(aes(size=neglogFDR), shape=21, stroke = 0.25, alpha = 0.6) +
scale_size_area(name = expression(-log[10]~FDR), max_size = bubble_max_size) +
scale_fill_manual(values = c("blue","red")) +
hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
strip.text = element_text(size = 5.5),
strip.background = element_blank(),
legend.position = "none",
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
facet_wrap(~module_type, scales = "free_y", nrow = 3)
module_summary_lowbtms <- plotDat_Camera %>%
group_by(module_type, module) %>%
dplyr::summarize(n = n()) %>%
filter(module_type == "lowBTMs" & n>1) %>%
arrange(n)
camera_lowbtm_plot <- plotDat_Camera %>%
filter(module_type %in% c("lowBTMs")) %>%
filter(module %in% module_summary_lowbtms$module) %>%
ggplot(., aes(x = study, y = module, fill = enrichment)) +
geom_point(aes(size=neglogFDR), shape=21, stroke = 0.25, alpha = 0.6) +
scale_size_area(name = expression(-log[10]~FDR), max_size = bubble_max_size) +
scale_fill_manual(values = c("blue","red")) +
#ggtitle("low BTMs") +
hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
strip.text = element_text(size = 5.5),
legend.position = "right",
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
facet_wrap(~module_type)
module_summary_BloodGen3Module <- plotDat_Camera %>%
group_by(module_type, module) %>%
dplyr::summarize(n = n()) %>%
filter(module_type == "BloodGen3Module" & n>1) %>%
arrange(n)
camera_bloodgen3_plot <- plotDat_Camera %>%
filter(module_type %in% c("BloodGen3Module")) %>%
filter(module %in% module_summary_BloodGen3Module$module) %>%
ggplot(., aes(x = study, y = module, fill = enrichment)) +
geom_point(aes(size=neglogFDR), shape=21, stroke = 0.25, alpha = 0.6) +
scale_size_area(name = expression(-log[10]~FDR), max_size = bubble_max_size) +
scale_fill_manual(values = c("blue","red")) +
#ggtitle("low BTMs") +
hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
strip.text = element_text(size = 5.5),
legend.position = "right",
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
facet_wrap(~module_type)
Red: enriched in protected/uninfected Blue: enriched in not protected/infected
Only modules with FDR<10% in at least 2 studies are shown.
first_col_plot <- ggpubr::ggarrange(camera_hibtm_monaco_plot, NULL, nrow = 2, heights = c(2.25,3))
ggpubr::ggarrange(first_col_plot, ggplot() + theme_void(), camera_lowbtm_plot, ggplot() + theme_void(), camera_bloodgen3_plot,
ncol = 5, widths = c(0.77, -0.175, 1, -0.175, 0.78), common.legend = TRUE)