library(knitr)
library(tidyverse)
library(edgeR)
library(googledrive)
library(tidyverse)
library(fgsea)
library(data.table)

Objective

Perform differential gene expression using edgeR on pre-immunization baseline whole-blood samples from the KSPZV1 malaria vaccine trial. Revised analysis includes adjustments for:

  1. Batch
  2. Sex
  3. Study site
  4. log10(CSP Ab + 1)
  5. parasitemia at VAX1 (closest to baseline)

JCI reviewer requested using FDR 5% as a cut-off for GSEA

Load ExpressionSet

#from google drive
#local path "/Volumes/GoogleDrive/My Drive/Tran Lab Shared/Projects/Doris Duke PfSPZ Kenya/Tuan PfSPZ/KenyaPfSPZ/Final Data Files/KSPZV1 SeqExpressionSet Baseline Reduced Phenodat 35716x487.rds"
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1Q4VXYxdl9CqtzhcvHA9pn8q06_YOZgi8"), path = temp, overwrite = TRUE)
x <- readRDS(file = dl$local_path)
x$treat <- factor(x$treat, levels = c("Placebo", "4.5 x 10^5 PfSPZ", "9.0 x 10^5 PfSPZ", "1.8 x 10^6 PfSPZ"))

Set options

myTimepoint <- 0

Reduce samples

xgroup <- setNames(as.list(c(1:4)), c("Placebo", "4.5 x 10^5 PfSPZ", "9.0 x 10^5 PfSPZ", "1.8 x 10^6 PfSPZ"))
for(i in names(xgroup)){
  xgroup[[i]] <- x[, which(x$treat == i)]
  xgroup[[i]] <- xgroup[[i]][,xgroup[[i]]$Timepoint == myTimepoint]
  xgroup[[i]]$treat <- droplevels(xgroup[[i]]$treat)
  #impute only missing CSP Ab value as median CSP A of all baseline samples (only affects one subject in 9.0x10^6 PfSPZ dose group)
  xgroup[[i]]$CSPAb <- ifelse(is.na(xgroup[[i]]$CSPAb), median(xgroup[[i]]$CSPAb, na.rm = TRUE), xgroup[[i]]$CSPAb)
  print(i)
  print(table(Outcome = xgroup[[i]]$Outcome, Timepoint = xgroup[[i]]$Timepoint))
  print(table(Outcome = xgroup[[i]]$Outcome, Dosegroup = xgroup[[i]]$treat))
  print(dim(xgroup[[i]]))
}
## [1] "Placebo"
##        Timepoint
## Outcome  0
##       0 26
##       1 39
##        Dosegroup
## Outcome Placebo
##       0      26
##       1      39
## Features  Samples 
##    35716       65 
## [1] "4.5 x 10^5 PfSPZ"
##        Timepoint
## Outcome  0
##       0 29
##       1 29
##        Dosegroup
## Outcome 4.5 x 10^5 PfSPZ
##       0               29
##       1               29
## Features  Samples 
##    35716       58 
## [1] "9.0 x 10^5 PfSPZ"
##        Timepoint
## Outcome  0
##       0 29
##       1 29
##        Dosegroup
## Outcome 9.0 x 10^5 PfSPZ
##       0               29
##       1               29
## Features  Samples 
##    35716       58 
## [1] "1.8 x 10^6 PfSPZ"
##        Timepoint
## Outcome  0
##       0 38
##       1 25
##        Dosegroup
## Outcome 1.8 x 10^6 PfSPZ
##       0               38
##       1               25
## Features  Samples 
##    35716       63

Build DGEList Object

#Define group by dose, timepoint and outcome
ygroup <- setNames(as.list(c(1:4)), c("Placebo", "4.5 x 10^5 PfSPZ", "9.0 x 10^5 PfSPZ", "1.8 x 10^6 PfSPZ"))
for(i in names(ygroup)){
  ygroup[[i]]  <- DGEList(counts=counts(xgroup[[i]]), genes=fData(xgroup[[i]]), group= factor(paste(xgroup[[i]]$treat, xgroup[[i]]$Timepoint, xgroup[[i]]$Outcome, sep = "_")), remove.zeros=T)
  ygroup[[i]]$samples$PATID <- gsub("_.*", "", rownames(ygroup[[i]]$samples))
}

Filter out low expression features

for(i in names(ygroup)){
  keep <- filterByExpr(ygroup[[i]])
  ygroup[[i]] <- ygroup[[i]][keep, , keep.lib.sizes=FALSE]
  ygroup[[i]]$samples$lib.size <- colSums(ygroup[[i]]$counts)
  ygroup[[i]] <- calcNormFactors(ygroup[[i]])   #Normalization
}

Analysis between Not Protected and Protected within a Dose Group

degtab <- setNames(as.list(c(1:4)), c("Placebo", "4.5 x 10^5 PfSPZ", "9.0 x 10^5 PfSPZ", "1.8 x 10^6 PfSPZ"))
for(i in names(ygroup)){
  Subject <- factor(xgroup[[i]]$PATID)
  Outcome <- factor(xgroup[[i]]$Outcome, levels = c(1,0), labels = c("NotProtected","Protected"))
  Timepoint <- factor(xgroup[[i]]$Timepoint, levels = c(0,25), labels = c("baseline","postvax"))
  Sex <- factor(xgroup[[i]]$SEX)
  Batch <- factor(xgroup[[i]]$SEQBATCH, levels = c("Aug2019","Nov2019"))
  Site <- as.factor(xgroup[[i]]$site)
  CSPAb <- as.integer(xgroup[[i]]$CSPAb)
  CSPAb <- log10(CSPAb+1) #log transform CSP Ab data to make linear
  Pf_VAX1 <- as.factor(xgroup[[i]]$mal.vax.1)
  #This is the revised Timepoint 0 analysis.
  design <- model.matrix(~Batch + Sex + Site + CSPAb + Pf_VAX1 + Outcome)
  print(i)
  print(colnames(design))
  rownames(design) <- ygroup[[i]]$samples$PATID
  ygroup[[i]]     <- estimateDisp(ygroup[[i]],design, robust = TRUE)
  fit   <- glmQLFit(ygroup[[i]], design, robust = TRUE)
  qlf <- glmQLFTest(fit, contrast = c(rep(0, ncol(design)-1),1))
  degtab[[i]] <- topTags(qlf, n = nrow(ygroup[[i]]))$table
}
## [1] "Placebo"
## [1] "(Intercept)"      "BatchNov2019"     "SexM"             "SiteWagai"       
## [5] "CSPAb"            "Pf_VAX11"         "OutcomeProtected"
## [1] "4.5 x 10^5 PfSPZ"
## [1] "(Intercept)"      "BatchNov2019"     "SexM"             "SiteWagai"       
## [5] "CSPAb"            "Pf_VAX11"         "OutcomeProtected"
## [1] "9.0 x 10^5 PfSPZ"
## [1] "(Intercept)"      "BatchNov2019"     "SexM"             "SiteWagai"       
## [5] "CSPAb"            "Pf_VAX11"         "OutcomeProtected"
## [1] "1.8 x 10^6 PfSPZ"
## [1] "(Intercept)"      "BatchNov2019"     "SexM"             "SiteWagai"       
## [5] "CSPAb"            "Pf_VAX11"         "OutcomeProtected"
temp <- degtab
for(i in names(degtab)){
  temp[[i]] <- degtab[[i]] %>%
    dplyr::select(7,8,10:14)
}

Examine top 15 DEGs (protected vs not protected)

Placebo

GeneSymbol descripton_new logFC logCPM F PValue FDR
ENSG00000198502 HLA-DRB5 major histocompatibility complex, class II, DR beta 5 [Source:HGNC Symbol;Acc:HGNC:4953] -0.3204301 4.8632239 53.34851 0.0000000 0.0000137
ENSG00000269102 CTD-2525I3.5 novel transcript, antisense to ZNF766 and ZNF480 -0.7566281 0.1228600 13.62446 0.0004258 0.9998239
ENSG00000170509 HSD17B13 hydroxysteroid 17-beta dehydrogenase 13 [Source:HGNC Symbol;Acc:HGNC:18685] 0.9911967 0.1771091 13.33673 0.0004661 0.9998239
ENSG00000135919 SERPINE2 serpin family E member 2 [Source:HGNC Symbol;Acc:HGNC:8951] 0.5546532 3.2966072 13.55441 0.0004822 0.9998239
ENSG00000239474 KLHL41 kelch like family member 41 [Source:HGNC Symbol;Acc:HGNC:16905] 1.3120210 0.8788559 13.25895 0.0004823 0.9998239
ENSG00000267277 CTD-2342J14.6 novel transcript, antisense to SWSAP1 -0.9393928 0.1562798 13.12189 0.0005179 0.9998239
ENSG00000145975 FAM217A family with sequence similarity 217 member A [Source:HGNC Symbol;Acc:HGNC:21362] 1.0110628 -0.2813040 12.78545 0.0005569 0.9998239
ENSG00000270641 TSIX TSIX transcript, XIST antisense RNA [Source:HGNC Symbol;Acc:HGNC:12377] -2.2321348 10.0609278 12.22884 0.0008394 0.9998239
ENSG00000232838 PET117 PET117 cytochrome c oxidase chaperone [Source:HGNC Symbol;Acc:HGNC:40045] -0.3037263 3.0022032 12.03755 0.0009454 0.9998239
ENSG00000105971 CAV2 caveolin 2 [Source:HGNC Symbol;Acc:HGNC:1528] 1.1153769 0.2471148 10.96021 0.0013792 0.9998239
ENSG00000154734 ADAMTS1 ADAM metallopeptidase with thrombospondin type 1 motif 1 [Source:HGNC Symbol;Acc:HGNC:217] 1.3905202 -0.0870266 10.84868 0.0013976 0.9998239
ENSG00000260156 RP11-394B2.6 novel transcript, antisense to VAC14 -0.6498158 0.0826368 10.72329 0.0016174 0.9998239
ENSG00000267001 AC006538.4 novel protein -0.3783996 1.7898322 10.78385 0.0016845 0.9998239
ENSG00000169247 SH3TC2 SH3 domain and tetratricopeptide repeats 2 [Source:HGNC Symbol;Acc:HGNC:29427] 1.3065830 0.3351870 10.38112 0.0018273 0.9998239
ENSG00000196872 CRACDL CRACD like [Source:HGNC Symbol;Acc:HGNC:33454] 1.2735306 0.5037540 10.24530 0.0019460 0.9998239

4.5 x 10^5 PfSPZ

GeneSymbol descripton_new logFC logCPM F PValue FDR
ENSG00000174358 SLC6A19 solute carrier family 6 member 19 [Source:HGNC Symbol;Acc:HGNC:27960] 0.6464728 3.4725610 48.20614 0.0000000 0.0000939
ENSG00000228146 CASP16P caspase 16, pseudogene [Source:HGNC Symbol;Acc:HGNC:27290] 1.0120463 0.6917596 15.64360 0.0002077 0.9706245
ENSG00000131042 LILRB2 leukocyte immunoglobulin like receptor B2 [Source:HGNC Symbol;Acc:HGNC:6606] -2.2324223 1.2356883 15.31299 0.0002296 0.9706245
ENSG00000232860 SMG7-AS1 SMG7 antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:24518] 1.0780769 0.1308753 13.91497 0.0003940 0.9706245
ENSG00000184208 C22orf46 chromosome 22 putative open reading frame 46 [Source:HGNC Symbol;Acc:HGNC:26294] 0.4903798 2.5285402 12.76209 0.0007396 0.9706245
ENSG00000116957 TBCE NA -0.2732737 4.0904363 12.70067 0.0007568 0.9706245
ENSG00000232131 NCOA7-AS1 NCOA7 antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:40954] -0.5923958 0.0008323 12.45038 0.0007635 0.9706245
ENSG00000257298 RP3-405J10.3 novel transcript, sense intronic to LIMA1 -1.1495151 0.1915984 12.30623 0.0008043 0.9706245
ENSG00000261404 PSMD7-DT PSMD7 divergent transcript [Source:HGNC Symbol;Acc:HGNC:53056] 1.0781941 0.1621347 12.28842 0.0008247 0.9706245
ENSG00000260488 RP11-166B2.7 novel transcript, antisense to RUNDC2A 0.8667144 -0.0193516 12.15854 0.0008376 0.9706245
ENSG00000179743 FLJ37453 uncharacterized LOC729614 [Source:NCBI gene (formerly Entrezgene);Acc:729614] 0.3321373 3.5788304 11.33642 0.0013815 0.9706245
ENSG00000171680 PLEKHG5 pleckstrin homology and RhoGEF domain containing G5 [Source:HGNC Symbol;Acc:HGNC:29105] 1.2192565 0.5782173 10.78084 0.0015145 0.9706245
ENSG00000215045 GRID2IP Grid2 interacting protein [Source:HGNC Symbol;Acc:HGNC:18464] 0.9388905 0.9956381 11.06766 0.0015373 0.9706245
ENSG00000279428 RP11-258F1.2 TEC 0.4806882 2.2930463 11.04463 0.0015765 0.9706245
ENSG00000276704 RP11-128N14.4 novel transcript, sense intronic to RAP2A -0.6930811 -0.2636300 10.59938 0.0016988 0.9706245

9.0 x 10^5 PfSPZ

GeneSymbol descripton_new logFC logCPM F PValue FDR
ENSG00000229807 XIST X inactive specific transcript [Source:HGNC Symbol;Acc:HGNC:12810] -3.9328609 11.7024516 24.65698 0.0000059 0.0883134
ENSG00000270641 TSIX TSIX transcript, XIST antisense RNA [Source:HGNC Symbol;Acc:HGNC:12377] -3.2429393 9.8875476 22.21131 0.0000143 0.0883134
ENSG00000160062 ZBTB8A zinc finger and BTB domain containing 8A [Source:HGNC Symbol;Acc:HGNC:24172] -1.4630630 0.1854980 21.18984 0.0000153 0.0883134
ENSG00000134198 TSPAN2 tetraspanin 2 [Source:HGNC Symbol;Acc:HGNC:20659] 0.5752927 4.8254668 21.66800 0.0000200 0.0883134
ENSG00000204277 LINC01993 long intergenic non-protein coding RNA 1993 [Source:HGNC Symbol;Acc:HGNC:52826] 1.3027314 0.5064032 18.32967 0.0000641 0.2201121
ENSG00000223855 FLJ44511 heart tissue-associated transcript 92 [Source:NCBI gene (formerly Entrezgene);Acc:441307] 1.9135299 0.8521485 17.29490 0.0000794 0.2201121
ENSG00000124140 SLC12A5 solute carrier family 12 member 5 [Source:HGNC Symbol;Acc:HGNC:13818] 1.6465180 1.0937933 17.58976 0.0000874 0.2201121
ENSG00000185933 CALHM1 calcium homeostasis modulator 1 [Source:HGNC Symbol;Acc:HGNC:23494] 1.3662165 0.3817580 14.98899 0.0002274 0.3607218
ENSG00000174705 SH3PXD2B SH3 and PX domains 2B [Source:HGNC Symbol;Acc:HGNC:29242] 1.5523185 0.8624536 14.87031 0.0002416 0.3607218
ENSG00000255198 SNHG9 small nucleolar RNA host gene 9 [Source:HGNC Symbol;Acc:HGNC:33102] 1.0527682 -0.0686977 14.80654 0.0002468 0.3607218
ENSG00000092051 JPH4 junctophilin 4 [Source:HGNC Symbol;Acc:HGNC:20156] 1.5852542 0.6588844 14.70184 0.0002565 0.3607218
ENSG00000279259 RP11-334C17.3 TEC 0.9321266 0.7208732 14.96871 0.0002721 0.3607218
ENSG00000270885 RASL10B RAS like family 10 member B [Source:HGNC Symbol;Acc:HGNC:30295] 1.6005796 0.5601429 14.50795 0.0002738 0.3607218
ENSG00000261394 RP11-165M1.3 novel transcript, antisense to a novel protein -0.8331614 0.5787782 14.81925 0.0002863 0.3607218
ENSG00000168159 RNF187 ring finger protein 187 [Source:HGNC Symbol;Acc:HGNC:27146] -0.4259981 5.3871479 14.69177 0.0003202 0.3765719

1.8 x 10^6 PfSPZ

GeneSymbol descripton_new logFC logCPM F PValue FDR
ENSG00000171723 GPHN gephyrin [Source:HGNC Symbol;Acc:HGNC:15465] -0.4446555 3.1381660 16.32377 0.0001515 0.8923642
ENSG00000224093 BCAR3-AS1 BCAR3 antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:40093] -1.1018234 -0.1733145 15.13474 0.0001920 0.8923642
ENSG00000151090 THRB thyroid hormone receptor beta [Source:HGNC Symbol;Acc:HGNC:11799] -1.6602567 -0.2683385 14.67727 0.0002219 0.8923642
ENSG00000120500 ARR3 arrestin 3 [Source:HGNC Symbol;Acc:HGNC:710] -0.8623949 -0.0435407 14.70570 0.0002492 0.8923642
ENSG00000155368 DBI diazepam binding inhibitor, acyl-CoA binding protein [Source:HGNC Symbol;Acc:HGNC:2690] 0.4042811 4.8208190 14.14063 0.0003809 0.8923642
ENSG00000279107 RP11-823P9.4 TEC -1.6550035 1.4440390 13.51962 0.0004237 0.8923642
ENSG00000180574 EIF2S3B eukaryotic translation initiation factor 2 subunit gamma B [Source:HGNC Symbol;Acc:HGNC:43863] 1.2286486 0.9493500 13.05148 0.0005565 0.8923642
ENSG00000102390 PBDC1 polysaccharide biosynthesis domain containing 1 [Source:HGNC Symbol;Acc:HGNC:28790] 0.3625400 3.1652506 13.09394 0.0006020 0.8923642
ENSG00000145649 GZMA granzyme A [Source:HGNC Symbol;Acc:HGNC:4708] 0.8392956 5.4447857 12.99217 0.0006282 0.8923642
ENSG00000170743 SYT9 synaptotagmin 9 [Source:HGNC Symbol;Acc:HGNC:19265] -1.2239459 -0.1511131 12.39982 0.0006671 0.8923642
ENSG00000125870 SNRPB2 small nuclear ribonucleoprotein polypeptide B2 [Source:HGNC Symbol;Acc:HGNC:11155] 0.2715634 5.4846867 12.78421 0.0006885 0.8923642
ENSG00000215529 EFCAB8 EF-hand calcium binding domain 8 [Source:HGNC Symbol;Acc:HGNC:34532] -1.1103715 0.4727191 12.45192 0.0007035 0.8923642
ENSG00000095637 SORBS1 sorbin and SH3 domain containing 1 [Source:HGNC Symbol;Acc:HGNC:14565] -1.2426460 1.4754030 12.61575 0.0007397 0.8923642
ENSG00000269054 ZNF497-AS1 ZNF497 antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:55277] 0.8907559 -0.3178188 12.12753 0.0007703 0.8923642
ENSG00000158301 GPRASP2 G protein-coupled receptor associated sorting protein 2 [Source:HGNC Symbol;Acc:HGNC:25169] -0.8991203 1.0094728 12.44045 0.0007870 0.8923642

Apply GSEA

Rank genes by -log10(PValue)*sign(logFC). Run fgsea from fgsea package using NamedGeneRankList2GseaTable helper function.

set.seed(23)
ranks <- degtab
for(i in names(degtab)){
  ranks[[i]] <- degtab[[i]] %>% 
  mutate(rankmetric = -log10(.$PValue)*sign(.$logFC)) %>%
  dplyr::select(GeneSymbol,rankmetric) %>% 
  na.omit() %>% 
  distinct() %>% 
  group_by(GeneSymbol) %>%  
  summarize(rankmetric = mean(rankmetric)) %>%
  arrange(desc(rankmetric)) %>%
  deframe()
}
#Run fgsea from fgsea package using NamedGeneRankList2GseaTable helper function
devtools::source_url("https://github.com/TranLab/ModuleLists/blob/main/NamedGeneRankList2GseaTable.R?raw=TRUE")
GSEAtab <- ranks
for(i in names(ranks)){
  GSEAtab[[i]] <- NamedGeneRankList2GseaTable(rankedgenes = ranks[[i]],
                                         geneset = "all",
                                         output_directory = tempdir(),
                                         filename_prefix = "GSEA",
                                         sampleSize = 101,
                                         minSize = 20,
                                         maxSize = Inf,
                                         scoreType = "std") %>%
    as_tibble() %>%
    arrange(desc(NES)) %>% 
    dplyr::select(module_type, pathway, ES, NES, size, leadingEdge, pval, padj) %>% 
    mutate(leadingEdge = gsub("^c\\(|\\)$", "", leadingEdge)) %>%
    mutate(leadingEdge = gsub('"', "", leadingEdge)) %>%
    arrange(padj)
}
## [1] "Running fgsea for MonacoModules signatures"
## [1] "Running fgsea for highBTMs signatures"
## [1] "Running fgsea for lowBTMs signatures"
## [1] "Running fgsea for BloodGen3Module signatures"
## [1] "Running fgsea for MSigDB_Hallmark_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_biocarta_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_kegg_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C8_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C7_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_bp_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_mf_v7.4 signatures"
## [1] "Running fgsea for MonacoModules signatures"
## [1] "Running fgsea for highBTMs signatures"
## [1] "Running fgsea for lowBTMs signatures"
## [1] "Running fgsea for BloodGen3Module signatures"
## [1] "Running fgsea for MSigDB_Hallmark_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_biocarta_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_kegg_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C8_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C7_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_bp_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_mf_v7.4 signatures"
## [1] "Running fgsea for MonacoModules signatures"
## [1] "Running fgsea for highBTMs signatures"
## [1] "Running fgsea for lowBTMs signatures"
## [1] "Running fgsea for BloodGen3Module signatures"
## [1] "Running fgsea for MSigDB_Hallmark_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_biocarta_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_kegg_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C8_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C7_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_bp_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_mf_v7.4 signatures"
## [1] "Running fgsea for MonacoModules signatures"
## [1] "Running fgsea for highBTMs signatures"
## [1] "Running fgsea for lowBTMs signatures"
## [1] "Running fgsea for BloodGen3Module signatures"
## [1] "Running fgsea for MSigDB_Hallmark_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_biocarta_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C2_kegg_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C8_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C7_all_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_bp_v7.4 signatures"
## [1] "Running fgsea for MSigDB_C5_GO_mf_v7.4 signatures"
# These are the genes with duplicated rankmetrics...can ignore warning about ties b/c it's trivial (only involves two genes)
# 1 RP11-521C22.3    -0.0303
# 2 AC069547.1       -0.0303

Visualize GSEA data as bubble plot

Filter based on padj < 0.05.

plotDat <- GSEAtab %>%
  bind_rows(., .id = "dosegroup") %>%
  mutate(dosegroup = factor(dosegroup, levels = c("Placebo", "4.5 x 10^5 PfSPZ", "9.0 x 10^5 PfSPZ", "1.8 x 10^6 PfSPZ"),
                            labels = c("Placebo", "4.5x105 PfSPZ", "9.0x105 PfSPZ", "1.8x106 PfSPZ"))) %>%
  filter(padj < 0.05) %>%
  filter(module_type %in% c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4"))  %>%
  filter(!grepl("TBA", pathway)) %>%
  dplyr::select(dosegroup, module_type, pathway, leadingEdge, size, NES, padj) %>%
  mutate(neglogpadj = -log10(padj)) %>%
  dplyr::rename("BH-adj p value (FDR)" = "padj") %>%
  mutate(pathway = gsub("gd", "γδ", pathway)) %>%
  mutate(pathway = gsub("Vd2", "Vδ2", pathway)) %>%
  mutate(pathway = gsub("Vg", "Vγ", pathway)) %>%
  mutate(pathway = gsub("HALLMARK_", "", pathway)) %>%
  mutate(pathway = gsub("_", " ", pathway)) %>%
  mutate(pathway = sub(".*?\\_", "", pathway)) %>%
  group_by(module_type, dosegroup) %>%
  mutate(pathway = fct_reorder(pathway, NES, .desc = TRUE)) %>%
  ungroup() %>%
  mutate(dosegroup = fct_rev(dosegroup)) %>%
  filter(!grepl("TBD", pathway)) %>%
  mutate(module_type = factor(module_type, levels = c("highBTMs", "MonacoModules", "lowBTMs", "MSigDB_Hallmark_v7.4", "BloodGen3Module"))) %>%
  arrange(desc(neglogpadj)) %>% 
  droplevels()

#plotting options
basetextsize <- 7.5  
myfont <- "Helvetica"
bubble_max_size <- 7.5

TopPlot <- plotDat %>%
  filter(module_type %in% c("highBTMs", "MonacoModules"))  %>%
  ggplot(., aes(x = pathway, y = dosegroup)) +
  geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
  scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1.3, 5, 10, 20, 40), limits = c(1,40)) +
      scale_fill_gradient2(low = "blue",
                           mid = "white",
                           high = "red") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.background = element_blank(),
        legend.position = "bottom",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, colour = "black"),
        axis.text.y = element_text(color = "black")) +
  facet_wrap(~module_type, scales = "free_x", nrow= 2)

MidPlot <- plotDat %>%
  filter(module_type %in% c("lowBTMs", "MSigDB_Hallmark_v7.4"))  %>%
  ggplot(., aes(x = pathway, y = dosegroup)) +
  geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
  scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1.3, 5, 10, 20, 40), limits = c(1,40)) +
      scale_fill_gradient2(low = "blue",
                           mid = "white",
                           high = "red") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.background = element_blank(),
        legend.position = "bottom",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, colour = "black"),
        axis.text.y = element_text(color = "black")) +
  facet_wrap(~module_type, scales = "free_x")

BottomPlot <- plotDat %>%
  filter(module_type %in% c("BloodGen3Module"))  %>%
  ggplot(., aes(x = pathway, y = dosegroup)) +
  geom_point(aes(size=neglogpadj, fill = NES), alpha = 0.65, shape=21, stroke = 0.25) +
  scale_size_area(name = expression(-log[10]~adjp), max_size = bubble_max_size, breaks = c(1.3, 5, 10, 20, 40), limits = c(1,40)) +
      scale_fill_gradient2(low = "blue",
                           mid = "white",
                           high = "red") +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.background = element_blank(),
        legend.position = "bottom",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, colour = "black"),
        axis.text.y = element_text(color = "black")) +
  facet_wrap(~module_type, scales = "free_x")

Plot GSEA bubbleplots (Figures 2D and S3B of revised manuscript)

Red: enriched in protected/uninfected through 3 months post-vax surveillance Blue: enriched in not protected/infected through 3 months post-vax surveillance

Gene Set Testing between Not Protected and Protected within a Dose Group using CAMERA

This is not included in the manuscript but provided here as it was requested during prior reviews.

i <- "highBTMs"
url <- paste0("https://github.com/TranLab/ModuleLists/blob/main/", i, ".rds?raw=true")
highBTM_list <- readRDS(url(url, method="libcurl"))
    
# c2.indices <- ids2indices(highBTM_list, ygroup$`1.8 x 10^6 PfSPZ`$genes$GeneSymbol)
# camera(ygroup$`1.8 x 10^6 PfSPZ`, c2.indices, design)
# c2.indices <- ids2indices(highBTM_list, ygroup$Placebo$genes$GeneSymbol)
# camera(ygroup$Placebo, c2.indices, design)

cameratab <- highBTM.indices <- setNames(as.list(c(1:4)), c("Placebo", "4.5 x 10^5 PfSPZ", "9.0 x 10^5 PfSPZ", "1.8 x 10^6 PfSPZ"))
for(i in names(ygroup)){
  Subject <- factor(xgroup[[i]]$PATID)
  Outcome <- factor(xgroup[[i]]$Outcome, levels = c(1,0), labels = c("NotProtected","Protected"))
  Timepoint <- factor(xgroup[[i]]$Timepoint, levels = c(0,25), labels = c("baseline","postvax"))
  Sex <- factor(xgroup[[i]]$SEX)
  Batch <- factor(xgroup[[i]]$SEQBATCH, levels = c("Aug2019","Nov2019"))
  Site <- as.factor(xgroup[[i]]$site)
  CSPAb <- as.integer(xgroup[[i]]$CSPAb)
  CSPAb <- log10(CSPAb+1) #log transform CSP Ab data to make linear
  Pf_VAX1 <- as.factor(xgroup[[i]]$mal.vax.1)
  #This is the revised Timepoint 0 analysis.
  design <- model.matrix(~Batch + Sex + Site + CSPAb + Pf_VAX1 + Outcome)
  print(i)
  print(colnames(design))
  rownames(design) <- ygroup[[i]]$samples$PATID
  ygroup[[i]]     <- estimateDisp(ygroup[[i]],design, robust = TRUE)
  fit   <- glmQLFit(ygroup[[i]], design, robust = TRUE)
  qlf <- glmQLFTest(fit, contrast = c(rep(0, ncol(design)-1),1))
  highBTM.indices[[i]] <- ids2indices(highBTM_list, ygroup[[i]]$genes$GeneSymbol)
  cameratab[[i]] <- camera(ygroup[[i]], highBTM.indices[[i]], design, contrast = c(rep(0, ncol(design)-1),1), inter.gene.cor=0.01)
}
## [1] "Placebo"
## [1] "(Intercept)"      "BatchNov2019"     "SexM"             "SiteWagai"       
## [5] "CSPAb"            "Pf_VAX11"         "OutcomeProtected"
## [1] "4.5 x 10^5 PfSPZ"
## [1] "(Intercept)"      "BatchNov2019"     "SexM"             "SiteWagai"       
## [5] "CSPAb"            "Pf_VAX11"         "OutcomeProtected"
## [1] "9.0 x 10^5 PfSPZ"
## [1] "(Intercept)"      "BatchNov2019"     "SexM"             "SiteWagai"       
## [5] "CSPAb"            "Pf_VAX11"         "OutcomeProtected"
## [1] "1.8 x 10^6 PfSPZ"
## [1] "(Intercept)"      "BatchNov2019"     "SexM"             "SiteWagai"       
## [5] "CSPAb"            "Pf_VAX11"         "OutcomeProtected"
for(i in names(ygroup)){
  cameratab[[i]] <- cameratab[[i]] %>%
    rownames_to_column(var = "module")
}
cameratab_bind <- dplyr::bind_rows(cameratab, .id = "dosegroup")

Code for Camera for high-dose PfSPZ baseline, all module types

highdosegroup <- xgroup$`1.8 x 10^6 PfSPZ`
Subject <- factor(highdosegroup$PATID)
Outcome <- factor(highdosegroup$Outcome, levels = c(1,0), labels = c("NotProtected","Protected"))
Timepoint <- factor(highdosegroup$Timepoint, levels = c(0,25), labels = c("baseline","postvax"))
Sex <- factor(highdosegroup$SEX)
Batch <- factor(highdosegroup$SEQBATCH, levels = c("Aug2019","Nov2019"))
Site <- as.factor(highdosegroup$site)
CSPAb <- as.integer(highdosegroup$CSPAb)
CSPAb <- log10(CSPAb+1) #log transform CSP Ab data to make linear
Pf_VAX1 <- as.factor(highdosegroup$mal.vax.1)
#This is the revised Timepoint 0 analysis.
design <- model.matrix(~Batch + Sex + Site + CSPAb + Pf_VAX1 + Outcome)
print(colnames(design))
rownames(design) <- ygroup$`1.8 x 10^6 PfSPZ`$samples$PATID
ygroup$`1.8 x 10^6 PfSPZ`     <- estimateDisp(ygroup$`1.8 x 10^6 PfSPZ`,design, robust = TRUE)
fit   <- glmQLFit(ygroup$`1.8 x 10^6 PfSPZ`, design, robust = TRUE)
qlf <- glmQLFTest(fit, contrast = c(rep(0, ncol(design)-1),1))

cameratab2 <- setNames(as.list(c(1:5)), c("highBTMs", "lowBTMs", "MonacoModules", "MSigDB_Hallmark_v7.4", "BloodGen3Module"))
for(i in names(cameratab2)){
print(i)
url <- paste0("https://github.com/TranLab/ModuleLists/blob/main/", i, ".rds?raw=true")
module_list <- readRDS(url(url, method="libcurl"))
module.indices <- ids2indices(module_list, ygroup$`1.8 x 10^6 PfSPZ`$genes$GeneSymbol)
cameratab2[[i]] <- camera(ygroup$`1.8 x 10^6 PfSPZ`, module.indices, design, contrast = c(rep(0, ncol(design)-1),1), inter.gene.cor=0.01)
cameratab2[[i]] <- cameratab2[[i]] %>%
    rownames_to_column(var = "module")
}

myCameraPlotDat_KSPZV1 <- dplyr::bind_rows(cameratab2, .id = "module_type") %>%
  mutate(study = "KSPZV1") %>%
  dplyr::select(study, module_type, module, NGenes, Direction, FDR) %>%
  mutate(neglogpadj = -log(FDR)) %>%
  mutate(fdr_direction = ifelse(Direction == "Down", -1*neglogpadj, neglogpadj))

myCameraPlotDat_KSPZV1 %>%
  write_rds(. , "/Users/tuantran/Library/CloudStorage/OneDrive-IndianaUniversity/Manuscripts/KSPZV1 Manuscript/JCI Resubmission April 2023/Prelim Results JCI resubmission/KSPZV1_Camera_PvNP.rds")

CAMERA bubble plots, FDR<5%

#plotting options
basetextsize <- 12 
myfont <- "Helvetica"
bubble_max_size <- 15

myCameraPlotDat <- cameratab_bind %>%
  mutate(dosegroup = factor(dosegroup, levels = c("Placebo", "4.5 x 10^5 PfSPZ", "9.0 x 10^5 PfSPZ", "1.8 x 10^6 PfSPZ"))) %>%
  filter(FDR < 0.05) %>%
  #filter(module_type %in% c("highBTMs", "lowBTMs", "MonacoModules", "BloodGen3Module", "MSigDB_Hallmark_v7.4"))  %>%
  #filter(!grepl("TBA", module)) %>%
  dplyr::select(dosegroup, module, NGenes, Direction, FDR) %>%
  mutate(neglogpadj = -log10(FDR)) %>%
  mutate(fdr_direction = ifelse(Direction == "Up", neglogpadj*1, neglogpadj*-1)) %>%
  mutate(module = gsub("gd", "γδ", module)) %>%
  mutate(module = gsub("Vd2", "Vδ2", module)) %>%
  mutate(module = gsub("Vg", "Vγ", module)) %>%
  mutate(module = gsub("HALLMARK_", "", module)) %>%
  mutate(module = gsub("_", " ", module)) %>%
  mutate(module = sub(".*?\\_", "", module)) %>%
  group_by(dosegroup) %>%
  mutate(module = fct_reorder(module, fdr_direction, .desc = TRUE)) %>%
  ungroup() %>%
  mutate(dosegroup = fct_rev(dosegroup)) %>%
  filter(!grepl("TBD", module)) %>%
  #mutate(module_type = factor(module_type, levels = c("highBTMs", "MonacoModules", "lowBTMs", "MSigDB_Hallmark_v7.4", "BloodGen3Module"))) %>%
  arrange(desc(neglogpadj)) %>% 
  droplevels()

myCameraPlot <- myCameraPlotDat %>%
  #filter(module_type %in% c("BloodGen3Module"))  %>%
  ggplot(., aes(x = module, y = dosegroup)) +
  geom_point(aes(size=neglogpadj, fill = Direction), alpha = 0.65, shape=21, stroke = 0.25) +
  scale_size_area(name = expression(-log[10]~adj.~p~value), max_size = bubble_max_size, breaks = c(1.3, 5, 10, 20), limits = c(1,20)) +
      scale_fill_manual(values = c("blue","red")) +
  hrbrthemes::theme_ipsum_es(base_family = myfont, base_size = basetextsize) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        strip.background = element_blank(),
        legend.position = "bottom",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
myCameraPlot