KSPZV1 Complex Heatmaps Baseline, JCI revision

Objective

The following code generates the unsupervised hierarchical clustering heatmap for the KSPZV1 dataset using the ComplexHeatmap package. Clustering analysis is performed to test for statistical differences between Sample Clusters (SC) for outcomes as well as relevant variables (specifically SC1 and SC2-4). Gene Clusters (GC) are removed.

This script reproduces Figures 1C, S2, 1D, 1E in the revised manuscript. It includes the following changes:

Add study site (Wagai vs Siaya) to column annotations
Add chunk for comparing P vs NP between SC1 Placebo and SC1 1.8x10^6 PfSPZ

For revision, address Reviewer’s concerns that generation of clusters in heatmap is random by generating heatmap using random seed to show that the clustering is deterministic in our case.

Load required packages

library(tidyverse)
library(ComplexHeatmap)
library(miscTools)
library(edgeR)
library(Biobase)
library(googledrive)
library(data.table)
library(ggpubr)
library(fgsea)
library(devtools)
library(Cairo)

Options

Final options for Timepoint 0 (baseline): allgroups filter pre-seqmonk < 7.5M MADS = 100% (no filter, take all genes) clustering: pam and euclidean, 4 clusters for columns, 5 clusters for genes

myBatch <- "both" # "Aug2019" "Nov2019"
myGroup <- "allGroups" #"allGroups" #Placebo #4.5 x 10^5 PfSPZ #9.0 x 10^5 PfSPZ #1.8 x 10^6 PfSPZ
myTimepoint <- 0  #"delta" #0 25 #delta
ClusterCols <- TRUE
myFilter <- "preSEQMonk" #filter to remove based on library sizes prior to SEQMonk correction
CPMFILTER <- 7.5e6 #filter any samples with mapped library sizes less than 7.5 million reads
myCPMFILTER <- gsub("00000","e5", paste0(myFilter, CPMFILTER, "cpm"))

col.hclust.method <- "pam" 
col.dist.method <- "euclidean" 
row.hclust.method <- "pam" 
row.dist.method <- "euclidean" 
PCT <- 100

filter.type <- paste("mads",PCT,"pct",sep="")
noColClust <- 4
noRowClust <- 5

Load ExpressionSet

#from google drive
temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("1togaBlNIxiDj16FyXTC-r7Qw0A9cG2az"), path = temp, overwrite = TRUE)
x <- readRDS(file = dl$local_path)
x$treat <- factor(x$treat, levels = c("Placebo", "4.5 x 10^5 PfSPZ", "9.0 x 10^5 PfSPZ", "1.8 x 10^6 PfSPZ"))

Load additional baseline data

#CBC data
#local file "CBC baseline with days before VAX1.xlsx"
temp <- tempfile(fileext = ".xlsx")
dl <- drive_download(
  as_id("1df0pHJAlFteRIcQuJql3U2IZFe5SpZdB"), path = temp, overwrite = TRUE)
CBCdat <- readxl::read_excel(dl$local_path) %>%
  mutate(DaysPriorVax1 = as.integer(gsub(" \\(Dose 1\\)", "", .$`Days Post Vaccination`))*-1,
         Hemoglobin = as.numeric(Hemoglobin)) %>%
  dplyr::select(PATID, DaysPriorVax1, Hemoglobin, Platelets, WBC, Neutrophils, Percent.Neutrophils)
pData(x) <- pData(x) %>%
  left_join(., CBCdat, by = "PATID")

## weight data
#loca file "KSPZV1_anthro_2021-04-12.csv""
temp <- tempfile(fileext = ".csv")
dl <- drive_download(
  as_id("1hvjTuBKWIYKmDrOdiogGqcHPJUUPZM4P"), path = temp, overwrite = TRUE)
WeightDat <- read_csv(dl$local_path) %>%
  dplyr::select(PATID, zwei, zwfl, zbmi) #for definitions: https://cran.r-project.org/web/packages/anthro/anthro.pdf
pData(x) <- pData(x) %>%
  left_join(., WeightDat, by = "PATID") 

rownames(pData(x)) <- pData(x)$SAMPLEID
all.pDat <- pData(x)

Make annotation dataframe and then build heatmap annotation object using HeatMapAnnotation function

For details on how to make annotations, refer to: https://jokergoo.github.io/ComplexHeatmap-reference/book/

annot.df <- as.data.frame(cbind(
  "Site" = as.character(x$site),
  "Weight-for-age" = as.character(x$zwei),
  "Gender" = as.character(factor(x$SEX, levels = c("M","F"), labels = c("male","female"))),
  "Age" = as.character(x$age.vax1),
  "Pf at first vax" = as.character(factor(x$mal.vax.1, levels = c(0,1), labels = c("neg","pos"))),
  "CSP IgG pre-vax" = as.character(log10(x$pfcsp_pre+1)),
  "CSP IgG response (log2 fold change)" = as.character(x$log2FC_CSPAb),
  "Outcome" = as.character(factor(x$mal.atp.3, levels = c(0,1), labels = c("uninfected (protected)","infected (not protected)"))),
  "Dose" = as.character(x$treat))) 

annot.df <- rev(annot.df)
for(i in c(1:ncol(annot.df))){
  annot.df[,i] <- as.character(annot.df[,i])
  }
annot.df$`Weight-for-age` <- as.numeric(annot.df$`Weight-for-age`)
annot.df$Age <- as.numeric(annot.df$Age)
annot.df$`CSP IgG response (log2 fold change)` <- as.numeric(annot.df$`CSP IgG response (log2 fold change)`)
annot.df$`CSP IgG pre-vax` <- as.numeric(annot.df$`CSP IgG pre-vax`)
rownames(annot.df) <- colnames(x)
clab <- list(
  "Site" = c("Wagai" = "#B2DF8A", "Siaya" = "#33A02C"),
  "Weight-for-age" = circlize::colorRamp2(c(-max(abs(all.pDat$zwei), na.rm = TRUE), 0, max(abs(all.pDat$zwei), na.rm = TRUE)), c("blue", "white", "red")),
  "Gender" = c("male" = "#E5E5E5", "female" = "#191919"),
  "Age" = circlize::colorRamp2(c(min(all.pDat$age.vax1, na.rm=T), max(all.pDat$age.vax1, na.rm=T)), c("#E5E5E5", "#333333")),
  "Pf at first vax" = c("neg" = "#D1D3D3", "pos" = "#b2182b"),
  "CSP IgG pre-vax" = circlize::colorRamp2(c(min(log10(all.pDat$pfcsp_pre+1), na.rm=T), max(log10(all.pDat$pfcsp_pre+1), na.rm=T)), c("#f2f0f7", "#54278f")),
  "CSP IgG response (log2 fold change)" = circlize::colorRamp2(c(-max(abs(all.pDat$log2FC_CSPAb), na.rm=T), 0, max(abs(all.pDat$log2FC_CSPAb), na.rm=T)), c("#656566", "white", "#54278f")),
  "Outcome" = c("uninfected (protected)" = "#1F78B4", "infected (not protected)" = "#A6CEE3"),
  "Dose" = c("Placebo" = "#808080", "4.5 x 10^5 PfSPZ" = "#fdcc8a", "9.0 x 10^5 PfSPZ" = "#fc8d59", "1.8 x 10^6 PfSPZ" = "#d7301f")
  )

clab <- rev(clab)
ha <- HeatmapAnnotation(df = annot.df, col = clab, na_col = "lavender", annotation_name_gp = gpar(fontsize = 7),
                        simple_anno_size = unit(0.08, "inches"), #this adjusts the height of the column annotations
                        annotation_name_side = "left",
                        annotation_legend_param = list(ncol = 4, by_row = TRUE))

Scaling

colnames(x) <- colnames(exprs(x))
unscaled_logCPM <- x
ScaleData <- TRUE
if(ScaleData == TRUE){
  myScaleData <- "zscored"
  exprs(x) <- t(scale(t(exprs(x)), center = TRUE, scale = TRUE)) #scale is generic function whose default method centers and/or scales the columns of a numeric matrix, so you must transpose to scale the genes
  myHeatlegend.title <- "z-score"
}else{
  myScaleData <- "cpm"
  myHeatlegend.title <- "cpm"
}

Clustering and Distance options

Note: This chunk takes a while if you run the full gene set.

#https://jokergoo.github.io/ComplexHeatmap-reference/book/a-single-heatmap.html
#See options for variables
#set.seed(12345)
#set parallel backend.  
library(parallel)
library(parallelMap)
parallelStartSocket(cpus = detectCores())
#use random seed to access stability of clustering motivated by reviewer comment
myseed <- sample(1:5000,1) #sample random integer
set.seed(myseed) #set random seed

if(col.hclust.method == "pam"){
  pa.col <- cluster::pam(t(exprs(x)), k = noColClust, metric = col.dist.method)
  col.dist.method <- "euclidean"
}
if(col.hclust.method == "kmeans"){
  km.col = amap::Kmeans(t(exprs(x)), centers = noColClust, nstart = 25, method = col.dist.method)$cluster
}
if(col.hclust.method != "kmeans" & col.hclust.method != "pam"){
  hclust.col <- cutree(stats::hclust(dist(t(exprs(x)), method = col.dist.method), method = col.hclust.method), k = noColClust)
}

#row.hclust.method <- "pam" #pam #kmeans
if(row.hclust.method == "pam"){
  pa.row <- cluster::pam(exprs(x), k = noRowClust, metric = row.dist.method)
}
if(row.hclust.method == "kmeans"){
  km.row = amap::Kmeans(exprs(x), centers = noRowClust, nstart = 25, method = row.dist.method)$cluster
}
if(row.hclust.method != "kmeans" & row.hclust.method != "pam"){
  hclust.row <- cutree(stats::hclust(dist(exprs(x), method = row.dist.method), method = row.hclust.method), k = noRowClust)
}

For manuscript, used dummy levels for ordering by treat correctly AFTER pam cluster. For columns, used pam k=4 clusters for pre-vax For rows, used pam k=5 cluster for pre-vax

Plot heatmap (Figure 1B in revised manuscript)

hm <- draw(hm, heatmap_legend_side="top", annotation_legend_side="right",
           legend_grouping = "original")

Extract clustering information

see https://github.com/jokergoo/ComplexHeatmap/issues/136

Note: may have to run this chunk directly in console if you get the following error: Error: The width or height of the raster image is zero, maybe you forget to turn off the previous graphic device or it was corrupted. Run dev.off() to close it.

c.dend <- column_dend(hm)  #Extract col dendrogram
ccl.list <- column_order(hm)  #Extract clusters (output is a list)
#lapply(ccl.list, function(x) length(x))  #check/confirm size clusters

# loop to extract samples for each cluster.
for (i in 1:length(column_order(hm))){
  if (i == 1) {
    clu <- t(t(colnames(exprs(x)[,column_order(hm)[[i]], drop = FALSE]))) #add drop = FALSE argument to prevent the matrix from becoming a vector for single column clusters
    out <- cbind(clu, paste("cluster", i, sep=""))
    colnames(out) <- c("SampleID", "Cluster")
    } else {
      clu <- t(t(colnames(exprs(x)[,column_order(hm)[[i]], drop = FALSE])))
      clu <- cbind(clu, paste("cluster", i, sep=""))
      out <- rbind(out, clu)
    }
}

## Merge with pData(x)
if(myGroup == "allGroups"){
  if(noColClust == 4){
    if(myTimepoint != "delta"){
      pData.cluster <- pData(x) %>%
        rownames_to_column(var = "SampleID") %>%
        left_join(., as_tibble(out), by = "SampleID") %>%
        column_to_rownames(var = "SampleID") %>%
        mutate(Cluster = gsub("cluster", "SC", .$Cluster)) %>%
        mutate(Cluster.pam4.cl16 = Cluster) %>%
        mutate(Cluster = gsub("5", "1", .$Cluster)) %>%
        mutate(Cluster = gsub("9", "1", .$Cluster)) %>%
        mutate(Cluster = gsub("13", "1", .$Cluster)) %>%
        mutate(Cluster = ifelse(.$Cluster != "SC1", "SC2-4", .$Cluster)) %>% #regroup SC2, SC3, SC4 as single group SC2-4
        filter(Cluster == "SC1" | Cluster == "SC2-4")
    }
  }
}
pData.cluster <- pData.cluster %>%
  mutate(Cluster2 = ifelse(Cluster.pam4.cl16 == "SC13", "SC1", "TBD")) %>% #clusters 13-16 are SC1-4 in 1.8x10^6 clusters on Figure S2A
  mutate(Cluster2 = ifelse(Cluster.pam4.cl16 == "SC14", "SC2", Cluster2)) %>%
  mutate(Cluster2 = ifelse(Cluster.pam4.cl16 == "SC15", "SC3", Cluster2)) %>%
  mutate(Cluster2 = ifelse(Cluster.pam4.cl16 == "SC16", "SC4", Cluster2)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC1", "SC1.1", "TBD")) %>% #clusters 1-4 are SC1-4 (Placebo) on Figure S2A
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC2", "SC2.1", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC3", "SC3.1", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC4", "SC4.1", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC5", "SC1.2", Cluster3)) %>% #clusters 5-8 are SC1-4 (4.5x10^5) on Figure S2A
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC6", "SC2.2", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC7", "SC3.2", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC8", "SC4.2", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC9", "SC1.3", Cluster3)) %>% #clusters 9-12 are SC1-4 (9.0x10^5) on Figure S2A
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC10", "SC2.3", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC11", "SC3.3", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC12", "SC4.3", Cluster3)) %>%  
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC13", "SC1.4", Cluster3)) %>% #clusters 13-16 are SC1-4 (1.8x10^6) on Figure S2A
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC14", "SC2.4", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC15", "SC3.4", Cluster3)) %>%
  mutate(Cluster3 = ifelse(Cluster.pam4.cl16 == "SC16", "SC4.4", Cluster3)) %>%
  mutate(Cluster3 = factor(Cluster3,
                           levels = c("SC1.1", "SC2.1", "SC3.1", "SC4.1",
                                      "SC1.2", "SC2.2", "SC3.2", "SC4.2",
                                      "SC1.3", "SC2.3", "SC3.3", "SC4.3",
                                      "SC1.4", "SC2.4", "SC3.4", "SC4.4"))) %>%
  droplevels()
#sanity checks
table(pData.cluster$Cluster2,pData.cluster$treat)

##      
##       Placebo 4.5 x 10^5 PfSPZ 9.0 x 10^5 PfSPZ 1.8 x 10^6 PfSPZ
##   SC1       0                0                0               23
##   SC2       0                0                0               22
##   SC3       0                0                0               12
##   SC4       0                0                0                6
##   TBD      65               58               58                0

table(pData.cluster$Cluster3,pData.cluster$treat)

##        
##         Placebo 4.5 x 10^5 PfSPZ 9.0 x 10^5 PfSPZ 1.8 x 10^6 PfSPZ
##   SC1.1      18                0                0                0
##   SC2.1      18                0                0                0
##   SC3.1      21                0                0                0
##   SC4.1       8                0                0                0
##   SC1.2       0               12                0                0
##   SC2.2       0               27                0                0
##   SC3.2       0               13                0                0
##   SC4.2       0                6                0                0
##   SC1.3       0                0               14                0
##   SC2.3       0                0               18                0
##   SC3.3       0                0               17                0
##   SC4.3       0                0                9                0
##   SC1.4       0                0                0               23
##   SC2.4       0                0                0               22
##   SC3.4       0                0                0               12
##   SC4.4       0                0                0                6

#setdiff(colnames(x), rownames(pData.cluster))
#setdiff(rownames(pData.cluster), colnames(x))

Plot contingency table and stats for Figure S2A

First plot 2x2 contingency for P and NP in SC1 and SC2-4 and perform Fisher’s Exact as per original.
To address reviewer concerns, plot 2x4 contingency for P and NP in SC1, SC2, SC3, SC4. Perform chi-square as per reviewer suggestion.

high_dose_SC <- pData.cluster %>%
  filter(treat == "1.8 x 10^6 PfSPZ") %>%
  mutate(outcome = ifelse(mal.atp.3 == 1, "NP", "P")) %>% #mal.atp.3 == 1 means malaria infected at least once during 3 months (according to protocol)
  droplevels()
#original Fisher's exact test between SC1 and SC2-4
my_tab <- table(high_dose_SC$outcome, high_dose_SC$Cluster)
my_tab

##     
##      SC1 SC2-4
##   NP   4    21
##   P   19    19

fisher.test(my_tab)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  my_tab
## p-value = 0.007686
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.04093891 0.73404658
## sample estimates:
## odds ratio 
##  0.1955618

#reviewer suggested chi-square between SC1, SC2, SC3, SC4
my_tab <- table(high_dose_SC$outcome, high_dose_SC$Cluster2)
my_tab

##     
##      SC1 SC2 SC3 SC4
##   NP   4  12   5   4
##   P   19  10   7   2

chisq.test(my_tab)

## 
##  Pearson's Chi-squared test
## 
## data:  my_tab
## X-squared = 8.6502, df = 3, p-value = 0.03432

Results remain significant.

#perform chi-square for P vs NP distribution across ALL 16 clusters
#not requested by reviewer but did just for fun
foo <- pData.cluster %>%
  mutate(outcome = ifelse(mal.atp.3 == 1, "NP", "P")) %>% #mal.atp.3 == 1 means malaria infected at least once during 3 months (according to protocol)
  droplevels()

my_tab <- table(foo$outcome, foo$Cluster3)
my_tab

##     
##      SC1.1 SC2.1 SC3.1 SC4.1 SC1.2 SC2.2 SC3.2 SC4.2 SC1.3 SC2.3 SC3.3 SC4.3
##   NP    10    12    15     2     7    15     5     2     7    12     9     1
##   P      8     6     6     6     5    12     8     4     7     6     8     8
##     
##      SC1.4 SC2.4 SC3.4 SC4.4
##   NP     4    12     5     4
##   P     19    10     7     2

chisq.test(my_tab)

## 
##  Pearson's Chi-squared test
## 
## data:  my_tab
## X-squared = 28.573, df = 15, p-value = 0.01825