Last updated: 2020-12-23

Here we perform TF motif and gene analyses of the topics inferred from the Cusanovich et al. (2018) scATAC-seq data using the multinomial topic model with \(k = 13\).

Load packages and some functions used in this analysis


Load data and topic model results

Load the data and the \(k = 13\) Poisson NMF fit results.

# Location of the processed Cusanovich et al (2018) data.
data.dir <- "/project2/mstephens/kevinluo/scATACseq-topics/data/Cusanovich_2018/processed_data/"
# load() restores the objects stored in the .RData file into the workspace.
# NOTE(review): `samples`, used further down, presumably comes from this
# file -- confirm.
load(file.path(data.dir, "Cusanovich_2018.RData"))
# Location of the topic-model outputs.
out.dir <- "/project2/mstephens/kevinluo/scATACseq-topics/output/Cusanovich_2018"
# file.path() inserts the path separator itself, so the file name is given
# without a leading "/" (which would produce ".../Cusanovich_2018//fit-...").
# Only the `fit` element of the saved object is kept.
fit <- readRDS(file.path(out.dir, "fit-Cusanovich2018-scd-ex-k=13.rds"))$fit
# Multinomial topic model counterpart of the Poisson NMF fit.
fit_multinom <- poisson2multinom(fit)

Visualize by Structure plot grouped by tissues

# One colour per topic (13 topics are plotted below).
# NOTE(review): the original vector was cut off after its sixth entry, leaving
# an unterminated c( call. Entries 7-12 complete the ColorBrewer "Paired"
# palette that the first six entries belong to, and the 13th is a neutral
# gray -- confirm against the original analysis.
colors_topics <- c("#a6cee3","#1f78b4","#b2df8a","#33a02c","#fb9a99","#e31a1c",
                   "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928",
                   "gray")
# Random subset of (at most) 4000 rows of the loadings matrix; the cap avoids
# an error from sample() when the fit has fewer than 4000 rows.
rows <- sample(nrow(fit$L), min(4000, nrow(fit$L)))
samples$tissue <- as.factor(samples$tissue)

# Structure plot of the sampled rows, grouped by tissue.
p.structure <- structure_plot(select(fit_multinom,loadings = rows),
                              grouping = samples[rows, "tissue"],n = Inf,gap = 40,
                              perplexity = 50,topics = 1:13,colors = colors_topics,
                              num_threads = 4,verbose = FALSE)
# Assigning the plot to a variable does not draw it; print it explicitly.
print(p.structure)
# Perplexity automatically changed to 42 because original setting of 50 was too large for the number of samples (132)
# Perplexity automatically changed to 40 because original setting of 50 was too large for the number of samples (124)


Version Author Date
b9aa0bf kevinlkx 2020-12-22

Differential accessibility analysis of the ATAC-seq regions for the topics

Load results from the differential accessibility analysis for the topics

# Results of the per-topic differential accessibility analysis. Its `Z`
# element holds the z-scores, with one column per topic.
# file.path() adds the separator, so the file name has no leading "/".
diff_count_topics <- readRDS(file.path(out.dir, "diffcount-Cusanovich2018-13topics.rds"))

Distribution of z-scores

# Long format: one row per (region, topic) pair with its z-score.
zscore_topics <- melt(diff_count_topics$Z)
colnames(zscore_topics) <- c("region", "topic", "zscore")

# Order the topic factor by the columns of Z (k1, k2, ..., k13).
# `levels(x) <- new` must not be used for this: it RENAMES the existing levels
# position by position. If melt() returned a factor with alphabetically sorted
# levels (k1, k10, k11, ...), the data for k10 would end up labelled "k2", and
# so on. Rebuilding the factor from the labels keeps each z-score attached to
# its own topic.
topic_names <- colnames(diff_count_topics$Z)
if (!is.null(topic_names)) {
  zscore_topics$topic <- factor(as.character(zscore_topics$topic),
                                levels = topic_names)
}

# 99% quantile of |z| for each topic (column of Z).
z.quantile.99 <- apply(abs(diff_count_topics$Z), 2, quantile, probs = 0.99)
cat("z-score 99% quantile: \n")
# The header alone was printed before; print the values as well.
print(z.quantile.99)

# Histogram of z-scores, one panel per topic; the x-axis is zoomed to
# [-10, 30] without dropping data (coord_cartesian).
p.hist.zscores <- ggplot(zscore_topics, aes(x=zscore)) +
  geom_histogram(binwidth=1, color="black", fill="white") +
  coord_cartesian(xlim = c(-10, 30)) + theme_cowplot(font_size = 10) +
  facet_wrap(~ topic, ncol=4)
# Assigning the plot to a variable does not draw it; print it explicitly.
print(p.hist.zscores)


Version Author Date
b9aa0bf kevinlkx 2020-12-22
# z-score 99% quantile: 
#       k1       k2       k3       k4       k5       k6       k7       k8 
# 21.42992 31.48751 25.46082 25.97670 34.64418 37.09098 32.07250 39.65746 
#       k9      k10      k11      k12      k13 
# 25.93102 15.88394 34.29782 39.80147 20.71928

Motif enrichment analysis using HOMER

# HOMER motif enrichment for regions selected by quantile.
homer.dir <- paste0(out.dir, "/motifanalysis-Cusanovich2018-k=13-quantile/HOMER/quantile")
cat(sprintf("Directory of motif analysis result: %s \n", homer.dir))
# file.path() adds the separator, so the file names have no leading "/".
homer_res <- readRDS(file.path(homer.dir, "homer_knownResults.rds"))
selected_regions <- readRDS(file.path(homer.dir, "selected_regions.rds"))

cat("Number of regions selected for each topic: \n")
# The last element of selected_regions is left out, as in the original code
# (presumably it is not a per-topic table -- TODO confirm).
print(vapply(head(selected_regions, -1), nrow, integer(1)))

# Table of the 10 top-ranked motifs for each topic (one column per topic).
top_motifs <- data.frame(matrix(nrow=10, ncol = ncol(diff_count_topics$Z)))
colnames(top_motifs) <- colnames(diff_count_topics$Z)
for (k in colnames(top_motifs)){
  homer_motifs <- homer_res[[k]]
  # The first column needs a real name so that separate() can refer to it;
  # it was "" before, and separate() was called without a column.
  colnames(homer_motifs) <- c("motif_name", "Consensus", "P-value", "Log.P-value", "q-value (Benjamini)",
                              "# of Target Sequences with Motif", "% of Target Sequences with Motif",
                              "# of Background Sequences with Motif", "% of Background Sequences with Motif")
  # Split the "/"-delimited motif name into its three fields.
  homer_motifs <- homer_motifs %>% separate(motif_name, c("motif", "experiment", "database"), "/")
  # Take the first 10 rows in the stored order; indexing with seq_len(10)
  # pads with NA when a topic has fewer than 10 motifs, so the column
  # assignment cannot fail on a length mismatch.
  top_motifs[,k] <- homer_motifs$motif[seq_len(10)]
}

DT::datatable(data.frame(rank = 1:10, top_motifs), rownames = FALSE,
              caption = "Top 10 motifs enriched in each topic.")
# Directory of motif analysis result: /project2/mstephens/kevinluo/scATACseq-topics/output/Cusanovich_2018/motifanalysis-Cusanovich2018-k=13-quantile/HOMER/quantile 
# Number of regions selected for each topic: 
#   k1   k2   k3   k4   k5   k6   k7   k8   k9  k10  k11  k12  k13 
# 4363 4363 4363 4363 4363 4363 4363 4363 4363 4363 4363 4363 4363
# HOMER motif enrichment for regions selected by z-score.
homer.dir <- paste0(out.dir, "/motifanalysis-Cusanovich2018-k=13-zscore/HOMER/zscore")
cat(sprintf("Directory of motif analysis result: %s \n", homer.dir))
# file.path() adds the separator, so the file names have no leading "/".
homer_res <- readRDS(file.path(homer.dir, "homer_knownResults.rds"))
selected_regions <- readRDS(file.path(homer.dir, "selected_regions.rds"))

cat("Number of regions selected for each topic: \n")
# The last element of selected_regions is left out, as in the original code
# (presumably it is not a per-topic table -- TODO confirm).
print(vapply(head(selected_regions, -1), nrow, integer(1)))

# Table of the 10 top-ranked motifs for each topic (one column per topic).
top_motifs <- data.frame(matrix(nrow=10, ncol = ncol(diff_count_topics$Z)))
colnames(top_motifs) <- colnames(diff_count_topics$Z)
for (k in colnames(top_motifs)){
  homer_motifs <- homer_res[[k]]
  # The first column needs a real name so that separate() can refer to it;
  # it was "" before, and separate() was called without a column.
  colnames(homer_motifs) <- c("motif_name", "Consensus", "P-value", "Log.P-value", "q-value (Benjamini)",
                              "# of Target Sequences with Motif", "% of Target Sequences with Motif",
                              "# of Background Sequences with Motif", "% of Background Sequences with Motif")
  # Split the "/"-delimited motif name into its three fields.
  homer_motifs <- homer_motifs %>% separate(motif_name, c("motif", "experiment", "database"), "/")
  # Take the first 10 rows in the stored order; indexing with seq_len(10)
  # pads with NA when a topic has fewer than 10 motifs, so the column
  # assignment cannot fail on a length mismatch.
  top_motifs[,k] <- homer_motifs$motif[seq_len(10)]
}

DT::datatable(data.frame(rank = 1:10, top_motifs), rownames = FALSE,
              caption = "Top 10 motifs enriched in each topic.")
# Directory of motif analysis result: /project2/mstephens/kevinluo/scATACseq-topics/output/Cusanovich_2018/motifanalysis-Cusanovich2018-k=13-zscore/HOMER/zscore 
# Number of regions selected for each topic: 
#    k1    k2    k3    k4    k5    k6    k7    k8    k9   k10   k11   k12   k13 
#  5199  9695  8446  9752 17209 17758 12988 22940  8024   747 27044 20069  5467

Top genes

Gene body model

Gene scores were computed using the gene score model (model 42) from the ArchR paper, with some modifications. This model uses bi-directional exponential decays from the gene TSS (extended upstream by 5 kb by default) and the gene transcription termination site (TTS). Note: the current version of the function does not account for neighboring gene boundaries.

  • Gene body model, normalized by the l2 norm of weights, as in Stouffer's z-score method.
# Top genes: gene body model, weights normalized by their l2 norm.
gene.dir <- paste0(out.dir, "/geneanalysis-Cusanovich2018-k=13-genebody-l2")
cat(sprintf("Directory of gene analysis result: %s \n", gene.dir))
# NOTE(review): `gene_scores` and `genes` are presumably restored by this
# load() call -- confirm.
load(file.path(gene.dir, "genescores_gsea.Rdata"))

# Table of the 10 top-ranked genes for each topic (one column per topic).
top_genes <- data.frame(matrix(nrow=10, ncol = ncol(gene_scores)))
colnames(top_genes) <- colnames(gene_scores)

# Relabel the rows of the score matrix from ENSEMBL ids to gene symbols.
rownames(gene_scores) <- genes[match(rownames(gene_scores), genes$ENSEMBL), "SYMBOL"]

for (k in colnames(top_genes)){
  # Rank genes by absolute score, largest first. Indexing with seq_len(10)
  # pads with NA when fewer than 10 genes are available, so the column
  # assignment cannot fail on a length mismatch.
  top_idx <- order(abs(gene_scores[,k]), decreasing=TRUE)[seq_len(10)]
  top_genes[,k] <- rownames(gene_scores)[top_idx]
}

DT::datatable(data.frame(rank = 1:10, top_genes), rownames = FALSE,
              caption = "Top 10 genes in each topic.")
# Directory of gene analysis result: /project2/mstephens/kevinluo/scATACseq-topics/output/Cusanovich_2018/geneanalysis-Cusanovich2018-k=13-genebody-l2
  • Gene body model, normalized by the total weights (i.e. weighted average).
# Top genes: gene body model, weights normalized by their sum.
gene.dir <- paste0(out.dir, "/geneanalysis-Cusanovich2018-k=13-genebody-sum")
cat(sprintf("Directory of gene analysis result: %s \n", gene.dir))
# NOTE(review): `gene_scores` and `genes` are presumably restored by this
# load() call -- confirm.
load(file.path(gene.dir, "genescores_gsea.Rdata"))

# Table of the 10 top-ranked genes for each topic (one column per topic).
top_genes <- data.frame(matrix(nrow=10, ncol = ncol(gene_scores)))
colnames(top_genes) <- colnames(gene_scores)

# Relabel the rows of the score matrix from ENSEMBL ids to gene symbols.
rownames(gene_scores) <- genes[match(rownames(gene_scores), genes$ENSEMBL), "SYMBOL"]

for (k in colnames(top_genes)){
  # Rank genes by absolute score, largest first. Indexing with seq_len(10)
  # pads with NA when fewer than 10 genes are available, so the column
  # assignment cannot fail on a length mismatch.
  top_idx <- order(abs(gene_scores[,k]), decreasing=TRUE)[seq_len(10)]
  top_genes[,k] <- rownames(gene_scores)[top_idx]
}

DT::datatable(data.frame(rank = 1:10, top_genes), rownames = FALSE,
              caption = "Top 10 genes in each topic.")
# Directory of gene analysis result: /project2/mstephens/kevinluo/scATACseq-topics/output/Cusanovich_2018/geneanalysis-Cusanovich2018-k=13-genebody-sum

TSS model

Gene scores were computed using the TSS-based method of Lareau et al. (Nature Biotechnology, 2019), which is also model 21 in the ArchR paper. This model weights chromatin accessibility around gene promoters using bi-directional exponential decays from the TSS.

  • TSS model, normalized by the l2 norm of weights, as in Stouffer's z-score method.
# Top genes: TSS model, weights normalized by their l2 norm.
gene.dir <- paste0(out.dir, "/geneanalysis-Cusanovich2018-k=13-TSS-l2")
cat(sprintf("Directory of gene analysis result: %s \n", gene.dir))
# NOTE(review): `gene_scores` and `genes` are presumably restored by this
# load() call -- confirm.
load(file.path(gene.dir, "genescores_gsea.Rdata"))

# Table of the 10 top-ranked genes for each topic (one column per topic).
top_genes <- data.frame(matrix(nrow=10, ncol = ncol(gene_scores)))
colnames(top_genes) <- colnames(gene_scores)

# Relabel the rows of the score matrix from ENSEMBL ids to gene symbols.
rownames(gene_scores) <- genes[match(rownames(gene_scores), genes$ENSEMBL), "SYMBOL"]

for (k in colnames(top_genes)){
  # Rank genes by absolute score, largest first. Indexing with seq_len(10)
  # pads with NA when fewer than 10 genes are available, so the column
  # assignment cannot fail on a length mismatch.
  top_idx <- order(abs(gene_scores[,k]), decreasing=TRUE)[seq_len(10)]
  top_genes[,k] <- rownames(gene_scores)[top_idx]
}

DT::datatable(data.frame(rank = 1:10, top_genes), rownames = FALSE,
              caption = "Top 10 genes in each topic.")
# Directory of gene analysis result: /project2/mstephens/kevinluo/scATACseq-topics/output/Cusanovich_2018/geneanalysis-Cusanovich2018-k=13-TSS-l2
  • TSS model, normalized by the total weights (i.e. weighted average).
# Top genes: TSS model, weights normalized by their sum.
gene.dir <- paste0(out.dir, "/geneanalysis-Cusanovich2018-k=13-TSS-sum")
cat(sprintf("Directory of gene analysis result: %s \n", gene.dir))
# NOTE(review): `gene_scores` and `genes` are presumably restored by this
# load() call -- confirm.
load(file.path(gene.dir, "genescores_gsea.Rdata"))

# Table of the 10 top-ranked genes for each topic (one column per topic).
top_genes <- data.frame(matrix(nrow=10, ncol = ncol(gene_scores)))
colnames(top_genes) <- colnames(gene_scores)

# Relabel the rows of the score matrix from ENSEMBL ids to gene symbols.
rownames(gene_scores) <- genes[match(rownames(gene_scores), genes$ENSEMBL), "SYMBOL"]

for (k in colnames(top_genes)){
  # Rank genes by absolute score, largest first. Indexing with seq_len(10)
  # pads with NA when fewer than 10 genes are available, so the column
  # assignment cannot fail on a length mismatch.
  top_idx <- order(abs(gene_scores[,k]), decreasing=TRUE)[seq_len(10)]
  top_genes[,k] <- rownames(gene_scores)[top_idx]
}

DT::datatable(data.frame(rank = 1:10, top_genes), rownames = FALSE,
              caption = "Top 10 genes in each topic.")
# Directory of gene analysis result: /project2/mstephens/kevinluo/scATACseq-topics/output/Cusanovich_2018/geneanalysis-Cusanovich2018-k=13-TSS-sum

Gene-set enrichment analysis (GSEA)

# GSEA results: gene body model, weights normalized by their l2 norm.
gene.dir <- paste0(out.dir, "/geneanalysis-Cusanovich2018-k=13-genebody-l2")
cat(sprintf("Directory of gene analysis result: %s \n", gene.dir))
# NOTE(review): `gsea_res` and `gene_set_info` are presumably restored by this
# load() call -- confirm.
load(file.path(gene.dir, "genescores_gsea.Rdata"))

# Tables of the 10 top pathways per topic, split by the sign of the
# enrichment score (ES).
top_pathways_up <- top_pathways_down <- data.frame(matrix(nrow=10, ncol = ncol(gsea_res$pval)))
colnames(top_pathways_up) <- colnames(top_pathways_down) <- colnames(gsea_res$pval)

for (k in seq_len(ncol(gsea_res$pval))){
  gsea_topic <- data.frame(pathway = rownames(gsea_res$pval),
                           pval = gsea_res$pval[,k],
                           log2err = gsea_res$log2err[,k],
                           ES = gsea_res$ES[,k],
                           NES = gsea_res$NES[,k])

  # Pathways with ES > 0. which() drops rows whose ES is NA instead of turning
  # them into all-NA rows.
  gsea_up <- gsea_topic[which(gsea_topic$ES > 0),]
  # Ten smallest p-values; seq_len(10) pads with NA when fewer than 10
  # pathways qualify, so the column assignment cannot fail.
  top_IDs_up <- as.character(gsea_up$pathway[order(gsea_up$pval)[seq_len(10)]])
  info_up <- gene_set_info[match(top_IDs_up, gene_set_info$id),c("name", "id")]
  # Label is "<name>.<id>"; a plain character vector is assigned rather than a
  # one-column data frame.
  top_pathways_up[,k] <- ifelse(is.na(top_IDs_up), NA_character_,
                                paste(info_up$name, info_up$id, sep = "."))

  # Same for pathways with ES < 0.
  gsea_down <- gsea_topic[which(gsea_topic$ES < 0),]
  top_IDs_down <- as.character(gsea_down$pathway[order(gsea_down$pval)[seq_len(10)]])
  info_down <- gene_set_info[match(top_IDs_down, gene_set_info$id),c("name", "id")]
  top_pathways_down[,k] <- ifelse(is.na(top_IDs_down), NA_character_,
                                  paste(info_down$name, info_down$id, sep = "."))
}

# Only the ES > 0 table is displayed here; top_pathways_down is computed but
# not shown in this chunk.
DT::datatable(data.frame(rank = 1:10, top_pathways_up), rownames = FALSE,
              caption = "Top 10 pathways enriched at the top of the gene rank list.")
# Directory of gene analysis result: /project2/mstephens/kevinluo/scATACseq-topics/output/Cusanovich_2018/geneanalysis-Cusanovich2018-k=13-genebody-l2
# GSEA results: gene body model, weights normalized by their sum.
gene.dir <- paste0(out.dir, "/geneanalysis-Cusanovich2018-k=13-genebody-sum")
cat(sprintf("Directory of gene analysis result: %s \n", gene.dir))
# NOTE(review): `gsea_res` and `gene_set_info` are presumably restored by this
# load() call -- confirm.
load(file.path(gene.dir, "genescores_gsea.Rdata"))

# Tables of the 10 top pathways per topic, split by the sign of the
# enrichment score (ES).
top_pathways_up <- top_pathways_down <- data.frame(matrix(nrow=10, ncol = ncol(gsea_res$pval)))
colnames(top_pathways_up) <- colnames(top_pathways_down) <- colnames(gsea_res$pval)

for (k in seq_len(ncol(gsea_res$pval))){
  gsea_topic <- data.frame(pathway = rownames(gsea_res$pval),
                           pval = gsea_res$pval[,k],
                           log2err = gsea_res$log2err[,k],
                           ES = gsea_res$ES[,k],
                           NES = gsea_res$NES[,k])

  # Pathways with ES > 0. which() drops rows whose ES is NA instead of turning
  # them into all-NA rows.
  gsea_up <- gsea_topic[which(gsea_topic$ES > 0),]
  # Ten smallest p-values; seq_len(10) pads with NA when fewer than 10
  # pathways qualify, so the column assignment cannot fail.
  top_IDs_up <- as.character(gsea_up$pathway[order(gsea_up$pval)[seq_len(10)]])
  info_up <- gene_set_info[match(top_IDs_up, gene_set_info$id),c("name", "id")]
  # Label is "<name>.<id>"; a plain character vector is assigned rather than a
  # one-column data frame.
  top_pathways_up[,k] <- ifelse(is.na(top_IDs_up), NA_character_,
                                paste(info_up$name, info_up$id, sep = "."))

  # Same for pathways with ES < 0.
  gsea_down <- gsea_topic[which(gsea_topic$ES < 0),]
  top_IDs_down <- as.character(gsea_down$pathway[order(gsea_down$pval)[seq_len(10)]])
  info_down <- gene_set_info[match(top_IDs_down, gene_set_info$id),c("name", "id")]
  top_pathways_down[,k] <- ifelse(is.na(top_IDs_down), NA_character_,
                                  paste(info_down$name, info_down$id, sep = "."))
}

# Only the ES > 0 table is displayed here; top_pathways_down is computed but
# not shown in this chunk.
DT::datatable(data.frame(rank = 1:10, top_pathways_up), rownames = FALSE,
              caption = "Top 10 pathways enriched at the top of the gene rank list.")
# Directory of gene analysis result: /project2/mstephens/kevinluo/scATACseq-topics/output/Cusanovich_2018/geneanalysis-Cusanovich2018-k=13-genebody-sum
# GSEA results: TSS model, weights normalized by their l2 norm.
gene.dir <- paste0(out.dir, "/geneanalysis-Cusanovich2018-k=13-TSS-l2")
cat(sprintf("Directory of gene analysis result: %s \n", gene.dir))
# NOTE(review): `gsea_res` and `gene_set_info` are presumably restored by this
# load() call -- confirm.
load(file.path(gene.dir, "genescores_gsea.Rdata"))

# Tables of the 10 top pathways per topic, split by the sign of the
# enrichment score (ES).
top_pathways_up <- top_pathways_down <- data.frame(matrix(nrow=10, ncol = ncol(gsea_res$pval)))
colnames(top_pathways_up) <- colnames(top_pathways_down) <- colnames(gsea_res$pval)

for (k in seq_len(ncol(gsea_res$pval))){
  gsea_topic <- data.frame(pathway = rownames(gsea_res$pval),
                           pval = gsea_res$pval[,k],
                           log2err = gsea_res$log2err[,k],
                           ES = gsea_res$ES[,k],
                           NES = gsea_res$NES[,k])

  # Pathways with ES > 0. which() drops rows whose ES is NA instead of turning
  # them into all-NA rows.
  gsea_up <- gsea_topic[which(gsea_topic$ES > 0),]
  # Ten smallest p-values; seq_len(10) pads with NA when fewer than 10
  # pathways qualify, so the column assignment cannot fail.
  top_IDs_up <- as.character(gsea_up$pathway[order(gsea_up$pval)[seq_len(10)]])
  info_up <- gene_set_info[match(top_IDs_up, gene_set_info$id),c("name", "id")]
  # Label is "<name>(<id>)".
  top_pathways_up[,k] <- ifelse(is.na(top_IDs_up), NA_character_,
                                paste0(info_up$name, "(", info_up$id, ")"))

  # Same for pathways with ES < 0.
  gsea_down <- gsea_topic[which(gsea_topic$ES < 0),]
  top_IDs_down <- as.character(gsea_down$pathway[order(gsea_down$pval)[seq_len(10)]])
  info_down <- gene_set_info[match(top_IDs_down, gene_set_info$id),c("name", "id")]
  top_pathways_down[,k] <- ifelse(is.na(top_IDs_down), NA_character_,
                                  paste0(info_down$name, "(", info_down$id, ")"))
}

# Only the ES > 0 table is displayed here; top_pathways_down is computed but
# not shown in this chunk.
DT::datatable(data.frame(rank = 1:10, top_pathways_up), rownames = FALSE,
              caption = "Top 10 pathways enriched at the top of the gene rank list.")
# Directory of gene analysis result: /project2/mstephens/kevinluo/scATACseq-topics/output/Cusanovich_2018/geneanalysis-Cusanovich2018-k=13-TSS-l2
# GSEA results: TSS model, weights normalized by their sum.
gene.dir <- paste0(out.dir, "/geneanalysis-Cusanovich2018-k=13-TSS-sum")
cat(sprintf("Directory of gene analysis result: %s \n", gene.dir))
# NOTE(review): `gsea_res` and `gene_set_info` are presumably restored by this
# load() call -- confirm.
load(file.path(gene.dir, "genescores_gsea.Rdata"))

# Tables of the 10 top pathways per topic, split by the sign of the
# enrichment score (ES).
top_pathways_up <- top_pathways_down <- data.frame(matrix(nrow=10, ncol = ncol(gsea_res$pval)))
colnames(top_pathways_up) <- colnames(top_pathways_down) <- colnames(gsea_res$pval)

for (k in seq_len(ncol(gsea_res$pval))){
  gsea_topic <- data.frame(pathway = rownames(gsea_res$pval),
                           pval = gsea_res$pval[,k],
                           log2err = gsea_res$log2err[,k],
                           ES = gsea_res$ES[,k],
                           NES = gsea_res$NES[,k])

  # Pathways with ES > 0. which() drops rows whose ES is NA instead of turning
  # them into all-NA rows.
  gsea_up <- gsea_topic[which(gsea_topic$ES > 0),]
  # Ten smallest p-values; seq_len(10) pads with NA when fewer than 10
  # pathways qualify, so the column assignment cannot fail.
  top_IDs_up <- as.character(gsea_up$pathway[order(gsea_up$pval)[seq_len(10)]])
  info_up <- gene_set_info[match(top_IDs_up, gene_set_info$id),c("name", "id")]
  # Label is "<name>(<id>)".
  top_pathways_up[,k] <- ifelse(is.na(top_IDs_up), NA_character_,
                                paste0(info_up$name, "(", info_up$id, ")"))

  # Same for pathways with ES < 0.
  gsea_down <- gsea_topic[which(gsea_topic$ES < 0),]
  top_IDs_down <- as.character(gsea_down$pathway[order(gsea_down$pval)[seq_len(10)]])
  info_down <- gene_set_info[match(top_IDs_down, gene_set_info$id),c("name", "id")]
  top_pathways_down[,k] <- ifelse(is.na(top_IDs_down), NA_character_,
                                  paste0(info_down$name, "(", info_down$id, ")"))
}

# Only the ES > 0 table is displayed here; top_pathways_down is computed but
# not shown in this chunk.
DT::datatable(data.frame(rank = 1:10, top_pathways_up), rownames = FALSE,
              caption = "Top 10 pathways enriched at the top of the gene rank list.")

# Directory of gene analysis result: /project2/mstephens/kevinluo/scATACseq-topics/output/Cusanovich_2018/geneanalysis-Cusanovich2018-k=13-TSS-sum

