#! /usr/bin/env Rscript # Perform differential accessbility analysis for ATAC-seq regions (peaks), # and perform TF motif enrichment analysis using HOMER. setwd("~/projects/scATACseq-topics/") library(optparse) library(Matrix) library(fastTopics) suppressPackageStartupMessages(library(dplyr)) suppressPackageStartupMessages(library(tidyr)) source("code/motif_analysis.R") # SETTINGS # --------- DAfile <- '/project2/mstephens/kevinluo/scATACseq-topics/output/Buenrostro_2018/binarized/postfit_v2/DAanalysis-Buenrostro2018-k=10/DA_regions_topics_noshrinkage_10000iters.rds' genome <- 'hg19' homerpath <- '/project2/xinhe/software/homer/bin/findMotifsGenome.pl' nc <- 8 out.dir <- '/project2/mstephens/kevinluo/scATACseq-topics/output/Buenrostro_2018/binarized/postfit_v2/motifanalysis-Buenrostro2018-k=10' cat(sprintf("DAfile = %s \n", DAfile)) cat(sprintf("genome = %s \n", genome)) cat(sprintf("homerpath = %s \n", homerpath)) cat(sprintf("nc = %s \n", nc)) cat(sprintf("out.dir = %s \n", out.dir)) if(!dir.exists(out.dir)) dir.create(out.dir, showWarnings = FALSE, recursive = TRUE) # LOAD DATA # --------- # DIFFERENTIAL ACCESSBILITY ANALYSIS # ------------------------------------------ # Load differential accessbility analysis result using the topic model. if(!file.exists(DAfile)){ stop("DA result not available") } cat("Load precomputed differential accessbility statistics.\n") DA_res <- readRDS(DAfile) # Regions with NAs rows_withNAs <- which(apply(DA_res$z, 1, anyNA)) cat(length(rows_withNAs), "regions with NAs in z-scores... \n") head(DA_res$z[rows_withNAs,]) # Filter out regions with NAs in z-scores # DA_res$z <- DA_res$z[-rows_withNAs,] # DA_res$postmean <- DA_res$postmean[-rows_withNAs,] # DA_res$lpval <- DA_res$lpval[-rows_withNAs,] # DA_res$lfsr <- DA_res$lfsr[-rows_withNAs,] # SELECT REGIONS and PERFORM MOTIF ENRICHMENT ANALYSIS USING HOMER # ----------------------------------------------------------------- # Select regions with DA p-value < 0.1 # ------------------------------------- homer.dir <- paste0(out.dir, "/HOMER/DA_pval_0.1_regions") cat("Select regions for motif enrichment analysis... \n") cat(sprintf("%d regions in total. \n", nrow(DA_res$z))) selected_regions <- select_DA_regions(DA_res, method = "pval", thresh.pval = 0.1, out.dir = homer.dir, save.bed = TRUE) saveRDS(selected_regions, paste0(homer.dir, "/selected_regions.rds")) # For each topic, perform TF motif enrichment analysis using HOMER hypergeometric test. cat("Performing motif enrichment analysis using HOMER.\n") homer_res <- vector("list", ncol(DA_res$z)) names(homer_res) <- colnames(DA_res$z) for(k in 1:ncol(DA_res$z)){ homer_res[[k]] <- run_homer(selected_regions$filenames[k], genome = genome, homer.path = homerpath, use.hypergeometric = TRUE, out.dir=paste0(homer.dir, "/homer_result_topic_", k), n.cores=nc) } saveRDS(homer_res, paste0(homer.dir, "/homer_knownResults.rds")) # Select regions with DA p-value < 0.05 # -------------------------------------- homer.dir <- paste0(out.dir, "/HOMER/DA_pval_0.05_regions") cat("Select regions for motif enrichment analysis... \n") cat(sprintf("%d regions in total. \n", nrow(DA_res$z))) selected_regions <- select_DA_regions(DA_res, method = "pval", thresh.pval = 0.05, out.dir = homer.dir, save.bed = TRUE) saveRDS(selected_regions, paste0(homer.dir, "/selected_regions.rds")) # For each topic, perform TF motif enrichment analysis using HOMER hypergeometric test. cat("Performing motif enrichment analysis using HOMER.\n") homer_res <- vector("list", ncol(DA_res$z)) names(homer_res) <- colnames(DA_res$z) for(k in 1:ncol(DA_res$z)){ homer_res[[k]] <- run_homer(selected_regions$filenames[k], genome = genome, homer.path = homerpath, use.hypergeometric = TRUE, out.dir=paste0(homer.dir, "/homer_result_topic_", k), n.cores=nc) } saveRDS(homer_res, paste0(homer.dir, "/homer_knownResults.rds")) # Select top 1% regions with largest logFC # ------------------------------------------ homer.dir <- paste0(out.dir, "/HOMER/DA_top1percent_regions") cat("Select regions for motif enrichment analysis... \n") cat(sprintf("%d regions in total. \n", nrow(DA_res$z))) selected_regions <- select_DA_regions(DA_res, method = "topPercent", top.percent = 0.01, out.dir = homer.dir, save.bed = TRUE) saveRDS(selected_regions, paste0(homer.dir, "/selected_regions.rds")) # For each topic, perform TF motif enrichment analysis using HOMER hypergeometric test. cat("Performing motif enrichment analysis using HOMER.\n") homer_res <- vector("list", ncol(DA_res$z)) names(homer_res) <- colnames(DA_res$z) for(k in 1:ncol(DA_res$z)){ homer_res[[k]] <- run_homer(selected_regions$filenames[k], genome = genome, homer.path = homerpath, use.hypergeometric = TRUE, out.dir=paste0(homer.dir, "/homer_result_topic_", k), n.cores=nc) } saveRDS(homer_res, paste0(homer.dir, "/homer_knownResults.rds")) # Select top 2000 regions with largest logFC # ------------------------------------------- homer.dir <- paste0(out.dir, "/HOMER/DA_top2000_regions") cat("Select regions for motif enrichment analysis... \n") cat(sprintf("%d regions in total. \n", nrow(DA_res$z))) selected_regions <- select_DA_regions(DA_res, method = "topN", top.n = 2000, out.dir = homer.dir, save.bed = TRUE) saveRDS(selected_regions, paste0(homer.dir, "/selected_regions.rds")) # For each topic, perform TF motif enrichment analysis using HOMER hypergeometric test. cat("Performing motif enrichment analysis using HOMER.\n") homer_res <- vector("list", ncol(DA_res$z)) names(homer_res) <- colnames(DA_res$z) for(k in 1:ncol(DA_res$z)){ homer_res[[k]] <- run_homer(selected_regions$filenames[k], genome = genome, homer.path = homerpath, use.hypergeometric = TRUE, out.dir=paste0(homer.dir, "/homer_result_topic_", k), n.cores=nc) } saveRDS(homer_res, paste0(homer.dir, "/homer_knownResults.rds")) sessionInfo()