Procedure
library("ArchR")
library(Matrix)
library("Seurat")
#Load ArchR object
proj<-loadArchRProject("~/cluster/projects/archR/tissueRes_tcells/")
Loading required package: ggplot2 Warning message: “replacing previous import ‘lifecycle::last_warnings’ by ‘rlang::last_warnings’ when loading ‘tibble’” Loading required package: SummarizedExperiment Loading required package: MatrixGenerics Loading required package: matrixStats Attaching package: ‘MatrixGenerics’ The following objects are masked from ‘package:matrixStats’: colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse, colCounts, colCummaxs, colCummins, colCumprods, colCumsums, colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs, colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats, colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds, colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads, colWeightedMeans, colWeightedMedians, colWeightedSds, colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet, rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods, rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps, rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins, rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks, rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars, rowWeightedMads, rowWeightedMeans, rowWeightedMedians, rowWeightedSds, rowWeightedVars Loading required package: GenomicRanges Loading required package: stats4 Loading required package: BiocGenerics Loading required package: parallel Attaching package: ‘BiocGenerics’ The following objects are masked from ‘package:parallel’: clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB The following objects are masked from ‘package:stats’: IQR, mad, sd, var, xtabs The following objects are masked from ‘package:base’: anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply, union, unique, unsplit, which.max, which.min Loading required package: S4Vectors Attaching package: ‘S4Vectors’ The following object is masked from ‘package:base’: expand.grid Loading required package: IRanges Loading required package: GenomeInfoDb Loading required package: Biobase Welcome to Bioconductor Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'. Attaching package: ‘Biobase’ The following object is masked from ‘package:MatrixGenerics’: rowMedians The following objects are masked from ‘package:matrixStats’: anyMissing, rowMedians Loading required package: data.table Attaching package: ‘data.table’ The following object is masked from ‘package:SummarizedExperiment’: shift The following object is masked from ‘package:GenomicRanges’: shift The following object is masked from ‘package:IRanges’: shift The following objects are masked from ‘package:S4Vectors’: first, second Loading required package: Matrix Attaching package: ‘Matrix’ The following object is masked from ‘package:S4Vectors’: expand Loading required package: rhdf5 Loading required package: magrittr Attaching SeuratObject Attaching package: ‘Seurat’ The following object is masked from ‘package:SummarizedExperiment’: Assays Successfully loaded ArchRProject! / | / \ . / |. \\\ / |. \\\ / `|. \\\ / |. \ / |\ \\#####\ / || ==###########> / || \\##==......\ / || ______ = =|__ /__ || \\\ ,--' ,----`-,__ ___/' --,-`-===================##========> \ ' ##_______ _____ ,--,__,=##,__ /// , __== ___,-,__,--'#' ===' `-' | ##,-/ -,____,---' \\####\\________________,--\\_##,/ ___ .______ ______ __ __ .______ / \ | _ \ / || | | | | _ \ / ^ \ | |_) | | ,----'| |__| | | |_) | / /_\ \ | / | | | __ | | / / _____ \ | |\ \\___ | `----.| | | | | |\ \\___. /__/ \__\ | _| `._____| \______||__| |__| | _| `._____|
markersPeaks <- getMarkerFeatures(
ArchRProj = proj,
useMatrix = "GeneIntegrationMatrix",
groupBy = "Clusters",
bias = c("TSSEnrichment", "log10(nFrags)"),
testMethod = "wilcoxon"
)
heatmapPeaks <- markerHeatmap(
seMarker = markersPeaks,
cutOff = "FDR <= 0.1 & Log2FC >= 1.5",
transpose = FALSE
)
Comparing predicted gene scores with linked gene expression
#predict gene scores for marker genes
markerGenes <- c(
'CD8A', 'CD4',
'CD69','ITGAE',
'ICOS','NCR1',
'CCR7', 'SELL','RGS1','LMNA','RGCC','DUSP6', 'SOCS1'
)
proj <- addImputeWeights(proj)
#make plots for marker genes
p1 <- plotEmbedding(
ArchRProj = proj,
colorBy = "GeneScoreMatrix",
name = markerGenes,
embedding = "UMAP",
imputeWeights = getImputeWeights(proj)
)
p2 <- plotEmbedding(
ArchRProj = proj,
colorBy = "GeneIntegrationMatrix",
name = markerGenes,
embedding = "UMAP",
imputeWeights = getImputeWeights(proj)
)
#Rearrange for grid plotting
p1c <- lapply(p1, function(x){
x + guides(color = FALSE, fill = FALSE) +
theme_ArchR(baseSize = 12) +
theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()
)
})
p2c <- lapply(p2, function(x){
x + guides(color = FALSE, fill = FALSE) +
theme_ArchR(baseSize = 12) +
theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()
)
})
do.call(cowplot::plot_grid, c(list(ncol = 2),p1c[c(1:4)]))
do.call(cowplot::plot_grid, c(list(ncol = 2),p2c[c(1:4)]))
do.call(cowplot::plot_grid, c(list(ncol = 2),p1c[c(9:12)]))
do.call(cowplot::plot_grid, c(list(ncol = 2),p2c[c(9:12)]))
Note: We observed discrepancy between predicted gene scores and linked gene expression data in marker genes listed in the single cell lung atlas paper.
Heatmap for DEGs in linked gene expression data from scATAC-seq
draw(heatmapPeaks, heatmap_legend_side = "bot", annotation_legend_side = "bot")
dat<-load("archR/tissueRes_tcells/DifferentialAnalysis/marker_genes.RData")
markerList<-getMarkers(markersGS, cutOff = "FDR <= 0.01 & Log2FC >= 1.25")
for(i in 1:9){
write.table(markerList[paste0("C",i)], sprintf("archR/tissueRes_tcells/Tables/%s_predicted_DEG.csv", paste0("C", i)),
quote = F, row.names = F, sep=",")
}
markersPeaks <- getMarkerFeatures(
ArchRProj = proj.t,
useMatrix = "PeakMatrix",
groupBy = "Clusters",
bias = c("TSSEnrichment", "log10(nFrags)"),
testMethod = "wilcoxon"
)
proj.t<-addImputeWeights(proj.t)
pas<-plotEmbedding(
ArchRProj = proj.t,
colorBy = "GeneScoreMatrix",
name = markerGenes,
embedding = "UMAP",
imputeWeights = getImputeWeights(proj.t)
)
pas1c <- lapply(pas, function(x){
x + guides(color = FALSE, fill = FALSE) +
theme_ArchR(baseSize = 6.5) +
theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()
)
})
heatmapGS <- markerHeatmap(
seMarker = markersGS,
cutOff = "FDR <= 0.08 & Log2FC >= 1",
labelMarkers = markerGenes,
transpose = FALSE
)
markerGenes <- c(
'CD8A', 'IFNG',
'CD4',
'NCAM1', #NK
'TRGV1','TRGVA','IMGT','TRDV1', #gamma/delta
'TBX21','EOMES','IL7R', ##NK
'CD19', #B cell
'SELL','CCR7','PTPRC'
)
ComplexHeatmap::draw(heatmapGS, heatmap_legend_side = "bot", annotation_legend_side = "bot")
p <- plotBrowserTrack(
ArchRProj = proj.t,
groupBy = "Clusters",
geneSymbol = markerGenes,
features = getMarkers(markersPeaks, cutOff = "FDR <= 0.1 & Log2FC >= 1", returnGR = TRUE)[paste0("C",1:9)],
upstream = 50000,
downstream = 50000
)
options(repr.plot.width=10, repr.plot.height=12)
grid::grid.newpage()
grid::grid.draw(p$CD69)
proj <- addGroupCoverages(ArchRProj = proj, groupBy = "Clusters")
proj<- addReproduciblePeakSet(
ArchRProj = proj,
groupBy = "Clusters",
pathToMacs2 = "/scratch/midway2/jinggu/env/sc-atac/bin/macs2"
)
load(file="archR/all_transfer_labeling/markersGS_RNA_matched.Rdata")
pma <- plotMarkers(seMarker = markersPeaks, name="C23",cutOff = "FDR <= 0.01 & abs(Log2FC) >= 2", plotAs = "Volcano")
pma
Warning message: “Removed 11 rows containing missing values (geom_point_rast).”
Previously, I tested ScType with a scRNA-seq dataset and it worked well on identifying the main immune cell types. Here I applied scType on the gene score matrix predicted using scATAC-seq dataset.
# load gene score matrix for sub-clustering of T/NK cell populations with scATAC-seq data
mat<-getMatrixFromProject(ArchRProj = proj)
ArchR logging to : ArchRLogs/ArchR-getMatrixFromProject-22a0e18ff2772-Date-2022-06-04_Time-17-40-45.log If there is an issue, please report to github with logFile! 2022-06-04 17:42:46 : Organizing colData, 2.014 mins elapsed. 2022-06-04 17:42:46 : Organizing rowData, 2.015 mins elapsed. 2022-06-04 17:42:46 : Organizing rowRanges, 2.015 mins elapsed. 2022-06-04 17:42:46 : Organizing Assays (1 of 1), 2.015 mins elapsed. 2022-06-04 17:42:50 : Constructing SummarizedExperiment, 2.086 mins elapsed. 2022-06-04 17:42:51 : Finished Matrix Creation, 2.094 mins elapsed.
# load libraries and functions
lapply(c("dplyr","Seurat","HGNChelper"), library, character.only = T)
source("~/projects/funcFinemapping/code/sctype/gene_sets_prepare.R")
source("~/projects/funcFinemapping/code/sctype/sctype_score_.R")
# DB file
db_ = "~/projects/funcFinemapping/data/ScTypeDB_full.xlsx";
tissue = "Immune system" # e.g. Immune system, Liver, Pancreas, Kidney, Eye, Brain
# prepare gene sets
gs_list = gene_sets_prepare(db_, tissue)
# load gene scores matrix for the t-cell cluster from lung snATAC-seq datasets
gsMatrix<-as.matrix(mat@assays@data@listData$GeneScoreMatrix)
rownames(gsMatrix)<-mat@elementMetadata$name
es.max = sctype_score(scRNAseqData = gsMatrix, scaled = FALSE,
gs = gs_list$gs_positive, gs2 = gs_list$gs_negative)
# merge by cluster
cL_results = do.call("rbind", lapply(unique(mat$Clusters), function(cl){
es.max.cl = sort(rowSums(es.max[ ,colnames(gsMatrix[ ,mat$Clusters==cl])]), decreasing = !0)
head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(mat$Clusters==cl)), 10)
}))
sctype_scores = cL_results %>% group_by(cluster) %>% top_n(n = 1, wt = scores)
# set low-confident (low ScType score) clusters to "unknown"
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] = "Unknown"
print(sctype_scores[,1:3])
# A tibble: 9 x 3 # Groups: cluster [9] cluster type scores <chr> <chr> <dbl> 1 C6 Unknown 44.3 2 C1 Naive B cells 633. 3 C8 Natural killer cells 964. 4 C9 Neutrophils 220. 5 C3 Memory CD4+ T cells 304. 6 C7 CD8+ NKT-like cells 332. 7 C2 Pro-B cells 1050. 8 C5 CD8+ NKT-like cells 345. 9 C4 Memory CD8+ T cells 424.
# UMAP plots with new labels
options(repr.plot.height=10, repr.plot.width=12)
labelOld<-sctype_scores$cluster
labelNew<-sctype_scores$type
proj$Clusters2 <- mapLabels(proj$Clusters, newLabels = labelNew, oldLabels = labelOld)
df<-proj@embeddings$UMAP$df
class(df$umap_x)
colnames(df)<-c("umap_x", "umap_y")
df["cluster"]<-proj$Clusters2
coords<-data.frame(t(sapply(unique(proj$Clusters2), function(i){colMeans(df[df$cluster==i, 1:2])})))
ggplot(df,aes(x=umap_x, y=umap_y, color=cluster)) +
geom_point(size=0.2) + theme_bw() +
annotate("text", x=coords$umap_x, y=coords$umap_y,
label=unique(proj$Clusters2), size=5)
d<-c("C1"="Naive B cells",
"C2"="Pro-B cells",
"C3"="Memory CD4+ T cells",
"C4"="Memory CD8+ T cells",
"C5"="CD8+ NK/T-like cells",
"C6"="Unknown",
"C7"="CD8+ NK/T-like cells",
"C8"="Natural killer cells",
"C9"="Neutrophils"
)
sctype_scores = cL_results %>% group_by(cluster) %>% top_n(n = 3, wt = scores)
sctype_scores
cluster | type | scores | ncells |
---|---|---|---|
<chr> | <chr> | <dbl> | <int> |
C6 | Effector CD4+ T cells | 44.330538 | 1381 |
C6 | γδ-T cells | 27.577846 | 1381 |
C6 | Intermediate monocytes | 7.971455 | 1381 |
C1 | Naive B cells | 632.963564 | 776 |
C1 | Myeloid Dendritic cells | 627.799812 | 776 |
C1 | Memory B cells | 620.988746 | 776 |
C8 | Natural killer cells | 964.293047 | 1218 |
C8 | CD8+ NKT-like cells | 853.038042 | 1218 |
C8 | CD4+ NKT-like cells | 770.324517 | 1218 |
C9 | Neutrophils | 219.556705 | 288 |
C9 | Granulocytes | 149.750130 | 288 |
C9 | Plasmacytoid Dendritic cells | 147.349830 | 288 |
C3 | Memory CD4+ T cells | 303.804163 | 839 |
C3 | Memory CD8+ T cells | 280.651275 | 839 |
C3 | Naive CD4+ T cells | 226.140925 | 839 |
C7 | CD8+ NKT-like cells | 332.343455 | 1304 |
C7 | Effector CD8+ T cells | 307.467564 | 1304 |
C7 | CD4+ NKT-like cells | 300.783381 | 1304 |
C2 | Pro-B cells | 1050.377867 | 718 |
C2 | Pre-B cells | 821.620518 | 718 |
C2 | Naive B cells | 799.424347 | 718 |
C5 | CD8+ NKT-like cells | 345.433298 | 298 |
C5 | CD4+ NKT-like cells | 322.479651 | 298 |
C5 | Natural killer cells | 267.352406 | 298 |
C4 | Memory CD8+ T cells | 424.137248 | 578 |
C4 | Memory CD4+ T cells | 413.997403 | 578 |
C4 | Naive CD8+ T cells | 388.485137 | 578 |
options(repr.plot.height=20, repr.plot.width=10)
ggplot(sctype_scores, aes(x=cluster, y=scores, label=type)) + geom_point() +
geom_text(check_overlap = FALSE, hjust=0, nudge_x=0.05, nudge_y=0.5)
#make violin plots for gene score distributions
violin_plot<-function(gene.score.mat, gene.list){
df<-data.frame(gene.score.mat[rownames(gene.score.mat)==gene.name,], mat$Clusters)
colnames(df)<-c("scores", "cluster")
p<-ggplot(df, aes(x=cluster, y=log2(scores+2), color=cluster)) +
geom_violin() + geom_jitter(size=0.5) + theme_bw()
return(p)
}
hist_plot<-function(gene.score.mat, gene.name){
df<-data.frame(gene.score.mat[rownames(gene.score.mat)==gene.name,], mat$Clusters)
colnames(df)<-c("scores", "cluster")
p<-ggplot(df, aes(x=log2(scores+1), color=cluster)) +
geom_density() + theme_bw()
return(p)
}
violin_plot(gsMatrix, "CD3D")
hist_plot(gsMatrix, "CD3D")
plot_gene_scores<-function(gene.sets, title){
#given a gene set from one cell-type in SCType database,
#plot gene score distribution across clusters
dat<-gsMatrix[rownames(gsMatrix) %in% gene.sets,]
tbl<-do.call(rbind, lapply(unique(mat$Clusters), function(cl){
data.frame(colMeans(dat[ ,mat$Clusters==cl]), cl)
}))
colnames(tbl)<-c("mean_scores", "cluster")
p<-ggplot(tbl, aes(x=cluster, y=log2(mean_scores+1), color=cluster)) +
geom_violin() + geom_boxplot(width=0.1) +
theme_bw() + theme(legend.position = "none") +
ggtitle(title)
return(p)
}
d
Violin plots for mean expression across a set of marker genes
options(repr.plot.width=12, repr.plot.height=10)
pl<-lapply(names(gs_list$gs_positive)[c(seq(1,12),14,15,16,18)], function(i){
genes<-gs_list$gs_positive[[which(names(gs_list$gs_positive)==i)]]
p<-plot_gene_scores(genes, i)
})
do.call(cowplot::plot_grid, c(list(ncol = 3),pl[1:9]))
do.call(cowplot::plot_grid, c(list(ncol = 3),pl[10:16]))
Note: Overall, the violin plots of the mean expression levels across a sets of marker genes indeed show higher median expression in the corresponding cell type than the rest ones. For instance, the mean expression levels for marker genes specific to pro-B and Naive B cells show higher median expression in C1/C2 than other clusters. The metrics used in ScType allow the distinction between pro-B and naive B cells. This indicates that ScType works to assign clusters and it gives more refined assignment than the average approach.
Previous studies have identified a few genes with different expression patterns in lung versus blood. We would like to examine these genes in scATAC-seq dataset to find epigenetic features that identify a cell to be tissue-resident.
#tissue-resident genes from the cell atlas paper (Travaglini et al. 2020)
trm.genes<-c("CD69", "RGS1", "LMNA", "RGCC","DUSP6", "SOCS1")
# log2 mean expression for the TRM genes looks similar across clusters
options(repr.plot.width=8, repr.plot.height=6)
plot_gene_scores(trm.genes, title="tissue resident genes")
Error in plot_gene_scores(trm.genes, title = "tissue resident genes"): could not find function "plot_gene_scores" Traceback:
tbl<-melt(gsMatrix[rownames(gsMatrix)%in%trm.genes,])
# individual gene expression distribution
ggplot(tbl, aes(x=log2(value+1), color=Var1)) + geom_density() + theme_bw()
Warning message in melt(gsMatrix[rownames(gsMatrix) %in% trm.genes, ]): “The melt generic in data.table has been passed a matrix and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(gsMatrix[rownames(gsMatrix) %in% trm.genes, ]). In the next version, this warning will become an error.”
TRM.mean<-data.frame(colMeans(gsMatrix[rownames(gsMatrix)%in%trm.genes,]))
colnames(TRM.mean)<-"mean_scores"
# Distribution of mean expression across TRM genes
ggplot(TRM.mean, aes(x=mean_scores))+ geom_density()
quantile(TRM.mean$mean_scores, 0.95)
Note: Without comparing to blood, we will be able to draw an accurate cutoff for determining whether a cell is tissue-resident. For now, let's try a few cutoffs to see if there are a decent number of differentially accessible peaks between TR cells and nonTR cells.
#95% set to be the cutoff
proj$Clusters3<-as.numeric(TRM.mean >=3.65)
table(proj$Clusters3)
0 1 7028 372
markersPeaks <- getMarkerFeatures(
ArchRProj = proj,
useMatrix = "PeakMatrix",
groupBy = "Clusters4", #binary - whether or not a cell has mean TRM score larger than the cutoff
bias = c("TSSEnrichment", "log10(nFrags)"),
testMethod = "wilcoxon"
)
markerList <- getMarkers(markersPeaks, cutOff = "FDR <= 0.01 & Log2FC >= 1")
markerList
List of length 2 names(2): 0 1
pma <- plotMarkers(seMarker = markersPeaks, name="1", cutOff = "FDR <= 0.1 & abs(Log2FC) >= 1", plotAs = "MA")
pma
markerList[[2]]
Warning message: “Removed 132 rows containing missing values (geom_point_rast).”
DataFrame with 3 rows and 7 columns seqnames idx start end Log2FC FDR MeanDiff <Rle> <array> <array> <array> <numeric> <numeric> <numeric> 121128 chr16 1368 11349749 11350249 1.93903 0.00065029 0.916407 121129 chr16 1369 11354749 11355249 2.23394 0.00065029 0.569762 121127 chr16 1367 11344750 11345250 1.64697 0.00445989 0.449140
Note: Only three peaks contain significantly higher number of reads between TR cells and nonTR cells.
quantile(TRM.mean$mean_scores)
#50% set to be the cutoff
proj$Clusters4<-as.numeric(TRM.mean >=1.107)
table(proj$Clusters4)
0 1 3701 3699
pma <- plotMarkers(seMarker = markersPeaks, name="1", cutOff = "FDR <= 0.1 & abs(Log2FC) >= 1", plotAs = "MA")
pma
markerList[[2]]
Warning message: “Removed 164 rows containing missing values (geom_point_rast).”
DataFrame with 3 rows and 7 columns seqnames idx start end Log2FC FDR MeanDiff <Rle> <array> <array> <array> <numeric> <numeric> <numeric> 121128 chr16 1368 11349749 11350249 1.93903 0.00065029 0.916407 121129 chr16 1369 11354749 11355249 2.23394 0.00065029 0.569762 121127 chr16 1367 11344750 11345250 1.64697 0.00445989 0.449140
Similar results for 50% cutoff