geneBasisR 0.0.0.9000
library(geneBasisR)
library(SingleCellExperiment)
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
#>
#> Attaching package: 'MatrixGenerics'
#> The following objects are masked from 'package:matrixStats':
#>
#> colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
#> colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
#> colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
#> colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
#> colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
#> colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
#> colWeightedMeans, colWeightedMedians, colWeightedSds,
#> colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
#> rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
#> rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
#> rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
#> rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
#> rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
#> rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
#> rowWeightedSds, rowWeightedVars
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#>
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#>
#> clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#> clusterExport, clusterMap, parApply, parCapply, parLapply,
#> parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#>
#> anyDuplicated, append, as.data.frame, basename, cbind, colnames,
#> dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
#> grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
#> order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#> rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
#> union, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#>
#> Attaching package: 'S4Vectors'
#> The following objects are masked from 'package:base':
#>
#> expand.grid, I, unname
#> Loading required package: IRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: Biobase
#> Welcome to Bioconductor
#>
#> Vignettes contain introductory material; view with
#> 'browseVignettes()'. To cite Bioconductor, see
#> 'citation("Biobase")', and for packages 'citation("pkgname")'.
#>
#> Attaching package: 'Biobase'
#> The following object is masked from 'package:MatrixGenerics':
#>
#> rowMedians
#> The following objects are masked from 'package:matrixStats':
#>
#> anyMissing, rowMedians
library(tibble)
library(ggplot2)
library(wesanderson)
library(ggcorrplot)
library(ggpubr)
geneBasis
is an iterative greedy approach for gene selection. It strives a gene selection that will preserve transcriptional relationships between cells, represented as kNN-graphs.
Here we exemplify how geneBasis
can be utilized for gene panel design. First, we design a panel of designated size from scratch. Additionally, we compare selections with either including known batch ID (assigned as sample) or without batch (batch=NULL). We confirm that selection where batch is not specified includes ‘technically variable’ genes. We therefore suggest that inclusion of batch if possible results in cleaner final results.
We then estimate the quality of designed panel and provide a workflow that can be used to investigate cells and/or genes that are not explained with the selected gene panel. Finally, we provide the estimation of the redundancy of the gene panel for cell type mapping.
We will be working with mouse embryo, E8.5. Initial filtering for uninteresting genes was already performed (to see how genes can be pre-filtered, see Vignette for spleen).
data("sce_mouseEmbryo", package = "geneBasisR")
# Fetch meta-data and umap coordinates, merge together
meta = as.data.frame(colData(sce_mouseEmbryo))
umaps = as.data.frame(reducedDim(sce_mouseEmbryo, "UMAP"))
umaps = rownames_to_column(umaps, var = "cell")
meta = merge(meta, umaps)
# Fetch colour scheme for celltypes from meta-data
celltype_colors_df = unique(meta[, c("celltype" , "colour")])
celltype_colors = c(paste0("#",as.character(celltype_colors_df$colour)))
names(celltype_colors) = as.character(celltype_colors_df$celltype)
Let’s see which cell types (and in which quantities) we observe in mouse embryo.
# celltype composition
tab = as.data.frame(table(sce_mouseEmbryo$celltype))
colnames(tab) = c("celltype" , "n")
tab = tab[order(tab$n) , ]
tab$celltype = factor(tab$celltype , levels = tab$celltype)
p1 = ggplot(tab , aes(x = celltype , y = log2(n) , fill = celltype)) +
geom_bar(stat = "identity" , position = "dodge") +
scale_fill_manual(values = celltype_colors) +
theme_classic() +
theme(axis.text.x = element_blank()) +
labs(y = "log2(# cells)", x = "Cell type")
# UMAP
p2 = ggplot(meta , aes(x = x , y = y , col = celltype)) +
geom_point(size=1,alpha = .9) +
scale_color_manual(values = celltype_colors) +
theme_classic() +
labs(x = "UMAP-1" , y = "UMAP-2") +
ggtitle("UMAP")
# combine
p = ggarrange(p1,p2, common.legend = T)
p
To select gene panel, use gene_search
. We will perform selections (of size 20) with 2 options:
batch = NULL
corresponds to default case where batch is not specified.
batch = "sample
corresponds to the case where batch is specified (in our case this is sample id from different sequencing runs).
We will not print intermediate outputs for this run (by setting verbose=FALSE
).
n_genes_total = 20
genes_stat_batch_off = gene_search(sce_mouseEmbryo , n_genes_total = n_genes_total, batch = NULL, verbose = F)
genes_batch_off = genes_stat_batch_off$gene
genes_stat_batch_on = gene_search(sce_mouseEmbryo , n_genes_total = n_genes_total, batch = "sample", verbose = F)
genes_batch_on = genes_stat_batch_on$gene
Let’s compare 2 selections. To do so, we will assess bulk distribution of logcounts between different samples (i.e. batches) for either genes that are only selected with batch ‘on’ or genes that are only selected with batch ‘off’.
get_violins_w_counts = function(gene){
meta = merge(meta , data.frame(cell = sce_mouseEmbryo$cell , counts = as.numeric(logcounts(sce_mouseEmbryo[gene,]))))
meta$sample = factor(meta$sample)
p <- ggplot(meta , aes(x = sample , y = counts , fill = sample)) +
geom_violin() +
theme_classic() +
ggtitle(gene) +
labs(x = "Sample" , y = "Logcounts")
return(p)
}
plots = lapply(setdiff(genes_batch_on , genes_batch_off) , function(gene){
p = get_violins_w_counts(gene)
return(p)
})
p = ggarrange(plotlist = plots, common.legend = T)
p = annotate_figure(p, fig.lab = "Selection with batch specified")
p
plots = lapply(setdiff(genes_batch_off , genes_batch_on) , function(gene){
p = get_violins_w_counts(gene)
return(p)
})
p = ggarrange(plotlist = plots, common.legend = T)
p = annotate_figure(p, fig.lab = "Selection without batch")
p
We observe that couple genes among those that are only selected with batch ‘off’ show significant change between two batches, and likely represent technical variability. To further assess that these genes are not important cell type markers, let’s plot UMAPs colored by logcounts.
get_umap_w_counts = function(gene){
meta = merge(meta , data.frame(cell = sce_mouseEmbryo$cell , counts = as.numeric(logcounts(sce_mouseEmbryo[gene,]))))
meta$sample = factor(meta$sample)
p <- ggplot(meta , aes(x = x , y = y , col = counts)) +
geom_point(size = 0.5) +
scale_color_gradient(low = "azure3" , high = "darkgreen") +
theme_classic() +
ggtitle(gene) +
facet_wrap(~sample) +
labs(x = "UMAP-1" , y = "UMAP-2")
return(p)
}
plots = lapply(c("Gm10076" , "Hist1h2ap"), function(gene){
p = get_umap_w_counts(gene)
print(p)
})
We conclude that Gm10076 and Hist1h2ap likely vary due to technical aspects of different sequencing runs, and do not represent relevant biological variability (at least to the extent required from top genes). We threfore suggest to include batch wheneber this information is available.
Let’s select a bigger panel (50 genes), with specifying batch = "sample"
.
Since we have already calculated first 20 genes, we will plug them in argument genes_base
.
n_genes_total = 50
genes_stat = gene_search(sce_mouseEmbryo , n_genes_total = n_genes_total,
genes_base = genes_batch_on, batch = "sample", verbose = F)
genes = genes_stat$gene
We estimate the quality of the gene panel using evaluate_library
.
Here, to be less wordy, we refer to cell neighborhood preservation score as cell score; gene prediction score as gene score.
stat = evaluate_library(sce_mouseEmbryo, genes, genes.all = rownames(sce_mouseEmbryo), batch = "sample",
library.size_type = "series", celltype.id = "celltype", n_genes.step = 10,
return.cell_score_stat = T, return.gene_score_stat = T, return.celltype_stat = T, verbose = F)
p1 = ggplot(stat$cell_score_stat , aes(x = n_genes , y = cell_score, fill = n_genes)) +
geom_boxplot() +
scale_fill_manual(values = wes_palette("Zissou1", length(levels(stat$cell_score_stat$n_genes)), type = "continuous")) +
ylim(c(-.2,1)) +
theme_classic() +
labs(y = "Cell neighborhood preservation score" , x = "# genes") +
theme(legend.position = "none")
p2 = ggplot(stat$gene_score_stat , aes(x = n_genes , y = gene_score, fill = n_genes)) +
geom_boxplot() +
scale_fill_manual(values = wes_palette("Zissou1", length(levels(stat$gene_score_stat$n_genes)), type = "continuous")) +
ylim(c(0,1)) +
theme_classic() +
labs(y = "Gene prediction score" , x = "# genes") +
theme(legend.position = "none")
p = ggarrange(p1,p2,ncol=2)
#> Warning: Removed 11 rows containing non-finite values (stat_boxplot).
#> Warning: Removed 648 rows containing non-finite values (stat_boxplot).
p
We suggest that these plots can be useful to make a decision at which number of genes cell and gene score distributions converge (more relevant for higher number of genes).
In our example, we observe that distributions do not converge yet and therefore we suggest that 50 genes is not above the optimal size for the gene panel.
Let’s see how well we can map cell types using selected panel.
Cell type mapping for a single gene panel can be computed separately using get_celltype_mapping
.
We also included plotting function that returns heatmap for the confusion matrix: plot_mapping_heatmap
.
celltype_mapping = get_celltype_mapping(sce_mouseEmbryo , genes, batch = "sample", return.stat = F)
p = plot_mapping_heatmap(celltype_mapping$mapping, title = "Cell type confusion matrix")
p
pal = wes_palette("Zissou1" , length(levels(stat$celltype_stat)) , type = "continuous")
p = ggplot(stat$celltype_stat, aes( x = as.character( n_genes ), y = frac_correctly_mapped , col = "black")) +
geom_point(size=1.5) +
facet_wrap(~celltype) +
#scale_color_manual(values = celltype_colors) +
geom_hline(yintercept = 1, linetype = "dashed") +
theme_classic() +
theme(legend.position = "none") +
labs(y = "Fraction of cells mapped correctly", x = "# genes")
ggtitle("Cell type mapping accuracy")
#> $title
#> [1] "Cell type mapping accuracy"
#>
#> attr(,"class")
#> [1] "labels"
p
Let’s assess whether certain cell types are consistently ‘under-preserved’ by looking at the distribution of cell scores across cell types.
cell_score_stat = stat$cell_score_stat[stat$cell_score_stat$n_genes == n_genes_total , ]
cell_score_stat = merge(cell_score_stat, meta)
p = ggplot(cell_score_stat , aes(x = celltype , y = cell_score, fill = celltype)) +
geom_boxplot() +
scale_fill_manual(values = celltype_colors) +
theme_classic() +
labs(y = "Cell neighborhood preservation score" , x = "# genes") +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
p
We see that PGCs in average have lower scores. This is expected since we have only 8 PGC cells and our toy gene panel is of very limited size. We also observe that a decent fraction of PGCs do not map correctly to the same cell type (Fig. 3). We therefore suggest to increase number of genes to be selected or manually add PGC-specific markers into the panel (especially if detecting PGCs is of high interest).
On a more general note, analysis of cell score distribution is not per se constrained to the comparison between cell types. In a later version of the package, we plan to introduce a statistical approach that will provide an estimate of transcriptional regions (regardless their cell type identities) that are ‘under-preserved’.
Now let’s assess which genes can not be predicted well from k-NN graph constructed with selected gene panel.
We assign threshold for prediction score as 0.75 and look at the genes that have gene score < 0.75.
Let’s see if we observe a highly co-expressed gene set. The plotting function is included in the package: plot_coexpression
.
gene_score.thresh = 0.75
gene_score_stat = stat$gene_score_stat[stat$gene_score_stat$n_genes == n_genes_total , ]
genes_lowly_explained = gene_score_stat$gene[gene_score_stat$gene_score < gene_score.thresh]
p = plot_coexpression(sce_mouseEmbryo , genes_lowly_explained )
p
A function to plot a heatmap of average expression across is built in the package: plot_expression_heatmap
.
p = plot_expression_heatmap(sce_mouseEmbryo, genes = genes_lowly_explained, value.type = "mean")
p
We see that most of selected genes are cell state genes, expressed across most of the cell types.
Interestingly, we also see that strong PGC marker Dppa3 is not explained well. This is consistent with the observation from the above. Additionally, blood progenitor marker F2rl2 is also not explained well, and this is also consistent with cell type confusion matrix from Fig. 3.
We suggest that this workflow can facilitate the understanding which genes are ‘missing’ from the panel and whether it is desirable to include them.
Let’s actually look into what genes we select.
Let’s plot UMAPs, colored by expression of the selected genes, to have an intuitive grasp of what kind of genes we select. UMAPs can be easily plotted with provided function plot_umaps_w_counts
.
p = plot_umaps_w_counts(sce_mouseEmbryo, genes)
p
Overall we see that most of the genes are expressed in some transcriptional regions but not others (i.e. like cell type markers behavior).
Additionally, some genes seem to be expressed across the whole manifold and are representative of a cell state (Ube2c, Cdc20, Xist, etc).
Let’s look a bit more systematically into how cell type specific are expressions of the selected genes.
p = plot_expression_heatmap(sce_mouseEmbryo, genes = genes, value.type = "mean")
p
Finally, let’s estimate the redundancy of the gene panel on gene/celltype level: for each gene, we temporarily remove it from the panel and compare the accuracy of cell type mappings with and without removed gene (per cell type).
The function to get this stat is get_redundancy_stat
. We also provide the function that will plot the results as a heatmap: plot_redundancy_stat
. Red corresponds to cases where cell type mapping accuracy drops with the removal of the gene, green - where cell type mapping accuracy increases with the removal of the gene (that can occur for cell type that were not well mapped to begin with).
redundancy_stat = get_redundancy_stat(sce_mouseEmbryo, genes, genes_to_assess = genes, batch = "sample")
p = plot_redundancy_stat(redundancy_stat)
p
From this analysis, we can conclude the particular relevance of some genes for some cell types e.g. Mest for Blood progenitors 1, Hbb-y for Erythroid2.
This line of analysis also allows to estimate overall redundancy of the gene panel which is useful when planning -FISH experiments.
sessionInfo()
#> R version 4.1.1 (2021-08-10)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur 10.16
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
#>
#> attached base packages:
#> [1] parallel stats4 stats graphics grDevices utils datasets
#> [8] methods base
#>
#> other attached packages:
#> [1] ggpubr_0.4.0 ggcorrplot_0.1.3
#> [3] wesanderson_0.3.6 ggplot2_3.3.5
#> [5] tibble_3.1.4 SingleCellExperiment_1.14.1
#> [7] SummarizedExperiment_1.22.0 Biobase_2.52.0
#> [9] GenomicRanges_1.44.0 GenomeInfoDb_1.28.2
#> [11] IRanges_2.26.0 S4Vectors_0.30.0
#> [13] BiocGenerics_0.38.0 MatrixGenerics_1.4.3
#> [15] matrixStats_0.60.1 geneBasisR_0.0.0.9000
#> [17] BiocStyle_2.20.2
#>
#> loaded via a namespace (and not attached):
#> [1] readxl_1.3.1 backports_1.2.1
#> [3] paleotree_3.3.25 fastmatch_1.1-3
#> [5] plyr_1.8.6 igraph_1.2.6
#> [7] BiocParallel_1.26.2 digest_0.6.27
#> [9] htmltools_0.5.2 magick_2.7.3
#> [11] viridis_0.6.1 gdata_2.18.0
#> [13] fansi_0.5.0 magrittr_2.0.1
#> [15] phytools_0.7-80 ScaledMatrix_1.0.0
#> [17] cluster_2.1.2 openxlsx_4.2.4
#> [19] limma_3.48.3 colorspace_2.0-2
#> [21] haven_2.4.3 xfun_0.25
#> [23] dplyr_1.0.7 crayon_1.4.1
#> [25] RCurl_1.98-1.4 jsonlite_1.7.2
#> [27] phangorn_2.7.1 ape_5.5
#> [29] glue_1.4.2 gtable_0.3.0
#> [31] zlibbioc_1.38.0 XVector_0.32.0
#> [33] DelayedArray_0.18.0 car_3.0-11
#> [35] BiocSingular_1.8.1 RcppZiggurat_0.1.6
#> [37] maps_3.3.0 abind_1.4-5
#> [39] scales_1.1.1 DBI_1.1.1
#> [41] edgeR_3.34.0 rstatix_0.7.0
#> [43] Rcpp_1.0.7 plotrix_3.8-1
#> [45] viridisLite_0.4.0 tmvnsim_1.0-2
#> [47] dqrng_0.3.0 foreign_0.8-81
#> [49] rsvd_1.0.5 ResidualMatrix_1.2.0
#> [51] metapod_1.0.0 ellipsis_0.3.2
#> [53] farver_2.1.0 pkgconfig_2.0.3
#> [55] scuttle_1.2.1 sass_0.4.0
#> [57] uwot_0.1.10 locfit_1.5-9.4
#> [59] utf8_1.2.2 reshape2_1.4.4
#> [61] tidyselect_1.1.1 labeling_0.4.2
#> [63] rlang_0.4.11 munsell_0.5.0
#> [65] cellranger_1.1.0 tools_4.1.1
#> [67] generics_0.1.0 broom_0.7.9
#> [69] batchelor_1.8.1 evaluate_0.14
#> [71] stringr_1.4.0 fastmap_1.1.0
#> [73] yaml_2.2.1 knitr_1.33
#> [75] zip_2.2.0 purrr_0.3.4
#> [77] nlme_3.1-152 sparseMatrixStats_1.4.2
#> [79] scran_1.20.1 compiler_4.1.1
#> [81] curl_4.3.2 png_0.1-7
#> [83] ggsignif_0.6.2 clusterGeneration_1.3.7
#> [85] statmod_1.4.36 bslib_0.2.5.1
#> [87] stringi_1.7.4 highr_0.9
#> [89] forcats_0.5.1 lattice_0.20-44
#> [91] bluster_1.2.1 Matrix_1.3-4
#> [93] vctrs_0.3.8 pillar_1.6.2
#> [95] lifecycle_1.0.0 BiocManager_1.30.16
#> [97] combinat_0.0-8 jquerylib_0.1.4
#> [99] BiocNeighbors_1.10.0 cowplot_1.1.1
#> [101] data.table_1.14.0 bitops_1.0-7
#> [103] irlba_2.3.3 R6_2.5.1
#> [105] bookdown_0.23 gridExtra_2.3
#> [107] rio_0.5.27 codetools_0.2-18
#> [109] MASS_7.3-54 gtools_3.9.2
#> [111] assertthat_0.2.1 withr_2.4.2
#> [113] mnormt_2.0.2 GenomeInfoDbData_1.2.6
#> [115] expm_0.999-6 hms_1.1.0
#> [117] quadprog_1.5-8 grid_4.1.1
#> [119] beachmat_2.8.1 tidyr_1.1.3
#> [121] coda_0.19-4 rmarkdown_2.10
#> [123] Rfast_2.0.3 DelayedMatrixStats_1.14.3
#> [125] carData_3.0-4 numDeriv_2016.8-1.1
#> [127] scatterplot3d_0.3-41