library(geneBasisR)
library(SingleCellExperiment)
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
#> 
#> Attaching package: 'MatrixGenerics'
#> The following objects are masked from 'package:matrixStats':
#> 
#>     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
#>     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
#>     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
#>     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
#>     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
#>     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
#>     colWeightedMeans, colWeightedMedians, colWeightedSds,
#>     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
#>     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
#>     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
#>     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
#>     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
#>     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
#>     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
#>     rowWeightedSds, rowWeightedVars
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#> 
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#> 
#>     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#>     clusterExport, clusterMap, parApply, parCapply, parLapply,
#>     parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following objects are masked from 'package:stats':
#> 
#>     IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#> 
#>     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
#>     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
#>     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
#>     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#>     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
#>     union, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#> 
#> Attaching package: 'S4Vectors'
#> The following objects are masked from 'package:base':
#> 
#>     expand.grid, I, unname
#> Loading required package: IRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: Biobase
#> Welcome to Bioconductor
#> 
#>     Vignettes contain introductory material; view with
#>     'browseVignettes()'. To cite Bioconductor, see
#>     'citation("Biobase")', and for packages 'citation("pkgname")'.
#> 
#> Attaching package: 'Biobase'
#> The following object is masked from 'package:MatrixGenerics':
#> 
#>     rowMedians
#> The following objects are masked from 'package:matrixStats':
#> 
#>     anyMissing, rowMedians
library(tibble)
library(ggplot2)
library(ggpubr)

1 Introduction

geneBasis is an iterative greedy approach for gene selection. It strives a gene selection that will preserve transcriptional relationships between cells, represented as kNN-graphs.

Here we exemplify how geneBasis can be utilized for the design of targeted gene panels (just main steps; for the extended workflow, covering various aspects of gene selection process, please see tutorials here: https://github.com/MarioniLab/geneBasisR).

2 Load data

We will be working with mouse embryo, E8.5. Initial filtering for uninteresting genes was already performed.


data("sce_mouseEmbryo", package = "geneBasisR")

# Fetch meta-data and umap coordinates, merge together
meta = as.data.frame(colData(sce_mouseEmbryo))
umaps = as.data.frame(reducedDim(sce_mouseEmbryo, "UMAP"))
umaps = rownames_to_column(umaps, var = "cell")
meta = merge(meta, umaps)

# Fetch colour scheme for celltypes from meta-data
celltype_colors_df = unique(meta[, c("celltype" , "colour")])
celltype_colors = c(paste0("#",as.character(celltype_colors_df$colour)))
names(celltype_colors) = as.character(celltype_colors_df$celltype)

3 Cell type composition for mouse embryo

Let’s see which cell types (and in which quantities) we observe in mouse embryo.


# celltype composition
tab = as.data.frame(table(sce_mouseEmbryo$celltype))
colnames(tab) = c("celltype" , "n")
tab = tab[order(tab$n) , ]
tab$celltype = factor(tab$celltype , levels = tab$celltype)
p1 = ggplot(tab , aes(x = celltype , y = log2(n) , fill = celltype)) +
  geom_bar(stat = "identity" , position = "dodge") +
  scale_fill_manual(values = celltype_colors) +
  theme_classic() +
  theme(axis.text.x = element_blank()) +
  labs(y = "log2(# cells)", x = "Cell type")

# UMAP
p2 = ggplot(meta , aes(x = x , y = y , col = celltype)) +
  geom_point(size=1,alpha = .9) +
  scale_color_manual(values = celltype_colors) +
  theme_classic() +
  labs(x = "UMAP-1" , y = "UMAP-2") +
  ggtitle("UMAP")

# combine
p = ggarrange(p1,p2, common.legend = T)
p
Cell type composition.

Figure 1: Cell type composition

4 Select gene panel of size 10

To select gene panel, use gene_search.

We use sample as batch identification.

We also pre-computed first 5 genes for the selection from scratch, and we will plug them as an argument for genes_base.

 
n_genes_total = 10
genes_stat = gene_search(sce_mouseEmbryo , genes_base = c("Hba-x", "Acta2", "Ttr", "Crabp1" , "Hoxaas3"), n_genes_total = n_genes_total, batch = "sample", verbose = T)
#> Constructing the True graph.
#> True graph is constructed.
#> New gene is added: Krt18. 4 left.
#> New gene is added: Phlda2. 3 left.
#> New gene is added: Plvap. 2 left.
#> New gene is added: Ube2c. 1 left.
#> New gene is added: Meox1. 0 left.
genes = genes_stat$gene

5 Evaluation of the gene panel

5.1 Cell type confusion matrix

Cell type mapping for a single gene panel can be computed separately using get_celltype_mapping.

We also included visualization function that returns heatmap for the confusion matrix: plot_mapping_heatmap.


celltype_mapping = get_celltype_mapping(sce_mouseEmbryo , genes.selection = genes, batch = "sample", return.stat = F)
p = plot_mapping_heatmap(celltype_mapping$mapping, title = "Cell type confusion matrix")
p
Cell type confusion matrix.

Figure 2: Cell type confusion matrix

5.2 Cell score

Henceforth, to be less wordy, we refer to cell neighborhood preservation score as cell score; gene prediction score as gene score.

We estimate the quality of the panel using evaluate_library. In this example, we will only compute cell score (see extended tutorials for more in-depth analysis).


stat = evaluate_library(sce_mouseEmbryo, genes, genes.all = rownames(sce_mouseEmbryo), batch = "sample", 
                        library.size_type = "single", celltype.id = "celltype",
                        return.cell_score_stat = T, return.gene_score_stat = F, return.celltype_stat = F, verbose = FALSE)

5.2.1 Cell score - distribution per cell type

Let’s assess whether certain cell types are consistently ‘under-preserved’ by looking at the distribution of cell scores across cell types.



cell_score_stat = stat$cell_score_stat[stat$cell_score_stat$n_genes == n_genes_total , ] 
cell_score_stat = merge(cell_score_stat, meta) 

p = ggplot(cell_score_stat , aes(x = celltype , y = cell_score, fill = celltype)) + 
  geom_boxplot() + 
  scale_fill_manual(values = celltype_colors) + 
  theme_classic() + 
  labs(y = "Cell neighborhood preservation score" , x = "# genes") + 
  theme(legend.position = "none") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 
p 
Cell score per cell type.

Figure 3: Cell score per cell type

We see that PGCs and blood progenitors in average have lower scores. This is consistent with observation from Fig. 2.

To note, it is not particularly surprising since PGC and blood progenitors are very rare cell types, and our panel size is only 10. Therefore we suggest that more genes should be included into the panel to successfully resolve these cell types.

6 Characterisation of the selected genes

Let’s actually look into what genes we select.

6.1 What are the genes we are selecting - visual inspection

Let’s plot UMAPs, colored by expression of the selected genes, to have an intuitive grasp of what kind of genes we select. UMAPs can be easily plotted with provided function plot_umaps_w_counts. Note that we require UMAP coordinates to be stored in reducedDim(sce) under the name ‘UMAP’, and colnames for the coordinates should be ‘x’ and ‘y’.

If UMAPs are not provided, you can compute them using get_umap_coordinates and then store it in sce.


p = plot_umaps_w_counts(sce_mouseEmbryo, genes)
p
UMAPs of selected genes.

Figure 4: UMAPs of selected genes

Overall we see that most of the genes are expressed in some transcriptional regions but not others (i.e. like cell type markers behavior).

We also select cell state marker - cell cycle gene Ube2c.

6.2 UMAP-representation on how well data is preserved for the selected panel

Let’s compare UMAP plots when using whole transcriptome and using the selected panel (UMAP-coordinates can be calculated using get_umap_coordinates).

For consistency, e will re-calculate UMAP coordinates for the whole transcriptome as well.


umaps_all = get_umap_coordinates(sce_mouseEmbryo, genes = rownames(sce_mouseEmbryo), batch = "sample", nPC = 50)
colnames(umaps_all) = c("cell", "x_all", "y_all")

umaps_selection = get_umap_coordinates(sce_mouseEmbryo, genes = genes, batch = "sample")
#> Warning in (function (A, nv = 5, nu = nv, maxit = 1000, work = nv + 7, reorth =
#> TRUE, : You're computing too large a percentage of total singular values, use a
#> standard svd instead.
#> Warning in (function (query, X, clust_centers, clust_info, dtype, nn,
#> get_index, : detected tied distances to neighbors, see ?'BiocNeighbors-ties'
colnames(umaps_selection) = c("cell", "x_selection", "y_selection")

umaps_combined = merge(umaps_all, umaps_selection)

meta = as.data.frame(colData(sce_mouseEmbryo))
meta = merge(meta, umaps_combined)

# plot
p1 = ggplot(meta , aes(x = x_all , y = y_all , col = celltype)) +
  geom_point(size=1,alpha = .9) +
  scale_color_manual(values = celltype_colors) +
  theme_classic() +
  labs(x = "UMAP-1" , y = "UMAP-2") +
  ggtitle("All genes")
p2 = ggplot(meta , aes(x = x_selection , y = y_selection , col = celltype)) +
  geom_point(size=1,alpha = .9) +
  scale_color_manual(values = celltype_colors) +
  theme_classic() +
  labs(x = "UMAP-1" , y = "UMAP-2") +
  ggtitle("All genes")
p = ggarrange(p1,p2,common.legend = T)
p
UMAP comparison while using all genes and the selection.

Figure 5: UMAP comparison while using all genes and the selection

6.3 Average expression across cell types and co-expression

Let’s look a bit more systematically into how cell type specific are expressions of the selected genes.


p = plot_expression_heatmap(sce_mouseEmbryo, genes = genes, value.type = "mean") 
p 
Average expression per cell type.

Figure 6: Average expression per cell type

Also, let’s look in overall co-expression between selected genes.


p = plot_coexpression(sce_mouseEmbryo, genes = genes) 
p 
Co-expression of the selected genes.

Figure 7: Co-expression of the selected genes

6.4 Estimation of redundancy

Finally, let’s estimate the redundancy of the gene panel on gene/celltype level: for each gene, we temporarily remove it from the panel and compare the accuracy of cell type mappings with and without removed gene (per cell type).

The function to get this stat is get_redundancy_stat. We also provide the function that will plot the results as a heatmap: plot_redundancy_stat. Red corresponds to cases where cell type mapping accuracy drops with the removal of the gene, green - where cell type mapping accuracy increases with the removal of the gene (that can occur for cell type that were not well mapped to begin with).



redundancy_stat = get_redundancy_stat(sce_mouseEmbryo, genes, genes_to_assess = genes, batch = "sample") 
p = plot_redundancy_stat(redundancy_stat) 
p 
Redundancy heatmap.

Figure 8: Redundancy heatmap

From this analysis, we can conclude the particular relevance of some genes for some cell types e.g. Plvap for Endothelium, Acta2 for Cardiomyocytes, etc.

This line of analysis also allows to estimate overall redundancy of the panel which is useful when planning -FISH experiments.

7 Session Info

sessionInfo()
#> R version 4.1.0 (2021-05-18)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur 10.16
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
#> 
#> attached base packages:
#> [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
#> [8] methods   base     
#> 
#> other attached packages:
#>  [1] ggpubr_0.4.0                ggplot2_3.3.3              
#>  [3] tibble_3.1.2                SingleCellExperiment_1.14.1
#>  [5] SummarizedExperiment_1.22.0 Biobase_2.52.0             
#>  [7] GenomicRanges_1.44.0        GenomeInfoDb_1.28.0        
#>  [9] IRanges_2.26.0              S4Vectors_0.30.0           
#> [11] BiocGenerics_0.38.0         MatrixGenerics_1.4.0       
#> [13] matrixStats_0.59.0          geneBasisR_0.0.0.9000      
#> [15] BiocStyle_2.20.0           
#> 
#> loaded via a namespace (and not attached):
#>   [1] readxl_1.3.1              backports_1.2.1          
#>   [3] paleotree_3.3.25          fastmatch_1.1-0          
#>   [5] plyr_1.8.6                igraph_1.2.6             
#>   [7] BiocParallel_1.26.0       digest_0.6.27            
#>   [9] htmltools_0.5.1.1         magick_2.7.2             
#>  [11] viridis_0.6.1             wesanderson_0.3.6        
#>  [13] gdata_2.18.0              fansi_0.5.0              
#>  [15] magrittr_2.0.1            phytools_0.7-80          
#>  [17] ScaledMatrix_1.0.0        cluster_2.1.2            
#>  [19] openxlsx_4.2.3            limma_3.48.0             
#>  [21] colorspace_2.0-1          haven_2.4.1              
#>  [23] xfun_0.23                 dplyr_1.0.6              
#>  [25] crayon_1.4.1              RCurl_1.98-1.3           
#>  [27] jsonlite_1.7.2            phangorn_2.7.0           
#>  [29] ape_5.5                   glue_1.4.2               
#>  [31] gtable_0.3.0              zlibbioc_1.38.0          
#>  [33] XVector_0.32.0            DelayedArray_0.18.0      
#>  [35] car_3.0-10                BiocSingular_1.8.0       
#>  [37] RcppZiggurat_0.1.6        maps_3.3.0               
#>  [39] abind_1.4-5               scales_1.1.1             
#>  [41] DBI_1.1.1                 edgeR_3.34.0             
#>  [43] rstatix_0.7.0             Rcpp_1.0.6               
#>  [45] plotrix_3.8-1             viridisLite_0.4.0        
#>  [47] tmvnsim_1.0-2             dqrng_0.3.0              
#>  [49] foreign_0.8-81            rsvd_1.0.5               
#>  [51] ResidualMatrix_1.2.0      metapod_1.0.0            
#>  [53] ellipsis_0.3.2            farver_2.1.0             
#>  [55] pkgconfig_2.0.3           scuttle_1.2.0            
#>  [57] sass_0.4.0                uwot_0.1.10              
#>  [59] locfit_1.5-9.4            utf8_1.2.1               
#>  [61] reshape2_1.4.4            labeling_0.4.2           
#>  [63] tidyselect_1.1.1          rlang_0.4.11             
#>  [65] ggcorrplot_0.1.3          munsell_0.5.0            
#>  [67] cellranger_1.1.0          tools_4.1.0              
#>  [69] generics_0.1.0            broom_0.7.6              
#>  [71] batchelor_1.8.0           evaluate_0.14            
#>  [73] stringr_1.4.0             yaml_2.2.1               
#>  [75] knitr_1.33                zip_2.2.0                
#>  [77] purrr_0.3.4               nlme_3.1-152             
#>  [79] sparseMatrixStats_1.4.0   scran_1.20.1             
#>  [81] compiler_4.1.0            curl_4.3.1               
#>  [83] png_0.1-7                 ggsignif_0.6.1           
#>  [85] clusterGeneration_1.3.7   statmod_1.4.36           
#>  [87] bslib_0.2.5.1             stringi_1.6.2            
#>  [89] highr_0.9                 RSpectra_0.16-0          
#>  [91] forcats_0.5.1             lattice_0.20-44          
#>  [93] bluster_1.2.1             Matrix_1.3-4             
#>  [95] vctrs_0.3.8               pillar_1.6.1             
#>  [97] lifecycle_1.0.0           BiocManager_1.30.15      
#>  [99] combinat_0.0-8            jquerylib_0.1.4          
#> [101] RcppAnnoy_0.0.18          BiocNeighbors_1.10.0     
#> [103] cowplot_1.1.1             data.table_1.14.0        
#> [105] bitops_1.0-7              irlba_2.3.3              
#> [107] R6_2.5.0                  bookdown_0.22            
#> [109] gridExtra_2.3             rio_0.5.26               
#> [111] codetools_0.2-18          MASS_7.3-54              
#> [113] gtools_3.8.2              assertthat_0.2.1         
#> [115] withr_2.4.2               mnormt_2.0.2             
#> [117] GenomeInfoDbData_1.2.6    expm_0.999-6             
#> [119] hms_1.1.0                 quadprog_1.5-8           
#> [121] grid_4.1.0                beachmat_2.8.0           
#> [123] tidyr_1.1.3               coda_0.19-4              
#> [125] rmarkdown_2.8             Rfast_2.0.3              
#> [127] DelayedMatrixStats_1.14.0 carData_3.0-4            
#> [129] numDeriv_2016.8-1.1       scatterplot3d_0.3-41