library(geneBasisR)
library(SingleCellExperiment)
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
#> 
#> Attaching package: 'MatrixGenerics'
#> The following objects are masked from 'package:matrixStats':
#> 
#>     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
#>     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
#>     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
#>     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
#>     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
#>     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
#>     colWeightedMeans, colWeightedMedians, colWeightedSds,
#>     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
#>     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
#>     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
#>     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
#>     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
#>     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
#>     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
#>     rowWeightedSds, rowWeightedVars
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#> 
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#> 
#>     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#>     clusterExport, clusterMap, parApply, parCapply, parLapply,
#>     parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following objects are masked from 'package:stats':
#> 
#>     IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#> 
#>     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
#>     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
#>     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
#>     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#>     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
#>     union, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#> 
#> Attaching package: 'S4Vectors'
#> The following objects are masked from 'package:base':
#> 
#>     expand.grid, I, unname
#> Loading required package: IRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: Biobase
#> Welcome to Bioconductor
#> 
#>     Vignettes contain introductory material; view with
#>     'browseVignettes()'. To cite Bioconductor, see
#>     'citation("Biobase")', and for packages 'citation("pkgname")'.
#> 
#> Attaching package: 'Biobase'
#> The following object is masked from 'package:MatrixGenerics':
#> 
#>     rowMedians
#> The following objects are masked from 'package:matrixStats':
#> 
#>     anyMissing, rowMedians
library(tibble)
library(ggplot2)
library(ggpubr)
library(knitr)
library(reshape2)

1 Introduction

geneBasis is an iterative greedy approach for gene selection. It strives a gene selection that will preserve transcriptional relationships between cells, represented as kNN-graphs.

Here we exemplify how geneBasis can be utilized for library design. As an input, we use raw .txt files containing counts and metadata.

First, we design a library of designated size from scratch and using an initial selection. We then compare two libraries at cell type, cell and gene score levels.

2 Load data

We will be working with adult human spleen. Log-counts and meta file are stored as .txt files, and we will use the function raw_to_sce to transform it to the right format (SingleCellExperiment object).

We performed initial selection for genes and retained only first 3000 highly variable genes (solely to decrease memory usage), but we will nevertheless introduce how to pre-select features using retain_informative_genes (but since we already did the selection for HVGs, we set select.hvgs = FALSE).


# note that we saved logcounts already. If you work with counts, specify it in 'counts_type'
counts_dir = system.file("extdata", "raw_spleen.txt", package = "geneBasisR")
meta_dir = system.file("extdata", "raw_spleen_meta.txt", package = "geneBasisR")

sce_spleen = raw_to_sce(counts_dir = counts_dir, counts_type = "logcounts", transform_counts_to_logcounts = FALSE, header = TRUE, sep = "\t" , meta_dir = meta_dir, batch = NULL)
#> Counts matrix is being processed.
#> Meta file is being processed.
meta_spleen = as.data.frame(colData(sce_spleen))

sce_spleen = retain_informative_genes(sce_spleen, select.hvgs = FALSE, discard.mt = TRUE)
#> 2997 genes retained

# let's print out cell type composition
tab = as.data.frame(table(sce_spleen$celltype))
colnames(tab) = c("celltype", "n")
tab = tab[order(tab$n),]
kable(tab , caption = "Cell type composition")

Table 1: Cell type composition
celltype n
7 Fibroblast 11
3 DC1 13
9 HSC 29
10 ILC 34
16 Plasmablast 41
6 Ery 44
15 Plasma cell 114
5 EC 118
4 DC2 126
11 Mac 198
14 NK 201
1 CD4+ T 208
2 CD8+ T 255
13 MZ B 320
8 Follicular B 324
12 Monocyte 561

3 Select and compare libraries

3.1 geneBasis

Let’s select library of 10 genes from scratch with gene_search.

We will precompute intermediate stat we will need for the run using calc_Minkowski_distances.

It is not essential and will be computed automatically, if you set stat_all = NULL in gene_search (default option). The user-case where it is beneficial to precompute intermediate stat is when multiple libraries are sought to be selected and this stat can be recycled. Also note that to set the colnames correctly (see example below).


stat_all = calc_Minkowski_distances(sce_spleen , genes = rownames(sce_spleen))
colnames(stat_all) = c("gene" , "dist_all")

genes_stat = gene_search(sce_spleen , n_genes_total = 25, stat_all = stat_all , verbose = F)
genes_geneBasis = genes_stat$gene
kable(genes_stat , caption = "Ranked gene panel")

Table 2: Ranked gene panel
rank gene
1 S100A9
2 IGKC
3 HBB
4 IGLC2
5 C1QB
6 IGHM
7 IGHG1
8 IGHA2
9 RPL13
10 IGLC3
11 AREG
12 IL1B
13 CST3
14 HSP90AA1
15 AHSP
16 COL4A1
17 RPL10P9
18 FTH1
19 RPL17-C18orf32
20 IGLL5
21 IGHA1
22 FCN3
23 S100A8
24 G0S2
25 SAT1

3.2 Manually selected

As a control, we manually picked 1-2 markers for each cell type that were used for cell type annotation.

 

genes_manual = c("IL7R", "CD3D", "KLRD1", "IFITM1", "C1QA" , "C1QB" , "CST3" , "CD74" , "CCL14" , "TIMP3" , 
               "HBB", "HBA2", "IGFBP7", "CXCL13" , "CD37" , "TCL1A", "GSTM3" , "AREG" , "LRRC75A" , 
               "GNLY" , "APOE" , "S100A9" , "CD27" , "GZMB" , "IGKC")

4 Comparison between libraries

4.1 Cell type mapping



mapping_geneBasis = get_celltype_mapping(sce_spleen, genes_geneBasis)
mapping_manual = get_celltype_mapping(sce_spleen, genes_manual)


p1 = plot_mapping_heatmap(mapping_geneBasis$mapping) + ggtitle("geneBasis")
p2 = plot_mapping_heatmap(mapping_manual$mapping) + ggtitle("Manual")
p = ggarrange(p1,p2, common.legend = T) 
p
Cell type confusion matrices.

Figure 1: Cell type confusion matrices

Overall, we see that we select genes that resolve cell types to a similar degree than manually curated and selected markers.

Let’s perform a quantitative per cell type comparison.

4.2 Quantitative comparison


mapping_geneBasis = mapping_geneBasis$stat
colnames(mapping_geneBasis) = c("celltype" , "geneBasis")

mapping_manual = mapping_manual$stat
colnames(mapping_manual) = c("celltype" , "Manual")

mapping = merge(mapping_geneBasis, mapping_manual)
mapping = melt(mapping, id = "celltype")

p = ggplot(mapping , aes(x = variable , y = value , fill = variable)) + 
  geom_bar(stat = "identity", position = "dodge") + 
  facet_wrap(~celltype) + 
  theme_classic() + 
  labs(x = "method" , y = "Fraction of cells mapped correctly")
p
Quantitatice comparison between cell type mappings.

Figure 2: Quantitatice comparison between cell type mappings

4.3 Cell neighborhood preservation score (aka cell score)

Now, let’s assess how well we preserve cell neighborhoods.

We can calculate cell score for each library using get_neighborhood_preservation_scores.

We will also precompute intermediate variable neighs.all_stat (using get_neighs_all_stat) since it will be recycled (note that this is optional, get_neighborhood_preservation_scores can compute it automatically if default settings).



# we calculate neighs.all because we will recycle it
neighs.all_stat = get_neighs_all_stat(sce_spleen)

# cell score - geneBasis selection
cellScore_geneBasis = get_neighborhood_preservation_scores(sce_spleen, neighs.all_stat = neighs.all_stat, genes.selection = genes_geneBasis)
colnames(cellScore_geneBasis) = c("cell", "cell_score_geneBasis")

# cell score - manual selection
cellScore_manual = get_neighborhood_preservation_scores(sce_spleen, neighs.all_stat = neighs.all_stat, genes.selection = genes_manual)
colnames(cellScore_manual) = c("cell", "cell_score_manual")

# combine together
cellScore = merge(cellScore_geneBasis, cellScore_manual)
cellScore = merge(cellScore, meta_spleen)

p = ggplot(cellScore, aes(x = cell_score_geneBasis , y = cell_score_manual)) + 
  geom_point(alpha = .5) + 
  geom_abline(slope=1, color = "firebrick4", size = 1.5) + 
  facet_wrap(~celltype) + 
  theme_classic() + 
  ylim(c(0,1)) + 
  labs(x = "geneBasis" , y = "Manual") + 
  ggtitle("Cell score")
p
#> Warning: Removed 43 rows containing missing values (geom_point).
Cell score comparison.

Figure 3: Cell score comparison

Overall, we observe that two selections either preserve cell neighborhoods comparably or geneBasis preserves it better for some cell types (e.g. Plasma cells, Plasmablasts, Endothelium cells (EC)).

4.4 Gene prediction score (aka gene score)

Finally, let’s assess how well we explain transcriptome.

We can calculate cell score for each library using get_gene_prediction_scores.

We will also precompute intermediate variable gene_stat_all (using get_gene_correlation_scores) since it will be recycled (note that this is optional, get_neighborhood_preservation_scores can compute it automatically if default settings). If you plan to precompute this, do not forget to set right the colnames (see example below).



# we calculate gene_stat_all because we will recycle it
gene_stat_all = get_gene_correlation_scores(sce_spleen, genes = rownames(sce_spleen), nPC = 50)
#> Warning in cor(stat_real[i, ], stat_predict[i, ], method = method): the standard
#> deviation is zero

#> Warning in cor(stat_real[i, ], stat_predict[i, ], method = method): the standard
#> deviation is zero

#> Warning in cor(stat_real[i, ], stat_predict[i, ], method = method): the standard
#> deviation is zero
colnames(gene_stat_all) = c("gene" , "corr_all")

# gene score - geneBasis selection
geneScore_geneBasis = get_gene_prediction_scores(sce_spleen, genes.selection = genes_geneBasis, gene_stat_all = gene_stat_all)
geneScore_geneBasis = geneScore_geneBasis[, c("gene", "gene_score")]
colnames(geneScore_geneBasis) = c("gene", "gene_score_geneBasis")

# gene score - manual selection
geneScore_manual = get_gene_prediction_scores(sce_spleen, genes.selection = genes_manual, gene_stat_all = gene_stat_all)
geneScore_manual = geneScore_manual[, c("gene", "gene_score")]
colnames(geneScore_manual) = c("gene", "gene_score_manual")

# combine together
geneScore = merge(geneScore_geneBasis, geneScore_manual)

p = ggplot(geneScore, aes(x = gene_score_geneBasis , y = gene_score_manual)) + 
  geom_point(alpha = .5) + 
  geom_abline(slope=1, color = "firebrick4", size = 1.5) + 
  theme_classic() + 
  labs(x = "geneBasis" , y = "Manual") + 
  ggtitle("Gene score")
p
Gene score comparison.

Figure 4: Gene score comparison

Overall, we see that we predict transcriptome to a similar degree. However, for both methods, there is a subset of genes predicted well with one selection but not the other. The downstream assessment of genes that are not predicted well is not a part of the package per se, however, we include several plotting functions to facilitate the analysis (exemplified in the Tutorial for Mouse embryo).

5 Session Info

sessionInfo()
#> R version 4.1.0 (2021-05-18)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur 10.16
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
#> 
#> attached base packages:
#> [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
#> [8] methods   base     
#> 
#> other attached packages:
#>  [1] reshape2_1.4.4              knitr_1.33                 
#>  [3] ggpubr_0.4.0                ggplot2_3.3.3              
#>  [5] tibble_3.1.2                SingleCellExperiment_1.14.1
#>  [7] SummarizedExperiment_1.22.0 Biobase_2.52.0             
#>  [9] GenomicRanges_1.44.0        GenomeInfoDb_1.28.0        
#> [11] IRanges_2.26.0              S4Vectors_0.30.0           
#> [13] BiocGenerics_0.38.0         MatrixGenerics_1.4.0       
#> [15] matrixStats_0.59.0          geneBasisR_0.0.0.9000      
#> [17] BiocStyle_2.20.0           
#> 
#> loaded via a namespace (and not attached):
#>   [1] readxl_1.3.1              backports_1.2.1          
#>   [3] paleotree_3.3.25          fastmatch_1.1-0          
#>   [5] plyr_1.8.6                igraph_1.2.6             
#>   [7] BiocParallel_1.26.0       digest_0.6.27            
#>   [9] htmltools_0.5.1.1         magick_2.7.2             
#>  [11] viridis_0.6.1             wesanderson_0.3.6        
#>  [13] gdata_2.18.0              fansi_0.5.0              
#>  [15] magrittr_2.0.1            phytools_0.7-80          
#>  [17] ScaledMatrix_1.0.0        cluster_2.1.2            
#>  [19] openxlsx_4.2.3            limma_3.48.0             
#>  [21] colorspace_2.0-1          haven_2.4.1              
#>  [23] xfun_0.23                 dplyr_1.0.6              
#>  [25] crayon_1.4.1              RCurl_1.98-1.3           
#>  [27] jsonlite_1.7.2            phangorn_2.7.0           
#>  [29] ape_5.5                   glue_1.4.2               
#>  [31] gtable_0.3.0              zlibbioc_1.38.0          
#>  [33] XVector_0.32.0            DelayedArray_0.18.0      
#>  [35] car_3.0-10                BiocSingular_1.8.0       
#>  [37] RcppZiggurat_0.1.6        maps_3.3.0               
#>  [39] abind_1.4-5               scales_1.1.1             
#>  [41] DBI_1.1.1                 edgeR_3.34.0             
#>  [43] rstatix_0.7.0             Rcpp_1.0.6               
#>  [45] plotrix_3.8-1             viridisLite_0.4.0        
#>  [47] tmvnsim_1.0-2             dqrng_0.3.0              
#>  [49] foreign_0.8-81            rsvd_1.0.5               
#>  [51] ResidualMatrix_1.2.0      metapod_1.0.0            
#>  [53] ellipsis_0.3.2            farver_2.1.0             
#>  [55] pkgconfig_2.0.3           scuttle_1.2.0            
#>  [57] sass_0.4.0                uwot_0.1.10              
#>  [59] locfit_1.5-9.4            utf8_1.2.1               
#>  [61] labeling_0.4.2            tidyselect_1.1.1         
#>  [63] rlang_0.4.11              ggcorrplot_0.1.3         
#>  [65] munsell_0.5.0             cellranger_1.1.0         
#>  [67] tools_4.1.0               generics_0.1.0           
#>  [69] broom_0.7.6               batchelor_1.8.0          
#>  [71] evaluate_0.14             stringr_1.4.0            
#>  [73] yaml_2.2.1                zip_2.2.0                
#>  [75] purrr_0.3.4               nlme_3.1-152             
#>  [77] sparseMatrixStats_1.4.0   scran_1.20.1             
#>  [79] compiler_4.1.0            curl_4.3.1               
#>  [81] png_0.1-7                 ggsignif_0.6.1           
#>  [83] clusterGeneration_1.3.7   statmod_1.4.36           
#>  [85] bslib_0.2.5.1             stringi_1.6.2            
#>  [87] highr_0.9                 forcats_0.5.1            
#>  [89] lattice_0.20-44           bluster_1.2.1            
#>  [91] Matrix_1.3-4              vctrs_0.3.8              
#>  [93] pillar_1.6.1              lifecycle_1.0.0          
#>  [95] BiocManager_1.30.15       combinat_0.0-8           
#>  [97] jquerylib_0.1.4           BiocNeighbors_1.10.0     
#>  [99] cowplot_1.1.1             data.table_1.14.0        
#> [101] bitops_1.0-7              irlba_2.3.3              
#> [103] R6_2.5.0                  bookdown_0.22            
#> [105] gridExtra_2.3             rio_0.5.26               
#> [107] codetools_0.2-18          MASS_7.3-54              
#> [109] gtools_3.8.2              assertthat_0.2.1         
#> [111] withr_2.4.2               mnormt_2.0.2             
#> [113] GenomeInfoDbData_1.2.6    expm_0.999-6             
#> [115] hms_1.1.0                 quadprog_1.5-8           
#> [117] grid_4.1.0                beachmat_2.8.0           
#> [119] tidyr_1.1.3               coda_0.19-4              
#> [121] rmarkdown_2.8             Rfast_2.0.3              
#> [123] DelayedMatrixStats_1.14.0 carData_3.0-4            
#> [125] numDeriv_2016.8-1.1       scatterplot3d_0.3-41