geneBasisR 0.0.0.9000
library(geneBasisR)
library(SingleCellExperiment)
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
#>
#> Attaching package: 'MatrixGenerics'
#> The following objects are masked from 'package:matrixStats':
#>
#> colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
#> colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
#> colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
#> colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
#> colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
#> colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
#> colWeightedMeans, colWeightedMedians, colWeightedSds,
#> colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
#> rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
#> rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
#> rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
#> rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
#> rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
#> rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
#> rowWeightedSds, rowWeightedVars
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#>
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#>
#> clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#> clusterExport, clusterMap, parApply, parCapply, parLapply,
#> parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#>
#> anyDuplicated, append, as.data.frame, basename, cbind, colnames,
#> dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
#> grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
#> order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#> rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
#> union, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#>
#> Attaching package: 'S4Vectors'
#> The following objects are masked from 'package:base':
#>
#> expand.grid, I, unname
#> Loading required package: IRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: Biobase
#> Welcome to Bioconductor
#>
#> Vignettes contain introductory material; view with
#> 'browseVignettes()'. To cite Bioconductor, see
#> 'citation("Biobase")', and for packages 'citation("pkgname")'.
#>
#> Attaching package: 'Biobase'
#> The following object is masked from 'package:MatrixGenerics':
#>
#> rowMedians
#> The following objects are masked from 'package:matrixStats':
#>
#> anyMissing, rowMedians
library(tibble)
library(ggplot2)
library(ggpubr)
library(knitr)
library(reshape2)
geneBasis
is an iterative greedy approach for gene selection. It strives a gene selection that will preserve transcriptional relationships between cells, represented as kNN-graphs.
Here we exemplify how geneBasis
can be utilized for library design. As an input, we use raw .txt files containing counts and metadata.
First, we design a library of designated size from scratch and using an initial selection. We then compare two libraries at cell type, cell and gene score levels.
We will be working with adult human spleen. Log-counts and meta file are stored as .txt files, and we will use the function raw_to_sce
to transform it to the right format (SingleCellExperiment object).
We performed initial selection for genes and retained only first 3000 highly variable genes (solely to decrease memory usage), but we will nevertheless introduce how to pre-select features using retain_informative_genes
(but since we already did the selection for HVGs, we set select.hvgs = FALSE
).
# note that we saved logcounts already. If you work with counts, specify it in 'counts_type'
counts_dir = system.file("extdata", "raw_spleen.txt", package = "geneBasisR")
meta_dir = system.file("extdata", "raw_spleen_meta.txt", package = "geneBasisR")
sce_spleen = raw_to_sce(counts_dir = counts_dir, counts_type = "logcounts", transform_counts_to_logcounts = FALSE, header = TRUE, sep = "\t" , meta_dir = meta_dir, batch = NULL)
#> Counts matrix is being processed.
#> Meta file is being processed.
meta_spleen = as.data.frame(colData(sce_spleen))
sce_spleen = retain_informative_genes(sce_spleen, select.hvgs = FALSE, discard.mt = TRUE)
#> 2997 genes retained
# let's print out cell type composition
tab = as.data.frame(table(sce_spleen$celltype))
colnames(tab) = c("celltype", "n")
tab = tab[order(tab$n),]
kable(tab , caption = "Cell type composition")
celltype | n | |
---|---|---|
7 | Fibroblast | 11 |
3 | DC1 | 13 |
9 | HSC | 29 |
10 | ILC | 34 |
16 | Plasmablast | 41 |
6 | Ery | 44 |
15 | Plasma cell | 114 |
5 | EC | 118 |
4 | DC2 | 126 |
11 | Mac | 198 |
14 | NK | 201 |
1 | CD4+ T | 208 |
2 | CD8+ T | 255 |
13 | MZ B | 320 |
8 | Follicular B | 324 |
12 | Monocyte | 561 |
Let’s select library of 10 genes from scratch with gene_search
.
We will precompute intermediate stat we will need for the run using calc_Minkowski_distances
.
It is not essential and will be computed automatically, if you set stat_all = NULL
in gene_search
(default option). The user-case where it is beneficial to precompute intermediate stat is when multiple libraries are sought to be selected and this stat can be recycled. Also note that to set the colnames correctly (see example below).
stat_all = calc_Minkowski_distances(sce_spleen , genes = rownames(sce_spleen))
colnames(stat_all) = c("gene" , "dist_all")
genes_stat = gene_search(sce_spleen , n_genes_total = 25, stat_all = stat_all , verbose = F)
genes_geneBasis = genes_stat$gene
kable(genes_stat , caption = "Ranked gene panel")
rank | gene |
---|---|
1 | S100A9 |
2 | IGKC |
3 | HBB |
4 | IGLC2 |
5 | C1QB |
6 | IGHM |
7 | IGHG1 |
8 | IGHA2 |
9 | RPL13 |
10 | IGLC3 |
11 | AREG |
12 | IL1B |
13 | CST3 |
14 | HSP90AA1 |
15 | AHSP |
16 | COL4A1 |
17 | RPL10P9 |
18 | FTH1 |
19 | RPL17-C18orf32 |
20 | IGLL5 |
21 | IGHA1 |
22 | FCN3 |
23 | S100A8 |
24 | G0S2 |
25 | SAT1 |
As a control, we manually picked 1-2 markers for each cell type that were used for cell type annotation.
genes_manual = c("IL7R", "CD3D", "KLRD1", "IFITM1", "C1QA" , "C1QB" , "CST3" , "CD74" , "CCL14" , "TIMP3" ,
"HBB", "HBA2", "IGFBP7", "CXCL13" , "CD37" , "TCL1A", "GSTM3" , "AREG" , "LRRC75A" ,
"GNLY" , "APOE" , "S100A9" , "CD27" , "GZMB" , "IGKC")
mapping_geneBasis = get_celltype_mapping(sce_spleen, genes_geneBasis)
mapping_manual = get_celltype_mapping(sce_spleen, genes_manual)
p1 = plot_mapping_heatmap(mapping_geneBasis$mapping) + ggtitle("geneBasis")
p2 = plot_mapping_heatmap(mapping_manual$mapping) + ggtitle("Manual")
p = ggarrange(p1,p2, common.legend = T)
p
Overall, we see that we select genes that resolve cell types to a similar degree than manually curated and selected markers.
Let’s perform a quantitative per cell type comparison.
mapping_geneBasis = mapping_geneBasis$stat
colnames(mapping_geneBasis) = c("celltype" , "geneBasis")
mapping_manual = mapping_manual$stat
colnames(mapping_manual) = c("celltype" , "Manual")
mapping = merge(mapping_geneBasis, mapping_manual)
mapping = melt(mapping, id = "celltype")
p = ggplot(mapping , aes(x = variable , y = value , fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~celltype) +
theme_classic() +
labs(x = "method" , y = "Fraction of cells mapped correctly")
p
Now, let’s assess how well we preserve cell neighborhoods.
We can calculate cell score for each library using get_neighborhood_preservation_scores
.
We will also precompute intermediate variable neighs.all_stat
(using get_neighs_all_stat
) since it will be recycled (note that this is optional, get_neighborhood_preservation_scores
can compute it automatically if default settings).
# we calculate neighs.all because we will recycle it
neighs.all_stat = get_neighs_all_stat(sce_spleen)
# cell score - geneBasis selection
cellScore_geneBasis = get_neighborhood_preservation_scores(sce_spleen, neighs.all_stat = neighs.all_stat, genes.selection = genes_geneBasis)
colnames(cellScore_geneBasis) = c("cell", "cell_score_geneBasis")
# cell score - manual selection
cellScore_manual = get_neighborhood_preservation_scores(sce_spleen, neighs.all_stat = neighs.all_stat, genes.selection = genes_manual)
colnames(cellScore_manual) = c("cell", "cell_score_manual")
# combine together
cellScore = merge(cellScore_geneBasis, cellScore_manual)
cellScore = merge(cellScore, meta_spleen)
p = ggplot(cellScore, aes(x = cell_score_geneBasis , y = cell_score_manual)) +
geom_point(alpha = .5) +
geom_abline(slope=1, color = "firebrick4", size = 1.5) +
facet_wrap(~celltype) +
theme_classic() +
ylim(c(0,1)) +
labs(x = "geneBasis" , y = "Manual") +
ggtitle("Cell score")
p
#> Warning: Removed 43 rows containing missing values (geom_point).
Overall, we observe that two selections either preserve cell neighborhoods comparably or geneBasis preserves it better for some cell types (e.g. Plasma cells, Plasmablasts, Endothelium cells (EC)).
Finally, let’s assess how well we explain transcriptome.
We can calculate cell score for each library using get_gene_prediction_scores
.
We will also precompute intermediate variable gene_stat_all
(using get_gene_correlation_scores
) since it will be recycled (note that this is optional, get_neighborhood_preservation_scores
can compute it automatically if default settings). If you plan to precompute this, do not forget to set right the colnames (see example below).
# we calculate gene_stat_all because we will recycle it
gene_stat_all = get_gene_correlation_scores(sce_spleen, genes = rownames(sce_spleen), nPC = 50)
#> Warning in cor(stat_real[i, ], stat_predict[i, ], method = method): the standard
#> deviation is zero
#> Warning in cor(stat_real[i, ], stat_predict[i, ], method = method): the standard
#> deviation is zero
#> Warning in cor(stat_real[i, ], stat_predict[i, ], method = method): the standard
#> deviation is zero
colnames(gene_stat_all) = c("gene" , "corr_all")
# gene score - geneBasis selection
geneScore_geneBasis = get_gene_prediction_scores(sce_spleen, genes.selection = genes_geneBasis, gene_stat_all = gene_stat_all)
geneScore_geneBasis = geneScore_geneBasis[, c("gene", "gene_score")]
colnames(geneScore_geneBasis) = c("gene", "gene_score_geneBasis")
# gene score - manual selection
geneScore_manual = get_gene_prediction_scores(sce_spleen, genes.selection = genes_manual, gene_stat_all = gene_stat_all)
geneScore_manual = geneScore_manual[, c("gene", "gene_score")]
colnames(geneScore_manual) = c("gene", "gene_score_manual")
# combine together
geneScore = merge(geneScore_geneBasis, geneScore_manual)
p = ggplot(geneScore, aes(x = gene_score_geneBasis , y = gene_score_manual)) +
geom_point(alpha = .5) +
geom_abline(slope=1, color = "firebrick4", size = 1.5) +
theme_classic() +
labs(x = "geneBasis" , y = "Manual") +
ggtitle("Gene score")
p
Overall, we see that we predict transcriptome to a similar degree. However, for both methods, there is a subset of genes predicted well with one selection but not the other. The downstream assessment of genes that are not predicted well is not a part of the package per se, however, we include several plotting functions to facilitate the analysis (exemplified in the Tutorial for Mouse embryo).
sessionInfo()
#> R version 4.1.0 (2021-05-18)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur 10.16
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
#>
#> attached base packages:
#> [1] parallel stats4 stats graphics grDevices utils datasets
#> [8] methods base
#>
#> other attached packages:
#> [1] reshape2_1.4.4 knitr_1.33
#> [3] ggpubr_0.4.0 ggplot2_3.3.3
#> [5] tibble_3.1.2 SingleCellExperiment_1.14.1
#> [7] SummarizedExperiment_1.22.0 Biobase_2.52.0
#> [9] GenomicRanges_1.44.0 GenomeInfoDb_1.28.0
#> [11] IRanges_2.26.0 S4Vectors_0.30.0
#> [13] BiocGenerics_0.38.0 MatrixGenerics_1.4.0
#> [15] matrixStats_0.59.0 geneBasisR_0.0.0.9000
#> [17] BiocStyle_2.20.0
#>
#> loaded via a namespace (and not attached):
#> [1] readxl_1.3.1 backports_1.2.1
#> [3] paleotree_3.3.25 fastmatch_1.1-0
#> [5] plyr_1.8.6 igraph_1.2.6
#> [7] BiocParallel_1.26.0 digest_0.6.27
#> [9] htmltools_0.5.1.1 magick_2.7.2
#> [11] viridis_0.6.1 wesanderson_0.3.6
#> [13] gdata_2.18.0 fansi_0.5.0
#> [15] magrittr_2.0.1 phytools_0.7-80
#> [17] ScaledMatrix_1.0.0 cluster_2.1.2
#> [19] openxlsx_4.2.3 limma_3.48.0
#> [21] colorspace_2.0-1 haven_2.4.1
#> [23] xfun_0.23 dplyr_1.0.6
#> [25] crayon_1.4.1 RCurl_1.98-1.3
#> [27] jsonlite_1.7.2 phangorn_2.7.0
#> [29] ape_5.5 glue_1.4.2
#> [31] gtable_0.3.0 zlibbioc_1.38.0
#> [33] XVector_0.32.0 DelayedArray_0.18.0
#> [35] car_3.0-10 BiocSingular_1.8.0
#> [37] RcppZiggurat_0.1.6 maps_3.3.0
#> [39] abind_1.4-5 scales_1.1.1
#> [41] DBI_1.1.1 edgeR_3.34.0
#> [43] rstatix_0.7.0 Rcpp_1.0.6
#> [45] plotrix_3.8-1 viridisLite_0.4.0
#> [47] tmvnsim_1.0-2 dqrng_0.3.0
#> [49] foreign_0.8-81 rsvd_1.0.5
#> [51] ResidualMatrix_1.2.0 metapod_1.0.0
#> [53] ellipsis_0.3.2 farver_2.1.0
#> [55] pkgconfig_2.0.3 scuttle_1.2.0
#> [57] sass_0.4.0 uwot_0.1.10
#> [59] locfit_1.5-9.4 utf8_1.2.1
#> [61] labeling_0.4.2 tidyselect_1.1.1
#> [63] rlang_0.4.11 ggcorrplot_0.1.3
#> [65] munsell_0.5.0 cellranger_1.1.0
#> [67] tools_4.1.0 generics_0.1.0
#> [69] broom_0.7.6 batchelor_1.8.0
#> [71] evaluate_0.14 stringr_1.4.0
#> [73] yaml_2.2.1 zip_2.2.0
#> [75] purrr_0.3.4 nlme_3.1-152
#> [77] sparseMatrixStats_1.4.0 scran_1.20.1
#> [79] compiler_4.1.0 curl_4.3.1
#> [81] png_0.1-7 ggsignif_0.6.1
#> [83] clusterGeneration_1.3.7 statmod_1.4.36
#> [85] bslib_0.2.5.1 stringi_1.6.2
#> [87] highr_0.9 forcats_0.5.1
#> [89] lattice_0.20-44 bluster_1.2.1
#> [91] Matrix_1.3-4 vctrs_0.3.8
#> [93] pillar_1.6.1 lifecycle_1.0.0
#> [95] BiocManager_1.30.15 combinat_0.0-8
#> [97] jquerylib_0.1.4 BiocNeighbors_1.10.0
#> [99] cowplot_1.1.1 data.table_1.14.0
#> [101] bitops_1.0-7 irlba_2.3.3
#> [103] R6_2.5.0 bookdown_0.22
#> [105] gridExtra_2.3 rio_0.5.26
#> [107] codetools_0.2-18 MASS_7.3-54
#> [109] gtools_3.8.2 assertthat_0.2.1
#> [111] withr_2.4.2 mnormt_2.0.2
#> [113] GenomeInfoDbData_1.2.6 expm_0.999-6
#> [115] hms_1.1.0 quadprog_1.5-8
#> [117] grid_4.1.0 beachmat_2.8.0
#> [119] tidyr_1.1.3 coda_0.19-4
#> [121] rmarkdown_2.8 Rfast_2.0.3
#> [123] DelayedMatrixStats_1.14.0 carData_3.0-4
#> [125] numDeriv_2016.8-1.1 scatterplot3d_0.3-41