# Knit every chunk from the project root so that relative paths resolve the
# same way regardless of where the .Rmd file lives.
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
# fig.width is a chunk option, so it has to be set through opts_chunk; handing
# it to opts_knit$set() does not change the figure size.
# `digit` and `scipen` were dropped from this call: they are not knitr options.
# Number formatting is governed by the base R options `digits` and `scipen`,
# which are set with options().
knitr::opts_chunk$set(fig.width = 15)
# Session-wide settings: significant digits used when printing, the penalty
# applied before switching to scientific notation, and the size limit that the
# future package reads from `future.globals.maxSize` (infinite here).
options(
  digits = 5,
  scipen = 8,
  future.globals.maxSize = Inf
)
# The dataset to analyse is the first trailing command-line argument.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change what commandArgs() returns.
# If no argument is supplied the subscript yields NA; uncomment the line below
# to set the name by hand.
dataset_name <- commandArgs(trailingOnly = TRUE)[1]
#dataset_name <- "hiseq4000"
message(sprintf("Dataset name: %s", dataset_name))
Dataset name: novaseq_l1
# Locate the project root. find_rstudio_root_file() signals an error (it does
# not return NULL) when no root is found, so the lookup is wrapped in
# tryCatch() to make the working-directory fallback below reachable.
project_dir <- tryCatch(
  rprojroot::find_rstudio_root_file(),
  error = function(e) NULL
)
if (is.null(project_dir)) {
  # Fall back to the current working directory and tell the user about it.
  project_dir <- getwd()
  warning(sprintf("No rstudio project root file found.
Setting project directory to current workflow.Rmd file location: %s.
Override if needed.",
                  project_dir))
}
message(sprintf("Project directory: %s", project_dir))
Project directory: /project/6007998/rfarouni/index_hopping
#library(DropletUtils)
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
library(rhdf5)
# Select the parallel backend used for future-based computations.
# NOTE(review): the `multiprocess` strategy is believed to be deprecated in
# later releases of the future package -- confirm before upgrading packages.
plan(multiprocess)
# Load the project's helper scripts from <project_dir>/code, in this order.
code_dir <- file.path(project_dir, "code")
invisible(lapply(
  file.path(
    code_dir,
    c(
      "analysis_functions.R",
      "io_functions.R",
      "workflow_functions.R",
      "plotting_functions.R"
    )
  ),
  source
))
Estimate the sample index hopping probability, infer the true sample of origin, and find the optimal posterior probability threshold for retaining predicted real molecules.
# Maximum false positive rate; setting it manually is not recommended, so it is
# left as NULL here.
max_fpr <- NULL
# Run the workflow for the selected dataset and keep its result list.
data_list <- run_workflow(dataset_name, project_dir, max_fpr = max_fpr)
Step 1: reading read counts from existing file: 540.666 sec elapsed
Step 2: creating outcome counts datatable with grouping vars: 71.602 sec elapsed
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.147 sec elapsed
Step 4: computing read counts distribution statistics: 0.237 sec elapsed
Step 5: estimating pi_r matrix: 0.032 sec elapsed
Step 6: infering the true sample of origin: 16.302 sec elapsed
Step 7: estimating g and computing classification metrics: 0.102 sec elapsed
Step 8: determining the optimal cutoff: 0.062 sec elapsed
Step 9: computing proportion of nonmissingness and updating summary data list: 0.011 sec elapsed
Step 10.1: reassigning reads to sample of origin: 3.156 sec elapsed
Step 10.2: deduplicating read counts: 0.021 sec elapsed
Step 10.3: reassigning hopped reads and deduplicating read counts: 3.201 sec elapsed
Step 10.4: labelling phantom molecules below cutoff: 0.001 sec elapsed
Step 10.5: adding cell-umi-gene labels: 77.54 sec elapsed
Step 10.6: tallying molecule counts by cell-barcode and gene ID: 48.49 sec elapsed
Step 10.7: tranforming cell-gene molecule tally table into long format: 418.564 sec elapsed
Step 10: purging phantoms at q cutoff of 0.960838. Max-FPR threshold user-set FALSE: 890.973 sec elapsed
Step 11: calling cells: 2105.52 sec elapsed
Step 12: saving purged data: 54.558 sec elapsed
Step 13: tallying molecules by cell-barcode: 369.919 sec elapsed
Step 14: saving results: 19.665 sec elapsed
Running workflow: 4069.898 sec elapsed
# Preview the read-count table: cell, gene, umi and one count column per sample.
data_list$read_counts
cell <chr> | gene <chr> | umi <int> | P7_0 <dbl> | P7_1 <dbl> | P7_10 <dbl> | P7_11 <dbl> | P7_12 <dbl> | P7_13 <dbl> | |
---|---|---|---|---|---|---|---|---|---|
AAACCTGAGAAACCTA | Ubqln1 | 790638 | 3 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Fn1 | 283552 | 18 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Ndufa10 | 143313 | 9 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Ndufa10 | 658493 | 8 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Prdx6 | 786649 | 16 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Apoa2 | 115888 | 11 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Apoa2 | 160431 | 6 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Apoa2 | 524725 | 6 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Apoa2 | 572628 | 1 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Apoa2 | 819247 | 3 | 0 | 0 | 0 | 0 | 0 |
# Overall summary statistics of the read-count distribution.
data_list$reads_dist_summary$summary_stats
n_obs <dbl> | p_chimeras <dbl> | g <dbl> | u <dbl> | n_reads <int> |
---|---|---|---|---|
270046363 | 0.039507 | 0.0020802 | 0.041312 | 1448171916 |
# Read-count distribution statistics tabulated by r, with one column per sample.
data_list$reads_dist_summary$conditional
r <int> | n_obs <dbl> | m_bar <dbl> | P7_0 <dbl> | P7_1 <dbl> | P7_10 <dbl> | P7_11 <dbl> | P7_12 <dbl> | |
---|---|---|---|---|---|---|---|---|
1 | 68109075 | 0.2522125247064 | 0.04325200 | 0.0250924 | 0.20344678 | 0.03599161 | 0.05853807 | |
2 | 42204756 | 0.1562870743051 | 0.02863105 | 0.0130457 | 0.22498291 | 0.03287596 | 0.05987137 | |
3 | 31885692 | 0.1180748803493 | 0.02600054 | 0.0109845 | 0.20229338 | 0.03842759 | 0.06984847 | |
4 | 24566893 | 0.0909728712029 | 0.02627956 | 0.0113972 | 0.16166724 | 0.04837964 | 0.08491213 | |
5 | 19093124 | 0.0707031332986 | 0.02804412 | 0.0123261 | 0.11536959 | 0.06197838 | 0.10184015 | |
6 | 15056752 | 0.0557561739871 | 0.03138674 | 0.0133346 | 0.07418112 | 0.07806721 | 0.11662184 | |
7 | 12043737 | 0.0445987750629 | 0.03634271 | 0.0142518 | 0.04367466 | 0.09474717 | 0.12599337 | |
8 | 9721218 | 0.0359983296646 | 0.04317443 | 0.0151554 | 0.02407465 | 0.11076014 | 0.12853383 | |
9 | 7861456 | 0.0291115048270 | 0.05245781 | 0.0163692 | 0.01276833 | 0.12404782 | 0.12440489 | |
10 | 6360310 | 0.0235526593632 | 0.06463308 | 0.0179692 | 0.00667032 | 0.13375400 | 0.11515667 |
# Estimated pi_r matrix: one row per r, one column per sample.
data_list$pi_r_hat
r <int> | P7_0 <dbl> | P7_1 <dbl> | P7_10 <dbl> | P7_11 <dbl> | P7_12 <dbl> | P7_13 <dbl> | |
---|---|---|---|---|---|---|---|
1 | 0.04308204250 | 0.02476208487 | 0.20469135422 | 0.03575753434 | 0.05850308872 | 0.05664131200 | |
2 | 0.02833198427 | 0.01260897568 | 0.22641765466 | 0.03261437987 | 0.05984815745 | 0.05614651479 | |
3 | 0.02567825001 | 0.01052962192 | 0.20352776718 | 0.03821502929 | 0.06991335347 | 0.06528140988 | |
4 | 0.02595973446 | 0.01094593486 | 0.16254289516 | 0.04825495830 | 0.08511003291 | 0.07993427913 | |
5 | 0.02773986792 | 0.01188301951 | 0.11583642914 | 0.06197376904 | 0.10218752682 | 0.09707183561 | |
6 | 0.03111200499 | 0.01290044682 | 0.07428426142 | 0.07820467313 | 0.11709974422 | 0.11315517835 | |
7 | 0.03611173507 | 0.01382579327 | 0.04350843239 | 0.09503191599 | 0.12655402181 | 0.12466304033 | |
8 | 0.04300377850 | 0.01473738683 | 0.02373534554 | 0.11118628533 | 0.12911691463 | 0.13010173685 | |
9 | 0.05236913807 | 0.01596189307 | 0.01232919465 | 0.12459128900 | 0.12495151466 | 0.12898816943 | |
10 | 0.06465191134 | 0.01757603894 | 0.00617733786 | 0.13438317936 | 0.11562163054 | 0.12219689953 |
# Plot the molecule distributions and place the legend in a narrow column to
# the right of the main panel.
p_read <- plot_molecules_distributions(data_list, dataset_name, x_lim = 120)
plot_grid(p_read$p, p_read$legend, ncol = 2, rel_widths = c(1, 0.1))
# Outcome counts table, including q, qs and the FPR/FNR columns.
data_list$outcome_counts
outcome <chr> | n <int> | q <dbl> | qs <dbl> | j <dbl> | o <dbl> | FPR <dbl> | FNR <dbl> | r <int> | |
---|---|---|---|---|---|---|---|---|---|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6 | 318062 | 1 | 0 | 0.0011803 | 0.0011778 | 0 | 0.99882 | 6 | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0 | 98291 | 1 | 0 | 0.0015450 | 0.0015418 | 0 | 0.99846 | 6 | |
0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0 | 810916 | 1 | 0 | 0.0045541 | 0.0045447 | 0 | 0.99545 | 6 | |
0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0 | 640901 | 1 | 0 | 0.0069324 | 0.0069180 | 0 | 0.99307 | 6 | |
0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0 | 1423411 | 1 | 0 | 0.0122144 | 0.0121889 | 0 | 0.98779 | 6 | |
0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0 | 266441 | 1 | 0 | 0.0132031 | 0.0131756 | 0 | 0.98680 | 6 | |
0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0 | 1570432 | 1 | 0 | 0.0190306 | 0.0189910 | 0 | 0.98097 | 6 | |
0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0 | 1379298 | 1 | 0 | 0.0241489 | 0.0240986 | 0 | 0.97585 | 6 | |
0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0 | 974884 | 1 | 0 | 0.0277665 | 0.0277087 | 0 | 0.97223 | 6 | |
0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0 | 731726 | 1 | 0 | 0.0304817 | 0.0304183 | 0 | 0.96952 | 6 |
# Chimera counts table stored with the fit output.
data_list$fit_out$chimera_counts
r <dbl> | 1 <dbl> | 2 <dbl> | 3 <dbl> | 4 <dbl> | 5 <dbl> | 6 <dbl> | 7 <dbl> | 8 <dbl> | 9 <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 68109075 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
2 | 41506874 | 697882 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
3 | 31102134 | 777105 | 6453 | 0 | 0 | 0 | 0 | 0 | 0 | |
4 | 23765697 | 791777 | 9367 | 52 | 0 | 0 | 0 | 0 | 0 | |
5 | 18318240 | 762497 | 12298 | 87 | 2 | 0 | 0 | 0 | 0 | |
6 | 14326357 | 715947 | 14296 | 148 | 3 | 1 | 0 | 0 | 0 | |
7 | 11364897 | 662959 | 15669 | 206 | 5 | 1 | 0 | 0 | 0 | |
8 | 9093619 | 610534 | 16846 | 219 | 0 | 0 | 0 | 0 | 0 | |
9 | 7294944 | 548742 | 17466 | 296 | 6 | 0 | 2 | 0 | 0 | |
10 | 5853521 | 489138 | 17275 | 369 | 5 | 2 | 0 | 0 | 0 |
# GLM estimates stored with the fit output (phat, phat_low, phat_high, SIHR, SBIHR).
data_list$fit_out$glm_estimates
max_r <int> | phat <dbl> | phat_low <dbl> | phat_high <dbl> | SIHR <dbl> | SBIHR <dbl> |
---|---|---|---|---|---|
25 | 0.99179 | 0.99179 | 0.9918 | 0.0082058 | 0.008616 |
# Plot the model fit with its legend in a side column.
p_fit <- plot_fit(data_list, dataset_name)
plot_grid(p_fit$p, p_fit$legend, ncol = 2, rel_widths = c(1, 0.2))
# Outcomes at the optimal cutoff, just above it, just below it, and with none.
data_list$optimal_cutoff
cutoff <chr> | outcome <chr> | q <dbl> | s <int> | FPR <dbl> | j <dbl> | qs <dbl> | o <dbl> | |
---|---|---|---|---|---|---|---|---|
optimal | 0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0 | 0.96084 | 9 | 0.044675 | 0.94973 | 145.93 | 0.99428 | |
above | 0,0,0,0,0,0,0,0,0,2,0,0,0,0,2,0 | 0.96147 | 10 | 0.044674 | 0.94973 | 145.86 | 0.99428 | |
below | 0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0 | 0.95818 | 9 | 0.044707 | 0.94973 | 146.21 | 0.99431 | |
none | 0,0,0,0,1,0,1,0,1,1,0,0,1,0,0,0 | 0.27418 | 10 | 0.061346 | 0.93807 | 158.61 | 1.00000 |
# Plot the posterior probabilities with the legend in a narrow side column.
p_post <- plot_posterior_prob(data_list, dataset_name)
plot_grid(p_post$p, p_post$legend, ncol = 2, rel_widths = c(1, 0.1))
Note that q is the marginal posterior distribution of the predicted sample of origin s, and qs is a transformation of q.
First we examine the extent of the effects of index hopping on individual samples and then on cell-barcodes.
Here r_a is the number of molecules above the optimal cutoff q, which we predict as real molecules. r_b is the number of molecules at or below the cutoff, which we therefore predict as phantoms. f is the number of molecules predicted as phantoms no matter what the threshold is. m is the total number of molecules.
# Per-sample summary (m, r_a, r_b, f and related proportions).
data_list$reads_dist_summary$marginal
sample <chr> | m <dbl> | r_a <dbl> | r_b <dbl> | f <dbl> | prop_m <dbl> | prop_reads <dbl> | FRM <dbl> |
---|---|---|---|---|---|---|---|
P7_0 | 13.5422 | 0.93165 | 0.00117330 | 0.067179 | 0.048158 | 0.073506 | 7.8606 |
P7_1 | 8.1590 | 0.86535 | 0.00035433 | 0.134298 | 0.029015 | 0.081457 | 14.4582 |
P7_10 | 38.6810 | 0.98274 | 0.00401132 | 0.013247 | 0.137556 | 0.064398 | 2.4110 |
P7_11 | 16.1980 | 0.95491 | 0.00139400 | 0.043695 | 0.057603 | 0.068377 | 6.1132 |
P7_12 | 21.2821 | 0.96631 | 0.00221609 | 0.031474 | 0.075682 | 0.070070 | 4.7680 |
P7_13 | 20.7982 | 0.96809 | 0.00215490 | 0.029755 | 0.073962 | 0.070786 | 4.9288 |
P7_14 | 12.9203 | 0.94700 | 0.00070532 | 0.052297 | 0.045947 | 0.060563 | 6.7882 |
P7_15 | 12.8065 | 0.95999 | 0.00137547 | 0.038635 | 0.045542 | 0.047825 | 5.4081 |
P7_2 | 28.0185 | 0.97482 | 0.00377101 | 0.021411 | 0.099638 | 0.056673 | 2.9292 |
P7_3 | 28.5820 | 0.97545 | 0.00312844 | 0.021423 | 0.101642 | 0.061378 | 3.1099 |
The called cells were determined from the unpurged data in order to show the level of contamination by phantom molecules if data were not purged.
# The same per-sample summary columns, computed for called cells.
data_list$reads_dist_summary$marginal_called_cells
sample <chr> | m <dbl> | r_a <dbl> | r_b <dbl> | f <dbl> | prop_m <dbl> | prop_reads <dbl> | FRM <dbl> |
---|---|---|---|---|---|---|---|
P7_0 | 4.0834 | 0.98841 | 0.00124089 | 0.01034514 | 0.0176011 | 0.073506 | 26.0690 |
P7_1 | 2.1012 | 0.97400 | 0.00032171 | 0.02567378 | 0.0090573 | 0.081457 | 56.1401 |
P7_10 | 36.2339 | 0.99649 | 0.00335661 | 0.00015792 | 0.1561839 | 0.064398 | 2.5738 |
P7_11 | 13.8809 | 0.99767 | 0.00095239 | 0.00137361 | 0.0598328 | 0.068377 | 7.1337 |
P7_12 | 19.1294 | 0.99751 | 0.00157302 | 0.00091994 | 0.0824562 | 0.070070 | 5.3046 |
P7_13 | 18.4950 | 0.99718 | 0.00168116 | 0.00113798 | 0.0797215 | 0.070786 | 5.5426 |
P7_14 | 11.2416 | 0.99761 | 0.00056442 | 0.00182758 | 0.0484562 | 0.060563 | 7.8019 |
P7_15 | 11.3507 | 0.99766 | 0.00109905 | 0.00124353 | 0.0489266 | 0.047825 | 6.1017 |
P7_2 | 25.7848 | 0.99624 | 0.00310435 | 0.00065124 | 0.1111435 | 0.056673 | 3.1830 |
P7_3 | 26.2136 | 0.99653 | 0.00264160 | 0.00082491 | 0.1129921 | 0.061378 | 3.3908 |
# Per-sample tally of barcodes in each cell/background category.
data_list$called_cells_tally
barcode <chr> | P7_0 <dbl> | P7_1 <dbl> | P7_10 <dbl> | P7_11 <dbl> | P7_12 <dbl> | P7_13 <dbl> | P7_14 <dbl> | P7_15 <dbl> | P7_2 <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
consensus_background | 245915 | 231834 | 375416 | 259465 | 291811 | 305485 | 244136 | 242666 | 306264 | |
transition_cell | 146 | 95 | 19 | 1 | 16 | 10 | 13 | 10 | 13 | |
phantom_background | 61772 | 73677 | 41615 | 64228 | 59249 | 53451 | 64676 | 54118 | 54021 | |
transition_background | 198 | 223 | 11 | 32 | 27 | 26 | 28 | 8 | 21 | |
consensus_cell | 1134 | 547 | 2449 | 3474 | 3027 | 3383 | 2600 | 2755 | 4037 | |
phantom_cell | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# For every sample, pull out the "called_cells" table of per-cell counts.
map(data_list$umi_counts_cell, list("called_cells"))
$P7_0
# A tibble: 1,332 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 475 40 430 5
2 AGTGAGGTCTGCCCTA 850 432 417 1
3 CTAACTTAGTTGTAGA 433 34 399 0
4 CATGCCTTCAATACCG 785 392 391 2
5 CTCGAGGGTTTCGCTC 396 27 368 1
6 AGACGTTGTCCGAGTC 403 36 365 2
7 GAACATCTCCTTGGTC 369 32 336 1
8 CGGGTCATCCCAAGTA 369 39 329 1
9 GTCGTAAGTTAAGATG 367 42 325 0
10 AGCATACCATCATCCC 357 37 320 0
# … with 1,322 more rows
$P7_1
# A tibble: 770 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 555 51 500 4
2 AGACGTTGTCCGAGTC 518 42 471 5
3 CTAACTTAGTTGTAGA 492 48 443 1
4 CGGGTCATCCCAAGTA 486 53 432 1
5 GAATAAGCATATGGTC 464 48 416 0
6 TGTTCCGAGGCTAGAC 477 70 407 0
7 GTGGGTCCAAACTGTC 470 87 382 1
8 CATGCCTTCAATACCG 434 57 377 0
9 TAAGAGATCTTGGGTA 480 114 366 0
10 GAACATCTCAGAGACG 424 59 365 0
# … with 760 more rows
$P7_10
# A tibble: 2,460 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 293 39 254 0
2 CTAACTTAGTTGTAGA 255 21 234 0
3 GAACATCTCCTTGGTC 227 12 214 1
4 AGACGTTGTCCGAGTC 234 25 209 0
5 CGGGTCATCCCAAGTA 224 15 209 0
6 AGTGAGGTCTGCCCTA 218 19 196 3
7 CTCGAGGGTTTCGCTC 214 18 195 1
8 GGACATTTCGTAGGAG 207 17 189 1
9 GTGCTTCGTCCCTACT 205 38 163 4
10 ACATACGCAGCATGAG 12457 12311 103 43
# … with 2,450 more rows
$P7_11
# A tibble: 3,506 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGTGAGGTCTGCCCTA 394 55 338 1
2 AGGGATGGTCTAACGT 369 30 334 5
3 CGGGTCATCCCAAGTA 373 62 310 1
4 CTCGAGGGTTTCGCTC 323 20 302 1
5 AGACGTTGTCCGAGTC 304 27 274 3
6 GAACATCTCCTTGGTC 297 22 272 3
7 CATGCCTTCAATACCG 316 45 270 1
8 TGTTCCGAGGCTAGAC 284 23 258 3
9 GGACATTTCGTAGGAG 271 18 247 6
10 GAACATCTCAGAGACG 268 32 232 4
# … with 3,496 more rows
$P7_12
# A tibble: 3,054 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 368 40 322 6
2 CTAACTTAGTTGTAGA 339 26 313 0
3 AGTGAGGTCTGCCCTA 331 29 297 5
4 AGACGTTGTCCGAGTC 303 37 261 5
5 TGTTCCGAGGCTAGAC 273 21 251 1
6 GAACATCTCAGAGACG 315 60 250 5
7 CTCGAGGGTTTCGCTC 271 24 246 1
8 GAACATCTCCTTGGTC 275 32 242 1
9 GGCTGGTGTGTCGCTG 258 30 225 3
10 TAAACCGTCTCTGTCG 252 25 224 3
# … with 3,044 more rows
$P7_13
# A tibble: 3,409 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 381 42 336 3
2 CTAACTTAGTTGTAGA 312 22 286 4
3 AGTGAGGTCTGCCCTA 308 18 285 5
4 AGACGTTGTCCGAGTC 290 31 255 4
5 GAACATCTCCTTGGTC 276 22 251 3
6 CGGGTCATCCCAAGTA 271 19 246 6
7 CTCGAGGGTTTCGCTC 260 25 234 1
8 GAACATCTCAGAGACG 272 43 223 6
9 TGTTCCGAGGCTAGAC 257 36 218 3
10 GAATAAGCATATGGTC 249 22 216 11
# … with 3,399 more rows
$P7_14
# A tibble: 2,628 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 371 35 334 2
2 AGACGTTGTCCGAGTC 320 38 282 0
3 CTAACTTAGTTGTAGA 311 28 282 1
4 CGGGTCATCCCAAGTA 315 31 281 3
5 AGTGAGGTCTGCCCTA 301 20 279 2
6 GAACATCTCCTTGGTC 293 16 277 0
7 CTCGAGGGTTTCGCTC 268 17 249 2
8 GAATAAGCATATGGTC 273 43 230 0
9 GAACATCTCAGAGACG 261 37 224 0
10 TGAGAGGCAACACGCC 239 17 221 1
# … with 2,618 more rows
$P7_15
# A tibble: 2,763 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 322 28 289 5
2 CTAACTTAGTTGTAGA 242 17 221 4
3 CGGGTCATCCCAAGTA 234 23 209 2
4 AGTGAGGTCTGCCCTA 223 19 204 0
5 AGACGTTGTCCGAGTC 218 18 200 0
6 GAATAAGCATATGGTC 227 24 192 11
7 TGAGAGGCAACACGCC 204 14 190 0
8 GCGCCAAAGGCTATCT 226 80 146 0
9 CATATGGGTTGCGTTA 1696 1606 89 1
10 CTCGTCATCAGAGGTG 5133 5047 81 5
# … with 2,753 more rows
$P7_2
# A tibble: 4,058 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 347 35 309 3
2 AGTGAGGTCTGCCCTA 317 26 278 13
3 CGGGTCATCCCAAGTA 282 28 251 3
4 GAACATCTCCTTGGTC 268 19 247 2
5 CTAACTTAGTTGTAGA 266 24 241 1
6 CTCGAGGGTTTCGCTC 275 36 238 1
7 GAACATCTCAGAGACG 288 46 238 4
8 CATGCCTTCAATACCG 277 35 227 15
9 AGACGTTGTCCGAGTC 253 32 217 4
10 TGTTCCGAGGCTAGAC 232 19 211 2
# … with 4,048 more rows
$P7_3
# A tibble: 3,997 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 376 75 299 2
2 AGTGAGGTCTGCCCTA 310 30 276 4
3 CTAACTTAGTTGTAGA 307 37 268 2
4 AGACGTTGTCCGAGTC 290 23 265 2
5 GAACATCTCCTTGGTC 275 29 245 1
6 CTCGAGGGTTTCGCTC 258 22 235 1
7 TGTTCCGAGGCTAGAC 254 29 223 2
8 CGGGTCATCCCAAGTA 249 30 218 1
9 CATCCACAGATGGCGT 238 22 215 1
10 CATGCCTTCAATACCG 253 31 213 9
# … with 3,987 more rows
$P7_4
# A tibble: 996 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GCTGCTTGTCCGAAGA 764 158 606 0
2 GTGCTTCGTCCCTACT 570 86 483 1
3 AGGGATGGTCTAACGT 396 41 352 3
4 TGGGAAGCATTCGACA 358 18 340 0
5 CTAACTTAGTTGTAGA 324 25 297 2
6 AGTGAGGTCTGCCCTA 329 33 295 1
7 AGGCCGTGTGCGATAG 309 16 293 0
8 AGACGTTGTCCGAGTC 295 26 268 1
9 AGCGGTCTCTTAGCCC 285 20 265 0
10 CAGATCAAGACAGAGA 289 27 262 0
# … with 986 more rows
$P7_5
# A tibble: 1,979 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 272 27 243 2
2 CGGGTCATCCCAAGTA 251 26 224 1
3 CTAACTTAGTTGTAGA 241 22 219 0
4 AGTGAGGTCTGCCCTA 12662 12418 210 34
5 GAACATCTCCTTGGTC 224 21 203 0
6 CTCGAGGGTTTCGCTC 216 15 201 0
7 AGACGTTGTCCGAGTC 217 18 197 2
8 TAAACCGTCTCTGTCG 213 20 193 0
9 GAATAAGCATATGGTC 222 29 183 10
10 GGACATTTCGTAGGAG 257 73 182 2
# … with 1,969 more rows
$P7_6
# A tibble: 7,291 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGTGAGGTCTGCCCTA 266 16 248 2
2 AGGGATGGTCTAACGT 248 31 217 0
3 GAACATCTCCTTGGTC 238 22 216 0
4 CGGGTCATCCCAAGTA 210 19 191 0
5 CTCGAGGGTTTCGCTC 208 15 191 2
6 GTGCTTCGTCCCTACT 219 32 182 5
7 GAATAAGCATATGGTC 208 17 181 10
8 CTAACTTAGTTGTAGA 201 22 178 1
9 AAATGCCTCCCAAGTA 208 29 176 3
10 TAAGAGATCTTGGGTA 5376 5228 126 22
# … with 7,281 more rows
$P7_7
# A tibble: 742 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 301 30 269 2
2 AGTGAGGTCTGCCCTA 258 20 238 0
3 CTAACTTAGTTGTAGA 247 10 235 2
4 CTCGAGGGTTTCGCTC 234 17 216 1
5 GAACATCTCAGAGACG 246 33 213 0
6 TAAACCGTCTCTGTCG 228 14 211 3
7 GAACATCTCCTTGGTC 235 25 209 1
8 AGACGTTGTCCGAGTC 222 17 204 1
9 GGACATTTCGTAGGAG 219 28 191 0
10 TAAACCGCATGTAGTC 212 22 187 3
# … with 732 more rows
$P7_8
# A tibble: 1,199 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGTGAGGTCTGCCCTA 505 1 465 39
2 GAATAAGCATATGGTC 468 1 420 47
3 GAACATCTCCTTGGTC 442 2 408 32
4 CTCGAGGGTTTCGCTC 410 0 389 21
5 TGTTCCGAGGCTAGAC 421 0 377 44
6 GAACATCTCAGAGACG 415 2 363 50
7 GAACGGAGTTGCGCAC 406 5 361 40
8 CATGCCTTCAATACCG 405 1 356 48
9 GTGCTTCGTCCCTACT 425 1 344 80
10 AAATGCCTCCCAAGTA 391 2 341 48
# … with 1,189 more rows
$P7_9
# A tibble: 2,396 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 446 42 403 1
2 AGTGAGGTCTGCCCTA 409 31 376 2
3 CGGGTCATCCCAAGTA 411 34 374 3
4 CTAACTTAGTTGTAGA 385 27 356 2
5 CTCGAGGGTTTCGCTC 375 26 346 3
6 AGACGTTGTCCGAGTC 365 26 338 1
7 GAACATCTCCTTGGTC 359 29 330 0
8 GAATAAGCATATGGTC 337 34 303 0
9 GGCTGGTGTGTCGCTG 316 18 292 6
10 GAACATCTCAGAGACG 329 45 284 0
# … with 2,386 more rows
# For every sample, pull out the "background_cells" table of per-cell counts.
map(data_list$umi_counts_cell, list("background_cells"))
$P7_0
# A tibble: 307,833 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 449 30 413 6
2 GCTGCTTGTCCGAAGA 533 222 302 9
3 AAATGCCTCCCAAGTA 353 54 296 3
4 GGACATTTCGTAGGAG 315 34 277 4
5 TAAACCGCATGTAGTC 278 33 238 7
6 GCAAACTAGCTTATCG 253 13 237 3
7 ACACCAACATAAGACA 238 8 227 3
8 ACGCCGAGTCTACCTC 234 22 210 2
9 TTAACTCCAATGGTCT 228 18 209 1
10 AACTCAGTCTAACGGT 226 16 208 2
# … with 307,823 more rows
$P7_1
# A tibble: 305,606 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GCTGCTTGTCCGAAGA 546 145 400 1
2 AGAGTGGTCATGTCCC 199 10 188 1
3 GACCAATAGTACGACG 200 14 186 0
4 CTCGTCATCAGAGGTG 200 14 185 1
5 TGGCTGGAGAAGGTGA 196 12 184 0
6 AGGGATGAGTACTTGC 197 15 182 0
7 GCTTCCAGTTCCGGCA 195 11 182 2
8 CATGGCGGTGCGATAG 198 17 180 1
9 CCACCTAAGGCCCGTT 199 19 180 0
10 GGCAATTTCTTCTGGC 195 15 180 0
# … with 305,596 more rows
$P7_10
# A tibble: 417,050 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 225 17 205 3
2 GAATAAGCATATGGTC 227 34 188 5
3 GATCTAGTCGAGAACG 189 15 174 0
4 TGTTCCGAGGCTAGAC 195 19 171 5
5 GAACATCTCAGAGACG 197 31 166 0
6 GGCTGGTGTGTCGCTG 182 14 165 3
7 TAAGAGATCTTGGGTA 196 26 164 6
8 GCTGCTTGTCCGAAGA 243 73 163 7
9 AAATGCCTCCCAAGTA 183 20 162 1
10 GAACGGAGTTGCGCAC 190 19 162 9
# … with 417,040 more rows
$P7_11
# A tibble: 323,694 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CTAACTTAGTTGTAGA 332 26 304 2
2 CACACCTTCAACGCTA 323 23 298 2
3 GCTGCTTGTCCGAAGA 403 131 263 9
4 TAAACCGTCTCTGTCG 272 21 250 1
5 GAATAAGCATATGGTC 290 24 249 17
6 AAATGCCTCCCAAGTA 288 36 248 4
7 CATCCACAGATGGCGT 243 20 218 5
8 GATCTAGTCGAGAACG 236 19 217 0
9 GAACGGAGTTGCGCAC 263 39 213 11
10 GCAAACTTCGACAGCC 233 18 213 2
# … with 323,684 more rows
$P7_12
# A tibble: 351,076 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CGGGTCATCCCAAGTA 342 37 302 3
2 CACACCTTCAACGCTA 292 31 261 0
3 GCTGCTTGTCCGAAGA 383 110 259 14
4 GAATAAGCATATGGTC 300 34 255 11
5 GAACGGAGTTGCGCAC 276 32 232 12
6 ACTTACTCAGCTCGAC 225 16 206 3
7 TAAGAGATCTTGGGTA 239 32 202 5
8 GCAAACTTCGACAGCC 230 29 198 3
9 AAACGGGAGGATATAC 219 26 191 2
10 GGATGTTAGGGAACGG 204 11 191 2
# … with 351,066 more rows
$P7_13
# A tibble: 358,946 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 275 25 245 5
2 GCTGCTTGTCCGAAGA 311 91 213 7
3 CGGACACAGCGCTTAT 223 15 206 2
4 CATCCACAGATGGCGT 215 29 184 2
5 GAACGGAGTTGCGCAC 225 36 183 6
6 GCTGCGACAGTAAGCG 190 8 181 1
7 GGCTGGTGTGTCGCTG 198 16 178 4
8 ACATACGCAGCATGAG 198 24 174 0
9 ACTTTCAGTTTGACTG 194 18 174 2
10 CGACTTCAGACCTAGG 194 20 174 0
# … with 358,936 more rows
$P7_14
# A tibble: 308,825 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 306 21 284 1
2 GAACGGAGTTGCGCAC 257 25 232 0
3 GCTGCTTGTCCGAAGA 309 77 225 7
4 GATCTAGTCGAGAACG 240 32 208 0
5 CATCCACAGATGGCGT 223 21 199 3
6 GCAAACTTCGACAGCC 213 21 190 2
7 ACTTACTCAGCTCGAC 206 16 189 1
8 AAATGCCGTGAACCTT 197 13 184 0
9 CGACTTCAGACCTAGG 198 15 182 1
10 TGGGAAGTCAACCAAC 198 17 181 0
# … with 308,815 more rows
$P7_15
# A tibble: 296,794 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 TAAACCGTCTCTGTCG 198 11 187 0
2 GAACGGAGTTGCGCAC 212 24 180 8
3 TGTTCCGAGGCTAGAC 195 22 173 0
4 CACACCTTCAACGCTA 193 18 172 3
5 CTCGAGGGTTTCGCTC 188 18 170 0
6 GAACATCTCCTTGGTC 180 10 170 0
7 CATGCCTTCAATACCG 197 31 166 0
8 GGCTGGTGTGTCGCTG 182 16 166 0
9 GTGCTTCGTCCCTACT 201 33 165 3
10 ACTTACTCAGCTCGAC 177 13 162 2
# … with 296,784 more rows
$P7_2
# A tibble: 360,298 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 254 18 234 2
2 GATCTAGTCGAGAACG 219 16 201 2
3 GAATAAGCATATGGTC 238 32 196 10
4 ACTTACTCAGCTCGAC 228 35 192 1
5 GCTGCTTGTCCGAAGA 271 69 191 11
6 CGGACACAGCGCTTAT 208 25 181 2
7 GAACGGAGTTGCGCAC 208 20 181 7
8 GCTGCGACAGTAAGCG 197 17 180 0
9 AAACGGGAGGATATAC 189 11 176 2
10 CATCCACAGATGGCGT 197 25 170 2
# … with 360,288 more rows
$P7_3
# A tibble: 379,886 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 247 18 223 6
2 GCTGCTTGTCCGAAGA 306 84 215 7
3 GAATAAGCATATGGTC 263 43 209 11
4 TAAACCGTCTCTGTCG 231 20 205 6
5 AGATTGCAGAGCAATT 200 13 187 0
6 ACTTTCAGTTTGACTG 198 19 179 0
7 TCGAGGCTCCCTCTTT 194 15 178 1
8 GAACGGAGTTGCGCAC 214 29 176 9
9 CTCAGAACATATGCTG 194 19 170 5
10 CGGACACAGCGCTTAT 188 19 168 1
# … with 379,876 more rows
$P7_4
# A tibble: 314,515 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 320 16 304 0
2 CGTCTACTCTTGACGA 195 9 186 0
3 CCTACCACATCGGTTA 197 19 178 0
4 GACTACAGTTCAACCA 189 11 178 0
5 GGTGAAGAGGTGATAT 189 12 177 0
6 ACTTTCAGTTTGACTG 192 14 176 2
7 AACGTTGTCAACACAC 187 11 175 1
8 AAATGCCGTGAACCTT 199 25 174 0
9 TCGAGGCTCGCGCCAA 191 17 174 0
10 TGTTCCGCACTTAAGC 187 13 174 0
# … with 314,505 more rows
$P7_5
# A tibble: 363,946 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 287 40 244 3
2 GCTGCTTGTCCGAAGA 301 79 217 5
3 TGAGAGGCAACACGCC 198 15 180 3
4 AAATGCCTCCCAAGTA 195 22 169 4
5 TGTTCCGAGGCTAGAC 191 21 167 3
6 ACTTACTCAGCTCGAC 181 13 166 2
7 AGATTGCAGAGCAATT 167 8 159 0
8 CCGTACTGTCAGATAA 178 17 159 2
9 TAAGAGATCTTGGGTA 182 20 156 6
10 TAAACCGCATGTAGTC 174 17 154 3
# … with 363,936 more rows
$P7_6
# A tibble: 349,960 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 220 11 204 5
2 GCTGCTTGTCCGAAGA 268 78 187 3
3 AGACGTTGTCCGAGTC 195 15 180 0
4 TGTTCCGAGGCTAGAC 195 21 170 4
5 GGACATTTCGTAGGAG 187 14 168 5
6 TGAGAGGCAACACGCC 179 16 162 1
7 AAATGCCGTGAACCTT 167 11 155 1
8 TCTATTGCATAAAGGT 168 13 155 0
9 ACTTACTCAGCTCGAC 169 15 153 1
10 CATGCCTTCAATACCG 182 22 152 8
# … with 349,950 more rows
$P7_7
# A tibble: 273,890 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 230 10 219 1
2 CGGGTCATCCCAAGTA 235 25 210 0
3 CATCCACAGATGGCGT 196 12 184 0
4 GTGCTTCGTCCCTACT 235 48 184 3
5 TGTTCCGAGGCTAGAC 199 21 178 0
6 GCTGCTTGTCCGAAGA 253 75 173 5
7 GATCTAGTCGAGAACG 190 20 170 0
8 GCTGCGACAGTAAGCG 179 9 169 1
9 AAATGCCTCCCAAGTA 199 30 168 1
10 ACTTACTCAGCTCGAC 181 13 168 0
# … with 273,880 more rows
$P7_8
# A tibble: 270,526 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 470 0 432 38
2 GCTGCTTGTCCGAAGA 473 6 336 131
3 GCAAACTAGCTTATCG 299 0 284 15
4 GTAGGCCAGCCGATTT 309 0 270 39
5 ACACCAACATAAGACA 247 1 232 14
6 GCTTCCACACTTCGAA 251 0 223 28
7 TTAACTCCAATGGTCT 252 7 222 23
8 GCACTCTAGATACACA 243 0 218 25
9 ACGCCGAGTCTACCTC 233 0 215 18
10 CTTAACTCACTACAGT 224 0 210 14
# … with 270,516 more rows
$P7_9
# A tibble: 314,581 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 389 27 358 4
2 GAACGGAGTTGCGCAC 362 44 318 0
3 AAATGCCTCCCAAGTA 322 29 292 1
4 ACTTACTCAGCTCGAC 290 33 256 1
5 CATCCACAGATGGCGT 274 24 249 1
6 GTAGGCCAGCCGATTT 272 37 235 0
7 GATCTAGTCGAGAACG 254 19 233 2
8 CGGACACAGCGCTTAT 259 28 230 1
9 AAACGGGAGGATATAC 264 35 228 1
10 GGATGTTAGGGAACGG 247 27 219 1
# … with 314,571 more rows
# Report memory usage at the end of the run (gc() also triggers a garbage
# collection before printing the figures).
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 6356772 339.5 10309335 550.6 10309335 550.6
Vcells 5305151352 40475.1 19670404828 150073.3 30723203766 234399.5
# Record the R version, platform and package versions used for this run.
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] rhdf5_2.26.2 cowplot_0.9.4 data.table_1.12.0
[4] tictoc_1.0 furrr_0.1.0 future_1.11.1.1
[7] broom_0.5.1 matrixStats_0.54.0 forcats_0.4.0
[10] stringr_1.4.0 dplyr_0.8.0.1 purrr_0.3.1
[13] readr_1.3.1 tidyr_0.8.3 tibble_2.0.1
[16] ggplot2_3.1.0 tidyverse_1.2.1 rmarkdown_1.11
loaded via a namespace (and not attached):
[1] nlme_3.1-137 bitops_1.0-6
[3] lubridate_1.7.4 httr_1.4.0
[5] rprojroot_1.3-2 GenomeInfoDb_1.18.2
[7] tools_3.5.2 backports_1.1.3
[9] utf8_1.1.4 R6_2.4.0
[11] HDF5Array_1.10.1 lazyeval_0.2.1
[13] BiocGenerics_0.28.0 colorspace_1.4-0
[15] withr_2.1.2 tidyselect_0.2.5
[17] compiler_3.5.2 cli_1.0.1
[19] rvest_0.3.2 Biobase_2.42.0
[21] xml2_1.2.0 DelayedArray_0.8.0
[23] labeling_0.3 scales_1.0.0
[25] digest_0.6.18 XVector_0.22.0
[27] base64enc_0.1-3 pkgconfig_2.0.2
[29] htmltools_0.3.6 limma_3.38.3
[31] rlang_0.3.1 readxl_1.3.0
[33] rstudioapi_0.9.0 generics_0.0.2
[35] jsonlite_1.6 BiocParallel_1.16.6
[37] RCurl_1.95-4.12 magrittr_1.5
[39] GenomeInfoDbData_1.2.0 Matrix_1.2-15
[41] Rcpp_1.0.0 munsell_0.5.0
[43] S4Vectors_0.20.1 Rhdf5lib_1.4.2
[45] fansi_0.4.0 stringi_1.3.1
[47] yaml_2.2.0 edgeR_3.24.3
[49] MASS_7.3-51.1 SummarizedExperiment_1.12.0
[51] zlibbioc_1.28.0 plyr_1.8.4
[53] grid_3.5.2 parallel_3.5.2
[55] listenv_0.7.0 crayon_1.3.4
[57] lattice_0.20-38 haven_2.1.0
[59] hms_0.4.2 locfit_1.5-9.1
[61] knitr_1.21 pillar_1.3.1
[63] GenomicRanges_1.34.0 codetools_0.2-16
[65] stats4_3.5.2 glue_1.3.0
[67] evaluate_0.13 modelr_0.1.4
[69] cellranger_1.1.0 gtable_0.2.0
[71] assertthat_0.2.0 xfun_0.5
[73] DropletUtils_1.2.2 viridisLite_0.3.0
[75] SingleCellExperiment_1.4.1 IRanges_2.16.0
[77] globals_0.12.4