# Knit-level setup.
# `root.dir` is a knitr package option (opts_knit). If no RStudio project file
# can be located, find_rstudio_root_file() signals an error, so fall back to
# the current working directory instead of aborting the whole document.
knitr::opts_knit$set(
  root.dir = tryCatch(
    rprojroot::find_rstudio_root_file(),
    error = function(e) getwd()
  )
)
# `fig.width` is a chunk option: it only takes effect when set via opts_chunk.
knitr::opts_chunk$set(fig.width = 15)
# Number formatting (digits, scipen) is controlled by R session options, not by
# knitr. future.globals.maxSize = +Inf removes the size cap on globals exported
# to future workers.
options(
  digits = 5,
  scipen = 8,
  future.globals.maxSize = +Inf
)
# Name of the dataset to process: the first trailing command-line argument.
# When no argument is supplied this is NA.
dataset_name <- commandArgs(trailingOnly = TRUE)[1]
# Uncomment to set the dataset manually (e.g. for interactive runs):
#dataset_name <- "hiseq4000"
message(sprintf("Dataset name: %s", dataset_name))
Dataset name: novaseq_l2
# Locate the project root directory.
# find_rstudio_root_file() signals an error (it does not return NULL) when no
# RStudio project file is found, so the lookup is wrapped in tryCatch() to make
# the working-directory fallback below reachable.
project_dir <- tryCatch(
  rprojroot::find_rstudio_root_file(),
  error = function(e) NULL
)
if (is.null(project_dir)) {
  project_dir <- getwd()
  warning(sprintf("No rstudio project root file found.
Setting project directory to current workflow.Rmd file location: %s.
Override if needed.",
                  project_dir))
}
message(sprintf("Project directory: %s", project_dir))
Project directory: /project/6007998/rfarouni/index_hopping
# DropletUtils is deliberately not attached here; the sessionInfo() output at
# the end of this document lists it as loaded via a namespace only.
#library(DropletUtils)
# Attach the packages used by the workflow. The attach order is left as-is
# because it decides which package wins when exported names collide.
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
library(rhdf5)
# Select the backend used to resolve futures.
# NOTE(review): `multiprocess` is deprecated in newer releases of the future
# package - confirm against the installed version before upgrading.
plan(multiprocess)
# Load the project's helper scripts from <project_dir>/code, in this order.
code_dir <- file.path(project_dir, "code")
helper_scripts <- c(
  "analysis_functions.R",
  "io_functions.R",
  "workflow_functions.R",
  "plotting_functions.R"
)
for (script in helper_scripts) {
  source(file.path(code_dir, script))
}
Estimate the sample index hopping probability, infer the true sample of origin, and find the optimal posterior probability threshold for retaining predicted real molecules.
# Optional manual cap on the false positive rate. Leaving it NULL means no
# user-set threshold is passed on (setting one manually is not recommended).
max_fpr <- NULL
# Run the whole workflow for the chosen dataset; the returned list holds the
# tables and summaries inspected in the rest of this document.
data_list <- run_workflow(dataset_name, project_dir, max_fpr = max_fpr)
Step 1: reading read counts from existing file: 528.547 sec elapsed
Step 2: creating outcome counts datatable with grouping vars: 70.215 sec elapsed
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.16 sec elapsed
Step 4: computing read counts distribution statistics: 0.249 sec elapsed
Step 5: estimating pi_r matrix: 0.037 sec elapsed
Step 6: infering the true sample of origin: 13.681 sec elapsed
Step 7: estimating g and computing classification metrics: 0.101 sec elapsed
Step 8: determining the optimal cutoff: 0.059 sec elapsed
Step 9: computing proportion of nonmissingness and updating summary data list: 0.01 sec elapsed
Step 10.1: reassigning reads to sample of origin: 3.087 sec elapsed
Step 10.2: deduplicating read counts: 0.02 sec elapsed
Step 10.3: reassigning hopped reads and deduplicating read counts: 3.128 sec elapsed
Step 10.4: labelling phantom molecules below cutoff: 0.001 sec elapsed
Step 10.5: adding cell-umi-gene labels: 77.069 sec elapsed
Step 10.6: tallying molecule counts by cell-barcode and gene ID: 44.17 sec elapsed
Step 10.7: tranforming cell-gene molecule tally table into long format: 421.683 sec elapsed
Step 10: purging phantoms at q cutoff of 0.957998. Max-FPR threshold user-set FALSE: 899.891 sec elapsed
Step 11: calling cells: 2068.459 sec elapsed
Step 12: saving purged data: 56.202 sec elapsed
Step 13: tallying molecules by cell-barcode: 368.277 sec elapsed
Step 14: saving results: 19.775 sec elapsed
Running workflow: 4025.732 sec elapsed
# Print the read-counts table: one row per cell / gene / umi combination with
# one read-count column per sample (P7_*), as shown in the output below.
data_list$read_counts
cell <chr> | gene <chr> | umi <int> | P7_0 <dbl> | P7_1 <dbl> | P7_10 <dbl> | P7_11 <dbl> | P7_12 <dbl> | P7_13 <dbl> | |
---|---|---|---|---|---|---|---|---|---|
AAACCTGAGAAACCTA | Spp2 | 539457 | 1 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCTA | Car3 | 405996 | 1 | 50 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCTA | Fabp1 | 484820 | 1 | 37 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCTA | Ubqln1 | 790638 | 3 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Fn1 | 283552 | 17 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Ndufa10 | 658493 | 4 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Ndufa10 | 143313 | 17 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Prdx6 | 786649 | 15 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Apoa2 | 115888 | 11 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACGAG | Apoa2 | 524725 | 7 | 0 | 0 | 0 | 0 | 0 |
# Print the overall read-distribution summary (columns in the output below:
# n_obs, p_chimeras, g, u, n_reads).
data_list$reads_dist_summary$summary_stats
n_obs <dbl> | p_chimeras <dbl> | g <dbl> | u <dbl> | n_reads <int> |
---|---|---|---|---|
270731056 | 0.040026 | 0.0021131 | 0.041873 | 1452691297 |
# Print the read-distribution summary broken down by r (columns: r, n_obs,
# m_bar, followed by one column per sample).
data_list$reads_dist_summary$conditional
r <int> | n_obs <dbl> | m_bar <dbl> | P7_0 <dbl> | P7_1 <dbl> | P7_10 <dbl> | P7_11 <dbl> | P7_12 <dbl> | |
---|---|---|---|---|---|---|---|---|
1 | 68446441 | 0.2528207957051 | 0.04349294 | 0.0253631 | 0.20274423 | 0.03621291 | 0.05860270 | |
2 | 42208353 | 0.1559051023685 | 0.02863829 | 0.0131132 | 0.22510009 | 0.03280784 | 0.05978075 | |
3 | 31905217 | 0.1178483823444 | 0.02590146 | 0.0109687 | 0.20288584 | 0.03825035 | 0.06966525 | |
4 | 24594880 | 0.0908461717078 | 0.02613882 | 0.0113563 | 0.16204781 | 0.04815907 | 0.08473356 | |
5 | 19128961 | 0.0706566925961 | 0.02790589 | 0.0122600 | 0.11572487 | 0.06188727 | 0.10162541 | |
6 | 15088271 | 0.0557315855186 | 0.03117935 | 0.0131829 | 0.07452893 | 0.07790234 | 0.11648744 | |
7 | 12071359 | 0.0445880098809 | 0.03600818 | 0.0141314 | 0.04374590 | 0.09477211 | 0.12606345 | |
8 | 9742958 | 0.0359875891002 | 0.04291366 | 0.0150616 | 0.02421799 | 0.11047542 | 0.12877492 | |
9 | 7890048 | 0.0291434906530 | 0.05224339 | 0.0161492 | 0.01280399 | 0.12411413 | 0.12461732 | |
10 | 6372799 | 0.0235392241073 | 0.06427518 | 0.0177997 | 0.00670848 | 0.13404233 | 0.11543811 |
# Print the estimated pi_r matrix (workflow step 5): one row per r and one
# column per sample.
data_list$pi_r_hat
r <int> | P7_0 <dbl> | P7_1 <dbl> | P7_10 <dbl> | P7_11 <dbl> | P7_12 <dbl> | P7_13 <dbl> | |
---|---|---|---|---|---|---|---|
1 | 0.04332285092 | 0.02503074204 | 0.20399924359 | 0.03597767573 | 0.05856782031 | 0.05671544440 | |
2 | 0.02833526809 | 0.01267122324 | 0.22655515927 | 0.03254213430 | 0.05975641609 | 0.05595406545 | |
3 | 0.02557394700 | 0.01050755554 | 0.20414211780 | 0.03803334388 | 0.06972937274 | 0.06513595955 | |
4 | 0.02581343665 | 0.01089867459 | 0.16293864279 | 0.04803073438 | 0.08493252340 | 0.07982153402 | |
5 | 0.02759631100 | 0.01181041057 | 0.11620117104 | 0.06188178818 | 0.10197553930 | 0.09694580682 | |
6 | 0.03089906991 | 0.01274161716 | 0.07463657235 | 0.07804017528 | 0.11697055694 | 0.11308911399 | |
7 | 0.03577111471 | 0.01369857945 | 0.04357807095 | 0.09506090398 | 0.12663226307 | 0.12478329708 | |
8 | 0.04273838771 | 0.01463704212 | 0.02387541474 | 0.11090474074 | 0.12936799772 | 0.13013712642 | |
9 | 0.05215161033 | 0.01573442210 | 0.01235927385 | 0.12466550079 | 0.12517319743 | 0.12898975986 | |
10 | 0.06429106523 | 0.01739965403 | 0.00620921541 | 0.13468254159 | 0.11591184537 | 0.12236465068 |
# Build the molecule-distribution figure (passing x_lim = 120), then place the
# plot panel and its legend side by side with relative widths 1 : 0.1.
p_read <- plot_molecules_distributions(data_list, dataset_name, x_lim = 120)
plot_grid(p_read$p, p_read$legend, ncol = 2, rel_widths = c(1, 0.1))
# Print the per-outcome table (columns in the output below: outcome, n, q, qs,
# j, o, FPR, FNR, r).
data_list$outcome_counts
outcome <chr> | n <int> | q <dbl> | qs <dbl> | j <dbl> | o <dbl> | FPR <dbl> | FNR <dbl> | r <int> | |
---|---|---|---|---|---|---|---|---|---|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6 | 317305 | 1 | 0 | 0.0011745 | 0.0011720 | 0 | 0.99883 | 6 | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0 | 97616 | 1 | 0 | 0.0015358 | 0.0015326 | 0 | 0.99846 | 6 | |
0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0 | 811783 | 1 | 0 | 0.0045407 | 0.0045311 | 0 | 0.99546 | 6 | |
0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0 | 644309 | 1 | 0 | 0.0069256 | 0.0069110 | 0 | 0.99307 | 6 | |
0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0 | 1426227 | 1 | 0 | 0.0122048 | 0.0121790 | 0 | 0.98780 | 6 | |
0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0 | 264101 | 1 | 0 | 0.0131824 | 0.0131545 | 0 | 0.98682 | 6 | |
0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0 | 1579786 | 1 | 0 | 0.0190300 | 0.0189898 | 0 | 0.98097 | 6 | |
0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0 | 1387968 | 1 | 0 | 0.0241676 | 0.0241165 | 0 | 0.97583 | 6 | |
0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0 | 972700 | 1 | 0 | 0.0277681 | 0.0277094 | 0 | 0.97223 | 6 | |
0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0 | 729496 | 1 | 0 | 0.0304683 | 0.0304039 | 0 | 0.96953 | 6 |
# Print the chimera counts table produced during the hopping-rate fit
# (workflow step 3): one row per r, numbered count columns.
data_list$fit_out$chimera_counts
r <dbl> | 1 <dbl> | 2 <dbl> | 3 <dbl> | 4 <dbl> | 5 <dbl> | 6 <dbl> | 7 <dbl> | 8 <dbl> | 9 <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 68446441 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
2 | 41501484 | 706869 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
3 | 31114714 | 783939 | 6564 | 0 | 0 | 0 | 0 | 0 | 0 | |
4 | 23782890 | 802312 | 9610 | 68 | 0 | 0 | 0 | 0 | 0 | |
5 | 18342907 | 773882 | 12066 | 104 | 2 | 0 | 0 | 0 | 0 | |
6 | 14345945 | 727822 | 14351 | 148 | 3 | 2 | 0 | 0 | 0 | |
7 | 11381007 | 674092 | 16064 | 193 | 3 | 0 | 0 | 0 | 0 | |
8 | 9107995 | 617422 | 17258 | 276 | 5 | 0 | 2 | 0 | 0 | |
9 | 7313742 | 558018 | 17954 | 328 | 3 | 1 | 2 | 0 | 0 | |
10 | 5858706 | 495894 | 17838 | 352 | 8 | 1 | 0 | 0 | 0 |
# Print the fitted estimates (columns in the output below: max_r, phat,
# phat_low, phat_high, SIHR, SBIHR).
data_list$fit_out$glm_estimates
max_r <int> | phat <dbl> | phat_low <dbl> | phat_high <dbl> | SIHR <dbl> | SBIHR <dbl> |
---|---|---|---|---|---|
25 | 0.99168 | 0.99168 | 0.99169 | 0.0083151 | 0.0087308 |
# Build the model-fit figure, then place the plot panel and its legend side by
# side with relative widths 1 : 0.2.
p_fit <- plot_fit(data_list, dataset_name)
plot_grid(p_fit$p, p_fit$legend, ncol = 2, rel_widths = c(1, 0.2))
# Print the cutoff table: the optimal cutoff row plus the "above", "below" and
# "none" rows shown in the output below.
data_list$optimal_cutoff
cutoff <chr> | outcome <chr> | q <dbl> | s <int> | FPR <dbl> | j <dbl> | qs <dbl> | o <dbl> | |
---|---|---|---|---|---|---|---|---|
optimal | 0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0 | 0.95800 | 9 | 0.044800 | 0.94954 | 146.23 | 0.99421 | |
above | 0,0,0,0,0,0,0,11,0,0,11,0,0,0,0,0 | 0.96048 | 11 | 0.044768 | 0.94954 | 145.97 | 0.99417 | |
below | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0 | 0.95737 | 15 | 0.047947 | 0.94951 | 146.30 | 0.99746 | |
none | 0,0,0,1,0,0,0,0,1,1,0,1,0,1,0,1 | 0.23670 | 10 | 0.061391 | 0.93802 | 158.83 | 1.00000 |
# Build the posterior-probability figure, then place the plot panel and its
# legend side by side with relative widths 1 : 0.1.
p_post <- plot_posterior_prob(data_list, dataset_name)
plot_grid(p_post$p, p_post$legend, ncol = 2, rel_widths = c(1, 0.1))
Note that q is the marginal posterior probability of the predicted sample of origin s, and qs is a transformation of q.
First we examine the extent of the effects of index hopping on individual samples and then on cell-barcodes.
Here r_a is the number of molecules above the optimal cutoff q, which we predict as real molecules. r_b is the number of molecules below or equal to the cutoff, which we thus predict as phantoms. f is the number of molecules predicted as phantoms no matter what the threshold is. m is the total number of molecules.
# Print the per-sample summary over all molecules (columns in the output
# below: sample, m, r_a, r_b, f, prop_m, prop_reads, FRM).
data_list$reads_dist_summary$marginal
sample <chr> | m <dbl> | r_a <dbl> | r_b <dbl> | f <dbl> | prop_m <dbl> | prop_reads <dbl> | FRM <dbl> |
---|---|---|---|---|---|---|---|
P7_0 | 13.5885 | 0.93087 | 0.00119153 | 0.067935 | 0.048175 | 0.073508 | 7.8585 |
P7_1 | 8.1965 | 0.86451 | 0.00035430 | 0.135139 | 0.029059 | 0.081434 | 14.4328 |
P7_10 | 38.7803 | 0.98246 | 0.00406304 | 0.013478 | 0.137486 | 0.064406 | 2.4126 |
P7_11 | 16.2499 | 0.95427 | 0.00142764 | 0.044304 | 0.057610 | 0.068353 | 6.1105 |
P7_12 | 21.3420 | 0.96574 | 0.00222571 | 0.032035 | 0.075663 | 0.070071 | 4.7696 |
P7_13 | 20.8525 | 0.96755 | 0.00219437 | 0.030254 | 0.073927 | 0.070759 | 4.9294 |
P7_14 | 12.9619 | 0.94630 | 0.00069905 | 0.052997 | 0.045953 | 0.060566 | 6.7879 |
P7_15 | 12.8412 | 0.95946 | 0.00136840 | 0.039170 | 0.045525 | 0.047778 | 5.4050 |
P7_2 | 28.1076 | 0.97477 | 0.00345996 | 0.021770 | 0.099648 | 0.056708 | 2.9309 |
P7_3 | 28.6691 | 0.97509 | 0.00316424 | 0.021742 | 0.101639 | 0.061417 | 3.1121 |
The called cells were determined from the unpurged data in order to show the level of contamination by phantom molecules if data were not purged.
# Print the same per-sample summary restricted to called cells (same columns
# as the marginal table).
data_list$reads_dist_summary$marginal_called_cells
sample <chr> | m <dbl> | r_a <dbl> | r_b <dbl> | f <dbl> | prop_m <dbl> | prop_reads <dbl> | FRM <dbl> |
---|---|---|---|---|---|---|---|
P7_0 | 3.3586 | 0.98546 | 0.00134609 | 0.01319086 | 0.014443 | 0.073508 | 31.7944 |
P7_1 | 2.3952 | 0.97729 | 0.00033401 | 0.02237796 | 0.010300 | 0.081434 | 49.3906 |
P7_10 | 36.3192 | 0.99645 | 0.00338702 | 0.00016597 | 0.156181 | 0.064406 | 2.5761 |
P7_11 | 13.9123 | 0.99750 | 0.00097044 | 0.00152642 | 0.059826 | 0.068353 | 7.1373 |
P7_12 | 19.1693 | 0.99741 | 0.00157741 | 0.00101375 | 0.082433 | 0.070071 | 5.3101 |
P7_13 | 18.5318 | 0.99712 | 0.00171840 | 0.00116055 | 0.079691 | 0.070759 | 5.5467 |
P7_14 | 11.2653 | 0.99752 | 0.00055729 | 0.00192219 | 0.048443 | 0.060566 | 7.8102 |
P7_15 | 11.3739 | 0.99765 | 0.00109232 | 0.00126210 | 0.048911 | 0.047778 | 6.1023 |
P7_2 | 25.8541 | 0.99655 | 0.00279511 | 0.00065804 | 0.111179 | 0.056708 | 3.1863 |
P7_3 | 26.2849 | 0.99649 | 0.00267705 | 0.00083306 | 0.113032 | 0.061417 | 3.3944 |
# Print the tally of barcode categories (e.g. consensus_cell,
# phantom_background) with one count column per sample.
data_list$called_cells_tally
barcode <chr> | P7_0 <dbl> | P7_1 <dbl> | P7_10 <dbl> | P7_11 <dbl> | P7_12 <dbl> | P7_13 <dbl> | P7_14 <dbl> | P7_15 <dbl> | P7_2 <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
consensus_background | 246421 | 232309 | 376553 | 260999 | 293205 | 306709 | 245290 | 244201 | 307338 | |
transition_cell | 304 | 36 | 23 | 0 | 16 | 7 | 15 | 7 | 11 | |
phantom_background | 62321 | 73684 | 42025 | 64495 | 59620 | 54120 | 65386 | 54121 | 53777 | |
transition_background | 216 | 224 | 12 | 39 | 33 | 27 | 33 | 8 | 22 | |
consensus_cell | 959 | 612 | 2451 | 3476 | 3029 | 3389 | 2597 | 2756 | 4036 | |
phantom_cell | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# For every sample in umi_counts_cell, extract its "called_cells" element and
# print the resulting per-sample list.
map(data_list$umi_counts_cell, list("called_cells"))
$P7_0
# A tibble: 1,175 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 487 49 433 5
2 AGTGAGGTCTGCCCTA 867 433 430 4
3 GAACATCTCCTTGGTC 445 34 409 2
4 CTAACTTAGTTGTAGA 420 27 391 2
5 CGGGTCATCCCAAGTA 425 33 388 4
6 CATGCCTTCAATACCG 782 409 373 0
7 AGACGTTGTCCGAGTC 405 35 370 0
8 CTCGAGGGTTTCGCTC 378 27 351 0
9 GAATAAGCATATGGTC 395 48 337 10
10 GCTGCTTGTCCGAAGA 555 220 317 18
# … with 1,165 more rows
$P7_1
# A tibble: 836 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 595 51 540 4
2 AGACGTTGTCCGAGTC 482 36 445 1
3 CGGGTCATCCCAAGTA 493 45 444 4
4 CTAACTTAGTTGTAGA 499 53 444 2
5 GAATAAGCATATGGTC 472 55 417 0
6 TGTTCCGAGGCTAGAC 483 67 416 0
7 GTGGGTCCAAACTGTC 464 79 385 0
8 GAACGGAGTTGCGCAC 425 46 379 0
9 TAAACCGTCTCTGTCG 413 36 374 3
10 GTGCTTCGTCCCTACT 446 87 358 1
# … with 826 more rows
$P7_10
# A tibble: 2,463 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 286 21 265 0
2 AGTGAGGTCTGCCCTA 279 24 253 2
3 CTAACTTAGTTGTAGA 272 20 251 1
4 CGGGTCATCCCAAGTA 263 35 227 1
5 AGACGTTGTCCGAGTC 251 29 222 0
6 GAACATCTCCTTGGTC 234 13 221 0
7 CTCGAGGGTTTCGCTC 203 16 183 4
8 CATGCCTTCAATACCG 222 33 179 10
9 TGTTCCGAGGCTAGAC 201 19 179 3
10 GTGCTTCGTCCCTACT 225 47 173 5
# … with 2,453 more rows
$P7_11
# A tibble: 3,515 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 387 36 347 4
2 GAACATCTCCTTGGTC 351 23 327 1
3 CGGGTCATCCCAAGTA 382 51 325 6
4 AGTGAGGTCTGCCCTA 368 57 310 1
5 AGACGTTGTCCGAGTC 309 25 284 0
6 TAAACCGTCTCTGTCG 296 19 272 5
7 AAATGCCTCCCAAGTA 294 25 262 7
8 TGTTCCGAGGCTAGAC 296 31 260 5
9 CTCGAGGGTTTCGCTC 294 31 259 4
10 TAAGAGATCTTGGGTA 279 26 241 12
# … with 3,505 more rows
$P7_12
# A tibble: 3,062 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 384 39 339 6
2 CTAACTTAGTTGTAGA 336 28 303 5
3 AGTGAGGTCTGCCCTA 318 31 280 7
4 AGACGTTGTCCGAGTC 306 27 278 1
5 GAACATCTCCTTGGTC 313 40 273 0
6 CTCGAGGGTTTCGCTC 281 31 248 2
7 GAATAAGCATATGGTC 295 30 247 18
8 AAATGCCTCCCAAGTA 264 31 230 3
9 GGACATTTCGTAGGAG 253 23 226 4
10 TGTTCCGAGGCTAGAC 249 21 226 2
# … with 3,052 more rows
$P7_13
# A tibble: 3,416 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CGGGTCATCCCAAGTA 343 45 296 2
2 AGTGAGGTCTGCCCTA 325 31 292 2
3 CTAACTTAGTTGTAGA 317 33 283 1
4 GAACATCTCCTTGGTC 305 22 282 1
5 AGGGATGGTCTAACGT 307 33 271 3
6 AGACGTTGTCCGAGTC 293 31 261 1
7 GAATAAGCATATGGTC 287 36 241 10
8 CTCGAGGGTTTCGCTC 264 23 240 1
9 TAAACCGTCTCTGTCG 235 18 215 2
10 GGACATTTCGTAGGAG 249 32 213 4
# … with 3,406 more rows
$P7_14
# A tibble: 2,630 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 380 36 341 3
2 CTAACTTAGTTGTAGA 307 17 288 2
3 AGACGTTGTCCGAGTC 319 35 283 1
4 AGTGAGGTCTGCCCTA 307 32 271 4
5 GAACATCTCCTTGGTC 284 33 251 0
6 CTCGAGGGTTTCGCTC 265 22 242 1
7 TAAACCGTCTCTGTCG 260 18 241 1
8 GAACATCTCAGAGACG 276 38 238 0
9 GTGCTTCGTCCCTACT 299 61 234 4
10 GATCTAGTCGAGAACG 247 21 225 1
# … with 2,620 more rows
$P7_15
# A tibble: 2,764 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 290 26 261 3
2 CTAACTTAGTTGTAGA 248 18 227 3
3 CGGGTCATCCCAAGTA 242 24 217 1
4 AGACGTTGTCCGAGTC 229 17 211 1
5 AGTGAGGTCTGCCCTA 226 22 204 0
6 GAACATCTCCTTGGTC 220 19 201 0
7 CTCGAGGGTTTCGCTC 206 16 189 1
8 GTGCTTCGTCCCTACT 211 43 167 1
9 CTCGTCATCAGAGGTG 5116 5030 78 8
10 CATATGGGTTGCGTTA 1665 1597 66 2
# … with 2,754 more rows
$P7_2
# A tibble: 4,058 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 341 44 297 0
2 AGTGAGGTCTGCCCTA 308 29 275 4
3 CTAACTTAGTTGTAGA 293 26 267 0
4 GAACATCTCCTTGGTC 280 20 259 1
5 CGGGTCATCCCAAGTA 287 35 251 1
6 CTCGAGGGTTTCGCTC 261 29 232 0
7 AGACGTTGTCCGAGTC 262 41 221 0
8 TAAACCGTCTCTGTCG 239 26 213 0
9 TGTTCCGAGGCTAGAC 241 30 208 3
10 AAATGCCTCCCAAGTA 232 28 204 0
# … with 4,048 more rows
$P7_3
# A tibble: 4,007 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 373 82 290 1
2 CTAACTTAGTTGTAGA 311 41 267 3
3 AGTGAGGTCTGCCCTA 292 31 258 3
4 AGACGTTGTCCGAGTC 273 24 247 2
5 TGTTCCGAGGCTAGAC 278 34 244 0
6 CGGGTCATCCCAAGTA 281 37 239 5
7 CTCGAGGGTTTCGCTC 266 25 238 3
8 GAACATCTCCTTGGTC 269 28 237 4
9 AAATGCCTCCCAAGTA 252 27 220 5
10 TAAACCGTCTCTGTCG 246 26 218 2
# … with 3,997 more rows
$P7_4
# A tibble: 999 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GCTGCTTGTCCGAAGA 727 147 578 2
2 GTGCTTCGTCCCTACT 583 75 506 2
3 AGGGATGGTCTAACGT 402 32 368 2
4 TGGGAAGCATTCGACA 343 17 326 0
5 AGGCCGTGTGCGATAG 339 18 321 0
6 CGGGTCATCCCAAGTA 326 29 294 3
7 CTAACTTAGTTGTAGA 304 17 287 0
8 AGTGAGGTCTGCCCTA 302 25 277 0
9 AGACGTTGTCCGAGTC 306 28 276 2
10 CAGATCAAGACAGAGA 298 29 269 0
# … with 989 more rows
$P7_5
# A tibble: 2,038 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 307 34 272 1
2 AGACGTTGTCCGAGTC 276 29 247 0
3 AGTGAGGTCTGCCCTA 12731 12472 219 40
4 CTAACTTAGTTGTAGA 242 26 215 1
5 CGGGTCATCCCAAGTA 234 27 207 0
6 GAACATCTCCTTGGTC 224 23 201 0
7 CTCGAGGGTTTCGCTC 219 19 200 0
8 AAATGCCTCCCAAGTA 217 18 196 3
9 CATGCCTTCAATACCG 221 29 192 0
10 TGTTCCGAGGCTAGAC 210 19 190 1
# … with 2,028 more rows
$P7_6
# A tibble: 7,280 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGTGAGGTCTGCCCTA 260 23 234 3
2 AGGGATGGTCTAACGT 243 27 215 1
3 CTAACTTAGTTGTAGA 234 19 214 1
4 GAACATCTCCTTGGTC 227 18 208 1
5 CTCGAGGGTTTCGCTC 217 18 198 1
6 CGGGTCATCCCAAGTA 220 24 195 1
7 AAATGCCTCCCAAGTA 201 21 177 3
8 GTGCTTCGTCCCTACT 212 48 157 7
9 TAAGAGATCTTGGGTA 5407 5267 116 24
10 CTAACTTCATCGATGT 3752 3619 103 30
# … with 7,270 more rows
$P7_7
# A tibble: 744 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 285 29 256 0
2 GAACATCTCCTTGGTC 235 9 225 1
3 AGTGAGGTCTGCCCTA 237 20 216 1
4 CTCGAGGGTTTCGCTC 241 23 216 2
5 CTAACTTAGTTGTAGA 232 22 208 2
6 AGACGTTGTCCGAGTC 222 19 202 1
7 TGTTCCGAGGCTAGAC 208 15 193 0
8 GGACATTTCGTAGGAG 213 18 189 6
9 CGGGTCATCCCAAGTA 211 23 186 2
10 GAACATCTCAGAGACG 201 27 174 0
# … with 734 more rows
$P7_8
# A tibble: 1,209 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGTGAGGTCTGCCCTA 495 1 457 37
2 GAACATCTCCTTGGTC 425 3 396 26
3 GAACATCTCAGAGACG 443 2 393 48
4 GAATAAGCATATGGTC 421 1 381 39
5 CTCGAGGGTTTCGCTC 408 0 379 29
6 TGTTCCGAGGCTAGAC 400 1 372 27
7 TAAGAGATCTTGGGTA 403 0 364 39
8 GAACGGAGTTGCGCAC 398 1 360 37
9 CATGCCTTCAATACCG 398 0 355 43
10 GGCTGGTGTGTCGCTG 365 1 338 26
# … with 1,199 more rows
$P7_9
# A tibble: 2,400 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 446 39 406 1
2 CTAACTTAGTTGTAGA 420 37 381 2
3 AGTGAGGTCTGCCCTA 411 31 378 2
4 AGACGTTGTCCGAGTC 375 24 348 3
5 CGGGTCATCCCAAGTA 380 35 343 2
6 GAACATCTCCTTGGTC 365 25 337 3
7 GAATAAGCATATGGTC 372 38 334 0
8 CTCGAGGGTTTCGCTC 346 27 318 1
9 CATGCCTTCAATACCG 340 36 304 0
10 GGCTGGTGTGTCGCTG 322 17 303 2
# … with 2,390 more rows
# For every sample in umi_counts_cell, extract its "background_cells" element
# and print the resulting per-sample list.
map(data_list$umi_counts_cell, list("background_cells"))
$P7_0
# A tibble: 309,046 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 429 21 403 5
2 AAATGCCTCCCAAGTA 370 45 315 10
3 GCAAACTAGCTTATCG 252 20 228 4
4 AACTCAGTCTAACGGT 249 24 222 3
5 TTAACTCCAATGGTCT 234 15 216 3
6 CTTAACTCACTACAGT 220 18 201 1
7 ACGCCGAGTCTACCTC 216 23 191 2
8 ACACCAACATAAGACA 208 18 189 1
9 ACATACGGTAATCGTC 210 20 189 1
10 GTATTCTTCACCATAG 197 8 187 2
# … with 309,036 more rows
$P7_1
# A tibble: 306,029 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GCTGCTTGTCCGAAGA 580 162 418 0
2 GAACATCTCAGAGACG 439 63 376 0
3 CAGATCAAGACAGAGA 261 60 201 0
4 CCTAGCTGTTCTGGTA 199 11 186 2
5 TTAGTTCGTTCAGCGC 198 11 186 1
6 GTTTCTACAGTAAGAT 194 7 185 2
7 CTCTAATAGGACATTA 197 14 183 0
8 TCTTTCCCAGACAGGT 195 10 183 2
9 ATCATCTAGTTACGGG 193 11 182 0
10 TAGACCACAAACGCGA 197 15 182 0
# … with 306,019 more rows
$P7_10
# A tibble: 418,601 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GAATAAGCATATGGTC 266 27 230 9
2 CACACCTTCAACGCTA 224 15 207 2
3 GCTGCTTGTCCGAAGA 260 60 186 14
4 GAACGGAGTTGCGCAC 215 27 178 10
5 CGGACACAGCGCTTAT 186 14 172 0
6 TCAGGTACATAACCTG 186 14 171 1
7 TGAGAGGCAACACGCC 180 15 165 0
8 TAAACCGTCTCTGTCG 182 19 163 0
9 AAATGCCTCCCAAGTA 190 25 159 6
10 ACTTACTCAGCTCGAC 176 17 159 0
# … with 418,591 more rows
$P7_11
# A tibble: 325,494 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CTAACTTAGTTGTAGA 345 23 321 1
2 CACACCTTCAACGCTA 317 18 297 2
3 GCTGCTTGTCCGAAGA 374 90 273 11
4 ACTTACTCAGCTCGAC 279 17 262 0
5 GAACGGAGTTGCGCAC 304 37 258 9
6 GAATAAGCATATGGTC 294 44 242 8
7 CGGACACAGCGCTTAT 257 26 230 1
8 CATCCACAGATGGCGT 247 29 215 3
9 AAACGGGAGGATATAC 229 16 210 3
10 GTGCTTCGTCCCTACT 276 60 208 8
# … with 325,484 more rows
$P7_12
# A tibble: 352,841 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 314 30 278 6
2 CGGGTCATCCCAAGTA 298 31 265 2
3 GCTGCTTGTCCGAAGA 347 98 238 11
4 GAACGGAGTTGCGCAC 264 30 223 11
5 CATCCACAGATGGCGT 242 20 221 1
6 CGGACACAGCGCTTAT 222 10 211 1
7 GGATGTTAGGGAACGG 223 20 200 3
8 AAACGGGAGGATATAC 215 16 198 1
9 GCGGGTTTCAGGATCT 199 15 183 1
10 GTATTCTTCACCATAG 190 7 183 0
# … with 352,831 more rows
$P7_13
# A tibble: 360,836 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 251 20 226 5
2 GAACGGAGTTGCGCAC 261 37 209 15
3 CATCCACAGATGGCGT 243 31 208 4
4 GCTGCTTGTCCGAAGA 302 90 205 7
5 ACTTACTCAGCTCGAC 222 27 194 1
6 AAACGGGAGGATATAC 205 13 191 1
7 GGATGTTAGGGAACGG 216 25 188 3
8 GCTGCGACAGTAAGCG 200 19 181 0
9 CCTACCAAGGTAAACT 195 14 178 3
10 CCTACCACATCGGTTA 207 29 177 1
# … with 360,826 more rows
$P7_14
# A tibble: 310,691 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 314 17 295 2
2 CGGGTCATCCCAAGTA 292 26 262 4
3 GAATAAGCATATGGTC 281 35 245 1
4 GCTGCTTGTCCGAAGA 331 82 237 12
5 GAACGGAGTTGCGCAC 252 36 216 0
6 CGGACACAGCGCTTAT 209 18 191 0
7 AAACGGGAGGATATAC 216 24 190 2
8 GCTGCGACAGTAAGCG 200 17 183 0
9 CGACTTCAGACCTAGG 199 16 181 2
10 GTAGGCCAGCCGATTT 203 22 181 0
# … with 310,681 more rows
$P7_15
# A tibble: 298,329 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 197 17 178 2
2 GCTGCGACAGTAAGCG 179 8 171 0
3 TAAACCGTCTCTGTCG 189 18 170 1
4 GAACATCTCAGAGACG 197 25 169 3
5 GATCTAGTCGAGAACG 188 19 167 2
6 ACTTACTCAGCTCGAC 179 13 166 0
7 TCTATTGCATAAAGGT 175 9 166 0
8 TGAGAGGCAACACGCC 178 12 166 0
9 GAATAAGCATATGGTC 193 23 165 5
10 GCAAACTTCGACAGCC 180 15 164 1
# … with 298,319 more rows
$P7_2
# A tibble: 361,126 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 270 23 239 8
2 GCTGCTTGTCCGAAGA 309 75 228 6
3 GAATAAGCATATGGTC 251 31 206 14
4 ACTTACTCAGCTCGAC 246 41 205 0
5 CATCCACAGATGGCGT 237 37 200 0
6 GAACGGAGTTGCGCAC 232 26 197 9
7 GCTGCGACAGTAAGCG 195 16 179 0
8 CGACTTCAGACCTAGG 193 18 174 1
9 CGGACACAGCGCTTAT 196 22 172 2
10 GCGCCAAAGGCTATCT 190 18 172 0
# … with 361,116 more rows
$P7_3
# A tibble: 381,118 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GCTGCTTGTCCGAAGA 325 80 233 12
2 CACACCTTCAACGCTA 255 19 231 5
3 GAATAAGCATATGGTC 264 33 221 10
4 GAACGGAGTTGCGCAC 247 34 205 8
5 CATCCACAGATGGCGT 219 21 196 2
6 CGGACACAGCGCTTAT 217 22 195 0
7 TCAGGTACATAACCTG 195 16 175 4
8 GCTGCGACAGTAAGCG 188 17 171 0
9 CGACTTCAGACCTAGG 193 22 170 1
10 GATCTAGTCGAGAACG 210 37 170 3
# … with 381,108 more rows
$P7_4
# A tibble: 315,858 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 303 18 285 0
2 CGTCCATGTGTTCGAT 199 10 189 0
3 AGATTGCAGAGCAATT 200 16 184 0
4 CGATGTACATATGCTG 199 14 184 1
5 AAATGCCGTGAACCTT 199 15 182 2
6 AACGTTGTCAACACAC 195 14 181 0
7 GCTGCGACAGTAAGCG 194 12 180 2
8 TAAGCGTTCAGAGACG 194 14 180 0
9 TCGAGGCTCCCTCTTT 193 10 180 3
10 ACACCAACATAAGACA 198 20 178 0
# … with 315,848 more rows
$P7_5
# A tibble: 364,540 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 288 48 239 1
2 TAAACCGTCTCTGTCG 190 16 174 0
3 GAACATCTCAGAGACG 199 31 166 2
4 ACTTACTCAGCTCGAC 178 13 165 0
5 GGCTGGTGTGTCGCTG 178 15 161 2
6 TAAACCGCATGTAGTC 181 21 158 2
7 GTGGGTCCAAACTGTC 190 34 156 0
8 GCTGCGACAGTAAGCG 166 11 155 0
9 CCGTACTGTCAGATAA 175 19 154 2
10 CATCCACAGATGGCGT 164 10 151 3
# … with 364,530 more rows
$P7_6
# A tibble: 351,694 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GCTGCTTGTCCGAAGA 327 85 231 11
2 CACACCTTCAACGCTA 224 22 196 6
3 TAAACCGTCTCTGTCG 188 15 173 0
4 TGTTCCGAGGCTAGAC 200 23 173 4
5 GGACATTTCGTAGGAG 180 9 168 3
6 ACTTACTCAGCTCGAC 183 17 165 1
7 AGACGTTGTCCGAGTC 185 21 164 0
8 GGCTGGTGTGTCGCTG 172 8 163 1
9 CGGACACAGCGCTTAT 199 42 157 0
10 CGACTTCAGACCTAGG 169 14 154 1
# … with 351,684 more rows
$P7_7
# A tibble: 275,154 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 238 16 215 7
2 GCTGCTTGTCCGAAGA 272 65 200 7
3 ACTTACTCAGCTCGAC 206 14 191 1
4 GTGCTTCGTCCCTACT 245 50 187 8
5 TGAGAGGCAACACGCC 194 6 186 2
6 CATCCACAGATGGCGT 197 14 182 1
7 TAAACCGTCTCTGTCG 194 13 180 1
8 TCTATTGCATAAAGGT 195 15 179 1
9 CATGCCTTCAATACCG 199 29 170 0
10 TAAACCGCATGTAGTC 197 26 167 4
# … with 275,144 more rows
$P7_8
# A tibble: 272,056 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 478 0 445 33
2 AAATGCCTCCCAAGTA 418 3 370 45
3 GCTGCTTGTCCGAAGA 474 7 322 145
4 ACGCCGAGTCTACCTC 253 0 234 19
5 ACACCAACATAAGACA 251 0 229 22
6 GCAAACTAGCTTATCG 247 0 224 23
7 TTAACTCCAATGGTCT 249 7 220 22
8 GATCGTATCGGCGCAT 249 1 206 42
9 ACGCCGAAGATCCGAG 203 0 191 12
10 ATTCTACCAGGACGTA 199 0 190 9
# … with 272,046 more rows
$P7_9
# A tibble: 316,062 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 397 32 362 3
2 ACTTACTCAGCTCGAC 321 26 295 0
3 GTAGGCCAGCCGATTT 284 33 251 0
4 CATCCACAGATGGCGT 271 22 248 1
5 AAATGCCTCCCAAGTA 275 29 244 2
6 CGGACACAGCGCTTAT 274 28 244 2
7 CGGAGTCCATGAGCGA 248 12 236 0
8 AAACGGGAGGATATAC 269 43 225 1
9 CGATGTACATATGCTG 245 22 223 0
10 GCAAACTTCGACAGCC 243 18 223 2
# … with 316,052 more rows
# Memory usage: run garbage collection and print R's memory report
# (Ncells / Vcells: used, gc trigger, max used).
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 6357878 339.6 10297860 550 10297860 550
Vcells 5318635663 40578.0 19708764628 150366 30776494722 234806
# Record the R version, platform, locale, and attached/loaded package versions
# used to produce this document.
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] rhdf5_2.26.2 cowplot_0.9.4 data.table_1.12.0
[4] tictoc_1.0 furrr_0.1.0 future_1.11.1.1
[7] broom_0.5.1 matrixStats_0.54.0 forcats_0.4.0
[10] stringr_1.4.0 dplyr_0.8.0.1 purrr_0.3.1
[13] readr_1.3.1 tidyr_0.8.3 tibble_2.0.1
[16] ggplot2_3.1.0 tidyverse_1.2.1 rmarkdown_1.11
loaded via a namespace (and not attached):
[1] nlme_3.1-137 bitops_1.0-6
[3] lubridate_1.7.4 httr_1.4.0
[5] rprojroot_1.3-2 GenomeInfoDb_1.18.2
[7] tools_3.5.2 backports_1.1.3
[9] utf8_1.1.4 R6_2.4.0
[11] HDF5Array_1.10.1 lazyeval_0.2.1
[13] BiocGenerics_0.28.0 colorspace_1.4-0
[15] withr_2.1.2 tidyselect_0.2.5
[17] compiler_3.5.2 cli_1.0.1
[19] rvest_0.3.2 Biobase_2.42.0
[21] xml2_1.2.0 DelayedArray_0.8.0
[23] labeling_0.3 scales_1.0.0
[25] digest_0.6.18 XVector_0.22.0
[27] base64enc_0.1-3 pkgconfig_2.0.2
[29] htmltools_0.3.6 limma_3.38.3
[31] rlang_0.3.1 readxl_1.3.0
[33] rstudioapi_0.9.0 generics_0.0.2
[35] jsonlite_1.6 BiocParallel_1.16.6
[37] RCurl_1.95-4.12 magrittr_1.5
[39] GenomeInfoDbData_1.2.0 Matrix_1.2-15
[41] Rcpp_1.0.0 munsell_0.5.0
[43] S4Vectors_0.20.1 Rhdf5lib_1.4.2
[45] fansi_0.4.0 stringi_1.3.1
[47] yaml_2.2.0 edgeR_3.24.3
[49] MASS_7.3-51.1 SummarizedExperiment_1.12.0
[51] zlibbioc_1.28.0 plyr_1.8.4
[53] grid_3.5.2 parallel_3.5.2
[55] listenv_0.7.0 crayon_1.3.4
[57] lattice_0.20-38 haven_2.1.0
[59] hms_0.4.2 locfit_1.5-9.1
[61] knitr_1.21 pillar_1.3.1
[63] GenomicRanges_1.34.0 codetools_0.2-16
[65] stats4_3.5.2 glue_1.3.0
[67] evaluate_0.13 modelr_0.1.4
[69] cellranger_1.1.0 gtable_0.2.0
[71] assertthat_0.2.0 xfun_0.5
[73] DropletUtils_1.2.2 viridisLite_0.3.0
[75] SingleCellExperiment_1.4.1 IRanges_2.16.0
[77] globals_0.12.4