knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
fig.width=15,
digit=5,
scipen=8)
options(digits=5,
scipen=8,
future.globals.maxSize = +Inf)
dataset_name <- commandArgs(trailingOnly=T)[1]
#dataset_name <- "hiseq4000"
message(sprintf("Dataset name: %s", dataset_name))
Dataset name: hiseq2500
project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
project_dir <- getwd()
warning(sprintf("No rstudio project root file found.
Setting project directory to current workflow.Rmd file location: %s.
Override if needed.",
project_dir))
}
message(sprintf("Project directory: %s",
project_dir))
Project directory: /project/6007998/rfarouni/index_hopping
#library(DropletUtils)
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
library(rhdf5)
plan(multiprocess)
code_dir <- file.path(project_dir, "code")
source(file.path(code_dir, "analysis_functions.R"))
source(file.path(code_dir, "io_functions.R"))
source(file.path(code_dir, "workflow_functions.R"))
source(file.path(code_dir, "plotting_functions.R"))
Estimate the sample index hopping probability, infer the true sample of origin, and find the optimal posterior probability threshold for retaining predicted real molecules.
max_fpr <- NULL # manually set the maximum false positive rate (not recommended)
data_list <- run_workflow(dataset_name,
project_dir,
max_fpr=max_fpr)
Step 1: reading read counts from existing file: 63.974 sec elapsed
Step 2: creating outcome counts datatable with grouping vars: 8.678 sec elapsed
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.238 sec elapsed
Step 4: computing read counts distribution statistics: 0.268 sec elapsed
Step 5: estimating pi_r matrix: 0.031 sec elapsed
Step 6: infering the true sample of origin: 1.694 sec elapsed
Step 7: estimating g and computing classification metrics: 0.067 sec elapsed
Step 8: determining the optimal cutoff: 0.447 sec elapsed
Step 9: computing proportion of nonmissingness and updating summary data list: 0.011 sec elapsed
Step 10.1: reassigning reads to sample of origin: 0.131 sec elapsed
Step 10.2: deduplicating read counts: 0.003 sec elapsed
Step 10.3: reassigning hopped reads and deduplicating read counts: 0.152 sec elapsed
Step 10.4: labelling phantom molecules below cutoff: 0 sec elapsed
Step 10.5: adding cell-umi-gene labels: 8.351 sec elapsed
Step 10.6: tallying molecule counts by cell-barcode and gene ID: 3.872 sec elapsed
Step 10.7: tranforming cell-gene molecule tally table into long format: 24.626 sec elapsed
Step 10: purging phantoms at q cutoff of 0.999661. Max-FPR threshold user-set FALSE: 57.508 sec elapsed
Warning in smooth.spline(x[new.keep], y[new.keep], df = df, ...): not using
invalid df; must have 1 < df <= n := #{unique x} = 5
Using defaultDrops cell calling instead.
EmptyDrops function call failed for purged sample B1
Warning in smooth.spline(x[new.keep], y[new.keep], df = df, ...): not using
invalid df; must have 1 < df <= n := #{unique x} = 7
Warning in smooth.spline(x[new.keep], y[new.keep], df = df, ...): not using
invalid df; must have 1 < df <= n := #{unique x} = 7
Step 11: calling cells: 713.25 sec elapsed
Step 12: saving purged data: 9.695 sec elapsed
Step 13: tallying molecules by cell-barcode: 21.122 sec elapsed
Step 14: saving results: 2.827 sec elapsed
Running workflow: 879.922 sec elapsed
data_list$read_counts
cell <chr> | gene <chr> | umi <int> | A1 <dbl> | A2 <dbl> | B1 <dbl> | B2 <dbl> | C1 <dbl> | C2 <dbl> | D1 <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
AAACCTGAGAAACCAT | Csn3 | 468967 | 18 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Sdc4 | 1012503 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Txn1 | 621442 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Mrpl54 | 648997 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Higd1a | 325897 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Rps21 | 476545 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Mlec | 776993 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Nap1l1 | 283231 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Csn1s2a | 5045 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Rps28 | 174081 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
data_list$reads_dist_summary$summary_stats
n_obs <dbl> | p_chimeras <dbl> | g <dbl> | u <dbl> | n_reads <int> |
---|---|---|---|---|
44062616 | 0.00059245 | 0.000028358 | 0.00059568 | 268768037 |
data_list$reads_dist_summary$conditional
r <int> | n_obs <dbl> | m_bar <dbl> | A1 <dbl> | A2 <dbl> | B1 <dbl> | B2 <dbl> | |
---|---|---|---|---|---|---|---|
1 | 11242464 | 0.255147447442 | 0.0813012165 | 0.1380801397 | 0.00198773 | 0.002577638 | |
2 | 7556220 | 0.171488229387 | 0.0835708198 | 0.1497099079 | 0.00122151 | 0.001662273 | |
3 | 5707051 | 0.129521383842 | 0.0963883098 | 0.1726053730 | 0.00114391 | 0.001429343 | |
4 | 4344844 | 0.098606129060 | 0.1151406702 | 0.2038855710 | 0.00110930 | 0.001212529 | |
5 | 3342437 | 0.075856526539 | 0.1395874926 | 0.2378879841 | 0.00097725 | 0.000912628 | |
6 | 2604471 | 0.059108406092 | 0.1690894875 | 0.2651573646 | 0.00084726 | 0.000795491 | |
7 | 2048065 | 0.046480785435 | 0.2028776988 | 0.2792064705 | 0.00070840 | 0.000676039 | |
8 | 1603535 | 0.036392187881 | 0.2391953559 | 0.2793755671 | 0.00067905 | 0.000659325 | |
9 | 1243973 | 0.028231937023 | 0.2784262824 | 0.2663382208 | 0.00060371 | 0.000691423 | |
10 | 946847 | 0.021488669670 | 0.3173679591 | 0.2448392401 | 0.00063907 | 0.000892013 |
data_list$pi_r_hat
r <int> | A1 <dbl> | A2 <dbl> | B1 <dbl> | B2 <dbl> | C1 <dbl> | C2 <dbl> | |
---|---|---|---|---|---|---|---|
1 | 0.08129566552 | 0.13808180128 | 0.00197210539 | 0.00256208660 | 0.17659079615 | 0.49747785494 | |
2 | 0.08356555710 | 0.14971304674 | 0.00120578677 | 0.00164660547 | 0.19234195544 | 0.47740910403 | |
3 | 0.09638467532 | 0.17261142028 | 0.00112817329 | 0.00141364594 | 0.21600554012 | 0.40910812626 | |
4 | 0.11513941776 | 0.20389559179 | 0.00109356564 | 0.00119680455 | 0.24214250160 | 0.31471704060 | |
5 | 0.13958934563 | 0.23790232417 | 0.00096149655 | 0.00089686483 | 0.25825437272 | 0.21398194662 | |
6 | 0.16909508817 | 0.26517516870 | 0.00083148999 | 0.00077971346 | 0.25537199706 | 0.12923893908 | |
7 | 0.20288759157 | 0.27922605919 | 0.00069261522 | 0.00066024607 | 0.23355952902 | 0.07055383344 | |
8 | 0.23920986200 | 0.27939517728 | 0.00066325424 | 0.00064352969 | 0.19978686577 | 0.03576485734 | |
9 | 0.27844577197 | 0.26635617486 | 0.00058790892 | 0.00067563186 | 0.16223288388 | 0.01708848712 | |
10 | 0.31739239544 | 0.24485446319 | 0.00062327096 | 0.00087624787 | 0.12568670818 | 0.00801484418 |
p_read <- plot_molecules_distributions(data_list, dataset_name, x_lim=120)
plot_grid(p_read$p,
p_read$legend,
ncol=2,
rel_widths=c(1, 0.1))
data_list$outcome_counts
outcome <chr> | n <int> | q <dbl> | qs <dbl> | j <dbl> | o <dbl> | FPR <dbl> | FNR <dbl> | r <int> | |
---|---|---|---|---|---|---|---|---|---|
0,0,0,0,0,0,0,4 | 51258 | 1.00000 | 0.0000 | 0.0011633 | 0.0011633 | 0.0000e+00 | 0.9988366679 | 4 | |
0,0,0,0,0,0,4,0 | 477628 | 1.00000 | 0.0000 | 0.0120034 | 0.0120031 | 0.0000e+00 | 0.9879966047 | 4 | |
0,0,0,0,0,4,0,0 | 1366901 | 1.00000 | 0.0000 | 0.0430261 | 0.0430248 | 0.0000e+00 | 0.9569739399 | 4 | |
0,0,0,0,4,0,0,0 | 1051708 | 1.00000 | 0.0000 | 0.0668952 | 0.0668933 | 0.0000e+00 | 0.9331047755 | 4 | |
0,0,0,4,0,0,0,0 | 5215 | 1.00000 | 0.0000 | 0.0670136 | 0.0670117 | 0.0000e+00 | 0.9329864179 | 4 | |
0,0,4,0,0,0,0,0 | 4759 | 1.00000 | 0.0000 | 0.0671216 | 0.0671197 | 0.0000e+00 | 0.9328784094 | 4 | |
0,4,0,0,0,0,0,0 | 885608 | 1.00000 | 0.0000 | 0.0872210 | 0.0872185 | 0.0000e+00 | 0.9127789874 | 4 | |
4,0,0,0,0,0,0,0 | 500025 | 1.00000 | 0.0000 | 0.0985694 | 0.0985666 | 0.0000e+00 | 0.9014306103 | 4 | |
0,0,0,0,0,0,0,5 | 46901 | 1.00000 | 0.0000 | 0.0996338 | 0.0996310 | 0.0000e+00 | 0.9003661631 | 5 | |
0,0,0,0,0,0,5,0 | 448674 | 1.00000 | 0.0000 | 0.1098168 | 0.1098137 | 0.0000e+00 | 0.8901832288 | 5 |
data_list$fit_out$chimera_counts
r <dbl> | 1 <dbl> | 2 <dbl> | 3 <dbl> | 4 <dbl> | 5 <dbl> | 6 <dbl> | 7 <dbl> | 8 <dbl> | n_obs <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 11242464 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11242464 | |
2 | 7554778 | 1442 | 0 | 0 | 0 | 0 | 0 | 0 | 7556220 | |
3 | 5705362 | 1689 | 0 | 0 | 0 | 0 | 0 | 0 | 5707051 | |
4 | 4343102 | 1742 | 0 | 0 | 0 | 0 | 0 | 0 | 4344844 | |
5 | 3340666 | 1771 | 0 | 0 | 0 | 0 | 0 | 0 | 3342437 | |
6 | 2602778 | 1692 | 1 | 0 | 0 | 0 | 0 | 0 | 2604471 | |
7 | 2046469 | 1595 | 1 | 0 | 0 | 0 | 0 | 0 | 2048065 | |
8 | 1602022 | 1511 | 1 | 1 | 0 | 0 | 0 | 0 | 1603535 | |
9 | 1242591 | 1380 | 2 | 0 | 0 | 0 | 0 | 0 | 1243973 | |
10 | 945648 | 1199 | 0 | 0 | 0 | 0 | 0 | 0 | 946847 |
data_list$fit_out$glm_estimates
max_r <int> | phat <dbl> | phat_low <dbl> | phat_high <dbl> | SIHR <dbl> | SBIHR <dbl> |
---|---|---|---|---|---|
25 | 0.99989 | 0.99989 | 0.99989 | 0.00011114 | 0.00012304 |
p_fit <- plot_fit(data_list, dataset_name)
plot_grid(p_fit$p,
p_fit$legend,
ncol=2,
rel_widths=c(1, 0.2))
data_list$optimal_cutoff
cutoff <chr> | outcome <chr> | q <dbl> | s <int> | FPR <dbl> | j <dbl> | qs <dbl> | o <dbl> | FP <dbl> | TP <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
optimal | 0,0,0,0,1,0,0,2 | 0.99966 | 8 | 0.026103 | 0.96850 | 125.30 | 0.99459 | 0.000016289 | 0.99457 | |
above | 0,4,0,0,0,5,0,0 | 0.99975 | 6 | 0.026103 | 0.96850 | 123.93 | 0.99459 | 0.000016289 | 0.99457 | |
below | 0,0,0,0,0,1,0,2 | 0.99936 | 8 | 0.026103 | 0.96850 | 128.07 | 0.99459 | 0.000016289 | 0.99457 | |
none | 1,0,0,0,0,0,1,0 | 0.50174 | 7 | 0.056466 | 0.94353 | 156.97 | 1.00000 | 0.000035237 | 0.99996 |
p_post <-plot_posterior_prob(data_list, dataset_name)
plot_grid(p_post$p,
p_post$legend,
ncol=2,
rel_widths=c(1, 0.1))
Note that q is the marginal posterior distribution of predicted sample of origin s and qs is a tranformation of q.
First we examing the extent of the effects of index hopping on individual samples and then on cell-barcodes.
Here r_a is the number of molecules above the optimal cutoff q, which we predict as real molecules. r_a are the number of molecules below or equal to cutoff and thus we predict as phantoms. f is the number of molecules predicted as phantom no matter what the threshold is. m is the number of total molecules.
data_list$reads_dist_summary$marginal
sample <chr> | m <dbl> | r_a <dbl> | r_b <dbl> | f <dbl> | prop_m <dbl> | prop_reads <dbl> | FRM <dbl> |
---|---|---|---|---|---|---|---|
A1 | 5.93931 | 0.99862 | 0.000007745 | 0.00136884 | 0.1347123 | 0.13436 | 6.0801 |
A2 | 8.01873 | 0.99967 | 0.000018582 | 0.00030878 | 0.1818765 | 0.13477 | 4.5171 |
B1 | 0.14831 | 0.82879 | 0.150691120 | 0.02051783 | 0.0033639 | 0.12624 | 228.7785 |
B2 | 0.19973 | 0.84136 | 0.145124546 | 0.01351859 | 0.0045301 | 0.13215 | 177.8310 |
C1 | 8.51224 | 0.99965 | 0.000029135 | 0.00031778 | 0.1930701 | 0.11791 | 3.7228 |
C2 | 14.19085 | 0.99978 | 0.000050314 | 0.00016933 | 0.3218692 | 0.12152 | 2.3015 |
D1 | 5.45395 | 0.99954 | 0.000051889 | 0.00041089 | 0.1237036 | 0.11167 | 5.5033 |
D2 | 1.62575 | 0.88430 | 0.114134259 | 0.00156789 | 0.0368744 | 0.12138 | 20.0664 |
The called cells were determined from the unpurged data in order to show the level of contamination by phantom molecules if data were not purged.
data_list$reads_dist_summary$marginal_called_cells
sample <chr> | m <dbl> | r_a <dbl> | r_b <dbl> | f <dbl> | prop_m <dbl> | prop_reads <dbl> | FRM <dbl> |
---|---|---|---|---|---|---|---|
A1 | 5.40026 | 0.99998 | 0.0000072219 | 0.00000907364 | 0.1340264 | 0.13436 | 6.6870 |
A2 | 7.35779 | 0.99998 | 0.0000159015 | 0.00000489277 | 0.1826095 | 0.13477 | 4.9229 |
B1 | 0.11064 | 0.87848 | 0.1167257149 | 0.00479013774 | 0.0027460 | 0.12624 | 306.6604 |
B2 | 0.14207 | 0.90241 | 0.0975814058 | 0.00000703898 | 0.0035259 | 0.13215 | 250.0056 |
C1 | 7.86623 | 0.99997 | 0.0000240267 | 0.00000381377 | 0.1952283 | 0.11791 | 4.0285 |
C2 | 12.98796 | 0.99995 | 0.0000456577 | 0.00000046197 | 0.3223420 | 0.12152 | 2.5147 |
D1 | 4.94853 | 0.99994 | 0.0000511263 | 0.00000727489 | 0.1228152 | 0.11167 | 6.0654 |
D2 | 1.47901 | 0.90728 | 0.0927190289 | 0.00000540904 | 0.0367067 | 0.12138 | 22.0573 |
data_list$called_cells_tally
barcode <chr> | A1 <dbl> | A2 <dbl> | B1 <dbl> | B2 <dbl> | C1 <dbl> | C2 <dbl> | D1 <dbl> | D2 <dbl> |
---|---|---|---|---|---|---|---|---|
consensus_background | 123769 | 138948 | 20300 | 22004 | 145282 | 195617 | 98968 | 41695 |
transition_cell | 0 | 0 | 75 | 0 | 4 | 0 | 0 | 2 |
phantom_background | 1332 | 770 | 7021 | 9103 | 808 | 668 | 778 | 20632 |
transition_background | 0 | 1 | 11 | 2 | 0 | 2 | 2 | 2 |
consensus_cell | 410 | 565 | 184 | 14 | 736 | 1316 | 685 | 158 |
phantom_cell | 0 | 0 | 25 | 0 | 0 | 0 | 0 | 0 |
data_list$umi_counts_cell %>%
map(list("called_cells"))
$A1
# A tibble: 410 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AGGTCCGGTCCGAGTC 37482 37479 3 0
2 ACGTCAAGTTGCCTCT 31541 31538 2 1
3 CACCACTTCGGTTCGG 18666 18664 2 0
4 CGTAGGCCAACACCTA 65340 65338 2 0
5 CTGCTGTTCCGGGTGT 17297 17294 2 1
6 TCTCTAAAGATCACGG 21938 21936 2 0
7 AACCGCGTCATGCTCC 47423 47422 1 0
8 ACATACGCATTCGACA 3517 3516 1 0
9 ACGAGGACATCACCCT 15208 15207 1 0
10 ACGGCCACAAGGACAC 49067 49064 1 2
# … with 400 more rows
$A2
# A tibble: 566 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 ACGAGCCCACCGTTGG 43895 43892 3 0
2 CGTGTAATCTAACTCT 22943 22941 2 0
3 CTTAGGAAGAAACGCC 19123 19121 2 0
4 GCTGGGTAGTAGGTGC 55651 55648 2 1
5 AAAGATGAGGCCCTCA 68719 68718 1 0
6 AACTCCCCATTCGACA 15035 15034 1 0
7 ACAGCTAAGGCCCGTT 29916 29914 1 1
8 ACGGGCTTCGGTCCGA 34499 34498 1 0
9 ACTTACTCAACACCCG 29456 29455 1 0
10 CAAGATCCAACACGCC 44113 44112 1 0
# … with 556 more rows
$B1
# A tibble: 220 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CTCACACGTCTCTCGT 103 0 94 9
2 TCGTACCGTCCATGAT 74 0 63 11
3 TGCACCTTCTGGCGAC 74 0 63 11
4 GTGCAGCCAAGCGAGT 43 0 38 5
5 CTAGCCTTCCGCGCAA 41 0 37 4
6 AGCAGCCGTTAAGGGC 28 0 26 2
7 TGCACCTCATGCCACG 28 0 25 3
8 TCATTACCAGGCAGTA 19 0 16 3
9 GATCGTAGTCACCCAG 18 0 14 4
10 GTGCTTCAGGCATTGG 15 0 14 1
# … with 210 more rows
$B2
# A tibble: 16 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 TCGTACCGTCCATGAT 26473 24103 1 2369
2 ACGCCAGAGTACGTAA 417 359 0 58
3 AGCAGCCGTTAAGGGC 8853 7985 0 868
4 CAAGGCCGTCCGAGTC 232 196 0 36
5 CGTGAGCCAAGGACTG 1478 1351 0 127
6 CTAGCCTTCCGCGCAA 11808 10648 0 1160
7 CTCACACGTCTCTCGT 36274 32695 0 3579
8 CTTGGCTCAAAGTGCG 1034 942 0 92
9 GACAGAGTCAATACCG 211 188 0 23
10 GATCGTAGTCACCCAG 5904 5294 0 610
11 GTGCAGCCAAGCGAGT 12113 10974 0 1139
12 GTGGGTCCAGGCTGAA 2190 1983 0 207
13 TCATTACCAGGCAGTA 5893 5233 0 660
14 TGCACCTCATGCCACG 5479 4928 0 551
15 TGCACCTTCTGGCGAC 22310 20099 0 2211
16 TGGCTGGCATCCGTGG 1397 1224 0 173
$C1
# A tibble: 736 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 ACTATCTTCCTGTACC 33437 33433 2 2
2 CCCTCCTGTGATGTCT 10899 10897 2 0
3 GGTGCGTCAGTTAACC 38377 38374 2 1
4 ACCTTTACAAAGTGCG 19839 19837 1 1
5 ACGGCCAGTACTCAAC 26169 26168 1 0
6 AGCGTATGTTCTCATT 15803 15802 1 0
7 AGGGATGTCGCCGTGA 9354 9353 1 0
8 AGGTCATGTAAGCACG 24932 24930 1 1
9 ATCTGCCTCTTGCATT 4779 4777 1 1
10 ATTATCCGTTAAGATG 30075 30072 1 2
# … with 726 more rows
$C2
# A tibble: 1,318 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CTTAACTGTGAGTATA 6233 6229 3 1
2 ATAACGCCACGGTTTA 1584 1583 1 0
3 GGAACTTTCAAAGTAG 1634 1633 1 0
4 TACCTATCATCACAAC 6081 6080 1 0
5 AAACCTGCAGTGGGAT 8200 8200 0 0
6 AAACCTGGTCGAAAGC 7482 7480 0 2
7 AAACGGGCAAGTTAAG 9265 9265 0 0
8 AAACGGGGTTATCACG 13268 13268 0 0
9 AAACGGGGTTGTCTTT 13437 13437 0 0
10 AAAGATGAGAGGTTGC 795 795 0 0
# … with 1,308 more rows
$D1
# A tibble: 687 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AAAGATGAGGCCGAAT 13329 13326 1 2
2 AAGGCAGGTTATGTGC 4278 4277 1 0
3 ACCGTAACAGGACGTA 10930 10929 1 0
4 AGCTCCTTCTCAAACG 34839 34837 1 1
5 ATAACGCGTTGTCTTT 11030 11027 1 2
6 ATCCACCGTAAGGGAA 20527 20526 1 0
7 ATCGAGTCAGATCTGT 25174 25172 1 1
8 CATATTCGTACGAAAT 23309 23304 1 4
9 CATCGGGGTATGAAAC 29755 29753 1 1
10 CCAGCGATCGACCAGC 13789 13785 1 3
# … with 677 more rows
$D2
# A tibble: 160 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 AACCATGTCAGCGACC 18595 16795 1 1799
2 AGATTGCCAGGTTTCA 14970 13750 1 1219
3 CGCGTTTCATGTTGAC 11987 10878 1 1108
4 CGTCAGGAGCTTCGCG 14085 12830 1 1254
5 GCATGTATCCCAGGTG 27178 24872 1 2305
6 GTGTTAGGTAGCGTCC 11840 10762 1 1077
7 TCAGGTATCATAACCG 8832 7922 1 909
8 TGTCCCAGTCATCCCT 3194 2878 1 315
9 AAAGCAAGTGAACCTT 9734 8840 0 894
10 AAAGCAATCTCCTATA 11917 10826 0 1091
# … with 150 more rows
data_list$umi_counts_cell %>%
map(list("background_cells"))
$A1
# A tibble: 125,101 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GGCTGGTGTCCCTACT 86 4 82 0
2 CTCACACGTCTCTCGT 99 19 80 0
3 CATGCCTAGTGCTGCC 92 18 74 0
4 TGCACCTTCTGGCGAC 57 5 52 0
5 GGAATAAGTCAGGACA 53 3 50 0
6 CACACAAAGGCGATAC 67 19 48 0
7 TCGTACCGTCCATGAT 56 10 46 0
8 CGGCTAGGTAAACCTC 42 0 42 0
9 CTTCTCTCAACTGCTA 48 6 42 0
10 TGGTTAGTCCTGCAGG 40 1 39 0
# … with 125,091 more rows
$A2
# A tibble: 139,718 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GGCTGGTGTCCCTACT 97 15 82 0
2 CTCACACGTCTCTCGT 93 21 72 0
3 CATGCCTAGTGCTGCC 84 15 69 0
4 TCGTACCGTCCATGAT 76 18 58 0
5 TGCACCTTCTGGCGAC 66 14 52 0
6 CACACAAAGGCGATAC 55 14 41 0
7 GTGCAGCCAAGCGAGT 50 12 38 0
8 AAGCCGCGTGCTCTTC 44 8 36 0
9 GGAATAAGTCAGGACA 38 5 33 0
10 CTAGCCTTCCGCGCAA 40 8 32 0
# … with 139,708 more rows
$B1
# A tibble: 27,396 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 ACGGCCACAAGGACAC 8 0 8 0
2 AGATTGCCAGGTTTCA 8 0 8 0
3 CAGCCGACAGGATCGA 8 0 8 0
4 GATCGTAAGATACACA 8 0 8 0
5 ACTGTCCCAGCGAACA 8 0 7 1
6 ATTATCCGTGTGAATA 8 0 7 1
7 CAGATCAGTGCTCTTC 7 0 7 0
8 CTACACCCACGCGAAA 7 0 7 0
9 TTTGGTTCAGGAACGT 8 0 7 1
10 ACGGGCTTCGGTCCGA 6 0 6 0
# … with 27,386 more rows
$B2
# A tibble: 31,107 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GGCTGGTGTCCCTACT 96 0 86 10
2 CATGCCTAGTGCTGCC 85 0 76 9
3 CACACAAAGGCGATAC 66 0 58 8
4 AAGCCGCGTGCTCTTC 46 0 40 6
5 GGAATAAGTCAGGACA 42 0 39 3
6 ATAAGAGAGTGTGGCA 32 0 29 3
7 TGCCCATCAGTAAGCG 32 0 29 3
8 CACACTCCACATAACC 29 0 26 3
9 CATCGAAAGTCAATAG 28 0 22 6
10 CATATTCGTTGTACAC 14 0 14 0
# … with 31,097 more rows
$C1
# A tibble: 146,094 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GGCTGGTGTCCCTACT 113 10 103 0
2 CATGCCTAGTGCTGCC 107 9 98 0
3 CTCACACGTCTCTCGT 88 16 72 0
4 CACACAAAGGCGATAC 71 12 59 0
5 TCGTACCGTCCATGAT 64 10 54 0
6 TGCACCTTCTGGCGAC 57 12 45 0
7 GGAATAAGTCAGGACA 50 8 42 0
8 AAGCCGCGTGCTCTTC 47 7 40 0
9 CATCGAAAGTCAATAG 42 5 37 0
10 ATAAGAGAGTGTGGCA 38 7 31 0
# … with 146,084 more rows
$C2
# A tibble: 196,285 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CATGCCTAGTGCTGCC 103 13 90 0
2 GGCTGGTGTCCCTACT 109 20 89 0
3 CACACAAAGGCGATAC 67 2 65 0
4 CTCACACGTCTCTCGT 69 8 61 0
5 TCGTACCGTCCATGAT 67 9 58 0
6 TGCACCTTCTGGCGAC 54 8 46 0
7 GGAATAAGTCAGGACA 43 5 38 0
8 AAGCCGCGTGCTCTTC 44 8 36 0
9 CATCGAAAGTCAATAG 40 7 33 0
10 CTAGCCTTCCGCGCAA 33 2 31 0
# … with 196,275 more rows
$D1
# A tibble: 99,746 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CTCACACGTCTCTCGT 94 20 74 0
2 GGCTGGTGTCCCTACT 80 8 72 0
3 CATGCCTAGTGCTGCC 86 15 71 0
4 TCGTACCGTCCATGAT 62 16 46 0
5 CACACAAAGGCGATAC 46 5 41 0
6 TGCACCTTCTGGCGAC 44 7 37 0
7 AAGCCGCGTGCTCTTC 32 3 29 0
8 CATCGAAAGTCAATAG 28 4 24 0
9 CTAGCCTTCCGCGCAA 33 9 24 0
10 GTGCAGCCAAGCGAGT 27 4 23 0
# … with 99,736 more rows
$D2
# A tibble: 62,329 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GGCTGGTGTCCCTACT 120 0 101 19
2 CTCACACGTCTCTCGT 90 0 82 8
3 CATGCCTAGTGCTGCC 93 0 81 12
4 TCGTACCGTCCATGAT 83 0 60 23
5 CACACAAAGGCGATAC 66 0 59 7
6 TGCACCTTCTGGCGAC 62 0 57 5
7 GTGCAGCCAAGCGAGT 40 0 35 5
8 ATAAGAGAGTGTGGCA 32 0 30 2
9 AAGCCGCGTGCTCTTC 28 0 28 0
10 CTAGCCTTCCGCGCAA 33 0 28 5
# … with 62,319 more rows
# memory usage
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 6032502 322.2 11951962 638.4 11951962 638.4
Vcells 522868676 3989.2 2326586835 17750.5 2898180107 22111.4
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] rhdf5_2.26.2 cowplot_0.9.4 data.table_1.12.0
[4] tictoc_1.0 furrr_0.1.0 future_1.11.1.1
[7] broom_0.5.1 matrixStats_0.54.0 forcats_0.4.0
[10] stringr_1.4.0 dplyr_0.8.0.1 purrr_0.3.1
[13] readr_1.3.1 tidyr_0.8.3 tibble_2.0.1
[16] ggplot2_3.1.0 tidyverse_1.2.1 rmarkdown_1.11
loaded via a namespace (and not attached):
[1] nlme_3.1-137 bitops_1.0-6
[3] lubridate_1.7.4 httr_1.4.0
[5] rprojroot_1.3-2 GenomeInfoDb_1.18.2
[7] tools_3.5.2 backports_1.1.3
[9] utf8_1.1.4 R6_2.4.0
[11] HDF5Array_1.10.1 lazyeval_0.2.1
[13] BiocGenerics_0.28.0 colorspace_1.4-0
[15] withr_2.1.2 tidyselect_0.2.5
[17] compiler_3.5.2 cli_1.0.1
[19] rvest_0.3.2 Biobase_2.42.0
[21] xml2_1.2.0 DelayedArray_0.8.0
[23] labeling_0.3 scales_1.0.0
[25] digest_0.6.18 XVector_0.22.0
[27] base64enc_0.1-3 pkgconfig_2.0.2
[29] htmltools_0.3.6 limma_3.38.3
[31] rlang_0.3.1 readxl_1.3.0
[33] rstudioapi_0.9.0 generics_0.0.2
[35] jsonlite_1.6 BiocParallel_1.16.6
[37] RCurl_1.95-4.12 magrittr_1.5
[39] GenomeInfoDbData_1.2.0 Matrix_1.2-15
[41] Rcpp_1.0.0 munsell_0.5.0
[43] S4Vectors_0.20.1 Rhdf5lib_1.4.2
[45] fansi_0.4.0 stringi_1.3.1
[47] yaml_2.2.0 edgeR_3.24.3
[49] MASS_7.3-51.1 SummarizedExperiment_1.12.0
[51] zlibbioc_1.28.0 plyr_1.8.4
[53] grid_3.5.2 parallel_3.5.2
[55] listenv_0.7.0 crayon_1.3.4
[57] lattice_0.20-38 haven_2.1.0
[59] hms_0.4.2 locfit_1.5-9.1
[61] knitr_1.21 pillar_1.3.1
[63] GenomicRanges_1.34.0 codetools_0.2-16
[65] stats4_3.5.2 glue_1.3.0
[67] evaluate_0.13 modelr_0.1.4
[69] cellranger_1.1.0 gtable_0.2.0
[71] assertthat_0.2.0 xfun_0.5
[73] DropletUtils_1.2.2 viridisLite_0.3.0
[75] SingleCellExperiment_1.4.1 IRanges_2.16.0
[77] globals_0.12.4