Prepare analysis workflow

Set parameters

knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
                     fig.width=15,
                     digit=5,
                     scipen=8)
options(digits=5, 
        scipen=8,
        future.globals.maxSize = +Inf)

Set filepaths

dataset_name <- commandArgs(trailingOnly=T)[1]
#dataset_name <- "hiseq4000"
message(sprintf("Dataset name: %s", dataset_name))
Dataset name: hiseq2500
project_dir <- rprojroot::find_rstudio_root_file()

if(is.null(project_dir)){
  project_dir <- getwd()
  warning(sprintf("No rstudio project root file  found. 
                  Setting project directory to current workflow.Rmd file location: %s. 
                  Override if needed.",
                  project_dir))
 
}
message(sprintf("Project directory: %s",
                project_dir))
Project directory: /project/6007998/rfarouni/index_hopping

Load libraries

#library(DropletUtils)
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
library(rhdf5)
plan(multiprocess)

Load functions

code_dir <- file.path(project_dir, "code")
source(file.path(code_dir, "analysis_functions.R"))
source(file.path(code_dir, "io_functions.R"))
source(file.path(code_dir, "workflow_functions.R"))
source(file.path(code_dir, "plotting_functions.R"))

Run workflow

Estimate the sample index hopping probability, infer the true sample of origin, and find the optimal posterior probability threshold for retaining predicted real molecules.

max_fpr <- NULL # manually set the maximum false positive rate (not recommended)
data_list <- run_workflow(dataset_name, 
                          project_dir, 
                          max_fpr=max_fpr)
Step 1: reading read counts from existing file: 63.974 sec elapsed
Step 2: creating outcome counts datatable with grouping vars: 8.678 sec elapsed
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.238 sec elapsed
Step 4: computing read counts distribution statistics: 0.268 sec elapsed
Step 5: estimating pi_r matrix: 0.031 sec elapsed
Step 6: infering the true sample of origin: 1.694 sec elapsed
Step 7: estimating g and computing classification metrics: 0.067 sec elapsed
Step 8: determining the optimal cutoff: 0.447 sec elapsed
Step 9: computing proportion of nonmissingness and updating summary data list: 0.011 sec elapsed
Step 10.1: reassigning reads to sample of origin: 0.131 sec elapsed
Step 10.2: deduplicating read counts: 0.003 sec elapsed
Step 10.3: reassigning hopped reads and deduplicating read counts: 0.152 sec elapsed
Step 10.4: labelling phantom molecules below cutoff: 0 sec elapsed
Step 10.5: adding cell-umi-gene labels: 8.351 sec elapsed
Step 10.6: tallying molecule counts by cell-barcode and gene ID: 3.872 sec elapsed
Step 10.7: tranforming cell-gene molecule tally table into long format: 24.626 sec elapsed
Step 10: purging phantoms at q cutoff of  0.999661. Max-FPR threshold user-set FALSE: 57.508 sec elapsed
Warning in smooth.spline(x[new.keep], y[new.keep], df = df, ...): not using
invalid df; must have 1 < df <= n := #{unique x} = 5
Using defaultDrops cell calling instead.
                   EmptyDrops function call failed for purged sample B1
Warning in smooth.spline(x[new.keep], y[new.keep], df = df, ...): not using
invalid df; must have 1 < df <= n := #{unique x} = 7
Warning in smooth.spline(x[new.keep], y[new.keep], df = df, ...): not using
invalid df; must have 1 < df <= n := #{unique x} = 7
Step 11: calling cells: 713.25 sec elapsed
Step 12: saving purged data: 9.695 sec elapsed
Step 13: tallying molecules by cell-barcode: 21.122 sec elapsed
Step 14: saving results: 2.827 sec elapsed
Running workflow: 879.922 sec elapsed

Show workflow output

1. Tranforming data and computing summary statistics

Read counts datatable

data_list$read_counts 
ABCDEFGHIJ0123456789
cell
<chr>
gene
<chr>
umi
<int>
A1
<dbl>
A2
<dbl>
B1
<dbl>
B2
<dbl>
C1
<dbl>
C2
<dbl>
D1
<dbl>
AAACCTGAGAAACCATCsn346896718000000
AAACCTGAGAAACCATSdc410125032000000
AAACCTGAGAAACCATTxn16214428000000
AAACCTGAGAAACCATMrpl546489977000000
AAACCTGAGAAACCATHigd1a3258974000000
AAACCTGAGAAACCATRps214765453000000
AAACCTGAGAAACCATMlec7769936000000
AAACCTGAGAAACCATNap1l12832313000000
AAACCTGAGAAACCATCsn1s2a50456000000
AAACCTGAGAAACCATRps281740811000000

Summary statistics of the joined read counts datatable

 data_list$reads_dist_summary$summary_stats
ABCDEFGHIJ0123456789
n_obs
<dbl>
p_chimeras
<dbl>
g
<dbl>
u
<dbl>
n_reads
<int>
440626160.000592450.0000283580.00059568268768037

Summary statistics conditional on the PCR duplication level r

 data_list$reads_dist_summary$conditional
ABCDEFGHIJ0123456789
r
<int>
n_obs
<dbl>
m_bar
<dbl>
A1
<dbl>
A2
<dbl>
B1
<dbl>
B2
<dbl>
1112424640.2551474474420.08130121650.13808013970.001987730.002577638
275562200.1714882293870.08357081980.14970990790.001221510.001662273
357070510.1295213838420.09638830980.17260537300.001143910.001429343
443448440.0986061290600.11514067020.20388557100.001109300.001212529
533424370.0758565265390.13958749260.23788798410.000977250.000912628
626044710.0591084060920.16908948750.26515736460.000847260.000795491
720480650.0464807854350.20287769880.27920647050.000708400.000676039
816035350.0363921878810.23919535590.27937556710.000679050.000659325
912439730.0282319370230.27842628240.26633822080.000603710.000691423
109468470.0214886696700.31736795910.24483924010.000639070.000892013

The molecular proportions complexity profile

 data_list$pi_r_hat
ABCDEFGHIJ0123456789
r
<int>
A1
<dbl>
A2
<dbl>
B1
<dbl>
B2
<dbl>
C1
<dbl>
C2
<dbl>
10.081295665520.138081801280.001972105390.002562086600.176590796150.49747785494
20.083565557100.149713046740.001205786770.001646605470.192341955440.47740910403
30.096384675320.172611420280.001128173290.001413645940.216005540120.40910812626
40.115139417760.203895591790.001093565640.001196804550.242142501600.31471704060
50.139589345630.237902324170.000961496550.000896864830.258254372720.21398194662
60.169095088170.265175168700.000831489990.000779713460.255371997060.12923893908
70.202887591570.279226059190.000692615220.000660246070.233559529020.07055383344
80.239209862000.279395177280.000663254240.000643529690.199786865770.03576485734
90.278445771970.266356174860.000587908920.000675631860.162232883880.01708848712
100.317392395440.244854463190.000623270960.000876247870.125686708180.00801484418

The molecular proportions complexity profile plot

p_read <- plot_molecules_distributions(data_list, dataset_name, x_lim=120)
plot_grid(p_read$p, 
          p_read$legend,
          ncol=2,
          rel_widths=c(1, 0.1))

Outcome counts datatable

data_list$outcome_counts
ABCDEFGHIJ0123456789
outcome
<chr>
n
<int>
q
<dbl>
qs
<dbl>
j
<dbl>
o
<dbl>
FPR
<dbl>
FNR
<dbl>
r
<int>
0,0,0,0,0,0,0,4512581.000000.00000.00116330.00116330.0000e+000.99883666794
0,0,0,0,0,0,4,04776281.000000.00000.01200340.01200310.0000e+000.98799660474
0,0,0,0,0,4,0,013669011.000000.00000.04302610.04302480.0000e+000.95697393994
0,0,0,0,4,0,0,010517081.000000.00000.06689520.06689330.0000e+000.93310477554
0,0,0,4,0,0,0,052151.000000.00000.06701360.06701170.0000e+000.93298641794
0,0,4,0,0,0,0,047591.000000.00000.06712160.06711970.0000e+000.93287840944
0,4,0,0,0,0,0,08856081.000000.00000.08722100.08721850.0000e+000.91277898744
4,0,0,0,0,0,0,05000251.000000.00000.09856940.09856660.0000e+000.90143061034
0,0,0,0,0,0,0,5469011.000000.00000.09963380.09963100.0000e+000.90036616315
0,0,0,0,0,0,5,04486741.000000.00000.10981680.10981370.0000e+000.89018322885

Chimera counts datatable

data_list$fit_out$chimera_counts
ABCDEFGHIJ0123456789
r
<dbl>
1
<dbl>
2
<dbl>
3
<dbl>
4
<dbl>
5
<dbl>
6
<dbl>
7
<dbl>
8
<dbl>
n_obs
<dbl>
111242464000000011242464
2755477814420000007556220
3570536216890000005707051
4434310217420000004344844
5334066617710000003342437
6260277816921000002604471
7204646915951000002048065
8160202215111100001603535
9124259113802000001243973
109456481199000000946847

2. Estimating the sample index hopping rate (SIHR)

GLM fit estimates

data_list$fit_out$glm_estimates
ABCDEFGHIJ0123456789
max_r
<int>
phat
<dbl>
phat_low
<dbl>
phat_high
<dbl>
SIHR
<dbl>
SBIHR
<dbl>
250.999890.999890.999890.000111140.00012304

Model fit plot

p_fit <- plot_fit(data_list, dataset_name)

plot_grid(p_fit$p,
          p_fit$legend,
          ncol=2,
          rel_widths=c(1, 0.2))

3. Purging phantom molecules

The optimal cutoff for minimizing the false positive rate

data_list$optimal_cutoff
ABCDEFGHIJ0123456789
cutoff
<chr>
outcome
<chr>
q
<dbl>
s
<int>
FPR
<dbl>
j
<dbl>
qs
<dbl>
o
<dbl>
FP
<dbl>
TP
<dbl>
optimal0,0,0,0,1,0,0,20.9996680.0261030.96850125.300.994590.0000162890.99457
above0,4,0,0,0,5,0,00.9997560.0261030.96850123.930.994590.0000162890.99457
below0,0,0,0,0,1,0,20.9993680.0261030.96850128.070.994590.0000162890.99457
none1,0,0,0,0,0,1,00.5017470.0564660.94353156.971.000000.0000352370.99996

Plot of the empirical CDF of qs

p_post <-plot_posterior_prob(data_list, dataset_name)
plot_grid(p_post$p,
          p_post$legend,
          ncol=2,
          rel_widths=c(1, 0.1))

Note that q is the marginal posterior distribution of predicted sample of origin s and qs is a tranformation of q.

4. Examining the consequences of index hopping

First we examing the extent of the effects of index hopping on individual samples and then on cell-barcodes.

Tally of predicted phantoms by sample

Here r_a is the number of molecules above the optimal cutoff q, which we predict as real molecules. r_a are the number of molecules below or equal to cutoff and thus we predict as phantoms. f is the number of molecules predicted as phantom no matter what the threshold is. m is the number of total molecules.

data_list$reads_dist_summary$marginal
ABCDEFGHIJ0123456789
sample
<chr>
m
<dbl>
r_a
<dbl>
r_b
<dbl>
f
<dbl>
prop_m
<dbl>
prop_reads
<dbl>
FRM
<dbl>
A15.939310.998620.0000077450.001368840.13471230.134366.0801
A28.018730.999670.0000185820.000308780.18187650.134774.5171
B10.148310.828790.1506911200.020517830.00336390.12624228.7785
B20.199730.841360.1451245460.013518590.00453010.13215177.8310
C18.512240.999650.0000291350.000317780.19307010.117913.7228
C214.190850.999780.0000503140.000169330.32186920.121522.3015
D15.453950.999540.0000518890.000410890.12370360.111675.5033
D21.625750.884300.1141342590.001567890.03687440.1213820.0664

Tally of predicted phantoms in called cells

The called cells were determined from the unpurged data in order to show the level of contamination by phantom molecules if data were not purged.

data_list$reads_dist_summary$marginal_called_cells
ABCDEFGHIJ0123456789
sample
<chr>
m
<dbl>
r_a
<dbl>
r_b
<dbl>
f
<dbl>
prop_m
<dbl>
prop_reads
<dbl>
FRM
<dbl>
A15.400260.999980.00000722190.000009073640.13402640.134366.6870
A27.357790.999980.00001590150.000004892770.18260950.134774.9229
B10.110640.878480.11672571490.004790137740.00274600.12624306.6604
B20.142070.902410.09758140580.000007038980.00352590.13215250.0056
C17.866230.999970.00002402670.000003813770.19522830.117914.0285
C212.987960.999950.00004565770.000000461970.32234200.121522.5147
D14.948530.999940.00005112630.000007274890.12281520.111676.0654
D21.479010.907280.09271902890.000005409040.03670670.1213822.0573

Tally of barcodes by concordance between purged and unpurged data

data_list$called_cells_tally
ABCDEFGHIJ0123456789
barcode
<chr>
A1
<dbl>
A2
<dbl>
B1
<dbl>
B2
<dbl>
C1
<dbl>
C2
<dbl>
D1
<dbl>
D2
<dbl>
consensus_background12376913894820300220041452821956179896841695
transition_cell007504002
phantom_background13327707021910380866877820632
transition_background011120222
consensus_cell410565184147361316685158
phantom_cell002500000

Table of called cell-barcodes with the highest number of phantoms

data_list$umi_counts_cell %>% 
  map(list("called_cells"))
$A1
# A tibble: 410 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 AGGTCCGGTCCGAGTC 37482 37479     3     0
 2 ACGTCAAGTTGCCTCT 31541 31538     2     1
 3 CACCACTTCGGTTCGG 18666 18664     2     0
 4 CGTAGGCCAACACCTA 65340 65338     2     0
 5 CTGCTGTTCCGGGTGT 17297 17294     2     1
 6 TCTCTAAAGATCACGG 21938 21936     2     0
 7 AACCGCGTCATGCTCC 47423 47422     1     0
 8 ACATACGCATTCGACA  3517  3516     1     0
 9 ACGAGGACATCACCCT 15208 15207     1     0
10 ACGGCCACAAGGACAC 49067 49064     1     2
# … with 400 more rows

$A2
# A tibble: 566 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 ACGAGCCCACCGTTGG 43895 43892     3     0
 2 CGTGTAATCTAACTCT 22943 22941     2     0
 3 CTTAGGAAGAAACGCC 19123 19121     2     0
 4 GCTGGGTAGTAGGTGC 55651 55648     2     1
 5 AAAGATGAGGCCCTCA 68719 68718     1     0
 6 AACTCCCCATTCGACA 15035 15034     1     0
 7 ACAGCTAAGGCCCGTT 29916 29914     1     1
 8 ACGGGCTTCGGTCCGA 34499 34498     1     0
 9 ACTTACTCAACACCCG 29456 29455     1     0
10 CAAGATCCAACACGCC 44113 44112     1     0
# … with 556 more rows

$B1
# A tibble: 220 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 CTCACACGTCTCTCGT   103     0    94     9
 2 TCGTACCGTCCATGAT    74     0    63    11
 3 TGCACCTTCTGGCGAC    74     0    63    11
 4 GTGCAGCCAAGCGAGT    43     0    38     5
 5 CTAGCCTTCCGCGCAA    41     0    37     4
 6 AGCAGCCGTTAAGGGC    28     0    26     2
 7 TGCACCTCATGCCACG    28     0    25     3
 8 TCATTACCAGGCAGTA    19     0    16     3
 9 GATCGTAGTCACCCAG    18     0    14     4
10 GTGCTTCAGGCATTGG    15     0    14     1
# … with 210 more rows

$B2
# A tibble: 16 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 TCGTACCGTCCATGAT 26473 24103     1  2369
 2 ACGCCAGAGTACGTAA   417   359     0    58
 3 AGCAGCCGTTAAGGGC  8853  7985     0   868
 4 CAAGGCCGTCCGAGTC   232   196     0    36
 5 CGTGAGCCAAGGACTG  1478  1351     0   127
 6 CTAGCCTTCCGCGCAA 11808 10648     0  1160
 7 CTCACACGTCTCTCGT 36274 32695     0  3579
 8 CTTGGCTCAAAGTGCG  1034   942     0    92
 9 GACAGAGTCAATACCG   211   188     0    23
10 GATCGTAGTCACCCAG  5904  5294     0   610
11 GTGCAGCCAAGCGAGT 12113 10974     0  1139
12 GTGGGTCCAGGCTGAA  2190  1983     0   207
13 TCATTACCAGGCAGTA  5893  5233     0   660
14 TGCACCTCATGCCACG  5479  4928     0   551
15 TGCACCTTCTGGCGAC 22310 20099     0  2211
16 TGGCTGGCATCCGTGG  1397  1224     0   173

$C1
# A tibble: 736 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 ACTATCTTCCTGTACC 33437 33433     2     2
 2 CCCTCCTGTGATGTCT 10899 10897     2     0
 3 GGTGCGTCAGTTAACC 38377 38374     2     1
 4 ACCTTTACAAAGTGCG 19839 19837     1     1
 5 ACGGCCAGTACTCAAC 26169 26168     1     0
 6 AGCGTATGTTCTCATT 15803 15802     1     0
 7 AGGGATGTCGCCGTGA  9354  9353     1     0
 8 AGGTCATGTAAGCACG 24932 24930     1     1
 9 ATCTGCCTCTTGCATT  4779  4777     1     1
10 ATTATCCGTTAAGATG 30075 30072     1     2
# … with 726 more rows

$C2
# A tibble: 1,318 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 CTTAACTGTGAGTATA  6233  6229     3     1
 2 ATAACGCCACGGTTTA  1584  1583     1     0
 3 GGAACTTTCAAAGTAG  1634  1633     1     0
 4 TACCTATCATCACAAC  6081  6080     1     0
 5 AAACCTGCAGTGGGAT  8200  8200     0     0
 6 AAACCTGGTCGAAAGC  7482  7480     0     2
 7 AAACGGGCAAGTTAAG  9265  9265     0     0
 8 AAACGGGGTTATCACG 13268 13268     0     0
 9 AAACGGGGTTGTCTTT 13437 13437     0     0
10 AAAGATGAGAGGTTGC   795   795     0     0
# … with 1,308 more rows

$D1
# A tibble: 687 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 AAAGATGAGGCCGAAT 13329 13326     1     2
 2 AAGGCAGGTTATGTGC  4278  4277     1     0
 3 ACCGTAACAGGACGTA 10930 10929     1     0
 4 AGCTCCTTCTCAAACG 34839 34837     1     1
 5 ATAACGCGTTGTCTTT 11030 11027     1     2
 6 ATCCACCGTAAGGGAA 20527 20526     1     0
 7 ATCGAGTCAGATCTGT 25174 25172     1     1
 8 CATATTCGTACGAAAT 23309 23304     1     4
 9 CATCGGGGTATGAAAC 29755 29753     1     1
10 CCAGCGATCGACCAGC 13789 13785     1     3
# … with 677 more rows

$D2
# A tibble: 160 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 AACCATGTCAGCGACC 18595 16795     1  1799
 2 AGATTGCCAGGTTTCA 14970 13750     1  1219
 3 CGCGTTTCATGTTGAC 11987 10878     1  1108
 4 CGTCAGGAGCTTCGCG 14085 12830     1  1254
 5 GCATGTATCCCAGGTG 27178 24872     1  2305
 6 GTGTTAGGTAGCGTCC 11840 10762     1  1077
 7 TCAGGTATCATAACCG  8832  7922     1   909
 8 TGTCCCAGTCATCCCT  3194  2878     1   315
 9 AAAGCAAGTGAACCTT  9734  8840     0   894
10 AAAGCAATCTCCTATA 11917 10826     0  1091
# … with 150 more rows

Table of background cell-barcodes with highest number of phantoms

data_list$umi_counts_cell %>% 
  map(list("background_cells"))
$A1
# A tibble: 125,101 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 GGCTGGTGTCCCTACT    86     4    82     0
 2 CTCACACGTCTCTCGT    99    19    80     0
 3 CATGCCTAGTGCTGCC    92    18    74     0
 4 TGCACCTTCTGGCGAC    57     5    52     0
 5 GGAATAAGTCAGGACA    53     3    50     0
 6 CACACAAAGGCGATAC    67    19    48     0
 7 TCGTACCGTCCATGAT    56    10    46     0
 8 CGGCTAGGTAAACCTC    42     0    42     0
 9 CTTCTCTCAACTGCTA    48     6    42     0
10 TGGTTAGTCCTGCAGG    40     1    39     0
# … with 125,091 more rows

$A2
# A tibble: 139,718 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 GGCTGGTGTCCCTACT    97    15    82     0
 2 CTCACACGTCTCTCGT    93    21    72     0
 3 CATGCCTAGTGCTGCC    84    15    69     0
 4 TCGTACCGTCCATGAT    76    18    58     0
 5 TGCACCTTCTGGCGAC    66    14    52     0
 6 CACACAAAGGCGATAC    55    14    41     0
 7 GTGCAGCCAAGCGAGT    50    12    38     0
 8 AAGCCGCGTGCTCTTC    44     8    36     0
 9 GGAATAAGTCAGGACA    38     5    33     0
10 CTAGCCTTCCGCGCAA    40     8    32     0
# … with 139,708 more rows

$B1
# A tibble: 27,396 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 ACGGCCACAAGGACAC     8     0     8     0
 2 AGATTGCCAGGTTTCA     8     0     8     0
 3 CAGCCGACAGGATCGA     8     0     8     0
 4 GATCGTAAGATACACA     8     0     8     0
 5 ACTGTCCCAGCGAACA     8     0     7     1
 6 ATTATCCGTGTGAATA     8     0     7     1
 7 CAGATCAGTGCTCTTC     7     0     7     0
 8 CTACACCCACGCGAAA     7     0     7     0
 9 TTTGGTTCAGGAACGT     8     0     7     1
10 ACGGGCTTCGGTCCGA     6     0     6     0
# … with 27,386 more rows

$B2
# A tibble: 31,107 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 GGCTGGTGTCCCTACT    96     0    86    10
 2 CATGCCTAGTGCTGCC    85     0    76     9
 3 CACACAAAGGCGATAC    66     0    58     8
 4 AAGCCGCGTGCTCTTC    46     0    40     6
 5 GGAATAAGTCAGGACA    42     0    39     3
 6 ATAAGAGAGTGTGGCA    32     0    29     3
 7 TGCCCATCAGTAAGCG    32     0    29     3
 8 CACACTCCACATAACC    29     0    26     3
 9 CATCGAAAGTCAATAG    28     0    22     6
10 CATATTCGTTGTACAC    14     0    14     0
# … with 31,097 more rows

$C1
# A tibble: 146,094 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 GGCTGGTGTCCCTACT   113    10   103     0
 2 CATGCCTAGTGCTGCC   107     9    98     0
 3 CTCACACGTCTCTCGT    88    16    72     0
 4 CACACAAAGGCGATAC    71    12    59     0
 5 TCGTACCGTCCATGAT    64    10    54     0
 6 TGCACCTTCTGGCGAC    57    12    45     0
 7 GGAATAAGTCAGGACA    50     8    42     0
 8 AAGCCGCGTGCTCTTC    47     7    40     0
 9 CATCGAAAGTCAATAG    42     5    37     0
10 ATAAGAGAGTGTGGCA    38     7    31     0
# … with 146,084 more rows

$C2
# A tibble: 196,285 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 CATGCCTAGTGCTGCC   103    13    90     0
 2 GGCTGGTGTCCCTACT   109    20    89     0
 3 CACACAAAGGCGATAC    67     2    65     0
 4 CTCACACGTCTCTCGT    69     8    61     0
 5 TCGTACCGTCCATGAT    67     9    58     0
 6 TGCACCTTCTGGCGAC    54     8    46     0
 7 GGAATAAGTCAGGACA    43     5    38     0
 8 AAGCCGCGTGCTCTTC    44     8    36     0
 9 CATCGAAAGTCAATAG    40     7    33     0
10 CTAGCCTTCCGCGCAA    33     2    31     0
# … with 196,275 more rows

$D1
# A tibble: 99,746 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 CTCACACGTCTCTCGT    94    20    74     0
 2 GGCTGGTGTCCCTACT    80     8    72     0
 3 CATGCCTAGTGCTGCC    86    15    71     0
 4 TCGTACCGTCCATGAT    62    16    46     0
 5 CACACAAAGGCGATAC    46     5    41     0
 6 TGCACCTTCTGGCGAC    44     7    37     0
 7 AAGCCGCGTGCTCTTC    32     3    29     0
 8 CATCGAAAGTCAATAG    28     4    24     0
 9 CTAGCCTTCCGCGCAA    33     9    24     0
10 GTGCAGCCAAGCGAGT    27     4    23     0
# … with 99,736 more rows

$D2
# A tibble: 62,329 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 GGCTGGTGTCCCTACT   120     0   101    19
 2 CTCACACGTCTCTCGT    90     0    82     8
 3 CATGCCTAGTGCTGCC    93     0    81    12
 4 TCGTACCGTCCATGAT    83     0    60    23
 5 CACACAAAGGCGATAC    66     0    59     7
 6 TGCACCTTCTGGCGAC    62     0    57     5
 7 GTGCAGCCAAGCGAGT    40     0    35     5
 8 ATAAGAGAGTGTGGCA    32     0    30     2
 9 AAGCCGCGTGCTCTTC    28     0    28     0
10 CTAGCCTTCCGCGCAA    33     0    28     5
# … with 62,319 more rows

Session Info

# memory usage
gc()
            used   (Mb) gc trigger    (Mb)   max used    (Mb)
Ncells   6032502  322.2   11951962   638.4   11951962   638.4
Vcells 522868676 3989.2 2326586835 17750.5 2898180107 22111.4
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] rhdf5_2.26.2       cowplot_0.9.4      data.table_1.12.0 
 [4] tictoc_1.0         furrr_0.1.0        future_1.11.1.1   
 [7] broom_0.5.1        matrixStats_0.54.0 forcats_0.4.0     
[10] stringr_1.4.0      dplyr_0.8.0.1      purrr_0.3.1       
[13] readr_1.3.1        tidyr_0.8.3        tibble_2.0.1      
[16] ggplot2_3.1.0      tidyverse_1.2.1    rmarkdown_1.11    

loaded via a namespace (and not attached):
 [1] nlme_3.1-137                bitops_1.0-6               
 [3] lubridate_1.7.4             httr_1.4.0                 
 [5] rprojroot_1.3-2             GenomeInfoDb_1.18.2        
 [7] tools_3.5.2                 backports_1.1.3            
 [9] utf8_1.1.4                  R6_2.4.0                   
[11] HDF5Array_1.10.1            lazyeval_0.2.1             
[13] BiocGenerics_0.28.0         colorspace_1.4-0           
[15] withr_2.1.2                 tidyselect_0.2.5           
[17] compiler_3.5.2              cli_1.0.1                  
[19] rvest_0.3.2                 Biobase_2.42.0             
[21] xml2_1.2.0                  DelayedArray_0.8.0         
[23] labeling_0.3                scales_1.0.0               
[25] digest_0.6.18               XVector_0.22.0             
[27] base64enc_0.1-3             pkgconfig_2.0.2            
[29] htmltools_0.3.6             limma_3.38.3               
[31] rlang_0.3.1                 readxl_1.3.0               
[33] rstudioapi_0.9.0            generics_0.0.2             
[35] jsonlite_1.6                BiocParallel_1.16.6        
[37] RCurl_1.95-4.12             magrittr_1.5               
[39] GenomeInfoDbData_1.2.0      Matrix_1.2-15              
[41] Rcpp_1.0.0                  munsell_0.5.0              
[43] S4Vectors_0.20.1            Rhdf5lib_1.4.2             
[45] fansi_0.4.0                 stringi_1.3.1              
[47] yaml_2.2.0                  edgeR_3.24.3               
[49] MASS_7.3-51.1               SummarizedExperiment_1.12.0
[51] zlibbioc_1.28.0             plyr_1.8.4                 
[53] grid_3.5.2                  parallel_3.5.2             
[55] listenv_0.7.0               crayon_1.3.4               
[57] lattice_0.20-38             haven_2.1.0                
[59] hms_0.4.2                   locfit_1.5-9.1             
[61] knitr_1.21                  pillar_1.3.1               
[63] GenomicRanges_1.34.0        codetools_0.2-16           
[65] stats4_3.5.2                glue_1.3.0                 
[67] evaluate_0.13               modelr_0.1.4               
[69] cellranger_1.1.0            gtable_0.2.0               
[71] assertthat_0.2.0            xfun_0.5                   
[73] DropletUtils_1.2.2          viridisLite_0.3.0          
[75] SingleCellExperiment_1.4.1  IRanges_2.16.0             
[77] globals_0.12.4             
LS0tCnRpdGxlOiAiUGhhbnRvbSBQdXJnZSIKc3VidGl0bGU6ICJBbmFseXNpcyB3b3JrZmxvdyBmb3IgYHIgY29tbWFuZEFyZ3ModHJhaWxpbmdPbmx5PVQpWzFdYCBkYXRhIgphdXRob3I6ICJSaWNrIEZhcm91bmkiCmRhdGU6ICdgciBmb3JtYXQoU3lzLkRhdGUoKSwgIiVZLSVCLSVkIilgJwpvdXRwdXQ6CiAgaHRtbF9ub3RlYm9vazoKICAgIGRmX3ByaW50OiBwYWdlZAogICAgY29kZV9mb2xkaW5nOiBzaG93CiAgICB0b2M6IG5vCiAgICB0b2NfZmxvYXQ6IAogICAgICBjb2xsYXBzZWQ6IGZhbHNlCiAgICAgIHNtb290aF9zY3JvbGw6IGZhbHNlCi0tLQoKCiMgUHJlcGFyZSBhbmFseXNpcyB3b3JrZmxvdwoKIyMjIFNldCBwYXJhbWV0ZXJzCgpgYGB7ciBzZXR1cH0Ka25pdHI6Om9wdHNfa25pdCRzZXQocm9vdC5kaXIgPSBycHJvanJvb3Q6OmZpbmRfcnN0dWRpb19yb290X2ZpbGUoKSwKICAgICAgICAgICAgICAgICAgICAgZmlnLndpZHRoPTE1LAogICAgICAgICAgICAgICAgICAgICBkaWdpdD01LAogICAgICAgICAgICAgICAgICAgICBzY2lwZW49OCkKb3B0aW9ucyhkaWdpdHM9NSwgCiAgICAgICAgc2NpcGVuPTgsCiAgICAgICAgZnV0dXJlLmdsb2JhbHMubWF4U2l6ZSA9ICtJbmYpCmBgYAoKCiMjIyBTZXQgZmlsZXBhdGhzCgpgYGB7cn0KZGF0YXNldF9uYW1lIDwtIGNvbW1hbmRBcmdzKHRyYWlsaW5nT25seT1UKVsxXQojZGF0YXNldF9uYW1lIDwtICJoaXNlcTQwMDAiCm1lc3NhZ2Uoc3ByaW50ZigiRGF0YXNldCBuYW1lOiAlcyIsIGRhdGFzZXRfbmFtZSkpCmBgYAoKCmBgYHtyfQpwcm9qZWN0X2RpciA8LSBycHJvanJvb3Q6OmZpbmRfcnN0dWRpb19yb290X2ZpbGUoKQoKaWYoaXMubnVsbChwcm9qZWN0X2RpcikpewogIHByb2plY3RfZGlyIDwtIGdldHdkKCkKICB3YXJuaW5nKHNwcmludGYoIk5vIHJzdHVkaW8gcHJvamVjdCByb290IGZpbGUgIGZvdW5kLiAKICAgICAgICAgICAgICAgICAgU2V0dGluZyBwcm9qZWN0IGRpcmVjdG9yeSB0byBjdXJyZW50IHdvcmtmbG93LlJtZCBmaWxlIGxvY2F0aW9uOiAlcy4gCiAgICAgICAgICAgICAgICAgIE92ZXJyaWRlIGlmIG5lZWRlZC4iLAogICAgICAgICAgICAgICAgICBwcm9qZWN0X2RpcikpCiAKfQptZXNzYWdlKHNwcmludGYoIlByb2plY3QgZGlyZWN0b3J5OiAlcyIsCiAgICAgICAgICAgICAgICBwcm9qZWN0X2RpcikpCmBgYAoKIyMjIExvYWQgbGlicmFyaWVzCgoKYGBge3IgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KI2xpYnJhcnkoRHJvcGxldFV0aWxzKQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShtYXRyaXhTdGF0cykKbGlicmFyeShicm9vbSkKbGlicmFyeShmdXJycikKbGlicmFyeSh0aWN0b2MpCmxpYnJhcnkoZGF0YS50YWJsZSkKbGlicmFyeShjb3dwbG90KQpsaWJyYXJ5KHJoZGY1KQpwbGFuKG11bHRpcHJvY2VzcykKYGBgCgojIyMgTG9hZCBmdW5jdGlvbnMKCgpgYGB7ciBtZXNzYWdlPUZBTFNFfQpjb2RlX2RpciA8LSBmaWxlLnBhdGgocHJvamVjdF9kaXIsICJjb2RlIikKc291cmNlKGZpbGUucGF0aChjb2RlX2RpciwgImFuYWx5c2lzX2Z1bmN0aW9ucy5SIikpCnNvdXJjZShmaWxlLnBhdGgoY29kZV9kaXIsICJpb19mdW5jdGlvbnMuUiIpKQpzb3VyY2UoZmlsZS5wYXRoKGNvZGVfZGlyLCAid29ya2Zsb3dfZnVuY3Rpb25zLlIiKSkKc291cmNlKGZpbGUucGF0aChjb2RlX2RpciwgInBsb3R0aW5nX2Z1bmN0aW9ucy5SIikpCmBgYAoKCiMgUnVuIHdvcmtmbG93CgoKRXN0aW1hdGUgdGhlIHNhbXBsZSBpbmRleCBob3BwaW5nIHByb2JhYmlsaXR5LCBpbmZlciB0aGUgdHJ1ZSBzYW1wbGUgb2Ygb3JpZ2luLCBhbmQgZmluZCB0aGUgb3B0aW1hbCBwb3N0ZXJpb3IgcHJvYmFiaWxpdHkgdGhyZXNob2xkIGZvciByZXRhaW5pbmcgcHJlZGljdGVkIHJlYWwgbW9sZWN1bGVzLgoKCmBgYHtyfQptYXhfZnByIDwtIE5VTEwgIyBtYW51YWxseSBzZXQgdGhlIG1heGltdW0gZmFsc2UgcG9zaXRpdmUgcmF0ZSAobm90IHJlY29tbWVuZGVkKQpkYXRhX2xpc3QgPC0gcnVuX3dvcmtmbG93KGRhdGFzZXRfbmFtZSwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgcHJvamVjdF9kaXIsIAogICAgICAgICAgICAgICAgICAgICAgICAgIG1heF9mcHI9bWF4X2ZwcikKYGBgCgoKIyBTaG93IHdvcmtmbG93IG91dHB1dAoKIyMgMS4gVHJhbmZvcm1pbmcgZGF0YSBhbmQgY29tcHV0aW5nIHN1bW1hcnkgc3RhdGlzdGljcwoKIyMjIFJlYWQgY291bnRzIGRhdGF0YWJsZQoKYGBge3J9CmRhdGFfbGlzdCRyZWFkX2NvdW50cyAKYGBgCgoKIyMjIFN1bW1hcnkgc3RhdGlzdGljcyBvZiB0aGUgam9pbmVkIHJlYWQgY291bnRzIGRhdGF0YWJsZQoKYGBge3J9CiBkYXRhX2xpc3QkcmVhZHNfZGlzdF9zdW1tYXJ5JHN1bW1hcnlfc3RhdHMKYGBgCgoKIyMjIFN1bW1hcnkgc3RhdGlzdGljcyBjb25kaXRpb25hbCBvbiB0aGUgUENSIGR1cGxpY2F0aW9uIGxldmVsICpyKgoKYGBge3J9CiBkYXRhX2xpc3QkcmVhZHNfZGlzdF9zdW1tYXJ5JGNvbmRpdGlvbmFsCmBgYAoKCiMjIyAgVGhlIG1vbGVjdWxhciBwcm9wb3J0aW9ucyBjb21wbGV4aXR5IHByb2ZpbGUKCmBgYHtyfQogZGF0YV9saXN0JHBpX3JfaGF0CmBgYAoKCiMjIyBUaGUgbW9sZWN1bGFyIHByb3BvcnRpb25zIGNvbXBsZXhpdHkgcHJvZmlsZSBwbG90IAoKYGBge3IgZmlnLmhlaWdodD05LCBmaWcud2lkdGg9MTUsIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CnBfcmVhZCA8LSBwbG90X21vbGVjdWxlc19kaXN0cmlidXRpb25zKGRhdGFfbGlzdCwgZGF0YXNldF9uYW1lLCB4X2xpbT0xMjApCnBsb3RfZ3JpZChwX3JlYWQkcCwgCiAgICAgICAgICBwX3JlYWQkbGVnZW5kLAogICAgICAgICAgbmNvbD0yLAogICAgICAgICAgcmVsX3dpZHRocz1jKDEsIDAuMSkpCmBgYAoKCiMjIyBPdXRjb21lIGNvdW50cyBkYXRhdGFibGUKCmBgYHtyIH0KZGF0YV9saXN0JG91dGNvbWVfY291bnRzCmBgYAoKCgojIyMgQ2hpbWVyYSBjb3VudHMgZGF0YXRhYmxlCgoKYGBge3J9CmRhdGFfbGlzdCRmaXRfb3V0JGNoaW1lcmFfY291bnRzCmBgYAoKIyMgMi4gRXN0aW1hdGluZyB0aGUgc2FtcGxlIGluZGV4IGhvcHBpbmcgcmF0ZSAoKlNJSFIqKQoKCiMjIyBHTE0gZml0IGVzdGltYXRlcwoKYGBge3J9CmRhdGFfbGlzdCRmaXRfb3V0JGdsbV9lc3RpbWF0ZXMKYGBgCgojIyMgTW9kZWwgZml0IHBsb3QgCgpgYGB7ciBmaWcuaGVpZ2h0PTksIGZpZy53aWR0aD0xNSwgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KcF9maXQgPC0gcGxvdF9maXQoZGF0YV9saXN0LCBkYXRhc2V0X25hbWUpCgpwbG90X2dyaWQocF9maXQkcCwKICAgICAgICAgIHBfZml0JGxlZ2VuZCwKICAgICAgICAgIG5jb2w9MiwKICAgICAgICAgIHJlbF93aWR0aHM9YygxLCAwLjIpKQpgYGAKCgojIyAzLiBQdXJnaW5nIHBoYW50b20gbW9sZWN1bGVzCgoKIyMjIFRoZSBvcHRpbWFsIGN1dG9mZiBmb3IgbWluaW1pemluZyB0aGUgZmFsc2UgcG9zaXRpdmUgcmF0ZQoKYGBge3J9CmRhdGFfbGlzdCRvcHRpbWFsX2N1dG9mZgpgYGAKCgojIyMgUGxvdCBvZiB0aGUgZW1waXJpY2FsIENERiBvZiAqcXMqIAoKCgpgYGB7ciBmaWcuaGVpZ2h0PTksIGZpZy53aWR0aD0xNSwgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KcF9wb3N0IDwtcGxvdF9wb3N0ZXJpb3JfcHJvYihkYXRhX2xpc3QsIGRhdGFzZXRfbmFtZSkKcGxvdF9ncmlkKHBfcG9zdCRwLAogICAgICAgICAgcF9wb3N0JGxlZ2VuZCwKICAgICAgICAgIG5jb2w9MiwKICAgICAgICAgIHJlbF93aWR0aHM9YygxLCAwLjEpKQpgYGAKCk5vdGUgdGhhdCAqcSogaXMgdGhlIG1hcmdpbmFsIHBvc3RlcmlvciBkaXN0cmlidXRpb24gb2YgcHJlZGljdGVkIHNhbXBsZSBvZiBvcmlnaW4gKipzKiogYW5kICpxcyogaXMgYSB0cmFuZm9ybWF0aW9uIG9mICpxKi4KCiMjIDQuIEV4YW1pbmluZyB0aGUgY29uc2VxdWVuY2VzIG9mIGluZGV4IGhvcHBpbmcgCgpGaXJzdCB3ZSBleGFtaW5nIHRoZSBleHRlbnQgb2YgdGhlIGVmZmVjdHMgb2YgaW5kZXggaG9wcGluZyBvbiBpbmRpdmlkdWFsIHNhbXBsZXMgYW5kIHRoZW4gb24gY2VsbC1iYXJjb2Rlcy4KCiMjIyBUYWxseSBvZiBwcmVkaWN0ZWQgcGhhbnRvbXMgYnkgc2FtcGxlCgpIZXJlICpyX2EqIGlzIHRoZSBudW1iZXIgb2YgbW9sZWN1bGVzIGFib3ZlIHRoZSBvcHRpbWFsIGN1dG9mZiBxLCB3aGljaCB3ZSBwcmVkaWN0IGFzIHJlYWwgbW9sZWN1bGVzLiAqcl9hKiBhcmUgdGhlIG51bWJlciBvZiBtb2xlY3VsZXMgYmVsb3cgb3IgZXF1YWwgdG8gY3V0b2ZmIGFuZCB0aHVzIHdlIHByZWRpY3QgYXMgcGhhbnRvbXMuICpmKiBpcyB0aGUgbnVtYmVyIG9mIG1vbGVjdWxlcyBwcmVkaWN0ZWQgYXMgcGhhbnRvbSBubyBtYXR0ZXIgd2hhdCB0aGUgdGhyZXNob2xkIGlzLiAqbSogaXMgdGhlIG51bWJlciBvZiB0b3RhbCBtb2xlY3VsZXMuIAoKYGBge3J9CmRhdGFfbGlzdCRyZWFkc19kaXN0X3N1bW1hcnkkbWFyZ2luYWwKYGBgCgojIyMgVGFsbHkgb2YgcHJlZGljdGVkIHBoYW50b21zIGluIGNhbGxlZCBjZWxscyAKClRoZSBjYWxsZWQgY2VsbHMgd2VyZSBkZXRlcm1pbmVkIGZyb20gdGhlIHVucHVyZ2VkIGRhdGEgaW4gb3JkZXIgdG8gc2hvdyB0aGUgbGV2ZWwgb2YgY29udGFtaW5hdGlvbiBieSBwaGFudG9tIG1vbGVjdWxlcyBpZiBkYXRhIHdlcmUgbm90IHB1cmdlZC4KCmBgYHtyfQpkYXRhX2xpc3QkcmVhZHNfZGlzdF9zdW1tYXJ5JG1hcmdpbmFsX2NhbGxlZF9jZWxscwpgYGAKCgojIyMgVGFsbHkgb2YgYmFyY29kZXMgYnkgY29uY29yZGFuY2UgYmV0d2VlbiBwdXJnZWQgYW5kIHVucHVyZ2VkIGRhdGEKCmBgYHtyfQpkYXRhX2xpc3QkY2FsbGVkX2NlbGxzX3RhbGx5CmBgYAoKIyMjIFRhYmxlIG9mIGNhbGxlZCBjZWxsLWJhcmNvZGVzIHdpdGggdGhlIGhpZ2hlc3QgbnVtYmVyIG9mIHBoYW50b21zCgpgYGB7cn0KZGF0YV9saXN0JHVtaV9jb3VudHNfY2VsbCAlPiUgCiAgbWFwKGxpc3QoImNhbGxlZF9jZWxscyIpKQpgYGAKCiMjIyBUYWJsZSBvZiBiYWNrZ3JvdW5kIGNlbGwtYmFyY29kZXMgd2l0aCBoaWdoZXN0IG51bWJlciBvZiBwaGFudG9tcwoKYGBge3J9CmRhdGFfbGlzdCR1bWlfY291bnRzX2NlbGwgJT4lIAogIG1hcChsaXN0KCJiYWNrZ3JvdW5kX2NlbGxzIikpCmBgYAoKCgojIFNlc3Npb24gSW5mbwoKYGBge3J9CiMgbWVtb3J5IHVzYWdlCmdjKCkKYGBgCgpgYGB7cn0Kc2Vzc2lvbkluZm8oKQpgYGAK