Prepare analysis workflow

Set parameters

knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
                     fig.width=15,
                     digit=5,
                     scipen=8)
options(digits=5, 
        scipen=8,
        future.globals.maxSize = +Inf)

Set filepaths

dataset_name <- commandArgs(trailingOnly=T)[1]
#dataset_name <- "hiseq4000"
message(sprintf("Dataset name: %s", dataset_name))
Dataset name: hiseq4000
project_dir <- rprojroot::find_rstudio_root_file()

if(is.null(project_dir)){
  project_dir <- getwd()
  warning(sprintf("No rstudio project root file  found. 
                  Setting project directory to current workflow.Rmd file location: %s. 
                  Override if needed.",
                  project_dir))
 
}
message(sprintf("Project directory: %s",
                project_dir))
Project directory: /project/6007998/rfarouni/index_hopping

Load libraries

#library(DropletUtils)
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
library(rhdf5)
plan(multiprocess)

Load functions

code_dir <- file.path(project_dir, "code")
source(file.path(code_dir, "analysis_functions.R"))
source(file.path(code_dir, "io_functions.R"))
source(file.path(code_dir, "workflow_functions.R"))
source(file.path(code_dir, "plotting_functions.R"))

Run workflow

Estimate the sample index hopping probability, infer the true sample of origin, and find the optimal posterior probability threshold for retaining predicted real molecules.

max_fpr <- NULL # manually set the maximum false positive rate (not recommended)
data_list <- run_workflow(dataset_name, 
                          project_dir, 
                          max_fpr=max_fpr)
Step 1: reading read counts from existing file: 81.69 sec elapsed
Step 2: creating outcome counts datatable with grouping vars: 18.807 sec elapsed
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.178 sec elapsed
Step 4: computing read counts distribution statistics: 0.364 sec elapsed
Step 5: estimating pi_r matrix: 0.114 sec elapsed
Step 6: infering the true sample of origin: 7.63 sec elapsed
Step 7: estimating g and computing classification metrics: 0.192 sec elapsed
Step 8: determining the optimal cutoff: 0.066 sec elapsed
Step 9: computing proportion of nonmissingness and updating summary data list: 0.025 sec elapsed
Step 10.1: reassigning reads to sample of origin: 3.816 sec elapsed
Step 10.2: deduplicating read counts: 0.024 sec elapsed
Step 10.3: reassigning hopped reads and deduplicating read counts: 3.933 sec elapsed
Step 10.4: labelling phantom molecules below cutoff: 0.001 sec elapsed
Step 10.5: adding cell-umi-gene labels: 11.497 sec elapsed
Step 10.6: tallying molecule counts by cell-barcode and gene ID: 5.69 sec elapsed
Step 10.7: tranforming cell-gene molecule tally table into long format: 31.882 sec elapsed
Step 10: purging phantoms at q cutoff of  0.923593. Max-FPR threshold user-set FALSE: 77.261 sec elapsed
Using defaultDrops cell calling instead.
                   EmptyDrops function call failed for purged sample B1
Warning in smooth.spline(x[new.keep], y[new.keep], df = df, ...): not using
invalid df; must have 1 < df <= n := #{unique x} = 7
Step 11: calling cells: 915.338 sec elapsed
Step 12: saving purged data: 10.691 sec elapsed
Step 13: tallying molecules by cell-barcode: 34.187 sec elapsed
Step 14: saving results: 5.529 sec elapsed
Running workflow: 1152.219 sec elapsed

Show workflow output

1. Tranforming data and computing summary statistics

Read counts datatable

data_list$read_counts 
ABCDEFGHIJ0123456789
cell
<chr>
gene
<chr>
umi
<int>
A1
<dbl>
A2
<dbl>
B1
<dbl>
B2
<dbl>
C1
<dbl>
C2
<dbl>
D1
<dbl>
AAACCTGAGAAACCATCsn346896721000000
AAACCTGAGAAACCATAkt14109241000000
AAACCTGAGAAACCATAp1s1435911000000
AAACCTGAGAAACCATSdc4101250313000000
AAACCTGAGAAACCATTxn162144212000000
AAACCTGAGAAACCATNfe2l33552041000000
AAACCTGAGAAACCATNucb26555586000000
AAACCTGAGAAACCATMrpl5464899716000000
AAACCTGAGAAACCATHigd1a3258975000000
AAACCTGAGAAACCATRps2147654511000000

Summary statistics of the joined read counts datatable

 data_list$reads_dist_summary$summary_stats
ABCDEFGHIJ0123456789
n_obs
<dbl>
p_chimeras
<dbl>
g
<dbl>
u
<dbl>
n_reads
<int>
536366950.0710330.00147530.081685626168518

Summary statistics conditional on the PCR duplication level r

 data_list$reads_dist_summary$conditional
ABCDEFGHIJ0123456789
r
<int>
n_obs
<dbl>
m_bar
<dbl>
A1
<dbl>
A2
<dbl>
B1
<dbl>
B2
<dbl>
C1
<dbl>
190478080.1686869036210.088545870.136680400.011041240.011823970.16590029
255606970.1036733713740.076284590.134797850.004884460.005235140.16891579
345691430.0851868855830.075372780.136437690.004494210.004814980.16929148
440038360.0746473286620.077805630.138913530.004512110.004786600.17149541
535376080.0659549959220.081428690.144124670.004604810.004810600.17883740
631043080.0578765712540.086721370.153445680.004702340.004802200.19097434
727066160.0504620204510.093825230.168006680.004780460.004715690.21056821
823546630.0439002253960.101599780.186593630.004733060.004586750.23413807
920578210.0383659172140.111249660.208047250.004735970.004457090.25803945
1018116410.0337761489590.121225450.231414340.004655060.004223460.27841995

The molecular proportions complexity profile

 data_list$pi_r_hat
ABCDEFGHIJ0123456789
r
<int>
A1
<dbl>
A2
<dbl>
B1
<dbl>
B2
<dbl>
C1
<dbl>
C2
<dbl>
10.088179817780.136797685160.009896937760.010687528460.166310987450.46492653892
20.075795423090.134896233640.003678337170.004032533890.169356759070.51375076673
30.074874453360.136552543660.003284164160.003608160140.169736230600.52109914440
40.077331739150.139053242600.003302249040.003579492030.171962287700.51652246846
50.080991179360.144316711660.003395875860.003603731030.179377995460.50000041631
60.086336997760.153731317870.003494385020.003595248980.191636810030.46955499826
70.093512188830.168438526430.003573288630.003507876390.211427430330.42192178496
80.101364813250.187212116940.003525412460.003377637920.235233963670.36266403683
90.111111589120.208881156670.003528356140.003246673960.259375346460.29537482014
100.121187550940.232482880340.003446635430.003010703950.279960495640.22811960714

The molecular proportions complexity profile plot

p_read <- plot_molecules_distributions(data_list, dataset_name, x_lim=120)
plot_grid(p_read$p, 
          p_read$legend,
          ncol=2,
          rel_widths=c(1, 0.1))

Outcome counts datatable

data_list$outcome_counts
ABCDEFGHIJ0123456789
outcome
<chr>
n
<int>
q
<dbl>
qs
<dbl>
j
<dbl>
o
<dbl>
FPR
<dbl>
FNR
<dbl>
r
<int>
k_chimera
<int>
0,0,0,0,0,0,6,0248029100.00463110.004624200.9953761
0,0,0,0,0,6,0,01383880100.03047020.030425200.9695361
0,0,0,0,6,0,0,0568918100.04109270.041032100.9589161
0,6,0,0,0,0,0,0456096100.04960870.049535500.9503961
6,0,0,0,0,0,0,0256555100.05439900.054318700.9456061
0,0,0,0,0,0,0,724568100.05485770.054776800.9451471
0,0,0,0,0,0,7,0227376100.05910320.059016000.9409071
0,0,0,0,0,7,0,01072636100.07913090.079014200.9208771
0,0,0,0,7,0,0,0542820100.08926620.089134500.9107371
0,0,0,7,0,0,0,03824100.08933760.089205800.9106671

Chimera counts datatable

data_list$fit_out$chimera_counts
ABCDEFGHIJ0123456789
r
<dbl>
1
<dbl>
2
<dbl>
3
<dbl>
4
<dbl>
5
<dbl>
6
<dbl>
7
<dbl>
8
<dbl>
n_obs
<dbl>
1904780800000009047808
25474096866010000005560697
34458826109716601000004569143
438729661298411025400004003836
533907911451271681810003537608
6294820115380522861420003104308
7254639515741227871930002706616
8219564415562833613000002354663
9190197815205537444220002057821
10165960014780541964000001811641

2. Estimating the sample index hopping rate (SIHR)

GLM fit estimates

data_list$fit_out$glm_estimates
ABCDEFGHIJ0123456789
max_r
<int>
phat
<dbl>
phat_low
<dbl>
phat_high
<dbl>
SIHR
<dbl>
SBIHR
<dbl>
250.99130.991290.991310.00869880.0096309

Model fit plot

p_fit <- plot_fit(data_list, dataset_name)

plot_grid(p_fit$p,
          p_fit$legend,
          ncol=2,
          rel_widths=c(1, 0.2))

3. Purging phantom molecules

The optimal cutoff for minimizing the false positive rate

data_list$optimal_cutoff
ABCDEFGHIJ0123456789
cutoff
<chr>
outcome
<chr>
q
<dbl>
s
<int>
FPR
<dbl>
j
<dbl>
qs
<dbl>
o
<dbl>
FP
<dbl>
TP
<dbl>
optimal0,0,0,1,1,0,0,10.9235950.0130630.98299148.830.995670.00108640.99458
above1,1,2,0,0,0,0,00.9237730.0130630.98299148.820.995670.00108640.99458
below0,1,2,0,0,0,1,00.9228130.0130640.98299148.880.995670.00108640.99458
none0,1,0,0,1,0,1,00.4369150.0194320.98043157.511.000000.00161590.99838

Plot of the empirical CDF of qs

p_post <-plot_posterior_prob(data_list, dataset_name)
plot_grid(p_post$p,
          p_post$legend,
          ncol=2,
          rel_widths=c(1, 0.1))

Note that q is the marginal posterior distribution of predicted sample of origin s and qs is a tranformation of q.

4. Examining the consequences of index hopping

First we examing the extent of the effects of index hopping on individual samples and then on cell-barcodes.

Tally of predicted phantoms by sample

Here r_a is the number of molecules above the optimal cutoff q, which we predict as real molecules. r_a are the number of molecules below or equal to cutoff and thus we predict as phantoms. f is the number of molecules predicted as phantom no matter what the threshold is. m is the number of total molecules.

data_list$reads_dist_summary$marginal
ABCDEFGHIJ0123456789
sample
<chr>
m
<dbl>
r_a
<dbl>
r_b
<dbl>
f
<dbl>
prop_m
<dbl>
prop_reads
<dbl>
FRM
<dbl>
A17.02670.9661970.0000822570.0337200.1211130.1305311.6320
A29.59200.9774780.0003221430.0222000.1653280.131008.5519
B11.82030.0802650.0554377780.8642970.0313750.1282144.1022
B21.75130.1143440.0617281620.8239280.0301860.1400450.0703
C110.45650.9780070.0004799870.0215130.1802290.121857.2968
C218.64650.9873700.0006329880.0119970.3213910.120544.0478
D16.56480.9684020.0002464640.0313510.1131520.1067510.1821
D22.15980.8786830.0006028450.1207140.0372260.1210835.1032

Tally of predicted phantoms in called cells

The called cells were determined from the unpurged data in order to show the level of contamination by phantom molecules if data were not purged.

data_list$reads_dist_summary$marginal_called_cells
ABCDEFGHIJ0123456789
sample
<chr>
m
<dbl>
r_a
<dbl>
r_b
<dbl>
f
<dbl>
prop_m
<dbl>
prop_reads
<dbl>
FRM
<dbl>
A16.171470.985230.0000776150.01468950.1217550.1305313.2441
A28.576680.990440.0002331900.00932270.1692060.131009.5643
B11.252100.084200.0509112690.86488860.0247020.1282164.1154
B20.986020.144190.0584664780.79734350.0194530.1400488.9336
C19.420010.990660.0003491500.00899130.1858440.121858.0997
C216.840060.994140.0005257700.00533220.3322310.120544.4820
D15.724730.988230.0001926730.01157710.1129410.1067511.6763
D21.716700.963020.0004992140.03647930.0338680.1210844.1628

Tally of barcodes by concordance between purged and unpurged data

data_list$called_cells_tally
ABCDEFGHIJ0123456789
barcode
<chr>
A1
<dbl>
A2
<dbl>
B1
<dbl>
B2
<dbl>
C1
<dbl>
C2
<dbl>
D1
<dbl>
D2
<dbl>
consensus_background138117156898274783229016847423240211947182288
transition_cell1413159105157
phantom_background1247811203823617819411750100871161115912
transition_background1371059306611251536428
consensus_cell40156534147311321661165
phantom_cell002702490000

Table of called cell-barcodes with the highest number of phantoms

data_list$umi_counts_cell %>% 
  map(list("called_cells"))
$A1
# A tibble: 538 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 CTCACACGTCTCTCGT  8325  1084  7238     3
 2 GGCTGGTGTCCCTACT  6661   888  5773     0
 3 TCGTACCGTCCATGAT  5765   804  4960     1
 4 CATGCCTAGTGCTGCC  5496   805  4691     0
 5 TGCACCTTCTGGCGAC  4967   676  4290     1
 6 CACACAAAGGCGATAC  3464   541  2922     1
 7 GTGCAGCCAAGCGAGT  3020   370  2650     0
 8 GGAATAAGTCAGGACA  2961   443  2518     0
 9 CTAGCCTTCCGCGCAA  2717   411  2306     0
10 AAGCCGCGTGCTCTTC  2501   363  2137     1
# … with 528 more rows

$A2
# A tibble: 670 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 CTCACACGTCTCTCGT  7834  1192  6637     5
 2 GGCTGGTGTCCCTACT  6399   970  5428     1
 3 TCGTACCGTCCATGAT  5485   880  4602     3
 4 CATGCCTAGTGCTGCC  5421   827  4593     1
 5 TGCACCTTCTGGCGAC  4687   757  3930     0
 6 CACACAAAGGCGATAC  3280   530  2749     1
 7 GTGCAGCCAAGCGAGT  2841   404  2437     0
 8 GGAATAAGTCAGGACA  2869   476  2392     1
 9 CTAGCCTTCCGCGCAA  2628   425  2202     1
10 AAGCCGCGTGCTCTTC  2533   369  2164     0
# … with 660 more rows

$B1
# A tibble: 1,234 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 CTCACACGTCTCTCGT 23426    94 21976  1356
 2 TCGTACCGTCCATGAT 16849    59 15913   877
 3 TGCACCTTCTGGCGAC 13898    64 13038   796
 4 GTGCAGCCAAGCGAGT  8279    26  7839   414
 5 GAAACTCAGTTCCACA  7856    20  7597   239
 6 CTAGCCTTCCGCGCAA  7648    30  7163   455
 7 AGCAGCCGTTAAGGGC  5905    16  5528   361
 8 TGGCCAGTCATGTCTT  4906    11  4755   140
 9 GCATGTATCCCAGGTG  4308    11  4183   114
10 GTGCTTCAGGCATTGG  4303    10  4166   127
# … with 1,224 more rows

$B2
# A tibble: 924 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 GGCTGGTGTCCCTACT 15818    52 14699  1067
 2 CATGCCTAGTGCTGCC 12752    34 11739   979
 3 CACACAAAGGCGATAC  8024    28  7354   642
 4 GAAACTCAGTTCCACA  7246    13  6988   245
 5 GGAATAAGTCAGGACA  6611    18  6101   492
 6 AAGCCGCGTGCTCTTC  6047    22  5592   433
 7 CATCGAAAGTCAATAG  4913    10  4530   373
 8 TGGCCAGTCATGTCTT  4521    12  4335   174
 9 GTGCTTCAGGCATTGG  4036     5  3916   115
10 GCATGTATCCCAGGTG  3977     4  3847   126
# … with 914 more rows

$C1
# A tibble: 856 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 CTCACACGTCTCTCGT  7983  1099  6880     4
 2 GGCTGGTGTCCCTACT  6334   852  5481     1
 3 TCGTACCGTCCATGAT  5530   718  4810     2
 4 CATGCCTAGTGCTGCC  5396   776  4619     1
 5 TGCACCTTCTGGCGAC  4715   608  4105     2
 6 CACACAAAGGCGATAC  3351   516  2835     0
 7 GTGCAGCCAAGCGAGT  2941   373  2568     0
 8 GGAATAAGTCAGGACA  2852   432  2419     1
 9 CTAGCCTTCCGCGCAA  2597   344  2252     1
10 AAGCCGCGTGCTCTTC  2485   337  2148     0
# … with 846 more rows

$C2
# A tibble: 1,474 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 CTCACACGTCTCTCGT  8207  1285  6916     6
 2 GGCTGGTGTCCCTACT  6761   978  5783     0
 3 TCGTACCGTCCATGAT  5509   838  4668     3
 4 CATGCCTAGTGCTGCC  5478   826  4647     5
 5 TGCACCTTCTGGCGAC  4786   742  4040     4
 6 CACACAAAGGCGATAC  3515   515  3000     0
 7 GTGCAGCCAAGCGAGT  2849   391  2458     0
 8 GGAATAAGTCAGGACA  2877   474  2403     0
 9 AAGCCGCGTGCTCTTC  2588   399  2189     0
10 CTAGCCTTCCGCGCAA  2608   441  2164     3
# … with 1,464 more rows

$D1
# A tibble: 725 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 CTCACACGTCTCTCGT  7496  1035  6460     1
 2 GGCTGGTGTCCCTACT  5953   786  5164     3
 3 TCGTACCGTCCATGAT  5054   699  4354     1
 4 CATGCCTAGTGCTGCC  5060   735  4325     0
 5 TGCACCTTCTGGCGAC  4437   639  3797     1
 6 CACACAAAGGCGATAC  3188   500  2688     0
 7 GTGCAGCCAAGCGAGT  2707   337  2370     0
 8 GGAATAAGTCAGGACA  2657   390  2267     0
 9 AAGCCGCGTGCTCTTC  2311   315  1995     1
10 CTAGCCTTCCGCGCAA  2344   352  1992     0
# … with 715 more rows

$D2
# A tibble: 193 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 CTCACACGTCTCTCGT  8986  1146  7832     8
 2 GGCTGGTGTCCCTACT  7238   911  6326     1
 3 TCGTACCGTCCATGAT  6015   759  5253     3
 4 CATGCCTAGTGCTGCC  6039   841  5194     4
 5 TGCACCTTCTGGCGAC  5328   728  4595     5
 6 CACACAAAGGCGATAC  3797   545  3250     2
 7 GTGCAGCCAAGCGAGT  3315   392  2921     2
 8 GGAATAAGTCAGGACA  3158   467  2690     1
 9 CTAGCCTTCCGCGCAA  2964   395  2564     5
10 AAGCCGCGTGCTCTTC  2936   378  2555     3
# … with 183 more rows

Table of background cell-barcodes with highest number of phantoms

data_list$umi_counts_cell %>% 
  map(list("background_cells"))
$A1
# A tibble: 150,609 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 GAGGTGATCCAAGCCG   544    78   466     0
 2 TGCGCAGCACGTGAGA   298    38   260     0
 3 CTACACCTCGCCTGAG   307    48   259     0
 4 TGTTCCGAGTACGTAA   293    44   249     0
 5 CACTCCAAGCTAAACA   285    40   245     0
 6 AAAGCAAGTGAACCTT   256    28   228     0
 7 CCATGTCCACGAGGTA   258    31   227     0
 8 CAGATCAGTGCTCTTC   259    33   226     0
 9 AAACGGGAGTGGGCTA   267    44   223     0
10 TGTGGTATCTCCGGTT   249    31   218     0
# … with 150,599 more rows

$A2
# A tibble: 168,114 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 GAGGTGATCCAAGCCG   506    67   439     0
 2 TGAAAGACATCCGTGG   372    43   324     5
 3 ACACCAACAATGCCAT   379    64   309     6
 4 ACGGCCACAAGGACAC   343    50   290     3
 5 CCAGCGATCTGTTTGT   333    44   285     4
 6 AACTCAGTCCGCTGTT   350    64   283     3
 7 ATAGACCAGTACGTAA   325    48   273     4
 8 GCACATAAGTACGTAA   325    54   270     1
 9 TTTACTGAGTAGGTGC   298    36   258     4
10 GAGTCCGAGCGATATA   305    46   254     5
# … with 168,104 more rows

$B1
# A tibble: 109,998 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 CACACTCTCTGGCGAC   359     1   348    10
 2 ACATACGAGATCCGAG   360     0   347    13
 3 AACTTTCTCTCTAGGA   359     1   346    12
 4 AGGCCACCAGCTCCGA   353     0   345     8
 5 CTGAAACCAAACCCAT   356     0   345    11
 6 CTACCCAAGACTGGGT   354     2   344     8
 7 GAAGCAGAGTCATCCA   359     1   344    14
 8 GCGCAGTGTAGCAAAT   360     0   344    16
 9 TCGTAGACACCTTGTC   353     0   344     9
10 TGAAAGACAATGAATG   360     1   344    15
# … with 109,988 more rows

$B2
# A tibble: 110,485 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 TACACGACAGGCGATA  1785     2  1728    55
 2 CGAGCACAGTGAAGTT  1820     4  1722    94
 3 GAGGTGATCCAAGCCG  1719     6  1640    73
 4 TCTGAGACACACCGAC  1683     1  1612    70
 5 ACTGAACTCCCGACTT  1621     4  1565    52
 6 TTTATGCAGTTAAGTG  1605     4  1546    55
 7 GCTCCTATCCTAGAAC  1536     5  1479    52
 8 GGAACTTCATTTGCTT  1544     5  1478    61
 9 ACATGGTGTCGTTGTA  1519     7  1460    52
10 GTTCATTTCTTGAGAC  1435     1  1380    54
# … with 110,475 more rows

$C1
# A tibble: 180,224 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 GAGGTGATCCAAGCCG   581    81   500     0
 2 GGAATAATCTGGGCCA   394    55   339     0
 3 TATCTCAGTCGCGAAA   344    41   303     0
 4 TGAAAGACATCCGTGG   344    38   302     4
 5 AACTCAGTCCGCTGTT   342    40   298     4
 6 CCAGCGATCTGTTTGT   377    81   295     1
 7 ACGGCCACAAGGACAC   346    51   292     3
 8 GTGCTTCGTCCGTCAG   334    40   292     2
 9 TCAATCTTCCAGAGGA   334    48   285     1
10 CACAAACGTGTGACGA   330    50   272     8
# … with 180,214 more rows

$C2
# A tibble: 242,494 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 GAGGTGATCCAAGCCG   597   100   497     0
 2 AACCATGTCAGCGACC   412    74   338     0
 3 CTGGTCTTCGGTTCGG   354    29   325     0
 4 TGAAAGACATCCGTGG   376    58   313     5
 5 CCACCTAGTCGATTGT   351    44   307     0
 6 ACGGCCACAAGGACAC   439   130   304     5
 7 ATCACGACATGCTAGT   348    53   295     0
 8 TTCGAAGCAAGGTGTG   337    44   293     0
 9 CAGGTGCTCTGAAAGA   330    43   287     0
10 TATCTCAGTCGCGAAA   328    43   285     0
# … with 242,484 more rows

$D1
# A tibble: 131,097 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 GAGGTGATCCAAGCCG   513    88   424     1
 2 GATCGTAAGATACACA   401    48   353     0
 3 TTTCCTCCATCACGAT   375    42   328     5
 4 ACTGATGTCAATCACG   419    98   321     0
 5 GGAGCAATCCGCATCT   358    49   308     1
 6 CAGCAGCCAGAGCCAA   347    46   300     1
 7 TTCGAAGCAACAACCT   348    49   299     0
 8 ACACCAACAATGCCAT   336    46   286     4
 9 ACGGCCACAAGGACAC   327    38   285     4
10 GCACATAAGTACGTAA   330    48   278     4
# … with 131,087 more rows

$D2
# A tibble: 98,207 x 5
   cell                 m   r_a     f   r_b
   <chr>            <int> <dbl> <int> <dbl>
 1 GAGGTGATCCAAGCCG   577    74   502     1
 2 AGAATAGTCATTCACT   525    75   450     0
 3 ACTGATGTCAATCACG   554   127   427     0
 4 CTACACCCACGCGAAA   464    69   394     1
 5 CAGCAGCCAGAGCCAA   446    53   393     0
 6 ACACCAACAATGCCAT   440    52   388     0
 7 ACGGCCACAAGGACAC   420    46   373     1
 8 ATAGACCAGTACGTAA   410    42   368     0
 9 CGTAGGCCAACACCTA   415    49   366     0
10 AACTCAGTCCGCTGTT   415    49   365     1
# … with 98,197 more rows

Session Info

# memory usage
gc()
            used   (Mb) gc trigger    (Mb)   max used    (Mb)
Ncells   6250248  333.8   13096221   699.5   13096221   699.5
Vcells 641290929 4892.7 2112754741 16119.1 3286988887 25077.8
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] rhdf5_2.26.2       cowplot_0.9.4      data.table_1.12.0 
 [4] tictoc_1.0         furrr_0.1.0        future_1.11.1.1   
 [7] broom_0.5.1        matrixStats_0.54.0 forcats_0.4.0     
[10] stringr_1.4.0      dplyr_0.8.0.1      purrr_0.3.1       
[13] readr_1.3.1        tidyr_0.8.3        tibble_2.0.1      
[16] ggplot2_3.1.0      tidyverse_1.2.1    rmarkdown_1.11    

loaded via a namespace (and not attached):
 [1] nlme_3.1-137                bitops_1.0-6               
 [3] lubridate_1.7.4             httr_1.4.0                 
 [5] rprojroot_1.3-2             GenomeInfoDb_1.18.2        
 [7] tools_3.5.2                 backports_1.1.3            
 [9] utf8_1.1.4                  R6_2.4.0                   
[11] HDF5Array_1.10.1            lazyeval_0.2.1             
[13] BiocGenerics_0.28.0         colorspace_1.4-0           
[15] withr_2.1.2                 tidyselect_0.2.5           
[17] compiler_3.5.2              cli_1.0.1                  
[19] rvest_0.3.2                 Biobase_2.42.0             
[21] xml2_1.2.0                  DelayedArray_0.8.0         
[23] labeling_0.3                scales_1.0.0               
[25] digest_0.6.18               XVector_0.22.0             
[27] base64enc_0.1-3             pkgconfig_2.0.2            
[29] htmltools_0.3.6             limma_3.38.3               
[31] rlang_0.3.1                 readxl_1.3.0               
[33] rstudioapi_0.9.0            generics_0.0.2             
[35] jsonlite_1.6                BiocParallel_1.16.6        
[37] RCurl_1.95-4.12             magrittr_1.5               
[39] GenomeInfoDbData_1.2.0      Matrix_1.2-15              
[41] Rcpp_1.0.0                  munsell_0.5.0              
[43] S4Vectors_0.20.1            Rhdf5lib_1.4.2             
[45] fansi_0.4.0                 stringi_1.3.1              
[47] yaml_2.2.0                  edgeR_3.24.3               
[49] MASS_7.3-51.1               SummarizedExperiment_1.12.0
[51] zlibbioc_1.28.0             plyr_1.8.4                 
[53] grid_3.5.2                  parallel_3.5.2             
[55] listenv_0.7.0               crayon_1.3.4               
[57] lattice_0.20-38             haven_2.1.0                
[59] hms_0.4.2                   locfit_1.5-9.1             
[61] knitr_1.21                  pillar_1.3.1               
[63] GenomicRanges_1.34.0        codetools_0.2-16           
[65] stats4_3.5.2                glue_1.3.0                 
[67] evaluate_0.13               modelr_0.1.4               
[69] cellranger_1.1.0            gtable_0.2.0               
[71] assertthat_0.2.0            xfun_0.5                   
[73] DropletUtils_1.2.2          viridisLite_0.3.0          
[75] SingleCellExperiment_1.4.1  IRanges_2.16.0             
[77] globals_0.12.4             
LS0tCnRpdGxlOiAiUGhhbnRvbSBQdXJnZSIKc3VidGl0bGU6ICJBbmFseXNpcyB3b3JrZmxvdyBmb3IgYHIgY29tbWFuZEFyZ3ModHJhaWxpbmdPbmx5PVQpWzFdYCBkYXRhIgphdXRob3I6ICJSaWNrIEZhcm91bmkiCmRhdGU6ICdgciBmb3JtYXQoU3lzLkRhdGUoKSwgIiVZLSVCLSVkIilgJwpvdXRwdXQ6CiAgaHRtbF9ub3RlYm9vazoKICAgIGRmX3ByaW50OiBwYWdlZAogICAgY29kZV9mb2xkaW5nOiBzaG93CiAgICB0b2M6IG5vCiAgICB0b2NfZmxvYXQ6IAogICAgICBjb2xsYXBzZWQ6IGZhbHNlCiAgICAgIHNtb290aF9zY3JvbGw6IGZhbHNlCi0tLQoKCiMgUHJlcGFyZSBhbmFseXNpcyB3b3JrZmxvdwoKIyMjIFNldCBwYXJhbWV0ZXJzCgpgYGB7ciBzZXR1cH0Ka25pdHI6Om9wdHNfa25pdCRzZXQocm9vdC5kaXIgPSBycHJvanJvb3Q6OmZpbmRfcnN0dWRpb19yb290X2ZpbGUoKSwKICAgICAgICAgICAgICAgICAgICAgZmlnLndpZHRoPTE1LAogICAgICAgICAgICAgICAgICAgICBkaWdpdD01LAogICAgICAgICAgICAgICAgICAgICBzY2lwZW49OCkKb3B0aW9ucyhkaWdpdHM9NSwgCiAgICAgICAgc2NpcGVuPTgsCiAgICAgICAgZnV0dXJlLmdsb2JhbHMubWF4U2l6ZSA9ICtJbmYpCmBgYAoKCiMjIyBTZXQgZmlsZXBhdGhzCgpgYGB7cn0KZGF0YXNldF9uYW1lIDwtIGNvbW1hbmRBcmdzKHRyYWlsaW5nT25seT1UKVsxXQojZGF0YXNldF9uYW1lIDwtICJoaXNlcTQwMDAiCm1lc3NhZ2Uoc3ByaW50ZigiRGF0YXNldCBuYW1lOiAlcyIsIGRhdGFzZXRfbmFtZSkpCmBgYAoKCmBgYHtyfQpwcm9qZWN0X2RpciA8LSBycHJvanJvb3Q6OmZpbmRfcnN0dWRpb19yb290X2ZpbGUoKQoKaWYoaXMubnVsbChwcm9qZWN0X2RpcikpewogIHByb2plY3RfZGlyIDwtIGdldHdkKCkKICB3YXJuaW5nKHNwcmludGYoIk5vIHJzdHVkaW8gcHJvamVjdCByb290IGZpbGUgIGZvdW5kLiAKICAgICAgICAgICAgICAgICAgU2V0dGluZyBwcm9qZWN0IGRpcmVjdG9yeSB0byBjdXJyZW50IHdvcmtmbG93LlJtZCBmaWxlIGxvY2F0aW9uOiAlcy4gCiAgICAgICAgICAgICAgICAgIE92ZXJyaWRlIGlmIG5lZWRlZC4iLAogICAgICAgICAgICAgICAgICBwcm9qZWN0X2RpcikpCiAKfQptZXNzYWdlKHNwcmludGYoIlByb2plY3QgZGlyZWN0b3J5OiAlcyIsCiAgICAgICAgICAgICAgICBwcm9qZWN0X2RpcikpCmBgYAoKIyMjIExvYWQgbGlicmFyaWVzCgoKYGBge3IgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KI2xpYnJhcnkoRHJvcGxldFV0aWxzKQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShtYXRyaXhTdGF0cykKbGlicmFyeShicm9vbSkKbGlicmFyeShmdXJycikKbGlicmFyeSh0aWN0b2MpCmxpYnJhcnkoZGF0YS50YWJsZSkKbGlicmFyeShjb3dwbG90KQpsaWJyYXJ5KHJoZGY1KQpwbGFuKG11bHRpcHJvY2VzcykKYGBgCgojIyMgTG9hZCBmdW5jdGlvbnMKCgpgYGB7ciBtZXNzYWdlPUZBTFNFfQpjb2RlX2RpciA8LSBmaWxlLnBhdGgocHJvamVjdF9kaXIsICJjb2RlIikKc291cmNlKGZpbGUucGF0aChjb2RlX2RpciwgImFuYWx5c2lzX2Z1bmN0aW9ucy5SIikpCnNvdXJjZShmaWxlLnBhdGgoY29kZV9kaXIsICJpb19mdW5jdGlvbnMuUiIpKQpzb3VyY2UoZmlsZS5wYXRoKGNvZGVfZGlyLCAid29ya2Zsb3dfZnVuY3Rpb25zLlIiKSkKc291cmNlKGZpbGUucGF0aChjb2RlX2RpciwgInBsb3R0aW5nX2Z1bmN0aW9ucy5SIikpCmBgYAoKCiMgUnVuIHdvcmtmbG93CgoKRXN0aW1hdGUgdGhlIHNhbXBsZSBpbmRleCBob3BwaW5nIHByb2JhYmlsaXR5LCBpbmZlciB0aGUgdHJ1ZSBzYW1wbGUgb2Ygb3JpZ2luLCBhbmQgZmluZCB0aGUgb3B0aW1hbCBwb3N0ZXJpb3IgcHJvYmFiaWxpdHkgdGhyZXNob2xkIGZvciByZXRhaW5pbmcgcHJlZGljdGVkIHJlYWwgbW9sZWN1bGVzLgoKCmBgYHtyfQptYXhfZnByIDwtIE5VTEwgIyBtYW51YWxseSBzZXQgdGhlIG1heGltdW0gZmFsc2UgcG9zaXRpdmUgcmF0ZSAobm90IHJlY29tbWVuZGVkKQpkYXRhX2xpc3QgPC0gcnVuX3dvcmtmbG93KGRhdGFzZXRfbmFtZSwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgcHJvamVjdF9kaXIsIAogICAgICAgICAgICAgICAgICAgICAgICAgIG1heF9mcHI9bWF4X2ZwcikKYGBgCgoKIyBTaG93IHdvcmtmbG93IG91dHB1dAoKIyMgMS4gVHJhbmZvcm1pbmcgZGF0YSBhbmQgY29tcHV0aW5nIHN1bW1hcnkgc3RhdGlzdGljcwoKIyMjIFJlYWQgY291bnRzIGRhdGF0YWJsZQoKYGBge3J9CmRhdGFfbGlzdCRyZWFkX2NvdW50cyAKYGBgCgoKIyMjIFN1bW1hcnkgc3RhdGlzdGljcyBvZiB0aGUgam9pbmVkIHJlYWQgY291bnRzIGRhdGF0YWJsZQoKYGBge3J9CiBkYXRhX2xpc3QkcmVhZHNfZGlzdF9zdW1tYXJ5JHN1bW1hcnlfc3RhdHMKYGBgCgoKIyMjIFN1bW1hcnkgc3RhdGlzdGljcyBjb25kaXRpb25hbCBvbiB0aGUgUENSIGR1cGxpY2F0aW9uIGxldmVsICpyKgoKYGBge3J9CiBkYXRhX2xpc3QkcmVhZHNfZGlzdF9zdW1tYXJ5JGNvbmRpdGlvbmFsCmBgYAoKCiMjIyAgVGhlIG1vbGVjdWxhciBwcm9wb3J0aW9ucyBjb21wbGV4aXR5IHByb2ZpbGUKCmBgYHtyfQogZGF0YV9saXN0JHBpX3JfaGF0CmBgYAoKCiMjIyBUaGUgbW9sZWN1bGFyIHByb3BvcnRpb25zIGNvbXBsZXhpdHkgcHJvZmlsZSBwbG90IAoKYGBge3IgZmlnLmhlaWdodD05LCBmaWcud2lkdGg9MTUsIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CnBfcmVhZCA8LSBwbG90X21vbGVjdWxlc19kaXN0cmlidXRpb25zKGRhdGFfbGlzdCwgZGF0YXNldF9uYW1lLCB4X2xpbT0xMjApCnBsb3RfZ3JpZChwX3JlYWQkcCwgCiAgICAgICAgICBwX3JlYWQkbGVnZW5kLAogICAgICAgICAgbmNvbD0yLAogICAgICAgICAgcmVsX3dpZHRocz1jKDEsIDAuMSkpCmBgYAoKCiMjIyBPdXRjb21lIGNvdW50cyBkYXRhdGFibGUKCmBgYHtyIH0KZGF0YV9saXN0JG91dGNvbWVfY291bnRzCmBgYAoKCgojIyMgQ2hpbWVyYSBjb3VudHMgZGF0YXRhYmxlCgoKYGBge3J9CmRhdGFfbGlzdCRmaXRfb3V0JGNoaW1lcmFfY291bnRzCmBgYAoKIyMgMi4gRXN0aW1hdGluZyB0aGUgc2FtcGxlIGluZGV4IGhvcHBpbmcgcmF0ZSAoKlNJSFIqKQoKCiMjIyBHTE0gZml0IGVzdGltYXRlcwoKYGBge3J9CmRhdGFfbGlzdCRmaXRfb3V0JGdsbV9lc3RpbWF0ZXMKYGBgCgojIyMgTW9kZWwgZml0IHBsb3QgCgpgYGB7ciBmaWcuaGVpZ2h0PTksIGZpZy53aWR0aD0xNSwgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KcF9maXQgPC0gcGxvdF9maXQoZGF0YV9saXN0LCBkYXRhc2V0X25hbWUpCgpwbG90X2dyaWQocF9maXQkcCwKICAgICAgICAgIHBfZml0JGxlZ2VuZCwKICAgICAgICAgIG5jb2w9MiwKICAgICAgICAgIHJlbF93aWR0aHM9YygxLCAwLjIpKQpgYGAKCgojIyAzLiBQdXJnaW5nIHBoYW50b20gbW9sZWN1bGVzCgoKIyMjIFRoZSBvcHRpbWFsIGN1dG9mZiBmb3IgbWluaW1pemluZyB0aGUgZmFsc2UgcG9zaXRpdmUgcmF0ZQoKYGBge3J9CmRhdGFfbGlzdCRvcHRpbWFsX2N1dG9mZgpgYGAKCgojIyMgUGxvdCBvZiB0aGUgZW1waXJpY2FsIENERiBvZiAqcXMqIAoKCgpgYGB7ciBmaWcuaGVpZ2h0PTksIGZpZy53aWR0aD0xNSwgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KcF9wb3N0IDwtcGxvdF9wb3N0ZXJpb3JfcHJvYihkYXRhX2xpc3QsIGRhdGFzZXRfbmFtZSkKcGxvdF9ncmlkKHBfcG9zdCRwLAogICAgICAgICAgcF9wb3N0JGxlZ2VuZCwKICAgICAgICAgIG5jb2w9MiwKICAgICAgICAgIHJlbF93aWR0aHM9YygxLCAwLjEpKQpgYGAKCk5vdGUgdGhhdCAqcSogaXMgdGhlIG1hcmdpbmFsIHBvc3RlcmlvciBkaXN0cmlidXRpb24gb2YgcHJlZGljdGVkIHNhbXBsZSBvZiBvcmlnaW4gKipzKiogYW5kICpxcyogaXMgYSB0cmFuZm9ybWF0aW9uIG9mICpxKi4KCiMjIDQuIEV4YW1pbmluZyB0aGUgY29uc2VxdWVuY2VzIG9mIGluZGV4IGhvcHBpbmcgCgpGaXJzdCB3ZSBleGFtaW5nIHRoZSBleHRlbnQgb2YgdGhlIGVmZmVjdHMgb2YgaW5kZXggaG9wcGluZyBvbiBpbmRpdmlkdWFsIHNhbXBsZXMgYW5kIHRoZW4gb24gY2VsbC1iYXJjb2Rlcy4KCiMjIyBUYWxseSBvZiBwcmVkaWN0ZWQgcGhhbnRvbXMgYnkgc2FtcGxlCgpIZXJlICpyX2EqIGlzIHRoZSBudW1iZXIgb2YgbW9sZWN1bGVzIGFib3ZlIHRoZSBvcHRpbWFsIGN1dG9mZiBxLCB3aGljaCB3ZSBwcmVkaWN0IGFzIHJlYWwgbW9sZWN1bGVzLiAqcl9hKiBhcmUgdGhlIG51bWJlciBvZiBtb2xlY3VsZXMgYmVsb3cgb3IgZXF1YWwgdG8gY3V0b2ZmIGFuZCB0aHVzIHdlIHByZWRpY3QgYXMgcGhhbnRvbXMuICpmKiBpcyB0aGUgbnVtYmVyIG9mIG1vbGVjdWxlcyBwcmVkaWN0ZWQgYXMgcGhhbnRvbSBubyBtYXR0ZXIgd2hhdCB0aGUgdGhyZXNob2xkIGlzLiAqbSogaXMgdGhlIG51bWJlciBvZiB0b3RhbCBtb2xlY3VsZXMuIAoKYGBge3J9CmRhdGFfbGlzdCRyZWFkc19kaXN0X3N1bW1hcnkkbWFyZ2luYWwKYGBgCgojIyMgVGFsbHkgb2YgcHJlZGljdGVkIHBoYW50b21zIGluIGNhbGxlZCBjZWxscyAKClRoZSBjYWxsZWQgY2VsbHMgd2VyZSBkZXRlcm1pbmVkIGZyb20gdGhlIHVucHVyZ2VkIGRhdGEgaW4gb3JkZXIgdG8gc2hvdyB0aGUgbGV2ZWwgb2YgY29udGFtaW5hdGlvbiBieSBwaGFudG9tIG1vbGVjdWxlcyBpZiBkYXRhIHdlcmUgbm90IHB1cmdlZC4KCmBgYHtyfQpkYXRhX2xpc3QkcmVhZHNfZGlzdF9zdW1tYXJ5JG1hcmdpbmFsX2NhbGxlZF9jZWxscwpgYGAKCgojIyMgVGFsbHkgb2YgYmFyY29kZXMgYnkgY29uY29yZGFuY2UgYmV0d2VlbiBwdXJnZWQgYW5kIHVucHVyZ2VkIGRhdGEKCmBgYHtyfQpkYXRhX2xpc3QkY2FsbGVkX2NlbGxzX3RhbGx5CmBgYAoKIyMjIFRhYmxlIG9mIGNhbGxlZCBjZWxsLWJhcmNvZGVzIHdpdGggdGhlIGhpZ2hlc3QgbnVtYmVyIG9mIHBoYW50b21zCgpgYGB7cn0KZGF0YV9saXN0JHVtaV9jb3VudHNfY2VsbCAlPiUgCiAgbWFwKGxpc3QoImNhbGxlZF9jZWxscyIpKQpgYGAKCiMjIyBUYWJsZSBvZiBiYWNrZ3JvdW5kIGNlbGwtYmFyY29kZXMgd2l0aCBoaWdoZXN0IG51bWJlciBvZiBwaGFudG9tcwoKYGBge3J9CmRhdGFfbGlzdCR1bWlfY291bnRzX2NlbGwgJT4lIAogIG1hcChsaXN0KCJiYWNrZ3JvdW5kX2NlbGxzIikpCmBgYAoKCgojIFNlc3Npb24gSW5mbwoKYGBge3J9CiMgbWVtb3J5IHVzYWdlCmdjKCkKYGBgCgpgYGB7cn0Kc2Vzc2lvbkluZm8oKQpgYGAK