knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
fig.width=15,
digit=5,
scipen=8)
options(digits=5,
scipen=8,
future.globals.maxSize = +Inf)
dataset_name <- commandArgs(trailingOnly=T)[1]
#dataset_name <- "hiseq4000"
message(sprintf("Dataset name: %s", dataset_name))
Dataset name: hiseq4000
project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
project_dir <- getwd()
warning(sprintf("No rstudio project root file found.
Setting project directory to current workflow.Rmd file location: %s.
Override if needed.",
project_dir))
}
message(sprintf("Project directory: %s",
project_dir))
Project directory: /project/6007998/rfarouni/index_hopping
#library(DropletUtils)
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
library(rhdf5)
plan(multiprocess)
code_dir <- file.path(project_dir, "code")
source(file.path(code_dir, "analysis_functions.R"))
source(file.path(code_dir, "io_functions.R"))
source(file.path(code_dir, "workflow_functions.R"))
source(file.path(code_dir, "plotting_functions.R"))
Estimate the sample index hopping probability, infer the true sample of origin, and find the optimal posterior probability threshold for retaining predicted real molecules.
max_fpr <- NULL # manually set the maximum false positive rate (not recommended)
data_list <- run_workflow(dataset_name,
project_dir,
max_fpr=max_fpr)
Step 1: reading read counts from existing file: 81.69 sec elapsed
Step 2: creating outcome counts datatable with grouping vars: 18.807 sec elapsed
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.178 sec elapsed
Step 4: computing read counts distribution statistics: 0.364 sec elapsed
Step 5: estimating pi_r matrix: 0.114 sec elapsed
Step 6: infering the true sample of origin: 7.63 sec elapsed
Step 7: estimating g and computing classification metrics: 0.192 sec elapsed
Step 8: determining the optimal cutoff: 0.066 sec elapsed
Step 9: computing proportion of nonmissingness and updating summary data list: 0.025 sec elapsed
Step 10.1: reassigning reads to sample of origin: 3.816 sec elapsed
Step 10.2: deduplicating read counts: 0.024 sec elapsed
Step 10.3: reassigning hopped reads and deduplicating read counts: 3.933 sec elapsed
Step 10.4: labelling phantom molecules below cutoff: 0.001 sec elapsed
Step 10.5: adding cell-umi-gene labels: 11.497 sec elapsed
Step 10.6: tallying molecule counts by cell-barcode and gene ID: 5.69 sec elapsed
Step 10.7: tranforming cell-gene molecule tally table into long format: 31.882 sec elapsed
Step 10: purging phantoms at q cutoff of 0.923593. Max-FPR threshold user-set FALSE: 77.261 sec elapsed
Using defaultDrops cell calling instead.
EmptyDrops function call failed for purged sample B1
Warning in smooth.spline(x[new.keep], y[new.keep], df = df, ...): not using
invalid df; must have 1 < df <= n := #{unique x} = 7
Step 11: calling cells: 915.338 sec elapsed
Step 12: saving purged data: 10.691 sec elapsed
Step 13: tallying molecules by cell-barcode: 34.187 sec elapsed
Step 14: saving results: 5.529 sec elapsed
Running workflow: 1152.219 sec elapsed
data_list$read_counts
cell <chr> | gene <chr> | umi <int> | A1 <dbl> | A2 <dbl> | B1 <dbl> | B2 <dbl> | C1 <dbl> | C2 <dbl> | D1 <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
AAACCTGAGAAACCAT | Csn3 | 468967 | 21 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Akt1 | 410924 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Ap1s1 | 43591 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Sdc4 | 1012503 | 13 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Txn1 | 621442 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Nfe2l3 | 355204 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Nucb2 | 655558 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Mrpl54 | 648997 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Higd1a | 325897 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | Rps21 | 476545 | 11 | 0 | 0 | 0 | 0 | 0 | 0 |
data_list$reads_dist_summary$summary_stats
n_obs <dbl> | p_chimeras <dbl> | g <dbl> | u <dbl> | n_reads <int> |
---|---|---|---|---|
53636695 | 0.071033 | 0.0014753 | 0.081685 | 626168518 |
data_list$reads_dist_summary$conditional
r <int> | n_obs <dbl> | m_bar <dbl> | A1 <dbl> | A2 <dbl> | B1 <dbl> | B2 <dbl> | C1 <dbl> | |
---|---|---|---|---|---|---|---|---|
1 | 9047808 | 0.168686903621 | 0.08854587 | 0.13668040 | 0.01104124 | 0.01182397 | 0.16590029 | |
2 | 5560697 | 0.103673371374 | 0.07628459 | 0.13479785 | 0.00488446 | 0.00523514 | 0.16891579 | |
3 | 4569143 | 0.085186885583 | 0.07537278 | 0.13643769 | 0.00449421 | 0.00481498 | 0.16929148 | |
4 | 4003836 | 0.074647328662 | 0.07780563 | 0.13891353 | 0.00451211 | 0.00478660 | 0.17149541 | |
5 | 3537608 | 0.065954995922 | 0.08142869 | 0.14412467 | 0.00460481 | 0.00481060 | 0.17883740 | |
6 | 3104308 | 0.057876571254 | 0.08672137 | 0.15344568 | 0.00470234 | 0.00480220 | 0.19097434 | |
7 | 2706616 | 0.050462020451 | 0.09382523 | 0.16800668 | 0.00478046 | 0.00471569 | 0.21056821 | |
8 | 2354663 | 0.043900225396 | 0.10159978 | 0.18659363 | 0.00473306 | 0.00458675 | 0.23413807 | |
9 | 2057821 | 0.038365917214 | 0.11124966 | 0.20804725 | 0.00473597 | 0.00445709 | 0.25803945 | |
10 | 1811641 | 0.033776148959 | 0.12122545 | 0.23141434 | 0.00465506 | 0.00422346 | 0.27841995 |
data_list$pi_r_hat
r <int> | A1 <dbl> | A2 <dbl> | B1 <dbl> | B2 <dbl> | C1 <dbl> | C2 <dbl> | |
---|---|---|---|---|---|---|---|
1 | 0.08817981778 | 0.13679768516 | 0.00989693776 | 0.01068752846 | 0.16631098745 | 0.46492653892 | |
2 | 0.07579542309 | 0.13489623364 | 0.00367833717 | 0.00403253389 | 0.16935675907 | 0.51375076673 | |
3 | 0.07487445336 | 0.13655254366 | 0.00328416416 | 0.00360816014 | 0.16973623060 | 0.52109914440 | |
4 | 0.07733173915 | 0.13905324260 | 0.00330224904 | 0.00357949203 | 0.17196228770 | 0.51652246846 | |
5 | 0.08099117936 | 0.14431671166 | 0.00339587586 | 0.00360373103 | 0.17937799546 | 0.50000041631 | |
6 | 0.08633699776 | 0.15373131787 | 0.00349438502 | 0.00359524898 | 0.19163681003 | 0.46955499826 | |
7 | 0.09351218883 | 0.16843852643 | 0.00357328863 | 0.00350787639 | 0.21142743033 | 0.42192178496 | |
8 | 0.10136481325 | 0.18721211694 | 0.00352541246 | 0.00337763792 | 0.23523396367 | 0.36266403683 | |
9 | 0.11111158912 | 0.20888115667 | 0.00352835614 | 0.00324667396 | 0.25937534646 | 0.29537482014 | |
10 | 0.12118755094 | 0.23248288034 | 0.00344663543 | 0.00301070395 | 0.27996049564 | 0.22811960714 |
p_read <- plot_molecules_distributions(data_list, dataset_name, x_lim=120)
plot_grid(p_read$p,
p_read$legend,
ncol=2,
rel_widths=c(1, 0.1))
data_list$outcome_counts
outcome <chr> | n <int> | q <dbl> | qs <dbl> | j <dbl> | o <dbl> | FPR <dbl> | FNR <dbl> | r <int> | k_chimera <int> | |
---|---|---|---|---|---|---|---|---|---|---|
0,0,0,0,0,0,6,0 | 248029 | 1 | 0 | 0.0046311 | 0.0046242 | 0 | 0.99537 | 6 | 1 | |
0,0,0,0,0,6,0,0 | 1383880 | 1 | 0 | 0.0304702 | 0.0304252 | 0 | 0.96953 | 6 | 1 | |
0,0,0,0,6,0,0,0 | 568918 | 1 | 0 | 0.0410927 | 0.0410321 | 0 | 0.95891 | 6 | 1 | |
0,6,0,0,0,0,0,0 | 456096 | 1 | 0 | 0.0496087 | 0.0495355 | 0 | 0.95039 | 6 | 1 | |
6,0,0,0,0,0,0,0 | 256555 | 1 | 0 | 0.0543990 | 0.0543187 | 0 | 0.94560 | 6 | 1 | |
0,0,0,0,0,0,0,7 | 24568 | 1 | 0 | 0.0548577 | 0.0547768 | 0 | 0.94514 | 7 | 1 | |
0,0,0,0,0,0,7,0 | 227376 | 1 | 0 | 0.0591032 | 0.0590160 | 0 | 0.94090 | 7 | 1 | |
0,0,0,0,0,7,0,0 | 1072636 | 1 | 0 | 0.0791309 | 0.0790142 | 0 | 0.92087 | 7 | 1 | |
0,0,0,0,7,0,0,0 | 542820 | 1 | 0 | 0.0892662 | 0.0891345 | 0 | 0.91073 | 7 | 1 | |
0,0,0,7,0,0,0,0 | 3824 | 1 | 0 | 0.0893376 | 0.0892058 | 0 | 0.91066 | 7 | 1 |
data_list$fit_out$chimera_counts
r <dbl> | 1 <dbl> | 2 <dbl> | 3 <dbl> | 4 <dbl> | 5 <dbl> | 6 <dbl> | 7 <dbl> | 8 <dbl> | n_obs <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 9047808 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9047808 | |
2 | 5474096 | 86601 | 0 | 0 | 0 | 0 | 0 | 0 | 5560697 | |
3 | 4458826 | 109716 | 601 | 0 | 0 | 0 | 0 | 0 | 4569143 | |
4 | 3872966 | 129841 | 1025 | 4 | 0 | 0 | 0 | 0 | 4003836 | |
5 | 3390791 | 145127 | 1681 | 8 | 1 | 0 | 0 | 0 | 3537608 | |
6 | 2948201 | 153805 | 2286 | 14 | 2 | 0 | 0 | 0 | 3104308 | |
7 | 2546395 | 157412 | 2787 | 19 | 3 | 0 | 0 | 0 | 2706616 | |
8 | 2195644 | 155628 | 3361 | 30 | 0 | 0 | 0 | 0 | 2354663 | |
9 | 1901978 | 152055 | 3744 | 42 | 2 | 0 | 0 | 0 | 2057821 | |
10 | 1659600 | 147805 | 4196 | 40 | 0 | 0 | 0 | 0 | 1811641 |
data_list$fit_out$glm_estimates
max_r <int> | phat <dbl> | phat_low <dbl> | phat_high <dbl> | SIHR <dbl> | SBIHR <dbl> |
---|---|---|---|---|---|
25 | 0.9913 | 0.99129 | 0.99131 | 0.0086988 | 0.0096309 |
p_fit <- plot_fit(data_list, dataset_name)
plot_grid(p_fit$p,
p_fit$legend,
ncol=2,
rel_widths=c(1, 0.2))
data_list$optimal_cutoff
cutoff <chr> | outcome <chr> | q <dbl> | s <int> | FPR <dbl> | j <dbl> | qs <dbl> | o <dbl> | FP <dbl> | TP <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
optimal | 0,0,0,1,1,0,0,1 | 0.92359 | 5 | 0.013063 | 0.98299 | 148.83 | 0.99567 | 0.0010864 | 0.99458 | |
above | 1,1,2,0,0,0,0,0 | 0.92377 | 3 | 0.013063 | 0.98299 | 148.82 | 0.99567 | 0.0010864 | 0.99458 | |
below | 0,1,2,0,0,0,1,0 | 0.92281 | 3 | 0.013064 | 0.98299 | 148.88 | 0.99567 | 0.0010864 | 0.99458 | |
none | 0,1,0,0,1,0,1,0 | 0.43691 | 5 | 0.019432 | 0.98043 | 157.51 | 1.00000 | 0.0016159 | 0.99838 |
p_post <-plot_posterior_prob(data_list, dataset_name)
plot_grid(p_post$p,
p_post$legend,
ncol=2,
rel_widths=c(1, 0.1))
Note that q is the marginal posterior distribution of predicted sample of origin s and qs is a tranformation of q.
First we examing the extent of the effects of index hopping on individual samples and then on cell-barcodes.
Here r_a is the number of molecules above the optimal cutoff q, which we predict as real molecules. r_a are the number of molecules below or equal to cutoff and thus we predict as phantoms. f is the number of molecules predicted as phantom no matter what the threshold is. m is the number of total molecules.
data_list$reads_dist_summary$marginal
sample <chr> | m <dbl> | r_a <dbl> | r_b <dbl> | f <dbl> | prop_m <dbl> | prop_reads <dbl> | FRM <dbl> |
---|---|---|---|---|---|---|---|
A1 | 7.0267 | 0.966197 | 0.000082257 | 0.033720 | 0.121113 | 0.13053 | 11.6320 |
A2 | 9.5920 | 0.977478 | 0.000322143 | 0.022200 | 0.165328 | 0.13100 | 8.5519 |
B1 | 1.8203 | 0.080265 | 0.055437778 | 0.864297 | 0.031375 | 0.12821 | 44.1022 |
B2 | 1.7513 | 0.114344 | 0.061728162 | 0.823928 | 0.030186 | 0.14004 | 50.0703 |
C1 | 10.4565 | 0.978007 | 0.000479987 | 0.021513 | 0.180229 | 0.12185 | 7.2968 |
C2 | 18.6465 | 0.987370 | 0.000632988 | 0.011997 | 0.321391 | 0.12054 | 4.0478 |
D1 | 6.5648 | 0.968402 | 0.000246464 | 0.031351 | 0.113152 | 0.10675 | 10.1821 |
D2 | 2.1598 | 0.878683 | 0.000602845 | 0.120714 | 0.037226 | 0.12108 | 35.1032 |
The called cells were determined from the unpurged data in order to show the level of contamination by phantom molecules if data were not purged.
data_list$reads_dist_summary$marginal_called_cells
sample <chr> | m <dbl> | r_a <dbl> | r_b <dbl> | f <dbl> | prop_m <dbl> | prop_reads <dbl> | FRM <dbl> |
---|---|---|---|---|---|---|---|
A1 | 6.17147 | 0.98523 | 0.000077615 | 0.0146895 | 0.121755 | 0.13053 | 13.2441 |
A2 | 8.57668 | 0.99044 | 0.000233190 | 0.0093227 | 0.169206 | 0.13100 | 9.5643 |
B1 | 1.25210 | 0.08420 | 0.050911269 | 0.8648886 | 0.024702 | 0.12821 | 64.1154 |
B2 | 0.98602 | 0.14419 | 0.058466478 | 0.7973435 | 0.019453 | 0.14004 | 88.9336 |
C1 | 9.42001 | 0.99066 | 0.000349150 | 0.0089913 | 0.185844 | 0.12185 | 8.0997 |
C2 | 16.84006 | 0.99414 | 0.000525770 | 0.0053322 | 0.332231 | 0.12054 | 4.4820 |
D1 | 5.72473 | 0.98823 | 0.000192673 | 0.0115771 | 0.112941 | 0.10675 | 11.6763 |
D2 | 1.71670 | 0.96302 | 0.000499214 | 0.0364793 | 0.033868 | 0.12108 | 44.1628 |
data_list$called_cells_tally
barcode <chr> | A1 <dbl> | A2 <dbl> | B1 <dbl> | B2 <dbl> | C1 <dbl> | C2 <dbl> | D1 <dbl> | D2 <dbl> |
---|---|---|---|---|---|---|---|---|
consensus_background | 138117 | 156898 | 27478 | 32290 | 168474 | 232402 | 119471 | 82288 |
transition_cell | 14 | 13 | 159 | 1 | 0 | 5 | 15 | 7 |
phantom_background | 12478 | 11203 | 82361 | 78194 | 11750 | 10087 | 11611 | 15912 |
transition_background | 137 | 105 | 930 | 661 | 125 | 153 | 64 | 28 |
consensus_cell | 401 | 565 | 34 | 14 | 731 | 1321 | 661 | 165 |
phantom_cell | 0 | 0 | 270 | 249 | 0 | 0 | 0 | 0 |
data_list$umi_counts_cell %>%
map(list("called_cells"))
$A1
# A tibble: 538 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CTCACACGTCTCTCGT 8325 1084 7238 3
2 GGCTGGTGTCCCTACT 6661 888 5773 0
3 TCGTACCGTCCATGAT 5765 804 4960 1
4 CATGCCTAGTGCTGCC 5496 805 4691 0
5 TGCACCTTCTGGCGAC 4967 676 4290 1
6 CACACAAAGGCGATAC 3464 541 2922 1
7 GTGCAGCCAAGCGAGT 3020 370 2650 0
8 GGAATAAGTCAGGACA 2961 443 2518 0
9 CTAGCCTTCCGCGCAA 2717 411 2306 0
10 AAGCCGCGTGCTCTTC 2501 363 2137 1
# … with 528 more rows
$A2
# A tibble: 670 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CTCACACGTCTCTCGT 7834 1192 6637 5
2 GGCTGGTGTCCCTACT 6399 970 5428 1
3 TCGTACCGTCCATGAT 5485 880 4602 3
4 CATGCCTAGTGCTGCC 5421 827 4593 1
5 TGCACCTTCTGGCGAC 4687 757 3930 0
6 CACACAAAGGCGATAC 3280 530 2749 1
7 GTGCAGCCAAGCGAGT 2841 404 2437 0
8 GGAATAAGTCAGGACA 2869 476 2392 1
9 CTAGCCTTCCGCGCAA 2628 425 2202 1
10 AAGCCGCGTGCTCTTC 2533 369 2164 0
# … with 660 more rows
$B1
# A tibble: 1,234 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CTCACACGTCTCTCGT 23426 94 21976 1356
2 TCGTACCGTCCATGAT 16849 59 15913 877
3 TGCACCTTCTGGCGAC 13898 64 13038 796
4 GTGCAGCCAAGCGAGT 8279 26 7839 414
5 GAAACTCAGTTCCACA 7856 20 7597 239
6 CTAGCCTTCCGCGCAA 7648 30 7163 455
7 AGCAGCCGTTAAGGGC 5905 16 5528 361
8 TGGCCAGTCATGTCTT 4906 11 4755 140
9 GCATGTATCCCAGGTG 4308 11 4183 114
10 GTGCTTCAGGCATTGG 4303 10 4166 127
# … with 1,224 more rows
$B2
# A tibble: 924 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GGCTGGTGTCCCTACT 15818 52 14699 1067
2 CATGCCTAGTGCTGCC 12752 34 11739 979
3 CACACAAAGGCGATAC 8024 28 7354 642
4 GAAACTCAGTTCCACA 7246 13 6988 245
5 GGAATAAGTCAGGACA 6611 18 6101 492
6 AAGCCGCGTGCTCTTC 6047 22 5592 433
7 CATCGAAAGTCAATAG 4913 10 4530 373
8 TGGCCAGTCATGTCTT 4521 12 4335 174
9 GTGCTTCAGGCATTGG 4036 5 3916 115
10 GCATGTATCCCAGGTG 3977 4 3847 126
# … with 914 more rows
$C1
# A tibble: 856 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CTCACACGTCTCTCGT 7983 1099 6880 4
2 GGCTGGTGTCCCTACT 6334 852 5481 1
3 TCGTACCGTCCATGAT 5530 718 4810 2
4 CATGCCTAGTGCTGCC 5396 776 4619 1
5 TGCACCTTCTGGCGAC 4715 608 4105 2
6 CACACAAAGGCGATAC 3351 516 2835 0
7 GTGCAGCCAAGCGAGT 2941 373 2568 0
8 GGAATAAGTCAGGACA 2852 432 2419 1
9 CTAGCCTTCCGCGCAA 2597 344 2252 1
10 AAGCCGCGTGCTCTTC 2485 337 2148 0
# … with 846 more rows
$C2
# A tibble: 1,474 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CTCACACGTCTCTCGT 8207 1285 6916 6
2 GGCTGGTGTCCCTACT 6761 978 5783 0
3 TCGTACCGTCCATGAT 5509 838 4668 3
4 CATGCCTAGTGCTGCC 5478 826 4647 5
5 TGCACCTTCTGGCGAC 4786 742 4040 4
6 CACACAAAGGCGATAC 3515 515 3000 0
7 GTGCAGCCAAGCGAGT 2849 391 2458 0
8 GGAATAAGTCAGGACA 2877 474 2403 0
9 AAGCCGCGTGCTCTTC 2588 399 2189 0
10 CTAGCCTTCCGCGCAA 2608 441 2164 3
# … with 1,464 more rows
$D1
# A tibble: 725 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CTCACACGTCTCTCGT 7496 1035 6460 1
2 GGCTGGTGTCCCTACT 5953 786 5164 3
3 TCGTACCGTCCATGAT 5054 699 4354 1
4 CATGCCTAGTGCTGCC 5060 735 4325 0
5 TGCACCTTCTGGCGAC 4437 639 3797 1
6 CACACAAAGGCGATAC 3188 500 2688 0
7 GTGCAGCCAAGCGAGT 2707 337 2370 0
8 GGAATAAGTCAGGACA 2657 390 2267 0
9 AAGCCGCGTGCTCTTC 2311 315 1995 1
10 CTAGCCTTCCGCGCAA 2344 352 1992 0
# … with 715 more rows
$D2
# A tibble: 193 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CTCACACGTCTCTCGT 8986 1146 7832 8
2 GGCTGGTGTCCCTACT 7238 911 6326 1
3 TCGTACCGTCCATGAT 6015 759 5253 3
4 CATGCCTAGTGCTGCC 6039 841 5194 4
5 TGCACCTTCTGGCGAC 5328 728 4595 5
6 CACACAAAGGCGATAC 3797 545 3250 2
7 GTGCAGCCAAGCGAGT 3315 392 2921 2
8 GGAATAAGTCAGGACA 3158 467 2690 1
9 CTAGCCTTCCGCGCAA 2964 395 2564 5
10 AAGCCGCGTGCTCTTC 2936 378 2555 3
# … with 183 more rows
data_list$umi_counts_cell %>%
map(list("background_cells"))
$A1
# A tibble: 150,609 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GAGGTGATCCAAGCCG 544 78 466 0
2 TGCGCAGCACGTGAGA 298 38 260 0
3 CTACACCTCGCCTGAG 307 48 259 0
4 TGTTCCGAGTACGTAA 293 44 249 0
5 CACTCCAAGCTAAACA 285 40 245 0
6 AAAGCAAGTGAACCTT 256 28 228 0
7 CCATGTCCACGAGGTA 258 31 227 0
8 CAGATCAGTGCTCTTC 259 33 226 0
9 AAACGGGAGTGGGCTA 267 44 223 0
10 TGTGGTATCTCCGGTT 249 31 218 0
# … with 150,599 more rows
$A2
# A tibble: 168,114 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GAGGTGATCCAAGCCG 506 67 439 0
2 TGAAAGACATCCGTGG 372 43 324 5
3 ACACCAACAATGCCAT 379 64 309 6
4 ACGGCCACAAGGACAC 343 50 290 3
5 CCAGCGATCTGTTTGT 333 44 285 4
6 AACTCAGTCCGCTGTT 350 64 283 3
7 ATAGACCAGTACGTAA 325 48 273 4
8 GCACATAAGTACGTAA 325 54 270 1
9 TTTACTGAGTAGGTGC 298 36 258 4
10 GAGTCCGAGCGATATA 305 46 254 5
# … with 168,104 more rows
$B1
# A tibble: 109,998 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 CACACTCTCTGGCGAC 359 1 348 10
2 ACATACGAGATCCGAG 360 0 347 13
3 AACTTTCTCTCTAGGA 359 1 346 12
4 AGGCCACCAGCTCCGA 353 0 345 8
5 CTGAAACCAAACCCAT 356 0 345 11
6 CTACCCAAGACTGGGT 354 2 344 8
7 GAAGCAGAGTCATCCA 359 1 344 14
8 GCGCAGTGTAGCAAAT 360 0 344 16
9 TCGTAGACACCTTGTC 353 0 344 9
10 TGAAAGACAATGAATG 360 1 344 15
# … with 109,988 more rows
$B2
# A tibble: 110,485 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 TACACGACAGGCGATA 1785 2 1728 55
2 CGAGCACAGTGAAGTT 1820 4 1722 94
3 GAGGTGATCCAAGCCG 1719 6 1640 73
4 TCTGAGACACACCGAC 1683 1 1612 70
5 ACTGAACTCCCGACTT 1621 4 1565 52
6 TTTATGCAGTTAAGTG 1605 4 1546 55
7 GCTCCTATCCTAGAAC 1536 5 1479 52
8 GGAACTTCATTTGCTT 1544 5 1478 61
9 ACATGGTGTCGTTGTA 1519 7 1460 52
10 GTTCATTTCTTGAGAC 1435 1 1380 54
# … with 110,475 more rows
$C1
# A tibble: 180,224 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GAGGTGATCCAAGCCG 581 81 500 0
2 GGAATAATCTGGGCCA 394 55 339 0
3 TATCTCAGTCGCGAAA 344 41 303 0
4 TGAAAGACATCCGTGG 344 38 302 4
5 AACTCAGTCCGCTGTT 342 40 298 4
6 CCAGCGATCTGTTTGT 377 81 295 1
7 ACGGCCACAAGGACAC 346 51 292 3
8 GTGCTTCGTCCGTCAG 334 40 292 2
9 TCAATCTTCCAGAGGA 334 48 285 1
10 CACAAACGTGTGACGA 330 50 272 8
# … with 180,214 more rows
$C2
# A tibble: 242,494 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GAGGTGATCCAAGCCG 597 100 497 0
2 AACCATGTCAGCGACC 412 74 338 0
3 CTGGTCTTCGGTTCGG 354 29 325 0
4 TGAAAGACATCCGTGG 376 58 313 5
5 CCACCTAGTCGATTGT 351 44 307 0
6 ACGGCCACAAGGACAC 439 130 304 5
7 ATCACGACATGCTAGT 348 53 295 0
8 TTCGAAGCAAGGTGTG 337 44 293 0
9 CAGGTGCTCTGAAAGA 330 43 287 0
10 TATCTCAGTCGCGAAA 328 43 285 0
# … with 242,484 more rows
$D1
# A tibble: 131,097 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GAGGTGATCCAAGCCG 513 88 424 1
2 GATCGTAAGATACACA 401 48 353 0
3 TTTCCTCCATCACGAT 375 42 328 5
4 ACTGATGTCAATCACG 419 98 321 0
5 GGAGCAATCCGCATCT 358 49 308 1
6 CAGCAGCCAGAGCCAA 347 46 300 1
7 TTCGAAGCAACAACCT 348 49 299 0
8 ACACCAACAATGCCAT 336 46 286 4
9 ACGGCCACAAGGACAC 327 38 285 4
10 GCACATAAGTACGTAA 330 48 278 4
# … with 131,087 more rows
$D2
# A tibble: 98,207 x 5
cell m r_a f r_b
<chr> <int> <dbl> <int> <dbl>
1 GAGGTGATCCAAGCCG 577 74 502 1
2 AGAATAGTCATTCACT 525 75 450 0
3 ACTGATGTCAATCACG 554 127 427 0
4 CTACACCCACGCGAAA 464 69 394 1
5 CAGCAGCCAGAGCCAA 446 53 393 0
6 ACACCAACAATGCCAT 440 52 388 0
7 ACGGCCACAAGGACAC 420 46 373 1
8 ATAGACCAGTACGTAA 410 42 368 0
9 CGTAGGCCAACACCTA 415 49 366 0
10 AACTCAGTCCGCTGTT 415 49 365 1
# … with 98,197 more rows
# memory usage
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 6250248 333.8 13096221 699.5 13096221 699.5
Vcells 641290929 4892.7 2112754741 16119.1 3286988887 25077.8
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] rhdf5_2.26.2 cowplot_0.9.4 data.table_1.12.0
[4] tictoc_1.0 furrr_0.1.0 future_1.11.1.1
[7] broom_0.5.1 matrixStats_0.54.0 forcats_0.4.0
[10] stringr_1.4.0 dplyr_0.8.0.1 purrr_0.3.1
[13] readr_1.3.1 tidyr_0.8.3 tibble_2.0.1
[16] ggplot2_3.1.0 tidyverse_1.2.1 rmarkdown_1.11
loaded via a namespace (and not attached):
[1] nlme_3.1-137 bitops_1.0-6
[3] lubridate_1.7.4 httr_1.4.0
[5] rprojroot_1.3-2 GenomeInfoDb_1.18.2
[7] tools_3.5.2 backports_1.1.3
[9] utf8_1.1.4 R6_2.4.0
[11] HDF5Array_1.10.1 lazyeval_0.2.1
[13] BiocGenerics_0.28.0 colorspace_1.4-0
[15] withr_2.1.2 tidyselect_0.2.5
[17] compiler_3.5.2 cli_1.0.1
[19] rvest_0.3.2 Biobase_2.42.0
[21] xml2_1.2.0 DelayedArray_0.8.0
[23] labeling_0.3 scales_1.0.0
[25] digest_0.6.18 XVector_0.22.0
[27] base64enc_0.1-3 pkgconfig_2.0.2
[29] htmltools_0.3.6 limma_3.38.3
[31] rlang_0.3.1 readxl_1.3.0
[33] rstudioapi_0.9.0 generics_0.0.2
[35] jsonlite_1.6 BiocParallel_1.16.6
[37] RCurl_1.95-4.12 magrittr_1.5
[39] GenomeInfoDbData_1.2.0 Matrix_1.2-15
[41] Rcpp_1.0.0 munsell_0.5.0
[43] S4Vectors_0.20.1 Rhdf5lib_1.4.2
[45] fansi_0.4.0 stringi_1.3.1
[47] yaml_2.2.0 edgeR_3.24.3
[49] MASS_7.3-51.1 SummarizedExperiment_1.12.0
[51] zlibbioc_1.28.0 plyr_1.8.4
[53] grid_3.5.2 parallel_3.5.2
[55] listenv_0.7.0 crayon_1.3.4
[57] lattice_0.20-38 haven_2.1.0
[59] hms_0.4.2 locfit_1.5-9.1
[61] knitr_1.21 pillar_1.3.1
[63] GenomicRanges_1.34.0 codetools_0.2-16
[65] stats4_3.5.2 glue_1.3.0
[67] evaluate_0.13 modelr_0.1.4
[69] cellranger_1.1.0 gtable_0.2.0
[71] assertthat_0.2.0 xfun_0.5
[73] DropletUtils_1.2.2 viridisLite_0.3.0
[75] SingleCellExperiment_1.4.1 IRanges_2.16.0
[77] globals_0.12.4