knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
fig.width=15,
digit=5,
scipen=8)
options(digits=5,
scipen=8,
future.globals.maxSize = +Inf)
dataset_name <- commandArgs(trailingOnly=T)[1]
#dataset_name <-"hiseq2500"
message(sprintf("Dataset name: %s", dataset_name))
Dataset name: hiseq2500
project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
project_dir <- getwd()
warning(sprintf("No rstudio project root file found.
Setting project directory to current workflow.Rmd file location: %s.
Override if needed.",
project_dir))
}
message(sprintf("Project directory: %s",
project_dir))
Project directory: /project/6007998/rfarouni/index_hopping
Each sample’s molecule_info.h5 file should be renamed to {sample_name}.h5 and placed in ../project_dir/data/{dataset_name}/input/. The purged UMI count matrices and other output files are saved to ../project_dir/data/{dataset_name}/output/.
code_dir <- file.path(project_dir, "code")
data_dir <- file.path(project_dir, "data",
dataset_name)
input_dir <- file.path(data_dir, "input")
output_dir <- file.path(data_dir, "output")
figures_dir <- file.path(output_dir, "figures")
read_counts_filepath <- file.path(output_dir,
sprintf("%s_read_counts.rds",
dataset_name))
results_filepath <- file.path(output_dir,
sprintf("%s_results.rds",
dataset_name))
Create directories if they don’t exist.
dir.create(output_dir)
Warning in dir.create(output_dir): '/project/6007998/rfarouni/
index_hopping/data/hiseq2500/output' already exists
dir.create(figures_dir)
Warning in dir.create(figures_dir): '/project/6007998/rfarouni/
index_hopping/data/hiseq2500/output/figures' already exists
Set the trade-off ratio cost cutoff (torc). The parameter torc represents the number of real molecules one is willing to incorrectly discard in order to correctly purge one phantom molecule. Since discarding a large proportion of the data is undesirable, reasonable values of torc are expected to be within the range of 1-5.
torc <- 3
library(rhdf5)
#library(DropletUtils) # install but not load
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
plan(multiprocess)
source(file.path(code_dir, "1_create_joined_counts_table.R"))
source(file.path(code_dir, "2_create_counts_by_outcome_table.R"))
source(file.path(code_dir, "3_estimate_sample_index_hopping_rate.R"))
source(file.path(code_dir, "4_compute_summary_statistics.R"))
source(file.path(code_dir, "5_reassign_hopped_reads.R"))
source(file.path(code_dir, "6_purge_phantom_molecules.R"))
source(file.path(code_dir, "7_call_cells.R"))
source(file.path(code_dir, "8_summarize_purge.R"))
source(file.path(code_dir, "9_plotting_functions.R"))
purge_phantoms <- function(input_dir,
output_dir,
read_counts_filepath = NULL,
torc = 3,
max_r = NULL) {
tic("Running workflow I")
tic("Step 1: loading molecule_info files and creating read counts datatable")
read_counts <- create_joined_counts(input_dir, read_counts_filepath)
toc()
sample_names <-
setdiff(
colnames(read_counts),
c("cell", "umi", "gene", "outcome")
)
S <- length(sample_names)
tic("Step 2: creating outcome counts datatable with grouping vars")
outcome_counts <- create_outcome_counts(read_counts, sample_names)
toc()
tic("Step 3: creating a chimera counts datatable and estimating hopping rate")
fit_out <-
estimate_hopping_rate(
outcome_counts,
S,
max_r = max_r
)
toc()
# compute_molecular_complexity_profile
tic("Step 4: compute molecular complexity profile and other summary statistics")
summary_stats <-
compute_summary_stats(
outcome_counts,
fit_out$glm_estimates$phat,
sample_names
)
toc()
tic("Step 5: reassign read counts, determine cutoff, and mark retained observations")
outcome_counts <-
reassign_reads_and_mark_retained_observations(
outcome_counts,
summary_stats,
sample_names,
fit_out,
torc
)
# get the tradoff ratio cutoff
summary_stats <- get_threshold(outcome_counts, summary_stats)
toc()
tic("Step 6: Purge and save read counts datatable to disk")
read_counts <-
left_join(read_counts %>%
select(outcome, cell, umi, gene, sample_names),
outcome_counts %>%
select(c("outcome", "retain", paste0(sample_names, "_hat"))),
by = c("outcome")
) %>%
select(-outcome)
purge_and_save_read_counts(
read_counts,
dataset_name,
sample_names,
output_dir
)
toc()
tic("Step 7: create umi counts matrices")
umi_counts_cell_gene <-
create_umi_counts(
read_counts,
sample_names
)
toc()
outcome_counts <-
outcome_counts %>%
arrange(-qr) %>%
select(-c(paste0(sample_names, "_hat")))
read_counts <-
read_counts %>%
select(-c("retain", paste0(sample_names, "_hat")))
summary_stats$sample_names <- sample_names
data_list <-
list(
umi_counts_cell_gene = umi_counts_cell_gene,
read_counts = read_counts,
outcome_counts = outcome_counts,
fit_out = fit_out,
summary_stats = summary_stats
)
toc()
return(data_list)
}
identify_rna_cells <- function(data_list, output_dir) {
tic("Running workflow II")
tic("Step 7: identify RNA-containing cells")
called_cells_out <- call_cells_all_samples(data_list$umi_counts_cell_gene, output_dir)
toc()
tic("Step 9: tallying molecules by cell-barcode")
umi_counts_cell <- map2(
called_cells_out$called_cells,
data_list$umi_counts_cell_gene,
get_umi_counts_cell
)
umi_counts_sample <-
map(umi_counts_cell,
map_dfr,
get_umi_counts_sample,
.id = "split"
) %>%
bind_rows(.id = "sample")
data_list$summary_stats <-
update_summary_stats(
data_list$summary_stats,
umi_counts_sample
)
toc()
data_list <-
c(
data_list,
list(
umi_counts_cell = umi_counts_cell,
called_cells_tally = called_cells_out$called_cells_tally
)
)
toc()
return(data_list)
}
Estimate the sample index hopping probability, infer the true sample of origin, and reassign reads.
data_list <- purge_phantoms(input_dir, output_dir, read_counts_filepath, torc=torc)
Step 1: loading molecule_info files and creating read counts datatable: 60.875 sec elapsed
Step 2: creating outcome counts datatable with grouping vars: 8.44 sec elapsed
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.358 sec elapsed
Step 4: compute molecular complexity profile and other summary statistics: 1.383 sec elapsed
Step 5: reassign read counts, determine cutoff, and mark retained observations: 0.782 sec elapsed
Step 6: Purge and save read counts datatable to disk: 100.268 sec elapsed
Step 7: create umi counts matrices: 53.312 sec elapsed
Running workflow I: 225.432 sec elapsed
data_list$read_counts
cell <chr> | umi <int> | gene <chr> | A1 <dbl> | A2 <dbl> | B1 <dbl> | B2 <dbl> | C1 <dbl> | C2 <dbl> | D1 <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
AAACCTGAGAAACCAT | 468967 | Csn3 | 18 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | 1012503 | Sdc4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | 621442 | Txn1 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | 648997 | Mrpl54 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | 325897 | Higd1a | 4 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | 476545 | Rps21 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | 776993 | Mlec | 6 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | 283231 | Nap1l1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | 5045 | Csn1s2a | 6 | 0 | 0 | 0 | 0 | 0 | 0 | |
AAACCTGAGAAACCAT | 174081 | Rps28 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
The datatable is ordered in descending order of qr, the posterior probability of incorrectly assigning s as the inferred sample of origin. n is the number of CUGs with the corresponding outcome and p_outcome is the observed marginal probability of that outcome.
data_list$outcome_counts
outcome <chr> | s <int> | qr <dbl> | r <int> | n <int> | o <dbl> | FP <dbl> | FN <dbl> | TP <dbl> | TN <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
1,0,0,0,0,0,1,0 | 7 | 4.9826e-01 | 2 | 252 | 1.0000000 | 1449 | 303 | 44061167 | 25944 | |
2,0,0,0,0,0,2,0 | 1 | 4.8863e-01 | 4 | 2 | 0.9999943 | 1324 | 430 | 44061040 | 26069 | |
0,1,0,0,1,0,0,0 | 5 | 4.3770e-01 | 2 | 62 | 0.9999942 | 1323 | 431 | 44061039 | 26070 | |
0,1,0,0,0,0,1,0 | 2 | 3.5989e-01 | 2 | 32 | 0.9999928 | 1296 | 465 | 44061004 | 26098 | |
1,1,0,0,0,0,0,0 | 2 | 3.5825e-01 | 2 | 35 | 0.9999921 | 1284 | 486 | 44060984 | 26109 | |
0,0,0,0,1,0,1,0 | 5 | 3.0441e-01 | 2 | 42 | 0.9999913 | 1271 | 508 | 44060962 | 26122 | |
1,0,0,0,1,0,0,0 | 5 | 3.0290e-01 | 2 | 36 | 0.9999904 | 1259 | 538 | 44060932 | 26134 | |
0,0,0,0,1,1,0,0 | 6 | 2.8719e-01 | 2 | 133 | 0.9999895 | 1248 | 563 | 44060907 | 26145 | |
0,1,0,0,0,1,0,0 | 6 | 2.3874e-01 | 2 | 101 | 0.9999865 | 1210 | 657 | 44060812 | 26184 | |
0,0,0,0,0,1,1,0 | 6 | 1.4988e-01 | 2 | 94 | 0.9999842 | 1185 | 734 | 44060736 | 26208 |
p_chimeras is the proportion CUGs that are chimeric. g is the estimated proportion of fugue molecules and u is the molecule inflation factor such that n_cugs x u would give the number of non-fugue phantom molecules. The estimated total number of phantom molecules present in the dataset is given by n_pm=n_cugs x (u+g).
data_list$summary_stats$summary_estimates
n_reads <int> | n_cugs <dbl> | n_pm <dbl> | u <dbl> | g <dbl> | frm <dbl> | p_chimeras <dbl> |
---|---|---|---|---|---|---|
268768037 | 44062616 | 27393 | 0.00059568 | 0.000026012 | 6.0997 | 0.00059245 |
data_list$summary_stats$conditional
r <int> | n_cugs <dbl> | p_cugs <dbl> | A1 <dbl> | A2 <dbl> | B1 <dbl> | B2 <dbl> | |
---|---|---|---|---|---|---|---|
1 | 11242464 | 0.255147447442 | 0.0813012165 | 0.1380801397 | 0.00198773 | 0.002577638 | |
2 | 7556220 | 0.171488229387 | 0.0835708198 | 0.1497099079 | 0.00122151 | 0.001662273 | |
3 | 5707051 | 0.129521383842 | 0.0963883098 | 0.1726053730 | 0.00114391 | 0.001429343 | |
4 | 4344844 | 0.098606129060 | 0.1151406702 | 0.2038855710 | 0.00110930 | 0.001212529 | |
5 | 3342437 | 0.075856526539 | 0.1395874926 | 0.2378879841 | 0.00097725 | 0.000912628 | |
6 | 2604471 | 0.059108406092 | 0.1690894875 | 0.2651573646 | 0.00084726 | 0.000795491 | |
7 | 2048065 | 0.046480785435 | 0.2028776988 | 0.2792064705 | 0.00070840 | 0.000676039 | |
8 | 1603535 | 0.036392187881 | 0.2391953559 | 0.2793755671 | 0.00067905 | 0.000659325 | |
9 | 1243973 | 0.028231937023 | 0.2784262824 | 0.2663382208 | 0.00060371 | 0.000691423 | |
10 | 946847 | 0.021488669670 | 0.3173679591 | 0.2448392401 | 0.00063907 | 0.000892013 |
data_list$summary_stats$pi_r_hat
r <int> | A1 <dbl> | A2 <dbl> | B1 <dbl> | B2 <dbl> | C1 <dbl> | C2 <dbl> | |
---|---|---|---|---|---|---|---|
1 | 0.08129612491 | 0.13808166378 | 0.00197339857 | 0.00256337357 | 0.17659025386 | 0.49747393973 | |
2 | 0.08356599262 | 0.14971278697 | 0.00120708800 | 0.00164790207 | 0.19234124760 | 0.47740539977 | |
3 | 0.09638497610 | 0.17261091982 | 0.00112947534 | 0.00141494498 | 0.21600458354 | 0.40910513993 | |
4 | 0.11513952141 | 0.20389476250 | 0.00109486805 | 0.00119810588 | 0.24214127028 | 0.31471504643 | |
5 | 0.13958919228 | 0.23790113743 | 0.00096280035 | 0.00089816931 | 0.25825297205 | 0.21398101131 | |
6 | 0.16909462467 | 0.26517369528 | 0.00083279516 | 0.00078101918 | 0.25537062669 | 0.12923889453 | |
7 | 0.20288677287 | 0.27922443808 | 0.00069392185 | 0.00066155303 | 0.23355838792 | 0.07055440574 | |
8 | 0.23920866152 | 0.27939355439 | 0.00066456118 | 0.00064483683 | 0.19978607967 | 0.03576579532 | |
9 | 0.27844415906 | 0.26635468903 | 0.00058921665 | 0.00067693867 | 0.16223249252 | 0.01708962141 | |
10 | 0.31739037315 | 0.24485320337 | 0.00062457832 | 0.00087755256 | 0.12568670097 | 0.00801607384 |
Plot
p_read <- plot_molecules_distributions(data_list, dataset_name, x_lim=250)
p_read <- plot_grid(p_read$p,
p_read$legend,
ncol=2,
rel_widths=c(1, 0.1))
ggsave(file.path(figures_dir,
paste0(dataset_name, "_molcomplexity.pdf")),
p_read,
width=9,
height=6)
p_read
data_list$fit_out$glm_estimates
max_r <int> | phat <dbl> | phat_low <dbl> | phat_high <dbl> | SIHR <dbl> | SBIHR <dbl> |
---|---|---|---|---|---|
1706 | 0.9999 | 0.9999 | 0.9999 | 0.00010194 | 0.00011286 |
p_fit <- plot_fit(data_list, dataset_name, x_lim=250)
p_fit <-plot_grid(p_fit$p,
p_fit$legend,
ncol=2,
rel_widths=c(1, 0.2))
ggsave(file.path(figures_dir,
paste0(dataset_name, "_fit.pdf")),
p_fit,
width=9,
height=6)
p_fit
data_list$summary_stats$cutoff_dt
approach <chr> | outcome <chr> | s <int> | qr <dbl> | n <int> | tor <dbl> | o <dbl> | FP <dbl> | FN <dbl> | TP <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
torc_before | 0,0,1,0,1,0,0,0 | 5 | 0.0062969 | 41 | 49.6366 | 0.99947 | 984 | 23384 | 44038085 | |
discard_torc | 0,0,1,0,0,0,0,0 | 3 | 0.0073119 | 22347 | 2.9735 | 0.99997 | 1147 | 1201 | 44060269 | |
torc_after | 0,1,1,0,0,0,0,0 | 2 | 0.0080795 | 32 | 2.8675 | 0.99997 | 1147 | 1169 | 44060301 | |
no_discarding | 1,0,0,0,0,0,1,0 | 7 | 0.4982596 | 252 | NaN | 1.00000 | 1449 | 303 | 44061167 | |
no_purging | NA | NA | NA | NA | NA | 1.00060 | 27393 | 0 | 44061470 |
The row discard_torc shows the outcome whose qr value is the maximum allowed. Reads corresponding to outcomes with greater qr values are discarded. no_discarding corresponds to retaining all reassigned reads and no_purging corresponds to keeping the data as it is.
p_fp <- plot_fp_reduction(data_list, dataset_name)
ggsave(file.path(figures_dir,
paste0(dataset_name, "_fp_performance.pdf")),
p_fp,
width=9,
height=6)
p_fp
p_tradeoff <- plot_fp_tradoff(data_list, dataset_name)
ggsave(file.path(figures_dir,
paste0(dataset_name, "_tradeoff.pdf")),
p_tradeoff,
width=9,
height=6)
p_tradeoff
data_list <- identify_rna_cells(data_list, output_dir)
Using defaultDrops cell calling instead.
EmptyDrops function call failed for purged sample B1
Warning in smooth.spline(x[new.keep], y[new.keep], df = df, ...): not using
invalid df; must have 1 < df <= n := #{unique x} = 7
Warning in smooth.spline(x[new.keep], y[new.keep], df = df, ...): not using
invalid df; must have 1 < df <= n := #{unique x} = 7
Step 7: identify RNA-containing cells: 733.182 sec elapsed
Step 9: tallying molecules by cell-barcode: 25.698 sec elapsed
Running workflow II: 758.881 sec elapsed
Here we examine the extent of the effects of index hopping on individual samples and then on cell-barcodes.
Here m is the number of total molecules in millions; rm_ret is the number of predicted real molecules and prm_ret is the proportion; rm_disc is the number of discarded real molecules; and pm is the number of predicted phantom molecules.
data_list$summary_stats$marginal
sample <chr> | m <dbl> | prm_ret <dbl> | prm_disc <dbl> | ppm <dbl> | prop_m <dbl> | prop_reads <dbl> | FRM <dbl> | rm_ret <dbl> | |
---|---|---|---|---|---|---|---|---|---|
A1 | 5.93931 | 0.99862 | 0.0000075766 | 0.00136884 | 0.1347123 | 0.13436 | 6.0801 | 5931139 | |
A2 | 8.01873 | 0.99967 | 0.0000185815 | 0.00030878 | 0.1818765 | 0.13477 | 4.5171 | 8016103 | |
B1 | 0.14831 | 0.97948 | 0.0000000000 | 0.02051783 | 0.0033639 | 0.12624 | 228.7785 | 145267 | |
B2 | 0.19973 | 0.98648 | 0.0000000000 | 0.01351859 | 0.0045301 | 0.13215 | 177.8310 | 197025 | |
C1 | 8.51224 | 0.99966 | 0.0000243179 | 0.00031778 | 0.1930701 | 0.11791 | 3.7228 | 8509327 | |
C2 | 14.19085 | 0.99979 | 0.0000359387 | 0.00016933 | 0.3218692 | 0.12152 | 2.3015 | 14187933 | |
D1 | 5.45395 | 0.99954 | 0.0000518890 | 0.00041089 | 0.1237036 | 0.11167 | 5.5033 | 5451425 | |
D2 | 1.62575 | 0.99843 | 0.0000036906 | 0.00156789 | 0.0368744 | 0.12138 | 20.0664 | 1623197 |
The called cells were determined from the unpurged data in order to show the level of contamination by phantom molecules if data were not purged.
data_list$summary_stats$marginal_called_cells
sample <chr> | m <dbl> | prm_ret <dbl> | prm_disc <dbl> | ppm <dbl> | prop_m <dbl> | prop_reads <dbl> | FRM <dbl> | |
---|---|---|---|---|---|---|---|---|
A1 | 5.40001 | 0.99998 | 0.0000070370 | 0.00000907406 | 0.1340182 | 0.13436 | 6.6874 | |
A2 | 7.35779 | 0.99998 | 0.0000159015 | 0.00000489277 | 0.1826066 | 0.13477 | 4.9229 | |
B1 | 0.11064 | 0.99521 | 0.0000000000 | 0.00479013774 | 0.0027460 | 0.12624 | 306.6604 | |
B2 | 0.14207 | 0.99999 | 0.0000000000 | 0.00000703898 | 0.0035258 | 0.13215 | 250.0056 | |
C1 | 7.86794 | 0.99998 | 0.0000191918 | 0.00000381294 | 0.1952675 | 0.11791 | 4.0276 | |
C2 | 12.98775 | 0.99997 | 0.0000312602 | 0.00000046197 | 0.3223316 | 0.12152 | 2.5147 | |
D1 | 4.94792 | 0.99994 | 0.0000511326 | 0.00000727579 | 0.1227981 | 0.11167 | 6.0661 | |
D2 | 1.47901 | 0.99999 | 0.0000020284 | 0.00000540904 | 0.0367062 | 0.12138 | 22.0573 |
The rows corresponding to consensus_background and consensus_cell refer to the number of barcodes that were categorized as background cells or rna-containing cells, respectively, no matter whether the data was purged or not. In contrast, transition_background and transition_cell refer to the number of barcodes that were recatgorized as background and cell, respectively. phantom_background and phantom_cell are phantom cells that disappear once phantom molecules are purged.
data_list$called_cells_tally
barcode <chr> | A1 <dbl> | A2 <dbl> | B1 <dbl> | B2 <dbl> | C1 <dbl> | C2 <dbl> | D1 <dbl> | D2 <dbl> |
---|---|---|---|---|---|---|---|---|
consensus_background | 123769 | 138948 | 26138 | 29971 | 145283 | 195614 | 98969 | 61353 |
transition_cell | 1 | 0 | 0 | 0 | 0 | 4 | 1 | 1 |
phantom_background | 1332 | 770 | 1258 | 1136 | 808 | 668 | 778 | 975 |
transition_background | 0 | 2 | 17 | 0 | 0 | 1 | 0 | 0 |
consensus_cell | 409 | 564 | 199 | 16 | 739 | 1316 | 685 | 160 |
phantom_cell | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 |
data_list$umi_counts_cell %>%
map(list("called_cells"))
$A1
# A tibble: 409 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGTCCGGTCCGAGTC 37482 37479 3 0
2 ACGTCAAGTTGCCTCT 31541 31538 2 1
3 CACCACTTCGGTTCGG 18666 18664 2 0
4 CGTAGGCCAACACCTA 65340 65338 2 0
5 CTGCTGTTCCGGGTGT 17297 17294 2 1
6 TCTCTAAAGATCACGG 21938 21936 2 0
7 AACCGCGTCATGCTCC 47423 47422 1 0
8 ACATACGCATTCGACA 3517 3516 1 0
9 ACGAGGACATCACCCT 15208 15207 1 0
10 ACGGCCACAAGGACAC 49067 49064 1 2
# … with 399 more rows
$A2
# A tibble: 566 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 ACGAGCCCACCGTTGG 43895 43892 3 0
2 CGTGTAATCTAACTCT 22943 22941 2 0
3 CTTAGGAAGAAACGCC 19123 19121 2 0
4 GCTGGGTAGTAGGTGC 55651 55648 2 1
5 AAAGATGAGGCCCTCA 68719 68718 1 0
6 AACTCCCCATTCGACA 15035 15034 1 0
7 ACAGCTAAGGCCCGTT 29916 29914 1 1
8 ACGGGCTTCGGTCCGA 34499 34498 1 0
9 ACTTACTCAACACCCG 29456 29455 1 0
10 CAAGATCCAACACGCC 44113 44112 1 0
# … with 556 more rows
$B1
# A tibble: 220 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CTCACACGTCTCTCGT 103 9 94 0
2 TCGTACCGTCCATGAT 74 11 63 0
3 TGCACCTTCTGGCGAC 74 11 63 0
4 GTGCAGCCAAGCGAGT 43 5 38 0
5 CTAGCCTTCCGCGCAA 41 4 37 0
6 AGCAGCCGTTAAGGGC 28 2 26 0
7 TGCACCTCATGCCACG 28 3 25 0
8 TCATTACCAGGCAGTA 19 3 16 0
9 GATCGTAGTCACCCAG 18 4 14 0
10 GTGCTTCAGGCATTGG 15 1 14 0
# … with 210 more rows
$B2
# A tibble: 16 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 TCGTACCGTCCATGAT 26473 26472 1 0
2 ACGCCAGAGTACGTAA 417 417 0 0
3 AGCAGCCGTTAAGGGC 8853 8853 0 0
4 CAAGGCCGTCCGAGTC 232 232 0 0
5 CGTGAGCCAAGGACTG 1478 1478 0 0
6 CTAGCCTTCCGCGCAA 11808 11808 0 0
7 CTCACACGTCTCTCGT 36274 36274 0 0
8 CTTGGCTCAAAGTGCG 1034 1034 0 0
9 GACAGAGTCAATACCG 211 211 0 0
10 GATCGTAGTCACCCAG 5904 5904 0 0
11 GTGCAGCCAAGCGAGT 12113 12113 0 0
12 GTGGGTCCAGGCTGAA 2190 2190 0 0
13 TCATTACCAGGCAGTA 5893 5893 0 0
14 TGCACCTCATGCCACG 5479 5479 0 0
15 TGCACCTTCTGGCGAC 22310 22310 0 0
16 TGGCTGGCATCCGTGG 1397 1397 0 0
$C1
# A tibble: 739 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 ACTATCTTCCTGTACC 33437 33434 2 1
2 CCCTCCTGTGATGTCT 10899 10897 2 0
3 GGTGCGTCAGTTAACC 38377 38375 2 0
4 ACCTTTACAAAGTGCG 19839 19837 1 1
5 ACGGCCAGTACTCAAC 26169 26168 1 0
6 AGCGTATGTTCTCATT 15803 15802 1 0
7 AGGGATGTCGCCGTGA 9354 9353 1 0
8 AGGTCATGTAAGCACG 24932 24930 1 1
9 ATCTGCCTCTTGCATT 4779 4777 1 1
10 ATTATCCGTTAAGATG 30075 30072 1 2
# … with 729 more rows
$C2
# A tibble: 1,317 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CTTAACTGTGAGTATA 6233 6230 3 0
2 ATAACGCCACGGTTTA 1584 1583 1 0
3 GGAACTTTCAAAGTAG 1634 1633 1 0
4 TACCTATCATCACAAC 6081 6080 1 0
5 AAACCTGCAGTGGGAT 8200 8200 0 0
6 AAACCTGGTCGAAAGC 7482 7482 0 0
7 AAACGGGCAAGTTAAG 9265 9265 0 0
8 AAACGGGGTTATCACG 13268 13268 0 0
9 AAACGGGGTTGTCTTT 13437 13437 0 0
10 AAAGATGAGAGGTTGC 795 795 0 0
# … with 1,307 more rows
$D1
# A tibble: 685 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AAAGATGAGGCCGAAT 13329 13326 1 2
2 AAGGCAGGTTATGTGC 4278 4277 1 0
3 ACCGTAACAGGACGTA 10930 10929 1 0
4 AGCTCCTTCTCAAACG 34839 34837 1 1
5 ATAACGCGTTGTCTTT 11030 11027 1 2
6 ATCCACCGTAAGGGAA 20527 20526 1 0
7 ATCGAGTCAGATCTGT 25174 25172 1 1
8 CATATTCGTACGAAAT 23309 23304 1 4
9 CATCGGGGTATGAAAC 29755 29753 1 1
10 CCAGCGATCGACCAGC 13789 13785 1 3
# … with 675 more rows
$D2
# A tibble: 160 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AACCATGTCAGCGACC 18595 18594 1 0
2 AGATTGCCAGGTTTCA 14970 14969 1 0
3 CGCGTTTCATGTTGAC 11987 11986 1 0
4 CGTCAGGAGCTTCGCG 14085 14084 1 0
5 GCATGTATCCCAGGTG 27178 27177 1 0
6 GTGTTAGGTAGCGTCC 11840 11839 1 0
7 TCAGGTATCATAACCG 8832 8831 1 0
8 TGTCCCAGTCATCCCT 3194 3193 1 0
9 AAAGCAAGTGAACCTT 9734 9734 0 0
10 AAAGCAATCTCCTATA 11917 11917 0 0
# … with 150 more rows
data_list$umi_counts_cell %>%
map(list("background_cells"))
$A1
# A tibble: 125,102 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 GGCTGGTGTCCCTACT 86 4 82 0
2 CTCACACGTCTCTCGT 99 19 80 0
3 CATGCCTAGTGCTGCC 92 18 74 0
4 TGCACCTTCTGGCGAC 57 5 52 0
5 GGAATAAGTCAGGACA 53 3 50 0
6 CACACAAAGGCGATAC 67 19 48 0
7 TCGTACCGTCCATGAT 56 10 46 0
8 CGGCTAGGTAAACCTC 42 0 42 0
9 CTTCTCTCAACTGCTA 48 6 42 0
10 TGGTTAGTCCTGCAGG 40 1 39 0
# … with 125,092 more rows
$A2
# A tibble: 139,718 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 GGCTGGTGTCCCTACT 97 15 82 0
2 CTCACACGTCTCTCGT 93 21 72 0
3 CATGCCTAGTGCTGCC 84 15 69 0
4 TCGTACCGTCCATGAT 76 18 58 0
5 TGCACCTTCTGGCGAC 66 14 52 0
6 CACACAAAGGCGATAC 55 14 41 0
7 GTGCAGCCAAGCGAGT 50 12 38 0
8 AAGCCGCGTGCTCTTC 44 8 36 0
9 GGAATAAGTCAGGACA 38 5 33 0
10 CTAGCCTTCCGCGCAA 40 8 32 0
# … with 139,708 more rows
$B1
# A tibble: 27,396 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 ACGGCCACAAGGACAC 8 0 8 0
2 AGATTGCCAGGTTTCA 8 0 8 0
3 CAGCCGACAGGATCGA 8 0 8 0
4 GATCGTAAGATACACA 8 0 8 0
5 ACTGTCCCAGCGAACA 8 1 7 0
6 ATTATCCGTGTGAATA 8 1 7 0
7 CAGATCAGTGCTCTTC 7 0 7 0
8 CTACACCCACGCGAAA 7 0 7 0
9 TTTGGTTCAGGAACGT 8 1 7 0
10 ACGGGCTTCGGTCCGA 6 0 6 0
# … with 27,386 more rows
$B2
# A tibble: 31,107 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 GGCTGGTGTCCCTACT 96 10 86 0
2 CATGCCTAGTGCTGCC 85 9 76 0
3 CACACAAAGGCGATAC 66 8 58 0
4 AAGCCGCGTGCTCTTC 46 6 40 0
5 GGAATAAGTCAGGACA 42 3 39 0
6 ATAAGAGAGTGTGGCA 32 3 29 0
7 TGCCCATCAGTAAGCG 32 3 29 0
8 CACACTCCACATAACC 29 3 26 0
9 CATCGAAAGTCAATAG 28 6 22 0
10 CATATTCGTTGTACAC 14 0 14 0
# … with 31,097 more rows
$C1
# A tibble: 146,091 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 GGCTGGTGTCCCTACT 113 10 103 0
2 CATGCCTAGTGCTGCC 107 9 98 0
3 CTCACACGTCTCTCGT 88 16 72 0
4 CACACAAAGGCGATAC 71 12 59 0
5 TCGTACCGTCCATGAT 64 10 54 0
6 TGCACCTTCTGGCGAC 57 12 45 0
7 GGAATAAGTCAGGACA 50 8 42 0
8 AAGCCGCGTGCTCTTC 47 7 40 0
9 CATCGAAAGTCAATAG 42 5 37 0
10 ATAAGAGAGTGTGGCA 38 7 31 0
# … with 146,081 more rows
$C2
# A tibble: 196,286 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CATGCCTAGTGCTGCC 103 13 90 0
2 GGCTGGTGTCCCTACT 109 20 89 0
3 CACACAAAGGCGATAC 67 2 65 0
4 CTCACACGTCTCTCGT 69 8 61 0
5 TCGTACCGTCCATGAT 67 9 58 0
6 TGCACCTTCTGGCGAC 54 8 46 0
7 GGAATAAGTCAGGACA 43 5 38 0
8 AAGCCGCGTGCTCTTC 44 8 36 0
9 CATCGAAAGTCAATAG 40 7 33 0
10 CTAGCCTTCCGCGCAA 33 2 31 0
# … with 196,276 more rows
$D1
# A tibble: 99,748 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CTCACACGTCTCTCGT 94 20 74 0
2 GGCTGGTGTCCCTACT 80 8 72 0
3 CATGCCTAGTGCTGCC 86 15 71 0
4 TCGTACCGTCCATGAT 62 16 46 0
5 CACACAAAGGCGATAC 46 5 41 0
6 TGCACCTTCTGGCGAC 44 7 37 0
7 AAGCCGCGTGCTCTTC 32 3 29 0
8 CATCGAAAGTCAATAG 28 4 24 0
9 CTAGCCTTCCGCGCAA 33 9 24 0
10 GTGCAGCCAAGCGAGT 27 4 23 0
# … with 99,738 more rows
$D2
# A tibble: 62,329 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 GGCTGGTGTCCCTACT 120 19 101 0
2 CTCACACGTCTCTCGT 90 8 82 0
3 CATGCCTAGTGCTGCC 93 12 81 0
4 TCGTACCGTCCATGAT 83 22 60 1
5 CACACAAAGGCGATAC 66 7 59 0
6 TGCACCTTCTGGCGAC 62 5 57 0
7 GTGCAGCCAAGCGAGT 40 5 35 0
8 ATAAGAGAGTGTGGCA 32 2 30 0
9 AAGCCGCGTGCTCTTC 28 0 28 0
10 CTAGCCTTCCGCGCAA 33 5 28 0
# … with 62,319 more rows
tic("saving output")
data_list$read_counts <- NULL
saveRDS(data_list, results_filepath)
toc()
saving output: 289.009 sec elapsed
# memory usage
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 6055524 323.4 9523112 508.6 9523112 508.6
Vcells 705180869 5380.2 2345841278 17897.4 2931643582 22366.7
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] cowplot_0.9.4 data.table_1.12.2 tictoc_1.0
[4] furrr_0.1.0 future_1.13.0 broom_0.5.2
[7] matrixStats_0.54.0 forcats_0.4.0 stringr_1.4.0
[10] dplyr_0.8.1 purrr_0.3.2 readr_1.3.1
[13] tidyr_0.8.3 tibble_2.1.2 ggplot2_3.1.1
[16] tidyverse_1.2.1 rhdf5_2.26.2 rmarkdown_1.13
loaded via a namespace (and not attached):
[1] nlme_3.1-137 bitops_1.0-6
[3] lubridate_1.7.4 httr_1.4.0
[5] rprojroot_1.3-2 GenomeInfoDb_1.18.2
[7] tools_3.5.2 backports_1.1.4
[9] utf8_1.1.4 R6_2.4.0
[11] HDF5Array_1.10.1 lazyeval_0.2.2
[13] BiocGenerics_0.28.0 colorspace_1.4-1
[15] withr_2.1.2 tidyselect_0.2.5
[17] compiler_3.5.2 cli_1.1.0
[19] rvest_0.3.4 Biobase_2.42.0
[21] xml2_1.2.0 DelayedArray_0.8.0
[23] labeling_0.3 scales_1.0.0
[25] digest_0.6.19 XVector_0.22.0
[27] base64enc_0.1-3 pkgconfig_2.0.2
[29] htmltools_0.3.6 limma_3.38.3
[31] rlang_0.3.4 readxl_1.3.1
[33] rstudioapi_0.10 generics_0.0.2
[35] jsonlite_1.6 BiocParallel_1.16.6
[37] RCurl_1.95-4.12 magrittr_1.5
[39] GenomeInfoDbData_1.2.0 Matrix_1.2-15
[41] Rcpp_1.0.1 munsell_0.5.0
[43] S4Vectors_0.20.1 Rhdf5lib_1.4.2
[45] fansi_0.4.0 stringi_1.4.3
[47] yaml_2.2.0 edgeR_3.24.3
[49] MASS_7.3-51.1 SummarizedExperiment_1.12.0
[51] zlibbioc_1.28.0 plyr_1.8.4
[53] grid_3.5.2 parallel_3.5.2
[55] listenv_0.7.0 crayon_1.3.4
[57] lattice_0.20-38 haven_2.1.0
[59] hms_0.4.2 locfit_1.5-9.1
[61] zeallot_0.1.0 knitr_1.23
[63] pillar_1.4.1 GenomicRanges_1.34.0
[65] codetools_0.2-16 stats4_3.5.2
[67] glue_1.3.1 evaluate_0.14
[69] modelr_0.1.4 vctrs_0.1.0
[71] cellranger_1.1.0 gtable_0.3.0
[73] assertthat_0.2.1 xfun_0.7
[75] DropletUtils_1.2.2 viridisLite_0.3.0
[77] SingleCellExperiment_1.4.1 IRanges_2.16.0
[79] globals_0.12.4