knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
fig.width=15,
digit=5,
scipen=8)
options(readr.show_progress = FALSE,
digits=5,
scipen=8,
future.globals.maxSize = +Inf)
project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
project_dir <- getwd()
warning(sprintf("No rstudio project root file found.
Setting project directory to current workflow.Rmd file location: %s.
Override if needed.",
project_dir))
}
message(sprintf("Project directory: %s",
project_dir))
Project directory: /home/rfarouni/Documents/index_hopping
library(rhdf5)
#library(DropletUtils) # install but not load
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
plan(multiprocess)
code_dir <- file.path(project_dir, "code")
source(file.path(code_dir, "1_create_joined_counts_table.R"))
source(file.path(code_dir, "2_create_counts_by_outcome_table.R"))
source(file.path(code_dir, "3_estimate_sample_index_hopping_rate.R"))
source(file.path(code_dir, "4_compute_summary_statistics.R"))
source(file.path(code_dir, "5_reassign_hopped_reads.R"))
source(file.path(code_dir, "6_purge_phantom_molecules.R"))
source(file.path(code_dir, "7_call_cells.R"))
source(file.path(code_dir, "8_summarize_purge.R"))
source(file.path(code_dir, "9_plotting_functions.R"))
validation_output_dir <- file.path(project_dir, "data", "hiseq4000_validation")
data <- read_tsv(file.path(validation_output_dir,
"hiseq4000_inner_joined_with_labels.txt"))
Parsed with column specification:
cols(
cell = [31mcol_character()[39m,
gene = [32mcol_double()[39m,
umi = [32mcol_double()[39m,
s1_nonplexed = [32mcol_double()[39m,
s2_nonplexed = [32mcol_double()[39m,
s1_plexed = [32mcol_double()[39m,
s2_plexed = [32mcol_double()[39m,
outcome = [31mcol_character()[39m,
label = [31mcol_character()[39m
)
data
cell <chr> | gene <dbl> | umi <dbl> | s1_nonplexed <dbl> | s2_nonplexed <dbl> | s1_plexed <dbl> | s2_plexed <dbl> | outcome <chr> | label <chr> |
---|---|---|---|---|---|---|---|---|
AAACCTGAGAAACCTA | 14121 | 159000 | 1 | 0 | 1 | 0 | 1,0 | r,0 |
AAACCTGAGAAACCTA | 8108 | 842392 | 3 | 0 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 2605 | 138546 | 1 | 0 | 1 | 0 | 1,0 | r,0 |
AAACCTGAGAAACCTA | 2725 | 855812 | 5 | 0 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 2211 | 577124 | 2 | 0 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 5339 | 250233 | 1 | 0 | 3 | 0 | 3,0 | r,0 |
AAACCTGAGAAACCTA | 19619 | 302544 | 3 | 0 | 3 | 0 | 3,0 | r,0 |
AAACCTGAGAAACCTA | 20884 | 644179 | 3 | 0 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 2614 | 1043378 | 2 | 0 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 4170 | 751749 | 324 | 0 | 185 | 1 | 185,1 | r,f |
summary_counts <-
data %>%
mutate(r=as.integer(rowSums(.[c(6,7)]))) %>%
arrange(r) %>%
group_by(r, label) %>%
summarize_at(vars(matches("^s1_pl|s2_pl")),
list(~ sum(.))) %>%
mutate(s1_hopped=if_else(label %in% c("0,f", "r,f"), s2_plexed,0),
s2_hopped=if_else(label %in% c("f,0", "f,r"), s1_plexed,0),
s1_nonhopped=if_else(label %in% c("r,0", "r,f"), s1_plexed,0),
s2_nonhopped=if_else(label %in% c("0,r", "f,r"), s2_plexed,0)) %>%
select(-s1_plexed,- s2_plexed)
summary_counts
r <int> | label <chr> | s1_hopped <dbl> | s2_hopped <dbl> | s1_nonhopped <dbl> | s2_nonhopped <dbl> |
---|---|---|---|---|---|
1 | 0,f | 3266 | 0 | 0 | 0 |
1 | 0,r | 0 | 0 | 0 | 1863385 |
1 | f,0 | 0 | 6446 | 0 | 0 |
1 | r,0 | 0 | 0 | 699667 | 0 |
2 | 0,f | 330 | 0 | 0 | 0 |
2 | 0,r | 0 | 0 | 0 | 1542816 |
2 | f,0 | 0 | 126 | 0 | 0 |
2 | f,r | 0 | 5038 | 0 | 5038 |
2 | r,0 | 0 | 0 | 640976 | 0 |
2 | r,f | 2436 | 0 | 2436 | 0 |
summary_counts_conditional <-
summary_counts %>%
group_by(r) %>%
summarize_at(vars(matches("^s")),
list(~ sum(.)))%>%
mutate(SIHR_12 = s1_hopped/(s1_hopped+s1_nonhopped),
SIHR_21 = s2_hopped/(s2_hopped+s2_nonhopped),
frac_s1= (s1_hopped+s1_nonhopped)/ (s1_hopped+s1_nonhopped+s2_hopped+s2_nonhopped))
summary_counts_conditional
r <int> | s1_hopped <dbl> | s2_hopped <dbl> | s1_nonhopped <dbl> | s2_nonhopped <dbl> | SIHR_12 <dbl> | SIHR_21 <dbl> | frac_s1 <dbl> |
---|---|---|---|---|---|---|---|
1 | 3266 | 6446 | 699667 | 1863385 | 0.00464625 | 0.0034474 | 0.273221 |
2 | 2766 | 5164 | 643412 | 1547854 | 0.00428055 | 0.0033251 | 0.293825 |
3 | 1678 | 3327 | 399701 | 1023138 | 0.00418059 | 0.0032412 | 0.281108 |
4 | 1051 | 2628 | 238205 | 796672 | 0.00439278 | 0.0032879 | 0.230374 |
5 | 744 | 2463 | 169311 | 769832 | 0.00437506 | 0.0031892 | 0.180458 |
6 | 551 | 2633 | 145705 | 825649 | 0.00376737 | 0.0031789 | 0.150077 |
7 | 550 | 2814 | 140801 | 908453 | 0.00389102 | 0.0030880 | 0.134285 |
8 | 557 | 3083 | 141947 | 1005845 | 0.00390866 | 0.0030557 | 0.123762 |
9 | 559 | 3319 | 146447 | 1116128 | 0.00380257 | 0.0029649 | 0.116077 |
10 | 501 | 3789 | 147579 | 1233501 | 0.00338331 | 0.0030623 | 0.106888 |
summary_counts_marginal <-
summary_counts %>%
ungroup() %>%
summarize_at(vars(matches("^s")),
list(~ sum(.)))%>%
mutate(SIHR_12 = s1_hopped/(s1_hopped+s1_nonhopped),
SIHR_21 = s2_hopped/(s2_hopped+s2_nonhopped),
SIHR=1-(s1_nonhopped+s2_nonhopped)/(s1_hopped+s1_nonhopped +s2_hopped+s2_nonhopped),
frac_s1= (s1_hopped+s1_nonhopped)/ (s1_hopped+s1_nonhopped+s2_hopped+s2_nonhopped))
summary_counts_marginal
s1_hopped <dbl> | s2_hopped <dbl> | s1_nonhopped <dbl> | s2_nonhopped <dbl> | SIHR_12 <dbl> | SIHR_21 <dbl> | SIHR <dbl> | frac_s1 <dbl> |
---|---|---|---|---|---|---|---|
317904 | 230210 | 91653829 | 75976017 | 0.0034565 | 0.0030209 | 0.0032591 | 0.54687 |
p1 <-
ggplot(summary_counts_conditional) +
geom_line(aes(x = r,
y = SIHR_12*100,
colour="SIHR_12"))+
geom_line(aes(x = r,
y = SIHR_21*100,
colour="SIHR_21")) +
geom_line(aes(x = r,
y = frac_s1,
colour="frac_s1")) +
geom_hline(yintercept = unlist(summary_counts_marginal[5:7])* c(100, 100, 100),
linetype="dashed") +
xlim(0,90) +
ylim(0,1)
p1
NA
summary_mol_counts <-
data %>%
mutate(r=as.integer(rowSums(.[c(6,7)]))) %>%
arrange(r) %>%
mutate_at(vars(matches("^s")),
list(~ as.integer(.!=0))) %>%
group_by(r, label) %>%
summarize_at(vars(matches("^s1_pl|^s2_pl")),
list(~ sum(.))) %>%
mutate(s1_phantom=if_else(label %in% c("0,f", "r,f"), s2_plexed,0L),
s2_phantom=if_else(label %in% c("f,0", "f,r"), s1_plexed,0L),
s1_real=if_else(label %in% c("r,0", "r,f"), s1_plexed,0L),
s2_real=if_else(label %in% c("0,r", "f,r"), s2_plexed,0L)) %>%
select(-s1_plexed,- s2_plexed)
summary_mol_counts
r <int> | label <chr> | s1_phantom <int> | s2_phantom <int> | s1_real <int> | s2_real <int> |
---|---|---|---|---|---|
1 | 0,f | 3266 | 0 | 0 | 0 |
1 | 0,r | 0 | 0 | 0 | 1863385 |
1 | f,0 | 0 | 6446 | 0 | 0 |
1 | r,0 | 0 | 0 | 699667 | 0 |
2 | 0,f | 165 | 0 | 0 | 0 |
2 | 0,r | 0 | 0 | 0 | 771408 |
2 | f,0 | 0 | 63 | 0 | 0 |
2 | f,r | 0 | 5038 | 0 | 5038 |
2 | r,0 | 0 | 0 | 320488 | 0 |
2 | r,f | 2436 | 0 | 2436 | 0 |
summary_mol_counts_marginal <-
summary_mol_counts%>%
ungroup() %>%
summarize_at(vars(matches("^s")),
list(~ sum(.))) %>%
mutate(ppm_12 = s1_phantom/(s1_phantom+s1_real),# prop hopped phantom molec
ppm_21 = s2_phantom/(s2_phantom+s2_real),
ppm_1 = s2_phantom/(s2_phantom+s1_real), # prop phantom molec
ppm_2 = s1_phantom/(s1_phantom+s2_real),
ppm=(s1_phantom+s2_phantom)/(s1_phantom+s1_real+s2_phantom+s2_real),
frac_mol_s1= (s1_phantom+s1_real)/ (s1_phantom+s1_real+s2_phantom+s2_real))
summary_mol_counts_marginal
s1_phantom <int> | s2_phantom <int> | s1_real <int> | s2_real <int> | ppm_12 <dbl> | ppm_21 <dbl> | ppm_1 <dbl> | ppm_2 <dbl> | ppm <dbl> | |
---|---|---|---|---|---|---|---|---|---|
263741 | 221658 | 2389337 | 6852763 | 0.099409 | 0.031332 | 0.084894 | 0.03706 | 0.0499 |
summary_mol_counts_conditional <-
summary_mol_counts%>%
group_by(r) %>%
summarize_at(vars(matches("^s")),
list(~ sum(.))) %>%
mutate(ppm_12 = s1_phantom/(s1_phantom+s1_real),
ppm_21 = s2_phantom/(s2_phantom+s2_real),
ppm=(s1_phantom+s2_phantom)/(s1_phantom+s1_real+s2_phantom+s2_real),
frac_mol_s1= (s1_phantom+s1_real)/ (s1_phantom+s1_real+s2_phantom+s2_real))
summary_mol_counts_conditional
r <int> | s1_phantom <int> | s2_phantom <int> | s1_real <int> | s2_real <int> | ppm_12 <dbl> | ppm_21 <dbl> | ppm <dbl> | frac_mol_s1 <dbl> |
---|---|---|---|---|---|---|---|---|
1 | 3266 | 6446 | 699667 | 1863385 | 0.0046462 | 0.0034474 | 0.0037749 | 0.273221 |
2 | 2601 | 5101 | 322924 | 776446 | 0.0079902 | 0.0065268 | 0.0069571 | 0.294041 |
3 | 1547 | 3284 | 133753 | 342153 | 0.0114339 | 0.0095068 | 0.0100492 | 0.281443 |
4 | 952 | 2593 | 59802 | 199825 | 0.0156698 | 0.0128101 | 0.0134703 | 0.230853 |
5 | 693 | 2439 | 34010 | 154459 | 0.0199695 | 0.0155451 | 0.0163465 | 0.181121 |
6 | 529 | 2598 | 24376 | 138047 | 0.0212407 | 0.0184720 | 0.0188886 | 0.150438 |
7 | 532 | 2775 | 20193 | 130181 | 0.0256695 | 0.0208716 | 0.0215186 | 0.134857 |
8 | 541 | 3028 | 17813 | 126116 | 0.0294759 | 0.0234467 | 0.0241969 | 0.124436 |
9 | 548 | 3267 | 16334 | 124383 | 0.0324606 | 0.0255934 | 0.0263955 | 0.116805 |
10 | 493 | 3731 | 14808 | 123729 | 0.0322201 | 0.0292719 | 0.0295879 | 0.107179 |
p4 <- ggplot(summary_mol_counts_conditional) +
geom_line(aes(x = r,
y = ppm_12,
colour="ppm_12"))+
geom_line(aes(x = r,
y = ppm_21,
colour=" ppm_21")) +
#geom_hline(yintercept = unlist(summary_mol_counts_marginal[5:7]), linetype="dashed") +
xlim(0,300)
p4
#ggsave("phantom_molecules_validation.pdf", p4, width =8, height = 5)
summary_mol_counts_cell<-
data %>%
filter(label!="NA") %>%
mutate_at(vars(matches("^s")),
list(~ as.integer(.!=0))) %>%
group_by(cell, label) %>%
summarize_at(vars(matches("^s1_pl|^s2_pl")),
list(~ sum(.))) %>%
mutate(s1_phantom=if_else(label %in% c("f,0", "f,r"), s1_plexed,0L),
s2_phantom=if_else(label %in% c("0,f", "r,f"), s2_plexed,0L),
s1_real=if_else(label %in% c("r,0", "r,f"), s1_plexed,0L),
s2_real=if_else(label %in% c("0,r", "f,r"), s2_plexed,0L)) %>%
select(-s1_plexed,- s2_plexed) %>%
group_by(cell) %>%
summarize_at(vars(matches("^s")),
list(~ sum(.))) %>%
mutate_at(vars(matches("^s")),
list(nonempty= ~ as.integer(.!=0))) %>%
unite(label,matches("nonempty"), sep=",") %>%
mutate(cell_status=
case_when(label %in% c("0,0,0,1","0,0,1,0") ~ "real",
label %in% c("0,0,1,1") ~ "real",
label %in% c("1,0,0,0","0,1,0,0", "1,1,0,0") ~ "phantom",
label %in% c("1,0,0,1","0,1,1,0") ~ "phantom",
TRUE ~ "contaminated")) %>%
mutate(s1_total =(s1_phantom+s1_real),
s2_total =(s2_phantom+s2_real),
s1_ppm = s1_phantom/(s1_total),
s2_ppm = s2_phantom/(s2_total))
summary_mol_counts_cell
cell <chr> | s1_phantom <int> | s2_phantom <int> | s1_real <int> | s2_real <int> | label <chr> | cell_status <chr> | s1_total <int> | |
---|---|---|---|---|---|---|---|---|
AAACCTGAGAAACCTA | 0 | 1 | 10 | 0 | 0,1,1,0 | phantom | 10 | |
AAACCTGAGAAACGCC | 0 | 0 | 0 | 1 | 0,0,0,1 | real | 0 | |
AAACCTGAGAAAGTGG | 0 | 0 | 9 | 0 | 0,0,1,0 | real | 9 | |
AAACCTGAGAACAACT | 0 | 0 | 0 | 24 | 0,0,0,1 | real | 0 | |
AAACCTGAGAACTCGG | 0 | 0 | 0 | 26 | 0,0,0,1 | real | 0 | |
AAACCTGAGAACTGTA | 0 | 0 | 0 | 1 | 0,0,0,1 | real | 0 | |
AAACCTGAGAAGCCCA | 0 | 0 | 0 | 56 | 0,0,0,1 | real | 0 | |
AAACCTGAGAAGGCCT | 1 | 0 | 0 | 35 | 1,0,0,1 | phantom | 1 | |
AAACCTGAGAAGGTGA | 0 | 0 | 12 | 22 | 0,0,1,1 | real | 12 | |
AAACCTGAGAATGTGT | 0 | 0 | 0 | 12 | 0,0,0,1 | real | 0 |
cell_status_tally <-
summary_mol_counts_cell %>%
group_by(cell_status,label) %>%
tally(sort=TRUE)
cell_status_tally
cell_status <chr> | label <chr> | n <int> | ||
---|---|---|---|---|
real | 0,0,0,1 | 98256 | ||
real | 0,0,1,0 | 67769 | ||
phantom | 1,0,0,1 | 26246 | ||
phantom | 0,1,1,0 | 25291 | ||
real | 0,0,1,1 | 15105 | ||
contaminated | 0,1,1,1 | 5561 | ||
contaminated | 1,0,1,1 | 4407 | ||
contaminated | 1,1,1,1 | 1502 | ||
phantom | 1,0,0,0 | 30 | ||
phantom | 0,1,0,0 | 15 |
n_affected_cells <-
cell_status_tally %>%
ungroup() %>%
filter(cell_status!="real") %>%
summarise(n=sum(n)) %>%
pull(n)
n_affected_cells <- n_affected_cells + 1502 +6
n_affected_cells
[1] 64579
n_total_cells <-
summary_mol_counts_cell %>%
mutate_at(vars(matches("_total")),
list(~ as.integer(.!=0))) %>%
ungroup() %>%
summarise_at(vars(matches("_total")), list(~sum(.))) %>%
mutate(cells_total= s1_total+s2_total)%>%
pull(cells_total)
n_total_cells
[1] 322321
Proportion of affected cells
p_affected_cells <- n_affected_cells/n_total_cells
p_affected_cells
[1] 0.20036
For each cell-barcode, plot the number of phantom molecules against the number of total molecules associated with it.
Sample 1 plot
p2 <- ggplot(summary_mol_counts_cell %>%
filter(cell_status!="real" & s1_phantom >0 )) +
geom_point(aes(x = s1_total,
y = s1_phantom,
colour=cell_status)) +
scale_x_log10() +
scale_y_log10()
p2
#ggsave("phantom_molecules_validation.pdf", p2, width =8, height = 5)
Sample 2 plot
p3 <- ggplot(summary_mol_counts_cell %>%
filter(cell_status!="real" & s2_phantom >0 )) +
geom_point(aes(x = s2_total,
y = s2_phantom,
colour=cell_status)) +
scale_x_log10() +
scale_y_log10()
p3
#ggsave("phantom_cells_validation_s2.pdf", p3, width =8, height = 5)
read_counts <-
data %>%
filter(label!="NA")%>%
select(-ends_with("nonplexed")) %>%
set_names(c("cell", "gene", "umi", "s1", "s2", "outcome", "label"))
read_counts
cell <chr> | gene <dbl> | umi <dbl> | s1 <dbl> | s2 <dbl> | outcome <chr> | label <chr> |
---|---|---|---|---|---|---|
AAACCTGAGAAACCTA | 14121 | 159000 | 1 | 0 | 1,0 | r,0 |
AAACCTGAGAAACCTA | 8108 | 842392 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 2605 | 138546 | 1 | 0 | 1,0 | r,0 |
AAACCTGAGAAACCTA | 2725 | 855812 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 2211 | 577124 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 5339 | 250233 | 3 | 0 | 3,0 | r,0 |
AAACCTGAGAAACCTA | 19619 | 302544 | 3 | 0 | 3,0 | r,0 |
AAACCTGAGAAACCTA | 20884 | 644179 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 2614 | 1043378 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 4170 | 751749 | 185 | 1 | 185,1 | r,f |
S <- 2
sample_names <- colnames(read_counts)[4:(S+3)]
sample_names
[1] "s1" "s2"
tic("Step 2: creating outcome counts datatable with grouping vars")
outcome_counts <- create_outcome_counts(read_counts%>%
select(-label),
sample_names,
min_frac=0.8)
toc()
Step 2: creating outcome counts datatable with grouping vars: 1.591 sec elapsed
outcome_counts
outcome <chr> | n <int> | r <int> | k_chimera <int> | s1 <dbl> | s2 <dbl> | s_maxprop <dbl> |
---|---|---|---|---|---|---|
0,1 | 1866651 | 1 | 1 | 0 | 1 | 2 |
1,0 | 706113 | 1 | 1 | 1 | 0 | 1 |
0,2 | 771573 | 2 | 1 | 0 | 2 | 2 |
2,0 | 320551 | 2 | 1 | 2 | 0 | 1 |
1,1 | 7474 | 2 | 2 | 1 | 1 | 0 |
0,3 | 338911 | 3 | 1 | 0 | 3 | 2 |
3,0 | 132248 | 3 | 1 | 3 | 0 | 1 |
1,2 | 3294 | 3 | 2 | 1 | 2 | 0 |
2,1 | 1495 | 3 | 2 | 2 | 1 | 0 |
0,4 | 197244 | 4 | 1 | 0 | 4 | 2 |
tic("Step 3: creating a chimera counts datatable and estimating hopping rate")
fit_out <-
estimate_hopping_rate(
outcome_counts,
S
)
toc()
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.083 sec elapsed
fit_out
$glm_estimates
max_r <int> | phat <dbl> | phat_low <dbl> | phat_high <dbl> | SIHR <dbl> | SBIHR <dbl> |
---|---|---|---|---|---|
508 | 0.9968 | 0.99678 | 0.99681 | 0.0032042 | 0.0056074 |
$chimera_counts
r <dbl> | 1 <dbl> | 2 <dbl> | n_cugs <dbl> | n_chimeras <dbl> | p_chimeras <dbl> | pcum_cugs <dbl> | phat_chimeras <dbl> | phat_chimeras_low <dbl> | |
---|---|---|---|---|---|---|---|---|---|
1 | 2572764 | 0 | 2572764 | 0 | 0.000000 | 0.27807 | 0.0032042 | 0.0032162 | |
2 | 1092124 | 7474 | 1099598 | 7474 | 0.006797 | 0.39692 | 0.0063982 | 0.0064221 | |
3 | 471159 | 4789 | 475948 | 4789 | 0.010062 | 0.44836 | 0.0095819 | 0.0096177 | |
4 | 256106 | 3533 | 259639 | 3533 | 0.013607 | 0.47643 | 0.0127554 | 0.0128030 | |
5 | 185339 | 3131 | 188470 | 3131 | 0.016613 | 0.49680 | 0.0159187 | 0.0159780 | |
6 | 159296 | 3127 | 162423 | 3127 | 0.019252 | 0.51435 | 0.0190719 | 0.0191428 | |
7 | 147067 | 3307 | 150374 | 3307 | 0.021992 | 0.53061 | 0.0222151 | 0.0222975 | |
8 | 140360 | 3569 | 143929 | 3569 | 0.024797 | 0.54616 | 0.0253481 | 0.0254420 | |
9 | 136902 | 3815 | 140717 | 3815 | 0.027111 | 0.56137 | 0.0284711 | 0.0285764 | |
10 | 134313 | 4224 | 138537 | 4224 | 0.030490 | 0.57635 | 0.0315841 | 0.0317007 |
NA
# compute_molecular_complexity_profile
tic("Step 4: compute molecular complexity profile and other summary statistics")
summary_stats <-
compute_summary_stats(
outcome_counts,
fit_out$glm_estimates$phat,
sample_names
)
toc()
Step 4: compute molecular complexity profile and other summary statistics: 0.16 sec elapsed
summary_stats
$summary_estimates
n_reads <int> | n_cugs <dbl> | n_pm <dbl> | u <dbl> | g <dbl> | frm <dbl> | p_chimeras <dbl> |
---|---|---|---|---|---|---|
168177960 | 9252095 | 483659 | 0.051383 | 0.00089223 | 18.177 | 0.051383 |
$marginal
sample <chr> | n_reads <dbl> | prop_reads <dbl> | n_molecs <int> | p_molecs <dbl> | FRM <dbl> |
---|---|---|---|---|---|
s1 | 91884039 | 0.54635 | 2610995 | 0.26841 | 35.191 |
s2 | 76293921 | 0.45365 | 7116504 | 0.73159 | 10.721 |
$conditional
r <int> | n_cugs <dbl> | p_cugs <dbl> | s1 <dbl> | s2 <dbl> | max_hop_rate <dbl> |
---|---|---|---|---|---|
1 | 2572764 | 0.27807366872 | 0.274457 | 0.72554303 | 0.27445697 |
2 | 1099598 | 0.11884854187 | 0.294915 | 0.70508495 | 0.29491505 |
3 | 475948 | 0.05144218688 | 0.282263 | 0.71773667 | 0.28226333 |
4 | 259639 | 0.02806272525 | 0.231892 | 0.76810783 | 0.23189217 |
5 | 188470 | 0.02037052149 | 0.182283 | 0.81771741 | 0.18228259 |
6 | 162423 | 0.01755526721 | 0.152214 | 0.84778634 | 0.15221366 |
7 | 150374 | 0.01625296757 | 0.136436 | 0.86356399 | 0.13643601 |
8 | 143929 | 0.01555636858 | 0.125956 | 0.87404380 | 0.12595620 |
9 | 140717 | 0.01520920397 | 0.118256 | 0.88174374 | 0.11825626 |
10 | 138537 | 0.01497358166 | 0.109262 | 0.89073821 | 0.10926179 |
$pi_r_hat
r <int> | s1 <dbl> | s2 <dbl> | ||
---|---|---|---|---|
1 | 0.273002 | 0.72699773429 | ||
2 | 0.293592 | 0.70640769902 | ||
3 | 0.280859 | 0.71914102194 | ||
4 | 0.230163 | 0.76983706695 | ||
5 | 0.180233 | 0.81976661133 | ||
6 | 0.149971 | 0.85002947592 | ||
7 | 0.134091 | 0.86590889238 | ||
8 | 0.123544 | 0.87645629409 | ||
9 | 0.115794 | 0.88420589375 | ||
10 | 0.106742 | 0.89325838412 |
NA
Set the trade-off ratio cost cutoff (torc). The parameter torc represents the number of real molecules one is willing to incorrectly discard in order to correctly purge one phantom molecule. Since discarding a large proportion of the data is undesirable, reasonable values of torc are expected to be within the range of 1-5.
torc <- 3
tic("Step 5: reassign read counts, determine cutoff, and mark retained observations")
outcome_counts <-
reassign_reads_and_mark_retained_observations(
outcome_counts,
summary_stats,
sample_names,
fit_out,
torc
)
# get the tradoff ratio cutoff
summary_stats <- get_threshold(outcome_counts, summary_stats)
toc()
Step 5: reassign read counts, determine cutoff, and mark retained observations: 1.176 sec elapsed
tic("Step 6: Purge and save read counts datatable to disk")
read_counts <-
left_join(read_counts %>%
select(outcome, cell, umi, gene, sample_names, label),
outcome_counts,
by = c("outcome")
) %>%
select(-outcome)
toc()
Step 6: Purge and save read counts datatable to disk: 2.166 sec elapsed
p5 <-
ggplot(summary_counts_conditional) +
geom_line(aes(x = r,
y = SIHR_12,
colour="12"))+
geom_line(aes(x = r,
y = SIHR_21,
colour="21")) +
geom_hline(aes(yintercept = summary_counts_marginal$SIHR,
colour="true mean"),
linetype="solid",
size=.5) +
geom_hline(aes(yintercept = fit_out$glm_estimates$SIHR,
colour="estimate"),
linetype="solid",
size=.1) +
geom_linerange(data=summary_counts_conditional,
aes(x=r,
ymax=1-fit_out$glm_estimates$phat_low,
ymin=1-fit_out$glm_estimates$phat_high,
colour="estimate"),
size=.5)+
xlim(1,210) +
ylim(0.002,0.005)
#ggsave("index_hopping_rate_200.pdf", p5, width=9, height=6)
p5
read_counts <-
read_counts %>%
ungroup() %>%
arrange(-qr) %>%
mutate(t= case_when(
label %in% c("f,r","0,r") ~ 2,
label %in% c("r,f","r,0") ~ 1 ),
f= case_when(
label %in% c("f,r","f,0") ~ 1,
label %in% c("r,f","0,f") ~ 2 )) %>%
mutate(tp= if_else( t == s, 1L, 0L, missing =0L),
fp= if_else( f == s, 1L, 0L, missing =0L),
tn= if_else( f != s, 1L, 0L, missing =0L),
fn= if_else( t != s, 1L, 0L, missing =0L),
tp0= if_else( t == 0, 1L, 0L, missing =0L), #0 if predict all molecules to be phantom
fn0= if_else( t != 0, 1L, 0L, missing =0L),
tn0= if_else( f != 0, 1L, 0L, missing =0L),
fp0= if_else( f == 0, 1L, 0L, missing =0L),
tp_max= if_else( t == s_maxprop, 1L, 0L, missing =0L),
fp_max= if_else( f == s_maxprop, 1L, 0L, missing =0L),
tn_max= if_else( f != s_maxprop, 1L, 0L, missing =0L),
fn_max= if_else( t != s_maxprop, 1L, 0L, missing =0L))
false_counts_maxprop <-
read_counts %>%
summarize_at(vars(c("tp_max", "fp_max", "tn_max", "fn_max")),
list( ~ sum(.))) %>%
set_names(c("tp", "fp", "tn", "fn"))
false_counts_maxprop
tp <int> | fp <int> | tn <int> | fn <int> | |
---|---|---|---|---|
9226006 | 10000 | 475399 | 16094 |
false_counts_min_cutoff <-
read_counts %>%
summarize_at(vars(c("tp", "fp", "tn", "fn")),
list( ~ sum(.)))
false_counts_min_cutoff
tp <int> | fp <int> | tn <int> | fn <int> | |
---|---|---|---|---|
9239510 | 12585 | 472814 | 2590 |
false_counts_nopurging <-
read_counts%>%
summarize(n_cugs = n(),
n_real= sum(t>0, na.rm = TRUE),
n_fantom = sum(f>0, na.rm = TRUE),
n_mol=n_real+n_fantom,
g =n_cugs- n_real,
u = n_mol-n_cugs,
tp=n_mol-g,
fp=u+g,
tn=0,
fn=0)
false_counts_nopurging
n_cugs <int> | n_real <int> | n_fantom <int> | n_mol <int> | g <int> | u <int> | tp <int> | fp <int> | tn <dbl> | fn <dbl> |
---|---|---|---|---|---|---|---|---|---|
9252095 | 9242100 | 485399 | 9727499 | 9995 | 475404 | 9717504 | 485399 | 0 | 0 |
read_counts <-
read_counts %>%
mutate_at(vars(c("tp", "fp", "tn", "fn","tp0", "fp0", "tn0", "fn0")),
list(cum= ~ cumsum(.))) %>%
mutate_at(vars(c("tp_cum", "fp_cum", "tn_cum", "fn_cum")),
list( ~ (last(.)-lag(., default =0)))) %>%
mutate(tp_t=tp_cum + tp0_cum,
fp_t=fp_cum + fp0_cum,
tn_t=tn_cum + tn0_cum,
fn_t=fn_cum + fn0_cum,
fpm= first(fp_t)- fp_t,
fnm= fn_t-first(fn_t),
tor_true= fnm/fpm)
false_counts_tor_cutoff <-
read_counts%>%
filter(retain) %>%
slice(1)%>%
select(c("s1", "s2", "qr", "tor", "tp_t", "fp_t", "tn_t", "fn_t", "fpm", "fnm", "tor_true"))
false_counts_tor_cutoff
s1 <dbl> | s2 <dbl> | qr <dbl> | tor <dbl> | tp_t <dbl> | fp_t <dbl> | tn_t <dbl> | fn_t <dbl> | fpm <dbl> | fnm <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 0 | 0.0084875 | 2.4222 | 9234416 | 10116 | 475283 | 7685 | 2469 | 5094 |
false_counts_dt <-
bind_rows(
list(no_purging=false_counts_nopurging %>%
select(c("tp", "fp", "tn", "fn")),
no_cutoff=false_counts_min_cutoff,
tor_cutoff=
false_counts_tor_cutoff %>%
select(c("tp_t", "fp_t", "tn_t", "fn_t")) %>%
set_names(c("tp", "fp", "tn", "fn")),
max_frac=false_counts_maxprop),
.id="approach") %>%
select(approach, fp,fn, tp, tn) %>%
mutate(fpr=fp/false_counts_nopurging$n_fantom,
fnr=fn/false_counts_nopurging$n_real)
false_counts_dt
approach <chr> | fp <dbl> | fn <dbl> | tp <dbl> | tn <dbl> | fpr <dbl> | fnr <dbl> |
---|---|---|---|---|---|---|
no_purging | 485399 | 0 | 9717504 | 0 | 1.000000 | 0.00000000 |
no_cutoff | 12585 | 2590 | 9239510 | 472814 | 0.025927 | 0.00028024 |
tor_cutoff | 10116 | 7685 | 9234416 | 475283 | 0.020841 | 0.00083152 |
max_frac | 10000 | 16094 | 9226006 | 475399 | 0.020602 | 0.00174138 |
classification_curves <-
read_counts %>%
group_by(qr) %>%
slice(1L) %>%
ungroup() %>%
select( qr, qs, tor, retain, fp_t, FP, fn_t, FN, tp_t, tn_t, TP, TN, FPm, FNm, fpm, fnm, tor_true,o,r) %>%
mutate(fpr=fp_t/false_counts_nopurging$n_fantom,
fnr=fn_t/false_counts_nopurging$n_real)
classification_curves
qr <dbl> | qs <dbl> | tor <dbl> | retain <lgl> | fp_t <dbl> | FP <dbl> | fn_t <dbl> | FN <dbl> | tp_t <dbl> | tn_t <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
0.0000e+00 | 0.0000 | 455.6849 | TRUE | 0 | 0 | 4755852 | 4778259 | 4486249 | 485400 | |
2.2204e-16 | 5.0792 | 440.7896 | TRUE | 0 | 0 | 4617371 | 4622142 | 4624730 | 485399 | |
6.2172e-15 | 18.0053 | 438.2251 | TRUE | 0 | 0 | 4593524 | 4595263 | 4648577 | 485399 | |
7.7716e-15 | 18.9606 | 438.1748 | TRUE | 0 | 0 | 4592997 | 4594736 | 4649104 | 485400 | |
9.3259e-15 | 19.7432 | 438.1740 | TRUE | 0 | 0 | 4592989 | 4594728 | 4649112 | 485400 | |
3.7970e-14 | 25.8058 | 438.1738 | TRUE | 0 | 0 | 4592987 | 4594726 | 4649114 | 485400 | |
4.4853e-14 | 26.5276 | 438.1691 | TRUE | 0 | 0 | 4592937 | 4594676 | 4649164 | 485400 | |
5.3069e-14 | 27.2566 | 437.9079 | TRUE | 1 | 0 | 4590201 | 4591939 | 4651900 | 485399 | |
7.5495e-14 | 28.7849 | 423.4035 | TRUE | 2 | 0 | 4438180 | 4439918 | 4803920 | 485398 | |
1.5612e-12 | 41.9349 | 420.2246 | TRUE | 2 | 0 | 4404863 | 4406600 | 4837238 | 485397 |
p_tradeoff <-
ggplot(classification_curves) +
geom_point(
aes(x = FPm,
y = FNm),
size=.5)+
geom_line(
aes(x = FPm,
y = FPm,
colour="1")
) +
geom_line(
aes(x = FPm,
y = 2*FPm,
colour="2"))+
geom_line(
aes(x = FPm,
y = 3*FPm,
colour="3"))+
geom_line(
aes(x = FPm,
y = 4*FPm,
colour="4"))+
geom_line(
aes(x = FPm,
y = 5*FPm,
colour="5"))+
geom_line(
aes(x = FPm,
y = 9*FPm,
colour="9"))+
scale_y_log10() +
theme_bw() +
theme(
legend.title = element_text(face = "bold")) +
scale_color_discrete(name = "TORC") +
labs(x="Marginal Decrease in False Positives (reduce phantom molecs) ",
y="Marginal Increase in False Negatives (discard real molecs)")
#ggsave(file.path(figures_dir, "validation_tradeoff.pdf"), p_tradeoff, width=9, height=6)
p_tradeoff
p6 <-
ggplot(classification_curves) +
geom_point(
aes(x = fp_t,
y = fn_t,
colour="true")) +
geom_line(
aes(x = fp_t,
y = fn_t,
colour="true")) +
geom_line(
aes(x = FP,
y = FN,
colour="predicted"))+
geom_point(
aes(x = FP,
y = FN,
colour="predicted"))+
geom_point(data=false_counts_dt ,
aes(x = fp,
y = fn,
shape=approach),
size=2) +
labs(x="False Positive Count",
y="False Negative Count") +
scale_y_sqrt() +
scale_x_sqrt()
#ggsave("peformance_groundtruth.pdf", p6, width=9, height=6)
p6
p7 <- ggplot(false_counts_dt
%>% filter(approach %in% c("no_cutoff", "tor_cutoff", "max_frac"))) +
geom_point(
aes(x = fp ,
y = fn,
color=approach),
size=2)
#ggsave("peformance_zoom.pdf", p7, width=9, height=6)
p7
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.2 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8
[6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] cowplot_0.9.4 data.table_1.12.2 tictoc_1.0 furrr_0.1.0 future_1.13.0 broom_0.5.2 matrixStats_0.54.0 forcats_0.4.0
[9] stringr_1.4.0 dplyr_0.8.1 purrr_0.3.2 readr_1.3.1 tidyr_0.8.3 tibble_2.1.2 ggplot2_3.1.1 tidyverse_1.2.1
[17] rhdf5_2.28.0
loaded via a namespace (and not attached):
[1] tidyselect_0.2.5 xfun_0.7 listenv_0.7.0 haven_2.1.0 lattice_0.20-38 colorspace_1.4-1 generics_0.0.2 yaml_2.2.0
[9] rlang_0.3.4 pillar_1.4.1 glue_1.3.1 withr_2.1.2 modelr_0.1.4 readxl_1.3.1 plyr_1.8.4 munsell_0.5.0
[17] gtable_0.3.0 cellranger_1.1.0 rvest_0.3.4 codetools_0.2-16 labeling_0.3 knitr_1.23 parallel_3.6.0 Rcpp_1.0.1
[25] scales_1.0.0 backports_1.1.4 jsonlite_1.6 hms_0.4.2 digest_0.6.19 stringi_1.4.3 grid_3.6.0 rprojroot_1.3-2
[33] cli_1.1.0 tools_3.6.0 magrittr_1.5 lazyeval_0.2.2 crayon_1.3.4 pkgconfig_2.0.2 MASS_7.3-51.1 xml2_1.2.0
[41] lubridate_1.7.4 assertthat_0.2.1 httr_1.4.0 rstudioapi_0.10 Rhdf5lib_1.6.0 R6_2.4.0 globals_0.12.4 nlme_3.1-140
[49] compiler_3.6.0