library(tidyverse)
library(rhdf5)
#library(DropletUtils) # install but not load
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
fig.width=15,
digit=5,
scipen=8)
options(digits=5,
scipen=8,
future.globals.maxSize = +Inf)
project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
project_dir <- getwd()
warning(sprintf("No rstudio project root file found.
Setting project directory to current workflow.Rmd file location: %s.
Override if needed.",
project_dir))
}
message(sprintf("Project directory: %s",
project_dir))
Project directory: /home/rfarouni/Documents/index_hopping
code_dir <- file.path(project_dir, "code")
source(file.path(code_dir, "1_create_joined_counts_table.R"))
datasets_names <- c("hiseq4000_nonplexed", "hiseq4000_plexed")
names(datasets_names) <- datasets_names
datasets_names
hiseq4000_nonplexed hiseq4000_plexed
"hiseq4000_nonplexed" "hiseq4000_plexed"
validation_output_dir <- file.path(project_dir, "data", "hiseq4000_validation")
dir.create(validation_output_dir)
'/home/rfarouni/Documents/index_hopping/data/hiseq4000_validation' already exists
get_read_counts <- function(dataset_name) {
data_dir <- file.path(project_dir, "data", dataset_name)
output_dir <- file.path(data_dir, "output")
input_dir <- file.path(data_dir, "input")
read_counts_filepath <- file.path(
output_dir,
paste0(
dataset_name,
"_read_counts.rds"
)
)
read_counts <-
create_joined_counts(
input_dir,
read_counts_filepath
)
return(read_counts)
}
read_counts <- map(datasets_names, get_read_counts)
data_fj <-
full_join(read_counts$hiseq4000_nonplexed %>%
select(-outcome),
read_counts$hiseq4000_plexed,
by = c("cell", "gene", "umi"),
suffix = c("_nonplexed", "_plexed")
)
create keys
old_key <- unique(data_fj$gene)
new_key <-
sample.int(
n = length(old_key),
size = length(old_key)
)
names(new_key) <- old_key
new_key <-
enframe(new_key,
name = "gene",
value = "gene_new"
)
recode
data_fj <-
left_join(data_fj, new_key, by = "gene") %>%
mutate(
gene = NULL,
gene = gene_new
) %>%
select(cell, gene, umi, everything()) %>%
select(-gene_new) %>%
mutate_if(is.double, as.integer) %>%
set_names(c("cell", "gene", "umi", "s1_nonplexed", "s2_nonplexed", "s1_plexed", "s2_plexed", "outcome"))
data_fj
cell <chr> | gene <int> | umi <int> | s1_nonplexed <int> | s2_nonplexed <int> | s1_plexed <int> | s2_plexed <int> | outcome <chr> |
---|---|---|---|---|---|---|---|
AAACCTGAGAAACCTA | 9996 | 675238 | 1 | 0 | NA | NA | NA |
AAACCTGAGAAACCTA | 13108 | 810559 | 1 | 0 | NA | NA | NA |
AAACCTGAGAAACCTA | 15283 | 486487 | 1 | 0 | NA | NA | NA |
AAACCTGAGAAACCTA | 14121 | 159000 | 1 | 0 | 1 | 0 | 1,0 |
AAACCTGAGAAACCTA | 14121 | 115508 | 2 | 0 | NA | NA | NA |
AAACCTGAGAAACCTA | 8108 | 842392 | 3 | 0 | 2 | 0 | 2,0 |
AAACCTGAGAAACCTA | 2605 | 1000660 | 1 | 0 | NA | NA | NA |
AAACCTGAGAAACCTA | 2605 | 18500 | 1 | 0 | NA | NA | NA |
AAACCTGAGAAACCTA | 2605 | 138546 | 1 | 0 | 1 | 0 | 1,0 |
AAACCTGAGAAACCTA | 11148 | 606019 | 4 | 0 | NA | NA | NA |
write_tsv(data_fj,
file.path(validation_output_dir, "hiseq4000_joined_datatable_plexed_nonplexed.txt"))
inner join
data <-
data_fj %>%
drop_na()
data
cell <chr> | gene <int> | umi <int> | s1_nonplexed <int> | s2_nonplexed <int> | s1_plexed <int> | s2_plexed <int> | outcome <chr> | |
---|---|---|---|---|---|---|---|---|
4 | AAACCTGAGAAACCTA | 14121 | 159000 | 1 | 0 | 1 | 0 | 1,0 |
6 | AAACCTGAGAAACCTA | 8108 | 842392 | 3 | 0 | 2 | 0 | 2,0 |
9 | AAACCTGAGAAACCTA | 2605 | 138546 | 1 | 0 | 1 | 0 | 1,0 |
11 | AAACCTGAGAAACCTA | 2725 | 855812 | 5 | 0 | 2 | 0 | 2,0 |
12 | AAACCTGAGAAACCTA | 2211 | 577124 | 2 | 0 | 2 | 0 | 2,0 |
13 | AAACCTGAGAAACCTA | 5339 | 250233 | 1 | 0 | 3 | 0 | 3,0 |
17 | AAACCTGAGAAACCTA | 19619 | 302544 | 3 | 0 | 3 | 0 | 3,0 |
19 | AAACCTGAGAAACCTA | 20884 | 644179 | 3 | 0 | 2 | 0 | 2,0 |
20 | AAACCTGAGAAACCTA | 2614 | 1043378 | 2 | 0 | 2 | 0 | 2,0 |
21 | AAACCTGAGAAACCTA | 4170 | 751749 | 324 | 0 | 185 | 1 | 185,1 |
data <-
data %>%
mutate(label= case_when(
s1_nonplexed != 0 & s2_nonplexed != 0 & s1_plexed!=0 & s2_plexed!=0~ "r,r",
s1_nonplexed == 0 & s1_plexed == 0 ~ "0,r",
s2_nonplexed == 0 & s2_plexed == 0 ~ "r,0",
s1_nonplexed == 0 & s1_plexed != 0 & s2_plexed == 0 ~ "f,0",
s1_nonplexed == 0 & s1_plexed != 0 & s2_plexed != 0 ~ "f,r",
s2_nonplexed == 0 & s2_plexed != 0 & s1_plexed == 0 ~ "0,f",
s2_nonplexed == 0 & s2_plexed != 0 & s1_plexed != 0 ~ "r,f",
s1_nonplexed != 0 & s2_nonplexed != 0 & s1_plexed==0 & s2_plexed!=0~ "0,r",
s1_nonplexed != 0 & s2_nonplexed != 0 & s1_plexed!=0 & s2_plexed==0~ "r,0",
TRUE ~ "NA"
))
data
cell <chr> | gene <int> | umi <int> | s1_nonplexed <int> | s2_nonplexed <int> | s1_plexed <int> | s2_plexed <int> | outcome <chr> | label <chr> |
---|---|---|---|---|---|---|---|---|
AAACCTGAGAAACCTA | 14121 | 159000 | 1 | 0 | 1 | 0 | 1,0 | r,0 |
AAACCTGAGAAACCTA | 8108 | 842392 | 3 | 0 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 2605 | 138546 | 1 | 0 | 1 | 0 | 1,0 | r,0 |
AAACCTGAGAAACCTA | 2725 | 855812 | 5 | 0 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 2211 | 577124 | 2 | 0 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 5339 | 250233 | 1 | 0 | 3 | 0 | 3,0 | r,0 |
AAACCTGAGAAACCTA | 19619 | 302544 | 3 | 0 | 3 | 0 | 3,0 | r,0 |
AAACCTGAGAAACCTA | 20884 | 644179 | 3 | 0 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 2614 | 1043378 | 2 | 0 | 2 | 0 | 2,0 | r,0 |
AAACCTGAGAAACCTA | 4170 | 751749 | 324 | 0 | 185 | 1 | 185,1 | r,f |
label_tally <-
data %>%
group_by(label) %>%
tally() %>%
add_row(label = "TOTAL", n=sum(.$n))
label_tally
label <chr> | n <int> | |||
---|---|---|---|---|
0,f | 3484 | |||
0,r | 6637616 | |||
f,0 | 6511 | |||
f,r | 215147 | |||
r,0 | 2129080 | |||
r,f | 260257 | |||
r,r | 52 | |||
TOTAL | 9252147 |
There are only 52 (real, real) colliding CUGs out of 9252147 so assumption I is validated. The collision rate is 0.00001.
data <-
data %>%
filter(label!="r,r")
write_tsv(data, file.path(validation_output_dir,"hiseq4000_inner_joined_with_labels.txt"))
number of reads (in millions)
data_fj %>%
ungroup() %>%
summarize_at(vars(matches("^s")), list(~ sum(./10^6, na.rm = TRUE)))
s1_nonplexed <dbl> | s2_nonplexed <dbl> | s1_plexed <dbl> | s2_plexed <dbl> | |
---|---|---|---|---|
184.77 | 184.17 | 92.7 | 77.909 |
data %>%
summarize_at(vars(matches("^s")), list(~ sum(./10^6)))
s1_nonplexed <dbl> | s2_nonplexed <dbl> | s1_plexed <dbl> | s2_plexed <dbl> | |
---|---|---|---|---|
182.01 | 176.67 | 91.884 | 76.294 |
number of molecules (in millions)
data_fj %>%
mutate_at(vars(matches("^s")), list(~ as.integer(.!=0))) %>%
ungroup() %>%
summarize_at(vars(matches("^s")), list(~ sum(./10^6, na.rm = TRUE)))
s1_nonplexed <dbl> | s2_nonplexed <dbl> | s1_plexed <dbl> | s2_plexed <dbl> | |
---|---|---|---|---|
3.9066 | 10.837 | 3.2209 | 8.3142 |
data %>%
mutate_at(vars(matches("^s")), list(~ as.integer(.!=0))) %>%
ungroup() %>%
summarize_at(vars(matches("^s")), list(~ sum(./10^6, na.rm = TRUE)))
s1_nonplexed <dbl> | s2_nonplexed <dbl> | s1_plexed <dbl> | s2_plexed <dbl> | |
---|---|---|---|---|
2.3929 | 6.8593 | 2.611 | 7.1165 |
p1 <- ggplot(data, aes(x = s1_nonplexed,
y = s1_plexed))
p1 + geom_hex(bins = 500)+
geom_abline(slope=.5,intercept=0)
p2 <- ggplot(data,
aes(x = s2_nonplexed,
y = s2_plexed))
p2 +
geom_hex(bins = 500)+
geom_abline(slope=.5,intercept=0)
# geom_point()
data_reads_cell_gene <-
data %>%
group_by(cell, gene) %>%
summarize_at(vars(matches("^s")),
list(~ sum(.)))
ggplot(data_reads_cell_gene,
aes(x = s1_nonplexed,
y = s1_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
ggplot(data_reads_cell_gene,
aes(x = s2_nonplexed,
y = s2_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_reads_cell_gene,
aes(x = s1_nonplexed,
y = s2_nonplexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_reads_cell_gene,
aes(x = s1_plexed,
y = s2_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_reads_cell_gene,
aes(x = s1_plexed,
y = s2_nonplexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
## Gene expression UMI counts with same cell-gene label
data_molec_cell_gene <-
data %>%
mutate_at(vars(matches("^s")),
list(~ as.integer(.!=0))) %>%
group_by(cell, gene) %>%
summarize_at(vars(matches("^s")),
list(~ sum(.)))
ggplot(data_molec_cell_gene,
aes(x = s1_nonplexed,
y = s1_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
ggplot(data_molec_cell_gene,
aes(x = s2_nonplexed,
y = s2_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_molec_cell_gene,
aes(x = s1_nonplexed,
y = s2_nonplexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_molec_cell_gene,
aes(x = s1_plexed,
y = s2_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_molec_cell_gene,
aes(x = s1_plexed,
y = s2_nonplexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_molec_cell_gene,
aes(x = s2_plexed,
y = s1_nonplexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
## Gene expression UMI counts (nonhopping) with same cell-gene label
Here we retain molecules for which the reads didn’t hop
data_molec_cell_gene_nonhopping <-
data %>%
filter(label%in% c("r,0","0,r"))%>%
mutate_at(vars(matches("^s")),
list(~ as.integer(.!=0))) %>%
group_by(cell, gene) %>%
summarize_at(vars(matches("^s")),
list(~ sum(.)))
ggplot(data_molec_cell_gene_nonhopping,
aes(x = s1_nonplexed,
y = s1_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
ggplot(data_molec_cell_gene_nonhopping,
aes(x = s2_nonplexed,
y = s2_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_molec_cell_gene_nonhopping,
aes(x = s1_nonplexed,
y = s2_nonplexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_molec_cell_gene_nonhopping,
aes(x = s1_plexed,
y = s2_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_molec_cell_gene_nonhopping,
aes(x = s1_plexed,
y = s2_nonplexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.2 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8
[6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] hexbin_1.27.3 rhdf5_2.28.0 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.1 purrr_0.3.2 readr_1.3.1 tidyr_0.8.3 tibble_2.1.2
[10] ggplot2_3.1.1 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] Rcpp_1.0.1 cellranger_1.1.0 pillar_1.4.1 compiler_3.6.0 plyr_1.8.4 tools_3.6.0 digest_0.6.19 jsonlite_1.6
[9] lubridate_1.7.4 nlme_3.1-140 gtable_0.3.0 lattice_0.20-38 pkgconfig_2.0.2 rlang_0.3.4 cli_1.1.0 rstudioapi_0.10
[17] yaml_2.2.0 haven_2.1.0 xfun_0.7 withr_2.1.2 xml2_1.2.0 httr_1.4.0 knitr_1.23 generics_0.0.2
[25] hms_0.4.2 rprojroot_1.3-2 grid_3.6.0 tidyselect_0.2.5 glue_1.3.1 R6_2.4.0 readxl_1.3.1 Rhdf5lib_1.6.0
[33] modelr_0.1.4 magrittr_1.5 backports_1.1.4 scales_1.0.0 rvest_0.3.4 assertthat_0.2.1 colorspace_1.4-1 labeling_0.3
[41] stringi_1.4.3 lazyeval_0.2.2 munsell_0.5.0 broom_0.5.2 crayon_1.3.4