Last updated: 2018-05-26
workflowr checks: (Click a bullet for more information) ✔ R Markdown file: up-to-date
Great! Since the R Markdown file has been committed to the Git repository, you know the exact version of the code that produced these results.
✔ Environment: empty
Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.
✔ Seed:
set.seed(12345)
The command set.seed(12345)
was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.
✔ Session information: recorded
Great job! Recording the operating system, R version, and package versions is critical for reproducibility.
✔ Repository version: 2076ce9
wflow_publish
or wflow_git_commit
). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated:
Ignored files:
Ignored: .Rhistory
Ignored: .Rproj.user/
Untracked files:
Untracked: data/gene_cov/
Untracked: data/reads_mapped_three_prime_seq.csv
Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes.
File | Version | Author | Date | Message |
---|---|---|---|---|
Rmd | 2076ce9 | Briana Mittleman | 2018-05-26 | initial commit, gene level analysis |
I will use this analysis to take a look at the initial protein conding gene counts.
library(workflowr)
Loading required package: rmarkdown
This is workflowr version 1.0.1
Run ?workflowr for help getting started
library(ggplot2)
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(edgeR)
Warning: package 'edgeR' was built under R version 3.4.3
Loading required package: limma
Warning: package 'limma' was built under R version 3.4.3
Imput the data that was created from my coding gene rule in the snakefile.
N_18486_cov= read.table("../data/gene_cov/YL-SP-18486-N_S10_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_18486_cov= read.table("../data/gene_cov/YL-SP-18486-T_S9_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_18497_cov= read.table("../data/gene_cov/YL-SP-18497-N_S12_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_18497_cov= read.table("../data/gene_cov/YL-SP-18497-T_S11_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_18500_cov= read.table("../data/gene_cov/YL-SP-18500-N_S20_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_18500_cov= read.table("../data/gene_cov/YL-SP-18500-T_S19_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_18505_cov= read.table("../data/gene_cov/YL-SP-18505-N_S2_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_18505_cov= read.table("../data/gene_cov/YL-SP-18505-T_S1_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_18508_cov= read.table("../data/gene_cov/YL-SP-18508-N_S6_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_18508_cov= read.table("../data/gene_cov/YL-SP-18508-T_S5_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_18853_cov= read.table("../data/gene_cov/YL-SP-18853-N_S32_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_18853_cov= read.table("../data/gene_cov/YL-SP-18853-T_S31_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_18870_cov= read.table("../data/gene_cov/YL-SP-18870-N_S24_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_18870_cov= read.table("../data/gene_cov/YL-SP-18870-T_S23_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_19128_cov= read.table("../data/gene_cov/YL-SP-19128-N_S30_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_19128_cov= read.table("../data/gene_cov/YL-SP-19128-T_S29_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_19141_cov= read.table("../data/gene_cov/YL-SP-19141-N_S18_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_19141_cov= read.table("../data/gene_cov/YL-SP-19141-T_S17_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_19193_cov= read.table("../data/gene_cov/YL-SP-19193-N_S22_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_19193_cov= read.table("../data/gene_cov/YL-SP-19193-T_S21_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_19209_cov= read.table("../data/gene_cov/YL-SP-19209-N_S16_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_19209_cov= read.table("../data/gene_cov/YL-SP-19209-T_S15_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_19223_cov= read.table("../data/gene_cov/YL-SP-19223-N_S8_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_19223_cov= read.table("../data/gene_cov/YL-SP-19233-T_S7_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_19225_cov= read.table("../data/gene_cov/YL-SP-19225-N_S28_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_19225_cov= read.table("../data/gene_cov/YL-SP-19225-T_S27_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_19238_cov= read.table("../data/gene_cov/YL-SP-19238-N_S4_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_19238_cov= read.table("../data/gene_cov/YL-SP-19238-T_S3_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_19239_cov= read.table("../data/gene_cov/YL-SP-19239-N_S14_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_19239_cov= read.table("../data/gene_cov/YL-SP-19239-T_S13_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
N_19257_cov= read.table("../data/gene_cov/YL-SP-19257-N_S26_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
T_19257_cov= read.table("../data/gene_cov/YL-SP-19257-T_S25_R1_001-genecov.txt", stringsAsFactors = FALSE, header = F, col.names = c("chr", "start", "end", "gene", "score", "strand", "count" ))
Look at the total libraries first:
total_count_matrix=cbind(T_18486_cov$count, T_18497_cov$count, T_18500_cov$count, T_18505_cov$count, T_18508_cov$count, T_18853_cov$count, T_18870_cov$count, T_19128_cov$count, T_19141_cov$count, T_19193_cov$count, T_19209_cov$count, T_19223_cov$count, T_19225_cov$count, T_19238_cov$count,T_19239_cov$count, T_19257_cov$count)
#gene length vector
gene_length=T_18497_cov %>% mutate(genelength=end-start) %>% select(genelength)
gene_length_vec=as.vector(gene_length$genelength)
total_count_matrix_cpm=cpm(total_count_matrix, log=T, gene.length=gene_length_vec )
Plot distribution of log2 cpm for total libraries.
plotDensities(total_count_matrix_cpm, legend = "bottomright", main="Pre-filtering total fraction")
Look at gene distributions for the nuclear fractions.
nuclear_count_matrix=cbind(N_18486_cov$count, N_18497_cov$count, N_18500_cov$count, N_18505_cov$count, N_18508_cov$count, N_18853_cov$count, N_18870_cov$count, N_19128_cov$count, N_19141_cov$count, N_19193_cov$count, N_19209_cov$count, N_19223_cov$count, N_19225_cov$count, N_19238_cov$count,N_19239_cov$count, N_19257_cov$count)
#cpm
nuclear_count_matrix_cpm=cpm(nuclear_count_matrix, log=T, gene.length=gene_length_vec )
Plot distribution of log2 cpm for nuclear libraries.
plotDensities(nuclear_count_matrix_cpm, legend = "bottomright", main="Pre-filtering nuclear fraction")
The distributions look similar. I can filter based on alll of the libraries. I will filter for 1cpm in more than half of the libraries. After this I can ask how many genes are detected in each library.
all_count_matrix=cbind(T_18486_cov$count, T_18497_cov$count, T_18500_cov$count, T_18505_cov$count, T_18508_cov$count, T_18853_cov$count, T_18870_cov$count, T_19128_cov$count, T_19141_cov$count, T_19193_cov$count, T_19209_cov$count, T_19223_cov$count, T_19225_cov$count, T_19238_cov$count,T_19239_cov$count, T_19257_cov$count,N_18486_cov$count, N_18497_cov$count, N_18500_cov$count, N_18505_cov$count, N_18508_cov$count, N_18853_cov$count, N_18870_cov$count, N_19128_cov$count, N_19141_cov$count, N_19193_cov$count, N_19209_cov$count, N_19223_cov$count, N_19225_cov$count, N_19238_cov$count,N_19239_cov$count, N_19257_cov$count )
#cpm
all_count_matrix_cpm=cpm(all_count_matrix, log=T, gene.length=gene_length_vec )
plotDensities(all_count_matrix_cpm, legend = "bottomright", main="Pre-filtering all libraries")
Filter:
keep.exprs=rowSums(all_count_matrix_cpm>1) >= 16
all_count_matrix_cpm_filt= all_count_matrix_cpm[keep.exprs,]
plotDensities(all_count_matrix_cpm_filt, legend = "bottomright", main="Post-filtering all libraries")
Post filtering we are left with 12461 protein coding genes.
sessionInfo()
R version 3.4.2 (2017-09-28)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] bindrcpp_0.2 edgeR_3.20.9 limma_3.34.9 dplyr_0.7.4
[5] ggplot2_2.2.1 workflowr_1.0.1 rmarkdown_1.8.5
loaded via a namespace (and not attached):
[1] Rcpp_0.12.15 compiler_3.4.2 pillar_1.1.0
[4] git2r_0.21.0 plyr_1.8.4 bindr_0.1
[7] R.methodsS3_1.7.1 R.utils_2.6.0 tools_3.4.2
[10] digest_0.6.14 evaluate_0.10.1 tibble_1.4.2
[13] gtable_0.2.0 lattice_0.20-35 pkgconfig_2.0.1
[16] rlang_0.1.6 yaml_2.1.16 stringr_1.2.0
[19] knitr_1.18 locfit_1.5-9.1 rprojroot_1.3-2
[22] grid_3.4.2 glue_1.2.0 R6_2.2.2
[25] magrittr_1.5 whisker_0.3-2 backports_1.1.2
[28] scales_0.5.0 htmltools_0.3.6 assertthat_0.2.0
[31] colorspace_1.3-2 stringi_1.1.6 lazyeval_0.2.1
[34] munsell_0.4.3 R.oo_1.22.0
This reproducible R Markdown analysis was created with workflowr 1.0.1