Required packages
require(tidyverse)
require(data.table)
require(BSgenome.Dmelanogaster.UCSC.dm3)
require(seqinr)
require(stringr)
require(GenomicRanges)
require(patchwork)
require(ggpointdensity)
require(gridExtra)
require(cowplot)
require(ggrepel)
require(Biostrings)
require(ggplot2)
theme_set(theme_classic() + theme(axis.text = element_text(colour = "black")))
# Library colours
# S2171 also called ced6 enhancer
<- c("Controls"="grey60",
my_col "dev"="orangered",
"dev enh"="orangered",
"S2171_wt"="red",
"chr3L_3310914_3311162_wt"="red",
"Wt"="grey60",
"pos110"=RColorBrewer::brewer.pal(7, "Set1")[2],
"ced6_pos110"=RColorBrewer::brewer.pal(7, "Set1")[2],
"ced6_pos182"="#CCCC00",
"pos182"="#CCCC00",
"ced6_pos230"="#e78ac3",
"pos230"="#e78ac3",
"ced6_pos241"=RColorBrewer::brewer.pal(7, "Set1")[5],
"pos241"=RColorBrewer::brewer.pal(7, "Set1")[5],
"ZnT63C_pos210"=RColorBrewer::brewer.pal(7, "Set1")[7],
"pos210"=RColorBrewer::brewer.pal(7, "Set1")[7],
"ZnT63C_pos180"=RColorBrewer::brewer.pal(7, "Set1")[3],
"pos180"=RColorBrewer::brewer.pal(7, "Set1")[3],
"ZnT63C_pos142"="#BA3979",
"pos142"="#BA3979")
### Motif colours
<- c("wt"="orangered",
my_motif_col "Trl"="#e78ac3",
"GAGA"="#e78ac3",
"GAGAG"="#e78ac3",
"GAGAGA"="#e78ac3",
"GATA"="#78A8F3",
"GATAA"="#78A8F3",
"GATAAG"="#78A8F3",
"twist"="#7AB06F",
"CATCTG"="#7AB06F",
"AP1"="#BA3979",
"AP-1"="#BA3979",
"TGA.TCA"="#BA3979",
"ttk"="#DCDF80",
"AGGAT"="#DCDF80",
"AGGATAA"="#DCDF80",
"CCGGAA"="#a6761d",
"ETS"="#a6761d",
"TCACGCG"="#7570b3",
"TCACGCGA"="#7570b3",
"SREBP"="#7570b3",
"CREB/ATF"="#9B9666",
"CREB"="#9B9666",
"TGATGA"="#9B9666",
"TCATCA"="#9B9666",
"TGATGT"="#9B9666",
"TGTGAAA"="#9B9666",
"STAT"="#FF9933",
"TCC.GGA"="#FF9933",
"TTCC.GGA"="#FF9933",
"ATCGAT"="dodgerblue",
"Dref"="dodgerblue"
)
# Metadata of sequences in library
<- readRDS("data/final_variants_metadata.rds")
final_metadata <- final_metadata[,-grep("twist_S2171|Twist_mix.y", names(final_metadata))]
final_metadata
<- read.delim("data/experiment.txt", stringsAsFactors = F)
experiment_table
<- "UMI"
type for(i in 1:nrow(experiment_table)){
=experiment_table$Outfile[i]
name<- import.bed(paste0("data/", name, "_final_mapped.", type, ".bed")) # this are the rv reads
mapped
# choose sequences with correct length and strand (negative strand because these are the results of rv read mapping)
if(length(grep("S2171", name))>0){
<- mapped[mapped@ranges@width==149] # in this lane we got 149nt reads
mapped_correct_length <- mapped_correct_length[(mapped_correct_length@strand =="-" & mapped_correct_length@ranges@start==101) | (mapped_correct_length@strand =="+" & mapped_correct_length@ranges@start==1)]
mapped_correct_length
# I need to correct the oligo names of wt chr3L_3310914_3311162 reads In the final index (and metadata) the correct ID is chr3L_3310914_3311162_wt, and not chr3L_3310914_3311162_+_dCP
@seqnames@values <- as.character(mapped_correct_length@seqnames@values)
mapped_correct_length@seqnames@values[grep("chr3L_3310914_3311162", as.character(mapped_correct_length@seqnames@values))] <- "chr3L_3310914_3311162_wt"
mapped_correct_length
}
# choose sequences with correct length
if(length(grep("chr3L_3310914_3311162", name))>0){
<- mapped[mapped@ranges@width==150] # in this lane we got 150nt reads
mapped_correct_length
# ZnT63C AP-1 pos142 library is 250bp instead of 249bp, and therefore the reads start one nt ahead
<- c(mapped_correct_length[c(intersect(intersect(grep("GATAA_|twist|chr|chr3L_3310914_3311162_wt|S2171_wt", as.character(mapped_correct_length@seqnames)), grep(100, mapped_correct_length@ranges@start)), grep("-", as.character(mapped_correct_length@strand), fixed = T)),
mapped_correct_length intersect(intersect(grep("AP1", as.character(mapped_correct_length@seqnames)), grep(101, mapped_correct_length@ranges@start)), grep("-", as.character(mapped_correct_length@strand), fixed = T)))],
@strand =="+" & mapped_correct_length@ranges@start==1]) # also keep reads that map to pos strand of index in first nt
mapped_correct_length[mapped_correct_length
}
# choose sequences with correct strand
<- as.numeric(c(intersect(grep("GATAA_", as.character(mapped_correct_length@seqnames)), grep("+", as.character(mapped_correct_length@strand), fixed = T)),
rm intersect(grep("AP1", as.character(mapped_correct_length@seqnames)), grep("+", as.character(mapped_correct_length@strand), fixed = T)),
intersect(grep("twist", as.character(mapped_correct_length@seqnames)), grep("+", as.character(mapped_correct_length@strand), fixed = T)),
intersect(grep("CACA_", as.character(mapped_correct_length@seqnames)), grep("+", as.character(mapped_correct_length@strand), fixed = T)),
intersect(grep("GAGA_", as.character(mapped_correct_length@seqnames)), grep("+", as.character(mapped_correct_length@strand), fixed = T))
))if(length(rm)>0) mapped_correct_length <- mapped_correct_length[-rm]
# add strand to variant name
# but strand should be the opposite, because this is the mapping for reverse read
<- as.data.frame(mapped_correct_length)
mapped_correct_length $ID <- paste0(mapped_correct_length$seqnames, "_", ifelse(mapped_correct_length$strand=="+", "-", "+"))
mapped_correct_length
# merge results
<- merge(final_metadata, as.data.frame(table(mapped_correct_length$ID)), by=1, all.x=T)
final_metadata $Freq[is.na(final_metadata$Freq)] <- 0
final_metadatanames(final_metadata)[ncol(final_metadata)] <- paste0(name, "_", type)
print(name)
}
$library <- factor(final_metadata$library, levels = c("S2171_wt", "chr3L_3310914_3311162_wt", "Wt", "GATAA_1st_S2171", "CACA_S2171", "GAGA_S2171", "GATAA_2nd_S2171", "twist_S2171", "GATAA_chr3L_3310914_3311162", "twist_chr3L_3310914_3311162", "AP1_chr3L_3310914_3311162"),
final_metadatalabels = c("S2171_wt", "chr3L_3310914_3311162_wt", "Wt", "ced6_pos110", "ced6_pos182", "ced6_pos230", "ced6_pos241", "twist_S2171", "ZnT63C_pos210", "ZnT63C_pos180", "ZnT63C_pos142"))
# Remove neg orientation of wt S2171 and chr3L_3310914_3311162 oligos - because in the respective screens with my mapping strategy (36bp + 150bp) I cannot uniquely identify the reverse orientation of the oligo, because it multimapps to the variant reverse orientation sequences - therefore the 0s are not correct and is better to remove it
# actually for chr3L_3310914_3311162, the strand is swapped, the overrepresented oligo is the reverse strand
<- final_metadata[-grep("S2171_wt_\\-|chr3L_3310914_3311162_wt_\\-", final_metadata$Variant),]
final_metadata
table(final_metadata$library)
write.table(final_metadata, "Table_enhancer_variant_counts.txt", sep="\t", quote = F, row.names = F)
<- read.delim("Table_enhancer_variant_counts.txt")
metadata <- metadata[!metadata$library %in% "twist_S2171",] # library not used
metadata $library <- factor(metadata$library, levels = c("S2171_wt", "chr3L_3310914_3311162_wt", "Wt", "ced6_pos110", "ced6_pos182", "ced6_pos230", "ced6_pos241", "ZnT63C_pos210", "ZnT63C_pos180", "ZnT63C_pos142"))
metadata
for(i in c("S21717_REEFmix2_input_Rep", "S21717_REEFmix2_Rep", "chr3L_3310914_3311162_REEFmix3_input_Rep", "chr3L_3310914_3311162_REEFmix3_Rep")){
if(length(grep("S21717", i))>0){
<- metadata[!(metadata$Twist_mix=="mix3" | metadata$Variant %in% c("S2171_wt_+", "chr3L_3310914_3311162_wt_+")),]
df if(length(grep(i, names(metadata))) !=2) break
$mean <- rowMeans(df[,grep(i, names(metadata))])
df<- rowMeans(metadata[!metadata$Twist_mix=="mix3",grep(i, names(metadata))])
nr.reads.per.variant names(nr.reads.per.variant) <- metadata$Variant[!metadata$Twist_mix=="mix3"]
}if(length(grep("chr3L_3310914_3311162", i))>0){
<- metadata[!(metadata$Twist_mix=="mix2" | metadata$Variant %in% c("S2171_wt_+", "chr3L_3310914_3311162_wt_+")),]
df if(length(grep(i, names(metadata))) !=2) break
$mean <- rowMeans(df[,grep(i, names(metadata))])
df<- rowMeans(metadata[!metadata$Twist_mix=="mix2",grep(i, names(metadata))])
nr.reads.per.variant names(nr.reads.per.variant) <- metadata$Variant[!metadata$Twist_mix=="mix2"]
}
$library <- droplevels(df$library)
df
# make main plot for inputs to compare the different libraries
<- ggplot(df, aes(x=mean, fill=library)) +
gg geom_density(alpha = 0.5) +
scale_fill_manual("Libraries", values = my_col[levels(df$library)]) +
scale_x_log10("Counts of sequenced fragments", breaks=c(1,median(nr.reads.per.variant),10,100, 1000))+
scale_y_continuous(expand = c(0,0)) +
geom_vline(xintercept = median(nr.reads.per.variant), col = "black", linetype="dashed") +
ggtitle(gsub("_", " ", gsub("_UMI", "", i)),
subtitle = paste0("Nr of UMI counts = ", round(sum(nr.reads.per.variant)/1e6,1), " million",
"\nNr of chr3L_3310914_3311162 wt fragments = ", nr.reads.per.variant[grep("chr3L_3310914_3311162_wt_\\+", names(nr.reads.per.variant))], " (", formatC(nr.reads.per.variant[grep("chr3L_3310914_3311162_wt_\\+", metadata$Variant)]/sum(nr.reads.per.variant)*100, format = "f", digits = 2), "%)",
"\nNr of S2-171 wt fragments = ", nr.reads.per.variant[grep("S2171_wt_\\+", names(nr.reads.per.variant))], " (", formatC(nr.reads.per.variant[grep("S2171_wt_\\+", names(nr.reads.per.variant))]/sum(nr.reads.per.variant)*100, format = "f", digits = 2), "%)",
"\nNr variants found = ", table(nr.reads.per.variant>0)[2], " (", formatC(table(nr.reads.per.variant>0)[2]/length(nr.reads.per.variant)*100, format = "f", digits = 2), "%)",
"\n",
"Median = ", median(nr.reads.per.variant))) +
theme_light(base_size = 15) +
theme(plot.title = element_text(hjust=0.5))
print(gg)
}
# Only ced-6 pos110 GATAA instance
$mean <- rowMeans(metadata[,grep("S21717_REEFmix2_input_Rep._UMI", names(metadata))])
metadata<- metadata[metadata$library %in% c("ced6_pos241"),]
df <- metadata[metadata$library %in% c("ced6_pos241", "S2171_wt"), "mean"]
nr.reads.per.variant names(nr.reads.per.variant) <- metadata$Variant[metadata$library %in% c("ced6_pos241", "S2171_wt")]
hist(df[,"mean"], breaks = 270, las = 1, xlab = "Sequenced fragment count (variants)",
main = paste0("REEF input library - 2nd GATAA"), border = 0.1, col = "gray", xlim=c(0,150))
legend("topright", legend = c(paste0("Nr of reads mapped = ", round(sum(nr.reads.per.variant)/1e6,1), " million"),
paste0("Nr of sequences with wt = ", round(nr.reads.per.variant[grep("S2171_wt_\\+", names(nr.reads.per.variant))],0), " (", formatC(nr.reads.per.variant[grep("S2171_wt_\\+", names(nr.reads.per.variant))]/sum(nr.reads.per.variant)*100, format = "f", digits = 2), "%)"),
paste0("Nr variants found = ", length(nr.reads.per.variant[!nr.reads.per.variant==0]), " (", formatC(length(nr.reads.per.variant[!nr.reads.per.variant==0])/length(nr.reads.per.variant)*100, format = "f", digits = 2), "%)"),
paste0("Nr variants > 5 counts = ", length(nr.reads.per.variant[nr.reads.per.variant>=10]), " (", formatC(length(nr.reads.per.variant[nr.reads.per.variant>=10])/length(nr.reads.per.variant)*100, format = "f", digits = 2), "%)"),
paste0("Mean = ", round(mean(nr.reads.per.variant))),
paste0("Median = ", round(median(nr.reads.per.variant)))),
text.col = c("black", "black", "black", "black", "tomato3", "steelblue3"), bty = "n",
cex=0.85)
abline(v = c(median(nr.reads.per.variant), mean(nr.reads.per.variant)), col = c("steelblue3", "tomato3"), lwd = 1)
Didn’t include plots because they are too heavy
<- read.delim("Table_enhancer_variant_counts.txt")
metadata <- metadata[!metadata$library %in% "twist_S2171",] # library not used
metadata $library <- factor(metadata$library, levels = c("S2171_wt", "chr3L_3310914_3311162_wt", "Wt", "ced6_pos110", "ced6_pos182", "ced6_pos230", "ced6_pos241", "ZnT63C_pos210", "ZnT63C_pos180", "ZnT63C_pos142"))
metadata
# don't plot the WT
$library %in% c("S2171_wt") & metadata$Twist_mix.y %in% "mix3", grep("chr3L_3310914_3311162_REEFmix3", names(metadata))] <- NA
metadata[metadata
# plots
= list()
plot_list_tmp
<- metadata[,grep("Rep._UMI", names(metadata))]
Oligo_counts
# normalise to 1 million mapped fragments
<- as.data.frame(apply(Oligo_counts, 2, function(x) (x+1)/sum(x, na.rm=T)*1e6))
Counts_per_million_cpm
for(id in unique(substr(names(metadata)[grep("UMI", names(metadata))], 1, nchar(names(metadata)[grep("UMI", names(metadata))])-5))){
<- names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[1]]
a <- names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[2]]
b
if(length(grep("S21717", id))>0){
<- Counts_per_million_cpm[!(metadata$Twist_mix=="mix3"),grep(id, names(Counts_per_million_cpm))]
df_tmp
}if(length(grep("chr3L_3310914_3311162", id))>0){
<- Counts_per_million_cpm[!(metadata$Twist_mix=="mix2"),grep(id, names(Counts_per_million_cpm))]
df_tmp
}
# PCC - not using zeros
<- cor.test(log10(df_tmp[apply(df_tmp,1,min)>0,a]),
pc log10(df_tmp[apply(df_tmp,1,min)>0,b]),
method = "pearson")
if(length(grep("input", id))>0) sc_col=c("#d9d9d9","#525252") else{sc_col=c("#fdd49e","#d7301f")}
# plot
<- ggplot(df_tmp, aes(df_tmp[,a], df_tmp[,b])) +
scater geom_pointdensity(adjust = 0.7, size=0.6) +
scale_color_gradient(low = sc_col[1], high = sc_col[2]) +
scale_x_log10("Normalized UMI counts - rep 1",
limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)),
breaks=c(0,1,10,100,1000)) +
scale_y_log10("Normalized UMI counts - rep 2",
limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)),
breaks=c(0,1,10,100,1000)) +
guides(color=F) +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5)) +
ggtitle(gsub("_", " ", gsub("_Rep", "", id))) +
annotate("text", x=min(df_tmp[df_tmp!=0], na.rm=T), y = max(df_tmp, na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
= ggplotGrob(scater)
plot_list_tmp[[id]]
}
# multiplot
print(gridExtra::grid.arrange(grobs = plot_list_tmp[1:2], ncol = 2))
print(gridExtra::grid.arrange(grobs = plot_list_tmp[3:4], ncol = 2))
# Only ced-6 pos241 GATAA instance
<- metadata[metadata$library %in% c("ced6_pos241", "S2171_wt"),]
metadata
= list()
plot_list_tmp
<- metadata[,grep("Rep._UMI", names(metadata))]
Oligo_counts
# normalise to 1 million mapped fragments
<- as.data.frame(apply(Oligo_counts, 2, function(x) (x+1)/sum(x, na.rm=T)*1e6))
Counts_per_million_cpm
for(id in c("S21717_REEFmix2_input_Rep", "S21717_REEFmix2_Rep")){
<- names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[1]]
a <- names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[2]]
b
if(length(grep("S21717", id))>0){
<- Counts_per_million_cpm[!(metadata$Twist_mix=="mix3"),grep(id, names(Counts_per_million_cpm))]
df_tmp
}if(length(grep("chr3L_3310914_3311162", id))>0){
<- Counts_per_million_cpm[!(metadata$Twist_mix=="mix2"),grep(id, names(Counts_per_million_cpm))]
df_tmp
}
# PCC - not using zeros
<- cor.test(log10(df_tmp[apply(df_tmp,1,min)>0,a]),
pc log10(df_tmp[apply(df_tmp,1,min)>0,b]),
method = "pearson")
if(length(grep("input", id))>0) sc_col=c("#d9d9d9","#525252") else{sc_col=c("#fdd49e","#d7301f")}
# plot
<- ggplot(df_tmp, aes(df_tmp[,a], df_tmp[,b])) +
scater geom_pointdensity(adjust = 0.7, size=0.6) +
scale_color_gradient(low = sc_col[1], high = sc_col[2]) +
scale_x_log10("Normalized UMI counts - rep 1",
limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)),
breaks=c(0,1,10,100,1000)) +
scale_y_log10("Normalized UMI counts - rep 2",
limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)),
breaks=c(0,1,10,100,1000)) +
guides(color=F) +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5)) +
ggtitle(gsub("_", " ", gsub("_Rep", "", id))) +
annotate("text", x=min(df_tmp[df_tmp!=0], na.rm=T), y = max(df_tmp, na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
= ggplotGrob(scater)
plot_list_tmp[[id]]
}
# multiplot
print(gridExtra::grid.arrange(grobs = plot_list_tmp[1:2], ncol = 2))
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## S21717_REEFmix2_input_Rep 1 (1-1,1-1) arrange gtable[layout]
## S21717_REEFmix2_Rep 2 (1-1,2-2) arrange gtable[layout]
<- read.delim("Table_enhancer_variant_counts.txt", stringsAsFactors = F)
Count_table_final rownames(Count_table_final) <- Count_table_final$Variant
table(Count_table_final$library)
library(BiocParallel, lib.loc = "/software/2020/software/r-bundle-bioconductor/3.8-foss-2018b-r-3.5.1")
library(DESeq2)
# loop per screen
<- lapply(c(S21717_REEFmix2="S21717_REEFmix2", chr3L_3310914_3311162_REEFmix3="chr3L_3310914_3311162_REEFmix3"), function(e){
DESeq_results
if(length(grep("S21717", e))>0){
<- Count_table_final[!Count_table_final$Twist_mix=="mix3",c(3, grep(e, names(Count_table_final)))]
Count_table
}if(length(grep("chr3L_3310914_3311162", e))>0){
<- Count_table_final[!Count_table_final$Twist_mix=="mix2",c(3, grep(e, names(Count_table_final)))]
Count_table
}
table(Count_table$library)
# only sequences with at least 5 reads in both inputs
<- Count_table[rowSums(Count_table[,grep("input_", names(Count_table))]<5)==0,]
Count_table_2 table(Count_table_2$library)
# only sequences with at least 1 read in both STARR-seq reps (no need to handle zeros)
<- Count_table_2[rowSums(Count_table_2[,grep("REEFmix._R", names(Count_table_2))]<1)==0,]
Count_table_2 table(Count_table_2$library)
# count table
<- Count_table_2[,grep(e, names(Count_table_2), ignore.case = T)]
cts
# design
<- data.frame(type=factor(rep(c("Input", "Experiment"),each=2),levels=c("Input", "Experiment")),
coldata row.names = names(cts))
if (!identical(which(coldata$type=="Input"), grep("input", rownames(coldata)))){
print("Input in design matrix does not match input samples")
break
}if (!all(rownames(coldata) %in% colnames(cts))){
print("Rownames do not match colnames")
break
}if (!all(rownames(coldata) == colnames(cts))){
print("Rownames do not match colnames")
break
}
<- DESeqDataSetFromMatrix(countData = as.matrix(cts),
dds colData = coldata,
design= ~ type)
# use controls as sizeFactors - assumption that controls don't work
sizeFactors(dds)=estimateSizeFactorsForMatrix(as.matrix(cts[grep("NegativeRegions", rownames(cts)),]))
<- DESeq(dds)
dds
# plots quality control
plotDispEsts(dds)
# plot normal FC
<- results(dds, alpha=0.05)
res summary(res)
::plotMA(res)
DESeq2mcols(res)$description
# merge with main table
<- as.data.frame(res)[,c(1,2,5,6)]
tmp names(tmp) <- paste0(e,"_",names(tmp))
return(tmp)
})
<- merge(Count_table_final, DESeq_results$S21717_REEFmix2, by.x=1, by.y=0, all.x=T)
Count_table_final <- merge(Count_table_final, DESeq_results$chr3L_3310914_3311162_REEFmix3, by.x=1, by.y=0, all.x=T)
Count_table_final
### find oligo with activity closer to the wt enhancer chr3L_3310914_3311162 in the twist wt library - to serve as a reference since the wt activity is biased due to the over-representation
<- Count_table_final[order(abs(Count_table_final$S21717_REEFmix2_log2FoldChange[Count_table_final$Variant %in% "chr3L_3310914_3311162_wt_+"]-Count_table_final$S21717_REEFmix2_log2FoldChange)),][,c(1:4,42,46)]
chr3L_3310914_3311162_ref_enh <- chr3L_3310914_3311162_ref_enh[chr3L_3310914_3311162_ref_enh$library %in% c("Wt", "chr3L_3310914_3311162_wt"),]
chr3L_3310914_3311162_ref_enh head(chr3L_3310914_3311162_ref_enh)
<- "chrX_9273894_9274142_+_dCP_-"
chr3L_3310914_3311162_ref_enh $library[grep(chr3L_3310914_3311162_ref_enh, Count_table_final$Variant, fixed = T)] <- "chr3L_3310914_3311162_ref"
Count_table_finaltable(Count_table_final$library)
# library not used
<- Count_table_final[!Count_table_final$library %in% "twist_S2171",]
Count_table_final
# save final table
write.table(Count_table_final, "Table_enhancer_variant_counts_and_enrichment.txt", sep="\t", quote = F, row.names = F)
<- read.delim("Table_enhancer_variant_counts.txt", stringsAsFactors = F)
Count_table_final rownames(Count_table_final) <- Count_table_final$Variant
table(Count_table_final$library)
require(DESeq2)
# loop per screen
<- lapply(c(S21717_REEFmix2="S21717_REEFmix2", chr3L_3310914_3311162_REEFmix3="chr3L_3310914_3311162_REEFmix3"), function(e){
DESeq_results <- lapply(c(rep1="_Rep1", rep2="_Rep2"), function(r){
tmp_list
if(length(grep("S21717", e))>0){
<- Count_table_final[!Count_table_final$Twist_mix=="mix3",c(3, grep(e, names(Count_table_final)))]
Count_table
}if(length(grep("chr3L_3310914_3311162", e))>0){
<- Count_table_final[!Count_table_final$Twist_mix=="mix2",c(3, grep(e, names(Count_table_final)))]
Count_table
}
table(Count_table$library)
# only sequences with at least 5 reads in both inputs
<- Count_table[rowSums(Count_table[,grep("input_", names(Count_table))]<5)==0,]
Count_table_2 table(Count_table_2$library)
# only sequences with at least 1 read in both STARR-seq reps (no need to handle zeros)
<- Count_table_2[rowSums(Count_table_2[,grep("REEFmix._R", names(Count_table_2))]<1)==0,]
Count_table_2 table(Count_table_2$library)
# count table - all inputs + specific RNA replicate
<- Count_table_2[,c(2,3,grep(paste0(e, r), names(Count_table_2), ignore.case = T))]
cts
# design
<- data.frame(type=factor(c("Input", "Input", "Experiment"),levels=c("Input", "Experiment")),
coldata row.names = names(cts))
if (!identical(which(coldata$type=="Input"), grep("input", rownames(coldata)))){
print("Input in design matrix does not match input samples")
break
}if (!all(rownames(coldata) %in% colnames(cts))){
print("Rownames do not match colnames")
break
}if (!all(rownames(coldata) == colnames(cts))){
print("Rownames do not match colnames")
break
}
<- DESeqDataSetFromMatrix(countData = as.matrix(cts),
dds colData = coldata,
design= ~ type)
# use controls as sizeFactors - assumption that controls don't work
sizeFactors(dds)=estimateSizeFactorsForMatrix(as.matrix(cts[grep("NegativeRegions", rownames(cts)),]))
<- DESeq(dds)
dds #resultsNames(dds) # lists the coefficients
# plots quality control
plotDispEsts(dds)
# plot normal FC
<- results(dds, alpha=0.05)
res summary(res)
::plotMA(res)
DESeq2mcols(res)$description
# merge with main table
<- as.data.frame(res)[,c(1,2,5,6)]
tmp names(tmp) <- paste0(e,r,"_",names(tmp))
return(tmp)
})return(tmp_list)
})
<- merge(Count_table_final, DESeq_results$S21717_REEFmix2$rep1[,c(2,4)], by.x=1, by.y=0, all.x=T)
Count_table_final <- merge(Count_table_final, DESeq_results$S21717_REEFmix2$rep2[,c(2,4)], by.x=1, by.y=0, all.x=T)
Count_table_final <- merge(Count_table_final, DESeq_results$chr3L_3310914_3311162_REEFmix3$rep1[,c(2,4)], by.x=1, by.y=0, all.x=T)
Count_table_final <- merge(Count_table_final, DESeq_results$chr3L_3310914_3311162_REEFmix3$rep2[,c(2,4)], by.x=1, by.y=0, all.x=T)
Count_table_final
# save final table
write.table(Count_table_final, "Table_enhancer_variant_counts_and_enrichment_per_replicate.txt", sep="\t", quote = F, row.names = F)
library(ggpointdensity)
<- read.delim("Table_enhancer_variant_counts_and_enrichment_per_replicate.txt", stringsAsFactors = F)
metadata_final <- metadata_final[!metadata_final$library %in% "twist_S2171",] # library not used
metadata_final
= list()
plot_list_tmp for(e in c(S21717_REEFmix2="S21717_REEFmix2", chr3L_3310914_3311162_REEFmix3="chr3L_3310914_3311162_REEFmix3")){
<- metadata_final[,grep(paste0(e, "_Rep._log2FoldChange"), names(metadata_final))]
df_tmp
if(e=="S21717_REEFmix2") id="ced-6 enhancer"
if(e=="chr3L_3310914_3311162_REEFmix3") id="ZnT63C enhancer"
# PCC - not using zeros
<- cor.test(df_tmp[,1],
pc 2],
df_tmp[,method = "pearson", use="pairwise.complete.obs")
=c("#fdd49e","#d7301f")
sc_col
# plot
<- ggplot(df_tmp, aes(df_tmp[,1], df_tmp[,2])) +
scater geom_pointdensity(adjust = 0.7, size=0.6) +
scale_color_gradient(low = sc_col[1], high = sc_col[2]) +
scale_x_continuous("Enhancer activity - rep 1") +
scale_y_continuous("Enhancer activity - rep 2") +
guides(color=F) +
geom_abline(linetype="dashed", col="grey30") +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5)) +
ggtitle(id) +
annotate("text", x=min(df_tmp[,1], na.rm=T), y = max(df_tmp[,2], na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
= ggplotGrob(scater)
plot_list_tmp[[e]]
}
# multiplot
print(gridExtra::grid.arrange(grobs = plot_list_tmp, ncol = 2))
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## S21717_REEFmix2 1 (1-1,1-1) arrange gtable[layout]
## chr3L_3310914_3311162_REEFmix3 2 (1-1,2-2) arrange gtable[layout]
# Only ced-6 pos241 GATAA instance
<- metadata_final[metadata_final$library %in% c("ced6_pos241", "S2171_wt"),]
df_tmp
<- cor.test(df_tmp$S21717_REEFmix2_Rep1_log2FoldChange,
pc $S21717_REEFmix2_Rep2_log2FoldChange,
df_tmpmethod = "pearson", use="pairwise.complete.obs")
=c("#fdd49e","#d7301f")
sc_col
# plot
<- ggplot(df_tmp, aes(S21717_REEFmix2_Rep1_log2FoldChange, S21717_REEFmix2_Rep2_log2FoldChange)) +
scater geom_pointdensity(adjust = 0.7, size=0.6) +
scale_color_gradient(low = sc_col[1], high = sc_col[2]) +
scale_x_continuous("Enhancer activity - rep 1") +
scale_y_continuous("Enhancer activity - rep 2") +
guides(color=F) +
geom_abline(linetype="dashed", col="grey30") +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5)) +
ggtitle(id) +
annotate("text", x=min(df_tmp[,41], na.rm=T), y = max(df_tmp[,43], na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
print(scater)
<- read.delim("Table_enhancer_variant_counts_and_enrichment.txt")
Count_table_final <- Count_table_final[Count_table_final$library %in% c("ced6_pos241", "S2171_wt"),c(1:4,grep("GATAA_2nd_S2171", names(Count_table_final)), grep("S21717_REEFmix2", names(Count_table_final)))]
tmp_main <- tmp_main[complete.cases(tmp_main$S21717_REEFmix2_log2FoldChange),]
tmp_main <- tmp_main[order(tmp_main$S21717_REEFmix2_log2FoldChange, decreasing = T),]
tmp <- data.frame(x=1:nrow(tmp)/1000, y=2**tmp$S21717_REEFmix2_log2FoldChange, y_log=tmp$S21717_REEFmix2_log2FoldChange,
tmp variant=as.character(tmp$Variant),
variant_seq=sapply(strsplit(as.character(tmp$Variant),"_"), `[`, 3),
stringsAsFactors = F)
# select variants to plot
<- tmp[tmp$variant %in% c("S2171_wt_+",
tmp_sel $variant[1],
tmp$variant[grep("GATAA", tmp$variant_seq)][1],
tmp$variant[grep("TTATC", tmp$variant_seq)][1]),]
tmp$variant_seq[grep("S2171_wt_+", tmp_sel$variant)] <- "wt (CTTATCGC)"
tmp_sel
<- ggplot(tmp, aes(x,y)) +
gg geom_line() +
scale_y_continuous("Enhancer activity\n(Normalized RNA levels)", breaks = seq(0,150,30)) +
scale_x_continuous("Enhancer variants (x 10e3)", breaks = seq(0,60,10)) +
ggtitle("S2-171 enhancer - ced6 pos241 GATAA position") +
theme(plot.title = element_text(hjust=0.5)) +
geom_point(data=tmp_sel, aes(x,y), color=c("grey20", my_motif_col["GATAA"], my_motif_col["GATAA"], my_motif_col["wt"]), size=3) +
annotate(geom="text", x=tmp_sel$x+1.5, y=tmp_sel$y,
label=tmp_sel$variant_seq,
color=c("grey20", my_motif_col["GATAA"], my_motif_col["GATAA"], my_motif_col["wt"]),
hjust=0, vjust=0.5, size = 3.5)
print(gg)
### numbers of variants
length(tmp_main$Variant[tmp_main$S21717_REEFmix2_log2FoldChange>tmp_main$S21717_REEFmix2_log2FoldChange[grep("S2171_wt", tmp_main$library)]])
## [1] 600
# 600 over WT
# +/- 10% linear scale
<- 2**tmp_main$S21717_REEFmix2_log2FoldChange[grep("S2171_wt", tmp_main$library)]
wt_act length(tmp_main$Variant[(2**tmp_main$S21717_REEFmix2_log2FoldChange) >= (wt_act - wt_act*0.1) &
2**tmp_main$S21717_REEFmix2_log2FoldChange) <= (wt_act + wt_act*0.1)]) -1 # minus wt (
## [1] 374
# 374
<- read.delim("Table_enhancer_variant_counts_and_enrichment.txt")
Count_table_final <- Count_table_final[Count_table_final$library %in% c("ced6_pos241", "S2171_wt"),c(1:5,grep("GATAA_2nd_S2171", names(Count_table_final)), grep("S21717_REEFmix2", names(Count_table_final)))]
metadata_final <- metadata_final[complete.cases(metadata_final$S21717_REEFmix2_log2FoldChange),]
metadata_final
library(ggseqlogo)
<- function(x, method='bits'){
plot.logo ggseqlogo(x, method=method, seq_type='dna', ncol=1) +
theme(panel.background = element_rect(fill="white",colour="white"),
panel.grid = element_blank(), axis.line=element_blank(),
axis.text = element_blank(), axis.ticks = element_blank(),
axis.title = element_blank(),
legend.key=element_blank(),
legend.text = element_text(size=11, colour="black"),
legend.title = element_text(size=12, colour="black"),
plot.title = element_text(size=14, hjust = 0.5, colour="black"), plot.subtitle = element_text(size=14, hjust = 0.5),
legend.position = "right")
}
<- c(1,2,5,10,50,100,1000, nrow(metadata_final))
seq_counts names(seq_counts) <- seq_counts
<- lapply(c(IC='bits', Prob='p'), function(m){
logo_plots_all <- lapply(c(Random="Random", Experiment="Experiment"), function(c){
logo_plots if(c=="Random") tmp <- metadata_final[sample(1:nrow(metadata_final)),]
if(c=="Experiment") tmp <- metadata_final[order(metadata_final$S21717_REEFmix2_log2FoldChange, decreasing = T),]
lapply(seq_counts, function(i){
<- tmp[1:i,]
tmp2
# get nucleotide at each position
<- do.call(rbind, lapply(1:16, function(x) data.frame(Pos=x,
tmp2 Base=substr(tmp2$GATAA_2nd_S2171_sequence_fw_extended4bp, x, x))))
<- apply(table(tmp2$Base, tmp2$Pos), 2, function(x) x/sum(x))
prop_matrix
plot.logo(prop_matrix, method=m)
})
})
})
# plot logos
library(cowplot)
plot_grid(plot_grid(plotlist = logo_plots_all$IC$Experiment, ncol=1),
plot_grid(plotlist = logo_plots_all$IC$Random, ncol=1))
plot_grid(plot_grid(plotlist = logo_plots_all$Prob$Experiment, ncol=1),
plot_grid(plotlist = logo_plots_all$Prob$Random, ncol=1))
### statistics with information content
# The information content (IC) of a PWM says how different a given PWM is from a uniform distribution.
<- function(C) log2(length(C))
tIC <- function(C) C / sum(C)
PPM <- function(C) -sum(PPM(C) * log2(PPM(C)))
U <- function(C) tIC(C) - U(C)
fIC <- function(C){
IC # add pseudocounts
==0] <- 0.001
C[CPPM(C) * fIC(C)
}
set.seed(1234)
<- lapply(c(Random="Random", Experiment="Experiment"), function(c){
statistics_logo_plots if(c=="Random") tmp <- metadata_final[sample(1:nrow(metadata_final)),]
if(c=="Experiment") tmp <- metadata_final[order(metadata_final$S21717_REEFmix2_log2FoldChange, decreasing = T),]
<- sapply(c(1:1000, 10000, 50000, nrow(metadata_final)), function(i){
v_all <- tmp[1:i,]
tmp2
# get nucleotide at each position
<- do.call(rbind, lapply(1:8, function(x) data.frame(Pos=x,
tmp2 Base=substr(tmp2$GATAA_2nd_S2171_sequence_fw_extended4bp, x, x))))
$Base <- factor(tmp2$Base, levels = c("A", "C", "G", "T"))
tmp2
<- apply(table(tmp2$Base, tmp2$Pos), 2, function(x) x)
df
<- sum(colSums(sapply(1:ncol(df), function(x){
v IC(df[,x])
})))
return(v)
})return(v_all)
})
=300
n<- data.frame(Group=rep(c("Random", "STARR-seq"), each=n),
plot_df Top_sequences=rep(1:n, 2),
IC=c(statistics_logo_plots$Random[1:n], statistics_logo_plots$Experiment[1:n]))
<- ggplot(plot_df, aes(Top_sequences, IC, colour=Group)) +
IC_plot geom_point() +
geom_line(alpha=0.5) +
scale_y_continuous("8-mer sum Information Content") +
scale_x_continuous("Number of top sequences considered", breaks = seq(0,1000,50)) +
scale_color_manual("", values=c("STARR-seq"="orangered", Random="grey60")) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(hjust = 0.5),
axis.text = element_text(size = 13, colour="black"),
axis.title = element_text(size = 15),
legend.text = element_text(size = 14)
)
print(IC_plot)
<- ggplot(tmp, aes("1",y_log)) +
boxplot geom_violin(alpha=0.5, position = position_dodge(width = 0.9), size=0.7, color="black", fill="grey60") +
geom_boxplot(outlier.size = -1, color="black", position = position_dodge(width = 0.9), width=0.2, size=0.8) +
scale_y_continuous("Enhancer activity (log2)", breaks = seq(-2,8,1), labels = c("1/4", "1/2", 2**seq(0,8,1)), limits = c(-1.65,7.5)) +
scale_x_discrete("", labels=paste0("All variants\n(n=", nrow(tmp), ")")) +
#geom_hline(yintercept = 0, linetype="dashed", col="grey60") + # the 0 is not relevant so don't print the line
theme_bw(base_size = 14) +
theme(axis.text = element_text(colour="black"),
axis.title = element_text(colour="black"),
axis.line = element_blank(),
panel.border = element_rect(colour="black"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour="grey95", size=0.4),
axis.ticks = element_line(colour="black"),
plot.title = element_text(hjust=0.5)
)
<- boxplot +
boxplot_final geom_point(data=tmp_sel, aes("1",y_log), color=c("grey20", my_motif_col["GATAA"], my_motif_col["GATAA"], my_motif_col["wt"]), size=1.5) +
annotate(geom="text", x="1", y=tmp_sel$y_log,
label=tmp_sel$variant_seq,
color=c("grey20", my_motif_col["GATAA"], my_motif_col["GATAA"], my_motif_col["wt"]),
hjust=-0.15, vjust=0.5, size = 1.85)
print(boxplot_final)
<- boxplot +
boxplot3 geom_point(data=tmp_sel[tmp_sel$variant %in% "S2171_wt_+",], aes("1",y_log), color=c(my_motif_col["wt"]), size=2) +
annotate(geom="text", x="1", y=tmp_sel$y_log[tmp_sel$variant %in% "S2171_wt_+"],
label="wt",
color=my_motif_col["wt"],
hjust=-0.35, vjust=0.5, size = 4)
########
<- c("GATAAG", "TGA.TCA", "CATCTG", "GAGAGA",
motif_interest_list "CCGGAA", "TCACGCGA", "TTCC.GGA",
"TCATCA", #TGATGT
"ATCGAT",
"AGGATAA"
)names(motif_interest_list) <- motif_interest_list
# make table with enrichments per motif
<- lapply(motif_interest_list, function(m){
motif_variant_enrichment_16bp =tmp_main$S21717_REEFmix2_log2FoldChange[grep(m, tmp_main$GATAA_2nd_S2171_sequence_fw_extended4bp)]
fw=tmp_main$GATAA_2nd_S2171_sequence_fw_extended4bp[grep(m, tmp_main$GATAA_2nd_S2171_sequence_fw_extended4bp)]
fw_seqif(!DNAString(m) == reverseComplement(DNAString(m))){
=tmp_main$S21717_REEFmix2_log2FoldChange[grep(m, tmp_main$GATAA_2nd_S2171_sequence_rv_extended4bp)]
rv=tmp_main$GATAA_2nd_S2171_sequence_fw_extended4bp[grep(m, tmp_main$GATAA_2nd_S2171_sequence_rv_extended4bp)]
rv_seqelse{rv <- c(); rv_seq <- c()}
}
data.frame(Motif=m,
Strand=c(rep("fw",length(fw)), rep("rv",length(rv))),
Sequence=c(fw_seq, rv_seq),
Enrichment_log2=c(fw, rv))
})
# plot for both tables
="16bp"
seq
# join all
<- do.call(rbind, motif_variant_enrichment_16bp)
motif_variant_enrichment
# set factor levels
<- motif_variant_enrichment %>%
motif_levels group_by(Motif) %>%
summarise(median=median(Enrichment_log2)) %>%
arrange(desc(median)) %>%
select(Motif)
$Motif <- factor(motif_variant_enrichment$Motif, levels =as.character(motif_levels$Motif))
motif_variant_enrichment$Motif <- relevel(motif_variant_enrichment$Motif, "GATAAG")
motif_variant_enrichment
# x labels counts per boxplot
<- data.frame(table(motif_variant_enrichment$Motif))
t <- sapply(as.character(levels(motif_variant_enrichment$Motif)), function(x){
xlabels =""
TFif(x=="GATAAG") TF="GATA, "
if(x=="TGA.TCA") TF="AP-1, "
if(x=="CATCTG") TF="twist, "
if(x=="GAGAGA") TF="Trl, "
if(x=="CCGGAA") TF="ETS, "
if(x=="TCACGCGA") TF="SREBP, "
if(x=="TTCC.GGA") TF="STAT, "
if(x=="TCATCA") TF="CREB, "
if(x=="ATCGAT") TF="Dref, "
if(x=="AGGATAA") TF="ttk, "
if(x %in% c("All", "Others")){
paste0(x,"\n(", TF, t$Freq[t$Var1%in%x&t$Var2%in%"rv"], ")")}
else{
paste0(x,"\n(",
$Freq[t$Var1%in%x], ")")}
TF, t
})
<- ggplot(motif_variant_enrichment, aes(Motif, Enrichment_log2, fill=Motif, alpha=Strand)) +
boxplot_known_motifs geom_boxplot(outlier.size = 0.6) +
#geom_hline(yintercept = median(metadata_final$log2_ratio_UMI_corrected), linetype="dashed", col="grey40") +
geom_hline(yintercept = tmp_main$S21717_REEFmix2_log2FoldChange[tmp_main$Variant %in% "S2171_wt_+"], linetype="dashed", col="orangered") +
geom_hline(yintercept = median(tmp_main$S21717_REEFmix2_log2FoldChange), linetype="dashed", col="grey60") +
scale_fill_manual(values=my_motif_col) +
scale_alpha_manual(values=c(0.5,0.9)) +
scale_y_continuous("Enhancer activity (log2)", breaks = seq(-2,8,1), labels = c("1/4", "1/2", 2**seq(0,8,1)), limits = c(-1.65,7.5)) +
scale_x_discrete("", labels=xlabels) +
guides(fill=F, alpha=F) +
theme_bw(base_size = 14) +
theme(axis.text = element_text(colour="black"),
axis.text.x = element_text(colour="black", size=9),
axis.title = element_text(colour="black"),
axis.title.y = element_blank(),
axis.line = element_blank(),
panel.border = element_rect(colour="black"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour="grey95", size=0.4),
axis.ticks = element_line(colour="black"),
plot.title = element_text(hjust=0.5)
)
<- ggplot(motif_variant_enrichment, aes(Motif, Enrichment_log2, fill=Motif)) +
boxplot_known_motifs2 geom_boxplot(outlier.size = 0.6) +
#geom_hline(yintercept = median(metadata_final$log2_ratio_UMI_corrected), linetype="dashed", col="grey40") +
geom_hline(yintercept = tmp_main$S21717_REEFmix2_log2FoldChange[tmp_main$Variant %in% "S2171_wt_+"], linetype="dashed", col="orangered") +
geom_hline(yintercept = median(tmp_main$S21717_REEFmix2_log2FoldChange), linetype="dashed", col="grey60") +
scale_fill_manual(values=my_motif_col) +
scale_y_continuous("Enhancer activity (log2)", breaks = seq(-2,8,1), labels = c("1/4", "1/2", 2**seq(0,8,1)), limits = c(-1.65,7.5)) +
scale_x_discrete("", labels=xlabels) +
guides(fill=F, alpha=F) +
theme_bw(base_size = 14) +
theme(axis.text = element_text(colour="black"),
axis.text.x = element_text(colour="black", size=9),
axis.title = element_text(colour="black"),
axis.title.y = element_blank(),
axis.line = element_blank(),
panel.border = element_rect(colour="black"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour="grey95", size=0.4),
axis.ticks = element_line(colour="black"),
plot.title = element_text(hjust=0.5)
)
print(boxplot3 + boxplot_known_motifs2 + plot_layout(widths = c(1,8)))
print(boxplot3 + boxplot_known_motifs + plot_layout(widths = c(1,8)))
# test significance of differences between orienations
# AP1 (TGA.TCA) and Dref (ATCGAT) are palindromic and have now reverse complement
unlist(sapply(levels(motif_variant_enrichment$Motif), function(m){
if(!m %in% c("TGA.TCA", "ATCGAT")) wilcox.test(motif_variant_enrichment$Enrichment_log2[motif_variant_enrichment$Motif==m & motif_variant_enrichment$Strand=="fw"],
$Enrichment_log2[motif_variant_enrichment$Motif==m & motif_variant_enrichment$Strand=="rv"])$p.value
motif_variant_enrichment }))
## GATAAG TCACGCGA CATCTG TCATCA TTCC.GGA GAGAGA CCGGAA AGGATAA
## 0.4881886 0.7304401 0.7489610 0.7906186 0.6968224 0.1191006 0.9858273 0.8582189
<- list(GATA=data.frame(Motif="GATAAG", ID="flyfactorsurvey__GATAd_SANGER_5_FBgn0032223"),
TF_motifs AP1=data.frame(Motif="TGA.TCA", ID="flyfactorsurvey__kay_Jra_SANGER_5_FBgn0001291"),
twist=data.frame(Motif="CATCTG", ID="flyfactorsurvey__twi_da_SANGER_5_FBgn0000413"),
Trl=data.frame(Motif="GAGAGA", ID="homer__CTYTCTYTCTCTCTC_GAGA-repeat"),
ETS=data.frame(Motif="CCGGAA", ID="flyfactorsurvey__Ets97D_SANGER_10_FBgn0004510"),
SREBP=data.frame(Motif="TCACGCGA", ID="flyfactorsurvey__HLH106_SANGER_10_FBgn0015234"),
STAT=data.frame(Motif="TTCC.GGA", ID="stark__TTCCSGGAA"), # since bergman__Stat92E was longer than the variants sequence
CREB=data.frame(Motif="TCATCA", ID="cisbp__M0295"),
Dref=data.frame(Motif="ATCGAT", ID="bergman__Dref"),
ttk=data.frame(Motif="AGGATAA", ID="flyfactorsurvey__ttk_NAR_FBgn0003870")
)<- do.call(rbind, TF_motifs)
TF_motifs
# get motif scores
require(motifmatchr)
require(TFBSTools)
load("data/TF_clusters_PWMs.RData") # load motifs
<- matchMotifs(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% TF_motifs$ID], # no ttk
scores $Sequence,
motif_variant_enrichmentgenome = "BSgenome.Dmelanogaster.UCSC.dm3", p.cutoff = 1, bg="genome", out = "scores") # no cutoff to get all scores
<- as.data.frame(as.matrix(motifmatchr::motifScores(scores)))
out names(out) <- name(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% TF_motifs$ID])
<- cbind(motif_variant_enrichment, out)
motif_variant_enrichment_2
<- sapply(levels(motif_variant_enrichment_2$Motif), function(m){
Corr_scores cor(motif_variant_enrichment_2$Enrichment_log2[motif_variant_enrichment_2$Motif==m],
$Motif==m, TF_motifs$ID[TF_motifs$Motif==m]])
motif_variant_enrichment_2[motif_variant_enrichment_2
})
print(Corr_scores)
## GATAAG TGA.TCA TCACGCGA CATCTG TCATCA TTCC.GGA
## 0.27326923 0.57669744 0.75118949 0.04499555 0.23829398 0.52075684
## GAGAGA CCGGAA ATCGAT AGGATAA
## 0.20628973 0.07917095 0.13069675 -0.57051310
<- data.frame(Motif = factor(names(Corr_scores), levels =as.character(motif_levels$Motif)),
gg Scores = Corr_scores) %>%
ggplot(aes(Motif, Scores, fill=Motif)) +
geom_bar(stat="identity", width=0.7, col="black") +
scale_x_discrete("Pasted motif") +
scale_y_continuous("Correlation activity - motif scores", breaks = seq(-1,1,0.2)) +
scale_fill_manual(values=my_motif_col) +
guides(fill="none", col="none") +
scale_x_discrete("", labels=xlabels, limits=levels(motif_variant_enrichment$Motif)) +
theme_bw(base_size = 14) +
theme(axis.text = element_text(colour="black"),
axis.text.x = element_text(colour="black", size=9),
axis.title = element_text(colour="black"),
axis.line = element_blank(),
panel.border = element_rect(colour="black"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour="grey95", size=0.4),
axis.ticks = element_line(colour="black"),
plot.title = element_text(hjust=0.5)
)
print(gg)
# do the variants with high activity match to different motifs?
<- tmp_main[tmp_main$S21717_REEFmix2_log2FoldChange>tmp_main$S21717_REEFmix2_log2FoldChange[grep("S2171_wt", tmp_main$library)],]
top_sequences rownames(top_sequences) <- top_sequences$Variant
# 600 over WT
### All enriched motifs (FDR<0.05)
<- readRDS("data/PWM_candidates.rds")
PWM_candidates <- PWM_candidates[!PWM_candidates$X..motif_collection_name %in% "modisco",]
PWM_candidates <- PWM_candidates[PWM_candidates$FDR_motif_enr<0.05 & PWM_candidates$log2_motif_enr>0,]
PWM_candidates
require(motifmatchr)
require(TFBSTools)
load("data/TF_clusters_PWMs.RData") # load motifs
# count motifs in all enhancers
=list()
opt$pvalue_cutoff <- c(1e-5, 1e-4)
opt$genome="BSgenome.Dmelanogaster.UCSC.dm3"
opt
# motif counts
# for motif enrichment is is fine to do not reduce, because it will just be present vs non-present
<- list()
motif_counts_all for(p in opt$pvalue_cutoff){
# motif counts
<- matchMotifs(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% PWM_candidates$motif_name],
motif_ix_scores as.character(top_sequences$GATAA_2nd_S2171_sequence_fw_extended4bp),
genome = opt$genome, p.cutoff = p, bg="genome", out = "scores")
<- as.data.frame(as.matrix(motifCounts(motif_ix_scores)))
scores names(scores) <- name(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% PWM_candidates$motif_name])
<- cbind(top_sequences, scores)
motif_counts
paste0("pvalue_", p)]] <- motif_counts
motif_counts_all[[print(p)
}
## [1] 1e-05
## [1] 1e-04
# sequences not explained
table(rowSums(motif_counts_all$`pvalue_1e-04`[,18:ncol(motif_counts_all$`pvalue_1e-04`)])>0)
##
## FALSE TRUE
## 38 562
table(rowSums(motif_counts_all$`pvalue_1e-05`[,18:ncol(motif_counts_all$`pvalue_1e-05`)])>0)
##
## FALSE TRUE
## 181 419
# sequences explained by main motifs, others, multiple, or none
<- list(GATA=data.frame(Motif="GATA", ID="flyfactorsurvey__srp_SANGER_5_FBgn0003507"),
TF_motifs GATA=data.frame(Motif="GATA", ID="flyfactorsurvey__GATAd_SANGER_5_FBgn0032223"),
GATA=data.frame(Motif="GATA", ID=as.character(PWM_candidates$motif_name[grep("srp|gatad", PWM_candidates$motif_description2, ignore.case = T)])),
AP1=data.frame(Motif="AP-1", ID="flyfactorsurvey__kay_Jra_SANGER_5_FBgn0001291"),
AP1=data.frame(Motif="AP-1", ID=as.character(PWM_candidates$motif_name[grep("jra|kay|fos|jun", PWM_candidates$motif_description2, ignore.case = T)])),
twist=data.frame(Motif="twist", ID="flyfactorsurvey__twi_da_SANGER_5_FBgn0000413"),
twist=data.frame(Motif="twist", ID=as.character(PWM_candidates$motif_name[grep("twi", PWM_candidates$motif_description2, ignore.case = T)])),
Trl=data.frame(Motif="Trl", ID="flyfactorsurvey__Trl_FlyReg_FBgn0013263"),
Trl=data.frame(Motif="Trl", ID=as.character(PWM_candidates$motif_name[grep("trl|gaga", PWM_candidates$motif_description2, ignore.case = T)])),
ETS=data.frame(Motif="ETS", ID="flyfactorsurvey__Ets97D_SANGER_10_FBgn0004510"),
ETS=data.frame(Motif="ETS", ID=as.character(PWM_candidates$motif_name[grep("ETS", PWM_candidates$motif_description2, ignore.case = T)])),
SREBP=data.frame(Motif="SREBP", ID="flyfactorsurvey__HLH106_SANGER_10_FBgn0015234"),
SREBP=data.frame(Motif="SREBP", ID=as.character(PWM_candidates$motif_name[grep("SREBP|HLH106", PWM_candidates$motif_description2, ignore.case = T)])),
STAT=data.frame(Motif="STAT", ID="bergman__Stat92E"),
STAT=data.frame(Motif="STAT", ID=as.character(PWM_candidates$motif_name[grep("stat", PWM_candidates$motif_description2, ignore.case = T)])),
CREB=data.frame(Motif="CREB/ATF", ID="cisbp__M0295"),
CREB=data.frame(Motif="CREB/ATF", ID=as.character(PWM_candidates$motif_name[grep("creb", PWM_candidates$motif_description2, ignore.case = T)])
)
)<- do.call(rbind, TF_motifs)
TF_motifs $Motif <- factor(TF_motifs$Motif)
TF_motifs
<- lapply(motif_counts_all, function(p){
Sequences_motifs_16bp
<- data.frame(sapply(levels(TF_motifs$Motif), function(m){
candidates_variant_enrichment return(rowSums(p[,as.character(TF_motifs$ID[TF_motifs$Motif==m])])>0)
}))"Other motifs"] <- rowSums(p[,17:ncol(p)])>0
candidates_variant_enrichment[,colSums(candidates_variant_enrichment)
return(candidates_variant_enrichment)
})colSums(Sequences_motifs_16bp[[1]])
## AP.1 CREB.ATF ETS GATA SREBP STAT
## 98 5 0 23 51 24
## Trl twist Other motifs
## 0 2 419
colSums(Sequences_motifs_16bp[[2]])
## AP.1 CREB.ATF ETS GATA SREBP STAT
## 172 30 7 49 92 39
## Trl twist Other motifs
## 0 2 562
table(rowSums(Sequences_motifs_16bp[[1]][,1:8]))
##
## 0 1 2
## 399 199 2
table(rowSums(Sequences_motifs_16bp[[2]][,1:8]))
##
## 0 1 2 3
## 242 326 31 1
# which variants
<- lapply(Sequences_motifs_16bp, function(i){
all_var_list "Group"] <- apply(i, 1, function(x){
i[,if(sum(x[1:8]==TRUE)>1){"Multiple"
else if(sum(x[1:8]==TRUE)==1){paste0(colnames(i)[1:8][x[1:8]==TRUE], collapse = "_")
}else if(sum(x[1:8]==TRUE)==0 & x["Other motifs"]==TRUE){paste0(colnames(i)[x==TRUE], collapse = "_")
}else{"No motifs"}
}
})<- table(i[,"Group"])
all_var names(all_var)[names(all_var)=="AP.1"] <- "AP-1"
names(all_var)[names(all_var)=="CREB.ATF"] <- "CREB/ATF"
<- data.frame(all_var)
df $Var1 <- factor(df$Var1, levels=rev(c("No motifs", "Other motifs", "Multiple", rev(sort(levels(TF_motifs$Motif))))))
df
df
})
<- ggplot() +
gg_bar1 geom_bar(data=all_var_list[[1]], aes("1e-05", Freq, fill=Var1), position="stack", stat="identity", width=0.8) +
geom_bar(data=all_var_list[[2]], aes("1e-04", Freq, fill=Var1), position="stack", stat="identity", width=0.8) +
xlab("PWM cutoff") +
scale_y_continuous("# of variants", expand = c(0,0)) +
scale_x_discrete(limits=c("1e-05", "1e-04")) +
scale_fill_manual("Motifs", values=c(my_motif_col, "No motifs"="grey90", "Other motifs"="#b3cccc", Multiple="#FF9999"),
limits = levels(all_var_list[[2]]$Var1))
<- ggplot() +
gg_bar2 geom_bar(data=all_var_list[[2]], aes("1e-04", Freq, fill=Var1), position="stack", stat="identity", width=0.8) +
xlab("PWM cutoff") +
scale_y_continuous("# of variants", expand = c(0,0)) +
scale_fill_manual("Motifs", values=c(my_motif_col, "No motifs"="grey90", "Other motifs"="#b3cccc", Multiple="#FF9999"),
limits = levels(all_var_list[[2]]$Var1))
print(gg_bar2)
print(gg_bar1)
<- read.delim("Table_enhancer_variant_counts_and_enrichment.txt", stringsAsFactors = F)
metadata_final
$library <- factor(metadata_final$library, levels = c("S2171_wt", "chr3L_3310914_3311162_wt", "chr3L_3310914_3311162_ref", "Wt", "ced6_pos110", "ced6_pos182", "ced6_pos230", "ced6_pos241", "ZnT63C_pos210", "ZnT63C_pos180", "ZnT63C_pos142"),
metadata_finallabels=c("S2171_wt", "chr3L_3310914_3311162_wt", "chr3L_3310914_3311162_ref", "Wt", "pos110", "pos182", "pos230", "pos241", "pos210", "pos180", "pos142"))
# Melt table
<- reshape::melt(metadata_final[,c("Variant", "library", "S21717_REEFmix2_log2FoldChange", "chr3L_3310914_3311162_REEFmix3_log2FoldChange")])
tmp <- tmp[complete.cases(tmp$value),]
tmp $library <- factor(tmp$library, levels=c("S2171_wt", "chr3L_3310914_3311162_wt", "chr3L_3310914_3311162_ref", "Wt", "pos110", "pos182", "pos230", "pos241", "pos142", "pos180", "pos210"))
tmp
# get motif average to choose variants to highlight
<- readRDS("Average_motif_enrichment_log2.rds")
Average_motif_enrichment_log2
<- list()
plot_list for(e in as.character(unique(tmp$variable))){
<- tmp[tmp$variable %in% e & !tmp$library %in% c("chr3L_3310914_3311162_ref", "chr3L_3310914_3311162_wt", "S2171_wt", "Wt"),]
tmp2 $Variant_sequence <- sapply(strsplit(as.character(tmp2$Variant),"_"), `[`, length(strsplit(as.character(tmp2$Variant)[1],"_")[[1]])-1)
tmp2
# x labels counts per boxplot
if(e=="S21717_REEFmix2_log2FoldChange"){
$library <- factor(tmp2$library, levels = c("pos110", "pos241", "pos182", "pos230"))
tmp2<- sapply(as.character(levels(tmp2$library)), function(x){
xlabels paste0(x,"\n(", table(tmp2$library)[x], ")")
})="S2171_wt_+"
id<- "ced-6"
ti <- rbind(data.frame(Motif="CCGGAA", Pos=c("pos110", "pos241"), Average=c(Average_motif_enrichment_log2$Mean_Enrichment_log2[Average_motif_enrichment_log2$Motif=="CCGGAA" & Average_motif_enrichment_log2$varID=="ced6 pos110"],
variants_choice $Mean_Enrichment_log2[Average_motif_enrichment_log2$Motif=="CCGGAA" & Average_motif_enrichment_log2$varID=="ced6 pos241"])),
Average_motif_enrichment_log2data.frame(Motif="GATAAG", Pos=c("pos182", "pos230"), Average=c(Average_motif_enrichment_log2$Mean_Enrichment_log2[Average_motif_enrichment_log2$Motif=="GATAAG" & Average_motif_enrichment_log2$varID=="ced6 pos182"],
$Mean_Enrichment_log2[Average_motif_enrichment_log2$Motif=="GATAAG" & Average_motif_enrichment_log2$varID=="ced6 pos230"])))
Average_motif_enrichment_log2
}if(e=="chr3L_3310914_3311162_REEFmix3_log2FoldChange"){
<- sapply(as.character(levels(tmp2$library)), function(x){
xlabels paste0(x,"\n(", table(tmp2$library)[x], ")")
})=c("chrX_9273894_9274142_+_dCP_-")
id<- "ZnT63C"
ti <- rbind(data.frame(Motif="TCACGCGA", Pos=c("pos142", "pos180", "pos210"), Average=c(Average_motif_enrichment_log2$Mean_Enrichment_log2[Average_motif_enrichment_log2$Motif=="TCACGCGA" & Average_motif_enrichment_log2$varID=="ZnT63C pos142"],
variants_choice $Mean_Enrichment_log2[Average_motif_enrichment_log2$Motif=="TCACGCGA" & Average_motif_enrichment_log2$varID=="ZnT63C pos180"],
Average_motif_enrichment_log2$Mean_Enrichment_log2[Average_motif_enrichment_log2$Motif=="TCACGCGA" & Average_motif_enrichment_log2$varID=="ZnT63C pos210"])))
Average_motif_enrichment_log2
}
if(length(id)==2){tmp_col <- c("red", "grey40")}else{tmp_col <- "red"}
if(length(id)==2){tmp_ann <- c("ref", "wt")}else{tmp_ann <- "wt"}
<- ggplot(tmp2, aes(library, value, fill=library)) +
boxplot geom_boxplot(col=NA, fill=NA) +
geom_violin(alpha=0.7) +
geom_boxplot(outlier.size = -1, color="black", width=0.17, size=0.8, fill=NA) +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
geom_hline(yintercept = tmp$value[tmp$Variant %in% id & tmp$variable %in% e], linetype="dashed", col=tmp_col) +
annotate(geom="text", x=1, y=tmp$value[tmp$Variant %in% id & tmp$variable %in% e],
label=tmp_ann,
color=tmp_col,
hjust=2, vjust=-0.4, size = 4) +
scale_fill_manual("", values=my_col) +
guides(fill="none") +
scale_y_continuous("Enhancer activity (log2)", breaks = seq(-4,10,2), labels = c("1/16", "1/2", 2**seq(0,10,2)), limits = c(-2.28, 9.7)) +
scale_x_discrete("", labels=xlabels) +
ggtitle(ti) +
theme_bw(base_size = 14) +
theme(axis.text = element_text(colour="black"),
axis.title = element_text(colour="black"),
axis.line = element_blank(),
panel.border = element_rect(colour="black"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour="grey95", size=0.4),
axis.ticks = element_line(colour="black"),
plot.title = element_text(hjust = 0.5)
)
# add variants
$var <- NA
variants_choice$var_activity <- NA
variants_choicefor(m in unique(variants_choice$Motif)){
<- lapply(variants_choice$Pos[variants_choice$Motif==m], function(p){
var_list <- tmp2[tmp2$library %in% p,]
v <- v[grep(m, v$Variant_sequence),]
v # motif variants difference to avg
$delta <- v$value - variants_choice$Average[variants_choice$Motif==m & variants_choice$Pos==p]
vreturn(v[,c("Variant_sequence", "delta")])
}) <- Reduce(function(dtf1, dtf2) merge(dtf1, dtf2, by = "Variant_sequence"),
var_list_red
var_list)<- tmp2[grep(var_list_red$Variant_sequence[order(abs(rowSums(var_list_red[,-1])))][1], tmp2$Variant),]
out
$var[variants_choice$Motif==m] <- out$Variant[match(variants_choice$Pos[variants_choice$Motif==m], out$library)]
variants_choice$var_activity[variants_choice$Motif==m] <- out$value[match(variants_choice$Pos[variants_choice$Motif==m], out$library)]
variants_choice
}print(variants_choice)
<- boxplot +
boxplot geom_point(data=variants_choice, aes(Pos, var_activity), size=3, fill=NA, col=my_motif_col[as.character(variants_choice$Motif)])
<- boxplot
plot_list[[e]] }
## Motif Pos Average var var_activity
## 1 CCGGAA pos110 4.764639 GATAA_1st_ATCCGGAA_+ 5.172439
## 2 CCGGAA pos241 3.163757 GATAA_2nd_ATCCGGAA_+ 2.725990
## 3 GATAAG pos182 5.940204 CACA_2nd_CAGATAAG_+ 6.236382
## 4 GATAAG pos230 5.258812 GAGA_1st_CAGATAAG_+ 4.853649
## Motif Pos Average var var_activity
## 1 TCACGCGA pos142 7.031426 AP1_TCACGCGA_+ 6.764411
## 2 TCACGCGA pos180 7.725775 twist_TCACGCGA_+ 7.832483
## 3 TCACGCGA pos210 5.492017 GATAA_TCACGCGA_+ 4.760254
plot_grid(plotlist = plot_list, rel_widths = c(4,3))
# ced-6 enhancer
table(metadata_final$library, metadata_final$S21717_REEFmix2_log2FoldChange>metadata_final$S21717_REEFmix2_log2FoldChange[grep("S2171_wt", metadata_final$library)])
##
## FALSE TRUE
## S2171_wt 1 0
## chr3L_3310914_3311162_wt 0 1
## chr3L_3310914_3311162_ref 0 1
## Wt 3056 155
## pos110 63779 1465
## pos182 30530 20437
## pos230 18258 46861
## pos241 61411 600
## pos210 0 0
## pos180 0 0
## pos142 0 0
# ZnT63C chr3L_3310914_3311162 enhancer
table(metadata_final$library, metadata_final$chr3L_3310914_3311162_REEFmix3_log2FoldChange>metadata_final$chr3L_3310914_3311162_REEFmix3_log2FoldChange[grep("chr3L_3310914_3311162_wt", metadata_final$library)])
##
## FALSE TRUE
## S2171_wt 0 1
## chr3L_3310914_3311162_wt 1 0
## chr3L_3310914_3311162_ref 0 1
## Wt 2221 400
## pos110 0 0
## pos182 0 0
## pos230 0 0
## pos241 0 0
## pos210 33648 27751
## pos180 59858 2018
## pos142 55483 2918
# ZnT63C chr3L_3310914_3311162 REF enhancer (chr2R_4239779_4240027_+_dCP_+)
table(metadata_final$library, metadata_final$chr3L_3310914_3311162_REEFmix3_log2FoldChange>metadata_final$chr3L_3310914_3311162_REEFmix3_log2FoldChange[grep("chrX_9273894_9274142_+_dCP_-", metadata_final$Variant, fixed=T)])
##
## FALSE TRUE
## S2171_wt 1 0
## chr3L_3310914_3311162_wt 1 0
## chr3L_3310914_3311162_ref 1 0
## Wt 2610 11
## pos110 0 0
## pos182 0 0
## pos230 0 0
## pos241 0 0
## pos210 61249 150
## pos180 61830 46
## pos142 58375 26
Merge libraries - add enrichments and fw and rv sequences
<- read.delim("Table_enhancer_variant_counts_and_enrichment.txt", stringsAsFactors = F)
metadata_final
<- unique(unlist(lapply(grep("sequence_fw$", names(metadata_final), value = T), function(x) unique(metadata_final[,x]))))
vars = vars[nchar(vars)==8]
vars<- data.frame(row.names = vars[complete.cases(vars)],
Merged_enrichments variant_sequence_fw=vars[complete.cases(vars)],
variant_sequence_rv=as.character(reverseComplement(DNAStringSet(vars[complete.cases(vars)]))),
stringsAsFactors = F)
# only data for variants and wt
<- metadata_final[!metadata_final$library %in% c("Wt", "S2171_wt", "chr3L_3310914_3311162_ref"),]
metadata_final
# merge information in loop
for(e in sapply(strsplit(grep("_sequence_fw$", names(metadata_final), value = T),"_sequence_fw"), `[`, 1)){
if(length(grep("chr3L_3310914_3311162|twist_S2171", e))>0){
paste0(e, "_act")] <- metadata_final$chr3L_3310914_3311162_REEFmix3_log2FoldChange[match(rownames(Merged_enrichments), metadata_final[,paste0(e, "_sequence_fw")])]
Merged_enrichments[,else{
}paste0(e, "_act")] <- metadata_final$S21717_REEFmix2_log2FoldChange[match(rownames(Merged_enrichments), metadata_final[,paste0(e, "_sequence_fw")])]
Merged_enrichments[,
}
paste0(e, "_fw_extended4bp")] <- metadata_final[match(rownames(Merged_enrichments), metadata_final[,paste0(e, "_sequence_fw")]), paste0(e, "_sequence_fw_extended4bp")]
Merged_enrichments[,paste0(e, "_rv_extended4bp")] <- metadata_final[match(rownames(Merged_enrichments), metadata_final[,paste0(e, "_sequence_fw")]), paste0(e, "_sequence_rv_extended4bp")]
Merged_enrichments[,
}
names(Merged_enrichments) <- gsub("GATAA_1st_S2171", "ced6_pos110", names(Merged_enrichments))
names(Merged_enrichments) <- gsub("CACA_2nd_S2171", "ced6_pos182", names(Merged_enrichments))
names(Merged_enrichments) <- gsub("GAGA_1st_S2171", "ced6_pos230", names(Merged_enrichments))
names(Merged_enrichments) <- gsub("GATAA_2nd_S2171", "ced6_pos241", names(Merged_enrichments))
names(Merged_enrichments) <- gsub("AP1_chr3L_3310914_3311162", "ZnT63C_pos142", names(Merged_enrichments))
names(Merged_enrichments) <- gsub("twist_chr3L_3310914_3311162", "ZnT63C_pos180", names(Merged_enrichments))
names(Merged_enrichments) <- gsub("GATAA_chr3L_3310914_3311162", "ZnT63C_pos210", names(Merged_enrichments))
# write table
write.table(Merged_enrichments, "Enrichments_and_sequence_information_all_REEF_libraries.txt", sep="\t", row.names = F, quote=F)
<- read.delim("Enrichments_and_sequence_information_all_REEF_libraries.txt")
Merged_enrichments
### z-score normalise to just compare differences within libraries (against mean)
<- apply(Merged_enrichments[,grep("_act", names(Merged_enrichments), value = T)], 2, scale)
df_scaled colnames(df_scaled) <- paste0(colnames(df_scaled), "_scaled")
<- cbind(Merged_enrichments, df_scaled)
out
write.table(out, "Supplementary_Table_3.txt", sep="\t", quote=F, row.names=F)
Didn’t include plots because they are too heavy
<- read.delim("Enrichments_and_sequence_information_all_REEF_libraries.txt")
Merged_enrichments
### z-score normalise to just compare differences within libraries (against mean)
<- Merged_enrichments
df grep("_act", names(df), value = T)] <- apply(df[,grep("_act", names(df), value = T)], 2, scale)
df[,
for(l in 8:5){
if(l==8) df$variant_sequence_fw_corrected <- df$variant_sequence_fw
if(l==7) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 1,7)
if(l==6) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 2,7)
if(l==5) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 3,7)
if(l==4) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 3,6)
if(l==3) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 4,6)
if(l==2) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 4,5)
<- df %>%
tmp group_by(variant_sequence_fw_corrected) %>%
summarise(ced6_pos110=mean(ced6_pos110_act, na.rm=T),
ced6_pos182=mean(ced6_pos182_act, na.rm=T),
ced6_pos230=mean(ced6_pos230_act, na.rm=T),
ced6_pos241=mean(ced6_pos241_act, na.rm=T),
ZnT63C_pos142=mean(ZnT63C_pos142_act, na.rm=T),
ZnT63C_pos180=mean(ZnT63C_pos180_act, na.rm=T),
ZnT63C_pos210=mean(ZnT63C_pos210_act, na.rm=T)
)
<- as.data.frame(tmp)
tmp
### plots
= list()
plot_list_tmp = list()
plot_list_motifs = list()
plot_list_tmp_no_GATA<- names(tmp)[-1]
v for(i in 1:(length(v)-1)){
for(i2 in (i+1):length(v)){
<- v[i]
a <- v[i2]
b
# PCC
<- cor.test(tmp[,a],
pc
tmp[,b],method = "pearson")
=c("grey70","grey40")
my_col
<- ifelse(l>6,2,1) # axes breaks
br <- ifelse(l>4,br,0.5)
br
<- ifelse(l>6,0.5,0.7) # point size
si <- ifelse(l>4,si,2)
si
# plot
<- ggplot(tmp, aes(tmp[,a], tmp[,b])) +
scater geom_pointdensity(size=si, alpha=0.7) +
geom_abline(intercept = 0, slope = 1, linetype="dashed", col="grey50") +
scale_color_gradient(low = my_col[1], high = my_col[2]) +
scale_x_continuous(paste0(gsub("_", " ", a), " (mean log2 act scaled)"),
breaks=seq(-10,20, ifelse(l>6,2,1))) +
scale_y_continuous(paste0(gsub("_", " ", b), " (mean log2 act scaled)"),
breaks=seq(-10,20, ifelse(l>6,2,1))) +
guides(color=F) +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5, size=17)) +
# ggtitle(paste0(gsub("_", " ", a), " vs ", gsub("_", " ", b), " (", l, "nt variants)")) +
annotate("text", x=min(tmp[,a], na.rm=T), y = max(tmp[,b], na.rm=T), label = paste0("PCC: ", round(pc$estimate,2), "\n(n=", length(which(rowSums(is.na(tmp[,c(a,b)]))==0)),
")"), vjust=1, hjust=0, size=5)
paste0(a,"_",b)]] = ggplotGrob(scater)
plot_list_tmp[[
if(l>=5 & l<=6){
### add points with specific motifs
<- tmp[c(grep("GATAA", tmp$variant_sequence_fw_corrected),
tmp_subest grep("TTATC", tmp$variant_sequence_fw_corrected),
grep("CA..TG", tmp$variant_sequence_fw_corrected)),]
<- tmp_subest[-grep("CAGGTG|CACCTG", tmp_subest$variant_sequence_fw_corrected),]
tmp_subest
}if(l==7){
### add points with specific motifs
<- tmp[c(#grep("ACGTGAC", tmp$variant_sequence_fw_corrected),
tmp_subest #grep("GTCACGT", tmp$variant_sequence_fw_corrected),
grep("TGA.TCA", tmp$variant_sequence_fw_corrected)),]
}if(l==8){
### add points with specific motifs
<- tmp[c(grep("TCACGCGA", tmp$variant_sequence_fw_corrected),
tmp_subest grep("TCGCGTGA", tmp$variant_sequence_fw_corrected)),]
}
<- scater +
scater2 geom_point(data=tmp_subest, aes(tmp_subest[,a], tmp_subest[,b]), size=0.8) +
geom_text_repel(data=tmp_subest,
aes(tmp_subest[,a], tmp_subest[,b],
label = variant_sequence_fw_corrected), size = 2.5)
if(l==5){
### add points with specific motifs
<- tmp[order(rowMeans(tmp[,c(a,b)]), decreasing = T),][1:10,]
tmp_subest2 <- tmp_subest2[-c(grep("GATAA", tmp_subest2$variant_sequence_fw_corrected),
tmp_subest2 grep("TTATC", tmp_subest2$variant_sequence_fw_corrected)),]
<- scater2 +
scater2 geom_text_repel(data=tmp_subest2,
aes(tmp_subest2[,a], tmp_subest2[,b],
label = variant_sequence_fw_corrected), size = 2.5,
colour="grey40")
}
paste0(a,"_",b)]] = ggplotGrob(scater2)
plot_list_motifs[[
}
}pdf(paste0("Compared_enrichments_per_position_different_variant_lengths_scaled_", l, "bp.pdf"), width = 28, height = 25)
print(gridExtra::grid.arrange(grobs = plot_list_tmp))
if(l>=5) print(gridExtra::grid.arrange(grobs = plot_list_motifs))
dev.off()
print(l)
}
<- read.delim("Enrichments_and_sequence_information_all_REEF_libraries.txt")
Merged_enrichments
### I could maybe z-score normalise to just compare differences within libraries (against mean)
<- Merged_enrichments
df grep("_act", names(df), value = T)] <- apply(df[,grep("_act", names(df), value = T)], 2, scale)
df[,
=6
l$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 2,7)
df
<- df %>%
tmp group_by(variant_sequence_fw_corrected) %>%
summarise(GATAA_1st_ced6=mean(GATAA_1st_ced6_act, na.rm=T),
CACA_ced6=mean(CACA_ced6_act, na.rm=T),
GAGA_ced6=mean(GAGA_ced6_act, na.rm=T),
GATAA_2nd_ced6=mean(GATAA_2nd_ced6_act, na.rm=T),
GATAA_ZnT63C=mean(GATAA_ZnT63C_act, na.rm=T),
AP1_ZnT63C=mean(AP1_ZnT63C_act, na.rm=T),
twist_ZnT63C=mean(twist_ZnT63C_act, na.rm=T)
)
<- as.data.frame(tmp)
tmp
### plots
= list()
plot_list_motifs <- names(tmp)[-1]
v for(i in 1:(length(v)-1)){
for(i2 in (i+1):length(v)){
<- v[i]
a <- v[i2]
b
# PCC
<- cor.test(tmp[,a],
pc
tmp[,b],method = "pearson")
=c("grey70","grey40")
my_col
<- ifelse(l>6,2,1) # axes breaks
br <- ifelse(l>4,br,0.5)
br
<- ifelse(l>6,0.5,0.7) # point size
si <- ifelse(l>4,si,2)
si
# plot
<- ggplot(tmp, aes(tmp[,a], tmp[,b])) +
scater geom_pointdensity(size=si, alpha=0.7) +
geom_abline(intercept = 0, slope = 1, linetype="dashed", col="grey50") +
scale_color_gradient(low = my_col[1], high = my_col[2]) +
scale_x_continuous(paste0(gsub("_", " ", a), " (mean log2 act scaled)"),
breaks=seq(-10,20,1)) +
scale_y_continuous(paste0(gsub("_", " ", b), " (mean log2 act scaled)"),
breaks=seq(-10,20,1)) +
guides(color=F) +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5, size=17)) +
# ggtitle(paste0(gsub("_", " ", a), " vs ", gsub("_", " ", b), " (", l, "nt variants)")) +
annotate("text", x=min(tmp[,a], na.rm=T), y = max(tmp[,b], na.rm=T), label = paste0("PCC: ", round(pc$estimate,2), "\n(n=", length(which(rowSums(is.na(tmp[,c(a,b)]))==0)),
")"), vjust=1, hjust=0, size=5)
### add points with specific motifs
<- tmp[grep("GATAA|TTATC", tmp$variant_sequence_fw_corrected),]
tmp_subest_GATA <- tmp_subest_GATA[-grep("AGATAA|TTATCT|GATAAT|ATTATC", tmp_subest_GATA$variant_sequence_fw_corrected),]
tmp_subest_GATA $Motif <- "GATA"
tmp_subest_GATA<- tmp[grep("CACATG|CAGATG|CATGTG|CATATG|CATCTG|CACGTG|CAGCTG", tmp$variant_sequence_fw_corrected),]
tmp_subest_twist $Motif <- "twist"
tmp_subest_twist<- tmp[grep("CCGGAA|TTCCGG", tmp$variant_sequence_fw_corrected),]
tmp_subest_ETS $Motif <- "ETS"
tmp_subest_ETS<- rbind(tmp_subest_GATA, tmp_subest_twist, tmp_subest_ETS)
tmp_subest
<- scater +
scater2 geom_point(data=tmp_subest, aes(tmp_subest[,a], tmp_subest[,b], fill=Motif), size=2, pch=21, stroke = 0.2) +
scale_fill_manual(values=my_motif_col) +
geom_text_repel(data=tmp_subest,
aes(tmp_subest[,a], tmp_subest[,b],
label = variant_sequence_fw_corrected), size = 2.3, col="black")
paste0(a,"_",b)]] = ggplotGrob(scater2)
plot_list_motifs[[
}
}
print(gridExtra::grid.arrange(grobs = plot_list_motifs))
require(pheatmap)
<- read.delim("Enrichments_and_sequence_information_all_REEF_libraries.txt")
Merged_enrichments
### I could maybe z-score normalise to just compare differences within libraries (against mean)
<- Merged_enrichments
df grep("_act", names(df), value = T)] <- apply(df[,grep("_act", names(df), value = T)], 2, scale)
df[,
=seq(0,1,length.out = 200)
breaksList= colorRampPalette(rev(RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")))(length(breaksList))
mycol
<- data.frame()
pearson_out for(l in 8:2){
if(l==8) df$variant_sequence_fw_corrected <- df$variant_sequence_fw
if(l==7) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 1,7)
if(l==6) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 2,7)
if(l==5) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 3,7)
if(l==4) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 3,6)
if(l==3) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 4,6)
if(l==2) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 4,5)
<- df %>%
tmp group_by(variant_sequence_fw_corrected) %>%
summarise(ced6_pos110=mean(ced6_pos110_act, na.rm=T),
ced6_pos182=mean(ced6_pos182_act, na.rm=T),
ced6_pos230=mean(ced6_pos230_act, na.rm=T),
ced6_pos241=mean(ced6_pos241_act, na.rm=T),
ZnT63C_pos142=mean(ZnT63C_pos142_act, na.rm=T),
ZnT63C_pos180=mean(ZnT63C_pos180_act, na.rm=T),
ZnT63C_pos210=mean(ZnT63C_pos210_act, na.rm=T)
)
<- as.data.frame(tmp)
tmp
### correlation heatmap - Pearson
<- cor(tmp[,-1], method = "pearson", use = "pairwise.complete.obs")
cormat print(pheatmap(cormat,
clustering_distance_cols = as.dist(1 - cormat),
clustering_distance_rows = as.dist(1 - cormat),
fontsize_row = 10,
angle_col = 0,
#fontsize_col = 12,
show_colnames = F,
color=mycol,
breaks=breaksList,
border_color = NA,
main=paste0(l, "nt variants")))
# pearson out
<- function(matrix){
cor_function <- cor(x = matrix, method = "pearson", use = "pairwise.complete.obs")
cors <- cors[upper.tri(cors, diag = FALSE)] # we're only interested in one of the off-diagonals, otherwise there'd be duplicates
cor.data <- as.data.frame(cor.data) # that's how ggplot likes it
cor.data colnames(cor.data) <- "corr"
return(cor.data$corr)
}
<- rbind(pearson_out, data.frame(Length=l,
pearson_out PCC=cor_function(tmp[,-1])))
print(l)
}
## [1] 8
## [1] 7
## [1] 6
## [1] 5
## [1] 4
## [1] 3
## [1] 2
### plot PCCs in function of length
$Length <- factor(pearson_out$Length, levels = unique(pearson_out$Length), labels = paste0(unique(pearson_out$Length), "nt"))
pearson_out
<- ggplot(pearson_out, aes(Length, PCC, fill=Length)) +
boxplot_PCC geom_boxplot(outlier.size = 0.7) +
scale_fill_brewer("", palette = "Blues") +
guides(fill=F) +
scale_y_continuous("PCC", breaks = seq(-10,18,0.2), limits = c(0,1)) +
xlab("Variant length") +
theme_bw(base_size = 11) +
theme(axis.text = element_text(colour="black"),
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(size = 13),
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 15),
legend.text = element_text(size = 14)
)
print(boxplot_PCC)
<- read.delim("Table_enhancer_variant_counts_and_enrichment.txt")
Count_table_final
names(Count_table_final) <- gsub("GATAA_1st_S2171", "ced6_pos110", names(Count_table_final))
names(Count_table_final) <- gsub("CACA_2nd_S2171", "ced6_pos182", names(Count_table_final))
names(Count_table_final) <- gsub("GAGA_1st_S2171", "ced6_pos230", names(Count_table_final))
names(Count_table_final) <- gsub("GATAA_2nd_S2171", "ced6_pos241", names(Count_table_final))
names(Count_table_final) <- gsub("AP1_chr3L_3310914_3311162", "ZnT63C_pos142", names(Count_table_final))
names(Count_table_final) <- gsub("twist_chr3L_3310914_3311162", "ZnT63C_pos180", names(Count_table_final))
names(Count_table_final) <- gsub("GATAA_chr3L_3310914_3311162", "ZnT63C_pos210", names(Count_table_final))
$library <- factor(Count_table_final$library, levels = c("Wt", "S2171_wt", "chr3L_3310914_3311162_wt", "chr3L_3310914_3311162_ref",
Count_table_final"ced6_pos110", "ced6_pos182", "ced6_pos230", "ced6_pos241", "ZnT63C_pos142", "ZnT63C_pos180", "ZnT63C_pos210"))
<- c("TGA.TCA", "GATAAG", "TCATCA", "TTCC.GGA",
motif_interest_list "GAGAGA",
"CATCTG", "CCGGAA", "TCACGCGA",
"AGGATAA"
)names(motif_interest_list) <- motif_interest_list
<- data.frame()
Data_for_example_boxplots for(lib in gsub("_sequence_fw", "", grep("sequence_fw$", names(Count_table_final), value = T))){
if(length(grep("ced6", lib)) > 0){
<- lib
lib2 <- "S21717_REEFmix2"
mix <- "S2171_wt"
wt <- "S21717_REEFmix2_log2FoldChange"
var else{
}<- lib
lib2 <- "chr3L_3310914_3311162_REEFmix3"
mix #wt <- "chr3L_3310914_3311162_wt" # use reference enhancer instead of wt? (chr2R_4239779_4240027_+_dCP_+)
<- "chrX_9273894_9274142_+_dCP_-" # use reference enhancer (should have the same activity as the wt) because I don't trust the wt levels
wt <- "chr3L_3310914_3311162_REEFmix3_log2FoldChange"
var
}
<- Count_table_final[Count_table_final$library %in% c(lib2, wt) | Count_table_final$Variant %in% paste0(wt, "_+"),
tmp_main c(1:4,grep(lib, names(Count_table_final)), grep(mix, names(Count_table_final)))]
$log2FoldChange <- tmp_main[,var]
tmp_main<- tmp_main[complete.cases(tmp_main$log2FoldChange),]
tmp_main
<- tmp_main[order(tmp_main$log2FoldChange, decreasing = T),]
tmp
# make table with enrichments per motif
<- lapply(motif_interest_list, function(m){
motif_variant_enrichment_16bp =tmp_main$log2FoldChange[grep(m, tmp_main[,paste0(lib, "_sequence_fw_extended4bp")])]
fwif(!DNAString(m) == reverseComplement(DNAString(m))){rv=tmp_main$log2FoldChange[grep(m, tmp_main[,paste0(lib, "_sequence_rv_extended4bp")])]}else{rv <- c()}
# this library has TCA in right flanks - don't allow to use that otherwise it will use a lot of variants
if(lib=="ZnT63C_pos180" & m=="TGA.TCA"){
=tmp_main$log2FoldChange[grep(paste0(m, "."), tmp_main[,paste0(lib, "_sequence_fw_extended4bp")])]
fw
}
data.frame(Motif=m,
Strand=c(rep("fw",length(fw)), rep("rv",length(rv))),
Enrichment_log2=c(fw, rv))
})
# plot for both tables
="16bp"
seq
# join all
<- do.call(rbind, motif_variant_enrichment_16bp)
motif_variant_enrichment
# set factor levels
<- motif_variant_enrichment %>%
motif_levels group_by(Motif) %>%
summarise(median=median(Enrichment_log2)) %>%
arrange(desc(median)) %>%
select(Motif)
$Motif <- factor(motif_variant_enrichment$Motif, levels =motif_interest_list)
motif_variant_enrichment
# x labels counts per boxplot
<- data.frame(table(motif_variant_enrichment$Motif))
t <- sapply(as.character(levels(motif_variant_enrichment$Motif)), function(x){
xlabels =""
TFif(x=="GATAAG") TF="GATA"
if(x=="TGA.TCA") TF="AP-1"
if(x=="CATCTG") TF="twist"
if(x=="GAGAG") TF="Trl"
if(x=="GAGAGA") TF="Trl"
if(x=="CCGGAA") TF="ETS"
if(x=="TCACGCGA") TF="SREBP"
if(x=="TTCC.GGA") TF="STAT"
if(x=="TGATGT") TF="CREB"
if(x=="TGTGAAA") TF="CREB"
if(x=="TCATCA") TF="CREB"
if(x=="ATCGAT") TF="Dref"
if(x=="AGGATAA") TF="ttk"
if(x %in% c("All", "Others")){
paste0(TF,"\n(", x, t$Freq[t$Var1%in%x&t$Var2%in%"rv"], ")")}
else{
paste0(TF,"\n(",
", ", t$Freq[t$Var1%in%x], ")")}
x,
})
<- ggplot(motif_variant_enrichment, aes(Motif, Enrichment_log2, fill=Motif)) +
boxplot_known_motifs geom_boxplot(outlier.size = 0.6, alpha=0.9) +
#geom_hline(yintercept = median(metadata_final$log2_ratio_UMI_corrected), linetype="dashed", col="grey40") +
geom_hline(yintercept = tmp_main$log2FoldChange[tmp_main$Variant %in% paste0(wt, "_+")], linetype="dashed", col="orangered") +
geom_hline(yintercept = median(tmp_main$log2FoldChange), linetype="dashed", col="grey60") +
scale_fill_manual(values=my_motif_col) +
scale_alpha_manual(values=c(0.3,0.8)) +
scale_y_continuous("Enhancer activity (log2)", breaks = seq(-2,10,1), labels = c("1/4", "1/2", 2**seq(0,10,1)), limits = c(min(tmp_main$log2FoldChange),max(tmp_main$log2FoldChange))) +
scale_x_discrete("", labels=xlabels) +
guides(fill=F) +
ggtitle(gsub("chr3L 3310914 3311162", "ZnT63C", gsub("S2171", "ced-6", gsub("_", " ", gsub("CACA_2nd", "CACA", gsub("GAGA_1st", "GAGA", lib)))))) +
theme_bw(base_size = 14) +
theme(axis.text = element_text(colour="black"),
axis.text.x = element_text(colour="black", size=9.5, angle=45, hjust=1),
axis.title = element_text(colour="black"),
# axis.title.y = element_blank(),
axis.line = element_blank(),
panel.border = element_rect(colour="black"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour="grey95", size=0.4),
axis.ticks = element_line(colour="black"),
plot.title = element_text(hjust=0.5)
)
print(boxplot_known_motifs)
<- gsub("chr3L 3310914 3311162", "ZnT63C", gsub("S2171", "ced-6", gsub("_", " ", gsub("CACA_2nd", "CACA", gsub("GAGA_1st", "GAGA", lib)))))
varID <- rbind(Data_for_example_boxplots,
Data_for_example_boxplots cbind(varID, motif_variant_enrichment))
}
# average per motifs
<- Data_for_example_boxplots %>% group_by(varID, Motif) %>%
out summarise(Mean_Enrichment_log2=mean(Enrichment_log2, na.rm=T))
saveRDS(out, "Average_motif_enrichment_log2.rds")
<- c("TGA.TCA", "GATAAG", "TCATCA", "TTCC.GGA",
motif_interest_list "GAGAGA",
"CATCTG", "CCGGAA", "TCACGCGA",
"AGGATAA"
)names(motif_interest_list) <- motif_interest_list
<- data.frame()
Average_results <- data.frame()
Data_for_example_boxplots for(lib in gsub("_sequence_fw", "", grep("sequence_fw$", names(Count_table_final), value = T))){
if(length(grep("ced6", lib))>0){
<- lib
lib2 <- "S21717_REEFmix2"
mix <- "S2171_wt_+"
wt <- "S21717_REEFmix2_log2FoldChange"
var else{
}<- lib
lib2 <- "chr3L_3310914_3311162_REEFmix3"
mix #wt <- "chr3L_3310914_3311162_wt" # use reference enhancer instead of wt? (chr2R_4239779_4240027_+_dCP_+)
<- "chrX_9273894_9274142_+_dCP_-" # use reference enhancer (should have the same activity as the wt) because I don't trust the wt levels
wt <- "chr3L_3310914_3311162_REEFmix3_log2FoldChange"
var
}
<- Count_table_final[Count_table_final$library %in% c(lib2, wt) | Count_table_final$Variant %in% paste0(wt, "_+"),
tmp_main c(1:5,grep(lib, names(Count_table_final)), grep(mix, names(Count_table_final)))]
$log2FoldChange <- scale(tmp_main[,var])
tmp_main<- tmp_main[complete.cases(tmp_main$log2FoldChange),]
tmp_main
# make table with enrichments per motif
<- lapply(motif_interest_list, function(m){
motif_variant_enrichment_16bp =tmp_main$log2FoldChange[grep(m, tmp_main[,paste0(lib, "_sequence_fw_extended4bp")])]
fwif(!DNAString(m) == reverseComplement(DNAString(m))){rv=tmp_main$log2FoldChange[grep(m, tmp_main[,paste0(lib, "_sequence_rv_extended4bp")])]}else{rv <- c()}
# this library has TCA in right flanks - don't allow to use that otherwise it will use a lot of variants
if(lib=="ZnT63C_pos180" & m=="TGA.TCA"){
=tmp_main$log2FoldChange[grep(paste0(m, "."), tmp_main[,paste0(lib, "_sequence_fw_extended4bp")])]
fw
}
data.frame(Motif=m,
Strand=c(rep("fw",length(fw)), rep("rv",length(rv))),
Enrichment_log2=c(fw, rv))
})
# plot for both tables
="16bp"
seq
# join all
<- do.call(rbind, motif_variant_enrichment_16bp)
motif_variant_enrichment
# save main results
<- gsub("ced6", "ced-6", gsub("_", " ", lib))
varID <- rbind(Average_results,
Average_results cbind(data.frame(Lib=varID,
Seq=seq),
%>%
motif_variant_enrichment group_by(Motif) %>%
summarise(Avg=mean(Enrichment_log2),
Median=median(Enrichment_log2)) )
)
# set factor levels
<- motif_variant_enrichment %>%
motif_levels group_by(Motif) %>%
summarise(median=median(Enrichment_log2)) %>%
arrange(desc(median)) %>%
select(Motif)
$Motif <- factor(motif_variant_enrichment$Motif, levels =motif_interest_list)
motif_variant_enrichment
# x labels counts per boxplot
<- data.frame(table(motif_variant_enrichment$Motif))
t <- sapply(as.character(levels(motif_variant_enrichment$Motif)), function(x){
xlabels =""
TFif(x=="GATAAG") TF="GATA"
if(x=="TGA.TCA") TF="AP-1"
if(x=="CATCTG") TF="twist"
if(x=="GAGAG") TF="Trl"
if(x=="GAGAGA") TF="Trl"
if(x=="CCGGAA") TF="ETS"
if(x=="TCACGCGA") TF="SREBP"
if(x=="TTCC.GGA") TF="STAT"
if(x=="TGATGT") TF="CREB"
if(x=="TGTGAAA") TF="CREB"
if(x=="TCATCA") TF="CREB"
if(x=="ATCGAT") TF="Dref"
if(x=="AGGATAA") TF="ttk"
if(x %in% c("All", "Others")){
paste0(TF,"\n(", x, t$Freq[t$Var1%in%x&t$Var2%in%"rv"], ")")}
else{
paste0(TF,"\n(",
", ", t$Freq[t$Var1%in%x], ")")}
x,
})
<- ggplot(motif_variant_enrichment, aes(Motif, Enrichment_log2, fill=Motif)) +
boxplot_known_motifs geom_boxplot(outlier.size = 0.3, alpha=0.9) +
geom_hline(yintercept = 0, linetype="dashed", col="grey60") +
scale_fill_manual(values=my_motif_col) +
scale_alpha_manual(values=c(0.3,0.8)) +
scale_y_continuous("Enhancer activity (z-scores)", breaks = seq(-10,10,2)) +
scale_x_discrete("", labels=xlabels) +
guides(fill=F) +
ggtitle(varID) +
theme_bw(base_size = 14) +
theme(axis.text = element_text(colour="black"),
axis.text.x = element_text(colour="black", size=9.5, angle=45, hjust=1),
axis.title = element_text(colour="black"),
# axis.title.y = element_blank(),
axis.line = element_blank(),
panel.border = element_rect(colour="black"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour="grey95", size=0.4),
axis.ticks = element_line(colour="black"),
plot.title = element_text(hjust=0.5)
)
print(boxplot_known_motifs)
if(seq=="16bp"){
<- rbind(Data_for_example_boxplots,
Data_for_example_boxplots cbind(varID, motif_variant_enrichment))
}
}
<- Data_for_example_boxplots[Data_for_example_boxplots$varID %in% c("ced-6 pos241", "ZnT63C pos180") &
tmp $Motif %in% c("TGA.TCA", "GATAAG", "CCGGAA", "AGGATAA"),]
Data_for_example_boxplots
<- sapply(as.character(unique(tmp$Motif)), function(x){
xlabels =""
TFif(x=="GATAAG") TF="GATA"
if(x=="TGA.TCA") TF="AP-1"
if(x=="CATCTG") TF="twist"
if(x=="GAGAG") TF="Trl"
if(x=="CCGGAA") TF="ETS"
if(x=="TCACGCGA") TF="SREBP"
if(x=="TCC.GGA") TF="STAT"
if(x=="TGATGT") TF="CREB"
if(x=="TGTGAAA") TF="CREB"
if(x=="AGGATAA") TF="ttk"
# paste0(TF,"\n(", x, ")")
paste0(TF)
})
<- ggplot(tmp, aes(Motif, Enrichment_log2, fill=Motif)) +
boxplot_ex geom_boxplot(outlier.size = 0.3) +
geom_hline(yintercept = 0, linetype="dashed", col="grey60") +
scale_fill_manual(values=my_motif_col) +
# scale_alpha_manual(values=c(0.5,0.9)) +
scale_y_continuous("Enhancer activity (z-scores)", breaks = seq(-10,10,2)) +
scale_x_discrete("Created TF motifs", labels=xlabels) +
guides(fill=F) +
facet_grid(~varID) +
theme_bw(base_size = 16) +
theme(axis.text = element_text(colour="black"),
axis.text.x = element_text(colour="black", size=11.5),
axis.title = element_text(colour="black"),
# axis.title.y = element_blank(),
axis.line = element_blank(),
panel.border = element_rect(colour="black"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour="grey95", size=0.4),
axis.ticks = element_line(colour="black"),
plot.title = element_text(hjust=0.5)
)
print(boxplot_ex)
<- data.table::dcast(Average_results[Average_results$Seq =="16bp",], Lib ~ Motif, value.var = "Avg", fun=mean)
tmp rownames(tmp) <- tmp$Lib
<- tmp[,-1]
tmp
names(tmp) <- c("TGA.TCA"="AP-1",
"GATAAG"="GATA",
CATCTG="twist",
"TTCC.GGA"="STAT",
"CCGGAA"="ETS",
"GAGAGA"="Trl",
"TCACGCGA"="SREBP",
"TCATCA"="CREB",
"AGGATAA"="ttk")[names(tmp)]
library(pheatmap)
<- 200
paletteLength <- c(seq(min(tmp, na.rm=T), 0, length.out=ceiling(paletteLength/2) + 1),
myBreaks seq(max(tmp, na.rm=T)/paletteLength, max(tmp, na.rm=T), length.out=floor(paletteLength/2)))
<- rev(colorRampPalette(
col c("#B2182B", "#D6604D", "#F4A582", "#FDDBC7",
"#FFFFFF", "#FFFFFF",
"#92C5DE", "#92C5DE", "#4393C3", "#4393C3"))(length(myBreaks)-1))
pheatmap(t(tmp)[,c(4,1,2,5,6,3,7)],
cluster_cols = F,
cluster_rows = F,
angle_col=90,
color=col,
breaks = myBreaks,
legend_breaks=seq(-10,10,1),
border_color = "grey60",
fontsize_row=10,
fontsize_col=10,
main=paste0("Scaled log2 avg activity of motifs"),
width = 4, height = 4
)
<- list()
Variants_res_list for(lib in gsub("_sequence_fw", "", grep("sequence_fw$", names(Count_table_final), value = T))[c(1,4:7)]){
if(length(grep("ced6", lib))>0){
<- "S21717_REEFmix2"
mix <- "S2171_wt_+"
wt <- "S21717_REEFmix2_log2FoldChange"
var else{
}<- "chr3L_3310914_3311162_REEFmix3"
mix <- "chrX_9273894_9274142_+_dCP_-" # use reference enhancer (should have the sme activity as the wt) because I don't trust the wt levels
wt <- "chr3L_3310914_3311162_REEFmix3_log2FoldChange"
var
}
<- Count_table_final[Count_table_final$library %in% c(lib, wt) | Count_table_final$Variant %in% wt,
tmp_main c(1:5,grep(lib, names(Count_table_final)), grep(mix, names(Count_table_final)))]
$log2FoldChange <- tmp_main[,var]
tmp_main<- tmp_main[complete.cases(tmp_main$log2FoldChange),]
tmp_main
# do the variants with high activity match to different motifs?
<- tmp_main[tmp_main$log2FoldChange > tmp_main$log2FoldChange[tmp_main$library==wt | tmp_main$Variant %in% wt],]
top_sequences rownames(top_sequences) <- top_sequences$Variant
### All enriched motifs (FDR<0.05)
<- readRDS("data/PWM_candidates.rds") # from 20201124_REEF_S2171_and_chr3L_3310914_3311162_mixs2_3_reanalysed/Prepare_strings_from_motifs.R
PWM_candidates <- PWM_candidates[!PWM_candidates$X..motif_collection_name %in% "modisco",]
PWM_candidates <- PWM_candidates[PWM_candidates$FDR_motif_enr<0.05 & PWM_candidates$log2_motif_enr>0,]
PWM_candidates
require(motifmatchr)
require(TFBSTools)
load("data/TF_clusters_PWMs.RData") # load motifs
# count motifs in all enhancers
=list()
opt$pvalue_cutoff <- c(1e-4)
opt$genome="BSgenome.Dmelanogaster.UCSC.dm3"
opt
# motif counts
# for motif enrichment is is fine to do not reduce, because it will just be present vs non-present
<- list()
motif_counts_all for(p in opt$pvalue_cutoff){
# motif counts
<- matchMotifs(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% PWM_candidates$motif_name],
motif_ix_scores as.character(top_sequences[[paste0(lib, "_sequence_fw_extended4bp")]]),
genome = opt$genome, p.cutoff = p, bg="genome", out = "scores")
<- as.data.frame(as.matrix(motifCounts(motif_ix_scores)))
scores names(scores) <- name(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% PWM_candidates$motif_name])
<- cbind(top_sequences, scores)
motif_counts
paste0("pvalue_", p)]] <- motif_counts
motif_counts_all[[print(p)
}
# sequences explained by main motifs, others, multiple, or none
<- list(GATA=data.frame(Motif="GATA", ID="flyfactorsurvey__srp_SANGER_5_FBgn0003507"),
TF_motifs GATA=data.frame(Motif="GATA", ID="flyfactorsurvey__GATAd_SANGER_5_FBgn0032223"),
GATA=data.frame(Motif="GATA", ID=as.character(PWM_candidates$motif_name[grep("srp|gatad", PWM_candidates$motif_description2, ignore.case = T)])),
AP1=data.frame(Motif="AP-1", ID="flyfactorsurvey__kay_Jra_SANGER_5_FBgn0001291"),
AP1=data.frame(Motif="AP-1", ID=as.character(PWM_candidates$motif_name[grep("jra|kay|fos|jun", PWM_candidates$motif_description2, ignore.case = T)])),
twist=data.frame(Motif="twist", ID="flyfactorsurvey__twi_da_SANGER_5_FBgn0000413"),
twist=data.frame(Motif="twist", ID=as.character(PWM_candidates$motif_name[grep("twi", PWM_candidates$motif_description2, ignore.case = T)])),
Trl=data.frame(Motif="Trl", ID="flyfactorsurvey__Trl_FlyReg_FBgn0013263"),
Trl=data.frame(Motif="Trl", ID=as.character(PWM_candidates$motif_name[grep("trl|gaga", PWM_candidates$motif_description2, ignore.case = T)])),
ETS=data.frame(Motif="ETS", ID="flyfactorsurvey__Ets97D_SANGER_10_FBgn0004510"),
ETS=data.frame(Motif="ETS", ID=as.character(PWM_candidates$motif_name[grep("ETS", PWM_candidates$motif_description2, ignore.case = T)])),
SREBP=data.frame(Motif="SREBP", ID="flyfactorsurvey__HLH106_SANGER_10_FBgn0015234"),
SREBP=data.frame(Motif="SREBP", ID=as.character(PWM_candidates$motif_name[grep("SREBP|HLH106", PWM_candidates$motif_description2, ignore.case = T)])),
STAT=data.frame(Motif="STAT", ID="bergman__Stat92E"),
STAT=data.frame(Motif="STAT", ID=as.character(PWM_candidates$motif_name[grep("stat", PWM_candidates$motif_description2, ignore.case = T)])),
CREB=data.frame(Motif="CREB/ATF", ID="cisbp__M0295"),
CREB=data.frame(Motif="CREB/ATF", ID=as.character(PWM_candidates$motif_name[grep("creb", PWM_candidates$motif_description2, ignore.case = T)])
)
)<- do.call(rbind, TF_motifs)
TF_motifs $Motif <- factor(TF_motifs$Motif)
TF_motifs
<- lapply(motif_counts_all, function(p){
Sequences_motifs_16bp
<- data.frame(sapply(levels(TF_motifs$Motif), function(m){
candidates_variant_enrichment return(rowSums(p[,as.character(TF_motifs$ID[TF_motifs$Motif==m])])>0)
}))"Other motifs"] <- rowSums(p[,19:ncol(p)])>0
candidates_variant_enrichment[,colSums(candidates_variant_enrichment)
return(candidates_variant_enrichment)
})colSums(Sequences_motifs_16bp[[1]])
table(rowSums(Sequences_motifs_16bp[[1]][,1:8]))
# save data
<- Sequences_motifs_16bp[[1]]
Variants_res_list[[lib]]
# which variants
<- lapply(Sequences_motifs_16bp, function(i){
all_var_list "Group"] <- apply(i, 1, function(x){
i[,if(sum(x[1:8]==TRUE)>1){"Multiple"
else if(sum(x[1:8]==TRUE)==1){paste0(colnames(i)[1:8][x[1:8]==TRUE], collapse = "_")
}else if(sum(x[1:8]==TRUE)==0 & x["Other motifs"]==TRUE){paste0(colnames(i)[x==TRUE], collapse = "_")
}else{"No motifs"}
}
})<- table(i[,"Group"])
all_var names(all_var)[names(all_var)=="AP.1"] <- "AP-1"
names(all_var)[names(all_var)=="CREB.ATF"] <- "CREB/ATF"
<- data.frame(all_var)
df $Var1 <- factor(df$Var1, levels=rev(c("No motifs", "Other motifs", "Multiple", rev(sort(levels(TF_motifs$Motif))))))
df
df
})
<- ggplot() +
gg_bar2 geom_bar(data=all_var_list[[1]], aes("1e-04", Freq, fill=Var1), position="stack", stat="identity", width=0.8) +
xlab("PWM cutoff") +
scale_y_continuous("# of variants", expand = c(0,0)) +
ggtitle(gsub("ced6", "ced-6", gsub("_", " ", lib))) +
scale_fill_manual("Motifs", values=c(my_motif_col, "No motifs"="grey90", "Other motifs"="#b3cccc", Multiple="#FF9999"),
limits = levels(all_var_list[[1]]$Var1)) +
theme(plot.title = element_text(hjust=0.5, colour=my_col[lib]))
print(gg_bar2)
print(lib)
}
## [1] 1e-04
## [1] "ced6_pos110"
## [1] 1e-04
## [1] "ced6_pos241"
## [1] 1e-04
## [1] "ZnT63C_pos210"
## [1] 1e-04
## [1] "ZnT63C_pos142"
## [1] 1e-04
## [1] "ZnT63C_pos180"
saveRDS(Variants_res_list, "Variants_motif_matching_list.rds")
<- c("ced6_pos110"="AAGATAAT",
wt_motif_seq "ced6_pos241"="CTTATCGC",
"ZnT63C_pos142"="ATGACTCA",
"ZnT63C_pos180"="GCAGATGT",
"ZnT63C_pos210"="CTTATCTT")
require(stringdist)
for(lib in names(wt_motif_seq)){
if(length(grep("ced6", lib))>0){
<- "S21717_REEFmix2"
mix <- "S2171_wt_+"
wt <- "S21717_REEFmix2_log2FoldChange"
var else{
}<- "chr3L_3310914_3311162_REEFmix3"
mix <- "chrX_9273894_9274142_+_dCP_-" # use reference enhancer (should have the sme activity as the wt) because I don't trust the wt levels
wt <- "chr3L_3310914_3311162_REEFmix3_log2FoldChange"
var
}
<- Count_table_final[Count_table_final$library %in% c(lib, wt) | Count_table_final$Variant %in% wt,
tmp_main c(1:5,grep(lib, names(Count_table_final)), grep(mix, names(Count_table_final)))]
$log2FoldChange <- tmp_main[,var]
tmp_main<- tmp_main[complete.cases(tmp_main$log2FoldChange),]
tmp_main
# log2FC
$log2FC_wt <- tmp_main$log2FoldChange-tmp_main$log2FoldChange[tmp_main$library==wt | tmp_main$Variant %in% wt]
tmp_main
# edit distance - minimum of fw and rv distance
$variant_seq <- tmp_main[,paste0(lib, "_sequence_fw")]
tmp_main<- wt_motif_seq[lib]
wt_str <- as.character(reverseComplement(DNAString(wt_str)))
wt_rev $Edit_dist_fw <- stringdist(wt_str, tmp_main$variant_seq, method="hamming")
tmp_main$Edit_dist_rv <- stringdist(wt_rev, tmp_main$variant_seq, method="hamming")
tmp_main$Edit_dist_min <- factor(apply(tmp_main[,c("Edit_dist_fw", "Edit_dist_rv")], 1, min))
tmp_main<- tmp_main[!(tmp_main$library==wt | tmp_main$Variant %in% wt | tmp_main$variant_seq %in% c(wt_str, wt_rev)),]
tmp_main
<- ggplot(tmp_main, aes(Edit_dist_min,log2FC_wt, fill=Edit_dist_min)) +
boxplot_edit geom_boxplot(outlier.size = 0.8) +
scale_y_continuous("log2FC enhancer activity to wildtype", breaks = seq(-10,8,2), limits = c(-9.43,2.51)) +
scale_x_discrete("Edit distance to wildtype [hamming]") +
scale_fill_grey(start = 0.4) +
guides(fill=F) +
geom_hline(yintercept = 0, linetype="dashed", col="grey60") +
ggtitle(gsub("ced6", "ced-6", gsub("_", " ", lib))) +
theme_bw(base_size = 14) +
theme(axis.text = element_text(colour="black"),
axis.title = element_text(colour="black"),
axis.line = element_blank(),
panel.border = element_rect(colour="black"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour="grey95", size=0.4),
axis.ticks = element_line(colour="black"),
plot.title = element_text(hjust=0.5, colour=my_col[lib])
)
print(boxplot_edit)
print(lib)
}
## [1] "ced6_pos110"
## [1] "ced6_pos241"
## [1] "ZnT63C_pos142"
## [1] "ZnT63C_pos180"
## [1] "ZnT63C_pos210"
<- c(ttk="AGGATAA",
motif_interest_list ZEB1="CAGGTG",
lola="GGAGTT",
GGTAAG="GGTAAG"
)
for(lib in gsub("_sequence_fw", "", grep("sequence_fw$", names(Count_table_final), value = T))){
if(length(grep("ced6", lib))>0){
<- lib
lib2 <- "S21717_REEFmix2"
mix <- "S2171_wt_+"
wt <- "S21717_REEFmix2_log2FoldChange"
var else{
}<- lib
lib2 <- "chr3L_3310914_3311162_REEFmix3"
mix <- "chrX_9273894_9274142_+_dCP_-" # use reference enhancer (should have the sme activity as the wt) because I don't trust the wt levels
wt <- "chr3L_3310914_3311162_REEFmix3_log2FoldChange"
var
}
<- Count_table_final[Count_table_final$library %in% c(lib2, wt) | Count_table_final$Variant %in% wt,
tmp_main c(1:5,grep(lib, names(Count_table_final)), grep(mix, names(Count_table_final)))]
$log2FoldChange <- tmp_main[,var]
tmp_main<- tmp_main[complete.cases(tmp_main$log2FoldChange),]
tmp_main
<- tmp_main[order(tmp_main$log2FoldChange, decreasing = T),]
tmp <- data.frame(x=1:nrow(tmp)/1000, y=2**tmp$log2FoldChange, y_log=tmp$log2FoldChange,
tmp variant=as.character(tmp$Variant),
variant_seq=sapply(strsplit(as.character(tmp$Variant),"_"), `[`, length(strsplit(as.character(tmp$Variant)[1],"_")[[1]])-1),
stringsAsFactors = F)
<- ggplot(tmp, aes("1",y_log)) +
boxplot geom_violin(alpha=0.5, position = position_dodge(width = 0.9), size=0.7, color="black", fill="grey60") +
geom_boxplot(outlier.size = -1, color="black", position = position_dodge(width = 0.9), width=0.2, size=0.8) +
scale_y_continuous("Enhancer activity (log2)", breaks = seq(-2,10,1), labels = c("1/4", "1/2", 2**seq(0,10,1)), limits = c(min(tmp_main$log2FoldChange),max(tmp_main$log2FoldChange))) +
scale_x_discrete("", labels=paste0("All variants\n(n=", nrow(tmp), ")")) +
#geom_hline(yintercept = 0, linetype="dashed", col="grey60") + # the 0 is not relevant so don't print the line
theme_bw(base_size = 14) +
theme(axis.text = element_text(colour="black"),
axis.title = element_text(colour="black"),
axis.line = element_blank(),
panel.border = element_rect(colour="black"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour="grey95", size=0.4),
axis.ticks = element_line(colour="black"),
plot.title = element_text(hjust=0.5)
)
# select wt variants to plot
<- tmp[tmp$variant %in% c(wt,
tmp_sel $variant[1],
tmp$variant[grep("GATAA", tmp$variant_seq)][1],
tmp$variant[grep("TTATC", tmp$variant_seq)][1]),]
tmp$variant_seq[grep(wt, tmp_sel$variant)] <- "wt (CTTATCGC)"
tmp_sel
<- boxplot +
boxplot3 geom_point(data=tmp_sel[tmp_sel$variant %in% wt,], aes("1",y_log), color=c(my_motif_col["wt"]), size=2) +
annotate(geom="text", x="1", y=tmp_sel$y_log[tmp_sel$variant %in% wt],
label="wt",
color=my_motif_col["wt"],
hjust=-0.35, vjust=0.5, size = 4)
########
# make table with enrichments per motif
<- lapply(motif_interest_list, function(m){
motif_variant_enrichment_16bp =tmp_main$log2FoldChange[grep(m, tmp_main[,paste0(lib, "_sequence_fw_extended4bp")])]
fwif(!DNAString(m) == reverseComplement(DNAString(m))){rv=tmp_main$log2FoldChange[grep(m, tmp_main[,paste0(lib, "_sequence_rv_extended4bp")])]}else{rv <- c()}
data.frame(Motif=m,
Strand=c(rep("fw",length(fw)), rep("rv",length(rv))),
Enrichment_log2=c(fw, rv))
})
# plot for both tables
="16bp"
seq
# join all
<- do.call(rbind, motif_variant_enrichment_16bp)
motif_variant_enrichment
# set factor levels
<- motif_variant_enrichment %>%
motif_levels group_by(Motif) %>%
summarise(median=median(Enrichment_log2)) %>%
arrange(desc(median)) %>%
select(Motif)
$Motif <- factor(motif_variant_enrichment$Motif, levels =motif_interest_list)
motif_variant_enrichment
# x labels counts per boxplot
<- data.frame(table(motif_variant_enrichment$Motif))
t <- sapply(as.character(levels(motif_variant_enrichment$Motif)), function(x){
xlabels =names(motif_interest_list)[grep(x, motif_interest_list)]
TFpaste0(TF,"\n(",
",", t$Freq[t$Var1%in%x], ")")
x,
})
<- ggplot(motif_variant_enrichment, aes(Motif, Enrichment_log2)) +
boxplot_known_motifs geom_boxplot(outlier.size = 0.6, alpha=0.9, fill="dodgerblue") +
#geom_hline(yintercept = median(metadata_final$log2_ratio_UMI_corrected), linetype="dashed", col="grey40") +
geom_hline(yintercept = tmp_main$log2FoldChange[tmp_main$Variant %in% paste0(wt, "_+")], linetype="dashed", col="orangered") +
geom_hline(yintercept = median(tmp_main$log2FoldChange), linetype="dashed", col="grey60") +
scale_fill_manual(values=my_motif_col) +
scale_alpha_manual(values=c(0.3,0.8)) +
scale_y_continuous("Enhancer activity (log2)", breaks = seq(-2,10,1), labels = c("1/4", "1/2", 2**seq(0,10,1)), limits = c(min(tmp_main$log2FoldChange),max(tmp_main$log2FoldChange))) +
scale_x_discrete("", labels=xlabels) +
guides(fill=F) +
ggtitle(gsub("chr3L 3310914 3311162", "ZnT63C", gsub("S2171", "ced-6", gsub("_", " ", lib)))) +
theme_bw(base_size = 14) +
theme(axis.text = element_text(colour="black"),
axis.text.x = element_text(colour="black", size=9),
axis.title = element_text(colour="black"),
axis.title.y = element_blank(),
axis.line = element_blank(),
panel.border = element_rect(colour="black"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour="grey95", size=0.4),
axis.ticks = element_line(colour="black"),
plot.title = element_text(hjust=0.5)
)
print(boxplot3 + boxplot_known_motifs + plot_layout(widths = c(1,4.2)))
}
Didn’t include plots because they are too heavy
<- read.delim("Enrichments_and_sequence_information_all_REEF_libraries.txt")
Merged_enrichments
### z-score normalise to just compare differences within libraries (against mean)
<- apply(Merged_enrichments[,grep("_act", names(Merged_enrichments), value = T)], 2, scale)
df rownames(df) <- Merged_enrichments$variant_sequence_fw
### remove variants not measured in all screens
<- na.omit(df)
df
## active variants
<- df[rowSums(df>1)>0,]
df2 =1
s<- ceiling(max(abs(c(min(df2, na.rm=T), max(df2, na.rm=T)))))
extr = seq(-extr, extr, by = s)
myBreaks
<- c(colorRampPalette(
col c("#2166AC", "#4393C3", "#92C5DE",
"#D1E5F0"))((extr*(1/s))-1),
"white", "white",
colorRampPalette(
c("#FDDBC7", "#F4A582",
"#D6604D", "#B2182B"))((extr*(1/s))-1))
pheatmap(df2,
clustering_distance_cols = "correlation",
clustering_distance_rows = "correlation",
angle_col=45,
color=col,
breaks = myBreaks,
legend_breaks=seq(-10,10,2),
border_color = NA,
show_rownames = F,
labels_col = gsub("_", " ", gsub("ced6", "ced-6", gsub("_act", "", colnames(df2)))),
fontsize_col=11,
main=paste0("Scaled log2 activity of ", nrow(df2), " active variants (min 1)"),
width = 5.5, height = 8
)
<- read.delim("Enrichments_and_sequence_information_all_REEF_libraries.txt")
Merged_enrichments
library(ggseqlogo)
<- function(x, method='bits'){
plot.logo ggseqlogo(x, method=method, seq_type='dna', ncol=1) +
theme(panel.background = element_rect(fill="white",colour="white"),
panel.grid = element_blank(), axis.line=element_blank(),
axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank(),
axis.title = element_blank(),
legend.key=element_blank(),
legend.text = element_text(size=11, colour="black"),
legend.title = element_text(size=12, colour="black"),
plot.title = element_text(size=14, hjust = 0.5, colour="black"), plot.subtitle = element_text(size=14, hjust = 0.5),
legend.position = "right")
}
<- gsub("_act", "", grep("_act", names(Merged_enrichments), value = T))[c(1:4,6,7,5)]
variants names(variants) <- variants
<- list()
logo_plots_all_variants for(var in variants){
<- Merged_enrichments[,grep(var, names(Merged_enrichments))]
df <- df[complete.cases(df[,paste0(var, "_act")]),]
df
<- lapply(c(IC='bits', Prob='p'), function(m){
logo_plots_all <- lapply(c(Random="Random", Experiment="Experiment"), function(c){
logo_plots if(c=="Random") tmp <- df[sample(1:nrow(df)),]
if(c=="Experiment") tmp <- df[order(df[,paste0(var, "_act")], decreasing = T),]
<- c(1,2,5,10,50,100,1000, nrow(tmp))
seq_counts names(seq_counts) <- c(seq_counts[1:7], "last")
lapply(seq_counts, function(i){
<- tmp[1:i,]
tmp2
# get nucleotide at each position
<- do.call(rbind, lapply(1:16, function(x) data.frame(Pos=x,
tmp2 Base=substr(tmp2[,paste0(var, "_fw_extended4bp")], x, x))))
<- apply(table(tmp2$Base, tmp2$Pos), 2, function(x) x/sum(x))
prop_matrix
plot.logo(prop_matrix, method=m)
})
})return(logo_plots)
})
<- logo_plots_all
logo_plots_all_variants[[var]]
# print(plot_grid(plot_grid(plotlist = logo_plots_all$Prob$Experiment, ncol=1),
# plot_grid(plotlist = logo_plots_all$Prob$Random, ncol=1)))
}
### plot all together
plot_grid(plotlist = lapply(logo_plots_all_variants, function(x) plot_grid(plotlist = x$Prob$Experiment, ncol=1)), nrow = 1)
### statistics with information content
# The information content (IC) of a PWM says how different a given PWM is from a uniform distribution.
<- function(C) log2(length(C))
tIC <- function(C) C / sum(C)
PPM <- function(C) -sum(PPM(C) * log2(PPM(C)))
U <- function(C) tIC(C) - U(C)
fIC <- function(C){
IC # add pseudocounts
==0] <- 0.001
C[CPPM(C) * fIC(C)
}
set.seed(1234)
<- lapply(variants, function(var){
statistics_logo_plots_variants <- Merged_enrichments[,c(1,grep(var, names(Merged_enrichments)))]
df <- df[complete.cases(df[,paste0(var, "_act")]),]
df <- lapply(c(Random="Random", Experiment="Experiment"), function(c){
statistics_logo_plots if(c=="Random") tmp <- df[sample(1:nrow(df)),]
if(c=="Experiment") tmp <- df[order(df[,paste0(var, "_act")], decreasing = T),]
<- sapply(c(1:1000, 10000, 50000, nrow(tmp)), function(i){
v_all <- tmp[1:i,]
tmp2
# get nucleotide at each position
<- do.call(rbind, lapply(1:8, function(x) data.frame(Pos=x,
tmp2 Base=substr(tmp2$variant_sequence_fw, x, x))))
$Base <- factor(tmp2$Base, levels = c("A", "C", "G", "T"))
tmp2
<- apply(table(tmp2$Base, tmp2$Pos), 2, function(x) x)
df
<- sum(colSums(sapply(1:ncol(df), function(x){
v IC(df[,x])
})))
return(v)
})return(v_all)
})=300
nreturn(data.frame(Var=var,
Group=rep(c("Random", "Experiment"), each=n),
Top_sequences=rep(1:n, 2),
IC=c(statistics_logo_plots$Random[1:n], statistics_logo_plots$Experiment[1:n])))
})
<- do.call(rbind, statistics_logo_plots_variants)
plot_df
<- ggplot(plot_df, aes(Top_sequences, IC, group=paste0(Var, "_", Group), colour=Var, alpha=Group, shape=Group)) +
IC_plot geom_point() +
geom_line() +
scale_y_continuous("8-mer sum Information Content") +
scale_x_continuous("Number of top sequences considered", breaks = seq(0,1000,50)) +
scale_color_manual("Enh position", values=my_col[as.character(unique(plot_df$Var))], drop=T) +
scale_alpha_manual(values = c(1,0.2)) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(hjust = 0.5),
axis.text = element_text(size = 13, colour="black"),
axis.title = element_text(size = 15),
legend.text = element_text(size = 14),
legend.title = element_text(size = 15)
)
print(IC_plot)
<- read.delim("Enrichments_and_sequence_information_all_REEF_libraries.txt")
Merged_enrichments
### I could maybe z-score normalise to just compare differences within libraries (against mean)
<- Merged_enrichments[,grep("_act|_fw_extended4bp", names(Merged_enrichments), value = T)]
df rownames(df) <- Merged_enrichments$variant_sequence_fw
grep("_act", names(df), value = T)] <- apply(df[,grep("_act", names(df), value = T)], 2, scale)
df[,
### Motifs of interest
<- readRDS("data/PWM_candidates.rds")
PWM_candidates <- PWM_candidates[complete.cases(PWM_candidates$Motif_strings),]
PWM_candidates <- PWM_candidates[!PWM_candidates$X..motif_collection_name %in% "modisco",]
PWM_candidates
<- PWM_candidates$Motif_strings
motif_interest_list names(motif_interest_list) <- PWM_candidates$motif_name
### summary (mean z-score) per motif
<- t(sapply(motif_interest_list, function(m){
Motif_average sapply(gsub("_act", "", grep("_act", names(df), value = T)), function(c){
mean(df[unique(grep(paste0(m, "|", as.character(reverseComplement(DNAString(m)))),df[,paste0(c, "_fw_extended4bp")])),paste0(c, "_act")], na.rm=T) # added unique to don't count twice
})
}))is.na(Motif_average) | Motif_average=="NaN"] <- NA
Motif_average[
<- merge(as.data.frame(Motif_average), PWM_candidates[,c(1,5:9,2,14,16:20)], by.x=0, by.y=1)
Motif_average names(Motif_average)[1] <- "motif_name"
write.table(Motif_average, "Candidate_TF_activators_screens_16bp_variants_average_results.txt", sep="\t", quote=F)
require(pheatmap)
<- read.delim("Candidate_TF_activators_screens_16bp_variants_average_results.txt")
plot_df
# summarise per cluster (ordered by maximum activity in any screen)
$Max <- apply(plot_df[,2:8], 1, max, na.rm=T)
plot_df<- plot_df[order(plot_df$Max, decreasing = T),]
plot_df # add name of dmel factor per cluster
<- sapply(unique(plot_df$Motif_cluster), function(c){
cluster_dmel <- unique(plot_df$Dmel[plot_df$Motif_cluster == c & complete.cases(plot_df$Dmel) & complete.cases(plot_df$S2_exp) & plot_df$S2_exp>60])[1] # first only expressed TFs
out if(length(out)==0) out <- unique(plot_df$Dmel[plot_df$Motif_cluster == c & complete.cases(plot_df$Dmel) & complete.cases(plot_df$S2_exp) & plot_df$S2_exp>10])[1] # first only expressed TFs
if(length(out)==0) out <- unique(plot_df$Dmel[plot_df$Motif_cluster == c & complete.cases(plot_df$Dmel) & complete.cases(plot_df$S2_exp) & plot_df$S2_exp>1])[1] # first only expressed TFs
if(length(out)==0) out <- unique(plot_df$Dmel[plot_df$Motif_cluster == c & complete.cases(plot_df$Dmel)])[1] # because it is ordered
if(length(out)==0){return(NA)}else{return(out)}
})names(cluster_dmel) <- unique(plot_df$Motif_cluster)
$Motif_cluster_Dmel <- cluster_dmel[match(plot_df$Motif_cluster, names(cluster_dmel))]
plot_df$Motif_cluster_Dmel[!complete.cases(plot_df$Motif_cluster_Dmel)] <- plot_df$motif_description2[!complete.cases(plot_df$Motif_cluster_Dmel)]
plot_df
# remove duplicated clusters
<- plot_df[!duplicated(plot_df$Motif_cluster),]
plot_df
# remove duplicated strings
<- plot_df[!duplicated(plot_df$Motif_strings),]
plot_df
# correct some names
$Motif_cluster_Dmel[grep("HLH106", plot_df$Motif_cluster_Dmel)] <- "SREBP"
plot_df
# plot TFs with at least 1 z-score in one screen
<- plot_df[rowSums(plot_df[,2:8]>1, na.rm=T)>0,]
plot_df rownames(plot_df) <- paste0(plot_df$Motif_cluster_name, " (", plot_df$Motif_cluster_Dmel, "/", plot_df$Motif_strings, ")")
<- data.frame(row.names = rownames(plot_df),
annotation_row Motif.enrch=as.character(plot_df$log2_motif_enr > 0 & plot_df$FDR_motif_enr < 0.05))
require(RColorBrewer)
<- list(Motif.enrch=c("FALSE"="white", "TRUE"="#336600"))
annotation_colours
### do clustering without NAs and using all motifs - Pearson
<- read.delim("Candidate_TF_activators_screens_16bp_variants_average_results.txt")
Motif_average <- Motif_average[,2:8]
Motif_average_no_na is.na(Motif_average_no_na) | Motif_average_no_na=="NaN"] <- 0
Motif_average_no_na[<- cor(Motif_average_no_na, use = "pairwise.complete.obs", method = "pearson")
cols.cor
<- plot_df[,2:8]
Motif_average_no_na is.na(Motif_average_no_na) | Motif_average_no_na=="NaN"] <- 0
Motif_average_no_na[<- dist(Motif_average_no_na)
row_dist
=0.5
s<- ceiling(max(abs(c(min(plot_df[,2:8], na.rm=T), max(plot_df[,2:8], na.rm=T)))))
extr = seq(-extr, extr, by = s)
myBreaks
<- c(colorRampPalette(
col c("#2166AC", "#4393C3", "#92C5DE",
"#D1E5F0"))((extr*(1/s))-1),
"white", "white",
colorRampPalette(
c("#FDDBC7", "#F4A582",
"#D6604D", "#B2182B"))((extr*(1/s))-1))
# Pairwise correlation between samples (columns)
<- cor(plot_df[,2:8], use = "pairwise.complete.obs", method = "pearson")
cols.cor # Pairwise correlation between rows (genes)
<- cor(t(plot_df[,2:8]), use = "pairwise.complete.obs", method = "pearson")
rows.cor
<- pheatmap(plot_df[,2:8],
h2 clustering_distance_cols = as.dist(1 - cols.cor),
clustering_distance_rows = as.dist(1 - rows.cor),
angle_col=45,
color=col,
breaks = myBreaks,
legend_breaks=seq(-8,8,2),
border_color = NA,
show_rownames = T,
labels_col = gsub("_", " ", gsub("ced6", "ced-6", gsub("_act", "", colnames(plot_df[,2:8])))),
annotation_row = annotation_row,
annotation_legend = F,
annotation_colors = annotation_colours,
fontsize_row=1.5,
fontsize_col=9,
fontsize = 8,
main=paste0("Scaled log2 activity of motif variants"),
width = 8, height = 8
)
<- read.fasta("data/S2_171_REEF_STARRseq_LibFR002_LibFR003_LibFR004_LibFR005_Twist6k_wt.fa", as.string = T, forceDNAtolower = F)
fa1 <- reverseComplement(DNAStringSet(unlist(fa1)))
fa1_rv names(fa1) <- paste0(names(fa1), "_+")
names(fa1_rv) <- paste0(names(fa1_rv), "_-")
<- read.fasta("data/REEF_STARRseq_chr3L_3310914_3311162_mix_S2171_twist_Twist6k_wt.fa", as.string = T, forceDNAtolower = F)
fa2 <- reverseComplement(DNAStringSet(unlist(fa2)))
fa2_rv names(fa2) <- paste0(names(fa2), "_+")
names(fa2_rv) <- paste0(names(fa2_rv), "_-")
<- c(DNAStringSet(unlist(fa1)),
fa_all
fa1_rv,DNAStringSet(unlist(fa2)),
fa2_rv)
# AP-1 library has 250bp sequences - I removed the last nucleotide
write.fasta(as.list(substr(as.character(fa_all),1,249)), names(fa_all), "All_enhancer_variants.fa")
fasta=All_enhancer_variants.fa
pred_script=./Predict_dev_and_hk_enh_activity_CNN_model_from_fasta.py
$pred_script -s $fasta -m DeepSTARR
# output All_enhancer_variants.fa_predictions_DeepSTARR.txt
<- read.delim("Table_enhancer_variant_counts_and_enrichment.txt", stringsAsFactors = F)
metadata_final
<- read.delim(paste0("All_enhancer_variants.fa_predictions_DeepSTARR.txt"),
predictions stringsAsFactors = F)
$Predictions_dev <- predictions$Predictions_dev[match(metadata_final$Variant, predictions$location)]
metadata_finaltable(complete.cases(metadata_final$Predictions_dev))
##
## TRUE
## 470744
<- metadata_final[metadata_final$library %in% c("ced6_pos110", "ced6_pos182", "ced6_pos230", "ced6_pos241", "ZnT63C_pos210", "ZnT63C_pos180", "ZnT63C_pos142"),]
metadata_final $library <- factor(metadata_final$library, levels = c("ced6_pos110", "ced6_pos182", "ced6_pos230", "ced6_pos241", "ZnT63C_pos210", "ZnT63C_pos180", "ZnT63C_pos142"))
metadata_final
### scatter plot per library
<- lapply(as.character(levels(metadata_final$library)), function(l){
plot_list if(length(grep("ced6", l))>0){
<- data.frame(obs=metadata_final$S21717_REEFmix2_log2FoldChange[metadata_final$library %in% l],
tmp pred=metadata_final$Predictions_dev[metadata_final$library %in% l])
}if(length(grep("ZnT63C", l))>0){
<- data.frame(obs=metadata_final$chr3L_3310914_3311162_REEFmix3_log2FoldChange[metadata_final$library %in% l],
tmp pred=metadata_final$Predictions_dev[metadata_final$library %in% l])
}
# PCC - not using zeros
<- cor.test(tmp$obs,
pc $pred,
tmpmethod = "pearson")
=c("#fdd49e","#d7301f")
sc_col
# plot
<- ggplot(tmp, aes(obs, pred)) +
scater geom_pointdensity(size=0.5, alpha=0.7) +
scale_color_gradient(high = "grey10", low = my_col[l]) +
scale_x_continuous("STARR-seq [log2 enh activity]") +
scale_y_continuous("DeepSTARR [log2 enh activity]") +
# geom_abline(linetype="dashed", col="grey50") +
guides(color="none") +
theme_bw(base_size = 15) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5)) +
ggtitle(gsub("_", " ", gsub("_S2171", "_ced-6", gsub("_chr3L_3310914_3311162", "_ZnT63C", l)))) +
annotate("text", x=min(tmp$obs, na.rm=T), y = max(tmp$pred, na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
return(scater)
})
print(plot_grid(plotlist = plot_list, nrow = 2))