Required packages
require(tidyverse)
require(data.table)
require(dplyr)
require(BSgenome.Dmelanogaster.UCSC.dm3)
require(seqinr)
require(stringr)
require(GenomicRanges)
require(patchwork)
require(ggpointdensity)
require(gridExtra)
require(cowplot)
require(ggrepel)
require(Biostrings)
require(ggplot2)
theme_set(theme_classic() + theme(axis.text = element_text(colour = "black")))
<- c("wt"="orangered",
motif_colours "CACACA"="#CCCC00",
"Trl"="#e78ac3",
"GAGA"="#e78ac3",
"GAGAG"="#e78ac3",
"GAGAGA"="#e78ac3",
"GATA"="#78A8F3",
"GATAA"="#78A8F3",
"GATAAG"="#78A8F3",
"twist"="#7AB06F",
"CATCTG"="#7AB06F",
"AP1"="#BA3979",
"AP-1"="#BA3979",
"TGA.TCA"="#BA3979",
"ttk"="#DCDF80",
"AGGAT"="#DCDF80",
"AGGATAA"="#DCDF80",
"CCGGAA"="#a6761d",
"ETS"="#a6761d",
"TCACGCG"="#7570b3",
"TCACGCGA"="#7570b3",
"SREBP"="#7570b3",
"CREB/ATF"="#9B9666",
"CREB"="#9B9666",
"TGATGA"="#9B9666",
"TGATGT"="#9B9666",
"TGTGAAA"="#9B9666",
"STAT"="#FF9933",
"Stat92E"="#FF9933",
"TCC.GGA"="#FF9933",
"ATCGAT"="dodgerblue",
"Dref"="dodgerblue",
Atf2=rgb(0,153,153, maxColorValue = 255),
shuffle="grey60"
)
require(plyranges)
# reduce motifs but keep scores
<- function(gr0, min.frac.ov=0.7){
my_reduce_with_score # Overlaps that achieve required overlap
<- findOverlaps(gr0, ignore.strand=T, minoverlap = ceiling(unique(width(gr0))*min.frac.ov))
hits # remove self hits
<- hits[queryHits(hits)!=subjectHits(hits)]
hits
### Do reduce just between the respective overlaps
if(length(c(queryHits(hits), subjectHits(hits)))>0){
<- reduce_ranges(gr0[unique(c(queryHits(hits), subjectHits(hits)))], max_score = max(score), sum_score = sum(score), Number_overlapping=n())
gr_red strand(gr_red) <- "+"
<- gr0[-unique(c(queryHits(hits), subjectHits(hits)))]
gr_no_ov mcols(gr_no_ov) <- data.frame(max_score=gr_no_ov$score,
sum_score=gr_no_ov$score,
Number_overlapping=1)
return(c(gr_no_ov, gr_red))
else{
}<- gr0
gr_no_ov mcols(gr_no_ov) <- data.frame(max_score=gr_no_ov$score,
sum_score=gr_no_ov$score,
Number_overlapping=1)
return(gr_no_ov)
}
}
### function to merge/reduce motif instances based on minimum motif overlap - but here I don't keep the score
<- function(gr0, min.frac.ov=0.7){
my_reduce # Overlaps that achieve required overlap
<- findOverlaps(gr0, ignore.strand=T, minoverlap = ceiling(unique(width(gr0))*min.frac.ov))
hits # remove self hits
<- hits[queryHits(hits)!=subjectHits(hits)]
hits
### Do reduce just between the respective overlaps
<- hits[queryHits(hits)<subjectHits(hits)]
hits
# remove queryhits (q) that are the subjecthits of other (inst1), and for whose its subjecthit is also a subjecthit of inst1
# this is important for MAFK instances in sequence chr15_60699706_60699954 - where two instances in opposite strands (palindrome) overlap another instance
<- queryHits(hits)[queryHits(hits) %in% subjectHits(hits)]
q <- q[subjectHits(hits)[queryHits(hits) %in% q] %in% subjectHits(hits)[!queryHits(hits) %in% q]]
q <- hits[!queryHits(hits) %in% q]
hits
# Reduce the ranges in 'gr0' that are connected via one or more hits in 'hits'. - and merge with the other ranges
# reduce per overlapping ranges
# is a bit slow because of the lapply per overlap
if(length(c(queryHits(hits), subjectHits(hits)))>0){
<- do.call("c", lapply(unique(queryHits(hits)), function(i){
gr1 <- hits[queryHits(hits)==i]
hits_tmp return(reduce(gr0[unique(c(queryHits(hits_tmp), subjectHits(hits_tmp)))], ignore.strand=T, min.gapwidth=0))
}))<- c(gr0[-c(queryHits(hits), subjectHits(hits))], gr1)
out <- unique(out) # if there is overlap between 3 ranges, it will do it twice and I need to unique it
out else{out <- gr0}
}return(out)
}
<- read.delim("data/Twistmix_20210819_oligo_info_REEF_paper.txt", stringsAsFactors = F)
Twistmix_20210819_oligo_info
# experiment tables
<- read.delim("data/experiment.txt")
experiment_table $simple_name <- gsub("LibFR013_Twistmix2_", "", experiment_table$Outfile)
experiment_table
="UMI"
typefor(i in experiment_table$simple_name){
<- import.bed(paste0("data/", experiment_table$Outfile[experiment_table$simple_name %in% i], ".", type, ".bed"))
mapped
# choose sequences with correct length
<- mapped[width(mapped)==249 & mapped@strand %in% "+"]
mapped_correct_length
# add counts info to Twistmix_20210819_oligo_info table
<- merge(Twistmix_20210819_oligo_info, as.data.frame(table(mapped_correct_length@seqnames)), by=1, all.x=T)
Twistmix_20210819_oligo_info names(Twistmix_20210819_oligo_info)[ncol(Twistmix_20210819_oligo_info)] <- paste0(i, "_", type)
print(paste0(i, "_", type))
}
# correct NAs to 0 counts
14:ncol(Twistmix_20210819_oligo_info)][is.na(Twistmix_20210819_oligo_info[14:ncol(Twistmix_20210819_oligo_info)])] <- 0
Twistmix_20210819_oligo_info[
write.table(Twistmix_20210819_oligo_info, paste0("Oligo_counts.txt"), sep="\t", quote=F, row.names = F)
<- read.delim("Oligo_counts.txt")
Twistmix_20210819_oligo_info
= list()
plot_list_tmp
<- Twistmix_20210819_oligo_info[,grep("rep._UMI", names(Twistmix_20210819_oligo_info))]
Oligo_counts
# normalise to 1 million mapped fragments
<- as.data.frame(apply(Oligo_counts, 2, function(x) x/sum(x)*1e6))
Counts_per_million_cpm
for(id in c("input_dev_rep", "STARRseq_dev_rep")){
if(id=="STARRseq_dev_rep") t="Dev STARR-seq"
if(id=="input_dev_rep") t="Dev input"
<- Counts_per_million_cpm[,grep(id, names(Counts_per_million_cpm))]
df_tmp
<- list(a_b=c(names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[1]],
comparison_list names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[2]]),
a_c=c(names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[1]],
names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[3]]),
b_c=c(names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[2]],
names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[3]]))
if(length(grep(id, names(Counts_per_million_cpm)))==4) comparison_list[["a_d"]] <- c(names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[1]],
names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[4]])
<- lapply(comparison_list, function(x){
plot_list_tmp
<- x[1]
a <- x[2]
b
# PCC
<- cor.test(log10(df_tmp[apply(df_tmp,1,min)>0,a]),
pc log10(df_tmp[apply(df_tmp,1,min)>0,b]),
method = "pearson")
if(id=="input_dev_rep") my_col=c("#d9d9d9","#525252")
if(id=="STARRseq_dev_rep") my_col=c("orangered","orangered4")
# plot
<- ggplot(df_tmp, aes(df_tmp[,a], df_tmp[,b])) +
scater geom_pointdensity(adjust = 0.4, size=0.4) +
scale_color_gradient(low = my_col[1], high = my_col[2]) +
scale_x_log10(gsub("_", " ", a),
limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)),
breaks=c(0,1,10,100,1000)) +
scale_y_log10(gsub("_", " ", b),
limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)),
breaks=c(0,1,10,100,1000)) +
guides(color=F) +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5)) +
annotate("text", x=min(df_tmp[df_tmp!=0], na.rm=T), y = max(df_tmp, na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
return(ggplotGrob(scater))
})
# multiplot
print(gridExtra::grid.arrange(grobs = plot_list_tmp, nrow = 1))
}
## TableGrob (1 x 3) "arrange": 3 grobs
## z cells name grob
## a_b 1 (1-1,1-1) arrange gtable[layout]
## a_c 2 (1-1,2-2) arrange gtable[layout]
## b_c 3 (1-1,3-3) arrange gtable[layout]
## TableGrob (1 x 3) "arrange": 3 grobs
## z cells name grob
## a_b 1 (1-1,1-1) arrange gtable[layout]
## a_c 2 (1-1,2-2) arrange gtable[layout]
## b_c 3 (1-1,3-3) arrange gtable[layout]
<- read.delim("Oligo_counts.txt")
Count_table <- Count_table[,c(1:13,grep("UMI", names(Count_table)))]
Count_table rownames(Count_table) <- Count_table$Oligo_ID
require(DESeq2)
<- Count_table
Count_table_final
<- "dev"
e
# only sequences with at least 10 reads in all inputs
<- Count_table[rowSums(Count_table[,grep(paste0("input_", e), names(Count_table))]<10)==0,]
Count_table_2
<- Count_table_2[,grep(e, names(Count_table_2), ignore.case = T)]
cts rownames(cts) <- Count_table_2$Oligo_ID
# add one count to sequences not found in RNA
==0] <- 1
cts[cts
# design
<- data.frame(type=factor(rep(c("Input", "Experiment"),each=3),levels=c("Input", "Experiment")),
coldata row.names = names(cts))
if (!identical(which(coldata$type=="Input"), grep("input", rownames(coldata)))){
print("Input in design matrix does not match input samples")
break
}if (!all(rownames(coldata) %in% colnames(cts))){
print("Rownames do not match colnames")
break
}if (!all(rownames(coldata) == colnames(cts))){
print("Rownames do not match colnames")
break
}
<- DESeqDataSetFromMatrix(countData = as.matrix(cts),
dds colData = coldata,
design= ~ type)
# use controls as sizeFactors
sizeFactors(dds)=estimateSizeFactorsForMatrix(as.matrix(cts[grep("_wt_NegativeRegions", rownames(cts)),]))
<- DESeq(dds)
dds
# results
<- results(dds, alpha=0.05)
res
# merge with main table
<- as.data.frame(res)[,c(1,2,5,6)]
tmp names(tmp) <- paste0(e,"_",names(tmp))
<- merge(Count_table_final, tmp, by.x=1, by.y=0, all.x=T)
Count_table_final
write.table(Count_table_final, "Final_table_all_oligos.txt", sep="\t", quote=F, row.names = F)
# remove oligos with no activity values
<- Count_table_final[complete.cases(Count_table_final$dev_log2FoldChange),]
Count_table_final
write.table(Count_table_final, "Final_table.txt", sep="\t", quote=F, row.names = F)
<- read.delim("Oligo_counts.txt")
Count_table <- Count_table[,c(1:13,grep("UMI", names(Count_table)))]
Count_table rownames(Count_table) <- Count_table$Oligo_ID
<- "dev"
e
<- lapply(c(rep1="rep1", rep2="rep2", rep3="rep3"), function(r){
DESeq_results
# save main table to add columns in the end
<- Count_table
Count_table_final
# only sequences with at least 10 reads in all inputs
<- Count_table[rowSums(Count_table[,grep(paste0("input_", e), names(Count_table))]<10)==0,]
Count_table_2
<- Count_table_2[,grep(e, names(Count_table_2), ignore.case = T)]
cts rownames(cts) <- Count_table_2$Oligo_ID
# add one count to sequences not found in RNA
==0] <- 1
cts[cts
# count table replicate-specific
<- cts[,c(1:3,grep(paste0("STARRseq_dev_",r), names(cts), ignore.case = T))]
cts
# design
<- data.frame(type=factor(c(rep("Input",3), rep("Experiment",1)),levels=c("Input", "Experiment")),
coldata row.names = names(cts))
if (!identical(which(coldata$type=="Input"), grep("input", rownames(coldata)))){
print("Input in design matrix does not match input samples")
break
}if (!all(rownames(coldata) %in% colnames(cts))){
print("Rownames do not match colnames")
break
}if (!all(rownames(coldata) == colnames(cts))){
print("Rownames do not match colnames")
break
}
<- DESeqDataSetFromMatrix(countData = as.matrix(cts),
dds colData = coldata,
design= ~ type)
# use controls as sizeFactors - assumption that controls don't work
sizeFactors(dds)=estimateSizeFactorsForMatrix(as.matrix(cts[grep("_wt_NegativeRegions", rownames(cts)),]))
<- DESeq(dds)
dds
# results
<- results(dds, alpha=0.05)
res
# merge with main table
<- as.data.frame(res)[,c(1,2,5,6)]
tmp names(tmp) <- paste0(r,"_",names(tmp))
return(tmp)
})
<- merge(Count_table_final, DESeq_results$rep1[,c(2,4)], by.x=1, by.y=0, all.x=T)
Count_table_final <- merge(Count_table_final, DESeq_results$rep2[,c(2,4)], by.x=1, by.y=0, all.x=T)
Count_table_final <- merge(Count_table_final, DESeq_results$rep3[,c(2,4)], by.x=1, by.y=0, all.x=T)
Count_table_final
# save final table
write.table(Count_table_final, "Final_table_all_oligos_and_enrichment_per_replicate.txt", sep="\t", quote = F, row.names = F)
<- read.delim("Final_table_all_oligos_and_enrichment_per_replicate.txt")
Count_table_final
= list()
plot_list_tmp for(e in 1:3){
if(e==1) v=c(1,2)
if(e==2) v=c(1,3)
if(e==3) v=c(2,3)
<- Count_table_final[,grep("rep._log2FoldChange", names(Count_table_final))[v]]
df_tmp
# PCC - not using zeros
<- cor.test(df_tmp[,1],
pc 2],
df_tmp[,method = "pearson", use="pairwise.complete.obs")
=c("orangered","orangered4")
my_col
# plot
<- ggplot(df_tmp, aes(df_tmp[,1], df_tmp[,2])) +
scater geom_pointdensity(adjust = 0.7, size=0.6) +
scale_color_gradient(low = my_col[1], high = my_col[2]) +
scale_x_continuous(paste0("Enhancer activity - rep",v[1]), breaks = seq(-10,10,2)) +
scale_y_continuous(paste0("Enhancer activity - rep",v[2]), breaks = seq(-10,10,2)) +
guides(color=F) +
geom_abline(linetype="dashed", col="grey30") +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5)) +
annotate("text", x=min(df_tmp[,1], na.rm=T), y = max(df_tmp[,2], na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
= ggplotGrob(scater)
plot_list_tmp[[e]]
}
print(gridExtra::grid.arrange(grobs = plot_list_tmp, nrow = 1))
## TableGrob (1 x 3) "arrange": 3 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (1-1,3-3) arrange gtable[layout]
# all oligo library
### activity of oligos
<- read.delim("Final_table.txt", stringsAsFactors = F)[,c(1:5,8,9,14:19,21)]
df_twist table(df_twist$Experiment)
### REEF-STARR-seq validation motif swapping - 8,251
<- readRDS("data/Swap_motif_instances_sequences.rds")
Swap_df names(Swap_df)[9] <- "Mutation_log2FC_wt_mut_old_library"
# add UMI counts
<- cbind(Swap_df, df_twist[match(Swap_df$Oligo_ID, df_twist$Oligo_ID),8:13])
Swap_df
# remove non-changed sequences here, they are similar to wt
table(as.character(Swap_df$Wt_motif_sequence)==as.character(Swap_df$New_motif_sequence))
<- Swap_df[!as.character(Swap_df$Wt_motif_sequence)==as.character(Swap_df$New_motif_sequence),]
Swap_df
### add activity information
$dev_activity <- df_twist$dev_log2FoldChange[match(Swap_df$Oligo_ID, df_twist$Oligo_ID)]
Swap_dftable(complete.cases(Swap_df$dev_activity))
# add wt activity (match wt by strand) and calculate FC
="dev"
CPpaste0(CP, "_wt_activity")] <- sapply(Swap_df$Sequence_ID, function(e){
Swap_df[,<- unique(Swap_df$Strand[Swap_df$Sequence_ID %in% e])
str <- df_twist[,paste0(CP, "_log2FoldChange")][df_twist$Sequence_ID %in% e & df_twist$Strand %in% str & df_twist$Experiment %in% "wt"]
activity if(length(activity)==0) return(NA) else{return(activity)}
})# calculate FC
paste0(CP, "_log2FC_wt_swapped")] <- Swap_df[,paste0(CP, "_activity")]-Swap_df[,paste0(CP, "_wt_activity")]
Swap_df[,
### keep enhancers active and from dev program
<- read.delim("data/Dev_enhancers.txt")
Dev_enh <- Swap_df[complete.cases(Swap_df$dev_wt_activity) & Swap_df$Oligo_ID %in% Dev_enh$Oligo_ID,]
Swap_df
### add original motif mutant information
<- readRDS("data/mutation_data.rds")
mutation_data <- mutation_data[,c(1:5,7:9,11,12,16:20,28:29,32,34:57,60:ncol(mutation_data))]
mutation_data
<- do.call(rbind, lapply(1:nrow(Swap_df), function(i){
mutated_instances <- Swap_df$Oligo_ID[i]
REEF_ID <- Swap_df$Sequence_ID[i]
enh <- Swap_df$Wt_motif[i]
m <- Swap_df$Motif_center[i]
ctr
<- m
m2 if(m=="AP1") m2 <- c("TGA.TCA")
if(m=="twist") m2 <- c("CA..TG", "twist")
if(m=="Trl") m2 <- c("GAGA", "Trl")
if(m=="ETS") m2 <- c("ETS", "ETS_string")
<- mutation_data[mutation_data$Sequence_ID==enh & mutation_data$Motif_mutated %in% m2 & mutation_data$instance_start < ctr & mutation_data$instance_end > ctr,]
out
if(nrow(out)>1 & length(which(complete.cases(out$dev_mut)))>0) out <- out[complete.cases(out$dev_mut),]
# if I mutated more than one overlapping instance
if(nrow(out)>1 & m=="Trl" & length(grep("Trl", out$Motif_mutated))>0) out <- out[out$Motif_mutated == "Trl",]
if(nrow(out)>1 & m=="ETS" & length(grep("ETS_string", out$Motif_mutated))>0) out <- out[out$Motif_mutated == "ETS_string",]
# if mutated overlapping instance choose the closest one
if(nrow(out)>1) out <- out[order(abs((out$instance_start + (out$instance_end-out$instance_start)/2) -ctr)),][1,]
# output
if(nrow(out)==0) return(data.frame(REEF_ID=REEF_ID,
Oligo_ID=NA,
instance_start=NA,
instance_end=NA,
instance_strand=NA,
wt_instance=NA,
mutant_variant=NA,
dev_mut=NA
else{ return(cbind(REEF_ID, out[,c(1,44:46,49,50,8)])) }
))
}))
which(duplicated(mutated_instances$REEF_ID))
which(!complete.cases(mutated_instances$Oligo_ID))
names(mutated_instances) <- paste0("mut_inst_", names(mutated_instances))
names(mutated_instances)[8] <- "motif_mut_dev_activity"
<- cbind(Swap_df, mutated_instances[,-1])
REEFSTARRseq_Swap_motif_instances_sequences_final
# calculate FC
$dev_log2FC_wt_mut <- REEFSTARRseq_Swap_motif_instances_sequences_final$motif_mut_dev_activity - REEFSTARRseq_Swap_motif_instances_sequences_final$dev_wt_activity
REEFSTARRseq_Swap_motif_instances_sequences_finalhead(REEFSTARRseq_Swap_motif_instances_sequences_final)
### For each enhancer, add row with activity and log2FC for motif mutant
$Instance_ID <- sapply(strsplit(as.character(REEFSTARRseq_Swap_motif_instances_sequences_final$Oligo_ID),"_to_"), `[`, 1)
REEFSTARRseq_Swap_motif_instances_sequences_final<- do.call(rbind, lapply(unique(REEFSTARRseq_Swap_motif_instances_sequences_final$Instance_ID), function(i){
REEF_motif_mutants <- REEFSTARRseq_Swap_motif_instances_sequences_final[grep(i, REEFSTARRseq_Swap_motif_instances_sequences_final$Oligo_ID, fixed = T),][1,]
out $Oligo_ID <- paste0(i, "_to_mutant")
out$Sequence <- NA
out$New_motif <- "shuffle"
out$New_motif_sequence <- out$mut_inst_mutant_variant
out$dev_activity <- out$motif_mut_dev_activity
out$dev_log2FC_wt_swapped <- out$dev_log2FC_wt_mut
out
return(out)
}))
<- rbind(REEFSTARRseq_Swap_motif_instances_sequences_final, REEF_motif_mutants)
REEFSTARRseq_Swap_motif_instances_sequences_final
length(unique(REEFSTARRseq_Swap_motif_instances_sequences_final$Sequence_ID)) # 593 different enhancers
length(unique(REEFSTARRseq_Swap_motif_instances_sequences_final$Instance_ID)) # 990 different instances
table(REEFSTARRseq_Swap_motif_instances_sequences_final$Wt_motif, REEFSTARRseq_Swap_motif_instances_sequences_final$New_motif)
### correct motif factors
$Wt_motif <- factor(REEFSTARRseq_Swap_motif_instances_sequences_final$Wt_motif,
REEFSTARRseq_Swap_motif_instances_sequences_finallevels=c("AP1", "GATAA", "twist", "Trl", "ETS", "SREBP"))
$New_motif <- factor(REEFSTARRseq_Swap_motif_instances_sequences_final$New_motif,
REEFSTARRseq_Swap_motif_instances_sequences_finallevels=c("shuffle", "AP1", "GATAA", "twist", "Trl", "Stat92E", "ETS", "SREBP", "Atf2"))
saveRDS(REEFSTARRseq_Swap_motif_instances_sequences_final, "REEFSTARRseq_Swap_motif_instances_sequences_final.rds")
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final.rds")
df_main
# filter for important instances
<- -1
log2FC_cutoff <- df_main[complete.cases(df_main$dev_log2FC_wt_mut) & df_main$dev_log2FC_wt_mut <= log2FC_cutoff,]
df $log2FC_mut_pasted <- df$dev_activity-df$motif_mut_dev_activity
dftable(df$Wt_motif[!duplicated(df$Instance_ID)])
length(unique(df$Instance_ID)) # 763 instances tested
length(unique(df$Sequence_ID)) # in 496 enhancers tested
<- df[,c(1:4,29,6:8,10:20,27,28,30)]
out names(out)[names(out)=="dev_log2FC_wt_swapped"] <- "dev_log2FC_wt_pasted"
write.table(out, "Supplementary_Table_5.txt", sep="\t", quote=F, row.names=F)
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final.rds")
df_main table(df_main$Wt_motif, df_main$New_motif)
##
## shuffle AP1 GATAA twist Trl Stat92E ETS SREBP Atf2
## AP1 176 0 176 176 176 176 176 176 176
## GATAA 190 190 0 190 190 190 190 190 190
## twist 187 187 187 0 187 187 187 187 187
## Trl 129 129 129 129 0 129 129 129 129
## ETS 156 156 156 154 156 156 0 156 156
## SREBP 152 152 152 152 152 152 152 0 152
# filter for important instances
table(df_main$Wt_motif, df_main$dev_log2FC_wt_mut <= -1)
##
## FALSE TRUE
## AP1 40 1360
## GATAA 0 1520
## twist 135 1353
## Trl 752 264
## ETS 480 710
## SREBP 288 896
table(df_main$Wt_motif, df_main$dev_log2FC_wt_mut <= -2)
##
## FALSE TRUE
## AP1 256 1144
## GATAA 104 1416
## twist 856 632
## Trl 904 112
## ETS 806 384
## SREBP 584 600
<- -1
log2FC_cutoff
<- df_main[complete.cases(df_main$dev_log2FC_wt_mut) & df_main$dev_log2FC_wt_mut <= log2FC_cutoff,]
df table(df$Wt_motif[!duplicated(df$Instance_ID)])
##
## AP1 GATAA twist Trl ETS SREBP
## 170 190 169 33 89 112
# reorder levels
$New_motif <- factor(df$New_motif, levels = rev(df %>% group_by(New_motif) %>% summarise(act=median(dev_activity, na.rm=T)) %>% arrange(desc(act, )) %>% pull(New_motif)))
df
<- ggplot(df,
violin aes(New_motif, dev_activity, colour=New_motif, fill=New_motif)) +
geom_violin(size=0.7) +
geom_boxplot(col="black", fill=NA, size=0.7, outlier.size = -1, width=0.2) +
geom_violin(data=df[!duplicated(df$Instance_ID),], aes("WT enh", dev_wt_activity), size=0.7, colour="orangered", fill="orangered") +
geom_boxplot(data=df[!duplicated(df$Instance_ID),], aes("WT enh", dev_wt_activity), col="black", fill=NA, size=0.7, outlier.size = -1, width=0.2) +
scale_colour_manual(values=motif_colours) +
scale_fill_manual(values=motif_colours) +
guides(fill="none", col="none") +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 enhancer activity", breaks = seq(-10,10,2)) +
scale_x_discrete("", limits=c("WT enh", levels(df$New_motif)), labels=c("WT enh", "mutant", levels(df$New_motif)[-1])) +
ggtitle(paste0("Summary of individual motif swapping (n=",length(unique(df$Instance_ID))," inst each)"))
print(violin)
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final.rds")
df_main table(df_main$Wt_motif, df_main$New_motif)
##
## shuffle AP1 GATAA twist Trl Stat92E ETS SREBP Atf2
## AP1 176 0 176 176 176 176 176 176 176
## GATAA 190 190 0 190 190 190 190 190 190
## twist 187 187 187 0 187 187 187 187 187
## Trl 129 129 129 129 0 129 129 129 129
## ETS 156 156 156 154 156 156 0 156 156
## SREBP 152 152 152 152 152 152 152 0 152
# filter for important instances
<- -1
log2FC_cutoff
<- df_main[complete.cases(df_main$dev_log2FC_wt_mut) & df_main$dev_log2FC_wt_mut <= log2FC_cutoff,]
df table(df$Wt_motif[!duplicated(df$Instance_ID)])
##
## AP1 GATAA twist Trl ETS SREBP
## 170 190 169 33 89 112
# reorder levels
$New_motif <- factor(df$New_motif, levels = c("shuffle", "GATAA", "Trl", "SREBP", "AP1", "Atf2", "twist", "Stat92E", "ETS"))
df
# all
# compare with summary results of pasting the different motif types individually
<- ggplot(df, aes(New_motif, dev_log2FC_wt_swapped, colour=New_motif, fill=New_motif)) +
violin geom_violin(size=0.7) +
geom_boxplot(col="black", fill=NA, size=0.7, outlier.size = -1, width=0.2) +
scale_colour_manual(values=motif_colours) +
scale_fill_manual(values=motif_colours) +
guides(fill="none", col="none") +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to wt", breaks = seq(-10,10,2)) +
scale_x_discrete("New motif") +
ggtitle(paste0("Summary of individual motif swapping (n=",length(unique(df$Instance_ID))," inst each)"))
# compare to shuffle mutant
<- ggplot(df[!df$New_motif=="shuffle",], aes(New_motif, dev_activity-motif_mut_dev_activity, colour=New_motif, fill=New_motif)) +
violin2 geom_violin(size=0.7) +
geom_boxplot(col="black", fill=NA, size=0.7, outlier.size = -1, width=0.2) +
scale_colour_manual(values=motif_colours) +
scale_fill_manual(values=motif_colours) +
guides(fill="none", col="none") +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to mutant", breaks = seq(-10,10,2)) +
scale_x_discrete("Pasted motif") +
ggtitle(paste0("Summary of individual motif swapping (n=",length(unique(df$Instance_ID))," inst each)"))
print(violin2)
print(violin)
# per wt motif
<- ggplot(df, aes(New_motif, dev_log2FC_wt_swapped, fill=Wt_motif)) +
gg geom_boxplot(outlier.size = 0.4) +
scale_fill_manual(values=motif_colours[as.character(df$Wt_motif)]) +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to wt", breaks = seq(-10,10,2)) +
scale_x_discrete("Pasted motif", breaks=levels(df$New_motif)) +
ggtitle(paste0(length(unique(df$Instance_ID)), " instances"))
<- ggplot(df[!df$New_motif=="shuffle",], aes(New_motif, dev_activity-motif_mut_dev_activity, fill=Wt_motif)) +
gg2 geom_boxplot(outlier.size = 0.4) +
scale_fill_manual("Replaced\nmotif", values=motif_colours[as.character(df$Wt_motif)]) +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to mutant", breaks = seq(-10,10,2)) +
scale_x_discrete("Pasted motif", breaks=levels(df$New_motif)) +
ggtitle(paste0(length(unique(df$Instance_ID)), " instances"))
print(gg2)
print(gg)
# per pasted motif
<- function(variable,value){
labeller1 return(sapply(levels(df$New_motif[!df$New_motif=="shuffle"]), function(m) paste0(m, " (", round(cor.test(df$motif_mut_dev_activity[df$New_motif==m],
$dev_activity[df$New_motif==m]-df$motif_mut_dev_activity[df$New_motif==m])$estimate,2), ")"))[value])
df
}
<- ggplot(df[!df$New_motif=="shuffle",], aes(motif_mut_dev_activity, dev_activity-motif_mut_dev_activity, colour=New_motif)) +
MA geom_point(size=0.9, alpha=0.7) +
scale_colour_manual(values=motif_colours) +
guides(fill="none", col="none") +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
geom_vline(xintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to mutant", breaks = seq(-10,10,2)) +
scale_x_continuous("Mutant enhancer activity [log2]") +
facet_wrap(~New_motif, labeller = labeller1, nrow = 2) +
theme(axis.text = element_text(colour="black", size=14),
axis.title = element_text(colour="black", size=16),
strip.text = element_text(size=13))
print(MA)
<- df[!df$New_motif=="shuffle",] %>%
coef_var group_by(New_motif) %>%
summarise(CoefVar=sd(dev_activity-motif_mut_dev_activity, na.rm=T) / mean(dev_activity-motif_mut_dev_activity, na.rm=T) * 100)
=0.8
w<- ggplot(coef_var, aes(New_motif, CoefVar, fill=New_motif)) +
gg geom_bar(stat="identity", width=w, col="black") +
scale_x_discrete("Pasted motif") +
scale_y_continuous("Coefficient of variation", expand = expansion(mult = c(0, 0.02), add = c(0, 0))) +
scale_fill_manual(values=motif_colours) +
guides(fill="none", col="none") +
theme(axis.text.x = element_text(size=10))
print(gg)
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final.rds")
df_main table(df_main$Wt_motif, df_main$New_motif)
##
## shuffle AP1 GATAA twist Trl Stat92E ETS SREBP Atf2
## AP1 176 0 176 176 176 176 176 176 176
## GATAA 190 190 0 190 190 190 190 190 190
## twist 187 187 187 0 187 187 187 187 187
## Trl 129 129 129 129 0 129 129 129 129
## ETS 156 156 156 154 156 156 0 156 156
## SREBP 152 152 152 152 152 152 152 0 152
# filter for important instances
<- -1
log2FC_cutoff
<- df_main[complete.cases(df_main$dev_log2FC_wt_mut) & df_main$dev_log2FC_wt_mut <= log2FC_cutoff,]
df table(df$Wt_motif[!duplicated(df$Instance_ID)])
##
## AP1 GATAA twist Trl ETS SREBP
## 170 190 169 33 89 112
# crate matrix motif swapped vs new motif
$log2FC_mut_swapped <- df$dev_activity-df$motif_mut_dev_activity
df<- data.table::dcast(df, Instance_ID + Wt_motif ~ New_motif, value.var = "log2FC_mut_swapped", fun=mean)[,-3]
df_matrix
### correlation heatmap - Pearson
require(pheatmap)
=seq(0,1,length.out = 200)
breaksList= colorRampPalette(rev(RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")))(length(breaksList))
mycol
<- cor(df_matrix[,-c(1,2)], method = "pearson", use = "pairwise.complete.obs")
cormat print(pheatmap(cormat,
clustering_distance_cols = as.dist(1 - cormat),
clustering_distance_rows = as.dist(1 - cormat),
fontsize_row = 10,
angle_col = 0,
#fontsize_col = 12,
show_colnames = F,
color=mycol,
breaks=breaksList,
border_color = NA,
main=paste0("Motif types - PCC")))
# pearson out
<- function(matrix){
cor_function <- cor(x = matrix, method = "pearson", use = "pairwise.complete.obs")
cors <- cors[upper.tri(cors, diag = FALSE)] # we're only interested in one of the off-diagonals, otherwise there'd be duplicates
cor.data <- as.data.frame(cor.data) # that's how ggplot likes it
cor.data colnames(cor.data) <- "corr"
return(cor.data$corr)
}
### pairwise comparison plots
# for(i1 in 3:(ncol(df_matrix)-1)){
# for(i2 in (i1+1):ncol(df_matrix)){
# example for AP-1
for(i1 in c(3,4)){
for(i2 in c(5,8)){
<- df_matrix
tmp
<- names(tmp)[i1]
m1 <- names(tmp)[i2]
m2
# PCC - not using zeros
<- cor.test(tmp[,m1],
pc
tmp[,m2],method = "pearson")
# colour points
$col <- ""
tmp$col[tmp[,m1] - tmp[,m2] >= 1] <- m1
tmp$col[tmp[,m2] - tmp[,m1] >= 1] <- m2
tmp$col <- factor(tmp$col, levels = c("", m1, m2))
tmp
# plot
<- ggplot(tmp, aes(tmp[,m1], tmp[,m2], col=col)) +
scater geom_point(size=1.5) +
scale_colour_manual("Preferential\nactivity", values=c("grey50", motif_colours[m1], motif_colours[m2])) +
guides(col="none") +
geom_abline(intercept = 0, linetype="dashed", slope = 1, col="grey20") +
scale_x_continuous(paste0(m1, " (log2 FC to mut)"),
breaks=seq(-10,20,2)) +
scale_y_continuous(paste0(m2, " (log2 FC to mut)"),
breaks=seq(-10,20,2)) +
theme_bw(base_size = 15) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5, size=15)) +
annotate("text", x=min(tmp[,m1], na.rm=T), y = max(tmp[,m2], na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
# colour by original motif
<- ggplot(df_matrix, aes(df_matrix[,m1], df_matrix[,m2], col=Wt_motif)) +
scater2 geom_point(size=1.5) +
scale_colour_manual("wt motif", values=motif_colours[as.character(df_matrix$Wt_motif)]) +
geom_hline(yintercept = 0, linetype="dashed", col="grey75") +
geom_vline(xintercept = 0, linetype="dashed", col="grey75") +
geom_abline(intercept = 0, slope = 1, col="grey20") +
scale_x_continuous(paste0(m1, " (log2 FC to mut)"),
breaks=seq(-10,20,2)) +
scale_y_continuous(paste0(m2, " (log2 FC to mut)"),
breaks=seq(-10,20,2)) +
theme_bw(base_size = 15) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5, size=15))
print(scater)
print(scater2)
} }
require(pheatmap)
require(RColorBrewer)
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final.rds")
df_main table(df_main$Wt_motif, df_main$New_motif)
##
## shuffle AP1 GATAA twist Trl Stat92E ETS SREBP Atf2
## AP1 176 0 176 176 176 176 176 176 176
## GATAA 190 190 0 190 190 190 190 190 190
## twist 187 187 187 0 187 187 187 187 187
## Trl 129 129 129 129 0 129 129 129 129
## ETS 156 156 156 154 156 156 0 156 156
## SREBP 152 152 152 152 152 152 152 0 152
# filter for important instances
<- -1
log2FC_cutoff
<- df_main[complete.cases(df_main$dev_log2FC_wt_mut) & df_main$dev_log2FC_wt_mut <= log2FC_cutoff,]
df table(df$Wt_motif[!duplicated(df$Instance_ID)])
##
## AP1 GATAA twist Trl ETS SREBP
## 170 190 169 33 89 112
# crate matrix motif swapped vs new motif
$log2FC_mut_swapped <- df$dev_activity-df$motif_mut_dev_activity
df<- data.table::dcast(df, Instance_ID + Wt_motif ~ New_motif, value.var = "log2FC_mut_swapped", fun=mean)[,-3]
df_matrix
# Remove instances for which more than 2 motif swapps didn't work
table(apply(is.na(df_matrix[,-c(1:2)]), 1, sum))
##
## 1 2 3 4 5 6 7 8
## 677 44 18 8 2 3 8 3
<- df_matrix[apply(is.na(df_matrix[,-c(1:2)]), 1, sum) <= 2,]
df_matrix
<- df_matrix[,-c(1:2)]
dat rownames(dat) <- df_matrix$Instance_ID
### annotations
<- data.frame("Wt motif" = df_matrix$Wt_motif,
instance_row row.names = df_matrix$Instance_ID)
= list(
my_colour "Wt.motif" = motif_colours[names(motif_colours) %in% instance_row$Wt.motif]
)
# Pearson clustering heatmap
# Pairwise correlation between samples (columns)
<- cor(dat, use = "pairwise.complete.obs", method = "pearson")
cols.cor # Pairwise correlation between rows (genes)
<- cor(t(dat), use = "pairwise.complete.obs", method = "pearson")
rows.cor table(is.na(rows.cor))
##
## FALSE
## 519841
# it might be dangerous to use pairwise.complete.obs. When specified, R computes correlations for each pair of columns using vectors formed by omitting rows with missing values on a pairwise basis. Thus each column vector may vary depending on it’s pairing, resulting in correlation values that are not even comparable.
# colours and breaks
library(RColorBrewer)
=1
s= seq(floor(min(dat, na.rm=T)), ceiling(max(dat, na.rm=T)), by = s)
myBreaks
<- c(colorRampPalette(
myColor c("#2166AC", "#4393C3", "#92C5DE",
"#D1E5F0"))(abs(floor(min(dat, na.rm=T))*(1/s))-1),
"white", "white",
colorRampPalette(
c("#FFEBE5", "#FDDBC7", "#F4A582",
"#D6604D", "#B2182B", "#800026"))((ceiling(max(dat, na.rm=T))*(1/s))-1))
<- pheatmap(dat,
heatmap_pearson color=myColor, breaks=myBreaks,
clustering_distance_cols = as.dist(1 - cols.cor),
clustering_distance_rows = as.dist(1 - rows.cor),
angle_col = 90,
border_color = NA,
legend_breaks=seq(-10,10,2),
show_rownames = F,
fontsize_col=11,
main=paste0("log2 FC to mut (", nrow(dat), " instances)"),
width = 5.5, height = 7
)
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final.rds")
df_main table(df_main$Wt_motif, df_main$New_motif)
##
## shuffle AP1 GATAA twist Trl Stat92E ETS SREBP Atf2
## AP1 176 0 176 176 176 176 176 176 176
## GATAA 190 190 0 190 190 190 190 190 190
## twist 187 187 187 0 187 187 187 187 187
## Trl 129 129 129 129 0 129 129 129 129
## ETS 156 156 156 154 156 156 0 156 156
## SREBP 152 152 152 152 152 152 152 0 152
# filter for important instances
<- -1
log2FC_cutoff
<- df_main[complete.cases(df_main$dev_log2FC_wt_mut) & df_main$dev_log2FC_wt_mut <= log2FC_cutoff,]
df table(df$Wt_motif[!duplicated(df$Instance_ID)])
##
## AP1 GATAA twist Trl ETS SREBP
## 170 190 169 33 89 112
# crate matrix motif swapped vs new motif
$log2FC_mut_swapped <- df$dev_activity-df$motif_mut_dev_activity
df<- data.table::dcast(df, Instance_ID + Wt_motif ~ New_motif, value.var = "log2FC_mut_swapped", fun=mean)[,-3]
df_matrix =="NaN"] <- NA
df_matrix[df_matrix
# select examples GATA vs ETS
<- c("chr3L_4121757_4122005_+_dCP_twist_center168",
cand "chr2L_3958910_3959158_+_dCP_twist_center68",
"chr2L_7795687_7795935_+_dCP_AP1_center135",
"chrX_9742091_9742339_+_dCP_AP1_center205")
=0.8
wfor(i in 1:length(cand)){
<- df_main[df_main$Instance_ID %in% cand[i],]
tmp
<- ggplot() +
gg geom_bar(data=tmp[tmp$New_motif=="GATAA",], aes("mut", motif_mut_dev_activity), stat="identity", width=w, col="black", fill="grey60") +
geom_bar(data=tmp[tmp$New_motif=="GATAA",], aes("GATAA", dev_activity), stat="identity", width=w, col="black", fill=motif_colours["GATAA"]) +
geom_bar(data=tmp[tmp$New_motif=="ETS",], aes("ETS", dev_activity), stat="identity", width=w, col="black", fill=motif_colours["ETS"]) +
scale_x_discrete("", limits=c("mut", "GATAA", "ETS")) +
scale_y_continuous("log2 enhancer activity", expand = expansion(mult = c(0, 0.02), add = c(0, 0))) +
ggtitle(unique(paste0(tmp$Sequence_ID, " pos ", tmp$Motif_center))) +
theme_classic(base_size = 18) +
theme(axis.title = element_text(colour="black"),
axis.text = element_text(colour="black"),
plot.title = element_text(colour="black", hjust=0.5, size=11))
print(gg + scale_y_continuous("log2 enhancer activity",
expand = expansion(mult = c(0, 0.02), add = c(0, 0)),
limits = c(0,8.45)))
}
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final.rds")
df_main
# filter for important instances
<- -1
log2FC_cutoff
<- df_main[complete.cases(df_main$dev_log2FC_wt_mut) & complete.cases(df_main$dev_log2FC_wt_swapped) & df_main$dev_log2FC_wt_mut <= log2FC_cutoff,]
df
# remove shuffled variant
<- df[!df$New_motif %in% "shuffle",]
df
### for different formulas
$dev_log2FC_mut_swapped <- df$dev_activity-df$motif_mut_dev_activity
df
# linear model
<- lm(dev_log2FC_mut_swapped ~ dev_log2FC_wt_mut + Wt_motif * New_motif, df)
fit
# How much variance explained by each feature
<- anova(fit)
af $PctExp <- af$"Sum Sq"/sum(af$"Sum Sq")*100
af$variable <- factor(rownames(af), levels = rev(c("dev_log2FC_wt_mut", "Sequence_ID", "Instance_ID", "Wt_motif", "New_motif", "Wt_motif:New_motif", "Instance_ID:New_motif", "Residuals")),
aflabels = rev(c("wt motif imp.", "Sequence ID", "Instance ID", "WT motif", "Pasted motif", "WT:Pasted motif interaction", "Instance ID:Pasted motif", "Residuals")))
# barplot
<- ggplot(af, aes("", PctExp, fill=variable)) +
barplot geom_col() +
scale_fill_manual("Features", values = rev(c("wt motif imp."="#984ea3", "WT motif"="#e41a1c", "Pasted motif"="#377eb8", "WT:Pasted motif interaction"="#4daf4a", "Residuals"="grey60"))) +
scale_y_continuous("Variance explained [%]", breaks=seq(-100,100,20), expand = c(0,0)) +
ggtitle(paste0(nrow(df), " positions (adjR2: ", round(summary(fit)$adj.r.squared,2), ")")) +
theme(axis.text = element_text(colour="black", size=14),
axis.title = element_text(colour="black", size=16),
axis.title.x = element_blank(), axis.ticks.x = element_blank())
<- data.frame(obs=df$dev_log2FC_mut_swapped,
out pred=predict(fit, df))
<- ggplot(out, aes(obs, pred)) +
scater geom_pointdensity(adjust = 1.3) +
scale_color_gradient(low = "grey70", high = "grey20") +
guides(col="none") +
geom_hline(yintercept = 0, linetype="dashed", col="grey70") +
geom_vline(xintercept = 0, linetype="dashed", col="grey70") +
geom_abline(intercept = 0, slope = 1, col="grey20") +
scale_x_continuous(paste0("Observed (log2 FC enhancer activity to mut)"),
breaks=seq(-10,20,2)) +
scale_y_continuous(paste0("Predicted (log2 FC enhancer activity to mut)"),
breaks=seq(-10,20,2)) +
theme(axis.text = element_text(colour="black", size=14),
axis.title = element_text(colour="black", size=16),
plot.title = element_text(size=11)) +
ggtitle(paste(as.character(fit$call$formula)[2], as.character(fit$call$formula)[1], as.character(fit$call$formula)[3])) +
annotate("text", x=min(out$obs, na.rm=T), y = max(out$pred, na.rm=T), label = paste0("PCC: ", round(cor(out$obs, out$pred),2)), vjust=1, hjust=0, size=5)
print(barplot + scater + plot_layout(widths = c(1, 5)))
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final.rds")
df_main
# filter for important instances
<- -1
log2FC_cutoff <- df_main[complete.cases(df_main$dev_log2FC_wt_mut) & df_main$dev_log2FC_wt_mut <= log2FC_cutoff,]
df
# remove shuffled variant
<- df[!df$New_motif %in% "shuffle",]
df $New_motif <- droplevels(df$New_motif)
df
# add FC to mutant
$dev_log2FC_mut_swapped <- df$dev_activity-df$motif_mut_dev_activity
df<- df[complete.cases(df$dev_log2FC_mut_swapped),]
df
<- data.frame()
out for(m in unique(df$New_motif)){
<- df[df$New_motif %in% m,]
tmp <- tmp %>%
enh_list group_by(Sequence_ID) %>%
summarise(N=dplyr::n()) %>%
filter(N==2) %>%
pull(Sequence_ID)
for(e in enh_list){
<- abs(diff(tmp$dev_log2FC_mut_swapped[tmp$Sequence_ID==e]))
delta_asame <- mean(c(abs(diff(c(tmp$dev_log2FC_mut_swapped[tmp$Sequence_ID==e][1], mean(tmp$dev_log2FC_mut_swapped[!tmp$Sequence_ID==e])))),
delta_diff_avg abs(diff(c(tmp$dev_log2FC_mut_swapped[tmp$Sequence_ID==e][2], mean(tmp$dev_log2FC_mut_swapped[!tmp$Sequence_ID==e]))))))
# delta to instances in enhancers with similar mutant activity
<- tmp$motif_mut_dev_activity[tmp$Sequence_ID==e][1]
mut1st <- tmp[!tmp$Sequence_ID==e,][order(abs(tmp$motif_mut_dev_activity[!tmp$Sequence_ID==e]-mut1st)),][1:2,]
e2
<- mean(c(abs(diff(c(tmp$dev_log2FC_mut_swapped[tmp$Sequence_ID==e][1], e2$dev_log2FC_mut_swapped[1]))),
delta_diff abs(diff(c(tmp$dev_log2FC_mut_swapped[tmp$Sequence_ID==e][2], e2$dev_log2FC_mut_swapped[2])))))
<- rbind(out, data.frame(Motif=m,
out Enh=e,
delta_asame=delta_asame,
delta_diff=delta_diff,
delta_diff_avg=delta_diff_avg))
}
}
$Motif <- factor(out$Motif, levels = c("AP1", "ETS", "SREBP", "Atf2", "GATAA", "twist", "Stat92E", "Trl"))
out
<- ggplot(reshape2::melt(out[,-5]), aes(Motif, value, fill=variable)) +
gg_delta geom_boxplot(outlier.size = 0.6, alpha=0.8) +
scale_fill_brewer("Pairs of positions", palette = "Set1", labels = c("Same enhancer", "Different enhancers")) +
scale_y_continuous("Absolute log2 FC\nbetween motif instances") +
scale_x_discrete("Pasted motif") +
ggtitle("Difference between pairs of positions\nin the same or different enhancers") +
theme_bw(base_size = 16) +
theme(axis.text = element_text(colour="black"),
axis.title = element_text(colour="black"),
plot.title = element_text(hjust=0.5),
legend.position = "bottom"
)
print(gg_delta)
sapply(levels(out$Motif), function(m) wilcox.test(out$delta_asame[out$Motif==m], out$delta_diff[out$Motif==m])$p.value) # ETS=0.004
## AP1 ETS SREBP Atf2 GATAA twist
## 0.870926645 0.004399168 0.169084209 0.930139761 0.091187177 0.062007786
## Stat92E Trl
## 0.115053276 0.379605610
library(motifmatchr)
library(PWMEnrich)
library(TFBSTools)
library(seqinr)
# Motifs
load("../Random_variant_STARRseq/data/TF_clusters_PWMs.RData")
View(TF_clusters_PWMs$metadata)
<- list(GATAA=data.frame(Motif="GATAA", ID="flyfactorsurvey__srp_SANGER_5_FBgn0003507", full="AGATAAGA"),
TF_motifs AP1=data.frame(Motif="AP1", ID="flyfactorsurvey__kay_Jra_SANGER_5_FBgn0001291", full="ATGAGTCAC"),
twist=data.frame(Motif="twist", ID="flyfactorsurvey__twi_da_SANGER_5_FBgn0000413", full="CCAGATGT"),
Trl=data.frame(Motif="Trl", ID="flyfactorsurvey__Trl_FlyReg_FBgn0013263", full="GAGAGAG"),
ETS=data.frame(Motif="ETS", ID="flyfactorsurvey__Ets97D_SANGER_10_FBgn0004510", full="ACCGGAAGT"),
SREBP=data.frame(Motif="SREBP", ID="flyfactorsurvey__HLH106_SANGER_10_FBgn0015234", full="ATCACGCGAC"),
Stat92E=data.frame(Motif="Stat92E", ID="bergman__Stat92E", full="TTTCCCGGAAA"),
Atf2=data.frame(Motif="Atf2", ID="flyfactorsurvey__Atf-2_SANGER_5_FBgn0050420", full="ATCACGTCATC")
)<- do.call(rbind, TF_motifs)
TF_motifs
# All sequences
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final.rds")
df_main <- GRanges(paste0(sapply(strsplit(as.character(df_main$Sequence_ID),"_"), `[`, 1),
Endogenous_sequences ":", sapply(strsplit(as.character(df_main$Sequence_ID),"_"), `[`, 2),
"-", sapply(strsplit(as.character(df_main$Sequence_ID),"_"), `[`, 3)),
strand = df_main$Strand,
Sequence_ID=df_main$Sequence_ID)
<- unique(Endogenous_sequences)
Endogenous_sequences $Sequence <- getSeq(Dmelanogaster, Endogenous_sequences)
Endogenous_sequences
# motif positions in oligo
<- matchMotifs(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% TF_motifs$ID],
motif_ix_pos $Sequence,
Endogenous_sequencesgenome = "BSgenome.Dmelanogaster.UCSC.dm3", p.cutoff = 5e-4, bg="genome", out = "positions")
names(motif_ix_pos) <- name(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% TF_motifs$ID])
library(plyranges)
<- lapply(motif_ix_pos, function(motif_x){
motif_ix_pos2 names(motif_x) <- Endogenous_sequences$Sequence_ID
<- motif_x[sapply(motif_x, length)>0] # remove sequences without motif
motif_x # join all IRanges in same GRanges
<- GRanges(names(unlist(motif_x)), #Use sequence ID as seqnames
motif_x IRanges(start(unlist(motif_x)), end(unlist(motif_x))),
strand = mcols(unlist(motif_x))$strand,
score=mcols(unlist(motif_x))$score)
### reduce - keep strand information
<- my_reduce_with_score(motif_x, min.frac.ov=0.5)
motif_x # add wt sequence
$enh_strand <- Endogenous_sequences@strand[match(motif_x@seqnames, Endogenous_sequences$Sequence_ID)]
motif_x$Class <- Endogenous_sequences$Class[match(as.character(motif_x@seqnames), Endogenous_sequences$Sequence_ID)]
motif_x$seq <- substr(as.character(getSeq(Dmelanogaster, GRanges(paste0(sapply(strsplit(as.character(motif_x@seqnames),"_"), `[`, 1),
motif_x":", sapply(strsplit(as.character(motif_x@seqnames),"_"), `[`, 2),
"-", sapply(strsplit(as.character(motif_x@seqnames),"_"), `[`, 3)),
strand = motif_x$enh_strand))),
@ranges@start,
motif_x@ranges@start+motif_x@ranges@width-1)
motif_xreturn(motif_x)
}) lapply(motif_ix_pos2, function(i) table(width(i)))
<- lapply(names(motif_ix_pos2), function(i){
motif_ix_pos3 <- data.frame(motif_ix_pos2[[i]], stringsAsFactors = F)
x names(x)[1] <- "Sequence"
$Sequence <- as.character(x$Sequence)
x$strand <- as.character(x$strand)
x$Motif <- TF_motifs$Motif[TF_motifs$ID %in% i]
xreturn(x)
})names(motif_ix_pos3) <- names(motif_ix_pos2)
sapply(motif_ix_pos2, length)
sapply(motif_ix_pos3, nrow)
saveRDS(do.call(rbind, motif_ix_pos3), file = "Endogenous_sequences_motif_positions_all.rds")
library(motifmatchr)
library(PWMEnrich)
library(TFBSTools)
library(seqinr)
# Motifs
load("/groups/stark/almeida/data/motifs/motif_collection_v7_no_transfac_SteinAerts/TF_clusters_PWMs.RData")
View(TF_clusters_PWMs$metadata)
<- list(GATAA=data.frame(Motif="GATAA", ID="flyfactorsurvey__srp_SANGER_5_FBgn0003507", full="AGATAAGA"),
TF_motifs AP1=data.frame(Motif="AP1", ID="flyfactorsurvey__kay_Jra_SANGER_5_FBgn0001291", full="ATGAGTCAC"),
twist=data.frame(Motif="twist", ID="flyfactorsurvey__twi_da_SANGER_5_FBgn0000413", full="CCAGATGT"),
Trl=data.frame(Motif="Trl", ID="flyfactorsurvey__Trl_FlyReg_FBgn0013263", full="GAGAGAG"),
ETS=data.frame(Motif="ETS", ID="flyfactorsurvey__Ets97D_SANGER_10_FBgn0004510", full="ACCGGAAGT"),
SREBP=data.frame(Motif="SREBP", ID="flyfactorsurvey__HLH106_SANGER_10_FBgn0015234", full="ATCACGCGAC"),
Stat92E=data.frame(Motif="Stat92E", ID="bergman__Stat92E", full="TTTCCCGGAAA"),
Atf2=data.frame(Motif="Atf2", ID="flyfactorsurvey__Atf-2_SANGER_5_FBgn0050420", full="ATCACGTCATC")
)<- do.call(rbind, TF_motifs)
TF_motifs
# All sequences
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final.rds")
df_main table(duplicated(df_main$Oligo_ID))
# motif positions in oligo
<- matchMotifs(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% TF_motifs$ID],
motif_ix_pos as.character(df_main$Sequence),
genome = "BSgenome.Dmelanogaster.UCSC.dm3", p.cutoff = 5e-4, bg="genome", out = "positions")
names(motif_ix_pos) <- name(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% TF_motifs$ID])
library(plyranges)
<- lapply(motif_ix_pos, function(motif_x){
motif_ix_pos2 names(motif_x) <- as.character(df_main$Oligo_ID)
<- motif_x[sapply(motif_x, length)>0] # remove sequences without motif
motif_x # join all IRanges in same GRanges
<- GRanges(names(unlist(motif_x)), #Use sequence ID as seqnames
motif_x IRanges(start(unlist(motif_x)), end(unlist(motif_x))),
strand = mcols(unlist(motif_x))$strand,
score=mcols(unlist(motif_x))$score)
### reduce - keep strand information
# motif_x <- my_reduce(motif_x, min.frac.ov=0.7)
## instead, use plyranges::reduce_ranges to get motif score back - but here there is no min overlap
# motif_x <- reduce_ranges(motif_x, max_score = max(score), sum_score = sum(score), Number=n()) # bad because one bp overlapping is too stringent, use the adapted version below
<- my_reduce_with_score(motif_x, min.frac.ov=0.5)
motif_x # add wt sequence
$enh_strand <- df_main$Strand[match(as.character(motif_x@seqnames), as.character(df_main$Oligo_ID))]
motif_x$seq <- substr(as.character(getSeq(Dmelanogaster, GRanges(paste0(sapply(strsplit(as.character(motif_x@seqnames),"_"), `[`, 1),
motif_x":", sapply(strsplit(as.character(motif_x@seqnames),"_"), `[`, 2),
"-", sapply(strsplit(as.character(motif_x@seqnames),"_"), `[`, 3)),
strand = motif_x$enh_strand))),
@ranges@start,
motif_x@ranges@start+motif_x@ranges@width-1)
motif_xreturn(motif_x)
}) lapply(motif_ix_pos2, function(i) table(width(i)))
<- lapply(names(motif_ix_pos2), function(i){
motif_ix_pos3 <- data.frame(motif_ix_pos2[[i]], stringsAsFactors = F)
x names(x)[1] <- "Sequence"
$Sequence <- as.character(x$Sequence)
x$strand <- as.character(x$strand)
x$Motif <- TF_motifs$Motif[TF_motifs$ID %in% i]
xreturn(x)
})names(motif_ix_pos3) <- names(motif_ix_pos2)
sapply(motif_ix_pos2, length)
sapply(motif_ix_pos3, nrow)
saveRDS(do.call(rbind, motif_ix_pos3), file = "Endogenous_sequences_motif_positions_all_oligos.rds")
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final.rds")
df_main
# filter for important instances
<- -1
log2FC_cutoff <- df_main[complete.cases(df_main$dev_log2FC_wt_mut) & complete.cases(df_main$dev_log2FC_wt_swapped) & df_main$dev_log2FC_wt_mut <= log2FC_cutoff,]
df
# remove shuffled variant
<- df[!df$New_motif %in% "shuffle",]
df
# Use GATA as reference for WTmotif because it seems to be the closest to the average
$Wt_motif <- relevel(df$Wt_motif, "GATAA")
df
# add FC to mutant
$dev_log2FC_mut_swapped <- df$dev_activity-df$motif_mut_dev_activity
df
# extend motif pasted sequence to get flanks
<- 5 # add 5bp on each side
motif_flank_length $instance_sequence_extended_5bp <- sapply(1:nrow(df), function(i){
dfsubstr(as.character(df$Sequence[i]),
$mut_inst_instance_start[i]-7,
df$mut_inst_instance_end[i]+7)
df
})$instance_sequence_up_5bp <- sapply(1:nrow(df), function(i){
dfsubstr(as.character(df$Sequence[i]),
$Motif_center[i]-(ceiling(nchar(as.character(df$New_motif_sequence[i]))/2))-motif_flank_length+1,
df$Motif_center[i]-(ceiling(nchar(as.character(df$New_motif_sequence[i]))/2)))
df
})$instance_sequence_down_5bp <- sapply(1:nrow(df), function(i){
dfsubstr(as.character(df$Sequence[i]),
$Motif_center[i]+(floor(nchar(as.character(df$New_motif_sequence[i]))/2)+1),
df$Motif_center[i]+(floor(nchar(as.character(df$New_motif_sequence[i]))/2)+1)+motif_flank_length-1)
df
})
# Motifs
<- readRDS("../20210824_oligo_library_motif_swapping_Drosophila/Endogenous_sequences_motif_positions_all_oligos.rds")
Endogenous_sequences_motif_positions_all $instance_center <- Endogenous_sequences_motif_positions_all$start+(Endogenous_sequences_motif_positions_all$end-Endogenous_sequences_motif_positions_all$start)/2
Endogenous_sequences_motif_positions_all
# multiple models (with caret) with motif counts, presence of other motifs, and motif flanks
# 10-fold CV - 10 different shuffles
require(caret)
<- trainControl(method = "repeatedcv",
trctrl number = 10,
repeats = 10,
classProbs = T,
savePredictions = T)
### features important per motif
<- as.character(unique(df$New_motif))
cand names(cand) <- cand
<- lapply(cand, function(m){
model_list # m="GATAA"
<- df[df$New_motif %in% m,]
df_tmp
# motif flanks (only pasting with enough flanks) - don't include for now because I pasted a full motif
<- df_tmp[nchar(df_tmp$instance_sequence_up_5bp)==median(nchar(df_tmp$instance_sequence_up_5bp)) & nchar(df_tmp$instance_sequence_down_5bp)==median(nchar(df_tmp$instance_sequence_down_5bp)),]
df_tmp <- as.data.frame(sapply(1:max(nchar(df_tmp$instance_sequence_up_5bp)), function(i) sapply(strsplit(df_tmp$instance_sequence_up_5bp,""), `[`, i)))
up <- as.data.frame(sapply(1:max(nchar(df_tmp$instance_sequence_down_5bp)), function(i) sapply(strsplit(df_tmp$instance_sequence_down_5bp,""), `[`, i)))
down names(up) <- paste0("flank_left_", 5:1)
names(down) <- paste0("flank_right_", 1:5)
<- cbind(df_tmp, up, down)
df_tmp
# motif number
<- sapply(unique(Endogenous_sequences_motif_positions_all$Motif), function(m2){
motif_number countOverlaps(GRanges(df_tmp$Oligo_ID, IRanges(1,249)),
GRanges(Endogenous_sequences_motif_positions_all$Sequence[Endogenous_sequences_motif_positions_all$Motif == m2], IRanges(1,249)))
})colnames(motif_number) <- paste0(unique(Endogenous_sequences_motif_positions_all$Motif), "_number")
# motif distances
<- lapply(unique(Endogenous_sequences_motif_positions_all$Motif), function(m2){
add_dist <- sapply(1:nrow(df_tmp), function(s){
Distance <- Endogenous_sequences_motif_positions_all[Endogenous_sequences_motif_positions_all$Sequence %in% df_tmp$Oligo_ID[s] & Endogenous_sequences_motif_positions_all$Motif %in% m2,]
m2_df
# exclude the same instance
<- queryHits(findOverlaps(GRanges(m2_df$Sequence, IRanges(m2_df$start, m2_df$end)),
hi GRanges(df_tmp$Oligo_ID[s], IRanges(df_tmp$mut_inst_instance_start[s], df_tmp$mut_inst_instance_end[s])),
minoverlap = 4))
if(length(hi)>0){
if(df_tmp$New_motif[s]==m2) m2_df <- m2_df[-hi,]
# also the two motifs are similar, and PWM matching find each other
if((df_tmp$New_motif[s]=="Atf2" & m2=="SREBP") | (df_tmp$New_motif[s]=="SREBP" & m2=="Atf2")) m2_df <- m2_df[-hi,]
}
if(nrow(m2_df)>0){
<- m2_df$instance_center-df_tmp$Motif_center[s]
dist return(dist[abs(dist)==min(abs(dist))][1])
else{return(NA)}
}
})<- data.frame(abs(Distance))
out names(out) <- paste0(m2, "_dist")
# only close and distal
paste0("Close_bin_", m2)] <- "NO"
out[,paste0("Close_bin_", m2)][which(out[,1] <= 25)] <- "Close"
out[,paste0("Close_bin_", m2)][which(out[,1] > 25)] <- "Distal"
out[,paste0("Close_bin_", m2)] <- relevel(factor(out[,paste0("Close_bin_", m2)]),
out[,"NO")
return(out)
})
# merge all features
<- cbind(df_tmp, motif_number, do.call(cbind, add_dist))
df_tmp
# add position of the motif
$motif_position <- "center"
df_tmp$motif_position[df_tmp$Motif_center<=100 | df_tmp$Motif_center>=150] <- "flanks" # outside middle 100bp
df_tmp$motif_position[df_tmp$Motif_center<=74 | df_tmp$Motif_center>=175] <- "boundaries" # outside middle 100bp
df_tmp$motif_position <- factor(df_tmp$motif_position, levels = c("flanks", "center", "boundaries"))
df_tmptable(df_tmp$motif_position)
#use motif importance as the negative log2FC -> positive means more important
$motif_importance_minus_log2FC_wt_mut <- -df_tmp$dev_log2FC_wt_mut
df_tmp
### models
= lm(dev_log2FC_mut_swapped ~ . , data=df_tmp[,grep("dev_log2FC_mut_swapped|motif_importance_minus_log2FC_wt_mut|Wt_motif$|motif_position|flank_|Close_bin_", names(df_tmp))])
lm_model <- train(dev_log2FC_mut_swapped ~ . , data=m, method = "lm",
model_CV trControl=trctrl,
importance = TRUE,
# preProcess = c("center", "scale"),
tuneLength = 1)
# average across replicate models
<- model_CV$pred
pred <- pred %>%
pred group_by(rowIndex) %>%
summarise(pred=mean(pred),
obs=mean(obs))
return(c(list(Data=df_tmp),
list(Model_trained=lm_model,
Pred=pred,
adj.r.squared=summary(lm(pred$pred~pred$obs))$adj.r.squared,
PCC=cor.test(pred$pred, pred$obs)$estimate)))
})
saveRDS(model_list, "List_motif_model_results_syntax_features.rds")
<- readRDS("List_motif_model_results_syntax_features.rds")
model_list
<- c("motif_importance_minus_log2FC_wt_mut",
vars "Wt_motif",
"motif_position",
"flank_",
paste0("Close_bin_AP1"),
paste0("Close_bin_GATAA"),
paste0("Close_bin_twist"),
paste0("Close_bin_Trl"),
paste0("Close_bin_Stat92E"),
paste0("Close_bin_ETS"),
paste0("Close_bin_SREBP"),
paste0("Close_bin_Atf2")
)names(vars) <- c("wt motif importance",
"wt motif type",
"Motif position",
"Motif flanks",
paste0("Dist to AP1"),
paste0("Dist to GATAA"),
paste0("Dist to twist"),
paste0("Dist to Trl"),
paste0("Dist to Stat92E"),
paste0("Dist to ETS"),
paste0("Dist to SREBP"),
paste0("Dist to Atf2")
)
<- data.frame()
Var_table for(motif in names(model_list)){
for(model in names(model_list[[motif]])[[2]]){
<- model_list[[motif]][[model]]$Pred
tmp
<- cor.test(tmp$pred, tmp$obs,
pc method = "pearson", use="complete.obs")
<- ggplot(tmp, aes(pred, obs)) +
g2 geom_pointdensity() +
scale_color_gradient(low = "grey70", high = motif_colours[[motif]]) +
guides(col=F) +
scale_y_continuous(paste0("log2 FC enh act [observed]"), breaks = seq(-10,10,2)) +
scale_x_continuous(paste0("log2 FC enh act [predicted]"), breaks = seq(-10,10,2)) +
# ggtitle(paste0(motif, " - ", model)) +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
annotate("text", x=min(tmp$pred, na.rm=T), y = max(tmp$obs, na.rm=T),
label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5) +
theme(axis.text = element_text(colour="black", size=14),
axis.title = element_text(colour="black", size=16),
plot.title = element_text(colour="black", size=10))
<- model_list[[motif]][[model]]$Model_trained
model_tmp ## variance explained - 100 models - to get feature importance
<- model_list[[motif]]$Data
df1 <- df1[,grep("dev_log2FC_mut_swapped|motif_importance_minus_log2FC_wt_mut|Wt_motif$|motif_position|flank_|Close_bin_", names(df1))]
df1
<- sapply(1:100, function(i){
var_expl_all <- df1[,sample(ncol(df1))]
df3 <- lm(dev_log2FC_mut_swapped~., data=df3)
fit <- anova(fit)
af $PctExp <- af$"Sum Sq"/sum(af$"Sum Sq")*100
af<- sapply(vars, function(x) sum(af$PctExp[grep(x, rownames(af))]))
var_expl
var_expl
})<- rowMeans(var_expl_all)
var_expl_all
<- data.frame(Features=names(var_expl_all),Perc=var_expl_all, stringsAsFactors = F)
tmp_avg $Features[grep("Dist", tmp_avg$Features)] <- "Motif distances"
tmp_avg$Features <- factor(tmp_avg$Features, levels=c("wt motif importance", "wt motif type", "Motif position", "Motif flanks", "Motif distances"))
tmp_avg
<- ggplot(tmp_avg, aes("Average", Perc, fill=Features)) +
barplot_avg geom_col(width=0.8) +
scale_fill_manual("Features", values = c("wt motif importance"="#984ea3",
"wt motif type"="#e41a1c",
"Motif position"="grey50",
"Motif flanks"="#ff7f00",
"Motif distances"="#a65628")) +
scale_y_continuous("Variance explained [%]", breaks=seq(-100,100,10), expand = c(0,0)) +
xlab("") +
ggtitle(paste0("adjR2: ", round(summary(model_tmp)$adj.r.squared,2))) +
# ggtitle("Average 100 models - type I") +
theme(axis.text = element_text(colour="black", size=14),
axis.title = element_text(colour="black", size=16),
legend.title = element_text(colour="black", size=16),
legend.text = element_text(colour="black", size=14))
print(barplot_avg + g2 + plot_layout(widths = c(1, 6)))
$Motif=motif
tmp_avg$Model=model
tmp_avg
<- rbind(Var_table, tmp_avg)
Var_table
}print(motif)
}
## [1] "GATAA"
## [1] "AP1"
## [1] "Trl"
## [1] "ETS"
## [1] "SREBP"
## [1] "Stat92E"
## [1] "Atf2"
## [1] "twist"
# average
mean(Var_table[intersect(grep("Motif flanks", Var_table$Features), grep("model_and_pos_and_flank_and_dist", Var_table$Model)),] %>% group_by(Motif, Model) %>% summarise(Perc=sum(Perc)) %>% pull(Perc)) #16.7%
## [1] 16.78911
mean(Var_table[intersect(grep("Motif distances", Var_table$Features), grep("model_and_pos_and_flank_and_dist", Var_table$Model)),] %>% group_by(Motif, Model) %>% summarise(Perc=sum(Perc)) %>% pull(Perc)) #6.7%
## [1] 6.664546
mean(Var_table[intersect(grep("Motif position", Var_table$Features), grep("model_and_pos_and_flank_and_dist", Var_table$Model)),] %>% group_by(Motif, Model) %>% summarise(Perc=sum(Perc)) %>% pull(Perc)) #0.4%
## [1] 0.3971095
<- readRDS("List_motif_model_results_syntax_features.rds")
List_motif_model_results
<- c("motif_importance_minus_log2FC_wt_mut",
vars "Wt_motifAP1",
"Wt_motiftwist",
"Wt_motifTrl",
"Wt_motifETS",
"Wt_motifSREBP",
"motif_positioncenter",
"motif_positionboundaries",
paste0("flank_left_5"),
paste0("flank_left_4"),
paste0("flank_left_3"),
paste0("flank_left_2"),
paste0("flank_left_1"),
paste0("flank_right_1"),
paste0("flank_right_2"),
paste0("flank_right_3"),
paste0("flank_right_4"),
paste0("flank_right_5"),
paste0("Close_bin_AP1", c("Close", "Distal")),
paste0("Close_bin_GATAA", c("Close", "Distal")),
paste0("Close_bin_twist", c("Close", "Distal")),
paste0("Close_bin_Trl", c("Close", "Distal")),
paste0("Close_bin_Stat92E", c("Close", "Distal")),
paste0("Close_bin_ETS", c("Close", "Distal")),
paste0("Close_bin_SREBP", c("Close", "Distal")),
paste0("Close_bin_Atf2", c("Close", "Distal"))
)names(vars) <- c("wt motif importance",
"wt AP1",
"wt twist",
"wt Trl",
"wt ETS",
"wt SREBP",
paste0("Motif position - ", c("Center", "Boundaries")),
paste0("Flanking nt -5"),
paste0("Flanking nt -4"),
paste0("Flanking nt -3"),
paste0("Flanking nt -2"),
paste0("Flanking nt -1"),
paste0("Flanking nt +1"),
paste0("Flanking nt +2"),
paste0("Flanking nt +3"),
paste0("Flanking nt +4"),
paste0("Flanking nt +5"),
paste0("Dist to AP1 - ", c("Close", "Distal")),
paste0("Dist to GATAA - ", c("Close", "Distal")),
paste0("Dist to twist - ", c("Close", "Distal")),
paste0("Dist to Trl - ", c("Close", "Distal")),
paste0("Dist to Stat92E - ", c("Close", "Distal")),
paste0("Dist to ETS - ", c("Close", "Distal")),
paste0("Dist to SREBP - ", c("Close", "Distal")),
paste0("Dist to Atf2 - ", c("Close", "Distal"))
)
<- data.frame(row.names = names(sapply(List_motif_model_results, function(m) m[[2]]$adj.r.squared)),
motif_anno PCC = sapply(List_motif_model_results, function(m) m[[2]]$PCC))
# Get FDR-corrected p-value bin
<- do.call(rbind, lapply(names(List_motif_model_results), function(model){
summary_lm # Linear model - significance
<- as.data.frame(summary(List_motif_model_results[[model]][[2]]$Model_trained)$coefficients)
out "Motif"] <- model
out[,
out
}))$FDR <- p.adjust(summary_lm$`Pr(>|t|)`, method="fdr")
summary_lmgrep("flank", rownames(summary_lm)),1] <- abs(summary_lm[grep("flank", rownames(summary_lm)),1]) # for flanks use only pos, because direction has no direct meaning
summary_lm[
<- sapply(unique(summary_lm$Motif), function(model){
summary_lm
# Linear model - significance
<-summary_lm[summary_lm$Motif %in% model,]
m <- sapply(vars, function(x){
var_imp if(length(grep(x, rownames(m)))>0){
<- m[grep(x, rownames(m)),]
m2 <- m2[order(m2[,4]),]
m2 <- m2[1,6]
p if(p>=0.1) p2 <- ">0.1"
if(p<0.1) p2 <- "0.1"
if(p<0.05) p2 <- "0.05"
if(p<0.01) p2 <- "0.01"
if(p<0.001) p2 <- "0.001"
if(sign(m2[1,1])==1 & p<0.1) p2 <- paste0(p2, " (+)")
if(sign(m2[1,1])==-1 & p<0.1) p2 <- paste0(p2, " (-)")
return(p2)
else{return(NA)}
}
})
return(var_imp)
}) summary_lm
## GATAA AP1 Trl ETS
## wt motif importance "0.001 (+)" "0.001 (+)" "0.001 (+)" "0.001 (+)"
## wt AP1 NA NA "0.001 (+)" ">0.1"
## wt twist ">0.1" ">0.1" "0.01 (+)" "0.01 (+)"
## wt Trl ">0.1" "0.05 (+)" NA "0.05 (+)"
## wt ETS "0.01 (-)" "0.001 (-)" ">0.1" NA
## wt SREBP ">0.1" "0.05 (-)" ">0.1" ">0.1"
## Motif position - Center ">0.1" ">0.1" ">0.1" ">0.1"
## Motif position - Boundaries ">0.1" "0.05 (+)" ">0.1" ">0.1"
## Flanking nt -5 ">0.1" ">0.1" "0.05 (+)" ">0.1"
## Flanking nt -4 ">0.1" ">0.1" ">0.1" ">0.1"
## Flanking nt -3 ">0.1" ">0.1" "0.1 (+)" "0.1 (+)"
## Flanking nt -2 ">0.1" ">0.1" ">0.1" ">0.1"
## Flanking nt -1 "0.01 (+)" ">0.1" "0.001 (+)" ">0.1"
## Flanking nt +1 "0.001 (+)" ">0.1" "0.001 (+)" ">0.1"
## Flanking nt +2 "0.05 (+)" "0.1 (+)" ">0.1" ">0.1"
## Flanking nt +3 ">0.1" ">0.1" "0.1 (+)" ">0.1"
## Flanking nt +4 ">0.1" ">0.1" ">0.1" ">0.1"
## Flanking nt +5 ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to AP1 - Close ">0.1" ">0.1" ">0.1" "0.05 (+)"
## Dist to AP1 - Distal ">0.1" ">0.1" ">0.1" "0.1 (+)"
## Dist to GATAA - Close "0.001 (-)" ">0.1" "0.1 (+)" "0.001 (+)"
## Dist to GATAA - Distal ">0.1" ">0.1" ">0.1" "0.05 (+)"
## Dist to twist - Close ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to twist - Distal ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to Trl - Close ">0.1" "0.05 (-)" ">0.1" ">0.1"
## Dist to Trl - Distal ">0.1" "0.05 (-)" ">0.1" ">0.1"
## Dist to Stat92E - Close ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to Stat92E - Distal "0.1 (-)" ">0.1" ">0.1" "0.05 (-)"
## Dist to ETS - Close "0.1 (+)" ">0.1" ">0.1" "0.001 (-)"
## Dist to ETS - Distal ">0.1" "0.1 (+)" ">0.1" "0.01 (-)"
## Dist to SREBP - Close ">0.1" ">0.1" ">0.1" "0.1 (+)"
## Dist to SREBP - Distal "0.05 (+)" ">0.1" ">0.1" ">0.1"
## Dist to Atf2 - Close ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to Atf2 - Distal ">0.1" ">0.1" "0.05 (+)" ">0.1"
## SREBP Stat92E Atf2 twist
## wt motif importance "0.001 (+)" "0.001 (+)" "0.001 (+)" "0.001 (+)"
## wt AP1 ">0.1" "0.1 (+)" ">0.1" ">0.1"
## wt twist "0.001 (+)" "0.01 (+)" ">0.1" NA
## wt Trl "0.01 (+)" "0.05 (+)" "0.05 (+)" "0.1 (+)"
## wt ETS "0.001 (-)" ">0.1" "0.001 (-)" ">0.1"
## wt SREBP NA ">0.1" ">0.1" "0.1 (-)"
## Motif position - Center ">0.1" ">0.1" ">0.1" ">0.1"
## Motif position - Boundaries ">0.1" ">0.1" ">0.1" ">0.1"
## Flanking nt -5 ">0.1" ">0.1" ">0.1" "0.01 (+)"
## Flanking nt -4 ">0.1" ">0.1" "0.01 (+)" ">0.1"
## Flanking nt -3 ">0.1" "0.01 (+)" ">0.1" ">0.1"
## Flanking nt -2 "0.05 (+)" "0.001 (+)" "0.05 (+)" "0.001 (+)"
## Flanking nt -1 "0.001 (+)" "0.001 (+)" "0.001 (+)" "0.001 (+)"
## Flanking nt +1 ">0.1" "0.001 (+)" "0.001 (+)" "0.001 (+)"
## Flanking nt +2 ">0.1" "0.01 (+)" "0.05 (+)" "0.001 (+)"
## Flanking nt +3 ">0.1" ">0.1" ">0.1" ">0.1"
## Flanking nt +4 ">0.1" "0.01 (+)" "0.1 (+)" ">0.1"
## Flanking nt +5 "0.05 (+)" ">0.1" "0.1 (+)" ">0.1"
## Dist to AP1 - Close "0.01 (+)" "0.1 (+)" "0.1 (+)" ">0.1"
## Dist to AP1 - Distal ">0.1" ">0.1" ">0.1" "0.1 (+)"
## Dist to GATAA - Close ">0.1" "0.001 (+)" ">0.1" "0.01 (+)"
## Dist to GATAA - Distal ">0.1" "0.001 (+)" ">0.1" "0.1 (+)"
## Dist to twist - Close ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to twist - Distal ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to Trl - Close ">0.1" ">0.1" "0.1 (-)" ">0.1"
## Dist to Trl - Distal ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to Stat92E - Close ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to Stat92E - Distal ">0.1" "0.01 (-)" ">0.1" "0.1 (-)"
## Dist to ETS - Close ">0.1" ">0.1" ">0.1" "0.1 (+)"
## Dist to ETS - Distal ">0.1" ">0.1" ">0.1" "0.1 (+)"
## Dist to SREBP - Close ">0.1" "0.05 (+)" "0.1 (+)" "0.1 (+)"
## Dist to SREBP - Distal ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to Atf2 - Close ">0.1" ">0.1" ">0.1" "0.1 (+)"
## Dist to Atf2 - Distal ">0.1" ">0.1" "0.05 (+)" ">0.1"
### heatmap
# motifs in rows, features in columns
# add PCC
<- reshape2::melt(summary_lm)
out $Var2 <- factor(out$Var2, levels=rev(c("AP1", "GATAA", "twist", "Trl", "Stat92E", "ETS", "SREBP", "Atf2")))
out$value <- factor(out$value, levels=c("0.001 (-)",
out"0.01 (-)",
"0.05 (-)",
"0.1 (-)",
">0.1",
"0.1 (+)",
"0.05 (+)",
"0.01 (+)",
"0.001 (+)"))
<- ggplot(out, aes(Var1, Var2, fill=value)) +
g geom_tile(col="black") +
scale_fill_manual("linear model\nFDR (direction)", values=c("0.001 (-)"="#2166AC",
"0.01 (-)"="#4393C3",
"0.05 (-)"="#92C5DE",
"0.1 (-)"="#D1E5F0",
">0.1"="white",
"0.1 (+)"="#FDDBC7",
"0.05 (+)"="#F4A582",
"0.01 (+)"="#D6604D",
"0.001 (+)"="#B2182B"),
breaks=c("0.1 (-)",
"0.05 (-)",
"0.01 (-)",
"0.001 (-)",
">0.1",
"0.1 (+)",
"0.05 (+)",
"0.01 (+)",
"0.001 (+)"),
drop=F, na.value="grey80") +
guides(fill=guide_legend(nrow=2,byrow=TRUE)) +
theme(axis.line = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
axis.text.x = element_text(angle=90, hjust=1),
legend.position = "bottom")
$Var2 <- factor(rownames(motif_anno), levels=rev(c("AP1", "GATAA", "twist", "Trl", "Stat92E", "ETS", "SREBP", "Atf2")))
motif_anno
<- ggplot(motif_anno, aes("1", Var2, fill=PCC)) +
g_PCC geom_tile(col="black") +
scale_fill_gradient("PCC", low = "white", high = "green4", breaks = seq(0,10,0.2), limits=c(0,0.65)) +
theme(axis.line = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
axis.text = element_blank())
print(g + g_PCC + plot_layout(widths=c(30,1)))
<- readRDS("List_motif_model_results_syntax_features.rds")
List_motif_model_results
<- c(paste0("-5"),
myflanks paste0("-4"),
paste0("-3"),
paste0("-2"),
paste0("-1"),
paste0("+1"),
paste0("+2"),
paste0("+3"),
paste0("+4"),
paste0("+5")
)names(myflanks) <- c(paste0("flank_left_5"),
paste0("flank_left_4"),
paste0("flank_left_3"),
paste0("flank_left_2"),
paste0("flank_left_1"),
paste0("flank_right_1"),
paste0("flank_right_2"),
paste0("flank_right_3"),
paste0("flank_right_4"),
paste0("flank_right_5")
)
for(m in names(List_motif_model_results)[c(1,4)]){ # some examples
<- List_motif_model_results[[m]]$Data
tmp
<- ggplot(tmp, aes(Wt_motif, dev_log2FC_mut_swapped, fill=Wt_motif)) +
gg geom_boxplot() +
scale_fill_manual("Wt motif", values=motif_colours) +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to shuffled mutant", breaks = seq(-10,10,2)) +
scale_x_discrete("wt motif", breaks=levels(tmp$Wt_motif)) +
ggtitle(paste0(m, " pasted (n=", nrow(tmp), " instances)"))
print(gg)
# flanks
<- ggplot(reshape2::melt(tmp[,grep("dev_log2FC_mut_swapped|flank",names(tmp))], id.vars="dev_log2FC_mut_swapped"),
gg_flanks aes(variable, dev_log2FC_mut_swapped, fill=value)) +
geom_boxplot(alpha=0.8, outlier.size = -1) +
scale_fill_manual("", values=c('#109648', '#255C99', '#F7B32B', '#D62839')) +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to shuffled mutant", breaks = seq(-10,10,2)) +
scale_x_discrete("Motif flanks", labels=myflanks) +
ggtitle(paste0(m, " pasted (n=", nrow(tmp), " instances)"))
print(gg_flanks)
# partner motifs
for(m2 in gsub("_dist", "", grep("_dist", names(tmp), value = T))[c(7,3)]){ # some examples
<- table(tmp[, paste0("Close_bin_", m2)])
tt <- sapply(names(tt), function(t) paste0(t, "\n(", tt[t], ")"))
x_labels2 <- ggplot(tmp, aes(tmp[,paste0("Close_bin_", m2)], dev_log2FC_mut_swapped, alpha=tmp[,paste0("Close_bin_", m2)])) +
gg_motif_distance_summary geom_boxplot(fill=motif_colours[m]) +
# scale_fill_manual("Wt motif", values=motif_colours) +
scale_alpha_manual(paste0(m2, " distance"), values=c(0.2, 0.5, 1)) +
guides(alpha=F) +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to shuffled mutant", breaks = seq(-10,10,2)) +
scale_x_discrete(paste0(m2, " distance"), breaks=names(x_labels2), labels=x_labels2) +
ggtitle("Summary")
print(gg_motif_distance_summary)
}
}
<- readRDS("List_motif_model_results_syntax_features.rds")
List_motif_model_results
<- do.call(rbind, lapply(List_motif_model_results, function(m){
All_df return(m$Data)
}))table(All_df$Wt_motif, All_df$New_motif)
# multiple models (with caret) with motif counts, presence of other motifs, and motif flanks
# 10-fold CV
<- trainControl(method = "repeatedcv",
trctrl number = 10,
repeats = 1,
classProbs = T,
savePredictions = T)
<- c("rf")
model_list names(model_list) <- model_list
<- lapply(model_list, function(model){
model_predictions <- lapply(list(Only_motifs=All_df[,grep("dev_log2FC_mut_swapped|motif_importance_minus_log2FC_wt_mut|Wt_motif$|New_motif$", names(All_df))],
model_predictions_data All_syntax_features=All_df[,grep("dev_log2FC_mut_swapped|motif_importance_minus_log2FC_wt_mut|Wt_motif$|New_motif$|Close_bin_|flank_|motif_position", names(All_df))]
function(data){
),
if(length(grep("dev_log2FC_wt_swapped", names(data)))>0){ # model FC to wt, but it will be more confounded by the motif importance (mutation impact)
<- train(dev_log2FC_wt_swapped ~ . , data=data, method = model,
trained_model trControl=trctrl,
importance = TRUE,
# preProcess = c("center", "scale"),
tuneLength = 1)
else{# model directly the increment from mutant, less biased by motif importance
}<- train(dev_log2FC_mut_swapped ~ . , data=data, method = model,
trained_model trControl=trctrl,
importance = TRUE,
# preProcess = c("center", "scale"),
tuneLength = 1)
}
=varImp(trained_model, scale = F)$importance
feature_importance
return(list(Data=data,
Model_trained=trained_model,
Pred=trained_model$pred,
adj.r.squared=summary(lm(trained_model$pred$pred~trained_model$pred$obs))$adj.r.squared,
PCC=cor.test(trained_model$pred$pred, trained_model$pred$obs)$estimate,
results=trained_model$results,
feature_importance=feature_importance))
})return(model_predictions_data)
})
saveRDS(model_predictions, "List_motif_model_results_syntax_features_all_motifs.rds")
<- readRDS("List_motif_model_results_syntax_features_all_motifs.rds")
model_predictions
="rf"
modelfor(data in names(model_predictions[[model]])){
<- model_predictions[[model]][[data]]$Pred
tmp
<- cor.test(tmp$pred, tmp$obs,
pc method = "pearson", use="complete.obs")
if(length(grep("to_wt", data))>0) lab <- "log2 FC enh act to wt"
if(length(grep("to_wt", data))==0) lab <- "log2 FC enh act to mut"
<- ggplot(tmp, aes(pred, obs)) +
g2 geom_pointdensity() +
scale_color_gradient(low = "grey70", high = "grey20") +
guides(col=F) +
scale_y_continuous(paste0(lab, " [observed]"), breaks = seq(-10,10,2)) +
scale_x_continuous(paste0(lab, " [predicted]"), breaks = seq(-10,10,2)) +
ggtitle(paste0(model, " - ", data, " (10-fold CV)")) +
geom_abline(linetype="dashed", col="grey40") +
annotate("text", x=min(tmp$pred, na.rm=T), y = max(tmp$obs, na.rm=T),
label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5) +
theme(axis.text = element_text(colour="black", size=14),
axis.title = element_text(colour="black", size=16),
plot.title = element_text(colour="black", size=16))
## Imp features
<- model_predictions[[model]][[data]]$feature_importance
imp $Feature <- rownames(imp)
imp$Feature_type <- NA
imp$Feature_type[grep("motif_importance_minus_log2FC_wt_mut", imp$Feature)] <- "wt motif importance"
imp$Feature_type[grep("Wt_motif", imp$Feature)] <- "wt motif type"
imp$Feature_type[grep("New_motif", imp$Feature)] <- "New motif type"
imp$Feature_type[grep("flank_", imp$Feature)] <- "Motif flanks"
imp$Feature_type[grep("Close_bin_", imp$Feature)] <- "Motif distances"
imp$Feature_type[grep("motif_position", imp$Feature)] <- "Motif position"
imp$Feature_type <- factor(imp$Feature_type, levels=c("wt motif importance", "wt motif type", "New motif type", "Motif position", "Motif flanks", "Motif distances"))
imp
<- imp[order(imp$Feature_type),]
imp $Feature <- factor(imp$Feature, levels = rev(imp$Feature))
imp<- ggplot(imp, aes(Feature, Overall, fill=Feature_type)) +
barplot_all geom_col(width=0.8) +
scale_fill_manual("Features", values = c("wt motif importance"="#984ea3",
"wt motif type"="#e41a1c",
"New motif type"="#4daf4a",
"Motif position"="grey50",
"Motif flanks"="#ff7f00",
"Motif distances"="#a65628")) +
scale_y_continuous("Feature importance", expand = c(0,0)) +
xlab("") +
ggtitle("") +
theme(axis.text = element_text(colour="black", size=14),
axis.text.y = element_blank(),
axis.title = element_text(colour="black", size=16),
legend.title = element_text(colour="black", size=16),
legend.text = element_text(colour="black", size=14)) +
coord_flip() +
guides(fill=F)
<- imp[order(imp$Overall, decreasing = T),]
imp if(nrow(imp)>20) imp <- imp[1:20,]
$Feature <- factor(imp$Feature, levels = rev(imp$Feature))
imp
<- ggplot(imp, aes(Feature, Overall, fill=Feature_type)) +
barplot_summ geom_col(width=0.8) +
scale_fill_manual("Features", values = c("wt motif importance"="#984ea3",
"wt motif type"="#e41a1c",
"New motif type"="#4daf4a",
"Motif position"="grey50",
"Motif flanks"="#ff7f00",
"Motif distances"="#a65628")) +
scale_y_continuous("Feature importance", expand = c(0,0)) +
xlab("") +
ggtitle("") +
theme(axis.text.x = element_text(colour="black", size=14),
axis.text.y = element_text(colour="black", size=12),
axis.title = element_text(colour="black", size=16),
legend.title = element_text(colour="black", size=14),
legend.text = element_text(colour="black", size=12)) +
coord_flip()
print(barplot_all + barplot_summ + g2 + plot_layout(widths = c(1,2,3)))
}
<- readRDS("data/DeepSTARR_predictions_motif_pasting.rds")
df_main2
<- df_main2[complete.cases(df_main2$dev_log2FC_wt_mut) & complete.cases(df_main2$dev_log2FC_wt_swapped) & complete.cases(df_main2$dev_pred_log2FC_wt_swapped) & df_main2$dev_log2FC_wt_mut <= -1,]
df
$obs_log2FC_mut_swapped <- df$dev_activity-df$motif_mut_dev_activity
df$dev_pred_log2FC_mut_swapped <- df$Predictions_dev-(df$dev_pred_wt_activity+df$dev_pred_log2FC_wt_mut) # swapped - (WT-log2FC mut)
df<- df[!df$New_motif %in% "shuffle",]
df
<- ggplot(df, aes(obs_log2FC_mut_swapped, dev_pred_log2FC_mut_swapped)) +
main_scater geom_pointdensity(adjust = 1) +
scale_color_gradient(low = "grey70", high = "grey20") +
guides(col="none") +
geom_hline(yintercept = 0, linetype="dashed", col="grey70") +
geom_vline(xintercept = 0, linetype="dashed", col="grey70") +
scale_x_continuous(paste0("Observed (log2 FC enhancer activity to mut)"),
breaks=seq(-10,20,2)) +
scale_y_continuous(paste0("Predicted (log2 FC enhancer activity to mut)"),
breaks=seq(-10,20,2)) +
annotate("text", x=min(df$obs_log2FC_mut_swapped, na.rm=T), y = max(df$dev_pred_log2FC_mut_swapped, na.rm=T), label = paste0("PCC: ", round(cor(df$obs_log2FC_mut_swapped, df$dev_pred_log2FC_mut_swapped),2)),
vjust=1, hjust=0, size=6) +
theme(axis.text = element_text(colour="black", size=14),
axis.title = element_text(colour="black", size=16),
plot.title = element_text(size=11))
print(main_scater)
# per pasted motif
<- function(variable,value){
labeller3 return(sapply(levels(df$New_motif), function(m) paste0(m, " (", round(cor(df$obs_log2FC_mut_swapped[df$New_motif==m], df$dev_pred_log2FC_mut_swapped[df$New_motif==m]),2), ")"))[value])
}
<- ggplot(df, aes(obs_log2FC_mut_swapped, dev_pred_log2FC_mut_swapped, col=New_motif)) +
scater3 geom_point(alpha=0.5, size=1) +
scale_color_manual("", values=motif_colours[as.character(df$New_motif)]) +
guides(col="none") +
geom_hline(yintercept = 0, linetype="dashed", col="grey70") +
geom_vline(xintercept = 0, linetype="dashed", col="grey70") +
scale_x_continuous(paste0("Observed (log2 FC enhancer activity to mut)"),
breaks=seq(-10,20,2)) +
scale_y_continuous(paste0("Predicted (log2 FC enhancer activity to mut)"),
breaks=seq(-10,20,2)) +
# ggtitle(paste0("DeepSTARR predictions (per pasted motif)")) +
facet_wrap(~New_motif, labeller=labeller3) +
theme(axis.text = element_text(colour="black", size=14),
axis.title = element_text(colour="black", size=16),
strip.text = element_text(size=13))
print(scater3)