Required packages
require(tidyverse)
require(data.table)
require(BSgenome.Hsapiens.UCSC.hg19)
require(seqinr)
require(stringr)
require(GenomicRanges)
require(patchwork)
require(ggpointdensity)
require(gridExtra)
require(cowplot)
require(ggrepel)
require(Biostrings)
require(ggplot2)
theme_set(theme_classic() + theme(axis.text = element_text(colour = "black")))
### Motif colours
<- c("4 controls"="grey60",
motif_colours "neg_ctrl_AAGTTT"="grey60",
"neg_ctrl_ATTAGT"="grey60",
"neg_ctrl_ATAAG"="grey60",
"neg_ctrl_AGTTG"="grey60",
"ctrl_AAGTTT"="grey60",
"ctrl_ATTAGT"="grey60",
"ctrl_ATAAG"="grey60",
"ctrl_AGTTG"="grey60",
"AP1"="#BA3979",
"AP-1"="#BA3979",
"CREB1"="#377EB8",
"MAF"="#FF7F00", # 984EA3
"MECP2"="#984EA3", # E41A1C
"P53"="#4DAF4A",
"Ebox/MYC"="#FFFF33",
"EGR1"=rgb(255,102,102, maxColorValue = 255), #"#A65628"
"ETS"="#a6761d", # "#FF7F00"
"E2F1"="#F781BF",
"Random"="grey60",
"shuffle"="grey60")
require(plyranges)
# reduce motifs but keep scores
<- function(gr0, min.frac.ov=0.7){
my_reduce_with_score # Overlaps that achieve required overlap
<- findOverlaps(gr0, ignore.strand=T, minoverlap = ceiling(unique(width(gr0))*min.frac.ov))
hits # remove self hits
<- hits[queryHits(hits)!=subjectHits(hits)]
hits
### Do reduce just between the respective overlaps
if(length(c(queryHits(hits), subjectHits(hits)))>0){
<- reduce_ranges(gr0[unique(c(queryHits(hits), subjectHits(hits)))], max_score = max(score), sum_score = sum(score), Number_overlapping=n())
gr_red strand(gr_red) <- "+"
<- gr0[-unique(c(queryHits(hits), subjectHits(hits)))]
gr_no_ov if(length(gr_no_ov) > 0){
mcols(gr_no_ov) <- data.frame(max_score=gr_no_ov$score,
sum_score=gr_no_ov$score,
Number_overlapping=1)
else{mcols(gr_no_ov) <- NULL}
}
return(c(gr_no_ov, gr_red))
else{
}<- gr0
gr_no_ov mcols(gr_no_ov) <- data.frame(max_score=gr_no_ov$score,
sum_score=gr_no_ov$score,
Number_overlapping=1)
return(gr_no_ov)
}
}
### function to merge/reduce motif instances based on minimum motif overlap - but here I don't keep the score
<- function(gr0, min.frac.ov=0.7){
my_reduce # Overlaps that achieve required overlap
<- findOverlaps(gr0, ignore.strand=T, minoverlap = ceiling(unique(width(gr0))*min.frac.ov))
hits # remove self hits
<- hits[queryHits(hits)!=subjectHits(hits)]
hits
### Do reduce just between the respective overlaps
<- hits[queryHits(hits)<subjectHits(hits)]
hits
# remove queryhits (q) that are the subjecthits of other (inst1), and for whose its subjecthit is also a subjecthit of inst1
# this is important for MAFK instances in sequence chr15_60699706_60699954 - where two instances in opposite strands (palindrome) overlap another instance
<- queryHits(hits)[queryHits(hits) %in% subjectHits(hits)]
q <- q[subjectHits(hits)[queryHits(hits) %in% q] %in% subjectHits(hits)[!queryHits(hits) %in% q]]
q <- hits[!queryHits(hits) %in% q]
hits
# Reduce the ranges in 'gr0' that are connected via one or more hits in 'hits'. - and merge with the other ranges
# reduce per overlapping ranges
# is a bit slow because of the lapply per overlap
if(length(c(queryHits(hits), subjectHits(hits)))>0){
<- do.call("c", lapply(unique(queryHits(hits)), function(i){
gr1 <- hits[queryHits(hits)==i]
hits_tmp return(reduce(gr0[unique(c(queryHits(hits_tmp), subjectHits(hits_tmp)))], ignore.strand=T, min.gapwidth=0))
}))<- c(gr0[-c(queryHits(hits), subjectHits(hits))], gr1)
out <- unique(out) # if there is overlap between 3 ranges, it will do it twice and I need to unique it
out else{out <- gr0}
}return(out)
}
<- read.delim("data/Twistmix_20220113_oligo_info_paper.txt", stringsAsFactors = F)
Twistmix_HCT116_20220113_oligo_info
# experiment tables
<- read.delim("data/experiment.txt")
experiment_table $simple_name <- gsub("LibFR012_LibFR015_humantwistmix_", "", experiment_table$Outfile)
experiment_table
="UMI"
typefor(i in experiment_table$simple_name){
<- import.bed(paste0("data/", experiment_table$Outfile[experiment_table$simple_name %in% i], ".", type, ".bed"))
mapped
# choose sequences with correct length and strand (mapped with no reverse strand)
<- mapped[width(mapped)==249 & mapped@strand %in% "+"]
mapped_correct_length
# add counts info to Twistmix_HCT116_20220113_oligo_info table
<- merge(Twistmix_HCT116_20220113_oligo_info, as.data.frame(table(mapped_correct_length@seqnames)), by=1, all.x=T)
Twistmix_HCT116_20220113_oligo_info names(Twistmix_HCT116_20220113_oligo_info)[ncol(Twistmix_HCT116_20220113_oligo_info)] <- paste0(i, "_", type)
print(paste0(i, "_", type))
}
# correct NAs to 0 counts
16:ncol(Twistmix_HCT116_20220113_oligo_info)][is.na(Twistmix_HCT116_20220113_oligo_info[16:ncol(Twistmix_HCT116_20220113_oligo_info)])] <- 0
Twistmix_HCT116_20220113_oligo_info[
write.table(Twistmix_HCT116_20220113_oligo_info, paste0("Oligo_counts.txt"), sep="\t", quote=F, row.names = F)
<- read.delim("Oligo_counts.txt")
Twistmix_HCT116_20220113_oligo_info
="UMI"
f
= list()
plot_list_tmp
<- Twistmix_HCT116_20220113_oligo_info[,grep("rep._UMI", names(Twistmix_HCT116_20220113_oligo_info))]
Oligo_counts
# normalise to 1 million mapped fragments??? Or use counts?
<- as.data.frame(apply(Oligo_counts, 2, function(x) x/sum(x)*1e6))
Counts_per_million_cpm
for(id in c("input_rep", "STARRseq_rep")){
if(id=="STARRseq_rep") t="STARR-seq"
if(id=="input_rep") t="input"
<- Counts_per_million_cpm[,grep(id, names(Counts_per_million_cpm))]
df_tmp
<- list(a_b=c(names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[1]],
comparison_list names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[2]]),
a_c=c(names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[1]],
names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[3]]),
b_c=c(names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[2]],
names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[3]]))
for(x in names(comparison_list)){
<- comparison_list[[x]][1]
a <- comparison_list[[x]][2]
b
# PCC
<- cor.test(log10(df_tmp[apply(df_tmp,1,min)>0,a]),
pc log10(df_tmp[apply(df_tmp,1,min)>0,b]),
method = "pearson")
if(id=="input_rep") my_col=c("#d9d9d9","#525252")
if(id=="STARRseq_rep") my_col=c("#737E92","#063887")
# plot
<- ggplot(df_tmp, aes(df_tmp[,a], df_tmp[,b])) +
scater geom_pointdensity(adjust = 0.4, size=0.4) +
scale_color_gradient(low = my_col[1], high = my_col[2]) +
scale_x_log10(gsub("_", " ", a),
limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)),
breaks=c(0,1,10,100,1000)) +
scale_y_log10(gsub("_", " ", b),
limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)),
breaks=c(0,1,10,100,1000)) +
guides(color=F) +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5)) +
ggtitle(t) +
annotate("text", x=min(df_tmp[df_tmp!=0], na.rm=T), y = max(df_tmp, na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
<- ggplotGrob(scater)
plot_list_tmp[[x]]
}
# multiplot
print(gridExtra::grid.arrange(grobs = plot_list_tmp, nrow = 1))
}
<- read.delim("Oligo_counts.txt")
Count_table <- Count_table[,c(1:15,grep("UMI", names(Count_table)))]
Count_table rownames(Count_table) <- Count_table$Oligo_ID
library(DESeq2)
# save main table to add columns in the end
<- Count_table
Count_table_final
# only sequences with at least 10 reads in both inputs
table(rowSums(Count_table[,grep("input_", names(Count_table))]==0))
##
## 0 1 2 3
## 32413 100 74 663
<- Count_table[rowSums(Count_table[,grep("input_", names(Count_table))]<10)==0,]
Count_table_2
<- Count_table_2[,grep("input|STARR", names(Count_table_2), ignore.case = T)]
cts rownames(cts) <- Count_table_2$Oligo_ID
# add one count to sequences not found in RNA
==0] <- 1
cts[cts
# design
<- data.frame(type=factor(c(rep("Input",3), rep("Experiment",3)),levels=c("Input", "Experiment")),
coldata row.names = names(cts))
if (!identical(which(coldata$type=="Input"), grep("input", rownames(coldata)))){
print("Input in design matrix does not match input samples")
break
}if (!all(rownames(coldata) %in% colnames(cts))){
print("Rownames do not match colnames")
break
}if (!all(rownames(coldata) == colnames(cts))){
print("Rownames do not match colnames")
break
}
<- DESeqDataSetFromMatrix(countData = as.matrix(cts),
dds colData = coldata,
design= ~ type)
# use controls as sizeFactors - assumption that controls don't work
sizeFactors(dds)=estimateSizeFactorsForMatrix(as.matrix(cts[grep("Neg_region_", rownames(cts)),]))
<- DESeq(dds)
dds
# results
<- results(dds, alpha=0.05)
res
# merge with main table
<- as.data.frame(res)[,c(1,2,5,6)]
tmp <- merge(Count_table_final, tmp, by.x=1, by.y=0, all.x=T)
Count_table_final
write.table(Count_table_final, "Final_table_all_oligos.txt", sep="\t", quote=F, row.names = F)
# remove oligos with no activity values
<- Count_table_final[complete.cases(Count_table_final$log2FoldChange),]
Count_table_final
write.table(Count_table_final, "Final_table.txt", sep="\t", quote=F, row.names = F)
<- read.delim("Oligo_counts.txt")
Count_table <- Count_table[,c(1:15,grep("UMI", names(Count_table)))]
Count_table rownames(Count_table) <- Count_table$Oligo_ID
<- lapply(c(rep1="rep1", rep2="rep2", rep3="rep3"), function(r){
DESeq_results
# save main table to add columns in the end
<- Count_table
Count_table_final
# only sequences with at least 10 reads in both inputs
table(rowSums(Count_table[,grep("input_", names(Count_table))]==0))
<- Count_table[rowSums(Count_table[,grep("input_", names(Count_table))]<10)==0,]
Count_table_2
<- Count_table_2[,grep("input|STARR", names(Count_table_2), ignore.case = T)]
cts rownames(cts) <- Count_table_2$Oligo_ID
# add one count to sequences not found in RNA
==0] <- 1
cts[cts
# count table replicate-specific
<- cts[,c(1:3,grep(paste0("STARRseq_",r), names(cts), ignore.case = T))]
cts
# design
<- data.frame(type=factor(c(rep("Input",3), rep("Experiment",1)),levels=c("Input", "Experiment")),
coldata row.names = names(cts))
if (!identical(which(coldata$type=="Input"), grep("input", rownames(coldata)))){
print("Input in design matrix does not match input samples")
break
}if (!all(rownames(coldata) %in% colnames(cts))){
print("Rownames do not match colnames")
break
}if (!all(rownames(coldata) == colnames(cts))){
print("Rownames do not match colnames")
break
}
<- DESeqDataSetFromMatrix(countData = as.matrix(cts),
dds colData = coldata,
design= ~ type)
# use controls as sizeFactors - assumption that controls don't work
sizeFactors(dds)=estimateSizeFactorsForMatrix(as.matrix(cts[grep("Neg_region_", rownames(cts)),]))
<- DESeq(dds)
dds
# results
<- results(dds, alpha=0.05)
res
# merge with main table
<- as.data.frame(res)[,c(1,2,5,6)]
tmp names(tmp) <- paste0(r,"_",names(tmp))
return(tmp)
})
<- merge(Count_table_final, DESeq_results$rep1[,c(2,4)], by.x=1, by.y=0, all.x=T)
Count_table_final <- merge(Count_table_final, DESeq_results$rep2[,c(2,4)], by.x=1, by.y=0, all.x=T)
Count_table_final <- merge(Count_table_final, DESeq_results$rep3[,c(2,4)], by.x=1, by.y=0, all.x=T)
Count_table_final
# save final table
write.table(Count_table_final, "Final_table_all_oligos_and_enrichment_per_replicate.txt", sep="\t", quote = F, row.names = F)
<- read.delim("Final_table_all_oligos_and_enrichment_per_replicate.txt")
Count_table_final
= list()
plot_list_tmp for(e in 1:3){
if(e==1) v=c(1,2)
if(e==2) v=c(1,3)
if(e==3) v=c(2,3)
<- Count_table_final[,grep("rep._log2FoldChange", names(Count_table_final))[v]]
df_tmp
# PCC - not using zeros
<- cor.test(df_tmp[,1],
pc 2],
df_tmp[,method = "pearson", use="pairwise.complete.obs")
=c("#737E92","#063887")
sc_col
# plot
<- ggplot(df_tmp, aes(df_tmp[,1], df_tmp[,2])) +
scater geom_pointdensity(adjust = 0.7, size=0.6) +
scale_color_gradient(low = sc_col[1], high = sc_col[2]) +
scale_x_continuous(paste0("Enhancer activity - rep",v[1]), breaks = seq(-10,10,2)) +
scale_y_continuous(paste0("Enhancer activity - rep",v[2]), breaks = seq(-10,10,2)) +
guides(color=F) +
geom_abline(linetype="dashed", col="grey30") +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5)) +
annotate("text", x=min(df_tmp[,1], na.rm=T), y = max(df_tmp[,2], na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
= ggplotGrob(scater)
plot_list_tmp[[e]]
}
print(gridExtra::grid.arrange(grobs = plot_list_tmp, nrow = 1))
### activity of oligos
<- read.delim("Final_table.txt", stringsAsFactors = F)[,c(1:5,10:21,23)]
df_twist table(df_twist$Experiment)
<- df_twist[df_twist$Experiment %in% "Swap_motifs",]
REEFSTARRseq_Swap_motif_instances_sequences names(REEFSTARRseq_Swap_motif_instances_sequences)[9] <- "Mutation_log2FC_wt_mut_old_library"
names(REEFSTARRseq_Swap_motif_instances_sequences)[18] <- "Activity"
# add wt activity (match wt by strand) and calculate FC
$wt_activity <- sapply(REEFSTARRseq_Swap_motif_instances_sequences$Sequence_ID, function(e){
REEFSTARRseq_Swap_motif_instances_sequences<- unique(REEFSTARRseq_Swap_motif_instances_sequences$Strand[REEFSTARRseq_Swap_motif_instances_sequences$Sequence_ID %in% e])
str <- df_twist$log2FoldChange[df_twist$Sequence_ID %in% e & df_twist$Strand %in% str & df_twist$Experiment %in% "Wt_enhancers"]
activity if(length(activity)==0) return(NA) else{return(activity)}
})# calculate FC
$log2FC_wt_swapped <- REEFSTARRseq_Swap_motif_instances_sequences$Activity-REEFSTARRseq_Swap_motif_instances_sequences$wt_activity
REEFSTARRseq_Swap_motif_instances_sequences
### keep enhancers active (more active than top negative region)
<- REEFSTARRseq_Swap_motif_instances_sequences[complete.cases(REEFSTARRseq_Swap_motif_instances_sequences$wt_activity) &
REEFSTARRseq_Swap_motif_instances_sequences $wt_activity>max(df_twist$log2FoldChange[grep("Neg_region_", df_twist$Oligo_ID)]),]
REEFSTARRseq_Swap_motif_instances_sequenceslength(unique(REEFSTARRseq_Swap_motif_instances_sequences$Sequence_ID)) # 784 enhancers
table(REEFSTARRseq_Swap_motif_instances_sequences$Wt_motif, REEFSTARRseq_Swap_motif_instances_sequences$New_motif)
### add original motif mutant information, from de Almeida et al 2022
<- readRDS("data/mutation_data.rds")
mutation_data $mut_new_library <- df_twist$log2FoldChange[match(mutation_data$Oligo_ID, df_twist$Oligo_ID)] # update the motif mutation log2FC (below) with the data from these new STARR-seq runs and replicates
mutation_data
<- do.call(rbind, lapply(1:nrow(REEFSTARRseq_Swap_motif_instances_sequences), function(i){
mutated_instances <- REEFSTARRseq_Swap_motif_instances_sequences$Oligo_ID[i]
REEF_ID <- REEFSTARRseq_Swap_motif_instances_sequences$Sequence_ID[i]
enh <- REEFSTARRseq_Swap_motif_instances_sequences$Wt_motif[i]
m <- REEFSTARRseq_Swap_motif_instances_sequences$Motif_center[i]
ctr
<- mutation_data[mutation_data$Sequence_ID==enh & mutation_data$Motif %in% m & mutation_data$instance_start < ctr & mutation_data$instance_end > ctr,]
out
if(nrow(out)>1 & length(which(complete.cases(out$mut)))>0) out <- out[complete.cases(out$mut),]
# if mutated overlapping instance choose the closest one
if(nrow(out)>1) out <- out[order(abs((out$instance_start + (out$instance_end-out$instance_start)/2) -ctr)),][1,]
# output
if(nrow(out)==0) return(data.frame(REEF_ID=REEF_ID,
Oligo_ID=NA,
instance_start=NA,
instance_end=NA,
wt_instance=NA,
mutant_variant=NA,
mut=NA,
wgkmSVM_motif_sum,
wgkmSVM_motif_average)else{ return(cbind(REEF_ID, out[,c(2,28,29,31,32,35,33,34)])) }
)
}))
which(duplicated(mutated_instances$REEF_ID))
which(!complete.cases(mutated_instances$Oligo_ID))
names(mutated_instances) <- paste0("mut_inst_", names(mutated_instances))
names(mutated_instances)[7] <- "motif_mut_activity"
<- cbind(REEFSTARRseq_Swap_motif_instances_sequences, mutated_instances[,-1])
REEFSTARRseq_Swap_motif_instances_sequences_final
# calculate FC
$log2FC_wt_mut <- REEFSTARRseq_Swap_motif_instances_sequences_final$motif_mut_activity - REEFSTARRseq_Swap_motif_instances_sequences_final$wt_activity
REEFSTARRseq_Swap_motif_instances_sequences_finalhead(REEFSTARRseq_Swap_motif_instances_sequences_final)
### For each enhancer, add row with activity and log2FC for motif mutant
$Instance_ID <- sapply(strsplit(as.character(REEFSTARRseq_Swap_motif_instances_sequences_final$Oligo_ID),"_to_"), `[`, 1)
REEFSTARRseq_Swap_motif_instances_sequences_final<- do.call(rbind, lapply(unique(REEFSTARRseq_Swap_motif_instances_sequences_final$Instance_ID), function(i){
REEF_motif_mutants <- REEFSTARRseq_Swap_motif_instances_sequences_final[grep(i, REEFSTARRseq_Swap_motif_instances_sequences_final$Oligo_ID, fixed = T),][1,]
out $Oligo_ID <- paste0(i, "_to_mutant")
out$Sequence <- NA
out$New_motif <- "shuffle"
out$New_motif_sequence <- out$mut_inst_mutant_variant
out$Activity <- out$motif_mut_activity
out$log2FC_wt_swapped <- out$log2FC_wt_mut
out
return(out)
}))
<- rbind(REEFSTARRseq_Swap_motif_instances_sequences_final, REEF_motif_mutants)
REEFSTARRseq_Swap_motif_instances_sequences_final
length(unique(REEFSTARRseq_Swap_motif_instances_sequences_final$Sequence_ID)) # 784 different enhancers
length(unique(REEFSTARRseq_Swap_motif_instances_sequences_final$Instance_ID)) # 1475 different instances
table(REEFSTARRseq_Swap_motif_instances_sequences_final$Wt_motif, REEFSTARRseq_Swap_motif_instances_sequences_final$New_motif)
### correct motif factors
$Wt_motif <- factor(REEFSTARRseq_Swap_motif_instances_sequences_final$Wt_motif,
REEFSTARRseq_Swap_motif_instances_sequences_finallevels = c("FOSL1_MA0477.1", "TP53_MA0106.3", "MAFK_MA0496.2", "CREB1_MA0018.3", "ETS1_HUMAN.H11MO.0.A", "EGR1_C2H2_1", "MECP2_HUMAN.H11MO.0.C", "E2F1_HUMAN.H11MO.0.A"),
labels = c("AP-1", "P53", "MAF", "CREB1", "ETS", "EGR1", "MECP2", "E2F1"))
$New_motif <- factor(REEFSTARRseq_Swap_motif_instances_sequences_final$New_motif,
REEFSTARRseq_Swap_motif_instances_sequences_finallevels = c("shuffle", "FOSL1_MA0477.1", "TP53_MA0106.3", "MAFK_MA0496.2", "CREB1_MA0018.3", "ETS1_HUMAN.H11MO.0.A", "EGR1_C2H2_1", "MECP2_HUMAN.H11MO.0.C", "E2F1_HUMAN.H11MO.0.A"),
labels = c("shuffle", "AP-1", "P53", "MAF", "CREB1", "ETS", "EGR1", "MECP2", "E2F1"))
saveRDS(REEFSTARRseq_Swap_motif_instances_sequences_final, "REEFSTARRseq_Swap_motif_instances_sequences_final.rds")
library(motifmatchr)
library(PWMEnrich)
library(TFBSTools)
library(seqinr)
# Motifs
<- readRDS("data/TF_motif_human_list.rds")
TF_motif_human_list <- readRDS("data/Selected_motifs.rds")
TF_motifs
# All sequences
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final.rds") # disrupted and non-disrupted
df_main table(duplicated(df_main$Oligo_ID))
# motif positions in oligo, lenient threshold to make sure the wt motif was removed
<- matchMotifs(TF_motif_human_list$All_pwms_log_odds[name(TF_motif_human_list$All_pwms_log_odds) %in% TF_motifs$ID],
motif_ix_pos as.character(df_main$Sequence),
genome = "BSgenome.Hsapiens.UCSC.hg19", p.cutoff = 1e-3, bg="genome", out = "positions")
names(motif_ix_pos) <- name(TF_motif_human_list$All_pwms_log_odds[name(TF_motif_human_list$All_pwms_log_odds) %in% TF_motifs$ID])
library(plyranges)
<- lapply(motif_ix_pos, function(motif_x){
motif_ix_pos2 names(motif_x) <- as.character(df_main$Oligo_ID)
<- motif_x[sapply(motif_x, length)>0] # remove sequences without motif
motif_x # join all IRanges in same GRanges
<- GRanges(names(unlist(motif_x)), #Use sequence ID as seqnames
motif_x IRanges(start(unlist(motif_x)), end(unlist(motif_x))),
strand = mcols(unlist(motif_x))$strand,
score=mcols(unlist(motif_x))$score)
### do not reduce - I want to make sure the swappings disrupted thw motif
# add wt sequence
$enh_strand <- df_main$Strand[match(as.character(motif_x@seqnames), as.character(df_main$Oligo_ID))]
motif_x$seq <- substr(as.character(df_main$Sequence[match(as.character(motif_x@seqnames), as.character(df_main$Oligo_ID))]),
motif_x@ranges@start,
motif_x@ranges@start+motif_x@ranges@width-1)
motif_xreturn(motif_x)
}) # lapply(motif_ix_pos2, function(i) table(width(i)))
<- lapply(names(motif_ix_pos2), function(i){
motif_ix_pos3 <- data.frame(motif_ix_pos2[[i]], stringsAsFactors = F)
x names(x)[1] <- "Sequence"
$Sequence <- as.character(x$Sequence)
x$strand <- as.character(x$strand)
x$Motif <- TF_motifs$Motif[TF_motifs$ID %in% i]
xreturn(x)
})names(motif_ix_pos3) <- names(motif_ix_pos2)
sapply(motif_ix_pos2, length)
sapply(motif_ix_pos3, nrow)
<- do.call(rbind, motif_ix_pos3)
Endogenous_sequences_motif_positions_all $Motif <- gsub("AP1", "AP-1", Endogenous_sequences_motif_positions_all$Motif)
Endogenous_sequences_motif_positions_all
saveRDS(Endogenous_sequences_motif_positions_all, file = "Oligo_sequenuces_motif_positions_p1e-3.rds")
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final.rds")
df_main table(df_main$Wt_motif)
table(df_main$New_motif)
table(df_main$Wt_motif, df_main$New_motif)
### Get WT motif scores
<- readRDS("Oligo_sequenuces_motif_positions_p1e-3.rds")
Endogenous_sequences_motif_positions_all_wt $Motif <- gsub("AP1", "AP-1", Endogenous_sequences_motif_positions_all_wt$Motif)
Endogenous_sequences_motif_positions_all_wt$instance_center <- Endogenous_sequences_motif_positions_all_wt$start+(Endogenous_sequences_motif_positions_all_wt$end-Endogenous_sequences_motif_positions_all_wt$start)/2
Endogenous_sequences_motif_positions_all_wt
$WT_motif_score <- sapply(1:nrow(df_main), function(i){
df_main<- Endogenous_sequences_motif_positions_all_wt[Endogenous_sequences_motif_positions_all_wt$Sequence %in% df_main$Sequence_ID[i] & #same oligo
out $Motif == df_main$Wt_motif[i],]
Endogenous_sequences_motif_positions_all_wtreturn(out$max_score[order(abs(out$instance_center - df_main$Motif_center[i]), decreasing = T)][1])
})
### select only swaps that disrupt the wt motif - because if the WT remains we cannot interpret the result
# Motifs
<- readRDS("Oligo_sequenuces_motif_positions_p1e-3.rds") # less stringent threshold
Endogenous_sequences_motif_positions_all $instance_center <- Endogenous_sequences_motif_positions_all$start+(Endogenous_sequences_motif_positions_all$end-Endogenous_sequences_motif_positions_all$start)/2
Endogenous_sequences_motif_positions_all
$Motif_disrupted <- sapply(1:nrow(df_main), function(i){
df_mainnrow(Endogenous_sequences_motif_positions_all[Endogenous_sequences_motif_positions_all$Sequence %in% df_main$Oligo_ID[i] & #same oligo
$Motif == df_main$Wt_motif[i] & #same wt motif
Endogenous_sequences_motif_positions_allabs(Endogenous_sequences_motif_positions_all$instance_center - df_main$Motif_center[i]) <= 5,]) == 0 # near the original position
})$Motif_disrupted <- factor(df_main$Motif_disrupted, levels=c("TRUE", "FALSE"))
df_main
table(df_main$Motif_disrupted)
table(df_main$Wt_motif, df_main$Motif_disrupted)
table(df_main$New_motif, df_main$Motif_disrupted)
table(WT=df_main$Wt_motif, New=df_main$New_motif, df_main$Motif_disrupted)
# AP-1 -> MAF
# MAF -> AP-1
# P53 -> CREB1/MECP2
# MECP2 -> ETS
# ETS -> E2F1
# EGR1 -> MECP2/E2F1
### save table keeping only swaps that disrupt the wt motif
<- df_main[df_main$Motif_disrupted %in% "TRUE",]
out table(out$Wt_motif, out$New_motif)
<- out[!(out$Wt_motif=="P53" & out$New_motif=="MECP2"),] # pasting short MECP2 motif in middle of P53 motif likely does not mutate it, so better not include it
out <- out[!(out$Wt_motif=="P53" & out$New_motif=="CREB1"),] # pasting short CREB1 motif in middle of P53 motif likely does not mutate it, so better not include it
out
saveRDS(out, "REEFSTARRseq_Swap_motif_instances_sequences_final_wt_disrupted.rds")
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final_wt_disrupted.rds")
df_main
# filter for important instances
<- -1
log2FC_cutoff <- df_main[complete.cases(df_main$log2FC_wt_mut) & df_main$log2FC_wt_mut <= log2FC_cutoff,]
df $log2FC_mut_pasted <- df$Activity-df$motif_mut_activity
dftable(df$Wt_motif[!duplicated(df$Instance_ID)])
##
## AP-1 P53 MAF CREB1 ETS EGR1 MECP2 E2F1
## 293 224 246 100 263 110 41 77
length(unique(df$Instance_ID)) # 1354 instances tested
## [1] 1354
length(unique(df$Sequence_ID)) # in 753 enhancers tested
## [1] 753
<- df[,-c(4,9,21:25,27,28,31,32)]
out names(out)[18] <- "log2FC_wt_pasted"
write.table(out[,c(1:4,21,5:20,22)], "Supplementary_Table_6.txt", sep="\t", quote=F, row.names=F)
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final_wt_disrupted.rds")
df_main table(df_main$Wt_motif, df_main$New_motif)
##
## shuffle AP-1 P53 MAF CREB1 ETS EGR1 MECP2 E2F1
## AP-1 295 0 293 0 294 294 295 293 291
## P53 230 228 0 228 0 228 230 0 219
## MAF 262 114 209 0 207 209 208 209 207
## CREB1 112 109 112 112 0 112 112 112 112
## ETS 288 281 287 286 286 0 288 281 193
## EGR1 135 132 132 132 132 132 0 109 111
## MECP2 55 54 54 54 54 0 54 0 52
## E2F1 98 98 98 98 97 97 96 94 0
# filter for important instances
table(df_main$Wt_motif, df_main$log2FC_wt_mut <= -1)
##
## FALSE TRUE
## AP-1 0 2043
## P53 18 1327
## MAF 96 1507
## CREB1 80 797
## ETS 187 2003
## EGR1 184 831
## MECP2 91 286
## E2F1 167 609
table(df_main$Wt_motif, df_main$log2FC_wt_mut <= -2)
##
## FALSE TRUE
## AP-1 91 1952
## P53 378 967
## MAF 983 620
## CREB1 559 318
## ETS 1522 668
## EGR1 874 141
## MECP2 328 49
## E2F1 657 119
<- -1
log2FC_cutoff
<- df_main[complete.cases(df_main$log2FC_wt_mut) & df_main$log2FC_wt_mut <= log2FC_cutoff,]
df table(df$Wt_motif[!duplicated(df$Instance_ID)])
##
## AP-1 P53 MAF CREB1 ETS EGR1 MECP2 E2F1
## 293 224 246 100 263 110 41 77
# reorder levels
$New_motif <- factor(df$New_motif, levels = rev(df %>% group_by(New_motif) %>% summarise(act=median(Activity, na.rm=T)) %>% arrange(desc(act, )) %>% pull(New_motif)))
df
<- ggplot(df,
violin aes(New_motif, Activity, colour=New_motif, fill=New_motif)) +
geom_violin(size=0.7) +
geom_boxplot(col="black", fill=NA, size=0.7, outlier.size = -1, width=0.2) +
geom_violin(data=df[!duplicated(df$Instance_ID),], aes("WT enh", wt_activity), size=0.7, colour="orangered", fill="orangered") +
geom_boxplot(data=df[!duplicated(df$Instance_ID),], aes("WT enh", wt_activity), col="black", fill=NA, size=0.7, outlier.size = -1, width=0.2) +
scale_colour_manual(values=motif_colours) +
scale_fill_manual(values=motif_colours) +
guides(fill="none", col="none") +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 enhancer activity", breaks = seq(-10,10,2)) +
scale_x_discrete("", limits=c("WT enh", levels(df$New_motif)), labels=c("WT enh", "mutant", levels(df$New_motif)[-1])) +
ggtitle(paste0("Summary of individual motif swapping (n=",length(unique(df$Instance_ID))," inst each)"))
print(violin)
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final_wt_disrupted.rds")
df_main
# filter for important instances
table(df_main$Wt_motif, df_main$log2FC_wt_mut <= -1)
##
## FALSE TRUE
## AP-1 0 2043
## P53 18 1327
## MAF 96 1507
## CREB1 80 797
## ETS 187 2003
## EGR1 184 831
## MECP2 91 286
## E2F1 167 609
table(df_main$Wt_motif, df_main$log2FC_wt_mut <= -2)
##
## FALSE TRUE
## AP-1 91 1952
## P53 378 967
## MAF 983 620
## CREB1 559 318
## ETS 1522 668
## EGR1 874 141
## MECP2 328 49
## E2F1 657 119
<- -1
log2FC_cutoff
<- df_main[complete.cases(df_main$log2FC_wt_mut) & df_main$log2FC_wt_mut <= log2FC_cutoff,]
df table(df$Wt_motif[!duplicated(df$Instance_ID)])
##
## AP-1 P53 MAF CREB1 ETS EGR1 MECP2 E2F1
## 293 224 246 100 263 110 41 77
# reorder levels
$New_motif <- factor(df$New_motif, levels = c("shuffle", "P53", "CREB1", "AP-1", "MAF", "EGR1", "ETS", "MECP2", "E2F1"))
df
# all
# compare with summary results of pasting the different motif types individually
<- ggplot(df, aes(New_motif, log2FC_wt_swapped, colour=New_motif, fill=New_motif)) +
violin geom_violin(size=0.7) +
geom_boxplot(col="black", fill=NA, size=0.7, outlier.size = -1, width=0.2) +
scale_colour_manual(values=motif_colours) +
scale_fill_manual(values=motif_colours) +
guides(fill="none", col="none") +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to wt", breaks = seq(-10,10,2)) +
scale_x_discrete("New motif") +
ggtitle(paste0("Summary of individual motif swapping (n=",length(unique(df$Instance_ID))," inst each)"))
# compare to shuffle mutant
<- ggplot(df[!df$New_motif=="shuffle",], aes(New_motif, Activity-motif_mut_activity, colour=New_motif, fill=New_motif)) +
violin2 geom_violin(size=0.7) +
geom_boxplot(col="black", fill=NA, size=0.7, outlier.size = -1, width=0.2) +
scale_colour_manual(values=motif_colours) +
scale_fill_manual(values=motif_colours) +
guides(fill="none", col="none") +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to mutant", breaks = seq(-10,10,2)) +
scale_x_discrete("Pasted motif") +
ggtitle(paste0("Summary of individual motif swapping (n=",length(unique(df$Instance_ID))," inst each)"))
<- ggplot(df, aes(New_motif, Activity, colour=New_motif, fill=New_motif)) +
violin_WT geom_violin(size=0.7) +
geom_boxplot(col="black", fill=NA, size=0.7, outlier.size = -1, width=0.2) +
scale_colour_manual(values=motif_colours) +
scale_fill_manual(values=motif_colours) +
guides(fill="none", col="none") +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 enhancer activity", breaks = seq(-10,10,2)) +
scale_x_discrete("New motif") +
ggtitle(paste0("Summary of individual motif swapping (n=",length(unique(df$Instance_ID))," inst each)"))
print(violin2)
print(violin_WT)
print(violin)
# per wt motif
<- ggplot(df, aes(New_motif, log2FC_wt_swapped, fill=Wt_motif)) +
gg geom_boxplot(outlier.size = 0.4) +
scale_fill_manual("Replaced\nmotif", values=motif_colours[as.character(df$Wt_motif)]) +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to wt", breaks = seq(-10,10,2)) +
scale_x_discrete("Pasted motif", breaks=levels(df$New_motif)) +
ggtitle(paste0(length(unique(df$Instance_ID)), " instances"))
# compare to shuffled mutant
<- ggplot(df[!df$New_motif=="shuffle",], aes(New_motif, Activity-motif_mut_activity, fill=Wt_motif)) +
gg2 geom_boxplot(outlier.size = 0.4) +
scale_fill_manual("Replaced\nmotif", values=motif_colours[as.character(df$Wt_motif)]) +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to mutant", breaks = seq(-10,10,2)) +
scale_x_discrete("Pasted motif", breaks=levels(df$New_motif)) +
ggtitle(paste0(length(unique(df$Instance_ID)), " instances"))
print(gg2)
print(gg)
# per pasted motif
<- function(variable,value){
labeller1 return(sapply(levels(df$New_motif[!df$New_motif=="shuffle"]), function(m) paste0(m, " (", round(cor.test(df$motif_mut_activity[df$New_motif==m],
$Activity[df$New_motif==m]-df$motif_mut_activity[df$New_motif==m])$estimate,2), ")"))[value])
df
}
<- ggplot(df[!df$New_motif=="shuffle",], aes(motif_mut_activity, Activity-motif_mut_activity, colour=New_motif)) +
MA geom_point(size=0.9, alpha=0.7) +
scale_colour_manual(values=motif_colours) +
guides(fill="none", col="none") +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
geom_vline(xintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to mutant", breaks = seq(-10,10,2)) +
scale_x_continuous("Mutant enhancer activity [log2]") +
facet_wrap(~New_motif, labeller = labeller1, nrow = 2) +
theme(axis.text = element_text(colour="black", size=14),
axis.title = element_text(colour="black", size=16),
strip.text = element_text(size=13))
print(MA)
ggsave("MAplots_motif_mutated_activity.pdf", MA, width = 12, height = 6)
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final_wt_disrupted.rds")
df_main
# filter for important instances
<- -1
log2FC_cutoff
<- df_main[complete.cases(df_main$log2FC_wt_mut) & df_main$log2FC_wt_mut <= log2FC_cutoff,]
df table(df$Wt_motif[!duplicated(df$Instance_ID)])
##
## AP-1 P53 MAF CREB1 ETS EGR1 MECP2 E2F1
## 293 224 246 100 263 110 41 77
# crate matrix motif swapped vs new motif
$log2FC_mut_swapped <- df$Activity-df$motif_mut_activity
df<- data.table::dcast(df, Instance_ID + Wt_motif ~ New_motif, value.var = "log2FC_mut_swapped", fun=mean)[,-3]
df_matrix
### correlation heatmap - Pearson
require(pheatmap)
=seq(0,1,length.out = 200)
breaksList= colorRampPalette(rev(RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")))(length(breaksList))
mycol
<- cor(df_matrix[,-c(1,2)], method = "pearson", use = "pairwise.complete.obs")
cormat print(pheatmap(cormat,
clustering_distance_cols = as.dist(1 - cormat),
clustering_distance_rows = as.dist(1 - cormat),
fontsize_row = 10,
angle_col = 0,
#fontsize_col = 12,
show_colnames = F,
color=mycol,
breaks=breaksList,
border_color = NA,
main=paste0("Motif types - PCC")))
# pearson out
<- function(matrix){
cor_function <- cor(x = matrix, method = "pearson", use = "pairwise.complete.obs")
cors <- cors[upper.tri(cors, diag = FALSE)] # we're only interested in one of the off-diagonals, otherwise there'd be duplicates
cor.data <- as.data.frame(cor.data) # that's how ggplot likes it
cor.data colnames(cor.data) <- "corr"
return(cor.data$corr)
}
### pairwise comparison plots
# for(i1 in 3:(ncol(df_matrix)-1)){
# for(i2 in (i1+1):ncol(df_matrix)){
# example for AP-1
for(i1 in 3){
for(i2 in c(4,8,7)){
<- df_matrix
tmp
<- names(tmp)[i1]
m1 <- names(tmp)[i2]
m2
# PCC - not using zeros
<- cor.test(tmp[,m1],
pc
tmp[,m2],method = "pearson")
=c("grey60","orangered")
my_col
# colour points
$col <- ""
tmp$col[tmp[,m1] - tmp[,m2] >= 1] <- m1
tmp$col[tmp[,m2] - tmp[,m1] >= 1] <- m2
tmp$col <- factor(tmp$col, levels = c("", m1, m2))
tmp
# plot
<- ggplot(tmp, aes(tmp[,m1], tmp[,m2], col=col)) +
scater geom_point(size=1.5) +
scale_colour_manual("Preferential\nactivity", values=c("grey50", motif_colours[m1], motif_colours[m2])) +
guides(col="none") +
# geom_hline(yintercept = 0, linetype="dashed", col="grey75") +
# geom_vline(xintercept = 0, linetype="dashed", col="grey75") +
geom_abline(intercept = 0, linetype="dashed", slope = 1, col="grey20") +
scale_x_continuous(paste0(m1, " (log2 FC to mut)"),
breaks=seq(-10,20,2)) +
scale_y_continuous(paste0(m2, " (log2 FC to mut)"),
breaks=seq(-10,20,2)) +
theme_bw(base_size = 15) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5, size=15)) +
annotate("text", x=min(tmp[,m1], na.rm=T), y = max(tmp[,m2], na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
# colour by original motif
<- ggplot(df_matrix, aes(df_matrix[,m1], df_matrix[,m2], col=Wt_motif)) +
scater2 geom_point(size=1.5) +
scale_colour_manual("wt motif", values=motif_colours[as.character(df_matrix$Wt_motif)]) +
geom_hline(yintercept = 0, linetype="dashed", col="grey75") +
geom_vline(xintercept = 0, linetype="dashed", col="grey75") +
geom_abline(intercept = 0, slope = 1, col="grey20") +
scale_x_continuous(paste0(m1, " (log2 FC to mut)"),
breaks=seq(-10,20,2)) +
scale_y_continuous(paste0(m2, " (log2 FC to mut)"),
breaks=seq(-10,20,2)) +
theme_bw(base_size = 15) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour="black"),
plot.title = element_text(hjust=0.5, size=15))
print(scater)
print(scater2)
} }
require(pheatmap)
require(RColorBrewer)
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final_wt_disrupted.rds")
df_main
# filter for important instances
<- -1
log2FC_cutoff
<- df_main[complete.cases(df_main$log2FC_wt_mut) & df_main$log2FC_wt_mut <= log2FC_cutoff,]
df table(df$Wt_motif[!duplicated(df$Instance_ID)])
##
## AP-1 P53 MAF CREB1 ETS EGR1 MECP2 E2F1
## 293 224 246 100 263 110 41 77
# crate matrix motif swapped vs new motif
$log2FC_mut_swapped <- df$Activity-df$motif_mut_activity
df<- data.table::dcast(df, Instance_ID + Wt_motif ~ New_motif, value.var = "log2FC_mut_swapped", fun=mean)[,-3]
df_matrix
# Remove instances for which only 2 or less motif swapps didn't work
table(apply(is.na(df_matrix[,-c(1:2)]), 1, sum))
##
## 1 2 3 4 7 8
## 495 557 236 9 26 31
<- df_matrix[apply(is.na(df_matrix[,-c(1:2)]), 1, sum) <= 2,]
df_matrix
<- df_matrix[,-c(1:2)]
dat rownames(dat) <- df_matrix$Instance_ID
### annotations
<- data.frame("Wt motif" = df_matrix$Wt_motif,
instance_row row.names = df_matrix$Instance_ID)
# Pearson clustering heatmap
# Pairwise correlation between samples (columns)
<- cor(dat, use = "pairwise.complete.obs", method = "pearson")
cols.cor # Pairwise correlation between rows (genes)
<- cor(t(dat), use = "pairwise.complete.obs", method = "pearson")
rows.cor table(is.na(rows.cor))
##
## FALSE
## 1106704
# colours and breaks
library(RColorBrewer)
=1
s= seq(floor(min(dat, na.rm=T)), ceiling(max(dat, na.rm=T)), by = s)
myBreaks
<- c(colorRampPalette(
myColor c("#2166AC", "#4393C3", "#92C5DE",
"#D1E5F0"))(abs(floor(min(dat, na.rm=T))*(1/s))-1),
"white", "white",
colorRampPalette(
c("#FDDBC7", "#F4A582",
"#D6604D", "#B2182B"))((ceiling(max(dat, na.rm=T))*(1/s))-1))
<- pheatmap(dat,
heatmap_pearson color=myColor, breaks=myBreaks,
clustering_distance_cols = as.dist(1 - cols.cor),
clustering_distance_rows = as.dist(1 - rows.cor),
angle_col = 90,
border_color = NA,
legend_breaks=seq(-10,10,2),
show_rownames = F,
fontsize_col=11,
main=paste0("log2 FC to mut (", nrow(dat), " instances)"),
width = 5.5, height = 7
)
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final_wt_disrupted.rds")
df_main
# filter for important instances
<- -1
log2FC_cutoff
<- df_main[complete.cases(df_main$log2FC_wt_mut) & complete.cases(df_main$log2FC_wt_swapped) & df_main$log2FC_wt_mut <= log2FC_cutoff,]
df
# remove shuffled variant
<- df[!df$New_motif %in% "shuffle",]
df
### for different formulas
$log2FC_mut_swapped <- df$Activity-df$motif_mut_activity
df
# linear model
<- lm(log2FC_mut_swapped ~ log2FC_wt_mut + Wt_motif * New_motif, df)
fit
# How much variance explained by each feature
<- anova(fit)
af $PctExp <- af$"Sum Sq"/sum(af$"Sum Sq")*100
af$variable <- factor(rownames(af), levels = rev(c("log2FC_wt_mut", "Sequence_ID", "Instance_ID", "Wt_motif", "New_motif", "Wt_motif:New_motif", "Instance_ID:New_motif", "Residuals")),
aflabels = rev(c("wt motif imp.", "Sequence ID", "Instance ID", "WT motif", "Pasted motif", "WT:Pasted motif interaction", "Instance ID:Pasted motif", "Residuals")))
# barplot
<- ggplot(af, aes("", PctExp, fill=variable)) +
barplot geom_col() +
scale_fill_manual("Features", values = rev(c("wt motif imp."="#984ea3", "WT motif"="#e41a1c", "Pasted motif"="#377eb8", "WT:Pasted motif interaction"="#4daf4a", "Residuals"="grey60"))) +
scale_y_continuous("Variance explained [%]", breaks=seq(-100,100,20), expand = c(0,0)) +
ggtitle(paste0(nrow(df), " positions (adjR2: ", round(summary(fit)$adj.r.squared,2), ")")) +
theme(axis.text = element_text(colour="black", size=14),
axis.title = element_text(colour="black", size=16),
axis.title.x = element_blank(), axis.ticks.x = element_blank())
<- data.frame(obs=df$log2FC_mut_swapped,
out pred=predict(fit, df))
<- ggplot(out, aes(obs, pred)) +
scater geom_pointdensity(adjust = 1.3) +
scale_color_gradient(low = "grey70", high = "grey20") +
guides(col="none") +
geom_hline(yintercept = 0, linetype="dashed", col="grey70") +
geom_vline(xintercept = 0, linetype="dashed", col="grey70") +
geom_abline(intercept = 0, slope = 1, col="grey20") +
scale_x_continuous(paste0("Observed (log2 FC enhancer activity to mut)"),
breaks=seq(-10,20,2)) +
scale_y_continuous(paste0("Predicted (log2 FC enhancer activity to mut)"),
breaks=seq(-10,20,2)) +
theme(axis.text = element_text(colour="black", size=14),
axis.title = element_text(colour="black", size=16),
plot.title = element_text(size=11)) +
ggtitle(paste(as.character(fit$call$formula)[2], as.character(fit$call$formula)[1], as.character(fit$call$formula)[3])) +
annotate("text", x=min(out$obs, na.rm=T), y = max(out$pred, na.rm=T), label = paste0("PCC: ", round(cor(out$obs, out$pred),2)), vjust=1, hjust=0, size=5)
print(barplot + scater + plot_layout(widths = c(1, 5)))
library(motifmatchr)
library(PWMEnrich)
library(TFBSTools)
library(seqinr)
# Motifs
<- readRDS("data/TF_motif_human_list.rds")
TF_motif_human_list View(TF_motif_human_list$metadata)
<- readRDS("data/Selected_motifs.rds")
TF_motifs
# All sequences
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final.rds") # disrupted and non-disrupted
df_main table(duplicated(df_main$Oligo_ID))
# motif positions in oligo
<- matchMotifs(TF_motif_human_list$All_pwms_log_odds[name(TF_motif_human_list$All_pwms_log_odds) %in% TF_motifs$ID],
motif_ix_pos as.character(df_main$Sequence),
genome = "BSgenome.Hsapiens.UCSC.hg19", p.cutoff = 5e-4, bg="genome", out = "positions")
names(motif_ix_pos) <- name(TF_motif_human_list$All_pwms_log_odds[name(TF_motif_human_list$All_pwms_log_odds) %in% TF_motifs$ID])
library(plyranges)
<- lapply(motif_ix_pos, function(motif_x){
motif_ix_pos2 names(motif_x) <- as.character(df_main$Oligo_ID)
<- motif_x[sapply(motif_x, length)>0] # remove sequences without motif
motif_x # join all IRanges in same GRanges
<- GRanges(names(unlist(motif_x)), #Use sequence ID as seqnames
motif_x IRanges(start(unlist(motif_x)), end(unlist(motif_x))),
strand = mcols(unlist(motif_x))$strand,
score=mcols(unlist(motif_x))$score)
### reduce - keep strand information
# motif_x <- my_reduce(motif_x, min.frac.ov=0.7)
## instead, use plyranges::reduce_ranges to get motif score back - but here there is no min overlap
# motif_x <- reduce_ranges(motif_x, max_score = max(score), sum_score = sum(score), Number=n()) # bad because one bp overlapping is too stringent, use the adapted version below
<- my_reduce_with_score(motif_x, min.frac.ov=0.5)
motif_x # add wt sequence
$enh_strand <- df_main$Strand[match(as.character(motif_x@seqnames), as.character(df_main$Oligo_ID))]
motif_x$seq <- substr(as.character(df_main$Sequence[match(as.character(motif_x@seqnames), as.character(df_main$Oligo_ID))]),
motif_x@ranges@start,
motif_x@ranges@start+motif_x@ranges@width-1)
motif_xreturn(motif_x)
}) lapply(motif_ix_pos2, function(i) table(width(i)))
<- lapply(names(motif_ix_pos2), function(i){
motif_ix_pos3 <- data.frame(motif_ix_pos2[[i]], stringsAsFactors = F)
x names(x)[1] <- "Sequence"
$Sequence <- as.character(x$Sequence)
x$strand <- as.character(x$strand)
x$Motif <- TF_motifs$Motif[TF_motifs$ID %in% i]
xreturn(x)
})names(motif_ix_pos3) <- names(motif_ix_pos2)
sapply(motif_ix_pos2, length)
sapply(motif_ix_pos3, nrow)
<- do.call(rbind, motif_ix_pos3)
Endogenous_sequences_motif_positions_all $Motif <- gsub("AP1", "AP-1", Endogenous_sequences_motif_positions_all$Motif)
Endogenous_sequences_motif_positions_all
saveRDS(Endogenous_sequences_motif_positions_all, file = "Endogenous_sequences_motif_positions_all_oligos.rds")
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final_wt_disrupted.rds")
df_main
# filter for important instances
<- -1
log2FC_cutoff
<- df_main[complete.cases(df_main$log2FC_wt_mut) & complete.cases(df_main$log2FC_wt_swapped) & df_main$log2FC_wt_mut <= log2FC_cutoff,]
df
# remove shuffled variant
<- df[!df$New_motif %in% "shuffle",]
df
# Use E2F1 as reference for WTmotif because it seems to be the closest to the average
$Wt_motif <- relevel(df$Wt_motif, "E2F1")
df
# add FC to mutant
$log2FC_mut_swapped <- df$Activity-df$motif_mut_activity
df
# extend motif pasted sequence to get flanks
<- 5 # add 5bp on each side
motif_flank_length $instance_sequence_extended_5bp <- sapply(1:nrow(df), function(i){
dfsubstr(as.character(df$Sequence[i]),
$mut_inst_instance_start[i]-7,
df$mut_inst_instance_end[i]+7)
df
})$instance_sequence_up_5bp <- sapply(1:nrow(df), function(i){
dfsubstr(as.character(df$Sequence[i]),
$Motif_center[i]-(ceiling(nchar(as.character(df$New_motif_sequence[i]))/2))-motif_flank_length+1,
df$Motif_center[i]-(ceiling(nchar(as.character(df$New_motif_sequence[i]))/2)))
df
})$instance_sequence_down_5bp <- sapply(1:nrow(df), function(i){
dfsubstr(as.character(df$Sequence[i]),
$Motif_center[i]+(floor(nchar(as.character(df$New_motif_sequence[i]))/2)+1),
df$Motif_center[i]+(floor(nchar(as.character(df$New_motif_sequence[i]))/2)+1)+motif_flank_length-1)
df
})
# Motifs
<- readRDS("Endogenous_sequences_motif_positions_all_oligos.rds")
Endogenous_sequences_motif_positions_all $instance_center <- Endogenous_sequences_motif_positions_all$start+(Endogenous_sequences_motif_positions_all$end-Endogenous_sequences_motif_positions_all$start)/2
Endogenous_sequences_motif_positions_all
# multiple models (with caret) with motif counts, presence of other motifs, and motif flanks
# 10-fold CV
require(caret)
<- trainControl(method = "repeatedcv",
trctrl number = 10,
repeats = 1,
classProbs = T,
savePredictions = T)
### features important per motif
<- as.character(unique(df$New_motif))
cand names(cand) <- cand
<- lapply(cand, function(m){
model_list
<- df[df$New_motif %in% m,]
df_tmp
# motif flanks
<- df_tmp[nchar(df_tmp$instance_sequence_up_5bp)==median(nchar(df_tmp$instance_sequence_up_5bp)) & nchar(df_tmp$instance_sequence_down_5bp)==median(nchar(df_tmp$instance_sequence_down_5bp)),]
df_tmp <- as.data.frame(sapply(1:max(nchar(df_tmp$instance_sequence_up_5bp)), function(i) sapply(strsplit(df_tmp$instance_sequence_up_5bp,""), `[`, i)))
up <- as.data.frame(sapply(1:max(nchar(df_tmp$instance_sequence_down_5bp)), function(i) sapply(strsplit(df_tmp$instance_sequence_down_5bp,""), `[`, i)))
down names(up) <- paste0("flank_left_", 5:1)
names(down) <- paste0("flank_right_", 1:5)
<- cbind(df_tmp, up, down)
df_tmp
# motif distances
<- lapply(unique(Endogenous_sequences_motif_positions_all$Motif), function(m2){
add_dist <- sapply(1:nrow(df_tmp), function(s){
Distance <- Endogenous_sequences_motif_positions_all[Endogenous_sequences_motif_positions_all$Sequence %in% df_tmp$Oligo_ID[s] & Endogenous_sequences_motif_positions_all$Motif %in% m2,]
m2_df
# exclude the same instance
<- queryHits(findOverlaps(GRanges(m2_df$Sequence, IRanges(m2_df$start, m2_df$end)),
hi GRanges(df_tmp$Oligo_ID[s], IRanges(df_tmp$mut_inst_instance_start[s], df_tmp$mut_inst_instance_end[s])),
minoverlap = 4))
if(length(hi)>0){
if(df_tmp$New_motif[s]==m2) m2_df <- m2_df[-hi,]
# also the two motifs are similar, and PWM matching find each other
if((df_tmp$New_motif[s]=="ETS" & m2=="MECP2") | (df_tmp$New_motif[s]=="MECP2" & m2=="ETS") | (df_tmp$New_motif[s]=="AP-1" & m2=="MAF") | (df_tmp$New_motif[s]=="MAF" & m2=="AP-1")) m2_df <- m2_df[-hi,]
}
if(nrow(m2_df)>0){
<- m2_df$instance_center-df_tmp$Motif_center[s]
dist return(dist[abs(dist)==min(abs(dist))][1])
else{return(NA)}
}
})<- data.frame(abs(Distance))
out names(out) <- paste0(m2, "_dist")
# only close and distal
paste0("Close_", m2)] <- "NO"
out[,paste0("Close_", m2)][which(out[,1] <= 25)] <- "Close"
out[,paste0("Close_", m2)][which(out[,1] > 25)] <- "Distal"
out[,paste0("Close_", m2)] <- relevel(factor(out[,paste0("Close_", m2)]),
out[,"NO")
return(out)
})
# merge all features
<- cbind(df_tmp, do.call(cbind, add_dist))
df_tmp
# add position of the motif
$motif_position <- "center"
df_tmp$motif_position[df_tmp$Motif_center<=100 | df_tmp$Motif_center>=150] <- "flanks" # outside middle 100bp
df_tmp$motif_position[df_tmp$Motif_center<=74 | df_tmp$Motif_center>=175] <- "boundaries" # outside middle 100bp
df_tmp$motif_position <- factor(df_tmp$motif_position, levels = c("flanks", "center", "boundaries"))
df_tmptable(df_tmp$motif_position)
#use motif importance as the negative log2FC -> positive means more important
$motif_importance_minus_log2FC_wt_mut <- -df_tmp$log2FC_wt_mut
df_tmp
### models
<- lm(log2FC_mut_swapped ~ . , data=df_tmp[,grep("log2FC_mut_swapped|motif_importance_minus_log2FC_wt_mut|Wt_motif$|Close_|flank_|motif_position", names(df_tmp))])
model_log2FC_mut_swapped_and_importance_no_number_plus_flanks_model <- train(log2FC_mut_swapped ~ . , data=df_tmp[,grep("log2FC_mut_swapped|motif_importance_minus_log2FC_wt_mut|Wt_motif$|Close_|flank_|motif_position", names(df_tmp))], method = "lm",
model_log2FC_mut_swapped_and_importance_no_number_plus_flanks trControl=trctrl,
importance = TRUE,
# preProcess = c("center", "scale"),
tuneLength = 1)
<- model_log2FC_mut_swapped_and_importance_no_number_plus_flanks$pred
model_log2FC_mut_swapped_and_importance_no_number_plus_flanks_pred
return(list(Data=df_tmp,
model_log2FC_mut_swapped_and_importance_no_number_plus_flanks=list(Model_trained=model_log2FC_mut_swapped_and_importance_no_number_plus_flanks_model,
Pred=model_log2FC_mut_swapped_and_importance_no_number_plus_flanks_pred,
adj.r.squared=summary(lm(model_log2FC_mut_swapped_and_importance_no_number_plus_flanks_pred$pred~model_log2FC_mut_swapped_and_importance_no_number_plus_flanks_pred$obs))$adj.r.squared,
PCC=cor.test(model_log2FC_mut_swapped_and_importance_no_number_plus_flanks_pred$pred, model_log2FC_mut_swapped_and_importance_no_number_plus_flanks_pred$obs)$estimate)
))
})
saveRDS(model_list, "List_motif_model_results_syntax_features.rds")
<- readRDS("List_motif_model_results_syntax_features.rds")
model_list
<- c("motif_importance_minus_log2FC_wt_mut",
vars "Wt_motif",
"motif_position",
"flank_",
paste0("Close_AP-1"),
paste0("Close_P53"),
paste0("Close_MAF"),
paste0("Close_CREB1"),
paste0("Close_ETS"),
paste0("Close_EGR1"),
paste0("Close_MECP2"),
paste0("Close_E2F1")
)names(vars) <- c("wt motif importance",
"wt motif type",
"Motif position",
"Motif flanks",
paste0("Dist to AP-1"),
paste0("Dist to P53"),
paste0("Dist to MAF"),
paste0("Dist to CREB1"),
paste0("Dist to ETS"),
paste0("Dist to EGR1"),
paste0("Dist to MECP2"),
paste0("Dist to E2F1")
)
<- data.frame()
Var_table for(motif in names(model_list)){
for(model in names(model_list[[motif]])[[2]]){
<- model_list[[motif]][[model]]$Pred
tmp
<- cor.test(tmp$pred, tmp$obs,
pc method = "pearson", use="complete.obs")
<- ggplot(tmp, aes(pred, obs)) +
g2 geom_pointdensity() +
scale_color_gradient(low = "grey70", high = motif_colours[[motif]]) +
guides(col=F) +
scale_y_continuous(paste0("log2 FC enh act [observed]"), breaks = seq(-10,10,2)) +
scale_x_continuous(paste0("log2 FC enh act [predicted]"), breaks = seq(-10,10,2)) +
ggtitle(motif) +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
annotate("text", x=min(tmp$pred, na.rm=T), y = max(tmp$obs, na.rm=T),
label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5) +
theme(axis.text = element_text(colour="black", size=14),
axis.title = element_text(colour="black", size=16),
plot.title = element_text(colour="black", size=10))
<- model_list[[motif]][[model]]$Model_trained
model_tmp ## variance explained - average of 100 models - to get feature importance
<- model_list[[motif]]$Data
df1 <- df1[,grep("log2FC_mut_swapped|motif_importance_minus_log2FC_wt_mut|Wt_motif$|motif_position|flank_|Close_", names(df1))]
df1
<- sapply(1:100, function(i){
var_expl_all <- df1[,sample(ncol(df1))]
df3 <- lm(log2FC_mut_swapped~., data=df3)
fit <- anova(fit)
af $PctExp <- af$"Sum Sq"/sum(af$"Sum Sq")*100
af<- sapply(vars, function(x) sum(af$PctExp[grep(x, rownames(af))]))
var_expl
var_expl
})<- rowMeans(var_expl_all)
var_expl_all
<- data.frame(Features=names(var_expl_all),Perc=var_expl_all, stringsAsFactors = F)
tmp_avg $Features[grep("Dist", tmp_avg$Features)] <- "Motif distances"
tmp_avg$Features <- factor(tmp_avg$Features, levels=c("wt motif importance", "wt motif type", "Motif position", "Motif flanks", "Motif distances"))
tmp_avg
<- ggplot(tmp_avg, aes("Average", Perc, fill=Features)) +
barplot_avg geom_col(width=0.8) +
scale_fill_manual("Features", values = c("wt motif importance"="#984ea3",
"wt motif type"="#e41a1c",
"Motif position"="grey50",
"Motif flanks"="#ff7f00",
"Motif distances"="#a65628")) +
scale_y_continuous("Variance explained [%]", breaks=seq(-100,100,10), expand = c(0,0)) +
xlab("") +
ggtitle(paste0("adjR2: ", round(summary(model_tmp)$adj.r.squared,2))) +
theme(axis.text = element_text(colour="black", size=14),
axis.title = element_text(colour="black", size=16),
legend.title = element_text(colour="black", size=16),
legend.text = element_text(colour="black", size=14))
print(barplot_avg + g2 + plot_layout(widths = c(1, 6)))
$Motif=motif
tmp_avg$Model=model
tmp_avg
<- rbind(Var_table, tmp_avg)
Var_table
}print(motif)
}
## [1] "CREB1"
## [1] "E2F1"
## [1] "EGR1"
## [1] "ETS"
## [1] "MECP2"
## [1] "P53"
## [1] "AP-1"
## [1] "MAF"
# average variance explained
mean(Var_table[grep("Motif flanks", Var_table$Features),] %>% group_by(Motif, Model) %>% summarise(Perc=sum(Perc)) %>% pull(Perc)) #8.2%
## [1] 8.267161
mean(Var_table[grep("Motif distances", Var_table$Features),] %>% group_by(Motif, Model) %>% summarise(Perc=sum(Perc)) %>% pull(Perc)) #8.5%
## [1] 8.51985
<- readRDS("List_motif_model_results_syntax_features.rds")
List_motif_model_results
<- c("motif_importance_minus_log2FC_wt_mut",
vars "Wt_motifAP-1",
"Wt_motifP53",
"Wt_motifMAF",
"Wt_motifCREB1",
"Wt_motifETS",
"Wt_motifEGR1",
"Wt_motifMECP2",
"Wt_motifE2F1",
"motif_positioncenter",
"motif_positionboundaries",
paste0("flank_left_5"),
paste0("flank_left_4"),
paste0("flank_left_3"),
paste0("flank_left_2"),
paste0("flank_left_1"),
paste0("flank_right_1"),
paste0("flank_right_2"),
paste0("flank_right_3"),
paste0("flank_right_4"),
paste0("flank_right_5"),
paste0("`Close_AP-1`", c("Close", "Distal")),
paste0("Close_P53", c("Close", "Distal")),
paste0("Close_MAF", c("Close", "Distal")),
paste0("Close_CREB1", c("Close", "Distal")),
paste0("Close_ETS", c("Close", "Distal")),
paste0("Close_EGR1", c("Close", "Distal")),
paste0("Close_MECP2", c("Close", "Distal")),
paste0("Close_E2F1", c("Close", "Distal"))
)names(vars) <- c("wt motif importance",
"wt AP-1",
"wt P53",
"wt MAF",
"wt CREB1",
"wt ETS",
"wt EGR1",
"wt MECP2",
"wt E2F1",
paste0("Motif position - ", c("Center", "Boundaries")),
paste0("Flanking nt -5"),
paste0("Flanking nt -4"),
paste0("Flanking nt -3"),
paste0("Flanking nt -2"),
paste0("Flanking nt -1"),
paste0("Flanking nt +1"),
paste0("Flanking nt +2"),
paste0("Flanking nt +3"),
paste0("Flanking nt +4"),
paste0("Flanking nt +5"),
paste0("Dist to AP-1 - ", c("Close", "Distal")),
paste0("Dist to P53 - ", c("Close", "Distal")),
paste0("Dist to MAF - ", c("Close", "Distal")),
paste0("Dist to CREB1 - ", c("Close", "Distal")),
paste0("Dist to ETS - ", c("Close", "Distal")),
paste0("Dist to EGR1 - ", c("Close", "Distal")),
paste0("Dist to MECP2 - ", c("Close", "Distal")),
paste0("Dist to E2F1 - ", c("Close", "Distal"))
)
<- data.frame(row.names = names(sapply(List_motif_model_results, function(m) m[[2]]$adj.r.squared)),
motif_anno PCC = sapply(List_motif_model_results, function(m) m[[2]]$PCC))
# Get FDR-corrected p-value bin
<- do.call(rbind, lapply(names(List_motif_model_results), function(model){
summary_lm # Linear model - significance
<- as.data.frame(summary(List_motif_model_results[[model]][[2]]$Model_trained)$coefficients)
out "Motif"] <- model
out[,
out
}))$FDR <- p.adjust(summary_lm$`Pr(>|t|)`, method="fdr")
summary_lmgrep("flank", rownames(summary_lm)),1] <- abs(summary_lm[grep("flank", rownames(summary_lm)),1]) # for flanks use only pos, because direction has no direct meaning
summary_lm[
<- sapply(unique(summary_lm$Motif), function(model){
summary_lm
# Linear model - significance
<-summary_lm[summary_lm$Motif %in% model,]
m <- sapply(vars, function(x){
var_imp if(length(grep(x, rownames(m)))>0){
<- m[grep(x, rownames(m)),]
m2 <- m2[order(m2[,4]),]
m2 <- m2[1,6]
p if(p>=0.1) p2 <- ">0.1"
if(p<0.1) p2 <- "0.1"
if(p<0.05) p2 <- "0.05"
if(p<0.01) p2 <- "0.01"
if(p<0.001) p2 <- "0.001"
if(sign(m2[1,1])==1 & p<0.1) p2 <- paste0(p2, " (+)")
if(sign(m2[1,1])==-1 & p<0.1) p2 <- paste0(p2, " (-)")
return(p2)
else{return(NA)}
}
})
return(var_imp)
}) summary_lm
## CREB1 E2F1 EGR1 ETS
## wt motif importance "0.001 (+)" "0.001 (+)" "0.001 (+)" "0.001 (+)"
## wt AP-1 ">0.1" NA ">0.1" "0.001 (-)"
## wt P53 NA "0.001 (+)" "0.05 (+)" ">0.1"
## wt MAF ">0.1" ">0.1" "0.05 (-)" "0.001 (-)"
## wt CREB1 NA ">0.1" "0.05 (+)" "0.05 (-)"
## wt ETS "0.05 (-)" ">0.1" "0.05 (-)" NA
## wt EGR1 "0.1 (+)" "0.001 (+)" NA ">0.1"
## wt MECP2 ">0.1" ">0.1" "0.1 (-)" NA
## wt E2F1 NA NA NA NA
## Motif position - Center "0.001 (-)" "0.01 (-)" "0.01 (-)" "0.01 (-)"
## Motif position - Boundaries ">0.1" ">0.1" ">0.1" "0.1 (-)"
## Flanking nt -5 ">0.1" ">0.1" ">0.1" ">0.1"
## Flanking nt -4 ">0.1" ">0.1" ">0.1" ">0.1"
## Flanking nt -3 ">0.1" ">0.1" "0.01 (+)" ">0.1"
## Flanking nt -2 "0.05 (+)" "0.05 (+)" ">0.1" ">0.1"
## Flanking nt -1 ">0.1" "0.05 (+)" "0.05 (+)" "0.01 (+)"
## Flanking nt +1 "0.01 (+)" "0.01 (+)" "0.1 (+)" "0.01 (+)"
## Flanking nt +2 "0.001 (+)" ">0.1" "0.05 (+)" ">0.1"
## Flanking nt +3 ">0.1" "0.05 (+)" "0.01 (+)" ">0.1"
## Flanking nt +4 ">0.1" ">0.1" ">0.1" "0.05 (+)"
## Flanking nt +5 ">0.1" ">0.1" "0.05 (+)" ">0.1"
## Dist to AP-1 - Close "0.001 (+)" "0.001 (+)" "0.001 (+)" "0.001 (+)"
## Dist to AP-1 - Distal "0.001 (+)" "0.05 (+)" "0.001 (+)" "0.001 (+)"
## Dist to P53 - Close ">0.1" ">0.1" ">0.1" "0.1 (-)"
## Dist to P53 - Distal ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to MAF - Close ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to MAF - Distal ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to CREB1 - Close ">0.1" ">0.1" "0.1 (+)" ">0.1"
## Dist to CREB1 - Distal "0.05 (-)" ">0.1" ">0.1" ">0.1"
## Dist to ETS - Close "0.05 (+)" ">0.1" "0.05 (+)" "0.05 (-)"
## Dist to ETS - Distal ">0.1" ">0.1" "0.01 (+)" ">0.1"
## Dist to EGR1 - Close ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to EGR1 - Distal ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to MECP2 - Close ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to MECP2 - Distal ">0.1" ">0.1" ">0.1" "0.05 (-)"
## Dist to E2F1 - Close ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to E2F1 - Distal ">0.1" ">0.1" ">0.1" ">0.1"
## MECP2 P53 AP-1 MAF
## wt motif importance "0.001 (+)" "0.001 (+)" "0.001 (+)" "0.001 (+)"
## wt AP-1 "0.001 (-)" ">0.1" NA NA
## wt P53 NA NA ">0.1" ">0.1"
## wt MAF "0.001 (-)" ">0.1" ">0.1" NA
## wt CREB1 "0.05 (-)" ">0.1" ">0.1" "0.01 (+)"
## wt ETS "0.01 (-)" ">0.1" "0.05 (-)" "0.05 (-)"
## wt EGR1 ">0.1" ">0.1" ">0.1" ">0.1"
## wt MECP2 NA ">0.1" "0.05 (-)" ">0.1"
## wt E2F1 NA NA NA NA
## Motif position - Center "0.05 (-)" "0.05 (-)" "0.001 (-)" "0.001 (-)"
## Motif position - Boundaries ">0.1" ">0.1" ">0.1" ">0.1"
## Flanking nt -5 "0.01 (+)" ">0.1" ">0.1" ">0.1"
## Flanking nt -4 "0.05 (+)" ">0.1" ">0.1" ">0.1"
## Flanking nt -3 ">0.1" ">0.1" "0.1 (+)" ">0.1"
## Flanking nt -2 "0.05 (+)" ">0.1" "0.1 (+)" "0.1 (+)"
## Flanking nt -1 "0.05 (+)" "0.001 (+)" ">0.1" ">0.1"
## Flanking nt +1 ">0.1" "0.01 (+)" "0.05 (+)" "0.1 (+)"
## Flanking nt +2 "0.05 (+)" "0.1 (+)" ">0.1" "0.01 (+)"
## Flanking nt +3 ">0.1" ">0.1" ">0.1" "0.1 (+)"
## Flanking nt +4 ">0.1" ">0.1" ">0.1" ">0.1"
## Flanking nt +5 ">0.1" "0.05 (+)" "0.05 (+)" ">0.1"
## Dist to AP-1 - Close "0.001 (+)" ">0.1" "0.001 (+)" "0.001 (+)"
## Dist to AP-1 - Distal "0.1 (+)" ">0.1" "0.001 (+)" "0.001 (+)"
## Dist to P53 - Close "0.05 (+)" ">0.1" "0.1 (+)" ">0.1"
## Dist to P53 - Distal ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to MAF - Close ">0.1" ">0.1" ">0.1" "0.01 (-)"
## Dist to MAF - Distal ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to CREB1 - Close ">0.1" ">0.1" ">0.1" "0.05 (+)"
## Dist to CREB1 - Distal ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to ETS - Close ">0.1" ">0.1" "0.001 (+)" "0.1 (+)"
## Dist to ETS - Distal ">0.1" ">0.1" "0.05 (+)" ">0.1"
## Dist to EGR1 - Close ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to EGR1 - Distal ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to MECP2 - Close ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to MECP2 - Distal ">0.1" ">0.1" ">0.1" "0.1 (+)"
## Dist to E2F1 - Close ">0.1" ">0.1" ">0.1" ">0.1"
## Dist to E2F1 - Distal ">0.1" ">0.1" ">0.1" ">0.1"
### heatmap
# motifs in rows, features in columns
# add PCC
<- reshape2::melt(summary_lm)
out $Var2 <- factor(out$Var2, levels=rev(c("AP-1", "P53", "MAF", "CREB1", "ETS", "EGR1", "MECP2", "E2F1")))
out$value <- factor(out$value, levels=c("0.001 (-)",
out"0.01 (-)",
"0.05 (-)",
"0.1 (-)",
">0.1",
"0.1 (+)",
"0.05 (+)",
"0.01 (+)",
"0.001 (+)"))
<- ggplot(out, aes(Var1, Var2, fill=value)) +
g geom_tile(col="black") +
scale_fill_manual("linear model\nFDR (direction)", values=c("0.001 (-)"="#2166AC",
"0.01 (-)"="#4393C3",
"0.05 (-)"="#92C5DE",
"0.1 (-)"="#D1E5F0",
">0.1"="white",
"0.1 (+)"="#FDDBC7",
"0.05 (+)"="#F4A582",
"0.01 (+)"="#D6604D",
"0.001 (+)"="#B2182B"),
breaks=c("0.1 (-)",
"0.05 (-)",
"0.01 (-)",
"0.001 (-)",
">0.1",
"0.1 (+)",
"0.05 (+)",
"0.01 (+)",
"0.001 (+)"),
drop=F, na.value="grey80") +
guides(fill=guide_legend(nrow=2,byrow=TRUE)) +
theme(axis.line = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
axis.text.x = element_text(angle=90, hjust=1),
legend.position = "bottom")
$Var2 <- factor(rownames(motif_anno), levels=rev(c("AP-1", "P53", "MAF", "CREB1", "ETS", "EGR1", "MECP2", "E2F1")))
motif_anno
<- ggplot(motif_anno, aes("1", Var2, fill=PCC)) +
g_PCC geom_tile(col="black") +
scale_fill_gradient("PCC", low = "white", high = "green4", breaks = seq(0,10,0.2), limits=c(0,0.70)) +
theme(axis.line = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
axis.text = element_blank())
print(g_PCC + g + plot_layout(widths=c(1,30)))
<- readRDS("List_motif_model_results_syntax_features.rds")
List_motif_model_results
<- readRDS("REEFSTARRseq_Swap_motif_instances_sequences_final_wt_disrupted.rds")
df_main <- df_main[complete.cases(df_main$log2FC_wt_mut) & df_main$log2FC_wt_mut <= -1,]
df_main
<- c(paste0("-5"),
myflanks paste0("-4"),
paste0("-3"),
paste0("-2"),
paste0("-1"),
paste0("+1"),
paste0("+2"),
paste0("+3"),
paste0("+4"),
paste0("+5")
)names(myflanks) <- c(paste0("flank_left_5"),
paste0("flank_left_4"),
paste0("flank_left_3"),
paste0("flank_left_2"),
paste0("flank_left_1"),
paste0("flank_right_1"),
paste0("flank_right_2"),
paste0("flank_right_3"),
paste0("flank_right_4"),
paste0("flank_right_5")
)
for(m in names(List_motif_model_results)[c(6,7,4)]){ # some examples
<- List_motif_model_results[[m]]$Data
tmp $Wt_motif <- factor(tmp$Wt_motif, levels=c("AP-1", "P53", "MAF", "CREB1", "ETS", "EGR1", "MECP2", "E2F1"))
tmp
<- df_main[df_main$Wt_motif %in% m & complete.cases(df_main$log2FC_wt_mut),]
tmp2 <- tmp2[!duplicated(tmp2$Instance_ID),]
tmp2
<- ggplot(tmp, aes(Wt_motif, log2FC_mut_swapped, fill=Wt_motif)) +
gg geom_boxplot() +
scale_fill_manual("Wt motif", values=motif_colours) +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to shuffled mutant", breaks = seq(-10,10,2)) +
scale_x_discrete("wt motif", breaks=levels(tmp$Wt_motif), drop=F) +
ggtitle(paste0(m, " pasted (n=", nrow(tmp), " instances)")) +
theme(axis.text.x = element_text(size=11)) +
geom_boxplot(data = tmp2, aes(Wt_motif, -log2FC_wt_mut)) # to include its own instances
print((gg + guides(fill=F)) + gg + plot_layout(widths = c(1.9,1)))
# flanks
<- ggplot(reshape2::melt(tmp[,grep("log2FC_mut_swapped|flank",names(tmp))], id.vars="log2FC_mut_swapped"),
gg_flanks aes(variable, log2FC_mut_swapped, fill=value)) +
geom_boxplot(alpha=0.8, outlier.size = -1) +
scale_fill_manual("", values=c('#109648', '#255C99', '#F7B32B', '#D62839')) +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to shuffled mutant", breaks = seq(-10,10,2)) +
scale_x_discrete("Motif flanks", labels=myflanks) +
ggtitle(paste0(m, " pasted (n=", nrow(tmp), " instances)"))
print(gg_flanks)
# partner motifs - distance
for(m2 in gsub("_dist", "", grep("_dist", names(tmp), value = T))[c(1,7)]){ # some examples
<- table(tmp[, paste0("Close_", m2)])
tt <- sapply(names(tt), function(t) paste0(t, "\n(", tt[t], ")"))
x_labels2 <- ggplot(tmp, aes(tmp[,paste0("Close_", m2)], log2FC_mut_swapped, alpha=tmp[,paste0("Close_", m2)])) +
gg_motif_distance_summary geom_boxplot(fill=motif_colours[m]) +
# scale_fill_manual("Wt motif", values=motif_colours) +
scale_alpha_manual(paste0(m2, " distance"), values=c(0.2, 0.5, 1)) +
guides(alpha=F) +
geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
scale_y_continuous("log2 FC enhancer activity to shuffled mutant", breaks = seq(-10,10,2)) +
scale_x_discrete(paste0(m2, " distance"), breaks=names(x_labels2), labels=x_labels2) +
ggtitle("Summary")
print(gg_motif_distance_summary)
}
}
library(caret)
# combine all data and features
<- readRDS("List_motif_model_results_syntax_features.rds")
List_motif_model_results <- do.call(rbind, lapply(List_motif_model_results, function(m){
All_df return(m$Data)
}))table(All_df$Wt_motif, All_df$New_motif)
# multiple models (with caret) with motif flanks, presence of other motifs, and motif flanks
# 10-fold CV
<- trainControl(method = "repeatedcv",
trctrl number = 10,
repeats = 1,
classProbs = T,
savePredictions = T)
<- c("rf")
model_list names(model_list) <- model_list
<- lapply(model_list, function(model){
model_predictions <- lapply(list(Only_motifs=All_df[,grep("log2FC_mut_swapped|motif_importance_minus_log2FC_wt_mut|Wt_motif$|New_motif$", names(All_df))],
model_predictions_data All_syntax_features=All_df[,grep("log2FC_mut_swapped|motif_importance_minus_log2FC_wt_mut|Wt_motif$|New_motif$|Close_|flank_|motif_position", names(All_df))]
function(data){
),
<- train(log2FC_mut_swapped ~ . , data=data, method = model,
trained_model trControl=trctrl,
importance = TRUE,
# preProcess = c("center", "scale"),
tuneLength = 1)
=varImp(trained_model, scale = F)$importance
feature_importance
return(list(Data=data,
Model_trained=trained_model,
Pred=trained_model$pred,
adj.r.squared=summary(lm(trained_model$pred$pred~trained_model$pred$obs))$adj.r.squared,
PCC=cor.test(trained_model$pred$pred, trained_model$pred$obs)$estimate,
results=trained_model$results,
feature_importance=feature_importance))
})return(model_predictions_data)
})saveRDS(model_predictions, "List_motif_model_results_syntax_features_all_motifs.rds")
<- readRDS("List_motif_model_results_syntax_features_all_motifs.rds")
model_predictions
="rf"
modelfor(data in names(model_predictions[[model]])){
<- model_predictions[[model]][[data]]$Pred
tmp
<- cor.test(tmp$pred, tmp$obs,
pc method = "pearson", use="complete.obs")
if(length(grep("to_wt", data))>0) lab <- "log2 FC enh act to wt"
if(length(grep("to_wt", data))==0) lab <- "log2 FC enh act to mut"
<- ggplot(tmp, aes(pred, obs)) +
g2 geom_pointdensity() +
scale_color_gradient(low = "grey70", high = "grey20") +
guides(col=F) +
scale_y_continuous(paste0(lab, " [observed]"), breaks = seq(-10,10,2)) +
scale_x_continuous(paste0(lab, " [predicted]"), breaks = seq(-10,10,2)) +
ggtitle(paste0(model, " - ", data, " (10-fold CV)")) +
geom_abline(linetype="dashed", col="grey40") +
annotate("text", x=min(tmp$pred, na.rm=T), y = max(tmp$obs, na.rm=T),
label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5) +
theme(axis.text = element_text(colour="black", size=14),
axis.title = element_text(colour="black", size=16),
plot.title = element_text(colour="black", size=16))
## Imp features
<- model_predictions[[model]][[data]]$feature_importance
imp $Feature <- rownames(imp)
imp$Feature_type <- NA
imp$Feature_type[grep("motif_importance_minus_log2FC_wt_mut", imp$Feature)] <- "wt motif importance"
imp$Feature_type[grep("Wt_motif", imp$Feature)] <- "wt motif type"
imp$Feature_type[grep("New_motif", imp$Feature)] <- "New motif type"
imp$Feature_type[grep("flank_", imp$Feature)] <- "Motif flanks"
imp$Feature_type[grep("Close_", imp$Feature)] <- "Motif distances"
imp$Feature_type[grep("motif_position", imp$Feature)] <- "Motif position"
imp$Feature_type <- factor(imp$Feature_type, levels=c("wt motif importance", "wt motif type", "New motif type", "Motif position", "Motif flanks", "Motif distances"))
imp
<- imp[order(imp$Feature_type),]
imp $Feature <- factor(imp$Feature, levels = rev(imp$Feature))
imp<- ggplot(imp, aes(Feature, Overall, fill=Feature_type)) +
barplot_all geom_col(width=0.8) +
scale_fill_manual("Features", values = c("wt motif importance"="#984ea3",
"wt motif type"="#e41a1c",
"New motif type"="#4daf4a",
"Motif position"="grey50",
"Motif flanks"="#ff7f00",
"Motif distances"="#a65628")) +
scale_y_continuous("Feature importance", expand = c(0,0)) +
xlab("") +
ggtitle("") +
theme(axis.text = element_text(colour="black", size=14),
axis.text.y = element_blank(),
axis.title = element_text(colour="black", size=16),
legend.title = element_text(colour="black", size=16),
legend.text = element_text(colour="black", size=14)) +
coord_flip() +
guides(fill=F)
<- imp[order(imp$Overall, decreasing = T),]
imp if(nrow(imp)>20) imp <- imp[1:20,]
$Feature <- factor(imp$Feature, levels = rev(imp$Feature))
imp
<- ggplot(imp, aes(Feature, Overall, fill=Feature_type)) +
barplot_summ geom_col(width=0.8) +
scale_fill_manual("Features", values = c("wt motif importance"="#984ea3",
"wt motif type"="#e41a1c",
"New motif type"="#4daf4a",
"Motif position"="grey50",
"Motif flanks"="#ff7f00",
"Motif distances"="#a65628")) +
scale_y_continuous("Feature importance", expand = c(0,0)) +
xlab("") +
ggtitle("") +
theme(axis.text.x = element_text(colour="black", size=14),
axis.text.y = element_text(colour="black", size=12),
axis.title = element_text(colour="black", size=16),
legend.title = element_text(colour="black", size=14),
legend.text = element_text(colour="black", size=12)) +
coord_flip()
print(barplot_all + barplot_summ + g2 + plot_layout(widths = c(1,2,3)))
}