Script to reproduce analyses of random Variants STARR-seq

Required packages

require(tidyverse)
require(data.table)
require(BSgenome.Dmelanogaster.UCSC.dm3)
require(seqinr)
require(stringr)
require(GenomicRanges)
require(patchwork)
require(ggpointdensity)
require(gridExtra)
require(cowplot)
require(ggrepel)
require(Biostrings)
require(ggplot2)
theme_set(theme_classic() + theme(axis.text = element_text(colour = "black")))

# Library colours
# S2171 also called ced6 enhancer
my_col <- c("Controls"="grey60",
            "dev"="orangered",
            "dev enh"="orangered",
            "S2171_wt"="red",
            "chr3L_3310914_3311162_wt"="red",
            "Wt"="grey60",
            "pos110"=RColorBrewer::brewer.pal(7, "Set1")[2],
            "ced6_pos110"=RColorBrewer::brewer.pal(7, "Set1")[2],
            "ced6_pos182"="#CCCC00",
            "pos182"="#CCCC00",
            "ced6_pos230"="#e78ac3",
            "pos230"="#e78ac3",
            "ced6_pos241"=RColorBrewer::brewer.pal(7, "Set1")[5],
            "pos241"=RColorBrewer::brewer.pal(7, "Set1")[5],
            "ZnT63C_pos210"=RColorBrewer::brewer.pal(7, "Set1")[7],
            "pos210"=RColorBrewer::brewer.pal(7, "Set1")[7],
            "ZnT63C_pos180"=RColorBrewer::brewer.pal(7, "Set1")[3],
            "pos180"=RColorBrewer::brewer.pal(7, "Set1")[3],
            "ZnT63C_pos142"="#BA3979",
            "pos142"="#BA3979")

### Motif colours
my_motif_col <- c("wt"="orangered",
                  "Trl"="#e78ac3",
                  "GAGA"="#e78ac3",
                  "GAGAG"="#e78ac3",
                  "GAGAGA"="#e78ac3",
                  "GATA"="#78A8F3",
                  "GATAA"="#78A8F3",
                  "GATAAG"="#78A8F3",
                  "twist"="#7AB06F",
                  "CATCTG"="#7AB06F",
                  "AP1"="#BA3979",
                  "AP-1"="#BA3979",
                  "TGA.TCA"="#BA3979",
                  "ttk"="#DCDF80",
                  "AGGAT"="#DCDF80",
                  "AGGATAA"="#DCDF80",
                  "CCGGAA"="#a6761d",
                  "ETS"="#a6761d",
                  "TCACGCG"="#7570b3",
                  "TCACGCGA"="#7570b3",
                  "SREBP"="#7570b3",
                  "CREB/ATF"="#9B9666",
                  "CREB"="#9B9666",
                  "TGATGA"="#9B9666",
                  "TCATCA"="#9B9666",
                  "TGATGT"="#9B9666",
                  "TGTGAAA"="#9B9666",
                  "STAT"="#FF9933",
                  "TCC.GGA"="#FF9933",
                  "TTCC.GGA"="#FF9933",
                  "ATCGAT"="dodgerblue",
                  "Dref"="dodgerblue"
                  )

Load sequencing reads from bowtie

# Metadata of sequences in library
final_metadata <- readRDS("data/final_variants_metadata.rds")
final_metadata <- final_metadata[,-grep("twist_S2171|Twist_mix.y", names(final_metadata))]

experiment_table <- read.delim("data/experiment.txt", stringsAsFactors = F)

type <- "UMI"
for(i in 1:nrow(experiment_table)){
  
  name=experiment_table$Outfile[i]
  mapped <- import.bed(paste0("data/", name, "_final_mapped.", type, ".bed")) # this are the rv reads
  
  # choose sequences with correct length and strand (negative strand because these are the results of rv read mapping)
  if(length(grep("S2171", name))>0){
    mapped_correct_length <- mapped[mapped@ranges@width==149] # in this lane we got 149nt reads
    mapped_correct_length <- mapped_correct_length[(mapped_correct_length@strand =="-" & mapped_correct_length@ranges@start==101) | (mapped_correct_length@strand =="+" & mapped_correct_length@ranges@start==1)]
    
    # I need to correct the oligo names of wt chr3L_3310914_3311162 reads In the final index (and metadata) the correct ID is chr3L_3310914_3311162_wt, and not chr3L_3310914_3311162_+_dCP
    mapped_correct_length@seqnames@values <- as.character(mapped_correct_length@seqnames@values)
    mapped_correct_length@seqnames@values[grep("chr3L_3310914_3311162", as.character(mapped_correct_length@seqnames@values))] <- "chr3L_3310914_3311162_wt"
  }
  
  # choose sequences with correct length
  if(length(grep("chr3L_3310914_3311162", name))>0){
    mapped_correct_length <- mapped[mapped@ranges@width==150] # in this lane we got 150nt reads
    
    # ZnT63C AP-1 pos142 library is 250bp instead of 249bp, and therefore the reads start one nt ahead
    mapped_correct_length <- c(mapped_correct_length[c(intersect(intersect(grep("GATAA_|twist|chr|chr3L_3310914_3311162_wt|S2171_wt", as.character(mapped_correct_length@seqnames)), grep(100, mapped_correct_length@ranges@start)), grep("-", as.character(mapped_correct_length@strand), fixed = T)),
                                                       intersect(intersect(grep("AP1", as.character(mapped_correct_length@seqnames)), grep(101, mapped_correct_length@ranges@start)), grep("-", as.character(mapped_correct_length@strand), fixed = T)))],
                               mapped_correct_length[mapped_correct_length@strand =="+" & mapped_correct_length@ranges@start==1]) # also keep reads that map to pos strand of index in first nt
  }
  
  # choose sequences with correct strand
  rm <- as.numeric(c(intersect(grep("GATAA_", as.character(mapped_correct_length@seqnames)), grep("+", as.character(mapped_correct_length@strand), fixed = T)),
                     intersect(grep("AP1", as.character(mapped_correct_length@seqnames)), grep("+", as.character(mapped_correct_length@strand), fixed = T)),
                     intersect(grep("twist", as.character(mapped_correct_length@seqnames)), grep("+", as.character(mapped_correct_length@strand), fixed = T)),
                     intersect(grep("CACA_", as.character(mapped_correct_length@seqnames)), grep("+", as.character(mapped_correct_length@strand), fixed = T)),
                     intersect(grep("GAGA_", as.character(mapped_correct_length@seqnames)), grep("+", as.character(mapped_correct_length@strand), fixed = T))
  ))
  if(length(rm)>0) mapped_correct_length <- mapped_correct_length[-rm]
  
  # add strand to variant name
  # but strand should be the opposite, because this is the mapping for reverse read
  mapped_correct_length <- as.data.frame(mapped_correct_length)
  mapped_correct_length$ID <- paste0(mapped_correct_length$seqnames, "_", ifelse(mapped_correct_length$strand=="+", "-", "+"))
  
  # merge results
  final_metadata <- merge(final_metadata, as.data.frame(table(mapped_correct_length$ID)), by=1, all.x=T)
  final_metadata$Freq[is.na(final_metadata$Freq)] <- 0
  names(final_metadata)[ncol(final_metadata)] <- paste0(name, "_", type)
  
  print(name)
}

final_metadata$library <- factor(final_metadata$library, levels = c("S2171_wt", "chr3L_3310914_3311162_wt", "Wt", "GATAA_1st_S2171", "CACA_S2171", "GAGA_S2171", "GATAA_2nd_S2171", "twist_S2171", "GATAA_chr3L_3310914_3311162", "twist_chr3L_3310914_3311162", "AP1_chr3L_3310914_3311162"),
                                 labels = c("S2171_wt", "chr3L_3310914_3311162_wt", "Wt", "ced6_pos110", "ced6_pos182", "ced6_pos230", "ced6_pos241", "twist_S2171", "ZnT63C_pos210", "ZnT63C_pos180", "ZnT63C_pos142"))

# Remove neg orientation of wt S2171 and chr3L_3310914_3311162 oligos - because in the respective screens with my mapping strategy (36bp + 150bp) I cannot uniquely identify the reverse orientation of the oligo, because it multimapps to the variant reverse orientation sequences - therefore the 0s are not correct and is better to remove it
# actually for chr3L_3310914_3311162, the strand is swapped, the overrepresented oligo is the reverse strand
final_metadata <- final_metadata[-grep("S2171_wt_\\-|chr3L_3310914_3311162_wt_\\-", final_metadata$Variant),]

table(final_metadata$library)

write.table(final_metadata, "Table_enhancer_variant_counts.txt", sep="\t", quote = F, row.names = F)

Input quality

Fig S4D,E - distribution of mapped fragments per library

metadata <- read.delim("Table_enhancer_variant_counts.txt")
metadata <- metadata[!metadata$library %in% "twist_S2171",] # library not used
metadata$library <- factor(metadata$library, levels = c("S2171_wt", "chr3L_3310914_3311162_wt", "Wt", "ced6_pos110", "ced6_pos182", "ced6_pos230", "ced6_pos241", "ZnT63C_pos210", "ZnT63C_pos180", "ZnT63C_pos142"))

for(i in c("S21717_REEFmix2_input_Rep", "S21717_REEFmix2_Rep", "chr3L_3310914_3311162_REEFmix3_input_Rep", "chr3L_3310914_3311162_REEFmix3_Rep")){
  
  if(length(grep("S21717", i))>0){
    df <- metadata[!(metadata$Twist_mix=="mix3" | metadata$Variant %in% c("S2171_wt_+", "chr3L_3310914_3311162_wt_+")),]
    if(length(grep(i, names(metadata))) !=2) break
    df$mean <- rowMeans(df[,grep(i, names(metadata))])
    nr.reads.per.variant <- rowMeans(metadata[!metadata$Twist_mix=="mix3",grep(i, names(metadata))])
    names(nr.reads.per.variant) <- metadata$Variant[!metadata$Twist_mix=="mix3"]
  }
  if(length(grep("chr3L_3310914_3311162", i))>0){
    df <- metadata[!(metadata$Twist_mix=="mix2" | metadata$Variant %in% c("S2171_wt_+", "chr3L_3310914_3311162_wt_+")),]
    if(length(grep(i, names(metadata))) !=2) break
    df$mean <- rowMeans(df[,grep(i, names(metadata))])
    nr.reads.per.variant <- rowMeans(metadata[!metadata$Twist_mix=="mix2",grep(i, names(metadata))])
    names(nr.reads.per.variant) <- metadata$Variant[!metadata$Twist_mix=="mix2"]
  }
  
  df$library <- droplevels(df$library)
  
  # make main plot for inputs to compare the different libraries
  gg <- ggplot(df, aes(x=mean, fill=library)) +
    geom_density(alpha = 0.5) +
    scale_fill_manual("Libraries", values = my_col[levels(df$library)]) +
    scale_x_log10("Counts of sequenced fragments", breaks=c(1,median(nr.reads.per.variant),10,100, 1000))+
    scale_y_continuous(expand = c(0,0)) +
    geom_vline(xintercept = median(nr.reads.per.variant), col = "black", linetype="dashed") +
    ggtitle(gsub("_", " ", gsub("_UMI", "", i)),
            subtitle = paste0("Nr of UMI counts = ", round(sum(nr.reads.per.variant)/1e6,1), " million",
                              "\nNr of chr3L_3310914_3311162 wt fragments = ", nr.reads.per.variant[grep("chr3L_3310914_3311162_wt_\\+", names(nr.reads.per.variant))], " (", formatC(nr.reads.per.variant[grep("chr3L_3310914_3311162_wt_\\+", metadata$Variant)]/sum(nr.reads.per.variant)*100, format = "f", digits = 2), "%)",
                              "\nNr of S2-171 wt fragments = ", nr.reads.per.variant[grep("S2171_wt_\\+", names(nr.reads.per.variant))], " (", formatC(nr.reads.per.variant[grep("S2171_wt_\\+", names(nr.reads.per.variant))]/sum(nr.reads.per.variant)*100, format = "f", digits = 2), "%)",
                              "\nNr variants found = ", table(nr.reads.per.variant>0)[2], " (", formatC(table(nr.reads.per.variant>0)[2]/length(nr.reads.per.variant)*100, format = "f", digits = 2), "%)",
                              "\n",
                              "Median = ", median(nr.reads.per.variant))) +
    theme_light(base_size = 15) +
    theme(plot.title = element_text(hjust=0.5))
  
  print(gg)
}

Fig S1B - only ced-6 pos241 library

# Only ced-6 pos110 GATAA instance
metadata$mean <- rowMeans(metadata[,grep("S21717_REEFmix2_input_Rep._UMI", names(metadata))])
df <- metadata[metadata$library %in% c("ced6_pos241"),]
nr.reads.per.variant <- metadata[metadata$library %in% c("ced6_pos241", "S2171_wt"), "mean"]
names(nr.reads.per.variant) <- metadata$Variant[metadata$library %in% c("ced6_pos241", "S2171_wt")]

hist(df[,"mean"], breaks = 270, las = 1, xlab = "Sequenced fragment count (variants)",
     main = paste0("REEF input library - 2nd GATAA"), border = 0.1, col = "gray", xlim=c(0,150))
legend("topright", legend = c(paste0("Nr of reads mapped = ", round(sum(nr.reads.per.variant)/1e6,1), " million"),
                              paste0("Nr of sequences with wt = ", round(nr.reads.per.variant[grep("S2171_wt_\\+", names(nr.reads.per.variant))],0), " (", formatC(nr.reads.per.variant[grep("S2171_wt_\\+", names(nr.reads.per.variant))]/sum(nr.reads.per.variant)*100, format = "f", digits = 2), "%)"),
                              paste0("Nr variants found = ", length(nr.reads.per.variant[!nr.reads.per.variant==0]), " (", formatC(length(nr.reads.per.variant[!nr.reads.per.variant==0])/length(nr.reads.per.variant)*100, format = "f", digits = 2), "%)"),
                              paste0("Nr variants > 5 counts = ", length(nr.reads.per.variant[nr.reads.per.variant>=10]), " (", formatC(length(nr.reads.per.variant[nr.reads.per.variant>=10])/length(nr.reads.per.variant)*100, format = "f", digits = 2), "%)"),
                              paste0("Mean = ", round(mean(nr.reads.per.variant))),
                              paste0("Median = ", round(median(nr.reads.per.variant)))),
       text.col = c("black", "black", "black", "black", "tomato3", "steelblue3"), bty = "n",
       cex=0.85)
abline(v = c(median(nr.reads.per.variant), mean(nr.reads.per.variant)), col = c("steelblue3", "tomato3"), lwd = 1)

Fig S4A,B - Compare replicates

Didn’t include plots because they are too heavy

metadata <- read.delim("Table_enhancer_variant_counts.txt")
metadata <- metadata[!metadata$library %in% "twist_S2171",] # library not used
metadata$library <- factor(metadata$library, levels = c("S2171_wt", "chr3L_3310914_3311162_wt", "Wt", "ced6_pos110", "ced6_pos182", "ced6_pos230", "ced6_pos241", "ZnT63C_pos210", "ZnT63C_pos180", "ZnT63C_pos142"))

# don't plot the WT
metadata[metadata$library %in% c("S2171_wt") & metadata$Twist_mix.y %in% "mix3", grep("chr3L_3310914_3311162_REEFmix3", names(metadata))] <- NA

# plots
plot_list_tmp = list()

Oligo_counts <- metadata[,grep("Rep._UMI", names(metadata))]

# normalise to 1 million mapped fragments
Counts_per_million_cpm <- as.data.frame(apply(Oligo_counts, 2, function(x) (x+1)/sum(x, na.rm=T)*1e6))

for(id in unique(substr(names(metadata)[grep("UMI", names(metadata))], 1, nchar(names(metadata)[grep("UMI", names(metadata))])-5))){
  
  a <- names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[1]]
  b <- names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[2]]
  
  if(length(grep("S21717", id))>0){
    df_tmp <- Counts_per_million_cpm[!(metadata$Twist_mix=="mix3"),grep(id, names(Counts_per_million_cpm))]
  }
  if(length(grep("chr3L_3310914_3311162", id))>0){
    df_tmp <- Counts_per_million_cpm[!(metadata$Twist_mix=="mix2"),grep(id, names(Counts_per_million_cpm))]
  }
  
  # PCC - not using zeros
  pc <- cor.test(log10(df_tmp[apply(df_tmp,1,min)>0,a]),
                 log10(df_tmp[apply(df_tmp,1,min)>0,b]),
                 method = "pearson")
  
  if(length(grep("input", id))>0) sc_col=c("#d9d9d9","#525252") else{sc_col=c("#fdd49e","#d7301f")}
  
  # plot
  scater <- ggplot(df_tmp, aes(df_tmp[,a], df_tmp[,b])) +
    geom_pointdensity(adjust = 0.7, size=0.6) +
    scale_color_gradient(low = sc_col[1], high = sc_col[2]) +
    scale_x_log10("Normalized UMI counts - rep 1",
                  limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)),
                  breaks=c(0,1,10,100,1000)) +
    scale_y_log10("Normalized UMI counts - rep 2",
                  limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)),
                  breaks=c(0,1,10,100,1000)) +
    guides(color=F) +
    theme_bw(base_size = 16) +
    theme(panel.grid = element_blank(),
          axis.text = element_text(colour="black"),
          plot.title = element_text(hjust=0.5)) +
    ggtitle(gsub("_", " ", gsub("_Rep", "", id))) +
    annotate("text",  x=min(df_tmp[df_tmp!=0], na.rm=T), y = max(df_tmp, na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
  
  plot_list_tmp[[id]] = ggplotGrob(scater)
  
}

# multiplot
print(gridExtra::grid.arrange(grobs = plot_list_tmp[1:2], ncol = 2))
print(gridExtra::grid.arrange(grobs = plot_list_tmp[3:4], ncol = 2))

Fig S1A - only ced-6 pos241 library

# Only ced-6 pos241 GATAA instance
metadata <- metadata[metadata$library %in% c("ced6_pos241", "S2171_wt"),]

plot_list_tmp = list()

Oligo_counts <- metadata[,grep("Rep._UMI", names(metadata))]

# normalise to 1 million mapped fragments
Counts_per_million_cpm <- as.data.frame(apply(Oligo_counts, 2, function(x) (x+1)/sum(x, na.rm=T)*1e6))

for(id in c("S21717_REEFmix2_input_Rep", "S21717_REEFmix2_Rep")){
  
  a <- names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[1]]
  b <- names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[2]]
  
  if(length(grep("S21717", id))>0){
    df_tmp <- Counts_per_million_cpm[!(metadata$Twist_mix=="mix3"),grep(id, names(Counts_per_million_cpm))]
  }
  if(length(grep("chr3L_3310914_3311162", id))>0){
    df_tmp <- Counts_per_million_cpm[!(metadata$Twist_mix=="mix2"),grep(id, names(Counts_per_million_cpm))]
  }
  
  # PCC - not using zeros
  pc <- cor.test(log10(df_tmp[apply(df_tmp,1,min)>0,a]),
                 log10(df_tmp[apply(df_tmp,1,min)>0,b]),
                 method = "pearson")
  
  if(length(grep("input", id))>0) sc_col=c("#d9d9d9","#525252") else{sc_col=c("#fdd49e","#d7301f")}
  
  # plot
  scater <- ggplot(df_tmp, aes(df_tmp[,a], df_tmp[,b])) +
    geom_pointdensity(adjust = 0.7, size=0.6) +
    scale_color_gradient(low = sc_col[1], high = sc_col[2]) +
    scale_x_log10("Normalized UMI counts - rep 1",
                  limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)),
                  breaks=c(0,1,10,100,1000)) +
    scale_y_log10("Normalized UMI counts - rep 2",
                  limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)),
                  breaks=c(0,1,10,100,1000)) +
    guides(color=F) +
    theme_bw(base_size = 16) +
    theme(panel.grid = element_blank(),
          axis.text = element_text(colour="black"),
          plot.title = element_text(hjust=0.5)) +
    ggtitle(gsub("_", " ", gsub("_Rep", "", id))) +
    annotate("text",  x=min(df_tmp[df_tmp!=0], na.rm=T), y = max(df_tmp, na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
  
  plot_list_tmp[[id]] = ggplotGrob(scater)
  
}

# multiplot
print(gridExtra::grid.arrange(grobs = plot_list_tmp[1:2], ncol = 2))

## TableGrob (1 x 2) "arrange": 2 grobs
##                           z     cells    name           grob
## S21717_REEFmix2_input_Rep 1 (1-1,1-1) arrange gtable[layout]
## S21717_REEFmix2_Rep       2 (1-1,2-2) arrange gtable[layout]

Calculate enhancer activity values with DESeq2

Count_table_final <- read.delim("Table_enhancer_variant_counts.txt", stringsAsFactors = F)
rownames(Count_table_final) <- Count_table_final$Variant
table(Count_table_final$library)

library(BiocParallel, lib.loc = "/software/2020/software/r-bundle-bioconductor/3.8-foss-2018b-r-3.5.1")
library(DESeq2)

# loop per screen
DESeq_results <- lapply(c(S21717_REEFmix2="S21717_REEFmix2", chr3L_3310914_3311162_REEFmix3="chr3L_3310914_3311162_REEFmix3"), function(e){
  
  if(length(grep("S21717", e))>0){
    Count_table <- Count_table_final[!Count_table_final$Twist_mix=="mix3",c(3, grep(e, names(Count_table_final)))]
  }
  if(length(grep("chr3L_3310914_3311162", e))>0){
    Count_table <- Count_table_final[!Count_table_final$Twist_mix=="mix2",c(3, grep(e, names(Count_table_final)))]
  }
  
  table(Count_table$library)
  
  # only sequences with at least 5 reads in both inputs
  Count_table_2 <- Count_table[rowSums(Count_table[,grep("input_", names(Count_table))]<5)==0,]
  table(Count_table_2$library)
  
  # only sequences with at least 1 read in both STARR-seq reps (no need to handle zeros)
  Count_table_2 <- Count_table_2[rowSums(Count_table_2[,grep("REEFmix._R", names(Count_table_2))]<1)==0,]
  table(Count_table_2$library)
  
  # count table
  cts <- Count_table_2[,grep(e, names(Count_table_2), ignore.case = T)]
  
  # design
  coldata <- data.frame(type=factor(rep(c("Input", "Experiment"),each=2),levels=c("Input", "Experiment")),
                        row.names = names(cts))
  
  if (!identical(which(coldata$type=="Input"), grep("input", rownames(coldata)))){
    print("Input in design matrix does not match input samples")
    break
  }
  if (!all(rownames(coldata) %in% colnames(cts))){
    print("Rownames do not match colnames")
    break
  }
  if (!all(rownames(coldata) == colnames(cts))){
    print("Rownames do not match colnames")
    break
  }
  
  dds <- DESeqDataSetFromMatrix(countData = as.matrix(cts),
                                colData = coldata,
                                design= ~ type)
  
  # use controls as sizeFactors - assumption that controls don't work
  sizeFactors(dds)=estimateSizeFactorsForMatrix(as.matrix(cts[grep("NegativeRegions", rownames(cts)),]))
  dds <- DESeq(dds)
  
  # plots quality control
  plotDispEsts(dds)
  
  # plot normal FC
  res <- results(dds, alpha=0.05)
  summary(res)
  DESeq2::plotMA(res)
  mcols(res)$description
  
  # merge with main table
  tmp <- as.data.frame(res)[,c(1,2,5,6)]
  names(tmp) <- paste0(e,"_",names(tmp))
  return(tmp)
  
})

Count_table_final <- merge(Count_table_final, DESeq_results$S21717_REEFmix2, by.x=1, by.y=0, all.x=T)
Count_table_final <- merge(Count_table_final, DESeq_results$chr3L_3310914_3311162_REEFmix3, by.x=1, by.y=0, all.x=T)

### find oligo with activity closer to the wt enhancer chr3L_3310914_3311162 in the twist wt library - to serve as a reference since the wt activity is biased due to the over-representation
chr3L_3310914_3311162_ref_enh <- Count_table_final[order(abs(Count_table_final$S21717_REEFmix2_log2FoldChange[Count_table_final$Variant %in% "chr3L_3310914_3311162_wt_+"]-Count_table_final$S21717_REEFmix2_log2FoldChange)),][,c(1:4,42,46)]
chr3L_3310914_3311162_ref_enh <- chr3L_3310914_3311162_ref_enh[chr3L_3310914_3311162_ref_enh$library %in% c("Wt", "chr3L_3310914_3311162_wt"),]
head(chr3L_3310914_3311162_ref_enh)
chr3L_3310914_3311162_ref_enh <- "chrX_9273894_9274142_+_dCP_-"
Count_table_final$library[grep(chr3L_3310914_3311162_ref_enh, Count_table_final$Variant, fixed = T)] <- "chr3L_3310914_3311162_ref"
table(Count_table_final$library)

# library not used
Count_table_final <- Count_table_final[!Count_table_final$library %in% "twist_S2171",]

# save final table
write.table(Count_table_final, "Table_enhancer_variant_counts_and_enrichment.txt", sep="\t", quote = F, row.names = F)

Fig S4A,B - Compare enhancer activity between replicates

Count_table_final <- read.delim("Table_enhancer_variant_counts.txt", stringsAsFactors = F)
rownames(Count_table_final) <- Count_table_final$Variant
table(Count_table_final$library)

require(DESeq2)

# loop per screen
DESeq_results <- lapply(c(S21717_REEFmix2="S21717_REEFmix2", chr3L_3310914_3311162_REEFmix3="chr3L_3310914_3311162_REEFmix3"), function(e){
  tmp_list <- lapply(c(rep1="_Rep1", rep2="_Rep2"), function(r){
    
    if(length(grep("S21717", e))>0){
      Count_table <- Count_table_final[!Count_table_final$Twist_mix=="mix3",c(3, grep(e, names(Count_table_final)))]
    }
    if(length(grep("chr3L_3310914_3311162", e))>0){
      Count_table <- Count_table_final[!Count_table_final$Twist_mix=="mix2",c(3, grep(e, names(Count_table_final)))]
    }
    
    table(Count_table$library)
    
    # only sequences with at least 5 reads in both inputs
    Count_table_2 <- Count_table[rowSums(Count_table[,grep("input_", names(Count_table))]<5)==0,]
    table(Count_table_2$library)
    
    # only sequences with at least 1 read in both STARR-seq reps (no need to handle zeros)
    Count_table_2 <- Count_table_2[rowSums(Count_table_2[,grep("REEFmix._R", names(Count_table_2))]<1)==0,]
    table(Count_table_2$library)
    
    # count table - all inputs + specific RNA replicate
    cts <- Count_table_2[,c(2,3,grep(paste0(e, r), names(Count_table_2), ignore.case = T))]
    
    # design
    coldata <- data.frame(type=factor(c("Input", "Input", "Experiment"),levels=c("Input", "Experiment")),
                          row.names = names(cts))
    
    if (!identical(which(coldata$type=="Input"), grep("input", rownames(coldata)))){
      print("Input in design matrix does not match input samples")
      break
    }
    if (!all(rownames(coldata) %in% colnames(cts))){
      print("Rownames do not match colnames")
      break
    }
    if (!all(rownames(coldata) == colnames(cts))){
      print("Rownames do not match colnames")
      break
    }
    
    dds <- DESeqDataSetFromMatrix(countData = as.matrix(cts),
                                  colData = coldata,
                                  design= ~ type)
    
    # use controls as sizeFactors - assumption that controls don't work
    sizeFactors(dds)=estimateSizeFactorsForMatrix(as.matrix(cts[grep("NegativeRegions", rownames(cts)),]))
    dds <- DESeq(dds)
    #resultsNames(dds) # lists the coefficients
    
    # plots quality control
    plotDispEsts(dds)
    
    # plot normal FC
    res <- results(dds, alpha=0.05)
    summary(res)
    DESeq2::plotMA(res)
    mcols(res)$description
    
    # merge with main table
    tmp <- as.data.frame(res)[,c(1,2,5,6)]
    names(tmp) <- paste0(e,r,"_",names(tmp))
    return(tmp)
  })
  return(tmp_list)
})

Count_table_final <- merge(Count_table_final, DESeq_results$S21717_REEFmix2$rep1[,c(2,4)], by.x=1, by.y=0, all.x=T)
Count_table_final <- merge(Count_table_final, DESeq_results$S21717_REEFmix2$rep2[,c(2,4)], by.x=1, by.y=0, all.x=T)
Count_table_final <- merge(Count_table_final, DESeq_results$chr3L_3310914_3311162_REEFmix3$rep1[,c(2,4)], by.x=1, by.y=0, all.x=T)
Count_table_final <- merge(Count_table_final, DESeq_results$chr3L_3310914_3311162_REEFmix3$rep2[,c(2,4)], by.x=1, by.y=0, all.x=T)

# save final table
write.table(Count_table_final, "Table_enhancer_variant_counts_and_enrichment_per_replicate.txt", sep="\t", quote = F, row.names = F)
library(ggpointdensity)

metadata_final <- read.delim("Table_enhancer_variant_counts_and_enrichment_per_replicate.txt", stringsAsFactors = F)
metadata_final <- metadata_final[!metadata_final$library %in% "twist_S2171",] # library not used

plot_list_tmp = list()
for(e in c(S21717_REEFmix2="S21717_REEFmix2", chr3L_3310914_3311162_REEFmix3="chr3L_3310914_3311162_REEFmix3")){
  
  df_tmp <- metadata_final[,grep(paste0(e, "_Rep._log2FoldChange"), names(metadata_final))]
  
  if(e=="S21717_REEFmix2") id="ced-6 enhancer"
  if(e=="chr3L_3310914_3311162_REEFmix3") id="ZnT63C enhancer"
  
  # PCC - not using zeros
  pc <- cor.test(df_tmp[,1],
                 df_tmp[,2],
                 method = "pearson", use="pairwise.complete.obs")
  
  sc_col=c("#fdd49e","#d7301f")
  
  # plot
  scater <- ggplot(df_tmp, aes(df_tmp[,1], df_tmp[,2])) +
    geom_pointdensity(adjust = 0.7, size=0.6) +
    scale_color_gradient(low = sc_col[1], high = sc_col[2]) +
    scale_x_continuous("Enhancer activity - rep 1") +
    scale_y_continuous("Enhancer activity - rep 2") +
    guides(color=F) +
    geom_abline(linetype="dashed", col="grey30") +
    theme_bw(base_size = 16) +
    theme(panel.grid = element_blank(),
          axis.text = element_text(colour="black"),
          plot.title = element_text(hjust=0.5)) +
    ggtitle(id) +
    annotate("text",  x=min(df_tmp[,1], na.rm=T), y = max(df_tmp[,2], na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
  
  plot_list_tmp[[e]] = ggplotGrob(scater)
  
}

# multiplot
print(gridExtra::grid.arrange(grobs = plot_list_tmp, ncol = 2))

## TableGrob (1 x 2) "arrange": 2 grobs
##                                z     cells    name           grob
## S21717_REEFmix2                1 (1-1,1-1) arrange gtable[layout]
## chr3L_3310914_3311162_REEFmix3 2 (1-1,2-2) arrange gtable[layout]

Fig S1A - Compare enhancer activity between replicates only in ced-6 pos241 library

# Only ced-6 pos241 GATAA instance
df_tmp <- metadata_final[metadata_final$library %in% c("ced6_pos241", "S2171_wt"),]

pc <- cor.test(df_tmp$S21717_REEFmix2_Rep1_log2FoldChange,
               df_tmp$S21717_REEFmix2_Rep2_log2FoldChange,
               method = "pearson", use="pairwise.complete.obs")

sc_col=c("#fdd49e","#d7301f")

# plot
scater <- ggplot(df_tmp, aes(S21717_REEFmix2_Rep1_log2FoldChange, S21717_REEFmix2_Rep2_log2FoldChange)) +
  geom_pointdensity(adjust = 0.7, size=0.6) +
  scale_color_gradient(low = sc_col[1], high = sc_col[2]) +
  scale_x_continuous("Enhancer activity - rep 1") +
  scale_y_continuous("Enhancer activity - rep 2") +
  guides(color=F) +
  geom_abline(linetype="dashed", col="grey30") +
  theme_bw(base_size = 16) +
  theme(panel.grid = element_blank(),
        axis.text = element_text(colour="black"),
        plot.title = element_text(hjust=0.5)) +
  ggtitle(id) +
  annotate("text",  x=min(df_tmp[,41], na.rm=T), y = max(df_tmp[,43], na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)

print(scater)

Analyses for proof of concept GATAA 2nd S2-171

Fig 1B - activity of variants ordered

Count_table_final <- read.delim("Table_enhancer_variant_counts_and_enrichment.txt")
tmp_main <- Count_table_final[Count_table_final$library %in% c("ced6_pos241", "S2171_wt"),c(1:4,grep("GATAA_2nd_S2171", names(Count_table_final)), grep("S21717_REEFmix2", names(Count_table_final)))]
tmp_main <- tmp_main[complete.cases(tmp_main$S21717_REEFmix2_log2FoldChange),]
tmp <- tmp_main[order(tmp_main$S21717_REEFmix2_log2FoldChange, decreasing = T),]
tmp <- data.frame(x=1:nrow(tmp)/1000, y=2**tmp$S21717_REEFmix2_log2FoldChange, y_log=tmp$S21717_REEFmix2_log2FoldChange,
                  variant=as.character(tmp$Variant),
                  variant_seq=sapply(strsplit(as.character(tmp$Variant),"_"), `[`, 3),
                  stringsAsFactors = F)

# select variants to plot
tmp_sel <- tmp[tmp$variant %in% c("S2171_wt_+",
                                  tmp$variant[1],
                                  tmp$variant[grep("GATAA", tmp$variant_seq)][1],
                                  tmp$variant[grep("TTATC", tmp$variant_seq)][1]),]
tmp_sel$variant_seq[grep("S2171_wt_+", tmp_sel$variant)] <- "wt (CTTATCGC)"

gg <- ggplot(tmp, aes(x,y)) +
  geom_line() +
  scale_y_continuous("Enhancer activity\n(Normalized RNA levels)", breaks = seq(0,150,30)) +
  scale_x_continuous("Enhancer variants (x 10e3)", breaks = seq(0,60,10)) +
  ggtitle("S2-171 enhancer - ced6 pos241 GATAA position") +
  theme(plot.title = element_text(hjust=0.5)) +
  geom_point(data=tmp_sel, aes(x,y), color=c("grey20", my_motif_col["GATAA"], my_motif_col["GATAA"], my_motif_col["wt"]), size=3) +
  annotate(geom="text", x=tmp_sel$x+1.5, y=tmp_sel$y,
           label=tmp_sel$variant_seq,
           color=c("grey20", my_motif_col["GATAA"], my_motif_col["GATAA"], my_motif_col["wt"]),
           hjust=0, vjust=0.5, size = 3.5)

print(gg)

### numbers of variants

length(tmp_main$Variant[tmp_main$S21717_REEFmix2_log2FoldChange>tmp_main$S21717_REEFmix2_log2FoldChange[grep("S2171_wt", tmp_main$library)]])
## [1] 600
# 600 over WT

# +/- 10% linear scale
wt_act <- 2**tmp_main$S21717_REEFmix2_log2FoldChange[grep("S2171_wt", tmp_main$library)]
length(tmp_main$Variant[(2**tmp_main$S21717_REEFmix2_log2FoldChange) >= (wt_act - wt_act*0.1) &
                          (2**tmp_main$S21717_REEFmix2_log2FoldChange) <= (wt_act + wt_act*0.1)]) -1 # minus wt
## [1] 374
# 374

Fig 1C,D - Diversity of top active variants

Count_table_final <- read.delim("Table_enhancer_variant_counts_and_enrichment.txt")
metadata_final <- Count_table_final[Count_table_final$library %in% c("ced6_pos241", "S2171_wt"),c(1:5,grep("GATAA_2nd_S2171", names(Count_table_final)), grep("S21717_REEFmix2", names(Count_table_final)))]
metadata_final <- metadata_final[complete.cases(metadata_final$S21717_REEFmix2_log2FoldChange),]

library(ggseqlogo)
plot.logo <- function(x, method='bits'){
      ggseqlogo(x, method=method, seq_type='dna', ncol=1) +
        theme(panel.background = element_rect(fill="white",colour="white"),
            panel.grid = element_blank(), axis.line=element_blank(),
            axis.text = element_blank(), axis.ticks = element_blank(),
            axis.title = element_blank(),
            legend.key=element_blank(),
            legend.text = element_text(size=11, colour="black"),
            legend.title = element_text(size=12, colour="black"),
            plot.title = element_text(size=14, hjust = 0.5, colour="black"), plot.subtitle = element_text(size=14, hjust = 0.5),
            legend.position = "right")
}

seq_counts <- c(1,2,5,10,50,100,1000, nrow(metadata_final))
names(seq_counts) <- seq_counts

logo_plots_all <- lapply(c(IC='bits', Prob='p'), function(m){
  logo_plots <- lapply(c(Random="Random", Experiment="Experiment"), function(c){
    if(c=="Random") tmp <- metadata_final[sample(1:nrow(metadata_final)),]
    if(c=="Experiment") tmp <- metadata_final[order(metadata_final$S21717_REEFmix2_log2FoldChange, decreasing = T),]
    
    lapply(seq_counts, function(i){
      tmp2 <- tmp[1:i,]
      
      # get nucleotide at each position
      tmp2 <- do.call(rbind, lapply(1:16, function(x) data.frame(Pos=x,
                                                                 Base=substr(tmp2$GATAA_2nd_S2171_sequence_fw_extended4bp, x, x))))
      
      prop_matrix <- apply(table(tmp2$Base, tmp2$Pos), 2, function(x) x/sum(x))
      
      plot.logo(prop_matrix, method=m)
      
    })
  })
})

# plot logos
library(cowplot)
plot_grid(plot_grid(plotlist = logo_plots_all$IC$Experiment, ncol=1),
                   plot_grid(plotlist = logo_plots_all$IC$Random, ncol=1))

plot_grid(plot_grid(plotlist = logo_plots_all$Prob$Experiment, ncol=1),
                   plot_grid(plotlist = logo_plots_all$Prob$Random, ncol=1))

### statistics with information content
# The information content (IC) of a PWM says how different a given PWM is from a uniform distribution.
tIC <- function(C) log2(length(C))
PPM <- function(C) C / sum(C)
U <- function(C) -sum(PPM(C) * log2(PPM(C)))
fIC <- function(C) tIC(C) - U(C)
IC <- function(C){
  # add pseudocounts
  C[C==0] <- 0.001
  PPM(C) * fIC(C)
}

set.seed(1234)
statistics_logo_plots <- lapply(c(Random="Random", Experiment="Experiment"), function(c){
  if(c=="Random") tmp <- metadata_final[sample(1:nrow(metadata_final)),]
  if(c=="Experiment") tmp <- metadata_final[order(metadata_final$S21717_REEFmix2_log2FoldChange, decreasing = T),]
  
  v_all <- sapply(c(1:1000, 10000, 50000, nrow(metadata_final)), function(i){
    tmp2 <- tmp[1:i,]
    
    # get nucleotide at each position
    tmp2 <- do.call(rbind, lapply(1:8, function(x) data.frame(Pos=x,
                                                              Base=substr(tmp2$GATAA_2nd_S2171_sequence_fw_extended4bp, x, x))))
    tmp2$Base <- factor(tmp2$Base, levels = c("A", "C", "G", "T"))
    
    df <- apply(table(tmp2$Base, tmp2$Pos), 2, function(x) x)
    
    v <- sum(colSums(sapply(1:ncol(df), function(x){
      IC(df[,x])
    })))
    
    return(v)
  })
  return(v_all)
})

n=300
plot_df <- data.frame(Group=rep(c("Random", "STARR-seq"), each=n),
                      Top_sequences=rep(1:n, 2),
                      IC=c(statistics_logo_plots$Random[1:n], statistics_logo_plots$Experiment[1:n]))

IC_plot <- ggplot(plot_df, aes(Top_sequences, IC, colour=Group)) +
    geom_point() +
    geom_line(alpha=0.5) +
    scale_y_continuous("8-mer sum Information Content") +
    scale_x_continuous("Number of top sequences considered", breaks = seq(0,1000,50)) +
    scale_color_manual("", values=c("STARR-seq"="orangered", Random="grey60")) +
  theme_bw(base_size = 11) +
    theme(plot.title = element_text(hjust = 0.5),
          axis.text = element_text(size = 13, colour="black"),
          axis.title = element_text(size = 15),
          legend.text = element_text(size = 14)
    )

print(IC_plot)

boxplot <- ggplot(tmp, aes("1",y_log)) +
  geom_violin(alpha=0.5, position = position_dodge(width = 0.9), size=0.7, color="black", fill="grey60") +
  geom_boxplot(outlier.size = -1, color="black", position = position_dodge(width = 0.9), width=0.2, size=0.8) +
  scale_y_continuous("Enhancer activity (log2)", breaks = seq(-2,8,1), labels = c("1/4", "1/2", 2**seq(0,8,1)), limits = c(-1.65,7.5)) +
  scale_x_discrete("", labels=paste0("All variants\n(n=", nrow(tmp), ")")) +
  #geom_hline(yintercept = 0, linetype="dashed", col="grey60") + # the 0 is not relevant so don't print the line
  theme_bw(base_size = 14) +
  theme(axis.text = element_text(colour="black"),
        axis.title = element_text(colour="black"),
        axis.line = element_blank(),
        panel.border = element_rect(colour="black"),
        panel.grid.minor = element_blank(),
        panel.grid.major = element_line(colour="grey95", size=0.4),
        axis.ticks = element_line(colour="black"),
        plot.title = element_text(hjust=0.5)
  )

boxplot_final <- boxplot +
  geom_point(data=tmp_sel, aes("1",y_log), color=c("grey20", my_motif_col["GATAA"], my_motif_col["GATAA"], my_motif_col["wt"]), size=1.5) +
  annotate(geom="text", x="1", y=tmp_sel$y_log,
           label=tmp_sel$variant_seq,
           color=c("grey20", my_motif_col["GATAA"], my_motif_col["GATAA"], my_motif_col["wt"]),
           hjust=-0.15, vjust=0.5, size = 1.85)
  
print(boxplot_final)

Fig 1E, S3A - Enrichment of variants creating known motifs

boxplot3 <- boxplot +
  geom_point(data=tmp_sel[tmp_sel$variant %in% "S2171_wt_+",], aes("1",y_log), color=c(my_motif_col["wt"]), size=2) +
  annotate(geom="text", x="1", y=tmp_sel$y_log[tmp_sel$variant %in% "S2171_wt_+"],
           label="wt",
           color=my_motif_col["wt"],
           hjust=-0.35, vjust=0.5, size = 4)

########

motif_interest_list <- c("GATAAG", "TGA.TCA", "CATCTG", "GAGAGA",
                         "CCGGAA", "TCACGCGA", "TTCC.GGA",
                         "TCATCA", #TGATGT
                         "ATCGAT",
                         "AGGATAA"
                         )
names(motif_interest_list) <- motif_interest_list

# make table with enrichments per motif
motif_variant_enrichment_16bp <- lapply(motif_interest_list, function(m){
  fw=tmp_main$S21717_REEFmix2_log2FoldChange[grep(m, tmp_main$GATAA_2nd_S2171_sequence_fw_extended4bp)]
  fw_seq=tmp_main$GATAA_2nd_S2171_sequence_fw_extended4bp[grep(m, tmp_main$GATAA_2nd_S2171_sequence_fw_extended4bp)]
  if(!DNAString(m) == reverseComplement(DNAString(m))){
    rv=tmp_main$S21717_REEFmix2_log2FoldChange[grep(m, tmp_main$GATAA_2nd_S2171_sequence_rv_extended4bp)]
    rv_seq=tmp_main$GATAA_2nd_S2171_sequence_fw_extended4bp[grep(m, tmp_main$GATAA_2nd_S2171_sequence_rv_extended4bp)]
    }else{rv <- c(); rv_seq <- c()}
  
  data.frame(Motif=m,
             Strand=c(rep("fw",length(fw)), rep("rv",length(rv))),
             Sequence=c(fw_seq, rv_seq),
             Enrichment_log2=c(fw, rv))
})

# plot for both tables
seq="16bp"

# join all
motif_variant_enrichment <- do.call(rbind, motif_variant_enrichment_16bp)

# set factor levels
motif_levels <- motif_variant_enrichment %>%
  group_by(Motif) %>%
  summarise(median=median(Enrichment_log2)) %>%
  arrange(desc(median)) %>%
  select(Motif)

motif_variant_enrichment$Motif <- factor(motif_variant_enrichment$Motif, levels =as.character(motif_levels$Motif))
motif_variant_enrichment$Motif <- relevel(motif_variant_enrichment$Motif, "GATAAG")

# x labels counts per boxplot
t <- data.frame(table(motif_variant_enrichment$Motif))
xlabels <- sapply(as.character(levels(motif_variant_enrichment$Motif)), function(x){
  TF=""
  if(x=="GATAAG") TF="GATA, "
  if(x=="TGA.TCA") TF="AP-1, "
  if(x=="CATCTG") TF="twist, "
  if(x=="GAGAGA") TF="Trl, "
  if(x=="CCGGAA") TF="ETS, "
  if(x=="TCACGCGA") TF="SREBP, "
  if(x=="TTCC.GGA") TF="STAT, "
  if(x=="TCATCA") TF="CREB, "
  if(x=="ATCGAT") TF="Dref, "
  if(x=="AGGATAA") TF="ttk, "
  if(x %in% c("All", "Others")){
    paste0(x,"\n(", TF, t$Freq[t$Var1%in%x&t$Var2%in%"rv"], ")")}
  else{
    paste0(x,"\n(",
           TF, t$Freq[t$Var1%in%x], ")")}
})

boxplot_known_motifs <- ggplot(motif_variant_enrichment, aes(Motif, Enrichment_log2, fill=Motif, alpha=Strand)) +
  geom_boxplot(outlier.size = 0.6) +
  #geom_hline(yintercept = median(metadata_final$log2_ratio_UMI_corrected), linetype="dashed", col="grey40") +
  geom_hline(yintercept = tmp_main$S21717_REEFmix2_log2FoldChange[tmp_main$Variant %in% "S2171_wt_+"], linetype="dashed", col="orangered") +
  geom_hline(yintercept = median(tmp_main$S21717_REEFmix2_log2FoldChange), linetype="dashed", col="grey60") +
  scale_fill_manual(values=my_motif_col) +
  scale_alpha_manual(values=c(0.5,0.9)) +
  scale_y_continuous("Enhancer activity (log2)", breaks = seq(-2,8,1), labels = c("1/4", "1/2", 2**seq(0,8,1)), limits = c(-1.65,7.5)) +
  scale_x_discrete("", labels=xlabels) +
  guides(fill=F, alpha=F) +
  theme_bw(base_size = 14) +
  theme(axis.text = element_text(colour="black"),
        axis.text.x = element_text(colour="black", size=9),
        axis.title = element_text(colour="black"),
        axis.title.y = element_blank(),
        axis.line = element_blank(),
        panel.border = element_rect(colour="black"),
        panel.grid.minor = element_blank(),
        panel.grid.major = element_line(colour="grey95", size=0.4),
        axis.ticks = element_line(colour="black"),
        plot.title = element_text(hjust=0.5)
  )

boxplot_known_motifs2 <- ggplot(motif_variant_enrichment, aes(Motif, Enrichment_log2, fill=Motif)) +
  geom_boxplot(outlier.size = 0.6) +
  #geom_hline(yintercept = median(metadata_final$log2_ratio_UMI_corrected), linetype="dashed", col="grey40") +
  geom_hline(yintercept = tmp_main$S21717_REEFmix2_log2FoldChange[tmp_main$Variant %in% "S2171_wt_+"], linetype="dashed", col="orangered") +
  geom_hline(yintercept = median(tmp_main$S21717_REEFmix2_log2FoldChange), linetype="dashed", col="grey60") +
  scale_fill_manual(values=my_motif_col) +
  scale_y_continuous("Enhancer activity (log2)", breaks = seq(-2,8,1), labels = c("1/4", "1/2", 2**seq(0,8,1)), limits = c(-1.65,7.5)) +
  scale_x_discrete("", labels=xlabels) +
  guides(fill=F, alpha=F) +
  theme_bw(base_size = 14) +
  theme(axis.text = element_text(colour="black"),
        axis.text.x = element_text(colour="black", size=9),
        axis.title = element_text(colour="black"),
        axis.title.y = element_blank(),
        axis.line = element_blank(),
        panel.border = element_rect(colour="black"),
        panel.grid.minor = element_blank(),
        panel.grid.major = element_line(colour="grey95", size=0.4),
        axis.ticks = element_line(colour="black"),
        plot.title = element_text(hjust=0.5)
  )

print(boxplot3 + boxplot_known_motifs2 + plot_layout(widths = c(1,8)))

print(boxplot3 + boxplot_known_motifs + plot_layout(widths = c(1,8)))

# test significance of differences between orienations
# AP1 (TGA.TCA) and Dref (ATCGAT) are palindromic and have now reverse complement
unlist(sapply(levels(motif_variant_enrichment$Motif), function(m){
  if(!m %in% c("TGA.TCA", "ATCGAT"))  wilcox.test(motif_variant_enrichment$Enrichment_log2[motif_variant_enrichment$Motif==m & motif_variant_enrichment$Strand=="fw"],
                                                                  motif_variant_enrichment$Enrichment_log2[motif_variant_enrichment$Motif==m & motif_variant_enrichment$Strand=="rv"])$p.value
  }))
##    GATAAG  TCACGCGA    CATCTG    TCATCA  TTCC.GGA    GAGAGA    CCGGAA   AGGATAA 
## 0.4881886 0.7304401 0.7489610 0.7906186 0.6968224 0.1191006 0.9858273 0.8582189

Fig S3C - Correlation between variant activity and motif scores

TF_motifs <- list(GATA=data.frame(Motif="GATAAG", ID="flyfactorsurvey__GATAd_SANGER_5_FBgn0032223"),
                  AP1=data.frame(Motif="TGA.TCA", ID="flyfactorsurvey__kay_Jra_SANGER_5_FBgn0001291"),
                  twist=data.frame(Motif="CATCTG", ID="flyfactorsurvey__twi_da_SANGER_5_FBgn0000413"),
                  Trl=data.frame(Motif="GAGAGA", ID="homer__CTYTCTYTCTCTCTC_GAGA-repeat"),
                  ETS=data.frame(Motif="CCGGAA", ID="flyfactorsurvey__Ets97D_SANGER_10_FBgn0004510"),
                  SREBP=data.frame(Motif="TCACGCGA", ID="flyfactorsurvey__HLH106_SANGER_10_FBgn0015234"),
                  STAT=data.frame(Motif="TTCC.GGA", ID="stark__TTCCSGGAA"), # since bergman__Stat92E was longer than the variants sequence
                  CREB=data.frame(Motif="TCATCA", ID="cisbp__M0295"),
                  Dref=data.frame(Motif="ATCGAT", ID="bergman__Dref"),
                  ttk=data.frame(Motif="AGGATAA", ID="flyfactorsurvey__ttk_NAR_FBgn0003870")
                  )
TF_motifs <- do.call(rbind, TF_motifs)

# get motif scores
require(motifmatchr)
require(TFBSTools)
load("data/TF_clusters_PWMs.RData") # load motifs
scores <- matchMotifs(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% TF_motifs$ID], # no ttk
                      motif_variant_enrichment$Sequence,
                      genome = "BSgenome.Dmelanogaster.UCSC.dm3", p.cutoff = 1, bg="genome", out = "scores") # no cutoff to get all scores

out <- as.data.frame(as.matrix(motifmatchr::motifScores(scores)))
names(out) <- name(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% TF_motifs$ID])
motif_variant_enrichment_2 <- cbind(motif_variant_enrichment, out)

Corr_scores <- sapply(levels(motif_variant_enrichment_2$Motif), function(m){
  cor(motif_variant_enrichment_2$Enrichment_log2[motif_variant_enrichment_2$Motif==m],
      motif_variant_enrichment_2[motif_variant_enrichment_2$Motif==m, TF_motifs$ID[TF_motifs$Motif==m]])
})

print(Corr_scores)
##      GATAAG     TGA.TCA    TCACGCGA      CATCTG      TCATCA    TTCC.GGA 
##  0.27326923  0.57669744  0.75118949  0.04499555  0.23829398  0.52075684 
##      GAGAGA      CCGGAA      ATCGAT     AGGATAA 
##  0.20628973  0.07917095  0.13069675 -0.57051310
gg <- data.frame(Motif = factor(names(Corr_scores), levels =as.character(motif_levels$Motif)),
           Scores = Corr_scores) %>%
  ggplot(aes(Motif, Scores, fill=Motif)) +
  geom_bar(stat="identity", width=0.7, col="black") +
  scale_x_discrete("Pasted motif") +
  scale_y_continuous("Correlation activity - motif scores", breaks = seq(-1,1,0.2)) +
  scale_fill_manual(values=my_motif_col) +
  guides(fill="none", col="none") +
  scale_x_discrete("", labels=xlabels, limits=levels(motif_variant_enrichment$Motif)) +
  theme_bw(base_size = 14) +
  theme(axis.text = element_text(colour="black"),
        axis.text.x = element_text(colour="black", size=9),
        axis.title = element_text(colour="black"),
        axis.line = element_blank(),
        panel.border = element_rect(colour="black"),
        panel.grid.minor = element_blank(),
        panel.grid.major = element_line(colour="grey95", size=0.4),
        axis.ticks = element_line(colour="black"),
        plot.title = element_text(hjust=0.5)
  )

print(gg)

Fig 1F, S3B - How many motifs explain the top variants

# do the variants with high activity match to different motifs?
top_sequences <- tmp_main[tmp_main$S21717_REEFmix2_log2FoldChange>tmp_main$S21717_REEFmix2_log2FoldChange[grep("S2171_wt", tmp_main$library)],]
rownames(top_sequences) <- top_sequences$Variant
# 600 over WT

### All enriched motifs (FDR<0.05)
PWM_candidates <- readRDS("data/PWM_candidates.rds")
PWM_candidates <- PWM_candidates[!PWM_candidates$X..motif_collection_name %in% "modisco",]
PWM_candidates <- PWM_candidates[PWM_candidates$FDR_motif_enr<0.05 & PWM_candidates$log2_motif_enr>0,]

require(motifmatchr)
require(TFBSTools)
load("data/TF_clusters_PWMs.RData") # load motifs

# count motifs in all enhancers
opt=list()
opt$pvalue_cutoff <- c(1e-5, 1e-4)
opt$genome="BSgenome.Dmelanogaster.UCSC.dm3"

# motif counts
# for motif enrichment is is fine to do not reduce, because it will just be present vs non-present
motif_counts_all <- list()
for(p in opt$pvalue_cutoff){
  # motif counts
  motif_ix_scores <- matchMotifs(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% PWM_candidates$motif_name],
                                 as.character(top_sequences$GATAA_2nd_S2171_sequence_fw_extended4bp),
                                 genome = opt$genome, p.cutoff = p, bg="genome", out = "scores")
  scores <- as.data.frame(as.matrix(motifCounts(motif_ix_scores)))
  names(scores) <- name(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% PWM_candidates$motif_name])
  motif_counts <- cbind(top_sequences, scores)
  
  motif_counts_all[[paste0("pvalue_", p)]] <- motif_counts
  print(p)
}
## [1] 1e-05
## [1] 1e-04
# sequences not explained
table(rowSums(motif_counts_all$`pvalue_1e-04`[,18:ncol(motif_counts_all$`pvalue_1e-04`)])>0)
## 
## FALSE  TRUE 
##    38   562
table(rowSums(motif_counts_all$`pvalue_1e-05`[,18:ncol(motif_counts_all$`pvalue_1e-05`)])>0)
## 
## FALSE  TRUE 
##   181   419
# sequences explained by main motifs, others, multiple, or none
TF_motifs <- list(GATA=data.frame(Motif="GATA", ID="flyfactorsurvey__srp_SANGER_5_FBgn0003507"),
                  GATA=data.frame(Motif="GATA", ID="flyfactorsurvey__GATAd_SANGER_5_FBgn0032223"),
                  GATA=data.frame(Motif="GATA", ID=as.character(PWM_candidates$motif_name[grep("srp|gatad", PWM_candidates$motif_description2, ignore.case = T)])),
                  AP1=data.frame(Motif="AP-1", ID="flyfactorsurvey__kay_Jra_SANGER_5_FBgn0001291"),
                  AP1=data.frame(Motif="AP-1", ID=as.character(PWM_candidates$motif_name[grep("jra|kay|fos|jun", PWM_candidates$motif_description2, ignore.case = T)])),
                  twist=data.frame(Motif="twist", ID="flyfactorsurvey__twi_da_SANGER_5_FBgn0000413"),
                  twist=data.frame(Motif="twist", ID=as.character(PWM_candidates$motif_name[grep("twi", PWM_candidates$motif_description2, ignore.case = T)])),
                  Trl=data.frame(Motif="Trl", ID="flyfactorsurvey__Trl_FlyReg_FBgn0013263"),
                  Trl=data.frame(Motif="Trl", ID=as.character(PWM_candidates$motif_name[grep("trl|gaga", PWM_candidates$motif_description2, ignore.case = T)])),
                  ETS=data.frame(Motif="ETS", ID="flyfactorsurvey__Ets97D_SANGER_10_FBgn0004510"),
                  ETS=data.frame(Motif="ETS", ID=as.character(PWM_candidates$motif_name[grep("ETS", PWM_candidates$motif_description2, ignore.case = T)])),
                  SREBP=data.frame(Motif="SREBP", ID="flyfactorsurvey__HLH106_SANGER_10_FBgn0015234"),
                  SREBP=data.frame(Motif="SREBP", ID=as.character(PWM_candidates$motif_name[grep("SREBP|HLH106", PWM_candidates$motif_description2, ignore.case = T)])),
                  STAT=data.frame(Motif="STAT", ID="bergman__Stat92E"),
                  STAT=data.frame(Motif="STAT", ID=as.character(PWM_candidates$motif_name[grep("stat", PWM_candidates$motif_description2, ignore.case = T)])),
                  CREB=data.frame(Motif="CREB/ATF", ID="cisbp__M0295"),
                  CREB=data.frame(Motif="CREB/ATF", ID=as.character(PWM_candidates$motif_name[grep("creb", PWM_candidates$motif_description2, ignore.case = T)])
                  )
)
TF_motifs <- do.call(rbind, TF_motifs)
TF_motifs$Motif <- factor(TF_motifs$Motif)

Sequences_motifs_16bp <- lapply(motif_counts_all, function(p){
  
  candidates_variant_enrichment <- data.frame(sapply(levels(TF_motifs$Motif), function(m){
    return(rowSums(p[,as.character(TF_motifs$ID[TF_motifs$Motif==m])])>0)
  }))
  candidates_variant_enrichment[,"Other motifs"] <- rowSums(p[,17:ncol(p)])>0
  colSums(candidates_variant_enrichment)
  
  return(candidates_variant_enrichment)
})
colSums(Sequences_motifs_16bp[[1]])
##         AP.1     CREB.ATF          ETS         GATA        SREBP         STAT 
##           98            5            0           23           51           24 
##          Trl        twist Other motifs 
##            0            2          419
colSums(Sequences_motifs_16bp[[2]])
##         AP.1     CREB.ATF          ETS         GATA        SREBP         STAT 
##          172           30            7           49           92           39 
##          Trl        twist Other motifs 
##            0            2          562
table(rowSums(Sequences_motifs_16bp[[1]][,1:8]))
## 
##   0   1   2 
## 399 199   2
table(rowSums(Sequences_motifs_16bp[[2]][,1:8]))
## 
##   0   1   2   3 
## 242 326  31   1
# which variants
all_var_list <- lapply(Sequences_motifs_16bp, function(i){
  i[,"Group"] <- apply(i, 1, function(x){
    if(sum(x[1:8]==TRUE)>1){"Multiple"
    }else if(sum(x[1:8]==TRUE)==1){paste0(colnames(i)[1:8][x[1:8]==TRUE], collapse = "_")
    }else if(sum(x[1:8]==TRUE)==0 & x["Other motifs"]==TRUE){paste0(colnames(i)[x==TRUE], collapse = "_")
    }else{"No motifs"}
  })
  all_var <- table(i[,"Group"])
  names(all_var)[names(all_var)=="AP.1"] <- "AP-1"
  names(all_var)[names(all_var)=="CREB.ATF"] <- "CREB/ATF"
  df <- data.frame(all_var)
  df$Var1 <- factor(df$Var1, levels=rev(c("No motifs", "Other motifs", "Multiple", rev(sort(levels(TF_motifs$Motif))))))
  df
})

gg_bar1 <- ggplot() +
  geom_bar(data=all_var_list[[1]], aes("1e-05", Freq, fill=Var1), position="stack", stat="identity", width=0.8) +
  geom_bar(data=all_var_list[[2]], aes("1e-04", Freq, fill=Var1), position="stack", stat="identity", width=0.8) +
  xlab("PWM cutoff") +
  scale_y_continuous("# of variants", expand = c(0,0)) +
  scale_x_discrete(limits=c("1e-05", "1e-04")) +
  scale_fill_manual("Motifs", values=c(my_motif_col, "No motifs"="grey90", "Other motifs"="#b3cccc", Multiple="#FF9999"),
                    limits = levels(all_var_list[[2]]$Var1))

gg_bar2 <- ggplot() +
  geom_bar(data=all_var_list[[2]], aes("1e-04", Freq, fill=Var1), position="stack", stat="identity", width=0.8) +
  xlab("PWM cutoff") +
  scale_y_continuous("# of variants", expand = c(0,0)) +
  scale_fill_manual("Motifs", values=c(my_motif_col, "No motifs"="grey90", "Other motifs"="#b3cccc", Multiple="#FF9999"),
                    limits = levels(all_var_list[[2]]$Var1))

print(gg_bar2)

print(gg_bar1)

Analyse activity of variants in all enhancer positions

Fig2B - activity of variants at different enhancer positions

metadata_final <- read.delim("Table_enhancer_variant_counts_and_enrichment.txt", stringsAsFactors = F)

metadata_final$library <- factor(metadata_final$library, levels = c("S2171_wt", "chr3L_3310914_3311162_wt", "chr3L_3310914_3311162_ref", "Wt", "ced6_pos110", "ced6_pos182", "ced6_pos230", "ced6_pos241", "ZnT63C_pos210", "ZnT63C_pos180", "ZnT63C_pos142"),
                                 labels=c("S2171_wt", "chr3L_3310914_3311162_wt", "chr3L_3310914_3311162_ref", "Wt", "pos110", "pos182", "pos230", "pos241", "pos210", "pos180", "pos142"))

# Melt table
tmp <- reshape::melt(metadata_final[,c("Variant", "library", "S21717_REEFmix2_log2FoldChange", "chr3L_3310914_3311162_REEFmix3_log2FoldChange")])
tmp <- tmp[complete.cases(tmp$value),]
tmp$library <- factor(tmp$library, levels=c("S2171_wt", "chr3L_3310914_3311162_wt", "chr3L_3310914_3311162_ref", "Wt", "pos110", "pos182", "pos230", "pos241", "pos142", "pos180", "pos210"))

# get motif average to choose variants to highlight
Average_motif_enrichment_log2 <- readRDS("Average_motif_enrichment_log2.rds")

plot_list <- list()
for(e in as.character(unique(tmp$variable))){
  tmp2 <- tmp[tmp$variable %in% e & !tmp$library %in% c("chr3L_3310914_3311162_ref", "chr3L_3310914_3311162_wt", "S2171_wt", "Wt"),]
  tmp2$Variant_sequence <- sapply(strsplit(as.character(tmp2$Variant),"_"), `[`, length(strsplit(as.character(tmp2$Variant)[1],"_")[[1]])-1)
  
  # x labels counts per boxplot
  if(e=="S21717_REEFmix2_log2FoldChange"){
    tmp2$library <- factor(tmp2$library, levels = c("pos110", "pos241", "pos182", "pos230"))
    xlabels <- sapply(as.character(levels(tmp2$library)), function(x){
      paste0(x,"\n(", table(tmp2$library)[x], ")")
    })
    id="S2171_wt_+"
    ti <- "ced-6"
    variants_choice <- rbind(data.frame(Motif="CCGGAA", Pos=c("pos110", "pos241"), Average=c(Average_motif_enrichment_log2$Mean_Enrichment_log2[Average_motif_enrichment_log2$Motif=="CCGGAA" & Average_motif_enrichment_log2$varID=="ced6 pos110"],
                                                                                             Average_motif_enrichment_log2$Mean_Enrichment_log2[Average_motif_enrichment_log2$Motif=="CCGGAA" & Average_motif_enrichment_log2$varID=="ced6 pos241"])),
                             data.frame(Motif="GATAAG", Pos=c("pos182", "pos230"), Average=c(Average_motif_enrichment_log2$Mean_Enrichment_log2[Average_motif_enrichment_log2$Motif=="GATAAG" & Average_motif_enrichment_log2$varID=="ced6 pos182"],
                                                                                             Average_motif_enrichment_log2$Mean_Enrichment_log2[Average_motif_enrichment_log2$Motif=="GATAAG" & Average_motif_enrichment_log2$varID=="ced6 pos230"])))
  }
  if(e=="chr3L_3310914_3311162_REEFmix3_log2FoldChange"){
    xlabels <- sapply(as.character(levels(tmp2$library)), function(x){
      paste0(x,"\n(", table(tmp2$library)[x], ")")
    })
    id=c("chrX_9273894_9274142_+_dCP_-")
    ti <- "ZnT63C"
    variants_choice <- rbind(data.frame(Motif="TCACGCGA", Pos=c("pos142", "pos180", "pos210"), Average=c(Average_motif_enrichment_log2$Mean_Enrichment_log2[Average_motif_enrichment_log2$Motif=="TCACGCGA" & Average_motif_enrichment_log2$varID=="ZnT63C pos142"],
                                                                                                         Average_motif_enrichment_log2$Mean_Enrichment_log2[Average_motif_enrichment_log2$Motif=="TCACGCGA" & Average_motif_enrichment_log2$varID=="ZnT63C pos180"],
                                                                                                         Average_motif_enrichment_log2$Mean_Enrichment_log2[Average_motif_enrichment_log2$Motif=="TCACGCGA" & Average_motif_enrichment_log2$varID=="ZnT63C pos210"])))
  }
  
  if(length(id)==2){tmp_col <- c("red", "grey40")}else{tmp_col <- "red"}
  if(length(id)==2){tmp_ann <- c("ref", "wt")}else{tmp_ann <- "wt"}
  
  boxplot <- ggplot(tmp2, aes(library, value, fill=library)) +
    geom_boxplot(col=NA, fill=NA) +
    geom_violin(alpha=0.7) +
    geom_boxplot(outlier.size = -1, color="black", width=0.17, size=0.8, fill=NA) +
    geom_hline(yintercept = 0, linetype="dashed", col="grey40") +
    geom_hline(yintercept = tmp$value[tmp$Variant %in% id & tmp$variable %in% e], linetype="dashed", col=tmp_col) +
    annotate(geom="text", x=1, y=tmp$value[tmp$Variant %in% id & tmp$variable %in% e],
             label=tmp_ann,
             color=tmp_col,
             hjust=2, vjust=-0.4, size = 4) +
    scale_fill_manual("", values=my_col) +
    guides(fill="none") +
    scale_y_continuous("Enhancer activity (log2)", breaks = seq(-4,10,2), labels = c("1/16", "1/2", 2**seq(0,10,2)), limits = c(-2.28, 9.7)) +
    scale_x_discrete("", labels=xlabels) +
    ggtitle(ti) +
    theme_bw(base_size = 14) +
    theme(axis.text = element_text(colour="black"),
          axis.title = element_text(colour="black"),
          axis.line = element_blank(),
          panel.border = element_rect(colour="black"),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_line(colour="grey95", size=0.4),
          axis.ticks = element_line(colour="black"),
          plot.title = element_text(hjust = 0.5)
    )
  
  
  # add variants
  variants_choice$var <- NA
  variants_choice$var_activity <- NA
  for(m in unique(variants_choice$Motif)){
    var_list <- lapply(variants_choice$Pos[variants_choice$Motif==m], function(p){
      v <- tmp2[tmp2$library %in% p,]
      v <- v[grep(m, v$Variant_sequence),]
      # motif variants difference to avg
      v$delta <- v$value - variants_choice$Average[variants_choice$Motif==m & variants_choice$Pos==p]
      return(v[,c("Variant_sequence", "delta")])
    })  
    var_list_red <- Reduce(function(dtf1, dtf2) merge(dtf1, dtf2, by = "Variant_sequence"),
                           var_list)
    out <- tmp2[grep(var_list_red$Variant_sequence[order(abs(rowSums(var_list_red[,-1])))][1], tmp2$Variant),]
    
    variants_choice$var[variants_choice$Motif==m] <- out$Variant[match(variants_choice$Pos[variants_choice$Motif==m], out$library)]
    variants_choice$var_activity[variants_choice$Motif==m] <- out$value[match(variants_choice$Pos[variants_choice$Motif==m], out$library)]
  }
  print(variants_choice)
  
  boxplot <- boxplot +
    geom_point(data=variants_choice, aes(Pos, var_activity), size=3, fill=NA, col=my_motif_col[as.character(variants_choice$Motif)])
    
  plot_list[[e]] <- boxplot
}
##    Motif    Pos  Average                  var var_activity
## 1 CCGGAA pos110 4.764639 GATAA_1st_ATCCGGAA_+     5.172439
## 2 CCGGAA pos241 3.163757 GATAA_2nd_ATCCGGAA_+     2.725990
## 3 GATAAG pos182 5.940204  CACA_2nd_CAGATAAG_+     6.236382
## 4 GATAAG pos230 5.258812  GAGA_1st_CAGATAAG_+     4.853649
##      Motif    Pos  Average              var var_activity
## 1 TCACGCGA pos142 7.031426   AP1_TCACGCGA_+     6.764411
## 2 TCACGCGA pos180 7.725775 twist_TCACGCGA_+     7.832483
## 3 TCACGCGA pos210 5.492017 GATAA_TCACGCGA_+     4.760254
plot_grid(plotlist = plot_list, rel_widths = c(4,3))

# ced-6 enhancer
table(metadata_final$library, metadata_final$S21717_REEFmix2_log2FoldChange>metadata_final$S21717_REEFmix2_log2FoldChange[grep("S2171_wt", metadata_final$library)])
##                            
##                             FALSE  TRUE
##   S2171_wt                      1     0
##   chr3L_3310914_3311162_wt      0     1
##   chr3L_3310914_3311162_ref     0     1
##   Wt                         3056   155
##   pos110                    63779  1465
##   pos182                    30530 20437
##   pos230                    18258 46861
##   pos241                    61411   600
##   pos210                        0     0
##   pos180                        0     0
##   pos142                        0     0
# ZnT63C chr3L_3310914_3311162 enhancer
table(metadata_final$library, metadata_final$chr3L_3310914_3311162_REEFmix3_log2FoldChange>metadata_final$chr3L_3310914_3311162_REEFmix3_log2FoldChange[grep("chr3L_3310914_3311162_wt", metadata_final$library)])
##                            
##                             FALSE  TRUE
##   S2171_wt                      0     1
##   chr3L_3310914_3311162_wt      1     0
##   chr3L_3310914_3311162_ref     0     1
##   Wt                         2221   400
##   pos110                        0     0
##   pos182                        0     0
##   pos230                        0     0
##   pos241                        0     0
##   pos210                    33648 27751
##   pos180                    59858  2018
##   pos142                    55483  2918
# ZnT63C chr3L_3310914_3311162 REF enhancer (chr2R_4239779_4240027_+_dCP_+)
table(metadata_final$library, metadata_final$chr3L_3310914_3311162_REEFmix3_log2FoldChange>metadata_final$chr3L_3310914_3311162_REEFmix3_log2FoldChange[grep("chrX_9273894_9274142_+_dCP_-", metadata_final$Variant, fixed=T)])
##                            
##                             FALSE  TRUE
##   S2171_wt                      1     0
##   chr3L_3310914_3311162_wt      1     0
##   chr3L_3310914_3311162_ref     1     0
##   Wt                         2610    11
##   pos110                        0     0
##   pos182                        0     0
##   pos230                        0     0
##   pos241                        0     0
##   pos210                    61249   150
##   pos180                    61830    46
##   pos142                    58375    26

Comparison of variant activities in different enhancer positions using the 8nt variants

Merge libraries - add enrichments and fw and rv sequences

metadata_final <- read.delim("Table_enhancer_variant_counts_and_enrichment.txt", stringsAsFactors = F)

vars <- unique(unlist(lapply(grep("sequence_fw$", names(metadata_final), value = T), function(x) unique(metadata_final[,x]))))
vars= vars[nchar(vars)==8]
Merged_enrichments <- data.frame(row.names = vars[complete.cases(vars)],
                                 variant_sequence_fw=vars[complete.cases(vars)],
                                 variant_sequence_rv=as.character(reverseComplement(DNAStringSet(vars[complete.cases(vars)]))),
                                 stringsAsFactors = F)

# only data for variants and wt
metadata_final <- metadata_final[!metadata_final$library %in% c("Wt", "S2171_wt", "chr3L_3310914_3311162_ref"),]

# merge information in loop
for(e in sapply(strsplit(grep("_sequence_fw$", names(metadata_final), value = T),"_sequence_fw"), `[`, 1)){
  if(length(grep("chr3L_3310914_3311162|twist_S2171", e))>0){
    Merged_enrichments[,paste0(e, "_act")] <- metadata_final$chr3L_3310914_3311162_REEFmix3_log2FoldChange[match(rownames(Merged_enrichments), metadata_final[,paste0(e, "_sequence_fw")])]
  }else{
    Merged_enrichments[,paste0(e, "_act")] <- metadata_final$S21717_REEFmix2_log2FoldChange[match(rownames(Merged_enrichments), metadata_final[,paste0(e, "_sequence_fw")])]
  }
  
  Merged_enrichments[,paste0(e, "_fw_extended4bp")] <- metadata_final[match(rownames(Merged_enrichments), metadata_final[,paste0(e, "_sequence_fw")]), paste0(e, "_sequence_fw_extended4bp")]
  Merged_enrichments[,paste0(e, "_rv_extended4bp")] <- metadata_final[match(rownames(Merged_enrichments), metadata_final[,paste0(e, "_sequence_fw")]), paste0(e, "_sequence_rv_extended4bp")]
  
}

names(Merged_enrichments) <- gsub("GATAA_1st_S2171", "ced6_pos110", names(Merged_enrichments))
names(Merged_enrichments) <- gsub("CACA_2nd_S2171", "ced6_pos182", names(Merged_enrichments))
names(Merged_enrichments) <- gsub("GAGA_1st_S2171", "ced6_pos230", names(Merged_enrichments))
names(Merged_enrichments) <- gsub("GATAA_2nd_S2171", "ced6_pos241", names(Merged_enrichments))

names(Merged_enrichments) <- gsub("AP1_chr3L_3310914_3311162", "ZnT63C_pos142", names(Merged_enrichments))
names(Merged_enrichments) <- gsub("twist_chr3L_3310914_3311162", "ZnT63C_pos180", names(Merged_enrichments))
names(Merged_enrichments) <- gsub("GATAA_chr3L_3310914_3311162", "ZnT63C_pos210", names(Merged_enrichments))

# write table
write.table(Merged_enrichments, "Enrichments_and_sequence_information_all_REEF_libraries.txt", sep="\t", row.names = F, quote=F)

Supplementary Table 3

Merged_enrichments <- read.delim("Enrichments_and_sequence_information_all_REEF_libraries.txt")

### z-score normalise to just compare differences within libraries (against mean)
df_scaled <- apply(Merged_enrichments[,grep("_act", names(Merged_enrichments), value = T)], 2, scale)
colnames(df_scaled) <- paste0(colnames(df_scaled), "_scaled")


out <- cbind(Merged_enrichments, df_scaled)

write.table(out, "Supplementary_Table_3.txt", sep="\t", quote=F, row.names=F)

Fig S8C,D - pairwise comparison per position

Didn’t include plots because they are too heavy

Merged_enrichments <- read.delim("Enrichments_and_sequence_information_all_REEF_libraries.txt")

### z-score normalise to just compare differences within libraries (against mean)
df <- Merged_enrichments
df[,grep("_act", names(df), value = T)] <- apply(df[,grep("_act", names(df), value = T)], 2, scale)

for(l in 8:5){
  
  if(l==8) df$variant_sequence_fw_corrected <- df$variant_sequence_fw
  if(l==7) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 1,7)
  if(l==6) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 2,7)
  if(l==5) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 3,7)
  if(l==4) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 3,6)
  if(l==3) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 4,6)
  if(l==2) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 4,5)
  
  tmp <- df %>% 
    group_by(variant_sequence_fw_corrected) %>%
    summarise(ced6_pos110=mean(ced6_pos110_act, na.rm=T),
              ced6_pos182=mean(ced6_pos182_act, na.rm=T),
              ced6_pos230=mean(ced6_pos230_act, na.rm=T),
              ced6_pos241=mean(ced6_pos241_act, na.rm=T),
              ZnT63C_pos142=mean(ZnT63C_pos142_act, na.rm=T),
              ZnT63C_pos180=mean(ZnT63C_pos180_act, na.rm=T),
              ZnT63C_pos210=mean(ZnT63C_pos210_act, na.rm=T)
              )
  
  tmp <- as.data.frame(tmp)
  
  ### plots
  
  plot_list_tmp = list()
  plot_list_motifs = list()
  plot_list_tmp_no_GATA= list()
  v <- names(tmp)[-1]
  for(i in 1:(length(v)-1)){
    for(i2 in (i+1):length(v)){
      a <- v[i]
      b <- v[i2]
      
      # PCC
      pc <- cor.test(tmp[,a],
                     tmp[,b],
                     method = "pearson")
      
      my_col=c("grey70","grey40")
      
      br <- ifelse(l>6,2,1) # axes breaks
      br <- ifelse(l>4,br,0.5)
      
      si <- ifelse(l>6,0.5,0.7) # point size
      si <- ifelse(l>4,si,2)
      
      # plot
      scater <- ggplot(tmp, aes(tmp[,a], tmp[,b])) +
        geom_pointdensity(size=si, alpha=0.7) +
        geom_abline(intercept = 0, slope = 1, linetype="dashed", col="grey50") +
        scale_color_gradient(low = my_col[1], high = my_col[2]) +
        scale_x_continuous(paste0(gsub("_", " ", a), " (mean log2 act scaled)"),
                           breaks=seq(-10,20, ifelse(l>6,2,1))) +
        scale_y_continuous(paste0(gsub("_", " ", b), " (mean log2 act scaled)"),
                           breaks=seq(-10,20, ifelse(l>6,2,1))) +
        guides(color=F) +
        theme_bw(base_size = 16) +
        theme(panel.grid = element_blank(),
              axis.text = element_text(colour="black"),
              plot.title = element_text(hjust=0.5, size=17)) +
        # ggtitle(paste0(gsub("_", " ", a), " vs ", gsub("_", " ", b), " (", l, "nt variants)")) +
        annotate("text",  x=min(tmp[,a], na.rm=T), y = max(tmp[,b], na.rm=T), label = paste0("PCC: ", round(pc$estimate,2), "\n(n=",                                                                                                         length(which(rowSums(is.na(tmp[,c(a,b)]))==0)),
                                                                                             ")"), vjust=1, hjust=0, size=5)
      
      plot_list_tmp[[paste0(a,"_",b)]] = ggplotGrob(scater)
      
      
      if(l>=5 & l<=6){
        ### add points with specific motifs
        tmp_subest <- tmp[c(grep("GATAA", tmp$variant_sequence_fw_corrected),
                            grep("TTATC", tmp$variant_sequence_fw_corrected),
                            grep("CA..TG", tmp$variant_sequence_fw_corrected)),]
        tmp_subest <- tmp_subest[-grep("CAGGTG|CACCTG", tmp_subest$variant_sequence_fw_corrected),]
      }
      if(l==7){
        ### add points with specific motifs
        tmp_subest <- tmp[c(#grep("ACGTGAC", tmp$variant_sequence_fw_corrected),
          #grep("GTCACGT", tmp$variant_sequence_fw_corrected),
          grep("TGA.TCA", tmp$variant_sequence_fw_corrected)),]
      }
      if(l==8){
        ### add points with specific motifs
        tmp_subest <- tmp[c(grep("TCACGCGA", tmp$variant_sequence_fw_corrected),
                            grep("TCGCGTGA", tmp$variant_sequence_fw_corrected)),]
      }
      
      scater2 <- scater +
        geom_point(data=tmp_subest, aes(tmp_subest[,a], tmp_subest[,b]), size=0.8) +
        geom_text_repel(data=tmp_subest,
                        aes(tmp_subest[,a], tmp_subest[,b],
                            label = variant_sequence_fw_corrected), size = 2.5)      
      
      if(l==5){
        ### add points with specific motifs
        tmp_subest2 <- tmp[order(rowMeans(tmp[,c(a,b)]), decreasing = T),][1:10,]
        tmp_subest2 <- tmp_subest2[-c(grep("GATAA", tmp_subest2$variant_sequence_fw_corrected),
                                      grep("TTATC", tmp_subest2$variant_sequence_fw_corrected)),]
        
        scater2 <- scater2 +
          geom_text_repel(data=tmp_subest2,
                          aes(tmp_subest2[,a], tmp_subest2[,b],
                              label = variant_sequence_fw_corrected), size = 2.5,
                          colour="grey40")
      }
      
      plot_list_motifs[[paste0(a,"_",b)]] = ggplotGrob(scater2)
      
    }
  }
  pdf(paste0("Compared_enrichments_per_position_different_variant_lengths_scaled_", l, "bp.pdf"), width = 28, height = 25)

  print(gridExtra::grid.arrange(grobs = plot_list_tmp))
  if(l>=5) print(gridExtra::grid.arrange(grobs = plot_list_motifs))
  
  dev.off()
  
  print(l)
}
Merged_enrichments <- read.delim("Enrichments_and_sequence_information_all_REEF_libraries.txt")

### I could maybe z-score normalise to just compare differences within libraries (against mean)
df <- Merged_enrichments
df[,grep("_act", names(df), value = T)] <- apply(df[,grep("_act", names(df), value = T)], 2, scale)


l=6
df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 2,7)

tmp <- df %>% 
  group_by(variant_sequence_fw_corrected) %>%
  summarise(GATAA_1st_ced6=mean(GATAA_1st_ced6_act, na.rm=T),
            CACA_ced6=mean(CACA_ced6_act, na.rm=T),
            GAGA_ced6=mean(GAGA_ced6_act, na.rm=T),
            GATAA_2nd_ced6=mean(GATAA_2nd_ced6_act, na.rm=T),
            GATAA_ZnT63C=mean(GATAA_ZnT63C_act, na.rm=T),
            AP1_ZnT63C=mean(AP1_ZnT63C_act, na.rm=T),
            twist_ZnT63C=mean(twist_ZnT63C_act, na.rm=T)
  )

tmp <- as.data.frame(tmp)

### plots
plot_list_motifs = list()
v <- names(tmp)[-1]
for(i in 1:(length(v)-1)){
  for(i2 in (i+1):length(v)){
    a <- v[i]
    b <- v[i2]
    
    # PCC
    pc <- cor.test(tmp[,a],
                   tmp[,b],
                   method = "pearson")
    
    my_col=c("grey70","grey40")
    
    br <- ifelse(l>6,2,1) # axes breaks
    br <- ifelse(l>4,br,0.5)
    
    si <- ifelse(l>6,0.5,0.7) # point size
    si <- ifelse(l>4,si,2)
    
    # plot
    scater <- ggplot(tmp, aes(tmp[,a], tmp[,b])) +
      geom_pointdensity(size=si, alpha=0.7) +
      geom_abline(intercept = 0, slope = 1, linetype="dashed", col="grey50") +
      scale_color_gradient(low = my_col[1], high = my_col[2]) +
      scale_x_continuous(paste0(gsub("_", " ", a), " (mean log2 act scaled)"),
                         breaks=seq(-10,20,1)) +
      scale_y_continuous(paste0(gsub("_", " ", b), " (mean log2 act scaled)"),
                         breaks=seq(-10,20,1)) +
      guides(color=F) +
      theme_bw(base_size = 16) +
      theme(panel.grid = element_blank(),
            axis.text = element_text(colour="black"),
            plot.title = element_text(hjust=0.5, size=17)) +
      # ggtitle(paste0(gsub("_", " ", a), " vs ", gsub("_", " ", b), " (", l, "nt variants)")) +
      annotate("text",  x=min(tmp[,a], na.rm=T), y = max(tmp[,b], na.rm=T), label = paste0("PCC: ", round(pc$estimate,2), "\n(n=",                                                                                                         length(which(rowSums(is.na(tmp[,c(a,b)]))==0)),
                                                                                           ")"), vjust=1, hjust=0, size=5)
    
    ### add points with specific motifs
    tmp_subest_GATA <- tmp[grep("GATAA|TTATC", tmp$variant_sequence_fw_corrected),]
    tmp_subest_GATA <- tmp_subest_GATA[-grep("AGATAA|TTATCT|GATAAT|ATTATC", tmp_subest_GATA$variant_sequence_fw_corrected),]
    tmp_subest_GATA$Motif <- "GATA"
    tmp_subest_twist <- tmp[grep("CACATG|CAGATG|CATGTG|CATATG|CATCTG|CACGTG|CAGCTG", tmp$variant_sequence_fw_corrected),]
    tmp_subest_twist$Motif <- "twist"
    tmp_subest_ETS <- tmp[grep("CCGGAA|TTCCGG", tmp$variant_sequence_fw_corrected),]
    tmp_subest_ETS$Motif <- "ETS"
    tmp_subest <- rbind(tmp_subest_GATA, tmp_subest_twist, tmp_subest_ETS)
    
    scater2 <- scater +
      geom_point(data=tmp_subest, aes(tmp_subest[,a], tmp_subest[,b], fill=Motif), size=2, pch=21, stroke = 0.2) +
      scale_fill_manual(values=my_motif_col) +
      geom_text_repel(data=tmp_subest,
                      aes(tmp_subest[,a], tmp_subest[,b],
                          label = variant_sequence_fw_corrected), size = 2.3, col="black")
    
    plot_list_motifs[[paste0(a,"_",b)]] = ggplotGrob(scater2)
    
  }
}

print(gridExtra::grid.arrange(grobs = plot_list_motifs))

Fig S8A,B - clustering of positions

require(pheatmap)

Merged_enrichments <- read.delim("Enrichments_and_sequence_information_all_REEF_libraries.txt")

### I could maybe z-score normalise to just compare differences within libraries (against mean)
df <- Merged_enrichments
df[,grep("_act", names(df), value = T)] <- apply(df[,grep("_act", names(df), value = T)], 2, scale)

breaksList=seq(0,1,length.out = 200)
mycol = colorRampPalette(rev(RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")))(length(breaksList))

pearson_out <- data.frame()
for(l in 8:2){
  
  if(l==8) df$variant_sequence_fw_corrected <- df$variant_sequence_fw
  if(l==7) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 1,7)
  if(l==6) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 2,7)
  if(l==5) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 3,7)
  if(l==4) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 3,6)
  if(l==3) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 4,6)
  if(l==2) df$variant_sequence_fw_corrected <- substr(df$variant_sequence_fw, 4,5)
  
  tmp <- df %>% 
    group_by(variant_sequence_fw_corrected) %>%
    summarise(ced6_pos110=mean(ced6_pos110_act, na.rm=T),
              ced6_pos182=mean(ced6_pos182_act, na.rm=T),
              ced6_pos230=mean(ced6_pos230_act, na.rm=T),
              ced6_pos241=mean(ced6_pos241_act, na.rm=T),
              ZnT63C_pos142=mean(ZnT63C_pos142_act, na.rm=T),
              ZnT63C_pos180=mean(ZnT63C_pos180_act, na.rm=T),
              ZnT63C_pos210=mean(ZnT63C_pos210_act, na.rm=T)
              )
  
  tmp <- as.data.frame(tmp)
  
  ### correlation heatmap - Pearson
  cormat <- cor(tmp[,-1], method = "pearson", use = "pairwise.complete.obs")
  print(pheatmap(cormat,
                 clustering_distance_cols = as.dist(1 - cormat),
                 clustering_distance_rows = as.dist(1 - cormat),
                 fontsize_row = 10,
                 angle_col = 0,
                 #fontsize_col = 12,
                 show_colnames = F,
                 color=mycol,
                 breaks=breaksList,
                 border_color = NA,
                 main=paste0(l, "nt variants")))
  
  # pearson out
  cor_function <- function(matrix){
    cors <- cor(x = matrix, method = "pearson", use = "pairwise.complete.obs")
    cor.data <- cors[upper.tri(cors, diag = FALSE)]  # we're only interested in one of the off-diagonals, otherwise there'd be duplicates
    cor.data <- as.data.frame(cor.data)  # that's how ggplot likes it
    colnames(cor.data) <- "corr"
    return(cor.data$corr)
  }

  pearson_out <- rbind(pearson_out, data.frame(Length=l,
                                               PCC=cor_function(tmp[,-1])))
  
  print(l)
}

## [1] 8

## [1] 7

## [1] 6

## [1] 5

## [1] 4

## [1] 3

## [1] 2
### plot PCCs in function of length
pearson_out$Length <- factor(pearson_out$Length, levels = unique(pearson_out$Length), labels = paste0(unique(pearson_out$Length), "nt"))

boxplot_PCC <- ggplot(pearson_out, aes(Length, PCC, fill=Length)) +
  geom_boxplot(outlier.size = 0.7) +
  scale_fill_brewer("", palette = "Blues") +
  guides(fill=F) +
  scale_y_continuous("PCC", breaks = seq(-10,18,0.2), limits = c(0,1)) +
  xlab("Variant length") +
  theme_bw(base_size = 11) +
  theme(axis.text = element_text(colour="black"),
        plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(size = 13),
        axis.text.y = element_text(size = 12),
        axis.title = element_text(size = 15),
        legend.text = element_text(size = 14)
  )

print(boxplot_PCC)

Plot activity of variants with known motifs per library

Count_table_final <- read.delim("Table_enhancer_variant_counts_and_enrichment.txt")

names(Count_table_final) <- gsub("GATAA_1st_S2171", "ced6_pos110", names(Count_table_final))
names(Count_table_final) <- gsub("CACA_2nd_S2171", "ced6_pos182", names(Count_table_final))
names(Count_table_final) <- gsub("GAGA_1st_S2171", "ced6_pos230", names(Count_table_final))
names(Count_table_final) <- gsub("GATAA_2nd_S2171", "ced6_pos241", names(Count_table_final))
names(Count_table_final) <- gsub("AP1_chr3L_3310914_3311162", "ZnT63C_pos142", names(Count_table_final))
names(Count_table_final) <- gsub("twist_chr3L_3310914_3311162", "ZnT63C_pos180", names(Count_table_final))
names(Count_table_final) <- gsub("GATAA_chr3L_3310914_3311162", "ZnT63C_pos210", names(Count_table_final))

Count_table_final$library <- factor(Count_table_final$library, levels = c("Wt", "S2171_wt", "chr3L_3310914_3311162_wt", "chr3L_3310914_3311162_ref",
                                               "ced6_pos110", "ced6_pos182", "ced6_pos230", "ced6_pos241", "ZnT63C_pos142", "ZnT63C_pos180", "ZnT63C_pos210"))

motif_interest_list <- c("TGA.TCA", "GATAAG", "TCATCA", "TTCC.GGA",
                         "GAGAGA",
                         "CATCTG", "CCGGAA", "TCACGCGA",
                         "AGGATAA"
)
names(motif_interest_list) <- motif_interest_list

Data_for_example_boxplots <- data.frame()
for(lib in gsub("_sequence_fw", "", grep("sequence_fw$", names(Count_table_final), value = T))){
  
  if(length(grep("ced6", lib)) > 0){
    lib2 <- lib
    mix <- "S21717_REEFmix2"
    wt <- "S2171_wt"
    var <- "S21717_REEFmix2_log2FoldChange"
  }else{
    lib2 <- lib
    mix <- "chr3L_3310914_3311162_REEFmix3"
    #wt <- "chr3L_3310914_3311162_wt" # use reference enhancer instead of wt? (chr2R_4239779_4240027_+_dCP_+)
    wt <- "chrX_9273894_9274142_+_dCP_-" # use reference enhancer (should have the same activity as the wt) because I don't trust the wt levels
    var <- "chr3L_3310914_3311162_REEFmix3_log2FoldChange"
  }
  
  tmp_main <- Count_table_final[Count_table_final$library %in% c(lib2, wt) | Count_table_final$Variant %in% paste0(wt, "_+"),
                                c(1:4,grep(lib, names(Count_table_final)), grep(mix, names(Count_table_final)))]
  tmp_main$log2FoldChange <- tmp_main[,var]
  tmp_main <- tmp_main[complete.cases(tmp_main$log2FoldChange),]
  
  tmp <- tmp_main[order(tmp_main$log2FoldChange, decreasing = T),]
  
  # make table with enrichments per motif
  motif_variant_enrichment_16bp <- lapply(motif_interest_list, function(m){
    fw=tmp_main$log2FoldChange[grep(m, tmp_main[,paste0(lib, "_sequence_fw_extended4bp")])]
    if(!DNAString(m) == reverseComplement(DNAString(m))){rv=tmp_main$log2FoldChange[grep(m, tmp_main[,paste0(lib, "_sequence_rv_extended4bp")])]}else{rv <- c()}
    
    # this library has TCA in right flanks - don't allow to use that otherwise it will use a lot of variants
    if(lib=="ZnT63C_pos180" & m=="TGA.TCA"){
      fw=tmp_main$log2FoldChange[grep(paste0(m, "."), tmp_main[,paste0(lib, "_sequence_fw_extended4bp")])]
    }
    
    data.frame(Motif=m,
               Strand=c(rep("fw",length(fw)), rep("rv",length(rv))),
               Enrichment_log2=c(fw, rv))
  })
  
  # plot for both tables
  seq="16bp"
  
  # join all
  motif_variant_enrichment <- do.call(rbind, motif_variant_enrichment_16bp)
  
  # set factor levels
  motif_levels <- motif_variant_enrichment %>%
    group_by(Motif) %>%
    summarise(median=median(Enrichment_log2)) %>%
    arrange(desc(median)) %>%
    select(Motif)
  
  motif_variant_enrichment$Motif <- factor(motif_variant_enrichment$Motif, levels =motif_interest_list)
  
  # x labels counts per boxplot
  t <- data.frame(table(motif_variant_enrichment$Motif))
  xlabels <- sapply(as.character(levels(motif_variant_enrichment$Motif)), function(x){
    TF=""
    if(x=="GATAAG") TF="GATA"
      if(x=="TGA.TCA") TF="AP-1"
      if(x=="CATCTG") TF="twist"
      if(x=="GAGAG") TF="Trl"
      if(x=="GAGAGA") TF="Trl"
      if(x=="CCGGAA") TF="ETS"
      if(x=="TCACGCGA") TF="SREBP"
      if(x=="TTCC.GGA") TF="STAT"
      if(x=="TGATGT") TF="CREB"
      if(x=="TGTGAAA") TF="CREB"
      if(x=="TCATCA") TF="CREB"
      if(x=="ATCGAT") TF="Dref"
      if(x=="AGGATAA") TF="ttk"
      if(x %in% c("All", "Others")){
      paste0(TF,"\n(", x, t$Freq[t$Var1%in%x&t$Var2%in%"rv"], ")")}
    else{
      paste0(TF,"\n(",
             x, ", ", t$Freq[t$Var1%in%x], ")")}
  })
  
  boxplot_known_motifs <- ggplot(motif_variant_enrichment, aes(Motif, Enrichment_log2, fill=Motif)) +
    geom_boxplot(outlier.size = 0.6, alpha=0.9) +
    #geom_hline(yintercept = median(metadata_final$log2_ratio_UMI_corrected), linetype="dashed", col="grey40") +
    geom_hline(yintercept = tmp_main$log2FoldChange[tmp_main$Variant %in% paste0(wt, "_+")], linetype="dashed", col="orangered") +
    geom_hline(yintercept = median(tmp_main$log2FoldChange), linetype="dashed", col="grey60") +
    scale_fill_manual(values=my_motif_col) +
    scale_alpha_manual(values=c(0.3,0.8)) +
    scale_y_continuous("Enhancer activity (log2)", breaks = seq(-2,10,1), labels = c("1/4", "1/2", 2**seq(0,10,1)), limits = c(min(tmp_main$log2FoldChange),max(tmp_main$log2FoldChange))) +
    scale_x_discrete("", labels=xlabels) +
    guides(fill=F) +
    ggtitle(gsub("chr3L 3310914 3311162", "ZnT63C", gsub("S2171", "ced-6", gsub("_", " ", gsub("CACA_2nd", "CACA", gsub("GAGA_1st", "GAGA", lib)))))) +
    theme_bw(base_size = 14) +
    theme(axis.text = element_text(colour="black"),
          axis.text.x = element_text(colour="black", size=9.5, angle=45, hjust=1),
          axis.title = element_text(colour="black"),
          # axis.title.y = element_blank(),
          axis.line = element_blank(),
          panel.border = element_rect(colour="black"),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_line(colour="grey95", size=0.4),
          axis.ticks = element_line(colour="black"),
          plot.title = element_text(hjust=0.5)
    )
  
  print(boxplot_known_motifs)
  
  varID <- gsub("chr3L 3310914 3311162", "ZnT63C", gsub("S2171", "ced-6", gsub("_", " ", gsub("CACA_2nd", "CACA", gsub("GAGA_1st", "GAGA", lib)))))
  Data_for_example_boxplots <- rbind(Data_for_example_boxplots,
                                     cbind(varID, motif_variant_enrichment))
  
}

# average per motifs
out <- Data_for_example_boxplots %>% group_by(varID, Motif) %>%
  summarise(Mean_Enrichment_log2=mean(Enrichment_log2, na.rm=T))

saveRDS(out, "Average_motif_enrichment_log2.rds")

Fig S9B - plot activity of motif variants per position

motif_interest_list <- c("TGA.TCA", "GATAAG", "TCATCA", "TTCC.GGA",
                         "GAGAGA",
                         "CATCTG", "CCGGAA", "TCACGCGA",
                         "AGGATAA"
)
names(motif_interest_list) <- motif_interest_list

Average_results <- data.frame()
Data_for_example_boxplots <- data.frame()
for(lib in gsub("_sequence_fw", "", grep("sequence_fw$", names(Count_table_final), value = T))){
  
  if(length(grep("ced6", lib))>0){
    lib2 <- lib
    mix <- "S21717_REEFmix2"
    wt <- "S2171_wt_+"
    var <- "S21717_REEFmix2_log2FoldChange"
  }else{
    lib2 <- lib
    mix <- "chr3L_3310914_3311162_REEFmix3"
    #wt <- "chr3L_3310914_3311162_wt" # use reference enhancer instead of wt? (chr2R_4239779_4240027_+_dCP_+)
    wt <- "chrX_9273894_9274142_+_dCP_-" # use reference enhancer (should have the same activity as the wt) because I don't trust the wt levels
    var <- "chr3L_3310914_3311162_REEFmix3_log2FoldChange"
  }
  
  tmp_main <- Count_table_final[Count_table_final$library %in% c(lib2, wt) | Count_table_final$Variant %in% paste0(wt, "_+"),
                                c(1:5,grep(lib, names(Count_table_final)), grep(mix, names(Count_table_final)))]
  tmp_main$log2FoldChange <- scale(tmp_main[,var])
  tmp_main <- tmp_main[complete.cases(tmp_main$log2FoldChange),]
  
  # make table with enrichments per motif
  motif_variant_enrichment_16bp <- lapply(motif_interest_list, function(m){
    fw=tmp_main$log2FoldChange[grep(m, tmp_main[,paste0(lib, "_sequence_fw_extended4bp")])]
    if(!DNAString(m) == reverseComplement(DNAString(m))){rv=tmp_main$log2FoldChange[grep(m, tmp_main[,paste0(lib, "_sequence_rv_extended4bp")])]}else{rv <- c()}
    
    # this library has TCA in right flanks - don't allow to use that otherwise it will use a lot of variants
    if(lib=="ZnT63C_pos180" & m=="TGA.TCA"){
      fw=tmp_main$log2FoldChange[grep(paste0(m, "."), tmp_main[,paste0(lib, "_sequence_fw_extended4bp")])]
    }
    
    data.frame(Motif=m,
               Strand=c(rep("fw",length(fw)), rep("rv",length(rv))),
               Enrichment_log2=c(fw, rv))
  })
  
  
  # plot for both tables
  seq="16bp"
  
  # join all
  motif_variant_enrichment <- do.call(rbind, motif_variant_enrichment_16bp)
  
  # save main results
  varID <- gsub("ced6", "ced-6", gsub("_", " ", lib))
  Average_results <- rbind(Average_results, 
                           cbind(data.frame(Lib=varID,
                                            Seq=seq),
                                 motif_variant_enrichment %>%
                                   group_by(Motif) %>%
                                   summarise(Avg=mean(Enrichment_log2),
                                             Median=median(Enrichment_log2)) )
  )
  
  # set factor levels
  motif_levels <- motif_variant_enrichment %>%
    group_by(Motif) %>%
    summarise(median=median(Enrichment_log2)) %>%
    arrange(desc(median)) %>%
    select(Motif)
  
  motif_variant_enrichment$Motif <- factor(motif_variant_enrichment$Motif, levels =motif_interest_list)
  
  # x labels counts per boxplot
  t <- data.frame(table(motif_variant_enrichment$Motif))
  xlabels <- sapply(as.character(levels(motif_variant_enrichment$Motif)), function(x){
    TF=""
    if(x=="GATAAG") TF="GATA"
    if(x=="TGA.TCA") TF="AP-1"
    if(x=="CATCTG") TF="twist"
    if(x=="GAGAG") TF="Trl"
    if(x=="GAGAGA") TF="Trl"
    if(x=="CCGGAA") TF="ETS"
    if(x=="TCACGCGA") TF="SREBP"
    if(x=="TTCC.GGA") TF="STAT"
    if(x=="TGATGT") TF="CREB"
    if(x=="TGTGAAA") TF="CREB"
    if(x=="TCATCA") TF="CREB"
    if(x=="ATCGAT") TF="Dref"
    if(x=="AGGATAA") TF="ttk"
    if(x %in% c("All", "Others")){
      paste0(TF,"\n(", x, t$Freq[t$Var1%in%x&t$Var2%in%"rv"], ")")}
    else{
      paste0(TF,"\n(",
             x, ", ", t$Freq[t$Var1%in%x], ")")}
  })
  
  boxplot_known_motifs <- ggplot(motif_variant_enrichment, aes(Motif, Enrichment_log2, fill=Motif)) +
    geom_boxplot(outlier.size = 0.3, alpha=0.9) +
    geom_hline(yintercept = 0, linetype="dashed", col="grey60") +
    scale_fill_manual(values=my_motif_col) +
    scale_alpha_manual(values=c(0.3,0.8)) +
    scale_y_continuous("Enhancer activity (z-scores)", breaks = seq(-10,10,2)) +
    scale_x_discrete("", labels=xlabels) +
    guides(fill=F) +
    ggtitle(varID) +
    theme_bw(base_size = 14) +
    theme(axis.text = element_text(colour="black"),
          axis.text.x = element_text(colour="black", size=9.5, angle=45, hjust=1),
          axis.title = element_text(colour="black"),
          # axis.title.y = element_blank(),
          axis.line = element_blank(),
          panel.border = element_rect(colour="black"),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_line(colour="grey95", size=0.4),
          axis.ticks = element_line(colour="black"),
          plot.title = element_text(hjust=0.5)
    )
  
  print(boxplot_known_motifs)
  
  if(seq=="16bp"){
    Data_for_example_boxplots <- rbind(Data_for_example_boxplots,
                                       cbind(varID, motif_variant_enrichment))
  }
  
}

Fig 2E - boxplot examples

tmp <- Data_for_example_boxplots[Data_for_example_boxplots$varID %in% c("ced-6 pos241", "ZnT63C pos180") &
                                   Data_for_example_boxplots$Motif %in% c("TGA.TCA", "GATAAG", "CCGGAA", "AGGATAA"),]

xlabels <- sapply(as.character(unique(tmp$Motif)), function(x){
      TF=""
      if(x=="GATAAG") TF="GATA"
      if(x=="TGA.TCA") TF="AP-1"
      if(x=="CATCTG") TF="twist"
      if(x=="GAGAG") TF="Trl"
      if(x=="CCGGAA") TF="ETS"
      if(x=="TCACGCGA") TF="SREBP"
      if(x=="TCC.GGA") TF="STAT"
      if(x=="TGATGT") TF="CREB"
      if(x=="TGTGAAA") TF="CREB"
      if(x=="AGGATAA") TF="ttk"
      # paste0(TF,"\n(", x, ")")
      paste0(TF)
    })

boxplot_ex <- ggplot(tmp, aes(Motif, Enrichment_log2, fill=Motif)) +
      geom_boxplot(outlier.size = 0.3) +
      geom_hline(yintercept = 0, linetype="dashed", col="grey60") +
      scale_fill_manual(values=my_motif_col) +
      # scale_alpha_manual(values=c(0.5,0.9)) +
      scale_y_continuous("Enhancer activity (z-scores)", breaks = seq(-10,10,2)) +
      scale_x_discrete("Created TF motifs", labels=xlabels) +
      guides(fill=F) +
      facet_grid(~varID) +
      theme_bw(base_size = 16) +
      theme(axis.text = element_text(colour="black"),
            axis.text.x = element_text(colour="black", size=11.5),
            axis.title = element_text(colour="black"),
            # axis.title.y = element_blank(),
            axis.line = element_blank(),
            panel.border = element_rect(colour="black"),
            panel.grid.minor = element_blank(),
            panel.grid.major = element_line(colour="grey95", size=0.4),
            axis.ticks = element_line(colour="black"),
            plot.title = element_text(hjust=0.5)
      )

print(boxplot_ex)

Fig 2D - Summary heatmap of motifs

tmp <- data.table::dcast(Average_results[Average_results$Seq =="16bp",], Lib ~ Motif, value.var = "Avg", fun=mean)
rownames(tmp) <- tmp$Lib
tmp <- tmp[,-1]

names(tmp) <- c("TGA.TCA"="AP-1",
                "GATAAG"="GATA",
                CATCTG="twist",
                "TTCC.GGA"="STAT",
                "CCGGAA"="ETS",
                "GAGAGA"="Trl",
                "TCACGCGA"="SREBP",
                "TCATCA"="CREB",
                "AGGATAA"="ttk")[names(tmp)]

library(pheatmap)

paletteLength <- 200
myBreaks <- c(seq(min(tmp, na.rm=T), 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(max(tmp, na.rm=T)/paletteLength, max(tmp, na.rm=T), length.out=floor(paletteLength/2)))

col <- rev(colorRampPalette(
  c("#B2182B", "#D6604D", "#F4A582", "#FDDBC7",
    "#FFFFFF", "#FFFFFF",
    "#92C5DE", "#92C5DE", "#4393C3", "#4393C3"))(length(myBreaks)-1))


pheatmap(t(tmp)[,c(4,1,2,5,6,3,7)],
         cluster_cols = F,
         cluster_rows = F,
         angle_col=90,
         color=col,
         breaks = myBreaks,
         legend_breaks=seq(-10,10,1),
         border_color = "grey60",
         fontsize_row=10,
         fontsize_col=10,
         main=paste0("Scaled log2 avg activity of motifs"),
         width = 4, height = 4
)

Fig S6B - How many motifs explain the top variants at important positions

Variants_res_list <- list()
for(lib in gsub("_sequence_fw", "", grep("sequence_fw$", names(Count_table_final), value = T))[c(1,4:7)]){
  
  if(length(grep("ced6", lib))>0){
    mix <- "S21717_REEFmix2"
    wt <- "S2171_wt_+"
    var <- "S21717_REEFmix2_log2FoldChange"
  }else{
    mix <- "chr3L_3310914_3311162_REEFmix3"
    wt <- "chrX_9273894_9274142_+_dCP_-" # use reference enhancer (should have the sme activity as the wt) because I don't trust the wt levels
    var <- "chr3L_3310914_3311162_REEFmix3_log2FoldChange"
  }
  
  tmp_main <- Count_table_final[Count_table_final$library %in% c(lib, wt) | Count_table_final$Variant %in% wt,
                                c(1:5,grep(lib, names(Count_table_final)), grep(mix, names(Count_table_final)))]
  tmp_main$log2FoldChange <- tmp_main[,var]
  tmp_main <- tmp_main[complete.cases(tmp_main$log2FoldChange),]
  
  # do the variants with high activity match to different motifs?
  top_sequences <- tmp_main[tmp_main$log2FoldChange > tmp_main$log2FoldChange[tmp_main$library==wt | tmp_main$Variant %in% wt],]
  rownames(top_sequences) <- top_sequences$Variant
  
  ### All enriched motifs (FDR<0.05)
  PWM_candidates <- readRDS("data/PWM_candidates.rds") # from 20201124_REEF_S2171_and_chr3L_3310914_3311162_mixs2_3_reanalysed/Prepare_strings_from_motifs.R
  PWM_candidates <- PWM_candidates[!PWM_candidates$X..motif_collection_name %in% "modisco",]
  PWM_candidates <- PWM_candidates[PWM_candidates$FDR_motif_enr<0.05 & PWM_candidates$log2_motif_enr>0,]
  
  require(motifmatchr)
  require(TFBSTools)
  load("data/TF_clusters_PWMs.RData") # load motifs
  
  # count motifs in all enhancers
  opt=list()
  opt$pvalue_cutoff <- c(1e-4)
  opt$genome="BSgenome.Dmelanogaster.UCSC.dm3"
  
  # motif counts
  # for motif enrichment is is fine to do not reduce, because it will just be present vs non-present
  motif_counts_all <- list()
  for(p in opt$pvalue_cutoff){
    # motif counts
    motif_ix_scores <- matchMotifs(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% PWM_candidates$motif_name],
                                   as.character(top_sequences[[paste0(lib, "_sequence_fw_extended4bp")]]),
                                   genome = opt$genome, p.cutoff = p, bg="genome", out = "scores")
    scores <- as.data.frame(as.matrix(motifCounts(motif_ix_scores)))
    names(scores) <- name(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% PWM_candidates$motif_name])
    motif_counts <- cbind(top_sequences, scores)
    
    motif_counts_all[[paste0("pvalue_", p)]] <- motif_counts
    print(p)
  }
  
  # sequences explained by main motifs, others, multiple, or none
  TF_motifs <- list(GATA=data.frame(Motif="GATA", ID="flyfactorsurvey__srp_SANGER_5_FBgn0003507"),
                    GATA=data.frame(Motif="GATA", ID="flyfactorsurvey__GATAd_SANGER_5_FBgn0032223"),
                    GATA=data.frame(Motif="GATA", ID=as.character(PWM_candidates$motif_name[grep("srp|gatad", PWM_candidates$motif_description2, ignore.case = T)])),
                    AP1=data.frame(Motif="AP-1", ID="flyfactorsurvey__kay_Jra_SANGER_5_FBgn0001291"),
                    AP1=data.frame(Motif="AP-1", ID=as.character(PWM_candidates$motif_name[grep("jra|kay|fos|jun", PWM_candidates$motif_description2, ignore.case = T)])),
                    twist=data.frame(Motif="twist", ID="flyfactorsurvey__twi_da_SANGER_5_FBgn0000413"),
                    twist=data.frame(Motif="twist", ID=as.character(PWM_candidates$motif_name[grep("twi", PWM_candidates$motif_description2, ignore.case = T)])),
                    Trl=data.frame(Motif="Trl", ID="flyfactorsurvey__Trl_FlyReg_FBgn0013263"),
                    Trl=data.frame(Motif="Trl", ID=as.character(PWM_candidates$motif_name[grep("trl|gaga", PWM_candidates$motif_description2, ignore.case = T)])),
                    ETS=data.frame(Motif="ETS", ID="flyfactorsurvey__Ets97D_SANGER_10_FBgn0004510"),
                    ETS=data.frame(Motif="ETS", ID=as.character(PWM_candidates$motif_name[grep("ETS", PWM_candidates$motif_description2, ignore.case = T)])),
                    SREBP=data.frame(Motif="SREBP", ID="flyfactorsurvey__HLH106_SANGER_10_FBgn0015234"),
                    SREBP=data.frame(Motif="SREBP", ID=as.character(PWM_candidates$motif_name[grep("SREBP|HLH106", PWM_candidates$motif_description2, ignore.case = T)])),
                    STAT=data.frame(Motif="STAT", ID="bergman__Stat92E"),
                    STAT=data.frame(Motif="STAT", ID=as.character(PWM_candidates$motif_name[grep("stat", PWM_candidates$motif_description2, ignore.case = T)])),
                    CREB=data.frame(Motif="CREB/ATF", ID="cisbp__M0295"),
                    CREB=data.frame(Motif="CREB/ATF", ID=as.character(PWM_candidates$motif_name[grep("creb", PWM_candidates$motif_description2, ignore.case = T)])
                    )
  )
  TF_motifs <- do.call(rbind, TF_motifs)
  TF_motifs$Motif <- factor(TF_motifs$Motif)
  
  Sequences_motifs_16bp <- lapply(motif_counts_all, function(p){
    
    candidates_variant_enrichment <- data.frame(sapply(levels(TF_motifs$Motif), function(m){
      return(rowSums(p[,as.character(TF_motifs$ID[TF_motifs$Motif==m])])>0)
    }))
    candidates_variant_enrichment[,"Other motifs"] <- rowSums(p[,19:ncol(p)])>0
    colSums(candidates_variant_enrichment)
    
    return(candidates_variant_enrichment)
  })
  colSums(Sequences_motifs_16bp[[1]])
  table(rowSums(Sequences_motifs_16bp[[1]][,1:8]))
  
  # save data
  Variants_res_list[[lib]] <- Sequences_motifs_16bp[[1]] 
  
  # which variants
  all_var_list <- lapply(Sequences_motifs_16bp, function(i){
    i[,"Group"] <- apply(i, 1, function(x){
      if(sum(x[1:8]==TRUE)>1){"Multiple"
      }else if(sum(x[1:8]==TRUE)==1){paste0(colnames(i)[1:8][x[1:8]==TRUE], collapse = "_")
      }else if(sum(x[1:8]==TRUE)==0 & x["Other motifs"]==TRUE){paste0(colnames(i)[x==TRUE], collapse = "_")
      }else{"No motifs"}
    })
    all_var <- table(i[,"Group"])
    names(all_var)[names(all_var)=="AP.1"] <- "AP-1"
    names(all_var)[names(all_var)=="CREB.ATF"] <- "CREB/ATF"
    df <- data.frame(all_var)
    df$Var1 <- factor(df$Var1, levels=rev(c("No motifs", "Other motifs", "Multiple", rev(sort(levels(TF_motifs$Motif))))))
    df
  })
  
  gg_bar2 <- ggplot() +
    geom_bar(data=all_var_list[[1]], aes("1e-04", Freq, fill=Var1), position="stack", stat="identity", width=0.8) +
    xlab("PWM cutoff") +
    scale_y_continuous("# of variants", expand = c(0,0)) +
    ggtitle(gsub("ced6", "ced-6", gsub("_", " ", lib))) +
    scale_fill_manual("Motifs", values=c(my_motif_col, "No motifs"="grey90", "Other motifs"="#b3cccc", Multiple="#FF9999"),
                      limits = levels(all_var_list[[1]]$Var1)) +
    theme(plot.title = element_text(hjust=0.5, colour=my_col[lib]))
  
  print(gg_bar2)
  
  print(lib)
  
}
## [1] 1e-04

## [1] "ced6_pos110"
## [1] 1e-04

## [1] "ced6_pos241"
## [1] 1e-04

## [1] "ZnT63C_pos210"
## [1] 1e-04

## [1] "ZnT63C_pos142"
## [1] 1e-04

## [1] "ZnT63C_pos180"
saveRDS(Variants_res_list, "Variants_motif_matching_list.rds")

Fig S6A - edit distance of 8bp sequences in function of fold-change to WT activity

wt_motif_seq <- c("ced6_pos110"="AAGATAAT",
                  "ced6_pos241"="CTTATCGC",
                  "ZnT63C_pos142"="ATGACTCA",
                  "ZnT63C_pos180"="GCAGATGT",
                  "ZnT63C_pos210"="CTTATCTT")

require(stringdist)
for(lib in names(wt_motif_seq)){
  
  if(length(grep("ced6", lib))>0){
    mix <- "S21717_REEFmix2"
    wt <- "S2171_wt_+"
    var <- "S21717_REEFmix2_log2FoldChange"
  }else{
    mix <- "chr3L_3310914_3311162_REEFmix3"
    wt <- "chrX_9273894_9274142_+_dCP_-" # use reference enhancer (should have the sme activity as the wt) because I don't trust the wt levels
    var <- "chr3L_3310914_3311162_REEFmix3_log2FoldChange"
  }
  
  tmp_main <- Count_table_final[Count_table_final$library %in% c(lib, wt) | Count_table_final$Variant %in% wt,
                                c(1:5,grep(lib, names(Count_table_final)), grep(mix, names(Count_table_final)))]
  tmp_main$log2FoldChange <- tmp_main[,var]
  tmp_main <- tmp_main[complete.cases(tmp_main$log2FoldChange),]
  
  # log2FC
  tmp_main$log2FC_wt <- tmp_main$log2FoldChange-tmp_main$log2FoldChange[tmp_main$library==wt | tmp_main$Variant %in% wt]
  
  # edit distance - minimum of fw and rv distance
  tmp_main$variant_seq <- tmp_main[,paste0(lib, "_sequence_fw")]
  wt_str <- wt_motif_seq[lib]
  wt_rev <- as.character(reverseComplement(DNAString(wt_str)))
  tmp_main$Edit_dist_fw <- stringdist(wt_str, tmp_main$variant_seq, method="hamming")
  tmp_main$Edit_dist_rv <- stringdist(wt_rev, tmp_main$variant_seq, method="hamming")
  tmp_main$Edit_dist_min <- factor(apply(tmp_main[,c("Edit_dist_fw", "Edit_dist_rv")], 1, min))
  tmp_main <- tmp_main[!(tmp_main$library==wt | tmp_main$Variant %in% wt | tmp_main$variant_seq %in% c(wt_str, wt_rev)),]
  
  boxplot_edit <- ggplot(tmp_main, aes(Edit_dist_min,log2FC_wt, fill=Edit_dist_min)) +
    geom_boxplot(outlier.size = 0.8) +
    scale_y_continuous("log2FC enhancer activity to wildtype", breaks = seq(-10,8,2), limits = c(-9.43,2.51)) +
    scale_x_discrete("Edit distance to wildtype [hamming]") +
    scale_fill_grey(start = 0.4) +
    guides(fill=F) +
    geom_hline(yintercept = 0, linetype="dashed", col="grey60") +
    ggtitle(gsub("ced6", "ced-6", gsub("_", " ", lib))) +
    theme_bw(base_size = 14) +
    theme(axis.text = element_text(colour="black"),
          axis.title = element_text(colour="black"),
          axis.line = element_blank(),
          panel.border = element_rect(colour="black"),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_line(colour="grey95", size=0.4),
          axis.ticks = element_line(colour="black"),
          plot.title = element_text(hjust=0.5, colour=my_col[lib])
    )
  
  print(boxplot_edit)
  print(lib)
}

## [1] "ced6_pos110"

## [1] "ced6_pos241"

## [1] "ZnT63C_pos142"

## [1] "ZnT63C_pos180"

## [1] "ZnT63C_pos210"

Fig S10B - activity of variants that create repressor motifs

motif_interest_list <- c(ttk="AGGATAA",
                         ZEB1="CAGGTG",
                         lola="GGAGTT",
                         GGTAAG="GGTAAG"
)

for(lib in gsub("_sequence_fw", "", grep("sequence_fw$", names(Count_table_final), value = T))){
  
  if(length(grep("ced6", lib))>0){
    lib2 <- lib
    mix <- "S21717_REEFmix2"
    wt <- "S2171_wt_+"
    var <- "S21717_REEFmix2_log2FoldChange"
  }else{
    lib2 <- lib
    mix <- "chr3L_3310914_3311162_REEFmix3"
    wt <- "chrX_9273894_9274142_+_dCP_-" # use reference enhancer (should have the sme activity as the wt) because I don't trust the wt levels
    var <- "chr3L_3310914_3311162_REEFmix3_log2FoldChange"
  }
  
  tmp_main <- Count_table_final[Count_table_final$library %in% c(lib2, wt) | Count_table_final$Variant %in% wt,
                                c(1:5,grep(lib, names(Count_table_final)), grep(mix, names(Count_table_final)))]
  tmp_main$log2FoldChange <- tmp_main[,var]
  tmp_main <- tmp_main[complete.cases(tmp_main$log2FoldChange),]
  
  tmp <- tmp_main[order(tmp_main$log2FoldChange, decreasing = T),]
  tmp <- data.frame(x=1:nrow(tmp)/1000, y=2**tmp$log2FoldChange, y_log=tmp$log2FoldChange,
                    variant=as.character(tmp$Variant),
                    variant_seq=sapply(strsplit(as.character(tmp$Variant),"_"), `[`, length(strsplit(as.character(tmp$Variant)[1],"_")[[1]])-1),
                    stringsAsFactors = F)
  
  boxplot <- ggplot(tmp, aes("1",y_log)) +
    geom_violin(alpha=0.5, position = position_dodge(width = 0.9), size=0.7, color="black", fill="grey60") +
    geom_boxplot(outlier.size = -1, color="black", position = position_dodge(width = 0.9), width=0.2, size=0.8) +
    scale_y_continuous("Enhancer activity (log2)", breaks = seq(-2,10,1), labels = c("1/4", "1/2", 2**seq(0,10,1)), limits = c(min(tmp_main$log2FoldChange),max(tmp_main$log2FoldChange))) +
    scale_x_discrete("", labels=paste0("All variants\n(n=", nrow(tmp), ")")) +
    #geom_hline(yintercept = 0, linetype="dashed", col="grey60") + # the 0 is not relevant so don't print the line
    theme_bw(base_size = 14) +
    theme(axis.text = element_text(colour="black"),
          axis.title = element_text(colour="black"),
          axis.line = element_blank(),
          panel.border = element_rect(colour="black"),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_line(colour="grey95", size=0.4),
          axis.ticks = element_line(colour="black"),
          plot.title = element_text(hjust=0.5)
    )
  
  # select wt variants to plot
  tmp_sel <- tmp[tmp$variant %in% c(wt,
                                    tmp$variant[1],
                                    tmp$variant[grep("GATAA", tmp$variant_seq)][1],
                                    tmp$variant[grep("TTATC", tmp$variant_seq)][1]),]
  tmp_sel$variant_seq[grep(wt, tmp_sel$variant)] <- "wt (CTTATCGC)"
  
  boxplot3 <- boxplot +
    geom_point(data=tmp_sel[tmp_sel$variant %in% wt,], aes("1",y_log), color=c(my_motif_col["wt"]), size=2) +
    annotate(geom="text", x="1", y=tmp_sel$y_log[tmp_sel$variant %in% wt],
             label="wt",
             color=my_motif_col["wt"],
             hjust=-0.35, vjust=0.5, size = 4)
  
  ########
  
  # make table with enrichments per motif
  motif_variant_enrichment_16bp <- lapply(motif_interest_list, function(m){
    fw=tmp_main$log2FoldChange[grep(m, tmp_main[,paste0(lib, "_sequence_fw_extended4bp")])]
    if(!DNAString(m) == reverseComplement(DNAString(m))){rv=tmp_main$log2FoldChange[grep(m, tmp_main[,paste0(lib, "_sequence_rv_extended4bp")])]}else{rv <- c()}
    
    data.frame(Motif=m,
               Strand=c(rep("fw",length(fw)), rep("rv",length(rv))),
               Enrichment_log2=c(fw, rv))
  })
  
  
  # plot for both tables
  seq="16bp"
  
  # join all
  motif_variant_enrichment <- do.call(rbind, motif_variant_enrichment_16bp)
  
  # set factor levels
  motif_levels <- motif_variant_enrichment %>%
    group_by(Motif) %>%
    summarise(median=median(Enrichment_log2)) %>%
    arrange(desc(median)) %>%
    select(Motif)
  
  motif_variant_enrichment$Motif <- factor(motif_variant_enrichment$Motif, levels =motif_interest_list)
  
  # x labels counts per boxplot
  t <- data.frame(table(motif_variant_enrichment$Motif))
  xlabels <- sapply(as.character(levels(motif_variant_enrichment$Motif)), function(x){
    TF=names(motif_interest_list)[grep(x, motif_interest_list)]
    paste0(TF,"\n(",
           x, ",", t$Freq[t$Var1%in%x], ")")
  })
  
  boxplot_known_motifs <- ggplot(motif_variant_enrichment, aes(Motif, Enrichment_log2)) +
    geom_boxplot(outlier.size = 0.6, alpha=0.9, fill="dodgerblue") +
    #geom_hline(yintercept = median(metadata_final$log2_ratio_UMI_corrected), linetype="dashed", col="grey40") +
    geom_hline(yintercept = tmp_main$log2FoldChange[tmp_main$Variant %in% paste0(wt, "_+")], linetype="dashed", col="orangered") +
    geom_hline(yintercept = median(tmp_main$log2FoldChange), linetype="dashed", col="grey60") +
    scale_fill_manual(values=my_motif_col) +
    scale_alpha_manual(values=c(0.3,0.8)) +
    scale_y_continuous("Enhancer activity (log2)", breaks = seq(-2,10,1), labels = c("1/4", "1/2", 2**seq(0,10,1)), limits = c(min(tmp_main$log2FoldChange),max(tmp_main$log2FoldChange))) +
    scale_x_discrete("", labels=xlabels) +
    guides(fill=F) +
    ggtitle(gsub("chr3L 3310914 3311162", "ZnT63C", gsub("S2171", "ced-6", gsub("_", " ", lib)))) +
    theme_bw(base_size = 14) +
    theme(axis.text = element_text(colour="black"),
          axis.text.x = element_text(colour="black", size=9),
          axis.title = element_text(colour="black"),
          axis.title.y = element_blank(),
          axis.line = element_blank(),
          panel.border = element_rect(colour="black"),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_line(colour="grey95", size=0.4),
          axis.ticks = element_line(colour="black"),
          plot.title = element_text(hjust=0.5)
    )
  
  print(boxplot3 + boxplot_known_motifs + plot_layout(widths = c(1,4.2)))
  
}

Heatmap clustered activity of all variants per screen

Fig 2C - Heatmap with all variants

Didn’t include plots because they are too heavy

Merged_enrichments <- read.delim("Enrichments_and_sequence_information_all_REEF_libraries.txt")

### z-score normalise to just compare differences within libraries (against mean)
df <- apply(Merged_enrichments[,grep("_act", names(Merged_enrichments), value = T)], 2, scale)
rownames(df) <- Merged_enrichments$variant_sequence_fw

### remove variants not measured in all screens
df <- na.omit(df)

## active variants
df2 <- df[rowSums(df>1)>0,]
s=1
extr <- ceiling(max(abs(c(min(df2, na.rm=T), max(df2, na.rm=T)))))
myBreaks = seq(-extr, extr, by = s)

col <- c(colorRampPalette(
  c("#2166AC", "#4393C3", "#92C5DE",
    "#D1E5F0"))((extr*(1/s))-1),
  "white", "white",
  colorRampPalette(
    c("#FDDBC7", "#F4A582",
      "#D6604D", "#B2182B"))((extr*(1/s))-1))

pheatmap(df2,
         clustering_distance_cols = "correlation",
         clustering_distance_rows = "correlation",
         angle_col=45,
         color=col,
         breaks = myBreaks,
         legend_breaks=seq(-10,10,2),
         border_color = NA,
         show_rownames = F,
         labels_col = gsub("_", " ", gsub("ced6", "ced-6", gsub("_act", "", colnames(df2)))),
         fontsize_col=11,
         main=paste0("Scaled log2 activity of ", nrow(df2), " active variants (min 1)"),
         width = 5.5, height = 8
         )

Look at top sequences and make logo to check how diverse they are

Fig S5 - logos and information content of top sequences

Merged_enrichments <- read.delim("Enrichments_and_sequence_information_all_REEF_libraries.txt")

library(ggseqlogo)
plot.logo <- function(x, method='bits'){
      ggseqlogo(x, method=method, seq_type='dna', ncol=1) +
        theme(panel.background = element_rect(fill="white",colour="white"),
            panel.grid = element_blank(), axis.line=element_blank(),
            axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank(),
            axis.title = element_blank(),
            legend.key=element_blank(),
            legend.text = element_text(size=11, colour="black"),
            legend.title = element_text(size=12, colour="black"),
            plot.title = element_text(size=14, hjust = 0.5, colour="black"), plot.subtitle = element_text(size=14, hjust = 0.5),
            legend.position = "right")
}

variants <- gsub("_act", "", grep("_act", names(Merged_enrichments), value = T))[c(1:4,6,7,5)]
names(variants) <- variants

logo_plots_all_variants <- list()
for(var in variants){
  df <- Merged_enrichments[,grep(var, names(Merged_enrichments))]
  df <- df[complete.cases(df[,paste0(var, "_act")]),]
  
  logo_plots_all <- lapply(c(IC='bits', Prob='p'), function(m){
    logo_plots <- lapply(c(Random="Random", Experiment="Experiment"), function(c){
      if(c=="Random") tmp <- df[sample(1:nrow(df)),]
      if(c=="Experiment") tmp <- df[order(df[,paste0(var, "_act")], decreasing = T),]
      
      seq_counts <- c(1,2,5,10,50,100,1000, nrow(tmp))
      names(seq_counts) <- c(seq_counts[1:7], "last")
      
      lapply(seq_counts, function(i){
        tmp2 <- tmp[1:i,]
        
        # get nucleotide at each position
        tmp2 <- do.call(rbind, lapply(1:16, function(x) data.frame(Pos=x,
                                                                   Base=substr(tmp2[,paste0(var, "_fw_extended4bp")], x, x))))
        
        prop_matrix <- apply(table(tmp2$Base, tmp2$Pos), 2, function(x) x/sum(x))
        
        plot.logo(prop_matrix, method=m)
        
      })
    })
    return(logo_plots)
  })
  
  logo_plots_all_variants[[var]] <- logo_plots_all
  
  # print(plot_grid(plot_grid(plotlist = logo_plots_all$Prob$Experiment, ncol=1),
  #                 plot_grid(plotlist = logo_plots_all$Prob$Random, ncol=1)))
  
}


### plot all together
plot_grid(plotlist = lapply(logo_plots_all_variants, function(x) plot_grid(plotlist = x$Prob$Experiment, ncol=1)), nrow = 1)

### statistics with information content
# The information content (IC) of a PWM says how different a given PWM is from a uniform distribution.
tIC <- function(C) log2(length(C))
PPM <- function(C) C / sum(C)
U <- function(C) -sum(PPM(C) * log2(PPM(C)))
fIC <- function(C) tIC(C) - U(C)
IC <- function(C){
  # add pseudocounts
  C[C==0] <- 0.001
  PPM(C) * fIC(C)
}

set.seed(1234)
statistics_logo_plots_variants <- lapply(variants, function(var){
  df <- Merged_enrichments[,c(1,grep(var, names(Merged_enrichments)))]
  df <- df[complete.cases(df[,paste0(var, "_act")]),]
  statistics_logo_plots <- lapply(c(Random="Random", Experiment="Experiment"), function(c){
    if(c=="Random") tmp <- df[sample(1:nrow(df)),]
    if(c=="Experiment") tmp <- df[order(df[,paste0(var, "_act")], decreasing = T),]
    
    v_all <- sapply(c(1:1000, 10000, 50000, nrow(tmp)), function(i){
      tmp2 <- tmp[1:i,]
      
      # get nucleotide at each position
      tmp2 <- do.call(rbind, lapply(1:8, function(x) data.frame(Pos=x,
                                                                Base=substr(tmp2$variant_sequence_fw, x, x))))
      tmp2$Base <- factor(tmp2$Base, levels = c("A", "C", "G", "T"))
      
      df <- apply(table(tmp2$Base, tmp2$Pos), 2, function(x) x)
      
      v <- sum(colSums(sapply(1:ncol(df), function(x){
        IC(df[,x])
      })))
      
      return(v)
    })
    return(v_all)
  })
  n=300
  return(data.frame(Var=var,
                    Group=rep(c("Random", "Experiment"), each=n),
                    Top_sequences=rep(1:n, 2),
                    IC=c(statistics_logo_plots$Random[1:n], statistics_logo_plots$Experiment[1:n])))
})

plot_df <- do.call(rbind, statistics_logo_plots_variants)

IC_plot <- ggplot(plot_df, aes(Top_sequences, IC, group=paste0(Var, "_", Group), colour=Var, alpha=Group, shape=Group)) +
  geom_point() +
  geom_line() +
  scale_y_continuous("8-mer sum Information Content") +
  scale_x_continuous("Number of top sequences considered", breaks = seq(0,1000,50)) +
  scale_color_manual("Enh position", values=my_col[as.character(unique(plot_df$Var))], drop=T) +
  scale_alpha_manual(values = c(1,0.2)) +
  theme_bw(base_size = 11) +
    theme(plot.title = element_text(hjust = 0.5),
          axis.text = element_text(size = 13, colour="black"),
          axis.title = element_text(size = 15),
          legend.text = element_text(size = 14),
          legend.title = element_text(size = 15)
    )

print(IC_plot)

Test activity of all possible TF motifs using PWM motif matching

Fig S9A - heatmap with activity of all TF motifs

Merged_enrichments <- read.delim("Enrichments_and_sequence_information_all_REEF_libraries.txt")

### I could maybe z-score normalise to just compare differences within libraries (against mean)
df <- Merged_enrichments[,grep("_act|_fw_extended4bp", names(Merged_enrichments), value = T)]
rownames(df) <- Merged_enrichments$variant_sequence_fw
df[,grep("_act", names(df), value = T)] <- apply(df[,grep("_act", names(df), value = T)], 2, scale)

### Motifs of interest
PWM_candidates <- readRDS("data/PWM_candidates.rds")
PWM_candidates <- PWM_candidates[complete.cases(PWM_candidates$Motif_strings),]
PWM_candidates <- PWM_candidates[!PWM_candidates$X..motif_collection_name %in% "modisco",]

motif_interest_list <- PWM_candidates$Motif_strings
names(motif_interest_list) <- PWM_candidates$motif_name

### summary (mean z-score) per motif
Motif_average <- t(sapply(motif_interest_list, function(m){
  sapply(gsub("_act", "", grep("_act", names(df), value = T)), function(c){
    mean(df[unique(grep(paste0(m, "|", as.character(reverseComplement(DNAString(m)))),df[,paste0(c, "_fw_extended4bp")])),paste0(c, "_act")], na.rm=T) # added unique to don't count twice
  })
}))
Motif_average[is.na(Motif_average) | Motif_average=="NaN"] <- NA

Motif_average <- merge(as.data.frame(Motif_average), PWM_candidates[,c(1,5:9,2,14,16:20)], by.x=0, by.y=1)
names(Motif_average)[1] <- "motif_name"
write.table(Motif_average, "Candidate_TF_activators_screens_16bp_variants_average_results.txt", sep="\t", quote=F)
require(pheatmap)

plot_df <- read.delim("Candidate_TF_activators_screens_16bp_variants_average_results.txt")

# summarise per cluster (ordered by maximum activity in any screen)
plot_df$Max <- apply(plot_df[,2:8], 1, max, na.rm=T)
plot_df <- plot_df[order(plot_df$Max, decreasing = T),]
# add name of dmel factor per cluster
cluster_dmel <- sapply(unique(plot_df$Motif_cluster), function(c){
  out <- unique(plot_df$Dmel[plot_df$Motif_cluster == c & complete.cases(plot_df$Dmel) & complete.cases(plot_df$S2_exp) & plot_df$S2_exp>60])[1] # first only expressed TFs
  if(length(out)==0) out <- unique(plot_df$Dmel[plot_df$Motif_cluster == c & complete.cases(plot_df$Dmel) & complete.cases(plot_df$S2_exp) & plot_df$S2_exp>10])[1] # first only expressed TFs
  if(length(out)==0) out <- unique(plot_df$Dmel[plot_df$Motif_cluster == c & complete.cases(plot_df$Dmel) & complete.cases(plot_df$S2_exp) & plot_df$S2_exp>1])[1] # first only expressed TFs
  if(length(out)==0) out <- unique(plot_df$Dmel[plot_df$Motif_cluster == c & complete.cases(plot_df$Dmel)])[1] # because it is ordered
  if(length(out)==0){return(NA)}else{return(out)}
})
names(cluster_dmel) <- unique(plot_df$Motif_cluster)
plot_df$Motif_cluster_Dmel <- cluster_dmel[match(plot_df$Motif_cluster, names(cluster_dmel))]
plot_df$Motif_cluster_Dmel[!complete.cases(plot_df$Motif_cluster_Dmel)] <- plot_df$motif_description2[!complete.cases(plot_df$Motif_cluster_Dmel)]

# remove duplicated clusters
plot_df <- plot_df[!duplicated(plot_df$Motif_cluster),]

# remove duplicated strings
plot_df <- plot_df[!duplicated(plot_df$Motif_strings),]

# correct some names
plot_df$Motif_cluster_Dmel[grep("HLH106", plot_df$Motif_cluster_Dmel)] <- "SREBP"

# plot TFs with at least 1 z-score in one screen
plot_df <- plot_df[rowSums(plot_df[,2:8]>1, na.rm=T)>0,]
rownames(plot_df) <- paste0(plot_df$Motif_cluster_name, " (", plot_df$Motif_cluster_Dmel, "/", plot_df$Motif_strings, ")")

annotation_row <- data.frame(row.names = rownames(plot_df),
                             Motif.enrch=as.character(plot_df$log2_motif_enr > 0 & plot_df$FDR_motif_enr < 0.05))

require(RColorBrewer)
annotation_colours <- list(Motif.enrch=c("FALSE"="white", "TRUE"="#336600"))

### do clustering without NAs and using all motifs - Pearson
Motif_average <- read.delim("Candidate_TF_activators_screens_16bp_variants_average_results.txt")
Motif_average_no_na <- Motif_average[,2:8]
Motif_average_no_na[is.na(Motif_average_no_na) | Motif_average_no_na=="NaN"] <- 0
cols.cor <- cor(Motif_average_no_na, use = "pairwise.complete.obs", method = "pearson")

Motif_average_no_na <- plot_df[,2:8]
Motif_average_no_na[is.na(Motif_average_no_na) | Motif_average_no_na=="NaN"] <- 0
row_dist <- dist(Motif_average_no_na)

s=0.5
extr <- ceiling(max(abs(c(min(plot_df[,2:8], na.rm=T), max(plot_df[,2:8], na.rm=T)))))
myBreaks = seq(-extr, extr, by = s)

col <- c(colorRampPalette(
  c("#2166AC", "#4393C3", "#92C5DE",
    "#D1E5F0"))((extr*(1/s))-1),
  "white", "white",
  colorRampPalette(
    c("#FDDBC7", "#F4A582",
      "#D6604D", "#B2182B"))((extr*(1/s))-1))

# Pairwise correlation between samples (columns)
cols.cor <- cor(plot_df[,2:8], use = "pairwise.complete.obs", method = "pearson")
# Pairwise correlation between rows (genes)
rows.cor <- cor(t(plot_df[,2:8]), use = "pairwise.complete.obs", method = "pearson")

h2 <- pheatmap(plot_df[,2:8],
         clustering_distance_cols = as.dist(1 - cols.cor),
         clustering_distance_rows = as.dist(1 - rows.cor),
         angle_col=45,
         color=col,
         breaks = myBreaks,
         legend_breaks=seq(-8,8,2),
         border_color = NA,
         show_rownames = T,
         labels_col = gsub("_", " ", gsub("ced6", "ced-6", gsub("_act", "", colnames(plot_df[,2:8])))),
         annotation_row = annotation_row,
         annotation_legend = F,
         annotation_colors = annotation_colours,
         fontsize_row=1.5,
         fontsize_col=9,
         fontsize = 8,
         main=paste0("Scaled log2 activity of motif variants"),
         width = 8, height = 8
)

Fig S27 - DeepSTARR predictions

fa1 <- read.fasta("data/S2_171_REEF_STARRseq_LibFR002_LibFR003_LibFR004_LibFR005_Twist6k_wt.fa", as.string = T, forceDNAtolower = F)
fa1_rv <- reverseComplement(DNAStringSet(unlist(fa1)))
names(fa1) <- paste0(names(fa1), "_+")
names(fa1_rv) <- paste0(names(fa1_rv), "_-")

fa2 <- read.fasta("data/REEF_STARRseq_chr3L_3310914_3311162_mix_S2171_twist_Twist6k_wt.fa", as.string = T, forceDNAtolower = F)
fa2_rv <- reverseComplement(DNAStringSet(unlist(fa2)))
names(fa2) <- paste0(names(fa2), "_+")
names(fa2_rv) <- paste0(names(fa2_rv), "_-")

fa_all <- c(DNAStringSet(unlist(fa1)),
            fa1_rv,
            DNAStringSet(unlist(fa2)),
            fa2_rv)

# AP-1 library has 250bp sequences - I removed the last nucleotide
write.fasta(as.list(substr(as.character(fa_all),1,249)), names(fa_all), "All_enhancer_variants.fa")

fasta=All_enhancer_variants.fa
pred_script=./Predict_dev_and_hk_enh_activity_CNN_model_from_fasta.py

$pred_script -s $fasta -m DeepSTARR

# output All_enhancer_variants.fa_predictions_DeepSTARR.txt
metadata_final <- read.delim("Table_enhancer_variant_counts_and_enrichment.txt", stringsAsFactors = F)

predictions <- read.delim(paste0("All_enhancer_variants.fa_predictions_DeepSTARR.txt"),
                          stringsAsFactors = F)
metadata_final$Predictions_dev <- predictions$Predictions_dev[match(metadata_final$Variant, predictions$location)]
table(complete.cases(metadata_final$Predictions_dev))
## 
##   TRUE 
## 470744
metadata_final <- metadata_final[metadata_final$library %in% c("ced6_pos110", "ced6_pos182", "ced6_pos230", "ced6_pos241", "ZnT63C_pos210", "ZnT63C_pos180", "ZnT63C_pos142"),]
metadata_final$library <- factor(metadata_final$library, levels = c("ced6_pos110", "ced6_pos182", "ced6_pos230", "ced6_pos241", "ZnT63C_pos210", "ZnT63C_pos180", "ZnT63C_pos142"))


### scatter plot per library
plot_list <- lapply(as.character(levels(metadata_final$library)), function(l){
  if(length(grep("ced6", l))>0){
    tmp <- data.frame(obs=metadata_final$S21717_REEFmix2_log2FoldChange[metadata_final$library %in% l],
                      pred=metadata_final$Predictions_dev[metadata_final$library %in% l])
  }
  if(length(grep("ZnT63C", l))>0){
    tmp <- data.frame(obs=metadata_final$chr3L_3310914_3311162_REEFmix3_log2FoldChange[metadata_final$library %in% l],
                      pred=metadata_final$Predictions_dev[metadata_final$library %in% l])
  }
  
  # PCC - not using zeros
  pc <- cor.test(tmp$obs,
                 tmp$pred,
                 method = "pearson")
  
  sc_col=c("#fdd49e","#d7301f")
  
  # plot
  scater <- ggplot(tmp, aes(obs, pred)) +
    geom_pointdensity(size=0.5, alpha=0.7) +
    scale_color_gradient(high = "grey10", low = my_col[l]) +
    scale_x_continuous("STARR-seq [log2 enh activity]") +
    scale_y_continuous("DeepSTARR [log2 enh activity]") +
    # geom_abline(linetype="dashed", col="grey50") +
    guides(color="none") +
    theme_bw(base_size = 15) +
    theme(panel.grid = element_blank(),
          axis.text = element_text(colour="black"),
          plot.title = element_text(hjust=0.5)) +
    ggtitle(gsub("_", " ", gsub("_S2171", "_ced-6", gsub("_chr3L_3310914_3311162", "_ZnT63C", l)))) +
    annotate("text",  x=min(tmp$obs, na.rm=T), y = max(tmp$pred, na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
  
  return(scater)
    
})

print(plot_grid(plotlist = plot_list, nrow = 2))