Setting and functions

library(Gviz)
library(GenomicRanges)
library(GenomicFeatures)
library(rtracklayer)
library(dplyr)
library(data.table)
source("~/projects/funcFinemapping/code/run_susie.R")
annoDIR <- "~/cluster/data/features/raw/"
build_grtrack <- function(curr.locus.gr, 
                          genome="hg19",
                          start.pos = start,
                          end.pos = end
                          ){
    ## match gene ids in TxDb with gene symbols if available
    ## Otherwise, return transcript symbol
    ga.track <- GenomeAxisTrack()
    if(genome == "hg19"){
        #txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene
        txdb <- loadDb("~/resources/gencode/gencode_v43lift37.txdb")

    }
    if(genome == "hg38"){
        txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene
    }
    # gene track
    grtrack <- GeneRegionTrack(range = txdb, 
                               genome = genome,
                               chromosome = as.character(seqnames(curr.locus.gr)),
                               name = "Genes",
                               collapse = TRUE
                 )
    # subsetting the grtrack to keep all transcripts regardless of whether they have gene annotations
    grtrack <- subset(grtrack,
                      from = start.pos,
                      to = end.pos)
    # match gene IDs with gene symbols
    symbolList <- mapIds(org.Hs.eg.db::org.Hs.eg.db, 
                              keys=sub("\\.\\d+$", "", 
                                       sub("\\_\\d+$", "", 
                                           ranges(grtrack)$gene)), 
                              keytype="ENSEMBL", column="SYMBOL")
    
    # return transcript symbol if no gene symbol available
    symbol(grtrack) <- unlist(lapply(1:length(symbolList), function(i){
      if(is.na(symbolList[[i]])){
        return(symbol(grtrack)[i])
      }else{
        return(symbolList[[i]])
      }
    }))
    return(grtrack)
}



make_track_plots <- function(dsRNA.locus, 
                             extend.size=0,
                             reverse.strand = FALSE,
                             to.highligh = TRUE,
                             collapse.type = "meta"
                             ){
      
  # build GRanges for each track
  m6a_peaks<-GRanges(seqnames = unique(dsRNA.locus$chr),
                   IRanges(start = unique(dsRNA.locus$start),
                           end = unique(dsRNA.locus$end)),
                   strand = unique(dsRNA.locus$strand)
                )

  gene_query <- range(c(
                  GRanges(dsRNA.locus$DS_query_gene_gr),
                  GRanges(dsRNA.locus$DS_ref_gene_gr)
                  ), ignore.strand=TRUE)

  #print("Here is the targeted gene region:\n")
  #print(gene_query)
  
  dsRNA <- unique(GRanges(dsRNA.locus %>% 
                            select(c(DS_chr, DS_start, DS_end))
                          ))
  
  ## annotation tracks
  m6a_track <- AnnotationTrack(m6a_peaks, genome = "hg19", name = "m6A")
  dsRNA_track <- AnnotationTrack(dsRNA, genome = "hg19", name = "dsRNA", stacking = "dense")
  ed_track <- AnnotationTrack(
                        ed_gr[seqnames(ed_gr) == seqnames(m6a_track), ], 
                        genome = "hg19", 
                        name = "Geuvadis LCLs",
                        stacking = "dense"
                        )
  RADAR_track <- AnnotationTrack(
                        ed_gr[seqnames(ed_gr) == seqnames(m6a_track) &
                                ed_gr$RADAR == 1, ], 
                        genome = "hg19", 
                        name = "RADAR",
                        stacking = "dense"
                        )
  
  ed_atlas_subset<- ed_atlas_gr[seqnames(ed_atlas_gr) == as.character(seqnames(gene_query)) &
                                      start(ed_atlas_gr) >= start(gene_query) &
                                      start(ed_atlas_gr) < end(gene_query)]
  ED_atlas_track <- AnnotationTrack(
                        ed_atlas_subset,
                        genome = "hg19",
                        name = "ED Atlas",
                        stacking = "dense",
                        group = ed_atlas_subset$type
                        )
  feature(ED_atlas_track)<-ed_atlas_subset$type


  GTEx_Ed_track <- AnnotationTrack(
                        ed_gtex_gr[seqnames(ed_gtex_gr) == seqnames(m6a_track), ], 
                        genome = "hg19", 
                        name = "GTEx LCLs",
                        stacking = "dense"
                        )

  ## genomic coordinates
  gtrack <- GenomeAxisTrack()
  

  ## gene tracks
  grtrack<-build_grtrack(curr.locus.gr = gene_query,
                            start = start(gene_query),
                            end = end(gene_query),
                            genome = "hg19")
  

  # highlight a particular SNP
  trackList <-list(
                   gtrack,grtrack, 
                   m6a_track, 
                   dsRNA_track, 
                   ed_track,
                   RADAR_track,
                   ED_atlas_track,
                   GTEx_Ed_track
                   )
  if(to.highligh){
    ht1 <- HighlightTrack(trackList,
                          start = unique(dsRNA.locus$physical.pos), width = 1, 
                          chromosome = as.character(seqnames(gene_query)), 
                          col = 'pink')
    dist <- unique(dsRNA.locus$physical.pos) - end(gene_query)
    
    if(dist > extend.size){
       print(sprintf("Extend the window size to be longer than %d.", 
                     dist))
       }
  }else{
    ht1 <- trackList
  }
  
  plotTracks(ht1,
             transcriptAnnotation = 'symbol',
             from = start(gene_query) - extend.size,
             to = end(gene_query) + extend.size,
             collapseTranscripts = collapse.type,
             just.group = "right",
             groupAnnotation="type", ALU="darkgreen", REP="blue", NONREP="darkred",
             reverseStrand = reverse.strand
             )
}

Load data

RNA editing sites
Editing sites from Geuvadis LCLs:
Mai et al. integrated genome and transcriptome squencing data from Geuvadis LCLs of 447 individuals (derived from 1000 Genomes Project) to detect A-to-I editing sites. For each A/G SNP site, individuals with the homozygous genotype AA were considered to determine RNA editing. An A/G SNP site was further defined as a SNP editing site if it was found to be edited at a level >5% in at least two LCL samples from individuals with homozygous genotype AA. This leads to the detection of more than 80K edited sites.

RADAR database:
Ramaswami et al. built a rigorous database for RNA A-to-I editing sites detected in humans. This database not only includes extensive manually curated annotations for each editing site, but also tissue-specific editing levels for each editing site.

Editing sites from GTEx LCLs:
Li et al. detected editing sites using the GTEx gene expression data. The list of edited sites for LCLs were obtained from GTEx portal (GTEx Analysis V8 release).

REDIportal V2.0: Picardi et al. produced an RNA editing atlas in humans across 55 human body sites using GTEx RNAseq experiments as well as their own experiments.

## GTEx dataset
# ed_sites <- read.csv(paste0(annoDIR, "RNA_editing/Tan2017/GTEx_avg_RNA_editing_levels_by_tissue.csv"))
# 
# ed_sites[, c("chr", "pos")]<-t(sapply(ed_sites$sites, 
#                                       function(i){strsplit(i, "_")[[1]]}
# ))
## Mai2019 +400 LCLs
ed_sites <- readxl::read_excel(paste0(annoDIR, 
                                    "RNA_editing/Mai2019/RNA_editing_sites_LCLs.xlsx"),
                               skip = 2)

## Picardi2017: RNA editing atlas
ed_atlas <- fread("../data/TABLE1_hg19.txt")
## Warning in fread("../data/TABLE1_hg19.txt"): Detected 25 column names but the
## data has 22 columns. Filling rows automatically. Set fill=TRUE explicitly to
## avoid this warning.
ed_atlas_gr <- GRanges(seqnames = ed_atlas$Region,
                    IRanges(start = ed_atlas$Position,
                            end = ed_atlas$Position),
                    type = ed_atlas$type)

## Li2022 Edsites for GTEx 

ed_gtex <- read.table(paste0(annoDIR, 
                             "RNA_editing/Li2022/Cells-EBV-transformed-lymphocytes.edsite.txt.gz"),
                      header = T)


ed_gtex_gr_hg38 <- GRanges(seqnames = ed_gtex$chr,
                      IRanges(start = ed_gtex$variant_pos,
                              end = ed_gtex$variant_pos),
                      snpID = ed_gtex$rs_id_dbSNP151_GRCh38p7
                      )

## Li2022 dsRNAs:
ds_data <- readxl::read_excel("../data/Li2022_dsRNAs.xlsx", skip=10)
ds_rnas <- unlist(lapply(unique(ds_data$`cis-NATs`), function(i){strsplit(i, ",")[[1]]}))
m6a_twas <- read.csv("../data/m6A_TWAS_results.csv")   
m6a_immuno_dsRNA <- ds_data[ds_data$`cis-NATs` %in% intersect(m6a_twas$GENE, ds_rnas) &
                              ds_data$tissue == "Cells-EBV-transformed-lymphocytes", ] 
snp_info <- data.frame(t(
  sapply(
    m6a_immuno_dsRNA$ref_SNP, function(i){
      items <- strsplit(i, "_")[[1]]
      return(c(paste0("chr", items[1]), items[2]))
      }
    )
  ), row.names = NULL)

colnames(snp_info) <- c("chr", "pos")
m6a_immuno_dsRNA <- data.frame(snp_info, m6a_immuno_dsRNA)
targets_gr_hg38 <- GRanges(seqnames = m6a_immuno_dsRNA$chr,
                      IRanges(start = m6a_immuno_dsRNA$pos),
                      cisNATs = m6a_immuno_dsRNA$cis.NATs,
                      traits = m6a_immuno_dsRNA$GWAS_trait,
                      pp4 = m6a_immuno_dsRNA$clpp_h4,
                      )
## liftover to hg19
chain.file.path <- "~/resources/genomes/chain_files/hg38ToHg19.over.chain"
c<-import.chain(chain.file.path) 

targets_gr <- unlist(liftOver(targets_gr_hg38, c))

Results

The features on the plot from top to bottom are genes, m6a peaks, dsRNA regions, RNA editing sites detected in >400 LCLs and the subset of editing sites that were also found in RADAR database. The red line highlights the position for the top m6A QTL.

TWAS genes with PP4 > 0.5 for blood traits

Specific examples for blood traits

rs3204541 - DDX55

DDX55 has the molecular function of RNA Binding (GO:0003723). DDX55 is found in Nuclear Lumen (GO:0031981).
DDX55 is found in Nucleolus (GO:0005730).
DDX55 is upregulated in 721 B lymphoblasts cells.

EIF2B1 is involved in the biological process T Cell Receptor Signaling Pathway (GO:0050852).
EIF2B1 is involved in the biological process Antigen Receptor-Mediated Signaling Pathway (GO:0050851).
EIF2B1 has the molecular function of GTPase Regulator Activity (GO:0030695).
EIF2B1 has the molecular function of Guanyl-Nucleotide Exchange Factor Activity (GO:0005085).
EIF2B1 has the molecular function of Translation Initiation Factor Activity (GO:0003743).

## 'select()' returned 1:1 mapping between keys and columns

rs1806261 - RABEP1

RABEP1 has the molecular function of Protein Homodimerization Activity (GO:0042803).
RABEP1 is involved in the biological process Golgi To Plasma Membrane Transport (GO:0006893).
RABEP1 is involved in the biological process Membrane Fusion (GO:0061025).
RABEP1 is involved in the biological process Membrane Organization (GO:0061024).
RABEP1 is involved in the biological process post-Golgi Vesicle-Mediated Transport (GO:0006892).
RABEP1 is involved in the biological process Protein Localization To Cell Periphery (GO:1990778).

## 'select()' returned 1:1 mapping between keys and columns

rs3177647 - MAPKAPK5

MAPKAPK5 has the molecular function of MAP Kinase Kinase Activity (GO:0004708).
MAPKAPK5 has the molecular function of Calcium-Dependent Protein Kinase Activity (GO:0010857).
MAPKAPK5 has the molecular function of Calcium-Dependent Protein Serine/Threonine Kinase Activity

## 'select()' returned 1:1 mapping between keys and columns

rs1056879 - PPP2R3C

PPP2R3C is involved in the biological process B Cell Homeostasis (GO:0001782).
PPP2R3C is involved in the biological process T Cell Homeostasis (GO:0043029).
PPP2R3C is involved in the biological process Positive Regulation Of B Cell Activation (GO:0050871).
PPP2R3C is involved in the biological process Positive Regulation Of B Cell Differentiation (GO:0045579).
PPP2R3C is involved in the biological process Positive Regulation Of Lymphocyte Differentiation (GO:0045621).
PPP2R3C is involved in the biological process Regulation Of B Cell Differentiation (GO:0045577).

PRORP is involved in the biological process Mitochondrial tRNA Processing (GO:0090646).
PRORP is involved in the biological process tRNA 5’-End Processing (GO:0099116).
PRORP is involved in the biological process tRNA 5’-Leader Removal (GO:0001682).
PRORP is found in Intracellular Organelle Lumen (GO:0070013).
PRORP is found in Mitochondrial Matrix (GO:0005759).
PRORP has the molecular function of tRNA-specific Ribonuclease Activity (GO:0004549).

## 'select()' returned 1:many mapping between keys and columns