Code
for preprocessing GTEx data and fitting flash objects to the brain subtensor…
# Download GTEx data from GTEx Portal and load into R using Peter's code ------
samples.file <- "https://storage.googleapis.com/gtex_analysis_v7/annotations/GTEx_v7_Annotations_SampleAttributesDS.txt"
read.counts.file <- "~/Downloads/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct.gz"
source("https://raw.githubusercontent.com/stephenslab/topics/master/code/gtex.R?token=AWTMG_VIxH9P52pyv6O3a3tRQEFn0F9Zks5cN431wA%3D%3D")
gtex <- read.gtex.data(samples.file, read.counts.file)
# Pre-process the data as described in Wang, Fischer, and Song (2017) ---------
# "Normalization was performed using the size factors produced by the
#   estimateSizeFactors function of DESeq2."
#     Calculate the (logged) geometric means of the counts for each gene.
gene.geom.means <- apply(gtex$counts, 2, function(x) {
  if (all(x == 0)) {-Inf} else {sum(log(x[x > 0])) / length(x)}
})
#     Take the median of the normalized counts for each sample:
size.factors <- apply(gtex$counts, 1, function(x) {
  exp(median((log(x) - gene.geom.means)[x > 0]))
})
#     Normalize the samples:
gtex$counts <- apply(gtex$counts, 2, `/`, size.factors)
# "We required at least 15 samples to include a given tissue and an average of
#   at least 500 normalized reads in one or more tissues to retain a gene."
tissues.to.include <- names(which(table(gtex$samples$specific) > 14))
samples.to.retain <- samples$specific %in% tissues.to.include
gtex$counts <- gtex$counts[samples.to.retain, ]
gtex$samples <- gtex$samples[samples.to.retain, ]
gene.max.avg.reads <- apply(gtex$counts, 2, function(x) {
  max(aggregate(x, by = list(gtex$samples$specific), FUN = mean)$x)
})
genes.to.retain <- gene.max.avg.reads > 500
gtex$counts <- gtex$counts[, genes.to.retain]
# Log-transform data:
gtex$counts <- log1p(gtex$counts)
# Convert the data from a matrix to a tissue x individual x gene array --------
individual <- as.factor(sapply(strsplit(rownames(gtex$counts), "-"), function(x) {
  paste(x[1:2], collapse = "-")
}))
gtex.df <- data.frame(individual = individual,
                      tissue = droplevels(gtex$samples$specific))
gtex.df <- cbind(gtex.df, gtex$counts)
gtex <- reshape2::melt(gtex.df)
colnames(gtex)[3] <- "gene"
rm(gtex.df)
gtex <- reshape2::acast(gtex, tissue ~ individual ~ gene)
# object size: 4.6 Gb (a 49 x 714 x 17792 array)
saveRDS(gtex, "~/Downloads/gtex_v7_array.rds") # temporary
# Create smaller array using only brain tissues -------------------------------
brain.tissues <- which(substr(dimnames(gtex)[[1]], 1, 5) == "brain")
brain <- gtex[brain.tissues, , ]
# Remove individuals with no brains:
brain <- brain[, apply(brain, 2, function(x) sum(!is.na(x))) > 0, ]
# object size: 450 Mb (a 13 x 254 x 17792 array)
saveRDS(brain, "~/Downloads/gtex_v7_brain.rds") # temporary
# Fit flash object ------------------------------------------------------------
devtools::load_all("~/Github/ashr")
devtools::load_all("~/Github/flashier")
devtools::load_all("~/Github/ebnm")
brain <- set.flash.data(brain) # object size: 675 Mb
brain.flash <- flashier(brain,
                        var.type = 3,
                        prior.type = c("nonnegative", "nonzero.mode", "normal.mix"),
                        conv.crit.fn = function(new, old, k) {
                          flashier:::calc.max.abs.chg.EF(new, old, k, n = 1)
                        },
                        greedy.Kmax = 10,
                        greedy.tol = 5e-4,
                        backfit.after = 2,
                        backfit.every = 1,
                        inner.backfit.maxiter = 1,
                        ash.param = list(optmethod = "mixSQP"),
                        nonmissing.thresh = c(0, 0.05, 0),
                        verbose = "O L1 E2")
saveRDS(brain.flash, "~/Downloads/brain_flash.rds") # temporary
brain.flash <- flashier(brain,
                        flash.init = brain.flash,
                        backfit = "only",
                        backfit.order = "dropout",
                        conv.crit.fn = function(new, old, k) {
                          flashier:::calc.max.abs.chg.EF(new, old, k, n = 1)
                        },
                        backfit.tol = 5e-4,
                        verbose = "O L1 E2")
saveRDS(brain.flash, "~/Downloads/brain_flash_bf.rds") # temporary
# Remove large fields from flash object.
brain.flash$fit$Y <- NULL
brain.flash$fit$Z <- NULL
brain.flash$sampler <- NULL
saveRDS(brain.flash, "./data/brain/brain05.rds")
# Other "brain" objects were created using different settings for
#   nonmissing.thresh.
# Create data frame containing demographics and technical factors -------------
phenotypes <- readRDS("~/Downloads/GTEx_v7_Subjects.rds")
class(phenotypes) <- "data.frame"
rownames(phenotypes) <- phenotypes$SUBJID
phenotypes <- phenotypes[, -1]
# Get subset corresponding to individuals with brains.
phenotypes <- phenotypes[rownames(brain.flash$loadings$normalized.loadings[[2]]), ]
# Remove COHORT because they're all postmortem.
phenotypes <- phenotypes[, -1]
# Remove ETHNCTY because there are no hispanics.
phenotypes <- phenotypes[, -4]
samples <- readr::read_delim("https://storage.googleapis.com/gtex_analysis_v7/annotations/GTEx_v7_Annotations_SampleAttributesDS.txt",
                             delim = "\t")
samples <- samples[, c("SAMPID", "SMGEBTCHD")]
# Extract individuals from sample IDs.
samples$SAMPID <- sapply(lapply(strsplit(samples$SAMPID, "-"), `[`, 1:2),
                         paste, collapse = "-")
# Convert sequencing date to number of days after first sequencing date.
samples$SMGEBTCHD <- as.numeric(as.POSIXct(samples$SMGEBTCHD, format = "%m/%d/%Y"))
samples$SMGEBTCHD <- samples$SMGEBTCHD - min(samples$SMGEBTCHD, na.rm = TRUE)
samples$SMGEBTCHD <- samples$SMGEBTCHD / (60 * 60 * 24)
SEQDATE <- aggregate(SMGEBTCHD ~ SAMPID, data = samples,
                       FUN = mean, na.rm = TRUE)
rownames(SEQDATE) <- SEQDATE$SAMPID
SEQDATE <- SEQDATE[, 2, drop = FALSE]
SEQDATE <- SEQDATE[rownames(brain.flash$loadings$normalized.loadings[[2]]), ]
all.covar <- cbind(phenotypes, SEQDATE)
# Do not upload to GitHub (data includes protected attributes!)
saveRDS(all.covar, "~/Downloads/GTEx_v7_brain_covar.rds")
…and for producing the plots and tables below.
# Flip signs so that individual loadings are mostly postive -------------------
align.data <- function(brain) {
  for (k in 1:brain$n.factors) {
    if (mean(brain$loadings$normalized.loadings[[2]][, k]) < 0) {
      brain$loadings$normalized.loadings[[2]][, k] <- -1 *
        brain$loadings$normalized.loadings[[2]][, k]
      brain$loadings$normalized.loadings[[3]][, k] <- -1 *
        brain$loadings$normalized.loadings[[3]][, k]
    }
  }
  return(brain)
}
# Barplots --------------------------------------------------------------------
do.plots <- function(brain, k, incl.gene.plot = TRUE, fix.ind.ylim = FALSE,
                     remove.exclusions = TRUE) {
  brain.colors <- c("hotpink", "gray40", "green3", "blue1", "blue3", 
                    "gray20", "black", "lightpink", "red1", "green4", 
                    "green1", "red4", "red3")
  tissue.idx <- order(brain$loadings$normalized.loadings[[1]][, k],
               decreasing = TRUE)
  barplot(brain$loadings$normalized.loadings[[1]][tissue.idx, k],
          col = brain.colors[tissue.idx], 
          axisnames = FALSE,
          xlab = "Tissues")
  
  if (incl.gene.plot) {
    if (brain$pve[k] < 1e-4) {
      pve.str <- "<0.01% PVE"
    } else {
      pve.str <- paste0(100 * round(brain$pve[k], 4), "% PVE")
    }
    barplot(sort(brain$loadings$normalized.loadings[[3]][, k],
                 decreasing = TRUE),
            axisnames = FALSE,
            xlab = "Genes",
            main = paste0("Factor ", k, ": ", pve.str))
  }
  
  vals <- brain$loadings$normalized.loadings[[2]][, k]
  if (remove.exclusions) {
    exclusions <- brain$fit$exclusions[[k]][[2]]
    if (length(exclusions) > 0)
      vals <- vals[-exclusions]
  }
  ylim <- NULL
  if (fix.ind.ylim)
    ylim <- c(-0.15, 0.25)
  barplot(sort(vals, decreasing = TRUE),
          axisnames = FALSE,
          xlab = "Individuals",
          ylim = ylim)
}
all.plots <- function(brain, remove.exclusions = TRUE) {
  par(mfrow = c(2, 3))
  for (k in order(brain$pve, decreasing = TRUE)) {
    do.plots(brain, k, remove.exclusions = remove.exclusions)
  }
}
all.plots.comparison <- function(brain1, brain2) {
  par(mfrow = c(2, 4))
  for (k in 1:brain1$n.factors) {
    do.plots(brain1, k, incl.gene.plot = FALSE, fix.ind.ylim = TRUE)
    do.plots(brain2, k, incl.gene.plot = FALSE, fix.ind.ylim = TRUE)
  }
}
# Top tissues and genes -------------------------------------------------------
get.top.tissues <- function(brain, k) {
  tissue.names <- c("amygdala", "ac cortex", "caudate bg",
                    "cerebellar hemisphere", "cerebellum",
                    "cortex", "fr cortex", "hippocampus",
                    "hypothalamus", "nuc acc bg", "putamen bg",
                    "spinal cord", "substantia nigra")
  
  loadings <- round(brain$loadings$normalized.loadings[[1]][, k], 2)
  tissue.labels <- paste0(tissue.names, " (", loadings, ")")
  
  n.tissues <- sum(loadings > 0.1)
  if (n.tissues > 6)
    return("most tissues")
  tissue.idx <- order(loadings, decreasing = TRUE)[1:n.tissues]
  return(paste(tissue.labels[tissue.idx], collapse = ", "))
}
get.top.go <- function(brain, k, n.loadings = 200) {
  loadings <- brain$loadings$normalized.loadings[[3]][, k]
  names(loadings) <- sapply(strsplit(names(loadings), ".", fixed = TRUE), 
                            `[`, 1)
  overexpressed <- names(sort(loadings, decreasing = TRUE)[1:n.loadings])
  cp.res <- clusterProfiler::enrichGO(overexpressed, "org.Hs.eg.db", 
                                      ont = "BP", keyType = "ENSEMBL",
                                      universe = names(loadings), 
                                      minGSSize = 10)
  over <- paste(head(cp.res, 3)$Description, collapse = ", ")
  
  if (k == 1) {
    under <- NA
  } else {
    underexpressed <- names(sort(loadings, decreasing = FALSE)[1:n.loadings])
    cp.res2 <- clusterProfiler::enrichGO(underexpressed, "org.Hs.eg.db", 
                                         ont = "BP", keyType = "ENSEMBL",
                                         universe = names(loadings), 
                                         minGSSize = 10)
    under <- paste(head(cp.res2, 3)$Description, collapse = ", ")
  }
  
  list(over = over, under = under)
}
# Regression on demographic and technical variables ---------------------------
do.regression <- function(brain, covar, k) {
  excl <- brain$fit$exclusions[[k]][[2]]
  loadings <- brain$loadings$normalized.loadings[[2]][, k]
  
  if (length(excl) > 0) {
    covar <- covar[-excl, ]
    loadings <- loadings[-excl]
  }
  df <- cbind(covar, loadings = loadings)
  mod <- lm(loadings ~ ., data = df)
  
  p.vals <- coefficients(summary(mod))[-1, 4]
  p.vals <- ifelse(p.vals < 0.01, 
                   formatC(p.vals, digits = 1, format = "e"),
                   round(p.vals, digits = 2))
  sex.col <- which(substr(names(p.vals), 1, 3) == "SEX")
  age.col <- which(substr(names(p.vals), 1, 3) == "AGE")
  race.col <- which(substr(names(p.vals), 1, 4) == "RACE")
  tech.col <- (1:length(p.vals))[-c(sex.col, age.col, race.col)]
  
  mod.anova <- anova(mod)
  ss <- sum(mod.anova$`Sum Sq`)
  sex.pve <- mod.anova["SEX", ]$`Sum Sq` / ss
  age.pve <- mod.anova["AGE", ]$`Sum Sq` / ss
  race.pve <- mod.anova["RACE", ]$`Sum Sq` / ss
  tech.pve <- sum(mod.anova[c("DTHVNT", "TRISCHD", 
                              "logTRISCHD", "SEQDATE"), ]$`Sum Sq`) / ss
  
  sex.res <- paste0("PVE: ", 100 * round(sex.pve, 4), "% (p: ",
                    paste(p.vals[sex.col], collapse = ", "), ")")
  age.res <- paste0("PVE: ", 100 * round(age.pve, 4), "% (p: ",
                    paste(p.vals[age.col], collapse = ", "), ")")
  race.res <- paste0("PVE: ", 100 * round(race.pve, 4), "% (p: ",
                    paste(p.vals[race.col], collapse = ", "), ")")
  tech.res <- paste0("PVE: ", 100 * round(tech.pve, 4), "% (p: ",
                    paste(p.vals[tech.col], collapse = ", "), ")")
  
  return(list(sex = sex.res, age = age.res, race = race.res, tech = tech.res))
}
# Table of results ------------------------------------------------------------
do.table <- function(brain, covar, k) {
  top.go <- get.top.go(brain, k)
  regress.res <- do.regression(brain, covar, k)
  tabl <- rbind(c("PVE: ", paste0(100 * round(brain$pve[k], 4), "%")),
                c("top tissues: ", get.top.tissues(brain, k)),
                c("overexpressed: ", top.go$over),
                c("underexpressed: ", top.go$under),
                c("age effect: ", regress.res$age),
                c("sex effect: ", regress.res$sex),
                c("race effect: ", regress.res$race),
                c("technical factors:", regress.res$tech))
  return(tabl)
}
all.tables <- function(brain, covar) {
  cat("\n")
  for (k in order(brain$pve, decreasing = TRUE)) {
    print(knitr::kable(do.table(brain, covar, k), 
                       caption = paste("Factor", k)))
    cat("\n")
  }
}
 
Factor details (10 factors)
Note, in particular, the large differences from the paper in proportions of variance explained by demographic effects.
covar <- readRDS("~/Downloads/GTEx_v7_brain_covar.rds")
# Two of the race levels only have one tissue each, so I remove them.
covar$RACE[covar$RACE %in% c(1, 4)] <- NA
covar$RACE <- droplevels(covar$RACE)
all.tables(brain05, covar)
#> 
#> Loading required package: org.Hs.eg.db
#> Loading required package: AnnotationDbi
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#> 
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#> 
#>     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#>     clusterExport, clusterMap, parApply, parCapply, parLapply,
#>     parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following objects are masked from 'package:stats':
#> 
#>     IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#> 
#>     anyDuplicated, append, as.data.frame, cbind, colMeans,
#>     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
#>     Find, get, grep, grepl, intersect, is.unsorted, lapply,
#>     lengths, Map, mapply, match, mget, order, paste, pmax,
#>     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
#>     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
#>     tapply, union, unique, unsplit, which, which.max, which.min
#> Loading required package: Biobase
#> Welcome to Bioconductor
#> 
#>     Vignettes contain introductory material; view with
#>     'browseVignettes()'. To cite Bioconductor, see
#>     'citation("Biobase")', and for packages 'citation("pkgname")'.
#> Loading required package: IRanges
#> Loading required package: S4Vectors
#> 
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:base':
#> 
#>     expand.grid
#> 
Factor 1
| PVE: | 98.77% | 
| top tissues: | most tissues | 
| overexpressed: | SRP-dependent cotranslational protein targeting to membrane, establishment of protein localization to membrane, cotranslational protein targeting to membrane | 
| underexpressed: | NA | 
| age effect: | PVE: 2.31% (p: 8.2e-03) | 
| sex effect: | PVE: 0.86% (p: 0.25) | 
| race effect: | PVE: 0.02% (p: 0.69) | 
| technical factors: | PVE: 3.96% (p: 0.02, 0.43, 0.4, 0.15) | 
Factor 2
| PVE: | 0.44% | 
| top tissues: | cerebellar hemisphere (0.74), cerebellum (0.67) | 
| overexpressed: |  | 
| underexpressed: | telencephalon development, forebrain development, forebrain generation of neurons | 
| age effect: | PVE: 0.12% (p: 0.89) | 
| sex effect: | PVE: 2.18% (p: 0.11) | 
| race effect: | PVE: 0.07% (p: 0.71) | 
| technical factors: | PVE: 6.72% (p: 0.22, 0.78, 0.74, 0.65) | 
Factor 3
| PVE: | 0.19% | 
| top tissues: | spinal cord (0.81), substantia nigra (0.41), hypothalamus (0.31), hippocampus (0.19), putamen bg (0.13) | 
| overexpressed: | skeletal system development, embryonic skeletal system development, anterior/posterior pattern specification | 
| underexpressed: | telencephalon development, regulation of neurotransmitter levels, neurotransmitter transport | 
| age effect: | PVE: 1.01% (p: 0.27) | 
| sex effect: | PVE: 1.95% (p: 0.12) | 
| race effect: | PVE: 0.07% (p: 0.68) | 
| technical factors: | PVE: 5.77% (p: 0.5, 0.09, 0.04, 0.42) | 
Factor 4
| PVE: | 0.11% | 
| top tissues: | fr cortex (0.65), cortex (0.54), ac cortex (0.46), hippocampus (0.22), amygdala (0.12) | 
| overexpressed: | potassium ion transport, fear response, regulation of membrane potential | 
| underexpressed: | sensory perception, pattern specification process, nephron tubule development | 
| age effect: | PVE: 5.31% (p: 1.5e-03) | 
| sex effect: | PVE: 2.31% (p: 0.02) | 
| race effect: | PVE: 0.12% (p: 0.53) | 
| technical factors: | PVE: 4.5% (p: 0.41, 0.06, 0.06, 5.6e-03) | 
Factor 6
| PVE: | 0.06% | 
| top tissues: | nuc acc bg (0.66), caudate bg (0.57), putamen bg (0.46), spinal cord (0.13) | 
| overexpressed: | behavior, associative learning, learning or memory | 
| underexpressed: | forebrain development, cyclic-nucleotide-mediated signaling, second-messenger-mediated signaling | 
| age effect: | PVE: 0.03% (p: 0.77) | 
| sex effect: | PVE: 0.4% (p: 0.51) | 
| race effect: | PVE: 0.09% (p: 0.68) | 
| technical factors: | PVE: 3.67% (p: 0.1, 0.15, 0.11, 0.64) | 
Factor 7
| PVE: | 0.04% | 
| top tissues: | hypothalamus (0.96), substantia nigra (0.21), nuc acc bg (0.17) | 
| overexpressed: | behavior, signal release, neuropeptide signaling pathway | 
| underexpressed: | ensheathment of neurons, axon ensheathment, myelination | 
| age effect: | PVE: 0.25% (p: 0.73) | 
| sex effect: | PVE: 0.26% (p: 0.76) | 
| race effect: | PVE: 0.05% (p: 0.76) | 
| technical factors: | PVE: 4.47% (p: 0.62, 0.34, 0.27, 0.04) | 
Factor 5
| PVE: | 0.04% | 
| top tissues: | most tissues | 
| overexpressed: | bicarbonate transport, neutrophil chemotaxis, leukocyte migration | 
| underexpressed: | neurotransmitter transport, modulation of chemical synaptic transmission, synaptic vesicle cycle | 
| age effect: | PVE: 7.24% (p: 6.3e-04) | 
| sex effect: | PVE: 0.23% (p: 0.38) | 
| race effect: | PVE: 0.11% (p: 0.64) | 
| technical factors: | PVE: 2.98% (p: 0.52, 0.71, 0.56, 0.02) | 
Factor 10
| PVE: | 0.03% | 
| top tissues: | most tissues | 
| overexpressed: | positive regulation of response to external stimulus, regulation of receptor activity, L-glutamate transport | 
| underexpressed: | cell-matrix adhesion | 
| age effect: | PVE: 3.32% (p: 0.01) | 
| sex effect: | PVE: 0.33% (p: 0.56) | 
| race effect: | PVE: 0.4% (p: 0.45) | 
| technical factors: | PVE: 8.4% (p: 0.29, 0.14, 0.04, 0.97) | 
Factor 8
| PVE: | 0.03% | 
| top tissues: | most tissues | 
| overexpressed: | regulation of receptor activity | 
| underexpressed: | neutrophil activation, granulocyte activation, neutrophil mediated immunity | 
| age effect: | PVE: 0.28% (p: 0.12) | 
| sex effect: | PVE: 2.28% (p: 0.06) | 
| race effect: | PVE: 1.64% (p: 0.07) | 
| technical factors: | PVE: 19.29% (p: 0.62, 0.12, 0.06, 1.2e-09) | 
Factor 9
| PVE: | 0.02% | 
| top tissues: | most tissues | 
| overexpressed: | response to lipopolysaccharide, regulation of angiogenesis, response to molecule of bacterial origin | 
| underexpressed: |  | 
| age effect: | PVE: 6.13% (p: 3.0e-04) | 
| sex effect: | PVE: 0.14% (p: 0.54) | 
| race effect: | PVE: 0.34% (p: 0.34) | 
| technical factors: | PVE: 2.65% (p: 0.03, 0.94, 0.97, 0.2) |