Code
for preprocessing GTEx data and fitting flash objects to the brain subtensor…
# Download GTEx data from GTEx Portal and load into R using Peter's code ------
samples.file <- "https://storage.googleapis.com/gtex_analysis_v7/annotations/GTEx_v7_Annotations_SampleAttributesDS.txt"
read.counts.file <- "~/Downloads/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct.gz"
source("https://raw.githubusercontent.com/stephenslab/topics/master/code/gtex.R?token=AWTMG_VIxH9P52pyv6O3a3tRQEFn0F9Zks5cN431wA%3D%3D")
gtex <- read.gtex.data(samples.file, read.counts.file)
# Pre-process the data as described in Wang, Fischer, and Song (2017) ---------
# "Normalization was performed using the size factors produced by the
# estimateSizeFactors function of DESeq2."
# Calculate the (logged) geometric means of the counts for each gene.
gene.geom.means <- apply(gtex$counts, 2, function(x) {
if (all(x == 0)) {-Inf} else {sum(log(x[x > 0])) / length(x)}
})
# Take the median of the normalized counts for each sample:
size.factors <- apply(gtex$counts, 1, function(x) {
exp(median((log(x) - gene.geom.means)[x > 0]))
})
# Normalize the samples:
gtex$counts <- apply(gtex$counts, 2, `/`, size.factors)
# "We required at least 15 samples to include a given tissue and an average of
# at least 500 normalized reads in one or more tissues to retain a gene."
tissues.to.include <- names(which(table(gtex$samples$specific) > 14))
samples.to.retain <- samples$specific %in% tissues.to.include
gtex$counts <- gtex$counts[samples.to.retain, ]
gtex$samples <- gtex$samples[samples.to.retain, ]
gene.max.avg.reads <- apply(gtex$counts, 2, function(x) {
max(aggregate(x, by = list(gtex$samples$specific), FUN = mean)$x)
})
genes.to.retain <- gene.max.avg.reads > 500
gtex$counts <- gtex$counts[, genes.to.retain]
# Log-transform data:
gtex$counts <- log1p(gtex$counts)
# Convert the data from a matrix to a tissue x individual x gene array --------
individual <- as.factor(sapply(strsplit(rownames(gtex$counts), "-"), function(x) {
paste(x[1:2], collapse = "-")
}))
gtex.df <- data.frame(individual = individual,
tissue = droplevels(gtex$samples$specific))
gtex.df <- cbind(gtex.df, gtex$counts)
gtex <- reshape2::melt(gtex.df)
colnames(gtex)[3] <- "gene"
rm(gtex.df)
gtex <- reshape2::acast(gtex, tissue ~ individual ~ gene)
# object size: 4.6 Gb (a 49 x 714 x 17792 array)
saveRDS(gtex, "~/Downloads/gtex_v7_array.rds") # temporary
# Create smaller array using only brain tissues -------------------------------
brain.tissues <- which(substr(dimnames(gtex)[[1]], 1, 5) == "brain")
brain <- gtex[brain.tissues, , ]
# Remove individuals with no brains:
brain <- brain[, apply(brain, 2, function(x) sum(!is.na(x))) > 0, ]
# object size: 450 Mb (a 13 x 254 x 17792 array)
saveRDS(brain, "~/Downloads/gtex_v7_brain.rds") # temporary
# Fit flash object ------------------------------------------------------------
devtools::load_all("~/Github/ashr")
devtools::load_all("~/Github/flashier")
devtools::load_all("~/Github/ebnm")
brain <- set.flash.data(brain) # object size: 675 Mb
brain.flash <- flashier(brain,
var.type = 3,
prior.type = c("nonnegative", "nonzero.mode", "normal.mix"),
conv.crit.fn = function(new, old, k) {
flashier:::calc.max.abs.chg.EF(new, old, k, n = 1)
},
greedy.Kmax = 10,
greedy.tol = 5e-4,
backfit.after = 2,
backfit.every = 1,
inner.backfit.maxiter = 1,
ash.param = list(optmethod = "mixSQP"),
nonmissing.thresh = c(0, 0.05, 0),
verbose = "O L1 E2")
saveRDS(brain.flash, "~/Downloads/brain_flash.rds") # temporary
brain.flash <- flashier(brain,
flash.init = brain.flash,
backfit = "only",
backfit.order = "dropout",
conv.crit.fn = function(new, old, k) {
flashier:::calc.max.abs.chg.EF(new, old, k, n = 1)
},
backfit.tol = 5e-4,
verbose = "O L1 E2")
saveRDS(brain.flash, "~/Downloads/brain_flash_bf.rds") # temporary
# Remove large fields from flash object.
brain.flash$fit$Y <- NULL
brain.flash$fit$Z <- NULL
brain.flash$sampler <- NULL
saveRDS(brain.flash, "./data/brain/brain05.rds")
# Other "brain" objects were created using different settings for
# nonmissing.thresh.
# Create data frame containing demographics and technical factors -------------
phenotypes <- readRDS("~/Downloads/GTEx_v7_Subjects.rds")
class(phenotypes) <- "data.frame"
rownames(phenotypes) <- phenotypes$SUBJID
phenotypes <- phenotypes[, -1]
# Get subset corresponding to individuals with brains.
phenotypes <- phenotypes[rownames(brain.flash$loadings$normalized.loadings[[2]]), ]
# Remove COHORT because they're all postmortem.
phenotypes <- phenotypes[, -1]
# Remove ETHNCTY because there are no hispanics.
phenotypes <- phenotypes[, -4]
samples <- readr::read_delim("https://storage.googleapis.com/gtex_analysis_v7/annotations/GTEx_v7_Annotations_SampleAttributesDS.txt",
delim = "\t")
samples <- samples[, c("SAMPID", "SMGEBTCHD")]
# Extract individuals from sample IDs.
samples$SAMPID <- sapply(lapply(strsplit(samples$SAMPID, "-"), `[`, 1:2),
paste, collapse = "-")
# Convert sequencing date to number of days after first sequencing date.
samples$SMGEBTCHD <- as.numeric(as.POSIXct(samples$SMGEBTCHD, format = "%m/%d/%Y"))
samples$SMGEBTCHD <- samples$SMGEBTCHD - min(samples$SMGEBTCHD, na.rm = TRUE)
samples$SMGEBTCHD <- samples$SMGEBTCHD / (60 * 60 * 24)
SEQDATE <- aggregate(SMGEBTCHD ~ SAMPID, data = samples,
FUN = mean, na.rm = TRUE)
rownames(SEQDATE) <- SEQDATE$SAMPID
SEQDATE <- SEQDATE[, 2, drop = FALSE]
SEQDATE <- SEQDATE[rownames(brain.flash$loadings$normalized.loadings[[2]]), ]
all.covar <- cbind(phenotypes, SEQDATE)
# Do not upload to GitHub (data includes protected attributes!)
saveRDS(all.covar, "~/Downloads/GTEx_v7_brain_covar.rds")
…and for producing the plots and tables below.
# Flip signs so that individual loadings are mostly postive -------------------
align.data <- function(brain) {
for (k in 1:brain$n.factors) {
if (mean(brain$loadings$normalized.loadings[[2]][, k]) < 0) {
brain$loadings$normalized.loadings[[2]][, k] <- -1 *
brain$loadings$normalized.loadings[[2]][, k]
brain$loadings$normalized.loadings[[3]][, k] <- -1 *
brain$loadings$normalized.loadings[[3]][, k]
}
}
return(brain)
}
# Barplots --------------------------------------------------------------------
do.plots <- function(brain, k, incl.gene.plot = TRUE, fix.ind.ylim = FALSE,
remove.exclusions = TRUE) {
brain.colors <- c("hotpink", "gray40", "green3", "blue1", "blue3",
"gray20", "black", "lightpink", "red1", "green4",
"green1", "red4", "red3")
tissue.idx <- order(brain$loadings$normalized.loadings[[1]][, k],
decreasing = TRUE)
barplot(brain$loadings$normalized.loadings[[1]][tissue.idx, k],
col = brain.colors[tissue.idx],
axisnames = FALSE,
xlab = "Tissues")
if (incl.gene.plot) {
if (brain$pve[k] < 1e-4) {
pve.str <- "<0.01% PVE"
} else {
pve.str <- paste0(100 * round(brain$pve[k], 4), "% PVE")
}
barplot(sort(brain$loadings$normalized.loadings[[3]][, k],
decreasing = TRUE),
axisnames = FALSE,
xlab = "Genes",
main = paste0("Factor ", k, ": ", pve.str))
}
vals <- brain$loadings$normalized.loadings[[2]][, k]
if (remove.exclusions) {
exclusions <- brain$fit$exclusions[[k]][[2]]
if (length(exclusions) > 0)
vals <- vals[-exclusions]
}
ylim <- NULL
if (fix.ind.ylim)
ylim <- c(-0.15, 0.25)
barplot(sort(vals, decreasing = TRUE),
axisnames = FALSE,
xlab = "Individuals",
ylim = ylim)
}
all.plots <- function(brain, remove.exclusions = TRUE) {
par(mfrow = c(2, 3))
for (k in order(brain$pve, decreasing = TRUE)) {
do.plots(brain, k, remove.exclusions = remove.exclusions)
}
}
all.plots.comparison <- function(brain1, brain2) {
par(mfrow = c(2, 4))
for (k in 1:brain1$n.factors) {
do.plots(brain1, k, incl.gene.plot = FALSE, fix.ind.ylim = TRUE)
do.plots(brain2, k, incl.gene.plot = FALSE, fix.ind.ylim = TRUE)
}
}
# Top tissues and genes -------------------------------------------------------
get.top.tissues <- function(brain, k) {
tissue.names <- c("amygdala", "ac cortex", "caudate bg",
"cerebellar hemisphere", "cerebellum",
"cortex", "fr cortex", "hippocampus",
"hypothalamus", "nuc acc bg", "putamen bg",
"spinal cord", "substantia nigra")
loadings <- round(brain$loadings$normalized.loadings[[1]][, k], 2)
tissue.labels <- paste0(tissue.names, " (", loadings, ")")
n.tissues <- sum(loadings > 0.1)
if (n.tissues > 6)
return("most tissues")
tissue.idx <- order(loadings, decreasing = TRUE)[1:n.tissues]
return(paste(tissue.labels[tissue.idx], collapse = ", "))
}
get.top.go <- function(brain, k, n.loadings = 200) {
loadings <- brain$loadings$normalized.loadings[[3]][, k]
names(loadings) <- sapply(strsplit(names(loadings), ".", fixed = TRUE),
`[`, 1)
overexpressed <- names(sort(loadings, decreasing = TRUE)[1:n.loadings])
cp.res <- clusterProfiler::enrichGO(overexpressed, "org.Hs.eg.db",
ont = "BP", keyType = "ENSEMBL",
universe = names(loadings),
minGSSize = 10)
over <- paste(head(cp.res, 3)$Description, collapse = ", ")
if (k == 1) {
under <- NA
} else {
underexpressed <- names(sort(loadings, decreasing = FALSE)[1:n.loadings])
cp.res2 <- clusterProfiler::enrichGO(underexpressed, "org.Hs.eg.db",
ont = "BP", keyType = "ENSEMBL",
universe = names(loadings),
minGSSize = 10)
under <- paste(head(cp.res2, 3)$Description, collapse = ", ")
}
list(over = over, under = under)
}
# Regression on demographic and technical variables ---------------------------
do.regression <- function(brain, covar, k) {
excl <- brain$fit$exclusions[[k]][[2]]
loadings <- brain$loadings$normalized.loadings[[2]][, k]
if (length(excl) > 0) {
covar <- covar[-excl, ]
loadings <- loadings[-excl]
}
df <- cbind(covar, loadings = loadings)
mod <- lm(loadings ~ ., data = df)
p.vals <- coefficients(summary(mod))[-1, 4]
p.vals <- ifelse(p.vals < 0.01,
formatC(p.vals, digits = 1, format = "e"),
round(p.vals, digits = 2))
sex.col <- which(substr(names(p.vals), 1, 3) == "SEX")
age.col <- which(substr(names(p.vals), 1, 3) == "AGE")
race.col <- which(substr(names(p.vals), 1, 4) == "RACE")
tech.col <- (1:length(p.vals))[-c(sex.col, age.col, race.col)]
mod.anova <- anova(mod)
ss <- sum(mod.anova$`Sum Sq`)
sex.pve <- mod.anova["SEX", ]$`Sum Sq` / ss
age.pve <- mod.anova["AGE", ]$`Sum Sq` / ss
race.pve <- mod.anova["RACE", ]$`Sum Sq` / ss
tech.pve <- sum(mod.anova[c("DTHVNT", "TRISCHD",
"logTRISCHD", "SEQDATE"), ]$`Sum Sq`) / ss
sex.res <- paste0("PVE: ", 100 * round(sex.pve, 4), "% (p: ",
paste(p.vals[sex.col], collapse = ", "), ")")
age.res <- paste0("PVE: ", 100 * round(age.pve, 4), "% (p: ",
paste(p.vals[age.col], collapse = ", "), ")")
race.res <- paste0("PVE: ", 100 * round(race.pve, 4), "% (p: ",
paste(p.vals[race.col], collapse = ", "), ")")
tech.res <- paste0("PVE: ", 100 * round(tech.pve, 4), "% (p: ",
paste(p.vals[tech.col], collapse = ", "), ")")
return(list(sex = sex.res, age = age.res, race = race.res, tech = tech.res))
}
# Table of results ------------------------------------------------------------
do.table <- function(brain, covar, k) {
top.go <- get.top.go(brain, k)
regress.res <- do.regression(brain, covar, k)
tabl <- rbind(c("PVE: ", paste0(100 * round(brain$pve[k], 4), "%")),
c("top tissues: ", get.top.tissues(brain, k)),
c("overexpressed: ", top.go$over),
c("underexpressed: ", top.go$under),
c("age effect: ", regress.res$age),
c("sex effect: ", regress.res$sex),
c("race effect: ", regress.res$race),
c("technical factors:", regress.res$tech))
return(tabl)
}
all.tables <- function(brain, covar) {
cat("\n")
for (k in order(brain$pve, decreasing = TRUE)) {
print(knitr::kable(do.table(brain, covar, k),
caption = paste("Factor", k)))
cat("\n")
}
}
Factor details (10 factors)
Note, in particular, the large differences from the paper in proportions of variance explained by demographic effects.
covar <- readRDS("~/Downloads/GTEx_v7_brain_covar.rds")
# Two of the race levels only have one tissue each, so I remove them.
covar$RACE[covar$RACE %in% c(1, 4)] <- NA
covar$RACE <- droplevels(covar$RACE)
all.tables(brain05, covar)
#>
#> Loading required package: org.Hs.eg.db
#> Loading required package: AnnotationDbi
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#>
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#>
#> clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#> clusterExport, clusterMap, parApply, parCapply, parLapply,
#> parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#>
#> anyDuplicated, append, as.data.frame, cbind, colMeans,
#> colnames, colSums, do.call, duplicated, eval, evalq, Filter,
#> Find, get, grep, grepl, intersect, is.unsorted, lapply,
#> lengths, Map, mapply, match, mget, order, paste, pmax,
#> pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
#> rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
#> tapply, union, unique, unsplit, which, which.max, which.min
#> Loading required package: Biobase
#> Welcome to Bioconductor
#>
#> Vignettes contain introductory material; view with
#> 'browseVignettes()'. To cite Bioconductor, see
#> 'citation("Biobase")', and for packages 'citation("pkgname")'.
#> Loading required package: IRanges
#> Loading required package: S4Vectors
#>
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:base':
#>
#> expand.grid
#>
Factor 1
PVE: |
98.77% |
top tissues: |
most tissues |
overexpressed: |
SRP-dependent cotranslational protein targeting to membrane, establishment of protein localization to membrane, cotranslational protein targeting to membrane |
underexpressed: |
NA |
age effect: |
PVE: 2.31% (p: 8.2e-03) |
sex effect: |
PVE: 0.86% (p: 0.25) |
race effect: |
PVE: 0.02% (p: 0.69) |
technical factors: |
PVE: 3.96% (p: 0.02, 0.43, 0.4, 0.15) |
Factor 2
PVE: |
0.44% |
top tissues: |
cerebellar hemisphere (0.74), cerebellum (0.67) |
overexpressed: |
|
underexpressed: |
telencephalon development, forebrain development, forebrain generation of neurons |
age effect: |
PVE: 0.12% (p: 0.89) |
sex effect: |
PVE: 2.18% (p: 0.11) |
race effect: |
PVE: 0.07% (p: 0.71) |
technical factors: |
PVE: 6.72% (p: 0.22, 0.78, 0.74, 0.65) |
Factor 3
PVE: |
0.19% |
top tissues: |
spinal cord (0.81), substantia nigra (0.41), hypothalamus (0.31), hippocampus (0.19), putamen bg (0.13) |
overexpressed: |
skeletal system development, embryonic skeletal system development, anterior/posterior pattern specification |
underexpressed: |
telencephalon development, regulation of neurotransmitter levels, neurotransmitter transport |
age effect: |
PVE: 1.01% (p: 0.27) |
sex effect: |
PVE: 1.95% (p: 0.12) |
race effect: |
PVE: 0.07% (p: 0.68) |
technical factors: |
PVE: 5.77% (p: 0.5, 0.09, 0.04, 0.42) |
Factor 4
PVE: |
0.11% |
top tissues: |
fr cortex (0.65), cortex (0.54), ac cortex (0.46), hippocampus (0.22), amygdala (0.12) |
overexpressed: |
potassium ion transport, fear response, regulation of membrane potential |
underexpressed: |
sensory perception, pattern specification process, nephron tubule development |
age effect: |
PVE: 5.31% (p: 1.5e-03) |
sex effect: |
PVE: 2.31% (p: 0.02) |
race effect: |
PVE: 0.12% (p: 0.53) |
technical factors: |
PVE: 4.5% (p: 0.41, 0.06, 0.06, 5.6e-03) |
Factor 6
PVE: |
0.06% |
top tissues: |
nuc acc bg (0.66), caudate bg (0.57), putamen bg (0.46), spinal cord (0.13) |
overexpressed: |
behavior, associative learning, learning or memory |
underexpressed: |
forebrain development, cyclic-nucleotide-mediated signaling, second-messenger-mediated signaling |
age effect: |
PVE: 0.03% (p: 0.77) |
sex effect: |
PVE: 0.4% (p: 0.51) |
race effect: |
PVE: 0.09% (p: 0.68) |
technical factors: |
PVE: 3.67% (p: 0.1, 0.15, 0.11, 0.64) |
Factor 7
PVE: |
0.04% |
top tissues: |
hypothalamus (0.96), substantia nigra (0.21), nuc acc bg (0.17) |
overexpressed: |
behavior, signal release, neuropeptide signaling pathway |
underexpressed: |
ensheathment of neurons, axon ensheathment, myelination |
age effect: |
PVE: 0.25% (p: 0.73) |
sex effect: |
PVE: 0.26% (p: 0.76) |
race effect: |
PVE: 0.05% (p: 0.76) |
technical factors: |
PVE: 4.47% (p: 0.62, 0.34, 0.27, 0.04) |
Factor 5
PVE: |
0.04% |
top tissues: |
most tissues |
overexpressed: |
bicarbonate transport, neutrophil chemotaxis, leukocyte migration |
underexpressed: |
neurotransmitter transport, modulation of chemical synaptic transmission, synaptic vesicle cycle |
age effect: |
PVE: 7.24% (p: 6.3e-04) |
sex effect: |
PVE: 0.23% (p: 0.38) |
race effect: |
PVE: 0.11% (p: 0.64) |
technical factors: |
PVE: 2.98% (p: 0.52, 0.71, 0.56, 0.02) |
Factor 10
PVE: |
0.03% |
top tissues: |
most tissues |
overexpressed: |
positive regulation of response to external stimulus, regulation of receptor activity, L-glutamate transport |
underexpressed: |
cell-matrix adhesion |
age effect: |
PVE: 3.32% (p: 0.01) |
sex effect: |
PVE: 0.33% (p: 0.56) |
race effect: |
PVE: 0.4% (p: 0.45) |
technical factors: |
PVE: 8.4% (p: 0.29, 0.14, 0.04, 0.97) |
Factor 8
PVE: |
0.03% |
top tissues: |
most tissues |
overexpressed: |
regulation of receptor activity |
underexpressed: |
neutrophil activation, granulocyte activation, neutrophil mediated immunity |
age effect: |
PVE: 0.28% (p: 0.12) |
sex effect: |
PVE: 2.28% (p: 0.06) |
race effect: |
PVE: 1.64% (p: 0.07) |
technical factors: |
PVE: 19.29% (p: 0.62, 0.12, 0.06, 1.2e-09) |
Factor 9
PVE: |
0.02% |
top tissues: |
most tissues |
overexpressed: |
response to lipopolysaccharide, regulation of angiogenesis, response to molecule of bacterial origin |
underexpressed: |
|
age effect: |
PVE: 6.13% (p: 3.0e-04) |
sex effect: |
PVE: 0.14% (p: 0.54) |
race effect: |
PVE: 0.34% (p: 0.34) |
technical factors: |
PVE: 2.65% (p: 0.03, 0.94, 0.97, 0.2) |