library(miloDE)
# library containing toy data
suppressMessages(library(MouseGastrulationData))
# analysis libraries
library(scuttle)
suppressMessages(library(miloR))
suppressMessages(library(uwot))
library(scran)
suppressMessages(library(dplyr))
library(reshape2)
# packages for gene cluster analysis
library(scWGCNA)
#>
suppressMessages(library(Seurat))
# plotting libraries
library(ggplot2)
library(viridis)
#> Loading required package: viridisLite
library(ggpubr)
We will also show how we can parallel
de_test_neighbourhoods
. For this we need to load
BiocParallel
and enable multicore parallel evaluation.
library(BiocParallel)
= 4
ncores = MulticoreParam(workers = ncores)
mcparam register(mcparam)
We will use data from mouse gastrulation scRNA-seq atlas Pijuan-Sala et al., 2019. As a part of this project, using chimera mouse embryos, authors characterised the impact of Tal1 knock out on the mouse development. The most prominent phenotype was a loss of blood cells in Tal1- cells.
In this vignette, we will apply miloDE on cells contributing to blood lineage (endothelia and haematoendothelial progenitors (haem. prog-s.)) and assess whether we can detect and characterise more subtle phenotypes (i.e. DE).
Tal1-/+ chimer data are processed and directly available within
MouseGastrulationData
package. As condition ID, we will use
slot tomato
which indicates whether cells carry Tal1
KO.
# load chimera Tal1
= suppressMessages(MouseGastrulationData::Tal1ChimeraData())
sce
# downsample to few selected cell types
= c("Spinal cord" , "Haematoendothelial progenitors", "Endothelium" , "Blood progenitors 1" , "Blood progenitors 2")
cts = sce[, sce$celltype.mapped %in% cts]
sce # let's rename Haematoendothelial progenitors
$celltype.mapped[sce$celltype.mapped == "Haematoendothelial progenitors"] = "Haem. prog-s."
sce
# delete row for tomato
= sce[!rownames(sce) == "tomato-td" , ]
sce
# add logcounts
= logNormCounts(sce)
sce
# update tomato field to be more interpretable
$tomato = sapply(sce$tomato , function(x) ifelse(isTRUE(x) , "Tal1_KO" , "WT"))
sce
# for this exercise, we focus on 3000 highly variable genes (for computational efficiency)
= modelGeneVar(sce)
dec.sce = getTopHVGs(dec.sce, n = 3000)
hvg.genes = sce[hvg.genes , ]
sce # change rownames to symbol names
= as.data.frame(rowData(sce))
rowdata rownames(sce) = rowdata$SYMBOL
# add UMAPs
set.seed(32)
= as.data.frame(uwot::umap(reducedDim(sce , "pca.corrected")))
umaps # let's store UMAPs - we will use them for visualisation
reducedDim(sce , "UMAP") = umaps
# plot UMAPs, colored by cell types
= cbind(as.data.frame(colData(sce)) , reducedDim(sce , "UMAP"))
umaps names(EmbryoCelltypeColours)[names(EmbryoCelltypeColours) == "Haematoendothelial progenitors"] = "Haem. prog-s."
= EmbryoCelltypeColours[names(EmbryoCelltypeColours) %in% unique(umaps$celltype.mapped)]
cols_ct
= ggplot(umaps , aes(x = V1 , y = V2 , col = celltype.mapped)) +
p geom_point() +
scale_color_manual(values = cols_ct) +
facet_wrap(~tomato) +
theme_bw() +
labs(x = "UMAP-1", y = "UMAP-2")
p
UMAPs, coloured by cell types
estimate_neighbourhood_sizes
allows to gauge how
neighbourhood size distribution changes as a function of (order,k). It
might be useful to run it first in order to determine optimal range that
will return desired neighbourhood sizes.
= estimate_neighbourhood_sizes(sce, k_grid = seq(10,40,5) ,
stat_k order = 2, prop = 0.1 , filtering = TRUE,
reducedDim_name = "pca.corrected" , plot_stat = TRUE)
#> Running for next k values:
#> 10, 15, 20, 25, 30, 35, 40
#> 100% covered by 106 sets.
#> Finished for k = 10
#> 100% covered by 57 sets.
#> Finished for k = 15
#> 100% covered by 42 sets.
#> Finished for k = 20
#> 100% covered by 33 sets.
#> Finished for k = 25
#> 100% covered by 30 sets.
#> Finished for k = 30
#> 100% covered by 28 sets.
#> Finished for k = 35
#> 100% covered by 26 sets.
#> Finished for k = 40
#> Finished the estimation of neighbourhood sizes ~ k dependancy (order = 2).
Neighbourhood size distribution ~ k
kable(stat_k , caption = "Neighbourhood size distribution ~ k")
k | min | q25 | med | q75 | max |
---|---|---|---|---|---|
10 | 54 | 113.00 | 153.5 | 222.75 | 389 |
15 | 77 | 202.00 | 272.0 | 396.00 | 523 |
20 | 104 | 331.25 | 408.0 | 536.00 | 705 |
25 | 109 | 416.00 | 521.0 | 623.00 | 909 |
30 | 186 | 448.50 | 595.0 | 706.75 | 1073 |
35 | 151 | 542.00 | 642.5 | 869.00 | 1143 |
40 | 218 | 569.25 | 765.0 | 920.75 | 1282 |
We will use k=20, order=2 –> that returns an average neighbourhood size ~400 cells.
To assign neighbourhoods, use assign_neighbourhoods
.
Note that under the hood, there is a random sampling of index cells
–> if you want to ensure the same neighbourhood assignment, please
set seed prior to running this function.
Note that we set filtering = TRUE
to achieve a refined
assignment in which redundant neighbourhoods are discarded.
Alternatively this can be done post hoc, using
filter_neighbourhoods
.
set.seed(32)
= assign_neighbourhoods(sce , k = 20 , order = 2,
sce_milo filtering = TRUE , reducedDim_name = "pca.corrected" , verbose = F)
#> 100% covered by 42 sets.
In total we get 42 nhoods. A neighbourhood assignment can be visualised using Milo plots, in which each circle corresponds to a neighbourhood, and edges between them represent shared cells. The center of each neighbourhood are coordinated in 2D latent space (e.g. UMAP) for the center cell of the neighbourhood. We can also colour each neighbourhood by provided metric. In this plot, we will annotate each neighbourhood with its enriched cell type, and colour neighbourhoods by assigned cell types.
= nhoods(sce_milo)
nhoods_sce # assign cell types for nhoods
= data.frame(Nhood = 1:ncol(nhoods_sce) , Nhood_center = colnames(nhoods_sce))
nhood_stat_ct = miloR::annotateNhoods(sce_milo , nhood_stat_ct , coldata_col = "celltype.mapped")
nhood_stat_ct #> Converting celltype.mapped to factor...
= plot_milo_by_single_metric(sce_milo, nhood_stat_ct, colour_by = "celltype.mapped" ,
p layout = "UMAP" , size_range = c(1.5,3) , edge_width = c(0.2,0.5)) +
scale_fill_manual(values = cols_ct , name = "Cell type")
#> Warning in plot_milo_by_single_metric(sce_milo, nhood_stat_ct, colour_by =
#> "celltype.mapped", : Coercing layout to matrix format
#> Scale for fill is already present.
#> Adding another scale for fill, which will replace the existing scale.
p
Neighbourhood plot, coloured by cell types
Prior to DE testing, an optional step is to assess which
neighbourhoods do not show any signs of perturbation and discard them
prior to DE testing in order to facilitate the burden from multiple
testing correction (by using calc_AUC_per_neighbourhood
).
To do so we build on Augur that employs
RF classifiers to separate cells between the conditions. As an output of
calc_AUC_per_neighbourhood
, we return data frame containing
AUC of the per neighbourhood classifiers. We suggest that AUC <= 0.5
corresponds to neighbourhoods that can be safely discarded from DE
testing (however, you may use your own threshold if desired). In
addition, if classifier can not be built due to very low number of cells
in at least one of the conditions, AUC will be set to NaN.
Note that this part is rather computationally costly, and we recommend it to run if a substantial transcriptional regions are anticipated to be unperturbed.
= suppressWarnings(calc_AUC_per_neighbourhood(sce_milo , sample_id = "sample" , condition_id = "tomato", min_n_cells_per_sample = 1, BPPARAM = mcparam))
stat_auc
= plot_milo_by_single_metric(sce_milo, stat_auc, colour_by = "auc" ,
p layout = "UMAP" , size_range = c(1.5,3) , edge_width = c(0.2,0.5)) +
scale_fill_viridis(name = "AUC")
#> Warning in plot_milo_by_single_metric(sce_milo, stat_auc, colour_by = "auc", :
#> Coercing layout to matrix format
#> Scale for fill is already present.
#> Adding another scale for fill, which will replace the existing scale.
p
Neighbourhood plot, coloured by AUC
We observe that AUCs in the neighbourhoods containing mostly Blood progenitors is NaN which is consistent with absense of these cells in Tal1- cells. In addition, we observe higher AUCs in endothelial subregions, likely reflecting that these cells are more affected by Tal1 KO.
Let’s proceed with DE testing within each neighbourhood. We will test neighbourhoods in which AUC is not NaN (i.e. neighbourhoods in which there are enough cells from both conditions).
= de_test_neighbourhoods(sce_milo ,
de_stat sample_id = "sample",
design = ~tomato,
covariates = c("tomato"),
subset_nhoods = stat_auc$Nhood[!is.na(stat_auc$auc)],
output_type = "SCE",
plot_summary_stat = TRUE,
layout = "UMAP", BPPARAM = mcparam ,
verbose = T)
#> Starting DE testing within each neighbourhood:
#> Finished DE testing within each neighbourhood.
#> Performing p-value correction across neighbourhoods..
#> DE testing is done.
#> 39 neighbourhoods (out of 39) were tested.
#> Scale for fill is already present.
#> Adding another scale for fill, which will replace the existing scale.
Neighbourhood plot, coloured by whether DE testing is performed
One explanatory question a user might have is an overall scan of
which transcriptional regions show noteworthy signs of DE. To do so ona
neighbourhood level, we provide the function
rank_neighbourhoods_by_DE_magnitude
. Within this function,
we calculate two metrics:
n_DE_genes
- for each neighbourhood, we calculate
how many genes are assigned as DE. Since we are doing it within each
neighbourhood, we use pval_corrected_across_genes
and we
use default pval.thresh=0.1
(can be changed). Note that
there is no comparison between neighbourhoods here.
n_specific_DE_genes
. We also might be interested
which neighbourhoods differ from others more so than we would expect. To
assess this, we are interested in which neighbourhoods contain genes,
that are DE ‘specifically’ in those neighbourhoods. To calculate this,
for each gene we now use z-transformation of
pval_corrected_across_nhoods
, and we identify the
neighbourhoods in which z-normalised p-values are lower than a threshold
(default z.thresh=-3
). This would tell us that the gene is
signifciantly DE in the neighbourhood compared to most other
neighbourhoods. We do so for each gene, and then for each
neighbourhood we calculate how mane genes have z-normalised p-values are
lower than a threshold.
Note that for gene/neighbourhood combinations for which p-values are returned as NaNs (e.g. genes are not tested), for this function we set pvalues = 1. In other words, if a gene is only tested in few neighbourhoods to begin with, z-normalised p-value corrected across neighbourhoods is likely to be small for these neighbourhoods.
= rank_neighbourhoods_by_DE_magnitude(de_stat)
stat_de_magnitude
= plot_milo_by_single_metric(sce_milo, stat_de_magnitude, colour_by = "n_DE_genes" ,
p1 layout = "UMAP" , size_range = c(1.5,3) , edge_width = c(0.2,0.5)) +
scale_fill_viridis(name = "# DE genes")
#> Warning in plot_milo_by_single_metric(sce_milo, stat_de_magnitude, colour_by =
#> "n_DE_genes", : Coercing layout to matrix format
#> Scale for fill is already present.
#> Adding another scale for fill, which will replace the existing scale.
= plot_milo_by_single_metric(sce_milo, stat_de_magnitude, colour_by = "n_specific_DE_genes" ,
p2 layout = "UMAP" , size_range = c(1.5,3) , edge_width = c(0.2,0.5)) +
scale_fill_viridis(name = "# specific\nDE genes" , option = "inferno")
#> Warning in plot_milo_by_single_metric(sce_milo, stat_de_magnitude, colour_by =
#> "n_specific_DE_genes", : Coercing layout to matrix format
#> Scale for fill is already present.
#> Adding another scale for fill, which will replace the existing scale.
= ggarrange(p1,p2)
p p
Neighbourhood plot, coloured by # of DE genes
Reassuringly, we observe that same regions show both higher AUC and number of DE expressed genes.
The fine neighbourhood resolution allows to cluster genes based on logFC vectors (across the neighbourhoods). One way is to employ WGCNA approach, originally designed to finding co-expressed genes. Instead of gene counts we will use corrected logFC (logFC set to 0 in the neighbourhoods in which it is not DE).
Below we outline a custom script to detect gene modules using WGCNA approach. We will use scWGCNA, specifically designed to handle single-cell data.
Note that WGCNA algorithm is exclusive and sensitive to input parameters - nevertheless, we suggest it is useful to explore what DE patterns exist in your data.
= function(de_stat , subset_hoods = NULL ,
get_wgcna_modules n_hoods_sig.thresh = 2 ,
npcs = 5 ,
pval.thresh = 0.1 ){
require(scWGCNA)
require(Seurat)
require(dplyr)
require(reshape2)
set.seed(32)
# subset hoods
if (!is.null(subset_hoods)){
= de_stat[de_stat$Nhood %in% subset_hoods , ]
de_stat
}
# focus on genes that DE in at least 2 nhoods
= as.data.frame(de_stat %>% group_by(gene) %>% dplyr::summarise(n_hoods_sig = sum(pval_corrected_across_nhoods < pval.thresh , na.rm = TRUE)))
de_stat_per_gene = de_stat_per_gene$gene[de_stat_per_gene$n_hoods_sig >= n_hoods_sig.thresh]
genes_sig
= de_stat[de_stat$gene %in% genes_sig, ]
de_stat = de_stat[order(de_stat$Nhood) , ]
de_stat
# discard neighbourhoods in which testing was not performed
= de_stat[de_stat$test_performed , ]
de_stat
# for this analysis, set logFC to 0 and pvals to 1 if they are NaN
$logFC[is.na(de_stat$logFC)] = 0
de_stat$pval[is.na(de_stat$pval)] = 1
de_stat$pval_corrected_across_genes[is.na(de_stat$pval_corrected_across_genes)] = 1
de_stat$pval_corrected_across_nhoods[is.na(de_stat$pval_corrected_across_nhoods)] = 1
de_stat
# set logFC to 0 if pval_corrected_across_nhoods > pval.thresh
$logFC[de_stat$pval_corrected_across_nhoods >= pval.thresh] = 0
de_stat
# move the object to Seurat
= reshape2::dcast(data = de_stat, formula = gene~Nhood, value.var = "logFC")
de_stat rownames(de_stat) = de_stat$gene
= de_stat[,2:ncol(de_stat)]
de_stat
<- CreateSeuratObject(counts = de_stat)
obj.seurat DefaultAssay(obj.seurat) <- "RNA"
= FindVariableFeatures(obj.seurat)
obj.seurat # scale
"RNA"]]@scale.data = as.matrix(obj.seurat[["RNA"]]@data)
obj.seurat[[= RunPCA(obj.seurat , npcs = npcs)
obj.seurat
# run scwgcna
= run.scWGCNA(p.cells = obj.seurat,
clusters_scwgcna s.cells = obj.seurat,
is.pseudocell = F,
features = rownames(obj.seurat),
less = TRUE , merging = TRUE)
# compile stat
= lapply(1:length(clusters_scwgcna$module.genes) , function(i){
clusters = data.frame(cluster = i , gene = clusters_scwgcna$module.genes[[i]] , n_genes = length(clusters_scwgcna$module.genes[[i]]))
out return(out)
})= do.call(rbind , clusters)
clusters # add colors
= clusters_scwgcna$dynamicCols
genes_w_colors = data.frame(gene = names(genes_w_colors) , cluster_color = genes_w_colors)
genes_w_colors = merge(clusters , genes_w_colors)
clusters
return(clusters)
}
# for this vignette, for simplicity we will focus on genes that are DE in at least 4 neighbourhoods
= suppressMessages(get_wgcna_modules(convert_de_stat(de_stat) , n_hoods_sig.thresh = 4))
modules_wgcna #> Warning in eval(predvars, data, env): NaNs produced
#> [1] "We have 1070 genes in the variable genes object"
#> Warning: executing %dopar% sequentially: no parallel backend registered
#> Warning in (function (x, y = NULL, robustX = TRUE, robustY = TRUE, use =
#> "all.obs", : bicor: zero MAD in variable 'x'. Pearson correlation was used for
#> individual columns with zero (or missing) MAD.
#> Warning in (function (x, y = NULL, robustX = TRUE, robustY = TRUE, use =
#> "all.obs", : bicor: zero MAD in variable 'y'. Pearson correlation was used for
#> individual columns with zero (or missing) MAD.
#> Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
#> 1 1 0.0342 3.73 0.871 536.000 536.000 559.00
#> 2 2 0.3410 -7.76 0.884 293.000 290.000 342.00
#> 3 3 0.4240 -4.68 0.925 171.000 169.000 233.00
#> 4 4 0.4770 -3.19 0.946 106.000 103.000 169.00
#> 5 5 0.5460 -2.53 0.958 68.300 65.100 128.00
#> 6 6 0.5870 -2.33 0.934 46.000 42.900 100.00
#> 7 7 0.5990 -2.12 0.918 32.000 28.900 79.90
#> 8 8 0.6370 -1.95 0.915 23.000 19.900 64.80
#> 9 9 0.6290 -1.94 0.894 16.900 14.100 53.40
#> 10 10 0.6540 -1.84 0.887 12.800 10.200 44.50
#> 11 12 0.7950 -1.56 0.946 7.670 5.600 31.80
#> 12 14 0.9080 -1.46 0.997 4.920 3.230 24.20
#> 13 16 0.9170 -1.51 0.988 3.320 1.930 19.80
#> 14 18 0.9290 -1.56 0.993 2.330 1.190 16.60
#> 15 20 0.9350 -1.59 0.986 1.700 0.780 14.20
#> 16 22 0.8950 -1.64 0.927 1.270 0.518 12.30
#> 17 24 0.8810 -1.66 0.912 0.970 0.345 10.80
#> 18 26 0.8930 -1.65 0.910 0.758 0.236 9.60
#> 19 28 0.8970 -1.63 0.901 0.603 0.167 8.61
#> 20 30 0.9280 -1.62 0.944 0.488 0.119 7.78
#> [1] "Or power is 14"
#> Warning in (function (x, y = NULL, robustX = TRUE, robustY = TRUE, use =
#> "all.obs", : bicor: zero MAD in variable 'x'. Pearson correlation was used for
#> individual columns with zero (or missing) MAD.
#> TOM calculation: adjacency..
#> ..will not use multithreading.
#> Fraction of slow calculations: 0.000000
#> ..connectivity..
#> ..matrix multiplication (system BLAS)..
#> ..normalization..
#> ..done.
#> [1] "my.height: 0.9968 .... my.Clsize: 15"
#> 84 genes not assigned to any module.
#> 160 genes excluded due to significance.
#> Warning in (function (x, y = NULL, robustX = TRUE, robustY = TRUE, use =
#> "all.obs", : bicor: zero MAD in variable 'x'. Pearson correlation was used for
#> individual columns with zero (or missing) MAD.
#> TOM calculation: adjacency..
#> ..will not use multithreading.
#> Fraction of slow calculations: 0.000000
#> ..connectivity..
#> ..matrix multiplication (system BLAS)..
#> ..normalization..
#> ..done.
#> [1] "my.height: 0.9964 .... my.Clsize: 15"
#> 27 genes not assigned to any module.
#> 54 genes excluded due to significance.
#>
#> IMPORTANT NOTE!!!
#> You have run this analysis witht the option less=TRUE. If a module seems to be highly expressed in only 1-3 cells ( cells expressing >2*(max(expression)/3) ), it will be removed.
#> Warning in min(x, na.rm = T): no non-missing arguments to min; returning Inf
#>
#> Info: Some clusters will be merged
#> mergeCloseModules: Merging modules whose distance is less than 0.25
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 21 module eigengenes in given set.
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 15 module eigengenes in given set.
#> Calculating new MEs...
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 15 module eigengenes in given set.
#> The following modules were only highly expressed in less than 3 cells : darkred, pink, tan
#> They will all be removed
#>
#> This is the result of the filtering.
#> The color of the modules does not correspond between original and filtered
#> TOM calculation: adjacency..
#> ..will not use multithreading.
#> Fraction of slow calculations: 0.000000
#> ..connectivity..
#> ..matrix multiplication (system BLAS)..
#> ..normalization..
#> ..done.
#> Warning in (function (x, y = NULL, robustX = TRUE, robustY = TRUE, use =
#> "all.obs", : bicor: zero MAD in variable 'x'. Pearson correlation was used for
#> individual columns with zero (or missing) MAD.
#> TOM calculation: adjacency..
#> ..will not use multithreading.
#> Fraction of slow calculations: 0.000000
#> ..connectivity..
#> ..matrix multiplication (system BLAS)..
#> ..normalization..
#> ..done.
#> [1] "my.height: 0.9964 .... my.Clsize: 20"
#>
#>
#> IMPORTANT NOTE!!!
#> You have run this analysis witht the option merging=TRUE. This means that based on their expression pattern, modules with similar expression profile (distance < 0.25) be merged during the iterations.
#> mergeCloseModules: Merging modules whose distance is less than 0.25
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 13 module eigengenes in given set.
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 9 module eigengenes in given set.
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 8 module eigengenes in given set.
#> Calculating new MEs...
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 8 module eigengenes in given set.
#> 71 genes not assigned to any module.
#> 32 genes excluded due to significance.
#> Warning in (function (x, y = NULL, robustX = TRUE, robustY = TRUE, use =
#> "all.obs", : bicor: zero MAD in variable 'x'. Pearson correlation was used for
#> individual columns with zero (or missing) MAD.
#> TOM calculation: adjacency..
#> ..will not use multithreading.
#> Fraction of slow calculations: 0.000000
#> ..connectivity..
#> ..matrix multiplication (system BLAS)..
#> ..normalization..
#> ..done.
#> [1] "my.height: 0.9964 .... my.Clsize: 20"
#> mergeCloseModules: Merging modules whose distance is less than 0.25
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 13 module eigengenes in given set.
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 8 module eigengenes in given set.
#> Calculating new MEs...
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 8 module eigengenes in given set.
#> 34 genes not assigned to any module.
#> 18 genes excluded due to significance.
#> Warning in (function (x, y = NULL, robustX = TRUE, robustY = TRUE, use =
#> "all.obs", : bicor: zero MAD in variable 'x'. Pearson correlation was used for
#> individual columns with zero (or missing) MAD.
#> TOM calculation: adjacency..
#> ..will not use multithreading.
#> Fraction of slow calculations: 0.000000
#> ..connectivity..
#> ..matrix multiplication (system BLAS)..
#> ..normalization..
#> ..done.
#> [1] "my.height: 0.9964 .... my.Clsize: 16"
#> mergeCloseModules: Merging modules whose distance is less than 0.25
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 13 module eigengenes in given set.
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 7 module eigengenes in given set.
#> Calculating new MEs...
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 7 module eigengenes in given set.
#> 0 genes not assigned to any module.
#> 12 genes excluded due to significance.
#> Warning in (function (x, y = NULL, robustX = TRUE, robustY = TRUE, use =
#> "all.obs", : bicor: zero MAD in variable 'x'. Pearson correlation was used for
#> individual columns with zero (or missing) MAD.
#> TOM calculation: adjacency..
#> ..will not use multithreading.
#> Fraction of slow calculations: 0.000000
#> ..connectivity..
#> ..matrix multiplication (system BLAS)..
#> ..normalization..
#> ..done.
#> [1] "my.height: 0.9968 .... my.Clsize: 30"
#> mergeCloseModules: Merging modules whose distance is less than 0.25
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 7 module eigengenes in given set.
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 4 module eigengenes in given set.
#> Calculating new MEs...
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 4 module eigengenes in given set.
#> 0 genes not assigned to any module.
#> 31 genes excluded due to significance.
#> Warning in (function (x, y = NULL, robustX = TRUE, robustY = TRUE, use =
#> "all.obs", : bicor: zero MAD in variable 'x'. Pearson correlation was used for
#> individual columns with zero (or missing) MAD.
#> TOM calculation: adjacency..
#> ..will not use multithreading.
#> Fraction of slow calculations: 0.000000
#> ..connectivity..
#> ..matrix multiplication (system BLAS)..
#> ..normalization..
#> ..done.
#> [1] "my.height: 0.9961 .... my.Clsize: 30"
#> mergeCloseModules: Merging modules whose distance is less than 0.25
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 7 module eigengenes in given set.
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 4 module eigengenes in given set.
#> Calculating new MEs...
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 4 module eigengenes in given set.
#> 0 genes not assigned to any module.
#> 2 genes excluded due to significance.
#> Warning in (function (x, y = NULL, robustX = TRUE, robustY = TRUE, use =
#> "all.obs", : bicor: zero MAD in variable 'x'. Pearson correlation was used for
#> individual columns with zero (or missing) MAD.
#> TOM calculation: adjacency..
#> ..will not use multithreading.
#> Fraction of slow calculations: 0.000000
#> ..connectivity..
#> ..matrix multiplication (system BLAS)..
#> ..normalization..
#> ..done.
#> [1] "my.height: 0.9962 .... my.Clsize: 30"
#> mergeCloseModules: Merging modules whose distance is less than 0.25
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 7 module eigengenes in given set.
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 4 module eigengenes in given set.
#> Calculating new MEs...
#> multiSetMEs: Calculating module MEs.
#> Working on set 1 ...
#> moduleEigengenes: Calculating 4 module eigengenes in given set.
#> 0 genes not assigned to any module.
#> 0 genes excluded due to significance.
We can visualise for which transcriptional regions different gene modules are relevant.
For each gene module, we can plot neighbourhood plot, in which neighbourhood colour corresponds to average logFC (across genes from the module) and neighbourhood size corresponds to fraction of genes from the module, that are DE in this neighbourhood.
= lapply(sort(unique(modules_wgcna$cluster)) , function(cluster){
plots = plot_DE_gene_set(sce_milo, de_stat , genes = modules_wgcna$gene[modules_wgcna$cluster == cluster],
p layout = "UMAP" , size_range = c(0.5,3) ,
node_stroke = 0.3, edge_width = c(0.2,0.5)) +
ggtitle(paste0("Module ", cluster, ", ", length(modules_wgcna$gene[modules_wgcna$cluster == cluster]) , " genes"))
return(p)
})#> Warning in plot_milo_by_single_metric(x, nhood_stat, colour_by = "avg_logFC", :
#> Coercing layout to matrix format
#> Warning in plot_milo_by_single_metric(x, nhood_stat, colour_by = "avg_logFC", :
#> Coercing layout to matrix format
#> Warning in plot_milo_by_single_metric(x, nhood_stat, colour_by = "avg_logFC", :
#> Coercing layout to matrix format
#> Warning in plot_milo_by_single_metric(x, nhood_stat, colour_by = "avg_logFC", :
#> Coercing layout to matrix format
= ggarrange(plotlist = plots)
p p
Co-reulated modules
We can also quantify the relevance of each module for different cell types.
# we first need to assign cell type label to each neighbourhood - we have calculated it previously
# we will use data.frame format for de-stat; the conversion between SingleCellExperiment and data.frame formats can be done using `convert_de_stat`
= convert_de_stat(de_stat)
de_stat_df #> Converting de_stat to 'data.frame' format
= merge(de_stat_df , nhood_stat_ct , by = c("Nhood" , "Nhood_center"))
de_stat_df
= lapply(sort(unique(modules_wgcna$cluster)) , function(cluster){
plots = plot_beeswarm_gene_set(de_stat_df,
p genes = modules_wgcna$gene[modules_wgcna$cluster == cluster],
nhoodGroup = "celltype.mapped") +
ggtitle(paste0("Module ", cluster))
return(p)
})#> Converting de_stat to 'SingleCellExperiment' format
#> Converting de_stat to 'SingleCellExperiment' format
#> Converting de_stat to 'SingleCellExperiment' format
#> Converting de_stat to 'SingleCellExperiment' format
= ggarrange(plotlist = plots)
p p
Co-regulated modules, break down by cell types.
We can also visualise DE results for individual genes, in which each neighbourhood is coloured by its logFC if significant.
Below we show couple examples of DE patterns of genes from modules 2 and 4.
set.seed(1020)
= 2
module = 6
n_genes = sample(modules_wgcna$gene[modules_wgcna$cluster == module] , n_genes)
genes
= lapply(genes , function(gene){
plots = plot_DE_single_gene(sce_milo, de_stat , gene = gene , layout = "UMAP" , set_na_to_0 = TRUE) +
p ggtitle(gene)
return(p)
})#> Converting de_stat to 'data.frame' format
#> Warning in plot_milo_by_single_metric(x, nhood_stat = nhood_stat, colour_by =
#> "logFC", : Coercing layout to matrix format
#> Converting de_stat to 'data.frame' format
#> Warning in plot_milo_by_single_metric(x, nhood_stat = nhood_stat, colour_by =
#> "logFC", : Coercing layout to matrix format
#> Converting de_stat to 'data.frame' format
#> Warning in plot_milo_by_single_metric(x, nhood_stat = nhood_stat, colour_by =
#> "logFC", : Coercing layout to matrix format
#> Converting de_stat to 'data.frame' format
#> Warning in plot_milo_by_single_metric(x, nhood_stat = nhood_stat, colour_by =
#> "logFC", : Coercing layout to matrix format
#> Converting de_stat to 'data.frame' format
#> Warning in plot_milo_by_single_metric(x, nhood_stat = nhood_stat, colour_by =
#> "logFC", : Coercing layout to matrix format
#> Converting de_stat to 'data.frame' format
#> Warning in plot_milo_by_single_metric(x, nhood_stat = nhood_stat, colour_by =
#> "logFC", : Coercing layout to matrix format
= ggarrange(plotlist = plots , ncol = 2, nrow = 3)
p p
Genes from module 2.
set.seed(1020)
= 4
module = 6
n_genes = sample(modules_wgcna$gene[modules_wgcna$cluster == module] , n_genes)
genes
= lapply(genes , function(gene){
plots = plot_DE_single_gene(sce_milo, de_stat , gene = gene , layout = "UMAP" , set_na_to_0 = TRUE) +
p ggtitle(gene)
return(p)
})#> Converting de_stat to 'data.frame' format
#> Warning in plot_milo_by_single_metric(x, nhood_stat = nhood_stat, colour_by =
#> "logFC", : Coercing layout to matrix format
#> Converting de_stat to 'data.frame' format
#> Warning in plot_milo_by_single_metric(x, nhood_stat = nhood_stat, colour_by =
#> "logFC", : Coercing layout to matrix format
#> Converting de_stat to 'data.frame' format
#> Warning in plot_milo_by_single_metric(x, nhood_stat = nhood_stat, colour_by =
#> "logFC", : Coercing layout to matrix format
#> Converting de_stat to 'data.frame' format
#> Warning in plot_milo_by_single_metric(x, nhood_stat = nhood_stat, colour_by =
#> "logFC", : Coercing layout to matrix format
#> Converting de_stat to 'data.frame' format
#> Warning in plot_milo_by_single_metric(x, nhood_stat = nhood_stat, colour_by =
#> "logFC", : Coercing layout to matrix format
#> Converting de_stat to 'data.frame' format
#> Warning in plot_milo_by_single_metric(x, nhood_stat = nhood_stat, colour_by =
#> "logFC", : Coercing layout to matrix format
= ggarrange(plotlist = plots, ncol=2, nrow=3)
p p
Genes from module 4.
sessionInfo()
#> R version 4.2.3 (2023-03-15)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 22.04.2 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] BiocParallel_1.32.6 ggpubr_0.6.0
#> [3] viridis_0.6.2 viridisLite_0.4.1
#> [5] ggplot2_3.4.2 SeuratObject_4.1.3
#> [7] Seurat_4.3.0 scWGCNA_1.0.0
#> [9] reshape2_1.4.4 dplyr_1.1.1
#> [11] scran_1.26.2 uwot_0.1.14
#> [13] Matrix_1.5-4 miloR_1.6.0
#> [15] edgeR_3.40.2 limma_3.54.2
#> [17] scuttle_1.8.4 MouseGastrulationData_1.12.0
#> [19] SpatialExperiment_1.8.1 SingleCellExperiment_1.20.1
#> [21] SummarizedExperiment_1.28.0 Biobase_2.58.0
#> [23] GenomicRanges_1.50.2 GenomeInfoDb_1.34.9
#> [25] IRanges_2.32.0 S4Vectors_0.36.2
#> [27] BiocGenerics_0.44.0 MatrixGenerics_1.13.0
#> [29] matrixStats_0.63.0 miloDE_0.0.0.9000
#> [31] knitr_1.42
#>
#> loaded via a namespace (and not attached):
#> [1] rappdirs_0.3.3 scattermore_0.8
#> [3] R.methodsS3_1.8.2 tidyr_1.3.0
#> [5] bit64_4.0.5 irlba_2.3.5.1
#> [7] DelayedArray_0.24.0 R.utils_2.12.2
#> [9] data.table_1.14.8 rpart_4.1.19
#> [11] KEGGREST_1.38.0 hardhat_1.3.0
#> [13] RCurl_1.98-1.12 doParallel_1.0.17
#> [15] generics_0.1.3 preprocessCore_1.60.2
#> [17] ScaledMatrix_1.6.0 cowplot_1.1.1
#> [19] RSQLite_2.3.1 RANN_2.6.1
#> [21] future_1.32.0 bit_4.0.5
#> [23] spatstat.data_3.0-1 lubridate_1.9.2
#> [25] httpuv_1.6.9 gower_1.0.1
#> [27] xfun_0.38 jquerylib_0.1.4
#> [29] evaluate_0.20 promises_1.2.0.1
#> [31] fansi_1.0.4 dbplyr_2.3.2
#> [33] htmlwidgets_1.6.2 igraph_1.4.2
#> [35] DBI_1.1.3 spatstat.geom_3.1-0
#> [37] purrr_1.0.1 ellipsis_0.3.2
#> [39] tester_0.1.7 backports_1.4.1
#> [41] deldir_1.0-6 sparseMatrixStats_1.11.1
#> [43] vctrs_0.6.2 ROCR_1.0-11
#> [45] abind_1.4-5 cachem_1.0.7
#> [47] withr_2.5.0 ggforce_0.4.1
#> [49] progressr_0.13.0 checkmate_2.1.0
#> [51] sctransform_0.3.5 goftest_1.2-3
#> [53] parsnip_1.1.0 cluster_2.1.4
#> [55] ExperimentHub_2.6.0 lazyeval_0.2.2
#> [57] crayon_1.5.2 spatstat.explore_3.1-0
#> [59] labeling_0.4.2 recipes_1.0.5
#> [61] pkgconfig_2.0.3 tweenr_2.0.2
#> [63] nlme_3.1-162 vipor_0.4.5
#> [65] nnet_7.3-18 pals_1.7
#> [67] rlang_1.1.0 globals_0.16.2
#> [69] miniUI_0.1.1.1 lifecycle_1.0.3
#> [71] filelock_1.0.2 BiocFileCache_2.6.1
#> [73] rsvd_1.0.5 AnnotationHub_3.6.0
#> [75] dichromat_2.0-0.1 polyclip_1.10-4
#> [77] lmtest_0.9-40 yardstick_1.1.0
#> [79] carData_3.0-5 Rhdf5lib_1.20.0
#> [81] zoo_1.8-12 base64enc_0.1-3
#> [83] beeswarm_0.4.0 ggridges_0.5.4
#> [85] png_0.1-8 rjson_0.2.21
#> [87] bitops_1.0-7 R.oo_1.25.0
#> [89] KernSmooth_2.23-20 rhdf5filters_1.10.1
#> [91] Biostrings_2.66.0 blob_1.2.4
#> [93] DelayedMatrixStats_1.20.0 stringr_1.5.0
#> [95] RcppGreedySetCover_0.1.0 spatstat.random_3.1-4
#> [97] parallelly_1.35.0 rstatix_0.7.2
#> [99] ggsignif_0.6.4 beachmat_2.14.2
#> [101] scales_1.2.1 ica_1.0-3
#> [103] memoise_2.0.1 magrittr_2.0.3
#> [105] plyr_1.8.8 zlibbioc_1.44.0
#> [107] compiler_4.2.3 dqrng_0.3.0
#> [109] RColorBrewer_1.1-3 fitdistrplus_1.1-8
#> [111] cli_3.6.1 XVector_0.38.0
#> [113] listenv_0.9.0 pbapply_1.7-0
#> [115] patchwork_1.1.2 htmlTable_2.4.1
#> [117] Formula_1.2-5 MASS_7.3-58.3
#> [119] WGCNA_1.72-1 tidyselect_1.2.0
#> [121] stringi_1.7.12 highr_0.10
#> [123] yaml_2.3.7 BiocSingular_1.14.0
#> [125] locfit_1.5-9.7 ggrepel_0.9.3
#> [127] pbmcapply_1.5.1 grid_4.2.3
#> [129] sass_0.4.5 tools_4.2.3
#> [131] timechange_0.2.0 future.apply_1.10.0
#> [133] parallel_4.2.3 rstudioapi_0.14
#> [135] foreign_0.8-84 bluster_1.8.0
#> [137] foreach_1.5.2 metapod_1.6.0
#> [139] gridExtra_2.3 prodlim_2023.03.31
#> [141] Rtsne_0.16 farver_2.1.1
#> [143] ggraph_2.1.0 DropletUtils_1.18.1
#> [145] digest_0.6.31 BiocManager_1.30.20
#> [147] shiny_1.7.4 lava_1.7.2.1
#> [149] Rcpp_1.0.10 car_3.1-2
#> [151] broom_1.0.4 BiocVersion_3.16.0
#> [153] later_1.3.0 RcppAnnoy_0.0.20
#> [155] httr_1.4.5 AnnotationDbi_1.60.2
#> [157] colorspace_2.1-0 tensor_1.5
#> [159] reticulate_1.28 splines_4.2.3
#> [161] statmod_1.5.0 spatstat.utils_3.0-2
#> [163] graphlayouts_0.8.4 sp_1.6-0
#> [165] mapproj_1.2.11 plotly_4.10.1
#> [167] xtable_1.8-4 jsonlite_1.8.4
#> [169] dynamicTreeCut_1.63-1 tidygraph_1.2.3
#> [171] timeDate_4022.108 ipred_0.9-14
#> [173] R6_2.5.1 Hmisc_5.0-1
#> [175] pillar_1.9.0 htmltools_0.5.5
#> [177] mime_0.12 glue_1.6.2
#> [179] fastmap_1.1.1 BiocNeighbors_1.16.0
#> [181] class_7.3-21 interactiveDisplayBase_1.36.0
#> [183] codetools_0.2-19 maps_3.4.1
#> [185] furrr_0.3.1 utf8_1.2.3
#> [187] spatstat.sparse_3.0-1 lattice_0.21-8
#> [189] bslib_0.4.2 tibble_3.2.1
#> [191] Augur_1.0.3 curl_5.0.0
#> [193] ggbeeswarm_0.7.1 leiden_0.4.3
#> [195] gtools_3.9.4 magick_2.7.4
#> [197] GO.db_3.16.0 survival_3.5-5
#> [199] rmarkdown_2.21 munsell_0.5.0
#> [201] BumpyMatrix_1.6.0 fastcluster_1.2.3
#> [203] rhdf5_2.42.1 GenomeInfoDbData_1.2.9
#> [205] rsample_1.1.1 iterators_1.0.14
#> [207] impute_1.72.3 HDF5Array_1.26.0
#> [209] gtable_0.3.3