In this R Notebook we preprocess spatial and corresponding reference scRNA-seq data of mouse olfactory bulb (MOB) for cell type deconvolution.
Spatial data preprocessing:
1.1 Input original data files
1.2 Output data files for cell type deconvolution
x and y
coordinates from spot names, followed by rounding to nearest
integers.Reference scRNA-seq data preprocessing:
2.1 Input original data files
scRNA-seq data are downloaded from GSE121891.
Raw nUMI of all 52,549 single cells: GSE121891_OB_6_runs.raw.dge.csv.gz
Meta data for 21,746 Neurons cells: GSE121891_Figure_2_metadata.txt.gz
2.2 Output data files for cell type deconvolution
Raw nUMI of 12,801 cells with selected 5 cell types and 18,560 genes: MOB_ref_scRNA_cell_nUMI.csv.gz. NO filtering on cells or genes, i.e. all genes and cells with those 5 cell types are included for analysis.
Cell type annotation for those 12,801 cells: MOB_ref_scRNA_cell_celltype.csv
version[['version.string']][1] "R version 4.2.2 Patched (2022-11-10 r83330)"
file_name = file.path(home.dir, 'Rep12_MOB_count_matrix-1.tsv')
org_data = read.csv(file_name, sep = '\t', check.names = F, header = T, row.names = 1)
print(sprintf('load data from %s', file_name))[1] "load data from /home/hill103/Documents/SharedFolder/ToHost/CVAE-GLRM_Analysis/RealData/MOB/Rep12_MOB_count_matrix-1.tsv"
print(sprintf('spots: %d; genes: %d', nrow(org_data), ncol(org_data)))[1] "spots: 282; genes: 16034"
org_data[1:5, 1:5]No filtering on spots or genes, directly save all spots and genes into file MOB_spatial_spot_nUMI.csv. Rows as spatial spots and columns as genes.
write.csv(org_data, 'MOB_spatial_spot_nUMI.csv')Registered S3 method overwritten by 'data.table':
method from
print.data.table
print(sprintf('save %d gene nUMIs of %d spatial spots into file %s', ncol(org_data), nrow(org_data), 'MOB_spatial_spot_nUMI.csv'))[1] "save 16034 gene nUMIs of 282 spatial spots into file MOB_spatial_spot_nUMI.csv"
Directly extract the spatial x and y
coordinates from spot names, followed by rounding to nearest
integers, then saved into file MOB_spatial_spot_loc.csv.
local_df = data.frame(names = row.names(org_data), row.names = row.names(org_data))
local_df = local_df %>%
tidyr::separate_wider_delim(names, 'x', names = c('x', 'y'))
local_df = as.data.frame(local_df)
row.names(local_df) = row.names(org_data)
local_df['x'] = round(as.numeric(local_df$x))
local_df['y'] = round(as.numeric(local_df$y))
local_df[1:5, ]
write.csv(local_df, 'MOB_spatial_spot_loc.csv')
print(sprintf('save Physical Locations of spatial spots into file %s', 'MOB_spatial_spot_loc.csv'))[1] "save Physical Locations of spatial spots into file MOB_spatial_spot_loc.csv"
We define the neighborhood of a spatial spot contains the adjacent left, right, top and bottom spot, that is, one spot has at most 4 neighbors.
The generated Adjacency Matrix A only contains
1 and 0, where 1 represents
corresponding two spots are adjacent spots according to the definition
of neighborhood, while value 0 for non-adjacent spots. Note all
diagonal entries are 0s.
Adjacency Matrix are saved into file MOB_spatial_spot_adjacency_matrix.csv.
getNeighbour = function(array_row, array_col) {
# based on the (row, col) of one spot, return the (row, col) of all 4 neighbours
return(list(c(array_row-1, array_col),
c(array_row+1, array_col),
c(array_row+0, array_col-1),
c(array_row+0, array_col+1)))
}
# adjacency matrix
A = matrix(0, nrow = nrow(local_df), ncol = nrow(local_df))
row.names(A) = rownames(local_df)
colnames(A) = rownames(local_df)
for (i in 1:nrow(local_df)) {
barcode = rownames(local_df)[i]
array_row = local_df[i, 'y']
array_col = local_df[i, 'x']
# get neighbors
neighbours = getNeighbour(array_row, array_col)
# fill the adjacency matrix
for (this.vec in neighbours) {
tmp.p = rownames(local_df[local_df$y==this.vec[1] & local_df$x==this.vec[2], ])
if (length(tmp.p) >= 1) {
# target spots have neighbors in selected spots
for (neigh.barcode in tmp.p) {
A[barcode, neigh.barcode] = 1
}
}
}
}
A[1:5, 1:5] 16.918x16.996 18.017x17.034 20.075x17.059 18.979x17.065 21.937x16.967
16.918x16.996 0 1 0 0 0
18.017x17.034 1 0 0 1 0
20.075x17.059 0 0 0 1 0
18.979x17.065 0 1 1 0 0
21.937x16.967 0 0 0 0 0
write.csv(A, 'MOB_spatial_spot_adjacency_matrix.csv')
print(sprintf('save Adjacency Matrix of spatial spots into file %s', 'MOB_spatial_spot_adjacency_matrix.csv'))[1] "save Adjacency Matrix of spatial spots into file MOB_spatial_spot_adjacency_matrix.csv"
Plot Adjacency Matrix. Each node is spot, spots within neighborhood are connected with edges.
g = graph_from_adjacency_matrix(A, 'undirected', add.colnames = NA, add.rownames = NA)
# manually set nodes x and y coordinates
vertex_attr(g, name = 'x') = local_df$x
vertex_attr(g, name = 'y') = local_df$y
plot(g, vertex.size=5, edge.width=4, margin=-0.05)Original meta data file is GSE121891_Figure_2_metadata.txt.gz
downloaded from GSE121891.
It contains meta data of 21,746 Neurons cells from
mouse olfactory bulb. The cell type annotation is stored in column
FinalIds, which includes total 18 distinct annotations. We
selected 5 cell types and combine the subtypes as below:
3 Subtypes “n04-Immature”, “n06-Transition” and “n13-AstrocyteLike” are discarded.
NO further filtering on cells, i.e. all 12,801 cells of these 5 selected cell types will be used for cell type deconvolution.
file_name = file.path(home.dir, 'GSE121891_Figure_2_metadata.txt.gz')
ref_meta = read.csv(gzfile(file_name), sep='\t', check.names = F, header = T, row.names = 1)
print(sprintf('load data from %s', file_name))[1] "load data from /home/hill103/Documents/SharedFolder/ToHost/CVAE-GLRM_Analysis/RealData/MOB/GSE121891_Figure_2_metadata.txt.gz"
print(sprintf('total %d cells with distinct %d cell type annotations', nrow(ref_meta), length(unique(ref_meta$FinalIds))))[1] "total 21746 cells with distinct 18 cell type annotations"
# remove unwanted 3 subtypes
ref_meta = ref_meta[ref_meta$FinalIds %notin% c("n04-Immature", "n06-Transition", "n13-AstrocyteLike"), ]
print(sprintf('remove 3 cell subtype annotations, remain %d cells', nrow(ref_meta)))[1] "remove 3 cell subtype annotations, remain 12801 cells"
# combine subtypes
ref_meta$celltype = ""
ref_meta[ref_meta$FinalIds %in% c("n03-GC-1", "n07-GC-2", "n09-GC-3", "n10-GC-4", "n11-GC-5", "n12-GC-6", "n14-GC-7"), "celltype"] = "GC"
ref_meta[ref_meta$FinalIds == "n01-OSNs", "celltype"] = "OSNs"
ref_meta[ref_meta$FinalIds %in% c("n02-PGC-1", "n05-PGC-2", "n08-PGC-3"), "celltype"] = "PGC"
ref_meta[ref_meta$FinalIds %in% c("n15-M/TC-1", "n16-M/TC-2", "n17-M/TC-3"), "celltype"] = "M/TC"
ref_meta[ref_meta$FinalIds == "n18-EPL-IN", "celltype"] = "EPL-IN"
table(ref_meta$celltype)
EPL-IN GC M/TC OSNs PGC
161 8614 1133 1200 1693
ref_meta[1:5, c('FinalIds', 'celltype')]Save cell type annotation to file MOB_ref_scRNA_cell_celltype.csv
write.csv(ref_meta[, 'celltype', drop=F], 'MOB_ref_scRNA_cell_celltype.csv')
print(sprintf('save cell type annotation of reference scRNA-seq cells into file %s', 'MOB_ref_scRNA_cell_celltype.csv'))[1] "save cell type annotation of reference scRNA-seq cells into file MOB_ref_scRNA_cell_celltype.csv"
Original gene nUMI count data file is GSE121891_OB_6_runs.raw.dge.csv.gz downloaded from GSE121891. It contains total 52,549 cells and 18,560 genes.
We just selected 12,801 cells of the 5 selected cell types by barcodes, and discard other cells. NO filtering on genes, i.e. all 18,560 genes will be used for cell type deconvolution.
file_name = file.path(home.dir, 'GSE121891_OB_6_runs.raw.dge.csv.gz')
ref_data = data.table::fread(file_name, sep = ",", check.names = FALSE, select = c('V1', row.names(ref_meta)))
gene_names = ref_data$V1
# transpose it
ref_data = as.data.frame(data.table::transpose(ref_data %>%
select(row.names(ref_meta))))
row.names(ref_data) = row.names(ref_meta)
colnames(ref_data) = gene_names
print(sprintf('load data from %s', file_name))[1] "load data from /home/hill103/Documents/SharedFolder/ToHost/CVAE-GLRM_Analysis/RealData/MOB/GSE121891_OB_6_runs.raw.dge.csv.gz"
print(sprintf('cells: %d; genes: %d', nrow(ref_data), ncol(ref_data)))[1] "cells: 12801; genes: 18560"
ref_data[1:5, 1:5]Save scRNA-seq nUMI matrix to file MOB_ref_scRNA_cell_nUMI.csv.gz
data.table::fwrite(ref_data, 'MOB_ref_scRNA_cell_nUMI.csv.gz', row.names = T)
print(sprintf('save nUMI matrix of reference scRNA-seq cells into gzip compressed file %s', 'MOB_ref_scRNA_cell_nUMI.csv.gz'))[1] "save nUMI matrix of reference scRNA-seq cells into gzip compressed file MOB_ref_scRNA_cell_nUMI.csv.gz"