In this R Notebook we preprocess spatial and corresponding reference scRNA-seq data of human Melanoma for cell type deconvolution.
Spatial data preprocessing:
1.1 Input original data files
ST_mel1_rep2_counts.tsv is
from zipped file ST-Melanoma-Datasets_1.zip
downloaded from Spatial
Research website, and second replicate from biopsy
1 is selected for analysis.1.2 Output data files for cell type deconvolution
x and y coordinates are directly
extracted from spot names.Reference scRNA-seq data preprocessing:
2.1 Input original data files
scRNA-seq data are downloaded from GSE115978.
Raw nUMI of all 7,186 single cells: GSE115978_counts.csv.gz
Meta data of all 7,186 cells: GSE115978_cell.annotations.csv.gz
2.2 Output data files for cell type deconvolution
We select 2,495 cells from 8 samples, and re-annotate cells with unknown cell type “?”. NO filtering on genes, i.e. all genes are included for analysis.
Raw nUMI of 2,495 cells with 7 cell types and 23,686 genes: Melanoma_ref_scRNA_cell_nUMI.csv.gz.
Cell type annotation for those 2,495 cells: Melanoma_ref_scRNA_cell_celltype.csv.
version[['version.string']][1] "R version 4.2.2 Patched (2022-11-10 r83330)"
ST_mel1_rep2_counts.tsvfile_name = file.path(home.dir, 'ST_mel1_rep2_counts.tsv')
org_data = read.csv(file_name, sep = '\t', check.names = F, header = T)
print(sprintf('load data from %s', file_name))[1] "load data from /home/hill103/Documents/SharedFolder/ToHost/CVAE-GLRM_Analysis/RealData/Melanoma/ST_mel1_rep2_counts.tsv"
spot_names = colnames(org_data)[2:ncol(org_data)]
# extract first column as row name
org_data = org_data %>%
tidyr::separate_wider_delim(gene, ' ', names = c('gene_name', NA), cols_remove = F)
org_data$duplicated = duplicated(org_data$gene_name)
duplicated_genes = org_data[org_data$duplicated, 'gene_name', drop=T]
org_data$gene_unique = make.unique(org_data$gene_name)
org_data$row = seq_len(nrow(org_data))
for_show = org_data[org_data$gene_name %in% duplicated_genes, c('row', 'gene', 'gene_unique')]
for_show[order(for_show$gene_unique), ]gene_order = org_data$gene_unique
org_data = org_data[, spot_names]
org_data = as.data.frame(data.table::transpose(org_data))Registered S3 method overwritten by 'data.table':
method from
print.data.table
row.names(org_data) = spot_names
colnames(org_data) = gene_order
print(sprintf('spots: %d; genes: %d', nrow(org_data), ncol(org_data)))[1] "spots: 293; genes: 16148"
org_data[1:5, 1:5]No filtering on spots or genes, directly save all spots and genes into file Melanoma_spatial_spot_nUMI.csv. Rows as spatial spots and columns as genes.
write.csv(org_data, 'Melanoma_spatial_spot_nUMI.csv')
print(sprintf('save %d gene nUMIs of %d spatial spots into file %s', ncol(org_data), nrow(org_data), 'Melanoma_spatial_spot_nUMI.csv'))[1] "save 16148 gene nUMIs of 293 spatial spots into file Melanoma_spatial_spot_nUMI.csv"
Directly extract the spatial x and y
coordinates from spot names, then save it into file Melanoma_spatial_spot_loc.csv.
local_df = data.frame(names = row.names(org_data), row.names = row.names(org_data))
local_df = local_df %>%
tidyr::separate_wider_delim(names, 'x', names = c('y', 'x'))
local_df = as.data.frame(local_df)
row.names(local_df) = row.names(org_data)
local_df['x'] = as.numeric(local_df$x)
local_df['y'] = as.numeric(local_df$y)
local_df[1:5, ]
write.csv(local_df, 'Melanoma_spatial_spot_loc.csv')
print(sprintf('save Physical Locations of spatial spots into file %s', 'Melanoma_spatial_spot_loc.csv'))[1] "save Physical Locations of spatial spots into file Melanoma_spatial_spot_loc.csv"
We define the neighborhood of a spatial spot contains the adjacent left, right, top and bottom spot, that is, one spot has at most 4 neighbors.
The generated Adjacency Matrix A only contains
1 and 0, where 1 represents
corresponding two spots are adjacent spots according to the definition
of neighborhood, while value 0 for non-adjacent spots. Note all
diagonal entries are 0s.
Adjacency Matrix are saved into file Melanoma_spatial_spot_adjacency_matrix.csv.
getNeighbour = function(array_row, array_col) {
# based on the (row, col) of one spot, return the (row, col) of all 4 neighbours
return(list(c(array_row-1, array_col),
c(array_row+1, array_col),
c(array_row+0, array_col-1),
c(array_row+0, array_col+1)))
}
# adjacency matrix
A = matrix(0, nrow = nrow(local_df), ncol = nrow(local_df))
row.names(A) = rownames(local_df)
colnames(A) = rownames(local_df)
for (i in 1:nrow(local_df)) {
barcode = rownames(local_df)[i]
array_row = local_df[i, 'y']
array_col = local_df[i, 'x']
# get neighbors
neighbours = getNeighbour(array_row, array_col)
# fill the adjacency matrix
for (this.vec in neighbours) {
tmp.p = rownames(local_df[local_df$y==this.vec[1] & local_df$x==this.vec[2], ])
if (length(tmp.p) >= 1) {
# target spots have neighbors in selected spots
for (neigh.barcode in tmp.p) {
A[barcode, neigh.barcode] = 1
}
}
}
}
A[1:5, 1:5] 7x15 7x16 7x17 7x18 8x13
7x15 0 1 0 0 0
7x16 1 0 1 0 0
7x17 0 1 0 1 0
7x18 0 0 1 0 0
8x13 0 0 0 0 0
write.csv(A, 'Melanoma_spatial_spot_adjacency_matrix.csv')
print(sprintf('save Adjacency Matrix of spatial spots into file %s', 'Melanoma_spatial_spot_adjacency_matrix.csv'))[1] "save Adjacency Matrix of spatial spots into file Melanoma_spatial_spot_adjacency_matrix.csv"
Plot Adjacency Matrix. Each node is spot, spots within neighborhood are connected with edges.
g = graph_from_adjacency_matrix(A, 'undirected', add.colnames = NA, add.rownames = NA)
# manually set nodes x and y coordinates
vertex_attr(g, name = 'x') = local_df$x
vertex_attr(g, name = 'y') = local_df$y
plot(g, vertex.size=5, edge.width=4, margin=-0.05)Original meta data file is GSE115978_cell.annotations.csv.gz downloaded from GSE115978.
It contains meta data of 7,186 cells from human melanoma tumors. Based on Table S1A. Clinical characteristics of the patients and samples in the scRNA-seq cohort, We select samples for cell type deconvolution by following criteria:
8 samples (Mel79, Mel80,
Mel81, Mel82, Mel89,
Mel103, Mel116, Mel128) with
2,495 cells are selected.
The cell type annotation is stored in column cell.types,
which includes total 10 distinct annotations.
We selected 7 cell types as below:
We re-analysis the gene expression of selected 2,495 cells, cluster all cells into 10 clusters using TOP 5 PCs, and re-label the cells with unclear cell type “?” in each cluster as the dominate cell type of the cluster.
The refined cell type annotation of selected 2,495 cells is provided in Melanoma_ref_scRNA_cell_celltype.csv.
file_name = file.path(home.dir, 'Melanoma_ref_scRNA_cell_celltype.csv')
ref_meta = read.csv(file_name, sep=',', check.names = F, header = T, row.names = 1)
print(sprintf('load data from %s', file_name))[1] "load data from /home/hill103/Documents/SharedFolder/ToHost/CVAE-GLRM_Analysis/RealData/Melanoma/Melanoma_ref_scRNA_cell_celltype.csv"
print(sprintf('total %d cells with distinct %d cell type annotations', nrow(ref_meta), length(unique(ref_meta$new_celltype))))[1] "total 2495 cells with distinct 7 cell type annotations"
table(ref_meta$new_celltype)
B.cell CAF Endo. Macrophage Mal NK T.cell
364 48 34 62 1030 15 942
ref_meta[1:5, 'new_celltype', drop=F]Original gene nUMI count data file is GSE115978_counts.csv.gz downloaded from GSE115978. It contains total 7,186 cells and 23,686 genes.
We just selected 2,495 cells of the selected 7 cell types by barcodes, and discard other cells. NO filtering on genes, i.e. all 23,686 genes will be used for cell type deconvolution.
file_name = file.path(home.dir, 'GSE115978_counts.csv.gz')
ref_data = data.table::fread(file_name, sep = ",", check.names = FALSE)
gene_names = ref_data$V1
cell_names = colnames(ref_data)[2:ncol(ref_data)]
# transpose it
ref_data = as.data.frame(data.table::transpose(ref_data %>%
select(cell_names)))
row.names(ref_data) = cell_names
colnames(ref_data) = gene_names
print(sprintf('load data from %s', file_name))[1] "load data from /home/hill103/Documents/SharedFolder/ToHost/CVAE-GLRM_Analysis/RealData/Melanoma/GSE115978_counts.csv.gz"
print(sprintf('total cells: %d; genes: %d', nrow(ref_data), ncol(ref_data)))[1] "total cells: 7186; genes: 23686"
Select cells and save scRNA-seq nUMI matrix to file Melanoma_ref_scRNA_cell_nUMI.csv.gz
ref_data = ref_data[row.names(ref_meta), ]
ref_data[1:5, 1:5]
data.table::fwrite(ref_data, 'Melanoma_ref_scRNA_cell_nUMI.csv.gz', row.names = T)
print(sprintf('save nUMI matrix of reference scRNA-seq cells into gzip compressed file %s', 'Melanoma_ref_scRNA_cell_nUMI.csv.gz'))[1] "save nUMI matrix of reference scRNA-seq cells into gzip compressed file Melanoma_ref_scRNA_cell_nUMI.csv.gz"