library(Matrix)
library(Seurat)
library(dplyr)
library(patchwork)
library(ggplot2)
library(reshape2)
library(sctransform)
library(DoubletFinder)
## load data and create a Seurat object
lung.data<-Read10X(data.dir="/home/jinggu/cluster/data/single-cell/Wang2020/scRNA-seq")
lung <- CreateSeuratObject(counts = lung.data, project = "sc_lung")
lung
An object of class Seurat 26578 features across 46500 samples within 1 assay Active assay: RNA (26578 features, 0 variable features)
options(repr.plot.width=12, repr.plot.height=8)
lung[["percent.mt"]]<-PercentageFeatureSet(lung, pattern="^MT-")
VlnPlot(lung, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
pt.size=0, ncol = 3)
plot1 <- FeatureScatter(lung, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(lung, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
Notes:
#Replicate results in Wang2020
lung <- subset(lung, subset = nFeature_RNA > 500 & nFeature_RNA < 4000 & percent.mt<=0.3)
lung
An object of class Seurat 26578 features across 10223 samples within 1 assay Active assay: RNA (26578 features, 3000 variable features) 2 dimensional reductions calculated: pca, umap
#SCTransform
#lung<-PercentageFeatureSet(lung, "^MT-", col.name="percent.mt")
#lung <- SCTransform(lung, method = "glmGamPoi", vars.to.regress = "percent.mt", verbose = FALSE)
#log-normalized and scaled by a factor of 10,000
lung <- NormalizeData(lung, normalization.method = "LogNormalize", scale.factor = 10000)
#find variable genes
lung <- FindVariableFeatures(lung, selection.method = "vst", nfeatures = 3000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(lung), 10)
# plot variable features with and without labels
options(repr.plot.width=15, repr.plot.height=8)
plot1 <- VariableFeaturePlot(lung)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = FALSE)
plot1+plot2
Warning message: “Transformation introduced infinite values in continuous x-axis” Warning message: “Transformation introduced infinite values in continuous x-axis”
all.genes <- rownames(lung)
lung <- ScaleData(lung, features = all.genes)
Centering and scaling data matrix
lung<-readRDS("~/cluster/projects/lung_Wang2020/lung_scRNA/lung_Wang2020.rds")
lung
An object of class Seurat 26578 features across 46500 samples within 1 assay Active assay: RNA (26578 features, 3000 variable features) 2 dimensional reductions calculated: pca, umap
lung <- RunPCA(lung, features = VariableFeatures(object = lung))
print(lung[["pca"]], dims = 1:5, nfeatures = 5)
VizDimLoadings(lung, dims = 1:2, reduction = "pca")
DimPlot(lung, reduction="pca")
PC_ 1 Positive: PIEZO2, PDZRN3, CACNA1C, LAMA2, MEIS1, LAMA4, CDH11, SLIT3, COL6A3, COL5A2 CACNB2, ROBO2, BNC2, CCBE1, RYR2, ADAMTS12, CALD1, COL1A2, LSAMP, NR2F1-AS1 ADGRB3, COL5A1, ARHGAP6, ELMO1, BACH2, ITGA8, SCN7A, GALNT17, FN1, PRDM6 Negative: CPM, LMO7, ATP13A4, PHACTR1, MAGI3, VEPH1, ZNF385B, LRRK2, MACROD2, ERBB4 ITGB6, DAPK2, GLCCI1, LMO3, GPC5, SNX25, SLC34A2, P3H2, AC112206.2, TOX ABCA3, PEBP4, SLC39A8, LAMA3, CD55, AC019117.1, TACC2, AC044810.3, TMEM163, ELF3 PC_ 2 Positive: MAGI3, NEBL, PRKG1, ROR1, LMO7, EPB41L5, RORA, PTPRD, NFIB, ATP13A4 GPC5, DLG2, DAPK2, LAMA3, EMP2, GLCCI1, PTPN13, KIAA1217, AC112206.2, AC044810.3 SYNE1, VEPH1, SLIT2, FGFR2, PLEKHH2, PLS3, CASC15, ABLIM1, MAPK10, MACROD2 Negative: DOCK2, SLC11A1, GPRIN3, ALOX5, TBXAS1, PIK3R5, MRC1, PTPRC, TFEC, MSR1 PPARG, KYNU, SNX10, ABCG1, DOCK8, STAC, SLCO2B1, NCKAP1L, CD163, MYO1F CCDC88A, CLEC7A, SLC8A1, HCK, PIK3AP1, WDFY4, MARCH1, FGD2, CSF2RA, OLR1 PC_ 3 Positive: DNAH12, SPAG17, CFAP157, ADGB, CFAP54, ZBBX, CFAP299, ARMC3, DNAAF1, CFAP47 LRRIQ1, CFAP46, CFAP43, ERICH3, DNAI1, SERPINI2, RSPH1, TEKT1, TTC29, DNAH9 SAXO2, LMNTD1, VWA3B, DNAH3, DCDC1, NEK10, WDR49, DNAH6, CFAP52, CFAP73 Negative: ZNF385B, LRRK2, DLG2, TMEM163, ABCA3, TTN, SFTPC, P3H2, PID1, CCDC141 PHACTR1, PEBP4, WIF1, SLC34A2, LAMP3, BMP1, AGBL1, SNX25, CPM, ITGB6 CD36, LANCL1-AS1, HHIP, KIAA1324L, CTSH, AC046195.1, MFSD2A, NECAB1, ROR1, ETV5 PC_ 4 Positive: LRRK2, ZNF385B, PTPN13, AFF3, TTN, DLG2, SFTPC, TMEM163, ERBB4, LAMP3 ABCA3, CTSH, CCDC141, PTGFR, SFTPA1, MFSD2A, HHIP, KIAA1324L, SCN1A, TNIK PARM1, PID1, SNX30, AGBL1, BMP1, LANCL1-AS1, NTN4, ARHGEF38, TMEM164, FMO5 Negative: RTKN2, NCKAP5, AC027288.3, CLIC5, AC245041.2, ST6GALNAC5, ARHGEF26, CNTN6, AC002066.1, CAV1 MAP2, PHLDB2, COL4A3, ANKRD29, SCEL, EMP2, ANXA3, COL4A4, COL12A1, ANOS1 RAB11FIP1, CCDC85A, KHDRBS2, LAMA3, GPRC5A, FAM189A2, DENND3, LINC01290, TTLL7, NYAP2 PC_ 5 Positive: HBA2, TIMP1, CRISPLD2, TNC, COL4A2, COL4A1, HBA1, RASA3, MEG8, THBS2 PVT1, THBS1, MTHFD1L, NHS, MT2A, IL1R1, LDLRAD4, PITPNC1, FYN, PAPPA TGFBI, SFMBT2, MEG3, CHSY1, SAMD4A, CASC15, IGFBP7, PLCB1, COL1A1, BX322234.1 Negative: SCN7A, ANGPT1, NEBL, CACNA1D, ITGA8, CCBE1, MYLK, ADH1B, DOCK4, RSPO2 PLXDC2, MACF1, TNIK, AL139383.1, AC092691.1, TRHDE, ABCA6, ARHGAP6, LSAMP, ELMO1 IL16, PIK3R1, GRIK4, BMP5, NKD1, FBLN5, DST, NUAK1, PRKG1, GRIA1
PC_ 1 Positive: PIEZO2, PDZRN3, CACNA1C, LAMA2, MEIS1 Negative: CPM, LMO7, ATP13A4, PHACTR1, MAGI3 PC_ 2 Positive: MAGI3, NEBL, PRKG1, ROR1, LMO7 Negative: DOCK2, SLC11A1, GPRIN3, ALOX5, TBXAS1 PC_ 3 Positive: DNAH12, SPAG17, CFAP157, ADGB, CFAP54 Negative: ZNF385B, LRRK2, DLG2, TMEM163, ABCA3 PC_ 4 Positive: LRRK2, ZNF385B, PTPN13, AFF3, TTN Negative: RTKN2, NCKAP5, AC027288.3, CLIC5, AC245041.2 PC_ 5 Positive: HBA2, TIMP1, CRISPLD2, TNC, COL4A2 Negative: SCN7A, ANGPT1, NEBL, CACNA1D, ITGA8
DimHeatmap(lung, dims = 1:15, cells = 500, balanced = TRUE)
ElbowPlot(lung, ndims = 50)
lung<-FindNeighbors(lung, dims=1:30)
lung<-FindClusters(lung, resolution=0.8)
lung<-RunUMAP(lung, dims=1:30)
lung$
DimPlot(lung, reduction="umap") + ggtitle("UMAP plot colored by sample IDs")
lung@active.ident<-lung$orig.ident
DimPlot(lung, reduction="umap") + ggtitle("UMAP plot colored by clusters")
## pK Identification (no ground-truth)
sweep.res.list_lung <- paramSweep_v3(lung, PCs = 1:50, sct = FALSE)
sweep.stats_lung <- summarizeSweep(sweep.res.list_lung, GT = FALSE)
bcmvn_lung <- find.pK(sweep.stats_lung)
head(lung$seurat_clusters)
annotations<-lung@meta.data$seurat_clusters
homotypic.prop <- modelHomotypic(annotations) ## ex: annotations <- seu_kidney@meta.data$ClusteringResults
nExp_poi <- round(0.1*nrow(lung@meta.data)) ## Assuming 7.5% doublet formation rate - tailor for your dataset
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))
## Run DoubletFinder with varying classification stringencies ----------------------------------------------------------------
lung <- doubletFinder_v3(lung, PCs = 1:50, pN = 0.15, pK = 0.005, nExp = nExp_poi, reuse.pANN = FALSE, sct = FALSE)
lung <- doubletFinder_v3(lung, PCs = 1:50, pN = 0.15, pK = 0.005, nExp = nExp_poi.adj, reuse.pANN = "pANN_0.15_0.005_4650", sct = FALSE)
sum(lung$DF.classifications_0.15_0.005_4480 == 'Doublet')
head(lung@meta.data)
orig.ident | nCount_RNA | nFeature_RNA | RNA_snn_res.0.5 | seurat_clusters | percent.mt | RNA_snn_res.1.2 | RNA_snn_res.0.8 | |
---|---|---|---|---|---|---|---|---|
<fct> | <dbl> | <int> | <fct> | <fct> | <dbl> | <fct> | <fct> | |
D032_TAACACGCAGGCATGA | D032 | 1114 | 851 | 1 | 19 | 0.08976661 | 19 | 19 |
D046_AGTGACTCAGAGATGC | D046 | 1125 | 853 | 1 | 11 | 0.08888889 | 19 | 11 |
D046_ATCGGCGGTACAGTCT | D046 | 3763 | 1788 | 1 | 19 | 0.18602179 | 21 | 19 |
D046_TCTATCAGTCATATGC | D046 | 2638 | 1622 | 1 | 19 | 0.07581501 | 19 | 19 |
D062_ACCACAATCTCCGCAT | D062 | 7339 | 3025 | 1 | 19 | 0.17713585 | 19 | 19 |
D062_AGGATAAGTACGTACT | D062 | 2077 | 1396 | 1 | 19 | 0.24073182 | 19 | 19 |
meta.info<-read.table("Wang2020/scRNA-seq/GSE161382_metadata.txt.gz", header = T)
meta.info<-meta.info[rownames(meta.info) %in% rownames(lung@meta.data),]
library(harmony)
library(magrittr)
pc<-lung@reductions$pca@cell.embeddings
harmonized <- HarmonyMatrix(pc, meta.info, vars_use=c("donor", "batch", "sex"), do_pca=FALSE)
harmonized <- data.frame(harmonized)
lung@reductions$pca@cell.embeddings<-as.matrix(harmonized)
head(lung@reductions$pca@cell.embeddings)
PC_1 | PC_2 | PC_3 | PC_4 | PC_5 | PC_6 | PC_7 | PC_8 | PC_9 | PC_10 | ⋯ | PC_41 | PC_42 | PC_43 | PC_44 | PC_45 | PC_46 | PC_47 | PC_48 | PC_49 | PC_50 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
D032_TAACACGCAGGCATGA | -0.3985820 | -20.85848 | -0.5240309 | -0.2439483 | -0.5422445 | 1.2595482 | -0.7032837 | 1.0844445 | 3.55206051 | -6.0492437 | ⋯ | -1.48486074 | 0.8908909 | 0.8934875 | -0.8904164 | 0.8289513 | -0.2346926 | -0.547690860 | 0.4405086 | 0.49811969 | -0.3260594 |
D046_AGTGACTCAGAGATGC | -1.2321803 | -13.33187 | -0.9988345 | -0.8102001 | 1.0425367 | 1.9197886 | -0.5353741 | 0.4976184 | 1.28765749 | 1.8857875 | ⋯ | -0.05940036 | -0.2286925 | -0.5732119 | 1.5012035 | -0.3322233 | 1.6437078 | 2.289830251 | 0.3170592 | -1.25525313 | -0.4953004 |
D046_ATCGGCGGTACAGTCT | 1.3195731 | -25.66298 | -1.6189908 | -0.7586676 | 2.0589709 | 1.8836052 | -1.3036565 | 0.3490058 | 0.59964696 | -3.0121535 | ⋯ | -0.01335006 | 0.1423684 | 4.7768354 | 0.1357333 | 1.2063323 | -2.8727584 | -2.292282851 | -0.9298083 | 4.17931361 | -1.3615013 |
D046_TCTATCAGTCATATGC | 0.5904185 | -27.22417 | -0.1631747 | -2.1585962 | 0.3150094 | 1.7617123 | -0.5007738 | -2.6033632 | 1.53346417 | -0.6391126 | ⋯ | 2.84548073 | 2.5159239 | -2.1065811 | -0.8151320 | -1.7546148 | 0.0313849 | 0.321297509 | -0.8295865 | 2.52309071 | 1.2149441 |
D062_ACCACAATCTCCGCAT | 1.6423649 | -35.82405 | 1.6365077 | -3.9821016 | -4.9717792 | -1.4236247 | 3.5897601 | -5.6472829 | 0.08844246 | 4.0540465 | ⋯ | 1.10237707 | 2.0238321 | -1.1486768 | 0.5971051 | 1.3507216 | 1.6533622 | 0.935727793 | -0.1090692 | -1.76254395 | 0.9318414 |
D062_AGGATAAGTACGTACT | 1.0832026 | -25.79946 | 0.6108807 | -3.4398976 | -2.5684447 | 0.7385468 | 2.1705286 | -1.2798463 | 0.83453173 | 1.8157443 | ⋯ | 0.87987734 | 0.9792191 | 1.1019914 | -1.4024157 | -0.2645823 | 0.3537502 | -0.001623401 | 0.5388432 | 0.02924582 | -2.1465729 |
lung@active.ident<-lung$orig.ident
DimPlot(lung, reduction="pca")
lung<-FindNeighbors(lung, dims=1:50)
lung<-FindClusters(lung, resolution=0.8)
lung<-RunUMAP(lung, dims=1:50)
DimPlot(lung, reduction="umap", label = T, label.size = 5)
saveRDS(lung, file="lung_snATAC/seurat_scRNA/lung_Wang2020.rds")
#reload R object
lung<-readRDS(file="~/cluster/projects/lung_others/lung_scRNA/lung_Wang2020.rds")
markers<-c("AGER","SFTPA2","NDNF","KCNJ15","GDF15","SCGB3A2","TP63","MUC5B",
"MYB","RIMS2","COL2A1","LAMC3","NTRK3","MYOCD","LTBP2","TCF21","MFAP5",
"BMX","ACKR1","KIT","CA4","ABCB1","RELN","TREM1","PLTP","FCN1","FLT3",
"EPB42","TCF7","PRF1","PAX5","MS4A2")
options(repr.plot.width=12, repr.plot.height=10)
DotPlot(object=lung, features=markers) + theme(axis.text.x = element_text(angle=45, vjust = 0.5))
lung<-RenameIdents(lung, new.cluster.ids)
DimPlot(lung, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
#Find all markers for each cluster by performing pair-wise differential gene expression
#lung.markers <- FindAllMarkers(lung, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.5, densify=TRUE)
lung.markers<-read.table("~/cluster/projects/lung_others/lung_scRNA/lung_clusters_DEG.txt", header=T)
new.cluster.ids<-c("0"="AT2 cells","1"="Alv. macro","2"="Matrix fib.1","3"="AT1 cells",
"4"="T cells", "5"="Matrix fib.2", "6"="Club cells", "7"="Monocytes",
"8"="Cap cells", "9"="Myofib",
"10"="Inter. macro.", "11"="B cells", "12"="NK cells",
"13"="Cap cells","14"="Lymphatics",
"15"="Pericytes", "16"="Ciliated cells",
"17"= "Erythrocytes","18"= "Erythrocytes",
"19"="Basal cells", "20"="Mast cells",
"21"="PNECs")
#write.table(top10, "~/cluster/projects/lung_others/lung_scRNA/lung_clusters_DEG.txt", quote = F, row.names = T, sep='\t')
saveRDS(lung, "~/cluster/projects//lung_others//lung_scRNA//lung_Wang2020.rds")
intersect(lung.markers[lung.markers$cluster==20, "gene"][1:50], markers)
head(lung.markers[lung.markers$cluster==13,], 10)
p_val | avg_log2FC | pct.1 | pct.2 | p_val_adj | cluster | gene | |
---|---|---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <int> | <chr> | |
EMCN1 | 0 | 4.286502 | 0.793 | 0.041 | 0 | 13 | EMCN |
ADGRL22 | 0 | 4.077742 | 0.934 | 0.221 | 0 | 13 | ADGRL2 |
HPGD1 | 0 | 3.737363 | 0.756 | 0.079 | 0 | 13 | HPGD |
GALNT182 | 0 | 3.520607 | 0.917 | 0.192 | 0 | 13 | GALNT18 |
STXBP61 | 0 | 3.355972 | 0.796 | 0.114 | 0 | 13 | STXBP6 |
EGFL71 | 0 | 3.279031 | 0.883 | 0.085 | 0 | 13 | EGFL7 |
HECW21 | 0 | 3.270260 | 0.738 | 0.085 | 0 | 13 | HECW2 |
PRICKLE23 | 0 | 3.215023 | 0.869 | 0.282 | 0 | 13 | PRICKLE2 |
NCALD3 | 0 | 3.176994 | 0.803 | 0.138 | 0 | 13 | NCALD |
NRG32 | 0 | 3.023161 | 0.555 | 0.082 | 0 | 13 | NRG3 |
png("/home/jinggu/cluster/projects/lung_others/lung_scRNA/lung_scRNA_heatmap.png")
DoHeatmap(lung, features = top10$gene) + NoLegend()
dev.off()
nclusters<-length(unique(lung@meta.data$seurat_clusters))
cluster.markers<-c("0","T cells","AT2","Monocytes","AT2","Club cells",
"AT1", "Alv.macro.", "8", "AT1/AT2-like", "Alv.macro.",
"AT1/AT2-like", "Matrix fib.2", "Matrix fib.1","AT1/AT2-like", "Inter.macro.",
"Vasc. sm.", "B cells", "Vasc. sm.","NK cells","Matrix fib.2",
"Matrix fib.1", "Lymphatics", "Cap", "Ciliated cells", "Airway Sm.",
"26", "27", "Pericytes", "Erythro.", "Chondro.", "PNECs")
length(cluster.markers)
lung@meta.data["cluster.ident"]<-unlist(lapply(lung@meta.data$seurat_clusters, function(i){cluster.markers[as.numeric(i)+1]}))
names(cluster.markers)<-levels(lung@meta.data$seurat_clusters)
lung<-RenameIdents(lung, cluster.markers)
cluster.markers
options(repr.plot.width=15, repr.plot.height=10)
DimPlot(lung, reduction="umap", label=TRUE)
options(repr.plot.width=12, repr.plot.height=10)
tcell.markers=c('CD8A','CD4','CD74',
'CCR7', 'ITGAE','CD69',
'IL7R', 'IL33','CD200')
FeaturePlot(lung, features = tcell.markers)
lung.tcell<-readRDS("~/cluster/projects/lung_Wang2020/lung_scRNA/lung_Tcell_subclustering_Wang2020.rds")
lung.tcell
An object of class Seurat 26578 features across 3209 samples within 1 assay Active assay: RNA (26578 features, 3000 variable features) 2 dimensional reductions calculated: pca, umap
#lung.tcell<-subset(lung, subset=seurat_clusters=="4")
lung.tcell<-RunPCA(lung.tcell, features = VariableFeatures(lung.tcell)[1:2000])
ElbowPlot(lung.tcell, ndims = 50)
PC_ 1 Positive: ALCAM, CADPS2, COLEC12, DLG2, PID1, MYO1B, NEBL, PTPRD, ROR1, TCF4 CADM1, NRP1, MAGI3, KIAA1217, LHFPL6, EPS8, ELL2, MITF, SLC8A1, LRRK2 TACC2, EPAS1, GLIS3, MACROD2, MAPK10, TFPI, MAGI2, FBXL7, LMO7, PDE4D Negative: THEMIS, BCL11B, SKAP1, CAMK4, ITK, CD96, LINC01934, CD247, LEF1, IL7R RIPOR2, ETS1, ICOS, INPP4B, STAT4, TRAC, NELL2, CD28, IKZF3, CD69 SAMD3, BCL2, BTBD11, C15orf53, LTB, CCR7, TXK, GNG2, SELL, AC022075.1 PC_ 2 Positive: C15orf53, PPP2R2B, LINC01934, AC013652.1, CCL5, MSC-AS1, KLRB1, SKAP1, CD96, STAT4 INPP4B, ADAM19, PRDM1, TSPAN5, ADAM12, CLNK, RBPJ, IKZF3, BCL2, CD69 SAMSN1, TOX, GNAO1, SFTPC, THEMIS, MYBL1, IL2RA, IL12RB2, MIR155HG, TENT5C Negative: CCR7, SELL, LEF1, NELL2, TXK, BTBD11, LTB, BACH2, AL163932.1, PLAC8 PDE3B, TRAC, ITGA6, CD28, IL7R, RETREG1, KLF2, FAM19A1, PLCL1, CAMK4 KCNQ5, ATP10A, AC008014.1, ADAMTS6, CD55, EPHA4, RALGPS2, ETS1, FCGBP, PCSK5 PC_ 3 Positive: AC022075.1, CCL5, NELL2, IFNG-AS1, SAMD3, AOAH, PPP2R2B, LINC02446, CRTAM, CD96 PDGFD, NCALD, KLRC1, LEF1, VAV3, ABCB1, GNLY, KCNQ5, IKZF3, TXK CCL4, FCGBP, RIPOR2, ITGAD, PRKCB, SFTPC, AF165147.1, DTHD1, JAKMIP2, AC243829.2 Negative: ICOS, C15orf53, CD28, IL2RA, KLRB1, TNFAIP3, AC013652.1, BIRC3, RGS1, ADAM19 ADAM12, IL7R, CD69, LTB, BTBD11, CSGALNACT1, SAMSN1, PRDM1, DPP4, HPGDS IL18R1, INPP4B, CD200R1, PLCL1, ITK, FSTL4, MAF, SELL, PAG1, TMEM156 PC_ 4 Positive: TRAC, C15orf53, LTB, CCL5, AC013652.1, AC022075.1, TMSB4X, ACTB, NELL2, CLNK LINC02446, CCR7, CD52, KLRB1, CD200R1, SAMD3, HLA-C, AL163932.1, SELL, DDIT4 FTL, CD69, VIM, LINC01934, FCGBP, CRTAM, SFTPC, S100A4, TNIP3, KLRC1 Negative: CD28, ITK, ANK3, FKBP5, IL7R, MYBL1, FBXO32, BTBD11, BCL2, IFNG-AS1 PRDM1, PCAT1, BCL11B, CATSPERB, TXK, CD247, CAMK4, TMEM156, RIPOR2, ELOVL6 ETS1, INPP4B, AC105402.3, PLCB1, DTHD1, STAT4, ADAM19, IL2RA, DPP4, GPRIN3 PC_ 5 Positive: THEMIS, ICOS, C15orf53, LINC01934, CCR7, BCL11B, ITK, SELL, CD28, IKZF3 MSC-AS1, AC013652.1, TNIP3, SLC9A9, PPP2R2B, CCL5, AL163932.1, RBPJ, ITGA6, C12orf42 CAMK4, CSGALNACT1, LEF1, KIAA0825, PRKCB, TSHZ2, ANKRD55, PRDM1, NELL2, ANK3 Negative: KLRB1, MYBL1, CD69, IL7R, LTB, IL18R1, TNFAIP3, ADAM12, STAT4, IL2RA FSTL4, ABCB1, CLNK, DUSP2, KIT, DPP4, HPGDS, PLCB1, BIRC3, ATRNL1 RGS1, AC022075.1, NCALD, IFNG-AS1, IL12RB2, BTBD11, BLK, MED12L, TXK, DDIT4
lung.tcell <- FindNeighbors(lung.tcell, k.param = 40, dims = 1:30)
lung.tcell <- FindClusters(lung.tcell, resolution = 0.6)
lung.tcell<-RunUMAP(lung.tcell, dims=1:30)
Computing nearest neighbor graph Computing SNN
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck Number of nodes: 3209 Number of edges: 294449 Running Louvain algorithm... Maximum modularity in 10 random starts: 0.7022 Number of communities: 7 Elapsed time: 0 seconds
15:26:57 UMAP embedding parameters a = 0.9922 b = 1.112 15:26:57 Read 3209 rows and found 30 numeric columns 15:26:57 Using Annoy for neighbor search, n_neighbors = 30 15:26:57 Building Annoy index with metric = cosine, n_trees = 50 0% 10 20 30 40 50 60 70 80 90 100% [----|----|----|----|----|----|----|----|----|----| * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | 15:26:58 Writing NN index file to temp file /tmp/jobs/19456337/RtmpAaXWeH/file7f9053cc1934 15:26:58 Searching Annoy index using 1 thread, search_k = 3000 15:26:59 Annoy recall = 100% 15:26:59 Commencing smooth kNN distance calibration using 1 thread 15:27:00 Initializing from normalized Laplacian + noise 15:27:00 Commencing optimization for 500 epochs, with 131228 positive edges 15:27:08 Optimization finished
DimPlot(lung.tcell, reduction = "umap", label=TRUE, label.size = 8)
options(repr.plot.width=12, repr.plot.height=10)
## Marker genes for subtypes of T-cells
markerGenes <- c(
'CD8A','CD4','CD3E',
'ITGAE','CD69',
'ITGA1','ITGAL',
'SELL','CCR7'
)
FeaturePlot(lung.tcell, features=markerGenes, ncol=3)
Notes:
cluster.markers<-FindAllMarkers(lung.tcell, only.pos=TRUE)
write.table(cluster.markers %>% group_by(cluster) %>% slice_max(n=10, order_by=avg_log2FC),
"lung_snATAC/seurat_scRNA/t_cell_subclusters_DEG.txt", row.names=F, sep='\t', quote = F)
Calculating cluster 0 Calculating cluster 1 Calculating cluster 2 Calculating cluster 3 Calculating cluster 4 Calculating cluster 5 Calculating cluster 6
cluster.markers<-read.table("~/cluster/projects//lung_others//lung_scRNA//t_cell_subclusters_DEG.txt", header = T)
head(cluster.markers)
p_val | avg_log2FC | pct.1 | pct.2 | p_val_adj | cluster | gene | |
---|---|---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <int> | <chr> | |
1 | 4.569342e-33 | 1.1985345 | 0.309 | 0.144 | 1.214440e-28 | 0 | PLCB1 |
2 | 1.360273e-20 | 0.8543668 | 0.340 | 0.213 | 3.615332e-16 | 0 | ERN1 |
3 | 1.475631e-09 | 0.8471117 | 0.418 | 0.360 | 3.921933e-05 | 0 | SFTPB |
4 | 7.793805e-17 | 0.8159923 | 0.239 | 0.130 | 2.071437e-12 | 0 | ZBTB16 |
5 | 2.141279e-11 | 0.7956628 | 0.189 | 0.111 | 5.691090e-07 | 0 | MYBL1 |
6 | 1.765221e-10 | 0.6677428 | 0.116 | 0.056 | 4.691604e-06 | 0 | MACROD2 |
lung.tcell$
cluster1.markers<-FindMarkers(lung.tcell, ident.1 = c(0,1), ident.2 = seq(2,6), min.pct = 0.1)
cluster1.markers[which(rownames(cluster1.markers)=="CD69"),]
options(repr.plot.width=15, repr.plot.height=12)
cluster.markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(lung.tcell, features = top10$gene) + NoLegend()
options(repr.plot.width=12, repr.plot.height=10)
DotPlot(object=lung.tcell, features=markerGenes) + theme(axis.text.x = element_text(angle=45, vjust = 0.5)) + ggtitle("Expression of marker genes across all clusters")
DotPlot(object=subset(lung.tcell, subset=seurat_clusters!="6"), features=markerGenes) +
theme(axis.text.x = element_text(angle=45, vjust = 0.5)) +
ggtitle("Expression of marker genes among cluster 0-5")
VlnPlot(lung.tcell, features=markerGenes)
cell-type database: https://github.com/IanevskiAleksandr/sc-type/blob/master/ScTypeDB_full.xlsx
# load libraries and functions
lapply(c("dplyr","Seurat","HGNChelper"), library, character.only = T)
source("~/projects/funcFinemapping/code/sctype/gene_sets_prepare.R")
source("~/projects/funcFinemapping/code/sctype/sctype_score_.R")
# DB file
db_ = "~/projects/funcFinemapping/data/ScTypeDB_full.xlsx";
tissue = "Immune system" # e.g. Immune system, Liver, Pancreas, Kidney, Eye, Brain
# prepare gene sets
gs_list = gene_sets_prepare(db_, tissue)
pbmc<-lung.tcell
# get cell-type by cell matrix
es.max = sctype_score(scRNAseqData = pbmc[["RNA"]]@scale.data, scaled = TRUE,
gs = gs_list$gs_positive, gs2 = gs_list$gs_negative)
# merge by cluster
cL_results = do.call("rbind", lapply(unique(pbmc@meta.data$seurat_clusters), function(cl){
es.max.cl = sort(rowSums(es.max[ ,rownames(pbmc@meta.data[pbmc@meta.data$seurat_clusters==cl, ])]), decreasing = !0)
head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(pbmc@meta.data$seurat_clusters==cl)), 10)
}))
sctype_scores = cL_results %>% group_by(cluster) %>% top_n(n = 1, wt = scores)
# set low-confident (low ScType score) clusters to "unknown"
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] = "Unknown"
print(sctype_scores[,1:3])
options(repr.plot.width=14, repr.plot.height=10)
pbmc@meta.data$customclassif = ""
for(j in unique(sctype_scores$cluster)){
cl_type = sctype_scores[sctype_scores$cluster==j,];
pbmc@meta.data$customclassif[pbmc@meta.data$seurat_clusters == j] = as.character(cl_type$type[1])
}
DimPlot(pbmc, reduction = "umap", label = TRUE, repel = TRUE, group.by = 'customclassif')
# A tibble: 7 x 3 # Groups: cluster [7] cluster type scores <fct> <chr> <dbl> 1 2 Memory CD4+ T cells 1540. 2 0 Memory CD8+ T cells 2439. 3 3 CD8+ NKT-like cells 851. 4 1 Memory CD8+ T cells 3486. 5 4 Effector CD4+ T cells 982. 6 5 CD8+ NKT-like cells 694. 7 6 Mast cells 158.
sctype_scores = cL_results %>% group_by(cluster) %>% top_n(n = 3, wt = scores)
sctype_scores
cluster | type | scores | ncells |
---|---|---|---|
<fct> | <chr> | <dbl> | <int> |
2 | Memory CD4+ T cells | 1539.8363 | 486 |
2 | Effector CD4+ T cells | 1483.1104 | 486 |
2 | Memory CD8+ T cells | 1441.9854 | 486 |
0 | Memory CD8+ T cells | 2438.5031 | 1144 |
0 | Memory CD4+ T cells | 2413.2066 | 1144 |
0 | Effector CD8+ T cells | 2345.7362 | 1144 |
3 | CD8+ NKT-like cells | 851.3827 | 408 |
3 | Effector CD8+ T cells | 809.8915 | 408 |
3 | Memory CD8+ T cells | 805.8721 | 408 |
1 | Memory CD8+ T cells | 3486.1701 | 560 |
1 | Memory CD4+ T cells | 3329.4700 | 560 |
1 | Naive CD8+ T cells | 3276.8359 | 560 |
4 | Effector CD4+ T cells | 981.5836 | 349 |
4 | Memory CD4+ T cells | 944.9250 | 349 |
4 | Effector CD8+ T cells | 908.2163 | 349 |
5 | CD8+ NKT-like cells | 694.2338 | 214 |
5 | Effector CD8+ T cells | 655.5007 | 214 |
5 | Memory CD8+ T cells | 650.0858 | 214 |
6 | Mast cells | 158.0384 | 48 |
6 | CD4+ NKT-like cells | 110.4318 | 48 |
6 | CD8+ NKT-like cells | 107.6022 | 48 |
ggplot(sctype_scores, aes(x=cluster, y=scores, label=type)) + geom_point() +
geom_text(check_overlap = FALSE, hjust=0, nudge_x=0.05, nudge_y=0.5)
Note: The differences in cell-type enrichment scores seem to be small across the top 3 cell types for each cluster.
Cell-types with similar enrichment scores are:
#memory CD4+ T cells vs. memory CD8+ T cells
diff1<-setdiff(gs_list$gs_positive$`Memory CD4+ T cells`, gs_list$gs_positive$`Memory CD8+ T cells`)
diff2<-setdiff(gs_list$gs_positive$`Memory CD8+ T cells`,gs_list$gs_positive$`Memory CD4+ T cells`)
options(repr.plot.width=8, repr.plot.height=6)
VlnPlot(lung.tcell, features=union(diff1, diff2))
Note: CD4 shows higher expression in cluster 2 and 4, while other genes that are specific to memory CD8+ T cells shows higher expression in cluster 0,1,3,5.
#memory CD4+ T cells vs effector CD4+ T cells
#memory CD8+ T cells vs. effector CD8+ T cells
diff1<-setdiff(gs_list$gs_positive$`Memory CD4+ T cells`, gs_list$gs_positive$`Effector CD4+ T cells`)
diff2<-setdiff(gs_list$gs_positive$`Effector CD4+ T cells`,gs_list$gs_positive$`Memory CD4+ T cells`)
options(repr.plot.width=8, repr.plot.height=6)
VlnPlot(lung.tcell, features=union(diff1, diff2))
Note: there seems a sutble difference in S100A4 expression between cluster2 and 4, but SCType identifies them as two different CD4+ T cell subtypes.
#memory CD8+ T cells vs. CD8+ NK cells (Distinct expression patterns between cluster3,5 and cluster 0,1)
diff1<-setdiff(gs_list$gs_positive$`CD8+ NKT-like cells`, gs_list$gs_positive$`Memory CD8+ T cells`)
diff2<-setdiff(gs_list$gs_positive$`Memory CD8+ T cells`,gs_list$gs_positive$`CD8+ NKT-like cells`)
options(repr.plot.width=8, repr.plot.height=6)
VlnPlot(lung.tcell, features=c("KLRD1", "CD52", "CD6", "LTB"))
# mast cells (high expression of KIT, IL2RA, HPGDS in cluster 6)
VlnPlot(lung.tcell, features=gs_list$gs_positive$`Mast cells`)
Note: The separation in CD4+ and CD8+ T cells were based on the expression levels of CD4, CD8A, CD8B and GNLY genes. ScType cannot identify more refined cell-types such as helper T cells. Roughly speaking, the method works to assign the main cell types for a given cluster based on the cell-type enrichment scores.