library(Matrix)
library(Seurat)
library(dplyr)
library(patchwork)
library(ggplot2)
library(reshape2)
library(sctransform)
library(DoubletFinder)


## load data and create a Seurat object
lung.data<-Read10X(data.dir="/home/jinggu/cluster/data/single-cell/Wang2020/scRNA-seq")
lung <- CreateSeuratObject(counts = lung.data, project = "sc_lung")
lung

An object of class Seurat 
26578 features across 46500 samples within 1 assay 
Active assay: RNA (26578 features, 0 variable features)


options(repr.plot.width=12, repr.plot.height=8)
lung[["percent.mt"]]<-PercentageFeatureSet(lung, pattern="^MT-")
VlnPlot(lung, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
        pt.size=0, ncol = 3)

plot1 <- FeatureScatter(lung, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(lung, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2


#Replicate results in Wang2020
lung <- subset(lung, subset = nFeature_RNA > 500 & nFeature_RNA < 4000 & percent.mt<=0.3)


lung

An object of class Seurat 
26578 features across 10223 samples within 1 assay 
Active assay: RNA (26578 features, 3000 variable features)
 2 dimensional reductions calculated: pca, umap


#SCTransform 
#lung<-PercentageFeatureSet(lung, "^MT-", col.name="percent.mt")
#lung <- SCTransform(lung, method = "glmGamPoi", vars.to.regress = "percent.mt", verbose = FALSE)


#log-normalized and scaled by a factor of 10,000
lung <- NormalizeData(lung, normalization.method = "LogNormalize", scale.factor = 10000)
#find variable genes
lung <- FindVariableFeatures(lung, selection.method = "vst", nfeatures = 3000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(lung), 10)


# plot variable features with and without labels
options(repr.plot.width=15, repr.plot.height=8)
plot1 <- VariableFeaturePlot(lung)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = FALSE)
plot1+plot2

Warning message:
“Transformation introduced infinite values in continuous x-axis”
Warning message:
“Transformation introduced infinite values in continuous x-axis”


all.genes <- rownames(lung)
lung <- ScaleData(lung, features = all.genes)

Centering and scaling data matrix


lung<-readRDS("~/cluster/projects/lung_Wang2020/lung_scRNA/lung_Wang2020.rds")


lung

An object of class Seurat 
26578 features across 46500 samples within 1 assay 
Active assay: RNA (26578 features, 3000 variable features)
 2 dimensional reductions calculated: pca, umap


lung <- RunPCA(lung, features = VariableFeatures(object = lung))
print(lung[["pca"]], dims = 1:5, nfeatures = 5)
VizDimLoadings(lung, dims = 1:2, reduction = "pca")
DimPlot(lung, reduction="pca")

PC_ 1 
Positive:  PIEZO2, PDZRN3, CACNA1C, LAMA2, MEIS1, LAMA4, CDH11, SLIT3, COL6A3, COL5A2 
	   CACNB2, ROBO2, BNC2, CCBE1, RYR2, ADAMTS12, CALD1, COL1A2, LSAMP, NR2F1-AS1 
	   ADGRB3, COL5A1, ARHGAP6, ELMO1, BACH2, ITGA8, SCN7A, GALNT17, FN1, PRDM6 
Negative:  CPM, LMO7, ATP13A4, PHACTR1, MAGI3, VEPH1, ZNF385B, LRRK2, MACROD2, ERBB4 
	   ITGB6, DAPK2, GLCCI1, LMO3, GPC5, SNX25, SLC34A2, P3H2, AC112206.2, TOX 
	   ABCA3, PEBP4, SLC39A8, LAMA3, CD55, AC019117.1, TACC2, AC044810.3, TMEM163, ELF3 
PC_ 2 
Positive:  MAGI3, NEBL, PRKG1, ROR1, LMO7, EPB41L5, RORA, PTPRD, NFIB, ATP13A4 
	   GPC5, DLG2, DAPK2, LAMA3, EMP2, GLCCI1, PTPN13, KIAA1217, AC112206.2, AC044810.3 
	   SYNE1, VEPH1, SLIT2, FGFR2, PLEKHH2, PLS3, CASC15, ABLIM1, MAPK10, MACROD2 
Negative:  DOCK2, SLC11A1, GPRIN3, ALOX5, TBXAS1, PIK3R5, MRC1, PTPRC, TFEC, MSR1 
	   PPARG, KYNU, SNX10, ABCG1, DOCK8, STAC, SLCO2B1, NCKAP1L, CD163, MYO1F 
	   CCDC88A, CLEC7A, SLC8A1, HCK, PIK3AP1, WDFY4, MARCH1, FGD2, CSF2RA, OLR1 
PC_ 3 
Positive:  DNAH12, SPAG17, CFAP157, ADGB, CFAP54, ZBBX, CFAP299, ARMC3, DNAAF1, CFAP47 
	   LRRIQ1, CFAP46, CFAP43, ERICH3, DNAI1, SERPINI2, RSPH1, TEKT1, TTC29, DNAH9 
	   SAXO2, LMNTD1, VWA3B, DNAH3, DCDC1, NEK10, WDR49, DNAH6, CFAP52, CFAP73 
Negative:  ZNF385B, LRRK2, DLG2, TMEM163, ABCA3, TTN, SFTPC, P3H2, PID1, CCDC141 
	   PHACTR1, PEBP4, WIF1, SLC34A2, LAMP3, BMP1, AGBL1, SNX25, CPM, ITGB6 
	   CD36, LANCL1-AS1, HHIP, KIAA1324L, CTSH, AC046195.1, MFSD2A, NECAB1, ROR1, ETV5 
PC_ 4 
Positive:  LRRK2, ZNF385B, PTPN13, AFF3, TTN, DLG2, SFTPC, TMEM163, ERBB4, LAMP3 
	   ABCA3, CTSH, CCDC141, PTGFR, SFTPA1, MFSD2A, HHIP, KIAA1324L, SCN1A, TNIK 
	   PARM1, PID1, SNX30, AGBL1, BMP1, LANCL1-AS1, NTN4, ARHGEF38, TMEM164, FMO5 
Negative:  RTKN2, NCKAP5, AC027288.3, CLIC5, AC245041.2, ST6GALNAC5, ARHGEF26, CNTN6, AC002066.1, CAV1 
	   MAP2, PHLDB2, COL4A3, ANKRD29, SCEL, EMP2, ANXA3, COL4A4, COL12A1, ANOS1 
	   RAB11FIP1, CCDC85A, KHDRBS2, LAMA3, GPRC5A, FAM189A2, DENND3, LINC01290, TTLL7, NYAP2 
PC_ 5 
Positive:  HBA2, TIMP1, CRISPLD2, TNC, COL4A2, COL4A1, HBA1, RASA3, MEG8, THBS2 
	   PVT1, THBS1, MTHFD1L, NHS, MT2A, IL1R1, LDLRAD4, PITPNC1, FYN, PAPPA 
	   TGFBI, SFMBT2, MEG3, CHSY1, SAMD4A, CASC15, IGFBP7, PLCB1, COL1A1, BX322234.1 
Negative:  SCN7A, ANGPT1, NEBL, CACNA1D, ITGA8, CCBE1, MYLK, ADH1B, DOCK4, RSPO2 
	   PLXDC2, MACF1, TNIK, AL139383.1, AC092691.1, TRHDE, ABCA6, ARHGAP6, LSAMP, ELMO1 
	   IL16, PIK3R1, GRIK4, BMP5, NKD1, FBLN5, DST, NUAK1, PRKG1, GRIA1

PC_ 1 
Positive:  PIEZO2, PDZRN3, CACNA1C, LAMA2, MEIS1 
Negative:  CPM, LMO7, ATP13A4, PHACTR1, MAGI3 
PC_ 2 
Positive:  MAGI3, NEBL, PRKG1, ROR1, LMO7 
Negative:  DOCK2, SLC11A1, GPRIN3, ALOX5, TBXAS1 
PC_ 3 
Positive:  DNAH12, SPAG17, CFAP157, ADGB, CFAP54 
Negative:  ZNF385B, LRRK2, DLG2, TMEM163, ABCA3 
PC_ 4 
Positive:  LRRK2, ZNF385B, PTPN13, AFF3, TTN 
Negative:  RTKN2, NCKAP5, AC027288.3, CLIC5, AC245041.2 
PC_ 5 
Positive:  HBA2, TIMP1, CRISPLD2, TNC, COL4A2 
Negative:  SCN7A, ANGPT1, NEBL, CACNA1D, ITGA8


DimHeatmap(lung, dims = 1:15, cells = 500, balanced = TRUE)

ElbowPlot(lung, ndims = 50)


lung<-FindNeighbors(lung, dims=1:30)
lung<-FindClusters(lung, resolution=0.8)
lung<-RunUMAP(lung, dims=1:30)


lung$


DimPlot(lung, reduction="umap") + ggtitle("UMAP plot colored by sample IDs")
lung@active.ident<-lung$orig.ident
DimPlot(lung, reduction="umap") + ggtitle("UMAP plot colored by clusters")


## pK Identification (no ground-truth)
sweep.res.list_lung <- paramSweep_v3(lung, PCs = 1:50, sct = FALSE)
sweep.stats_lung <- summarizeSweep(sweep.res.list_lung, GT = FALSE)
bcmvn_lung <- find.pK(sweep.stats_lung)

head(lung$seurat_clusters)

annotations<-lung@meta.data$seurat_clusters
homotypic.prop <- modelHomotypic(annotations)           ## ex: annotations <- seu_kidney@meta.data$ClusteringResults
nExp_poi <- round(0.1*nrow(lung@meta.data))  ## Assuming 7.5% doublet formation rate - tailor for your dataset
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))

## Run DoubletFinder with varying classification stringencies ----------------------------------------------------------------
lung <- doubletFinder_v3(lung, PCs = 1:50, pN = 0.15, pK = 0.005, nExp = nExp_poi, reuse.pANN = FALSE, sct = FALSE)
lung <- doubletFinder_v3(lung, PCs = 1:50, pN = 0.15, pK = 0.005, nExp = nExp_poi.adj, reuse.pANN = "pANN_0.15_0.005_4650", sct = FALSE)
sum(lung$DF.classifications_0.15_0.005_4480 == 'Doublet')


head(lung@meta.data)


meta.info<-read.table("Wang2020/scRNA-seq/GSE161382_metadata.txt.gz", header = T)
meta.info<-meta.info[rownames(meta.info) %in% rownames(lung@meta.data),]


library(harmony)
library(magrittr)

pc<-lung@reductions$pca@cell.embeddings
harmonized <- HarmonyMatrix(pc, meta.info, vars_use=c("donor", "batch", "sex"), do_pca=FALSE)
harmonized <- data.frame(harmonized)


lung@reductions$pca@cell.embeddings<-as.matrix(harmonized)
head(lung@reductions$pca@cell.embeddings)


lung@active.ident<-lung$orig.ident
DimPlot(lung, reduction="pca")


lung<-FindNeighbors(lung, dims=1:50)
lung<-FindClusters(lung, resolution=0.8)
lung<-RunUMAP(lung, dims=1:50)


DimPlot(lung, reduction="umap", label = T, label.size = 5)


saveRDS(lung, file="lung_snATAC/seurat_scRNA/lung_Wang2020.rds")


#reload R object
lung<-readRDS(file="~/cluster/projects/lung_others/lung_scRNA/lung_Wang2020.rds")


markers<-c("AGER","SFTPA2","NDNF","KCNJ15","GDF15","SCGB3A2","TP63","MUC5B",
          "MYB","RIMS2","COL2A1","LAMC3","NTRK3","MYOCD","LTBP2","TCF21","MFAP5",
          "BMX","ACKR1","KIT","CA4","ABCB1","RELN","TREM1","PLTP","FCN1","FLT3",
          "EPB42","TCF7","PRF1","PAX5","MS4A2")


options(repr.plot.width=12, repr.plot.height=10)
DotPlot(object=lung, features=markers) + theme(axis.text.x = element_text(angle=45, vjust = 0.5))


lung<-RenameIdents(lung, new.cluster.ids)


DimPlot(lung, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()


#Find all markers for each cluster by performing pair-wise differential gene expression
#lung.markers <- FindAllMarkers(lung, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.5, densify=TRUE)
lung.markers<-read.table("~/cluster/projects/lung_others/lung_scRNA/lung_clusters_DEG.txt", header=T)


new.cluster.ids<-c("0"="AT2 cells","1"="Alv. macro","2"="Matrix fib.1","3"="AT1 cells",
              "4"="T cells", "5"="Matrix fib.2", "6"="Club cells", "7"="Monocytes",
              "8"="Cap cells", "9"="Myofib",
              "10"="Inter. macro.", "11"="B cells", "12"="NK cells",
              "13"="Cap cells","14"="Lymphatics",
              "15"="Pericytes", "16"="Ciliated cells",
              "17"= "Erythrocytes","18"= "Erythrocytes",
              "19"="Basal cells", "20"="Mast cells",
              "21"="PNECs")


#write.table(top10, "~/cluster/projects/lung_others/lung_scRNA/lung_clusters_DEG.txt", quote = F, row.names = T, sep='\t')


saveRDS(lung, "~/cluster/projects//lung_others//lung_scRNA//lung_Wang2020.rds")


intersect(lung.markers[lung.markers$cluster==20, "gene"][1:50], markers)


head(lung.markers[lung.markers$cluster==13,], 10)


png("/home/jinggu/cluster/projects/lung_others/lung_scRNA/lung_scRNA_heatmap.png")
DoHeatmap(lung, features = top10$gene) + NoLegend()
dev.off()


nclusters<-length(unique(lung@meta.data$seurat_clusters))
cluster.markers<-c("0","T cells","AT2","Monocytes","AT2","Club cells",
                   "AT1", "Alv.macro.", "8", "AT1/AT2-like", "Alv.macro.",
                   "AT1/AT2-like", "Matrix fib.2", "Matrix fib.1","AT1/AT2-like", "Inter.macro.",
                   "Vasc. sm.", "B cells", "Vasc. sm.","NK cells","Matrix fib.2", 
                   "Matrix fib.1", "Lymphatics", "Cap", "Ciliated cells", "Airway Sm.",
                   "26", "27", "Pericytes", "Erythro.", "Chondro.", "PNECs")
length(cluster.markers)

lung@meta.data["cluster.ident"]<-unlist(lapply(lung@meta.data$seurat_clusters, function(i){cluster.markers[as.numeric(i)+1]}))

names(cluster.markers)<-levels(lung@meta.data$seurat_clusters)

lung<-RenameIdents(lung, cluster.markers)

cluster.markers


options(repr.plot.width=15, repr.plot.height=10)
DimPlot(lung, reduction="umap", label=TRUE)


options(repr.plot.width=12, repr.plot.height=10)
tcell.markers=c('CD8A','CD4','CD74',
    'CCR7', 'ITGAE','CD69', 
    'IL7R', 'IL33','CD200')
FeaturePlot(lung, features = tcell.markers)


lung.tcell<-readRDS("~/cluster/projects/lung_Wang2020/lung_scRNA/lung_Tcell_subclustering_Wang2020.rds")


lung.tcell

An object of class Seurat 
26578 features across 3209 samples within 1 assay 
Active assay: RNA (26578 features, 3000 variable features)
 2 dimensional reductions calculated: pca, umap


#lung.tcell<-subset(lung, subset=seurat_clusters=="4")

lung.tcell<-RunPCA(lung.tcell, features = VariableFeatures(lung.tcell)[1:2000])

ElbowPlot(lung.tcell, ndims = 50)

PC_ 1 
Positive:  ALCAM, CADPS2, COLEC12, DLG2, PID1, MYO1B, NEBL, PTPRD, ROR1, TCF4 
	   CADM1, NRP1, MAGI3, KIAA1217, LHFPL6, EPS8, ELL2, MITF, SLC8A1, LRRK2 
	   TACC2, EPAS1, GLIS3, MACROD2, MAPK10, TFPI, MAGI2, FBXL7, LMO7, PDE4D 
Negative:  THEMIS, BCL11B, SKAP1, CAMK4, ITK, CD96, LINC01934, CD247, LEF1, IL7R 
	   RIPOR2, ETS1, ICOS, INPP4B, STAT4, TRAC, NELL2, CD28, IKZF3, CD69 
	   SAMD3, BCL2, BTBD11, C15orf53, LTB, CCR7, TXK, GNG2, SELL, AC022075.1 
PC_ 2 
Positive:  C15orf53, PPP2R2B, LINC01934, AC013652.1, CCL5, MSC-AS1, KLRB1, SKAP1, CD96, STAT4 
	   INPP4B, ADAM19, PRDM1, TSPAN5, ADAM12, CLNK, RBPJ, IKZF3, BCL2, CD69 
	   SAMSN1, TOX, GNAO1, SFTPC, THEMIS, MYBL1, IL2RA, IL12RB2, MIR155HG, TENT5C 
Negative:  CCR7, SELL, LEF1, NELL2, TXK, BTBD11, LTB, BACH2, AL163932.1, PLAC8 
	   PDE3B, TRAC, ITGA6, CD28, IL7R, RETREG1, KLF2, FAM19A1, PLCL1, CAMK4 
	   KCNQ5, ATP10A, AC008014.1, ADAMTS6, CD55, EPHA4, RALGPS2, ETS1, FCGBP, PCSK5 
PC_ 3 
Positive:  AC022075.1, CCL5, NELL2, IFNG-AS1, SAMD3, AOAH, PPP2R2B, LINC02446, CRTAM, CD96 
	   PDGFD, NCALD, KLRC1, LEF1, VAV3, ABCB1, GNLY, KCNQ5, IKZF3, TXK 
	   CCL4, FCGBP, RIPOR2, ITGAD, PRKCB, SFTPC, AF165147.1, DTHD1, JAKMIP2, AC243829.2 
Negative:  ICOS, C15orf53, CD28, IL2RA, KLRB1, TNFAIP3, AC013652.1, BIRC3, RGS1, ADAM19 
	   ADAM12, IL7R, CD69, LTB, BTBD11, CSGALNACT1, SAMSN1, PRDM1, DPP4, HPGDS 
	   IL18R1, INPP4B, CD200R1, PLCL1, ITK, FSTL4, MAF, SELL, PAG1, TMEM156 
PC_ 4 
Positive:  TRAC, C15orf53, LTB, CCL5, AC013652.1, AC022075.1, TMSB4X, ACTB, NELL2, CLNK 
	   LINC02446, CCR7, CD52, KLRB1, CD200R1, SAMD3, HLA-C, AL163932.1, SELL, DDIT4 
	   FTL, CD69, VIM, LINC01934, FCGBP, CRTAM, SFTPC, S100A4, TNIP3, KLRC1 
Negative:  CD28, ITK, ANK3, FKBP5, IL7R, MYBL1, FBXO32, BTBD11, BCL2, IFNG-AS1 
	   PRDM1, PCAT1, BCL11B, CATSPERB, TXK, CD247, CAMK4, TMEM156, RIPOR2, ELOVL6 
	   ETS1, INPP4B, AC105402.3, PLCB1, DTHD1, STAT4, ADAM19, IL2RA, DPP4, GPRIN3 
PC_ 5 
Positive:  THEMIS, ICOS, C15orf53, LINC01934, CCR7, BCL11B, ITK, SELL, CD28, IKZF3 
	   MSC-AS1, AC013652.1, TNIP3, SLC9A9, PPP2R2B, CCL5, AL163932.1, RBPJ, ITGA6, C12orf42 
	   CAMK4, CSGALNACT1, LEF1, KIAA0825, PRKCB, TSHZ2, ANKRD55, PRDM1, NELL2, ANK3 
Negative:  KLRB1, MYBL1, CD69, IL7R, LTB, IL18R1, TNFAIP3, ADAM12, STAT4, IL2RA 
	   FSTL4, ABCB1, CLNK, DUSP2, KIT, DPP4, HPGDS, PLCB1, BIRC3, ATRNL1 
	   RGS1, AC022075.1, NCALD, IFNG-AS1, IL12RB2, BTBD11, BLK, MED12L, TXK, DDIT4


lung.tcell <- FindNeighbors(lung.tcell, k.param = 40, dims = 1:30)
lung.tcell <- FindClusters(lung.tcell, resolution = 0.6)

lung.tcell<-RunUMAP(lung.tcell, dims=1:30)

Computing nearest neighbor graph

Computing SNN

Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 3209
Number of edges: 294449

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.7022
Number of communities: 7
Elapsed time: 0 seconds

15:26:57 UMAP embedding parameters a = 0.9922 b = 1.112

15:26:57 Read 3209 rows and found 30 numeric columns

15:26:57 Using Annoy for neighbor search, n_neighbors = 30

15:26:57 Building Annoy index with metric = cosine, n_trees = 50

0%   10   20   30   40   50   60   70   80   90   100%

[----|----|----|----|----|----|----|----|----|----|

*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
|

15:26:58 Writing NN index file to temp file /tmp/jobs/19456337/RtmpAaXWeH/file7f9053cc1934

15:26:58 Searching Annoy index using 1 thread, search_k = 3000

15:26:59 Annoy recall = 100%

15:26:59 Commencing smooth kNN distance calibration using 1 thread

15:27:00 Initializing from normalized Laplacian + noise

15:27:00 Commencing optimization for 500 epochs, with 131228 positive edges

15:27:08 Optimization finished


DimPlot(lung.tcell, reduction = "umap", label=TRUE, label.size = 8)


options(repr.plot.width=12, repr.plot.height=10)
## Marker genes for subtypes of T-cells
markerGenes  <- c(
    'CD8A','CD4','CD3E',
    'ITGAE','CD69', 
    'ITGA1','ITGAL',
    'SELL','CCR7'
    )
FeaturePlot(lung.tcell, features=markerGenes, ncol=3)


cluster.markers<-FindAllMarkers(lung.tcell, only.pos=TRUE)

write.table(cluster.markers %>% group_by(cluster) %>% slice_max(n=10, order_by=avg_log2FC), 
            "lung_snATAC/seurat_scRNA/t_cell_subclusters_DEG.txt", row.names=F, sep='\t', quote = F)

Calculating cluster 0

Calculating cluster 1

Calculating cluster 2

Calculating cluster 3

Calculating cluster 4

Calculating cluster 5

Calculating cluster 6


cluster.markers<-read.table("~/cluster/projects//lung_others//lung_scRNA//t_cell_subclusters_DEG.txt", header = T)
head(cluster.markers)


lung.tcell$


cluster1.markers<-FindMarkers(lung.tcell, ident.1 = c(0,1), ident.2 = seq(2,6), min.pct = 0.1)

cluster1.markers[which(rownames(cluster1.markers)=="CD69"),]


options(repr.plot.width=15, repr.plot.height=12)
cluster.markers %>%
    group_by(cluster) %>%
    top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(lung.tcell, features = top10$gene) + NoLegend()


options(repr.plot.width=12, repr.plot.height=10)
DotPlot(object=lung.tcell, features=markerGenes) + theme(axis.text.x = element_text(angle=45, vjust = 0.5)) + ggtitle("Expression of marker genes across all clusters")
DotPlot(object=subset(lung.tcell, subset=seurat_clusters!="6"), features=markerGenes) + 
theme(axis.text.x = element_text(angle=45, vjust = 0.5)) + 
ggtitle("Expression of marker genes among cluster 0-5")


VlnPlot(lung.tcell, features=markerGenes)


# load libraries and functions
lapply(c("dplyr","Seurat","HGNChelper"), library, character.only = T)
source("~/projects/funcFinemapping/code/sctype/gene_sets_prepare.R")
source("~/projects/funcFinemapping/code/sctype/sctype_score_.R")

# DB file
db_ = "~/projects/funcFinemapping/data/ScTypeDB_full.xlsx";
tissue = "Immune system" # e.g. Immune system, Liver, Pancreas, Kidney, Eye, Brain

# prepare gene sets
gs_list = gene_sets_prepare(db_, tissue)


pbmc<-lung.tcell


# get cell-type by cell matrix
es.max = sctype_score(scRNAseqData = pbmc[["RNA"]]@scale.data, scaled = TRUE, 
                      gs = gs_list$gs_positive, gs2 = gs_list$gs_negative)

# merge by cluster
cL_results = do.call("rbind", lapply(unique(pbmc@meta.data$seurat_clusters), function(cl){
    es.max.cl = sort(rowSums(es.max[ ,rownames(pbmc@meta.data[pbmc@meta.data$seurat_clusters==cl, ])]), decreasing = !0)
    head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(pbmc@meta.data$seurat_clusters==cl)), 10)
}))
sctype_scores = cL_results %>% group_by(cluster) %>% top_n(n = 1, wt = scores)  

# set low-confident (low ScType score) clusters to "unknown"
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] = "Unknown"
print(sctype_scores[,1:3])

options(repr.plot.width=14, repr.plot.height=10)
pbmc@meta.data$customclassif = ""
for(j in unique(sctype_scores$cluster)){
  cl_type = sctype_scores[sctype_scores$cluster==j,]; 
  pbmc@meta.data$customclassif[pbmc@meta.data$seurat_clusters == j] = as.character(cl_type$type[1])
}

DimPlot(pbmc, reduction = "umap", label = TRUE, repel = TRUE, group.by = 'customclassif')

# A tibble: 7 x 3
# Groups:   cluster [7]
  cluster type                  scores
  <fct>   <chr>                  <dbl>
1 2       Memory CD4+ T cells    1540.
2 0       Memory CD8+ T cells    2439.
3 3       CD8+ NKT-like cells     851.
4 1       Memory CD8+ T cells    3486.
5 4       Effector CD4+ T cells   982.
6 5       CD8+ NKT-like cells     694.
7 6       Mast cells              158.


sctype_scores = cL_results %>% group_by(cluster) %>% top_n(n = 3, wt = scores)  
sctype_scores


ggplot(sctype_scores, aes(x=cluster, y=scores, label=type)) + geom_point() + 
        geom_text(check_overlap = FALSE, hjust=0, nudge_x=0.05, nudge_y=0.5)


#memory CD4+ T cells vs. memory CD8+ T cells
diff1<-setdiff(gs_list$gs_positive$`Memory CD4+ T cells`, gs_list$gs_positive$`Memory CD8+ T cells`)
diff2<-setdiff(gs_list$gs_positive$`Memory CD8+ T cells`,gs_list$gs_positive$`Memory CD4+ T cells`)

options(repr.plot.width=8, repr.plot.height=6)
VlnPlot(lung.tcell, features=union(diff1, diff2))


#memory CD4+ T cells vs effector CD4+ T cells
#memory CD8+ T cells vs. effector CD8+ T cells
diff1<-setdiff(gs_list$gs_positive$`Memory CD4+ T cells`, gs_list$gs_positive$`Effector CD4+ T cells`)
diff2<-setdiff(gs_list$gs_positive$`Effector CD4+ T cells`,gs_list$gs_positive$`Memory CD4+ T cells`)

options(repr.plot.width=8, repr.plot.height=6)
VlnPlot(lung.tcell, features=union(diff1, diff2))


#memory CD8+ T cells vs. CD8+ NK cells (Distinct expression patterns between cluster3,5 and cluster 0,1)
diff1<-setdiff(gs_list$gs_positive$`CD8+ NKT-like cells`, gs_list$gs_positive$`Memory CD8+ T cells`)
diff2<-setdiff(gs_list$gs_positive$`Memory CD8+ T cells`,gs_list$gs_positive$`CD8+ NKT-like cells`)


options(repr.plot.width=8, repr.plot.height=6)
VlnPlot(lung.tcell, features=c("KLRD1", "CD52", "CD6", "LTB"))


# mast cells (high expression of KIT, IL2RA, HPGDS in cluster 6)
VlnPlot(lung.tcell, features=gs_list$gs_positive$`Mast cells`)

	orig.ident	nCount_RNA	nFeature_RNA	RNA_snn_res.0.5	seurat_clusters	percent.mt	RNA_snn_res.1.2	RNA_snn_res.0.8
	<fct>	<dbl>	<int>	<fct>	<fct>	<dbl>	<fct>	<fct>
D032_TAACACGCAGGCATGA	D032	1114	851	1	19	0.08976661	19	19
D046_AGTGACTCAGAGATGC	D046	1125	853	1	11	0.08888889	19	11
D046_ATCGGCGGTACAGTCT	D046	3763	1788	1	19	0.18602179	21	19
D046_TCTATCAGTCATATGC	D046	2638	1622	1	19	0.07581501	19	19
D062_ACCACAATCTCCGCAT	D062	7339	3025	1	19	0.17713585	19	19
D062_AGGATAAGTACGTACT	D062	2077	1396	1	19	0.24073182	19	19

	PC_1	PC_2	PC_3	PC_4	PC_5	PC_6	PC_7	PC_8	PC_9	PC_10	⋯	PC_41	PC_42	PC_43	PC_44	PC_45	PC_46	PC_47	PC_48	PC_49	PC_50
D032_TAACACGCAGGCATGA	-0.3985820	-20.85848	-0.5240309	-0.2439483	-0.5422445	1.2595482	-0.7032837	1.0844445	3.55206051	-6.0492437	⋯	-1.48486074	0.8908909	0.8934875	-0.8904164	0.8289513	-0.2346926	-0.547690860	0.4405086	0.49811969	-0.3260594
D046_AGTGACTCAGAGATGC	-1.2321803	-13.33187	-0.9988345	-0.8102001	1.0425367	1.9197886	-0.5353741	0.4976184	1.28765749	1.8857875	⋯	-0.05940036	-0.2286925	-0.5732119	1.5012035	-0.3322233	1.6437078	2.289830251	0.3170592	-1.25525313	-0.4953004
D046_ATCGGCGGTACAGTCT	1.3195731	-25.66298	-1.6189908	-0.7586676	2.0589709	1.8836052	-1.3036565	0.3490058	0.59964696	-3.0121535	⋯	-0.01335006	0.1423684	4.7768354	0.1357333	1.2063323	-2.8727584	-2.292282851	-0.9298083	4.17931361	-1.3615013
D046_TCTATCAGTCATATGC	0.5904185	-27.22417	-0.1631747	-2.1585962	0.3150094	1.7617123	-0.5007738	-2.6033632	1.53346417	-0.6391126	⋯	2.84548073	2.5159239	-2.1065811	-0.8151320	-1.7546148	0.0313849	0.321297509	-0.8295865	2.52309071	1.2149441
D062_ACCACAATCTCCGCAT	1.6423649	-35.82405	1.6365077	-3.9821016	-4.9717792	-1.4236247	3.5897601	-5.6472829	0.08844246	4.0540465	⋯	1.10237707	2.0238321	-1.1486768	0.5971051	1.3507216	1.6533622	0.935727793	-0.1090692	-1.76254395	0.9318414
D062_AGGATAAGTACGTACT	1.0832026	-25.79946	0.6108807	-3.4398976	-2.5684447	0.7385468	2.1705286	-1.2798463	0.83453173	1.8157443	⋯	0.87987734	0.9792191	1.1019914	-1.4024157	-0.2645823	0.3537502	-0.001623401	0.5388432	0.02924582	-2.1465729

	p_val	avg_log2FC	pct.1	pct.2	p_val_adj	cluster	gene
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<int>	<chr>
EMCN1	0	4.286502	0.793	0.041	0	13	EMCN
ADGRL22	0	4.077742	0.934	0.221	0	13	ADGRL2
HPGD1	0	3.737363	0.756	0.079	0	13	HPGD
GALNT182	0	3.520607	0.917	0.192	0	13	GALNT18
STXBP61	0	3.355972	0.796	0.114	0	13	STXBP6
EGFL71	0	3.279031	0.883	0.085	0	13	EGFL7
HECW21	0	3.270260	0.738	0.085	0	13	HECW2
PRICKLE23	0	3.215023	0.869	0.282	0	13	PRICKLE2
NCALD3	0	3.176994	0.803	0.138	0	13	NCALD
NRG32	0	3.023161	0.555	0.082	0	13	NRG3

	p_val	avg_log2FC	pct.1	pct.2	p_val_adj	cluster	gene
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<int>	<chr>
1	4.569342e-33	1.1985345	0.309	0.144	1.214440e-28	0	PLCB1
2	1.360273e-20	0.8543668	0.340	0.213	3.615332e-16	0	ERN1
3	1.475631e-09	0.8471117	0.418	0.360	3.921933e-05	0	SFTPB
4	7.793805e-17	0.8159923	0.239	0.130	2.071437e-12	0	ZBTB16
5	2.141279e-11	0.7956628	0.189	0.111	5.691090e-07	0	MYBL1
6	1.765221e-10	0.6677428	0.116	0.056	4.691604e-06	0	MACROD2

cluster	type	scores	ncells
<fct>	<chr>	<dbl>	<int>
2	Memory CD4+ T cells	1539.8363	486
2	Effector CD4+ T cells	1483.1104	486
2	Memory CD8+ T cells	1441.9854	486
0	Memory CD8+ T cells	2438.5031	1144
0	Memory CD4+ T cells	2413.2066	1144
0	Effector CD8+ T cells	2345.7362	1144
3	CD8+ NKT-like cells	851.3827	408
3	Effector CD8+ T cells	809.8915	408
3	Memory CD8+ T cells	805.8721	408
1	Memory CD8+ T cells	3486.1701	560
1	Memory CD4+ T cells	3329.4700	560
1	Naive CD8+ T cells	3276.8359	560
4	Effector CD4+ T cells	981.5836	349
4	Memory CD4+ T cells	944.9250	349
4	Effector CD8+ T cells	908.2163	349
5	CD8+ NKT-like cells	694.2338	214
5	Effector CD8+ T cells	655.5007	214
5	Memory CD8+ T cells	650.0858	214
6	Mast cells	158.0384	48
6	CD4+ NKT-like cells	110.4318	48
6	CD8+ NKT-like cells	107.6022	48

Single cell RNA-seq analyses using Seurat¶

Table of contents¶

Sample QCs¶

Data Processing - Normalization and Scaling¶

Dimension reduction and 2D embedding¶

Using DoubletFinder to filter out the predicted doublets¶

Use Harmony to adjust for batch effects¶

Find Marker genes¶

Sub-clustering of the identified cluster for T-cells in lungs¶

Use of scType to identify cell-types¶

Top 3 cell-type enrichment scores for each cluster¶

Check the distribution of marker genes¶