library(Seurat)
library(data.table)
library(dplyr)
library(SeuratDisk)
library(ggplot2)
library(patchwork)
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(GOSemSim)
library(gridExtra)
options(repr.plot.width=16, repr.plot.height=8)
work.dir<-"~/cluster/projects/u19_multiomics"


immune.combined<-readRDS(paste0(work.dir, "/analyses/lungs_and_spleens_merged.rds"))


lungs@meta.data%>%group_by(majority_voting)%>%summarise(mean_RNA=median(nCount_RNA))


ct.clusters<-read.csv(paste0(work.dir, "/analyses/merged_scRNA_Immune_low_labels.csv"))
immune.combined@meta.data[,"majority_voting"]<-ct.clusters$majority_voting

# Visualization
p1 <- DimPlot(immune.combined, reduction = "umap", 
              group.by = "full.ident", label=TRUE, repel = TRUE) +
              ggtitle("Tissue of origin")
p2 <- DimPlot(immune.combined, reduction = "umap", 
              group.by = "majority_voting", label = TRUE, repel = TRUE) +
              ggtitle("cell types from majority voting")
p1 + p2


DimPlot(immune.combined, split.by="tissue.ident")


Idents(immune.combined)<-"majority_voting"
levels(immune.combined@active.ident)


write.table(immune.combined@meta.data, "../../output/all_merged_metadata.txt", sep="\t", quote = F)


df<-data.frame(table(immune.combined@meta.data[, c("tissue.ident", "majority_voting")]))
ggplot(df, 
       aes(x = majority_voting, y=Freq, 
           fill=tissue.ident, label=Freq,
           width=0.5)) + 
geom_col(position="dodge2") + geom_label(size=5,position = position_dodge2(width=0.8)) + 
theme_light(base_size = 15) + 
theme(axis.text.x=element_text(size=16, vjust=0.7, angle = 45))


immune.combined@meta.data[,"merged"]<-immune.combined$majority_voting
immune.combined@meta.data$merged[grep("T cells", immune.combined$merged)]<-"T cells"
df<-data.frame(table(immune.combined@meta.data[, c("tissue.ident", "merged")]))
ggplot(df, 
       aes(x = merged, y=Freq, 
           fill=tissue.ident, label=Freq,
           width=0.5)) + 
geom_col(position="dodge2") + geom_label(size=5,position = position_dodge2(width=0.8)) + 
theme_light(base_size = 15) + 
theme(axis.text.x=element_text(size=16, vjust=0.7, angle = 45))


Idents(immune.combined)<-"merged"
DEG.test<-FindMarkers(immune.combined, ident.1="lungs", group.by="tissue.ident", 
                          subset.ident="T cells", test.use = "MAST")


DEG.test<-read.table("../../output/DE_analysis_mast/merged_T_cells_DE_results.csv", sep=",")
DEG.test%>%filter(p_val_adj<0.01)%>%group_by(avg_log2FC>0)%>%summarise(n=n())


ggplot(DEG.test, aes(x=avg_log2FC, y=-log10(p_val), 
                          colour=avg_log2FC>0)) +
        geom_point(size=2) + 
        geom_point(data=DEG.test%>%filter(p_val_adj>0.05),
                       aes(x=avg_log2FC, y=-log10(p_val)), 
                           colour=alpha("grey",0.7), size=2) + 
        theme_light(base_size=15)  + theme(legend.position = "none")


DEG.test<-read.table("../../output/DE_analysis_mast/Naive_B_cells_DE_results.csv", sep=",")


for(c in levels(immune.combined@active.ident)){
    DEG.test<-FindMarkers(immune.combined, ident.1="lungs", group.by="tissue.ident", min.pct=0.25,
                          subset.ident=c, test.use = "MAST")
    #uniform the format of separator
    c<-gsub("/", "_", c)
    c<-gsub(" ", "_", c)
    write.table(DEG.test, sprintf("../../output/DE_analysis_mast_minpct_0.25/%s_DE_results.csv", c), quote = F, sep=",")
    }


null.list<-list()
for(i in 1:10){
    null<-readRDS(sprintf("%s/analyses/Tem_Trm_cytotoxic_T_cells.null.%s.rds", work.dir, i))
    null.list<-append(null.list, null)
    }


genes<-unique(unlist(lapply(null.list, function(i){rownames(i)})))
meta.pvals<-lapply(genes, function(i){
    pvals<-c()
    for(j in 1:1000){
        pvals<-c(pvals, null.list[[j]][rownames(null.list[[j]])==i,]$p_val)
     }
    combined<--2*sum(log(pvals))
    meta.p<-pchisq(combined, df=2*length(pvals[!is.na(pvals)]), lower.tail=FALSE)
    return(list(p=pvals, stats=combined, meta.p=meta.p))
    })


#FOSB gene
pvals<-meta.pvals[[which(genes=="FOSB")]]$p
test.stats<--2*sum(log(pvals))
print(test.stats)

[1] 2136.512


hist(meta.pvals[[which(genes=="FOSB")]]$p, xlab="p-values under the null", main="FOSB gene", 
     cex=2, cex.axis=2, cex.lab=2)


hist(unlist(lapply(meta.pvals, function(i){i$stats})), main="", 
     xlab="test statistics distribution", cex=2, cex.axis=2, cex.lab=2)


hist(unlist(lapply(meta.pvals, function(i){i$meta.p})), main="", 
     xlab="p-value null distribution", cex=2, cex.axis=2, cex.lab=2)


gene.pvals<-unlist(lapply(meta.pvals, function(i){i$meta.p}))
print(sprintf("%s percent of genes with p-value < 0.05", 100*round(sum(gene.pvals<0.05)/length(gene.pvals),2)))

[1] "34 percent of genes with p-value < 0.05"


DEG.test<-read.csv("../../output/DE_analysis_mast/Tem_Trm_cytotoxic_T_cells_DE_results.csv")


length(genes[gene.pvals>=0.05])


DE.genes<-rownames(DEG.test %>% filter(p_val_adj<0.01))
length(intersect(DE.genes, genes[gene.pvals>=0.05]))/length(DE.genes)


DE.calibrated<-DEG.test[rownames(DEG.test)%in% intersect(DE.genes, genes[gene.pvals>=0.05]),]


eg = bitr(rownames(DE.calibrated), fromType = "SYMBOL", toType="ENTREZID", OrgDb = "org.Hs.eg.db")
    all.eg = bitr(rownames(DEG.test), fromType = "SYMBOL", toType="ENTREZID", OrgDb = "org.Hs.eg.db")

    go_list <- enrichGO(gene          = eg$ENTREZID,
                    universe      = names(all.eg$ENTREZID),
                    OrgDb         = org.Hs.eg.db,
                    ont           = "BP",
                    pAdjustMethod = "BH",
                    pvalueCutoff  = 0.01,
                    qvalueCutoff  = 0.05,
                    readable      = TRUE)

'select()' returned 1:1 mapping between keys and columns

Warning message in bitr(rownames(DE.calibrated), fromType = "SYMBOL", toType = "ENTREZID", :
“2.18% of input gene IDs are fail to map...”
'select()' returned 1:1 mapping between keys and columns

Warning message in bitr(rownames(DEG.test), fromType = "SYMBOL", toType = "ENTREZID", :
“3.38% of input gene IDs are fail to map...”


library(forcats)
library(enrichplot)
go_list<-mutate(go_list, richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))

ggplot(go_list, showCategory = 20, 
                        aes(richFactor, fct_reorder(Description, richFactor))) + 
                            geom_segment(aes(xend=0, yend = Description)) +
                            geom_point(aes(color=qvalue, size = Count)) +
                            scale_color_viridis_c(guide=guide_colorbar(reverse=TRUE)) +
                            scale_size_continuous(range=c(2, 10)) +
                            theme_minimal() + theme(axis.text.y=element_text(size=12)) + 
                            xlab("rich factor") +
                            ylab(NULL)


for(c in levels(immune.combined@active.ident)){
    DEG.test<-FindMarkers(immune.combined, ident.1="lungs", group.by="tissue.ident",
                          subset.ident=c, test.use = "wilcox")
    #uniform the format of separator
    c<-gsub("/", "_", c)
    c<-gsub(" ", "_", c)
    write.table(DEG.test, sprintf("../../output/DE_analysis_wilcox/%s_DE_results.csv", c), quote = F, sep=",")
    }


input.tbl<-read.csv("../../output/DE_analysis_mast/Memory_B_cells_DE_results.csv", header=T)


Idents(immune.combined)<-"majority_voting"


volcano_plots<-list()
counts.tbl<-c()
gene.set<-c()
for(c in levels(immune.combined@active.ident)){
    #uniform the format of separator
    c<-gsub("/", "_", c)
    c<-gsub(" ", "_", c)
    input.tbl<-read.table(sprintf("../../output/DE_analysis_mast/%s_DE_results.csv", c), sep=",")
    counts<-input.tbl %>% filter(p_val_adj<=0.05) %>% group_by((avg_log2FC>=0)) %>% summarise(n=n())
    colnames(counts)<-c("labels", "n")
    gene.set<-unique(c(gene.set, row.names(input.tbl)))
    #filter out cell types with no DEGs
    if(dim(counts)[1]>0){
        counts.tbl<-rbind(counts.tbl, cbind(c, counts))
        }
    p<-ggplot(input.tbl, aes(x=avg_log2FC, y=-log10(p_val), 
                          colour=avg_log2FC>0)) +
        geom_point(size=2) + 
        geom_point(data=input.tbl%>%filter(p_val_adj>0.05),
                       aes(x=avg_log2FC, y=-log10(p_val)), 
                           colour=alpha("grey",0.7), size=2) + 
        theme_light(base_size=15) + theme(legend.position = "none") + 
        ggtitle(c)
    volcano_plots[[c]]<-p
    }


ml <- marrangeGrob(volcano_plots, nrow=2, ncol=3)
ml

[[1]]
NULL

[[2]]
NULL


ggplot(counts.tbl, 
       aes(x = c, y=n, 
           fill=labels, label=n,
           width=0.5)) + 
geom_col(position="dodge2") + geom_label(size=5,position = position_dodge2(width=0.8)) + 
theme_light(base_size = 15) + labs(y="DEG Counts") + 
scale_fill_discrete(labels=c("Up-regulated","Down-regulated")) +
theme(axis.text.x=element_text(size=16, vjust=0.6, angle = 45))


go_list<-list()
for(c in levels(immune.combined@active.ident)){
    #uniform the format of separator
    c<-gsub("/", "_", c)
    c<-gsub(" ", "_", c)
    print(c)
    input.tbl<-read.table(sprintf("../../output/DE_analysis_mast/%s_DE_results.csv", c), sep=",")

    subset.tbl<-input.tbl %>% filter(p_val_adj<=0.05)
    eg = bitr(rownames(subset.tbl), fromType = "SYMBOL", toType="ENTREZID", OrgDb = "org.Hs.eg.db")
    all.eg = bitr(rownames(input.tbl), fromType = "SYMBOL", toType="ENTREZID", OrgDb = "org.Hs.eg.db")

    go_list[[c]] <- enrichGO(gene          = eg$ENTREZID,
                    universe      = names(all.eg$ENTREZID),
                    OrgDb         = org.Hs.eg.db,
                    ont           = "BP",
                    pAdjustMethod = "BH",
                    pvalueCutoff  = 0.01,
                    qvalueCutoff  = 0.05,
                    readable      = TRUE)
    }


new_list <- lapply(go_list, function(i){
        z<-mutate(i, richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))
        return(z)
    })

go_plots<-list()
for(c in names(new_list)){
    go_plots[[c]]<-ggplot(new_list[[c]], showCategory = 20, 
                        aes(richFactor, fct_reorder(Description, richFactor))) + 
                            geom_segment(aes(xend=0, yend = Description)) +
                            geom_point(aes(color=qvalue, size = Count)) +
                            scale_color_viridis_c(guide=guide_colorbar(reverse=TRUE)) +
                            scale_size_continuous(range=c(2, 10)) +
                            theme_minimal() + theme(axis.text.y=element_text(size=12)) + 
                            xlab("rich factor") +
                            ylab(NULL) + 
                            ggtitle(c)
    }


saveRDS(new_list, "~/cluster/projects/u19_multiomics/analyses/go_BP_results.rds")


options(repr.plot.width=18, repr.plot.height=18)
marrangeGrob(go_plots[c(3,4,5,8)], nrow=2, ncol=2)

[[1]]
NULL


options(repr.plot.width=18, repr.plot.height=16)
marrangeGrob(go_plots[c(2,6,10)], nrow=2, ncol=2)

[[1]]
NULL


marrangeGrob(go_plots[c(1,7,9,11)], nrow=2, ncol=2)

[[1]]
NULL

majority_voting	mean_RNA
<chr>	<dbl>
CD16- NK cells	2018.0
CD16+ NK cells	1939.5
Classical monocytes	1631.5
Epithelial cells	3113.5
ILC3	2203.0
Memory B cells	739.0
Naive B cells	1705.0
Plasma cells	4437.0
Regulatory T cells	2275.5
Tcm/Naive helper T cells	2030.0
Tem/Trm cytotoxic T cells	1854.0
Type 17 helper T cells	2083.5

avg_log2FC > 0	n
<lgl>	<int>
FALSE	642
TRUE	580

Identifying lung-specific genes for each cell type¶

Procedures for merging lung and spleen datasets¶

UMAP visualization of all the cells from both tissues¶

Cell type compositions¶

Merging all T cells¶

DE analysis using MAST¶

DE test on merged T cells¶

Validate the identified DE genes by comparing against null¶

DE analysis using wilcoxon rank sum test¶

Visualizing results via volcano plots¶

Summerizing number of DEGs by direction¶

Gene set enrichment analysis via ClusterProfiler¶

Lung-specific genes in regulatory T cells¶

Summerize top 20 GO terms in molecular functions¶