12.8.22
# Load the annotated NGS variant calls for Novogene lane 15, sample 3.
# NOTE(review): this path originally spelled the directory "novogene_lane15",
# while the other reads of this sample in this file use "Novogene_lane15".
# The capitalised form is used here so the path also resolves on
# case-sensitive file systems -- confirm against the real directory name.
# NOTE(review): the object is still called "lane14..." although the data come
# from lane 15; the name is kept because later code refers to it.
lane14sortedsamples <- read.csv(
  "data/Consensus_Data/Novogene_lane15/sample_3/ngs/variants_unique_ann.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# lane14sortedsamples=merge_samples("Novogene_lane14/sample10_combined/sscs","Novogene_lane14/sample11/sscs")

# Split the "ref/alt" amino-acid annotation into two columns and compute the
# mutant allele frequency. The split is vectorised (no rowwise()), so the
# result is a plain, ungrouped data frame; entries without a second element
# (or NA) yield NA, exactly as the per-row `[[1]][2]` indexing did.
lane14sortedsamples <- lane14sortedsamples %>%
  mutate(
    ref_aa = vapply(
      strsplit(as.character(amino_acids), "/", fixed = TRUE),
      `[`, character(1), 1L,
      USE.NAMES = FALSE
    ),
    alt_aa = vapply(
      strsplit(as.character(amino_acids), "/", fixed = TRUE),
      `[`, character(1), 2L,
      USE.NAMES = FALSE
    ),
    maf = ct / depth
  )

# Keep only the columns used for plotting and flag residues 1-241 and
# 495-700. `%in%` never returns NA, so no case_when() fallback is needed.
# NOTE(review): presumably these residues lie outside the region of interest
# (242-494), so calls there are treated as errors -- confirm.
lane14sorted_simple <- lane14sortedsamples %>%
  dplyr::select(
    alt_start_pos, protein_start, ref, alt, ref_aa, alt_aa,
    consequence_terms, ct, depth, maf
  ) %>%
  mutate(error_status = protein_start %in% c(1:241, 495:700))
# Interactive bar chart of MAF against protein position for calls at
# residues 99-650 that are supported by at least 3 reads (ct >= 3).
# Bars are filled by error_status; the two manual colours are assigned in
# level order (blue first, red second). The legend is hidden.
# NOTE(review): stat = "sum" is kept from the original -- confirm it is
# intended rather than stat = "identity".
plotly <- lane14sorted_simple %>%
  filter(protein_start >= 99 & protein_start <= 650 & ct >= 3) %>%
  mutate(mutant = paste(protein_start, alt_aa)) %>%
  ggplot(mapping = aes(x = protein_start, y = maf)) +
  geom_bar(mapping = aes(fill = error_status), stat = "sum") +
  scale_fill_manual(values = c("blue", "red")) +
  theme_bw() +
  theme(legend.position = "none")
ggplotly(plotly)
# library(ggplot2)
# il3D0.D2$alt_aa=factor(il3D0.D2$alt_aa,levels=c("P","G","Y","W","F","V","L","I","A","T","S","Q","N","M","C","E","D","R","K","H"))

# Heat map of single-residue substitutions at residues 242-494:
# x = residue, y = mutant amino acid, fill = read count (ct).
#
# Changes relative to the earlier version of this plot:
# * The x limits are widened by half a tile on each side (241.5-494.5).
#   With limits of 242-493 the tiles at both edges, and every tile at
#   residue 494, fell outside the scale and were dropped; that was the
#   "Removed 13 rows containing missing values (geom_tile)" warning.
# * The legend is titled after the variable actually mapped to fill (ct).
#   NOTE(review): it used to read "MAF"; if MAF is what should be shown,
#   map fill = maf instead and restore the old title.
# * scale_color_manual() was removed: no colour aesthetic is mapped, so it
#   had no effect.
ggplot(
  lane14sorted_simple %>%
    filter(
      nchar(as.character(alt_aa)) %in% 1,
      protein_start >= 242,
      protein_start <= 494
    ),
  aes(x = protein_start, y = alt_aa, fill = ct)
) +
  geom_tile() +
  theme(panel.background = element_rect(fill = "gray", colour = "black")) +
  scale_fill_gradient2(low = "darkblue", high = "red", name = "Count") +
  scale_x_continuous(
    name = "Residue on the ABL Kinase",
    limits = c(241.5, 494.5),
    expand = c(0, 0),
    breaks = c(248, 250, 256, 271, 275, 300, 325, 350, 363, 375, 381, 400,
               405, 425, 450, 475)
  ) +
  ylab("Mutant Amino Acid")
# Subset of calls at residues 242-494 whose alt_aa is a single character
# and whose consequence is annotated as a missense variant.
a <- lane14sorted_simple %>%
  filter(
    nchar(as.character(alt_aa)) %in% 1 &
      protein_start >= 242 &
      protein_start <= 494 &
      consequence_terms %in% "missense_variant"
  )
Next, I look at how much the error rate is reduced by duplex sequencing, compared with raw NGS reads and single-strand consensus (SSCS) calls.
# Read the annotated variant calls of lane 15 / sample 3 at three consensus
# levels and tag each table with the level it came from.
ngs <- read.csv(
  file = "data/Consensus_Data/Novogene_lane15/sample_3/ngs/variants_unique_ann.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
ngs$consensus <- "NGS"
sscs <- read.csv(
  file = "data/Consensus_Data/Novogene_lane15/sample_3/sscs/variant_caller_outputs/variants_unique_ann.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
sscs$consensus <- "SSCS"
duplex <- read.csv(
  file = "data/Consensus_Data/Novogene_lane15/sample_3/duplex/variant_caller_outputs/variants_unique_ann.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
duplex$consensus <- "Duplex"

# Stack the three tables (rbind requires them to share the same columns).
# NOTE(review): the "sample10_duplex" name is historical -- the data are
# sample 3 of lane 15 and include NGS and SSCS calls, not only duplex.
sample10_duplex <- rbind(ngs, sscs, duplex)
# sample10_duplex=read.csv(file = "data/Consensus_Data/Novogene_lane14/sample10_combined/duplex/variant_caller_outputs/variants_unique_ann.csv",header=T,stringsAsFactors = F)

# Split the "ref/alt" amino-acid annotation and compute the mutant allele
# frequency. The split is vectorised (no rowwise()), so the result is a
# plain, ungrouped data frame; entries without a second element (or NA)
# yield NA, exactly as the per-row `[[1]][2]` indexing did.
sample10_duplex <- sample10_duplex %>%
  mutate(
    ref_aa = vapply(
      strsplit(as.character(amino_acids), "/", fixed = TRUE),
      `[`, character(1), 1L,
      USE.NAMES = FALSE
    ),
    alt_aa = vapply(
      strsplit(as.character(amino_acids), "/", fixed = TRUE),
      `[`, character(1), 2L,
      USE.NAMES = FALSE
    ),
    maf = ct / depth
  )

# Keep only the plotting columns and flag residues 1-241 and 495-700.
# `%in%` never returns NA, so no case_when() fallback is needed.
# NOTE(review): presumably these residues lie outside the region of interest
# (242-494), so calls there are treated as errors -- confirm.
sample10_duplex_simple <- sample10_duplex %>%
  dplyr::select(
    alt_start_pos, protein_start, ref, alt, ref_aa, alt_aa,
    consequence_terms, ct, depth, maf, consensus
  ) %>%
  mutate(error_status = protein_start %in% c(1:241, 495:700))
# Interactive bar chart of MAF against protein position (residues 99-600)
# for the pooled NGS / SSCS / Duplex calls; the bar outline is coloured by
# error_status and the legend is hidden.
#
# The colours are given as a named vector so that FALSE is always blue and
# TRUE is always red. The unnamed c("red", "blue") used here before assigned
# them the other way round from the other plots in this file.
plotly <- ggplot(
  sample10_duplex_simple %>%
    filter(protein_start >= 99, protein_start <= 600) %>%
    mutate(mutant = paste(protein_start, alt_aa)),
  aes(x = protein_start, y = maf, color = error_status)
) +
  geom_col() +
  scale_color_manual(values = c(`FALSE` = "blue", `TRUE` = "red")) +
  theme_bw() +
  theme(legend.position = "none")
ggplotly(plotly)
# Fix the facet order: NGS, then SSCS, then Duplex.
sample10_duplex_simple$consensus <- factor(
  sample10_duplex_simple$consensus,
  levels = c("NGS", "SSCS", "Duplex")
)

# Interactive MAF-by-position chart, one facet per consensus level, for
# residues 118-600 with ct >= 2.
# NOTE(review): residues 411, 493, 321 and 417 are excluded without an
# explanation in the code -- confirm why.
#
# `position` is a layer argument, not an aesthetic. Written inside aes() it
# was ignored ("Ignoring unknown aesthetics: position") and the bars were
# stacked. It is now passed to geom_col() directly, and `mutant` is used as
# the grouping variable so that different substitutions at the same residue
# are drawn side by side instead of being summed.
# The y limits drop bars with MAF above 0.01; ggplot2 reports those as
# removed rows.
plotly <- ggplot(
  sample10_duplex_simple %>%
    filter(
      protein_start >= 118,
      protein_start <= 600,
      !protein_start %in% c(411, 493, 321, 417),
      ct >= 2
    ) %>%
    mutate(mutant = paste(protein_start, alt_aa)),
  aes(x = protein_start, y = maf, color = error_status, group = mutant)
) +
  geom_col(position = "dodge") +
  facet_wrap(~consensus, ncol = 1) +
  scale_color_manual(values = c(`FALSE` = "blue", `TRUE` = "red")) +
  theme_bw() +
  scale_x_continuous(name = "Position on ABL") +
  scale_y_continuous(limits = c(0, .01), name = "MAF") +
  theme(legend.position = "none")
ggplotly(plotly)
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
# Static MAF-by-position chart, one facet per consensus level, for residues
# 118-600 with ct >= 2.
# NOTE(review): residues 411, 493, 321 and 417 are excluded without an
# explanation in the code -- confirm why.
#
# `position` is a layer argument, not an aesthetic. Written inside aes() it
# was ignored ("Ignoring unknown aesthetics: position") and the bars were
# stacked. It is now passed to geom_col() directly, and `mutant` is used as
# the grouping variable so that different substitutions at the same residue
# are drawn side by side instead of being summed.
# The y limits drop bars with MAF above 0.01; ggplot2 reports those as
# removed rows.
ggplot(
  sample10_duplex_simple %>%
    filter(
      protein_start >= 118,
      protein_start <= 600,
      !protein_start %in% c(411, 493, 321, 417),
      ct >= 2
    ) %>%
    mutate(mutant = paste(protein_start, alt_aa)),
  aes(x = protein_start, y = maf, color = error_status, group = mutant)
) +
  geom_col(position = "dodge") +
  facet_wrap(~consensus, ncol = 1) +
  scale_color_manual(values = c(`FALSE` = "blue", `TRUE` = "red")) +
  theme_bw() +
  scale_x_continuous(name = "Position on ABL") +
  scale_y_continuous(limits = c(0, .01), name = "MAF") +
  theme(legend.position = "none")
# ggsave("errorrates.pdf",width=8,height=8,units="in",useDingbats=F)