Evaluate results with/without noisy training labels

Extract data from the top 10 genes identified
Fitting using top 10 genes before/after removing PC outliers
fitting top 10 genes without PC outliers
Cell time properties before/after removing PC outliers (intensity PVE)
Re-fitting after removing PC outliers from the same 5 folds, top 101 genes
Fitting including PC outliers
Session information

Last updated: 2018-07-03

Code version: 2acec0f

Extract data from the top 10 genes identified

library(Biobase)
library(circular)
source("../peco/R/cycle.corr.R")
source("../peco/R/cycle.npreg.R")

df <- readRDS(file="../data/eset-final.rds")
pdata <- pData(df)
fdata <- fData(df)

# select endogeneous genes
counts <- exprs(df)[grep("ENSG", rownames(df)), ]

log2cpm.all <- t(log2(1+(10^6)*(t(counts)/pdata$molecules)))

# select external validation samples
log2cpm.quant <- readRDS("../output/npreg-trendfilter-quantile.Rmd/log2cpm.quant.rds")

set.seed(99)
nvalid <- round(ncol(log2cpm.quant)*.15)
ii.valid <- sample(1:ncol(log2cpm.quant), nvalid, replace = F)
ii.nonvalid <- setdiff(1:ncol(log2cpm.quant), ii.valid)

log2cpm.quant.nonvalid <- log2cpm.quant[,ii.nonvalid]
log2cpm.quant.valid <- log2cpm.quant[,ii.valid]

sig.genes <- readRDS("../output/npreg-trendfilter-quantile.Rmd/out.stats.ordered.sig.101.rds")
expr.sig <- log2cpm.quant.nonvalid[rownames(log2cpm.quant.nonvalid) %in% rownames(sig.genes)[1:10], ]


# get predicted times
# set training samples
source("../peco/R/primes.R")
source("../peco/R/partitionSamples.R")
parts <- partitionSamples(1:ncol(log2cpm.quant.nonvalid), runs=5,
                          nsize.each = rep(151,5))
part_indices <- parts$partitions

Fitting using top 10 genes before/after removing PC outliers

# first check the theta in pdata
pca <- prcomp(cbind(pdata$rfp.median.log10sum.adjust,
                    pdata$gfp.median.log10sum.adjust), scale=TRUE)
pca_df <- cbind(pca$x[,1],pca$x[,2])
rownames(pca_df) <- rownames(pdata)

theta_check <- as.numeric(coord2rad(pca_df))
theta_check <- 2*pi-theta_check
plot(theta_check, pdata$theta)

names(theta_check) <- rownames(pdata)

fits_top10 <- vector("list", 5)
for (run in 1:5) {
  print(run)
  # fitting training data
  Y_train <- expr.sig[,part_indices[[run]]$train]
  theta_train <- theta_check[match(colnames(Y_train), rownames(pdata))]
  names(theta_train) <- colnames(Y_train) 

  fit.train <- cycle.npreg.insample(Y = Y_train, 
                                    theta = theta_train, 
                                    ncores=12,
                                    method.trend="trendfilter")
  # fitting test data
  Y_test <- expr.sig[,part_indices[[run]]$test]
  theta_test <- theta_check[match(colnames(Y_test), rownames(pdata))]
  names(theta_test) <- colnames(Y_test) 

  fit.test <- cycle.npreg.outsample(Y_test=Y_test,
                                    sigma_est=fit.train$sigma_est,
                                    funs_est=fit.train$funs_est,
                                    method.grid = "uniform",
                                    method.trend="trendfilter",
                                    polyorder=2,
                                    ncores=12)
  
  fits_top10[[run]] <- list(fit.train=fit.train,
                      fit.test=fit.test,
                      theta_test=theta_test)
}

for (i in 1:5) {
  fits_top10[[i]]$theta_est_shift <- rotation(fits_top10[[i]]$theta_test, fits_top10[[i]]$fit.test$cell_times_est)$y2shift
}
  
saveRDS(fits_top10, file = "../output/method-train-labels.Rmd/fits_top10.rds")


#fits_top10 <- readRDS(file = "../output/method-train-labels.Rmd/fits_top10.rds")
source("../peco/R/utility.R")

diff_time <- lapply(1:length(fits_top10), function(i) {
  pmin(abs(fits_top10[[i]]$theta_est_shift-fits_top10[[i]]$theta_test),
    abs(fits_top10[[i]]$theta_est_shift-(2*pi-fits_top10[[i]]$theta_test)))
})

pve <- lapply(1:length(fits_top10), function(i) {
  dapi <- pdata$dapi.median.log10sum[match(names(fits_top10[[i]]$theta_test),rownames(pdata))]
  get.pve(dapi[order(fits_top10[[i]]$theta_est_shift)])
})

save(diff_time, pve,
     file="../output/method-train-labels.Rmd/modelresults_top10.rda")

load(file="../output/method-train-labels.Rmd/modelresults_top10.rda")
mean(sapply(diff_time, mean)/2/pi)

[1] 0.08688909

mean(unlist(pve))

[1] 0.2419089

explore PCA outliers

# first check the theta in pdata
pca <- prcomp(cbind(pdata$rfp.median.log10sum.adjust,
                    pdata$gfp.median.log10sum.adjust), scale=TRUE)
pca_df <- cbind(pca$x[,1],pca$x[,2])
rownames(pca_df) <- rownames(pdata)

theta_check <- as.numeric(coord2rad(pca_df))
theta_check <- 2*pi-theta_check
plot(theta_check, pdata$theta)

names(theta_check) <- rownames(pdata)

dist_to_origin <- sqrt(pca_df[,1]^2+pca_df[,2]^2)
which_out <- rownames(pdata)[which(scale(dist_to_origin) < -1)]

plot(pca_df[,1],
     pca_df[,2],
     col=c("gray50", "forestgreen")[(scale(dist_to_origin) < -1)+1], pch=16,
     xlab="PC1",
     ylab="PC2",
     main="Distance-to-Origin < -1 SD")

fitting top 10 genes without PC outliers

fits_sub_top10 <- vector("list", 5)
for (run in 1:5) {
  print(run)
  # fitting training data
  Y_train <- expr.sig[,part_indices[[run]]$train]
  theta_train <- theta_check[match(colnames(Y_train), rownames(pdata))]
  names(theta_train) <- colnames(Y_train) 
  
  Y_train_sub <- Y_train[,which(!(colnames(Y_train) %in% which_out))]
  theta_train_sub <- theta_train[which(!(names(theta_train) %in% which_out))]

  fit.train <- cycle.npreg.insample(Y = Y_train_sub, 
                                    theta = theta_train_sub, 
                                    ncores=12,
                                    polyorder=2,
                                    method.trend="trendfilter")
  # fitting test data
  Y_test <- expr.sig[,part_indices[[run]]$test]
  theta_test <- theta_check[match(colnames(Y_test), rownames(pdata))]
  names(theta_test) <- colnames(Y_test) 

  Y_test_sub <- Y_test[,which(!(colnames(Y_test) %in% which_out))]
  theta_test_sub <- theta_test[which(!(names(theta_test) %in% which_out))]

  fit.test <- cycle.npreg.outsample(Y_test=Y_test_sub,
                                    sigma_est=fit.train$sigma_est,
                                    funs_est=fit.train$funs_est,
                                    method.grid = "uniform",
                                    method.trend="trendfilter",
                                    ncores=12)
  
  fits_sub_top10[[run]] <- list(fit.train=fit.train,
                      fit.test=fit.test,
                      theta_test=theta_test_sub)
}

for (i in 1:5) {
  fits_sub_top10[[i]]$theta_est_shift <- rotation(fits_sub_top10[[i]]$theta_test, fits_sub_top10[[i]]$fit.test$cell_times_est)$y2shift
}

saveRDS(fits_sub_top10, file = "../output/method-train-labels.Rmd/fits_sub_top10.rds")


diff_time <- lapply(1:5, function(i) {
  pmin(abs(fits_sub_top10[[i]]$theta_est_shift-fits_sub_top10[[i]]$theta_test),
    abs(fits_sub_top10[[i]]$theta_est_shift-(2*pi-fits_sub_top10[[i]]$theta_test)))
})

pve <- lapply(1:length(fits_sub), function(i) {
  dap <- pdata$dapi.median.log10sum[match(names(fits_sub_top10[[i]]$theta_test),rownames(pdata))]
  get.pve(dap[order(fits_sub_top10[[i]]$theta_est_shift)])
})

save(diff_time, pve,
     file="../output/method-train-labels.Rmd/modelresults_excludeoutlier_top10.rda")

load(file="../output/method-train-labels.Rmd/modelresults_excludeoutlier_top10.rda")
mean(sapply(diff_time, mean)/2/pi)

[1] 0.08253012

mean(unlist(pve))

[1] 0.2916538

Cell time properties before/after removing PC outliers (intensity PVE)

rfp_theta <- with(pdata,
                  get.pve(rfp.median.log10sum.adjust[order(theta)]))
gfp_theta <- with(pdata,
                  get.pve(gfp.median.log10sum.adjust[order(theta)]))
dapi_theta <- with(pdata,
                  get.pve(dapi.median.log10sum.adjust[order(theta)]))

save(rfp_theta,
     gfp_theta, 
     dapi_theta, 
     file = "../output/method-train-labels.Rmd/pve_include_pc_outlier.rda")


rfp_theta_sub <- with(pdata[which(!(rownames(pdata) %in% which_out)),],
                  get.pve(rfp.median.log10sum.adjust[order(theta)]))

gfp_theta_sub <- with(pdata[which(!(rownames(pdata) %in% which_out)),],
                  get.pve(gfp.median.log10sum.adjust[order(theta)]))

dapi_theta_sub <- with(pdata[which(!(rownames(pdata) %in% which_out)),],
                  get.pve(dapi.median.log10sum.adjust[order(theta)]))

save(rfp_theta_sub,
     gfp_theta_sub, 
     dapi_theta_sub, 
     file = "../output/method-train-labels.Rmd/pve_no_pc_outlier.rda")

load(file="../output/method-train-labels.Rmd/pve_no_pc_outlier.rda")
c(rfp_theta_sub,
     gfp_theta_sub, 
     dapi_theta_sub)

$pve
[1] 0.8920721

$pval
[1] 0

$pve
[1] 0.7175765

$pval
[1] 0

$pve
[1] 0.2673241

$pval
[1] 1.4491e-05

load(file="../output/method-train-labels.Rmd/pve_include_pc_outlier.rda")
c(rfp_theta$pve,
     gfp_theta$pve, 
     dapi_theta$pve)

[1] 0.8694718 0.6988123 0.2274918

Re-fitting after removing PC outliers from the same 5 folds, top 101 genes

expr_sub <- log2cpm.quant.nonvalid[rownames(log2cpm.quant.nonvalid) %in% rownames(sig.genes), ]
fits_sub_top101 <- vector("list", 5)
for (run in 1:5) {
  print(run)
  # fitting training data
  Y_train <- expr_sub[,part_indices[[run]]$train]
  theta_train <- theta_check[match(colnames(Y_train), rownames(pdata))]
  names(theta_train) <- colnames(Y_train)
  
  Y_train_sub <- Y_train[,which(!(colnames(Y_train) %in% which_out))]
  theta_train_sub <- theta_train[which(!(names(theta_train) %in% which_out))]

  fit.train <- cycle.npreg.insample(Y = Y_train_sub, 
                                    theta = theta_train_sub, 
                                    ncores=20,
                                    method.trend="trendfilter")
  # fitting test data
  Y_test <- expr_sub[,part_indices[[run]]$test]
  theta_test <- theta_check[match(colnames(Y_test), rownames(pdata))]
  names(theta_test) <- colnames(Y_test) 

  Y_test_sub <- Y_test[,which(!(colnames(Y_test) %in% which_out))]
  theta_test_sub <- theta_test[which(!(names(theta_test) %in% which_out))]

  fit.test <- cycle.npreg.outsample(Y_test=Y_test_sub,
                                    sigma_est=fit.train$sigma_est,
                                    funs_est=fit.train$funs_est,
                                    method.grid = "uniform",
                                    method.trend="trendfilter",
                                    polyorder=2,
                                    ncores=20)
  
  fits_sub_top101[[run]] <- list(fit.train=fit.train,
                      fit.test=fit.test,
                      theta_test=theta_test_sub)
}

for (i in 1:5) {
  fits_sub_top101[[i]]$theta_est_shift <- rotation(fits_sub_top101[[i]]$theta_test, fits_sub_top101[[i]]$fit.test$cell_times_est)$y2shift
}

saveRDS(fits_sub_top101, file = "../output/method-train-labels.Rmd/fits_sub_top101.rds")


diff_time <- lapply(1:5, function(i) {
  pmin(abs(fits_sub_top101[[i]]$theta_est_shift-fits_sub_top101[[i]]$theta_test),
    abs(fits_sub_top101[[i]]$theta_est_shift-(2*pi-fits_sub_top101[[i]]$theta_test)))
})

pve <- lapply(1:length(fits_sub_top101), function(i) {
  dap <- pdata$dapi.median.log10sum[match(names(fits_sub_top101[[i]]$theta_test),
                                          rownames(pdata))]
  get.pve(dap[order(fits_sub_top101[[i]]$theta_est_shift)])
})

save(diff_time, pve,
     file="../output/method-train-labels.Rmd/modelresults_excludeoutlier_top101.rda")

load(file="../output/method-train-labels.Rmd/modelresults_excludeoutlier_top101.rda")
mean(sapply(diff_time, mean)/2/pi)

[1] 0.09344773

mean(unlist(pve))

[1] 0.2832532

Fitting including PC outliers

expr_sub <- log2cpm.quant.nonvalid[rownames(log2cpm.quant.nonvalid) %in% rownames(sig.genes), ]

fits_top101 <- vector("list", 5)
for (run in 1:5) {
  print(run)
  # fitting training data
  Y_train <- expr_sub[,part_indices[[run]]$train]
  theta_train <- theta_check[match(colnames(Y_train), rownames(pdata))]
  names(theta_train) <- colnames(Y_train) 
  
  # Y_train_sub <- Y_train[,which(!(colnames(Y_train) %in% which_out))]
  # theta_train_sub <- theta_train[which(!(names(theta_train) %in% which_out))]

  fit.train <- cycle.npreg.insample(Y = Y_train, 
                                    theta = theta_train, 
                                    ncores=12,
                                    polyorder=2,
                                    method.trend="trendfilter")
  # fitting test data
  Y_test <- expr_sub[,part_indices[[run]]$test]
  theta_test <- theta_check[match(colnames(Y_test), rownames(pdata))]
  names(theta_test) <- colnames(Y_test) 

  # Y_test_sub <- Y_test[,which(!(colnames(Y_test) %in% which_out))]
  # theta_test_sub <- theta_test[which(!(names(theta_test) %in% which_out))]

  fit.test <- cycle.npreg.outsample(Y_test=Y_test,
                                    sigma_est=fit.train$sigma_est,
                                    funs_est=fit.train$funs_est,
                                    method.grid = "uniform",
                                    method.trend="trendfilter",
                                    ncores=12)
  
  fits_top101[[run]] <- list(fit.train=fit.train,
                      fit.test=fit.test,
                      theta_test=theta_test)
}

for (i in 1:5) {
  fits_top101[[i]]$theta_est_shift <- rotation(fits_top101[[i]]$theta_test, fits_top101[[i]]$fit.test$cell_times_est)$y2shift
}

saveRDS(fits_top101, file = "../output/method-train-labels.Rmd/fits_top101.rds")


diff_time <- lapply(1:5, function(i) {
  pmin(abs(fits_top101[[i]]$theta_est_shift-fits_top101[[i]]$theta_test),
    abs(fits_top101[[i]]$theta_est_shift-(2*pi-fits_top101[[i]]$theta_test)))
})

pve <- lapply(1:length(fits_sub), function(i) {
  dap <- pdata$dapi.median.log10sum[match(names(fits_top101[[i]]$theta_test),rownames(pdata))]
  get.pve(dap[order(fits_top101[[i]]$theta_est_shift)])
})

save(diff_time, pve, 
     file = "../output/method-train-labels.Rmd/modelresults_top101.rda")

load(file = "../output/method-train-labels.Rmd/modelresults_top101.rda")
mean(sapply(diff_time, mean)/2/pi)

[1] 0.09344773

mean(unlist(pve))

[1] 0.2703996

Session information

sessionInfo()

R version 3.4.3 (2017-11-30)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Scientific Linux 7.4 (Nitrogen)

Matrix products: default
BLAS/LAPACK: /software/openblas-0.2.19-el7-x86_64/lib/libopenblas_haswellp-r0.2.19.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] circular_0.4-93     Biobase_2.38.0      BiocGenerics_0.24.0

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.17    mvtnorm_1.0-8   digest_0.6.15   rprojroot_1.3-2
 [5] backports_1.1.2 git2r_0.21.0    magrittr_1.5    evaluate_0.10.1
 [9] stringi_1.1.6   boot_1.3-20     rmarkdown_1.10  tools_3.4.3    
[13] stringr_1.2.0   yaml_2.1.16     compiler_3.4.3  htmltools_0.3.6
[17] knitr_1.20

This R Markdown site was created with workflowr

Evaluate results with/without noisy training labels

Joyce Hsiao

Extract data from the top 10 genes identified

Fitting using top 10 genes before/after removing PC outliers

fitting top 10 genes without PC outliers

Cell time properties before/after removing PC outliers (intensity PVE)

Re-fitting after removing PC outliers from the same 5 folds, top 101 genes

Fitting including PC outliers

Session information