# Load the UK Biobank phenotype tables and merge them into a single
# data frame, "dat", with one row per sample id ("eid") that is present
# in all three input files.
library(data.table)
library(dplyr)

# SCRIPT SETTINGS
# ---------------
# The three phenotype files that are read and merged by "eid".
input.file1 <- file.path("/gpfs/data/xhe-lab/uk-biobank/data/phenotypes",
                         "12-feb-2019","ukb26140.csv.gz")
input.file2 <- file.path("/gpfs/data/xhe-lab/uk-biobank/data/phenotypes",
                         "11-jun-2019","ukb32141.csv.gz")
input.file3 <- file.path("/gpfs/data/xhe-lab/uk-biobank/data/phenotypes",
                         "16-oct-2020","ukb44231.csv.gz")

# The CSV file the prepared data are written to at the end of the script.
output.file <- file.path("/gpfs/data/stephens-lab/finemap-uk-biobank",
                         "data/raw/BloodCells/bloodcells.csv")

# "cols" lists the column headers to extract from the merged table, and
# "col_names" lists, in the same order, the names these columns are
# given after they are extracted.
cols <- c("eid","31-0.0","54-0.0","21022-0.0","22006-0.0",
          "22001-0.0","22000-0.0","22005-0.0",paste0("22009-0.",1:10),
          "22021-0.0", "22027-0.0",
          "30000-0.0", "30010-0.0", "30020-0.0", "30040-0.0",
          "30070-0.0", "30080-0.0", "30090-0.0", "30110-0.0",
          "30180-0.0", "30190-0.0", "30200-0.0", "30210-0.0",
          "30220-0.0", "30240-0.0", "30270-0.0", "30290-0.0",
          paste0("41202-0.", 0:379), "3140-0.0")
col_names <- c("id","sex","assessment_centre","age","ethnic_genetic",
               "sex_genetic","genotype_measurement_batch","missingness",
               paste0("pc_genetic",1:10),
               "kinship_genetic", "outliers",
               "WBC_count", "RBC_count", "Haemoglobin", "MCV",
               "RDW", "Platelet_count", "Plateletcrit", "PDW",
               "Lymphocyte_perc", "Monocyte_perc", "Neutrophill_perc",
               "Eosinophill_perc", "Basophill_perc", "Reticulocyte_perc",
               "MSCV", "HLR_perc",
               paste0('ICD10.',0:379), "pregnancy")

# The two vectors are paired by position, so they must have the same
# length.
stopifnot(length(cols) == length(col_names))

# LOAD DATA
# ---------
# Read one phenotype file, keeping every column as text. Setting
# data.table = FALSE makes fread return a plain data frame, which
# avoids having to overwrite the class of a data.table afterwards.
read_phenotype_table <- function (file) {
  fread(file,sep = ",",header = TRUE,verbose = FALSE,
        showProgress = FALSE,colClasses = "character",
        data.table = FALSE)
}

cat("Reading data from the CSV files.\n")
out <- system.time({
  dat1 <- read_phenotype_table(input.file1)
  dat2 <- read_phenotype_table(input.file2)
  dat3 <- read_phenotype_table(input.file3)
})
cat(sprintf("Data loading step took %d seconds.\n",round(out["elapsed"])))

# Keep only the samples that appear in all three tables.
dat12 <- inner_join(dat1,dat2,by = "eid")
dat   <- inner_join(dat12,dat3,by = "eid")

# Free the memory taken up by the intermediate tables.
rm(dat1,dat2,dat3,dat12,read_phenotype_table)
cat(sprintf("Merged table contains %d rows.\n",nrow(dat)))
## There are 502492 samples

# PREPARE DATA
# ------------
# Keep only the requested columns, and give them more readable names.
cat("Preparing data.\n")
dat <- dat[,cols]
names(dat) <- col_names

# Every column was read as text. In all columns except the first (which
# holds the sample ids), turn empty strings into NA, then convert the
# column to numeric unless it is one of the ICD10 columns, which stay
# as text.
is_icd10 <- grepl("ICD10",colnames(dat))
for (j in seq_along(dat)[-1]) {
  values <- dat[[j]]
  values[values == ""] <- NA_character_
  if (!is_icd10[j]) {
    values <- as.numeric(values)
  }
  dat[[j]] <- values
}

# Remove any samples that are not marked as being "White British".
# This step should filter out 92887 rows.
dat <- filter(dat,ethnic_genetic == 1)
cat(sprintf("After removing non White British, %d rows remain.\n",nrow(dat)))

# Remove all rows in which one or more of the values are missing,
# aside from those in the "outliers", "ICD10" and "pregnancy" columns.
# The "outliers" column has value 1 when the sample is an outlier, and
# NA otherwise. The "pregnancy" column has value NA for males.
# This step should filter out 18578 rows.
na_exempt <- paste(c("ICD10","outliers","pregnancy"),collapse = "|")
cols <- !grepl(na_exempt,names(dat))
rows <- which(complete.cases(dat[,cols]))
dat  <- dat[rows,]
cat(sprintf("After removing rows with NAs, %d rows remain.\n",nrow(dat)))

# Remove rows with mismatches between self-reported and genetic sex.
# This step should filter out 287 rows.
dat <- filter(dat,sex == sex_genetic)
cat(sprintf("After removing sex mismatches, %d rows remain.\n",nrow(dat)))

# Remove "missingness" and "heterozygosity" outliers as defined by UK
# Biobank. This step should filter out 665 rows. Note that this step
# will remove any samples in which the "missingness" column is greater
# than 5%.
dat <- filter(dat,is.na(outliers))
cat(sprintf("After removing outliers, %d rows remain.\n",nrow(dat)))

# Remove any individuals that have at least one relative based on the
# kinship calculations. This step should filter out 126,236 rows.
dat <- dat %>% filter(kinship_genetic == 0)
cat(sprintf(paste("After removing relatedness individuals based on kinship,",
                  "%d rows remain.\n"),nrow(dat)))

# Remove any pregnant individuals; these are the rows in which the
# "pregnancy" column is 1 or 2. Rows in which it is NA are kept.
# This step should filter out 164 rows.
dat <- dat %>% filter(!(pregnancy %in% c(1,2)))
cat(sprintf(paste("After removing pregnant individuals,",
                  "%d rows remain.\n"),nrow(dat)))

# Remove any individuals with blood related diseases; that is, any row
# in which at least one of the ICD10 columns matches one of the codes
# listed below. This step should filter out 6070 rows.
icd10 <- c('C94', 'C95', 'Z856', "C901", "C914", "C82", "C83", 'C84', "C85",
           "Z948", "Z511", "Z512", "Z542", "D46", paste0("D", 55:64),
           paste0("B", 20:24), "N180", "Z992", "Z491", "Z492", "K74",
           "C88", "C900", "C902", "C91", "C92", "D45", "D47", "E831")
icd10_pattern <- paste(icd10,collapse = '|')
daticd10 <- as.matrix(dat[,grepl('ICD10',colnames(dat)),drop = FALSE])

# grepl returns a plain vector (FALSE for the NA entries), so reshape
# the result back into a samples x ICD10-columns matrix.
icd_status <- matrix(grepl(icd10_pattern,daticd10),
                     nrow(daticd10),ncol(daticd10))
dat <- dat %>% filter(rowSums(icd_status) == 0)
cat(sprintf(paste("After removing individuals with blood diseases,",
                  "%d rows remain.\n"),nrow(dat)))

# Remove individuals with "abnormal" measurements.
# This step should filter out 8625 rows.
pheno_names <- c("WBC_count", "RBC_count", "Haemoglobin", "MCV", "RDW",
                 "Platelet_count", "Plateletcrit", "PDW", "Lymphocyte_perc",
                 "Monocyte_perc", "Neutrophill_perc", "Eosinophill_perc",
                 "Basophill_perc", "Reticulocyte_perc", "MSCV", "HLR_perc")

## RINT: replace each measurement with the normal quantile of its
## rank. Note that this overwrites the measurements stored in "dat"
## with their transformed values.
for (name in pheno_names) {
  y <- dat[[name]]
  dat[[name]] <- qnorm((rank(y,na.last = "keep") - 0.5)/sum(!is.na(y)))
}

## Compute the empirical covariance matrix of the transformed
## measurements. The columns are selected by name with base indexing,
## which avoids passing a bare character vector to dplyr::select.
pheno <- as.matrix(dat[,pheno_names,drop = FALSE])
covy  <- cov(pheno)

## Compute the squared Mahalanobis distance of each sample from zero.
D2 <- stats::mahalanobis(pheno,center = 0,cov = covy)

## Keep the samples whose distance is below the upper 1% quantile of
## the chi-square distribution with one degree of freedom per
## measurement. which() ensures that a missing distance drops the row
## instead of introducing a row of NAs.
threshold <- qchisq(0.01,df = length(pheno_names),lower.tail = FALSE)
dat <- dat[which(D2 < threshold),]
cat(sprintf(paste("After removing individuals with abnormal measurements,",
                  "%d rows remain.\n"),nrow(dat)))

# Finally, remove the columns that are no longer needed for subsequent
# analyses.
# These columns were only needed to filter the samples.
cols.to.remove <- c("sex_genetic","ethnic_genetic", "missingness",
                    "kinship_genetic","outliers","pregnancy",
                    paste0("ICD10.", 0:379))
cols <- which(!is.element(names(dat),cols.to.remove))

# Setting drop = FALSE guarantees that the result is a data frame
# regardless of how many columns are kept.
dat <- dat[,cols,drop = FALSE]

# SUMMARIZE DATA
# --------------
# Double-check that everything looks okay. The summary is printed
# explicitly so that it is also shown when this script is run with
# source(), which does not auto-print top-level values by default.
print(summary(dat))

# WRITE DATA TO FILE
# ------------------
# Missing values are written as "NA", and no values are quoted.
cat("Writing prepared data to CSV file.\n")
write.csv(dat,output.file,row.names = FALSE,quote = FALSE,na = 'NA')