Post-APA calling: Imputation and QC#
Aim#
This notebook is designed to impute the missing values in the PDUI matrix, and perform quantile normailization for the impute values.
Input#
raw PDUI matrix (row as gene, columns as sample id)
covariate file
Output#
PDUI matrix without missingness
The missing value is calculated using
imputepackage
Minimum working example#
sos run /mnt/mfs/statgen/ls3751/github/xqtl-protocol/pipeline/molecular_phenotypes/QC/apa_impute.ipynb APAimpute \
--cwd /mnt/mfs/statgen/ls3751/MWE_dapars2/Output \
--cov /data/example.cov.txt
--chrlist chr1 \
--container /mnt/mfs/statgen/ls3751/container/dapars2.sif
Workflow implementation#
[global]
# FIXME: this entire section must be rewritten using conventions as in other pipelines.
parameter: walltime = '40h'
parameter: mem = '32G'
parameter: ncore = 16
parameter: cwd = path
parameter: thread = 8
parameter: job_size = 1
parameter: container = ''
[APAimpute]
parameter: chrlist = list
input: [f'{cwd}/apa_{x}/Dapars_result_result_temp.{x}.txt' for x in chrlist]
output: [f'{cwd}/apa_{x}/Dapars_result_impute_{x}.bed' for x in chrlist]
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = ncore
R: expand= "${ }", container = container
.libPaths( c('/usr/local/lib/R/site-library' , '/usr/lib/R/site-library', '/usr/lib/R/library', .libPaths()))
suppressPackageStartupMessages(require(dplyr))
suppressPackageStartupMessages(require(tidyr))
suppressPackageStartupMessages(require(magrittr))
suppressPackageStartupMessages(require(impute))
suppressPackageStartupMessages(require(readr))
suppressPackageStartupMessages(require(stringr))
chr = str_extract_all("${chrlist}", pattern = "chr\\d+")[[1]]
apalist = paste0("${cwd}", "/apa_", chr,"/Dapars_result_result_temp.", chr, ".txt")
merge_apa = function(file) {
df = data.table::fread(file, colClasses = 'character')
df = df %>% select(1,2,3,4, order(colnames(df)[5:ncol(df)]))
return(df)
}
dapars_result = do.call("rbind", mapply(merge_apa, apalist, SIMPLIFY = FALSE))
tmp = dapars_result[,1:4]
v1 = v2 = v3 = NULL
for (i in 1:nrow(tmp)) {
chro = str_split(tmp$Loci[i], ":")[[1]][1]
spos = str_split(str_split(tmp$Loci[i], ":")[[1]][2],"-")[[1]][1]
epos = str_split(str_split(tmp$Loci[i], ":")[[1]][2],"-")[[1]][2]
v1 = c(v1 , chro)
v2 = c(v2, spos)
v3 = c(v3, epos)
}
tmp = tmp %>% mutate(`#chr` = v1, start = v2, end = v3) %>% select(`#chr`, start, end, Gene)
dapars_result = dapars_result[,-c(2:4)]
tmp_vec = pull(dapars_result, Gene)
dapars_result = dapars_result[,-1]
id_vec = NULL
for (i in 1:length(colnames(dapars_result))) {
id = tail(str_split((colnames(dapars_result))[i], "/")[[1]],1)
id = str_split(id,"_")[[1]][1]
id_vec = c(id_vec ,id)
}
colnames(dapars_result) = id_vec
dapars_result = as.matrix(dapars_result)
rownames(dapars_result) = tmp_vec
dapars_result = dapars_result[,colMeans(is.na(dapars_result)) <= 0.8]
dapars_result = dapars_result[rowMeans(is.na(dapars_result)) < 0.5,]
class(dapars_result) = "numeric"
dapars_impute = impute.knn(dapars_result)
df = as.data.frame(dapars_impute$data)
for (gene in 1:nrow(df)) {
mat = apply(df[gene,], 1, rank, ties.method = "average")
mat = qnorm(mat/ (ncol(df) + 1))
df[gene, ] = mat
}
df$Gene = rownames(dapars_result)
final_data <- inner_join(tmp, df)
write_delim(final_data, file = paste0("${cwd}","/Dapars_allchrom.bed"), "\t")
for (i in chr) {
select_data = final_data %>% filter(`#chr` == i)
write.table(select_data, file = paste0("${cwd}", "/apa_", i,"/Dapars_result_impute_", i, ".bed"), quote = F, row.names = F)
}
[APArename_1]
parameter: match = path
parameter: chrlist = list
input: [f'{cwd}/apa_{x}/Dapars_result_impute_{x}.bed' for x in chrlist], group_by = 1
output: [f'{cwd}/apa_{x}/Dapars_result_impute_renamed_{x}.bed' for x in chrlist], group_by = 1
R: expand= "${ }", container = container
.libPaths( c('/usr/local/lib/R/site-library' , '/usr/lib/R/site-library', '/usr/lib/R/library', .libPaths()))
suppressPackageStartupMessages(require(dplyr))
suppressPackageStartupMessages(require(tidyr))
suppressPackageStartupMessages(require(readr))
input_dir <- ${_input:r}
df = data.table::fread(input_dir, colClasses = 'character')
ref = data.table::fread("${match}", colClasses = 'character')
tmp = NULL
for (i in colnames(df)[5:ncol(df)]){
tmp = c(tmp, as.character(ref[which(ref$ProjID == i),2]))
}
colnames(df)[5:ncol(df)] = tmp
df = df %>% group_by(`#chr`) %>% arrange(start, .by_group = TRUE) %>% mutate(end = start + 1)
write_delim(df, file = ${_output:r}, "\t")
[APArename_2]
parameter: match = path
parameter: chrlist = list
input: [f'{cwd}/apa_{x}/Dapars_result_impute_renamed_{x}.bed' for x in chrlist], group_by = 1
output: [f'{cwd}/apa_{x}/Dapars_result_impute_renamed_{x}.bed.gz' for x in chrlist], group_by = 1
bash: expand= "${ }"
bgzip -f ${_input:r}
tabix -p bed ${_output:r} -f
gzip -dfk ${_output:r}
[APArename_3]
parameter: cwd = "/mnt/mfs/statgen/ls3751/aqtl_analysis/wig/AC"
parameter: container = "/mnt/mfs/statgen/ls3751/container/dapars2_final.sif"
parameter: match = "/mnt/mfs/statgen/ls3751/aqtl_analysis/ROSMAP_APA_matchtable.txt" #path
input: f'{cwd}/Dapars_allchrom.bed'
output: f'{cwd}/Dapars_allchrom_renamed.bed'
R: expand= "${ }", container = container
.libPaths( c('/usr/local/lib/R/site-library' , '/usr/lib/R/site-library', '/usr/lib/R/library', .libPaths()))
suppressPackageStartupMessages(require(dplyr))
suppressPackageStartupMessages(require(tidyr))
suppressPackageStartupMessages(require(readr))
input_dir <- ${_input:r}
df = read_delim(input_dir, col_types = list(col_character()))
ref = data.table::fread("${match}", colClasses = 'character')
tmp = NULL
for (i in colnames(df)[5:ncol(df)]){
tmp = c(tmp, as.character(ref[which(ref$ProjID == i),2]))
}
colnames(df)[5:ncol(df)] = tmp
df = df %>% group_by(`#chr`) %>% arrange(start, .by_group = TRUE) # %>% mutate(end = start + 1)
write_delim(df, file = paste0("${cwd}","/Dapars_allchrom_renamed.bed"), "\t")
[APArename_4]
output: f'{_input}.gz', f'{_input}.gz.tbi'
bash: expand = "${ }"
bgzip -f ${_input}
tabix -p bed ${_output[0]} -f
gzip -dfk ${_output[0]}