Post-APA calling: Imputation and QC

Post-APA calling: Imputation and QC#

Aim#

This notebook is designed to impute the missing values in the PDUI matrix, and perform quantile normailization for the impute values.

Input#

  • raw PDUI matrix (row as gene, columns as sample id)

  • covariate file

Output#

  • PDUI matrix without missingness

    • The missing value is calculated using impute package

Minimum working example#

sos run /mnt/mfs/statgen/ls3751/github/xqtl-protocol/pipeline/molecular_phenotypes/QC/apa_impute.ipynb APAimpute \
    --cwd /mnt/mfs/statgen/ls3751/MWE_dapars2/Output \
    --cov /data/example.cov.txt
    --chrlist chr1 \
    --container /mnt/mfs/statgen/ls3751/container/dapars2.sif

Workflow implementation#

[global]
# FIXME: this entire section must be rewritten using conventions as in other pipelines.
parameter: walltime = '40h'
parameter: mem = '32G'
parameter: ncore = 16
parameter: cwd = path
parameter: thread = 8
parameter: job_size = 1
parameter: container = ''
[APAimpute]
parameter: chrlist = list
input: [f'{cwd}/apa_{x}/Dapars_result_result_temp.{x}.txt' for x in chrlist]
output: [f'{cwd}/apa_{x}/Dapars_result_impute_{x}.bed' for x in chrlist]
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = ncore
R: expand= "${ }", container = container
    .libPaths( c('/usr/local/lib/R/site-library' , '/usr/lib/R/site-library', '/usr/lib/R/library', .libPaths()))
    suppressPackageStartupMessages(require(dplyr))
    suppressPackageStartupMessages(require(tidyr))
    suppressPackageStartupMessages(require(magrittr))
    suppressPackageStartupMessages(require(impute))
    suppressPackageStartupMessages(require(readr))
    suppressPackageStartupMessages(require(stringr))
    chr = str_extract_all("${chrlist}", pattern = "chr\\d+")[[1]]
    apalist = paste0("${cwd}", "/apa_", chr,"/Dapars_result_result_temp.", chr, ".txt")
    merge_apa = function(file) {
        df = data.table::fread(file, colClasses = 'character')
        df = df %>% select(1,2,3,4, order(colnames(df)[5:ncol(df)]))
        return(df)
    }
    dapars_result = do.call("rbind", mapply(merge_apa, apalist, SIMPLIFY = FALSE))
    tmp = dapars_result[,1:4]
    v1 = v2 = v3 = NULL
    for (i in 1:nrow(tmp)) {
      chro = str_split(tmp$Loci[i], ":")[[1]][1]
      spos = str_split(str_split(tmp$Loci[i], ":")[[1]][2],"-")[[1]][1]
      epos = str_split(str_split(tmp$Loci[i], ":")[[1]][2],"-")[[1]][2]
      v1 = c(v1 , chro)
      v2 = c(v2, spos)
      v3 = c(v3, epos)
    }
    tmp = tmp %>% mutate(`#chr` = v1, start = v2, end = v3) %>% select(`#chr`, start, end, Gene)
    dapars_result = dapars_result[,-c(2:4)]
    
    tmp_vec = pull(dapars_result, Gene)
    dapars_result = dapars_result[,-1]
    id_vec = NULL
    for (i in 1:length(colnames(dapars_result))) {
      id = tail(str_split((colnames(dapars_result))[i], "/")[[1]],1)
      id = str_split(id,"_")[[1]][1]
      id_vec = c(id_vec ,id)
    }
    colnames(dapars_result) = id_vec
    dapars_result = as.matrix(dapars_result)
    rownames(dapars_result) = tmp_vec
    dapars_result = dapars_result[,colMeans(is.na(dapars_result)) <= 0.8]
    dapars_result = dapars_result[rowMeans(is.na(dapars_result)) < 0.5,]
    
    
    class(dapars_result) = "numeric"
    dapars_impute = impute.knn(dapars_result)
    df = as.data.frame(dapars_impute$data)
    
    for (gene in 1:nrow(df)) {
      mat = apply(df[gene,], 1, rank, ties.method = "average")
      mat = qnorm(mat/ (ncol(df) + 1))
      df[gene, ] = mat
    }
    
    df$Gene = rownames(dapars_result)
    final_data <- inner_join(tmp, df)
    write_delim(final_data, file = paste0("${cwd}","/Dapars_allchrom.bed"), "\t")
    for (i in chr) {
          select_data = final_data %>% filter(`#chr` == i)
          write.table(select_data, file = paste0("${cwd}", "/apa_", i,"/Dapars_result_impute_", i, ".bed"), quote = F, row.names = F)
    }
[APArename_1]
parameter: match = path
parameter: chrlist = list
input: [f'{cwd}/apa_{x}/Dapars_result_impute_{x}.bed' for x in chrlist], group_by = 1
output: [f'{cwd}/apa_{x}/Dapars_result_impute_renamed_{x}.bed' for x in chrlist], group_by = 1
R: expand= "${ }", container = container

    .libPaths( c('/usr/local/lib/R/site-library' , '/usr/lib/R/site-library', '/usr/lib/R/library', .libPaths())) 
    suppressPackageStartupMessages(require(dplyr))
    suppressPackageStartupMessages(require(tidyr))
    suppressPackageStartupMessages(require(readr))
  

    input_dir <- ${_input:r}

    df = data.table::fread(input_dir, colClasses = 'character')
    ref = data.table::fread("${match}", colClasses = 'character')
    
    tmp = NULL
    for (i in colnames(df)[5:ncol(df)]){
      tmp = c(tmp, as.character(ref[which(ref$ProjID == i),2]))
    }
    
    colnames(df)[5:ncol(df)] = tmp
    df = df %>% group_by(`#chr`) %>% arrange(start, .by_group = TRUE) %>% mutate(end = start + 1)
    write_delim(df, file = ${_output:r}, "\t")
[APArename_2]
parameter: match = path
parameter: chrlist = list
input: [f'{cwd}/apa_{x}/Dapars_result_impute_renamed_{x}.bed' for x in chrlist], group_by = 1
output: [f'{cwd}/apa_{x}/Dapars_result_impute_renamed_{x}.bed.gz' for x in chrlist], group_by = 1
bash: expand= "${ }"
    bgzip -f ${_input:r}
    tabix -p bed ${_output:r} -f
    gzip -dfk ${_output:r}
[APArename_3]
parameter: cwd = "/mnt/mfs/statgen/ls3751/aqtl_analysis/wig/AC"
parameter: container = "/mnt/mfs/statgen/ls3751/container/dapars2_final.sif"
parameter: match = "/mnt/mfs/statgen/ls3751/aqtl_analysis/ROSMAP_APA_matchtable.txt" #path
input: f'{cwd}/Dapars_allchrom.bed'
output: f'{cwd}/Dapars_allchrom_renamed.bed'
R: expand= "${ }", container = container
    .libPaths( c('/usr/local/lib/R/site-library' , '/usr/lib/R/site-library', '/usr/lib/R/library', .libPaths())) 
    suppressPackageStartupMessages(require(dplyr))
    suppressPackageStartupMessages(require(tidyr))
    suppressPackageStartupMessages(require(readr))
    
    input_dir <- ${_input:r}

    df = read_delim(input_dir, col_types = list(col_character()))
    ref = data.table::fread("${match}", colClasses = 'character')
    
    tmp = NULL
    for (i in colnames(df)[5:ncol(df)]){
      tmp = c(tmp, as.character(ref[which(ref$ProjID == i),2]))
    }
    
    colnames(df)[5:ncol(df)] = tmp
    df = df %>% group_by(`#chr`) %>% arrange(start, .by_group = TRUE) # %>% mutate(end = start + 1)
    write_delim(df, file = paste0("${cwd}","/Dapars_allchrom_renamed.bed"), "\t")
[APArename_4]
output: f'{_input}.gz', f'{_input}.gz.tbi'
bash: expand = "${ }"
    bgzip -f ${_input}
    tabix -p bed ${_output[0]} -f
    gzip -dfk ${_output[0]}