Post-APA calling: Imputation and QC#

Description#

This notebook is designed to impute the missing values in the PDUI matrix, and perform quantile normailization for the impute values.

Input#

  • raw PDUI matrix (row as gene, columns as sample id)

  • covariate file

Output#

  • PDUI matrix without missingness

    • The missing value is calculated using impute package

Steps#

Timing: Runtime varies by dataset size and compute resources. For the toy chr22 MWE dataset, most steps complete in under 10 minutes on a standard HPC node.

sos run pipeline/apa_impute.ipynb APAimpute \
    --cwd output/apa \
    --chrlist chr22

Optionally, rename the sample columns of the imputed PDUI matrix using a match table (maps the Dapars sample IDs to your desired sample names):

sos run pipeline/apa_impute.ipynb APArename \
    --cwd output/apa \
    --chrlist chr22 \
    --match input/covariate/protocol_example.apa_matchtable.txt

Command interface#

sos run pipeline/apa_impute.ipynb -h

Workflow implementation#

Anticipated Results#

The pipeline produces output files in the output/ subdirectory named after the workflow step. Verify success by checking that output files exist and are non-empty. See the Output section above for the expected file names and formats.

[global]
# FIXME: this entire section must be rewritten using conventions as in other pipelines.
parameter: walltime = '40h'
parameter: mem = '32G'
parameter: ncore = 16
parameter: cwd = path
parameter: thread = 8
parameter: job_size = 1
parameter: container = ''
[APAimpute]
parameter: chrlist = list
input: [f'{cwd}/apa_{x}/Dapars_result_result_temp.{x}.txt' for x in chrlist]
output: [f'{cwd}/apa_{x}/Dapars_result_impute_{x}.bed' for x in chrlist]
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = ncore
R: expand= "${ }"
    .libPaths( c('/usr/local/lib/R/site-library' , '/usr/lib/R/site-library', '/usr/lib/R/library', .libPaths()))
    suppressPackageStartupMessages(require(dplyr))
    suppressPackageStartupMessages(require(tidyr))
    suppressPackageStartupMessages(require(magrittr))
    suppressPackageStartupMessages(require(impute))
    suppressPackageStartupMessages(require(readr))
    suppressPackageStartupMessages(require(stringr))
    chr = str_extract_all("${chrlist}", pattern = "chr\\d+")[[1]]
    apalist = paste0("${cwd}", "/apa_", chr,"/Dapars_result_result_temp.", chr, ".txt")
    merge_apa = function(file) {
        df = data.table::fread(file, colClasses = 'character')
        df = df %>% select(1,2,3,4, 4 + order(colnames(df)[5:ncol(df)]))
        return(df)
    }
    dapars_result = do.call("rbind", mapply(merge_apa, apalist, SIMPLIFY = FALSE))
    # Graceful handling of empty DaPars2 result (e.g. sparse input where no
    # 3UTR event passes coverage thresholds): write valid empty outputs and stop.
    if (is.null(dapars_result) || nrow(dapars_result) == 0) {
        message("APAimpute: DaPars2 result is empty (no APA events called); writing empty outputs.")
        empty_hdr = c("#chr", "start", "end", "Gene")
        empty_df = setNames(data.frame(matrix(ncol = length(empty_hdr), nrow = 0)), empty_hdr)
        write_delim(empty_df, file = paste0("${cwd}","/Dapars_allchrom.bed"), "\t")
        for (i in chr) {
            write.table(empty_df, file = paste0("${cwd}", "/apa_", i,"/Dapars_result_impute_", i, ".bed"), quote = F, row.names = F, sep = "\t")
        }
        quit(save = "no", status = 0)
    }
    tmp = dapars_result[,1:4]
    v1 = v2 = v3 = NULL
    for (i in 1:nrow(tmp)) {
      chro = str_split(tmp$Loci[i], ":")[[1]][1]
      spos = str_split(str_split(tmp$Loci[i], ":")[[1]][2],"-")[[1]][1]
      epos = str_split(str_split(tmp$Loci[i], ":")[[1]][2],"-")[[1]][2]
      v1 = c(v1 , chro)
      v2 = c(v2, spos)
      v3 = c(v3, epos)
    }
    tmp = tmp %>% mutate(`#chr` = v1, start = v2, end = v3) %>% select(`#chr`, start, end, Gene)
    dapars_result = dapars_result[,-c(2:4)]
    
    tmp_vec = pull(dapars_result, Gene)
    dapars_result = dapars_result[,-1]
    id_vec = NULL
    for (i in 1:length(colnames(dapars_result))) {
      id = tail(str_split((colnames(dapars_result))[i], "/")[[1]],1)
      id = str_split(id,"_")[[1]][1]
      id_vec = c(id_vec ,id)
    }
    colnames(dapars_result) = id_vec
    dapars_result = as.matrix(dapars_result)
    rownames(dapars_result) = tmp_vec
    dapars_result = dapars_result[,colMeans(is.na(dapars_result)) <= 0.8]
    dapars_result = dapars_result[rowMeans(is.na(dapars_result)) < 0.5,]
    
    
    class(dapars_result) = "numeric"
    dapars_impute = impute.knn(dapars_result)
    df = as.data.frame(dapars_impute$data)
    
    for (gene in 1:nrow(df)) {
      mat = apply(df[gene,], 1, rank, ties.method = "average")
      mat = qnorm(mat/ (ncol(df) + 1))
      df[gene, ] = mat
    }
    
    df$Gene = rownames(dapars_result)
    final_data <- inner_join(tmp, df)
    write_delim(final_data, file = paste0("${cwd}","/Dapars_allchrom.bed"), "\t")
    for (i in chr) {
          select_data = final_data %>% filter(`#chr` == i)
          write.table(select_data, file = paste0("${cwd}", "/apa_", i,"/Dapars_result_impute_", i, ".bed"), quote = F, row.names = F)
    }
[APArename_1]
parameter: match = path
parameter: chrlist = list
input: [f'{cwd}/apa_{x}/Dapars_result_impute_{x}.bed' for x in chrlist], group_by = 1
output: [f'{cwd}/apa_{x}/Dapars_result_impute_renamed_{x}.bed' for x in chrlist], group_by = 1
R: expand= "${ }"

    .libPaths( c('/usr/local/lib/R/site-library' , '/usr/lib/R/site-library', '/usr/lib/R/library', .libPaths())) 
    suppressPackageStartupMessages(require(dplyr))
    suppressPackageStartupMessages(require(tidyr))
    suppressPackageStartupMessages(require(readr))
  

    input_dir <- ${_input:r}

    df = data.table::fread(input_dir, colClasses = 'character')
    ref = data.table::fread("${match}", colClasses = 'character')
    
    if (ncol(df) >= 5) {
    tmp = NULL
    for (i in colnames(df)[5:ncol(df)]){
      tmp = c(tmp, as.character(ref[which(ref$ProjID == i),2]))
    }
    
    colnames(df)[5:ncol(df)] = tmp
    }
    df = df %>% group_by(`#chr`) %>% arrange(start, .by_group = TRUE) %>% mutate(end = as.integer(start) + 1)
    write_delim(df, file = ${_output:r}, "\t")
[APArename_2]
parameter: match = path
parameter: chrlist = list
input: [f'{cwd}/apa_{x}/Dapars_result_impute_renamed_{x}.bed' for x in chrlist], group_by = 1
output: [f'{cwd}/apa_{x}/Dapars_result_impute_renamed_{x}.bed.gz' for x in chrlist], group_by = 1
bash: expand= "${ }"
    bgzip -f ${_input:r}
    tabix -p bed ${_output:r} -f
    gzip -dfk ${_output:r}
[APArename_3]
parameter: cwd = "output/apa"
parameter: container = "containers/dapars2.sif"
parameter: match = "input/covariate/protocol_example.apa_matchtable.txt" #path
input: f'{cwd}/Dapars_allchrom.bed'
output: f'{cwd}/Dapars_allchrom_renamed.bed'
R: expand= "${ }"
    .libPaths( c('/usr/local/lib/R/site-library' , '/usr/lib/R/site-library', '/usr/lib/R/library', .libPaths())) 
    suppressPackageStartupMessages(require(dplyr))
    suppressPackageStartupMessages(require(tidyr))
    suppressPackageStartupMessages(require(readr))
    
    input_dir <- ${_input:r}

    df = read_delim(input_dir, col_types = list(col_character()))
    ref = data.table::fread("${match}", colClasses = 'character')
    
    if (ncol(df) >= 5) {
    tmp = NULL
    for (i in colnames(df)[5:ncol(df)]){
      tmp = c(tmp, as.character(ref[which(ref$ProjID == i),2]))
    }
    
    colnames(df)[5:ncol(df)] = tmp
    }
    df = df %>% group_by(`#chr`) %>% arrange(start, .by_group = TRUE) # %>% mutate(end = as.integer(start) + 1)
    write_delim(df, file = paste0("${cwd}","/Dapars_allchrom_renamed.bed"), "\t")
[APArename_4]
output: f'{_input}.gz', f'{_input}.gz.tbi'
bash: expand = "${ }"
    bgzip -f ${_input}
    tabix -p bed ${_output[0]} -f
    gzip -dfk ${_output[0]}