FineBoost (single trait ColocBoost)#

Goal#

This notebook takes the input and run FineBoost (single trait version of ColocBoost), the result can be summarized to get power and FDR.

Input#

simufile: Individual level data X and Y introduced in Phenotype Simulation.

Output#

Fineboost / fine-mapping original result, along with some summarized elements.

  • cover_var_num: sets of variants identified as causal

  • true_cs_num: variants that are truly causal (ground truth)

  • total_var_num: total number of truly causal (ground truth)

Example output:

result = readRDS("../simulation_result/fineboost/sample_41_h2g_0.05_ncausal_3_ntrait_fineboost.rds")

result$var

print("=====Prediction=====")
# colocalization result - CoS
result$true_cs_num

# colocalization result - colocalizing trait
result$total_cs_num

print("=====Truth=====")
#  true colocalizing CoS
result$cover_var_num

#  true colocalizing trait
result$total_var_num
  1. 2654
  2. 8038
  3. 10675
[1] "=====Prediction====="
3
3
[1] "=====Truth====="
3
3

Fineboost Running code#

[fineboost]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 15
parameter: walltime = "80h"
parameter: mem = "60G"
parameter: numThreads = 3
parameter: trait = 1
parameter: container = ""
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntrait_{step_name}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
    # FineBoost: Single-trait causal discovery

    library("tidyverse")   
    # --- Load colocboost function files ---
    # devtools::install_github("StatFunGen/colocboost")
    # for(file in list.files("/home/xc2270/COLOCBoost/code_COLOCBoost/release", full.names = T)) {source(file)}
    library("colocboost")
  
    # --- Load input ---
    rds = readRDS(${_input:ar})
    X = rds$X
    Y = as.matrix(rds$Y[, 1, drop = FALSE])
    variant = rds$variant[[1]]

    # --- Run colocboost (fineboost mode) ---
    start_time <- Sys.time()
    colocboost_result = colocboost(
        X = X,
        Y = Y,
        output_level = 2
    )
    end_time <- Sys.time()

    # --- Post-processing ---
    colocboost_result$var = variant
    colocboost_result$file = "${_input[0]:a}"

    all_var = unlist(variant)
    cs_num = length(colocboost_result$ucos_details$ucos$ucos_index)

    cover_var_num = 0
    true_cs_num = 0

    if (cs_num == 0) {
        cover_var_num = 0
        true_cs_num = 0
        total_cs_num = 0
    } else {
        # How many true causal variants are covered
        cs_vars = unlist(colocboost_result$ucos_details$ucos$ucos_index)

        cover_var_num = sum(all_var %in% cs_vars)

        # How many credible sets overlap a true variant
        for (i in 1:cs_num) {
            if (length(intersect(colocboost_result$ucos_details$ucos$ucos_index[[i]], all_var)) > 0) {
                true_cs_num = true_cs_num + 1
            }
        }
    }

    # --- Summarize results ---
    colocboost_report = colocboost_result
    colocboost_report$cover_var_num = cover_var_num
    colocboost_report$total_var_num = length(all_var)
    colocboost_report$true_cs_num = true_cs_num
    colocboost_report$total_cs_num = cs_num
    colocboost_report$time = end_time - start_time

    saveRDS(colocboost_report, ${_output:r})

Fineboost summary code#

[fineboost_summary]
parameter: folder = path
parameter: cwd = path("output")
parameter: job_size = 1
parameter: walltime = "50h"
parameter: mem = "60G"
parameter: numThreads = 1
parameter: container = ""
input: folder, group_by = 1
output: f'{cwd:a}/{_input[0]:b}_summary.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
    # FineBoost summary script

    library(tidyverse)

    # load summary code
    variant_vector = function(true_variant, coloc_set){
        var_vec = numeric(length(true_variant))
        for(i in c(1:length(true_variant))){
            for (j in c(1:length(coloc_set))){
                if(true_variant[i] %in% unlist(coloc_set[[j]])){
                    var_vec[i] = 1
                }else{
                    var_vec[i] = var_vec[i]
                }
            }
        }
        return(var_vec)

    }

    set_vector =  function(true_variant, coloc_set){
        set_vec = numeric(length(coloc_set))
        for(i in c(1:length(true_variant))){
             for (j in c(1:length(coloc_set))){
                     inter = intersect(unlist(coloc_set[[j]]),
                                      true_variant)
                 if(length(inter > 0)){
                    set_vec[j] = 1 
                     }else{
                     set_vec[j] = 0
                     }
                 }
        return(set_vec)
        }
    }

    set_size = function(coloc_set){
        size_vec = numeric(length(coloc_set))
        for (j in c(1:length(coloc_set))){
        size_vec[j] = length(unlist(coloc_set[j]))
        }
        return(size_vec)
    }

    fineboost_summary = function(true_variant, coloc_set){
        return(list(
        match_variant = sum(variant_vector(true_variant, coloc_set)),
        true_trait_number = sum(set_vector(true_variant, coloc_set)),
        total_variant_number = length(true_variant),
        total_set_number = length(coloc_set)
        ))
    }

    # List all RDS result files
    filenames <- list.files(${folder:r}, pattern = "*.rds$", full.names = TRUE, recursive = TRUE)

    # Function to read and summarize each RDS
    read_and_extract_rds <- function(file_path) {
        rds_file <- readRDS(file_path)

        # If no ucos detected, set to NULL
        if (length(rds_file$ucos_details$ucos$ucos_index) == 0) {
            rds_file$ucos_details$ucos$ucos_index = NULL
        }

        # Run fineboost summary
        data = fineboost_summary(
            unlist(rds_file$var),
            rds_file$ucos_details$ucos$ucos_index
        )

        # Build output table
        table <- tibble(
            match_variant_number = data$match_variant,
            total_variant_number = data$total_variant_number,
            true_trait_number = data$true_trait_number,
            total_set_number = data$total_set_number,
            file = rds_file$file,
            out_file = file_path
        )
        return(table)
    }

    # Combine all summary tables
    combined_table <- map_dfr(filenames, read_and_extract_rds)

    # Save combined summary
    saveRDS(combined_table, ${_output:r})

Fineboost batch file#

work_dir="/home/hs3393/cb_simulation/simulation_result/fineboost"
job="fineboost"
mkdir -p ${work_dir}

mkdir -p ${work_dir}/code
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 68:00:00
#SBATCH --mem=25000
#SBATCH -J JOB
#SBATCH -o PWD/JOB.%j.out
#SBATCH -e PWD/JOB.%j.err

source ~/mamba_activate.sh
module load Singularity

cd /home/hs3393/cloud_colocalization/simulation_data/simulation_signal/causal_CAUSAL
sos run /home/hs3393/colocboost-paper/Simulation_Studies/9_Fineboost.ipynb fineboost \
    --simufile $(find -type f -name '*ncausal_CAUSAL*.rds') \
    --mem 25G \
    --cwd PWD/
EOF


for causal in 1 2 3 4 5; do
    output_script="causal_${causal}.sh"
    cat base_script | sed "s|PWD|${work_dir}|g" | sed "s|CAUSAL|${causal}|g"  | sed "s|JOB|${job}|g"  > ${output_script}
    sbatch ${output_script}
done

Fineboost summary batch file#

#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 48:00:00
#SBATCH --mem=40000
#SBATCH -J sum
#SBATCH -o /home/hs3393/cloud_colocalization/simulation_result/fineboost/log/summary.%j.out
#SBATCH -e /home/hs3393/cloud_colocalization/simulation_result/fineboost/log/summary.%j.err

source ~/mamba_activate.sh
module load Singularity

sos run /home/hs3393/colocboost-paper/Simulation_Studies/9_Fineboost.ipynb fineboost_summary \
    --folder /home/hs3393/cb_tune/simulation_result/fineboost/ \
    --cwd /home/hs3393/cb_tune/simulation_result/fineboost//summary

SuSiE running code#

[susie]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 10
parameter: walltime = "20h"
parameter: mem = "50G"
parameter: numThreads = 3
parameter: container = ""
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_susie.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
    # Susie finemapping script

    library("susieR")

    # Read input file
    rds = readRDS(${_input:ar})

    X = rds$X
    Y = as.matrix(rds$Y[, 1, drop = FALSE])
    variant = rds$variant[[1]]

    # Run SuSiE
    susie_result = susie(X, Y)

    # Record true variant, analyzed trait number, and file name
    susie_result$var = variant
    susie_result$file = "${_input[0]:a}"

    all_var = unlist(variant)
    cs_num = length(susie_result$sets$cs)

    # Initialize counts
    cover_var_num = 0
    true_cs_num = 0

    # If no credible sets found
    if (length(susie_result$sets$cs) == 0) {
        cover_var_num = 0
        true_cs_num = 0
        total_cs_num = 0
    } else {
        # Count how many true causal variants are covered
        cs_vars = unlist(susie_result$sets$cs)

        for (i in 1:length(all_var)) {
            if (all_var[i] %in% cs_vars) {
                cover_var_num = cover_var_num + 1
            }
        }

        # Count how many credible sets contain at least one true variant
        for (i in 1:cs_num) {
            if (length(intersect(susie_result$sets$cs[[i]], all_var)) > 0) {
                true_cs_num = true_cs_num + 1
            }
        }
    }

    susie_result$cover_var_num = cover_var_num
    susie_result$total_var_num = length(all_var)
    susie_result$true_cs_num = true_cs_num
    susie_result$total_cs_num = cs_num

    saveRDS(susie_result, ${_output:r})

Run SuSiE batch file#

work_dir="/home/hs3393//cloud_colocalization/simulation_result/susie"
job="susie"
mkdir -p ${work_dir}

mkdir -p ${work_dir}/code
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 68:00:00
#SBATCH --mem=25000
#SBATCH -J JOB
#SBATCH -o PWD/JOB.%j.out
#SBATCH -e PWD/JOB.%j.err

source ~/mamba_activate.sh
module load Singularity

cd /home/hs3393/cloud_colocalization/simulation_data/simulation_signal/causal_CAUSAL
sos run /home/hs3393/colocboost-paper/Simulation_Studies/9_Fineboost.ipynb susie \
    --simufile $(find -type f -name '*ncausal_CAUSAL*.rds') \
    --mem 25G \
    --cwd PWD/
EOF


for causal in 1 2 3 4 5; do
    output_script="causal_${causal}.sh"
    cat base_script | sed "s|PWD|${work_dir}|g" | sed "s|CAUSAL|${causal}|g"  | sed "s|JOB|${job}|g"  > ${output_script}
    sbatch ${output_script}
done

SuSiE summary batch file#

#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 48:00:00
#SBATCH --mem=40000
#SBATCH -J susie
#SBATCH -o /home/hs3393/cloud_colocalization/simulation_result/susie/log/summary.%j.out
#SBATCH -e /home/hs3393/cloud_colocalization/simulation_result/susie/log/summary.%j.err

source ~/mamba_activate.sh
module load Singularity

sos run /home/hs3393/colocboost-paper/Simulation_Studies/9_Fineboost.ipynb fineboost_summary \
    --folder /home/hs3393/cb_simulation/simulation_result/fineboost \
    --cwd /home/hs3393/cb_simulation/simulation_result/fineboost/summary