Run ColocBoost#

Goal#

This notebook demonstrates how to execute ColocBoost on simulated data and analyze its performance through power and false discovery rate (FDR) calculations.

Note: Because different methods have different output, we summarized four elements to calculate FDR and power - coloc_trait, coloc_set, true_trait, and true_variant.

Input#

simufile: Individual level data X and Y introduced in Phenotype Simulation.

Output#

Colocboost original result, along with some summarized elements.

  • coloc_trait: traits identified by ColocBoost as having shared genetic effects

  • coloc_set: sets of variants identified as causal

  • true_trait: traits with true causal relationships (ground truth)

  • true_variant: variants that are truly causal (ground truth)

Example output:

result = readRDS("../simulation_result/real_simulation_5trait/result/sample_1_real_simulation_3_ncausal_5_trait_ntrait_5_colocboost.rds")
# true variant and corresponding trait index
result$var

print("=====Prediction=====")
# colocalization result - CoS
result$coloc_set

# colocalization result - colocalizing trait
result$coloc_trait

print("=====Truth=====")
#  true colocalizing CoS
result$true_variant

#  true colocalizing trait
result$true_trait
    1. 787
    2. 7524
    1. 787
    2. 7524
  1. 3152
  2. 3152
    1. 3152
    2. 7524
[1] "=====Prediction====="
$`cos1:y1_y2_y5`
  1. 7524
  2. 7525
  3. 7526
  4. 7520
  5. 7523
  6. 7518
  7. 7521
  8. 7522
  9. 7531
  10. 7532
  11. 7533
  12. 7538
  13. 7540
  14. 7516
  15. 7494
  16. 7497
  17. 7499
  18. 7507
  19. 7519
  20. 7508
  21. 7536
  22. 7504
  23. 7506
  24. 7505
  25. 7553
  26. 7546
  27. 7547
  28. 7548
  29. 7549
  30. 7583
$`cos2:y3_y4_y5`
  1. 3152
  2. 3156
  3. 3154
  4. 3158
  5. 3160
  6. 3157
  7. 3184
  8. 3242
  9. 3246
  10. 3185
  11. 3196
  12. 3230
  13. 3203
  14. 3258
  15. 3231
  16. 3233
  17. 3178
  18. 3153
  19. 3183
  20. 3261
  21. 3176
  22. 3190
  23. 3147
  24. 3199
  25. 3266
  26. 3167
  27. 3169
  28. 3171
  29. 3200
  30. 3198
  31. 3168
  32. 3253
  33. 3255
  34. 3256
  35. 3277
  36. 3288
  37. 3290
  38. 3226
  39. 3270
  40. 3271
  41. 3281
  42. 3284
  43. 3285
  44. 3264
  45. 3274
  46. 3278
  47. 3283
    1. 1
    2. 2
    3. 5
    1. 3
    2. 4
    3. 5
[1] "=====Truth====="
  1. 787
  2. 3152
  3. 7524
    1. 1
    2. 2
    1. 3
    2. 4
    3. 5
    1. 1
    2. 2
    3. 5

Colocboost Running#

[colocboost]
# ======= Define parameters =======
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 15
parameter: walltime = "80h"
parameter: mem = "60G"
parameter: numThreads = 3
parameter: trait = 10
parameter: container = ""

# ======= Workflow input/output =======
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntrait_{trait}_{step_name}.rds'

task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'

# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 

    # --- Load colocboost function files ---
    # devtools::install_github("StatFunGen/colocboost")
    # for(file in list.files("/home/xc2270/COLOCBoost/code_COLOCBoost/release", full.names = T)) {source(file)}
    library("colocboost")

    # --- Read simulation RDS file ---
    rds = readRDS(${_input:ar})

    # --- Prepare inputs for colocboost: X (genotypes), Y (phenotypes), variant (causal variant list) ---
    X = list()
    Y = list()
    variant = list()

    for (i in 1:${trait}) {
        X[[i]] = rds$X
        Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
        variant[[i]] = rds$variant[[i]]
    }

    # --- Run colocboost and measure time ---
    start_time <- Sys.time()
    colocboost_result = colocboost(
      X = X, 
      Y = Y
    )
    end_time <- Sys.time()

    # --- Record additional information into colocboost result ---
    colocboost_result$var = variant         # True causal variants
    colocboost_result$trait_num = ${trait}   # Number of traits analyzed
    colocboost_result$file = "${_input[0]:a}"  # File used as input

    # --- Determine true variants colocalized across multiple traits ---
    all_var = unlist(variant)
    true_var = as.numeric(names(which(table(all_var) >= 2)))  # Variants appearing in at least 2 traits
    true_trait = list()

    for (variant_index in 1:length(true_var)) {
        temp_vec = c()
        for (i in 1:length(variant)) {
            if (true_var[variant_index] %in% variant[[i]]) {
                temp_vec = c(temp_vec, i)
            }
        }
        true_trait[[variant_index]] = temp_vec
    }

    # --- Extract colocalization outcomes ---
    library(stringr)
    coloc_trait = list()

    # If no colocalization sets are detected, assign NULL
    if (length(colocboost_result$cos_details$cos$cos_index) == 0) {
        coloc_trait = NULL  
    } else {
        for (i in 1:length(colocboost_result$cos_details$cos_outcomes$outcome_index)) {
            coloc_trait[[i]] = colocboost_result$cos_details$cos_outcomes$outcome_index[[i]]
        }
    }

    # --- Finalize result object ---
    colocboost_result$true_variant = true_var
    colocboost_result$true_trait = true_trait
    colocboost_result$coloc_set = colocboost_result$cos_details$cos$cos_index
    colocboost_result$coloc_trait = coloc_trait
    colocboost_result$time = end_time - start_time

    # --- Save output RDS file ---
    saveRDS(colocboost_result, ${_output:r})

Colocboost Target Version Running#

[colocboost_target]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 15
parameter: walltime = "80h"
parameter: mem = "60G"
parameter: numThreads = 3
parameter: trait = 10
parameter: container = ""
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntrait_{trait}_{step_name}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container

    # --- Load colocboost function files ---
    # devtools::install_github("StatFunGen/colocboost")
    # for(file in list.files("/home/xc2270/COLOCBoost/code_COLOCBoost/release", full.names = T)) {source(file)}
    library("colocboost")
    X = list()
    Y = list()
    variant = list()
    rds = readRDS(${_input:ar})

    for(i in 1:${trait}){
      X[[i]] = rds$X
      Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
      variant[[i]] = rds$variant[[i]]
    }

    start_time <- Sys.time()
    colocboost_result = colocboost(
      X = X, 
      Y = Y, focal_outcome_idx = 1
    )
    end_time <- Sys.time()

    # record true variant, analysed trait number and corresponding file name
    colocboost_result$var = variant
    colocboost_result$trait_num = ${trait}
    colocboost_result$file = "${_input[0]:a}"

    # In real setting (list: variant), show which snp appear in at least two traits
    all_var = unlist(variant)
    true_var = as.numeric(names(which(table(all_var) >= 2)))
    true_trait = list()

    # Iterate through the variant list to find which traits the true_var is colocalized on
    for (variant_index in 1:length(true_var)){
      temp_vec = c()
      for(i in 1:length(variant)){
        if(true_var[variant_index] %in% variant[[i]]){
          temp_vec = c(temp_vec, i)
        }
      }
      true_trait[[variant_index]] = temp_vec
    }

    library(stringr)
    coloc_trait = list()

    # if no coloc sets detected, assign coloc_trait as NULL as well
    if(length(colocboost_result$cos_details$cos$cos_index) == 0){
      coloc_trait = NULL  
    } else {
      for(i in 1:length(colocboost_result$cos_details$cos_outcomes$outcome_index)){
        coloc_trait[[i]] = colocboost_result$cos_details$cos_outcomes$outcome_index[[i]]
      }
    }

    colocboost_result$true_variant = true_var
    colocboost_result$true_trait = true_trait
    colocboost_result$coloc_set = colocboost_result$cos_details$cos$cos_index
    colocboost_result$coloc_trait = coloc_trait
    colocboost_result$time = end_time - start_time
    saveRDS(colocboost_result, ${_output:r})

Job submission#

Real configuration simulation running - 2, 5, 10 and 20 traits.

data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_2trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2_Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 40G --trait 2 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done


data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_5trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2_Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 40G --trait 5 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3 4; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done



data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_10trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2_Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3 4 5; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done


data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_20trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2_Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 40G --trait 20 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3 4 5; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done