Run ColocBoost#
Goal#
This notebook demonstrates how to execute ColocBoost on simulated data and analyze its performance through power and false discovery rate (FDR) calculations.
Note: Because different methods have different output, we summarized four elements to calculate FDR and power - coloc_trait
, coloc_set
, true_trait
, and true_variant
.
Input#
simufile
: Individual level data X and Y introduced in Phenotype Simulation.
Output#
Colocboost original result, along with some summarized elements.
coloc_trait
: traits identified by ColocBoost as having shared genetic effectscoloc_set
: sets of variants identified as causaltrue_trait
: traits with true causal relationships (ground truth)true_variant
: variants that are truly causal (ground truth)
Example output:
result = readRDS("../simulation_result/real_simulation_5trait/result/sample_1_real_simulation_3_ncausal_5_trait_ntrait_5_colocboost.rds")
# true variant and corresponding trait index
result$var
print("=====Prediction=====")
# colocalization result - CoS
result$coloc_set
# colocalization result - colocalizing trait
result$coloc_trait
print("=====Truth=====")
# true colocalizing CoS
result$true_variant
# true colocalizing trait
result$true_trait
-
- 787
- 7524
-
- 787
- 7524
- 3152
- 3152
-
- 3152
- 7524
[1] "=====Prediction====="
- $`cos1:y1_y2_y5`
-
- 7524
- 7525
- 7526
- 7520
- 7523
- 7518
- 7521
- 7522
- 7531
- 7532
- 7533
- 7538
- 7540
- 7516
- 7494
- 7497
- 7499
- 7507
- 7519
- 7508
- 7536
- 7504
- 7506
- 7505
- 7553
- 7546
- 7547
- 7548
- 7549
- 7583
- $`cos2:y3_y4_y5`
-
- 3152
- 3156
- 3154
- 3158
- 3160
- 3157
- 3184
- 3242
- 3246
- 3185
- 3196
- 3230
- 3203
- 3258
- 3231
- 3233
- 3178
- 3153
- 3183
- 3261
- 3176
- 3190
- 3147
- 3199
- 3266
- 3167
- 3169
- 3171
- 3200
- 3198
- 3168
- 3253
- 3255
- 3256
- 3277
- 3288
- 3290
- 3226
- 3270
- 3271
- 3281
- 3284
- 3285
- 3264
- 3274
- 3278
- 3283
-
- 1
- 2
- 5
-
- 3
- 4
- 5
[1] "=====Truth====="
- 787
- 3152
- 7524
-
- 1
- 2
-
- 3
- 4
- 5
-
- 1
- 2
- 5
Colocboost Running#
[colocboost]
# ======= Define parameters =======
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 15
parameter: walltime = "80h"
parameter: mem = "60G"
parameter: numThreads = 3
parameter: trait = 10
parameter: container = ""
# ======= Workflow input/output =======
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntrait_{trait}_{step_name}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
# --- Load colocboost function files ---
# devtools::install_github("StatFunGen/colocboost")
# for(file in list.files("/home/xc2270/COLOCBoost/code_COLOCBoost/release", full.names = T)) {source(file)}
library("colocboost")
# --- Read simulation RDS file ---
rds = readRDS(${_input:ar})
# --- Prepare inputs for colocboost: X (genotypes), Y (phenotypes), variant (causal variant list) ---
X = list()
Y = list()
variant = list()
for (i in 1:${trait}) {
X[[i]] = rds$X
Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
variant[[i]] = rds$variant[[i]]
}
# --- Run colocboost and measure time ---
start_time <- Sys.time()
colocboost_result = colocboost(
X = X,
Y = Y
)
end_time <- Sys.time()
# --- Record additional information into colocboost result ---
colocboost_result$var = variant # True causal variants
colocboost_result$trait_num = ${trait} # Number of traits analyzed
colocboost_result$file = "${_input[0]:a}" # File used as input
# --- Determine true variants colocalized across multiple traits ---
all_var = unlist(variant)
true_var = as.numeric(names(which(table(all_var) >= 2))) # Variants appearing in at least 2 traits
true_trait = list()
for (variant_index in 1:length(true_var)) {
temp_vec = c()
for (i in 1:length(variant)) {
if (true_var[variant_index] %in% variant[[i]]) {
temp_vec = c(temp_vec, i)
}
}
true_trait[[variant_index]] = temp_vec
}
# --- Extract colocalization outcomes ---
library(stringr)
coloc_trait = list()
# If no colocalization sets are detected, assign NULL
if (length(colocboost_result$cos_details$cos$cos_index) == 0) {
coloc_trait = NULL
} else {
for (i in 1:length(colocboost_result$cos_details$cos_outcomes$outcome_index)) {
coloc_trait[[i]] = colocboost_result$cos_details$cos_outcomes$outcome_index[[i]]
}
}
# --- Finalize result object ---
colocboost_result$true_variant = true_var
colocboost_result$true_trait = true_trait
colocboost_result$coloc_set = colocboost_result$cos_details$cos$cos_index
colocboost_result$coloc_trait = coloc_trait
colocboost_result$time = end_time - start_time
# --- Save output RDS file ---
saveRDS(colocboost_result, ${_output:r})
Colocboost Target Version Running#
[colocboost_target]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 15
parameter: walltime = "80h"
parameter: mem = "60G"
parameter: numThreads = 3
parameter: trait = 10
parameter: container = ""
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntrait_{trait}_{step_name}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
# --- Load colocboost function files ---
# devtools::install_github("StatFunGen/colocboost")
# for(file in list.files("/home/xc2270/COLOCBoost/code_COLOCBoost/release", full.names = T)) {source(file)}
library("colocboost")
X = list()
Y = list()
variant = list()
rds = readRDS(${_input:ar})
for(i in 1:${trait}){
X[[i]] = rds$X
Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
variant[[i]] = rds$variant[[i]]
}
start_time <- Sys.time()
colocboost_result = colocboost(
X = X,
Y = Y, focal_outcome_idx = 1
)
end_time <- Sys.time()
# record true variant, analysed trait number and corresponding file name
colocboost_result$var = variant
colocboost_result$trait_num = ${trait}
colocboost_result$file = "${_input[0]:a}"
# In real setting (list: variant), show which snp appear in at least two traits
all_var = unlist(variant)
true_var = as.numeric(names(which(table(all_var) >= 2)))
true_trait = list()
# Iterate through the variant list to find which traits the true_var is colocalized on
for (variant_index in 1:length(true_var)){
temp_vec = c()
for(i in 1:length(variant)){
if(true_var[variant_index] %in% variant[[i]]){
temp_vec = c(temp_vec, i)
}
}
true_trait[[variant_index]] = temp_vec
}
library(stringr)
coloc_trait = list()
# if no coloc sets detected, assign coloc_trait as NULL as well
if(length(colocboost_result$cos_details$cos$cos_index) == 0){
coloc_trait = NULL
} else {
for(i in 1:length(colocboost_result$cos_details$cos_outcomes$outcome_index)){
coloc_trait[[i]] = colocboost_result$cos_details$cos_outcomes$outcome_index[[i]]
}
}
colocboost_result$true_variant = true_var
colocboost_result$true_trait = true_trait
colocboost_result$coloc_set = colocboost_result$cos_details$cos$cos_index
colocboost_result$coloc_trait = coloc_trait
colocboost_result$time = end_time - start_time
saveRDS(colocboost_result, ${_output:r})
Job submission#
Real configuration simulation running - 2, 5, 10 and 20 traits.
data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_2trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result
cd ${work_dir}/${job}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2_Run_Colocboost.ipynb colocboost \
--simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
--mem 40G --trait 2 \
--cwd WORK_DIR/JOB/result
EOF
base_script="base_script"
for ncausal in 1 2 3; do
output_script="ncausal_${ncausal}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g" > ${output_script}
sbatch ${output_script}
done
data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_5trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
#!/bin/bash
mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result
cd ${work_dir}/${job}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2_Run_Colocboost.ipynb colocboost \
--simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
--mem 40G --trait 5 \
--cwd WORK_DIR/JOB/result
EOF
base_script="base_script"
for ncausal in 1 2 3 4; do
output_script="ncausal_${ncausal}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g" > ${output_script}
sbatch ${output_script}
done
data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_10trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
#!/bin/bash
mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result
cd ${work_dir}/${job}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2_Run_Colocboost.ipynb colocboost \
--simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
--mem 40G --trait 10 \
--cwd WORK_DIR/JOB/result
EOF
base_script="base_script"
for ncausal in 1 2 3 4 5; do
output_script="ncausal_${ncausal}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g" > ${output_script}
sbatch ${output_script}
done
data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_20trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
#!/bin/bash
mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result
cd ${work_dir}/${job}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2_Run_Colocboost.ipynb colocboost \
--simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
--mem 40G --trait 20 \
--cwd WORK_DIR/JOB/result
EOF
base_script="base_script"
for ncausal in 1 2 3 4 5; do
output_script="ncausal_${ncausal}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g" > ${output_script}
sbatch ${output_script}
done