Run Colocboost#
Goal#
This notebook takes the input and run colocboost, the result can be summarized to get power and FDR.
Because different methods have different output, we summarized four elements to calculate FDR and power - coloc_trait, coloc_set true_trait and true_variant.
Input#
Individual level data X and Y or summary statistics z and LD from other notebooks. file names of these data should be put in the parameter: simufile.
Output#
Colocboost original result, along with some summarized elements.
Example output:
result = readRDS("/home/hs3393/cb_Mar/simulation_result/real_simulation_5trait/result/sample_999_real_simulation_3_ncausal_5_trait_ntrait_5_colocboost.rds")
# true variant and corresponding trait index
result$var
print("=====Prediction=====")
# colocalization result - CoS
result$coloc_set
# colocalization result - colocalizing trait
result$coloc_trait
print("=====Truth=====")
# true colocalizing CoS
result$true_variant
# true colocalizing trait
result$true_trait
-
- 196
- 5616
- 6566
-
- 5616
- 6566
-
- 196
- 6566
-
- 196
- 5616
-
- 196
- 6566
[1] "=====Prediction====="
- $`cos1:y1_y3_y4_y5`
- 196
- $`cos2:y1_y2_y3_y5`
-
- 6566
- 6535
- $`cos3:y1_y2_y4`
-
- 5616
- 5621
- 5628
- 5629
- 5632
- 5636
- 5637
- 5634
- 5635
- 5627
- 5623
- 5625
- 5615
-
- 1
- 3
- 4
- 5
-
- 1
- 2
- 3
- 5
-
- 1
- 2
- 4
[1] "=====Truth====="
- 196
- 5616
- 6566
-
- 1
- 3
- 4
- 5
-
- 1
- 2
- 4
-
- 1
- 2
- 3
- 5
Colocboost Running#
[colocboost]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 15
parameter: walltime = "80h"
parameter: mem = "60G"
parameter: numThreads = 3
parameter: trait = 10
parameter: container = ""
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntrait_{trait}_{step_name}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
for(file in list.files("/home/xc2270/COLOCBoost/code_COLOCBoost/colocboost_updating/", full.names = T)) {source(file)}
X = list()
Y = list()
variant = list()
rds = readRDS(${_input:ar})
for(i in 1:${trait}){
X[[i]] = rds$X
Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
variant[[i]] = rds$variant[[i]]
}
start_time <- Sys.time()
colocboost_result = colocboost(
X = X,
Y = Y
)
end_time <- Sys.time()
# record true variant, analysed trait number and corresponding file name
colocboost_result$var = variant
colocboost_result$trait_num = ${trait}
colocboost_result$file = "${_input[0]:a}"
# In real setting (list: variant), show which snp appear in at least two traits
all_var = unlist(variant)
true_var = as.numeric(names(which(table(all_var) >= 2)))
true_trait = list()
# Iterate through the variant list to find which traits the true_var is colocalized on
for (variant_index in 1:length(true_var)){
temp_vec = c()
for(i in 1:length(variant)){
if(true_var[variant_index] %in% variant[[i]]){
temp_vec = c(temp_vec, i)
}
}
true_trait[[variant_index]] = temp_vec
}
library(stringr)
coloc_trait = list()
# if no coloc sets detected, assign coloc_trait as NULL as well
if(length(colocboost_result$cos_details$cos$cos_index) == 0){
coloc_trait = NULL
} else {
for(i in 1:length(colocboost_result$cos_details$cos_outcomes$outcome_index)){
coloc_trait[[i]] = colocboost_result$cos_details$cos_outcomes$outcome_index[[i]]
}
}
colocboost_result$true_variant = true_var
colocboost_result$true_trait = true_trait
colocboost_result$coloc_set = colocboost_result$cos_details$cos$cos_index
colocboost_result$coloc_trait = coloc_trait
colocboost_result$time = end_time - start_time
saveRDS(colocboost_result, ${_output:r})
Job submission#
Real configuration simulation running - 2, 5, 10 and 20 traits.
data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_2trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result
cd ${work_dir}/${job}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
--simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
--mem 40G --trait 2 \
--cwd WORK_DIR/JOB/result
EOF
base_script="base_script"
for ncausal in 1 2 3; do
output_script="ncausal_${ncausal}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g" > ${output_script}
sbatch ${output_script}
done
data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_5trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
#!/bin/bash
mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result
cd ${work_dir}/${job}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
--simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
--mem 40G --trait 5 \
--cwd WORK_DIR/JOB/result
EOF
base_script="base_script"
for ncausal in 1 2 3 4; do
output_script="ncausal_${ncausal}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g" > ${output_script}
sbatch ${output_script}
done
data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_10trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
#!/bin/bash
mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result
cd ${work_dir}/${job}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
--simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
--mem 40G --trait 10 \
--cwd WORK_DIR/JOB/result
EOF
base_script="base_script"
for ncausal in 1 2 3 4 5; do
output_script="ncausal_${ncausal}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g" > ${output_script}
sbatch ${output_script}
done
data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_20trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
#!/bin/bash
mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result
cd ${work_dir}/${job}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
--simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
--mem 40G --trait 20 \
--cwd WORK_DIR/JOB/result
EOF
base_script="base_script"
for ncausal in 1 2 3 4 5; do
output_script="ncausal_${ncausal}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g" > ${output_script}
sbatch ${output_script}
done