Run Colocboost#

Goal#

This notebook takes the input and run colocboost, the result can be summarized to get power and FDR.

Because different methods have different output, we summarized four elements to calculate FDR and power - coloc_trait, coloc_set true_trait and true_variant.

Input#

Individual level data X and Y or summary statistics z and LD from other notebooks. file names of these data should be put in the parameter: simufile.

Output#

Colocboost original result, along with some summarized elements.

Example output:

result = readRDS("/home/hs3393/cb_Mar/simulation_result/real_simulation_5trait/result/sample_999_real_simulation_3_ncausal_5_trait_ntrait_5_colocboost.rds")
# true variant and corresponding trait index
result$var

print("=====Prediction=====")
# colocalization result - CoS
result$coloc_set

# colocalization result - colocalizing trait
result$coloc_trait

print("=====Truth=====")
#  true colocalizing CoS
result$true_variant

#  true colocalizing trait
result$true_trait
    1. 196
    2. 5616
    3. 6566
    1. 5616
    2. 6566
    1. 196
    2. 6566
    1. 196
    2. 5616
    1. 196
    2. 6566
[1] "=====Prediction====="
$`cos1:y1_y3_y4_y5`
196
$`cos2:y1_y2_y3_y5`
  1. 6566
  2. 6535
$`cos3:y1_y2_y4`
  1. 5616
  2. 5621
  3. 5628
  4. 5629
  5. 5632
  6. 5636
  7. 5637
  8. 5634
  9. 5635
  10. 5627
  11. 5623
  12. 5625
  13. 5615
    1. 1
    2. 3
    3. 4
    4. 5
    1. 1
    2. 2
    3. 3
    4. 5
    1. 1
    2. 2
    3. 4
[1] "=====Truth====="
  1. 196
  2. 5616
  3. 6566
    1. 1
    2. 3
    3. 4
    4. 5
    1. 1
    2. 2
    3. 4
    1. 1
    2. 2
    3. 3
    4. 5

Colocboost Running#

[colocboost]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 15
parameter: walltime = "80h"
parameter: mem = "60G"
parameter: numThreads = 3
parameter: trait = 10
parameter: container = ""
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntrait_{trait}_{step_name}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    for(file in list.files("/home/xc2270/COLOCBoost/code_COLOCBoost/colocboost_updating/", full.names = T)) {source(file)}
    X = list()
    Y = list()
    variant = list()
    rds = readRDS(${_input:ar})

    for(i in 1:${trait}){
      X[[i]] = rds$X
      Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
      variant[[i]] = rds$variant[[i]]
    }

    start_time <- Sys.time()
    colocboost_result = colocboost(
      X = X, 
      Y = Y
    )
    end_time <- Sys.time()

    # record true variant, analysed trait number and corresponding file name
    colocboost_result$var = variant
    colocboost_result$trait_num = ${trait}
    colocboost_result$file = "${_input[0]:a}"

    # In real setting (list: variant), show which snp appear in at least two traits
    all_var = unlist(variant)
    true_var = as.numeric(names(which(table(all_var) >= 2)))
    true_trait = list()

    # Iterate through the variant list to find which traits the true_var is colocalized on
    for (variant_index in 1:length(true_var)){
      temp_vec = c()
      for(i in 1:length(variant)){
        if(true_var[variant_index] %in% variant[[i]]){
          temp_vec = c(temp_vec, i)
        }
      }
      true_trait[[variant_index]] = temp_vec
    }

    library(stringr)
    coloc_trait = list()

    # if no coloc sets detected, assign coloc_trait as NULL as well
    if(length(colocboost_result$cos_details$cos$cos_index) == 0){
      coloc_trait = NULL  
    } else {
      for(i in 1:length(colocboost_result$cos_details$cos_outcomes$outcome_index)){
        coloc_trait[[i]] = colocboost_result$cos_details$cos_outcomes$outcome_index[[i]]
      }
    }

    colocboost_result$true_variant = true_var
    colocboost_result$true_trait = true_trait
    colocboost_result$coloc_set = colocboost_result$cos_details$cos$cos_index
    colocboost_result$coloc_trait = coloc_trait
    colocboost_result$time = end_time - start_time
    saveRDS(colocboost_result, ${_output:r})

Job submission#

Real configuration simulation running - 2, 5, 10 and 20 traits.

data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_2trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 40G --trait 2 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done


data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_5trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 40G --trait 5 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3 4; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done



data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_10trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3 4 5; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done


data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="real_simulation_20trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 40G --trait 20 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3 4 5; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done