Colocalization result summary#

Goal#

Summarize the result based on the key four outputs to calculate FDR and power.

Fit any coloclaization method generated by the notebooks in this repo. As long as they are summarized to have the elements: coloc_trait, coloc_set true_trait and true_variant.

Input#

File(s) of colocalization result.

Output#

A dataframe, each row is a summarization of one finemapping result.

Example output:

result = readRDS("/home/hs3393/cb_Mar/simulation_result/hyprcoloc/hyp_real_simulation_10trait/summary/result_summary.rds")
head(result)
A tibble: 6 × 12
total_causal_var_numberperfect_causal_var_numberpartial_causal_var_numbertrue_trait_numberpredict_trait_numbertotal_trait_numbertrue_set_numbertotal_set_numbersingle_set_numbermax_causalfileout_file
<int><dbl><dbl><dbl><int><dbl><dbl><int><int><int><chr><chr>
1111 5 5 51101/mnt/vast/hpc/csg/hs3393/cb_Mar/simulation_data/real_simulation_10trait/sample_0_real_simulation_1_ncausal_10_trait.rds/home/hs3393/cb_Mar/simulation_result/hyprcoloc/hyp_real_simulation_10trait/result/sample_0_real_simulation_1_ncausal_10_trait_ntr_10_hypercoloc.rds
2211 5 5 81102/mnt/vast/hpc/csg/hs3393/cb_Mar/simulation_data/real_simulation_10trait/sample_0_real_simulation_2_ncausal_10_trait.rds/home/hs3393/cb_Mar/simulation_result/hyprcoloc/hyp_real_simulation_10trait/result/sample_0_real_simulation_2_ncausal_10_trait_ntr_10_hypercoloc.rds
3302 4 4 82203/mnt/vast/hpc/csg/hs3393/cb_Mar/simulation_data/real_simulation_10trait/sample_0_real_simulation_3_ncausal_10_trait.rds/home/hs3393/cb_Mar/simulation_result/hyprcoloc/hyp_real_simulation_10trait/result/sample_0_real_simulation_3_ncausal_10_trait_ntr_10_hypercoloc.rds
44031010173303/mnt/vast/hpc/csg/hs3393/cb_Mar/simulation_data/real_simulation_10trait/sample_0_real_simulation_4_ncausal_10_trait.rds/home/hs3393/cb_Mar/simulation_result/hyprcoloc/hyp_real_simulation_10trait/result/sample_0_real_simulation_4_ncausal_10_trait_ntr_10_hypercoloc.rds
5111 4 4 41101/mnt/vast/hpc/csg/hs3393/cb_Mar/simulation_data/real_simulation_10trait/sample_1_real_simulation_1_ncausal_10_trait.rds/home/hs3393/cb_Mar/simulation_result/hyprcoloc/hyp_real_simulation_10trait/result/sample_1_real_simulation_1_ncausal_10_trait_ntr_10_hypercoloc.rds
6212 6 6 72202/mnt/vast/hpc/csg/hs3393/cb_Mar/simulation_data/real_simulation_10trait/sample_1_real_simulation_2_ncausal_10_trait.rds/home/hs3393/cb_Mar/simulation_result/hyprcoloc/hyp_real_simulation_10trait/result/sample_1_real_simulation_2_ncausal_10_trait_ntr_10_hypercoloc.rds

Summary code#

[coloc_summary]
parameter: folder = path
parameter: cwd = path("output")
parameter: job_size = 1
parameter: walltime = "50h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: container = ""
input: folder , group_by = 1
output: f'{cwd:a}/{_input[0]:b}_summary.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
    library("tidyverse")
    source("/home/hs3393/cloud_colocalization/simulation_code/colocboost_summary.r")
    filenames <- list.files(${folder:r}, pattern="*.rds$", full.names=TRUE, recursive = T)
    read_and_extract_rds <- function(file_path) {
      rds_file <- readRDS(file_path)  # Read the RDS file
      if(length(rds_file$coloc_set) == 0){
          rds_file$coloc_set = NULL
      }
      data = colocboost_summary(rds_file$true_variant, 
      rds_file$true_trait, rds_file$coloc_set, rds_file$coloc_trait)
      table <- tibble(total_causal_var_number = data$total_causal_var_number,
              perfect_causal_var_number = data$perfect_causal_var_number,
              partial_causal_var_number = data$partial_causal_var_number,
              true_trait_number = data$true_trait_number,
              predict_trait_number = data$predict_trait_number,
              total_trait_number = data$total_trait_number,
              true_set_number = data$true_set_number,
              total_set_number = data$total_set_number,
              single_set_number = length(rds_file$csets),
              max_causal = max(rds_file$true_trait %>% unlist() %>% table()),
              file = rds_file$file,
              out_file = file_path)
      return(table)
    }

    combined_table <- map_dfr(filenames, read_and_extract_rds)
    saveRDS(combined_table, ${_output:r})

Summary Bash commands#

ColocBoost summary#

## 2 trait summary

data_dir="/home/hs3393/cb_Mar/simulation_result/real_simulation_2trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary

cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J summary
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/4.Result_Summary.ipynb coloc_summary \
    --folder DATA_DIR/result \
    --cwd DATA_DIR/summary
EOF


base_script="summary_script"
output_script="summary.sh"
cat ${base_script}|  sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

## 5 trait summary

data_dir="/home/hs3393/cb_Mar/simulation_result/real_simulation_5trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary

cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J summary
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/4.Result_Summary.ipynb coloc_summary \
    --folder DATA_DIR/result \
    --cwd DATA_DIR/summary
EOF


base_script="summary_script"
output_script="summary.sh"
cat ${base_script}|  sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

## 10 trait summary

data_dir="/home/hs3393/cb_Mar/simulation_result/real_simulation_10trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary

cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J summary
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/4.Result_Summary.ipynb coloc_summary \
    --folder DATA_DIR/result \
    --cwd DATA_DIR/summary
EOF


base_script="summary_script"
output_script="summary.sh"
cat ${base_script}|  sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

## 20 trait summary

data_dir="/home/hs3393/cb_Mar/simulation_result/real_simulation_20trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary

cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J summary
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/4.Result_Summary.ipynb coloc_summary \
    --folder DATA_DIR/result \
    --cwd DATA_DIR/summary
EOF


base_script="summary_script"
output_script="summary.sh"
cat ${base_script}|  sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

Hyprcoloc summary#

data_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/hyp_real_simulation_2trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary

cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J summary
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/4.Result_Summary.ipynb coloc_summary \
    --folder DATA_DIR/result \
    --cwd DATA_DIR/summary
EOF


base_script="summary_script"
output_script="summary.sh"
cat ${base_script}|  sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}


data_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/hyp_real_simulation_5trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary

cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J summary
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/4.Result_Summary.ipynb coloc_summary \
    --folder DATA_DIR/result \
    --cwd DATA_DIR/summary
EOF


base_script="summary_script"
output_script="summary.sh"
cat ${base_script}|  sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

data_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/hyp_real_simulation_10trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary

cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J summary
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/4.Result_Summary.ipynb coloc_summary \
    --folder DATA_DIR/result \
    --cwd DATA_DIR/summary
EOF


base_script="summary_script"
output_script="summary.sh"
cat ${base_script}|  sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

data_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/hyp_real_simulation_20trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary

cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J summary
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/4.Result_Summary.ipynb coloc_summary \
    --folder DATA_DIR/result \
    --cwd DATA_DIR/summary
EOF


base_script="summary_script"
output_script="summary.sh"
cat ${base_script}|  sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

MOLOC summary#

# moloc 

data_dir="/home/hs3393/cb_Mar/simulation_result/moloc/moloc_real_simulation_2trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary

cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 8:00:00
#SBATCH --mem=30000
#SBATCH -J sum
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/4.Result_Summary.ipynb coloc_summary \
    --folder DATA_DIR/result \
    --cwd DATA_DIR/summary
EOF


base_script="summary_script"
output_script="summary.sh"
cat ${base_script}|  sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

SuSiE coloc summary#

data_dir="/home/hs3393/cb_Mar/simulation_result/susie_coloc/susie_coloc_real_simulation_2trait"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary

cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 8:00:00
#SBATCH --mem=30000
#SBATCH -J sum
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/4.Result_Summary.ipynb coloc_summary \
    --folder DATA_DIR/result \
    --cwd DATA_DIR/summary
EOF


base_script="summary_script"
output_script="summary.sh"
cat ${base_script}|  sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}