Fineboost (single trait ColocBoost)#

Goal#

This notebook takes the input and run colocboost (single trait, Fineboost), the result can be summarized to get power and FDR.

Input#

Individual level data X and Y or summary statistics z and LD from other notebooks. file names of these data should be put in the parameter: simufile.

Output#

Fineboost / fine-mapping original result, along with some summarized elements.

Example output:

result = readRDS("/home/hs3393/cloud_colocalization/simulation_result/fineboost/sample_41_h2g_0.05_ncausal_3_ntrait_fineboost.rds")

result$var

print("=====Prediction=====")
# colocalization result - CoS
result$true_cs_num

# colocalization result - colocalizing trait
result$total_cs_num

print("=====Truth=====")
#  true colocalizing CoS
result$cover_var_num

#  true colocalizing trait
result$total_var_num
  1. 2654
  2. 8038
  3. 10675
[1] "=====Prediction====="
3
3
[1] "=====Truth====="
3
3

Fineboost Running code#

[fineboost_Jan]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 15
parameter: walltime = "80h"
parameter: mem = "60G"
parameter: numThreads = 3
parameter: trait = 1
parameter: container = ""
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntrait_{step_name}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    for(file in list.files("/home/xc2270/COLOCBoost/code_COLOCBoost/colocboost_updating/", full.names = T)) {source(file)}
    rds = readRDS(${_input:ar})

    X = rds$X
    Y = as.matrix(rds$Y[, 1, drop = FALSE])
    variant = rds$variant[[1]]


    start_time <- Sys.time()
    colocboost_result = colocboost(
      X = X, 
      Y = Y, output_level=2)
    end_time <- Sys.time()

    # record true variant, analysed trait number and corresponding file name
    colocboost_result$var = variant
    colocboost_result$file = "${_input[0]:a}"
    # show which snp appear in at least two traits as the coloc_variants
    all_var = unlist(variant)
    
    cs_num = length(colocboost_result$ucos_details$ucos$ucos_index)
  
    cover_var_num = 0
    true_cs_num = 0
    if(length(colocboost_result$ucos_details$ucos$ucos_index) == 0){
      cover_var_num = 0    
      true_cs_num = 0
      total_cs_num = 0
    }else{
    
        # count how many var are covered
    
        # count how many var are covered
        for(i in 1:length(all_var)){
            cs_vars = unlist(colocboost_result$ucos_details$ucos$ucos_index)
            if(all_var[i] %in% cs_vars){
              cover_var_num = cover_var_num + 1 
            }

        }

        # cover how many cs are true
        for(i in 1:cs_num){
            if( length( intersect( colocboost_result$ucos_details$ucos$ucos_index[[i]] , all_var )) > 0 ){
            true_cs_num = true_cs_num + 1 
            }
        }
    }
  
    
    colocboost_report = colocboost_result
    colocboost_report$cover_var_num = cover_var_num 
    colocboost_report$total_var_num = length(all_var)
    colocboost_report$true_cs_num = true_cs_num 
    colocboost_report$total_cs_num = cs_num
    colocboost_report$time = end_time - start_time
    saveRDS(colocboost_report, ${_output:r})

Fineboost summary code#

[fineboost_summary]
parameter: folder = path
parameter: cwd = path("output")
parameter: job_size = 1
parameter: walltime = "50h"
parameter: mem = "60G"
parameter: numThreads = 1
parameter: container = ""
input: folder , group_by = 1
output: f'{cwd:a}/{_input[0]:b}_summary.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
    library("tidyverse")
    source("/home/hs3393/cloud_colocalization/simulation_code/fineboost_summary.r")
    filenames <- list.files(${folder:r}, pattern="*.rds$", full.names=TRUE, recursive = T)
    read_and_extract_rds <- function(file_path) {
      rds_file <- readRDS(file_path)  # Read the RDS file
      if(length(rds_file$ucos_details$ucos$ucos_index) == 0){
         rds_file$ucos_details$ucos$ucos_index = NULL
      }
      data = fineboost_summary(unlist(rds_file$var), 
      rds_file$ucos_details$ucos$ucos_index)
      table <- tibble(match_variant_number = data$match_variant,
              total_variant_number = data$total_variant_number,
              true_trait_number = data$true_trait_number,
              total_set_number = data$total_set_number,
              file = rds_file$file,
              out_file = file_path)
      return(table)
    }

    combined_table <- map_dfr(filenames, read_and_extract_rds)
    saveRDS(combined_table, ${_output:r})

  
  

Fineboost batch file#

work_dir="/home/hs3393/cb_simulation/simulation_result/fineboost"
job="fineboost"
mkdir -p ${work_dir}

mkdir -p ${work_dir}/code
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 68:00:00
#SBATCH --mem=25000
#SBATCH -J JOB
#SBATCH -o PWD/JOB.%j.out
#SBATCH -e PWD/JOB.%j.err

source ~/mamba_activate.sh
module load Singularity

cd /home/hs3393/cloud_colocalization/simulation_data/simulation_signal/causal_CAUSAL
sos run /home/hs3393/cb_simulation/simulation_code/Fineboost.ipynb fineboost \
    --simufile $(find -type f -name '*ncausal_CAUSAL*.rds') \
    --mem 25G \
    --cwd PWD/
EOF


for causal in 1 2 3 4 5; do
    output_script="causal_${causal}.sh"
    cat base_script | sed "s|PWD|${work_dir}|g" | sed "s|CAUSAL|${causal}|g"  | sed "s|JOB|${job}|g"  > ${output_script}
    sbatch ${output_script}
done

Fineboost summary batch file#

#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 48:00:00
#SBATCH --mem=40000
#SBATCH -J sum
#SBATCH -o /home/hs3393/cloud_colocalization/simulation_result/fineboost/log/summary.%j.out
#SBATCH -e /home/hs3393/cloud_colocalization/simulation_result/fineboost/log/summary.%j.err

source ~/mamba_activate.sh
module load Singularity

sos run /home/hs3393/cloud_colocalization/simulation_code/Fineboost.ipynb fineboost_summary \
    --folder /home/hs3393/cb_tune/simulation_result/fineboost/ \
    --cwd /home/hs3393/cb_tune/simulation_result/fineboost//summary

SuSiE running code#

[susie]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 10
parameter: walltime = "20h"
parameter: mem = "50G"
parameter: numThreads = 3
parameter: container = ""
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_susie.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    library("susieR")
    # read 50 traits a s group, if we are going to analyse 3 traits-colocalization, we
    # select the first 3 traits X-Y pari in this group
    file_list = c(${_input:ar,})
    rds_file = readRDS(file_list[1])

    rds = readRDS(${_input:ar})


    X = rds$X
    Y = as.matrix(rds$Y[, 1, drop = FALSE])
    variant = rds$variant[[1]]
  
  
    susie_result = susie(X, Y)
    # record true variant, analysed trait number and corresponding file name
    susie_result$var = variant
    susie_result$file = "${_input[0]:a}"
    # show which snp appear in at least two traits as the coloc_variants
    all_var = unlist(variant)
    
    cs_num = length(susie_result$sets$cs)
  
    cover_var_num = 0
    true_cs_num = 0
    if(length(susie_result$sets$cs) == 0){
      cover_var_num = 0    
      true_cs_num = 0
      total_cs_num = 0
    }else{
    
        # count how many var are covered
        for(i in 1:length(all_var)){
            cs_vars = unlist(susie_result$sets$cs)
            if(all_var[i] %in% cs_vars){
              cover_var_num = cover_var_num + 1 
            }

        }

        # cover how many cs are true
        for(i in 1:cs_num){
            if( length( intersect( susie_result$sets$cs[[i]] , all_var )) > 0 ){
            true_cs_num = true_cs_num + 1 
            }
        }
    }
  
    
    susie_result$cover_var_num = cover_var_num 
    susie_result$total_var_num = length(all_var)
    susie_result$true_cs_num = true_cs_num 
    susie_result$total_cs_num = cs_num
    # show which snp appear in at least two traits as the coloc_variants
    saveRDS(susie_result, ${_output:r})

Run SuSiE batch file#

work_dir="/home/hs3393//cloud_colocalization/simulation_result/susie"
job="susie"
mkdir -p ${work_dir}

mkdir -p ${work_dir}/code
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 68:00:00
#SBATCH --mem=25000
#SBATCH -J JOB
#SBATCH -o PWD/JOB.%j.out
#SBATCH -e PWD/JOB.%j.err

source ~/mamba_activate.sh
module load Singularity

cd /home/hs3393/cloud_colocalization/simulation_data/simulation_signal/causal_CAUSAL
sos run /home/hs3393//cloud_colocalization/simulation_code/Fineboost.ipynb susie \
    --simufile $(find -type f -name '*ncausal_CAUSAL*.rds') \
    --mem 25G \
    --cwd PWD/
EOF


for causal in 1 2 3 4 5; do
    output_script="causal_${causal}.sh"
    cat base_script | sed "s|PWD|${work_dir}|g" | sed "s|CAUSAL|${causal}|g"  | sed "s|JOB|${job}|g"  > ${output_script}
    sbatch ${output_script}
done

SuSiE summary batch file#

#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 48:00:00
#SBATCH --mem=40000
#SBATCH -J susie
#SBATCH -o /home/hs3393/cloud_colocalization/simulation_result/susie/log/summary.%j.out
#SBATCH -e /home/hs3393/cloud_colocalization/simulation_result/susie/log/summary.%j.err

source ~/mamba_activate.sh
module load Singularity

sos run /home/hs3393/cloud_colocalization/simulation_code/Fineboost.ipynb fineboost_summary \
    --folder /home/hs3393/cb_simulation/simulation_result/fineboost \
    --cwd /home/hs3393/cb_simulation/simulation_result/fineboost/summary