Fineboost (single trait ColocBoost)#
Goal#
This notebook takes the simulated input, runs single-trait ColocBoost (Fineboost), and saves results that can later be summarized to estimate power and FDR.
Input#
Individual-level data (X and Y) or summary statistics (z scores and LD) generated by the other notebooks. The file names of these data should be passed through the parameter simufile.
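As a rough sketch of what each simufile is expected to contain (field names taken from the running code below; the exact contents depend on the simulation notebook that generated the file):
# minimal sketch of the fields this workflow reads from each simufile RDS
rds = readRDS("sample_41_h2g_0.05_ncausal_3.rds")  # example input file
str(rds$X)        # n x p genotype matrix
str(rds$Y)        # n x k phenotype matrix; only the first column is used here
str(rds$variant)  # list of true causal variant indices, one element per trait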
Output#
Fineboost / fine-mapping original result, along with some summarized elements.
Example output:
result = readRDS("/home/hs3393/cloud_colocalization/simulation_result/fineboost/sample_41_h2g_0.05_ncausal_3_ntrait_fineboost.rds")
result$var
print("=====Prediction=====")
# prediction: number of CoS that contain a true causal variant
result$true_cs_num
# prediction: total number of CoS detected
result$total_cs_num
print("=====Truth=====")
# truth: number of true causal variants covered by a CoS
result$cover_var_num
# truth: total number of true causal variants
result$total_var_num
- 2654
- 8038
- 10675
[1] "=====Prediction====="
3
3
[1] "=====Truth====="
3
3
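From these fields, per-file power and FDR follow directly: power is the fraction of true causal variants covered by a CoS, and FDR is the fraction of reported CoS that contain no true causal variant. A minimal sketch using the fields saved by the running code below:
# per-file power and FDR from the summarized fields
power = result$cover_var_num / result$total_var_num  # 3 / 3 = 1 in this example
fdr = 1 - result$true_cs_num / result$total_cs_num   # 1 - 3 / 3 = 0 in this example (undefined when no CoS is reported)
power
fdr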
Fineboost running code#
[fineboost]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 15
parameter: walltime = "80h"
parameter: mem = "60G"
parameter: numThreads = 3
parameter: trait = 1
parameter: container = ""
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntrait_{step_name}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
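# load the colocboost functions from the development source directory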
for(file in list.files("/home/xc2270/COLOCBoost/code_COLOCBoost/colocboost_updating/", full.names = T)) {source(file)}
rds = readRDS(${_input:ar})
X = rds$X
Y = as.matrix(rds$Y[, 1, drop = FALSE])
variant = rds$variant[[1]]
start_time <- Sys.time()
colocboost_result = colocboost(
X = X,
Y = Y, output_level=2)
end_time <- Sys.time()
# record true variant, analysed trait number and corresponding file name
colocboost_result$var = variant
colocboost_result$file = "${_input[0]:a}"
# collect all true causal variants and count the CoS detected
all_var = unlist(variant)
cs_num = length(colocboost_result$ucos_details$ucos$ucos_index)
cover_var_num = 0
true_cs_num = 0
if(length(colocboost_result$ucos_details$ucos$ucos_index) == 0){
cover_var_num = 0
true_cs_num = 0
total_cs_num = 0
}else{
# count how many true causal variants are covered by a CoS
for(i in 1:length(all_var)){
cs_vars = unlist(colocboost_result$ucos_details$ucos$ucos_index)
if(all_var[i] %in% cs_vars){
cover_var_num = cover_var_num + 1
}
}
# count how many CoS contain a true causal variant
for(i in 1:cs_num){
if( length( intersect( colocboost_result$ucos_details$ucos$ucos_index[[i]] , all_var )) > 0 ){
true_cs_num = true_cs_num + 1
}
}
}
colocboost_report = colocboost_result
colocboost_report$cover_var_num = cover_var_num
colocboost_report$total_var_num = length(all_var)
colocboost_report$true_cs_num = true_cs_num
colocboost_report$total_cs_num = cs_num
colocboost_report$time = end_time - start_time
saveRDS(colocboost_report, ${_output:r})
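The two counting loops above can also be written as vectorized set operations; this is only an equivalent sketch of the same logic (it also covers the empty-CoS case), not part of the original workflow:
# equivalent, vectorized version of the counting loops above
cs_list = colocboost_result$ucos_details$ucos$ucos_index
cover_var_num = sum(all_var %in% unlist(cs_list))
true_cs_num = sum(vapply(cs_list, function(cs) length(intersect(cs, all_var)) > 0, logical(1)))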
Fineboost summary code#
[fineboost_summary]
parameter: folder = path
parameter: cwd = path("output")
parameter: job_size = 1
parameter: walltime = "50h"
parameter: mem = "60G"
parameter: numThreads = 1
parameter: container = ""
input: folder , group_by = 1
output: f'{cwd:a}/{_input[0]:b}_summary.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
library("tidyverse")
source("/home/hs3393/cloud_colocalization/simulation_code/fineboost_summary.r")
filenames <- list.files(${folder:r}, pattern = "\\.rds$", full.names = TRUE, recursive = TRUE)
read_and_extract_rds <- function(file_path) {
rds_file <- readRDS(file_path) # Read the RDS file
if(length(rds_file$ucos_details$ucos$ucos_index) == 0){
rds_file$ucos_details$ucos$ucos_index = NULL
}
data = fineboost_summary(unlist(rds_file$var),
rds_file$ucos_details$ucos$ucos_index)
table <- tibble(match_variant_number = data$match_variant,
total_variant_number = data$total_variant_number,
true_trait_number = data$true_trait_number,
total_set_number = data$total_set_number,
file = rds_file$file,
out_file = file_path)
return(table)
}
combined_table <- map_dfr(filenames, read_and_extract_rds)
saveRDS(combined_table, ${_output:r})
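The combined table can then be aggregated across all simulation files to obtain overall power and FDR. A minimal sketch, assuming match_variant_number / total_variant_number count covered and total true causal variants, and true_trait_number / total_set_number count true and total CoS:
# overall power and FDR across simulation replicates
library("tidyverse")
combined_table = readRDS("fineboost_summary.rds")  # the summary file written above; exact name depends on the input folder and cwd
combined_table %>%
  summarise(power = sum(match_variant_number) / sum(total_variant_number),
            fdr = 1 - sum(true_trait_number) / sum(total_set_number))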
Fineboost batch file#
work_dir="/home/hs3393/cb_simulation/simulation_result/fineboost"
job="fineboost"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
cd ${work_dir}/code
# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 68:00:00
#SBATCH --mem=25000
#SBATCH -J JOB
#SBATCH -o PWD/JOB.%j.out
#SBATCH -e PWD/JOB.%j.err
source ~/mamba_activate.sh
module load Singularity
cd /home/hs3393/cloud_colocalization/simulation_data/simulation_signal/causal_CAUSAL
sos run /home/hs3393/cb_simulation/simulation_code/Fineboost.ipynb fineboost \
--simufile $(find -type f -name '*ncausal_CAUSAL*.rds') \
--mem 25G \
--cwd PWD/
EOF
for causal in 1 2 3 4 5; do
output_script="causal_${causal}.sh"
cat base_script | sed "s|PWD|${work_dir}|g" | sed "s|CAUSAL|${causal}|g" | sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
done
Fineboost summary batch file#
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 48:00:00
#SBATCH --mem=40000
#SBATCH -J sum
#SBATCH -o /home/hs3393/cloud_colocalization/simulation_result/fineboost/log/summary.%j.out
#SBATCH -e /home/hs3393/cloud_colocalization/simulation_result/fineboost/log/summary.%j.err
source ~/mamba_activate.sh
module load Singularity
sos run /home/hs3393/cloud_colocalization/simulation_code/Fineboost.ipynb fineboost_summary \
--folder /home/hs3393/cb_tune/simulation_result/fineboost/ \
--cwd /home/hs3393/cb_tune/simulation_result/fineboost/summary
SuSiE running code#
[susie]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 10
parameter: walltime = "20h"
parameter: mem = "50G"
parameter: numThreads = 3
parameter: container = ""
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_susie.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
library("susieR")
# read the simulated data; only the first trait (X-Y pair) is analysed here
rds = readRDS(${_input:ar})
X = rds$X
Y = as.matrix(rds$Y[, 1, drop = FALSE])
variant = rds$variant[[1]]
susie_result = susie(X, Y)
# record true variant, analysed trait number and corresponding file name
susie_result$var = variant
susie_result$file = "${_input[0]:a}"
# collect all true causal variants and count the credible sets detected
all_var = unlist(variant)
cs_num = length(susie_result$sets$cs)
cover_var_num = 0
true_cs_num = 0
if(length(susie_result$sets$cs) == 0){
cover_var_num = 0
true_cs_num = 0
total_cs_num = 0
}else{
# count how many true causal variants are covered by a credible set
for(i in 1:length(all_var)){
cs_vars = unlist(susie_result$sets$cs)
if(all_var[i] %in% cs_vars){
cover_var_num = cover_var_num + 1
}
}
# count how many credible sets contain a true causal variant
for(i in 1:cs_num){
if( length( intersect( susie_result$sets$cs[[i]] , all_var )) > 0 ){
true_cs_num = true_cs_num + 1
}
}
}
susie_result$cover_var_num = cover_var_num
susie_result$total_var_num = length(all_var)
susie_result$true_cs_num = true_cs_num
susie_result$total_cs_num = cs_num
saveRDS(susie_result, ${_output:r})
Run SuSiE batch file#
work_dir="/home/hs3393//cloud_colocalization/simulation_result/susie"
job="susie"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
cd ${work_dir}/code
# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 68:00:00
#SBATCH --mem=25000
#SBATCH -J JOB
#SBATCH -o PWD/JOB.%j.out
#SBATCH -e PWD/JOB.%j.err
source ~/mamba_activate.sh
module load Singularity
cd /home/hs3393/cloud_colocalization/simulation_data/simulation_signal/causal_CAUSAL
sos run /home/hs3393/cloud_colocalization/simulation_code/Fineboost.ipynb susie \
--simufile $(find -type f -name '*ncausal_CAUSAL*.rds') \
--mem 25G \
--cwd PWD/
EOF
for causal in 1 2 3 4 5; do
output_script="causal_${causal}.sh"
cat base_script | sed "s|PWD|${work_dir}|g" | sed "s|CAUSAL|${causal}|g" | sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
done
SuSiE summary batch file#
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 48:00:00
#SBATCH --mem=40000
#SBATCH -J susie
#SBATCH -o /home/hs3393/cloud_colocalization/simulation_result/susie/log/summary.%j.out
#SBATCH -e /home/hs3393/cloud_colocalization/simulation_result/susie/log/summary.%j.err
source ~/mamba_activate.sh
module load Singularity
sos run /home/hs3393/cloud_colocalization/simulation_code/Fineboost.ipynb fineboost_summary \
--folder /home/hs3393/cb_simulation/simulation_result/fineboost \
--cwd /home/hs3393/cb_simulation/simulation_result/fineboost/summary