FineBoost (single trait ColocBoost)#
Goal#
This notebook takes the input and run FineBoost (single trait version of ColocBoost), the result can be summarized to get power and FDR.
Input#
simufile
: Individual level data X and Y introduced in Phenotype Simulation.
Output#
Fineboost / fine-mapping original result, along with some summarized elements.
cover_var_num
: sets of variants identified as causaltrue_cs_num
: variants that are truly causal (ground truth)total_var_num
: total number of truly causal (ground truth)
Example output:
result = readRDS("../simulation_result/fineboost/sample_41_h2g_0.05_ncausal_3_ntrait_fineboost.rds")
result$var
print("=====Prediction=====")
# colocalization result - CoS
result$true_cs_num
# colocalization result - colocalizing trait
result$total_cs_num
print("=====Truth=====")
# true colocalizing CoS
result$cover_var_num
# true colocalizing trait
result$total_var_num
- 2654
- 8038
- 10675
[1] "=====Prediction====="
3
3
[1] "=====Truth====="
3
3
Fineboost Running code#
[fineboost]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 15
parameter: walltime = "80h"
parameter: mem = "60G"
parameter: numThreads = 3
parameter: trait = 1
parameter: container = ""
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntrait_{step_name}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
# FineBoost: Single-trait causal discovery
library("tidyverse")
# --- Load colocboost function files ---
# devtools::install_github("StatFunGen/colocboost")
# for(file in list.files("/home/xc2270/COLOCBoost/code_COLOCBoost/release", full.names = T)) {source(file)}
library("colocboost")
# --- Load input ---
rds = readRDS(${_input:ar})
X = rds$X
Y = as.matrix(rds$Y[, 1, drop = FALSE])
variant = rds$variant[[1]]
# --- Run colocboost (fineboost mode) ---
start_time <- Sys.time()
colocboost_result = colocboost(
X = X,
Y = Y,
output_level = 2
)
end_time <- Sys.time()
# --- Post-processing ---
colocboost_result$var = variant
colocboost_result$file = "${_input[0]:a}"
all_var = unlist(variant)
cs_num = length(colocboost_result$ucos_details$ucos$ucos_index)
cover_var_num = 0
true_cs_num = 0
if (cs_num == 0) {
cover_var_num = 0
true_cs_num = 0
total_cs_num = 0
} else {
# How many true causal variants are covered
cs_vars = unlist(colocboost_result$ucos_details$ucos$ucos_index)
cover_var_num = sum(all_var %in% cs_vars)
# How many credible sets overlap a true variant
for (i in 1:cs_num) {
if (length(intersect(colocboost_result$ucos_details$ucos$ucos_index[[i]], all_var)) > 0) {
true_cs_num = true_cs_num + 1
}
}
}
# --- Summarize results ---
colocboost_report = colocboost_result
colocboost_report$cover_var_num = cover_var_num
colocboost_report$total_var_num = length(all_var)
colocboost_report$true_cs_num = true_cs_num
colocboost_report$total_cs_num = cs_num
colocboost_report$time = end_time - start_time
saveRDS(colocboost_report, ${_output:r})
Fineboost summary code#
[fineboost_summary]
parameter: folder = path
parameter: cwd = path("output")
parameter: job_size = 1
parameter: walltime = "50h"
parameter: mem = "60G"
parameter: numThreads = 1
parameter: container = ""
input: folder, group_by = 1
output: f'{cwd:a}/{_input[0]:b}_summary.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
# FineBoost summary script
library(tidyverse)
# load summary code
variant_vector = function(true_variant, coloc_set){
var_vec = numeric(length(true_variant))
for(i in c(1:length(true_variant))){
for (j in c(1:length(coloc_set))){
if(true_variant[i] %in% unlist(coloc_set[[j]])){
var_vec[i] = 1
}else{
var_vec[i] = var_vec[i]
}
}
}
return(var_vec)
}
set_vector = function(true_variant, coloc_set){
set_vec = numeric(length(coloc_set))
for(i in c(1:length(true_variant))){
for (j in c(1:length(coloc_set))){
inter = intersect(unlist(coloc_set[[j]]),
true_variant)
if(length(inter > 0)){
set_vec[j] = 1
}else{
set_vec[j] = 0
}
}
return(set_vec)
}
}
set_size = function(coloc_set){
size_vec = numeric(length(coloc_set))
for (j in c(1:length(coloc_set))){
size_vec[j] = length(unlist(coloc_set[j]))
}
return(size_vec)
}
fineboost_summary = function(true_variant, coloc_set){
return(list(
match_variant = sum(variant_vector(true_variant, coloc_set)),
true_trait_number = sum(set_vector(true_variant, coloc_set)),
total_variant_number = length(true_variant),
total_set_number = length(coloc_set)
))
}
# List all RDS result files
filenames <- list.files(${folder:r}, pattern = "*.rds$", full.names = TRUE, recursive = TRUE)
# Function to read and summarize each RDS
read_and_extract_rds <- function(file_path) {
rds_file <- readRDS(file_path)
# If no ucos detected, set to NULL
if (length(rds_file$ucos_details$ucos$ucos_index) == 0) {
rds_file$ucos_details$ucos$ucos_index = NULL
}
# Run fineboost summary
data = fineboost_summary(
unlist(rds_file$var),
rds_file$ucos_details$ucos$ucos_index
)
# Build output table
table <- tibble(
match_variant_number = data$match_variant,
total_variant_number = data$total_variant_number,
true_trait_number = data$true_trait_number,
total_set_number = data$total_set_number,
file = rds_file$file,
out_file = file_path
)
return(table)
}
# Combine all summary tables
combined_table <- map_dfr(filenames, read_and_extract_rds)
# Save combined summary
saveRDS(combined_table, ${_output:r})
Fineboost batch file#
work_dir="/home/hs3393/cb_simulation/simulation_result/fineboost"
job="fineboost"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
cd ${work_dir}/code
# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 68:00:00
#SBATCH --mem=25000
#SBATCH -J JOB
#SBATCH -o PWD/JOB.%j.out
#SBATCH -e PWD/JOB.%j.err
source ~/mamba_activate.sh
module load Singularity
cd /home/hs3393/cloud_colocalization/simulation_data/simulation_signal/causal_CAUSAL
sos run /home/hs3393/colocboost-paper/Simulation_Studies/9_Fineboost.ipynb fineboost \
--simufile $(find -type f -name '*ncausal_CAUSAL*.rds') \
--mem 25G \
--cwd PWD/
EOF
for causal in 1 2 3 4 5; do
output_script="causal_${causal}.sh"
cat base_script | sed "s|PWD|${work_dir}|g" | sed "s|CAUSAL|${causal}|g" | sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
done
Fineboost summary batch file#
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 48:00:00
#SBATCH --mem=40000
#SBATCH -J sum
#SBATCH -o /home/hs3393/cloud_colocalization/simulation_result/fineboost/log/summary.%j.out
#SBATCH -e /home/hs3393/cloud_colocalization/simulation_result/fineboost/log/summary.%j.err
source ~/mamba_activate.sh
module load Singularity
sos run /home/hs3393/colocboost-paper/Simulation_Studies/9_Fineboost.ipynb fineboost_summary \
--folder /home/hs3393/cb_tune/simulation_result/fineboost/ \
--cwd /home/hs3393/cb_tune/simulation_result/fineboost//summary
SuSiE running code#
[susie]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 10
parameter: walltime = "20h"
parameter: mem = "50G"
parameter: numThreads = 3
parameter: container = ""
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_susie.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
# Susie finemapping script
library("susieR")
# Read input file
rds = readRDS(${_input:ar})
X = rds$X
Y = as.matrix(rds$Y[, 1, drop = FALSE])
variant = rds$variant[[1]]
# Run SuSiE
susie_result = susie(X, Y)
# Record true variant, analyzed trait number, and file name
susie_result$var = variant
susie_result$file = "${_input[0]:a}"
all_var = unlist(variant)
cs_num = length(susie_result$sets$cs)
# Initialize counts
cover_var_num = 0
true_cs_num = 0
# If no credible sets found
if (length(susie_result$sets$cs) == 0) {
cover_var_num = 0
true_cs_num = 0
total_cs_num = 0
} else {
# Count how many true causal variants are covered
cs_vars = unlist(susie_result$sets$cs)
for (i in 1:length(all_var)) {
if (all_var[i] %in% cs_vars) {
cover_var_num = cover_var_num + 1
}
}
# Count how many credible sets contain at least one true variant
for (i in 1:cs_num) {
if (length(intersect(susie_result$sets$cs[[i]], all_var)) > 0) {
true_cs_num = true_cs_num + 1
}
}
}
susie_result$cover_var_num = cover_var_num
susie_result$total_var_num = length(all_var)
susie_result$true_cs_num = true_cs_num
susie_result$total_cs_num = cs_num
saveRDS(susie_result, ${_output:r})
Run SuSiE batch file#
work_dir="/home/hs3393//cloud_colocalization/simulation_result/susie"
job="susie"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
cd ${work_dir}/code
# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 68:00:00
#SBATCH --mem=25000
#SBATCH -J JOB
#SBATCH -o PWD/JOB.%j.out
#SBATCH -e PWD/JOB.%j.err
source ~/mamba_activate.sh
module load Singularity
cd /home/hs3393/cloud_colocalization/simulation_data/simulation_signal/causal_CAUSAL
sos run /home/hs3393/colocboost-paper/Simulation_Studies/9_Fineboost.ipynb susie \
--simufile $(find -type f -name '*ncausal_CAUSAL*.rds') \
--mem 25G \
--cwd PWD/
EOF
for causal in 1 2 3 4 5; do
output_script="causal_${causal}.sh"
cat base_script | sed "s|PWD|${work_dir}|g" | sed "s|CAUSAL|${causal}|g" | sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
done
SuSiE summary batch file#
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 48:00:00
#SBATCH --mem=40000
#SBATCH -J susie
#SBATCH -o /home/hs3393/cloud_colocalization/simulation_result/susie/log/summary.%j.out
#SBATCH -e /home/hs3393/cloud_colocalization/simulation_result/susie/log/summary.%j.err
source ~/mamba_activate.sh
module load Singularity
sos run /home/hs3393/colocboost-paper/Simulation_Studies/9_Fineboost.ipynb fineboost_summary \
--folder /home/hs3393/cb_simulation/simulation_result/fineboost \
--cwd /home/hs3393/cb_simulation/simulation_result/fineboost/summary