Secondary Simulations#

This notebook includes the data simulation of 50 traits, as well as the complex configurations suggested in the HyPrColoc paper.

Goal#

To generate the 50 traits, for each causal variant we randomly selected 10-25 traits to colocalize on that variant, as sketched below.
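A minimal sketch of this assignment scheme, condensed from the full workflow in the Simulation code section below (the seed and ncausal = 3 here are purely illustrative):

set.seed(1)                          # illustrative seed
ncausal = 3                          # illustrative number of causal variants
n_trait = 50
# Each causal variant affects a random subset of 10-25 of the 50 traits
trait_number_vector = sample(10:25, size = ncausal, replace = TRUE)
trait_list = lapply(trait_number_vector, function(k) sample(1:n_trait, size = k))
str(trait_list)                      # traits colocalizing on each causal variant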

We also simulated 10-trait complex configuration cases (the 5+5, 3+3+2+2, 5+5+1rand, and 5+5+2rand clusters below) described in and extended from the HyPrColoc paper (see details in Supplementary Note S.6.2 of the ColocBoost paper).

Input#

genofile: PLINK files of real genotypes, plink_by_gene/extended_cis_before_winsorize_plink_files/*.bim

The other parameters can be found in the simxQTL repo: https://github.com/StatFunGen/simxQTL.

Output#

An RDS object with a genotype matrix \(X\) (dimension \(N \times P\), with \(N\) samples and \(P\) variants) and a phenotype (trait) matrix \(Y\) (dimension \(N \times L\), with \(N\) samples and \(L\) simulated traits).

Example output:

result = readRDS("../simulation_data/simulation_551rand_complex/sample_39_h2g_0.05_10trait_cluster_5+5+1rand.rds")
result$variant
  Trait 1:  3620, 5240
  Trait 2:  3620
  Trait 3:  3620
  Trait 4:  3620, 5240
  Trait 5:  3620
  Trait 6:  250, 5240
  Trait 7:  250
  Trait 8:  250
  Trait 9:  250
  Trait 10: 250, 5240

In this region, we simulated 3 causal variants (250, 3620, 5240). Following the 5+5+1rand design, one variant is causal for traits 1-5, another for traits 6-10, and the third is additionally shared by a random subset of 4 of the 10 traits.
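Beyond the result$variant list, a quick sanity check (a minimal sketch that only relies on the fields saved by the workflows below) is to confirm the matrix dimensions and tabulate which traits share each causal variant:

# Dimensions of the saved matrices
dim(result$X)   # N samples x P variants
dim(result$Y)   # N samples x L traits

# For every causal variant, list the traits it affects
causal_vars = sort(unique(unlist(result$variant)))
names(causal_vars) = causal_vars
lapply(causal_vars, function(v) which(sapply(result$variant, function(tr) v %in% tr)))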

Simulation code#

[simulation_50trait]
# ======= Define parameters =======
parameter: genofile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: independent = False
parameter: n_trait = 50
parameter: h2g = 0.05
parameter: ncausal = 5
parameter: share_pattern = "all"
parameter: container = ""

# ======= Workflow input/output =======
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_h2g_{h2g}_50_trait_ncausal_{ncausal}.rds'

task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'

# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container

    # --- Load necessary libraries ---
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
  
    # install simulation package
    # devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
    # BiocManager::install("StatFunGen/pecotmr")
    library("pecotmr")
    library("simxQTL")

    # --- Read genotype data ---
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})

    # --- Extract gene name and TSS ---
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("./Data/gene_cis_TADB_mapper.tsv")
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]

    # --- Filter SNPs by distance to TSS ---
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[, keep_index]

    # --- Apply SNP filters: missingness and MAF ---
    imiss = 0.1
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)

    # --- Define number of causal variants ---
    ncausal = ${ncausal}
    indep = ${"TRUE" if independent else "FALSE"}

    # ======= Consistent causal variant selection =======

    # --- Select causal variants ---
    if (indep) {
        LD_vars = 1  # Initialize LD check

        if (ncausal == 1) {
            # Only one causal variant needed
            vars = sample(1:ncol(Xmat), size = ncausal)
        } else {
            # Ensure selected causal variants are approximately independent (LD < 0.3)
            while (length(LD_vars) != 0) {
                vars = sample(1:ncol(Xmat), size = ncausal)
                cor_mat = cor(Xmat[, vars])
                LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
            }
        }
    } else {
        LD_vars = 1  # Initialize LD check

        if (ncausal == 1) {
            vars = sample(1:ncol(Xmat), size = ncausal)
        } else {
            # Allow non-independent variants, but avoid perfectly correlated ones (|cor|==1)
            while (length(LD_vars) != 0) {
                vars = sample(1:ncol(Xmat), size = ncausal)
                cor_mat = cor(Xmat[, vars])
                LD_vars = which(colSums(abs(cor_mat) == 1) > 1)
            }
        }
    }

    # ======= Continue secondary simulation design =======

    # --- Initialize effect size matrix ---
    B = matrix(0, nrow = ncol(Xmat), ncol = 50)

    # --- Randomly assign number of traits each variant affects ---
    trait_number_vector = sample(x = c(10:25), size = ncausal, replace = TRUE)

    # --- Assign traits per causal variant ---
    trait_list = list()
    for (i in 1:ncausal) {
        trait_list[[i]] = sample(x = 1:50, size = trait_number_vector[i])
    }

    # --- Simulate phenotype data for each trait ---
    phenotype = list()
    for (i in 1:50) {
        index = which(unlist(lapply(trait_list, function(x) i %in% x)))

        if (length(index) > 0) {
            beta = sim_beta_fix_variant(G = Xmat, causal_index = vars[index], is_h2g_total = FALSE)
            B[, i] = beta
            pheno_single = sim_multi_traits(G = Xmat, B = B[, i, drop = FALSE], h2g = ${h2g}, is_h2g_total = FALSE)
            phenotype[[i]] = pheno_single$P
        } else {
            pheno_single = sim_multi_traits(G = Xmat, B = B[, i, drop = FALSE], h2g = ${h2g}, is_h2g_total = FALSE)
            phenotype[[i]] = pheno_single$P
        }
    }

    # --- Identify causal variants for each trait ---
    variant = list()
    for (i in 1:ncol(B)) {
        variant[[i]] = which(B[, i] != 0)
    }

    # --- Assemble final dataset ---
    X = Xmat
    Y = bind_cols(phenotype)
    colnames(Y) = paste0("Trait", 1:50)

    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = Y
    data[["variant"]] = variant

    # --- Save the output RDS file ---
    saveRDS(data, ${_output:r})

Phenotype simulation: bash submission#

work_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_50trait"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/5_Simulation_secondary.ipynb JOB \
        --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
        --ncausal CAUSAL --mem 30G --h2g 0.05 --independent \
        --cwd /home/hs3393/cb_Mar/simulation_data/
EOF

for ncausal in 1 2 3 4 5; do
    base_sh="base_script"
    output_script="${job}_causal_${ncausal}.sh"
    cat ${base_sh}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g"|  sed "s|CAUSAL|${ncausal}|g"  > ${output_script}
    sbatch ${output_script}
done

50 trait: Run ColocBoost#

data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_50trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2_Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*_ncausal_NCAUSAL*.rds') \
    --mem 40G --trait 50 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3 4 5; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done

50 trait: Run HyPrColoc#

data_dir="/home/hs3393/cb_Mar/simulation_data/simulation_50trait/"
job="simulation_50trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=20000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*_ncausal_NCAUSAL*.rds') \
    --mem 20G --trait 50 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3 4 5; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done

50 trait: ColocBoost summary#

data_dir="/home/hs3393/cb_Mar/simulation_result/simulation_50trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary

cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 8:00:00
#SBATCH --mem=30000
#SBATCH -J sum
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/4_Result_Summary.ipynb coloc_summary \
    --folder DATA_DIR/result \
    --cwd DATA_DIR/summary
EOF


base_script="summary_script"
output_script="summary.sh"
cat ${base_script}|  sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

50 trait: HyPrColoc summary#

data_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/simulation_50trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary

cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 8:00:00
#SBATCH --mem=30000
#SBATCH -J sum
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/4_Result_Summary.ipynb coloc_summary \
    --folder DATA_DIR/result \
    --cwd DATA_DIR/summary
EOF


base_script="summary_script"
output_script="summary.sh"
cat ${base_script}|  sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

Complex simulation#

[simulation_55_complex]
# ======= Define parameters =======
parameter: genofile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: n_trait = 10
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: share_pattern = "all"
parameter: independent = False
parameter: container = ""

# ======= Workflow input/output =======
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_h2g_{h2g}_10trait_cluster_5+5.rds'

task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'

# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container

    # --- Load necessary libraries ---
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
  
    # install simulation package
    # devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
    # BiocManager::install("StatFunGen/pecotmr")
    library("pecotmr")
    library("simxQTL")

    # --- Read genotype data ---
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})

    # --- Extract gene name and TSS ---
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("./Data/gene_cis_TADB_mapper.tsv")
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]

    # --- Filter SNPs by distance to TSS ---
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[, keep_index]

    # --- Apply SNP filters: missingness and MAF ---
    imiss = 0.1
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)

    # ======= Select two causal variants manually =======

    indep = ${"TRUE" if independent else "FALSE"}

    LD_vars = 1
    while (length(LD_vars) != 0) {
        vars = sample(1:ncol(Xmat), size = 2)
        cor_mat = cor(Xmat[, vars])

        if (indep) {
            # Check if correlation between the two selected SNPs is acceptable (< 0.3)
            LD_vars = which(abs(cor_mat[1,2]) > 0.3)
        } else {
            # Allow correlated variants unless perfectly correlated (|cor|==1)
            LD_vars = which(abs(cor_mat[1,2]) == 1)
        }
    }

    var1 = vars[1]
    var2 = vars[2]

    # ======= Assign effects manually =======

    B = matrix(0, nrow = ncol(Xmat), ncol = 10)

    # First 5 traits (Trait1 to Trait5) share var1
    beta1 = sim_beta_fix_variant(G = Xmat, causal_index = var1, is_h2g_total = FALSE)
    for (i in 1:5) {
        B[, i] = beta1
    }

    # Next 5 traits (Trait6 to Trait10) share var2
    beta2 = sim_beta_fix_variant(G = Xmat, causal_index = var2, is_h2g_total = FALSE)
    for (i in 6:10) {
        B[, i] = beta2
    }

    # ======= Simulate phenotypes =======
    phenotype = sim_multi_traits(
        G = Xmat,
        B = B,
        h2g = ${h2g},
        is_h2g_total = ${"TRUE" if total_h2g else "FALSE"}
    )
    phenotype = phenotype$P

    # ======= Assemble the final dataset =======
    X = Xmat
    Y = phenotype

    variant = list()
    for (i in 1:ncol(B)) {
        variant[[i]] = which(B[, i] != 0)
    }

    data = list()
    data[["X"]] = X
    data[["Y"]] = Y
    data[["variant"]] = variant

    # ======= Save the final dataset =======
    saveRDS(data, ${_output:r})

[simulation_3322_complex]
# ======= Define parameters =======
parameter: genofile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: n_trait = 10
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: share_pattern = "all"
parameter: independent = False
parameter: container = ""

# ======= Workflow input/output =======
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_h2g_{h2g}_10trait_cluster_3+3+2+2.rds'

task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'

# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container

    # --- Load necessary libraries ---
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
  
    # install simulation package
    # devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
    # BiocManager::install("StatFunGen/pecotmr")
    library("pecotmr")
    library("simxQTL")

    # --- Read genotype data ---
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})

    # --- Extract gene name and TSS ---
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("./Data/gene_cis_TADB_mapper.tsv")
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]

    # --- Filter SNPs by distance to TSS ---
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[, keep_index]

    # --- Apply SNP filters: missingness and MAF ---
    imiss = 0.1
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)

    # ======= Select 4 causal variants manually =======
    indep = ${"TRUE" if independent else "FALSE"}

    LD_vars = 1
    while (length(LD_vars) != 0) {
        vars = sample(1:ncol(Xmat), size = 4)
        cor_mat = cor(Xmat[, vars])

        if (indep) {
            # Check if any pair has correlation > 0.3
            LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
        } else {
            # Only reject if any pair is perfectly correlated (|cor|==1)
            LD_vars = which(colSums(abs(cor_mat) == 1) > 1)
        }
    }

    var1 = vars[1]
    var2 = vars[2]
    var3 = vars[3]
    var4 = vars[4]

    # ======= Assign effects manually =======
    B = matrix(0, nrow = ncol(Xmat), ncol = 10)

    # Traits 1-3 share var1
    beta1 = sim_beta_fix_variant(G = Xmat, causal_index = var1, is_h2g_total = FALSE)
    for (i in 1:3) {
        B[, i] = beta1
    }

    # Traits 4-6 share var2
    beta2 = sim_beta_fix_variant(G = Xmat, causal_index = var2, is_h2g_total = FALSE)
    for (i in 4:6) {
        B[, i] = beta2
    }

    # Traits 7-8 share var3
    beta3 = sim_beta_fix_variant(G = Xmat, causal_index = var3, is_h2g_total = FALSE)
    for (i in 7:8) {
        B[, i] = beta3
    }

    # Traits 9-10 share var4
    beta4 = sim_beta_fix_variant(G = Xmat, causal_index = var4, is_h2g_total = FALSE)
    for (i in 9:10) {
        B[, i] = beta4
    }

    # ======= Simulate phenotypes =======
    phenotype = sim_multi_traits(
        G = Xmat,
        B = B,
        h2g = ${h2g},
        is_h2g_total = ${"TRUE" if total_h2g else "FALSE"}
    )
    phenotype = phenotype$P

    # ======= Assemble the final dataset =======
    X = Xmat
    Y = phenotype

    variant = list()
    for (i in 1:ncol(B)) {
        variant[[i]] = which(B[, i] != 0)
    }

    data = list()
    data[["X"]] = X
    data[["Y"]] = Y
    data[["variant"]] = variant

    # ======= Save the final dataset =======
    saveRDS(data, ${_output:r})

[simulation_551rand_complex]
# ======= Define parameters =======
parameter: genofile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: n_trait = 10
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: share_pattern = "all"
parameter: independent = False
parameter: container = ""

# ======= Workflow input/output =======
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_h2g_{h2g}_10trait_cluster_5+5+1rand.rds'

task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'

# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container

    # --- Load necessary libraries ---
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
  
    # install simulation package
    # devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
    # BiocManager::install("StatFunGen/pecotmr")
    library("pecotmr")
    library("simxQTL")

    # --- Read genotype data ---
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})

    # --- Extract gene name and TSS ---
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("./Data/gene_cis_TADB_mapper.tsv")
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]

    # --- Filter SNPs by distance to TSS ---
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[, keep_index]

    # --- Apply SNP filters: missingness and MAF ---
    imiss = 0.1
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)

    # ======= Select 3 causal variants manually =======
    indep = ${"TRUE" if independent else "FALSE"}

    LD_vars = 1
    while (length(LD_vars) != 0) {
        vars = sample(1:ncol(Xmat), size = 3)
        cor_mat = cor(Xmat[, vars])

        if (indep) {
            LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
        } else {
            LD_vars = which(colSums(abs(cor_mat) == 1) > 1)
        }
    }

    var1 = vars[1]  # for first 5 traits
    var2 = vars[2]  # for last 5 traits
    var3 = vars[3]  # for randomly overlapping traits

    # ======= Assign effects manually =======
    B = matrix(0, nrow = ncol(Xmat), ncol = 10)

    # Randomly pick 4 traits that will also share var3
    rand_var_trait = sample(x = 1:10, size = 4)

    # First 5 traits (Trait1 to Trait5)
    for (i in 1:5) {
        if (i %in% rand_var_trait) {
            # Share var1 and var3
            beta = sim_beta_fix_variant(G = Xmat, causal_index = c(var1, var3), is_h2g_total = FALSE)
        } else {
            # Only share var1
            beta = sim_beta_fix_variant(G = Xmat, causal_index = var1, is_h2g_total = FALSE)
        }
        B[, i] = beta
    }

    # Next 5 traits (Trait6 to Trait10)
    for (i in 6:10) {
        if (i %in% rand_var_trait) {
            # Share var2 and var3
            beta = sim_beta_fix_variant(G = Xmat, causal_index = c(var2, var3), is_h2g_total = FALSE)
        } else {
            # Only share var2
            beta = sim_beta_fix_variant(G = Xmat, causal_index = var2, is_h2g_total = FALSE)
        }
        B[, i] = beta
    }

    # ======= Simulate phenotypes =======
    phenotype = sim_multi_traits(
        G = Xmat,
        B = B,
        h2g = ${h2g},
        is_h2g_total = ${"TRUE" if total_h2g else "FALSE"}
    )
    phenotype = phenotype$P

    # ======= Assemble the final dataset =======
    X = Xmat
    Y = phenotype

    variant = list()
    for (i in 1:ncol(B)) {
        variant[[i]] = which(B[, i] != 0)
    }

    data = list()
    data[["X"]] = X
    data[["Y"]] = Y
    data[["variant"]] = variant

    # ======= Save the final dataset =======
    saveRDS(data, ${_output:r})

[simulation_552rand_complex]
# ======= Define parameters =======
parameter: genofile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: n_trait = 10
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: share_pattern = "all"
parameter: independent = False
parameter: container = ""

# ======= Workflow input/output =======
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_h2g_{h2g}_10trait_cluster_5+5+2rand.rds'

task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'

# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container

    # --- Load necessary libraries ---
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
  
    # install simulation package
    # devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
    # BiocManager::install("StatFunGen/pecotmr")
    library("pecotmr")
    library("simxQTL")

    # --- Read genotype data ---
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})

    # --- Extract gene name and TSS ---
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("./Data/gene_cis_TADB_mapper.tsv")
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]

    # --- Filter SNPs by distance to TSS ---
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[, keep_index]

    # --- Apply SNP filters: missingness and MAF ---
    imiss = 0.1
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)

    # ======= Select 4 causal variants manually =======
    indep = ${"TRUE" if independent else "FALSE"}

    LD_vars = 1
    while (length(LD_vars) != 0) {
        vars = sample(1:ncol(Xmat), size = 4)
        cor_mat = cor(Xmat[, vars])

        if (indep) {
            LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
        } else {
            LD_vars = which(colSums(abs(cor_mat) == 1) > 1)
        }
    }

    var1 = vars[1]  # Main for traits 1-5
    var2 = vars[2]  # Main for traits 6-10
    var3 = vars[3]  # Random additional causal variant
    var4 = vars[4]  # Another random additional causal variant

    # ======= Assign effects manually =======
    B = matrix(0, nrow = ncol(Xmat), ncol = 10)

    # Randomly pick traits that will also share var3 and var4
    rand_var_trait1 = sample(x = 1:10, size = 4)
    rand_var_trait2 = sample(x = 1:10, size = 4)

    # First 5 traits (Trait1 to Trait5)
    for (i in 1:5) {
        var_vec = c(var1)
        if (i %in% rand_var_trait1) {
            var_vec = c(var_vec, var3)
        }
        if (i %in% rand_var_trait2) {
            var_vec = c(var_vec, var4)
        }
        beta = sim_beta_fix_variant(G = Xmat, causal_index = var_vec, is_h2g_total = FALSE)
        B[, i] = beta
    }

    # Next 5 traits (Trait6 to Trait10)
    for (i in 6:10) {
        var_vec = c(var2)
        if (i %in% rand_var_trait1) {
            var_vec = c(var_vec, var3)
        }
        if (i %in% rand_var_trait2) {
            var_vec = c(var_vec, var4)
        }
        beta = sim_beta_fix_variant(G = Xmat, causal_index = var_vec, is_h2g_total = FALSE)
        B[, i] = beta
    }

    # ======= Simulate phenotypes =======
    phenotype = sim_multi_traits(
        G = Xmat,
        B = B,
        h2g = ${h2g},
        is_h2g_total = ${"TRUE" if total_h2g else "FALSE"}
    )
    phenotype = phenotype$P

    # ======= Assemble the final dataset =======
    X = Xmat
    Y = phenotype

    variant = list()
    for (i in 1:ncol(B)) {
        variant[[i]] = which(B[, i] != 0)
    }

    data = list()
    data[["X"]] = X
    data[["Y"]] = Y
    data[["variant"]] = variant

    # ======= Save the final dataset =======
    saveRDS(data, ${_output:r})

Complex simulation - bash submission#

work_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_55_complex"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/5_Simulation_secondary.ipynb JOB \
        --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
        --mem 30G --h2g 0.05  --independent \
        --cwd /home/hs3393/cb_Mar/simulation_data/
EOF

base_sh="base_script"
output_script="${job}.sh"
cat ${base_sh}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
work_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_3322_complex"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/5_Simulation_secondary.ipynb JOB \
        --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
        --mem 30G --h2g 0.05 --independent \
        --cwd /home/hs3393/cb_Mar/simulation_data/
EOF

base_sh="base_script"
output_script="${job}.sh"
cat ${base_sh}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
work_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_551rand_complex"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/5_Simulation_secondary.ipynb JOB \
        --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
        --mem 30G --h2g 0.05  --independent \
        --cwd /home/hs3393/cb_Mar/simulation_data/
EOF

base_sh="base_script"
output_script="${job}.sh"
cat ${base_sh}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
work_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_552rand_complex"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/5_Simulation_secondary.ipynb JOB \
        --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
        --mem 30G --h2g 0.05 --independent \
        --cwd /home/hs3393/cb_Mar/simulation_data/
EOF

base_sh="base_script"
output_script="${job}.sh"
cat ${base_sh}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}

Run ColocBoost#

data_dir="/home/hs3393/cb_Mar/simulation_data//simulation_3322_complex/"
job="simulation_3322_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/2_Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

data_dir="/home/hs3393/cb_Mar/simulation_data//simulation_55_complex/"
job="simulation_55_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/2_Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

data_dir="/home/hs3393/cb_Mar/simulation_data//simulation_551rand_complex/"
job="simulation_551rand_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/2_Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

data_dir="/home/hs3393/cb_Mar/simulation_data/simulation_552rand_complex/"
job="simulation_552rand_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/2_Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

Run HyPrColoc#

data_dir="/home/hs3393/cb_Mar/simulation_data//simulation_3322_complex/"
job="simulation_3322_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

data_dir="/home/hs3393/cb_Mar/simulation_data//simulation_55_complex/"
job="simulation_55_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

data_dir="/home/hs3393/cb_Mar/simulation_data//simulation_551rand_complex/"
job="simulation_551rand_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

data_dir="/home/hs3393/cb_Mar/simulation_data/simulation_552rand_complex/"
job="simulation_552rand_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}