Secondary simulations#

This notebook includes data simulation for 50 traits, as well as the complex configurations suggested in the HyprColoc paper.

Goal#

Because we do not have 50 real traits from which to estimate a true configuration, we take a different approach: for each causal variant, we randomly select 10-25 traits that colocalize at that variant.
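
As a minimal, standalone sketch of this assignment scheme (base R only; the trait count, number of causal variants, and seed here are illustrative, not the workflow defaults):

set.seed(1)
n_trait = 50                       # total number of simulated traits
ncausal = 5                        # causal variants in the region
# for each causal variant, decide how many traits it affects (10-25), then pick which ones
n_affected = sample(10:25, size = ncausal, replace = TRUE)
trait_list = lapply(n_affected, function(k) sort(sample(1:n_trait, size = k)))
trait_list[[1]]                    # traits colocalizing at the first causal variant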

We also simulated 10-trait complex configuration cases, described in and extended from the HyprColoc paper.

Input#

genofile: PLINK files of real genotypes, /mnt/vast/hpc/csg/FunGen_xQTL/ROSMAP/Genotype/plink_by_gene/extended_cis_before_winsorize_plink_files/*.bim

The other parameters can be found in the simxQTL repo: https://github.com/StatFunGen/simxQTL.

Output#

An RDS file containing the genotype matrix X (dimension m * n; m: number of samples, n: number of SNPs), the phenotype (trait) matrix Y (dimension m * a; m: number of samples, a: number of simulated traits), and the list of causal variants per trait.

Example output:

result = readRDS("/home/hs3393/cb_Mar/simulation_data/simulation_551rand_complex/sample_39_h2g_0.05_10trait_cluster_5+5+1rand.rds")
result$variant
result$variant is a list with one element per trait, giving the index (or indices) of that trait's causal variant(s). In this example, six traits carry a single causal variant (3620 or 250) and four traits carry two (3620 and 5240, or 250 and 5240).

In this region, we simulated three causal variants (indices 250, 3620, and 5240), distributed across the 10 traits according to the 5+5+1rand configuration.
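
To inspect how the causal variants are shared across traits, the saved list can be summarized directly; a short sketch assuming result was loaded with readRDS as above:

# how many traits carry each causal variant
table(unlist(result$variant))
# which traits carry each causal variant
variant_ids = sort(unique(unlist(result$variant)))
setNames(lapply(variant_ids, function(v)
    which(sapply(result$variant, function(x) v %in% x))), variant_ids)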

Simulation code#

[simulation_50trait]
parameter: genofile = paths
# genofile: genotype files in PLINK format, from which we read the genotype matrix; these files are separated by TADs
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: independent = False
# total number of simulated traits (phenotypes)
parameter: n_trait = 50
parameter: h2g = 0.05
parameter: ncausal = 5
parameter: share_pattern = "all"
# optional container image for the R environment
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_h2g_{h2g}_50_trait_ncausal_{ncausal}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
    # source helper functions to read the genotype matrix and impute missing data
    source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
    source("~/cloud_colocalization/simulation_code/misc.R")
    # read the plink file
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")
    # keep variants within 1.5 Mb of the gene TSS
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[,keep_index]
    # filter out columns with missing rate > 0.1
    imiss = 0.1
    # filter out columns with MAF < 0.05
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)
    ncausal = ${ncausal}
    indep = ${"TRUE" if independent else "FALSE"}
  
    if (indep) {
        LD_vars = 1  # Initialize LD_vars

        if (ncausal == 1) {
            # If only one causal variant, just sample it
            vars = sample(1:ncol(Xmat), size = ncausal)
        } else {
            # Repeat sampling until selected variables are quasi independent
            while (length(LD_vars) != 0) {
                vars = sample(1:ncol(Xmat), size = ncausal)  
                cor_mat = cor(Xmat[, vars]) 
                LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
            }
        }
    } else {
        vars = sample(1:ncol(Xmat), size = ncausal)
    }

    # `vars` selected above (with or without the LD check) are used as the causal variants
    B = matrix(0, nrow =  ncol(Xmat), ncol = 50)
  
    # for each causal variant, decide how many traits (10-25) it colocalizes at
    trait_number_vector = sample(x = c(10:25), size = ncausal, replace = TRUE)

    # then randomly assign that many traits to each causal variant
    trait_list = list()
    for(i in 1:ncausal){
      trait_list[[i]] = sample(x = 1:50, size = trait_number_vector[i])
    }
    phenotype = list()
    for(i in 1:50){
        # which causal variants (if any) were assigned to trait i
        index = which(unlist(lapply(trait_list, function(x) i %in% x)))
        if(length(index) > 0){
            beta = sim_beta_fix_variant(G = Xmat, causal_index = vars[index], is_h2g_total = FALSE)
            B[, i] = beta
        }
        # traits with no causal variant keep an all-zero effect column (pure noise)
        pheno_single = sim_multi_traits(G = Xmat, B = B[,i, drop = FALSE], h2g = ${h2g}, is_h2g_total = FALSE)
        phenotype[[i]] = pheno_single$P
    }
    variant = list()
    for(i in 1:ncol(B)){
        variant[[i]] = which(B[,i] != 0)
    }
    X = Xmat
    Y = bind_cols(phenotype)
    colnames(Y) = paste0("Trait", c(1:50))
    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = Y
    data[["variant"]] = variant
    saveRDS(data, ${_output:r})
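
When --independent is set, the workflow above re-draws the candidate causal variants until no pair is in strong LD. The same rejection-sampling idea in isolation, on a toy genotype matrix (base R; the |r| threshold of 0.3 mirrors the code above, but the data here are simulated for illustration):

set.seed(2)
# toy genotype matrix: 500 samples x 200 SNPs coded 0/1/2
Xmat = matrix(rbinom(500 * 200, size = 2, prob = 0.3), nrow = 500)
ncausal = 5
repeat {
    vars = sample(1:ncol(Xmat), size = ncausal)
    cor_mat = cor(Xmat[, vars])
    # accept the draw only if no selected variant has |r| > 0.3 with another
    if (all(abs(cor_mat[upper.tri(cor_mat)]) <= 0.3)) break
}
vars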

50 trait simulation - bash submission#

work_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_50trait"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/5.Simulation_secondary.ipynb JOB \
        --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
        --ncausal CAUSAL --mem 30G --h2g 0.05 --independent \
        --cwd /home/hs3393/cb_Mar/simulation_data/
EOF

for ncausal in 1 2 3 4 5; do
    base_sh="base_script"
    output_script="${job}_causal_${ncausal}.sh"
    cat ${base_sh}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g"|  sed "s|CAUSAL|${ncausal}|g"  > ${output_script}
    sbatch ${output_script}
done

50 trait: Run ColocBoost#

data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_50trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*_ncausal_NCAUSAL*.rds') \
    --mem 40G --trait 50 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3 4 5; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done

50 trait: Run Hyprcoloc#

data_dir="/home/hs3393/cb_Mar/simulation_data/simulation_50trait/"
job="simulation_50trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=20000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3.Other_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*_ncausal_NCAUSAL*.rds') \
    --mem 20G --trait 50 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3 4 5; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done

50 trait: Colocboost summary#

data_dir="/home/hs3393/cb_Mar/simulation_result/simulation_50trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary

cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 8:00:00
#SBATCH --mem=30000
#SBATCH -J sum
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/4.Result_Summary.ipynb coloc_summary \
    --folder DATA_DIR/result \
    --cwd DATA_DIR/summary
EOF


base_script="summary_script"
output_script="summary.sh"
cat ${base_script}|  sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

50 trait: Hyprcoloc summary#

data_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/simulation_50trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary

cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 8:00:00
#SBATCH --mem=30000
#SBATCH -J sum
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/4.Result_Summary.ipynb coloc_summary \
    --folder DATA_DIR/result \
    --cwd DATA_DIR/summary
EOF


base_script="summary_script"
output_script="summary.sh"
cat ${base_script}|  sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

Complex simulation#
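
The four configurations below extend the HyprColoc-style settings to 10 traits: two or four disjoint trait clusters, each driven by its own causal variant, optionally with one or two additional variants shared by a random subset of traits. As a compact reference sketch (cluster sizes taken from the workflow names and code below):

# traits per block causal variant in each configuration; each "rand" variant
# is additionally assigned to 4 randomly chosen traits (see the workflows below)
complex_configs = list(
    "5+5"       = c(5, 5),
    "3+3+2+2"   = c(3, 3, 2, 2),
    "5+5+1rand" = c(5, 5),   # plus 1 extra randomly shared variant
    "5+5+2rand" = c(5, 5)    # plus 2 extra randomly shared variants
)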

[simulation_55_complex]
parameter: genofile = paths
# genofile: genotype files in PLINK format, from which we read the genotype matrix; these files are separated by TADs
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# total number of simulated traits (phenotypes)
parameter: n_trait = 10
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: share_pattern = "all"
parameter: independent = False
# optional container image for the R environment
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_h2g_{h2g}_10trait_cluster_5+5.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
    # source helper functions to read the genotype matrix and impute missing data
    source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
    source("~/cloud_colocalization/simulation_code/misc.R")
    # read the plink file
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")
    # keep variants within 1.5 Mb of the gene TSS
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[,keep_index]
    # filter out columns with missing rate > 0.1
    imiss = 0.1
    # filter out columns with MAF < 0.05
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)
    indep = ${"TRUE" if independent else "FALSE"}
    # with --independent, require the two causal variants to have |r| <= 0.3; otherwise accept any draw
    if(indep){ LD_thresh = 0.3 } else{ LD_thresh = 1 }
    LD_vars = 1
    # resample effect matrices until the selected causal variants pass the LD check
    while(length(LD_vars) != 0){
      
        B1 = sim_beta(G = Xmat, ncausal = 1, ntrait = 5, 
                     is_h2g_total = FALSE, shared_pattern = "all")
        B2 = sim_beta(G = Xmat, ncausal = 1, ntrait = 5, 
                     is_h2g_total = FALSE, shared_pattern = "all")
        B = cbind(B1, B2)
        variant = list()
        for(i in 1:ncol(B)){
          variant[[i]] = which(B[,i] != 0)
        }

        var_mat = unique(unlist(variant))
        cor_mat = cor(Xmat[,var_mat]) 
        LD_vars = which(colSums(abs(cor_mat) > LD_thresh) > 1)
    }
    phenotype = sim_multi_traits(G = Xmat, B = B, h2g = ${h2g}, is_h2g_total = ${"TRUE" if total_h2g else "FALSE"})
    phenotype = phenotype$P
    X = Xmat
    Y = phenotype
    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = Y
    data[["variant"]] = variant
    saveRDS(data, ${_output:r})
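
The effect matrix B built above has a simple block structure: each causal variant gets a nonzero effect in the columns of its trait cluster. A toy base-R sketch of the 5+5 pattern (the SNP count and effect size are made up; the workflow itself draws effects with sim_beta):

p = 100                               # toy number of SNPs
B = matrix(0, nrow = p, ncol = 10)    # one column per trait
causal = sample(1:p, 2)               # two causal variants
B[causal[1], 1:5]  = 0.1              # variant 1 drives traits 1-5
B[causal[2], 6:10] = 0.1              # variant 2 drives traits 6-10
lapply(1:10, function(i) which(B[, i] != 0))   # recover the per-trait causal variants
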
[simulation_3322_complex]
parameter: genofile = paths
# genofile: genotype files in PLINK format, from which we read the genotype matrix; these files are separated by TADs
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# total number of simulated traits (phenotypes)
parameter: n_trait = 10
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: share_pattern = "all"
parameter: independent = False
# optional container image for the R environment
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_h2g_{h2g}_10trait_cluster_3+3+2+2.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
    # source helper functions to read the genotype matrix and impute missing data
    source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
    source("~/cloud_colocalization/simulation_code/misc.R")
    # read the plink file
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")
    # keep variants within 1.5 Mb of the gene TSS
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[,keep_index]
    # filter out columns with missing rate > 0.1
    imiss = 0.1
    # filter out columns with MAF < 0.05
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)
    indep = ${"TRUE" if independent else "FALSE"}
    # with --independent, require the four causal variants to have |r| <= 0.3; otherwise accept any draw
    if(indep){ LD_thresh = 0.3 } else{ LD_thresh = 1 }
    LD_vars = 1
    # resample effect matrices until the selected causal variants pass the LD check
    while(length(LD_vars) != 0){
        B1 = sim_beta(G = Xmat, ncausal = 1, ntrait = 3, 
                     is_h2g_total = FALSE, shared_pattern = "all")
        B2 = sim_beta(G = Xmat, ncausal = 1, ntrait = 3, 
                     is_h2g_total = FALSE, shared_pattern = "all")
        B3 = sim_beta(G = Xmat, ncausal = 1, ntrait = 2, 
                     is_h2g_total = FALSE, shared_pattern = "all")
        B4 = sim_beta(G = Xmat, ncausal = 1, ntrait = 2, 
                     is_h2g_total = FALSE, shared_pattern = "all")
        B = cbind(B1, B2, B3, B4)
        variant = list()
        for(i in 1:ncol(B)){
          variant[[i]] = which(B[,i] != 0)
        }

        var_mat = unique(unlist(variant))
        cor_mat = cor(Xmat[,var_mat]) 
        LD_vars = which(colSums(abs(cor_mat) > LD_thresh) > 1)
    }
    phenotype = sim_multi_traits(G = Xmat, B = B, h2g = ${h2g}, is_h2g_total = ${"TRUE" if total_h2g else "FALSE"})
    phenotype = phenotype$P
    X = Xmat
    Y = phenotype
    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = Y
    data[["variant"]] = variant
    saveRDS(data, ${_output:r})
[simulation_551rand_complex]
parameter: genofile = paths
# genofile: genotype files in PLINK format, from which we read the genotype matrix; these files are separated by TADs
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# total number of simulated traits (phenotypes)
parameter: n_trait = 10
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: share_pattern = "all"
parameter: independent = False
# optional container image for the R environment
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_h2g_{h2g}_10trait_cluster_5+5+1rand.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
    # source helper functions to read the genotype matrix and impute missing data
    source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
    source("~/cloud_colocalization/simulation_code/misc.R")
    # read the plink file
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")
    # keep variants within 1.5 Mb of the gene TSS
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[,keep_index]
    # filter out columns with missing rate > 0.1
    imiss = 0.1
    # filter out columns with MAF < 0.05
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)
  
    # this configuration uses 3 causal variants: one per trait cluster (traits 1-5, 6-10)
    # and one extra variant randomly shared by a subset of traits
    ncausal = 3
    indep = ${"TRUE" if independent else "FALSE"}
    if (indep) {
        LD_vars = 1  # initialize so the while loop runs at least once
        # repeat sampling until the selected variants are quasi-independent (|r| <= 0.3)
        while (length(LD_vars) != 0) {
            vars = sample(1:ncol(Xmat), size = ncausal)
            cor_mat = cor(Xmat[, vars])
            LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
        }
    } else {
        vars = sample(1:ncol(Xmat), size = ncausal)
    }
  
    B = matrix(0, nrow =  ncol(Xmat), ncol = 10)
    var_vec = list()
    # four randomly chosen traits additionally carry the third causal variant
    rand_var_trait = sample(x = 1:10, size = 4)
    for(i in 1:5){
        if(i %in% rand_var_trait){
            var_vec[[i]] = c(vars[1], vars[3])
        }else{
            var_vec[[i]] = c(vars[1])
        }
        beta = sim_beta_fix_variant(G = Xmat, causal_index = var_vec[[i]], is_h2g_total = FALSE)
        B[, i] = beta
    }
    for(i in 6:10){
        if(i %in% rand_var_trait){
            var_vec[[i]] = c(vars[2], vars[3])
        }else{
            var_vec[[i]] = c(vars[2])
        }
        beta = sim_beta_fix_variant(G = Xmat, causal_index = var_vec[[i]], is_h2g_total = FALSE)
        B[, i] = beta
    }

    variant = list()

    for(i in 1:ncol(B)){
        variant[[i]] = which(B[,i] != 0)
    }
    phenotype = sim_multi_traits(G = Xmat, B = B, h2g = ${h2g}, is_h2g_total = ${"TRUE" if total_h2g else "FALSE"})
    phenotype = phenotype$P
    X = Xmat
    Y = phenotype
    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = Y
    data[["variant"]] = variant
    saveRDS(data, ${_output:r})
  
[simulation_552rand_complex]
parameter: genofile = paths
# genofile: genotype files in PLINK format, from which we read the genotype matrix; these files are separated by TADs
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# total number of simulated traits (phenotypes)
parameter: n_trait = 10
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: share_pattern = "all"
parameter: independent = False
# optional container image for the R environment
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_h2g_{h2g}_10trait_cluster_5+5+2rand.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
    # source helper functions to read the genotype matrix and impute missing data
    source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
    source("~/cloud_colocalization/simulation_code/misc.R")
    # read the plink file
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")
    # keep variants within 1.5 Mb of the gene TSS
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[,keep_index]
    # filter out columns with missing rate > 0.1
    imiss = 0.1
    # filter out columns with MAF < 0.05
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)
  
    # this configuration uses 4 causal variants: one per trait cluster (traits 1-5, 6-10)
    # and two extra variants, each randomly shared by a subset of traits
    ncausal = 4
    indep = ${"TRUE" if independent else "FALSE"}
    if (indep) {
        LD_vars = 1  # initialize so the while loop runs at least once
        # repeat sampling until the selected variants are quasi-independent (|r| <= 0.3)
        while (length(LD_vars) != 0) {
            vars = sample(1:ncol(Xmat), size = ncausal)
            cor_mat = cor(Xmat[, vars])
            LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
        }
    } else {
        vars = sample(1:ncol(Xmat), size = ncausal)
    }
  
    B = matrix(0, nrow =  ncol(Xmat), ncol = 10)
    var_vec = list()
    # two sets of four randomly chosen traits additionally carry the third and fourth causal variants
    rand_var_trait1 = sample(x = 1:10, size = 4)
    rand_var_trait2 = sample(x = 1:10, size = 4)
    for(i in 1:5){
        var_vec[[i]] = c(vars[1])
        if(i %in% rand_var_trait1){
            var_vec[[i]] = c(var_vec[[i]], vars[3])

        }
        if(i %in% rand_var_trait2){
            var_vec[[i]] = c(var_vec[[i]], vars[4])
        }
            
        beta = sim_beta_fix_variant(G = Xmat, causal_index = var_vec[[i]], is_h2g_total = FALSE)
        B[, i] = beta
    }
    for(i in 6:10){
        var_vec[[i]] = c(vars[2])
        if(i %in% rand_var_trait1){
            var_vec[[i]] = c(var_vec[[i]], vars[3])

        }
        if(i %in% rand_var_trait2){
            var_vec[[i]] = c(var_vec[[i]], vars[4])
        }
            
        beta = sim_beta_fix_variant(G = Xmat, causal_index = var_vec[[i]], is_h2g_total = FALSE)
        B[, i] = beta
    }

    variant = list()

    for(i in 1:ncol(B)){
        variant[[i]] = which(B[,i] != 0)
    }
    phenotype = sim_multi_traits(G = Xmat, B = B, h2g = ${h2g}, is_h2g_total = ${"TRUE" if total_h2g else "FALSE"})
    phenotype = phenotype$P
    X = Xmat
    Y = phenotype
    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = Y
    data[["variant"]] = variant
    saveRDS(data, ${_output:r})
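
A quick sanity check that can be run on any dataset saved by these workflows: regress each trait on its true causal variants and compare the R² against the target h2g of 0.05 (with is_h2g_total = FALSE this should be roughly per causal variant). A sketch, assuming data is the list saved by saveRDS above:

Ymat = as.matrix(data$Y)
realized_h2g = sapply(seq_along(data$variant), function(i) {
    idx = data$variant[[i]]
    if (length(idx) == 0) return(NA_real_)   # traits simulated as pure noise
    summary(lm(Ymat[, i] ~ data$X[, idx, drop = FALSE]))$r.squared
})
round(realized_h2g, 3)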

Complex simulation - bash submission#

work_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_55_complex"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/5.Simulation_secondary.ipynb JOB \
        --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
        --mem 30G --h2g 0.05  --independent \
        --cwd /home/hs3393/cb_Mar/simulation_data/
EOF

base_sh="base_script"
output_script="${job}.sh"
cat ${base_sh}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}

work_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_3322_complex"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/5.Simulation_secondary.ipynb JOB \
        --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
        --mem 30G --h2g 0.05 --independent \
        --cwd /home/hs3393/cb_Mar/simulation_data/
EOF

base_sh="base_script"
output_script="${job}.sh"
cat ${base_sh}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}

work_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_551rand_complex"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/5.Simulation_secondary.ipynb JOB \
        --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
        --mem 30G --h2g 0.05  --independent \
        --cwd /home/hs3393/cb_Mar/simulation_data/
EOF

base_sh="base_script"
output_script="${job}.sh"
cat ${base_sh}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}

work_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_552rand_complex"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err

source ~/mamba_activate.sh

sos run /home/hs3393/cb_Mar/simulation_code/5.Simulation_secondary.ipynb JOB \
        --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
        --mem 30G --h2g 0.05 --independent \
        --cwd /home/hs3393/cb_Mar/simulation_data/
EOF

base_sh="base_script"
output_script="${job}.sh"
cat ${base_sh}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}

Run Colocboost#

data_dir="/home/hs3393/cb_Mar/simulation_data/simulation_3322_complex/"
job="simulation_3322_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

data_dir="/home/hs3393/cb_Mar/simulation_data/simulation_55_complex/"
job="simulation_55_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

data_dir="/home/hs3393/cb_Mar/simulation_data/simulation_551rand_complex/"
job="simulation_551rand_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

data_dir="/home/hs3393/cb_Mar/simulation_data/simulation_552rand_complex/"
job="simulation_552rand_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

Run Hyprcoloc#

data_dir="/home/hs3393/cb_Mar/simulation_data/simulation_3322_complex/"
job="simulation_3322_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3.Other_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

data_dir="/home/hs3393/cb_Mar/simulation_data/simulation_55_complex/"
job="simulation_55_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3.Other_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

data_dir="/home/hs3393/cb_Mar/simulation_data/simulation_551rand_complex/"
job="simulation_551rand_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3.Other_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}

data_dir="/home/hs3393/cb_Mar/simulation_data/simulation_552rand_complex/"
job="simulation_552rand_complex"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/complex_simulation/"

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3.Other_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*.rds') \
    --mem 40G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
output_script="job_${job}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
sbatch ${output_script}