Secondary simulations#
This notebook includes the data simulation for 50 traits, as well as the complex configurations suggested in the HyprColoc paper.
Goal#
Because we don’t have 50 traits to estimate and reflect the true configurations, we used a different approach: for each causal variant, we randomly select 10-25 traits to colocalize on that variant.
We also simulated complex 10-trait configurations, described in and extended from the HyprColoc paper.
Input#
genofile
: PLINK file of real genotypes, /mnt/vast/hpc/csg/FunGen_xQTL/ROSMAP/Genotype/plink_by_gene/extended_cis_before_winsorize_plink_files/*.bim
The other parameters are documented in the simxQTL repo: https://github.com/StatFunGen/simxQTL.
Output#
An RDS file containing a list with the genotype matrix X (dimension: m * n; m: number of samples, n: number of SNPs), the phenotype (trait) matrix Y (dimension: m * a; m: number of samples, a: number of simulated traits), and `variant`, the indices of the causal variants of each trait.
Example output:
result = readRDS("/home/hs3393/cb_Mar/simulation_data/simulation_551rand_complex/sample_39_h2g_0.05_10trait_cluster_5+5+1rand.rds")
result$variant
-
- 3620
- 5240
- 3620
- 3620
-
- 3620
- 5240
- 3620
-
- 250
- 5240
- 250
- 250
- 250
-
- 250
- 5240
In this region, we simulated 3 causal variants (250, 3620, 5240). The causal variants are distributed across the 10 traits.
Simulation code#
[simulation_50trait]
# Genotype file(s) in PLINK format; each input file is processed as its own substep.
parameter: genofile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# When set, causal variants are resampled until no pair has |correlation| > 0.3
parameter: independent = False
# Number of traits. NOTE(review): the R script and the output file name hard-code 50; this parameter is not read.
parameter: n_trait = 50
# Heritability passed to sim_multi_traits for every trait
parameter: h2g = 0.05
# Number of causal variants sampled per region
parameter: ncausal = 5
parameter: share_pattern = "all"
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_h2g_{h2g}_50_trait_ncausal_{ncausal}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
    # Project helpers (filter_X, sim_beta_fix_variant, sim_multi_traits, ...)
    source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
    source("~/cloud_colocalization/simulation_code/misc.R")

    # Read the PLINK fileset of this region
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")

    # Keep only variants within 1.5 Mb of the gene's TSS
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
    if (length(TSS_pos) == 0 || is.na(TSS_pos)) {
      stop("No TSS found for gene ", gene_name, " (input: ", simu_file, ")", call. = FALSE)
    }
    # Both bounds must hold: with `|` every position satisfies at least one of
    # them and nothing is filtered.
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[, keep_index, drop = FALSE]

    # filter out columns with missing rate > 0.1
    imiss = 0.1
    # filter out columns with MAF < 0.05
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)

    ncausal = ${ncausal}
    indep = ${"TRUE" if independent else "FALSE"}

    # Sample the causal variants. In independent mode (and with more than one
    # causal variant) keep resampling until no pair has |correlation| > 0.3.
    # The selection is NOT resampled afterwards, so the LD constraint holds.
    vars = sample.int(ncol(Xmat), size = ncausal)
    if (indep && ncausal > 1) {
      repeat {
        cor_mat = cor(Xmat[, vars])
        # the diagonal always counts once, so a column sum > 1 flags a variant
        # correlated with at least one other selected variant
        LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
        if (length(LD_vars) == 0) break
        vars = sample.int(ncol(Xmat), size = ncausal)
      }
    }

    # Each causal variant colocalizes on a random subset of 10-25 of the 50 traits
    ntrait = 50
    B = matrix(0, nrow = ncol(Xmat), ncol = ntrait)
    trait_number_vector = sample(x = c(10:25), size = ncausal, replace = TRUE)
    trait_list = list()
    for (i in seq_len(ncausal)) {
      trait_list[[i]] = sample(x = 1:ntrait, size = trait_number_vector[i])
    }

    phenotype = list()
    for (i in seq_len(ntrait)) {
      # causal variants assigned to trait i
      index = which(unlist(lapply(trait_list, function(x) i %in% x)))
      if (length(index) > 0) {
        B[, i] = sim_beta_fix_variant(G = Xmat, causal_index = vars[index], is_h2g_total = FALSE)
      }
      # Traits without any causal variant keep an all-zero effect column.
      # Use the workflow's h2g parameter so the data match the output file name.
      pheno_single = sim_multi_traits(G = Xmat, B = B[, i, drop = FALSE], h2g = ${h2g}, is_h2g_total = FALSE)
      phenotype[[i]] = pheno_single$P
    }

    # Indices of the non-zero effects of every trait
    variant = list()
    for (i in seq_len(ncol(B))) {
      variant[[i]] = which(B[, i] != 0)
    }

    Y = bind_cols(phenotype)
    colnames(Y) = paste0("Trait", c(1:ntrait))
    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = Y
    data[["variant"]] = variant
    saveRDS(data, ${_output:r})
Phenotype Simulation#
# Submit the 50-trait phenotype simulation: one SLURM job per number of causal variants (1-5).
work_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_50trait"
mkdir -p ${work_dir} ${work_dir}/code ${work_dir}/log
cd ${work_dir}/code
# Job template; WORK_DIR, JOB and CAUSAL are placeholders filled in by sed below
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err
source ~/mamba_activate.sh
sos run /home/hs3393/cb_Mar/simulation_code/5.Simulation_secondary.ipynb JOB \
--genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
--ncausal CAUSAL --mem 30G --h2g 0.05 --independent \
--cwd /home/hs3393/cb_Mar/simulation_data/
EOF
template="base_script"
for ncausal in 1 2 3 4 5; do
    submit_file="${job}_causal_${ncausal}.sh"
    # Substitutions are applied in this order: WORK_DIR, JOB, CAUSAL
    sed -e "s|WORK_DIR|${work_dir}|g" -e "s|JOB|${job}|g" -e "s|CAUSAL|${ncausal}|g" ${template} > ${submit_file}
    sbatch ${submit_file}
done
50 trait: Run ColocBoost#
# Run ColocBoost on the 50-trait simulations: one SLURM job per number of causal variants (1-5).
data_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_50trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/"
for sub in code log result; do
    mkdir -p ${work_dir}/${job}/${sub}
done
cd ${work_dir}/${job}/code
# Job template; WORK_DIR, NCAUSAL, JOB and DATA_DIR are placeholders filled in by sed below
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR/JOB
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
--simufile $(find -type f -name '*_ncausal_NCAUSAL*.rds') \
--mem 40G --trait 50 \
--cwd WORK_DIR/JOB/result
EOF
template="base_script"
for ncausal in 1 2 3 4 5; do
    submit_file="ncausal_${ncausal}.sh"
    # Substitutions are applied in this order: WORK_DIR, NCAUSAL, JOB, DATA_DIR
    sed -e "s|WORK_DIR|${work_dir}|g" -e "s|NCAUSAL|${ncausal}|g" -e "s|JOB|${job}|g" -e "s|DATA_DIR|${data_dir}|g" ${template} > ${submit_file}
    sbatch ${submit_file}
done
50 trait: Run Hyprcoloc#
# Run HyprColoc on the 50-trait simulations: one SLURM job per number of causal variants (1-5).
data_dir="/home/hs3393/cb_Mar/simulation_data/simulation_50trait/"
job="simulation_50trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"
for sub in code log result; do
    mkdir -p ${work_dir}/${job}/${sub}
done
cd ${work_dir}/${job}/code
# Job template; WORK_DIR, NCAUSAL, JOB and DATA_DIR are placeholders filled in by sed below
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=20000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3.Other_Methods.ipynb hyprcoloc_set \
--simufile $(find -type f -name '*_ncausal_NCAUSAL*.rds') \
--mem 20G --trait 50 \
--cwd WORK_DIR/JOB/result
EOF
template="base_script"
for ncausal in 1 2 3 4 5; do
    submit_file="ncausal_${ncausal}.sh"
    # Substitutions are applied in this order: WORK_DIR, NCAUSAL, JOB, DATA_DIR
    sed -e "s|WORK_DIR|${work_dir}|g" -e "s|NCAUSAL|${ncausal}|g" -e "s|JOB|${job}|g" -e "s|DATA_DIR|${data_dir}|g" ${template} > ${submit_file}
    sbatch ${submit_file}
done
50 trait: Colocboost summary#
# Summarize the ColocBoost results of the 50-trait simulation in a single SLURM job.
data_dir="/home/hs3393/cb_Mar/simulation_result/simulation_50trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary
# Job template; DATA_DIR is a placeholder filled in by sed below
cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 8:00:00
#SBATCH --mem=30000
#SBATCH -J sum
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err
source ~/mamba_activate.sh
sos run /home/hs3393/cb_Mar/simulation_code/4.Result_Summary.ipynb coloc_summary \
--folder DATA_DIR/result \
--cwd DATA_DIR/summary
EOF
submit_file="summary.sh"
sed "s|DATA_DIR|${data_dir}|g" summary_script > ${submit_file}
sbatch ${submit_file}
50 trait, Hyprcoloc summary#
# Summarize the HyprColoc results of the 50-trait simulation in a single SLURM job.
data_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/simulation_50trait/"
mkdir -p ${data_dir}/summary
cd ${data_dir}/summary
# Job template; DATA_DIR is a placeholder filled in by sed below
cat << 'EOF' > summary_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 8:00:00
#SBATCH --mem=30000
#SBATCH -J sum
#SBATCH -o DATA_DIR/log/summary."%j".out
#SBATCH -e DATA_DIR/log/summary."%j".err
source ~/mamba_activate.sh
sos run /home/hs3393/cb_Mar/simulation_code/4.Result_Summary.ipynb coloc_summary \
--folder DATA_DIR/result \
--cwd DATA_DIR/summary
EOF
submit_file="summary.sh"
sed "s|DATA_DIR|${data_dir}|g" summary_script > ${submit_file}
sbatch ${submit_file}
Complex simulation#
[simulation_55_complex]
# Genotype file(s) in PLINK format; each input file is processed as its own substep.
parameter: genofile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# Number of traits. NOTE(review): the R script fixes the layout to 5 + 5 traits; this parameter is not read.
parameter: n_trait = 10
# Heritability passed to sim_multi_traits
parameter: h2g = 0.05
# Forwarded to sim_multi_traits as is_h2g_total
parameter: total_h2g = False
parameter: share_pattern = "all"
# When set, the causal variants of the clusters must have pairwise |correlation| <= 0.3
parameter: independent = False
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_h2g_{h2g}_10trait_cluster_5+5.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
    # Project helpers (filter_X, sim_beta, sim_multi_traits, ...)
    source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
    source("~/cloud_colocalization/simulation_code/misc.R")

    # Read the PLINK fileset of this region
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")

    # Keep only variants within 1.5 Mb of the gene's TSS
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
    if (length(TSS_pos) == 0 || is.na(TSS_pos)) {
      stop("No TSS found for gene ", gene_name, " (input: ", simu_file, ")", call. = FALSE)
    }
    # Both bounds must hold: with `|` every position satisfies at least one of
    # them and nothing is filtered.
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[, keep_index, drop = FALSE]

    # filter out columns with missing rate > 0.1
    imiss = 0.1
    # filter out columns with MAF < 0.05
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)

    indep = ${"TRUE" if independent else "FALSE"}
    # A threshold of 1 can never be exceeded by |correlation|, which switches
    # the LD check off when independence is not requested.
    LD_thresh = if (indep) 0.3 else 1

    # Two clusters of 5 traits, each cluster simulated with one causal variant
    cluster_sizes = c(5, 5)
    repeat {
      B = do.call(cbind, lapply(cluster_sizes, function(k) {
        sim_beta(G = Xmat, ncausal = 1, ntrait = k,
                 is_h2g_total = FALSE, shared_pattern = "all")
      }))
      # Indices of the non-zero effects of every trait
      variant = lapply(seq_len(ncol(B)), function(j) which(B[, j] != 0))
      var_mat = unique(unlist(variant))
      # Two clusters drawing the same variant would merge into one cluster
      # (and cor() on a single column used to stop the script): resample.
      if (length(var_mat) < length(cluster_sizes)) next
      cor_mat = cor(Xmat[, var_mat, drop = FALSE])
      # the diagonal always counts once, so a column sum > 1 flags a variant
      # in LD with another selected variant
      LD_vars = which(colSums(abs(cor_mat) > LD_thresh) > 1)
      if (length(LD_vars) == 0) break
    }

    phenotype = sim_multi_traits(G = Xmat, B = B, h2g = ${h2g}, is_h2g_total = ${"TRUE" if total_h2g else "FALSE"})
    Y = phenotype$P

    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = Y
    data[["variant"]] = variant
    saveRDS(data, ${_output:r})
[simulation_3322_complex]
# Genotype file(s) in PLINK format; each input file is processed as its own substep.
parameter: genofile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# Number of traits. NOTE(review): the R script fixes the layout to 3 + 3 + 2 + 2 traits; this parameter is not read.
parameter: n_trait = 10
# Heritability passed to sim_multi_traits
parameter: h2g = 0.05
# Forwarded to sim_multi_traits as is_h2g_total
parameter: total_h2g = False
parameter: share_pattern = "all"
# When set, the causal variants of the clusters must have pairwise |correlation| <= 0.3
parameter: independent = False
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}//sample_{_index}_h2g_{h2g}_10trait_cluster_3+3+2+2.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
    # Project helpers (filter_X, sim_beta, sim_multi_traits, ...)
    source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
    source("~/cloud_colocalization/simulation_code/misc.R")

    # Read the PLINK fileset of this region
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")

    # Keep only variants within 1.5 Mb of the gene's TSS
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
    if (length(TSS_pos) == 0 || is.na(TSS_pos)) {
      stop("No TSS found for gene ", gene_name, " (input: ", simu_file, ")", call. = FALSE)
    }
    # Both bounds must hold: with `|` every position satisfies at least one of
    # them and nothing is filtered.
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[, keep_index, drop = FALSE]

    # filter out columns with missing rate > 0.1
    imiss = 0.1
    # filter out columns with MAF < 0.05
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)

    indep = ${"TRUE" if independent else "FALSE"}
    # A threshold of 1 can never be exceeded by |correlation|, which switches
    # the LD check off when independence is not requested.
    LD_thresh = if (indep) 0.3 else 1

    # Four clusters of 3, 3, 2 and 2 traits, each simulated with one causal variant
    cluster_sizes = c(3, 3, 2, 2)
    repeat {
      B = do.call(cbind, lapply(cluster_sizes, function(k) {
        sim_beta(G = Xmat, ncausal = 1, ntrait = k,
                 is_h2g_total = FALSE, shared_pattern = "all")
      }))
      # Indices of the non-zero effects of every trait
      variant = lapply(seq_len(ncol(B)), function(j) which(B[, j] != 0))
      var_mat = unique(unlist(variant))
      # Clusters drawing the same variant would merge into one cluster
      # (and cor() on a single column used to stop the script): resample.
      if (length(var_mat) < length(cluster_sizes)) next
      cor_mat = cor(Xmat[, var_mat, drop = FALSE])
      # the diagonal always counts once, so a column sum > 1 flags a variant
      # in LD with another selected variant
      LD_vars = which(colSums(abs(cor_mat) > LD_thresh) > 1)
      if (length(LD_vars) == 0) break
    }

    phenotype = sim_multi_traits(G = Xmat, B = B, h2g = ${h2g}, is_h2g_total = ${"TRUE" if total_h2g else "FALSE"})
    Y = phenotype$P

    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = Y
    data[["variant"]] = variant
    saveRDS(data, ${_output:r})
[simulation_551rand_complex]
# Genotype file(s) in PLINK format; each input file is processed as its own substep.
parameter: genofile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# Number of traits. NOTE(review): the R script fixes the layout to 10 traits; this parameter is not read.
parameter: n_trait = 10
# Heritability passed to sim_multi_traits
parameter: h2g = 0.05
# Forwarded to sim_multi_traits as is_h2g_total
parameter: total_h2g = False
parameter: share_pattern = "all"
# When set, the three causal variants must have pairwise |correlation| <= 0.3
parameter: independent = False
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}//sample_{_index}_h2g_{h2g}_10trait_cluster_5+5+1rand.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
    # Project helpers (filter_X, sim_beta_fix_variant, sim_multi_traits, ...)
    source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
    source("~/cloud_colocalization/simulation_code/misc.R")

    # Read the PLINK fileset of this region
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")

    # Keep only variants within 1.5 Mb of the gene's TSS
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
    if (length(TSS_pos) == 0 || is.na(TSS_pos)) {
      stop("No TSS found for gene ", gene_name, " (input: ", simu_file, ")", call. = FALSE)
    }
    # Both bounds must hold: with `|` every position satisfies at least one of
    # them and nothing is filtered.
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[, keep_index, drop = FALSE]

    # filter out columns with missing rate > 0.1
    imiss = 0.1
    # filter out columns with MAF < 0.05
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)

    # `indep` and `ncausal` were previously used without being defined in this
    # script. The design below needs exactly three causal variants:
    # vars[1] for traits 1-5, vars[2] for traits 6-10, vars[3] for 4 random traits.
    indep = ${"TRUE" if independent else "FALSE"}
    ncausal = 3
    vars = sample.int(ncol(Xmat), size = ncausal)
    if (indep) {
      # Resample until no pair of selected variants has |correlation| > 0.3
      repeat {
        cor_mat = cor(Xmat[, vars])
        # the diagonal always counts once, so a column sum > 1 flags a variant
        # in LD with another selected variant
        LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
        if (length(LD_vars) == 0) break
        vars = sample.int(ncol(Xmat), size = ncausal)
      }
    }

    B = matrix(0, nrow = ncol(Xmat), ncol = 10)
    var_vec = list()
    # Traits that additionally carry the third causal variant
    rand_var_trait = sample(x = 1:10, size = 4)
    for (i in 1:10) {
      cluster_var = if (i <= 5) vars[1] else vars[2]
      var_vec[[i]] = if (i %in% rand_var_trait) c(cluster_var, vars[3]) else cluster_var
      B[, i] = sim_beta_fix_variant(G = Xmat, causal_index = var_vec[[i]], is_h2g_total = FALSE)
    }

    # Indices of the non-zero effects of every trait
    variant = list()
    for (i in seq_len(ncol(B))) {
      variant[[i]] = which(B[, i] != 0)
    }

    phenotype = sim_multi_traits(G = Xmat, B = B, h2g = ${h2g}, is_h2g_total = ${"TRUE" if total_h2g else "FALSE"})
    Y = phenotype$P

    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = Y
    data[["variant"]] = variant
    saveRDS(data, ${_output:r})
[simulation_552rand_complex]
# Genotype file(s) in PLINK format; each input file is processed as its own substep.
parameter: genofile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# Number of traits. NOTE(review): the R script fixes the layout to 10 traits; this parameter is not read.
parameter: n_trait = 10
# Heritability passed to sim_multi_traits
parameter: h2g = 0.05
# Forwarded to sim_multi_traits as is_h2g_total
parameter: total_h2g = False
parameter: share_pattern = "all"
# When set, the four causal variants must have pairwise |correlation| <= 0.3
parameter: independent = False
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_h2g_{h2g}_10trait_cluster_5+5+2rand.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
    # Project helpers (filter_X, sim_beta_fix_variant, sim_multi_traits, ...)
    source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
    source("~/cloud_colocalization/simulation_code/misc.R")

    # Read the PLINK fileset of this region
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")

    # Keep only variants within 1.5 Mb of the gene's TSS
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
    if (length(TSS_pos) == 0 || is.na(TSS_pos)) {
      stop("No TSS found for gene ", gene_name, " (input: ", simu_file, ")", call. = FALSE)
    }
    # Both bounds must hold: with `|` every position satisfies at least one of
    # them and nothing is filtered.
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 & geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[, keep_index, drop = FALSE]

    # filter out columns with missing rate > 0.1
    imiss = 0.1
    # filter out columns with MAF < 0.05
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)

    # `indep` and `ncausal` were previously used without being defined in this
    # script. The design below needs exactly four causal variants:
    # vars[1] for traits 1-5, vars[2] for traits 6-10, and vars[3] / vars[4]
    # each for 4 random traits.
    indep = ${"TRUE" if independent else "FALSE"}
    ncausal = 4
    vars = sample.int(ncol(Xmat), size = ncausal)
    if (indep) {
      # Resample until no pair of selected variants has |correlation| > 0.3
      repeat {
        cor_mat = cor(Xmat[, vars])
        # the diagonal always counts once, so a column sum > 1 flags a variant
        # in LD with another selected variant
        LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
        if (length(LD_vars) == 0) break
        vars = sample.int(ncol(Xmat), size = ncausal)
      }
    }

    B = matrix(0, nrow = ncol(Xmat), ncol = 10)
    var_vec = list()
    # Traits that additionally carry the third / fourth causal variant
    rand_var_trait1 = sample(x = 1:10, size = 4)
    rand_var_trait2 = sample(x = 1:10, size = 4)
    for (i in 1:10) {
      var_vec[[i]] = if (i <= 5) vars[1] else vars[2]
      if (i %in% rand_var_trait1) {
        var_vec[[i]] = c(var_vec[[i]], vars[3])
      }
      if (i %in% rand_var_trait2) {
        var_vec[[i]] = c(var_vec[[i]], vars[4])
      }
      B[, i] = sim_beta_fix_variant(G = Xmat, causal_index = var_vec[[i]], is_h2g_total = FALSE)
    }

    # Indices of the non-zero effects of every trait
    variant = list()
    for (i in seq_len(ncol(B))) {
      variant[[i]] = which(B[, i] != 0)
    }

    phenotype = sim_multi_traits(G = Xmat, B = B, h2g = ${h2g}, is_h2g_total = ${"TRUE" if total_h2g else "FALSE"})
    Y = phenotype$P

    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = Y
    data[["variant"]] = variant
    saveRDS(data, ${_output:r})
Complex simulation - bash submission#
# Submit the four complex-configuration simulations, one SLURM job per workflow step.
# The four original cells differed only in the job name, so they are driven
# by a single loop over the step names.
work_dir="/home/hs3393/cb_Mar/simulation_data/"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code
# Job template; WORK_DIR and JOB are placeholders filled in by sed below
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err
source ~/mamba_activate.sh
sos run /home/hs3393/cb_Mar/simulation_code/5.Simulation_secondary.ipynb JOB \
--genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
--mem 30G --h2g 0.05 --independent \
--cwd /home/hs3393/cb_Mar/simulation_data/
EOF
base_sh="base_script"
for job in simulation_55_complex simulation_3322_complex simulation_551rand_complex simulation_552rand_complex; do
    output_script="${job}.sh"
    sed -e "s|WORK_DIR|${work_dir}|g" -e "s|JOB|${job}|g" ${base_sh} > ${output_script}
    sbatch ${output_script}
done
Run Colocboost#
# Run ColocBoost on each complex-configuration simulation, one SLURM job per simulation.
# The four original cells differed only in the job name (and the data
# directory derived from it), so they are driven by a single loop.
data_root="/home/hs3393/cb_Mar/simulation_data/"
work_dir="/home/hs3393/cb_Mar/simulation_result/complex_simulation/"
for job in simulation_3322_complex simulation_55_complex simulation_551rand_complex simulation_552rand_complex; do
    # Directory holding the simulated *.rds files of this job
    data_dir="${data_root}${job}/"
    mkdir -p ${work_dir}/${job}/code
    mkdir -p ${work_dir}/${job}/log
    mkdir -p ${work_dir}/${job}/result
    cd ${work_dir}/${job}/code
    # Job template; WORK_DIR, JOB and DATA_DIR are placeholders filled in by sed below.
    # The here-document body and its terminator must start at column 0.
    cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \
--simufile $(find -type f -name '*.rds') \
--mem 40G --trait 10 \
--cwd WORK_DIR/JOB/result
EOF
    output_script="job_${job}.sh"
    # Substitutions are applied in this order: WORK_DIR, JOB, DATA_DIR
    sed -e "s|WORK_DIR|${work_dir}|g" -e "s|JOB|${job}|g" -e "s|DATA_DIR|${data_dir}|g" base_script > ${output_script}
    sbatch ${output_script}
done
Run Hyprcoloc#
# Run HyprColoc on each complex-configuration simulation, one SLURM job per simulation.
# The four original cells differed only in the job name (and the data
# directory derived from it), so they are driven by a single loop.
data_root="/home/hs3393/cb_Mar/simulation_data/"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/complex_simulation/"
for job in simulation_3322_complex simulation_55_complex simulation_551rand_complex simulation_552rand_complex; do
    # Directory holding the simulated *.rds files of this job
    data_dir="${data_root}${job}/"
    mkdir -p ${work_dir}/${job}/code
    mkdir -p ${work_dir}/${job}/log
    mkdir -p ${work_dir}/${job}/result
    cd ${work_dir}/${job}/code
    # Job template; WORK_DIR, JOB and DATA_DIR are placeholders filled in by sed below.
    # The here-document body and its terminator must start at column 0.
    cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 100:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3.Other_Methods.ipynb hyprcoloc_set \
--simufile $(find -type f -name '*.rds') \
--mem 40G --trait 10 \
--cwd WORK_DIR/JOB/result
EOF
    output_script="job_${job}.sh"
    # Substitutions are applied in this order: WORK_DIR, JOB, DATA_DIR
    sed -e "s|WORK_DIR|${work_dir}|g" -e "s|JOB|${job}|g" -e "s|DATA_DIR|${data_dir}|g" base_script > ${output_script}
    sbatch ${output_script}
done