Correlated Phenotypes Simulation#
Goal#
Here we used estimated correlation to simulate phenotype (traits) with residual correlation (see details in Supplementary Note S.6.2 of ColocBoost paper).
\(\mathbf{Y}_1 = \mathbf{X} \boldsymbol{\beta}_1 + \boldsymbol{\varepsilon} \quad \text{with} \ \boldsymbol{\varepsilon} \sim \mathcal{N}(\mathbf{0}, \sigma_y^2 \boldsymbol{\Sigma})\)
Input and output format are the same as other phenotype simulation files. \(\Sigma\) is estimated from MASH using AD-xQTL traits.
Phenotype simulation code#
[real_simulation_2trait_corr]
parameter: genofile = paths
# pheno_file: give genotype file (in plink),we can read the gentype matrix. These files are separated by TADs.
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# specify the number of causal variants
parameter: n_trait = 2
parameter: n_causal = 1
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: share_pattern = "all"
parameter: independent = False
# specify the number of traits (phenotypes)
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_real_simulation_{n_causal}_ncausal_{n_trait}_trait.rds'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
# Here we use estimated correlation to simulate traits with residual correlation
# --- Load necessary libraries ---
library("MASS")
library("plink2R")
library("dplyr")
library("readr")
library("tidyverse")
# install simulation package
# devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
# BiocManager::install("StatFunGen/pecotmr")
library("pecotmr")
library("simxQTL")
# --- Read genotype ---
simu_file = ${_input:r}
geno <- read_plink(${_input:nr})
gene_name = str_extract(simu_file, "ENSG[0-9]+")
gene_tss_map = read_tsv("./Data/gene_cis_TADB_mapper.tsv")
# --- Filter by TSS window (+/- 1.5Mb) ---
TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
keep_index = which(geno$bim$V4 > (TSS_pos - 1500000) & geno$bim$V4 < (TSS_pos + 1500000))
geno$bed = geno$bed[, keep_index]
# --- SNP filtering ---
imiss = 0.1
maf = 0.05
Xmat = filter_X(geno$bed, imiss, maf)
# --- Select causal variants ---
ncausal = ${n_causal}
ntrait = ${n_trait}
indep = ${"TRUE" if independent else "FALSE"}
if (indep) {
LD_vars = 1 # Initialize LD check
if (ncausal == 1) {
# Only one causal variant needed
vars = sample(1:ncol(Xmat), size = ncausal)
} else {
# Ensure selected variants are approximately independent (LD < 0.3)
while (length(LD_vars != 0)) {
vars = sample(1:ncol(Xmat), size = ncausal)
cor_mat = cor(Xmat[, vars])
LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
}
}
} else {
LD_vars = 1 # Initialize LD check
if (ncausal == 1) {
vars = sample(1:ncol(Xmat), size = ncausal)
} else {
# Avoid perfectly correlated variants (|cor| = 1)
while (length(LD_vars != 0)) {
vars = sample(1:ncol(Xmat), size = ncausal)
cor_mat = cor(Xmat[, vars])
LD_vars = which(colSums(abs(cor_mat) == 1) > 1)
}
}
}
# --- Set up effect sizes ---
B = matrix(0, nrow = ncol(Xmat), ncol = ntrait)
for (i in 1:2) {
causal_index = vars
beta = sim_beta_fix_variant(G = Xmat, causal_index = causal_index, is_h2g_total = FALSE)
B[, i] = beta
}
# --- Record causal variants ---
variant = list()
for (i in 1:ncol(B)) {
variant[[i]] = which(B[, i] != 0)
}
# --- Simulate phenotype with estimated residual covariance ---
variance = readRDS("./Data/vhat_ROSMAP_eQTL.rds")
two_trait_var = variance[c("AC_DeJager_eQTL", "PCC_DeJager_eQTL"), c("AC_DeJager_eQTL", "PCC_DeJager_eQTL")]
phenotype = sim_multi_traits(
G = Xmat,
B = B,
h2g = ${h2g},
is_h2g_total = ${"TRUE" if total_h2g else "FALSE"},
residual_corr = two_trait_var
)
phenotype = phenotype$P
# --- Save results ---
data = list()
data[["X"]] = Xmat
data[["Y"]] = phenotype
data[["variant"]] = variant
saveRDS(data, ${_output:r})
[real_simulation_5trait_corr]
parameter: genofile = paths
# pheno_file: give genotype file (in plink),we can read the gentype matrix. These files are separated by TADs.
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# specify the number of causal variants
parameter: n_trait = 5
parameter: n_causal = 1
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: independent = False
parameter: share_pattern = "all"
# specify the number of traits (phenotypes)
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_real_simulation_{n_causal}_ncausal_{n_trait}_trait.rds'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
# --- Load necessary libraries ---
library("MASS")
library("plink2R")
library("dplyr")
library("readr")
library("tidyverse")
# install simulation package
# devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
# BiocManager::install("StatFunGen/pecotmr")
library("pecotmr")
library("simxQTL")
# read the plink file
simu_file = ${_input:r}
geno <- read_plink(${_input:nr})
gene_name = str_extract(simu_file, "ENSG[0-9]+")
gene_tss_map = read_tsv("./Data/gene_cis_TADB_mapper.tsv")
# filter by distance with. TSS
TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
keep_index = which(geno$bim$V4 > TSS_pos - 1500000 | geno$bim$V4 < TSS_pos + 1500000)
geno$bed = geno$bed[,keep_index]
# filter out columns with missing rate > 0.1
imiss = 0.1
# filter out columns with MAF < 0.05
maf = 0.05
Xmat = filter_X(geno$bed, imiss, maf)
# only keep the first 4000 variants
ncausal = ${n_causal}
ntrait = ${n_trait}
indep = ${"TRUE" if independent else "FALSE"}
if (indep) {
LD_vars = 1 # Initialize LD check
if (ncausal == 1) {
# Only one causal variant needed
vars = sample(1:ncol(Xmat), size = ncausal)
} else {
# Ensure selected variants are approximately independent (LD < 0.3)
while (length(LD_vars != 0)) {
vars = sample(1:ncol(Xmat), size = ncausal)
cor_mat = cor(Xmat[, vars])
LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
}
}
} else {
LD_vars = 1 # Initialize LD check
if (ncausal == 1) {
vars = sample(1:ncol(Xmat), size = ncausal)
} else {
# Avoid perfectly correlated variants (|cor| = 1)
while (length(LD_vars != 0)) {
vars = sample(1:ncol(Xmat), size = ncausal)
cor_mat = cor(Xmat[, vars])
LD_vars = which(colSums(abs(cor_mat) == 1) > 1)
}
}
}
prop = readRDS("/home/hs3393/cloud_colocalization/simulation_code/trait6_prop.rds")[,1:4]
proportions = prop[ncausal,]
sampled_name <- as.numeric(sample(names(proportions), size = ncausal, prob = proportions, replace = TRUE))
phenotype = list()
B = matrix(0, nrow = ncol(Xmat), ncol = ntrait)
config = matrix(0, nrow = ntrait, ncol = ncausal)
for(i in c(1:ncausal)){
coloc_trait = sample(c(1:ntrait), sampled_name[i])
config[coloc_trait, i] = 1
}
for(i in 1:nrow(config)){
beta = B[,i, drop = FALSE]
index = which(config[i,] == 1)
if(length(index) > 0){
causal_index = vars[index]
beta = sim_beta_fix_variant(G = Xmat, causal_index = causal_index, is_h2g_total = FALSE)
B[, i] = beta
}else{
B[, i] = beta
}
}
variance = readRDS("./Data/vhat_ROSMAP_eQTL.rds")
two_trait_var = variance[c("AC_DeJager_eQTL", "PCC_DeJager_eQTL"), c("AC_DeJager_eQTL", "PCC_DeJager_eQTL")]
five_trait_var <- variance[c(1:5), c(1:5)]
phenotype = sim_multi_traits(G = Xmat, B = B, h2g = ${h2g}, is_h2g_total = ${"TRUE" if total_h2g else "FALSE"}, residual_corr = five_trait_var)
variant = list()
for(i in 1:ncol(B)){
variant[[i]] = which(B[,i] != 0)
}
X = Xmat
Y = phenotype$P
colnames(Y) = paste0("Trait", c(1:ntrait))
data = list()
data[["X"]] = Xmat
data[["Y"]] = Y
data[["variant"]] = variant
saveRDS(data, ${_output:r})
[real_simulation_10trait_corr]
parameter: genofile = paths
# pheno_file: give genotype file (in plink),we can read the gentype matrix. These files are separated by TADs.
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# specify the number of causal variants
parameter: n_trait = 10
parameter: n_causal = 1
parameter: h2g = 0.02
parameter: total_h2g = False
parameter: independent = False
parameter: share_pattern = "all"
# specify the number of traits (phenotypes)
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_real_simulation_{n_causal}_ncausal_{n_trait}_trait.rds'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
# --- Load necessary libraries ---
library("MASS")
library("plink2R")
library("dplyr")
library("readr")
library("tidyverse")
# install simulation package
# devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
# BiocManager::install("StatFunGen/pecotmr")
library("pecotmr")
library("simxQTL")
# read the plink file
simu_file = ${_input:r}
geno <- read_plink(${_input:nr})
gene_name = str_extract(simu_file, "ENSG[0-9]+")
gene_tss_map = read_tsv("./Data/gene_cis_TADB_mapper.tsv")
# filter by distance with. TSS
TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
keep_index = which(geno$bim$V4 > TSS_pos - 1500000 | geno$bim$V4 < TSS_pos + 1500000)
geno$bed = geno$bed[,keep_index]
# filter out columns with missing rate > 0.1
imiss = 0.1
# filter out columns with MAF < 0.05
maf = 0.05
Xmat = filter_X(geno$bed, imiss, maf)
ncausal = ${n_causal}
ntrait = ${n_trait}
indep = ${"TRUE" if independent else "FALSE"}
if (indep) {
LD_vars = 1 # Initialize LD check
if (ncausal == 1) {
# Only one causal variant needed
vars = sample(1:ncol(Xmat), size = ncausal)
} else {
# Ensure selected variants are approximately independent (LD < 0.3)
while (length(LD_vars != 0)) {
vars = sample(1:ncol(Xmat), size = ncausal)
cor_mat = cor(Xmat[, vars])
LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
}
}
} else {
LD_vars = 1 # Initialize LD check
if (ncausal == 1) {
vars = sample(1:ncol(Xmat), size = ncausal)
} else {
# Avoid perfectly correlated variants (|cor| = 1)
while (length(LD_vars != 0)) {
vars = sample(1:ncol(Xmat), size = ncausal)
cor_mat = cor(Xmat[, vars])
LD_vars = which(colSums(abs(cor_mat) == 1) > 1)
}
}
}
B = matrix(0, nrow = ncol(Xmat), ncol = ntrait)
for(i in 1:2){
causal_index = vars
beta = sim_beta_fix_variant(G = Xmat, causal_index = causal_index, is_h2g_total = FALSE)
B[, i] = beta
}
variant = list()
for(i in 1:ncol(B)){
variant[[i]] = which(B[,i] != 0)
}
variance = readRDS("./Data/vhat_ROSMAP_eQTL.rds")
two_trait_var = variance[c("AC_DeJager_eQTL", "PCC_DeJager_eQTL"), c("AC_DeJager_eQTL", "PCC_DeJager_eQTL")]
five_trait_var <- variance[c(1:5), c(1:5)]
phenotype = sim_multi_traits(G = Xmat, B = B, h2g = ${h2g}, is_h2g_total = ${"TRUE" if total_h2g else "FALSE"}, residual_corr = variance)
phenotype = phenotype$P
X = Xmat
Y = phenotype
data = list()
data[["X"]] = Xmat
data[["Y"]] = phenotype
data[["variant"]] = variant
saveRDS(data, ${_output:r})
Bash submission#
work_dir="/home/hs3393/cb_Mar/simulation_data/simulation_correlated/"
job="real_simulation_2trait_corr"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code
# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 30:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err
source ~/mamba_activate.sh
module load Singularity
sos run /home/hs3393/cb_Mar/simulation_code/7_Correlated_Simulation.ipynb JOB \
--genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
--n_causal CAUSAL --mem 30G --h2g 0.05 --independent \
--cwd WORK_DIR/
EOF
base_script="base_script"
for causal in 1 2 3; do
output_script="causal_${causal}.sh"
cat ${base_script}|sed "s|CAUSAL|${causal}|g" | sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
done
work_dir="/home/hs3393/cb_Mar/simulation_data/simulation_correlated/"
job="real_simulation_5trait_corr"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code
# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 30:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err
source ~/mamba_activate.sh
module load Singularity
sos run /home/hs3393/cb_Mar/simulation_code/7_Correlated_Simulation.ipynb JOB \
--genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
--n_causal CAUSAL --mem 30G --h2g 0.05 --independent \
--cwd WORK_DIR/
EOF
base_script="base_script"
for causal in 1 2 3 4; do
output_script="causal_${causal}.sh"
cat ${base_script}|sed "s|CAUSAL|${causal}|g" | sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
done
work_dir="/home/hs3393/cb_Mar/simulation_data/simulation_correlated/"
job="real_simulation_10trait_corr"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code
# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 30:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err
source ~/mamba_activate.sh
module load Singularity
sos run ./7_Correlated_Simulation.ipynb JOB \
--genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
--n_causal CAUSAL --mem 30G --h2g 0.05 --independent \
--cwd WORK_DIR/
EOF
base_script="base_script"
for causal in 1 2 3 4 5; do
output_script="causal_${causal}.sh"
cat ${base_script}|sed "s|CAUSAL|${causal}|g" | sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
done