Correlated phenotypes simulation#
Goal#
Here we used estimated correlation to simulate phenotype (traits) with residual correlation.
\(\mathbf{Y}_1 = \mathbf{X} \boldsymbol{\beta}_1 + \boldsymbol{\varepsilon} \quad \text{with} \ \boldsymbol{\varepsilon} \sim \mathcal{N}(\mathbf{0}, \sigma_y^2 \boldsymbol{\Sigma})\)
Input and output format are the same as other phenotype simulation files. \(\Sigma\) is estimated from MASH using AD-xQTL traits.
Phenotype simulation code#
[real_simulation_2trait_corr]
parameter: genofile = paths
# pheno_file: give genotype file (in plink),we can read the gentype matrix. These files are separated by TADs.
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# specify the number of causal variants
parameter: n_trait = 2
parameter: n_causal = 1
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: share_pattern = "all"
parameter: independent = False
# specify the number of traits (phenotypes)
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_real_simulation_{n_causal}_ncausal_{n_trait}_trait.rds'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
library("MASS")
library("plink2R")
library("dplyr")
library("readr")
library("tidyverse")
# source some functions to read matrix and inpute the missing data
source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
source("~/cloud_colocalization/simulation_code/misc.R")
# read the plink file
simu_file = ${_input:r}
geno <- read_plink(${_input:nr})
gene_name = str_extract(simu_file, "ENSG[0-9]+")
gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")
# filter by distance with. TSS
TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
keep_index = which(geno$bim$V4 > TSS_pos - 1500000 | geno$bim$V4 < TSS_pos + 1500000)
geno$bed = geno$bed[,keep_index]
# filter out columns with missing rate > 0.1
imiss = 0.1
# filter out columns with MAF < 0.05
maf = 0.05
Xmat = filter_X(geno$bed, imiss, maf)
ncausal = ${n_causal}
ntrait = ${n_trait}
indep = ${"TRUE" if independent else "FALSE"}
if (indep) {
LD_vars = 1 # Initialize LD_vars
if (ncausal == 1) {
# If only one causal variant, just sample it
vars = sample(1:ncol(Xmat), size = ncausal)
} else {
# Repeat sampling until selected variables are quasi independent
while (length(LD_vars != 0)) {
vars = sample(1:ncol(Xmat), size = ncausal)
cor_mat = cor(Xmat[, vars])
LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
}
}
} else {
vars = sample(1:ncol(Xmat), size = ncausal)
}
B = matrix(0, nrow = ncol(Xmat), ncol = ntrait)
for(i in 1:2){
causal_index = vars
beta = sim_beta_fix_variant(G = Xmat, causal_index = causal_index, is_h2g_total = FALSE)
B[, i] = beta
}
variant = list()
for(i in 1:ncol(B)){
variant[[i]] = which(B[,i] != 0)
}
variance = readRDS("/mnt/vast/hpc/csg/xc2270/colocboost/ROSMAP_eQTL/vhat_ROSMAP_eQTL.rds")
two_trait_var = variance[c("AC_DeJager_eQTL", "PCC_DeJager_eQTL"), c("AC_DeJager_eQTL", "PCC_DeJager_eQTL")]
five_trait_var <- variance[c(1:5), c(1:5)]
phenotype = sim_multi_traits(G = Xmat, B = B, h2g = ${h2g}, is_h2g_total = ${"TRUE" if total_h2g else "FALSE"}, residual_corr = two_trait_var)
phenotype = phenotype$P
X = Xmat
Y = phenotype
data = list()
data[["X"]] = Xmat
data[["Y"]] = phenotype
data[["variant"]] = variant
saveRDS(data, ${_output:r})
[real_simulation_5trait_corr]
parameter: genofile = paths
# pheno_file: give genotype file (in plink),we can read the gentype matrix. These files are separated by TADs.
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# specify the number of causal variants
parameter: n_trait = 5
parameter: n_causal = 1
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: independent = False
parameter: share_pattern = "all"
# specify the number of traits (phenotypes)
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_real_simulation_{n_causal}_ncausal_{n_trait}_trait.rds'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
library("MASS")
library("plink2R")
library("dplyr")
library("readr")
library("tidyverse")
# source some functions to read matrix and inpute the missing data
source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
source("~/cloud_colocalization/simulation_code/misc.R")
# read the plink file
simu_file = ${_input:r}
geno <- read_plink(${_input:nr})
gene_name = str_extract(simu_file, "ENSG[0-9]+")
gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")
# filter by distance with. TSS
TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
keep_index = which(geno$bim$V4 > TSS_pos - 1500000 | geno$bim$V4 < TSS_pos + 1500000)
geno$bed = geno$bed[,keep_index]
# filter out columns with missing rate > 0.1
imiss = 0.1
# filter out columns with MAF < 0.05
maf = 0.05
Xmat = filter_X(geno$bed, imiss, maf)
# only keep the first 4000 variants
ncausal = ${n_causal}
ntrait = ${n_trait}
indep = ${"TRUE" if independent else "FALSE"}
if (indep) {
LD_vars = 1 # Initialize LD_vars
if (ncausal == 1) {
# If only one causal variant, just sample it
vars = sample(1:ncol(Xmat), size = ncausal)
} else {
# Repeat sampling until selected variables are quasi independent
while (length(LD_vars != 0)) {
vars = sample(1:ncol(Xmat), size = ncausal)
cor_mat = cor(Xmat[, vars])
LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
}
}
} else {
vars = sample(1:ncol(Xmat), size = ncausal)
}
prop = readRDS("/home/hs3393/cloud_colocalization/simulation_code/trait6_prop.rds")[,1:4]
proportions = prop[ncausal,]
sampled_name <- as.numeric(sample(names(proportions), size = ncausal, prob = proportions, replace = TRUE))
phenotype = list()
B = matrix(0, nrow = ncol(Xmat), ncol = ntrait)
config = matrix(0, nrow = ntrait, ncol = ncausal)
for(i in c(1:ncausal)){
coloc_trait = sample(c(1:ntrait), sampled_name[i])
config[coloc_trait, i] = 1
}
for(i in 1:nrow(config)){
beta = B[,i, drop = FALSE]
index = which(config[i,] == 1)
if(length(index) > 0){
causal_index = vars[index]
beta = sim_beta_fix_variant(G = Xmat, causal_index = causal_index, is_h2g_total = FALSE)
B[, i] = beta
}else{
B[, i] = beta
}
}
variance = readRDS("/mnt/vast/hpc/csg/xc2270/colocboost/ROSMAP_eQTL/vhat_ROSMAP_eQTL.rds")
two_trait_var = variance[c("AC_DeJager_eQTL", "PCC_DeJager_eQTL"), c("AC_DeJager_eQTL", "PCC_DeJager_eQTL")]
five_trait_var <- variance[c(1:5), c(1:5)]
phenotype = sim_multi_traits(G = Xmat, B = B, h2g = ${h2g}, is_h2g_total = ${"TRUE" if total_h2g else "FALSE"}, residual_corr = five_trait_var)
variant = list()
for(i in 1:ncol(B)){
variant[[i]] = which(B[,i] != 0)
}
X = Xmat
Y = phenotype$P
colnames(Y) = paste0("Trait", c(1:ntrait))
data = list()
data[["X"]] = Xmat
data[["Y"]] = Y
data[["variant"]] = variant
saveRDS(data, ${_output:r})
[real_simulation_10opera]
parameter: genofile = paths
# pheno_file: give genotype file (in plink),we can read the gentype matrix. These files are separated by TADs.
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# specify the number of causal variants
parameter: n_trait = 10
parameter: n_causal = 1
parameter: h2g = 0.02
parameter: total_h2g = False
parameter: independent = False
parameter: share_pattern = "all"
# specify the number of traits (phenotypes)
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_real_simulation_{n_causal}_ncausal_{n_trait}_trait.rds'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
library("MASS")
library("plink2R")
library("dplyr")
library("readr")
library("tidyverse")
# source some functions to read matrix and inpute the missing data
source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
source("~/cloud_colocalization/simulation_code/misc.R")
# read the plink file
simu_file = ${_input:r}
geno <- read_plink(${_input:nr})
gene_name = str_extract(simu_file, "ENSG[0-9]+")
gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")
# filter by distance with. TSS
TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
keep_index = which(geno$bim$V4 > TSS_pos - 1500000 | geno$bim$V4 < TSS_pos + 1500000)
geno$bed = geno$bed[,keep_index]
# filter out columns with missing rate > 0.1
imiss = 0.1
# filter out columns with MAF < 0.05
maf = 0.05
Xmat = filter_X(geno$bed, imiss, maf)
# only keep the first 4000 variants
ncausal = ${n_causal}
ntrait = ${n_trait}
indep = ${"TRUE" if independent else "FALSE"}
if (indep) {
LD_vars = 1 # Initialize LD_vars
if (ncausal == 1) {
# If only one causal variant, just sample it
vars = sample(1:ncol(Xmat), size = ncausal)
} else {
# Repeat sampling until selected variables are quasi independent
while (length(LD_vars != 0)) {
vars = sample(1:ncol(Xmat), size = ncausal)
cor_mat = cor(Xmat[, vars])
LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
}
}
} else {
vars = sample(1:ncol(Xmat), size = ncausal)
}
prop = readRDS("/home/hs3393/cloud_colocalization/simulation_code/trait10_prop.rds")
proportions = prop[ncausal,]
sampled_name <- as.numeric(sample(names(proportions), size = ncausal, prob = proportions, replace = TRUE))
phenotype = list()
B = matrix(0, nrow = ncol(Xmat), ncol = ntrait)
config = matrix(0, nrow = ntrait, ncol = ncausal)
for(i in c(1:ncausal)){
coloc_trait = sample(c(1:ntrait), sampled_name[i])
config[coloc_trait, i] = 1
}
for(i in 1:nrow(config)){
beta = B[,i, drop = FALSE]
index = which(config[i,] == 1)
if(length(index) > 0){
causal_index = vars[index]
beta = sim_beta_fix_variant(G = Xmat, causal_index = causal_index, is_h2g_total = FALSE)
B[, i] = beta
pheno_single = sim_multi_traits(G = Xmat, B = as.matrix(beta), h2g = 0.05, is_h2g_total = FALSE)
phenotype[[i]] = pheno_single$P
}else{
pheno_single = sim_multi_traits(G = Xmat, B = as.matrix(beta), h2g = 0.05, is_h2g_total = FALSE)
phenotype[[i]] = pheno_single$P
}
}
pheno_single = sim_multi_traits(G = Xmat, B = B[,1,drop = FALSE], h2g = ${h2g}, is_h2g_total = FALSE)
phenotype[[1]] = pheno_single$P
variant = list()
for(i in 1:ncol(B)){
variant[[i]] = which(B[,i] != 0)
}
X = Xmat
Y = bind_cols(phenotype)
colnames(Y) = paste0("Trait", c(1:ntrait))
data = list()
data[["X"]] = Xmat
data[["Y"]] = Y
data[["variant"]] = variant
saveRDS(data, ${_output:r})
Bash submission#
work_dir="/home/hs3393/cb_Mar/simulation_data/simulation_correlated/"
job="real_simulation_2trait_corr"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code
# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 30:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err
source ~/mamba_activate.sh
module load Singularity
sos run /home/hs3393/cb_Mar/simulation_code/8.Simulation_correlated.ipynb JOB \
--genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
--n_causal CAUSAL --mem 30G --h2g 0.05 --independent \
--cwd WORK_DIR/
EOF
base_script="base_script"
for causal in 1 2 3; do
output_script="causal_${causal}.sh"
cat ${base_script}|sed "s|CAUSAL|${causal}|g" | sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
done
work_dir="/home/hs3393/cb_Mar/simulation_data/simulation_correlated/"
job="real_simulation_5trait_corr"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code
# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 30:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err
source ~/mamba_activate.sh
module load Singularity
sos run /home/hs3393/cb_Mar/simulation_code/8.Simulation_correlated.ipynb JOB \
--genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
--n_causal CAUSAL --mem 30G --h2g 0.05 --independent \
--cwd WORK_DIR/
EOF
base_script="base_script"
for causal in 1 2 3 4; do
output_script="causal_${causal}.sh"
cat ${base_script}|sed "s|CAUSAL|${causal}|g" | sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
done
work_dir="/home/hs3393/cb_Mar/simulation_data/simulation_correlated/"
job="real_simulation_10trait_corr"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code
# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 30:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err
source ~/mamba_activate.sh
module load Singularity
sos run /home/hs3393/cb_Mar/simulation_code/8.Simulation_correlated.ipynb JOB \
--genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
--n_causal CAUSAL --mem 30G --h2g 0.05 --independent \
--cwd WORK_DIR/
EOF
base_script="base_script"
for causal in 1 2 3 4 5; do
output_script="causal_${causal}.sh"
cat ${base_script}|sed "s|CAUSAL|${causal}|g" | sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
done