Correlated phenotypes simulation

Contents

Correlated phenotypes simulation#

Goal#

Here we used estimated correlation to simulate phenotype (traits) with residual correlation.

\(\mathbf{Y}_1 = \mathbf{X} \boldsymbol{\beta}_1 + \boldsymbol{\varepsilon} \quad \text{with} \ \boldsymbol{\varepsilon} \sim \mathcal{N}(\mathbf{0}, \sigma_y^2 \boldsymbol{\Sigma})\)

Input and output format are the same as other phenotype simulation files. \(\Sigma\) is estimated from MASH using AD-xQTL traits.

Phenotype simulation code#

[real_simulation_2trait_corr]
parameter: genofile = paths
# pheno_file: give genotype file (in plink)，we can read the gentype matrix. These files are separated by TADs.
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# specify the number of causal variants
parameter: n_trait = 2
parameter: n_causal = 1
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: share_pattern = "all"
parameter: independent = False
# specify the number of traits (phenotypes)
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_real_simulation_{n_causal}_ncausal_{n_trait}_trait.rds'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
    # source some functions to read matrix and inpute the missing data
    source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
    source("~/cloud_colocalization/simulation_code/misc.R")
    # read the plink file
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")
    # filter by distance with. TSS
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 | geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[,keep_index]
    # filter out columns with missing rate > 0.1
    imiss = 0.1
    # filter out columns with MAF < 0.05
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)
    
    ncausal = ${n_causal}
    ntrait = ${n_trait}
    indep = ${"TRUE" if independent else "FALSE"}
    if (indep) {
        LD_vars = 1  # Initialize LD_vars

        if (ncausal == 1) {
            # If only one causal variant, just sample it
            vars = sample(1:ncol(Xmat), size = ncausal)
        } else {
            # Repeat sampling until selected variables are quasi independent
            while (length(LD_vars != 0)) {
                vars = sample(1:ncol(Xmat), size = ncausal)  
                cor_mat = cor(Xmat[, vars]) 
                LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
            }
        }
    } else {
        vars = sample(1:ncol(Xmat), size = ncausal)
    }
    B = matrix(0, nrow =  ncol(Xmat), ncol = ntrait)
  
    for(i in 1:2){
        causal_index = vars
        beta = sim_beta_fix_variant(G = Xmat, causal_index = causal_index, is_h2g_total = FALSE)
        B[, i] = beta
    }
  
    variant = list()
    for(i in 1:ncol(B)){
      variant[[i]] = which(B[,i] != 0)
    }
    variance = readRDS("/mnt/vast/hpc/csg/xc2270/colocboost/ROSMAP_eQTL/vhat_ROSMAP_eQTL.rds")
    two_trait_var = variance[c("AC_DeJager_eQTL", "PCC_DeJager_eQTL"), c("AC_DeJager_eQTL", "PCC_DeJager_eQTL")]
    five_trait_var <- variance[c(1:5), c(1:5)]
    phenotype = sim_multi_traits(G = Xmat, B = B, h2g = ${h2g}, is_h2g_total = ${"TRUE" if total_h2g else "FALSE"}, residual_corr = two_trait_var)
    phenotype = phenotype$P
    X = Xmat
    Y = phenotype
    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = phenotype
    data[["variant"]] = variant
    saveRDS(data, ${_output:r})

[real_simulation_5trait_corr]
parameter: genofile = paths
# pheno_file: give genotype file (in plink)，we can read the gentype matrix. These files are separated by TADs.
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# specify the number of causal variants
parameter: n_trait = 5
parameter: n_causal = 1
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: independent = False
parameter: share_pattern = "all"
# specify the number of traits (phenotypes)
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_real_simulation_{n_causal}_ncausal_{n_trait}_trait.rds'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
    # source some functions to read matrix and inpute the missing data
    source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
    source("~/cloud_colocalization/simulation_code/misc.R")
    # read the plink file
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")
    # filter by distance with. TSS
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 | geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[,keep_index]
    # filter out columns with missing rate > 0.1
    imiss = 0.1
    # filter out columns with MAF < 0.05
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)
    # only keep the first 4000 variants
    ncausal = ${n_causal}
    ntrait = ${n_trait}
    indep = ${"TRUE" if independent else "FALSE"}
    if (indep) {
        LD_vars = 1  # Initialize LD_vars

        if (ncausal == 1) {
            # If only one causal variant, just sample it
            vars = sample(1:ncol(Xmat), size = ncausal)
        } else {
            # Repeat sampling until selected variables are quasi independent
            while (length(LD_vars != 0)) {
                vars = sample(1:ncol(Xmat), size = ncausal)  
                cor_mat = cor(Xmat[, vars]) 
                LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
            }
        }
    } else {
        vars = sample(1:ncol(Xmat), size = ncausal)
    }
  
    prop = readRDS("/home/hs3393/cloud_colocalization/simulation_code/trait6_prop.rds")[,1:4]
    proportions = prop[ncausal,]
    sampled_name <- as.numeric(sample(names(proportions), size =  ncausal, prob = proportions, replace = TRUE))
    phenotype = list()
    B = matrix(0, nrow =  ncol(Xmat), ncol = ntrait)
    config = matrix(0, nrow =  ntrait, ncol = ncausal)
    for(i in c(1:ncausal)){
          coloc_trait = sample(c(1:ntrait), sampled_name[i])
          config[coloc_trait, i] = 1
    }
    for(i in 1:nrow(config)){
        beta = B[,i, drop = FALSE]
        index = which(config[i,] == 1)
        if(length(index) > 0){
            causal_index = vars[index]
            beta = sim_beta_fix_variant(G = Xmat, causal_index = causal_index, is_h2g_total = FALSE)
            B[, i] = beta
        }else{
            B[, i] = beta
        }
    }
  
    variance = readRDS("/mnt/vast/hpc/csg/xc2270/colocboost/ROSMAP_eQTL/vhat_ROSMAP_eQTL.rds")
    two_trait_var = variance[c("AC_DeJager_eQTL", "PCC_DeJager_eQTL"), c("AC_DeJager_eQTL", "PCC_DeJager_eQTL")]
    five_trait_var <- variance[c(1:5), c(1:5)]
    phenotype = sim_multi_traits(G = Xmat, B = B, h2g = ${h2g}, is_h2g_total = ${"TRUE" if total_h2g else "FALSE"}, residual_corr = five_trait_var)
  
    variant = list()
    for(i in 1:ncol(B)){
        variant[[i]] = which(B[,i] != 0)
    }
    X = Xmat
    Y = phenotype$P
    colnames(Y) = paste0("Trait", c(1:ntrait))
    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = Y
    data[["variant"]] = variant
    saveRDS(data, ${_output:r})

[real_simulation_10opera]
parameter: genofile = paths
# pheno_file: give genotype file (in plink)，we can read the gentype matrix. These files are separated by TADs.
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# specify the number of causal variants
parameter: n_trait = 10
parameter: n_causal = 1
parameter: h2g = 0.02
parameter: total_h2g = False
parameter: independent = False
parameter: share_pattern = "all"
# specify the number of traits (phenotypes)
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_real_simulation_{n_causal}_ncausal_{n_trait}_trait.rds'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
    # source some functions to read matrix and inpute the missing data
    source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
    source("~/cloud_colocalization/simulation_code/misc.R")
    # read the plink file
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")
    # filter by distance with. TSS
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
    keep_index = which(geno$bim$V4 > TSS_pos - 1500000 | geno$bim$V4 < TSS_pos + 1500000)
    geno$bed = geno$bed[,keep_index]
    # filter out columns with missing rate > 0.1
    imiss = 0.1
    # filter out columns with MAF < 0.05
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)
    # only keep the first 4000 variants
    ncausal = ${n_causal}
    ntrait = ${n_trait}

    indep = ${"TRUE" if independent else "FALSE"}
    if (indep) {
        LD_vars = 1  # Initialize LD_vars

        if (ncausal == 1) {
            # If only one causal variant, just sample it
            vars = sample(1:ncol(Xmat), size = ncausal)
        } else {
            # Repeat sampling until selected variables are quasi independent
            while (length(LD_vars != 0)) {
                vars = sample(1:ncol(Xmat), size = ncausal)  
                cor_mat = cor(Xmat[, vars]) 
                LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
            }
        }
    } else {
        vars = sample(1:ncol(Xmat), size = ncausal)
    }
  
    prop = readRDS("/home/hs3393/cloud_colocalization/simulation_code/trait10_prop.rds")
    proportions = prop[ncausal,]
    sampled_name <- as.numeric(sample(names(proportions), size =  ncausal, prob = proportions, replace = TRUE))
    phenotype = list()
    B = matrix(0, nrow =  ncol(Xmat), ncol = ntrait)
    config = matrix(0, nrow =  ntrait, ncol = ncausal)
    for(i in c(1:ncausal)){
          coloc_trait = sample(c(1:ntrait), sampled_name[i])
          config[coloc_trait, i] = 1
    }
    for(i in 1:nrow(config)){
        beta = B[,i, drop = FALSE]
        index = which(config[i,] == 1)
        if(length(index) > 0){
            causal_index = vars[index]
            beta = sim_beta_fix_variant(G = Xmat, causal_index = causal_index, is_h2g_total = FALSE)
            B[, i] = beta
            pheno_single = sim_multi_traits(G = Xmat, B = as.matrix(beta), h2g = 0.05, is_h2g_total = FALSE)
            phenotype[[i]] = pheno_single$P
        }else{
            pheno_single = sim_multi_traits(G = Xmat, B = as.matrix(beta), h2g = 0.05, is_h2g_total = FALSE)
            phenotype[[i]] = pheno_single$P
        
        }
    }
  
    pheno_single = sim_multi_traits(G = Xmat, B = B[,1,drop = FALSE], h2g = ${h2g}, is_h2g_total = FALSE)
    phenotype[[1]] = pheno_single$P
    variant = list()
    for(i in 1:ncol(B)){
        variant[[i]] = which(B[,i] != 0)
    }
    X = Xmat
    Y = bind_cols(phenotype)
    colnames(Y) = paste0("Trait", c(1:ntrait))
    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = Y
    data[["variant"]] = variant
    saveRDS(data, ${_output:r})

Bash submission#

work_dir="/home/hs3393/cb_Mar/simulation_data/simulation_correlated/"
job="real_simulation_2trait_corr"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 30:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err

source ~/mamba_activate.sh
module load Singularity


sos run /home/hs3393/cb_Mar/simulation_code/8.Simulation_correlated.ipynb JOB \
    --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
    --n_causal CAUSAL --mem 30G --h2g 0.05 --independent \
    --cwd WORK_DIR/
EOF

base_script="base_script"
for causal in 1 2 3; do
    output_script="causal_${causal}.sh"
    cat ${base_script}|sed "s|CAUSAL|${causal}|g" | sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g"  > ${output_script}
    sbatch ${output_script}
done

work_dir="/home/hs3393/cb_Mar/simulation_data/simulation_correlated/"
job="real_simulation_5trait_corr"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 30:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err

source ~/mamba_activate.sh
module load Singularity


sos run /home/hs3393/cb_Mar/simulation_code/8.Simulation_correlated.ipynb JOB \
    --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
    --n_causal CAUSAL --mem 30G --h2g 0.05 --independent \
    --cwd WORK_DIR/
EOF

base_script="base_script"
for causal in 1 2 3 4; do
    output_script="causal_${causal}.sh"
    cat ${base_script}|sed "s|CAUSAL|${causal}|g" | sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g"  > ${output_script}
    sbatch ${output_script}
done

work_dir="/home/hs3393/cb_Mar/simulation_data/simulation_correlated/"
job="real_simulation_10trait_corr"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 30:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err

source ~/mamba_activate.sh
module load Singularity


sos run /home/hs3393/cb_Mar/simulation_code/8.Simulation_correlated.ipynb JOB \
    --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
    --n_causal CAUSAL --mem 30G --h2g 0.05 --independent \
    --cwd WORK_DIR/
EOF

base_script="base_script"
for causal in 1 2 3 4 5; do
    output_script="causal_${causal}.sh"
    cat ${base_script}|sed "s|CAUSAL|${causal}|g" | sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g"  > ${output_script}
    sbatch ${output_script}
done