Null Simulation

Null Simulation#

Following the previous simulation design

\(\mathbf{Y}_1 = \mathbf{X} \boldsymbol{\beta}_1 + \boldsymbol{\varepsilon}\)

We assigned \(\boldsymbol{\beta}= \boldsymbol{0}\), and varied \(Var(\varepsilon)\) by 0.1, 0.5 and 1.

For each region, we simulated 50 traits will all traits being null.

Phenotype simulation code#

[simulation_null]
parameter: genofile = paths
# pheno_file: give genotype file (in plink),we can read the gentype matrix. These files are separated by TADs.
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: null_sigma = 1
parameter: n_trait = 50
parameter: container = ""

input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_trait_{n_trait}_null.rds'

task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'

R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
    # Null simulation: no causal variants, simulate pure noise phenotypes
    library("MASS")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("tidyverse")
  
    # install simulation package
    # devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
    # BiocManager::install("StatFunGen/pecotmr")
    library("pecotmr")
    library("simxQTL")

    # --- Read genotype ---
    simu_file = ${_input:r}
    geno <- read_plink(${_input:nr})
    gene_name = str_extract(simu_file, "ENSG[0-9]+")
    gene_tss_map = read_tsv("./Data/gene_cis_TADB_mapper.tsv")

    # --- Filter by TSS window (+/- 1.5Mb) ---
    TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
    keep_index = which(geno$bim$V4 > (TSS_pos - 1500000) & geno$bim$V4 < (TSS_pos + 1500000))
    geno$bed = geno$bed[, keep_index]

    # --- SNP filtering ---
    imiss = 0.1
    maf = 0.05
    Xmat = filter_X(geno$bed, imiss, maf)

    # --- Set up empty effect size matrix ---
    ntrait = ${n_trait}
    B = matrix(0, nrow = ncol(Xmat), ncol = ntrait)

    # --- Simulate null traits ---
    pheno_single = sim_multi_traits(
        G = Xmat,
        B = B,
        h2g = 0.05,
        is_h2g_total = TRUE,
        null_sigma = ${null_sigma}
    )
    phenotype = pheno_single$P

    # --- Save results ---
    data = list()
    data[["X"]] = Xmat
    data[["Y"]] = phenotype
    saveRDS(data, ${_output:r})

Bash submission#

work_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_null"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code

# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 30:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err

source ~/mamba_activate.sh
module load Singularity


sos run ~/colocboost-paper/Simulation_Studies/8_Null_Simulation.ipynb JOB \
    --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
    --null_sigma NULL --mem 30G \
    --cwd WORK_DIR/
EOF

base_script="base_script"
# squared = 0.1, 0.5 and 1
for null in 0.31 0.71 1; do
    output_script="null_${null}.sh"
    cat ${base_script}|sed "s|NULL|${null}|g" | sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g"  > ${output_script}
    sbatch ${output_script}
done