Null Simulation#
Following the previous simulation design
\(\mathbf{Y}_1 = \mathbf{X} \boldsymbol{\beta}_1 + \boldsymbol{\varepsilon}\)
We assigned \(\boldsymbol{\beta}= \boldsymbol{0}\), and varied \(Var(\varepsilon)\) by 0.1, 0.5 and 1.
For each region, we simulated 50 traits will all traits being null.
Phenotype simulation code#
[simulation_null]
parameter: genofile = paths
# pheno_file: give genotype file (in plink),we can read the gentype matrix. These files are separated by TADs.
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: null_sigma = 1
parameter: n_trait = 50
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_trait_{n_trait}_null.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
# Null simulation: no causal variants, simulate pure noise phenotypes
library("MASS")
library("plink2R")
library("dplyr")
library("readr")
library("tidyverse")
# install simulation package
# devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
# BiocManager::install("StatFunGen/pecotmr")
library("pecotmr")
library("simxQTL")
# --- Read genotype ---
simu_file = ${_input:r}
geno <- read_plink(${_input:nr})
gene_name = str_extract(simu_file, "ENSG[0-9]+")
gene_tss_map = read_tsv("./Data/gene_cis_TADB_mapper.tsv")
# --- Filter by TSS window (+/- 1.5Mb) ---
TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
keep_index = which(geno$bim$V4 > (TSS_pos - 1500000) & geno$bim$V4 < (TSS_pos + 1500000))
geno$bed = geno$bed[, keep_index]
# --- SNP filtering ---
imiss = 0.1
maf = 0.05
Xmat = filter_X(geno$bed, imiss, maf)
# --- Set up empty effect size matrix ---
ntrait = ${n_trait}
B = matrix(0, nrow = ncol(Xmat), ncol = ntrait)
# --- Simulate null traits ---
pheno_single = sim_multi_traits(
G = Xmat,
B = B,
h2g = 0.05,
is_h2g_total = TRUE,
null_sigma = ${null_sigma}
)
phenotype = pheno_single$P
# --- Save results ---
data = list()
data[["X"]] = Xmat
data[["Y"]] = phenotype
saveRDS(data, ${_output:r})
Bash submission#
work_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_null"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code
# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 30:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err
source ~/mamba_activate.sh
module load Singularity
sos run ~/colocboost-paper/Simulation_Studies/8_Null_Simulation.ipynb JOB \
--genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
--null_sigma NULL --mem 30G \
--cwd WORK_DIR/
EOF
base_script="base_script"
# squared = 0.1, 0.5 and 1
for null in 0.31 0.71 1; do
output_script="null_${null}.sh"
cat ${base_script}|sed "s|NULL|${null}|g" | sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
done