Null simulation#
Following the previous simulation design
\(\mathbf{Y}_1 = \mathbf{X} \boldsymbol{\beta}_1 + \boldsymbol{\varepsilon}\)
We assigned \(\boldsymbol{\beta}= \boldsymbol{0}\), and varied \(Var(\varepsilon)\) by 0.1, 0.5 and 1.
For each region, we simulated 50 traits will all traits being null.
Phenotype simulation code#
[simulation_null]
parameter: genofile = paths
# pheno_file: give genotype file (in plink),we can read the gentype matrix. These files are separated by TADs.
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: null_sigma = 0.1
# for each variant, how many traits it randomly colocalize at
parameter: n_trait = 50
# specify the number of traits (phenotypes)
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_trait_{n_trait}_null.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
library("MASS")
library("plink2R")
library("dplyr")
library("readr")
library("tidyverse")
# source some functions to read matrix and inpute the missing data
source("/home/hs3393/cb_simulation/simulation_code/simulate_linreg.R")
source("~/cloud_colocalization/simulation_code/misc.R")
# read the plink file
simu_file = ${_input:r}
geno <- read_plink(${_input:nr})
gene_name = str_extract(simu_file, "ENSG[0-9]+")
gene_tss_map = read_tsv("/home/hs3393/coloc/fungen-xqtl-analysis/resource/gene_cis_TADB_mapper.tsv")
# filter by distance with. TSS
TSS_pos = gene_tss_map$TSS[which(gene_tss_map$gene_id == gene_name)][1]
keep_index = which(geno$bim$V4 > TSS_pos - 1500000 | geno$bim$V4 < TSS_pos + 1500000)
geno$bed = geno$bed[,keep_index]
# filter out columns with missing rate > 0.1
imiss = 0.1
# filter out columns with MAF < 0.05
maf = 0.05
Xmat = filter_X(geno$bed, imiss, maf)
B = matrix(0, nrow = ncol(Xmat), ncol = 50)
pheno_single = sim_multi_traits(G = Xmat, B = B, h2g = 0.05, is_h2g_total = TRUE, null_sigma = ${null_sigma})
phenotype = pheno_single$P
X = Xmat
Y = phenotype
data = list()
data[["X"]] = Xmat
data[["Y"]] = Y
saveRDS(data, ${_output:r})
Bash submission#
work_dir="/home/hs3393/cb_Mar/simulation_data/"
job="simulation_null"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code
# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 30:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/log/JOB."%j".out
#SBATCH -e WORK_DIR/log/JOB."%j".err
source ~/mamba_activate.sh
module load Singularity
sos run /home/hs3393/cb_Mar/simulation_code/9.Null_Simulation.ipynb JOB \
--genofile `ls /home/hs3393/cloud_colocalization/simulation_data/selected_genes_genotype/*.bim` \
--null_sigma NULL --mem 30G \
--cwd WORK_DIR/
EOF
base_script="base_script"
# squared = 0.1, 0.5 and 1
for null in 0.31 0.71 1; do
output_script="null_${null}.sh"
cat ${base_script}|sed "s|NULL|${null}|g" | sed "s|WORK_DIR|${work_dir}|g" |sed "s|JOB|${job}|g" > ${output_script}
sbatch ${output_script}
done