OPERA: simulation using original proportion configuration#
Goal#
Compare colocboost and OPERA under OPERA's original exposure-outcome simulation design.
1. Match configuration with region index#
# 3 trait index
library(tidyverse)
total_indices <- 500
proportions <- c(0.78, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.01)
# Calculate the exact number of indices for each group
group_sizes <- floor(total_indices * proportions)
# Adjust for any rounding errors
group_sizes[1] <- group_sizes[1] + (total_indices - sum(group_sizes))
# Create a vector of group assignments
groups <- rep(1:8, times = group_sizes)
groups <- sample(groups)
# Create a list of indices for each group
# - 1 to start from 0, matching the zero-based SoS _index used below
partitioned_indices <- lapply(1:8, function(i) which(groups == i) - 1)
# Name the list elements for clarity
names(partitioned_indices) <- paste("Group", 1:8)
# Print the number of indices in each group
sapply(partitioned_indices, length)
# all single-causal sharing patterns; the GWAS trait (position 1) is always causal
config = list(c(1,0,0,0),
              c(1,1,0,0),
              c(1,0,1,0),
              c(1,0,0,1),
              c(1,1,1,0),
              c(1,1,0,1),
              c(1,0,1,1),
              c(1,1,1,1))
saveRDS(list(partitioned_indices = partitioned_indices, config = config), "/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/index/partitioned_index_3trait.rds")
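A quick sanity check on the partition (a minimal sketch using only the objects defined above):

# every region index 0..499 should appear in exactly one group
stopifnot(sort(unlist(partitioned_indices)) == 0:(total_indices - 1))
# realized group proportions should track the targets up to rounding
round(cbind(target = proportions,
            realized = sapply(partitioned_indices, length) / total_indices), 3)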
# 5 trait index
library(tidyverse)
total_indices <- 500
proportions = c(0.655, rep(0.02, 5), rep(0.01, 20), rep(0.008, 5), 0.005)
# Calculate the exact number of indices for each group
group_sizes <- floor(total_indices * proportions)
# Adjust for any rounding errors
group_sizes[1] <- group_sizes[1] + (total_indices - sum(group_sizes))
# Create a vector of group assignments
groups <- rep(1:length(proportions), times = group_sizes)
groups <- sample(groups)
# Create a list to store the indices for each group
# - 1 to start from 0
partitioned_indices <- lapply(1:length(proportions), function(i) which(groups == i) - 1)
# Name the list elements for clarity
names(partitioned_indices) <- paste("Group", 1:length(proportions))
# Print the number of indices in each group
sapply(partitioned_indices, length)
# all single-causal sharing patterns; the GWAS trait (position 1) is always causal
config = list(c(1,0,0,0,0,0),
              c(1,1,0,0,0,0),
              c(1,0,1,0,0,0),
              c(1,0,0,1,0,0),
              c(1,0,0,0,1,0),
              c(1,0,0,0,0,1),
              c(1,1,1,0,0,0),
              c(1,1,0,1,0,0),
              c(1,1,0,0,1,0),
              c(1,1,0,0,0,1),
              c(1,0,1,1,0,0),
              c(1,0,1,0,1,0),
              c(1,0,1,0,0,1),
              c(1,0,0,1,1,0),
              c(1,0,0,1,0,1),
              c(1,0,0,0,1,1),
              c(1,1,1,1,0,0),
              c(1,1,1,0,1,0),
              c(1,1,1,0,0,1),
              c(1,0,1,1,1,0),
              c(1,0,1,1,0,1),
              c(1,0,1,0,1,1),
              c(1,0,0,1,1,1),
              c(1,1,0,1,1,0),
              c(1,1,0,1,0,1),
              c(1,1,0,0,1,1),
              c(1,1,1,1,1,0),
              c(1,1,1,0,1,1),
              c(1,1,1,1,0,1),
              c(1,1,0,1,1,1),
              c(1,0,1,1,1,1),
              c(1,1,1,1,1,1))
saveRDS(list(partitioned_indices = partitioned_indices, config = config), "/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/index/partitioned_index_5trait.rds")
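Enumerating all 32 configurations by hand is error-prone; a minimal sketch (grid, patterns, and config_auto are hypothetical names) that generates the same set programmatically, grouped by how many exposures share the causal variant:

# all 2^5 exposure patterns, with the GWAS trait (position 1) fixed as causal
grid = expand.grid(rep(list(0:1), 5))
patterns = lapply(seq_len(nrow(grid)), function(i) c(1, unlist(grid[i, ], use.names = FALSE)))
config_auto = patterns[order(sapply(patterns, sum))]
length(config_auto)  # 32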
2. Sumstat simulation#
[simulation_opera_3trait]
parameter: genofile = paths
# genofile: PLINK genotype files (one file set per TAD region), from which the genotype matrix is read
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# specify the number of causal variants (this design uses one shared causal variant per region)
parameter: ncausal = 1
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: share_pattern = "all"
parameter: independent = False
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_opera_3trait.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
library("MASS")
library("plink2R")
library("dplyr")
library("readr")
library("tidyverse")
# source helper functions to read the genotype matrix and impute missing data
source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
source("~/cloud_colocalization/simulation_code/misc.R")
# compute per-SNP univariate regression summary statistics (beta, SE, allele frequency, normal-approximation p-value)
calculate_sumstat = function(X, Y){
Beta = c()
se = c()
Freq = c()
p = c()
for(mm in 1:ncol(X)){
rr <- susieR::univariate_regression(X[,mm], Y)
Beta[mm] = rr$betahat
se[mm] <- rr$sebetahat
Freq[mm] = sum(X[,mm])/(2*nrow(X))
p[mm] = 2 * (1 - pnorm(abs(rr$betahat / rr$sebetahat)))
}
tb = tibble(
SNP = colnames(X),
Beta = Beta,
se = se,
Freq = Freq, p = p) %>%
separate(SNP, into = c("Chr", "Bp", "A2", "A1"), sep = "[:_]", remove = FALSE) %>%
select(Chr, SNP, Bp, A1, A2, Freq, Beta, se, p) %>%
mutate(Chr = str_replace(Chr, "chr", "")) %>%
mutate(Chr = as.numeric(Chr), Bp = as.numeric(Bp))
return(tb)
}
# read the region-to-configuration assignment
partitioned_res = readRDS("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/index/partitioned_index_3trait.rds")
# read the plink file
geno <- read_plink(${_input:nr})
tad_number <- ${_index}
# filter out columns with missing rate > 0.1
imiss = 0.1
# filter out columns with MAF < 0.05
maf = 0.05
Xmat = filter_X(geno$bed,imiss,maf)
ncausal = ${ncausal}
indep = ${"TRUE" if independent else "FALSE"}
if (indep) {
LD_vars = 1 # initialize so the resampling loop runs at least once
if (ncausal == 1) {
# If only one causal variant, just sample it
vars = sample(1:ncol(Xmat), size = ncausal)
} else {
# Resample until the selected variants are quasi-independent (no pairwise |r| > 0.3)
while (length(LD_vars) != 0) {
vars = sample(1:ncol(Xmat), size = ncausal)
cor_mat = cor(Xmat[, vars])
LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
}
}
} else {
vars = sample(1:ncol(Xmat), size = ncausal)
}
# find the group containing this region's index, then look up its causal configuration
result <- sapply(partitioned_res$partitioned_indices, function(x) any(x == tad_number))
idx <- which(result)
config = partitioned_res$config[[idx]]
phenotype = list()
B = matrix(0, nrow = ncol(Xmat), ncol = length(config))
for(i in 1:length(config)){
beta = B[,i, drop = FALSE]
if(config[i] == 1){
# causal trait: assign the shared causal variant its simulated effect
causal_index = vars[1]
beta = sim_beta_fix_variant(G = Xmat, causal_index = causal_index, is_h2g_total = FALSE)
B[, i] = beta
}
# non-causal traits keep beta = 0 and yield noise-only phenotypes
pheno_single = sim_multi_traits(G = Xmat, B = as.matrix(beta), h2g = ${h2g}, is_h2g_total = FALSE)
phenotype[[i]] = pheno_single$P
}
X = Xmat
Y = bind_cols(phenotype)
colnames(Y) = paste0("Trait", c(1:length(config)))
trait = list()
# trait 1 is the GWAS outcome: .ma-style columns (SNP A1 A2 freq b se p N) with sample size N = 1160
trait[[1]] = calculate_sumstat(X, unname(unlist(Y[,1]))) %>% rename(freq = Freq, b = Beta) %>% mutate(N = 1160) %>% select(SNP, A1, A2, freq, b, se, p, N)
# the remaining traits are exposures, kept in .esd-style columns
for(i in 2:ncol(Y)){
trait[[i]] = calculate_sumstat(X, unname(unlist(Y[,i])))
}
LD = get_correlation(Xmat)
data = list()
data[["LD"]] = LD
data[["trait"]] = trait
data[["tad_index"]] = tad_number
saveRDS(data, ${_output:r})
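Each output .rds bundles the LD matrix, the per-trait summary statistics, and the region index. A quick inspection sketch (the exact file path depends on --cwd; the name shown assumes the default output directory):

d = readRDS("output/simulation_opera_3trait/sample_0_opera_3trait.rds")
str(d, max.level = 1)  # $LD: SNP correlation matrix; $trait: list of sumstat tables; $tad_index
head(d$trait[[1]])     # GWAS trait in .ma-style columns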
[simulation_opera_5trait]
parameter: genofile = paths
# genofile: PLINK genotype files (one file set per TAD region), from which the genotype matrix is read
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "100h"
parameter: mem = "30G"
parameter: numThreads = 1
# specify the number of causal variants (this design uses one shared causal variant per region)
parameter: ncausal = 1
parameter: h2g = 0.05
parameter: total_h2g = False
parameter: independent = False
parameter: share_pattern = "all"
parameter: container = ""
input: genofile, group_by = 1
output: f'{cwd:a}/{step_name}/sample_{_index}_opera_5trait.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
library("MASS")
library("plink2R")
library("dplyr")
library("readr")
library("tidyverse")
# source helper functions to read the genotype matrix and impute missing data
source("~/cloud_colocalization/simulation_code/simulate_linreg.R")
source("~/cloud_colocalization/simulation_code/misc.R")
# compute per-SNP univariate regression summary statistics (beta, SE, allele frequency, normal-approximation p-value)
calculate_sumstat = function(X, Y){
Beta = c()
se = c()
Freq = c()
p = c()
for(mm in 1:ncol(X)){
rr <- susieR::univariate_regression(X[,mm], Y)
Beta[mm] = rr$betahat
se[mm] <- rr$sebetahat
Freq[mm] = sum(X[,mm])/(2*nrow(X))
p[mm] = 2 * (1 - pnorm(abs(rr$betahat / rr$sebetahat)))
}
tb = tibble(
SNP = colnames(X),
Beta = Beta,
se = se,
Freq = Freq, p = p) %>%
separate(SNP, into = c("Chr", "Bp", "A2", "A1"), sep = "[:_]", remove = FALSE) %>%
select(Chr, SNP, Bp, A1, A2, Freq, Beta, se, p) %>%
mutate(Chr = str_replace(Chr, "chr", "")) %>%
mutate(Chr = as.numeric(Chr), Bp = as.numeric(Bp))
return(tb)
}
# read the region-to-configuration assignment
partitioned_res = readRDS("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/index/partitioned_index_5trait.rds")
# read the plink file
geno <- read_plink(${_input:nr})
tad_number <- ${_index}
# filter out columns with missing rate > 0.1
imiss = 0.1
# filter out columns with MAF < 0.05
maf = 0.05
Xmat = filter_X(geno$bed,imiss,maf)
ncausal = ${ncausal}
indep = ${"TRUE" if independent else "FALSE"}
if (indep) {
LD_vars = 1 # initialize so the resampling loop runs at least once
if (ncausal == 1) {
# If only one causal variant, just sample it
vars = sample(1:ncol(Xmat), size = ncausal)
} else {
# Resample until the selected variants are quasi-independent (no pairwise |r| > 0.3)
while (length(LD_vars) != 0) {
vars = sample(1:ncol(Xmat), size = ncausal)
cor_mat = cor(Xmat[, vars])
LD_vars = which(colSums(abs(cor_mat) > 0.3) > 1)
}
}
} else {
vars = sample(1:ncol(Xmat), size = ncausal)
}
# find the group containing this region's index, then look up its causal configuration
result <- sapply(partitioned_res$partitioned_indices, function(x) any(x == tad_number))
idx <- which(result)
config = partitioned_res$config[[idx]]
phenotype = list()
B = matrix(0, nrow = ncol(Xmat), ncol = length(config))
for(i in 1:length(config)){
beta = B[,i, drop = FALSE]
if(config[i] == 1){
# causal trait: assign the shared causal variant its simulated effect
causal_index = vars[1]
beta = sim_beta_fix_variant(G = Xmat, causal_index = causal_index, is_h2g_total = FALSE)
B[, i] = beta
}
# non-causal traits keep beta = 0 and yield noise-only phenotypes
pheno_single = sim_multi_traits(G = Xmat, B = as.matrix(beta), h2g = ${h2g}, is_h2g_total = FALSE)
phenotype[[i]] = pheno_single$P
}
X = Xmat
Y = bind_cols(phenotype)
colnames(Y) = paste0("Trait", c(1:length(config)))
trait = list()
# trait 1 is the GWAS outcome: .ma-style columns (SNP A1 A2 freq b se p N) with sample size N = 1160
trait[[1]] = calculate_sumstat(X, unname(unlist(Y[,1]))) %>% rename(freq = Freq, b = Beta) %>% mutate(N = 1160) %>% select(SNP, A1, A2, freq, b, se, p, N)
# the remaining traits are exposures, kept in .esd-style columns
for(i in 2:ncol(Y)){
trait[[i]] = calculate_sumstat(X, unname(unlist(Y[,i])))
}
LD = get_correlation(Xmat)
data = list()
data[["LD"]] = LD
data[["trait"]] = trait
data[["tad_index"]] = tad_number
saveRDS(data, ${_output:r})
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=40000
#SBATCH -J OPO_3
#SBATCH -o /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/log/OPO_3."%j".out
#SBATCH -e /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/log/OPO_3."%j".err
source ~/mamba_activate.sh
module load Singularity
sos run /home/hs3393/cloud_colocalization/simulation_code/OPERA_original_design.ipynb simulation_opera_3trait \
--genofile `ls /home/hs3393/cloud_colocalization/simulation_data/opera_genes_genotype/*.bim` \
--mem 40G --h2g 0.05 --independent \
--cwd ~/cloud_colocalization/simulation_data/opera_original_design
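The 5-trait simulation was submitted analogously; a sketch assuming the same resources and log directory (only the workflow name and job name change):

#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=40000
#SBATCH -J OPO_5
#SBATCH -o /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/log/OPO_5."%j".out
#SBATCH -e /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/log/OPO_5."%j".err
source ~/mamba_activate.sh
module load Singularity
sos run /home/hs3393/cloud_colocalization/simulation_code/OPERA_original_design.ipynb simulation_opera_5trait \
    --genofile `ls /home/hs3393/cloud_colocalization/simulation_data/opera_genes_genotype/*.bim` \
    --mem 40G --h2g 0.05 --independent \
    --cwd ~/cloud_colocalization/simulation_data/opera_original_design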
3. Transform sumstat into .esd format and make the .flist summary file#
The GWAS trait is additionally saved in .ma (GCTA-COJO) format.
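For reference, the file layouts written below, with columns taken directly from the code that writes them:

# .esd (one per exposure and region): Chr SNP Bp A1 A2 Freq Beta se p
# .flist (one row per probe):         Chr ProbeID GeneticDistance ProbeBp Gene Orientation PathOfEsd
# .ma (single GWAS file):             SNP A1 A2 freq b se p N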
3 Exposures#
library(tidyverse)
cwd = "/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/simulation_opera_3trait/"
gwas = list()
path = vector("list", 3)
chr = c()
distance = c()
probe_id = c()
cnt = 1
for(file in list.files(cwd, pattern = ".rds")){
file_path = paste0(cwd, file)
rds = readRDS(file_path)
tad_index = rds$tad_index
# trait 1 is the GWAS outcome; traits 2-4 are the exposures
gwas[[cnt]] = rds$trait[[1]]
# probe position: midpoint of the region's base-pair range
distance[cnt] = floor((max(as.numeric(rds$trait[[2]]$Bp)) + min(as.numeric(rds$trait[[2]]$Bp))) / 2)
chr[cnt] = unique(rds$trait[[2]]$Chr)
probe_id[cnt] = paste0("sample_", tad_index)
for(i in 2:4){
path[[i-1]][cnt] = paste0("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_3trait/region_", tad_index, "_trait", i, ".esd")
rds$trait[[i]] %>% write_tsv(path[[i-1]][cnt])
}
cnt = cnt + 1
}
for(i in 2:4){
tibble(Chr = chr, ProbeID = paste0(probe_id, "_", i), GeneticDistance = 0,
ProbeBp = distance, Gene = paste0(probe_id, "_", i), Orientation = NA,
PathOfEsd = path[[i -1]]) %>%
write_tsv(paste0("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_3trait/trait_", i, ".flist"))
}
bind_rows(gwas) %>% write_tsv("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_3trait/trait_1.ma")
bind_rows(gwas) %>% select(SNP) %>% write_tsv("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_3trait/remain_snp.tsv", col_names = FALSE)
5 Exposures#
library(tidyverse)
cwd = "/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/simulation_opera_5trait/"
gwas = list()
path = vector("list", 5)
chr = c()
distance = c()
probe_id = c()
cnt = 1
for(file in list.files(cwd, pattern = ".rds")){
file_path = paste0(cwd, file)
rds = readRDS(file_path)
tad_index = rds$tad_index
gwas[[cnt]] = rds$trait[[1]]
distance[cnt] = floor((max(as.numeric(rds$trait[[2]]$Bp)) + min(as.numeric(rds$trait[[2]]$Bp))) / 2)
chr[cnt] = unique(rds$trait[[2]]$Chr)
probe_id[cnt] = paste0("Region_", tad_index)
for(i in 2:6){
path[[i-1]][cnt] = paste0("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_5trait/region_", tad_index, "_trait", i, ".esd")
rds$trait[[i]] %>% write_tsv(path[[i-1]][cnt])
}
cnt = cnt + 1
}
for(i in 2:6){
tibble(Chr = chr, ProbeID = paste0(probe_id, "_", i), GeneticDistance = 0,
ProbeBp = distance, Gene = paste0(probe_id, "_", i), Orientation = NA,
PathOfEsd = path[[i -1]]) %>%
write_tsv(paste0("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_5trait/trait_", i, ".flist"))
}
bind_rows(gwas) %>% write_tsv("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_5trait/trait_1.ma")
bind_rows(gwas) %>% select(SNP) %>% write_tsv("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_5trait/remain_snp.tsv", col_names = FALSE)
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 36:00:00
#SBATCH --mem=60000
#SBATCH -J EX5
#SBATCH -o /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/log/extract.%j.out
#SBATCH -e /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/log/extract.%j.err
source ~/mamba_activate.sh
module load Singularity
Rscript /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/extract_sumstat/5_extract.R
4. Use SMR to convert .esd files to .besd files as OPERA input#
job_name="besd"
WD="/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/${job_name}"
mkdir -p ${WD}/code
mkdir -p ${WD}/log
traits=("2" "5")
#causals=("1" "2" "3" "4" "5")
cd ${WD}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOBNAME
#SBATCH -o PWD/log/JOBNAME."%j".out
#SBATCH -e PWD/log/JOBNAME."%j".err
source ~/mamba_activate.sh
module load Singularity
cd /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_TRAITtrait
# convert every exposure .flist in this directory (traits 2-4 or 2-6) to dense BESD
for flist in ./trait_*.flist; do
~/smr_heidi/SMR/smr-1.3.1-linux-x86_64/smr-1.3.1 \
--eqtl-flist ${flist} \
--make-besd \
--cis-wind 3000 \
--make-besd-dense \
--out ${flist%.flist}
done
EOF
base_script="base_script"
for trait in "${traits[@]}"; do
output_script="script_trait_${trait}.sh"
cat ${base_script} | sed "s|TRAIT|${trait}|g" | sed "s|PWD|${WD}|g" | sed "s|JOBNAME|${job_name}|g" > ${output_script}
sbatch ${output_script}
done
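SMR's --make-besd writes a .besd/.epi/.esi triple per output prefix. A minimal completeness check in R (the directory shown is the 3-trait input folder used above; adjust the prefix range for the 5-trait run):

dir = "/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_3trait"
# expect TRUE for each of trait_2..trait_4
sapply(paste0("trait_", 2:4), function(p)
  all(file.exists(file.path(dir, paste0(p, c(".besd", ".epi", ".esi"))))))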
5. Exposure (QTL) meta file summary#
library(tidyverse)
# qtl_list: one exposure BESD prefix per line (traits 2-4), consumed by OPERA's --besd-flist
path = paste0("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_3trait/trait_", 2:4)
data <- data.frame(Path = path)
# Print the data
print(data)
# Save as a header-less one-column file
write_csv(data, "/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_3trait/qtl_list", col_names = FALSE)
library(tidyverse)
# qtl_list: one exposure BESD prefix per line (traits 2-6), consumed by OPERA's --besd-flist
path = paste0("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_5trait/trait_", 2:6)
data <- data.frame(Path = path)
# Print the data
print(data)
# Save as a header-less one-column file
write_csv(data, "/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_5trait/qtl_list", col_names = FALSE)
6. Subset the genome-wide SNP PLINK binary file using PLINK#
Only include the variants in our selected non-overlapping regions for stage 1 prior estimation.
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 36:00:00
#SBATCH --mem=60000
#SBATCH -J LDextract
#SBATCH -o LDextract.%j.out
#SBATCH -e LDextract.%j.err
source ~/mamba_activate.sh
module load Singularity
# both designs need a remain_snp_LD reference for stage 2
for ntrait in 3 5; do
cd /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_${ntrait}trait
plink2 --bfile /mnt/vast/hpc/csg/molecular_phenotype_calling/genotype_arch/ROSMAP_NIA_WGS.leftnorm.filtered --extract remain_snp.tsv --make-bed --out remain_snp_LD
done
7. Save the genotype (LD) file path of remain_snp_LD in the genotype_data meta file#
# Write the remain_snp_LD PLINK prefix path into the genotype_data meta file (OPERA's --mbfile input)
library(tidyverse)
for(ntrait in c(3,5)){
data <- paste0("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_",ntrait, "trait/remain_snp_LD")
# Print the path
print(data)
# Save it (one line) to the meta file
write_lines(data, paste0("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_",ntrait, "trait/genotype_data"))
}
8. Stage 1 OPERA#
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 46:00:00
#SBATCH --mem=60000
#SBATCH -J opera1
#SBATCH -o opera1.%j.out
#SBATCH -e opera1.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_3trait/
/home/hs3393/opera_Linux/opera_Linux --besd-flist qtl_list --gwas-summary ./trait_1.ma \
--mbfile ./genotype_data --pi-wind 1500 \
--estimate-pi --out pi_estimation
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 46:00:00
#SBATCH --mem=60000
#SBATCH -J opera1
#SBATCH -o opera1.%j.out
#SBATCH -e opera1.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_5trait/
/home/hs3393/opera_Linux/opera_Linux --besd-flist qtl_list --gwas-summary ./trait_1.ma \
--mbfile ./genotype_data --pi-wind 1500 \
--estimate-pi --out pi_estimation
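Stage 2 below reads the stage 1 outputs pi_estimation.pi and pi_estimation.var (see the --prior-pi-file and --prior-var-file flags). A minimal pre-flight check that they exist:

for (ntrait in c(3, 5)) {
  dir = paste0("/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_", ntrait, "trait")
  print(file.exists(file.path(dir, c("pi_estimation.pi", "pi_estimation.var"))))
}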
9. Stage 2 OPERA#
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 36:00:00
#SBATCH --mem=80000
#SBATCH -J opera2
#SBATCH -o opera2.%j.out
#SBATCH -e opera2.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_3trait
/home/hs3393/opera_Linux/opera_Linux --besd-flist qtl_list --gwas-summary ./trait_1.ma \
--bfile /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_3trait/remain_snp_LD \
--outcome-wind 2000 \
--prior-pi-file /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_3trait/pi_estimation.pi \
--prior-var-file /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_3trait/pi_estimation.var \
--out /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_3trait/
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 36:00:00
#SBATCH --mem=80000
#SBATCH -J opera2
#SBATCH -o opera2.%j.out
#SBATCH -e opera2.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_5trait
/home/hs3393/opera_Linux/opera_Linux --besd-flist qtl_list --gwas-summary ./trait_1.ma \
--bfile /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_5trait/remain_snp_LD \
--outcome-wind 2000 \
--prior-pi-file /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_5trait/pi_estimation.pi \
--prior-var-file /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_5trait/pi_estimation.var \
--out /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/opera_input/simulation_opera_5trait/
10. Run colocboost on simulated data for comparison#
[colocboost_sumstat]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 15
parameter: walltime = "80h"
parameter: mem = "60G"
parameter: numThreads = 3
parameter: trait = 10
parameter: container = ""
parameter: check_null_max = 0.1
parameter: sample_size = 1160
parameter: lfsr_max = 1
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntrait_{trait}_{step_name}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
# load colocboost functions from the development source tree
for(file in list.files("/home/xc2270/COLOCBoost/code_COLOCBoost/colocboost_updating/", full.names = T)) {source(file)}
library(tidyverse)
data = readRDS(${_input:ar})
# trait 1 (GWAS) stores b and N; the exposures store Beta with n = 1160
sumstat = vector("list", length(data$trait))
sumstat[[1]] = data$trait[[1]] %>% mutate(beta = b, sebeta = se, n = N, variant = SNP) %>% select(beta, sebeta, n, variant) %>% as.data.frame()
for(i in 2:length(data$trait)){
sumstat[[i]] = data$trait[[i]] %>% mutate(beta = Beta, sebeta = se, n = 1160, variant = SNP) %>% select(beta, sebeta, n, variant) %>% as.data.frame()
}
colocboost_result = colocboost(sumstat = sumstat, LD = data$LD, target_outcome_idx=1)
colocboost_result$tad_index = data$tad_index
saveRDS(colocboost_result, ${_output:r})
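Once the jobs below finish, the per-region results can be collected for comparison with OPERA. A minimal sketch (the result structure depends on the colocboost version sourced above, but tad_index is attached by the workflow itself; the directory assumes the 3-trait submission below):

library(tidyverse)
res_dir = "/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/colocboost_result/3_trait"
files = list.files(res_dir, pattern = "\\.rds$", full.names = TRUE)
results = map(files, readRDS)
# one row per region, recording which TAD each result came from
tibble(file = basename(files), tad_index = map_dbl(results, "tad_index"))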
work_dir="/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/colocboost_result"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code
# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 36:00:00
#SBATCH --mem=60000
#SBATCH -J opera_colocb
#SBATCH -o PWD/log/complex.%j.out
#SBATCH -e PWD/log/complex.%j.err
source ~/mamba_activate.sh
module load Singularity
cd /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/simulation_opera_3trait
sos run /mnt/vast/hpc/csg/hs3393/cloud_colocalization/simulation_code/OPERA_original_design.ipynb colocboost_sumstat \
--simufile $(find -type f -name '*.rds') \
--mem 60G \
--cwd PWD/3_trait
EOF
base_script="base_script"
output_script="3trait.sh"
cat ${base_script}| sed "s|PWD|${work_dir}|g" > ${output_script}
sbatch ${output_script}
work_dir="/home/hs3393/cloud_colocalization/simulation_data/opera_original_design/colocboost_result"
mkdir -p ${work_dir}
mkdir -p ${work_dir}/code
mkdir -p ${work_dir}/log
cd ${work_dir}/code
# Create the base_script file and write the bash code into it
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 56:00:00
#SBATCH --mem=60000
#SBATCH -J opera_colocb
#SBATCH -o PWD/log/complex.%j.out
#SBATCH -e PWD/log/complex.%j.err
source ~/mamba_activate.sh
module load Singularity
cd /home/hs3393/cloud_colocalization/simulation_data/opera_original_design/simulation_opera_5trait
sos run /mnt/vast/hpc/csg/hs3393/cloud_colocalization/simulation_code/OPERA_original_design.ipynb colocboost_sumstat \
--simufile $(find -type f -name '*.rds') \
--mem 60G \
--cwd PWD/5_trait
EOF
base_script="base_script"
output_script="5trait.sh"
cat ${base_script}| sed "s|PWD|${work_dir}|g" > ${output_script}
sbatch ${output_script}