Run OPERA#
Goal#
After simulating the phenotypes, we will summarize them into summary statistics format, making them compatible with OPERA, and compare with ColocBoost targeted version.
Input#
Simulated individual level data.
Step 1: Choose non-overlapping regions from 1,287 regions used in other simulations#
# Load necessary libraries
library(tidyverse)
# Read in selected gene table
select_gene = read_tsv("./Data//selected_gene_tb.tsv")
# Clean and arrange the gene table:
# - Remove "TADB_" prefix from TADB_index
# - Convert index to numeric
# - Sort by index
select_gene = select_gene %>%
mutate(index = str_replace(TADB_index, "TADB_", "")) %>%
mutate(index = as.numeric(index)) %>%
arrange(index)
# Initialize vectors to store non-overlapping gene intervals
chrom_str = c()
start_str = c()
end_str = c()
gene_str = c()
cnt = 1
# Iterate through genes to select non-overlapping regions
for (i in 1:nrow(select_gene)) {
chrom = select_gene[["#chr"]][i]
start = select_gene[["gene_cis_start"]][i]
end = select_gene[["gene_cis_end"]][i]
gene = select_gene[["gene_id"]][i]
if (i == 1) {
# Always keep the first gene
chrom_str = c(chrom_str, chrom)
start_str = c(start_str, start)
end_str = c(end_str, end)
gene_str = c(gene_str, gene)
cnt = cnt + 1
} else {
if (chrom != chrom_str[cnt - 1]) {
# If chromosome changes, keep the gene
chrom_str = c(chrom_str, chrom)
start_str = c(start_str, start)
end_str = c(end_str, end)
gene_str = c(gene_str, gene)
cnt = cnt + 1
} else {
if (start > end_str[cnt - 1]) {
# If no overlap with previous region on the same chromosome, keep the gene
chrom_str = c(chrom_str, chrom)
start_str = c(start_str, start)
end_str = c(end_str, end)
gene_str = c(gene_str, gene)
cnt = cnt + 1
}
}
}
}
Step 2: Based on causal variant distribution, mix the result by proportion#
Save the selected files in another folder.
## Obtained from true colocalization configuration
trait_2 = c(2177, 233, 26)
trait_2 = trait_2 / sum(trait_2)
trait_5 = c(1831, 258, 41, 9 )
trait_5 = trait_5 / sum(trait_5)
trait_10 = c(3175, 805, 149, 45, 10)
trait_10 = trait_10 / sum(trait_10)
# Mix simulation files based on causal variant number distribution
# ------------------------
# Part 1: 2 causal variants
# ------------------------
set.seed(100) # For reproducibility
total_indices <- 500
proportions <- trait_2 # Proportions for 2 causal variant groups
# Calculate number of samples per group
group_sizes <- floor(total_indices * proportions)
group_sizes[1] <- group_sizes[1] + (total_indices - sum(group_sizes)) # Correct rounding errors
# Assign samples to groups
groups <- rep(1:3, times = group_sizes)
groups <- sample(groups) # Shuffle group assignments
# Find sample indices for each group
partitioned_indices <- lapply(1:3, function(i) which(groups == i) - 1)
# Copy files into mixed folders by h2g value
for (h2g in c(0.02, 0.03, 0.04, 0.05)) {
path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/h2g_", h2g, "_mix/")
dir.create(path, showWarnings = FALSE, recursive = TRUE)
# Copy from group 1
for (i in partitioned_indices[[1]]) {
file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/causal_1_h2g_",
h2g, "/real_simulation_2opera/sample_",
i, "_real_simulation_1_ncausal_2_trait.rds")
file.copy(from = file_path, to = path, overwrite = TRUE)
}
# Copy from group 2
for (i in partitioned_indices[[2]]) {
file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/causal_2_h2g_",
h2g, "/real_simulation_2opera/sample_",
i, "_real_simulation_2_ncausal_2_trait.rds")
file.copy(from = file_path, to = path, overwrite = TRUE)
}
# Copy from group 3
for (i in partitioned_indices[[3]]) {
file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/causal_3_h2g_",
h2g, "/real_simulation_2opera/sample_",
i, "_real_simulation_3_ncausal_2_trait.rds")
file.copy(from = file_path, to = path, overwrite = TRUE)
}
}
# ------------------------
# Part 2: 5 causal variants
# ------------------------
set.seed(100) # Reset seed
total_indices <- 500
proportions <- trait_5 # Proportions for 5 causal variant groups
# Calculate number of samples per group
group_sizes <- floor(total_indices * proportions)
group_sizes[1] <- group_sizes[1] + (total_indices - sum(group_sizes))
groups <- rep(1:4, times = group_sizes)
groups <- sample(groups)
partitioned_indices <- lapply(1:4, function(i) which(groups == i) - 1)
for (h2g in c(0.02, 0.03, 0.04, 0.05)) {
path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/h2g_", h2g, "_mix/")
dir.create(path, showWarnings = FALSE, recursive = TRUE)
for (i in partitioned_indices[[1]]) {
file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/causal_1_h2g_",
h2g, "/real_simulation_5opera/sample_",
i, "_real_simulation_1_ncausal_5_trait.rds")
file.copy(from = file_path, to = path, overwrite = TRUE)
}
for (i in partitioned_indices[[2]]) {
file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/causal_2_h2g_",
h2g, "/real_simulation_5opera/sample_",
i, "_real_simulation_2_ncausal_5_trait.rds")
file.copy(from = file_path, to = path, overwrite = TRUE)
}
for (i in partitioned_indices[[3]]) {
file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/causal_3_h2g_",
h2g, "/real_simulation_5opera/sample_",
i, "_real_simulation_3_ncausal_5_trait.rds")
file.copy(from = file_path, to = path, overwrite = TRUE)
}
for (i in partitioned_indices[[4]]) {
file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/causal_4_h2g_",
h2g, "/real_simulation_5opera/sample_",
i, "_real_simulation_4_ncausal_5_trait.rds")
file.copy(from = file_path, to = path, overwrite = TRUE)
}
}
# -------------------------
# Part 3: 10 causal variants
# -------------------------
set.seed(100)
total_indices <- 500
proportions <- trait_10 # Proportions for 10 causal variant groups
# Calculate number of samples per group
group_sizes <- floor(total_indices * proportions)
group_sizes[1] <- group_sizes[1] + (total_indices - sum(group_sizes))
groups <- rep(1:5, times = group_sizes)
groups <- sample(groups)
partitioned_indices <- lapply(1:5, function(i) which(groups == i) - 1)
for (h2g in c(0.02, 0.03, 0.04, 0.05)) {
path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/h2g_", h2g, "_mix/")
dir.create(path, showWarnings = FALSE, recursive = TRUE)
for (i in partitioned_indices[[1]]) {
file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/causal_1_h2g_",
h2g, "/real_simulation_10opera/sample_",
i, "_real_simulation_1_ncausal_10_trait.rds")
file.copy(from = file_path, to = path, overwrite = TRUE)
}
for (i in partitioned_indices[[2]]) {
file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/causal_2_h2g_",
h2g, "/real_simulation_10opera/sample_",
i, "_real_simulation_2_ncausal_10_trait.rds")
file.copy(from = file_path, to = path, overwrite = TRUE)
}
for (i in partitioned_indices[[3]]) {
file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/causal_3_h2g_",
h2g, "/real_simulation_10opera/sample_",
i, "_real_simulation_3_ncausal_10_trait.rds")
file.copy(from = file_path, to = path, overwrite = TRUE)
}
for (i in partitioned_indices[[4]]) {
file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/causal_4_h2g_",
h2g, "/real_simulation_10opera/sample_",
i, "_real_simulation_4_ncausal_10_trait.rds")
file.copy(from = file_path, to = path, overwrite = TRUE)
}
for (i in partitioned_indices[[5]]) {
file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/causal_5_h2g_",
h2g, "/real_simulation_10opera/sample_",
i, "_real_simulation_5_ncausal_10_trait.rds")
file.copy(from = file_path, to = path, overwrite = TRUE)
}
}
Step 3: Transform individual level simulation into sumstat#
# --- Load necessary libraries ---
library("MASS")
library("plink2R")
library("dplyr")
library("readr")
library("tidyverse")
# install simulation package
# devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
# BiocManager::install("StatFunGen/pecotmr")
library("pecotmr")
# --- Load helper functions ---
source("~/simxQTL/simulate_linreg.R")
# Function to calculate GWAS summary statistics for each SNP
calculate_sumstat = function(X, Y) {
Beta = c()
se = c()
Freq = c()
p = c()
for (mm in 1:ncol(X)) {
rr <- susieR::univariate_regression(X[, mm], Y)
Beta[mm] = rr$betahat
se[mm] = rr$sebetahat
Freq[mm] = sum(X[, mm]) / (2 * nrow(X))
p[mm] = 2 * (1 - pnorm(abs(rr$betahat / rr$sebetahat)))
}
# Assemble summary statistics table
tb = tibble(
SNP = colnames(X),
Beta = Beta,
se = se,
Freq = Freq,
p = p
) %>%
separate(SNP, into = c("Chr", "Bp", "A2", "A1"), sep = "[:_]", remove = FALSE) %>%
select(Chr, SNP, Bp, A1, A2, Freq, Beta, se, p) %>%
mutate(
Chr = str_replace(Chr, "chr", ""),
Chr = as.numeric(Chr),
Bp = as.numeric(Bp)
)
return(tb)
}
# ------------------------------
# Main script to process samples
# ------------------------------
cnt = 1
ntrait = TRAIT # Number of traits (user-defined)
h2g = H2G # Heritability value (user-defined)
# Define input path
path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_", ntrait, "opera/h2g_", h2g, "_mix")
files = list.files(path, full.names = TRUE, pattern = ".rds")
# Initialize data list
data <- rep(list(NULL), length(files))
# Loop over all input files
for (file in files) {
rds = readRDS(file)
X = rds$X
Y = rds$Y
trait = list()
# Process first trait (apply special formatting)
trait[[1]] = calculate_sumstat(X, Y[, 1, drop = TRUE]) %>%
rename(freq = Freq, b = Beta) %>%
mutate(N = 1160) %>%
select(SNP, A1, A2, freq, b, se, p, N)
# Process remaining traits
for (i in 2:ncol(Y)) {
trait[[i]] = calculate_sumstat(X, Y[, i, drop = TRUE])
}
# Extract sample index from filename
index = str_extract(file, "(?<=sample_)\\d+")
# Prepare output data structure
data = list()
data[["trait"]] = trait
data$index = index
data$gwas_var_num = length(rds$variant[[1]])
# Create output directory if it does not exist
dir.create(
paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_", ntrait, "opera/mix_sumstat/h2g_", h2g, "/real_simulation_", ntrait, "opera"),
showWarnings = FALSE,
recursive = TRUE
)
# Save processed summary statistics
saveRDS(
data,
paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_", ntrait, "opera/mix_sumstat/h2g_", h2g, "/real_simulation_", ntrait, "opera/index_", index, ".rds")
)
cnt = cnt + 1
}
Example output#
result = readRDS("../simulation_data/simulation_opera/real_simulation_2opera/mix_sumstat/h2g_0.02/real_simulation_2opera/index_240.rds")
str(result)
head(result$trait[[1]])
List of 3
$ trait :List of 2
..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 6791 obs. of 8 variables:
.. ..$ SNP : chr [1:6791] "chr12:108093914_C_T" "chr12:108093950_A_G" "chr12:108094333_C_G" "chr12:108094913_A_G" ...
.. ..$ A1 : chr [1:6791] "T" "G" "G" "G" ...
.. ..$ A2 : chr [1:6791] "C" "A" "C" "A" ...
.. ..$ freq: num [1:6791] 0.0924 0.3747 0.3504 0.0928 0.2207 ...
.. ..$ b : num [1:6791] -0.4608 0.1953 -0.0277 -0.4819 -0.443 ...
.. ..$ se : num [1:6791] 0.314 0.181 0.191 0.314 0.212 ...
.. ..$ p : num [1:6791] 0.1425 0.2811 0.885 0.1244 0.0369 ...
.. ..$ N : num [1:6791] 1160 1160 1160 1160 1160 1160 1160 1160 1160 1160 ...
..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 6791 obs. of 9 variables:
.. ..$ Chr : num [1:6791] 12 12 12 12 12 12 12 12 12 12 ...
.. ..$ SNP : chr [1:6791] "chr12:108093914_C_T" "chr12:108093950_A_G" "chr12:108094333_C_G" "chr12:108094913_A_G" ...
.. ..$ Bp : num [1:6791] 1.08e+08 1.08e+08 1.08e+08 1.08e+08 1.08e+08 ...
.. ..$ A1 : chr [1:6791] "T" "G" "G" "G" ...
.. ..$ A2 : chr [1:6791] "C" "A" "C" "A" ...
.. ..$ Freq: num [1:6791] 0.0924 0.3747 0.3504 0.0928 0.2207 ...
.. ..$ Beta: num [1:6791] -0.1844 -0.1309 0.128 -0.1795 -0.0441 ...
.. ..$ se : num [1:6791] 0.203 0.117 0.123 0.203 0.137 ...
.. ..$ p : num [1:6791] 0.364 0.263 0.299 0.376 0.748 ...
$ index : chr "240"
$ gwas_var_num: int 1
SNP | A1 | A2 | freq | b | se | p | N | |
---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | chr12:108093914_C_T | T | C | 0.09236774 | -0.46078434 | 0.3141903 | 0.14249112 | 1160 |
2 | chr12:108093950_A_G | G | A | 0.37467476 | 0.19533530 | 0.1812171 | 0.28107492 | 1160 |
3 | chr12:108094333_C_G | G | C | 0.35039029 | -0.02765113 | 0.1911775 | 0.88499837 | 1160 |
4 | chr12:108094913_A_G | G | A | 0.09280139 | -0.48190914 | 0.3136359 | 0.12440993 | 1160 |
5 | chr12:108095069_C_T | T | C | 0.22072853 | -0.44301635 | 0.2122775 | 0.03689105 | 1160 |
6 | chr12:108095158_C_T | T | C | 0.25628794 | 0.19442679 | 0.2086272 | 0.35137065 | 1160 |
base_sh="base_script"
# Create the base_script file and write the bash code into it
cat << 'EOF' > ${base_sh}
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 78:00:00
#SBATCH --mem=30000
#SBATCH -J extract
#SBATCH -o /home/hs3393/cloud_colocalization/submission_script/sumstat_generate/log/extract.%j.out
#SBATCH -e /home/hs3393/cloud_colocalization/submission_script/sumstat_generate/log/extract.%j.err
source ~/mamba_activate.sh
module load Singularity
Rscript /home/hs3393/cb_simulation/simulation_data/simulation_opera/opera_code/sumstat_generate/trait_TRAIT_h2g_H2G.R
EOF
base_R="base_R"
for trait in 2 5 10; do
for h2g in 0.02 0.03 0.04 0.05; do
output_R="trait_${trait}_h2g_${h2g}.R"
output_sh="trait_${trait}_h2g_${h2g}.sh"
cat ${base_R}| sed "s|TRAIT|${trait}|g" | sed "s|H2G|${h2g}|g" > ${output_R}
cat ${base_sh}| sed "s|TRAIT|${trait}|g" | sed "s|H2G|${h2g}|g" > ${output_sh}
sbatch ${output_sh}
done
done
Step 4: Transform sumstat into .esd format and make the .flist summary file#
Also, GWAS trait is saved into .ma format.
# Prepare input files for OPERA simulations (mixed sumstats)
# Load libraries
library(tidyverse)
# ----------------------
# Initialize parameters
# ----------------------
cnt = 1
ntrait = TRAIT # Number of traits (user-defined)
h2g = H2G # Heritability (user-defined)
# Define paths
cwd = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_",
ntrait, "opera/mix_sumstat/h2g_", h2g, "/real_simulation_", ntrait, "opera/")
# Create output directory
dir.create(paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_",
ntrait, "opera/opera_input_mix/", "h2g_", h2g, "/real_simulation_", ntrait, "opera/"),
showWarnings = FALSE, recursive = TRUE)
output_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_",
ntrait, "opera/opera_input_mix/", "h2g_", h2g, "/real_simulation_", ntrait, "opera/")
# ----------------------
# Initialize storage
# ----------------------
gwas = list()
path = vector("list", ntrait)
chr = c()
distance = c()
probe_id = c()
# ----------------------
# Process each mixed sumstat file
# ----------------------
cnt = 1
for (file in list.files(cwd, pattern = ".rds")) {
file_path = paste0(cwd, file)
rds = readRDS(file_path)
# Extract TAD (region) index from file name
tad_index = as.numeric(sub(".*index_([0-9]+).*", "\\1", file_path))
# Save GWAS for the first trait
gwas[[cnt]] = rds$trait[[1]]
# Record metadata (chromosome and region midpoint)
distance[cnt] = floor((max(as.numeric(rds$trait[[2]]$Bp)) + min(as.numeric(rds$trait[[2]]$Bp))) / 2)
chr[cnt] = unique(rds$trait[[2]]$Chr)
probe_id[cnt] = paste0("Region_", tad_index)
# Save other traits as separate ESD files
for (i in 2:ntrait) {
path[[i-1]][cnt] = paste0(output_path, "/region_", tad_index, "_trait_", i, ".esd")
rds$trait[[i]] %>% write_tsv(path[[i-1]][cnt])
}
cnt = cnt + 1
}
# ----------------------
# Write .flist files for traits 2 to ntrait
# ----------------------
for (i in 2:ntrait) {
tibble(
Chr = chr,
ProbeID = paste0(probe_id, "_", i),
GeneticDistance = 0, # Set genetic distance to 0 (not used here)
ProbeBp = distance,
Gene = paste0(probe_id, "_", i),
Orientation = NA,
PathOfEsd = path[[i-1]]
) %>%
write_tsv(paste0(output_path, "/trait_", i, ".flist"))
}
# ----------------------
# Write .ma file for trait 1 and SNP list
# ----------------------
# Combine and save all trait 1 GWAS summary stats
bind_rows(gwas) %>%
write_tsv(paste0(output_path, "/trait_1.ma"))
# Save list of SNPs (for reference)
bind_rows(gwas) %>%
select(SNP) %>%
write_tsv(paste0(output_path, "/remain_snp.tsv"), col_names = FALSE)
Example output#
library(tidyverse)
result = read_delim("../simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera/region_274_trait_2.esd")
head(result)
Rows: 5504 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): SNP, A1, A2
dbl (6): Chr, Bp, Freq, Beta, se, p
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(tidyverse)
result = read_delim("../simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera/trait_2.flist")
head(result)
Rows: 495 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): ProbeID, Gene, PathOfEsd
dbl (3): Chr, GeneticDistance, ProbeBp
lgl (1): Orientation
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Chr | ProbeID | GeneticDistance | ProbeBp | Gene | Orientation | PathOfEsd |
---|---|---|---|---|---|---|
<dbl> | <chr> | <dbl> | <dbl> | <chr> | <lgl> | <chr> |
8 | Region_0_2 | 0 | 17497391 | Region_0_2 | NA | /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera//region_0_trait_2.esd |
7 | Region_1_2 | 0 | 107320728 | Region_1_2 | NA | /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera//region_1_trait_2.esd |
5 | Region_10_2 | 0 | 113896770 | Region_10_2 | NA | /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera//region_10_trait_2.esd |
20 | Region_100_2 | 0 | 61926166 | Region_100_2 | NA | /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera//region_100_trait_2.esd |
20 | Region_101_2 | 0 | 35475951 | Region_101_2 | NA | /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera//region_101_trait_2.esd |
1 | Region_102_2 | 0 | 147899365 | Region_102_2 | NA | /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera//region_102_trait_2.esd |
base_sh="base_script"
# Create the base_script file and write the bash code into it
cat << 'EOF' > ${base_sh}
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 78:00:00
#SBATCH --mem=50000
#SBATCH -J extract
#SBATCH -o /home/hs3393/cloud_colocalization/submission_script/opera_sumstat_extract/log/config.%j.out
#SBATCH -e /home/hs3393/cloud_colocalization/submission_script/opera_sumstat_extract/log/config.%j.err
source ~/mamba_activate.sh
module load Singularity
Rscript /home/hs3393/cb_simulation/simulation_data/simulation_opera/opera_code/opera_sumstat_extract/trait_TRAIT_h2g_H2G.R
EOF
base_R="base_R"
for trait in 2 5 10; do
for h2g in 0.02 0.03 0.04 0.05; do
output_R="trait_${trait}_h2g_${h2g}.R"
output_sh="trait_${trait}_h2g_${h2g}.sh"
cat ${base_R}| sed "s|TRAIT|${trait}|g" | sed "s|H2G|${h2g}|g" > ${output_R}
cat ${base_sh}| sed "s|TRAIT|${trait}|g" | sed "s|H2G|${h2g}|g" > ${output_sh}
sbatch ${output_sh}
done
done
Step 5: Use SMR to convert .esd files to .bsed files as OPERA input#
job_name="besd"
WD="/home/hs3393/cb_simulation/simulation_data/simulation_opera/opera_code/make_besd"
mkdir -p ${WD}/code
mkdir -p ${WD}/log
traits=("2" "5" "10")
h2gs=("0.02" "0.03" "0.04" "0.05")
cd ${WD}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOBNAME
#SBATCH -o PWD/log/JOBNAME."%j".out
#SBATCH -e PWD/log/JOBNAME."%j".err
source ~/mamba_activate.sh
module load Singularity
cd /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_TRAITopera/opera_input_mix/h2g_H2G/real_simulation_TRAITopera
for trait_num in {2..10}; do
~/smr_heidi/SMR/smr-1.3.1-linux-x86_64/smr-1.3.1 \
--eqtl-flist ./trait_${trait_num}.flist \
--make-besd \
--cis-wind 3000 \ # extended since our cis region can be larger than 2Mb
--make-besd-dense \ # keep all SNP info
--out ./trait_${trait_num}
done
EOF
base_script="base_script"
for trait in "${traits[@]}"; do
for h2g in "${h2gs[@]}"; do
output_script="script_trait_${trait}_h2g_${h2g}.sh"
cat ${base_script} | sed "s|TRAIT|${trait}|g" | sed "s|H2G|${h2g}|g" | sed "s|PWD|${WD}|g" | sed "s|JOBNAME|${job_name}|g" > ${output_script}
sb ${output_script}
# cat script_variant_${variant}.sh
done
done
Step 6: Exposure (QTL) meta file summary#
# Create a two-line table with strings
library(tidyverse)
for(TRAIT in c(2,5,10)){
for(H2G in c(0.02, 0.03, 0.04, 0.05)){
path = c()
ntrait = TRAIT
h2g = H2G
cnt = 1
output_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_", ntrait, "opera/opera_input_mix/h2g_", h2g, "/real_simulation_", ntrait, "opera/")
for(i in c(2:ntrait)){
path[cnt] = paste0(output_path, "trait_", i)
cnt = cnt + 1
}
data <- data.frame(Path = path)
# Print the data
print(data)
# Save the data to a CSV file
write_csv(data, paste0(output_path, "/qtl_list"), col_names = FALSE)
}
}
Example output#
cat /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.02/real_simulation_5opera/qtl_list
/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.02/real_simulation_5opera/trait_2
/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.02/real_simulation_5opera/trait_3
/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.02/real_simulation_5opera/trait_4
/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.02/real_simulation_5opera/trait_5
Step 7: Subset the genome-wide SNP PLINK binary file#
Only include the variants in our selected non-overlapping regions for stage 1 prior estimation.
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 36:00:00
#SBATCH --mem=60000
#SBATCH -J LDextract
#SBATCH -o LDextract.%j.out
#SBATCH -e LDextract.%j.err
source ~/mamba_activate.sh
module load Singularity
cd /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_TRAITopera/opera_input_mix/h2g_H2G/real_simulation_TRAITopera
plink2 --bfile /mnt/vast/hpc/csg/molecular_phenotype_calling/genotype_arch/ROSMAP_NIA_WGS.leftnorm.filtered --extract remain_snp.tsv --make-bed --out remain_snp_LD
base_sh="base_sh"
for trait in 2 5 10; do
for h2g in 0.02 0.03 0.04 0.05; do
output_sh="trait_${trait}_h2g_${h2g}.sh"
cat ${base_sh}| sed "s|TRAIT|${trait}|g" | sed "s|H2G|${h2g}|g" > ${output_sh}
sbatch ${output_sh}
done
done
Step 8: Save genotype (LD) metadata in remain_snp_LD file#
# LD
# Create a two-line table with strings
library(tidyverse)
for(ntrait in c(2,5,10)){
for(h2g in c(0.02, 0.03, 0.04, 0.05)){
cnt = 1
output_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_", ntrait, "opera/opera_input_mix/h2g_", h2g, "/real_simulation_", ntrait, "opera/")
data <- paste0(output_path, "/remain_snp_LD")
# Print the data
print(data)
# Save the data to a CSV file
write_lines(data, paste0(output_path, "/genotype_data"))
}
}
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.03/real_simulation_2opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.04/real_simulation_2opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.05/real_simulation_2opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.02/real_simulation_5opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.03/real_simulation_5opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.04/real_simulation_5opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.05/real_simulation_5opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/opera_input_mix/h2g_0.02/real_simulation_10opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/opera_input_mix/h2g_0.03/real_simulation_10opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/opera_input_mix/h2g_0.04/real_simulation_10opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/opera_input_mix/h2g_0.05/real_simulation_10opera//remain_snp_LD"
Step 9: Run OPERA#
Stage 1 OPERA (pi estimation)#
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 36:00:00
#SBATCH --mem=60000
#SBATCH -J opera1
#SBATCH -o opera1.%j.out
#SBATCH -e opera1.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_TRAITopera/opera_input_mix/h2g_H2G/real_simulation_TRAITopera
/home/hs3393/opera_Linux/opera_Linux --besd-flist qtl_list --gwas-summary ./trait_1.ma \
--mbfile ./genotype_data --pi-wind 150 \ # extended since our cis region can be larger than 2Mb
--estimate-pi --out pi_estimation
base_sh="base_sh"
for trait in 2 5 10; do
for h2g in 0.02 0.03 0.04 0.05; do
output_sh="trait_${trait}_h2g_${h2g}.sh"
cat ${base_sh}| sed "s|TRAIT|${trait}|g" | sed "s|H2G|${h2g}|g" > ${output_sh}
sbatch ${output_sh}
done
done
Example output#
cat ../simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.05/real_simulation_5opera/pi_estimation.pi
Posteriors Pi1(0:0:0:0) Pi2(0:0:0:1) Pi3(0:0:1:0) Pi4(0:0:1:1) Pi5(0:1:0:0) Pi6(0:1:0:1) Pi7(0:1:1:0) Pi8(0:1:1:1) Pi9(1:0:0:0) Pi10(1:0:0:1) Pi11(1:0:1:0) Pi12(1:0:1:1) Pi13(1:1:0:0) Pi14(1:1:0:1) Pi15(1:1:1:0) Pi16(1:1:1:1)
Mean 0.275201 0.00228085 0.00233336 0.00224573 0.00231277 0.00220141 0.00223646 0.00227284 0.00246007 0.00209981 0.00291004 0.00263845 0.00362954 0.00242097 0.00327664 0.68948
SD 0.0643589 0.00712061 0.00685669 0.00697502 0.00686388 0.00684095 0.00673787 0.00699348 0.00695986 0.00649404 0.00799288 0.00752632 0.00940491 0.00681977 0.00880387 0.0667714
Stage 2 OPERA#
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 36:00:00
#SBATCH --mem=80000
#SBATCH -J opera2
#SBATCH -o opera2.%j.out
#SBATCH -e opera2.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_TRAITopera/opera_input_mix/h2g_H2G/real_simulation_TRAITopera
/home/hs3393/opera_Linux/opera_Linux --besd-flist qtl_list --gwas-summary ./trait_1.ma \
--bfile ./remain_snp_LD \
--prior-pi-file ./pi_estimation.pi \
--prior-var-file ./pi_estimation.var \
--out ./opera_res --outcome-wind 1500 # extended since our cis region can be larger than 2Mb
base_sh="base_sh"
for trait in 2 5 10; do
for h2g in 0.02 0.03 0.04 0.05; do
output_sh="trait_${trait}_h2g_${h2g}.sh"
cat ${base_sh}| sed "s|TRAIT|${trait}|g" | sed "s|H2G|${h2g}|g" > ${output_sh}
sbatch ${output_sh}
done
done
Example output#
head ../simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.05/real_simulation_5opera/opera_res_4_expos_assoc.ppa
Chr Expo1_ID Expo1_bp Expo2_ID Expo2_bp Expo3_ID Expo3_bp Expo4_ID Expo4_bp PPA(1,2,3,4) p_HEIDI(1,2,3,4)
10 Region_241_2 37025329 Region_241_3 37025329 Region_241_4 37025329 Region_241_5 37025329 0.999967 5.250539e-01
11 Region_107_2 11927391 Region_107_3 11927391 Region_107_4 11927391 Region_107_5 11927391 0.999991 8.772566e-01
11 Region_108_2 18057498 Region_108_3 18057498 Region_108_4 18057498 Region_108_5 18057498 1 6.443910e-01
13 Region_43_2 54814185 Region_43_3 54814185 Region_43_4 54814185 Region_43_5 54814185 0.999916 5.845443e-01
13 Region_449_2 30572631 Region_449_3 30572631 Region_449_4 30572631 Region_449_5 30572631 0.999933 3.243127e-01
18 Region_41_2 61335205 Region_41_3 61335205 Region_41_4 61335205 Region_41_5 61335205 0.999989 8.673183e-01
1 Region_124_2 84179558 Region_124_3 84179558 Region_124_4 84179558 Region_124_5 84179558 0.999975 4.870884e-01
2 Region_73_2 59140327 Region_73_3 59140327 Region_73_4 59140327 Region_73_5 59140327 0.999987 9.353629e-01
3 Region_246_2 108165863 Region_246_3 108165863 Region_246_4 108165863 Region_246_5 108165863 0.999999 3.673531e-01