Run OPERA

Run OPERA#

Goal#

After simulating the phenotypes, we will summarize them into summary statistics format, making them compatible with OPERA, and compare with ColocBoost targeted version.

Input#

Simulated individual level data.

Step 1: Choose non-overlapping regions from 1,287 regions used in other simulations#

# Load necessary libraries
library(tidyverse)

# Read in selected gene table
select_gene = read_tsv("./Data//selected_gene_tb.tsv")

# Clean and arrange the gene table:
# - Remove "TADB_" prefix from TADB_index
# - Convert index to numeric
# - Sort by index
select_gene = select_gene %>% 
  mutate(index = str_replace(TADB_index, "TADB_", "")) %>%
  mutate(index = as.numeric(index)) %>%
  arrange(index)

# Initialize vectors to store non-overlapping gene intervals
chrom_str = c()
start_str = c()
end_str = c()
gene_str = c()
cnt = 1

# Iterate through genes to select non-overlapping regions
for (i in 1:nrow(select_gene)) {
    chrom = select_gene[["#chr"]][i]
    start = select_gene[["gene_cis_start"]][i]
    end = select_gene[["gene_cis_end"]][i]
    gene = select_gene[["gene_id"]][i]
    
    if (i == 1) {
        # Always keep the first gene
        chrom_str = c(chrom_str, chrom)
        start_str = c(start_str, start)
        end_str = c(end_str, end)
        gene_str = c(gene_str, gene)
        cnt = cnt + 1
    } else {
        if (chrom != chrom_str[cnt - 1]) {
            # If chromosome changes, keep the gene
            chrom_str = c(chrom_str, chrom)
            start_str = c(start_str, start)
            end_str = c(end_str, end)
            gene_str = c(gene_str, gene)
            cnt = cnt + 1
        } else {
            if (start > end_str[cnt - 1]) {
                # If no overlap with previous region on the same chromosome, keep the gene
                chrom_str = c(chrom_str, chrom)
                start_str = c(start_str, start)
                end_str = c(end_str, end)
                gene_str = c(gene_str, gene)
                cnt = cnt + 1
            } 
        }
    }
}

Step 2: Based on causal variant distribution, mix the result by proportion#

Save the selected files in another folder.

## Obtained from true colocalization configuration

trait_2 = c(2177, 233, 26)
trait_2 = trait_2 / sum(trait_2)

trait_5 = c(1831, 258, 41, 9 )
trait_5 = trait_5 / sum(trait_5)

trait_10 = c(3175, 805, 149, 45, 10)
trait_10 = trait_10 / sum(trait_10)

# Mix simulation files based on causal variant number distribution

# ------------------------
# Part 1: 2 causal variants
# ------------------------

set.seed(100)  # For reproducibility
total_indices <- 500
proportions <- trait_2  # Proportions for 2 causal variant groups

# Calculate number of samples per group
group_sizes <- floor(total_indices * proportions)
group_sizes[1] <- group_sizes[1] + (total_indices - sum(group_sizes))  # Correct rounding errors

# Assign samples to groups
groups <- rep(1:3, times = group_sizes)
groups <- sample(groups)  # Shuffle group assignments

# Find sample indices for each group
partitioned_indices <- lapply(1:3, function(i) which(groups == i) - 1)

# Copy files into mixed folders by h2g value
for (h2g in c(0.02, 0.03, 0.04, 0.05)) {
    path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/h2g_", h2g, "_mix/")
    dir.create(path, showWarnings = FALSE, recursive = TRUE)
    
    # Copy from group 1
    for (i in partitioned_indices[[1]]) {
        file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/causal_1_h2g_", 
                           h2g, "/real_simulation_2opera/sample_", 
                           i, "_real_simulation_1_ncausal_2_trait.rds")
        file.copy(from = file_path, to = path, overwrite = TRUE)
    }
    
    # Copy from group 2
    for (i in partitioned_indices[[2]]) {
        file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/causal_2_h2g_", 
                           h2g, "/real_simulation_2opera/sample_", 
                           i, "_real_simulation_2_ncausal_2_trait.rds")
        file.copy(from = file_path, to = path, overwrite = TRUE)
    }
    
    # Copy from group 3
    for (i in partitioned_indices[[3]]) {
        file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/causal_3_h2g_", 
                           h2g, "/real_simulation_2opera/sample_", 
                           i, "_real_simulation_3_ncausal_2_trait.rds")
        file.copy(from = file_path, to = path, overwrite = TRUE)
    }
}

# ------------------------
# Part 2: 5 causal variants
# ------------------------

set.seed(100)  # Reset seed
total_indices <- 500
proportions <- trait_5  # Proportions for 5 causal variant groups

# Calculate number of samples per group
group_sizes <- floor(total_indices * proportions)
group_sizes[1] <- group_sizes[1] + (total_indices - sum(group_sizes))

groups <- rep(1:4, times = group_sizes)
groups <- sample(groups)

partitioned_indices <- lapply(1:4, function(i) which(groups == i) - 1)

for (h2g in c(0.02, 0.03, 0.04, 0.05)) {
    path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/h2g_", h2g, "_mix/")
    dir.create(path, showWarnings = FALSE, recursive = TRUE)
    
    for (i in partitioned_indices[[1]]) {
        file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/causal_1_h2g_", 
                           h2g, "/real_simulation_5opera/sample_", 
                           i, "_real_simulation_1_ncausal_5_trait.rds")
        file.copy(from = file_path, to = path, overwrite = TRUE)
    }
    
    for (i in partitioned_indices[[2]]) {
        file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/causal_2_h2g_", 
                           h2g, "/real_simulation_5opera/sample_", 
                           i, "_real_simulation_2_ncausal_5_trait.rds")
        file.copy(from = file_path, to = path, overwrite = TRUE)
    }
    
    for (i in partitioned_indices[[3]]) {
        file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/causal_3_h2g_", 
                           h2g, "/real_simulation_5opera/sample_", 
                           i, "_real_simulation_3_ncausal_5_trait.rds")
        file.copy(from = file_path, to = path, overwrite = TRUE)
    }
    
    for (i in partitioned_indices[[4]]) {
        file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/causal_4_h2g_", 
                           h2g, "/real_simulation_5opera/sample_", 
                           i, "_real_simulation_4_ncausal_5_trait.rds")
        file.copy(from = file_path, to = path, overwrite = TRUE)
    }
}

# -------------------------
# Part 3: 10 causal variants
# -------------------------

set.seed(100)
total_indices <- 500
proportions <- trait_10  # Proportions for 10 causal variant groups

# Calculate number of samples per group
group_sizes <- floor(total_indices * proportions)
group_sizes[1] <- group_sizes[1] + (total_indices - sum(group_sizes))

groups <- rep(1:5, times = group_sizes)
groups <- sample(groups)

partitioned_indices <- lapply(1:5, function(i) which(groups == i) - 1)

for (h2g in c(0.02, 0.03, 0.04, 0.05)) {
    path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/h2g_", h2g, "_mix/")
    dir.create(path, showWarnings = FALSE, recursive = TRUE)
    
    for (i in partitioned_indices[[1]]) {
        file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/causal_1_h2g_", 
                           h2g, "/real_simulation_10opera/sample_", 
                           i, "_real_simulation_1_ncausal_10_trait.rds")
        file.copy(from = file_path, to = path, overwrite = TRUE)
    }
    
    for (i in partitioned_indices[[2]]) {
        file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/causal_2_h2g_", 
                           h2g, "/real_simulation_10opera/sample_", 
                           i, "_real_simulation_2_ncausal_10_trait.rds")
        file.copy(from = file_path, to = path, overwrite = TRUE)
    }
    
    for (i in partitioned_indices[[3]]) {
        file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/causal_3_h2g_", 
                           h2g, "/real_simulation_10opera/sample_", 
                           i, "_real_simulation_3_ncausal_10_trait.rds")
        file.copy(from = file_path, to = path, overwrite = TRUE)
    }
    
    for (i in partitioned_indices[[4]]) {
        file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/causal_4_h2g_", 
                           h2g, "/real_simulation_10opera/sample_", 
                           i, "_real_simulation_4_ncausal_10_trait.rds")
        file.copy(from = file_path, to = path, overwrite = TRUE)
    }
    
    for (i in partitioned_indices[[5]]) {
        file_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/causal_5_h2g_", 
                           h2g, "/real_simulation_10opera/sample_", 
                           i, "_real_simulation_5_ncausal_10_trait.rds")
        file.copy(from = file_path, to = path, overwrite = TRUE)
    }
}

Step 3: Transform individual level simulation into sumstat#

# --- Load necessary libraries ---
library("MASS")
library("plink2R")
library("dplyr")
library("readr")
library("tidyverse")

# install simulation package
# devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
# BiocManager::install("StatFunGen/pecotmr")
library("pecotmr")

# --- Load helper functions ---
source("~/simxQTL/simulate_linreg.R")

# Function to calculate GWAS summary statistics for each SNP
calculate_sumstat = function(X, Y) {
    Beta = c()
    se = c()
    Freq = c()
    p = c()
    
    for (mm in 1:ncol(X)) {
        rr <- susieR::univariate_regression(X[, mm], Y)
        Beta[mm] = rr$betahat
        se[mm] = rr$sebetahat
        Freq[mm] = sum(X[, mm]) / (2 * nrow(X))
        p[mm] = 2 * (1 - pnorm(abs(rr$betahat / rr$sebetahat)))
    }
    
    # Assemble summary statistics table
    tb = tibble(
        SNP = colnames(X),
        Beta = Beta,
        se = se,
        Freq = Freq,
        p = p
    ) %>%
    separate(SNP, into = c("Chr", "Bp", "A2", "A1"), sep = "[:_]", remove = FALSE) %>%
    select(Chr, SNP, Bp, A1, A2, Freq, Beta, se, p) %>%
    mutate(
        Chr = str_replace(Chr, "chr", ""),
        Chr = as.numeric(Chr),
        Bp = as.numeric(Bp)
    )
    
    return(tb)
}

# ------------------------------
# Main script to process samples
# ------------------------------

cnt = 1
ntrait = TRAIT  # Number of traits (user-defined)
h2g = H2G       # Heritability value (user-defined)

# Define input path
path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_", ntrait, "opera/h2g_", h2g, "_mix")
files = list.files(path, full.names = TRUE, pattern = ".rds")

# Initialize data list
data <- rep(list(NULL), length(files))

# Loop over all input files
for (file in files) {
    rds = readRDS(file)
    X = rds$X
    Y = rds$Y
    trait = list()
    
    # Process first trait (apply special formatting)
    trait[[1]] = calculate_sumstat(X, Y[, 1, drop = TRUE]) %>%
                 rename(freq = Freq, b = Beta) %>%
                 mutate(N = 1160) %>%
                 select(SNP, A1, A2, freq, b, se, p, N)
    
    # Process remaining traits
    for (i in 2:ncol(Y)) {
        trait[[i]] = calculate_sumstat(X, Y[, i, drop = TRUE])
    }
    
    # Extract sample index from filename
    index = str_extract(file, "(?<=sample_)\\d+")
    
    # Prepare output data structure
    data = list()
    data[["trait"]] = trait
    data$index = index
    data$gwas_var_num = length(rds$variant[[1]])
    
    # Create output directory if it does not exist
    dir.create(
        paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_", ntrait, "opera/mix_sumstat/h2g_", h2g, "/real_simulation_", ntrait, "opera"),
        showWarnings = FALSE,
        recursive = TRUE
    )
    
    # Save processed summary statistics
    saveRDS(
        data,
        paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_", ntrait, "opera/mix_sumstat/h2g_", h2g, "/real_simulation_", ntrait, "opera/index_", index, ".rds")
    )
    
    cnt = cnt + 1
}

Example output#

result = readRDS("../simulation_data/simulation_opera/real_simulation_2opera/mix_sumstat/h2g_0.02/real_simulation_2opera/index_240.rds")
str(result)
head(result$trait[[1]])

List of 3
 $ trait       :List of 2
  ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	6791 obs. of  8 variables:
  .. ..$ SNP : chr [1:6791] "chr12:108093914_C_T" "chr12:108093950_A_G" "chr12:108094333_C_G" "chr12:108094913_A_G" ...
  .. ..$ A1  : chr [1:6791] "T" "G" "G" "G" ...
  .. ..$ A2  : chr [1:6791] "C" "A" "C" "A" ...
  .. ..$ freq: num [1:6791] 0.0924 0.3747 0.3504 0.0928 0.2207 ...
  .. ..$ b   : num [1:6791] -0.4608 0.1953 -0.0277 -0.4819 -0.443 ...
  .. ..$ se  : num [1:6791] 0.314 0.181 0.191 0.314 0.212 ...
  .. ..$ p   : num [1:6791] 0.1425 0.2811 0.885 0.1244 0.0369 ...
  .. ..$ N   : num [1:6791] 1160 1160 1160 1160 1160 1160 1160 1160 1160 1160 ...
  ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	6791 obs. of  9 variables:
  .. ..$ Chr : num [1:6791] 12 12 12 12 12 12 12 12 12 12 ...
  .. ..$ SNP : chr [1:6791] "chr12:108093914_C_T" "chr12:108093950_A_G" "chr12:108094333_C_G" "chr12:108094913_A_G" ...
  .. ..$ Bp  : num [1:6791] 1.08e+08 1.08e+08 1.08e+08 1.08e+08 1.08e+08 ...
  .. ..$ A1  : chr [1:6791] "T" "G" "G" "G" ...
  .. ..$ A2  : chr [1:6791] "C" "A" "C" "A" ...
  .. ..$ Freq: num [1:6791] 0.0924 0.3747 0.3504 0.0928 0.2207 ...
  .. ..$ Beta: num [1:6791] -0.1844 -0.1309 0.128 -0.1795 -0.0441 ...
  .. ..$ se  : num [1:6791] 0.203 0.117 0.123 0.203 0.137 ...
  .. ..$ p   : num [1:6791] 0.364 0.263 0.299 0.376 0.748 ...
 $ index       : chr "240"
 $ gwas_var_num: int 1

A tibble: 6 × 8
	SNP	A1	A2	freq	b	se	p	N
	<chr>	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	chr12:108093914_C_T	T	C	0.09236774	-0.46078434	0.3141903	0.14249112	1160
2	chr12:108093950_A_G	G	A	0.37467476	0.19533530	0.1812171	0.28107492	1160
3	chr12:108094333_C_G	G	C	0.35039029	-0.02765113	0.1911775	0.88499837	1160
4	chr12:108094913_A_G	G	A	0.09280139	-0.48190914	0.3136359	0.12440993	1160
5	chr12:108095069_C_T	T	C	0.22072853	-0.44301635	0.2122775	0.03689105	1160
6	chr12:108095158_C_T	T	C	0.25628794	0.19442679	0.2086272	0.35137065	1160

base_sh="base_script"
# Create the base_script file and write the bash code into it
cat << 'EOF' > ${base_sh}
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 78:00:00
#SBATCH --mem=30000
#SBATCH -J extract
#SBATCH -o /home/hs3393/cloud_colocalization/submission_script/sumstat_generate/log/extract.%j.out
#SBATCH -e /home/hs3393/cloud_colocalization/submission_script/sumstat_generate/log/extract.%j.err

source ~/mamba_activate.sh
module load Singularity

Rscript /home/hs3393/cb_simulation/simulation_data/simulation_opera/opera_code/sumstat_generate/trait_TRAIT_h2g_H2G.R
EOF

base_R="base_R"

for trait in 2 5 10; do
    for h2g in 0.02 0.03 0.04 0.05; do
        output_R="trait_${trait}_h2g_${h2g}.R"
        output_sh="trait_${trait}_h2g_${h2g}.sh"
        cat ${base_R}| sed "s|TRAIT|${trait}|g" | sed "s|H2G|${h2g}|g"   > ${output_R}
        cat ${base_sh}| sed "s|TRAIT|${trait}|g" | sed "s|H2G|${h2g}|g"  > ${output_sh}
        sbatch ${output_sh}
    done
done

Step 4: Transform sumstat into .esd format and make the .flist summary file#

Also, GWAS trait is saved into .ma format.

# Prepare input files for OPERA simulations (mixed sumstats)

# Load libraries
library(tidyverse)

# ----------------------
# Initialize parameters
# ----------------------
cnt = 1
ntrait = TRAIT  # Number of traits (user-defined)
h2g = H2G       # Heritability (user-defined)

# Define paths
cwd = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_", 
             ntrait, "opera/mix_sumstat/h2g_", h2g, "/real_simulation_", ntrait, "opera/")

# Create output directory
dir.create(paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_", 
                  ntrait, "opera/opera_input_mix/", "h2g_", h2g, "/real_simulation_", ntrait, "opera/"), 
           showWarnings = FALSE, recursive = TRUE)

output_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_", 
                     ntrait, "opera/opera_input_mix/", "h2g_", h2g, "/real_simulation_", ntrait, "opera/")

# ----------------------
# Initialize storage
# ----------------------
gwas = list()
path = vector("list", ntrait)
chr = c()
distance = c()
probe_id = c()

# ----------------------
# Process each mixed sumstat file
# ----------------------
cnt = 1
for (file in list.files(cwd, pattern = ".rds")) {
    file_path = paste0(cwd, file)
    rds = readRDS(file_path)
    
    # Extract TAD (region) index from file name
    tad_index = as.numeric(sub(".*index_([0-9]+).*", "\\1", file_path))
    
    # Save GWAS for the first trait
    gwas[[cnt]] = rds$trait[[1]]
    
    # Record metadata (chromosome and region midpoint)
    distance[cnt] = floor((max(as.numeric(rds$trait[[2]]$Bp)) + min(as.numeric(rds$trait[[2]]$Bp))) / 2)
    chr[cnt] = unique(rds$trait[[2]]$Chr)
    probe_id[cnt] = paste0("Region_", tad_index)
    
    # Save other traits as separate ESD files
    for (i in 2:ntrait) {
        path[[i-1]][cnt] = paste0(output_path, "/region_", tad_index, "_trait_", i, ".esd")
        rds$trait[[i]] %>% write_tsv(path[[i-1]][cnt])
    }
    
    cnt = cnt + 1
}

# ----------------------
# Write .flist files for traits 2 to ntrait
# ----------------------
for (i in 2:ntrait) {
    tibble(
        Chr = chr, 
        ProbeID = paste0(probe_id, "_", i), 
        GeneticDistance = 0,   # Set genetic distance to 0 (not used here)
        ProbeBp = distance, 
        Gene = paste0(probe_id, "_", i),  
        Orientation = NA, 
        PathOfEsd = path[[i-1]]
    ) %>%  
    write_tsv(paste0(output_path, "/trait_", i, ".flist"))
}

# ----------------------
# Write .ma file for trait 1 and SNP list
# ----------------------

# Combine and save all trait 1 GWAS summary stats
bind_rows(gwas) %>% 
    write_tsv(paste0(output_path, "/trait_1.ma"))

# Save list of SNPs (for reference)
bind_rows(gwas) %>% 
    select(SNP) %>% 
    write_tsv(paste0(output_path, "/remain_snp.tsv"), col_names = FALSE)

Example output#

library(tidyverse)
result = read_delim("../simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera/region_274_trait_2.esd")
head(result)

Rows: 5504 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): SNP, A1, A2
dbl (6): Chr, Bp, Freq, Beta, se, p

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

library(tidyverse)
result = read_delim("../simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera/trait_2.flist")
head(result)

Rows: 495 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): ProbeID, Gene, PathOfEsd
dbl (3): Chr, GeneticDistance, ProbeBp
lgl (1): Orientation

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

A tibble: 6 × 7
Chr	ProbeID	GeneticDistance	ProbeBp	Gene	Orientation	PathOfEsd
<dbl>	<chr>	<dbl>	<dbl>	<chr>	<lgl>	<chr>
8	Region_0_2	0	17497391	Region_0_2	NA	/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera//region_0_trait_2.esd
7	Region_1_2	0	107320728	Region_1_2	NA	/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera//region_1_trait_2.esd
5	Region_10_2	0	113896770	Region_10_2	NA	/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera//region_10_trait_2.esd
20	Region_100_2	0	61926166	Region_100_2	NA	/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera//region_100_trait_2.esd
20	Region_101_2	0	35475951	Region_101_2	NA	/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera//region_101_trait_2.esd
1	Region_102_2	0	147899365	Region_102_2	NA	/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera//region_102_trait_2.esd

base_sh="base_script"
# Create the base_script file and write the bash code into it
cat << 'EOF' > ${base_sh}
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 78:00:00
#SBATCH --mem=50000
#SBATCH -J extract
#SBATCH -o /home/hs3393/cloud_colocalization/submission_script/opera_sumstat_extract/log/config.%j.out
#SBATCH -e /home/hs3393/cloud_colocalization/submission_script/opera_sumstat_extract/log/config.%j.err

source ~/mamba_activate.sh
module load Singularity

Rscript /home/hs3393/cb_simulation/simulation_data/simulation_opera/opera_code/opera_sumstat_extract/trait_TRAIT_h2g_H2G.R
EOF

base_R="base_R"

for trait in 2 5 10; do
    for h2g in 0.02 0.03 0.04 0.05; do
        output_R="trait_${trait}_h2g_${h2g}.R"
        output_sh="trait_${trait}_h2g_${h2g}.sh"
        cat ${base_R}| sed "s|TRAIT|${trait}|g" | sed "s|H2G|${h2g}|g"   > ${output_R}
        cat ${base_sh}| sed "s|TRAIT|${trait}|g" | sed "s|H2G|${h2g}|g"  > ${output_sh}
        sbatch ${output_sh}
    done
done

Step 5: Use SMR to convert .esd files to .bsed files as OPERA input#

job_name="besd"
WD="/home/hs3393/cb_simulation/simulation_data/simulation_opera/opera_code/make_besd"


mkdir -p ${WD}/code
mkdir -p ${WD}/log

traits=("2" "5" "10")
h2gs=("0.02" "0.03" "0.04" "0.05")

cd ${WD}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 10:00:00
#SBATCH --mem=30000
#SBATCH -J JOBNAME
#SBATCH -o PWD/log/JOBNAME."%j".out
#SBATCH -e PWD/log/JOBNAME."%j".err


source ~/mamba_activate.sh
module load Singularity

cd /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_TRAITopera/opera_input_mix/h2g_H2G/real_simulation_TRAITopera

for trait_num in {2..10}; do
    ~/smr_heidi/SMR/smr-1.3.1-linux-x86_64/smr-1.3.1 \
    --eqtl-flist ./trait_${trait_num}.flist \
    --make-besd \
    --cis-wind 3000 \ # extended since our cis region can be larger than 2Mb
    --make-besd-dense \ # keep all SNP info
    --out ./trait_${trait_num}
done

EOF
            
base_script="base_script"
for trait in "${traits[@]}"; do
      for h2g in "${h2gs[@]}"; do  
            output_script="script_trait_${trait}_h2g_${h2g}.sh"
            cat ${base_script} | sed "s|TRAIT|${trait}|g" |  sed "s|H2G|${h2g}|g" | sed "s|PWD|${WD}|g" | sed "s|JOBNAME|${job_name}|g"  > ${output_script}
            sb ${output_script}
            # cat script_variant_${variant}.sh
    done
done

Step 6: Exposure (QTL) meta file summary#

# Create a two-line table with strings
library(tidyverse)
for(TRAIT in c(2,5,10)){
        for(H2G in c(0.02, 0.03, 0.04, 0.05)){
        path = c()
        ntrait = TRAIT
        h2g = H2G
        cnt = 1
        output_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_", ntrait, "opera/opera_input_mix/h2g_", h2g, "/real_simulation_", ntrait, "opera/")
        for(i in c(2:ntrait)){
            path[cnt] = paste0(output_path, "trait_", i)
            cnt = cnt + 1
        }
        data <- data.frame(Path = path)

        # Print the data
        print(data)

        # Save the data to a CSV file
        write_csv(data, paste0(output_path, "/qtl_list"), col_names = FALSE)
        }
}

Example output#

cat /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.02/real_simulation_5opera/qtl_list

/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.02/real_simulation_5opera/trait_2
/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.02/real_simulation_5opera/trait_3
/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.02/real_simulation_5opera/trait_4
/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.02/real_simulation_5opera/trait_5

Step 7: Subset the genome-wide SNP PLINK binary file#

Only include the variants in our selected non-overlapping regions for stage 1 prior estimation.

#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 36:00:00
#SBATCH --mem=60000
#SBATCH -J LDextract
#SBATCH -o LDextract.%j.out
#SBATCH -e LDextract.%j.err

source ~/mamba_activate.sh
module load Singularity
cd /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_TRAITopera/opera_input_mix/h2g_H2G/real_simulation_TRAITopera

plink2 --bfile /mnt/vast/hpc/csg/molecular_phenotype_calling/genotype_arch/ROSMAP_NIA_WGS.leftnorm.filtered --extract remain_snp.tsv --make-bed --out remain_snp_LD

base_sh="base_sh"
for trait in 2 5 10; do
        for h2g in 0.02 0.03 0.04 0.05; do
            output_sh="trait_${trait}_h2g_${h2g}.sh"
            cat ${base_sh}| sed "s|TRAIT|${trait}|g"  | sed "s|H2G|${h2g}|g"  > ${output_sh}
            sbatch ${output_sh}
    done
done

Step 8: Save genotype (LD) metadata in remain_snp_LD file#

# LD
# Create a two-line table with strings
library(tidyverse)
for(ntrait in c(2,5,10)){
        for(h2g in c(0.02, 0.03, 0.04, 0.05)){

            cnt = 1
            output_path = paste0("/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_", ntrait, "opera/opera_input_mix/h2g_", h2g, "/real_simulation_", ntrait, "opera/")
            data <- paste0(output_path, "/remain_snp_LD")

            # Print the data
            print(data)

            # Save the data to a CSV file
            write_lines(data, paste0(output_path, "/genotype_data"))
    }
}

[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.02/real_simulation_2opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.03/real_simulation_2opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.04/real_simulation_2opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_2opera/opera_input_mix/h2g_0.05/real_simulation_2opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.02/real_simulation_5opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.03/real_simulation_5opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.04/real_simulation_5opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.05/real_simulation_5opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/opera_input_mix/h2g_0.02/real_simulation_10opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/opera_input_mix/h2g_0.03/real_simulation_10opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/opera_input_mix/h2g_0.04/real_simulation_10opera//remain_snp_LD"
[1] "/home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_10opera/opera_input_mix/h2g_0.05/real_simulation_10opera//remain_snp_LD"

Step 9: Run OPERA#

Stage 1 OPERA (pi estimation)#

#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 36:00:00
#SBATCH --mem=60000
#SBATCH -J opera1
#SBATCH -o opera1.%j.out
#SBATCH -e opera1.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_TRAITopera/opera_input_mix/h2g_H2G/real_simulation_TRAITopera
/home/hs3393/opera_Linux/opera_Linux --besd-flist qtl_list --gwas-summary ./trait_1.ma \
--mbfile ./genotype_data --pi-wind 150 \ # extended since our cis region can be larger than 2Mb
--estimate-pi --out pi_estimation

base_sh="base_sh"
for trait in 2 5 10; do
        for h2g in 0.02 0.03 0.04 0.05; do
            output_sh="trait_${trait}_h2g_${h2g}.sh"
            cat ${base_sh}| sed "s|TRAIT|${trait}|g"  | sed "s|H2G|${h2g}|g"  > ${output_sh}
            sbatch ${output_sh}
    done
done

Example output#

cat ../simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.05/real_simulation_5opera/pi_estimation.pi

Posteriors	Pi1(0:0:0:0)	Pi2(0:0:0:1)	Pi3(0:0:1:0)	Pi4(0:0:1:1)	Pi5(0:1:0:0)	Pi6(0:1:0:1)	Pi7(0:1:1:0)	Pi8(0:1:1:1)	Pi9(1:0:0:0)	Pi10(1:0:0:1)	Pi11(1:0:1:0)	Pi12(1:0:1:1)	Pi13(1:1:0:0)	Pi14(1:1:0:1)	Pi15(1:1:1:0)	Pi16(1:1:1:1)
Mean	0.275201	0.00228085	0.00233336	0.00224573	0.00231277	0.00220141	0.00223646	0.00227284	0.00246007	0.00209981	0.00291004	0.00263845	0.00362954	0.00242097	0.00327664	0.68948
SD	0.0643589	0.00712061	0.00685669	0.00697502	0.00686388	0.00684095	0.00673787	0.00699348	0.00695986	0.00649404	0.00799288	0.00752632	0.00940491	0.00681977	0.00880387	0.0667714

Stage 2 OPERA#

#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 36:00:00
#SBATCH --mem=80000
#SBATCH -J opera2
#SBATCH -o opera2.%j.out
#SBATCH -e opera2.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd /home/hs3393/cb_simulation/simulation_data/simulation_opera/real_simulation_TRAITopera/opera_input_mix/h2g_H2G/real_simulation_TRAITopera

/home/hs3393/opera_Linux/opera_Linux --besd-flist qtl_list --gwas-summary ./trait_1.ma \
    --bfile ./remain_snp_LD \
    --prior-pi-file ./pi_estimation.pi \ 
    --prior-var-file ./pi_estimation.var \
    --out ./opera_res --outcome-wind 1500 # extended since our cis region can be larger than 2Mb

base_sh="base_sh"
for trait in 2 5 10; do
        for h2g in 0.02 0.03 0.04 0.05; do
            output_sh="trait_${trait}_h2g_${h2g}.sh"
            cat ${base_sh}| sed "s|TRAIT|${trait}|g"  | sed "s|H2G|${h2g}|g"  > ${output_sh}
            sbatch ${output_sh}
    done
done

Example output#

head ../simulation_data/simulation_opera/real_simulation_5opera/opera_input_mix/h2g_0.05/real_simulation_5opera/opera_res_4_expos_assoc.ppa

Chr	Expo1_ID	Expo1_bp	Expo2_ID	Expo2_bp	Expo3_ID	Expo3_bp	Expo4_ID	Expo4_bp	PPA(1,2,3,4)	p_HEIDI(1,2,3,4)
Region_241_2	37025329	Region_241_3	37025329	Region_241_4	37025329	Region_241_5	37025329	0.999967	5.250539e-01
Region_107_2	11927391	Region_107_3	11927391	Region_107_4	11927391	Region_107_5	11927391	0.999991	8.772566e-01
Region_108_2	18057498	Region_108_3	18057498	Region_108_4	18057498	Region_108_5	18057498	1	6.443910e-01
Region_43_2	54814185	Region_43_3	54814185	Region_43_4	54814185	Region_43_5	54814185	0.999916	5.845443e-01
Region_449_2	30572631	Region_449_3	30572631	Region_449_4	30572631	Region_449_5	30572631	0.999933	3.243127e-01
Region_41_2	61335205	Region_41_3	61335205	Region_41_4	61335205	Region_41_5	61335205	0.999989	8.673183e-01
Region_124_2	84179558	Region_124_3	84179558	Region_124_4	84179558	Region_124_5	84179558	0.999975	4.870884e-01
Region_73_2	59140327	Region_73_3	59140327	Region_73_4	59140327	Region_73_5	59140327	0.999987	9.353629e-01
Region_246_2	108165863	Region_246_3	108165863	Region_246_4	108165863	Region_246_5	108165863	0.999999	3.673531e-01

Run OPERA

Contents

Run OPERA#

Goal#

Input#

Step 1: Choose non-overlapping regions from 1,287 regions used in other simulations#

Step 2: Based on causal variant distribution, mix the result by proportion#

Step 3: Transform individual level simulation into sumstat#

Example output#

Step 4: Transform sumstat into .esd format and make the .flist summary file#

Example output#

Step 5: Use SMR to convert .esd files to .bsed files as OPERA input#

Step 6: Exposure (QTL) meta file summary#

Example output#

Step 7: Subset the genome-wide SNP PLINK binary file#

Step 8: Save genotype (LD) metadata in remain_snp_LD file#

Step 9: Run OPERA#

Stage 1 OPERA (pi estimation)#

Example output#

Stage 2 OPERA#

Example output#