Other Colocalization Methods#
Goal#
This notebook demonstrates how to execute Hyprcoloc, MOLOC or COLOC(V5) (SuSiE-Coloc) on simulated data and analyze its performance through power and false discovery rate (FDR) calculations.
Note: Because different methods have different output, we summarized four elements to calculate FDR and power - coloc_trait
, coloc_set
, true_trait
, and true_variant
.
Input#
simufile
: Individual level data X and Y introduced in Phenotype Simulation.
Output#
Hyprcoloc / MOLOC / COLOC (V5) original result, along with some summarized elements.
coloc_trait
: traits identified by ColocBoost as having shared genetic effectscoloc_set
: sets of variants identified as causaltrue_trait
: traits with true causal relationships (ground truth)true_variant
: variants that are truly causal (ground truth)
Example output:
result = readRDS("../simulation_result/hyprcoloc/hyp_real_simulation_10trait/result/sample_9_real_simulation_3_ncausal_10_trait_ntr_10_hypercoloc.rds")
result$results
iteration | traits | posterior_prob | regional_prob | candidate_snp | posterior_explained_by_snp | dropped_trait | |
---|---|---|---|---|---|---|---|
<dbl> | <chr> | <dbl> | <dbl> | <chr> | <dbl> | <chr> | |
1 | 1 | 5, 9 | 0.8754 | 1 | snp917 | 0.3881 | NA |
2 | 2 | 3, 8 | 0.7915 | 1 | snp2568 | 0.0418 | NA |
3 | 3 | 2, 6 | 0.9426 | 1 | snp2904 | 0.1141 | NA |
print("=====Prediction=====")
# colocalization result - CoS
result$coloc_set
# colocalization result - colocalizing trait
result$coloc_trait
print("=====Truth=====")
# true colocalizing CoS
result$true_variant
# true colocalizing trait
result$true_trait
[1] "=====Prediction====="
-
- 917
- 911
- 931
- 934
- 916
- 913
-
- 2568
- 2479
- 2605
- 2627
- 2535
- 2483
- 2485
- 2487
- 2495
- 2496
- 2497
- 2498
- 2504
- 2507
- 2510
- 2512
- 2513
- 2514
- 2515
- 2518
- 2526
- 2528
- 2529
- 2530
- 2537
- 2544
- 2545
- 2547
- 2548
- 2550
- 2552
- 2554
- 2555
- 2558
- 2560
- 2562
- 2563
- 2582
- 2590
- 2591
- 2592
- 2600
- 2603
- 2604
- 2610
- 2617
- 2618
- 2620
- 2621
- 2625
- 2630
- 2631
- 2632
- 2634
- 2635
- 2636
- 2639
- 2642
- 2644
- 2646
- 2647
- 2648
- 2650
- 2652
- 2655
- 2656
- 2657
- 2658
- 2659
- 2665
- 2667
- 2668
- 2673
- 2594
- 2539
- 2614
- 2564
- 2615
- 2649
- 2531
- 2533
- 2493
- 2576
-
- 2904
- 2892
- 2900
- 2908
- 2910
- 2911
- 2920
- 2874
- 2887
- 2859
- 2888
- 2894
-
- 5
- 9
-
- 3
- 8
-
- 2
- 6
[1] "=====Truth====="
- 934
- 2485
- 2908
-
- 5
- 6
- 9
-
- 3
- 8
-
- 2
- 6
Run HyprColoc without LD#
[hyprcoloc_set]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "50h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: trait = 10
parameter: container = ""
parameter: setting="normal"
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntr_{trait}_hyprcoloc.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
# ======= Load necessary libraries =======
library("hyprcoloc")
library("susieR")
# install simulation package
# devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
# BiocManager::install("StatFunGen/pecotmr")
library("pecotmr")
library("simxQTL")
# ======= Define run_hypercoloc function =======
run_hypercoloc = function(file, actual_pheno_number, setting = "normal") {
library(hyprcoloc)
# --- Load simulation file ---
rds = readRDS(file)
# --- Prepare input data: X (genotype), Y (phenotypes), variant (causal SNPs) ---
X = list()
Y = list()
variant = list()
for (i in 1:actual_pheno_number) {
X[[i]] = rds$X[, , drop = FALSE]
Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
variant[[i]] = rds$variant[[i]]
}
# --- Estimate marginal betas and SEs using univariate regression ---
betas = sebetas = matrix(NA, nrow = ncol(X[[1]]), ncol = actual_pheno_number)
for (i in 1:actual_pheno_number) {
for (mm in 1:ncol(X[[1]])) {
rr <- susieR::univariate_regression(scale(X[[i]][, mm]), scale(Y[[i]]))
betas[mm, i] = rr$betahat
sebetas[mm, i] = rr$sebetahat
}
}
# --- Set trait and SNP labels ---
traits <- c(1:actual_pheno_number)
rsid <- paste0("snp", c(1:ncol(X[[1]])))
colnames(betas) <- colnames(sebetas) <- traits
rownames(betas) <- rownames(sebetas) <- rsid
# --- Run HyPrColoc ---
t1 <- Sys.time()
res_hyprcoloc <- hyprcoloc(betas, sebetas, trait.names = traits, snp.id = rsid, snpscores = TRUE)
t2 <- Sys.time()
# --- Process HyPrColoc results ---
tb = res_hyprcoloc$results
index = which(tb$traits != "None")
tb = tb[index, , drop = FALSE]
res_hyprcoloc$results = tb
num = nrow(res_hyprcoloc$results)
# --- Handle "normal" setting - output the credible set ---
if (setting == "normal") {
if (num == 0) {
causal_snp = NULL
causal_trait = NULL
} else {
hyprcoloc_csets <- cred.sets(res_hyprcoloc, value = 0.95)
causal_snp = list()
causal_trait = list()
for (i in 1:num) {
causal_snp[[i]] = as.numeric(gsub("[^0-9]", "", names(hyprcoloc_csets[[i]])))
causal_trait[[i]] = as.numeric(unlist(strsplit(res_hyprcoloc$results$traits[i], ", ")))
}
}
### Find true coloc variants and traits ###
all_var = unlist(variant)
coloc_var_index = as.numeric(names(which(table(all_var) >= 2))) # Variants appearing in ≥2 traits
true_coloc_var = coloc_var_index
true_coloc_trait = list()
for (variant_index in 1:length(coloc_var_index)) {
temp_vec = c()
for (i in 1:length(variant)) {
if (coloc_var_index[variant_index] %in% variant[[i]]) {
temp_vec = c(temp_vec, i)
}
}
true_coloc_trait[[variant_index]] = temp_vec
}
# --- Create report ---
hypercoloc_report = res_hyprcoloc
hypercoloc_report$true_variant = true_coloc_var
hypercoloc_report$true_trait = true_coloc_trait
hypercoloc_report$coloc_set = causal_snp
hypercoloc_report$coloc_trait = causal_trait
hypercoloc_report$file = file
hypercoloc_report$time = (t2 - t1)
# --- Handle "alternative" setting (no trait recording): output the SNP hit (one variant) ---
} else {
if (num == 0) {
causal_snp = NULL
} else {
hyprcoloc_csets <- cred.sets(res_hyprcoloc, value = 0.95)
causal_snp = list()
for (i in 1:num) {
causal_snp[[i]] = as.numeric(gsub("[^0-9]", "", names(hyprcoloc_csets[[i]])))
}
}
hypercoloc_report = res_hyprcoloc
hypercoloc_report$coloc_set = causal_snp
hypercoloc_report$file = file
hypercoloc_report$time = (t2 - t1)
}
# --- Purity filtering: keep only sets with LD-purity > 0.5 ---
purity_check = lapply(hypercoloc_report$coloc_set, function(x) (get_purity(x, X = X[[1]]))[1])
purity_index = which(unlist(purity_check) > 0.5)
if (length(purity_index) > 0) {
hypercoloc_report$coloc_set = hypercoloc_report$coloc_set[purity_index]
hypercoloc_report$coloc_trait = hypercoloc_report$coloc_trait[purity_index]
} else {
print("All filtered out")
hypercoloc_report$coloc_set = NULL
hypercoloc_report$coloc_trait = NULL
}
hypercoloc_report$purity = purity_check
# --- Return result ---
return(hypercoloc_report)
}
# ======= Execute and Save =======
file = ${_input:ar,}
hypercoloc_result = run_hypercoloc(file, ${trait}, setting = "${setting}")
saveRDS(hypercoloc_result, ${_output:r})
Run Hyprcoloc with LD info#
[hyprcoloc_LD_set]
# ======= Define parameters =======
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "50h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: trait = 10
parameter: container = ""
parameter: setting = "normal"
# ======= Workflow input/output =======
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntr_{trait}_hypercoloc_LD.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
library("hyprcoloc")
library("susieR")
# install simulation package
# devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
# BiocManager::install("StatFunGen/pecotmr")
library("pecotmr")
library("simxQTL")
# ======= Define run_hypercoloc function =======
run_hypercoloc = function(file, actual_pheno_number, setting = "normal") {
library(hyprcoloc)
# --- Load simulation data ---
rds = readRDS(file)
# --- Prepare input lists ---
X = list()
Y = list()
variant = list()
for (i in 1:actual_pheno_number) {
X[[i]] = rds$X[, , drop = FALSE]
Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
variant[[i]] = rds$variant[[i]]
}
# --- Estimate marginal betas and SEs ---
betas = sebetas = matrix(NA, nrow = ncol(X[[1]]), ncol = actual_pheno_number)
for (i in 1:actual_pheno_number) {
for (mm in 1:ncol(X[[1]])) {
rr <- susieR::univariate_regression(scale(X[[i]][, mm]), scale(Y[[i]]))
betas[mm, i] = rr$betahat
sebetas[mm, i] = rr$sebetahat
}
}
# --- Label traits and SNPs ---
traits <- c(1:actual_pheno_number)
rsid <- paste0("snp", c(1:ncol(X[[1]])))
colnames(betas) <- colnames(sebetas) <- traits
rownames(betas) <- rownames(sebetas) <- rsid
# --- Calculate LD matrix from X (correlation) ---
LD = get_correlation(X[[1]])
colnames(LD) <- rownames(LD) <- colnames(X[[1]])
# --- Run HyPrColoc with LD matrix ---
t1 <- Sys.time()
res_hyprcoloc <- hyprcoloc(
betas,
sebetas,
trait.names = traits,
snp.id = rsid,
snpscores = TRUE,
ld.matrix = LD # <<=== NEW compared to previous version
)
t2 <- Sys.time()
# --- Process HyPrColoc results ---
tb = res_hyprcoloc$results
index = which(tb$traits != "None")
tb = tb[index, , drop = FALSE]
res_hyprcoloc$results = tb
num = nrow(res_hyprcoloc$results)
# --- Handle "normal" setting ---
if (setting == "normal") {
if (num == 0) {
causal_snp = NULL
causal_trait = NULL
} else {
hyprcoloc_csets <- cred.sets(res_hyprcoloc, value = 0.95)
causal_snp = list()
causal_trait = list()
for (i in 1:num) {
causal_snp[[i]] = as.numeric(gsub("[^0-9]", "", names(hyprcoloc_csets[[i]])))
causal_trait[[i]] = as.numeric(unlist(strsplit(res_hyprcoloc$results$traits[i], ", ")))
}
}
# --- Identify true coloc variants and traits ---
all_var = unlist(variant)
coloc_var_index = as.numeric(names(which(table(all_var) >= 2)))
true_coloc_var = coloc_var_index
true_coloc_trait = list()
for (variant_index in 1:length(coloc_var_index)) {
temp_vec = c()
for (i in 1:length(variant)) {
if (coloc_var_index[variant_index] %in% variant[[i]]) {
temp_vec = c(temp_vec, i)
}
}
true_coloc_trait[[variant_index]] = temp_vec
}
# --- Save full report ---
hypercoloc_report = res_hyprcoloc
hypercoloc_report$true_variant = true_coloc_var
hypercoloc_report$true_trait = true_coloc_trait
hypercoloc_report$coloc_set = causal_snp
hypercoloc_report$coloc_trait = causal_trait
hypercoloc_report$file = file
hypercoloc_report$time = (t2 - t1)
# --- Handle alternative setting ---
} else {
if (num == 0) {
causal_snp = NULL
} else {
hyprcoloc_csets <- cred.sets(res_hyprcoloc, value = 0.95)
causal_snp = list()
for (i in 1:num) {
causal_snp[[i]] = as.numeric(gsub("[^0-9]", "", names(hyprcoloc_csets[[i]])))
}
}
hypercoloc_report = res_hyprcoloc
hypercoloc_report$coloc_set = causal_snp
hypercoloc_report$file = file
hypercoloc_report$time = (t2 - t1)
}
# --- Purity filtering (keep coloc sets with LD purity > 0.5) ---
purity_check = lapply(hypercoloc_report$coloc_set, function(x) (get_purity(x, X = X[[1]]))[1])
purity_index = which(unlist(purity_check) > 0.5)
if (length(purity_index) > 0) {
hypercoloc_report$coloc_set = hypercoloc_report$coloc_set[purity_index]
hypercoloc_report$coloc_trait = hypercoloc_report$coloc_trait[purity_index]
} else {
print("All filtered out")
hypercoloc_report$coloc_set = NULL
hypercoloc_report$coloc_trait = NULL
}
hypercoloc_report$purity = purity_check
# --- Return result ---
return(hypercoloc_report)
}
# ======= Execute and Save =======
file = ${_input:ar,}
hypercoloc_result = run_hypercoloc(file, ${trait}, setting = "${setting}")
saveRDS(hypercoloc_result, ${_output:r})
Run MOLOC#
[moloc_set]
# ======= Define parameters =======
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "50h"
parameter: mem = "40G"
parameter: numThreads = 1
parameter: trait = 50
parameter: container = ""
# ======= Workflow input/output =======
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntr_{trait}_moloc_set.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
library("moloc")
library("tidyverse")
library("susieR")
# install simulation package
# devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
# BiocManager::install("StatFunGen/pecotmr")
library("pecotmr")
library("simxQTL")
# ======= Define run_moloc function =======
run_moloc <- function(file, actual_pheno_number, setting = "normal") {
# --- Load simulation RDS file ---
rds = readRDS(file)
# --- Prepare X (genotypes), Y (phenotypes), variant (causal SNPs) ---
X = list()
Y = list()
variant = list()
for (i in 1:actual_pheno_number) {
X[[i]] = rds$X[, , drop = FALSE]
Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
variant[[i]] = rds$variant[[i]]
}
# --- Build input summary statistics tables for moloc ---
tb <- list()
cnt = 1
for (i in 1:actual_pheno_number) {
betas = c()
sebetas = c()
MAF = c()
for (mm in 1:ncol(X[[i]])) {
rr <- susieR::univariate_regression(X[[i]][, mm], Y[[i]])
betas[mm] = rr$betahat
sebetas[mm] = rr$sebetahat
# Minor Allele Frequency (MAF) calculation
MAF[mm] = min(sum(X[[i]][, mm]) / (2 * nrow(X[[i]])), 1 - sum(X[[i]][, mm]) / (2 * nrow(X[[i]])))
}
tb[[cnt]] = tibble(
SNP = 1:ncol(X[[i]]),
BETA = betas,
SE = sebetas,
N = nrow(X[[i]]),
MAF = MAF
)
cnt = cnt + 1
}
# --- Run moloc on the prepared input ---
options(scipen = 1, digits = 3)
t1 <- Sys.time()
moloc <- moloc_test(listData = tb, save.SNP.info = TRUE)
t2 <- Sys.time()
# --- Identify true coloc variants across traits ---
all_var = unlist(variant)
coloc_var_index = as.numeric(names(which(table(all_var) >= 2)))
true_coloc_var = coloc_var_index
true_coloc_trait = list()
for (variant_index in 1:length(coloc_var_index)) {
temp_vec = c()
for (i in 1:length(variant)) {
if (coloc_var_index[variant_index] %in% variant[[i]]) {
temp_vec = c(temp_vec, i)
}
}
true_coloc_trait[[variant_index]] = temp_vec
}
# --- Extract colocalization result from moloc output ---
index = which.max(moloc$priors_lkl_ppa$PPA)
hypothesis_name = rownames(moloc$priors_lkl_ppa)[index]
coloc_traits = unlist(strsplit(hypothesis_name, ","))
# Find the hypothesis with maximum number of traits coloc'd
trait_num = max(nchar(coloc_traits))
coloc_trait = coloc_traits[which.max(nchar(coloc_traits))]
# Decode coloc trait indices (letters -> numbers)
coloc_trait_index = c()
for (item in unlist(strsplit(coloc_trait, NULL))) {
temp_index = as.integer(charToRaw(item)) - as.integer(charToRaw("a")) + 1
coloc_trait_index = c(coloc_trait_index, temp_index)
}
# --- Find credible SNP set covering 95% probability ---
index2 = which(rownames(moloc$best_snp[[1]]) == coloc_trait)
colname = colnames(moloc$best_snp[[2]])[index2]
row_index <- moloc$best_snp[[2]] %>%
as_tibble() %>%
arrange(desc(!!sym(colname))) %>%
mutate(row_number = row_number()) %>%
mutate(cumulative_sum = cumsum(!!sym(colname))) %>%
filter(cumulative_sum >= 0.95) %>%
pull(row_number) %>%
min()
# Identify top SNPs up to 95% credible set
snps = moloc$best_snp[[2]] %>%
as_tibble() %>%
mutate(row_number = row_number()) %>%
arrange(desc(!!sym(colname))) %>%
pull(row_number)
snps = snps[1:row_index]
# --- Assemble moloc report ---
moloc_report = moloc
moloc_report$file = file
moloc_report$true_variant = true_coloc_var
moloc_report$true_trait = true_coloc_trait
if (length(coloc_trait_index) >= 2) {
moloc_report$coloc_set = list(snps)
moloc_report$coloc_trait = list(coloc_trait_index)
} else {
# If only one trait coloc, treat as fine-mapping only (not true coloc)
moloc_report$coloc_set = NULL
moloc_report$coloc_trait = NULL
}
# --- Purity filtering (keep coloc sets with LD-purity > 0.5) ---
purity_check = lapply(moloc_report$coloc_set, function(x) (get_purity(x, X = X[[1]]))[1])
purity_index = which(unlist(purity_check) > 0.5)
if (length(purity_index) > 0) {
moloc_report$coloc_set = moloc_report$coloc_set[purity_index]
moloc_report$coloc_trait = moloc_report$coloc_trait[purity_index]
} else {
print("All filtered out")
moloc_report$coloc_set = NULL
moloc_report$coloc_trait = NULL
}
moloc_report$time = (t2 - t1)
moloc_report$purity = purity_check
return(moloc_report)
}
# ======= Execute and Save =======
files = c(${_input:ar,})
moloc_result = run_moloc(files, ${trait})
saveRDS(moloc_result, ${_output:r})
Run COLOC V5 (SuSiE coloc)#
[susie_coloc_set]
# ======= Define parameters =======
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 300
parameter: walltime = "50h"
parameter: mem = "40G"
parameter: numThreads = 1
parameter: trait = 2
parameter: container = ""
# ======= Workflow input/output =======
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntr_{trait}_susie_coloc_result.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
library("susieR")
library("coloc")
library("tidyverse")
# install simulation package
# devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
# BiocManager::install("StatFunGen/pecotmr")
library("pecotmr")
library("simxQTL")
run_susie_coloc <- function(file, actual_pheno_number = 2) {
# --- Load simulation file ---
rds = readRDS(file)
# --- Extract X, Y, variant for each trait ---
X = list()
Y = list()
variant = list()
for (i in 1:actual_pheno_number) {
X[[i]] = rds$X[, , drop = FALSE]
Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
variant[[i]] = rds$variant[[i]]
}
# --- Prepare for SuSiE fine-mapping (assume two traits) ---
Xe = X[[1]] # First trait
Xp = X[[2]] # Second trait
Ye = Y[[1]]
Yp = Y[[2]]
# --- Estimate marginal betas and varbetas ---
beta_p = varbeta_p = rep(0, ncol(Xp))
beta_e = varbeta_e = rep(0, ncol(Xe))
for (mm in 1:ncol(Xe)) {
rr <- susieR::univariate_regression(Yp, Xp[, mm])
beta_p[mm] = rr$betahat
varbeta_p[mm] = rr$sebetahat^2
rr <- susieR::univariate_regression(Ye, Xe[, mm])
beta_e[mm] = rr$betahat
varbeta_e[mm] = rr$sebetahat^2
}
# --- Construct SuSiE input for both traits ---
LDp = cor(Xp)
LDe = cor(Xe)
snp = paste0("snp", 1:ncol(Xe))
colnames(LDe) = rownames(LDe) = colnames(LDp) = rownames(LDp) = snp
dp = list(
"beta" = beta_p, "varbeta" = varbeta_p,
"LD" = LDp, "snp" = snp, "N" = nrow(Xp),
"type" = "quant", "sdY" = sd(Yp)
)
de = list(
"beta" = beta_e, "varbeta" = varbeta_e,
"LD" = LDe, "snp" = snp, "N" = nrow(Xe),
"type" = "quant", "sdY" = sd(Ye)
)
# --- Run SuSiE fine-mapping and coloc.susie ---
t1 <- Sys.time()
out_p <- runsusie(dp)
out_e <- runsusie(de)
out_coloc = coloc.susie(out_e, out_p)
t2 <- Sys.time()
# --- Identify true colocalized variants ---
all_var = unlist(variant)
coloc_var_index = as.numeric(names(which(table(all_var) >= 2)))
true_coloc_var = coloc_var_index
true_coloc_trait = list()
for (variant_index in 1:length(coloc_var_index)) {
temp_vec = c()
for (i in 1:length(variant)) {
if (coloc_var_index[variant_index] %in% variant[[i]]) {
temp_vec = c(temp_vec, i)
}
}
true_coloc_trait[[variant_index]] = temp_vec
}
### --- Post-process coloc summary results ---
df <- out_coloc$summary
if (is.null(df)) {
out_coloc = list()
sets = NULL
coloc_trait_index = NULL
} else {
# Find rows where PP.H4 is maximum
index <- which(df$PP.H4.abf > df$PP.H1.abf & df$PP.H4.abf > df$PP.H2.abf & df$PP.H4.abf > df$PP.H3.abf)
if (length(index) > 0) {
tb <- out_coloc$results
number = 1
# For each H4-supported region
for (i in index) {
new_index = i + 1
colname = names(tb)[new_index]
row_index <- tb %>%
as_tibble() %>%
arrange(desc(!!sym(colname))) %>%
mutate(row_number = row_number()) %>%
mutate(cumulative_sum = cumsum(!!sym(colname))) %>%
filter(cumulative_sum >= 0.95) %>%
pull(row_number) %>%
min()
snps = tb %>%
as_tibble() %>%
arrange(desc(!!sym(colname))) %>%
pull(snp)
if (number == 1) {
sets = list(as.numeric(gsub("snp", "", snps[1:row_index])))
} else {
new_list = as.numeric(gsub("snp", "", snps[1:row_index]))
sets = c(sets, list(new_list))
}
number = number + 1
}
} else {
sets = NULL
}
coloc_trait_index = list()
if (length(sets) == 0) {
coloc_trait_index = NULL
} else {
for (j in 1:length(sets)) {
coloc_trait_index[[j]] = c(1, 2)
}
}
}
# --- Assemble final report ---
susie_coloc_report = out_coloc
susie_coloc_report$file = file
susie_coloc_report$true_variant = true_coloc_var
susie_coloc_report$true_trait = true_coloc_trait
susie_coloc_report$coloc_set = sets
susie_coloc_report$coloc_trait = coloc_trait_index
susie_coloc_report$time = (t2 - t1)
# --- Purity filtering (LD purity > 0.5) ---
purity_check = lapply(susie_coloc_report$coloc_set, function(x) (get_purity(x, X = X[[1]]))[1])
purity_index = which(unlist(purity_check) > 0.5)
if (length(purity_index) > 0) {
susie_coloc_report$coloc_set = susie_coloc_report$coloc_set[purity_index]
susie_coloc_report$coloc_trait = susie_coloc_report$coloc_trait[purity_index]
} else {
print("All filtered out")
susie_coloc_report$coloc_set = NULL
susie_coloc_report$coloc_trait = NULL
}
susie_coloc_report$purity = purity_check
return(susie_coloc_report)
}
# ======= Execute and Save =======
file = ${_input:ar,}
susie_coloc_result = run_susie_coloc(file)
saveRDS(susie_coloc_result, ${_output:r})
Bash files#
Hyprcoloc#
## 2 trait
data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_2trait/"
job="hyp_real_simulation_2trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"
#!/bin/bash
mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result
cd ${work_dir}/${job}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=20000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR
sos run ./3_Other_Colocalization_Methods.ipynb hyprcoloc_set \
--simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
--mem 20G --trait 2 \
--cwd WORK_DIR/JOB/result
EOF
base_script="base_script"
for ncausal in 1 2 3; do
output_script="ncausal_${ncausal}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g" > ${output_script}
sbatch ${output_script}
done
## 5 trait
data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_5trait/"
job="hyp_real_simulation_5trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"
#!/bin/bash
mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result
cd ${work_dir}/${job}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=20000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb hyprcoloc_set \
--simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
--mem 20G --trait 5 \
--cwd WORK_DIR/JOB/result
EOF
base_script="base_script"
for ncausal in 1 2 3 4; do
output_script="ncausal_${ncausal}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g" > ${output_script}
sbatch ${output_script}
done
## 10 trait
data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_10trait/"
job="hyp_real_simulation_10trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"
#!/bin/bash
mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result
cd ${work_dir}/${job}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb hyprcoloc_set \
--simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
--mem 30G --trait 10 \
--cwd WORK_DIR/JOB/result
EOF
base_script="base_script"
for ncausal in 1 2 3 4; do
output_script="ncausal_${ncausal}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g" > ${output_script}
sbatch ${output_script}
done
## 20 trait
data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_20trait/"
job="hyp_real_simulation_20trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"
#!/bin/bash
mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result
cd ${work_dir}/${job}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb hyprcoloc_set \
--simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
--mem 40G --trait 20 \
--cwd WORK_DIR/JOB/result
EOF
base_script="base_script"
for ncausal in 1 2 3 4; do
output_script="ncausal_${ncausal}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g" > ${output_script}
sbatch ${output_script}
done
MOLOC#
data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_2trait/"
job="moloc_real_simulation_2trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/moloc"
#!/bin/bash
mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result
cd ${work_dir}/${job}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=15000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb moloc_set \
--simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
--mem 15G --trait 2 \
--cwd WORK_DIR/JOB/result
EOF
base_script="base_script"
for ncausal in 1 2 3; do
output_script="ncausal_${ncausal}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g" > ${output_script}
sbatch ${output_script}
done
COLOC V5 (SuSiE-COLOC)#
data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_2trait/"
job="susie_coloc_real_simulation_2trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/susie_coloc"
#!/bin/bash
mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result
cd ${work_dir}/${job}/code
cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err
source /home/hs3393/mamba_activate.sh
module load Singularity
cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb susie_coloc_set \
--simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
--mem 40G --trait 2 \
--cwd WORK_DIR/JOB/result
EOF
base_script="base_script"
for ncausal in 1 2 3; do
output_script="ncausal_${ncausal}.sh"
cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g" > ${output_script}
sbatch ${output_script}
done