Other Colocalization Methods#

Goal#

This notebook demonstrates how to execute Hyprcoloc, MOLOC or COLOC(V5) (SuSiE-Coloc) on simulated data and analyze its performance through power and false discovery rate (FDR) calculations.

Note: Because different methods have different output, we summarized four elements to calculate FDR and power - coloc_trait, coloc_set, true_trait, and true_variant.

Input#

simufile: Individual level data X and Y introduced in Phenotype Simulation.

Output#

Hyprcoloc / MOLOC / COLOC (V5) original result, along with some summarized elements.

  • coloc_trait: traits identified by ColocBoost as having shared genetic effects

  • coloc_set: sets of variants identified as causal

  • true_trait: traits with true causal relationships (ground truth)

  • true_variant: variants that are truly causal (ground truth)

Example output:

result = readRDS("../simulation_result/hyprcoloc/hyp_real_simulation_10trait/result/sample_9_real_simulation_3_ncausal_10_trait_ntr_10_hypercoloc.rds")
result$results
A data.frame: 3 × 7
iterationtraitsposterior_probregional_probcandidate_snpposterior_explained_by_snpdropped_trait
<dbl><chr><dbl><dbl><chr><dbl><chr>
115, 90.87541snp917 0.3881NA
223, 80.79151snp25680.0418NA
332, 60.94261snp29040.1141NA
print("=====Prediction=====")
# colocalization result - CoS
result$coloc_set

# colocalization result - colocalizing trait
result$coloc_trait

print("=====Truth=====")
#  true colocalizing CoS
result$true_variant

#  true colocalizing trait
result$true_trait
[1] "=====Prediction====="
    1. 917
    2. 911
    3. 931
    4. 934
    5. 916
    6. 913
    1. 2568
    2. 2479
    3. 2605
    4. 2627
    5. 2535
    6. 2483
    7. 2485
    8. 2487
    9. 2495
    10. 2496
    11. 2497
    12. 2498
    13. 2504
    14. 2507
    15. 2510
    16. 2512
    17. 2513
    18. 2514
    19. 2515
    20. 2518
    21. 2526
    22. 2528
    23. 2529
    24. 2530
    25. 2537
    26. 2544
    27. 2545
    28. 2547
    29. 2548
    30. 2550
    31. 2552
    32. 2554
    33. 2555
    34. 2558
    35. 2560
    36. 2562
    37. 2563
    38. 2582
    39. 2590
    40. 2591
    41. 2592
    42. 2600
    43. 2603
    44. 2604
    45. 2610
    46. 2617
    47. 2618
    48. 2620
    49. 2621
    50. 2625
    51. 2630
    52. 2631
    53. 2632
    54. 2634
    55. 2635
    56. 2636
    57. 2639
    58. 2642
    59. 2644
    60. 2646
    61. 2647
    62. 2648
    63. 2650
    64. 2652
    65. 2655
    66. 2656
    67. 2657
    68. 2658
    69. 2659
    70. 2665
    71. 2667
    72. 2668
    73. 2673
    74. 2594
    75. 2539
    76. 2614
    77. 2564
    78. 2615
    79. 2649
    80. 2531
    81. 2533
    82. 2493
    83. 2576
    1. 2904
    2. 2892
    3. 2900
    4. 2908
    5. 2910
    6. 2911
    7. 2920
    8. 2874
    9. 2887
    10. 2859
    11. 2888
    12. 2894
    1. 5
    2. 9
    1. 3
    2. 8
    1. 2
    2. 6
[1] "=====Truth====="
  1. 934
  2. 2485
  3. 2908
    1. 5
    2. 6
    3. 9
    1. 3
    2. 8
    1. 2
    2. 6

Run HyprColoc without LD#

[hyprcoloc_set]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "50h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: trait = 10
parameter: container = ""
parameter: setting="normal"
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntr_{trait}_hyprcoloc.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    # ======= Load necessary libraries =======
    library("hyprcoloc")
    library("susieR")
    # install simulation package
    # devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
    # BiocManager::install("StatFunGen/pecotmr")
    library("pecotmr")
    library("simxQTL")

    # ======= Define run_hypercoloc function =======
    run_hypercoloc = function(file, actual_pheno_number, setting = "normal") {
        library(hyprcoloc)

        # --- Load simulation file ---
        rds = readRDS(file)

        # --- Prepare input data: X (genotype), Y (phenotypes), variant (causal SNPs) ---
        X = list()
        Y = list()
        variant = list()

        for (i in 1:actual_pheno_number) {
            X[[i]] = rds$X[, , drop = FALSE]
            Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
            variant[[i]] = rds$variant[[i]]
        }

        # --- Estimate marginal betas and SEs using univariate regression ---
        betas = sebetas = matrix(NA, nrow = ncol(X[[1]]), ncol = actual_pheno_number)

        for (i in 1:actual_pheno_number) {
            for (mm in 1:ncol(X[[1]])) {
                rr <- susieR::univariate_regression(scale(X[[i]][, mm]), scale(Y[[i]]))
                betas[mm, i] = rr$betahat
                sebetas[mm, i] = rr$sebetahat
            }
        }

        # --- Set trait and SNP labels ---
        traits <- c(1:actual_pheno_number)
        rsid <- paste0("snp", c(1:ncol(X[[1]])))
        colnames(betas) <- colnames(sebetas) <- traits
        rownames(betas) <- rownames(sebetas) <- rsid


        # --- Run HyPrColoc ---
        t1 <- Sys.time()
        res_hyprcoloc <- hyprcoloc(betas, sebetas, trait.names = traits, snp.id = rsid, snpscores = TRUE)
        t2 <- Sys.time()

        # --- Process HyPrColoc results ---
        tb = res_hyprcoloc$results
        index = which(tb$traits != "None")
        tb = tb[index, , drop = FALSE]
        res_hyprcoloc$results = tb
        num = nrow(res_hyprcoloc$results)

        # --- Handle "normal" setting - output the credible set ---
        if (setting == "normal") {
            if (num == 0) {
                causal_snp = NULL
                causal_trait = NULL
            } else {
                hyprcoloc_csets <- cred.sets(res_hyprcoloc, value = 0.95)
                causal_snp = list()
                causal_trait = list()

                for (i in 1:num) {
                    causal_snp[[i]] = as.numeric(gsub("[^0-9]", "", names(hyprcoloc_csets[[i]])))
                    causal_trait[[i]] = as.numeric(unlist(strsplit(res_hyprcoloc$results$traits[i], ", ")))
                }
            }

            ### Find true coloc variants and traits ###
            all_var = unlist(variant)
            coloc_var_index = as.numeric(names(which(table(all_var) >= 2)))  # Variants appearing in ≥2 traits

            true_coloc_var = coloc_var_index
            true_coloc_trait = list()

            for (variant_index in 1:length(coloc_var_index)) {
                temp_vec = c()
                for (i in 1:length(variant)) {
                    if (coloc_var_index[variant_index] %in% variant[[i]]) {
                        temp_vec = c(temp_vec, i)
                    }
                }
                true_coloc_trait[[variant_index]] = temp_vec
            }

            # --- Create report ---
            hypercoloc_report = res_hyprcoloc
            hypercoloc_report$true_variant = true_coloc_var
            hypercoloc_report$true_trait = true_coloc_trait
            hypercoloc_report$coloc_set = causal_snp    
            hypercoloc_report$coloc_trait = causal_trait
            hypercoloc_report$file = file
            hypercoloc_report$time = (t2 - t1)

        # --- Handle "alternative" setting (no trait recording): output the SNP hit (one variant) ---
        } else {
            if (num == 0) {
                causal_snp = NULL
            } else {
                hyprcoloc_csets <- cred.sets(res_hyprcoloc, value = 0.95)
                causal_snp = list()

                for (i in 1:num) {
                    causal_snp[[i]] = as.numeric(gsub("[^0-9]", "", names(hyprcoloc_csets[[i]])))
                }
            }

            hypercoloc_report = res_hyprcoloc
            hypercoloc_report$coloc_set = causal_snp    
            hypercoloc_report$file = file
            hypercoloc_report$time = (t2 - t1)
        }

        # --- Purity filtering: keep only sets with LD-purity > 0.5 ---
        purity_check = lapply(hypercoloc_report$coloc_set, function(x) (get_purity(x, X = X[[1]]))[1])
        purity_index = which(unlist(purity_check) > 0.5)

        if (length(purity_index) > 0) {
            hypercoloc_report$coloc_set = hypercoloc_report$coloc_set[purity_index]
            hypercoloc_report$coloc_trait = hypercoloc_report$coloc_trait[purity_index]
        } else {
            print("All filtered out")
            hypercoloc_report$coloc_set = NULL
            hypercoloc_report$coloc_trait = NULL
        }

        hypercoloc_report$purity = purity_check

        # --- Return result ---
        return(hypercoloc_report)
    }

    # ======= Execute and Save =======
    file = ${_input:ar,}
    hypercoloc_result = run_hypercoloc(file, ${trait}, setting = "${setting}")
    saveRDS(hypercoloc_result, ${_output:r})
  

Run Hyprcoloc with LD info#

[hyprcoloc_LD_set]
# ======= Define parameters =======
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "50h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: trait = 10
parameter: container = ""
parameter: setting = "normal"

# ======= Workflow input/output =======
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntr_{trait}_hypercoloc_LD.rds'

task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'

# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 

    library("hyprcoloc")
    library("susieR")
    # install simulation package
    # devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
    # BiocManager::install("StatFunGen/pecotmr")
    library("pecotmr")
    library("simxQTL")

    # ======= Define run_hypercoloc function =======
    run_hypercoloc = function(file, actual_pheno_number, setting = "normal") {
        library(hyprcoloc)

        # --- Load simulation data ---
        rds = readRDS(file)

        # --- Prepare input lists ---
        X = list()
        Y = list()
        variant = list()

        for (i in 1:actual_pheno_number) {
            X[[i]] = rds$X[, , drop = FALSE]
            Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
            variant[[i]] = rds$variant[[i]]
        }

        # --- Estimate marginal betas and SEs ---
        betas = sebetas = matrix(NA, nrow = ncol(X[[1]]), ncol = actual_pheno_number)
        for (i in 1:actual_pheno_number) {
            for (mm in 1:ncol(X[[1]])) {
                rr <- susieR::univariate_regression(scale(X[[i]][, mm]), scale(Y[[i]]))
                betas[mm, i] = rr$betahat
                sebetas[mm, i] = rr$sebetahat
            }
        }

        # --- Label traits and SNPs ---
        traits <- c(1:actual_pheno_number)
        rsid <- paste0("snp", c(1:ncol(X[[1]])))
        colnames(betas) <- colnames(sebetas) <- traits
        rownames(betas) <- rownames(sebetas) <- rsid

        # --- Calculate LD matrix from X (correlation) ---
        LD = get_correlation(X[[1]])
        colnames(LD) <- rownames(LD) <- colnames(X[[1]])

        # --- Run HyPrColoc with LD matrix ---
        t1 <- Sys.time()
        res_hyprcoloc <- hyprcoloc(
            betas, 
            sebetas, 
            trait.names = traits, 
            snp.id = rsid, 
            snpscores = TRUE, 
            ld.matrix = LD   # <<=== NEW compared to previous version
        )
        t2 <- Sys.time()

        # --- Process HyPrColoc results ---
        tb = res_hyprcoloc$results
        index = which(tb$traits != "None")
        tb = tb[index, , drop = FALSE]
        res_hyprcoloc$results = tb
        num = nrow(res_hyprcoloc$results)

        # --- Handle "normal" setting ---
        if (setting == "normal") {
            if (num == 0) {
                causal_snp = NULL
                causal_trait = NULL
            } else {
                hyprcoloc_csets <- cred.sets(res_hyprcoloc, value = 0.95)
                causal_snp = list()
                causal_trait = list()

                for (i in 1:num) {
                    causal_snp[[i]] = as.numeric(gsub("[^0-9]", "", names(hyprcoloc_csets[[i]])))
                    causal_trait[[i]] = as.numeric(unlist(strsplit(res_hyprcoloc$results$traits[i], ", ")))
                }
            }

            # --- Identify true coloc variants and traits ---
            all_var = unlist(variant)
            coloc_var_index = as.numeric(names(which(table(all_var) >= 2)))

            true_coloc_var = coloc_var_index
            true_coloc_trait = list()

            for (variant_index in 1:length(coloc_var_index)) {
                temp_vec = c()
                for (i in 1:length(variant)) {
                    if (coloc_var_index[variant_index] %in% variant[[i]]) {
                        temp_vec = c(temp_vec, i)
                    }
                }
                true_coloc_trait[[variant_index]] = temp_vec
            }

            # --- Save full report ---
            hypercoloc_report = res_hyprcoloc
            hypercoloc_report$true_variant = true_coloc_var
            hypercoloc_report$true_trait = true_coloc_trait
            hypercoloc_report$coloc_set = causal_snp    
            hypercoloc_report$coloc_trait = causal_trait
            hypercoloc_report$file = file
            hypercoloc_report$time = (t2 - t1)

        # --- Handle alternative setting ---
        } else {
            if (num == 0) {
                causal_snp = NULL
            } else {
                hyprcoloc_csets <- cred.sets(res_hyprcoloc, value = 0.95)
                causal_snp = list()

                for (i in 1:num) {
                    causal_snp[[i]] = as.numeric(gsub("[^0-9]", "", names(hyprcoloc_csets[[i]])))
                }
            }

            hypercoloc_report = res_hyprcoloc
            hypercoloc_report$coloc_set = causal_snp    
            hypercoloc_report$file = file
            hypercoloc_report$time = (t2 - t1)
        }

        # --- Purity filtering (keep coloc sets with LD purity > 0.5) ---
        purity_check = lapply(hypercoloc_report$coloc_set, function(x) (get_purity(x, X = X[[1]]))[1])
        purity_index = which(unlist(purity_check) > 0.5)

        if (length(purity_index) > 0) {
            hypercoloc_report$coloc_set = hypercoloc_report$coloc_set[purity_index]
            hypercoloc_report$coloc_trait = hypercoloc_report$coloc_trait[purity_index]
        } else {
            print("All filtered out")
            hypercoloc_report$coloc_set = NULL
            hypercoloc_report$coloc_trait = NULL
        }

        hypercoloc_report$purity = purity_check

        # --- Return result ---
        return(hypercoloc_report)
    }

    # ======= Execute and Save =======
    file = ${_input:ar,}
    hypercoloc_result = run_hypercoloc(file, ${trait}, setting = "${setting}")
    saveRDS(hypercoloc_result, ${_output:r})

Run MOLOC#

[moloc_set]
# ======= Define parameters =======
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "50h"
parameter: mem = "40G"
parameter: numThreads = 1
parameter: trait = 50
parameter: container = ""

# ======= Workflow input/output =======
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntr_{trait}_moloc_set.rds'

task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'

# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 

    library("moloc")
    library("tidyverse")
    library("susieR")
    # install simulation package
    # devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
    # BiocManager::install("StatFunGen/pecotmr")
    library("pecotmr")
    library("simxQTL")

    # ======= Define run_moloc function =======
    run_moloc <- function(file, actual_pheno_number, setting = "normal") {
        
        # --- Load simulation RDS file ---
        rds = readRDS(file)

        # --- Prepare X (genotypes), Y (phenotypes), variant (causal SNPs) ---
        X = list()
        Y = list()
        variant = list()

        for (i in 1:actual_pheno_number) {
            X[[i]] = rds$X[, , drop = FALSE]
            Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
            variant[[i]] = rds$variant[[i]]
        }

        # --- Build input summary statistics tables for moloc ---
        tb <- list()
        cnt = 1

        for (i in 1:actual_pheno_number) {
            betas = c()
            sebetas = c()
            MAF = c()

            for (mm in 1:ncol(X[[i]])) {
                rr <- susieR::univariate_regression(X[[i]][, mm], Y[[i]])
                betas[mm] = rr$betahat
                sebetas[mm] = rr$sebetahat
                # Minor Allele Frequency (MAF) calculation
                MAF[mm] = min(sum(X[[i]][, mm]) / (2 * nrow(X[[i]])), 1 - sum(X[[i]][, mm]) / (2 * nrow(X[[i]])))
            }

            tb[[cnt]] = tibble(
                SNP = 1:ncol(X[[i]]),
                BETA = betas,
                SE = sebetas,
                N = nrow(X[[i]]),
                MAF = MAF
            )
            cnt = cnt + 1
        }

        # --- Run moloc on the prepared input ---
        options(scipen = 1, digits = 3)

        t1 <- Sys.time()
        moloc <- moloc_test(listData = tb, save.SNP.info = TRUE)
        t2 <- Sys.time()

        # --- Identify true coloc variants across traits ---
        all_var = unlist(variant)
        coloc_var_index = as.numeric(names(which(table(all_var) >= 2)))

        true_coloc_var = coloc_var_index
        true_coloc_trait = list()

        for (variant_index in 1:length(coloc_var_index)) {
            temp_vec = c()
            for (i in 1:length(variant)) {
                if (coloc_var_index[variant_index] %in% variant[[i]]) {
                    temp_vec = c(temp_vec, i)
                }
            }
            true_coloc_trait[[variant_index]] = temp_vec
        }

        # --- Extract colocalization result from moloc output ---
        index = which.max(moloc$priors_lkl_ppa$PPA)
        hypothesis_name = rownames(moloc$priors_lkl_ppa)[index]
        coloc_traits = unlist(strsplit(hypothesis_name, ","))

        # Find the hypothesis with maximum number of traits coloc'd
        trait_num = max(nchar(coloc_traits))
        coloc_trait = coloc_traits[which.max(nchar(coloc_traits))]

        # Decode coloc trait indices (letters -> numbers)
        coloc_trait_index = c()
        for (item in unlist(strsplit(coloc_trait, NULL))) {
            temp_index = as.integer(charToRaw(item)) - as.integer(charToRaw("a")) + 1
            coloc_trait_index = c(coloc_trait_index, temp_index)
        }

        # --- Find credible SNP set covering 95% probability ---
        index2 = which(rownames(moloc$best_snp[[1]]) == coloc_trait)
        colname = colnames(moloc$best_snp[[2]])[index2]

        row_index <- moloc$best_snp[[2]] %>%
            as_tibble() %>%
            arrange(desc(!!sym(colname))) %>%
            mutate(row_number = row_number()) %>%
            mutate(cumulative_sum = cumsum(!!sym(colname))) %>%
            filter(cumulative_sum >= 0.95) %>%
            pull(row_number) %>%
            min()

        # Identify top SNPs up to 95% credible set
        snps = moloc$best_snp[[2]] %>%
            as_tibble() %>%
            mutate(row_number = row_number()) %>%
            arrange(desc(!!sym(colname))) %>%
            pull(row_number)
        snps = snps[1:row_index]

        # --- Assemble moloc report ---
        moloc_report = moloc
        moloc_report$file = file
        moloc_report$true_variant = true_coloc_var
        moloc_report$true_trait = true_coloc_trait

        if (length(coloc_trait_index) >= 2) {
            moloc_report$coloc_set = list(snps)
            moloc_report$coloc_trait = list(coloc_trait_index)
        } else {
            # If only one trait coloc, treat as fine-mapping only (not true coloc)
            moloc_report$coloc_set = NULL
            moloc_report$coloc_trait = NULL
        }

        # --- Purity filtering (keep coloc sets with LD-purity > 0.5) ---
        purity_check = lapply(moloc_report$coloc_set, function(x) (get_purity(x, X = X[[1]]))[1])
        purity_index = which(unlist(purity_check) > 0.5)

        if (length(purity_index) > 0) {
            moloc_report$coloc_set = moloc_report$coloc_set[purity_index]
            moloc_report$coloc_trait = moloc_report$coloc_trait[purity_index]
        } else {
            print("All filtered out")
            moloc_report$coloc_set = NULL
            moloc_report$coloc_trait = NULL
        }

        moloc_report$time = (t2 - t1)
        moloc_report$purity = purity_check

        return(moloc_report)
    }

    # ======= Execute and Save =======
    files = c(${_input:ar,})
    moloc_result = run_moloc(files, ${trait})
    saveRDS(moloc_result, ${_output:r})

Run COLOC V5 (SuSiE coloc)#

[susie_coloc_set]
# ======= Define parameters =======
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 300
parameter: walltime = "50h"
parameter: mem = "40G"
parameter: numThreads = 1
parameter: trait = 2
parameter: container = ""

# ======= Workflow input/output =======
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntr_{trait}_susie_coloc_result.rds'

task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'

# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    library("susieR")
    library("coloc")
    library("tidyverse")
    # install simulation package
    # devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
    # BiocManager::install("StatFunGen/pecotmr")
    library("pecotmr")
    library("simxQTL")
  
    run_susie_coloc <- function(file, actual_pheno_number = 2) {


        # --- Load simulation file ---
        rds = readRDS(file)

        # --- Extract X, Y, variant for each trait ---
        X = list()
        Y = list()
        variant = list()

        for (i in 1:actual_pheno_number) {
            X[[i]] = rds$X[, , drop = FALSE]
            Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
            variant[[i]] = rds$variant[[i]]
        }

        # --- Prepare for SuSiE fine-mapping (assume two traits) ---
        Xe = X[[1]]  # First trait
        Xp = X[[2]]  # Second trait
        Ye = Y[[1]]
        Yp = Y[[2]]

        # --- Estimate marginal betas and varbetas ---
        beta_p = varbeta_p = rep(0, ncol(Xp))
        beta_e = varbeta_e = rep(0, ncol(Xe))

        for (mm in 1:ncol(Xe)) {
            rr <- susieR::univariate_regression(Yp, Xp[, mm])
            beta_p[mm] = rr$betahat
            varbeta_p[mm] = rr$sebetahat^2

            rr <- susieR::univariate_regression(Ye, Xe[, mm])
            beta_e[mm] = rr$betahat
            varbeta_e[mm] = rr$sebetahat^2
        }

        # --- Construct SuSiE input for both traits ---
        LDp = cor(Xp)
        LDe = cor(Xe)
        snp = paste0("snp", 1:ncol(Xe))
        colnames(LDe) = rownames(LDe) = colnames(LDp) = rownames(LDp) = snp

        dp = list(
            "beta" = beta_p, "varbeta" = varbeta_p,
            "LD" = LDp, "snp" = snp, "N" = nrow(Xp),
            "type" = "quant", "sdY" = sd(Yp)
        )

        de = list(
            "beta" = beta_e, "varbeta" = varbeta_e,
            "LD" = LDe, "snp" = snp, "N" = nrow(Xe),
            "type" = "quant", "sdY" = sd(Ye)
        )

        # --- Run SuSiE fine-mapping and coloc.susie ---
        t1 <- Sys.time()
        out_p <- runsusie(dp)
        out_e <- runsusie(de)
        out_coloc = coloc.susie(out_e, out_p)
        t2 <- Sys.time()

        # --- Identify true colocalized variants ---
        all_var = unlist(variant)
        coloc_var_index = as.numeric(names(which(table(all_var) >= 2)))

        true_coloc_var = coloc_var_index
        true_coloc_trait = list()

        for (variant_index in 1:length(coloc_var_index)) {
            temp_vec = c()
            for (i in 1:length(variant)) {
                if (coloc_var_index[variant_index] %in% variant[[i]]) {
                    temp_vec = c(temp_vec, i)
                }
            }
            true_coloc_trait[[variant_index]] = temp_vec
        }

        ### --- Post-process coloc summary results ---
        df <- out_coloc$summary

        if (is.null(df)) {
            out_coloc = list()
            sets = NULL
            coloc_trait_index = NULL

        } else {
            # Find rows where PP.H4 is maximum
            index <- which(df$PP.H4.abf > df$PP.H1.abf & df$PP.H4.abf > df$PP.H2.abf & df$PP.H4.abf > df$PP.H3.abf)

            if (length(index) > 0) {
                tb <- out_coloc$results
                number = 1

                # For each H4-supported region
                for (i in index) {
                    new_index = i + 1
                    colname = names(tb)[new_index]

                    row_index <- tb %>%
                        as_tibble() %>%
                        arrange(desc(!!sym(colname))) %>%
                        mutate(row_number = row_number()) %>%
                        mutate(cumulative_sum = cumsum(!!sym(colname))) %>%
                        filter(cumulative_sum >= 0.95) %>%
                        pull(row_number) %>%
                        min()

                    snps = tb %>%
                        as_tibble() %>%
                        arrange(desc(!!sym(colname))) %>%
                        pull(snp)

                    if (number == 1) {
                        sets = list(as.numeric(gsub("snp", "", snps[1:row_index])))
                    } else {
                        new_list = as.numeric(gsub("snp", "", snps[1:row_index]))
                        sets = c(sets, list(new_list))
                    }
                    number = number + 1
                }
            } else {
                sets = NULL
            }

            coloc_trait_index = list()
            if (length(sets) == 0) {
                coloc_trait_index = NULL
            } else {
                for (j in 1:length(sets)) {
                    coloc_trait_index[[j]] = c(1, 2)
                }
            }
        }

        # --- Assemble final report ---
        susie_coloc_report = out_coloc
        susie_coloc_report$file = file
        susie_coloc_report$true_variant = true_coloc_var
        susie_coloc_report$true_trait = true_coloc_trait
        susie_coloc_report$coloc_set = sets
        susie_coloc_report$coloc_trait = coloc_trait_index
        susie_coloc_report$time = (t2 - t1)

        # --- Purity filtering (LD purity > 0.5) ---
        purity_check = lapply(susie_coloc_report$coloc_set, function(x) (get_purity(x, X = X[[1]]))[1])
        purity_index = which(unlist(purity_check) > 0.5)

        if (length(purity_index) > 0) {
            susie_coloc_report$coloc_set = susie_coloc_report$coloc_set[purity_index]
            susie_coloc_report$coloc_trait = susie_coloc_report$coloc_trait[purity_index]
        } else {
            print("All filtered out")
            susie_coloc_report$coloc_set = NULL
            susie_coloc_report$coloc_trait = NULL
        }

        susie_coloc_report$purity = purity_check

        return(susie_coloc_report)
    }

    # ======= Execute and Save =======
    file = ${_input:ar,}
    susie_coloc_result = run_susie_coloc(file)
    saveRDS(susie_coloc_result, ${_output:r})

Bash files#

Hyprcoloc#

## 2 trait

data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_2trait/"
job="hyp_real_simulation_2trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=20000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run ./3_Other_Colocalization_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 20G --trait 2 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done

## 5 trait

data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_5trait/"
job="hyp_real_simulation_5trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=20000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 20G --trait 5 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3 4; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done

## 10 trait

data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_10trait/"
job="hyp_real_simulation_10trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 30G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3 4; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done

## 20 trait

data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_20trait/"
job="hyp_real_simulation_20trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 40G --trait 20 \
    --cwd WORK_DIR/JOB/result
EOF

base_script="base_script"
for ncausal in 1 2 3 4; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done

MOLOC#

data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_2trait/"
job="moloc_real_simulation_2trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/moloc"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=15000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb moloc_set \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 15G --trait 2 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done

COLOC V5 (SuSiE-COLOC)#

data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_2trait/"
job="susie_coloc_real_simulation_2trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/susie_coloc"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb susie_coloc_set \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 40G --trait 2 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done