Other Colocalization Methods

Other Colocalization Methods#

Goal#

This notebook demonstrates how to execute Hyprcoloc, MOLOC or COLOC(V5) (SuSiE-Coloc) on simulated data and analyze its performance through power and false discovery rate (FDR) calculations.

Note: Because different methods have different output, we summarized four elements to calculate FDR and power - coloc_trait, coloc_set, true_trait, and true_variant.

Input#

simufile: Individual level data X and Y introduced in Phenotype Simulation.

Output#

Hyprcoloc / MOLOC / COLOC (V5) original result, along with some summarized elements.

coloc_trait: traits identified by ColocBoost as having shared genetic effects
coloc_set: sets of variants identified as causal
true_trait: traits with true causal relationships (ground truth)
true_variant: variants that are truly causal (ground truth)

Example output:

result = readRDS("../simulation_result/hyprcoloc/hyp_real_simulation_10trait/result/sample_9_real_simulation_3_ncausal_10_trait_ntr_10_hypercoloc.rds")
result$results

A data.frame: 3 × 7
	iteration	traits	posterior_prob	regional_prob	candidate_snp	posterior_explained_by_snp	dropped_trait
	<dbl>	<chr>	<dbl>	<dbl>	<chr>	<dbl>	<chr>
1	1	5, 9	0.8754	1	snp917	0.3881	NA
2	2	3, 8	0.7915	1	snp2568	0.0418	NA
3	3	2, 6	0.9426	1	snp2904	0.1141	NA

print("=====Prediction=====")
# colocalization result - CoS
result$coloc_set

# colocalization result - colocalizing trait
result$coloc_trait

print("=====Truth=====")
#  true colocalizing CoS
result$true_variant

#  true colocalizing trait
result$true_trait

[1] "=====Prediction====="

1. 917
2. 911
3. 931
4. 934
5. 916
6. 913
1. 2568
2. 2479
3. 2605
4. 2627
5. 2535
6. 2483
7. 2485
8. 2487
9. 2495
10. 2496
11. 2497
12. 2498
13. 2504
14. 2507
15. 2510
16. 2512
17. 2513
18. 2514
19. 2515
20. 2518
21. 2526
22. 2528
23. 2529
24. 2530
25. 2537
26. 2544
27. 2545
28. 2547
29. 2548
30. 2550
31. 2552
32. 2554
33. 2555
34. 2558
35. 2560
36. 2562
37. 2563
38. 2582
39. 2590
40. 2591
41. 2592
42. 2600
43. 2603
44. 2604
45. 2610
46. 2617
47. 2618
48. 2620
49. 2621
50. 2625
51. 2630
52. 2631
53. 2632
54. 2634
55. 2635
56. 2636
57. 2639
58. 2642
59. 2644
60. 2646
61. 2647
62. 2648
63. 2650
64. 2652
65. 2655
66. 2656
67. 2657
68. 2658
69. 2659
70. 2665
71. 2667
72. 2668
73. 2673
74. 2594
75. 2539
76. 2614
77. 2564
78. 2615
79. 2649
80. 2531
81. 2533
82. 2493
83. 2576
1. 2904
2. 2892
3. 2900
4. 2908
5. 2910
6. 2911
7. 2920
8. 2874
9. 2887
10. 2859
11. 2888
12. 2894

1. 5
2. 9
1. 3
2. 8
1. 2
2. 6

[1] "=====Truth====="

934
2485
2908

1. 5
2. 6
3. 9
1. 3
2. 8
1. 2
2. 6

Run HyprColoc without LD#

[hyprcoloc_set]
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "50h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: trait = 10
parameter: container = ""
parameter: setting="normal"
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntr_{trait}_hyprcoloc.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    # ======= Load necessary libraries =======
    library("hyprcoloc")
    library("susieR")
    # install simulation package
    # devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
    # BiocManager::install("StatFunGen/pecotmr")
    library("pecotmr")
    library("simxQTL")

    # ======= Define run_hypercoloc function =======
    run_hypercoloc = function(file, actual_pheno_number, setting = "normal") {
        library(hyprcoloc)

        # --- Load simulation file ---
        rds = readRDS(file)

        # --- Prepare input data: X (genotype), Y (phenotypes), variant (causal SNPs) ---
        X = list()
        Y = list()
        variant = list()

        for (i in 1:actual_pheno_number) {
            X[[i]] = rds$X[, , drop = FALSE]
            Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
            variant[[i]] = rds$variant[[i]]
        }

        # --- Estimate marginal betas and SEs using univariate regression ---
        betas = sebetas = matrix(NA, nrow = ncol(X[[1]]), ncol = actual_pheno_number)

        for (i in 1:actual_pheno_number) {
            for (mm in 1:ncol(X[[1]])) {
                rr <- susieR::univariate_regression(scale(X[[i]][, mm]), scale(Y[[i]]))
                betas[mm, i] = rr$betahat
                sebetas[mm, i] = rr$sebetahat
            }
        }

        # --- Set trait and SNP labels ---
        traits <- c(1:actual_pheno_number)
        rsid <- paste0("snp", c(1:ncol(X[[1]])))
        colnames(betas) <- colnames(sebetas) <- traits
        rownames(betas) <- rownames(sebetas) <- rsid


        # --- Run HyPrColoc ---
        t1 <- Sys.time()
        res_hyprcoloc <- hyprcoloc(betas, sebetas, trait.names = traits, snp.id = rsid, snpscores = TRUE)
        t2 <- Sys.time()

        # --- Process HyPrColoc results ---
        tb = res_hyprcoloc$results
        index = which(tb$traits != "None")
        tb = tb[index, , drop = FALSE]
        res_hyprcoloc$results = tb
        num = nrow(res_hyprcoloc$results)

        # --- Handle "normal" setting - output the credible set ---
        if (setting == "normal") {
            if (num == 0) {
                causal_snp = NULL
                causal_trait = NULL
            } else {
                hyprcoloc_csets <- cred.sets(res_hyprcoloc, value = 0.95)
                causal_snp = list()
                causal_trait = list()

                for (i in 1:num) {
                    causal_snp[[i]] = as.numeric(gsub("[^0-9]", "", names(hyprcoloc_csets[[i]])))
                    causal_trait[[i]] = as.numeric(unlist(strsplit(res_hyprcoloc$results$traits[i], ", ")))
                }
            }

            ### Find true coloc variants and traits ###
            all_var = unlist(variant)
            coloc_var_index = as.numeric(names(which(table(all_var) >= 2)))  # Variants appearing in ≥2 traits

            true_coloc_var = coloc_var_index
            true_coloc_trait = list()

            for (variant_index in 1:length(coloc_var_index)) {
                temp_vec = c()
                for (i in 1:length(variant)) {
                    if (coloc_var_index[variant_index] %in% variant[[i]]) {
                        temp_vec = c(temp_vec, i)
                    }
                }
                true_coloc_trait[[variant_index]] = temp_vec
            }

            # --- Create report ---
            hypercoloc_report = res_hyprcoloc
            hypercoloc_report$true_variant = true_coloc_var
            hypercoloc_report$true_trait = true_coloc_trait
            hypercoloc_report$coloc_set = causal_snp    
            hypercoloc_report$coloc_trait = causal_trait
            hypercoloc_report$file = file
            hypercoloc_report$time = (t2 - t1)

        # --- Handle "alternative" setting (no trait recording): output the SNP hit (one variant) ---
        } else {
            if (num == 0) {
                causal_snp = NULL
            } else {
                hyprcoloc_csets <- cred.sets(res_hyprcoloc, value = 0.95)
                causal_snp = list()

                for (i in 1:num) {
                    causal_snp[[i]] = as.numeric(gsub("[^0-9]", "", names(hyprcoloc_csets[[i]])))
                }
            }

            hypercoloc_report = res_hyprcoloc
            hypercoloc_report$coloc_set = causal_snp    
            hypercoloc_report$file = file
            hypercoloc_report$time = (t2 - t1)
        }

        # --- Purity filtering: keep only sets with LD-purity > 0.5 ---
        purity_check = lapply(hypercoloc_report$coloc_set, function(x) (get_purity(x, X = X[[1]]))[1])
        purity_index = which(unlist(purity_check) > 0.5)

        if (length(purity_index) > 0) {
            hypercoloc_report$coloc_set = hypercoloc_report$coloc_set[purity_index]
            hypercoloc_report$coloc_trait = hypercoloc_report$coloc_trait[purity_index]
        } else {
            print("All filtered out")
            hypercoloc_report$coloc_set = NULL
            hypercoloc_report$coloc_trait = NULL
        }

        hypercoloc_report$purity = purity_check

        # --- Return result ---
        return(hypercoloc_report)
    }

    # ======= Execute and Save =======
    file = ${_input:ar,}
    hypercoloc_result = run_hypercoloc(file, ${trait}, setting = "${setting}")
    saveRDS(hypercoloc_result, ${_output:r})
  

Run Hyprcoloc with LD info#

[hyprcoloc_LD_set]
# ======= Define parameters =======
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "50h"
parameter: mem = "30G"
parameter: numThreads = 1
parameter: trait = 10
parameter: container = ""
parameter: setting = "normal"

# ======= Workflow input/output =======
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntr_{trait}_hypercoloc_LD.rds'

task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'

# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 

    library("hyprcoloc")
    library("susieR")
    # install simulation package
    # devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
    # BiocManager::install("StatFunGen/pecotmr")
    library("pecotmr")
    library("simxQTL")

    # ======= Define run_hypercoloc function =======
    run_hypercoloc = function(file, actual_pheno_number, setting = "normal") {
        library(hyprcoloc)

        # --- Load simulation data ---
        rds = readRDS(file)

        # --- Prepare input lists ---
        X = list()
        Y = list()
        variant = list()

        for (i in 1:actual_pheno_number) {
            X[[i]] = rds$X[, , drop = FALSE]
            Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
            variant[[i]] = rds$variant[[i]]
        }

        # --- Estimate marginal betas and SEs ---
        betas = sebetas = matrix(NA, nrow = ncol(X[[1]]), ncol = actual_pheno_number)
        for (i in 1:actual_pheno_number) {
            for (mm in 1:ncol(X[[1]])) {
                rr <- susieR::univariate_regression(scale(X[[i]][, mm]), scale(Y[[i]]))
                betas[mm, i] = rr$betahat
                sebetas[mm, i] = rr$sebetahat
            }
        }

        # --- Label traits and SNPs ---
        traits <- c(1:actual_pheno_number)
        rsid <- paste0("snp", c(1:ncol(X[[1]])))
        colnames(betas) <- colnames(sebetas) <- traits
        rownames(betas) <- rownames(sebetas) <- rsid

        # --- Calculate LD matrix from X (correlation) ---
        LD = get_correlation(X[[1]])
        colnames(LD) <- rownames(LD) <- colnames(X[[1]])

        # --- Run HyPrColoc with LD matrix ---
        t1 <- Sys.time()
        res_hyprcoloc <- hyprcoloc(
            betas, 
            sebetas, 
            trait.names = traits, 
            snp.id = rsid, 
            snpscores = TRUE, 
            ld.matrix = LD   # <<=== NEW compared to previous version
        )
        t2 <- Sys.time()

        # --- Process HyPrColoc results ---
        tb = res_hyprcoloc$results
        index = which(tb$traits != "None")
        tb = tb[index, , drop = FALSE]
        res_hyprcoloc$results = tb
        num = nrow(res_hyprcoloc$results)

        # --- Handle "normal" setting ---
        if (setting == "normal") {
            if (num == 0) {
                causal_snp = NULL
                causal_trait = NULL
            } else {
                hyprcoloc_csets <- cred.sets(res_hyprcoloc, value = 0.95)
                causal_snp = list()
                causal_trait = list()

                for (i in 1:num) {
                    causal_snp[[i]] = as.numeric(gsub("[^0-9]", "", names(hyprcoloc_csets[[i]])))
                    causal_trait[[i]] = as.numeric(unlist(strsplit(res_hyprcoloc$results$traits[i], ", ")))
                }
            }

            # --- Identify true coloc variants and traits ---
            all_var = unlist(variant)
            coloc_var_index = as.numeric(names(which(table(all_var) >= 2)))

            true_coloc_var = coloc_var_index
            true_coloc_trait = list()

            for (variant_index in 1:length(coloc_var_index)) {
                temp_vec = c()
                for (i in 1:length(variant)) {
                    if (coloc_var_index[variant_index] %in% variant[[i]]) {
                        temp_vec = c(temp_vec, i)
                    }
                }
                true_coloc_trait[[variant_index]] = temp_vec
            }

            # --- Save full report ---
            hypercoloc_report = res_hyprcoloc
            hypercoloc_report$true_variant = true_coloc_var
            hypercoloc_report$true_trait = true_coloc_trait
            hypercoloc_report$coloc_set = causal_snp    
            hypercoloc_report$coloc_trait = causal_trait
            hypercoloc_report$file = file
            hypercoloc_report$time = (t2 - t1)

        # --- Handle alternative setting ---
        } else {
            if (num == 0) {
                causal_snp = NULL
            } else {
                hyprcoloc_csets <- cred.sets(res_hyprcoloc, value = 0.95)
                causal_snp = list()

                for (i in 1:num) {
                    causal_snp[[i]] = as.numeric(gsub("[^0-9]", "", names(hyprcoloc_csets[[i]])))
                }
            }

            hypercoloc_report = res_hyprcoloc
            hypercoloc_report$coloc_set = causal_snp    
            hypercoloc_report$file = file
            hypercoloc_report$time = (t2 - t1)
        }

        # --- Purity filtering (keep coloc sets with LD purity > 0.5) ---
        purity_check = lapply(hypercoloc_report$coloc_set, function(x) (get_purity(x, X = X[[1]]))[1])
        purity_index = which(unlist(purity_check) > 0.5)

        if (length(purity_index) > 0) {
            hypercoloc_report$coloc_set = hypercoloc_report$coloc_set[purity_index]
            hypercoloc_report$coloc_trait = hypercoloc_report$coloc_trait[purity_index]
        } else {
            print("All filtered out")
            hypercoloc_report$coloc_set = NULL
            hypercoloc_report$coloc_trait = NULL
        }

        hypercoloc_report$purity = purity_check

        # --- Return result ---
        return(hypercoloc_report)
    }

    # ======= Execute and Save =======
    file = ${_input:ar,}
    hypercoloc_result = run_hypercoloc(file, ${trait}, setting = "${setting}")
    saveRDS(hypercoloc_result, ${_output:r})

Run MOLOC#

[moloc_set]
# ======= Define parameters =======
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 30
parameter: walltime = "50h"
parameter: mem = "40G"
parameter: numThreads = 1
parameter: trait = 50
parameter: container = ""

# ======= Workflow input/output =======
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntr_{trait}_moloc_set.rds'

task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'

# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 

    library("moloc")
    library("tidyverse")
    library("susieR")
    # install simulation package
    # devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
    # BiocManager::install("StatFunGen/pecotmr")
    library("pecotmr")
    library("simxQTL")

    # ======= Define run_moloc function =======
    run_moloc <- function(file, actual_pheno_number, setting = "normal") {
        
        # --- Load simulation RDS file ---
        rds = readRDS(file)

        # --- Prepare X (genotypes), Y (phenotypes), variant (causal SNPs) ---
        X = list()
        Y = list()
        variant = list()

        for (i in 1:actual_pheno_number) {
            X[[i]] = rds$X[, , drop = FALSE]
            Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
            variant[[i]] = rds$variant[[i]]
        }

        # --- Build input summary statistics tables for moloc ---
        tb <- list()
        cnt = 1

        for (i in 1:actual_pheno_number) {
            betas = c()
            sebetas = c()
            MAF = c()

            for (mm in 1:ncol(X[[i]])) {
                rr <- susieR::univariate_regression(X[[i]][, mm], Y[[i]])
                betas[mm] = rr$betahat
                sebetas[mm] = rr$sebetahat
                # Minor Allele Frequency (MAF) calculation
                MAF[mm] = min(sum(X[[i]][, mm]) / (2 * nrow(X[[i]])), 1 - sum(X[[i]][, mm]) / (2 * nrow(X[[i]])))
            }

            tb[[cnt]] = tibble(
                SNP = 1:ncol(X[[i]]),
                BETA = betas,
                SE = sebetas,
                N = nrow(X[[i]]),
                MAF = MAF
            )
            cnt = cnt + 1
        }

        # --- Run moloc on the prepared input ---
        options(scipen = 1, digits = 3)

        t1 <- Sys.time()
        moloc <- moloc_test(listData = tb, save.SNP.info = TRUE)
        t2 <- Sys.time()

        # --- Identify true coloc variants across traits ---
        all_var = unlist(variant)
        coloc_var_index = as.numeric(names(which(table(all_var) >= 2)))

        true_coloc_var = coloc_var_index
        true_coloc_trait = list()

        for (variant_index in 1:length(coloc_var_index)) {
            temp_vec = c()
            for (i in 1:length(variant)) {
                if (coloc_var_index[variant_index] %in% variant[[i]]) {
                    temp_vec = c(temp_vec, i)
                }
            }
            true_coloc_trait[[variant_index]] = temp_vec
        }

        # --- Extract colocalization result from moloc output ---
        index = which.max(moloc$priors_lkl_ppa$PPA)
        hypothesis_name = rownames(moloc$priors_lkl_ppa)[index]
        coloc_traits = unlist(strsplit(hypothesis_name, ","))

        # Find the hypothesis with maximum number of traits coloc'd
        trait_num = max(nchar(coloc_traits))
        coloc_trait = coloc_traits[which.max(nchar(coloc_traits))]

        # Decode coloc trait indices (letters -> numbers)
        coloc_trait_index = c()
        for (item in unlist(strsplit(coloc_trait, NULL))) {
            temp_index = as.integer(charToRaw(item)) - as.integer(charToRaw("a")) + 1
            coloc_trait_index = c(coloc_trait_index, temp_index)
        }

        # --- Find credible SNP set covering 95% probability ---
        index2 = which(rownames(moloc$best_snp[[1]]) == coloc_trait)
        colname = colnames(moloc$best_snp[[2]])[index2]

        row_index <- moloc$best_snp[[2]] %>%
            as_tibble() %>%
            arrange(desc(!!sym(colname))) %>%
            mutate(row_number = row_number()) %>%
            mutate(cumulative_sum = cumsum(!!sym(colname))) %>%
            filter(cumulative_sum >= 0.95) %>%
            pull(row_number) %>%
            min()

        # Identify top SNPs up to 95% credible set
        snps = moloc$best_snp[[2]] %>%
            as_tibble() %>%
            mutate(row_number = row_number()) %>%
            arrange(desc(!!sym(colname))) %>%
            pull(row_number)
        snps = snps[1:row_index]

        # --- Assemble moloc report ---
        moloc_report = moloc
        moloc_report$file = file
        moloc_report$true_variant = true_coloc_var
        moloc_report$true_trait = true_coloc_trait

        if (length(coloc_trait_index) >= 2) {
            moloc_report$coloc_set = list(snps)
            moloc_report$coloc_trait = list(coloc_trait_index)
        } else {
            # If only one trait coloc, treat as fine-mapping only (not true coloc)
            moloc_report$coloc_set = NULL
            moloc_report$coloc_trait = NULL
        }

        # --- Purity filtering (keep coloc sets with LD-purity > 0.5) ---
        purity_check = lapply(moloc_report$coloc_set, function(x) (get_purity(x, X = X[[1]]))[1])
        purity_index = which(unlist(purity_check) > 0.5)

        if (length(purity_index) > 0) {
            moloc_report$coloc_set = moloc_report$coloc_set[purity_index]
            moloc_report$coloc_trait = moloc_report$coloc_trait[purity_index]
        } else {
            print("All filtered out")
            moloc_report$coloc_set = NULL
            moloc_report$coloc_trait = NULL
        }

        moloc_report$time = (t2 - t1)
        moloc_report$purity = purity_check

        return(moloc_report)
    }

    # ======= Execute and Save =======
    files = c(${_input:ar,})
    moloc_result = run_moloc(files, ${trait})
    saveRDS(moloc_result, ${_output:r})

Run COLOC V5 (SuSiE coloc)#

[susie_coloc_set]
# ======= Define parameters =======
parameter: simufile = paths
parameter: cwd = path("output")
parameter: job_size = 300
parameter: walltime = "50h"
parameter: mem = "40G"
parameter: numThreads = 1
parameter: trait = 2
parameter: container = ""

# ======= Workflow input/output =======
input: simufile, group_by = 1
output: f'{cwd:a}/{_input[0]:bn}_ntr_{trait}_susie_coloc_result.rds'

task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'

# ======= Main R block =======
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    library("susieR")
    library("coloc")
    library("tidyverse")
    # install simulation package
    # devtools::install_github("StatFunGen/simxQTL", build_vignettes = FALSE)
    # BiocManager::install("StatFunGen/pecotmr")
    library("pecotmr")
    library("simxQTL")
  
    run_susie_coloc <- function(file, actual_pheno_number = 2) {


        # --- Load simulation file ---
        rds = readRDS(file)

        # --- Extract X, Y, variant for each trait ---
        X = list()
        Y = list()
        variant = list()

        for (i in 1:actual_pheno_number) {
            X[[i]] = rds$X[, , drop = FALSE]
            Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])
            variant[[i]] = rds$variant[[i]]
        }

        # --- Prepare for SuSiE fine-mapping (assume two traits) ---
        Xe = X[[1]]  # First trait
        Xp = X[[2]]  # Second trait
        Ye = Y[[1]]
        Yp = Y[[2]]

        # --- Estimate marginal betas and varbetas ---
        beta_p = varbeta_p = rep(0, ncol(Xp))
        beta_e = varbeta_e = rep(0, ncol(Xe))

        for (mm in 1:ncol(Xe)) {
            rr <- susieR::univariate_regression(Yp, Xp[, mm])
            beta_p[mm] = rr$betahat
            varbeta_p[mm] = rr$sebetahat^2

            rr <- susieR::univariate_regression(Ye, Xe[, mm])
            beta_e[mm] = rr$betahat
            varbeta_e[mm] = rr$sebetahat^2
        }

        # --- Construct SuSiE input for both traits ---
        LDp = cor(Xp)
        LDe = cor(Xe)
        snp = paste0("snp", 1:ncol(Xe))
        colnames(LDe) = rownames(LDe) = colnames(LDp) = rownames(LDp) = snp

        dp = list(
            "beta" = beta_p, "varbeta" = varbeta_p,
            "LD" = LDp, "snp" = snp, "N" = nrow(Xp),
            "type" = "quant", "sdY" = sd(Yp)
        )

        de = list(
            "beta" = beta_e, "varbeta" = varbeta_e,
            "LD" = LDe, "snp" = snp, "N" = nrow(Xe),
            "type" = "quant", "sdY" = sd(Ye)
        )

        # --- Run SuSiE fine-mapping and coloc.susie ---
        t1 <- Sys.time()
        out_p <- runsusie(dp)
        out_e <- runsusie(de)
        out_coloc = coloc.susie(out_e, out_p)
        t2 <- Sys.time()

        # --- Identify true colocalized variants ---
        all_var = unlist(variant)
        coloc_var_index = as.numeric(names(which(table(all_var) >= 2)))

        true_coloc_var = coloc_var_index
        true_coloc_trait = list()

        for (variant_index in 1:length(coloc_var_index)) {
            temp_vec = c()
            for (i in 1:length(variant)) {
                if (coloc_var_index[variant_index] %in% variant[[i]]) {
                    temp_vec = c(temp_vec, i)
                }
            }
            true_coloc_trait[[variant_index]] = temp_vec
        }

        ### --- Post-process coloc summary results ---
        df <- out_coloc$summary

        if (is.null(df)) {
            out_coloc = list()
            sets = NULL
            coloc_trait_index = NULL

        } else {
            # Find rows where PP.H4 is maximum
            index <- which(df$PP.H4.abf > df$PP.H1.abf & df$PP.H4.abf > df$PP.H2.abf & df$PP.H4.abf > df$PP.H3.abf)

            if (length(index) > 0) {
                tb <- out_coloc$results
                number = 1

                # For each H4-supported region
                for (i in index) {
                    new_index = i + 1
                    colname = names(tb)[new_index]

                    row_index <- tb %>%
                        as_tibble() %>%
                        arrange(desc(!!sym(colname))) %>%
                        mutate(row_number = row_number()) %>%
                        mutate(cumulative_sum = cumsum(!!sym(colname))) %>%
                        filter(cumulative_sum >= 0.95) %>%
                        pull(row_number) %>%
                        min()

                    snps = tb %>%
                        as_tibble() %>%
                        arrange(desc(!!sym(colname))) %>%
                        pull(snp)

                    if (number == 1) {
                        sets = list(as.numeric(gsub("snp", "", snps[1:row_index])))
                    } else {
                        new_list = as.numeric(gsub("snp", "", snps[1:row_index]))
                        sets = c(sets, list(new_list))
                    }
                    number = number + 1
                }
            } else {
                sets = NULL
            }

            coloc_trait_index = list()
            if (length(sets) == 0) {
                coloc_trait_index = NULL
            } else {
                for (j in 1:length(sets)) {
                    coloc_trait_index[[j]] = c(1, 2)
                }
            }
        }

        # --- Assemble final report ---
        susie_coloc_report = out_coloc
        susie_coloc_report$file = file
        susie_coloc_report$true_variant = true_coloc_var
        susie_coloc_report$true_trait = true_coloc_trait
        susie_coloc_report$coloc_set = sets
        susie_coloc_report$coloc_trait = coloc_trait_index
        susie_coloc_report$time = (t2 - t1)

        # --- Purity filtering (LD purity > 0.5) ---
        purity_check = lapply(susie_coloc_report$coloc_set, function(x) (get_purity(x, X = X[[1]]))[1])
        purity_index = which(unlist(purity_check) > 0.5)

        if (length(purity_index) > 0) {
            susie_coloc_report$coloc_set = susie_coloc_report$coloc_set[purity_index]
            susie_coloc_report$coloc_trait = susie_coloc_report$coloc_trait[purity_index]
        } else {
            print("All filtered out")
            susie_coloc_report$coloc_set = NULL
            susie_coloc_report$coloc_trait = NULL
        }

        susie_coloc_report$purity = purity_check

        return(susie_coloc_report)
    }

    # ======= Execute and Save =======
    file = ${_input:ar,}
    susie_coloc_result = run_susie_coloc(file)
    saveRDS(susie_coloc_result, ${_output:r})

Bash files#

Hyprcoloc#

## 2 trait

data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_2trait/"
job="hyp_real_simulation_2trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=20000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run ./3_Other_Colocalization_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 20G --trait 2 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done

## 5 trait

data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_5trait/"
job="hyp_real_simulation_5trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=20000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 20G --trait 5 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3 4; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done

## 10 trait

data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_10trait/"
job="hyp_real_simulation_10trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=30000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 30G --trait 10 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3 4; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done

## 20 trait

data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_20trait/"
job="hyp_real_simulation_20trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/hyprcoloc/"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb hyprcoloc_set \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 40G --trait 20 \
    --cwd WORK_DIR/JOB/result
EOF

base_script="base_script"
for ncausal in 1 2 3 4; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done

MOLOC#

data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_2trait/"
job="moloc_real_simulation_2trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/moloc"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=15000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb moloc_set \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 15G --trait 2 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done

COLOC V5 (SuSiE-COLOC)#

data_dir="/home/hs3393/cb_Mar/simulation_data/real_simulation_2trait/"
job="susie_coloc_real_simulation_2trait"
work_dir="/home/hs3393/cb_Mar/simulation_result/susie_coloc"
#!/bin/bash

mkdir -p ${work_dir}/${job}/code
mkdir -p ${work_dir}/${job}/log
mkdir -p ${work_dir}/${job}/result

cd ${work_dir}/${job}/code

cat << 'EOF' > base_script
#!/bin/bash -l
# NOTE the -l flag!
#
#SBATCH -t 80:00:00
#SBATCH --mem=40000
#SBATCH -J JOB
#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out
#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err

source /home/hs3393/mamba_activate.sh
module load Singularity

cd DATA_DIR
sos run /home/hs3393/cb_Mar/simulation_code/3_Other_Colocalization_Methods.ipynb susie_coloc_set \
    --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \
    --mem 40G --trait 2 \
    --cwd WORK_DIR/JOB/result
EOF


base_script="base_script"
for ncausal in 1 2 3; do
    output_script="ncausal_${ncausal}.sh"
    cat ${base_script}| sed "s|WORK_DIR|${work_dir}|g" |sed "s|NCAUSAL|${ncausal}|g" | sed "s|JOB|${job}|g" | sed "s|DATA_DIR|${data_dir}|g"  > ${output_script}
    sbatch ${output_script}
done