{ "cells": [ { "cell_type": "markdown", "id": "87491953-99d3-47a0-a7e9-1b6a24854efa", "metadata": { "kernel": "SoS", "tags": [] }, "source": [ "# Run Colocboost" ] }, { "cell_type": "markdown", "id": "423c6c8f-e199-48ec-a13f-40aa6bcbce77", "metadata": { "kernel": "SoS", "tags": [] }, "source": [ "## Goal\n", "\n", "This notebook takes the input and run colocboost, the result can be summarized to get power and FDR.\n", "\n", "Because different methods have different output, we summarized four elements to calculate FDR and power - coloc_trait, coloc_set true_trait and true_variant.\n", "\n", "## Input \n", "\n", "Individual level data X and Y or summary statistics z and LD from other notebooks. file names of these data should be put in the parameter: simufile.\n", "\n", "## Output\n", "\n", "Colocboost original result, along with some summarized elements.\n", "\n", "Example output:\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "c7c25a1d-7dc2-4c94-8702-ea28a1df6925", "metadata": { "kernel": "R", "tags": [] }, "outputs": [ { "data": { "text/html": [ "
    \n", "\t
  1. \n", "
    1. 196
    2. 5616
    3. 6566
    \n", "
  2. \n", "\t
  3. \n", "
    1. 5616
    2. 6566
    \n", "
  4. \n", "\t
  5. \n", "
    1. 196
    2. 6566
    \n", "
  6. \n", "\t
  7. \n", "
    1. 196
    2. 5616
    \n", "
  8. \n", "\t
  9. \n", "
    1. 196
    2. 6566
    \n", "
  10. \n", "
\n" ], "text/latex": [ "\\begin{enumerate}\n", "\\item \\begin{enumerate*}\n", "\\item 196\n", "\\item 5616\n", "\\item 6566\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 5616\n", "\\item 6566\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 196\n", "\\item 6566\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 196\n", "\\item 5616\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 196\n", "\\item 6566\n", "\\end{enumerate*}\n", "\n", "\\end{enumerate}\n" ], "text/markdown": [ "1. 1. 196\n", "2. 5616\n", "3. 6566\n", "\n", "\n", "\n", "2. 1. 5616\n", "2. 6566\n", "\n", "\n", "\n", "3. 1. 196\n", "2. 6566\n", "\n", "\n", "\n", "4. 1. 196\n", "2. 5616\n", "\n", "\n", "\n", "5. 1. 196\n", "2. 6566\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "[[1]]\n", "[1] 196 5616 6566\n", "\n", "[[2]]\n", "[1] 5616 6566\n", "\n", "[[3]]\n", "[1] 196 6566\n", "\n", "[[4]]\n", "[1] 196 5616\n", "\n", "[[5]]\n", "[1] 196 6566\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[1] \"=====Prediction=====\"\n" ] }, { "data": { "text/html": [ "
\n", "\t
$`cos1:y1_y3_y4_y5`
\n", "\t\t
196
\n", "\t
$`cos2:y1_y2_y3_y5`
\n", "\t\t
\n", "
  1. 6566
  2. 6535
\n", "
\n", "\t
$`cos3:y1_y2_y4`
\n", "\t\t
\n", "
  1. 5616
  2. 5621
  3. 5628
  4. 5629
  5. 5632
  6. 5636
  7. 5637
  8. 5634
  9. 5635
  10. 5627
  11. 5623
  12. 5625
  13. 5615
\n", "
\n", "
\n" ], "text/latex": [ "\\begin{description}\n", "\\item[\\$`cos1:y1\\_y3\\_y4\\_y5`] 196\n", "\\item[\\$`cos2:y1\\_y2\\_y3\\_y5`] \\begin{enumerate*}\n", "\\item 6566\n", "\\item 6535\n", "\\end{enumerate*}\n", "\n", "\\item[\\$`cos3:y1\\_y2\\_y4`] \\begin{enumerate*}\n", "\\item 5616\n", "\\item 5621\n", "\\item 5628\n", "\\item 5629\n", "\\item 5632\n", "\\item 5636\n", "\\item 5637\n", "\\item 5634\n", "\\item 5635\n", "\\item 5627\n", "\\item 5623\n", "\\item 5625\n", "\\item 5615\n", "\\end{enumerate*}\n", "\n", "\\end{description}\n" ], "text/markdown": [ "$`cos1:y1_y3_y4_y5`\n", ": 196\n", "$`cos2:y1_y2_y3_y5`\n", ": 1. 6566\n", "2. 6535\n", "\n", "\n", "\n", "$`cos3:y1_y2_y4`\n", ": 1. 5616\n", "2. 5621\n", "3. 5628\n", "4. 5629\n", "5. 5632\n", "6. 5636\n", "7. 5637\n", "8. 5634\n", "9. 5635\n", "10. 5627\n", "11. 5623\n", "12. 5625\n", "13. 5615\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "$`cos1:y1_y3_y4_y5`\n", "[1] 196\n", "\n", "$`cos2:y1_y2_y3_y5`\n", "[1] 6566 6535\n", "\n", "$`cos3:y1_y2_y4`\n", " [1] 5616 5621 5628 5629 5632 5636 5637 5634 5635 5627 5623 5625 5615\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
    \n", "\t
  1. \n", "
    1. 1
    2. 3
    3. 4
    4. 5
    \n", "
  2. \n", "\t
  3. \n", "
    1. 1
    2. 2
    3. 3
    4. 5
    \n", "
  4. \n", "\t
  5. \n", "
    1. 1
    2. 2
    3. 4
    \n", "
  6. \n", "
\n" ], "text/latex": [ "\\begin{enumerate}\n", "\\item \\begin{enumerate*}\n", "\\item 1\n", "\\item 3\n", "\\item 4\n", "\\item 5\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 1\n", "\\item 2\n", "\\item 3\n", "\\item 5\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 1\n", "\\item 2\n", "\\item 4\n", "\\end{enumerate*}\n", "\n", "\\end{enumerate}\n" ], "text/markdown": [ "1. 1. 1\n", "2. 3\n", "3. 4\n", "4. 5\n", "\n", "\n", "\n", "2. 1. 1\n", "2. 2\n", "3. 3\n", "4. 5\n", "\n", "\n", "\n", "3. 1. 1\n", "2. 2\n", "3. 4\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "[[1]]\n", "[1] 1 3 4 5\n", "\n", "[[2]]\n", "[1] 1 2 3 5\n", "\n", "[[3]]\n", "[1] 1 2 4\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[1] \"=====Truth=====\"\n" ] }, { "data": { "text/html": [ "\n", "
  1. 196
  2. 5616
  3. 6566
\n" ], "text/latex": [ "\\begin{enumerate*}\n", "\\item 196\n", "\\item 5616\n", "\\item 6566\n", "\\end{enumerate*}\n" ], "text/markdown": [ "1. 196\n", "2. 5616\n", "3. 6566\n", "\n", "\n" ], "text/plain": [ "[1] 196 5616 6566" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
    \n", "\t
  1. \n", "
    1. 1
    2. 3
    3. 4
    4. 5
    \n", "
  2. \n", "\t
  3. \n", "
    1. 1
    2. 2
    3. 4
    \n", "
  4. \n", "\t
  5. \n", "
    1. 1
    2. 2
    3. 3
    4. 5
    \n", "
  6. \n", "
\n" ], "text/latex": [ "\\begin{enumerate}\n", "\\item \\begin{enumerate*}\n", "\\item 1\n", "\\item 3\n", "\\item 4\n", "\\item 5\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 1\n", "\\item 2\n", "\\item 4\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 1\n", "\\item 2\n", "\\item 3\n", "\\item 5\n", "\\end{enumerate*}\n", "\n", "\\end{enumerate}\n" ], "text/markdown": [ "1. 1. 1\n", "2. 3\n", "3. 4\n", "4. 5\n", "\n", "\n", "\n", "2. 1. 1\n", "2. 2\n", "3. 4\n", "\n", "\n", "\n", "3. 1. 1\n", "2. 2\n", "3. 3\n", "4. 5\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "[[1]]\n", "[1] 1 3 4 5\n", "\n", "[[2]]\n", "[1] 1 2 4\n", "\n", "[[3]]\n", "[1] 1 2 3 5\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "result = readRDS(\"/home/hs3393/cb_Mar/simulation_result/real_simulation_5trait/result/sample_999_real_simulation_3_ncausal_5_trait_ntrait_5_colocboost.rds\")\n", "# true variant and corresponding trait index\n", "result$var\n", "\n", "print(\"=====Prediction=====\")\n", "# colocalization result - CoS\n", "result$coloc_set\n", "\n", "# colocalization result - colocalizing trait\n", "result$coloc_trait\n", "\n", "print(\"=====Truth=====\")\n", "# true colocalizing CoS\n", "result$true_variant\n", "\n", "# true colocalizing trait\n", "result$true_trait" ] }, { "cell_type": "markdown", "id": "31724ff3-56bd-4a22-8f43-5b2fad272cd9", "metadata": { "kernel": "SoS", "tags": [] }, "source": [ "## Colocboost Running" ] }, { "cell_type": "code", "execution_count": null, "id": "6d0bde3f-fc54-47c1-9d7c-028520470ae4", "metadata": { "kernel": "SoS" }, "outputs": [], "source": [ "[colocboost]\n", "parameter: simufile = paths\n", "parameter: cwd = path(\"output\")\n", "parameter: job_size = 15\n", "parameter: walltime = \"80h\"\n", "parameter: mem = \"60G\"\n", "parameter: numThreads = 3\n", "parameter: trait = 10\n", "parameter: container = \"\"\n", "input: simufile, group_by = 1\n", "output: f'{cwd:a}/{_input[0]:bn}_ntrait_{trait}_{step_name}.rds'\n", "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", "R: expand = '${ }', stdout = f\"{_output:n}.stdout\", stderr = f\"{_output:n}.stderr\", container = container \n", " for(file in list.files(\"/home/xc2270/COLOCBoost/code_COLOCBoost/colocboost_updating/\", full.names = T)) {source(file)}\n", " X = list()\n", " Y = list()\n", " variant = list()\n", " rds = readRDS(${_input:ar})\n", "\n", " for(i in 1:${trait}){\n", " X[[i]] = rds$X\n", " Y[[i]] = as.matrix(rds$Y[, i, drop = FALSE])\n", " variant[[i]] = rds$variant[[i]]\n", " }\n", "\n", " start_time <- Sys.time()\n", " colocboost_result = colocboost(\n", " X = X, \n", " Y = Y\n", " )\n", " end_time <- Sys.time()\n", "\n", " # record true variant, analysed trait number and corresponding file name\n", " colocboost_result$var = variant\n", " colocboost_result$trait_num = ${trait}\n", " colocboost_result$file = \"${_input[0]:a}\"\n", "\n", " # In real setting (list: variant), show which snp appear in at least two traits\n", " all_var = unlist(variant)\n", " true_var = as.numeric(names(which(table(all_var) >= 2)))\n", " true_trait = list()\n", "\n", " # Iterate through the variant list to find which traits the true_var is colocalized on\n", " for (variant_index in 1:length(true_var)){\n", " temp_vec = c()\n", " for(i in 1:length(variant)){\n", " if(true_var[variant_index] %in% variant[[i]]){\n", " temp_vec = c(temp_vec, i)\n", " }\n", " }\n", " true_trait[[variant_index]] = temp_vec\n", " }\n", "\n", " library(stringr)\n", " coloc_trait = list()\n", "\n", " # if no coloc sets detected, assign coloc_trait as NULL as well\n", " if(length(colocboost_result$cos_details$cos$cos_index) == 0){\n", " coloc_trait = NULL \n", " } else {\n", " for(i in 1:length(colocboost_result$cos_details$cos_outcomes$outcome_index)){\n", " coloc_trait[[i]] = colocboost_result$cos_details$cos_outcomes$outcome_index[[i]]\n", " }\n", " }\n", "\n", " colocboost_result$true_variant = true_var\n", " colocboost_result$true_trait = true_trait\n", " colocboost_result$coloc_set = colocboost_result$cos_details$cos$cos_index\n", " colocboost_result$coloc_trait = coloc_trait\n", " colocboost_result$time = end_time - start_time\n", " saveRDS(colocboost_result, ${_output:r})" ] }, { "cell_type": "markdown", "id": "bd872451-4f86-4642-91e8-60951ab47a5b", "metadata": { "kernel": "SoS" }, "source": [ "## Job submission\n", "\n", "Real configuration simulation running - 2, 5, 10 and 20 traits." ] }, { "cell_type": "code", "execution_count": null, "id": "6ace413b-837f-4a6f-a1f2-f0b0af747092", "metadata": { "kernel": "SoS" }, "outputs": [], "source": [ "data_dir=\"/home/hs3393/cb_Mar/simulation_data/\"\n", "job=\"real_simulation_2trait\"\n", "work_dir=\"/home/hs3393/cb_Mar/simulation_result/\"\n", "\n", "mkdir -p ${work_dir}/${job}/code\n", "mkdir -p ${work_dir}/${job}/log\n", "mkdir -p ${work_dir}/${job}/result\n", "\n", "cd ${work_dir}/${job}/code\n", "\n", "cat << 'EOF' > base_script\n", "#!/bin/bash -l\n", "# NOTE the -l flag!\n", "#\n", "#SBATCH -t 80:00:00\n", "#SBATCH --mem=30000\n", "#SBATCH -J JOB\n", "#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out\n", "#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err\n", "\n", "source /home/hs3393/mamba_activate.sh\n", "module load Singularity\n", "\n", "cd DATA_DIR/JOB\n", "sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \\\n", " --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \\\n", " --mem 40G --trait 2 \\\n", " --cwd WORK_DIR/JOB/result\n", "EOF\n", "\n", "\n", "base_script=\"base_script\"\n", "for ncausal in 1 2 3; do\n", " output_script=\"ncausal_${ncausal}.sh\"\n", " cat ${base_script}| sed \"s|WORK_DIR|${work_dir}|g\" |sed \"s|NCAUSAL|${ncausal}|g\" | sed \"s|JOB|${job}|g\" | sed \"s|DATA_DIR|${data_dir}|g\" > ${output_script}\n", " sbatch ${output_script}\n", "done\n", "\n", "\n", "data_dir=\"/home/hs3393/cb_Mar/simulation_data/\"\n", "job=\"real_simulation_5trait\"\n", "work_dir=\"/home/hs3393/cb_Mar/simulation_result/\"\n", "#!/bin/bash\n", "\n", "mkdir -p ${work_dir}/${job}/code\n", "mkdir -p ${work_dir}/${job}/log\n", "mkdir -p ${work_dir}/${job}/result\n", "\n", "cd ${work_dir}/${job}/code\n", "\n", "cat << 'EOF' > base_script\n", "#!/bin/bash -l\n", "# NOTE the -l flag!\n", "#\n", "#SBATCH -t 80:00:00\n", "#SBATCH --mem=30000\n", "#SBATCH -J JOB\n", "#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out\n", "#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err\n", "\n", "source /home/hs3393/mamba_activate.sh\n", "module load Singularity\n", "\n", "cd DATA_DIR/JOB\n", "sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \\\n", " --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \\\n", " --mem 40G --trait 5 \\\n", " --cwd WORK_DIR/JOB/result\n", "EOF\n", "\n", "\n", "base_script=\"base_script\"\n", "for ncausal in 1 2 3 4; do\n", " output_script=\"ncausal_${ncausal}.sh\"\n", " cat ${base_script}| sed \"s|WORK_DIR|${work_dir}|g\" |sed \"s|NCAUSAL|${ncausal}|g\" | sed \"s|JOB|${job}|g\" | sed \"s|DATA_DIR|${data_dir}|g\" > ${output_script}\n", " sbatch ${output_script}\n", "done\n", "\n", "\n", "\n", "data_dir=\"/home/hs3393/cb_Mar/simulation_data/\"\n", "job=\"real_simulation_10trait\"\n", "work_dir=\"/home/hs3393/cb_Mar/simulation_result/\"\n", "#!/bin/bash\n", "\n", "mkdir -p ${work_dir}/${job}/code\n", "mkdir -p ${work_dir}/${job}/log\n", "mkdir -p ${work_dir}/${job}/result\n", "\n", "cd ${work_dir}/${job}/code\n", "\n", "cat << 'EOF' > base_script\n", "#!/bin/bash -l\n", "# NOTE the -l flag!\n", "#\n", "#SBATCH -t 80:00:00\n", "#SBATCH --mem=40000\n", "#SBATCH -J JOB\n", "#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out\n", "#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err\n", "\n", "source /home/hs3393/mamba_activate.sh\n", "module load Singularity\n", "\n", "cd DATA_DIR/JOB\n", "sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \\\n", " --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \\\n", " --mem 40G --trait 10 \\\n", " --cwd WORK_DIR/JOB/result\n", "EOF\n", "\n", "\n", "base_script=\"base_script\"\n", "for ncausal in 1 2 3 4 5; do\n", " output_script=\"ncausal_${ncausal}.sh\"\n", " cat ${base_script}| sed \"s|WORK_DIR|${work_dir}|g\" |sed \"s|NCAUSAL|${ncausal}|g\" | sed \"s|JOB|${job}|g\" | sed \"s|DATA_DIR|${data_dir}|g\" > ${output_script}\n", " sbatch ${output_script}\n", "done\n", "\n", "\n", "data_dir=\"/home/hs3393/cb_Mar/simulation_data/\"\n", "job=\"real_simulation_20trait\"\n", "work_dir=\"/home/hs3393/cb_Mar/simulation_result/\"\n", "#!/bin/bash\n", "\n", "mkdir -p ${work_dir}/${job}/code\n", "mkdir -p ${work_dir}/${job}/log\n", "mkdir -p ${work_dir}/${job}/result\n", "\n", "cd ${work_dir}/${job}/code\n", "\n", "cat << 'EOF' > base_script\n", "#!/bin/bash -l\n", "# NOTE the -l flag!\n", "#\n", "#SBATCH -t 80:00:00\n", "#SBATCH --mem=40000\n", "#SBATCH -J JOB\n", "#SBATCH -o WORK_DIR/JOB/log/JOB.%j.out\n", "#SBATCH -e WORK_DIR/JOB/log/JOB.%j.err\n", "\n", "source /home/hs3393/mamba_activate.sh\n", "module load Singularity\n", "\n", "cd DATA_DIR/JOB\n", "sos run /home/hs3393/cb_Mar/simulation_code/2.Run_Colocboost.ipynb colocboost \\\n", " --simufile $(find -type f -name '*_NCAUSAL_ncausal_*.rds') \\\n", " --mem 40G --trait 20 \\\n", " --cwd WORK_DIR/JOB/result\n", "EOF\n", "\n", "\n", "base_script=\"base_script\"\n", "for ncausal in 1 2 3 4 5; do\n", " output_script=\"ncausal_${ncausal}.sh\"\n", " cat ${base_script}| sed \"s|WORK_DIR|${work_dir}|g\" |sed \"s|NCAUSAL|${ncausal}|g\" | sed \"s|JOB|${job}|g\" | sed \"s|DATA_DIR|${data_dir}|g\" > ${output_script}\n", " sbatch ${output_script}\n", "done" ] } ], "metadata": { "kernelspec": { "display_name": "SoS", "language": "sos", "name": "sos" }, "language_info": { "codemirror_mode": "sos", "file_extension": ".sos", "mimetype": "text/x-sos", "name": "sos", "nbconvert_exporter": "sos_notebook.converter.SoS_Exporter", "pygments_lexer": "sos" }, "sos": { "kernels": [ [ "R", "ir", "R", "", "r" ], [ "SoS", "sos", "", "", "sos" ] ], "version": "0.24.3" } }, "nbformat": 4, "nbformat_minor": 5 }