From 2c57a5d7ddeafa20285d0427e582e6b1a12ffe5e Mon Sep 17 00:00:00 2001 From: bsantan <70932395+bsantan@users.noreply.github.com> Date: Tue, 3 Sep 2024 13:10:13 -0600 Subject: [PATCH 1/6] add slurm script --- hpc/run_parallel_merge.sl | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 hpc/run_parallel_merge.sl diff --git a/hpc/run_parallel_merge.sl b/hpc/run_parallel_merge.sl new file mode 100644 index 0000000..f7cb633 --- /dev/null +++ b/hpc/run_parallel_merge.sl @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --account=m4689 +#SBATCH --qos=regular +#SBATCH --constraint=cpu +#SBATCH --time=360 +#SBATCH --ntasks=1 +#SBATCH --mem=425GB +#SBATCH --job-name=parallel_merge +#SBATCH --output=parallel_merge_%A_%a.out +#SBATCH --error=parallel_merge_%A_%a.err +#SBATCH --array=0-10 +#SBATCH -N 1 + +module load python/3.10 +cd kg-microbe-merge +python -m venv venv-merge +source venv-merge/bin/activate +pip install poetry +poetry install + +# Array of merged graph names +merges=( + kg-microbe-core + kg-microbe-function + kg-microbe-biomedical + kg-microbe-biomedical-function +) + +# Get the merge for this job array task +merge=${merges[$SLURM_ARRAY_TASK_ID]} + +echo "Starting $merge" +time poetry run make $merge +echo "Finished $merge" From 68ac5f611c3efbda4224274c3edc6343b94c9a96 Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Wed, 4 Sep 2024 12:40:07 -0500 Subject: [PATCH 2/6] Only 0-3 array elements --- hpc/run_parallel_merge.sl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpc/run_parallel_merge.sl b/hpc/run_parallel_merge.sl index f7cb633..1c841f4 100644 --- a/hpc/run_parallel_merge.sl +++ b/hpc/run_parallel_merge.sl @@ -8,7 +8,7 @@ #SBATCH --job-name=parallel_merge #SBATCH --output=parallel_merge_%A_%a.out #SBATCH --error=parallel_merge_%A_%a.err -#SBATCH --array=0-10 +#SBATCH --array=0-3 #SBATCH -N 1 module load python/3.10 From 3bb6ddfb3e95d10f311980e6c903e102b83ce64c Mon Sep 17 00:00:00 2001 From: Harshad Hegde 
Date: Thu, 5 Sep 2024 14:39:20 -0500 Subject: [PATCH 3/6] make node and edge db files unique to each merge label --- kg_microbe_merge/utils/duckdb_utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kg_microbe_merge/utils/duckdb_utils.py b/kg_microbe_merge/utils/duckdb_utils.py index f1f3e56..1fe8cdc 100644 --- a/kg_microbe_merge/utils/duckdb_utils.py +++ b/kg_microbe_merge/utils/duckdb_utils.py @@ -1,6 +1,7 @@ """Utility functions for working with DuckDB in the KG Microbe Merge project.""" import os +from pathlib import Path from typing import List import duckdb @@ -304,7 +305,9 @@ def duckdb_nodes_merge(nodes_file_list, output_file, priority_sources, batch_siz :param priority_sources: List of source names to prioritize. """ # Create a DuckDB connection - conn = duckdb.connect("nodes.db") + merge_label = Path(output_file).parent.name + nodes_db_file = f"{merge_label}_nodes.db" + conn = duckdb.connect(nodes_db_file) # Load the files into DuckDB load_into_duckdb(conn, nodes_file_list, "combined_nodes") @@ -379,7 +382,7 @@ def duckdb_nodes_merge(nodes_file_list, output_file, priority_sources, batch_siz finally: # Close the connection conn.close() - os.remove("nodes.db") + os.remove(nodes_db_file) def duckdb_edges_merge(edges_file_list, output_file, batch_size=1000000): @@ -416,7 +419,8 @@ def duckdb_edges_merge(edges_file_list, output_file, batch_size=1000000): memory usage and allows for processing of very large datasets that exceed available RAM. 
""" os.makedirs(TMP_DIR, exist_ok=True) - db_file = "edges_persistent.db" + merge_label = Path(output_file).parent.name + db_file = f"{merge_label}_edges_persistent.db" conn = duckdb.connect(db_file) try: From 3d91e0e439aa9ddf8524a89f6b7976aaaacbad3e Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Fri, 6 Sep 2024 09:04:42 -0500 Subject: [PATCH 4/6] wrong name --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 42b4a24..79611ce 100644 --- a/Makefile +++ b/Makefile @@ -11,5 +11,5 @@ kg-microbe-function: kg-microbe-biomedical: poetry run kg merge -m duckdb -n 1000000 -e 100000 -s "bacdive, mediadive, madin_etal, rhea_mappings, bactotraits, chebi, ec, envo, go, ncbitaxon, upa, hp, mondo, ctd, wallen_etal, uniprot_human" --merge-label $@ -kg-microbe-biomedical-function-merge: +kg-microbe-biomedical-function: poetry run kg merge -m duckdb -n 1000000 -e 100000 -s "bacdive, mediadive, madin_etal, rhea_mappings, bactotraits, chebi, ec, envo, go, ncbitaxon, upa, hp, mondo, ctd, wallen_etal, uniprot_human, uniprot_functional_microbes" --merge-label $@ \ No newline at end of file From 3069004e831e3d2b46788b300b4203bd2333bb95 Mon Sep 17 00:00:00 2001 From: realmarcin Date: Mon, 9 Sep 2024 10:50:03 -0700 Subject: [PATCH 5/6] slurm updates --- hpc/run_parallel_merge.sl | 2 -- hpc/run_parallel_merge_biomedical_function.sl | 33 +++++++++++++++++++ hpc/run_parallel_merge_function.sl | 33 +++++++++++++++++++ 3 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 hpc/run_parallel_merge_biomedical_function.sl create mode 100644 hpc/run_parallel_merge_function.sl diff --git a/hpc/run_parallel_merge.sl b/hpc/run_parallel_merge.sl index 1c841f4..d69addc 100644 --- a/hpc/run_parallel_merge.sl +++ b/hpc/run_parallel_merge.sl @@ -21,9 +21,7 @@ poetry install # Array of merged graph names merges=( kg-microbe-core - kg-microbe-function kg-microbe-biomedical - kg-microbe-biomedical-function ) # Get the merge for this job 
array task diff --git a/hpc/run_parallel_merge_biomedical_function.sl b/hpc/run_parallel_merge_biomedical_function.sl new file mode 100644 index 0000000..83d1ad4 --- /dev/null +++ b/hpc/run_parallel_merge_biomedical_function.sl @@ -0,0 +1,33 @@ +#!/bin/bash +#SBATCH --account=m4689 +#SBATCH --qos=regular +#SBATCH --constraint=cpu +#SBATCH --time=360 +#SBATCH --ntasks=1 +#SBATCH --mem=425GB +#SBATCH --job-name=parallel_merge +#SBATCH --output=parallel_merge_%A_%a.out +#SBATCH --error=parallel_merge_%A_%a.err +#SBATCH --array=0 +#SBATCH -N 1 +#SBATCH --mail-type=BEGIN,END +#SBATCH --mail-user=MJoachimiak@lbl.gov + +module load python/3.10 +cd kg-microbe-merge +python -m venv venv-merge +source venv-merge/bin/activate +pip install poetry +poetry install + +# Array of merged graph names +merges=( + kg-microbe-biomedical-function +) + +# Get the merge for this job array task +merge=${merges[$SLURM_ARRAY_TASK_ID]} + +echo "Starting $merge" +time poetry run make $merge +echo "Finished $merge" diff --git a/hpc/run_parallel_merge_function.sl b/hpc/run_parallel_merge_function.sl new file mode 100644 index 0000000..673dd56 --- /dev/null +++ b/hpc/run_parallel_merge_function.sl @@ -0,0 +1,33 @@ +#!/bin/bash +#SBATCH --account=m4689 +#SBATCH --qos=regular +#SBATCH --constraint=cpu +#SBATCH --time=360 +#SBATCH --ntasks=1 +#SBATCH --mem=425GB +#SBATCH --job-name=parallel_merge +#SBATCH --output=parallel_merge_%A_%a.out +#SBATCH --error=parallel_merge_%A_%a.err +#SBATCH --array=0 +#SBATCH -N 1 +#SBATCH --mail-type=BEGIN,END +#SBATCH --mail-user=MJoachimiak@lbl.gov + +module load python/3.10 +cd kg-microbe-merge +python -m venv venv-merge +source venv-merge/bin/activate +pip install poetry +poetry install + +# Array of merged graph names +merges=( + kg-microbe-function +) + +# Get the merge for this job array task +merge=${merges[$SLURM_ARRAY_TASK_ID]} + +echo "Starting $merge" +time poetry run make $merge +echo "Finished $merge" From 4ca85fb59308d42b21035f8959a5b0f1d9e78627 Mon 
Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Tue, 10 Sep 2024 12:26:00 -0600 Subject: [PATCH 6/6] Update download.yaml update bactotraits download name --- download.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download.yaml b/download.yaml index 4c37e62..7e1a876 100644 --- a/download.yaml +++ b/download.yaml @@ -37,7 +37,7 @@ # - url: git://Knowledge-Graph-Hub/kg-microbe/BactoTraits.tar.gz - local_name: BactoTraits.tar.gz + local_name: bactotraits.tar.gz # # KG-Microbe [CTD]