Skip to content

Commit

Permalink
Merge pull request #7 from Knowledge-Graph-Hub/merge_slurm_script
Browse files Browse the repository at this point in the history
add slurm script
  • Loading branch information
hrshdhgd authored Sep 13, 2024
2 parents ad3d682 + 9f1a1c1 commit 00a5d77
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 5 deletions.
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@ kg-microbe-biomedical:
kg-microbe-biomedical-function-merge:
poetry run kg merge -m duckdb -n 1000000 -e 100000 -s "bacdive, mediadive, madin_etal, rhea_mappings, bactotraits, chebi, ec, envo, go, ncbitaxon, upa, hp, mondo, ctd, wallen_etal, uniprot_human, uniprot_functional_microbes" --merge-label $@

include kg-microbe-merge.Makefile
include kg-microbe-merge.Makefile

2 changes: 1 addition & 1 deletion download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
#
-
url: git://Knowledge-Graph-Hub/kg-microbe/BactoTraits.tar.gz
local_name: BactoTraits.tar.gz
local_name: bactotraits.tar.gz

#
# KG-Microbe [CTD]
Expand Down
32 changes: 32 additions & 0 deletions hpc/run_parallel_merge.sl
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
# Slurm array job: build each merged KG-Microbe graph in its own array task.
# Task index selects an entry from the `merges` array below.
#SBATCH --account=m4689
#SBATCH --qos=regular
#SBATCH --constraint=cpu
#SBATCH --time=360
#SBATCH --ntasks=1
#SBATCH --mem=425GB
#SBATCH --job-name=parallel_merge
#SBATCH --output=parallel_merge_%A_%a.out
#SBATCH --error=parallel_merge_%A_%a.err
#SBATCH --array=0-1
#SBATCH -N 1

module load python/3.10
# Abort early if the repo checkout is missing; otherwise the venv and
# poetry install below would run in the wrong directory.
cd kg-microbe-merge || exit 1
python -m venv venv-merge
source venv-merge/bin/activate
pip install poetry
poetry install

# Array of merged graph names — one array task per entry.
# NOTE: keep --array above in sync with the number of entries here; a
# task index past the end yields an empty $merge, which would run the
# Makefile's default target instead of a merge.
merges=(
kg-microbe-core
kg-microbe-biomedical
)

# Get the merge for this job array task
merge=${merges[$SLURM_ARRAY_TASK_ID]}

echo "Starting $merge"
time poetry run make $merge
echo "Finished $merge"
33 changes: 33 additions & 0 deletions hpc/run_parallel_merge_biomedical_function.sl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash
# Slurm job: build the kg-microbe-biomedical-function merged graph.
# Runs as a single-entry array so the target list is easy to extend.
#SBATCH --account=m4689
#SBATCH --qos=regular
#SBATCH --constraint=cpu
#SBATCH --time=360
#SBATCH --ntasks=1
#SBATCH --mem=425GB
#SBATCH --job-name=parallel_merge
#SBATCH --output=parallel_merge_%A_%a.out
#SBATCH --error=parallel_merge_%A_%a.err
#SBATCH --array=0
#SBATCH -N 1
#SBATCH --mail-type=BEGIN,END
#SBATCH [email protected]

# Set up an isolated Python environment inside the repo checkout.
module load python/3.10
cd kg-microbe-merge
python -m venv venv-merge
source venv-merge/bin/activate
pip install poetry
poetry install

# Merge targets handled by this array (one Makefile target per task index).
targets=(
kg-microbe-biomedical-function
)

# Select the target assigned to this array task.
target="${targets[$SLURM_ARRAY_TASK_ID]}"

echo "Starting $target"
time poetry run make $target
echo "Finished $target"
33 changes: 33 additions & 0 deletions hpc/run_parallel_merge_function.sl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash
# Slurm job: build the kg-microbe-function merged graph.
# Runs as a single-entry array so the target list is easy to extend.
#SBATCH --account=m4689
#SBATCH --qos=regular
#SBATCH --constraint=cpu
#SBATCH --time=360
#SBATCH --ntasks=1
#SBATCH --mem=425GB
#SBATCH --job-name=parallel_merge
#SBATCH --output=parallel_merge_%A_%a.out
#SBATCH --error=parallel_merge_%A_%a.err
#SBATCH --array=0
#SBATCH -N 1
#SBATCH --mail-type=BEGIN,END
#SBATCH [email protected]

# Set up an isolated Python environment inside the repo checkout.
module load python/3.10
cd kg-microbe-merge
python -m venv venv-merge
source venv-merge/bin/activate
pip install poetry
poetry install

# Merge targets handled by this array (one Makefile target per task index).
targets=(
kg-microbe-function
)

# Select the target assigned to this array task.
target="${targets[$SLURM_ARRAY_TASK_ID]}"

echo "Starting $target"
time poetry run make $target
echo "Finished $target"
10 changes: 7 additions & 3 deletions kg_microbe_merge/utils/duckdb_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Utility functions for working with DuckDB in the KG Microbe Merge project."""

import os
from pathlib import Path
from typing import List

import duckdb
Expand Down Expand Up @@ -304,7 +305,9 @@ def duckdb_nodes_merge(nodes_file_list, output_file, priority_sources, batch_siz
:param priority_sources: List of source names to prioritize.
"""
# Create a DuckDB connection
conn = duckdb.connect("nodes.db")
merge_label = Path(output_file).parent.name
nodes_db_file = f"{merge_label}_nodes.db"
conn = duckdb.connect(nodes_db_file)

# Load the files into DuckDB
load_into_duckdb(conn, nodes_file_list, "combined_nodes")
Expand Down Expand Up @@ -379,7 +382,7 @@ def duckdb_nodes_merge(nodes_file_list, output_file, priority_sources, batch_siz
finally:
# Close the connection
conn.close()
os.remove("nodes.db")
os.remove(nodes_db_file)


def duckdb_edges_merge(edges_file_list, output_file, batch_size=1000000):
Expand Down Expand Up @@ -416,7 +419,8 @@ def duckdb_edges_merge(edges_file_list, output_file, batch_size=1000000):
memory usage and allows for processing of very large datasets that exceed available RAM.
"""
os.makedirs(TMP_DIR, exist_ok=True)
db_file = "edges_persistent.db"
merge_label = Path(output_file).parent.name
db_file = f"{merge_label}_edges_persistent.db"
conn = duckdb.connect(db_file)

try:
Expand Down

0 comments on commit 00a5d77

Please sign in to comment.