Skip to content

Commit

Permalink
make node and edge db files unique to each merge label
Browse files (browse the repository at this point in the history)
  • Loading branch information
hrshdhgd committed Sep 5, 2024
1 parent 68ac5f6 commit 3bb6ddf
Showing 1 changed file with 7 additions and 3 deletions.
10 changes (7 additions, 3 deletions) in kg_microbe_merge/utils/duckdb_utils.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
"""Utility functions for working with DuckDB in the KG Microbe Merge project."""

import os
from pathlib import Path
from typing import List

import duckdb
Expand Down Expand Up @@ -304,7 +305,9 @@ def duckdb_nodes_merge(nodes_file_list, output_file, priority_sources, batch_siz
:param priority_sources: List of source names to prioritize.
"""
# Create a DuckDB connection
conn = duckdb.connect("nodes.db")
merge_label = Path(output_file).parent.name
nodes_db_file = f"{merge_label}_nodes.db"
conn = duckdb.connect(nodes_db_file)

# Load the files into DuckDB
load_into_duckdb(conn, nodes_file_list, "combined_nodes")
Expand Down Expand Up @@ -379,7 +382,7 @@ def duckdb_nodes_merge(nodes_file_list, output_file, priority_sources, batch_siz
finally:
# Close the connection
conn.close()
os.remove("nodes.db")
os.remove(nodes_db_file)


def duckdb_edges_merge(edges_file_list, output_file, batch_size=1000000):
Expand Down Expand Up @@ -416,7 +419,8 @@ def duckdb_edges_merge(edges_file_list, output_file, batch_size=1000000):
memory usage and allows for processing of very large datasets that exceed available RAM.
"""
os.makedirs(TMP_DIR, exist_ok=True)
db_file = "edges_persistent.db"
merge_label = Path(output_file).parent.name
db_file = f"{merge_label}_edges_persistent.db"
conn = duckdb.connect(db_file)

try:
Expand Down

0 comments on commit 3bb6ddf

Please sign in to comment.