Skip to content

Commit

Permalink
Merge pull request #6 from Knowledge-Graph-Hub/merge-distinction
Browse files: browse the repository at this point in the history
Added `-l`/`--merge-label` to distinguish subset merges
  • Loading branch information
hrshdhgd authored Aug 26, 2024
2 parents fe9ff13 + c9d4265 commit 8601376
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 5 deletions.
9 changes: 6 additions & 3 deletions kg_microbe_merge/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def load_and_merge(yaml_file: str, processes: int = 1) -> nx.MultiDiGraph:
def duckdb_merge(
nodes_files_path: List[Union[str, Path]],
edges_files_path: List[Union[str, Path]],
merge_nodes_output_path: Union[str, Path],
merged_nodes_output_path: Union[str, Path],
merged_edges_output_path: Union[str, Path],
nodes_batch_size: int = 100000,
edges_batch_size: int = 2000000,
Expand Down Expand Up @@ -91,13 +91,16 @@ def duckdb_merge(
priority_sources.append(provided_by_value)
break # We only need the value from one row

os.makedirs(os.path.dirname(merged_nodes_output_path), exist_ok=True)

# Merge nodes
duckdb_nodes_merge(
nodes_files_path, merge_nodes_output_path, priority_sources, nodes_batch_size
nodes_files_path, merged_nodes_output_path, priority_sources, nodes_batch_size
)

# Merge edges
duckdb_edges_merge(edges_files_path, merged_edges_output_path, edges_batch_size)

# Tarball all files in a directory
tarball_files_in_dir(MERGED_DATA_DIR, "merged_kg")
tarball_name = str(merged_nodes_output_path).split("/")[-2]
tarball_files_in_dir(MERGED_DATA_DIR / tarball_name, tarball_name)
12 changes: 10 additions & 2 deletions kg_microbe_merge/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ def download(*args, **kwargs) -> None:
@click.option("--merge-tool", "-m", default="kgx", type=click.Choice(["kgx", "duckdb"]))
@click.option("--data-dir", "-d", type=click.Path(exists=True), default=RAW_DATA_DIR)
@click.option("--subset-transforms", "-s", multiple=True)
@click.option("--merge-label", "-l", default="merged-kg")
@click.option("--nodes-batch-size", "-n", type=int, default=100000)
@click.option("--edges-batch-size", "-e", type=int, default=2000000)
def merge(
Expand All @@ -103,6 +104,7 @@ def merge(
merge_tool: str,
data_dir: str,
subset_transforms: tuple,
merge_label: str,
nodes_batch_size: int,
edges_batch_size: int,
) -> None:
Expand Down Expand Up @@ -136,12 +138,18 @@ def merge(

merge_kg_object.merged_graph = merged_graph_object
if merge_tool == "duckdb":
if merge_label:
merged_nodes_output_path = MERGED_DATA_DIR / merge_label / "nodes.tsv"
merged_edges_output_path = MERGED_DATA_DIR / merge_label / "edges.tsv"
else:
merged_nodes_output_path = MERGED_DATA_DIR / "nodes.tsv"
merged_edges_output_path = MERGED_DATA_DIR / "edges.tsv"

duckdb_merge(
node_paths,
edge_paths,
MERGED_DATA_DIR / "nodes.tsv",
MERGED_DATA_DIR / "edges.tsv",
merged_nodes_output_path,
merged_edges_output_path,
nodes_batch_size,
edges_batch_size,
)
Expand Down

0 comments on commit 8601376

Please sign in to comment.