Skip to content

Commit

Permalink
formatted
Browse files Browse the repository at this point in the history
  • Loading branch information
hrshdhgd committed Aug 4, 2024
1 parent b355324 commit fe406e4
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 13 deletions.
2 changes: 0 additions & 2 deletions kg_microbe_merge/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from pathlib import Path


BASE_NODES_TABLE_NAME = "base_kg_nodes"
SUBSET_NODES_TABLE_NAME = "subset_kg_nodes"
BASE_EDGES_TABLE_NAME = "base_kg_edges"
Expand All @@ -28,4 +27,3 @@
DATA_DIR = PWD / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
MERGED_DATA_DIR = DATA_DIR / "merged"

12 changes: 7 additions & 5 deletions kg_microbe_merge/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,19 @@ def duckdb_merge(
# Get the value of the `provided_by` column from each ontology TSV nodes file and add it to the priority_sources list

priority_sources = []
ontology_nodes_paths = [Path(file_path) for file_path in nodes_files_path if "ontologies" in str(file_path)]
ontology_nodes_paths = [
Path(file_path) for file_path in nodes_files_path if "ontologies" in str(file_path)
]
for file_path in ontology_nodes_paths:
if file_path.suffix == ".tsv":
with file_path.open(newline='') as tsvfile:
reader = csv.DictReader(tsvfile, delimiter='\t')
with file_path.open(newline="") as tsvfile:
reader = csv.DictReader(tsvfile, delimiter="\t")
for row in reader:
provided_by_value = row.get('provided_by')
provided_by_value = row.get("provided_by")
if provided_by_value:
priority_sources.append(provided_by_value)
break # We only need the value from one row

# Merge nodes
duckdb_nodes_merge(nodes_files_path, merge_nodes_output_path, priority_sources)

Expand Down
6 changes: 4 additions & 2 deletions kg_microbe_merge/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def download(*args, **kwargs) -> None:
# @click.option("base_edges", "-base-e", type=click.Path(exists=True), required=False)
# @click.option("subset_nodes", "-subset-n", type=click.Path(exists=True), required=False)
# @click.option("subset_edges", "-subset-e", type=click.Path(exists=True), required=False)
@click.option("--data-dir", "-d", type=click.Path(exists=True), default = RAW_DATA_DIR)
@click.option("--data-dir", "-d", type=click.Path(exists=True), default=RAW_DATA_DIR)
def merge(
yaml: str,
processes: int,
Expand Down Expand Up @@ -123,7 +123,9 @@ def merge(
node_paths.append(os.path.join(data_dir, directory, file))
elif "edges" in file:
edge_paths.append(os.path.join(data_dir, directory, file))
duckdb_merge(node_paths, edge_paths, MERGED_DATA_DIR/"nodes.tsv", MERGED_DATA_DIR/"edges.tsv")
duckdb_merge(
node_paths, edge_paths, MERGED_DATA_DIR / "nodes.tsv", MERGED_DATA_DIR / "edges.tsv"
)
# duckdb_merge(base_nodes, subset_nodes, base_edges, subset_edges)
else:
load_and_merge(yaml, processes)
Expand Down
8 changes: 4 additions & 4 deletions kg_microbe_merge/utils/file_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""Utility functions for file operations."""

# Given a path to a directory, look for all files with the extension .tar.gz and extract them all
from pathlib import Path
import tarfile
from pathlib import Path
from typing import Union


Expand All @@ -16,12 +16,12 @@ def unzip_files_in_dir(dir_path: Union[str, Path]) -> None:
dir_path = Path(dir_path)
for file in dir_path.iterdir():
if file.suffix == ".gz" and file.stem.endswith(".tar"):
extract_dir = dir_path / file.stem.replace('.tar', '')
extract_dir = dir_path / file.stem.replace(".tar", "")
if extract_dir.exists() and any(extract_dir.iterdir()):
print(f"Skipping {file.name}, already extracted.")
continue

with tarfile.open(file, "r:gz") as tar:
tar.extractall(path=extract_dir)
tar.close()
print(f"Extracted {file.name} to {extract_dir}")
print(f"Extracted {file.name} to {extract_dir}")
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ lint.extend-ignore = [
"D211", # `no-blank-line-before-class`
"D212", # `multi-line-summary-first-line`
"S608" , # Possible SQL injection vector through string-based query construction
"S202", # Uses of `tarfile.extractall()`
]
line-length = 120

Expand Down

0 comments on commit fe406e4

Please sign in to comment.