From 620fde3cae71e4a90ba26723f53ee997f4932fd2 Mon Sep 17 00:00:00 2001 From: Theodore Vasiloudis Date: Mon, 30 Sep 2024 18:08:08 +0000 Subject: [PATCH 1/2] [GSProcessing] Fix to make graph_name optional again --- .../distributed_executor.py | 31 ++++++----- .../dist_heterogeneous_loader.py | 9 ++-- .../tests/test_dist_executor.py | 51 ++++++++++++------- 3 files changed, 56 insertions(+), 35 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/distributed_executor.py b/graphstorm-processing/graphstorm_processing/distributed_executor.py index 3bc1bc4936..02c565f2f9 100644 --- a/graphstorm-processing/graphstorm_processing/distributed_executor.py +++ b/graphstorm-processing/graphstorm_processing/distributed_executor.py @@ -59,7 +59,7 @@ import tempfile import time from collections.abc import Mapping -from typing import Any, Dict +from typing import Any, Dict, Optional import boto3 import botocore @@ -106,8 +106,8 @@ class ExecutorConfig: The filesystem type, can be LOCAL or S3 add_reverse_edges : bool Whether to create reverse edges for each edge type. - graph_name: str - The name of the graph being processed + graph_name: str, optional + The name of the graph being processed. If not provided we use part of the input_prefix. do_repartition: bool Whether to apply repartitioning to the graph on the Spark leader. """ @@ -121,7 +121,7 @@ class ExecutorConfig: config_filename: str filesystem_type: FilesystemType add_reverse_edges: bool - graph_name: str + graph_name: Optional[str] do_repartition: bool @@ -135,7 +135,7 @@ class GSProcessingArguments: num_output_files: int add_reverse_edges: bool log_level: str - graph_name: str + graph_name: Optional[str] do_repartition: bool @@ -162,7 +162,12 @@ def __init__( self.filesystem_type = executor_config.filesystem_type self.execution_env = executor_config.execution_env self.add_reverse_edges = executor_config.add_reverse_edges - self.graph_name = executor_config.graph_name + self.graph_name = ( + executor_config.graph_name + if executor_config.graph_name + else s3_utils.s3_path_remove_trailing(executor_config.input_prefix).split("/")[-1] + ) + check_graph_name(self.graph_name) self.repartition_on_leader = executor_config.do_repartition # Input config dict using GSProcessing schema self.gsp_config_dict = {} @@ -541,11 +546,14 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--graph-name", type=str, - help="Name for the graph being processed." - "The graph name must adhere to the Python " - "identifier naming rules with the exception " - "that hyphens (-) are permitted and the name " - "can start with numbers", + help=( + "Name for the graph being processed." + "The graph name must adhere to the Python " + "identifier naming rules with the exception " + "that hyphens (-) are permitted and the name " + "can start with numbers. If not provided, we will use the last " + "section of the input prefix path." + ), required=False, default=None, ) @@ -604,7 +612,6 @@ def main(): level=gsprocessing_args.log_level, format="[GSPROCESSING] %(asctime)s %(levelname)-8s %(message)s", ) - check_graph_name(gsprocessing_args.graph_name) # Determine execution environment if os.path.exists("/opt/ml/config/processingjobconfig.json"): diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py index 65ef636e5f..b95de895eb 100644 --- a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py +++ b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py @@ -324,11 +324,6 @@ def process_and_write_graph_data( self.timers["process_edge_data"] = perf_counter() - edges_start_time metadata_dict["edge_data"] = edge_data_dict metadata_dict["edges"] = edge_structure_dict - # We use the data location as the graph name, can also take from user? - # TODO: Fix this, take from config? - metadata_dict["graph_name"] = ( - self.graph_name if self.graph_name else self.input_prefix.split("/")[-1] - ) # Ensure output dict has the correct order of keys for edge_type in metadata_dict["edge_type"]: @@ -447,6 +442,10 @@ def _initialize_metadata_dict( metadata_dict["edge_type"] = edge_types metadata_dict["node_type"] = sorted(node_type_set) + # We use the data location as the graph name, can also take from user? + # TODO: Fix this, take from config? + metadata_dict["graph_name"] = self.graph_name + return metadata_dict def _finalize_graphinfo_dict(self, metadata_dict: Dict) -> Dict: diff --git a/graphstorm-processing/tests/test_dist_executor.py b/graphstorm-processing/tests/test_dist_executor.py index b88ac38ab6..b398c68cfb 100644 --- a/graphstorm-processing/tests/test_dist_executor.py +++ b/graphstorm-processing/tests/test_dist_executor.py @@ -62,8 +62,9 @@ def user_state_categorical_precomp_file_fixture(): os.remove(precomp_file) -def test_dist_executor_run_with_precomputed(tempdir: str, user_state_categorical_precomp_file): - """Test run function with local data""" +@pytest.fixture(name="executor_configuration") +def executor_config_fixture(tempdir: str): + """Create a re-usable ExecutorConfig""" input_path = os.path.join(_ROOT, "resources/small_heterogeneous_graph") executor_configuration = ExecutorConfig( local_config_path=input_path, @@ -79,6 +80,15 @@ def test_dist_executor_run_with_precomputed(tempdir: str, user_state_categorical do_repartition=True, ) + yield executor_configuration + + +def test_dist_executor_run_with_precomputed( + tempdir: str, + user_state_categorical_precomp_file: str, + executor_configuration: ExecutorConfig, +): + """Test run function with local data""" original_precomp_file = user_state_categorical_precomp_file with open(original_precomp_file, "r", encoding="utf-8") as f: @@ -106,23 +116,8 @@ def test_dist_executor_run_with_precomputed(tempdir: str, user_state_categorical # TODO: Verify other metadata files that verify_integ_test_output doesn't check for -def test_merge_input_and_transform_dicts(tempdir: str): +def test_merge_input_and_transform_dicts(executor_configuration: ExecutorConfig): """Test the _merge_config_with_transformations function with hardcoded json data""" - input_path = os.path.join(_ROOT, "resources/small_heterogeneous_graph") - executor_configuration = ExecutorConfig( - local_config_path=input_path, - local_metadata_output_path=tempdir, - input_prefix=input_path, - output_prefix=tempdir, - num_output_files=-1, - config_filename="gsprocessing-config.json", - execution_env=ExecutionEnv.LOCAL, - filesystem_type=FilesystemType.LOCAL, - add_reverse_edges=True, - graph_name="small_heterogeneous_graph", - do_repartition=True, - ) - dist_executor = DistributedExecutor(executor_configuration) pre_comp_transormations = { @@ -148,3 +143,23 @@ def test_merge_input_and_transform_dicts(tempdir: str): if "state" == feature["column"]: transform_for_feature = feature["precomputed_transformation"] assert transform_for_feature["transformation_name"] == "categorical" + + +def test_dist_executor_graph_name(executor_configuration: ExecutorConfig): + """Test cases for graph name""" + + # Ensure default value is used when graph_name is not provided + executor_configuration.graph_name = None + dist_executor = DistributedExecutor(executor_configuration) + assert dist_executor.graph_name == "small_heterogeneous_graph" + + # Ensure we raise when invalid graph name is provided + with pytest.raises(AssertionError): + executor_configuration.graph_name = "graph.name" + dist_executor = DistributedExecutor(executor_configuration) + + # Ensure a valid default graph name is parsed when the input ends in '/' + executor_configuration.graph_name = None + executor_configuration.input_prefix = executor_configuration.input_prefix + "/" + dist_executor = DistributedExecutor(executor_configuration) + assert dist_executor.graph_name == "small_heterogeneous_graph" From 1062628c17b3ed2ccc67c861acdbfd74a588c814 Mon Sep 17 00:00:00 2001 From: Theodore Vasiloudis Date: Tue, 1 Oct 2024 19:46:58 +0000 Subject: [PATCH 2/2] Include warning when graph name is derived, add valid name test. --- .../graphstorm_processing/distributed_executor.py | 12 +++++++----- .../graph_loaders/dist_heterogeneous_loader.py | 2 -- graphstorm-processing/tests/test_dist_executor.py | 5 +++++ 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/distributed_executor.py b/graphstorm-processing/graphstorm_processing/distributed_executor.py index 02c565f2f9..ee6858b544 100644 --- a/graphstorm-processing/graphstorm_processing/distributed_executor.py +++ b/graphstorm-processing/graphstorm_processing/distributed_executor.py @@ -162,11 +162,13 @@ def __init__( self.filesystem_type = executor_config.filesystem_type self.execution_env = executor_config.execution_env self.add_reverse_edges = executor_config.add_reverse_edges - self.graph_name = ( - executor_config.graph_name - if executor_config.graph_name - else s3_utils.s3_path_remove_trailing(executor_config.input_prefix).split("/")[-1] - ) + # We use the data location as the graph name if a name is not provided + if executor_config.graph_name: + self.graph_name = executor_config.graph_name + else: + derived_name = s3_utils.s3_path_remove_trailing(self.input_prefix).split("/")[-1] + logging.warning("Setting graph name derived from input path: %s", derived_name) + self.graph_name = derived_name check_graph_name(self.graph_name) self.repartition_on_leader = executor_config.do_repartition # Input config dict using GSProcessing schema diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py index b95de895eb..e92e1068cc 100644 --- a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py +++ b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py @@ -442,8 +442,6 @@ def _initialize_metadata_dict( metadata_dict["edge_type"] = edge_types metadata_dict["node_type"] = sorted(node_type_set) - # We use the data location as the graph name, can also take from user? - # TODO: Fix this, take from config? metadata_dict["graph_name"] = self.graph_name return metadata_dict diff --git a/graphstorm-processing/tests/test_dist_executor.py b/graphstorm-processing/tests/test_dist_executor.py index b398c68cfb..3290126bb0 100644 --- a/graphstorm-processing/tests/test_dist_executor.py +++ b/graphstorm-processing/tests/test_dist_executor.py @@ -148,6 +148,11 @@ def test_merge_input_and_transform_dicts(executor_configuration: ExecutorConfig) def test_dist_executor_graph_name(executor_configuration: ExecutorConfig): """Test cases for graph name""" + # Ensure we can set a valid graph name + executor_configuration.graph_name = "2024-a_valid_name" + dist_executor = DistributedExecutor(executor_configuration) + assert dist_executor.graph_name == "2024-a_valid_name" + # Ensure default value is used when graph_name is not provided executor_configuration.graph_name = None dist_executor = DistributedExecutor(executor_configuration)