From 598a1dba6f117b82626c2c32f49a32aba5d9c8a9 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 19 Dec 2023 11:21:37 -0800 Subject: [PATCH 01/46] Ignore deprecation warnings from gputil --- .../production/morpheus/benchmarks/conftest.py | 7 ++++++- tests/benchmarks/conftest.py | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/examples/digital_fingerprinting/production/morpheus/benchmarks/conftest.py b/examples/digital_fingerprinting/production/morpheus/benchmarks/conftest.py index eaaadf9236..f9f4d54014 100644 --- a/examples/digital_fingerprinting/production/morpheus/benchmarks/conftest.py +++ b/examples/digital_fingerprinting/production/morpheus/benchmarks/conftest.py @@ -15,9 +15,14 @@ import glob import json +import warnings from os import path -import GPUtil +with warnings.catch_warnings(): + # Ignore deprecation warnings from GPUtil + # https://github.com/nv-morpheus/Morpheus/issues/1446 + warnings.simplefilter("ignore", category=DeprecationWarning) + import GPUtil from benchmarks.test_bench_e2e_dfp_pipeline import PIPELINES_CONF diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py index ff83e4b9a3..1ede875600 100644 --- a/tests/benchmarks/conftest.py +++ b/tests/benchmarks/conftest.py @@ -17,9 +17,15 @@ import glob import os import typing +import warnings from unittest import mock -import GPUtil +with warnings.catch_warnings(): + # Ignore deprecation warnings from GPUtil + # https://github.com/nv-morpheus/Morpheus/issues/1446 + warnings.simplefilter("ignore", category=DeprecationWarning) + import GPUtil + import pytest from test_bench_e2e_pipelines import E2E_TEST_CONFIGS From 0dc1816c4fc95c827f2e38d6d4c2ff2e8a393a98 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 19 Dec 2023 11:56:26 -0800 Subject: [PATCH 02/46] Suppress warnings from merlin/nvt regarding tensorflow not being installed --- morpheus/utils/column_info.py | 7 ++++++- morpheus/utils/schema_transforms.py | 29 
+++++++++++++++++------------ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/morpheus/utils/column_info.py b/morpheus/utils/column_info.py index 80f7e69694..1f6c5d632d 100644 --- a/morpheus/utils/column_info.py +++ b/morpheus/utils/column_info.py @@ -17,10 +17,15 @@ import logging import re import typing +import warnings from datetime import datetime from functools import partial -import nvtabular as nvt +with warnings.catch_warnings(): + # Ignore warning regarding tensorflow not being installed + warnings.simplefilter("ignore") + import nvtabular as nvt + import pandas as pd import cudf diff --git a/morpheus/utils/schema_transforms.py b/morpheus/utils/schema_transforms.py index 8abbccf9c3..6ef3eb8b64 100644 --- a/morpheus/utils/schema_transforms.py +++ b/morpheus/utils/schema_transforms.py @@ -15,8 +15,8 @@ import logging import os import typing +import warnings -import nvtabular as nvt import pandas as pd import cudf @@ -27,17 +27,22 @@ from morpheus.utils.nvt.extensions import morpheus_ext from morpheus.utils.nvt.schema_converters import create_and_attach_nvt_workflow -if os.environ.get("MORPHEUS_IN_SPHINX_BUILD") is None: - # Apply patches to NVT - # TODO(Devin): Can be removed, once numpy mappings are updated in Merlin - # ======================================================================== - patches.patch_numpy_dtype_registry() - # ======================================================================== - - # Add morpheus conversion mappings - # ======================================================================== - morpheus_ext.register_morpheus_extensions() - # ========================================================================= +with warnings.catch_warnings(): + # Ignore warning regarding tensorflow not being installed + warnings.simplefilter("ignore") + import nvtabular as nvt + + if os.environ.get("MORPHEUS_IN_SPHINX_BUILD") is None: + # Apply patches to NVT + # TODO(Devin): Can be removed, once numpy mappings are updated in 
Merlin + # ======================================================================== + patches.patch_numpy_dtype_registry() + # ======================================================================== + + # Add morpheus conversion mappings + # ======================================================================== + morpheus_ext.register_morpheus_extensions() + # ========================================================================= logger = logging.getLogger(__name__) From 80575ca57ea645e9db4f8c280ce5f2ec60ce05e7 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 19 Dec 2023 12:27:01 -0800 Subject: [PATCH 03/46] Revert "Ignore deprecation warnings from gputil" This reverts commit 598a1dba6f117b82626c2c32f49a32aba5d9c8a9. --- .../production/morpheus/benchmarks/conftest.py | 7 +------ tests/benchmarks/conftest.py | 8 +------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/examples/digital_fingerprinting/production/morpheus/benchmarks/conftest.py b/examples/digital_fingerprinting/production/morpheus/benchmarks/conftest.py index f9f4d54014..eaaadf9236 100644 --- a/examples/digital_fingerprinting/production/morpheus/benchmarks/conftest.py +++ b/examples/digital_fingerprinting/production/morpheus/benchmarks/conftest.py @@ -15,14 +15,9 @@ import glob import json -import warnings from os import path -with warnings.catch_warnings(): - # Ignore deprecation warnings from GPUtil - # https://github.com/nv-morpheus/Morpheus/issues/1446 - warnings.simplefilter("ignore", category=DeprecationWarning) - import GPUtil +import GPUtil from benchmarks.test_bench_e2e_dfp_pipeline import PIPELINES_CONF diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py index 1ede875600..ff83e4b9a3 100644 --- a/tests/benchmarks/conftest.py +++ b/tests/benchmarks/conftest.py @@ -17,15 +17,9 @@ import glob import os import typing -import warnings from unittest import mock -with warnings.catch_warnings(): - # Ignore deprecation warnings from GPUtil - # 
https://github.com/nv-morpheus/Morpheus/issues/1446 - warnings.simplefilter("ignore", category=DeprecationWarning) - import GPUtil - +import GPUtil import pytest from test_bench_e2e_pipelines import E2E_TEST_CONFIGS From 7a722bbf1c77b4a207c8414ac2f76fccb0232650 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 19 Dec 2023 12:27:28 -0800 Subject: [PATCH 04/46] Ignore distutils deprecation warning --- pyproject.toml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e52c590aba..c33d4e0e88 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,12 @@ filterwarnings = [ 'ignore:`np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe', 'ignore:Warning the df property returns a copy, please use the copy_dataframe method or the mutable_dataframe context manager to modify the DataFrame in-place instead.', 'ignore:`np.MachAr` is deprecated \(NumPy 1.22\):DeprecationWarning', - 'ignore:Please use `spmatrix` from the `scipy.sparse` namespace, the `scipy.sparse.base` namespace is deprecated:DeprecationWarning', + 'ignore:Please use `spmatrix` from the `scipy.sparse` namespace, the `scipy.sparse.base` namespace is deprecated:DeprecationWarning', + + # Deprecation warning from any project using distutils, currently known sources of this are: + # GPUtils https://github.com/anderskm/gputil/issues/48 + # PySpark https://issues.apache.org/jira/browse/SPARK-45390 + 'ignore:The distutils package is deprecated and slated for removal in Python 3.12. 
Use setuptools or check PEP 632 for potential alternatives', ] testpaths = ["tests"] From 0d1fa3760284c8931f08817515e497e3c6d76f06 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 19 Dec 2023 12:31:38 -0800 Subject: [PATCH 05/46] Mark the http server tests as slow --- tests/common/test_http_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/common/test_http_server.py b/tests/common/test_http_server.py index 322d64d687..654f5aee58 100644 --- a/tests/common/test_http_server.py +++ b/tests/common/test_http_server.py @@ -35,6 +35,7 @@ def make_parse_fn(status: HTTPStatus = HTTPStatus.OK, return mock_parse_fn +@pytest.mark.slow @pytest.mark.parametrize("endpoint", ["/test", "test/", "/a/b/c/d"]) @pytest.mark.parametrize("port", [8088, 9090]) @pytest.mark.parametrize("method", ["GET", "POST", "PUT"]) From da692e8f899ccb3999fa0eb62a966ff2bb9d3937 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 19 Dec 2023 12:42:48 -0800 Subject: [PATCH 06/46] Capture the warning emitted when timestamp is provided but timestamp pattern is not --- tests/modules/test_payload_batcher.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/modules/test_payload_batcher.py b/tests/modules/test_payload_batcher.py index 25a405923e..3b46450ae8 100644 --- a/tests/modules/test_payload_batcher.py +++ b/tests/modules/test_payload_batcher.py @@ -135,8 +135,10 @@ def test_custom_params(config, expected_count, expected_exception): + expected_warning = False if timestamp_column_name: filter_probs_df["timestamp"] = TIMESTAMPS + expected_warning = timestamp_pattern is None pipe = Pipeline(config) @@ -182,6 +184,10 @@ def test_custom_params(config, if expected_exception: with pytest.raises(type(expected_exception), match=str(expected_exception)): pipe.run() + elif expected_warning: + with pytest.warns(UserWarning): + pipe.run() + assert len(sink_stage.get_messages()) == expected_count else: pipe.run() assert len(sink_stage.get_messages()) == expected_count From 
f1dcd44ef124c9abe65d22f7a2530780013bb2a2 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 29 Dec 2023 09:31:54 -0800 Subject: [PATCH 07/46] Narrow the warning filters to be more specific, ensuring we aren't accidentally ignoring other warnings --- morpheus/utils/column_info.py | 2 +- morpheus/utils/schema_transforms.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/morpheus/utils/column_info.py b/morpheus/utils/column_info.py index 1f6c5d632d..82ce740d5a 100644 --- a/morpheus/utils/column_info.py +++ b/morpheus/utils/column_info.py @@ -23,7 +23,7 @@ with warnings.catch_warnings(): # Ignore warning regarding tensorflow not being installed - warnings.simplefilter("ignore") + warnings.filterwarnings("ignore", message=".*No module named 'tensorflow'", category=UserWarning) import nvtabular as nvt import pandas as pd diff --git a/morpheus/utils/schema_transforms.py b/morpheus/utils/schema_transforms.py index 6ef3eb8b64..e8e2383b25 100644 --- a/morpheus/utils/schema_transforms.py +++ b/morpheus/utils/schema_transforms.py @@ -29,7 +29,7 @@ with warnings.catch_warnings(): # Ignore warning regarding tensorflow not being installed - warnings.simplefilter("ignore") + warnings.filterwarnings("ignore", message=".*No module named 'tensorflow'", category=UserWarning) import nvtabular as nvt if os.environ.get("MORPHEUS_IN_SPHINX_BUILD") is None: From 42614ec384c89cf66ad7307eb8e60759b99c454c Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 5 Jan 2024 16:09:14 -0800 Subject: [PATCH 08/46] Silence warnings about not having a config option --- tests/test_cli.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index aef467bf84..b9cb75f2d5 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -16,6 +16,7 @@ import os import shutil +import warnings from unittest import mock import click @@ -129,6 +130,15 @@ def mlflow_uri(tmp_path): mlflow.end_run() +@pytest.fixture(scope="function", autouse=True) 
+def config_warning_fixture(): + # morpheus.cli.utils._apply_to_config method will warn about any keyword arguments that don't match a config option + # this isn't triggered in normal production code, but is triggered in the cli tests. + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="No config option matches for.*", category=UserWarning) + yield + + @pytest.mark.reload_modules(commands) @pytest.mark.usefixtures("chdir_tmpdir", "reload_modules") @pytest.mark.use_python From 074081eceb487a563cfb7b1a0c3a7d942e3a7e13 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 5 Jan 2024 16:20:56 -0800 Subject: [PATCH 09/46] Avoid warning from cudf regarding an upcoming change of behavior when applying a groupby to a list with only a single element refer: https://github.com/rapidsai/cudf/blob/branch-24.02/python/cudf/cudf/core/groupby/groupby.py#L281 --- morpheus/modules/payload_batcher.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/morpheus/modules/payload_batcher.py b/morpheus/modules/payload_batcher.py index 18e4a70506..eb1b666dbf 100644 --- a/morpheus/modules/payload_batcher.py +++ b/morpheus/modules/payload_batcher.py @@ -147,7 +147,12 @@ def _batch_dataframe_by_group(df: cudf.DataFrame) -> typing.List[cudf.DataFrame] # Period object conversion is not supported in cudf df[period_column] = df[period_column].to_pandas().dt.to_period(period).astype('str') - groups = df.groupby(group_by_columns) + if len(group_by_columns) == 1: + group_by_columns_ = group_by_columns[0] + else: + group_by_columns_ = group_by_columns + + groups = df.groupby(group_by_columns_) dfs = [] for _, group in groups: From 3a822c786cdfe362be350cac83ac2096a2a721a5 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Sat, 6 Jan 2024 10:55:22 -0800 Subject: [PATCH 10/46] Add comment explaining the special handling of single element lists --- morpheus/modules/payload_batcher.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/morpheus/modules/payload_batcher.py b/morpheus/modules/payload_batcher.py index eb1b666dbf..29ab09fe5c 100644 --- a/morpheus/modules/payload_batcher.py +++ b/morpheus/modules/payload_batcher.py @@ -148,6 +148,8 @@ def _batch_dataframe_by_group(df: cudf.DataFrame) -> typing.List[cudf.DataFrame] df[period_column] = df[period_column].to_pandas().dt.to_period(period).astype('str') if len(group_by_columns) == 1: + # Avoid warning from cudf regardning an upcoming change of behavior when applying a groupby to a single + # element list. group_by_columns_ = group_by_columns[0] else: group_by_columns_ = group_by_columns From 32824758979512f8a62d7e737c5d65bb2bc1217b Mon Sep 17 00:00:00 2001 From: David Gardner Date: Sat, 6 Jan 2024 10:57:23 -0800 Subject: [PATCH 11/46] WIP: need to find the root cause of this warning --- morpheus/modules/payload_batcher.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/morpheus/modules/payload_batcher.py b/morpheus/modules/payload_batcher.py index 29ab09fe5c..f660238341 100644 --- a/morpheus/modules/payload_batcher.py +++ b/morpheus/modules/payload_batcher.py @@ -157,19 +157,25 @@ def _batch_dataframe_by_group(df: cudf.DataFrame) -> typing.List[cudf.DataFrame] groups = df.groupby(group_by_columns_) dfs = [] - for _, group in groups: - if disable_max_batch_size: - dfs.append(group) - else: - group_length = len(group) - if group_length <= max_batch_size: + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="np.find_common_type is deprecated. 
Please use `np.result_type` or `np.promote_types`.", + category=DeprecationWarning) + for _, group in groups: + if disable_max_batch_size: dfs.append(group) else: - num_batches = (group_length + max_batch_size - 1) // max_batch_size - group_batches = [ - group.iloc[i * max_batch_size:(i + 1) * max_batch_size] for i in range(num_batches) - ] - dfs.extend(group_batches) + group_length = len(group) + if group_length <= max_batch_size: + dfs.append(group) + else: + num_batches = (group_length + max_batch_size - 1) // max_batch_size + group_batches = [ + group.iloc[i * max_batch_size:(i + 1) * max_batch_size] for i in range(num_batches) + ] + dfs.extend(group_batches) return dfs From a226c17ce49c9cb4d8fcf991982633af721d7fc1 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Mon, 8 Jan 2024 08:04:47 -0800 Subject: [PATCH 12/46] Update CR Year --- morpheus/modules/payload_batcher.py | 2 +- morpheus/utils/column_info.py | 2 +- morpheus/utils/schema_transforms.py | 2 +- tests/common/test_http_server.py | 2 +- tests/modules/test_payload_batcher.py | 2 +- tests/test_cli.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/morpheus/modules/payload_batcher.py b/morpheus/modules/payload_batcher.py index 29ab09fe5c..ca62a252bd 100644 --- a/morpheus/modules/payload_batcher.py +++ b/morpheus/modules/payload_batcher.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/morpheus/utils/column_info.py b/morpheus/utils/column_info.py index 82ce740d5a..783bbb88c6 100644 --- a/morpheus/utils/column_info.py +++ b/morpheus/utils/column_info.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/morpheus/utils/schema_transforms.py b/morpheus/utils/schema_transforms.py index e8e2383b25..2fd93482cb 100644 --- a/morpheus/utils/schema_transforms.py +++ b/morpheus/utils/schema_transforms.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/common/test_http_server.py b/tests/common/test_http_server.py index 654f5aee58..26eeb8adbb 100644 --- a/tests/common/test_http_server.py +++ b/tests/common/test_http_server.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/modules/test_payload_batcher.py b/tests/modules/test_payload_batcher.py index 3b46450ae8..47f43849d7 100644 --- a/tests/modules/test_payload_batcher.py +++ b/tests/modules/test_payload_batcher.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/test_cli.py b/tests/test_cli.py index b9cb75f2d5..f3d5ff10f1 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); From a0bbee2063460d0b068229e6a917ce06b23bc6c5 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Mon, 8 Jan 2024 08:42:14 -0800 Subject: [PATCH 13/46] Add comment explaining the warning coming from numpy 1.26 --- morpheus/modules/payload_batcher.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/morpheus/modules/payload_batcher.py b/morpheus/modules/payload_batcher.py index 18b89a6959..2800f2e186 100644 --- a/morpheus/modules/payload_batcher.py +++ b/morpheus/modules/payload_batcher.py @@ -159,6 +159,8 @@ def _batch_dataframe_by_group(df: cudf.DataFrame) -> typing.List[cudf.DataFrame] dfs = [] with warnings.catch_warnings(): + # cudf is triggering a deprecation warning when using np.find_common_type which is deprecated in numpy 1.26 + # Future versions of cudf are pinned to numpy<1.25 warnings.filterwarnings( "ignore", message="np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.", From 592f7f4edef10a33afee6ef64de7e92370df4619 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 9 Jan 2024 10:40:28 -0800 Subject: [PATCH 14/46] Update test for completion pipe to use actual asyncio futures, similar to that used by the completion benchmark, this removes the need to mock the asyncio.gather method which is also used by the PromptTemplateNode --- tests/llm/conftest.py | 23 +++++++++++++++++++++++ tests/llm/test_completion_pipe.py | 14 ++++---------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/tests/llm/conftest.py b/tests/llm/conftest.py index 226fee96d9..3b9653e1e0 100644 --- a/tests/llm/conftest.py +++ b/tests/llm/conftest.py @@ -13,6 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import asyncio +import typing +from unittest import mock + import pytest from _utils import require_env_variable @@ -94,3 +98,22 @@ def serpapi_api_key_fixture(): yield require_env_variable( varname="SERPAPI_API_KEY", reason="serpapi integration tests require the `SERPAPI_API_KEY` environment variable to be defined.") + + +@pytest.mark.usefixtures("nemollm") +@pytest.fixture(name="mock_nemollm") +def mock_nemollm_fixture(mock_nemollm: mock.MagicMock): + + # The generate function is a blocking call that returns a future when return_type="async" + async def mock_task(fut: asyncio.Future, value: typing.Any = mock.DEFAULT): + fut.set_result(value) + + def create_future(*args, **kwargs) -> asyncio.Future: + event_loop = asyncio.get_event_loop() + fut = event_loop.create_future() + event_loop.create_task(mock_task(fut, mock.DEFAULT)) + return fut + + mock_nemollm.generate.side_effect = create_future + + yield mock_nemollm diff --git a/tests/llm/test_completion_pipe.py b/tests/llm/test_completion_pipe.py index 615e7954e0..582a295ab0 100644 --- a/tests/llm/test_completion_pipe.py +++ b/tests/llm/test_completion_pipe.py @@ -82,16 +82,10 @@ def _run_pipeline(config: Config, @pytest.mark.usefixtures("nemollm") @pytest.mark.use_python -@mock.patch("asyncio.wrap_future") -@mock.patch("asyncio.gather", new_callable=mock.AsyncMock) -def test_completion_pipe_nemo( - mock_asyncio_gather: mock.AsyncMock, - mock_asyncio_wrap_future: mock.MagicMock, # pylint: disable=unused-argument - config: Config, - mock_nemollm: mock.MagicMock, - countries: list[str], - capital_responses: list[str]): - mock_asyncio_gather.return_value = [mock.MagicMock() for _ in range(len(countries))] +def test_completion_pipe_nemo(config: Config, + mock_nemollm: mock.MagicMock, + countries: list[str], + capital_responses: list[str]): mock_nemollm.post_process_generate_response.side_effect = [{"text": response} for response in capital_responses] results = _run_pipeline(config, NeMoLLMService, 
countries=countries, capital_responses=capital_responses) assert_results(results) From 032ddec5a945156ffa8768fb7a1f3b1f4cf3e4a9 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 9 Jan 2024 11:55:43 -0800 Subject: [PATCH 15/46] Avoid warning about setting values on a copy --- tests/examples/ransomware_detection/test_preprocessing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/examples/ransomware_detection/test_preprocessing.py b/tests/examples/ransomware_detection/test_preprocessing.py index 36874b4e2f..134ff42ebd 100644 --- a/tests/examples/ransomware_detection/test_preprocessing.py +++ b/tests/examples/ransomware_detection/test_preprocessing.py @@ -139,10 +139,10 @@ def test_merge_curr_and_prev_snapshots(self, config: Config, rwd_conf: dict, dat } expected_df = dataset_pandas['examples/ransomware_detection/dask_results.csv'].fillna('') - expected_df['pid_process'][1] = 'test_val1' - expected_df['pid_process'][3] = 'test_val2' + expected_df.loc[1, 'pid_process'] = 'test_val1' + expected_df.loc[3, 'pid_process'] = 'test_val2' - expected_df['snapshot_id'] = snapshot_ids + expected_df.loc[:, 'snapshot_id'] = snapshot_ids expected_df.index = expected_df.snapshot_id stage._merge_curr_and_prev_snapshots(df, source_pid_process) From 766d474169bf3a98fa3eade7ed489aea80d5c8ec Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 9 Jan 2024 11:56:40 -0800 Subject: [PATCH 16/46] Ignore performance warnings from pandas triggered by the df compare method in the tests --- tests/_utils/dataset_manager.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/_utils/dataset_manager.py b/tests/_utils/dataset_manager.py index eeb1e9bb27..16095f5279 100644 --- a/tests/_utils/dataset_manager.py +++ b/tests/_utils/dataset_manager.py @@ -18,6 +18,7 @@ import os import random import typing +import warnings import cupy as cp import pandas as pd @@ -235,7 +236,10 @@ def compare_df(cls, dfb: typing.Union[pd.DataFrame, 
cdf.DataFrame], **compare_args): """Wrapper for `morpheus.utils.compare_df.compare_df`.""" - return compare_df.compare_df(cls._value_as_pandas(dfa), cls._value_as_pandas(dfb), **compare_args) + with warnings.catch_warnings(): + # Ignore performance warnings from pandas triggered by the comparison + warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning) + return compare_df.compare_df(cls._value_as_pandas(dfa), cls._value_as_pandas(dfb), **compare_args) @classmethod def assert_compare_df(cls, From 5462726839bd91829c4368eb10c6cabd0d568a62 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 9 Jan 2024 11:59:34 -0800 Subject: [PATCH 17/46] Restrict numpy to versions prior to 1.25, this avoids numerous warnings triggered by cudf, more recent versions of cudf use this same version restriction --- .../all_cuda-118_arch-x86_64.yaml | 6 ++-- .../dev_cuda-118_arch-x86_64.yaml | 6 ++-- dependencies.yaml | 2 ++ docker/conda/environments/cuda11.8_dev.yml | 2 ++ morpheus/modules/payload_batcher.py | 30 +++++++------------ 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 884973ebf0..b5365a4d90 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -1,5 +1,5 @@ # This file is generated by `rapids-dependency-file-generator`. -# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +# To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. 
channels: - conda-forge - huggingface @@ -65,6 +65,7 @@ dependencies: - ninja=1.10 - nlohmann_json=3.9 - nodejs=18.* +- numpy>=1.21,<1.25 - numpydoc=1.4 - nvtabular=23.06 - openai=0.28 @@ -112,4 +113,5 @@ dependencies: - pyarrow_hotfix - pymilvus==2.3.2 - pytest-kafka==0.6.0 -name: all_cuda-118_arch-x86_64 +name: all_cuda-118_arch-x86_64_py-310 + diff --git a/conda/environments/dev_cuda-118_arch-x86_64.yaml b/conda/environments/dev_cuda-118_arch-x86_64.yaml index b13d2e8d64..27fed3396d 100644 --- a/conda/environments/dev_cuda-118_arch-x86_64.yaml +++ b/conda/environments/dev_cuda-118_arch-x86_64.yaml @@ -1,5 +1,5 @@ # This file is generated by `rapids-dependency-file-generator`. -# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +# To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. channels: - conda-forge - huggingface @@ -52,6 +52,7 @@ dependencies: - ninja=1.10 - nlohmann_json=3.9 - nodejs=18.* +- numpy>=1.21,<1.25 - numpydoc=1.4 - nvtabular=23.06 - pip @@ -88,4 +89,5 @@ dependencies: - pyarrow_hotfix - pymilvus==2.3.2 - pytest-kafka==0.6.0 -name: dev_cuda-118_arch-x86_64 +name: dev_cuda-118_arch-x86_64_py-310 + diff --git a/dependencies.yaml b/dependencies.yaml index 6b31678d78..a2c757a890 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -238,6 +238,8 @@ dependencies: - grpcio - mlflow>=2.2.1,<3 - nb_conda_kernels + # Avoids numerous warnings triggered by cudf, recent versions of cudf impost this version requirement + - numpy>=1.21,<1.25 - numpydoc=1.4 - nvtabular=23.06 - python-confluent-kafka=1.9.2 diff --git a/docker/conda/environments/cuda11.8_dev.yml b/docker/conda/environments/cuda11.8_dev.yml index 5ee09141b0..2fc7576cde 100644 --- a/docker/conda/environments/cuda11.8_dev.yml +++ b/docker/conda/environments/cuda11.8_dev.yml @@ -74,6 +74,8 @@ dependencies: - ninja=1.10 - nodejs=18.* - numba>=0.56.2 + # Avoids numerous warnings triggered by cudf, recent versions of cudf 
impost this version + - numpy>=1.21,<1.25 - numpydoc=1.4 - nvtabular=23.06 - pandas=1.3 diff --git a/morpheus/modules/payload_batcher.py b/morpheus/modules/payload_batcher.py index 2800f2e186..ca62a252bd 100644 --- a/morpheus/modules/payload_batcher.py +++ b/morpheus/modules/payload_batcher.py @@ -157,27 +157,19 @@ def _batch_dataframe_by_group(df: cudf.DataFrame) -> typing.List[cudf.DataFrame] groups = df.groupby(group_by_columns_) dfs = [] - - with warnings.catch_warnings(): - # cudf is triggering a deprecation warning when using np.find_common_type which is deprecated in numpy 1.26 - # Future versions of cudf are pinned to numpy<1.25 - warnings.filterwarnings( - "ignore", - message="np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.", - category=DeprecationWarning) - for _, group in groups: - if disable_max_batch_size: + for _, group in groups: + if disable_max_batch_size: + dfs.append(group) + else: + group_length = len(group) + if group_length <= max_batch_size: dfs.append(group) else: - group_length = len(group) - if group_length <= max_batch_size: - dfs.append(group) - else: - num_batches = (group_length + max_batch_size - 1) // max_batch_size - group_batches = [ - group.iloc[i * max_batch_size:(i + 1) * max_batch_size] for i in range(num_batches) - ] - dfs.extend(group_batches) + num_batches = (group_length + max_batch_size - 1) // max_batch_size + group_batches = [ + group.iloc[i * max_batch_size:(i + 1) * max_batch_size] for i in range(num_batches) + ] + dfs.extend(group_batches) return dfs From 874776f14fdc495e8767f5844de1f6a6443ae65c Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 9 Jan 2024 12:10:29 -0800 Subject: [PATCH 18/46] Silence warning about cudf's subword tokenizer behaving differently than huggingface --- .../stages/preprocess/preprocess_nlp_stage.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/morpheus/stages/preprocess/preprocess_nlp_stage.py 
b/morpheus/stages/preprocess/preprocess_nlp_stage.py index a06ee5a30c..c3eb94c3fe 100644 --- a/morpheus/stages/preprocess/preprocess_nlp_stage.py +++ b/morpheus/stages/preprocess/preprocess_nlp_stage.py @@ -13,6 +13,7 @@ # limitations under the License. import typing +import warnings from functools import partial import mrc @@ -148,13 +149,18 @@ def pre_process_batch(x: MultiMessage, """ text_ser = cudf.Series(x.get_meta(column)) - tokenized = tokenize_text_series(vocab_hash_file=vocab_hash_file, - do_lower_case=do_lower_case, - text_ser=text_ser, - seq_len=seq_len, - stride=stride, - truncation=truncation, - add_special_tokens=add_special_tokens) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="When truncation is not True, the behavior currently differs from HuggingFace.*", + category=UserWarning) + tokenized = tokenize_text_series(vocab_hash_file=vocab_hash_file, + do_lower_case=do_lower_case, + text_ser=text_ser, + seq_len=seq_len, + stride=stride, + truncation=truncation, + add_special_tokens=add_special_tokens) del text_ser seg_ids = tokenized.segment_ids From 21d856225c38f323b9f2a65713a475953e61e3be Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 9 Jan 2024 12:22:53 -0800 Subject: [PATCH 19/46] Avoid pandas warning about setting a value on a copy --- .../digital_fingerprinting/test_dfp_postprocessing_stage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/examples/digital_fingerprinting/test_dfp_postprocessing_stage.py b/tests/examples/digital_fingerprinting/test_dfp_postprocessing_stage.py index 0ae8bca2b5..2808d0a8a6 100644 --- a/tests/examples/digital_fingerprinting/test_dfp_postprocessing_stage.py +++ b/tests/examples/digital_fingerprinting/test_dfp_postprocessing_stage.py @@ -51,7 +51,7 @@ def test_process_events_on_data(mock_datetime: mock.MagicMock, # post-process should replace nans, lets add a nan to the DF with dfp_multi_ae_message.meta.mutable_dataframe() as df: - df['v2'][10] = 
np.nan + df.loc[10, 'v2'] = np.nan df['event_time'] = '' set_log_level(morpheus_log_level) From 03168a2d70ec245df07d4885be987762b7440faa Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 9 Jan 2024 12:23:40 -0800 Subject: [PATCH 20/46] Ignore the warning from the logging_timer which happens whenever DFPSplitUsersStage doesn't have any output messages --- .../digital_fingerprinting/test_dfp_split_users_stage.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/examples/digital_fingerprinting/test_dfp_split_users_stage.py b/tests/examples/digital_fingerprinting/test_dfp_split_users_stage.py index bfc5c9366a..aae17e1060 100644 --- a/tests/examples/digital_fingerprinting/test_dfp_split_users_stage.py +++ b/tests/examples/digital_fingerprinting/test_dfp_split_users_stage.py @@ -16,6 +16,7 @@ import json import os import typing +import warnings import pytest @@ -99,7 +100,12 @@ def test_extract_users(config: Config, skip_users=skip_users, only_users=only_users) - results = stage.extract_users(df) + with warnings.catch_warnings(): + # Ignore warning about the log message not being set. This happens whenever there aren't any output_messages + warnings.filterwarnings("ignore", + message="Must set log msg before end of context! 
Skipping log", + category=UserWarning) + results = stage.extract_users(df) if not include_generic and not include_individual: # Extra check for weird combination From 062c214cd2ec648909991e00d68257642723e4e5 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Thu, 11 Jan 2024 14:01:08 -0800 Subject: [PATCH 21/46] Fix name of fixture resulting from bad copy/paste --- tests/examples/llm/common/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/examples/llm/common/conftest.py b/tests/examples/llm/common/conftest.py index 11ef4bad0c..2769439468 100644 --- a/tests/examples/llm/common/conftest.py +++ b/tests/examples/llm/common/conftest.py @@ -18,10 +18,10 @@ from _utils import import_or_skip -@pytest.fixture(name="nemollm", autouse=True, scope='session') -def nemollm_fixture(fail_missing: bool): +@pytest.fixture(name="langchain", autouse=True, scope='session') +def langchain_fixture(fail_missing: bool): """ - All the tests in this subdir require nemollm + All the tests in this subdir require langchain """ skip_reason = ("Tests for the WebScraperStage require the langchain package to be installed, to install this run:\n" "`mamba install -n base -c conda-forge conda-merge`\n" From a29cece73e9506c2e99aa60b33e5be1111eb0d50 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Thu, 11 Jan 2024 14:32:26 -0800 Subject: [PATCH 22/46] Replace deprecated usage of is_monotonic attribute with is_monotonic_increasing and is_monotonic_decreasing --- morpheus/_lib/src/objects/table_info.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/morpheus/_lib/src/objects/table_info.cpp b/morpheus/_lib/src/objects/table_info.cpp index ab25ebc213..522ddaa918 100644 --- a/morpheus/_lib/src/objects/table_info.cpp +++ b/morpheus/_lib/src/objects/table_info.cpp @@ -300,7 +300,8 @@ std::optional MutableTableInfo::ensure_sliceable_index() auto df_index = py_df.attr("index"); // Check to see if we actually need the change - if 
(df_index.attr("is_unique").cast() && df_index.attr("is_monotonic").cast()) + if (df_index.attr("is_unique").cast() && (df_index.attr("is_monotonic_increasing").cast() || + df_index.attr("is_monotonic_decreasing").cast())) { // Set the outputname to nullopt old_index_col_name = std::nullopt; From 5154135e7c6cf62d8ccb86ad8f9af6ed213fb015 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Thu, 11 Jan 2024 14:54:03 -0800 Subject: [PATCH 23/46] Replace usage of deprecated on_start method in monitor stage --- morpheus/stages/general/monitor_stage.py | 4 ++-- tests/test_monitor_stage.py | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/morpheus/stages/general/monitor_stage.py b/morpheus/stages/general/monitor_stage.py index 5d2c4a665e..8d709d7d92 100644 --- a/morpheus/stages/general/monitor_stage.py +++ b/morpheus/stages/general/monitor_stage.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -99,7 +99,7 @@ def accepted_types(self) -> typing.Tuple: def supports_cpp_node(self): return False - def on_start(self): + async def start_async(self): """ Starts the pipeline stage's progress bar. """ diff --git a/tests/test_monitor_stage.py b/tests/test_monitor_stage.py index 0db399f749..91bc936878 100755 --- a/tests/test_monitor_stage.py +++ b/tests/test_monitor_stage.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import asyncio import inspect import logging import os @@ -59,13 +60,13 @@ def two_x(x): @mock.patch('morpheus.controllers.monitor_controller.MorpheusTqdm') -def test_on_start(mock_morph_tqdm: mock.MagicMock, config: Config): +def test_start_async(mock_morph_tqdm: mock.MagicMock, config: Config): mock_morph_tqdm.return_value = mock_morph_tqdm stage = MonitorStage(config, log_level=logging.WARNING) assert stage._mc._progress is None - stage.on_start() + asyncio.run(stage.start_async()) mock_morph_tqdm.assert_called_once() mock_morph_tqdm.reset.assert_called_once() assert stage._mc._progress is mock_morph_tqdm @@ -82,7 +83,7 @@ def test_stop(mock_morph_tqdm: mock.MagicMock, config: Config): stage.stop() mock_morph_tqdm.assert_not_called() - stage.on_start() + asyncio.run(stage.start_async()) stage.stop() mock_morph_tqdm.close.assert_called_once() @@ -94,7 +95,7 @@ def test_refresh(mock_morph_tqdm: mock.MagicMock, config: Config): stage = MonitorStage(config, log_level=logging.WARNING) assert stage._mc._progress is None - stage.on_start() + asyncio.run(stage.start_async()) stage._mc.refresh_progress(None) mock_morph_tqdm.refresh.assert_called_once() @@ -138,7 +139,7 @@ def test_progress_sink(mock_morph_tqdm: mock.MagicMock, config: Config): mock_morph_tqdm.return_value = mock_morph_tqdm stage = MonitorStage(config, log_level=logging.WARNING) - stage.on_start() + asyncio.run(stage.start_async()) stage._mc.progress_sink(None) assert stage._mc._determine_count_fn is None From e188d3ad91c703c3bc329fd4ed4c353e29f05c8a Mon Sep 17 00:00:00 2001 From: David Gardner Date: Thu, 11 Jan 2024 14:56:49 -0800 Subject: [PATCH 24/46] Replace usage of deprecated on_start method in HttpServerSinkStage --- morpheus/stages/output/http_server_sink_stage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/morpheus/stages/output/http_server_sink_stage.py b/morpheus/stages/output/http_server_sink_stage.py index 458f821c67..2a0be0a298 100644 --- 
a/morpheus/stages/output/http_server_sink_stage.py +++ b/morpheus/stages/output/http_server_sink_stage.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -142,7 +142,7 @@ def supports_cpp_node(self): """Indicates whether or not this stage supports a C++ node.""" return False - def on_start(self): + async def start_async(self): """Starts the HTTP server.""" from morpheus.common import HttpServer self._server = HttpServer(parse_fn=self._request_handler, From 71c74753baed2fff5eae21520ef0be363a76376d Mon Sep 17 00:00:00 2001 From: David Gardner Date: Thu, 11 Jan 2024 15:04:44 -0800 Subject: [PATCH 25/46] Update test to assert that a deprecation warning for on_start is produced. --- tests/pipeline/test_pipeline.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 40ca0b9612..355310ce48 100755 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -93,7 +93,10 @@ def _run_pipeline(filter_probs_df: DataFrameType, pipe = LinearPipeline(config) pipe.set_source(SourceTestStage(config, [filter_probs_df], **source_callbacks)) pipe.add_stage(SinkTestStage(config, **sink_callbacks)) - pipe.run() + + with pytest.deprecated_call(match="The on_start method is deprecated and may be removed in the future.*"): + # The sink stage ensures that the on_start callback method still works, even though it is deprecated. 
+ pipe.run() @pytest.mark.use_cudf From 90c79f513ab0fdff2752158484417ee500f7c7a8 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Thu, 11 Jan 2024 15:24:55 -0800 Subject: [PATCH 26/46] Silence warning regarding tensorflow not being installed from nvt/merlin --- tests/utils/nvt/test_json_flatten_transform.py | 8 +++++++- tests/utils/nvt/test_mutate_op.py | 14 ++++++++++---- tests/utils/nvt/test_schema_converters.py | 7 ++++++- tests/utils/nvt/test_transforms.py | 8 +++++++- 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/tests/utils/nvt/test_json_flatten_transform.py b/tests/utils/nvt/test_json_flatten_transform.py index faf998e4ff..7f00722f7d 100644 --- a/tests/utils/nvt/test_json_flatten_transform.py +++ b/tests/utils/nvt/test_json_flatten_transform.py @@ -12,9 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings + import pandas as pd import pytest -from nvtabular.ops.operator import ColumnSelector + +with warnings.catch_warnings(): + # Ignore warning regarding tensorflow not being installed + warnings.filterwarnings("ignore", message=".*No module named 'tensorflow'", category=UserWarning) + from nvtabular.ops.operator import ColumnSelector import cudf diff --git a/tests/utils/nvt/test_mutate_op.py b/tests/utils/nvt/test_mutate_op.py index 034e4f9049..a1d6998c7d 100644 --- a/tests/utils/nvt/test_mutate_op.py +++ b/tests/utils/nvt/test_mutate_op.py @@ -12,13 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import warnings + import numpy as np import pandas as pd import pytest -from merlin.core.dispatch import DataFrameType -from merlin.schema import ColumnSchema -from merlin.schema import Schema -from nvtabular.ops.operator import ColumnSelector + +with warnings.catch_warnings(): + # Ignore warning regarding tensorflow not being installed + warnings.filterwarnings("ignore", message=".*No module named 'tensorflow'", category=UserWarning) + from merlin.core.dispatch import DataFrameType + from merlin.schema import ColumnSchema + from merlin.schema import Schema + from nvtabular.ops.operator import ColumnSelector from morpheus.utils.nvt.mutate import MutateOp diff --git a/tests/utils/nvt/test_schema_converters.py b/tests/utils/nvt/test_schema_converters.py index 03270a6da5..e7681f634e 100644 --- a/tests/utils/nvt/test_schema_converters.py +++ b/tests/utils/nvt/test_schema_converters.py @@ -13,8 +13,13 @@ # limitations under the License. import json +import warnings + +with warnings.catch_warnings(): + # Ignore warning regarding tensorflow not being installed + warnings.filterwarnings("ignore", message=".*No module named 'tensorflow'", category=UserWarning) + import nvtabular as nvt -import nvtabular as nvt import pandas as pd import pytest diff --git a/tests/utils/nvt/test_transforms.py b/tests/utils/nvt/test_transforms.py index c390f37627..6b967bc98b 100644 --- a/tests/utils/nvt/test_transforms.py +++ b/tests/utils/nvt/test_transforms.py @@ -12,9 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import warnings + import pandas as pd import pytest -from nvtabular.ops.operator import ColumnSelector + +with warnings.catch_warnings(): + # Ignore warning regarding tensorflow not being installed + warnings.filterwarnings("ignore", message=".*No module named 'tensorflow'", category=UserWarning) + from nvtabular.ops.operator import ColumnSelector from _utils.dataset_manager import DatasetManager from morpheus.utils.nvt.transforms import json_flatten From cb66ea0ce4ac178eb41f50248a9d694b667aa0a3 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Thu, 11 Jan 2024 16:23:27 -0800 Subject: [PATCH 27/46] avoid deprecation warning about using .astype to convert from a tz-aware type to a tz-naive type --- morpheus/utils/column_info.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/morpheus/utils/column_info.py b/morpheus/utils/column_info.py index 783bbb88c6..121878d9e1 100644 --- a/morpheus/utils/column_info.py +++ b/morpheus/utils/column_info.py @@ -387,7 +387,16 @@ def _process_column(self, df: pd.DataFrame) -> pd.Series: The processed column as a datetime Series. 
""" - return pd.to_datetime(df[self.input_name], infer_datetime_format=True, utc=True).astype(self.get_pandas_dtype()) + dt_series = pd.to_datetime(df[self.input_name], + infer_datetime_format=True, + utc=True) + + dtype = self.get_pandas_dtype() + if dtype == 'datetime64[ns]': + # avoid deprecation warning about using .astype to convert from a tz-aware type to a tz-naive type + return dt_series.dt.tz_localize(None) + + return dt_series.astype(dtype) @dataclasses.dataclass From 0913d14931fa7bdd4aac71c9ac8ea5f78ee58963 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Thu, 11 Jan 2024 16:24:43 -0800 Subject: [PATCH 28/46] Avoid cudf warning that reading a json file uses pandas, create fixtures for the input_df --- tests/test_column_info.py | 48 ++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/tests/test_column_info.py b/tests/test_column_info.py index 3e0c713773..b14205c1ce 100644 --- a/tests/test_column_info.py +++ b/tests/test_column_info.py @@ -18,6 +18,7 @@ import json import os from datetime import datetime +from datetime import timezone from functools import partial import numpy as np @@ -27,6 +28,8 @@ import cudf from _utils import TEST_DIRS +from morpheus.common import FileTypes +from morpheus.io.deserializers import read_file_to_df from morpheus.utils.column_info import ColumnInfo from morpheus.utils.column_info import CustomColumn from morpheus.utils.column_info import DataFrameInputSchema @@ -37,13 +40,26 @@ from morpheus.utils.nvt.schema_converters import create_and_attach_nvt_workflow from morpheus.utils.schema_transforms import process_dataframe - -@pytest.mark.use_python -def test_dataframe_input_schema_with_json_cols(): +@pytest.fixture(name="_azure_ad_logs_pdf", scope="module") +def fixture__azure_ad_logs_pdf(): + # Explicitly reading this in to ensure that lines=False. 
+ # Using pandas since the C++ impl for read_file_to_df doesn't support parser_kwargs, this also avoids a warning + # that cudf.read_json uses pandas.read_json under the hood. src_file = os.path.join(TEST_DIRS.tests_data_dir, "azure_ad_logs.json") + yield read_file_to_df(src_file, df_type='pandas', parser_kwargs={'lines': False}) - input_df = cudf.read_json(src_file) +@pytest.fixture(name="azure_ad_logs_pdf", scope="function") +def fixture_azure_ad_logs_pdf(_azure_ad_logs_pdf: pd.DataFrame): + yield _azure_ad_logs_pdf.copy(deep=True) +@pytest.fixture(name="azure_ad_logs_cdf", scope="function") +def fixture_azure_ad_logs_cdf(azure_ad_logs_pdf: pd.DataFrame): + # cudf.from_pandas essentially does a deep copy, so we can use this to ensure that the source pandas df is not + # modified + yield cudf.from_pandas(azure_ad_logs_pdf) + +@pytest.mark.use_python +def test_dataframe_input_schema_with_json_cols(azure_ad_logs_cdf: cudf.DataFrame): raw_data_columns = [ 'time', 'resourceId', @@ -63,8 +79,8 @@ def test_dataframe_input_schema_with_json_cols(): 'properties' ] - assert len(input_df.columns) == 16 - assert list(input_df.columns) == raw_data_columns + assert len(azure_ad_logs_cdf.columns) == 16 + assert list(azure_ad_logs_cdf.columns) == raw_data_columns column_info = [ DateTimeColumn(name="timestamp", dtype='datetime64[ns]', input_name="time"), @@ -89,10 +105,10 @@ def test_dataframe_input_schema_with_json_cols(): schema = DataFrameInputSchema(json_columns=["properties"], column_info=column_info) - df_processed_schema = process_dataframe(input_df, schema) + df_processed_schema = process_dataframe(azure_ad_logs_cdf, schema) processed_df_cols = df_processed_schema.columns - assert len(input_df) == len(df_processed_schema) + assert len(azure_ad_logs_cdf) == len(df_processed_schema) assert len(processed_df_cols) == len(column_info) assert "timestamp" in processed_df_cols assert "userId" in processed_df_cols @@ -100,17 +116,13 @@ def 
test_dataframe_input_schema_with_json_cols(): assert "properties.userPrincipalName" not in processed_df_cols nvt_workflow = create_and_attach_nvt_workflow(schema) - df_processed_workflow = process_dataframe(input_df, nvt_workflow) + df_processed_workflow = process_dataframe(azure_ad_logs_cdf, nvt_workflow) assert df_processed_schema.equals(df_processed_workflow) @pytest.mark.use_python -def test_dataframe_input_schema_without_json_cols(): - src_file = os.path.join(TEST_DIRS.tests_data_dir, "azure_ad_logs.json") - - input_df = pd.read_json(src_file) - - assert len(input_df.columns) == 16 +def test_dataframe_input_schema_without_json_cols(azure_ad_logs_pdf: pd.DataFrame): + assert len(azure_ad_logs_pdf.columns) == 16 column_info = [ DateTimeColumn(name="timestamp", dtype='datetime64[ns]', input_name="time"), @@ -119,10 +131,10 @@ def test_dataframe_input_schema_without_json_cols(): schema = DataFrameInputSchema(column_info=column_info) - df_processed = process_dataframe(input_df, schema) + df_processed = process_dataframe(azure_ad_logs_pdf, schema) processed_df_cols = df_processed.columns - assert len(input_df) == len(df_processed) + assert len(azure_ad_logs_pdf) == len(df_processed) assert len(processed_df_cols) == len(column_info) assert "timestamp" in processed_df_cols assert "time" not in processed_df_cols @@ -152,7 +164,7 @@ def test_dataframe_input_schema_without_json_cols(): # When trying to concat columns that don't exist in the dataframe, an exception is raised. 
with pytest.raises(Exception): - process_dataframe(input_df, schema2) + process_dataframe(azure_ad_logs_pdf, schema2) @pytest.mark.use_python From 07ea3fdc8fb1ddcfe9f3658ec88431d4a38ecf6e Mon Sep 17 00:00:00 2001 From: David Gardner Date: Thu, 11 Jan 2024 16:39:27 -0800 Subject: [PATCH 29/46] Ignore warning about an existing dask client, as we are explicitly constructing/passing in that client --- morpheus/utils/downloader.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/morpheus/utils/downloader.py b/morpheus/utils/downloader.py index d2882afa93..c762ae17a1 100644 --- a/morpheus/utils/downloader.py +++ b/morpheus/utils/downloader.py @@ -21,6 +21,7 @@ import os import threading import typing +import warnings from enum import Enum import fsspec @@ -131,7 +132,13 @@ def get_dask_client(self): dask.config.set({"distributed.client.heartbeat": self._dask_heartbeat_interval}) if (self._merlin_distributed is None): - self._merlin_distributed = Distributed(client=dask.distributed.Client(self.get_dask_cluster())) + with warnings.catch_warnings(): + # Merlin.Distributed will warn if a client already exists, the client in question is the one created + # here and explicitly passed to it in the constructor.
+ warnings.filterwarnings("ignore", + message="Existing Dask-client object detected in the current context.*", + category=UserWarning) + self._merlin_distributed = Distributed(client=dask.distributed.Client(self.get_dask_cluster())) return self._merlin_distributed From b4b1df268f8a7989960499749cf6c7d30c479ef6 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Thu, 11 Jan 2024 16:48:18 -0800 Subject: [PATCH 30/46] Ignore cudf warnings about Pandas being used under the hood for processing json --- pyproject.toml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c33d4e0e88..91056c2b2e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,12 +29,16 @@ filterwarnings = [ 'ignore:`np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe', 'ignore:Warning the df property returns a copy, please use the copy_dataframe method or the mutable_dataframe context manager to modify the DataFrame in-place instead.', 'ignore:`np.MachAr` is deprecated \(NumPy 1.22\):DeprecationWarning', - 'ignore:Please use `spmatrix` from the `scipy.sparse` namespace, the `scipy.sparse.base` namespace is deprecated:DeprecationWarning', + 'ignore:Please use `spmatrix` from the `scipy.sparse` namespace, the `scipy.sparse.base` namespace is deprecated:DeprecationWarning', # Deprecation warning from any project using distutils, currently known sources of this are: # GPUtils https://github.com/anderskm/gputil/issues/48 # PySpark https://issues.apache.org/jira/browse/SPARK-45390 'ignore:The distutils package is deprecated and slated for removal in Python 3.12. 
Use setuptools or check PEP 632 for potential alternatives', + + # Ignore cudf warnings about Pandas being used under the hood for processing json + 'ignore:Using CPU via Pandas to write JSON dataset', + 'ignore:Using CPU via Pandas to read JSON dataset', ] testpaths = ["tests"] From d88d5ef888d2911a22a5d92a10b6287411feca1b Mon Sep 17 00:00:00 2001 From: David Gardner Date: Thu, 11 Jan 2024 17:06:18 -0800 Subject: [PATCH 31/46] Ignore warning that is only produced when calling with an abnormally small tensor size --- tests/dfencoder/test_scalers.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/dfencoder/test_scalers.py b/tests/dfencoder/test_scalers.py index caa02fb472..4e0aec3bdd 100644 --- a/tests/dfencoder/test_scalers.py +++ b/tests/dfencoder/test_scalers.py @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings + import numpy as np import pytest import torch @@ -48,8 +50,14 @@ def modified_scaler(fit_tensor): @pytest.fixture(scope="function") def gauss_rank_scaler(fit_tensor): scaler = scalers.GaussRankScaler() - scaler.fit(fit_tensor) - yield scaler + + with warnings.catch_warnings(): + # This warning is triggered by the abnormally small tensor size used in this test + warnings.filterwarnings("ignore", + message=r"n_quantiles \(1000\) is greater than the total number of samples \(3\).*", + category=UserWarning) + scaler.fit(fit_tensor) + yield scaler def test_ensure_float_type(): From 45a92e6e796c45e0cba0548ac986497ee3bab234 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 12 Jan 2024 10:00:40 -0800 Subject: [PATCH 32/46] Adopt dgl v1.1.1 to pickup pytorch v2.0.1 support. 
Add rapids-dependency-file-generator to dev env --- conda/environments/all_cuda-118_arch-x86_64.yaml | 3 ++- conda/environments/dev_cuda-118_arch-x86_64.yaml | 1 + dependencies.yaml | 5 ++++- docker/conda/environments/cuda11.8_examples.yml | 2 +- examples/gnn_fraud_detection_pipeline/requirements.yml | 2 +- 5 files changed, 9 insertions(+), 4 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index b5365a4d90..f705ef865a 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -34,7 +34,7 @@ dependencies: - cython=0.29.24 - dask>=2023.1.1 - datacompy=0.8 -- dgl=1.0.2 +- dgl=1.1.1 - dill - dill=0.3.6 - distributed>=2023.1.1 @@ -113,5 +113,6 @@ dependencies: - pyarrow_hotfix - pymilvus==2.3.2 - pytest-kafka==0.6.0 + - rapids-dependency-file-generator name: all_cuda-118_arch-x86_64_py-310 diff --git a/conda/environments/dev_cuda-118_arch-x86_64.yaml b/conda/environments/dev_cuda-118_arch-x86_64.yaml index 27fed3396d..673a8f20cf 100644 --- a/conda/environments/dev_cuda-118_arch-x86_64.yaml +++ b/conda/environments/dev_cuda-118_arch-x86_64.yaml @@ -89,5 +89,6 @@ dependencies: - pyarrow_hotfix - pymilvus==2.3.2 - pytest-kafka==0.6.0 + - rapids-dependency-file-generator name: dev_cuda-118_arch-x86_64_py-310 diff --git a/dependencies.yaml b/dependencies.yaml index a2c757a890..4c2c0a07f1 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -164,6 +164,9 @@ dependencies: - isort - pylint>=2.17.4,<2.18 # 2.17.4 contains a fix for toml support - yapf=0.40.1 + - pip + - pip: + - rapids-dependency-file-generator docs: common: @@ -196,7 +199,7 @@ dependencies: - boto3 - cuml=23.06 - dask>=2023.1.1 - - dgl=1.0.2 + - dgl=1.1.1 - dill=0.3.6 - distributed>=2023.1.1 - huggingface_hub=0.10.1 # work-around for https://github.com/UKPLab/sentence-transformers/issues/1762 diff --git a/docker/conda/environments/cuda11.8_examples.yml 
b/docker/conda/environments/cuda11.8_examples.yml index 1d49130bcc..35d28977d7 100644 --- a/docker/conda/environments/cuda11.8_examples.yml +++ b/docker/conda/environments/cuda11.8_examples.yml @@ -32,7 +32,7 @@ dependencies: - boto3 - cuml=23.06 - dask>=2023.1.1 - - dgl=1.0.2 + - dgl=1.1.1 - dill=0.3.6 - distributed>=2023.1.1 - huggingface_hub=0.10.1 # work-around for https://github.com/UKPLab/sentence-transformers/issues/1762 diff --git a/examples/gnn_fraud_detection_pipeline/requirements.yml b/examples/gnn_fraud_detection_pipeline/requirements.yml index e0f37be2cd..3bb033db26 100644 --- a/examples/gnn_fraud_detection_pipeline/requirements.yml +++ b/examples/gnn_fraud_detection_pipeline/requirements.yml @@ -21,4 +21,4 @@ channels: - defaults dependencies: - cuml=23.06 - - dgl=1.0.2 + - dgl=1.1.1 From 12b71c3a389c21b3f3f3aaee342f1a539634d2d0 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Wed, 10 Jan 2024 12:14:47 -0800 Subject: [PATCH 33/46] Specify a version for sysroot_linux-64, without this I was getting out of date kernel headers and glibc link errors --- conda/environments/all_cuda-118_arch-x86_64.yaml | 1 + conda/environments/dev_cuda-118_arch-x86_64.yaml | 1 + dependencies.yaml | 1 + 3 files changed, 3 insertions(+) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index f705ef865a..52ad033372 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -95,6 +95,7 @@ dependencies: - sphinx - sphinx_rtd_theme - sqlalchemy<2.0 +- sysroot_linux-64=2.17 - tqdm=4 - transformers=4.30.2 - tritonclient=2.26 diff --git a/conda/environments/dev_cuda-118_arch-x86_64.yaml b/conda/environments/dev_cuda-118_arch-x86_64.yaml index 673a8f20cf..4d35b3507e 100644 --- a/conda/environments/dev_cuda-118_arch-x86_64.yaml +++ b/conda/environments/dev_cuda-118_arch-x86_64.yaml @@ -76,6 +76,7 @@ dependencies: - sphinx - sphinx_rtd_theme - sqlalchemy<2.0 +- 
sysroot_linux-64=2.17 - tqdm=4 - tritonclient=2.26 - typing_utils=0.1 diff --git a/dependencies.yaml b/dependencies.yaml index 4c2c0a07f1..7f8b16b514 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -139,6 +139,7 @@ dependencies: - pybind11-stubgen=0.10 - rapidjson=1.1.0 - scikit-build=0.17.1 + - sysroot_linux-64=2.17 - tritonclient=2.26 # Required by NvTabular, force the version, so we get protobufs compatible with 4.21 - ucx=1.14 From b5562fa8e1d133ff10883ded37b765a0e181ba43 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 9 Jan 2024 14:57:28 -0800 Subject: [PATCH 34/46] Switch to using conda to fetch pyarrow-hotfix --- ci/conda/recipes/morpheus/meta.yaml | 1 + conda/environments/all_cuda-118_arch-x86_64.yaml | 9 ++++----- conda/environments/dev_cuda-118_arch-x86_64.yaml | 7 +++---- dependencies.yaml | 2 +- docker/conda/environments/cuda11.8_dev.yml | 2 +- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/ci/conda/recipes/morpheus/meta.yaml b/ci/conda/recipes/morpheus/meta.yaml index 039920e272..ba9e0c6f96 100644 --- a/ci/conda/recipes/morpheus/meta.yaml +++ b/ci/conda/recipes/morpheus/meta.yaml @@ -92,6 +92,7 @@ outputs: - pandas 1.3.* - pluggy 1.0.* - pyarrow * *_cuda # Ensure we get a CUDA build. 
Version determined by cuDF + - pyarrow-hotfix # CVE-2023-47248 - python - python-confluent-kafka 1.9.2 - pytorch 2.0.1 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 52ad033372..9b2588f8a5 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -34,7 +34,7 @@ dependencies: - cython=0.29.24 - dask>=2023.1.1 - datacompy=0.8 -- dgl=1.1.1 +- dgl=1.0.2 - dill - dill=0.3.6 - distributed>=2023.1.1 @@ -46,6 +46,7 @@ dependencies: - flake8 - gcc_linux-64=11.2 - git-lfs +- glog=0.6 - grpcio - gxx_linux-64=11.2 - huggingface_hub=0.10.1 @@ -65,7 +66,6 @@ dependencies: - ninja=1.10 - nlohmann_json=3.9 - nodejs=18.* -- numpy>=1.21,<1.25 - numpydoc=1.4 - nvtabular=23.06 - openai=0.28 @@ -74,6 +74,7 @@ dependencies: - pluggy=1.0 - pre-commit - protobuf=4.21.* +- pyarrow-hotfix - pybind11-stubgen=0.10 - pylint>=2.17.4,<2.18 - pypdf=3.16 @@ -95,13 +96,13 @@ dependencies: - sphinx - sphinx_rtd_theme - sqlalchemy<2.0 -- sysroot_linux-64=2.17 - tqdm=4 - transformers=4.30.2 - tritonclient=2.26 - typing_utils=0.1 - ucx=1.14 - ujson=5.8 +- versioneer - watchdog=2.1 - websockets - yapf=0.40.1 @@ -111,9 +112,7 @@ dependencies: - grpcio-status==1.58 - milvus==2.3.2 - nemollm - - pyarrow_hotfix - pymilvus==2.3.2 - pytest-kafka==0.6.0 - - rapids-dependency-file-generator name: all_cuda-118_arch-x86_64_py-310 diff --git a/conda/environments/dev_cuda-118_arch-x86_64.yaml b/conda/environments/dev_cuda-118_arch-x86_64.yaml index 4d35b3507e..17b860dddd 100644 --- a/conda/environments/dev_cuda-118_arch-x86_64.yaml +++ b/conda/environments/dev_cuda-118_arch-x86_64.yaml @@ -38,6 +38,7 @@ dependencies: - flake8 - gcc_linux-64=11.2 - git-lfs +- glog=0.6 - grpcio - gxx_linux-64=11.2 - include-what-you-use=0.20 @@ -52,13 +53,13 @@ dependencies: - ninja=1.10 - nlohmann_json=3.9 - nodejs=18.* -- numpy>=1.21,<1.25 - numpydoc=1.4 - nvtabular=23.06 - pip - pluggy=1.0 - 
pre-commit - protobuf=4.21.* +- pyarrow-hotfix - pybind11-stubgen=0.10 - pylint>=2.17.4,<2.18 - pytest @@ -76,20 +77,18 @@ dependencies: - sphinx - sphinx_rtd_theme - sqlalchemy<2.0 -- sysroot_linux-64=2.17 - tqdm=4 - tritonclient=2.26 - typing_utils=0.1 - ucx=1.14 +- versioneer - watchdog=2.1 - websockets - yapf=0.40.1 - pip: - databricks-connect - milvus==2.3.2 - - pyarrow_hotfix - pymilvus==2.3.2 - pytest-kafka==0.6.0 - - rapids-dependency-file-generator name: dev_cuda-118_arch-x86_64_py-310 diff --git a/dependencies.yaml b/dependencies.yaml index 7f8b16b514..4c790e4080 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -246,6 +246,7 @@ dependencies: - numpy>=1.21,<1.25 - numpydoc=1.4 - nvtabular=23.06 + - pyarrow-hotfix - python-confluent-kafka=1.9.2 - python-graphviz - pytorch-cuda @@ -261,7 +262,6 @@ dependencies: - pip: - databricks-connect - milvus==2.3.2 - - pyarrow_hotfix - pymilvus==2.3.2 test_python_morpheus: diff --git a/docker/conda/environments/cuda11.8_dev.yml b/docker/conda/environments/cuda11.8_dev.yml index 2fc7576cde..3d6b029bbd 100644 --- a/docker/conda/environments/cuda11.8_dev.yml +++ b/docker/conda/environments/cuda11.8_dev.yml @@ -84,6 +84,7 @@ dependencies: - pluggy=1.0 - protobuf=4.21.* - pyarrow * *_cuda # Ensure we get a CUDA build. Version determined by cuDF + - pyarrow-hotfix # CVE-2023-47248. See morpheus/__init__.py for more details - pybind11-stubgen=0.10.5 - pydot - pylint>=2.17.4,<2.18 # 2.17.4 contains a fix for toml support @@ -125,6 +126,5 @@ dependencies: # Add additional dev dependencies here - databricks-connect - milvus==2.3.2 - - pyarrow_hotfix # CVE-2023-47248. 
See morpheus/__init__.py for more details - pymilvus==2.3.2 - pytest-kafka==0.6.0 From e07b1c636382e1275e207c2b7f9ebc77f1f95e70 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 12 Jan 2024 11:32:58 -0800 Subject: [PATCH 35/46] Ensure we receive a cuda enabled build of pyarrow re-order channels ensuring nvidia & rapidsai versions of packages receive priority Add missing dep for ccache Pin pandas to 1.3, more recent versions trigger deprecation warnings from cudf Add in pip transitive deps, ensuring we receive the conda versions rather than the pip versions --- .../all_cuda-118_arch-x86_64.yaml | 19 +++++++++++++------ .../dev_cuda-118_arch-x86_64.yaml | 17 ++++++++++++----- dependencies.yaml | 18 +++++++++++++++--- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 9b2588f8a5..dd9060236f 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -1,14 +1,13 @@ # This file is generated by `rapids-dependency-file-generator`. # To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. 
channels: -- conda-forge -- huggingface - rapidsai - nvidia/label/cuda-11.8.0 - nvidia -- rapidsai-nightly - nvidia/label/dev +- huggingface - pytorch +- conda-forge - dglteam/label/cu118 dependencies: - appdirs @@ -17,6 +16,8 @@ dependencies: - boost-cpp=1.82 - boto3 - breathe=4.34.0 +- cachetools=5.0.0 +- ccache>=3.7 - clangdev=16 - click >=8 - click>=8 @@ -34,7 +35,7 @@ dependencies: - cython=0.29.24 - dask>=2023.1.1 - datacompy=0.8 -- dgl=1.0.2 +- dgl=1.1.1 - dill - dill=0.3.6 - distributed>=2023.1.1 @@ -46,13 +47,13 @@ dependencies: - flake8 - gcc_linux-64=11.2 - git-lfs -- glog=0.6 - grpcio - gxx_linux-64=11.2 - huggingface_hub=0.10.1 - include-what-you-use=0.20 - ipython - isort +- kafka-python=2.0 - langchain=0.0.190 - librdkafka=1.9.2 - libwebp>=1.3.2 @@ -66,14 +67,19 @@ dependencies: - ninja=1.10 - nlohmann_json=3.9 - nodejs=18.* +- numpy>=1.21,<1.25 - numpydoc=1.4 - nvtabular=23.06 - openai=0.28 +- pandas=1.3 - papermill=2.3.4 - pip - pluggy=1.0 +- port-for=0.7 - pre-commit - protobuf=4.21.* +- py4j=0.10 +- pyarrow * *_cuda - pyarrow-hotfix - pybind11-stubgen=0.10 - pylint>=2.17.4,<2.18 @@ -96,13 +102,13 @@ dependencies: - sphinx - sphinx_rtd_theme - sqlalchemy<2.0 +- sysroot_linux-64=2.17 - tqdm=4 - transformers=4.30.2 - tritonclient=2.26 - typing_utils=0.1 - ucx=1.14 - ujson=5.8 -- versioneer - watchdog=2.1 - websockets - yapf=0.40.1 @@ -114,5 +120,6 @@ dependencies: - nemollm - pymilvus==2.3.2 - pytest-kafka==0.6.0 + - rapids-dependency-file-generator name: all_cuda-118_arch-x86_64_py-310 diff --git a/conda/environments/dev_cuda-118_arch-x86_64.yaml b/conda/environments/dev_cuda-118_arch-x86_64.yaml index 17b860dddd..9377ff1b48 100644 --- a/conda/environments/dev_cuda-118_arch-x86_64.yaml +++ b/conda/environments/dev_cuda-118_arch-x86_64.yaml @@ -1,20 +1,21 @@ # This file is generated by `rapids-dependency-file-generator`. # To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. 
channels: -- conda-forge -- huggingface - rapidsai - nvidia/label/cuda-11.8.0 - nvidia -- rapidsai-nightly - nvidia/label/dev +- huggingface - pytorch +- conda-forge - dglteam/label/cu118 dependencies: - appdirs - benchmark=1.6.0 - boost-cpp=1.82 - breathe=4.34.0 +- cachetools=5.0.0 +- ccache>=3.7 - clangdev=16 - click >=8 - click>=8 @@ -38,12 +39,12 @@ dependencies: - flake8 - gcc_linux-64=11.2 - git-lfs -- glog=0.6 - grpcio - gxx_linux-64=11.2 - include-what-you-use=0.20 - ipython - isort +- kafka-python=2.0 - librdkafka=1.9.2 - mlflow>=2.2.1,<3 - mrc=24.03 @@ -53,12 +54,17 @@ dependencies: - ninja=1.10 - nlohmann_json=3.9 - nodejs=18.* +- numpy>=1.21,<1.25 - numpydoc=1.4 - nvtabular=23.06 +- pandas=1.3 - pip - pluggy=1.0 +- port-for=0.7 - pre-commit - protobuf=4.21.* +- py4j=0.10 +- pyarrow * *_cuda - pyarrow-hotfix - pybind11-stubgen=0.10 - pylint>=2.17.4,<2.18 @@ -77,11 +83,11 @@ dependencies: - sphinx - sphinx_rtd_theme - sqlalchemy<2.0 +- sysroot_linux-64=2.17 - tqdm=4 - tritonclient=2.26 - typing_utils=0.1 - ucx=1.14 -- versioneer - watchdog=2.1 - websockets - yapf=0.40.1 @@ -90,5 +96,6 @@ dependencies: - milvus==2.3.2 - pymilvus==2.3.2 - pytest-kafka==0.6.0 + - rapids-dependency-file-generator name: dev_cuda-118_arch-x86_64_py-310 diff --git a/dependencies.yaml b/dependencies.yaml index 4c790e4080..172be0ded1 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -107,14 +107,13 @@ files: channels: - - conda-forge - - huggingface - rapidsai - nvidia/label/cuda-11.8.0 - nvidia - - rapidsai-nightly - nvidia/label/dev + - huggingface - pytorch + - conda-forge - dglteam/label/cu118 dependencies: @@ -126,6 +125,8 @@ dependencies: - mrc=24.03 # should this be in build, or somewhere else? 
- gcc_linux-64=11.2 - gxx_linux-64=11.2 + - cachetools=5.0.0 + - ccache>=3.7 - cmake=3.25 - boost-cpp=1.82 - cuda-nvcc @@ -135,7 +136,10 @@ dependencies: - librdkafka=1.9.2 - ninja=1.10 - nlohmann_json=3.9 + - pandas=1.3 - protobuf=4.21.* + - pyarrow * *_cuda # Ensure we get a CUDA build. Version determined by cuDF + - pyarrow-hotfix - pybind11-stubgen=0.10 - rapidjson=1.1.0 - scikit-build=0.17.1 @@ -246,6 +250,7 @@ dependencies: - numpy>=1.21,<1.25 - numpydoc=1.4 - nvtabular=23.06 + - pyarrow * *_cuda # Ensure we get a CUDA build. Version determined by cuDF - pyarrow-hotfix - python-confluent-kafka=1.9.2 - python-graphviz @@ -258,6 +263,13 @@ dependencies: - typing_utils=0.1 - watchdog=2.1 - websockets + + ####### Pip Transitive Dependencies (keep sorted!) ####### + # These are dependencies that are available on conda, but are required by the pip packages listed below. Its much + # better to install them with conda than pip to allow for better dependency resolution. + - kafka-python=2.0 + - port-for=0.7 + - py4j=0.10 - pip - pip: - databricks-connect From 82ca88b316063b0669fdb9c9aad239cc06f2ba54 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 12 Jan 2024 12:13:19 -0800 Subject: [PATCH 36/46] update CR --- docker/conda/environments/cuda11.8_examples.yml | 2 +- examples/gnn_fraud_detection_pipeline/requirements.yml | 2 +- morpheus/_lib/src/objects/table_info.cpp | 2 +- morpheus/stages/preprocess/preprocess_nlp_stage.py | 2 +- morpheus/utils/downloader.py | 2 +- tests/_utils/dataset_manager.py | 2 +- tests/dfencoder/test_scalers.py | 2 +- .../digital_fingerprinting/test_dfp_postprocessing_stage.py | 2 +- .../digital_fingerprinting/test_dfp_split_users_stage.py | 2 +- tests/examples/llm/common/conftest.py | 2 +- tests/examples/ransomware_detection/test_preprocessing.py | 2 +- tests/llm/conftest.py | 2 +- tests/llm/test_completion_pipe.py | 2 +- tests/pipeline/test_pipeline.py | 2 +- tests/test_column_info.py | 2 +- 
tests/utils/nvt/test_json_flatten_transform.py | 2 +- tests/utils/nvt/test_mutate_op.py | 2 +- tests/utils/nvt/test_schema_converters.py | 2 +- tests/utils/nvt/test_transforms.py | 2 +- 19 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docker/conda/environments/cuda11.8_examples.yml b/docker/conda/environments/cuda11.8_examples.yml index 35d28977d7..3b549e1f17 100644 --- a/docker/conda/environments/cuda11.8_examples.yml +++ b/docker/conda/environments/cuda11.8_examples.yml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/examples/gnn_fraud_detection_pipeline/requirements.yml b/examples/gnn_fraud_detection_pipeline/requirements.yml index 3bb033db26..f15ba3847c 100644 --- a/examples/gnn_fraud_detection_pipeline/requirements.yml +++ b/examples/gnn_fraud_detection_pipeline/requirements.yml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/morpheus/_lib/src/objects/table_info.cpp b/morpheus/_lib/src/objects/table_info.cpp index 522ddaa918..dd89427a69 100644 --- a/morpheus/_lib/src/objects/table_info.cpp +++ b/morpheus/_lib/src/objects/table_info.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/morpheus/stages/preprocess/preprocess_nlp_stage.py b/morpheus/stages/preprocess/preprocess_nlp_stage.py index c3eb94c3fe..cebafa6d65 100644 --- a/morpheus/stages/preprocess/preprocess_nlp_stage.py +++ b/morpheus/stages/preprocess/preprocess_nlp_stage.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/morpheus/utils/downloader.py b/morpheus/utils/downloader.py index c762ae17a1..0a68ae6e14 100644 --- a/morpheus/utils/downloader.py +++ b/morpheus/utils/downloader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/_utils/dataset_manager.py b/tests/_utils/dataset_manager.py index 16095f5279..72a277cf21 100644 --- a/tests/_utils/dataset_manager.py +++ b/tests/_utils/dataset_manager.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/dfencoder/test_scalers.py b/tests/dfencoder/test_scalers.py index 4e0aec3bdd..46b7344dd1 100644 --- a/tests/dfencoder/test_scalers.py +++ b/tests/dfencoder/test_scalers.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/examples/digital_fingerprinting/test_dfp_postprocessing_stage.py b/tests/examples/digital_fingerprinting/test_dfp_postprocessing_stage.py index 2808d0a8a6..4b13bacde5 100644 --- a/tests/examples/digital_fingerprinting/test_dfp_postprocessing_stage.py +++ b/tests/examples/digital_fingerprinting/test_dfp_postprocessing_stage.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/examples/digital_fingerprinting/test_dfp_split_users_stage.py b/tests/examples/digital_fingerprinting/test_dfp_split_users_stage.py index aae17e1060..8189df73fe 100644 --- a/tests/examples/digital_fingerprinting/test_dfp_split_users_stage.py +++ b/tests/examples/digital_fingerprinting/test_dfp_split_users_stage.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/examples/llm/common/conftest.py b/tests/examples/llm/common/conftest.py index 2769439468..fa9a6bc25a 100644 --- a/tests/examples/llm/common/conftest.py +++ b/tests/examples/llm/common/conftest.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/examples/ransomware_detection/test_preprocessing.py b/tests/examples/ransomware_detection/test_preprocessing.py index 134ff42ebd..a72225edbf 100644 --- a/tests/examples/ransomware_detection/test_preprocessing.py +++ b/tests/examples/ransomware_detection/test_preprocessing.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/llm/conftest.py b/tests/llm/conftest.py index 3b9653e1e0..38db5d6c4d 100644 --- a/tests/llm/conftest.py +++ b/tests/llm/conftest.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/llm/test_completion_pipe.py b/tests/llm/test_completion_pipe.py index 582a295ab0..65940bb6d4 100644 --- a/tests/llm/test_completion_pipe.py +++ b/tests/llm/test_completion_pipe.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 355310ce48..014fb4ca40 100755 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/test_column_info.py b/tests/test_column_info.py index b14205c1ce..d546dfc142 100644 --- a/tests/test_column_info.py +++ b/tests/test_column_info.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/utils/nvt/test_json_flatten_transform.py b/tests/utils/nvt/test_json_flatten_transform.py index 7f00722f7d..e0657925f5 100644 --- a/tests/utils/nvt/test_json_flatten_transform.py +++ b/tests/utils/nvt/test_json_flatten_transform.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/utils/nvt/test_mutate_op.py b/tests/utils/nvt/test_mutate_op.py index a1d6998c7d..3023d9701e 100644 --- a/tests/utils/nvt/test_mutate_op.py +++ b/tests/utils/nvt/test_mutate_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/utils/nvt/test_schema_converters.py b/tests/utils/nvt/test_schema_converters.py index e7681f634e..9b00440d1a 100644 --- a/tests/utils/nvt/test_schema_converters.py +++ b/tests/utils/nvt/test_schema_converters.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/utils/nvt/test_transforms.py b/tests/utils/nvt/test_transforms.py index 6b967bc98b..96df15447c 100644 --- a/tests/utils/nvt/test_transforms.py +++ b/tests/utils/nvt/test_transforms.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 95748a4a45c0b0d1fa85c30e93be7c5dd1c86730 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 12 Jan 2024 12:18:01 -0800 Subject: [PATCH 37/46] lint fixes --- tests/dfencoder/test_scalers.py | 20 ++++++++++---------- tests/test_column_info.py | 6 ++++-- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/dfencoder/test_scalers.py b/tests/dfencoder/test_scalers.py index 46b7344dd1..3eaaa6b754 100644 --- a/tests/dfencoder/test_scalers.py +++ b/tests/dfencoder/test_scalers.py @@ -23,32 +23,32 @@ from morpheus.models.dfencoder import scalers -@pytest.fixture(scope="function") -def fit_tensor(): +@pytest.fixture(name="fit_tensor", scope="function") +def fit_tensor_fixture(): yield torch.tensor([4.4, 5.3, 6.5], dtype=torch.float32) -@pytest.fixture(scope="function") -def tensor(): +@pytest.fixture(name="tensor", scope="function") +def tensor_fixture(): yield torch.tensor([7.4, 8.3, 9.5], dtype=torch.float32) -@pytest.fixture(scope="function") -def standard_scaler(fit_tensor): +@pytest.fixture(name="standard_scalar", scope="function") +def standard_scaler_fixture(fit_tensor): scaler = scalers.StandardScaler() scaler.fit(fit_tensor) yield scaler -@pytest.fixture(scope="function") -def modified_scaler(fit_tensor): +@pytest.fixture(name="modified_scalar", scope="function") +def modified_scaler_fixture(fit_tensor): scaler = scalers.ModifiedScaler() scaler.fit(fit_tensor) yield scaler -@pytest.fixture(scope="function") -def gauss_rank_scaler(fit_tensor): +@pytest.fixture(name="gauss_rank_scaler", scope="function") +def gauss_rank_scaler_fixture(fit_tensor): scaler = scalers.GaussRankScaler() with warnings.catch_warnings(): diff --git a/tests/test_column_info.py b/tests/test_column_info.py index d546dfc142..829995c534 100644 --- a/tests/test_column_info.py +++ b/tests/test_column_info.py @@ -18,7 +18,6 @@ import json import os from datetime import datetime -from datetime import timezone from functools import partial import numpy as np @@ -28,7 +27,6 @@ 
import cudf from _utils import TEST_DIRS -from morpheus.common import FileTypes from morpheus.io.deserializers import read_file_to_df from morpheus.utils.column_info import ColumnInfo from morpheus.utils.column_info import CustomColumn @@ -40,6 +38,7 @@ from morpheus.utils.nvt.schema_converters import create_and_attach_nvt_workflow from morpheus.utils.schema_transforms import process_dataframe + @pytest.fixture(name="_azure_ad_logs_pdf", scope="module") def fixture__azure_ad_logs_pdf(): # Explicitly reading this in to ensure that lines=False. @@ -48,16 +47,19 @@ def fixture__azure_ad_logs_pdf(): src_file = os.path.join(TEST_DIRS.tests_data_dir, "azure_ad_logs.json") yield read_file_to_df(src_file, df_type='pandas', parser_kwargs={'lines': False}) + @pytest.fixture(name="azure_ad_logs_pdf", scope="function") def fixture_azure_ad_logs_pdf(_azure_ad_logs_pdf: pd.DataFrame): yield _azure_ad_logs_pdf.copy(deep=True) + @pytest.fixture(name="azure_ad_logs_cdf", scope="function") def fixture_azure_ad_logs_cdf(azure_ad_logs_pdf: pd.DataFrame): # cudf.from_pandas essentially does a deep copy, so we can use this to ensure that the source pandas df is not # modified yield cudf.from_pandas(azure_ad_logs_pdf) + @pytest.mark.use_python def test_dataframe_input_schema_with_json_cols(azure_ad_logs_cdf: cudf.DataFrame): raw_data_columns = [ From 02c74fbcebdb8938c7e59c02f9ae3a3b69a04d38 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 12 Jan 2024 12:18:30 -0800 Subject: [PATCH 38/46] formatting --- morpheus/utils/column_info.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/morpheus/utils/column_info.py b/morpheus/utils/column_info.py index 121878d9e1..59ce19a6ba 100644 --- a/morpheus/utils/column_info.py +++ b/morpheus/utils/column_info.py @@ -387,9 +387,7 @@ def _process_column(self, df: pd.DataFrame) -> pd.Series: The processed column as a datetime Series. 
""" - dt_series = pd.to_datetime(df[self.input_name], - infer_datetime_format=True, - utc=True) + dt_series = pd.to_datetime(df[self.input_name], infer_datetime_format=True, utc=True) dtype = self.get_pandas_dtype() if dtype == 'datetime64[ns]': From 1347cb38705b4efee381f01fa57163d8c4845df3 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 12 Jan 2024 12:22:46 -0800 Subject: [PATCH 39/46] lint fixes --- tests/dfencoder/test_scalers.py | 16 +++++++--------- tests/llm/conftest.py | 2 +- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/dfencoder/test_scalers.py b/tests/dfencoder/test_scalers.py index 3eaaa6b754..a9af2cf067 100644 --- a/tests/dfencoder/test_scalers.py +++ b/tests/dfencoder/test_scalers.py @@ -115,8 +115,7 @@ def test_modified_scaler_transform(modified_scaler, tensor): assert torch.equal(torch.round(results, decimals=2), expected), f"{results} != {expected}" # Test alternate path where median absolute deviation is 1 - t = torch.tensor([3.0, 4.0, 4.0, 5.0]) - modified_scaler.fit(t) + modified_scaler.fit(torch.tensor([3.0, 4.0, 4.0, 5.0])) results = modified_scaler.transform(tensor) expected = torch.tensor([5.43, 6.86, 8.78]) assert torch.equal(torch.round(results, decimals=2), expected), f"{results} != {expected}" @@ -128,8 +127,7 @@ def test_modified_scaler_inverse_transform(modified_scaler, tensor): assert torch.equal(torch.round(results, decimals=2), expected), f"{results} != {expected}" # Test alternate path where median absolute deviation is 1 - t = torch.tensor([3.0, 4.0, 4.0, 5.0]) - modified_scaler.fit(t) + modified_scaler.fit(torch.tensor([3.0, 4.0, 4.0, 5.0])) results = modified_scaler.inverse_transform(tensor) expected = torch.tensor([8.64, 9.2, 9.95]) assert torch.equal(torch.round(results, decimals=2), expected), f"{results} != {expected}" @@ -161,13 +159,13 @@ def test_gauss_rank_scaler_fit_transform(gauss_rank_scaler, tensor): def test_null_scaler(tensor): orig = tensor.to(dtype=torch.float32, copy=True) - ns = 
scalers.NullScaler() - ns.fit(tensor) + scalar = scalers.NullScaler() + scalar.fit(tensor) # Verify it does nothing - assert ns.transform(tensor) is tensor - assert ns.inverse_transform(tensor) is tensor - assert ns.fit_transform(tensor) is tensor + assert scalar.transform(tensor) is tensor + assert scalar.inverse_transform(tensor) is tensor + assert scalar.fit_transform(tensor) is tensor # After all that the values should be the same assert torch.equal(tensor, orig), f"{tensor} != {orig}" diff --git a/tests/llm/conftest.py b/tests/llm/conftest.py index 38db5d6c4d..f92a16d148 100644 --- a/tests/llm/conftest.py +++ b/tests/llm/conftest.py @@ -108,7 +108,7 @@ def mock_nemollm_fixture(mock_nemollm: mock.MagicMock): async def mock_task(fut: asyncio.Future, value: typing.Any = mock.DEFAULT): fut.set_result(value) - def create_future(*args, **kwargs) -> asyncio.Future: + def create_future(*args, **kwargs) -> asyncio.Future: # pylint: disable=unused-argument event_loop = asyncio.get_event_loop() fut = event_loop.create_future() event_loop.create_task(mock_task(fut, mock.DEFAULT)) From 6891868af95fca092589f979888784dc69e2eadd Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 12 Jan 2024 12:28:52 -0800 Subject: [PATCH 40/46] Fix fixture names --- tests/dfencoder/test_scalers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dfencoder/test_scalers.py b/tests/dfencoder/test_scalers.py index a9af2cf067..7166a88e67 100644 --- a/tests/dfencoder/test_scalers.py +++ b/tests/dfencoder/test_scalers.py @@ -33,14 +33,14 @@ def tensor_fixture(): yield torch.tensor([7.4, 8.3, 9.5], dtype=torch.float32) -@pytest.fixture(name="standard_scalar", scope="function") +@pytest.fixture(name="standard_scaler", scope="function") def standard_scaler_fixture(fit_tensor): scaler = scalers.StandardScaler() scaler.fit(fit_tensor) yield scaler -@pytest.fixture(name="modified_scalar", scope="function") +@pytest.fixture(name="modified_scaler", scope="function") def 
modified_scaler_fixture(fit_tensor): scaler = scalers.ModifiedScaler() scaler.fit(fit_tensor) From 05db11ff70e7d770ff213f8f0f45740127b13f00 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 12 Jan 2024 13:19:26 -0800 Subject: [PATCH 41/46] Ignore warnings generated by the kafka fixtures themselves, but not the actual morpheus code --- tests/_utils/kafka.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/_utils/kafka.py b/tests/_utils/kafka.py index 1921e9e289..f9f55e7fac 100644 --- a/tests/_utils/kafka.py +++ b/tests/_utils/kafka.py @@ -18,6 +18,7 @@ import subprocess import time import typing +import warnings from collections import namedtuple from functools import partial @@ -73,10 +74,15 @@ def seek_to_beginning(kafka_consumer: "KafkaConsumer", timeout: int = PARTITION_ @pytest.fixture(name='kafka_consumer', scope='function') def kafka_consumer_fixture(kafka_topics: KafkaTopics, _kafka_consumer: "KafkaConsumer"): - _kafka_consumer.subscribe([kafka_topics.output_topic]) - seek_to_beginning(_kafka_consumer) - - yield _kafka_consumer + with warnings.catch_warnings(): + # Ignore warnings specific to the test fixture and not the actual morpheus code + warnings.filterwarnings("ignore", + message=r"Exception ignored in:.*ConsumerCoordinator\.__del__", + category=pytest.PytestUnraisableExceptionWarning) + _kafka_consumer.subscribe([kafka_topics.output_topic]) + seek_to_beginning(_kafka_consumer) + + yield _kafka_consumer def _init_pytest_kafka() -> (bool, Exception): From 27c0a18e80f385ede0be57a77669a31d9cc33671 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 12 Jan 2024 13:44:28 -0800 Subject: [PATCH 42/46] Fix CR --- tests/_utils/kafka.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/_utils/kafka.py b/tests/_utils/kafka.py index f9f55e7fac..21e8dee721 100644 --- a/tests/_utils/kafka.py +++ b/tests/_utils/kafka.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); From f6eaab443d5c4519203a0fc9e86a2dec04494e9e Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 12 Jan 2024 13:56:46 -0800 Subject: [PATCH 43/46] Don't mock asyncio.gather because it is also used by jinja --- tests/llm/test_rag_standalone_pipe.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/tests/llm/test_rag_standalone_pipe.py b/tests/llm/test_rag_standalone_pipe.py index a98c9e1c1a..b9e8a070dc 100644 --- a/tests/llm/test_rag_standalone_pipe.py +++ b/tests/llm/test_rag_standalone_pipe.py @@ -131,24 +131,18 @@ def _run_pipeline(config: Config, @pytest.mark.use_cudf @pytest.mark.parametrize("repeat_count", [5]) @pytest.mark.import_mod(os.path.join(TEST_DIRS.examples_dir, 'llm/common/utils.py')) -@mock.patch("asyncio.wrap_future") -@mock.patch("asyncio.gather", new_callable=mock.AsyncMock) -def test_rag_standalone_pipe_nemo( - mock_asyncio_gather: mock.AsyncMock, - mock_asyncio_wrap_future: mock.MagicMock, # pylint: disable=unused-argument - config: Config, - mock_nemollm: mock.MagicMock, - dataset: DatasetManager, - milvus_server_uri: str, - repeat_count: int, - import_mod: types.ModuleType): +def test_rag_standalone_pipe_nemo(config: Config, + mock_nemollm: mock.MagicMock, + dataset: DatasetManager, + milvus_server_uri: str, + repeat_count: int, + import_mod: types.ModuleType): collection_name = "test_rag_standalone_pipe_nemo" populate_milvus(milvus_server_uri=milvus_server_uri, collection_name=collection_name, resource_kwargs=import_mod.build_milvus_config(embedding_size=EMBEDDING_SIZE), df=dataset["service/milvus_rss_data.json"], overwrite=True) - mock_asyncio_gather.return_value = [mock.MagicMock() for _ in range(repeat_count)] 
mock_nemollm.post_process_generate_response.side_effect = [{"text": EXPECTED_RESPONSE} for _ in range(repeat_count)] results = _run_pipeline( config=config, From a973eb2fd43c4f1902852679acd146cf99e592fb Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 12 Jan 2024 14:16:24 -0800 Subject: [PATCH 44/46] Set max_batch_size to match model's max_batch_size --- examples/llm/vdb_upload/run.py | 4 ++-- tests/llm/test_vdb_upload_pipe.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/llm/vdb_upload/run.py b/examples/llm/vdb_upload/run.py index fb127f4fac..74b24e52c7 100644 --- a/examples/llm/vdb_upload/run.py +++ b/examples/llm/vdb_upload/run.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -46,7 +46,7 @@ def run(): ) @click.option( "--model_max_batch_size", - default=64, + default=256, type=click.IntRange(min=1), help="Max batch size to use for the model", ) diff --git a/tests/llm/test_vdb_upload_pipe.py b/tests/llm/test_vdb_upload_pipe.py index fb0599f938..c1213a70c3 100644 --- a/tests/llm/test_vdb_upload_pipe.py +++ b/tests/llm/test_vdb_upload_pipe.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -37,7 +37,7 @@ from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage EMBEDDING_SIZE = 384 -MODEL_MAX_BATCH_SIZE = 64 +MODEL_MAX_BATCH_SIZE = 256 MODEL_FEA_LENGTH = 512 @@ -116,7 +116,7 @@ def test_vdb_upload_pipe(mock_triton_client: mock.MagicMock, "name": "output", "datatype": "FP32", "shape": [-1, EMBEDDING_SIZE] }] } - mock_model_config = {"config": {"max_batch_size": 256}} + mock_model_config = {"config": {"max_batch_size": MODEL_MAX_BATCH_SIZE}} mock_triton_client.return_value = mock_triton_client mock_triton_client.is_server_live.return_value = True From c7eecae53aadcddb2b0a841ed3bc6dbdf6e0631e Mon Sep 17 00:00:00 2001 From: David Gardner Date: Fri, 12 Jan 2024 14:39:12 -0800 Subject: [PATCH 45/46] Update CR --- tests/llm/test_rag_standalone_pipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llm/test_rag_standalone_pipe.py b/tests/llm/test_rag_standalone_pipe.py index b9e8a070dc..583f84944a 100644 --- a/tests/llm/test_rag_standalone_pipe.py +++ b/tests/llm/test_rag_standalone_pipe.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); From 2169ad3e620d06b2c656aa189679c4416ab30cdd Mon Sep 17 00:00:00 2001 From: David Gardner Date: Wed, 17 Jan 2024 15:04:15 -0800 Subject: [PATCH 46/46] Rename fixture methods to conform to a _fixture suffix --- tests/test_column_info.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_column_info.py b/tests/test_column_info.py index 829995c534..4cd71a9804 100644 --- a/tests/test_column_info.py +++ b/tests/test_column_info.py @@ -40,7 +40,7 @@ @pytest.fixture(name="_azure_ad_logs_pdf", scope="module") -def fixture__azure_ad_logs_pdf(): +def _azure_ad_logs_pdf_fixture(): # Explicitly reading this in to ensure that lines=False. # Using pandas since the C++ impl for read_file_to_df doesn't support parser_kwargs, this also avoids a warning # that cudf.read_json uses pandas.read_json under the hood. @@ -49,15 +49,15 @@ def fixture__azure_ad_logs_pdf(): @pytest.fixture(name="azure_ad_logs_pdf", scope="function") -def fixture_azure_ad_logs_pdf(_azure_ad_logs_pdf: pd.DataFrame): +def azure_ad_logs_pdf_fixture(_azure_ad_logs_pdf: pd.DataFrame): yield _azure_ad_logs_pdf.copy(deep=True) @pytest.fixture(name="azure_ad_logs_cdf", scope="function") -def fixture_azure_ad_logs_cdf(azure_ad_logs_pdf: pd.DataFrame): +def azure_ad_logs_cdf_fixture(_azure_ad_logs_pdf: pd.DataFrame): # cudf.from_pandas essentially does a deep copy, so we can use this to ensure that the source pandas df is not # modified - yield cudf.from_pandas(azure_ad_logs_pdf) + yield cudf.from_pandas(_azure_ad_logs_pdf) @pytest.mark.use_python