From 1c532c4cc0a846fa964dbaf1452a065bd99a66dc Mon Sep 17 00:00:00 2001 From: Farhad Ramezanghorbani Date: Tue, 17 Dec 2024 10:15:02 -0700 Subject: [PATCH] Drop subprocess test and test the core logic instead (#540) --- .../bionemo/esm2/scripts/test_infer_esm2.py | 50 +++++-------------- 1 file changed, 12 insertions(+), 38 deletions(-) diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_infer_esm2.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_infer_esm2.py index 39729e5bc2..aac0ed617d 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_infer_esm2.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_infer_esm2.py @@ -14,16 +14,11 @@ # limitations under the License. import glob -import os -import shlex -import subprocess -from pathlib import Path from typing import get_args import pandas as pd import pytest import torch -from lightning.fabric.plugins.environments.lightning import find_free_network_port from torch.utils.data import DataLoader from bionemo.core.data.load import load @@ -147,7 +142,12 @@ def test_esm2_fine_tune_data_module_val_dataloader(data_module): @pytest.mark.parametrize("prediction_interval", get_args(IntervalT)) @pytest.mark.skipif(check_gpu_memory(30), reason="Skipping test due to insufficient GPU memory") def test_infer_runs( - tmpdir, dummy_protein_csv, dummy_protein_sequences, precision, padded_tokenized_sequences, prediction_interval + tmpdir, + dummy_protein_csv, + dummy_protein_sequences, + precision, + prediction_interval, + padded_tokenized_sequences, ): data_path = dummy_protein_csv result_dir = tmpdir / "results" @@ -188,35 +188,9 @@ def test_infer_runs( # token_logits are [sequence, batch, num_tokens] assert results["token_logits"].shape[:-1] == (min_seq_len, len(dummy_protein_sequences)) - -@pytest.mark.skipif(check_gpu_memory(40), reason="Skipping test due to insufficient GPU memory") -@pytest.mark.parametrize("checkpoint_path", [esm2_3b_checkpoint_path, 
esm2_650m_checkpoint_path]) -def test_infer_cli(tmpdir, dummy_protein_csv, checkpoint_path): - # Clear the GPU cache before starting the test - torch.cuda.empty_cache() - - result_dir = Path(tmpdir.mkdir("results")) - results_path = result_dir / "esm2_infer_results.pt" - open_port = find_free_network_port() - env = dict(**os.environ) - env["MASTER_PORT"] = str(open_port) - - cmd_str = f"""infer_esm2 \ - --checkpoint-path {checkpoint_path} \ - --data-path {dummy_protein_csv} \ - --results-path {results_path} \ - --precision bf16-mixed \ - --include-hiddens \ - --include-embeddings \ - --include-logits \ - --include-input-ids - """.strip() - - cmd = shlex.split(cmd_str) - result = subprocess.run( - cmd, - cwd=tmpdir, - env=env, - capture_output=True, - ) - assert result.returncode == 0, f"Failed with: {cmd_str}" + # test 1:1 mapping between input sequence and results + # this does not apply to "batch" prediction_interval mode since the order of batches may not be consistent + # due to distributed processing. To address this, we optionally include input_ids in the predictions, allowing + # for accurate mapping post-inference. + if prediction_interval == "epoch": + assert torch.equal(padded_tokenized_sequences, results["input_ids"])