diff --git a/.github/workflows/run-tests-reusable.yml b/.github/workflows/run-tests-reusable.yml
new file mode 100644
index 00000000..8dd2ffd4
--- /dev/null
+++ b/.github/workflows/run-tests-reusable.yml
@@ -0,0 +1,62 @@
+name: Reusable Workflow to Run Hugging Face DLCs Tests
+
+on:
+  workflow_call:
+    inputs:
+      group:
+        description: "The GitHub Runners Group to run on."
+        required: true
+        type: string
+      tests-path:
+        description: "The path of the tests to run inside `tests`."
+        required: true
+        type: string
+      training-dlc:
+        description: "The URI of the Hugging Face PyTorch DLC for Training (GPU only)."
+        required: false
+        type: string
+      inference-dlc:
+        description: "The URI of the Hugging Face PyTorch DLC for Inference (CPU and GPU)."
+        required: false
+        type: string
+      tgi-dlc:
+        description: "The URI of the Hugging Face TGI DLC (GPU only)."
+        required: false
+        type: string
+      tei-dlc:
+        description: "The URI of the Hugging Face TEI DLC (CPU and GPU)."
+        required: false
+        type: string
+
+jobs:
+  run-tests:
+    runs-on:
+      group: ${{ inputs.group }}
+
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v4.1.7
+
+      - name: Set up Python
+        uses: actions/setup-python@v5.2.0
+        with:
+          python-version: "3.10"
+
+      - name: Set up uv
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          export PATH=$HOME/.cargo/bin:$PATH
+          uv --version
+
+      - name: Install dependencies
+        run: |
+          uv venv --python 3.10
+          uv pip install -r tests/requirements.txt
+
+      - name: Run Hugging Face DLC Tests
+        run: uv run pytest -s tests/${{ inputs.tests-path }} --basetemp=${{ runner.temp }}
+        env:
+          TRAINING_DLC: ${{ inputs.training-dlc }}
+          INFERENCE_DLC: ${{ inputs.inference-dlc }}
+          TGI_DLC: ${{ inputs.tgi-dlc }}
+          TEI_DLC: ${{ inputs.tei-dlc }}
diff --git a/.github/workflows/test-pytorch-inference-dlcs.yml b/.github/workflows/test-pytorch-inference-dlcs.yml
new file mode 100644
index 00000000..366619e2
--- /dev/null
+++ b/.github/workflows/test-pytorch-inference-dlcs.yml
@@ -0,0 +1,42 @@
+name: Test Hugging Face PyTorch DLCs for Inference (CPU and GPU)
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - tests/pytorch/inference/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-pytorch-inference-dlcs.yml
+  pull_request:
+    types:
+      - synchronize
+      - ready_for_review
+    branches:
+      - main
+    paths:
+      - tests/pytorch/inference/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-pytorch-inference-dlcs.yml
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  inference-on-cpu:
+    name: Test Hugging Face PyTorch DLCs for Inference on CPU
+    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
+    with:
+      group: aws-general-8-plus
+      tests-path: pytorch/inference
+      inference-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-inference-cpu.2-2.transformers.4-44.ubuntu2204.py311
+
+  inference-on-gpu:
+    name: Test Hugging Face PyTorch DLCs for Inference on GPU
+    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
+    with:
+      group: aws-g4dn-2xlarge
+      tests-path: pytorch/inference
+      inference-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-inference-cu121.2-2.transformers.4-44.ubuntu2204.py311
diff --git a/.github/workflows/test-pytorch-training-dlcs.yml b/.github/workflows/test-pytorch-training-dlcs.yml
new file mode 100644
index 00000000..961cf147
--- /dev/null
+++ b/.github/workflows/test-pytorch-training-dlcs.yml
@@ -0,0 +1,34 @@
+name: Test Hugging Face PyTorch DLCs for Training (GPU)
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - tests/pytorch/training/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-pytorch-training-dlcs.yml
+  pull_request:
+    types:
+      - synchronize
+      - ready_for_review
+    branches:
+      - main
+    paths:
+      - tests/pytorch/training/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-pytorch-training-dlcs.yml
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  training-on-gpu:
+    name: Test Hugging Face PyTorch DLCs for Training on GPU
+    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
+    with:
+      group: aws-g4dn-2xlarge
+      tests-path: pytorch/training
+      training-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-training-cu121.transformers.4-42.ubuntu2204.py310
diff --git a/.github/workflows/test-text-embeddings-inference-dlcs.yml b/.github/workflows/test-text-embeddings-inference-dlcs.yml
new file mode 100644
index 00000000..d6bdd790
--- /dev/null
+++ b/.github/workflows/test-text-embeddings-inference-dlcs.yml
@@ -0,0 +1,42 @@
+name: Test Hugging Face DLCs for TEI (CPU and GPU)
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - tests/tei/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-text-embeddings-inference-dlcs.yml
+  pull_request:
+    types:
+      - synchronize
+      - ready_for_review
+    branches:
+      - main
+    paths:
+      - tests/tei/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-text-embeddings-inference-dlcs.yml
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  tei-on-cpu:
+    name: Test Hugging Face DLCs for TEI on CPU
+    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
+    with:
+      group: aws-general-8-plus
+      tests-path: tei
+      tei-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-embeddings-inference-cpu.1-4
+
+  tei-on-gpu:
+    name: Test Hugging Face DLCs for TEI on GPU
+    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
+    with:
+      group: aws-g4dn-2xlarge
+      tests-path: tei
+      tei-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-embeddings-inference-cu122.1-4.ubuntu2204
diff --git a/.github/workflows/test-text-generation-inference-dlcs.yml b/.github/workflows/test-text-generation-inference-dlcs.yml
new file mode 100644
index 00000000..2d77aefb
--- /dev/null
+++ b/.github/workflows/test-text-generation-inference-dlcs.yml
@@ -0,0 +1,34 @@
+name: Test Hugging Face DLCs for TGI (GPU)
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - tests/tgi/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-text-generation-inference-dlcs.yml
+  pull_request:
+    types:
+      - synchronize
+      - ready_for_review
+    branches:
+      - main
+    paths:
+      - tests/tgi/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-text-generation-inference-dlcs.yml
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  tgi-on-gpu:
+    name: Test Hugging Face DLCs for TGI on GPU
+    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
+    with:
+      group: aws-g4dn-2xlarge
+      tests-path: tgi
+      tgi-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..31a7b732
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
+log_cli = true
+log_cli_level = INFO
+log_format = %(asctime)s %(levelname)s %(message)s
+log_date_format = %Y-%m-%d %H:%M:%S
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/pytorch/__init__.py b/tests/pytorch/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/pytorch/inference/__init__.py b/tests/pytorch/inference/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/pytorch/inference/test_huggingface_inference_toolkit.py b/tests/pytorch/inference/test_huggingface_inference_toolkit.py
new file mode 100644
index 00000000..6145ac0c
--- /dev/null
+++ b/tests/pytorch/inference/test_huggingface_inference_toolkit.py
@@ -0,0 +1,144 @@
+import logging
+import os
+import threading
+import time
+
+import docker
+import pytest
+import requests
+
+from docker.types.containers import DeviceRequest
+
+from ...utils import gpu_available, stream_logs
+
+MAX_RETRIES = 10
+
+
+# The tests below only cover a subset of models and tasks, since most of the cases
+# are already tested within https://github.com/huggingface/huggingface-inference-toolkit,
+# as `huggingface-inference-toolkit` is the inference engine powering the PyTorch DLCs for Inference
+@pytest.mark.parametrize(
+    ("hf_model_id", "hf_task", "prediction_payload"),
+    [
+        (
+            "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
+            "text-classification",
+            {
+                "instances": ["I love this product", "I hate this product"],
+                "parameters": {"top_k": 2},
+            },
+        ),
+        (
+            "BAAI/bge-base-en-v1.5",
+            "sentence-embeddings",
+            {"instances": ["I love this product"]},
+        ),
+        (
+            "lambdalabs/miniSD-diffusers",
+            "text-to-image",
+            {
+                "instances": ["A cat holding a sign that says hello world"],
+                "parameters": {
+                    "negative_prompt": "",
+                    "num_inference_steps": 2,
+                    "guidance_scale": 0.7,
+                },
+            },
+        ),
+    ],
+)
+def test_huggingface_inference_toolkit(
+    caplog: pytest.LogCaptureFixture,
+    hf_model_id: str,
+    hf_task: str,
+    prediction_payload: dict,
+) -> None:
+    caplog.set_level(logging.INFO)
+
+    container_uri = os.getenv("INFERENCE_DLC", None)
+    if container_uri is None or container_uri == "":
+        assert False, "INFERENCE_DLC environment variable is not set"
+
+    client = docker.from_env()
+
+    logging.info(f"Starting container for {hf_model_id}...")
+    container = client.containers.run(
+        container_uri,
+        ports={"8080": 8080},
+        environment={
+            "HF_MODEL_ID": hf_model_id,
+            "HF_TASK": hf_task,
+            "AIP_MODE": "PREDICTION",
+            "AIP_HTTP_PORT": "8080",
+            "AIP_PREDICT_ROUTE": "/predict",
+            "AIP_HEALTH_ROUTE": "/health",
+        },
+        healthcheck={
+            "test": ["CMD", "curl", "-s", "http://localhost:8080/health"],
+            "interval": int(30 * 1e9),
+            "timeout": int(30 * 1e9),
+            "retries": 3,
+            "start_period": int(30 * 1e9),
+        },
+        platform="linux/amd64",
+        detach=True,
+        # Extra `device_requests` related to the CUDA devices if any
+        device_requests=[DeviceRequest(count=-1, capabilities=[["gpu"]])]
+        if gpu_available()
+        else None,
+    )
+
+    # Start log streaming in a separate thread
+    log_thread = threading.Thread(target=stream_logs, args=(container,))
+    log_thread.daemon = True
+    log_thread.start()
+
+    logging.info(f"Container {container.id} started...")  # type: ignore
+    container_healthy = False
+    for _ in range(MAX_RETRIES):
+        # If the container failed to start properly, then the health check will fail
+        if container.status == "exited":  # type: ignore
+            container_healthy = False
+            break
+
+        try:
+            logging.info(
+                f"Trying to connect to http://localhost:8080/health [retry {_ + 1}/{MAX_RETRIES}]..."
+            )
+            response = requests.get("http://localhost:8080/health")
+            assert response.status_code == 200
+            container_healthy = True
+            break
+        except requests.exceptions.ConnectionError:
+            time.sleep(30)
+
+    if not container_healthy:
+        logging.error("Container is not healthy after several retries...")
+        container.stop()  # type: ignore
+    assert container_healthy
+
+    container_failed = False
+    try:
+        logging.info("Sending prediction request to http://localhost:8080/predict...")
+        start_time = time.perf_counter()
+        response = requests.post(
+            "http://localhost:8080/predict",
+            json=prediction_payload,
+        )
+        end_time = time.perf_counter()
+        assert response.status_code in [200, 201]
+        assert "predictions" in response.json()
+        logging.info(f"Prediction request took {end_time - start_time:.2f}s")
+    except Exception as e:
+        logging.error(
+            f"Error while sending prediction request with exception: {e}"  # type: ignore
+        )
+        container_failed = True
+    finally:
+        if log_thread.is_alive():
+            log_thread.join(timeout=5)
+        logging.info(f"Stopping container {container.id}...")  # type: ignore
+        container.stop()  # type: ignore
+        container.remove()  # type: ignore
+
+    assert not container_failed
diff --git a/tests/pytorch/training/__init__.py b/tests/pytorch/training/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/pytorch/training/test_trl.py b/tests/pytorch/training/test_trl.py
new file mode 100644
index 00000000..8268e728
--- /dev/null
+++ b/tests/pytorch/training/test_trl.py
@@ -0,0 +1,141 @@
+import logging
+import os
+import pytest
+import threading
+
+import docker
+from docker.types.containers import DeviceRequest
+from pathlib import PosixPath
+
+from ...utils import gpu_available, stream_logs
+
+
+MODEL_ID = "sshleifer/tiny-gpt2"
+
+
+@pytest.mark.skipif(not gpu_available(), reason="CUDA is not available")
+def test_trl(caplog: pytest.LogCaptureFixture, tmp_path: PosixPath) -> None:
+    """Adapted from https://github.com/huggingface/trl/blob/main/examples/scripts/sft.py"""
+    caplog.set_level(logging.INFO)
+
+    container_uri = os.getenv("TRAINING_DLC", None)
+    if container_uri is None or container_uri == "":
+        assert False, "TRAINING_DLC environment variable is not set"
+
+    client = docker.from_env()
+
+    logging.info("Running the container for TRL...")
+    container = client.containers.run(
+        container_uri,
+        command=[
+            "trl",
+            "sft",
+            f"--model_name_or_path={MODEL_ID}",
+            "--dataset_text_field=text",
+            "--report_to=none",
+            "--learning_rate=1e-5",
+            "--per_device_train_batch_size=8",
+            "--gradient_accumulation_steps=1",
+            "--output_dir=/opt/huggingface/trained_model",
+            "--logging_steps=1",
+            "--max_steps=10",
+            "--gradient_checkpointing",
+        ],
+        environment={
+            "TRL_USE_RICH": "0",
+            "ACCELERATE_LOG_LEVEL": "INFO",
+            "TRANSFORMERS_LOG_LEVEL": "INFO",
+            "TQDM_POSITION": "-1",
+        },
+        platform="linux/amd64",
+        detach=True,
+        # Mount `tmp_path` into the container at `/opt/huggingface/trained_model`
+        volumes={
+            tmp_path: {
+                "bind": "/opt/huggingface/trained_model",
+                "mode": "rw",
+            }
+        },
+        # Extra `device_requests` related to the CUDA devices
+        device_requests=[DeviceRequest(count=-1, capabilities=[["gpu"]])],
+    )
+
+    # Start log streaming in a separate thread
+    log_thread = threading.Thread(target=stream_logs, args=(container,))
+    log_thread.daemon = True
+    log_thread.start()
+
+    # Wait for the container to finish
+    container.wait()  # type: ignore
+
+    # Remove the container
+    container.remove()  # type: ignore
+
+    assert tmp_path.exists()
+    assert (tmp_path / "model.safetensors").exists()
+
+
+@pytest.mark.skipif(not gpu_available(), reason="CUDA is not available")
+def test_trl_peft(caplog: pytest.LogCaptureFixture, tmp_path: PosixPath) -> None:
+    """Adapted from https://github.com/huggingface/trl/blob/main/examples/scripts/sft.py"""
+    caplog.set_level(logging.INFO)
+
+    container_uri = os.getenv("TRAINING_DLC", None)
+    if container_uri is None or container_uri == "":
+        assert False, "TRAINING_DLC environment variable is not set"
+
+    client = docker.from_env()
+
+    logging.info("Running the container for TRL...")
+    container = client.containers.run(
+        container_uri,
+        command=[
+            "trl",
+            "sft",
+            f"--model_name_or_path={MODEL_ID}",
+            "--dataset_text_field=text",
+            "--report_to=none",
+            "--learning_rate=1e-5",
+            "--per_device_train_batch_size=8",
+            "--gradient_accumulation_steps=1",
+            "--output_dir=/opt/huggingface/trained_model",
+            "--logging_steps=1",
+            "--max_steps=10",
+            "--gradient_checkpointing",
+            "--use_peft",
+            "--lora_r=64",
+            "--lora_alpha=16",
+        ],
+        environment={
+            "TRL_USE_RICH": "0",
+            "ACCELERATE_LOG_LEVEL": "INFO",
+            "TRANSFORMERS_LOG_LEVEL": "INFO",
+            "TQDM_POSITION": "-1",
+        },
+        platform="linux/amd64",
+        detach=True,
+        # Mount `tmp_path` into the container at `/opt/huggingface/trained_model`
+        volumes={
+            tmp_path: {
+                "bind": "/opt/huggingface/trained_model",
+                "mode": "rw",
+            }
+        },
+        # Extra `device_requests` related to the CUDA devices
+        device_requests=[DeviceRequest(count=-1, capabilities=[["gpu"]])],
+    )
+
+    # Start log streaming in a separate thread
+    log_thread = threading.Thread(target=stream_logs, args=(container,))
+    log_thread.daemon = True
+    log_thread.start()
+
+    # Wait for the container to finish
+    container.wait()  # type: ignore
+
+    # Remove the container
+    container.remove()  # type: ignore
+
+    assert tmp_path.exists()
+    assert (tmp_path / "adapter_config.json").exists()
+    assert (tmp_path / "adapter_model.safetensors").exists()
diff --git a/tests/requirements.txt b/tests/requirements.txt
new file mode 100644
index 00000000..089ca7e9
--- /dev/null
+++ b/tests/requirements.txt
@@ -0,0 +1,2 @@
+docker==7.1.0
+pytest==8.3.2
diff --git a/tests/tei/__init__.py b/tests/tei/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/tei/test_tei.py b/tests/tei/test_tei.py
new file mode 100644
index 00000000..5edeeb47
--- /dev/null
+++ b/tests/tei/test_tei.py
@@ -0,0 +1,135 @@
+import logging
+import os
+import threading
+import time
+
+import docker
+import pytest
+import requests
+
+from docker.types.containers import DeviceRequest
+
+from ..utils import gpu_available, stream_logs
+
+MAX_RETRIES = 10
+
+
+@pytest.mark.parametrize(
+    "text_embeddings_router_kwargs",
+    [
+        {
+            "MODEL_ID": "sentence-transformers/all-MiniLM-L6-v2",
+        },
+        {
+            "MODEL_ID": "sentence-transformers/all-MiniLM-L6-v2",
+            "AIP_MODE": "PREDICTION",
+        },
+    ],
+)
+def test_text_embeddings_inference(
+    caplog: pytest.LogCaptureFixture,
+    text_embeddings_router_kwargs: dict,
+) -> None:
+    caplog.set_level(logging.INFO)
+
+    container_uri = os.getenv("TEI_DLC", None)
+    if container_uri is None or container_uri == "":
+        assert False, "TEI_DLC environment variable is not set"
+
+    client = docker.from_env()
+
+    logging.info(
+        f"Starting container for {text_embeddings_router_kwargs.get('MODEL_ID', None)}..."
+    )
+    container = client.containers.run(
+        container_uri,
+        ports={8080: 8080},  # type: ignore
+        environment=text_embeddings_router_kwargs,
+        healthcheck={
+            "test": ["CMD", "curl", "-s", "http://localhost:8080/health"],
+            "interval": int(30 * 1e9),
+            "timeout": int(30 * 1e9),
+            "retries": 3,
+            "start_period": int(30 * 1e9),
+        },
+        platform="linux/amd64",
+        detach=True,
+        # Extra `device_requests` related to the CUDA devices if any
+        device_requests=[DeviceRequest(count=-1, capabilities=[["gpu"]])]
+        if gpu_available()
+        else None,
+    )
+    logging.info(f"Container {container.id} started...")  # type: ignore
+
+    # Start log streaming in a separate thread
+    log_thread = threading.Thread(target=stream_logs, args=(container,))
+    log_thread.daemon = True
+    log_thread.start()
+
+    # Get endpoint names for both health and predict (may differ if AIP env vars are defined)
+    health_route = os.getenv("AIP_HEALTH_ROUTE", "/health")
+    predict_route = (
+        os.getenv("AIP_PREDICT_ROUTE", "/predict")
+        if os.getenv("AIP_MODE")
+        else "/embed"
+    )
+
+    container_healthy = False
+    for _ in range(MAX_RETRIES):
+        # If the container failed to start properly, then the health check will fail
+        if container.status == "exited":  # type: ignore
+            container_healthy = False
+            break
+
+        try:
+            logging.info(
+                f"Trying to connect to http://localhost:8080{health_route} [retry {_ + 1}/{MAX_RETRIES}]..."
+            )
+            response = requests.get(f"http://localhost:8080{health_route}")
+            assert response.status_code == 200
+            container_healthy = True
+            break
+        except requests.exceptions.ConnectionError:
+            time.sleep(30)
+
+    if not container_healthy:
+        logging.error("Container is not healthy after several retries...")
+        container.stop()  # type: ignore
+
+    assert container_healthy
+
+    container_failed = False
+    try:
+        logging.info(
+            f"Sending prediction request to http://localhost:8080{predict_route}..."
+        )
+        payload = {"inputs": "What's Deep Learning?"}
+
+        if os.getenv("AIP_MODE"):
+            payload = {"instances": [payload]}
+
+        start_time = time.perf_counter()
+        response = requests.post(
+            f"http://localhost:8080{predict_route}",
+            json=payload,
+        )
+        end_time = time.perf_counter()
+
+        assert response.status_code in [200, 201]
+        assert response.json() is not None
+
+        logging.info(f"Prediction request took {end_time - start_time:.2f}s")
+    except Exception as e:
+        logging.error(
+            f"Error while sending prediction request with exception: {e}"  # type: ignore
+        )
+        container_failed = True
+    finally:
+        if log_thread.is_alive():
+            log_thread.join(timeout=5)
+
+        logging.info(f"Stopping container {container.id}...")  # type: ignore
+        container.stop()  # type: ignore
+        container.remove()  # type: ignore
+
+    assert not container_failed
diff --git a/tests/tgi/__init__.py b/tests/tgi/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/tgi/test_tgi.py b/tests/tgi/test_tgi.py
new file mode 100644
index 00000000..c50deb86
--- /dev/null
+++ b/tests/tgi/test_tgi.py
@@ -0,0 +1,155 @@
+import logging
+import os
+import threading
+import time
+
+import docker
+import pytest
+import requests
+
+from docker.types.containers import DeviceRequest
+
+from ..utils import gpu_available, stream_logs, supports_flash_attention
+
+MAX_RETRIES = 10
+
+
+@pytest.mark.skipif(not gpu_available(), reason="CUDA is not available")
+@pytest.mark.parametrize(
+    "text_generation_launcher_kwargs",
+    [
+        {
+            "MODEL_ID": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+            "MAX_INPUT_TOKENS": "512",
+            "MAX_TOTAL_TOKENS": "1024",
+            "MAX_BATCH_PREFILL_TOKENS": "1512",
+        },
+        {
+            "MODEL_ID": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+            "MAX_INPUT_TOKENS": "512",
+            "MAX_TOTAL_TOKENS": "1024",
+            "MAX_BATCH_PREFILL_TOKENS": "1512",
+            "AIP_MODE": "PREDICTION",
+        },
+    ],
+)
+def test_text_generation_inference(
+    caplog: pytest.LogCaptureFixture,
+    text_generation_launcher_kwargs: dict,
+) -> None:
+    caplog.set_level(logging.INFO)
+
+    container_uri = os.getenv("TGI_DLC", None)
+    if container_uri is None or container_uri == "":
+        assert False, "TGI_DLC environment variable is not set"
+
+    client = docker.from_env()
+
+    # If the GPU doesn't support Flash Attention, then set `USE_FLASH_ATTENTION=false`
+    if not supports_flash_attention():
+        text_generation_launcher_kwargs["USE_FLASH_ATTENTION"] = "false"
+
+    logging.info(
+        f"Starting container for {text_generation_launcher_kwargs.get('MODEL_ID', None)}..."
+    )
+    container = client.containers.run(
+        container_uri,
+        ports={8080: 8080},  # type: ignore
+        environment=text_generation_launcher_kwargs,
+        healthcheck={
+            "test": ["CMD", "curl", "-s", "http://localhost:8080/health"],
+            "interval": int(30 * 1e9),
+            "timeout": int(30 * 1e9),
+            "retries": 3,
+            "start_period": int(30 * 1e9),
+        },
+        platform="linux/amd64",
+        detach=True,
+        # Extra `device_requests` related to the CUDA devices
+        device_requests=[DeviceRequest(count=-1, capabilities=[["gpu"]])],
+    )
+    logging.info(f"Container {container.id} started...")  # type: ignore
+
+    # Start log streaming in a separate thread
+    log_thread = threading.Thread(target=stream_logs, args=(container,))
+    log_thread.daemon = True
+    log_thread.start()
+
+    # Get endpoint names for both health and predict (may differ if AIP env vars are defined)
+    health_route = os.getenv("AIP_HEALTH_ROUTE", "/health")
+    predict_route = (
+        os.getenv("AIP_PREDICT_ROUTE", "/predict")
+        if os.getenv("AIP_MODE")
+        else "/generate"
+    )
+
+    container_healthy = False
+    for _ in range(MAX_RETRIES):
+        # If the container failed to start properly, then the health check will fail
+        if container.status == "exited":  # type: ignore
+            container_healthy = False
+            break
+
+        try:
+            logging.info(
+                f"Trying to connect to http://localhost:8080{health_route} [retry {_ + 1}/{MAX_RETRIES}]..."
+            )
+            response = requests.get(f"http://localhost:8080{health_route}")
+            assert response.status_code == 200
+            container_healthy = True
+            break
+        except requests.exceptions.ConnectionError:
+            time.sleep(30)
+
+    if not container_healthy:
+        logging.error("Container is not healthy after several retries...")
+        container.stop()  # type: ignore
+
+    assert container_healthy
+
+    container_failed = False
+    try:
+        for prompt in ["What's Deep Learning?", "What's the capital of France?"]:
+            logging.info(
+                f"Sending prediction request for {prompt=} to http://localhost:8080{predict_route}..."
+            )
+            payload = {
+                "inputs": prompt,
+                "parameters": {
+                    "max_new_tokens": 256,
+                    "do_sample": True,
+                    "top_p": 0.95,
+                    "temperature": 1.0,
+                },
+            }
+
+            if os.getenv("AIP_MODE"):
+                payload = {"instances": [payload]}
+
+            start_time = time.perf_counter()
+            response = requests.post(
+                f"http://localhost:8080{predict_route}",
+                json=payload,
+            )
+            end_time = time.perf_counter()
+
+            assert response.status_code in [200, 201]
+            assert "generated_text" in response.json()
+
+            logging.info(
+                f"Prediction request for {prompt=} took {end_time - start_time:.2f}s"
+            )
+    except Exception as e:
+        logging.error(
+            f"Error while sending prediction request with exception: {e}"  # type: ignore
+        )
+        container_failed = True
+    finally:
+        if log_thread.is_alive():
+            log_thread.join(timeout=5)
+
+        logging.info(f"Stopping container {container.id}...")  # type: ignore
+        container.stop()  # type: ignore
+        container.remove()  # type: ignore
+
+    assert not container_failed
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 00000000..b4814029
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,30 @@
+import logging
+import subprocess
+
+from docker.models.containers import Container
+
+
+def stream_logs(container: Container) -> None:
+    """Streams the logs generated by `containers.run` via the Docker SDK for Python."""
+    for line in container.logs(stream=True, follow=True):
+        logging.info(line.decode("utf-8", errors="ignore").strip())
+
+
+def gpu_available() -> bool:
+    """Returns whether the current environment has a GPU available."""
+    try:
+        result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
+        return result.returncode == 0
+    except FileNotFoundError:
+        return False
+
+
+def supports_flash_attention() -> bool:
+    """Returns whether the current GPU supports Flash Attention or not (based on compute capability)."""
+    output = subprocess.run(
+        ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader,nounits"],
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+    return float(output.stdout.strip()) >= 8.0
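Not part of the patch itself, but as a reference for local debugging: the reusable workflow's test step reduces to installing the test requirements with `uv` and running `pytest` with the relevant DLC URI exported. A minimal sketch, assuming Docker and `uv` are available locally, reusing the CPU inference DLC URI pinned in `test-pytorch-inference-dlcs.yml`, and substituting a temporary directory for `${{ runner.temp }}`:

```bash
# Create the virtual environment and install the pinned test dependencies
uv venv --python 3.10
uv pip install -r tests/requirements.txt

# Run the PyTorch Inference DLC tests against the CPU image
# (INFERENCE_DLC is the environment variable the tests read; the URI comes from the workflow above)
INFERENCE_DLC="us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-inference-cpu.2-2.transformers.4-44.ubuntu2204.py311" \
  uv run pytest -s tests/pytorch/inference --basetemp="$(mktemp -d)"
```

The same pattern applies to the other suites by swapping the environment variable (`TRAINING_DLC`, `TGI_DLC`, `TEI_DLC`) and the tests path (`tests/pytorch/training`, `tests/tgi`, `tests/tei`), keeping in mind that the training and TGI suites skip themselves unless a CUDA GPU is available.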