Add integration tests for PyTorch, TGI and TEI DLCs #79

Open · wants to merge 82 commits into base: main

Changes from all commits (82 commits)
a036a98
Add `tests/local` structure
alvarobartt Aug 26, 2024
beed550
Add `tests/local/training/test_trl.py` (WIP)
alvarobartt Aug 26, 2024
2427601
Update `tests/local/training/test_trl.py`
alvarobartt Aug 27, 2024
e18b8d5
Rename `tests/local` to `tests/pytorch`
alvarobartt Aug 27, 2024
698613a
Add `tests/pytorch/inference/test_transformers.py`
alvarobartt Aug 27, 2024
7ce8ec8
Update `test_transformers.py`
alvarobartt Aug 28, 2024
f00b801
Update and rename to `test_huggingface_inference_toolkit.py`
alvarobartt Aug 28, 2024
224cbca
Add `tests/requirements.txt`
alvarobartt Aug 28, 2024
dd0cd1f
Skip `tests/pytorch/training` if `not CUDA_AVAILABLE`
alvarobartt Aug 28, 2024
da1845f
Handle `CUDA_AVAILABLE` in `tests/pytorch/inference`
alvarobartt Aug 28, 2024
d139796
Add `docker` in `tests/requirements.txt`
alvarobartt Aug 28, 2024
3367f91
Remove `volumes` mounted for local testing
alvarobartt Aug 28, 2024
dd96f7a
Add `pytest.init` configuration file
alvarobartt Aug 28, 2024
f87f9d2
Add `.github/actions/pytorch-dlcs-tests.yml`
alvarobartt Aug 28, 2024
926960d
Add `.github/workflows/run-pytorch-dlcs-tests.yml`
alvarobartt Aug 28, 2024
e2712ac
Update `tests/pytorch/training/test_trl.py` (WIP)
alvarobartt Aug 28, 2024
440a353
Fix `tests/pytorch/training/test_trl.py`
alvarobartt Aug 28, 2024
3e3071d
Fix `tests/pytorch/inference/test_huggingface_inference_toolkit.py`
alvarobartt Aug 28, 2024
893d046
Add background log-streaming via `threading`
alvarobartt Aug 28, 2024
e6097d5
Move `stream_logs` to `tests/utils.py`
alvarobartt Aug 28, 2024
b4edbc3
Add `tests/tgi/test_tgi.py` (WIP)
alvarobartt Aug 28, 2024
b8e3b93
Add `transformers` to `tests/requirements.txt`
alvarobartt Aug 28, 2024
d5c4c50
Fix decoding of `container.logs()`
alvarobartt Aug 28, 2024
6ec0dca
Update `tests/tgi/test_tgi.py`
alvarobartt Aug 28, 2024
db72a57
Add `.github/workflows/run-tgi-dlc-tests.yml`
alvarobartt Aug 28, 2024
82e433a
Update `.github/workflows`
alvarobartt Aug 28, 2024
ce31efd
Update `tests/tgi/test_tgi.py`
alvarobartt Aug 28, 2024
09adb69
Fix decoding of `container_logs`
alvarobartt Aug 28, 2024
19ef319
Use relative imports in `tests`
alvarobartt Aug 28, 2024
ef0e437
Add `tests/tei`
alvarobartt Aug 28, 2024
d08a52c
Update runner groups for CPU and GPU instances
alvarobartt Aug 30, 2024
17f9ca4
Update `.github/workflows`
alvarobartt Aug 30, 2024
84834a1
Update `uses` path in `.github/workflows/test-huggingface-dlcs.yml`
alvarobartt Aug 30, 2024
6ec0e1c
Add missing `type` to `inputs`
alvarobartt Aug 30, 2024
05e1e18
Add missing quotes around `python-version`
alvarobartt Aug 30, 2024
02b149e
Update `diffusers` model in `tests`
alvarobartt Aug 30, 2024
640bd04
Update `.github/workflows/test-huggingface-dlcs.yml`
alvarobartt Aug 30, 2024
1797a0d
Upgrade `actions/checkout` and `actions/setup-python`
alvarobartt Sep 1, 2024
91156b4
Use smaller `sentence-transformer` model for TEI tests
alvarobartt Sep 1, 2024
a8b83e4
Fix port-binding of `ports` in `test_tei.py`
alvarobartt Sep 1, 2024
a62c677
Replace `CMD` in `healthcheck` with `/bin/bash`
alvarobartt Sep 1, 2024
61827ea
Add `os.makedirs` before volume mount
alvarobartt Sep 1, 2024
ae11f99
Use `CMD` instead of `/bin/bash` (revert)
alvarobartt Sep 1, 2024
6473e64
Add `detach=True` and then `wait` for container to end
alvarobartt Sep 1, 2024
9438030
Update `test_trl.py`
alvarobartt Sep 1, 2024
e1caeaa
Ensure that `tmp_path` exists and has right permissions
alvarobartt Sep 1, 2024
903e10e
Write empty default file in `tmp_path` (debug)
alvarobartt Sep 1, 2024
8fae6d7
Add `torch` dependency in `requirements.txt`
alvarobartt Sep 1, 2024
292db5d
Add `uv` in `.github/workflows/run-tests-action.yml`
alvarobartt Sep 1, 2024
1edabbc
Set `PATH` before using `uv` after installation
alvarobartt Sep 1, 2024
741a57c
Update `.github/workflows/run-tests-action.yml`
alvarobartt Sep 1, 2024
4cb570c
Update `.github/workflows/run-tests-action.yml`
alvarobartt Sep 1, 2024
5a291af
Remove `torch` dependency and torch-related code
alvarobartt Sep 1, 2024
c089784
Remove wrong `uv sync` (not a Python project)
alvarobartt Sep 1, 2024
89f9c81
Remove `transformers` dependency
alvarobartt Sep 1, 2024
da8b854
Remove `NUM_SHARD` as not required
alvarobartt Sep 1, 2024
56e06d0
Comment `healthcheck` and `platform` (debug)
alvarobartt Sep 1, 2024
bd7e210
Add `transformers` dependency in `tests/requirements.txt` (revert)
alvarobartt Sep 2, 2024
83e2c95
Add `docker` checks for debugging
alvarobartt Sep 2, 2024
fa3b178
Remove `runtime=nvidia` and enable interactive mode (`docker run -it …
alvarobartt Sep 2, 2024
438c9ad
Remove manual mock file creation for debugging
alvarobartt Sep 2, 2024
38abf36
Revert `docker` checks in `run-tests-action.yml`
alvarobartt Sep 2, 2024
4224bc7
Remove `tty` and `stdin_open` interactive mode
alvarobartt Sep 2, 2024
beef705
Update `tmp_path` with `--basetmp` (debug)
alvarobartt Sep 2, 2024
9446a3e
Fix `TGI_DLC` environment variable value
alvarobartt Sep 2, 2024
99d353c
Check `container.status` to prevent extra healtchecks
alvarobartt Sep 2, 2024
c99e0ed
Add `nvidia-ml-py` to set `USE_FLASH_ATTENTION` based on compute cap
alvarobartt Sep 2, 2024
4212a58
Add `jinja2` dependency in `tests/requirements.txt`
alvarobartt Sep 2, 2024
3909567
Update `trigger` in `.github/workflows/test-huggingface-dlcs.yml`
alvarobartt Sep 2, 2024
7c4bf87
Merge branch 'main' into add-integration-tests
alvarobartt Sep 2, 2024
7ce5aeb
Apply suggestions from code review
alvarobartt Sep 2, 2024
349df29
Add missing `tei-dlc` after removing defaults
alvarobartt Sep 2, 2024
eeb711d
Remove `GPUtil` and `nvidia-ml-py` in favour of `subprocess` on `nvid…
alvarobartt Sep 3, 2024
6b55963
Fix integration tests
alvarobartt Sep 3, 2024
35bc4d8
Rename `run-tests-action.yml` to `run-tests-reusable.yml`
alvarobartt Sep 3, 2024
b71a392
Add `options` and update `name` in `run-tests-reusable.yml`
alvarobartt Sep 3, 2024
cb7ddb6
Update `.github/workflows` to be more granular
alvarobartt Sep 9, 2024
d654b94
Set `type: choice` to use `options`
alvarobartt Sep 9, 2024
0fc8ef5
Update name for `test-pytorch-{inference,training}-dlcs.yml`
alvarobartt Sep 9, 2024
34281bb
Fix `.github/workflows/run-tests-reusable.yml`
alvarobartt Sep 9, 2024
4768af1
Add missing `type: ignore`
alvarobartt Sep 9, 2024
9f6dcc0
Update `tei-dlc` on CPU and update port mapping
alvarobartt Sep 9, 2024
63 changes: 63 additions & 0 deletions .github/workflows/run-tests-reusable.yml
@@ -0,0 +1,63 @@
name: Reusable Workflow to Run Hugging Face DLCs Tests

on:
  workflow_call:
    inputs:
      group:
        description: "The GitHub Runners Group to run on."
        required: true
        type: string
      tests-path:
        description: "The path of the tests to run inside `tests`."
        required: true
        type: string
      training-dlc:
        description: "The URI of the Hugging Face PyTorch DLC for Training (GPU only)."
        required: false
        type: string
      inference-dlc:
        description: "The URI of the Hugging Face PyTorch DLC for Inference (CPU and GPU)."
        required: false
        type: string
      tgi-dlc:
        description: "The URI of the Hugging Face TGI DLC (GPU only)."
        required: false
        type: string
      tei-dlc:
        description: "The URI of the Hugging Face TEI DLC (CPU and GPU)."
        required: false
        type: string

jobs:
  run-tests:
    runs-on:
      group: ${{ inputs.group }}

    steps:
      - name: Check out the repository
        uses: actions/[email protected]

      - name: Set up Python
        uses: actions/[email protected]
        with:
          python-version: "3.10"

      - name: Set up uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh
          # Expose `uv` in this step and, via `GITHUB_PATH`, in the later steps too
          export PATH=$HOME/.cargo/bin:$PATH
          echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
          uv --version

      - name: Install dependencies
        run: |
          uv venv --python 3.10
          uv pip install -r tests/requirements.txt

      - name: Run Hugging Face DLC Tests
        run: uv run pytest -s tests/${{ inputs.tests-path }} --basetemp=${{ runner.temp }}
        env:
          TRAINING_DLC: ${{ inputs.training-dlc }}
          INFERENCE_DLC: ${{ inputs.inference-dlc }}
          TGI_DLC: ${{ inputs.tgi-dlc }}
          TEI_DLC: ${{ inputs.tei-dlc }}
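The reusable workflow hands the DLC URIs to the test suite purely through environment variables, so each test module has to guard against a missing URI before spinning up a container. A minimal sketch of such a guard — the `require_env` helper name is hypothetical, not part of this PR, which performs the check inline:

```python
import os


def require_env(var_name: str) -> str:
    """Return the value of a required environment variable.

    Raises RuntimeError when the variable is unset or empty, mirroring the
    guard each test module applies to `TRAINING_DLC`, `INFERENCE_DLC`,
    `TGI_DLC`, and `TEI_DLC` before starting a container.
    """
    value = os.getenv(var_name, "")
    if not value:
        raise RuntimeError(f"{var_name} environment variable is not set")
    return value
```

A shared helper like this in `tests/utils.py` would be one way to deduplicate the per-module checks.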
42 changes: 42 additions & 0 deletions .github/workflows/test-pytorch-inference-dlcs.yml
@@ -0,0 +1,42 @@
name: Test Hugging Face PyTorch DLCs for Inference (CPU and GPU)

on:
  push:
    branches:
      - main
    paths:
      - tests/pytorch/inference/*
      - .github/workflows/run-tests-reusable.yml
      - .github/workflows/test-pytorch-inference-dlcs.yml
  pull_request:
    types:
      - synchronize
      - ready_for_review
    branches:
      - main
    paths:
      - tests/pytorch/inference/*
      - .github/workflows/run-tests-reusable.yml
      - .github/workflows/test-pytorch-inference-dlcs.yml
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  inference-on-cpu:
    name: Test Hugging Face PyTorch DLCs for Inference on CPU
    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
    with:
      group: aws-general-8-plus
      tests-path: pytorch/inference
      inference-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-inference-cpu.2-2.transformers.4-44.ubuntu2204.py311

  inference-on-gpu:
    name: Test Hugging Face PyTorch DLCs for Inference on GPU
    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
    with:
      group: aws-g4dn-2xlarge
      tests-path: pytorch/inference
      inference-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-inference-cu121.2-2.transformers.4-44.ubuntu2204.py311
34 changes: 34 additions & 0 deletions .github/workflows/test-pytorch-training-dlcs.yml
@@ -0,0 +1,34 @@
name: Test Hugging Face PyTorch DLCs for Training (GPU)

on:
  push:
    branches:
      - main
    paths:
      - tests/pytorch/training/*
      - .github/workflows/run-tests-reusable.yml
      - .github/workflows/test-pytorch-training-dlcs.yml
  pull_request:
    types:
      - synchronize
      - ready_for_review
    branches:
      - main
    paths:
      - tests/pytorch/training/*
      - .github/workflows/run-tests-reusable.yml
      - .github/workflows/test-pytorch-training-dlcs.yml
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  training-on-gpu:
    name: Test Hugging Face PyTorch DLCs for Training on GPU
    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
    with:
      group: aws-g4dn-2xlarge
      tests-path: pytorch/training
      training-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-training-cu121.transformers.4-42.ubuntu2204.py310
42 changes: 42 additions & 0 deletions .github/workflows/test-text-embeddings-inference-dlcs.yml
@@ -0,0 +1,42 @@
name: Test Hugging Face DLCs for TEI (CPU and GPU)

on:
  push:
    branches:
      - main
    paths:
      - tests/tei/*
      - .github/workflows/run-tests-reusable.yml
      - .github/workflows/test-text-embeddings-inference-dlcs.yml
  pull_request:
    types:
      - synchronize
      - ready_for_review
    branches:
      - main
    paths:
      - tests/tei/*
      - .github/workflows/run-tests-reusable.yml
      - .github/workflows/test-text-embeddings-inference-dlcs.yml
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  tei-on-cpu:
    name: Test Hugging Face DLCs for TEI on CPU
    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
    with:
      group: aws-general-8-plus
      tests-path: tei
      tei-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-embeddings-inference-cpu.1-4

  tei-on-gpu:
    name: Test Hugging Face DLCs for TEI on GPU
    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
    with:
      group: aws-g4dn-2xlarge
      tests-path: tei
      tei-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-embeddings-inference-cu122.1-4.ubuntu2204
34 changes: 34 additions & 0 deletions .github/workflows/test-text-generation-inference-dlcs.yml
@@ -0,0 +1,34 @@
name: Test Hugging Face DLCs for TGI (GPU)

on:
  push:
    branches:
      - main
    paths:
      - tests/tgi/*
      - .github/workflows/run-tests-reusable.yml
      - .github/workflows/test-text-generation-inference-dlcs.yml
  pull_request:
    types:
      - synchronize
      - ready_for_review
    branches:
      - main
    paths:
      - tests/tgi/*
      - .github/workflows/run-tests-reusable.yml
      - .github/workflows/test-text-generation-inference-dlcs.yml
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  tgi-on-gpu:
    name: Test Hugging Face DLCs for TGI on GPU
    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
    with:
      group: aws-g4dn-2xlarge
      tests-path: tgi
      tgi-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
5 changes: 5 additions & 0 deletions pytest.ini
@@ -0,0 +1,5 @@
[pytest]
log_cli = true
log_cli_level = INFO
log_format = %(asctime)s %(levelname)s %(message)s
log_date_format = %Y-%m-%d %H:%M:%S
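The `log_format` and `log_date_format` values above use the standard `logging` placeholders, so the live log output enabled by `log_cli` can be previewed with a plain `logging.Formatter` — a sketch for illustration only:

```python
import logging

# Same format strings as in `pytest.ini` above
formatter = logging.Formatter(
    fmt="%(asctime)s %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

record = logging.LogRecord(
    name="tests",
    level=logging.INFO,
    pathname=__file__,
    lineno=0,
    msg="Container %s started...",
    args=("a036a98",),
    exc_info=None,
)
# Produces e.g. "2024-09-09 12:00:00 INFO Container a036a98 started..."
print(formatter.format(record))
```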
Empty file added tests/__init__.py
Empty file added tests/pytorch/__init__.py
144 changes: 144 additions & 0 deletions tests/pytorch/inference/test_huggingface_inference_toolkit.py
@@ -0,0 +1,144 @@
import logging
import os
import threading
import time

import docker
import pytest
import requests

from docker.types.containers import DeviceRequest

from ...utils import gpu_available, stream_logs

MAX_RETRIES = 10


# The tests below only cover some combinations of models and tasks, since most of those
# tests are already available within https://github.com/huggingface/huggingface-inference-toolkit
# as `huggingface-inference-toolkit` is the inference engine powering the PyTorch DLCs for Inference
@pytest.mark.parametrize(
    ("hf_model_id", "hf_task", "prediction_payload"),
    [
        (
            "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
            "text-classification",
            {
                "instances": ["I love this product", "I hate this product"],
                "parameters": {"top_k": 2},
            },
        ),
        (
            "BAAI/bge-base-en-v1.5",
            "sentence-embeddings",
            {"instances": ["I love this product"]},
        ),
        (
            "lambdalabs/miniSD-diffusers",
            "text-to-image",
            {
                "instances": ["A cat holding a sign that says hello world"],
                "parameters": {
                    "negative_prompt": "",
                    "num_inference_steps": 2,
                    "guidance_scale": 0.7,
                },
            },
        ),
    ],
)
def test_huggingface_inference_toolkit(
    caplog: pytest.LogCaptureFixture,
    hf_model_id: str,
    hf_task: str,
    prediction_payload: dict,
) -> None:
    caplog.set_level(logging.INFO)

    container_uri = os.getenv("INFERENCE_DLC", None)
    if container_uri is None or container_uri == "":
        pytest.fail("INFERENCE_DLC environment variable is not set")

    client = docker.from_env()

    logging.info(f"Starting container for {hf_model_id}...")
    container = client.containers.run(
        container_uri,
        ports={"8080": 8080},
        environment={
            "HF_MODEL_ID": hf_model_id,
            "HF_TASK": hf_task,
            "AIP_MODE": "PREDICTION",
            "AIP_HTTP_PORT": "8080",
            "AIP_PREDICT_ROUTE": "/predict",
            "AIP_HEALTH_ROUTE": "/health",
        },
        # `docker` expects the `interval`, `timeout` and `start_period` values in nanoseconds
        healthcheck={
            "test": ["CMD", "curl", "-s", "http://localhost:8080/health"],
            "interval": int(30 * 1e9),
            "timeout": int(30 * 1e9),
            "retries": 3,
            "start_period": int(30 * 1e9),
        },
        platform="linux/amd64",
        detach=True,
        # Extra `device_requests` related to the CUDA devices if any
        device_requests=[DeviceRequest(count=-1, capabilities=[["gpu"]])]
        if gpu_available()
        else None,
    )

    # Start log streaming in a separate thread
    log_thread = threading.Thread(target=stream_logs, args=(container,))
    log_thread.daemon = True
    log_thread.start()

    logging.info(f"Container {container.id} started...")  # type: ignore
    container_healthy = False
    for retry in range(MAX_RETRIES):
        # If the container failed to start properly, then the health check will fail
        if container.status == "exited":  # type: ignore
            container_healthy = False
            break

        try:
            logging.info(
                f"Trying to connect to http://localhost:8080/health [retry {retry + 1}/{MAX_RETRIES}]..."
            )
            response = requests.get("http://localhost:8080/health")
            assert response.status_code == 200
            container_healthy = True
            break
        except requests.exceptions.ConnectionError:
            time.sleep(30)

    if not container_healthy:
        logging.error("Container is not healthy after several retries...")
        container.stop()  # type: ignore
        assert container_healthy

    container_failed = False
    try:
        logging.info("Sending prediction request to http://localhost:8080/predict...")
        start_time = time.perf_counter()
        response = requests.post(
            "http://localhost:8080/predict",
            json=prediction_payload,
        )
        end_time = time.perf_counter()
        assert response.status_code in [200, 201]
        assert "predictions" in response.json()
        logging.info(f"Prediction request took {end_time - start_time:.2f}s")
    except Exception as e:
        logging.error(f"Error while sending prediction request with exception: {e}")
        container_failed = True
    finally:
        if log_thread.is_alive():
            log_thread.join(timeout=5)
        logging.info(f"Stopping container {container.id}...")  # type: ignore
        container.stop()  # type: ignore
        container.remove()  # type: ignore

    assert not container_failed
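The test above imports `gpu_available` and `stream_logs` from `tests/utils.py`, which is not shown in this diff chunk. Based on the commit history ("Move `stream_logs` to `tests/utils.py`", "Remove `GPUtil` and `nvidia-ml-py` in favour of `subprocess` on `nvid…`", "Fix decoding of `container.logs()`"), a plausible sketch of those helpers — an assumption, not the PR's actual code — could look like:

```python
import logging
import subprocess


def gpu_available() -> bool:
    """Best-effort GPU detection: succeed iff `nvidia-smi` is present and runs cleanly."""
    try:
        subprocess.run(["nvidia-smi"], check=True, capture_output=True)
        return True
    except (FileNotFoundError, subprocess.CalledProcessError):
        return False


def stream_logs(container) -> None:
    """Follow a container's log stream and forward each decoded line to `logging`."""
    for line in container.logs(stream=True, follow=True):
        logging.info(line.decode("utf-8", errors="ignore").rstrip())
```

Because `stream_logs` blocks while following the stream, the test runs it in a daemon thread, so it terminates along with the test process once the container is stopped.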