diff --git a/.github/workflows/run-tests-reusable.yml b/.github/workflows/run-tests-reusable.yml
new file mode 100644
index 00000000..8dd2ffd4
--- /dev/null
+++ b/.github/workflows/run-tests-reusable.yml
@@ -0,0 +1,62 @@
+name: Reusable Workflow to Run Hugging Face DLCs Tests
+
+on:
+  workflow_call:
+    inputs:
+      group:
+        description: "The GitHub Runners Group to run on."
+        required: true
+        type: string
+      tests-path:
+        description: "The path of the tests to run inside `tests`."
+        required: true
+        type: string
+      training-dlc:
+        description: "The URI of the Hugging Face PyTorch DLC for Training (GPU only)."
+        required: false
+        type: string
+      inference-dlc:
+        description: "The URI of the Hugging Face PyTorch DLC for Inference (CPU and GPU)."
+        required: false
+        type: string
+      tgi-dlc:
+        description: "The URI of the Hugging Face TGI DLC (GPU only)."
+        required: false
+        type: string
+      tei-dlc:
+        description: "The URI of the Hugging Face TEI DLC (CPU and GPU)."
+        required: false
+        type: string
+
+jobs:
+  run-tests:
+    runs-on:
+      group: ${{ inputs.group }}
+
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v4.1.7
+
+      - name: Set up Python
+        uses: actions/setup-python@v5.2.0
+        with:
+          python-version: "3.10"
+
+      - name: Set up uv
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          export PATH=$HOME/.cargo/bin:$PATH
+          uv --version
+
+      - name: Install dependencies
+        run: |
+          uv venv --python 3.10
+          uv pip install -r tests/requirements.txt
+
+      - name: Run Hugging Face DLC Tests
+        run: uv run pytest -s tests/${{ inputs.tests-path }} --basetemp=${{ runner.temp }}
+        env:
+          TRAINING_DLC: ${{ inputs.training-dlc }}
+          INFERENCE_DLC: ${{ inputs.inference-dlc }}
+          TGI_DLC: ${{ inputs.tgi-dlc }}
+          TEI_DLC: ${{ inputs.tei-dlc }}
diff --git a/.github/workflows/test-pytorch-inference-dlcs.yml b/.github/workflows/test-pytorch-inference-dlcs.yml
new file mode 100644
index 00000000..366619e2
--- /dev/null
+++ b/.github/workflows/test-pytorch-inference-dlcs.yml
@@ -0,0 +1,42 @@
+name: Test Hugging Face PyTorch DLCs for Inference (CPU and GPU)
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - tests/pytorch/inference/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-pytorch-inference-dlcs.yml
+  pull_request:
+    types:
+      - synchronize
+      - ready_for_review
+    branches:
+      - main
+    paths:
+      - tests/pytorch/inference/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-pytorch-inference-dlcs.yml
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  inference-on-cpu:
+    name: Test Hugging Face PyTorch DLCs for Inference on CPU
+    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
+    with:
+      group: aws-general-8-plus
+      tests-path: pytorch/inference
+      inference-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-inference-cpu.2-2.transformers.4-44.ubuntu2204.py311
+
+  inference-on-gpu:
+    name: Test Hugging Face PyTorch DLCs for Inference on GPU
+    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
+    with:
+      group: aws-g4dn-2xlarge
+      tests-path: pytorch/inference
+      inference-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-inference-cu121.2-2.transformers.4-44.ubuntu2204.py311
diff --git a/.github/workflows/test-pytorch-training-dlcs.yml b/.github/workflows/test-pytorch-training-dlcs.yml
new file mode 100644
index 00000000..961cf147
--- /dev/null
+++ b/.github/workflows/test-pytorch-training-dlcs.yml
@@ -0,0 +1,34 @@
+name: Test Hugging Face PyTorch DLCs for Training (GPU)
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - tests/pytorch/training/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-pytorch-training-dlcs.yml
+  pull_request:
+    types:
+      - synchronize
+      - ready_for_review
+    branches:
+      - main
+    paths:
+      - tests/pytorch/training/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-pytorch-training-dlcs.yml
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  training-on-gpu:
+    name: Test Hugging Face PyTorch DLCs for Training on GPU
+    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
+    with:
+      group: aws-g4dn-2xlarge
+      tests-path: pytorch/training
+      training-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-training-cu121.transformers.4-42.ubuntu2204.py310
diff --git a/.github/workflows/test-text-embeddings-inference-dlcs.yml b/.github/workflows/test-text-embeddings-inference-dlcs.yml
new file mode 100644
index 00000000..d6bdd790
--- /dev/null
+++ b/.github/workflows/test-text-embeddings-inference-dlcs.yml
@@ -0,0 +1,42 @@
+name: Test Hugging Face DLCs for TEI (CPU and GPU)
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - tests/tei/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-text-embeddings-inference-dlcs.yml
+  pull_request:
+    types:
+      - synchronize
+      - ready_for_review
+    branches:
+      - main
+    paths:
+      - tests/tei/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-text-embeddings-inference-dlcs.yml
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  tei-on-cpu:
+    name: Test Hugging Face DLCs for TEI on CPU
+    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
+    with:
+      group: aws-general-8-plus
+      tests-path: tei
+      tei-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-embeddings-inference-cpu.1-4
+
+  tei-on-gpu:
+    name: Test Hugging Face DLCs for TEI on GPU
+    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
+    with:
+      group: aws-g4dn-2xlarge
+      tests-path: tei
+      tei-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-embeddings-inference-cu122.1-4.ubuntu2204
diff --git a/.github/workflows/test-text-generation-inference-dlcs.yml b/.github/workflows/test-text-generation-inference-dlcs.yml
new file mode 100644
index 00000000..2d77aefb
--- /dev/null
+++ b/.github/workflows/test-text-generation-inference-dlcs.yml
@@ -0,0 +1,34 @@
+name: Test Hugging Face DLCs for TGI (GPU)
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - tests/tgi/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-text-generation-inference-dlcs.yml
+  pull_request:
+    types:
+      - synchronize
+      - ready_for_review
+    branches:
+      - main
+    paths:
+      - tests/tgi/*
+      - .github/workflows/run-tests-reusable.yml
+      - .github/workflows/test-text-generation-inference-dlcs.yml
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  tgi-on-gpu:
+    name: Test Hugging Face DLCs for TGI on GPU
+    uses: huggingface/Google-Cloud-Containers/.github/workflows/run-tests-reusable.yml@add-integration-tests
+    with:
+      group: aws-g4dn-2xlarge
+      tests-path: tgi
+      tgi-dlc: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..31a7b732
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
+log_cli = true
+log_cli_level = INFO
+log_format = %(asctime)s %(levelname)s %(message)s
+log_date_format = %Y-%m-%d %H:%M:%S
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/pytorch/__init__.py b/tests/pytorch/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/pytorch/inference/__init__.py b/tests/pytorch/inference/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/pytorch/inference/test_huggingface_inference_toolkit.py b/tests/pytorch/inference/test_huggingface_inference_toolkit.py
new file mode 100644
index 00000000..6145ac0c
--- /dev/null
+++ b/tests/pytorch/inference/test_huggingface_inference_toolkit.py
@@ -0,0 +1,144 @@
+import logging
+import os
+import threading
+import time
+
+import docker
+import pytest
+import requests
+
+from docker.types.containers import DeviceRequest
+
+from ...utils import gpu_available, stream_logs
+
+MAX_RETRIES = 10
+
+
+# The tests below only cover a subset of models and tasks, since most of the cases
+# are already tested within https://github.com/huggingface/huggingface-inference-toolkit,
+# as `huggingface-inference-toolkit` is the inference engine powering the PyTorch DLCs for Inference
+@pytest.mark.parametrize(
+    ("hf_model_id", "hf_task", "prediction_payload"),
+    [
+        (
+            "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
+            "text-classification",
+            {
+                "instances": ["I love this product", "I hate this product"],
+                "parameters": {"top_k": 2},
+            },
+        ),
+        (
+            "BAAI/bge-base-en-v1.5",
+            "sentence-embeddings",
+            {"instances": ["I love this product"]},
+        ),
+        (
+            "lambdalabs/miniSD-diffusers",
+            "text-to-image",
+            {
+                "instances": ["A cat holding a sign that says hello world"],
+                "parameters": {
+                    "negative_prompt": "",
+                    "num_inference_steps": 2,
+                    "guidance_scale": 0.7,
+                },
+            },
+        ),
+    ],
+)
+def test_huggingface_inference_toolkit(
+    caplog: pytest.LogCaptureFixture,
+    hf_model_id: str,
+    hf_task: str,
+    prediction_payload: dict,
+) -> None:
+    caplog.set_level(logging.INFO)
+
+    container_uri = os.getenv("INFERENCE_DLC", None)
+    if container_uri is None or container_uri == "":
+        assert False, "INFERENCE_DLC environment variable is not set"
+
+    client = docker.from_env()
+
+    logging.info(f"Starting container for {hf_model_id}...")
+    container = client.containers.run(
+        container_uri,
+        ports={"8080": 8080},
+        environment={
+            "HF_MODEL_ID": hf_model_id,
+            "HF_TASK": hf_task,
+            "AIP_MODE": "PREDICTION",
+            "AIP_HTTP_PORT": "8080",
+            "AIP_PREDICT_ROUTE": "/predict",
+            "AIP_HEALTH_ROUTE": "/health",
+        },
+        healthcheck={
+            "test": ["CMD", "curl", "-s", "http://localhost:8080/health"],
+            "interval": int(30 * 1e9),
+            "timeout": int(30 * 1e9),
+            "retries": 3,
+            "start_period": int(30 * 1e9),
+        },
+        platform="linux/amd64",
+        detach=True,
+        # Extra `device_requests` related to the CUDA devices if any
+        device_requests=[DeviceRequest(count=-1, capabilities=[["gpu"]])]
+        if gpu_available()
+        else None,
+    )
+
+    # Start log streaming in a separate thread
+    log_thread = threading.Thread(target=stream_logs, args=(container,))
+    log_thread.daemon = True
+    log_thread.start()
+
+    logging.info(f"Container {container.id} started...")  # type: ignore
+    container_healthy = False
+    for _ in range(MAX_RETRIES):
+        # If the container failed to start properly, then the health check will fail
+        if container.status == "exited":  # type: ignore
+            container_healthy = False
+            break
+
+        try:
+            logging.info(
+                f"Trying to connect to http://localhost:8080/health [retry {_ + 1}/{MAX_RETRIES}]..."
+            )
+            response = requests.get("http://localhost:8080/health")
+            assert response.status_code == 200
+            container_healthy = True
+            break
+        except requests.exceptions.ConnectionError:
+            time.sleep(30)
+
+    if not container_healthy:
+        logging.error("Container is not healthy after several retries...")
+        container.stop()  # type: ignore
+    assert container_healthy
+
+    container_failed = False
+    try:
+        logging.info("Sending prediction request to http://localhost:8080/predict...")
+        start_time = time.perf_counter()
+        response = requests.post(
+            "http://localhost:8080/predict",
+            json=prediction_payload,
+        )
+        end_time = time.perf_counter()
+        assert response.status_code in [200, 201]
+        assert "predictions" in response.json()
+        logging.info(f"Prediction request took {end_time - start_time:.2f}s")
+    except Exception as e:
+        logging.error(
+            f"Error while sending prediction request with exception: {e}"  # type: ignore
+        )
+        container_failed = True
+    finally:
+        if log_thread.is_alive():
+            log_thread.join(timeout=5)
+        logging.info(f"Stopping container {container.id}...")  # type: ignore
+        container.stop()  # type: ignore
+        container.remove()  # type: ignore
+
+    assert not container_failed
diff --git a/tests/pytorch/training/__init__.py b/tests/pytorch/training/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/pytorch/training/test_trl.py b/tests/pytorch/training/test_trl.py
new file mode 100644
index 00000000..8268e728
--- /dev/null
+++ b/tests/pytorch/training/test_trl.py
@@ -0,0 +1,141 @@
+import logging
+import os
+import pytest
+import threading
+
+import docker
+from docker.types.containers import DeviceRequest
+from pathlib import PosixPath
+
+from ...utils import gpu_available, stream_logs
+
+
+MODEL_ID = "sshleifer/tiny-gpt2"
+
+
+@pytest.mark.skipif(not gpu_available(), reason="CUDA is not available")
+def test_trl(caplog: pytest.LogCaptureFixture, tmp_path: PosixPath) -> None:
+    """Adapted from https://github.com/huggingface/trl/blob/main/examples/scripts/sft.py"""
+    caplog.set_level(logging.INFO)
+
+    container_uri = os.getenv("TRAINING_DLC", None)
+    if container_uri is None or container_uri == "":
+        assert False, "TRAINING_DLC environment variable is not set"
+
+    client = docker.from_env()
+
+    logging.info("Running the container for TRL...")
+    container = client.containers.run(
+        container_uri,
+        command=[
+            "trl",
+            "sft",
+            f"--model_name_or_path={MODEL_ID}",
+            "--dataset_text_field=text",
+            "--report_to=none",
+            "--learning_rate=1e-5",
+            "--per_device_train_batch_size=8",
+            "--gradient_accumulation_steps=1",
+            "--output_dir=/opt/huggingface/trained_model",
+            "--logging_steps=1",
+            "--max_steps=10",
+            "--gradient_checkpointing",
+        ],
+        environment={
+            "TRL_USE_RICH": "0",
+            "ACCELERATE_LOG_LEVEL": "INFO",
+            "TRANSFORMERS_LOG_LEVEL": "INFO",
+            "TQDM_POSITION": "-1",
+        },
+        platform="linux/amd64",
+        detach=True,
+        # Mount `tmp_path` into the container at `/opt/huggingface/trained_model`
+        volumes={
+            tmp_path: {
+                "bind": "/opt/huggingface/trained_model",
+                "mode": "rw",
+            }
+        },
+        # Extra `device_requests` related to the CUDA devices
+        device_requests=[DeviceRequest(count=-1, capabilities=[["gpu"]])],
+    )
+
+    # Start log streaming in a separate thread
+    log_thread = threading.Thread(target=stream_logs, args=(container,))
+    log_thread.daemon = True
+    log_thread.start()
+
+    # Wait for the container to finish
+    container.wait()  # type: ignore
+
+    # Remove the container
+    container.remove()  # type: ignore
+
+    assert tmp_path.exists()
+    assert (tmp_path / "model.safetensors").exists()
+
+
+@pytest.mark.skipif(not gpu_available(), reason="CUDA is not available")
+def test_trl_peft(caplog: pytest.LogCaptureFixture, tmp_path: PosixPath) -> None:
+    """Adapted from https://github.com/huggingface/trl/blob/main/examples/scripts/sft.py"""
+    caplog.set_level(logging.INFO)
+
+    container_uri = os.getenv("TRAINING_DLC", None)
+    if container_uri is None or container_uri == "":
+        assert False, "TRAINING_DLC environment variable is not set"
+
+    client = docker.from_env()
+
+    logging.info("Running the container for TRL...")
+    container = client.containers.run(
+        container_uri,
+        command=[
+            "trl",
+            "sft",
+            f"--model_name_or_path={MODEL_ID}",
+            "--dataset_text_field=text",
+            "--report_to=none",
+            "--learning_rate=1e-5",
+            "--per_device_train_batch_size=8",
+            "--gradient_accumulation_steps=1",
+            "--output_dir=/opt/huggingface/trained_model",
+            "--logging_steps=1",
+            "--max_steps=10",
+            "--gradient_checkpointing",
+            "--use_peft",
+            "--lora_r=64",
+            "--lora_alpha=16",
+        ],
+        environment={
+            "TRL_USE_RICH": "0",
+            "ACCELERATE_LOG_LEVEL": "INFO",
+            "TRANSFORMERS_LOG_LEVEL": "INFO",
+            "TQDM_POSITION": "-1",
+        },
+        platform="linux/amd64",
+        detach=True,
+        # Mount `tmp_path` into the container at `/opt/huggingface/trained_model`
+        volumes={
+            tmp_path: {
+                "bind": "/opt/huggingface/trained_model",
+                "mode": "rw",
+            }
+        },
+        # Extra `device_requests` related to the CUDA devices
+        device_requests=[DeviceRequest(count=-1, capabilities=[["gpu"]])],
+    )
+
+    # Start log streaming in a separate thread
+    log_thread = threading.Thread(target=stream_logs, args=(container,))
+    log_thread.daemon = True
+    log_thread.start()
+
+    # Wait for the container to finish
+    container.wait()  # type: ignore
+
+    # Remove the container
+    container.remove()  # type: ignore
+
+    assert tmp_path.exists()
+    assert (tmp_path / "adapter_config.json").exists()
+    assert (tmp_path / "adapter_model.safetensors").exists()
diff --git a/tests/requirements.txt b/tests/requirements.txt
new file mode 100644
index 00000000..089ca7e9
--- /dev/null
+++ b/tests/requirements.txt
@@ -0,0 +1,2 @@
+docker==7.1.0
+pytest==8.3.2
diff --git a/tests/tei/__init__.py b/tests/tei/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/tei/test_tei.py b/tests/tei/test_tei.py
new file mode 100644
index 00000000..5edeeb47
--- /dev/null
+++ b/tests/tei/test_tei.py
@@ -0,0 +1,135 @@
+import logging
+import os
+import threading
+import time
+
+import docker
+import pytest
+import requests
+
+from docker.types.containers import DeviceRequest
+
+from ..utils import gpu_available, stream_logs
+
+MAX_RETRIES = 10
+
+
+@pytest.mark.parametrize(
+    "text_embeddings_router_kwargs",
+    [
+        {
+            "MODEL_ID": "sentence-transformers/all-MiniLM-L6-v2",
+        },
+        {
+            "MODEL_ID": "sentence-transformers/all-MiniLM-L6-v2",
+            "AIP_MODE": "PREDICTION",
+        },
+    ],
+)
+def test_text_embeddings_inference(
+    caplog: pytest.LogCaptureFixture,
+    text_embeddings_router_kwargs: dict,
+) -> None:
+    caplog.set_level(logging.INFO)
+
+    container_uri = os.getenv("TEI_DLC", None)
+    if container_uri is None or container_uri == "":
+        assert False, "TEI_DLC environment variable is not set"
+
+    client = docker.from_env()
+
+    logging.info(
+        f"Starting container for {text_embeddings_router_kwargs.get('MODEL_ID', None)}..."
+    )
+    container = client.containers.run(
+        container_uri,
+        ports={8080: 8080},  # type: ignore
+        environment=text_embeddings_router_kwargs,
+        healthcheck={
+            "test": ["CMD", "curl", "-s", "http://localhost:8080/health"],
+            "interval": int(30 * 1e9),
+            "timeout": int(30 * 1e9),
+            "retries": 3,
+            "start_period": int(30 * 1e9),
+        },
+        platform="linux/amd64",
+        detach=True,
+        # Extra `device_requests` related to the CUDA devices if any
+        device_requests=[DeviceRequest(count=-1, capabilities=[["gpu"]])]
+        if gpu_available()
+        else None,
+    )
+    logging.info(f"Container {container.id} started...")  # type: ignore
+
+    # Start log streaming in a separate thread
+    log_thread = threading.Thread(target=stream_logs, args=(container,))
+    log_thread.daemon = True
+    log_thread.start()
+
+    # Get endpoint names for both health and predict (may differ if AIP env vars are defined)
+    health_route = os.getenv("AIP_HEALTH_ROUTE", "/health")
+    predict_route = (
+        os.getenv("AIP_PREDICT_ROUTE", "/predict")
+        if os.getenv("AIP_MODE")
+        else "/embed"
+    )
+
+    container_healthy = False
+    for _ in range(MAX_RETRIES):
+        # If the container failed to start properly, then the health check will fail
+        if container.status == "exited":  # type: ignore
+            container_healthy = False
+            break
+
+        try:
+            logging.info(
+                f"Trying to connect to http://localhost:8080{health_route} [retry {_ + 1}/{MAX_RETRIES}]..."
+            )
+            response = requests.get(f"http://localhost:8080{health_route}")
+            assert response.status_code == 200
+            container_healthy = True
+            break
+        except requests.exceptions.ConnectionError:
+            time.sleep(30)
+
+    if not container_healthy:
+        logging.error("Container is not healthy after several retries...")
+        container.stop()  # type: ignore
+
+    assert container_healthy
+
+    container_failed = False
+    try:
+        logging.info(
+            f"Sending prediction request to http://localhost:8080{predict_route}..."
+        )
+        payload = {"inputs": "What's Deep Learning?"}
+
+        if os.getenv("AIP_MODE"):
+            payload = {"instances": [payload]}
+
+        start_time = time.perf_counter()
+        response = requests.post(
+            f"http://localhost:8080{predict_route}",
+            json=payload,
+        )
+        end_time = time.perf_counter()
+
+        assert response.status_code in [200, 201]
+        assert response.json() is not None
+
+        logging.info(f"Prediction request took {end_time - start_time:.2f}s")
+    except Exception as e:
+        logging.error(
+            f"Error while sending prediction request with exception: {e}"  # type: ignore
+        )
+        container_failed = True
+    finally:
+        if log_thread.is_alive():
+            log_thread.join(timeout=5)
+
+        logging.info(f"Stopping container {container.id}...")  # type: ignore
+        container.stop()  # type: ignore
+        container.remove()  # type: ignore
+
+    assert not container_failed
diff --git a/tests/tgi/__init__.py b/tests/tgi/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/tgi/test_tgi.py b/tests/tgi/test_tgi.py
new file mode 100644
index 00000000..c50deb86
--- /dev/null
+++ b/tests/tgi/test_tgi.py
@@ -0,0 +1,155 @@
+import logging
+import os
+import threading
+import time
+
+import docker
+import pytest
+import requests
+
+from docker.types.containers import DeviceRequest
+
+from ..utils import gpu_available, stream_logs, supports_flash_attention
+
+MAX_RETRIES = 10
+
+
+@pytest.mark.skipif(not gpu_available(), reason="CUDA is not available")
+@pytest.mark.parametrize(
+    "text_generation_launcher_kwargs",
+    [
+        {
+            "MODEL_ID": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+            "MAX_INPUT_TOKENS": "512",
+            "MAX_TOTAL_TOKENS": "1024",
+            "MAX_BATCH_PREFILL_TOKENS": "1512",
+        },
+        {
+            "MODEL_ID": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+            "MAX_INPUT_TOKENS": "512",
+            "MAX_TOTAL_TOKENS": "1024",
+            "MAX_BATCH_PREFILL_TOKENS": "1512",
+            "AIP_MODE": "PREDICTION",
+        },
+    ],
+)
+def test_text_generation_inference(
+    caplog: pytest.LogCaptureFixture,
+    text_generation_launcher_kwargs: dict,
+) -> None:
+    caplog.set_level(logging.INFO)
+
+    container_uri = os.getenv("TGI_DLC", None)
+    if container_uri is None or container_uri == "":
+        assert False, "TGI_DLC environment variable is not set"
+
+    client = docker.from_env()
+
+    # If the GPU doesn't support Flash Attention, then set `USE_FLASH_ATTENTION=false`
+    if not supports_flash_attention():
+        text_generation_launcher_kwargs["USE_FLASH_ATTENTION"] = "false"
+
+    logging.info(
+        f"Starting container for {text_generation_launcher_kwargs.get('MODEL_ID', None)}..."
+    )
+    container = client.containers.run(
+        container_uri,
+        ports={8080: 8080},  # type: ignore
+        environment=text_generation_launcher_kwargs,
+        healthcheck={
+            "test": ["CMD", "curl", "-s", "http://localhost:8080/health"],
+            "interval": int(30 * 1e9),
+            "timeout": int(30 * 1e9),
+            "retries": 3,
+            "start_period": int(30 * 1e9),
+        },
+        platform="linux/amd64",
+        detach=True,
+        # Extra `device_requests` related to the CUDA devices
+        device_requests=[DeviceRequest(count=-1, capabilities=[["gpu"]])],
+    )
+    logging.info(f"Container {container.id} started...")  # type: ignore
+
+    # Start log streaming in a separate thread
+    log_thread = threading.Thread(target=stream_logs, args=(container,))
+    log_thread.daemon = True
+    log_thread.start()
+
+    # Get endpoint names for both health and predict (may differ if AIP env vars are defined)
+    health_route = os.getenv("AIP_HEALTH_ROUTE", "/health")
+    predict_route = (
+        os.getenv("AIP_PREDICT_ROUTE", "/predict")
+        if os.getenv("AIP_MODE")
+        else "/generate"
+    )
+
+    container_healthy = False
+    for _ in range(MAX_RETRIES):
+        # If the container failed to start properly, then the health check will fail
+        if container.status == "exited":  # type: ignore
+            container_healthy = False
+            break
+
+        try:
+            logging.info(
+                f"Trying to connect to http://localhost:8080{health_route} [retry {_ + 1}/{MAX_RETRIES}]..."
+            )
+            response = requests.get(f"http://localhost:8080{health_route}")
+            assert response.status_code == 200
+            container_healthy = True
+            break
+        except requests.exceptions.ConnectionError:
+            time.sleep(30)
+
+    if not container_healthy:
+        logging.error("Container is not healthy after several retries...")
+        container.stop()  # type: ignore
+
+    assert container_healthy
+
+    container_failed = False
+    try:
+        for prompt in ["What's Deep Learning?", "What's the capital of France?"]:
+            logging.info(
+                f"Sending prediction request for {prompt=} to http://localhost:8080{predict_route}..."
+            )
+            payload = {
+                "inputs": prompt,
+                "parameters": {
+                    "max_new_tokens": 256,
+                    "do_sample": True,
+                    "top_p": 0.95,
+                    "temperature": 1.0,
+                },
+            }
+
+            if os.getenv("AIP_MODE"):
+                payload = {"instances": [payload]}
+
+            start_time = time.perf_counter()
+            response = requests.post(
+                f"http://localhost:8080{predict_route}",
+                json=payload,
+            )
+            end_time = time.perf_counter()
+
+            assert response.status_code in [200, 201]
+            assert "generated_text" in response.json()
+
+            logging.info(
+                f"Prediction request for {prompt=} took {end_time - start_time:.2f}s"
+            )
+    except Exception as e:
+        logging.error(
+            f"Error while sending prediction request with exception: {e}"  # type: ignore
+        )
+        container_failed = True
+    finally:
+        if log_thread.is_alive():
+            log_thread.join(timeout=5)
+
+        logging.info(f"Stopping container {container.id}...")  # type: ignore
+        container.stop()  # type: ignore
+        container.remove()  # type: ignore
+
+    assert not container_failed
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 00000000..b4814029
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,30 @@
+import logging
+import subprocess
+
+from docker.models.containers import Container
+
+
+def stream_logs(container: Container) -> None:
+    """Streams the logs generated by `containers.run` via the Docker SDK for Python."""
+    for line in container.logs(stream=True, follow=True):
+        logging.info(line.decode("utf-8", errors="ignore").strip())
+
+
+def gpu_available() -> bool:
+    """Returns whether the current environment has a GPU available."""
+    try:
+        result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
+        return result.returncode == 0
+    except FileNotFoundError:
+        return False
+
+
+def supports_flash_attention() -> bool:
+    """Returns whether the current GPU supports Flash Attention or not (based on compute capability)."""
+    output = subprocess.run(
+        ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader,nounits"],
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+    return float(output.stdout.strip()) >= 8.0
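Not part of the patch itself, but as a reference for local debugging: the reusable workflow's test step reduces to installing the test requirements with `uv` and running `pytest` with the relevant DLC URI exported. A minimal sketch, assuming Docker and `uv` are available locally, reusing the CPU inference DLC URI pinned in `test-pytorch-inference-dlcs.yml`, and substituting a temporary directory for `${{ runner.temp }}`:

```bash
# Create the virtual environment and install the pinned test dependencies
uv venv --python 3.10
uv pip install -r tests/requirements.txt

# Run the PyTorch Inference DLC tests against the CPU image
# (INFERENCE_DLC is the environment variable the tests read; the URI comes from the workflow above)
INFERENCE_DLC="us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-inference-cpu.2-2.transformers.4-44.ubuntu2204.py311" \
  uv run pytest -s tests/pytorch/inference --basetemp="$(mktemp -d)"
```

The same pattern applies to the other suites by swapping the environment variable (`TRAINING_DLC`, `TGI_DLC`, `TEI_DLC`) and the tests path (`tests/pytorch/training`, `tests/tgi`, `tests/tei`), keeping in mind that the training and TGI suites skip themselves unless a CUDA GPU is available.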