Triton Inference Server In-Process Python API [Beta]
Enables developers to integrate Triton Inference Server instances into their applications.

Co-authored-by: Ryan McCormick
Co-authored-by: Tabrizian
Co-authored-by: Olga Andreeva
Co-authored-by: GuanLuo
Showing 21 changed files with 4,945 additions and 14 deletions.
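For orientation, here is a minimal sketch of the in-process API introduced by this commit, pieced together from the tests added below. The "test" model name and its echo behavior are taken from those tests; the "./models" repository path is an illustrative placeholder, not something stated in the commit.

import numpy
import tritonserver

# Start an in-process server against a local model repository
# ("./models" is an assumed path for illustration only).
server = tritonserver.Server(
    tritonserver.Options(model_repository="./models")
).start(wait_until_ready=True)

# Run inference; outputs support DLPack and can be viewed as numpy arrays.
fp16_input = numpy.random.rand(1, 100).astype(numpy.float16)
for response in server.model("test").infer(inputs={"fp16_input": fp16_input}):
    fp16_output = numpy.from_dlpack(response.outputs["fp16_output"])
    # The "test" model used by the new tests echoes its input back.
    numpy.testing.assert_array_equal(fp16_input, fp16_output)

server.stop()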
@@ -61,14 +61,27 @@ def get_tag(self):
 data_files = [
     ("", ["LICENSE.txt"]),
 ]
-platform_package_data = [os.environ["TRITON_PYBIND"]]
+
+# Type checking marker file indicating support for type checkers.
+# https://peps.python.org/pep-0561/
+# Type hints for c extension generated by mypy
+platform_package_data = [
+    os.environ["TRITON_PYBIND"],
+    "py.typed",
+    "_c/__init__.pyi",
+    "_c/triton_bindings.pyi",
+]
+
+gpu_extras = ["cupy-cuda12x"]
+test_extras = ["pytest"]
+all_extras = gpu_extras + test_extras

 setup(
     name="tritonserver",
     version=VERSION,
     author="NVIDIA Inc.",
     author_email="[email protected]",
-    description="Python API of the Triton In-Process Server",
+    description="Triton Inference Server In-Process Python API",
     license="BSD",
     url="https://developer.nvidia.com/nvidia-triton-inference-server",
     classifiers=[

@@ -95,4 +108,6 @@ def get_tag(self):
     zip_safe=False,
     cmdclass={"bdist_wheel": bdist_wheel},
     data_files=data_files,
+    install_requires=["numpy"],
+    extras_require={"GPU": gpu_extras, "test": test_extras, "all": all_extras},
 )
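As a usage note (an assumption about the packaging workflow, not something shown in this commit): the GPU, test, and all extras declared above can be requested at install time with pip's standard extras syntax, for example tritonserver[GPU] for the CuPy dependency, tritonserver[test] for pytest, or tritonserver[all] for both, whether the wheel is installed from a local build or from an index.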
@@ -0,0 +1,254 @@
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import asyncio
import json
import os
import queue
import time
import unittest

import numpy
import pytest
import tritonserver

try:
    import cupy
except ImportError:
    cupy = None

try:
    import torch
except ImportError:
    torch = None

module_directory = os.path.split(os.path.abspath(__file__))[0]
test_model_directory = os.path.abspath(
    os.path.join(module_directory, "test_api_models")
)

# Options shared by the tests below: point the in-process server at the
# test model repository and require models to be loaded explicitly.
server_options = tritonserver.Options(
    server_id="TestServer",
    model_repository=test_model_directory,
    log_verbose=0,
    log_error=True,
    exit_on_error=True,
    strict_model_config=False,
    model_control_mode=tritonserver.ModelControlMode.EXPLICIT,
)
class ModelTests(unittest.TestCase):
    def test_create_request(self):
        server = tritonserver.Server(server_options).start(wait_until_ready=True)

        # Requests can be created either from a loaded model handle or by
        # constructing an InferenceRequest directly.
        request = server.models()["test"].create_request()

        request = tritonserver.InferenceRequest(server.model("test"))
class AllocatorTests(unittest.TestCase):
    def test_allocate_on_cpu_and_reshape(self):
        allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.CPU]

        memory_buffer = allocator.allocate(
            memory_type=tritonserver.MemoryType.CPU, memory_type_id=0, size=200
        )

        cpu_array = memory_buffer.owner

        self.assertEqual(memory_buffer.size, 200)

        fp32_size = int(memory_buffer.size / 4)

        tensor = tritonserver.Tensor(
            tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer
        )

        cpu_fp32_array = numpy.from_dlpack(tensor)
        self.assertEqual(cpu_array.ctypes.data, cpu_fp32_array.ctypes.data)
        self.assertEqual(cpu_fp32_array.dtype, numpy.float32)
        self.assertEqual(cpu_fp32_array.nbytes, 200)

    @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
    @pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed")
    def test_allocate_on_gpu_and_reshape(self):
        if cupy is None:
            return

        allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]

        memory_buffer = allocator.allocate(
            memory_type=tritonserver.MemoryType.GPU, memory_type_id=0, size=200
        )

        gpu_array = memory_buffer.owner

        # Replace the allocated buffer with one created from an existing
        # cupy array via DLPack.
        gpu_array = cupy.empty([10, 20], dtype=cupy.uint8)
        memory_buffer = tritonserver.MemoryBuffer.from_dlpack(gpu_array)

        self.assertEqual(memory_buffer.size, 200)

        fp32_size = int(memory_buffer.size / 4)

        tensor = tritonserver.Tensor(
            tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer
        )

        gpu_fp32_array = cupy.from_dlpack(tensor)
        self.assertEqual(
            gpu_array.__cuda_array_interface__["data"][0],
            gpu_fp32_array.__cuda_array_interface__["data"][0],
        )
        self.assertEqual(gpu_fp32_array.dtype, cupy.float32)
        self.assertEqual(gpu_fp32_array.nbytes, 200)

        torch_fp32_tensor = torch.from_dlpack(tensor)
        self.assertEqual(torch_fp32_tensor.dtype, torch.float32)
        self.assertEqual(
            torch_fp32_tensor.data_ptr(), gpu_array.__cuda_array_interface__["data"][0]
        )
        self.assertEqual(torch_fp32_tensor.nbytes, 200)
class TensorTests(unittest.TestCase):
    @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
    def test_cpu_to_gpu(self):
        if cupy is None:
            return
        cpu_array = numpy.random.rand(1, 3, 100, 100).astype(numpy.float32)
        cpu_tensor = tritonserver.Tensor.from_dlpack(cpu_array)
        gpu_tensor = cpu_tensor.to_device("gpu:0")
        gpu_array = cupy.from_dlpack(gpu_tensor)

        self.assertEqual(gpu_array.device, cupy.cuda.Device(0))

        numpy.testing.assert_array_equal(cpu_array, gpu_array.get())

        memory_buffer = tritonserver.MemoryBuffer.from_dlpack(gpu_array)

        self.assertEqual(
            gpu_array.__cuda_array_interface__["data"][0], memory_buffer.data_ptr
        )

    @pytest.mark.skipif(
        torch is None, reason="Skipping gpu memory, torch not installed"
    )
    @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
    def test_gpu_tensor_from_dl_pack(self):
        if cupy is None or torch is None:
            return
        cupy_array = cupy.ones([100]).astype(cupy.float64)
        tensor = tritonserver.Tensor.from_dlpack(cupy_array)
        torch_tensor = torch.from_dlpack(cupy_array)

        self.assertEqual(torch_tensor.data_ptr(), tensor.data_ptr)
        self.assertEqual(torch_tensor.nbytes, tensor.size)
        self.assertEqual(torch_tensor.__dlpack_device__(), tensor.__dlpack_device__())

    @pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed")
    def test_tensor_from_numpy(self):
        cpu_array = numpy.random.rand(1, 3, 100, 100).astype(numpy.float32)
        tensor = tritonserver.Tensor.from_dlpack(cpu_array)
        torch_tensor = torch.from_dlpack(tensor)
        numpy.testing.assert_array_equal(torch_tensor.numpy(), cpu_array)
        self.assertEqual(torch_tensor.data_ptr(), cpu_array.ctypes.data)
class ServerTests(unittest.TestCase):
    def test_not_started(self):
        server = tritonserver.Server()
        with self.assertRaises(tritonserver.InvalidArgumentError):
            server.ready()

    def test_invalid_option_type(self):
        server = tritonserver.Server(server_id=1)
        with self.assertRaises(TypeError):
            server.start()

        server = tritonserver.Server(model_repository=1)
        with self.assertRaises(TypeError):
            server.start()

    def test_invalid_repo(self):
        with self.assertRaises(tritonserver.InternalError):
            tritonserver.Server(model_repository="foo").start()

    def test_ready(self):
        server = tritonserver.Server(server_options).start()
        self.assertTrue(server.ready())
class InferenceTests(unittest.TestCase):
    def test_basic_inference(self):
        server = tritonserver.Server(server_options).start(wait_until_ready=True)

        self.assertTrue(server.ready())

        # Load the Python-backend "test" model with an inline configuration.
        server.load(
            "test",
            {
                "config": json.dumps(
                    {
                        "backend": "python",
                        "parameters": {"decoupled": {"string_value": "False"}},
                    }
                )
            },
        )

        fp16_input = numpy.random.rand(1, 100).astype(dtype=numpy.float16)

        for response in server.model("test").infer(
            inputs={"fp16_input": fp16_input},
            output_memory_type="cpu",
            raise_on_error=True,
        ):
            fp16_output = numpy.from_dlpack(response.outputs["fp16_output"])
            numpy.testing.assert_array_equal(fp16_input, fp16_output)

        for response in server.model("test").infer(
            inputs={"fp16_input": fp16_input},
            output_memory_type="gpu",
        ):
            fp16_output = cupy.from_dlpack(response.outputs["fp16_output"])
            self.assertEqual(fp16_input[0][0], fp16_output[0][0])

        for response in server.model("test").infer(
            inputs={"string_input": [["hello"]]},
            output_memory_type="gpu",
        ):
            text_output = response.outputs["string_output"].to_string_array()
            self.assertEqual(text_output[0][0], "hello")

        for response in server.model("test").infer(
            inputs={"string_input": tritonserver.Tensor.from_string_array([["hello"]])},
            output_memory_type="gpu",
        ):
            text_output = response.outputs["string_output"].to_string_array()
            self.assertEqual(text_output[0][0], "hello")
        server.stop()
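The inference tests above load a Python-backend model named "test" that echoes its inputs (fp16_input to fp16_output, string_input to string_output). The model's own source is not part of the portion of the diff shown here; the following is only a rough sketch of what such an echo model could look like with the Python backend's pb_utils API, with the tensor names assumed from the assertions above.

# Hypothetical sketch of an echo model for the Python backend; the actual
# "test" model shipped with this commit may differ (its source is not shown here).
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for request in requests:
            output_tensors = []
            # Copy each provided input to the correspondingly named output
            # (fp16_input -> fp16_output, string_input -> string_output).
            for name in ("fp16_input", "string_input"):
                tensor = pb_utils.get_input_tensor_by_name(request, name)
                if tensor is None:
                    continue
                output_name = name.replace("_input", "_output")
                output_tensors.append(pb_utils.Tensor(output_name, tensor.as_numpy()))
            responses.append(pb_utils.InferenceResponse(output_tensors=output_tensors))
        return responses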