Triton Inference Server In-Process Python API [Beta]
Enables developers to integrate Triton Inference Server instances into their applications.

Co-authored-by: Ryan McCormick
Co-authored-by: Tabrizian
Co-authored-by: Olga Andreeva
Co-authored-by: GuanLuo
nnshah1 authored Jan 11, 2024
1 parent 3b97b2f commit 6d0cd38
Showing 21 changed files with 4,945 additions and 14 deletions.
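
The new API surface is exercised by python/test/test_api.py below. As an illustration, here is a minimal usage sketch (not part of this commit): the model repository path is a placeholder, the model name "test" is the one used by the tests, and the call pattern follows the test file.

    import numpy
    import tritonserver

    # Point the in-process server at an existing model repository (placeholder path).
    server = tritonserver.Server(
        model_repository="/path/to/model_repository"
    ).start(wait_until_ready=True)

    # Run inference on a model named "test" (as in the tests below) and read the
    # output back as a numpy array via DLPack.
    fp16_input = numpy.random.rand(1, 100).astype(numpy.float16)
    for response in server.model("test").infer(
        inputs={"fp16_input": fp16_input},
        output_memory_type="cpu",
    ):
        fp16_output = numpy.from_dlpack(response.outputs["fp16_output"])

    server.stop()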
18 changes: 13 additions & 5 deletions python/CMakeLists.txt
@@ -30,7 +30,7 @@ add_subdirectory(tritonserver)
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/TRITON_VERSION ${TRITON_VERSION})
configure_file(../LICENSE LICENSE.txt COPYONLY)
configure_file(setup.py setup.py @ONLY)
file(COPY test/test_binding.py DESTINATION ./test/.)
file(COPY test/ DESTINATION ./test/.)

set(WHEEL_DEPENDS
${CMAKE_CURRENT_BINARY_DIR}/TRITON_VERSION
@@ -58,12 +58,20 @@ add_custom_target(
"${wheel_stamp_file}"
)


# Wheel
set(WHEEL_OUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/generic/wheel/dist/")
install(
CODE "file(GLOB _Wheel \"${CMAKE_CURRENT_BINARY_DIR}/generic/triton*.whl\")"
CODE "file(INSTALL \${_Wheel} DESTINATION \"${CMAKE_INSTALL_PREFIX}/python\")"
DIRECTORY
${WHEEL_OUT_DIR}
DESTINATION "${CMAKE_INSTALL_PREFIX}/python"
)

# Test

# Tests
set(TEST_DIR "${CMAKE_CURRENT_BINARY_DIR}/test")
install(
CODE "file(INSTALL ${CMAKE_CURRENT_BINARY_DIR}/test/test_binding.py DESTINATION \"${CMAKE_INSTALL_PREFIX}/python\")"
DIRECTORY
${TEST_DIR}
DESTINATION "${CMAKE_INSTALL_PREFIX}/python"
)
6 changes: 4 additions & 2 deletions python/build_wheel.py
@@ -93,11 +93,13 @@ def sed(pattern, replace, source, dest=None):
print("=== Building in: {}".format(os.getcwd()))
print("=== Using builddir: {}".format(FLAGS.whl_dir))
print("Adding package files")

mkdir(os.path.join(FLAGS.whl_dir, "tritonserver"))
shutil.copy("tritonserver/__init__.py", os.path.join(FLAGS.whl_dir, "tritonserver"))

# Type checking marker file indicating support for type checkers.
# https://peps.python.org/pep-0561/
shutil.copy("tritonserver/py.typed", os.path.join(FLAGS.whl_dir, "tritonserver"))
cpdir("tritonserver/_c", os.path.join(FLAGS.whl_dir, "tritonserver", "_c"))
cpdir("tritonserver/_api", os.path.join(FLAGS.whl_dir, "tritonserver", "_api"))
PYBIND_LIB = os.path.basename(FLAGS.binding_path)
shutil.copyfile(
FLAGS.binding_path,
19 changes: 17 additions & 2 deletions python/setup.py
@@ -61,14 +61,27 @@ def get_tag(self):
data_files = [
("", ["LICENSE.txt"]),
]
platform_package_data = [os.environ["TRITON_PYBIND"]]

# Type checking marker file indicating support for type checkers.
# https://peps.python.org/pep-0561/
# Type hints for c extension generated by mypy
platform_package_data = [
os.environ["TRITON_PYBIND"],
"py.typed",
"_c/__init__.pyi",
"_c/triton_bindings.pyi",
]

gpu_extras = ["cupy-cuda12x"]
test_extras = ["pytest"]
all_extras = gpu_extras + test_extras

setup(
name="tritonserver",
version=VERSION,
author="NVIDIA Inc.",
author_email="[email protected]",
description="Python API of the Triton In-Process Server",
description="Triton Inference Server In-Process Python API",
license="BSD",
url="https://developer.nvidia.com/nvidia-triton-inference-server",
classifiers=[
@@ -95,4 +108,6 @@ def get_tag(self):
zip_safe=False,
cmdclass={"bdist_wheel": bdist_wheel},
data_files=data_files,
install_requires=["numpy"],
extras_require={"GPU": gpu_extras, "test": test_extras, "all": all_extras},
)
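
The new extras make the GPU path (cupy-cuda12x) and the test dependencies opt-in. A hedged sketch of what the "GPU" extra enables, mirroring InferenceTests.test_basic_inference in the test file below; the server setup and model name are the same placeholders as in the sketch above, and the output_memory_type and DLPack usage follow the tests:

    import cupy  # installed via the "GPU" (or "all") extra
    import numpy
    import tritonserver

    server = tritonserver.Server(
        model_repository="/path/to/model_repository"  # placeholder path
    ).start(wait_until_ready=True)

    fp16_input = numpy.random.rand(1, 100).astype(numpy.float16)
    for response in server.model("test").infer(
        inputs={"fp16_input": fp16_input},
        output_memory_type="gpu",  # request outputs in GPU memory
    ):
        # Outputs implement the DLPack protocol, so cupy can wrap them without a copy.
        fp16_output = cupy.from_dlpack(response.outputs["fp16_output"])

    server.stop()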
254 changes: 254 additions & 0 deletions python/test/test_api.py
@@ -0,0 +1,254 @@
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import asyncio
import json
import os
import queue
import time
import unittest

import numpy
import pytest
import tritonserver

try:
import cupy
except ImportError:
cupy = None

try:
import torch
except ImportError:
torch = None

module_directory = os.path.split(os.path.abspath(__file__))[0]
test_model_directory = os.path.abspath(
os.path.join(module_directory, "test_api_models")
)

server_options = tritonserver.Options(
server_id="TestServer",
model_repository=test_model_directory,
log_verbose=0,
log_error=True,
exit_on_error=True,
strict_model_config=False,
model_control_mode=tritonserver.ModelControlMode.EXPLICIT,
)


class ModelTests(unittest.TestCase):
def test_create_request(self):
server = tritonserver.Server(server_options).start(wait_until_ready=True)

request = server.models()["test"].create_request()

request = tritonserver.InferenceRequest(server.model("test"))


class AllocatorTests(unittest.TestCase):
def test_allocate_on_cpu_and_reshape(self):
allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.CPU]

memory_buffer = allocator.allocate(
memory_type=tritonserver.MemoryType.CPU, memory_type_id=0, size=200
)

cpu_array = memory_buffer.owner

self.assertEqual(memory_buffer.size, 200)

fp32_size = int(memory_buffer.size / 4)

tensor = tritonserver.Tensor(
tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer
)

cpu_fp32_array = numpy.from_dlpack(tensor)
self.assertEqual(cpu_array.ctypes.data, cpu_fp32_array.ctypes.data)
self.assertEqual(cpu_fp32_array.dtype, numpy.float32)
self.assertEqual(cpu_fp32_array.nbytes, 200)

@pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
@pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed")
def test_allocate_on_gpu_and_reshape(self):
if cupy is None:
return

allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]

memory_buffer = allocator.allocate(
memory_type=tritonserver.MemoryType.GPU, memory_type_id=0, size=200
)

gpu_array = memory_buffer.owner

gpu_array = cupy.empty([10, 20], dtype=cupy.uint8)
memory_buffer = tritonserver.MemoryBuffer.from_dlpack(gpu_array)

self.assertEqual(memory_buffer.size, 200)

fp32_size = int(memory_buffer.size / 4)

tensor = tritonserver.Tensor(
tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer
)

gpu_fp32_array = cupy.from_dlpack(tensor)
self.assertEqual(
gpu_array.__cuda_array_interface__["data"][0],
gpu_fp32_array.__cuda_array_interface__["data"][0],
)
self.assertEqual(gpu_fp32_array.dtype, cupy.float32)
self.assertEqual(gpu_fp32_array.nbytes, 200)

torch_fp32_tensor = torch.from_dlpack(tensor)
self.assertEqual(torch_fp32_tensor.dtype, torch.float32)
self.assertEqual(
torch_fp32_tensor.data_ptr(), gpu_array.__cuda_array_interface__["data"][0]
)
self.assertEqual(torch_fp32_tensor.nbytes, 200)


class TensorTests(unittest.TestCase):
@pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
def test_cpu_to_gpu(self):
if cupy is None:
return
cpu_array = numpy.random.rand(1, 3, 100, 100).astype(numpy.float32)
cpu_tensor = tritonserver.Tensor.from_dlpack(cpu_array)
gpu_tensor = cpu_tensor.to_device("gpu:0")
gpu_array = cupy.from_dlpack(gpu_tensor)

self.assertEqual(gpu_array.device, cupy.cuda.Device(0))

numpy.testing.assert_array_equal(cpu_array, gpu_array.get())

memory_buffer = tritonserver.MemoryBuffer.from_dlpack(gpu_array)

self.assertEqual(
gpu_array.__cuda_array_interface__["data"][0], memory_buffer.data_ptr
)

@pytest.mark.skipif(
torch is None, reason="Skipping gpu memory, torch not installed"
)
@pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
def test_gpu_tensor_from_dl_pack(self):
if cupy is None or torch is None:
return
cupy_array = cupy.ones([100]).astype(cupy.float64)
tensor = tritonserver.Tensor.from_dlpack(cupy_array)
torch_tensor = torch.from_dlpack(cupy_array)

self.assertEqual(torch_tensor.data_ptr(), tensor.data_ptr)
self.assertEqual(torch_tensor.nbytes, tensor.size)
self.assertEqual(torch_tensor.__dlpack_device__(), tensor.__dlpack_device__())

@pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed")
def test_tensor_from_numpy(self):
cpu_array = numpy.random.rand(1, 3, 100, 100).astype(numpy.float32)
tensor = tritonserver.Tensor.from_dlpack(cpu_array)
torch_tensor = torch.from_dlpack(tensor)
numpy.testing.assert_array_equal(torch_tensor.numpy(), cpu_array)
self.assertEqual(torch_tensor.data_ptr(), cpu_array.ctypes.data)


class ServerTests(unittest.TestCase):
def test_not_started(self):
server = tritonserver.Server()
with self.assertRaises(tritonserver.InvalidArgumentError):
server.ready()

def test_invalid_option_type(self):
server = tritonserver.Server(server_id=1)
with self.assertRaises(TypeError):
server.start()

server = tritonserver.Server(model_repository=1)
with self.assertRaises(TypeError):
server.start()

def test_invalid_repo(self):
with self.assertRaises(tritonserver.InternalError):
tritonserver.Server(model_repository="foo").start()

def test_ready(self):
server = tritonserver.Server(server_options).start()
self.assertTrue(server.ready())


class InferenceTests(unittest.TestCase):
def test_basic_inference(self):
server = tritonserver.Server(server_options).start(wait_until_ready=True)

self.assertTrue(server.ready())

server.load(
"test",
{
"config": json.dumps(
{
"backend": "python",
"parameters": {"decoupled": {"string_value": "False"}},
}
)
},
)

fp16_input = numpy.random.rand(1, 100).astype(dtype=numpy.float16)

for response in server.model("test").infer(
inputs={"fp16_input": fp16_input},
output_memory_type="cpu",
raise_on_error=True,
):
fp16_output = numpy.from_dlpack(response.outputs["fp16_output"])
numpy.testing.assert_array_equal(fp16_input, fp16_output)

for response in server.model("test").infer(
inputs={"fp16_input": fp16_input},
output_memory_type="gpu",
):
fp16_output = cupy.from_dlpack(response.outputs["fp16_output"])
self.assertEqual(fp16_input[0][0], fp16_output[0][0])

for response in server.model("test").infer(
inputs={"string_input": [["hello"]]},
output_memory_type="gpu",
):
text_output = response.outputs["string_output"].to_string_array()
self.assertEqual(text_output[0][0], "hello")

for response in server.model("test").infer(
inputs={"string_input": tritonserver.Tensor.from_string_array([["hello"]])},
output_memory_type="gpu",
):
text_output = response.outputs["string_output"].to_string_array()
self.assertEqual(text_output[0][0], "hello")
server.stop()
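
For context, the tests above assume a model named "test" served by the Python backend (it is loaded with backend "python" and a "decoupled" parameter). The actual test_api_models/ files are not shown in this excerpt; a hypothetical identity model.py consistent with how the tests call it might look like the following sketch:

    # Hypothetical sketch only; not the model shipped in this commit.
    import triton_python_backend_utils as pb_utils


    class TritonPythonModel:
        def execute(self, requests):
            responses = []
            for request in requests:
                output_tensors = []
                # Echo whichever inputs are present back under the matching *_output
                # name, so fp16_input -> fp16_output and string_input -> string_output.
                for name in ("fp16_input", "string_input"):
                    tensor = pb_utils.get_input_tensor_by_name(request, name)
                    if tensor is None:
                        continue
                    output_tensors.append(
                        pb_utils.Tensor(
                            name.replace("_input", "_output"), tensor.as_numpy()
                        )
                    )
                responses.append(
                    pb_utils.InferenceResponse(output_tensors=output_tensors)
                )
            return responses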