Triton Inference Server In-Process Python API [Beta]
Enables developers to integrate Triton Inference Server instances into their applications.

Co-authored-by: Ryan McCormick
Co-authored-by: Tabrizian
Co-authored-by: Olga Andreeva
Co-authored-by: GuanLuo
Showing 21 changed files with 4,945 additions and 14 deletions.
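For orientation, here is a minimal sketch of the in-process API introduced by this commit, pieced together from the tests added below. The "test" model name and its echo behavior are taken from those tests; the "./models" repository path is an illustrative placeholder, not something stated in the commit.

import numpy
import tritonserver

# Start an in-process server against a local model repository
# ("./models" is an assumed path for illustration only).
server = tritonserver.Server(
    tritonserver.Options(model_repository="./models")
).start(wait_until_ready=True)

# Run inference; outputs support DLPack and can be viewed as numpy arrays.
fp16_input = numpy.random.rand(1, 100).astype(numpy.float16)
for response in server.model("test").infer(inputs={"fp16_input": fp16_input}):
    fp16_output = numpy.from_dlpack(response.outputs["fp16_output"])
    # The "test" model used by the new tests echoes its input back.
    numpy.testing.assert_array_equal(fp16_input, fp16_output)

server.stop()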
@@ -61,14 +61,27 @@ def get_tag(self):
 data_files = [
     ("", ["LICENSE.txt"]),
 ]
-platform_package_data = [os.environ["TRITON_PYBIND"]]
+
+# Type checking marker file indicating support for type checkers.
+# https://peps.python.org/pep-0561/
+# Type hints for c extension generated by mypy
+platform_package_data = [
+    os.environ["TRITON_PYBIND"],
+    "py.typed",
+    "_c/__init__.pyi",
+    "_c/triton_bindings.pyi",
+]
+
+gpu_extras = ["cupy-cuda12x"]
+test_extras = ["pytest"]
+all_extras = gpu_extras + test_extras

 setup(
     name="tritonserver",
     version=VERSION,
     author="NVIDIA Inc.",
     author_email="[email protected]",
-    description="Python API of the Triton In-Process Server",
+    description="Triton Inference Server In-Process Python API",
     license="BSD",
     url="https://developer.nvidia.com/nvidia-triton-inference-server",
     classifiers=[

@@ -95,4 +108,6 @@ def get_tag(self):
     zip_safe=False,
     cmdclass={"bdist_wheel": bdist_wheel},
     data_files=data_files,
+    install_requires=["numpy"],
+    extras_require={"GPU": gpu_extras, "test": test_extras, "all": all_extras},
 )
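As a usage note (an assumption about the packaging workflow, not something shown in this commit): the GPU, test, and all extras declared above can be requested at install time with pip's standard extras syntax, for example tritonserver[GPU] for the CuPy dependency, tritonserver[test] for pytest, or tritonserver[all] for both, whether the wheel is installed from a local build or from an index.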
@@ -0,0 +1,254 @@
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import asyncio
import json
import os
import queue
import time
import unittest

import numpy
import pytest
import tritonserver

try:
    import cupy
except ImportError:
    cupy = None

try:
    import torch
except ImportError:
    torch = None

module_directory = os.path.split(os.path.abspath(__file__))[0]
test_model_directory = os.path.abspath(
    os.path.join(module_directory, "test_api_models")
)

# Options shared by the tests below: point the in-process server at the
# test model repository and require models to be loaded explicitly.
server_options = tritonserver.Options(
    server_id="TestServer",
    model_repository=test_model_directory,
    log_verbose=0,
    log_error=True,
    exit_on_error=True,
    strict_model_config=False,
    model_control_mode=tritonserver.ModelControlMode.EXPLICIT,
)
class ModelTests(unittest.TestCase):
    def test_create_request(self):
        server = tritonserver.Server(server_options).start(wait_until_ready=True)

        # Requests can be created either from a loaded model handle or by
        # constructing an InferenceRequest directly.
        request = server.models()["test"].create_request()

        request = tritonserver.InferenceRequest(server.model("test"))
class AllocatorTests(unittest.TestCase):
    def test_allocate_on_cpu_and_reshape(self):
        allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.CPU]

        memory_buffer = allocator.allocate(
            memory_type=tritonserver.MemoryType.CPU, memory_type_id=0, size=200
        )

        cpu_array = memory_buffer.owner

        self.assertEqual(memory_buffer.size, 200)

        fp32_size = int(memory_buffer.size / 4)

        tensor = tritonserver.Tensor(
            tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer
        )

        cpu_fp32_array = numpy.from_dlpack(tensor)
        self.assertEqual(cpu_array.ctypes.data, cpu_fp32_array.ctypes.data)
        self.assertEqual(cpu_fp32_array.dtype, numpy.float32)
        self.assertEqual(cpu_fp32_array.nbytes, 200)

    @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
    @pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed")
    def test_allocate_on_gpu_and_reshape(self):
        if cupy is None:
            return

        allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]

        memory_buffer = allocator.allocate(
            memory_type=tritonserver.MemoryType.GPU, memory_type_id=0, size=200
        )

        gpu_array = memory_buffer.owner

        # Replace the allocated buffer with one created from an existing
        # cupy array via DLPack.
        gpu_array = cupy.empty([10, 20], dtype=cupy.uint8)
        memory_buffer = tritonserver.MemoryBuffer.from_dlpack(gpu_array)

        self.assertEqual(memory_buffer.size, 200)

        fp32_size = int(memory_buffer.size / 4)

        tensor = tritonserver.Tensor(
            tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer
        )

        gpu_fp32_array = cupy.from_dlpack(tensor)
        self.assertEqual(
            gpu_array.__cuda_array_interface__["data"][0],
            gpu_fp32_array.__cuda_array_interface__["data"][0],
        )
        self.assertEqual(gpu_fp32_array.dtype, cupy.float32)
        self.assertEqual(gpu_fp32_array.nbytes, 200)

        torch_fp32_tensor = torch.from_dlpack(tensor)
        self.assertEqual(torch_fp32_tensor.dtype, torch.float32)
        self.assertEqual(
            torch_fp32_tensor.data_ptr(), gpu_array.__cuda_array_interface__["data"][0]
        )
        self.assertEqual(torch_fp32_tensor.nbytes, 200)
class TensorTests(unittest.TestCase):
    @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
    def test_cpu_to_gpu(self):
        if cupy is None:
            return
        cpu_array = numpy.random.rand(1, 3, 100, 100).astype(numpy.float32)
        cpu_tensor = tritonserver.Tensor.from_dlpack(cpu_array)
        gpu_tensor = cpu_tensor.to_device("gpu:0")
        gpu_array = cupy.from_dlpack(gpu_tensor)

        self.assertEqual(gpu_array.device, cupy.cuda.Device(0))

        numpy.testing.assert_array_equal(cpu_array, gpu_array.get())

        memory_buffer = tritonserver.MemoryBuffer.from_dlpack(gpu_array)

        self.assertEqual(
            gpu_array.__cuda_array_interface__["data"][0], memory_buffer.data_ptr
        )

    @pytest.mark.skipif(
        torch is None, reason="Skipping gpu memory, torch not installed"
    )
    @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
    def test_gpu_tensor_from_dl_pack(self):
        if cupy is None or torch is None:
            return
        cupy_array = cupy.ones([100]).astype(cupy.float64)
        tensor = tritonserver.Tensor.from_dlpack(cupy_array)
        torch_tensor = torch.from_dlpack(cupy_array)

        self.assertEqual(torch_tensor.data_ptr(), tensor.data_ptr)
        self.assertEqual(torch_tensor.nbytes, tensor.size)
        self.assertEqual(torch_tensor.__dlpack_device__(), tensor.__dlpack_device__())

    @pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed")
    def test_tensor_from_numpy(self):
        cpu_array = numpy.random.rand(1, 3, 100, 100).astype(numpy.float32)
        tensor = tritonserver.Tensor.from_dlpack(cpu_array)
        torch_tensor = torch.from_dlpack(tensor)
        numpy.testing.assert_array_equal(torch_tensor.numpy(), cpu_array)
        self.assertEqual(torch_tensor.data_ptr(), cpu_array.ctypes.data)
class ServerTests(unittest.TestCase):
    def test_not_started(self):
        server = tritonserver.Server()
        with self.assertRaises(tritonserver.InvalidArgumentError):
            server.ready()

    def test_invalid_option_type(self):
        server = tritonserver.Server(server_id=1)
        with self.assertRaises(TypeError):
            server.start()

        server = tritonserver.Server(model_repository=1)
        with self.assertRaises(TypeError):
            server.start()

    def test_invalid_repo(self):
        with self.assertRaises(tritonserver.InternalError):
            tritonserver.Server(model_repository="foo").start()

    def test_ready(self):
        server = tritonserver.Server(server_options).start()
        self.assertTrue(server.ready())
class InferenceTests(unittest.TestCase):
    def test_basic_inference(self):
        server = tritonserver.Server(server_options).start(wait_until_ready=True)

        self.assertTrue(server.ready())

        # Load the Python-backend "test" model with an inline configuration.
        server.load(
            "test",
            {
                "config": json.dumps(
                    {
                        "backend": "python",
                        "parameters": {"decoupled": {"string_value": "False"}},
                    }
                )
            },
        )

        fp16_input = numpy.random.rand(1, 100).astype(dtype=numpy.float16)

        for response in server.model("test").infer(
            inputs={"fp16_input": fp16_input},
            output_memory_type="cpu",
            raise_on_error=True,
        ):
            fp16_output = numpy.from_dlpack(response.outputs["fp16_output"])
            numpy.testing.assert_array_equal(fp16_input, fp16_output)

        for response in server.model("test").infer(
            inputs={"fp16_input": fp16_input},
            output_memory_type="gpu",
        ):
            fp16_output = cupy.from_dlpack(response.outputs["fp16_output"])
            self.assertEqual(fp16_input[0][0], fp16_output[0][0])

        for response in server.model("test").infer(
            inputs={"string_input": [["hello"]]},
            output_memory_type="gpu",
        ):
            text_output = response.outputs["string_output"].to_string_array()
            self.assertEqual(text_output[0][0], "hello")

        for response in server.model("test").infer(
            inputs={"string_input": tritonserver.Tensor.from_string_array([["hello"]])},
            output_memory_type="gpu",
        ):
            text_output = response.outputs["string_output"].to_string_array()
            self.assertEqual(text_output[0][0], "hello")
        server.stop()
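The inference tests above load a Python-backend model named "test" that echoes its inputs (fp16_input to fp16_output, string_input to string_output). The model's own source is not part of the portion of the diff shown here; the following is only a rough sketch of what such an echo model could look like with the Python backend's pb_utils API, with the tensor names assumed from the assertions above.

# Hypothetical sketch of an echo model for the Python backend; the actual
# "test" model shipped with this commit may differ (its source is not shown here).
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for request in requests:
            output_tensors = []
            # Copy each provided input to the correspondingly named output
            # (fp16_input -> fp16_output, string_input -> string_output).
            for name in ("fp16_input", "string_input"):
                tensor = pb_utils.get_input_tensor_by_name(request, name)
                if tensor is None:
                    continue
                output_name = name.replace("_input", "_output")
                output_tensors.append(pb_utils.Tensor(output_name, tensor.as_numpy()))
            responses.append(pb_utils.InferenceResponse(output_tensors=output_tensors))
        return responses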