From 8cced02512b407a800e65474707e5cdd7d479680 Mon Sep 17 00:00:00 2001 From: pranavm Date: Fri, 22 Nov 2024 15:55:21 -0800 Subject: [PATCH] Parametrizes tests instead of running loops --- python/test/test_api.py | 63 +++++++-- python/test/test_binding.py | 267 ++++++++++++++++++++++++------------ 2 files changed, 231 insertions(+), 99 deletions(-) diff --git a/python/test/test_api.py b/python/test/test_api.py index 9de262d4c..96179438b 100644 --- a/python/test/test_api.py +++ b/python/test/test_api.py @@ -120,7 +120,10 @@ def test_memory_fallback_to_cpu(self, server_options): for response in server.model("test").infer( inputs={"fp16_input": fp16_input}, ): - assert response.outputs["fp16_output"].memory_type == tritonserver.MemoryType.CPU + assert ( + response.outputs["fp16_output"].memory_type + == tritonserver.MemoryType.CPU + ) fp16_output = numpy.from_dlpack(response.outputs["fp16_output"]) assert fp16_input[0][0] == fp16_output[0][0] @@ -145,7 +148,9 @@ def test_memory_allocator_exception(self, server_options): with pytest.raises(tritonserver.InternalError): for response in server.model("test").infer( - inputs={"string_input": tritonserver.Tensor.from_string_array([["hello"]])}, + inputs={ + "string_input": tritonserver.Tensor.from_string_array([["hello"]]) + }, output_memory_type="gpu", output_memory_allocator=TestAllocators.MockMemoryAllocator(), ): @@ -169,7 +174,9 @@ def test_unsupported_memory_type(self, server_options): ) if tritonserver.MemoryType.GPU in tritonserver.default_memory_allocators: - allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] + allocator = tritonserver.default_memory_allocators[ + tritonserver.MemoryType.GPU + ] del tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] else: @@ -177,19 +184,25 @@ def test_unsupported_memory_type(self, server_options): with pytest.raises(tritonserver.InvalidArgumentError): for response in server.model("test").infer( - inputs={"string_input": tritonserver.Tensor.from_string_array([["hello"]])}, + inputs={ + "string_input": tritonserver.Tensor.from_string_array([["hello"]]) + }, output_memory_type="gpu", ): pass if allocator is not None: - tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] = allocator + tritonserver.default_memory_allocators[ + tritonserver.MemoryType.GPU + ] = allocator @pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed") def test_allocate_on_cpu_and_reshape(self): allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.CPU] - memory_buffer = allocator.allocate(memory_type=tritonserver.MemoryType.CPU, memory_type_id=0, size=200) + memory_buffer = allocator.allocate( + memory_type=tritonserver.MemoryType.CPU, memory_type_id=0, size=200 + ) cpu_array = memory_buffer.owner @@ -197,7 +210,9 @@ def test_allocate_on_cpu_and_reshape(self): fp32_size = int(memory_buffer.size / 4) - tensor = tritonserver.Tensor(tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer) + tensor = tritonserver.Tensor( + tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer + ) cpu_fp32_array = numpy.from_dlpack(tensor) assert cpu_array.ctypes.data == cpu_fp32_array.ctypes.data @@ -209,7 +224,9 @@ def test_allocate_on_cpu_and_reshape(self): def test_allocate_on_gpu_and_reshape(self): allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] - memory_buffer = allocator.allocate(memory_type=tritonserver.MemoryType.GPU, memory_type_id=0, size=200) + memory_buffer = 
allocator.allocate( + memory_type=tritonserver.MemoryType.GPU, memory_type_id=0, size=200 + ) gpu_array = memory_buffer.owner @@ -220,17 +237,25 @@ def test_allocate_on_gpu_and_reshape(self): fp32_size = int(memory_buffer.size / 4) - tensor = tritonserver.Tensor(tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer) + tensor = tritonserver.Tensor( + tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer + ) gpu_fp32_array = cupy.from_dlpack(tensor) - assert gpu_array.__cuda_array_interface__["data"][0] == gpu_fp32_array.__cuda_array_interface__["data"][0] + assert ( + gpu_array.__cuda_array_interface__["data"][0] + == gpu_fp32_array.__cuda_array_interface__["data"][0] + ) assert gpu_fp32_array.dtype == cupy.float32 assert gpu_fp32_array.nbytes == 200 torch_fp32_tensor = torch.from_dlpack(tensor) assert torch_fp32_tensor.dtype == torch.float32 - assert torch_fp32_tensor.data_ptr() == gpu_array.__cuda_array_interface__["data"][0] + assert ( + torch_fp32_tensor.data_ptr() + == gpu_array.__cuda_array_interface__["data"][0] + ) assert torch_fp32_tensor.nbytes == 200 @@ -250,7 +275,9 @@ def test_cpu_to_gpu(self): assert gpu_array.__cuda_array_interface__["data"][0] == memory_buffer.data_ptr - @pytest.mark.skipif(torch is None, reason="Skipping gpu memory, torch not installed") + @pytest.mark.skipif( + torch is None, reason="Skipping gpu memory, torch not installed" + ) @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed") def test_gpu_tensor_from_dl_pack(self): cupy_array = cupy.ones([100]).astype(cupy.float64) @@ -307,7 +334,9 @@ def test_stop(self, server_options): "parameters": {"decoupled": {"string_value": "False"}}, # Keep instance count low for fast startup/cleanup. # Alternatively can use KIND_CPU here, but keeping gpus/count explicit. 
- "instance_group": [{"kind": "KIND_GPU", "gpus": [0], "count": 1}], + "instance_group": [ + {"kind": "KIND_GPU", "gpus": [0], "count": 1} + ], } ) }, @@ -414,7 +443,9 @@ def test_basic_inference(self, server_options): raise_on_error=True, ): for input_name, input_value in inputs.items(): - output_value = numpy.from_dlpack(response.outputs[input_name.replace("input", "output")]) + output_value = numpy.from_dlpack( + response.outputs[input_name.replace("input", "output")] + ) numpy.testing.assert_array_equal(input_value, output_value) def test_parameters(self, server_options): @@ -450,7 +481,9 @@ def test_parameters(self, server_options): ): fp16_output = numpy.from_dlpack(response.outputs["fp16_output"]) numpy.testing.assert_array_equal(fp16_input, fp16_output) - output_parameters = json.loads(response.outputs["output_parameters"].to_string_array()[0]) + output_parameters = json.loads( + response.outputs["output_parameters"].to_string_array()[0] + ) assert input_parameters == output_parameters with pytest.raises(tritonserver.InvalidArgumentError): diff --git a/python/test/test_binding.py b/python/test/test_binding.py index 3aeb6e11a..143e55f50 100644 --- a/python/test/test_binding.py +++ b/python/test/test_binding.py @@ -39,7 +39,9 @@ # Callback functions used in inference pipeline # 'user_object' is a per-request counter of how many times the # callback is invoked -def g_alloc_fn(allocator, tensor_name, byte_size, memory_type, memory_type_id, user_object): +def g_alloc_fn( + allocator, tensor_name, byte_size, memory_type, memory_type_id, user_object +): if "alloc" not in user_object: user_object["alloc"] = 0 user_object["alloc"] += 1 @@ -47,10 +49,14 @@ def g_alloc_fn(allocator, tensor_name, byte_size, memory_type, memory_type_id, u return (buffer.ctypes.data, buffer, triton_bindings.TRITONSERVER_MemoryType.CPU, 0) -def g_release_fn(allocator, buffer, buffer_user_object, byte_size, memory_type, memory_type_id): +def g_release_fn( + allocator, buffer, buffer_user_object, byte_size, memory_type, memory_type_id +): # No-op, buffer ('buffer_user_object') will be garbage collected # only sanity check that the objects are expected - if (not isinstance(buffer_user_object, numpy.ndarray)) or (buffer_user_object.ctypes.data != buffer): + if (not isinstance(buffer_user_object, numpy.ndarray)) or ( + buffer_user_object.ctypes.data != buffer + ): raise Exception("Misaligned parameters in allocator release callback") pass @@ -62,14 +68,18 @@ def g_start_fn(allocator, user_object): pass -def g_query_fn(allocator, user_object, tensor_name, byte_size, memory_type, memory_type_id): +def g_query_fn( + allocator, user_object, tensor_name, byte_size, memory_type, memory_type_id +): if "query" not in user_object: user_object["query"] = 0 user_object["query"] += 1 return (triton_bindings.TRITONSERVER_MemoryType.CPU, 0) -def g_buffer_fn(allocator, tensor_name, buffer_attribute, user_object, buffer_user_object): +def g_buffer_fn( + allocator, tensor_name, buffer_attribute, user_object, buffer_user_object +): if "buffer" not in user_object: user_object["buffer"] = 0 user_object["buffer"] += 1 @@ -236,9 +246,13 @@ def _to_pyobject(self, triton_message): # prepare a model repository with "addsub" model def _create_model_repository(self): - os.makedirs(os.path.join(self._test_model_repo, self._model_name, self._version)) + os.makedirs( + os.path.join(self._test_model_repo, self._model_name, self._version) + ) with open( - os.path.join(self._test_model_repo, self._model_name, self._version, self._file_name), + 
os.path.join( + self._test_model_repo, self._model_name, self._version, self._file_name + ), "wb", ) as f: f.write(g_python_addsub) @@ -251,7 +265,9 @@ def _start_polling_server(self): options = triton_bindings.TRITONSERVER_ServerOptions() options.set_model_repository_path(self._test_model_repo) - options.set_model_control_mode(triton_bindings.TRITONSERVER_ModelControlMode.POLL) + options.set_model_control_mode( + triton_bindings.TRITONSERVER_ModelControlMode.POLL + ) # enable "auto-complete" to skip providing config.pbtxt options.set_strict_model_config(False) options.set_server_id("testing_server") @@ -260,17 +276,23 @@ def _start_polling_server(self): return triton_bindings.TRITONSERVER_Server(options) def _prepare_inference_request(self, server): - allocator = triton_bindings.TRITONSERVER_ResponseAllocator(g_alloc_fn, g_release_fn, g_start_fn) + allocator = triton_bindings.TRITONSERVER_ResponseAllocator( + g_alloc_fn, g_release_fn, g_start_fn + ) allocator.set_buffer_attributes_function(g_buffer_fn) allocator.set_query_function(g_query_fn) request_counter = queue.Queue() response_queue = queue.Queue() allocator_counter = {} - request = triton_bindings.TRITONSERVER_InferenceRequest(server, self._model_name, -1) + request = triton_bindings.TRITONSERVER_InferenceRequest( + server, self._model_name, -1 + ) request.id = "req_0" request.set_release_callback(g_request_fn, request_counter) - request.set_response_callback(allocator, allocator_counter, g_response_fn, response_queue) + request.set_response_callback( + allocator, allocator_counter, g_response_fn, response_queue + ) input = numpy.ones([4], dtype=numpy.float32) input_buffer = input.ctypes.data @@ -279,15 +301,20 @@ def _prepare_inference_request(self, server): ba.memory_type_id = 0 ba.byte_size = input.itemsize * input.size - request.add_input("INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape) - request.add_input("INPUT1", triton_bindings.TRITONSERVER_DataType.FP32, input.shape) + request.add_input( + "INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape + ) + request.add_input( + "INPUT1", triton_bindings.TRITONSERVER_DataType.FP32, input.shape + ) request.append_input_data_with_buffer_attributes("INPUT0", input_buffer, ba) request.append_input_data_with_buffer_attributes("INPUT1", input_buffer, ba) return request, allocator, response_queue, request_counter - def test_exceptions(self): - ex_list = [ + @pytest.mark.parametrize( + "ex_type", + [ triton_bindings.UnknownError, triton_bindings.InternalError, triton_bindings.NotFoundError, @@ -295,13 +322,15 @@ def test_exceptions(self): triton_bindings.UnavailableError, triton_bindings.UnsupportedError, triton_bindings.AlreadyExistsError, - ] - for ex_type in ex_list: - with pytest.raises(ex_type, match="Error message") as ctx: - raise ex_type("Error message") - - def test_data_type(self): - t_list = [ + ], + ) + def test_exceptions(self, ex_type): + with pytest.raises(ex_type, match="Error message") as ctx: + raise ex_type("Error message") + + @pytest.mark.parametrize( + "t, t_str, t_size", + [ (triton_bindings.TRITONSERVER_DataType.INVALID, "", 0), (triton_bindings.TRITONSERVER_DataType.BOOL, "BOOL", 1), (triton_bindings.TRITONSERVER_DataType.UINT8, "UINT8", 1), @@ -317,31 +346,35 @@ def test_data_type(self): (triton_bindings.TRITONSERVER_DataType.FP64, "FP64", 8), (triton_bindings.TRITONSERVER_DataType.BYTES, "BYTES", 0), (triton_bindings.TRITONSERVER_DataType.BF16, "BF16", 2), - ] - - for t, t_str, t_size in t_list: - assert 
triton_bindings.TRITONSERVER_DataTypeString(t) == t_str - assert triton_bindings.TRITONSERVER_StringToDataType(t_str) == t - assert triton_bindings.TRITONSERVER_DataTypeByteSize(t) == t_size - - def test_memory_type(self): - t_list = [ + ], + ) + def test_data_type(self, t, t_str, t_size): + assert triton_bindings.TRITONSERVER_DataTypeString(t) == t_str + assert triton_bindings.TRITONSERVER_StringToDataType(t_str) == t + assert triton_bindings.TRITONSERVER_DataTypeByteSize(t) == t_size + + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_MemoryType.CPU, "CPU"), (triton_bindings.TRITONSERVER_MemoryType.CPU_PINNED, "CPU_PINNED"), (triton_bindings.TRITONSERVER_MemoryType.GPU, "GPU"), - ] - for t, t_str in t_list: - assert triton_bindings.TRITONSERVER_MemoryTypeString(t) == t_str - - def test_parameter_type(self): - t_list = [ + ], + ) + def test_memory_type(self, t, t_str): + assert triton_bindings.TRITONSERVER_MemoryTypeString(t) == t_str + + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_ParameterType.STRING, "STRING"), (triton_bindings.TRITONSERVER_ParameterType.INT, "INT"), (triton_bindings.TRITONSERVER_ParameterType.BOOL, "BOOL"), (triton_bindings.TRITONSERVER_ParameterType.BYTES, "BYTES"), - ] - for t, t_str in t_list: - assert triton_bindings.TRITONSERVER_ParameterTypeString(t) == t_str + ], + ) + def test_parameter_type(self, t, t_str): + assert triton_bindings.TRITONSERVER_ParameterTypeString(t) == t_str def test_parameter(self): # C API doesn't provide additional API for parameter, can only test @@ -358,15 +391,17 @@ def test_parameter(self): del bytes_param gc.collect() - def test_instance_kind(self): - t_list = [ + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_InstanceGroupKind.AUTO, "AUTO"), (triton_bindings.TRITONSERVER_InstanceGroupKind.CPU, "CPU"), (triton_bindings.TRITONSERVER_InstanceGroupKind.GPU, "GPU"), (triton_bindings.TRITONSERVER_InstanceGroupKind.MODEL, "MODEL"), - ] - for t, t_str in t_list: - assert triton_bindings.TRITONSERVER_InstanceGroupKindString(t) == t_str + ], + ) + def test_instance_kind(self, t, t_str): + assert triton_bindings.TRITONSERVER_InstanceGroupKindString(t) == t_str def test_log(self): # This test depends on 'TRITONSERVER_ServerOptions' operates properly @@ -471,12 +506,16 @@ def test_buffer_attributes(self): handle_byte_size = 64 mock_handle = array("b", [i for i in range(handle_byte_size)]) buffer_attributes.cuda_ipc_handle = mock_handle.buffer_info()[0] - res_arr = (ctypes.c_char * handle_byte_size).from_address(buffer_attributes.cuda_ipc_handle) + res_arr = (ctypes.c_char * handle_byte_size).from_address( + buffer_attributes.cuda_ipc_handle + ) for i in range(handle_byte_size): assert int.from_bytes(res_arr[i], "big") == mock_handle[i] def test_allocator(self): - def alloc_fn(allocator, tensor_name, byte_size, memory_type, memory_type_id, user_object): + def alloc_fn( + allocator, tensor_name, byte_size, memory_type, memory_type_id, user_object + ): return (123, None, triton_bindings.TRITONSERVER_MemoryType.GPU, 1) def release_fn( @@ -492,10 +531,14 @@ def release_fn( def start_fn(allocator, user_object): pass - def query_fn(allocator, user_object, tensor_name, byte_size, memory_type, memory_type_id): + def query_fn( + allocator, user_object, tensor_name, byte_size, memory_type, memory_type_id + ): return (triton_bindings.TRITONSERVER_MemoryType.GPU, 1) - def buffer_fn(allocator, tensor_name, buffer_attribute, user_object, buffer_user_object): + def buffer_fn( + 
allocator, tensor_name, buffer_attribute, user_object, buffer_user_object + ): return buffer_attribute # allocator without start_fn @@ -504,7 +547,9 @@ def buffer_fn(allocator, tensor_name, buffer_attribute, user_object, buffer_user gc.collect() # allocator with start_fn - allocator = triton_bindings.TRITONSERVER_ResponseAllocator(alloc_fn, release_fn, start_fn) + allocator = triton_bindings.TRITONSERVER_ResponseAllocator( + alloc_fn, release_fn, start_fn + ) allocator.set_buffer_attributes_function(buffer_fn) allocator.set_query_function(query_fn) @@ -521,30 +566,45 @@ def test_metrics(self): # a model repository is proper repository options = triton_bindings.TRITONSERVER_ServerOptions() options.set_model_repository_path(self._test_model_repo) - options.set_model_control_mode(triton_bindings.TRITONSERVER_ModelControlMode.EXPLICIT) + options.set_model_control_mode( + triton_bindings.TRITONSERVER_ModelControlMode.EXPLICIT + ) server = triton_bindings.TRITONSERVER_Server(options) metrics = server.metrics() # Check one of the metrics is reported - assert "nv_cpu_memory_used_bytes" in metrics.formatted(triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS) + assert "nv_cpu_memory_used_bytes" in metrics.formatted( + triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS + ) - def test_trace_enum(self): - t_list = [ + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_InferenceTraceLevel.DISABLED, "DISABLED"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.MIN, "MIN"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.MAX, "MAX"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS, "TIMESTAMPS"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS, "TENSORS"), - ] - for t, t_str in t_list: - assert triton_bindings.TRITONSERVER_InferenceTraceLevelString(t) == t_str + ], + ) + def test_trace_enum(self, t, t_str): + assert triton_bindings.TRITONSERVER_InferenceTraceLevelString(t) == t_str + + def test_trace_bitwise_operations(self): # bit-wise operation level = int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS) | int( triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS ) - assert level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS) != 0 - assert level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS) != 0 + assert ( + level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS) + != 0 + ) + assert ( + level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS) != 0 + ) - t_list = [ + @pytest.mark.parametrize( + "t, t_str", + [ ( triton_bindings.TRITONSERVER_InferenceTraceActivity.REQUEST_START, "REQUEST_START", @@ -585,9 +645,10 @@ def test_trace_enum(self): triton_bindings.TRITONSERVER_InferenceTraceActivity.TENSOR_BACKEND_OUTPUT, "TENSOR_BACKEND_OUTPUT", ), - ] - for t, t_str in t_list: - assert triton_bindings.TRITONSERVER_InferenceTraceActivityString(t) == t_str + ], + ) + def test_trace_activity_enum(self, t, t_str): + assert triton_bindings.TRITONSERVER_InferenceTraceActivityString(t) == t_str def test_trace(self): # This test depends on 'test_infer_async' test to capture @@ -687,13 +748,17 @@ def test_options(self): options.set_model_load_thread_count(2) options.set_model_namespacing(True) # Only support Kind GPU for now - options.set_model_load_device_limit(triton_bindings.TRITONSERVER_InstanceGroupKind.GPU, 0, 0.5) + options.set_model_load_device_limit( + triton_bindings.TRITONSERVER_InstanceGroupKind.GPU, 0, 0.5 + ) for k in [ 
triton_bindings.TRITONSERVER_InstanceGroupKind.AUTO, triton_bindings.TRITONSERVER_InstanceGroupKind.CPU, triton_bindings.TRITONSERVER_InstanceGroupKind.MODEL, ]: - with pytest.raises(triton_bindings.TritonError, match="not supported") as context: + with pytest.raises( + triton_bindings.TritonError, match="not supported" + ) as context: options.set_model_load_device_limit(k, 0, 0) # Backend @@ -714,7 +779,9 @@ def test_options(self): options.set_cuda_memory_pool_byte_size(0, 2048) # cache options.set_response_cache_byte_size(4096) - options.set_cache_config("cache_name", json.dumps({"config_0": "value_0", "config_1": "value_1"})) + options.set_cache_config( + "cache_name", json.dumps({"config_0": "value_0", "config_1": "value_1"}) + ) options.set_cache_directory("cache_dir_0") options.set_cache_directory("cache_dir_1") # Log @@ -746,7 +813,9 @@ def test_options(self): options.set_metrics_config("metrics_group", "setting", "value") # Misc.. - with pytest.raises(triton_bindings.TritonError, match="Unsupported host policy setting") as context: + with pytest.raises( + triton_bindings.TritonError, match="Unsupported host policy setting" + ) as context: options.set_host_policy("policy_name", "setting", "value") options.set_repo_agent_directory("repo_agent_dir_0") @@ -766,13 +835,19 @@ def test_server(self): int(triton_bindings.TRITONSERVER_ModelBatchFlag.UNKNOWN), 0, ) - assert server.model_batch_properties(self._model_name, -1) == expected_batch_properties + assert ( + server.model_batch_properties(self._model_name, -1) + == expected_batch_properties + ) # model_transaction_properties expected_transaction_policy = ( int(triton_bindings.TRITONSERVER_ModelTxnPropertyFlag.ONE_TO_ONE), 0, ) - assert server.model_transaction_properties(self._model_name, -1) == expected_transaction_policy + assert ( + server.model_transaction_properties(self._model_name, -1) + == expected_transaction_policy + ) # metadata server_meta_data = self._to_pyobject(server.metadata()) assert "name" in server_meta_data @@ -782,7 +857,9 @@ def test_server(self): assert "name" in model_meta_data assert model_meta_data["name"] == self._model_name # model_statistics - model_statistics = self._to_pyobject(server.model_statistics(self._model_name, -1)) + model_statistics = self._to_pyobject( + server.model_statistics(self._model_name, -1) + ) assert "model_stats" in model_statistics # model_config model_config = self._to_pyobject(server.model_config(self._model_name, -1, 1)) @@ -799,12 +876,14 @@ def test_request(self): server = self._start_polling_server() with pytest.raises(triton_bindings.NotFoundError, match="unknown model") as ctx: - _ = triton_bindings.TRITONSERVER_InferenceRequest(server, "not_existing_model", -1) + _ = triton_bindings.TRITONSERVER_InferenceRequest( + server, "not_existing_model", -1 + ) expected_request_id = "request" - expected_flags = int(triton_bindings.TRITONSERVER_RequestFlag.SEQUENCE_START) | int( - triton_bindings.TRITONSERVER_RequestFlag.SEQUENCE_END - ) + expected_flags = int( + triton_bindings.TRITONSERVER_RequestFlag.SEQUENCE_START + ) | int(triton_bindings.TRITONSERVER_RequestFlag.SEQUENCE_END) expected_correlation_id = 2 expected_correlation_id_string = "123" expected_priority = 19 @@ -848,7 +927,9 @@ def test_request(self): ba.memory_type_id = 0 ba.byte_size = input.itemsize * input.size - request.add_input("INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape) + request.add_input( + "INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape + ) with 
pytest.raises(triton_bindings.TritonError): request.remove_input("INPUT2") # raw input assumes single input @@ -870,7 +951,9 @@ def test_request(self): with pytest.raises(triton_bindings.TritonError): request.remove_all_input_data("INPUT0") # Add back input - request.add_input("INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape) + request.add_input( + "INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape + ) request.append_input_data(*aid_args) request.remove_all_input_data("INPUT0") @@ -883,17 +966,23 @@ def test_infer_async(self): server = self._start_polling_server() # prepare for infer - allocator = triton_bindings.TRITONSERVER_ResponseAllocator(g_alloc_fn, g_release_fn, g_start_fn) + allocator = triton_bindings.TRITONSERVER_ResponseAllocator( + g_alloc_fn, g_release_fn, g_start_fn + ) allocator.set_buffer_attributes_function(g_buffer_fn) allocator.set_query_function(g_query_fn) request_counter = queue.Queue() response_queue = queue.Queue() allocator_counter = {} - request = triton_bindings.TRITONSERVER_InferenceRequest(server, self._model_name, -1) + request = triton_bindings.TRITONSERVER_InferenceRequest( + server, self._model_name, -1 + ) request.id = "req_0" request.set_release_callback(g_request_fn, request_counter) - request.set_response_callback(allocator, allocator_counter, g_response_fn, response_queue) + request.set_response_callback( + allocator, allocator_counter, g_response_fn, response_queue + ) input = numpy.ones([4], dtype=numpy.float32) input_buffer = input.ctypes.data @@ -902,8 +991,12 @@ def test_infer_async(self): ba.memory_type_id = 0 ba.byte_size = input.itemsize * input.size - request.add_input("INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape) - request.add_input("INPUT1", triton_bindings.TRITONSERVER_DataType.FP32, input.shape) + request.add_input( + "INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape + ) + request.add_input( + "INPUT1", triton_bindings.TRITONSERVER_DataType.FP32, input.shape + ) request.append_input_data_with_buffer_attributes("INPUT0", input_buffer, ba) request.append_input_data_with_buffer_attributes("INPUT1", input_buffer, ba) @@ -973,7 +1066,9 @@ def test_server_explicit(self): # explicit : load with params options = triton_bindings.TRITONSERVER_ServerOptions() options.set_model_repository_path(self._test_model_repo) - options.set_model_control_mode(triton_bindings.TRITONSERVER_ModelControlMode.EXPLICIT) + options.set_model_control_mode( + triton_bindings.TRITONSERVER_ModelControlMode.EXPLICIT + ) options.set_strict_model_config(False) server = triton_bindings.TRITONSERVER_Server(options) load_file_params = [ @@ -1005,7 +1100,9 @@ def test_server_explicit(self): def test_custom_metric(self): options = triton_bindings.TRITONSERVER_ServerOptions() options.set_model_repository_path(self._test_model_repo) - options.set_model_control_mode(triton_bindings.TRITONSERVER_ModelControlMode.EXPLICIT) + options.set_model_control_mode( + triton_bindings.TRITONSERVER_ModelControlMode.EXPLICIT + ) server = triton_bindings.TRITONSERVER_Server(options) # create custom metric @@ -1024,4 +1121,6 @@ def test_custom_metric(self): # Check custom metric is reported metrics = server.metrics() - assert "custom_metric_familiy" in metrics.formatted(triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS) + assert "custom_metric_familiy" in metrics.formatted( + triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS + )
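
Note on the refactor pattern (illustrative only, not part of the patch): the test_binding.py changes above replace in-test loops over (value, expected) tuples with @pytest.mark.parametrize, so each case is collected and reported as its own test instead of the loop stopping at the first failing assertion. A minimal, self-contained sketch of that pattern, using a hypothetical square() helper rather than the Triton bindings, looks like this:

    import pytest


    def square(x):
        # Stand-in function under test; not part of the Triton bindings.
        return x * x


    # Before: one test that loops over cases and stops at the first failure.
    def test_square_loop():
        for value, expected in [(1, 1), (2, 4), (3, 9)]:
            assert square(value) == expected


    # After: each case becomes its own parametrized test, reported separately.
    @pytest.mark.parametrize("value, expected", [(1, 1), (2, 4), (3, 9)])
    def test_square_parametrized(value, expected):
        assert square(value) == expected

With the loop form, pytest reports a single test and stops at the first mismatch; with the parametrized form it reports one test per case (e.g. test_square_parametrized[2-4]) and runs the remaining cases even if an earlier one fails, which is the behavior change this patch applies to the enum, exception, and string-conversion tests.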