From 8cced02512b407a800e65474707e5cdd7d479680 Mon Sep 17 00:00:00 2001 From: pranavm Date: Fri, 22 Nov 2024 15:55:21 -0800 Subject: [PATCH] Parametrizes tests instead of running loops --- python/test/test_api.py | 63 +++++++-- python/test/test_binding.py | 267 ++++++++++++++++++++++++------------ 2 files changed, 231 insertions(+), 99 deletions(-) diff --git a/python/test/test_api.py b/python/test/test_api.py index 9de262d4c..96179438b 100644 --- a/python/test/test_api.py +++ b/python/test/test_api.py @@ -120,7 +120,10 @@ def test_memory_fallback_to_cpu(self, server_options): for response in server.model("test").infer( inputs={"fp16_input": fp16_input}, ): - assert response.outputs["fp16_output"].memory_type == tritonserver.MemoryType.CPU + assert ( + response.outputs["fp16_output"].memory_type + == tritonserver.MemoryType.CPU + ) fp16_output = numpy.from_dlpack(response.outputs["fp16_output"]) assert fp16_input[0][0] == fp16_output[0][0] @@ -145,7 +148,9 @@ def test_memory_allocator_exception(self, server_options): with pytest.raises(tritonserver.InternalError): for response in server.model("test").infer( - inputs={"string_input": tritonserver.Tensor.from_string_array([["hello"]])}, + inputs={ + "string_input": tritonserver.Tensor.from_string_array([["hello"]]) + }, output_memory_type="gpu", output_memory_allocator=TestAllocators.MockMemoryAllocator(), ): @@ -169,7 +174,9 @@ def test_unsupported_memory_type(self, server_options): ) if tritonserver.MemoryType.GPU in tritonserver.default_memory_allocators: - allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] + allocator = tritonserver.default_memory_allocators[ + tritonserver.MemoryType.GPU + ] del tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] else: @@ -177,19 +184,25 @@ def test_unsupported_memory_type(self, server_options): with pytest.raises(tritonserver.InvalidArgumentError): for response in server.model("test").infer( - inputs={"string_input": tritonserver.Tensor.from_string_array([["hello"]])}, + inputs={ + "string_input": tritonserver.Tensor.from_string_array([["hello"]]) + }, output_memory_type="gpu", ): pass if allocator is not None: - tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] = allocator + tritonserver.default_memory_allocators[ + tritonserver.MemoryType.GPU + ] = allocator @pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed") def test_allocate_on_cpu_and_reshape(self): allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.CPU] - memory_buffer = allocator.allocate(memory_type=tritonserver.MemoryType.CPU, memory_type_id=0, size=200) + memory_buffer = allocator.allocate( + memory_type=tritonserver.MemoryType.CPU, memory_type_id=0, size=200 + ) cpu_array = memory_buffer.owner @@ -197,7 +210,9 @@ def test_allocate_on_cpu_and_reshape(self): fp32_size = int(memory_buffer.size / 4) - tensor = tritonserver.Tensor(tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer) + tensor = tritonserver.Tensor( + tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer + ) cpu_fp32_array = numpy.from_dlpack(tensor) assert cpu_array.ctypes.data == cpu_fp32_array.ctypes.data @@ -209,7 +224,9 @@ def test_allocate_on_cpu_and_reshape(self): def test_allocate_on_gpu_and_reshape(self): allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] - memory_buffer = allocator.allocate(memory_type=tritonserver.MemoryType.GPU, memory_type_id=0, size=200) + memory_buffer = 
allocator.allocate( + memory_type=tritonserver.MemoryType.GPU, memory_type_id=0, size=200 + ) gpu_array = memory_buffer.owner @@ -220,17 +237,25 @@ def test_allocate_on_gpu_and_reshape(self): fp32_size = int(memory_buffer.size / 4) - tensor = tritonserver.Tensor(tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer) + tensor = tritonserver.Tensor( + tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer + ) gpu_fp32_array = cupy.from_dlpack(tensor) - assert gpu_array.__cuda_array_interface__["data"][0] == gpu_fp32_array.__cuda_array_interface__["data"][0] + assert ( + gpu_array.__cuda_array_interface__["data"][0] + == gpu_fp32_array.__cuda_array_interface__["data"][0] + ) assert gpu_fp32_array.dtype == cupy.float32 assert gpu_fp32_array.nbytes == 200 torch_fp32_tensor = torch.from_dlpack(tensor) assert torch_fp32_tensor.dtype == torch.float32 - assert torch_fp32_tensor.data_ptr() == gpu_array.__cuda_array_interface__["data"][0] + assert ( + torch_fp32_tensor.data_ptr() + == gpu_array.__cuda_array_interface__["data"][0] + ) assert torch_fp32_tensor.nbytes == 200 @@ -250,7 +275,9 @@ def test_cpu_to_gpu(self): assert gpu_array.__cuda_array_interface__["data"][0] == memory_buffer.data_ptr - @pytest.mark.skipif(torch is None, reason="Skipping gpu memory, torch not installed") + @pytest.mark.skipif( + torch is None, reason="Skipping gpu memory, torch not installed" + ) @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed") def test_gpu_tensor_from_dl_pack(self): cupy_array = cupy.ones([100]).astype(cupy.float64) @@ -307,7 +334,9 @@ def test_stop(self, server_options): "parameters": {"decoupled": {"string_value": "False"}}, # Keep instance count low for fast startup/cleanup. # Alternatively can use KIND_CPU here, but keeping gpus/count explicit. 
- "instance_group": [{"kind": "KIND_GPU", "gpus": [0], "count": 1}], + "instance_group": [ + {"kind": "KIND_GPU", "gpus": [0], "count": 1} + ], } ) }, @@ -414,7 +443,9 @@ def test_basic_inference(self, server_options): raise_on_error=True, ): for input_name, input_value in inputs.items(): - output_value = numpy.from_dlpack(response.outputs[input_name.replace("input", "output")]) + output_value = numpy.from_dlpack( + response.outputs[input_name.replace("input", "output")] + ) numpy.testing.assert_array_equal(input_value, output_value) def test_parameters(self, server_options): @@ -450,7 +481,9 @@ def test_parameters(self, server_options): ): fp16_output = numpy.from_dlpack(response.outputs["fp16_output"]) numpy.testing.assert_array_equal(fp16_input, fp16_output) - output_parameters = json.loads(response.outputs["output_parameters"].to_string_array()[0]) + output_parameters = json.loads( + response.outputs["output_parameters"].to_string_array()[0] + ) assert input_parameters == output_parameters with pytest.raises(tritonserver.InvalidArgumentError): diff --git a/python/test/test_binding.py b/python/test/test_binding.py index 3aeb6e11a..143e55f50 100644 --- a/python/test/test_binding.py +++ b/python/test/test_binding.py @@ -39,7 +39,9 @@ # Callback functions used in inference pipeline # 'user_object' is a per-request counter of how many times the # callback is invoked -def g_alloc_fn(allocator, tensor_name, byte_size, memory_type, memory_type_id, user_object): +def g_alloc_fn( + allocator, tensor_name, byte_size, memory_type, memory_type_id, user_object +): if "alloc" not in user_object: user_object["alloc"] = 0 user_object["alloc"] += 1 @@ -47,10 +49,14 @@ def g_alloc_fn(allocator, tensor_name, byte_size, memory_type, memory_type_id, u return (buffer.ctypes.data, buffer, triton_bindings.TRITONSERVER_MemoryType.CPU, 0) -def g_release_fn(allocator, buffer, buffer_user_object, byte_size, memory_type, memory_type_id): +def g_release_fn( + allocator, buffer, buffer_user_object, byte_size, memory_type, memory_type_id +): # No-op, buffer ('buffer_user_object') will be garbage collected # only sanity check that the objects are expected - if (not isinstance(buffer_user_object, numpy.ndarray)) or (buffer_user_object.ctypes.data != buffer): + if (not isinstance(buffer_user_object, numpy.ndarray)) or ( + buffer_user_object.ctypes.data != buffer + ): raise Exception("Misaligned parameters in allocator release callback") pass @@ -62,14 +68,18 @@ def g_start_fn(allocator, user_object): pass -def g_query_fn(allocator, user_object, tensor_name, byte_size, memory_type, memory_type_id): +def g_query_fn( + allocator, user_object, tensor_name, byte_size, memory_type, memory_type_id +): if "query" not in user_object: user_object["query"] = 0 user_object["query"] += 1 return (triton_bindings.TRITONSERVER_MemoryType.CPU, 0) -def g_buffer_fn(allocator, tensor_name, buffer_attribute, user_object, buffer_user_object): +def g_buffer_fn( + allocator, tensor_name, buffer_attribute, user_object, buffer_user_object +): if "buffer" not in user_object: user_object["buffer"] = 0 user_object["buffer"] += 1 @@ -236,9 +246,13 @@ def _to_pyobject(self, triton_message): # prepare a model repository with "addsub" model def _create_model_repository(self): - os.makedirs(os.path.join(self._test_model_repo, self._model_name, self._version)) + os.makedirs( + os.path.join(self._test_model_repo, self._model_name, self._version) + ) with open( - os.path.join(self._test_model_repo, self._model_name, self._version, self._file_name), + 
os.path.join( + self._test_model_repo, self._model_name, self._version, self._file_name + ), "wb", ) as f: f.write(g_python_addsub) @@ -251,7 +265,9 @@ def _start_polling_server(self): options = triton_bindings.TRITONSERVER_ServerOptions() options.set_model_repository_path(self._test_model_repo) - options.set_model_control_mode(triton_bindings.TRITONSERVER_ModelControlMode.POLL) + options.set_model_control_mode( + triton_bindings.TRITONSERVER_ModelControlMode.POLL + ) # enable "auto-complete" to skip providing config.pbtxt options.set_strict_model_config(False) options.set_server_id("testing_server") @@ -260,17 +276,23 @@ def _start_polling_server(self): return triton_bindings.TRITONSERVER_Server(options) def _prepare_inference_request(self, server): - allocator = triton_bindings.TRITONSERVER_ResponseAllocator(g_alloc_fn, g_release_fn, g_start_fn) + allocator = triton_bindings.TRITONSERVER_ResponseAllocator( + g_alloc_fn, g_release_fn, g_start_fn + ) allocator.set_buffer_attributes_function(g_buffer_fn) allocator.set_query_function(g_query_fn) request_counter = queue.Queue() response_queue = queue.Queue() allocator_counter = {} - request = triton_bindings.TRITONSERVER_InferenceRequest(server, self._model_name, -1) + request = triton_bindings.TRITONSERVER_InferenceRequest( + server, self._model_name, -1 + ) request.id = "req_0" request.set_release_callback(g_request_fn, request_counter) - request.set_response_callback(allocator, allocator_counter, g_response_fn, response_queue) + request.set_response_callback( + allocator, allocator_counter, g_response_fn, response_queue + ) input = numpy.ones([4], dtype=numpy.float32) input_buffer = input.ctypes.data @@ -279,15 +301,20 @@ def _prepare_inference_request(self, server): ba.memory_type_id = 0 ba.byte_size = input.itemsize * input.size - request.add_input("INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape) - request.add_input("INPUT1", triton_bindings.TRITONSERVER_DataType.FP32, input.shape) + request.add_input( + "INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape + ) + request.add_input( + "INPUT1", triton_bindings.TRITONSERVER_DataType.FP32, input.shape + ) request.append_input_data_with_buffer_attributes("INPUT0", input_buffer, ba) request.append_input_data_with_buffer_attributes("INPUT1", input_buffer, ba) return request, allocator, response_queue, request_counter - def test_exceptions(self): - ex_list = [ + @pytest.mark.parametrize( + "ex_type", + [ triton_bindings.UnknownError, triton_bindings.InternalError, triton_bindings.NotFoundError, @@ -295,13 +322,15 @@ def test_exceptions(self): triton_bindings.UnavailableError, triton_bindings.UnsupportedError, triton_bindings.AlreadyExistsError, - ] - for ex_type in ex_list: - with pytest.raises(ex_type, match="Error message") as ctx: - raise ex_type("Error message") - - def test_data_type(self): - t_list = [ + ], + ) + def test_exceptions(self, ex_type): + with pytest.raises(ex_type, match="Error message") as ctx: + raise ex_type("Error message") + + @pytest.mark.parametrize( + "t, t_str, t_size", + [ (triton_bindings.TRITONSERVER_DataType.INVALID, "", 0), (triton_bindings.TRITONSERVER_DataType.BOOL, "BOOL", 1), (triton_bindings.TRITONSERVER_DataType.UINT8, "UINT8", 1), @@ -317,31 +346,35 @@ def test_data_type(self): (triton_bindings.TRITONSERVER_DataType.FP64, "FP64", 8), (triton_bindings.TRITONSERVER_DataType.BYTES, "BYTES", 0), (triton_bindings.TRITONSERVER_DataType.BF16, "BF16", 2), - ] - - for t, t_str, t_size in t_list: - assert 
triton_bindings.TRITONSERVER_DataTypeString(t) == t_str - assert triton_bindings.TRITONSERVER_StringToDataType(t_str) == t - assert triton_bindings.TRITONSERVER_DataTypeByteSize(t) == t_size - - def test_memory_type(self): - t_list = [ + ], + ) + def test_data_type(self, t, t_str, t_size): + assert triton_bindings.TRITONSERVER_DataTypeString(t) == t_str + assert triton_bindings.TRITONSERVER_StringToDataType(t_str) == t + assert triton_bindings.TRITONSERVER_DataTypeByteSize(t) == t_size + + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_MemoryType.CPU, "CPU"), (triton_bindings.TRITONSERVER_MemoryType.CPU_PINNED, "CPU_PINNED"), (triton_bindings.TRITONSERVER_MemoryType.GPU, "GPU"), - ] - for t, t_str in t_list: - assert triton_bindings.TRITONSERVER_MemoryTypeString(t) == t_str - - def test_parameter_type(self): - t_list = [ + ], + ) + def test_memory_type(self, t, t_str): + assert triton_bindings.TRITONSERVER_MemoryTypeString(t) == t_str + + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_ParameterType.STRING, "STRING"), (triton_bindings.TRITONSERVER_ParameterType.INT, "INT"), (triton_bindings.TRITONSERVER_ParameterType.BOOL, "BOOL"), (triton_bindings.TRITONSERVER_ParameterType.BYTES, "BYTES"), - ] - for t, t_str in t_list: - assert triton_bindings.TRITONSERVER_ParameterTypeString(t) == t_str + ], + ) + def test_parameter_type(self, t, t_str): + assert triton_bindings.TRITONSERVER_ParameterTypeString(t) == t_str def test_parameter(self): # C API doesn't provide additional API for parameter, can only test @@ -358,15 +391,17 @@ def test_parameter(self): del bytes_param gc.collect() - def test_instance_kind(self): - t_list = [ + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_InstanceGroupKind.AUTO, "AUTO"), (triton_bindings.TRITONSERVER_InstanceGroupKind.CPU, "CPU"), (triton_bindings.TRITONSERVER_InstanceGroupKind.GPU, "GPU"), (triton_bindings.TRITONSERVER_InstanceGroupKind.MODEL, "MODEL"), - ] - for t, t_str in t_list: - assert triton_bindings.TRITONSERVER_InstanceGroupKindString(t) == t_str + ], + ) + def test_instance_kind(self, t, t_str): + assert triton_bindings.TRITONSERVER_InstanceGroupKindString(t) == t_str def test_log(self): # This test depends on 'TRITONSERVER_ServerOptions' operates properly @@ -471,12 +506,16 @@ def test_buffer_attributes(self): handle_byte_size = 64 mock_handle = array("b", [i for i in range(handle_byte_size)]) buffer_attributes.cuda_ipc_handle = mock_handle.buffer_info()[0] - res_arr = (ctypes.c_char * handle_byte_size).from_address(buffer_attributes.cuda_ipc_handle) + res_arr = (ctypes.c_char * handle_byte_size).from_address( + buffer_attributes.cuda_ipc_handle + ) for i in range(handle_byte_size): assert int.from_bytes(res_arr[i], "big") == mock_handle[i] def test_allocator(self): - def alloc_fn(allocator, tensor_name, byte_size, memory_type, memory_type_id, user_object): + def alloc_fn( + allocator, tensor_name, byte_size, memory_type, memory_type_id, user_object + ): return (123, None, triton_bindings.TRITONSERVER_MemoryType.GPU, 1) def release_fn( @@ -492,10 +531,14 @@ def release_fn( def start_fn(allocator, user_object): pass - def query_fn(allocator, user_object, tensor_name, byte_size, memory_type, memory_type_id): + def query_fn( + allocator, user_object, tensor_name, byte_size, memory_type, memory_type_id + ): return (triton_bindings.TRITONSERVER_MemoryType.GPU, 1) - def buffer_fn(allocator, tensor_name, buffer_attribute, user_object, buffer_user_object): + def buffer_fn( + 
allocator, tensor_name, buffer_attribute, user_object, buffer_user_object + ): return buffer_attribute # allocator without start_fn @@ -504,7 +547,9 @@ def buffer_fn(allocator, tensor_name, buffer_attribute, user_object, buffer_user gc.collect() # allocator with start_fn - allocator = triton_bindings.TRITONSERVER_ResponseAllocator(alloc_fn, release_fn, start_fn) + allocator = triton_bindings.TRITONSERVER_ResponseAllocator( + alloc_fn, release_fn, start_fn + ) allocator.set_buffer_attributes_function(buffer_fn) allocator.set_query_function(query_fn) @@ -521,30 +566,45 @@ def test_metrics(self): # a model repository is proper repository options = triton_bindings.TRITONSERVER_ServerOptions() options.set_model_repository_path(self._test_model_repo) - options.set_model_control_mode(triton_bindings.TRITONSERVER_ModelControlMode.EXPLICIT) + options.set_model_control_mode( + triton_bindings.TRITONSERVER_ModelControlMode.EXPLICIT + ) server = triton_bindings.TRITONSERVER_Server(options) metrics = server.metrics() # Check one of the metrics is reported - assert "nv_cpu_memory_used_bytes" in metrics.formatted(triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS) + assert "nv_cpu_memory_used_bytes" in metrics.formatted( + triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS + ) - def test_trace_enum(self): - t_list = [ + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_InferenceTraceLevel.DISABLED, "DISABLED"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.MIN, "MIN"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.MAX, "MAX"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS, "TIMESTAMPS"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS, "TENSORS"), - ] - for t, t_str in t_list: - assert triton_bindings.TRITONSERVER_InferenceTraceLevelString(t) == t_str + ], + ) + def test_trace_enum(self, t, t_str): + assert triton_bindings.TRITONSERVER_InferenceTraceLevelString(t) == t_str + + def test_trace_bitwise_operations(self): # bit-wise operation level = int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS) | int( triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS ) - assert level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS) != 0 - assert level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS) != 0 + assert ( + level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS) + != 0 + ) + assert ( + level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS) != 0 + ) - t_list = [ + @pytest.mark.parametrize( + "t, t_str", + [ ( triton_bindings.TRITONSERVER_InferenceTraceActivity.REQUEST_START, "REQUEST_START", @@ -585,9 +645,10 @@ def test_trace_enum(self): triton_bindings.TRITONSERVER_InferenceTraceActivity.TENSOR_BACKEND_OUTPUT, "TENSOR_BACKEND_OUTPUT", ), - ] - for t, t_str in t_list: - assert triton_bindings.TRITONSERVER_InferenceTraceActivityString(t) == t_str + ], + ) + def test_trace_activity_enum(self, t, t_str): + assert triton_bindings.TRITONSERVER_InferenceTraceActivityString(t) == t_str def test_trace(self): # This test depends on 'test_infer_async' test to capture @@ -687,13 +748,17 @@ def test_options(self): options.set_model_load_thread_count(2) options.set_model_namespacing(True) # Only support Kind GPU for now - options.set_model_load_device_limit(triton_bindings.TRITONSERVER_InstanceGroupKind.GPU, 0, 0.5) + options.set_model_load_device_limit( + triton_bindings.TRITONSERVER_InstanceGroupKind.GPU, 0, 0.5 + ) for k in [ 
triton_bindings.TRITONSERVER_InstanceGroupKind.AUTO, triton_bindings.TRITONSERVER_InstanceGroupKind.CPU, triton_bindings.TRITONSERVER_InstanceGroupKind.MODEL, ]: - with pytest.raises(triton_bindings.TritonError, match="not supported") as context: + with pytest.raises( + triton_bindings.TritonError, match="not supported" + ) as context: options.set_model_load_device_limit(k, 0, 0) # Backend @@ -714,7 +779,9 @@ def test_options(self): options.set_cuda_memory_pool_byte_size(0, 2048) # cache options.set_response_cache_byte_size(4096) - options.set_cache_config("cache_name", json.dumps({"config_0": "value_0", "config_1": "value_1"})) + options.set_cache_config( + "cache_name", json.dumps({"config_0": "value_0", "config_1": "value_1"}) + ) options.set_cache_directory("cache_dir_0") options.set_cache_directory("cache_dir_1") # Log @@ -746,7 +813,9 @@ def test_options(self): options.set_metrics_config("metrics_group", "setting", "value") # Misc.. - with pytest.raises(triton_bindings.TritonError, match="Unsupported host policy setting") as context: + with pytest.raises( + triton_bindings.TritonError, match="Unsupported host policy setting" + ) as context: options.set_host_policy("policy_name", "setting", "value") options.set_repo_agent_directory("repo_agent_dir_0") @@ -766,13 +835,19 @@ def test_server(self): int(triton_bindings.TRITONSERVER_ModelBatchFlag.UNKNOWN), 0, ) - assert server.model_batch_properties(self._model_name, -1) == expected_batch_properties + assert ( + server.model_batch_properties(self._model_name, -1) + == expected_batch_properties + ) # model_transaction_properties expected_transaction_policy = ( int(triton_bindings.TRITONSERVER_ModelTxnPropertyFlag.ONE_TO_ONE), 0, ) - assert server.model_transaction_properties(self._model_name, -1) == expected_transaction_policy + assert ( + server.model_transaction_properties(self._model_name, -1) + == expected_transaction_policy + ) # metadata server_meta_data = self._to_pyobject(server.metadata()) assert "name" in server_meta_data @@ -782,7 +857,9 @@ def test_server(self): assert "name" in model_meta_data assert model_meta_data["name"] == self._model_name # model_statistics - model_statistics = self._to_pyobject(server.model_statistics(self._model_name, -1)) + model_statistics = self._to_pyobject( + server.model_statistics(self._model_name, -1) + ) assert "model_stats" in model_statistics # model_config model_config = self._to_pyobject(server.model_config(self._model_name, -1, 1)) @@ -799,12 +876,14 @@ def test_request(self): server = self._start_polling_server() with pytest.raises(triton_bindings.NotFoundError, match="unknown model") as ctx: - _ = triton_bindings.TRITONSERVER_InferenceRequest(server, "not_existing_model", -1) + _ = triton_bindings.TRITONSERVER_InferenceRequest( + server, "not_existing_model", -1 + ) expected_request_id = "request" - expected_flags = int(triton_bindings.TRITONSERVER_RequestFlag.SEQUENCE_START) | int( - triton_bindings.TRITONSERVER_RequestFlag.SEQUENCE_END - ) + expected_flags = int( + triton_bindings.TRITONSERVER_RequestFlag.SEQUENCE_START + ) | int(triton_bindings.TRITONSERVER_RequestFlag.SEQUENCE_END) expected_correlation_id = 2 expected_correlation_id_string = "123" expected_priority = 19 @@ -848,7 +927,9 @@ def test_request(self): ba.memory_type_id = 0 ba.byte_size = input.itemsize * input.size - request.add_input("INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape) + request.add_input( + "INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape + ) with 
pytest.raises(triton_bindings.TritonError): request.remove_input("INPUT2") # raw input assumes single input @@ -870,7 +951,9 @@ def test_request(self): with pytest.raises(triton_bindings.TritonError): request.remove_all_input_data("INPUT0") # Add back input - request.add_input("INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape) + request.add_input( + "INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape + ) request.append_input_data(*aid_args) request.remove_all_input_data("INPUT0") @@ -883,17 +966,23 @@ def test_infer_async(self): server = self._start_polling_server() # prepare for infer - allocator = triton_bindings.TRITONSERVER_ResponseAllocator(g_alloc_fn, g_release_fn, g_start_fn) + allocator = triton_bindings.TRITONSERVER_ResponseAllocator( + g_alloc_fn, g_release_fn, g_start_fn + ) allocator.set_buffer_attributes_function(g_buffer_fn) allocator.set_query_function(g_query_fn) request_counter = queue.Queue() response_queue = queue.Queue() allocator_counter = {} - request = triton_bindings.TRITONSERVER_InferenceRequest(server, self._model_name, -1) + request = triton_bindings.TRITONSERVER_InferenceRequest( + server, self._model_name, -1 + ) request.id = "req_0" request.set_release_callback(g_request_fn, request_counter) - request.set_response_callback(allocator, allocator_counter, g_response_fn, response_queue) + request.set_response_callback( + allocator, allocator_counter, g_response_fn, response_queue + ) input = numpy.ones([4], dtype=numpy.float32) input_buffer = input.ctypes.data @@ -902,8 +991,12 @@ def test_infer_async(self): ba.memory_type_id = 0 ba.byte_size = input.itemsize * input.size - request.add_input("INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape) - request.add_input("INPUT1", triton_bindings.TRITONSERVER_DataType.FP32, input.shape) + request.add_input( + "INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape + ) + request.add_input( + "INPUT1", triton_bindings.TRITONSERVER_DataType.FP32, input.shape + ) request.append_input_data_with_buffer_attributes("INPUT0", input_buffer, ba) request.append_input_data_with_buffer_attributes("INPUT1", input_buffer, ba) @@ -973,7 +1066,9 @@ def test_server_explicit(self): # explicit : load with params options = triton_bindings.TRITONSERVER_ServerOptions() options.set_model_repository_path(self._test_model_repo) - options.set_model_control_mode(triton_bindings.TRITONSERVER_ModelControlMode.EXPLICIT) + options.set_model_control_mode( + triton_bindings.TRITONSERVER_ModelControlMode.EXPLICIT + ) options.set_strict_model_config(False) server = triton_bindings.TRITONSERVER_Server(options) load_file_params = [ @@ -1005,7 +1100,9 @@ def test_server_explicit(self): def test_custom_metric(self): options = triton_bindings.TRITONSERVER_ServerOptions() options.set_model_repository_path(self._test_model_repo) - options.set_model_control_mode(triton_bindings.TRITONSERVER_ModelControlMode.EXPLICIT) + options.set_model_control_mode( + triton_bindings.TRITONSERVER_ModelControlMode.EXPLICIT + ) server = triton_bindings.TRITONSERVER_Server(options) # create custom metric @@ -1024,4 +1121,6 @@ def test_custom_metric(self): # Check custom metric is reported metrics = server.metrics() - assert "custom_metric_familiy" in metrics.formatted(triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS) + assert "custom_metric_familiy" in metrics.formatted( + triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS + )
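
Note on the refactor pattern (illustrative only, not part of the patch): the test_binding.py changes above replace in-test loops over (value, expected) tuples with @pytest.mark.parametrize, so each case is collected and reported as its own test instead of the loop stopping at the first failing assertion. A minimal, self-contained sketch of that pattern, using a hypothetical square() helper rather than the Triton bindings, looks like this:

    import pytest


    def square(x):
        # Stand-in function under test; not part of the Triton bindings.
        return x * x


    # Before: one test that loops over cases and stops at the first failure.
    def test_square_loop():
        for value, expected in [(1, 1), (2, 4), (3, 9)]:
            assert square(value) == expected


    # After: each case becomes its own parametrized test, reported separately.
    @pytest.mark.parametrize("value, expected", [(1, 1), (2, 4), (3, 9)])
    def test_square_parametrized(value, expected):
        assert square(value) == expected

With the loop form, pytest reports a single test and stops at the first mismatch; with the parametrized form it reports one test per case (e.g. test_square_parametrized[2-4]) and runs the remaining cases even if an earlier one fails, which is the behavior change this patch applies to the enum, exception, and string-conversion tests.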