Timing: Log invocation durations (#7)

jpmorganchase · Jul 4, 2023 · b7c92db
2 parents fe62aa1 + 70fd60a
commit b7c92db
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 8 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -46,6 +46,7 @@ keywords = [
 requires-python = ">=3.7"
 # All runtime dependencies that must be packaged, pin major version only.
 dependencies = [
+    "codetiming~=1.4",
     "importlib-metadata<4; python_version<'3.8'",
     "pluggy~=1.0",
     "werkzeug~=2.0",

diff --git a/src/inference_server/__init__.py b/src/inference_server/__init__.py
@@ -18,6 +18,7 @@
 import logging
 from typing import TYPE_CHECKING
 
+import codetiming
 import werkzeug
 import werkzeug.exceptions
 from werkzeug.datastructures import MIMEAccept
@@ -76,14 +77,15 @@ def _handle_invocations(request: werkzeug.Request) -> werkzeug.Response:
 
     :param request: HTTP request data
     """
-    pm = inference_server._plugin.manager()
-    # Deserialize HTTP body payload (bytes) into input features
-    data = pm.hook.input_fn(input_data=request.data, content_type=request.content_type)
-    # Then use the model to make a prediction
-    prediction = pm.hook.predict_fn(data=data, model=_model())
-    # Then serialize the data as bytes. This is often (but not necessarily) JSON bytes.
-    prediction_bytes, content_type = pm.hook.output_fn(prediction=prediction, accept=request.accept_mimetypes)
-    return werkzeug.Response(prediction_bytes, mimetype=content_type)
+    with codetiming.Timer(text="Invocation took {:.3f} seconds", logger=logger.debug):
+        pm = inference_server._plugin.manager()
+        # Deserialize HTTP body payload (bytes) into input features
+        data = pm.hook.input_fn(input_data=request.data, content_type=request.content_type)
+        # Then use the model to make a prediction
+        prediction = pm.hook.predict_fn(data=data, model=_model())
+        # Then serialize the data as bytes. This is often (but not necessarily) JSON bytes.
+        prediction_bytes, content_type = pm.hook.output_fn(prediction=prediction, accept=request.accept_mimetypes)
+        return werkzeug.Response(prediction_bytes, mimetype=content_type)
 
 
 def _handle_ping(request: werkzeug.Request) -> werkzeug.Response: