From 12d467773b996ffa7ee00bd385dd4e841c19a17a Mon Sep 17 00:00:00 2001 From: Arnaud YANKWA WANDJI Date: Fri, 4 Mar 2022 14:27:19 +0100 Subject: [PATCH] feat(lgbm): add logic to convert string features to int before prediction --- .gitignore | 1 + CHANGELOG.md | 5 ++ VERSION | 2 +- src/pythie_serving/lightgbm_wrapper.py | 95 +++++++++++++++++++------- src/pythie_serving/utils.py | 66 ++++++++++++------ 5 files changed, 121 insertions(+), 48 deletions(-) diff --git a/.gitignore b/.gitignore index 688dbc8..33961ff 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,7 @@ sdist/ var/ .idea/ *.egg-info/ +.debug .installed.cfg *.egg diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bd425b..8dbb5bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Pythie serving +## 2.2.0 + +### Change +* Allow client to request model with string features used as pandas categoricals during training + ## 2.1.0 ### Change diff --git a/VERSION b/VERSION index 7ec1d6d..ccbccc3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1.0 +2.2.0 diff --git a/src/pythie_serving/lightgbm_wrapper.py b/src/pythie_serving/lightgbm_wrapper.py index cbe0e10..6713576 100644 --- a/src/pythie_serving/lightgbm_wrapper.py +++ b/src/pythie_serving/lightgbm_wrapper.py @@ -1,62 +1,105 @@ +import logging import os import pickle -import logging +from typing import List, Optional, Any import grpc - from lightgbm import Booster +from .exceptions import PythieServingException +from .tensorflow_proto.tensorflow_serving.apis import ( + predict_pb2, + prediction_service_pb2_grpc, +) from .tensorflow_proto.tensorflow_serving.config import model_server_config_pb2 -from .tensorflow_proto.tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc from .utils import make_ndarray_from_tensor -from .exceptions import PythieServingException -class LightGBMPredictionServiceServicer(prediction_service_pb2_grpc.PredictionServiceServicer): - - def __init__(self, *, logger: logging.Logger, model_server_config: model_server_config_pb2.ModelServerConfig): +class LightGBMPredictionServiceServicer( + prediction_service_pb2_grpc.PredictionServiceServicer +): + def __init__( + self, + *, + logger: logging.Logger, + model_server_config: model_server_config_pb2.ModelServerConfig, + ): self.logger = logger self.model_map = {} for model_config in model_server_config.model_config_list.config: - with open(os.path.join(model_config.base_path, model_config.name) + ".pickled", 'rb') as opened_model: + with open( + os.path.join(model_config.base_path, model_config.name) + ".pickled", + "rb", + ) as opened_model: model = pickle.load(opened_model) + pandas_categorical = model.pandas_categorical if isinstance(model, Booster): feature_names = model.feature_name() best_iteration = model.best_iteration else: feature_names = model.feature_names - best_iteration = getattr(model, 'best_iteration', None) + best_iteration = getattr(model, "best_iteration", None) - self.model_map[model_config.name] = {'model': model, 'feature_names': feature_names, - 'best_iteration': best_iteration} + self.model_map[model_config.name] = { + "model": model, + "feature_names": feature_names, + "best_iteration": best_iteration, + "pandas_categorical": pandas_categorical, + } def Predict(self, request: predict_pb2.PredictRequest, context: grpc.RpcContext): model_name = request.model_spec.name if model_name not in self.model_map: - raise PythieServingException(f'Unknown model: {model_name}. This pythie-serving instance can only ' - f'serve one of the following: {",".join(self.model_map.keys())}') + raise PythieServingException( + f"Unknown model: {model_name}. This pythie-serving instance can only " + f'serve one of the following: {",".join(self.model_map.keys())}' + ) model_dict = self.model_map[model_name] - - features_names = model_dict['feature_names'] - samples = None + model = model_dict["model"] + pandas_categorical = model_dict["pandas_categorical"] + features_names = model_dict["feature_names"] + pd_categorical_features: List[str] = [] + samples: Optional[List[List[Any]]] = None for feature_name in features_names: if feature_name not in request.inputs: - raise PythieServingException(f'{feature_name} not set in the predict request') + raise PythieServingException( + f"{feature_name} not set in the predict request" + ) nd_array = make_ndarray_from_tensor(request.inputs[feature_name]) - if len(nd_array.shape) != 2 or nd_array.shape[1] != 1: - raise PythieServingException('All input vectors should be 1D tensor') + # get features categories from model if feature dtype is object ("string") + feature_categories = {} + if nd_array.dtype == object: + if pandas_categorical is None: + raise PythieServingException( + f"{feature_name} feature has type 'object' but " + f"there is no saved pandas categories from model" + ) + # lgbm save categories in the same order categorical features appear in model.feature_name() + feature_categories = { + category_name: category_position + for category_position, category_name in enumerate( + pandas_categorical[len(pd_categorical_features)] + ) + } + pd_categorical_features.append(feature_name) - if samples is None: - samples = [[] for _ in range(nd_array.shape[0])] + if len(nd_array.shape) != 2 or nd_array.shape[1] != 1: + raise PythieServingException("All input vectors should be 1D tensor") - for sample_index, value in enumerate(nd_array): - samples[sample_index].append(value[0]) + samples = samples or [[] for _ in range(nd_array.shape[0])] + if len(feature_categories) > 0: # get category position from its value + for sample_index, value in enumerate(nd_array): + samples[sample_index].append( + feature_categories[value[0].decode("utf-8")] + ) + else: + for sample_index, value in enumerate(nd_array): + samples[sample_index].append(value[0]) - model = model_dict['model'] kwargs = {} - if model_dict['best_iteration']: - kwargs['best_iteration'] = model_dict['best_iteration'] + if model_dict["best_iteration"]: + kwargs["best_iteration"] = model_dict["best_iteration"] return model.predict(samples, **kwargs) diff --git a/src/pythie_serving/utils.py b/src/pythie_serving/utils.py index 58002d9..01a87a6 100644 --- a/src/pythie_serving/utils.py +++ b/src/pythie_serving/utils.py @@ -1,27 +1,47 @@ -from typing import List, Any, Type +from typing import Any, List, Type import numpy as np -from .tensorflow_proto.tensorflow.core.framework import tensor_pb2, tensor_shape_pb2, types_pb2 - -_types_map = ( - (np.int32, types_pb2.DT_INT32), (np.int64, types_pb2.DT_INT64), (np.float32, types_pb2.DT_FLOAT), - (np.float64, types_pb2.DT_DOUBLE), (np.bool_, types_pb2.DT_BOOL), (np.bytes_, types_pb2.DT_STRING) +from .tensorflow_proto.tensorflow.core.framework import ( + tensor_pb2, + tensor_shape_pb2, + types_pb2, ) -_TF_TYPE_MAP = {tf_type: np_type for np_type, tf_type in _types_map} -_NP_TYPE_MAP = {np_type: tf_type for np_type, tf_type in _types_map} +# from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/framework/dtypes.py +_TF_TO_NP = { + types_pb2.DT_HALF: np.float16, + types_pb2.DT_FLOAT: np.float32, + types_pb2.DT_DOUBLE: np.float64, + types_pb2.DT_INT32: np.int32, + types_pb2.DT_UINT8: np.uint8, + types_pb2.DT_UINT16: np.uint16, + types_pb2.DT_UINT32: np.uint32, + types_pb2.DT_UINT64: np.uint64, + types_pb2.DT_INT16: np.int16, + types_pb2.DT_INT8: np.int8, + # NOTE(touts): For strings we use object as it supports variable length # strings. + types_pb2.DT_STRING: object, + types_pb2.DT_COMPLEX64: np.complex64, + types_pb2.DT_COMPLEX128: np.complex128, + types_pb2.DT_INT64: np.int64, + types_pb2.DT_BOOL: np.bool_, +} + +_NP_TO_TF = {nt: tt for tt, nt in _TF_TO_NP.items()} +_NP_TO_TF[np.bytes_] = types_pb2.DT_STRING +_NP_TO_TF[np.str_] = types_pb2.DT_STRING def get_tf_type(np_dtype: Type): """ - :param np_type: python Type + :param np_dtype: python Type :return: types_pb2.DataType """ try: - return _NP_TYPE_MAP[np_dtype.type] + return _NP_TO_TF[np_dtype.type] except KeyError: - raise TypeError(f'Could not infer tensorflow type for {np_dtype.type}') + raise TypeError(f"Could not infer tensorflow type for {np_dtype.type}") def get_np_dtype(tf_type: types_pb2.DataType): @@ -30,9 +50,9 @@ def get_np_dtype(tf_type: types_pb2.DataType): :return: types_pb2.DataType """ try: - return np.dtype(_TF_TYPE_MAP[tf_type]) + return np.dtype(_TF_TO_NP[tf_type]) except KeyError: - raise TypeError(f'Could not infer numpy type for {tf_type}') + raise TypeError(f"Could not infer numpy type for {tf_type}") def make_tensor_proto(values: List[Any]): @@ -59,15 +79,15 @@ def make_tensor_proto(values: List[Any]): for vector in np_array: for s in vector: if not isinstance(s, bytes): - raise TypeError(f'{values} expect a list of bytes when working with DT_STRING types') - string_val.append(s) - tensor_kwargs['string_val'] = string_val + raise TypeError( + f"{values} expect a list of bytes when working with DT_STRING types" + ) + string_val.append(s) + tensor_kwargs["string_val"] = string_val else: - tensor_kwargs['tensor_content'] = np_array.tobytes() + tensor_kwargs["tensor_content"] = np_array.tobytes() return tensor_pb2.TensorProto( - dtype=dtype, - tensor_shape=tensor_shape_proto, - **tensor_kwargs + dtype=dtype, tensor_shape=tensor_shape_proto, **tensor_kwargs ) @@ -75,7 +95,9 @@ def make_ndarray_from_tensor(tensor: tensor_pb2.TensorProto): shape = [d.size for d in tensor.tensor_shape.dim] np_dtype = get_np_dtype(tensor.dtype) if tensor.tensor_content: - return np.frombuffer(tensor.tensor_content, dtype=np_dtype).copy().reshape(shape) + return ( + np.frombuffer(tensor.tensor_content, dtype=np_dtype).copy().reshape(shape) + ) if tensor.dtype == types_pb2.DT_FLOAT: values = np.fromiter(tensor.float_val, dtype=np_dtype) @@ -85,6 +107,8 @@ def make_ndarray_from_tensor(tensor: tensor_pb2.TensorProto): values = np.fromiter(tensor.int_val, dtype=np_dtype) elif tensor.dtype == types_pb2.DT_BOOL: values = np.fromiter(tensor.bool_val, dtype=np_dtype) + elif tensor.dtype == types_pb2.DT_STRING: + values = np.array(tensor.string_val, dtype=np_dtype) else: raise TypeError("Unsupported tensor type: %s" % tensor.dtype)