Skip to content

feat(lgbm): support string features via pandas categorical #15

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ sdist/
var/
.idea/
*.egg-info/
.debug
.installed.cfg
*.egg

Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Pythie serving

## 2.2.0

### Change
* Allow clients to send string feature values for models whose features were trained as pandas categoricals

## 2.1.0

### Change
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.1.0
2.2.0
95 changes: 69 additions & 26 deletions src/pythie_serving/lightgbm_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,62 +1,105 @@
import logging
import os
import pickle
import logging
from typing import List, Optional, Any

import grpc

from lightgbm import Booster

from .exceptions import PythieServingException
from .tensorflow_proto.tensorflow_serving.apis import (
predict_pb2,
prediction_service_pb2_grpc,
)
from .tensorflow_proto.tensorflow_serving.config import model_server_config_pb2
from .tensorflow_proto.tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc
from .utils import make_ndarray_from_tensor
from .exceptions import PythieServingException


class LightGBMPredictionServiceServicer(prediction_service_pb2_grpc.PredictionServiceServicer):

def __init__(self, *, logger: logging.Logger, model_server_config: model_server_config_pb2.ModelServerConfig):
class LightGBMPredictionServiceServicer(
prediction_service_pb2_grpc.PredictionServiceServicer
):
def __init__(
self,
*,
logger: logging.Logger,
model_server_config: model_server_config_pb2.ModelServerConfig,
):
self.logger = logger
self.model_map = {}
for model_config in model_server_config.model_config_list.config:
with open(os.path.join(model_config.base_path, model_config.name) + ".pickled", 'rb') as opened_model:
with open(
os.path.join(model_config.base_path, model_config.name) + ".pickled",
"rb",
) as opened_model:
model = pickle.load(opened_model)
pandas_categorical = model.pandas_categorical

if isinstance(model, Booster):
feature_names = model.feature_name()
best_iteration = model.best_iteration
else:
feature_names = model.feature_names
best_iteration = getattr(model, 'best_iteration', None)
best_iteration = getattr(model, "best_iteration", None)

self.model_map[model_config.name] = {'model': model, 'feature_names': feature_names,
'best_iteration': best_iteration}
self.model_map[model_config.name] = {
"model": model,
"feature_names": feature_names,
"best_iteration": best_iteration,
"pandas_categorical": pandas_categorical,
}

def Predict(self, request: predict_pb2.PredictRequest, context: grpc.RpcContext):
model_name = request.model_spec.name
if model_name not in self.model_map:
raise PythieServingException(f'Unknown model: {model_name}. This pythie-serving instance can only '
f'serve one of the following: {",".join(self.model_map.keys())}')
raise PythieServingException(
f"Unknown model: {model_name}. This pythie-serving instance can only "
f'serve one of the following: {",".join(self.model_map.keys())}'
)

model_dict = self.model_map[model_name]

features_names = model_dict['feature_names']
samples = None
model = model_dict["model"]
pandas_categorical = model_dict["pandas_categorical"]
features_names = model_dict["feature_names"]
pd_categorical_features: List[str] = []
samples: Optional[List[List[Any]]] = None
for feature_name in features_names:
if feature_name not in request.inputs:
raise PythieServingException(f'{feature_name} not set in the predict request')
raise PythieServingException(
f"{feature_name} not set in the predict request"
)

nd_array = make_ndarray_from_tensor(request.inputs[feature_name])
if len(nd_array.shape) != 2 or nd_array.shape[1] != 1:
raise PythieServingException('All input vectors should be 1D tensor')
# get features categories from model if feature dtype is object ("string")
feature_categories = {}
if nd_array.dtype == object:
if pandas_categorical is None:
raise PythieServingException(
f"{feature_name} feature has type 'object' but "
f"there is no saved pandas categories from model"
)
# LightGBM saves the categories in the same order as the categorical features appear in model.feature_name()
feature_categories = {
category_name: category_position
for category_position, category_name in enumerate(
pandas_categorical[len(pd_categorical_features)]
)
}
pd_categorical_features.append(feature_name)

if samples is None:
samples = [[] for _ in range(nd_array.shape[0])]
if len(nd_array.shape) != 2 or nd_array.shape[1] != 1:
raise PythieServingException("All input vectors should be 1D tensor")

for sample_index, value in enumerate(nd_array):
samples[sample_index].append(value[0])
samples = samples or [[] for _ in range(nd_array.shape[0])]
if len(feature_categories) > 0: # get category position from its value
for sample_index, value in enumerate(nd_array):
samples[sample_index].append(
feature_categories[value[0].decode("utf-8")]
)
else:
for sample_index, value in enumerate(nd_array):
samples[sample_index].append(value[0])

model = model_dict['model']
kwargs = {}
if model_dict['best_iteration']:
kwargs['best_iteration'] = model_dict['best_iteration']
if model_dict["best_iteration"]:
kwargs["best_iteration"] = model_dict["best_iteration"]
return model.predict(samples, **kwargs)
66 changes: 45 additions & 21 deletions src/pythie_serving/utils.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,47 @@
from typing import List, Any, Type
from typing import Any, List, Type

import numpy as np

from .tensorflow_proto.tensorflow.core.framework import tensor_pb2, tensor_shape_pb2, types_pb2

_types_map = (
(np.int32, types_pb2.DT_INT32), (np.int64, types_pb2.DT_INT64), (np.float32, types_pb2.DT_FLOAT),
(np.float64, types_pb2.DT_DOUBLE), (np.bool_, types_pb2.DT_BOOL), (np.bytes_, types_pb2.DT_STRING)
from .tensorflow_proto.tensorflow.core.framework import (
tensor_pb2,
tensor_shape_pb2,
types_pb2,
)

_TF_TYPE_MAP = {tf_type: np_type for np_type, tf_type in _types_map}
_NP_TYPE_MAP = {np_type: tf_type for np_type, tf_type in _types_map}
# from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/framework/dtypes.py
_TF_TO_NP = {
types_pb2.DT_HALF: np.float16,
types_pb2.DT_FLOAT: np.float32,
types_pb2.DT_DOUBLE: np.float64,
types_pb2.DT_INT32: np.int32,
types_pb2.DT_UINT8: np.uint8,
types_pb2.DT_UINT16: np.uint16,
types_pb2.DT_UINT32: np.uint32,
types_pb2.DT_UINT64: np.uint64,
types_pb2.DT_INT16: np.int16,
types_pb2.DT_INT8: np.int8,
# NOTE(touts): For strings we use object as it supports variable length strings.
types_pb2.DT_STRING: object,
types_pb2.DT_COMPLEX64: np.complex64,
types_pb2.DT_COMPLEX128: np.complex128,
types_pb2.DT_INT64: np.int64,
types_pb2.DT_BOOL: np.bool_,
}

_NP_TO_TF = {nt: tt for tt, nt in _TF_TO_NP.items()}
_NP_TO_TF[np.bytes_] = types_pb2.DT_STRING
_NP_TO_TF[np.str_] = types_pb2.DT_STRING


def get_tf_type(np_dtype: Type):
"""
:param np_type: python Type
:param np_dtype: python Type
:return: types_pb2.DataType
"""
try:
return _NP_TYPE_MAP[np_dtype.type]
return _NP_TO_TF[np_dtype.type]
except KeyError:
raise TypeError(f'Could not infer tensorflow type for {np_dtype.type}')
raise TypeError(f"Could not infer tensorflow type for {np_dtype.type}")


def get_np_dtype(tf_type: types_pb2.DataType):
Expand All @@ -30,9 +50,9 @@ def get_np_dtype(tf_type: types_pb2.DataType):
:return: np.dtype
"""
try:
return np.dtype(_TF_TYPE_MAP[tf_type])
return np.dtype(_TF_TO_NP[tf_type])
except KeyError:
raise TypeError(f'Could not infer numpy type for {tf_type}')
raise TypeError(f"Could not infer numpy type for {tf_type}")


def make_tensor_proto(values: List[Any]):
Expand All @@ -59,23 +79,25 @@ def make_tensor_proto(values: List[Any]):
for vector in np_array:
for s in vector:
if not isinstance(s, bytes):
raise TypeError(f'{values} expect a list of bytes when working with DT_STRING types')
string_val.append(s)
tensor_kwargs['string_val'] = string_val
raise TypeError(
f"{values} expect a list of bytes when working with DT_STRING types"
)
string_val.append(s)
tensor_kwargs["string_val"] = string_val
else:
tensor_kwargs['tensor_content'] = np_array.tobytes()
tensor_kwargs["tensor_content"] = np_array.tobytes()
return tensor_pb2.TensorProto(
dtype=dtype,
tensor_shape=tensor_shape_proto,
**tensor_kwargs
dtype=dtype, tensor_shape=tensor_shape_proto, **tensor_kwargs
)


def make_ndarray_from_tensor(tensor: tensor_pb2.TensorProto):
shape = [d.size for d in tensor.tensor_shape.dim]
np_dtype = get_np_dtype(tensor.dtype)
if tensor.tensor_content:
return np.frombuffer(tensor.tensor_content, dtype=np_dtype).copy().reshape(shape)
return (
np.frombuffer(tensor.tensor_content, dtype=np_dtype).copy().reshape(shape)
)

if tensor.dtype == types_pb2.DT_FLOAT:
values = np.fromiter(tensor.float_val, dtype=np_dtype)
Expand All @@ -85,6 +107,8 @@ def make_ndarray_from_tensor(tensor: tensor_pb2.TensorProto):
values = np.fromiter(tensor.int_val, dtype=np_dtype)
elif tensor.dtype == types_pb2.DT_BOOL:
values = np.fromiter(tensor.bool_val, dtype=np_dtype)
elif tensor.dtype == types_pb2.DT_STRING:
values = np.array(tensor.string_val, dtype=np_dtype)
else:
raise TypeError("Unsupported tensor type: %s" % tensor.dtype)

Expand Down