Skip to content

Commit 12d4677

Browse files
author
Arnaud YANKWA WANDJI
committed
feat(lgbm): add logic to convert string features to int before prediction
1 parent c0f52b7 commit 12d4677

File tree

5 files changed

+121
-48
lines changed

5 files changed

+121
-48
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ sdist/
2626
var/
2727
.idea/
2828
*.egg-info/
29+
.debug
2930
.installed.cfg
3031
*.egg
3132

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
# Pythie serving
22

3+
## 2.2.0
4+
5+
### Change
6+
* Allow clients to send string feature values for models trained with pandas categorical features (strings are converted to their integer category position before prediction)
7+
38
## 2.1.0
49

510
### Change

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.1.0
1+
2.2.0
Lines changed: 69 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,105 @@
1+
import logging
12
import os
23
import pickle
3-
import logging
4+
from typing import List, Optional, Any
45

56
import grpc
6-
77
from lightgbm import Booster
88

9+
from .exceptions import PythieServingException
10+
from .tensorflow_proto.tensorflow_serving.apis import (
11+
predict_pb2,
12+
prediction_service_pb2_grpc,
13+
)
914
from .tensorflow_proto.tensorflow_serving.config import model_server_config_pb2
10-
from .tensorflow_proto.tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc
1115
from .utils import make_ndarray_from_tensor
12-
from .exceptions import PythieServingException
1316

1417

15-
class LightGBMPredictionServiceServicer(prediction_service_pb2_grpc.PredictionServiceServicer):
16-
17-
def __init__(self, *, logger: logging.Logger, model_server_config: model_server_config_pb2.ModelServerConfig):
18+
class LightGBMPredictionServiceServicer(
19+
prediction_service_pb2_grpc.PredictionServiceServicer
20+
):
21+
def __init__(
22+
self,
23+
*,
24+
logger: logging.Logger,
25+
model_server_config: model_server_config_pb2.ModelServerConfig,
26+
):
1827
self.logger = logger
1928
self.model_map = {}
2029
for model_config in model_server_config.model_config_list.config:
21-
with open(os.path.join(model_config.base_path, model_config.name) + ".pickled", 'rb') as opened_model:
30+
with open(
31+
os.path.join(model_config.base_path, model_config.name) + ".pickled",
32+
"rb",
33+
) as opened_model:
2234
model = pickle.load(opened_model)
35+
pandas_categorical = model.pandas_categorical
2336

2437
if isinstance(model, Booster):
2538
feature_names = model.feature_name()
2639
best_iteration = model.best_iteration
2740
else:
2841
feature_names = model.feature_names
29-
best_iteration = getattr(model, 'best_iteration', None)
42+
best_iteration = getattr(model, "best_iteration", None)
3043

31-
self.model_map[model_config.name] = {'model': model, 'feature_names': feature_names,
32-
'best_iteration': best_iteration}
44+
self.model_map[model_config.name] = {
45+
"model": model,
46+
"feature_names": feature_names,
47+
"best_iteration": best_iteration,
48+
"pandas_categorical": pandas_categorical,
49+
}
3350

3451
def Predict(self, request: predict_pb2.PredictRequest, context: grpc.RpcContext):
3552
model_name = request.model_spec.name
3653
if model_name not in self.model_map:
37-
raise PythieServingException(f'Unknown model: {model_name}. This pythie-serving instance can only '
38-
f'serve one of the following: {",".join(self.model_map.keys())}')
54+
raise PythieServingException(
55+
f"Unknown model: {model_name}. This pythie-serving instance can only "
56+
f'serve one of the following: {",".join(self.model_map.keys())}'
57+
)
3958

4059
model_dict = self.model_map[model_name]
41-
42-
features_names = model_dict['feature_names']
43-
samples = None
60+
model = model_dict["model"]
61+
pandas_categorical = model_dict["pandas_categorical"]
62+
features_names = model_dict["feature_names"]
63+
pd_categorical_features: List[str] = []
64+
samples: Optional[List[List[Any]]] = None
4465
for feature_name in features_names:
4566
if feature_name not in request.inputs:
46-
raise PythieServingException(f'{feature_name} not set in the predict request')
67+
raise PythieServingException(
68+
f"{feature_name} not set in the predict request"
69+
)
4770

4871
nd_array = make_ndarray_from_tensor(request.inputs[feature_name])
49-
if len(nd_array.shape) != 2 or nd_array.shape[1] != 1:
50-
raise PythieServingException('All input vectors should be 1D tensor')
72+
# look up the feature's categories from the model when the feature dtype is object (i.e. string values)
73+
feature_categories = {}
74+
if nd_array.dtype == object:
75+
if pandas_categorical is None:
76+
raise PythieServingException(
77+
f"{feature_name} feature has type 'object' but "
78+
f"there is no saved pandas categories from model"
79+
)
80+
# lgbm saves categories in the same order as the categorical features appear in model.feature_name()
81+
feature_categories = {
82+
category_name: category_position
83+
for category_position, category_name in enumerate(
84+
pandas_categorical[len(pd_categorical_features)]
85+
)
86+
}
87+
pd_categorical_features.append(feature_name)
5188

52-
if samples is None:
53-
samples = [[] for _ in range(nd_array.shape[0])]
89+
if len(nd_array.shape) != 2 or nd_array.shape[1] != 1:
90+
raise PythieServingException("All input vectors should be 1D tensor")
5491

55-
for sample_index, value in enumerate(nd_array):
56-
samples[sample_index].append(value[0])
92+
samples = samples or [[] for _ in range(nd_array.shape[0])]
93+
if len(feature_categories) > 0: # get category position from its value
94+
for sample_index, value in enumerate(nd_array):
95+
samples[sample_index].append(
96+
feature_categories[value[0].decode("utf-8")]
97+
)
98+
else:
99+
for sample_index, value in enumerate(nd_array):
100+
samples[sample_index].append(value[0])
57101

58-
model = model_dict['model']
59102
kwargs = {}
60-
if model_dict['best_iteration']:
61-
kwargs['best_iteration'] = model_dict['best_iteration']
103+
if model_dict["best_iteration"]:
104+
kwargs["best_iteration"] = model_dict["best_iteration"]
62105
return model.predict(samples, **kwargs)

src/pythie_serving/utils.py

Lines changed: 45 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,47 @@
1-
from typing import List, Any, Type
1+
from typing import Any, List, Type
22

33
import numpy as np
44

5-
from .tensorflow_proto.tensorflow.core.framework import tensor_pb2, tensor_shape_pb2, types_pb2
6-
7-
_types_map = (
8-
(np.int32, types_pb2.DT_INT32), (np.int64, types_pb2.DT_INT64), (np.float32, types_pb2.DT_FLOAT),
9-
(np.float64, types_pb2.DT_DOUBLE), (np.bool_, types_pb2.DT_BOOL), (np.bytes_, types_pb2.DT_STRING)
5+
from .tensorflow_proto.tensorflow.core.framework import (
6+
tensor_pb2,
7+
tensor_shape_pb2,
8+
types_pb2,
109
)
1110

12-
_TF_TYPE_MAP = {tf_type: np_type for np_type, tf_type in _types_map}
13-
_NP_TYPE_MAP = {np_type: tf_type for np_type, tf_type in _types_map}
11+
# from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/framework/dtypes.py
12+
_TF_TO_NP = {
13+
types_pb2.DT_HALF: np.float16,
14+
types_pb2.DT_FLOAT: np.float32,
15+
types_pb2.DT_DOUBLE: np.float64,
16+
types_pb2.DT_INT32: np.int32,
17+
types_pb2.DT_UINT8: np.uint8,
18+
types_pb2.DT_UINT16: np.uint16,
19+
types_pb2.DT_UINT32: np.uint32,
20+
types_pb2.DT_UINT64: np.uint64,
21+
types_pb2.DT_INT16: np.int16,
22+
types_pb2.DT_INT8: np.int8,
23+
# NOTE(touts): For strings we use object as it supports variable length strings.
24+
types_pb2.DT_STRING: object,
25+
types_pb2.DT_COMPLEX64: np.complex64,
26+
types_pb2.DT_COMPLEX128: np.complex128,
27+
types_pb2.DT_INT64: np.int64,
28+
types_pb2.DT_BOOL: np.bool_,
29+
}
30+
31+
_NP_TO_TF = {nt: tt for tt, nt in _TF_TO_NP.items()}
32+
_NP_TO_TF[np.bytes_] = types_pb2.DT_STRING
33+
_NP_TO_TF[np.str_] = types_pb2.DT_STRING
1434

1535

1636
def get_tf_type(np_dtype: Type):
1737
"""
18-
:param np_type: python Type
38+
:param np_dtype: python Type
1939
:return: types_pb2.DataType
2040
"""
2141
try:
22-
return _NP_TYPE_MAP[np_dtype.type]
42+
return _NP_TO_TF[np_dtype.type]
2343
except KeyError:
24-
raise TypeError(f'Could not infer tensorflow type for {np_dtype.type}')
44+
raise TypeError(f"Could not infer tensorflow type for {np_dtype.type}")
2545

2646

2747
def get_np_dtype(tf_type: types_pb2.DataType):
@@ -30,9 +50,9 @@ def get_np_dtype(tf_type: types_pb2.DataType):
3050
:return: np.dtype
3151
"""
3252
try:
33-
return np.dtype(_TF_TYPE_MAP[tf_type])
53+
return np.dtype(_TF_TO_NP[tf_type])
3454
except KeyError:
35-
raise TypeError(f'Could not infer numpy type for {tf_type}')
55+
raise TypeError(f"Could not infer numpy type for {tf_type}")
3656

3757

3858
def make_tensor_proto(values: List[Any]):
@@ -59,23 +79,25 @@ def make_tensor_proto(values: List[Any]):
5979
for vector in np_array:
6080
for s in vector:
6181
if not isinstance(s, bytes):
62-
raise TypeError(f'{values} expect a list of bytes when working with DT_STRING types')
63-
string_val.append(s)
64-
tensor_kwargs['string_val'] = string_val
82+
raise TypeError(
83+
f"{values} expect a list of bytes when working with DT_STRING types"
84+
)
85+
string_val.append(s)
86+
tensor_kwargs["string_val"] = string_val
6587
else:
66-
tensor_kwargs['tensor_content'] = np_array.tobytes()
88+
tensor_kwargs["tensor_content"] = np_array.tobytes()
6789
return tensor_pb2.TensorProto(
68-
dtype=dtype,
69-
tensor_shape=tensor_shape_proto,
70-
**tensor_kwargs
90+
dtype=dtype, tensor_shape=tensor_shape_proto, **tensor_kwargs
7191
)
7292

7393

7494
def make_ndarray_from_tensor(tensor: tensor_pb2.TensorProto):
7595
shape = [d.size for d in tensor.tensor_shape.dim]
7696
np_dtype = get_np_dtype(tensor.dtype)
7797
if tensor.tensor_content:
78-
return np.frombuffer(tensor.tensor_content, dtype=np_dtype).copy().reshape(shape)
98+
return (
99+
np.frombuffer(tensor.tensor_content, dtype=np_dtype).copy().reshape(shape)
100+
)
79101

80102
if tensor.dtype == types_pb2.DT_FLOAT:
81103
values = np.fromiter(tensor.float_val, dtype=np_dtype)
@@ -85,6 +107,8 @@ def make_ndarray_from_tensor(tensor: tensor_pb2.TensorProto):
85107
values = np.fromiter(tensor.int_val, dtype=np_dtype)
86108
elif tensor.dtype == types_pb2.DT_BOOL:
87109
values = np.fromiter(tensor.bool_val, dtype=np_dtype)
110+
elif tensor.dtype == types_pb2.DT_STRING:
111+
values = np.array(tensor.string_val, dtype=np_dtype)
88112
else:
89113
raise TypeError("Unsupported tensor type: %s" % tensor.dtype)
90114

0 commit comments

Comments
 (0)