Commit 7e07e25

Author: Arnaud YANKWA WANDJI (committed)
feat(lgbm): add logic to convert string features to int before prediction
1 parent 4a0982b commit 7e07e25

4 files changed, +113 -48 lines

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ sdist/
 var/
 .idea/
 *.egg-info/
+.debug
 .installed.cfg
 *.egg

VERSION

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-1.2.0
+1.3.0

Lines changed: 66 additions & 26 deletions

@@ -1,61 +1,101 @@
-import pickle
 import logging
+import pickle
+from typing import List, Optional, Any

 import grpc
-
 from lightgbm import Booster

+from .exceptions import PythieServingException
+from .tensorflow_proto.tensorflow_serving.apis import (
+    predict_pb2,
+    prediction_service_pb2_grpc,
+)
 from .tensorflow_proto.tensorflow_serving.config import model_server_config_pb2
-from .tensorflow_proto.tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc
 from .utils import make_ndarray_from_tensor
-from .exceptions import PythieServingException


-class LightGBMPredictionServiceServicer(prediction_service_pb2_grpc.PredictionServiceServicer):
-
-    def __init__(self, *, logger: logging.Logger, model_server_config: model_server_config_pb2.ModelServerConfig):
+class LightGBMPredictionServiceServicer(
+    prediction_service_pb2_grpc.PredictionServiceServicer
+):
+    def __init__(
+        self,
+        *,
+        logger: logging.Logger,
+        model_server_config: model_server_config_pb2.ModelServerConfig,
+    ):
         self.logger = logger
         self.model_map = {}
         for model_config in model_server_config.model_config_list.config:
-            with open(model_config.base_path, 'rb') as opened_model:
+            with open(model_config.base_path, "rb") as opened_model:
                 model = pickle.load(opened_model)
+            pandas_categorical = model.pandas_categorical

             if isinstance(model, Booster):
                 feature_names = model.feature_name()
                 best_iteration = model.best_iteration
             else:
                 feature_names = model.feature_names
-                best_iteration = getattr(model, 'best_iteration', None)
+                best_iteration = getattr(model, "best_iteration", None)

-            self.model_map[model_config.name] = {'model': model, 'feature_names': feature_names,
-                                                 'best_iteration': best_iteration}
+            self.model_map[model_config.name] = {
+                "model": model,
+                "feature_names": feature_names,
+                "best_iteration": best_iteration,
+                "pandas_categorical": pandas_categorical,
+            }

     def Predict(self, request: predict_pb2.PredictRequest, context: grpc.RpcContext):
         model_name = request.model_spec.name
         if model_name not in self.model_map:
-            raise PythieServingException(f'Unknown model: {model_name}. This pythie-serving instance can only '
-                                         f'serve one of the following: {",".join(self.model_map.keys())}')
+            raise PythieServingException(
+                f"Unknown model: {model_name}. This pythie-serving instance can only "
+                f'serve one of the following: {",".join(self.model_map.keys())}'
+            )

         model_dict = self.model_map[model_name]
-
-        features_names = model_dict['feature_names']
-        samples = None
+        model = model_dict["model"]
+        pandas_categorical = model_dict["pandas_categorical"]
+        features_names = model_dict["feature_names"]
+        pd_categorical_features: List[str] = []
+        samples: Optional[List[List[Any]]] = None
         for feature_name in features_names:
             if feature_name not in request.inputs:
-                raise PythieServingException(f'{feature_name} not set in the predict request')
+                raise PythieServingException(
+                    f"{feature_name} not set in the predict request"
+                )

             nd_array = make_ndarray_from_tensor(request.inputs[feature_name])
-            if len(nd_array.shape) != 2 or nd_array.shape[1] != 1:
-                raise PythieServingException('All input vectors should be 1D tensor')
+            # get features categories from model if feature dtype is object ("string")
+            feature_categories = {}
+            if nd_array.dtype == object:
+                if pandas_categorical is None:
+                    raise PythieServingException(
+                        f"{feature_name} feature has type 'object' but "
+                        f"there is no saved pandas categories from model"
+                    )
+                # lgbm save categories in the same order categorical features appear in model.feature_name()
+                feature_categories = {
+                    category_name: category_position
+                    for category_position, category_name in enumerate(
+                        pandas_categorical[len(pd_categorical_features)]
+                    )
+                }
+                pd_categorical_features.append(feature_name)

-            if samples is None:
-                samples = [[] for _ in range(nd_array.shape[0])]
+            if len(nd_array.shape) != 2 or nd_array.shape[1] != 1:
+                raise PythieServingException("All input vectors should be 1D tensor")

-            for sample_index, value in enumerate(nd_array):
-                samples[sample_index].append(value[0])
+            samples = samples or [[] for _ in range(nd_array.shape[0])]
+            if len(feature_categories) > 0:  # get category position from its value
+                for sample_index, value in enumerate(nd_array):
+                    samples[sample_index].append(
+                        feature_categories[value[0].decode("utf-8")]
+                    )
+            else:
+                for sample_index, value in enumerate(nd_array):
+                    samples[sample_index].append(value[0])

-        model = model_dict['model']
         kwargs = {}
-        if model_dict['best_iteration']:
-            kwargs['best_iteration'] = model_dict['best_iteration']
+        if model_dict["best_iteration"]:
+            kwargs["best_iteration"] = model_dict["best_iteration"]
         return model.predict(samples, **kwargs)
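
For context, here is a minimal standalone sketch (not part of the commit) of the conversion the new Predict code applies to a string feature: each bytes value from the request tensor is decoded and replaced by its integer position in the model's saved pandas categories. The category list, feature name and input values below are hypothetical.

import numpy as np

# Hypothetical saved categories: LightGBM keeps one list per categorical feature,
# in the order the categorical features appear in model.feature_name().
pandas_categorical = [["blue", "green", "red"]]
pd_categorical_features = []

# One string feature column as it comes out of make_ndarray_from_tensor (DT_STRING -> object dtype).
nd_array = np.array([[b"green"], [b"red"]], dtype=object)

# Map each category value to its position, as the new Predict code does.
feature_categories = {
    category_name: category_position
    for category_position, category_name in enumerate(
        pandas_categorical[len(pd_categorical_features)]
    )
}
pd_categorical_features.append("color")  # hypothetical feature name

# Replace each decoded string by its integer category position.
samples = [[] for _ in range(nd_array.shape[0])]
for sample_index, value in enumerate(nd_array):
    samples[sample_index].append(feature_categories[value[0].decode("utf-8")])

print(samples)  # [[1], [2]] -> "green" is category 1, "red" is category 2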

src/pythie_serving/utils.py

Lines changed: 45 additions & 21 deletions

@@ -1,27 +1,47 @@
-from typing import List, Any, Type
+from typing import Any, List, Type

 import numpy as np

-from .tensorflow_proto.tensorflow.core.framework import tensor_pb2, tensor_shape_pb2, types_pb2
-
-_types_map = (
-    (np.int32, types_pb2.DT_INT32), (np.int64, types_pb2.DT_INT64), (np.float32, types_pb2.DT_FLOAT),
-    (np.float64, types_pb2.DT_DOUBLE), (np.bool_, types_pb2.DT_BOOL), (np.bytes_, types_pb2.DT_STRING)
+from .tensorflow_proto.tensorflow.core.framework import (
+    tensor_pb2,
+    tensor_shape_pb2,
+    types_pb2,
 )

-_TF_TYPE_MAP = {tf_type: np_type for np_type, tf_type in _types_map}
-_NP_TYPE_MAP = {np_type: tf_type for np_type, tf_type in _types_map}
+# from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/framework/dtypes.py
+_TF_TO_NP = {
+    types_pb2.DT_HALF: np.float16,
+    types_pb2.DT_FLOAT: np.float32,
+    types_pb2.DT_DOUBLE: np.float64,
+    types_pb2.DT_INT32: np.int32,
+    types_pb2.DT_UINT8: np.uint8,
+    types_pb2.DT_UINT16: np.uint16,
+    types_pb2.DT_UINT32: np.uint32,
+    types_pb2.DT_UINT64: np.uint64,
+    types_pb2.DT_INT16: np.int16,
+    types_pb2.DT_INT8: np.int8,
+    # NOTE(touts): For strings we use object as it supports variable length strings.
+    types_pb2.DT_STRING: object,
+    types_pb2.DT_COMPLEX64: np.complex64,
+    types_pb2.DT_COMPLEX128: np.complex128,
+    types_pb2.DT_INT64: np.int64,
+    types_pb2.DT_BOOL: np.bool_,
+}
+
+_NP_TO_TF = {nt: tt for tt, nt in _TF_TO_NP.items()}
+_NP_TO_TF[np.bytes_] = types_pb2.DT_STRING
+_NP_TO_TF[np.str_] = types_pb2.DT_STRING


 def get_tf_type(np_dtype: Type):
     """
-    :param np_type: python Type
+    :param np_dtype: python Type
     :return: types_pb2.DataType
     """
     try:
-        return _NP_TYPE_MAP[np_dtype.type]
+        return _NP_TO_TF[np_dtype.type]
     except KeyError:
-        raise TypeError(f'Could not infer tensorflow type for {np_dtype.type}')
+        raise TypeError(f"Could not infer tensorflow type for {np_dtype.type}")


 def get_np_dtype(tf_type: types_pb2.DataType):
@@ -30,9 +50,9 @@ def get_np_dtype(tf_type: types_pb2.DataType):
     :return: types_pb2.DataType
     """
     try:
-        return np.dtype(_TF_TYPE_MAP[tf_type])
+        return np.dtype(_TF_TO_NP[tf_type])
     except KeyError:
-        raise TypeError(f'Could not infer numpy type for {tf_type}')
+        raise TypeError(f"Could not infer numpy type for {tf_type}")


 def make_tensor_proto(values: List[Any]):
@@ -59,23 +79,25 @@ def make_tensor_proto(values: List[Any]):
         for vector in np_array:
             for s in vector:
                 if not isinstance(s, bytes):
-                    raise TypeError(f'{values} expect a list of bytes when working with DT_STRING types')
-                string_val.append(s)
-        tensor_kwargs['string_val'] = string_val
+                    raise TypeError(
+                        f"{values} expect a list of bytes when working with DT_STRING types"
+                    )
+                string_val.append(s)
+        tensor_kwargs["string_val"] = string_val
     else:
-        tensor_kwargs['tensor_content'] = np_array.tobytes()
+        tensor_kwargs["tensor_content"] = np_array.tobytes()
     return tensor_pb2.TensorProto(
-        dtype=dtype,
-        tensor_shape=tensor_shape_proto,
-        **tensor_kwargs
+        dtype=dtype, tensor_shape=tensor_shape_proto, **tensor_kwargs
     )


 def make_ndarray_from_tensor(tensor: tensor_pb2.TensorProto):
     shape = [d.size for d in tensor.tensor_shape.dim]
     np_dtype = get_np_dtype(tensor.dtype)
     if tensor.tensor_content:
-        return np.frombuffer(tensor.tensor_content, dtype=np_dtype).copy().reshape(shape)
+        return (
+            np.frombuffer(tensor.tensor_content, dtype=np_dtype).copy().reshape(shape)
+        )

     if tensor.dtype == types_pb2.DT_FLOAT:
         values = np.fromiter(tensor.float_val, dtype=np_dtype)
@@ -85,6 +107,8 @@ def make_ndarray_from_tensor(tensor: tensor_pb2.TensorProto):
         values = np.fromiter(tensor.int_val, dtype=np_dtype)
     elif tensor.dtype == types_pb2.DT_BOOL:
         values = np.fromiter(tensor.bool_val, dtype=np_dtype)
+    elif tensor.dtype == types_pb2.DT_STRING:
+        values = np.array(tensor.string_val, dtype=np_dtype)
     else:
         raise TypeError("Unsupported tensor type: %s" % tensor.dtype)
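
The new DT_STRING branch in make_ndarray_from_tensor is what feeds that lookup: string tensors come back as object-dtype arrays, which the servicer detects via nd_array.dtype == object. Below is a numpy-only sketch, not from the repository, with made-up sample values and shape.

import numpy as np

# Contents of a hypothetical DT_STRING TensorProto for one feature column:
string_val = [b"green", b"red"]  # repeated bytes field
shape = [2, 1]                   # tensor_shape: 2 samples, 1 value each

# DT_STRING maps to the object dtype in _TF_TO_NP (variable-length strings).
np_dtype = np.dtype(object)
values = np.array(string_val, dtype=np_dtype)
nd_array = values.reshape(shape)

print(nd_array.dtype == object)  # True -> triggers the category lookup in Predict
print(nd_array[0][0])            # b'green' (still bytes, decoded later with .decode("utf-8"))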
