Skip to content

Commit

Permalink
simplify input logic
Browse files Browse the repository at this point in the history
  • Loading branch information
浅梦 authored Oct 3, 2019
1 parent 924f00f commit db63fc6
Show file tree
Hide file tree
Showing 33 changed files with 231 additions and 266 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
strategy:
matrix:
python-version: [3.5,3.6,3.7]
tf-version: [1.4.0,1.14.0,2.0.0b1]
tf-version: [1.4.0,1.14.0,2.0.0]

exclude:
- python-version: 3.7
Expand Down
2 changes: 1 addition & 1 deletion deepctr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
from . import models
from .utils import check_version

__version__ = '0.6.1'
__version__ = '0.6.2'
check_version(__version__)
2 changes: 1 addition & 1 deletion deepctr/contrib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def __init__(self,

else:

total_arg_size += shape[1]#.value
total_arg_size += int(shape[1])#.value

dtype = [a.dtype for a in args][0]

Expand Down
46 changes: 18 additions & 28 deletions deepctr/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,39 +42,31 @@ def __new__(cls, name, dimension, maxlen, combiner="mean", use_hash=False, dtype
embedding_name = name
return super(VarLenSparseFeat, cls).__new__(cls, name, dimension, maxlen, combiner, use_hash, dtype, embedding_name,embedding)


def get_fixlen_feature_names(feature_columns):
    """Return the names of only the fixed-length inputs built for *feature_columns*."""
    fixlen_features = build_input_features(
        feature_columns, include_varlen=False, include_fixlen=True)
    return list(fixlen_features.keys())

def get_feature_names(feature_columns):
    """Return the model-input names derived from *feature_columns*.

    The names are the keys produced by :func:`build_input_features`, so they
    follow the order of the given columns and include any auxiliary
    sequence-length entries when those are generated.

    :param feature_columns: iterable of feature-column instances.
    :return: list of input-feature name strings.
    """
    # NOTE(review): this span was diff residue — the deleted
    # ``get_varlen_feature_names`` def (cut mid-body) fused with the new
    # function; collapsed here to the coherent replacement.
    features = build_input_features(feature_columns)
    return list(features.keys())

def get_inputs_list(inputs):
    """Flatten the values of each non-None mapping in *inputs* into a single list."""
    flattened = []
    for mapping in inputs:
        if mapping is None:
            continue
        flattened.extend(mapping.values())
    return flattened

def build_input_features(feature_columns, mask_zero=True, prefix=''):
    """Create one Keras ``Input`` layer per feature column.

    Reconstructed from diff residue: the original span interleaved the old
    and new bodies without +/- markers and contained a truncated statement
    (unclosed paren); this is the coherent post-commit implementation.

    :param feature_columns: iterable of ``SparseFeat`` / ``DenseFeat`` /
        ``VarLenSparseFeat`` instances describing the model inputs.
    :param mask_zero: when ``False``, also create an explicit sequence-length
        input and record the max length for each var-len feature, since the
        zero value can no longer serve as the padding mask.
    :param prefix: optional string prepended to every input-layer name.
    :return: ``OrderedDict`` mapping feature name -> ``Input`` tensor, in the
        order the columns were given.
    :raises TypeError: if a column is not one of the supported feature types.
    """
    input_features = OrderedDict()
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            # Sparse ids enter as a single value per sample.
            input_features[fc.name] = Input(
                shape=(1,), name=prefix + fc.name, dtype=fc.dtype)
        elif isinstance(fc, DenseFeat):
            input_features[fc.name] = Input(
                shape=(fc.dimension,), name=prefix + fc.name, dtype=fc.dtype)
        elif isinstance(fc, VarLenSparseFeat):
            input_features[fc.name] = Input(shape=(fc.maxlen,), name=prefix + fc.name,
                                            dtype=fc.dtype)
            if not mask_zero:
                # Without zero-masking, downstream pooling needs the true
                # lengths, so expose them as an extra input per feature.
                input_features[fc.name + "_seq_length"] = Input(shape=(
                    1,), name=prefix + 'seq_length_' + fc.name)
                input_features[fc.name + "_seq_max_length"] = fc.maxlen
        else:
            raise TypeError("Invalid feature column type,got", type(fc))

    return input_features

Expand Down Expand Up @@ -119,8 +111,6 @@ def create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns,
l2_reg),
name=prefix + '_seq_emb_' + feat.name,
mask_zero=seq_mask_zero)


return sparse_embedding


Expand Down
2 changes: 1 addition & 1 deletion deepctr/layers/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,7 @@ def build(self, input_shape):

if self.axis < 1 or self.axis > len(input_shape):
raise ValueError("axis must be 1~%d,now is %d" %
(len(input_shape), len(input_shape)))
(len(input_shape), self.axis))

if self.k < 1 or self.k > input_shape[self.axis]:
raise ValueError("k must be in 1 ~ %d,now k is %d" %
Expand Down
2 changes: 1 addition & 1 deletion deepctr/models/afm.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def AFM(linear_feature_columns, dnn_feature_columns, embedding_size=8, use_atten
"""


features = build_input_features(linear_feature_columns+dnn_feature_columns)
features = build_input_features(linear_feature_columns + dnn_feature_columns)

inputs_list = list(features.values())

Expand Down
2 changes: 1 addition & 1 deletion deepctr/models/ccpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def CCPM(linear_feature_columns, dnn_feature_columns, embedding_size=8, conv_ker
raise ValueError(
"conv_kernel_width must have same element with conv_filters")

features = build_input_features(linear_feature_columns+dnn_feature_columns)
features = build_input_features(linear_feature_columns + dnn_feature_columns)
inputs_list = list(features.values())

sparse_embedding_list, _ = input_from_feature_columns(features,dnn_feature_columns,embedding_size,
Expand Down
3 changes: 1 addition & 2 deletions deepctr/models/fibinet.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,7 @@ def FiBiNET(linear_feature_columns, dnn_feature_columns, embedding_size=8, bilin
:return: A Keras model instance.
"""

features = build_input_features(
linear_feature_columns + dnn_feature_columns)
features = build_input_features(linear_feature_columns + dnn_feature_columns)

inputs_list = list(features.values())

Expand Down
2 changes: 1 addition & 1 deletion deepctr/models/mlr.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def MLR(region_feature_columns, base_feature_columns=None, region_num=4,
if bias_feature_columns is None:
bias_feature_columns = []

features = build_input_features(region_feature_columns + base_feature_columns+bias_feature_columns)
features = build_input_features(region_feature_columns + base_feature_columns + bias_feature_columns)

inputs_list = list(features.values())

Expand Down
153 changes: 76 additions & 77 deletions docs/source/Examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat, DenseFeat,get_fixlen_feature_names
from deepctr.inputs import SparseFeat, DenseFeat, get_feature_names

if __name__ == "__main__":
data = pd.read_csv('./criteo_sample.txt')
Expand Down Expand Up @@ -59,14 +59,13 @@ if __name__ == "__main__":
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model

train, test = train_test_split(data, test_size=0.2)
train_model_input = [train[name] for name in fixlen_feature_names]

test_model_input = [test[name] for name in fixlen_feature_names]
train_model_input = {name:train[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}

# 4.Define Model,train,predict and evaluate
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
Expand All @@ -91,7 +90,7 @@ from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat, DenseFeat,get_fixlen_feature_names
from deepctr.inputs import SparseFeat, DenseFeat,get_feature_names

if __name__ == "__main__":
data = pd.read_csv('./criteo_sample.txt')
Expand All @@ -115,14 +114,14 @@ if __name__ == "__main__":

linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns, )
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns, )

# 3.generate input data for model

train, test = train_test_split(data, test_size=0.2)
train_model_input = [train[name] for name in fixlen_feature_names]

test_model_input = [test[name] for name in fixlen_feature_names]
train_model_input = {name:train[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}


# 4.Define Model,train,predict and evaluate
Expand Down Expand Up @@ -156,7 +155,7 @@ from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat,get_fixlen_feature_names
from deepctr.inputs import SparseFeat,get_feature_names

if __name__ == "__main__":

Expand All @@ -174,12 +173,13 @@ if __name__ == "__main__":
for feat in sparse_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model
train, test = train_test_split(data, test_size=0.2)
train_model_input = [train[name].values for name in fixlen_feature_names]
test_model_input = [test[name].values for name in fixlen_feature_names]
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

# 4.Define Model,train,predict and evaluate
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )
Expand Down Expand Up @@ -228,7 +228,7 @@ from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat, VarLenSparseFeat,get_fixlen_feature_names,get_varlen_feature_names
from deepctr.inputs import SparseFeat, VarLenSparseFeat,get_feature_names


def split(x):
Expand All @@ -239,49 +239,49 @@ def split(x):
key2index[key] = len(key2index) + 1
return list(map(lambda x: key2index[x], key_ans))

if __name__ == "__main__":
data = pd.read_csv("./movielens_sample.txt")
sparse_features = ["movie_id", "user_id",
"gender", "age", "occupation", "zip", ]
target = ['rating']

data = pd.read_csv("./movielens_sample.txt")
sparse_features = ["movie_id", "user_id",
"gender", "age", "occupation", "zip", ]
target = ['rating']
# 1.Label Encoding for sparse features,and process sequence features
for feat in sparse_features:
lbe = LabelEncoder()
data[feat] = lbe.fit_transform(data[feat])
# preprocess the sequence feature

# 1.Label Encoding for sparse features,and process sequence features
for feat in sparse_features:
lbe = LabelEncoder()
data[feat] = lbe.fit_transform(data[feat])
# preprocess the sequence feature
key2index = {}
genres_list = list(map(split, data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

key2index = {}
genres_list = list(map(split, data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )
# 2.count #unique features for each sparse field and generate feature config for sequence feature

# 2.count #unique features for each sparse field and generate feature config for sequence feature
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
for feat in sparse_features]
varlen_feature_columns = [VarLenSparseFeat('genres', len(
key2index) + 1, max_len, 'mean')] # Notice : value 0 is for padding for sequence input feature

fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
for feat in sparse_features]
varlen_feature_columns = [VarLenSparseFeat('genres', len(
key2index) + 1, max_len, 'mean')] # Notice : value 0 is for padding for sequence input feature
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
varlen_feature_names = get_varlen_feature_names(linear_feature_columns+dnn_feature_columns)
feature_names = get_feature_names(linear_feature_columns+dnn_feature_columns)


# 3.generate input data for model
fixlen_input = [data[name].values for name in fixlen_feature_names]
varlen_input = [genres_list]#varlen_feature_names[0]
model_input = fixlen_input + varlen_input # make sure the order is right
# 3.generate input data for model
model_input = {name:data[name] for name in feature_names}#
model_input["genres"] = genres_list

# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns,dnn_feature_columns,task='regression')

model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(model_input, data[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns,dnn_feature_columns,task='regression')

model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(model_input, data[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

```

Expand All @@ -293,44 +293,43 @@ import pandas as pd
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat, VarLenSparseFeat,get_fixlen_feature_names

data = pd.read_csv("./movielens_sample.txt")
sparse_features = ["movie_id", "user_id",
"gender", "age", "occupation", "zip", ]
from deepctr.inputs import SparseFeat, VarLenSparseFeat,get_feature_names

data[sparse_features] = data[sparse_features].astype(str)
target = ['rating']
if __name__ == "__main__":
data = pd.read_csv("./movielens_sample.txt")
sparse_features = ["movie_id", "user_id",
"gender", "age", "occupation", "zip", ]

# 1.Use hashing encoding on the fly for sparse features,and process sequence features
data[sparse_features] = data[sparse_features].astype(str)
target = ['rating']

genres_list = list(map(lambda x: x.split('|'), data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# 1.Use hashing encoding on the fly for sparse features,and process sequence features

# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)
genres_list = list(map(lambda x: x.split('|'), data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)

# 2.set hashing space for each sparse field and generate feature config for sequence feature
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)

fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5, use_hash=True, dtype='string')
for feat in sparse_features]
varlen_feature_columns = [VarLenSparseFeat('genres', 100, max_len, 'mean', use_hash=True,
dtype="string")] # Notice : value 0 is for padding for sequence input feature
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
# 2.set hashing space for each sparse field and generate feature config for sequence feature

# 3.generate input data for model
fixlen_input = [data[name].values for name in feature_names]
varlen_input = [genres_list]
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5, use_hash=True, dtype='string')
for feat in sparse_features]
varlen_feature_columns = [VarLenSparseFeat('genres', 100, max_len, 'mean', use_hash=True,
dtype="string")] # Notice : value 0 is for padding for sequence input feature
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

model_input = fixlen_input + varlen_input # make sure the order is right
# 3.generate input data for model
model_input = {name:data[name] for name in feature_names}
model_input['genres'] = genres_list

# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns,dnn_feature_columns, task='regression')
# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns,dnn_feature_columns, task='regression')

model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(model_input, data[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(model_input, data[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
```
Loading

0 comments on commit db63fc6

Please sign in to comment.