Skip to content

Commit 253378a

Browse files
authored
Merge pull request microsoft#378 from D-X-Y/main
Add MultiSegRecord and add a `segment` keyword argument to model.predict
2 parents 0387eaf + f809f0a commit 253378a

27 files changed

+328
-134
lines changed

qlib/contrib/model/__init__.py

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
try:
4+
from .catboost_model import CatBoostModel
5+
except ModuleNotFoundError:
6+
CatBoostModel = None
7+
print("Please install necessary libs for CatBoostModel.")
8+
try:
9+
from .double_ensemble import DEnsembleModel
10+
from .gbdt import LGBModel
11+
except ModuleNotFoundError:
12+
DEnsembleModel, LGBModel = None, None
13+
print("Please install necessary libs for DEnsembleModel and LGBModel, such as lightgbm.")
14+
try:
15+
from .xgboost import XGBModel
16+
except ModuleNotFoundError:
17+
XGBModel = None
18+
print("Please install necessary libs for XGBModel, such as xgboost.")
19+
try:
20+
from .linear import LinearModel
21+
except ModuleNotFoundError:
22+
LinearModel = None
23+
print("Please install necessary libs for LinearModel, such as scipy and sklearn.")
24+
# import pytorch models
25+
try:
26+
from .pytorch_alstm import ALSTM
27+
from .pytorch_gats import GATs
28+
from .pytorch_gru import GRU
29+
from .pytorch_lstm import LSTM
30+
from .pytorch_nn import DNNModelPytorch
31+
from .pytorch_tabnet import TabnetModel
32+
from .pytorch_sfm import SFM_Model
33+
34+
pytorch_classes = (ALSTM, GATs, GRU, LSTM, DNNModelPytorch, TabnetModel, SFM_Model)
35+
except ModuleNotFoundError:
36+
pytorch_classes = ()
37+
print("Please install necessary libs for PyTorch models.")
38+
39+
all_model_classes = (CatBoostModel, DEnsembleModel, LGBModel, XGBModel, LinearModel) + pytorch_classes

qlib/contrib/model/catboost_model.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import numpy as np
55
import pandas as pd
6+
from typing import Text, Union
67
from catboost import Pool, CatBoost
78
from catboost.utils import get_gpu_device_count
89

@@ -62,10 +63,10 @@ def fit(
6263
evals_result["train"] = list(evals_result["learn"].values())[0]
6364
evals_result["valid"] = list(evals_result["validation"].values())[0]
6465

65-
def predict(self, dataset):
66+
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
6667
if self.model is None:
6768
raise ValueError("model is not fitted yet!")
68-
x_test = dataset.prepare("test", col_set="feature")
69+
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
6970
return pd.Series(self.model.predict(x_test.values), index=x_test.index)
7071

7172

qlib/contrib/model/double_ensemble.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import lightgbm as lgb
55
import numpy as np
66
import pandas as pd
7-
7+
from typing import Text, Union
88
from ...model.base import Model
99
from ...data.dataset import DatasetH
1010
from ...data.dataset.handler import DataHandlerLP
@@ -40,6 +40,10 @@ def __init__(
4040
self.bins_sr = bins_sr
4141
self.bins_fs = bins_fs
4242
self.decay = decay
43+
if sample_ratios is None: # the default values for sample_ratios
44+
sample_ratios = [0.8, 0.7, 0.6, 0.5, 0.4]
45+
if sub_weights is None: # the default values for sub_weights
46+
sub_weights = [1.0, 0.2, 0.2, 0.2, 0.2, 0.2]
4347
if not len(sample_ratios) == bins_fs:
4448
raise ValueError("The length of sample_ratios should be equal to bins_fs.")
4549
self.sample_ratios = sample_ratios
@@ -228,10 +232,10 @@ def retrieve_loss_curve(self, model, df_train, features):
228232
raise ValueError("not implemented yet")
229233
return loss_curve
230234

231-
def predict(self, dataset):
235+
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
232236
if self.ensemble is None:
233237
raise ValueError("model is not fitted yet!")
234-
x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I)
238+
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
235239
pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index)
236240
for i_sub, submodel in enumerate(self.ensemble):
237241
feat_sub = self.sub_features[i_sub]

qlib/contrib/model/gbdt.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import numpy as np
55
import pandas as pd
66
import lightgbm as lgb
7-
7+
from typing import Text, Union
88
from ...model.base import ModelFT
99
from ...data.dataset import DatasetH
1010
from ...data.dataset.handler import DataHandlerLP
@@ -61,10 +61,10 @@ def fit(
6161
evals_result["train"] = list(evals_result["train"].values())[0]
6262
evals_result["valid"] = list(evals_result["valid"].values())[0]
6363

64-
def predict(self, dataset):
64+
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
6565
if self.model is None:
6666
raise ValueError("model is not fitted yet!")
67-
x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I)
67+
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
6868
return pd.Series(self.model.predict(x_test.values), index=x_test.index)
6969

7070
def finetune(self, dataset: DatasetH, num_boost_round=10, verbose_eval=20):

qlib/contrib/model/linear.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
import numpy as np
55
import pandas as pd
6-
6+
from typing import Text, Union
77
from scipy.optimize import nnls
88
from sklearn.linear_model import LinearRegression, Ridge, Lasso
99

@@ -84,8 +84,8 @@ def _fit_nnls(self, X, y):
8484
self.coef_ = coef
8585
self.intercept_ = 0.0
8686

87-
def predict(self, dataset):
87+
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
8888
if self.coef_ is None:
8989
raise ValueError("model is not fitted yet!")
90-
x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I)
90+
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
9191
return pd.Series(x_test.values @ self.coef_ + self.intercept_, index=x_test.index)

qlib/contrib/model/pytorch_alstm.py

+4-8
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,9 @@
88
import os
99
import numpy as np
1010
import pandas as pd
11+
from typing import Text, Union
1112
import copy
12-
from ...utils import (
13-
unpack_archive_with_buffer,
14-
save_multiple_parts_file,
15-
get_or_create_path,
16-
drop_nan_by_y_index,
17-
)
13+
from ...utils import get_or_create_path
1814
from ...log import get_module_logger
1915

2016
import torch
@@ -273,11 +269,11 @@ def fit(
273269
if self.use_gpu:
274270
torch.cuda.empty_cache()
275271

276-
def predict(self, dataset):
272+
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
277273
if not self.fitted:
278274
raise ValueError("model is not fitted yet!")
279275

280-
x_test = dataset.prepare("test", col_set="feature")
276+
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
281277
index = x_test.index
282278
self.ALSTM_model.eval()
283279
x_values = x_test.values

qlib/contrib/model/pytorch_alstm_ts.py

+4-8
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,9 @@
88
import os
99
import numpy as np
1010
import pandas as pd
11+
from typing import Text, Union
1112
import copy
12-
from ...utils import (
13-
unpack_archive_with_buffer,
14-
save_multiple_parts_file,
15-
get_or_create_path,
16-
drop_nan_by_y_index,
17-
)
13+
from ...utils import get_or_create_path
1814
from ...log import get_module_logger
1915

2016
import torch
@@ -264,11 +260,11 @@ def fit(
264260
if self.use_gpu:
265261
torch.cuda.empty_cache()
266262

267-
def predict(self, dataset):
263+
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
268264
if not self.fitted:
269265
raise ValueError("model is not fitted yet!")
270266

271-
dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
267+
dl_test = dataset.prepare(segment, col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
272268
dl_test.config(fillna_type="ffill+bfill")
273269
test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs)
274270
self.ALSTM_model.eval()

qlib/contrib/model/pytorch_gats.py

+4-9
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,9 @@
88
import os
99
import numpy as np
1010
import pandas as pd
11+
from typing import Text, Union
1112
import copy
12-
from ...utils import (
13-
unpack_archive_with_buffer,
14-
save_multiple_parts_file,
15-
get_or_create_path,
16-
drop_nan_by_y_index,
17-
)
13+
from ...utils import get_or_create_path
1814
from ...log import get_module_logger
1915
import torch
2016
import torch.nn as nn
@@ -83,7 +79,6 @@ def __init__(
8379
self.with_pretrain = with_pretrain
8480
self.model_path = model_path
8581
self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu")
86-
self.use_gpu = torch.cuda.is_available()
8782
self.seed = seed
8883

8984
self.logger.info(
@@ -310,11 +305,11 @@ def fit(
310305
if self.use_gpu:
311306
torch.cuda.empty_cache()
312307

313-
def predict(self, dataset):
308+
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
314309
if not self.fitted:
315310
raise ValueError("model is not fitted yet!")
316311

317-
x_test = dataset.prepare("test", col_set="feature")
312+
x_test = dataset.prepare(segment, col_set="feature")
318313
index = x_test.index
319314
self.GAT_model.eval()
320315
x_values = x_test.values

qlib/contrib/model/pytorch_gats_ts.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,7 @@
99
import numpy as np
1010
import pandas as pd
1111
import copy
12-
from ...utils import (
13-
unpack_archive_with_buffer,
14-
save_multiple_parts_file,
15-
get_or_create_path,
16-
drop_nan_by_y_index,
17-
)
12+
from ...utils import get_or_create_path
1813
from ...log import get_module_logger
1914
import torch
2015
import torch.nn as nn

qlib/contrib/model/pytorch_gru.py

+4-8
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,9 @@
88
import os
99
import numpy as np
1010
import pandas as pd
11+
from typing import Text, Union
1112
import copy
12-
from ...utils import (
13-
unpack_archive_with_buffer,
14-
save_multiple_parts_file,
15-
get_or_create_path,
16-
drop_nan_by_y_index,
17-
)
13+
from ...utils import get_or_create_path
1814
from ...log import get_module_logger
1915

2016
import torch
@@ -273,11 +269,11 @@ def fit(
273269
if self.use_gpu:
274270
torch.cuda.empty_cache()
275271

276-
def predict(self, dataset):
272+
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
277273
if not self.fitted:
278274
raise ValueError("model is not fitted yet!")
279275

280-
x_test = dataset.prepare("test", col_set="feature")
276+
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
281277
index = x_test.index
282278
self.gru_model.eval()
283279
x_values = x_test.values

qlib/contrib/model/pytorch_gru_ts.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,7 @@
99
import numpy as np
1010
import pandas as pd
1111
import copy
12-
from ...utils import (
13-
unpack_archive_with_buffer,
14-
save_multiple_parts_file,
15-
get_or_create_path,
16-
drop_nan_by_y_index,
17-
)
12+
from ...utils import get_or_create_path
1813
from ...log import get_module_logger
1914

2015
import torch

qlib/contrib/model/pytorch_lstm.py

+4-12
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,9 @@
88
import os
99
import numpy as np
1010
import pandas as pd
11+
from typing import Text, Union
1112
import copy
12-
from ...utils import (
13-
unpack_archive_with_buffer,
14-
save_multiple_parts_file,
15-
get_or_create_path,
16-
drop_nan_by_y_index,
17-
)
13+
from ...utils import get_or_create_path
1814
from ...log import get_module_logger
1915

2016
import torch
@@ -268,29 +264,25 @@ def fit(
268264
if self.use_gpu:
269265
torch.cuda.empty_cache()
270266

271-
def predict(self, dataset):
267+
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
272268
if not self.fitted:
273269
raise ValueError("model is not fitted yet!")
274270

275-
x_test = dataset.prepare("test", col_set="feature")
271+
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
276272
index = x_test.index
277273
self.lstm_model.eval()
278274
x_values = x_test.values
279275
sample_num = x_values.shape[0]
280276
preds = []
281277

282278
for begin in range(sample_num)[:: self.batch_size]:
283-
284279
if sample_num - begin < self.batch_size:
285280
end = sample_num
286281
else:
287282
end = begin + self.batch_size
288-
289283
x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device)
290-
291284
with torch.no_grad():
292285
pred = self.lstm_model(x_batch).detach().cpu().numpy()
293-
294286
preds.append(pred)
295287

296288
return pd.Series(np.concatenate(preds), index=index)

qlib/contrib/model/pytorch_lstm_ts.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,7 @@
99
import numpy as np
1010
import pandas as pd
1111
import copy
12-
from ...utils import (
13-
unpack_archive_with_buffer,
14-
save_multiple_parts_file,
15-
get_or_create_path,
16-
drop_nan_by_y_index,
17-
)
12+
from ...utils import get_or_create_path
1813
from ...log import get_module_logger
1914

2015
import torch

qlib/contrib/model/pytorch_nn.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import os
99
import numpy as np
1010
import pandas as pd
11+
from typing import Text, Union
1112
from sklearn.metrics import roc_auc_score, mean_squared_error
1213

1314
import torch
@@ -18,7 +19,7 @@
1819
from ...model.base import Model
1920
from ...data.dataset import DatasetH
2021
from ...data.dataset.handler import DataHandlerLP
21-
from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, get_or_create_path, drop_nan_by_y_index
22+
from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, get_or_create_path
2223
from ...log import get_module_logger
2324
from ...workflow import R
2425

@@ -48,8 +49,8 @@ class DNNModelPytorch(Model):
4849

4950
def __init__(
5051
self,
51-
input_dim,
52-
output_dim,
52+
input_dim=360,
53+
output_dim=1,
5354
layers=(256,),
5455
lr=0.001,
5556
max_steps=300,
@@ -271,13 +272,12 @@ def get_loss(self, pred, w, target, loss_type):
271272
else:
272273
raise NotImplementedError("loss {} is not supported!".format(loss_type))
273274

274-
def predict(self, dataset):
275+
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
275276
if not self.fitted:
276277
raise ValueError("model is not fitted yet!")
277-
x_test_pd = dataset.prepare("test", col_set="feature")
278+
x_test_pd = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
278279
x_test = torch.from_numpy(x_test_pd.values).float().to(self.device)
279280
self.dnn_model.eval()
280-
281281
with torch.no_grad():
282282
preds = self.dnn_model(x_test).detach().cpu().numpy()
283283
return pd.Series(np.squeeze(preds), index=x_test_pd.index)

0 commit comments

Comments
 (0)