-
Notifications
You must be signed in to change notification settings - Fork 0
/
feature_extraction.py
89 lines (85 loc) · 2.87 KB
/
feature_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
"""
M5Forecast - Feature extraction
Implementation of the M5 Forecasting (Uncertainty) challenge on Kaggle: https://www.kaggle.com/c/m5-forecasting-uncertainty/
Created: 22 may 2020
"""
import numpy as np
import pandas as pd
# adapted from https://www.kaggle.com/robertburbidge/lightgbm-poisson-w-scaled-pinball-loss
def _add_demand_features(data, days_pred):
    """Add lagged and rolling-window statistics of ``demand``, per ``id``."""
    # Build the grouping once; every demand feature reuses the same groups.
    demand_by_id = data.groupby(["id"])["demand"]

    # Plain lags of days_pred, days_pred + 1 and days_pred + 2 rows.
    for diff in (0, 1, 2):
        shift = days_pred + diff
        data[f"shift_t{shift}"] = demand_by_id.transform(
            lambda x: x.shift(shift)
        )

    # Rolling statistics are computed on the series lagged by days_pred rows.
    # The std columns are all created before the mean columns so the output
    # column order stays the same as before.
    windows = (7, 30, 60, 90, 180)
    for size in windows:
        data[f"rolling_std_t{size}"] = demand_by_id.transform(
            lambda x: x.shift(days_pred).rolling(size).std()
        )
    for size in windows:
        data[f"rolling_mean_t{size}"] = demand_by_id.transform(
            lambda x: x.shift(days_pred).rolling(size).mean()
        )

    data["rolling_skew_t30"] = demand_by_id.transform(
        lambda x: x.shift(days_pred).rolling(30).skew()
    )
    data["rolling_kurt_t30"] = demand_by_id.transform(
        lambda x: x.shift(days_pred).rolling(30).kurt()
    )


def _add_price_features(data):
    """Add price-movement features derived from ``sell_price``, per ``id``."""
    price_by_id = data.groupby(["id"])["sell_price"]

    # Relative change versus the previous row's price.
    data["shift_price_t1"] = price_by_id.transform(lambda x: x.shift(1))
    data["price_change_t1"] = (data["shift_price_t1"] - data["sell_price"]) / (
        data["shift_price_t1"]
    )

    # Relative change versus the maximum price over the previous 365 rows
    # (presumably one year of daily rows -- confirm against the caller).
    data["rolling_price_max_t365"] = price_by_id.transform(
        lambda x: x.shift(1).rolling(365).max()
    )
    data["price_change_t365"] = (
        data["rolling_price_max_t365"] - data["sell_price"]
    ) / (data["rolling_price_max_t365"])

    # Price volatility over the last 7 and 30 rows (current row included).
    data["rolling_price_std_t7"] = price_by_id.transform(
        lambda x: x.rolling(7).std()
    )
    data["rolling_price_std_t30"] = price_by_id.transform(
        lambda x: x.rolling(30).std()
    )


def _week_of_year(dates):
    """Return the week number of a datetime Series.

    ``Series.dt.week`` was deprecated in pandas 1.1 and removed in 2.0, so
    ``dt.isocalendar().week`` is used when available, with a fallback to the
    legacy attribute on older pandas versions.
    """
    accessor = dates.dt
    if hasattr(accessor, "isocalendar"):
        return accessor.isocalendar().week
    return accessor.week


def _add_time_features(data, dt_col):
    """Parse ``dt_col`` as datetime and add calendar features derived from it."""
    data[dt_col] = pd.to_datetime(data[dt_col])

    attrs = [
        "year",
        "quarter",
        "month",
        "week",
        "day",
        "dayofweek",
        "is_year_end",
        "is_year_start",
        "is_quarter_end",
        "is_quarter_start",
        "is_month_end",
        "is_month_start",
    ]
    for attr in attrs:
        # Only the year needs more than 8 bits; everything else fits in int8.
        dtype = np.int16 if attr == "year" else np.int8
        if attr == "week":
            values = _week_of_year(data[dt_col])
        else:
            values = getattr(data[dt_col].dt, attr)
        data[attr] = values.astype(dtype)

    # dayofweek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend.
    data["is_weekend"] = data["dayofweek"].isin([5, 6]).astype(np.int8)


def aggregate_adapted_fe(data, DAYS_PRED=28):
    """Add demand, price and calendar features to ``data`` in place.

    Rows are assumed to be in chronological order within each ``id``; the
    shifts and rolling windows operate on row position, not on dates
    (NOTE(review): confirm the caller sorts by date).

    Args:
        data: DataFrame with at least the columns ``id``, ``demand`` and
            ``date``. If a ``sell_price`` column is present, price features
            are added as well. The frame is modified in place.
        DAYS_PRED: Number of rows by which ``demand`` is lagged before any
            lag or rolling feature is computed.

    Returns:
        The same ``data`` object with the new feature columns:
        ``shift_t{DAYS_PRED..DAYS_PRED+2}``, ``rolling_std_t*``,
        ``rolling_mean_t*``, ``rolling_skew_t30``, ``rolling_kurt_t30``,
        the price features (only when ``sell_price`` exists), the calendar
        attributes and ``is_weekend``. The ``date`` column is converted to
        datetime.
    """
    # Demand features (variables generated from past quantities).
    _add_demand_features(data, DAYS_PRED)

    # Price features: price movement (rate of price change, ratio to the
    # maximum price over the past year, etc.). Skipped without sell_price.
    if "sell_price" in data.columns:
        _add_price_features(data)

    # Time features (date-related data).
    _add_time_features(data, "date")

    return data