-
Notifications
You must be signed in to change notification settings - Fork 1
/
feature_engineering.py
85 lines (66 loc) · 2.45 KB
/
feature_engineering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import time
from datetime import datetime
import numpy as np
import config
def to_timestamp(s):
    """
    Converts a "DD/MM/YYYY" date string to epoch seconds as a numpy int32.

    The string is parsed with the "%d/%m/%Y" format (day first), taken at
    midnight, and converted with ``time.mktime``, which interprets the
    time tuple as *local* time -- so the result depends on the timezone of
    the machine running this code.

    NOTE(review): the int32 result cannot represent dates past early 2038.
    """
    parsed = datetime.strptime(s, "%d/%m/%Y")
    epoch_seconds = time.mktime(parsed.timetuple())
    return np.int32(epoch_seconds)
def calculate_log_close_mean_inplace(df, lag):
    """
    Adds the column ``log_close/mean_{lag}`` to ``df`` in-place.

    Each value is ``log(close / moving_average)``, where the moving average
    is the equal-weighted mean of the current and the previous ``lag - 1``
    values of ``df["close"]``.

    The first ``lag - 1`` rows have no complete window; their divisor is
    set to 1.0, so those rows simply hold ``log(close)``.

    Nothing is returned; ``df`` is mutated.
    """
    closes = df["close"]

    # Trailing simple moving average: one value per complete window.
    window_weights = np.ones(lag) / lag
    moving_avg = np.convolve(closes, window_weights, mode="valid")

    # Pad the front with ones so the divisor lines up row-for-row with df.
    warmup = np.ones(lag - 1)
    divisor = np.concatenate((warmup, moving_avg))

    df[f"log_close/mean_{lag}"] = np.log(closes / divisor)
def calculate_log_return_inplace(df, lag):
    """
    Adds the column ``log_return_{lag}`` to ``df`` in-place.

    Each value is ``log(close[i] / close[i - lag])``.

    The lagged series is built with ``np.roll``, which is circular: the
    first ``lag`` rows are divided by the *last* ``lag`` closes of the
    frame rather than being left undefined.

    Nothing is returned; ``df`` is mutated.
    """
    closes = df["close"]
    lagged_closes = np.roll(closes, lag)
    df[f"log_return_{lag}"] = np.log(closes / lagged_closes)
def process_train_data(df):
    """
    Flags training rows and drops rows older than the usable window.

    Adds a ``train_flg`` column to ``df`` in-place: 0 for rows whose index
    is on or after 12 March 2021, 1 for earlier rows. Returns only the rows
    whose index is on or after 12 January 2019.

    The index is compared directly against the values produced by
    ``to_timestamp``.
    """
    validation_start = to_timestamp("12/03/2021")
    in_validation = df.index >= validation_start
    df["train_flg"] = np.where(in_validation, 0, 1)

    history_start = to_timestamp("12/01/2019")
    recent_enough = df.index >= history_start
    return df.loc[recent_enough]
def get_features(df, train=True, n_lags=336):
    """
    Builds the feature and target columns on ``df`` and returns the usable rows.

    For every window in ``config.LAGS`` six columns are added, in this order:
    the log close/mean ratio, the log return, the mean of each of those two
    columns taken over the whole frame (a single value repeated on every
    row), and each column minus its mean.

    After that, ``lag_1`` .. ``lag_{n_lags}`` hold ``close`` shifted back by
    that many rows, and ``target`` holds the next row's ``close``.

    ``df`` is mutated (all columns are added to it). The returned frame is
    ``df.iloc[n_lags:-1]``: the first ``n_lags`` rows and the final row are
    dropped because shifting leaves NaNs there.

    ``train`` is accepted but currently has no effect.
    """
    # TODO - see if skipping train_flg does anything; the call
    # `df = process_train_data(df)` for train=True is currently disabled.

    for window in config.LAGS:
        calculate_log_close_mean_inplace(df, window)
        calculate_log_return_inplace(df, window)

        ratio_col = f"log_close/mean_{window}"
        return_col = f"log_return_{window}"
        ratio_mean_col = f"mean_close/mean_{window}"
        return_mean_col = f"mean_log_returns_{window}"

        # Whole-column means, broadcast to every row.
        df[ratio_mean_col] = np.mean(df[ratio_col])
        df[return_mean_col] = np.mean(df[return_col])

        # Deviation of each feature from its column mean.
        df[f"{ratio_col}-{ratio_mean_col}"] = df[ratio_col] - df[ratio_mean_col]
        df[f"{return_col}-{return_mean_col}"] = df[return_col] - df[return_mean_col]

    # Raw lagged closes, then the one-step-ahead close as the target.
    closes = df["close"]
    for step in range(1, n_lags + 1):
        df[f"lag_{step}"] = closes.shift(step)
    df["target"] = closes.shift(-1)

    # Trim the rows whose lag/target values are NaN from shifting.
    trimmed = df.iloc[n_lags:-1]

    # TODO - check later if this is necessary: for train=True, keep only
    # rows with df["timestamp"] >= to_timestamp("12/01/2019").
    return trimmed