-
Notifications
You must be signed in to change notification settings - Fork 0
/
Gold_Prediction_Multivariant.py
197 lines (148 loc) · 7.69 KB
/
Gold_Prediction_Multivariant.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import pandas as pd
import tensorflow as tf
import random
import numpy as np
from keras import layers , Model
from keras.layers import LSTM ,Bidirectional, GlobalAveragePooling1D , Dense
from keras.optimizers import Adam , SGD
from keras.losses import CategoricalCrossentropy
import matplotlib.pyplot as plt
import os
df = pd.read_csv("C:\\Users\\thibe\\OneDrive\\Documents\\Time series\\Gold preds\\archive\\monthly_csv.csv" , parse_dates= True , index_col= ['Date'])
timesteps = df.index.to_numpy()
prices = df["Price"].to_numpy()
# Create train and test splits the right way for time series data
split_size = int(0.8 * len(prices)) # 80% train, 20% test
# Create train data splits (everything before the split)
X_train, y_train = timesteps[:split_size], prices[:split_size]
# Create test data splits (everything after the split)
X_test, y_test = timesteps[split_size:], prices[split_size:]
# Create a function to plot time series data
def plot_time_series(timesteps, values, format='.', start=0, end=None, label=None):
"""
Plots a timesteps (a series of points in time) against values (a series of values across timesteps).
Parameters
---------
timesteps : array of timesteps
values : array of values across time
format : style of plot, default "."
start : where to start the plot (setting a value will index from start of timesteps & values)
end : where to end the plot (setting a value will index from end of timesteps & values)
label : label to show on plot of values
"""
# Plot the series
plt.plot(timesteps[start:end], values[start:end], format, label=label)
plt.xlabel("Time")
plt.ylabel("Gold Price")
if label:
plt.legend(fontsize=14) # make label bigger
plt.grid(True)
def mean_absolute_scaled_error(y_true , y_pred ):
"""
Implement MASE (assuming no seasonality of data).
"""
mae = tf.reduce_mean(tf.abs(y_true - y_pred))
# Find MAE of naive forecast (no seasonality)
mae_naive_no_season = tf.reduce_mean(tf.abs(y_true[1:] - y_true[:-1])) # our seasonality is 1 day (hence the shifting of 1 day)
return mae / mae_naive_no_season
def evaluate_preds (y_true , y_pred):
# make it float32
y_true = tf.cast ( y_true , tf.float32)
y_pred = tf.cast ( y_pred , tf.float32)
mae = tf.keras.metrics.mean_absolute_error(y_true, y_pred)
mse = tf.keras.metrics.mean_squared_error(y_true, y_pred) # puts and emphasis on outliers (all errors get squared)
rmse = tf.sqrt(mse)
mape = tf.keras.metrics.mean_absolute_percentage_error(y_true, y_pred)
mase = mean_absolute_scaled_error ( y_true , y_pred)
return {"mae": mae.numpy(),
"mse": mse.numpy(),
"rmse": rmse.numpy(),
"mape": mape.numpy(),
"mase": mase.numpy()}
HORIZON =1
WINDOW = 12
# Create function to label windowed data
def get_labelled_windows(x, horizon=1):
"""
Creates labels for windowed dataset.
E.g. if horizon=1 (default)
Input: [1, 2, 3, 4, 5, 6] -> Output: ([1, 2, 3, 4, 5], [6])
"""
return x[:, :-horizon], x[:, -horizon:]
# Create function to view NumPy arrays as windows
def make_windows(x, window_size=7, horizon=1):
"""
Turns a 1D array into a 2D array of sequential windows of window_size.
"""
# 1. Create a window of specific window_size (add the horizon on the end for later labelling)
window_step = np.expand_dims(np.arange(window_size+horizon), axis=0)
# print(f"Window step:\n {window_step}")
# 2. Create a 2D array of multiple window steps (minus 1 to account for 0 indexing)
window_indexes = window_step + np.expand_dims(np.arange(len(x)-(window_size+horizon-1)), axis=0).T # create 2D array of windows of size window_size
# print(f"Window indexes:\n {window_indexes[:3], window_indexes[-3:], window_indexes.shape}")
# 3. Index on the target array (time series) with 2D array of multiple window steps
windowed_array = x[window_indexes]
# 4. Get the labelled windows
windows, labels = get_labelled_windows(windowed_array, horizon=horizon)
return windows, labels
# Make the train/test splits
def make_train_test_splits(windows, labels, test_split=0.2):
"""
Splits matching pairs of windows and labels into train and test splits.
"""
split_size = int(len(windows) * (1-test_split)) # this will default to 80% train/20% test
train_windows = windows[:split_size]
train_labels = labels[:split_size]
test_windows = windows[split_size:]
test_labels = labels[split_size:]
return train_windows, test_windows, train_labels, test_labels
full_windows, full_labels = make_windows(prices, window_size=WINDOW, horizon=HORIZON)
train_windows, test_windows, train_labels, test_labels = make_train_test_splits(full_windows, full_labels)
# Create a function to implement a ModelCheckpoint callback with a specific filename
def create_model_checkpoint(model_name, save_path="experiments"):
return tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(save_path, model_name), # create filepath to save model
verbose=0, # only output a limited amount of text
save_best_only=True) # save only the best weights
# Adding a multivariant layer so help better the model predictions
df_usa = pd.read_csv("C:\\Users\\thibe\\OneDrive\\Documents\\Time series\\Gold preds\\DX-Y.NYB (6).csv",
parse_dates=['Date'], index_col= ['Date'])
df_usa = pd.DataFrame(df_usa["Close"]).rename(columns={"Close":"USA_Price"})
gold_usa_price = df.copy()
gold_usa_price["USA_Price"] = df_usa["USA_Price"]
from sklearn.preprocessing import minmax_scale
scaled_price_block_df = pd.DataFrame(minmax_scale(gold_usa_price[["Price", "USA_Price"]]), # we need to scale the data first
columns=gold_usa_price.columns,
index=gold_usa_price.index)
scaled_price_block_df.plot(figsize=(10, 7))
gold_usa_windowed = gold_usa_price.copy()
for i in range(WINDOW):
gold_usa_windowed[f"Price+{i+1}"] = gold_usa_windowed["Price"].shift(periods = i+1)
# Let's create X & y, remove the NaN's and convert to float32 to prevent TensorFlow errors
X = gold_usa_windowed.dropna().drop("Price", axis=1).astype(np.float32)
y = gold_usa_windowed.dropna()["Price"].astype(np.float32)
# Make train and test sets
split_size = int(len(X) * 0.8)
X_train, y_train = X[:split_size], y[:split_size]
X_test, y_test = X[split_size:], y[split_size:]
# Let's build an LSTM model with the Functional API
inputs = layers.Input(shape=(WINDOW))
x = layers.Lambda(lambda x: tf.expand_dims(x, axis=1))(inputs) # expand input dimension to be compatible with LSTM
# print(x.shape)
# x = layers.LSTM(128, activation="relu", return_sequences=True)(x) # this layer will error if the inputs are not the right shape
x = layers.LSTM(128, activation="relu")(x) # using the tanh loss function results in a massive error
# print(x.shape)
# Add another optional dense layer (you could add more of these to see if they improve model performance)
# x = layers.Dense(32, activation="relu")(x)
output = layers.Dense(HORIZON)(x)
model_5 = tf.keras.Model(inputs=inputs, outputs=output, name="Multivariant_lstm_model")
# Compile model
model_5.compile(loss="mae",
optimizer=tf.keras.optimizers.Adam())
# Seems when saving the model several warnings are appearing: https://github.com/tensorflow/tensorflow/issues/47554
model_5.fit(X_train,
y_train,
epochs=100,
verbose=0,
batch_size=128,
validation_data=(test_windows, test_labels),
callbacks=[create_model_checkpoint(model_name=model_5.name)])