From ad75240eb69c30c12b91c7b222cf32d7800c1e7e Mon Sep 17 00:00:00 2001 From: Gerhardsa0 Date: Tue, 14 Nov 2023 16:30:31 +0100 Subject: [PATCH] added trainsplit and normalization of data so the bug with insane error is fixed tested also the Dataloader and now set for good model implementation --- .../tabular/containers/_timeseries_table.py | 22 +++++++------ src/safeds/ml/nn/_model.py | 32 +++++++++++++++---- .../test_timeseries_creation.py | 21 ++++++------ 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/src/safeds/data/tabular/containers/_timeseries_table.py b/src/safeds/data/tabular/containers/_timeseries_table.py index 5c12914a3..79d594291 100644 --- a/src/safeds/data/tabular/containers/_timeseries_table.py +++ b/src/safeds/data/tabular/containers/_timeseries_table.py @@ -112,6 +112,7 @@ def _from_table( # Create Time Series Object result = object.__new__(TimeSeries) + result._feature_names = feature_names result._data = table._data result._schema = table._schema result._features = table.keep_only_columns(feature_names) @@ -214,7 +215,7 @@ def features(self) -> Table: return self._features @property - def target(self) -> Column: + def target(self) -> Column: """ Get the target column of the tagged table. @@ -435,10 +436,12 @@ def _as_table(self: TimeSeries) -> Table: # for testing purposes they are here # ------------------------------------------------------------------------------------------------------------------ - def _create_all_windows_for_column(self): + def _create_all_windows_for_column(self, train_size: float): #this generator generates all windows for all feature columns def in_yield(col: Column): - ser = col._data + testsplit_index = int(col.__len__()*train_size) + #get only the training data of a column and normalize it + ser = (col._data[:testsplit_index]-col.mean())/col.standard_deviation() for i in range(len(ser) - self._window_size): yield list(ser.iloc[i : i + self._window_size]) for col_name in self._feature_names: @@ -446,23 +449,24 @@ def in_yield(col: Column): yield list(in_yield(col)) - - def _create_all_labels_for_target_column(self): + def _create_all_labels_for_target_column(self, train_size: float): #this generator generates all forecast horizons for the target column def _generate_label_windows( ): - ser = self._target._data + testsplit_index = int(self.target.__len__()*train_size) + #get only the training data of a column and normalize it + ser = (self._target._data[:testsplit_index]-self._target.mean())/self._target.standard_deviation() for i in range(len(ser) - self._window_size): yield list(ser.iloc[i + self._window_size : i + self._window_size + self._forecast_horizon]) return list((_generate_label_windows())) - def into_DataLoader(self): + def into_train_DataLoader(self, train_size: float): #code below concatenate the column like the following #f1:[w1, w2, w3] f2[w1, w2, w3] -> [w1+w1, w2+w2, w3+w3] - x_train = np.concatenate(list(self._create_all_windows_for_column()), axis=1) + x_train = np.concatenate(list(self._create_all_windows_for_column(train_size)), axis=1) #for target this will be created: [ t1, t2, t3] - y_train = np.array(self._create_all_labels_for_target_column()) + y_train = np.array(self._create_all_labels_for_target_column(train_size)) #load them into PyTorch dataset = TimeSeriesDataset(x_train,y_train) return DataLoader(dataset, batch_size=1) diff --git a/src/safeds/ml/nn/_model.py b/src/safeds/ml/nn/_model.py index 15a9cd1e5..0d3958c3a 100644 --- a/src/safeds/ml/nn/_model.py +++ b/src/safeds/ml/nn/_model.py @@ -1,12 +1,12 @@ import pandas as pd import numpy as np import torch +import time import torch.nn as nn from torch.utils.data import DataLoader from safeds.ml.nn import RNN_Layer from safeds.data.tabular.containers import Column, Table, TaggedTable, TimeSeries -from safeds.exceptions import ColumnSizeError, DuplicateColumnNameError - +from safeds.exceptions import ColumnSizeError, DuplicateColumnNameError class Model(): def __init__(self, layers : list): self._model = PyTorchModel(layers) @@ -23,18 +23,36 @@ def model_forward(self, data : DataLoader): self._model(inputs) - def train(self,x): - pass + def train(self, train_loader: DataLoader, epochs: int, learningrate : float): + start_time = time.time() + criterion = nn.MSELoss() + optimizer = torch.optim.Adam(self._model.parameters(), lr = learningrate) + + for epoch in range(epochs): + for batch in iter(train_loader): + inputs, labels = batch + optimizer.zero_grad() + + labels = labels.to(torch.float32) + inputs = inputs.to(torch.float32) + outputs = self._model(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + print(f'Epoch: {epoch+1:2} Loss {loss.item():10.8f}') + print(f'\nDuration: {time.time()-start_time:.0f} seconds') class PyTorchModel(nn.Module): def __init__(self, LayerListe :list[RNN_Layer]): super(PyTorchModel, self).__init__() - self.layerliste = [] + layers = [] for layer in LayerListe: - self.layerliste.append(layer._create_pytorch_layer()) + layers.append(layer._create_pytorch_layer()) + self._layerliste = nn.ModuleList(layers) + def forward(self, x): out = x - for layer in self.layerliste: + for layer in self._layerliste: out = layer(out) return out diff --git a/tests/safeds/data/tabular/containers/_table/_timeseries_table/test_timeseries_creation.py b/tests/safeds/data/tabular/containers/_table/_timeseries_table/test_timeseries_creation.py index 04b929af5..39eb46c12 100644 --- a/tests/safeds/data/tabular/containers/_table/_timeseries_table/test_timeseries_creation.py +++ b/tests/safeds/data/tabular/containers/_table/_timeseries_table/test_timeseries_creation.py @@ -3,6 +3,7 @@ import numpy as np import torch import torch.nn as nn +import time from torch.utils.data import DataLoader from safeds.data.tabular.containers import Column, Table, TaggedTable, TimeSeries from safeds.exceptions import ColumnSizeError, DuplicateColumnNameError @@ -12,31 +13,27 @@ def test_create_timeseries() -> None: - - table = Table(data={"f1": [1, 2, 3, 4, 6, 7], "target": [7,2, 3, 1, 3, 7], "f2": [4,7, 5, 5, 5, 7]}) - ts = TimeSeries(data={"f1": [1, 2, 3, 4, 6, 7], "target": [7,2, 3, 1, 3, 7], "f2": [4,7, 5, 5, 5, 7]}, - target_name="target", - date_name="f1", - window_size=2, - forecast_horizon=1, - feature_names=["f1", "f2", "target"]) + table = Table.from_csv_file(r"tests\resources\Alcohol_Sales (1).csv") + ts = TimeSeries._from_table(table,target_name="S4248SM144NCEN", date_name="DATE", window_size=12, forecast_horizon=1, feature_names=["S4248SM144NCEN"]) # ein Modell erstellen ist in safeDS noch nicht definiert darum low level in PyTorch # 2 ist hier die number der feature Columns input_dim = ts._window_size * len(ts._feature_names) - hidden_dim = 1 + hidden_dim = 256 output_dim = ts._forecast_horizon layer1 = RNN_Layer(input_dim, hidden_dim) layer2 = RNN_Layer(hidden_dim, output_dim) model = Model([layer1, layer2]) + #model.train(ts.into_DataLoader(), 5, 0.01) - #damit der Datensatz low level laden kann hier into_Dataloader - model.model_forward(ts.into_DataLoader()) + #wenn durchläuft wurde korrekt Table in Dataloader geladen - #assert False + assert False + +