diff --git a/README.md b/README.md index 71e33cd7..7f8a3934 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,19 @@ It is important to note that there are arbitrary choices in this toolkit, but th The 3W Toolkit is implemented in sub-modules as discribed [here](3W_TOOLKIT_STRUCTURE.md). +### Loading the 3W Dataset 2.0 + +The `load_3w_dataset()` function loads the 3W Dataset 2.0, which is composed of multiple Parquet files organized in folders. + +**Usage:** + +```python +import toolkit as tk + +# Load the real data from the 3W Dataset 2.0 +df = tk.load_3w_dataset(data_type='real', base_path='path/to/dataset') +``` + ## Incorporated Problems Specific problems will be incorporated into this project gradually. At this point, we can work on: diff --git a/problems/01_binary_classifier_of_spurious_closure_of_dhsv/_baseline/main.ipynb b/problems/01_binary_classifier_of_spurious_closure_of_dhsv/_baseline/main.ipynb index 2bcb2658..90af9aec 100644 --- a/problems/01_binary_classifier_of_spurious_closure_of_dhsv/_baseline/main.ipynb +++ b/problems/01_binary_classifier_of_spurious_closure_of_dhsv/_baseline/main.ipynb @@ -49,7 +49,19 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'numpy'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[1], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m 5\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtoolkit\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mtk\u001b[39;00m\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'numpy'" + ] + } + ], "source": [ "import sys\n", "import os\n", @@ -58,6 +70,8 @@ "sys.path.append(os.path.join('..','..','..'))\n", "import toolkit as tk\n", "\n", + "from toolkit.base import load_3w_dataset\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'svg'" ] @@ -78,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -101,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -114,9 +128,43 @@ ], "source": [ "event_labels = list(experiment.event_labels.values())\n", - "event_labels_idx = {v: i for i, v in enumerate(event_labels)}\n", - "fold: tk.EventFold\n", - "folds: tk.EventFolds = experiment.folds()" + "event_labels_idx = {v: i for i, v in enumerate(event_labels)}\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Upload 3W Dataset 
2.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = load_3w_dataset(data_type='real', base_path='path/to/dataset') # Replaced by correct path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the folds manually" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "folds = tk.EventFolds(\n", + " experiment=experiment,\n", + " df=df, # Pass the loaded DataFrame to the EventFolds class\n", + " # ... (other parameters, if necessary) ...\n", + ")\n" ] }, { @@ -135,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -185,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1501,7 +1549,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.12.0" }, "toc": { "base_numbering": 1, diff --git a/toolkit/README.md b/toolkit/README.md index b0ac4d58..97bc38bc 100644 --- a/toolkit/README.md +++ b/toolkit/README.md @@ -13,8 +13,9 @@ * [Introduction](#introduction) * [Release Notes](#release-notes) - * [1.0.0](#100) - * [1.1.0](#110) +  * [1.0.0](#100) +  * [1.1.0](#110) +  * [1.2.0](#120) # Introduction @@ -36,4 +37,17 @@ Release: July 25, 2024. Highlights: -1. Makes resources (functions and constants) compatible with 3W Dataset version 2.0.0, which is based on Parquet files. \ No newline at end of file +1. Makes resources (functions and constants) compatible with 3W Dataset version 2.0.0, which is based on Parquet files. + +## 1.2.0 + +Release: October 19, 2024 # Lastiest version + +Highlights: + +1. **Adapts `load_dataset()` to 3W Dataset 2.0:** The `load_dataset()` function in `base.py` was adapted to correctly handle the folder structure and different data types of the 3W Dataset 2.0. It was renamed to `load_3w_dataset()`. +2. **Updates `dev.py` for 3W Dataset 2.0:** The `dev.py` sub-module was updated to ensure compatibility with the new `load_3w_dataset()` function and the 3W Dataset 2.0 structure. The `extrai_arrays()` function was removed, and the `EventFolds` and `Experiment` classes were adjusted. +3. **Updates `misc.py` for 3W Dataset 2.0:** The `misc.py` sub-module was updated to ensure compatibility with the new `load_3w_dataset()` function and the 3W Dataset 2.0 structure. Redundant functions were removed, and existing functions were adapted to receive the DataFrame as a parameter. +4. **Updates `__init__.py` for 3W Dataset 2.0:** The `__init__.py` file was updated to import and expose the new `load_3w_dataset()` function. + +These updates ensure that the 3W Toolkit is fully compatible with the 3W Dataset 2.0, providing a more efficient and streamlined workflow for loading and analyzing the data. \ No newline at end of file diff --git a/toolkit/__init__.py b/toolkit/__init__.py index e3419e2e..6bd9f6bb 100644 --- a/toolkit/__init__.py +++ b/toolkit/__init__.py @@ -1,53 +1,59 @@ -"""This is the 3W Toolkit, a software package written in Python 3 that +"""This is the 3W Toolkit, a software package written in Python 3 that  is one of the 3W Project's major components. 
This toolkit contains resources that make the following easier: - 3W Dataset overview generation; -- Experimentation and comparative analysis of Machine Learning-based -approaches and algorithms for specific problems related to undesirable -events that occur in offshore oil wells during their respective +- Experimentation and comparative   + analysis of Machine Learning-based   +  +approaches and algorithms for specific problems related to undesirable  +events that occur in offshore oil wells during their respective  production phases; -* Standardization of key points of the Machine Learning-based algorithm +* Standardization of key points of the Machine Learning-based algorithm  development pipeline. All these resources are implemented in the following sub-modules: - **base**: groups the objects used by the other sub-modules; -- **dev**: has all the resources related to development of Machine +- **dev**: has all the resources related to development of Machine   +  Learning models; -- **misc**: brings together diverse resources that do not fit in the +- **misc**: brings together diverse resources that do not fit in the  other sub-modules; -- **rolling_window**: creates a view of array which for every point -gives the n-dimensional neighbourhood of size window. New dimensions are +- **rolling_window**: creates a view of array which for every point  +gives the n-dimensional neighbourhood of size window. New dimensions are  added at the end of array or after the corresponding original dimension. -Specific problems will be incorporated into this toolkit gradually. At +Specific problems will be incorporated into this toolkit gradually. At   +  this time, models can be developed for the following problems: - Binary Classifier of Spurious Closure of DHSV. -Examples of how to use this toolkit will be incremented throughout its +Examples of how to use this toolkit will be incremented throughout   + its  development. Please, check the 3W Project's README.md file for more details. -It is important to note that there are arbitrary choices in this -toolkit, but they have been carefully made to allow adequate comparative -analysis without compromising the ability to experiment with different +It is important to note that there are arbitrary choices in this  +toolkit, but they have been carefully made to allow adequate comparative  +analysis without compromising the ability to experiment with different  approaches and algorithms. -This toolkit's documentation is generated in english and in Google format +This toolkit's documentation is generated in english and in Google format  with [autoDocstring - Python Docstring Generator ](https://github.com/NilsJPWerner/autoDocstring), which follows [PEP 257 ](https://peps.python.org/pep-0257/), and [pdoc3 ](https://pdoc3.github.io/pdoc/). -Its source code is implemented according to the style guide established -by [PEP 8](https://peps.python.org/pep-0008/). This is guaranteed with +Its source code is implemented according to the style guide established   +  +by [PEP 8](https://peps.python.org/pep-0008/). This is guaranteed with  the use of the [Black formatter](https://github.com/psf/black). """ __status__ = "Development" -__version__ = "1.1.0" +__version__ = "1.2.0" # Update version number after changes __license__ = "Apache License 2.0" __copyright__ = "Copyright 2024, Petróleo Brasileiro S.A." 
__authors__ = [ @@ -73,7 +79,8 @@ EventType, LABELS_DESCRIPTIONS, NORMAL_LABEL, - PARQUET_EXTENSION, + PARQUET_EXTENSION,   + PARQUET_ENGINE, PARQUET_COMPRESSION, PATH_3W_PROJECT, @@ -83,6 +90,7 @@ PATH_TOOLKIT, TRANSIENT_OFFSET, VARS, + load_3w_dataset, # To use by 3W v2.0 load_config_in_dataset_ini, ) @@ -104,7 +112,8 @@ get_all_labels_and_files, label_and_file_generator, load_instance, - load_instances, + load_instances,   + resample, plot_instance, ) diff --git a/toolkit/base.py b/toolkit/base.py index 41449181..31643b04 100644 --- a/toolkit/base.py +++ b/toolkit/base.py @@ -1,7 +1,7 @@ -"""This 3W toolkits' sub-module groups objects used by the other -sub-modules. +"""This 3W toolkits' sub-module groups objects used by the other +sub-modules. -Any resource that is not used by another sub-module must be maintained +Any resource that is not used by another sub-module must be maintained in the miscellaneous sub-module. """ @@ -56,6 +56,59 @@ def load_config_in_dataset_ini(): return dict(dataset_ini) +def load_3w_dataset(data_type='real', base_path=PATH_DATASET): + """ + Load the 3W Dataset 2.0. + + Parameters + ---------- + data_type : str, optional + Type of data to be loaded ('real', 'simulated' or 'imputed'). + The default is 'real'. + base_path : str, optional + Path to the root folder of the dataset. The default is PATH_DATASET. + + Returns + ------- + pandas.DataFrame + DataFrame with the 3W Dataset 2.0 data. + """ + + dataframes = [] + for i in range(10): # Loop through folders 0 to 9 + folder_path = os.path.join(base_path, str(i)) + if os.path.exists(folder_path): + parquet_files = [f for f in os.listdir(folder_path) if f.endswith('.parquet')] + for file in parquet_files: + file_path = os.path.join(folder_path, file) + try: + df = pd.read_parquet(file_path) + + # Filter data by specified type + if data_type == 'real': + df_filtered = df[df['state'] == 0] # Real data + elif data_type == 'simulated': + df_filtered = df[df['state'] == 1] # Simulated data + elif data_type == 'imputed': + df_filtered = df[df['state'] == 2] # Imputed data + else: + raise ValueError("Invalid data type. Choose between 'real', 'simulated' or 'imputed'.") + + dataframes.append(df_filtered) + except Exception as e: + print(f"Error reading file {file_path}: {e}") + else: + print(f"Folder {folder_path} not found.") + + # Concatenate all DataFrames into a single DataFrame + if dataframes: + df = pd.concat(dataframes, ignore_index=True) + return df + else: + print("No data found.") + return None + + # Loads all configurations present in the 3W Dataset's main # configuration file and provides specific configurations in different # granularity and formats @@ -123,3 +176,4 @@ def __init__(self, event_name): self.TRANSIENT = event_section.getboolean("TRANSIENT") self.window = event_section.getint("WINDOW") self.step = event_section.getint("STEP") + \ No newline at end of file diff --git a/toolkit/dev.py b/toolkit/dev.py index 32d1e5e1..469cabdd 100644 --- a/toolkit/dev.py +++ b/toolkit/dev.py @@ -1,4 +1,4 @@ -"""This 3W toolkits' sub-module has resources related to development of +"""This 3W toolkits' sub-module has resources related to development of Machine Learning models. 
The main tasks made possible by these features are: @@ -13,18 +13,14 @@ import numpy as np import warnings import matplotlib -import seaborn as sns -import os matplotlib.use("agg") from matplotlib import pyplot as plt -from pathlib import Path, PurePosixPath +from pathlib import Path from sklearn import metrics from alive_progress import alive_bar -from itertools import chain, compress, repeat +from itertools import compress, repeat from functools import lru_cache -from zipfile import ZipFile -from typing import Union from .rolling_window import rolling_window from .base import ( @@ -36,42 +32,11 @@ PATH_FOLDS, TRANSIENT_OFFSET, VARS, + load_3w_dataset, # For compatibility with 3W v2.0 ) - -# Transforma lista de instâncias (lista de tuplas (X, y)) em lista de -# exemplos (X, y). -# Adicionalmente, também filtra alvos inválidos -def extraia_exemplos(instancias, retornar_grupos=False): - if len(instancias) == 0: - return ([], []) - - # Busca alvos nulos - y_finite = map(np.isfinite, chain(*(instancia[1] for instancia in instancias))) - X_iter = chain(*(instancia[0] for instancia in instancias)) - y_iter = chain(*(instancia[1] for instancia in instancias)) - - # Adiciona iterador de grupos - if retornar_grupos: - grupos = list( - chain(*(repeat(grupo, len(X)) for grupo, (X, y) in enumerate(instancias))) - ) - - iter_zip = zip(X_iter, y_iter, grupos) - else: - iter_zip = zip(X_iter, y_iter) - - # Executa iteradores e retorna X, y, [grupos] como listas - result = list(map(list, zip(*compress(iter_zip, y_finite)))) - - # Converte y (segunda lista do resultado) para int - result[1] = list(map(int, result[1])) - - return tuple(result) - - -# Classe cujo objeto contém todas as informações necessárias para uma -# rodada do K-fold do classificador de evento. +# Class whose object contains all the necessary information for a +# round of K-fold of the event classifier. 
class EventFold: def __init__( self, @@ -85,13 +50,13 @@ def __init__( ): self.event_folds: EventFolds = event_folds - # Nota: `instancias_treino` e `instancias_teste` são listas de - # tuplas (X, y) + # Note: `instancias_treino` and `instancias_teste` are lists of + # tuples (X, y) - # Aplica passo em instâncias de treino + # Apply step to training instances self.instancias_treino = [(X[::step], y[::step]) for X, y in instancias_treino] - # Aplica passo em instâncias de teste + # Apply step to test instances self.instancias_teste = [ (X[::passo_teste], y[::passo_teste]) for X, y in instancias_teste ] @@ -99,48 +64,61 @@ def __init__( self.nome_instancias_treino = nome_instancias_treino self.nome_instancias_teste = nome_instancias_teste - # Verfica se alguma das instâncias ficou vazia depois de - # aplicado passo + # Check if any of the instances were empty after + # applying step for instancia in self.instancias_treino + self.instancias_teste: X, y = instancia assert min(X.shape) > 0 and min( y.shape - ), "Janela especificada gerou instância sem amostras" + ), "Specified window generated instance without samples" - # Método para extração de amostras para treino + # Method for extracting training samples @lru_cache(1) - def extract_training_samples(self, retornar_grupos=False): - return extraia_exemplos(self.instancias_treino, retornar_grupos) + def extract_training_samples(self): + # Extract training samples from self.instancias_treino + X_train = np.concatenate([x[0] for x in self.instancias_treino]) + y_train = np.concatenate([x[1] for x in self.instancias_treino]) + return X_train, y_train - # Método para extração de amostras para teste + # Method for extracting complete test samples @lru_cache(1) - def extraia_amostras_teste_completo(self, retornar_grupos=False): - return extraia_exemplos(self.instancias_teste, retornar_grupos) + def extraia_amostras_teste_completo(self): + # Extract test samples from self.instancias_teste + X_test = np.concatenate([x[0] for x in self.instancias_teste]) + y_test = np.concatenate([x[1] for x in self.instancias_teste]) + return X_test, y_test - # Método para extração de amostras para teste + # Method for extracting test samples def extract_test_samples(self): - # Retorna apenas X + # Return only X return self.extraia_amostras_teste_completo()[0] - # Método para cálculo de métricas parciais + # Method for calculating partial metrics def calculate_partial_metrics( - self, y_prev_soft, idx_to_codigo, apresente=False, apresente_conf={} + self, y_pred_soft, idx_to_codigo, apresente=False, apresente_conf={} ): """ - idx_to_codigo (list or dict): - idx_to_codigo[i] = j indica que a i-ésima coluna de - y_prev_soft corresponde ao código da classe j da tarefa - corrente. + Calculate partial metrics for the fold. + + Parameters + ---------- + y_pred_soft : np.ndarray + Soft predictions for the test set. + idx_to_codigo : list or dict + Mapping from prediction index to class code. 
+ apresente : bool, optional + Whether to display the results, by default False + apresente_conf : dict, optional + Configuration for displaying the results, by default {} """ - _, y_teste, grupos_teste = self.extraia_amostras_teste_completo( - retornar_grupos=True - ) - assert len(y_prev_soft) == len( - y_teste - ), f"Número incorreto de previsões: esperado {len(y_teste)}, encontrado {len(y_prev_soft)}" + X_test, y_test = self.extraia_amostras_teste_completo() + + assert len(y_pred_soft) == len( + y_test + ), f"Incorrect number of predictions: expected {len(y_test)}, found {len(y_pred_soft)}" - # códigos das classes para a tarefa que esse fold faz parte + # Class codes for the task that this fold is part of event_labels = self.event_folds.experiment.event_labels n_codigos = len(event_labels) lista_codigos = list(event_labels.values()) @@ -157,46 +135,46 @@ def calculate_partial_metrics( i for i, j in enumerate(idx_to_codigo) if j == codigo_transiente ) - # Predições soft com shape correto - shape_ok = (len(y_teste), n_codigos) + # Soft predictions with correct shape + shape_ok = (len(y_test), n_codigos) assert ( - y_prev_soft.shape == shape_ok - ), f"Predição deve ter shape (n_samples, n_classes) = ({shape_ok[0]},{shape_ok[1]})" + y_pred_soft.shape == shape_ok + ), f"Prediction must have shape (n_samples, n_classes) = ({shape_ok[0]},{shape_ok[1]})" - # Todos os códigos devem aparecer em ordem_codigos_evento + # All codes must appear in ordem_codigos_evento codigos_faltando = set(lista_codigos) - set( [idx_to_codigo[i] for i in range(n_codigos)] ) assert ( len(codigos_faltando) == 0 - ), f"Códigos faltando em 'idx_to_codigo': {codigos_faltando}" + ), f"Missing codes in 'idx_to_codigo': {codigos_faltando}" - # Calculando predição da classe - y_prev_idx = y_prev_soft.argmax(1) - y_prev = list(map(idx_to_codigo.__getitem__, y_prev_idx)) + # Calculating class prediction + y_pred_idx = y_pred_soft.argmax(1) + y_pred = list(map(idx_to_codigo.__getitem__, y_pred_idx)) - # Calculando probabilidade predita de regime + transiente para - # plotar - y_prob_nao_normal = y_prev_soft[:, coluna_regime].copy() + # Calculating predicted probability of regime + transient to + # plot + y_prob_nao_normal = y_pred_soft[:, coluna_regime].copy() if coluna_transiente is not None: - y_prob_nao_normal += y_prev_soft[:, coluna_transiente] + y_prob_nao_normal += y_pred_soft[:, coluna_transiente] - # Métrica principal + # Main metric f_beta = metrics.fbeta_score( - y_teste, y_prev, beta=1.2, average="micro", labels=lista_codigos + y_test, y_pred, beta=1.2, average="micro", labels=lista_codigos ) f_beta *= 100.0 # MEAN_LOG_LOSS log_loss_medio = metrics.log_loss( - y_teste, y_prev_soft, labels=lista_codigos, normalize=True + y_test, y_pred_soft, labels=lista_codigos, normalize=True ) metricas = {"F_BETA [%]": f_beta, "MEAN_LOG_LOSS": log_loss_medio} self.event_folds.salve_metricas_parciais(self, metricas) if apresente: - # Carregando configuração padrão de apresentação + # Loading default presentation configuration def set_config(name, value, overwrite=False): if overwrite or (name not in apresente_conf): apresente_conf[name] = value @@ -208,7 +186,7 @@ def set_config(name, value, overwrite=False): set_config("mostra_prob", True) # =============================================== - # Valores do gráfico: normal=0, transiente=0.5, em regime=1 + # Chart values: normal=0, transient=0.5, in regime=1 plot_values = { event_labels["normal"]: 0, event_labels["regime"]: 1, @@ -216,10 +194,10 @@ def set_config(name, value, 
overwrite=False): if codigo_transiente is not None: plot_values[event_labels["transiente"]] = 0.5 - y_prev_plot = list(map(plot_values.__getitem__, y_prev)) - y_teste_plot = list(map(plot_values.__getitem__, y_teste)) + y_pred_plot = list(map(plot_values.__getitem__, y_pred)) + y_teste_plot = list(map(plot_values.__getitem__, y_test)) - # Cria um plot para cada grupo + # Create a plot for each group grupos_count = np.bincount(grupos_teste) n_grupos = len(grupos_count) @@ -230,391 +208,3 @@ def set_config(name, value, overwrite=False): sharex=apresente_conf["sharex"], ) plt.subplots_adjust(hspace=apresente_conf["hspace"]) - if n_grupos == 1: - axes = [axes] - - axes[0].set_title( - f"F_BETA [%]: {f_beta:.3f} MEAN_LOG_LOSS: {log_loss_medio:.5f}" - ) - - grupo_idx_inicio = 0 - for grupo, ax in enumerate(axes): - grupo_count = grupos_count[grupo] - y_prev_grupo = y_prev_plot[ - grupo_idx_inicio : grupo_idx_inicio + grupo_count - ] - y_teste_grupo = y_teste_plot[ - grupo_idx_inicio : grupo_idx_inicio + grupo_count - ] - y_prob_nao_normal_grupo = y_prob_nao_normal[ - grupo_idx_inicio : grupo_idx_inicio + grupo_count - ] - - ax.plot(y_prev_grupo, marker=11, color="orange", linestyle="") - ax.plot(y_teste_grupo, marker=10, color="green", linestyle="") - ax.set_ylim([-0.2, 1.2]) - yticks, yticklabels = [0, 1], ["normal", "em regime"] - if codigo_transiente is not None: - yticks.insert(1, 0.5) - yticklabels.insert(1, "transiente") - ax.set_yticks(yticks) - ax.set_yticklabels(yticklabels) - if apresente_conf["mostrar_nome_instancia"] and ( - self.nome_instancias_teste is not None - ): - title = ax.get_title() - if title != "": - title += "\n" - title += f"{self.nome_instancias_teste[grupo]}" - ax.set_title(title) - - # Segundo eixo com probabilidade de regiem+transiente - if apresente_conf["mostra_prob"]: - ax2 = ax.twinx() - ax2.plot( - 100.0 * (y_prob_nao_normal_grupo), - color="orange", - linestyle="-", - alpha=0.6, - label="prob. 
não normal", - ) - ax2.set_ylim(0, 100) - - grupo_idx_inicio += grupo_count - - axes[0].legend(["predita", "verdadeira"]) - axes[-1].set_xlabel("amostra") - plt.show() - - return metricas - - -# Classe que encapsula vários objetos da classe EventFold -class EventFolds: - def __init__( - self, - experiment, - nomes_instancias, - folds_instancias, - ): - self.experiment: Experiment = experiment - self.event_type = experiment.event_type - self.use_instancias_extras = experiment.use_instancias_extras - self.pad_mode = experiment.pad_mode - self.pbar = experiment.pbar - self.warnings = experiment.warnings - self.forca_binario = experiment.forca_binario - - self.LABEL = experiment.LABEL - self.OBSERVATION_LABELS = experiment.OBSERVATION_LABELS - self.TRANSIENT = experiment.TRANSIENT - self.window = experiment.window - self.step = experiment.step - - assert not self.use_instancias_extras, "Funcionalidade não implementada" - - # Filtro de nomes de eventos - self.filtre_nomes_instancias = lambda filtro: list( - compress(nomes_instancias, map(filtro, folds_instancias)) - ) - - # Guarda nome das instâncias extras - self.nomes_instancias_extras = self.filtre_nomes_instancias( - lambda fold: fold == EXTRA_INSTANCES_TRAINING - ) - - # Obtém código de todos os folds, ignorando o fold negativo (utilizado - # sempre para treino) - self.folds_nums = sorted(set(folds_instancias) - {EXTRA_INSTANCES_TRAINING}) - - # Carrega instâncias do evento - nomes_instancias_evento = self.filtre_nomes_instancias( - lambda fold: fold != EXTRA_INSTANCES_TRAINING - ) - self.instancias = {} - with alive_bar( - len(nomes_instancias_evento), - disable=not (self.pbar), - force_tty=True, - title=f"Loading instances", - bar="bubbles", - spinner=None, - ) as bar: - for nome_instancia in nomes_instancias_evento: - self.instancias[nome_instancia] = self.carregue_instancia( - nome_instancia - ) - bar() - - # Cria folds, agrupado por fold_num - self.folds = [] - for fold_num in self.folds_nums: - # Treino - nome_instancias_treino = self.filtre_nomes_instancias( - lambda fold: fold not in {fold_num, EXTRA_INSTANCES_TRAINING} - ) - instancias_treino = [ - self.instancias[nome_instancia] - for nome_instancia in nome_instancias_treino - ] - - # Teste - nome_instancias_teste = self.filtre_nomes_instancias( - lambda fold: fold == fold_num - ) - instancias_teste = [ - self.instancias[nome_instancia] - for nome_instancia in nome_instancias_teste - ] - - # Cria Fold - event_fold = EventFold( - instancias_treino, - instancias_teste, - self.step, # WIP - self.step, # WIP - self, - nome_instancias_treino, - nome_instancias_teste, - ) - self.folds.append(event_fold) - - self.folds_metricas = {fold: None for fold in self.folds} - - def extrai_arrays(self, instancia_abs, pad_mode="na"): - """ - Extrai np.arrays X e y a partir do csv em instancia_abs. - Na extração os valore de referência são calculados e incluídos - como colunas em X. X tem ses dados completados segundo pad_mode - para formar primeiras janelas. 
- - pad_mode: - 'na' : completa X com NA alinhando com primeiro dado - anotado em y - 'valid' : descarta os dados que não cabem na primeira janela - de detecção - """ - # Leitura do arquivo CSV que contém a instância - with instancia_abs.open() as f: - df = pd.read_csv(f, usecols=VARS + [CLASS]) - - # Extração dos conjuntos de amostras para treino - X_treino = df[VARS].values.astype(np.float32) - - # Verifica primeiro índice da variável target - first_class = df[CLASS].first_valid_index() - inicio_X = first_class - self.window + 1 - inicio_y = first_class - - # Verifica o tamanho da jenala solicitada e aplica pad se - # necessário - if inicio_X < 0: - if self.warnings: - warnings.warn( - f'Arquivo "{instancia_abs}" não possui amostras suficientes para janela de detecção solicitada ({self.window}s.\ - Aplicando pad {pad_mode})', - RuntimeWarning, - ) - if pad_mode == "na": - # Completando os dados em X_treino para com NA - X_treino = np.vstack( - [ - np.full( - (-inicio_X, X_treino.shape[1]), - np.nan, - dtype=np.float32, - ), - X_treino, - ] - ) - inicio_X = 0 - elif pad_mode == "valid": - # Descartando (-inicio_X) instantes do df para ter 1a - # janela válida - inicio_y += -inicio_X - inicio_X = 0 - - # Validando se janela solicitada é maior do que dados - # disponíveis - if inicio_y >= df.shape[0]: - raise ( - Exception( - f"Arquivo '{instancia_abs}' não possui amostras suficientes para pad: {pad_mode}." - ) - ) - - # Validando se mais de 50% dos dados normais foram - # descartados (ou algum outro controle de qualidade?) - # TODO - - else: - raise (Exception(f"Opção de pad não reconhecida: {pad_mode}.")) - - X_treino_pad = X_treino[inicio_X:] - y_treino = df.iloc[inicio_y:][CLASS].values - - return X_treino_pad, y_treino - - def carregue_instancia(self, instancia): - instancia_abs = Path(os.path.join(PATH_DATASET, instancia)) - X_treino_extra, y_treino = self.extrai_arrays( - instancia_abs, pad_mode=self.pad_mode - ) - - # Aplicação de janela deslizante - Xw_treino = rolling_window(X_treino_extra, self.window, axes=0, toend=False) - - # Check de sanidade - assert len(y_treino) == len( - Xw_treino - ), f'[BUG] X e y de treino não estão sincronizados para o arquivo "{instancia_abs}"' - - assert ( - min(Xw_treino.shape) > 0 - ), f'Janela especificada gerou instância sem amostras para o arquivo "{instancia_abs}"' - - # Ao usar instâncias de outros eventos para o treinamento do - # evento corrente (self.event_type) - # códigos de outros eventos podem surgir em y_treino. - # y_treino deve ter somente os códigos do evento corrente. - # Os códigos novos (derivados de outros eventos) são convertidos - # para código do evento Normal (0). 
- y_finite_mask = np.isfinite(y_treino) - outro_codigo_mask = y_finite_mask & np.isin( - y_treino, list(self.OBSERVATION_LABELS), invert=True - ) - if self.warnings and outro_codigo_mask.sum() > 0: - novos_codigos = set(y_treino[outro_codigo_mask]) - warnings.warn( - f'Códigos de outros eventos ("{novos_codigos}") sendo convertidos para 0.', - RuntimeWarning, - ) - y_treino[outro_codigo_mask] = 0 - - # Tratamento para classificação binária : codigo_transitente -> - # codigo_regime - if self.TRANSIENT and self.forca_binario: - codigo_regime = self.LABEL - codigo_transiente = self.LABEL + TRANSIENT_OFFSET - y_treino[y_treino == codigo_transiente] = codigo_regime - - return Xw_treino, y_treino - - def __iter__(self): - for fold in self.folds: - yield fold - - def __len__(self): - return len(self.folds) - - # Método para retenção de métricas - def salve_metricas_parciais(self, fold, metricas): - assert fold in self.folds_metricas, "Fold não encontrado" - if self.folds_metricas[fold] is not None: - warnings.warn( - "Fold com métricas já computadas. Recarregue os folds " - + "para evitar esta mensagem.", - RuntimeWarning, - ) - self.folds_metricas[fold] = metricas - - @lru_cache(1) - def extraia_amostras_simuladas_e_desenhadas(self): - # Obtém instâncias extras (simuladas e desenhadas, representadas - # pelo fold==EXTRA_INSTANCES_TRAINING) - instancias_extras = [] - with alive_bar(len(self.nomes_instancias_extras)) as bar: - for nome_instancia in self.pbar(self.nomes_instancias_extras): - instancias_extras.append(self.carregue_instancia(nome_instancia)) - bar() - - instancias_extras_passo = [ - (X[:: self.step], y[:: self.step]) for X, y in instancias_extras - ] # Aplica passo de treino - return extraia_exemplos(instancias_extras_passo) - - # Método para consulta/cálculo de métricas parciais e globais (média - # e std) - def get_metrics(self, boxplot=False): - folds_metrics = { - i: metrics for i, (_, metrics) in enumerate(self.folds_metricas.items()) - } - df_metricas = pd.DataFrame.from_dict(folds_metrics, orient="index") - if boxplot: - for metrica in ["F_BETA [%]", "MEAN_LOG_LOSS"]: - plt.figure(figsize=(11, 1)) - sns.boxplot(x=df_metricas[metrica], width=0.4, palette="colorblind") - sns.stripplot( - x=df_metricas[metrica], - jitter=True, - marker="o", - alpha=0.5, - color="black", - ) - plt.show() - df_metricas.index.name = "FOLD" - df_metricas.loc["MEAN"] = df_metricas.mean() - df_metricas.loc["STANDARD DEVIATION"] = df_metricas.std() - self.experiment.metrics = df_metricas.loc["MEAN"].to_dict() - return df_metricas - - -class Experiment: - """This class contains objects related to machine learning approach - experiments""" - - def __init__( - self, - event_name, - ova=True, - use_instancias_extras=False, # WIP - pad_mode="valid", - pbar=True, - warnings=False, - forca_binario=False, # WIP - ): - """_summary_""" - self.event_type = EventType(event_name) - self.ova = ova - self.use_instancias_extras = use_instancias_extras # WIP - self.pad_mode = pad_mode - self.pbar = pbar - self.warnings = warnings - self.forca_binario = forca_binario # WIP - - self.LABEL = self.event_type.LABEL - self.OBSERVATION_LABELS = self.event_type.OBSERVATION_LABELS - self.DESCRIPTION = self.event_type.DESCRIPTION - self.TRANSIENT = self.event_type.TRANSIENT - self.window = self.event_type.window - self.step = self.event_type.step - - @property - def event_labels(self): # WIP - """ - Dicionário com os códigos das classes que envolvem essa tarefa - de classificação. 
As classes podem ser 'normal', 'regime' e - 'transiente'. A classe transiente não existe para tarefas de - classificação binária. - """ - codigos = {"normal": 0, "regime": self.LABEL} - if self.TRANSIENT and (not self.forca_binario): - codigos["transiente"] = self.LABEL + TRANSIENT_OFFSET - return codigos - - def folds(self): - folds = os.path.join(PATH_FOLDS, f"folds_clf_{self.LABEL:02d}.csv") - with Path(folds).open() as f: - df_event = pd.read_csv(f) - - if not self.ova: - df_event = df_event.query("~is_ova") - - nomes_instancias = df_event["instancia"].tolist() - folds_instancias = df_event["fold"].tolist() - - return EventFolds( - self, - nomes_instancias, - folds_instancias, - ) diff --git a/toolkit/mais/README.md b/toolkit/mais/README.md index a20f5006..77b551be 100644 --- a/toolkit/mais/README.md +++ b/toolkit/mais/README.md @@ -1,17 +1,19 @@ -# Modular Artificial Inteligence System (MAIS) +# Modular Artificial Intelligence System (MAIS) -This repository presents MAIS, a system that implements Machine Learning techniques on a modular way, enabling the developer to test his/her own experiments and/or adapting others esperiments with their own idea. MAIS was developed by the Signal, Multimedia and Telecommunications (SMT) laboratory with the help from Petrobras. +This repository presents MAIS, a system that implements Machine Learning techniques in a modular way, enabling the developer to test their own experiments and/or adapt others' experiments with their own ideas. MAIS was developed by the Signal, Multimedia and Telecommunications (SMT) laboratory with the help of Petrobras. + +**This version of MAIS has been adapted to be compatible with the 3W Dataset 2.0.** In this version, MAIS implements a multiclass LGBM classifier, with the following optional features: * Statistical features - * Regular average withn an window; - * Exponetially weigthed average within an window. + * Regular average within a window; + * Exponentially weighted average within a window. * Wavelets features * Imputation methods: keep NaN values, change by the mean, ... * Different labeling methods - * Using the most recent label from an window as the lael for that sample; or - * Using the label in the middle of an window as the lael for that sample. + * Using the most recent label from a window as the label for that sample; or + * Using the label in the middle of a window as the label for that sample. * Feature selection using Random Forest importance # Repository Structure @@ -37,42 +39,52 @@ In this version, MAIS implements a multiclass LGBM classifier, with the followin │   └── generate_report.py └── setup.py ``` -MAIS uses a class called Experiment, which contains all the necessary steps to create your experiment. So, under the folder "experiments/multiclass/experiments/", you add your custom Experiment class, based on the BaseExperiment defined on "experiments/multiclass/experiments/base_experiment.py". Some examples are already implemented in order to give an idea on how an experiment is created. +MAIS uses a class called `Experiment`, which contains all the necessary steps to create your experiment.   + To add your custom `Experiment` class, create a new file under the folder `experiments/multiclass/experiments/` based on the `BaseExperiment` defined in `experiments/multiclass/experiments/base_experiment.py`. Some examples are already implemented to give an idea of how an experiment is created. 
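As a rough sketch (assuming only what this README and `tune_lgbm.py` show: a `BaseExperiment` class defined in `experiments/multiclass/experiments/base_experiment.py`, and experiment modules that expose a module-level `sample` function resolved via `importlib`), a new experiment file could look like the following; the exact constructor arguments and overridable hooks are defined by `BaseExperiment` and may differ:

```python
# experiments/multiclass/experiments/my_experiment.py -- hypothetical file name.
# Sketch under assumptions: the BaseExperiment interface lives in
# base_experiment.py; override only the hooks your experiment needs.
from .base_experiment import BaseExperiment


class MyExperiment(BaseExperiment):
    """Custom multiclass experiment, modeled on the provided examples."""
    pass


def sample(trial):
    # tune_lgbm.py resolves experiments with importlib.import_module(...).sample,
    # so each experiment module is expected to expose a module-level `sample`
    # function; the `trial` argument (an Optuna trial) is an assumption here.
    return MyExperiment()
```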
-The "mais/" folder contains classes definitions that create everything that is used to create an experiment, i.e., contains all utility classes . Some of the +The "mais/" folder contains class definitions that create everything used to create an experiment, i.e., it contains all utility classes. Some of them are: - 1. mais/data/dataset.py: Defines the class MAEDataset, which contains the core logic behind MAIS dataloader. Some of its functions are: read a .csv, read the feature extraction, create the final table (the model input). - 2. mais/data/feature\_mappers.py: Defines the classes that extract the attributes for a given experiment. the implementation uses torch in order to make the extraction faster when using a lot of data. In the current version there are some strategies already implemented, for example: - 1. TorchStatisticalFeatureMapper: created statistical features from a rectangular window; - 2. TorchWaveletFeatureMapper: creates wavelets features; - 3. TorchEWStatisticalFeatureMapper: creates statistical features from a window with exponential weights for each sample. - 3. mais/data/label\_mappers.py: Creates the classes that define how the detection is done. For example, it is possible to choose if the transient period of signal will be considered, or if the samples in the beggining of a file (which are usually not faulty) will be considered. +1. `mais/dataset/dataset.py`: Defines the class `MAEDataset`, which contains the core logic behind the MAIS dataloader. Some of its functions are: read a `.csv`, read the feature extraction, create the final table (the model input). +2. `mais/processing/feature_mappers.py`:   + Defines the classes that extract the attributes for a given experiment. The implementation uses PyTorch to make the extraction faster when using a lot of data. In the current version, there are some strategies already implemented, for example: + 1. `TorchStatisticalFeatureMapper`: Creates   + statistical features from a rectangular window; + 2. `TorchWaveletFeatureMapper`: Creates wavelet features; + 3. `TorchEWStatisticalFeatureMapper`: Creates statistical features from a window with exponential weights for each sample. +3. `mais/processing/   +label_mappers.py`: Creates the classes that define how the detection is done. For example, it is possible to choose if the transient period of   + the signal will be considered, or if the samples at the beginning of a file (which are usually not faulty) will be considered. - So, in order to add new utility functions and/or classes, the "mais/" folder is probably the best place (under the correspondent file). For example, if one needs to create a new feature extractor, the best way to proceed is creating a new FeatureMapper under the file "mais/data/feature\_mappers.py". +To add new utility functions and/or classes, the "mais/" folder is probably the best place (under the correspondent file). For example, if   + you need to create a new feature extractor, the best way to proceed is by creating a new `FeatureMapper` under the file `mais/processing/feature_mappers.py`. # Experiment examples -In the folder experiments/multiclass/ there are many examples that can guide on how to create a new one: -1. multi_ew_stats_mrl_nonan.py -2. multi_mixed_mrl_nonan.py -3. multi_mixed_select_mrl_nonan.py -4. multi_stats_mrl_nonan.py -5. multi_stats_select_mrl_nonan.py -6. multi_wavelets_mrl_nonan.py -7. 
multi_wavelets_select_mrl_nonan.py +In the folder `experiments/multiclass/`, there are many examples that can guide you on how to create a new one: + +1. `multi_ew_stats_mrl_nonan.py` +2. `multi_mixed_mrl_nonan.py` +3. `multi_mixed_select_mrl_nonan.py` +4. `multi_stats_mrl_nonan.py` +5. `multi_stats_select_mrl_nonan.py` +6. `multi_wavelets_mrl_nonan.py` +7. `multi_wavelets_select_mrl_nonan.py` -The name of theses experiments reflect what they implements, for examples, the experiment "multi_stats_select_mrl_nonan.py" implements a multiclass classifier that uses statistical features, a feature selector, uses the most recent label as the label associated to a window and imputes NaN values. The acronym we used are: +The names   + of these experiments reflect what they implement. For example, the experiment `multi_stats_select_mrl_nonan.py` implements a multiclass classifier that uses statistical features, a feature selector, uses the most recent label as the label associated   + with a window, and imputes NaN values. The acronyms used are: -* multi = Multiclass Experiment; -* ew = Exponentially weighted; -* stats = Statistical features; -* mrl = Most recent label; -* nonan = NaN imputation; -* mixed = both statistical and wavelet features; -* select = Feature selector; and -* wavelets = Wavelets +* `multi`: Multiclass Experiment; +* `ew`: Exponentially weighted; +* `stats`: Statistical features; +* `mrl`: Most recent label; +* `nonan`: NaN imputation; +* `mixed`: Both statistical and wavelet features; +* `select`: Feature selector; and +* `wavelets`: Wavelets -For example, between experiments 4 and 6, the difference is the kind of features will be computed. In the first on the exponentially weighted statistical features are used and in the second one, just the wavelets. And to do that, the difference is basically assign the correspondent feature wrapper. The following image is the wrapper for the Experiment 4 of the examples list. +For example, between experiments 4 and 6, the difference is the kind of features   + that will be computed. In the first one, the exponentially weighted statistical features are used, and in the second one, just the wavelets. To do that, the difference is basically assigning the correspondent feature wrapper. The following image is the wrapper for Experiment 4 of the examples list. ![Statistical Wrapper](images/README/stats.jpg "Statistical features wrapper") @@ -80,27 +92,28 @@ And for experiment 6, we have: ![Wavelets Wrapper](images/README/wavelets.jpg "Wavelets features wrapper") -# Using functions outside MAIS package +# Using functions outside   + the MAIS package -The functions defined within MAIS package are being exposed to mais package main folder. So, from the 3W folder one can use the RollingLabelStrategy class for example by just doing +The functions defined within the MAIS package are being exposed to the `mais` package main folder. So, from the 3W folder, you can use the `RollingLabelStrategy` class, for example, by just doing -`from toolkit.mais import RollingLabelStrategy`. +```python +from toolkit.mais import RollingLabelStrategy +``` -In a future, it is possible to make it transparent to the user, abstracting the MAIS package path. For that, two steps are necessary: +In the future, it is possible to make it transparent to the user, abstracting the MAIS package path. For that, two steps are necessary: -1. Add the proper import into __init__.py file in 3W package; and -2. Double check if there is any functions with the same name. +1. 
Add the proper import into the __init__.py file in the 3W package; and   +2. Double-check if there are any functions with the same name. # How to use - -After creating the experiment and putting it into the experiment folder (for example, 'experiments/multiclass/experiments/example.py', - - 1. [OPTIONAL] Initialize a mlflow server with an sqlite URI for the logs. In general, this option is the best one to avoid mlflow from creating tons of files. - 2. Execute 'tune\_lgbm.py'. This script initialize a mlflow experiment containing all runs from the Bayesian optimization search. For every run, the script trains a model and saves its metrics. Its commands are: - 1. data-root: Root directory with the data; - 2. experiment-name: Name of the experiment (must be inside 'experiments/multiclass/experiments/'); - 3. num-trials: Number of Bayesian optimization trials; - 4. inner-splits: Number of cross-validation inner loops; - 5. outer-splits: Number of cross-validation outer loops; - 6. n-jobs: Number of cores available for parallel processing; - All this commands can be also consulted using --help. [P.S: Use apropriate environment variables for your mlflow log system.] \ No newline at end of file +After creating the experiment and putting it into the experiment folder (for example, 'experiments/multiclass/experiments/example.py'), + +1. [OPTIONAL] Initialize an ´mlflow´ server with an sqlite URI for the logs. In general, this option is the best one to avoid mlflow from creating tons of files.   +2. Execute ´tune_lgbm.py´. This script initializes an mlflow experiment containing all runs from the Bayesian optimization search. For every run, the script trains a model and saves its metrics. Its commands are: + * data-root: Root directory with the data; + * experiment-name: Name of the experiment (must be inside 'experiments/multiclass/experiments/'); + * num-trials: Number of Bayesian optimization trials; + * inner-splits: Number of cross-validation inner loops; + * outer-splits: Number of cross-validation outer loops; + * n-jobs: Number of cores available for parallel processing; All these commands can also be consulted using --help. [P.S: Use appropriate environment variables for your mlflow log system.] \ No newline at end of file diff --git a/toolkit/mais/dataset/dataset.py b/toolkit/mais/dataset/dataset.py index 85f1f5e9..ef467634 100644 --- a/toolkit/mais/dataset/dataset.py +++ b/toolkit/mais/dataset/dataset.py @@ -2,13 +2,24 @@ """ Basic dataset definitions """ from collections import namedtuple -from pathlib import Path import numpy as np import pandas from joblib import Parallel, delayed +from .base import ( + CLASS, + COLUMNS_DATA_FILES, + EVENT_NAMES, + LABELS_DESCRIPTIONS, + NORMAL_LABEL, + PATH_DATASET, + TRANSIENT_OFFSET, + VARS, + load_3w_dataset, +) + class MAEDataset: """Load all files and return transformed glob @@ -28,7 +39,7 @@ class MAEDataset: one or more of {"real", "simulated", "drawn"} - **feature_mapper: CALLABLE** - (raw_tags: DataFrame, raw_labels: DataFrame) -> + (raw_tags: DataFrame,  raw_labels: DataFrame) -> (feature: DataFrame x label: DataFrame) -- deals with transforming the raw data from a single event, to be used during training/evaluation. 
Outputs from each event concatenated @@ -97,24 +108,14 @@ class MAEDataset: } # List of known data tags - TAG_NAMES = [ - "P-PDG", - "P-TPT", - "T-TPT", - "P-MON-CKP", - "T-JUS-CKP", - "P-JUS-CKGL", - "T-JUS-CKGL", - "QGL", - ] + TAG_NAMES = VARS def __init__( self, transformed_events=None, events=None, # either pass in preloaded events or the root directory - root_dir=None, + data_type="real", # Type of data to load: 'real', 'simulated', or 'imputed' tgt_events=[], # which event_types to load - instance_types=[], # simulated and or real and or drawn feature_mapper=tuple, # transformer from event to features n_jobs=-1, ): @@ -123,94 +124,62 @@ def __init__( """ # save parameters - self.root_dir = root_dir + self.data_type = data_type self.tgt_events = tgt_events - self.instance_types = instance_types self.n_jobs = n_jobs self.feature_mapper = feature_mapper # Call the heavy load _make_set passing the (maybe Null) events self._make_set(events) - def _instance_type(fname): - """ - Detects if instance type is selected. - - * Parameters: - - **fname**: STRING - name of the instance file - * Returns: - - **STRING** - string representing the instance type of the input file name - - """ - if fname.startswith("OLGA"): - return "simulated" - elif fname.startswith("DESENHADA"): - return "drawn" - else: - return "real" - - def load_events(data_root, n_jobs=-1): + def load_events(self, data_type="real", n_jobs=-1): """ - Scan data_root for raw files and return dict. useful for preloads. + Load events from the 3W Dataset 2.0. * Parameters: - - **data_root: STRING** - base location of events separated by event type + - **data_type: STRING** - Type of data to load ('real', 'simulated', or 'imputed') * Returns: - - **events**: [LIST] - Optional list of preloaded events + - **events**: [LIST] - List of loaded events """ - def _read(tgt, fname): + def _read(df): """ Return a dict with the summary of a target. * Parameters: - - **tgt: STRING** - Target location + - **df: pandas.DataFrame** - DataFrame with the 3W Dataset 2.0 data. * Returns: - - **fname**: STRING - Filename + - **dict**: Dict with the summary of a target. 
""" - df = pandas.read_csv( - fname, - index_col=MAEDataset.INDEX_NAME, - header=0, - parse_dates=True, - memory_map=True, - ) + tags = df[MAEDataset.TAG_NAMES] labels = df[MAEDataset.LABEL_NAME] return { - "file_name": str(fname.relative_to(tgt)), "tags": tags, "labels": labels, - "event_type": int(str(tgt.relative_to(data_root))), + "event_type": df["label"].iloc[0], } - data_root = Path(data_root) - target_dirs = [ - d for d in data_root.iterdir() if d.match("[0-8]") - ] # filter directories with classes - tasks = [(tgt, fname) for tgt in target_dirs for fname in tgt.glob("*.csv")] - with Parallel(n_jobs) as p: - events = p(delayed(_read)(*t) for t in tasks) + # Load the 3W Dataset 2.0 + df = load_3w_dataset(data_type=data_type) + + # Split the DataFrame by event type + events = [_read(df[df["label"] == event_type]) for event_type in df["label"].unique()] + return events def transform_events( - events, raw_mapper, tgt_events=None, instance_types=None, n_jobs=-1 + self, events, raw_mapper, tgt_events=None, n_jobs=-1 ): """ - Apply raw_mapper to list of events, filtering by target events and instance types + Apply raw_mapper to list of events, filtering by target events """ if tgt_events is not None: events = [e for e in events if (e["event_type"] in tgt_events)] - if instance_types is not None: - events = [ - e - for e in events - if (MAEDataset._instance_type(e["file_name"]) in instance_types) - ] with Parallel(n_jobs) as p: return p(delayed(raw_mapper)(e) for e in events) @@ -229,7 +198,7 @@ def gather(transformed_events): def _make_set(self, events=None): """ - Loads all instances of target classes from the desired types, transforming the raw data to obtain its + Loads all instances of target classes, transforming the raw data to obtain its features by calling the *feature_mapper()* method for each instance. 
* Parameters: @@ -238,15 +207,10 @@ def _make_set(self, events=None): # load if not preloaded if events is None: - events = MAEDataset.load_events(self.root_dir) + events = MAEDataset.load_events(self, self.data_type) # filter events - def _should_keep(e): - return ( - MAEDataset._instance_type(e["file_name"]) in self.instance_types - ) and (e["event_type"] in self.tgt_events) - - events = [e for e in events if _should_keep(e)] + events = [e for e in events if (e["event_type"] in self.tgt_events)] feature_names = [] X = [] @@ -283,4 +247,5 @@ def _should_keep(e): self.g_class = np.array(g_class) self.g = np.repeat(np.arange(self.g_len.size), self.g_len) - self.feature_names = np.array(feature_names[0].to_list()) + self.feature + \ No newline at end of file diff --git a/toolkit/mais/processing/feature_mappers.py b/toolkit/mais/processing/feature_mappers.py index c1294298..84d3f481 100644 --- a/toolkit/mais/processing/feature_mappers.py +++ b/toolkit/mais/processing/feature_mappers.py @@ -199,67 +199,5 @@ def __call__(self, tags, event_type=None): class TorchWaveletFeatureMapper: """PyTorch implementation of the wavelet feature mapper""" - def __init__(self, level, stride, offset=0): - self.level = level - self.window_size = 2**level - self.stride = stride - self.offset = offset - - impulse = np.zeros(self.window_size) - impulse[-1] = 1 - hs = pywt.swt(impulse, "haar", level=self.level) - H = np.stack([h[i] for h in hs for i in range(2)] + [impulse], axis=-1) - - self.feat_names = [ - f"{type_}{level}" - for level in range(self.level, 0, -1) - for type_ in ["A", "D"] - ] + ["A0"] - self.H = torch.tensor(H).double() - - def __call__(self, tags, event_type=None): - # preserve names and index - columns = tags.columns - index = tags.index - - # output names - out_columns = [f"{t}_{f}" for f in self.feat_names for t in columns] - - if len(tags) < self.offset + self.window_size: - # not enough for a single window - out = pd.DataFrame( - columns=out_columns, dtype=np.float64 - ) # return empty dataframe - out.index.name = index.name - return out - - tags = torch.tensor(tags.values).double() - - # apply windowing operation - tags = tags[self.offset :].unfold(0, self.window_size, self.stride) - index = index[self.offset :][self.window_size - 1 :: self.stride] - - coeffs = torch.tensordot(tags, self.H, dims=([-1], [0])) - - records = {} - for i, t in enumerate(columns): - for j, f in enumerate(self.feat_names): - records[f"{t}_{f}"] = coeffs[:, i, j] - - # fill dataframe in correct order - out = pd.DataFrame.from_records(records, index=index, columns=out_columns) - out.index.name = index.name # also preserve index - return out - - -class MixedMapper: - """ - Join features of multiple mappers. Feature sizes must be consistent. 
- - """ - - def __init__(self, *args): - self.mappers = args - - def __call__(self, tags, event_type=None): - return pd.concat([m(tags, event_type) for m in self.mappers], axis="columns") + def __init__(self, level, stride, offset + \ No newline at end of file diff --git a/toolkit/mais/processing/label_mappers.py b/toolkit/mais/processing/label_mappers.py index e74abe23..cb2f6bcd 100644 --- a/toolkit/mais/processing/label_mappers.py +++ b/toolkit/mais/processing/label_mappers.py @@ -3,9 +3,6 @@ import scipy.stats as sp import torch -from dataset.dataset import MAEDataset - - class RollingLabelStrategy: """ Base class that just wraps applications of apply, @@ -103,7 +100,7 @@ def __call__(self, labels, event_type): # not enough samples for windowing, return empty if len(labels) < self.offset + self.window_size: - out = pd.Series(name=MAEDataset.LABEL_NAME, dtype=np.float64) + out = pd.Series(name="class", dtype=np.float64) # Changed to 'class' out.index.name = index.name return out @@ -115,7 +112,7 @@ def __call__(self, labels, event_type): index = index[self.offset :][self.window_size - 1 :: self.stride] out = pd.Series( - name=MAEDataset.LABEL_NAME, + name="class", # Changed to 'class' data=self.apply(y, event_type), index=index, dtype=np.float64, @@ -199,3 +196,4 @@ class TorchMulticlassMRLStrategy(TorchLabelStrategy): def apply(self, y, event_type=None): last = y[:, -1] return torch.where(last.isnan(), self._NAN, (last % 100).float()) + \ No newline at end of file diff --git a/toolkit/mais/setup.py b/toolkit/mais/setup.py index d6f2d05e..3f362ea4 100644 --- a/toolkit/mais/setup.py +++ b/toolkit/mais/setup.py @@ -3,8 +3,8 @@ setup( name="mais", packages=find_packages(), - version="0.1.0", + version="0.2.0", # October 19, 2024 description="Fault detection of oil wells using machine learning", - author="SMT/UFRJ", + author="SMT/UFRJ, adapted to use with 3W 2.0 by Kelly Castro" license="", ) diff --git a/toolkit/mais/training/multiclass/train_lgbm.py b/toolkit/mais/training/multiclass/train_lgbm.py index 6eff27b3..f7a0e277 100644 --- a/toolkit/mais/training/multiclass/train_lgbm.py +++ b/toolkit/mais/training/multiclass/train_lgbm.py @@ -47,9 +47,9 @@ def eval_lgbm(ctx, **kwargs): # grab all cli params config = {**ctx.obj, **kwargs} - # preload events - training_events = MAEDataset.load_events(config["train_path"]) - test_events = MAEDataset.load_events(config["test_path"]) + # Create MAEDataset objects + training_events = MAEDataset(data_type='real', base_path=config["train_path"]) + test_events = MAEDataset(data_type='real', base_path=config["test_path"]) # fake trial sampler with fixed parameters with open(config["param_file"]) as f: @@ -66,7 +66,7 @@ def eval_lgbm(ctx, **kwargs): metric_name = experiment.metric_name() # process events - data = prepare_data(experiment, events, config["num_jobs"]) + data = prepare_data(experiment, training_events.events, test_events.events, config["num_jobs"]) Xt = data["train_X"] yt = data["train_y"] @@ -147,3 +147,4 @@ def cv(ctx, param_file): if __name__ == "__main__": os.nice(19) cli(obj={}) + \ No newline at end of file diff --git a/toolkit/mais/training/multiclass/tune_lgbm.py b/toolkit/mais/training/multiclass/tune_lgbm.py index 926adaf9..6c6b15ca 100755 --- a/toolkit/mais/training/multiclass/tune_lgbm.py +++ b/toolkit/mais/training/multiclass/tune_lgbm.py @@ -23,8 +23,6 @@ import joblib -# import dask.distributed - import seaborn as sns import matplotlib.pyplot as plt @@ -167,10 +165,10 @@ def plot_confusion_matrix(cm, std=None, normalize=False): 
         if std is not None:
             annot = [
-                [rf"${v:.2f}\pm{s:.2f}$" for v, s in zip(vr, sr)] for vr, sr in zip(cm, std)
+                [rf"{v:.2f}\pm{s:.2f}" for v, s in zip(vr, sr)] for vr, sr in zip(cm, std)
             ]
         else:
-            annot = [[f"${v:.2f}$" for v in vr] for vr in cm]
+            annot = [[f"{v:.2f}" for v in vr] for vr in cm]
 
     fig, ax = plt.subplots(figsize=(10, 10))
     sns.heatmap(cm, cmap="viridis", annot=annot, fmt="", square=True, ax=ax)
@@ -197,7 +195,7 @@ def score_model(train_set, test_set, experiment, model, n_jobs=-1):
     # rebalance train set
     Xt, yt, _ = experiment.balance(Xt, yt, gt, train_set.g_class)
 
-    # preprocess and fit
+    # preprocess and fit
     logger.info("experiment.fit")
     Xt, yt = experiment.fit_transform(Xt, yt)
     Xs, ys = experiment.transform(Xs, ys)
@@ -398,9 +396,13 @@ def nested_cv(ctx, **kwargs):
     # gather configuration
     config = {**ctx.obj, **kwargs}
 
-    with joblib.parallel_backend("loky", n_jobs=config["n_jobs"]):
-        # preload events
-        events = MAEDataset.load_events(config["data_root"], -1)
+    with joblib.parallel_backend("loky", n_jobs=config["n_jobs"]):
+
+        # Create an instance of the MAEDataset class
+        dataset = MAEDataset(data_type='real', base_path=config["data_root"])  # Assuming data_type is 'real'
+
+        # Access the events through the events attribute
+        events = dataset.events
 
         model_sampler = lightgbm_sampler
         experiment_sampler = importlib.import_module(config["experiment_name"]).sample
@@ -429,7 +431,8 @@ def tune(ctx, **kwargs):
     # _ = dask.distributed.Client(n_workers=config["n_jobs"], processes=True)
     with joblib.parallel_backend("loky", n_jobs=config["n_jobs"]):
         # preload events
-        train_events = MAEDataset.load_events(config["train_root"], config["n_jobs"])
+        train_events = MAEDataset(data_type='real',
+                                  base_path=config["train_root"])
 
         # select samplers
         model_sampler = lightgbm_sampler
@@ -443,7 +446,8 @@ def tune(ctx, **kwargs):
 
         # find best hyper-params
         study = hyperparameter_search(
-            train_events, experiment_sampler, model_sampler, config
+            train_events.events,
+            experiment_sampler, model_sampler, config  # Access events through the events attribute
         )
 
         # train model with best params
@@ -455,7 +459,8 @@ def tune(ctx, **kwargs):
 
         # map and gather tranining set
         transformed_train_events = MAEDataset.transform_events(
-            train_events,
+            train_events.events,
+            # Access events through the events attribute
             best_experiment.raw_transform,
             instance_types=best_experiment.instance_types,
             tgt_events=best_experiment.tgt_events,
@@ -464,9 +469,10 @@ def tune(ctx, **kwargs):
         train_set = MAEDataset.gather(transformed_train_events)
 
         # map and gather test set
-        test_events = MAEDataset.load_events(config["test_root"], config["n_jobs"])
+        test_events = MAEDataset(data_type='real',
+                                 base_path=config["test_root"])
         transformed_test_events = MAEDataset.transform_events(
-            test_events,
+            test_events.events,  # Access events through the events attribute
             best_experiment.raw_transform,
             instance_types=best_experiment.instance_types,
             tgt_events=best_experiment.tgt_events,
@@ -481,4 +487,4 @@ def tune(ctx, **kwargs):
 
 if __name__ == "__main__":
     os.nice(19)
-    cli(obj={})
+    cli(obj={})
diff --git a/toolkit/misc.py b/toolkit/misc.py
index 6f2323e5..bbeb6b16 100644
--- a/toolkit/misc.py
+++ b/toolkit/misc.py
@@ -33,190 +33,90 @@
     EVENT_NAMES,
     PARQUET_EXTENSION,
     PARQUET_ENGINE,
+    load_3w_dataset,  # To work with 3W v2.0
 )
 
 
 # Methods
 #
 
 
-def label_and_file_generator(real=True, simulated=False, drawn=False):
-    """This is a generating function that returns tuples for all
-    indicated instance sources (`real`, `simulated`
and/or - `hand-drawn`). Each tuple refers to a specific instance and contains - its label (int) and its full path (Path). All 3W Dataset's instances - are considered. - Args: - real (bool, optional): Indicates whether `real` instances should - be considered. Defaults to True. - simulated (bool, optional): Indicates whether `simulated` - instances should be considered. Defaults to False. - drawn (bool, optional): Indicates whether `hand-drawn` instances - should be considered. Defaults to False. - - Yields: - generator: Tuples for all indicated instance sources. Each tuple - refers to a specific instance and contains its label (int) - and its full path (Path). - """ - for i in Path(PATH_DATASET).iterdir(): - try: - # Considers only directories - if i.is_dir(): - label = int(i.stem) - for fp in i.iterdir(): - # Considers only Parquet files - if fp.suffix == PARQUET_EXTENSION: - # Considers only instances from the requested - # source - if ( - (simulated and fp.stem.startswith("SIMULATED")) - or (drawn and fp.stem.startswith("DRAWN")) - or ( - real - and (not fp.stem.startswith("SIMULATED")) - and (not fp.stem.startswith("DRAWN")) - ) - ): - yield label, fp - except: - # Otherwise (e.g. files or directory without instances), do - # nothing - pass - - -def get_all_labels_and_files(): - """Gets lists with tuples related to all real, simulated, or - hand-drawn instances contained in the 3w Dataset. Each list - considers instances from a single source. Each tuple refers to a - specific instance and contains its label (int) and its full path - (Path). - Returns: - tuple: Tuple containing three lists with tuples related to real, - simulated, and hand-drawn instances, respectively. - """ - real_instances = list( - label_and_file_generator(real=True, simulated=False, drawn=False) - ) - simulated_instances = list( - label_and_file_generator(real=False, simulated=True, drawn=False) - ) - drawn_instances = list( - label_and_file_generator(real=False, simulated=False, drawn=True) - ) - - return real_instances, simulated_instances, drawn_instances - - -def create_table_of_instances(real_instances, simulated_instances, drawn_instances): +def create_table_of_instances(df): """Creates a table of instances (pandas.DataFrame) that shows the amount of instances that compose the 3W Dataset, by knowledge source - (real, simulated and hand-drawn instances) and by instance label. + (real, simulated and imputed instances) and by instance label. Args: - real_instances (list): List with tuples related to all - real instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - simulated_instances (list): List with tuples related to all - simulated instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - drawn_instances (list): List with tuples related to all - hand-drawn instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). + df (pandas.DataFrame): DataFrame with the 3W Dataset 2.0 data. Returns: pandas.DataFrame: The created table that shows the amount of instances that compose the 3W Dataset, by knowledge source - (real, simulated and hand-drawn instances) and by instance + (real, simulated and imputed instances) and by instance label. 
""" - # Gets the label's description of all instances as a list of dicts - list_instances = ( - [ - { - "INSTANCE LABEL": str(label) + " - " + LABELS_DESCRIPTIONS[label], - "SOURCE": "REAL", - } - for label, fp in real_instances - ] - + [ - { - "INSTANCE LABEL": str(label) + " - " + LABELS_DESCRIPTIONS[label], - "SOURCE": "SIMULATED", - } - for label, fp in simulated_instances - ] - + [ - { - "INSTANCE LABEL": str(label) + " - " + LABELS_DESCRIPTIONS[label], - "SOURCE": "HAND-DRAWN", - } - for label, fp in drawn_instances - ] - ) - # Transforms the list of dicts into a pandas.DataFrame - df_instances = pd.DataFrame(list_instances) + # Create a new column with the instance label and description + df['INSTANCE LABEL'] = df['label'].astype(str) + " - " + df['label'].map(LABELS_DESCRIPTIONS) - # Creates the table of instances with relevant information and - # desired format + # Create the table of instances toi = ( - df_instances.groupby(["INSTANCE LABEL", "SOURCE"]) + df.groupby(['INSTANCE LABEL', 'state']) .size() .reset_index() - .pivot("SOURCE", "INSTANCE LABEL", 0) + .pivot('state', 'INSTANCE LABEL', 0) .fillna(0) .astype(int) .T ) - toi = toi.loc[natsorted(toi.index.values)] - toi = toi[["REAL", "SIMULATED", "HAND-DRAWN"]] - toi["TOTAL"] = toi.sum(axis=1) - toi.loc["TOTAL"] = toi.sum(axis=0) + + # Rename the columns to represent the data sources + toi = toi.rename(columns={0: 'REAL', 1: 'SIMULATED', 2: 'IMPUTED'}) + + # Add a 'TOTAL' column and row + toi['TOTAL'] = toi.sum(axis=1) + toi.loc['TOTAL'] = toi.sum(axis=0) return toi -def filter_rare_undesirable_events(toi, threshold, simulated=False, drawn=False): +def filter_rare_undesirable_events(toi, threshold, simulated=False, imputed=False): """Generates a table of instances (pandas.DataFrame) that shows the amount of filtered instances, by knowledge source (real, `simulated` - and `hand-drawn` instances) and by instance label. This filter keeps - only real instances, as well as `simulated` and `hand-drawn` if + and `imputed` instances) and by instance label. This filter keeps + only real instances, as well as `simulated` and `imputed` if indicated, of rare event types. An event type is considered rare if the amount of instances labeled as this event relative to the total number of instances is less than the indicated `threshold`. In both - totalizations, `simulated` and `hand-drawn` instances are only + totalizations, `simulated` and `imputed` instances are only considered if indicated, but real instances are always taken into account. Args: toi (pandas.DataFrame): Table that shows the amount of instances that compose the 3W Dataset, by knowledge source (real, - `simulated` and `hand-drawn` instances) and by instance + `simulated` and `imputed` instances) and by instance label. This object is not modified in this function. threshold (float): Relative limit that establishes rare event types. simulated (bool, optional): Indicates whether `simulated` instances should be considered. Defaults to False. - drawn (bool, optional): Indicates whether `hand-drawn` instances + imputed (bool, optional): Indicates whether `imputed` instances should be considered. Defaults to False. Returns: pandas.DataFrame: The table of instances (pandas.DataFrame) that shows the amount of filtered instances, by knowledge source - (real, simulated and hand-drawn instances) and by instance + (real, simulated and imputed instances) and by instance label. 
""" - # Simulated and hand-drawn instances are optional, but real + # Simulated and imputed instances are optional, but real # instances are always considered totals = 0 if simulated: totals += toi["SIMULATED"] - if drawn: - totals += toi["HAND-DRAWN"] + if imputed: + totals += toi["IMPUTED"] totals += toi["REAL"] # Absolute limit @@ -229,12 +129,12 @@ def filter_rare_undesirable_events(toi, threshold, simulated=False, drawn=False) return rue -def load_instance(instance): +def load_instance(label, fp): """Loads all data and metadata from a specific `instance`. Args: - instance (tuple): This tuple must refer to a specific `instance` - and contain its label (int) and its full path (Path). + label (int): Label of the instance. + fp (Path): Full path to the instance file. Raises: Exception: Error if the Parquet file passed as arg cannot be @@ -246,8 +146,6 @@ def load_instance(instance): other columns of the Parquet file and metadata loaded from the argument `instance` (label, well, and id). """ - # Loads label metadata from the argument `instance` - label, fp = instance try: # Loads well and id metadata from the argument `instance` @@ -272,30 +170,24 @@ def load_instance(instance): return df -def load_instances(instances): - """Loads all data and metadata from multiple `instances` in - parallel. +def load_instances(df): # Changed function signature + """Loads all data and metadata from the DataFrame. Args: - instances (list): List with tuples related to real, simulated, - or hand-drawn `instances`. Each tuple must refer to a - specific instance and must contain its label (int) and its - full path (Path). + df (pandas.DataFrame): DataFrame with the 3W Dataset 2.0 data. Returns: - pandas.DataFrame: Its index contains the timestamps loaded from - the Parquet files. Its columns contain data loaded from the - other columns of the Parquet files and the metadata label, - well, and id). + pandas.DataFrame: DataFrame with loaded instances. """ + # Prepares for multiple parallel loadings pool = ThreadPool() dfs = [] try: # Calls multiple loadings in parallel - for df in pool.imap_unordered(load_instance, instances): - dfs.append(df) + for label, fp in df[['label', 'filepath']].values: # Assuming 'filepath' column exists + dfs.append(load_instance(label, Path(fp))) finally: # If the instance cannot be loaded pool.terminate() @@ -304,22 +196,17 @@ def load_instances(instances): return pd.concat(dfs) -def create_and_plot_scatter_map(real_instances): +def create_and_plot_scatter_map(df): """Creates and plots scatter map with all the real instances listed - in the `real_instances` argument. + in the `df` argument. Args: - real_instances (list): List with tuples related to all - real instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). + df (pandas.DataFrame): DataFrame with the 3W Dataset 2.0 data. Returns: tuple: Tuple containing the first and the last year of occurrence among all instances, respectively. 
""" - # Loads all instances - df = load_instances(real_instances) # Finds the first and the last year of occurrence among all instances df_time = ( @@ -349,443 +236,4 @@ def create_and_plot_scatter_map(real_instances): last_year = np.max(df_time["max"]).year plt.rcParams["axes.labelsize"] = 9 plt.rcParams["font.size"] = 9 - plt.rcParams["legend.fontsize"] = 9 - fig, ax = plt.subplots(figsize=(9, 9)) - yticks = [] - yticks_labels = [] - for well in well_times.keys(): - times = well_times[well] - class_names = well_classes[well] - class_colors = list(map(cmap, class_names)) - well_id = well_code[well] - yticks.append(well_id * height + height / 2 - border / 2) - yticks_labels.append(well) - ax.broken_barh( - times, - (well_id * height, height - border), - facecolors=class_colors, - edgecolors=class_colors, - ) - ax.grid(True) - ax.set_axisbelow(True) - ax.set_yticks(yticks) - ax.set_yticklabels(yticks_labels) - ax.xaxis.set_major_locator(mdates.YearLocator()) - ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y")) - legend_colors = [ - Patch(facecolor=cmap(l), label=str(l) + " - " + d) - for l, d in LABELS_DESCRIPTIONS.items() - ] - ax.legend( - frameon=False, - handles=legend_colors, - loc="upper center", - bbox_to_anchor=(0.5, 1.12), - ncol=4, - ) - - return first_year, last_year - - -def count_properties_instance(instance): - """Counts properties from a specific `instance`. - - Args: - instance (tuple): This tuple must refer to a specific `instance` - and contain its label (int) and its full path (Path). - - Raises: - Exception: Error if the Parquet file passed as arg cannot be - read. - - Returns: - dict: Dict containing the counted properties with the following - keys: n_vars (number of variables), n_vars_missing (number - of missing variables), n_vars_frozen (number of frozen - variables), n_obs (number of observations), and - n_obs_unlabeled (number of unlabeled observations). - """ - # Preparation for counting - _, fp = instance - p = {"n_vars_missing": 0, "n_vars_frozen": 0} - - try: - # Read the Parquet file - df = pd.read_parquet(fp, engine=PARQUET_ENGINE) - except Exception as e: - raise Exception(f"error reading file {fp}: {e}") - - # Counts properties - vars = df.columns[:-1] # Last column with class is not considered - p["n_vars"] = len(vars) - for var in vars: - if df[var].isnull().all(): - p["n_vars_missing"] += 1 - u_values = df[var].unique() - if len(u_values) == 1 and not np.isnan(u_values): - p["n_vars_frozen"] += 1 - p["n_obs"] = len(df) - p["n_obs_unlabeled"] = df["class"].isnull().sum() - - return p - - -def count_properties_instances(instances): - """Counts properties from multiple `instances` in parallel. - - Args: - instances (list): List with tuples related to real, simulated, - or hand-drawn `instances`. Each tuple must refer to a - specific instance and must contain its label (int) and its - full path (Path). - - Returns: - dict: Dict containing the counted properties with the following - keys: n_vars (number of variables), n_vars_missing (number - of missing variables), n_vars_frozen (number of frozen - variables), n_obs (number of observations), and - n_obs_unlabeled (number of unlabeled observations). 
- """ - # Prepares for multiple parallel counts - pool = ThreadPool() - ps = [] - - try: - # Calls multiple counts in parallel - for p in pool.imap_unordered(count_properties_instance, instances): - ps.append(p) - finally: - # If the instance cannot be loaded - pool.terminate() - - # Sum ps and return the result - return dict(pd.DataFrame(ps).sum()) - - -def calc_stats_instances(real_instances, simulated_instances, drawn_instances): - """Calculates the 3W Dataset's fundamental aspects related to - inherent difficulties of actual data. Three statistics are - calculated: Missing Variables, Frozen Variables, and Unlabeled - Observations. All instances, regardless of their source, influence - these statistics. - - Args: - real_instances (list): List with tuples related to all - real instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - simulated_instances (list): List with tuples related to all - simulated instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - drawn_instances (list): List with tuples related to all - hand-drawn instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - - Returns: - pandas.DataFrame: Its index contains the statistic's names. Its - columns contain statistics themselves (Amount and - Percentage) - """ - # Counts properties from all indicated instances - p = count_properties_instances( - real_instances + simulated_instances + drawn_instances - ) - - # Extract properties - n_vars = p["n_vars"] - n_vars_missing = p["n_vars_missing"] - n_vars_frozen = p["n_vars_frozen"] - n_obs = p["n_obs"] - n_obs_unlabeled = p["n_obs_unlabeled"] - - # Calculates the statistics - stats = { - "Missing Variables": [ - n_vars_missing, - f"{100*n_vars_missing/n_vars:.2f}% of {n_vars}", - ], - "Frozen Variables": [ - n_vars_frozen, - f"{100*n_vars_frozen/n_vars:.2f}% of {n_vars}", - ], - "Unlabeled Observations": [ - n_obs_unlabeled, - f"{100*n_obs_unlabeled/n_obs:.2f}% of {n_obs}", - ], - } - - return pd.DataFrame.from_dict( - stats, orient="index", columns=["Amount", "Percentage"] - ) - - -def resample(data, n, class_number): - """Downsampling for instances. - - Args: - data (string): Instance path - n (integer): Factor to downsampling the instance. 
- class_number (integer): integer that represents the event class - - Returns: - pandas.DataFrame: Downsamplig instance DataFrame - """ - # Timestamp is expected to be a column - data.reset_index(inplace=True) - # Group Timestamp and get last value - resampleTimestamp = data.timestamp.groupby(data.index // n).max() - # Replace transient label from 100 to 0.5 - data["class"] = data["class"].astype(float) - tempClassLabel = data["class"].replace(class_number + 100, 0.5) - # Get the max value from the group Class column - resampleClass = tempClassLabel.groupby(tempClassLabel.index // n).max() - # Back with transient label value - resampleClass.replace(0.5, class_number + 100, inplace=True) - # Non overlap group and get the average value from the data - dfResample = data.groupby(data.index // n).mean(numeric_only=True) - # Drop class column - dfResample.drop(["class"], axis=1, inplace=True) - # Insert resampled class label values - dfResample["class"] = resampleClass - # Insert resampled timestamp - dfResample.index = resampleTimestamp - - return dfResample - - -def plot_instance(class_number, instance_index, resample_factor): - """Plot one especific event class and instance. By default the - instance is downsampling (n=100) and Z-score Scaler. In order to - help the visualization transient labels was changed to '0.5'. - - Args: - class_number (integer): integer that represents the event class - instance_index (integer): input the instance file index - """ - instances_path = os.path.join( - PATH_DATASET, str(class_number), "*" + PARQUET_EXTENSION - ) - instances_path_list = glob.glob(instances_path) - if instance_index >= len(instances_path_list): - print( - f"instance index {instance_index} out of range - Insert a valid index between 0 and {len(instances_path_list)-1}" - ) - else: - df_instance = pd.read_parquet( - instances_path_list[instance_index], engine=PARQUET_ENGINE - ) - df_instance_resampled = resample(df_instance, resample_factor, class_number) - df_drop_resampled = df_instance_resampled.drop(["state", "class"], axis=1) - df_drop_resampled.interpolate( - method="linear", limit_direction="both", axis=0, inplace=True - ) - df_drop_resampled.fillna( - 0, - inplace=True, - ) - scaler_resampled = TimeSeriesScalerMeanVariance().fit_transform( - df_drop_resampled - ) - - df_scaler_resampled = pd.DataFrame( - scaler_resampled.squeeze(), - index=df_drop_resampled.index, - columns=df_drop_resampled.columns, - ) - df_instance_resampled["class"] = df_instance_resampled["class"].replace( - 100 + int(class_number), 0.5 - ) - df_instance_resampled["class"] = df_instance_resampled["class"].replace( - int(class_number), 1 - ) - - colors_traces = [ - "#008080", - "#3498DB", - "#E74C3C", - "#884EA0", - "#D4AC0D", - "#AF601A", - "#D35400", - "#839192", - "#2E4053", - ] - fig = go.Figure() - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[0]], - mode="lines+markers", - marker_symbol="circle", - marker_size=3, - name=VARS[0], - yaxis="y1", - line_color=colors_traces[0], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[1]], - mode="lines+markers", - marker_symbol="diamond", - marker_size=3, - name=VARS[1], - yaxis="y2", - line_color=colors_traces[1], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[2]], - mode="lines+markers", - marker_symbol="x", - marker_size=3, - name=VARS[2], - yaxis="y3", - line_color=colors_traces[2], - ) - ), - fig.add_trace( - 
go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[3]], - mode="lines+markers", - marker_symbol="star", - marker_size=3, - name=VARS[3], - yaxis="y4", - line_color=colors_traces[3], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[4]], - mode="lines+markers", - marker_symbol="triangle-up", - marker_size=3, - name=VARS[4], - yaxis="y5", - line_color=colors_traces[4], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[5]], - mode="lines", - name=VARS[5], - yaxis="y6", - line_color=colors_traces[5], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[6]], - mode="lines", - name=VARS[6], - yaxis="y7", - line_color=colors_traces[6], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[7]], - mode="lines", - name=VARS[7], - yaxis="y8", - line_color=colors_traces[7], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_instance_resampled["class"], - mode="markers", - name="Label", - yaxis="y9", - line_color=colors_traces[8], - ) - ), - fileName = instances_path_list[instance_index].split(os.sep) - fig.update_layout( - title=EVENT_NAMES[class_number] + " - " + fileName[-1], - xaxis_title="Time(s)", - yaxis_title="z-score", - font=dict(size=12), - yaxis1=dict( - tickfont=dict(color=colors_traces[0]), - position=0, - tickformat=".2f", - showticklabels=False, - ), - yaxis2=dict( - tickfont=dict(color=colors_traces[1]), - overlaying="y", - side="left", - position=0.05, - tickformat=".2f", - showticklabels=False, - ), - yaxis3=dict( - tickfont=dict(color=colors_traces[2]), - overlaying="y", - side="left", - position=0.10, - tickformat=".2f", - showticklabels=False, - ), - yaxis4=dict( - tickfont=dict(color=colors_traces[3]), - overlaying="y", - side="left", - position=0.15, - tickformat=".2f", - showticklabels=False, - ), - yaxis5=dict( - tickfont=dict(color=colors_traces[4]), - overlaying="y", - side="left", - position=0.2, - tickformat=".2f", - showticklabels=False, - ), - yaxis6=dict( - tickfont=dict(color=colors_traces[5]), - overlaying="y", - side="left", - position=0.25, - tickformat=".2f", - showticklabels=False, - ), - yaxis7=dict( - tickfont=dict(color=colors_traces[6]), - overlaying="y", - side="left", - position=0.3, - tickformat=".2f", - showticklabels=False, - ), - yaxis8=dict( - tickfont=dict(color=colors_traces[7]), - overlaying="y", - side="left", - position=0.35, - tickformat=".2f", - showticklabels=False, - ), - yaxis9=dict( - tickfont=dict(color=colors_traces[8]), - anchor="x", - overlaying="y", - side="left", - ), - ) - fig.show() + plt.rcParams["legend.fontsize"]
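Taken together, the changes to `toolkit/misc.py` make the analysis helpers operate on a single DataFrame returned by `load_3w_dataset()` instead of on per-file `(label, path)` tuples. The snippet below is a minimal sketch of that flow, not part of this patch; it assumes the `label` and `state` columns used above are present, that the helpers remain importable from `toolkit.misc`, and that the base path and the 1% threshold are placeholders.

```python
from toolkit.misc import (
    load_3w_dataset,  # re-exported via the import added in this patch
    create_table_of_instances,
    filter_rare_undesirable_events,
    create_and_plot_scatter_map,
)

# Load the real instances of the 3W Dataset 2.0 as a single DataFrame.
# The 'label' and 'state' columns are assumed to exist, as elsewhere in
# this patch; 'dataset' is a placeholder for the actual base path.
df = load_3w_dataset(data_type="real", base_path="dataset")

# Instances per label and per knowledge source, with TOTAL row and column.
toi = create_table_of_instances(df)
print(toi)

# Keep only rare event types (here, below 1% of the considered instances).
rue = filter_rare_undesirable_events(toi, threshold=0.01)
print(rue)

# Scatter map over time; returns the first and the last year of occurrence.
first_year, last_year = create_and_plot_scatter_map(df)
print(f"Instances span {first_year}-{last_year}")
```

Keeping one DataFrame as the interchange format is what lets `create_table_of_instances()` pivot directly on `state` and lets `create_and_plot_scatter_map()` drop the former `load_instances()` call.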