Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adapt MAIS to 3W Dataset 2.0 #130

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,19 @@ It is important to note that there are arbitrary choices in this toolkit, but th

The 3W Toolkit is implemented in sub-modules as described [here](3W_TOOLKIT_STRUCTURE.md).

### Loading the 3W Dataset 2.0

The `load_3w_dataset()` function loads the 3W Dataset 2.0, which is composed of multiple Parquet files organized in folders.

**Usage:**

```python
import toolkit as tk

# Load the real data from the 3W Dataset 2.0
df = tk.load_3w_dataset(data_type='real', base_path='path/to/dataset')
```

## Incorporated Problems

Specific problems will be incorporated into this project gradually. At this point, we can work on:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,19 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'numpy'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m 5\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtoolkit\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mtk\u001b[39;00m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'numpy'"
]
}
],
"source": [
"import sys\n",
"import os\n",
Expand All @@ -58,6 +70,8 @@
"sys.path.append(os.path.join('..','..','..'))\n",
"import toolkit as tk\n",
"\n",
"from toolkit.base import load_3w_dataset\n",
"\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format = 'svg'"
]
Expand All @@ -78,7 +92,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -101,7 +115,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -114,9 +128,43 @@
],
"source": [
"event_labels = list(experiment.event_labels.values())\n",
"event_labels_idx = {v: i for i, v in enumerate(event_labels)}\n",
"fold: tk.EventFold\n",
"folds: tk.EventFolds = experiment.folds()"
"event_labels_idx = {v: i for i, v in enumerate(event_labels)}\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Upload 3W Dataset 2.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = load_3w_dataset(data_type='real', base_path='path/to/dataset') # Replaced by correct path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create the folds manually"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"folds = tk.EventFolds(\n",
" experiment=experiment,\n",
" df=df, # Pass the loaded DataFrame to the EventFolds class\n",
" # ... (other parameters, if necessary) ...\n",
")\n"
]
},
{
Expand All @@ -135,7 +183,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -185,7 +233,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1501,7 +1549,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.12.0"
},
"toc": {
"base_numbering": 1,
Expand Down
20 changes: 17 additions & 3 deletions toolkit/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@

* [Introduction](#introduction)
* [Release Notes](#release-notes)
* [1.0.0](#100)
* [1.1.0](#110)
  * [1.0.0](#100)
  * [1.1.0](#110)
  * [1.2.0](#120)

# Introduction

Expand All @@ -36,4 +37,17 @@ Release: July 25, 2024.

Highlights:

1. Makes resources (functions and constants) compatible with 3W Dataset version 2.0.0, which is based on Parquet files.
1. Makes resources (functions and constants) compatible with 3W Dataset version 2.0.0, which is based on Parquet files.

## 1.2.0

Release: October 19, 2024 (latest version).

Highlights:

1. **Adapts `load_dataset()` to 3W Dataset 2.0:** The `load_dataset()` function in `base.py` was adapted to correctly handle the folder structure and different data types of the 3W Dataset 2.0. It was renamed to `load_3w_dataset()`.
2. **Updates `dev.py` for 3W Dataset 2.0:** The `dev.py` sub-module was updated to ensure compatibility with the new `load_3w_dataset()` function and the 3W Dataset 2.0 structure. The `extrai_arrays()` function was removed, and the `EventFolds` and `Experiment` classes were adjusted.
3. **Updates `misc.py` for 3W Dataset 2.0:** The `misc.py` sub-module was updated to ensure compatibility with the new `load_3w_dataset()` function and the 3W Dataset 2.0 structure. Redundant functions were removed, and existing functions were adapted to receive the DataFrame as a parameter.
4. **Updates `__init__.py` for 3W Dataset 2.0:** The `__init__.py` file was updated to import and expose the new `load_3w_dataset()` function.

These updates ensure that the 3W Toolkit is fully compatible with the 3W Dataset 2.0, providing a more efficient and streamlined workflow for loading and analyzing the data.
49 changes: 29 additions & 20 deletions toolkit/__init__.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,59 @@
"""This is the 3W Toolkit, a software package written in Python 3 that
"""This is the 3W Toolkit, a software package written in Python 3 that 
is one of the 3W Project's major components.

This toolkit contains resources that make the following easier:

- 3W Dataset overview generation;
- Experimentation and comparative analysis of Machine Learning-based
approaches and algorithms for specific problems related to undesirable
events that occur in offshore oil wells during their respective
- Experimentation and comparative  
analysis of Machine Learning-based  

approaches and algorithms for specific problems related to undesirable 
events that occur in offshore oil wells during their respective 
production phases;
* Standardization of key points of the Machine Learning-based algorithm
* Standardization of key points of the Machine Learning-based algorithm 
development pipeline.

All these resources are implemented in the following sub-modules:

- **base**: groups the objects used by the other sub-modules;
- **dev**: has all the resources related to development of Machine
- **dev**: has all the resources related to development of Machine  

Learning models;
- **misc**: brings together diverse resources that do not fit in the
- **misc**: brings together diverse resources that do not fit in the 
other sub-modules;
- **rolling_window**: creates a view of array which for every point
gives the n-dimensional neighbourhood of size window. New dimensions are
- **rolling_window**: creates a view of array which for every point 
gives the n-dimensional neighbourhood of size window. New dimensions are 
added at the end of array or after the corresponding original dimension.

Specific problems will be incorporated into this toolkit gradually. At
Specific problems will be incorporated into this toolkit gradually. At  

this time, models can be developed for the following problems:

- Binary Classifier of Spurious Closure of DHSV.

Examples of how to use this toolkit will be incremented throughout its
Examples of how to use this toolkit will be incremented throughout  
its 
development. Please, check the 3W Project's README.md file for more details.

It is important to note that there are arbitrary choices in this
toolkit, but they have been carefully made to allow adequate comparative
analysis without compromising the ability to experiment with different
It is important to note that there are arbitrary choices in this 
toolkit, but they have been carefully made to allow adequate comparative 
analysis without compromising the ability to experiment with different 
approaches and algorithms.

This toolkit's documentation is generated in english and in Google format
This toolkit's documentation is generated in english and in Google format 
with [autoDocstring - Python Docstring Generator
](https://github.com/NilsJPWerner/autoDocstring), which follows [PEP 257
](https://peps.python.org/pep-0257/), and [pdoc3
](https://pdoc3.github.io/pdoc/).

Its source code is implemented according to the style guide established
by [PEP 8](https://peps.python.org/pep-0008/). This is guaranteed with
Its source code is implemented according to the style guide established  

by [PEP 8](https://peps.python.org/pep-0008/). This is guaranteed with 
the use of the [Black formatter](https://github.com/psf/black).
"""

__status__ = "Development"
__version__ = "1.1.0"
__version__ = "1.2.0" # Update version number after changes
__license__ = "Apache License 2.0"
__copyright__ = "Copyright 2024, Petróleo Brasileiro S.A."
__authors__ = [
Expand All @@ -73,7 +79,8 @@
EventType,
LABELS_DESCRIPTIONS,
NORMAL_LABEL,
PARQUET_EXTENSION,
PARQUET_EXTENSION,  

PARQUET_ENGINE,
PARQUET_COMPRESSION,
PATH_3W_PROJECT,
Expand All @@ -83,6 +90,7 @@
PATH_TOOLKIT,
TRANSIENT_OFFSET,
VARS,
load_3w_dataset, # To use by 3W v2.0
load_config_in_dataset_ini,
)

Expand All @@ -104,7 +112,8 @@
get_all_labels_and_files,
label_and_file_generator,
load_instance,
load_instances,
load_instances,  

resample,
plot_instance,
)
60 changes: 57 additions & 3 deletions toolkit/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""This 3W toolkits' sub-module groups objects used by the other
sub-modules.
"""This 3W toolkits' sub-module groups objects used by the other
sub-modules.

Any resource that is not used by another sub-module must be maintained
Any resource that is not used by another sub-module must be maintained
in the miscellaneous sub-module.
"""

Expand Down Expand Up @@ -56,6 +56,59 @@ def load_config_in_dataset_ini():
return dict(dataset_ini)


def load_3w_dataset(data_type='real', base_path=PATH_DATASET):
    """Load the 3W Dataset 2.0 into a single DataFrame.

    Reads every ``.parquet`` file found in the sub-folders ``0`` to ``9``
    of ``base_path``, keeps only the rows whose ``state`` column matches
    the requested ``data_type`` and concatenates the results.

    Parameters
    ----------
    data_type : str, optional
        Type of data to be loaded ('real', 'simulated' or 'imputed').
        The default is 'real'.
    base_path : str, optional
        Path to the root folder of the dataset. The default is PATH_DATASET.

    Returns
    -------
    pandas.DataFrame or None
        DataFrame with the filtered rows of all files that could be read
        (with a fresh integer index), or None when no file was loaded.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values.
    """
    # Value of the ``state`` column kept for each supported data type.
    # NOTE(review): confirm that these `state` codes really identify
    # real / simulated / imputed rows in the 3W Dataset 2.0 schema.
    state_by_data_type = {'real': 0, 'simulated': 1, 'imputed': 2}

    # Validate once, before any I/O. Raising inside the per-file ``try``
    # below would be swallowed by its ``except`` and the caller would
    # only see printed messages and a ``None`` result.
    if data_type not in state_by_data_type:
        raise ValueError(
            "Invalid data type. Choose between 'real', 'simulated' or 'imputed'."
        )
    state = state_by_data_type[data_type]

    dataframes = []
    for i in range(10):  # Loop through folders 0 to 9
        folder_path = os.path.join(base_path, str(i))
        # ``isdir`` (rather than ``exists``) also rejects a regular file
        # that happens to be named like a folder, which ``os.listdir``
        # could not handle.
        if not os.path.isdir(folder_path):
            print(f"Folder {folder_path} not found.")
            continue

        # ``os.listdir`` returns names in arbitrary order; sorting keeps
        # the row order of the result reproducible across runs.
        parquet_files = sorted(
            f for f in os.listdir(folder_path) if f.endswith('.parquet')
        )
        for file in parquet_files:
            file_path = os.path.join(folder_path, file)
            # Deliberately best-effort: a file that cannot be read (or
            # that lacks the ``state`` column) is reported and skipped so
            # that the remaining files are still loaded.
            try:
                df = pd.read_parquet(file_path)
                dataframes.append(df[df['state'] == state])
            except Exception as e:
                print(f"Error reading file {file_path}: {e}")

    if not dataframes:
        print("No data found.")
        return None

    # Concatenate all DataFrames into a single DataFrame
    return pd.concat(dataframes, ignore_index=True)


# Loads all configurations present in the 3W Dataset's main
# configuration file and provides specific configurations in different
# granularity and formats
Expand Down Expand Up @@ -123,3 +176,4 @@ def __init__(self, event_name):
self.TRANSIENT = event_section.getboolean("TRANSIENT")
self.window = event_section.getint("WINDOW")
self.step = event_section.getint("STEP")

Loading