Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable LLM-Driven Data Exploration with Presets and 3W Integration #55

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,7 @@ dmypy.json
# Pyre type checker
.pyre/

# Temporary files
temp.py
temp.ipynb
/temp
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"editor.formatOnSave": false
}
16 changes: 15 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,18 @@ To apply the new logic in the library, it will be necessary to implement the use

### Additional Features

Preferably, use the `_bibmon_tools.py` file to implement additional features.
Preferably, use the `_bibmon_tools.py` file to implement additional features.

### Testing New Functionalities

Before adding new functionality, install the testing dependencies by running the following command:

```bash
pip install -r test/requirements.txt
```

After implementing the new functionality, run the test suite to make sure the new code works correctly:

```bash
pytest
```
4 changes: 2 additions & 2 deletions bibmon/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
from ._sbm import SBM
from ._sklearn_regressor import sklearnRegressor
from ._preprocess import PreProcess
from ._load_data import load_tennessee_eastman, load_real_data
from ._load_data import load_tennessee_eastman, load_real_data, load_3w
from ._bibmon_tools import train_val_test_split, complete_analysis, comparative_table, spearmanr_dendrogram, create_df_with_dates, create_df_with_noise, align_dfs_by_rows

__all__ = ['Autoencoder','PCA','ESN','SBM',
'sklearnRegressor', 'PreProcess',
'load_tennessee_eastman', 'load_real_data',
'load_tennessee_eastman', 'load_real_data', 'load_3w',
'train_val_test_split', 'complete_analysis', 'comparative_table',
'spearmanr_dendrogram', 'create_df_with_dates',
'create_df_with_noise', 'align_dfs_by_rows']
85 changes: 84 additions & 1 deletion bibmon/_bibmon_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from typing import Literal

###############################################################################

Expand Down Expand Up @@ -692,4 +693,86 @@ def comparative_table (models, X_train, X_validation, X_test,

return_tables.append(times_df)

return return_tables
return return_tables

##############################################################################

def find_df_transitions(
    df: pd.DataFrame,
    threshold: float = 1,
    data_type: Literal["string", "number"] = "number",
    label: str = None,
) -> list[int]:
    """
    Finds transitions in one column of a DataFrame. This can be used to find
    indices of interesting events in the data.

    The column is scanned from top to bottom. Each value is compared with the
    reference value, which starts as the first row and is replaced by the
    current value every time a transition is detected.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be analyzed.
    threshold: float, optional
        Minimum absolute difference between the current value and the
        reference value for a transition to be reported.
        Used only if data_type is 'number'.
    data_type: str, optional
        Type of data to be analyzed. If 'number', the threshold is used to
        detect transitions. If 'string', any change in value is a transition.
        Any other value results in no transitions being reported.
    label: str
        Name of the column to be analyzed. If None, no analysis is done.

    Returns
    ----------
    : list of ints
        Positional indices (row positions) of the transitions. Empty if
        label is None or if the DataFrame has no rows.
    """

    if label is None:
        return []

    # Extract the column once instead of indexing the DataFrame on every row.
    values = df[label].tolist()

    # An empty column has no first value to compare against.
    if not values:
        return []

    transitions = []
    previous_event = values[0]

    for i, current in enumerate(values[1:], start=1):
        if data_type == "number":
            changed = abs(current - previous_event) > threshold
        elif data_type == "string":
            changed = current != previous_event
        else:
            changed = False

        if changed:
            transitions.append(i)
            previous_event = current

    return transitions

###############################################################################

def split_df_percentages(df: pd.DataFrame, percentages: list[float]) -> list[pd.DataFrame]:
    """
    Splits a DataFrame into multiple consecutive DataFrames according to the
    given percentages. The sum of the percentages must equal 1 (within
    floating-point tolerance).

    For example, if percentages = [0.6, 0.2, 0.2], the function will return a
    list with three DataFrames, the first one with 60% of the data, the second
    one with 20% and the third one with 20%.

    Warning: The size of each part is truncated to an integer number of rows,
    so trailing rows are dropped if the split cannot be done exactly
    according to the percentages.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be split.
    percentages: list of floats
        List of percentages (fractions of the total) to be used in the split.

    Returns
    ----------
    : list of pandas.DataFrames
        List with the split DataFrames, in the same order as the percentages.

    Raises
    ----------
    ValueError
        If the percentages do not sum to 1.
    """
    import math

    # Exact equality rejects valid inputs such as [0.7, 0.2, 0.1], whose
    # floating-point sum is 0.9999999999999999.
    if not math.isclose(sum(percentages), 1.0, rel_tol=1e-9, abs_tol=1e-9):
        raise ValueError("The sum of the percentages must be 1.")

    total_rows = len(df)
    split_dfs = []
    start = 0

    for percentage in percentages:
        # Remove float noise before truncating: 0.29 * 100 evaluates to
        # 28.999999999999996 and would otherwise lose a row.
        size = int(round(percentage * total_rows, 9))
        end = start + size
        split_dfs.append(df.iloc[start:end])
        start = end

    return split_dfs
161 changes: 121 additions & 40 deletions bibmon/_load_data.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
import os
import pandas as pd
import importlib.resources as pkg_resources
from typing import Tuple
import requests
import io
import configparser
from tqdm import tqdm

from ._bibmon_tools import create_df_with_dates
from . import real_process_data, tennessee_eastman

from . import _bibmon_tools as b_tools
from . import real_process_data, tennessee_eastman, three_w

###############################################################################

def load_tennessee_eastman(train_id=0, test_id=0):
    """
    Load the 'Tennessee Eastman Process' benchmark data.

    Parameters
    ----------
    train_id: int, optional
        Identifier of the training data.
        No fault: 0. With faults: 1 to 20.
    test_id: int, optional
        Identifier of the test data.
        No fault: 0. With faults: 1 to 20.
    Returns
    ----------
    train_df: pandas.DataFrame
        Training data.
    test_df: pandas.DataFrame
        Test data.
    """

    tags1 = [f"XMEAS({ii})" for ii in range(1, 42)]
    tags2 = [f"XMV({ii})" for ii in range(1, 12)]
    tags = tags1 + tags2

    # File names use a zero-padded, two-digit identifier (d00.dat, d05_te.dat).
    # The pad must go *before* the digit: inserting it after would turn
    # identifier 5 into "d50.dat" and silently load the wrong file.
    file_train = f"d{str(train_id).zfill(2)}.dat"
    file_test = f"d{str(test_id).zfill(2)}_te.dat"

    with pkg_resources.path(tennessee_eastman, file_train) as filepath:

        if file_train == "d00.dat":

            # This file is parsed differently from the others: each text line
            # is read as a single string and becomes one variable (column).
            tmp1 = pd.read_csv(filepath, sep="\t", names=["0"])
            tmp2 = pd.DataFrame(
                [tmp1.T.iloc[0, i].strip() for i in range(tmp1.shape[0])]
            )
            train_df = pd.DataFrame()

            for ii in range(52):
                # split() without arguments tolerates any run of whitespace
                # between the values.
                train_df[tags[ii]] = [float(s) for s in tmp2[0][ii].split()]

            train_df = b_tools.create_df_with_dates(train_df, freq="3min")

        else:

            train_df = b_tools.create_df_with_dates(
                pd.read_csv(filepath, sep=r"\s+", names=tags), freq="3min"
            )

    with pkg_resources.path(tennessee_eastman, file_test) as filepath:

        test_df = b_tools.create_df_with_dates(
            pd.read_csv(filepath, sep=r"\s+", names=tags),
            start="2020-02-01 00:00:00",
            freq="3min",
        )

    return train_df, test_df


###############################################################################

def load_real_data():
    """
    Load the sample of real process data shipped with the library.
    The variables have been anonymized so the data can be distributed.

    Returns
    ----------
    : pandas.DataFrame
        Process data, indexed by the first CSV column with dates parsed.
    """

    csv_name = "real_process_data.csv"

    with pkg_resources.path(real_process_data, csv_name) as csv_path:
        process_df = pd.read_csv(csv_path, index_col=0, parse_dates=True)

    return process_df


###############################################################################

def load_3w(dataset_class: int = 8, dataset_name: str = "WELL-00019_20120601165020.parquet") -> Tuple[pd.DataFrame, configparser.ConfigParser, int]:
    """
    Load a dataset from the '3W' benchmark.

    The default dataset (class 8, 'WELL-00019_20120601165020.parquet') is
    read from the package resources. Any other class or dataset name is
    downloaded from the 3W GitHub repository.

    Warning: This assumes that the dataset is available in the repository.
    If the dataset is not available, the function will raise an error.
    This will not download a new config file.

    Parameters
    ----------
    dataset_class: int, optional
        Identifier of the dataset class.
    dataset_name: str, optional
        Name of the dataset file.
    Returns
    ----------
    : pandas.DataFrame
        Process data.
    : configparser.ConfigParser
        Configuration file.
    : int
        Identifier of the dataset class.

    Raises
    ----------
    ValueError
        If the configuration file or the dataset could not be loaded.
    requests.HTTPError
        If the download request is answered with an error status.
    """
    ini = three_w.tools.load_dataset_ini()

    # Validate before the first ini.get() call; checking afterwards would
    # surface a missing configuration as an AttributeError instead.
    if ini is None:
        raise ValueError("The dataset configuration file could not be loaded.")

    parquet_engine = ini.get("PARQUET_SETTINGS", "PARQUET_ENGINE")
    data_frame: pd.DataFrame = None

    if dataset_class == 8 and dataset_name == "WELL-00019_20120601165020.parquet":
        with pkg_resources.path(three_w, dataset_name) as file:
            data_frame = pd.read_parquet(file, engine=parquet_engine)
    else:
        data_set_url = f'https://github.com/petrobras/3W/raw/refs/heads/main/dataset/{dataset_class}/{dataset_name}'

        print(f"Downloading dataset from {data_set_url}")

        parquet_file = io.BytesIO()
        chunk_size = 1024

        # A timeout keeps an unresponsive server from blocking forever, and
        # the context manager releases the connection once the body is read.
        with requests.get(data_set_url, stream=True, timeout=30) as response:
            response.raise_for_status()  # Check if the request was successful

            # Take the size from the GET response itself: it is read after
            # redirects were followed, which a plain HEAD request does not do.
            file_size = int(response.headers.get('content-length', 0))

            # total=None lets tqdm show a plain counter when the size is unknown.
            with tqdm(total=file_size or None, unit='B', unit_scale=True, desc=dataset_name) as pbar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    parquet_file.write(chunk)
                    pbar.update(len(chunk))

        # Reset the BytesIO buffer's position to the beginning
        parquet_file.seek(0)

        data_frame = pd.read_parquet(parquet_file, engine=parquet_engine)

    if data_frame is None:
        raise ValueError("The dataset could not be loaded.")

    return (
        data_frame,
        ini,
        int(dataset_class),
    )
2 changes: 1 addition & 1 deletion bibmon/_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def a_pp(self, a_pp):

###########################################################################

def apply(self, df, train_or_test = 'train'):
def apply(self, df, train_or_test = 'train') -> pd.DataFrame:
"""
Sequentially applies the preprocessing functions
defined during initialization.
Expand Down
Loading