Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/adapt dev to 3w dataset 2.0.0 #126

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,19 @@ It is important to note that there are arbitrary choices in this toolkit, but th

The 3W Toolkit is implemented in sub-modules as described [here](3W_TOOLKIT_STRUCTURE.md).

### Loading the 3W Dataset 2.0

The `load_3w_dataset()` function loads the 3W Dataset 2.0, which is composed of multiple Parquet files organized in folders.

**Usage:**

```python
import toolkit as tk

# Load the real data from the 3W Dataset 2.0
df = tk.load_3w_dataset(data_type='real', base_path='path/to/dataset')
```

## Incorporated Problems

Specific problems will be incorporated into this project gradually. At this point, we can work on:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,19 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'numpy'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m 5\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtoolkit\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mtk\u001b[39;00m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'numpy'"
]
}
],
"source": [
"import sys\n",
"import os\n",
Expand All @@ -58,6 +70,8 @@
"sys.path.append(os.path.join('..','..','..'))\n",
"import toolkit as tk\n",
"\n",
"from toolkit.base import load_3w_dataset\n",
"\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format = 'svg'"
]
Expand All @@ -78,7 +92,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -101,7 +115,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -114,9 +128,43 @@
],
"source": [
"event_labels = list(experiment.event_labels.values())\n",
"event_labels_idx = {v: i for i, v in enumerate(event_labels)}\n",
"fold: tk.EventFold\n",
"folds: tk.EventFolds = experiment.folds()"
"event_labels_idx = {v: i for i, v in enumerate(event_labels)}\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Upload 3W Dataset 2.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = load_3w_dataset(data_type='real', base_path='path/to/dataset') # Replaced by correct path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create the folds manually"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"folds = tk.EventFolds(\n",
" experiment=experiment,\n",
" df=df, # Pass the loaded DataFrame to the EventFolds class\n",
" # ... (other parameters, if necessary) ...\n",
")\n"
]
},
{
Expand All @@ -135,7 +183,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -185,7 +233,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1501,7 +1549,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.12.0"
},
"toc": {
"base_numbering": 1,
Expand Down
60 changes: 57 additions & 3 deletions toolkit/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""This 3W toolkits' sub-module groups objects used by the other
sub-modules.
"""This 3W toolkits' sub-module groups objects used by the other
sub-modules.

Any resource that is not used by another sub-module must be maintained
Any resource that is not used by another sub-module must be maintained
in the miscellaneous sub-module.
"""

Expand Down Expand Up @@ -56,6 +56,59 @@ def load_config_in_dataset_ini():
return dict(dataset_ini)


def load_3w_dataset(data_type='real', base_path=None):
    """
    Load the 3W Dataset 2.0.

    The dataset is stored as multiple Parquet files organized in folders
    named ``0`` through ``9`` under ``base_path``. Every readable file is
    loaded, filtered by the requested data type, and the results are
    concatenated into a single DataFrame.

    Parameters
    ----------
    data_type : str, optional
        Type of data to be loaded ('real', 'simulated' or 'imputed').
        The default is 'real'.
    base_path : str, optional
        Path to the root folder of the dataset. When omitted, defaults to
        PATH_DATASET (resolved at call time rather than at import time).

    Returns
    -------
    pandas.DataFrame or None
        DataFrame with the 3W Dataset 2.0 data, or None when no data
        could be loaded.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the accepted values.
    """
    # Validate up front: the previous per-file check never raised for an
    # invalid data_type when no Parquet files were found.
    # NOTE(review): assumes the 'state' column encodes
    # real/simulated/imputed as 0/1/2 -- confirm against the 3W Dataset
    # 2.0 documentation.
    state_by_type = {'real': 0, 'simulated': 1, 'imputed': 2}
    if data_type not in state_by_type:
        raise ValueError(
            "Invalid data type. Choose between 'real', 'simulated' or 'imputed'."
        )
    state = state_by_type[data_type]

    # Resolve the default lazily so PATH_DATASET is only required when
    # actually used (and may be reconfigured after import).
    if base_path is None:
        base_path = PATH_DATASET

    dataframes = []
    for i in range(10):  # Dataset folders are named 0 to 9
        folder_path = os.path.join(base_path, str(i))
        if not os.path.exists(folder_path):
            print(f"Folder {folder_path} not found.")
            continue
        parquet_files = [
            f for f in os.listdir(folder_path) if f.endswith('.parquet')
        ]
        for file in parquet_files:
            file_path = os.path.join(folder_path, file)
            try:
                df = pd.read_parquet(file_path)
                # Keep only the rows whose 'state' matches the requested
                # data type.
                dataframes.append(df[df['state'] == state])
            except Exception as e:
                # Best-effort load: report and skip unreadable files.
                print(f"Error reading file {file_path}: {e}")

    # Concatenate all per-file DataFrames into a single DataFrame.
    if dataframes:
        return pd.concat(dataframes, ignore_index=True)
    print("No data found.")
    return None


# Loads all configurations present in the 3W Dataset's main
# configuration file and provides specific configurations in different
# granularity and formats
Expand Down Expand Up @@ -123,3 +176,4 @@ def __init__(self, event_name):
self.TRANSIENT = event_section.getboolean("TRANSIENT")
self.window = event_section.getint("WINDOW")
self.step = event_section.getint("STEP")

Loading