Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/adapt dev to 3w dataset 2.0.0 #126

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,19 @@ It is important to note that there are arbitrary choices in this toolkit, but th

The 3W Toolkit is implemented in sub-modules as described [here](3W_TOOLKIT_STRUCTURE.md).

### Loading the 3W Dataset 2.0

The `load_3w_dataset()` function loads the 3W Dataset 2.0, which is composed of multiple Parquet files organized in folders.

**Usage:**

```python
import toolkit as tk

# Load the real data from the 3W Dataset 2.0
df = tk.load_3w_dataset(data_type='real', base_path='path/to/dataset')
```

## Incorporated Problems

Specific problems will be incorporated into this project gradually. At this point, we can work on:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,19 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'numpy'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m 5\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtoolkit\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mtk\u001b[39;00m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'numpy'"
]
}
],
"source": [
"import sys\n",
"import os\n",
Expand All @@ -58,6 +70,8 @@
"sys.path.append(os.path.join('..','..','..'))\n",
"import toolkit as tk\n",
"\n",
"from toolkit.base import load_3w_dataset\n",
"\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format = 'svg'"
]
Expand All @@ -78,7 +92,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -101,7 +115,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -114,9 +128,43 @@
],
"source": [
"event_labels = list(experiment.event_labels.values())\n",
"event_labels_idx = {v: i for i, v in enumerate(event_labels)}\n",
"fold: tk.EventFold\n",
"folds: tk.EventFolds = experiment.folds()"
"event_labels_idx = {v: i for i, v in enumerate(event_labels)}\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Upload 3W Dataset 2.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = load_3w_dataset(data_type='real', base_path='path/to/dataset') # Replaced by correct path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create the folds manually"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"folds = tk.EventFolds(\n",
" experiment=experiment,\n",
" df=df, # Pass the loaded DataFrame to the EventFolds class\n",
" # ... (other parameters, if necessary) ...\n",
")\n"
]
},
{
Expand All @@ -135,7 +183,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -185,7 +233,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1501,7 +1549,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.12.0"
},
"toc": {
"base_numbering": 1,
Expand Down
60 changes: 57 additions & 3 deletions toolkit/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""This 3W toolkits' sub-module groups objects used by the other
sub-modules.
"""This 3W toolkits' sub-module groups objects used by the other
sub-modules.

Any resource that is not used by another sub-module must be maintained
Any resource that is not used by another sub-module must be maintained
in the miscellaneous sub-module.
"""

Expand Down Expand Up @@ -56,6 +56,59 @@ def load_config_in_dataset_ini():
return dict(dataset_ini)


def load_3w_dataset(data_type='real', base_path=None):
    """
    Load the 3W Dataset 2.0.

    The dataset is stored as multiple Parquet files organized in folders
    named ``0`` through ``9`` under ``base_path``. Every readable file is
    loaded, filtered by the requested data type, and the results are
    concatenated into a single DataFrame.

    Parameters
    ----------
    data_type : str, optional
        Type of data to be loaded ('real', 'simulated' or 'imputed').
        The default is 'real'.
    base_path : str, optional
        Path to the root folder of the dataset. When omitted, defaults to
        PATH_DATASET (resolved at call time rather than at import time).

    Returns
    -------
    pandas.DataFrame or None
        DataFrame with the 3W Dataset 2.0 data, or None when no data
        could be loaded.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the accepted values.
    """
    # Validate up front: the previous per-file check never raised for an
    # invalid data_type when no Parquet files were found.
    # NOTE(review): assumes the 'state' column encodes
    # real/simulated/imputed as 0/1/2 -- confirm against the 3W Dataset
    # 2.0 documentation.
    state_by_type = {'real': 0, 'simulated': 1, 'imputed': 2}
    if data_type not in state_by_type:
        raise ValueError(
            "Invalid data type. Choose between 'real', 'simulated' or 'imputed'."
        )
    state = state_by_type[data_type]

    # Resolve the default lazily so PATH_DATASET is only required when
    # actually used (and may be reconfigured after import).
    if base_path is None:
        base_path = PATH_DATASET

    dataframes = []
    for i in range(10):  # Dataset folders are named 0 to 9
        folder_path = os.path.join(base_path, str(i))
        if not os.path.exists(folder_path):
            print(f"Folder {folder_path} not found.")
            continue
        parquet_files = [
            f for f in os.listdir(folder_path) if f.endswith('.parquet')
        ]
        for file in parquet_files:
            file_path = os.path.join(folder_path, file)
            try:
                df = pd.read_parquet(file_path)
                # Keep only the rows whose 'state' matches the requested
                # data type.
                dataframes.append(df[df['state'] == state])
            except Exception as e:
                # Best-effort load: report and skip unreadable files.
                print(f"Error reading file {file_path}: {e}")

    # Concatenate all per-file DataFrames into a single DataFrame.
    if dataframes:
        return pd.concat(dataframes, ignore_index=True)
    print("No data found.")
    return None


# Loads all configurations present in the 3W Dataset's main
# configuration file and provides specific configurations in different
# granularity and formats
Expand Down Expand Up @@ -123,3 +176,4 @@ def __init__(self, event_name):
self.TRANSIENT = event_section.getboolean("TRANSIENT")
self.window = event_section.getint("WINDOW")
self.step = event_section.getint("STEP")

Loading