Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented custom dataset creator class #962

Open
wants to merge 5 commits into
base: nextjs
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
265 changes: 165 additions & 100 deletions training/training/core/dataset.py
Original file line number Diff line number Diff line change
@@ -1,100 +1,165 @@
from abc import ABC, abstractmethod
from typing import Callable, Optional, Union, cast

from numpy import ndarray
from sklearn.model_selection import train_test_split
from sklearn.utils import Bunch
from sklearn.conftest import fetch_california_housing
from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris, load_wine
from torch.utils.data import TensorDataset
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.autograd import Variable


class TrainTestDatasetCreator(ABC):
    """Abstract factory for a pair of PyTorch datasets (train and test).

    Concrete creators must implement both methods; the class cannot be
    instantiated until they do.
    """

    @abstractmethod
    def createTrainDataset(self) -> Dataset:
        """Build and return the dataset used for training."""

    @abstractmethod
    def createTestDataset(self) -> Dataset:
        """Build and return the dataset used for testing."""


class SklearnDatasetCreator(TrainTestDatasetCreator):
    """Creator that splits a tabular dataset into train/test PyTorch datasets.

    Instances are built either directly from a feature frame and a target
    series, or via :meth:`fromDefault` from one of the scikit-learn loaders
    registered in ``DEFAULT_DATASETS``.
    """

    # Registry of the built-in datasets, keyed by the public dataset name.
    # NOTE(review): the file imports ``fetch_california_housing`` from
    # ``sklearn.conftest``; presumably it should come from
    # ``sklearn.datasets`` -- confirm and fix at the import site.
    DEFAULT_DATASETS: dict[
        str, Callable[[], Union[Bunch, tuple[Bunch, tuple], tuple[ndarray, ndarray]]]
    ] = {
        "IRIS": load_iris,
        "BREAST_CANCER": load_breast_cancer,
        "CALIFORNIA_HOUSING": fetch_california_housing,
        "DIABETES": load_diabetes,
        "WINE": load_wine,
    }

    def __init__(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        test_size: float,
        shuffle: bool,
        category_list: Optional[list[str]],
    ) -> None:
        """Split ``X``/``y`` into train and test partitions.

        Args:
            X: Feature columns.
            y: Target column.
            test_size: Forwarded to ``train_test_split`` as ``test_size``.
            shuffle: Forwarded to ``train_test_split`` as ``shuffle``.
            category_list: Optional class names returned by
                :meth:`getCategoryList`; ``None`` when unavailable.
        """
        super().__init__()
        self._category_list = category_list
        self._X_train, self._X_test, self._y_train, self._y_test = cast(
            tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series],
            train_test_split(X, y, test_size=test_size, shuffle=shuffle),
        )

    @staticmethod
    def _bunchToFrame(raw_data) -> pd.DataFrame:
        """Flatten a loaded dataset into one frame with a trailing ``target`` column."""
        # ``feature_names`` may be a list or an ndarray depending on the
        # loader; ``ndarray + list`` is element-wise, not concatenation, so
        # normalise to a list before appending the target column name.
        column_names = list(raw_data["feature_names"]) + ["target"]  # type: ignore
        frame = pd.DataFrame(
            data=np.c_[raw_data["data"], raw_data["target"]],  # type: ignore
            columns=column_names,
        )
        # remove any empty lines
        frame.dropna(how="all", inplace=True)
        return frame

    @staticmethod
    def _toTensorDataset(X: pd.DataFrame, y: pd.Series) -> TensorDataset:
        """Convert one feature/target partition into a ``TensorDataset``.

        Features are reshaped to ``(rows, 1, columns)`` and flagged as
        requiring gradients; targets are reshaped to ``(rows, 1)``.
        """
        # ``torch.autograd.Variable`` is a deprecated no-op wrapper, so the
        # tensors are used directly.
        features = torch.Tensor(X.to_numpy())
        features = torch.reshape(
            features, (features.size()[0], 1, features.size()[1])
        )
        features.requires_grad_(True)

        targets = torch.Tensor(y.to_numpy())
        targets = torch.reshape(targets, (targets.size()[0], 1))
        return TensorDataset(features, targets)

    @classmethod
    def getDefaultDataset(cls, name: str):
        """Load the built-in dataset ``name`` as a single DataFrame.

        Raises:
            KeyError: If ``name`` is not a key of ``DEFAULT_DATASETS``.
        """
        return cls._bunchToFrame(cls.DEFAULT_DATASETS[name]())

    @classmethod
    def fromDefault(cls, name: str, test_size: float, shuffle: bool):
        """Build a creator from the built-in dataset ``name``.

        The loader is invoked once and its result reused for both the data
        frame and the optional ``target_names`` category list.

        Raises:
            KeyError: If ``name`` is not a key of ``DEFAULT_DATASETS``.
        """
        raw_data = cls.DEFAULT_DATASETS[name]()
        default_dataset = cls._bunchToFrame(raw_data)
        y = default_dataset["target"]
        X = default_dataset.drop("target", axis=1)
        target_names = getattr(raw_data, "target_names", None)
        category_list = None if target_names is None else list(target_names)
        return cls(X, y, test_size, shuffle, category_list)

    def createTrainDataset(self) -> Dataset:
        """Return the training partition as a ``TensorDataset``."""
        return self._toTensorDataset(self._X_train, self._y_train)

    def createTestDataset(self) -> Dataset:
        """Return the test partition as a ``TensorDataset``."""
        return self._toTensorDataset(self._X_test, self._y_test)

    def getCategoryList(self) -> list[str]:
        """Return the class names supplied at construction.

        Raises:
            Exception: If no category list was provided.
        """
        if self._category_list is None:
            raise Exception("Category list not available")
        return self._category_list
from abc import ABC, abstractmethod
from typing import Callable, Optional, Union, cast

from numpy import ndarray
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚫 [pyright] reported by reviewdog 🐶
Import "numpy" could not be resolved (reportMissingImports)

from sklearn.model_selection import train_test_split
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚫 [pyright] reported by reviewdog 🐶
Import "sklearn.model_selection" could not be resolved (reportMissingImports)

from sklearn.utils import Bunch
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚫 [pyright] reported by reviewdog 🐶
Import "sklearn.utils" could not be resolved (reportMissingImports)

from sklearn.conftest import fetch_california_housing
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚫 [pyright] reported by reviewdog 🐶
Import "sklearn.conftest" could not be resolved (reportMissingImports)

from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris, load_wine
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚫 [pyright] reported by reviewdog 🐶
Import "sklearn.datasets" could not be resolved (reportMissingImports)

from torch.utils.data import TensorDataset
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚫 [pyright] reported by reviewdog 🐶
Import "torch.utils.data" could not be resolved (reportMissingImports)

import numpy as np
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚫 [pyright] reported by reviewdog 🐶
Import "numpy" could not be resolved (reportMissingImports)

import pandas as pd
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚫 [pyright] reported by reviewdog 🐶
Import "pandas" could not be resolved (reportMissingImports)

import torch
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚫 [pyright] reported by reviewdog 🐶
Import "torch" could not be resolved (reportMissingImports)

from torch.utils.data import Dataset
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚫 [pyright] reported by reviewdog 🐶
Import "torch.utils.data" could not be resolved (reportMissingImports)

from torch.autograd import Variable
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚫 [pyright] reported by reviewdog 🐶
Import "torch.autograd" could not be resolved (reportMissingImports)


from sklearn.preprocessing import LabelEncoder
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚫 [pyright] reported by reviewdog 🐶
Import "sklearn.preprocessing" could not be resolved (reportMissingImports)

import boto3
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚫 [pyright] reported by reviewdog 🐶
Import "boto3" could not be resolved (reportMissingImports)

import io


class TrainTestDatasetCreator(ABC):
    """Abstract factory for a pair of PyTorch datasets (train and test).

    Concrete creators must implement both methods; the class cannot be
    instantiated until they do.
    """

    @abstractmethod
    def createTrainDataset(self) -> Dataset:
        """Build and return the dataset used for training."""

    @abstractmethod
    def createTestDataset(self) -> Dataset:
        """Build and return the dataset used for testing."""


class SklearnDatasetCreator(TrainTestDatasetCreator):
    """Creator that splits a tabular dataset into train/test PyTorch datasets.

    Instances are built either directly from a feature frame and a target
    series, or via :meth:`fromDefault` from one of the scikit-learn loaders
    registered in ``DEFAULT_DATASETS``.
    """

    # Registry of the built-in datasets, keyed by the public dataset name.
    # NOTE(review): the file imports ``fetch_california_housing`` from
    # ``sklearn.conftest``; presumably it should come from
    # ``sklearn.datasets`` -- confirm and fix at the import site.
    DEFAULT_DATASETS: dict[
        str, Callable[[], Union[Bunch, tuple[Bunch, tuple], tuple[ndarray, ndarray]]]
    ] = {
        "IRIS": load_iris,
        "BREAST_CANCER": load_breast_cancer,
        "CALIFORNIA_HOUSING": fetch_california_housing,
        "DIABETES": load_diabetes,
        "WINE": load_wine,
    }

    def __init__(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        test_size: float,
        shuffle: bool,
        category_list: Optional[list[str]],
    ) -> None:
        """Split ``X``/``y`` into train and test partitions.

        Args:
            X: Feature columns.
            y: Target column.
            test_size: Forwarded to ``train_test_split`` as ``test_size``.
            shuffle: Forwarded to ``train_test_split`` as ``shuffle``.
            category_list: Optional class names returned by
                :meth:`getCategoryList`; ``None`` when unavailable.
        """
        super().__init__()
        self._category_list = category_list
        self._X_train, self._X_test, self._y_train, self._y_test = cast(
            tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series],
            train_test_split(X, y, test_size=test_size, shuffle=shuffle),
        )

    @staticmethod
    def _bunchToFrame(raw_data) -> pd.DataFrame:
        """Flatten a loaded dataset into one frame with a trailing ``target`` column."""
        # ``feature_names`` may be a list or an ndarray depending on the
        # loader; ``ndarray + list`` is element-wise, not concatenation, so
        # normalise to a list before appending the target column name.
        column_names = list(raw_data["feature_names"]) + ["target"]  # type: ignore
        frame = pd.DataFrame(
            data=np.c_[raw_data["data"], raw_data["target"]],  # type: ignore
            columns=column_names,
        )
        # remove any empty lines
        frame.dropna(how="all", inplace=True)
        return frame

    @staticmethod
    def _toTensorDataset(X: pd.DataFrame, y: pd.Series) -> TensorDataset:
        """Convert one feature/target partition into a ``TensorDataset``.

        Features are reshaped to ``(rows, 1, columns)`` and flagged as
        requiring gradients; targets are reshaped to ``(rows, 1)``.
        """
        # ``torch.autograd.Variable`` is a deprecated no-op wrapper, so the
        # tensors are used directly.
        features = torch.Tensor(X.to_numpy())
        features = torch.reshape(
            features, (features.size()[0], 1, features.size()[1])
        )
        features.requires_grad_(True)

        targets = torch.Tensor(y.to_numpy())
        targets = torch.reshape(targets, (targets.size()[0], 1))
        return TensorDataset(features, targets)

    @classmethod
    def getDefaultDataset(cls, name: str):
        """Load the built-in dataset ``name`` as a single DataFrame.

        Raises:
            KeyError: If ``name`` is not a key of ``DEFAULT_DATASETS``.
        """
        return cls._bunchToFrame(cls.DEFAULT_DATASETS[name]())

    @classmethod
    def fromDefault(cls, name: str, test_size: float, shuffle: bool):
        """Build a creator from the built-in dataset ``name``.

        The loader is invoked once and its result reused for both the data
        frame and the optional ``target_names`` category list.

        Raises:
            KeyError: If ``name`` is not a key of ``DEFAULT_DATASETS``.
        """
        raw_data = cls.DEFAULT_DATASETS[name]()
        default_dataset = cls._bunchToFrame(raw_data)
        y = default_dataset["target"]
        X = default_dataset.drop("target", axis=1)
        target_names = getattr(raw_data, "target_names", None)
        category_list = None if target_names is None else list(target_names)
        return cls(X, y, test_size, shuffle, category_list)

    def createTrainDataset(self) -> Dataset:
        """Return the training partition as a ``TensorDataset``."""
        return self._toTensorDataset(self._X_train, self._y_train)

    def createTestDataset(self) -> Dataset:
        """Return the test partition as a ``TensorDataset``."""
        return self._toTensorDataset(self._X_test, self._y_test)

    def getCategoryList(self) -> list[str]:
        """Return the class names supplied at construction.

        Raises:
            Exception: If no category list was provided.
        """
        if self._category_list is None:
            raise Exception("Category list not available")
        return self._category_list


class TabularCustomDatasetCreator(TrainTestDatasetCreator):
    """Pulls a user-uploaded tabular dataset from an S3 bucket and exposes it
    as train/test PyTorch datasets."""

    def __init__(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        test_size: float,
        shuffle: bool,
        category_list: Optional[list[str]],
    ) -> None:
        """Split ``X``/``y`` into train and test partitions.

        Args:
            X: Feature columns.
            y: Target column.
            test_size: Forwarded to ``train_test_split`` as ``test_size``.
            shuffle: Forwarded to ``train_test_split`` as ``shuffle``.
            category_list: Optional list stored on the instance as
                ``_category_list``.
        """
        super().__init__()
        self._category_list = category_list
        self._X_train, self._X_test, self._y_train, self._y_test = cast(
            tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series],
            train_test_split(X, y, test_size=test_size, shuffle=shuffle),
        )

    @classmethod
    def read_s3(
        cls,
        uid: str,
        name: str,
        test_size: float,
        target_name: str,
        shuffle: bool = True,
        bucket: str = "dlp-upload-bucket",
    ):
        """Download ``{uid}/tabular/{name}`` from S3 as CSV and build a creator.

        Args:
            uid: First component of the object key.
            name: Object file name under ``{uid}/tabular/``.
            test_size: Forwarded to the constructor.
            target_name: Column of the CSV used as the target.
            shuffle: Forwarded to the constructor.
            bucket: S3 bucket to read from; defaults to the bucket that was
                previously hard-coded.

        Raises:
            KeyError: If ``target_name`` is not a column of the CSV.
        """
        s3 = boto3.client("s3")
        obj = s3.get_object(Bucket=bucket, Key=f"{uid}/tabular/{name}")
        data = pd.read_csv(io.BytesIO(obj["Body"].read()))
        y = data[target_name]
        X = data.drop(target_name, axis=1)

        # Any value that fails numeric coercion (or is already missing)
        # triggers label encoding of the whole target column. The vectorised
        # ``pd.to_numeric`` yields the same null mask as applying it per
        # element, without a Python-level call per row.
        if pd.to_numeric(y, errors="coerce").isnull().any():
            le = LabelEncoder()
            le.fit(y)
            # Keep the original index so the encoded target stays aligned
            # with the rows of ``X``.
            y = pd.Series(np.array(le.transform(y)), index=y.index)

        # NOTE(review): the category list holds only the target column name;
        # the encoder's class labels (``le.classes_``) are not kept --
        # confirm which one downstream consumers expect.
        return cls(X, y, test_size, shuffle, [target_name])

    @staticmethod
    def _toTensorDataset(X: pd.DataFrame, y: pd.Series) -> TensorDataset:
        """Convert one feature/target partition into a ``TensorDataset``.

        Features are reshaped to ``(rows, 1, columns)`` and flagged as
        requiring gradients; targets are reshaped to ``(rows, 1)``.
        """
        # ``torch.autograd.Variable`` is a deprecated no-op wrapper, so the
        # tensors are used directly.
        features = torch.Tensor(X.to_numpy())
        features = torch.reshape(
            features, (features.size()[0], 1, features.size()[1])
        )
        features.requires_grad_(True)

        targets = torch.Tensor(y.to_numpy())
        targets = torch.reshape(targets, (targets.size()[0], 1))
        return TensorDataset(features, targets)

    def createTrainDataset(self) -> Dataset:
        """Return the training partition as a ``TensorDataset``."""
        return self._toTensorDataset(self._X_train, self._y_train)

    def createTestDataset(self) -> Dataset:
        """Return the test partition as a ``TensorDataset``."""
        return self._toTensorDataset(self._X_test, self._y_test)