Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Refactor] Made CrossValTypes, HoldoutValTypes to have split functions directly #108

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Prev Previous commit
Next Next commit
[fix] Fix mypy issues and modify the test accordingly
Since the previous code defaulted to shuffle = True and shuffled the indices
before splitting, the test cases for CV and Holdout did not match.
More specifically, when I brought back the following, I could reproduce
the original outputs:
1. Bring back _get_indices in BaseDataset
2. Make the default value of self.shuffle in BaseDataset True
3. Input shuffle = True in KFold instead of using ShuffleSplit
These changes reproduce the original outputs.
Note that KFold(shuffle=True) and ShuffleSplit() are not identical,
so even when we pass the same random_state, the results are not reproduced.
nabenabe0928 committed May 19, 2021

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit eee3b1c49a823f6242b9c624e7e87fcab1f2dba0
6 changes: 3 additions & 3 deletions autoPyTorch/datasets/base_dataset.py
Original file line number Diff line number Diff line change
@@ -233,7 +233,7 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]
labels_to_stratify = self.train_tensors[-1] if self.is_stratify else None

if isinstance(self.resampling_strategy, HoldoutValTypes):
val_share = self.resampling_strategy_args['val_share']
val_share = self.resampling_strategy_args.get('val_share', None)

return self.resampling_strategy(
random_state=self.random_state,
@@ -243,11 +243,11 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]
labels_to_stratify=labels_to_stratify
)
elif isinstance(self.resampling_strategy, CrossValTypes):
num_splits = self.resampling_strategy_args['num_splits']
num_splits = self.resampling_strategy_args.get('num_splits', None)

return self.resampling_strategy(
random_state=self.random_state,
num_splits=int(num_splits),
num_splits=num_splits,
shuffle=self.shuffle,
indices=self._get_indices(),
labels_to_stratify=labels_to_stratify
47 changes: 24 additions & 23 deletions autoPyTorch/datasets/resampling_strategy.py
Original file line number Diff line number Diff line change
@@ -26,19 +26,19 @@ class _ResamplingStrategyArgs(NamedTuple):
class HoldoutFuncs():
@staticmethod
def holdout_validation(
random_state: np.random.RandomState,
val_share: float,
indices: np.ndarray,
random_state: Optional[np.random.RandomState] = None,
val_share: Optional[float] = None,
shuffle: bool = False,
labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None
) -> List[Tuple[np.ndarray, np.ndarray]]:

train, val = train_test_split(
indices, test_size=val_share, shuffle=shuffle,
random_state=random_state if shuffle else None,
indices, test_size=val_share,
shuffle=shuffle, random_state=random_state,
stratify=labels_to_stratify
)
return [train, val]
return [(train, val)]


class CrossValFuncs():
@@ -52,9 +52,9 @@ class CrossValFuncs():

@staticmethod
def k_fold_cross_validation(
random_state: np.random.RandomState,
num_splits: int,
indices: np.ndarray,
random_state: Optional[np.random.RandomState] = None,
num_splits: Optional[int] = None,
shuffle: bool = False,
labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None
) -> List[Tuple[np.ndarray, np.ndarray]]:
@@ -70,22 +70,15 @@ def k_fold_cross_validation(

@staticmethod
def time_series(
random_state: np.random.RandomState,
num_splits: int,
indices: np.ndarray,
random_state: Optional[np.random.RandomState] = None,
num_splits: Optional[int] = None,
shuffle: bool = False,
labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None
) -> List[Tuple[np.ndarray, np.ndarray]]:
"""
Returns train and validation indices respecting the temporal ordering of the data.

Args:
indices (np.ndarray): array of indices to be split
num_splits (int): number of cross validation splits

Returns:
splits (List[Tuple[List, List]]): list of tuples of training and validation indices

Examples:
>>> indices = np.array([0, 1, 2, 3])
>>> CrossValFuncs.time_series_cross_validation(3, indices)
@@ -94,7 +87,7 @@ def time_series(
([0, 1, 2], [3])]

"""
cv = TimeSeriesSplit(n_splits=num_splits, random_state=random_state)
cv = TimeSeriesSplit(n_splits=num_splits)
splits = list(cv.split(indices))
return splits

@@ -122,9 +115,9 @@ class CrossValTypes(Enum):

def __call__(
self,
random_state: np.random.RandomState,
indices: np.ndarray,
num_splits: int = 5,
random_state: Optional[np.random.RandomState] = None,
num_splits: Optional[int] = None,
shuffle: bool = False,
labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None
) -> List[Tuple[np.ndarray, np.ndarray]]:
@@ -144,8 +137,12 @@ def __call__(
splits[a split identifier][0: train, 1: val][a data point identifier]

"""

default_num_splits = _ResamplingStrategyArgs().num_splits
num_splits = num_splits if num_splits is not None else default_num_splits

return self.value(
random_state=random_state,
random_state=random_state if shuffle else None,
num_splits=num_splits,
indices=indices,
shuffle=shuffle,
@@ -181,9 +178,9 @@ class HoldoutValTypes(Enum):

def __call__(
self,
random_state: np.random.RandomState,
indices: np.ndarray,
val_share: float = 0.33,
random_state: Optional[np.random.RandomState] = None,
val_share: Optional[float] = None,
shuffle: bool = False,
labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None
) -> List[Tuple[np.ndarray, np.ndarray]]:
@@ -203,8 +200,12 @@ def __call__(
splits[a split identifier][0: train, 1: val][a data point identifier]

"""

default_val_share = _ResamplingStrategyArgs().val_share
val_share = val_share if val_share is not None else default_val_share

return self.value(
random_state=random_state,
random_state=random_state if shuffle else None,
val_share=val_share,
indices=indices,
shuffle=shuffle,
4 changes: 2 additions & 2 deletions test/test_evaluation/test_train_evaluator.py
Original file line number Diff line number Diff line change
@@ -112,7 +112,7 @@ def test_holdout(self, pipeline_mock):
self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)

self.assertEqual(evaluator.file_output.call_count, 1)
self.assertEqual(result, 0.5652173913043479)
self.assertEqual(result, 0.30434782608695654)
self.assertEqual(pipeline_mock.fit.call_count, 1)
# 3 calls because of train, holdout and test set
self.assertEqual(pipeline_mock.predict_proba.call_count, 3)
@@ -150,7 +150,7 @@ def test_cv(self, pipeline_mock):
self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)

self.assertEqual(evaluator.file_output.call_count, 1)
self.assertEqual(result, 0.46235467431119603)
self.assertEqual(result, 0.4651019270584489)
self.assertEqual(pipeline_mock.fit.call_count, 5)
# 9 calls because of the training, holdout and
# test set (3 sets x 5 folds = 15)