Adding capability of taking auxiliary data #436

Open
wants to merge 10 commits into base: master
3 changes: 3 additions & 0 deletions amlb/benchmark.py
@@ -489,6 +489,9 @@ def load_data(self):
else:
raise ValueError("Tasks should have one property among [openml_task_id, openml_dataset_id, dataset].")

if hasattr(self._task_def, 'auxiliary_data'):
self._dataset = Benchmark.data_loader.load_auxiliary_data(DataSourceType.file, dataset=self._dataset, auxiliary_data=self._task_def.auxiliary_data, fold=self.fold)

def as_job(self):
job = Job(name=rconfig().token_separator.join([
'local',
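
The new branch in load_data() only triggers when the task definition exposes an auxiliary_data attribute. The exact definition schema isn't spelled out in this diff, so the following is a minimal hand-built sketch (names and paths are illustrative only; real definitions normally come from the benchmark configuration files and are parsed into the same Namespace structure):

from amlb.utils import Namespace as ns

# Hypothetical task definition mirroring the resources used in the unit test below.
task_def = ns(
    name="kc2_with_images",
    dataset=ns(train="~/data/kc2_train.csv",
               test="~/data/kc2_test.csv",
               target="problems"),
    auxiliary_data=ns(train="~/data/image_train.zip",
                      test="~/data/image_test.zip"),
)

# hasattr(task_def, 'auxiliary_data') is True, so load_data() would call
# Benchmark.data_loader.load_auxiliary_data(...) and attach the zip paths to
# the dataset's train and test splits.
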
30 changes: 30 additions & 0 deletions amlb/data.py
@@ -84,6 +84,28 @@ def __repr__(self):
return repr_def(self)


class AuxData(ABC):

def __init__(self):
super().__init__()

@property
def path(self) -> str:
pass

@property
@abstractmethod
def data(self) -> DF:
"""
:return: the auxiliary data as a pandas DataFrame.
"""
pass

@profile(logger=log)
def release(self, properties=None):
clear_cache(self, properties)


class Datasplit(ABC):

def __init__(self, dataset, format):
@@ -98,6 +120,14 @@ def __init__(self, dataset, format):
def path(self) -> str:
return self.data_path(self.format)

@property
def has_auxiliary_data(self) -> bool:
pass

@property
def auxiliary_data(self) -> AuxData:
pass

@abstractmethod
def data_path(self, format: str) -> str:
"""
6 changes: 6 additions & 0 deletions amlb/datasets/__init__.py
@@ -24,5 +24,11 @@ def load(self, source: DataSourceType, *args, **kwargs):
else:
raise NotImplementedError(f"data source {source} is not supported yet")

def load_auxiliary_data(self, source: DataSourceType, *args, **kwargs):
if source == DataSourceType.file:
return self.file_loader.load_auxiliary_data(*args, **kwargs)
else:
raise NotImplementedError(f"data source {source} is not supported yet")


__all__ = ["DataLoader", "DataSourceType"]
163 changes: 128 additions & 35 deletions amlb/datasets/file.py
@@ -10,7 +10,7 @@
import pandas as pd
import pandas.api.types as pat

from ..data import Dataset, DatasetType, Datasplit, Feature
from ..data import AuxData, Dataset, DatasetType, Datasplit, Feature, DF
from ..datautils import read_csv, to_data_frame
from ..resources import config as rconfig
from ..utils import Namespace as ns, as_list, lazy_property, list_all_files, memoize, path_from_split, profile, split_path
@@ -55,47 +55,99 @@ def load(self, dataset, fold=0):
else:
raise ValueError(f"Unsupported file type: {ext}")

@profile(logger=log)
def load_auxiliary_data(self, dataset, auxiliary_data, fold=0):
auxiliary_data = auxiliary_data if isinstance(auxiliary_data, ns) else ns(path=auxiliary_data)
log.debug("Loading auxiliary data %s", auxiliary_data)
paths = self._extract_auxiliary_paths(auxiliary_data.path if 'path' in auxiliary_data else auxiliary_data, fold=fold)
train_data = None
test_data = None
if 'train' in paths:
train_path = paths['train'][fold]
train_data = FileAuxData(train_path)
if 'test' in paths:
test_path = paths['test'][fold]
test_data = FileAuxData(test_path)
dataset._attach_auxiliary_data(train_data, test_data)
return dataset

def _extract_auxiliary_paths(self, auxiliary_data, fold=None):
Collaborator review comment:

looks like a copy/paste of _extract_train_test_paths with minor changes:

  • different regex pattern
  • slightly different messages
  • as it is a recursive function, internal call to _extract_auxiliary_paths instead of _extract_train_test_paths.

Given the complexity of this logic, we can't afford duplicating it.
Let's make a higher level _extract_paths function that can hold the logic for both, sth like:

def _extract_train_test_paths(self, dataset, fold=None):
    return self._extract_paths(self._extract_train_test_paths, dataset, fold)

def _extract_auxiliary_paths(self, dataset, fold=None):
    return self._extract_paths(self._extract_auxiliary_paths, dataset, fold,
                          train_suffix="train_auxiliary", test_suffix="test_auxiliary")    
    
def _extract_paths(self, extract_paths_fn, data, fold=None, train_suffix='train', test_suffix='test'):
    train_search_pat = re.compile(rf"(?:(.*)[_-]){train_suffix}(?:[_-](\d+))?\.\w+")
    ...
    return extract_paths_fn(ns(train=…)
    ...

train_suffix = 'train_auxiliary'
test_suffix = 'test_auxiliary'
if isinstance(auxiliary_data, (tuple, list)):
return self._extract_paths(ns(train=[p for p in auxiliary_data if train_suffix in p],
test=[p for p in auxiliary_data if test_suffix in p]),
fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)
elif isinstance(auxiliary_data, ns):
return dict(
train=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0]
if i == fold else None
for i, p in enumerate(as_list(auxiliary_data.train))] if 'train' in auxiliary_data else [],
test=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0]
if i == fold else None
for i, p in enumerate(as_list(auxiliary_data.test))] if 'test' in auxiliary_data else []
)
else:
return self._extract_paths(auxiliary_data, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)


def _extract_train_test_paths(self, dataset, fold=None):
train_suffix = 'train'
test_suffix = 'test'
if isinstance(dataset, (tuple, list)):
assert len(dataset) % 2 == 0, "dataset list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]."
return self._extract_train_test_paths(ns(train=[p for i, p in enumerate(dataset) if i % 2 == 0],
return self._extract_paths(ns(train=[p for i, p in enumerate(dataset) if i % 2 == 0],
test=[p for i, p in enumerate(dataset) if i % 2 == 1]),
fold=fold)
fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)
elif isinstance(dataset, ns):
return dict(train=[self._extract_train_test_paths(p)['train'][0]
return dict(train=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0]
if i == fold else None
for i, p in enumerate(as_list(dataset.train))],
test=[self._extract_train_test_paths(p)['train'][0]
test=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0]
if i == fold else None
for i, p in enumerate(as_list(dataset.test))])
else:
assert isinstance(dataset, str)
dataset = os.path.expanduser(dataset)
dataset = dataset.format(**rconfig().common_dirs)

if os.path.exists(dataset):
if os.path.isfile(dataset):
if is_archive(dataset):
arch_name, _ = os.path.splitext(os.path.basename(dataset))
return self._extract_paths(dataset, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)


def _extract_paths(self, data, fold=None, train_suffix='train', test_suffix='test'):
train_search_pat = re.compile(rf"(?:(.*)[_-]){train_suffix}(?:[_-](\d+))?\.\w+")
test_search_pat = re.compile(rf"(?:(.*)[_-]){test_suffix}(?:[_-](\d+))?\.\w+")
is_aux_data = False
if train_suffix == 'train_auxiliary' and test_suffix == 'test_auxiliary':
is_aux_data = True

assert isinstance(data, str)
data = os.path.expanduser(data)
data = data.format(**rconfig().common_dirs)

if os.path.exists(data):
if os.path.isfile(data):
# we leave the auxiliary data handling to the user
if is_archive(data) and not is_aux_data:
arch_name, _ = os.path.splitext(os.path.basename(data))
dest_folder = os.path.join(self._cache_dir, arch_name)
if not os.path.exists(dest_folder): # don't uncompress if previously done
dest_folder = unarchive_file(dataset, dest_folder)
return self._extract_train_test_paths(dest_folder)
dest_folder = unarchive_file(data, dest_folder)
return self._extract_paths(dest_folder, train_suffix=train_suffix, test_suffix=test_suffix)
else:
return dict(train=[dataset], test=[])
elif os.path.isdir(dataset):
files = list_all_files(dataset)
log.debug("Files found in dataset folder %s: %s", dataset, files)
assert len(files) > 0, f"Empty folder: {dataset}"
return dict(train=[data], test=[])
elif os.path.isdir(data):
files = list_all_files(data)
log.debug("Files found in data folder %s: %s", data, files)
assert len(files) > 0, f"Empty folder: {data}"
if len(files) == 1:
return dict(train=files, test=[])

train_matches = [m for m in [train_search_pat.search(f) for f in files] if m]
test_matches = [m for m in [test_search_pat.search(f) for f in files] if m]
# verify they're for the same dataset (just based on name)
assert train_matches and test_matches, f"Folder {dataset} must contain at least one training and one test dataset."
if not is_aux_data:
assert train_matches and test_matches, f"Folder {data} must contain at least one training and one test dataset."
else:
assert train_matches or test_matches, f"Folder {data} must contain at least one training auxiliary data or one test auxiliary data."
root_names = {m[1] for m in (train_matches+test_matches)}
assert len(root_names) == 1, f"All dataset files in {dataset} should follow the same naming: xxxxx_train_N.ext or xxxxx_test_N.ext with N starting from 0."
assert len(root_names) == 1, f"All data files in {data} should follow the same naming: xxxxx_{train_suffix}_N.ext or xxxxx_{test_suffix}_N.ext with N starting from 0."

train_no_fold = next((m[0] for m in train_matches if m[2] is None), None)
test_no_fold = next((m[0] for m in test_matches if m[2] is None), None)
@@ -107,23 +159,47 @@ def _extract_train_test_paths(self, dataset, fold=None):
while fold >= 0:
train = next((m[0] for m in train_matches if m[2] == str(fold)), None)
test = next((m[0] for m in test_matches if m[2] == str(fold)), None)
if train and test:
paths['train'].append(train)
paths['test'].append(test)
fold += 1
if not is_aux_data:
if train and test:
paths['train'].append(train)
paths['test'].append(test)
fold += 1
else:
fold = -1
else:
fold = -1
assert len(paths) > 0, f"No dataset file found in {dataset}: they should follow the naming xxxx_train.ext, xxxx_test.ext or xxxx_train_0.ext, xxxx_test_0.ext, xxxx_train_1.ext, ..."
if train:
paths['train'].append(train)
if test:
paths['test'].append(test)
if not train and not test:
fold = -1
fold += 1
assert len(paths) > 0, f"No data file found in {data}: they should follow the naming xxxx_{train_suffix}.ext, xxxx_{test_suffix}.ext or xxxx_{train_suffix}_0.ext, xxxx_{test_suffix}_0.ext, xxxx_{train_suffix}_1.ext, ..."
return paths
elif is_valid_url(dataset):
cached_file = os.path.join(self._cache_dir, os.path.basename(dataset))
elif is_valid_url(data):
cached_file = os.path.join(self._cache_dir, os.path.basename(data))
if not os.path.exists(cached_file): # don't download if previously done
handler = get_file_handler(dataset)
assert handler.exists(dataset), f"Invalid path/url: {dataset}"
handler.download(dataset, dest_path=cached_file)
return self._extract_train_test_paths(cached_file)
handler = get_file_handler(data)
assert handler.exists(data), f"Invalid path/url: {data}"
handler.download(data, dest_path=cached_file)
return self._extract_paths(cached_file, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)
else:
raise ValueError(f"Invalid dataset description: {dataset}")
raise ValueError(f"Invalid dataset description: {data}")
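
The fold lookup above is driven purely by the file-name suffix convention. A small hypothetical check of the pattern with the auxiliary suffixes (the file name is made up for illustration):

import re

train_suffix = 'train_auxiliary'
pat = re.compile(rf"(?:(.*)[_-]){train_suffix}(?:[_-](\d+))?\.\w+")

m = pat.search("/data/aux/image_train_auxiliary_0.zip")
# m[0] -> the full path, m[1] -> the root name '/data/aux/image', m[2] -> the
# fold index '0', which is what the train_matches/test_matches loop consumes.
print(m[0], m[1], m[2])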


class FileAuxData(AuxData):

def __init__(self, path):
super().__init__()
self._path = path

@property
def path(self) -> str:
return self._path

@property
def data(self) -> DF:
raise NotImplementedError


class FileDataset(Dataset):
@@ -161,6 +237,10 @@ def features(self) -> List[Feature]:
def target(self) -> Feature:
return self._get_metadata('target')

def _attach_auxiliary_data(self, train_auxiliary_data, test_auxiliary_data):
self._train._attach_auxiliary_data(train_auxiliary_data)
self._test._attach_auxiliary_data(test_auxiliary_data)

@memoize
def _get_metadata(self, prop):
meta = self._train.load_metadata()
@@ -173,6 +253,8 @@ def __init__(self, dataset: FileDataset, format: str, path: str):
super().__init__(dataset, format)
self._path = path
self._data = {format: path}
self._auxiliary_data = None


def data_path(self, format):
supported_formats = [cls.format for cls in __file_converters__]
@@ -181,6 +263,14 @@ def data_path(self, format):
raise ValueError(f"Dataset {name} is only available in one of {supported_formats} formats.")
return self._get_data(format)

@property
def has_auxiliary_data(self) -> bool:
return self._auxiliary_data is not None

@property
def auxiliary_data(self) -> AuxData:
return self._auxiliary_data

@lazy_property
def data(self):
# use codecs for unicode support: path = codecs.load(self._path, 'rb', 'utf-8')
@@ -217,6 +307,9 @@ def _set_feature_as_target(self, target: Feature):
# target.data_type = 'category'
target.is_target = True

def _attach_auxiliary_data(self, auxiliary_data):
self._auxiliary_data = auxiliary_data


class ArffDataset(FileDataset):

4 changes: 4 additions & 0 deletions frameworks/AutoGluon/__init__.py
@@ -19,6 +19,10 @@ def run(dataset: Dataset, config: TaskConfig):
),
problem_type=dataset.type.name # AutoGluon problem_type is using same names as amlb.data.DatasetType
)
if dataset.train.has_auxiliary_data:
data['train_auxiliary_data'] = dict(path=dataset.train.auxiliary_data.path)
if dataset.test.has_auxiliary_data:
data['test_auxiliary_data'] = dict(path=dataset.test.auxiliary_data.path)

return run_in_venv(__file__, "exec.py",
input_data=data, dataset=dataset, config=config)
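
How exec.py consumes these paths is not part of this hunk, so the following is only a hypothetical sketch of reading the new entries on the frameworks/AutoGluon/exec.py side, assuming the usual Namespace-style dataset argument that run_in_venv passes to exec scripts:

# Hypothetical only -- not included in this PR's visible diff.
def run(dataset, config):
    train_aux_path = dataset.train_auxiliary_data.path if 'train_auxiliary_data' in dataset else None
    test_aux_path = dataset.test_auxiliary_data.path if 'test_auxiliary_data' in dataset else None
    # ...unzip or stream the auxiliary files and feed them to AutoGluon's
    # fit/predict alongside dataset.train.path / dataset.test.path...
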
Binary file not shown.
Binary file not shown.
21 changes: 21 additions & 0 deletions tests/unit/amlb/datasets/file/test_file_dataloader.py
@@ -158,6 +158,27 @@ def test_load_regression_task_arff(file_loader):
_assert_cholesterol_features(ds, ds_def, 'arff')


@pytest.mark.use_disk
def test_load_auxiliary_data(file_loader):
ds_def = ns(
train=os.path.join(res, "kc2_train.csv"),
test=os.path.join(res, "kc2_test.csv"),
target="problems"
)
ds = file_loader.load(ds_def)
aux_def = ns(
train=os.path.join(res, "image_train.zip"),
test=os.path.join(res, "image_test.zip")
)
ds = file_loader.load_auxiliary_data(ds, aux_def)
_assert_aux_data_path(ds)


def _assert_aux_data_path(dataset):
assert dataset.train.auxiliary_data.path == os.path.join(res, "image_train.zip")
assert dataset.test.auxiliary_data.path == os.path.join(res, "image_test.zip")


def _assert_cholesterol_features(dataset, definition, fmt):
assert len(dataset.features) == 14
assert len(dataset.predictors) == 13