Adding capability of taking auxiliary data #436

Open
wants to merge 10 commits into base: master
3 changes: 3 additions & 0 deletions amlb/benchmark.py
@@ -489,6 +489,9 @@ def load_data(self):
else:
raise ValueError("Tasks should have one property among [openml_task_id, openml_dataset_id, dataset].")

if hasattr(self._task_def, 'auxiliary_data'):
self._dataset = Benchmark.data_loader.load_auxiliary_data(DataSourceType.file, dataset=self._dataset, auxiliary_data=self._task_def.auxiliary_data, fold=self.fold)

def as_job(self):
job = Job(name=rconfig().token_separator.join([
'local',
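
The new branch in load_data() only triggers when the task definition exposes an auxiliary_data attribute. The exact definition schema isn't spelled out in this diff, so the following is a minimal hand-built sketch (names and paths are illustrative only; real definitions normally come from the benchmark configuration files and are parsed into the same Namespace structure):

from amlb.utils import Namespace as ns

# Hypothetical task definition mirroring the resources used in the unit test below.
task_def = ns(
    name="kc2_with_images",
    dataset=ns(train="~/data/kc2_train.csv",
               test="~/data/kc2_test.csv",
               target="problems"),
    auxiliary_data=ns(train="~/data/image_train.zip",
                      test="~/data/image_test.zip"),
)

# hasattr(task_def, 'auxiliary_data') is True, so load_data() would call
# Benchmark.data_loader.load_auxiliary_data(...) and attach the zip paths to
# the dataset's train and test splits.
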
30 changes: 30 additions & 0 deletions amlb/data.py
@@ -84,6 +84,28 @@ def __repr__(self):
return repr_def(self)


class AuxData(ABC):

def __init__(self):
super().__init__()

@property
def path(self) -> str:
pass

@property
@abstractmethod
def data(self) -> DF:
"""
:return: the auxiliary data as a pandas DataFrame.
"""
pass

@profile(logger=log)
def release(self, properties=None):
clear_cache(self, properties)


class Datasplit(ABC):

def __init__(self, dataset, format):
@@ -98,6 +120,14 @@ def __init__(self, dataset, format):
def path(self) -> str:
return self.data_path(self.format)

@property
def has_auxiliary_data(self) -> bool:
pass

@property
def auxiliary_data(self) -> AuxData:
pass

@abstractmethod
def data_path(self, format: str) -> str:
"""
6 changes: 6 additions & 0 deletions amlb/datasets/__init__.py
@@ -24,5 +24,11 @@ def load(self, source: DataSourceType, *args, **kwargs):
else:
raise NotImplementedError(f"data source {source} is not supported yet")

def load_auxiliary_data(self, source: DataSourceType, *args, **kwargs):
if source == DataSourceType.file:
return self.file_loader.load_auxiliary_data(*args, **kwargs)
else:
raise NotImplementedError(f"data source {source} is not supported yet")


__all__ = ["DataLoader", "DataSourceType"]
163 changes: 128 additions & 35 deletions amlb/datasets/file.py
@@ -10,7 +10,7 @@
import pandas as pd
import pandas.api.types as pat

from ..data import Dataset, DatasetType, Datasplit, Feature
from ..data import AuxData, Dataset, DatasetType, Datasplit, Feature, DF
from ..datautils import read_csv, to_data_frame
from ..resources import config as rconfig
from ..utils import Namespace as ns, as_list, lazy_property, list_all_files, memoize, path_from_split, profile, split_path
@@ -55,47 +55,99 @@ def load(self, dataset, fold=0):
else:
raise ValueError(f"Unsupported file type: {ext}")

@profile(logger=log)
def load_auxiliary_data(self, dataset, auxiliary_data, fold=0):
auxiliary_data = auxiliary_data if isinstance(auxiliary_data, ns) else ns(path=auxiliary_data)
log.debug("Loading auxiliary data %s", auxiliary_data)
paths = self._extract_auxiliary_paths(auxiliary_data.path if 'path' in auxiliary_data else auxiliary_data, fold=fold)
train_data = None
test_data = None
if 'train' in paths:
train_path = paths['train'][fold]
train_data = FileAuxData(train_path)
if 'test' in paths:
test_path = paths['test'][fold]
test_data = FileAuxData(test_path)
dataset._attach_auxiliary_data(train_data, test_data)
return dataset

def _extract_auxiliary_paths(self, auxiliary_data, fold=None):
Collaborator review comment:

looks like a copy/paste of _extract_train_test_paths with minor changes:

  • different regex pattern
  • slightly different messages
  • as it is a recursive function, internal call to _extract_auxiliary_paths instead of _extract_train_test_paths.

Given the complexity of this logic, we can't afford duplicating it.
Let's make a higher level _extract_paths function that can hold the logic for both, sth like:

def _extract_train_test_paths(self, dataset, fold=None):
    return self._extract_paths(self._extract_train_test_paths, dataset, fold)

def _extract_auxiliary_paths(self, dataset, fold=None):
    return self._extract_paths(self._extract_auxiliary_paths, dataset, fold,
                          train_suffix="train_auxiliary", test_suffix="test_auxiliary")    
    
def _extract_paths(self, extract_paths_fn, data, fold=None, train_suffix='train', test_suffix='test'):
    train_search_pat = re.compile(rf"(?:(.*)[_-]){train_suffix}(?:[_-](\d+))?\.\w+")
    ...
    return extract_paths_fn(ns(train=…)
    ...

train_suffix = 'train_auxiliary'
test_suffix = 'test_auxiliary'
if isinstance(auxiliary_data, (tuple, list)):
return self._extract_paths(ns(train=[p for p in auxiliary_data if train_suffix in p],
test=[p for p in auxiliary_data if test_suffix in p]),
fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)
elif isinstance(auxiliary_data, ns):
return dict(
train=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0]
if i == fold else None
for i, p in enumerate(as_list(auxiliary_data.train))] if 'train' in auxiliary_data else [],
test=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0]
if i == fold else None
for i, p in enumerate(as_list(auxiliary_data.test))] if 'test' in auxiliary_data else []
)
else:
return self._extract_paths(auxiliary_data, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)


def _extract_train_test_paths(self, dataset, fold=None):
train_suffix = 'train'
test_suffix = 'test'
if isinstance(dataset, (tuple, list)):
assert len(dataset) % 2 == 0, "dataset list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]."
return self._extract_train_test_paths(ns(train=[p for i, p in enumerate(dataset) if i % 2 == 0],
return self._extract_paths(ns(train=[p for i, p in enumerate(dataset) if i % 2 == 0],
test=[p for i, p in enumerate(dataset) if i % 2 == 1]),
fold=fold)
fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)
elif isinstance(dataset, ns):
return dict(train=[self._extract_train_test_paths(p)['train'][0]
return dict(train=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0]
if i == fold else None
for i, p in enumerate(as_list(dataset.train))],
test=[self._extract_train_test_paths(p)['train'][0]
test=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0]
if i == fold else None
for i, p in enumerate(as_list(dataset.test))])
else:
assert isinstance(dataset, str)
dataset = os.path.expanduser(dataset)
dataset = dataset.format(**rconfig().common_dirs)

if os.path.exists(dataset):
if os.path.isfile(dataset):
if is_archive(dataset):
arch_name, _ = os.path.splitext(os.path.basename(dataset))
return self._extract_paths(dataset, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)


def _extract_paths(self, data, fold=None, train_suffix='train', test_suffix='test'):
train_search_pat = re.compile(rf"(?:(.*)[_-]){train_suffix}(?:[_-](\d+))?\.\w+")
test_search_pat = re.compile(rf"(?:(.*)[_-]){test_suffix}(?:[_-](\d+))?\.\w+")
is_aux_data = False
if train_suffix == 'train_auxiliary' and test_suffix == 'test_auxiliary':
is_aux_data = True

assert isinstance(data, str)
data = os.path.expanduser(data)
data = data.format(**rconfig().common_dirs)

if os.path.exists(data):
if os.path.isfile(data):
# we leave the auxiliary data handling to the user
if is_archive(data) and not is_aux_data:
arch_name, _ = os.path.splitext(os.path.basename(data))
dest_folder = os.path.join(self._cache_dir, arch_name)
if not os.path.exists(dest_folder): # don't uncompress if previously done
dest_folder = unarchive_file(dataset, dest_folder)
return self._extract_train_test_paths(dest_folder)
dest_folder = unarchive_file(data, dest_folder)
return self._extract_paths(dest_folder, train_suffix=train_suffix, test_suffix=test_suffix)
else:
return dict(train=[dataset], test=[])
elif os.path.isdir(dataset):
files = list_all_files(dataset)
log.debug("Files found in dataset folder %s: %s", dataset, files)
assert len(files) > 0, f"Empty folder: {dataset}"
return dict(train=[data], test=[])
elif os.path.isdir(data):
files = list_all_files(data)
log.debug("Files found in data folder %s: %s", data, files)
assert len(files) > 0, f"Empty folder: {data}"
if len(files) == 1:
return dict(train=files, test=[])

train_matches = [m for m in [train_search_pat.search(f) for f in files] if m]
test_matches = [m for m in [test_search_pat.search(f) for f in files] if m]
# verify they're for the same dataset (just based on name)
assert train_matches and test_matches, f"Folder {dataset} must contain at least one training and one test dataset."
if not is_aux_data:
assert train_matches and test_matches, f"Folder {data} must contain at least one training and one test dataset."
else:
assert train_matches or test_matches, f"Folder {data} must contain at least one training auxiliary data or one test auxiliary data."
root_names = {m[1] for m in (train_matches+test_matches)}
assert len(root_names) == 1, f"All dataset files in {dataset} should follow the same naming: xxxxx_train_N.ext or xxxxx_test_N.ext with N starting from 0."
assert len(root_names) == 1, f"All data files in {data} should follow the same naming: xxxxx_{train_suffix}_N.ext or xxxxx_{test_suffix}_N.ext with N starting from 0."

train_no_fold = next((m[0] for m in train_matches if m[2] is None), None)
test_no_fold = next((m[0] for m in test_matches if m[2] is None), None)
@@ -107,23 +159,47 @@ def _extract_train_test_paths(self, dataset, fold=None):
while fold >= 0:
train = next((m[0] for m in train_matches if m[2] == str(fold)), None)
test = next((m[0] for m in test_matches if m[2] == str(fold)), None)
if train and test:
paths['train'].append(train)
paths['test'].append(test)
fold += 1
if not is_aux_data:
if train and test:
paths['train'].append(train)
paths['test'].append(test)
fold += 1
else:
fold = -1
else:
fold = -1
assert len(paths) > 0, f"No dataset file found in {dataset}: they should follow the naming xxxx_train.ext, xxxx_test.ext or xxxx_train_0.ext, xxxx_test_0.ext, xxxx_train_1.ext, ..."
if train:
paths['train'].append(train)
if test:
paths['test'].append(test)
if not train and not test:
fold = -1
fold += 1
assert len(paths) > 0, f"No data file found in {data}: they should follow the naming xxxx_{train_suffix}.ext, xxxx_{test_suffix}.ext or xxxx_{train_suffix}_0.ext, xxxx_{test_suffix}_0.ext, xxxx_{train_suffix}_1.ext, ..."
return paths
elif is_valid_url(dataset):
cached_file = os.path.join(self._cache_dir, os.path.basename(dataset))
elif is_valid_url(data):
cached_file = os.path.join(self._cache_dir, os.path.basename(data))
if not os.path.exists(cached_file): # don't download if previously done
handler = get_file_handler(dataset)
assert handler.exists(dataset), f"Invalid path/url: {dataset}"
handler.download(dataset, dest_path=cached_file)
return self._extract_train_test_paths(cached_file)
handler = get_file_handler(data)
assert handler.exists(data), f"Invalid path/url: {data}"
handler.download(data, dest_path=cached_file)
return self._extract_paths(cached_file, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)
else:
raise ValueError(f"Invalid dataset description: {dataset}")
raise ValueError(f"Invalid dataset description: {data}")
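
The fold lookup above is driven purely by the file-name suffix convention. A small hypothetical check of the pattern with the auxiliary suffixes (the file name is made up for illustration):

import re

train_suffix = 'train_auxiliary'
pat = re.compile(rf"(?:(.*)[_-]){train_suffix}(?:[_-](\d+))?\.\w+")

m = pat.search("/data/aux/image_train_auxiliary_0.zip")
# m[0] -> the full path, m[1] -> the root name '/data/aux/image', m[2] -> the
# fold index '0', which is what the train_matches/test_matches loop consumes.
print(m[0], m[1], m[2])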


class FileAuxData(AuxData):

def __init__(self, path):
super().__init__()
self._path = path

@property
def path(self) -> str:
return self._path

@property
def data(self) -> DF:
raise NotImplementedError


class FileDataset(Dataset):
@@ -161,6 +237,10 @@ def features(self) -> List[Feature]:
def target(self) -> Feature:
return self._get_metadata('target')

def _attach_auxiliary_data(self, train_auxiliary_data, test_auxiliary_data):
self._train._attach_auxiliary_data(train_auxiliary_data)
self._test._attach_auxiliary_data(test_auxiliary_data)

@memoize
def _get_metadata(self, prop):
meta = self._train.load_metadata()
@@ -173,6 +253,8 @@ def __init__(self, dataset: FileDataset, format: str, path: str):
super().__init__(dataset, format)
self._path = path
self._data = {format: path}
self._auxiliary_data = None


def data_path(self, format):
supported_formats = [cls.format for cls in __file_converters__]
@@ -181,6 +263,14 @@ def data_path(self, format):
raise ValueError(f"Dataset {name} is only available in one of {supported_formats} formats.")
return self._get_data(format)

@property
def has_auxiliary_data(self) -> bool:
return self._auxiliary_data is not None

@property
def auxiliary_data(self) -> AuxData:
return self._auxiliary_data

@lazy_property
def data(self):
# use codecs for unicode support: path = codecs.load(self._path, 'rb', 'utf-8')
@@ -217,6 +307,9 @@ def _set_feature_as_target(self, target: Feature):
# target.data_type = 'category'
target.is_target = True

def _attach_auxiliary_data(self, auxiliary_data):
self._auxiliary_data = auxiliary_data


class ArffDataset(FileDataset):

4 changes: 4 additions & 0 deletions frameworks/AutoGluon/__init__.py
@@ -19,6 +19,10 @@ def run(dataset: Dataset, config: TaskConfig):
),
problem_type=dataset.type.name # AutoGluon problem_type is using same names as amlb.data.DatasetType
)
if dataset.train.has_auxiliary_data:
data['train_auxiliary_data'] = dict(path=dataset.train.auxiliary_data.path)
if dataset.test.has_auxiliary_data:
data['test_auxiliary_data'] = dict(path=dataset.test.auxiliary_data.path)

return run_in_venv(__file__, "exec.py",
input_data=data, dataset=dataset, config=config)
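
How exec.py consumes these paths is not part of this hunk, so the following is only a hypothetical sketch of reading the new entries on the frameworks/AutoGluon/exec.py side, assuming the usual Namespace-style dataset argument that run_in_venv passes to exec scripts:

# Hypothetical only -- not included in this PR's visible diff.
def run(dataset, config):
    train_aux_path = dataset.train_auxiliary_data.path if 'train_auxiliary_data' in dataset else None
    test_aux_path = dataset.test_auxiliary_data.path if 'test_auxiliary_data' in dataset else None
    # ...unzip or stream the auxiliary files and feed them to AutoGluon's
    # fit/predict alongside dataset.train.path / dataset.test.path...
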
Binary file not shown.
Binary file not shown.
21 changes: 21 additions & 0 deletions tests/unit/amlb/datasets/file/test_file_dataloader.py
@@ -158,6 +158,27 @@ def test_load_regression_task_arff(file_loader):
_assert_cholesterol_features(ds, ds_def, 'arff')


@pytest.mark.use_disk
def test_load_auxiliary_data(file_loader):
ds_def = ns(
train=os.path.join(res, "kc2_train.csv"),
test=os.path.join(res, "kc2_test.csv"),
target="problems"
)
ds = file_loader.load(ds_def)
aux_def = ns(
train=os.path.join(res, "image_train.zip"),
test=os.path.join(res, "image_test.zip")
)
ds = file_loader.load_auxiliary_data(ds, aux_def)
_assert_aux_data_path(ds)


def _assert_aux_data_path(dataset):
assert dataset.train.auxiliary_data.path == os.path.join(res, "image_train.zip")
assert dataset.test.auxiliary_data.path == os.path.join(res, "image_test.zip")


def _assert_cholesterol_features(dataset, definition, fmt):
assert len(dataset.features) == 14
assert len(dataset.predictors) == 13