Skip to content

Commit

Permalink
added back extra root for numpy files
Browse files Browse the repository at this point in the history
  • Loading branch information
clemsgrs committed Jun 14, 2024
1 parent 263adc9 commit a118e31
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 12 deletions.
15 changes: 8 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ Then, follow these simple steps:
```shell
tar -chf downstream_dataset.tar /path/to/downstream/dataset/image/folder
```
4. Infer the auxiliary files `query_entries.npy` and `query_file_indices.npy`:
```
Expand Down Expand Up @@ -143,19 +143,20 @@ Then run:
```shell
python -m torch.distributed.run --nproc_per_node=gpu dinov2/train/train.py \
--config-file dinov2/configs/train/vitl14.yaml \
train.dataset_path=Pathology:root={path/to/data/root}:subset={subset}
train.dataset_path=Pathology:root={path/to/tarball/root}:extra={path/to/entry/root}:subset={subset}
```
Replace `{path/to/data/root}` with the folder you chose for `--output_root` in data preparation (e.g. `Pathology:root=/root/data`).<br>
Replace `{path/to/data/root}` with the root folder where tarballs are saved, and `{path/to/entry/root}` with the root folder where numpy entry files are saved (e.g. `Pathology:root=/root/data:extra=/root/data`).<br>
Leave out `:subset={subset}` if you didn't restrict the dataset to a specific subset when preparing data.<br>
Otherwise, replace `{subset}` with the suffix you chose for `--suffix` in data preparation (e.g. `Pathology:root=/root/data:subset=train`).
Otherwise, replace `{subset}` with the suffix you chose for `--suffix` in data preparation (e.g. `Pathology:root=/root/data:extra=/root/data:subset=train`).

In case you want to run downstream tuning, make sure to update the following two parameters in your config:

```shell
tune:
query_dataset_path: KNN:root={path/to/data/root}:split=query
test_dataset_path: KNN:root={path/to/data/root}:split=test
query_dataset_path: KNN:root={path/to/data/root}:extra={path/to/entry/root}:split=query
test_dataset_path: KNN:root={path/to/data/root}:extra={path/to/entry/root}:split=test
```

Replace `{path/to/data/root}` with the folder where you dumped the downstream `.tar` file and `.npy` files during data preparation.
Replace `{path/to/data/root}` with the folder where you dumped the downstream `.tar` files.
Replace `{path/to/entry/root}` with the folder where you dumped the downstream `.npy` entry files.
4 changes: 2 additions & 2 deletions README_foundation.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ Update `dinov2/configs/train/vitl14.yaml` if you want to change some parameters,
```shell
python -m torch.distributed.run --nproc_per_node=gpu dinov2/train/train.py \
--config-file dinov2/configs/train/vitl14.yaml \
train.dataset_path=PathologyFoundation:root={path/to/data/root}
train.dataset_path=PathologyFoundation:root={path/to/data/root}:extra={path/to/entry/root}
```
Replace `{path/to/data/root}` with the folder you chose for `--output_root` in data preparation (e.g. `PathologyFoundation:root=/root/data`).
Replace `{path/to/data/root}` with the root folder where tarballs are saved, and `{path/to/entry/root}` with the root folder where numpy entry files are saved (e.g. `PathologyFoundation:root=/root/data:extra=/root/data`).<br>
6 changes: 4 additions & 2 deletions dinov2/data/datasets/foundation.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,15 @@ def __init__(
self,
*,
root: str,
extra: str,
subset: Optional["PathologyFoundationDataset.Subset"] = None,
transforms: Optional[Callable] = None,
transform: Optional[Callable] = None,
target_transform: Optional[Callable] = None,
mmap_cache_size: int = _DEFAULT_MMAP_CACHE_SIZE,
) -> None:
super().__init__(root, transforms, transform, target_transform)
self.extra = extra
self._subset = subset
self._get_entries()
self._get_cohort_names()
Expand All @@ -70,14 +72,14 @@ def _get_entries(self) -> np.ndarray:
self._entries = self._load_entries(self._entries_name)

def _load_entries(self, _entries_name: str) -> np.ndarray:
entries_path = Path(self.root, _entries_name)
entries_path = Path(self.extra, _entries_name)
return np.load(entries_path, mmap_mode="r")

def _get_cohort_names(self) -> dict:
self._cohort_names = self._load_cohort_names()

def _load_cohort_names(self) -> dict:
cohort_dict_path = Path(self.root, "cohort_indices.npy")
cohort_dict_path = Path(self.extra, "cohort_indices.npy")
return np.load(cohort_dict_path, allow_pickle=True).item()

def get_image_data(self, index: int) -> bytes:
Expand Down
4 changes: 3 additions & 1 deletion dinov2/data/datasets/pathology.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@ def __init__(
self,
*,
root: str,
extra: str,
subset: Optional["PathologyDataset.Subset"] = None,
transforms: Optional[Callable] = None,
transform: Optional[Callable] = None,
target_transform: Optional[Callable] = None,
) -> None:
super().__init__(root, transforms, transform, target_transform)
self.extra = extra
self._subset = subset
self._get_entries()
self._mmap_tarball = _make_mmap_tarball(Path(root, "pretrain_dataset.tar"))
Expand All @@ -51,7 +53,7 @@ def _get_entries(self) -> np.ndarray:
self._entries = self._load_entries(self._entries_name)

def _load_entries(self, _entries_name: str) -> np.ndarray:
entries_path = Path(self.root, _entries_name)
entries_path = Path(self.extra, _entries_name)
return np.load(entries_path, mmap_mode="r")

def get_image_data(self, index: int) -> bytes:
Expand Down

0 comments on commit a118e31

Please sign in to comment.