Skip to content

Commit

Permalink
added back extra root for numpy files
Browse files Browse the repository at this point in the history
  • Loading branch information
clemsgrs committed Jun 14, 2024
1 parent 263adc9 commit a118e31
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 12 deletions.
15 changes: 8 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ Then, follow these simple steps:
```shell
tar -chf downstream_dataset.tar /path/to/downstream/dataset/image/folder
```
4. Infer the auxiliary files `query_entries.npy` and `query_file_indices.npy`:
```
Expand Down Expand Up @@ -143,19 +143,20 @@ Then run:
```shell
python -m torch.distributed.run --nproc_per_node=gpu dinov2/train/train.py \
--config-file dinov2/configs/train/vitl14.yaml \
train.dataset_path=Pathology:root={path/to/data/root}:subset={subset}
train.dataset_path=Pathology:root={path/to/tarball/root}:extra={path/to/entry/root}:subset={subset}
```
Replace `{path/to/data/root}` with the folder you chose for `--output_root` in data preparation (e.g. `Pathology:root=/root/data`).<br>
Replace `{path/to/data/root}` with the root folder where tarballs are saved, and `{path/to/entry/root}` with the root folder where numpy entry files are saved (e.g. `Pathology:root=/root/data:extra=/root/data`).<br>
Leave out `:subset={subset}` if you didn't restrict the dataset to a specific subset when preparing data.<br>
Otherwise, replace `{subset}` with the suffix you chose for `--suffix` in data preparation (e.g. `Pathology:root=/root/data:subset=train`).
Otherwise, replace `{subset}` with the suffix you chose for `--suffix` in data preparation (e.g. `Pathology:root=/root/data:extra=/root/data:subset=train`).

In case you want to run downstream tuning, make sure to update the following two parameters in your config:

```shell
tune:
query_dataset_path: KNN:root={path/to/data/root}:split=query
test_dataset_path: KNN:root={path/to/data/root}:split=test
query_dataset_path: KNN:root={path/to/data/root}:extra={path/to/entry/root}:split=query
test_dataset_path: KNN:root={path/to/data/root}:extra={path/to/entry/root}:split=test
```

Replace `{path/to/data/root}` with the folder where you dumped the downstream `.tar` file and `.npy` files during data preparation.
Replace `{path/to/data/root}` with the folder where you dumped the downstream `.tar` files.
Replace `{path/to/entry/root}` with the folder where you dumped the downstream `.npy` entry files.
4 changes: 2 additions & 2 deletions README_foundation.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ Update `dinov2/configs/train/vitl14.yaml` if you want to change some parameters,
```shell
python -m torch.distributed.run --nproc_per_node=gpu dinov2/train/train.py \
--config-file dinov2/configs/train/vitl14.yaml \
train.dataset_path=PathologyFoundation:root={path/to/data/root}
train.dataset_path=PathologyFoundation:root={path/to/data/root}:extra={path/to/entry/root}
```
Replace `{path/to/data/root}` with the folder you chose for `--output_root` in data preparation (e.g. `PathologyFoundation:root=/root/data`).
Replace `{path/to/data/root}` with the root folder where tarballs are saved, and `{path/to/entry/root}` with the root folder where numpy entry files are saved (e.g. `PathologyFoundation:root=/root/data:extra=/root/data`).<br>
6 changes: 4 additions & 2 deletions dinov2/data/datasets/foundation.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,15 @@ def __init__(
self,
*,
root: str,
extra: str,
subset: Optional["PathologyFoundationDataset.Subset"] = None,
transforms: Optional[Callable] = None,
transform: Optional[Callable] = None,
target_transform: Optional[Callable] = None,
mmap_cache_size: int = _DEFAULT_MMAP_CACHE_SIZE,
) -> None:
super().__init__(root, transforms, transform, target_transform)
self.extra = extra
self._subset = subset
self._get_entries()
self._get_cohort_names()
Expand All @@ -70,14 +72,14 @@ def _get_entries(self) -> np.ndarray:
self._entries = self._load_entries(self._entries_name)

def _load_entries(self, _entries_name: str) -> np.ndarray:
entries_path = Path(self.root, _entries_name)
entries_path = Path(self.extra, _entries_name)
return np.load(entries_path, mmap_mode="r")

def _get_cohort_names(self) -> dict:
self._cohort_names = self._load_cohort_names()

def _load_cohort_names(self) -> dict:
cohort_dict_path = Path(self.root, "cohort_indices.npy")
cohort_dict_path = Path(self.extra, "cohort_indices.npy")
return np.load(cohort_dict_path, allow_pickle=True).item()

def get_image_data(self, index: int) -> bytes:
Expand Down
4 changes: 3 additions & 1 deletion dinov2/data/datasets/pathology.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@ def __init__(
self,
*,
root: str,
extra: str,
subset: Optional["PathologyDataset.Subset"] = None,
transforms: Optional[Callable] = None,
transform: Optional[Callable] = None,
target_transform: Optional[Callable] = None,
) -> None:
super().__init__(root, transforms, transform, target_transform)
self.extra = extra
self._subset = subset
self._get_entries()
self._mmap_tarball = _make_mmap_tarball(Path(root, "pretrain_dataset.tar"))
Expand All @@ -51,7 +53,7 @@ def _get_entries(self) -> np.ndarray:
self._entries = self._load_entries(self._entries_name)

def _load_entries(self, _entries_name: str) -> np.ndarray:
entries_path = Path(self.root, _entries_name)
entries_path = Path(self.extra, _entries_name)
return np.load(entries_path, mmap_mode="r")

def get_image_data(self, index: int) -> bytes:
Expand Down

0 comments on commit a118e31

Please sign in to comment.