feat(data): working (slow) dataloaders!
- Create a FiftyOne DatasetExporter to export labels to LMDB
- Use LMDB to fetch labels for SegmentationDataset
- Add the required dependencies (cv2, lmdb, pycocotools)
- Add Hydra configurations
Dataloaders are slow and will be profiled; more breaking changes may be needed. A sketch of the intended LMDB flow follows below.
charitarthchugh committed Nov 27, 2024
1 parent 9b21e3a commit dff3404
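The exporter and dataset changes are only partially visible in this commit view, so here is a minimal sketch of the LMDB layout they imply, assuming one orjson-encoded label record per sample keyed by a sample id (keys, fields, and paths are illustrative, not the project's actual schema):

```python
import lmdb
import orjson

# Write path (exporter): one serialized label record per sample.
env = lmdb.open(".cache/data/coco-2017/train", map_size=2**34)  # ~16 GiB address space
record = {
    "filepath": "/data/coco/000000000001.jpg",
    "labels": [{"class_id": 3, "bbox": [0.1, 0.2, 0.4, 0.3]}],
}
with env.begin(write=True) as txn:
    txn.put(b"00000000", orjson.dumps(record))

# Read path (SegmentationDataset): look the record up by key and decode.
with env.begin() as txn:
    labels = orjson.loads(txn.get(b"00000000"))
env.close()
```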
Showing 9 changed files with 1,179 additions and 798 deletions.
14 changes: 14 additions & 0 deletions conf/nn/data/dataset/cityscapes.yaml
@@ -0,0 +1,14 @@
_target_: lightningsparseinst
name: 'Cityscapes'
ref: 'cityscapes'
gt_field: 'gt_coarse'
detection_field: 'polylines'
train_split_name: 'train'
validation_split_name: 'validation'
test_split_name: 'test'
classes: '^(?!out of roi$).*' # filter out the 'out of roi' class
transforms:
_target_: albumentations.Compose
transforms:
- _target_: albumentations.
- _target_:
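The `classes` value above is a regex with a negative lookahead that keeps every label except the literal `out of roi`. A quick check with Python's `re` (the actual filtering call lives in the datamodule and is not shown in this file):

```python
import re

pattern = re.compile(r"^(?!out of roi$).*")
print(bool(pattern.match("road")))         # True: kept
print(bool(pattern.match("out of roi")))   # False: filtered out
```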
21 changes: 21 additions & 0 deletions conf/nn/data/dataset/coco.yaml
@@ -0,0 +1,21 @@
_target_: lightningsparseinst.data.dataset.SegmentationDataset
name: 'COCO-2017'
ref: 'coco-2017' # also controls its cache dir
gt_field: 'ground_truth'
detection_field: 'detections'
max_detections: 100 # value used by DETR for COCO; max instances per image plus some slack
split_names:
train: train
validation: validation
test: test
transform:
_target_: albumentations.Compose
transforms:
- _target_: lightningsparseinst.utils.transforms.ResizeShortestEdge
shortest_max_size: [416, 448, 480, 512, 544, 576, 608, 640]
largest_max_size: 853
- _target_: albumentations.geometric.PadIfNeeded
min_width: 853 # pad up to largest_max_size above
min_height: 853
position: random # remove any bias in positioning
- _target_: albumentations.pytorch.ToTensorV2
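The transform block above is a nested Hydra target. A minimal sketch of how it could be materialized and applied, assuming `ResizeShortestEdge` follows the standard albumentations transform interface (it is project code not shown in this diff) and that the `_target_` paths resolve as written:

```python
import hydra
import numpy as np
from omegaconf import OmegaConf

cfg = OmegaConf.load("conf/nn/data/dataset/coco.yaml")
transform = hydra.utils.instantiate(cfg.transform)  # builds the albumentations.Compose

image = np.zeros((480, 640, 3), dtype=np.uint8)   # dummy HWC uint8 image
masks = [np.zeros((480, 640), dtype=np.uint8)]     # one dummy instance mask
out = transform(image=image, masks=masks)
print(out["image"].shape)  # CHW tensor after ToTensorV2
```

If Hydra cannot resolve `albumentations.geometric.PadIfNeeded`, the top-level `albumentations.PadIfNeeded` is the usual spelling.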
3 changes: 3 additions & 0 deletions conf/nn/data/dataset/default.yaml
@@ -0,0 +1,3 @@
defaults:
- _self_
- dataset: coco # pick one of the yamls in nn/data/
14 changes: 7 additions & 7 deletions conf/nn/data/default.yaml
@@ -1,14 +1,14 @@
_target_: lightningsparseinst.data.datamodule.MyDataModule
val_images_fixed_idxs: [7371, 3963, 2861, 1701, 3172, 1749, 7023, 1606, 6481, 1377, 6003, 3593, 3410, 3399, 7277, 5337, 968, 8206, 288, 1968, 5677, 9156, 8139, 7660, 7089, 1893, 3845, 2084, 1944, 3375, 4848, 8704, 6038, 2183, 7422, 2682, 6878, 6127, 2941, 5823, 9129, 1798, 6477, 9264, 476, 3007, 4992, 1428, 9901, 5388]
_target_: lightningsparseinst.data.datamodule.DataModule
accelerator: ${train.trainer.accelerator}
num_workers:
train: 4
val: 2
test: 0
test: 2
batch_size:
train: 512
val: 128
test: 16
train: 32
val: 8
test: 8
cache_dir: '.cache/data/' # either from Project Root or absolute path
defaults:
- _self_
- dataset: vision/mnist # pick one of the yamls in nn/data/
- dataset: coco # pick one of the yamls in nn/data/
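For context, a minimal sketch of how this DataModule config might be consumed from a Hydra entry point, assuming the config tree exposes it as `cfg.nn.data` (the exact path depends on the project's root config, which is not part of this diff):

```python
import hydra
import omegaconf


@hydra.main(config_path="conf", config_name="default", version_base="1.1")
def main(cfg: omegaconf.DictConfig) -> None:
    # _recursive_=False keeps `dataset` as a DictConfig; DataModule.setup()
    # instantiates the transform and the SegmentationDatasets on its own.
    datamodule = hydra.utils.instantiate(cfg.nn.data, _recursive_=False)
    datamodule.setup(stage="fit")
    batch = next(iter(datamodule.train_dataloader()))


if __name__ == "__main__":
    main()
```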
1,529 changes: 879 additions & 650 deletions poetry.lock

Large diffs are not rendered by default.

13 changes: 9 additions & 4 deletions pyproject.toml
@@ -5,10 +5,10 @@ testpaths = ["tests"]

[tool.coverage.report]
exclude_lines = [
"raise NotImplementedError",
"raise NotImplementedError()",
"pragma: nocover",
"if __name__ == .__main__.:",
"raise NotImplementedError",
"raise NotImplementedError()",
"pragma: nocover",
"if __name__ == .__main__.:",
]
[tool.mypy]
files = ["src/**/*.py", "test/*.py"]
@@ -48,6 +48,7 @@ description = "SparseInst (CVPR 2022) in PyTorch Lightning"
authors = ["Charitarth Chugh <[email protected]>"]
license = "MIT"
readme = "README.md"
package-mode = true

[tool.poetry.dependencies]
python = "^3.11, <3.13"
@@ -60,6 +61,10 @@ torchmetrics = "^1.5.1"
omegaconf = "^2.3.0"
fiftyone = "^1.0.1"
albumentations = "^1.4.21"
opencv-python = "^4.10.0.84"
orjson = "^3.10.12"
lmdb = "^1.5.1"
pycocotools = "^2.0.8"


[tool.poetry.group.dev.dependencies]
82 changes: 50 additions & 32 deletions src/lightningsparseinst/data/datamodule.py
@@ -18,6 +18,7 @@
from nn_core.nn_types import Split

from lightningsparseinst.data.dataset import SegmentationDataset
from lightningsparseinst.utils.fiftyone_io import LMDBDetectionDatasetExporter

pylogger = logging.getLogger(__name__)

@@ -108,25 +109,32 @@ def __init__(
dataset: DictConfig,
num_workers: DictConfig,
batch_size: DictConfig,
split_names: DictConfig,
accelerator: str,
cache_dir: str,
# example
):
super().__init__()
self.dataset = dataset
self.num_workers = num_workers
self.batch_size = batch_size
self.split_names = split_names
# https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#gpus
self.pin_memory: bool = accelerator is not None and str(accelerator) == "gpu"

self.fiftyone_dataset: Optional[fo.Dataset] = None
self.classes: Optional[List[str] | str] = None
self.fiftyone_dataset: fo.Dataset = None
self.classes: List[str] | str = None
self.split_names: Mapping[str, str] = None

self.train_dataset: Optional[Dataset] = None
self.val_dataset: Optional[Dataset] = None
self.test_dataset: Optional[Dataset] = None
self.transform: Optional[Compose] = None
self.train_dataset: Dataset = None
self.val_dataset: Dataset = None
self.test_dataset: Dataset = None
self.transform: Compose = None

cache_dir = Path(cache_dir)
if not cache_dir.is_absolute():
cache_dir = PROJECT_ROOT / cache_dir

cache_dir.mkdir(exist_ok=True, parents=True)
self.cache_dir = cache_dir

@cached_property
def metadata(self) -> MetaData:
@@ -141,7 +149,7 @@ def metadata(self) -> MetaData:
if self.train_dataset is None:
self.setup(stage="fit")

return MetaData(class_vocab=self.train_dataset.labels_map_rev)
return MetaData(class_vocab=self.train_dataset.label_map_rv)

def prepare_data(self) -> None:
# download only\
@@ -151,6 +159,8 @@ def setup(self, stage: Optional[str] = None):
self.fiftyone_dataset = fo.load_dataset(self.dataset.ref)
self.fiftyone_dataset.compute_metadata()

self.split_names = self.dataset.split_names

self.transform = hydra.utils.instantiate(self.dataset.transform)
# Label filtering logic
self.classes = self.dataset.classes if "classes" in self.dataset.keys() else None
@@ -168,34 +178,27 @@ def setup(self, stage: Optional[str] = None):
self.classes = self.fiftyone_dataset.distinct(
f"{self.dataset.gt_field}.{self.dataset.detection_field}.label"
)
# self.hf_datasets = hydra.utils.instantiate(self.dataset)
# self.hf_datasets.set_transform(self.transform)
#
# # Here you should instantiate your dataset, you may also split the train into train and validation if needed.

label_map_rv = {cls: idx for idx, cls in enumerate(self.classes)}
dataset_cache = self.cache_dir / self.dataset.ref
dataset_cache.mkdir(exist_ok=True, parents=True)

if (stage is None or stage == "fit") and (self.train_dataset is None and self.val_dataset is None):
self._handle_lmdb_caching(dataset_cache, self.split_names["train"], label_map_rv)
self.train_dataset = SegmentationDataset(
self.fiftyone_dataset,
split=self.split_names["train"],
gt_field=self.dataset.gt_field,
detection_field=self.dataset.detection_field,
dataset_cache / self.split_names["train"],
transform=self.transform,
max_num_instances_per_image=self.dataset.max_num_instances_per_image,
max_detections=self.dataset.max_detections,
)
self._handle_lmdb_caching(dataset_cache, self.split_names["validation"], label_map_rv)
self.val_dataset = SegmentationDataset(
self.fiftyone_dataset,
split=self.split_names["validation"],
gt_field=self.dataset.gt_field,
detection_field=self.dataset.detection_field,
max_num_instances_per_image=self.dataset.max_num_instances_per_image,
dataset_cache / self.split_names["validation"], max_detections=self.dataset.max_detections
)
#
if stage is None or stage == "test":
self._handle_lmdb_caching(dataset_cache, self.split_names["test"], label_map_rv)
self.test_dataset = SegmentationDataset(
self.fiftyone_dataset,
split=self.split_names["test"],
gt_field=self.dataset.gt_field,
detection_field=self.dataset.detection_field,
max_num_instances_per_image=self.dataset.max_num_instances_per_image,
dataset_cache / self.split_names["test"], max_detections=self.dataset.max_detections
)
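SegmentationDataset now receives the per-split LMDB directory (plus the transform and max_detections) instead of FiftyOne field names. Its implementation is in the part of the diff that is not rendered here, so the following is only an illustrative stand-in for the LMDB-backed label fetch, under the same record layout assumed above:

```python
import cv2
import lmdb
import orjson
from torch.utils.data import Dataset


class LMDBSegmentationDatasetSketch(Dataset):
    """Illustrative stand-in only; the real SegmentationDataset lives in
    lightningsparseinst.data.dataset and may differ."""

    def __init__(self, lmdb_dir, transform=None, max_detections=100):
        self.env = lmdb.open(str(lmdb_dir), readonly=True, lock=False, readahead=False)
        with self.env.begin() as txn:
            self.keys = [key for key, _ in txn.cursor()]  # one key per exported sample
        self.transform = transform
        self.max_detections = max_detections

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
        with self.env.begin() as txn:
            record = orjson.loads(txn.get(self.keys[idx]))
        image = cv2.cvtColor(cv2.imread(record["filepath"]), cv2.COLOR_BGR2RGB)
        target = record["labels"][: self.max_detections]
        if self.transform is not None:
            image = self.transform(image=image)["image"]
        return image, target
```

If the slowness mentioned in the commit message comes from sharing one LMDB environment across DataLoader workers, opening the environment lazily inside each worker process is a common remedy.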

def train_dataloader(self) -> DataLoader:
@@ -204,17 +207,17 @@ def train_dataloader(self) -> DataLoader:
shuffle=True,
batch_size=self.batch_size.train,
num_workers=self.num_workers.train,
pin_memory=self.pin_memory,
# pin_memory=self.pin_memory,
collate_fn=partial(collate_fn, split="train", metadata=self.metadata),
)

def val_dataloader(self) -> DataLoader:
return DataLoader(
self.val_dataset,
shuffle=False,
batch_size=self.batch_size.val,
num_workers=self.num_workers.val,
pin_memory=self.pin_memory,
# pin_memory=self.pin_memory,
collate_fn=partial(collate_fn, split="val", metadata=self.metadata),
)

@@ -224,13 +227,28 @@ def test_dataloader(self) -> DataLoader:
shuffle=False,
batch_size=self.batch_size.test,
num_workers=self.num_workers.test,
pin_memory=self.pin_memory,
# pin_memory=self.pin_memory,
collate_fn=partial(collate_fn, split="test", metadata=self.metadata),
)

def __repr__(self) -> str:
return f"{self.__class__.__name__}(" f"{self.dataset=}, " f"{self.num_workers=}, " f"{self.batch_size=})"

def _handle_lmdb_caching(self, dataset_cache_path, split_name, label_map_rv) -> None:
split_cache_dir = dataset_cache_path / split_name

mdb_files = list(split_cache_dir.glob("*.mdb")) if split_cache_dir.exists() else None
if mdb_files:
pylogger.info(f"Found lmdb files in {split_name} cache directory: {mdb_files}")
else:
pylogger.info(f"Exporting lmdb files to {split_name} cache directory: {split_cache_dir}")
dataset_split = self.fiftyone_dataset.match_tags(split_name)
dataset_split.export(
dataset_exporter=LMDBDetectionDatasetExporter(
export_dir=split_cache_dir, gt_field=self.dataset.gt_field, label_map_rv=label_map_rv
)
)
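_handle_lmdb_caching builds the per-split LMDB cache through LMDBDetectionDatasetExporter, which is imported from lightningsparseinst.utils.fiftyone_io but not rendered in this view. A rough sketch of such an exporter on top of FiftyOne's GenericSampleDatasetExporter interface, with the same constructor arguments as the call above (everything inside the class body is an assumption):

```python
import lmdb
import orjson
import fiftyone.utils.data as foud


class LMDBDetectionExporterSketch(foud.GenericSampleDatasetExporter):
    """Illustrative only; the real LMDBDetectionDatasetExporter lives in
    lightningsparseinst.utils.fiftyone_io and may differ."""

    def __init__(self, export_dir, gt_field, label_map_rv):
        super().__init__(export_dir=str(export_dir))
        self.gt_field = gt_field
        self.label_map_rv = label_map_rv
        self._env = None
        self._idx = 0

    def setup(self):
        self._env = lmdb.open(self.export_dir, map_size=2**36)  # generous address space

    def export_sample(self, sample):
        detections = sample[self.gt_field].detections  # assumes a Detections label field
        record = {
            "filepath": sample.filepath,
            "labels": [
                {"class_id": self.label_map_rv[det.label], "bbox": det.bounding_box}
                for det in detections
            ],
        }
        with self._env.begin(write=True) as txn:
            txn.put(f"{self._idx:08d}".encode(), orjson.dumps(record))
        self._idx += 1

    def close(self, *args):
        if self._env is not None:
            self._env.close()
```

Usage mirrors the call in _handle_lmdb_caching: `dataset_split.export(dataset_exporter=...)`. The real exporter presumably also serializes instance masks or polygons, which is where the new pycocotools dependency (e.g. RLE encoding) would come in.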


@hydra.main(config_path=str(PROJECT_ROOT / "conf"), config_name="default", version_base="1.1")
def main(cfg: omegaconf.DictConfig) -> None:
