DagsHub
diff --git a/‎.github/workflows/linters-and-test.yml
+32 b/‎.github/workflows/linters-and-test.yml
+32
diff --git a/‎README.md
+17-29 b/‎README.md
+17-29
diff --git a/‎dagshub_annotation_converter/cli.py
-2 b/‎dagshub_annotation_converter/cli.py
-2
diff --git a/‎dagshub_annotation_converter/image/__init__.py ‎dagshub_annotation_converter/converters/__init__.py b/‎dagshub_annotation_converter/image/__init__.py ‎dagshub_annotation_converter/converters/__init__.py
diff --git a/‎dagshub_annotation_converter/converters/common.py
+16 b/‎dagshub_annotation_converter/converters/common.py
+16
diff --git a/‎dagshub_annotation_converter/converters/cvat.py
+49 b/‎dagshub_annotation_converter/converters/cvat.py
+49
diff --git a/‎dagshub_annotation_converter/converters/yolo.py
+176 b/‎dagshub_annotation_converter/converters/yolo.py
+176
diff --git a/‎dagshub_annotation_converter/schema/__init__.py ‎dagshub_annotation_converter/formats/__init__.py b/‎dagshub_annotation_converter/schema/__init__.py ‎dagshub_annotation_converter/formats/__init__.py
diff --git a/‎dagshub_annotation_converter/formats/common.py
+21 b/‎dagshub_annotation_converter/formats/common.py
+21
diff --git a/‎dagshub_annotation_converter/formats/cvat/__init__.py
+18 b/‎dagshub_annotation_converter/formats/cvat/__init__.py
+18
@@ -0,0 +1,32 @@
+name: Lint and Test
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  run-linters:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: chartboost/ruff-action@v1
+
+  run-tests:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.8, 3.9, "3.10", 3.11, 3.12]
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+            python-version: ${{ matrix.python-version }}
+      - name: Install Hatch
+        run: pipx install hatch
+      - name: Run tests
+        run: hatch test
@@ -6,38 +6,26 @@ between different annotation formats.
 This package is currently in development and does not have many features implemented yet.
 The API is not stable and is subject to change.
 
-Support Matrix for image annotations
+The package consists of the Intermediary Representation (IR) annotation format in Python Objects,
+and importers/exporters for different annotation formats.
 
-| Export > \/ Import V              | YOLO v5+ BBox | YOLO v5+ Segmentation | Yolo Poses | COCO | DagsHub Datasource (Label Studio) | Label Studio | CVAT Image |
-|-----------------------------------|---------------|-----------------------|------------|------|-----------------------------------|--------------|------------|
-| YOLO v5+ BBox                     | -             |                       |            |      | ✅                                 |              |            |
-| YOLO v5+ Segmentation             |               | -                     |            |      | ✅                                 |              |            |
-| YOLO Poses                        |               |                       | -          |      | ✅                                 |              |            |
-| COCO                              |               |                       |            | -    |                                   |              |            |
-| DagsHub Datasource (Label Studio) | ✅             | ✅                     | ✅          |      | -                                 |              |            |
-| Label Studio                      |               |                       |            |      |                                   | -            |            |
-| CVAT Image                        |               |                       |            |      | ✅                                 |              | -          |
+## Installation
 
-Example usage, importing annotations from [COCO_1K](https://dagshub.com/Dean/COCO_1K) and uploading it into a DagsHub Datasource:
+```bash
+pip install dagshub-annotation-converter
+```
 
+## Importers (Image):
+- [YOLO BBox, Segmentation, Poses](dagshub_annotation_converter/converters/yolo.py#L81)
+- [Label Studio](dagshub_annotation_converter/formats/label_studio/task.py#L72) (only the task schema is implemented; importing from a project is left up to the user):
 ```python
-from dagshub_annotation_converter.image.importers import YoloImporter
-from dagshub_annotation_converter.image.exporters import DagshubDatasourceExporter
-
-from dagshub.data_engine.datasources import get_datasource
-
-# Assuming that the current worker directory is the root of the repo and images are stored in "data" folder
-importer = YoloImporter(
-    data_dir="data",                 # Where the images are stored
-    annotation_type="segmentation",  # or bbox for bounding boxes
-    meta_file="custom_coco.yaml"     # file with the classes
-)
+from dagshub_annotation_converter.formats.label_studio.task import LabelStudioTask
+task_obj = LabelStudioTask.from_json("path/to/label_studio_task.json")
 
-proj = importer.parse()
-
-exporter = DagshubDatasourceExporter(
-    datasource=get_datasource("<user>/<repo>", "<my datasource>"),
-    annotation_field="exported_yolo_annotations"
-)
-exporter.export(proj)
+annotations = task_obj.to_ir_annotations()
 ```
+- [CVAT Image](dagshub_annotation_converter/converters/cvat.py#L46)
+
+## Exporters (Image):
+- [YOLO BBox, Segmentation, Poses](dagshub_annotation_converter/converters/yolo.py#L126)
+- [Label Studio](dagshub_annotation_converter/formats/label_studio/task.py#L225) (again, only the task schema is implemented; uploading the task to a project is left to the user)
@@ -0,0 +1,16 @@
+from typing import Sequence, Mapping, List, Dict
+
+from dagshub_annotation_converter.ir.image import IRImageAnnotationBase
+
+
+def group_annotations_by_filename(
+    annotations: Sequence[IRImageAnnotationBase],
+) -> Mapping[str, Sequence[IRImageAnnotationBase]]:
+    res: Dict[str, List[IRImageAnnotationBase]] = {}
+    for ann in annotations:
+        if ann.filename is None:
+            raise ValueError(f"An annotation {ann} doesn't have a filename associated, aborting")
+        if ann.filename not in res:
+            res[ann.filename] = []
+        res[ann.filename].append(ann)
+    return res
@@ -0,0 +1,49 @@
+import logging
+from os import PathLike
+from typing import Sequence, List, Dict
+from zipfile import ZipFile
+
+import lxml.etree
+
+from dagshub_annotation_converter.formats.cvat import annotation_parsers
+from dagshub_annotation_converter.formats.cvat.context import parse_image_tag
+from dagshub_annotation_converter.ir.image import IRImageAnnotationBase
+
+
+logger = logging.getLogger(__name__)
+
+
+def parse_image_annotations(img: lxml.etree.ElementBase) -> Sequence[IRImageAnnotationBase]:
+    annotations: List[IRImageAnnotationBase] = []
+    for annotation_elem in img:
+        annotation_type = annotation_elem.tag
+        if annotation_type not in annotation_parsers:
+            logger.warning(f"Unknown CVAT annotation type {annotation_type}")
+            continue
+        annotations.append(annotation_parsers[annotation_type](annotation_elem, img))
+
+    return annotations
+
+
+def load_cvat_from_xml_string(
+    xml_text: bytes,
+) -> Dict[str, Sequence[IRImageAnnotationBase]]:
+    annotations = {}
+    root_elem = lxml.etree.XML(xml_text)
+
+    for image_node in root_elem.xpath("//image"):
+        image_info = parse_image_tag(image_node)
+        annotations[image_info.name] = parse_image_annotations(image_node)
+
+    return annotations
+
+
+def load_cvat_from_xml_file(xml_file: PathLike) -> Dict[str, Sequence[IRImageAnnotationBase]]:
+    with open(xml_file, "rb") as f:
+        return load_cvat_from_xml_string(f.read())
+
+
+def load_cvat_from_zip(zip_path: PathLike) -> Dict[str, Sequence[IRImageAnnotationBase]]:
+    with ZipFile(zip_path) as proj_zip:
+        with proj_zip.open("annotations.xml") as f:
+            return load_cvat_from_xml_string(f.read())
@@ -0,0 +1,176 @@
+import logging
+import os
+from pathlib import Path
+from typing import Union, Sequence, List, Optional, Dict, Tuple
+
+import PIL.Image
+
+from dagshub_annotation_converter.converters.common import group_annotations_by_filename
+from dagshub_annotation_converter.formats.yolo import (
+    export_lookup,
+    allowed_annotation_types,
+    YoloContext,
+    import_lookup,
+    YoloAnnotationTypes,
+)
+from dagshub_annotation_converter.ir.image import IRImageAnnotationBase
+from dagshub_annotation_converter.util import is_image, replace_folder
+
+logger = logging.getLogger(__name__)
+
+
+def load_yolo_from_fs_with_context(
+    context: YoloContext,
+    import_dir: Union[str, Path] = ".",
+) -> Dict[str, Sequence[IRImageAnnotationBase]]:
+    assert context.path is not None
+
+    annotations: Dict[str, Sequence[IRImageAnnotationBase]] = {}
+
+    import_dir_path = Path(import_dir)
+
+    if context.path.is_absolute():
+        data_dir_path = context.path
+    else:
+        data_dir_path = import_dir_path / context.path
+
+    for dirpath, subdirs, files in os.walk(data_dir_path):
+        if context.image_dir_name not in dirpath.split("/"):
+            logger.debug(f"{dirpath} is not an image dir, skipping")
+            continue
+        for filename in files:
+            fullpath = os.path.join(dirpath, filename)
+            img = Path(fullpath)
+            relpath = img.relative_to(data_dir_path)
+            if not is_image(img):
+                logger.debug(f"Skipping {img} because it's not an image")
+                continue
+            annotation = replace_folder(img, context.image_dir_name, context.label_dir_name, context.label_extension)
+            if annotation is None:
+                logger.warning(f"Couldn't generate annotation file path for image file [{img}]")
+                continue
+            if not annotation.exists():
+                logger.warning(f"Couldn't find annotation file [{annotation}] for image file [{img}]")
+                continue
+            annotations[str(relpath)] = parse_annotation(context, data_dir_path, img, annotation)
+
+    return annotations
+
+
+def parse_annotation(
+    context: YoloContext, base_path: Path, img_path: Path, annotation_path: Path
+) -> Sequence[IRImageAnnotationBase]:
+    img = PIL.Image.open(img_path)
+    img_width, img_height = img.size
+
+    annotation_strings = annotation_path.read_text().strip().split("\n")
+
+    assert context.annotation_type is not None
+
+    convert_func = import_lookup[context.annotation_type]
+
+    res: List[IRImageAnnotationBase] = []
+    rel_path = str(img_path.relative_to(base_path))
+
+    for ann in annotation_strings:
+        res.append(convert_func(ann, context, img_width, img_height, img).with_filename(rel_path))
+
+    return res
+
+
+def load_yolo_from_fs(
+    annotation_type: YoloAnnotationTypes,
+    meta_file: Union[str, Path] = "annotations.yaml",
+    image_dir_name: str = "images",
+    label_dir_name: str = "labels",
+) -> Tuple[Dict[str, Sequence[IRImageAnnotationBase]], YoloContext]:
+    meta_file_path = Path(meta_file).absolute()
+    context = YoloContext.from_yaml_file(meta_file, annotation_type=annotation_type)
+    context.image_dir_name = image_dir_name
+    context.label_dir_name = label_dir_name
+    context.annotation_type = annotation_type
+
+    return load_yolo_from_fs_with_context(context, import_dir=meta_file_path.parent), context
+
+
+# ======== Annotation Export ======== #
+
+
+def annotations_to_string(annotations: Sequence[IRImageAnnotationBase], context: YoloContext) -> Optional[str]:
+    """
+    Serializes multiple YOLO annotations into the contents of the annotations file.
+    Also makes sure that only annotations of the correct type for context.annotation_type are serialized.
+
+    :param annotations: Annotations to serialize (should be single file)
+    :param context: Exporting context
+    :return: String of the content of the file
+    """
+    filtered_annotations = [
+        ann for ann in annotations if isinstance(ann, allowed_annotation_types[context.annotation_type])
+    ]
+
+    if len(filtered_annotations) != len(annotations):
+        logger.warning(
+            f"{annotations[0].filename} has {len(annotations) - len(filtered_annotations)} "
+            f"annotations of the wrong type that won't be exported"
+        )
+
+    if len(filtered_annotations) == 0:
+        return None
+
+    export_fn = export_lookup[context.annotation_type]
+
+    return "\n".join([export_fn(ann, context) for ann in filtered_annotations])
+
+
+def export_to_fs(
+    context: YoloContext,
+    annotations: List[IRImageAnnotationBase],
+    export_dir: Union[str, Path] = ".",
+    meta_file="yolo_dagshub.yaml",
+) -> Path:
+    """
+    Exports annotations to YOLO format.
+
+    This function exports them in a way that allows you to train with YOLO right away,
+    as long as the images have already been copied to the data folder.
+
+    :param context: Context for exporting. Set the ``path`` attribute to specify the directory with the data,
+        otherwise exports a ``data`` folder in the current working directory.
+    :param annotations: Annotations to export
+    :param export_dir: Directory to export to. If not specified, exports to the current working directory.
+    :param meta_file: Name of the YAML file of the YOLO dataset definition.
+        This file will be written to the parent directory of the data path.
+
+    :return: Path to the YAML file with the exported data
+    """
+    if context.path is None:
+        print(f"`YoloContext.path` was not set. Exporting to {os.path.join(os.getcwd(), 'data')}")
+        context.path = Path("data")
+
+    grouped_annotations = group_annotations_by_filename(annotations)
+
+    export_path = Path(export_dir)
+
+    for filename, anns in grouped_annotations.items():
+        annotation_filepath = replace_folder(
+            Path(filename), context.image_dir_name, context.label_dir_name, context.label_extension
+        )
+        if annotation_filepath is None:
+            logger.warning(f"Couldn't generate annotation file path for image file [{filename}]")
+            continue
+        annotation_filename = export_path / context.path / annotation_filepath
+        annotation_filename.parent.mkdir(parents=True, exist_ok=True)
+        annotation_content = annotations_to_string(anns, context)
+        if annotation_content is not None:
+            with open(annotation_filename, "w") as f:
+                f.write(annotation_content)
+
+    # TODO: test/val splitting
+    yaml_file_path = export_path / meta_file
+    with open(yaml_file_path, "w") as yaml_f:
+        yaml_f.write(context.get_yaml_content())
+
+    logger.warning(f"Saved annotations to {context.path}\nand .YAML file at {yaml_file_path}")
+
+    return yaml_file_path.absolute()
@@ -0,0 +1,21 @@
+from pathlib import Path
+from typing import Union, Optional, Tuple
+
+import PIL.Image
+
+ImageType = Union[str, Path, PIL.Image.Image]
+
+
+def determine_image_dimensions(
+    image_width: Optional[int] = None,
+    image_height: Optional[int] = None,
+    image: Optional[ImageType] = None,
+) -> Tuple[int, int]:
+    if image_width is not None and image_height is not None:
+        return image_width, image_height
+    if image is None:
+        raise ValueError("Either image or image_width and image_height should be provided")
+
+    if not isinstance(image, PIL.Image.Image):
+        image = PIL.Image.open(image)
+    return image.size
@@ -0,0 +1,18 @@
+from typing import Callable, Dict
+
+from lxml.etree import ElementBase
+
+from .box import parse_box
+from .polygon import parse_polygon
+from .points import parse_points
+from .skeleton import parse_skeleton
+from dagshub_annotation_converter.ir.image import IRImageAnnotationBase
+
+# Signature shared by the CVAT annotation parsers: takes two XML elements and returns one IR annotation.
+# The lookup below is called as parser(annotation element, image element that contains it).
+CVATParserFunction = Callable[[ElementBase, ElementBase], IRImageAnnotationBase]
+
+# Parser lookup, keyed by the tag of the annotation element
+annotation_parsers: Dict[str, CVATParserFunction] = {
+    "box": parse_box,
+    "polygon": parse_polygon,
+    "points": parse_points,
+    "skeleton": parse_skeleton,
+}