From 493441da5aec76f5a65e03063c82e649d02b7e20 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 20 Nov 2024 11:18:12 +0100
Subject: [PATCH 1/5] adapt to nodelib 0.12

---
 trainer/app_code/model_files.py           |  17 +--
 trainer/app_code/tests/test_yolov5.py     | 136 ++++++++++++++--------
 trainer/app_code/tests/test_yolov5_cla.py |  76 +++++++-----
 trainer/app_code/yolov5_format.py         |  33 +++---
 trainer/app_code/yolov5_trainer.py        |  20 ++--
 5 files changed, 176 insertions(+), 106 deletions(-)

diff --git a/trainer/app_code/model_files.py b/trainer/app_code/model_files.py
index a31f77a..90947f0 100644
--- a/trainer/app_code/model_files.py
+++ b/trainer/app_code/model_files.py
@@ -22,19 +22,22 @@ def get_all_weightfiles(training_path: Path) -> List[Path]:
     return weightfiles
 
 
-def _epoch_from_weightfile(weightfile: Path) -> int:
-    number = weightfile.name[5:-3]
-    if number == '':
+def epoch_from_weightfile(weightfile: Path) -> int:
+    """Return the epoch number encoded in a weightfile name like 'epoch12.pt'.
+
+    Names without a parsable number (e.g. 'best.pt') yield 0."""
+    try:
+        return int(weightfile.name[5:-3])
+    except ValueError:
         return 0
-    return int(number)
 
 
 def delete_older_epochs(training_path: Path, weightfile: Path):
     all_weightfiles = get_all_weightfiles(training_path)
 
-    target_epoch = _epoch_from_weightfile(weightfile)
+    target_epoch = epoch_from_weightfile(weightfile)
     for f in all_weightfiles:
-        if _epoch_from_weightfile(f) < target_epoch:
+        if epoch_from_weightfile(f) < target_epoch:
             _try_remove(f)
             delete_json_for_weightfile(f)
 
@@ -53,6 +56,6 @@ def _try_remove(file: Path):
 def get_new(training_path: Path) -> Union[Path, None]:
     all_weightfiles = get_all_weightfiles(training_path)
     if all_weightfiles:
-        all_weightfiles.sort(key=_epoch_from_weightfile)
+        all_weightfiles.sort(key=epoch_from_weightfile)
         return all_weightfiles[-1]
     return None
diff --git a/trainer/app_code/tests/test_yolov5.py b/trainer/app_code/tests/test_yolov5.py
index 943d8ee..7ada274 100644
--- a/trainer/app_code/tests/test_yolov5.py
+++ b/trainer/app_code/tests/test_yolov5.py
@@ -5,13 +5,14 @@
 import os
 import shutil
 from pathlib import Path
-from typing import Dict
+from typing import Dict, List, Tuple
 from uuid import uuid4
 
 import pytest
-from learning_loop_node.data_classes import (Category, Context, Hyperparameter,
-                                             Training, TrainingData)
+from learning_loop_node.data_classes import (Category, Context, TrainerState,
+                                             Training)
 from learning_loop_node.data_exchanger import DataExchanger
+from learning_loop_node.helpers.misc import create_image_folder
 from learning_loop_node.loop_communication import LoopCommunicator
 from learning_loop_node.trainer.downloader import TrainingsDownloader
 from learning_loop_node.trainer.executor import Executor
@@ -37,15 +38,22 @@ class TestWithLoop:
     """This test environment sets up the environment vars and
     a test project in the loop which is used for testing."""
 
-    async def test_training_creates_model(self, use_training_dir, data_exchanger: DataExchanger, glc: LoopCommunicator):
+    @pytest.mark.usefixtures('use_training_dir')
+    async def test_training_creates_model(self, data_exchanger: DataExchanger, glc: LoopCommunicator):
         """Test if training creates a model"""
+
+        project_folder = os.getcwd()
+        images_folder = create_image_folder(project_folder)
+        categories, image_data = await download_training_data(images_folder, data_exchanger, glc)
         training = Training(id=str(uuid4()),
-                            project_folder=os.getcwd(),
-                            training_folder=os.getcwd() + '/training',
-                            images_folder=os.getcwd() + '/images',
-                            base_model_uuid_or_name='model.pt',
-                            context=Context(project='pytest_yolo5det', organization='zauberzeug'))
-        training.data = await create_training_data(training, data_exchanger, glc)
+                            project_folder=project_folder,
+                            training_folder=project_folder + '/training',
+                            images_folder=images_folder,
+                            model_variant='',
+                            context=Context(project='pytest_yolo5det', organization='zauberzeug'),
+                            categories=categories, hyperparameters={}, training_number=1,
+                            training_state=TrainerState.Initialized.value,
+                            image_data=image_data)
         yolov5_format.create_file_structure(training)
         executor = Executor(os.getcwd())
         # from https://github.com/WongKinYiu/yolor#training
@@ -59,19 +67,25 @@ async def test_training_creates_model(self, use_training_dir, data_exchanger: Da
         best = training.training_folder + '/result/weights/best.pt'
         assert os.path.isfile(best)
 
-    async def test_parse_progress_from_log(self, use_training_dir, data_exchanger: DataExchanger, glc: LoopCommunicator):
+    @pytest.mark.usefixtures('use_training_dir')
+    async def test_parse_progress_from_log(self, data_exchanger: DataExchanger, glc: LoopCommunicator):
         """Test if progress is parsed correctly from log"""
         trainer = Yolov5TrainerLogic()
         trainer.epochs = 2
+        project_folder = os.getcwd()
+        images_folder = create_image_folder(project_folder)
+        categories, image_data = await download_training_data(images_folder, data_exchanger, glc)
         trainer._training = Training(
             id=str(uuid4()),
-            project_folder=os.getcwd(),
-            training_folder=os.getcwd() + '/training',
-            images_folder=os.getcwd() + '/images',
-            base_model_uuid_or_name='model.pt',
+            project_folder=project_folder,
+            training_folder=project_folder + '/training',
+            images_folder=images_folder,
+            model_variant='',
             context=Context(project='pytest_yolo5det', organization='zauberzeug'),
+            categories=categories, hyperparameters={}, training_number=1,
+            training_state=TrainerState.Initialized.value,
+            image_data=image_data,
         )
-        trainer.training.data = await create_training_data(trainer.training, data_exchanger, glc)
         yolov5_format.create_file_structure(trainer.training)
 
         trainer._executor = Executor(os.getcwd())
@@ -93,16 +107,21 @@ async def test_parse_progress_from_log(self, use_training_dir, data_exchanger: D
 @pytest.mark.environment(organization='', project='', mode='DETECTION')
 class TestWithDetection:
 
-    async def test_create_file_structure_box_size(self, use_training_dir):
+    @pytest.mark.usefixtures('use_training_dir')
+    async def test_create_file_structure_box_size(self):
         categories = [Category(name='point_category_1', id='uuid_of_class_1'),
                       Category(name='point_category_2', id='uuid_of_class_2', point_size=30)]
         image_data = [{'set': 'train',  'id': 'image_1', 'width': 100, 'height': 100, 'box_annotations': [],
                        'point_annotations': [{'category_id': 'uuid_of_class_1', 'x': 50, 'y': 60},
                                              {'category_id': 'uuid_of_class_2', 'x': 60, 'y': 70}]}]
         trainer = Yolov5TrainerLogic()
-        trainer._training = Training(id='someid', context=Context(organization='o', project='p'),
-                                     project_folder='./', images_folder='./', training_folder='./')
-        trainer.training.data = TrainingData(image_data=image_data, categories=categories)
+        trainer._training = Training(
+            id='someid', context=Context(organization='o', project='p'),
+            project_folder='./', images_folder='./', training_folder='./',
+            image_data=image_data, categories=categories, hyperparameters={},
+            model_variant='', training_number=1,
+            training_state=TrainerState.Initialized.value)
+
         yolov5_format.create_file_structure(trainer.training)
 
         with open('./train/image_1.txt', 'r') as f:
@@ -111,13 +130,17 @@ async def test_create_file_structure_box_size(self, use_training_dir):
         assert '0 0.500000 0.600000 0.200000 0.200000' in lines[0]
         assert '1 0.600000 0.700000 0.300000 0.300000' in lines[1]
 
-    async def test_new_model_discovery(self, use_training_dir):
+    @pytest.mark.usefixtures('use_training_dir')
+    async def test_new_model_discovery(self):
         """This test also triggers the creation of a wts file"""
         trainer = Yolov5TrainerLogic()
-        trainer._training = Training(id='someid', context=Context(organization='o', project='p'),
-                                     project_folder='./',  images_folder='./', training_folder='./')
-        trainer.training.data = TrainingData(image_data=[], categories=[
-            Category(name='class_a', id='uuid_of_class_a', type='box')])
+        trainer._training = Training(
+            id='someid', context=Context(organization='o', project='p'),
+            project_folder='./', images_folder='./', training_folder='./', image_data=[],
+            categories=[Category(name='class_a', id='uuid_of_class_a', type='box')],
+            hyperparameters={}, model_variant='', training_number=1,
+            training_state=TrainerState.Initialized.value)
+
         assert trainer._get_new_best_training_state() is None, 'should not find any models'
 
         model_path = 'result/weights/published/latest.pt'
@@ -152,12 +175,15 @@ async def test_new_model_discovery(self, use_training_dir):
         #     'yolov5_pytorch': ['/tmp/model.pt', '/tmp/test_training/hyp.yaml'],
         #     'yolov5_wts': ['/tmp/model.wts']}
 
-    def test_newest_model_is_used(self, use_training_dir):
+    @pytest.mark.usefixtures('use_training_dir')
+    def test_newest_model_is_used(self):
         trainer = Yolov5TrainerLogic()
-        trainer._training = Training(id='someid', context=Context(organization='o', project='p'),
-                                     project_folder='./', images_folder='./', training_folder='./')
-        trainer.training.data = TrainingData(image_data=[], categories=[
-            Category(name='class_a', id='uuid_of_class_a', type='box')])
+        trainer._training = Training(
+            id='someid', context=Context(organization='o', project='p'),
+            project_folder='./', images_folder='./', training_folder='./', image_data=[],
+            categories=[Category(name='class_a', id='uuid_of_class_a', type='box')],
+            hyperparameters={}, model_variant='', training_number=1,
+            training_state=TrainerState.Initialized.value)
 
         # create some models.
         mock_epoch(10, {})
@@ -168,12 +194,16 @@ def test_newest_model_is_used(self, use_training_dir):
         assert 'epoch10.pt' not in new_model.meta_information['weightfile']
         assert 'epoch200.pt' in new_model.meta_information['weightfile']
 
-    def test_old_model_files_are_deleted_on_publish(self, use_training_dir):
+    @pytest.mark.usefixtures('use_training_dir')
+    def test_old_model_files_are_deleted_on_publish(self):
         trainer = Yolov5TrainerLogic()
-        trainer._training = Training(id='someid', context=Context(organization='o', project='p'),
-                                     project_folder='./', images_folder='./', training_folder='./')
-        trainer.training.data = TrainingData(image_data=[], categories=[
-            Category(name='class_a', id='uuid_of_class_a', type='box')])
+        trainer._training = Training(
+            id='someid', context=Context(organization='o', project='p'),
+            project_folder='./', images_folder='./', training_folder='./', image_data=[],
+            categories=[Category(name='class_a', id='uuid_of_class_a', type='box')],
+            hyperparameters={}, model_variant='', training_number=1,
+            training_state=TrainerState.Initialized.value)
+
         assert trainer._get_new_best_training_state() is None, 'should not find any models'
 
         mock_epoch(1, {'class_a': {'fp': 0, 'tp': 1, 'fn': 0}})
@@ -196,12 +226,15 @@ def test_old_model_files_are_deleted_on_publish(self, use_training_dir):
         _, _, files = next(os.walk("result/weights"))
         assert len(files) == 0
 
-    def test_newer_model_files_are_kept_during_deleting(self, use_training_dir):
+    @pytest.mark.usefixtures('use_training_dir')
+    def test_newer_model_files_are_kept_during_deleting(self):
         trainer = Yolov5TrainerLogic()
-        trainer._training = Training(id='someid', context=Context(organization='o', project='p'),
-                                     project_folder='./',  images_folder='./', training_folder='./')
-        trainer.training.data = TrainingData(image_data=[], categories=[
-            Category(name='class_a', id='uuid_of_class_a', type='box')])
+        trainer._training = Training(
+            id='someid', context=Context(organization='o', project='p'),
+            project_folder='./', images_folder='./', training_folder='./', image_data=[],
+            categories=[Category(name='class_a', id='uuid_of_class_a', type='box')],
+            hyperparameters={}, model_variant='', training_number=1,
+            training_state=TrainerState.Initialized.value)
 
         # create some models.
         mock_epoch(10, {})
@@ -217,10 +250,13 @@ def test_newer_model_files_are_kept_during_deleting(self, use_training_dir):
         assert len(all_model_files) == 1
         assert 'epoch201.pt' in str(all_model_files[0]), 'Epoch201 is not yed synced. It should not be deleted.'
 
-    async def test_clear_training_data(self, use_training_dir):
+    @pytest.mark.usefixtures('use_training_dir')
+    async def test_clear_training_data(self):
         trainer = Yolov5TrainerLogic()
         trainer._training = Training(id='someid', context=Context(organization='o', project='p'),
-                                     project_folder='./', images_folder='./', training_folder='./')
+                                     project_folder='./', images_folder='./', training_folder='./',
+                                     categories=[], hyperparameters={}, model_variant='',
+                                     image_data=[], training_number=1, training_state=TrainerState.Initialized.value)
         os.makedirs(f'{trainer.training.training_folder}/result/weights/', exist_ok=True)
         os.makedirs(f'{trainer.training.training_folder}/result/weights/published/', exist_ok=True)
 
@@ -255,7 +291,9 @@ def assert_yaml_content(yaml_path, **kwargs):
             assert content[key] == value
 
     shutil.copy('app_code/tests/test_data/hyp.yaml', '/tmp')
-    hyperparameter = Hyperparameter(resolution=600, flip_rl=True, flip_ud=True)
+    hyperparameter = {'resolution': 600,
+                      'flip_rl': True,
+                      'flip_ud': True}
 
     assert_yaml_content('/tmp/hyp.yaml', fliplr=0, flipud=0)
     update_hyps('/tmp/hyp.yaml', hyperparameter)
@@ -266,19 +304,17 @@ def assert_yaml_content(yaml_path, **kwargs):
 # =======================================================================================================================
 
 
-async def create_training_data(training: Training, data_exchanger: DataExchanger, glc: LoopCommunicator) -> TrainingData:
-    training_data = TrainingData()
+async def download_training_data(images_folder: str, data_exchanger: DataExchanger, glc: LoopCommunicator
+                                 ) -> Tuple[List[Category], List[Dict]]:
 
-    image_data, _ = await TrainingsDownloader(data_exchanger).download_training_data(training.images_folder)
-    logging.info(f'got {len(image_data)} images')
+    image_data, _ = await TrainingsDownloader(data_exchanger).download_training_data(images_folder)
 
     response = await glc.get(f"/{os.environ['LOOP_ORGANIZATION']}/projects/{os.environ['LOOP_PROJECT']}/data")
     assert response.status_code != 401, 'Authentification error - did you set LOOP_USERNAME and LOOP_PASSWORD in your environment?'
     assert response.status_code == 200
     data = response.json()
-    training_data.categories = Category.from_list(data['categories'])
-    training_data.image_data = image_data
-    return training_data
+    categories = Category.from_list(data['categories'])
+    return categories, image_data
 
 
 def mock_epoch(number: int, confusion_matrix: Dict):
diff --git a/trainer/app_code/tests/test_yolov5_cla.py b/trainer/app_code/tests/test_yolov5_cla.py
index 5aa2114..dd519da 100644
--- a/trainer/app_code/tests/test_yolov5_cla.py
+++ b/trainer/app_code/tests/test_yolov5_cla.py
@@ -4,14 +4,15 @@
 import logging
 import os
 from pathlib import Path
-from typing import Dict
+from typing import Dict, List, Tuple
 from uuid import uuid4
 
 import pytest
 from learning_loop_node.data_classes import (Category, CategoryType, Context,
-                                             ModelInformation, Training,
-                                             TrainingData)
+                                             ModelInformation, TrainerState,
+                                             Training)
 from learning_loop_node.data_exchanger import DataExchanger
+from learning_loop_node.helpers.misc import create_image_folder
 from learning_loop_node.loop_communication import LoopCommunicator
 from learning_loop_node.trainer.downloader import TrainingsDownloader
 from learning_loop_node.trainer.executor import Executor
@@ -36,13 +37,20 @@ class TestWithLoop:
     async def test_cla_training_creates_model(
             self, use_training_dir, data_exchanger: DataExchanger, glc: LoopCommunicator):
         """Training should create a model file (best.pt)"""
+
+        project_folder = os.getcwd()
+        categories, image_data = await download_training_data(create_image_folder(project_folder), data_exchanger, glc)
+
         training = Training(id=str(uuid4()),
-                            project_folder=os.getcwd(),
+                            project_folder=project_folder,
                             training_folder=os.getcwd() + '/training', images_folder=os.getcwd() + '/images',
                             model_uuid_for_detecting='model.pt',
                             context=Context(project=os.environ['LOOP_PROJECT'],
-                                            organization=os.environ['LOOP_ORGANIZATION']))
-        training.data = await download_training_data(training, data_exchanger, glc)
+                                            organization=os.environ['LOOP_ORGANIZATION']),
+                            categories=categories, hyperparameters={}, model_variant='',
+                            image_data=image_data,
+                            training_state=TrainerState.Initialized.value,
+                            training_number=1)
 
         yolov5_format.create_file_structure_cla(training)
         logging.info(training.training_folder)  # /tmp/test_training/
@@ -69,13 +77,20 @@ async def test_cla_parse_progress_from_log(
         trainer.epochs = 3  # NOTE: must correspond to the value set in test_data/hyp_cla.yaml
         if os.path.isfile('/tmp/model.pt'):
             os.remove('/tmp/model.pt')
+
+        project_folder = os.getcwd()
+        categories, image_data = await download_training_data(create_image_folder(project_folder), data_exchanger, glc)
+
         trainer._training = Training(id=str(uuid4()),
-                                     project_folder=os.getcwd(),
+                                     project_folder=project_folder,
                                      training_folder=os.getcwd() + '/training',
                                      images_folder=os.getcwd() + '/images',
                                      model_uuid_for_detecting='model.pt',
-                                     context=Context(project='demo', organization='zauberzeug'))
-        trainer.training.data = await download_training_data(trainer.training, data_exchanger, glc)
+                                     context=Context(project='demo', organization='zauberzeug'),
+                                     categories=categories, hyperparameters={}, model_variant='',
+                                     image_data=image_data,
+                                     training_state=TrainerState.Initialized.value,
+                                     training_number=1)
         yolov5_format.create_file_structure_cla(trainer.training)
 
         await asyncio.sleep(1)
@@ -95,9 +110,11 @@ async def test_cla_new_model_discovery(self, use_training_dir):
         """The trainer should find new models"""
         trainer = Yolov5TrainerLogic()
         trainer._training = Training(id='someid', context=Context(organization='o', project='p'),
-                                     project_folder='./',  images_folder='./', training_folder='./')
-        trainer.training.data = TrainingData(image_data=[], categories=[
-            Category(name='class_a', id='uuid_of_class_a', type='classification')])
+                                     project_folder='./', images_folder='./', training_folder='./',
+                                     hyperparameters={}, model_variant='',
+                                     training_number=1, training_state=TrainerState.Initialized.value,
+                                     categories=[Category(name='class_a', id='uuid_of_class_a', type='classification')],
+                                     image_data=[])
         assert trainer._get_new_best_training_state() is None, 'should not find any models'
 
         model_path = 'result/weights/published/latest.pt'
@@ -136,9 +153,10 @@ def test_cla_old_model_files_are_deleted_on_publish(self, use_training_dir):
         """When a model is published, the old model files should be deleted"""
         trainer = Yolov5TrainerLogic()
         trainer._training = Training(id='someid', context=Context(organization='o', project='p'),
-                                     project_folder='./', images_folder='./', training_folder='./')
-        trainer.training.data = TrainingData(image_data=[], categories=[
-            Category(name='class_a', id='uuid_of_class_a', type='classification')])
+                                     project_folder='./', images_folder='./', training_folder='./',
+                                     categories=[Category(name='class_a', id='uuid_of_class_a', type='classification')],
+                                     hyperparameters={}, model_variant='',
+                                     image_data=[], training_number=1, training_state=TrainerState.Initialized.value)
         assert trainer._get_new_best_training_state() is None, 'should not find any models'
 
         mock_epoch({'class_a': {'fp': 0, 'tp': 1, 'fn': 0}})
@@ -165,7 +183,9 @@ async def test_cla_clear_training_data(self, use_training_dir):
         trainer = Yolov5TrainerLogic()
         os.makedirs('./data/o/p/trainings/some_uuid', exist_ok=True)
         trainer._training = Training(id='someid', context=Context(organization='o', project='p'), project_folder='./',
-                                     images_folder='./', training_folder='./data/o/p/trainings/some_uuid')
+                                     images_folder='./', training_folder='./data/o/p/trainings/some_uuid',
+                                     categories=[], hyperparameters={}, model_variant='',
+                                     image_data=[], training_number=1, training_state=TrainerState.Initialized.value)
         os.makedirs(f'{trainer.training.training_folder}/result/weights/', exist_ok=True)
         os.makedirs(f'{trainer.training.training_folder}/result/weights/published/', exist_ok=True)
 
@@ -225,8 +245,10 @@ async def test_cla_create_file_structure(self, use_training_dir):
                            'category_id': 'uuid_of_class_2', }}]
         trainer = Yolov5TrainerLogic()
         trainer._training = Training(id='someid', context=Context(organization='o', project='p'),
-                                     project_folder='./',  images_folder='./', training_folder='./')
-        trainer.training.data = TrainingData(image_data=image_data, categories=categories)
+                                     project_folder='./', images_folder='./', training_folder='./',
+                                     image_data=image_data, categories=categories, hyperparameters={},
+                                     model_variant='', training_number=1,
+                                     training_state=TrainerState.Initialized.value)
 
         yolov5_format.create_file_structure_cla(trainer.training)
 
@@ -234,21 +256,23 @@ async def test_cla_create_file_structure(self, use_training_dir):
         assert Path('./test/classification_category_2/image_2.jpg').is_symlink()
 
 
-# ---------------------- HELPER FUNCTIONS ----------------------
+# =======================================================================================================================
+# ---------------------------------------------- HELPERS ----------------------------------------------------------------
+# =======================================================================================================================
+
 
+async def download_training_data(images_folder: str, data_exchanger: DataExchanger, glc: LoopCommunicator
+                                 ) -> Tuple[List[Category], List[Dict]]:
+    image_data, _ = await TrainingsDownloader(data_exchanger).download_training_data(images_folder)
 
-async def download_training_data(
-        training: Training, data_exchanger: DataExchanger, glc: LoopCommunicator) -> TrainingData:
-    training_data = TrainingData()
     response = await glc.get(f"/{os.environ['LOOP_ORGANIZATION']}/projects/{os.environ['LOOP_PROJECT']}/data")
+    assert response.status_code != 401, 'Authentification error - did you set LOOP_USERNAME and LOOP_PASSWORD in your environment?'
     assert response.status_code == 200
     data = response.json()
-    training_data.categories = Category.from_list(
+    categories = Category.from_list(
         [category for category in data['categories'] if category['type'] == 'classification'])
 
-    image_data, _ = await TrainingsDownloader(data_exchanger).download_training_data(training.images_folder)
-    training_data.image_data = image_data
-    return training_data
+    return categories, image_data
 
 
 def mock_epoch(confusion_matrix: Dict):
diff --git a/trainer/app_code/yolov5_format.py b/trainer/app_code/yolov5_format.py
index 491c313..b823fbd 100644
--- a/trainer/app_code/yolov5_format.py
+++ b/trainer/app_code/yolov5_format.py
@@ -2,10 +2,9 @@
 import os
 import shutil
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Any, Dict, List, Tuple
 
-from learning_loop_node.data_classes import (CategoryType, Hyperparameter,
-                                             Training)
+from learning_loop_node.data_classes import CategoryType, Training
 from ruamel.yaml import YAML
 
 yaml = YAML()
@@ -13,9 +12,9 @@
 
 def get_ids_and_sizes_of_point_classes(training: Training) -> Tuple[List[str], List[str]]:
     """Returns a list of trainingids and sizes (in px) of point classes in the training data."""
-    assert training.data is not None, 'Training should have data'
+    assert training is not None, 'Training should not be None'
     point_ids, point_sizes = [], []
-    for i, category in enumerate(training.data.categories):
+    for i, category in enumerate(training.categories):
         if category.type == CategoryType.Point:
             point_ids.append(str(i))
             point_sizes.append(str(category.point_size or 20))
@@ -23,8 +22,7 @@ def get_ids_and_sizes_of_point_classes(training: Training) -> Tuple[List[str], L
 
 
 def category_lookup_from_training(training: Training) -> Dict[str, str]:
-    assert training.data is not None, 'Training should have data'
-    return {c.name: c.id for c in training.data.categories}
+    return {c.name: c.id for c in training.categories}
 
 
 def _create_set(training: Training, set_name: str) -> int:
@@ -35,7 +33,6 @@ def _create_set(training: Training, set_name: str) -> int:
     "class(id) x_center y_center width height" (normalized by image width and height)
     Note that the id here is not the uuid but the training id (0, 1, 2, ...).
     [see here](https://docs.ultralytics.com/tutorials/train-custom-datasets/)."""
-    assert training.data is not None, 'Training should have data'
 
     category_uuids = list(category_lookup_from_training(training).values())
 
@@ -46,7 +43,7 @@ def _create_set(training: Training, set_name: str) -> int:
     os.makedirs(images_path, exist_ok=True)
     img_count = 0
 
-    for image in training.data.image_data:
+    for image in training.image_data or []:
         if image['set'] == set_name:
             img_count += 1
             image_name = image['id'] + '.jpg'
@@ -68,7 +65,7 @@ def _create_set(training: Training, set_name: str) -> int:
                 yolo_boxes.append(c_id + ' ' + ' '.join([f"{c:.6f}" for c in coords]) + '\n')
 
             for point in image['point_annotations']:
-                size = [c for c in training.data.categories if c.id == point['category_id']][0].point_size or 20
+                size = [c for c in training.categories if c.id == point['category_id']][0].point_size or 20
                 coords = [
                     point['x']/width,
                     point['y']/height,
@@ -103,15 +100,15 @@ def _create_set_cla(training: Training, set_name: str):
         # │   │   ├── image2.jpg
 
     count = 0
-    assert training.data is not None, 'Training should have data'
-    for image in training.data.image_data:
+    assert training.image_data is not None, 'Training should have data'
+    for image in training.image_data:
         if image['set'] == set_name:
             image_name = image['id'] + '.jpg'
             classification = image['classification_annotation']
             if classification:
                 count += 1
                 category = classification['category_id']
-                category_name = [c for c in training.data.categories if c.id == category][0].name
+                category_name = [c for c in training.categories if c.id == category][0].name
                 image_path = f"{images_path}/{category_name}/{image_name}"
                 # logging.info(f'linking {image_name} to {image_path}')
                 os.symlink(f'{os.path.abspath(training.images_folder)}/{image_name}', image_path)
@@ -144,6 +141,10 @@ def create_file_structure_cla(training: Training):
 
 
 def create_file_structure(training: Training):
+    """Uses:
+    - training.training_folder to create the file structure.
+    - training.image_data to create the image links and annotations.
+    - training.categories to create the annotations."""
     path = training.training_folder
     Path(path).mkdir(parents=True, exist_ok=True)
 
@@ -154,13 +155,18 @@ def create_file_structure(training: Training):
     logging.info(f'Prepared file structure with {num_train_imgs} training images and {num_test_imgs} test images')
 
 
-def update_hyps(yaml_path: str, hyperparameter: Hyperparameter):
+def update_hyps(yaml_path: str, hyperparameter: Dict[str, Any]):
+    """Update the flip settings in the hyperparameter yaml file at `yaml_path`.
+
+    'fliplr' / 'flipud' are set to 0.5 when `hyperparameter['flip_rl']` / `hyperparameter['flip_ud']`
+    is truthy, and to 0 when the value is falsy or the key is missing."""
 
     with open(yaml_path) as f:
         content = yaml.load(f)
 
-    content['fliplr'] = 0.5 if hyperparameter.flip_rl else 0
-    content['flipud'] = 0.5 if hyperparameter.flip_ud else 0
+    # .get(): a dict without flip settings (e.g. {}) disables flipping instead of raising KeyError.
+    content['fliplr'] = 0.5 if hyperparameter.get('flip_rl') else 0
+    content['flipud'] = 0.5 if hyperparameter.get('flip_ud') else 0
 
     with open(yaml_path, 'w') as f:
         yaml.dump(content, f)
diff --git a/trainer/app_code/yolov5_trainer.py b/trainer/app_code/yolov5_trainer.py
index 1e530fb..6c13f0b 100644
--- a/trainer/app_code/yolov5_trainer.py
+++ b/trainer/app_code/yolov5_trainer.py
@@ -17,7 +17,8 @@
                                              PointDetection, PretrainedModel,
                                              TrainingStateData)
 from learning_loop_node.trainer import trainer_logic
-from learning_loop_node.trainer.exceptions import NodeNeedsRestartError
+from learning_loop_node.trainer.exceptions import (CriticalError,
+                                                   NodeNeedsRestartError)
 from learning_loop_node.trainer.executor import Executor
 
 from . import batch_size_calculation, model_files, yolov5_format
@@ -65,7 +66,7 @@ async def _start_training_from_base_model(self) -> None:
         await self._start_training_from_model(f'{self.training.training_folder}/model.pt')
 
     async def _start_training_from_scratch(self) -> None:
-        await self._start_training_from_model(f'yolov5{self.training.base_model_uuid_or_name}.pt')
+        await self._start_training_from_model(f'yolov5{self.training.model_variant}.pt')
 
     def _can_resume(self) -> bool:
         path = self.training.training_folder_path / 'result/weights/published/latest.pt'
@@ -92,6 +93,11 @@ def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
         if not weightfile:
             return None
 
+        if self.is_cla:
+            epoch = None
+        else:
+            epoch = model_files.epoch_from_weightfile(weightfile)
+
         weightfile_str = str(weightfile.absolute())
         logging.info(f'found new best model at {weightfile_str}')
 
@@ -101,7 +107,7 @@ def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
             for category_name in list(metrics.keys()):
                 metrics[categories[category_name]] = metrics.pop(category_name)
 
-        return TrainingStateData(confusion_matrix=metrics, meta_information={'weightfile': weightfile_str})
+        return TrainingStateData(confusion_matrix=metrics, meta_information={'weightfile': weightfile_str}, epoch=epoch)
 
     def _on_metrics_published(self, training_state_data: TrainingStateData) -> None:
         pub_path = (self.training.training_folder_path / 'result/weights/published').absolute()
@@ -116,8 +122,8 @@ def _on_metrics_published(self, training_state_data: TrainingStateData) -> None:
     async def _get_latest_model_files(self) -> Dict[str, List[str]]:
         weightfile = (self.training.training_folder_path / "result/weights/published/latest.pt").absolute()
         if not os.path.isfile(weightfile):
-            logging.warning(f'No model found at {weightfile}')
-            return {}
+            logging.error(f'No model found at {weightfile} - Training failed!')
+            raise CriticalError(f'No model found at {weightfile}')
 
         shutil.copy(weightfile, '/tmp/model.pt')
         training_path = '/'.join(str(weightfile).split('/')[:-4])
@@ -190,7 +196,7 @@ async def _start_training_from_model(self, model: str) -> None:
         def move_and_update_hyps(source: Path, target: str) -> None:
             assert (source).exists(), 'Hyperparameter file not found at "{hyperparameter_source_path}"'
             shutil.copy(source, target)
-            yolov5_format.update_hyps(target, self.hyperparameter)
+            yolov5_format.update_hyps(target, self.hyperparameters)
 
         hyp_path = Path(__file__).resolve().parents[1] / ('hyp_cla.yaml' if self.is_cla else 'hyp_det.yaml')
 
@@ -206,7 +212,7 @@ def move_and_update_hyps(source: Path, target: str) -> None:
             await self._start(model, " --clear")
 
     async def _start(self, model: str, additional_parameters: str = ''):
-        resolution = self.hyperparameter.resolution
+        resolution = self.training.hyperparameters['resolution']
         hyperparameter_path = f'{self.training.training_folder}/hyp.yaml'
         self._load_hyps_set_epochs(hyperparameter_path)
 

From b3f5b043aa9f558dae5e40455ce2a3ef694478f0 Mon Sep 17 00:00:00 2001
From: Niklas Neugebauer <niklas@zauberzeug.com>
Date: Fri, 22 Nov 2024 12:10:28 +0100
Subject: [PATCH 2/5] refactor: rename `update_hyps` to
 `set_hyperparameters_in_file`

---
 trainer/app_code/tests/test_yolov5.py | 4 ++--
 trainer/app_code/yolov5_format.py     | 4 ++--
 trainer/app_code/yolov5_trainer.py    | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/trainer/app_code/tests/test_yolov5.py b/trainer/app_code/tests/test_yolov5.py
index 7ada274..2999d92 100644
--- a/trainer/app_code/tests/test_yolov5.py
+++ b/trainer/app_code/tests/test_yolov5.py
@@ -19,7 +19,7 @@
 from ruamel.yaml import YAML
 
 from .. import model_files, yolov5_format
-from ..yolov5_format import update_hyps
+from ..yolov5_format import set_hyperparameters_in_file
 from ..yolov5_trainer import Yolov5TrainerLogic
 
 # pylint: disable=protected-access,unused-argument
@@ -296,7 +296,7 @@ def assert_yaml_content(yaml_path, **kwargs):
                       'flip_ud': True}
 
     assert_yaml_content('/tmp/hyp.yaml', fliplr=0, flipud=0)
-    update_hyps('/tmp/hyp.yaml', hyperparameter)
+    set_hyperparameters_in_file('/tmp/hyp.yaml', hyperparameter)
     assert_yaml_content('/tmp/hyp.yaml', fliplr=0.5, flipud=0.5)
 
 # =======================================================================================================================
diff --git a/trainer/app_code/yolov5_format.py b/trainer/app_code/yolov5_format.py
index b823fbd..a24951d 100644
--- a/trainer/app_code/yolov5_format.py
+++ b/trainer/app_code/yolov5_format.py
@@ -100,7 +100,7 @@ def _create_set_cla(training: Training, set_name: str):
         # │   │   ├── image2.jpg
 
     count = 0
-    assert training.image_data is not None, 'Training should have data'
+    assert training.image_data is not None, 'Training should have image data'
     for image in training.image_data:
         if image['set'] == set_name:
             image_name = image['id'] + '.jpg'
@@ -155,7 +155,7 @@ def create_file_structure(training: Training):
     logging.info(f'Prepared file structure with {num_train_imgs} training images and {num_test_imgs} test images')
 
 
-def update_hyps(yaml_path: str, hyperparameter: Dict[str, Any]):
+def set_hyperparameters_in_file(yaml_path: str, hyperparameter: Dict[str, Any]):
 
     with open(yaml_path) as f:
         content = yaml.load(f)
diff --git a/trainer/app_code/yolov5_trainer.py b/trainer/app_code/yolov5_trainer.py
index 6c13f0b..69b74a1 100644
--- a/trainer/app_code/yolov5_trainer.py
+++ b/trainer/app_code/yolov5_trainer.py
@@ -196,7 +196,7 @@ async def _start_training_from_model(self, model: str) -> None:
         def move_and_update_hyps(source: Path, target: str) -> None:
             assert (source).exists(), 'Hyperparameter file not found at "{hyperparameter_source_path}"'
             shutil.copy(source, target)
-            yolov5_format.update_hyps(target, self.hyperparameters)
+            yolov5_format.set_hyperparameters_in_file(target, self.hyperparameters)
 
         hyp_path = Path(__file__).resolve().parents[1] / ('hyp_cla.yaml' if self.is_cla else 'hyp_det.yaml')
 

From ecacec7f8a2aeb20d9b0b7424d1d2b5d529aec71 Mon Sep 17 00:00:00 2001
From: Niklas Neugebauer <niklas@zauberzeug.com>
Date: Fri, 22 Nov 2024 12:12:13 +0100
Subject: [PATCH 3/5] log: use lazy logging

---
 trainer/app_code/yolov5_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trainer/app_code/yolov5_trainer.py b/trainer/app_code/yolov5_trainer.py
index 69b74a1..169dd3a 100644
--- a/trainer/app_code/yolov5_trainer.py
+++ b/trainer/app_code/yolov5_trainer.py
@@ -122,7 +122,7 @@ def _on_metrics_published(self, training_state_data: TrainingStateData) -> None:
     async def _get_latest_model_files(self) -> Dict[str, List[str]]:
         weightfile = (self.training.training_folder_path / "result/weights/published/latest.pt").absolute()
         if not os.path.isfile(weightfile):
-            logging.error(f'No model found at {weightfile} - Training failed!')
+            logging.error('No model found at %s - Training failed!', weightfile)
             raise CriticalError(f'No model found at {weightfile}')
 
         shutil.copy(weightfile, '/tmp/model.pt')

From 5efa7d82aa23a0690c3785ca3a27d7fe2292fc1f Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Mon, 25 Nov 2024 16:57:07 +0100
Subject: [PATCH 4/5] bump trainer to nodelib 0.12.0

---
 trainer/docker.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/trainer/docker.sh b/trainer/docker.sh
index 55769cb..7ac885c 100755
--- a/trainer/docker.sh
+++ b/trainer/docker.sh
@@ -44,8 +44,8 @@ fi
 # NODE_LIB_VERSION should only be used, to build the corresponding version and deploy to docker
 # make sure the remote repository always has the 'latest' tag (otherwise the CI tests will fail)
 
-SEMANTIC_VERSION=0.1.7
-NODE_LIB_VERSION=0.11.1
+SEMANTIC_VERSION=0.1.8
+NODE_LIB_VERSION=0.12.0
 
 if [ "$2" = "test_latest" ]; then
     image="zauberzeug/yolov5-trainer:latest"

From 6c5bcaf4a15a18b32036e1c9d717b946859594a3 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Tue, 26 Nov 2024 09:26:47 +0100
Subject: [PATCH 5/5] raise critical error on failure of batch size
 calculation

---
 trainer/app_code/yolov5_trainer.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/trainer/app_code/yolov5_trainer.py b/trainer/app_code/yolov5_trainer.py
index 169dd3a..12bbb08 100644
--- a/trainer/app_code/yolov5_trainer.py
+++ b/trainer/app_code/yolov5_trainer.py
@@ -224,8 +224,13 @@ async def _start(self, model: str, additional_parameters: str = ''):
         else:
             p_ids, p_sizes = yolov5_format.get_ids_and_sizes_of_point_classes(self.training)
             self._try_replace_optimized_hyperparameter()
-            batch_size = await batch_size_calculation.calc(self.training.training_folder, model, hyperparameter_path,
-                                                           f'{self.training.training_folder}/dataset.yaml', resolution)
+            try:
+                batch_size = await batch_size_calculation.calc(self.training.training_folder, model, hyperparameter_path,
+                                                               f'{self.training.training_folder}/dataset.yaml', resolution)
+            except Exception as e:
+                logging.error(f'Error during batch size calculation: {e}')
+                raise NodeNeedsRestartError() from e
+
             cmd = f'python /app/train_det.py --exist-ok --patience {self.patience} \
                 --batch-size {batch_size} --img {resolution} --data dataset.yaml --weights {model} \
                 --project {self.training.training_folder} --name result --hyp {hyperparameter_path} \