From 493441da5aec76f5a65e03063c82e649d02b7e20 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Wed, 20 Nov 2024 11:18:12 +0100 Subject: [PATCH 1/5] adapt to nodelib 0.12 --- trainer/app_code/model_files.py | 17 +-- trainer/app_code/tests/test_yolov5.py | 136 ++++++++++++++-------- trainer/app_code/tests/test_yolov5_cla.py | 76 +++++++----- trainer/app_code/yolov5_format.py | 33 +++--- trainer/app_code/yolov5_trainer.py | 20 ++-- 5 files changed, 176 insertions(+), 106 deletions(-) diff --git a/trainer/app_code/model_files.py b/trainer/app_code/model_files.py index a31f77a..90947f0 100644 --- a/trainer/app_code/model_files.py +++ b/trainer/app_code/model_files.py @@ -22,19 +22,22 @@ def get_all_weightfiles(training_path: Path) -> List[Path]: return weightfiles -def _epoch_from_weightfile(weightfile: Path) -> int: - number = weightfile.name[5:-3] - if number == '': +def epoch_from_weightfile(weightfile: Path) -> int: + try: + number = weightfile.name[5:-3] + if number == '': + return 0 + return int(number) + except ValueError: return 0 - return int(number) def delete_older_epochs(training_path: Path, weightfile: Path): all_weightfiles = get_all_weightfiles(training_path) - target_epoch = _epoch_from_weightfile(weightfile) + target_epoch = epoch_from_weightfile(weightfile) for f in all_weightfiles: - if _epoch_from_weightfile(f) < target_epoch: + if epoch_from_weightfile(f) < target_epoch: _try_remove(f) delete_json_for_weightfile(f) @@ -53,6 +56,6 @@ def _try_remove(file: Path): def get_new(training_path: Path) -> Union[Path, None]: all_weightfiles = get_all_weightfiles(training_path) if all_weightfiles: - all_weightfiles.sort(key=_epoch_from_weightfile) + all_weightfiles.sort(key=epoch_from_weightfile) return all_weightfiles[-1] return None diff --git a/trainer/app_code/tests/test_yolov5.py b/trainer/app_code/tests/test_yolov5.py index 943d8ee..7ada274 100644 --- a/trainer/app_code/tests/test_yolov5.py +++ b/trainer/app_code/tests/test_yolov5.py @@ -5,13 
+5,14 @@ import os import shutil from pathlib import Path -from typing import Dict +from typing import Dict, List, Tuple from uuid import uuid4 import pytest -from learning_loop_node.data_classes import (Category, Context, Hyperparameter, - Training, TrainingData) +from learning_loop_node.data_classes import (Category, Context, TrainerState, + Training) from learning_loop_node.data_exchanger import DataExchanger +from learning_loop_node.helpers.misc import create_image_folder from learning_loop_node.loop_communication import LoopCommunicator from learning_loop_node.trainer.downloader import TrainingsDownloader from learning_loop_node.trainer.executor import Executor @@ -37,15 +38,22 @@ class TestWithLoop: """This test environment sets up the environment vars and a test project in the loop which is used for testing.""" - async def test_training_creates_model(self, use_training_dir, data_exchanger: DataExchanger, glc: LoopCommunicator): + @pytest.mark.usefixtures('use_training_dir') + async def test_training_creates_model(self, data_exchanger: DataExchanger, glc: LoopCommunicator): """Test if training creates a model""" + + project_folder = os.getcwd() + images_folder = create_image_folder(project_folder) + categories, image_data = await download_training_data(images_folder, data_exchanger, glc) training = Training(id=str(uuid4()), - project_folder=os.getcwd(), - training_folder=os.getcwd() + '/training', - images_folder=os.getcwd() + '/images', - base_model_uuid_or_name='model.pt', - context=Context(project='pytest_yolo5det', organization='zauberzeug')) - training.data = await create_training_data(training, data_exchanger, glc) + project_folder=project_folder, + training_folder=project_folder + '/training', + images_folder=images_folder, + model_variant='', + context=Context(project='pytest_yolo5det', organization='zauberzeug'), + categories=categories, hyperparameters={}, training_number=1, + training_state=TrainerState.Initialized.value, + image_data=image_data) 
yolov5_format.create_file_structure(training) executor = Executor(os.getcwd()) # from https://github.com/WongKinYiu/yolor#training @@ -59,19 +67,25 @@ async def test_training_creates_model(self, use_training_dir, data_exchanger: Da best = training.training_folder + '/result/weights/best.pt' assert os.path.isfile(best) - async def test_parse_progress_from_log(self, use_training_dir, data_exchanger: DataExchanger, glc: LoopCommunicator): + @pytest.mark.usefixtures('use_training_dir') + async def test_parse_progress_from_log(self, data_exchanger: DataExchanger, glc: LoopCommunicator): """Test if progress is parsed correctly from log""" trainer = Yolov5TrainerLogic() trainer.epochs = 2 + project_folder = os.getcwd() + images_folder = create_image_folder(project_folder) + categories, image_data = await download_training_data(images_folder, data_exchanger, glc) trainer._training = Training( id=str(uuid4()), - project_folder=os.getcwd(), - training_folder=os.getcwd() + '/training', - images_folder=os.getcwd() + '/images', - base_model_uuid_or_name='model.pt', + project_folder=project_folder, + training_folder=project_folder + '/training', + images_folder=images_folder, + model_variant='', context=Context(project='pytest_yolo5det', organization='zauberzeug'), + categories=categories, hyperparameters={}, training_number=1, + training_state=TrainerState.Initialized.value, + image_data=image_data, ) - trainer.training.data = await create_training_data(trainer.training, data_exchanger, glc) yolov5_format.create_file_structure(trainer.training) trainer._executor = Executor(os.getcwd()) @@ -93,16 +107,21 @@ async def test_parse_progress_from_log(self, use_training_dir, data_exchanger: D @pytest.mark.environment(organization='', project='', mode='DETECTION') class TestWithDetection: - async def test_create_file_structure_box_size(self, use_training_dir): + @pytest.mark.usefixtures('use_training_dir') + async def test_create_file_structure_box_size(self): categories = 
[Category(name='point_category_1', id='uuid_of_class_1'), Category(name='point_category_2', id='uuid_of_class_2', point_size=30)] image_data = [{'set': 'train', 'id': 'image_1', 'width': 100, 'height': 100, 'box_annotations': [], 'point_annotations': [{'category_id': 'uuid_of_class_1', 'x': 50, 'y': 60}, {'category_id': 'uuid_of_class_2', 'x': 60, 'y': 70}]}] trainer = Yolov5TrainerLogic() - trainer._training = Training(id='someid', context=Context(organization='o', project='p'), - project_folder='./', images_folder='./', training_folder='./') - trainer.training.data = TrainingData(image_data=image_data, categories=categories) + trainer._training = Training( + id='someid', context=Context(organization='o', project='p'), + project_folder='./', images_folder='./', training_folder='./', + image_data=image_data, categories=categories, hyperparameters={}, + model_variant='', training_number=1, + training_state=TrainerState.Initialized.value) + yolov5_format.create_file_structure(trainer.training) with open('./train/image_1.txt', 'r') as f: @@ -111,13 +130,17 @@ async def test_create_file_structure_box_size(self, use_training_dir): assert '0 0.500000 0.600000 0.200000 0.200000' in lines[0] assert '1 0.600000 0.700000 0.300000 0.300000' in lines[1] - async def test_new_model_discovery(self, use_training_dir): + @pytest.mark.usefixtures('use_training_dir') + async def test_new_model_discovery(self): """This test also triggers the creation of a wts file""" trainer = Yolov5TrainerLogic() - trainer._training = Training(id='someid', context=Context(organization='o', project='p'), - project_folder='./', images_folder='./', training_folder='./') - trainer.training.data = TrainingData(image_data=[], categories=[ - Category(name='class_a', id='uuid_of_class_a', type='box')]) + trainer._training = Training( + id='someid', context=Context(organization='o', project='p'), + project_folder='./', images_folder='./', training_folder='./', image_data=[], + 
categories=[Category(name='class_a', id='uuid_of_class_a', type='box')], + hyperparameters={}, model_variant='', training_number=1, + training_state=TrainerState.Initialized.value) + assert trainer._get_new_best_training_state() is None, 'should not find any models' model_path = 'result/weights/published/latest.pt' @@ -152,12 +175,15 @@ async def test_new_model_discovery(self, use_training_dir): # 'yolov5_pytorch': ['/tmp/model.pt', '/tmp/test_training/hyp.yaml'], # 'yolov5_wts': ['/tmp/model.wts']} - def test_newest_model_is_used(self, use_training_dir): + @pytest.mark.usefixtures('use_training_dir') + def test_newest_model_is_used(self): trainer = Yolov5TrainerLogic() - trainer._training = Training(id='someid', context=Context(organization='o', project='p'), - project_folder='./', images_folder='./', training_folder='./') - trainer.training.data = TrainingData(image_data=[], categories=[ - Category(name='class_a', id='uuid_of_class_a', type='box')]) + trainer._training = Training( + id='someid', context=Context(organization='o', project='p'), + project_folder='./', images_folder='./', training_folder='./', image_data=[], + categories=[Category(name='class_a', id='uuid_of_class_a', type='box')], + hyperparameters={}, model_variant='', training_number=1, + training_state=TrainerState.Initialized.value) # create some models. 
mock_epoch(10, {}) @@ -168,12 +194,16 @@ def test_newest_model_is_used(self, use_training_dir): assert 'epoch10.pt' not in new_model.meta_information['weightfile'] assert 'epoch200.pt' in new_model.meta_information['weightfile'] - def test_old_model_files_are_deleted_on_publish(self, use_training_dir): + @pytest.mark.usefixtures('use_training_dir') + def test_old_model_files_are_deleted_on_publish(self): trainer = Yolov5TrainerLogic() - trainer._training = Training(id='someid', context=Context(organization='o', project='p'), - project_folder='./', images_folder='./', training_folder='./') - trainer.training.data = TrainingData(image_data=[], categories=[ - Category(name='class_a', id='uuid_of_class_a', type='box')]) + trainer._training = Training( + id='someid', context=Context(organization='o', project='p'), + project_folder='./', images_folder='./', training_folder='./', image_data=[], + categories=[Category(name='class_a', id='uuid_of_class_a', type='box')], + hyperparameters={}, model_variant='', training_number=1, + training_state=TrainerState.Initialized.value) + assert trainer._get_new_best_training_state() is None, 'should not find any models' mock_epoch(1, {'class_a': {'fp': 0, 'tp': 1, 'fn': 0}}) @@ -196,12 +226,15 @@ def test_old_model_files_are_deleted_on_publish(self, use_training_dir): _, _, files = next(os.walk("result/weights")) assert len(files) == 0 - def test_newer_model_files_are_kept_during_deleting(self, use_training_dir): + @pytest.mark.usefixtures('use_training_dir') + def test_newer_model_files_are_kept_during_deleting(self): trainer = Yolov5TrainerLogic() - trainer._training = Training(id='someid', context=Context(organization='o', project='p'), - project_folder='./', images_folder='./', training_folder='./') - trainer.training.data = TrainingData(image_data=[], categories=[ - Category(name='class_a', id='uuid_of_class_a', type='box')]) + trainer._training = Training( + id='someid', context=Context(organization='o', project='p'), + 
project_folder='./', images_folder='./', training_folder='./', image_data=[], + categories=[Category(name='class_a', id='uuid_of_class_a', type='box')], + hyperparameters={}, model_variant='', training_number=1, + training_state=TrainerState.Initialized.value) # create some models. mock_epoch(10, {}) @@ -217,10 +250,13 @@ def test_newer_model_files_are_kept_during_deleting(self, use_training_dir): assert len(all_model_files) == 1 assert 'epoch201.pt' in str(all_model_files[0]), 'Epoch201 is not yed synced. It should not be deleted.' - async def test_clear_training_data(self, use_training_dir): + @pytest.mark.usefixtures('use_training_dir') + async def test_clear_training_data(self): trainer = Yolov5TrainerLogic() trainer._training = Training(id='someid', context=Context(organization='o', project='p'), - project_folder='./', images_folder='./', training_folder='./') + project_folder='./', images_folder='./', training_folder='./', + categories=[], hyperparameters={}, model_variant='', + image_data=[], training_number=1, training_state=TrainerState.Initialized.value) os.makedirs(f'{trainer.training.training_folder}/result/weights/', exist_ok=True) os.makedirs(f'{trainer.training.training_folder}/result/weights/published/', exist_ok=True) @@ -255,7 +291,9 @@ def assert_yaml_content(yaml_path, **kwargs): assert content[key] == value shutil.copy('app_code/tests/test_data/hyp.yaml', '/tmp') - hyperparameter = Hyperparameter(resolution=600, flip_rl=True, flip_ud=True) + hyperparameter = {'resolution': 600, + 'flip_rl': True, + 'flip_ud': True} assert_yaml_content('/tmp/hyp.yaml', fliplr=0, flipud=0) update_hyps('/tmp/hyp.yaml', hyperparameter) @@ -266,19 +304,17 @@ def assert_yaml_content(yaml_path, **kwargs): # ======================================================================================================================= -async def create_training_data(training: Training, data_exchanger: DataExchanger, glc: LoopCommunicator) -> TrainingData: - training_data = 
TrainingData() +async def download_training_data(images_folder: str, data_exchanger: DataExchanger, glc: LoopCommunicator + ) -> Tuple[List[Category], List[Dict]]: - image_data, _ = await TrainingsDownloader(data_exchanger).download_training_data(training.images_folder) - logging.info(f'got {len(image_data)} images') + image_data, _ = await TrainingsDownloader(data_exchanger).download_training_data(images_folder) response = await glc.get(f"/{os.environ['LOOP_ORGANIZATION']}/projects/{os.environ['LOOP_PROJECT']}/data") assert response.status_code != 401, 'Authentification error - did you set LOOP_USERNAME and LOOP_PASSWORD in your environment?' assert response.status_code == 200 data = response.json() - training_data.categories = Category.from_list(data['categories']) - training_data.image_data = image_data - return training_data + categories = Category.from_list(data['categories']) + return categories, image_data def mock_epoch(number: int, confusion_matrix: Dict): diff --git a/trainer/app_code/tests/test_yolov5_cla.py b/trainer/app_code/tests/test_yolov5_cla.py index 5aa2114..dd519da 100644 --- a/trainer/app_code/tests/test_yolov5_cla.py +++ b/trainer/app_code/tests/test_yolov5_cla.py @@ -4,14 +4,15 @@ import logging import os from pathlib import Path -from typing import Dict +from typing import Dict, List, Tuple from uuid import uuid4 import pytest from learning_loop_node.data_classes import (Category, CategoryType, Context, - ModelInformation, Training, - TrainingData) + ModelInformation, TrainerState, + Training) from learning_loop_node.data_exchanger import DataExchanger +from learning_loop_node.helpers.misc import create_image_folder from learning_loop_node.loop_communication import LoopCommunicator from learning_loop_node.trainer.downloader import TrainingsDownloader from learning_loop_node.trainer.executor import Executor @@ -36,13 +37,20 @@ class TestWithLoop: async def test_cla_training_creates_model( self, use_training_dir, data_exchanger: DataExchanger, 
glc: LoopCommunicator): """Training should create a model file (best.pt)""" + + project_folder = os.getcwd() + categories, image_data = await download_training_data(create_image_folder(project_folder), data_exchanger, glc) + training = Training(id=str(uuid4()), - project_folder=os.getcwd(), + project_folder=project_folder, training_folder=os.getcwd() + '/training', images_folder=os.getcwd() + '/images', model_uuid_for_detecting='model.pt', context=Context(project=os.environ['LOOP_PROJECT'], - organization=os.environ['LOOP_ORGANIZATION'])) - training.data = await download_training_data(training, data_exchanger, glc) + organization=os.environ['LOOP_ORGANIZATION']), + categories=categories, hyperparameters={}, model_variant='', + image_data=image_data, + training_state=TrainerState.Initialized.value, + training_number=1) yolov5_format.create_file_structure_cla(training) logging.info(training.training_folder) # /tmp/test_training/ @@ -69,13 +77,20 @@ async def test_cla_parse_progress_from_log( trainer.epochs = 3 # NOTE: must correspond to the value set in test_data/hyp_cla.yaml if os.path.isfile('/tmp/model.pt'): os.remove('/tmp/model.pt') + + project_folder = os.getcwd() + categories, image_data = await download_training_data(create_image_folder(project_folder), data_exchanger, glc) + trainer._training = Training(id=str(uuid4()), - project_folder=os.getcwd(), + project_folder=project_folder, training_folder=os.getcwd() + '/training', images_folder=os.getcwd() + '/images', model_uuid_for_detecting='model.pt', - context=Context(project='demo', organization='zauberzeug')) - trainer.training.data = await download_training_data(trainer.training, data_exchanger, glc) + context=Context(project='demo', organization='zauberzeug'), + categories=categories, hyperparameters={}, model_variant='', + image_data=image_data, + training_state=TrainerState.Initialized.value, + training_number=1) yolov5_format.create_file_structure_cla(trainer.training) await asyncio.sleep(1) @@ -95,9 
+110,11 @@ async def test_cla_new_model_discovery(self, use_training_dir): """The trainer should find new models""" trainer = Yolov5TrainerLogic() trainer._training = Training(id='someid', context=Context(organization='o', project='p'), - project_folder='./', images_folder='./', training_folder='./') - trainer.training.data = TrainingData(image_data=[], categories=[ - Category(name='class_a', id='uuid_of_class_a', type='classification')]) + project_folder='./', images_folder='./', training_folder='./', + hyperparameters={}, model_variant='', + training_number=1, training_state=TrainerState.Initialized.value, + categories=[Category(name='class_a', id='uuid_of_class_a', type='classification')], + image_data=[]) assert trainer._get_new_best_training_state() is None, 'should not find any models' model_path = 'result/weights/published/latest.pt' @@ -136,9 +153,10 @@ def test_cla_old_model_files_are_deleted_on_publish(self, use_training_dir): """When a model is published, the old model files should be deleted""" trainer = Yolov5TrainerLogic() trainer._training = Training(id='someid', context=Context(organization='o', project='p'), - project_folder='./', images_folder='./', training_folder='./') - trainer.training.data = TrainingData(image_data=[], categories=[ - Category(name='class_a', id='uuid_of_class_a', type='classification')]) + project_folder='./', images_folder='./', training_folder='./', + categories=[Category(name='class_a', id='uuid_of_class_a', type='classification')], + hyperparameters={}, model_variant='', + image_data=[], training_number=1, training_state=TrainerState.Initialized.value) assert trainer._get_new_best_training_state() is None, 'should not find any models' mock_epoch({'class_a': {'fp': 0, 'tp': 1, 'fn': 0}}) @@ -165,7 +183,9 @@ async def test_cla_clear_training_data(self, use_training_dir): trainer = Yolov5TrainerLogic() os.makedirs('./data/o/p/trainings/some_uuid', exist_ok=True) trainer._training = Training(id='someid', 
context=Context(organization='o', project='p'), project_folder='./', - images_folder='./', training_folder='./data/o/p/trainings/some_uuid') + images_folder='./', training_folder='./data/o/p/trainings/some_uuid', + categories=[], hyperparameters={}, model_variant='', + image_data=[], training_number=1, training_state=TrainerState.Initialized.value) os.makedirs(f'{trainer.training.training_folder}/result/weights/', exist_ok=True) os.makedirs(f'{trainer.training.training_folder}/result/weights/published/', exist_ok=True) @@ -225,8 +245,10 @@ async def test_cla_create_file_structure(self, use_training_dir): 'category_id': 'uuid_of_class_2', }}] trainer = Yolov5TrainerLogic() trainer._training = Training(id='someid', context=Context(organization='o', project='p'), - project_folder='./', images_folder='./', training_folder='./') - trainer.training.data = TrainingData(image_data=image_data, categories=categories) + project_folder='./', images_folder='./', training_folder='./', + image_data=image_data, categories=categories, hyperparameters={}, + model_variant='', training_number=1, + training_state=TrainerState.Initialized.value) yolov5_format.create_file_structure_cla(trainer.training) @@ -234,21 +256,23 @@ async def test_cla_create_file_structure(self, use_training_dir): assert Path('./test/classification_category_2/image_2.jpg').is_symlink() -# ---------------------- HELPER FUNCTIONS ---------------------- +# ======================================================================================================================= +# ---------------------------------------------- HELPERS ---------------------------------------------------------------- +# ======================================================================================================================= + +async def download_training_data(images_folder: str, data_exchanger: DataExchanger, glc: LoopCommunicator + ) -> Tuple[List[Category], List[Dict]]: + image_data, _ = await 
TrainingsDownloader(data_exchanger).download_training_data(images_folder) -async def download_training_data( - training: Training, data_exchanger: DataExchanger, glc: LoopCommunicator) -> TrainingData: - training_data = TrainingData() response = await glc.get(f"/{os.environ['LOOP_ORGANIZATION']}/projects/{os.environ['LOOP_PROJECT']}/data") + assert response.status_code != 401, 'Authentification error - did you set LOOP_USERNAME and LOOP_PASSWORD in your environment?' assert response.status_code == 200 data = response.json() - training_data.categories = Category.from_list( + categories = Category.from_list( [category for category in data['categories'] if category['type'] == 'classification']) - image_data, _ = await TrainingsDownloader(data_exchanger).download_training_data(training.images_folder) - training_data.image_data = image_data - return training_data + return categories, image_data def mock_epoch(confusion_matrix: Dict): diff --git a/trainer/app_code/yolov5_format.py b/trainer/app_code/yolov5_format.py index 491c313..b823fbd 100644 --- a/trainer/app_code/yolov5_format.py +++ b/trainer/app_code/yolov5_format.py @@ -2,10 +2,9 @@ import os import shutil from pathlib import Path -from typing import Dict, List, Tuple +from typing import Any, Dict, List, Tuple -from learning_loop_node.data_classes import (CategoryType, Hyperparameter, - Training) +from learning_loop_node.data_classes import CategoryType, Training from ruamel.yaml import YAML yaml = YAML() @@ -13,9 +12,9 @@ def get_ids_and_sizes_of_point_classes(training: Training) -> Tuple[List[str], List[str]]: """Returns a list of trainingids and sizes (in px) of point classes in the training data.""" - assert training.data is not None, 'Training should have data' + assert training is not None, 'Training should have data' point_ids, point_sizes = [], [] - for i, category in enumerate(training.data.categories): + for i, category in enumerate(training.categories): if category.type == CategoryType.Point: 
point_ids.append(str(i)) point_sizes.append(str(category.point_size or 20)) @@ -23,8 +22,7 @@ def get_ids_and_sizes_of_point_classes(training: Training) -> Tuple[List[str], L def category_lookup_from_training(training: Training) -> Dict[str, str]: - assert training.data is not None, 'Training should have data' - return {c.name: c.id for c in training.data.categories} + return {c.name: c.id for c in training.categories} def _create_set(training: Training, set_name: str) -> int: @@ -35,7 +33,6 @@ def _create_set(training: Training, set_name: str) -> int: "class(id) x_center y_center width height" (normalized by image width and height) Note that the id here is not the uuid but the training id (0, 1, 2, ...). [see here](https://docs.ultralytics.com/tutorials/train-custom-datasets/).""" - assert training.data is not None, 'Training should have data' category_uuids = list(category_lookup_from_training(training).values()) @@ -46,7 +43,7 @@ def _create_set(training: Training, set_name: str) -> int: os.makedirs(images_path, exist_ok=True) img_count = 0 - for image in training.data.image_data: + for image in training.image_data or []: if image['set'] == set_name: img_count += 1 image_name = image['id'] + '.jpg' @@ -68,7 +65,7 @@ def _create_set(training: Training, set_name: str) -> int: yolo_boxes.append(c_id + ' ' + ' '.join([f"{c:.6f}" for c in coords]) + '\n') for point in image['point_annotations']: - size = [c for c in training.data.categories if c.id == point['category_id']][0].point_size or 20 + size = [c for c in training.categories if c.id == point['category_id']][0].point_size or 20 coords = [ point['x']/width, point['y']/height, @@ -103,15 +100,15 @@ def _create_set_cla(training: Training, set_name: str): # │ │ ├── image2.jpg count = 0 - assert training.data is not None, 'Training should have data' - for image in training.data.image_data: + assert training.image_data is not None, 'Training should have data' + for image in training.image_data: if image['set'] == 
set_name: image_name = image['id'] + '.jpg' classification = image['classification_annotation'] if classification: count += 1 category = classification['category_id'] - category_name = [c for c in training.data.categories if c.id == category][0].name + category_name = [c for c in training.categories if c.id == category][0].name image_path = f"{images_path}/{category_name}/{image_name}" # logging.info(f'linking {image_name} to {image_path}') os.symlink(f'{os.path.abspath(training.images_folder)}/{image_name}', image_path) @@ -144,6 +141,10 @@ def create_file_structure_cla(training: Training): def create_file_structure(training: Training): + """Uses: + - training.training_folder to create the file structure. + - training.image_data to create the image links and annotations. + - training.categories to create the annotations.""" path = training.training_folder Path(path).mkdir(parents=True, exist_ok=True) @@ -154,13 +155,13 @@ def create_file_structure(training: Training): logging.info(f'Prepared file structure with {num_train_imgs} training images and {num_test_imgs} test images') -def update_hyps(yaml_path: str, hyperparameter: Hyperparameter): +def update_hyps(yaml_path: str, hyperparameter: Dict[str, Any]): with open(yaml_path) as f: content = yaml.load(f) - content['fliplr'] = 0.5 if hyperparameter.flip_rl else 0 - content['flipud'] = 0.5 if hyperparameter.flip_ud else 0 + content['fliplr'] = 0.5 if hyperparameter['flip_rl'] else 0 + content['flipud'] = 0.5 if hyperparameter['flip_ud'] else 0 with open(yaml_path, 'w') as f: yaml.dump(content, f) diff --git a/trainer/app_code/yolov5_trainer.py b/trainer/app_code/yolov5_trainer.py index 1e530fb..6c13f0b 100644 --- a/trainer/app_code/yolov5_trainer.py +++ b/trainer/app_code/yolov5_trainer.py @@ -17,7 +17,8 @@ PointDetection, PretrainedModel, TrainingStateData) from learning_loop_node.trainer import trainer_logic -from learning_loop_node.trainer.exceptions import NodeNeedsRestartError +from 
learning_loop_node.trainer.exceptions import (CriticalError, + NodeNeedsRestartError) from learning_loop_node.trainer.executor import Executor from . import batch_size_calculation, model_files, yolov5_format @@ -65,7 +66,7 @@ async def _start_training_from_base_model(self) -> None: await self._start_training_from_model(f'{self.training.training_folder}/model.pt') async def _start_training_from_scratch(self) -> None: - await self._start_training_from_model(f'yolov5{self.training.base_model_uuid_or_name}.pt') + await self._start_training_from_model(f'yolov5{self.training.model_variant}.pt') def _can_resume(self) -> bool: path = self.training.training_folder_path / 'result/weights/published/latest.pt' @@ -92,6 +93,11 @@ def _get_new_best_training_state(self) -> Optional[TrainingStateData]: if not weightfile: return None + if self.is_cla: + epoch = None + else: + epoch = model_files.epoch_from_weightfile(weightfile) + weightfile_str = str(weightfile.absolute()) logging.info(f'found new best model at {weightfile_str}') @@ -101,7 +107,7 @@ def _get_new_best_training_state(self) -> Optional[TrainingStateData]: for category_name in list(metrics.keys()): metrics[categories[category_name]] = metrics.pop(category_name) - return TrainingStateData(confusion_matrix=metrics, meta_information={'weightfile': weightfile_str}) + return TrainingStateData(confusion_matrix=metrics, meta_information={'weightfile': weightfile_str}, epoch=epoch) def _on_metrics_published(self, training_state_data: TrainingStateData) -> None: pub_path = (self.training.training_folder_path / 'result/weights/published').absolute() @@ -116,8 +122,8 @@ def _on_metrics_published(self, training_state_data: TrainingStateData) -> None: async def _get_latest_model_files(self) -> Dict[str, List[str]]: weightfile = (self.training.training_folder_path / "result/weights/published/latest.pt").absolute() if not os.path.isfile(weightfile): - logging.warning(f'No model found at {weightfile}') - return {} + 
logging.error(f'No model found at {weightfile} - Training failed!') + raise CriticalError(f'No model found at {weightfile}') shutil.copy(weightfile, '/tmp/model.pt') training_path = '/'.join(str(weightfile).split('/')[:-4]) @@ -190,7 +196,7 @@ async def _start_training_from_model(self, model: str) -> None: def move_and_update_hyps(source: Path, target: str) -> None: assert (source).exists(), 'Hyperparameter file not found at "{hyperparameter_source_path}"' shutil.copy(source, target) - yolov5_format.update_hyps(target, self.hyperparameter) + yolov5_format.update_hyps(target, self.hyperparameters) hyp_path = Path(__file__).resolve().parents[1] / ('hyp_cla.yaml' if self.is_cla else 'hyp_det.yaml') @@ -206,7 +212,7 @@ def move_and_update_hyps(source: Path, target: str) -> None: await self._start(model, " --clear") async def _start(self, model: str, additional_parameters: str = ''): - resolution = self.hyperparameter.resolution + resolution = self.training.hyperparameters['resolution'] hyperparameter_path = f'{self.training.training_folder}/hyp.yaml' self._load_hyps_set_epochs(hyperparameter_path) From b3f5b043aa9f558dae5e40455ce2a3ef694478f0 Mon Sep 17 00:00:00 2001 From: Niklas Neugebauer Date: Fri, 22 Nov 2024 12:10:28 +0100 Subject: [PATCH 2/5] refactor: rename `update_hyps` to `set_hyperparameters_in_file` --- trainer/app_code/tests/test_yolov5.py | 4 ++-- trainer/app_code/yolov5_format.py | 4 ++-- trainer/app_code/yolov5_trainer.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/trainer/app_code/tests/test_yolov5.py b/trainer/app_code/tests/test_yolov5.py index 7ada274..2999d92 100644 --- a/trainer/app_code/tests/test_yolov5.py +++ b/trainer/app_code/tests/test_yolov5.py @@ -19,7 +19,7 @@ from ruamel.yaml import YAML from .. 
import model_files, yolov5_format -from ..yolov5_format import update_hyps +from ..yolov5_format import set_hyperparameters_in_file from ..yolov5_trainer import Yolov5TrainerLogic # pylint: disable=protected-access,unused-argument @@ -296,7 +296,7 @@ def assert_yaml_content(yaml_path, **kwargs): 'flip_ud': True} assert_yaml_content('/tmp/hyp.yaml', fliplr=0, flipud=0) - update_hyps('/tmp/hyp.yaml', hyperparameter) + set_hyperparameters_in_file('/tmp/hyp.yaml', hyperparameter) assert_yaml_content('/tmp/hyp.yaml', fliplr=0.5, flipud=0.5) # ======================================================================================================================= diff --git a/trainer/app_code/yolov5_format.py b/trainer/app_code/yolov5_format.py index b823fbd..a24951d 100644 --- a/trainer/app_code/yolov5_format.py +++ b/trainer/app_code/yolov5_format.py @@ -100,7 +100,7 @@ def _create_set_cla(training: Training, set_name: str): # │ │ ├── image2.jpg count = 0 - assert training.image_data is not None, 'Training should have data' + assert training.image_data is not None, 'Training should have image data' for image in training.image_data: if image['set'] == set_name: image_name = image['id'] + '.jpg' @@ -155,7 +155,7 @@ def create_file_structure(training: Training): logging.info(f'Prepared file structure with {num_train_imgs} training images and {num_test_imgs} test images') -def update_hyps(yaml_path: str, hyperparameter: Dict[str, Any]): +def set_hyperparameters_in_file(yaml_path: str, hyperparameter: Dict[str, Any]): with open(yaml_path) as f: content = yaml.load(f) diff --git a/trainer/app_code/yolov5_trainer.py b/trainer/app_code/yolov5_trainer.py index 6c13f0b..69b74a1 100644 --- a/trainer/app_code/yolov5_trainer.py +++ b/trainer/app_code/yolov5_trainer.py @@ -196,7 +196,7 @@ async def _start_training_from_model(self, model: str) -> None: def move_and_update_hyps(source: Path, target: str) -> None: assert (source).exists(), 'Hyperparameter file not found at 
"{hyperparameter_source_path}"' shutil.copy(source, target) - yolov5_format.update_hyps(target, self.hyperparameters) + yolov5_format.set_hyperparameters_in_file(target, self.hyperparameters) hyp_path = Path(__file__).resolve().parents[1] / ('hyp_cla.yaml' if self.is_cla else 'hyp_det.yaml') From ecacec7f8a2aeb20d9b0b7424d1d2b5d529aec71 Mon Sep 17 00:00:00 2001 From: Niklas Neugebauer Date: Fri, 22 Nov 2024 12:12:13 +0100 Subject: [PATCH 3/5] log: use lazy logging --- trainer/app_code/yolov5_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trainer/app_code/yolov5_trainer.py b/trainer/app_code/yolov5_trainer.py index 69b74a1..169dd3a 100644 --- a/trainer/app_code/yolov5_trainer.py +++ b/trainer/app_code/yolov5_trainer.py @@ -122,7 +122,7 @@ def _on_metrics_published(self, training_state_data: TrainingStateData) -> None: async def _get_latest_model_files(self) -> Dict[str, List[str]]: weightfile = (self.training.training_folder_path / "result/weights/published/latest.pt").absolute() if not os.path.isfile(weightfile): - logging.error(f'No model found at {weightfile} - Training failed!') + logging.error('No model found at %s - Training failed!', weightfile) raise CriticalError(f'No model found at {weightfile}') shutil.copy(weightfile, '/tmp/model.pt') From 5efa7d82aa23a0690c3785ca3a27d7fe2292fc1f Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Mon, 25 Nov 2024 16:57:07 +0100 Subject: [PATCH 4/5] bump trainer to nodelib 0.12.0 --- trainer/docker.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/trainer/docker.sh b/trainer/docker.sh index 55769cb..7ac885c 100755 --- a/trainer/docker.sh +++ b/trainer/docker.sh @@ -44,8 +44,8 @@ fi # NODE_LIB_VERSION should only be used, to build the corresponding version and deploy to docker # make sure the remote repository always has the 'latest' tag (otherwise the CI tests will fail) -SEMANTIC_VERSION=0.1.7 -NODE_LIB_VERSION=0.11.1 +SEMANTIC_VERSION=0.1.8 +NODE_LIB_VERSION=0.12.0 if [ "$2" = "test_latest" ]; then image="zauberzeug/yolov5-trainer:latest" From 6c5bcaf4a15a18b32036e1c9d717b946859594a3 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Tue, 26 Nov 2024 09:26:47 +0100 Subject: [PATCH 5/5] raise critical error on failure of batch size calculation --- trainer/app_code/yolov5_trainer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/trainer/app_code/yolov5_trainer.py b/trainer/app_code/yolov5_trainer.py index 169dd3a..12bbb08 100644 --- a/trainer/app_code/yolov5_trainer.py +++ b/trainer/app_code/yolov5_trainer.py @@ -224,8 +224,13 @@ async def _start(self, model: str, additional_parameters: str = ''): else: p_ids, p_sizes = yolov5_format.get_ids_and_sizes_of_point_classes(self.training) self._try_replace_optimized_hyperparameter() - batch_size = await batch_size_calculation.calc(self.training.training_folder, model, hyperparameter_path, - f'{self.training.training_folder}/dataset.yaml', resolution) + try: + batch_size = await batch_size_calculation.calc(self.training.training_folder, model, hyperparameter_path, + f'{self.training.training_folder}/dataset.yaml', resolution) + except Exception as e: + logging.error(f'Error during batch size calculation: {e}') + raise NodeNeedsRestartError() from e + cmd = f'python /app/train_det.py --exist-ok --patience {self.patience} \ --batch-size
{batch_size} --img {resolution} --data dataset.yaml --weights {model} \ --project {self.training.training_folder} --name result --hyp {hyperparameter_path} \