Merge branch 'fuse_cla_and_new_nodelib'
denniswittich committed Oct 24, 2023
2 parents 2afd557 + 88267bc commit df7ff1d
Showing 157 changed files with 21,024 additions and 1,249 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,3 +3,4 @@ __pycache__/
*.wts
*.engine
*.pt
trainer/runs
28 changes: 24 additions & 4 deletions README.md
@@ -1,6 +1,25 @@
# Yolov5 Nodes

Learning Loop Trainer and Detector Node for Yolov5. Based on https://github.com/ultralytics/yolov5
Learning Loop Trainer and Detector Node for Yolov5 (object detection and classification of images). The deep learning part is based on https://github.com/ultralytics/yolov5
This repository is an implementation of Nodes that interact with the [Zauberzeug Learning Loop](https://github.com/zauberzeug/learning_loop_node).


# Trainer

This node is used to train Yolov5 models in the Learning Loop. It is based on [this image](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-07.html) running Python 3.10.

## Images

Trainer Docker images are published at https://hub.docker.com/r/zauberzeug/yolov5-trainer

Images can be pulled with `docker pull zauberzeug/yolov5-trainer:latest`.
The script `docker.sh` in the folder `training` is recommended for interacting with this image.
A `.env` file must be set up in the training folder with the following values (a minimal example of reading them follows the list):

- HOST=learning-loop.ai
- ORGANIZATION=zauberzeug
- PROJECT=demo
- YOLOV5_MODE=CLASSIFICATION | DETECTION
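
A minimal sketch of how these values could be read inside the container (illustrative only, not the node's actual startup code; `python-dotenv` is installed in the trainer image, see the Dockerfile):

```python
# Illustrative sketch: read the .env values described above.
import os

from dotenv import load_dotenv

load_dotenv()  # picks up the .env file from the current working directory
host = os.environ['HOST']
organization = os.environ['ORGANIZATION']
project = os.environ['PROJECT']
mode = os.environ.get('YOLOV5_MODE', 'DETECTION')
print(f'Connecting to {host} / {organization} / {project} in {mode} mode')
```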

# Detector

@@ -17,6 +36,7 @@ Mandatory parameters (Please adapt as needed):
- HOST=learning-loop.ai
- ORGANIZATION=zauberzeug
- PROJECT=demo
- YOLOV5_MODE=CLASSIFICATION | DETECTION

## Publish a new release

@@ -48,6 +68,6 @@ with open('test.jpg', 'rb') as f:

# Formats

The trainer uses the `yolov5_pytorch` format identifyer.
When it saves a model to the Learning Loop it saves the model as `yolov5_pytorch` and `yolov5_wts`.
The latter is used by the detector to create an engine file as required by tensorrtx (see https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5).
The trainer uses the `yolov5_pytorch` format identifier (`yolov5_cla_pytorch` for classification).
When it saves a model to the Learning Loop it saves the model as `yolov5_pytorch` and `yolov5_wts` (respectively `yolov5_cla_pytorch` and `yolov5_cla_wts` for classification).
The wts formats may be used by a detector running on an NVIDIA Jetson device to create an engine file as required by tensorrtx (see https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5).
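
As a rough illustration only (this mapping is inferred from the identifiers named above, not taken from the node code), the format pair could be chosen from the `YOLOV5_MODE` value:

```python
# Illustrative only: map YOLOV5_MODE to the Learning Loop format identifiers listed above.
import os

if os.environ.get('YOLOV5_MODE', 'DETECTION') == 'CLASSIFICATION':
    model_format, wts_format = 'yolov5_cla_pytorch', 'yolov5_cla_wts'
else:
    model_format, wts_format = 'yolov5_pytorch', 'yolov5_wts'
print(model_format, wts_format)
```
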
4 changes: 3 additions & 1 deletion trainer/.vscode/settings.json
@@ -14,8 +14,10 @@
"--disable=C0114", // Missing module docstring
"--disable=C0301", // Line too long (exceeds character limit)
"--disable=W0718", // Catching too general exception
"--disable=W0719", // Rraising too general exception
"--disable=W1203", // Use % formatting in logging functions and pass the % parameters as arguments
"--disable=W1514" // Using open without explicitly specifying an encoding
"--disable=W1514", // Using open without explicitly specifying an encoding
"--generated-members=numpy.* ,torch.*,cv2.*" // Required because pylint doesn't recognize numpy and torch methods
],
"[python]": {
"editor.defaultFormatter": "ms-python.autopep8",
38 changes: 18 additions & 20 deletions trainer/Dockerfile
@@ -18,40 +18,38 @@ RUN apt update && \
apt install -y zip htop screen libgl1-mesa-glx libmpich-dev jpeginfo && \
rm -rf /var/lib/apt/lists/*
RUN python3 -m pip install --upgrade pip

# TODO httptools is required but the tests run even if it is not installed ... we need to extend the tests
RUN python3 -m pip install seaborn thop coremltools onnx gsutil notebook wandb>=0.12.2 opencv-python==4.5.5.64 torchinfo python-dotenv httptools
RUN python3 -m pip install seaborn thop coremltools onnx gsutil notebook wandb>=0.12.2 opencv-python==4.5.5.64 torchinfo python-dotenv httptools dacite
# https://githubmemory.com/repo/ultralytics/yolov5/issues/5374
RUN pip install --no-cache -U torch torchvision numpy Pillow
# CHECK IF THIS IS NEEDED because the image is already based on pytorch

WORKDIR /

# https://githubmemory.com/repo/ultralytics/yolov5/issues/5374
RUN pip install --no-cache -U torch torchvision numpy Pillow

# fetch yolov5 code
RUN git clone https://github.com/ultralytics/yolov5.git
WORKDIR /yolov5
RUN git checkout 342fe05e6c88221750ce7e90b7d2e8baabd397dc
COPY *.patch ./
RUN git apply confusion_matrices.patch && git apply clear_epochs.patch && git apply ignore_previous_fitness.patch && rm *.patch
# TODO there is no requirements.txt file ?!
RUN python3 -m pip install --no-cache -r requirements.txt
RUN pip uninstall -y nvidia-tensorboard nvidia-tensorboard-plugin-dlprof
# wts file generator for detector format
RUN wget https://raw.githubusercontent.com/wang-xinyu/tensorrtx/master/yolov5/gen_wts.py
# fetch trainer code
COPY ./ /yolov5_node/trainer/


# setup python
RUN python3 -m pip install --no-cache -r /yolov5_node/trainer/app_code/yolov5/requirements.txt
RUN pip uninstall -y nvidia-tensorboard nvidia-tensorboard-plugin-dlprof
RUN python3 -m pip install autopep8 debugpy gunicorn pyyaml uvloop pytest-asyncio ruamel.yaml
# The LL_NODE library can be overwritten by a local version if the environment variable LINKLL is set to TRUE
RUN python3 -m pip install "learning_loop_node==v0.7.55"

ADD ./ /yolov5_node/trainer/
RUN ln -sf /yolov5_node/trainer /app

# link trainer code to /app and yolov5 into /app/app_code/yolov5
RUN ln -sf /yolov5_node/trainer /app
#RUN ln -sf /yolov5 /app/app_code/yolov5
RUN git config --global --add safe.directory /yolov5_node && git config --global --add safe.directory /learning_loop_node

WORKDIR /app

EXPOSE 80

ENV PYTHONPATH="$PYTHONPATH:/yolov5"
#ENV PYTHONPATH="$PYTHONPATH:/yolov5"
ENV PYTHONPATH="$PYTHONPATH:/"
ENV PYTHONPATH="$PYTHONPATH:/opt/conda/lib/python3.8/site-packages"
# TODO CHECK
#ENV PYTHONPATH="$PYTHONPATH:/opt/conda/lib/python3.10/site-packages"
ENV TZ=Europe/Amsterdam
CMD /app/start.sh
Empty file added trainer/app_code/__init__.py
Empty file.
@@ -1,13 +1,15 @@
import asyncio
import logging
import os
from multiprocessing import Process, Queue

import torch
import yaml
from learning_loop_node.trainer import trainer_utils
from models.yolo import Model
from learning_loop_node.helpers.misc import get_free_memory_mb
from torchinfo import Verbosity, summary
from utils.downloads import attempt_download

from .yolov5.models.yolo import Model
from .yolov5.utils.downloads import attempt_download


async def calc(training_path: str, model_file: str, hyp_path: str, dataset_path: str, img_size: int) -> int:
@@ -20,7 +22,7 @@ async def calc(training_path: str, model_file: str, hyp_path: str, dataset_path:
await asyncio.sleep(1)
logging.warning('still calculating best batch size')
except asyncio.CancelledError:
logging.warning('during batch size calculation, the training was cancelled')
logging.warning('the training was cancelled during batch size calculation')
p.kill()
raise

@@ -35,8 +37,8 @@ async def calc(training_path: str, model_file: str, hyp_path: str, dataset_path:

def _calc_batch_size(
queue: Queue, training_path: str, model_file: str, hyp_path: str, dataset_path: str, img_size: int) -> None:
logging.error('calc_batch_size.....')
import os
logging.info('calc_batch_size.....')

os.chdir('/tmp')

with open(hyp_path) as f:
@@ -51,7 +53,7 @@ def _calc_batch_size(
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
logging.error(f'{t}, {r}, {a}')
free_mem = trainer_utils.get_free_memory_mb()
free_mem = get_free_memory_mb()
fraction = 0.95
free_mem *= fraction
logging.info(f'{fraction:.0%} of free memory ({free_mem}) in use')
@@ -61,7 +63,7 @@ def _calc_batch_size(
try:
ckpt = torch.load(model_file, map_location=device)
except FileNotFoundError:
# Continued Training
# Continue Training
ckpt = torch.load(f'{training_path}/{model_file}', map_location=device)

model = Model(ckpt['model'].yaml, ch=3, nc=dataset.get('nc'), anchors=hyp.get('anchors')).to(device) # create
@@ -96,5 +98,5 @@ def _calc_batch_size(
queue.put(best_batch_size)
logging.info(f'found best matching batch size: {best_batch_size}')
else:
logging.error(f'could not find best matching batch size')
raise Exception('could not find best matching batch size')
logging.error('Did not find best matching batch size')
raise Exception('Did not find best matching batch size')
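
For orientation, a rough usage sketch of the `calc` helper above; the argument values are made up and the module path is an assumption:

```python
# Rough usage sketch; values are illustrative and the import path is assumed.
import asyncio

from app_code.batch_size_calculation import calc  # assumed module name


async def main() -> None:
    best = await calc(
        training_path='/data/trainings/demo',
        model_file='model.pt',
        hyp_path='hyp.yaml',
        dataset_path='dataset.yaml',
        img_size=800,
    )
    print(f'best batch size: {best}')

asyncio.run(main())
```
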
12 changes: 6 additions & 6 deletions trainer/detect.py → trainer/app_code/detect.py
@@ -2,12 +2,12 @@
import argparse

import torch
import torch.backends.cudnn as cudnn
import torch_tensorrt
from models.experimental import attempt_load
from utils.datasets import LoadImages
from utils.general import check_img_size
from utils.torch_utils import select_device

from .yolov5.models.experimental import attempt_load
from .yolov5.utils.dataloaders import LoadImages
from .yolov5.utils.general import check_img_size
from .yolov5.utils.torch_utils import select_device

parser = argparse.ArgumentParser()
parser.add_argument('--weights', nargs='+', type=str, default='model.pt', help='model.pt path')
@@ -17,7 +17,7 @@
imgsz = 800
device = select_device('0') # may be replaced with "cuda"

model = attempt_load(args.weights, map_location=device) # load FP32 model
model = attempt_load(args.weights, device=device) # load FP32 model
imgsz = check_img_size(imgsz, s=model.stride.max()) # check img_size
model.eval() # set model to "inference mode"
model.model[-1].export = True # set Detect() layer export=True
58 changes: 58 additions & 0 deletions trainer/app_code/model_files.py
@@ -0,0 +1,58 @@
import logging
import os
from pathlib import Path
from typing import List, Optional, Union


def get_best(training_path: Path) -> Optional[Path]:
path = training_path / 'result/weights'
if not path.exists():
return None
weightfiles = [path / f for f in os.listdir(path) if 'best' in f and f.endswith('.pt')]
if len(weightfiles) == 0:
return None
return weightfiles[0]


def get_all_weightfiles(training_path: Path) -> List[Path]:
path = (training_path / 'result/weights').absolute()
if not path.exists():
return []
weightfiles = [path / f for f in os.listdir(path) if 'epoch' in f and f.endswith('.pt')]
return weightfiles


def _epoch_from_weightfile(weightfile: Path) -> int:
number = weightfile.name[5:-3]
if number == '':
return 0
return int(number)


def delete_older_epochs(training_path: Path, weightfile: Path):
all_weightfiles = get_all_weightfiles(training_path)

target_epoch = _epoch_from_weightfile(weightfile)
for f in all_weightfiles:
if _epoch_from_weightfile(f) < target_epoch:
_try_remove(f)
delete_json_for_weightfile(f)


def delete_json_for_weightfile(weightfile: Path):
_try_remove(weightfile.with_suffix('.json'))


def _try_remove(file: Path):
try:
os.remove(file)
except Exception:
logging.exception(f'could not remove {file}')


def get_new(training_path: Path) -> Union[Path, None]:
all_weightfiles = get_all_weightfiles(training_path)
if all_weightfiles:
all_weightfiles.sort(key=_epoch_from_weightfile)
return all_weightfiles[-1]
return None
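
A short usage sketch of the helpers above (the training path is made up; the import assumes the working directory is `trainer/`):

```python
# Illustrative usage of the model_files helpers; the path is invented for the example.
from pathlib import Path

from app_code import model_files

training_path = Path('/data/trainings/demo')
newest = model_files.get_new(training_path)   # newest epoch<N>.pt, or None
best = model_files.get_best(training_path)    # result/weights/best*.pt, or None
if newest is not None:
    # drop older epoch checkpoints and their .json sidecar files
    model_files.delete_older_epochs(training_path, newest)
print(newest, best)
```
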
Empty file.
105 changes: 105 additions & 0 deletions trainer/app_code/tests/conftest.py
@@ -0,0 +1,105 @@
import asyncio
import logging
import os
import shutil
import subprocess
from typing import Dict

import icecream
import pytest
from _pytest.fixtures import SubRequest
# from dotenv import load_dotenv
from learning_loop_node.data_classes import Context
from learning_loop_node.data_exchanger import DataExchanger
from learning_loop_node.loop_communication import LoopCommunicator

icecream.install()
logging.basicConfig(level=logging.INFO)

# load_dotenv()

# -------------------- Session fixtures --------------------


@pytest.fixture(scope="session")
def event_loop():
"""Overrides pytest default function scoped event loop"""
policy = asyncio.get_event_loop_policy()
loop = policy.new_event_loop()
yield loop
loop.close()


@pytest.fixture(scope="session")
def prepare_model():
"""Download model for testing"""
if not os.path.exists('app_code/tests/test_data/model.pt'):
url = 'https://github.com/ultralytics/yolov5/releases/download/v6.0/yolov5n.pt'
result = subprocess.run(f'curl -L {url} -o app_code/tests/test_data/model.pt', shell=True, check=True)
assert result.returncode == 0
assert os.path.exists('app_code/tests/test_data/model.pt')
yield

# -------------------- Class marks --------------------


@pytest.fixture(autouse=True, scope='class')
async def check_marks(request: SubRequest, glc: LoopCommunicator): # pylint: disable=redefined-outer-name
"""Set environment variables for testing and generate project if requested"""

markers = list(request.node.iter_markers('environment'))
assert len(markers) <= 1, 'Only one environment marker allowed'
if len(markers) == 1:
marker = markers[0]
os.environ['LOOP_ORGANIZATION'] = marker.kwargs['organization']
os.environ['LOOP_PROJECT'] = marker.kwargs['project']
os.environ['YOLOV5_MODE'] = marker.kwargs['mode']

markers = list(request.node.iter_markers('generate_project'))
assert len(markers) <= 1, 'Only one generate_project marker allowed'
if len(markers) == 1:
marker = markers[0]
configuration: Dict = marker.kwargs['configuration']
project = configuration['project_name']
# May not return 200 if project does not exist
await glc.delete(f"/zauberzeug/projects/{project}?keep_images=true")
await asyncio.sleep(1)
assert (await glc.post("/zauberzeug/projects/generator", json=configuration)).status_code == 200
await asyncio.sleep(1)
yield
# assert (await lc.delete(f"/zauberzeug/projects/{project}?keep_images=true")).status_code == 200
else:
yield


# -------------------- Optional fixtures --------------------

@pytest.fixture(scope="session")
async def glc():
"""The same LoopCommunicator is used for all tests
Credentials are read from environment variables"""

lc = LoopCommunicator()
await lc.ensure_login()
yield lc
await lc.shutdown()


@pytest.fixture()
def data_exchanger(glc: LoopCommunicator): # pylint: disable=redefined-outer-name
context = Context(organization=os.environ['LOOP_ORGANIZATION'], project=os.environ['LOOP_PROJECT'])
dx = DataExchanger(context, glc)
yield dx


@pytest.fixture()
def use_training_dir(prepare_model, request: SubRequest):
"""Step into a temporary directory for training tests and back out again"""

shutil.rmtree('/tmp/test_training', ignore_errors=True)
os.makedirs('/tmp/test_training', exist_ok=True)
shutil.copyfile('app_code/tests/test_data/model.pt', '/tmp/test_training/model.pt')
os.chdir('/tmp/test_training/')
yield
shutil.rmtree('/tmp/test_training', ignore_errors=True)
os.chdir(request.config.invocation_dir)
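
To show how the `environment` and `generate_project` markers above are meant to be used, a test class could be decorated roughly like this (the project name and the configuration contents beyond `project_name` are illustrative):

```python
# Illustrative test skeleton; organization/project/mode values are examples only.
import os

import pytest


@pytest.mark.environment(organization='zauberzeug', project='pytest_yolov5', mode='DETECTION')
@pytest.mark.generate_project(configuration={'project_name': 'pytest_yolov5'})
class TestTrainerSmoke:

    @pytest.mark.usefixtures('use_training_dir')
    def test_model_file_is_present(self):
        # use_training_dir copies the downloaded model.pt into the temporary training dir
        assert os.path.exists('model.pt')
```
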
File renamed without changes.
File renamed without changes.
21 changes: 21 additions & 0 deletions trainer/app_code/tests/test_data/hyp_cla.yaml
@@ -0,0 +1,21 @@
# Own set of hyps that are forwarded to albumentation

# Optimizer is hardcoded to SGD

lr0: 0.001 # initial learning rate (SGD=1E-2, Adam=1E-3)
momentum: 0.843 # SGD momentum/Adam beta1
decay: 0.00001 # optimizer weight decay
label_smoothing: 0.1 # Label smoothing epsilon
batch_size: 4
epochs: 3

# Augmentation
jitter: 0.4 # colour jitter for brightness, contrast, saturation (hue is c-jitter/2)
hue_jitter: 0.1
min_scale: 0.1 # minimum image scale for augmentation
min_ratio: 0.75 # minimum aspect ratio for augmentation
r90_prob: 0.0 # rotate 90 probability

# Maybe overwritten by learning loop
hflip: 0.5 # horizontal flip probability
vflip: 0.5 # vertical flip probability
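
For reference, this file can be read back with PyYAML in the usual way (the path assumes the repository layout shown in this diff):

```python
# Minimal sketch: load the classification hyperparameters shipped with the tests.
import yaml

with open('trainer/app_code/tests/test_data/hyp_cla.yaml') as f:
    hyp = yaml.safe_load(f)

print(hyp['lr0'], hyp['batch_size'], hyp['epochs'], hyp['hflip'])
```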