diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..891374c --- /dev/null +++ b/.gitignore @@ -0,0 +1,115 @@ +data/* +!data/download.sh +model/output/ + +#=========================================================================== + +# Created by https://www.gitignore.io/api/python +# Edit at https://www.gitignore.io/?templates=python + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# End of https://www.gitignore.io/api/python + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8e95ed3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright 2020 Institute for Automotive Engineering of RWTH Aachen University. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b31bc8d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,219 @@
+# Cam2BEV
+
+
+
+This repository contains the official implementation of our methodology for the computation of a semantically segmented bird's eye view (BEV) image given the images of multiple vehicle-mounted cameras as presented in our paper:
+
+> **A Sim2Real Deep Learning Approach for the Transformation of Images from Multiple Vehicle-Mounted Cameras to a Semantically Segmented Image in Bird’s Eye View** ([arXiv](https://arxiv.org/abs/XXXX.XXXXX))
+>
+> Lennart Reiher, [Bastian Lampe](https://www.ika.rwth-aachen.de/en/institute/staff/bastian-lampe-m-sc.html), and [Lutz Eckstein](https://www.ika.rwth-aachen.de/en/institute/management/univ-prof-dr-ing-lutz-eckstein.html)
+> [Institute for Automotive Engineering (ika), RWTH Aachen University](https://www.ika.rwth-aachen.de/en/)
+
+> _**Abstract**_ — Accurate environment perception is essential for automated driving. When using monocular cameras, the distance estimation of elements in the environment poses a major challenge. Distances can be more easily estimated when the camera perspective is transformed to a bird's eye view (BEV). For flat surfaces, _Inverse Perspective Mapping_ (IPM) can accurately transform images to a BEV. Three-dimensional objects such as vehicles and vulnerable road users are distorted by this transformation, making it difficult to estimate their position relative to the sensor. This paper describes a methodology to obtain a corrected 360° BEV image given images from multiple vehicle-mounted cameras. The corrected BEV image is segmented into semantic classes and includes a prediction of occluded areas. The neural network approach does not rely on manually labeled data, but is trained on a synthetic dataset in such a way that it generalizes well to real-world data. By using semantically segmented images as input, we reduce the reality gap between simulated and real-world data and are able to show that our method can be successfully applied in the real world. Extensive experiments conducted on the synthetic data demonstrate the superiority of our approach compared to IPM.
+
+- [Repository Structure](#repository-structure)
+- [Installation](#installation)
+- [Data](#data)
+- [Preprocessing](#preprocessing)
+- [Training](#training)
+- [Neural Network Architectures](#neural-network-architectures)
+- [Customization](#customization)
+
+## Repository Structure
+
+```
+Cam2BEV
+├── data                        # where our synthetic datasets are downloaded to by default
+├── model                       # training scripts and configurations
+│   ├── architecture            # TensorFlow implementations of neural network architectures
+│   └── one_hot_conversion      # files defining the one-hot encoding of semantically segmented images
+└── preprocessing               # preprocessing scripts
+    ├── camera_configs          # files defining the intrinsics/extrinsics of the cameras used in our datasets
+    ├── homography_converter    # script to convert an OpenCV homography for usage within the uNetXST SpatialTransformers
+    ├── ipm                     # script for generating a classical homography image by means of IPM
+    └── occlusion               # script for introducing an occluded class to the BEV images
+```
+
+## Installation
+
+We suggest setting up a **Python 3.7** virtual environment (e.g. using _virtualenv_ or _conda_). Inside the virtual environment, users can then use _pip_ to install all package dependencies. The most important packages are _TensorFlow 2.1_ and _OpenCV 4.2_.
+```bash
+pip install -r requirements.txt
+```
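+
+To quickly check that the environment is set up as expected, you can import the two main dependencies and print their versions. This is only an illustrative snippet, not part of the repository:
+
+```python
+# Sanity check of the virtual environment (illustration only).
+import cv2
+import tensorflow as tf
+
+print("TensorFlow:", tf.__version__)                            # expected: 2.1.x
+print("OpenCV:", cv2.__version__)                               # expected: 4.2.x
+print("GPUs visible:", tf.config.list_physical_devices("GPU"))  # empty list means CPU-only training
+```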
+
+## Data
+
+We provide two synthetic datasets, which can be used to train the neural networks. The datasets are hosted in the [Cam2BEV Data Repository](https://gitlab.ika.rwth-aachen.de/cam2bev/cam2bev-data). Both datasets were used to produce the results presented in our paper:
+- [*Dataset 1_FRLR*](https://gitlab.ika.rwth-aachen.de/cam2bev/cam2bev-data/-/tree/master/1_FRLR): images from four vehicle-mounted cameras; ground-truth BEV image centered above the ego vehicle
+- [*Dataset 2_F*](https://gitlab.ika.rwth-aachen.de/cam2bev/cam2bev-data/-/tree/master/2_F): images from one frontal vehicle camera; ground-truth BEV image left-aligned with ego vehicle
+
+For more information regarding the data, please refer to the [repository's README](https://gitlab.ika.rwth-aachen.de/cam2bev/cam2bev-data).
+
+Both datasets can easily be downloaded and extracted by running the provided download script:
+```bash
+./data/download.sh
+```
+
+_**Note**: The download size is approximately 3.7GB; the uncompressed size of both datasets is approximately 7.7GB._
+
+## Preprocessing
+
+Our paper describes two preprocessing techniques:
+(1) introducing an _occluded_ class to the label images and
+(2) generating the homography image.
+
+### 1) Dealing with Occlusions
+
+Traffic participants and static obstacles may occlude parts of the environment, making predictions for those areas in a BEV image mostly impossible. In order to formulate a well-posed problem, an additional semantic class needs to be introduced to the label images for areas in the BEV that are occluded in the camera perspectives. To this end, [preprocessing/occlusion](preprocessing/occlusion/) can be used. See below for an example of the occlusion preprocessing.
+
+![original](preprocessing/occlusion/assets/example-original.png) ![occluded](preprocessing/occlusion/assets/example-occluded.png)
+
+
+Run the following command to process the original label images of _dataset 1_FRLR_ and introduce an _occluded_ class. You need to provide camera intrinsics/extrinsics for the drone camera and all vehicle-attached cameras (in the form of the yaml files).
+
+_**Note**: In batch mode, this script utilizes multiprocessing. It can, however, still take quite some time to process the entire dataset. Therefore, we also provide already preprocessed data._
+
+```bash
+cd preprocessing/occlusion
+```
+```bash
+./occlusion.py \
+    --batch ../../data/1_FRLR/train/bev \
+    --output ../../data/1_FRLR/train/bev+occlusion \
+    ../camera_configs/1_FRLR/drone.yaml \
+    ../camera_configs/1_FRLR/front.yaml \
+    ../camera_configs/1_FRLR/rear.yaml \
+    ../camera_configs/1_FRLR/left.yaml \
+    ../camera_configs/1_FRLR/right.yaml
+```
+
+See [preprocessing/occlusion/README.md](preprocessing/occlusion/README.md) for more information.
+
+### 2) Projective Preprocessing
+
+As part of incorporating the Inverse Perspective Mapping (IPM) technique into our methods, the homographies, i.e. the projective transformations between the vehicle camera frames and the BEV, need to be computed. As a preprocessing step to the first variation of our approach (Section III-C), IPM is applied to all images from the vehicle cameras. The transformation is set up to capture the same field of view as the ground truth BEV image. To this end, [preprocessing/ipm](preprocessing/ipm) can be used. See below for an example homography image computed from images of four vehicle-mounted cameras.
+
+![ipm](preprocessing/ipm/assets/example.png)
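+
+For intuition, the core of IPM can be sketched in a few lines: a 3×3 homography maps points on the flat ground plane into a camera image, and its inverse warps the camera image onto a metric BEV grid. The snippet below only illustrates this idea with made-up camera parameters; the actual implementation, including reading the yaml camera configs and fusing all cameras, is [preprocessing/ipm/ipm.py](preprocessing/ipm/ipm.py).
+
+```python
+# Illustrative IPM warp for a single camera (not the repository's ipm.py).
+# All numeric values are placeholders; real intrinsics/extrinsics come from
+# the yaml files in preprocessing/camera_configs.
+import cv2
+import numpy as np
+
+K = np.array([[460.0,   0.0, 480.0],              # example pinhole intrinsics
+              [  0.0, 460.0, 270.0],
+              [  0.0,   0.0,   1.0]])
+R = cv2.Rodrigues(np.array([-1.8, 0.0, 0.0]))[0]  # example rotation (camera pitched towards the road)
+t = np.array([[0.0], [1.5], [2.0]])               # example extrinsic translation (world origin in camera frame)
+
+# Homography mapping ground-plane points (x, y, 1) to image pixels:
+# for z = 0, K [R|t] (x, y, 0, 1)^T reduces to K [r1 r2 t] (x, y, 1)^T.
+H_ground_to_img = K @ np.hstack((R[:, :2], t))
+
+# Affine map from BEV pixel coordinates to metric ground coordinates
+# (vehicle in the image center, x pointing up, y pointing left).
+px_per_m, bev_h, bev_w = 20.0, 512, 1024
+H_bev_to_ground = np.array([[0.0, -1.0 / px_per_m, bev_h / (2 * px_per_m)],
+                            [-1.0 / px_per_m, 0.0, bev_w / (2 * px_per_m)],
+                            [0.0, 0.0, 1.0]])
+
+# Full homography from BEV pixels to camera pixels; warp with its inverse.
+H_bev_to_img = H_ground_to_img @ H_bev_to_ground
+camera_image = cv2.imread("front.png")            # example input image
+bev_image = cv2.warpPerspective(camera_image, np.linalg.inv(H_bev_to_img), (bev_w, bev_h))
+cv2.imwrite("front_ipm.png", bev_image)
+```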
+
+Run the following command to compute a homography BEV image from all camera images of _dataset 1_FRLR_. You need to provide camera intrinsics/extrinsics for the drone camera and all vehicle-attached cameras (in the form of the yaml files).
+
+_**Note**: To save time, we also provide already preprocessed data._
+
+```bash
+cd preprocessing/ipm
+```
+```bash
+./ipm.py --batch --cc \
+    --output ../../data/1_FRLR/train/homography \
+    --drone ../camera_configs/1_FRLR/drone.yaml \
+    ../camera_configs/1_FRLR/front.yaml \
+    ../../data/1_FRLR/train/front \
+    ../camera_configs/1_FRLR/rear.yaml \
+    ../../data/1_FRLR/train/rear \
+    ../camera_configs/1_FRLR/left.yaml \
+    ../../data/1_FRLR/train/left \
+    ../camera_configs/1_FRLR/right.yaml \
+    ../../data/1_FRLR/train/right
+```
+
+See [preprocessing/ipm/README.md](preprocessing/ipm/README.md) for more information.
+
+## Training
+
+Use the scripts [model/train.py](model/train.py), [model/evaluate.py](model/evaluate.py), and [model/predict.py](model/predict.py) to train a model, evaluate it on validation data, and make predictions on a testing dataset.
+
+Input directories, training parameters, and more can be set via CLI arguments or in a config file. Run the scripts with the `--help` flag or see one of the provided exemplary config files for reference. We provide config files for every combination of network and dataset:
+- [model/config.1_FRLR.deeplab-mobilenet.yml](model/config.1_FRLR.deeplab-mobilenet.yml)
+- [model/config.1_FRLR.deeplab-xception.yml](model/config.1_FRLR.deeplab-xception.yml)
+- [model/config.1_FRLR.unetxst.yml](model/config.1_FRLR.unetxst.yml)
+- [model/config.2_F.deeplab-mobilenet.yml](model/config.2_F.deeplab-mobilenet.yml)
+- [model/config.2_F.deeplab-xception.yml](model/config.2_F.deeplab-xception.yml)
+- [model/config.2_F.unetxst.yml](model/config.2_F.unetxst.yml)
+
+The following commands will guide you through training _uNetXST_ on _dataset 1_FRLR_.
+
+### Training
+
+Start training _uNetXST_ by passing the provided config file [model/config.1_FRLR.unetxst.yml](model/config.1_FRLR.unetxst.yml). Training stops automatically once the MIoU score on the validation dataset no longer improves.
+
+```bash
+cd model/
+```
+```bash
+./train.py -c config.1_FRLR.unetxst.yml
+```
+
+You can visualize training progress by pointing *TensorBoard* to the output directory (`model/output` by default). Training metrics will also be printed to `stdout`.
+
+### Evaluation
+
+Before evaluating your trained model, set the parameter `model-weights` to point to the `best_weights.hdf5` file in the `Checkpoints` folder of its model directory. Then run evaluation to compute a confusion matrix and class IoU scores.
+
+```bash
+./evaluate.py -c config.1_FRLR.unetxst.yml --model-weights output/<model-directory>/Checkpoints/best_weights.hdf5
+```
+
+The evaluation results are printed at the end of evaluation and are also exported to the `Evaluation` folder in your model directory.
+
+### Testing
+
+To actually see the predictions your network makes, try it out on unseen input images, such as the validation dataset. The predicted BEV images are exported to the directory specified by the parameter `output-dir-testing`.
+
+```bash
+./predict.py -c config.1_FRLR.unetxst.yml --model-weights output/<model-directory>/Checkpoints/best_weights.hdf5 --prediction-dir output/<model-directory>/Predictions
+```
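+
+Once predictions have been exported, a quick side-by-side comparison with the corresponding input and label images can be helpful. The following sketch is only an illustration; the sample file name and the `<model-directory>` placeholder are assumptions, not fixed repository paths:
+
+```python
+# Stack an input homography image, the predicted BEV image and the ground
+# truth label next to each other for visual inspection (illustration only).
+import cv2
+import numpy as np
+
+sample = "some_sample.png"   # example file name from the validation split
+images = [cv2.imread(p) for p in (
+    f"../data/1_FRLR/val/homography/{sample}",
+    f"output/<model-directory>/Predictions/{sample}",
+    f"../data/1_FRLR/val/bev+occlusion/{sample}",
+)]
+
+# resize all images to a common height and concatenate them horizontally
+height = min(img.shape[0] for img in images)
+row = np.hstack([cv2.resize(img, (img.shape[1] * height // img.shape[0], height)) for img in images])
+cv2.imwrite("comparison.png", row)
+```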
+
+## Neural Network Architectures
+
+We provide implementations for the use of the neural network architectures _DeepLab_ and _uNetXST_ in [model/architecture](model/architecture). _DeepLab_ comes with two different backbone networks: _MobileNetV2_ or _Xception_.
+
+### DeepLab
+
+The _DeepLab_ models expect the homography images computed by Inverse Perspective Mapping ([preprocessing/ipm](preprocessing/ipm)) as input.
+
+#### Configuration
+- set `model` to `architecture/deeplab_mobilenet.py` or `architecture/deeplab_xception.py`
+- set `input-training` and the other input directory parameters to the folders containing the homography images
+- comment out `unetxst-homographies` in the config file, or do not supply it via CLI
+
+### uNetXST
+
+The _uNetXST_ model contains SpatialTransformer units, which perform IPM inside the network. Therefore, when building the network, the homographies to transform images from each camera need to be provided.
+
+#### Configuration
+- set `model` to `architecture/uNetXST.py`
+- set `input-training` and the other input directory parameters to a list of folders containing the images from each camera (e.g. `[data/front, data/rear, data/left, data/right]`)
+- set `unetxst-homographies` to a Python file containing the homographies as a list of NumPy arrays stored in a variable `H` (e.g. `../preprocessing/homography_converter/uNetXST_homographies/1_FRLR.py`); a minimal sketch of such a file is shown below
+  - we provide these homographies for our two datasets in [preprocessing/homography_converter/uNetXST_homographies/1_FRLR.py](preprocessing/homography_converter/uNetXST_homographies/1_FRLR.py) and [preprocessing/homography_converter/uNetXST_homographies/2_F.py](preprocessing/homography_converter/uNetXST_homographies/2_F.py)
+  - in order to compute these homographies for different camera configurations, follow the instructions in [preprocessing/homography_converter](preprocessing/homography_converter)
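+
+For reference, such a file is just a plain Python module defining a list `H` with one 3x3 matrix per camera, in the same order as the input directories. The sketch below is hypothetical and its values are placeholders; the provided files in [preprocessing/homography_converter/uNetXST_homographies](preprocessing/homography_converter/uNetXST_homographies) contain the actual matrices for our datasets:
+
+```python
+# uNetXST_homographies/my_setup.py -- hypothetical example with placeholder values
+import numpy as np
+
+H = [  # one homography per camera, same order as in `input-training`
+    np.array([[ 4.651e-02,  9.237e-01, -1.181e+01],
+              [ 6.217e-01,  1.362e-02, -2.254e+01],
+              [ 3.427e-03,  9.556e-04,  1.000e+00]]),  # front (placeholder)
+    np.array([[-4.651e-02, -9.237e-01,  1.181e+01],
+              [-6.217e-01, -1.362e-02,  2.254e+01],
+              [ 3.427e-03,  9.556e-04,  1.000e+00]]),  # rear (placeholder)
+]
+```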
+
+## Customization
+
+#### _I want to set different training hyperparameters_
+
+Run the training script with the `--help` flag or have a look at one of the provided exemplary config files to see which parameters you can easily set.
+
+#### _I want the networks to work on more/fewer semantic classes_
+
+The image datasets we provide include all 30 _CityScapes_ class colors. How these are reduced to, say, 10 classes is defined in the one-hot conversion files in [model/one_hot_conversion](model/one_hot_conversion). Use the training parameters `--one-hot-palette-input` and `--one-hot-palette-label` to choose one of the files. You can easily create your own one-hot conversion file; the format is quite self-explanatory.
+
+If you adjust `--one-hot-palette-label`, you will also need to modify `--loss-weights`. Either omit the parameter to weight all output classes evenly, or compute new suitable loss weights. The weights found in the provided config files were computed (from the `model` directory) with the following Python snippet.
+```python
+import numpy as np
+import utils
+palette = utils.parse_convert_xml("one_hot_conversion/convert_9+occl.xml")
+dist = utils.get_class_distribution("../data/1_FRLR/train/bev+occlusion", (256, 512), palette)
+weights = np.log(np.reciprocal(list(dist.values())))
+print(weights)
+```
+
+#### _I want to use my own data_
+
+You will need to run the preprocessing methods on your own data. A rough outline of what you need to consider:
+- specify camera intrinsics/extrinsics similar to the files found in [preprocessing/camera_configs](preprocessing/camera_configs)
+- run [preprocessing/occlusion/occlusion.py](preprocessing/occlusion/occlusion.py)
+- run [preprocessing/ipm/ipm.py](preprocessing/ipm/ipm.py)
+- compute uNetXST-compatible homographies by following the instructions in [preprocessing/homography_converter](preprocessing/homography_converter)
+- adjust or create a new one-hot conversion file ([model/one_hot_conversion](model/one_hot_conversion))
+- set all training parameters in a dedicated config file
+- start training
diff --git a/assets/logo.png b/assets/logo.png
new file mode 100644
index 0000000..f18037c
Binary files /dev/null and b/assets/logo.png differ
diff --git a/assets/teaser.gif b/assets/teaser.gif
new file mode 100644
index 0000000..16babd1
Binary files /dev/null and b/assets/teaser.gif differ
diff --git a/data/download.sh b/data/download.sh
new file mode 100755
index 0000000..d46c6f1
--- /dev/null
+++ b/data/download.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+URL="https://gitlab.ika.rwth-aachen.de/cam2bev/cam2bev-data/-/archive/master/cam2bev-data-master.tar.gz"
+FILE="cam2bev-data.tar.gz"
+
+set -e
+cd $(dirname $0)
+
+echo "Downloading Cam2BEV Data from $URL to $(realpath $FILE) ..."
+wget -q --show-progress -c -O $FILE $URL
+
+echo -n "Extracting $FILE to $(pwd) ... "
+tar -xzf $FILE
+rm $FILE
+mv cam2bev-data-master/* .
+echo "done"
+
+for f in $(find . -name "*.tar.gz")
+do
+  echo -n "  Extracting $f ... "
+  tar -xzf $f -C "$(dirname $f)"
+  rm $f
+  echo "done"
+done
+
+echo "Successfully downloaded Cam2BEV Data to $(pwd)"
diff --git a/model/architecture/deeplab_mobilenet.py b/model/architecture/deeplab_mobilenet.py
new file mode 100644
index 0000000..bc42d1f
--- /dev/null
+++ b/model/architecture/deeplab_mobilenet.py
@@ -0,0 +1,41 @@
+# ==============================================================================
+# MIT License
+#
+# Copyright 2020 Institute for Automotive Engineering of RWTH Aachen University.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# ============================================================================== + +import tensorflow as tf + +from third_party.deeplabv3plus import Deeplabv3 + + +def get_network(input_shape, n_output_channels): + + deeplab = Deeplabv3(input_shape=input_shape, + classes=n_output_channels, + backbone="mobilenetv2", + weights=None, + activation="softmax") + + i = tf.keras.layers.Input(input_shape) + o = deeplab(i) + + return tf.keras.models.Model(i, o) diff --git a/model/architecture/deeplab_xception.py b/model/architecture/deeplab_xception.py new file mode 100644 index 0000000..7114e22 --- /dev/null +++ b/model/architecture/deeplab_xception.py @@ -0,0 +1,41 @@ +# ============================================================================== +# MIT License +# +# Copyright 2020 Institute for Automotive Engineering of RWTH Aachen University. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================== + +import tensorflow as tf + +from third_party.deeplabv3plus import Deeplabv3 + + +def get_network(input_shape, n_output_channels): + + deeplab = Deeplabv3(input_shape=input_shape, + classes=n_output_channels, + backbone="xception", + weights=None, + activation="softmax") + + i = tf.keras.layers.Input(input_shape) + o = deeplab(i) + + return tf.keras.models.Model(i, o) diff --git a/model/architecture/third_party/deeplabv3plus.py b/model/architecture/third_party/deeplabv3plus.py new file mode 100644 index 0000000..42cc54b --- /dev/null +++ b/model/architecture/third_party/deeplabv3plus.py @@ -0,0 +1,502 @@ +# ============================================================================== +# Copyright (c) 2018 Emil Zakirov +# +# This work is licensed under the terms of the MIT license. +# For a copy, see https://opensource.org/licenses/MIT. +# ============================================================================== + +# ============================================================================== +# https://github.com/bonlime/keras-deeplab-v3-plus +# Commit: d9e4d149b3ee75cae892be060c6e52f30f3009de +# ============================================================================== + + +# -*- coding: utf-8 -*- + +""" Deeplabv3+ model for Keras. 
+This model is based on TF repo: +https://github.com/tensorflow/models/tree/master/research/deeplab +On Pascal VOC, original model gets to 84.56% mIOU + +MobileNetv2 backbone is based on this repo: +https://github.com/JonathanCMitchell/mobilenet_v2_keras + +# Reference +- [Encoder-Decoder with Atrous Separable Convolution + for Semantic Image Segmentation](https://arxiv.org/pdf/1802.02611.pdf) +- [Xception: Deep Learning with Depthwise Separable Convolutions] + (https://arxiv.org/abs/1610.02357) +- [Inverted Residuals and Linear Bottlenecks: Mobile Networks for + Classification, Detection and Segmentation](https://arxiv.org/abs/1801.04381) +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from tensorflow.python.keras.models import Model +from tensorflow.python.keras import layers +from tensorflow.python.keras.layers import Input +from tensorflow.python.keras.layers import Lambda +from tensorflow.python.keras.layers import Activation +from tensorflow.python.keras.layers import Concatenate +from tensorflow.python.keras.layers import Add +from tensorflow.python.keras.layers import Dropout +from tensorflow.python.keras.layers import BatchNormalization +from tensorflow.python.keras.layers import Conv2D +from tensorflow.python.keras.layers import DepthwiseConv2D +from tensorflow.python.keras.layers import ZeroPadding2D +from tensorflow.python.keras.layers import GlobalAveragePooling2D +from tensorflow.python.keras.utils.layer_utils import get_source_inputs +from tensorflow.python.keras.utils.data_utils import get_file +from tensorflow.python.keras import backend as K +from tensorflow.python.keras.activations import relu +from tensorflow.python.keras.applications.imagenet_utils import preprocess_input + +WEIGHTS_PATH_X = "https://github.com/bonlime/keras-deeplab-v3-plus/releases/download/1.1/deeplabv3_xception_tf_dim_ordering_tf_kernels.h5" +WEIGHTS_PATH_MOBILE = "https://github.com/bonlime/keras-deeplab-v3-plus/releases/download/1.1/deeplabv3_mobilenetv2_tf_dim_ordering_tf_kernels.h5" +WEIGHTS_PATH_X_CS = "https://github.com/bonlime/keras-deeplab-v3-plus/releases/download/1.2/deeplabv3_xception_tf_dim_ordering_tf_kernels_cityscapes.h5" +WEIGHTS_PATH_MOBILE_CS = "https://github.com/bonlime/keras-deeplab-v3-plus/releases/download/1.2/deeplabv3_mobilenetv2_tf_dim_ordering_tf_kernels_cityscapes.h5" + + +def SepConv_BN(x, filters, prefix, stride=1, kernel_size=3, rate=1, depth_activation=False, epsilon=1e-3): + """ SepConv with BN between depthwise & pointwise. 
Optionally add activation after BN + Implements right "same" padding for even kernel sizes + Args: + x: input tensor + filters: num of filters in pointwise convolution + prefix: prefix before name + stride: stride at depthwise conv + kernel_size: kernel size for depthwise convolution + rate: atrous rate for depthwise convolution + depth_activation: flag to use activation between depthwise & poinwise convs + epsilon: epsilon to use in BN layer + """ + + if stride == 1: + depth_padding = 'same' + else: + kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1) + pad_total = kernel_size_effective - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + x = ZeroPadding2D((pad_beg, pad_end))(x) + depth_padding = 'valid' + + if not depth_activation: + x = Activation('relu')(x) + x = DepthwiseConv2D((kernel_size, kernel_size), strides=(stride, stride), dilation_rate=(rate, rate), + padding=depth_padding, use_bias=False, name=prefix + '_depthwise')(x) + x = BatchNormalization(name=prefix + '_depthwise_BN', epsilon=epsilon)(x) + if depth_activation: + x = Activation('relu')(x) + x = Conv2D(filters, (1, 1), padding='same', + use_bias=False, name=prefix + '_pointwise')(x) + x = BatchNormalization(name=prefix + '_pointwise_BN', epsilon=epsilon)(x) + if depth_activation: + x = Activation('relu')(x) + + return x + + +def _conv2d_same(x, filters, prefix, stride=1, kernel_size=3, rate=1): + """Implements right 'same' padding for even kernel sizes + Without this there is a 1 pixel drift when stride = 2 + Args: + x: input tensor + filters: num of filters in pointwise convolution + prefix: prefix before name + stride: stride at depthwise conv + kernel_size: kernel size for depthwise convolution + rate: atrous rate for depthwise convolution + """ + if stride == 1: + return Conv2D(filters, + (kernel_size, kernel_size), + strides=(stride, stride), + padding='same', use_bias=False, + dilation_rate=(rate, rate), + name=prefix)(x) + else: + kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1) + pad_total = kernel_size_effective - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + x = ZeroPadding2D((pad_beg, pad_end))(x) + return Conv2D(filters, + (kernel_size, kernel_size), + strides=(stride, stride), + padding='valid', use_bias=False, + dilation_rate=(rate, rate), + name=prefix)(x) + + +def _xception_block(inputs, depth_list, prefix, skip_connection_type, stride, + rate=1, depth_activation=False, return_skip=False): + """ Basic building block of modified Xception network + Args: + inputs: input tensor + depth_list: number of filters in each SepConv layer. 
len(depth_list) == 3 + prefix: prefix before name + skip_connection_type: one of {'conv','sum','none'} + stride: stride at last depthwise conv + rate: atrous rate for depthwise convolution + depth_activation: flag to use activation between depthwise & pointwise convs + return_skip: flag to return additional tensor after 2 SepConvs for decoder + """ + residual = inputs + for i in range(3): + residual = SepConv_BN(residual, + depth_list[i], + prefix + '_separable_conv{}'.format(i + 1), + stride=stride if i == 2 else 1, + rate=rate, + depth_activation=depth_activation) + if i == 1: + skip = residual + if skip_connection_type == 'conv': + shortcut = _conv2d_same(inputs, depth_list[-1], prefix + '_shortcut', + kernel_size=1, + stride=stride) + shortcut = BatchNormalization(name=prefix + '_shortcut_BN')(shortcut) + outputs = layers.add([residual, shortcut]) + elif skip_connection_type == 'sum': + outputs = layers.add([residual, inputs]) + elif skip_connection_type == 'none': + outputs = residual + if return_skip: + return outputs, skip + else: + return outputs + + +def relu6(x): + return relu(x, max_value=6) + + +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _inverted_res_block(inputs, expansion, stride, alpha, filters, block_id, skip_connection, rate=1): + in_channels = inputs.shape[-1] # inputs._keras_shape[-1] + pointwise_conv_filters = int(filters * alpha) + pointwise_filters = _make_divisible(pointwise_conv_filters, 8) + x = inputs + prefix = 'expanded_conv_{}_'.format(block_id) + if block_id: + # Expand + + x = Conv2D(expansion * in_channels, kernel_size=1, padding='same', + use_bias=False, activation=None, + name=prefix + 'expand')(x) + x = BatchNormalization(epsilon=1e-3, momentum=0.999, + name=prefix + 'expand_BN')(x) + x = Activation(relu6, name=prefix + 'expand_relu')(x) + else: + prefix = 'expanded_conv_' + # Depthwise + x = DepthwiseConv2D(kernel_size=3, strides=stride, activation=None, + use_bias=False, padding='same', dilation_rate=(rate, rate), + name=prefix + 'depthwise')(x) + x = BatchNormalization(epsilon=1e-3, momentum=0.999, + name=prefix + 'depthwise_BN')(x) + + x = Activation(relu6, name=prefix + 'depthwise_relu')(x) + + # Project + x = Conv2D(pointwise_filters, + kernel_size=1, padding='same', use_bias=False, activation=None, + name=prefix + 'project')(x) + x = BatchNormalization(epsilon=1e-3, momentum=0.999, + name=prefix + 'project_BN')(x) + + if skip_connection: + return Add(name=prefix + 'add')([inputs, x]) + + # if in_channels == pointwise_filters and stride == 1: + # return Add(name='res_connect_' + str(block_id))([inputs, x]) + + return x + + +def Deeplabv3(weights='pascal_voc', input_tensor=None, input_shape=(512, 512, 3), classes=21, backbone='mobilenetv2', + OS=16, alpha=1., activation=None): + """ Instantiates the Deeplabv3+ architecture + + Optionally loads weights pre-trained + on PASCAL VOC or Cityscapes. This model is available for TensorFlow only. + # Arguments + weights: one of 'pascal_voc' (pre-trained on pascal voc), + 'cityscapes' (pre-trained on cityscape) or None (random initialization) + input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) + to use as image input for the model. + input_shape: shape of input image. format HxWxC + PASCAL VOC model was trained on (512,512,3) images. 
None is allowed as shape/width + classes: number of desired classes. PASCAL VOC has 21 classes, Cityscapes has 19 classes. + If number of classes not aligned with the weights used, last layer is initialized randomly + backbone: backbone to use. one of {'xception','mobilenetv2'} + activation: optional activation to add to the top of the network. + One of 'softmax', 'sigmoid' or None + OS: determines input_shape/feature_extractor_output ratio. One of {8,16}. + Used only for xception backbone. + alpha: controls the width of the MobileNetV2 network. This is known as the + width multiplier in the MobileNetV2 paper. + - If `alpha` < 1.0, proportionally decreases the number + of filters in each layer. + - If `alpha` > 1.0, proportionally increases the number + of filters in each layer. + - If `alpha` = 1, default number of filters from the paper + are used at each layer. + Used only for mobilenetv2 backbone. Pretrained is only available for alpha=1. + + # Returns + A Keras model instance. + + # Raises + RuntimeError: If attempting to run this model with a + backend that does not support separable convolutions. + ValueError: in case of invalid argument for `weights` or `backbone` + + """ + + if not (weights in {'pascal_voc', 'cityscapes', None}): + raise ValueError('The `weights` argument should be either ' + '`None` (random initialization), `pascal_voc`, or `cityscapes` ' + '(pre-trained on PASCAL VOC)') + + if not (backbone in {'xception', 'mobilenetv2'}): + raise ValueError('The `backbone` argument should be either ' + '`xception` or `mobilenetv2` ') + + if input_tensor is None: + img_input = Input(shape=input_shape) + else: + img_input = input_tensor + + if backbone == 'xception': + if OS == 8: + entry_block3_stride = 1 + middle_block_rate = 2 # ! Not mentioned in paper, but required + exit_block_rates = (2, 4) + atrous_rates = (12, 24, 36) + else: + entry_block3_stride = 2 + middle_block_rate = 1 + exit_block_rates = (1, 2) + atrous_rates = (6, 12, 18) + + x = Conv2D(32, (3, 3), strides=(2, 2), + name='entry_flow_conv1_1', use_bias=False, padding='same')(img_input) + x = BatchNormalization(name='entry_flow_conv1_1_BN')(x) + x = Activation('relu')(x) + + x = _conv2d_same(x, 64, 'entry_flow_conv1_2', kernel_size=3, stride=1) + x = BatchNormalization(name='entry_flow_conv1_2_BN')(x) + x = Activation('relu')(x) + + x = _xception_block(x, [128, 128, 128], 'entry_flow_block1', + skip_connection_type='conv', stride=2, + depth_activation=False) + x, skip1 = _xception_block(x, [256, 256, 256], 'entry_flow_block2', + skip_connection_type='conv', stride=2, + depth_activation=False, return_skip=True) + + x = _xception_block(x, [728, 728, 728], 'entry_flow_block3', + skip_connection_type='conv', stride=entry_block3_stride, + depth_activation=False) + for i in range(16): + x = _xception_block(x, [728, 728, 728], 'middle_flow_unit_{}'.format(i + 1), + skip_connection_type='sum', stride=1, rate=middle_block_rate, + depth_activation=False) + + x = _xception_block(x, [728, 1024, 1024], 'exit_flow_block1', + skip_connection_type='conv', stride=1, rate=exit_block_rates[0], + depth_activation=False) + x = _xception_block(x, [1536, 1536, 2048], 'exit_flow_block2', + skip_connection_type='none', stride=1, rate=exit_block_rates[1], + depth_activation=True) + + else: + OS = 8 + first_block_filters = _make_divisible(32 * alpha, 8) + x = Conv2D(first_block_filters, + kernel_size=3, + strides=(2, 2), padding='same', + use_bias=False, name='Conv')(img_input) + x = BatchNormalization( + epsilon=1e-3, momentum=0.999, 
name='Conv_BN')(x) + x = Activation(relu6, name='Conv_Relu6')(x) + + x = _inverted_res_block(x, filters=16, alpha=alpha, stride=1, + expansion=1, block_id=0, skip_connection=False) + + x = _inverted_res_block(x, filters=24, alpha=alpha, stride=2, + expansion=6, block_id=1, skip_connection=False) + x = _inverted_res_block(x, filters=24, alpha=alpha, stride=1, + expansion=6, block_id=2, skip_connection=True) + + x = _inverted_res_block(x, filters=32, alpha=alpha, stride=2, + expansion=6, block_id=3, skip_connection=False) + x = _inverted_res_block(x, filters=32, alpha=alpha, stride=1, + expansion=6, block_id=4, skip_connection=True) + x = _inverted_res_block(x, filters=32, alpha=alpha, stride=1, + expansion=6, block_id=5, skip_connection=True) + + # stride in block 6 changed from 2 -> 1, so we need to use rate = 2 + x = _inverted_res_block(x, filters=64, alpha=alpha, stride=1, # 1! + expansion=6, block_id=6, skip_connection=False) + x = _inverted_res_block(x, filters=64, alpha=alpha, stride=1, rate=2, + expansion=6, block_id=7, skip_connection=True) + x = _inverted_res_block(x, filters=64, alpha=alpha, stride=1, rate=2, + expansion=6, block_id=8, skip_connection=True) + x = _inverted_res_block(x, filters=64, alpha=alpha, stride=1, rate=2, + expansion=6, block_id=9, skip_connection=True) + + x = _inverted_res_block(x, filters=96, alpha=alpha, stride=1, rate=2, + expansion=6, block_id=10, skip_connection=False) + x = _inverted_res_block(x, filters=96, alpha=alpha, stride=1, rate=2, + expansion=6, block_id=11, skip_connection=True) + x = _inverted_res_block(x, filters=96, alpha=alpha, stride=1, rate=2, + expansion=6, block_id=12, skip_connection=True) + + x = _inverted_res_block(x, filters=160, alpha=alpha, stride=1, rate=2, # 1! + expansion=6, block_id=13, skip_connection=False) + x = _inverted_res_block(x, filters=160, alpha=alpha, stride=1, rate=4, + expansion=6, block_id=14, skip_connection=True) + x = _inverted_res_block(x, filters=160, alpha=alpha, stride=1, rate=4, + expansion=6, block_id=15, skip_connection=True) + + x = _inverted_res_block(x, filters=320, alpha=alpha, stride=1, rate=4, + expansion=6, block_id=16, skip_connection=False) + + # end of feature extractor + + # branching for Atrous Spatial Pyramid Pooling + + # Image Feature branch + shape_before = tf.shape(x) + b4 = GlobalAveragePooling2D()(x) + # from (b_size, channels)->(b_size, 1, 1, channels) + b4 = Lambda(lambda x: K.expand_dims(x, 1))(b4) + b4 = Lambda(lambda x: K.expand_dims(x, 1))(b4) + b4 = Conv2D(256, (1, 1), padding='same', + use_bias=False, name='image_pooling')(b4) + b4 = BatchNormalization(name='image_pooling_BN', epsilon=1e-5)(b4) + b4 = Activation('relu')(b4) + # upsample. have to use compat because of the option align_corners + size_before = tf.keras.backend.int_shape(x) + b4 = Lambda(lambda x: tf.compat.v1.image.resize(x, size_before[1:3], + method=0, align_corners=True))(b4) + # simple 1x1 + b0 = Conv2D(256, (1, 1), padding='same', use_bias=False, name='aspp0')(x) + b0 = BatchNormalization(name='aspp0_BN', epsilon=1e-5)(b0) + b0 = Activation('relu', name='aspp0_activation')(b0) + + # there are only 2 branches in mobilenetV2. 
not sure why + if backbone == 'xception': + # rate = 6 (12) + b1 = SepConv_BN(x, 256, 'aspp1', + rate=atrous_rates[0], depth_activation=True, epsilon=1e-5) + # rate = 12 (24) + b2 = SepConv_BN(x, 256, 'aspp2', + rate=atrous_rates[1], depth_activation=True, epsilon=1e-5) + # rate = 18 (36) + b3 = SepConv_BN(x, 256, 'aspp3', + rate=atrous_rates[2], depth_activation=True, epsilon=1e-5) + + # concatenate ASPP branches & project + x = Concatenate()([b4, b0, b1, b2, b3]) + else: + x = Concatenate()([b4, b0]) + + x = Conv2D(256, (1, 1), padding='same', + use_bias=False, name='concat_projection')(x) + x = BatchNormalization(name='concat_projection_BN', epsilon=1e-5)(x) + x = Activation('relu')(x) + x = Dropout(0.1)(x) + # DeepLab v.3+ decoder + + if backbone == 'xception': + # Feature projection + # x4 (x2) block + size_before2 = tf.keras.backend.int_shape(x) + x = Lambda(lambda xx: tf.compat.v1.image.resize(xx, + skip1.shape[1:3], + method=0, align_corners=True))(x) + + dec_skip1 = Conv2D(48, (1, 1), padding='same', + use_bias=False, name='feature_projection0')(skip1) + dec_skip1 = BatchNormalization( + name='feature_projection0_BN', epsilon=1e-5)(dec_skip1) + dec_skip1 = Activation('relu')(dec_skip1) + x = Concatenate()([x, dec_skip1]) + x = SepConv_BN(x, 256, 'decoder_conv0', + depth_activation=True, epsilon=1e-5) + x = SepConv_BN(x, 256, 'decoder_conv1', + depth_activation=True, epsilon=1e-5) + + # you can use it with arbitary number of classes + if (weights == 'pascal_voc' and classes == 21) or (weights == 'cityscapes' and classes == 19): + last_layer_name = 'logits_semantic' + else: + last_layer_name = 'custom_logits_semantic' + + x = Conv2D(classes, (1, 1), padding='same', name=last_layer_name)(x) + size_before3 = tf.keras.backend.int_shape(img_input) + x = Lambda(lambda xx: tf.compat.v1.image.resize(xx, + size_before3[1:3], + method=0, align_corners=True))(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = get_source_inputs(input_tensor) + else: + inputs = img_input + + if activation in {'softmax', 'sigmoid'}: + x = tf.keras.layers.Activation(activation)(x) + + model = Model(inputs, x, name='deeplabv3plus') + + # load weights + + if weights == 'pascal_voc': + if backbone == 'xception': + weights_path = get_file('deeplabv3_xception_tf_dim_ordering_tf_kernels.h5', + WEIGHTS_PATH_X, + cache_subdir='models') + else: + weights_path = get_file('deeplabv3_mobilenetv2_tf_dim_ordering_tf_kernels.h5', + WEIGHTS_PATH_MOBILE, + cache_subdir='models') + model.load_weights(weights_path, by_name=True) + elif weights == 'cityscapes': + if backbone == 'xception': + weights_path = get_file('deeplabv3_xception_tf_dim_ordering_tf_kernels_cityscapes.h5', + WEIGHTS_PATH_X_CS, + cache_subdir='models') + else: + weights_path = get_file('deeplabv3_mobilenetv2_tf_dim_ordering_tf_kernels_cityscapes.h5', + WEIGHTS_PATH_MOBILE_CS, + cache_subdir='models') + model.load_weights(weights_path, by_name=True) + return model + +def preprocess_input(x): + """Preprocesses a numpy array encoding a batch of images. + # Arguments + x: a 4D numpy array consists of RGB values within [0, 255]. + # Returns + Input array scaled to [-1.,1.] 
+ """ + return preprocess_input(x, mode='tf') diff --git a/model/architecture/third_party/spatial_transformer.py b/model/architecture/third_party/spatial_transformer.py new file mode 100644 index 0000000..0b9d041 --- /dev/null +++ b/model/architecture/third_party/spatial_transformer.py @@ -0,0 +1,247 @@ +# ============================================================================== +# Copyright (c) 2017 Octavio +# +# This work is licensed under the terms of the MIT license. +# For a copy, see https://opensource.org/licenses/MIT. +# ============================================================================== + +# ============================================================================== +# https://github.com/oarriaga/STN.keras +# Commit: 9ab0160777d33af768da10d1f201c3c321cbac47 +# Modifications: +# - import fixes +# - single file +# - projective transform support +# - add Model containing a SpatialTransformer +# +# TODO: throws error: "topological sort failed" +# - training works, error appears only in validation mode +# - validation works if theta is constant +# - might be related to a concat-op +# - https://github.com/tensorflow/tensorflow/issues/24816 +# ============================================================================== + + +import numpy as np +import tensorflow as tf +from tensorflow.keras.layers import Layer, Input, Activation, MaxPooling2D, Flatten, Conv2D, Dense, Multiply +from tensorflow.keras.models import Model +from tensorflow.keras import backend as K + + +def K_meshgrid(x, y): + return tf.meshgrid(x, y) + + +def K_linspace(start, stop, num): + return tf.linspace(start, stop, num) + + +class BilinearInterpolation(Layer): + """Performs bilinear interpolation as a keras layer + References + ---------- + [1] Spatial Transformer Networks, Max Jaderberg, et al. 
+ [2] https://github.com/skaae/transformer_network + [3] https://github.com/EderSantana/seya + """ + + def __init__(self, output_size, **kwargs): + self.output_size = output_size + super(BilinearInterpolation, self).__init__(**kwargs) + + def get_config(self): + return { + 'output_size': self.output_size, + } + + def compute_output_shape(self, input_shapes): + height, width = self.output_size + num_channels = input_shapes[0][-1] + return (None, height, width, num_channels) + + def call(self, tensors, mask=None): + X, transformation = tensors + output = self._transform(X, transformation, self.output_size) + return output + + def _interpolate(self, image, sampled_grids, output_size): + + batch_size = K.shape(image)[0] + height = K.shape(image)[1] + width = K.shape(image)[2] + num_channels = K.shape(image)[3] + + x = K.cast(K.flatten(sampled_grids[:, 0:1, :]), dtype='float32') + y = K.cast(K.flatten(sampled_grids[:, 1:2, :]), dtype='float32') + + x = .5 * (x + 1.0) * K.cast(width, dtype='float32') + y = .5 * (y + 1.0) * K.cast(height, dtype='float32') + + x0 = K.cast(x, 'int32') + x1 = x0 + 1 + y0 = K.cast(y, 'int32') + y1 = y0 + 1 + + max_x = int(K.int_shape(image)[2] - 1) + max_y = int(K.int_shape(image)[1] - 1) + + x0 = K.clip(x0, 0, max_x) + x1 = K.clip(x1, 0, max_x) + y0 = K.clip(y0, 0, max_y) + y1 = K.clip(y1, 0, max_y) + + pixels_batch = K.arange(0, batch_size) * (height * width) + pixels_batch = K.expand_dims(pixels_batch, axis=-1) + flat_output_size = output_size[0] * output_size[1] + base = K.repeat_elements(pixels_batch, flat_output_size, axis=1) + base = K.flatten(base) + + # base_y0 = base + (y0 * width) + base_y0 = y0 * width + base_y0 = base + base_y0 + # base_y1 = base + (y1 * width) + base_y1 = y1 * width + base_y1 = base_y1 + base + + indices_a = base_y0 + x0 + indices_b = base_y1 + x0 + indices_c = base_y0 + x1 + indices_d = base_y1 + x1 + + flat_image = K.reshape(image, shape=(-1, num_channels)) + flat_image = K.cast(flat_image, dtype='float32') + pixel_values_a = K.gather(flat_image, indices_a) + pixel_values_b = K.gather(flat_image, indices_b) + pixel_values_c = K.gather(flat_image, indices_c) + pixel_values_d = K.gather(flat_image, indices_d) + + x0 = K.cast(x0, 'float32') + x1 = K.cast(x1, 'float32') + y0 = K.cast(y0, 'float32') + y1 = K.cast(y1, 'float32') + + area_a = K.expand_dims(((x1 - x) * (y1 - y)), 1) + area_b = K.expand_dims(((x1 - x) * (y - y0)), 1) + area_c = K.expand_dims(((x - x0) * (y1 - y)), 1) + area_d = K.expand_dims(((x - x0) * (y - y0)), 1) + + values_a = area_a * pixel_values_a + values_b = area_b * pixel_values_b + values_c = area_c * pixel_values_c + values_d = area_d * pixel_values_d + return values_a + values_b + values_c + values_d + + def _make_regular_grids(self, batch_size, height, width): + # making a single regular grid + x_linspace = K_linspace(-1., 1., width) + y_linspace = K_linspace(-1., 1., height) + x_coordinates, y_coordinates = K_meshgrid(x_linspace, y_linspace) + x_coordinates = K.flatten(x_coordinates) + y_coordinates = K.flatten(y_coordinates) + ones = K.ones_like(x_coordinates) + grid = K.concatenate([x_coordinates, y_coordinates, ones], 0) + + # repeating grids for each batch + grid = K.flatten(grid) + grids = K.tile(grid, K.stack([batch_size])) + return K.reshape(grids, (batch_size, 3, height * width)) + + def _transform(self, X, transformation, output_size): + + batch_size, num_channels = K.shape(X)[0], X.shape[3] + + transformation = K.reshape(transformation, shape=(batch_size, 3, 3)) + + # create regular grid (-1,1)x(-1,1) 
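+        # rows of each grid are x, y and ones (homogeneous pixel coordinates), shape (batch_size, 3, height * width)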
+ regular_grids = self._make_regular_grids(batch_size, *output_size) + + # transform regular grid + sampled_grids = K.batch_dot(transformation, regular_grids) + + # homogeneous coords: divide by 3rd component w + w = tf.math.reciprocal(sampled_grids[:, 2, :]) + w = tf.reshape(w, (batch_size, 1, w.shape[-1])) + w = tf.tile(w, [1, 2, 1]) + sampled_grids = Multiply()([sampled_grids[:, 0:2, :], w]) + + # interpolate image + interpolated_image = self._interpolate(X, sampled_grids, output_size) + new_shape = (batch_size, output_size[0], output_size[1], num_channels) + interpolated_image = K.reshape(interpolated_image, new_shape) + + return interpolated_image + + +def STN_example_model(input_shape=(60, 60, 1), sampling_size=(30, 30), num_classes=10): + + image = Input(shape=input_shape) + locnet = MaxPooling2D(pool_size=(2, 2))(image) + locnet = Conv2D(20, (5, 5))(locnet) + locnet = MaxPooling2D(pool_size=(2, 2))(locnet) + locnet = Conv2D(20, (5, 5))(locnet) + locnet = Flatten()(locnet) + locnet = Dense(50)(locnet) + locnet = Activation('relu')(locnet) + def get_initial_weights(output_size): + b = np.zeros((2, 3), dtype='float32') + b[0, 0] = 1 + b[1, 1] = 1 + W = np.zeros((output_size, 6), dtype='float32') + weights = [W, b.flatten()] + return weights + weights = get_initial_weights(50) + locnet = Dense(6, weights=weights)(locnet) + x = BilinearInterpolation(sampling_size)([image, locnet]) + x = Conv2D(32, (3, 3), padding='same')(x) + x = Activation('relu')(x) + x = MaxPooling2D(pool_size=(2, 2))(x) + x = Conv2D(32, (3, 3))(x) + x = Activation('relu')(x) + x = MaxPooling2D(pool_size=(2, 2))(x) + x = Flatten()(x) + x = Dense(256)(x) + x = Activation('relu')(x) + x = Dense(num_classes)(x) + x = Activation('softmax')(x) + return Model(inputs=image, outputs=x) + + +def _spatial_transformer(input, output_shape, theta_init=np.eye(3), theta_const=False, loc_downsample=3, dense_units=20, filters=16, kernel_size=(3,3), activation=tf.nn.relu, dense_reg=0.0): + + theta_init = theta_init.flatten().astype(np.float32) + + if not theta_const: + + t = input + + # initialize transform to identity + init_weights = [np.zeros((dense_units, 9), dtype=np.float32), theta_init] + + # localization network + for d in range(loc_downsample): + t = Conv2D(filters=filters, kernel_size=kernel_size, padding="same", activation=activation)(t) + t = MaxPooling2D(pool_size=(2,2), padding="same")(t) + t = Flatten()(t) + t = Dense(dense_units)(t) + + k_reg = tf.keras.regularizers.l2(dense_reg) if dense_reg > 0 else None + b_reg = tf.keras.regularizers.l2(dense_reg) if dense_reg > 0 else None + theta = Dense(9, weights=init_weights, kernel_regularizer=k_reg, bias_regularizer=b_reg)(t) # transformation parameters + + else: + + theta = tf.tile(theta_init, tf.shape(input)[0:1]) + + # transform feature map + output = BilinearInterpolation(output_shape)([input, theta]) + + return output + + +def SpatialTransformer(input_shape, output_shape, theta_init=np.eye(3), theta_const=False, loc_downsample=3, dense_units=20, filters=16, kernel_size=(3,3), activation=tf.nn.relu, dense_reg=0.0): + + input = Input(input_shape) + output = _spatial_transformer(input, output_shape[0:2], theta_init, theta_const, loc_downsample, dense_units, filters, kernel_size, activation, dense_reg) + + return Model(input, output) diff --git a/model/architecture/uNetXST.py b/model/architecture/uNetXST.py new file mode 100644 index 0000000..03b5298 --- /dev/null +++ b/model/architecture/uNetXST.py @@ -0,0 +1,158 @@ +# 
============================================================================== +# MIT License +# +# Copyright 2020 Institute for Automotive Engineering of RWTH Aachen University. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================== + +import numpy as np +import tensorflow as tf +from tensorflow.keras.models import Model +from tensorflow.keras.layers import Input +from tensorflow.keras.layers import Dense, Conv2D, Conv2DTranspose, MaxPooling2D +from tensorflow.keras.layers import BatchNormalization, Dropout +from tensorflow.keras.layers import Activation +from tensorflow.keras.layers import Concatenate + +from third_party.spatial_transformer import SpatialTransformer + + +def encoder(input, udepth=3, filters1=8, kernel_size=(3,3), activation=tf.nn.relu, batch_norm=True, dropout=0.1): + + t = input + encoder_layers = udepth * [None] + + # common parameters + pool_size = (2,2) + padding = "same" + + # layer creation with successive pooling + for d in range(udepth): + filters = (2**d) * filters1 + t = Conv2D(filters=filters, kernel_size=kernel_size, padding=padding, activation=activation)(t) + t = BatchNormalization()(t) if batch_norm else t + t = Conv2D(filters=filters, kernel_size=kernel_size, padding=padding, activation=activation)(t) + t = encoder_layers[d] = BatchNormalization()(t) if batch_norm else t + if d < (udepth - 1): + t = MaxPooling2D(pool_size=pool_size, padding=padding)(t) + t = Dropout(rate=dropout)(t) if dropout > 0 else t + + return encoder_layers + + +def joiner(list_of_encoder_layers, thetas, filters1=8, kernel_size=(3,3), activation=tf.nn.relu, batch_norm=True, double_skip_connection=False): + + n_inputs = len(list_of_encoder_layers) + udepth = len(list_of_encoder_layers[0]) + encoder_layers = udepth * [None] + + for d in range(udepth): + filters = (2**d) * filters1 + shape = list_of_encoder_layers[0][d].shape[1:] + + warped_maps = [] + for i in range(n_inputs): # use Spatial Transformer with constant homography transformation before concatenating + # Problem w/ trainable theta: regularization necessary, huge loss, always went to loss=nan + t = SpatialTransformer(shape, shape, theta_init=thetas[i], theta_const=True)(list_of_encoder_layers[i][d]) + warped_maps.append(t) + t = Concatenate()(warped_maps) if n_inputs > 1 else warped_maps[0] + t = Conv2D(filters=filters, kernel_size=kernel_size, padding="same", activation=activation)(t) + t = BatchNormalization()(t) if batch_norm else t + t = 
Conv2D(filters=filters, kernel_size=kernel_size, padding="same", activation=activation)(t) + t = warped = BatchNormalization()(t) if batch_norm else t + + if not double_skip_connection: + + t = encoder_layers[d] = warped + + else: + + nonwarped_maps = [] + for i in range(n_inputs): # also concat non-warped maps + t = list_of_encoder_layers[i][d] + nonwarped_maps.append(t) + t = Concatenate()(nonwarped_maps) if n_inputs > 1 else nonwarped_maps[0] + t = Conv2D(filters=filters, kernel_size=kernel_size, padding="same", activation=activation)(t) + t = BatchNormalization()(t) if batch_norm else t + t = Conv2D(filters=filters, kernel_size=kernel_size, padding="same", activation=activation)(t) + t = nonwarped = BatchNormalization()(t) if batch_norm else t + + # concat both + t = Concatenate()([warped, nonwarped]) + t = Conv2D(filters=filters, kernel_size=kernel_size, padding="same", activation=activation)(t) + t = BatchNormalization()(t) if batch_norm else t + t = Conv2D(filters=filters, kernel_size=kernel_size, padding="same", activation=activation)(t) + t = encoder_layers[d] = BatchNormalization()(t) if batch_norm else t + + return encoder_layers + + +def decoder(encoder_layers, udepth=3, filters1=8, kernel_size=(3,3), activation=tf.nn.relu, batch_norm=True, dropout=0.1): + + # start at lowest encoder layer + t = encoder_layers[udepth-1] + + # common parameters + strides = (2,2) + padding = "same" + + # layer expansion symmetric to encoder + for d in reversed(range(udepth-1)): + filters = (2**d) * filters1 + t = Conv2DTranspose(filters=filters, kernel_size=kernel_size, strides=strides, padding=padding)(t) + t = Concatenate()([encoder_layers[d], t]) + t = Dropout(rate=dropout)(t) if dropout > 0 else t + t = Conv2D(filters=filters, kernel_size=kernel_size, padding=padding, activation=activation)(t) + t = BatchNormalization()(t) if batch_norm else t + t = Conv2D(filters=filters, kernel_size=kernel_size, padding=padding, activation=activation)(t) + t = BatchNormalization()(t) if batch_norm else t + + return t + + +def get_network(input_shape, n_output_channels, n_inputs, thetas, + udepth = 5, + filters1 = 16, + kernel_size = (3,3), + activation = tf.nn.relu, + batch_norm = True, + dropout = 0.1, + double_skip_connection = False): + + # build inputs + inputs = [Input(input_shape) for i in range(n_inputs)] + + # encode all inputs separately + list_of_encoder_layers = [] + for i in inputs: + encoder_layers = encoder(i, udepth, filters1, kernel_size, activation, batch_norm, dropout) + list_of_encoder_layers.append(encoder_layers) + + # fuse encodings of all inputs at all layers + encoder_layers = joiner(list_of_encoder_layers, thetas, filters1, kernel_size, activation, batch_norm, double_skip_connection) + + # decode from bottom to top layer + reconstruction = decoder(encoder_layers, udepth, filters1, kernel_size, activation, batch_norm, dropout) + + # build final prediction layer + prediction = Conv2D(filters=n_output_channels, kernel_size=kernel_size, padding="same", activation=activation)(reconstruction) + prediction = Activation("softmax")(prediction) + + return Model(inputs, prediction) diff --git a/model/config.1_FRLR.deeplab-mobilenet.yml b/model/config.1_FRLR.deeplab-mobilenet.yml new file mode 100644 index 0000000..371a59e --- /dev/null +++ b/model/config.1_FRLR.deeplab-mobilenet.yml @@ -0,0 +1,30 @@ +input-training: [../data/1_FRLR/train/homography] +label-training: ../data/1_FRLR/train/bev+occlusion +max-samples-training: 100000 +input-validation: [../data/1_FRLR/val/homography] 
+label-validation: ../data/1_FRLR/val/bev+occlusion +max-samples-validation: 10000 + +image-shape: [256, 512] +one-hot-palette-input: one_hot_conversion/convert_10.xml +one-hot-palette-label: one_hot_conversion/convert_9+occl.xml + +model: architecture/deeplab_mobilenet.py +# unetxst-homographies: +epochs: 100 +batch-size: 5 +learning-rate: 1e-4 +loss-weights: [0.98684351, 2.2481491, 10.47452063, 4.78351389, 7.01028204, 8.41360361, 10.91633349, 2.38571558, 1.02473193, 2.79359197] +early-stopping-patience: 20 + +save-interval: 5 +output-dir: output + +# for training continuation, evaluation and prediction only +class-names: [road, sidewalk, person, car, truck, bus, bike, obstacle, vegetation, occluded] +# model-weights: + +# for predict.py only +input-testing: [../data/1_FRLR/val/homography] +max-samples-testing: 10000 +# prediction-dir: diff --git a/model/config.1_FRLR.deeplab-xception.yml b/model/config.1_FRLR.deeplab-xception.yml new file mode 100644 index 0000000..248fd1e --- /dev/null +++ b/model/config.1_FRLR.deeplab-xception.yml @@ -0,0 +1,30 @@ +input-training: [../data/1_FRLR/train/homography] +label-training: ../data/1_FRLR/train/bev+occlusion +max-samples-training: 100000 +input-validation: [../data/1_FRLR/val/homography] +label-validation: ../data/1_FRLR/val/bev+occlusion +max-samples-validation: 10000 + +image-shape: [256, 512] +one-hot-palette-input: one_hot_conversion/convert_10.xml +one-hot-palette-label: one_hot_conversion/convert_9+occl.xml + +model: architecture/deeplab_xception.py +# unetxst-homographies: +epochs: 100 +batch-size: 5 +learning-rate: 1e-4 +loss-weights: [0.98684351, 2.2481491, 10.47452063, 4.78351389, 7.01028204, 8.41360361, 10.91633349, 2.38571558, 1.02473193, 2.79359197] +early-stopping-patience: 20 + +save-interval: 5 +output-dir: output + +# for training continuation, evaluation and prediction only +class-names: [road, sidewalk, person, car, truck, bus, bike, obstacle, vegetation, occluded] +# model-weights: + +# for predict.py only +input-testing: [../data/1_FRLR/val/homography] +max-samples-testing: 10000 +# prediction-dir: diff --git a/model/config.1_FRLR.unetxst.yml b/model/config.1_FRLR.unetxst.yml new file mode 100644 index 0000000..daca64c --- /dev/null +++ b/model/config.1_FRLR.unetxst.yml @@ -0,0 +1,30 @@ +input-training: [../data/1_FRLR/train/front, ../data/1_FRLR/train/rear, ../data/1_FRLR/train/left, ../data/1_FRLR/train/right] +label-training: ../data/1_FRLR/train/bev+occlusion +max-samples-training: 100000 +input-validation: [../data/1_FRLR/val/front, ../data/1_FRLR/val/rear, ../data/1_FRLR/val/left, ../data/1_FRLR/val/right] +label-validation: ../data/1_FRLR/val/bev+occlusion +max-samples-validation: 10000 + +image-shape: [256, 512] +one-hot-palette-input: one_hot_conversion/convert_10.xml +one-hot-palette-label: one_hot_conversion/convert_9+occl.xml + +model: architecture/uNetXST.py +unetxst-homographies: ../preprocessing/homography_converter/uNetXST_homographies/1_FRLR.py +epochs: 100 +batch-size: 5 +learning-rate: 1e-4 +loss-weights: [0.98684351, 2.2481491, 10.47452063, 4.78351389, 7.01028204, 8.41360361, 10.91633349, 2.38571558, 1.02473193, 2.79359197] +early-stopping-patience: 20 + +save-interval: 5 +output-dir: output + +# for training continuation, evaluation and prediction only +class-names: [road, sidewalk, person, car, truck, bus, bike, obstacle, vegetation, occluded] +# model-weights: + +# for predict.py only +input-testing: [../data/1_FRLR/val/front, ../data/1_FRLR/val/rear, ../data/1_FRLR/val/left, ../data/1_FRLR/val/right] 
+max-samples-testing: 10000 +# prediction-dir: diff --git a/model/config.2_F.deeplab-mobilenet.yml b/model/config.2_F.deeplab-mobilenet.yml new file mode 100644 index 0000000..45893c9 --- /dev/null +++ b/model/config.2_F.deeplab-mobilenet.yml @@ -0,0 +1,30 @@ +input-training: [../data/2_F/train/homography] +label-training: ../data/2_F/train/bev+occlusion +max-samples-training: 100000 +input-validation: [../data/2_F/val/homography] +label-validation: ../data/2_F/val/bev+occlusion +max-samples-validation: 10000 + +image-shape: [256, 512] +one-hot-palette-input: one_hot_conversion/convert_10.xml +one-hot-palette-label: one_hot_conversion/convert_3+occl.xml + +model: architecture/deeplab_mobilenet.py +# unetxst-homographies: +epochs: 100 +batch-size: 5 +learning-rate: 1e-4 +loss-weights: [1.00752063, 5.06392476, 1.15378408, 1.16118375] +early-stopping-patience: 20 + +save-interval: 5 +output-dir: output + +# for training continuation, evaluation and prediction only +class-names: [road, vehicle, obstacle, occluded] +# model-weights: + +# for predict.py only +input-testing: [../data/2_F/val/homography] +max-samples-testing: 10000 +# prediction-dir: diff --git a/model/config.2_F.deeplab-xception.yml b/model/config.2_F.deeplab-xception.yml new file mode 100644 index 0000000..bd7c20c --- /dev/null +++ b/model/config.2_F.deeplab-xception.yml @@ -0,0 +1,30 @@ +input-training: [../data/2_F/train/homography] +label-training: ../data/2_F/train/bev+occlusion +max-samples-training: 100000 +input-validation: [../data/2_F/val/homography] +label-validation: ../data/2_F/val/bev+occlusion +max-samples-validation: 10000 + +image-shape: [256, 512] +one-hot-palette-input: one_hot_conversion/convert_10.xml +one-hot-palette-label: one_hot_conversion/convert_3+occl.xml + +model: architecture/deeplab_xception.py +# unetxst-homographies: +epochs: 100 +batch-size: 5 +learning-rate: 1e-4 +loss-weights: [1.00752063, 5.06392476, 1.15378408, 1.16118375] +early-stopping-patience: 20 + +save-interval: 5 +output-dir: output + +# for training continuation, evaluation and prediction only +class-names: [road, vehicle, obstacle, occluded] +# model-weights: + +# for predict.py only +input-testing: [../data/2_F/val/homography] +max-samples-testing: 10000 +# prediction-dir: diff --git a/model/config.2_F.unetxst.yml b/model/config.2_F.unetxst.yml new file mode 100644 index 0000000..2aa3870 --- /dev/null +++ b/model/config.2_F.unetxst.yml @@ -0,0 +1,30 @@ +input-training: [../data/2_F/train/front] +label-training: ../data/2_F/train/bev+occlusion +max-samples-training: 100000 +input-validation: [../data/2_F/val/front] +label-validation: ../data/2_F/val/bev+occlusion +max-samples-validation: 10000 + +image-shape: [256, 512] +one-hot-palette-input: one_hot_conversion/convert_10.xml +one-hot-palette-label: one_hot_conversion/convert_3+occl.xml + +model: architecture/uNetXST.py +unetxst-homographies: ../preprocessing/homography_converter/uNetXST_homographies/2_F.py +epochs: 100 +batch-size: 5 +learning-rate: 1e-4 +loss-weights: [1.00752063, 5.06392476, 1.15378408, 1.16118375] +early-stopping-patience: 20 + +save-interval: 5 +output-dir: output + +# for training continuation, evaluation and prediction only +class-names: [road, vehicle, obstacle, occluded] +# model-weights: + +# for predict.py only +input-testing: [../data/2_F/val/front] +max-samples-testing: 10000 +# prediction-dir: diff --git a/model/evaluate.py b/model/evaluate.py new file mode 100755 index 0000000..2a21fb5 --- /dev/null +++ b/model/evaluate.py @@ -0,0 +1,199 @@ 
+#!/usr/bin/env python + +# ============================================================================== +# MIT License +# +# Copyright 2020 Institute for Automotive Engineering of RWTH Aachen University. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================== + +import importlib +import os +import sys +import tqdm +import numpy as np +import cv2 +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sb +import tensorflow as tf +import configargparse + +import utils + + +# parse parameters from config file or CLI +parser = configargparse.ArgParser() +parser.add("-c", "--config", is_config_file=True, help="config file") +parser.add("-iv", "--input-validation", type=str, required=True, nargs="+", help="directory/directories of input samples for validation") +parser.add("-lv", "--label-validation", type=str, required=True, help="directory of label samples for validation") +parser.add("-nv", "--max-samples-validation", type=int, default=None, help="maximum number of validation samples") +parser.add("-is", "--image-shape", type=int, required=True, nargs=2, help="image dimensions (HxW) of inputs and labels for network") +parser.add("-ohi", "--one-hot-palette-input", type=str, required=True, help="xml-file for one-hot-conversion of input images") +parser.add("-ohl", "--one-hot-palette-label", type=str, required=True, help="xml-file for one-hot-conversion of label images") +parser.add("-cn", "--class-names", type=str, required=True, nargs="+", help="class names to annotate confusion matrix axes") +parser.add("-m", "--model", type=str, required=True, help="Python file defining the neural network") +parser.add("-uh", "--unetxst-homographies", type=str, default=None, help="Python file defining a list H of homographies to be used in uNetXST model") +parser.add("-mw", "--model-weights", type=str, required=True, help="weights file of trained model") +conf, unknown = parser.parse_known_args() + + +# determine absolute filepaths +conf.input_validation = [utils.abspath(path) for path in conf.input_validation] +conf.label_validation = utils.abspath(conf.label_validation) +conf.one_hot_palette_input = utils.abspath(conf.one_hot_palette_input) +conf.one_hot_palette_label = utils.abspath(conf.one_hot_palette_label) +conf.model = utils.abspath(conf.model) +conf.unetxst_homographies = utils.abspath(conf.unetxst_homographies) if conf.unetxst_homographies is not None else conf.unetxst_homographies 
+conf.model_weights = utils.abspath(conf.model_weights) + + +# load network architecture module +architecture = utils.load_module(conf.model) + + +# get max_samples_validation random validation samples +files_input = [utils.get_files_in_folder(folder) for folder in conf.input_validation] +files_label = utils.get_files_in_folder(conf.label_validation) +_, idcs = utils.sample_list(files_label, n_samples=conf.max_samples_validation) +files_input = [np.take(f, idcs) for f in files_input] +files_label = np.take(files_label, idcs) +n_inputs = len(conf.input_validation) +n_samples = len(files_label) +image_shape_original_input = utils.load_image(files_input[0][0]).shape[0:2] +image_shape_original_label = utils.load_image(files_label[0]).shape[0:2] +print(f"Found {n_samples} samples") + + +# parse one-hot-conversion.xml +conf.one_hot_palette_input = utils.parse_convert_xml(conf.one_hot_palette_input) +conf.one_hot_palette_label = utils.parse_convert_xml(conf.one_hot_palette_label) +n_classes_input = len(conf.one_hot_palette_input) +n_classes_label = len(conf.one_hot_palette_label) + + +# build model +if conf.unetxst_homographies is not None: + uNetXSTHomographies = utils.load_module(conf.unetxst_homographies) + model = architecture.get_network((conf.image_shape[0], conf.image_shape[1], n_classes_input), n_classes_label, n_inputs=n_inputs, thetas=uNetXSTHomographies.H) +else: + model = architecture.get_network((conf.image_shape[0], conf.image_shape[1], n_classes_input), n_classes_label) +model.load_weights(conf.model_weights) +print(f"Reloaded model from {conf.model_weights}") + + +# build data parsing function +def parse_sample(input_files, label_file): + # parse and process input images + inputs = [] + for inp in input_files: + inp = utils.load_image_op(inp) + inp = utils.resize_image_op(inp, image_shape_original_input, conf.image_shape, interpolation=tf.image.ResizeMethod.NEAREST_NEIGHBOR) + inp = utils.one_hot_encode_image_op(inp, conf.one_hot_palette_input) + inputs.append(inp) + inputs = inputs[0] if n_inputs == 1 else tuple(inputs) + # parse and process label image + label = utils.load_image_op(label_file) + label = utils.resize_image_op(label, image_shape_original_label, conf.image_shape, interpolation=tf.image.ResizeMethod.NEAREST_NEIGHBOR) + label = utils.one_hot_encode_image_op(label, conf.one_hot_palette_label) + return inputs, label + + +# evaluate confusion matrix +print("Evaluating confusion matrix ...") +confusion_matrix = np.zeros((n_classes_label, n_classes_label), dtype=np.int64) +for k in tqdm.tqdm(range(n_samples)): + + input_files = [files_input[i][k] for i in range(n_inputs)] + label_file = files_label[k] + + # load sample + inputs, label = parse_sample(input_files, label_file) + + # add batch dim + if n_inputs > 1: + inputs = [np.expand_dims(i, axis=0) for i in inputs] + else: + inputs = np.expand_dims(inputs, axis=0) + + # run prediction + prediction = model.predict(inputs).squeeze() + + # compute confusion matrix + label = np.argmax(label, axis=-1) + prediction = np.argmax(prediction, axis=-1) + sample_confusion_matrix = tf.math.confusion_matrix(label.flatten(), prediction.flatten(), num_classes=n_classes_label).numpy() + + # sum confusion matrix over dataset + confusion_matrix += sample_confusion_matrix + + +# normalize confusion matrix rows (What percentage of class X has been predicted to be class Y?) 
+confusion_matrix_norm = confusion_matrix / np.sum(confusion_matrix, axis=1)[:, np.newaxis] + + +# compute per-class IoU +row_sum = np.sum(confusion_matrix, axis=0) +col_sum = np.sum(confusion_matrix, axis=1) +diag = np.diag(confusion_matrix) +intersection = diag +union = row_sum + col_sum - diag +ious = intersection / union +iou = {} +for idx, v in enumerate(ious): + iou[conf.class_names[idx]] = v + + +# print metrics +print("\nPer-class IoU:") +for k, v in iou.items(): + print(f" {k}: {100*v:3.2f}%") +print("\nConfusion Matrix:") +print(confusion_matrix) +print("\nNormalized Confusion Matrix:") +print(confusion_matrix_norm) + + +# plot confusion matrix +confusion_matrix_df = pd.DataFrame(confusion_matrix_norm*100, conf.class_names, conf.class_names) +plt.figure(figsize=(8,8)) +hm = sb.heatmap(confusion_matrix_df, + annot=True, + fmt=".2f", + square=True, + vmin=0, + vmax=100, + cbar_kws={"label": "%", "shrink": 0.8}, + cmap=plt.cm.Blues) +hm.set_xticklabels(hm.get_xticklabels(), rotation=30) +plt.ylabel("True Label") +plt.xlabel("Predicted Label") + + +# save confusion matrix and class ious to file and export plot +eval_folder = os.path.join(os.path.dirname(conf.model_weights), os.pardir, "Evaluation") +if not os.path.exists(eval_folder): + os.makedirs(eval_folder) +filename = os.path.join(eval_folder, "confusion_matrix.txt") +np.savetxt(filename, confusion_matrix, fmt="%d") +filename = os.path.join(eval_folder, "class_iou.txt") +np.savetxt(filename, ious, fmt="%f") +filename = os.path.join(eval_folder, "confusion_matrix.pdf") +plt.savefig(filename, bbox_inches="tight") diff --git a/model/one_hot_conversion/convert_10.xml b/model/one_hot_conversion/convert_10.xml new file mode 100644 index 0000000..2e3fbcf --- /dev/null +++ b/model/one_hot_conversion/convert_10.xml @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/model/one_hot_conversion/convert_3+occl.xml b/model/one_hot_conversion/convert_3+occl.xml new file mode 100644 index 0000000..c79af41 --- /dev/null +++ b/model/one_hot_conversion/convert_3+occl.xml @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/model/one_hot_conversion/convert_3.xml b/model/one_hot_conversion/convert_3.xml new file mode 100644 index 0000000..3711b52 --- /dev/null +++ b/model/one_hot_conversion/convert_3.xml @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/model/one_hot_conversion/convert_30.xml b/model/one_hot_conversion/convert_30.xml new file mode 100644 index 0000000..d109f4b --- /dev/null +++ b/model/one_hot_conversion/convert_30.xml @@ -0,0 +1,34 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/model/one_hot_conversion/convert_9+occl.xml b/model/one_hot_conversion/convert_9+occl.xml new file mode 100644 index 0000000..d7af42b --- /dev/null +++ b/model/one_hot_conversion/convert_9+occl.xml @@ -0,0 +1,44 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/model/one_hot_conversion/convert_9.xml b/model/one_hot_conversion/convert_9.xml new file mode 100644 index 0000000..5517c7c --- /dev/null +++ b/model/one_hot_conversion/convert_9.xml @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/model/predict.py b/model/predict.py new file mode 100755 index 0000000..cf81c61 --- /dev/null +++ b/model/predict.py @@ -0,0 
+1,136 @@ +#!/usr/bin/env python + +# ============================================================================== +# MIT License +# +# Copyright 2020 Institute for Automotive Engineering of RWTH Aachen University. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================== + +import importlib +import os +import sys +import tqdm +import numpy as np +import cv2 +import tensorflow as tf +import configargparse + +import utils + + +# parse parameters from config file or CLI +parser = configargparse.ArgParser() +parser.add("-c", "--config", is_config_file=True, help="config file") +parser.add("-ip", "--input-testing", type=str, required=True, nargs="+", help="directory/directories of input samples for testing") +parser.add("-np", "--max-samples-testing", type=int, default=None, help="maximum number of testing samples") +parser.add("-is", "--image-shape", type=int, required=True, nargs=2, help="image dimensions (HxW) of inputs and labels for network") +parser.add("-ohi", "--one-hot-palette-input", type=str, required=True, help="xml-file for one-hot-conversion of input images") +parser.add("-ohl", "--one-hot-palette-label", type=str, required=True, help="xml-file for one-hot-conversion of label images") +parser.add("-m", "--model", type=str, required=True, help="Python file defining the neural network") +parser.add("-uh", "--unetxst-homographies", type=str, default=None, help="Python file defining a list H of homographies to be used in uNetXST model") +parser.add("-mw", "--model-weights", type=str, required=True, help="weights file of trained model") +parser.add("-pd", "--prediction-dir", type=str, required=True, help="output directory for storing predictions of testing data") +conf, unknown = parser.parse_known_args() + + +# determine absolute filepaths +conf.input_testing = [utils.abspath(path) for path in conf.input_testing] +conf.one_hot_palette_input = utils.abspath(conf.one_hot_palette_input) +conf.one_hot_palette_label = utils.abspath(conf.one_hot_palette_label) +conf.model = utils.abspath(conf.model) +conf.unetxst_homographies = utils.abspath(conf.unetxst_homographies) if conf.unetxst_homographies is not None else conf.unetxst_homographies +conf.model_weights = utils.abspath(conf.model_weights) +conf.prediction_dir = utils.abspath(conf.prediction_dir) + + +# load network architecture module +architecture = utils.load_module(conf.model) + + +# get max_samples_testing samples +files_input = 
[utils.get_files_in_folder(folder) for folder in conf.input_testing] +_, idcs = utils.sample_list(files_input[0], n_samples=conf.max_samples_testing) +files_input = [np.take(f, idcs) for f in files_input] +n_inputs = len(conf.input_testing) +n_samples = len(files_input[0]) +image_shape_original = utils.load_image(files_input[0][0]).shape[0:2] +print(f"Found {n_samples} samples") + + +# parse one-hot-conversion.xml +conf.one_hot_palette_input = utils.parse_convert_xml(conf.one_hot_palette_input) +conf.one_hot_palette_label = utils.parse_convert_xml(conf.one_hot_palette_label) +n_classes_input = len(conf.one_hot_palette_input) +n_classes_label = len(conf.one_hot_palette_label) + + +# build model +if conf.unetxst_homographies is not None: + uNetXSTHomographies = utils.load_module(conf.unetxst_homographies) + model = architecture.get_network((conf.image_shape[0], conf.image_shape[1], n_classes_input), n_classes_label, n_inputs=n_inputs, thetas=uNetXSTHomographies.H) +else: + model = architecture.get_network((conf.image_shape[0], conf.image_shape[1], n_classes_input), n_classes_label) +model.load_weights(conf.model_weights) +print(f"Reloaded model from {conf.model_weights}") + + +# build data parsing function +def parse_sample(input_files): + # parse and process input images + inputs = [] + for inp in input_files: + inp = utils.load_image_op(inp) + inp = utils.resize_image_op(inp, image_shape_original, conf.image_shape, interpolation=tf.image.ResizeMethod.NEAREST_NEIGHBOR) + inp = utils.one_hot_encode_image_op(inp, conf.one_hot_palette_input) + inputs.append(inp) + inputs = inputs[0] if n_inputs == 1 else tuple(inputs) + return inputs + + +# create output directory +if not os.path.exists(conf.prediction_dir): + os.makedirs(conf.prediction_dir) + + +# run predictions +print(f"Running predictions and writing to {conf.prediction_dir} ...") +for k in tqdm.tqdm(range(n_samples)): + + input_files = [files_input[i][k] for i in range(n_inputs)] + + # load sample + inputs = parse_sample(input_files) + + # add batch dim + if n_inputs > 1: + inputs = [np.expand_dims(i, axis=0) for i in inputs] + else: + inputs = np.expand_dims(inputs, axis=0) + + # run prediction + prediction = model.predict(inputs).squeeze() + + # convert to output image + prediction = utils.one_hot_decode_image(prediction, conf.one_hot_palette_label) + + # write to disk + output_file = os.path.join(conf.prediction_dir, os.path.basename(files_input[0][k])) + cv2.imwrite(output_file, cv2.cvtColor(prediction, cv2.COLOR_RGB2BGR)) diff --git a/model/train.py b/model/train.py new file mode 100755 index 0000000..73e7404 --- /dev/null +++ b/model/train.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python + +# ============================================================================== +# MIT License +# +# Copyright 2020 Institute for Automotive Engineering of RWTH Aachen University. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================== + +import importlib +import os +import sys +from datetime import datetime +import numpy as np +import cv2 +import tensorflow as tf +import configargparse + +import utils + + +# parse parameters from config file or CLI +parser = configargparse.ArgParser() +parser.add("-c", "--config", is_config_file=True, help="config file") +parser.add("-it", "--input-training", type=str, required=True, nargs="+", help="directory/directories of input samples for training") +parser.add("-lt", "--label-training", type=str, required=True, help="directory of label samples for training") +parser.add("-nt", "--max-samples-training", type=int, default=None, help="maximum number of training samples") +parser.add("-iv", "--input-validation", type=str, required=True, nargs="+", help="directory/directories of input samples for validation") +parser.add("-lv", "--label-validation", type=str, required=True, help="directory of label samples for validation") +parser.add("-nv", "--max-samples-validation", type=int, default=None, help="maximum number of validation samples") +parser.add("-is", "--image-shape", type=int, required=True, nargs=2, help="image dimensions (HxW) of inputs and labels for network") +parser.add("-ohi", "--one-hot-palette-input", type=str, required=True, help="xml-file for one-hot-conversion of input images") +parser.add("-ohl", "--one-hot-palette-label", type=str, required=True, help="xml-file for one-hot-conversion of label images") +parser.add("-m", "--model", type=str, required=True, help="Python file defining the neural network") +parser.add("-uh", "--unetxst-homographies", type=str, default=None, help="Python file defining a list H of homographies to be used in uNetXST model") +parser.add("-e", "--epochs", type=int, required=True, help="number of epochs for training") +parser.add("-bs", "--batch-size", type=int, required=True, help="batch size for training") +parser.add("-lr", "--learning-rate", type=float, default=1e-4, help="learning rate of Adam optimizer for training") +parser.add("-lw", "--loss-weights", type=float, default=None, nargs="+", help="factors for weighting classes differently in loss function") +parser.add("-esp", "--early-stopping-patience", type=int, default=10, help="patience for early-stopping due to converged validation mIoU") +parser.add("-si", "--save-interval", type=int, default=5, help="epoch interval between exports of the model") +parser.add("-o", "--output-dir", type=str, required=True, help="output dir for TensorBoard and models") +parser.add("-mw", "--model-weights", type=str, default=None, help="weights file of trained model for training continuation") +conf, unknown = parser.parse_known_args() + + +# determine absolute filepaths +conf.input_training = [utils.abspath(path) for path in conf.input_training] +conf.label_training = utils.abspath(conf.label_training) +conf.input_validation = [utils.abspath(path) for path in conf.input_validation] +conf.label_validation = utils.abspath(conf.label_validation) 
+conf.one_hot_palette_input = utils.abspath(conf.one_hot_palette_input) +conf.one_hot_palette_label = utils.abspath(conf.one_hot_palette_label) +conf.model = utils.abspath(conf.model) +conf.unetxst_homographies = utils.abspath(conf.unetxst_homographies) if conf.unetxst_homographies is not None else conf.unetxst_homographies +conf.model_weights = utils.abspath(conf.model_weights) if conf.model_weights is not None else conf.model_weights +conf.output_dir = utils.abspath(conf.output_dir) + + +# load network architecture module +architecture = utils.load_module(conf.model) + + +# get max_samples_training random training samples +n_inputs = len(conf.input_training) +files_train_input = [utils.get_files_in_folder(folder) for folder in conf.input_training] +files_train_label = utils.get_files_in_folder(conf.label_training) +_, idcs = utils.sample_list(files_train_label, n_samples=conf.max_samples_training) +files_train_input = [np.take(f, idcs) for f in files_train_input] +files_train_label = np.take(files_train_label, idcs) +image_shape_original_input = utils.load_image(files_train_input[0][0]).shape[0:2] +image_shape_original_label = utils.load_image(files_train_label[0]).shape[0:2] +print(f"Found {len(files_train_label)} training samples") + +# get max_samples_validation random validation samples +files_valid_input = [utils.get_files_in_folder(folder) for folder in conf.input_validation] +files_valid_label = utils.get_files_in_folder(conf.label_validation) +_, idcs = utils.sample_list(files_valid_label, n_samples=conf.max_samples_validation) +files_valid_input = [np.take(f, idcs) for f in files_valid_input] +files_valid_label = np.take(files_valid_label, idcs) +print(f"Found {len(files_valid_label)} validation samples") + + +# parse one-hot-conversion.xml +conf.one_hot_palette_input = utils.parse_convert_xml(conf.one_hot_palette_input) +conf.one_hot_palette_label = utils.parse_convert_xml(conf.one_hot_palette_label) +n_classes_input = len(conf.one_hot_palette_input) +n_classes_label = len(conf.one_hot_palette_label) + + +# build dataset pipeline parsing functions +def parse_sample(input_files, label_file): + # parse and process input images + inputs = [] + for inp in input_files: + inp = utils.load_image_op(inp) + inp = utils.resize_image_op(inp, image_shape_original_input, conf.image_shape, interpolation=tf.image.ResizeMethod.NEAREST_NEIGHBOR) + inp = utils.one_hot_encode_image_op(inp, conf.one_hot_palette_input) + inputs.append(inp) + inputs = inputs[0] if n_inputs == 1 else tuple(inputs) + # parse and process label image + label = utils.load_image_op(label_file) + label = utils.resize_image_op(label, image_shape_original_label, conf.image_shape, interpolation=tf.image.ResizeMethod.NEAREST_NEIGHBOR) + label = utils.one_hot_encode_image_op(label, conf.one_hot_palette_label) + return inputs, label + +# build training data pipeline +dataTrain = tf.data.Dataset.from_tensor_slices((tuple(files_train_input), files_train_label)) +dataTrain = dataTrain.shuffle(buffer_size=conf.max_samples_training, reshuffle_each_iteration=True) +dataTrain = dataTrain.map(parse_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE) +dataTrain = dataTrain.batch(conf.batch_size, drop_remainder=True) +dataTrain = dataTrain.repeat(conf.epochs) +dataTrain = dataTrain.prefetch(1) +print("Built data pipeline for training") + +# build validation data pipeline +dataValid = tf.data.Dataset.from_tensor_slices((tuple(files_valid_input), files_valid_label)) +dataValid = dataValid.map(parse_sample, 
num_parallel_calls=tf.data.experimental.AUTOTUNE) +dataValid = dataValid.batch(1) +dataValid = dataValid.repeat(conf.epochs) +dataValid = dataValid.prefetch(1) +print("Built data pipeline for validation") + + +# build model +if conf.unetxst_homographies is not None: + uNetXSTHomographies = utils.load_module(conf.unetxst_homographies) + model = architecture.get_network((conf.image_shape[0], conf.image_shape[1], n_classes_input), n_classes_label, n_inputs=n_inputs, thetas=uNetXSTHomographies.H) +else: + model = architecture.get_network((conf.image_shape[0], conf.image_shape[1], n_classes_input), n_classes_label) +if conf.model_weights is not None: + model.load_weights(conf.model_weights) +optimizer = tf.keras.optimizers.Adam(learning_rate=conf.learning_rate) +if conf.loss_weights is not None: + loss = utils.weighted_categorical_crossentropy(conf.loss_weights) +else: + loss = tf.keras.losses.CategoricalCrossentropy() +metrics = [tf.keras.metrics.CategoricalAccuracy(), utils.MeanIoUWithOneHotLabels(num_classes=n_classes_label)] +model.compile(optimizer=optimizer, loss=loss, metrics=metrics) +print(f"Compiled model {os.path.basename(conf.model)}") + + +# create output directories +model_output_dir = os.path.join(conf.output_dir, datetime.now().strftime("%Y-%m-%d-%H-%M-%S")) +tensorboard_dir = os.path.join(model_output_dir, "TensorBoard") +checkpoint_dir = os.path.join(model_output_dir, "Checkpoints") +if not os.path.exists(tensorboard_dir): + os.makedirs(tensorboard_dir) +if not os.path.exists(checkpoint_dir): + os.makedirs(checkpoint_dir) + + +# create callbacks to be called after each epoch +tensorboard_cb = tf.keras.callbacks.TensorBoard(tensorboard_dir, update_freq="epoch", profile_batch=0) +checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(os.path.join(checkpoint_dir, "e{epoch:03d}_weights.hdf5"), period=conf.save_interval, save_weights_only=True) +best_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(os.path.join(checkpoint_dir, "best_weights.hdf5"), save_best_only=True, monitor="val_mean_io_u_with_one_hot_labels", mode="max", save_weights_only=True) +early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor="val_mean_io_u_with_one_hot_labels", mode="max", patience=conf.early_stopping_patience, verbose=1) +callbacks = [tensorboard_cb, checkpoint_cb, best_checkpoint_cb, early_stopping_cb] + + +# start training +print("Starting training...") +n_batches_train = len(files_train_label) // conf.batch_size +n_batches_valid = len(files_valid_label) +model.fit(dataTrain, + epochs=conf.epochs, steps_per_epoch=n_batches_train, + validation_data=dataValid, validation_freq=1, validation_steps=n_batches_valid, + callbacks=callbacks) diff --git a/model/utils.py b/model/utils.py new file mode 100644 index 0000000..345cf5e --- /dev/null +++ b/model/utils.py @@ -0,0 +1,258 @@ +# ============================================================================== +# MIT License +# +# Copyright 2020 Institute for Automotive Engineering of RWTH Aachen University. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================== + +import os +import sys +import numpy as np +import random +import tensorflow as tf +import cv2 +import xml.etree.ElementTree as xmlET +from tqdm import tqdm +import importlib + + +def abspath(path): + + return os.path.abspath(os.path.expanduser(path)) + + +def get_files_in_folder(folder): + + return sorted([os.path.join(folder, f) for f in os.listdir(folder)]) + + + +def sample_list(*ls, n_samples, replace=False): + + n_samples = min(len(ls[0]), n_samples) + idcs = np.random.choice(np.arange(0, len(ls[0])), n_samples, replace=replace) + samples = zip([np.take(l, idcs) for l in ls]) + return samples, idcs + + +def load_module(module_file): + + name = os.path.splitext(os.path.basename(module_file))[0] + dir = os.path.dirname(module_file) + sys.path.append(dir) + spec = importlib.util.spec_from_file_location(name, module_file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + return module + + +def load_image(filename): + + img = cv2.imread(filename) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + return img + + +def load_image_op(filename): + + img = tf.io.read_file(filename) + img = tf.image.decode_png(img) + return img + + +def resize_image(img, shape, interpolation=cv2.INTER_CUBIC): + + # resize relevant image axis to length of corresponding target axis while preserving aspect ratio + axis = 0 if float(shape[0]) / float(img.shape[0]) > float(shape[1]) / float(img.shape[1]) else 1 + factor = float(shape[axis]) / float(img.shape[axis]) + img = cv2.resize(img, (0,0), fx=factor, fy=factor, interpolation=interpolation) + + # crop other image axis to match target shape + center = img.shape[int(not axis)] / 2.0 + step = shape[int(not axis)] / 2.0 + left = int(center-step) + right = int(center+step) + if axis == 0: + img = img[:, left:right] + else: + img = img[left:right, :] + + return img + + +def resize_image_op(img, fromShape, toShape, cropToPreserveAspectRatio=True, interpolation=tf.image.ResizeMethod.BICUBIC): + + if not cropToPreserveAspectRatio: + img = tf.image.resize(img, toShape, method=interpolation) + + else: + + # first crop to match target aspect ratio + fx = toShape[1] / fromShape[1] + fy = toShape[0] / fromShape[0] + relevantAxis = 0 if fx < fy else 1 + if relevantAxis == 0: + crop = fromShape[0] * toShape[1] / toShape[0] + img = tf.image.crop_to_bounding_box(img, 0, int((fromShape[1] - crop) / 2), 
fromShape[0], int(crop)) + else: + crop = fromShape[1] * toShape[0] / toShape[1] + img = tf.image.crop_to_bounding_box(img, int((fromShape[0] - crop) / 2), 0, int(crop), fromShape[1]) + + # then resize to target shape + img = tf.image.resize(img, toShape, method=interpolation) + + return img + + +def one_hot_encode_image(image, palette): + + one_hot_map = [] + + # find instances of class colors and append layer to one-hot-map + for class_colors in palette: + class_map = np.zeros(image.shape[0:2], dtype=bool) + for color in class_colors: + class_map = class_map | (image == color).all(axis=-1) + one_hot_map.append(class_map) + + # finalize one-hot-map + one_hot_map = np.stack(one_hot_map, axis=-1) + one_hot_map = one_hot_map.astype(np.float32) + + return one_hot_map + + +def one_hot_encode_image_op(image, palette): + + one_hot_map = [] + + for class_colors in palette: + + class_map = tf.zeros(image.shape[0:2], dtype=tf.int32) + + for color in class_colors: + # find instances of color and append layer to one-hot-map + class_map = tf.bitwise.bitwise_or(class_map, tf.cast(tf.reduce_all(tf.equal(image, color), axis=-1), tf.int32)) + one_hot_map.append(class_map) + + # finalize one-hot-map + one_hot_map = tf.stack(one_hot_map, axis=-1) + one_hot_map = tf.cast(one_hot_map, tf.float32) + + return one_hot_map + + +def one_hot_decode_image(one_hot_image, palette): + + # create empty image with correct dimensions + height, width = one_hot_image.shape[0:2] + depth = palette[0][0].size + image = np.zeros([height, width, depth]) + + # reduce all layers of one-hot-encoding to one layer with indices of the classes + map_of_classes = one_hot_image.argmax(2) + + for idx, class_colors in enumerate(palette): + # fill image with corresponding class colors + image[np.where(map_of_classes == idx)] = class_colors[0] + + image = image.astype(np.uint8) + + return image + + +def parse_convert_xml(conversion_file_path): + + defRoot = xmlET.parse(conversion_file_path).getroot() + + one_hot_palette = [] + class_list = [] + for idx, defElement in enumerate(defRoot.findall("SLabel")): + from_color = np.fromstring(defElement.get("fromColour"), dtype=int, sep=" ") + to_class = np.fromstring(defElement.get("toValue"), dtype=int, sep=" ") + if to_class in class_list: + one_hot_palette[class_list.index(to_class)].append(from_color) + else: + one_hot_palette.append([from_color]) + class_list.append(to_class) + + return one_hot_palette + + +def get_class_distribution(folder, shape, palette): + + # get filepaths + files = [os.path.join(folder, f) for f in os.listdir(folder) if not f.startswith(".")] + + n_classes = len(palette) + + def get_img(file, shape, interpolation=cv2.INTER_NEAREST, one_hot_reduce=False): + img = load_image(file) + img = resize_image(img, shape, interpolation) + img = one_hot_encode_image(img, palette) + return img + + px = shape[0] * shape[1] + + distribution = {} + for k in range(n_classes): + distribution[str(k)] = 0 + + i = 0 + bar = tqdm(files) + for f in bar: + + img = get_img(f, shape) + + classes = np.argmax(img, axis=-1) + + unique, counts = np.unique(classes, return_counts=True) + + occs = dict(zip(unique, counts)) + + for k in range(n_classes): + occ = occs[k] if k in occs.keys() else 0 + distribution[str(k)] = (distribution[str(k)] * i + occ / px) / (i+1) + + bar.set_postfix(distribution) + + i += 1 + + return distribution + + +def weighted_categorical_crossentropy(weights): + + def wcce(y_true, y_pred): + Kweights = tf.constant(weights) + if not tf.is_tensor(y_pred): y_pred = 
tf.constant(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + return tf.keras.backend.categorical_crossentropy(y_true, y_pred) * tf.keras.backend.sum(y_true * Kweights, axis=-1) + + return wcce + + +class MeanIoUWithOneHotLabels(tf.keras.metrics.MeanIoU): + + def __call__(self, y_true, y_pred, sample_weight=None): + y_true = tf.argmax(y_true, axis=-1) + y_pred = tf.argmax(y_pred, axis=-1) + return super().__call__(y_true, y_pred, sample_weight=sample_weight) diff --git a/preprocessing/camera_configs/1_FRLR/drone.yaml b/preprocessing/camera_configs/1_FRLR/drone.yaml new file mode 100644 index 0000000..1f1dbb9 --- /dev/null +++ b/preprocessing/camera_configs/1_FRLR/drone.yaml @@ -0,0 +1,15 @@ +# camera calibration matrix K +fx: 682.578 +fy: 682.578 +px: 482.0 +py: 302.0 + +# rotation matrix R (in deg) (drone is assumed to be at 90° pitch, 90° roll) +yaw: 0.0 +pitch: 90.0 +roll: 90.0 + +# vehicle coords of camera origin +XCam: 0.0 +YCam: 0.0 +ZCam: 50.0 diff --git a/preprocessing/camera_configs/1_FRLR/front.yaml b/preprocessing/camera_configs/1_FRLR/front.yaml new file mode 100644 index 0000000..a05d077 --- /dev/null +++ b/preprocessing/camera_configs/1_FRLR/front.yaml @@ -0,0 +1,15 @@ +# camera calibration matrix K +fx: 278.283 +fy: 408.1295 +px: 482.0 +py: 302.0 + +# rotation matrix R (in deg) +yaw: 0.0 +pitch: 0.0 +roll: 0.0 + +# vehicle coords of camera origin +XCam: 1.7 +YCam: 0.0 +ZCam: 1.4 diff --git a/preprocessing/camera_configs/1_FRLR/left.yaml b/preprocessing/camera_configs/1_FRLR/left.yaml new file mode 100644 index 0000000..c5be979 --- /dev/null +++ b/preprocessing/camera_configs/1_FRLR/left.yaml @@ -0,0 +1,15 @@ +# camera calibration matrix K +fx: 278.283 +fy: 408.1295 +px: 482.0 +py: 302.0 + +# rotation matrix R (in deg) +yaw: 90.0 +pitch: 0.0 +roll: 0.0 + +# vehicle coords of camera origin +XCam: 0.5 +YCam: 0.5 +ZCam: 1.5 diff --git a/preprocessing/camera_configs/1_FRLR/rear.yaml b/preprocessing/camera_configs/1_FRLR/rear.yaml new file mode 100644 index 0000000..7e24ef6 --- /dev/null +++ b/preprocessing/camera_configs/1_FRLR/rear.yaml @@ -0,0 +1,15 @@ +# camera calibration matrix K +fx: 278.283 +fy: 408.1295 +px: 482.0 +py: 302.0 + +# rotation matrix R (in deg) +yaw: 180.0 +pitch: 0.0 +roll: 0.0 + +# vehicle coords of camera origin +XCam: -0.6 +YCam: 0.0 +ZCam: 1.4 diff --git a/preprocessing/camera_configs/1_FRLR/right.yaml b/preprocessing/camera_configs/1_FRLR/right.yaml new file mode 100644 index 0000000..547e6ac --- /dev/null +++ b/preprocessing/camera_configs/1_FRLR/right.yaml @@ -0,0 +1,15 @@ +# camera calibration matrix K +fx: 278.283 +fy: 408.1295 +px: 482.0 +py: 302.0 + +# rotation matrix R (in deg) +yaw: -90.0 +pitch: 0.0 +roll: 0.0 + +# vehicle coords of camera origin +XCam: 0.5 +YCam: -0.5 +ZCam: 1.5 diff --git a/preprocessing/camera_configs/2_F/drone.yaml b/preprocessing/camera_configs/2_F/drone.yaml new file mode 100644 index 0000000..878cf89 --- /dev/null +++ b/preprocessing/camera_configs/2_F/drone.yaml @@ -0,0 +1,15 @@ +# camera calibration matrix K +fx: 3872.0 +fy: 3872.0 +px: 968.0 +py: 484.0 + +# rotation matrix R (in deg) (drone is assumed to be at 90° pitch, 90° roll) +yaw: 0.0 +pitch: 90.0 +roll: 90.0 + +# vehicle coords of camera origin +XCam: 28.84452 +YCam: 0.0 +ZCam: 100.0 diff --git a/preprocessing/camera_configs/2_F/front.yaml b/preprocessing/camera_configs/2_F/front.yaml new file mode 100644 index 0000000..da4ac6a --- /dev/null +++ b/preprocessing/camera_configs/2_F/front.yaml @@ -0,0 +1,15 @@ +# camera calibration matrix K +fx: 2166.108091 +fy: 
2167.956694 +px: 969.357789 +py: 615.399164 + +# rotation matrix R (in deg) +yaw: 0.0 +pitch: 0.0 +roll: 0.0 + +# vehicle coords of camera origin +XCam: 1.84452 +YCam: -0.096 +ZCam: 1.1273 diff --git a/preprocessing/homography_converter/README.md b/preprocessing/homography_converter/README.md new file mode 100644 index 0000000..6098cdd --- /dev/null +++ b/preprocessing/homography_converter/README.md @@ -0,0 +1,64 @@ +## Homography Converter + +This script can be used to convert a given homography for usage on different input/output resolutions and convert a given OpenCV homography for use with SpatialTransformer units. + +When preprocessing the dataset and creating homography images by running `ipm.py`, all images are processed at their native resolution. The actual neural network training can however be performed at a decreased resolution and different aspect ratio. Additionally, the SpatialTransformer units in *uNetXST* work slightly differently than OpenCV's warping method. In order to configure _uNetXST_ with correct homographies for in-network transformation, this script needs to be used. + +1. Use `ipm.py` with `-v` flag to only print the computed homographies. +1. Run this script with the homographies from `ipm.py` as input to convert them for usage with SpatialTransformer units. +1. Create a file similar to [preprocessing/homography_converter/uNetXST_homographies/1_FRLR.py](preprocessing/homography_converter/uNetXST_homographies/1_FRLR.py) and paste the converted SpatialTransformer homography there, if _uNetXST_ is chosen as the neural network architecture. Don't forget to set the `unetxst-homographies` parameter in the training config file. + +Note that for our datasets we already provide the correct homographies to be used within *uNetXST*. + +### Usage + +``` +usage: homography_converter.py [-h] -roi H W [-roo H W] -rni H W [-rno H W] + homography + +Converts a given homography matrix to work on images of different resolution. +Also converts OpenCV homography matrices for use with SpatialTransformer +units. 
+ +positional arguments: + homography homography to convert (string representation) + +optional arguments: + -h, --help show this help message and exit + -roi H W original input resolution (HxW) + -roo H W original output resolution (HxW) + -rni H W new input resolution (HxW) + -rno H W new output resolution (HxW) +``` + +### Example + +#### Convert homography from Inverse Perspective Mapping (`ipm.py`) for 604x964 front images (dataset `1_FRLR`) in order to be used on 256x512 images + +```bash +./ipm.py -v --drone ../camera_configs/1_FRLR/drone.yaml ../camera_configs/1_FRLR/front.yaml front ../camera_configs/1_FRLR/rear.yaml rear ../camera_configs/1_FRLR/left.yaml left ../camera_configs/1_FRLR/right.yaml right +# OpenCV homography for front: +# [[0.0, 0.8841865353311344, -253.37277367000263], [0.049056392233805146, 0.5285437237795494, -183.265385638118], [-0.0, 0.001750144780726984, -0.5285437237795492]] +# OpenCV homography for rear: +# [[6.288911300436434e-18, 0.8292344604207404, -264.08036704706365], [-0.04905639223380515, 0.5285437237795513, -135.9750235247304], [-0.0, 0.0017501447807269904, -0.5285437237795512]] +# OpenCV homography for left: +# [[0.04905639223380514, 0.7984814950483465, -264.7865925612947], [3.0038376863423275e-18, 0.4821577791689496, -159.26320930902278], [-0.0, 0.0016334684620118568, -0.49330747552758086]] +# OpenCV homography for right: +# [[-0.04905639223380516, 0.7984814950483448, -217.49623044790604], [3.0038376863423283e-18, 0.5044571718862112, -138.69450590963578], [-0.0, 0.0016334684620118542, -0.49330747552758]] +``` + +```bash +./homography_converter.py '[[0.0, 0.8841865353311344, -253.37277367000263], [0.049056392233805146, 0.5285437237795494, -183.265385638118], [-0.0, 0.001750144780726984, -0.5285437237795492]]' -roi 604 964 -rni 256 512 +``` + +#### Convert homography from Inverse Perspective Mapping (`ipm.py`) for 1936x1216 front images and 1936x1216 drone images (dataset `2_F`) in order to be used on 256x512 images + +```bash +./ipm.py -v --drone ../camera_configs/2_F/drone.yaml ../camera_configs/2_F/front.yaml front +# OpenCV homography for front: +# [[-8.869343394420501e-18, -0.031686570310719885, 58.219888879244245], [0.017875377577360244, 0.1995620198169141, -140.13793664741047], [-0.0, 0.0004091757529793379, -0.25180641631255507]] +``` + +```bash +./homography_converter.py '[[-8.869343394420501e-18, -0.031686570310719885, 58.219888879244245], [0.017875377577360244, 0.1995620198169141, -140.13793664741047], [-0.0, 0.0004091757529793379, -0.25180641631255507]]' -roi 1216 1936 -roo 968 1936 -rni 256 512 +``` diff --git a/preprocessing/homography_converter/homography_converter.py b/preprocessing/homography_converter/homography_converter.py new file mode 100755 index 0000000..8b1144d --- /dev/null +++ b/preprocessing/homography_converter/homography_converter.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python + +# ============================================================================== +# MIT License +# +# Copyright 2020 Institute for Automotive Engineering of RWTH Aachen University. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================== + +import os +import numpy as np +import argparse + + +# parse command line arguments +parser = argparse.ArgumentParser(description="Converts a given homography matrix to work on images of different resolution. Also converts OpenCV homography matrices for use with SpatialTransformer units.") +parser.add_argument("homography", type=str, help="homography to convert (string representation)") +parser.add_argument("-roi", type=int, nargs=2, required=True, metavar=("H", "W"), help="original input resolution (HxW)") +parser.add_argument("-roo", type=int, nargs=2, default=None, metavar=("H", "W"), help="original output resolution (HxW)") +parser.add_argument("-rni", type=int, nargs=2, required=True, metavar=("H", "W"), help="new input resolution (HxW)") +parser.add_argument("-rno", type=int, nargs=2, default=None, metavar=("H", "W"), help="new output resolution (HxW)") +args = parser.parse_args() + +oldInputResolution = args.roi +oldOutputResolution = args.roo if args.roo is not None else oldInputResolution +newInputResolution = args.rni +newOutputResolution = args.rno if args.rno is not None else newInputResolution + +# read original homography +cvH = np.array(eval(args.homography)) + +# calculate intermediate shapes +newInputAspectRatio = newInputResolution[0] / newInputResolution[1] +newOutputAspectRatio = newOutputResolution[0] / newOutputResolution[1] +isNewInputWide = newInputAspectRatio <= 1 +isNewOutputWide = newOutputAspectRatio <= 1 +if isNewInputWide: + newInputResolutionAtOldInputAspectRatio = np.array((newInputResolution[1] / oldInputResolution[1] * oldInputResolution[0], newInputResolution[1])) +else: + newInputResolutionAtOldInputAspectRatio = np.array((newInputResolution[0], newInputResolution[0] / oldInputResolution[0] * oldInputResolution[1])) +if isNewOutputWide: + oldOutputResolutionAtNewOutputAspectRatio = np.array((newOutputAspectRatio * oldOutputResolution[1], oldOutputResolution[1])) +else: + oldOutputResolutionAtNewOutputAspectRatio = np.array((oldOutputResolution[0], oldOutputResolution[0] / newOutputAspectRatio)) + +#=== introduce additional transformation matrices to correct for different aspect ratio + +# shift input to simulate padding to original aspect ratio +px = (newInputResolutionAtOldInputAspectRatio[1] - newInputResolution[1]) / 2 if not isNewInputWide else 0 +py = (newInputResolutionAtOldInputAspectRatio[0] - 
newInputResolution[0]) / 2 if isNewInputWide else 0 +Ti = np.array([[ 1, 0, px], + [ 0, 1, py], + [ 0, 0, 1]], dtype=np.float32) + +# scale input to original resolution +fx = oldInputResolution[1] / newInputResolutionAtOldInputAspectRatio[1] +fy = oldInputResolution[0] / newInputResolutionAtOldInputAspectRatio[0] +Ki = np.array([[fx, 0, 0], + [ 0, fy, 0], + [ 0, 0, 1]], dtype=np.float32) + +# crop away part of size 'oldOutputResolutionAtNewOutputAspectRatio' from original output resolution +px = -(oldOutputResolution[1] - oldOutputResolutionAtNewOutputAspectRatio[1]) / 2 +py = -(oldOutputResolution[0] - oldOutputResolutionAtNewOutputAspectRatio[0]) / 2 +To = np.array([[ 1, 0, px], + [ 0, 1, py], + [ 0, 0, 1]], dtype=np.float32) + +# scale output to new resolution +fx = newOutputResolution[1] / oldOutputResolutionAtNewOutputAspectRatio[1] +fy = newOutputResolution[0] / oldOutputResolutionAtNewOutputAspectRatio[0] +Ko = np.array([[fx, 0, 0], + [ 0, fy, 0], + [ 0, 0, 1]], dtype=np.float32) + +# assemble adjusted homography +cvHr = Ko.dot(To.dot(cvH.dot(Ki.dot(Ti)))) + + +#=== introduce additional transformation matrices to correct for implementation differences +# between cv2.warpPerspective() and STN's BilinearInterpolation + +# scale from unit grid (-1,1)^2 to new input resolution +fx = newInputResolution[1] / 2 +fy = newInputResolution[0] / 2 +px = newInputResolution[1] / 2 +py = newInputResolution[0] / 2 +Si = np.array([[fx, 0, px], + [ 0, fy, py], + [ 0, 0, 1]], dtype=np.float32) + +# scale from output resolution back to unit grid (-1,1)^2 +fx = 2 / newOutputResolution[1] +fy = 2 / newOutputResolution[0] +px = -1 +py = -1 +So = np.array([[fx, 0, px], + [ 0, fy, py], + [ 0, 0, 1]], dtype=np.float32) + +# assemble adjusted homography +stnHr = np.linalg.inv(So.dot(cvHr.dot(Si))) + + +#=== print transformation matrices + +print(f"\nOriginal OpenCV homography used for resolution {oldInputResolution[0]}x{oldInputResolution[1]} -> {oldOutputResolution[0]}x{oldOutputResolution[1]}:") +print(cvH.tolist()) + +print(f"\nAdjusted OpenCV homography usable for resolution {newInputResolution[0]}x{newInputResolution[1]} -> {newOutputResolution[0]}x{newOutputResolution[1]}:") +print(cvHr.tolist()) + +print(f"\nAdjusted SpatialTransformer homography usable for resolution {newInputResolution[0]}x{newInputResolution[1]} -> {newOutputResolution[0]}x{newOutputResolution[1]}:") +print(stnHr.tolist()) diff --git a/preprocessing/homography_converter/uNetXST_homographies/1_FRLR.py b/preprocessing/homography_converter/uNetXST_homographies/1_FRLR.py new file mode 100644 index 0000000..e3bdbac --- /dev/null +++ b/preprocessing/homography_converter/uNetXST_homographies/1_FRLR.py @@ -0,0 +1,33 @@ +# ============================================================================== +# MIT License +# +# Copyright 2020 Institute for Automotive Engineering of RWTH Aachen University. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================== + +import numpy as np + +# for dataset 1_FRLR +H = [ + np.array([[4.651574574230558e-14, 10.192351107009959, -5.36318723862984e-07], [-5.588661045867985e-07, 0.0, 2.3708767903941617], [35.30731833118676, 0.0, -1.7000018578614013]]), # front + np.array([[-5.336674306912119e-14, -10.192351107009957, 5.363187220578325e-07], [5.588660952931949e-07, 3.582264351370481e-23, 2.370876772982613], [-35.30731833118661, -2.263156574813233e-15, -0.5999981421386035]]), # rear + np.array([[20.38470221401992, 7.562206982469407e-14, -0.28867638384075833], [-3.422067857504854e-23, 2.794330463189411e-07, 2.540225111648729], [2.1619497190382224e-15, -17.65365916559334, -0.4999990710692976]]), # left + np.array([[-20.38470221401991, -4.849709834037436e-15, 0.2886763838407495], [-3.4220679184765114e-23, -2.794330512976549e-07, 2.5402251116487626], [2.161949719038217e-15, 17.653659165593304, -0.5000009289306967]]) # right +] diff --git a/preprocessing/homography_converter/uNetXST_homographies/2_F.py b/preprocessing/homography_converter/uNetXST_homographies/2_F.py new file mode 100644 index 0000000..f725a9f --- /dev/null +++ b/preprocessing/homography_converter/uNetXST_homographies/2_F.py @@ -0,0 +1,30 @@ +# ============================================================================== +# MIT License +# +# Copyright 2020 Institute for Automotive Engineering of RWTH Aachen University. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# ============================================================================== + +import numpy as np + +# for dataset 2_F +H = [ + np.array([[0.03506686613905922, 27.971438297785962, -0.17694724954191404], [0.3821882391578238, 9.481642330993019e-17, 5.46222110929461], [25.000001047737943, 6.202207287472715e-15, 27.000001047737943]]) # front +] diff --git a/preprocessing/ipm/README.md b/preprocessing/ipm/README.md new file mode 100644 index 0000000..eb0dfb7 --- /dev/null +++ b/preprocessing/ipm/README.md @@ -0,0 +1,71 @@ +## Inverse Perspective Mapping to Bird's-Eye-View + +This script can be used to recreate a Bird's-Eye-View from camera images using Inverse Perspective Mapping. + +### Usage +``` +usage: ipm.py [-h] [-wm WM] [-hm HM] [-r R] + [--drone DRONE] [--batch] + [--output OUTPUT] [--cc] [-v] + [CAM IMG [CAM IMG ...]] + +Warps camera images to the plane z=0 in the world frame. + +positional arguments: + CAM IMG camera config file and image file + +optional arguments: + -h, --help show this help message and exit + -wm WM output image width in [m] + -hm HM output image height in [m] + -r R output image resolution in [px/m] + --drone DRONE camera config file of drone to map to + --batch process folders of images instead of single images + --output OUTPUT output directory to write transformed images to + --cc use with color-coded images to enable NN-interpolation + -v only print homography matrices +``` + +### Example +#### Generate and display BEV from segmented surround camera images, adjust output image size to match drone intrinsics +```bash +./ipm.py --cc --drone droneCameraConfig.yaml front.yaml front.png rear.yaml rear.png left.yaml left.png right.yaml right.png +``` +![](assets/example.png) +#### Generate and export BEVs from multiple pairs of front and rear images +```bash +./ipm.py --batch --output output/ frontCameraConfig.yaml frontImages/ rearCameraConfig.yaml rearImages/ +``` + +### How it works + +The relationship between world coordinates $`x_w`$ and the image pixels $`x_i`$ onto which world points are projected is given by the projection matrix $`\mathrm{P}`$: +```math +x_i = \mathrm{P} x_w \quad \,, +``` +where we use homogeneous coordinates, which would still need to be normalized. + +The projection matrix encodes the camera intrinsics $`\mathrm{K}`$ and extrinsics (rotation $`\mathrm{R}`$ and translation $`\mathrm{t}`$ w.r.t. the world frame): +```math +\mathrm{P} = \mathrm{K} \left[ \mathrm{R} | \mathrm{t}\right] \,. +``` + +Now, we can also imagine that there exists a transformation $`\mathrm{M}`$ mapping from the road plane to the world frame, s.t. +```math +x_w = \mathrm{M} x_r \quad \,. +``` + +Putting these relations together, we can transform from given image coordinates $`x_i`$ to coordinates in the road plane $`x_r`$ by inverting $`\mathrm{P}\mathrm{M}`$: +```math +x_r = \left( \mathrm{P} \mathrm{M} \right)^{-1} x_i \quad \,. +``` + +We have an input image and thus have the image coordinates $`x_i`$. The camera intrinsics and extrinsics are assumed to be known and have to be specified via the config file. + +What remains is the definition of $`\mathrm{M}`$. In the simplest case we co-locate world frame origin and road plane origin. Then the mapping +```math +\mathrm{M} = \begin{bmatrix}1&0&0\\0&1&0\\0&0&0\\0&0&1\end{bmatrix} +``` +would simply introduce a z-coordinate of 0 when going from road plane to world frame. For a better visualization, one would need to scale between road plane metric coordinates and road image plane pixels as well as do some translation. 
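+ +As a minimal sketch of this chain (all numbers below are made-up placeholders; the real intrinsics and extrinsics are read from the camera config files), the homography could be assembled like this: +```python +import numpy as np + +# placeholder intrinsics K and extrinsics R, t (with t = -R * camera position, as in ipm.py below) +K = np.array([[300.0, 0.0, 480.0], [0.0, 300.0, 270.0], [0.0, 0.0, 1.0]]) +R = np.eye(3) +t = np.array([[0.0], [0.0], [-2.0]]) +P = K.dot(np.hstack([R, t]))  # 3x4 projection matrix + +# simplest road-plane-to-world mapping: (x, y, 1) -> (x, y, 0, 1) +M = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]]) + +# homography taking image pixels to road-plane coordinates +IPM = np.linalg.inv(P.dot(M)) +# to render an actual BEV image, fold the px/m scaling and origin shift into M +# before warping with cv2.warpPerspective(img, IPM, output_size) +``` +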
Have a look at the code for this last step, as the warping to the road plane is already working with the simple mapping. + +In the code, the vehicle-frame is used as the world frame. diff --git a/preprocessing/ipm/assets/example.png b/preprocessing/ipm/assets/example.png new file mode 100644 index 0000000..3fa3717 Binary files /dev/null and b/preprocessing/ipm/assets/example.png differ diff --git a/preprocessing/ipm/ipm.py b/preprocessing/ipm/ipm.py new file mode 100755 index 0000000..9991d9e --- /dev/null +++ b/preprocessing/ipm/ipm.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python + +# ============================================================================== +# MIT License +# +# Copyright 2020 Institute for Automotive Engineering of RWTH Aachen University. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# ============================================================================== + +import os +import sys +import yaml +import numpy as np +import cv2 +import argparse +from tqdm import tqdm + + +class Camera: + + K = np.zeros([3, 3]) + R = np.zeros([3, 3]) + t = np.zeros([3, 1]) + P = np.zeros([3, 4]) + + def setK(self, fx, fy, px, py): + self.K[0, 0] = fx + self.K[1, 1] = fy + self.K[0, 2] = px + self.K[1, 2] = py + self.K[2, 2] = 1.0 + + def setR(self, y, p, r): + + Rz = np.array([[np.cos(-y), -np.sin(-y), 0.0], [np.sin(-y), np.cos(-y), 0.0], [0.0, 0.0, 1.0]]) + Ry = np.array([[np.cos(-p), 0.0, np.sin(-p)], [0.0, 1.0, 0.0], [-np.sin(-p), 0.0, np.cos(-p)]]) + Rx = np.array([[1.0, 0.0, 0.0], [0.0, np.cos(-r), -np.sin(-r)], [0.0, np.sin(-r), np.cos(-r)]]) + Rs = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0], [1.0, 0.0, 0.0]]) # switch axes (x = -y, y = -z, z = x) + self.R = Rs.dot(Rz.dot(Ry.dot(Rx))) + + def setT(self, XCam, YCam, ZCam): + X = np.array([XCam, YCam, ZCam]) + self.t = -self.R.dot(X) + + def updateP(self): + Rt = np.zeros([3, 4]) + Rt[0:3, 0:3] = self.R + Rt[0:3, 3] = self.t + self.P = self.K.dot(Rt) + + def __init__(self, config): + self.setK(config["fx"], config["fy"], config["px"], config["py"]) + self.setR(np.deg2rad(config["yaw"]), np.deg2rad(config["pitch"]), np.deg2rad(config["roll"])) + self.setT(config["XCam"], config["YCam"], config["ZCam"]) + self.updateP() + + +# parse command line arguments +parser = argparse.ArgumentParser(description="Warps camera images to the plane z=0 in the world frame.") +parser.add_argument("camera_img_pair", metavar="CAM IMG", nargs='*', help="camera config file and image file") +parser.add_argument("-wm", type=float, help="output image width in [m]", default=20) +parser.add_argument("-hm", type=float, help="output image height in [m]", default=40) +parser.add_argument("-r", type=float, help="output image resolution in [px/m]", default=20) +parser.add_argument("--drone", type=str, help="camera config file of drone to map to") +parser.add_argument("--batch", help="process folders of images instead of single images", action="store_true") +parser.add_argument("--output", help="output directory to write transformed images to") +parser.add_argument("--cc", help="use with color-coded images to enable NN-interpolation", action="store_true") +parser.add_argument("-v", help="only print homography matrices", action="store_true") +args = parser.parse_args() + + +# load camera configurations and image paths +cameraConfigs = [] +imagePathArgs = [] +for aIdx in range(int(len(args.camera_img_pair) / 2.0)): + with open(os.path.abspath(args.camera_img_pair[2*aIdx])) as stream: + cameraConfigs.append(yaml.safe_load(stream)) + imagePathArgs.append(args.camera_img_pair[2*aIdx+1]) +toDrone = False +if args.drone: + toDrone = True + with open(os.path.abspath(args.drone)) as stream: + droneConfig = yaml.safe_load(stream) + +# load image paths +imagePaths = [] +if not args.batch: + imagePaths.append(imagePathArgs) +else: + for path in imagePathArgs: + imagePaths.append([os.path.join(path, f) for f in sorted(os.listdir(path)) if f[0] != "."]) + imagePaths = list(map(list, zip(*imagePaths))) # transpose ([[f1,f2],[r1,r2]] -> [[f1,r1],[f2,r2]]) +outputFilenames = [os.path.basename(imageTuple[0]) for imageTuple in imagePaths] + +# create output directories +export = False +if args.output: + export = True + outputDir = os.path.abspath(args.output) + if not os.path.exists(outputDir): + os.makedirs(outputDir) + +# initialize camera objects +cams = [] +for config in 
cameraConfigs: + cams.append(Camera(config)) +if toDrone: + drone = Camera(droneConfig) + +# calculate output shape; adjust to match drone image, if specified +if not toDrone: + pxPerM = (args.r, args.r) + outputRes = (int(args.wm * pxPerM[0]), int(args.hm * pxPerM[1])) +else: + outputRes = (int(2 * droneConfig["py"]), int(2 * droneConfig["px"])) + dx = outputRes[1] / droneConfig["fx"] * droneConfig["ZCam"] + dy = outputRes[0] / droneConfig["fy"] * droneConfig["ZCam"] + pxPerM = (outputRes[0] / dy, outputRes[1] / dx) + +# setup mapping from street/top-image plane to world coords +shift = (outputRes[0] / 2.0, outputRes[1] / 2.0) +if toDrone: + shift = shift[0] + droneConfig["YCam"] * pxPerM[0], shift[1] - droneConfig["XCam"] * pxPerM[1] +M = np.array([[1.0 / pxPerM[1], 0.0, -shift[1] / pxPerM[1]], [0.0, -1.0 / pxPerM[0], shift[0] / pxPerM[0]], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]]) + +# find IPM as inverse of P*M +IPMs = [] +for cam in cams: + IPMs.append(np.linalg.inv(cam.P.dot(M))) + +# print homographies +if args.v: + for idx, ipm in enumerate(IPMs): + print(f"OpenCV homography for {args.camera_img_pair[2*idx+1]}:") + print(ipm.tolist()) + exit(0) + +# setup masks to later clip invalid parts from transformed images (inefficient, but constant runtime) +masks = [] +for config in cameraConfigs: + mask = np.zeros((outputRes[0], outputRes[1], 3), dtype=np.bool) + for i in range(outputRes[1]): + for j in range(outputRes[0]): + theta = np.rad2deg(np.arctan2(-j + outputRes[0] / 2 - droneConfig["YCam"] * pxPerM[0], i - outputRes[1] / 2 + droneConfig["XCam"] * pxPerM[1])) + if abs(theta - config["yaw"]) > 90 and abs(theta - config["yaw"]) < 270: + mask[j,i,:] = True + masks.append(mask) + +# process images +progBarWrapper = tqdm(imagePaths) +for imageTuple in progBarWrapper: + + filename = os.path.basename(imageTuple[0]) + progBarWrapper.set_postfix_str(filename) + + # load images + images = [] + for imgPath in imageTuple: + images.append(cv2.imread(imgPath)) + + # warp input images + interpMode = cv2.INTER_NEAREST if args.cc else cv2.INTER_LINEAR + warpedImages = [] + for img, IPM in zip(images, IPMs): + warpedImages.append(cv2.warpPerspective(img, IPM, (outputRes[1], outputRes[0]), flags=interpMode)) + + # remove invalid areas (behind the camera) from warped images + for warpedImg, mask in zip(warpedImages, masks): + warpedImg[mask] = 0 + + # stitch separate images to total bird's-eye-view + birdsEyeView = np.zeros(warpedImages[0].shape, dtype=np.uint8) + for warpedImg in warpedImages: + mask = np.any(warpedImg != (0,0,0), axis=-1) + birdsEyeView[mask] = warpedImg[mask] + + # display or export bird's-eye-view + if export: + cv2.imwrite(os.path.join(outputDir, filename), birdsEyeView) + else: + cv2.namedWindow(filename, cv2.WINDOW_NORMAL) + cv2.imshow(filename, birdsEyeView) + cv2.waitKey(0) + cv2.destroyAllWindows() diff --git a/preprocessing/occlusion/README.md b/preprocessing/occlusion/README.md new file mode 100644 index 0000000..2e1769c --- /dev/null +++ b/preprocessing/occlusion/README.md @@ -0,0 +1,35 @@ +## Drone Footage Occlusion + +Vehicle-mounted cameras cannot observe the entire surroundings, e.g. a car behind a truck cannot be seen. +This script can be used to remove these non-visible areas from segmented drone camera footage. + +### Usage +``` +usage: occlusion.py [-h] [--batch] [--output OUTPUT] + img drone cam [cam ...] + +Determines the areas not visible from vehicle cameras and removes them from +drone camera footage. 
+ +positional arguments: + img segmented drone image + drone drone camera config file + cam camera config file + +optional arguments: + -h, --help show this help message and exit + --batch process folders of images instead of single images + --output OUTPUT output directory to write output images to +``` + +### Example +```bash +./occlusion.py --batch images/ droneCameraConfig.yaml frontCameraConfig.yaml rearCameraConfig.yaml --output output_images/ +``` +![original](assets/example-original.png) ![occluded](assets/example-occluded.png) + +### How it works +From each camera origin defined by a camera configuration file, rays are cast to the image border pixels that lie within the camera's field-of-view. All pixels along a ray are transferred from the input image to the output image. If a pixel color corresponding to a blocking class (e.g. car, building) is observed, that particular ray is stopped, leaving an occluded part in the output image. The blocking objects, however, are always transferred as a whole using a flood-fill technique. + +### Performance +To speed up batch mode, the script processes multiple images in parallel. Still, there is room for performance optimization, as the per-pixel ray casting is implemented in plain Python. diff --git a/preprocessing/occlusion/assets/example-occluded.png b/preprocessing/occlusion/assets/example-occluded.png new file mode 100644 index 0000000..6164d1d Binary files /dev/null and b/preprocessing/occlusion/assets/example-occluded.png differ diff --git a/preprocessing/occlusion/assets/example-original.png b/preprocessing/occlusion/assets/example-original.png new file mode 100644 index 0000000..c3072c3 Binary files /dev/null and b/preprocessing/occlusion/assets/example-original.png differ diff --git a/preprocessing/occlusion/occlusion.py b/preprocessing/occlusion/occlusion.py new file mode 100755 index 0000000..cbaf6b6 --- /dev/null +++ b/preprocessing/occlusion/occlusion.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python + +# ============================================================================== +# MIT License +# +# Copyright 2020 Institute for Automotive Engineering of RWTH Aachen University. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# ============================================================================== + +import os +import argparse +import yaml +import tqdm +import multiprocessing +import numpy as np +import cv2 +import skimage.draw + + +BLOCKING_LABELS = ["building", "wall", "car", "truck", "bus", "caravan", "trailer", "train"] +TALL_NON_BLOCKING_LABELS = ["vegetation"] # will be visible behind small blocking objects (e.g. cars) +COLORS = { + "occluded" : (150, 150, 150), + "static" : ( 0, 0, 0), + "dynamic" : (111, 74, 0), + "ground" : ( 81, 0, 81), + "road" : (128, 64, 128), + "sidewalk" : (244, 35, 232), + "parking" : (250, 170, 160), + "rail track" : (230, 150, 140), + "building" : ( 70, 70, 70), + "wall" : (102, 102, 156), + "fence" : (190, 153, 153), + "guard rail" : (180, 165, 180), + "bridge" : (150, 100, 100), + "tunnel" : (150, 120, 90), + "pole" : (153, 153, 153), + "polegroup" : (153, 153, 153), + "traffic light": (250, 170, 30), + "traffic sign" : (220, 220, 0), + "vegetation" : (107, 142, 35), + "terrain" : (152, 251, 152), + "sky" : ( 70, 130, 180), + "person" : (255, 0, 0), + "rider" : (220, 20, 60), + "car" : ( 0, 0, 142), + "truck" : ( 0, 0, 70), + "bus" : ( 0, 60, 100), + "caravan" : ( 0, 0, 90), + "trailer" : ( 0, 0, 110), + "train" : ( 0, 80, 100), + "motorcycle" : ( 0, 0, 230), + "bicycle" : (119, 11, 32), + "roadmark" : (255, 255, 255) +} + +DUMMY_COLOR = tuple(np.random.randint(0, 256, 3)) +while DUMMY_COLOR in COLORS.values(): + DUMMY_COLOR = tuple(np.random.randint(0, 256, 3)) + + +class Camera: + + def __init__(self, config, frame, pxPerM): + + self.origin = (frame[0] + config["XCam"] * pxPerM[0], frame[1] - config["YCam"] * pxPerM[1]) + self.yaw = -config["yaw"] + self.fov = 2.0 * np.arctan(config["px"] / config["fx"]) * 180.0 / np.pi + thetaMin = self.yaw - self.fov / 2.0 + thetaMax = (self.yaw + self.fov / 2.0) + thetaMin = thetaMin % 180 if thetaMin < -180 else thetaMin + thetaMax = thetaMax % -180 if thetaMax > 180 else thetaMax + self.fovBounds = (thetaMin, thetaMax) + + def canSee(self, x, y): + + dx, dy = x - self.origin[0], y - self.origin[1] + theta = np.arctan2(dy, dx) * 180.0 / np.pi + if self.fovBounds[0] > self.fovBounds[1]: + return (self.fovBounds[0] <= theta) or (theta <= self.fovBounds[1]) + else: + return (self.fovBounds[0] <= theta) and (theta <= self.fovBounds[1]) + + +def floodFill(px, color, inputImg, outputImg): + + mask = np.zeros((inputImg.shape[0]+2, inputImg.shape[1]+2), np.uint8) + flags = 4 | (255 << 8) | cv2.FLOODFILL_FIXED_RANGE | cv2.FLOODFILL_MASK_ONLY + cv2.floodFill(image=inputImg, mask=mask, seedPoint=(px[0], px[1]), newVal=(255, 255, 255), loDiff=(1,1,1), upDiff=(1,1,1), flags=flags) + outputImg[np.where(mask[1:-1, 1:-1] == 255)] = color + + +def castRay(fromPoint, toPoint, inputImg, outputImg): + + # loop over all pixels along the ray, moving outwards + ray = list(zip(*skimage.draw.line(*(int(fromPoint[0]), int(fromPoint[1])), *(int(toPoint[0]), int(toPoint[1]))))) + stopRay = stopTransfer = False + for px in ray: + + # out-of-bounds check + if not (0 <= px[0] and px[0] < inputImg.shape[1] and 0 <= px[1] and px[1] < inputImg.shape[0]): + continue + + # check if ray hit a blocking object class + for label in BLOCKING_LABELS: + if (inputImg[px[1], px[0], :] == COLORS[label]).all(): + + # if car, continue ray to look for more blocking objects, else stop ray + if label == "car": + if stopTransfer: # if car behind another car, skip + continue + else: # if first car in line of ray + stopTransfer = True + else: + stopRay = True + + # transfer 
blocking object to output image + if not (outputImg[px[1], px[0], :] == COLORS[label]).all(): + floodFill(px, COLORS[label], inputImg, outputImg) + break + + if stopRay: # stop ray if blocked + break + if stopTransfer: # if transfer is stopped, still look for tall non-blocking labels to transfer + for label in TALL_NON_BLOCKING_LABELS: + if (inputImg[px[1], px[0], :] == COLORS[label]).all(): + outputImg[px[1], px[0], :] = inputImg[px[1], px[0], :] + break + else: # transfer px to output image + outputImg[px[1], px[0], :] = inputImg[px[1], px[0], :] + +# ============================================================================== + +# parse command line arguments and read image +parser = argparse.ArgumentParser(description="Determines the areas not visible from vehicle cameras and removes them from drone camera footage.") +parser.add_argument("img", help="segmented drone image") +parser.add_argument("drone", help="drone camera config file") +parser.add_argument("cam", nargs="+", help="camera config file") +parser.add_argument("--batch", help="process folders of images instead of single images", action="store_true") +parser.add_argument("--output", help="output directory to write output images to") +args = parser.parse_args() + +# load image paths +imagePaths = [] +if not args.batch: + imagePaths.append(os.path.abspath(args.img)) +else: + path = os.path.abspath(args.img) + imagePaths = [os.path.join(path, f) for f in sorted(os.listdir(path)) if f[0] != "."] + +# parse camera configs +with open(os.path.abspath(args.drone)) as stream: + droneConfig = yaml.safe_load(stream) +cameraConfigs = [] +for cameraConfig in args.cam: + with open(os.path.abspath(cameraConfig)) as stream: + cameraConfigs.append(yaml.safe_load(stream)) + +# create output directories +if args.output: + outputDir = os.path.abspath(args.output) + if not os.path.exists(outputDir): + os.makedirs(outputDir) + +# determine image dimensions in (m) +inputImg = cv2.imread(imagePaths[0]) +dxm = inputImg.shape[1] / droneConfig["fx"] * droneConfig["ZCam"] +dym = inputImg.shape[0] / droneConfig["fy"] * droneConfig["ZCam"] +pxPerM = (inputImg.shape[1] / dxm, inputImg.shape[0] / dym) +base_link = (int(inputImg.shape[1] / 2.0 - droneConfig["XCam"] * pxPerM[0]), int(inputImg.shape[0] / 2.0 + droneConfig["YCam"] * pxPerM[0])) + +# create cameras +cameras = [] +for cameraConfig in cameraConfigs: + cam = Camera(cameraConfig, base_link, pxPerM) + cameras.append(cam) + + +# define processing of a single image +def processImage(imagePath): + + filename = os.path.basename(imagePath) + + # read input image and create blank output image + inputImg = cv2.imread(imagePath) + inputImg = cv2.cvtColor(inputImg, cv2.COLOR_BGR2RGB) + outputImg = np.zeros(inputImg.shape, dtype=np.uint8) + np.array(COLORS["occluded"], dtype=np.uint8) + + # temporarily recolor ego vehicle (if in image), s.t. 
it does not block + if base_link[0] > 0 and base_link[1] > 0: + floodFill(base_link, DUMMY_COLOR, inputImg, inputImg) + + # loop over all border pixels to determine if ray is visible + rays = [] + for cam in cameras: + for x in range(inputImg.shape[1]): + if cam.canSee(x, 0): + rays.append((cam.origin, (x, 0))) + for x in range(inputImg.shape[1]): + if cam.canSee(x, inputImg.shape[0]): + rays.append((cam.origin, (x, inputImg.shape[0]))) + for y in range(inputImg.shape[0]): + if cam.canSee(0, y): + rays.append((cam.origin, (0, y))) + for y in range(inputImg.shape[0]): + if cam.canSee(inputImg.shape[1], y): + rays.append((cam.origin, (inputImg.shape[1], y))) + + # cast rays + for ray in rays: + castRay(ray[0], ray[1], inputImg, outputImg) + + # recolor ego vehicle as car and transfer to output + if base_link[0] > 0 and base_link[1] > 0: + floodFill(base_link, COLORS["car"], inputImg, outputImg) + floodFill(base_link, COLORS["car"], inputImg, outputImg) + + # display or export output image + outputImg = cv2.cvtColor(outputImg, cv2.COLOR_RGB2BGR) + if args.output: + cv2.imwrite(os.path.join(outputDir, filename), outputImg) + else: + cv2.namedWindow(filename, cv2.WINDOW_NORMAL) + cv2.imshow(filename, outputImg) + cv2.waitKey(0) + cv2.destroyAllWindows() + + +# process images in parallel +if args.batch: + print("Warning: This might take an extremely long time, are you sure you need to (re)generate the occluded labels?") + pool = multiprocessing.Pool(multiprocessing.cpu_count()) + for _ in tqdm.tqdm(pool.imap(processImage, imagePaths), desc="Processing images", total=len(imagePaths), smoothing=0): + pass + pool.close() + pool.join() +else: + processImage(imagePaths[0]) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f5a5a24 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,51 @@ +absl-py==0.9.0 +astor==0.8.1 +cachetools==4.0.0 +certifi==2019.11.28 +chardet==3.0.4 +ConfigArgParse==1.1 +cycler==0.10.0 +decorator==4.4.2 +gast==0.2.2 +google-auth==1.11.3 +google-auth-oauthlib==0.4.1 +google-pasta==0.2.0 +grpcio==1.27.2 +h5py==2.10.0 +idna==2.9 +imageio==2.8.0 +Keras-Applications==1.0.8 +Keras-Preprocessing==1.1.0 +kiwisolver==1.1.0 +Markdown==3.2.1 +matplotlib==3.2.0 +networkx==2.4 +numpy==1.18.1 +oauthlib==3.1.0 +opencv-python==4.2.0.32 +opt-einsum==3.2.0 +pandas==1.0.2 +Pillow==7.0.0 +protobuf==3.11.3 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pyparsing==2.4.6 +python-dateutil==2.8.1 +pytz==2019.3 +PyWavelets==1.1.1 +PyYAML==5.3 +requests==2.23.0 +requests-oauthlib==1.3.0 +rsa==4.0 +scikit-image==0.16.2 +scipy==1.4.1 +seaborn==0.10.0 +six==1.14.0 +tensorboard==2.1.1 +tensorflow==2.1.0 +tensorflow-estimator==2.1.0 +termcolor==1.1.0 +tqdm==4.43.0 +urllib3==1.25.8 +Werkzeug==1.0.0 +wrapt==1.12.1