From 4f715992ff38eb0980c8098a8084c2ef23e80485 Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryanl1288@hotmail.com>
Date: Sat, 24 Feb 2024 03:11:18 -0500
Subject: [PATCH 1/9] Functional and Refactored demo.cpp for MobileNet

---
 .../object_detection_nanodet/CMakeLists.txt   |  29 ++
 models/object_detection_nanodet/demo.cpp      | 480 ++++++++++++++++++
 2 files changed, 509 insertions(+)
 create mode 100644 models/object_detection_nanodet/CMakeLists.txt
 create mode 100644 models/object_detection_nanodet/demo.cpp
diff --git a/models/object_detection_nanodet/CMakeLists.txt b/models/object_detection_nanodet/CMakeLists.txt
new file mode 100644
index 00000000..1141423a
--- /dev/null
+++ b/models/object_detection_nanodet/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.24)
+set(project_name "opencv_zoo_object_detection_nanodet")
+
+PROJECT (${project_name})
+
+set(OPENCV_VERSION "4.9.0")
+set(OPENCV_INSTALLATION_PATH "" CACHE PATH "Where to look for OpenCV installation")
+find_package(OpenCV ${OPENCV_VERSION} REQUIRED HINTS ${OPENCV_INSTALLATION_PATH})
+# Find OpenCV, you may need to set OpenCV_DIR variable
+# to the absolute path to the directory containing OpenCVConfig.cmake file
+# via the command line or GUI
+
+file(GLOB SourceFile
+    "demo.cpp")
+# If the package has been found, several variables will
+# be set, you can find the full list with descriptions
+# in the OpenCVConfig.cmake file.
+# Print some message showing some of them
+message(STATUS "OpenCV library status:")
+message(STATUS "    config: ${OpenCV_DIR}")
+message(STATUS "    version: ${OpenCV_VERSION}")
+message(STATUS "    libraries: ${OpenCV_LIBS}")
+message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")
+
+# Declare the executable target built from your sources
+add_executable(${project_name} ${SourceFile})
+
+# Link your application with OpenCV libraries
+target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS})
diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp
new file mode 100644
index 00000000..9d0bf8b6
--- /dev/null
+++ b/models/object_detection_nanodet/demo.cpp
@@ -0,0 +1,480 @@
+#include <vector>
+#include <string>
+#include <iostream>
+
+#include <opencv2/opencv.hpp>
+
+using namespace std;
+using namespace cv;
+using namespace dnn;
+
+const auto backendTargetPairs = vector<pair<Backend, Target>>{
+    {DNN_BACKEND_OPENCV, DNN_TARGET_CPU},
+    {DNN_BACKEND_CUDA, DNN_TARGET_CUDA},
+    {DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16},
+    {DNN_BACKEND_TIMVX, DNN_TARGET_NPU},
+    {DNN_BACKEND_CANN, DNN_TARGET_NPU}
+};
+
+const vector<string> nanodetClassLabels = {
+    "person", "bicycle", "car", "motorcycle", "airplane", "bus",
+    "train", "truck", "boat", "traffic light", "fire hydrant",
+    "stop sign", "parking meter", "bench", "bird", "cat", "dog",
+    "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
+    "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+    "skis", "snowboard", "sports ball", "kite", "baseball bat",
+    "baseball glove", "skateboard", "surfboard", "tennis racket",
+    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
+    "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
+    "hot dog", "pizza", "donut", "cake", "chair", "couch",
+    "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
+    "mouse", "remote", "keyboard", "cell phone", "microwave",
+    "oven", "toaster", "sink", "refrigerator", "book", "clock",
+    "vase", "scissors", "teddy bear", "hair drier", "toothbrush" };
+
+class NanoDet {
+public:
+    NanoDet(const String& modelPath, const float probThresh = 0.35, const float iouThresh = 0.6, 
+            const Backend bId = DNN_BACKEND_DEFAULT, const Target tId = DNN_TARGET_CPU) :
+        modelPath(modelPath), probThreshold(probThresh),
+        iouThreshold(iouThresh), backendId(bId), targetId(tId), 
+        imageShape(416, 416), regMax(7)
+    {
+        this->strides = { 8, 16, 32, 64 };
+        this->net = readNet(modelPath);
+        setBackendAndTarget(bId, tId);
+        this->project = Mat::zeros(1, this->regMax + 1, CV_32F);
+        for (size_t i = 0; i <= this->regMax; ++i) 
+        {
+            this->project.at<float>(0, i) = static_cast<float>(i);
+        }
+        this->mean = Scalar(103.53, 116.28, 123.675);
+        this->std = Scalar(1.0 / 57.375, 1.0 / 57.12, 1.0 / 58.395);
+        this->generateAnchors();
+    }
+
+    void setBackendAndTarget(Backend bId, Target tId)
+    {
+        this->net.setPreferableBackend(bId);
+        this->net.setPreferableTarget(tId);
+    }
+
+    Mat preProcess(const Mat& inputImage)
+    {
+        Image2BlobParams paramNanodet;
+        paramNanodet.datalayout = DNN_LAYOUT_NCHW;
+        paramNanodet.ddepth = CV_32F;
+        paramNanodet.mean = this->mean;
+        paramNanodet.scalefactor = this->std;
+        paramNanodet.size = this->imageShape;
+        Mat blob;
+        blobFromImageWithParams(inputImage, blob, paramNanodet);
+        return blob;
+    }
+
+    Mat infer(const Mat& sourceImage)
+    {
+        Mat blob = this->preProcess(sourceImage);
+        this->net.setInput(blob);
+        vector<Mat> modelOutput;
+        this->net.forward(modelOutput, this->net.getUnconnectedOutLayersNames());
+        Mat preds = this->postProcess(modelOutput);
+        return preds;
+    }
+
+    Mat reshapeIfNeeded(const Mat& input) 
+    {
+        if (input.dims == 3)
+        {
+            return input.reshape(0, input.size[1]);
+        }
+        return input;
+    }
+
+    Mat softmaxActivation(const Mat& input) 
+    {
+        Mat x_exp, x_sum, x_repeat_sum, result;
+        exp(input.reshape(0, input.total() / (this->regMax + 1)), x_exp);
+        reduce(x_exp, x_sum, 1, REDUCE_SUM, CV_32F);
+        repeat(x_sum, 1, this->regMax + 1, x_repeat_sum);
+        divide(x_exp, x_repeat_sum, result);
+        return result;
+    }
+
+    Mat applyProjection(Mat& input) 
+    {
+        Mat repeat_project;
+        repeat(this->project, input.rows, 1, repeat_project);
+        multiply(input, repeat_project, input);
+        reduce(input, input, 1, REDUCE_SUM, CV_32F);
+        Mat projection = input.col(0).clone();
+        return projection.reshape(0, (4, projection.total() / 4));
+    }
+
+    void preNMS(Mat& anchors, Mat& bbox_pred, Mat& cls_score, const int nms_pre = 1000)
+    {
+        Mat max_scores;
+        reduce(cls_score, max_scores, 1, REDUCE_MAX);
+
+        Mat indices;
+        sortIdx(max_scores.t(), indices, SORT_DESCENDING);
+
+        Mat indices_float = indices.colRange(0, nms_pre);
+        Mat selected_anchors, selected_bbox_pred, selected_cls_score;
+        for (int j = 0; j < indices_float.cols; ++j) {
+            selected_anchors.push_back(anchors.row(indices_float.at<int>(j)));
+            selected_bbox_pred.push_back(bbox_pred.row(indices_float.at<int>(j)));
+            selected_cls_score.push_back(cls_score.row(indices_float.at<int>(j)));
+        }
+
+        anchors = selected_anchors;
+        bbox_pred = selected_bbox_pred;
+        cls_score = selected_cls_score;
+    }
+
+    void clipBoundingBoxes(Mat& x1, Mat& y1, Mat& x2, Mat& y2) 
+    {
+        Mat zeros = Mat::zeros(x1.size(), x1.type());
+        x1 = min(max(x1, zeros), Scalar(this->imageShape.width - 1));
+        y1 = min(max(y1, zeros), Scalar(this->imageShape.height - 1));
+        x2 = min(max(x2, zeros), Scalar(this->imageShape.width - 1));
+        y2 = min(max(y2, zeros), Scalar(this->imageShape.height - 1));
+    }
+
+    Mat calculateBoundingBoxes(const Mat& anchors, const Mat& bbox_pred) 
+    {
+        Mat x1 = anchors.col(0) - bbox_pred.col(0);
+        Mat y1 = anchors.col(1) - bbox_pred.col(1);
+        Mat x2 = anchors.col(0) + bbox_pred.col(2);
+        Mat y2 = anchors.col(1) + bbox_pred.col(3);
+
+        clipBoundingBoxes(x1, y1, x2, y2);
+
+        Mat bboxes;
+        hconcat(vector<Mat>{x1, y1, x2, y2}, bboxes);
+
+        return bboxes;
+    }
+
+    vector<Rect2d> bboxMatToRect2d(const Mat& bboxes)
+    {
+        Mat bboxes_wh(bboxes.clone());
+        bboxes_wh.colRange(2, 4) = bboxes_wh.colRange(2, 4) -= bboxes_wh.colRange(0, 2);
+        vector<Rect2d> boxesXYXY;
+        for (size_t i = 0; i < bboxes_wh.rows; i++) {
+            boxesXYXY.emplace_back(bboxes.at<float>(i, 0),
+                                   bboxes.at<float>(i, 1),
+                                   bboxes.at<float>(i, 2),
+                                   bboxes.at<float>(i, 3));
+        }
+        return boxesXYXY;
+    }
+
+    std::tuple<Mat, Mat> getClassIdAndConfidences(const Mat& scores)
+    {
+        Mat classIds(scores.rows, 1, CV_32FC1);
+        Mat confidences(scores.rows, 1, CV_32FC1);
+
+        for (size_t i = 0; i < scores.rows; ++i) {
+            Point maxLoc;
+            minMaxLoc(scores.row(i), nullptr, nullptr, nullptr, &maxLoc);
+            classIds.at<float>(i) = maxLoc.x;
+            confidences.at<float>(i) = scores.at<float>(i, maxLoc.x);
+        }
+        return std::make_tuple(classIds, confidences);
+    }
+
+    Mat postProcess(const vector<Mat>& preds)
+    {
+        vector<Mat> cls_scores, bbox_preds;
+        for (size_t i = 0; i < preds.size(); i += 2) {
+            cls_scores.push_back(preds[i]);
+            bbox_preds.push_back(preds[i + 1]);
+        }
+
+        vector<Mat> bboxes_mlvl;
+        vector<Mat> scores_mlvl;
+
+        for (size_t i = 0; i < strides.size(); ++i) {
+            if (i >= cls_scores.size() || i >= bbox_preds.size()) continue;
+            // Extract necessary data
+            int stride = strides[i];
+            Mat cls_score = reshapeIfNeeded(cls_scores[i]);
+            Mat bbox_pred = reshapeIfNeeded(bbox_preds[i]);
+            Mat anchors = anchorsMlvl[i].t();
+
+            // Softmax activation, projection, and calculate bounding boxes
+            bbox_pred = softmaxActivation(bbox_pred);
+            bbox_pred = applyProjection(bbox_pred);
+            bbox_pred = stride * bbox_pred;
+
+            const int nms_pre = 1000;
+            if (nms_pre > 0 && cls_score.rows > nms_pre) {
+                preNMS(anchors, bbox_pred, cls_score, nms_pre);
+            }
+            
+            Mat bboxes = calculateBoundingBoxes(anchors, bbox_pred);
+
+            
+            bboxes_mlvl.push_back(bboxes);
+            scores_mlvl.push_back(cls_score);
+        }
+        Mat bboxes;
+        Mat scores;
+        vconcat(bboxes_mlvl, bboxes);
+        vconcat(scores_mlvl, scores);
+
+        vector<Rect2d> boxesXYXY = bboxMatToRect2d(bboxes);
+        auto [classIds, confidences] = getClassIdAndConfidences(scores);
+
+        vector<int> indices;
+        NMSBoxes(boxesXYXY, confidences, probThreshold, iouThreshold, indices);
+
+        Mat det_bboxes, det_conf, det_classid;
+        for (int idx : indices) {
+            det_bboxes.push_back(bboxes.row(idx));
+            det_conf.push_back(confidences.at<float>(idx));
+            det_classid.push_back(classIds.at<float>(idx));
+        }
+
+        Mat result;
+        if (!det_bboxes.empty()) {
+            hconcat(det_bboxes, det_conf, result);
+            hconcat(result, det_classid, result);
+        }
+
+        return result;
+    }
+
+    void generateAnchors()
+    {
+        for (const int stride : strides) {
+            int feat_h = this->imageShape.height / stride;
+            int feat_w = this->imageShape.width / stride;
+            
+            vector<Mat> anchors;
+            
+            for (int y = 0; y < feat_h; ++y) {
+                for (int x = 0; x < feat_w; ++x) {
+                    float shift_x = x * stride;
+                    float shift_y = y * stride;
+                    float cx = shift_x + 0.5 * (stride - 1);
+                    float cy = shift_y + 0.5 * (stride - 1);
+                    Mat anchor_point = (Mat_<float>(2, 1) << cx, cy);
+                    anchors.push_back(anchor_point);
+                }
+            }
+            Mat anchors_mat;
+            hconcat(anchors, anchors_mat);
+            this->anchorsMlvl.push_back(anchors_mat);
+        }
+    }
+private:
+    Net net;
+    String modelPath;
+    vector<int> strides;
+    Size imageShape;
+    int regMax;
+    float probThreshold;
+    float iouThreshold;
+    Backend backendId;
+    Target targetId;
+    Mat project;
+    Scalar mean;
+    Scalar std;
+    vector<Mat> anchorsMlvl;
+};
+
+// Function to resize and pad an image and return both the image and scale information
+tuple<Mat, vector<double>> letterbox(const Mat& sourceImage, const Size& target_size = Size(416, 416)) {
+    Mat img = sourceImage.clone();
+
+    double top = 0, left = 0, newh = target_size.height, neww = target_size.width;
+
+    if (img.rows != img.cols) {
+        double hw_scale = static_cast<double>(img.rows) / img.cols;
+        if (hw_scale > 1) {
+            newh = target_size.height;
+            neww = static_cast<int>(target_size.width / hw_scale);
+            resize(img, img, Size(neww, newh), 0, 0, INTER_AREA);
+            left = static_cast<int>((target_size.width - neww) * 0.5);
+            copyMakeBorder(img, img, 0, 0, left, target_size.width - neww - left, BORDER_CONSTANT, Scalar(0));
+        } else {
+            newh = static_cast<int>(target_size.height * hw_scale);
+            neww = target_size.width;
+            resize(img, img, Size(neww, newh), 0, 0, INTER_AREA);
+            top = static_cast<int>((target_size.height - newh) * 0.5);
+            copyMakeBorder(img, img, top, target_size.height - newh - top, 0, 0, BORDER_CONSTANT, Scalar(0));
+        }
+    } else {
+        resize(img, img, target_size, 0, 0, INTER_AREA);
+    }
+    vector<double> letterbox_scale = {top, left, newh, neww};
+
+    return make_tuple(img, letterbox_scale);
+}
+
+// Function to scale bounding boxes back to original image coordinates
+vector<int> unletterbox(const Mat& bbox, const Size& original_image_shape, const vector<double>& letterbox_scale) {
+    vector<int> ret(bbox.cols);
+
+    int h = original_image_shape.height;
+    int w = original_image_shape.width;
+    double top = letterbox_scale[0];
+    double left = letterbox_scale[1];
+    double newh = letterbox_scale[2];
+    double neww = letterbox_scale[3];
+
+    if (h == w) {
+        double ratio = static_cast<double>(h) / newh;
+        for (int& val : ret) {
+            val = static_cast<int>(val * ratio);
+        }
+        return ret;
+    }
+
+    double ratioh = static_cast<double>(h) / newh;
+    double ratiow = static_cast<double>(w) / neww;
+    ret[0] = max(static_cast<int>((bbox.at<float>(0) - left) * ratiow), 0);
+    ret[1] = max(static_cast<int>((bbox.at<float>(1) - top) * ratioh), 0);
+    ret[2] = min(static_cast<int>((bbox.at<float>(2) - left) * ratiow), w);
+    ret[3] = min(static_cast<int>((bbox.at<float>(3) - top) * ratioh), h);
+
+    return ret;
+}
+
+// Function to visualize predictions on an image
+Mat visualize(const Mat& preds, const Mat& result_image, const vector<double>& letterbox_scale, double fps = 0.0) {
+    Mat visualized_image = result_image.clone();
+
+    // Draw FPS if provided
+    if (fps > 0.0) {
+        string fps_text = "FPS: " + to_string(fps);
+        putText(visualized_image, fps_text, Point(10, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2);
+    }
+
+    // Draw bounding boxes and labels for each prediction
+    for (size_t i = 0; i < preds.rows; i++) {
+        Mat pred = preds.row(i);
+        Mat bbox = pred.colRange(0, 4);
+        double conf = pred.at<float>(4);
+        int classid = static_cast<int>(pred.at<float>(5));
+
+        // Convert bbox coordinates back to original image space
+        vector<int> unnormalized_bbox = unletterbox(bbox, visualized_image.size(), letterbox_scale);
+
+        // Draw bounding box
+        rectangle(visualized_image, Point(unnormalized_bbox[0], unnormalized_bbox[1]),
+                      Point(unnormalized_bbox[2], unnormalized_bbox[3]), Scalar(0, 255, 0), 2);
+
+        // Draw label
+        stringstream label;
+        label << nanodetClassLabels[classid] << ": " << fixed << setprecision(2) << conf;
+        putText(visualized_image, label.str(), Point(unnormalized_bbox[0], unnormalized_bbox[1] - 10),
+                  FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0), 2);
+    }
+
+    return visualized_image;
+}
+
+void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, bool vis)
+{
+    cvtColor(inputImage, inputImage, COLOR_BGR2RGB);
+    tuple<Mat, vector<double>> w = letterbox(inputImage);
+    Mat inputBlob = get<0>(w);
+    vector<double> letterboxScale  = get<1>(w);
+    
+    tm.start();
+    Mat predictions = nanodet.infer(inputBlob);
+    tm.stop();
+    cout << "Inference time: " << tm.getTimeMilli() << " ms\n";
+
+    Mat img = visualize(predictions, inputImage, letterboxScale, tm.getFPS());
+    cvtColor(img, img, COLOR_BGR2RGB);
+    if (save)
+    {
+        static const string kOutputName = "result.jpg";
+        imwrite(kOutputName, img);
+    }
+    if (vis)
+    {
+        static const string kWinName = "model";
+        imshow(kWinName, img);
+    }
+}
+
+
+const String keys =
+        "{ help  h          |                                               | Print help message. }"
+        "{ model m          | object_detection_nanodet_2022nov.onnx        | Usage: Path to the model, defaults to object_detection_nanodet_2022nov.onnx  }"
+        "{ input i          |                                               | Path to the input image. Omit for using the default camera.}"
+        "{ confidence       | 0.35                                          | Class confidence }"
+        "{ nms              | 0.6                                           | Enter nms IOU threshold }"
+        "{ save s           | false                                         | Specify to save results. This flag is invalid when using the camera. }"
+        "{ vis v            | false                                         | Specify to open a window for result visualization. This flag is invalid when using the camera. }"
+        "{ backend bt       | 0                                             | Choose one of computation backends: "
+        "0: (default) OpenCV implementation + CPU, "
+        "1: CUDA + GPU (CUDA), "
+        "2: CUDA + GPU (CUDA FP16), "
+        "3: TIM-VX + NPU, "
+        "4: CANN + NPU}";
+
+int main(int argc, char** argv)
+{
+    CommandLineParser parser(argc, argv, keys);
+
+    parser.about("Use this script to run Nanodet inference using OpenCV, a contribution by Sri Siddarth Chakaravarthy as part of GSOC_2022.");
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    string model = parser.get<String>("model");
+    string inputPath = parser.get<String>("input");
+    float confThreshold = parser.get<float>("confidence");
+    float nmsThreshold = parser.get<float>("nms");
+    bool save = parser.get<bool>("save");
+    bool vis = parser.get<bool>("vis");
+    int backendTargetid = parser.get<int>("backend");
+
+    if (model.empty())
+    {
+        CV_Error(Error::StsError, "Model file " + model + " not found");
+    }
+
+    NanoDet nanodet(model, confThreshold, nmsThreshold,
+                    backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);
+
+    TickMeter tm;
+    if (parser.has("input"))
+    {
+        Mat inputImage = imread(samples::findFile(inputPath));
+        processImage(inputImage, nanodet, tm, save, vis);
+    }
+    else
+    {
+        VideoCapture cap;
+        cap.open(0);
+        if (!cap.isOpened())
+        {
+            CV_Error(Error::StsError, "Cannot open video or file");
+        }
+
+        Mat frame;
+        while (waitKey(1) < 0)
+        {
+            cap >> frame;
+            if (frame.empty())
+            {
+                cout << "Frame is empty" << endl;
+                waitKey();
+                break;
+            }
+            processImage(frame, nanodet, tm, save, vis);
+        }
+        cap.release();
+    }
+
+    return 0;
+}

From e554eb2cb1b1fc57ba4724b21d492158890fd4a3 Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryanl1288@hotmail.com>
Date: Sat, 24 Feb 2024 03:15:17 -0500
Subject: [PATCH 2/9] Fix inference timer text, added timer reset.

---
 models/object_detection_nanodet/demo.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp
index 9d0bf8b6..cb77c8f5 100644
--- a/models/object_detection_nanodet/demo.cpp
+++ b/models/object_detection_nanodet/demo.cpp
@@ -471,6 +471,7 @@ int main(int argc, char** argv)
                 waitKey();
                 break;
             }
+            tm.reset();
             processImage(frame, nanodet, tm, save, vis);
         }
         cap.release();

From b937304576a0196d323213f26ea3cb79313f199a Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryanl1288@hotmail.com>
Date: Sat, 24 Feb 2024 03:50:56 -0500
Subject: [PATCH 3/9] Updated README.md, remove FPS text for single image
 processing

---
 models/object_detection_nanodet/README.md | 19 +++++++++++++++++++
 models/object_detection_nanodet/demo.cpp  | 21 ++++++++++++---------
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/models/object_detection_nanodet/README.md b/models/object_detection_nanodet/README.md
index f0adcc9b..03b203f4 100644
--- a/models/object_detection_nanodet/README.md
+++ b/models/object_detection_nanodet/README.md
@@ -7,6 +7,8 @@ Note:
 
 ## Demo
 
+### Python
+
 Run the following command to try the demo: 
 ```shell
 # detect on camera input
@@ -17,6 +19,23 @@ python demo.py --input /path/to/image -v
 Note: 
 - image result saved as "result.jpg"
 
+### C++
+
+Install latest OpenCV and CMake >= 3.24.0 to get started with:
+
+```shell
+# A typical and default installation path of OpenCV is /usr/local
+cmake -B build -D OPENCV_INSTALLATION_PATH=/path/to/opencv/installation .
+cmake --build build
+
+# detect on camera input
+./build/opencv_zoo_object_detection_nanodet
+# detect on an image
+./build/opencv_zoo_object_detection_nanodet -i=/path/to/image
+# get help messages
+./build/opencv_zoo_object_detection_nanodet -h
+```
+
 
 ## Results
 
diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp
index cb77c8f5..14aeb359 100644
--- a/models/object_detection_nanodet/demo.cpp
+++ b/models/object_detection_nanodet/demo.cpp
@@ -344,11 +344,11 @@ vector<int> unletterbox(const Mat& bbox, const Size& original_image_shape, const
 }
 
 // Function to visualize predictions on an image
-Mat visualize(const Mat& preds, const Mat& result_image, const vector<double>& letterbox_scale, double fps = 0.0) {
+Mat visualize(const Mat& preds, const Mat& result_image, const vector<double>& letterbox_scale, bool video, double fps = 0.0) {
     Mat visualized_image = result_image.clone();
 
     // Draw FPS if provided
-    if (fps > 0.0) {
+    if (fps > 0.0 && video) {
         string fps_text = "FPS: " + to_string(fps);
         putText(visualized_image, fps_text, Point(10, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2);
     }
@@ -377,7 +377,7 @@ Mat visualize(const Mat& preds, const Mat& result_image, const vector<double>& l
     return visualized_image;
 }
 
-void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, bool vis)
+void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, bool vis, bool video)
 {
     cvtColor(inputImage, inputImage, COLOR_BGR2RGB);
     tuple<Mat, vector<double>> w = letterbox(inputImage);
@@ -389,7 +389,7 @@ void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, b
     tm.stop();
     cout << "Inference time: " << tm.getTimeMilli() << " ms\n";
 
-    Mat img = visualize(predictions, inputImage, letterboxScale, tm.getFPS());
+    Mat img = visualize(predictions, inputImage, letterboxScale, video, tm.getFPS());
     cvtColor(img, img, COLOR_BGR2RGB);
     if (save)
     {
@@ -406,12 +406,12 @@ void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, b
 
 const String keys =
         "{ help  h          |                                               | Print help message. }"
-        "{ model m          | object_detection_nanodet_2022nov.onnx        | Usage: Path to the model, defaults to object_detection_nanodet_2022nov.onnx  }"
+        "{ model m          | object_detection_nanodet_2022nov.onnx         | Usage: Path to the model, defaults to object_detection_nanodet_2022nov.onnx  }"
         "{ input i          |                                               | Path to the input image. Omit for using the default camera.}"
         "{ confidence       | 0.35                                          | Class confidence }"
         "{ nms              | 0.6                                           | Enter nms IOU threshold }"
-        "{ save s           | false                                         | Specify to save results. This flag is invalid when using the camera. }"
-        "{ vis v            | false                                         | Specify to open a window for result visualization. This flag is invalid when using the camera. }"
+        "{ save s           | true                                          | Specify to save results. This flag is invalid when using the camera. }"
+        "{ vis v            | true                                          | Specify to open a window for result visualization. This flag is invalid when using the camera. }"
         "{ backend bt       | 0                                             | Choose one of computation backends: "
         "0: (default) OpenCV implementation + CPU, "
         "1: CUDA + GPU (CUDA), "
@@ -450,7 +450,9 @@ int main(int argc, char** argv)
     if (parser.has("input"))
     {
         Mat inputImage = imread(samples::findFile(inputPath));
-        processImage(inputImage, nanodet, tm, save, vis);
+        static const bool kNotVideo = false;
+        processImage(inputImage, nanodet, tm, save, vis, kNotVideo);
+        waitKey(0);
     }
     else
     {
@@ -472,7 +474,8 @@ int main(int argc, char** argv)
                 break;
             }
             tm.reset();
-            processImage(frame, nanodet, tm, save, vis);
+            static const bool kIsVideo = true;
+            processImage(frame, nanodet, tm, save, vis, kIsVideo);
         }
         cap.release();
     }

From b8e692fe05cf3161e0060aab77e677a5e571ac18 Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryanl1288@hotmail.com>
Date: Sat, 24 Feb 2024 04:07:08 -0500
Subject: [PATCH 4/9] Add matching saved image message

---
 models/object_detection_nanodet/demo.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp
index 14aeb359..f93be1d8 100644
--- a/models/object_detection_nanodet/demo.cpp
+++ b/models/object_detection_nanodet/demo.cpp
@@ -395,6 +395,10 @@ void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, b
     {
         static const string kOutputName = "result.jpg";
         imwrite(kOutputName, img);
+        if (!video)
+        {
+            cout << "Results saved to " + kOutputName << endl;
+        }
     }
     if (vis)
     {

From d0c1147c5e086151916d34fa19fd2da21ec3fa13 Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryanl1288@hotmail.com>
Date: Sat, 24 Feb 2024 04:15:30 -0500
Subject: [PATCH 5/9] Removing inference time printout for video inputs

---
 models/object_detection_nanodet/demo.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp
index f93be1d8..66f73e32 100644
--- a/models/object_detection_nanodet/demo.cpp
+++ b/models/object_detection_nanodet/demo.cpp
@@ -387,7 +387,10 @@ void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, b
     tm.start();
     Mat predictions = nanodet.infer(inputBlob);
     tm.stop();
-    cout << "Inference time: " << tm.getTimeMilli() << " ms\n";
+    if (!video)
+    {
+        cout << "Inference time: " << tm.getTimeMilli() << " ms\n";
+    }
 
     Mat img = visualize(predictions, inputImage, letterboxScale, video, tm.getFPS());
     cvtColor(img, img, COLOR_BGR2RGB);

From 959a1707356875d635526a1dbbb5cf32d7ac7d2f Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryanl1288@hotmail.com>
Date: Sat, 24 Feb 2024 04:54:39 -0500
Subject: [PATCH 6/9] Update FPS text to 2 decimal places

---
 models/object_detection_nanodet/demo.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp
index 66f73e32..acf5ed4b 100644
--- a/models/object_detection_nanodet/demo.cpp
+++ b/models/object_detection_nanodet/demo.cpp
@@ -349,8 +349,9 @@ Mat visualize(const Mat& preds, const Mat& result_image, const vector<double>& l
 
     // Draw FPS if provided
     if (fps > 0.0 && video) {
-        string fps_text = "FPS: " + to_string(fps);
-        putText(visualized_image, fps_text, Point(10, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2);
+        std::ostringstream fps_stream;
+        fps_stream << "FPS: " << std::fixed << std::setprecision(2) << fps;
+        putText(visualized_image, fps_stream.str(), Point(10, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2);
     }
 
     // Draw bounding boxes and labels for each prediction

From 1b021e05360be3e5938b67e3982c1e840b074d4e Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryanl1288@hotmail.com>
Date: Sat, 24 Feb 2024 05:10:04 -0500
Subject: [PATCH 7/9] Update coding style for braces

---
 models/object_detection_nanodet/demo.cpp | 78 ++++++++++++++++--------
 1 file changed, 52 insertions(+), 26 deletions(-)

diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp
index acf5ed4b..549ef4e2 100644
--- a/models/object_detection_nanodet/demo.cpp
+++ b/models/object_detection_nanodet/demo.cpp
@@ -8,7 +8,8 @@ using namespace std;
 using namespace cv;
 using namespace dnn;
 
-const auto backendTargetPairs = vector<pair<Backend, Target>>{
+const auto backendTargetPairs = vector<pair<Backend, Target>>
+{
     {DNN_BACKEND_OPENCV, DNN_TARGET_CPU},
     {DNN_BACKEND_CUDA, DNN_TARGET_CUDA},
     {DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16},
@@ -16,7 +17,8 @@ const auto backendTargetPairs = vector<pair<Backend, Target>>{
     {DNN_BACKEND_CANN, DNN_TARGET_NPU}
 };
 
-const vector<string> nanodetClassLabels = {
+const vector<string> nanodetClassLabels = 
+{
     "person", "bicycle", "car", "motorcycle", "airplane", "bus",
     "train", "truck", "boat", "traffic light", "fire hydrant",
     "stop sign", "parking meter", "bench", "bird", "cat", "dog",
@@ -30,9 +32,11 @@ const vector<string> nanodetClassLabels = {
     "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
     "mouse", "remote", "keyboard", "cell phone", "microwave",
     "oven", "toaster", "sink", "refrigerator", "book", "clock",
-    "vase", "scissors", "teddy bear", "hair drier", "toothbrush" };
+    "vase", "scissors", "teddy bear", "hair drier", "toothbrush" 
+};
 
-class NanoDet {
+class NanoDet 
+{
 public:
     NanoDet(const String& modelPath, const float probThresh = 0.35, const float iouThresh = 0.6, 
             const Backend bId = DNN_BACKEND_DEFAULT, const Target tId = DNN_TARGET_CPU) :
@@ -121,7 +125,8 @@ class NanoDet {
 
         Mat indices_float = indices.colRange(0, nms_pre);
         Mat selected_anchors, selected_bbox_pred, selected_cls_score;
-        for (int j = 0; j < indices_float.cols; ++j) {
+        for (int j = 0; j < indices_float.cols; ++j) 
+        {
             selected_anchors.push_back(anchors.row(indices_float.at<int>(j)));
             selected_bbox_pred.push_back(bbox_pred.row(indices_float.at<int>(j)));
             selected_cls_score.push_back(cls_score.row(indices_float.at<int>(j)));
@@ -161,7 +166,8 @@ class NanoDet {
         Mat bboxes_wh(bboxes.clone());
         bboxes_wh.colRange(2, 4) = bboxes_wh.colRange(2, 4) -= bboxes_wh.colRange(0, 2);
         vector<Rect2d> boxesXYXY;
-        for (size_t i = 0; i < bboxes_wh.rows; i++) {
+        for (size_t i = 0; i < bboxes_wh.rows; i++) 
+        {
             boxesXYXY.emplace_back(bboxes.at<float>(i, 0),
                                    bboxes.at<float>(i, 1),
                                    bboxes.at<float>(i, 2),
@@ -175,7 +181,8 @@ class NanoDet {
         Mat classIds(scores.rows, 1, CV_32FC1);
         Mat confidences(scores.rows, 1, CV_32FC1);
 
-        for (size_t i = 0; i < scores.rows; ++i) {
+        for (size_t i = 0; i < scores.rows; ++i) 
+        {
             Point maxLoc;
             minMaxLoc(scores.row(i), nullptr, nullptr, nullptr, &maxLoc);
             classIds.at<float>(i) = maxLoc.x;
@@ -187,7 +194,8 @@ class NanoDet {
     Mat postProcess(const vector<Mat>& preds)
     {
         vector<Mat> cls_scores, bbox_preds;
-        for (size_t i = 0; i < preds.size(); i += 2) {
+        for (size_t i = 0; i < preds.size(); i += 2) 
+        {
             cls_scores.push_back(preds[i]);
             bbox_preds.push_back(preds[i + 1]);
         }
@@ -195,7 +203,8 @@ class NanoDet {
         vector<Mat> bboxes_mlvl;
         vector<Mat> scores_mlvl;
 
-        for (size_t i = 0; i < strides.size(); ++i) {
+        for (size_t i = 0; i < strides.size(); ++i) 
+        {
             if (i >= cls_scores.size() || i >= bbox_preds.size()) continue;
             // Extract necessary data
             int stride = strides[i];
@@ -209,7 +218,8 @@ class NanoDet {
             bbox_pred = stride * bbox_pred;
 
             const int nms_pre = 1000;
-            if (nms_pre > 0 && cls_score.rows > nms_pre) {
+            if (nms_pre > 0 && cls_score.rows > nms_pre) 
+            {
                 preNMS(anchors, bbox_pred, cls_score, nms_pre);
             }
             
@@ -231,14 +241,16 @@ class NanoDet {
         NMSBoxes(boxesXYXY, confidences, probThreshold, iouThreshold, indices);
 
         Mat det_bboxes, det_conf, det_classid;
-        for (int idx : indices) {
+        for (int idx : indices) 
+        {
             det_bboxes.push_back(bboxes.row(idx));
             det_conf.push_back(confidences.at<float>(idx));
             det_classid.push_back(classIds.at<float>(idx));
         }
 
         Mat result;
-        if (!det_bboxes.empty()) {
+        if (!det_bboxes.empty()) 
+        {
             hconcat(det_bboxes, det_conf, result);
             hconcat(result, det_classid, result);
         }
@@ -254,8 +266,10 @@ class NanoDet {
             
             vector<Mat> anchors;
             
-            for (int y = 0; y < feat_h; ++y) {
-                for (int x = 0; x < feat_w; ++x) {
+            for (int y = 0; y < feat_h; ++y) 
+            {
+                for (int x = 0; x < feat_w; ++x) 
+                {
                     float shift_x = x * stride;
                     float shift_y = y * stride;
                     float cx = shift_x + 0.5 * (stride - 1);
@@ -286,27 +300,34 @@ class NanoDet {
 };
 
 // Function to resize and pad an image and return both the image and scale information
-tuple<Mat, vector<double>> letterbox(const Mat& sourceImage, const Size& target_size = Size(416, 416)) {
+tuple<Mat, vector<double>> letterbox(const Mat& sourceImage, const Size& target_size = Size(416, 416)) 
+{
     Mat img = sourceImage.clone();
 
     double top = 0, left = 0, newh = target_size.height, neww = target_size.width;
 
-    if (img.rows != img.cols) {
+    if (img.rows != img.cols) 
+    {
         double hw_scale = static_cast<double>(img.rows) / img.cols;
-        if (hw_scale > 1) {
+        if (hw_scale > 1) 
+        {
             newh = target_size.height;
             neww = static_cast<int>(target_size.width / hw_scale);
             resize(img, img, Size(neww, newh), 0, 0, INTER_AREA);
             left = static_cast<int>((target_size.width - neww) * 0.5);
             copyMakeBorder(img, img, 0, 0, left, target_size.width - neww - left, BORDER_CONSTANT, Scalar(0));
-        } else {
+        } 
+        else 
+        {
             newh = static_cast<int>(target_size.height * hw_scale);
             neww = target_size.width;
             resize(img, img, Size(neww, newh), 0, 0, INTER_AREA);
             top = static_cast<int>((target_size.height - newh) * 0.5);
             copyMakeBorder(img, img, top, target_size.height - newh - top, 0, 0, BORDER_CONSTANT, Scalar(0));
         }
-    } else {
+    } 
+    else 
+    {
         resize(img, img, target_size, 0, 0, INTER_AREA);
     }
     vector<double> letterbox_scale = {top, left, newh, neww};
@@ -315,7 +336,8 @@ tuple<Mat, vector<double>> letterbox(const Mat& sourceImage, const Size& target_
 }
 
 // Function to scale bounding boxes back to original image coordinates
-vector<int> unletterbox(const Mat& bbox, const Size& original_image_shape, const vector<double>& letterbox_scale) {
+vector<int> unletterbox(const Mat& bbox, const Size& original_image_shape, const vector<double>& letterbox_scale) 
+{
     vector<int> ret(bbox.cols);
 
     int h = original_image_shape.height;
@@ -325,9 +347,11 @@ vector<int> unletterbox(const Mat& bbox, const Size& original_image_shape, const
     double newh = letterbox_scale[2];
     double neww = letterbox_scale[3];
 
-    if (h == w) {
+    if (h == w) 
+    {
         double ratio = static_cast<double>(h) / newh;
-        for (int& val : ret) {
+        for (int& val : ret) 
+        {
             val = static_cast<int>(val * ratio);
         }
         return ret;
@@ -344,18 +368,21 @@ vector<int> unletterbox(const Mat& bbox, const Size& original_image_shape, const
 }
 
 // Function to visualize predictions on an image
-Mat visualize(const Mat& preds, const Mat& result_image, const vector<double>& letterbox_scale, bool video, double fps = 0.0) {
+Mat visualize(const Mat& preds, const Mat& result_image, const vector<double>& letterbox_scale, bool video, double fps = 0.0) 
+{
     Mat visualized_image = result_image.clone();
 
     // Draw FPS if provided
-    if (fps > 0.0 && video) {
+    if (fps > 0.0 && video) 
+    {
         std::ostringstream fps_stream;
         fps_stream << "FPS: " << std::fixed << std::setprecision(2) << fps;
         putText(visualized_image, fps_stream.str(), Point(10, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2);
     }
 
     // Draw bounding boxes and labels for each prediction
-    for (size_t i = 0; i < preds.rows; i++) {
+    for (size_t i = 0; i < preds.rows; i++) 
+    {
         Mat pred = preds.row(i);
         Mat bbox = pred.colRange(0, 4);
         double conf = pred.at<float>(4);
@@ -487,6 +514,5 @@ int main(int argc, char** argv)
         }
         cap.release();
     }
-
     return 0;
 }

From ef0cf9b67db5eb0561bb3f4dc07485d3db02a64b Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryanl1288@hotmail.com>
Date: Mon, 26 Feb 2024 03:00:58 -0500
Subject: [PATCH 8/9] Address PR comments. Adjusted to C++11 standard.

---
 models/object_detection_nanodet/demo.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp
index 549ef4e2..0af8018a 100644
--- a/models/object_detection_nanodet/demo.cpp
+++ b/models/object_detection_nanodet/demo.cpp
@@ -112,7 +112,7 @@ class NanoDet
         multiply(input, repeat_project, input);
         reduce(input, input, 1, REDUCE_SUM, CV_32F);
         Mat projection = input.col(0).clone();
-        return projection.reshape(0, (4, projection.total() / 4));
+        return projection.reshape(0, projection.total() / 4);
     }
 
     void preNMS(Mat& anchors, Mat& bbox_pred, Mat& cls_score, const int nms_pre = 1000)
@@ -235,7 +235,9 @@ class NanoDet
         vconcat(scores_mlvl, scores);
 
         vector<Rect2d> boxesXYXY = bboxMatToRect2d(bboxes);
-        auto [classIds, confidences] = getClassIdAndConfidences(scores);
+        std::tuple<Mat, Mat> classIdAndConfidences = getClassIdAndConfidences(scores);
+        Mat classIds = std::get<0>(classIdAndConfidences);
+        Mat confidences = std::get<1>(classIdAndConfidences);
 
         vector<int> indices;
         NMSBoxes(boxesXYXY, confidences, probThreshold, iouThreshold, indices);

From 9cafb71b08333d4c873a4ea9d03f13b5212bccc3 Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryanl1288@hotmail.com>
Date: Tue, 27 Feb 2024 04:00:15 -0500
Subject: [PATCH 9/9] Addressed PR comments. Added C++11 cmake configuration,
 extracted classID and confidence function, and use NMSBoxesBatched now.

---
 .../object_detection_nanodet/CMakeLists.txt   |  3 ++
 models/object_detection_nanodet/demo.cpp      | 50 +++++++------------
 2 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/models/object_detection_nanodet/CMakeLists.txt b/models/object_detection_nanodet/CMakeLists.txt
index 1141423a..cebba4d0 100644
--- a/models/object_detection_nanodet/CMakeLists.txt
+++ b/models/object_detection_nanodet/CMakeLists.txt
@@ -25,5 +25,8 @@ message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")
 # Declare the executable target built from your sources
 add_executable(${project_name} ${SourceFile})
 
+# Set C++ compilation standard to C++11
+set(CMAKE_CXX_STANDARD 11)
+
 # Link your application with OpenCV libraries
 target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS})
diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp
index 0af8018a..2e32276f 100644
--- a/models/object_detection_nanodet/demo.cpp
+++ b/models/object_detection_nanodet/demo.cpp
@@ -176,21 +176,6 @@ class NanoDet
         return boxesXYXY;
     }
 
-    std::tuple<Mat, Mat> getClassIdAndConfidences(const Mat& scores)
-    {
-        Mat classIds(scores.rows, 1, CV_32FC1);
-        Mat confidences(scores.rows, 1, CV_32FC1);
-
-        for (size_t i = 0; i < scores.rows; ++i) 
-        {
-            Point maxLoc;
-            minMaxLoc(scores.row(i), nullptr, nullptr, nullptr, &maxLoc);
-            classIds.at<float>(i) = maxLoc.x;
-            confidences.at<float>(i) = scores.at<float>(i, maxLoc.x);
-        }
-        return std::make_tuple(classIds, confidences);
-    }
-
     Mat postProcess(const vector<Mat>& preds)
     {
         vector<Mat> cls_scores, bbox_preds;
@@ -235,28 +220,31 @@ class NanoDet
         vconcat(scores_mlvl, scores);
 
         vector<Rect2d> boxesXYXY = bboxMatToRect2d(bboxes);
-        std::tuple<Mat, Mat> classIdAndConfidences = getClassIdAndConfidences(scores);
-        Mat classIds = std::get<0>(classIdAndConfidences);
-        Mat confidences = std::get<1>(classIdAndConfidences);
+        vector<int> classIds;
+        vector<float> confidences;
+        for (size_t i = 0; i < scores.rows; ++i) 
+        {
+            Point maxLoc;
+            minMaxLoc(scores.row(i), nullptr, nullptr, nullptr, &maxLoc);
+            classIds.push_back(maxLoc.x);
+            confidences.push_back(scores.at<float>(i, maxLoc.x));
+        }
 
         vector<int> indices;
-        NMSBoxes(boxesXYXY, confidences, probThreshold, iouThreshold, indices);
-
-        Mat det_bboxes, det_conf, det_classid;
-        for (int idx : indices) 
+        NMSBoxesBatched(boxesXYXY, confidences, classIds, probThreshold, iouThreshold, indices);
+        Mat result(int(indices.size()), 6, CV_32FC1);
+        int row = 0;
+        for (auto idx : indices)
         {
-            det_bboxes.push_back(bboxes.row(idx));
-            det_conf.push_back(confidences.at<float>(idx));
-            det_classid.push_back(classIds.at<float>(idx));
+            bboxes.rowRange(idx, idx + 1).copyTo(result(Rect(0, row, 4, 1)));
+            result.at<float>(row, 4) = confidences[idx];
+            result.at<float>(row, 5) = static_cast<float>(classIds[idx]);
+            row++;
         }
-
-        Mat result;
-        if (!det_bboxes.empty()) 
+        if (indices.size() == 0)
         {
-            hconcat(det_bboxes, det_conf, result);
-            hconcat(result, det_classid, result);
+            return Mat();
         }
-
         return result;
     }