From 4f715992ff38eb0980c8098a8084c2ef23e80485 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 24 Feb 2024 03:11:18 -0500 Subject: [PATCH 1/9] Functional and Refactored demo.cpp for MobileNet --- .../object_detection_nanodet/CMakeLists.txt | 29 ++ models/object_detection_nanodet/demo.cpp | 480 ++++++++++++++++++ 2 files changed, 509 insertions(+) create mode 100644 models/object_detection_nanodet/CMakeLists.txt create mode 100644 models/object_detection_nanodet/demo.cpp diff --git a/models/object_detection_nanodet/CMakeLists.txt b/models/object_detection_nanodet/CMakeLists.txt new file mode 100644 index 00000000..1141423a --- /dev/null +++ b/models/object_detection_nanodet/CMakeLists.txt @@ -0,0 +1,29 @@ +cmake_minimum_required(VERSION 3.24) +set(project_name "opencv_zoo_object_detection_nanodet") + +PROJECT (${project_name}) + +set(OPENCV_VERSION "4.9.0") +set(OPENCV_INSTALLATION_PATH "" CACHE PATH "Where to look for OpenCV installation") +find_package(OpenCV ${OPENCV_VERSION} REQUIRED HINTS ${OPENCV_INSTALLATION_PATH}) +# Find OpenCV, you may need to set OpenCV_DIR variable +# to the absolute path to the directory containing OpenCVConfig.cmake file +# via the command line or GUI + +file(GLOB SourceFile + "demo.cpp") +# If the package has been found, several variables will +# be set, you can find the full list with descriptions +# in the OpenCVConfig.cmake file. +# Print some message showing some of them +message(STATUS "OpenCV library status:") +message(STATUS " config: ${OpenCV_DIR}") +message(STATUS " version: ${OpenCV_VERSION}") +message(STATUS " libraries: ${OpenCV_LIBS}") +message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}") + +# Declare the executable target built from your sources +add_executable(${project_name} ${SourceFile}) + +# Link your application with OpenCV libraries +target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS}) diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp new file mode 100644 index 00000000..9d0bf8b6 --- /dev/null +++ b/models/object_detection_nanodet/demo.cpp @@ -0,0 +1,480 @@ +#include +#include +#include + +#include + +using namespace std; +using namespace cv; +using namespace dnn; + +const auto backendTargetPairs = vector>{ + {DNN_BACKEND_OPENCV, DNN_TARGET_CPU}, + {DNN_BACKEND_CUDA, DNN_TARGET_CUDA}, + {DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16}, + {DNN_BACKEND_TIMVX, DNN_TARGET_NPU}, + {DNN_BACKEND_CANN, DNN_TARGET_NPU} +}; + +const vector nanodetClassLabels = { + "person", "bicycle", "car", "motorcycle", "airplane", "bus", + "train", "truck", "boat", "traffic light", "fire hydrant", + "stop sign", "parking meter", "bench", "bird", "cat", "dog", + "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", + "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", + "baseball glove", "skateboard", "surfboard", "tennis racket", + "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", + "banana", "apple", "sandwich", "orange", "broccoli", "carrot", + "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", + "mouse", "remote", "keyboard", "cell phone", "microwave", + "oven", "toaster", "sink", "refrigerator", "book", "clock", + "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; + +class NanoDet { +public: + NanoDet(const String& modelPath, const float probThresh = 0.35, const float iouThresh = 0.6, + const Backend bId = DNN_BACKEND_DEFAULT, const Target tId = DNN_TARGET_CPU) : + modelPath(modelPath), probThreshold(probThresh), + iouThreshold(iouThresh), backendId(bId), targetId(tId), + imageShape(416, 416), regMax(7) + { + this->strides = { 8, 16, 32, 64 }; + this->net = readNet(modelPath); + setBackendAndTarget(bId, tId); + this->project = Mat::zeros(1, this->regMax + 1, CV_32F); + for (size_t i = 0; i <= this->regMax; ++i) + { + this->project.at(0, i) = static_cast(i); + } + this->mean = Scalar(103.53, 116.28, 123.675); + this->std = Scalar(1.0 / 57.375, 1.0 / 57.12, 1.0 / 58.395); + this->generateAnchors(); + } + + void setBackendAndTarget(Backend bId, Target tId) + { + this->net.setPreferableBackend(bId); + this->net.setPreferableTarget(tId); + } + + Mat preProcess(const Mat& inputImage) + { + Image2BlobParams paramNanodet; + paramNanodet.datalayout = DNN_LAYOUT_NCHW; + paramNanodet.ddepth = CV_32F; + paramNanodet.mean = this->mean; + paramNanodet.scalefactor = this->std; + paramNanodet.size = this->imageShape; + Mat blob; + blobFromImageWithParams(inputImage, blob, paramNanodet); + return blob; + } + + Mat infer(const Mat& sourceImage) + { + Mat blob = this->preProcess(sourceImage); + this->net.setInput(blob); + vector modelOutput; + this->net.forward(modelOutput, this->net.getUnconnectedOutLayersNames()); + Mat preds = this->postProcess(modelOutput); + return preds; + } + + Mat reshapeIfNeeded(const Mat& input) + { + if (input.dims == 3) + { + return input.reshape(0, input.size[1]); + } + return input; + } + + Mat softmaxActivation(const Mat& input) + { + Mat x_exp, x_sum, x_repeat_sum, result; + exp(input.reshape(0, input.total() / (this->regMax + 1)), x_exp); + reduce(x_exp, x_sum, 1, REDUCE_SUM, CV_32F); + repeat(x_sum, 1, this->regMax + 1, x_repeat_sum); + divide(x_exp, x_repeat_sum, result); + return result; + } + + Mat applyProjection(Mat& input) + { + Mat repeat_project; + repeat(this->project, input.rows, 1, repeat_project); + multiply(input, repeat_project, input); + reduce(input, input, 1, REDUCE_SUM, CV_32F); + Mat projection = input.col(0).clone(); + return projection.reshape(0, (4, projection.total() / 4)); + } + + void preNMS(Mat& anchors, Mat& bbox_pred, Mat& cls_score, const int nms_pre = 1000) + { + Mat max_scores; + reduce(cls_score, max_scores, 1, REDUCE_MAX); + + Mat indices; + sortIdx(max_scores.t(), indices, SORT_DESCENDING); + + Mat indices_float = indices.colRange(0, nms_pre); + Mat selected_anchors, selected_bbox_pred, selected_cls_score; + for (int j = 0; j < indices_float.cols; ++j) { + selected_anchors.push_back(anchors.row(indices_float.at(j))); + selected_bbox_pred.push_back(bbox_pred.row(indices_float.at(j))); + selected_cls_score.push_back(cls_score.row(indices_float.at(j))); + } + + anchors = selected_anchors; + bbox_pred = selected_bbox_pred; + cls_score = selected_cls_score; + } + + void clipBoundingBoxes(Mat& x1, Mat& y1, Mat& x2, Mat& y2) + { + Mat zeros = Mat::zeros(x1.size(), x1.type()); + x1 = min(max(x1, zeros), Scalar(this->imageShape.width - 1)); + y1 = min(max(y1, zeros), Scalar(this->imageShape.height - 1)); + x2 = min(max(x2, zeros), Scalar(this->imageShape.width - 1)); + y2 = min(max(y2, zeros), Scalar(this->imageShape.height - 1)); + } + + Mat calculateBoundingBoxes(const Mat& anchors, const Mat& bbox_pred) + { + Mat x1 = anchors.col(0) - bbox_pred.col(0); + Mat y1 = anchors.col(1) - bbox_pred.col(1); + Mat x2 = anchors.col(0) + bbox_pred.col(2); + Mat y2 = anchors.col(1) + bbox_pred.col(3); + + clipBoundingBoxes(x1, y1, x2, y2); + + Mat bboxes; + hconcat(vector{x1, y1, x2, y2}, bboxes); + + return bboxes; + } + + vector bboxMatToRect2d(const Mat& bboxes) + { + Mat bboxes_wh(bboxes.clone()); + bboxes_wh.colRange(2, 4) = bboxes_wh.colRange(2, 4) -= bboxes_wh.colRange(0, 2); + vector boxesXYXY; + for (size_t i = 0; i < bboxes_wh.rows; i++) { + boxesXYXY.emplace_back(bboxes.at(i, 0), + bboxes.at(i, 1), + bboxes.at(i, 2), + bboxes.at(i, 3)); + } + return boxesXYXY; + } + + std::tuple getClassIdAndConfidences(const Mat& scores) + { + Mat classIds(scores.rows, 1, CV_32FC1); + Mat confidences(scores.rows, 1, CV_32FC1); + + for (size_t i = 0; i < scores.rows; ++i) { + Point maxLoc; + minMaxLoc(scores.row(i), nullptr, nullptr, nullptr, &maxLoc); + classIds.at(i) = maxLoc.x; + confidences.at(i) = scores.at(i, maxLoc.x); + } + return std::make_tuple(classIds, confidences); + } + + Mat postProcess(const vector& preds) + { + vector cls_scores, bbox_preds; + for (size_t i = 0; i < preds.size(); i += 2) { + cls_scores.push_back(preds[i]); + bbox_preds.push_back(preds[i + 1]); + } + + vector bboxes_mlvl; + vector scores_mlvl; + + for (size_t i = 0; i < strides.size(); ++i) { + if (i >= cls_scores.size() || i >= bbox_preds.size()) continue; + // Extract necessary data + int stride = strides[i]; + Mat cls_score = reshapeIfNeeded(cls_scores[i]); + Mat bbox_pred = reshapeIfNeeded(bbox_preds[i]); + Mat anchors = anchorsMlvl[i].t(); + + // Softmax activation, projection, and calculate bounding boxes + bbox_pred = softmaxActivation(bbox_pred); + bbox_pred = applyProjection(bbox_pred); + bbox_pred = stride * bbox_pred; + + const int nms_pre = 1000; + if (nms_pre > 0 && cls_score.rows > nms_pre) { + preNMS(anchors, bbox_pred, cls_score, nms_pre); + } + + Mat bboxes = calculateBoundingBoxes(anchors, bbox_pred); + + + bboxes_mlvl.push_back(bboxes); + scores_mlvl.push_back(cls_score); + } + Mat bboxes; + Mat scores; + vconcat(bboxes_mlvl, bboxes); + vconcat(scores_mlvl, scores); + + vector boxesXYXY = bboxMatToRect2d(bboxes); + auto [classIds, confidences] = getClassIdAndConfidences(scores); + + vector indices; + NMSBoxes(boxesXYXY, confidences, probThreshold, iouThreshold, indices); + + Mat det_bboxes, det_conf, det_classid; + for (int idx : indices) { + det_bboxes.push_back(bboxes.row(idx)); + det_conf.push_back(confidences.at(idx)); + det_classid.push_back(classIds.at(idx)); + } + + Mat result; + if (!det_bboxes.empty()) { + hconcat(det_bboxes, det_conf, result); + hconcat(result, det_classid, result); + } + + return result; + } + + void generateAnchors() + { + for (const int stride : strides) { + int feat_h = this->imageShape.height / stride; + int feat_w = this->imageShape.width / stride; + + vector anchors; + + for (int y = 0; y < feat_h; ++y) { + for (int x = 0; x < feat_w; ++x) { + float shift_x = x * stride; + float shift_y = y * stride; + float cx = shift_x + 0.5 * (stride - 1); + float cy = shift_y + 0.5 * (stride - 1); + Mat anchor_point = (Mat_(2, 1) << cx, cy); + anchors.push_back(anchor_point); + } + } + Mat anchors_mat; + hconcat(anchors, anchors_mat); + this->anchorsMlvl.push_back(anchors_mat); + } + } +private: + Net net; + String modelPath; + vector strides; + Size imageShape; + int regMax; + float probThreshold; + float iouThreshold; + Backend backendId; + Target targetId; + Mat project; + Scalar mean; + Scalar std; + vector anchorsMlvl; +}; + +// Function to resize and pad an image and return both the image and scale information +tuple> letterbox(const Mat& sourceImage, const Size& target_size = Size(416, 416)) { + Mat img = sourceImage.clone(); + + double top = 0, left = 0, newh = target_size.height, neww = target_size.width; + + if (img.rows != img.cols) { + double hw_scale = static_cast(img.rows) / img.cols; + if (hw_scale > 1) { + newh = target_size.height; + neww = static_cast(target_size.width / hw_scale); + resize(img, img, Size(neww, newh), 0, 0, INTER_AREA); + left = static_cast((target_size.width - neww) * 0.5); + copyMakeBorder(img, img, 0, 0, left, target_size.width - neww - left, BORDER_CONSTANT, Scalar(0)); + } else { + newh = static_cast(target_size.height * hw_scale); + neww = target_size.width; + resize(img, img, Size(neww, newh), 0, 0, INTER_AREA); + top = static_cast((target_size.height - newh) * 0.5); + copyMakeBorder(img, img, top, target_size.height - newh - top, 0, 0, BORDER_CONSTANT, Scalar(0)); + } + } else { + resize(img, img, target_size, 0, 0, INTER_AREA); + } + vector letterbox_scale = {top, left, newh, neww}; + + return make_tuple(img, letterbox_scale); +} + +// Function to scale bounding boxes back to original image coordinates +vector unletterbox(const Mat& bbox, const Size& original_image_shape, const vector& letterbox_scale) { + vector ret(bbox.cols); + + int h = original_image_shape.height; + int w = original_image_shape.width; + double top = letterbox_scale[0]; + double left = letterbox_scale[1]; + double newh = letterbox_scale[2]; + double neww = letterbox_scale[3]; + + if (h == w) { + double ratio = static_cast(h) / newh; + for (int& val : ret) { + val = static_cast(val * ratio); + } + return ret; + } + + double ratioh = static_cast(h) / newh; + double ratiow = static_cast(w) / neww; + ret[0] = max(static_cast((bbox.at(0) - left) * ratiow), 0); + ret[1] = max(static_cast((bbox.at(1) - top) * ratioh), 0); + ret[2] = min(static_cast((bbox.at(2) - left) * ratiow), w); + ret[3] = min(static_cast((bbox.at(3) - top) * ratioh), h); + + return ret; +} + +// Function to visualize predictions on an image +Mat visualize(const Mat& preds, const Mat& result_image, const vector& letterbox_scale, double fps = 0.0) { + Mat visualized_image = result_image.clone(); + + // Draw FPS if provided + if (fps > 0.0) { + string fps_text = "FPS: " + to_string(fps); + putText(visualized_image, fps_text, Point(10, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2); + } + + // Draw bounding boxes and labels for each prediction + for (size_t i = 0; i < preds.rows; i++) { + Mat pred = preds.row(i); + Mat bbox = pred.colRange(0, 4); + double conf = pred.at(4); + int classid = static_cast(pred.at(5)); + + // Convert bbox coordinates back to original image space + vector unnormalized_bbox = unletterbox(bbox, visualized_image.size(), letterbox_scale); + + // Draw bounding box + rectangle(visualized_image, Point(unnormalized_bbox[0], unnormalized_bbox[1]), + Point(unnormalized_bbox[2], unnormalized_bbox[3]), Scalar(0, 255, 0), 2); + + // Draw label + stringstream label; + label << nanodetClassLabels[classid] << ": " << fixed << setprecision(2) << conf; + putText(visualized_image, label.str(), Point(unnormalized_bbox[0], unnormalized_bbox[1] - 10), + FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0), 2); + } + + return visualized_image; +} + +void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, bool vis) +{ + cvtColor(inputImage, inputImage, COLOR_BGR2RGB); + tuple> w = letterbox(inputImage); + Mat inputBlob = get<0>(w); + vector letterboxScale = get<1>(w); + + tm.start(); + Mat predictions = nanodet.infer(inputBlob); + tm.stop(); + cout << "Inference time: " << tm.getTimeMilli() << " ms\n"; + + Mat img = visualize(predictions, inputImage, letterboxScale, tm.getFPS()); + cvtColor(img, img, COLOR_BGR2RGB); + if (save) + { + static const string kOutputName = "result.jpg"; + imwrite(kOutputName, img); + } + if (vis) + { + static const string kWinName = "model"; + imshow(kWinName, img); + } +} + + +const String keys = + "{ help h | | Print help message. }" + "{ model m | object_detection_nanodet_2022nov.onnx | Usage: Path to the model, defaults to object_detection_nanodet_2022nov.onnx }" + "{ input i | | Path to the input image. Omit for using the default camera.}" + "{ confidence | 0.35 | Class confidence }" + "{ nms | 0.6 | Enter nms IOU threshold }" + "{ save s | false | Specify to save results. This flag is invalid when using the camera. }" + "{ vis v | false | Specify to open a window for result visualization. This flag is invalid when using the camera. }" + "{ backend bt | 0 | Choose one of computation backends: " + "0: (default) OpenCV implementation + CPU, " + "1: CUDA + GPU (CUDA), " + "2: CUDA + GPU (CUDA FP16), " + "3: TIM-VX + NPU, " + "4: CANN + NPU}"; + +int main(int argc, char** argv) +{ + CommandLineParser parser(argc, argv, keys); + + parser.about("Use this script to run Nanodet inference using OpenCV, a contribution by Sri Siddarth Chakaravarthy as part of GSOC_2022."); + if (parser.has("help")) + { + parser.printMessage(); + return 0; + } + + string model = parser.get("model"); + string inputPath = parser.get("input"); + float confThreshold = parser.get("confidence"); + float nmsThreshold = parser.get("nms"); + bool save = parser.get("save"); + bool vis = parser.get("vis"); + int backendTargetid = parser.get("backend"); + + if (model.empty()) + { + CV_Error(Error::StsError, "Model file " + model + " not found"); + } + + NanoDet nanodet(model, confThreshold, nmsThreshold, + backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second); + + TickMeter tm; + if (parser.has("input")) + { + Mat inputImage = imread(samples::findFile(inputPath)); + processImage(inputImage, nanodet, tm, save, vis); + } + else + { + VideoCapture cap; + cap.open(0); + if (!cap.isOpened()) + { + CV_Error(Error::StsError, "Cannot open video or file"); + } + + Mat frame; + while (waitKey(1) < 0) + { + cap >> frame; + if (frame.empty()) + { + cout << "Frame is empty" << endl; + waitKey(); + break; + } + processImage(frame, nanodet, tm, save, vis); + } + cap.release(); + } + + return 0; +} From e554eb2cb1b1fc57ba4724b21d492158890fd4a3 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 24 Feb 2024 03:15:17 -0500 Subject: [PATCH 2/9] Fix inference timer text, added timer reset. --- models/object_detection_nanodet/demo.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp index 9d0bf8b6..cb77c8f5 100644 --- a/models/object_detection_nanodet/demo.cpp +++ b/models/object_detection_nanodet/demo.cpp @@ -471,6 +471,7 @@ int main(int argc, char** argv) waitKey(); break; } + tm.reset(); processImage(frame, nanodet, tm, save, vis); } cap.release(); From b937304576a0196d323213f26ea3cb79313f199a Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 24 Feb 2024 03:50:56 -0500 Subject: [PATCH 3/9] Updated README.md, remove FPS text for single image processing --- models/object_detection_nanodet/README.md | 19 +++++++++++++++++++ models/object_detection_nanodet/demo.cpp | 21 ++++++++++++--------- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/models/object_detection_nanodet/README.md b/models/object_detection_nanodet/README.md index f0adcc9b..03b203f4 100644 --- a/models/object_detection_nanodet/README.md +++ b/models/object_detection_nanodet/README.md @@ -7,6 +7,8 @@ Note: ## Demo +### Python + Run the following command to try the demo: ```shell # detect on camera input @@ -17,6 +19,23 @@ python demo.py --input /path/to/image -v Note: - image result saved as "result.jpg" +### C++ + +Install latest OpenCV and CMake >= 3.24.0 to get started with: + +```shell +# A typical and default installation path of OpenCV is /usr/local +cmake -B build -D OPENCV_INSTALLATION_PATH=/path/to/opencv/installation . +cmake --build build + +# detect on camera input +./build/opencv_zoo_object_detection_nanodet +# detect on an image +./build/opencv_zoo_object_detection_nanodet -i=/path/to/image +# get help messages +./build/opencv_zoo_object_detection_nanodet -h +``` + ## Results diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp index cb77c8f5..14aeb359 100644 --- a/models/object_detection_nanodet/demo.cpp +++ b/models/object_detection_nanodet/demo.cpp @@ -344,11 +344,11 @@ vector unletterbox(const Mat& bbox, const Size& original_image_shape, const } // Function to visualize predictions on an image -Mat visualize(const Mat& preds, const Mat& result_image, const vector& letterbox_scale, double fps = 0.0) { +Mat visualize(const Mat& preds, const Mat& result_image, const vector& letterbox_scale, bool video, double fps = 0.0) { Mat visualized_image = result_image.clone(); // Draw FPS if provided - if (fps > 0.0) { + if (fps > 0.0 && video) { string fps_text = "FPS: " + to_string(fps); putText(visualized_image, fps_text, Point(10, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2); } @@ -377,7 +377,7 @@ Mat visualize(const Mat& preds, const Mat& result_image, const vector& l return visualized_image; } -void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, bool vis) +void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, bool vis, bool video) { cvtColor(inputImage, inputImage, COLOR_BGR2RGB); tuple> w = letterbox(inputImage); @@ -389,7 +389,7 @@ void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, b tm.stop(); cout << "Inference time: " << tm.getTimeMilli() << " ms\n"; - Mat img = visualize(predictions, inputImage, letterboxScale, tm.getFPS()); + Mat img = visualize(predictions, inputImage, letterboxScale, video, tm.getFPS()); cvtColor(img, img, COLOR_BGR2RGB); if (save) { @@ -406,12 +406,12 @@ void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, b const String keys = "{ help h | | Print help message. }" - "{ model m | object_detection_nanodet_2022nov.onnx | Usage: Path to the model, defaults to object_detection_nanodet_2022nov.onnx }" + "{ model m | object_detection_nanodet_2022nov.onnx | Usage: Path to the model, defaults to object_detection_nanodet_2022nov.onnx }" "{ input i | | Path to the input image. Omit for using the default camera.}" "{ confidence | 0.35 | Class confidence }" "{ nms | 0.6 | Enter nms IOU threshold }" - "{ save s | false | Specify to save results. This flag is invalid when using the camera. }" - "{ vis v | false | Specify to open a window for result visualization. This flag is invalid when using the camera. }" + "{ save s | true | Specify to save results. This flag is invalid when using the camera. }" + "{ vis v | true | Specify to open a window for result visualization. This flag is invalid when using the camera. }" "{ backend bt | 0 | Choose one of computation backends: " "0: (default) OpenCV implementation + CPU, " "1: CUDA + GPU (CUDA), " @@ -450,7 +450,9 @@ int main(int argc, char** argv) if (parser.has("input")) { Mat inputImage = imread(samples::findFile(inputPath)); - processImage(inputImage, nanodet, tm, save, vis); + static const bool kNotVideo = false; + processImage(inputImage, nanodet, tm, save, vis, kNotVideo); + waitKey(0); } else { @@ -472,7 +474,8 @@ int main(int argc, char** argv) break; } tm.reset(); - processImage(frame, nanodet, tm, save, vis); + static const bool kIsVideo = true; + processImage(frame, nanodet, tm, save, vis, kIsVideo); } cap.release(); } From b8e692fe05cf3161e0060aab77e677a5e571ac18 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 24 Feb 2024 04:07:08 -0500 Subject: [PATCH 4/9] Add matching saved image message --- models/object_detection_nanodet/demo.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp index 14aeb359..f93be1d8 100644 --- a/models/object_detection_nanodet/demo.cpp +++ b/models/object_detection_nanodet/demo.cpp @@ -395,6 +395,10 @@ void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, b { static const string kOutputName = "result.jpg"; imwrite(kOutputName, img); + if (!video) + { + cout << "Results saved to " + kOutputName << endl; + } } if (vis) { From d0c1147c5e086151916d34fa19fd2da21ec3fa13 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 24 Feb 2024 04:15:30 -0500 Subject: [PATCH 5/9] Removing inference time printout for video inputs --- models/object_detection_nanodet/demo.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp index f93be1d8..66f73e32 100644 --- a/models/object_detection_nanodet/demo.cpp +++ b/models/object_detection_nanodet/demo.cpp @@ -387,7 +387,10 @@ void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, b tm.start(); Mat predictions = nanodet.infer(inputBlob); tm.stop(); - cout << "Inference time: " << tm.getTimeMilli() << " ms\n"; + if (!video) + { + cout << "Inference time: " << tm.getTimeMilli() << " ms\n"; + } Mat img = visualize(predictions, inputImage, letterboxScale, video, tm.getFPS()); cvtColor(img, img, COLOR_BGR2RGB); From 959a1707356875d635526a1dbbb5cf32d7ac7d2f Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 24 Feb 2024 04:54:39 -0500 Subject: [PATCH 6/9] Update FPS text to 2 decimal places --- models/object_detection_nanodet/demo.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp index 66f73e32..acf5ed4b 100644 --- a/models/object_detection_nanodet/demo.cpp +++ b/models/object_detection_nanodet/demo.cpp @@ -349,8 +349,9 @@ Mat visualize(const Mat& preds, const Mat& result_image, const vector& l // Draw FPS if provided if (fps > 0.0 && video) { - string fps_text = "FPS: " + to_string(fps); - putText(visualized_image, fps_text, Point(10, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2); + std::ostringstream fps_stream; + fps_stream << "FPS: " << std::fixed << std::setprecision(2) << fps; + putText(visualized_image, fps_stream.str(), Point(10, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2); } // Draw bounding boxes and labels for each prediction From 1b021e05360be3e5938b67e3982c1e840b074d4e Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 24 Feb 2024 05:10:04 -0500 Subject: [PATCH 7/9] Update coding style for braces --- models/object_detection_nanodet/demo.cpp | 78 ++++++++++++++++-------- 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp index acf5ed4b..549ef4e2 100644 --- a/models/object_detection_nanodet/demo.cpp +++ b/models/object_detection_nanodet/demo.cpp @@ -8,7 +8,8 @@ using namespace std; using namespace cv; using namespace dnn; -const auto backendTargetPairs = vector>{ +const auto backendTargetPairs = vector> +{ {DNN_BACKEND_OPENCV, DNN_TARGET_CPU}, {DNN_BACKEND_CUDA, DNN_TARGET_CUDA}, {DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16}, @@ -16,7 +17,8 @@ const auto backendTargetPairs = vector>{ {DNN_BACKEND_CANN, DNN_TARGET_NPU} }; -const vector nanodetClassLabels = { +const vector nanodetClassLabels = +{ "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", @@ -30,9 +32,11 @@ const vector nanodetClassLabels = { "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", - "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; + "vase", "scissors", "teddy bear", "hair drier", "toothbrush" +}; -class NanoDet { +class NanoDet +{ public: NanoDet(const String& modelPath, const float probThresh = 0.35, const float iouThresh = 0.6, const Backend bId = DNN_BACKEND_DEFAULT, const Target tId = DNN_TARGET_CPU) : @@ -121,7 +125,8 @@ class NanoDet { Mat indices_float = indices.colRange(0, nms_pre); Mat selected_anchors, selected_bbox_pred, selected_cls_score; - for (int j = 0; j < indices_float.cols; ++j) { + for (int j = 0; j < indices_float.cols; ++j) + { selected_anchors.push_back(anchors.row(indices_float.at(j))); selected_bbox_pred.push_back(bbox_pred.row(indices_float.at(j))); selected_cls_score.push_back(cls_score.row(indices_float.at(j))); @@ -161,7 +166,8 @@ class NanoDet { Mat bboxes_wh(bboxes.clone()); bboxes_wh.colRange(2, 4) = bboxes_wh.colRange(2, 4) -= bboxes_wh.colRange(0, 2); vector boxesXYXY; - for (size_t i = 0; i < bboxes_wh.rows; i++) { + for (size_t i = 0; i < bboxes_wh.rows; i++) + { boxesXYXY.emplace_back(bboxes.at(i, 0), bboxes.at(i, 1), bboxes.at(i, 2), @@ -175,7 +181,8 @@ class NanoDet { Mat classIds(scores.rows, 1, CV_32FC1); Mat confidences(scores.rows, 1, CV_32FC1); - for (size_t i = 0; i < scores.rows; ++i) { + for (size_t i = 0; i < scores.rows; ++i) + { Point maxLoc; minMaxLoc(scores.row(i), nullptr, nullptr, nullptr, &maxLoc); classIds.at(i) = maxLoc.x; @@ -187,7 +194,8 @@ class NanoDet { Mat postProcess(const vector& preds) { vector cls_scores, bbox_preds; - for (size_t i = 0; i < preds.size(); i += 2) { + for (size_t i = 0; i < preds.size(); i += 2) + { cls_scores.push_back(preds[i]); bbox_preds.push_back(preds[i + 1]); } @@ -195,7 +203,8 @@ class NanoDet { vector bboxes_mlvl; vector scores_mlvl; - for (size_t i = 0; i < strides.size(); ++i) { + for (size_t i = 0; i < strides.size(); ++i) + { if (i >= cls_scores.size() || i >= bbox_preds.size()) continue; // Extract necessary data int stride = strides[i]; @@ -209,7 +218,8 @@ class NanoDet { bbox_pred = stride * bbox_pred; const int nms_pre = 1000; - if (nms_pre > 0 && cls_score.rows > nms_pre) { + if (nms_pre > 0 && cls_score.rows > nms_pre) + { preNMS(anchors, bbox_pred, cls_score, nms_pre); } @@ -231,14 +241,16 @@ class NanoDet { NMSBoxes(boxesXYXY, confidences, probThreshold, iouThreshold, indices); Mat det_bboxes, det_conf, det_classid; - for (int idx : indices) { + for (int idx : indices) + { det_bboxes.push_back(bboxes.row(idx)); det_conf.push_back(confidences.at(idx)); det_classid.push_back(classIds.at(idx)); } Mat result; - if (!det_bboxes.empty()) { + if (!det_bboxes.empty()) + { hconcat(det_bboxes, det_conf, result); hconcat(result, det_classid, result); } @@ -254,8 +266,10 @@ class NanoDet { vector anchors; - for (int y = 0; y < feat_h; ++y) { - for (int x = 0; x < feat_w; ++x) { + for (int y = 0; y < feat_h; ++y) + { + for (int x = 0; x < feat_w; ++x) + { float shift_x = x * stride; float shift_y = y * stride; float cx = shift_x + 0.5 * (stride - 1); @@ -286,27 +300,34 @@ class NanoDet { }; // Function to resize and pad an image and return both the image and scale information -tuple> letterbox(const Mat& sourceImage, const Size& target_size = Size(416, 416)) { +tuple> letterbox(const Mat& sourceImage, const Size& target_size = Size(416, 416)) +{ Mat img = sourceImage.clone(); double top = 0, left = 0, newh = target_size.height, neww = target_size.width; - if (img.rows != img.cols) { + if (img.rows != img.cols) + { double hw_scale = static_cast(img.rows) / img.cols; - if (hw_scale > 1) { + if (hw_scale > 1) + { newh = target_size.height; neww = static_cast(target_size.width / hw_scale); resize(img, img, Size(neww, newh), 0, 0, INTER_AREA); left = static_cast((target_size.width - neww) * 0.5); copyMakeBorder(img, img, 0, 0, left, target_size.width - neww - left, BORDER_CONSTANT, Scalar(0)); - } else { + } + else + { newh = static_cast(target_size.height * hw_scale); neww = target_size.width; resize(img, img, Size(neww, newh), 0, 0, INTER_AREA); top = static_cast((target_size.height - newh) * 0.5); copyMakeBorder(img, img, top, target_size.height - newh - top, 0, 0, BORDER_CONSTANT, Scalar(0)); } - } else { + } + else + { resize(img, img, target_size, 0, 0, INTER_AREA); } vector letterbox_scale = {top, left, newh, neww}; @@ -315,7 +336,8 @@ tuple> letterbox(const Mat& sourceImage, const Size& target_ } // Function to scale bounding boxes back to original image coordinates -vector unletterbox(const Mat& bbox, const Size& original_image_shape, const vector& letterbox_scale) { +vector unletterbox(const Mat& bbox, const Size& original_image_shape, const vector& letterbox_scale) +{ vector ret(bbox.cols); int h = original_image_shape.height; @@ -325,9 +347,11 @@ vector unletterbox(const Mat& bbox, const Size& original_image_shape, const double newh = letterbox_scale[2]; double neww = letterbox_scale[3]; - if (h == w) { + if (h == w) + { double ratio = static_cast(h) / newh; - for (int& val : ret) { + for (int& val : ret) + { val = static_cast(val * ratio); } return ret; @@ -344,18 +368,21 @@ vector unletterbox(const Mat& bbox, const Size& original_image_shape, const } // Function to visualize predictions on an image -Mat visualize(const Mat& preds, const Mat& result_image, const vector& letterbox_scale, bool video, double fps = 0.0) { +Mat visualize(const Mat& preds, const Mat& result_image, const vector& letterbox_scale, bool video, double fps = 0.0) +{ Mat visualized_image = result_image.clone(); // Draw FPS if provided - if (fps > 0.0 && video) { + if (fps > 0.0 && video) + { std::ostringstream fps_stream; fps_stream << "FPS: " << std::fixed << std::setprecision(2) << fps; putText(visualized_image, fps_stream.str(), Point(10, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2); } // Draw bounding boxes and labels for each prediction - for (size_t i = 0; i < preds.rows; i++) { + for (size_t i = 0; i < preds.rows; i++) + { Mat pred = preds.row(i); Mat bbox = pred.colRange(0, 4); double conf = pred.at(4); @@ -487,6 +514,5 @@ int main(int argc, char** argv) } cap.release(); } - return 0; } From ef0cf9b67db5eb0561bb3f4dc07485d3db02a64b Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Mon, 26 Feb 2024 03:00:58 -0500 Subject: [PATCH 8/9] Address PR comments. Adjusted to C++11 standard. --- models/object_detection_nanodet/demo.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp index 549ef4e2..0af8018a 100644 --- a/models/object_detection_nanodet/demo.cpp +++ b/models/object_detection_nanodet/demo.cpp @@ -112,7 +112,7 @@ class NanoDet multiply(input, repeat_project, input); reduce(input, input, 1, REDUCE_SUM, CV_32F); Mat projection = input.col(0).clone(); - return projection.reshape(0, (4, projection.total() / 4)); + return projection.reshape(0, projection.total() / 4); } void preNMS(Mat& anchors, Mat& bbox_pred, Mat& cls_score, const int nms_pre = 1000) @@ -235,7 +235,9 @@ class NanoDet vconcat(scores_mlvl, scores); vector boxesXYXY = bboxMatToRect2d(bboxes); - auto [classIds, confidences] = getClassIdAndConfidences(scores); + std::tuple classIdAndConfidences = getClassIdAndConfidences(scores); + Mat classIds = std::get<0>(classIdAndConfidences); + Mat confidences = std::get<1>(classIdAndConfidences); vector indices; NMSBoxes(boxesXYXY, confidences, probThreshold, iouThreshold, indices); From 9cafb71b08333d4c873a4ea9d03f13b5212bccc3 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Tue, 27 Feb 2024 04:00:15 -0500 Subject: [PATCH 9/9] Addressed PR comments. Added C++11 cmake configuration, extracted classID and confidence function, and use NMSBoxesBatched now. --- .../object_detection_nanodet/CMakeLists.txt | 3 ++ models/object_detection_nanodet/demo.cpp | 50 +++++++------------ 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/models/object_detection_nanodet/CMakeLists.txt b/models/object_detection_nanodet/CMakeLists.txt index 1141423a..cebba4d0 100644 --- a/models/object_detection_nanodet/CMakeLists.txt +++ b/models/object_detection_nanodet/CMakeLists.txt @@ -25,5 +25,8 @@ message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}") # Declare the executable target built from your sources add_executable(${project_name} ${SourceFile}) +# Set C++ compilation standard to C++11 +set(CMAKE_CXX_STANDARD 11) + # Link your application with OpenCV libraries target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS}) diff --git a/models/object_detection_nanodet/demo.cpp b/models/object_detection_nanodet/demo.cpp index 0af8018a..2e32276f 100644 --- a/models/object_detection_nanodet/demo.cpp +++ b/models/object_detection_nanodet/demo.cpp @@ -176,21 +176,6 @@ class NanoDet return boxesXYXY; } - std::tuple getClassIdAndConfidences(const Mat& scores) - { - Mat classIds(scores.rows, 1, CV_32FC1); - Mat confidences(scores.rows, 1, CV_32FC1); - - for (size_t i = 0; i < scores.rows; ++i) - { - Point maxLoc; - minMaxLoc(scores.row(i), nullptr, nullptr, nullptr, &maxLoc); - classIds.at(i) = maxLoc.x; - confidences.at(i) = scores.at(i, maxLoc.x); - } - return std::make_tuple(classIds, confidences); - } - Mat postProcess(const vector& preds) { vector cls_scores, bbox_preds; @@ -235,28 +220,31 @@ class NanoDet vconcat(scores_mlvl, scores); vector boxesXYXY = bboxMatToRect2d(bboxes); - std::tuple classIdAndConfidences = getClassIdAndConfidences(scores); - Mat classIds = std::get<0>(classIdAndConfidences); - Mat confidences = std::get<1>(classIdAndConfidences); + vector classIds; + vector confidences; + for (size_t i = 0; i < scores.rows; ++i) + { + Point maxLoc; + minMaxLoc(scores.row(i), nullptr, nullptr, nullptr, &maxLoc); + classIds.push_back(maxLoc.x); + confidences.push_back(scores.at(i, maxLoc.x)); + } vector indices; - NMSBoxes(boxesXYXY, confidences, probThreshold, iouThreshold, indices); - - Mat det_bboxes, det_conf, det_classid; - for (int idx : indices) + NMSBoxesBatched(boxesXYXY, confidences, classIds, probThreshold, iouThreshold, indices); + Mat result(int(indices.size()), 6, CV_32FC1); + int row = 0; + for (auto idx : indices) { - det_bboxes.push_back(bboxes.row(idx)); - det_conf.push_back(confidences.at(idx)); - det_classid.push_back(classIds.at(idx)); + bboxes.rowRange(idx, idx + 1).copyTo(result(Rect(0, row, 4, 1))); + result.at(row, 4) = confidences[idx]; + result.at(row, 5) = static_cast(classIds[idx]); + row++; } - - Mat result; - if (!det_bboxes.empty()) + if (indices.size() == 0) { - hconcat(det_bboxes, det_conf, result); - hconcat(result, det_classid, result); + return Mat(); } - return result; }