diff --git a/README.zh-CN.md b/README.zh-CN.md
index ab182093133..650140552db 100644
--- a/README.zh-CN.md
+++ b/README.zh-CN.md
@@ -1,3 +1,9 @@
+#### Exporting models adapted for RKNPU
+
+For how to export RKNPU-adapted detection/segmentation models, please refer to [RKOPT_README.zh-CN.md](RKOPT_README.zh-CN.md). The optimization only takes effect at export time; for training, follow the original repository's instructions.
+
+---
+
diff --git a/RKOPT_README.md b/RKOPT_README.md
new file mode 100644
index 00000000000..60c7d0ff248
--- /dev/null
+++ b/RKOPT_README.md
@@ -0,0 +1,39 @@
+# RKNN optimization for model export
+
+## Source
+Based on https://github.com/ultralytics/ultralytics at commit c9be1f3cce89778f79fb462797b8ca0300e3813d.
+
+
+
+
+## What's different
+With the inference results unchanged, the following optimizations were applied:
+- Changed the output nodes and removed the post-processing from the model (the in-model post-processing block is unfriendly to quantization).
+- Removed the DFL structure at the end of the model (it slows down inference on the NPU).
+- Added a score-sum output branch to speed up the post-processing.
+
+All removed operations are performed on the CPU instead (the reference CPU post-processing can be found in **RKNN_Model_Zoo**); a minimal sketch of the CPU-side decode is shown below.
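+
+This sketch is purely illustrative: the array names, shapes and the helper itself are assumptions made here, and the working implementation lives in **RKNN_Model_Zoo**.
+
+```
+import numpy as np
+
+def decode_branch(box_dist, cls, cls_sum, stride, conf_thres=0.25, reg_max=16):
+    # box_dist: (1, 4*reg_max, H, W) DFL distribution logits (cv2 branch output)
+    # cls:      (1, nc, H, W)        per-class scores, already passed through sigmoid in the model
+    # cls_sum:  (1, 1, H, W)         clamped sum of class scores (the extra output branch)
+    _, _, h, w = cls.shape
+    keep = cls_sum[0, 0] > conf_thres                  # cheap filter: skip cells that cannot pass the threshold
+    ys, xs = np.nonzero(keep)
+
+    dist = box_dist[0].reshape(4, reg_max, h, w)[:, :, ys, xs]        # (4, reg_max, n)
+    dist = np.exp(dist - dist.max(axis=1, keepdims=True))
+    dist /= dist.sum(axis=1, keepdims=True)                           # softmax over the reg_max bins
+    ltrb = (dist * np.arange(reg_max).reshape(1, reg_max, 1)).sum(1)  # DFL expectation, (4, n)
+
+    cx, cy = xs + 0.5, ys + 0.5                                       # anchor centres in grid units
+    boxes = np.stack([(cx - ltrb[0]) * stride, (cy - ltrb[1]) * stride,
+                      (cx + ltrb[2]) * stride, (cy + ltrb[3]) * stride], axis=-1)  # (n, 4) xyxy
+    scores = cls[0, :, ys, xs].T                                      # (n, nc) class scores of kept cells
+    return boxes, scores
+```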
+
+
+
+
+## Export ONNX model
+
+After meeting the environment requirements specified in "./requirements.txt", execute the following commands to export the model (detection and segmentation models are supported):
+
+```
+# Adjust the model file path in "./ultralytics/cfg/default.yaml" (this repo sets it to yolov8m-seg.pt by default). If you trained your own model, point it to the corresponding path.
+# For example, fill in yolov8n.pt to export a detection model,
+# or yolov8n-seg.pt to export a segmentation model.
+
+export PYTHONPATH=./
+python ./ultralytics/engine/exporter.py
+
+# Upon completion, the ".onnx" model will be generated. If the original model is "yolov8n.pt", the generated model will be "yolov8n.onnx".
+```
+
+
+
+## Convert to RKNN model, Python demo, C demo
+
+Please refer to https://github.com/airockchip/rknn_model_zoo.
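+
+For reference, converting the exported ONNX file with RKNN-Toolkit2 roughly follows the flow sketched below; the target platform, normalization values and calibration dataset path are placeholders, and the model zoo contains complete, tested scripts:
+
+```
+from rknn.api import RKNN  # RKNN-Toolkit2
+
+rknn = RKNN(verbose=False)
+# Normalization must match the preprocessing used at training time; the values below are the common 0-255 -> 0-1 scaling.
+rknn.config(mean_values=[[0, 0, 0]], std_values=[[255, 255, 255]], target_platform='rk3588')
+rknn.load_onnx(model='yolov8n.onnx')
+rknn.build(do_quantization=True, dataset='./dataset.txt')  # dataset.txt lists calibration images
+rknn.export_rknn('yolov8n.rknn')
+rknn.release()
+```
+
+The resulting ".rknn" file can then be run with the Python and C demos in the model zoo.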
diff --git a/RKOPT_README.zh-CN.md b/RKOPT_README.zh-CN.md
new file mode 100644
index 00000000000..28fbcdc51e3
--- /dev/null
+++ b/RKOPT_README.zh-CN.md
@@ -0,0 +1,45 @@
+# Notes on exporting RKNPU-adapted models
+
+## Source
+
+This repository is modified and verified based on https://github.com/ultralytics/ultralytics at commit c9be1f3cce89778f79fb462797b8ca0300e3813d.
+
+
+
+## Model differences
+
+Under the constraints of not affecting the output results and not requiring retraining, the following changes were made:
+
+- Modified the output structure and removed the post-processing from the model (the post-processing block is unfriendly to quantization).
+
+- The DFL structure performs poorly on the NPU, so it is moved out of the model into the post-processing stage; in most cases this improves inference performance.
+
+- Added an output branch with the sum of class confidences, used to speed up threshold filtering in post-processing.
+
+All removed operations must be handled externally on the CPU (the corresponding post-processing code can be found in **RKNN_Model_Zoo**); a rough sketch of the score-sum filtering is shown below.
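+
+This is only an illustrative numpy sketch of how the extra score-sum output can reject most grid cells before any per-class work is done; the array names and shapes are assumptions, and the working post-processing lives in **RKNN_Model_Zoo**.
+
+```
+import numpy as np
+
+def filter_branch(cls, cls_sum, conf_thres=0.25):
+    # cls:     (1, nc, H, W) per-class scores (sigmoid already applied inside the exported model)
+    # cls_sum: (1, 1, H, W)  clamp(sum of class scores, 0, 1) from the extra output branch
+    keep = cls_sum[0, 0] > conf_thres      # one comparison per grid cell
+    ys, xs = np.nonzero(keep)
+    scores = cls[0, :, ys, xs]             # (nc, n) scores of the surviving cells only
+    class_id = scores.argmax(axis=0)
+    confidence = scores.max(axis=0)
+    return ys, xs, class_id, confidence
+```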
+
+
+
+## Export the ONNX model
+
+After meeting the environment requirements in ./requirements.txt, execute the following commands to export the model:
+
+```
+# Adjust the model file path in ./ultralytics/cfg/default.yaml (set to yolov8m-seg.pt by default in this repo). If you trained your own model, point it to the corresponding path. Detection and segmentation models are supported.
+# e.g. fill in yolov8n.pt to export a detection model,
+# or yolov8n-seg.pt to export a segmentation model.
+
+export PYTHONPATH=./
+python ./ultralytics/engine/exporter.py
+
+# When finished, the ONNX model is generated. If the original model is yolov8n.pt, the output is yolov8n.onnx.
+```
+
+
+
+## Convert to RKNN model, Python demo, C demo
+
+Please refer to https://github.com/airockchip/rknn_model_zoo.
+
diff --git a/ultralytics/cfg/default.yaml b/ultralytics/cfg/default.yaml
index 5babd254a3e..d435dbdaef1 100644
--- a/ultralytics/cfg/default.yaml
+++ b/ultralytics/cfg/default.yaml
@@ -5,7 +5,7 @@ task: detect # (str) YOLO task, i.e. detect, segment, classify, pose
mode: train # (str) YOLO mode, i.e. train, val, predict, export, track, benchmark
# Train settings -------------------------------------------------------------------------------------------------------
-model: # (str, optional) path to model file, i.e. yolov8n.pt, yolov8n.yaml
+model: yolov8m-seg.pt # (str, optional) path to model file, i.e. yolov8n.pt, yolov8n.yaml
data: # (str, optional) path to data file, i.e. coco128.yaml
epochs: 100 # (int) number of epochs to train for
patience: 50 # (int) epochs to wait for no observable improvement for early stopping of training
@@ -68,7 +68,7 @@ retina_masks: False # (bool) use high-resolution segmentation masks
boxes: True # (bool) Show boxes in segmentation predictions
# Export settings ------------------------------------------------------------------------------------------------------
-format: torchscript # (str) format to export to, choices at https://docs.ultralytics.com/modes/export/#export-formats
+format: rknn # (str) format to export to, choices at https://docs.ultralytics.com/modes/export/#export-formats
keras: False # (bool) use Keras
optimize: False # (bool) TorchScript: optimize for mobile
int8: False # (bool) CoreML/TF INT8 quantization
diff --git a/ultralytics/data/augment.py b/ultralytics/data/augment.py
index 12d09cff2cc..964c9f2f30a 100644
--- a/ultralytics/data/augment.py
+++ b/ultralytics/data/augment.py
@@ -369,7 +369,7 @@ def apply_bboxes(self, bboxes, M):
# Create new boxes
x = xy[:, [0, 2, 4, 6]]
y = xy[:, [1, 3, 5, 7]]
- return np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1)), dtype=bboxes.dtype).reshape(4, n).T
+ return np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
def apply_segments(self, segments, M):
"""
diff --git a/ultralytics/data/base.py b/ultralytics/data/base.py
index bfc3cc19e22..bb386302bbb 100644
--- a/ultralytics/data/base.py
+++ b/ultralytics/data/base.py
@@ -1,4 +1,5 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
+import cv2
import glob
import math
@@ -9,7 +10,6 @@
from pathlib import Path
from typing import Optional
-import cv2
import numpy as np
import psutil
from torch.utils.data import Dataset
diff --git a/ultralytics/engine/exporter.py b/ultralytics/engine/exporter.py
index 02cacf0dbd2..ab065c99310 100644
--- a/ultralytics/engine/exporter.py
+++ b/ultralytics/engine/exporter.py
@@ -57,6 +57,7 @@
from copy import deepcopy
from datetime import datetime
from pathlib import Path
+import cv2
import torch
@@ -88,8 +89,10 @@ def export_formats():
['TensorFlow Lite', 'tflite', '.tflite', True, False],
['TensorFlow Edge TPU', 'edgetpu', '_edgetpu.tflite', True, False],
['TensorFlow.js', 'tfjs', '_web_model', True, False],
- ['PaddlePaddle', 'paddle', '_paddle_model', True, True],
- ['ncnn', 'ncnn', '_ncnn_model', True, True], ]
+ ['PaddlePaddle', 'paddle', '_paddle_model', True, True],
+ ['ncnn', 'ncnn', '_ncnn_model', True, True],
+ ['RKNN', 'rknn', '_rknnopt.torchscript', True, False],
+ ]
return pandas.DataFrame(x, columns=['Format', 'Argument', 'Suffix', 'CPU', 'GPU'])
@@ -157,7 +160,8 @@ def __call__(self, model=None):
flags = [x == format for x in fmts]
if sum(flags) != 1:
raise ValueError(f"Invalid export format='{format}'. Valid formats are {fmts}")
- jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle, ncnn = flags # export booleans
+ jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle, ncnn, rknn = flags # export booleans
+
# Load PyTorch model
self.device = select_device('cpu' if self.args.device is None else self.args.device)
@@ -262,6 +266,8 @@ def __call__(self, model=None):
f[10], _ = self.export_paddle()
if ncnn: # ncnn
f[11], _ = self.export_ncnn()
+ if rknn:
+ f[12], _ = self.export_rknn()
# Finish
f = [str(x) for x in f if x] # filter out '' and None
@@ -297,6 +303,31 @@ def export_torchscript(self, prefix=colorstr('TorchScript:')):
ts.save(str(f), _extra_files=extra_files)
return f, None
+ @try_export
+ def export_rknn(self, prefix=colorstr('RKNN:')):
+ """YOLOv8 RKNN model export."""
+ LOGGER.info(f'\n{prefix} starting export with torch {torch.__version__}...')
+
+ # ts = torch.jit.trace(self.model, self.im, strict=False)
+ # f = str(self.file).replace(self.file.suffix, f'_rknnopt.torchscript')
+ # torch.jit.save(ts, str(f))
+
+        f = str(self.file).replace(self.file.suffix, '.onnx')
+        # NOTE: the opset is pinned to 12 in the export call below; self.args.opset is not used for RKNN export.
+ torch.onnx.export(
+ self.model,
+ self.im[0:1,:,:,:],
+ f,
+ verbose=False,
+ opset_version=12,
+ do_constant_folding=True, # WARNING: DNN inference with torch>=1.12 may require do_constant_folding=False
+ input_names=['images'])
+
+ LOGGER.info(f'\n{prefix} feed {f} to RKNN-Toolkit or RKNN-Toolkit2 to generate RKNN model.\n'
+                    'Refer to https://github.com/airockchip/rknn_model_zoo/tree/main/models/CV/object_detection/yolo')
+ return f, None
+
+
@try_export
def export_onnx(self, prefix=colorstr('ONNX:')):
"""YOLOv8 ONNX export."""
diff --git a/ultralytics/nn/autobackend.py b/ultralytics/nn/autobackend.py
index e1c7ea82268..62febbfded4 100644
--- a/ultralytics/nn/autobackend.py
+++ b/ultralytics/nn/autobackend.py
@@ -80,7 +80,7 @@ def __init__(self,
super().__init__()
w = str(weights[0] if isinstance(weights, list) else weights)
nn_module = isinstance(weights, torch.nn.Module)
- pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle, ncnn, triton = \
+        pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle, ncnn, rknn, triton = \
self._model_type(w)
fp16 &= pt or jit or onnx or xml or engine or nn_module or triton # FP16
nhwc = coreml or saved_model or pb or tflite or edgetpu # BHWC formats (vs torch BCWH)
@@ -385,6 +385,8 @@ def forward(self, im, augment=False, visualize=False):
mat_out = self.pyncnn.Mat()
ex.extract(output_name, mat_out)
y.append(np.array(mat_out)[None])
+        elif getattr(self, 'rknn', False):
+            raise NotImplementedError('for inference, please refer to https://github.com/airockchip/rknn_model_zoo/')
elif self.triton: # NVIDIA Triton Inference Server
y = self.model(im)
else: # TensorFlow (SavedModel, GraphDef, Lite, Edge TPU)
diff --git a/ultralytics/nn/modules.py b/ultralytics/nn/modules.py
new file mode 100644
index 00000000000..95bdec13ed3
--- /dev/null
+++ b/ultralytics/nn/modules.py
@@ -0,0 +1,597 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+"""
+Common modules
+"""
+
+import math
+
+import torch
+import torch.nn as nn
+
+from ultralytics.yolo.utils.tal import dist2bbox, make_anchors
+
+
+def autopad(k, p=None, d=1): # kernel, padding, dilation
+ """Pad to 'same' shape outputs."""
+ if d > 1:
+ k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size
+ if p is None:
+ p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad
+ return p
+
+
+class Conv(nn.Module):
+ """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""
+ default_act = nn.SiLU() # default activation
+
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
+ """Initialize Conv layer with given arguments including activation."""
+ super().__init__()
+ self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
+ self.bn = nn.BatchNorm2d(c2)
+ self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
+
+ def forward(self, x):
+ """Apply convolution, batch normalization and activation to input tensor."""
+ return self.act(self.bn(self.conv(x)))
+
+ def forward_fuse(self, x):
+ """Perform transposed convolution of 2D data."""
+ return self.act(self.conv(x))
+
+
+class DWConv(Conv):
+ """Depth-wise convolution."""
+
+ def __init__(self, c1, c2, k=1, s=1, d=1, act=True): # ch_in, ch_out, kernel, stride, dilation, activation
+ super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act)
+
+
+class DWConvTranspose2d(nn.ConvTranspose2d):
+ """Depth-wise transpose convolution."""
+
+ def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0): # ch_in, ch_out, kernel, stride, padding, padding_out
+ super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2))
+
+
+class ConvTranspose(nn.Module):
+ """Convolution transpose 2d layer."""
+ default_act = nn.SiLU() # default activation
+
+ def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True):
+ """Initialize ConvTranspose2d layer with batch normalization and activation function."""
+ super().__init__()
+ self.conv_transpose = nn.ConvTranspose2d(c1, c2, k, s, p, bias=not bn)
+ self.bn = nn.BatchNorm2d(c2) if bn else nn.Identity()
+ self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
+
+ def forward(self, x):
+ """Applies transposed convolutions, batch normalization and activation to input."""
+ return self.act(self.bn(self.conv_transpose(x)))
+
+ def forward_fuse(self, x):
+ """Applies activation and convolution transpose operation to input."""
+ return self.act(self.conv_transpose(x))
+
+
+class DFL(nn.Module):
+ """
+ Integral module of Distribution Focal Loss (DFL).
+ Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
+ """
+
+ def __init__(self, c1=16):
+ """Initialize a convolutional layer with a given number of input channels."""
+ super().__init__()
+ self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
+ x = torch.arange(c1, dtype=torch.float)
+ self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1))
+ self.c1 = c1
+
+ def forward(self, x):
+ """Applies a transformer layer on input tensor 'x' and returns a tensor."""
+ b, c, a = x.shape # batch, channels, anchors
+ return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
+ # return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a)
+
+
+class TransformerLayer(nn.Module):
+ """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""
+
+ def __init__(self, c, num_heads):
+ """Initializes a self-attention mechanism using linear transformations and multi-head attention."""
+ super().__init__()
+ self.q = nn.Linear(c, c, bias=False)
+ self.k = nn.Linear(c, c, bias=False)
+ self.v = nn.Linear(c, c, bias=False)
+ self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
+ self.fc1 = nn.Linear(c, c, bias=False)
+ self.fc2 = nn.Linear(c, c, bias=False)
+
+ def forward(self, x):
+ """Apply a transformer block to the input x and return the output."""
+ x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x
+ x = self.fc2(self.fc1(x)) + x
+ return x
+
+
+class TransformerBlock(nn.Module):
+ """Vision Transformer https://arxiv.org/abs/2010.11929."""
+
+ def __init__(self, c1, c2, num_heads, num_layers):
+ """Initialize a Transformer module with position embedding and specified number of heads and layers."""
+ super().__init__()
+ self.conv = None
+ if c1 != c2:
+ self.conv = Conv(c1, c2)
+ self.linear = nn.Linear(c2, c2) # learnable position embedding
+ self.tr = nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers)))
+ self.c2 = c2
+
+ def forward(self, x):
+ """Forward propagates the input through the bottleneck module."""
+ if self.conv is not None:
+ x = self.conv(x)
+ b, _, w, h = x.shape
+ p = x.flatten(2).permute(2, 0, 1)
+ return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, w, h)
+
+
+class Bottleneck(nn.Module):
+ """Standard bottleneck."""
+
+ def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): # ch_in, ch_out, shortcut, groups, kernels, expand
+ super().__init__()
+ c_ = int(c2 * e) # hidden channels
+ self.cv1 = Conv(c1, c_, k[0], 1)
+ self.cv2 = Conv(c_, c2, k[1], 1, g=g)
+ self.add = shortcut and c1 == c2
+
+ def forward(self, x):
+ """'forward()' applies the YOLOv5 FPN to input data."""
+ return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
+
+
+class BottleneckCSP(nn.Module):
+ """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks."""
+
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
+ super().__init__()
+ c_ = int(c2 * e) # hidden channels
+ self.cv1 = Conv(c1, c_, 1, 1)
+ self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
+ self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
+ self.cv4 = Conv(2 * c_, c2, 1, 1)
+ self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3)
+ self.act = nn.SiLU()
+ self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
+
+ def forward(self, x):
+ """Applies a CSP bottleneck with 3 convolutions."""
+ y1 = self.cv3(self.m(self.cv1(x)))
+ y2 = self.cv2(x)
+ return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1))))
+
+
+class C3(nn.Module):
+ """CSP Bottleneck with 3 convolutions."""
+
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
+ super().__init__()
+ c_ = int(c2 * e) # hidden channels
+ self.cv1 = Conv(c1, c_, 1, 1)
+ self.cv2 = Conv(c1, c_, 1, 1)
+ self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2)
+ self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n)))
+
+ def forward(self, x):
+ """Forward pass through the CSP bottleneck with 2 convolutions."""
+ return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
+
+
+class C2(nn.Module):
+ """CSP Bottleneck with 2 convolutions."""
+
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
+ super().__init__()
+ self.c = int(c2 * e) # hidden channels
+ self.cv1 = Conv(c1, 2 * self.c, 1, 1)
+ self.cv2 = Conv(2 * self.c, c2, 1) # optional act=FReLU(c2)
+ # self.attention = ChannelAttention(2 * self.c) # or SpatialAttention()
+ self.m = nn.Sequential(*(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)))
+
+ def forward(self, x):
+ """Forward pass through the CSP bottleneck with 2 convolutions."""
+ a, b = self.cv1(x).chunk(2, 1)
+ return self.cv2(torch.cat((self.m(a), b), 1))
+
+
+class C2f(nn.Module):
+ """CSP Bottleneck with 2 convolutions."""
+
+ def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
+ super().__init__()
+ self.c = int(c2 * e) # hidden channels
+ self.cv1 = Conv(c1, 2 * self.c, 1, 1)
+ self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2)
+ self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
+
+ def forward(self, x):
+ """Forward pass of a YOLOv5 CSPDarknet backbone layer."""
+ y = list(self.cv1(x).chunk(2, 1))
+ y.extend(m(y[-1]) for m in self.m)
+ return self.cv2(torch.cat(y, 1))
+
+ def forward_split(self, x):
+ """Applies spatial attention to module's input."""
+ y = list(self.cv1(x).split((self.c, self.c), 1))
+ y.extend(m(y[-1]) for m in self.m)
+ return self.cv2(torch.cat(y, 1))
+
+
+class ChannelAttention(nn.Module):
+ """Channel-attention module https://github.com/open-mmlab/mmdetection/tree/v3.0.0rc1/configs/rtmdet."""
+
+ def __init__(self, channels: int) -> None:
+ super().__init__()
+ self.pool = nn.AdaptiveAvgPool2d(1)
+ self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True)
+ self.act = nn.Sigmoid()
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return x * self.act(self.fc(self.pool(x)))
+
+
+class SpatialAttention(nn.Module):
+ """Spatial-attention module."""
+
+ def __init__(self, kernel_size=7):
+ """Initialize Spatial-attention module with kernel size argument."""
+ super().__init__()
+ assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
+ padding = 3 if kernel_size == 7 else 1
+ self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
+ self.act = nn.Sigmoid()
+
+ def forward(self, x):
+ """Apply channel and spatial attention on input for feature recalibration."""
+ return x * self.act(self.cv1(torch.cat([torch.mean(x, 1, keepdim=True), torch.max(x, 1, keepdim=True)[0]], 1)))
+
+
+class CBAM(nn.Module):
+ """Convolutional Block Attention Module."""
+
+ def __init__(self, c1, kernel_size=7): # ch_in, kernels
+ super().__init__()
+ self.channel_attention = ChannelAttention(c1)
+ self.spatial_attention = SpatialAttention(kernel_size)
+
+ def forward(self, x):
+ """Applies the forward pass through C1 module."""
+ return self.spatial_attention(self.channel_attention(x))
+
+
+class C1(nn.Module):
+ """CSP Bottleneck with 1 convolution."""
+
+ def __init__(self, c1, c2, n=1): # ch_in, ch_out, number
+ super().__init__()
+ self.cv1 = Conv(c1, c2, 1, 1)
+ self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n)))
+
+ def forward(self, x):
+ """Applies cross-convolutions to input in the C3 module."""
+ y = self.cv1(x)
+ return self.m(y) + y
+
+
+class C3x(C3):
+ """C3 module with cross-convolutions."""
+
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
+ """Initialize C3TR instance and set default parameters."""
+ super().__init__(c1, c2, n, shortcut, g, e)
+ self.c_ = int(c2 * e)
+ self.m = nn.Sequential(*(Bottleneck(self.c_, self.c_, shortcut, g, k=((1, 3), (3, 1)), e=1) for _ in range(n)))
+
+
+class C3TR(C3):
+ """C3 module with TransformerBlock()."""
+
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
+ """Initialize C3Ghost module with GhostBottleneck()."""
+ super().__init__(c1, c2, n, shortcut, g, e)
+ c_ = int(c2 * e)
+ self.m = TransformerBlock(c_, c_, 4, n)
+
+
+class C3Ghost(C3):
+ """C3 module with GhostBottleneck()."""
+
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
+ """Initialize 'SPP' module with various pooling sizes for spatial pyramid pooling."""
+ super().__init__(c1, c2, n, shortcut, g, e)
+ c_ = int(c2 * e) # hidden channels
+ self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n)))
+
+
+class SPP(nn.Module):
+ """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729."""
+
+ def __init__(self, c1, c2, k=(5, 9, 13)):
+ """Initialize the SPP layer with input/output channels and pooling kernel sizes."""
+ super().__init__()
+ c_ = c1 // 2 # hidden channels
+ self.cv1 = Conv(c1, c_, 1, 1)
+ self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
+ self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])
+
+ def forward(self, x):
+ """Forward pass of the SPP layer, performing spatial pyramid pooling."""
+ x = self.cv1(x)
+ return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
+
+
+class SPPF(nn.Module):
+ """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher."""
+
+ def __init__(self, c1, c2, k=5): # equivalent to SPP(k=(5, 9, 13))
+ super().__init__()
+ c_ = c1 // 2 # hidden channels
+ self.cv1 = Conv(c1, c_, 1, 1)
+ self.cv2 = Conv(c_ * 4, c2, 1, 1)
+ self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
+
+ def forward(self, x):
+ """Forward pass through Ghost Convolution block."""
+ x = self.cv1(x)
+ y1 = self.m(x)
+ y2 = self.m(y1)
+ return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1))
+
+
+class Focus(nn.Module):
+ """Focus wh information into c-space."""
+
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
+ super().__init__()
+ self.conv = Conv(c1 * 4, c2, k, s, p, g, act=act)
+ # self.contract = Contract(gain=2)
+
+ def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2)
+ return self.conv(torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1))
+ # return self.conv(self.contract(x))
+
+
+class GhostConv(nn.Module):
+ """Ghost Convolution https://github.com/huawei-noah/ghostnet."""
+
+ def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups
+ super().__init__()
+ c_ = c2 // 2 # hidden channels
+ self.cv1 = Conv(c1, c_, k, s, None, g, act=act)
+ self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act)
+
+ def forward(self, x):
+ """Forward propagation through a Ghost Bottleneck layer with skip connection."""
+ y = self.cv1(x)
+ return torch.cat((y, self.cv2(y)), 1)
+
+
+class GhostBottleneck(nn.Module):
+ """Ghost Bottleneck https://github.com/huawei-noah/ghostnet."""
+
+ def __init__(self, c1, c2, k=3, s=1): # ch_in, ch_out, kernel, stride
+ super().__init__()
+ c_ = c2 // 2
+ self.conv = nn.Sequential(
+ GhostConv(c1, c_, 1, 1), # pw
+ DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw
+ GhostConv(c_, c2, 1, 1, act=False)) # pw-linear
+ self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1,
+ act=False)) if s == 2 else nn.Identity()
+
+ def forward(self, x):
+ """Applies skip connection and concatenation to input tensor."""
+ return self.conv(x) + self.shortcut(x)
+
+
+class Concat(nn.Module):
+ """Concatenate a list of tensors along dimension."""
+
+ def __init__(self, dimension=1):
+ """Concatenates a list of tensors along a specified dimension."""
+ super().__init__()
+ self.d = dimension
+
+ def forward(self, x):
+ """Forward pass for the YOLOv8 mask Proto module."""
+ return torch.cat(x, self.d)
+
+
+class Proto(nn.Module):
+ """YOLOv8 mask Proto module for segmentation models."""
+
+ def __init__(self, c1, c_=256, c2=32): # ch_in, number of protos, number of masks
+ super().__init__()
+ self.cv1 = Conv(c1, c_, k=3)
+ self.upsample = nn.ConvTranspose2d(c_, c_, 2, 2, 0, bias=True) # nn.Upsample(scale_factor=2, mode='nearest')
+ self.cv2 = Conv(c_, c_, k=3)
+ self.cv3 = Conv(c_, c2)
+
+ def forward(self, x):
+ """Performs a forward pass through layers using an upsampled input image."""
+ return self.cv3(self.cv2(self.upsample(self.cv1(x))))
+
+
+class Ensemble(nn.ModuleList):
+ """Ensemble of models."""
+
+ def __init__(self):
+ """Initialize an ensemble of models."""
+ super().__init__()
+
+ def forward(self, x, augment=False, profile=False, visualize=False):
+ """Function generates the YOLOv5 network's final layer."""
+ y = [module(x, augment, profile, visualize)[0] for module in self]
+ # y = torch.stack(y).max(0)[0] # max ensemble
+ # y = torch.stack(y).mean(0) # mean ensemble
+ y = torch.cat(y, 2) # nms ensemble, y shape(B, HW, C)
+ return y, None # inference, train output
+
+
+# Model heads below ----------------------------------------------------------------------------------------------------
+
+
+class Detect(nn.Module):
+ """YOLOv8 Detect head for detection models."""
+ dynamic = False # force grid reconstruction
+ export = False # export mode
+ shape = None
+ anchors = torch.empty(0) # init
+ strides = torch.empty(0) # init
+
+ def __init__(self, nc=80, ch=()): # detection layer
+ super().__init__()
+ self.nc = nc # number of classes
+ self.nl = len(ch) # number of detection layers
+ self.reg_max = 16 # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
+ self.no = nc + self.reg_max * 4 # number of outputs per anchor
+ self.stride = torch.zeros(self.nl) # strides computed during build
+ c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], self.nc) # channels
+ self.cv2 = nn.ModuleList(
+ nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
+ self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
+ self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
+
+ def forward(self, x):
+ """Concatenates and returns predicted bounding boxes and class probabilities."""
+ shape = x[0].shape # BCHW
+ if self.export and self.format == 'rknn':
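+            # RKNN export: return raw per-branch outputs (box distribution, class scores, clamped score sum) for CPU-side decoding.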
+ y = []
+ for i in range(self.nl):
+ y.append(self.cv2[i](x[i]))
+ cls = torch.sigmoid(self.cv3[i](x[i]))
+                cls_sum = torch.clamp(cls.sum(1, keepdim=True), 0, 1)
+ y.append(cls)
+ y.append(cls_sum)
+ return y
+
+ for i in range(self.nl):
+ x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
+ if self.training:
+ return x
+ elif self.dynamic or self.shape != shape:
+ self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
+ self.shape = shape
+
+ x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
+ if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'): # avoid TF FlexSplitV ops
+ box = x_cat[:, :self.reg_max * 4]
+ cls = x_cat[:, self.reg_max * 4:]
+ else:
+ box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
+ dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
+ y = torch.cat((dbox, cls.sigmoid()), 1)
+ return y if self.export else (y, x)
+
+ def bias_init(self):
+ """Initialize Detect() biases, WARNING: requires stride availability."""
+ m = self # self.model[-1] # Detect() module
+ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
+ # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency
+ for a, b, s in zip(m.cv2, m.cv3, m.stride): # from
+ a[-1].bias.data[:] = 1.0 # box
+ b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img)
+
+
+class Segment(Detect):
+ """YOLOv8 Segment head for segmentation models."""
+
+ def __init__(self, nc=80, nm=32, npr=256, ch=()):
+ """Initialize the YOLO model attributes such as the number of masks, prototypes, and the convolution layers."""
+ super().__init__(nc, ch)
+ self.nm = nm # number of masks
+ self.npr = npr # number of protos
+ self.proto = Proto(ch[0], self.npr, self.nm) # protos
+ self.detect = Detect.forward
+
+ c4 = max(ch[0] // 4, self.nm)
+ self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nm, 1)) for x in ch)
+
+ def forward(self, x):
+ """Return model outputs and mask coefficients if training, otherwise return outputs and mask coefficients."""
+ p = self.proto(x[0]) # mask protos
+ bs = p.shape[0] # batch size
+
+        if self.export and self.format == 'rknn':
+ mc = [self.cv4[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)]
+ else:
+ mc = torch.cat([self.cv4[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)], 2) # mask coefficients
+ x = self.detect(self, x)
+ if self.training:
+ return x, mc, p
+
+ if self.export and self.format == 'rknn':
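+            # RKNN export: interleave each detection branch's outputs with its mask coefficients, then append the prototype masks last.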
+ outputs = []
+            out_per_detect_branch = len(x) // self.nl
+ for i in range(self.nl):
+ outputs.extend(x[i*out_per_detect_branch:(i+1)*out_per_detect_branch])
+ outputs.append(mc[i])
+ outputs.append(p)
+ return outputs
+
+ return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p))
+
+
+class Pose(Detect):
+ """YOLOv8 Pose head for keypoints models."""
+
+ def __init__(self, nc=80, kpt_shape=(17, 3), ch=()):
+ """Initialize YOLO network with default parameters and Convolutional Layers."""
+ super().__init__(nc, ch)
+ self.kpt_shape = kpt_shape # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
+ self.nk = kpt_shape[0] * kpt_shape[1] # number of keypoints total
+ self.detect = Detect.forward
+
+ c4 = max(ch[0] // 4, self.nk)
+ self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nk, 1)) for x in ch)
+
+ def forward(self, x):
+ """Perform forward pass through YOLO model and return predictions."""
+ bs = x[0].shape[0] # batch size
+ kpt = torch.cat([self.cv4[i](x[i]).view(bs, self.nk, -1) for i in range(self.nl)], -1) # (bs, 17*3, h*w)
+ x = self.detect(self, x)
+ if self.training:
+ return x, kpt
+ pred_kpt = self.kpts_decode(kpt)
+ return torch.cat([x, pred_kpt], 1) if self.export else (torch.cat([x[0], pred_kpt], 1), (x[1], kpt))
+
+ def kpts_decode(self, kpts):
+ """Decodes keypoints."""
+ ndim = self.kpt_shape[1]
+ y = kpts.clone()
+ if ndim == 3:
+ y[:, 2::3].sigmoid_() # inplace sigmoid
+ y[:, 0::ndim] = (y[:, 0::ndim] * 2.0 + (self.anchors[0] - 0.5)) * self.strides
+ y[:, 1::ndim] = (y[:, 1::ndim] * 2.0 + (self.anchors[1] - 0.5)) * self.strides
+ return y
+
+
+class Classify(nn.Module):
+ """YOLOv8 classification head, i.e. x(b,c1,20,20) to x(b,c2)."""
+
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1): # ch_in, ch_out, kernel, stride, padding, groups
+ super().__init__()
+ c_ = 1280 # efficientnet_b0 size
+ self.conv = Conv(c1, c_, k, s, autopad(k, p), g)
+ self.pool = nn.AdaptiveAvgPool2d(1) # to x(b,c_,1,1)
+ self.drop = nn.Dropout(p=0.0, inplace=True)
+ self.linear = nn.Linear(c_, c2) # to x(b,c2)
+
+ def forward(self, x):
+ """Performs a forward pass of the YOLO model on input image data."""
+ if isinstance(x, list):
+ x = torch.cat(x, 1)
+ x = self.linear(self.drop(self.pool(self.conv(x)).flatten(1)))
+ return x if self.training else x.softmax(1)
diff --git a/ultralytics/nn/modules/head.py b/ultralytics/nn/modules/head.py
index fffd102a3cf..5bf3a119fbc 100644
--- a/ultralytics/nn/modules/head.py
+++ b/ultralytics/nn/modules/head.py
@@ -43,6 +43,17 @@ def __init__(self, nc=80, ch=()): # detection layer
def forward(self, x):
"""Concatenates and returns predicted bounding boxes and class probabilities."""
shape = x[0].shape # BCHW
+
+ if self.export and self.format == 'rknn':
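+            # RKNN export: return raw per-branch outputs (box distribution, class scores, clamped score sum) for CPU-side decoding.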
+ y = []
+ for i in range(self.nl):
+ y.append(self.cv2[i](x[i]))
+ cls = torch.sigmoid(self.cv3[i](x[i]))
+ cls_sum = torch.clamp(cls.sum(1, keepdim=True), 0, 1)
+ y.append(cls)
+ y.append(cls_sum)
+ return y
+
for i in range(self.nl):
x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
if self.training:
@@ -100,10 +111,22 @@ def forward(self, x):
p = self.proto(x[0]) # mask protos
bs = p.shape[0] # batch size
- mc = torch.cat([self.cv4[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)], 2) # mask coefficients
+ if self.export and self.format == 'rknn':
+ mc = [self.cv4[i](x[i]) for i in range(self.nl)]
+ else:
+ mc = torch.cat([self.cv4[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)], 2) # mask coefficients
+
x = self.detect(self, x)
if self.training:
return x, mc, p
+ if self.export and self.format == 'rknn':
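+            # RKNN export: re-order so each detection branch is followed by its mask coefficients, with the prototype masks last.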
+            bo = len(x) // self.nl  # outputs per detection branch
+ relocated = []
+ for i in range(len(mc)):
+ relocated.extend(x[i*bo:(i+1)*bo])
+ relocated.extend([mc[i]])
+ relocated.extend([p])
+ return relocated
return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p))
diff --git a/ultralytics/utils/torch_utils.py b/ultralytics/utils/torch_utils.py
index c593ac27a41..41b3c780832 100644
--- a/ultralytics/utils/torch_utils.py
+++ b/ultralytics/utils/torch_utils.py
@@ -106,7 +106,7 @@ def select_device(device='', batch=0, newline=False, verbose=True):
s += f'MPS ({get_cpu_info()})\n'
arg = 'mps'
else: # revert to CPU
- s += f'CPU ({get_cpu_info()})\n'
+        s += 'CPU\n'
arg = 'cpu'
if verbose and RANK == -1: