dmonitoringmodeld: use cl transform (#34235)
* needs cleanup

* only if tici

* bump tinygrad

* check width

* base modelframe

* .

* need to be args

* more cleanup

* no _frame in base

* tici only

* its DrivingModelFrame

* .6 is fair

---------

Co-authored-by: Comma Device <[email protected]>
ZwX1616 and Comma Device authored Dec 14, 2024
1 parent b3ad7ef commit 684b0b9
Showing 9 changed files with 165 additions and 62 deletions.
9 changes: 8 additions & 1 deletion common/transformations/model.py
@@ -1,7 +1,7 @@
import numpy as np

from openpilot.common.transformations.orientation import rot_from_euler
from openpilot.common.transformations.camera import get_view_frame_from_calib_frame, view_frame_from_device_frame
from openpilot.common.transformations.camera import get_view_frame_from_calib_frame, view_frame_from_device_frame, _ar_ox_fisheye

# segnet
SEGNET_SIZE = (512, 384)
@@ -39,6 +39,13 @@
[0.0, sbigmodel_fl, 0.5 * (256 + MEDMODEL_CY)],
[0.0, 0.0, 1.0]])

DM_INPUT_SIZE = (1440, 960)
dmonitoringmodel_fl = _ar_ox_fisheye.focal_length
dmonitoringmodel_intrinsics = np.array([
[dmonitoringmodel_fl, 0.0, DM_INPUT_SIZE[0]/2],
[0.0, dmonitoringmodel_fl, DM_INPUT_SIZE[1]/2 - (_ar_ox_fisheye.height - DM_INPUT_SIZE[1])/2],
[0.0, 0.0, 1.0]])

bigmodel_frame_from_calib_frame = np.dot(bigmodel_intrinsics,
get_view_frame_from_calib_frame(0, 0, 0, 0))

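Note: a quick numeric sketch of what the new dmonitoringmodel_intrinsics evaluate to. The 1928x1208 resolution and 567.0 focal length used for _ar_ox_fisheye below are illustrative assumptions, not values read from this diff:

import numpy as np

# assumed _ar_ox_fisheye values (illustrative only)
cam_w, cam_h, fl = 1928, 1208, 567.0
DM_INPUT_SIZE = (1440, 960)

dm_intrinsics = np.array([
  [fl, 0.0, DM_INPUT_SIZE[0] / 2],                                   # cx = 720
  [0.0, fl, DM_INPUT_SIZE[1] / 2 - (cam_h - DM_INPUT_SIZE[1]) / 2],  # cy = 480 - 124 = 356
  [0.0, 0.0, 1.0]])
# a principal point of (720, 356) corresponds to the bottom-anchored, horizontally
# centered 1440x960 window of the driver camera frame -- the same region the old
# CPU crop in dmonitoringmodeld.py (removed below) selected
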
41 changes: 27 additions & 14 deletions selfdrive/modeld/dmonitoringmodeld.py
@@ -3,6 +3,9 @@
from openpilot.system.hardware import TICI
## TODO this is hack
if TICI:
from tinygrad.tensor import Tensor
from tinygrad.dtype import dtypes
from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
os.environ['QCOM'] = '1'
else:
from openpilot.selfdrive.modeld.runners.ort_helpers import make_onnx_cpu_runner
@@ -20,13 +23,13 @@
from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
from openpilot.common.swaglog import cloudlog
from openpilot.common.realtime import set_realtime_priority
from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext
from openpilot.common.transformations.model import dmonitoringmodel_intrinsics, DM_INPUT_SIZE
from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext, MonitoringModelFrame
from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid
from tinygrad.tensor import Tensor

MODEL_WIDTH, MODEL_HEIGHT = DM_INPUT_SIZE
CALIB_LEN = 3
MODEL_WIDTH = 1440
MODEL_HEIGHT = 960
FEATURE_LEN = 512
OUTPUT_SIZE = 84 + FEATURE_LEN

@@ -67,26 +70,31 @@ class ModelState:

def __init__(self, cl_ctx):
assert ctypes.sizeof(DMonitoringModelResult) == OUTPUT_SIZE * ctypes.sizeof(ctypes.c_float)
self.numpy_inputs = {'calib': np.zeros((1, CALIB_LEN), dtype=np.float32),
'input_img': np.zeros((1,MODEL_HEIGHT * MODEL_WIDTH), dtype=np.uint8)}
self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}

self.frame = MonitoringModelFrame(cl_ctx)
self.numpy_inputs = {
'calib': np.zeros((1, CALIB_LEN), dtype=np.float32),
}

if TICI:
self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
with open(MODEL_PKL_PATH, "rb") as f:
self.model_run = pickle.load(f)
else:
self.onnx_cpu_runner = make_onnx_cpu_runner(MODEL_PATH)

def run(self, buf:VisionBuf, calib:np.ndarray) -> tuple[np.ndarray, float]:
def run(self, buf:VisionBuf, calib:np.ndarray, transform:np.ndarray) -> tuple[np.ndarray, float]:
self.numpy_inputs['calib'][0,:] = calib

t1 = time.perf_counter()
# TODO use opencl buffer directly to make tensor
v_offset = buf.height - MODEL_HEIGHT
h_offset = (buf.width - MODEL_WIDTH) // 2
buf_data = buf.data.reshape(-1, buf.stride)
self.numpy_inputs['input_img'][:] = buf_data[v_offset:v_offset+MODEL_HEIGHT, h_offset:h_offset+MODEL_WIDTH].reshape((1, -1))

input_img_cl = self.frame.prepare(buf, transform.flatten())
if TICI:
# The imgs tensors are backed by opencl memory, only need init once
if 'input_img' not in self.tensor_inputs:
self.tensor_inputs['input_img'] = qcom_tensor_from_opencl_address(input_img_cl.mem_address, (1, MODEL_WIDTH*MODEL_HEIGHT), dtype=dtypes.uint8)
else:
self.numpy_inputs['input_img'] = self.frame.buffer_from_cl(input_img_cl).reshape((1, MODEL_WIDTH*MODEL_HEIGHT))

if TICI:
output = self.model_run(**self.tensor_inputs).numpy().flatten()
@@ -147,18 +155,23 @@ def main():
pm = PubMaster(["driverStateV2"])

calib = np.zeros(CALIB_LEN, dtype=np.float32)
model_transform = None

while True:
buf = vipc_client.recv()
if buf is None:
continue

if model_transform is None:
cam = _os_fisheye if buf.width == _os_fisheye.width else _ar_ox_fisheye
model_transform = np.linalg.inv(np.dot(dmonitoringmodel_intrinsics, np.linalg.inv(cam.intrinsics))).astype(np.float32)

sm.update(0)
if sm.updated["liveCalibration"]:
calib[:] = np.array(sm["liveCalibration"].rpyCalib)

t1 = time.perf_counter()
model_output, gpu_execution_time = model.run(buf, calib)
model_output, gpu_execution_time = model.run(buf, calib, model_transform)
t2 = time.perf_counter()

pm.send("driverStateV2", get_driverstate_packet(model_output, vipc_client.frame_id, vipc_client.timestamp_sof, t2 - t1, gpu_execution_time))
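
Note: a sanity-check sketch of the switch from the CPU crop to the CL warp. Because the model reuses the camera focal length, model_transform collapses to a pure translation whose offsets match the removed v_offset/h_offset crop. The 1928x1208 _ar_ox_fisheye frame size and 567.0 focal length below are assumptions:

import numpy as np

fl = 567.0                    # assumed shared focal length
cam_w, cam_h = 1928, 1208     # assumed _ar_ox_fisheye frame size
MODEL_WIDTH, MODEL_HEIGHT = 1440, 960

cam_K = np.array([[fl, 0.0, cam_w / 2],
                  [0.0, fl, cam_h / 2],
                  [0.0, 0.0, 1.0]])
dm_K = np.array([[fl, 0.0, MODEL_WIDTH / 2],
                 [0.0, fl, MODEL_HEIGHT / 2 - (cam_h - MODEL_HEIGHT) / 2],
                 [0.0, 0.0, 1.0]])

model_transform = np.linalg.inv(dm_K @ np.linalg.inv(cam_K))
# model pixel (u, v) samples camera pixel (u + 244, v + 248) -- the same window as
# the removed crop: h_offset = (1928 - 1440) // 2 = 244, v_offset = 1208 - 960 = 248
assert np.allclose(model_transform, [[1, 0, 244], [0, 1, 248], [0, 0, 1]])
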
10 changes: 5 additions & 5 deletions selfdrive/modeld/modeld.py
@@ -30,7 +30,7 @@
from openpilot.selfdrive.modeld.parse_model_outputs import Parser
from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
from openpilot.selfdrive.modeld.constants import ModelConstants
from openpilot.selfdrive.modeld.models.commonmodel_pyx import ModelFrame, CLContext
from openpilot.selfdrive.modeld.models.commonmodel_pyx import DrivingModelFrame, CLContext


PROCESS_NAME = "selfdrive.modeld.modeld"
@@ -53,15 +53,15 @@ def __init__(self, vipc=None):
self.frame_id, self.timestamp_sof, self.timestamp_eof = vipc.frame_id, vipc.timestamp_sof, vipc.timestamp_eof

class ModelState:
frame: ModelFrame
wide_frame: ModelFrame
frame: DrivingModelFrame
wide_frame: DrivingModelFrame
inputs: dict[str, np.ndarray]
output: np.ndarray
prev_desire: np.ndarray # for tracking the rising edge of the pulse

def __init__(self, context: CLContext):
self.frame = ModelFrame(context)
self.wide_frame = ModelFrame(context)
self.frame = DrivingModelFrame(context)
self.wide_frame = DrivingModelFrame(context)
self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
self.full_features_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32)
self.desire_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.DESIRE_LEN), dtype=np.float32)
50 changes: 26 additions & 24 deletions selfdrive/modeld/models/commonmodel.cc
@@ -1,32 +1,24 @@
#include "selfdrive/modeld/models/commonmodel.h"

#include <cassert>
#include <cmath>
#include <cstring>

#include "common/clutil.h"

ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
DrivingModelFrame::DrivingModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
input_frames = std::make_unique<uint8_t[]>(buf_size);
input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));

q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_WIDTH * MODEL_HEIGHT, NULL, &err));
u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err));
v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err));
img_buffer_20hz_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, 5*frame_size_bytes, NULL, &err));
region.origin = 4 * frame_size_bytes;
region.size = frame_size_bytes;
last_img_cl = CL_CHECK_ERR(clCreateSubBuffer(img_buffer_20hz_cl, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err));

transform_init(&transform, context, device_id);
loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
}

cl_mem* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection) {
transform_queue(&this->transform, q,
yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, projection);
cl_mem* DrivingModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);

for (int i = 0; i < 4; i++) {
CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr));
@@ -41,19 +33,29 @@ cl_mem* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, in
return &input_frames_cl;
}

uint8_t* ModelFrame::buffer_from_cl(cl_mem *in_frames) {
CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, MODEL_FRAME_SIZE * 2 * sizeof(uint8_t), &input_frames[0], 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
}

ModelFrame::~ModelFrame() {
transform_destroy(&transform);
DrivingModelFrame::~DrivingModelFrame() {
deinit_transform();
loadyuv_destroy(&loadyuv);
CL_CHECK(clReleaseMemObject(img_buffer_20hz_cl));
CL_CHECK(clReleaseMemObject(last_img_cl));
CL_CHECK(clReleaseMemObject(v_cl));
CL_CHECK(clReleaseMemObject(u_cl));
CL_CHECK(clReleaseMemObject(y_cl));
CL_CHECK(clReleaseCommandQueue(q));
}
}


MonitoringModelFrame::MonitoringModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
input_frames = std::make_unique<uint8_t[]>(buf_size);
input_frame_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));

init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
}

cl_mem* MonitoringModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);
clFinish(q);
return &y_cl;
}

MonitoringModelFrame::~MonitoringModelFrame() {
deinit_transform();
CL_CHECK(clReleaseCommandQueue(q));
}
73 changes: 65 additions & 8 deletions selfdrive/modeld/models/commonmodel.h
@@ -2,6 +2,7 @@

#include <cfloat>
#include <cstdlib>
#include <cassert>

#include <memory>

@@ -18,10 +19,54 @@

class ModelFrame {
public:
ModelFrame(cl_device_id device_id, cl_context context);
~ModelFrame();
cl_mem* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform);
uint8_t* buffer_from_cl(cl_mem *in_frames);
ModelFrame(cl_device_id device_id, cl_context context) {
q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
}
virtual ~ModelFrame() {}
virtual cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) { return NULL; }
uint8_t* buffer_from_cl(cl_mem *in_frames, int buffer_size) {
CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, buffer_size, input_frames.get(), 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
}

int MODEL_WIDTH;
int MODEL_HEIGHT;
int MODEL_FRAME_SIZE;
int buf_size;

protected:
cl_mem y_cl, u_cl, v_cl;
Transform transform;
cl_command_queue q;
std::unique_ptr<uint8_t[]> input_frames;

void init_transform(cl_device_id device_id, cl_context context, int model_width, int model_height) {
y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, model_width * model_height, NULL, &err));
u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err));
v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err));
transform_init(&transform, context, device_id);
}

void deinit_transform() {
transform_destroy(&transform);
CL_CHECK(clReleaseMemObject(v_cl));
CL_CHECK(clReleaseMemObject(u_cl));
CL_CHECK(clReleaseMemObject(y_cl));
}

void run_transform(cl_mem yuv_cl, int model_width, int model_height, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
transform_queue(&transform, q,
yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
y_cl, u_cl, v_cl, model_width, model_height, projection);
}
};

class DrivingModelFrame : public ModelFrame {
public:
DrivingModelFrame(cl_device_id device_id, cl_context context);
~DrivingModelFrame();
cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection);

const int MODEL_WIDTH = 512;
const int MODEL_HEIGHT = 256;
@@ -30,10 +75,22 @@ class ModelFrame {
const size_t frame_size_bytes = MODEL_FRAME_SIZE * sizeof(uint8_t);

private:
Transform transform;
LoadYUVState loadyuv;
cl_command_queue q;
cl_mem y_cl, u_cl, v_cl, img_buffer_20hz_cl, last_img_cl, input_frames_cl;
cl_mem img_buffer_20hz_cl, last_img_cl, input_frames_cl;
cl_buffer_region region;
std::unique_ptr<uint8_t[]> input_frames;
};

class MonitoringModelFrame : public ModelFrame {
public:
MonitoringModelFrame(cl_device_id device_id, cl_context context);
~MonitoringModelFrame();
cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection);

const int MODEL_WIDTH = 1440;
const int MODEL_HEIGHT = 960;
const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT;
const int buf_size = MODEL_FRAME_SIZE;

private:
cl_mem input_frame_cl;
};
11 changes: 9 additions & 2 deletions selfdrive/modeld/models/commonmodel.pxd
@@ -14,6 +14,13 @@ cdef extern from "common/clutil.h":
cdef extern from "selfdrive/modeld/models/commonmodel.h":
cppclass ModelFrame:
int buf_size
ModelFrame(cl_device_id, cl_context)
unsigned char * buffer_from_cl(cl_mem*, int);
cl_mem * prepare(cl_mem, int, int, int, int, mat3)
unsigned char * buffer_from_cl(cl_mem*);

cppclass DrivingModelFrame:
int buf_size
DrivingModelFrame(cl_device_id, cl_context)

cppclass MonitoringModelFrame:
int buf_size
MonitoringModelFrame(cl_device_id, cl_context)
29 changes: 23 additions & 6 deletions selfdrive/modeld/models/commonmodel_pyx.pyx
@@ -9,7 +9,7 @@ from libc.stdint cimport uintptr_t
from msgq.visionipc.visionipc cimport cl_mem
from msgq.visionipc.visionipc_pyx cimport VisionBuf, CLContext as BaseCLContext
from .commonmodel cimport CL_DEVICE_TYPE_DEFAULT, cl_get_device_id, cl_create_context
from .commonmodel cimport mat3, ModelFrame as cppModelFrame
from .commonmodel cimport mat3, ModelFrame as cppModelFrame, DrivingModelFrame as cppDrivingModelFrame, MonitoringModelFrame as cppMonitoringModelFrame


cdef class CLContext(BaseCLContext):
@@ -31,11 +31,10 @@ cdef class CLMem:
def cl_from_visionbuf(VisionBuf buf):
return CLMem.create(<void*>&buf.buf.buf_cl)


cdef class ModelFrame:
cdef cppModelFrame * frame

def __cinit__(self, CLContext context):
self.frame = new cppModelFrame(context.device_id, context.context)
cdef int buf_size

def __dealloc__(self):
del self.frame
@@ -49,5 +48,23 @@

def buffer_from_cl(self, CLMem in_frames):
cdef unsigned char * data2
data2 = self.frame.buffer_from_cl(in_frames.mem)
return np.asarray(<cnp.uint8_t[:self.frame.buf_size]> data2)
data2 = self.frame.buffer_from_cl(in_frames.mem, self.buf_size)
return np.asarray(<cnp.uint8_t[:self.buf_size]> data2)


cdef class DrivingModelFrame(ModelFrame):
cdef cppDrivingModelFrame * _frame

def __cinit__(self, CLContext context):
self._frame = new cppDrivingModelFrame(context.device_id, context.context)
self.frame = <cppModelFrame*>(self._frame)
self.buf_size = self._frame.buf_size

cdef class MonitoringModelFrame(ModelFrame):
cdef cppMonitoringModelFrame * _frame

def __cinit__(self, CLContext context):
self._frame = new cppMonitoringModelFrame(context.device_id, context.context)
self.frame = <cppModelFrame*>(self._frame)
self.buf_size = self._frame.buf_size
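
Note: a minimal usage sketch of the new wrapper classes, mirroring how modeld.py and dmonitoringmodeld.py use them in this commit; the commented calls assume a VisionBuf buf and a flattened 3x3 float32 transform are already available:

from openpilot.selfdrive.modeld.models.commonmodel_pyx import (
  CLContext, DrivingModelFrame, MonitoringModelFrame)

cl_ctx = CLContext()
driving_frame = DrivingModelFrame(cl_ctx)        # 512x256 YUV input with a rolling frame history
monitoring_frame = MonitoringModelFrame(cl_ctx)  # 1440x960 single-plane input

# prepare() runs the warp on the GPU and returns a CLMem handle;
# buffer_from_cl() copies buf_size bytes back to host memory, as the
# non-TICI (ONNX CPU) path in dmonitoringmodeld.py does.
# input_cl = monitoring_frame.prepare(buf, transform.flatten())
# img = monitoring_frame.buffer_from_cl(input_cl).reshape((1, -1))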

2 changes: 1 addition & 1 deletion system/hardware/tici/tests/test_power_draw.py
@@ -33,7 +33,7 @@ def name(self):
PROCS = [
Proc(['camerad'], 1.75, msgs=['roadCameraState', 'wideRoadCameraState', 'driverCameraState']),
Proc(['modeld'], 1.12, atol=0.2, msgs=['modelV2']),
Proc(['dmonitoringmodeld'], 0.5, msgs=['driverStateV2']),
Proc(['dmonitoringmodeld'], 0.6, msgs=['driverStateV2']),
Proc(['encoderd'], 0.23, msgs=[]),
]

2 changes: 1 addition & 1 deletion tinygrad_repo
