Revert ISP image processing + tinygrad bump (commaai#34020)
* Revert "Replace ThneedModel with TinygradModel (commaai#33532)"

This reverts commit da952e9.

* Revert "camerad: move E + D cams image pipelines to the IFE (commaai#33959)"

This reverts commit f2a1cce.
adeebshihadeh authored and ssysm committed Feb 13, 2025
1 parent 0ee0568 commit e083acc
Showing 27 changed files with 1,023 additions and 153 deletions.
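For orientation before the per-file hunks: this revert swaps the pickled tinygrad runner back out for the thneed-based ModelRunner. Below is a minimal sketch of the restored loading/execution path, condensed from the dmonitoringmodeld.py hunks in this diff; the messaging, camera, and OpenCL setup is omitted, and output_size is left as a parameter since its definition is not shown here.

# Sketch only: condensed from the restored dmonitoringmodeld.py in this commit.
# Assumes an openpilot checkout with models/dmonitoring_model.thneed compiled.
from pathlib import Path
import numpy as np

from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext

MODEL_PATHS = {
  ModelRunner.THNEED: Path(__file__).parent / 'models/dmonitoring_model.thneed',
  ModelRunner.ONNX: Path(__file__).parent / 'models/dmonitoring_model.onnx'}

def load_model(cl_ctx: CLContext, output_size: int, calib_len: int = 3):
  # the runner writes its results directly into this numpy buffer
  output = np.zeros(output_size, dtype=np.float32)
  calib = np.zeros(calib_len, dtype=np.float32)
  model = ModelRunner(MODEL_PATHS, output, Runtime.GPU, False, cl_ctx)
  model.addInput("input_img", None)   # image bytes are provided per frame
  model.addInput("calib", calib)
  return model, output, calib

# per frame (input_img is the cropped camera image as uint8):
#   model.setInputBuffer("input_img", input_img.view(np.float32))
#   model.execute()   # results are now in `output`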
2 changes: 1 addition & 1 deletion release/release_files.py
@@ -55,7 +55,7 @@
"tools/joystick/",
"tools/longitudinal_maneuvers/",

"tinygrad_repo/examples/openpilot/compile3.py",
"tinygrad_repo/openpilot/compile2.py",
"tinygrad_repo/extra/onnx.py",
"tinygrad_repo/extra/onnx_ops.py",
"tinygrad_repo/extra/thneed.py",
37 changes: 24 additions & 13 deletions selfdrive/modeld/SConscript
@@ -13,6 +13,15 @@ common_src = [
"transforms/transform.cc",
]

thneed_src_common = [
"thneed/thneed_common.cc",
"thneed/serialize.cc",
]

thneed_src_qcom = thneed_src_common + ["thneed/thneed_qcom2.cc"]
thneed_src_pc = thneed_src_common + ["thneed/thneed_pc.cc"]
thneed_src = thneed_src_qcom if arch == "larch64" else thneed_src_pc

# SNPE except on Mac and ARM Linux
snpe_lib = []
if arch != "Darwin" and arch != "aarch64":
@@ -50,18 +59,20 @@ fn = File("models/supercombo").abspath
cmd = f'python3 {Dir("#selfdrive/modeld").abspath}/get_model_metadata.py {fn}.onnx'
lenv.Command(fn + "_metadata.pkl", [fn + ".onnx"] + tinygrad_files, cmd)

# Compile tinygrad model
# TODO this is all super hacky
pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"'
if arch == 'larch64':
device_string = 'QCOM=1'
elif arch == 'Darwin' or arch == 'aarch64':
device_string = 'CLANG=1 IMAGE=0'
else:
device_string = 'GPU=1'
# Build thneed model
if arch == "larch64" or GetOption('pc_thneed'):
tinygrad_opts = []
if not GetOption('pc_thneed'):
# use FLOAT16 on device for speed + don't cache the CL kernels for space
tinygrad_opts += ["FLOAT16=1", "PYOPENCL_NO_CACHE=1"]
cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn}.onnx {fn}.thneed"

lenv.Command(fn + ".thneed", [fn + ".onnx"] + tinygrad_files, cmd)

for model_name in ['supercombo', 'dmonitoring_model']:
fn = File(f"models/{model_name}").abspath
cmd = f'{pythonpath_string} {device_string} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {fn}_tinygrad.pkl'
lenv.Command(fn + "_tinygrad.pkl", [fn + ".onnx"] + tinygrad_files, cmd)
fn_dm = File("models/dmonitoring_model").abspath
cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn_dm}.onnx {fn_dm}.thneed"
lenv.Command(fn_dm + ".thneed", [fn_dm + ".onnx"] + tinygrad_files, cmd)

thneed_lib = env.SharedLibrary('thneed', thneed_src, LIBS=[gpucommon, common, 'OpenCL', 'dl'])
thneedmodel_lib = env.Library('thneedmodel', ['runners/thneedmodel.cc'])
lenvCython.Program('runners/thneedmodel_pyx.so', 'runners/thneedmodel_pyx.pyx', LIBS=envCython["LIBS"]+[thneedmodel_lib, thneed_lib, gpucommon, common, 'dl', 'OpenCL'])
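For reference, the compile commands the restored SConscript assembles above reduce to roughly the following. This is an illustrative helper, not part of the build; repo_root, model_path, and on_device are stand-ins for the SCons variables.

# Illustrative only: mirrors the command construction in the SConscript hunk above.
def thneed_compile_cmd(repo_root: str, model_path: str, on_device: bool) -> str:
  opts = []
  if on_device:
    # FLOAT16 on device for speed, and skip the PyOpenCL kernel cache to save space
    opts += ["FLOAT16=1", "PYOPENCL_NO_CACHE=1"]
  return (f"cd {repo_root}/tinygrad_repo && " + ' '.join(opts) +
          f" python3 openpilot/compile2.py {model_path}.onnx {model_path}.thneed")

# e.g. thneed_compile_cmd("/data/openpilot", "/data/openpilot/selfdrive/modeld/models/supercombo", True)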
6 changes: 6 additions & 0 deletions selfdrive/modeld/dmonitoringmodeld
@@ -1,4 +1,10 @@
#!/usr/bin/env bash

DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
cd "$DIR/../../"

if [ -f "$DIR/libthneed.so" ]; then
export LD_PRELOAD="$DIR/libthneed.so"
fi

exec "$DIR/dmonitoringmodeld.py" "$@"
47 changes: 20 additions & 27 deletions selfdrive/modeld/dmonitoringmodeld.py
@@ -1,16 +1,8 @@
#!/usr/bin/env python3
import os
from openpilot.system.hardware import TICI
## TODO this is hack
if TICI:
GPU_BACKEND = 'QCOM'
else:
GPU_BACKEND = 'GPU'
os.environ[GPU_BACKEND] = '1'
import gc
import math
import time
import pickle
import ctypes
import numpy as np
from pathlib import Path
@@ -22,11 +14,9 @@
from openpilot.common.swaglog import cloudlog
from openpilot.common.params import Params
from openpilot.common.realtime import set_realtime_priority
from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext #, cl_from_visionbuf
from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext
from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid
#from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
from tinygrad.tensor import Tensor
#from tinygrad.dtype import dtypes

CALIB_LEN = 3
MODEL_WIDTH = 1440
@@ -36,7 +26,9 @@

PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld"
SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
MODEL_PKL_PATH = Path(__file__).parent / 'models/dmonitoring_model_tinygrad.pkl'
MODEL_PATHS = {
ModelRunner.THNEED: Path(__file__).parent / 'models/dmonitoring_model.thneed',
ModelRunner.ONNX: Path(__file__).parent / 'models/dmonitoring_model.onnx'}

class DriverStateResult(ctypes.Structure):
_fields_ = [
@@ -67,32 +59,33 @@ class DMonitoringModelResult(ctypes.Structure):
class ModelState:
inputs: dict[str, np.ndarray]
output: np.ndarray
model: ModelRunner

def __init__(self, cl_ctx):
assert ctypes.sizeof(DMonitoringModelResult) == OUTPUT_SIZE * ctypes.sizeof(ctypes.c_float)
self.numpy_inputs = {'calib': np.zeros((1, CALIB_LEN), dtype=np.float32),
'input_img': np.zeros((1,MODEL_HEIGHT * MODEL_WIDTH), dtype=np.uint8)}
self.img = None
self.output = np.zeros(OUTPUT_SIZE, dtype=np.float32)
self.inputs = {
'input_img': np.zeros(MODEL_HEIGHT * MODEL_WIDTH, dtype=np.uint8),
'calib': np.zeros(CALIB_LEN, dtype=np.float32)}


with open(MODEL_PKL_PATH, "rb") as f:
self.model_run = pickle.load(f)
self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, cl_ctx)
self.model.addInput("input_img", None)
self.model.addInput("calib", self.inputs['calib'])

def run(self, buf:VisionBuf, calib:np.ndarray) -> tuple[np.ndarray, float]:
self.numpy_inputs['calib'][0,:] = calib
self.inputs['calib'][:] = calib

t1 = time.perf_counter()
# TODO use opencl buffer directly to make tensor
v_offset = buf.height - MODEL_HEIGHT
h_offset = (buf.width - MODEL_WIDTH) // 2
buf_data = buf.data.reshape(-1, buf.stride)
self.numpy_inputs['input_img'][:] = buf_data[v_offset:v_offset+MODEL_HEIGHT, h_offset:h_offset+MODEL_WIDTH].reshape((1, -1))

tensor_inputs = {k: Tensor(v) for k,v in self.numpy_inputs.items()}
output = self.model_run(**tensor_inputs)['outputs'].numpy().flatten()
input_data = self.inputs['input_img'].reshape(MODEL_HEIGHT, MODEL_WIDTH)
input_data[:] = buf_data[v_offset:v_offset+MODEL_HEIGHT, h_offset:h_offset+MODEL_WIDTH]

self.model.setInputBuffer("input_img", self.inputs['input_img'].view(np.float32))
t1 = time.perf_counter()
self.model.execute()
t2 = time.perf_counter()
return output, t2 - t1
return self.output, t2 - t1


def fill_driver_state(msg, ds_result: DriverStateResult):
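The ModelState.run() hunk above crops the driver camera frame to the model's input window before handing it to the runner. A standalone numpy sketch of that crop follows; MODEL_WIDTH and the offset arithmetic come from the diff, while MODEL_HEIGHT and the example frame geometry are assumptions for illustration.

# Standalone sketch of the input crop done in ModelState.run() above.
import numpy as np

MODEL_WIDTH, MODEL_HEIGHT = 1440, 960   # MODEL_HEIGHT assumed; not shown in this diff

def crop_driver_frame(frame: np.ndarray, width: int, height: int, stride: int) -> np.ndarray:
  """Take the bottom-centered MODEL_HEIGHT x MODEL_WIDTH window of a strided camera buffer."""
  buf_data = frame.reshape(-1, stride)
  v_offset = height - MODEL_HEIGHT         # align to the bottom of the frame
  h_offset = (width - MODEL_WIDTH) // 2    # center horizontally
  return buf_data[v_offset:v_offset + MODEL_HEIGHT, h_offset:h_offset + MODEL_WIDTH]

# Example with a fake frame (dimensions are placeholders, not real camera geometry)
fake = np.zeros(1208 * 1928, dtype=np.uint8)
cropped = crop_driver_frame(fake, width=1928, height=1208, stride=1928)
assert cropped.shape == (MODEL_HEIGHT, MODEL_WIDTH)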
72 changes: 28 additions & 44 deletions selfdrive/modeld/modeld.py
@@ -1,12 +1,5 @@
#!/usr/bin/env python3
import os
from openpilot.system.hardware import TICI
## TODO this is hack
if TICI:
GPU_BACKEND = 'QCOM'
else:
GPU_BACKEND = 'GPU'
os.environ[GPU_BACKEND] = '1'
import time
import pickle
import numpy as np
@@ -25,24 +18,21 @@
from openpilot.common.transformations.model import get_warp_matrix
from openpilot.system import sentry
from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
from openpilot.selfdrive.modeld.parse_model_outputs import Parser
from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
from openpilot.selfdrive.modeld.constants import ModelConstants
from openpilot.selfdrive.modeld.models.commonmodel_pyx import ModelFrame, CLContext
from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address

from tinygrad.tensor import Tensor
from tinygrad.dtype import dtypes

PROCESS_NAME = "selfdrive.modeld.modeld"
SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')

MODEL_PATH = Path(__file__).parent / 'models/supercombo.onnx'
MODEL_PKL_PATH = Path(__file__).parent / 'models/supercombo_tinygrad.pkl'
MODEL_PATHS = {
ModelRunner.THNEED: Path(__file__).parent / 'models/supercombo.thneed',
ModelRunner.ONNX: Path(__file__).parent / 'models/supercombo.onnx'}

METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'

# TODO: should not hardcoded
IMG_INPUT_SHAPE = (1, 12, 128, 256)

class FrameMeta:
frame_id: int = 0
@@ -59,6 +49,7 @@ class ModelState:
inputs: dict[str, np.ndarray]
output: np.ndarray
prev_desire: np.ndarray # for tracking the rising edge of the pulse
model: ModelRunner

def __init__(self, context: CLContext):
self.frame = ModelFrame(context)
@@ -69,14 +60,13 @@ def __init__(self, context: CLContext):
self.prev_desired_curv_20hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.PREV_DESIRED_CURV_LEN), dtype=np.float32)

# img buffers are managed in openCL transform code
self.numpy_inputs = {
'desire': np.zeros((1, (ModelConstants.HISTORY_BUFFER_LEN+1), ModelConstants.DESIRE_LEN), dtype=np.float32),
'traffic_convention': np.zeros((1, ModelConstants.TRAFFIC_CONVENTION_LEN), dtype=np.float32),
'lateral_control_params': np.zeros((1, ModelConstants.LATERAL_CONTROL_PARAMS_LEN), dtype=np.float32),
'prev_desired_curv': np.zeros((1,(ModelConstants.HISTORY_BUFFER_LEN+1), ModelConstants.PREV_DESIRED_CURV_LEN), dtype=np.float32),
'features_buffer': np.zeros((1, ModelConstants.HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32),
self.inputs = {
'desire': np.zeros(ModelConstants.DESIRE_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
'traffic_convention': np.zeros(ModelConstants.TRAFFIC_CONVENTION_LEN, dtype=np.float32),
'lateral_control_params': np.zeros(ModelConstants.LATERAL_CONTROL_PARAMS_LEN, dtype=np.float32),
'prev_desired_curv': np.zeros(ModelConstants.PREV_DESIRED_CURV_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
'features_buffer': np.zeros(ModelConstants.HISTORY_BUFFER_LEN * ModelConstants.FEATURE_LEN, dtype=np.float32),
}
self.img_inputs = {} # type: ignore

with open(METADATA_PATH, 'rb') as f:
model_metadata = pickle.load(f)
@@ -86,8 +76,11 @@ def __init__(self, context: CLContext):
self.output = np.zeros(net_output_size, dtype=np.float32)
self.parser = Parser()

with open(MODEL_PKL_PATH, "rb") as f:
self.model_run = pickle.load(f)
self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, context)
self.model.addInput("input_imgs", None)
self.model.addInput("big_input_imgs", None)
for k,v in self.inputs.items():
self.model.addInput(k, v)

def slice_outputs(self, model_outputs: np.ndarray) -> dict[str, np.ndarray]:
parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in self.output_slices.items()}
@@ -104,27 +97,18 @@ def run(self, buf: VisionBuf, wbuf: VisionBuf, transform: np.ndarray, transform_

self.desire_20Hz[:-1] = self.desire_20Hz[1:]
self.desire_20Hz[-1] = new_desire
self.numpy_inputs['desire'][:] = self.desire_20Hz.reshape((1,25,4,-1)).max(axis=2)

self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
self.numpy_inputs['lateral_control_params'][:] = inputs['lateral_control_params']
input_imgs_cl = self.frame.prepare(buf, transform.flatten())
big_input_imgs_cl = self.wide_frame.prepare(wbuf, transform_wide.flatten())

if TICI:
# The imgs tensors are backed by opencl memory, only need init once
if 'input_imgs' not in self.img_inputs:
self.img_inputs['input_imgs'] = qcom_tensor_from_opencl_address(input_imgs_cl.mem_address, IMG_INPUT_SHAPE, dtype=dtypes.uint8)
self.img_inputs['big_input_imgs'] = qcom_tensor_from_opencl_address(big_input_imgs_cl.mem_address, IMG_INPUT_SHAPE, dtype=dtypes.uint8)
else:
self.img_inputs['input_imgs'] = Tensor(self.frame.buffer_from_cl(input_imgs_cl)).reshape(IMG_INPUT_SHAPE)
self.img_inputs['big_input_imgs'] = Tensor(self.wide_frame.buffer_from_cl(big_input_imgs_cl)).reshape(IMG_INPUT_SHAPE)
self.inputs['desire'][:] = self.desire_20Hz.reshape((25,4,-1)).max(axis=1).flatten()

self.inputs['traffic_convention'][:] = inputs['traffic_convention']
self.inputs['lateral_control_params'][:] = inputs['lateral_control_params']

self.model.setInputBuffer("input_imgs", self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs")))
self.model.setInputBuffer("big_input_imgs", self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs")))

tensor_inputs = {**self.img_inputs, **{k: Tensor(v) for k,v in self.numpy_inputs.items()}}
if prepare_only:
return None

self.output = self.model_run(**tensor_inputs)['outputs'].numpy().flatten()
self.model.execute()
outputs = self.parser.parse_outputs(self.slice_outputs(self.output))

self.full_features_20Hz[:-1] = self.full_features_20Hz[1:]
@@ -134,9 +118,9 @@ def run(self, buf: VisionBuf, wbuf: VisionBuf, transform: np.ndarray, transform_
self.prev_desired_curv_20hz[-1] = outputs['desired_curvature'][0, :]

idxs = np.arange(-4,-100,-4)[::-1]
self.numpy_inputs['features_buffer'][:] = self.full_features_20Hz[idxs]
self.inputs['features_buffer'][:] = self.full_features_20Hz[idxs].flatten()
# TODO model only uses last value now, once that changes we need to input strided action history buffer
self.numpy_inputs['prev_desired_curv'][-ModelConstants.PREV_DESIRED_CURV_LEN:] = 0. * self.prev_desired_curv_20hz[-4, :]
self.inputs['prev_desired_curv'][-ModelConstants.PREV_DESIRED_CURV_LEN:] = 0. * self.prev_desired_curv_20hz[-4, :]
return outputs
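Both versions of ModelState.run() above feed the network downsampled history: desire is max-pooled over groups of four 20 Hz frames (so a single-frame pulse still registers), and the feature buffer is strided to every fourth frame, oldest first. A small numpy sketch of that indexing; the buffer lengths follow the reshape/arange in the diff, while DESIRE_LEN and FEATURE_LEN are placeholder values.

import numpy as np

DESIRE_LEN, FEATURE_LEN = 8, 512          # placeholders for illustration
desire_20hz = np.zeros((100, DESIRE_LEN), dtype=np.float32)
features_20hz = np.zeros((100, FEATURE_LEN), dtype=np.float32)

# Max-pool desire over each group of 4 frames: 100 rows @ 20 Hz -> 25 rows @ 5 Hz.
desire_5hz = desire_20hz.reshape(25, 4, -1).max(axis=1)

# Take every 4th feature vector from the last ~5 s, oldest first: 24 rows @ 5 Hz.
idxs = np.arange(-4, -100, -4)[::-1]
features_input = features_20hz[idxs]

assert desire_5hz.shape == (25, DESIRE_LEN)
assert features_input.shape == (24, FEATURE_LEN)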


@@ -205,7 +189,7 @@ def main(demo=False):
cloudlog.info("modeld got CarParams: %s", CP.carName)

# TODO this needs more thought, use .2s extra for now to estimate other delays
steer_delay = .2
steer_delay = CP.steerActuatorDelay + .2

DH = DesireHelper()

29 changes: 14 additions & 15 deletions selfdrive/modeld/models/commonmodel.cc
@@ -8,7 +8,6 @@

ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
input_frames = std::make_unique<uint8_t[]>(buf_size);
input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));

q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_WIDTH * MODEL_HEIGHT, NULL, &err));
@@ -23,7 +22,7 @@ ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
}

cl_mem* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection) {
uint8_t* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection, cl_mem *output) {
transform_queue(&this->transform, q,
yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, projection);
@@ -32,19 +31,19 @@ cl_mem* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, in
CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr));
}
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl);

copy_queue(&loadyuv, q, img_buffer_20hz_cl, input_frames_cl, 0, 0, frame_size_bytes);
copy_queue(&loadyuv, q, last_img_cl, input_frames_cl, 0, frame_size_bytes, frame_size_bytes);

// NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
clFinish(q);
return &input_frames_cl;
}

uint8_t* ModelFrame::buffer_from_cl(cl_mem *in_frames) {
CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, MODEL_FRAME_SIZE * 2 * sizeof(uint8_t), &input_frames[0], 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
if (output == NULL) {
CL_CHECK(clEnqueueReadBuffer(q, img_buffer_20hz_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[0], 0, nullptr, nullptr));
CL_CHECK(clEnqueueReadBuffer(q, last_img_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
} else {
copy_queue(&loadyuv, q, img_buffer_20hz_cl, *output, 0, 0, frame_size_bytes);
copy_queue(&loadyuv, q, last_img_cl, *output, 0, frame_size_bytes, frame_size_bytes);

// NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
clFinish(q);
return NULL;
}
}

ModelFrame::~ModelFrame() {
7 changes: 3 additions & 4 deletions selfdrive/modeld/models/commonmodel.h
@@ -20,8 +20,7 @@ class ModelFrame {
public:
ModelFrame(cl_device_id device_id, cl_context context);
~ModelFrame();
cl_mem* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform);
uint8_t* buffer_from_cl(cl_mem *in_frames);
uint8_t* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform, cl_mem *output);

const int MODEL_WIDTH = 512;
const int MODEL_HEIGHT = 256;
@@ -33,7 +32,7 @@ class ModelFrame {
Transform transform;
LoadYUVState loadyuv;
cl_command_queue q;
cl_mem y_cl, u_cl, v_cl, img_buffer_20hz_cl, last_img_cl, input_frames_cl;
cl_mem y_cl, u_cl, v_cl, img_buffer_20hz_cl, last_img_cl;
cl_buffer_region region;
std::unique_ptr<uint8_t[]> input_frames;
};
};
3 changes: 1 addition & 2 deletions selfdrive/modeld/models/commonmodel.pxd
@@ -15,5 +15,4 @@ cdef extern from "selfdrive/modeld/models/commonmodel.h":
cppclass ModelFrame:
int buf_size
ModelFrame(cl_device_id, cl_context)
cl_mem * prepare(cl_mem, int, int, int, int, mat3)
unsigned char * buffer_from_cl(cl_mem*);
unsigned char * prepare(cl_mem, int, int, int, int, mat3, cl_mem*)
(remaining changed files not shown)
