From 7d239092bcbda2c78a863658c40a70d76821c0b6 Mon Sep 17 00:00:00 2001 From: ktro2828 Date: Wed, 17 Apr 2024 20:54:52 +0900 Subject: [PATCH 1/3] tmp: update topk kernel Signed-off-by: ktro2828 --- .../preprocess/polyline_preprocess_kernel.cu | 51 +++++++++++-------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/lib/src/preprocess/polyline_preprocess_kernel.cu b/lib/src/preprocess/polyline_preprocess_kernel.cu index ec5f113..232f71d 100644 --- a/lib/src/preprocess/polyline_preprocess_kernel.cu +++ b/lib/src/preprocess/polyline_preprocess_kernel.cu @@ -98,28 +98,39 @@ __global__ void setPreviousPositionKernel( __global__ void extractTopkKernel( const int K, const int L, const int P, const int B, const float offsetX, const float offsetY, const int AgentDim, const float * targetState, const int PointDim, const float * inPolyline, - float * outPolyline) + int * topkIndex, float * outPolyline) { - // --- pseudo code --- - // mask = All(polyline != 0.0, dim=2) - // polylineCenter = polyline[:, :, 0:2].sum(dim=1) / clampMin(mask.sum(dim=1), min=1.0) - // offset = rotateAlongZ((offset_x, offset_y), target_state[:, 6]) - // targetOffsetPos = target_state[:, 0:2] + offset - // distances = (target_offset_pos - center) - // _, topkIdxs = distances.topk(k=K, descending=True) - // outPolyline = inPolyline[topkIdxs] - // ------------------- - - // int targetIdx = blockIdx.x; - - // const float targetX = targetState[targetIdx]; - // const float targetY = targetState[targetIdx + 1]; - // const float targetYaw = targetState[targetIdx + 6]; - // const float targetCos = cos(targetYaw); - // const float targetSin = sin(targetYaw); + int l = blockIdx.x * blockDim.x + threadIdx.x; + int b = blockIdx.y * blockDim.y + threadIdx.y; + if (l >= L || b >= B) { + return; + } - // const float transTargetX = targetCos * offsetX + targetSin * offsetY + targetX; - // const float transTargetY = -targetSin * offsetX + targetCos * offsetY + targetY; + // calculate polyline center + float sumX = 0.0f, sumY = 0.0f; + int validPoints = 0; + for (int p = 0; p < P; ++p) { + float x = inPolyline[(l * P + p) * PointDim]; + float y = inPolyline[(l * P + p) * PointDim + 1]; + if (x != 0.0f || y != 0.0f) { + sumX += x; + sumY += y; + ++validPoints; + } + } + float centerX = sumX / fmaxf(1.0f, validPoints); + float centerY = sumY / fmaxf(1.0f, validPoints); + + // apply offset to target state + float yaw = targetState[b * AgentDim + 6]; + float cosYaw = cosf(yaw); + float sinYaw = sinf(yaw); + float transOffsetX = offsetX * cosYaw - offsetY * sinYaw; + float transOffsetY = offsetX * sinYaw + offsetY * cosYaw; + float targetX = targetState[b * AgentDim] + transOffsetX; + float targetY = targetState[b * AgentDim + 1] + transOffsetY; + + float distance = sqrtf(powf(targetX - centerX, 2) + powf(targetY - centerY, 2)); } __global__ void calculatePolylineCenterKernel( From b732f33c087f0c36051ddf0b909f104eefd261f2 Mon Sep 17 00:00:00 2001 From: ktro2828 Date: Thu, 18 Apr 2024 17:54:48 +0900 Subject: [PATCH 2/3] feat: add support of extracting topk polylines Signed-off-by: ktro2828 --- CMakeLists.txt | 1 + include/mtr/cuda_helper.hpp | 1 + include/mtr/mtr.hpp | 5 +- .../preprocess/polyline_preprocess_kernel.cuh | 58 +++++--- .../preprocess/polyline_preprocess_kernel.cu | 135 +++++++++++++----- src/mtr.cpp | 6 +- 6 files changed, 144 insertions(+), 62 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 12b57a7..29ead5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,6 +63,7 @@ target_link_libraries(custom_kernel ${CUDNN_LIBRARIES}) target_include_directories(custom_kernel PUBLIC lib/include + ${PROJECT_SOURCE_DIR}/include ) # MTR diff --git a/include/mtr/cuda_helper.hpp b/include/mtr/cuda_helper.hpp index 888e2ac..0ecb2c5 100644 --- a/include/mtr/cuda_helper.hpp +++ b/include/mtr/cuda_helper.hpp @@ -21,6 +21,7 @@ #include +#include #include #include #include diff --git a/include/mtr/mtr.hpp b/include/mtr/mtr.hpp index a92ca70..2820e0c 100644 --- a/include/mtr/mtr.hpp +++ b/include/mtr/mtr.hpp @@ -52,7 +52,7 @@ struct MTRConfig const std::vector & target_labels = {"VEHICLE", "PEDESTRIAN", "CYCLIST"}, const size_t num_past = 10, const size_t num_mode = 6, const size_t num_future = 80, const size_t max_num_polyline = 768, const size_t max_num_point = 20, - const float point_break_distance = 1.0f, const std::array & offset_xy = {30.0f, 0.0f}, + const float point_break_distance = 1.0f, const std::string & intention_point_filepath = "./data/waymo64.csv", const size_t num_intention_point_cluster = 64) : target_labels(target_labels), @@ -63,7 +63,6 @@ struct MTRConfig max_num_polyline(max_num_polyline), max_num_point(max_num_point), point_break_distance(point_break_distance), - offset_xy(offset_xy), intention_point_filepath(intention_point_filepath), num_intention_point_cluster(num_intention_point_cluster) { @@ -77,7 +76,6 @@ struct MTRConfig size_t max_num_polyline; size_t max_num_point; float point_break_distance; - std::array offset_xy; std::string intention_point_filepath; size_t num_intention_point_cluster; }; // struct MTRConfig @@ -166,7 +164,6 @@ class TrtMTR cuda::unique_ptr d_target_state_{nullptr}; cuda::unique_ptr d_intention_points_{nullptr}; cuda::unique_ptr d_polyline_{nullptr}; - cuda::unique_ptr d_topk_index_{nullptr}; // preprocessed inputs cuda::unique_ptr d_in_trajectory_{nullptr}; diff --git a/lib/include/preprocess/polyline_preprocess_kernel.cuh b/lib/include/preprocess/polyline_preprocess_kernel.cuh index 8cfcd08..265faea 100644 --- a/lib/include/preprocess/polyline_preprocess_kernel.cuh +++ b/lib/include/preprocess/polyline_preprocess_kernel.cuh @@ -46,24 +46,44 @@ __global__ void setPreviousPositionKernel( const int B, const int K, const int P, const int D, const bool * mask, float * polyline); /** - * @brief Extract TopK elements. + * @brief Calculate distances from targets to polylines. * - * @param K The number of K. - * @param L The number of source polylines. + * @note `inPolyline` must have been transformed to target coordinates system. + * + * @param B The number of target agents. + * @param L The number of polylines. * @param P The number of points contained in each polyline. + * @param AgentDim The number agent state dimensions. + * @param targetState Source target agent state, in shape [B*AgentDim]. + * @param PointDim The number of point dimensions. + * @param inPolyline Source polyline, in shape [B*L*P*PointDim]. + * @param inPolylineMask Source polyline mask, in shape [B*K*P]. + * @param outDistance Output distance, in shape [B*L]. + */ +__global__ void calculateCenterDistanceKernel( + const int B, const int L, const int P, const int AgentDim, const float * targetState, + const int PointDim, const float * inPolyline, const bool * inPolylineMask, float * outDistance); + +/** + * @brief Extract K polylines with the smallest distances. + * + * @note This kernel is implemented using shared memory. + * + * @param K The number elements to be extracted (K <= L). * @param B The number of target agents. - * @param offsetX X offset position. - * @param offsetY Y offset position. - * @param AgentDim The number of agent state dimensions. - * @param targetState Source state of target agents, in shape [B*AgentDim]. - * @param PointDim The number of point state dimensions. - * @param inPolyline Source polylines, in shape [L*P*PointDim]. - * @param outPolyline Output polylines, in shape [K*P*PointDim]. + * @param L The number of polylines. + * @param P The number of points contained in each polyline. + * @param D The number of point dimensions. + * @param inPolyline Source polyline, in shape [B*L*P*D]. + * @param inPolylineMask Source polyline mask, in shape [B*L*P]. + * @param inDistance Source polyline distances, in shape [B*L]. + * @param outPolyline Output polyline, in shape [B*K*P*D]. + * @param outPolylineMask Output polyline mask, in shape [B*K*P]. */ -__global__ void extractTopkKernel( - const int K, const int L, const int P, const int B, const float offsetX, const float offsetY, - const int AgentDim, const float * targetState, const int PointDim, const float * inPolyline, - float * outPolyline); +__global__ void extractTopKPolylineKernel( + const int K, const int B, const int L, const int P, const int D, const float * inPolyline, + const bool * inPolylineMask, const float * inDistance, float * outPolyline, + bool * outPolylineMask); /** * @brief Calculate the magnitudes of polylines. @@ -82,7 +102,7 @@ __global__ void calculatePolylineCenterKernel( /** * @brief In cases of the number of batch polylines (L) is greater than K, - * extacts the topK elements. + * extracts the topK elements. * * @param L The number of source polylines. * @param K The number of polylines expected as the model input. @@ -92,9 +112,6 @@ __global__ void calculatePolylineCenterKernel( * @param in_polyline Source polylines, in shape [L*P*PointDim]. * @param B The number of target agents. * @param target_state Target agent state at the latest timestamp, in shape [B, AgentDim]. - * @param offset_x The x offset. - * @param offset_y The y offset. - * @param topk_index A container to store topK indices, in shape [K]. * @param out_polyline Output polylines, in shape [B*K*P*(PointDim+2)]. * @param out_polyline_mask Output polyline masks, in shape [B*K*P]. * @param out_polyline_center Output magnitudes of each polyline with respect to target coords, @@ -104,9 +121,8 @@ __global__ void calculatePolylineCenterKernel( */ cudaError_t polylinePreprocessWithTopkLauncher( const int L, const int K, const int P, const int PointDim, const float * in_polyline, const int B, - const int AgentDim, const float * target_state, const float offsetX, const float offsetY, - int * topk_index, float * out_polyline, bool * out_polyline_mask, float * out_polyline_center, - cudaStream_t stream); + const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask, + float * out_polyline_center, cudaStream_t stream); /** * @brief Do preprocess for polyline if the number of batched polylines is K. diff --git a/lib/src/preprocess/polyline_preprocess_kernel.cu b/lib/src/preprocess/polyline_preprocess_kernel.cu index 232f71d..c66e5de 100644 --- a/lib/src/preprocess/polyline_preprocess_kernel.cu +++ b/lib/src/preprocess/polyline_preprocess_kernel.cu @@ -12,8 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "mtr/cuda_helper.hpp" #include "preprocess/polyline_preprocess_kernel.cuh" +#include + #include __global__ void transformPolylineKernel( @@ -62,11 +65,13 @@ __global__ void transformPolylineKernel( out_polyline[out_idx + 5] = trans_dz; out_polyline[out_idx + 6] = type_id; - const int out_mask_idx = b * K * P + k * P + p; bool is_valid = false; - for (size_t i = 0; i < 6; ++i) { - is_valid += out_polyline[out_idx + i] != 0.0f; + for (size_t i = 0; i < PointDim - 1; ++i) { + if (out_polyline[out_idx + i] != 0.0f) { + is_valid = true; + } } + int out_mask_idx = b * K * P + k * P + p; out_polyline_mask[out_mask_idx] = is_valid; } @@ -95,42 +100,74 @@ __global__ void setPreviousPositionKernel( } } -__global__ void extractTopkKernel( - const int K, const int L, const int P, const int B, const float offsetX, const float offsetY, - const int AgentDim, const float * targetState, const int PointDim, const float * inPolyline, - int * topkIndex, float * outPolyline) +__global__ void calculateCenterDistanceKernel( + const int B, const int L, const int P, const int AgentDim, const float * targetState, + const int PointDim, const float * inPolyline, const bool * inPolylineMask, float * outDistance) { - int l = blockIdx.x * blockDim.x + threadIdx.x; - int b = blockIdx.y * blockDim.y + threadIdx.y; - if (l >= L || b >= B) { + int b = blockIdx.x * blockDim.x + threadIdx.x; + int l = blockIdx.y * blockDim.y + threadIdx.y; + if (b >= B || l >= L) { return; } // calculate polyline center float sumX = 0.0f, sumY = 0.0f; - int validPoints = 0; + int numValid = 0; for (int p = 0; p < P; ++p) { - float x = inPolyline[(l * P + p) * PointDim]; - float y = inPolyline[(l * P + p) * PointDim + 1]; - if (x != 0.0f || y != 0.0f) { + int idx = b * L * P + l * P + p; + float x = inPolyline[idx * PointDim]; + float y = inPolyline[idx * PointDim + 1]; + if (inPolylineMask[idx]) { sumX += x; sumY += y; - ++validPoints; + ++numValid; + } + } + float centerX = sumX / fmaxf(1.0f, numValid); + float centerY = sumY / fmaxf(1.0f, numValid); + + outDistance[b * L + l] = sqrtf(powf(centerX, 2) + powf(centerY, 2)); +} + +__global__ void extractTopKPolylineKernel( + const int K, const int B, const int L, const int P, const int D, const float * inPolyline, + const bool * inPolylineMask, const float * inDistance, float * outPolyline, + bool * outPolylineMask) +{ + int b = blockIdx.x; // Batch index + extern __shared__ float distances[]; + + // Load distances into shared memory + int tid = threadIdx.x; + if (tid < L) { + distances[tid] = inDistance[b * L + tid]; + } + __syncthreads(); + + // Simple selection of the smallest K distances + // (this part should be replaced with a more efficient sorting/selecting algorithm) + for (int k = 0; k < K; k++) { + float minDistance = FLT_MAX; + int minIndex = -1; + + for (int l = 0; l < L; l++) { + if (distances[l] < minDistance) { + minDistance = distances[l]; + minIndex = l; + } + } + + if (tid == k) { // this thread will handle copying the k-th smallest polyline + for (int p = 0; p < P; p++) { + for (int d = 0; d < D; d++) { + outPolyline[b * K * P * D + k * P * D + p * D + d] = + inPolyline[b * L * P * D + minIndex * P * D + p * D + d]; + } + outPolylineMask[b * K * P + k * P + p] = inPolylineMask[b * L * P + minIndex * P + p]; + } } + distances[minIndex] = FLT_MAX; // exclude this index from future consideration } - float centerX = sumX / fmaxf(1.0f, validPoints); - float centerY = sumY / fmaxf(1.0f, validPoints); - - // apply offset to target state - float yaw = targetState[b * AgentDim + 6]; - float cosYaw = cosf(yaw); - float sinYaw = sinf(yaw); - float transOffsetX = offsetX * cosYaw - offsetY * sinYaw; - float transOffsetY = offsetX * sinYaw + offsetY * cosYaw; - float targetX = targetState[b * AgentDim] + transOffsetX; - float targetY = targetState[b * AgentDim + 1] + transOffsetY; - - float distance = sqrtf(powf(targetX - centerX, 2) + powf(targetY - centerY, 2)); } __global__ void calculatePolylineCenterKernel( @@ -175,15 +212,47 @@ __global__ void calculatePolylineCenterKernel( cudaError_t polylinePreprocessWithTopkLauncher( const int L, const int K, const int P, const int PointDim, const float * in_polyline, const int B, - const int AgentDim, const float * target_state, const float offset_x, const float offset_y, - int * topk_index, float * out_polyline, bool * out_polyline_mask, float * out_polyline_center, - cudaStream_t stream) + const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask, + float * out_polyline_center, cudaStream_t stream) { if (L < K) { std::cerr << "L must be greater than K, but got L: " << L << ", K: " << K << std::endl; return cudaError_t::cudaErrorInvalidValue; } + float *tmpPolyline, *tmpDistance; + bool * tmpPolylineMask; + CHECK_CUDA_ERROR(cudaMallocAsync(&tmpPolyline, sizeof(float) * B * L * P * PointDim, stream)); + CHECK_CUDA_ERROR(cudaMallocAsync(&tmpPolylineMask, sizeof(bool) * B * L * P, stream)); + CHECK_CUDA_ERROR(cudaMallocAsync(&tmpDistance, sizeof(float) * B * L, stream)); + + // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess` + constexpr int threadsPerBlock = 256; + const dim3 block3dl(B, L / threadsPerBlock, P); + transformPolylineKernel<<>>( + L, P, PointDim, in_polyline, B, AgentDim, target_state, tmpPolyline, tmpPolylineMask); + + const dim3 block2dl(B, L / threadsPerBlock); + calculateCenterDistanceKernel<<>>( + B, L, P, AgentDim, target_state, PointDim, tmpPolyline, tmpPolylineMask, tmpDistance); + + const size_t sharedMemSize = sizeof(float) * K; + extractTopKPolylineKernel<<>>( + K, B, L, P, PointDim, tmpPolyline, tmpPolylineMask, tmpDistance, out_polyline, + out_polyline_mask); + + const dim3 block3dk(B, K / threadsPerBlock, P); + setPreviousPositionKernel<<>>( + B, K, P, PointDim, out_polyline_mask, out_polyline); + + const dim3 block2dk(B, K / threadsPerBlock); + calculatePolylineCenterKernel<<>>( + B, K, P, PointDim, out_polyline, out_polyline_mask, out_polyline_center); + + CHECK_CUDA_ERROR(cudaFree(tmpPolyline)); + CHECK_CUDA_ERROR(cudaFree(tmpPolylineMask)); + CHECK_CUDA_ERROR(cudaFree(tmpDistance)); + return cudaGetLastError(); } @@ -194,7 +263,7 @@ cudaError_t polylinePreprocessLauncher( { // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess` constexpr int threadsPerBlock = 256; - const dim3 block3d(B, K / threadsPerBlock, P); + const dim3 block3d(B, (K - 1) / threadsPerBlock, P); transformPolylineKernel<<>>( K, P, PointDim, in_polyline, B, AgentDim, target_state, out_polyline, out_polyline_mask); @@ -202,7 +271,7 @@ cudaError_t polylinePreprocessLauncher( setPreviousPositionKernel<<>>( B, K, P, PointDim, out_polyline_mask, out_polyline); - const dim3 block2d(B, K / threadsPerBlock); + const dim3 block2d(B, (K - 1) / threadsPerBlock); calculatePolylineCenterKernel<<>>( B, K, P, PointDim, out_polyline, out_polyline_mask, out_polyline_center); diff --git a/src/mtr.cpp b/src/mtr.cpp index f4f638a..8e83254 100644 --- a/src/mtr.cpp +++ b/src/mtr.cpp @@ -83,7 +83,6 @@ void TrtMTR::initCudaPtr(AgentData & agent_data, PolylineData & polyline_data) cuda::make_unique(agent_data.TargetNum * config_.num_intention_point_cluster * 2); d_polyline_ = cuda::make_unique( polyline_data.PolylineNum * polyline_data.PointNum * polyline_data.StateDim); - d_topk_index_ = cuda::make_unique(config_.max_num_polyline); // preprocessed input const size_t inDim = agent_data.StateDim + agent_data.ClassNum + agent_data.TimeLength + @@ -168,13 +167,12 @@ bool TrtMTR::preProcess(AgentData & agent_data, PolylineData & polyline_data) d_timestamps_.get(), d_trajectory_.get(), d_in_trajectory_.get(), d_in_trajectory_mask_.get(), d_in_last_pos_.get(), stream_)); - // TODO if (config_.max_num_polyline < polyline_data.PolylineNum) { CHECK_CUDA_ERROR(polylinePreprocessWithTopkLauncher( polyline_data.PolylineNum, config_.max_num_polyline, polyline_data.PointNum, polyline_data.StateDim, d_polyline_.get(), agent_data.TargetNum, agent_data.StateDim, - d_target_state_.get(), config_.offset_xy[0], config_.offset_xy[1], d_topk_index_.get(), - d_in_polyline_.get(), d_in_polyline_mask_.get(), d_in_polyline_center_.get(), stream_)); + d_target_state_.get(), d_in_polyline_.get(), d_in_polyline_mask_.get(), + d_in_polyline_center_.get(), stream_)); } else { assert( ("The number of config.max_num_polyline and PolylineData.PolylineNum must be same", From da1f893c057cbe11a22c1b8d86102ea011bbed75 Mon Sep 17 00:00:00 2001 From: ktro2828 Date: Fri, 19 Apr 2024 01:38:07 +0900 Subject: [PATCH 3/3] feat: update polyline preprocesses Signed-off-by: ktro2828 --- .../preprocess/polyline_preprocess_kernel.cuh | 117 +++++---- .../preprocess/polyline_preprocess_kernel.cu | 240 +++++++++--------- src/mtr.cpp | 2 +- 3 files changed, 182 insertions(+), 177 deletions(-) diff --git a/lib/include/preprocess/polyline_preprocess_kernel.cuh b/lib/include/preprocess/polyline_preprocess_kernel.cuh index 265faea..cfab41d 100644 --- a/lib/include/preprocess/polyline_preprocess_kernel.cuh +++ b/lib/include/preprocess/polyline_preprocess_kernel.cuh @@ -16,67 +16,71 @@ #define PREPROCESS__POLYLINE_PREPROCESS_KERNEL_CUH_ /** - * @brief Transform to target agent coordinates system. + * @brief Transform polylines to target agent coordinate system and extend its feature with previous + * x, y`(D->D+2)`. + * + * Points which all elements are 0.0 except of typeID are filled by 0.0 and the corresponding + * mask value is 0.0 too. * * @param K The number of polylines. * @param P The number of points contained in each polyline. - * @param PointDim The number of point state dimensions. - * @param in_polyline Source polylines, in shape [K*P*PointDim]. + * @param PointDim The number of point dimensions, expecting (x, y, z, dx, dy, dz, typeID). + * @param inPolyline Source polyline, in shape [K*P*D]. * @param B The number of target agents. - * @param AgentDim The number of agent state dimensions. - * @param target_state Source target state at the latest timestamp, in shape [B*AgentDim]. - * @param out_polyline Output polylines, in shape [B*K*P*(PointDim+2)]. - * @param out_polyline_mask Output polyline mask, in shape [B*K*P]. + * @param AgentDim The number of agent state dimensions, expecting (x, y, z, length, width, height, + * yaw, vx, vy, ax, ay). + * @param targetState Source target agent states, in shape [B*AgentDim]. + * @param outPolyline Output polyline, in shape [B*K*P*(PointDim+2)]. + * @param outPolylineMask Output polyline mask, in shape [B*K*P]. */ __global__ void transformPolylineKernel( - const int K, const int P, const int PointDim, const float * in_polyline, const int B, - const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask); + const int K, const int P, const int PointDim, const float * inPolyline, const int B, + const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask); /** - * @brief Set the previous xy position at the end of element. + * @brief Set the Previous Position Kernel object * * @param B The number of target agents. * @param K The number of polylines. * @param P The number of points contained in each polyline. - * @param D The number of point dimensions. - * @param mask The polyline mask, in shape [B*K*P]. - * @param polyline The container of polylines, in shape [B*K*P*D] + * @param D The number of point dimensions, expecting (x, y, ..., preX, preY). + * @param polyline Source polyline, in shape [B*K*P*D]. */ __global__ void setPreviousPositionKernel( - const int B, const int K, const int P, const int D, const bool * mask, float * polyline); + const int B, const int K, const int P, const int D, float * polyline); /** - * @brief Calculate distances from targets to polylines. + * @brief Calculate center distance from target agent to each polyline. * - * @note `inPolyline` must have been transformed to target coordinates system. + * @note Polyline must have been transformed to target agent coordinates system. * * @param B The number of target agents. - * @param L The number of polylines. + * @param K The number of polylines. * @param P The number of points contained in each polyline. - * @param AgentDim The number agent state dimensions. - * @param targetState Source target agent state, in shape [B*AgentDim]. - * @param PointDim The number of point dimensions. - * @param inPolyline Source polyline, in shape [B*L*P*PointDim]. - * @param inPolylineMask Source polyline mask, in shape [B*K*P]. - * @param outDistance Output distance, in shape [B*L]. + * @param D The number of point dimensions, expecting [x, y, ...]. + * @param polyline Source polyline, in shape [B*K*P*D]. + * @param polylineMask Source polyline mask, in shape [B*K*P]. + * @param distance Output calculated distances, in shape [B*K]. */ __global__ void calculateCenterDistanceKernel( - const int B, const int L, const int P, const int AgentDim, const float * targetState, - const int PointDim, const float * inPolyline, const bool * inPolylineMask, float * outDistance); + const int B, const int K, const int P, const int D, const float * polyline, + const bool * polylineMask, float * distance); /** * @brief Extract K polylines with the smallest distances. * - * @note This kernel is implemented using shared memory. + * @note Because this kernel supposes to allocate shared memory dynamically it is necessary to + * specify `sizeof(float) * L` in the kernel execution configuration. * - * @param K The number elements to be extracted (K <= L). + * @param K The number of polylines to be extracted. * @param B The number of target agents. - * @param L The number of polylines. + * @param L The number of original polylines. * @param P The number of points contained in each polyline. * @param D The number of point dimensions. * @param inPolyline Source polyline, in shape [B*L*P*D]. * @param inPolylineMask Source polyline mask, in shape [B*L*P]. - * @param inDistance Source polyline distances, in shape [B*L]. + * @param inDistance Source distances from target agents to the centers of each polyline, in shape + * [B*L]. * @param outPolyline Output polyline, in shape [B*K*P*D]. * @param outPolylineMask Output polyline mask, in shape [B*K*P]. */ @@ -86,43 +90,46 @@ __global__ void extractTopKPolylineKernel( bool * outPolylineMask); /** - * @brief Calculate the magnitudes of polylines. + * @brief Calculate center positions of each polyline with respect to target agent coordinates + * system. + * + * @note Polyline must have been transformed to target agent coordinates system. * * @param B The number of target agents. * @param K The number of polylines. * @param P The number of points contained in each polyline. - * @param D The number of point dimensions. - * @param polyline Source polylines, in shape [B*K*P*(D + 2)]. - * @param mask Source polyline masks, in shape [B*K*P]. - * @param center Output magnitudes of polylines, in shape [B*K*3]. + * @param D The number of point dimensions, expecting (x, y, z, ...). + * @param polyline Source polyline, in shape [B*K*P*D]. + * @param polylineMask Source polyline mask, in shape [B*K*P]. + * @param center Output centers, in shape [B*K*3]. */ __global__ void calculatePolylineCenterKernel( - const int B, const int K, const int P, const int PointDim, const float * polyline, - const bool * mask, float * center); + const int B, const int K, const int P, const int D, const float * polyline, + const bool * polylineMask, float * center); /** * @brief In cases of the number of batch polylines (L) is greater than K, * extracts the topK elements. * - * @param L The number of source polylines. - * @param K The number of polylines expected as the model input. + * @param K The number of polylines to be extracted. + * @param L The number of original polylines. * @param P The number of points contained in each polyline. * @param PointDim The number of point state dimensions. - * @param AgentDim The number of agent state dimensions. - * @param in_polyline Source polylines, in shape [L*P*PointDim]. + * @param inPolyline Source polylines, in shape [L*P*PointDim]. * @param B The number of target agents. - * @param target_state Target agent state at the latest timestamp, in shape [B, AgentDim]. - * @param out_polyline Output polylines, in shape [B*K*P*(PointDim+2)]. - * @param out_polyline_mask Output polyline masks, in shape [B*K*P]. - * @param out_polyline_center Output magnitudes of each polyline with respect to target coords, + * @param AgentDim The number of agent state dimensions. + * @param targetState Target agent state at the latest timestamp, in shape [B*AgentDim]. + * @param outPolyline Output polylines, in shape [B*K*P*(PointDim+2)]. + * @param outPolylineMask Output polyline masks, in shape [B*K*P]. + * @param outPolylineCenter Output magnitudes of each polyline with respect to target coords, * in shape [B*K*3]. * @param stream CUDA stream. * @return cudaError_t */ cudaError_t polylinePreprocessWithTopkLauncher( - const int L, const int K, const int P, const int PointDim, const float * in_polyline, const int B, - const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask, - float * out_polyline_center, cudaStream_t stream); + const int K, const int L, const int P, const int PointDim, const float * inPolyline, const int B, + const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask, + float * outPolylineCenter, cudaStream_t stream); /** * @brief Do preprocess for polyline if the number of batched polylines is K. @@ -130,20 +137,20 @@ cudaError_t polylinePreprocessWithTopkLauncher( * @param K The number of polylines. * @param P The number of points contained in each polyline. * @param PointDim The number of point state dimensions. - * @param in_polyline Source polylines, in shape [K*P*PointDim]. + * @param inPolyline Source polylines, in shape [K*P*PointDim]. * @param B The number of target agents. * @param AgentDim The number of agent state dimensions. - * @param target_state Target agent state at the latest timestamp, in shape [B, AgentDim]. - * @param out_polyline Output polylines, in shape [B*K*P*(PointDim + 2)]. - * @param out_polyline_mask Output polyline masks, in shape [B*K*P]. - * @param out_polyline_center Output magnitudes of each polyline with respect to target coords, + * @param targetState Target agent state at the latest timestamp, in shape [B*AgentDim]. + * @param outPolyline Output polylines, in shape [B*K*P*(PointDim+2)]. + * @param outPolylineMask Output polyline masks, in shape [B*K*P]. + * @param outPolylineCenter Output magnitudes of each polyline with respect to target coords, * in shape [B*K*3]. * @param stream CUDA stream. * @return cudaError_t */ cudaError_t polylinePreprocessLauncher( - const int K, const int P, const int PointDim, const float * in_polyline, const int B, - const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask, - float * out_polyline_center, cudaStream_t stream); + const int K, const int P, const int PointDim, const float * inPolyline, const int B, + const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask, + float * outPolylineCenter, cudaStream_t stream); #endif // PREPROCESS__POLYLINE_PREPROCESS_KERNEL_CUH_ diff --git a/lib/src/preprocess/polyline_preprocess_kernel.cu b/lib/src/preprocess/polyline_preprocess_kernel.cu index c66e5de..62e251b 100644 --- a/lib/src/preprocess/polyline_preprocess_kernel.cu +++ b/lib/src/preprocess/polyline_preprocess_kernel.cu @@ -20,8 +20,8 @@ #include __global__ void transformPolylineKernel( - const int K, const int P, const int PointDim, const float * in_polyline, const int B, - const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask) + const int K, const int P, const int PointDim, const float * inPolyline, const int B, + const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask) { int b = blockIdx.x * blockDim.x + threadIdx.x; int k = blockIdx.y * blockDim.y + threadIdx.y; @@ -31,52 +31,59 @@ __global__ void transformPolylineKernel( return; } - const int src_polyline_idx = (k * P + p) * PointDim; - const float x = in_polyline[src_polyline_idx]; - const float y = in_polyline[src_polyline_idx + 1]; - const float z = in_polyline[src_polyline_idx + 2]; - const float dx = in_polyline[src_polyline_idx + 3]; - const float dy = in_polyline[src_polyline_idx + 4]; - const float dz = in_polyline[src_polyline_idx + 5]; - const float type_id = in_polyline[src_polyline_idx + 6]; - - const int center_idx = b * AgentDim; - const float center_x = target_state[center_idx]; - const float center_y = target_state[center_idx + 1]; - const float center_z = target_state[center_idx + 2]; - const float center_yaw = target_state[center_idx + 6]; - const float center_cos = cos(center_yaw); - const float center_sin = sin(center_yaw); - - // do transform - const float trans_x = center_cos * (x - center_x) - center_sin * (y - center_y); - const float trans_y = center_sin * (x - center_x) + center_cos * (y - center_y); - const float trans_z = z - center_z; - const float trans_dx = center_cos * dx - center_sin * dy; - const float trans_dy = center_sin * dx + center_cos * dy; - const float trans_dz = dz; - - const int out_idx = (b * K * P + k * P + p) * (PointDim + 2); - out_polyline[out_idx] = trans_x; - out_polyline[out_idx + 1] = trans_y; - out_polyline[out_idx + 2] = trans_z; - out_polyline[out_idx + 3] = trans_dx; - out_polyline[out_idx + 4] = trans_dy; - out_polyline[out_idx + 5] = trans_dz; - out_polyline[out_idx + 6] = type_id; - - bool is_valid = false; - for (size_t i = 0; i < PointDim - 1; ++i) { - if (out_polyline[out_idx + i] != 0.0f) { - is_valid = true; + const int inIdx = (k * P + p) * PointDim; + const int outIdx = b * K * P + k * P + p; + bool isValid = false; + for (int d = 0; d < PointDim - 1; ++d) { + if (inPolyline[inIdx + d] != 0.0f) { + isValid = true; } } - int out_mask_idx = b * K * P + k * P + p; - out_polyline_mask[out_mask_idx] = is_valid; + outPolylineMask[outIdx] = isValid; + + // initialize output polyline with 0.0 + for (int d = 0; d < PointDim + 2; ++d) { + outPolyline[outIdx * (PointDim + 2) + d] = 0.0f; + } + + // set transformed values if valid, otherwise all 0.0. + if (isValid) { + const float x = inPolyline[inIdx]; + const float y = inPolyline[inIdx + 1]; + const float z = inPolyline[inIdx + 2]; + const float dx = inPolyline[inIdx + 3]; + const float dy = inPolyline[inIdx + 4]; + const float dz = inPolyline[inIdx + 5]; + const float typeID = inPolyline[inIdx + 6]; + + const int centerIdx = b * AgentDim; + const float centerX = targetState[centerIdx]; + const float centerY = targetState[centerIdx + 1]; + const float centerZ = targetState[centerIdx + 2]; + const float centerYaw = targetState[centerIdx + 6]; + const float centerCos = cosf(centerYaw); + const float centerSin = sinf(centerYaw); + + // do transform + const float transX = centerCos * (x - centerX) - centerSin * (y - centerY); + const float transY = centerSin * (x - centerX) + centerCos * (y - centerY); + const float transZ = z - centerZ; + const float transDx = centerCos * dx - centerSin * dy; + const float transDy = centerSin * dx + centerCos * dy; + const float transDz = dz; + + outPolyline[outIdx * (PointDim + 2)] = transX; + outPolyline[outIdx * (PointDim + 2) + 1] = transY; + outPolyline[outIdx * (PointDim + 2) + 2] = transZ; + outPolyline[outIdx * (PointDim + 2) + 3] = transDx; + outPolyline[outIdx * (PointDim + 2) + 4] = transDy; + outPolyline[outIdx * (PointDim + 2) + 5] = transDz; + outPolyline[outIdx * (PointDim + 2) + 6] = typeID; + } } __global__ void setPreviousPositionKernel( - const int B, const int K, const int P, const int D, const bool * mask, float * polyline) + const int B, const int K, const int P, const int D, float * polyline) { int b = blockIdx.x * blockDim.x + threadIdx.x; int k = blockIdx.y * blockDim.y + threadIdx.y; @@ -86,27 +93,20 @@ __global__ void setPreviousPositionKernel( return; } - const int cur_idx = (b * K * P + k * P + p) * D; - const int pre_idx = k == 0 ? cur_idx : (b * K * P + (k - 1) * P + p) * D; - - polyline[cur_idx + D - 2] = polyline[pre_idx]; - polyline[cur_idx + D - 1] = polyline[pre_idx + 1]; + const int curIdx = (b * K * P + k * P + p) * D; + const int preIdx = p == 0 ? curIdx : (b * K * P + k * P + p - 1) * D; - const int mask_idx = b * K * P + k * P + p; - if (!mask[mask_idx]) { - for (int d = 0; d < D; ++d) { - polyline[cur_idx + d] = 0.0f; - } - } + polyline[curIdx + D - 2] = polyline[preIdx]; // x + polyline[curIdx + D - 1] = polyline[preIdx + 1]; // y } __global__ void calculateCenterDistanceKernel( - const int B, const int L, const int P, const int AgentDim, const float * targetState, - const int PointDim, const float * inPolyline, const bool * inPolylineMask, float * outDistance) + const int B, const int K, const int P, const int D, const float * polyline, + const bool * polylineMask, float * distance) { int b = blockIdx.x * blockDim.x + threadIdx.x; - int l = blockIdx.y * blockDim.y + threadIdx.y; - if (b >= B || l >= L) { + int k = blockIdx.y * blockDim.y + threadIdx.y; + if (b >= B || k >= K) { return; } @@ -114,19 +114,17 @@ __global__ void calculateCenterDistanceKernel( float sumX = 0.0f, sumY = 0.0f; int numValid = 0; for (int p = 0; p < P; ++p) { - int idx = b * L * P + l * P + p; - float x = inPolyline[idx * PointDim]; - float y = inPolyline[idx * PointDim + 1]; - if (inPolylineMask[idx]) { - sumX += x; - sumY += y; + int idx = b * K * P + k * P + p; + if (polylineMask[idx]) { + sumX += polyline[idx * D]; + sumY += polyline[idx * D + 1]; ++numValid; } } float centerX = sumX / fmaxf(1.0f, numValid); float centerY = sumY / fmaxf(1.0f, numValid); - outDistance[b * L + l] = sqrtf(powf(centerX, 2) + powf(centerY, 2)); + distance[b * K + k] = hypot(centerX, centerY); } __global__ void extractTopKPolylineKernel( @@ -134,11 +132,13 @@ __global__ void extractTopKPolylineKernel( const bool * inPolylineMask, const float * inDistance, float * outPolyline, bool * outPolylineMask) { - int b = blockIdx.x; // Batch index + int b = blockIdx.x; // Batch index + int p = blockIdx.y * blockDim.y + threadIdx.y; // Point index + int d = blockIdx.z * blockDim.z + threadIdx.z; // Dim index extern __shared__ float distances[]; // Load distances into shared memory - int tid = threadIdx.x; + int tid = threadIdx.x; // Polyline index if (tid < L) { distances[tid] = inDistance[b * L + tid]; } @@ -156,29 +156,22 @@ __global__ void extractTopKPolylineKernel( minIndex = l; } } + __syncthreads(); if (tid == k) { // this thread will handle copying the k-th smallest polyline - for (int p = 0; p < P; p++) { - for (int d = 0; d < D; d++) { - outPolyline[b * K * P * D + k * P * D + p * D + d] = - inPolyline[b * L * P * D + minIndex * P * D + p * D + d]; - } - outPolylineMask[b * K * P + k * P + p] = inPolylineMask[b * L * P + minIndex * P + p]; - } + int inIdx = b * L * P + minIndex * P + p; + int outIdx = b * K * P + k * P + p; + outPolyline[outIdx * D + d] = inPolyline[inIdx * D + d]; + outPolylineMask[outIdx] = inPolylineMask[inIdx]; } distances[minIndex] = FLT_MAX; // exclude this index from future consideration } } __global__ void calculatePolylineCenterKernel( - const int B, const int K, const int P, const int PointDim, const float * polyline, - const bool * mask, float * center) + const int B, const int K, const int P, const int D, const float * polyline, + const bool * polylineMask, float * center) { - // --- pseudo code --- - // sum = (polylines[:, :, :, 0:3] * mask[:, :, :, None]).sum(dim=2) - // center = sum / clampMIN(mask.sum(dim=2), min=1.0) - // ------------------- - int b = blockIdx.x * blockDim.x + threadIdx.x; int k = blockIdx.y * blockDim.y + threadIdx.y; @@ -187,67 +180,70 @@ __global__ void calculatePolylineCenterKernel( } // initialize with 0.0 - int center_idx = (b * K + k) * 3; + int centerIdx = (b * K + k) * 3; for (int d = 0; d < 3; ++d) { - center[center_idx + d] = 0.0f; + center[centerIdx + d] = 0.0f; } - float sum_xyz[3] = {0.0f, 0.0f, 0.0f}; - int count = 0; + // calculate polyline center + float sumX = 0.0f, sumY = 0.0f, sumZ = 0.0f; + int numValid = 0; for (int p = 0; p < P; ++p) { - int src_idx = b * K * P + k * P + p; - if (mask[src_idx]) { - for (int d = 0; d < 3; ++d) { - sum_xyz[d] += polyline[src_idx * PointDim + d]; - } - ++count; + int idx = b * K * P + k * P + p; + if (polylineMask[idx]) { + sumX += polyline[idx * D]; + sumY += polyline[idx * D + 1]; + sumZ += polyline[idx * D + 2]; + ++numValid; } } - count = max(count, 1); - for (int d = 0; d < 3; ++d) { - center[center_idx + d] = sum_xyz[d] / static_cast(count); - } + center[centerIdx] = sumX / fmaxf(1.0f, numValid); + center[centerIdx + 1] = sumY / fmaxf(1.0f, numValid); + center[centerIdx + 2] = sumZ / fmaxf(1.0f, numValid); } cudaError_t polylinePreprocessWithTopkLauncher( - const int L, const int K, const int P, const int PointDim, const float * in_polyline, const int B, - const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask, - float * out_polyline_center, cudaStream_t stream) + const int K, const int L, const int P, const int PointDim, const float * inPolyline, const int B, + const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask, + float * outPolylineCenter, cudaStream_t stream) { if (L < K) { std::cerr << "L must be greater than K, but got L: " << L << ", K: " << K << std::endl; return cudaError_t::cudaErrorInvalidValue; } + const int outPointDim = PointDim + 2; + float *tmpPolyline, *tmpDistance; bool * tmpPolylineMask; - CHECK_CUDA_ERROR(cudaMallocAsync(&tmpPolyline, sizeof(float) * B * L * P * PointDim, stream)); + CHECK_CUDA_ERROR(cudaMallocAsync(&tmpPolyline, sizeof(float) * B * L * P * outPointDim, stream)); CHECK_CUDA_ERROR(cudaMallocAsync(&tmpPolylineMask, sizeof(bool) * B * L * P, stream)); CHECK_CUDA_ERROR(cudaMallocAsync(&tmpDistance, sizeof(float) * B * L, stream)); // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess` constexpr int threadsPerBlock = 256; - const dim3 block3dl(B, L / threadsPerBlock, P); - transformPolylineKernel<<>>( - L, P, PointDim, in_polyline, B, AgentDim, target_state, tmpPolyline, tmpPolylineMask); + const dim3 blocks1(B, L / threadsPerBlock, P); + transformPolylineKernel<<>>( + L, P, PointDim, inPolyline, B, AgentDim, targetState, tmpPolyline, tmpPolylineMask); - const dim3 block2dl(B, L / threadsPerBlock); - calculateCenterDistanceKernel<<>>( - B, L, P, AgentDim, target_state, PointDim, tmpPolyline, tmpPolylineMask, tmpDistance); + const dim3 blocks2(B, L / threadsPerBlock); + calculateCenterDistanceKernel<<>>( + B, L, P, outPointDim, tmpPolyline, tmpPolylineMask, tmpDistance); - const size_t sharedMemSize = sizeof(float) * K; - extractTopKPolylineKernel<<>>( - K, B, L, P, PointDim, tmpPolyline, tmpPolylineMask, tmpDistance, out_polyline, - out_polyline_mask); + const dim3 blocks3(B, P, outPointDim); + const size_t sharedMemSize = sizeof(float) * L; + extractTopKPolylineKernel<<>>( + K, B, L, P, outPointDim, tmpPolyline, tmpPolylineMask, tmpDistance, outPolyline, + outPolylineMask); - const dim3 block3dk(B, K / threadsPerBlock, P); - setPreviousPositionKernel<<>>( - B, K, P, PointDim, out_polyline_mask, out_polyline); + const dim3 blocks4(B, K / threadsPerBlock, P); + setPreviousPositionKernel<<>>( + B, K, P, outPointDim, outPolyline); - const dim3 block2dk(B, K / threadsPerBlock); - calculatePolylineCenterKernel<<>>( - B, K, P, PointDim, out_polyline, out_polyline_mask, out_polyline_center); + const dim3 blocks5(B, K / threadsPerBlock); + calculatePolylineCenterKernel<<>>( + B, K, P, outPointDim, outPolyline, outPolylineMask, outPolylineCenter); CHECK_CUDA_ERROR(cudaFree(tmpPolyline)); CHECK_CUDA_ERROR(cudaFree(tmpPolylineMask)); @@ -257,23 +253,25 @@ cudaError_t polylinePreprocessWithTopkLauncher( } cudaError_t polylinePreprocessLauncher( - const int K, const int P, const int PointDim, const float * in_polyline, const int B, - const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask, - float * out_polyline_center, cudaStream_t stream) + const int K, const int P, const int PointDim, const float * inPolyline, const int B, + const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask, + float * outPolylineCenter, cudaStream_t stream) { + const int outPointDim = PointDim + 2; + // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess` constexpr int threadsPerBlock = 256; const dim3 block3d(B, (K - 1) / threadsPerBlock, P); transformPolylineKernel<<>>( - K, P, PointDim, in_polyline, B, AgentDim, target_state, out_polyline, out_polyline_mask); + K, P, PointDim, inPolyline, B, AgentDim, targetState, outPolyline, outPolylineMask); setPreviousPositionKernel<<>>( - B, K, P, PointDim, out_polyline_mask, out_polyline); + B, K, P, outPointDim, outPolyline); const dim3 block2d(B, (K - 1) / threadsPerBlock); calculatePolylineCenterKernel<<>>( - B, K, P, PointDim, out_polyline, out_polyline_mask, out_polyline_center); + B, K, P, outPointDim, outPolyline, outPolylineMask, outPolylineCenter); return cudaGetLastError(); } diff --git a/src/mtr.cpp b/src/mtr.cpp index 8e83254..13e641d 100644 --- a/src/mtr.cpp +++ b/src/mtr.cpp @@ -169,7 +169,7 @@ bool TrtMTR::preProcess(AgentData & agent_data, PolylineData & polyline_data) if (config_.max_num_polyline < polyline_data.PolylineNum) { CHECK_CUDA_ERROR(polylinePreprocessWithTopkLauncher( - polyline_data.PolylineNum, config_.max_num_polyline, polyline_data.PointNum, + config_.max_num_polyline, polyline_data.PolylineNum, polyline_data.PointNum, polyline_data.StateDim, d_polyline_.get(), agent_data.TargetNum, agent_data.StateDim, d_target_state_.get(), d_in_polyline_.get(), d_in_polyline_mask_.get(), d_in_polyline_center_.get(), stream_));