From 7d239092bcbda2c78a863658c40a70d76821c0b6 Mon Sep 17 00:00:00 2001
From: ktro2828 <kotaro.uetake@tier4.jp>
Date: Wed, 17 Apr 2024 20:54:52 +0900
Subject: [PATCH 1/3] tmp: update topk kernel

Signed-off-by: ktro2828 <kotaro.uetake@tier4.jp>
---
 .../preprocess/polyline_preprocess_kernel.cu  | 51 +++++++++++--------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/lib/src/preprocess/polyline_preprocess_kernel.cu b/lib/src/preprocess/polyline_preprocess_kernel.cu
index ec5f113..232f71d 100644
--- a/lib/src/preprocess/polyline_preprocess_kernel.cu
+++ b/lib/src/preprocess/polyline_preprocess_kernel.cu
@@ -98,28 +98,39 @@ __global__ void setPreviousPositionKernel(
 __global__ void extractTopkKernel(
   const int K, const int L, const int P, const int B, const float offsetX, const float offsetY,
   const int AgentDim, const float * targetState, const int PointDim, const float * inPolyline,
-  float * outPolyline)
+  int * topkIndex, float * outPolyline)
 {
-  // --- pseudo code ---
-  // mask = All(polyline != 0.0, dim=2)
-  // polylineCenter = polyline[:, :, 0:2].sum(dim=1) / clampMin(mask.sum(dim=1), min=1.0)
-  // offset = rotateAlongZ((offset_x, offset_y), target_state[:, 6])
-  // targetOffsetPos = target_state[:, 0:2] + offset
-  // distances = (target_offset_pos - center)
-  // _, topkIdxs = distances.topk(k=K, descending=True)
-  // outPolyline = inPolyline[topkIdxs]
-  // -------------------
-
-  // int targetIdx = blockIdx.x;
-
-  // const float targetX = targetState[targetIdx];
-  // const float targetY = targetState[targetIdx + 1];
-  // const float targetYaw = targetState[targetIdx + 6];
-  // const float targetCos = cos(targetYaw);
-  // const float targetSin = sin(targetYaw);
+  int l = blockIdx.x * blockDim.x + threadIdx.x;
+  int b = blockIdx.y * blockDim.y + threadIdx.y;
+  if (l >= L || b >= B) {
+    return;
+  }
 
-  // const float transTargetX = targetCos * offsetX + targetSin * offsetY + targetX;
-  // const float transTargetY = -targetSin * offsetX + targetCos * offsetY + targetY;
+  // calculate polyline center
+  float sumX = 0.0f, sumY = 0.0f;
+  int validPoints = 0;
+  for (int p = 0; p < P; ++p) {
+    float x = inPolyline[(l * P + p) * PointDim];
+    float y = inPolyline[(l * P + p) * PointDim + 1];
+    if (x != 0.0f || y != 0.0f) {
+      sumX += x;
+      sumY += y;
+      ++validPoints;
+    }
+  }
+  float centerX = sumX / fmaxf(1.0f, validPoints);
+  float centerY = sumY / fmaxf(1.0f, validPoints);
+
+  // apply offset to target state
+  float yaw = targetState[b * AgentDim + 6];
+  float cosYaw = cosf(yaw);
+  float sinYaw = sinf(yaw);
+  float transOffsetX = offsetX * cosYaw - offsetY * sinYaw;
+  float transOffsetY = offsetX * sinYaw + offsetY * cosYaw;
+  float targetX = targetState[b * AgentDim] + transOffsetX;
+  float targetY = targetState[b * AgentDim + 1] + transOffsetY;
+
+  float distance = sqrtf(powf(targetX - centerX, 2) + powf(targetY - centerY, 2));
 }
 
 __global__ void calculatePolylineCenterKernel(

From b732f33c087f0c36051ddf0b909f104eefd261f2 Mon Sep 17 00:00:00 2001
From: ktro2828 <kotaro.uetake@tier4.jp>
Date: Thu, 18 Apr 2024 17:54:48 +0900
Subject: [PATCH 2/3] feat: add support of extracting topk polylines

Signed-off-by: ktro2828 <kotaro.uetake@tier4.jp>
---
 CMakeLists.txt                                |   1 +
 include/mtr/cuda_helper.hpp                   |   1 +
 include/mtr/mtr.hpp                           |   5 +-
 .../preprocess/polyline_preprocess_kernel.cuh |  58 +++++---
 .../preprocess/polyline_preprocess_kernel.cu  | 135 +++++++++++++-----
 src/mtr.cpp                                   |   6 +-
 6 files changed, 144 insertions(+), 62 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 12b57a7..29ead5d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -63,6 +63,7 @@ target_link_libraries(custom_kernel
   ${CUDNN_LIBRARIES})
 target_include_directories(custom_kernel PUBLIC
   lib/include
+  ${PROJECT_SOURCE_DIR}/include
 )
 
 # MTR
diff --git a/include/mtr/cuda_helper.hpp b/include/mtr/cuda_helper.hpp
index 888e2ac..0ecb2c5 100644
--- a/include/mtr/cuda_helper.hpp
+++ b/include/mtr/cuda_helper.hpp
@@ -21,6 +21,7 @@
 
 #include <cuda_runtime_api.h>
 
+#include <iostream>
 #include <memory>
 #include <sstream>
 #include <stdexcept>
diff --git a/include/mtr/mtr.hpp b/include/mtr/mtr.hpp
index a92ca70..2820e0c 100644
--- a/include/mtr/mtr.hpp
+++ b/include/mtr/mtr.hpp
@@ -52,7 +52,7 @@ struct MTRConfig
     const std::vector<std::string> & target_labels = {"VEHICLE", "PEDESTRIAN", "CYCLIST"},
     const size_t num_past = 10, const size_t num_mode = 6, const size_t num_future = 80,
     const size_t max_num_polyline = 768, const size_t max_num_point = 20,
-    const float point_break_distance = 1.0f, const std::array<float, 2> & offset_xy = {30.0f, 0.0f},
+    const float point_break_distance = 1.0f,
     const std::string & intention_point_filepath = "./data/waymo64.csv",
     const size_t num_intention_point_cluster = 64)
   : target_labels(target_labels),
@@ -63,7 +63,6 @@ struct MTRConfig
     max_num_polyline(max_num_polyline),
     max_num_point(max_num_point),
     point_break_distance(point_break_distance),
-    offset_xy(offset_xy),
     intention_point_filepath(intention_point_filepath),
     num_intention_point_cluster(num_intention_point_cluster)
   {
@@ -77,7 +76,6 @@ struct MTRConfig
   size_t max_num_polyline;
   size_t max_num_point;
   float point_break_distance;
-  std::array<float, 2> offset_xy;
   std::string intention_point_filepath;
   size_t num_intention_point_cluster;
 };  // struct MTRConfig
@@ -166,7 +164,6 @@ class TrtMTR
   cuda::unique_ptr<float[]> d_target_state_{nullptr};
   cuda::unique_ptr<float[]> d_intention_points_{nullptr};
   cuda::unique_ptr<float[]> d_polyline_{nullptr};
-  cuda::unique_ptr<int[]> d_topk_index_{nullptr};
 
   // preprocessed inputs
   cuda::unique_ptr<float[]> d_in_trajectory_{nullptr};
diff --git a/lib/include/preprocess/polyline_preprocess_kernel.cuh b/lib/include/preprocess/polyline_preprocess_kernel.cuh
index 8cfcd08..265faea 100644
--- a/lib/include/preprocess/polyline_preprocess_kernel.cuh
+++ b/lib/include/preprocess/polyline_preprocess_kernel.cuh
@@ -46,24 +46,44 @@ __global__ void setPreviousPositionKernel(
   const int B, const int K, const int P, const int D, const bool * mask, float * polyline);
 
 /**
- * @brief Extract TopK elements.
+ * @brief Calculate distances from targets to polylines.
  *
- * @param K The number of K.
- * @param L The number of source polylines.
+ * @note `inPolyline` must have been transformed to target coordinates system.
+ *
+ * @param B The number of target agents.
+ * @param L The number of polylines.
  * @param P The number of points contained in each polyline.
+ * @param AgentDim The number agent state dimensions.
+ * @param targetState Source target agent state, in shape [B*AgentDim].
+ * @param PointDim The number of point dimensions.
+ * @param inPolyline Source polyline, in shape [B*L*P*PointDim].
+ * @param inPolylineMask Source polyline mask, in shape [B*K*P].
+ * @param outDistance Output distance, in shape [B*L].
+ */
+__global__ void calculateCenterDistanceKernel(
+  const int B, const int L, const int P, const int AgentDim, const float * targetState,
+  const int PointDim, const float * inPolyline, const bool * inPolylineMask, float * outDistance);
+
+/**
+ * @brief Extract K polylines with the smallest distances.
+ *
+ * @note This kernel is implemented using shared memory.
+ *
+ * @param K The number elements to be extracted (K <= L).
  * @param B The number of target agents.
- * @param offsetX X offset position.
- * @param offsetY Y offset position.
- * @param AgentDim The number of agent state dimensions.
- * @param targetState Source state of target agents, in shape [B*AgentDim].
- * @param PointDim The number of point state dimensions.
- * @param inPolyline Source polylines, in shape [L*P*PointDim].
- * @param outPolyline Output polylines, in shape [K*P*PointDim].
+ * @param L The number of polylines.
+ * @param P The number of points contained in each polyline.
+ * @param D The number of point dimensions.
+ * @param inPolyline Source polyline, in shape [B*L*P*D].
+ * @param inPolylineMask Source polyline mask, in shape [B*L*P].
+ * @param inDistance Source polyline distances, in shape [B*L].
+ * @param outPolyline Output polyline, in shape [B*K*P*D].
+ * @param outPolylineMask Output polyline mask, in shape [B*K*P].
  */
-__global__ void extractTopkKernel(
-  const int K, const int L, const int P, const int B, const float offsetX, const float offsetY,
-  const int AgentDim, const float * targetState, const int PointDim, const float * inPolyline,
-  float * outPolyline);
+__global__ void extractTopKPolylineKernel(
+  const int K, const int B, const int L, const int P, const int D, const float * inPolyline,
+  const bool * inPolylineMask, const float * inDistance, float * outPolyline,
+  bool * outPolylineMask);
 
 /**
  * @brief Calculate the magnitudes of polylines.
@@ -82,7 +102,7 @@ __global__ void calculatePolylineCenterKernel(
 
 /**
  * @brief In cases of the number of batch polylines (L) is greater than K,
- *  extacts the topK elements.
+ *  extracts the topK elements.
  *
  * @param L The number of source polylines.
  * @param K The number of polylines expected as the model input.
@@ -92,9 +112,6 @@ __global__ void calculatePolylineCenterKernel(
  * @param in_polyline Source polylines, in shape [L*P*PointDim].
  * @param B The number of target agents.
  * @param target_state Target agent state at the latest timestamp, in shape [B, AgentDim].
- * @param offset_x The x offset.
- * @param offset_y The y offset.
- * @param topk_index A container to store topK indices, in shape [K].
  * @param out_polyline Output polylines, in shape [B*K*P*(PointDim+2)].
  * @param out_polyline_mask Output polyline masks, in shape [B*K*P].
  * @param out_polyline_center Output magnitudes of each polyline with respect to target coords,
@@ -104,9 +121,8 @@ __global__ void calculatePolylineCenterKernel(
  */
 cudaError_t polylinePreprocessWithTopkLauncher(
   const int L, const int K, const int P, const int PointDim, const float * in_polyline, const int B,
-  const int AgentDim, const float * target_state, const float offsetX, const float offsetY,
-  int * topk_index, float * out_polyline, bool * out_polyline_mask, float * out_polyline_center,
-  cudaStream_t stream);
+  const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask,
+  float * out_polyline_center, cudaStream_t stream);
 
 /**
  * @brief Do preprocess for polyline if the number of batched polylines is K.
diff --git a/lib/src/preprocess/polyline_preprocess_kernel.cu b/lib/src/preprocess/polyline_preprocess_kernel.cu
index 232f71d..c66e5de 100644
--- a/lib/src/preprocess/polyline_preprocess_kernel.cu
+++ b/lib/src/preprocess/polyline_preprocess_kernel.cu
@@ -12,8 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "mtr/cuda_helper.hpp"
 #include "preprocess/polyline_preprocess_kernel.cuh"
 
+#include <float.h>
+
 #include <iostream>
 
 __global__ void transformPolylineKernel(
@@ -62,11 +65,13 @@ __global__ void transformPolylineKernel(
   out_polyline[out_idx + 5] = trans_dz;
   out_polyline[out_idx + 6] = type_id;
 
-  const int out_mask_idx = b * K * P + k * P + p;
   bool is_valid = false;
-  for (size_t i = 0; i < 6; ++i) {
-    is_valid += out_polyline[out_idx + i] != 0.0f;
+  for (size_t i = 0; i < PointDim - 1; ++i) {
+    if (out_polyline[out_idx + i] != 0.0f) {
+      is_valid = true;
+    }
   }
+  int out_mask_idx = b * K * P + k * P + p;
   out_polyline_mask[out_mask_idx] = is_valid;
 }
 
@@ -95,42 +100,74 @@ __global__ void setPreviousPositionKernel(
   }
 }
 
-__global__ void extractTopkKernel(
-  const int K, const int L, const int P, const int B, const float offsetX, const float offsetY,
-  const int AgentDim, const float * targetState, const int PointDim, const float * inPolyline,
-  int * topkIndex, float * outPolyline)
+__global__ void calculateCenterDistanceKernel(
+  const int B, const int L, const int P, const int AgentDim, const float * targetState,
+  const int PointDim, const float * inPolyline, const bool * inPolylineMask, float * outDistance)
 {
-  int l = blockIdx.x * blockDim.x + threadIdx.x;
-  int b = blockIdx.y * blockDim.y + threadIdx.y;
-  if (l >= L || b >= B) {
+  int b = blockIdx.x * blockDim.x + threadIdx.x;
+  int l = blockIdx.y * blockDim.y + threadIdx.y;
+  if (b >= B || l >= L) {
     return;
   }
 
   // calculate polyline center
   float sumX = 0.0f, sumY = 0.0f;
-  int validPoints = 0;
+  int numValid = 0;
   for (int p = 0; p < P; ++p) {
-    float x = inPolyline[(l * P + p) * PointDim];
-    float y = inPolyline[(l * P + p) * PointDim + 1];
-    if (x != 0.0f || y != 0.0f) {
+    int idx = b * L * P + l * P + p;
+    float x = inPolyline[idx * PointDim];
+    float y = inPolyline[idx * PointDim + 1];
+    if (inPolylineMask[idx]) {
       sumX += x;
       sumY += y;
-      ++validPoints;
+      ++numValid;
+    }
+  }
+  float centerX = sumX / fmaxf(1.0f, numValid);
+  float centerY = sumY / fmaxf(1.0f, numValid);
+
+  outDistance[b * L + l] = sqrtf(powf(centerX, 2) + powf(centerY, 2));
+}
+
+__global__ void extractTopKPolylineKernel(
+  const int K, const int B, const int L, const int P, const int D, const float * inPolyline,
+  const bool * inPolylineMask, const float * inDistance, float * outPolyline,
+  bool * outPolylineMask)
+{
+  int b = blockIdx.x;  // Batch index
+  extern __shared__ float distances[];
+
+  // Load distances into shared memory
+  int tid = threadIdx.x;
+  if (tid < L) {
+    distances[tid] = inDistance[b * L + tid];
+  }
+  __syncthreads();
+
+  // Simple selection of the smallest K distances
+  // (this part should be replaced with a more efficient sorting/selecting algorithm)
+  for (int k = 0; k < K; k++) {
+    float minDistance = FLT_MAX;
+    int minIndex = -1;
+
+    for (int l = 0; l < L; l++) {
+      if (distances[l] < minDistance) {
+        minDistance = distances[l];
+        minIndex = l;
+      }
+    }
+
+    if (tid == k) {  // this thread will handle copying the k-th smallest polyline
+      for (int p = 0; p < P; p++) {
+        for (int d = 0; d < D; d++) {
+          outPolyline[b * K * P * D + k * P * D + p * D + d] =
+            inPolyline[b * L * P * D + minIndex * P * D + p * D + d];
+        }
+        outPolylineMask[b * K * P + k * P + p] = inPolylineMask[b * L * P + minIndex * P + p];
+      }
     }
+    distances[minIndex] = FLT_MAX;  // exclude this index from future consideration
   }
-  float centerX = sumX / fmaxf(1.0f, validPoints);
-  float centerY = sumY / fmaxf(1.0f, validPoints);
-
-  // apply offset to target state
-  float yaw = targetState[b * AgentDim + 6];
-  float cosYaw = cosf(yaw);
-  float sinYaw = sinf(yaw);
-  float transOffsetX = offsetX * cosYaw - offsetY * sinYaw;
-  float transOffsetY = offsetX * sinYaw + offsetY * cosYaw;
-  float targetX = targetState[b * AgentDim] + transOffsetX;
-  float targetY = targetState[b * AgentDim + 1] + transOffsetY;
-
-  float distance = sqrtf(powf(targetX - centerX, 2) + powf(targetY - centerY, 2));
 }
 
 __global__ void calculatePolylineCenterKernel(
@@ -175,15 +212,47 @@ __global__ void calculatePolylineCenterKernel(
 
 cudaError_t polylinePreprocessWithTopkLauncher(
   const int L, const int K, const int P, const int PointDim, const float * in_polyline, const int B,
-  const int AgentDim, const float * target_state, const float offset_x, const float offset_y,
-  int * topk_index, float * out_polyline, bool * out_polyline_mask, float * out_polyline_center,
-  cudaStream_t stream)
+  const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask,
+  float * out_polyline_center, cudaStream_t stream)
 {
   if (L < K) {
     std::cerr << "L must be greater than K, but got L: " << L << ", K: " << K << std::endl;
     return cudaError_t::cudaErrorInvalidValue;
   }
 
+  float *tmpPolyline, *tmpDistance;
+  bool * tmpPolylineMask;
+  CHECK_CUDA_ERROR(cudaMallocAsync(&tmpPolyline, sizeof(float) * B * L * P * PointDim, stream));
+  CHECK_CUDA_ERROR(cudaMallocAsync(&tmpPolylineMask, sizeof(bool) * B * L * P, stream));
+  CHECK_CUDA_ERROR(cudaMallocAsync(&tmpDistance, sizeof(float) * B * L, stream));
+
+  // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess`
+  constexpr int threadsPerBlock = 256;
+  const dim3 block3dl(B, L / threadsPerBlock, P);
+  transformPolylineKernel<<<block3dl, threadsPerBlock, 0, stream>>>(
+    L, P, PointDim, in_polyline, B, AgentDim, target_state, tmpPolyline, tmpPolylineMask);
+
+  const dim3 block2dl(B, L / threadsPerBlock);
+  calculateCenterDistanceKernel<<<block2dl, threadsPerBlock, 0, stream>>>(
+    B, L, P, AgentDim, target_state, PointDim, tmpPolyline, tmpPolylineMask, tmpDistance);
+
+  const size_t sharedMemSize = sizeof(float) * K;
+  extractTopKPolylineKernel<<<B, threadsPerBlock, sharedMemSize, stream>>>(
+    K, B, L, P, PointDim, tmpPolyline, tmpPolylineMask, tmpDistance, out_polyline,
+    out_polyline_mask);
+
+  const dim3 block3dk(B, K / threadsPerBlock, P);
+  setPreviousPositionKernel<<<block3dk, threadsPerBlock, 0, stream>>>(
+    B, K, P, PointDim, out_polyline_mask, out_polyline);
+
+  const dim3 block2dk(B, K / threadsPerBlock);
+  calculatePolylineCenterKernel<<<block2dk, threadsPerBlock, 0, stream>>>(
+    B, K, P, PointDim, out_polyline, out_polyline_mask, out_polyline_center);
+
+  CHECK_CUDA_ERROR(cudaFree(tmpPolyline));
+  CHECK_CUDA_ERROR(cudaFree(tmpPolylineMask));
+  CHECK_CUDA_ERROR(cudaFree(tmpDistance));
+
   return cudaGetLastError();
 }
 
@@ -194,7 +263,7 @@ cudaError_t polylinePreprocessLauncher(
 {
   // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess`
   constexpr int threadsPerBlock = 256;
-  const dim3 block3d(B, K / threadsPerBlock, P);
+  const dim3 block3d(B, (K - 1) / threadsPerBlock, P);
 
   transformPolylineKernel<<<block3d, threadsPerBlock, 0, stream>>>(
     K, P, PointDim, in_polyline, B, AgentDim, target_state, out_polyline, out_polyline_mask);
@@ -202,7 +271,7 @@ cudaError_t polylinePreprocessLauncher(
   setPreviousPositionKernel<<<block3d, threadsPerBlock, 0, stream>>>(
     B, K, P, PointDim, out_polyline_mask, out_polyline);
 
-  const dim3 block2d(B, K / threadsPerBlock);
+  const dim3 block2d(B, (K - 1) / threadsPerBlock);
   calculatePolylineCenterKernel<<<block2d, threadsPerBlock, 0, stream>>>(
     B, K, P, PointDim, out_polyline, out_polyline_mask, out_polyline_center);
 
diff --git a/src/mtr.cpp b/src/mtr.cpp
index f4f638a..8e83254 100644
--- a/src/mtr.cpp
+++ b/src/mtr.cpp
@@ -83,7 +83,6 @@ void TrtMTR::initCudaPtr(AgentData & agent_data, PolylineData & polyline_data)
     cuda::make_unique<float[]>(agent_data.TargetNum * config_.num_intention_point_cluster * 2);
   d_polyline_ = cuda::make_unique<float[]>(
     polyline_data.PolylineNum * polyline_data.PointNum * polyline_data.StateDim);
-  d_topk_index_ = cuda::make_unique<int[]>(config_.max_num_polyline);
 
   // preprocessed input
   const size_t inDim = agent_data.StateDim + agent_data.ClassNum + agent_data.TimeLength +
@@ -168,13 +167,12 @@ bool TrtMTR::preProcess(AgentData & agent_data, PolylineData & polyline_data)
     d_timestamps_.get(), d_trajectory_.get(), d_in_trajectory_.get(), d_in_trajectory_mask_.get(),
     d_in_last_pos_.get(), stream_));
 
-  // TODO
   if (config_.max_num_polyline < polyline_data.PolylineNum) {
     CHECK_CUDA_ERROR(polylinePreprocessWithTopkLauncher(
       polyline_data.PolylineNum, config_.max_num_polyline, polyline_data.PointNum,
       polyline_data.StateDim, d_polyline_.get(), agent_data.TargetNum, agent_data.StateDim,
-      d_target_state_.get(), config_.offset_xy[0], config_.offset_xy[1], d_topk_index_.get(),
-      d_in_polyline_.get(), d_in_polyline_mask_.get(), d_in_polyline_center_.get(), stream_));
+      d_target_state_.get(), d_in_polyline_.get(), d_in_polyline_mask_.get(),
+      d_in_polyline_center_.get(), stream_));
   } else {
     assert(
       ("The number of config.max_num_polyline and PolylineData.PolylineNum must be same",

From da1f893c057cbe11a22c1b8d86102ea011bbed75 Mon Sep 17 00:00:00 2001
From: ktro2828 <kotaro.uetake@tier4.jp>
Date: Fri, 19 Apr 2024 01:38:07 +0900
Subject: [PATCH 3/3] feat: update polyline preprocesses

Signed-off-by: ktro2828 <kotaro.uetake@tier4.jp>
---
 .../preprocess/polyline_preprocess_kernel.cuh | 117 +++++----
 .../preprocess/polyline_preprocess_kernel.cu  | 240 +++++++++---------
 src/mtr.cpp                                   |   2 +-
 3 files changed, 182 insertions(+), 177 deletions(-)

diff --git a/lib/include/preprocess/polyline_preprocess_kernel.cuh b/lib/include/preprocess/polyline_preprocess_kernel.cuh
index 265faea..cfab41d 100644
--- a/lib/include/preprocess/polyline_preprocess_kernel.cuh
+++ b/lib/include/preprocess/polyline_preprocess_kernel.cuh
@@ -16,67 +16,71 @@
 #define PREPROCESS__POLYLINE_PREPROCESS_KERNEL_CUH_
 
 /**
- * @brief Transform to target agent coordinates system.
+ * @brief Transform polylines to target agent coordinate system and extend its feature with previous
+ * x, y`(D->D+2)`.
+ *
+ * Points which all elements are 0.0 except of typeID are filled by 0.0 and the corresponding
+ * mask value is 0.0 too.
  *
  * @param K The number of polylines.
  * @param P The number of points contained in each polyline.
- * @param PointDim The number of point state dimensions.
- * @param in_polyline Source polylines, in shape [K*P*PointDim].
+ * @param PointDim The number of point dimensions, expecting (x, y, z, dx, dy, dz, typeID).
+ * @param inPolyline Source polyline, in shape [K*P*D].
  * @param B The number of target agents.
- * @param AgentDim The number of agent state dimensions.
- * @param target_state Source target state at the latest timestamp, in shape [B*AgentDim].
- * @param out_polyline Output polylines, in shape [B*K*P*(PointDim+2)].
- * @param out_polyline_mask Output polyline mask, in shape [B*K*P].
+ * @param AgentDim The number of agent state dimensions, expecting (x, y, z, length, width, height,
+ * yaw, vx, vy, ax, ay).
+ * @param targetState Source target agent states, in shape [B*AgentDim].
+ * @param outPolyline Output polyline, in shape [B*K*P*(PointDim+2)].
+ * @param outPolylineMask Output polyline mask, in shape [B*K*P].
  */
 __global__ void transformPolylineKernel(
-  const int K, const int P, const int PointDim, const float * in_polyline, const int B,
-  const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask);
+  const int K, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask);
 
 /**
- * @brief Set the previous xy position at the end of element.
+ * @brief Set the Previous Position Kernel object
  *
  * @param B The number of target agents.
  * @param K The number of polylines.
  * @param P The number of points contained in each polyline.
- * @param D The number of point dimensions.
- * @param mask The polyline mask, in shape [B*K*P].
- * @param polyline The container of polylines, in shape [B*K*P*D]
+ * @param D The number of point dimensions, expecting (x, y, ..., preX, preY).
+ * @param polyline Source polyline, in shape [B*K*P*D].
  */
 __global__ void setPreviousPositionKernel(
-  const int B, const int K, const int P, const int D, const bool * mask, float * polyline);
+  const int B, const int K, const int P, const int D, float * polyline);
 
 /**
- * @brief Calculate distances from targets to polylines.
+ * @brief Calculate center distance from target agent to each polyline.
  *
- * @note `inPolyline` must have been transformed to target coordinates system.
+ * @note Polyline must have been transformed to target agent coordinates system.
  *
  * @param B The number of target agents.
- * @param L The number of polylines.
+ * @param K The number of polylines.
  * @param P The number of points contained in each polyline.
- * @param AgentDim The number agent state dimensions.
- * @param targetState Source target agent state, in shape [B*AgentDim].
- * @param PointDim The number of point dimensions.
- * @param inPolyline Source polyline, in shape [B*L*P*PointDim].
- * @param inPolylineMask Source polyline mask, in shape [B*K*P].
- * @param outDistance Output distance, in shape [B*L].
+ * @param D The number of point dimensions, expecting [x, y, ...].
+ * @param polyline Source polyline, in shape [B*K*P*D].
+ * @param polylineMask Source polyline mask, in shape [B*K*P].
+ * @param distance Output calculated distances, in shape [B*K].
  */
 __global__ void calculateCenterDistanceKernel(
-  const int B, const int L, const int P, const int AgentDim, const float * targetState,
-  const int PointDim, const float * inPolyline, const bool * inPolylineMask, float * outDistance);
+  const int B, const int K, const int P, const int D, const float * polyline,
+  const bool * polylineMask, float * distance);
 
 /**
  * @brief Extract K polylines with the smallest distances.
  *
- * @note This kernel is implemented using shared memory.
+ * @note Because this kernel supposes to allocate shared memory dynamically it is necessary to
+ * specify `sizeof(float) * L` in the kernel execution configuration.
  *
- * @param K The number elements to be extracted (K <= L).
+ * @param K The number of polylines to be extracted.
  * @param B The number of target agents.
- * @param L The number of polylines.
+ * @param L The number of original polylines.
  * @param P The number of points contained in each polyline.
  * @param D The number of point dimensions.
  * @param inPolyline Source polyline, in shape [B*L*P*D].
  * @param inPolylineMask Source polyline mask, in shape [B*L*P].
- * @param inDistance Source polyline distances, in shape [B*L].
+ * @param inDistance Source distances from target agents to the centers of each polyline, in shape
+ * [B*L].
  * @param outPolyline Output polyline, in shape [B*K*P*D].
  * @param outPolylineMask Output polyline mask, in shape [B*K*P].
  */
@@ -86,43 +90,46 @@ __global__ void extractTopKPolylineKernel(
   bool * outPolylineMask);
 
 /**
- * @brief Calculate the magnitudes of polylines.
+ * @brief Calculate center positions of each polyline with respect to target agent coordinates
+ * system.
+ *
+ * @note Polyline must have been transformed to target agent coordinates system.
  *
  * @param B The number of target agents.
  * @param K The number of polylines.
  * @param P The number of points contained in each polyline.
- * @param D The number of point dimensions.
- * @param polyline Source polylines, in shape [B*K*P*(D + 2)].
- * @param mask Source polyline masks, in shape [B*K*P].
- * @param center Output magnitudes of polylines, in shape [B*K*3].
+ * @param D The number of point dimensions, expecting (x, y, z, ...).
+ * @param polyline Source polyline, in shape [B*K*P*D].
+ * @param polylineMask Source polyline mask, in shape [B*K*P].
+ * @param center Output centers, in shape [B*K*3].
  */
 __global__ void calculatePolylineCenterKernel(
-  const int B, const int K, const int P, const int PointDim, const float * polyline,
-  const bool * mask, float * center);
+  const int B, const int K, const int P, const int D, const float * polyline,
+  const bool * polylineMask, float * center);
 
 /**
  * @brief In cases of the number of batch polylines (L) is greater than K,
  *  extracts the topK elements.
  *
- * @param L The number of source polylines.
- * @param K The number of polylines expected as the model input.
+ * @param K The number of polylines to be extracted.
+ * @param L The number of original polylines.
  * @param P The number of points contained in each polyline.
  * @param PointDim The number of point state dimensions.
- * @param AgentDim The number of agent state dimensions.
- * @param in_polyline Source polylines, in shape [L*P*PointDim].
+ * @param inPolyline Source polylines, in shape [L*P*PointDim].
  * @param B The number of target agents.
- * @param target_state Target agent state at the latest timestamp, in shape [B, AgentDim].
- * @param out_polyline Output polylines, in shape [B*K*P*(PointDim+2)].
- * @param out_polyline_mask Output polyline masks, in shape [B*K*P].
- * @param out_polyline_center Output magnitudes of each polyline with respect to target coords,
+ * @param AgentDim The number of agent state dimensions.
+ * @param targetState Target agent state at the latest timestamp, in shape [B*AgentDim].
+ * @param outPolyline Output polylines, in shape [B*K*P*(PointDim+2)].
+ * @param outPolylineMask Output polyline masks, in shape [B*K*P].
+ * @param outPolylineCenter Output magnitudes of each polyline with respect to target coords,
  *  in shape [B*K*3].
  * @param stream CUDA stream.
  * @return cudaError_t
  */
 cudaError_t polylinePreprocessWithTopkLauncher(
-  const int L, const int K, const int P, const int PointDim, const float * in_polyline, const int B,
-  const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask,
-  float * out_polyline_center, cudaStream_t stream);
+  const int K, const int L, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask,
+  float * outPolylineCenter, cudaStream_t stream);
 
 /**
  * @brief Do preprocess for polyline if the number of batched polylines is K.
@@ -130,20 +137,20 @@ cudaError_t polylinePreprocessWithTopkLauncher(
  * @param K The number of polylines.
  * @param P The number of points contained in each polyline.
  * @param PointDim The number of point state dimensions.
- * @param in_polyline Source polylines, in shape [K*P*PointDim].
+ * @param inPolyline Source polylines, in shape [K*P*PointDim].
  * @param B The number of target agents.
  * @param AgentDim The number of agent state dimensions.
- * @param target_state Target agent state at the latest timestamp, in shape [B, AgentDim].
- * @param out_polyline Output polylines, in shape [B*K*P*(PointDim + 2)].
- * @param out_polyline_mask Output polyline masks, in shape [B*K*P].
- * @param out_polyline_center Output magnitudes of each polyline with respect to target coords,
+ * @param targetState Target agent state at the latest timestamp, in shape [B*AgentDim].
+ * @param outPolyline Output polylines, in shape [B*K*P*(PointDim+2)].
+ * @param outPolylineMask Output polyline masks, in shape [B*K*P].
+ * @param outPolylineCenter Output magnitudes of each polyline with respect to target coords,
  *  in shape [B*K*3].
  * @param stream CUDA stream.
  * @return cudaError_t
  */
 cudaError_t polylinePreprocessLauncher(
-  const int K, const int P, const int PointDim, const float * in_polyline, const int B,
-  const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask,
-  float * out_polyline_center, cudaStream_t stream);
+  const int K, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask,
+  float * outPolylineCenter, cudaStream_t stream);
 
 #endif  // PREPROCESS__POLYLINE_PREPROCESS_KERNEL_CUH_
diff --git a/lib/src/preprocess/polyline_preprocess_kernel.cu b/lib/src/preprocess/polyline_preprocess_kernel.cu
index c66e5de..62e251b 100644
--- a/lib/src/preprocess/polyline_preprocess_kernel.cu
+++ b/lib/src/preprocess/polyline_preprocess_kernel.cu
@@ -20,8 +20,8 @@
 #include <iostream>
 
 __global__ void transformPolylineKernel(
-  const int K, const int P, const int PointDim, const float * in_polyline, const int B,
-  const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask)
+  const int K, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask)
 {
   int b = blockIdx.x * blockDim.x + threadIdx.x;
   int k = blockIdx.y * blockDim.y + threadIdx.y;
@@ -31,52 +31,59 @@ __global__ void transformPolylineKernel(
     return;
   }
 
-  const int src_polyline_idx = (k * P + p) * PointDim;
-  const float x = in_polyline[src_polyline_idx];
-  const float y = in_polyline[src_polyline_idx + 1];
-  const float z = in_polyline[src_polyline_idx + 2];
-  const float dx = in_polyline[src_polyline_idx + 3];
-  const float dy = in_polyline[src_polyline_idx + 4];
-  const float dz = in_polyline[src_polyline_idx + 5];
-  const float type_id = in_polyline[src_polyline_idx + 6];
-
-  const int center_idx = b * AgentDim;
-  const float center_x = target_state[center_idx];
-  const float center_y = target_state[center_idx + 1];
-  const float center_z = target_state[center_idx + 2];
-  const float center_yaw = target_state[center_idx + 6];
-  const float center_cos = cos(center_yaw);
-  const float center_sin = sin(center_yaw);
-
-  // do transform
-  const float trans_x = center_cos * (x - center_x) - center_sin * (y - center_y);
-  const float trans_y = center_sin * (x - center_x) + center_cos * (y - center_y);
-  const float trans_z = z - center_z;
-  const float trans_dx = center_cos * dx - center_sin * dy;
-  const float trans_dy = center_sin * dx + center_cos * dy;
-  const float trans_dz = dz;
-
-  const int out_idx = (b * K * P + k * P + p) * (PointDim + 2);
-  out_polyline[out_idx] = trans_x;
-  out_polyline[out_idx + 1] = trans_y;
-  out_polyline[out_idx + 2] = trans_z;
-  out_polyline[out_idx + 3] = trans_dx;
-  out_polyline[out_idx + 4] = trans_dy;
-  out_polyline[out_idx + 5] = trans_dz;
-  out_polyline[out_idx + 6] = type_id;
-
-  bool is_valid = false;
-  for (size_t i = 0; i < PointDim - 1; ++i) {
-    if (out_polyline[out_idx + i] != 0.0f) {
-      is_valid = true;
+  const int inIdx = (k * P + p) * PointDim;
+  const int outIdx = b * K * P + k * P + p;
+  bool isValid = false;
+  for (int d = 0; d < PointDim - 1; ++d) {
+    if (inPolyline[inIdx + d] != 0.0f) {
+      isValid = true;
     }
   }
-  int out_mask_idx = b * K * P + k * P + p;
-  out_polyline_mask[out_mask_idx] = is_valid;
+  outPolylineMask[outIdx] = isValid;
+
+  // initialize output polyline with 0.0
+  for (int d = 0; d < PointDim + 2; ++d) {
+    outPolyline[outIdx * (PointDim + 2) + d] = 0.0f;
+  }
+
+  // set transformed values if valid, otherwise all 0.0.
+  if (isValid) {
+    const float x = inPolyline[inIdx];
+    const float y = inPolyline[inIdx + 1];
+    const float z = inPolyline[inIdx + 2];
+    const float dx = inPolyline[inIdx + 3];
+    const float dy = inPolyline[inIdx + 4];
+    const float dz = inPolyline[inIdx + 5];
+    const float typeID = inPolyline[inIdx + 6];
+
+    const int centerIdx = b * AgentDim;
+    const float centerX = targetState[centerIdx];
+    const float centerY = targetState[centerIdx + 1];
+    const float centerZ = targetState[centerIdx + 2];
+    const float centerYaw = targetState[centerIdx + 6];
+    const float centerCos = cosf(centerYaw);
+    const float centerSin = sinf(centerYaw);
+
+    // do transform
+    const float transX = centerCos * (x - centerX) - centerSin * (y - centerY);
+    const float transY = centerSin * (x - centerX) + centerCos * (y - centerY);
+    const float transZ = z - centerZ;
+    const float transDx = centerCos * dx - centerSin * dy;
+    const float transDy = centerSin * dx + centerCos * dy;
+    const float transDz = dz;
+
+    outPolyline[outIdx * (PointDim + 2)] = transX;
+    outPolyline[outIdx * (PointDim + 2) + 1] = transY;
+    outPolyline[outIdx * (PointDim + 2) + 2] = transZ;
+    outPolyline[outIdx * (PointDim + 2) + 3] = transDx;
+    outPolyline[outIdx * (PointDim + 2) + 4] = transDy;
+    outPolyline[outIdx * (PointDim + 2) + 5] = transDz;
+    outPolyline[outIdx * (PointDim + 2) + 6] = typeID;
+  }
 }
 
 __global__ void setPreviousPositionKernel(
-  const int B, const int K, const int P, const int D, const bool * mask, float * polyline)
+  const int B, const int K, const int P, const int D, float * polyline)
 {
   int b = blockIdx.x * blockDim.x + threadIdx.x;
   int k = blockIdx.y * blockDim.y + threadIdx.y;
@@ -86,27 +93,20 @@ __global__ void setPreviousPositionKernel(
     return;
   }
 
-  const int cur_idx = (b * K * P + k * P + p) * D;
-  const int pre_idx = k == 0 ? cur_idx : (b * K * P + (k - 1) * P + p) * D;
-
-  polyline[cur_idx + D - 2] = polyline[pre_idx];
-  polyline[cur_idx + D - 1] = polyline[pre_idx + 1];
+  const int curIdx = (b * K * P + k * P + p) * D;
+  const int preIdx = p == 0 ? curIdx : (b * K * P + k * P + p - 1) * D;
 
-  const int mask_idx = b * K * P + k * P + p;
-  if (!mask[mask_idx]) {
-    for (int d = 0; d < D; ++d) {
-      polyline[cur_idx + d] = 0.0f;
-    }
-  }
+  polyline[curIdx + D - 2] = polyline[preIdx];      // x
+  polyline[curIdx + D - 1] = polyline[preIdx + 1];  // y
 }
 
 __global__ void calculateCenterDistanceKernel(
-  const int B, const int L, const int P, const int AgentDim, const float * targetState,
-  const int PointDim, const float * inPolyline, const bool * inPolylineMask, float * outDistance)
+  const int B, const int K, const int P, const int D, const float * polyline,
+  const bool * polylineMask, float * distance)
 {
   int b = blockIdx.x * blockDim.x + threadIdx.x;
-  int l = blockIdx.y * blockDim.y + threadIdx.y;
-  if (b >= B || l >= L) {
+  int k = blockIdx.y * blockDim.y + threadIdx.y;
+  if (b >= B || k >= K) {
     return;
   }
 
@@ -114,19 +114,17 @@ __global__ void calculateCenterDistanceKernel(
   float sumX = 0.0f, sumY = 0.0f;
   int numValid = 0;
   for (int p = 0; p < P; ++p) {
-    int idx = b * L * P + l * P + p;
-    float x = inPolyline[idx * PointDim];
-    float y = inPolyline[idx * PointDim + 1];
-    if (inPolylineMask[idx]) {
-      sumX += x;
-      sumY += y;
+    int idx = b * K * P + k * P + p;
+    if (polylineMask[idx]) {
+      sumX += polyline[idx * D];
+      sumY += polyline[idx * D + 1];
       ++numValid;
     }
   }
   float centerX = sumX / fmaxf(1.0f, numValid);
   float centerY = sumY / fmaxf(1.0f, numValid);
 
-  outDistance[b * L + l] = sqrtf(powf(centerX, 2) + powf(centerY, 2));
+  distance[b * K + k] = hypot(centerX, centerY);
 }
 
 __global__ void extractTopKPolylineKernel(
@@ -134,11 +132,13 @@ __global__ void extractTopKPolylineKernel(
   const bool * inPolylineMask, const float * inDistance, float * outPolyline,
   bool * outPolylineMask)
 {
-  int b = blockIdx.x;  // Batch index
+  int b = blockIdx.x;                             // Batch index
+  int p = blockIdx.y * blockDim.y + threadIdx.y;  // Point index
+  int d = blockIdx.z * blockDim.z + threadIdx.z;  // Dim index
   extern __shared__ float distances[];
 
   // Load distances into shared memory
-  int tid = threadIdx.x;
+  int tid = threadIdx.x;  // Polyline index
   if (tid < L) {
     distances[tid] = inDistance[b * L + tid];
   }
@@ -156,29 +156,22 @@ __global__ void extractTopKPolylineKernel(
         minIndex = l;
       }
     }
+    __syncthreads();
 
     if (tid == k) {  // this thread will handle copying the k-th smallest polyline
-      for (int p = 0; p < P; p++) {
-        for (int d = 0; d < D; d++) {
-          outPolyline[b * K * P * D + k * P * D + p * D + d] =
-            inPolyline[b * L * P * D + minIndex * P * D + p * D + d];
-        }
-        outPolylineMask[b * K * P + k * P + p] = inPolylineMask[b * L * P + minIndex * P + p];
-      }
+      int inIdx = b * L * P + minIndex * P + p;
+      int outIdx = b * K * P + k * P + p;
+      outPolyline[outIdx * D + d] = inPolyline[inIdx * D + d];
+      outPolylineMask[outIdx] = inPolylineMask[inIdx];
     }
     distances[minIndex] = FLT_MAX;  // exclude this index from future consideration
   }
 }
 
 __global__ void calculatePolylineCenterKernel(
-  const int B, const int K, const int P, const int PointDim, const float * polyline,
-  const bool * mask, float * center)
+  const int B, const int K, const int P, const int D, const float * polyline,
+  const bool * polylineMask, float * center)
 {
-  // --- pseudo code ---
-  // sum = (polylines[:, :, :, 0:3] * mask[:, :, :, None]).sum(dim=2)
-  // center = sum / clampMIN(mask.sum(dim=2), min=1.0)
-  // -------------------
-
   int b = blockIdx.x * blockDim.x + threadIdx.x;
   int k = blockIdx.y * blockDim.y + threadIdx.y;
 
@@ -187,67 +180,70 @@ __global__ void calculatePolylineCenterKernel(
   }
 
   // initialize with 0.0
-  int center_idx = (b * K + k) * 3;
+  int centerIdx = (b * K + k) * 3;
   for (int d = 0; d < 3; ++d) {
-    center[center_idx + d] = 0.0f;
+    center[centerIdx + d] = 0.0f;
   }
 
-  float sum_xyz[3] = {0.0f, 0.0f, 0.0f};
-  int count = 0;
+  // calculate polyline center
+  float sumX = 0.0f, sumY = 0.0f, sumZ = 0.0f;
+  int numValid = 0;
   for (int p = 0; p < P; ++p) {
-    int src_idx = b * K * P + k * P + p;
-    if (mask[src_idx]) {
-      for (int d = 0; d < 3; ++d) {
-        sum_xyz[d] += polyline[src_idx * PointDim + d];
-      }
-      ++count;
+    int idx = b * K * P + k * P + p;
+    if (polylineMask[idx]) {
+      sumX += polyline[idx * D];
+      sumY += polyline[idx * D + 1];
+      sumZ += polyline[idx * D + 2];
+      ++numValid;
     }
   }
-  count = max(count, 1);
 
-  for (int d = 0; d < 3; ++d) {
-    center[center_idx + d] = sum_xyz[d] / static_cast<float>(count);
-  }
+  center[centerIdx] = sumX / fmaxf(1.0f, numValid);
+  center[centerIdx + 1] = sumY / fmaxf(1.0f, numValid);
+  center[centerIdx + 2] = sumZ / fmaxf(1.0f, numValid);
 }
 
 cudaError_t polylinePreprocessWithTopkLauncher(
-  const int L, const int K, const int P, const int PointDim, const float * in_polyline, const int B,
-  const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask,
-  float * out_polyline_center, cudaStream_t stream)
+  const int K, const int L, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask,
+  float * outPolylineCenter, cudaStream_t stream)
 {
   if (L < K) {
     std::cerr << "L must be greater than K, but got L: " << L << ", K: " << K << std::endl;
     return cudaError_t::cudaErrorInvalidValue;
   }
 
+  const int outPointDim = PointDim + 2;
+
   float *tmpPolyline, *tmpDistance;
   bool * tmpPolylineMask;
-  CHECK_CUDA_ERROR(cudaMallocAsync(&tmpPolyline, sizeof(float) * B * L * P * PointDim, stream));
+  CHECK_CUDA_ERROR(cudaMallocAsync(&tmpPolyline, sizeof(float) * B * L * P * outPointDim, stream));
   CHECK_CUDA_ERROR(cudaMallocAsync(&tmpPolylineMask, sizeof(bool) * B * L * P, stream));
   CHECK_CUDA_ERROR(cudaMallocAsync(&tmpDistance, sizeof(float) * B * L, stream));
 
   // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess`
   constexpr int threadsPerBlock = 256;
-  const dim3 block3dl(B, L / threadsPerBlock, P);
-  transformPolylineKernel<<<block3dl, threadsPerBlock, 0, stream>>>(
-    L, P, PointDim, in_polyline, B, AgentDim, target_state, tmpPolyline, tmpPolylineMask);
+  const dim3 blocks1(B, L / threadsPerBlock, P);
+  transformPolylineKernel<<<blocks1, threadsPerBlock, 0, stream>>>(
+    L, P, PointDim, inPolyline, B, AgentDim, targetState, tmpPolyline, tmpPolylineMask);
 
-  const dim3 block2dl(B, L / threadsPerBlock);
-  calculateCenterDistanceKernel<<<block2dl, threadsPerBlock, 0, stream>>>(
-    B, L, P, AgentDim, target_state, PointDim, tmpPolyline, tmpPolylineMask, tmpDistance);
+  const dim3 blocks2(B, L / threadsPerBlock);
+  calculateCenterDistanceKernel<<<blocks2, threadsPerBlock, 0, stream>>>(
+    B, L, P, outPointDim, tmpPolyline, tmpPolylineMask, tmpDistance);
 
-  const size_t sharedMemSize = sizeof(float) * K;
-  extractTopKPolylineKernel<<<B, threadsPerBlock, sharedMemSize, stream>>>(
-    K, B, L, P, PointDim, tmpPolyline, tmpPolylineMask, tmpDistance, out_polyline,
-    out_polyline_mask);
+  const dim3 blocks3(B, P, outPointDim);
+  const size_t sharedMemSize = sizeof(float) * L;
+  extractTopKPolylineKernel<<<blocks3, threadsPerBlock, sharedMemSize, stream>>>(
+    K, B, L, P, outPointDim, tmpPolyline, tmpPolylineMask, tmpDistance, outPolyline,
+    outPolylineMask);
 
-  const dim3 block3dk(B, K / threadsPerBlock, P);
-  setPreviousPositionKernel<<<block3dk, threadsPerBlock, 0, stream>>>(
-    B, K, P, PointDim, out_polyline_mask, out_polyline);
+  const dim3 blocks4(B, K / threadsPerBlock, P);
+  setPreviousPositionKernel<<<blocks4, threadsPerBlock, 0, stream>>>(
+    B, K, P, outPointDim, outPolyline);
 
-  const dim3 block2dk(B, K / threadsPerBlock);
-  calculatePolylineCenterKernel<<<block2dk, threadsPerBlock, 0, stream>>>(
-    B, K, P, PointDim, out_polyline, out_polyline_mask, out_polyline_center);
+  const dim3 blocks5(B, K / threadsPerBlock);
+  calculatePolylineCenterKernel<<<blocks5, threadsPerBlock, 0, stream>>>(
+    B, K, P, outPointDim, outPolyline, outPolylineMask, outPolylineCenter);
 
   CHECK_CUDA_ERROR(cudaFree(tmpPolyline));
   CHECK_CUDA_ERROR(cudaFree(tmpPolylineMask));
@@ -257,23 +253,25 @@ cudaError_t polylinePreprocessWithTopkLauncher(
 }
 
 cudaError_t polylinePreprocessLauncher(
-  const int K, const int P, const int PointDim, const float * in_polyline, const int B,
-  const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask,
-  float * out_polyline_center, cudaStream_t stream)
+  const int K, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask,
+  float * outPolylineCenter, cudaStream_t stream)
 {
+  const int outPointDim = PointDim + 2;
+
   // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess`
   constexpr int threadsPerBlock = 256;
   const dim3 block3d(B, (K - 1) / threadsPerBlock, P);
 
   transformPolylineKernel<<<block3d, threadsPerBlock, 0, stream>>>(
-    K, P, PointDim, in_polyline, B, AgentDim, target_state, out_polyline, out_polyline_mask);
+    K, P, PointDim, inPolyline, B, AgentDim, targetState, outPolyline, outPolylineMask);
 
   setPreviousPositionKernel<<<block3d, threadsPerBlock, 0, stream>>>(
-    B, K, P, PointDim, out_polyline_mask, out_polyline);
+    B, K, P, outPointDim, outPolyline);
 
   const dim3 block2d(B, (K - 1) / threadsPerBlock);
   calculatePolylineCenterKernel<<<block2d, threadsPerBlock, 0, stream>>>(
-    B, K, P, PointDim, out_polyline, out_polyline_mask, out_polyline_center);
+    B, K, P, outPointDim, outPolyline, outPolylineMask, outPolylineCenter);
 
   return cudaGetLastError();
 }
diff --git a/src/mtr.cpp b/src/mtr.cpp
index 8e83254..13e641d 100644
--- a/src/mtr.cpp
+++ b/src/mtr.cpp
@@ -169,7 +169,7 @@ bool TrtMTR::preProcess(AgentData & agent_data, PolylineData & polyline_data)
 
   if (config_.max_num_polyline < polyline_data.PolylineNum) {
     CHECK_CUDA_ERROR(polylinePreprocessWithTopkLauncher(
-      polyline_data.PolylineNum, config_.max_num_polyline, polyline_data.PointNum,
+      config_.max_num_polyline, polyline_data.PolylineNum, polyline_data.PointNum,
       polyline_data.StateDim, d_polyline_.get(), agent_data.TargetNum, agent_data.StateDim,
       d_target_state_.get(), d_in_polyline_.get(), d_in_polyline_mask_.get(),
       d_in_polyline_center_.get(), stream_));