
Issue/213: Add CPU/CUDA implementations of the conv operator #218


Open

Graylatzhou wants to merge 21 commits into base: main

Conversation

Graylatzhou
Contributor


@Graylatzhou force-pushed the main branch 6 times, most recently from ca9e24b to d52e116 (May 13, 2025 18:47)
@Graylatzhou force-pushed the main branch 3 times, most recently from 2605926 to 62ff480 (May 14, 2025 17:28)
@Graylatzhou force-pushed the main branch 2 times, most recently from 7f0dc8a to 665e041 (May 15, 2025 16:19)
Comment on lines 233 to 303
template <typename Tdata>
infiniStatus_t conv_cpu(
    const ConvInfo &info,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *w,
    const void *bias) {
    auto y_ptr = reinterpret_cast<Tdata *>(y);
    auto x_ptr = reinterpret_cast<const Tdata *>(x);
    auto w_ptr = reinterpret_cast<const Tdata *>(w);
    if constexpr (std::is_same<Tdata, float>::value) {
        std::fill(y_ptr, y_ptr + calculateOutputSize(info), 0.0f);
    } else if constexpr (std::is_same<Tdata, fp16_t>::value) {
        fp16_t zero_val = utils::cast<fp16_t>(0.0f);
        std::fill(y_ptr, y_ptr + calculateOutputSize(info), zero_val);
    } else {
        std::fill(y_ptr, y_ptr + calculateOutputSize(info), static_cast<Tdata>(0));
    }
    _conv_cpu<Tdata, Tdata>(info, workspace, workspace_size, y_ptr, x_ptr, w_ptr);
    if (bias != nullptr) {
        auto bias_ptr = reinterpret_cast<const Tdata *>(bias);
#pragma omp parallel for
        for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(calculateOutputSize(info)); ++i) {
            size_t channel_idx = (i / info.spatial_sizes) % info.out_channels;
            y_ptr[i] += bias_ptr[channel_idx];
        }
    }
    return INFINI_STATUS_SUCCESS;
}

template <>
infiniStatus_t conv_cpu<fp16_t>(
    const ConvInfo &info,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *w,
    const void *bias) {
    auto y_float = reinterpret_cast<float *>(workspace);
    auto x_half = reinterpret_cast<const fp16_t *>(x);
    auto w_half = reinterpret_cast<const fp16_t *>(w);

    std::fill(y_float, y_float + calculateOutputSize(info), 0.0f);

    void *conv_workspace = y_float + calculateOutputSize(info);
    size_t conv_workspace_size = workspace_size - calculateOutputSize(info) * sizeof(float);

    _conv_cpu<fp16_t, float>(info, conv_workspace, conv_workspace_size, y_float, x_half, w_half);

    auto y_half = reinterpret_cast<fp16_t *>(y);
    if (bias != nullptr) {
        auto bias_half = reinterpret_cast<const fp16_t *>(bias);
#pragma omp parallel for
        for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(calculateOutputSize(info)); ++i) {
            size_t channel_idx = (i / info.spatial_sizes) % info.out_channels;
            float bias_value = utils::cast<float>(bias_half[channel_idx]);
            y_float[i] += bias_value;
            y_half[i] = utils::cast<fp16_t>(y_float[i]);
        }
    } else {
#pragma omp parallel for
        for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(calculateOutputSize(info)); ++i) {
            y_half[i] = utils::cast<fp16_t>(y_float[i]);
        }
    }

    return INFINI_STATUS_SUCCESS;
}
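For context on the bias loops above: the output is stored contiguously as (batch, out_channels, spatial...), so the channel of a flat index i is recovered with (i / info.spatial_sizes) % info.out_channels. A small worked example with assumed sizes:

// Assumed sizes: batch = 2, out_channels = 3, spatial_sizes = 4,
// so the flat layout is [n][c][s] with strides (12, 4, 1).
// For flat index i = 17:
//   17 / spatial_sizes = 17 / 4 = 4   // 4th (n, c) plane overall
//   4 % out_channels   = 4 % 3  = 1   // it belongs to channel 1
// so bias[1] is added to y[17].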
Collaborator

  1. Store the result of calculateOutputSize(info) in a local variable in each function instead of calling the function repeatedly;
  2. Since there is an fp16_t specialization below, the generic conv_cpu above no longer needs to handle the fp16_t case (a sketch combining both points follows this list).
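A minimal sketch of what the two suggestions combined could look like (illustrative only; it assumes the conv_cpu template shown above and leaves everything else unchanged):

template <typename Tdata>
infiniStatus_t conv_cpu(
    const ConvInfo &info,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *w,
    const void *bias) {
    auto y_ptr = reinterpret_cast<Tdata *>(y);
    auto x_ptr = reinterpret_cast<const Tdata *>(x);
    auto w_ptr = reinterpret_cast<const Tdata *>(w);
    // cache the output element count once instead of re-evaluating it
    // in the fill, the loop bound, and the bias loop
    const size_t output_size = calculateOutputSize(info);
    // fp16_t is handled by the explicit specialization below, so the
    // generic template only needs the plain zero-initialization path
    std::fill(y_ptr, y_ptr + output_size, static_cast<Tdata>(0));
    _conv_cpu<Tdata, Tdata>(info, workspace, workspace_size, y_ptr, x_ptr, w_ptr);
    if (bias != nullptr) {
        auto bias_ptr = reinterpret_cast<const Tdata *>(bias);
#pragma omp parallel for
        for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(output_size); ++i) {
            size_t channel_idx = (i / info.spatial_sizes) % info.out_channels;
            y_ptr[i] += bias_ptr[channel_idx];
        }
    }
    return INFINI_STATUS_SUCCESS;
}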

Contributor Author

Fixed.

Comment on lines 196 to 230
void _conv_cpu(
    const ConvInfo &info,
    void *workspace,
    size_t workspace_size,
    Ydata *y,
    const Xdata *x,
    const Xdata *w) {
    if (needsPadding(info)) {
        auto padded_x = reinterpret_cast<Xdata *>(workspace);
        std::vector<size_t> padded_shape(info.ndim + 2);
        padded_shape[0] = info.batch;
        padded_shape[1] = info.in_channels;
        for (size_t i = 0; i < info.ndim; ++i) {
            padded_shape[i + 2] = info.input_dims[i] + 2 * info.pads_info[i];
        }
        if constexpr (std::is_same<Xdata, fp16_t>::value) {
            fp16_t zero_val = utils::cast<fp16_t>(0.0f);
            std::fill(padded_x, padded_x + calculatePaddedInputSize(info), zero_val);
        } else if constexpr (std::is_same<Xdata, float>::value) {
            std::fill(padded_x, padded_x + calculatePaddedInputSize(info), 0.0f);
        } else {
            std::fill(padded_x, padded_x + calculatePaddedInputSize(info), static_cast<Xdata>(0));
        }
        fillPaddedInput(info, x, padded_shape.data(), padded_x, 0, 0, 0);

        applyConv(info, y, padded_x, w, padded_shape.data());
    } else {
        std::vector<size_t> shape(info.ndim + 2);
        shape[0] = info.batch;
        shape[1] = info.in_channels;
        for (size_t i = 0; i < info.ndim; ++i) {
            shape[i + 2] = info.input_dims[i];
        }
        applyConv(info, y, x, w, shape.data());
    }
Collaborator

padded_shape can be computed at descriptor-creation time and passed in through info, so it doesn't have to be recomputed during execution.
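One possible shape of that change (the padded_shape member on ConvInfo is an assumption here, not the PR's actual field):

// At descriptor-creation time (wherever ConvInfo is built), derive the
// padded input shape once from input_dims and pads_info:
std::vector<size_t> padded_shape(ndim + 2);
padded_shape[0] = batch;
padded_shape[1] = in_channels;
for (size_t i = 0; i < ndim; ++i) {
    padded_shape[i + 2] = input_dims[i] + 2 * pads_info[i];
}
info.padded_shape = std::move(padded_shape); // hypothetical ConvInfo member

// _conv_cpu then only reads it on the hot path:
// applyConv(info, y, padded_x, w, info.padded_shape.data());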

Contributor Author

Fixed.

    const Xdata *x,
    const Xdata *w,
    const size_t *x_shape) {
#pragma omp parallel for
Collaborator

#pragma omp parallel for collapse(2) schedule(dynamic)
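For reference, collapse(2) requires the two outermost loops to be perfectly nested; a loop structure like the following (illustrative only, not the actual applyConv body) would qualify:

#pragma omp parallel for collapse(2) schedule(dynamic)
for (ptrdiff_t n = 0; n < static_cast<ptrdiff_t>(info.batch); ++n) {
    for (ptrdiff_t oc = 0; oc < static_cast<ptrdiff_t>(info.out_channels); ++oc) {
        // one (batch, output-channel) slice per task; dynamic scheduling
        // evens out the load when per-slice work varies
        computeOutputSlice(info, y, x, w, n, oc); // hypothetical helper
    }
}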

CHECK_RESULT(result);
size_t workspace_size = result->handler->workspace_size;
*desc_ptr = new Descriptor(
    dtype, result.take(), workspace_size,
Collaborator

(line 38) Use std::move here.
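Illustration of the pattern being suggested (the exact line being pointed at is not shown here; this is only the general shape of moving the constructed info into the Descriptor instead of copying it):

auto info = result.take();
*desc_ptr = new Descriptor(
    dtype, std::move(info), workspace_size,
    /* ... remaining arguments unchanged ... */);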

Contributor Author

Fixed.

Collaborator

This can't really be designed this way:

  1. ConvInfo itself is mostly fine, but it is recommended to use a single meta mem pool and fetch each meta field (i.e., input_dims, kernel_dims, etc.) via an offset; see the ElementwiseInfo design for reference.
  2. Remove CudnnConvHandler. This cuda-specific structure belongs inside the cuda backend's Opaque. Opaque is hidden from the outside, and each backend platform can do its own design inside it; the explanation of this design in gemm.h is worth reading in detail. ConvInfo should contain only generic attributes and information, and should not hold structures like CudnnConvHandler (even behind conditional compilation); that is simply not good design. A rough sketch follows this list.
  3. InfiniCore does not yet have an instance of the design mentioned in point 2 (nothing uses cudnn yet), but the concrete conv implementation (cudnn calls, checks, etc.) can follow the operators 123 PR mentioned earlier.
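A rough sketch of where the cudnn-specific state could live under that design (all names here are assumptions; the real layout should follow gemm.h and the operators PR referenced above):

// cuda backend source, hidden behind the Descriptor's Opaque pointer
struct Descriptor::Opaque {
    // cudnn-specific handles stay inside the cuda backend, so ConvInfo
    // keeps only generic, platform-neutral metadata
    cudnnTensorDescriptor_t x_desc = nullptr;
    cudnnTensorDescriptor_t y_desc = nullptr;
    cudnnFilterDescriptor_t w_desc = nullptr;
    cudnnConvolutionDescriptor_t conv_desc = nullptr;

    ~Opaque() {
        if (x_desc) { cudnnDestroyTensorDescriptor(x_desc); }
        if (y_desc) { cudnnDestroyTensorDescriptor(y_desc); }
        if (w_desc) { cudnnDestroyFilterDescriptor(w_desc); }
        if (conv_desc) { cudnnDestroyConvolutionDescriptor(conv_desc); }
    }
};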

Contributor Author

ConvInfo has been changed to fetch its meta through a meta mem pool with offsets.

Contributor Author

The cudnn handle is now managed through Opaque.

@Graylatzhou force-pushed the main branch 4 times, most recently from 96eaa5b to 9038baa (May 29, 2025 08:54)