Issue/213 Add conv operator CPU/CUDA implementation #218
base: main
Conversation
Graylatzhou commented on May 13, 2025
template <typename Tdata>
infiniStatus_t conv_cpu(
    const ConvInfo &info,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *w,
    const void *bias) {
    auto y_ptr = reinterpret_cast<Tdata *>(y);
    auto x_ptr = reinterpret_cast<const Tdata *>(x);
    auto w_ptr = reinterpret_cast<const Tdata *>(w);
    if constexpr (std::is_same<Tdata, float>::value) {
        std::fill(y_ptr, y_ptr + calculateOutputSize(info), 0.0f);
    } else if constexpr (std::is_same<Tdata, fp16_t>::value) {
        fp16_t zero_val = utils::cast<fp16_t>(0.0f);
        std::fill(y_ptr, y_ptr + calculateOutputSize(info), zero_val);
    } else {
        std::fill(y_ptr, y_ptr + calculateOutputSize(info), static_cast<Tdata>(0));
    }
    _conv_cpu<Tdata, Tdata>(info, workspace, workspace_size, y_ptr, x_ptr, w_ptr);
    if (bias != nullptr) {
        auto bias_ptr = reinterpret_cast<const Tdata *>(bias);
#pragma omp parallel for
        for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(calculateOutputSize(info)); ++i) {
            size_t channel_idx = (i / info.spatial_sizes) % info.out_channels;
            y_ptr[i] += bias_ptr[channel_idx];
        }
    }
    return INFINI_STATUS_SUCCESS;
}
template <>
infiniStatus_t conv_cpu<fp16_t>(
    const ConvInfo &info,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *w,
    const void *bias) {
    auto y_float = reinterpret_cast<float *>(workspace);
    auto x_half = reinterpret_cast<const fp16_t *>(x);
    auto w_half = reinterpret_cast<const fp16_t *>(w);

    std::fill(y_float, y_float + calculateOutputSize(info), 0.0f);

    void *conv_workspace = y_float + calculateOutputSize(info);
    size_t conv_workspace_size = workspace_size - calculateOutputSize(info) * sizeof(float);

    _conv_cpu<fp16_t, float>(info, conv_workspace, conv_workspace_size, y_float, x_half, w_half);

    auto y_half = reinterpret_cast<fp16_t *>(y);
    if (bias != nullptr) {
        auto bias_half = reinterpret_cast<const fp16_t *>(bias);
#pragma omp parallel for
        for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(calculateOutputSize(info)); ++i) {
            size_t channel_idx = (i / info.spatial_sizes) % info.out_channels;
            float bias_value = utils::cast<float>(bias_half[channel_idx]);
            y_float[i] += bias_value;
            y_half[i] = utils::cast<fp16_t>(y_float[i]);
        }
    } else {
#pragma omp parallel for
        for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(calculateOutputSize(info)); ++i) {
            y_half[i] = utils::cast<fp16_t>(y_float[i]);
        }
    }

    return INFINI_STATUS_SUCCESS;
}
- Store the result of calculateOutputSize(info) in a local variable in each function instead of recomputing it on every call;
- since fp16_t already has an explicit specialization below, the generic conv_cpu above no longer needs to handle the fp16_t case.
Fixed.
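For reference, a minimal sketch of the generic conv_cpu with both suggestions applied (output size cached once, fp16_t branch dropped; the variable name y_size is illustrative, not taken from the PR):

template <typename Tdata>
infiniStatus_t conv_cpu(
    const ConvInfo &info,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *w,
    const void *bias) {
    // Compute the output element count once and reuse it everywhere
    const size_t y_size = calculateOutputSize(info);
    auto y_ptr = reinterpret_cast<Tdata *>(y);
    // fp16_t is handled by the explicit specialization below, so the
    // generic path only needs a plain zero fill
    std::fill(y_ptr, y_ptr + y_size, static_cast<Tdata>(0));
    _conv_cpu<Tdata, Tdata>(info, workspace, workspace_size, y_ptr,
                            reinterpret_cast<const Tdata *>(x),
                            reinterpret_cast<const Tdata *>(w));
    if (bias != nullptr) {
        auto bias_ptr = reinterpret_cast<const Tdata *>(bias);
#pragma omp parallel for
        for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(y_size); ++i) {
            y_ptr[i] += bias_ptr[(i / info.spatial_sizes) % info.out_channels];
        }
    }
    return INFINI_STATUS_SUCCESS;
}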
template <typename Xdata, typename Ydata>
void _conv_cpu(
    const ConvInfo &info,
    void *workspace,
    size_t workspace_size,
    Ydata *y,
    const Xdata *x,
    const Xdata *w) {
    if (needsPadding(info)) {
        auto padded_x = reinterpret_cast<Xdata *>(workspace);
        std::vector<size_t> padded_shape(info.ndim + 2);
        padded_shape[0] = info.batch;
        padded_shape[1] = info.in_channels;
        for (size_t i = 0; i < info.ndim; ++i) {
            padded_shape[i + 2] = info.input_dims[i] + 2 * info.pads_info[i];
        }
        if constexpr (std::is_same<Xdata, fp16_t>::value) {
            fp16_t zero_val = utils::cast<fp16_t>(0.0f);
            std::fill(padded_x, padded_x + calculatePaddedInputSize(info), zero_val);
        } else if constexpr (std::is_same<Xdata, float>::value) {
            std::fill(padded_x, padded_x + calculatePaddedInputSize(info), 0.0f);
        } else {
            std::fill(padded_x, padded_x + calculatePaddedInputSize(info), static_cast<Xdata>(0));
        }
        fillPaddedInput(info, x, padded_shape.data(), padded_x, 0, 0, 0);

        applyConv(info, y, padded_x, w, padded_shape.data());
    } else {
        std::vector<size_t> shape(info.ndim + 2);
        shape[0] = info.batch;
        shape[1] = info.in_channels;
        for (size_t i = 0; i < info.ndim; ++i) {
            shape[i + 2] = info.input_dims[i];
        }
        applyConv(info, y, x, w, shape.data());
    }
}
padded_shape can be computed at descriptor-creation time and passed in through info, so it doesn't need to be rebuilt here on every call.
Fixed.
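A sketch of the suggested precomputation, assuming ConvInfo gains a padded_shape member (the struct and field names here are illustrative stand-ins, not the PR's actual code):

#include <cstddef>
#include <vector>

struct ConvInfoSketch {               // illustrative stand-in for ConvInfo
    size_t batch, in_channels, ndim;
    std::vector<size_t> input_dims, pads_info;
    std::vector<size_t> padded_shape; // filled once at descriptor creation
};

// Run once while building the descriptor, so _conv_cpu can read
// info.padded_shape.data() instead of rebuilding the vector per call
void precomputePaddedShape(ConvInfoSketch &info) {
    info.padded_shape.assign(info.ndim + 2, 0);
    info.padded_shape[0] = info.batch;
    info.padded_shape[1] = info.in_channels;
    for (size_t i = 0; i < info.ndim; ++i) {
        info.padded_shape[i + 2] = info.input_dims[i] + 2 * info.pads_info[i];
    }
}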
    const Xdata *x,
    const Xdata *w,
    const size_t *x_shape) {
#pragma omp parallel for
Use #pragma omp parallel for collapse(2) schedule(dynamic) here.
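A sketch of what the suggested pragma looks like on a pair of perfectly nested outer loops (the loop structure below is assumed, not copied from applyConv): collapse(2) fuses the two loops into one parallel iteration space, and schedule(dynamic) rebalances work when iterations have uneven cost.

// Assumed loop shape for illustration: batch and output channel outermost
#pragma omp parallel for collapse(2) schedule(dynamic)
for (ptrdiff_t n = 0; n < static_cast<ptrdiff_t>(info.batch); ++n) {
    for (ptrdiff_t oc = 0; oc < static_cast<ptrdiff_t>(info.out_channels); ++oc) {
        // ... spatial loops and the multiply-accumulate inner kernel ...
    }
}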
    CHECK_RESULT(result);
    size_t workspace_size = result->handler->workspace_size;
    *desc_ptr = new Descriptor(
        dtype, result.take(), workspace_size,
(line 38) Use std::move.
Fixed.
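A self-contained toy showing the pattern being asked for (all names here are illustrative, not InfiniCore's): a constructor that takes its argument by value and moves it in, so the caller's std::move avoids copying heap-owned meta storage.

#include <cstddef>
#include <utility>
#include <vector>

struct Info {                         // toy stand-in for ConvInfo
    std::vector<std::size_t> meta;    // owns heap memory, so moving is cheap
};

struct Descriptor {                   // toy stand-in for the op descriptor
    Info info;
    explicit Descriptor(Info i) : info(std::move(i)) {}
};

int main() {
    Info info;
    info.meta.resize(1024);
    Descriptor d(std::move(info));    // moved, not copied, into the descriptor
    return 0;
}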
src/infiniop/ops/conv/info.h (Outdated)
This isn't really the right design here:
- ConvInfo itself is fine, but the recommendation is to use a single meta mem pool and fetch each meta array (i.e., input_dims, kernel_dims, etc.) by offset; see the design of ElementwiseInfo for reference (a sketch of this layout follows below).
- Remove CudnnConvHandler; this CUDA-specific structure belongs in the CUDA backend's Opaque. Opaque is hidden from the outside, so each backend platform can lay out its own state inside it; gemm.h has a detailed explanation of this design. ConvInfo should hold only generic attributes and information and should not contain structures like CudnnConvHandler (even behind conditional compilation); that is simply not a good design.
- InfiniCore does not yet have the design form mentioned in the second point (nothing uses cuDNN so far), but the concrete conv implementation (cuDNN calls, checks, etc.) can reference the operators 123 PR mentioned earlier.
ConvInfo has been changed to fetch its meta through a meta mem pool and offsets.
The cuDNN handle is now managed through Opaque.
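A sketch of the Opaque pattern for a cuDNN-backed CUDA backend (the member set is an assumption; gemm.h in the repo explains the real pattern): all cuDNN state lives behind the backend-private struct and never appears in ConvInfo.

#include <cudnn.h>

// Backend-private state: visible only inside the CUDA implementation
struct Opaque {
    cudnnHandle_t handle = nullptr;
    cudnnConvolutionDescriptor_t conv_desc = nullptr;

    Opaque() {
        cudnnCreate(&handle);
        cudnnCreateConvolutionDescriptor(&conv_desc);
    }
    ~Opaque() {
        cudnnDestroyConvolutionDescriptor(conv_desc);
        cudnnDestroy(handle);
    }
};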