remove blocked format
Signed-off-by: fishbell <[email protected]>
songbell committed Jan 24, 2025
1 parent 6e0ab3b commit fd2117a
Showing 3 changed files with 105 additions and 63 deletions.
@@ -41,15 +41,6 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG

 OUTPUT_VEC_TYPE res;
 
-INPUT1_TYPE input_scale_val = IN_SCALE_VAL;
-
-INPUT1_TYPE input_shift_val = IN_SHIFT_VAL;
-
-INPUT1_TYPE output_scale_val = OUT_SCALE_VAL;
-
-INPUT1_TYPE output_shift_val = OUT_SHIFT_VAL;
-
-
 #if HAS_CLAMP
 #if CAN_USE_OUTPUT_RANGE
 INPUT1_TYPE output_low_val = OUT_LO_VAL;
@@ -67,21 +58,21 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
 #if CAN_USE_OUTPUT_RANGE
 
 #if HAS_PRE_SHIFT
-INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * input_scale_val + input_shift_val;
+INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * IN_SCALE_VAL + IN_SHIFT_VAL;
 #else
-INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * input_scale_val;
+INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * IN_SCALE_VAL;
 #endif
 
 #if HAS_OUTPUT_RANGE_ROUND
 val = round(val);
 #endif
 
 #if HAS_POST_SCALE
-val *= output_scale_val;
+val *= OUT_SCALE_VAL;
 #endif
 
 #if HAS_POST_SHIFT
-val += output_shift_val;
+val += OUT_SHIFT_VAL;
 #endif
 
 #if HAS_CLAMP
@@ -107,17 +98,17 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
 #endif
 
 #if HAS_PRE_SHIFT
-val = round(val * input_scale_val + input_shift_val);
+val = round(val * IN_SCALE_VAL + IN_SHIFT_VAL);
 #else
-val = round(val * input_scale_val);
+val = round(val * IN_SCALE_VAL);
 #endif
 
 #if HAS_POST_SCALE
-val *= output_scale_val;
+val *= OUT_SCALE_VAL;
 #endif
 
 #if HAS_POST_SHIFT
-val += output_shift_val;
+val += OUT_SHIFT_VAL;
 #endif
 
 #endif // CAN_USE_OUTPUT_RANGE
@@ -127,7 +118,7 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
 // *********************************** //
 
 #if FEATURE_BLOCKED_FORMAT
-if (of < OUTPUT_FEATURE_NUM)
+//if (of < OUTPUT_FEATURE_NUM)
 #endif
 #if OUTPUT_IS_FP
 res = TO_VECTOR_TYPE_SAT(OUTPUT_TYPE, 8)(val);
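Taken together, the kernel hunks above inline the per-tensor scale/shift macros at their use sites and disable the blocked-format feature guard. For orientation, here is a minimal scalar sketch of the arithmetic the kernel vectorizes eight elements at a time on the CAN_USE_OUTPUT_RANGE path. This is plain C++, not the kernel itself: QuantParams and quantize_one are illustrative names, and the parameter values are merely chosen to line up with the unit test added below.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative stand-in for the per-tensor macros IN_SCALE_VAL, IN_SHIFT_VAL,
// OUT_SCALE_VAL, OUT_SHIFT_VAL, OUT_LO_VAL and OUT_HI_VAL.
struct QuantParams {
    float in_scale, in_shift;
    float out_scale, out_shift;
    float out_lo, out_hi;
};

static uint8_t quantize_one(float x, const QuantParams& p) {
    float val = x * p.in_scale + p.in_shift;            // HAS_PRE_SHIFT branch
    val = std::round(val);                              // HAS_OUTPUT_RANGE_ROUND (rounding mode may differ from the GPU's)
    val = val * p.out_scale + p.out_shift;              // HAS_POST_SCALE / HAS_POST_SHIFT
    val = std::min(std::max(val, p.out_lo), p.out_hi);  // HAS_CLAMP against the output range
    return static_cast<uint8_t>(val);                   // models the saturating TO_VECTOR_TYPE_SAT cast
}

int main() {
    QuantParams p{25.5f, 0.0f, 1.0f, 0.0f, 0.0f, 255.0f};  // assumed decomposition for in [0,10] -> out [0,255]
    std::printf("%d\n", quantize_one(2.1f, p));            // prints 54
}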
@@ -51,27 +51,6 @@ JitConstants QuantizeKernelScaleShift_vload8::GetJitConstants(const quantize_par
 const CommonDispatchData& dispatchData) const {
     JitConstants jit = Parent::GetJitConstants(params, dispatchData);
 
-    if (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv32 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv32 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv32_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv32_fsv32) {
-        jit.AddConstant(MakeJitConstant("FEATURE_BLOCKED_FORMAT", true));
-        jit.AddConstant(MakeJitConstant("GWS_BATCH", 2));
-        jit.AddConstant(MakeJitConstant("GWS_FEATURE", 1));
-        jit.AddConstant(MakeJitConstant("GWS_YX", 0));
-        jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
-    } else {
-        auto tensor_jits = GetTensorFriendlyWorkGroupsJit(params.outputs[0]);
-        jit.Merge(tensor_jits);
-    }
-
     auto can_use_output_range = params.per_tensor_output_range && params.out_lo < params.out_hi;
     auto has_output_range_round =
         !(params.outputs[0].GetDType() == Datatype::INT8 || params.outputs[0].GetDType() == Datatype::UINT8);
@@ -106,31 +85,25 @@ bool QuantizeKernelScaleShift_vload8::Validate(const Params& p) const {
     !params.per_tensor_output_scale || !params.per_tensor_output_shift ||
     (params.has_pre_shift && !params.per_tensor_input_shift))
         return false;
-    // TBD, do we really need the strict block_size checking to support blocked format?
-    for (size_t i = 0; i < params.inputs.size(); i++) {
-        const auto input_layout = params.inputs[i].GetLayout();
-        const auto batch_size = params.inputs[i].Batch().v;
-        const auto feature_size = params.inputs[i].Feature().v;
-        if ((input_layout == DataLayout::b_fs_yx_fsv16 && feature_size % 16 != 0) ||
-            (input_layout == DataLayout::b_fs_yx_fsv32 && feature_size % 32 != 0) ||
-            (input_layout == DataLayout::b_fs_zyx_fsv16 && feature_size % 16 != 0) ||
-            (input_layout == DataLayout::b_fs_yx_fsv4 && feature_size % 8 != 0) ||
-            input_layout == DataLayout::fs_b_yx_fsv32 ||
-            (input_layout == DataLayout::bs_fs_yx_bsv32_fsv16 && (feature_size % 16 != 0 || batch_size % 32 != 0)) ||
-            (input_layout == DataLayout::bs_fs_yx_bsv32_fsv32 && (feature_size % 32 != 0 || batch_size % 32 != 0)))
+    /*auto check_blocked_format = [] (const DataTensor& dt) -> bool {
+        // if padding is present in a blocked format, unnecessary calculations are introduced when directly using vector compute
+        auto feature_block_size = 16;
+        auto feature_size = dt.Feature().v;
+        if (feature_size % feature_block_size != 0)
             return false;
     }
-    if ((params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 && params.outputs[0].Feature().v % 16 != 0) ||
-        (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32 && params.outputs[0].Feature().v % 32 != 0) ||
-        (params.outputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv16 && params.outputs[0].Feature().v % 16 != 0) ||
-        (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv4 && params.outputs[0].Feature().v % 8 != 0) ||
-        params.outputs[0].GetLayout() == DataLayout::fs_b_yx_fsv32 ||
-        (params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 &&
-         (params.outputs[0].Feature().v % 16 != 0 || params.outputs[0].Batch().v % 32 != 0)) ||
-        (params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 &&
-         (params.outputs[0].Feature().v % 32 != 0 || params.outputs[0].Batch().v % 32 != 0)))
+        if (dt.DoubleBlockedLayout()) {
+            auto batch_size = dt.Batch().v;
+            if (batch_size % feature_block_size != 0)
+                return false;
+        }
+        return true;
+    };*/
+    if (!params.outputs[0].SimpleLayout() || params.outputs[0].GetLayout() != params.inputs[0].GetLayout() || params.outputs[0].PhysicalSize() % 8 != 0)
+        return false;
+    /*if (!params.outputs[0].SimpleLayout()) {
+        //return check_blocked_format(params.outputs[0]);
+        return false;
+        // TBD: maybe a stricter check is needed?
+    }*/
     return true;
 }

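With the blocked-format branch removed, the applicability rule that Validate() enforces reduces to three conditions. Below is a hedged sketch under the assumption that the vload8 kernel consumes eight elements per step, which the vec-8 types above suggest (vload8_applicable is an illustrative helper, not part of the kernel selector):

#include <cstddef>

// Mirrors the three checks in the rewritten Validate(): a plain (non-blocked)
// output layout, identical input/output layouts, and a flattened element
// count divisible by the vector width of 8.
static bool vload8_applicable(bool output_is_simple_layout,
                              bool layouts_match,
                              std::size_t physical_size) {
    return output_is_simple_layout && layouts_match && physical_size % 8 == 0;
}

For example, the 1x16x2x2 bfyx tensor used by the new test has 64 elements, and 64 % 8 == 0, so the optimized kernel remains applicable.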
@@ -749,6 +749,85 @@ TEST(quantize_gpu, dynamic) {
 }
 }
 
+TEST(quantize_gpu, opt_vec_kernel) {
+    auto& engine = get_test_engine();
+
+    auto input = engine.allocate_memory({ { 1, 16, 2, 2 }, data_types::f32, format::bfyx });
+    auto input_low = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
+    auto input_high = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
+    auto output_low = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
+    auto output_high = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
+
+    layout in_dyn_layout { ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
+
+    set_values(input, { -1.0f, 2.1f, 3.0f, 4.0f,
+                        5.0f, 2.0f, 2.0f, 3.0f,
+                        4.0f, 6.0f, 3.0f, 3.0f,
+                        3.0f, 5.0f, 1.0f, 1.0f,
+
+                        1.0f, 1.0f, 1.0f, 1.0f,
+                        4.0f, 6.0f, 3.0f, 3.0f,
+                        3.0f, 5.0f, 1.0f, 1.0f,
+                        1.0f, 1.0f, 1.0f, 1.0f,
+
+                        1.0f, 2.0f, 3.0f, 4.0f,
+                        5.0f, 2.0f, 2.0f, 3.0f,
+                        4.0f, 6.0f, 3.0f, 3.0f,
+                        3.0f, 5.0f, 1.0f, 1.0f,
+
+                        1.0f, 1.0f, 1.0f, 1.0f,
+                        4.0f, 6.0f, 3.0f, 3.0f,
+                        3.0f, 5.0f, 1.0f, 1.0f,
+                        1.0f, 1.0f, 1.0f, 1.0f });
+
+    set_values(input_low, { 0.0f });
+    set_values(input_high, { 10.0f });
+
+    set_values(output_low, { 0.0f });
+    set_values(output_high, { 255.0f });
+
+    std::vector<uint8_t> ref_data = { 0, 54, 76, 102, 128, 51, 51, 76, 102, 153, 76, 76, 76, 128, 26, 26,
+                                      26, 26, 26, 26, 102, 153, 76, 76, 76, 128, 26, 26, 26, 26, 26, 26,
+                                      26, 51, 76, 102, 128, 51, 51, 76, 102, 153, 76, 76, 76, 128, 26, 26,
+                                      26, 26, 26, 26, 102, 153, 76, 76, 76, 128, 26, 26, 26, 26, 26, 26 };
+
+    topology topology;
+    topology.add(
+        input_layout("input", in_dyn_layout),
+        data("input_low", input_low),
+        data("input_high", input_high),
+        data("output_low", output_low),
+        data("output_high", output_high),
+        quantize("quantize", input_info("input"), input_info("input_low"), input_info("input_high"), input_info("output_low"), input_info("output_high"), 255, data_types::u8)
+    );
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+    network.set_input_data("input", input);
+
+    auto inst = network.get_primitive("quantize");
+    auto impl = inst->get_impl();
+    ASSERT_TRUE(impl != nullptr);
+    ASSERT_TRUE(impl->is_dynamic());
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("quantize").get_memory();
+    cldnn::mem_lock<uint8_t> output_ptr(output, get_test_stream());
+
+    // Check that the layout and memory contain the logical size of the tensor
+    ASSERT_EQ(output->count(), (size_t)64);
+    ASSERT_EQ(output->get_layout().count(), (size_t)64);
+
+    ASSERT_EQ(output->size(), ref_data.size() * sizeof(uint8_t));
+
+    for (size_t i = 0; i < ref_data.size(); ++i) {
+        ASSERT_NEAR(output_ptr[i], ref_data[i], 1) << " index = " << i;
+    }
+}
+
 TEST(quantize_gpu, dynamic_fsv16) {
     auto& engine = get_test_engine();
 
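The values in ref_data follow directly from the quantization parameters above: with input range [0, 10] and output range [0, 255], each input is scaled by 255 / 10 = 25.5 and rounded, so 2.1f maps to round(53.55) = 54 and -1.0f clamps to 0. The tolerance of 1 in the ASSERT_NEAR loop absorbs rounding-mode differences on exact halves such as 3.0f * 25.5 = 76.5, which may land on 76 or 77.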
@@ -1050,7 +1129,6 @@ struct quantize_random_test : testing::TestWithParam<quantize_random_test_params
         FAIL() << "Not supported inputs number: " << params.inputs_num;
     }
 
-
     network net_opt(engine, topo_opt, get_test_default_config(engine));
     net_opt.set_input_data("input_opt", input_opt);
 
