Skip to content

Commit

Permalink
[BugFix] Fix check of overflow for converting floating to integral (#…
Browse files Browse the repository at this point in the history
…49707)

Signed-off-by: zihe.liu <[email protected]>
(cherry picked from commit ebc98dd)

# Conflicts:
#	be/src/exprs/vectorized/cast_expr.cpp
#	be/src/formats/avro/numeric_column.cpp
#	be/test/CMakeLists.txt
#	test/sql/test_function/R/test_cast
#	test/sql/test_function/T/test_cast
  • Loading branch information
ZiheLiu authored and mergify[bot] committed Aug 13, 2024
1 parent 6d1e2a5 commit de14287
Show file tree
Hide file tree
Showing 11 changed files with 775 additions and 9 deletions.
16 changes: 8 additions & 8 deletions be/src/exprs/vectorized/cast_expr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,13 @@
#include "types/hll.h"
#include "util/date_func.h"
#include "util/json.h"
<<<<<<< HEAD:be/src/exprs/vectorized/cast_expr.cpp
#include "velocypack/Iterator.h"
=======
#include "util/json_converter.h"
#include "util/mysql_global.h"
#include "util/numeric_types.h"
>>>>>>> ebc98ddaa4 ([BugFix] Fix check of overflow for converting floating to intergal (#49707)):be/src/exprs/cast_expr.cpp

namespace starrocks::vectorized {

Expand Down Expand Up @@ -389,21 +395,15 @@ DEFINE_UNARY_FN_WITH_IMPL(ImplicitToNumber, value) {
}

// Returns true when `value` (of type `Type`) cannot be represented by `ResultType`.
//
// The range test is delegated to check_number_overflow(). The former inline test
// `value > (Type)std::numeric_limits<ResultType>::max()` missed the value 2^n for
// floating-point inputs, because max() (2^n - 1) is rounded up to 2^n when it is
// converted to a floating-point type.
DEFINE_UNARY_FN_WITH_IMPL(NumberCheck, value) {
    return check_number_overflow<Type, ResultType>(value);
}

DEFINE_UNARY_FN_WITH_IMPL(NumberCheckWithThrowException, value) {
// std::numeric_limits<T>::lowest() is a finite value x such that there is no other
// finite value y where y < x.
// This is different from std::numeric_limits<T>::min() for floating-point types.
// So we use lowest instead of min for lower bound of all types.
auto result = (value < (Type)std::numeric_limits<ResultType>::lowest()) |
(value > (Type)std::numeric_limits<ResultType>::max());
const auto result = NumberCheck::apply<Type, ResultType>(value);
if (result) {
std::stringstream ss;
if constexpr (std::is_same_v<Type, __int128_t>) {
Expand Down
221 changes: 221 additions & 0 deletions be/src/formats/avro/numeric_column.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "numeric_column.h"

#include "column/fixed_length_column.h"
#include "gutil/strings/substitute.h"
#include "util/numeric_types.h"
#include "util/string_parser.hpp"

namespace starrocks {

/// Converts `from` to `ToType` and stores the result in `*to` when the value fits.
///
/// @param from value to convert.
/// @param to   destination; written only when the conversion does not overflow.
/// @return true when `from` overflows `ToType` (note the inverted sense: true means failure),
///         false when `*to` now holds the converted value.
///
/// The range check runs before the conversion on purpose: converting a floating-point value
/// that is out of range for an integral destination (or NaN) is undefined behavior, so the
/// static_cast must only be executed for values that are known to fit.
template <typename FromType, typename ToType>
static inline bool checked_cast(const FromType& from, ToType* to) {
    DIAGNOSTIC_PUSH
#if defined(__clang__)
    DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
#endif
    const bool overflow = check_number_overflow<FromType, ToType>(from);
    DIAGNOSTIC_POP

    if (overflow) {
        return true;
    }
    *to = static_cast<ToType>(from);
    return false;
}

/// Converts `in` to `T` and appends it to `column`.
/// Returns InvalidArgument when checked_cast() reports that `in` does not fit into `T`.
template <typename T, typename FromType>
static Status append_checked_numeric_value(FixedLengthColumn<T>* column, const std::string& name,
                                           const FromType& in) {
    T out{};
    // checked_cast() returns true on overflow.
    if (checked_cast(in, &out)) {
        auto err_msg = strings::Substitute("Value is overflow. column=$0, value=$1", name, in);
        return Status::InvalidArgument(err_msg);
    }
    column->append_numbers(&out, sizeof(out));
    return Status::OK();
}

/// Reads a native numeric (or boolean) avro value and appends it to `column` as `T`.
///
/// @param column    destination column.
/// @param type_desc unused here; kept for signature parity with the other column adders.
/// @param name      column name, used only in error messages.
/// @param value     avro value of type INT32, INT64, BOOLEAN, FLOAT or DOUBLE.
/// @return OK on success; InvalidArgument when the avro getter fails or the value overflows `T`;
///         DataQualityError for any other avro type.
template <typename T>
static Status add_column_with_numeric_value(FixedLengthColumn<T>* column, const TypeDescriptor& type_desc,
                                            const std::string& name, const avro_value_t& value) {
    switch (avro_value_get_type(&value)) {
    case AVRO_INT32: {
        int in = 0;
        if (avro_value_get_int(&value, &in) != 0) {
            auto err_msg = strings::Substitute("Get int value error. column=$0", name);
            return Status::InvalidArgument(err_msg);
        }
        return append_checked_numeric_value(column, name, in);
    }
    case AVRO_INT64: {
        int64_t in = 0;
        if (avro_value_get_long(&value, &in) != 0) {
            auto err_msg = strings::Substitute("Get int64 value error. column=$0", name);
            return Status::InvalidArgument(err_msg);
        }
        return append_checked_numeric_value(column, name, in);
    }
    case AVRO_BOOLEAN: {
        // The avro getter reports booleans through an int.
        int in = 0;
        if (avro_value_get_boolean(&value, &in) != 0) {
            auto err_msg = strings::Substitute("Get boolean value error. column=$0", name);
            return Status::InvalidArgument(err_msg);
        }
        return append_checked_numeric_value(column, name, in);
    }
    case AVRO_FLOAT: {
        float in = 0;
        if (avro_value_get_float(&value, &in) != 0) {
            auto err_msg = strings::Substitute("Get float value error. column=$0", name);
            return Status::InvalidArgument(err_msg);
        }
        return append_checked_numeric_value(column, name, in);
    }
    case AVRO_DOUBLE: {
        double in = 0;
        if (avro_value_get_double(&value, &in) != 0) {
            auto err_msg = strings::Substitute("Get double value error. column=$0", name);
            return Status::InvalidArgument(err_msg);
        }
        return append_checked_numeric_value(column, name, in);
    }
    default: {
        auto err_msg = strings::Substitute("Unsupported value type. column=$0", name);
        return Status::DataQualityError(err_msg);
    }
    }
}

/// Parses an avro string value as a number of type `T` and appends it to `column`.
///
/// The text is first parsed directly as `T`. If that fails, it is parsed as a double and
/// converted to `T` through checked_cast(), so that e.g. "1.0" can still feed an integer column.
///
/// @return OK on success; InvalidArgument when the string cannot be read, cannot be parsed,
///         or the parsed double overflows `T`.
template <typename T>
static Status add_column_with_string_value_numeric(FixedLengthColumn<T>* column, const TypeDescriptor& type_desc,
                                                   const std::string& name, const avro_value_t& value) {
    const char* text;
    size_t length;
    if (avro_value_get_string(&value, &text, &length) != 0) {
        auto err_msg = strings::Substitute("Get string value error. column=$0", name);
        return Status::InvalidArgument(err_msg);
    }

    // Avro reports the string size including the trailing NUL, i.e. strlen + 1.
    // See https://avro.apache.org/docs/1.11.1/api/c/
    --length;

    StringParser::ParseResult status = StringParser::PARSE_SUCCESS;

    T parsed{};
    if constexpr (std::is_floating_point<T>::value) {
        parsed = StringParser::string_to_float<T>(text, length, &status);
    } else {
        parsed = StringParser::string_to_int<T>(text, length, &status);
    }

    if (status == StringParser::PARSE_SUCCESS) {
        column->append_numbers(&parsed, sizeof(parsed));
        return Status::OK();
    }

    // Direct parsing failed: attempt to parse the string as a double instead.
    auto as_double = StringParser::string_to_float<double>(text, length, &status);
    if (status != StringParser::PARSE_SUCCESS) {
        std::string err_msg = strings::Substitute("Unable to cast string value to BIGINT. value=$0, column=$1",
                                                  std::string(text, length), name);
        return Status::InvalidArgument(err_msg);
    }

    // checked_cast() returns true when the double does not fit into T.
    if (checked_cast(as_double, &parsed)) {
        auto err_msg = strings::Substitute("Value is overflow. column=$0, value=$1", name, as_double);
        return Status::InvalidArgument(err_msg);
    }

    column->append_numbers(&parsed, sizeof(parsed));
    return Status::OK();
}

/// Appends one avro value to a numeric column whose element type is `T`.
///
/// Native numeric and boolean avro values go through add_column_with_numeric_value();
/// string values are parsed by add_column_with_string_value_numeric(). Every other avro
/// type is rejected with InvalidArgument.
template <typename T>
Status add_numeric_column(Column* column, const TypeDescriptor& type_desc, const std::string& name,
                          const avro_value_t& value) {
    auto typed_column = down_cast<FixedLengthColumn<T>*>(column);
    const avro_type_t value_type = avro_value_get_type(&value);

    const bool is_native_number = value_type == AVRO_INT32 || value_type == AVRO_INT64 ||
                                  value_type == AVRO_FLOAT || value_type == AVRO_DOUBLE ||
                                  value_type == AVRO_BOOLEAN;
    if (is_native_number) {
        return add_column_with_numeric_value(typed_column, type_desc, name, value);
    }

    if (value_type == AVRO_STRING) {
        return add_column_with_string_value_numeric(typed_column, type_desc, name, value);
    }

    auto err_msg = strings::Substitute("Unsupported value type. Numeric type is required. column=$0", name);
    return Status::InvalidArgument(err_msg);
}

// Explicit instantiations of add_numeric_column for the supported element types.
// The template is defined in this .cpp file, so each element type that should be
// available is instantiated here.
template Status add_numeric_column<int64_t>(Column* column, const TypeDescriptor& type_desc, const std::string& name,
const avro_value_t& value);
template Status add_numeric_column<int32_t>(Column* column, const TypeDescriptor& type_desc, const std::string& name,
const avro_value_t& value);
template Status add_numeric_column<int16_t>(Column* column, const TypeDescriptor& type_desc, const std::string& name,
const avro_value_t& value);
template Status add_numeric_column<int8_t>(Column* column, const TypeDescriptor& type_desc, const std::string& name,
const avro_value_t& value);
template Status add_numeric_column<uint8_t>(Column* column, const TypeDescriptor& type_desc, const std::string& name,
const avro_value_t& value);
template Status add_numeric_column<double>(Column* column, const TypeDescriptor& type_desc, const std::string& name,
const avro_value_t& value);
template Status add_numeric_column<float>(Column* column, const TypeDescriptor& type_desc, const std::string& name,
const avro_value_t& value);

} // namespace starrocks
3 changes: 2 additions & 1 deletion be/src/formats/json/numeric_column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include "column/fixed_length_column.h"
#include "gutil/strings/substitute.h"
#include "util/numeric_types.h"
#include "util/string_parser.hpp"

namespace starrocks::vectorized {
Expand All @@ -18,7 +19,7 @@ static inline bool checked_cast(const FromType& from, ToType* to) {
#if defined(__clang__)
DIAGNOSTIC_IGNORE("-Wimplicit-const-int-float-conversion")
#endif
return (from < std::numeric_limits<ToType>::lowest() || from > std::numeric_limits<ToType>::max());
return check_number_overflow<FromType, ToType>(from);
DIAGNOSTIC_POP
}

Expand Down
58 changes: 58 additions & 0 deletions be/src/util/numeric_types.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <limits>
#include <type_traits>

namespace starrocks {

/// Inclusive lower bound, expressed in the floating-point type `FromType`, of the values that
/// convert to the integral type `ToType` without overflow.
/// `lowest()` of an integral type is 0 or -2^n, both of which are exactly representable.
template <typename FromType, typename ToType>
static constexpr FromType floating_to_intergral_lower_bound =
        static_cast<FromType>(std::numeric_limits<ToType>::lowest());

/// Exclusive upper bound, expressed in the floating-point type `FromType`, of the values that
/// convert to the integral type `ToType` without overflow.
/// `2 * (max / 2 + 1)` equals `max + 1` (i.e. 2^n) and is computed without overflowing `ToType`.
template <typename FromType, typename ToType>
static constexpr FromType floating_to_intergral_upper_bound = static_cast<FromType>(2) *
                                                              (std::numeric_limits<ToType>::max() / 2 + 1);

/// Check whether the value of type `FromType` overflows when converted to type `ToType`.
/// If overflow, return true; otherwise, return false.
///
/// The comparison is always carried out in a domain where the limits of `ToType` are
/// representable; the limits are never narrowed into `FromType` for arithmetic built-in pairs
/// that involve an integral source.
template <typename FromType, typename ToType>
bool check_number_overflow(FromType value) {
    if constexpr (std::is_floating_point_v<FromType> && std::is_integral_v<ToType>) {
        // For floating-point sources `value > (FromType)max()` is not a valid test:
        // max() is 2^n - 1 (n = 63, 31, 15 or 7), which is not exactly representable and is
        // rounded up to 2^n when converted to FromType. `value == 2^n` would therefore pass the
        // test although the actual conversion overflows (and yields a platform-dependent result).
        //
        // 2^n itself is exactly representable, so test `value < 2^n` instead.
        // The negated form also rejects NaN, for which both comparisons are false.
        return !(value >= floating_to_intergral_lower_bound<FromType, ToType> &&
                 value < floating_to_intergral_upper_bound<FromType, ToType>);
    } else if constexpr (std::is_integral_v<FromType> && std::is_floating_point_v<ToType>) {
        // Casting the floating-point limits to an integral FromType would be undefined behavior
        // (they are far outside the integral range), so compare in the floating-point domain.
        const auto converted = static_cast<ToType>(value);
        return (converted < std::numeric_limits<ToType>::lowest()) |
               (converted > std::numeric_limits<ToType>::max());
    } else if constexpr (std::is_integral_v<FromType> && std::is_integral_v<ToType>) {
        // Casting the limits of ToType to FromType wraps around when ToType is wider or has a
        // different signedness (e.g. (int32_t)INT64_MAX == -1), which would flag every value as
        // overflow. Compare without narrowing the limits instead.
        if constexpr (std::is_signed_v<FromType> == std::is_signed_v<ToType>) {
            // Same signedness: the usual arithmetic conversions are value preserving.
            return (value < std::numeric_limits<ToType>::lowest()) |
                   (value > std::numeric_limits<ToType>::max());
        } else if constexpr (std::is_signed_v<FromType>) {
            // signed -> unsigned: negative values never fit; non-negative ones are compared unsigned.
            return (value < 0) |
                   (static_cast<std::make_unsigned_t<FromType>>(value) > std::numeric_limits<ToType>::max());
        } else {
            // unsigned -> signed: only the upper bound can be exceeded.
            return value > static_cast<std::make_unsigned_t<ToType>>(std::numeric_limits<ToType>::max());
        }
    } else {
        // std::numeric_limits<T>::lowest() is a finite value x such that there is no other
        // finite value y where y < x.
        // This is different from std::numeric_limits<T>::min() for floating-point types.
        // So we use lowest instead of min for lower bound of all types.
        return (value < static_cast<FromType>(std::numeric_limits<ToType>::lowest())) |
               (value > static_cast<FromType>(std::numeric_limits<ToType>::max()));
    }
}

} // namespace starrocks
7 changes: 7 additions & 0 deletions be/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,13 @@ set(EXEC_FILES
./util/ratelimit_test.cpp
./util/cpu_usage_info_test.cpp
./util/concurrent_limiter_test.cpp
<<<<<<< HEAD
=======
./util/stack_trace_mutex_test.cpp
./util/download_util_test.cpp
./util/numeric_types_test.cpp
./gutil/cpu_test.cc
>>>>>>> ebc98ddaa4 ([BugFix] Fix check of overflow for converting floating to intergal (#49707))
./gutil/sysinfo-test.cc
./service/lake_service_test.cpp
)
Expand Down
Loading

0 comments on commit de14287

Please sign in to comment.