[BugFix] Fix check of overflow for converting floating to integral (#…

…49707) Signed-off-by: zihe.liu <[email protected]> (cherry picked from commit ebc98dd) # Conflicts: # be/src/exprs/vectorized/cast_expr.cpp # be/src/formats/avro/numeric_column.cpp # be/test/CMakeLists.txt # test/sql/test_function/R/test_cast # test/sql/test_function/T/test_cast
StarRocks · Aug 13, 2024 · de14287 · de14287
1 parent 6d1e2a5
commit de14287
Show file tree

Hide file tree

Showing 11 changed files with 775 additions and 9 deletions.
diff --git a/be/src/exprs/vectorized/cast_expr.cpp b/be/src/exprs/vectorized/cast_expr.cpp
@@ -30,7 +30,13 @@
 #include "types/hll.h"
 #include "util/date_func.h"
 #include "util/json.h"
+<<<<<<< HEAD:be/src/exprs/vectorized/cast_expr.cpp
 #include "velocypack/Iterator.h"
+=======
+#include "util/json_converter.h"
+#include "util/mysql_global.h"
+#include "util/numeric_types.h"
+>>>>>>> ebc98ddaa4 ([BugFix] Fix check of overflow for converting floating to intergal (#49707)):be/src/exprs/cast_expr.cpp
 
 namespace starrocks::vectorized {
 
@@ -389,21 +395,15 @@ DEFINE_UNARY_FN_WITH_IMPL(ImplicitToNumber, value) {
 }
 
 DEFINE_UNARY_FN_WITH_IMPL(NumberCheck, value) {
-    // std::numeric_limits<T>::lowest() is a finite value x such that there is no other
-    // finite value y where y < x.
-    // This is different from std::numeric_limits<T>::min() for floating-point types.
-    // So we use lowest instead of min for lower bound of all types.
-    return (value < (Type)std::numeric_limits<ResultType>::lowest()) |
-           (value > (Type)std::numeric_limits<ResultType>::max());
+    // Returns true when `value` overflows ResultType; the range test itself lives in the
+    // shared check_number_overflow() helper.
+    return check_number_overflow<Type, ResultType>(value);
 }
 
 DEFINE_UNARY_FN_WITH_IMPL(NumberCheckWithThrowException, value) {
     // std::numeric_limits<T>::lowest() is a finite value x such that there is no other
     // finite value y where y < x.
     // This is different from std::numeric_limits<T>::min() for floating-point types.
     // So we use lowest instead of min for lower bound of all types.
-    auto result = (value < (Type)std::numeric_limits<ResultType>::lowest()) |
-                  (value > (Type)std::numeric_limits<ResultType>::max());
+    const auto result = NumberCheck::apply<Type, ResultType>(value);
     if (result) {
         std::stringstream ss;
         if constexpr (std::is_same_v<Type, __int128_t>) {

diff --git a/be/src/formats/avro/numeric_column.cpp b/be/src/formats/avro/numeric_column.cpp
@@ -0,0 +1,221 @@
+// Copyright 2021-present StarRocks, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "numeric_column.h"
+
+#include "column/fixed_length_column.h"
+#include "gutil/strings/substitute.h"
+#include "util/numeric_types.h"
+#include "util/string_parser.hpp"
+
+namespace starrocks {
+
+/// Converts `from` to `ToType` and stores the result in `*to` when the value fits.
+///
+/// @param from  Source value.
+/// @param to    Destination; written only when the conversion does not overflow.
+/// @return true if `from` overflows `ToType` (`*to` is left untouched), false otherwise.
+template <typename FromType, typename ToType>
+static inline bool checked_cast(const FromType& from, ToType* to) {
+    DIAGNOSTIC_PUSH
+#if defined(__clang__)
+    DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
+#endif
+    const bool overflow = check_number_overflow<FromType, ToType>(from);
+    DIAGNOSTIC_POP
+
+    // Convert only after the range check: casting an out-of-range floating-point value
+    // to an integral type is undefined behavior, so it must not happen unconditionally.
+    if (!overflow) {
+        *to = static_cast<ToType>(from);
+    }
+    return overflow;
+}
+
+/// Converts `in` to `T` and appends it to `column`.
+/// Returns InvalidArgument when `in` does not fit into `T`.
+template <typename T, typename InType>
+static Status append_checked_numeric(FixedLengthColumn<T>* column, const std::string& name, const InType& in) {
+    T out{};
+    // checked_cast() returns true on overflow.
+    if (checked_cast(in, &out)) {
+        auto err_msg = strings::Substitute("Value is overflow. column=$0, value=$1", name, in);
+        return Status::InvalidArgument(err_msg);
+    }
+    column->append_numbers(&out, sizeof(out));
+    return Status::OK();
+}
+
+/// Reads a native Avro numeric (int32/int64/boolean/float/double) from `value` and appends it to
+/// `column` as a `T`.
+///
+/// @param column     Destination column.
+/// @param type_desc  Unused here; kept for signature parity with the string variant.
+/// @param name       Column name, used only in error messages.
+/// @param value      Avro value to read.
+/// @return OK on success; InvalidArgument when the Avro getter fails or the value overflows `T`;
+///         DataQualityError when the Avro type is not one of the handled numeric types.
+template <typename T>
+static Status add_column_with_numeric_value(FixedLengthColumn<T>* column, const TypeDescriptor& type_desc,
+                                            const std::string& name, const avro_value_t& value) {
+    switch (avro_value_get_type(&value)) {
+    case AVRO_INT32: {
+        int in = 0;
+        if (avro_value_get_int(&value, &in) != 0) {
+            auto err_msg = strings::Substitute("Get int value error. column=$0", name);
+            return Status::InvalidArgument(err_msg);
+        }
+        return append_checked_numeric(column, name, in);
+    }
+    case AVRO_INT64: {
+        int64_t in = 0;
+        if (avro_value_get_long(&value, &in) != 0) {
+            auto err_msg = strings::Substitute("Get int64 value error. column=$0", name);
+            return Status::InvalidArgument(err_msg);
+        }
+        return append_checked_numeric(column, name, in);
+    }
+    case AVRO_BOOLEAN: {
+        // The Avro C API hands booleans back through an int.
+        int in = 0;
+        if (avro_value_get_boolean(&value, &in) != 0) {
+            auto err_msg = strings::Substitute("Get boolean value error. column=$0", name);
+            return Status::InvalidArgument(err_msg);
+        }
+        return append_checked_numeric(column, name, in);
+    }
+
+    case AVRO_FLOAT: {
+        float in = 0;
+        if (avro_value_get_float(&value, &in) != 0) {
+            auto err_msg = strings::Substitute("Get float value error. column=$0", name);
+            return Status::InvalidArgument(err_msg);
+        }
+        return append_checked_numeric(column, name, in);
+    }
+
+    case AVRO_DOUBLE: {
+        double in = 0;
+        if (avro_value_get_double(&value, &in) != 0) {
+            auto err_msg = strings::Substitute("Get double value error. column=$0", name);
+            return Status::InvalidArgument(err_msg);
+        }
+        return append_checked_numeric(column, name, in);
+    }
+
+    default: {
+        auto err_msg = strings::Substitute("Unsupported value type. column=$0", name);
+        return Status::DataQualityError(err_msg);
+    }
+    }
+}
+
+/// Parses an Avro string value as a number of type `T` and appends it to `column`.
+///
+/// The string is first parsed directly as `T`. If that fails it is re-parsed as a double and
+/// converted with an overflow check, so an integral column still accepts text such as "1.0".
+///
+/// @param column     Destination column.
+/// @param type_desc  Unused here.
+/// @param name       Column name, used only in error messages.
+/// @param value      Avro value expected to hold a string.
+/// @return OK on success; InvalidArgument when the string cannot be read, cannot be parsed as a
+///         number, or overflows `T`.
+template <typename T>
+static Status add_column_with_string_value_numeric(FixedLengthColumn<T>* column, const TypeDescriptor& type_desc,
+                                                   const std::string& name, const avro_value_t& value) {
+    const char* in = nullptr;
+    size_t size = 0;
+    if (avro_value_get_string(&value, &in, &size) != 0) {
+        auto err_msg = strings::Substitute("Get string value error. column=$0", name);
+        return Status::InvalidArgument(err_msg);
+    }
+
+    // The size returned for a string object will include the NUL terminator,
+    // it will be one more than you'd get from calling strlen on the content.
+    // Please refer to this link: https://avro.apache.org/docs/1.11.1/api/c/
+    // Guard the decrement so an unexpected zero size cannot wrap around to SIZE_MAX.
+    if (size > 0) {
+        --size;
+    }
+
+    StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
+
+    T v{};
+    if constexpr (std::is_floating_point<T>::value) {
+        v = StringParser::string_to_float<T>(in, size, &parse_result);
+    } else {
+        v = StringParser::string_to_int<T>(in, size, &parse_result);
+    }
+
+    if (parse_result == StringParser::PARSE_SUCCESS) {
+        column->append_numbers(&v, sizeof(v));
+        return Status::OK();
+    }
+
+    // Attempt to parse the string as a double and narrow it to T with an overflow check.
+    auto d = StringParser::string_to_float<double>(in, size, &parse_result);
+    if (parse_result == StringParser::PARSE_SUCCESS) {
+        // checked_cast() returns true on overflow.
+        if (checked_cast(d, &v)) {
+            auto err_msg = strings::Substitute("Value is overflow. column=$0, value=$1", name, d);
+            return Status::InvalidArgument(err_msg);
+        }
+        column->append_numbers(&v, sizeof(v));
+        return Status::OK();
+    }
+
+    // NOTE(review): the message says BIGINT for every T; kept as-is in case callers or tests
+    // match on the text.
+    std::string err_msg = strings::Substitute("Unable to cast string value to BIGINT. value=$0, column=$1",
+                                              std::string(in, size), name);
+    return Status::InvalidArgument(err_msg);
+}
+
+/// Appends an Avro value to a numeric column whose element type is `T`.
+///
+/// Native Avro numerics and booleans are handed to add_column_with_numeric_value(), Avro strings
+/// to add_column_with_string_value_numeric(). Every other Avro type is rejected with
+/// InvalidArgument.
+template <typename T>
+Status add_numeric_column(Column* column, const TypeDescriptor& type_desc, const std::string& name,
+                          const avro_value_t& value) {
+    auto typed_column = down_cast<FixedLengthColumn<T>*>(column);
+
+    switch (avro_value_get_type(&value)) {
+    case AVRO_STRING:
+        return add_column_with_string_value_numeric(typed_column, type_desc, name, value);
+
+    case AVRO_BOOLEAN:
+    case AVRO_INT32:
+    case AVRO_INT64:
+    case AVRO_FLOAT:
+    case AVRO_DOUBLE:
+        return add_column_with_numeric_value(typed_column, type_desc, name, value);
+
+    default:
+        break;
+    }
+
+    return Status::InvalidArgument(
+            strings::Substitute("Unsupported value type. Numeric type is required. column=$0", name));
+}
+
+// Explicit instantiations of add_numeric_column for the element types listed below.
+// The template body is defined in this .cpp file, so code in other translation units can only
+// link against these instantiations (presumably the full set callers need -- verify before
+// using a new element type).
+template Status add_numeric_column<int64_t>(Column* column, const TypeDescriptor& type_desc, const std::string& name,
+                                            const avro_value_t& value);
+template Status add_numeric_column<int32_t>(Column* column, const TypeDescriptor& type_desc, const std::string& name,
+                                            const avro_value_t& value);
+template Status add_numeric_column<int16_t>(Column* column, const TypeDescriptor& type_desc, const std::string& name,
+                                            const avro_value_t& value);
+template Status add_numeric_column<int8_t>(Column* column, const TypeDescriptor& type_desc, const std::string& name,
+                                           const avro_value_t& value);
+template Status add_numeric_column<uint8_t>(Column* column, const TypeDescriptor& type_desc, const std::string& name,
+                                            const avro_value_t& value);
+template Status add_numeric_column<double>(Column* column, const TypeDescriptor& type_desc, const std::string& name,
+                                           const avro_value_t& value);
+template Status add_numeric_column<float>(Column* column, const TypeDescriptor& type_desc, const std::string& name,
+                                          const avro_value_t& value);
+
+} // namespace starrocks
diff --git a/be/src/formats/json/numeric_column.cpp b/be/src/formats/json/numeric_column.cpp
@@ -4,6 +4,7 @@
 
 #include "column/fixed_length_column.h"
 #include "gutil/strings/substitute.h"
+#include "util/numeric_types.h"
 #include "util/string_parser.hpp"
 
 namespace starrocks::vectorized {
@@ -18,7 +19,7 @@ static inline bool checked_cast(const FromType& from, ToType* to) {
 #if defined(__clang__)
     DIAGNOSTIC_IGNORE("-Wimplicit-const-int-float-conversion")
 #endif
-    return (from < std::numeric_limits<ToType>::lowest() || from > std::numeric_limits<ToType>::max());
+    return check_number_overflow<FromType, ToType>(from);
     DIAGNOSTIC_POP
 }
 

diff --git a/be/src/util/numeric_types.h b/be/src/util/numeric_types.h
@@ -0,0 +1,58 @@
+// Copyright 2021-present StarRocks, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <limits>
+#include <type_traits>
+
+namespace starrocks {
+
+/// Inclusive lower bound, expressed in floating-point `FromType`, of the values that convert to
+/// integral `ToType` without overflow. `lowest()` is `-2^n` (or 0 for unsigned types), so the
+/// conversion to floating point is expected to be exact.
+template <typename FromType, typename ToType>
+inline constexpr FromType floating_to_intergral_lower_bound =
+        static_cast<FromType>(std::numeric_limits<ToType>::lowest());
+
+/// Exclusive upper bound, expressed in floating-point `FromType`, of the values that convert to
+/// integral `ToType` without overflow. `max()` is `2^n - 1`, which floating-point types generally
+/// cannot represent exactly, so `2^n` is built as `2 * (max() / 2 + 1)` where every step is exact.
+template <typename FromType, typename ToType>
+inline constexpr FromType floating_to_intergral_upper_bound =
+        static_cast<FromType>(2) * static_cast<FromType>(std::numeric_limits<ToType>::max() / 2 + 1);
+
+/// Check whether the value of type `FromType` overflows when converted to type `ToType`.
+/// If overflow, return true; otherwise, return false.
+///
+/// NaN is reported as overflow when converting floating-point to integral.
+template <typename FromType, typename ToType>
+bool check_number_overflow(FromType value) {
+    using FromLimits = std::numeric_limits<FromType>;
+    using ToLimits = std::numeric_limits<ToType>;
+
+    if constexpr (std::is_floating_point_v<FromType> && std::is_integral_v<ToType>) {
+        // For floating-point numbers, we cannot use `value > (FromType)std::numeric_limits<ToType>::max()` to
+        // determine whether `value` exceeds the maximum value of ToType. The reason is as follows:
+        //
+        // `std::numeric_limits<ToType>::max()` is `2^n-1`, where n is 63, 31, 15 or 7, this number cannot be
+        // exactly represented by floating-point numbers, so when converted to FromType, it will be rounded up
+        // to `2^n`. Therefore, when `value` is `2^n`, `value > (FromType)std::numeric_limits<ToType>::max()`
+        // will return false. However, in actual conversion, overflow will occur, resulting in the maximum or
+        // minimum value of ToType, depending on the architecture, compiler, and compilation parameters.
+        //
+        // Because `2^n` can be exactly represented by floating-point numbers, we use `value >= (FromType)2^n`
+        // to determine whether it is overflow, rather than `value > (FromType)2^n-1`.
+        //
+        // The condition is written as a negated "in range" test so that NaN counts as overflow.
+        return !(value >= floating_to_intergral_lower_bound<FromType, ToType> &&
+                 value < floating_to_intergral_upper_bound<FromType, ToType>);
+    } else if constexpr (FromLimits::is_integer && ToLimits::is_integer) {
+        // Integer -> integer. A limit of ToType may only be cast to FromType when FromType can hold it;
+        // otherwise the cast wraps (e.g. (int32_t)INT64_MAX == -1) and every input looks out of range.
+        // So each side is compared only when ToType is actually narrower on that side.
+        constexpr bool lower_always_fits =
+                !FromLimits::is_signed || (ToLimits::is_signed && ToLimits::digits >= FromLimits::digits);
+        constexpr bool upper_always_fits = ToLimits::digits >= FromLimits::digits;
+
+        bool overflow = false;
+        if constexpr (!lower_always_fits) {
+            overflow = overflow || (value < static_cast<FromType>(ToLimits::lowest()));
+        }
+        if constexpr (!upper_always_fits) {
+            overflow = overflow || (value > static_cast<FromType>(ToLimits::max()));
+        }
+        return overflow;
+    } else if constexpr (FromLimits::is_integer && std::is_floating_point_v<ToType> && ToLimits::radix == 2 &&
+                         ToLimits::max_exponent > FromLimits::digits) {
+        // Integer -> floating point where the floating type's finite range covers every value of
+        // FromType (|value| <= 2^digits < 2^max_exponent). Nothing can overflow, and casting the
+        // floating-point limits to FromType would itself be an out-of-range conversion.
+        return false;
+    } else {
+        // std::numeric_limits<T>::lowest() is a finite value x such that there is no other
+        // finite value y where y < x.
+        // This is different from std::numeric_limits<T>::min() for floating-point types.
+        // So we use lowest instead of min for lower bound of all types.
+        return (value < static_cast<FromType>(ToLimits::lowest())) |
+               (value > static_cast<FromType>(ToLimits::max()));
+    }
+}
+
+} // namespace starrocks
diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt
@@ -329,6 +329,13 @@ set(EXEC_FILES
         ./util/ratelimit_test.cpp
         ./util/cpu_usage_info_test.cpp
         ./util/concurrent_limiter_test.cpp
+<<<<<<< HEAD
+=======
+        ./util/stack_trace_mutex_test.cpp
+        ./util/download_util_test.cpp
+        ./util/numeric_types_test.cpp
+        ./gutil/cpu_test.cc
+>>>>>>> ebc98ddaa4 ([BugFix] Fix check of overflow for converting floating to intergal (#49707))
         ./gutil/sysinfo-test.cc
         ./service/lake_service_test.cpp
         )