diff --git a/be/src/vec/data_types/serde/data_type_array_serde.h b/be/src/vec/data_types/serde/data_type_array_serde.h index 13c40e60777258..2798596c82373b 100644 --- a/be/src/vec/data_types/serde/data_type_array_serde.h +++ b/be/src/vec/data_types/serde/data_type_array_serde.h @@ -101,6 +101,8 @@ class DataTypeArraySerDe : public DataTypeSerDe { nested_serde->set_return_object_as_string(value); } + virtual DataTypeSerDeSPtrs get_nested_serdes() const override { return {nested_serde}; } + private: template Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer& result, diff --git a/be/src/vec/data_types/serde/data_type_map_serde.h b/be/src/vec/data_types/serde/data_type_map_serde.h index 5e10a7ec3f2a20..d95726824703ca 100644 --- a/be/src/vec/data_types/serde/data_type_map_serde.h +++ b/be/src/vec/data_types/serde/data_type_map_serde.h @@ -95,6 +95,10 @@ class DataTypeMapSerDe : public DataTypeSerDe { value_serde->set_return_object_as_string(value); } + virtual DataTypeSerDeSPtrs get_nested_serdes() const override { + return {key_serde, value_serde}; + } + private: template Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer& result, diff --git a/be/src/vec/data_types/serde/data_type_nullable_serde.h b/be/src/vec/data_types/serde/data_type_nullable_serde.h index e9af344fb65f75..c7dac856621542 100644 --- a/be/src/vec/data_types/serde/data_type_nullable_serde.h +++ b/be/src/vec/data_types/serde/data_type_nullable_serde.h @@ -99,6 +99,8 @@ class DataTypeNullableSerDe : public DataTypeSerDe { int row_num) const override; Status read_one_cell_from_json(IColumn& column, const rapidjson::Value& result) const override; + virtual DataTypeSerDeSPtrs get_nested_serdes() const override { return {nested_serde}; } + private: template Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer& result, diff --git a/be/src/vec/data_types/serde/data_type_serde.h b/be/src/vec/data_types/serde/data_type_serde.h index 46236faa926c6f..6caa51d2663089 
100644 --- a/be/src/vec/data_types/serde/data_type_serde.h +++ b/be/src/vec/data_types/serde/data_type_serde.h @@ -98,6 +98,10 @@ class IColumn; class Arena; class IDataType; +class DataTypeSerDe; +using DataTypeSerDeSPtr = std::shared_ptr; +using DataTypeSerDeSPtrs = std::vector; + // Deserialize means read from different file format or memory format, // for example read from arrow, read from parquet. // Serialize means write the column cell or the total column into another @@ -332,6 +336,11 @@ class DataTypeSerDe { Arena& mem_pool, int row_num) const; virtual Status read_one_cell_from_json(IColumn& column, const rapidjson::Value& result) const; + virtual DataTypeSerDeSPtrs get_nested_serdes() const { + throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, + "Method get_nested_serdes is not supported for this serde"); + } + protected: bool _return_object_as_string = false; // This parameter indicates what level the serde belongs to and is mainly used for complex types @@ -374,9 +383,6 @@ inline void checkArrowStatus(const arrow::Status& status, const std::string& col } } -using DataTypeSerDeSPtr = std::shared_ptr; -using DataTypeSerDeSPtrs = std::vector; - DataTypeSerDeSPtrs create_data_type_serdes( const std::vector>& types); DataTypeSerDeSPtrs create_data_type_serdes(const std::vector& slots); diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.h b/be/src/vec/data_types/serde/data_type_struct_serde.h index 84e988e150bbdc..5cd6f89e42fd6e 100644 --- a/be/src/vec/data_types/serde/data_type_struct_serde.h +++ b/be/src/vec/data_types/serde/data_type_struct_serde.h @@ -171,6 +171,8 @@ class DataTypeStructSerDe : public DataTypeSerDe { } } + virtual DataTypeSerDeSPtrs get_nested_serdes() const override { return elem_serdes_ptrs; } + private: std::optional try_get_position_by_name(const String& name) const; diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp index e3c2c1f332e11b..307edc265beb6d 
100644 --- a/be/src/vec/exec/format/json/new_json_reader.cpp +++ b/be/src/vec/exec/format/json/new_json_reader.cpp @@ -54,8 +54,11 @@ #include "util/slice.h" #include "util/uid_util.h" #include "vec/columns/column.h" +#include "vec/columns/column_array.h" +#include "vec/columns/column_map.h" #include "vec/columns/column_nullable.h" #include "vec/columns/column_string.h" +#include "vec/columns/column_struct.h" #include "vec/common/assert_cast.h" #include "vec/common/typeid_cast.h" #include "vec/core/block.h" @@ -164,10 +167,18 @@ void NewJsonReader::_init_file_description() { } Status NewJsonReader::init_reader( - const std::unordered_map& col_default_value_ctx) { + const std::unordered_map& col_default_value_ctx, + bool is_load) { + _is_load = is_load; + // generate _col_default_value_map RETURN_IF_ERROR(_get_column_default_value(_file_slot_descs, col_default_value_ctx)); + //use serde insert data to column. + for (auto* slot_desc : _file_slot_descs) { + _serdes.emplace_back(slot_desc->get_data_type_ptr()->get_serde()); + } + // create decompressor. // _decompressor may be nullptr if this is not a compressed file RETURN_IF_ERROR(Decompressor::create_decompressor(_file_compress_type, &_decompressor)); @@ -387,6 +398,9 @@ Status NewJsonReader::_get_range_params() { if (_params.file_attributes.__isset.fuzzy_parse) { _fuzzy_parse = _params.file_attributes.fuzzy_parse; } + if (_range.table_format_params.table_format_type == "hive") { + _is_hive_table = true; + } return Status::OK(); } @@ -474,8 +488,8 @@ Status NewJsonReader::_vhandle_simple_json(RuntimeState* /*state*/, Block& block bool valid = false; if (_next_row >= _total_rows) { // parse json and generic document Status st = _parse_json(is_empty_row, eof); - if (st.is()) { - continue; // continue to read next + if (_is_load && st.is()) { + continue; // continue to read next (for load, after this , already append error to file.) 
} RETURN_IF_ERROR(st); if (*is_empty_row) { @@ -752,7 +766,21 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl int ctx_idx = 0; bool has_valid_value = false; - for (auto* slot_desc : slot_descs) { + + if (_is_hive_table) { + // Unlike _fuzzy_parse, _name_map has to be refreshed for every row that is read. + + for (int i = 0; i < objectValue.MemberCount(); ++i) { + auto it = objectValue.MemberBegin() + i; + std::string field_name(it->name.GetString(), it->name.GetStringLength()); + std::transform(field_name.begin(), field_name.end(), field_name.begin(), ::tolower); + + // Overwrite on purpose: the last value with the same (lower-cased) name wins; emplace() would keep the first. + _name_map[field_name] = i; + } + } + for (size_t slot_idx = 0; slot_idx < slot_descs.size(); ++slot_idx) { + auto* slot_desc = slot_descs[slot_idx]; if (!slot_desc->is_materialized()) { continue; } @@ -761,7 +789,7 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl auto* column_ptr = block.get_by_position(dest_index).column->assume_mutable().get(); rapidjson::Value::ConstMemberIterator it = objectValue.MemberEnd(); - if (_fuzzy_parse) { + if (_fuzzy_parse || _is_hive_table) { auto idx_it = _name_map.find(slot_desc->col_name()); if (idx_it != _name_map.end() && idx_it->second < objectValue.MemberCount()) { it = objectValue.MemberBegin() + idx_it->second; @@ -773,20 +801,21 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl if (it != objectValue.MemberEnd()) { const rapidjson::Value& value = it->value; - RETURN_IF_ERROR(_write_data_to_column(&value, slot_desc, column_ptr, valid)); + RETURN_IF_ERROR(_write_data_to_column(&value, slot_desc->type(), column_ptr, + slot_desc->col_name(), _serdes[slot_idx], valid)); if (!(*valid)) { return Status::OK(); } has_valid_value = true; } else { // not found, filling with default value - RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid)); + RETURN_IF_ERROR(_fill_missing_column(slot_desc, _serdes[slot_idx], 
column_ptr, valid)); if (!(*valid)) { return Status::OK(); } } } - if (!has_valid_value) { + if (!has_valid_value && _is_load) { // there is no valid value in json line but has filled with default value before // so remove this line in block string col_names; @@ -810,79 +839,188 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl } Status NewJsonReader::_write_data_to_column(rapidjson::Value::ConstValueIterator value, - SlotDescriptor* slot_desc, IColumn* column_ptr, + const TypeDescriptor& type_desc, + vectorized::IColumn* column_ptr, + const std::string& column_name, DataTypeSerDeSPtr serde, bool* valid) { - const char* str_value = nullptr; - char tmp_buf[128] = {0}; - int32_t wbytes = 0; - std::string json_str; - ColumnNullable* nullable_column = nullptr; - if (slot_desc->is_nullable()) { + vectorized::IColumn* data_column_ptr = column_ptr; + DataTypeSerDeSPtr data_serde = serde; + + bool value_is_null = (value == nullptr) || (value->GetType() == rapidjson::Type::kNullType); + + if (column_ptr->is_nullable()) { nullable_column = reinterpret_cast(column_ptr); - // kNullType will put 1 into the Null map, so there is no need to push 0 for kNullType. 
- if (value->GetType() != rapidjson::Type::kNullType) { + data_column_ptr = nullable_column->get_nested_column().get_ptr(); + data_serde = serde->get_nested_serdes()[0]; + + if (value_is_null) { + nullable_column->insert_default(); + *valid = true; + return Status::OK(); + } else { nullable_column->get_null_map_data().push_back(0); + } + + } else if (value_is_null) [[unlikely]] { + if (_is_load) { + RETURN_IF_ERROR(_append_error_msg( + *value, "Json value is null, but the column `{}` is not nullable.", column_name, + valid)); + return Status::OK(); + } else { - nullable_column->insert_default(); + return Status::DataQualityError( + "Json value is null, but the column `{}` is not nullable.", column_name); } - column_ptr = &nullable_column->get_nested_column(); } - switch (value->GetType()) { - case rapidjson::Type::kStringType: - str_value = value->GetString(); - wbytes = value->GetStringLength(); - break; - case rapidjson::Type::kNumberType: - if (value->IsUint()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%u", value->GetUint()); - } else if (value->IsInt()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%d", value->GetInt()); - } else if (value->IsUint64()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRIu64, value->GetUint64()); - } else if (value->IsInt64()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRId64, value->GetInt64()); - } else if (value->IsFloat() || value->IsDouble()) { - auto* end = fmt::format_to(tmp_buf, "{}", value->GetDouble()); - wbytes = end - tmp_buf; + if (_is_load || !type_desc.is_complex_type()) { + if (value->IsString()) { + Slice slice {value->GetString(), value->GetStringLength()}; + RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, + _serde_options)); + } else { - return Status::InternalError("It should not here."); + // We can `switch (value->GetType()) case: kNumberType`. + // Note that `if (value->IsInt())`, but column is FloatColumn. 
+ // Or for any type, use `NewJsonReader::_print_json_value(*value)`. + + const char* str_value = nullptr; + char tmp_buf[128] = {0}; + size_t wbytes = 0; + std::string json_str; + + switch (value->GetType()) { + case rapidjson::Type::kStringType: + str_value = value->GetString(); + wbytes = value->GetStringLength(); + break; + case rapidjson::Type::kNumberType: + if (value->IsUint()) { + wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%u", value->GetUint()); + } else if (value->IsInt()) { + wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%d", value->GetInt()); + } else if (value->IsUint64()) { + wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRIu64, value->GetUint64()); + } else if (value->IsInt64()) { + wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRId64, value->GetInt64()); + } else if (value->IsFloat() || value->IsDouble()) { + auto* end = fmt::format_to(tmp_buf, "{}", value->GetDouble()); + wbytes = end - tmp_buf; + } else { + return Status::InternalError("It should not here."); + } + str_value = tmp_buf; + break; + case rapidjson::Type::kFalseType: + wbytes = 1; + str_value = (char*)"0"; + break; + case rapidjson::Type::kTrueType: + wbytes = 1; + str_value = (char*)"1"; + break; + default: + // for other type, we convert it to string to save + json_str = NewJsonReader::_print_json_value(*value); + wbytes = json_str.size(); + str_value = json_str.c_str(); + break; + } + Slice slice {str_value, wbytes}; + RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, + _serde_options)); } - str_value = tmp_buf; - break; - case rapidjson::Type::kFalseType: - wbytes = 1; - str_value = (char*)"0"; - break; - case rapidjson::Type::kTrueType: - wbytes = 1; - str_value = (char*)"1"; - break; - case rapidjson::Type::kNullType: - if (!slot_desc->is_nullable()) { - RETURN_IF_ERROR(_append_error_msg( - *value, "Json value is null, but the column `{}` is not nullable.", - slot_desc->col_name(), valid)); - return Status::OK(); + } else if (type_desc.type 
== TYPE_STRUCT) { + if (!value->IsObject()) [[unlikely]] { + return Status::DataQualityError( + "Json value isn't object, but the column `{}` is struct.", column_name); } - // return immediately to prevent from repeatedly insert_data - *valid = true; - return Status::OK(); - default: - // for other type like array or object. we convert it to string to save - json_str = NewJsonReader::_print_json_value(*value); - wbytes = json_str.size(); - str_value = json_str.c_str(); - break; - } + auto sub_col_size = type_desc.children.size(); + const auto& struct_value = value->GetObject(); + + auto sub_serdes = data_serde->get_nested_serdes(); + auto struct_column_ptr = assert_cast(data_column_ptr); + + std::map sub_col_name_to_idx; + for (size_t sub_col_idx = 0; sub_col_idx < sub_col_size; sub_col_idx++) { + sub_col_name_to_idx.emplace(type_desc.field_names[sub_col_idx], sub_col_idx); + } + + std::vector sub_values(sub_col_size, nullptr); + for (const auto& sub : struct_value) { + if (!sub.name.IsString()) [[unlikely]] { + return Status::DataQualityError( + "Json file struct column `{}` subfield name isn't a String", column_name); + } + + auto sub_key_char = sub.name.GetString(); + auto sub_key_length = sub.name.GetStringLength(); + + std::string sub_key(sub_key_char, sub_key_length); + std::transform(sub_key.begin(), sub_key.end(), sub_key.begin(), ::tolower); + + if (sub_col_name_to_idx.find(sub_key) == sub_col_name_to_idx.end()) [[unlikely]] { + continue; + } + size_t sub_column_idx = sub_col_name_to_idx[sub_key]; + sub_values[sub_column_idx] = &sub.value; + } + + for (size_t sub_col_idx = 0; sub_col_idx < sub_col_size; sub_col_idx++) { + auto sub_value = sub_values[sub_col_idx]; + + const auto& sub_col_type = type_desc.children[sub_col_idx]; + + RETURN_IF_ERROR(_write_data_to_column( + sub_value, sub_col_type, struct_column_ptr->get_column(sub_col_idx).get_ptr(), + column_name + "." 
+ type_desc.field_names[sub_col_idx], sub_serdes[sub_col_idx], + valid)); + } + } else if (type_desc.type == TYPE_MAP) { + if (!value->IsObject()) [[unlikely]] { + return Status::DataQualityError("Json value isn't object, but the column `{}` is map.", + column_name); + } + const auto& object_value = value->GetObject(); + auto sub_serdes = data_serde->get_nested_serdes(); + auto map_column_ptr = assert_cast(data_column_ptr); - // TODO: if the vexpr can support another 'slot_desc type' than 'TYPE_VARCHAR', - // we need use a function to support these types to insert data in columns. - DCHECK(slot_desc->type().type == TYPE_VARCHAR || slot_desc->type().type == TYPE_STRING) - << slot_desc->type().type << ", query id: " << print_id(_state->query_id()); - assert_cast(column_ptr)->insert_data(str_value, wbytes); + for (const auto& member_value : object_value) { + RETURN_IF_ERROR(_write_data_to_column( + &member_value.name, type_desc.children[0], + map_column_ptr->get_keys_ptr()->assume_mutable()->get_ptr(), + column_name + ".key", sub_serdes[0], valid)); + + RETURN_IF_ERROR(_write_data_to_column( + &member_value.value, type_desc.children[1], + map_column_ptr->get_values_ptr()->assume_mutable()->get_ptr(), + column_name + ".value", sub_serdes[1], valid)); + } + + auto& offsets = map_column_ptr->get_offsets(); + offsets.emplace_back(offsets.back() + object_value.MemberCount()); + } else if (type_desc.type == TYPE_ARRAY) { + if (!value->IsArray()) [[unlikely]] { + return Status::DataQualityError("Json value isn't array, but the column `{}` is array.", + column_name); + } + const auto& array_value = value->GetArray(); + auto sub_serdes = data_serde->get_nested_serdes(); + auto array_column_ptr = assert_cast(data_column_ptr); + + for (const auto& sub_value : array_value) { + RETURN_IF_ERROR(_write_data_to_column(&sub_value, type_desc.children[0], + array_column_ptr->get_data().get_ptr(), + column_name + ".element", sub_serdes[0], valid)); + } + auto& offsets = 
array_column_ptr->get_offsets(); + offsets.emplace_back(offsets.back() + array_value.Size()); + } else { + return Status::InternalError("Not support load to complex column."); + } *valid = true; return Status::OK(); @@ -914,20 +1052,21 @@ Status NewJsonReader::_write_columns_by_jsonpath(rapidjson::Value& objectValue, // if json_values' size > 1, it means we just match an array, not a wrapped one, so no need to unwrap. json_values = &((*json_values)[0]); } - RETURN_IF_ERROR(_write_data_to_column(json_values, slot_descs[i], column_ptr, valid)); + RETURN_IF_ERROR(_write_data_to_column(json_values, slot_descs[i]->type(), column_ptr, + slot_descs[i]->col_name(), _serdes[i], valid)); if (!(*valid)) { return Status::OK(); } has_valid_value = true; } else { // not found, filling with default value - RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid)); + RETURN_IF_ERROR(_fill_missing_column(slot_desc, _serdes[i], column_ptr, valid)); if (!(*valid)) { return Status::OK(); } } } - if (!has_valid_value) { + if (!has_valid_value && _is_load) { // there is no valid value in json line but has filled with default value before // so remove this line in block for (int i = 0; i < block.columns(); ++i) { @@ -1074,7 +1213,7 @@ Status NewJsonReader::_simdjson_handle_simple_json(RuntimeState* /*state*/, Bloc // step2: get json value by json doc Status st = _get_json_value(&size, eof, &error, is_empty_row); - if (st.is()) { + if (_is_load && st.is()) { return Status::OK(); } RETURN_IF_ERROR(st); @@ -1349,25 +1488,39 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val for (auto field : *value) { std::string_view key = field.unescaped_key(); StringRef name_ref(key.data(), key.size()); + std::string key_string; + if (_is_hive_table) { + key_string = name_ref.to_string(); + std::transform(key_string.begin(), key_string.end(), key_string.begin(), ::tolower); + name_ref = StringRef(key_string); + } const size_t column_index = 
_column_index(name_ref, key_index++); if (UNLIKELY(ssize_t(column_index) < 0)) { // This key is not exist in slot desc, just ignore continue; } if (_seen_columns[column_index]) { - continue; + if (_is_hive_table) { + //Since value can only be traversed once, + // we can only insert the original value first, then delete it, and then reinsert the new value + block.get_by_position(column_index).column->assume_mutable()->pop_back(1); + } else { + continue; + } } simdjson::ondemand::value val = field.value(); auto* column_ptr = block.get_by_position(column_index).column->assume_mutable().get(); - RETURN_IF_ERROR( - _simdjson_write_data_to_column(val, slot_descs[column_index], column_ptr, valid)); + RETURN_IF_ERROR(_simdjson_write_data_to_column( + val, slot_descs[column_index]->type(), column_ptr, + slot_descs[column_index]->col_name(), _serdes[column_index], valid)); if (!(*valid)) { return Status::OK(); } _seen_columns[column_index] = true; has_valid_value = true; } - if (!has_valid_value) { + + if (!has_valid_value && _is_load) { string col_names; for (auto* slot_desc : slot_descs) { col_names.append(slot_desc->col_name() + ", "); @@ -1400,7 +1553,7 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val auto* column_ptr = block.get_by_position(i).column->assume_mutable().get(); if (column_ptr->size() < cur_row_count + 1) { DCHECK(column_ptr->size() == cur_row_count); - RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid)); + RETURN_IF_ERROR(_fill_missing_column(slot_desc, _serdes[i], column_ptr, valid)); if (!(*valid)) { return Status::OK(); } @@ -1409,12 +1562,6 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val DCHECK(column_ptr->size() == cur_row_count + 1); } -#ifndef NDEBUG - // Check all columns rows matched - for (size_t i = 0; i < block.columns(); ++i) { - DCHECK_EQ(block.get_by_position(i).column->size(), cur_row_count + 1); - } -#endif // There is at least one valid value here 
DCHECK(nullcount < block.columns()); *valid = true; @@ -1422,54 +1569,180 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val } Status NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value& value, - SlotDescriptor* slot_desc, IColumn* column, - bool* valid) { - // write + const TypeDescriptor& type_desc, + vectorized::IColumn* column_ptr, + const std::string& column_name, + DataTypeSerDeSPtr serde, bool* valid) { ColumnNullable* nullable_column = nullptr; - IColumn* column_ptr = nullptr; - if (slot_desc->is_nullable()) { - nullable_column = assert_cast(column); - column_ptr = &nullable_column->get_nested_column(); - } - // TODO: if the vexpr can support another 'slot_desc type' than 'TYPE_VARCHAR', - // we need use a function to support these types to insert data in columns. - auto* column_string = assert_cast(column_ptr); - switch (value.type()) { - case simdjson::ondemand::json_type::null: { - if (column->is_nullable()) { - // insert_default already push 1 to null_map - nullable_column->insert_default(); + vectorized::IColumn* data_column_ptr = column_ptr; + DataTypeSerDeSPtr data_serde = serde; + + if (column_ptr->is_nullable()) { + nullable_column = reinterpret_cast(column_ptr); + + data_column_ptr = nullable_column->get_nested_column().get_ptr(); + data_serde = serde->get_nested_serdes()[0]; + + // kNullType will put 1 into the Null map, so there is no need to push 0 for kNullType. 
+ if (value.type() != simdjson::ondemand::json_type::null) { + nullable_column->get_null_map_data().push_back(0); } else { + nullable_column->insert_default(); + *valid = true; + return Status::OK(); + } + } else if (value.type() == simdjson::ondemand::json_type::null) [[unlikely]] { + if (_is_load) { RETURN_IF_ERROR(_append_error_msg( nullptr, "Json value is null, but the column `{}` is not nullable.", - slot_desc->col_name(), valid)); + column_name, valid)); return Status::OK(); - } - break; - } - case simdjson::ondemand::json_type::boolean: { - nullable_column->get_null_map_data().push_back(0); - if (value.get_bool()) { - column_string->insert_data("1", 1); } else { - column_string->insert_data("0", 1); + return Status::DataQualityError( + "Json value is null, but the column `{}` is not nullable.", column_name); } - break; } - default: { + + if (_is_load || !type_desc.is_complex_type()) { if (value.type() == simdjson::ondemand::json_type::string) { - auto* unescape_buffer = - reinterpret_cast(_simdjson_ondemand_unscape_padding_buffer.data()); - std::string_view unescaped_value = - _ondemand_json_parser->unescape(value.get_raw_json_string(), unescape_buffer); - nullable_column->get_null_map_data().push_back(0); - column_string->insert_data(unescaped_value.data(), unescaped_value.length()); - break; + std::string_view value_string = value.get_string(); + Slice slice {value_string.data(), value_string.size()}; + RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, + _serde_options)); + + } else { + // Maybe we can `switch (value->GetType()) case: kNumberType`. + // Note that `if (value->IsInt())`, but column is FloatColumn. 
+ std::string_view json_str = simdjson::to_json_string(value); + Slice slice {json_str.data(), json_str.size()}; + RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, + _serde_options)); + } + } else if (type_desc.type == TYPE_STRUCT) { + if (value.type() != simdjson::ondemand::json_type::object) [[unlikely]] { + return Status::DataQualityError( + "Json value isn't object, but the column `{}` is struct.", column_name); + } + + auto sub_col_size = type_desc.children.size(); + simdjson::ondemand::object struct_value = value.get_object(); + auto sub_serdes = data_serde->get_nested_serdes(); + auto struct_column_ptr = assert_cast(data_column_ptr); + + std::map sub_col_name_to_idx; + for (size_t sub_col_idx = 0; sub_col_idx < sub_col_size; sub_col_idx++) { + sub_col_name_to_idx.emplace(type_desc.field_names[sub_col_idx], sub_col_idx); + } + vector has_value(sub_col_size, false); + for (simdjson::ondemand::field sub : struct_value) { + std::string_view sub_key_view = sub.unescaped_key(); + std::string sub_key(sub_key_view.data(), sub_key_view.length()); + std::transform(sub_key.begin(), sub_key.end(), sub_key.begin(), ::tolower); + + if (sub_col_name_to_idx.find(sub_key) == sub_col_name_to_idx.end()) [[unlikely]] { + continue; + } + size_t sub_column_idx = sub_col_name_to_idx[sub_key]; + auto sub_column_ptr = struct_column_ptr->get_column(sub_column_idx).get_ptr(); + + if (has_value[sub_column_idx]) [[unlikely]] { + // Since struct_value can only be traversed once, we can only insert + // the original value first, then delete it, and then reinsert the new value. + sub_column_ptr->pop_back(1); + } + has_value[sub_column_idx] = true; + + const auto& sub_col_type = type_desc.children[sub_column_idx]; + RETURN_IF_ERROR(_simdjson_write_data_to_column( + sub.value(), sub_col_type, sub_column_ptr, column_name + "." 
+ sub_key, + sub_serdes[sub_column_idx], valid)); } - auto value_str = simdjson::to_json_string(value).value(); - nullable_column->get_null_map_data().push_back(0); - column_string->insert_data(value_str.data(), value_str.length()); - } + + //fill missing subcolumn + for (size_t sub_col_idx = 0; sub_col_idx < sub_col_size; sub_col_idx++) { + if (has_value[sub_col_idx] == true) { + continue; + } + + auto sub_column_ptr = struct_column_ptr->get_column(sub_col_idx).get_ptr(); + if (sub_column_ptr->is_nullable()) { + sub_column_ptr->insert_default(); + continue; + } else [[unlikely]] { + return Status::DataQualityError( + "Json file structColumn miss field {} and this column isn't nullable.", + column_name + "." + type_desc.field_names[sub_col_idx]); + } + } + } else if (type_desc.type == TYPE_MAP) { + if (value.type() != simdjson::ondemand::json_type::object) [[unlikely]] { + return Status::DataQualityError("Json value isn't object, but the column `{}` is map.", + column_name); + } + simdjson::ondemand::object object_value = value.get_object(); + + auto sub_serdes = data_serde->get_nested_serdes(); + auto map_column_ptr = assert_cast(data_column_ptr); + + size_t field_count = 0; + for (simdjson::ondemand::field member_value : object_value) { + auto f = [](std::string_view key_view, const TypeDescriptor& type_desc, + vectorized::IColumn* column_ptr, DataTypeSerDeSPtr serde, + vectorized::DataTypeSerDe::FormatOptions serde_options, bool* valid) { + auto data_column_ptr = column_ptr; + auto data_serde = serde; + if (column_ptr->is_nullable()) { + auto nullable_column = static_cast(column_ptr); + + nullable_column->get_null_map_data().push_back(0); + data_column_ptr = nullable_column->get_nested_column().get_ptr(); + data_serde = serde->get_nested_serdes()[0]; + } + Slice slice(key_view.data(), key_view.length()); + + RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, + serde_options)); + return Status::OK(); + }; + + 
RETURN_IF_ERROR(f(member_value.unescaped_key(), type_desc.children[0], + map_column_ptr->get_keys_ptr()->assume_mutable()->get_ptr(), + sub_serdes[0], _serde_options, valid)); + + simdjson::ondemand::value field_value = member_value.value(); + RETURN_IF_ERROR(_simdjson_write_data_to_column( + field_value, type_desc.children[1], + map_column_ptr->get_values_ptr()->assume_mutable()->get_ptr(), + column_name + ".value", sub_serdes[1], valid)); + field_count++; + } + + auto& offsets = map_column_ptr->get_offsets(); + offsets.emplace_back(offsets.back() + field_count); + + } else if (type_desc.type == TYPE_ARRAY) { + if (value.type() != simdjson::ondemand::json_type::array) [[unlikely]] { + return Status::DataQualityError("Json value isn't array, but the column `{}` is array.", + column_name); + } + + simdjson::ondemand::array array_value = value.get_array(); + + auto sub_serdes = data_serde->get_nested_serdes(); + auto array_column_ptr = assert_cast(data_column_ptr); + + int field_count = 0; + for (simdjson::ondemand::value sub_value : array_value) { + RETURN_IF_ERROR(_simdjson_write_data_to_column( + sub_value, type_desc.children[0], array_column_ptr->get_data().get_ptr(), + column_name + ".element", sub_serdes[0], valid)); + field_count++; + } + auto& offsets = array_column_ptr->get_offsets(); + offsets.emplace_back(offsets.back() + field_count); + + } else { + return Status::InternalError("Not support load to complex column."); } *valid = true; return Status::OK(); @@ -1677,13 +1950,14 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath( has_valid_value = true; } else if (i >= _parsed_jsonpaths.size() || st.is()) { // not match in jsondata, filling with default value - RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid)); + RETURN_IF_ERROR(_fill_missing_column(slot_desc, _serdes[i], column_ptr, valid)); if (!(*valid)) { return Status::OK(); } } else { - RETURN_IF_ERROR( - _simdjson_write_data_to_column(json_value, slot_desc, column_ptr, 
valid)); + RETURN_IF_ERROR(_simdjson_write_data_to_column(json_value, slot_desc->type(), + column_ptr, slot_desc->col_name(), + _serdes[i], valid)); if (!(*valid)) { return Status::OK(); } @@ -1741,25 +2015,30 @@ Status NewJsonReader::_get_column_default_value( return Status::OK(); } -Status NewJsonReader::_fill_missing_column(SlotDescriptor* slot_desc, IColumn* column_ptr, - bool* valid) { - if (slot_desc->is_nullable()) { - auto* nullable_column = reinterpret_cast(column_ptr); - column_ptr = &nullable_column->get_nested_column(); - auto col_value = _col_default_value_map.find(slot_desc->col_name()); - if (col_value == _col_default_value_map.end()) { +Status NewJsonReader::_fill_missing_column(SlotDescriptor* slot_desc, DataTypeSerDeSPtr serde, + IColumn* column_ptr, bool* valid) { + auto col_value = _col_default_value_map.find(slot_desc->col_name()); + if (col_value == _col_default_value_map.end()) { + if (slot_desc->is_nullable()) { + auto* nullable_column = static_cast(column_ptr); nullable_column->insert_default(); } else { - const std::string& v_str = col_value->second; - nullable_column->get_null_map_data().push_back(0); - assert_cast(column_ptr)->insert_data(v_str.c_str(), v_str.size()); + if (_is_load) { + RETURN_IF_ERROR(_append_error_msg( + nullptr, "The column `{}` is not nullable, but it's not found in jsondata.", + slot_desc->col_name(), valid)); + } else { + return Status::DataQualityError( + "The column `{}` is not nullable, but it's not found in jsondata.", + slot_desc->col_name()); + } } } else { - RETURN_IF_ERROR(_append_error_msg( - nullptr, "The column `{}` is not nullable, but it's not found in jsondata.", - slot_desc->col_name(), valid)); + const std::string& v_str = col_value->second; + Slice column_default_value {v_str}; + RETURN_IF_ERROR(serde->deserialize_one_cell_from_json(*column_ptr, column_default_value, + _serde_options)); } - *valid = true; return Status::OK(); } diff --git a/be/src/vec/exec/format/json/new_json_reader.h 
b/be/src/vec/exec/format/json/new_json_reader.h index 0df3747b8c2a38..6828b6b2abfadb 100644 --- a/be/src/vec/exec/format/json/new_json_reader.h +++ b/be/src/vec/exec/format/json/new_json_reader.h @@ -88,7 +88,8 @@ class NewJsonReader : public GenericReader { ~NewJsonReader() override = default; Status init_reader(const std::unordered_map& - col_default_value_ctx); + col_default_value_ctx, + bool is_load); Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; Status get_columns(std::unordered_map* name_to_type, std::unordered_set* missing_cols) override; @@ -129,7 +130,8 @@ class NewJsonReader : public GenericReader { const std::vector& slot_descs, bool* valid); Status _write_data_to_column(rapidjson::Value::ConstValueIterator value, - SlotDescriptor* slot_desc, vectorized::IColumn* column_ptr, + const TypeDescriptor& type_desc, vectorized::IColumn* column_ptr, + const std::string& column_name, DataTypeSerDeSPtr serde, bool* valid); Status _write_columns_by_jsonpath(rapidjson::Value& objectValue, @@ -178,8 +180,10 @@ class NewJsonReader : public GenericReader { const std::vector& slot_descs, bool* valid); Status _simdjson_write_data_to_column(simdjson::ondemand::value& value, - SlotDescriptor* slot_desc, - vectorized::IColumn* column_ptr, bool* valid); + const TypeDescriptor& type_desc, + vectorized::IColumn* column_ptr, + const std::string& column_name, DataTypeSerDeSPtr serde, + bool* valid); Status _simdjson_write_columns_by_jsonpath(simdjson::ondemand::object* value, const std::vector& slot_descs, @@ -197,8 +201,8 @@ class NewJsonReader : public GenericReader { const std::unordered_map& col_default_value_ctx); - Status _fill_missing_column(SlotDescriptor* slot_desc, vectorized::IColumn* column_ptr, - bool* valid); + Status _fill_missing_column(SlotDescriptor* slot_desc, DataTypeSerDeSPtr serde, + vectorized::IColumn* column_ptr, bool* valid); RuntimeState* _state = nullptr; RuntimeProfile* _profile = nullptr; @@ -283,6 +287,22 @@ class 
NewJsonReader : public GenericReader { std::unique_ptr _ondemand_json_parser; // column to default value string map std::unordered_map _col_default_value_map; + + bool _is_load = true; + // Used to indicate whether it is a stream load. When loading, only data will be inserted into columnString. + // If an illegal value is encountered during the load process, `_append_error_msg` should be called + // instead of directly returning `Status::DataQualityError`. + + bool _is_hive_table = false; + // In Hive: create table xxx ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'; + // Hive will not allow you to create columns with the same name but different case, including field names inside + // structs, and will automatically convert uppercase names in create sql to lowercase. However, when Hive loads data + // to table, the column names in the data may be uppercase, and there may be multiple columns with + // the same name but different capitalization. We refer to the behavior of Hive, convert all column names + // in the data to lowercase, and use the last one as the insertion value. + + DataTypeSerDeSPtrs _serdes; + vectorized::DataTypeSerDe::FormatOptions _serde_options; }; } // namespace vectorized diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 997eef02090912..ba8048f73a9719 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -931,8 +931,8 @@ Status VFileScanner::_get_next_reader() { _cur_reader = NewJsonReader::create_unique(_state, _profile, &_counter, *_params, range, _file_slot_descs, &_scanner_eof, _io_ctx.get()); - init_status = - ((NewJsonReader*)(_cur_reader.get()))->init_reader(_col_default_value_ctx); + init_status = ((NewJsonReader*)(_cur_reader.get())) + ->init_reader(_col_default_value_ctx, _is_load); break; } case TFileFormatType::FORMAT_AVRO: { diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run69.hql
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run69.hql new file mode 100644 index 00000000000000..adf0f7d56b27d9 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run69.hql @@ -0,0 +1,35 @@ +use `default`; + + +CREATE TABLE json_nested_complex_table ( + user_ID STRING, + user_PROFILE STRUCT< + name: STRING, + AGE: INT, + preferences: MAP< + STRING, + STRUCT< + preference_ID: INT, + preference_VALUES: ARRAY + > + > + >, + activity_LOG ARRAY< + STRUCT< + activity_DATE: STRING, + activities: MAP< + STRING, + STRUCT< + `DETAILS`: STRING, + metrics: MAP + > + > + > + > +) ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' + +LOCATION + '/user/doris/preinstalled_data/json/json_nested_complex_table'; + + +msck repair table json_nested_complex_table; diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run70.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run70.hql new file mode 100644 index 00000000000000..73df8cba557bcb --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run70.hql @@ -0,0 +1,73 @@ +use `default`; + + +CREATE TABLE json_all_complex_types ( + `id` int, + `boolean_col` boolean, + `tinyint_col` tinyint, + `smallint_col` smallint, + `int_col` int, + `bigint_col` bigint, + `float_col` float, + `double_col` double, + `decimal_col1` decimal(9,0), + `decimal_col2` decimal(8,4), + `decimal_col3` decimal(18,6), + `decimal_col4` decimal(38,12), + `string_col` string, + `binary_col` binary, + `date_col` date, + `timestamp_col1` timestamp, + `timestamp_col2` timestamp, + `timestamp_col3` timestamp, + `char_col1` char(50), + `char_col2` char(100), + `char_col3` char(255), + `varchar_col1` varchar(50), + `varchar_col2` varchar(100), + `varchar_col3` varchar(255), + `t_map_string` map, + `t_map_varchar` map, + `t_map_char` map, + `t_map_int` map, + `t_map_bigint` map, + 
`t_map_float` map, + `t_map_double` map, + `t_map_boolean` map, + `t_map_decimal_precision_2` map, + `t_map_decimal_precision_4` map, + `t_map_decimal_precision_8` map, + `t_map_decimal_precision_17` map, + `t_map_decimal_precision_18` map, + `t_map_decimal_precision_38` map, + `t_array_string` array, + `t_array_int` array, + `t_array_bigint` array, + `t_array_float` array, + `t_array_double` array, + `t_array_boolean` array, + `t_array_varchar` array, + `t_array_char` array, + `t_array_decimal_precision_2` array, + `t_array_decimal_precision_4` array, + `t_array_decimal_precision_8` array, + `t_array_decimal_precision_17` array, + `t_array_decimal_precision_18` array, + `t_array_decimal_precision_38` array, + `t_struct_bigint` struct, + `t_complex` map>>, + `t_struct_nested` struct>, + `t_struct_null` struct, + `t_struct_non_nulls_after_nulls` struct, + `t_nested_struct_non_nulls_after_nulls` struct>, + `t_map_null_value` map, + `t_array_string_starting_with_nulls` array, + `t_array_string_with_nulls_in_between` array, + `t_array_string_ending_with_nulls` array, + `t_array_string_all_nulls` array + ) PARTITIONED BY (`dt` string) +ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' +LOCATION + '/user/doris/preinstalled_data/json/json_all_complex_types'; + +msck repair table json_all_complex_types; diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run71.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run71.hql new file mode 100644 index 00000000000000..ec99e72d2f5780 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run71.hql @@ -0,0 +1,13 @@ +use `default`; + + +CREATE TABLE json_load_data_table ( + `id` int, + `col1` int, + `col2` struct< col2a:int, col2b:string>, + `col3` map +) ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' +LOCATION + '/user/doris/preinstalled_data/json/json_load_data_table'; + +msck repair table 
json_load_data_table; diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt1/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt1/000000_0 new file mode 100644 index 00000000000000..5fe37cbc6f098e --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt1/000000_0 @@ -0,0 +1,3 @@ +{"id":1,"boolean_col":true,"tinyint_col":127,"smallint_col":32767,"int_col":2147483647,"bigint_col":9223372036854775807,"float_col":123.45,"double_col":123456.789,"decimal_col1":123456789,"decimal_col2":1234.5678,"decimal_col3":123456.789012,"decimal_col4":123456789.012345678901,"string_col":"string_value","binary_col":"binary_value","date_col":"2024-03-20","timestamp_col1":"2024-03-20 12:00:00","timestamp_col2":"2024-03-20 12:00:00.123456789","timestamp_col3":"2024-03-20 12:00:00.123456789","char_col1":"char_value1 ","char_col2":"char_value2 ","char_col3":"char_value3 ","varchar_col1":"varchar_value1","varchar_col2":"varchar_value2","varchar_col3":"varchar_value3","t_map_string":{"key1":"value1"},"t_map_varchar":{"key1":"value1"},"t_map_char":{"a ":"b "},"t_map_int":{"1":10},"t_map_bigint":{"1":100000000000},"t_map_float":{"1.1":10.1},"t_map_double":{"1.1":10.1},"t_map_boolean":{"true":false},"t_map_decimal_precision_2":{"1.1":1.1},"t_map_decimal_precision_4":{"1.23":1.23},"t_map_decimal_precision_8":{"1.2345":1.2345},"t_map_decimal_precision_17":{"1.23456789":1.23456789},"t_map_decimal_precision_18":{"1.23456789":1.23456789},"t_map_decimal_precision_38":{"1.2345678901234568":1.2345678901234568},"t_array_string":["string1","string2"],"t_array_int":[1,2,3],"t_array_bigint":[100000000000,200000000000],"t_array_float":[1.1,2.2],"t_array_double":[1.123456789,2.123456789],"t_array_boolean":[true,false],"t_array_varchar":["varchar1","varchar2"],"t_array_char":["char1 ","char2 
"],"t_array_decimal_precision_2":[1.1,2.2],"t_array_decimal_precision_4":[1.23,2.34],"t_array_decimal_precision_8":[1.2345,2.3456],"t_array_decimal_precision_17":[1.23456789,2.34567891],"t_array_decimal_precision_18":[1.23456789,2.34567891],"t_array_decimal_precision_38":[1.2345678901234568,2.3456789012345679],"t_struct_bigint":{"s_bigint":1234567890},"t_complex":{"key":[{"s_int":123}]},"t_struct_nested":{"struct_field":["value1","value2"]},"t_struct_null":{"struct_field_null":null,"struct_field_null2":null},"t_struct_non_nulls_after_nulls":{"struct_non_nulls_after_nulls1":123,"struct_non_nulls_after_nulls2":"value"},"t_nested_struct_non_nulls_after_nulls":{"struct_field1":123,"struct_field2":"value","strict_field3":{"nested_struct_field1":123,"nested_struct_field2":"nested_value"}},"t_map_null_value":{"null_key":null},"t_array_string_starting_with_nulls":[null,"value1","value2"],"t_array_string_with_nulls_in_between":["value1",null,"value2"],"t_array_string_ending_with_nulls":["value1","value2",null],"t_array_string_all_nulls":[null,null,null]} +{"id":2,"boolean_col":false,"tinyint_col":58,"smallint_col":12345,"int_col":2147483000,"bigint_col":null,"float_col":789.56,"double_col":654321.123,"decimal_col1":987654321,"decimal_col2":5678.1234,"decimal_col3":987654.321098,"decimal_col4":987654321.098765432109,"string_col":"changed_string","binary_col":"new_binary_value","date_col":"2025-05-25","timestamp_col1":"2025-05-25 15:30:00","timestamp_col2":"2025-05-25 15:30:00.654321987","timestamp_col3":"2025-05-25 15:30:00.654321987","char_col1":"char_new_value1 ","char_col2":"char_new_value2 ","char_col3":"char_new_value3 ","varchar_col1":"varchar_new_value1","varchar_col2":"varchar_new_value2","varchar_col3":"varchar_new_value3","t_map_string":{"key2":"value2"},"t_map_varchar":{"key2":"value2"},"t_map_char":{"x ":"y 
"},"t_map_int":{"2":20},"t_map_bigint":{"2":200000000000},"t_map_float":{"2.2":20.2},"t_map_double":{"2.2":20.2},"t_map_boolean":{"false":true},"t_map_decimal_precision_2":{"2.2":2.2},"t_map_decimal_precision_4":{"2.34":2.34},"t_map_decimal_precision_8":{"2.3456":2.3456},"t_map_decimal_precision_17":{"2.34567891":2.34567891},"t_map_decimal_precision_18":{"2.34567891":2.34567891},"t_map_decimal_precision_38":{"2.3456789012345679":2.3456789012345679},"t_array_string":["string3","string4"],"t_array_int":[4,5,6],"t_array_bigint":[300000000000,400000000000],"t_array_float":[2.2,3.3],"t_array_double":[2.123456789,3.123456789],"t_array_boolean":[false,true],"t_array_varchar":["varchar3","varchar4"],"t_array_char":["char3 ","char4 "],"t_array_decimal_precision_2":[2.2,3.3],"t_array_decimal_precision_4":[2.34,3.45],"t_array_decimal_precision_8":[2.3456,3.4567],"t_array_decimal_precision_17":[2.34567891,3.45678901],"t_array_decimal_precision_18":[2.34567891,3.45678901],"t_array_decimal_precision_38":[2.3456789012345679,3.4567890123456789],"t_struct_bigint":{"s_bigint":9876543210},"t_complex":{"key2":[{"s_int":456}]},"t_struct_nested":{"struct_field":["new_value1","new_value2"]},"t_struct_null":{"struct_field_null":null,"struct_field_null2":null},"t_struct_non_nulls_after_nulls":{"struct_non_nulls_after_nulls1":456,"struct_non_nulls_after_nulls2":"new_value"},"t_nested_struct_non_nulls_after_nulls":{"struct_field1":456,"struct_field2":"new_value","strict_field3":{"nested_struct_field1":456,"nested_struct_field2":"nested_value2"}},"t_map_null_value":{"null_key":null},"t_array_string_starting_with_nulls":[null,"new_value1","new_value2"],"t_array_string_with_nulls_in_between":["new_value1",null,"new_value2"],"t_array_string_ending_with_nulls":["new_value1","new_value2",null],"t_array_string_all_nulls":[null,null,null]} 
+{"id":3,"boolean_col":false,"tinyint_col":-128,"smallint_col":-32768,"int_col":-2147483648,"bigint_col":-9223372036854775808,"float_col":-3.4028235E38,"double_col":-1.7976931348623157E308,"decimal_col1":-999999999,"decimal_col2":-9999.9999,"decimal_col3":-999999999.999999,"decimal_col4":null,"string_col":"min_string_value","binary_col":"xxxx","date_col":"2001-01-01","timestamp_col1":"2001-01-01 00:00:00","timestamp_col2":"2001-01-01 00:00:00","timestamp_col3":"2001-01-01 00:00:00","char_col1":"char_min_value1 ","char_col2":"char_min_value2 ","char_col3":"char_min_value3 ","varchar_col1":"varchar_min_value1","varchar_col2":"varchar_min_value2","varchar_col3":"varchar_min_value3","t_map_string":{"min_key":"min_value"},"t_map_varchar":{"min_key":"min_value"},"t_map_char":{"a ":"z "},"t_map_int":{"-1":-100},"t_map_bigint":{"-1":-100000000000},"t_map_float":{"-1.1":-10.1},"t_map_double":{"-1.1":-10.1},"t_map_boolean":{"false":true},"t_map_decimal_precision_2":{"-1.1":-1.1},"t_map_decimal_precision_4":{"-1.23":-1.23},"t_map_decimal_precision_8":{"-1.2345":-1.2345},"t_map_decimal_precision_17":{"-1.23456789":-1.23456789},"t_map_decimal_precision_18":{"-1.23456789":-1.23456789},"t_map_decimal_precision_38":{"-1.2345678901234568":-1.2345678901234568},"t_array_string":["min_string1","min_string2"],"t_array_int":[-10,-5,-3],"t_array_bigint":[-100000000000,-200000000000],"t_array_float":[-1.1,-2.2],"t_array_double":[-1.123456789,-2.123456789],"t_array_boolean":[false,true],"t_array_varchar":["min_varchar1","min_varchar2"],"t_array_char":["min_char1 ","min_char2 
"],"t_array_decimal_precision_2":[-1.1,-2.2],"t_array_decimal_precision_4":[-1.23,-2.34],"t_array_decimal_precision_8":[-1.2345,-2.3456],"t_array_decimal_precision_17":[-1.23456789,-2.34567891],"t_array_decimal_precision_18":[-1.23456789,-2.34567891],"t_array_decimal_precision_38":[-1.2345678901234568,-2.3456789012345679],"t_struct_bigint":{"s_bigint":-1234567890},"t_complex":{"min_key":[{"s_int":-123}]},"t_struct_nested":{"struct_field":["min_value1","min_value2"]},"t_struct_null":{"struct_field_null":null,"struct_field_null2":null},"t_struct_non_nulls_after_nulls":{"struct_non_nulls_after_nulls1":-123,"struct_non_nulls_after_nulls2":"min_value"},"t_nested_struct_non_nulls_after_nulls":{"struct_field1":-123,"struct_field2":"min_value","strict_field3":{"nested_struct_field1":-123,"nested_struct_field2":"nested_value"}},"t_map_null_value":{"null_key":null},"t_array_string_starting_with_nulls":[null,"min_value1","min_value2"],"t_array_string_with_nulls_in_between":["min_value1",null,"min_value2"],"t_array_string_ending_with_nulls":["min_value1","min_value2",null],"t_array_string_all_nulls":[null,null,null]} diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt2/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt2/000000_0 new file mode 100644 index 00000000000000..0a823bee693d76 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt2/000000_0 @@ -0,0 +1 @@ 
+{"id":4,"boolean_col":null,"tinyint_col":null,"smallint_col":null,"int_col":null,"bigint_col":null,"float_col":123.45,"double_col":null,"decimal_col1":null,"decimal_col2":null,"decimal_col3":null,"decimal_col4":null,"string_col":null,"binary_col":null,"date_col":null,"timestamp_col1":null,"timestamp_col2":null,"timestamp_col3":null,"char_col1":null,"char_col2":null,"char_col3":null,"varchar_col1":null,"varchar_col2":null,"varchar_col3":null,"t_map_string":null,"t_map_varchar":null,"t_map_char":null,"t_map_int":{"1":10},"t_map_bigint":null,"t_map_float":null,"t_map_double":null,"t_map_boolean":null,"t_map_decimal_precision_2":null,"t_map_decimal_precision_4":null,"t_map_decimal_precision_8":null,"t_map_decimal_precision_17":null,"t_map_decimal_precision_18":null,"t_map_decimal_precision_38":null,"t_array_string":null,"t_array_int":null,"t_array_bigint":null,"t_array_float":null,"t_array_double":null,"t_array_boolean":null,"t_array_varchar":null,"t_array_char":null,"t_array_decimal_precision_2":null,"t_array_decimal_precision_4":null,"t_array_decimal_precision_8":[1.2345,2.3456],"t_array_decimal_precision_17":null,"t_array_decimal_precision_18":null,"t_array_decimal_precision_38":null,"t_struct_bigint":null,"t_complex":null,"t_struct_nested":null,"t_struct_null":null,"t_struct_non_nulls_after_nulls":null,"t_nested_struct_non_nulls_after_nulls":null,"t_map_null_value":null,"t_array_string_starting_with_nulls":[null,"value1","value2"],"t_array_string_with_nulls_in_between":null,"t_array_string_ending_with_nulls":null,"t_array_string_all_nulls":null} diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt3/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt3/000000_0 new file mode 100644 index 00000000000000..a5e46399fdd553 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt3/000000_0 @@ 
-0,0 +1,2 @@ +{"id":5,"boolean_col":null,"tinyint_col":null,"smallint_col":null,"int_col":null,"bigint_col":null,"float_col":null,"double_col":null,"decimal_col1":null,"decimal_col2":null,"decimal_col3":null,"decimal_col4":null,"string_col":null,"binary_col":null,"date_col":null,"timestamp_col1":null,"timestamp_col2":null,"timestamp_col3":null,"char_col1":null,"char_col2":null,"char_col3":null,"varchar_col1":null,"varchar_col2":null,"varchar_col3":null,"t_map_string":null,"t_map_varchar":null,"t_map_char":null,"t_map_int":null,"t_map_bigint":null,"t_map_float":null,"t_map_double":null,"t_map_boolean":null,"t_map_decimal_precision_2":null,"t_map_decimal_precision_4":null,"t_map_decimal_precision_8":null,"t_map_decimal_precision_17":null,"t_map_decimal_precision_18":null,"t_map_decimal_precision_38":null,"t_array_string":null,"t_array_int":null,"t_array_bigint":null,"t_array_float":null,"t_array_double":null,"t_array_boolean":null,"t_array_varchar":null,"t_array_char":null,"t_array_decimal_precision_2":null,"t_array_decimal_precision_4":null,"t_array_decimal_precision_8":null,"t_array_decimal_precision_17":null,"t_array_decimal_precision_18":null,"t_array_decimal_precision_38":null,"t_struct_bigint":null,"t_complex":null,"t_struct_nested":null,"t_struct_null":null,"t_struct_non_nulls_after_nulls":null,"t_nested_struct_non_nulls_after_nulls":null,"t_map_null_value":null,"t_array_string_starting_with_nulls":null,"t_array_string_with_nulls_in_between":null,"t_array_string_ending_with_nulls":null,"t_array_string_all_nulls":null} 
+{"id":6,"boolean_col":null,"tinyint_col":null,"smallint_col":null,"int_col":null,"bigint_col":null,"float_col":null,"double_col":null,"decimal_col1":null,"decimal_col2":null,"decimal_col3":null,"decimal_col4":null,"string_col":null,"binary_col":null,"date_col":null,"timestamp_col1":null,"timestamp_col2":null,"timestamp_col3":null,"char_col1":null,"char_col2":null,"char_col3":null,"varchar_col1":null,"varchar_col2":null,"varchar_col3":null,"t_map_string":null,"t_map_varchar":null,"t_map_char":null,"t_map_int":null,"t_map_bigint":null,"t_map_float":null,"t_map_double":null,"t_map_boolean":null,"t_map_decimal_precision_2":null,"t_map_decimal_precision_4":null,"t_map_decimal_precision_8":null,"t_map_decimal_precision_17":null,"t_map_decimal_precision_18":null,"t_map_decimal_precision_38":null,"t_array_string":null,"t_array_int":null,"t_array_bigint":null,"t_array_float":null,"t_array_double":null,"t_array_boolean":null,"t_array_varchar":null,"t_array_char":null,"t_array_decimal_precision_2":null,"t_array_decimal_precision_4":null,"t_array_decimal_precision_8":null,"t_array_decimal_precision_17":null,"t_array_decimal_precision_18":null,"t_array_decimal_precision_38":null,"t_struct_bigint":null,"t_complex":null,"t_struct_nested":null,"t_struct_null":null,"t_struct_non_nulls_after_nulls":null,"t_nested_struct_non_nulls_after_nulls":null,"t_map_null_value":null,"t_array_string_starting_with_nulls":null,"t_array_string_with_nulls_in_between":null,"t_array_string_ending_with_nulls":null,"t_array_string_all_nulls":null} diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_load_data_table/1 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_load_data_table/1 new file mode 100644 index 00000000000000..70d1265f98d826 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_load_data_table/1 @@ -0,0 +1,13 @@ 
+{"id":1,"col1":10,"col2":{"col2a":10,"col2b":"string1"},"col3":{"1":"string10"}} +{"id":2,"col1":10,"col1":20,"col2":{"col2b":"string2","col2a":0,"Col2A":20},"col3":{"2":"string2"}} +{"id":3,"col1":10,"col1":20,"COL1":30,"COL2":{"col2a":30,"col2b":"string3"}} +{"id":4,"COL1":40,"col2":{"col2a":10,"col2b":"string4","new_col":"new_val","col2a":40},"col3":{"4":"string4"}} +{"id":5} +{"id":6,"col1":60,"col2":{"COL2a":60,"col2b":600},"col3":{"6":600}} +{"id":7,"col1":70,"col3":{"7":"string7"},"col2":{"col2b":"string7","col2a":70}} + + + + +{} +{"a":5} diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/1 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/1 new file mode 100644 index 00000000000000..11342c441bce00 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/1 @@ -0,0 +1,2 @@ +{"user_id":"user1","user_profile":{"name":"Alice","age":28,"preferences":{"sports":{"preference_id":101,"preference_values":["soccer","tennis"]},"music":{"preference_id":102,"preference_values":["rock","classical"]}}},"activity_log":[{"activity_date":"2024-08-01","activities":{"workout":{"details":"Morning run","metrics":{"duration":30.5,"calories":200.0}},"reading":{"details":"Read book on Hive","metrics":{"pages":50.0,"time":2.0}}}},{"activity_date":"2024-08-02","activities":{"travel":{"details":"Flight to NY","metrics":{"distance":500.0,"time":3.0}},"meeting":{"details":"Project meeting","metrics":{"duration":1.5,"participants":5.0}}}}]} +{"user_id":"user2","user_profile":{"name":"Bob","age":32,"preferences":{"books":{"preference_id":201,"preference_values":["fiction","non-fiction"]},"travel":{"preference_id":202,"preference_values":["beaches","mountains"]}}},"activity_log":[{"activity_date":"2024-08-01","activities":{"hiking":{"details":"Mountain 
trail","metrics":{"distance":10.0,"elevation":500.0}},"photography":{"details":"Wildlife photoshoot","metrics":{"photos_taken":100.0,"time":4.0}}}},{"activity_date":"2024-08-02","activities":{"workshop":{"details":"Photography workshop","metrics":{"duration":3.0,"participants":15.0}},"shopping":{"details":"Bought camera gear","metrics":{"items":5.0,"cost":1500.0}}}}]} diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/2 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/2 new file mode 100644 index 00000000000000..e1b0befc7bca31 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/2 @@ -0,0 +1 @@ +{"user_id":"user3","user_profile":{"name":"Carol","age":24,"preferences":{"food":{"preference_id":301,"preference_values":["vegan","desserts"]},"movies":{"preference_id":302,"preference_values":["action","comedy"]}}},"activity_log":[{"activity_date":"2024-08-01","activities":{"cooking":{"details":"Made vegan meal","metrics":{"time_spent":1.5,"calories":500.0}},"movie":{"details":"Watched action movie","metrics":{"duration":2.0,"rating":8.5}}}},{"activity_date":"2024-08-02","activities":{"gym":{"details":"Strength training","metrics":{"duration":1.0,"calories":300.0}},"shopping":{"details":"Bought groceries","metrics":{"items":10.0,"cost":100.0}}}}]} diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/modify_2 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/modify_2 new file mode 100644 index 00000000000000..08f1586f3aa91c --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/modify_2 @@ -0,0 +1,2 @@ 
+{"user_ID":"user4","user_PROFILE":{"name":"Carol","age":24,"preferences":{"food":{"preference_ID":301,"preference_VALUES":["vegan","desserts"]},"movies":{"preference_ID":302,"preference_VALUES":["action","comedy"]}}},"activity_LOG":[{"activity_DATE":"2024-08-01","activities":{"cooking":{"DETAILS":"Made vegan meal","metrics":{"time_spent":1.5,"calories":500.0}},"movie":{"DETAILS":"Watched action movie","metrics":{"duration":2.0,"rating":8.5}}}},{"activity_DATE":"2024-08-02","activities":{"gym":{"DETAILS":"Strength training","metrics":{"duration":1.0,"calories":300.0}},"shopping":{"DETAILS":"Bought groceries","metrics":{"items":10.0,"cost":100.0}}}}]} + diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java index 97032467cec765..0f839d238b2b1e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java @@ -94,6 +94,9 @@ public class HiveMetaStoreClientHelper { private static final Pattern digitPattern = Pattern.compile("(\\d+)"); + public static final String HIVE_JSON_SERDE = "org.apache.hive.hcatalog.data.JsonSerDe"; + public static final String LEGACY_HIVE_JSON_SERDE = "org.apache.hadoop.hive.serde2.JsonSerDe"; + public enum HiveFileFormat { TEXT_FILE(0, "text"), PARQUET(1, "parquet"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java index e710bdb935d7bc..3a2a4d3eb5c6ae 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java @@ -364,14 +364,21 @@ public TableIf getTargetTable() { @Override public TFileFormatType getFileFormatType() throws 
UserException { TFileFormatType type = null; - String inputFormatName = hmsTable.getRemoteTable().getSd().getInputFormat(); + Table table = hmsTable.getRemoteTable(); + String inputFormatName = table.getSd().getInputFormat(); String hiveFormat = HiveMetaStoreClientHelper.HiveFileFormat.getFormat(inputFormatName); if (hiveFormat.equals(HiveMetaStoreClientHelper.HiveFileFormat.PARQUET.getDesc())) { type = TFileFormatType.FORMAT_PARQUET; } else if (hiveFormat.equals(HiveMetaStoreClientHelper.HiveFileFormat.ORC.getDesc())) { type = TFileFormatType.FORMAT_ORC; } else if (hiveFormat.equals(HiveMetaStoreClientHelper.HiveFileFormat.TEXT_FILE.getDesc())) { - type = TFileFormatType.FORMAT_CSV_PLAIN; + String serDeLib = table.getSd().getSerdeInfo().getSerializationLib(); + if (serDeLib.equals(HiveMetaStoreClientHelper.HIVE_JSON_SERDE) + || serDeLib.equals(HiveMetaStoreClientHelper.LEGACY_HIVE_JSON_SERDE)) { + type = TFileFormatType.FORMAT_JSON; + } else { + type = TFileFormatType.FORMAT_CSV_PLAIN; + } } return type; } @@ -383,11 +390,12 @@ protected Map getLocationProperties() throws UserException { @Override protected TFileAttributes getFileAttributes() throws UserException { - TFileTextScanRangeParams textParams = new TFileTextScanRangeParams(); + TFileAttributes fileAttributes = new TFileAttributes(); Table table = hmsTable.getRemoteTable(); // TODO: separate hive text table and OpenCsv table String serDeLib = table.getSd().getSerdeInfo().getSerializationLib(); if (serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) { + TFileTextScanRangeParams textParams = new TFileTextScanRangeParams(); // set properties of LazySimpleSerDe // 1. set column separator textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table)); @@ -401,7 +409,10 @@ protected TFileAttributes getFileAttributes() throws UserException { HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0])); // 6. 
set null format textParams.setNullFormat(HiveProperties.getNullFormat(table)); + fileAttributes.setTextParams(textParams); + fileAttributes.setHeaderType(""); } else if (serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) { + TFileTextScanRangeParams textParams = new TFileTextScanRangeParams(); // set set properties of OpenCSVSerde // 1. set column separator textParams.setColumnSeparator(HiveProperties.getSeparatorChar(table)); @@ -411,17 +422,31 @@ protected TFileAttributes getFileAttributes() throws UserException { textParams.setEnclose(HiveProperties.getQuoteChar(table).getBytes()[0]); // 4. set escape char textParams.setEscape(HiveProperties.getEscapeChar(table).getBytes()[0]); + fileAttributes.setTextParams(textParams); + fileAttributes.setHeaderType(""); + if (textParams.isSetEnclose()) { + fileAttributes.setTrimDoubleQuotes(true); + } + } else if (serDeLib.equals(HiveMetaStoreClientHelper.HIVE_JSON_SERDE) + || serDeLib.equals(HiveMetaStoreClientHelper.LEGACY_HIVE_JSON_SERDE)) { + // getFileFormatType() maps both JSON SerDe class names to FORMAT_JSON, so both must be accepted here. + TFileTextScanRangeParams textParams = new TFileTextScanRangeParams(); + textParams.setColumnSeparator("\t"); + textParams.setLineDelimiter("\n"); + fileAttributes.setTextParams(textParams); + + fileAttributes.setJsonpaths(""); + fileAttributes.setJsonRoot(""); + fileAttributes.setNumAsString(true); + fileAttributes.setFuzzyParse(false); + fileAttributes.setReadJsonByLine(true); + fileAttributes.setStripOuterArray(false); + fileAttributes.setHeaderType(""); } else { throw new UserException( "unsupported hive table serde: " + serDeLib); } - TFileAttributes fileAttributes = new TFileAttributes(); - fileAttributes.setTextParams(textParams); - fileAttributes.setHeaderType(""); - if (textParams.isSet(TFileTextScanRangeParams._Fields.ENCLOSE)) { - fileAttributes.setTrimDoubleQuotes(true); - } return fileAttributes; } diff --git a/regression-test/data/external_table_p0/hive/hive_json_basic_test.out b/regression-test/data/external_table_p0/hive/hive_json_basic_test.out new file mode 100644 index 00000000000000..9023f5d72b1ac3 --- /dev/null +++
b/regression-test/data/external_table_p0/hive/hive_json_basic_test.out @@ -0,0 +1,115 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !q1 -- +1 true 127 32767 2147483647 9223372036854775807 123.45 123456.789 123456789 1234.5678 123456.789012 123456789.012345678901 string_value binary_value 2024-03-20 2024-03-20T12:00 2024-03-20T12:00:00.123457 2024-03-20T12:00:00.123457 char_value1 char_value2 char_value3 varchar_value1 varchar_value2 varchar_value3 {"key1":"value1"} {"key1":"value1"} {"a ":"b "} {1:10} {1:100000000000} {1.1:10.1} {1.1:10.1} {1:0} {1.1:1.1} {1.23:1.23} {1.2345:1.2345} {1.23456789:1.23456789} {1.23456789:1.23456789} {1.2345678901234568:1.2345678901234568} ["string1", "string2"] [1, 2, 3] [100000000000, 200000000000] [1.1, 2.2] [1.123456789, 2.123456789] [1, 0] ["varchar1", "varchar2"] ["char1 ", "char2 "] [1.1, 2.2] [1.23, 2.34] [1.2345, 2.3456] [1.23456789, 2.34567891] [1.23456789, 2.34567891] [1.2345678901234568, 2.3456789012345679] {"s_bigint":1234567890} {"key":[{"s_int":123}]} {"struct_field":["value1", "value2"]} {"struct_field_null":null, "struct_field_null2":null} {"struct_non_nulls_after_nulls1":123, "struct_non_nulls_after_nulls2":"value"} {"struct_field1":123, "struct_field2":"value", "strict_field3":{"nested_struct_field1":123, "nested_struct_field2":"nested_value"}} {"null_key":null} [null, "value1", "value2"] ["value1", null, "value2"] ["value1", "value2", null] [null, null, null] dt1 +2 false 58 12345 2147483000 \N 789.56 654321.123 987654321 5678.1234 987654.321098 987654321.098765432109 changed_string new_binary_value 2025-05-25 2025-05-25T15:30 2025-05-25T15:30:00.654322 2025-05-25T15:30:00.654322 char_new_value1 char_new_value2 char_new_value3 varchar_new_value1 varchar_new_value2 varchar_new_value3 {"key2":"value2"} {"key2":"value2"} {"x ":"y "} {2:20} {2:200000000000} {2.2:20.2} {2.2:20.2} {0:1} {2.2:2.2} {2.34:2.34} {2.3456:2.3456} {2.34567891:2.34567891} 
{2.34567891:2.34567891} {2.3456789012345679:2.3456789012345679} ["string3", "string4"] [4, 5, 6] [300000000000, 400000000000] [2.2, 3.3] [2.123456789, 3.123456789] [0, 1] ["varchar3", "varchar4"] ["char3 ", "char4 "] [2.2, 3.3] [2.34, 3.45] [2.3456, 3.4567] [2.34567891, 3.45678901] [2.34567891, 3.45678901] [2.3456789012345679, 3.4567890123456789] {"s_bigint":9876543210} {"key2":[{"s_int":456}]} {"struct_field":["new_value1", "new_value2"]} {"struct_field_null":null, "struct_field_null2":null} {"struct_non_nulls_after_nulls1":456, "struct_non_nulls_after_nulls2":"new_value"} {"struct_field1":456, "struct_field2":"new_value", "strict_field3":{"nested_struct_field1":456, "nested_struct_field2":"nested_value2"}} {"null_key":null} [null, "new_value1", "new_value2"] ["new_value1", null, "new_value2"] ["new_value1", "new_value2", null] [null, null, null] dt1 +3 false -128 -32768 -2147483648 -9223372036854775808 -3.4028235e+38 -1.7976931348623157E308 -999999999 -9999.9999 -999999999.999999 \N min_string_value xxxx 2001-01-01 2001-01-01T00:00 2001-01-01T00:00 2001-01-01T00:00 char_min_value1 char_min_value2 char_min_value3 varchar_min_value1 varchar_min_value2 varchar_min_value3 {"min_key":"min_value"} {"min_key":"min_value"} {"a ":"z "} {-1:-100} {-1:-100000000000} {-1.1:-10.1} {-1.1:-10.1} {0:1} {-1.1:-1.1} {-1.23:-1.23} {-1.2345:-1.2345} {-1.23456789:-1.23456789} {-1.23456789:-1.23456789} {-1.2345678901234568:-1.2345678901234568} ["min_string1", "min_string2"] [-10, -5, -3] [-100000000000, -200000000000] [-1.1, -2.2] [-1.123456789, -2.123456789] [0, 1] ["min_varchar1", "min_varchar2"] ["min_char1 ", "min_char2 "] [-1.1, -2.2] [-1.23, -2.34] [-1.2345, -2.3456] [-1.23456789, -2.34567891] [-1.23456789, -2.34567891] [-1.2345678901234568, -2.3456789012345679] {"s_bigint":-1234567890} {"min_key":[{"s_int":-123}]} {"struct_field":["min_value1", "min_value2"]} {"struct_field_null":null, "struct_field_null2":null} {"struct_non_nulls_after_nulls1":-123, 
"struct_non_nulls_after_nulls2":"min_value"} {"struct_field1":-123, "struct_field2":"min_value", "strict_field3":{"nested_struct_field1":-123, "nested_struct_field2":"nested_value"}} {"null_key":null} [null, "min_value1", "min_value2"] ["min_value1", null, "min_value2"] ["min_value1", "min_value2", null] [null, null, null] dt1 +4 \N \N \N \N \N 123.45 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N {1:10} \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N [1.2345, 2.3456] \N \N \N \N \N \N \N \N \N \N [null, "value1", "value2"] \N \N \N dt2 +5 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N dt3 +6 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N dt3 + +-- !q2 -- +3 false -128 -32768 -2147483648 -9223372036854775808 -3.4028235e+38 -1.7976931348623157E308 -999999999 -9999.9999 -999999999.999999 \N min_string_value xxxx 2001-01-01 2001-01-01T00:00 2001-01-01T00:00 2001-01-01T00:00 char_min_value1 char_min_value2 char_min_value3 varchar_min_value1 varchar_min_value2 varchar_min_value3 {"min_key":"min_value"} {"min_key":"min_value"} {"a ":"z "} {-1:-100} {-1:-100000000000} {-1.1:-10.1} {-1.1:-10.1} {0:1} {-1.1:-1.1} {-1.23:-1.23} {-1.2345:-1.2345} {-1.23456789:-1.23456789} {-1.23456789:-1.23456789} {-1.2345678901234568:-1.2345678901234568} ["min_string1", "min_string2"] [-10, -5, -3] [-100000000000, -200000000000] [-1.1, -2.2] [-1.123456789, -2.123456789] [0, 1] ["min_varchar1", "min_varchar2"] ["min_char1 ", "min_char2 "] [-1.1, -2.2] [-1.23, -2.34] [-1.2345, -2.3456] [-1.23456789, -2.34567891] [-1.23456789, -2.34567891] [-1.2345678901234568, -2.3456789012345679] {"s_bigint":-1234567890} {"min_key":[{"s_int":-123}]} {"struct_field":["min_value1", "min_value2"]} 
{"struct_field_null":null, "struct_field_null2":null} {"struct_non_nulls_after_nulls1":-123, "struct_non_nulls_after_nulls2":"min_value"} {"struct_field1":-123, "struct_field2":"min_value", "strict_field3":{"nested_struct_field1":-123, "nested_struct_field2":"nested_value"}} {"null_key":null} [null, "min_value1", "min_value2"] ["min_value1", null, "min_value2"] ["min_value1", "min_value2", null] [null, null, null] dt1 + +-- !q3 -- +1 true 127 32767 2147483647 9223372036854775807 123.45 123456.789 123456789 1234.5678 123456.789012 123456789.012345678901 string_value binary_value 2024-03-20 2024-03-20T12:00 2024-03-20T12:00:00.123457 2024-03-20T12:00:00.123457 char_value1 char_value2 char_value3 varchar_value1 varchar_value2 varchar_value3 {"key1":"value1"} {"key1":"value1"} {"a ":"b "} {1:10} {1:100000000000} {1.1:10.1} {1.1:10.1} {1:0} {1.1:1.1} {1.23:1.23} {1.2345:1.2345} {1.23456789:1.23456789} {1.23456789:1.23456789} {1.2345678901234568:1.2345678901234568} ["string1", "string2"] [1, 2, 3] [100000000000, 200000000000] [1.1, 2.2] [1.123456789, 2.123456789] [1, 0] ["varchar1", "varchar2"] ["char1 ", "char2 "] [1.1, 2.2] [1.23, 2.34] [1.2345, 2.3456] [1.23456789, 2.34567891] [1.23456789, 2.34567891] [1.2345678901234568, 2.3456789012345679] {"s_bigint":1234567890} {"key":[{"s_int":123}]} {"struct_field":["value1", "value2"]} {"struct_field_null":null, "struct_field_null2":null} {"struct_non_nulls_after_nulls1":123, "struct_non_nulls_after_nulls2":"value"} {"struct_field1":123, "struct_field2":"value", "strict_field3":{"nested_struct_field1":123, "nested_struct_field2":"nested_value"}} {"null_key":null} [null, "value1", "value2"] ["value1", null, "value2"] ["value1", "value2", null] [null, null, null] dt1 + +-- !q4 -- +123.45 +789.56 +-3.4028235e+38 +123.45 + +-- !q5 -- +2 false 58 12345 2147483000 \N 789.56 654321.123 987654321 5678.1234 987654.321098 987654321.098765432109 changed_string new_binary_value 2025-05-25 2025-05-25T15:30 2025-05-25T15:30:00.654322 
2025-05-25T15:30:00.654322 char_new_value1 char_new_value2 char_new_value3 varchar_new_value1 varchar_new_value2 varchar_new_value3 {"key2":"value2"} {"key2":"value2"} {"x ":"y "} {2:20} {2:200000000000} {2.2:20.2} {2.2:20.2} {0:1} {2.2:2.2} {2.34:2.34} {2.3456:2.3456} {2.34567891:2.34567891} {2.34567891:2.34567891} {2.3456789012345679:2.3456789012345679} ["string3", "string4"] [4, 5, 6] [300000000000, 400000000000] [2.2, 3.3] [2.123456789, 3.123456789] [0, 1] ["varchar3", "varchar4"] ["char3 ", "char4 "] [2.2, 3.3] [2.34, 3.45] [2.3456, 3.4567] [2.34567891, 3.45678901] [2.34567891, 3.45678901] [2.3456789012345679, 3.4567890123456789] {"s_bigint":9876543210} {"key2":[{"s_int":456}]} {"struct_field":["new_value1", "new_value2"]} {"struct_field_null":null, "struct_field_null2":null} {"struct_non_nulls_after_nulls1":456, "struct_non_nulls_after_nulls2":"new_value"} {"struct_field1":456, "struct_field2":"new_value", "strict_field3":{"nested_struct_field1":456, "nested_struct_field2":"nested_value2"}} {"null_key":null} [null, "new_value1", "new_value2"] ["new_value1", null, "new_value2"] ["new_value1", "new_value2", null] [null, null, null] dt1 + +-- !q6 -- +user1 {"name":"Alice", "age":28, "preferences":{"sports":{"preference_id":101, "preference_values":["soccer", "tennis"]}, "music":{"preference_id":102, "preference_values":["rock", "classical"]}}} [{"activity_date":"2024-08-01", "activities":{"workout":{"details":"Morning run", "metrics":{"duration":30.5, "calories":200}}, "reading":{"details":"Read book on Hive", "metrics":{"pages":50, "time":2}}}}, {"activity_date":"2024-08-02", "activities":{"travel":{"details":"Flight to NY", "metrics":{"distance":500, "time":3}}, "meeting":{"details":"Project meeting", "metrics":{"duration":1.5, "participants":5}}}}] +user2 {"name":"Bob", "age":32, "preferences":{"books":{"preference_id":201, "preference_values":["fiction", "non-fiction"]}, "travel":{"preference_id":202, "preference_values":["beaches", "mountains"]}}} 
[{"activity_date":"2024-08-01", "activities":{"hiking":{"details":"Mountain trail", "metrics":{"distance":10, "elevation":500}}, "photography":{"details":"Wildlife photoshoot", "metrics":{"photos_taken":100, "time":4}}}}, {"activity_date":"2024-08-02", "activities":{"workshop":{"details":"Photography workshop", "metrics":{"duration":3, "participants":15}}, "shopping":{"details":"Bought camera gear", "metrics":{"items":5, "cost":1500}}}}] +user3 {"name":"Carol", "age":24, "preferences":{"food":{"preference_id":301, "preference_values":["vegan", "desserts"]}, "movies":{"preference_id":302, "preference_values":["action", "comedy"]}}} [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}] +user4 {"name":"Carol", "age":24, "preferences":{"food":{"preference_id":301, "preference_values":["vegan", "desserts"]}, "movies":{"preference_id":302, "preference_values":["action", "comedy"]}}} [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}] + +-- !q7 -- +user1 [{"activity_date":"2024-08-01", "activities":{"workout":{"details":"Morning run", "metrics":{"duration":30.5, "calories":200}}, "reading":{"details":"Read book on Hive", "metrics":{"pages":50, "time":2}}}}, {"activity_date":"2024-08-02", "activities":{"travel":{"details":"Flight to NY", 
"metrics":{"distance":500, "time":3}}, "meeting":{"details":"Project meeting", "metrics":{"duration":1.5, "participants":5}}}}] +user2 [{"activity_date":"2024-08-01", "activities":{"hiking":{"details":"Mountain trail", "metrics":{"distance":10, "elevation":500}}, "photography":{"details":"Wildlife photoshoot", "metrics":{"photos_taken":100, "time":4}}}}, {"activity_date":"2024-08-02", "activities":{"workshop":{"details":"Photography workshop", "metrics":{"duration":3, "participants":15}}, "shopping":{"details":"Bought camera gear", "metrics":{"items":5, "cost":1500}}}}] +user3 [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}] +user4 [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}] + +-- !q8 -- +\N \N \N \N +\N \N \N \N +1 10 {"col2a":10, "col2b":"string1"} {1:"string10"} +2 20 {"col2a":20, "col2b":"string2"} {2:"string2"} +3 30 {"col2a":30, "col2b":"string3"} \N +4 40 {"col2a":40, "col2b":"string4"} {4:"string4"} +5 \N \N \N +6 60 {"col2a":60, "col2b":"600"} {6:"600"} +7 70 {"col2a":70, "col2b":"string7"} {7:"string7"} + +-- !q9 -- +\N \N +\N \N +\N 5 +10 1 +20 2 +30 3 +40 4 +60 6 +70 7 + +-- !q1 -- +1 true 127 32767 2147483647 9223372036854775807 123.45 123456.789 123456789 1234.5678 123456.789012 123456789.012345678901 
string_value binary_value 2024-03-20 2024-03-20T12:00 2024-03-20T12:00:00.123457 2024-03-20T12:00:00.123457 char_value1 char_value2 char_value3 varchar_value1 varchar_value2 varchar_value3 {"key1":"value1"} {"key1":"value1"} {"a ":"b "} {1:10} {1:100000000000} {1.1:10.1} {1.1:10.1} {1:0} {1.1:1.1} {1.23:1.23} {1.2345:1.2345} {1.23456789:1.23456789} {1.23456789:1.23456789} {1.2345678901234568:1.2345678901234568} ["string1", "string2"] [1, 2, 3] [100000000000, 200000000000] [1.1, 2.2] [1.123456789, 2.123456789] [1, 0] ["varchar1", "varchar2"] ["char1 ", "char2 "] [1.1, 2.2] [1.23, 2.34] [1.2345, 2.3456] [1.23456789, 2.34567891] [1.23456789, 2.34567891] [1.2345678901234568, 2.3456789012345679] {"s_bigint":1234567890} {"key":[{"s_int":123}]} {"struct_field":["value1", "value2"]} {"struct_field_null":null, "struct_field_null2":null} {"struct_non_nulls_after_nulls1":123, "struct_non_nulls_after_nulls2":"value"} {"struct_field1":123, "struct_field2":"value", "strict_field3":{"nested_struct_field1":123, "nested_struct_field2":"nested_value"}} {"null_key":null} [null, "value1", "value2"] ["value1", null, "value2"] ["value1", "value2", null] [null, null, null] dt1 +2 false 58 12345 2147483000 \N 789.56 654321.123 987654321 5678.1234 987654.321098 987654321.098765432109 changed_string new_binary_value 2025-05-25 2025-05-25T15:30 2025-05-25T15:30:00.654322 2025-05-25T15:30:00.654322 char_new_value1 char_new_value2 char_new_value3 varchar_new_value1 varchar_new_value2 varchar_new_value3 {"key2":"value2"} {"key2":"value2"} {"x ":"y "} {2:20} {2:200000000000} {2.2:20.2} {2.2:20.2} {0:1} {2.2:2.2} {2.34:2.34} {2.3456:2.3456} {2.34567891:2.34567891} {2.34567891:2.34567891} {2.3456789012345679:2.3456789012345679} ["string3", "string4"] [4, 5, 6] [300000000000, 400000000000] [2.2, 3.3] [2.123456789, 3.123456789] [0, 1] ["varchar3", "varchar4"] ["char3 ", "char4 "] [2.2, 3.3] [2.34, 3.45] [2.3456, 3.4567] [2.34567891, 3.45678901] [2.34567891, 3.45678901] [2.3456789012345679, 
3.4567890123456789] {"s_bigint":9876543210} {"key2":[{"s_int":456}]} {"struct_field":["new_value1", "new_value2"]} {"struct_field_null":null, "struct_field_null2":null} {"struct_non_nulls_after_nulls1":456, "struct_non_nulls_after_nulls2":"new_value"} {"struct_field1":456, "struct_field2":"new_value", "strict_field3":{"nested_struct_field1":456, "nested_struct_field2":"nested_value2"}} {"null_key":null} [null, "new_value1", "new_value2"] ["new_value1", null, "new_value2"] ["new_value1", "new_value2", null] [null, null, null] dt1 +3 false -128 -32768 -2147483648 -9223372036854775808 -3.4028235e+38 -1.7976931348623157E308 -999999999 -9999.9999 -999999999.999999 \N min_string_value xxxx 2001-01-01 2001-01-01T00:00 2001-01-01T00:00 2001-01-01T00:00 char_min_value1 char_min_value2 char_min_value3 varchar_min_value1 varchar_min_value2 varchar_min_value3 {"min_key":"min_value"} {"min_key":"min_value"} {"a ":"z "} {-1:-100} {-1:-100000000000} {-1.1:-10.1} {-1.1:-10.1} {0:1} {-1.1:-1.1} {-1.23:-1.23} {-1.2345:-1.2345} {-1.23456789:-1.23456789} {-1.23456789:-1.23456789} {-1.2345678901234568:-1.2345678901234568} ["min_string1", "min_string2"] [-10, -5, -3] [-100000000000, -200000000000] [-1.1, -2.2] [-1.123456789, -2.123456789] [0, 1] ["min_varchar1", "min_varchar2"] ["min_char1 ", "min_char2 "] [-1.1, -2.2] [-1.23, -2.34] [-1.2345, -2.3456] [-1.23456789, -2.34567891] [-1.23456789, -2.34567891] [-1.2345678901234568, -2.3456789012345679] {"s_bigint":-1234567890} {"min_key":[{"s_int":-123}]} {"struct_field":["min_value1", "min_value2"]} {"struct_field_null":null, "struct_field_null2":null} {"struct_non_nulls_after_nulls1":-123, "struct_non_nulls_after_nulls2":"min_value"} {"struct_field1":-123, "struct_field2":"min_value", "strict_field3":{"nested_struct_field1":-123, "nested_struct_field2":"nested_value"}} {"null_key":null} [null, "min_value1", "min_value2"] ["min_value1", null, "min_value2"] ["min_value1", "min_value2", null] [null, null, null] dt1 +4 \N \N \N \N \N 123.45 \N 
\N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N {1:10} \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N [1.2345, 2.3456] \N \N \N \N \N \N \N \N \N \N [null, "value1", "value2"] \N \N \N dt2 +5 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N dt3 +6 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N dt3 + +-- !q2 -- +3 false -128 -32768 -2147483648 -9223372036854775808 -3.4028235e+38 -1.7976931348623157E308 -999999999 -9999.9999 -999999999.999999 \N min_string_value xxxx 2001-01-01 2001-01-01T00:00 2001-01-01T00:00 2001-01-01T00:00 char_min_value1 char_min_value2 char_min_value3 varchar_min_value1 varchar_min_value2 varchar_min_value3 {"min_key":"min_value"} {"min_key":"min_value"} {"a ":"z "} {-1:-100} {-1:-100000000000} {-1.1:-10.1} {-1.1:-10.1} {0:1} {-1.1:-1.1} {-1.23:-1.23} {-1.2345:-1.2345} {-1.23456789:-1.23456789} {-1.23456789:-1.23456789} {-1.2345678901234568:-1.2345678901234568} ["min_string1", "min_string2"] [-10, -5, -3] [-100000000000, -200000000000] [-1.1, -2.2] [-1.123456789, -2.123456789] [0, 1] ["min_varchar1", "min_varchar2"] ["min_char1 ", "min_char2 "] [-1.1, -2.2] [-1.23, -2.34] [-1.2345, -2.3456] [-1.23456789, -2.34567891] [-1.23456789, -2.34567891] [-1.2345678901234568, -2.3456789012345679] {"s_bigint":-1234567890} {"min_key":[{"s_int":-123}]} {"struct_field":["min_value1", "min_value2"]} {"struct_field_null":null, "struct_field_null2":null} {"struct_non_nulls_after_nulls1":-123, "struct_non_nulls_after_nulls2":"min_value"} {"struct_field1":-123, "struct_field2":"min_value", "strict_field3":{"nested_struct_field1":-123, "nested_struct_field2":"nested_value"}} {"null_key":null} [null, "min_value1", "min_value2"] ["min_value1", null, "min_value2"] 
["min_value1", "min_value2", null] [null, null, null] dt1 + +-- !q3 -- +1 true 127 32767 2147483647 9223372036854775807 123.45 123456.789 123456789 1234.5678 123456.789012 123456789.012345678901 string_value binary_value 2024-03-20 2024-03-20T12:00 2024-03-20T12:00:00.123457 2024-03-20T12:00:00.123457 char_value1 char_value2 char_value3 varchar_value1 varchar_value2 varchar_value3 {"key1":"value1"} {"key1":"value1"} {"a ":"b "} {1:10} {1:100000000000} {1.1:10.1} {1.1:10.1} {1:0} {1.1:1.1} {1.23:1.23} {1.2345:1.2345} {1.23456789:1.23456789} {1.23456789:1.23456789} {1.2345678901234568:1.2345678901234568} ["string1", "string2"] [1, 2, 3] [100000000000, 200000000000] [1.1, 2.2] [1.123456789, 2.123456789] [1, 0] ["varchar1", "varchar2"] ["char1 ", "char2 "] [1.1, 2.2] [1.23, 2.34] [1.2345, 2.3456] [1.23456789, 2.34567891] [1.23456789, 2.34567891] [1.2345678901234568, 2.3456789012345679] {"s_bigint":1234567890} {"key":[{"s_int":123}]} {"struct_field":["value1", "value2"]} {"struct_field_null":null, "struct_field_null2":null} {"struct_non_nulls_after_nulls1":123, "struct_non_nulls_after_nulls2":"value"} {"struct_field1":123, "struct_field2":"value", "strict_field3":{"nested_struct_field1":123, "nested_struct_field2":"nested_value"}} {"null_key":null} [null, "value1", "value2"] ["value1", null, "value2"] ["value1", "value2", null] [null, null, null] dt1 + +-- !q4 -- +123.45 +789.56 +-3.4028235e+38 +123.45 + +-- !q5 -- +2 false 58 12345 2147483000 \N 789.56 654321.123 987654321 5678.1234 987654.321098 987654321.098765432109 changed_string new_binary_value 2025-05-25 2025-05-25T15:30 2025-05-25T15:30:00.654322 2025-05-25T15:30:00.654322 char_new_value1 char_new_value2 char_new_value3 varchar_new_value1 varchar_new_value2 varchar_new_value3 {"key2":"value2"} {"key2":"value2"} {"x ":"y "} {2:20} {2:200000000000} {2.2:20.2} {2.2:20.2} {0:1} {2.2:2.2} {2.34:2.34} {2.3456:2.3456} {2.34567891:2.34567891} {2.34567891:2.34567891} {2.3456789012345679:2.3456789012345679} ["string3", 
"string4"] [4, 5, 6] [300000000000, 400000000000] [2.2, 3.3] [2.123456789, 3.123456789] [0, 1] ["varchar3", "varchar4"] ["char3 ", "char4 "] [2.2, 3.3] [2.34, 3.45] [2.3456, 3.4567] [2.34567891, 3.45678901] [2.34567891, 3.45678901] [2.3456789012345679, 3.4567890123456789] {"s_bigint":9876543210} {"key2":[{"s_int":456}]} {"struct_field":["new_value1", "new_value2"]} {"struct_field_null":null, "struct_field_null2":null} {"struct_non_nulls_after_nulls1":456, "struct_non_nulls_after_nulls2":"new_value"} {"struct_field1":456, "struct_field2":"new_value", "strict_field3":{"nested_struct_field1":456, "nested_struct_field2":"nested_value2"}} {"null_key":null} [null, "new_value1", "new_value2"] ["new_value1", null, "new_value2"] ["new_value1", "new_value2", null] [null, null, null] dt1 + +-- !q6 -- +user1 {"name":"Alice", "age":28, "preferences":{"sports":{"preference_id":101, "preference_values":["soccer", "tennis"]}, "music":{"preference_id":102, "preference_values":["rock", "classical"]}}} [{"activity_date":"2024-08-01", "activities":{"workout":{"details":"Morning run", "metrics":{"duration":30.5, "calories":200}}, "reading":{"details":"Read book on Hive", "metrics":{"pages":50, "time":2}}}}, {"activity_date":"2024-08-02", "activities":{"travel":{"details":"Flight to NY", "metrics":{"distance":500, "time":3}}, "meeting":{"details":"Project meeting", "metrics":{"duration":1.5, "participants":5}}}}] +user2 {"name":"Bob", "age":32, "preferences":{"books":{"preference_id":201, "preference_values":["fiction", "non-fiction"]}, "travel":{"preference_id":202, "preference_values":["beaches", "mountains"]}}} [{"activity_date":"2024-08-01", "activities":{"hiking":{"details":"Mountain trail", "metrics":{"distance":10, "elevation":500}}, "photography":{"details":"Wildlife photoshoot", "metrics":{"photos_taken":100, "time":4}}}}, {"activity_date":"2024-08-02", "activities":{"workshop":{"details":"Photography workshop", "metrics":{"duration":3, "participants":15}}, 
"shopping":{"details":"Bought camera gear", "metrics":{"items":5, "cost":1500}}}}] +user3 {"name":"Carol", "age":24, "preferences":{"food":{"preference_id":301, "preference_values":["vegan", "desserts"]}, "movies":{"preference_id":302, "preference_values":["action", "comedy"]}}} [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}] +user4 {"name":"Carol", "age":24, "preferences":{"food":{"preference_id":301, "preference_values":["vegan", "desserts"]}, "movies":{"preference_id":302, "preference_values":["action", "comedy"]}}} [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}] + +-- !q7 -- +user1 [{"activity_date":"2024-08-01", "activities":{"workout":{"details":"Morning run", "metrics":{"duration":30.5, "calories":200}}, "reading":{"details":"Read book on Hive", "metrics":{"pages":50, "time":2}}}}, {"activity_date":"2024-08-02", "activities":{"travel":{"details":"Flight to NY", "metrics":{"distance":500, "time":3}}, "meeting":{"details":"Project meeting", "metrics":{"duration":1.5, "participants":5}}}}] +user2 [{"activity_date":"2024-08-01", "activities":{"hiking":{"details":"Mountain trail", "metrics":{"distance":10, "elevation":500}}, "photography":{"details":"Wildlife photoshoot", "metrics":{"photos_taken":100, "time":4}}}}, 
{"activity_date":"2024-08-02", "activities":{"workshop":{"details":"Photography workshop", "metrics":{"duration":3, "participants":15}}, "shopping":{"details":"Bought camera gear", "metrics":{"items":5, "cost":1500}}}}] +user3 [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}] +user4 [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}] + +-- !q8 -- +\N \N \N \N +\N \N \N \N +1 10 {"col2a":10, "col2b":"string1"} {1:"string10"} +2 20 {"col2a":20, "col2b":"string2"} {2:"string2"} +3 30 {"col2a":30, "col2b":"string3"} \N +4 40 {"col2a":40, "col2b":"string4"} {4:"string4"} +5 \N \N \N +6 60 {"col2a":60, "col2b":"600"} {6:"600"} +7 70 {"col2a":70, "col2b":"string7"} {7:"string7"} + +-- !q9 -- +\N \N +\N \N +\N 5 +10 1 +20 2 +30 3 +40 4 +60 6 +70 7 + diff --git a/regression-test/suites/external_table_p0/hive/hive_json_basic_test.groovy b/regression-test/suites/external_table_p0/hive/hive_json_basic_test.groovy new file mode 100644 index 00000000000000..9d05e1a4c7403d --- /dev/null +++ b/regression-test/suites/external_table_p0/hive/hive_json_basic_test.groovy @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Regression suite that queries three tables (json_all_complex_types,
+// json_nested_complex_table, json_load_data_table) through an HMS catalog and
+// records the results under the q1..q9 tags.
+// NOTE(review): the tables are presumably Hive JSON-format tables created by the
+// external docker environment -- confirm against the Hive setup scripts.
+suite("hive_json_basic_test", "p0,external,hive,external_docker,external_docker_hive") {
+    // The whole suite is a no-op unless the regression config sets enableHiveTest=true.
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("disable Hive test.")
+        return;
+    }
+
+    // Run the identical query set once per Hive deployment; the HMS port is looked
+    // up under "<hivePrefix>HmsPort" in the regression config.
+    for (String hivePrefix : ["hive2", "hive3"]) {
+        // Resolved before the try block so that the finally clause can always
+        // reference catalog_name.
+        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+        String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
+        String catalog_name = "${hivePrefix}_hive_json_basic_test"
+
+        try {
+            // Start from a clean catalog in case a previous run left one behind.
+            sql """drop catalog if exists ${catalog_name}"""
+            sql """create catalog if not exists ${catalog_name} properties (
+                'type'='hms',
+                'hive.metastore.uris'='thrift://${externalEnvIp}:${hms_port}'
+            );"""
+            sql """use `${catalog_name}`.`default`"""
+
+            String tb1 = "json_all_complex_types"
+            String tb2 = "json_nested_complex_table"
+            String tb3 = "json_load_data_table"
+
+            // Logged only as a debugging aid; the result is not asserted on.
+            def tables = sql """ show tables """
+            logger.info("tables = ${tables}")
+
+            // tb1: full scan, predicate filters, and a single-column projection.
+            qt_q1 """ select * from ${tb1} order by id """
+            qt_q2 """ select * from ${tb1} where tinyint_col < 0 order by id """
+            qt_q3 """ select * from ${tb1} where bigint_col > 0 order by id """
+            qt_q4 """ select float_col from ${tb1} where float_col is not null order by id """
+            qt_q5 """ select * from ${tb1} where id = 2 order by id """
+
+            // tb2: full scan and a two-column projection.
+            qt_q6 """ select * from ${tb2} order by user_id"""
+            qt_q7 """ select user_id,activity_log from ${tb2} order by user_id"""
+
+            // tb3: full scan and a reordered two-column projection.
+            order_qt_q8 """ select * from ${tb3} order by id """
+            order_qt_q9 """ select col1,id from ${tb3} order by id """
+        } finally {
+            // Always remove the catalog, even when a query or result check above
+            // fails, so a failed run does not leak it into later suites.
+            sql """drop catalog if exists ${catalog_name}"""
+        }
+    }
+}