From df03e91deb0f269fad124bef49f681e3520c72ad Mon Sep 17 00:00:00 2001 From: Yohan Wal <59358312+CookiePieWw@users.noreply.github.com> Date: Wed, 11 Sep 2024 16:17:57 +0800 Subject: [PATCH] feat: add respective get_by_path UDFs for JSON type (#4720) * feat: add respectiv get_by_path udf for json type * Apply review comments Co-authored-by: Weny Xu * fix: fix compile error * refactor: change name of UDFs, add some tests --------- Co-authored-by: Weny Xu --- src/common/function/src/scalars/json.rs | 7 + .../function/src/scalars/json/json_get.rs | 454 ++++++++++++++++++ .../standalone/common/function/json.result | 198 ++++++++ .../cases/standalone/common/function/json.sql | 56 +++ 4 files changed, 715 insertions(+) create mode 100644 src/common/function/src/scalars/json/json_get.rs create mode 100644 tests/cases/standalone/common/function/json.result create mode 100644 tests/cases/standalone/common/function/json.sql diff --git a/src/common/function/src/scalars/json.rs b/src/common/function/src/scalars/json.rs index 3812b33f235f..26d63d3b45e1 100644 --- a/src/common/function/src/scalars/json.rs +++ b/src/common/function/src/scalars/json.rs @@ -13,9 +13,11 @@ // limitations under the License. use std::sync::Arc; +mod json_get; mod json_to_string; mod to_json; +use json_get::{JsonGetBool, JsonGetFloat, JsonGetInt, JsonGetString}; use json_to_string::JsonToStringFunction; use to_json::ToJsonFunction; @@ -27,5 +29,10 @@ impl JsonFunction { pub fn register(registry: &FunctionRegistry) { registry.register(Arc::new(JsonToStringFunction)); registry.register(Arc::new(ToJsonFunction)); + + registry.register(Arc::new(JsonGetInt)); + registry.register(Arc::new(JsonGetFloat)); + registry.register(Arc::new(JsonGetString)); + registry.register(Arc::new(JsonGetBool)); } } diff --git a/src/common/function/src/scalars/json/json_get.rs b/src/common/function/src/scalars/json/json_get.rs new file mode 100644 index 000000000000..78ddc1d2642c --- /dev/null +++ b/src/common/function/src/scalars/json/json_get.rs @@ -0,0 +1,454 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::{self, Display}; + +use common_query::error::{InvalidFuncArgsSnafu, Result, UnsupportedInputDataTypeSnafu}; +use common_query::prelude::Signature; +use datafusion::logical_expr::Volatility; +use datatypes::data_type::ConcreteDataType; +use datatypes::prelude::VectorRef; +use datatypes::scalars::ScalarVectorBuilder; +use datatypes::vectors::{ + BooleanVectorBuilder, Float64VectorBuilder, Int64VectorBuilder, MutableVector, + StringVectorBuilder, +}; +use snafu::ensure; + +use crate::function::{Function, FunctionContext}; + +fn get_json_by_path(json: &[u8], path: &str) -> Option> { + let json_path = jsonb::jsonpath::parse_json_path(path.as_bytes()); + match json_path { + Ok(json_path) => { + let mut sub_jsonb = Vec::new(); + let mut sub_offsets = Vec::new(); + match jsonb::get_by_path(json, json_path, &mut sub_jsonb, &mut sub_offsets) { + Ok(_) => Some(sub_jsonb), + Err(_) => None, + } + } + _ => None, + } +} + +/// Get the value from the JSONB by the given path and return it as specified type. +/// If the path does not exist or the value is not the type specified, return `NULL`. +macro_rules! json_get { + // e.g. name = JsonGetInt, type = Int64, rust_type = i64, doc = "Get the value from the JSONB by the given path and return it as an integer." + ($name: ident, $type: ident, $rust_type: ident, $doc:expr) => { + paste::paste! { + #[doc = $doc] + #[derive(Clone, Debug, Default)] + pub struct $name; + + impl Function for $name { + fn name(&self) -> &str { + stringify!([<$name:snake>]) + } + + fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result { + Ok(ConcreteDataType::[<$type:snake _datatype>]()) + } + + fn signature(&self) -> Signature { + Signature::exact( + vec![ + ConcreteDataType::json_datatype(), + ConcreteDataType::string_datatype(), + ], + Volatility::Immutable, + ) + } + + fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result { + ensure!( + columns.len() == 2, + InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the args is not correct, expect exactly two, have: {}", + columns.len() + ), + } + ); + let jsons = &columns[0]; + let paths = &columns[1]; + + let size = jsons.len(); + let datatype = jsons.data_type(); + let mut results = [<$type VectorBuilder>]::with_capacity(size); + + match datatype { + // JSON data type uses binary vector + ConcreteDataType::Binary(_) => { + for i in 0..size { + let json = jsons.get_ref(i); + let path = paths.get_ref(i); + + let json = json.as_binary(); + let path = path.as_string(); + let result = match (json, path) { + (Ok(Some(json)), Ok(Some(path))) => { + get_json_by_path(json, path) + .and_then(|json| { jsonb::[](&json).ok() }) + } + _ => None, + }; + + results.push(result); + } + } + _ => { + return UnsupportedInputDataTypeSnafu { + function: stringify!([<$name:snake>]), + datatypes: columns.iter().map(|c| c.data_type()).collect::>(), + } + .fail(); + } + } + + Ok(results.to_vector()) + } + } + + impl Display for $name { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", stringify!([<$name:snake>]).to_ascii_uppercase()) + } + } + } + }; +} + +json_get!( + JsonGetInt, + Int64, + i64, + "Get the value from the JSONB by the given path and return it as an integer." +); + +json_get!( + JsonGetFloat, + Float64, + f64, + "Get the value from the JSONB by the given path and return it as a float." +); + +json_get!( + JsonGetBool, + Boolean, + bool, + "Get the value from the JSONB by the given path and return it as a boolean." +); + +/// Get the value from the JSONB by the given path and return it as a string. +#[derive(Clone, Debug, Default)] +pub struct JsonGetString; + +impl Function for JsonGetString { + fn name(&self) -> &str { + "json_get_string" + } + + fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result { + Ok(ConcreteDataType::string_datatype()) + } + + fn signature(&self) -> Signature { + Signature::exact( + vec![ + ConcreteDataType::json_datatype(), + ConcreteDataType::string_datatype(), + ], + Volatility::Immutable, + ) + } + + fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result { + ensure!( + columns.len() == 2, + InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the args is not correct, expect exactly two, have: {}", + columns.len() + ), + } + ); + let jsons = &columns[0]; + let paths = &columns[1]; + + let size = jsons.len(); + let datatype = jsons.data_type(); + let mut results = StringVectorBuilder::with_capacity(size); + + match datatype { + // JSON data type uses binary vector + ConcreteDataType::Binary(_) => { + for i in 0..size { + let json = jsons.get_ref(i); + let path = paths.get_ref(i); + + let json = json.as_binary(); + let path = path.as_string(); + let result = match (json, path) { + (Ok(Some(json)), Ok(Some(path))) => { + get_json_by_path(json, path).and_then(|json| jsonb::to_str(&json).ok()) + } + _ => None, + }; + + results.push(result.as_deref()); + } + } + _ => { + return UnsupportedInputDataTypeSnafu { + function: "json_get_string", + datatypes: columns.iter().map(|c| c.data_type()).collect::>(), + } + .fail(); + } + } + + Ok(results.to_vector()) + } +} + +impl Display for JsonGetString { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", "json_get_string".to_ascii_uppercase()) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use common_query::prelude::TypeSignature; + use datatypes::scalars::ScalarVector; + use datatypes::vectors::{BinaryVector, StringVector}; + + use super::*; + + #[test] + fn test_json_get_int() { + let json_get_int = JsonGetInt; + + assert_eq!("json_get_int", json_get_int.name()); + assert_eq!( + ConcreteDataType::int64_datatype(), + json_get_int + .return_type(&[ + ConcreteDataType::json_datatype(), + ConcreteDataType::string_datatype() + ]) + .unwrap() + ); + + assert!(matches!(json_get_int.signature(), + Signature { + type_signature: TypeSignature::Exact(valid_types), + volatility: Volatility::Immutable + } if valid_types == vec![ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype()] + )); + + let json_strings = [ + r#"{"a": {"b": 2}, "b": 2, "c": 3}"#, + r#"{"a": 4, "b": {"c": 6}, "c": 6}"#, + r#"{"a": 7, "b": 8, "c": {"a": 7}}"#, + ]; + let paths = vec!["$.a.b", "$.a", "$.c"]; + let results = [Some(2), Some(4), None]; + + let jsonbs = json_strings + .iter() + .map(|s| { + let value = jsonb::parse_value(s.as_bytes()).unwrap(); + value.to_vec() + }) + .collect::>(); + + let json_vector = BinaryVector::from_vec(jsonbs); + let path_vector = StringVector::from_vec(paths); + let args: Vec = vec![Arc::new(json_vector), Arc::new(path_vector)]; + let vector = json_get_int + .eval(FunctionContext::default(), &args) + .unwrap(); + + assert_eq!(3, vector.len()); + for (i, gt) in results.iter().enumerate() { + let result = vector.get_ref(i); + let result = result.as_i64().unwrap(); + assert_eq!(*gt, result); + } + } + + #[test] + fn test_json_get_float() { + let json_get_float = JsonGetFloat; + + assert_eq!("json_get_float", json_get_float.name()); + assert_eq!( + ConcreteDataType::float64_datatype(), + json_get_float + .return_type(&[ + ConcreteDataType::json_datatype(), + ConcreteDataType::string_datatype() + ]) + .unwrap() + ); + + assert!(matches!(json_get_float.signature(), + Signature { + type_signature: TypeSignature::Exact(valid_types), + volatility: Volatility::Immutable + } if valid_types == vec![ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype()] + )); + + let json_strings = [ + r#"{"a": {"b": 2.1}, "b": 2.2, "c": 3.3}"#, + r#"{"a": 4.4, "b": {"c": 6.6}, "c": 6.6}"#, + r#"{"a": 7.7, "b": 8.8, "c": {"a": 7.7}}"#, + ]; + let paths = vec!["$.a.b", "$.a", "$.c"]; + let results = [Some(2.1), Some(4.4), None]; + + let jsonbs = json_strings + .iter() + .map(|s| { + let value = jsonb::parse_value(s.as_bytes()).unwrap(); + value.to_vec() + }) + .collect::>(); + + let json_vector = BinaryVector::from_vec(jsonbs); + let path_vector = StringVector::from_vec(paths); + let args: Vec = vec![Arc::new(json_vector), Arc::new(path_vector)]; + let vector = json_get_float + .eval(FunctionContext::default(), &args) + .unwrap(); + + assert_eq!(3, vector.len()); + for (i, gt) in results.iter().enumerate() { + let result = vector.get_ref(i); + let result = result.as_f64().unwrap(); + assert_eq!(*gt, result); + } + } + + #[test] + fn test_json_get_bool() { + let json_get_bool = JsonGetBool; + + assert_eq!("json_get_bool", json_get_bool.name()); + assert_eq!( + ConcreteDataType::boolean_datatype(), + json_get_bool + .return_type(&[ + ConcreteDataType::json_datatype(), + ConcreteDataType::string_datatype() + ]) + .unwrap() + ); + + assert!(matches!(json_get_bool.signature(), + Signature { + type_signature: TypeSignature::Exact(valid_types), + volatility: Volatility::Immutable + } if valid_types == vec![ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype()] + )); + + let json_strings = [ + r#"{"a": {"b": true}, "b": false, "c": true}"#, + r#"{"a": false, "b": {"c": true}, "c": false}"#, + r#"{"a": true, "b": false, "c": {"a": true}}"#, + ]; + let paths = vec!["$.a.b", "$.a", "$.c"]; + let results = [Some(true), Some(false), None]; + + let jsonbs = json_strings + .iter() + .map(|s| { + let value = jsonb::parse_value(s.as_bytes()).unwrap(); + value.to_vec() + }) + .collect::>(); + + let json_vector = BinaryVector::from_vec(jsonbs); + let path_vector = StringVector::from_vec(paths); + let args: Vec = vec![Arc::new(json_vector), Arc::new(path_vector)]; + let vector = json_get_bool + .eval(FunctionContext::default(), &args) + .unwrap(); + + assert_eq!(3, vector.len()); + for (i, gt) in results.iter().enumerate() { + let result = vector.get_ref(i); + let result = result.as_boolean().unwrap(); + assert_eq!(*gt, result); + } + } + + #[test] + fn test_json_get_string() { + let json_get_string = JsonGetString; + + assert_eq!("json_get_string", json_get_string.name()); + assert_eq!( + ConcreteDataType::string_datatype(), + json_get_string + .return_type(&[ + ConcreteDataType::json_datatype(), + ConcreteDataType::string_datatype() + ]) + .unwrap() + ); + + assert!(matches!(json_get_string.signature(), + Signature { + type_signature: TypeSignature::Exact(valid_types), + volatility: Volatility::Immutable + } if valid_types == vec![ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype()] + )); + + let json_strings = [ + r#"{"a": {"b": "a"}, "b": "b", "c": "c"}"#, + r#"{"a": "d", "b": {"c": "e"}, "c": "f"}"#, + r#"{"a": "g", "b": "h", "c": {"a": "g"}}"#, + ]; + let paths = vec!["$.a.b", "$.a", ""]; + let results = [Some("a"), Some("d"), None]; + + let jsonbs = json_strings + .iter() + .map(|s| { + let value = jsonb::parse_value(s.as_bytes()).unwrap(); + value.to_vec() + }) + .collect::>(); + + let json_vector = BinaryVector::from_vec(jsonbs); + let path_vector = StringVector::from_vec(paths); + let args: Vec = vec![Arc::new(json_vector), Arc::new(path_vector)]; + let vector = json_get_string + .eval(FunctionContext::default(), &args) + .unwrap(); + + assert_eq!(3, vector.len()); + for (i, gt) in results.iter().enumerate() { + let result = vector.get_ref(i); + let result = result.as_string().unwrap(); + assert_eq!(*gt, result); + } + } +} diff --git a/tests/cases/standalone/common/function/json.result b/tests/cases/standalone/common/function/json.result new file mode 100644 index 000000000000..f2a59b9d7066 --- /dev/null +++ b/tests/cases/standalone/common/function/json.result @@ -0,0 +1,198 @@ +-- json_get functions -- +SELECT json_get_int(to_json('{"a": {"b": {"c": 1}}}'), 'a.b.c'); + ++---------------------------------------------------------------------+ +| json_get_int(to_json(Utf8("{"a": {"b": {"c": 1}}}")),Utf8("a.b.c")) | ++---------------------------------------------------------------------+ +| 1 | ++---------------------------------------------------------------------+ + +SELECT json_get_float(to_json('{"a": {"b": {"c": 1.234}}}'), 'a:b.c'); + ++---------------------------------------------------------------------------+ +| json_get_float(to_json(Utf8("{"a": {"b": {"c": 1.234}}}")),Utf8("a:b.c")) | ++---------------------------------------------------------------------------+ +| 1.234 | ++---------------------------------------------------------------------------+ + +SELECT json_get_string(to_json('{"a": {"b": {"c": "foo"}}}'), 'a.b:c'); + ++----------------------------------------------------------------------------+ +| json_get_string(to_json(Utf8("{"a": {"b": {"c": "foo"}}}")),Utf8("a.b:c")) | ++----------------------------------------------------------------------------+ +| foo | ++----------------------------------------------------------------------------+ + +SELECT json_get_bool(to_json('{"a": {"b": {"c": true}}}'), 'a.b["c"]'); + ++----------------------------------------------------------------------------+ +| json_get_bool(to_json(Utf8("{"a": {"b": {"c": true}}}")),Utf8("a.b["c"]")) | ++----------------------------------------------------------------------------+ +| true | ++----------------------------------------------------------------------------+ + +SELECT json_get_int(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); + ++--------------------------------------------------------------------------+ +| json_get_int(to_json(Utf8("{"a": {"b": {"c": {"d": 1}}}}")),Utf8("a.b")) | ++--------------------------------------------------------------------------+ +| | ++--------------------------------------------------------------------------+ + +SELECT json_get_string(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); + ++-----------------------------------------------------------------------------+ +| json_get_string(to_json(Utf8("{"a": {"b": {"c": {"d": 1}}}}")),Utf8("a.b")) | ++-----------------------------------------------------------------------------+ +| | ++-----------------------------------------------------------------------------+ + +-- test functions with table rows -- +CREATE TABLE jsons(j JSON, ts timestamp time index); + +Affected Rows: 0 + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": 1}}}'), 1); + +Affected Rows: 1 + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": 1.234}}}'), 2); + +Affected Rows: 1 + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": "foo"}}}'), 3); + +Affected Rows: 1 + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": true}}}'), 4); + +Affected Rows: 1 + +SELECT json_get_int(j, 'a.b.c') FROM jsons; + ++-------------------------------------+ +| json_get_int(jsons.j,Utf8("a.b.c")) | ++-------------------------------------+ +| 1 | +| | +| | +| 1 | ++-------------------------------------+ + +SELECT json_get_float(j, 'a["b"].c') FROM jsons; + ++------------------------------------------+ +| json_get_float(jsons.j,Utf8("a["b"].c")) | ++------------------------------------------+ +| 1.0 | +| 1.234 | +| | +| 1.0 | ++------------------------------------------+ + +SELECT json_get_string(j, 'a.b.c?(@ == 1)') FROM jsons; + ++-------------------------------------------------+ +| json_get_string(jsons.j,Utf8("a.b.c?(@ == 1)")) | ++-------------------------------------------------+ +| 1 | +| | +| | +| | ++-------------------------------------------------+ + +SELECT json_get_bool(j, 'a.b.c') FROM jsons; + ++--------------------------------------+ +| json_get_bool(jsons.j,Utf8("a.b.c")) | ++--------------------------------------+ +| | +| | +| | +| true | ++--------------------------------------+ + +SELECT json_get_int(j, 'a.b["c"]') FROM jsons; + ++----------------------------------------+ +| json_get_int(jsons.j,Utf8("a.b["c"]")) | ++----------------------------------------+ +| 1 | +| | +| | +| 1 | ++----------------------------------------+ + +DROP TABLE jsons; + +Affected Rows: 0 + +-- test functions with arrays -- +CREATE TABLE jsons(j JSON, ts timestamp time index); + +Affected Rows: 0 + +INSERT INTO jsons VALUES(to_json('["a", "bcde", "", "Long time ago, there is a little pig flying in the sky"]'), 1); + +Affected Rows: 1 + +INSERT INTO jsons VALUES(to_json('[true, false, false, false]'), 2); + +Affected Rows: 1 + +INSERT INTO jsons VALUES(to_json('[1, 0, -2147483649, 2147483648]'), 3); + +Affected Rows: 1 + +INSERT INTO jsons VALUES(to_json('[1.2, 3.1415926535897932384626, -3e123, 1e100]'), 4); + +Affected Rows: 1 + +SELECT json_get_int(j, '[0]') FROM jsons; + ++-----------------------------------+ +| json_get_int(jsons.j,Utf8("[0]")) | ++-----------------------------------+ +| | +| 1 | +| 1 | +| | ++-----------------------------------+ + +SELECT json_get_float(j, '[1]') FROM jsons; + ++-------------------------------------+ +| json_get_float(jsons.j,Utf8("[1]")) | ++-------------------------------------+ +| | +| 0.0 | +| 0.0 | +| 3.141592653589793 | ++-------------------------------------+ + +SELECT json_get_bool(j, '[2]') FROM jsons; + ++------------------------------------+ +| json_get_bool(jsons.j,Utf8("[2]")) | ++------------------------------------+ +| | +| false | +| | +| | ++------------------------------------+ + +SELECT json_get_string(j, '[3]') FROM jsons; + ++--------------------------------------------------------+ +| json_get_string(jsons.j,Utf8("[3]")) | ++--------------------------------------------------------+ +| Long time ago, there is a little pig flying in the sky | +| false | +| 2147483648 | +| 1e100 | ++--------------------------------------------------------+ + +DROP TABLE jsons; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/json.sql b/tests/cases/standalone/common/function/json.sql new file mode 100644 index 000000000000..c6214ae0f8b9 --- /dev/null +++ b/tests/cases/standalone/common/function/json.sql @@ -0,0 +1,56 @@ +-- json_get functions -- +SELECT json_get_int(to_json('{"a": {"b": {"c": 1}}}'), 'a.b.c'); + +SELECT json_get_float(to_json('{"a": {"b": {"c": 1.234}}}'), 'a:b.c'); + +SELECT json_get_string(to_json('{"a": {"b": {"c": "foo"}}}'), 'a.b:c'); + +SELECT json_get_bool(to_json('{"a": {"b": {"c": true}}}'), 'a.b["c"]'); + +SELECT json_get_int(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); + +SELECT json_get_string(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); + +-- test functions with table rows -- +CREATE TABLE jsons(j JSON, ts timestamp time index); + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": 1}}}'), 1); + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": 1.234}}}'), 2); + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": "foo"}}}'), 3); + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": true}}}'), 4); + +SELECT json_get_int(j, 'a.b.c') FROM jsons; + +SELECT json_get_float(j, 'a["b"].c') FROM jsons; + +SELECT json_get_string(j, 'a.b.c?(@ == 1)') FROM jsons; + +SELECT json_get_bool(j, 'a.b.c') FROM jsons; + +SELECT json_get_int(j, 'a.b["c"]') FROM jsons; + +DROP TABLE jsons; + +-- test functions with arrays -- +CREATE TABLE jsons(j JSON, ts timestamp time index); + +INSERT INTO jsons VALUES(to_json('["a", "bcde", "", "Long time ago, there is a little pig flying in the sky"]'), 1); + +INSERT INTO jsons VALUES(to_json('[true, false, false, false]'), 2); + +INSERT INTO jsons VALUES(to_json('[1, 0, -2147483649, 2147483648]'), 3); + +INSERT INTO jsons VALUES(to_json('[1.2, 3.1415926535897932384626, -3e123, 1e100]'), 4); + +SELECT json_get_int(j, '[0]') FROM jsons; + +SELECT json_get_float(j, '[1]') FROM jsons; + +SELECT json_get_bool(j, '[2]') FROM jsons; + +SELECT json_get_string(j, '[3]') FROM jsons; + +DROP TABLE jsons;