Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wrong JSON key classification #161

Merged
merged 11 commits into from
Aug 4, 2024
Merged
4 changes: 4 additions & 0 deletions example/s3select_example.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,12 +402,16 @@ int process_json_query(const char* input_query,const char* fname)


size_t read_sz = input_file_stream.read(buff.data(),BUFFER_SIZE).gcount();
#ifdef DEBUG_CHUNK_READ
int chunk_count=0;
#endif
size_t bytes_read=0;
while(read_sz)
{
bytes_read += read_sz;
#ifdef DEBUG_CHUNK_READ
std::cout << "read next chunk " << chunk_count++ << ":" << read_sz << ":" << bytes_read << "\r";
#endif

result.clear();

Expand Down
77 changes: 42 additions & 35 deletions include/s3select.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,12 @@ struct push_json_from_clause : public base_ast_builder
};
static push_json_from_clause g_push_json_from_clause;

// AST-builder action for a single element of the JSON from-clause key path.
// The grammar attaches it to the json_path_element rule; builder() receives the
// matched characters of that element as the pointer pair (a, b).
struct push_json_from_clause_key_path : public base_ast_builder
{
void builder(s3select* self, const char* a, const char* b) const;
};
// Instance of the action; the g_ prefix follows the sibling builders (e.g. g_push_json_from_clause).
// NOTE(review): presumably this is the object BOOST_BIND_ACTION(push_json_from_clause_key_path) resolves to -- confirm against the macro.
static push_json_from_clause_key_path g_push_json_from_clause_key_path;

struct push_limit_clause : public base_ast_builder
{
void builder(s3select* self, const char* a, const char* b) const;
Expand Down Expand Up @@ -789,7 +795,7 @@ struct s3select : public bsc::grammar<s3select>

json_s3_object = ((S3SELECT_KW(JSON_ROOT_OBJECT)) >> *(bsc::str_p(".") >> json_path_element))[BOOST_BIND_ACTION(push_json_from_clause)];

json_path_element = bsc::lexeme_d[+( bsc::alnum_p | bsc::str_p("_")) ];
json_path_element = (bsc::lexeme_d[+( bsc::alnum_p | bsc::str_p("_") ) ] | bsc::str_p("*") | (string))[BOOST_BIND_ACTION(push_json_from_clause_key_path)];

object_path = "/" >> *( fs_type >> "/") >> fs_type;

Expand Down Expand Up @@ -915,11 +921,12 @@ struct s3select : public bsc::grammar<s3select>

variable = (variable_name >> "." >> variable_name) | variable_name;

// json_variable_name is the JSON projection, i.e. _1.a.b[10]
json_variable_name = bsc::str_p("_1") >> +("." >> (json_array | json_object) );

json_object = (variable_name)[BOOST_BIND_ACTION(push_json_object)];
json_object = (variable_name | string)[BOOST_BIND_ACTION(push_json_object)];

json_array = (variable_name >> +(bsc::str_p("[") >> number[BOOST_BIND_ACTION(push_array_number)] >> bsc::str_p("]")) )[BOOST_BIND_ACTION(push_json_array_name)];
json_array = ((variable_name | string) >> +(bsc::str_p("[") >> number[BOOST_BIND_ACTION(push_array_number)] >> bsc::str_p("]")) )[BOOST_BIND_ACTION(push_json_array_name)];
}


Expand Down Expand Up @@ -979,35 +986,33 @@ void push_from_clause::builder(s3select* self, const char* a, const char* b) con
self->getAction()->exprQ.clear();
}

void push_json_from_clause::builder(s3select* self, const char* a, const char* b) const
// Strips the double quotes from a JSON key-path token.
//
// A query that accesses a key containing a meta-char (e.g. a dot) must use the
// string-construct (double quotes); the engine needs the bare key for later processing.
//
// a, b    : the character range [a, b) of the matched token.
// returns : - the token unchanged, when it does not start with a double quote;
//           - otherwise the characters between the opening quote and the next double quote
//             (whatever follows the closing quote is dropped);
//           - when there is no closing quote, everything after the opening quote;
//           - an empty string for an empty range.
std::string json_path_remove_double_quote(const char* a, const char* b)
{
  std::string token(a, b);

  //inspect the copied token rather than *a: on an empty range `a` may not be dereferenceable,
  //and substr(1) on an empty token throws std::out_of_range.
  if(token.empty() || token.front() != '"') //TODO single quote ?
  {
    return token;
  }

  const std::string::size_type closing_quote = token.find('"', 1);
  if(closing_quote == std::string::npos)
  {
    //unterminated quote: keep everything after the opening quote
    return token.substr(1);
  }

  return token.substr(1, closing_quote - 1);
}

void push_json_from_clause_key_path::builder(s3select* self, const char* a, const char* b) const
{
std::string token(a, b),table_name,alias_name;

//TODO handle the star-operation ('*') in from-clause. build the parameters for json-reader search-api's.
std::vector<std::string> variable_key_path;
const char* delimiter = ".";
auto pos = token.find(delimiter);
std::string token = json_path_remove_double_quote(a,b);
self->getAction()->json_from_clause.push_back(token);
}

if(pos != std::string::npos)
{
token = token.substr(strlen(JSON_ROOT_OBJECT)+1,token.size());
pos = token.find(delimiter);
do
{
variable_key_path.push_back(token.substr(0,pos));
if(pos != std::string::npos)
token = token.substr(pos+1,token.size());
else
token = "";
pos = token.find(delimiter);
}while(token.size());
}
else
void push_json_from_clause::builder(s3select* self, const char* a, const char* b) const
{
if(self->getAction()->json_from_clause.size() == 0)
{
variable_key_path.push_back(JSON_ROOT_OBJECT);
self->getAction()->json_from_clause.push_back(JSON_ROOT_OBJECT);
}

self->getAction()->json_from_clause = variable_key_path;
}

void push_limit_clause::builder(s3select* self, const char* a, const char* b) const
Expand Down Expand Up @@ -1129,7 +1134,6 @@ void push_json_variable::builder(s3select* self, const char* a, const char* b) c
{//purpose: handle the use case of json-variable structure (_1.a.b.c)

std::string token(a, b);
std::vector<std::string> variable_key_path;

//the following flow determine the index per json variable reside on statement.
//per each discovered json_variable, it search the json-variables-vector whether it already exists.
Expand Down Expand Up @@ -1159,7 +1163,8 @@ void push_array_number::builder(s3select* self, const char* a, const char* b) co

void push_json_array_name::builder(s3select* self, const char* a, const char* b) const
{
std::string token(a, b);
std::string token = json_path_remove_double_quote(a,b);

size_t found = token.find("[");
std::string array_name = token.substr(0,found);

Expand All @@ -1186,7 +1191,7 @@ void push_json_array_name::builder(s3select* self, const char* a, const char* b)

void push_json_object::builder(s3select* self, const char* a, const char* b) const
{
std::string token(a, b);
std::string token = json_path_remove_double_quote(a,b);

//DEBUG - TEMP std::cout << "push_json_object " << token << std::endl;

Expand Down Expand Up @@ -3225,10 +3230,6 @@ class json_object : public base_s3object
f_push_key_value_into_scratch_area_per_star_operation = [this](s3selectEngine::scratch_area::json_key_value_t& key_value)
{return push_key_value_into_scratch_area_per_star_operation(key_value);};

//setting the container for all json-variables, to be extracted by the json reader
JsonHandler.set_statement_json_variables(query->get_json_variables_access());


//calling to getMatchRow. processing a single row per each call.
JsonHandler.set_s3select_processing_callback(f_sql);
//upon exact match between input-json-key-path and sql-statement-variable-path the callback pushes to scratch area
Expand Down Expand Up @@ -3270,6 +3271,9 @@ class json_object : public base_s3object
}

m_sa->set_parquet_type();//TODO json type

//setting the container for all json-variables, to be extracted by the json reader
JsonHandler.set_statement_json_variables(query->get_json_variables_access());
}

json_object(s3select* query):base_s3object(query),m_processed_bytes(0),m_end_of_stream(false),m_row_count(0),star_operation_ind(false),m_init_json_processor_ind(false)
Expand Down Expand Up @@ -3337,7 +3341,10 @@ class json_object : public base_s3object

int push_key_value_into_scratch_area_per_star_operation(s3selectEngine::scratch_area::json_key_value_t& key_value)
{
m_sa->get_star_operation_cont()->push_back( key_value );
//upon star-operation on nested JSON, there could be many keys in a single row (actually, there is no limitation).
//for many cases these keys are duplicated in the scope of a single-row (row is defined according to SQL statement).
//the following routine saves only unique keys.
m_sa->json_push_key_value_per_star_operation(key_value);
return 0;
}

Expand Down
11 changes: 11 additions & 0 deletions include/s3select_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -2268,6 +2268,17 @@ the whole system resides in a container [ docker pull galsl/fedora_38:tpcds_v2 ]
-- avoid timeout upon long processing #152
-- replace assert with an exception to avoid crashing the process #151
-- fix for the use-case of not-operator on a string #153

-- the negation-operation may cause a wrong-build of the AST, that later may cause a crash. that operation is missing handling of several operators. #160
-- the crash happened upon calling more than once the parse_query, the second call accessed an incomplete object in the AST. #160

-- fix for identifies key-value as key-object or key-array #161
-- wrong initialization of the JSON parser engine had caused missing a projection-key-path upon using different combinations of the from-clause path. #161
-- a fix for copy-constructor, upon a JSON value is an empty string, it causes a wrong result. #161
-- modification for JSON star operation, the new-type container saves only unique keys, to avoid high memory consumption. #161
-- the from-clause can handle a wild-card. i.e. upon wild-card(*) it skips the corresponding part in projection-key-path. #161
-- key-path may include meta-char(like a dot) select _1."i.e."[0] the "i.e." is part of the key-path. #161

)";

_fn_engine_version()
Expand Down
48 changes: 41 additions & 7 deletions include/s3select_json_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -340,12 +340,12 @@ void key()

if(reader_position_state().required_path.size())//current state is a key
{
std::vector<std::string>* filter = &reader_position_state().required_path;
std::vector<std::string>* projection_key_path = &reader_position_state().required_path;
auto required_key_depth_size = reader_position_state().required_key_depth_size;
if(std::equal((*key_path).begin()+(*from_clause).size() + required_key_depth_size, //key-path-start-point + from-clause-depth-size + key-depth
(*key_path).end(),
(*filter).begin(),
(*filter).end(), iequal_predicate))
(*projection_key_path).begin(),
(*projection_key_path).end(), iequal_predicate))
{
increase_current_state();//key match according to user request, advancing to the next state
}
Expand Down Expand Up @@ -640,15 +640,45 @@ class JsonParserHandler : public rapidjson::BaseReaderHandler<rapidjson::UTF8<>,
m_current_depth_non_anonymous++;
}

if(from_clause.size() == 0 || std::equal(key_path.begin(), key_path.end(), from_clause.begin(), from_clause.end(), iequal_predicate)) {
prefix_match = true;
}

variable_match_operations.key();

return true;
}

template<typename predicate>
bool from_clause_matcher(std::vector<std::string>& _key_path,
                         std::vector<std::string>& _from_clause,
                         predicate p)
{
  //walks the from-clause and the key-path side by side, part by part.
  //a from-clause part equal to '*' is a wild-card: it matches whatever the key-path holds
  //at the same position; any other part must satisfy the predicate p(key-part, from-part).
  //
  //example: from-clause = a.*.c ; key-path = a.b.c
  //the '*' is in the second position, so 'b' is accepted and the comparison moves on to the third part.
  //
  //returns true once every from-clause part has been matched (the key-path may be longer);
  //returns false when the key-path ends first or a part does not match.

  auto key_part = _key_path.begin();

  for(auto from_part = _from_clause.begin(); from_part != _from_clause.end(); ++from_part, ++key_part)
  {
    if(key_part == _key_path.end())
    {
      return false;
    }

    const bool is_wildcard = (*from_part == "*");
    if(!is_wildcard && !p(*key_part, *from_part))
    {
      return false;
    }
  }

  return true;
}


void set_prefix_match(){
if(from_clause.size() == 0 || from_clause_matcher(key_path, from_clause, iequal_predicate)) {
prefix_match = true; //it is not prefix_match in the case its a key/value . it is a prefix match in the case it is a key of array or key of an object
}
}

bool is_already_row_started()
{
if(state == row_state::OBJECT_START_ROW || state == row_state::ARRAY_START_ROW)
Expand All @@ -658,6 +688,8 @@ class JsonParserHandler : public rapidjson::BaseReaderHandler<rapidjson::UTF8<>,
}

bool StartObject() {
set_prefix_match();

json_element_state.push_back(OBJECT_STATE);
m_current_depth++;
if(key_path.size()){
Expand Down Expand Up @@ -690,6 +722,8 @@ class JsonParserHandler : public rapidjson::BaseReaderHandler<rapidjson::UTF8<>,
}

bool StartArray() {
set_prefix_match();

json_element_state.push_back(ARRAY_STATE);
m_current_depth++;
if(key_path.size()){
Expand Down
Loading
Loading