Skip to content

Commit

Permalink
Expose delimiter character in JSON reader options to JSON reader APIs (
Browse files Browse the repository at this point in the history
…#17266)

Fixes #17261 
Removes the delimiter symbol group from the whitespace normalization FST, since whitespace normalization now runs after tokenization.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #17266
  • Loading branch information
shrshi authored Nov 9, 2024
1 parent 7a499f6 commit 5cbdcd0
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 32 deletions.
8 changes: 5 additions & 3 deletions cpp/include/cudf/io/detail/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,13 @@ void write_json(data_sink* sink,
/**
* @brief Normalize single quotes to double quotes using FST
*
* @param indata Input device buffer
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
* @param indata Input device buffer
* @param delimiter Line-separating delimiter character in JSONL inputs
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
*/
void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& indata,
char delimiter,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

Expand Down
49 changes: 26 additions & 23 deletions cpp/src/io/json/json_normalization.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ enum class dfa_symbol_group_id : uint32_t {
DOUBLE_QUOTE_CHAR, ///< Quote character SG: "
SINGLE_QUOTE_CHAR, ///< Quote character SG: '
ESCAPE_CHAR, ///< Escape character SG: '\'
NEWLINE_CHAR, ///< Newline character SG: '\n'
DELIM_CHAR, ///< Delimiter character SG
OTHER_SYMBOLS, ///< SG implicitly matching all other characters
NUM_SYMBOL_GROUPS ///< Total number of symbol groups
};
Expand All @@ -72,13 +72,17 @@ constexpr auto TT_SEC = dfa_states::TT_SEC;
constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);
constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);

// The i-th string representing all the characters of a symbol group
std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const qna_sgs{
{{'\"'}, {'\''}, {'\\'}, {'\n'}}};
/**
 * @brief Builds the explicit symbol groups of the quote-normalization DFA
 *
 * The i-th entry holds every character belonging to the i-th symbol group, in the order
 * double quote, single quote, escape character, delimiter. Only `NUM_SYMBOL_GROUPS - 1`
 * groups are listed; the remaining group is the one that implicitly matches all other
 * characters.
 *
 * @param delim Character placed in the delimiter symbol group
 * @return Array of per-group character lists
 */
auto get_sgid_lut(SymbolT delim)
{
  using symbol_group_list = std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1>;
  return symbol_group_list{{{'\"'}, {'\''}, {'\\'}, {delim}}};
}

// Transition table
std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const qna_state_tt{{
/* IN_STATE " ' \ \n OTHER */
/* IN_STATE " ' \ <delim> OTHER */
/* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}},
/* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}},
/* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}},
Expand Down Expand Up @@ -199,28 +203,26 @@ struct TransduceToNormalizedQuotes {

namespace normalize_whitespace {

// We do not need a symbol group for the delimiter character since whitespace normalization
// now occurs after tokenization.
enum class dfa_symbol_group_id : uint32_t {
DOUBLE_QUOTE_CHAR, ///< Quote character SG: "
ESCAPE_CHAR, ///< Escape character SG: '\\'
NEWLINE_CHAR, ///< Newline character SG: '\n'
WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' '
OTHER_SYMBOLS, ///< SG implicitly matching all other characters
NUM_SYMBOL_GROUPS ///< Total number of symbol groups
};
// Alias for readability of symbol group ids
constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
// The i-th string representing all the characters of a symbol group
std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
{{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};

std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{{{'"'}, {'\\'}, {' ', '\t'}}};

/**
* -------- FST states ---------
* -----------------------------
* TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double
* | quotes as well as any other character not enclosed by a string. Also handles
* | newline character present within a string
* TT_DQS | Double-quoted string state handling all characters within double quotes except
* | newline character
* | quotes as well as any other character not enclosed by a string.
* TT_DQS | Double-quoted string state handling all characters within double quotes
* TT_DEC | State handling escaped characters inside double-quoted string. Note that this
* | state is necessary to process escaped double-quote characters. Without this
* | state, whitespaces following escaped double quotes inside strings may be removed.
Expand All @@ -235,10 +237,10 @@ constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);

// Transition table
std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
{/* IN_STATE " \ \n <SPC> OTHER */
/* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}},
/* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},
/* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};
{/* IN_STATE " \ <SPC> OTHER */
/* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS}},
/* TT_DQS */ {{TT_OOS, TT_DEC, TT_DQS, TT_DQS}},
/* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};

// The DFA's starting state
constexpr StateT start_state = static_cast<StateT>(TT_OOS);
Expand Down Expand Up @@ -302,18 +304,19 @@ struct TransduceToNormalizedWS {
namespace detail {

void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& indata,
char delimiter,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
static constexpr std::int32_t min_out = 0;
static constexpr std::int32_t max_out = 2;
auto parser =
fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs),
fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
normalize_quotes::TransduceToNormalizedQuotes{}),
stream);
auto parser = fst::detail::make_fst(
fst::detail::make_symbol_group_lut(normalize_quotes::get_sgid_lut(delimiter)),
fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
normalize_quotes::TransduceToNormalizedQuotes{}),
stream);

rmm::device_buffer outbuf(indata.size() * 2, stream, mr);
cudf::detail::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/io/json/read_json.cu
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,8 @@ table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
// If input JSON buffer has single quotes and option to normalize single quotes is enabled,
// invoke pre-processing FST
if (reader_opts.is_enabled_normalize_single_quotes()) {
normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref());
normalize_single_quotes(
bufview, reader_opts.get_delimiter(), stream, cudf::get_current_device_resource_ref());
}

auto buffer =
Expand Down
21 changes: 16 additions & 5 deletions cpp/tests/io/json/json_quote_normalization_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@
// Base test fixture for tests
struct JsonNormalizationTest : public cudf::test::BaseFixture {};

void run_test(std::string const& host_input, std::string const& expected_host_output)
void run_test(std::string const& host_input,
std::string const& expected_host_output,
char delimiter = '\n')
{
// RMM memory resource
std::shared_ptr<rmm::mr::device_memory_resource> rsc =
Expand All @@ -46,7 +48,7 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou

// Preprocessing FST
cudf::io::datasource::owning_buffer<rmm::device_buffer> device_data(std::move(device_input));
cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get());
cudf::io::json::detail::normalize_single_quotes(device_data, delimiter, stream_view, rsc.get());

std::string preprocessed_host_output(device_data.size(), 0);
CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(),
Expand Down Expand Up @@ -172,29 +174,38 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces
run_test(input, output);
}

// Quote normalization with a custom record delimiter ('z' instead of '\n').
// The first record keeps a newline inside its double-quoted string unchanged; the second
// record's single-quoted key is expected to come back double-quoted.
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_NonNewlineDelimiter)
{
  constexpr char record_delimiter = 'z';
  std::string const raw_records{"{\"a\": \"1\n2\"}z{\'a\': 12}"};
  std::string const normalized_records{"{\"a\": \"1\n2\"}z{\"a\": 12}"};
  run_test(raw_records, normalized_records, record_delimiter);
}

TEST_F(JsonNormalizationTest, ReadJsonOption)
{
// RMM memory resource
std::shared_ptr<rmm::mr::device_memory_resource> rsc =
std::make_shared<rmm::mr::cuda_memory_resource>();

// Test input
std::string const host_input = R"({"A":'TEST"'})";
std::string const host_input = R"({"a": "1\n2"}h{'a': 12})";
cudf::io::json_reader_options input_options =
cudf::io::json_reader_options::builder(
cudf::io::source_info{host_input.data(), host_input.size()})
.lines(true)
.delimiter('h')
.normalize_single_quotes(true);

cudf::io::table_with_metadata processed_table =
cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get());

// Expected table
std::string const expected_input = R"({"A":"TEST\""})";
std::string const expected_input = R"({"a": "1\n2"}h{"a": 12})";
cudf::io::json_reader_options expected_input_options =
cudf::io::json_reader_options::builder(
cudf::io::source_info{expected_input.data(), expected_input.size()})
.lines(true);
.lines(true)
.delimiter('h');

cudf::io::table_with_metadata expected_table =
cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get());
Expand Down

0 comments on commit 5cbdcd0

Please sign in to comment.