Expose delimiter character in JSON reader options to JSON reader APIs (#17266)

Fixes #17261. Removes the delimiter symbol group from the whitespace normalization FST, since that FST is now run post-tokenization. Authors: - Shruti Shivakumar (https://github.com/shrshi) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Karthikeyan (https://github.com/karthikeyann) URL: #17266
rapidsai · Nov 9, 2024 · 5cbdcd0 · 5cbdcd0
1 parent 7a499f6
commit 5cbdcd0
Show file tree

Hide file tree

Showing 4 changed files with 49 additions and 32 deletions.
diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
@@ -57,11 +57,13 @@ void write_json(data_sink* sink,
 /**
  * @brief Normalize single quotes to double quotes using FST
  *
- * @param indata Input device buffer
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource to use for device memory allocation
+ * @param indata    Input device buffer
+ * @param delimiter Line-separating delimiter character in JSONL inputs
+ * @param stream    CUDA stream used for device memory operations and kernel launches
+ * @param mr        Device memory resource to use for device memory allocation
  */
 void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& indata,
+                             char delimiter,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr);
 

diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu
@@ -58,7 +58,7 @@ enum class dfa_symbol_group_id : uint32_t {
   DOUBLE_QUOTE_CHAR,  ///< Quote character SG: "
   SINGLE_QUOTE_CHAR,  ///< Quote character SG: '
   ESCAPE_CHAR,        ///< Escape character SG: '\'
-  NEWLINE_CHAR,       ///< Newline character SG: '\n'
+  DELIM_CHAR,         ///< Delimiter character SG
   OTHER_SYMBOLS,      ///< SG implicitly matching all other characters
   NUM_SYMBOL_GROUPS   ///< Total number of symbol groups
 };
@@ -72,13 +72,17 @@ constexpr auto TT_SEC            = dfa_states::TT_SEC;
 constexpr auto TT_NUM_STATES     = static_cast<StateT>(dfa_states::TT_NUM_STATES);
 constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
 
-// The i-th string representing all the characters of a symbol group
-std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const qna_sgs{
-  {{'\"'}, {'\''}, {'\\'}, {'\n'}}};
+auto get_sgid_lut(SymbolT delim)
+{
+  // The i-th string representing all the characters of a symbol group
+  std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> symbol_groups{
+    {{'\"'}, {'\''}, {'\\'}, {delim}}};
+  return symbol_groups;
+}
 
 // Transition table
 std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const qna_state_tt{{
-  /* IN_STATE      "       '       \       \n    OTHER  */
+  /* IN_STATE      "       '       \     <delim>    OTHER  */
   /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}},
   /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}},
   /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}},
@@ -199,28 +203,26 @@ struct TransduceToNormalizedQuotes {
 
 namespace normalize_whitespace {
 
+// We do not need a symbol group for the delimiter character since whitespace normalization
+// now occurs after tokenization.
 enum class dfa_symbol_group_id : uint32_t {
   DOUBLE_QUOTE_CHAR,   ///< Quote character SG: "
   ESCAPE_CHAR,         ///< Escape character SG: '\\'
-  NEWLINE_CHAR,        ///< Newline character SG: '\n'
   WHITESPACE_SYMBOLS,  ///< Whitespace characters SG: '\t' or ' '
   OTHER_SYMBOLS,       ///< SG implicitly matching all other characters
   NUM_SYMBOL_GROUPS    ///< Total number of symbol groups
 };
 // Alias for readability of symbol group ids
 constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
-// The i-th string representing all the characters of a symbol group
-std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
-  {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};
+
+std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{{{'"'}, {'\\'}, {' ', '\t'}}};
 
 /**
  * -------- FST states ---------
  * -----------------------------
  * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double
- *        |   quotes as well as any other character not enclosed by a string. Also handles
- *        |   newline character present within a string
- * TT_DQS | Double-quoted string state handling all characters within double quotes except
- *        |   newline character
+ *        |   quotes as well as any other character not enclosed by a string.
+ * TT_DQS | Double-quoted string state handling all characters within double quotes
  * TT_DEC | State handling escaped characters inside double-quoted string. Note that this
  *        |   state is necessary to process escaped double-quote characters. Without this
  *        |   state, whitespaces following escaped double quotes inside strings may be removed.
@@ -235,10 +237,10 @@ constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);
 
 // Transition table
 std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
-  {/* IN_STATE      "       \       \n    <SPC>   OTHER  */
-   /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}},
-   /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},
-   /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};
+  {/* IN_STATE      "       \     <SPC>   OTHER  */
+   /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS}},
+   /* TT_DQS */ {{TT_OOS, TT_DEC, TT_DQS, TT_DQS}},
+   /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};
 
 // The DFA's starting state
 constexpr StateT start_state = static_cast<StateT>(TT_OOS);
@@ -302,18 +304,19 @@ struct TransduceToNormalizedWS {
 namespace detail {
 
 void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& indata,
+                             char delimiter,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   static constexpr std::int32_t min_out = 0;
   static constexpr std::int32_t max_out = 2;
-  auto parser =
-    fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs),
-                          fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
-                          fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
-                            normalize_quotes::TransduceToNormalizedQuotes{}),
-                          stream);
+  auto parser                           = fst::detail::make_fst(
+    fst::detail::make_symbol_group_lut(normalize_quotes::get_sgid_lut(delimiter)),
+    fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
+    fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
+      normalize_quotes::TransduceToNormalizedQuotes{}),
+    stream);
 
   rmm::device_buffer outbuf(indata.size() * 2, stream, mr);
   cudf::detail::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);

diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
@@ -248,7 +248,8 @@ table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
   // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
   // invoke pre-processing FST
   if (reader_opts.is_enabled_normalize_single_quotes()) {
-    normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref());
+    normalize_single_quotes(
+      bufview, reader_opts.get_delimiter(), stream, cudf::get_current_device_resource_ref());
   }
 
   auto buffer =

diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp
@@ -34,7 +34,9 @@
 // Base test fixture for tests
 struct JsonNormalizationTest : public cudf::test::BaseFixture {};
 
-void run_test(std::string const& host_input, std::string const& expected_host_output)
+void run_test(std::string const& host_input,
+              std::string const& expected_host_output,
+              char delimiter = '\n')
 {
   // RMM memory resource
   std::shared_ptr<rmm::mr::device_memory_resource> rsc =
@@ -46,7 +48,7 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou
 
   // Preprocessing FST
   cudf::io::datasource::owning_buffer<rmm::device_buffer> device_data(std::move(device_input));
-  cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get());
+  cudf::io::json::detail::normalize_single_quotes(device_data, delimiter, stream_view, rsc.get());
 
   std::string preprocessed_host_output(device_data.size(), 0);
   CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(),
@@ -172,29 +174,38 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces
   run_test(input, output);
 }
 
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_NonNewlineDelimiter)
+{
+  std::string input{"{\"a\": \"1\n2\"}z{\'a\': 12}"};
+  std::string output{"{\"a\": \"1\n2\"}z{\"a\": 12}"};
+  run_test(input, output, 'z');
+}
+
 TEST_F(JsonNormalizationTest, ReadJsonOption)
 {
   // RMM memory resource
   std::shared_ptr<rmm::mr::device_memory_resource> rsc =
     std::make_shared<rmm::mr::cuda_memory_resource>();
 
   // Test input
-  std::string const host_input = R"({"A":'TEST"'})";
+  std::string const host_input = R"({"a": "1\n2"}h{'a': 12})";
   cudf::io::json_reader_options input_options =
     cudf::io::json_reader_options::builder(
       cudf::io::source_info{host_input.data(), host_input.size()})
       .lines(true)
+      .delimiter('h')
       .normalize_single_quotes(true);
 
   cudf::io::table_with_metadata processed_table =
     cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get());
 
   // Expected table
-  std::string const expected_input = R"({"A":"TEST\""})";
+  std::string const expected_input = R"({"a": "1\n2"}h{"a": 12})";
   cudf::io::json_reader_options expected_input_options =
     cudf::io::json_reader_options::builder(
       cudf::io::source_info{expected_input.data(), expected_input.size()})
-      .lines(true);
+      .lines(true)
+      .delimiter('h');
 
   cudf::io::table_with_metadata expected_table =
     cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get());