From 865ff6e9d45db567bb3e194419cbc52815d1e0f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonah=20Br=C3=BCchert?=
Date: Mon, 29 Apr 2024 17:41:47 +0200
Subject: [PATCH 1/2] Unescape csv strings

---
Review notes (free-form text between the "---" line and the first
"diff --git" line; not part of the commit message or of any hunk):

* csv.h adds unescape_quoted_string(std::string&). It searches for '"'
  and, whenever a quote is immediately followed by a second quote, erases
  the first of the two and resumes the search after the remaining one.
  A quote that is not followed by another quote is left in place.
* csv.h: parse_value(cstr&, std::string&) now calls
  unescape_quoted_string() after parse_arg(). parse_value(cstr&, cstr&)
  is only re-wrapped and still calls parse_arg() alone, so values parsed
  into a cstr keep their doubled quotes.
* csv_range.h: each column value is now filled through parse_value()
  instead of parse_arg().
* pipe_csv_test.cc: csv_escaped_string now expects ["asd", "bsd"] where it
  previously expected [""asd"", ""bsd""].
* NOTE(review): the unescape step runs for every std::string handled by
  parse_value(); nothing in this patch checks whether the field was
  actually quoted in the input. Confirm that collapsing "" inside unquoted
  fields is intended.
* NOTE(review): several lines in this copy look truncated, e.g.
  "template ::value, int> = 0>", "typename std::enable_if::type read(",
  "struct csv_range : public LineRange {" and the author address.
  Presumably angle-bracket spans were lost in extraction; line breaks and
  indentation below were restored by hand. Verify against the repository
  before applying.

 include/utl/parser/csv.h       | 22 ++++++++++++++++++++--
 include/utl/parser/csv_range.h |  2 +-
 test/parser/pipe_csv_test.cc   |  2 +-
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/include/utl/parser/csv.h b/include/utl/parser/csv.h
index 4558e0f..79049c5 100644
--- a/include/utl/parser/csv.h
+++ b/include/utl/parser/csv.h
@@ -53,6 +53,19 @@ inline void parse_column(cstr& s, T& arg) {
                                   adjust_for_quote + adjust_for_cr));
 }
 
+inline void unescape_quoted_string(std::string& arg) {
+  std::string::size_type found_at = 0;
+  while ((found_at = arg.find('"', found_at)) != std::string::npos) {
+    if (found_at < arg.size() - 1 && arg[found_at + 1] == '"') {
+      arg.erase(found_at, 1);  // Since the string is now one character shorter,
+                               // found_at now points to the next character
+      ++found_at;  // Skip following character ("), we are now after the ""
+    } else {
+      ++found_at;  // Continue search from next character
+    }
+  }
+}
+
 template
           ::value, int> = 0>
 inline void parse_value(cstr& s, IntType& arg) {
@@ -71,8 +84,13 @@ inline void parse_value(cstr& s, bool& arg) {
   s = s.skip_whitespace_front();
   parse_arg(s, arg);
 }
-inline void parse_value(cstr& s, std::string& arg) { parse_arg(s, arg); }
-inline void parse_value(cstr& s, cstr& arg) { parse_arg(s, arg); }
+inline void parse_value(cstr& s, std::string& arg) {
+  parse_arg(s, arg);
+  unescape_quoted_string(arg);
+}
+inline void parse_value(cstr& s, cstr& arg) {
+  parse_arg(s, arg);
+}
 
 template
 typename std::enable_if::type read(
diff --git a/include/utl/parser/csv_range.h b/include/utl/parser/csv_range.h
index 8714990..1d25042 100644
--- a/include/utl/parser/csv_range.h
+++ b/include/utl/parser/csv_range.h
@@ -94,7 +94,7 @@ struct csv_range : public LineRange {
       T t{};
       cista::for_each_field(t, [&, i = 0u](auto& f) mutable {
         if (row[i]) {
-          parse_arg(row[i], f.val());
+          parse_value(row[i], f.val());
         }
         ++i;
       });
diff --git a/test/parser/pipe_csv_test.cc b/test/parser/pipe_csv_test.cc
index b0d3509..beb0ffd 100644
--- a/test/parser/pipe_csv_test.cc
+++ b/test/parser/pipe_csv_test.cc
@@ -101,7 +101,7 @@ TEST(pipe_csv, csv_escaped_string) {
                       | vec();
 
   ASSERT_TRUE(result.size() == 1);
-  EXPECT_TRUE(result[0].foo_.val() == R"([""asd"", ""bsd""])");
+  EXPECT_TRUE(result[0].foo_.val() == R"(["asd", "bsd"])");
   EXPECT_TRUE(result[0].bar_.val() == "asd");
   EXPECT_TRUE(result[0].baz_.val() == "xxx");
 }
From 6f2c66f4477cd056d6328615b6816e1fbd89a755 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonah=20Br=C3=BCchert?=
Date: Thu, 2 May 2024 18:39:50 +0200
Subject: [PATCH 2/2] Add test to make sure we don't crash on invalid csv

---
Review notes (free-form text between the "---" line and the first
"diff --git" line; not part of the commit message or of any hunk):

* Appends TEST(pipe_csv, csv_invalid_escaped_string) after the existing
  csv_escaped_string test. The data row ends with the field "xxx"", which
  the in-test comment calls invalid.
* The test asserts that exactly one row is produced, with
  foo == ["asd", "bsd"], bar == asd and baz == xxx" (one literal quote
  kept at the end).
* NOTE(review): the header row is BAR,FOO,BAZ while the struct declares
  foo_, bar_, baz_ in that order, and the asserted values follow the header
  names. Presumably the csv_col template arguments (not visible in this
  copy) bind members to column names; confirm against the repository.

 test/parser/pipe_csv_test.cc | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/test/parser/pipe_csv_test.cc b/test/parser/pipe_csv_test.cc
index beb0ffd..c61427f 100644
--- a/test/parser/pipe_csv_test.cc
+++ b/test/parser/pipe_csv_test.cc
@@ -105,3 +105,24 @@ TEST(pipe_csv, csv_escaped_string) {
   EXPECT_TRUE(result[0].bar_.val() == "asd");
   EXPECT_TRUE(result[0].baz_.val() == "xxx");
 }
+
+TEST(pipe_csv, csv_invalid_escaped_string) {
+  struct dat {
+    csv_col foo_;
+    csv_col bar_;
+    csv_col baz_;
+  };
+
+  // This is invalid, but we need to make sure not to crash
+  constexpr auto const input = R"(BAR,FOO,BAZ
+"asd","[""asd"", ""bsd""]","xxx""
+)";
+  auto const result = line_range{make_buf_reader(input, {})}  //
+                      | csv()  //
+                      | vec();
+
+  ASSERT_TRUE(result.size() == 1);
+  EXPECT_TRUE(result[0].foo_.val() == R"(["asd", "bsd"])");
+  EXPECT_TRUE(result[0].bar_.val() == "asd");
+  EXPECT_TRUE(result[0].baz_.val() == R"(xxx")");
+}