Skip to content

Commit

Permalink
Merge pull request #219 from bazsi/filterx-parse-csv-refinements
Browse files Browse the repository at this point in the history
Filterx parse csv refinements
  • Loading branch information
bazsi authored Jul 25, 2024
2 parents 51145fc + 0fe2636 commit 4c2c08d
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 9 deletions.
29 changes: 25 additions & 4 deletions lib/scanner/csv-scanner/csv-scanner.c
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,27 @@ _is_whitespace_char(const gchar *str)
}

static void
_skip_whitespace(const gchar **src)
_skip_whitespace(CSVScanner *self, const gchar **src)
{
while (_is_whitespace_char(*src))
(*src)++;
if (self->current_quote)
{
/* quoted value, all whitespace is to be removed, even if the delimiter is considered whitespace */
while (_is_whitespace_char(*src))
(*src)++;
}
else
{
/* In case the value is unquoted, delimiters are never considered
* whitespace. This matters when the delimiter is either a space or a
* tab: in those cases each delimiter starts the next value to be
* extracted, so it must not be skipped here. */
while (_is_whitespace_char(*src))
{
if (_strchr_optimized_for_single_char_haystack(self->options->delimiters, **src))
break;
(*src)++;
}
}
}

static void
Expand All @@ -184,7 +201,7 @@ _parse_left_whitespace(CSVScanner *self)
if ((self->options->flags & CSV_SCANNER_STRIP_WHITESPACE) == 0)
return;

_skip_whitespace(&self->src);
_skip_whitespace(self, &self->src);
}

static gint
Expand Down Expand Up @@ -379,6 +396,10 @@ _get_value_length_without_right_whitespace(CSVScanner *self)
{
gint len = self->current_value->len;

/* If the value was quoted, we can get rid of trailing whitespace even if it
* is also a delimiter character. In any other case we won't have delimiters
* as whitespace on the right side (as they started the next value). */

while (len > 0 && _is_whitespace_char(self->current_value->str + len - 1))
len--;

Expand Down
2 changes: 1 addition & 1 deletion lib/scanner/csv-scanner/csv-scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ typedef struct
CSV_STATE_FINISH,
} state;
const gchar *src;
gint current_column;
GString *current_value;
gint current_column;
gchar current_quote;
} CSVScanner;

Expand Down
79 changes: 79 additions & 0 deletions lib/scanner/csv-scanner/tests/test_csv_scanner.c
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,85 @@ Test(csv_scanner, partial_input)
csv_scanner_deinit(&scanner);
}

/*
 * With CSV_SCANNER_STRIP_WHITESPACE set and TAB as the delimiter, two adjacent
 * TABs in an unquoted input must produce an empty column between them rather
 * than being skipped as whitespace: "foo\t\tbaz" -> "foo", "", "baz".
 */
Test(csv_scanner, strip_whitespace_will_not_strip_delimiter_characters)
{
/* the literal 3 is presumably the expected column count (three columns are asserted below) -- TODO confirm */
csv_scanner_init(&scanner, _default_options_with_flags(3, CSV_SCANNER_STRIP_WHITESPACE), "foo\t\tbaz");
/* NOTE(review): the delimiter is changed after csv_scanner_init(); this
 * presumably works because the scanner refers to the shared `options`
 * object instead of copying it -- confirm against the test helpers. */
csv_scanner_options_set_delimiters(&options, "\t");

/* nothing has been scanned yet */
cr_expect(_column_index_equals(0));
cr_expect(!_scan_complete());

cr_expect(_scan_next());
cr_expect(_column_equals(0, "foo"));
cr_expect(!_scan_complete());

/* the second TAB terminates an empty column instead of being stripped */
cr_expect(_scan_next());
cr_expect(_column_equals(1, ""));
cr_expect(!_scan_complete());

cr_expect(_scan_next());
cr_expect(_column_equals(2, "baz"));
cr_expect(!_scan_complete());

/* go past the last column */
cr_expect(!_scan_next());
cr_expect(_scan_complete());
csv_scanner_deinit(&scanner);
}

/*
 * The first value is single-quoted and padded with TABs and spaces on both
 * sides of "foo"; the test expects all of that padding to be stripped even
 * though TAB is also the delimiter. Outside the quotes, TABs still separate
 * columns while the surrounding spaces are stripped, giving "foo", "", "baz".
 *
 * NOTE(review): this input is the quoted one, while the name of the following
 * test (…_and_quoted_values_…) mentions quoted values but uses an unquoted
 * input -- the two test names look swapped; confirm. "will_strips" in this
 * name is also a typo.
 */
Test(csv_scanner, strip_whitespace_will_strips_spaces_while_not_stripping_delimiter_characters)
{
/* the literal 3 is presumably the expected column count (three columns are asserted below) -- TODO confirm */
csv_scanner_init(&scanner, _default_options_with_flags(3, CSV_SCANNER_STRIP_WHITESPACE),
"'\t\t foo \t\t'\t \t baz ");
/* NOTE(review): the delimiter is changed after csv_scanner_init(); this
 * presumably works because the scanner refers to the shared `options`
 * object instead of copying it -- confirm against the test helpers. */
csv_scanner_options_set_delimiters(&options, "\t");

/* nothing has been scanned yet */
cr_expect(_column_index_equals(0));
cr_expect(!_scan_complete());

/* quoted value: TABs and spaces around foo are all removed */
cr_expect(_scan_next());
cr_expect(_column_equals(0, "foo"));
cr_expect(!_scan_complete());

/* only a single space sits between the two unquoted TABs, so the column is empty after stripping */
cr_expect(_scan_next());
cr_expect(_column_equals(1, ""));
cr_expect(!_scan_complete());

cr_expect(_scan_next());
cr_expect(_column_equals(2, "baz"));
cr_expect(!_scan_complete());

/* go past the last column */
cr_expect(!_scan_next());
cr_expect(_scan_complete());
csv_scanner_deinit(&scanner);
}

/*
 * Unquoted input with spaces around every value and TAB as the delimiter:
 * the spaces must be stripped on both sides of each value while every TAB
 * still starts a new column, giving "foo", "", "baz".
 *
 * NOTE(review): the name mentions quoted values, but this input contains no
 * quotes; the preceding test (…_will_strips_spaces_…) is the one with a
 * quoted value -- the two test names look swapped; confirm.
 */
Test(csv_scanner, strip_whitespace_and_quoted_values_will_strip_embedded_whitespace)
{
/* the literal 3 is presumably the expected column count (three columns are asserted below) -- TODO confirm */
csv_scanner_init(&scanner, _default_options_with_flags(3, CSV_SCANNER_STRIP_WHITESPACE), " foo \t \t baz ");
/* NOTE(review): the delimiter is changed after csv_scanner_init(); this
 * presumably works because the scanner refers to the shared `options`
 * object instead of copying it -- confirm against the test helpers. */
csv_scanner_options_set_delimiters(&options, "\t");

/* nothing has been scanned yet */
cr_expect(_column_index_equals(0));
cr_expect(!_scan_complete());

/* leading and trailing spaces are stripped, the TAB ends the column */
cr_expect(_scan_next());
cr_expect(_column_equals(0, "foo"));
cr_expect(!_scan_complete());

/* only a single space sits between the two TABs, so the column is empty after stripping */
cr_expect(_scan_next());
cr_expect(_column_equals(1, ""));
cr_expect(!_scan_complete());

cr_expect(_scan_next());
cr_expect(_column_equals(2, "baz"));
cr_expect(!_scan_complete());

/* go past the last column */
cr_expect(!_scan_next());
cr_expect(_scan_complete());
csv_scanner_deinit(&scanner);
}

Test(csv_scanner, greedy_column)
{
csv_scanner_init(&scanner, _default_options_with_flags(2, CSV_SCANNER_GREEDY), "foo,bar,baz");
Expand Down
9 changes: 6 additions & 3 deletions modules/csvparser/filterx-func-parse-csv.c
Original file line number Diff line number Diff line change
Expand Up @@ -285,15 +285,19 @@ _extract_opts(FilterXFunctionParseCSV *self, FilterXFunctionArgs *args, GError *
opt_flags &= ~CSV_SCANNER_GREEDY;
}

flag_val = filterx_function_args_get_named_literal_boolean(args, FILTERX_FUNC_PARSE_CSV_ARG_NAME_STRIP_WHITESPACES,
flag_val = filterx_function_args_get_named_literal_boolean(args, FILTERX_FUNC_PARSE_CSV_ARG_NAME_STRIP_WHITESPACE,
&exists,
&flag_err);
if (!exists)
flag_val = filterx_function_args_get_named_literal_boolean(args, FILTERX_FUNC_PARSE_CSV_ARG_NAME_STRIP_WHITESPACES,
&exists,
&flag_err);
if (exists)
{

if (flag_err)
{
error_str = FILTERX_FUNC_PARSE_CSV_ARG_NAME_STRIP_WHITESPACES " argument evaluation error";
error_str = FILTERX_FUNC_PARSE_CSV_ARG_NAME_STRIP_WHITESPACE " argument evaluation error";
goto error;
}

Expand Down Expand Up @@ -347,7 +351,6 @@ filterx_function_parse_csv_new(const gchar *function_name, FilterXFunctionArgs *
self->super.super.free_fn = _free;
csv_scanner_options_set_delimiters(&self->options, ",");
csv_scanner_options_set_quote_pairs(&self->options, "\"\"''");
csv_scanner_options_set_flags(&self->options, CSV_SCANNER_STRIP_WHITESPACE);
csv_scanner_options_set_dialect(&self->options, CSV_SCANNER_ESCAPE_NONE);

if (!_extract_args(self, args, error) ||
Expand Down
3 changes: 2 additions & 1 deletion modules/csvparser/filterx-func-parse-csv.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,15 @@
#define FILTERX_FUNC_PARSE_CSV_ARG_NAME_DELIMITER "delimiter"
#define FILTERX_FUNC_PARSE_CSV_ARG_NAME_STRING_DELIMITERS "string_delimiters"
#define FILTERX_FUNC_PARSE_CSV_ARG_NAME_DIALECT "dialect"
#define FILTERX_FUNC_PARSE_CSV_ARG_NAME_STRIP_WHITESPACE "strip_whitespace"
#define FILTERX_FUNC_PARSE_CSV_ARG_NAME_STRIP_WHITESPACES "strip_whitespaces"
#define FILTERX_FUNC_PARSE_CSV_ARG_NAME_GREEDY "greedy"
#define FILTERX_FUNC_PARSE_CSV_USAGE "Usage: parse_csv(msg_str [" \
FILTERX_FUNC_PARSE_CSV_ARG_NAME_COLUMNS"=json_array, " \
FILTERX_FUNC_PARSE_CSV_ARG_NAME_DELIMITER"=string, " \
FILTERX_FUNC_PARSE_CSV_ARG_NAME_STRING_DELIMITERS"=json_array, " \
FILTERX_FUNC_PARSE_CSV_ARG_NAME_DIALECT"=string, " \
FILTERX_FUNC_PARSE_CSV_ARG_NAME_STRIP_WHITESPACES"=boolean, " \
FILTERX_FUNC_PARSE_CSV_ARG_NAME_STRIP_WHITESPACE"=boolean, " \
FILTERX_FUNC_PARSE_CSV_ARG_NAME_GREEDY"=boolean])"
#define FILTERX_FUNC_PARSE_ERR_EMPTY_DELIMITER "Either '" \
FILTERX_FUNC_PARSE_CSV_ARG_NAME_DELIMITER"' or '" \
Expand Down

0 comments on commit 4c2c08d

Please sign in to comment.