Skip to content

Commit

Permalink
E021-06: SUBSTRING function (#113)
Browse files Browse the repository at this point in the history
`SUBSTRING` can be constructed in several forms:

  SUBSTRING(
    value
    FROM start_position
    [ FOR string_length ]
    [ USING { CHARACTERS | OCTETS } ]
  )

`start_position` starts at 1 for the first character or byte. If
`start_position` is out of bounds (either before the start or after the
end) the returned value will be empty.

If `string_length` is not provided, all characters or bytes until the
end will be included. Otherwise, at most `string_length` characters or
bytes will be included. If `string_length` extends beyond the end of the
string, the result stops at the end of the string.

If `CHARACTERS` is specified the `start_position` and `string_length`
will count in characters (this works with multibyte characters) whereas
`OCTETS` will strictly count in bytes. If `USING` is not provided,
`CHARACTERS` will be used.
  • Loading branch information
elliotchance authored Jul 14, 2022
1 parent 25d4b4c commit f84fb96
Show file tree
Hide file tree
Showing 9 changed files with 496 additions and 2 deletions.
42 changes: 42 additions & 0 deletions docs/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,48 @@ Matching is case-sensitive.
VALUES POSITION('xx' IN 'hello Hello');
-- COL1: 0
SUBSTRING(CHARACTER VARYING FROM INTEGER ...) CHARACTER VARYING
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

``SUBSTRING`` can be constructed in several forms:

.. code-block:: text
SUBSTRING(
value
FROM start_position
[ FOR string_length ]
[ USING { CHARACTERS | OCTETS } ]
)
``start_position`` starts at 1 for the first character or byte. If
``start_position`` is out of bounds (either before the start or after the end)
the returned value will be empty.

If ``string_length`` is not provided, all characters or bytes until the end will
be included. Otherwise, at most ``string_length`` characters or bytes will be
included. If ``string_length`` extends beyond the end of the string, the result
stops at the end of the string.

If ``CHARACTERS`` is specified the ``start_position`` and ``string_length`` will
count in characters (this works with multibyte characters) whereas ``OCTETS``
will strictly count in bytes. If ``USING`` is not provided, ``CHARACTERS`` will
be used.

.. code-block:: sql
VALUES SUBSTRING('hello' FROM 2);
-- COL1: ello
VALUES SUBSTRING('hello' FROM 20);
-- COL1:
VALUES SUBSTRING('hello world' FROM 3 FOR 5);
-- COL1: llo w
VALUES SUBSTRING('Жabڣc' FROM 4 USING OCTETS);
-- COL1: bڣc
``UPPER(CHARACTER VARYING) CHARACTER VARYING``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
2 changes: 1 addition & 1 deletion docs/sql-compliance.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ As of the latest version (or at least the version of this documentation)
* - ✅ E021-05
- ``OCTET_LENGTH`` function

* - E021-06
* - ✅ E021-06
- ``SUBSTRING`` function

* - ✅ E021-07
Expand Down
27 changes: 26 additions & 1 deletion grammar.bnf
Original file line number Diff line number Diff line change
Expand Up @@ -953,7 +953,8 @@
| DESC -> no

<character value function> /* Expr */ ::=
<fold>
<character substring function>
| <fold>

<fold> /* Expr */ ::=
UPPER <left paren> <character value expression> <right paren> -> upper
Expand Down Expand Up @@ -1038,3 +1039,27 @@

<time fractional seconds precision> /* int */ ::=
<unsigned integer>

<character substring function> /* Expr */ ::=
SUBSTRING <left paren> <character value expression>
FROM <start position> <right paren> -> substring1
| SUBSTRING <left paren> <character value expression>
FROM <start position>
FOR <string length> <right paren> -> substring2
| SUBSTRING <left paren> <character value expression>
FROM <start position>
USING <char length units> <right paren> -> substring3
| SUBSTRING <left paren> <character value expression>
FROM <start position>
FOR <string length>
USING <char length units> <right paren> -> substring4

<start position> /* Expr */ ::=
<numeric value expression>

<string length> /* Expr */ ::=
<numeric value expression>

<char length units> /* string */ ::=
CHARACTERS
| OCTETS
68 changes: 68 additions & 0 deletions tests/substring.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
EXPLAIN VALUES SUBSTRING('hello' FROM 3);
-- EXPLAIN: VALUES (COL1 CHARACTER VARYING) = ROW(SUBSTRING('hello' FROM 3 USING CHARACTERS))

EXPLAIN VALUES SUBSTRING('hello world' FROM 3 FOR 5);
-- EXPLAIN: VALUES (COL1 CHARACTER VARYING) = ROW(SUBSTRING('hello world' FROM 3 FOR 5 USING CHARACTERS))

EXPLAIN VALUES SUBSTRING('hello world' FROM 3 USING CHARACTERS);
-- EXPLAIN: VALUES (COL1 CHARACTER VARYING) = ROW(SUBSTRING('hello world' FROM 3 USING CHARACTERS))

EXPLAIN VALUES SUBSTRING('hello world' FROM 3 FOR 5 USING CHARACTERS);
-- EXPLAIN: VALUES (COL1 CHARACTER VARYING) = ROW(SUBSTRING('hello world' FROM 3 FOR 5 USING CHARACTERS))

EXPLAIN VALUES SUBSTRING('hello world' FROM 3 FOR 2 + 3 USING OCTETS);
-- EXPLAIN: VALUES (COL1 CHARACTER VARYING) = ROW(SUBSTRING('hello world' FROM 3 FOR 2 + 3 USING OCTETS))

VALUES SUBSTRING('hello' FROM 0);
-- COL1:

VALUES SUBSTRING('hello' FROM 2);
-- COL1: ello

VALUES SUBSTRING('hello' FROM 4);
-- COL1: lo

VALUES SUBSTRING('hello' FROM 5);
-- COL1: o

VALUES SUBSTRING('hello' FROM 6);
-- COL1:

VALUES SUBSTRING('hello' FROM 20);
-- COL1:

VALUES SUBSTRING('hello' FROM -1);
-- COL1:

VALUES SUBSTRING('hello world' FROM 3 FOR 5);
-- COL1: llo w

VALUES SUBSTRING('hello world' FROM 3 USING CHARACTERS);
-- COL1: llo world

VALUES SUBSTRING('hello world' FROM 3 FOR 5 USING CHARACTERS);
-- COL1: llo w

VALUES SUBSTRING('Жabڣc' FROM 1);
-- COL1: Жabڣc

VALUES SUBSTRING('Жabڣc' FROM 2);
-- COL1: abڣc

VALUES SUBSTRING('Жabڣc' FROM 1 FOR 1);
-- COL1: Ж

VALUES SUBSTRING('Жabڣc' FROM 1 FOR 2);
-- COL1: Жa

VALUES SUBSTRING('Жabڣc' FROM 3 USING OCTETS);
-- COL1: abڣc

VALUES SUBSTRING('Жabڣc' FROM 4 USING OCTETS);
-- COL1: bڣc

VALUES SUBSTRING('Жabڣc' FROM 1 FOR 2 USING OCTETS);
-- COL1: Ж

VALUES SUBSTRING('Жabڣc' FROM 3 FOR 2 USING OCTETS);
-- COL1: ab
29 changes: 29 additions & 0 deletions vsql/ast.v
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ type Expr = BetweenExpr
| QueryExpression
| RowExpr
| SimilarExpr
| SubstringExpr
| UnaryExpr
| Value

Expand Down Expand Up @@ -99,6 +100,9 @@ fn (e Expr) pstr(params map[string]Value) string {
SimilarExpr {
e.pstr(params)
}
SubstringExpr {
e.pstr(params)
}
UnaryExpr {
e.pstr(params)
}
Expand Down Expand Up @@ -524,3 +528,28 @@ struct DropSchemaStmt {
schema_name Identifier
behavior string // CASCADE or RESTRICT
}

// SubstringExpr is the AST node for a SUBSTRING(value FROM ... [FOR ...]
// [USING ...]) expression.
struct SubstringExpr {
// The expression whose string value is sliced.
value Expr
// 1-based start position (the evaluator subtracts 1 before slicing).
from Expr // NoExpr when missing
// Optional length; when absent the slice runs to the end of the value.
@for Expr // NoExpr when missing
// Counting unit. The evaluator counts in runes only when this is exactly
// 'CHARACTERS'; any other value is counted in bytes.
using string // CHARACTERS or OCTETS or ''
}

// str renders the expression as SQL text, delegating to pstr with an empty
// parameter map.
fn (e SubstringExpr) str() string {
	no_params := map[string]Value{}
	return e.pstr(no_params)
}

// pstr renders the expression as SQL text, passing params through to each
// sub-expression's own pstr.
//
// Only the clauses that are present are emitted:
//   SUBSTRING(<value> [FROM <from>] [FOR <for>] [USING <using>])
fn (e SubstringExpr) pstr(params map[string]Value) string {
	mut s := 'SUBSTRING(${e.value.pstr(params)}'

	if e.from !is NoExpr {
		s += ' FROM ${e.from.pstr(params)}'
	}

	if e.@for !is NoExpr {
		s += ' FOR ${e.@for.pstr(params)}'
	}

	// `using` is allowed to be '' (see the struct definition). Emitting
	// " USING )" in that case would not be valid SQL, so drop the clause.
	if e.using != '' {
		s += ' USING ${e.using}'
	}

	return s + ')'
}
37 changes: 37 additions & 0 deletions vsql/eval.v
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ fn eval_as_type(conn &Connection, data Row, e Expr, params map[string]Value) ?Ty
LocalTimestampExpr {
return new_type('TIMESTAMP WITHOUT TIME ZONE', 0)
}
SubstringExpr {
return new_type('CHARACTER VARYING', 0)
}
}
}

Expand Down Expand Up @@ -176,6 +179,9 @@ fn eval_as_value(conn &Connection, data Row, e Expr, params map[string]Value) ?V
SimilarExpr {
return eval_similar(conn, data, e, params)
}
SubstringExpr {
return eval_substring(conn, data, e, params)
}
UnaryExpr {
return eval_unary(conn, data, e, params)
}
Expand Down Expand Up @@ -338,6 +344,37 @@ fn eval_like(conn &Connection, data Row, e LikeExpr, params map[string]Value) ?V
return new_boolean_value(result)
}

// clamp_substring_length limits a requested SUBSTRING length to the number of
// units (characters or bytes) that remain after the start offset. Negative
// requests are treated as zero so the resulting slice range is always valid.
fn clamp_substring_length(requested int, remaining int) int {
	if requested < 0 {
		return 0
	}

	// Compare before adding to the offset so a huge request cannot overflow.
	if requested > remaining {
		return remaining
	}

	return requested
}

// eval_substring evaluates SUBSTRING(value FROM start [FOR length] [USING units]).
//
// - The start position is 1-based. If it falls before the first or after the
//   last unit, an empty string is returned.
// - Without FOR, everything from the start position to the end is returned.
// - With FOR, at most that many units are returned; a length that reaches past
//   the end is truncated to the end instead of producing an out-of-range slice.
// - Units are runes when e.using is 'CHARACTERS', otherwise bytes.
//
// Errors from evaluating any of the sub-expressions are propagated.
fn eval_substring(conn &Connection, data Row, e SubstringExpr, params map[string]Value) ?Value {
	value := eval_as_value(conn, data, e.value, params)?

	// Convert the 1-based SQL position into a 0-based offset.
	from := int((eval_as_value(conn, data, e.from, params)?).f64_value) - 1

	if e.using == 'CHARACTERS' {
		characters := value.string_value.runes()

		if from >= characters.len || from < 0 {
			return new_varchar_value('', 0)
		}

		remaining := characters.len - from
		mut @for := remaining
		if e.@for !is NoExpr {
			requested := int((eval_as_value(conn, data, e.@for, params)?).f64_value)
			@for = clamp_substring_length(requested, remaining)
		}

		return new_varchar_value(characters[from..from + @for].string(), 0)
	}

	// Any other unit counts in bytes.
	if from >= value.string_value.len || from < 0 {
		return new_varchar_value('', 0)
	}

	remaining := value.string_value.len - from
	mut @for := remaining
	if e.@for !is NoExpr {
		requested := int((eval_as_value(conn, data, e.@for, params)?).f64_value)
		@for = clamp_substring_length(requested, remaining)
	}

	return new_varchar_value(value.string_value.substr(from, from + @for), 0)
}

fn eval_binary(conn &Connection, data Row, e BinaryExpr, params map[string]Value) ?Value {
left := eval_as_value(conn, data, e.left, params)?
right := eval_as_value(conn, data, e.right, params)?
Expand Down
10 changes: 10 additions & 0 deletions vsql/expr.v
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ fn expr_is_agg(conn &Connection, e Expr) ?bool {
return nested_agg_unsupported(e)
}
}
SubstringExpr {
if expr_is_agg(conn, e.value)? || expr_is_agg(conn, e.from)?
|| expr_is_agg(conn, e.@for)? {
return nested_agg_unsupported(e)
}
}
UnaryExpr {
if expr_is_agg(conn, e.expr)? {
return nested_agg_unsupported(e)
Expand Down Expand Up @@ -118,6 +124,10 @@ fn resolve_identifiers(e Expr, tables map[string]Table) ?Expr {
return SimilarExpr{resolve_identifiers(e.left, tables)?, resolve_identifiers(e.right,
tables)?, e.not}
}
SubstringExpr {
return SubstringExpr{resolve_identifiers(e.value, tables)?, resolve_identifiers(e.from,
tables)?, resolve_identifiers(e.@for, tables)?, e.using}
}
UnaryExpr {
return UnaryExpr{e.op, resolve_identifiers(e.expr, tables)?}
}
Expand Down
Loading

0 comments on commit f84fb96

Please sign in to comment.