From de24e27d1574f284f6951c96a47215fb4a662b81 Mon Sep 17 00:00:00 2001 From: Victor Petrovykh Date: Wed, 21 Feb 2024 04:47:52 -0500 Subject: [PATCH] Add endianness to int converter functions. Add `std::Endian` enum with two values `Big` and `Little`. Update the bytes <-> int converter functions with a required `Endian` parameter. Update the docs to reflect these changes. --- docs/stdlib/bytes.rst | 56 ++++++++++--- docs/stdlib/numbers.rst | 50 +++++++++--- edb/buildmeta.py | 2 +- edb/lib/std/70-converters.edgeql | 133 ++++++++++++++++++++++++++----- tests/test_edgeql_functions.py | 71 +++++++++++------ 5 files changed, 245 insertions(+), 67 deletions(-) diff --git a/docs/stdlib/bytes.rst b/docs/stdlib/bytes.rst index 43f27bdb75c..228820ddc05 100644 --- a/docs/stdlib/bytes.rst +++ b/docs/stdlib/bytes.rst @@ -12,6 +12,9 @@ Bytes * - :eql:type:`bytes` - Byte sequence + * - :eql:type:`Endian` + - An enum for indicating integer value encoding. + * - :eql:op:`bytes[i] <bytesidx>` - :eql:op-desc:`bytesidx` @@ -98,6 +101,36 @@ Bytes db> select <bytes>to_json("\"SGVsbG8gRWRnZURCIQ==\""); {b'Hello EdgeDB!'} + +---------- + + +.. eql:type:: std::Endian + + .. versionadded:: 5.0 + + An enum for indicating integer value encoding. + + This enum is used by the :eql:func:`to_int16`, :eql:func:`to_int32`, + :eql:func:`to_int64` and the :eql:func:`to_bytes` converters working with + :eql:type:`bytes` and integers. + + ``Endian.Big`` stands for big-endian encoding going from most significant + byte to least. ``Endian.Little`` stands for little-endian encoding going + from least to most significant byte. + + .. code-block:: edgeql-repl + + db> select to_bytes(<int32>16908295, Endian.Big); + {b'\x01\x02\x00\x07'} + db> select to_int32(b'\x01\x02\x00\x07', Endian.Big); + {16908295} + db> select to_bytes(<int32>16908295, Endian.Little); + {b'\x07\x00\x02\x01'} + db> select to_int32(b'\x07\x00\x02\x01', Endian.Little); + {16908295} + + ---------- @@ -148,10 +181,9 @@ Bytes --------- .. 
eql:function:: std::to_bytes(s: str) -> bytes - std::to_bytes(val: int16) -> bytes - std::to_bytes(val: int32) -> bytes - std::to_bytes(val: int64) -> bytes - std::to_bytes(val: int64) -> bytes + std::to_bytes(val: int16, endian: Endian) -> bytes + std::to_bytes(val: int32, endian: Endian) -> bytes + std::to_bytes(val: int64, endian: Endian) -> bytes std::to_bytes(val: uuid) -> bytes :index: encode stringencoder @@ -167,18 +199,24 @@ Bytes db> select to_bytes('テキスト'); {b'\xe3\x83\x86\xe3\x82\xad\xe3\x82\xb9\xe3\x83\x88'} - The integer values are encoded as big-endian (most significant bit comes - first) byte strings: + The integer values can be encoded as big-endian (most significant byte + comes first) byte strings: .. code-block:: edgeql-repl - db> select to_bytes(<int16>31); + db> select to_bytes(<int16>31, Endian.Big); {b'\x00\x1f'} - db> select to_bytes(<int32>31); + db> select to_bytes(<int32>31, Endian.Big); {b'\x00\x00\x00\x1f'} - db> select to_bytes(123456789123456789); + db> select to_bytes(123456789123456789, Endian.Big); {b'\x01\xb6\x9bK\xac\xd0_\x15'} + .. note:: + + Due to underlying implementation details, using big-endian encoding + results in slightly faster performance of ``to_bytes`` when converting + integers. + The UUID values are converted to the underlying string of 16 bytes: .. code-block:: edgeql-repl diff --git a/docs/stdlib/numbers.rst b/docs/stdlib/numbers.rst index ed28ac8bf8b..9fc882553e1 100644 --- a/docs/stdlib/numbers.rst +++ b/docs/stdlib/numbers.rst @@ -135,7 +135,8 @@ between all numeric types. All numeric types can also be cast to and from :eql:type:`str` and :eql:type:`json`. ----------- +Definitions +----------- .. eql:type:: std::int16 @@ -814,7 +815,7 @@ from :eql:type:`str` and :eql:type:`json`. .. eql:function:: std::to_int16(s: str, fmt: optional str={}) -> int16 - std::to_int16(val: bytes) -> int16 + std::to_int16(val: bytes, endian: Endian) -> int16 :index: parse int16 @@ -831,20 +832,27 @@ from :eql:type:`str` and :eql:type:`json`. 
db> select to_int16('23%', '99%'); {23} - The bytes conversion function expects exactly 2 bytes using big-endian - representation. + The bytes conversion function expects exactly 2 bytes with the specified + endianness. .. code-block:: edgeql-repl - db> select to_int16(b'\x00\x07'); + db> select to_int16(b'\x00\x07', Endian.Big); + {7} + db> select to_int16(b'\x07\x00', Endian.Little); {7} + .. note:: + + Due to underlying implementation details, using big-endian encoding + results in slightly faster performance of ``to_int16``. + ------------ .. eql:function:: std::to_int32(s: str, fmt: optional str={}) -> int32 - std::to_int32(val: bytes) -> int32 + std::to_int32(val: bytes, endian: Endian) -> int32 :index: parse int32 @@ -861,20 +869,27 @@ from :eql:type:`str` and :eql:type:`json`. db> select to_int32('1000023%', '9999999%'); {1000023} - The bytes conversion function expects exactly 4 bytes using big-endian - representation. + The bytes conversion function expects exactly 4 bytes with the specified + endianness. .. code-block:: edgeql-repl - db> select to_int32(b'\x01\x02\x00\x07'); + db> select to_int32(b'\x01\x02\x00\x07', Endian.Big); + {16908295} + db> select to_int32(b'\x07\x00\x02\x01', Endian.Little); {16908295} + .. note:: + + Due to underlying implementation details, using big-endian encoding + results in slightly faster performance of ``to_int32``. + ------------ .. eql:function:: std::to_int64(s: str, fmt: optional str={}) -> int64 - std::to_int64(val: bytes) -> int64 + std::to_int64(val: bytes, endian: Endian) -> int64 :index: parse int64 @@ -891,14 +906,23 @@ from :eql:type:`str` and :eql:type:`json`. db> select to_int64('10000234567%', '99999999999%'); {10000234567} - The bytes conversion function expects exactly 8 bytes using big-endian - representation. + The bytes conversion function expects exactly 8 bytes with the specified + endianness. .. 
code-block:: edgeql-repl - db> select to_int64(b'\x01\x02\x00\x07\x11\x22\x33\x44'); + db> select to_int64(b'\x01\x02\x00\x07\x11\x22\x33\x44', + ... Endian.Big); + {72620574343574340} + db> select to_int64(b'\x44\x33\x22\x11\x07\x00\x02\x01', + ... Endian.Little); {72620574343574340} + .. note:: + + Due to underlying implementation details, using big-endian encoding + results in slightly faster performance of ``to_int64``. + ------------ diff --git a/edb/buildmeta.py b/edb/buildmeta.py index 19f3cf9d827..866a65e5363 100644 --- a/edb/buildmeta.py +++ b/edb/buildmeta.py @@ -44,7 +44,7 @@ # Increment this whenever the database layout or stdlib changes. -EDGEDB_CATALOG_VERSION = 2024_02_16_14_00 +EDGEDB_CATALOG_VERSION = 2024_02_20_00_00 EDGEDB_MAJOR_VERSION = 5 diff --git a/edb/lib/std/70-converters.edgeql b/edb/lib/std/70-converters.edgeql index 9477fca6fb7..a23a5baa396 100644 --- a/edb/lib/std/70-converters.edgeql +++ b/edb/lib/std/70-converters.edgeql @@ -270,38 +270,77 @@ std::to_bytes(s: std::str) -> std::bytes { }; +CREATE SCALAR TYPE +std::Endian EXTENDING enum; + + CREATE FUNCTION -std::to_bytes(val: std::int16) -> std::bytes +std::to_bytes(val: std::int16, endian: std::Endian) -> std::bytes { CREATE ANNOTATION std::description := - 'Convert an int16 to big-endian binary format'; + 'Convert an int16 using specified endian binary format.'; SET volatility := 'Immutable'; USING SQL $$ - SELECT int2send(val); + SELECT + CASE WHEN (endian = 'Little') THEN + substring(bin, 2, 1) + || substring(bin, 1, 1) + ELSE + bin + END + FROM ( + SELECT int2send(val) AS bin + ) AS t; $$; }; CREATE FUNCTION -std::to_bytes(val: std::int32) -> std::bytes +std::to_bytes(val: std::int32, endian: std::Endian) -> std::bytes { CREATE ANNOTATION std::description := - 'Convert an int32 to big-endian binary format'; + 'Convert an int32 using specified endian binary format.'; SET volatility := 'Immutable'; USING SQL $$ - SELECT int4send(val); + SELECT + CASE WHEN (endian = 'Little') THEN + 
substring(bin, 4, 1) + || substring(bin, 3, 1) + || substring(bin, 2, 1) + || substring(bin, 1, 1) + ELSE + bin + END + FROM ( + SELECT int4send(val) AS bin + ) AS t; $$; }; CREATE FUNCTION -std::to_bytes(val: std::int64) -> std::bytes +std::to_bytes(val: std::int64, endian: std::Endian) -> std::bytes { CREATE ANNOTATION std::description := - 'Convert an int64 to big-endian binary format'; + 'Convert an int64 using specified endian binary format.'; SET volatility := 'Immutable'; USING SQL $$ - SELECT int8send(val); + SELECT + CASE WHEN (endian = 'Little') THEN + substring(bin, 8, 1) + || substring(bin, 7, 1) + || substring(bin, 6, 1) + || substring(bin, 5, 1) + || substring(bin, 4, 1) + || substring(bin, 3, 1) + || substring(bin, 2, 1) + || substring(bin, 1, 1) + ELSE + bin + END + FROM ( + SELECT int8send(val) AS bin + ) AS t; $$; }; @@ -310,7 +349,7 @@ CREATE FUNCTION std::to_bytes(val: std::uuid) -> std::bytes { CREATE ANNOTATION std::description := - 'Convert an UUID to binary format'; + 'Convert an UUID to binary format.'; SET volatility := 'Immutable'; USING SQL $$ SELECT uuid_send(val); @@ -526,15 +565,33 @@ std::to_int64(s: std::str, fmt: OPTIONAL str={}) -> std::int64 CREATE FUNCTION -std::to_int64(val: std::bytes) -> std::int64 +std::to_int64(val: std::bytes, endian: std::Endian) -> std::int64 { CREATE ANNOTATION std::description := - 'Convert bytes into `int64` value using big-endian format.'; + 'Convert bytes into `int64` value.'; SET volatility := 'Immutable'; USING SQL $$ SELECT CASE WHEN (length(val) = 8) THEN - ('x' || right(val::bytea::text, 16))::bit(64)::bigint + ( + 'x' + || right( + ( + CASE WHEN (endian = 'Little') THEN + substring(val, 8, 1) + || substring(val, 7, 1) + || substring(val, 6, 1) + || substring(val, 5, 1) + || substring(val, 4, 1) + || substring(val, 3, 1) + || substring(val, 2, 1) + || substring(val, 1, 1) + ELSE + val + END + )::text, 16 + ) + )::bit(64)::int8 ELSE edgedb.raise( 0::int8, @@ -578,15 +635,29 @@ std::to_int32(s: 
std::str, fmt: OPTIONAL str={}) -> std::int32 CREATE FUNCTION -std::to_int32(val: std::bytes) -> std::int32 +std::to_int32(val: std::bytes, endian: std::Endian) -> std::int32 { CREATE ANNOTATION std::description := - 'Convert bytes into `int32` value using big-endian format.'; + 'Convert bytes into `int32` value.'; SET volatility := 'Immutable'; USING SQL $$ SELECT CASE WHEN (length(val) = 4) THEN - ('x' || right(val::bytea::text, 8))::bit(32)::int + ( + 'x' + || right( + ( + CASE WHEN (endian = 'Little') THEN + substring(val, 4, 1) + || substring(val, 3, 1) + || substring(val, 2, 1) + || substring(val, 1, 1) + ELSE + val + END + )::text, 8 + ) + )::bit(32)::int4 ELSE edgedb.raise( 0::int4, @@ -630,15 +701,39 @@ std::to_int16(s: std::str, fmt: OPTIONAL str={}) -> std::int16 CREATE FUNCTION -std::to_int16(val: std::bytes) -> std::int16 +std::to_int16(val: std::bytes, endian: std::Endian) -> std::int16 { CREATE ANNOTATION std::description := - 'Convert bytes into `int16` value using big-endian format.'; + 'Convert bytes into `int16` value.'; SET volatility := 'Immutable'; + # There is no direct cast from bits to int2 in Postgres, so we need to use + # the bit(32)::int4 as an intermediary value. However, the first bit is + # the sign bit and must be preserved as such, otherwise we will have + # overflow when casting from int4 to int2. So we pad the bytes with 0 on + # the right (which happens by default when casting 2 bytes from text to + # bit(32)) and then right-shift preserving the sign bit. This results in + # the int4 value in the lower two bytes being fully compatible with int2 + # value. 
USING SQL $$ SELECT CASE WHEN (length(val) = 2) THEN - ('x' || right(val::bytea::text, 4))::bit(16)::int::smallint + ( + ( + ( + 'x' + || right( + ( + CASE WHEN (endian = 'Little') THEN + substring(val, 2, 1) + || substring(val, 1, 1) + ELSE + val + END + )::text, 4 + ) + )::bit(32)::int4 + )>>16 + )::int2 ELSE edgedb.raise( 0::int2, diff --git a/tests/test_edgeql_functions.py b/tests/test_edgeql_functions.py index 9e8afe778ac..f8977d2521b 100644 --- a/tests/test_edgeql_functions.py +++ b/tests/test_edgeql_functions.py @@ -3119,26 +3119,32 @@ async def test_edgeql_functions_string_bytes_conversion_error(self): async def test_edgeql_functions_int_bytes_conversion_01(self): # Make sure we can convert the bytes to ints and back - twobytes = b'\x7f\x3a' - for numbytes in [2, 4, 8]: - raw = twobytes * (numbytes // 2) - typename = f'int{numbytes * 8}' - await self.assert_query_result( - f''' - WITH - val := <{typename}>$val, - bin := $bin, - SELECT ( - val = to_{typename}(bin), - bin = to_bytes(val), + for num in range(256): + byte = num.to_bytes() + for numbytes in [2, 4, 8]: + raw = byte * numbytes + typename = f'int{numbytes * 8}' + await self.assert_query_result( + f''' + WITH + val_b := <{typename}>$val_b, + val_l := <{typename}>$val_l, + bin := $bin, + SELECT ( + val_b = to_{typename}(bin, Endian.Big), + val_l = to_{typename}(bin, Endian.Little), + bin = to_bytes(val_b, Endian.Big), + bin = to_bytes(val_l, Endian.Little), + ) + ''', + {(True, True, True, True)}, + variables={ + "val_b": int.from_bytes(raw, 'big', signed=True), + "val_l": int.from_bytes(raw, 'little', signed=True), + "bin": raw, + }, + msg=f'Failed to convert {raw!r} to int or vice versa' ) - ''', - {(True, True)}, - variables={ - "val": int.from_bytes(raw, 'big'), - "bin": raw, - }, - ) async def test_edgeql_functions_int_bytes_conversion_02(self): with self.assertRaisesRegex( @@ -3148,7 +3154,7 @@ async def test_edgeql_functions_int_bytes_conversion_02(self): async with self.con.transaction(): 
await self.con.execute( r''' - SELECT to_int16(b'\x01') + SELECT to_int16(b'\x01', Endian.Big) ''', ) @@ -3159,7 +3165,10 @@ async def test_edgeql_functions_int_bytes_conversion_02(self): async with self.con.transaction(): await self.con.execute( r''' - SELECT to_int16(to_bytes(123)) + SELECT to_int16( + to_bytes(123, Endian.Big), + Endian.Big, + ) ''', ) @@ -3171,7 +3180,10 @@ async def test_edgeql_functions_int_bytes_conversion_03(self): async with self.con.transaction(): await self.con.execute( r''' - SELECT to_int32(to_bytes(23)) + SELECT to_int32( + to_bytes(23, Endian.Big), + Endian.Big, + ) ''', ) @@ -3182,7 +3194,10 @@ async def test_edgeql_functions_int_bytes_conversion_03(self): async with self.con.transaction(): await self.con.execute( r''' - SELECT to_int32(to_bytes(16908295)) + SELECT to_int32( + to_bytes(16908295, Endian.Big), + Endian.Big, + ) ''', ) @@ -3194,7 +3209,10 @@ async def test_edgeql_functions_int_bytes_conversion_04(self): async with self.con.transaction(): await self.con.execute( r''' - SELECT to_int64(to_bytes(23)) + SELECT to_int64( + to_bytes(23, Endian.Big), + Endian.Big, + ) ''', ) @@ -3205,7 +3223,10 @@ async def test_edgeql_functions_int_bytes_conversion_04(self): async with self.con.transaction(): await self.con.execute( r''' - SELECT to_int64(b'\x00\x00' ++ to_bytes(62620574343574340)) + SELECT to_int64( + b'\x00' ++ to_bytes(62620574343574340, Endian.Big), + Endian.Big, + ) ''', )