From fa0d58bb57de03671a47a04ad5f97b52374fa3a5 Mon Sep 17 00:00:00 2001
From: KtorZ
Date: Tue, 15 Oct 2024 22:43:09 +0200
Subject: [PATCH] experiment with: aiken/cbor.{deserialise}

---
Review notes (text here is ignored by `git am`):

- `take` now allows a zero-width read at the very end of the input, so that
  `deserialise(#"40")` (the empty bytestring) succeeds instead of failing.
- `peek` and `take` refuse negative cursors, so a failure inside a list or map
  element ends in `None` rather than an out-of-bounds `bytearray.at`.
- `take` refuses negative widths, which would move the cursor backwards.
- The `serialise` doc example for `#"a10141ff"` now uses `Pair`, in line with
  the `serialise_6` / `serialise_9` tests.
- Tags other than 102 are still all read as constructor tags.

 CHANGELOG.md           |   1 +
 aiken.lock             |   2 +-
 lib/aiken/cbor.ak      | 413 ++++++++++++++++++++++++++---------------
 lib/aiken/cbor.test.ak | 278 +++++++++++++++++++++++++++
 4 files changed, 550 insertions(+), 144 deletions(-)
 create mode 100644 lib/aiken/cbor.test.ak

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 01dec0c..950cb4b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### Added
 
+- [`aiken/cbor.{deserialise}`](https://aiken-lang.github.io/stdlib/aiken/cbor.html#deserialise): to recover `Data` from CBOR bytes.
 - [`aiken/collection/pairs.{insert_with_by_ascending_key}`](https://aiken-lang.github.io/stdlib/aiken/collection/pairs.html#insert_with_by_ascending_key): for inserting in pairs while specifying how to combine values on key conflict.
 
 ## v2.1.0 - 2024-09-14
diff --git a/aiken.lock b/aiken.lock
index 0d2932a..b7a0d45 100644
--- a/aiken.lock
+++ b/aiken.lock
@@ -13,4 +13,4 @@ requirements = []
 source = "github"
 
 [etags]
-"aiken-lang/fuzz@v2" = [{ secs_since_epoch = 1727870812, nanos_since_epoch = 978021000 }, "64a32283418d58cade34059d3855b857e84505541158c541c460cafa0d355475"]
+"aiken-lang/fuzz@v2" = [{ secs_since_epoch = 1729023353, nanos_since_epoch = 135235000 }, "64a32283418d58cade34059d3855b857e84505541158c541c460cafa0d355475"]
diff --git a/lib/aiken/cbor.ak b/lib/aiken/cbor.ak
index 693a047..d6ffa87 100644
--- a/lib/aiken/cbor.ak
+++ b/lib/aiken/cbor.ak
@@ -1,66 +1,6 @@
 use aiken
 use aiken/builtin.{decode_utf8, serialise_data}
-
-/// Serialise any value to binary, encoding using [CBOR](https://www.rfc-editor.org/rfc/rfc8949).
-///
-/// This is particularly useful in combination with hashing functions, as a way
-/// to obtain a byte representation that matches the serialised representation
-/// used by the ledger in the context of on-chain code.
-///
-/// Note that the output matches the output of [`diagnostic`](#diagnostic),
-/// though with a different encoding. [`diagnostic`](#diagnostic) is merely a
-/// textual representation of the CBOR encoding that is human friendly and
-/// useful for debugging.
-///
-/// ```aiken
-/// cbor.serialise(42) == #"182a"
-/// cbor.serialise(#"a1b2") == #"42a1b2"
-/// cbor.serialise([]) == #"80"
-/// cbor.serialise((1, 2)) == #"9f0102ff"
-/// cbor.serialise((1, #"ff", 3)) == #"9f0141ff03ff"
-/// cbor.serialise([(1, #"ff")]) == #"a10141ff"
-/// cbor.serialise(Some(42)) == #"d8799f182aff"
-/// cbor.serialise(None) == #"d87a80"
-/// ```
-pub fn serialise(self: Data) -> ByteArray {
-  serialise_data(self)
-}
-
-test serialise_1() {
-  serialise(42) == #"182a"
-}
-
-test serialise_2() {
-  serialise(#"a1b2") == #"42a1b2"
-}
-
-test serialise_3() {
-  serialise([]) == #"80"
-}
-
-test serialise_4() {
-  serialise((1, 2)) == #"9f0102ff"
-}
-
-test serialise_5() {
-  serialise((1, #"ff", 3)) == #"9f0141ff03ff"
-}
-
-test serialise_6() {
-  serialise([(1, #"ff")]) == #"9f9f0141ffffff"
-}
-
-test serialise_7() {
-  serialise(Some(42)) == #"d8799f182aff"
-}
-
-test serialise_8() {
-  serialise(None) == #"d87a80"
-}
-
-test serialise_9() {
-  serialise([Pair(1, #"ff")]) == #"a10141ff"
-}
+use aiken/primitive/bytearray
 
 /// Obtain a String representation of _anything_. This is particularly (and only) useful for tracing
 /// and debugging. This function is expensive and should not be used in any production code as it
@@ -88,90 +28,277 @@ pub fn diagnostic(self: Data) -> String {
     |> decode_utf8
 }
 
-test diagnostic_1() {
-  diagnostic(42) == @"42"
-}
-
-test diagnostic_2() {
-  diagnostic(#"a1b2") == @"h'A1B2'"
-}
-
-test diagnostic_3() {
-  diagnostic([1, 2, 3]) == @"[_ 1, 2, 3]"
-}
-
-test diagnostic_4() {
-  diagnostic([]) == @"[]"
-}
-
-test diagnostic_5() {
-  diagnostic((1, 2)) == @"[_ 1, 2]"
-}
-
-test diagnostic_6() {
-  diagnostic((1, #"ff", 3)) == @"[_ 1, h'FF', 3]"
-}
-
-test diagnostic_7() {
-  diagnostic([(1, #"ff")]) == @"[_ [_ 1, h'FF']]"
-}
-
-test diagnostic_7_alt() {
-  diagnostic([Pair(1, #"ff")]) == @"{_ 1: h'FF' }"
-}
-
-test diagnostic_8() {
-  diagnostic(Some(42)) == @"121([_ 42])"
-}
-
-test diagnostic_9() {
-  diagnostic(None) == @"122([])"
-}
-
-test diagnostic_10() {
-  let xs: List<(Int, Int)> = []
-  diagnostic(xs) == @"[]"
-}
-
-test diagnostic_10_alt() {
-  let xs: Pairs<Int, Int> = []
-  diagnostic(xs) == @"{}"
-}
-
-type Foo {
-  foo: Bar,
-}
-
-type Bar {
-  A
-  B(Int)
-}
-
-test diagnostic_11() {
-  diagnostic(Foo { foo: A }) == @"121([_ 121([])])"
-}
-
-test diagnostic_12() {
-  diagnostic(Foo { foo: B(42) }) == @"121([_ 122([_ 42])])"
-}
-
-type Baz {
-  a0: Int,
-  b0: ByteArray,
-}
-
-test diagnostic_13() {
-  diagnostic(Baz { a0: 14, b0: #"ff" }) == @"121([_ 14, h'FF'])"
-}
-
-test diagnostic_14() {
-  diagnostic([0]) == @"[_ 0]"
+/// Deserialise a [CBOR](https://www.rfc-editor.org/rfc/rfc8949) Data. This is the reverse operation of [serialise](#serialise).
+/// In particular, we have the following property:
+///
+/// ```aiken
+/// cbor.deserialise(cbor.serialise(any_data)) == Some(any_data)
+/// ```
+///
+/// > [!CAUTION]
+/// > Unfortunately, this function isn't derived from a builtin primitive. It
+/// > is therefore an order of magnitude more expensive than its counterpart
+/// > and shall be used with care.
+/// >
+/// > In general, one might prefer avoiding deserialisation unless truly necessary.
+/// > Yet, it may come in handy for testing and in rare scenarios.
+///
+/// The result is `None` unless decoding succeeds and consumes `bytes` exactly.
+pub fn deserialise(bytes: ByteArray) -> Option<Data> {
+  let length = bytearray.length(bytes)
+
+  let peek =
+    fn(offset: Int, callback: fn(Byte) -> Decoder<Data>) -> Decoder<Data> {
+      fn(cursor) {
+        // A negative cursor signals an earlier failure; never index with it.
+        if cursor < 0 || cursor >= length {
+          deserialise_failure
+        } else {
+          callback(bytearray.at(bytes, cursor))(cursor + offset)
+        }
+      }
+    }
+
+  let take =
+    fn(n: Int, callback: fn(ByteArray) -> Decoder<Data>) -> Decoder<Data> {
+      fn(cursor) {
+        // Fail after an earlier failure (negative cursor), on a negative
+        // width, or when fewer than `n` bytes remain. Taking 0 bytes at the
+        // very end of the input is fine (e.g. the empty bytestring 0x40).
+        if cursor < 0 || n < 0 || cursor + n > length {
+          deserialise_failure
+        } else {
+          callback(builtin.slice_bytearray(cursor, n, bytes))(cursor + n)
+        }
+      }
+    }
+
+  let Pair(result, consumed) = decode_data(peek, take)(0)
+
+  if consumed != length {
+    None
+  } else {
+    Some(result)
+  }
 }
 
-test diagnostic_15() {
-  diagnostic(-42) == @"-42"
+/// Serialise any value to binary, encoding using [CBOR](https://www.rfc-editor.org/rfc/rfc8949).
+///
+/// This is particularly useful in combination with hashing functions, as a way
+/// to obtain a byte representation that matches the serialised representation
+/// used by the ledger in the context of on-chain code.
+///
+/// Note that the output matches the output of [`diagnostic`](#diagnostic),
+/// though with a different encoding. [`diagnostic`](#diagnostic) is merely a
+/// textual representation of the CBOR encoding that is human friendly and
+/// useful for debugging.
+///
+/// ```aiken
+/// cbor.serialise(42) == #"182a"
+/// cbor.serialise(#"a1b2") == #"42a1b2"
+/// cbor.serialise([]) == #"80"
+/// cbor.serialise((1, 2)) == #"9f0102ff"
+/// cbor.serialise((1, #"ff", 3)) == #"9f0141ff03ff"
+/// cbor.serialise([Pair(1, #"ff")]) == #"a10141ff"
+/// cbor.serialise(Some(42)) == #"d8799f182aff"
+/// cbor.serialise(None) == #"d87a80"
+/// ```
+pub fn serialise(self: Data) -> ByteArray {
+  serialise_data(self)
 }
 
-test diagnostic_16() {
-  diagnostic([-1, 0, 1]) == @"[_ -1, 0, 1]"
+type Byte =
+  Int
+
+type Decoder<a> =
+  fn(Int) -> Pair<a, Int>
+
+type Peek =
+  fn(Int, fn(Byte) -> Decoder<Data>) -> Decoder<Data>
+
+type Take =
+  fn(Int, fn(ByteArray) -> Decoder<Data>) -> Decoder<Data>
+
+fn return(data: Data) -> Decoder<Data> {
+  fn(cursor) { Pair(data, cursor) }
+}
+
+// Failure result: its cursor (-1) never equals the input length.
+const deserialise_failure: Pair<Data, Int> = {
+    let empty: Data = ""
+    Pair(empty, -1)
+  }
+
+const token_begin_bytes = 0x5f
+
+const token_begin_list = 0x9f
+
+const token_begin_map = 0xbf
+
+const token_break = 0xff
+
+// Decode one `Data` item, dispatching on the major type (initial byte / 32).
+fn decode_data(peek: Peek, take: Take) -> Decoder<Data> {
+  let next <- peek(1)
+  let major_type = next / 32
+  if major_type <= 2 {
+    if major_type == 0 {
+      let i <- decode_uint(peek, take, next)
+      return(builtin.i_data(i))
+    } else if major_type == 1 {
+      let i <- decode_uint(peek, take, next - 32)
+      return(builtin.i_data(-i - 1))
+    } else {
+      if next == token_begin_bytes {
+        let b <- decode_chunks(peek, take)
+        return(builtin.b_data(b))
+      } else {
+        let b <- decode_bytes(peek, take, next - 64)
+        return(builtin.b_data(b))
+      }
+    }
+  } else if major_type == 6 {
+    let tag <- decode_uint(peek, take, next - 192)
+    let next <- peek(1)
+    if tag == 102 {
+      fn(_) { deserialise_failure }
+    } else {
+      let ix =
+        if tag >= 1280 {
+          tag - 1280 + 7
+        } else {
+          tag - 121
+        }
+
+      if next == token_begin_list {
+        let fields <- decode_indefinite(peek, take, decode_data)
+        return(builtin.constr_data(ix, fields))
+      } else {
+        let size <- decode_uint(peek, take, next - 128)
+        let fields <- decode_definite(peek, take, decode_data, size)
+        return(builtin.constr_data(ix, fields))
+      }
+    }
+  } else if major_type == 4 {
+    if next == token_begin_list {
+      let xs <- decode_indefinite(peek, take, decode_data)
+      return(builtin.list_data(xs))
+    } else {
+      let size <- decode_uint(peek, take, next - 128)
+      let xs <- decode_definite(peek, take, decode_data, size)
+      return(builtin.list_data(xs))
+    }
+  } else if major_type == 5 {
+    if next == token_begin_map {
+      let xs <- decode_indefinite(peek, take, decode_pair)
+      return(builtin.map_data(xs))
+    } else {
+      let size <- decode_uint(peek, take, next - 160)
+      let xs <- decode_definite(peek, take, decode_pair, size)
+      return(builtin.map_data(xs))
+    }
+  } else {
+    fn(_) { deserialise_failure }
+  }
+}
+
+// Decode two consecutive `Data` items as a key/value pair.
+fn decode_pair(peek: Peek, take: Take) -> Decoder<Pair<Data, Data>> {
+  fn(cursor) {
+    let Pair(k, cursor) = decode_data(peek, take)(cursor)
+    let Pair(v, cursor) = decode_data(peek, take)(cursor)
+    Pair(Pair(k, v), cursor)
+  }
+}
+
+// Decode an unsigned integer: inline if header < 24, else 1/2/4/8 bytes.
+fn decode_uint(
+  peek: Peek,
+  take: Take,
+  header: Int,
+  and_then: fn(Int) -> Decoder<Data>,
+) -> Decoder<Data> {
+  if header < 24 {
+    and_then(header)
+  } else if header == 24 {
+    let payload <- peek(1)
+    and_then(payload)
+  } else if header < 28 {
+    let width = bytearray.at(#[2, 4, 8], header - 25)
+    let payload <- take(width)
+    and_then(bytearray.to_int_big_endian(payload))
+  } else {
+    fn(_) { deserialise_failure }
+  }
+}
+
+// Decode a length (via `decode_uint`) followed by that many raw bytes.
+fn decode_bytes(
+  peek: Peek,
+  take: Take,
+  header: Int,
+  and_then: fn(ByteArray) -> Decoder<Data>,
+) -> Decoder<Data> {
+  let width <- decode_uint(peek, take, header)
+  let bytes <- take(width)
+  and_then(bytes)
+}
+
+// Decode and concatenate bytestring chunks until the break token.
+fn decode_chunks(
+  peek: Peek,
+  take: Take,
+  and_then: fn(ByteArray) -> Decoder<Data>,
+) -> Decoder<Data> {
+  let next <- peek(0)
+  if next == token_break {
+    fn(cursor) { and_then("")(cursor + 1) }
+  } else {
+    let next <- peek(1)
+    let chunk <- decode_bytes(peek, take, next - 64)
+    let chunks <- decode_chunks(peek, take)
+    and_then(builtin.append_bytearray(chunk, chunks))
+  }
+}
+
+// Decode `size` consecutive items using `decode_one`.
+fn decode_definite(
+  peek: Peek,
+  take: Take,
+  decode_one: fn(Peek, Take) -> Decoder<a>,
+  size: Int,
+  and_then: fn(List<a>) -> Decoder<Data>,
+) -> Decoder<Data> {
+  if size <= 0 {
+    and_then([])
+  } else {
+    fn(cursor) {
+      let Pair(elem, cursor) = decode_one(peek, take)(cursor)
+      {
+        let elems <- decode_definite(peek, take, decode_one, size - 1)
+        and_then([elem, ..elems])
+      }(cursor)
+    }
+  }
+}
+
+// Decode items using `decode_one` until the break token.
+fn decode_indefinite(
+  peek: Peek,
+  take: Take,
+  decode_one: fn(Peek, Take) -> Decoder<a>,
+  and_then: fn(List<a>) -> Decoder<Data>,
+) -> Decoder<Data> {
+  let next <- peek(0)
+  if next == token_break {
+    fn(cursor) { and_then([])(cursor + 1) }
+  } else {
+    fn(cursor) {
+      let Pair(elem, cursor) = decode_one(peek, take)(cursor)
+      {
+        let elems <- decode_indefinite(peek, take, decode_one)
+        and_then([elem, ..elems])
+      }(cursor)
+    }
+  }
 }
diff --git a/lib/aiken/cbor.test.ak b/lib/aiken/cbor.test.ak
new file mode 100644
index 0000000..96bb958
--- /dev/null
+++ b/lib/aiken/cbor.test.ak
@@ -0,0 +1,278 @@
+use aiken/cbor.{deserialise, diagnostic, serialise}
+use aiken/fuzz
+
+// ------------------------------------------------------------------ diagnostic
+
+test diagnostic_1() {
+  diagnostic(42) == @"42"
+}
+
+test diagnostic_2() {
+  diagnostic(#"a1b2") == @"h'A1B2'"
+}
+
+test diagnostic_3() {
+  diagnostic([1, 2, 3]) == @"[_ 1, 2, 3]"
+}
+
+test diagnostic_4() {
+  diagnostic([]) == @"[]"
+}
+
+test diagnostic_5() {
+  diagnostic((1, 2)) == @"[_ 1, 2]"
+}
+
+test diagnostic_6() {
+  diagnostic((1, #"ff", 3)) == @"[_ 1, h'FF', 3]"
+}
+
+test diagnostic_7() {
+  diagnostic([(1, #"ff")]) == @"[_ [_ 1, h'FF']]"
+}
+
+test diagnostic_7_alt() {
+  diagnostic([Pair(1, #"ff")]) == @"{_ 1: h'FF' }"
+}
+
+test diagnostic_8() {
+  diagnostic(Some(42)) == @"121([_ 42])"
+}
+
+test diagnostic_9() {
+  diagnostic(None) == @"122([])"
+}
+
+test diagnostic_10() {
+  let xs: List<(Int, Int)> = []
+  diagnostic(xs) == @"[]"
+}
+
+test diagnostic_10_alt() {
+  let xs: Pairs<Int, Int> = []
+  diagnostic(xs) == @"{}"
+}
+
+type Foo {
+  foo: Bar,
+}
+
+type Bar {
+  A
+  B(Int)
+}
+
+test diagnostic_11() {
+  diagnostic(Foo { foo: A }) == @"121([_ 121([])])"
+}
+
+test diagnostic_12() {
+  diagnostic(Foo { foo: B(42) }) == @"121([_ 122([_ 42])])"
+}
+
+type Baz {
+  a0: Int,
+  b0: ByteArray,
+}
+
+test diagnostic_13() {
+  diagnostic(Baz { a0: 14, b0: #"ff" }) == @"121([_ 14, h'FF'])"
+}
+
+test diagnostic_14() {
+  diagnostic([0]) == @"[_ 0]"
+}
+
+test diagnostic_15() {
+  diagnostic(-42) == @"-42"
+}
+
+test diagnostic_16() {
+  diagnostic([-1, 0, 1]) == @"[_ -1, 0, 1]"
+}
+
+// ------------------------------------------------------------------ serialise
+
+test serialise_1() {
+  serialise(42) == #"182a"
+}
+
+test serialise_2() {
+  serialise(#"a1b2") == #"42a1b2"
+}
+
+test serialise_3() {
+  serialise([]) == #"80"
+}
+
+test serialise_4() {
+  serialise((1, 2)) == #"9f0102ff"
+}
+
+test serialise_5() {
+  serialise((1, #"ff", 3)) == #"9f0141ff03ff"
+}
+
+test serialise_6() {
+  serialise([(1, #"ff")]) == #"9f9f0141ffffff"
+}
+
+test serialise_7() {
+  serialise(Some(42)) == #"d8799f182aff"
+}
+
+test serialise_8() {
+  serialise(None) == #"d87a80"
+}
+
+test serialise_9() {
+  serialise([Pair(1, #"ff")]) == #"a10141ff"
+}
+
+// ------------------------------------------------------------------ deserialise
+
+type AnyData {
+  AnyInt(Int)
+  AnyByteArray(ByteArray)
+  AnyList(List<Int>)
+  AnyPairs(Pairs<ByteArray, Int>)
+  AnyUnaryConstr0(UnaryConstr0)
+  AnyUnaryConstr1(UnaryConstr1)
+  AnyUnaryConstr2(UnaryConstr2)
+  AnyBinaryConstr0(BinaryConstr0)
+  AnyBinaryConstr1(BinaryConstr1)
+}
+
+type UnaryConstr0 {
+  UnaryConstr0
+}
+
+type UnaryConstr1 {
+  field0: String,
+}
+
+type UnaryConstr2 {
+  field0: Int,
+  field1: List<List<ByteArray>>,
+}
+
+type BinaryConstr0 =
+  Bool
+
+type BinaryConstr1 =
+  Option<Int>
+
+fn any_pair(any_key: Fuzzer<k>, any_value: Fuzzer<v>) -> Fuzzer<Pair<k, v>> {
+  let k <- fuzz.and_then(any_key)
+  let v <- fuzz.map(any_value)
+  Pair(k, v)
+}
+
+fn any_data() -> Fuzzer<AnyData> {
+  fuzz.either6(
+    {
+      let i <- fuzz.map(fuzz.int())
+      AnyInt(i)
+    },
+    {
+      let bs <- fuzz.map(fuzz.bytearray())
+      AnyByteArray(bs)
+    },
+    {
+      let xs <- fuzz.map(fuzz.list(fuzz.int()))
+      AnyList(xs)
+    },
+    {
+      let ps <- fuzz.map(fuzz.list(any_pair(fuzz.bytearray(), fuzz.int())))
+      AnyPairs(ps)
+    },
+    fuzz.either3(
+      fuzz.constant(AnyUnaryConstr0(UnaryConstr0)),
+      fuzz.constant(AnyUnaryConstr1(UnaryConstr1(@"lorem ipsum"))),
+      {
+        let i <- fuzz.and_then(fuzz.int())
+        let xs <- fuzz.map(fuzz.list(fuzz.list(fuzz.bytearray())))
+        AnyUnaryConstr2(UnaryConstr2(i, xs))
+      },
+    ),
+    fuzz.either(
+      {
+        let b <- fuzz.map(fuzz.bool())
+        AnyBinaryConstr0(b)
+      },
+      {
+        let o <- fuzz.map(fuzz.option(fuzz.int()))
+        AnyBinaryConstr1(o)
+      },
+    ),
+  )
+}
+
+test prop_deserialise_any_data(any via any_data()) {
+  when any is {
+    AnyInt(i) -> {
+      fuzz.label(@"Int")
+      expect Some(data) = deserialise(serialise(i))
+      expect i_decoded: Int = data
+      i_decoded == i
+    }
+    AnyByteArray(bs) -> {
+      fuzz.label(@"ByteArray")
+      expect Some(data) = deserialise(serialise(bs))
+      expect bs_decoded: ByteArray = data
+      bs_decoded == bs
+    }
+    AnyList(xs) -> {
+      fuzz.label(@"List")
+      expect Some(data) = deserialise(serialise(xs))
+      expect xs_decoded: List<Int> = data
+      xs_decoded == xs
+    }
+    AnyPairs(ps) -> {
+      fuzz.label(@"Pairs")
+      expect Some(data) = deserialise(serialise(ps))
+      expect ps_decoded: Pairs<ByteArray, Int> = data
+      ps_decoded == ps
+    }
+    AnyUnaryConstr0(constr) -> {
+      fuzz.label(@"(unary) Constr")
+      expect Some(data) = deserialise(serialise(constr))
+      expect constr_decoded: UnaryConstr0 = data
+      constr_decoded == constr
+    }
+    AnyUnaryConstr1(constr) -> {
+      fuzz.label(@"(unary) Constr")
+      expect Some(data) = deserialise(serialise(constr))
+      expect constr_decoded: UnaryConstr1 = data
+      constr_decoded == constr
+    }
+    AnyUnaryConstr2(constr) -> {
+      fuzz.label(@"(unary) Constr")
+      expect Some(data) = deserialise(serialise(constr))
+      expect constr_decoded: UnaryConstr2 = data
+      constr_decoded == constr
+    }
+    AnyBinaryConstr0(constr) -> {
+      fuzz.label(@"(binary) Constr")
+      expect Some(data) = deserialise(serialise(constr))
+      expect constr_decoded: BinaryConstr0 = data
+      constr_decoded == constr
+    }
+    AnyBinaryConstr1(constr) -> {
+      fuzz.label(@"(binary) Constr")
+      expect Some(data) = deserialise(serialise(constr))
+      expect constr_decoded: BinaryConstr1 = data
+      constr_decoded == constr
+    }
+  }
+}
+
+test deserialise_empty_bytearray() {
+  expect Some(data) = deserialise(#"40")
+  expect bs: ByteArray = data
+  bs == ""
+}
+
+test deserialise_unsupported_item() {
+  deserialise(#"82ff00") == None
+}