From fa0d58bb57de03671a47a04ad5f97b52374fa3a5 Mon Sep 17 00:00:00 2001
From: KtorZ <matthias.benkort@gmail.com>
Date: Tue, 15 Oct 2024 22:43:09 +0200
Subject: [PATCH] experiment with: aiken/cbor.{deserialise}

---
 CHANGELOG.md           |   1 +
 aiken.lock             |   2 +-
 lib/aiken/cbor.ak      | 399 ++++++++++++++++++++++++++---------------
 lib/aiken/cbor.test.ak | 268 +++++++++++++++++++++++++++
 4 files changed, 526 insertions(+), 144 deletions(-)
 create mode 100644 lib/aiken/cbor.test.ak

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 01dec0c..950cb4b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### Added
 
+- [`aiken/cbor.{deserialise}`](https://aiken-lang.github.io/stdlib/aiken/cbor.html#deserialise): to recover `Data` from CBOR bytes.
 - [`aiken/collection/pairs.{insert_with_by_ascending_key}`](https://aiken-lang.github.io/stdlib/aiken/collection/pairs.html#insert_with_by_ascending_key): for inserting in pairs while specifying how to combine values on key conflict.
 
 ## v2.1.0 - 2024-09-14
diff --git a/aiken.lock b/aiken.lock
index 0d2932a..b7a0d45 100644
--- a/aiken.lock
+++ b/aiken.lock
@@ -13,4 +13,4 @@ requirements = []
 source = "github"
 
 [etags]
-"aiken-lang/fuzz@v2" = [{ secs_since_epoch = 1727870812, nanos_since_epoch = 978021000 }, "64a32283418d58cade34059d3855b857e84505541158c541c460cafa0d355475"]
+"aiken-lang/fuzz@v2" = [{ secs_since_epoch = 1729023353, nanos_since_epoch = 135235000 }, "64a32283418d58cade34059d3855b857e84505541158c541c460cafa0d355475"]
diff --git a/lib/aiken/cbor.ak b/lib/aiken/cbor.ak
index 693a047..d6ffa87 100644
--- a/lib/aiken/cbor.ak
+++ b/lib/aiken/cbor.ak
@@ -1,66 +1,6 @@
 use aiken
 use aiken/builtin.{decode_utf8, serialise_data}
-
-/// Serialise any value to binary, encoding using [CBOR](https://www.rfc-editor.org/rfc/rfc8949).
-///
-/// This is particularly useful in combination with hashing functions, as a way
-/// to obtain a byte representation that matches the serialised representation
-/// used by the ledger in the context of on-chain code.
-///
-/// Note that the output matches the output of [`diagnostic`](#diagnostic),
-/// though with a different encoding. [`diagnostic`](#diagnostic) is merely a
-/// textual representation of the CBOR encoding that is human friendly and
-/// useful for debugging.
-///
-/// ```aiken
-/// cbor.serialise(42) == #"182a"
-/// cbor.serialise(#"a1b2") == #"42a1b2"
-/// cbor.serialise([]) == #"80"
-/// cbor.serialise((1, 2)) == #"9f0102ff"
-/// cbor.serialise((1, #"ff", 3)) == #"9f0141ff03ff"
-/// cbor.serialise([(1, #"ff")]) == #"a10141ff"
-/// cbor.serialise(Some(42)) == #"d8799f182aff"
-/// cbor.serialise(None) == #"d87a80"
-/// ```
-pub fn serialise(self: Data) -> ByteArray {
-  serialise_data(self)
-}
-
-test serialise_1() {
-  serialise(42) == #"182a"
-}
-
-test serialise_2() {
-  serialise(#"a1b2") == #"42a1b2"
-}
-
-test serialise_3() {
-  serialise([]) == #"80"
-}
-
-test serialise_4() {
-  serialise((1, 2)) == #"9f0102ff"
-}
-
-test serialise_5() {
-  serialise((1, #"ff", 3)) == #"9f0141ff03ff"
-}
-
-test serialise_6() {
-  serialise([(1, #"ff")]) == #"9f9f0141ffffff"
-}
-
-test serialise_7() {
-  serialise(Some(42)) == #"d8799f182aff"
-}
-
-test serialise_8() {
-  serialise(None) == #"d87a80"
-}
-
-test serialise_9() {
-  serialise([Pair(1, #"ff")]) == #"a10141ff"
-}
+use aiken/primitive/bytearray
 
 /// Obtain a String representation of _anything_. This is particularly (and only) useful for tracing
 /// and debugging. This function is expensive and should not be used in any production code as it
@@ -88,90 +28,263 @@ pub fn diagnostic(self: Data) -> String {
     |> decode_utf8
 }
 
-test diagnostic_1() {
-  diagnostic(42) == @"42"
-}
-
-test diagnostic_2() {
-  diagnostic(#"a1b2") == @"h'A1B2'"
-}
-
-test diagnostic_3() {
-  diagnostic([1, 2, 3]) == @"[_ 1, 2, 3]"
-}
-
-test diagnostic_4() {
-  diagnostic([]) == @"[]"
-}
-
-test diagnostic_5() {
-  diagnostic((1, 2)) == @"[_ 1, 2]"
-}
-
-test diagnostic_6() {
-  diagnostic((1, #"ff", 3)) == @"[_ 1, h'FF', 3]"
-}
-
-test diagnostic_7() {
-  diagnostic([(1, #"ff")]) == @"[_ [_ 1, h'FF']]"
-}
-
-test diagnostic_7_alt() {
-  diagnostic([Pair(1, #"ff")]) == @"{_ 1: h'FF' }"
-}
-
-test diagnostic_8() {
-  diagnostic(Some(42)) == @"121([_ 42])"
-}
-
-test diagnostic_9() {
-  diagnostic(None) == @"122([])"
-}
-
-test diagnostic_10() {
-  let xs: List<(Int, Int)> = []
-  diagnostic(xs) == @"[]"
-}
-
-test diagnostic_10_alt() {
-  let xs: Pairs<Int, Int> = []
-  diagnostic(xs) == @"{}"
-}
-
-type Foo {
-  foo: Bar,
-}
-
-type Bar {
-  A
-  B(Int)
-}
-
-test diagnostic_11() {
-  diagnostic(Foo { foo: A }) == @"121([_ 121([])])"
-}
-
-test diagnostic_12() {
-  diagnostic(Foo { foo: B(42) }) == @"121([_ 122([_ 42])])"
-}
-
-type Baz {
-  a0: Int,
-  b0: ByteArray,
-}
-
-test diagnostic_13() {
-  diagnostic(Baz { a0: 14, b0: #"ff" }) == @"121([_ 14, h'FF'])"
-}
-
-test diagnostic_14() {
-  diagnostic([0]) == @"[_ 0]"
+/// Deserialise a [CBOR](https://www.rfc-editor.org/rfc/rfc8949) Data. This is the reverse operation of [serialise](#serialise).
+/// It returns `None` when the bytes cannot be decoded or are not consumed entirely. In particular, we have the following property:
+///
+/// ```aiken
+/// cbor.deserialise(cbor.serialise(any_data)) == Some(any_data)
+/// ```
+///
+/// > [!CAUTION]
+/// > Unfortunately, this function isn't derived from a builtin primitive. It
+/// > is therefore an order of magnitude more expensive than its counterpart
+/// > and shall be used with care.
+/// >
+/// > In general, one might prefer avoiding deserialisation unless truly necessary.
+/// > Yet, it may come in handy for testing and in rare scenarios.
+pub fn deserialise(bytes: ByteArray) -> Option<Data> {
+  let length = bytearray.length(bytes)
+  // Read the byte under the cursor, then advance the cursor by `offset`.
+  let peek =
+    fn(offset: Int, callback: fn(Byte) -> Decoder<Data>) -> Decoder<Data> {
+      fn(cursor) {
+        if cursor < 0 || cursor >= length {
+          deserialise_failure
+        } else {
+          callback(bytearray.at(bytes, cursor))(cursor + offset)
+        }
+      }
+    }
+  // Read `n` bytes from the cursor; they must all lie within the input.
+  let take =
+    fn(n: Int, callback: fn(ByteArray) -> Decoder<Data>) -> Decoder<Data> {
+      fn(cursor) {
+        if n < 0 || cursor < 0 || cursor + n > length {
+          deserialise_failure
+        } else {
+          callback(builtin.slice_bytearray(cursor, n, bytes))(cursor + n)
+        }
+      }
+    }
+  // A failure carries the cursor -1, which can never be equal to `length`.
+  let Pair(result, consumed) = decode_data(peek, take)(0)
+  // Succeed only when the input was consumed exactly (no trailing bytes).
+  if consumed != length {
+    None
+  } else {
+    Some(result)
+  }
 }
 
-test diagnostic_15() {
-  diagnostic(-42) == @"-42"
+/// Serialise any value to binary, encoding using [CBOR](https://www.rfc-editor.org/rfc/rfc8949).
+///
+/// This is particularly useful in combination with hashing functions, as a way
+/// to obtain a byte representation that matches the serialised representation
+/// used by the ledger in the context of on-chain code.
+///
+/// Note that the output matches the output of [`diagnostic`](#diagnostic),
+/// though with a different encoding. [`diagnostic`](#diagnostic) is merely a
+/// textual representation of the CBOR encoding that is human friendly and
+/// useful for debugging.
+///
+/// ```aiken
+/// cbor.serialise(42) == #"182a"
+/// cbor.serialise(#"a1b2") == #"42a1b2"
+/// cbor.serialise([]) == #"80"
+/// cbor.serialise((1, 2)) == #"9f0102ff"
+/// cbor.serialise((1, #"ff", 3)) == #"9f0141ff03ff"
+/// cbor.serialise([Pair(1, #"ff")]) == #"a10141ff"
+/// cbor.serialise(Some(42)) == #"d8799f182aff"
+/// cbor.serialise(None) == #"d87a80"
+/// ```
+pub fn serialise(self: Data) -> ByteArray {
+  serialise_data(self)
 }
 
-test diagnostic_16() {
-  diagnostic([-1, 0, 1]) == @"[_ -1, 0, 1]"
+type Byte =
+  Int
+
+type Decoder<a> =
+  fn(Int) -> Pair<a, Int>
+
+type Peek<a> =
+  fn(Int, fn(Byte) -> Decoder<a>) -> Decoder<a>
+
+type Take<a> =
+  fn(Int, fn(ByteArray) -> Decoder<a>) -> Decoder<a>
+
+fn return(data: Data) -> Decoder<Data> {
+  fn(cursor) { Pair(data, cursor) }
+}
+
+const deserialise_failure: Pair<Data, Int> = {
+    let empty: Data = ""
+    Pair(empty, -1)
+  }
+
+const token_begin_bytes = 0x5f
+
+const token_begin_list = 0x9f
+
+const token_begin_map = 0xbf
+
+const token_break = 0xff
+
+fn decode_data(peek: Peek<Data>, take: Take<Data>) -> Decoder<Data> {
+  // Decode one item, dispatching on the major type (initial byte / 32).
+  let next <- peek(1)
+  let major_type = next / 32
+  if major_type <= 2 {
+    if major_type == 0 {
+      let i <- decode_uint(peek, take, next)
+      return(builtin.i_data(i))
+    } else if major_type == 1 {
+      let i <- decode_uint(peek, take, next - 32)
+      return(builtin.i_data(-i - 1))
+    } else {
+      if next == token_begin_bytes {
+        let b <- decode_chunks(peek, take)
+        return(builtin.b_data(b))
+      } else {
+        let b <- decode_bytes(peek, take, next - 64)
+        return(builtin.b_data(b))
+      }
+    }
+  } else if major_type == 6 {
+    let tag <- decode_uint(peek, take, next - 192)
+    let next <- peek(1)
+    // Constructors use tags 121..127 and 1280..1400, followed by a list.
+    let is_constr = tag >= 121 && tag <= 127 || tag >= 1280 && tag <= 1400
+    let ix =
+      if tag >= 1280 {
+        tag - 1280 + 7
+      } else {
+        tag - 121
+      }
+    if !is_constr || next / 32 != 4 {
+      fn(_) { deserialise_failure }
+    } else if next == token_begin_list {
+      let fields <- decode_indefinite(peek, take, decode_data)
+      return(builtin.constr_data(ix, fields))
+    } else {
+      let size <- decode_uint(peek, take, next - 128)
+      let fields <- decode_definite(peek, take, decode_data, size)
+      return(builtin.constr_data(ix, fields))
+    }
+  } else if major_type == 4 {
+    if next == token_begin_list {
+      let xs <- decode_indefinite(peek, take, decode_data)
+      return(builtin.list_data(xs))
+    } else {
+      let size <- decode_uint(peek, take, next - 128)
+      let xs <- decode_definite(peek, take, decode_data, size)
+      return(builtin.list_data(xs))
+    }
+  } else if major_type == 5 {
+    if next == token_begin_map {
+      let xs <- decode_indefinite(peek, take, decode_pair)
+      return(builtin.map_data(xs))
+    } else {
+      let size <- decode_uint(peek, take, next - 160)
+      let xs <- decode_definite(peek, take, decode_pair, size)
+      return(builtin.map_data(xs))
+    }
+  } else {
+    fn(_) { deserialise_failure }
+  }
+}
+
+fn decode_pair(peek: Peek<Data>, take: Take<Data>) -> Decoder<Pair<Data, Data>> {
+  fn(start) {
+    let Pair(key, after_key) = decode_data(peek, take)(start)
+    let Pair(value, after_value) = decode_data(peek, take)(after_key)
+    Pair(Pair(key, value), after_value)
+  }
+}
+
+fn decode_uint(
+  peek: Peek<Data>,
+  take: Take<Data>,
+  header: Int,
+  and_then: fn(Int) -> Decoder<Data>,
+) -> Decoder<Data> {
+  // 0..23 is the value itself; 24..27 announce a 1, 2, 4 or 8 bytes payload.
+  if header < 0 || header > 27 {
+    fn(_) { deserialise_failure }
+  } else if header < 24 {
+    and_then(header)
+  } else if header == 24 {
+    peek(1, and_then)
+  } else {
+    let width = bytearray.at(#[2, 4, 8], header - 25)
+    let payload <- take(width)
+    and_then(bytearray.to_int_big_endian(payload))
+  }
+}
+
+fn decode_bytes(
+  peek: Peek<Data>,
+  take: Take<Data>,
+  header: Int,
+  and_then: fn(ByteArray) -> Decoder<Data>,
+) -> Decoder<Data> {
+  // Resolve the length announced by the header, then read that many bytes.
+  let size <- decode_uint(peek, take, header)
+  take(size, and_then)
+}
+
+fn decode_chunks(
+  peek: Peek<Data>,
+  take: Take<Data>,
+  and_then: fn(ByteArray) -> Decoder<Data>,
+) -> Decoder<Data> {
+  // Consume one byte: either the break marker or the header of a chunk.
+  let next <- peek(1)
+  if next == token_break {
+    and_then("")
+  } else {
+    let chunk <- decode_bytes(peek, take, next - 64)
+    let rest <- decode_chunks(peek, take)
+    and_then(builtin.append_bytearray(chunk, rest))
+  }
+}
+
+fn decode_definite(
+  peek: Peek<Data>,
+  take: Take<Data>,
+  decode_one: fn(Peek<Data>, Take<Data>) -> Decoder<a>,
+  size: Int,
+  and_then: fn(List<a>) -> Decoder<Data>,
+) -> Decoder<Data> {
+  if size > 0 {
+    fn(cursor) {
+      // Decode the first element, then the `size - 1` remaining ones from
+      // where it ended, and hand the complete list over to `and_then`.
+      let Pair(head, after_head) = decode_one(peek, take)(cursor)
+      let with_tail = fn(tail) { and_then([head, ..tail]) }
+      decode_definite(peek, take, decode_one, size - 1, with_tail)(after_head)
+    }
+  } else {
+    and_then([])
+  }
+}
+
+fn decode_indefinite(
+  peek: Peek<Data>,
+  take: Take<Data>,
+  decode_one: fn(Peek<Data>, Take<Data>) -> Decoder<a>,
+  and_then: fn(List<a>) -> Decoder<Data>,
+) -> Decoder<Data> {
+  // Look at the next byte without consuming it: it is either the break
+  // marker (skipped below) or the first byte of one more element.
+  let marker <- peek(0)
+  if marker == token_break {
+    fn(cursor) { and_then([])(cursor + 1) }
+  } else {
+    fn(cursor) {
+      let Pair(head, after_head) = decode_one(peek, take)(cursor)
+      let with_tail = fn(tail) { and_then([head, ..tail]) }
+      decode_indefinite(peek, take, decode_one, with_tail)(after_head)
+    }
+  }
 }
diff --git a/lib/aiken/cbor.test.ak b/lib/aiken/cbor.test.ak
new file mode 100644
index 0000000..96bb958
--- /dev/null
+++ b/lib/aiken/cbor.test.ak
@@ -0,0 +1,268 @@
+use aiken/cbor.{deserialise, diagnostic, serialise}
+use aiken/fuzz
+
+// ------------------------------------------------------------------ diagnostic
+
+test diagnostic_1() {
+  diagnostic(42) == @"42"
+}
+
+test diagnostic_2() {
+  diagnostic(#"a1b2") == @"h'A1B2'"
+}
+
+test diagnostic_3() {
+  diagnostic([1, 2, 3]) == @"[_ 1, 2, 3]"
+}
+
+test diagnostic_4() {
+  diagnostic([]) == @"[]"
+}
+
+test diagnostic_5() {
+  diagnostic((1, 2)) == @"[_ 1, 2]"
+}
+
+test diagnostic_6() {
+  diagnostic((1, #"ff", 3)) == @"[_ 1, h'FF', 3]"
+}
+
+test diagnostic_7() {
+  diagnostic([(1, #"ff")]) == @"[_ [_ 1, h'FF']]"
+}
+
+test diagnostic_7_alt() {
+  diagnostic([Pair(1, #"ff")]) == @"{_ 1: h'FF' }"
+}
+
+test diagnostic_8() {
+  diagnostic(Some(42)) == @"121([_ 42])"
+}
+
+test diagnostic_9() {
+  diagnostic(None) == @"122([])"
+}
+
+test diagnostic_10() {
+  let xs: List<(Int, Int)> = []
+  diagnostic(xs) == @"[]"
+}
+
+test diagnostic_10_alt() {
+  let xs: Pairs<Int, Int> = []
+  diagnostic(xs) == @"{}"
+}
+
+type Foo {
+  foo: Bar,
+}
+
+type Bar {
+  A
+  B(Int)
+}
+
+test diagnostic_11() {
+  diagnostic(Foo { foo: A }) == @"121([_ 121([])])"
+}
+
+test diagnostic_12() {
+  diagnostic(Foo { foo: B(42) }) == @"121([_ 122([_ 42])])"
+}
+
+type Baz {
+  a0: Int,
+  b0: ByteArray,
+}
+
+test diagnostic_13() {
+  diagnostic(Baz { a0: 14, b0: #"ff" }) == @"121([_ 14, h'FF'])"
+}
+
+test diagnostic_14() {
+  diagnostic([0]) == @"[_ 0]"
+}
+
+test diagnostic_15() {
+  diagnostic(-42) == @"-42"
+}
+
+test diagnostic_16() {
+  diagnostic([-1, 0, 1]) == @"[_ -1, 0, 1]"
+}
+
+// ------------------------------------------------------------------ serialise
+
+test serialise_1() {
+  serialise(42) == #"182a"
+}
+
+test serialise_2() {
+  serialise(#"a1b2") == #"42a1b2"
+}
+
+test serialise_3() {
+  serialise([]) == #"80"
+}
+
+test serialise_4() {
+  serialise((1, 2)) == #"9f0102ff"
+}
+
+test serialise_5() {
+  serialise((1, #"ff", 3)) == #"9f0141ff03ff"
+}
+
+test serialise_6() {
+  serialise([(1, #"ff")]) == #"9f9f0141ffffff"
+}
+
+test serialise_7() {
+  serialise(Some(42)) == #"d8799f182aff"
+}
+
+test serialise_8() {
+  serialise(None) == #"d87a80"
+}
+
+test serialise_9() {
+  serialise([Pair(1, #"ff")]) == #"a10141ff"
+}
+
+// ------------------------------------------------------------------ deserialise
+
+type AnyData {
+  AnyInt(Int)
+  AnyByteArray(ByteArray)
+  AnyList(List<Int>)
+  AnyPairs(Pairs<ByteArray, Int>)
+  AnyUnaryConstr0(UnaryConstr0)
+  AnyUnaryConstr1(UnaryConstr1)
+  AnyUnaryConstr2(UnaryConstr2)
+  AnyBinaryConstr0(BinaryConstr0)
+  AnyBinaryConstr1(BinaryConstr1)
+}
+
+type UnaryConstr0 {
+  UnaryConstr0
+}
+
+type UnaryConstr1 {
+  field0: String,
+}
+
+type UnaryConstr2 {
+  field0: Int,
+  field1: List<List<ByteArray>>,
+}
+
+type BinaryConstr0 =
+  Bool
+
+type BinaryConstr1 =
+  Option<Int>
+
+fn any_pair(any_key: Fuzzer<k>, any_value: Fuzzer<v>) -> Fuzzer<Pair<k, v>> {
+  let k <- fuzz.and_then(any_key)
+  let v <- fuzz.map(any_value)
+  Pair(k, v)
+}
+
+fn any_data() -> Fuzzer<AnyData> {
+  fuzz.either6(
+    {
+      let i <- fuzz.map(fuzz.int())
+      AnyInt(i)
+    },
+    {
+      let bs <- fuzz.map(fuzz.bytearray())
+      AnyByteArray(bs)
+    },
+    {
+      let xs <- fuzz.map(fuzz.list(fuzz.int()))
+      AnyList(xs)
+    },
+    {
+      let ps <- fuzz.map(fuzz.list(any_pair(fuzz.bytearray(), fuzz.int())))
+      AnyPairs(ps)
+    },
+    fuzz.either3(
+      fuzz.constant(AnyUnaryConstr0(UnaryConstr0)),
+      fuzz.constant(AnyUnaryConstr1(UnaryConstr1(@"lorem ipsum"))),
+      {
+        let i <- fuzz.and_then(fuzz.int())
+        let xs <- fuzz.map(fuzz.list(fuzz.list(fuzz.bytearray())))
+        AnyUnaryConstr2(UnaryConstr2(i, xs))
+      },
+    ),
+    fuzz.either(
+      {
+        let b <- fuzz.map(fuzz.bool())
+        AnyBinaryConstr0(b)
+      },
+      {
+        let o <- fuzz.map(fuzz.option(fuzz.int()))
+        AnyBinaryConstr1(o)
+      },
+    ),
+  )
+}
+
+test prop_deserialise_any_data(any via any_data()) {
+  when any is {
+    AnyInt(i) -> {
+      fuzz.label(@"Int")
+      expect Some(data) = deserialise(serialise(i))
+      expect i_decoded: Int = data
+      i_decoded == i
+    }
+    AnyByteArray(bs) -> {
+      fuzz.label(@"ByteArray")
+      expect Some(data) = deserialise(serialise(bs))
+      expect bs_decoded: ByteArray = data
+      bs_decoded == bs
+    }
+    AnyList(xs) -> {
+      fuzz.label(@"List")
+      expect Some(data) = deserialise(serialise(xs))
+      expect xs_decoded: List<Int> = data
+      xs_decoded == xs
+    }
+    AnyPairs(ps) -> {
+      fuzz.label(@"Pairs")
+      expect Some(data) = deserialise(serialise(ps))
+      expect ps_decoded: Pairs<ByteArray, Int> = data
+      ps_decoded == ps
+    }
+    AnyUnaryConstr0(constr) -> {
+      fuzz.label(@"(unary) Constr")
+      expect Some(data) = deserialise(serialise(constr))
+      expect constr_decoded: UnaryConstr0 = data
+      constr_decoded == constr
+    }
+    AnyUnaryConstr1(constr) -> {
+      fuzz.label(@"(unary) Constr")
+      expect Some(data) = deserialise(serialise(constr))
+      expect constr_decoded: UnaryConstr1 = data
+      constr_decoded == constr
+    }
+    AnyUnaryConstr2(constr) -> {
+      fuzz.label(@"(unary) Constr")
+      expect Some(data) = deserialise(serialise(constr))
+      expect constr_decoded: UnaryConstr2 = data
+      constr_decoded == constr
+    }
+    AnyBinaryConstr0(constr) -> {
+      fuzz.label(@"(binary) Constr")
+      expect Some(data) = deserialise(serialise(constr))
+      expect constr_decoded: BinaryConstr0 = data
+      constr_decoded == constr
+    }
+    AnyBinaryConstr1(constr) -> {
+      fuzz.label(@"(binary) Constr")
+      expect Some(data) = deserialise(serialise(constr))
+      expect constr_decoded: BinaryConstr1 = data
+      constr_decoded == constr
+    }
+  }
+}