Skip to content

Commit 892ef2d

Browse files
authored
Adds APIs for accessing encoding of raw stream items (#760)
* Uses the bump allocator to handle text escape processing, allowing `RawSymbolTokenRef` to hold a reference to a `&'bump str` instead of potentially owning a `String`. This change allows the `RawSymbolTokenRef` type to implement `Copy`, which in turn allows all of the `LazyExpandedValue`- and `LazyValue`-related types to also implement `Copy`. * Removes the `RawSymbolToken` type, which is now redundant to the `RawSymbolTokenRef` type. * Adds a `Span` type that provides access to the input bytes that comprised various raw stream items. * Adds a `LazyRawVersionMarker` trait and per-encoding impls that can provide a `Span` upon request. * Adds a `LazyRawField` trait and per-encoding impls that can provide a `Span` upon request. * Adds an `UnexpandedField` type that can represent both raw struct fields and struct fields from a template body. This simplified the code for expanding structs. * Adds methods to convert container types back to the general value type. * Adds `EncodedBinaryValueData_1_0` and `EncodedBinaryAnnotations_1_0` types that can be used to access spans and ranges for the various components of a binary 1.0 value. This patch exposes many functions and types which we likely wish to feature gate, but that change is being left for a future PR.
1 parent b087e7f commit 892ef2d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+2540
-1739
lines changed

examples/write_log_events.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -194,9 +194,9 @@ mod example {
194194
.write(11, event.thread_id)?
195195
.write(12, &event.thread_name)?
196196
// v--- The fixed strings from the log statement are also SIDs
197-
.write(13, RawSymbolToken::SymbolId(17))? // logger name
198-
.write(14, RawSymbolToken::SymbolId(18))? // log level
199-
.write(15, RawSymbolToken::SymbolId(19))? // format
197+
.write(13, RawSymbolTokenRef::SymbolId(17))? // logger name
198+
.write(14, RawSymbolTokenRef::SymbolId(18))? // log level
199+
.write(15, RawSymbolTokenRef::SymbolId(19))? // format
200200
.write(16, &event.parameters)?;
201201
struct_.close()
202202
}

src/lazy/any_encoding.rs

Lines changed: 335 additions & 134 deletions
Large diffs are not rendered by default.

src/lazy/binary/encoded_value.rs

Lines changed: 40 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
use crate::lazy::binary::raw::type_descriptor::Header;
2-
use crate::types::SymbolId;
32
use crate::IonType;
43
use std::ops::Range;
54

@@ -53,33 +52,29 @@ pub(crate) struct EncodedValue<HeaderType: EncodedHeader> {
5352
// and IonType.
5453
pub(crate) header: HeaderType,
5554

56-
// Each encoded value has up to five components, appearing in the following order:
55+
// Each encoded value has up to four components, appearing in the following order:
5756
//
58-
// [ field_id? | annotations? | header (type descriptor) | header_length? | value ]
57+
// [ annotations? | header (type descriptor) | header_length? | value_body ]
5958
//
6059
// Components shown with a `?` are optional.
6160
//
6261
// EncodedValue stores the offset of the type descriptor byte from the beginning of the
6362
// data source (`header_offset`). The lengths of the other fields can be used to calculate
6463
// their positions relative to the type descriptor byte. For example, to find the offset of the
65-
// field ID (if present), we can do:
66-
// header_offset - annotations_header_length - field_id_length
64+
// annotations header (if present), we can do:
65+
// header_offset - annotations_header_length
6766
//
6867
// This allows us to store a single `usize` for the header offset, while other lengths can be
69-
// packed into a `u8`. Values are not permitted to have a field ID or annotations that take
70-
// more than 255 bytes to represent.
68+
// packed into a `u8`. In this implementation, values are not permitted to have annotations that
69+
// take more than 255 bytes to represent.
7170
//
7271
// We store the offset for the header byte because it is guaranteed to be present for all values.
73-
// Field IDs and annotations appear earlier in the stream but are optional.
74-
75-
// The number of bytes used to encode the field ID (if present) preceding the Ion value. If
76-
// `field_id` is undefined, `field_id_length` will be zero.
77-
pub field_id_length: u8,
78-
// If this value is inside a struct, `field_id` will contain the SymbolId that represents
79-
// its field name.
80-
pub field_id: Option<SymbolId>,
72+
// Annotations appear earlier in the stream but are optional.
73+
8174
// The number of bytes used to encode the annotations wrapper (if present) preceding the Ion
82-
// value. If `annotations` is empty, `annotations_header_length` will be zero.
75+
// value. If `annotations` is empty, `annotations_header_length` will be zero. The annotations
76+
// wrapper contains several fields: an opcode, a wrapper length, a sequence length, and the
77+
// sequence itself.
8378
pub annotations_header_length: u8,
8479
// The number of bytes used to encode the series of symbol IDs inside the annotations wrapper.
8580
pub annotations_sequence_length: u8,
@@ -89,9 +84,9 @@ pub(crate) struct EncodedValue<HeaderType: EncodedHeader> {
8984
pub length_length: u8,
9085
// The number of bytes used to encode the value itself, not including the header byte
9186
// or length fields.
92-
pub value_length: usize,
87+
pub value_body_length: usize,
9388
// The sum total of:
94-
// field_id_length + annotations_header_length + header_length + value_length
89+
// annotations_header_length + header_length + value_body_length
9590
// While this can be derived from the above fields, storing it for reuse offers a modest
9691
// optimization. `total_length` is needed when stepping into a value, skipping a value,
9792
// and reading a value's data.
@@ -127,53 +122,27 @@ impl<HeaderType: EncodedHeader> EncodedValue<HeaderType> {
127122
/// If the value can fit in the type descriptor byte (e.g. `true`, `false`, `null`, `0`),
128123
/// this function will return 0.
129124
#[inline(always)]
130-
pub fn value_length(&self) -> usize {
131-
self.value_length
125+
pub fn value_body_length(&self) -> usize {
126+
self.value_body_length
132127
}
133128

134129
/// The offset of the first byte following the header (including length bytes, if present).
135130
/// If `value_length()` returns zero, this offset is actually the first byte of
136131
/// the next encoded value and should not be read.
137-
pub fn value_offset(&self) -> usize {
132+
pub fn value_body_offset(&self) -> usize {
138133
self.header_offset + self.header_length()
139134
}
140135

141136
/// Returns an offset Range containing any bytes following the header.
142-
pub fn value_range(&self) -> Range<usize> {
143-
let start = self.value_offset();
144-
let end = start + self.value_length;
137+
pub fn value_body_range(&self) -> Range<usize> {
138+
let start = self.value_body_offset();
139+
let end = start + self.value_body_length;
145140
start..end
146141
}
147142

148143
/// Returns the index of the first byte that is beyond the end of the current value's encoding.
149144
pub fn value_end_exclusive(&self) -> usize {
150-
self.value_offset() + self.value_length
151-
}
152-
153-
/// Returns the number of bytes used to encode this value's field ID, if present.
154-
pub fn field_id_length(&self) -> Option<usize> {
155-
self.field_id.as_ref()?;
156-
Some(self.field_id_length as usize)
157-
}
158-
159-
/// Returns the offset of the first byte used to encode this value's field ID, if present.
160-
pub fn field_id_offset(&self) -> Option<usize> {
161-
self.field_id.as_ref()?;
162-
Some(
163-
self.header_offset
164-
- self.annotations_header_length as usize
165-
- self.field_id_length as usize,
166-
)
167-
}
168-
169-
/// Returns an offset Range that contains the bytes used to encode this value's field ID,
170-
/// if present.
171-
pub fn field_id_range(&self) -> Option<Range<usize>> {
172-
if let Some(start) = self.field_id_offset() {
173-
let end = start + self.field_id_length as usize;
174-
return Some(start..end);
175-
}
176-
None
145+
self.value_body_offset() + self.value_body_length
177146
}
178147

179148
/// Returns true if this encoded value has an annotations wrapper.
@@ -233,20 +202,28 @@ impl<HeaderType: EncodedHeader> EncodedValue<HeaderType> {
233202
None
234203
}
235204

236-
/// Returns the total number of bytes used to represent the current value, including the
237-
/// field ID (if any), its annotations (if any), its header (type descriptor + length bytes),
238-
/// and its value.
205+
/// Returns the total number of bytes used to represent the current value, including
206+
/// its annotations (if any), its header (type descriptor + length bytes), and the body of
207+
/// the value.
239208
pub fn total_length(&self) -> usize {
240209
self.total_length
241210
}
242211

243212
/// The offset Range (starting from the beginning of the stream) that contains this value's
244-
/// complete encoding, including annotations. (It does not include the leading field ID, if
245-
/// any.)
213+
/// complete encoding, including annotations.
246214
pub fn annotated_value_range(&self) -> Range<usize> {
247-
// [ field_id? | annotations? | header (type descriptor) | header_length? | value ]
215+
// [ annotations? | header (type descriptor) | header_length? | value ]
216+
let start = self.header_offset - self.annotations_header_length as usize;
217+
let end = start + self.total_length;
218+
start..end
219+
}
220+
221+
/// The offset Range (starting from the beginning of the stream) that contains this value's
222+
/// complete encoding, not including any annotations.
223+
pub fn unannotated_value_range(&self) -> Range<usize> {
224+
// [ annotations? | header (type descriptor) | header_length? | value ]
248225
let start = self.header_offset - self.annotations_header_length as usize;
249-
let end = start - self.field_id_length as usize + self.total_length;
226+
let end = start + self.total_length;
250227
start..end
251228
}
252229

@@ -264,20 +241,18 @@ mod tests {
264241

265242
#[test]
266243
fn accessors() -> IonResult<()> {
267-
// 3-byte String with 1-byte annotation and field ID $10
244+
// 3-byte String with 1-byte annotation
268245
let value = EncodedValue {
269246
header: Header {
270247
ion_type: IonType::String,
271248
ion_type_code: IonTypeCode::String,
272249
length_code: 3,
273250
},
274-
field_id_length: 1,
275-
field_id: Some(10),
276251
annotations_header_length: 3,
277252
annotations_sequence_length: 1,
278253
header_offset: 200,
279254
length_length: 0,
280-
value_length: 3,
255+
value_body_length: 3,
281256
total_length: 7,
282257
};
283258
assert_eq!(value.ion_type(), IonType::String);
@@ -292,18 +267,15 @@ mod tests {
292267
assert_eq!(value.header_offset(), 200);
293268
assert_eq!(value.header_length(), 1);
294269
assert_eq!(value.header_range(), 200..201);
295-
assert_eq!(value.field_id_length(), Some(1));
296-
assert_eq!(value.field_id_offset(), Some(196));
297-
assert_eq!(value.field_id_range(), Some(196..197));
298270
assert!(value.has_annotations());
299271
assert_eq!(value.annotations_range(), Some(197..200));
300272
assert_eq!(value.annotations_header_length(), Some(3));
301273
assert_eq!(value.annotations_sequence_offset(), Some(199));
302274
assert_eq!(value.annotations_sequence_length(), Some(1));
303275
assert_eq!(value.annotations_sequence_range(), Some(199..200));
304-
assert_eq!(value.value_length(), 3);
305-
assert_eq!(value.value_offset(), 201);
306-
assert_eq!(value.value_range(), 201..204);
276+
assert_eq!(value.value_body_length(), 3);
277+
assert_eq!(value.value_body_offset(), 201);
278+
assert_eq!(value.value_body_range(), 201..204);
307279
assert_eq!(value.value_end_exclusive(), 204);
308280
assert_eq!(value.total_length(), 7);
309281
Ok(())

src/lazy/binary/immutable_buffer.rs

Lines changed: 30 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,25 @@
1+
use std::fmt::{Debug, Formatter};
2+
use std::mem;
3+
use std::ops::Range;
4+
5+
use num_bigint::{BigInt, BigUint, Sign};
6+
17
use crate::binary::constants::v1_0::{length_codes, IVM};
28
use crate::binary::int::DecodedInt;
39
use crate::binary::uint::DecodedUInt;
410
use crate::binary::var_int::VarInt;
511
use crate::binary::var_uint::VarUInt;
612
use crate::lazy::binary::encoded_value::EncodedValue;
13+
use crate::lazy::binary::raw::r#struct::LazyRawBinaryFieldName_1_0;
714
use crate::lazy::binary::raw::type_descriptor::{Header, TypeDescriptor, ION_1_0_TYPE_DESCRIPTORS};
8-
use crate::lazy::binary::raw::value::LazyRawBinaryValue_1_0;
15+
use crate::lazy::binary::raw::value::{LazyRawBinaryValue_1_0, LazyRawBinaryVersionMarker_1_0};
16+
use crate::lazy::decoder::LazyRawFieldExpr;
917
use crate::lazy::encoder::binary::v1_1::flex_int::FlexInt;
1018
use crate::lazy::encoder::binary::v1_1::flex_uint::FlexUInt;
19+
use crate::lazy::encoding::BinaryEncoding_1_0;
1120
use crate::result::IonFailure;
1221
use crate::types::UInt;
1322
use crate::{Int, IonError, IonResult, IonType};
14-
use num_bigint::{BigInt, BigUint, Sign};
15-
use std::fmt::{Debug, Formatter};
16-
use std::mem;
1723

1824
// This limit is used for stack-allocating buffer space to encode/decode UInts.
1925
const UINT_STACK_BUFFER_SIZE: usize = 16;
@@ -69,7 +75,7 @@ impl<'a> ImmutableBuffer<'a> {
6975
}
7076

7177
/// Returns a slice containing all of the buffer's bytes.
72-
pub fn bytes(&self) -> &[u8] {
78+
pub fn bytes(&self) -> &'a [u8] {
7379
self.data
7480
}
7581

@@ -100,6 +106,10 @@ impl<'a> ImmutableBuffer<'a> {
100106
self.data.len()
101107
}
102108

109+
pub fn range(&self) -> Range<usize> {
110+
self.offset..self.offset + self.len()
111+
}
112+
103113
/// Returns `true` if there are no bytes in the buffer. Otherwise, returns `false`.
104114
pub fn is_empty(&self) -> bool {
105115
self.data.is_empty()
@@ -143,15 +153,16 @@ impl<'a> ImmutableBuffer<'a> {
143153
/// returns an `Ok(_)` containing a `(major, minor)` version tuple.
144154
///
145155
/// See: <https://amazon-ion.github.io/ion-docs/docs/binary.html#value-streams>
146-
pub fn read_ivm(self) -> ParseResult<'a, (u8, u8)> {
156+
pub fn read_ivm(self) -> ParseResult<'a, LazyRawBinaryVersionMarker_1_0<'a>> {
147157
let bytes = self
148158
.peek_n_bytes(IVM.len())
149159
.ok_or_else(|| IonError::incomplete("an IVM", self.offset()))?;
150160

151161
match bytes {
152162
[0xE0, major, minor, 0xEA] => {
153-
let version = (*major, *minor);
154-
Ok((version, self.consume(IVM.len())))
163+
let matched = ImmutableBuffer::new_with_offset(bytes, self.offset);
164+
let marker = LazyRawBinaryVersionMarker_1_0::new(matched, *major, *minor);
165+
Ok((marker, self.consume(IVM.len())))
155166
}
156167
invalid_ivm => IonResult::decoding_error(format!("invalid IVM: {invalid_ivm:?}")),
157168
}
@@ -607,7 +618,7 @@ impl<'a> ImmutableBuffer<'a> {
607618
}
608619

609620
/// Reads a field ID and a value from the buffer.
610-
pub(crate) fn peek_field(self) -> IonResult<Option<LazyRawBinaryValue_1_0<'a>>> {
621+
pub(crate) fn peek_field(self) -> IonResult<Option<LazyRawFieldExpr<'a, BinaryEncoding_1_0>>> {
611622
let mut input = self;
612623
if self.is_empty() {
613624
// We're at the end of the struct
@@ -625,7 +636,7 @@ impl<'a> ImmutableBuffer<'a> {
625636
let mut type_descriptor = input_after_field_id.peek_type_descriptor()?;
626637
if type_descriptor.is_nop() {
627638
// Read past NOP fields until we find the first one that's an actual value
628-
// or we run out of struct bytes. Note that we read the NOP field(s) from `self` (the
639+
// or we run out of struct bytes. Note that we read the NOP field(s) from `input` (the
629640
// initial input) rather than `input_after_field_id` because it simplifies
630641
// the logic of `read_struct_field_nop_pad()`, which is very rarely called.
631642
(field_id_var_uint, input_after_field_id) = match input.read_struct_field_nop_pad()? {
@@ -643,15 +654,12 @@ impl<'a> ImmutableBuffer<'a> {
643654
};
644655
}
645656

646-
let field_id_length = field_id_var_uint.size_in_bytes() as u8;
647657
let field_id = field_id_var_uint.value();
658+
let matched_field_id = input.slice(0, field_id_var_uint.size_in_bytes());
659+
let field_name = LazyRawBinaryFieldName_1_0::new(field_id, matched_field_id);
648660

649-
let mut value = input_after_field_id.read_value(type_descriptor)?;
650-
value.encoded_value.field_id = Some(field_id);
651-
value.encoded_value.field_id_length = field_id_length;
652-
value.encoded_value.total_length += field_id_length as usize;
653-
value.input = input;
654-
Ok(Some(value))
661+
let field_value = input_after_field_id.read_value(type_descriptor)?;
662+
Ok(Some(LazyRawFieldExpr::NameValue(field_name, field_value)))
655663
}
656664

657665
#[cold]
@@ -745,15 +753,12 @@ impl<'a> ImmutableBuffer<'a> {
745753

746754
let encoded_value = EncodedValue {
747755
header,
748-
// If applicable, these are populated by the caller: `peek_field()`
749-
field_id_length: 0,
750-
field_id: None,
751756
// If applicable, these are populated by the caller: `read_annotated_value()`
752757
annotations_header_length: 0,
753758
annotations_sequence_length: 0,
754759
header_offset,
755760
length_length,
756-
value_length,
761+
value_body_length: value_length,
757762
total_length,
758763
};
759764
let lazy_value = LazyRawBinaryValue_1_0 {
@@ -810,10 +815,12 @@ pub struct AnnotationsWrapper {
810815

811816
#[cfg(test)]
812817
mod tests {
813-
use super::*;
814-
use crate::IonError;
815818
use num_traits::Num;
816819

820+
use crate::IonError;
821+
822+
use super::*;
823+
817824
fn input_test<A: AsRef<[u8]>>(input: A) {
818825
let input = ImmutableBuffer::new(input.as_ref());
819826
// We can peek at the first byte...

0 commit comments

Comments
 (0)