@@ -34,7 +34,66 @@ use std::sync::Arc;
34
34
/// Different than [`crate::GenericByteArray`] as it stores both an offset and length
35
35
/// meaning that take / filter operations can be implemented without copying the underlying data.
36
36
///
37
+ /// See [`StringViewArray`] for storing utf8 encoded string data and
38
+ /// [`BinaryViewArray`] for storing bytes.
39
+ ///
37
40
/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
41
+ ///
42
+ /// A `GenericByteViewArray` stores variable length byte strings. An array of
43
+ /// `N` elements is stored as `N` fixed length "views" and a variable number
44
+ /// of variable length "buffers".
45
+ ///
46
+ /// Each view is a `u128` value whose layout differs depending on the
47
+ /// length of the string stored at that location:
48
+ ///
49
+ /// ```text
50
+ /// ┌──────┬────────────────────────┐
51
+ /// │length│ string value │
52
+ /// Strings (len <= 12) │ │ (padded with 0) │
53
+ /// └──────┴────────────────────────┘
54
+ /// 0 31 127
55
+ ///
56
+ /// ┌───────┬───────┬───────┬───────┐
57
+ /// │length │prefix │ buf │offset │
58
+ /// Strings (len > 12) │ │ │ index │ │
59
+ /// └───────┴───────┴───────┴───────┘
60
+ /// 0 31 63 95 127
61
+ /// ```
62
+ ///
63
+ /// * Strings with length <= 12 are stored directly in the view.
64
+ ///
65
+ /// * Strings with length > 12: The first four bytes are stored inline in the
66
+ /// view and the entire string is stored in one of the buffers.
67
+ ///
68
+ /// Unlike [`GenericByteArray`], there are no constraints on the offsets other
69
+ /// than they must point into a valid buffer. In particular, they can be out of order,
70
+ /// non-contiguous and overlapping.
71
+ ///
72
+ /// For example, in the following diagram, the strings "FishWasInTownTodayYay" and
73
+ /// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
74
+ /// separate buffer while the string "LavaMonster" is stored inlined in the
75
+ /// view. In this case, the same bytes for "Fish" are used to store both strings.
76
+ ///
77
+ /// ```text
78
+ /// ┌───┐
79
+ /// ┌──────┬──────┬──────┬──────┐ offset │...│
80
+ /// "FishWasInTownTodayYay" │ 21 │ Fish │ 0 │ 115 │─ ─ 103 │Mr.│
81
+ /// └──────┴──────┴──────┴──────┘ │ ┌ ─ ─ ─ ─ ▶ │Cru│
82
+ /// ┌──────┬──────┬──────┬──────┐ │mpl│
83
+ /// "CrumpleFacedFish" │ 16 │ Crum │ 0 │ 103 │─ ─│─ ─ ─ ┘ │eFa│
84
+ /// └──────┴──────┴──────┴──────┘ │ced│
85
+ /// ┌──────┬────────────────────┐ └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
86
+ /// "LavaMonster" │ 11 │ LavaMonster\0 │ │hWa│
87
+ /// └──────┴────────────────────┘ offset │sIn│
88
+ /// 115 │Tow│
89
+ /// │nTo│
90
+ /// │day│
91
+ /// u128 "views" │Yay│
92
+ /// buffer 0 │...│
93
+ /// └───┘
94
+ /// ```
95
+ /// [`GenericByteArray`]: crate::array::GenericByteArray
96
+
38
97
pub struct GenericByteViewArray < T : ByteViewType + ?Sized > {
39
98
data_type : DataType ,
40
99
views : ScalarBuffer < u128 > ,
@@ -332,10 +391,19 @@ where
332
391
}
333
392
334
393
/// A [`GenericByteViewArray`] of `[u8]`
394
+ ///
395
+ /// # Example
396
+ /// ```
397
+ /// use arrow_array::BinaryViewArray;
398
+ /// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
399
+ /// assert_eq!(array.value(0), b"hello");
400
+ /// assert_eq!(array.value(3), b"large payload over 12 bytes");
401
+ /// ```
335
402
pub type BinaryViewArray = GenericByteViewArray < BinaryViewType > ;
336
403
337
- /// A [`GenericByteViewArray`] of `str`
404
+ /// A [`GenericByteViewArray`] that stores utf8 data
338
405
///
406
+ /// # Example
339
407
/// ```
340
408
/// use arrow_array::StringViewArray;
341
409
/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
0 commit comments