22
22
//!
23
23
//! [Gorilla paper]: https://www.vldb.org/pvldb/vol8/p1816-teller.pdf
24
24
25
- use modelardb_common:: schemas:: COMPRESSED_METADATA_SIZE_IN_BYTES ;
26
25
use modelardb_common:: types:: { Timestamp , UnivariateId , UnivariateIdBuilder , Value , ValueBuilder } ;
27
26
28
27
use crate :: models;
@@ -34,6 +33,10 @@ use crate::models::ErrorBound;
34
33
pub struct Gorilla {
35
34
/// Maximum relative error for the value of each data point.
36
35
error_bound : ErrorBound ,
36
+ /// Min value compressed and added to `compressed_values`.
37
+ min_value : Value ,
38
+ /// Max value compressed and added to `compressed_values`.
39
+ max_value : Value ,
37
40
/// Last value compressed and added to `compressed_values`.
38
41
last_value : Value ,
39
42
/// Number of leading zero bits for the last value that was compressed by
@@ -52,6 +55,8 @@ impl Gorilla {
52
55
pub fn new ( error_bound : ErrorBound ) -> Self {
53
56
Self {
54
57
error_bound,
58
+ min_value : Value :: NAN ,
59
+ max_value : Value :: NAN ,
55
60
last_value : 0.0 ,
56
61
last_leading_zero_bits : u8:: MAX ,
57
62
last_trailing_zero_bits : 0 ,
@@ -60,9 +65,33 @@ impl Gorilla {
60
65
}
61
66
}
62
67
63
- /// Compress `value` using XOR and a variable length binary encoding and
64
- /// append the compressed value to an internal buffer in [`Gorilla`].
65
- pub fn compress_value ( & mut self , value : Value ) {
68
+ /// Store the first value in full if this instance of [`Gorilla`] is empty and then compress the
69
+ /// remaining `values` using XOR and a variable length binary encoding before storing them.
70
+ pub fn compress_values ( & mut self , values : & [ Value ] ) {
71
+ for value in values {
72
+ if self . compressed_values . is_empty ( ) {
73
+ // Store the first value uncompressed using size_of::<Value> bits.
74
+ self . compressed_values
75
+ . append_bits ( value. to_bits ( ) , models:: VALUE_SIZE_IN_BITS ) ;
76
+
77
+ self . update_min_max_and_last_value ( * value) ;
78
+ } else {
79
+ self . compress_value_xor_last_value ( * value) ;
80
+ } ;
81
+ }
82
+ }
83
+
84
+ /// Assume `last_value` is stored fully elsewhere, set it as the current last value, and then
85
+ /// compress each of the values in `values` using XOR and a variable length binary encoding.
86
+ pub fn compress_values_without_first ( & mut self , values : & [ Value ] , model_last_value : Value ) {
87
+ self . last_value = model_last_value;
88
+ for value in values {
89
+ self . compress_value_xor_last_value ( * value) ;
90
+ }
91
+ }
92
+
93
+ /// Compress `value` using XOR and a variable length binary encoding and then store it.
94
+ fn compress_value_xor_last_value ( & mut self , value : Value ) {
66
95
// The best case for Gorilla is storing duplicate values.
67
96
let value = if models:: is_value_within_error_bound ( self . error_bound , value, self . last_value )
68
97
{
@@ -75,11 +104,7 @@ impl Gorilla {
75
104
let last_value_as_integer = self . last_value . to_bits ( ) ;
76
105
let value_xor_last_value = value_as_integer ^ last_value_as_integer;
77
106
78
- if self . compressed_values . is_empty ( ) {
79
- // Store the first value uncompressed using size_of::<Value> bits.
80
- self . compressed_values
81
- . append_bits ( value_as_integer, models:: VALUE_SIZE_IN_BITS ) ;
82
- } else if value_xor_last_value == 0 {
107
+ if value_xor_last_value == 0 {
83
108
// Store each repeated value as a single zero bit.
84
109
self . compressed_values . append_a_zero_bit ( ) ;
85
110
} else {
@@ -119,45 +144,49 @@ impl Gorilla {
119
144
self . last_trailing_zero_bits = trailing_zero_bits;
120
145
}
121
146
}
122
- self . last_value = value;
123
- self . length += 1 ;
124
- }
125
147
126
- /// Return the number of values currently compressed using XOR and a
127
- /// variable length binary encoding.
128
- #[ allow( clippy:: len_without_is_empty) ]
129
- pub fn len ( & self ) -> usize {
130
- self . length
148
+ self . update_min_max_and_last_value ( value) ;
131
149
}
132
150
133
- /// Return the number of bytes currently used per data point on average.
134
- pub fn bytes_per_value ( & self ) -> f32 {
135
- // Gorilla does not use metadata for encoding values, only the data in compressed_values.
136
- ( COMPRESSED_METADATA_SIZE_IN_BYTES . to_owned ( ) + self . compressed_values . len ( ) ) as f32
137
- / self . length as f32
151
+ /// Update the current minimum, maximum, and last value based on `value`.
152
+ fn update_min_max_and_last_value ( & mut self , value : Value ) {
153
+ self . min_value = Value :: min ( self . min_value , value) ;
154
+ self . max_value = Value :: max ( self . max_value , value) ;
155
+ self . last_value = value;
156
+ self . length += 1 ;
138
157
}
139
158
140
159
/// Return the values compressed using XOR and a variable length binary
141
- /// encoding.
142
- pub fn compressed_values ( self ) -> Vec < u8 > {
143
- self . compressed_values . finish ( )
160
+ /// encoding, the compressed minimum value, and the compressed maximum value.
161
+ pub fn model ( self ) -> ( Vec < u8 > , Value , Value ) {
162
+ (
163
+ self . compressed_values . finish ( ) ,
164
+ self . min_value ,
165
+ self . max_value ,
166
+ )
144
167
}
145
168
}
146
169
147
- /// Compute the sum of the values for a time series segment whose values are
148
- /// compressed using Gorilla's compression method for floating-point values.
149
- pub fn sum ( start_time : Timestamp , end_time : Timestamp , timestamps : & [ u8 ] , values : & [ u8 ] ) -> Value {
170
+ /// Compute the sum of the values for a time series segment whose values are compressed using
171
+ /// Gorilla's compression method for floating-point values. If `maybe_model_last_value` is provided,
172
+ /// it is assumed the first value in `values` is compressed against it instead of being stored in
173
+ /// full, i.e., uncompressed.
174
+ pub fn sum ( length : usize , values : & [ u8 ] , maybe_model_last_value : Option < Value > ) -> Value {
150
175
// This function replicates code from gorilla::grid() as it isn't necessary
151
176
// to store the univariate ids, timestamps, and values in arrays for a sum.
152
177
// So any changes to the decompression must be mirrored in gorilla::grid().
153
- let length = models:: len ( start_time, end_time, timestamps) ;
154
178
let mut bits = BitReader :: try_new ( values) . unwrap ( ) ;
155
179
let mut leading_zeros = u8:: MAX ;
156
180
let mut trailing_zeros: u8 = 0 ;
157
- let mut last_value = bits. read_bits ( models:: VALUE_SIZE_IN_BITS ) ;
158
181
159
- // The first value is stored uncompressed using size_of::<Value> bits.
160
- let mut sum = Value :: from_bits ( last_value) ;
182
+ let ( mut last_value, mut sum) = if let Some ( model_last_value) = maybe_model_last_value {
183
+ // The first value is stored compressed against model_last_value.
184
+ ( model_last_value. to_bits ( ) , 0.0 )
185
+ } else {
186
+ // The first value is stored uncompressed using size_of::<Value> bits.
187
+ let first_value = bits. read_bits ( models:: VALUE_SIZE_IN_BITS ) ;
188
+ ( first_value, Value :: from_bits ( first_value) )
189
+ } ;
161
190
162
191
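// Then values are stored using XOR and a variable length binary encoding.
// NOTE(review): when maybe_model_last_value is Some, no value has been read from `values` yet, but
// this loop still runs length - 1 times, whereas gorilla::grid() reads one more value in that
// case. Confirm whether the last value is left out of the sum here.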
163
192
for _ in 0 ..length - 1 {
@@ -182,29 +211,38 @@ pub fn sum(start_time: Timestamp, end_time: Timestamp, timestamps: &[u8], values
182
211
sum
183
212
}
184
213
185
- /// Decompress the values in `values` for the `timestamps` without matching
186
- /// values in `value_builder`. The values in `values` are compressed using
187
- /// Gorilla's compression method for floating-point values. `univariate_ids`
188
- /// and `values` are appended to `univariate_id_builder` and `value_builder`.
214
+ /// Decompress all of the values in `values` for the `timestamps` without matching values in
215
+ /// `value_builder`. The values in `values` are compressed using Gorilla's compression method for
216
+ /// floating-point values. `univariate_ids` and `values` are appended to `univariate_id_builder` and
217
+ /// `value_builder`. If `maybe_model_last_value` is provided, it is assumed the first value in
218
+ /// `values` is compressed against it instead of being stored in full, i.e., uncompressed.
189
219
pub fn grid (
190
220
univariate_id : UnivariateId ,
191
221
values : & [ u8 ] ,
192
222
univariate_id_builder : & mut UnivariateIdBuilder ,
193
223
timestamps : & [ Timestamp ] ,
194
224
value_builder : & mut ValueBuilder ,
225
+ maybe_model_last_value : Option < Value > ,
195
226
) {
196
227
// Changes to the decompression must be mirrored in gorilla::sum().
197
228
let mut bits = BitReader :: try_new ( values) . unwrap ( ) ;
198
229
let mut leading_zeros = u8:: MAX ;
199
230
let mut trailing_zeros: u8 = 0 ;
200
- let mut last_value = bits. read_bits ( models:: VALUE_SIZE_IN_BITS ) ;
201
231
202
- // The first value is stored uncompressed using size_of::<Value> bits.
203
- univariate_id_builder. append_value ( univariate_id) ;
204
- value_builder. append_value ( Value :: from_bits ( last_value) ) ;
232
+ let mut last_value = if let Some ( model_last_value) = maybe_model_last_value {
233
+ // The first value is stored compressed against model_last_value.
234
+ model_last_value. to_bits ( )
235
+ } else {
236
+ // The first value is stored uncompressed using size_of::<Value> bits.
237
+ let first_value = bits. read_bits ( models:: VALUE_SIZE_IN_BITS ) ;
238
+ univariate_id_builder. append_value ( univariate_id) ;
239
+ value_builder. append_value ( Value :: from_bits ( first_value) ) ;
240
+ first_value
241
+ } ;
205
242
206
- // Then values are stored using XOR and a variable length binary encoding.
207
- for _ in 0 ..timestamps. len ( ) - 1 {
243
+ // Then values are stored using XOR and a variable length binary encoding. If last_value was
244
+ // provided by the model, the first value has not been read from values so all must be read now.
245
+ for _ in 0 ..timestamps. len ( ) - maybe_model_last_value. is_none ( ) as usize {
208
246
if bits. read_bit ( ) {
209
247
if bits. read_bit ( ) {
210
248
// New leading and trailing zeros.
@@ -241,7 +279,7 @@ mod tests {
241
279
/// A [`Gorilla`] that has not compressed any values must produce no compressed bytes.
#[test]
fn test_empty_sequence() {
    let gorilla = Gorilla::new(ErrorBound::try_new(0.0).unwrap());
    let (compressed_values, _min_value, _max_value) = gorilla.model();
    assert!(compressed_values.is_empty());
}
246
284
247
285
proptest ! {
@@ -250,7 +288,7 @@ mod tests {
250
288
let error_bound = ErrorBound :: try_new( 0.0 ) . unwrap( ) ;
251
289
let mut model_type = Gorilla :: new( error_bound) ;
252
290
253
- model_type. compress_value ( value) ;
291
+ model_type. compress_values ( & [ value] ) ;
254
292
255
293
prop_assert!( models:: equal_or_nan( value as f64 , model_type. last_value as f64 ) ) ;
256
294
prop_assert_eq!( model_type. last_leading_zero_bits, u8 :: MAX ) ;
@@ -262,8 +300,7 @@ mod tests {
262
300
let error_bound = ErrorBound :: try_new( 0.0 ) . unwrap( ) ;
263
301
let mut model_type = Gorilla :: new( error_bound) ;
264
302
265
- model_type. compress_value( value) ;
266
- model_type. compress_value( value) ;
303
+ model_type. compress_values( & [ value, value] ) ;
267
304
268
305
prop_assert!( models:: equal_or_nan( value as f64 , model_type. last_value as f64 ) ) ;
269
306
prop_assert_eq!( model_type. last_leading_zero_bits, u8 :: MAX ) ;
@@ -276,8 +313,7 @@ mod tests {
276
313
let error_bound = ErrorBound :: try_new ( 0.0 ) . unwrap ( ) ;
277
314
let mut model_type = Gorilla :: new ( error_bound) ;
278
315
279
- model_type. compress_value ( 37.0 ) ;
280
- model_type. compress_value ( 73.0 ) ;
316
+ model_type. compress_values ( & [ 37.0 , 73.0 ] ) ;
281
317
282
318
assert ! ( models:: equal_or_nan( 73.0 , model_type. last_value as f64 ) ) ;
283
319
assert_eq ! ( model_type. last_leading_zero_bits, 8 ) ;
@@ -289,9 +325,7 @@ mod tests {
289
325
let error_bound = ErrorBound :: try_new ( 0.0 ) . unwrap ( ) ;
290
326
let mut model_type = Gorilla :: new ( error_bound) ;
291
327
292
- model_type. compress_value ( 37.0 ) ;
293
- model_type. compress_value ( 71.0 ) ;
294
- model_type. compress_value ( 73.0 ) ;
328
+ model_type. compress_values ( & [ 37.0 , 71.0 , 73.0 ] ) ;
295
329
296
330
assert ! ( models:: equal_or_nan( 73.0 , model_type. last_value as f64 ) ) ;
297
331
assert_eq ! ( model_type. last_leading_zero_bits, 8 ) ;
@@ -303,12 +337,12 @@ mod tests {
303
337
let error_bound = ErrorBound :: try_new ( 10.0 ) . unwrap ( ) ;
304
338
let mut model_type = Gorilla :: new ( error_bound) ;
305
339
306
- model_type. compress_value ( 10.0 ) ;
340
+ model_type. compress_values ( & [ 10.0 ] ) ;
307
341
let before_last_value = model_type. last_value ;
308
342
let before_last_leading_zero_bits = model_type. last_leading_zero_bits ;
309
343
let before_last_trailing_zero_bits = model_type. last_trailing_zero_bits ;
310
344
311
- model_type. compress_value ( 11.0 ) ;
345
+ model_type. compress_values ( & [ 11.0 ] ) ;
312
346
313
347
// State should be unchanged when the value is within the error bound.
314
348
assert_eq ! ( before_last_value, model_type. last_value) ;
@@ -328,7 +362,7 @@ mod tests {
328
362
fn test_sum( values in collection:: vec( ProptestValue :: ANY , 0 ..50 ) ) {
329
363
prop_assume!( !values. is_empty( ) ) ;
330
364
let compressed_values = compress_values_using_gorilla( & values) ;
331
- let sum = sum( 1 , values. len( ) as i64 , & values . len ( ) . to_be_bytes ( ) , & compressed_values ) ;
365
+ let sum = sum( values. len( ) , & compressed_values , None ) ;
332
366
let expected_sum = aggregate:: sum( & ValueArray :: from_iter_values( values) ) . unwrap( ) ;
333
367
prop_assert!( models:: equal_or_nan( expected_sum as f64 , sum as f64 ) ) ;
334
368
}
@@ -349,7 +383,8 @@ mod tests {
349
383
& compressed_values,
350
384
& mut univariate_id_builder,
351
385
& timestamps,
352
- & mut value_builder
386
+ & mut value_builder,
387
+ None ,
353
388
) ;
354
389
355
390
let univariate_ids_array = univariate_id_builder. finish( ) ;
@@ -371,10 +406,8 @@ mod tests {
371
406
fn compress_values_using_gorilla ( values : & [ Value ] ) -> Vec < u8 > {
372
407
let error_bound = ErrorBound :: try_new ( 0.0 ) . unwrap ( ) ;
373
408
let mut model_type = Gorilla :: new ( error_bound) ;
374
- for value in values {
375
- model_type. compress_value ( * value) ;
376
- }
377
- model_type. compressed_values ( )
409
+ model_type. compress_values ( values) ;
410
+ model_type. compressed_values . finish ( )
378
411
}
379
412
380
413
fn slice_of_value_equal ( values_one : & [ Value ] , values_two : & [ Value ] ) -> bool {
0 commit comments