Skip to content

Commit 916b8dc

Browse files
Refactor compression / XOR last model value (#105)
* Disable residuals to make XOR to last model value
* Split modelardb_compression into more sub-modules
* Simplify names of types in the compression module
* Further simplify names of types in compression
* Compress inline residuals using model's end value
* Fix computing slope and intercept with residuals
* Replace a and b with slope and intercept for Swing
* Document remaining functions, methods, and types
* Update sum and grid to take residuals into account
* Structure generated_columns() like error_bounds()
* Update based on comments from @CGodiksen
* Fix typo pointed out by @CGodiksen
1 parent 4e20954 commit 916b8dc

File tree

21 files changed

+2193
-1820
lines changed

21 files changed

+2193
-1820
lines changed

crates/modelardb_compression/src/compression.rs

Lines changed: 743 additions & 0 deletions
Large diffs are not rendered by default.

crates/modelardb_compression/src/lib.rs

Lines changed: 53 additions & 1310 deletions
Large diffs are not rendered by default.

crates/modelardb_compression/src/merge.rs

Lines changed: 439 additions & 0 deletions
Large diffs are not rendered by default.

crates/modelardb_compression/src/models/bits.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,8 @@ impl BitVecBuilder {
143143
}
144144

145145
/// Return the number of bytes required to store the appended bits.
146-
pub fn len(&self) -> usize {
146+
#[cfg(test)]
147+
fn len(&self) -> usize {
147148
self.bytes.len() + (self.remaining_bits != 8) as usize
148149
}
149150

crates/modelardb_compression/src/models/gorilla.rs

Lines changed: 92 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
//!
2323
//! [Gorilla paper]: https://www.vldb.org/pvldb/vol8/p1816-teller.pdf
2424
25-
use modelardb_common::schemas::COMPRESSED_METADATA_SIZE_IN_BYTES;
2625
use modelardb_common::types::{Timestamp, UnivariateId, UnivariateIdBuilder, Value, ValueBuilder};
2726

2827
use crate::models;
@@ -34,6 +33,10 @@ use crate::models::ErrorBound;
3433
pub struct Gorilla {
3534
/// Maximum relative error for the value of each data point.
3635
error_bound: ErrorBound,
36+
/// Min value compressed and added to `compressed_values`.
37+
min_value: Value,
38+
/// Max value compressed and added to `compressed_values`.
39+
max_value: Value,
3740
/// Last value compressed and added to `compressed_values`.
3841
last_value: Value,
3942
/// Number of leading zero bits for the last value that was compressed by
@@ -52,6 +55,8 @@ impl Gorilla {
5255
pub fn new(error_bound: ErrorBound) -> Self {
5356
Self {
5457
error_bound,
58+
min_value: Value::NAN,
59+
max_value: Value::NAN,
5560
last_value: 0.0,
5661
last_leading_zero_bits: u8::MAX,
5762
last_trailing_zero_bits: 0,
@@ -60,9 +65,33 @@ impl Gorilla {
6065
}
6166
}
6267

63-
/// Compress `value` using XOR and a variable length binary encoding and
64-
/// append the compressed value to an internal buffer in [`Gorilla`].
65-
pub fn compress_value(&mut self, value: Value) {
68+
/// Store the first value in full if this instance of [`Gorilla`] is empty and then compress the
69+
/// remaining `values` using XOR and a variable length binary encoding before storing them.
70+
pub fn compress_values(&mut self, values: &[Value]) {
71+
for value in values {
72+
if self.compressed_values.is_empty() {
73+
// Store the first value uncompressed using size_of::<Value> bits.
74+
self.compressed_values
75+
.append_bits(value.to_bits(), models::VALUE_SIZE_IN_BITS);
76+
77+
self.update_min_max_and_last_value(*value);
78+
} else {
79+
self.compress_value_xor_last_value(*value);
80+
};
81+
}
82+
}
83+
84+
/// Assume `model_last_value` is stored fully elsewhere, set it as the current last value, and then
85+
/// compress each of the values in `values` using XOR and a variable length binary encoding.
86+
pub fn compress_values_without_first(&mut self, values: &[Value], model_last_value: Value) {
87+
self.last_value = model_last_value;
88+
for value in values {
89+
self.compress_value_xor_last_value(*value);
90+
}
91+
}
92+
93+
/// Compress `value` using XOR and a variable length binary encoding and then store it.
94+
fn compress_value_xor_last_value(&mut self, value: Value) {
6695
// The best case for Gorilla is storing duplicate values.
6796
let value = if models::is_value_within_error_bound(self.error_bound, value, self.last_value)
6897
{
@@ -75,11 +104,7 @@ impl Gorilla {
75104
let last_value_as_integer = self.last_value.to_bits();
76105
let value_xor_last_value = value_as_integer ^ last_value_as_integer;
77106

78-
if self.compressed_values.is_empty() {
79-
// Store the first value uncompressed using size_of::<Value> bits.
80-
self.compressed_values
81-
.append_bits(value_as_integer, models::VALUE_SIZE_IN_BITS);
82-
} else if value_xor_last_value == 0 {
107+
if value_xor_last_value == 0 {
83108
// Store each repeated value as a single zero bit.
84109
self.compressed_values.append_a_zero_bit();
85110
} else {
@@ -119,45 +144,49 @@ impl Gorilla {
119144
self.last_trailing_zero_bits = trailing_zero_bits;
120145
}
121146
}
122-
self.last_value = value;
123-
self.length += 1;
124-
}
125147

126-
/// Return the number of values currently compressed using XOR and a
127-
/// variable length binary encoding.
128-
#[allow(clippy::len_without_is_empty)]
129-
pub fn len(&self) -> usize {
130-
self.length
148+
self.update_min_max_and_last_value(value);
131149
}
132150

133-
/// Return the number of bytes currently used per data point on average.
134-
pub fn bytes_per_value(&self) -> f32 {
135-
// Gorilla does not use metadata for encoding values, only the data in compressed_values.
136-
(COMPRESSED_METADATA_SIZE_IN_BYTES.to_owned() + self.compressed_values.len()) as f32
137-
/ self.length as f32
151+
/// Update the current minimum, maximum, and last value based on `value`.
152+
fn update_min_max_and_last_value(&mut self, value: Value) {
153+
self.min_value = Value::min(self.min_value, value);
154+
self.max_value = Value::max(self.max_value, value);
155+
self.last_value = value;
156+
self.length += 1;
138157
}
139158

140159
/// Return the values compressed using XOR and a variable length binary
141-
/// encoding.
142-
pub fn compressed_values(self) -> Vec<u8> {
143-
self.compressed_values.finish()
160+
/// encoding, the compressed minimum value, and the compressed maximum value.
161+
pub fn model(self) -> (Vec<u8>, Value, Value) {
162+
(
163+
self.compressed_values.finish(),
164+
self.min_value,
165+
self.max_value,
166+
)
144167
}
145168
}
146169

147-
/// Compute the sum of the values for a time series segment whose values are
148-
/// compressed using Gorilla's compression method for floating-point values.
149-
pub fn sum(start_time: Timestamp, end_time: Timestamp, timestamps: &[u8], values: &[u8]) -> Value {
170+
/// Compute the sum of the values for a time series segment whose values are compressed using
171+
/// Gorilla's compression method for floating-point values. If `maybe_model_last_value` is provided,
172+
/// it is assumed the first value in `values` is compressed against it instead of being stored in
173+
/// full, i.e., uncompressed.
174+
pub fn sum(length: usize, values: &[u8], maybe_model_last_value: Option<Value>) -> Value {
150175
// This function replicates code from gorilla::grid() as it isn't necessary
151176
// to store the univariate ids, timestamps, and values in arrays for a sum.
152177
// So any changes to the decompression must be mirrored in gorilla::grid().
153-
let length = models::len(start_time, end_time, timestamps);
154178
let mut bits = BitReader::try_new(values).unwrap();
155179
let mut leading_zeros = u8::MAX;
156180
let mut trailing_zeros: u8 = 0;
157-
let mut last_value = bits.read_bits(models::VALUE_SIZE_IN_BITS);
158181

159-
// The first value is stored uncompressed using size_of::<Value> bits.
160-
let mut sum = Value::from_bits(last_value);
182+
let (mut last_value, mut sum) = if let Some(model_last_value) = maybe_model_last_value {
183+
// The first value is stored compressed against model_last_value.
184+
(model_last_value.to_bits(), 0.0)
185+
} else {
186+
// The first value is stored uncompressed using size_of::<Value> bits.
187+
let first_value = bits.read_bits(models::VALUE_SIZE_IN_BITS);
188+
(first_value, Value::from_bits(first_value))
189+
};
161190

162191
// Then values are stored using XOR and a variable length binary encoding.
163192
for _ in 0..length - 1 {
@@ -182,29 +211,38 @@ pub fn sum(start_time: Timestamp, end_time: Timestamp, timestamps: &[u8], values
182211
sum
183212
}
184213

185-
/// Decompress the values in `values` for the `timestamps` without matching
186-
/// values in `value_builder`. The values in `values` are compressed using
187-
/// Gorilla's compression method for floating-point values. `univariate_ids`
188-
/// and `values` are appended to `univariate_id_builder` and `value_builder`.
214+
/// Decompress all of the values in `values` for the `timestamps` without matching values in
215+
/// `value_builder`. The values in `values` are compressed using Gorilla's compression method for
216+
/// floating-point values. `univariate_ids` and `values` are appended to `univariate_id_builder` and
217+
/// `value_builder`. If `maybe_model_last_value` is provided, it is assumed the first value in
218+
/// `values` is compressed against it instead of being stored in full, i.e., uncompressed.
189219
pub fn grid(
190220
univariate_id: UnivariateId,
191221
values: &[u8],
192222
univariate_id_builder: &mut UnivariateIdBuilder,
193223
timestamps: &[Timestamp],
194224
value_builder: &mut ValueBuilder,
225+
maybe_model_last_value: Option<Value>,
195226
) {
196227
// Changes to the decompression must be mirrored in gorilla::sum().
197228
let mut bits = BitReader::try_new(values).unwrap();
198229
let mut leading_zeros = u8::MAX;
199230
let mut trailing_zeros: u8 = 0;
200-
let mut last_value = bits.read_bits(models::VALUE_SIZE_IN_BITS);
201231

202-
// The first value is stored uncompressed using size_of::<Value> bits.
203-
univariate_id_builder.append_value(univariate_id);
204-
value_builder.append_value(Value::from_bits(last_value));
232+
let mut last_value = if let Some(model_last_value) = maybe_model_last_value {
233+
// The first value is stored compressed against model_last_value.
234+
model_last_value.to_bits()
235+
} else {
236+
// The first value is stored uncompressed using size_of::<Value> bits.
237+
let first_value = bits.read_bits(models::VALUE_SIZE_IN_BITS);
238+
univariate_id_builder.append_value(univariate_id);
239+
value_builder.append_value(Value::from_bits(first_value));
240+
first_value
241+
};
205242

206-
// Then values are stored using XOR and a variable length binary encoding.
207-
for _ in 0..timestamps.len() - 1 {
243+
// Then values are stored using XOR and a variable length binary encoding. If last_value was
244+
// provided by the model, the first value has not been read from values so all must be read now.
245+
for _ in 0..timestamps.len() - maybe_model_last_value.is_none() as usize {
208246
if bits.read_bit() {
209247
if bits.read_bit() {
210248
// New leading and trailing zeros.
@@ -241,7 +279,7 @@ mod tests {
241279
#[test]
242280
fn test_empty_sequence() {
243281
let error_bound = ErrorBound::try_new(0.0).unwrap();
244-
assert!(Gorilla::new(error_bound).compressed_values().is_empty());
282+
assert!(Gorilla::new(error_bound).model().0.is_empty());
245283
}
246284

247285
proptest! {
@@ -250,7 +288,7 @@ mod tests {
250288
let error_bound = ErrorBound::try_new(0.0).unwrap();
251289
let mut model_type = Gorilla::new(error_bound);
252290

253-
model_type.compress_value(value);
291+
model_type.compress_values(&[value]);
254292

255293
prop_assert!(models::equal_or_nan(value as f64, model_type.last_value as f64));
256294
prop_assert_eq!(model_type.last_leading_zero_bits, u8::MAX);
@@ -262,8 +300,7 @@ mod tests {
262300
let error_bound = ErrorBound::try_new(0.0).unwrap();
263301
let mut model_type = Gorilla::new(error_bound);
264302

265-
model_type.compress_value(value);
266-
model_type.compress_value(value);
303+
model_type.compress_values(&[value, value]);
267304

268305
prop_assert!(models::equal_or_nan(value as f64, model_type.last_value as f64));
269306
prop_assert_eq!(model_type.last_leading_zero_bits, u8::MAX);
@@ -276,8 +313,7 @@ mod tests {
276313
let error_bound = ErrorBound::try_new(0.0).unwrap();
277314
let mut model_type = Gorilla::new(error_bound);
278315

279-
model_type.compress_value(37.0);
280-
model_type.compress_value(73.0);
316+
model_type.compress_values(&[37.0, 73.0]);
281317

282318
assert!(models::equal_or_nan(73.0, model_type.last_value as f64));
283319
assert_eq!(model_type.last_leading_zero_bits, 8);
@@ -289,9 +325,7 @@ mod tests {
289325
let error_bound = ErrorBound::try_new(0.0).unwrap();
290326
let mut model_type = Gorilla::new(error_bound);
291327

292-
model_type.compress_value(37.0);
293-
model_type.compress_value(71.0);
294-
model_type.compress_value(73.0);
328+
model_type.compress_values(&[37.0, 71.0, 73.0]);
295329

296330
assert!(models::equal_or_nan(73.0, model_type.last_value as f64));
297331
assert_eq!(model_type.last_leading_zero_bits, 8);
@@ -303,12 +337,12 @@ mod tests {
303337
let error_bound = ErrorBound::try_new(10.0).unwrap();
304338
let mut model_type = Gorilla::new(error_bound);
305339

306-
model_type.compress_value(10.0);
340+
model_type.compress_values(&[10.0]);
307341
let before_last_value = model_type.last_value;
308342
let before_last_leading_zero_bits = model_type.last_leading_zero_bits;
309343
let before_last_trailing_zero_bits = model_type.last_trailing_zero_bits;
310344

311-
model_type.compress_value(11.0);
345+
model_type.compress_values(&[11.0]);
312346

313347
// State should be unchanged when the value is within the error bound.
314348
assert_eq!(before_last_value, model_type.last_value);
@@ -328,7 +362,7 @@ mod tests {
328362
fn test_sum(values in collection::vec(ProptestValue::ANY, 0..50)) {
329363
prop_assume!(!values.is_empty());
330364
let compressed_values = compress_values_using_gorilla(&values);
331-
let sum = sum(1, values.len() as i64, &values.len().to_be_bytes(), &compressed_values);
365+
let sum = sum(values.len(), &compressed_values, None);
332366
let expected_sum = aggregate::sum(&ValueArray::from_iter_values(values)).unwrap();
333367
prop_assert!(models::equal_or_nan(expected_sum as f64, sum as f64));
334368
}
@@ -349,7 +383,8 @@ mod tests {
349383
&compressed_values,
350384
&mut univariate_id_builder,
351385
&timestamps,
352-
&mut value_builder
386+
&mut value_builder,
387+
None,
353388
);
354389

355390
let univariate_ids_array = univariate_id_builder.finish();
@@ -371,10 +406,8 @@ mod tests {
371406
fn compress_values_using_gorilla(values: &[Value]) -> Vec<u8> {
372407
let error_bound = ErrorBound::try_new(0.0).unwrap();
373408
let mut model_type = Gorilla::new(error_bound);
374-
for value in values {
375-
model_type.compress_value(*value);
376-
}
377-
model_type.compressed_values()
409+
model_type.compress_values(values);
410+
model_type.compressed_values.finish()
378411
}
379412

380413
fn slice_of_value_equal(values_one: &[Value], values_two: &[Value]) -> bool {

0 commit comments

Comments
 (0)