Skip to content

Commit 4560a87

Browse files
authored
Some fixes to BtrBlocks compressor (#2500)
* Downscale patch indices (TODO: are there patches other than the bit-packing ones?)
* TODO: Why does BtrBlocks incorrectly choose ALP over dict?
1 parent 114368c commit 4560a87

File tree

13 files changed

+77
-30
lines changed

13 files changed

+77
-30
lines changed

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/quickstart/python.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ Use :func:`~vortex.compress` to compress the Vortex array and check the relative
3737

3838
>>> cvtx = vx.compress(vtx)
3939
>>> cvtx.nbytes
40-
14415
40+
15166
4141
>>> cvtx.nbytes / vtx.nbytes
4242
0.10...
4343

encodings/alp/src/alp_rd/array.rs

+7-3
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,11 @@ impl ALPRDArray {
117117
&self.right_parts
118118
}
119119

120+
/// Returns the `right_bit_width` stored on this array.
///
/// NOTE(review): presumably the number of bits kept in each right part — confirm against the encoder.
#[inline]
121+
pub fn right_bit_width(&self) -> u8 {
122+
self.right_bit_width
123+
}
124+
120125
/// Patches of left-most bits.
121126
pub fn left_parts_patches(&self) -> Option<&Patches> {
122127
self.left_parts_patches.as_ref()
@@ -128,9 +133,8 @@ impl ALPRDArray {
128133
&self.left_parts_dictionary
129134
}
130135

131-
#[inline]
132-
pub(crate) fn right_bit_width(&self) -> u8 {
133-
self.right_bit_width
136+
/// Overwrites the patches of left-most bits with `patches`; passing `None` clears them.
///
/// The new value is stored as-is, without any validation in this method.
/// NOTE(review): callers presumably must supply patches that are equivalent to the ones being replaced — confirm.
pub fn replace_left_parts_patches(&mut self, patches: Option<Patches>) {
137+
self.left_parts_patches = patches;
134138
}
135139
}
136140

encodings/fastlanes/src/bitpacking/mod.rs

+4
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,10 @@ impl BitPackedArray {
181181
self.patches.as_ref()
182182
}
183183

184+
/// Overwrites this array's patches with `patches`; passing `None` clears them.
///
/// The new value is stored as-is, without any validation in this method.
/// NOTE(review): callers presumably must supply patches that are equivalent to the ones being replaced — confirm.
pub fn replace_patches(&mut self, patches: Option<Patches>) {
185+
self.patches = patches;
186+
}
187+
184188
#[inline]
185189
pub fn offset(&self) -> u16 {
186190
self.offset

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ packages = ["dummy"] # Required for workspace project
1515

1616
[tool.uv]
1717
managed = true
18-
required-version = ">=0.5.0"
18+
required-version = ">=0.6.0"
1919
# Currently, all dev dependencies live in the root since uv doesn't have transitive dev dependencies.
2020
# See: https://github.com/astral-sh/uv/issues/7541
2121
dev-dependencies = [

pyvortex/src/compress.rs

+3-6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use pyo3::prelude::*;
2-
use vortex::sampling_compressor::SamplingCompressor;
2+
use vortex::compressor::BtrBlocksCompressor;
33

44
use crate::arrays::PyArray;
55
use crate::install_module;
@@ -47,9 +47,6 @@ pub(crate) fn init(py: Python, parent: &Bound<PyModule>) -> PyResult<()> {
4747
/// 'vortex.alp(0x11)(f64?, len=1000)'
4848
#[pyfunction]
4949
pub fn compress<'py>(array: &'py Bound<'py, PyArray>) -> PyResult<Bound<'py, PyArray>> {
50-
let compressor = SamplingCompressor::default();
51-
let inner = compressor
52-
.compress(array.borrow().as_ref(), None)?
53-
.into_array();
54-
PyArray::init(array.py(), inner)
50+
let compressed = BtrBlocksCompressor.compress(array.borrow().as_ref())?;
51+
PyArray::init(array.py(), compressed)
5552
}

uv.lock

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-btrblocks/src/float.rs

+13-13
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use vortex_runend::compress::runend_encode;
1414
use self::stats::FloatStats;
1515
use crate::float::dictionary::dictionary_encode;
1616
use crate::integer::{IntCompressor, IntegerStats};
17+
use crate::patches::compress_patches;
1718
use crate::{
1819
Compressor, CompressorStats, GenerateStatsOptions, Scheme,
1920
estimate_compression_ratio_with_sampling, integer,
@@ -184,14 +185,6 @@ impl Scheme for ALPScheme {
184185
return Ok(0.0);
185186
}
186187

187-
// If Dict/RLE is feasible, we want to do that before ALP, and then only ALP encode
188-
// the values.
189-
if stats.average_run_length >= RUN_END_THRESHOLD
190-
|| stats.distinct_values_count < stats.value_count / 2
191-
{
192-
return Ok(0.0);
193-
}
194-
195188
estimate_compression_ratio_with_sampling(
196189
self,
197190
stats,
@@ -225,10 +218,9 @@ impl Scheme for ALPScheme {
225218
let compressed_alp_ints =
226219
IntCompressor::compress(&alp_ints, is_sample, allowed_cascading - 1, &int_excludes)?;
227220

228-
Ok(
229-
ALPArray::try_new(compressed_alp_ints, alp.exponents(), alp.patches().cloned())?
230-
.into_array(),
231-
)
221+
let patches = alp.patches().map(compress_patches).transpose()?;
222+
223+
Ok(ALPArray::try_new(compressed_alp_ints, alp.exponents(), patches)?.into_array())
232224
}
233225
}
234226

@@ -273,7 +265,15 @@ impl Scheme for ALPRDScheme {
273265
ptype => vortex_panic!("cannot ALPRD compress ptype {ptype}"),
274266
};
275267

276-
Ok(encoder.encode(stats.source()).into_array())
268+
let mut alp_rd = encoder.encode(stats.source());
269+
270+
let patches = alp_rd
271+
.left_parts_patches()
272+
.map(compress_patches)
273+
.transpose()?;
274+
alp_rd.replace_left_parts_patches(patches);
275+
276+
Ok(alp_rd.into_array())
277277
}
278278
}
279279

vortex-btrblocks/src/integer.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ use vortex_zigzag::{ZigZagArray, zigzag_encode};
2626

2727
use crate::downscale::downscale_integer_array;
2828
use crate::integer::dictionary::dictionary_encode;
29+
use crate::patches::compress_patches;
2930
use crate::{
3031
Compressor, CompressorStats, GenerateStatsOptions, Scheme,
3132
estimate_compression_ratio_with_sampling,
@@ -406,7 +407,11 @@ impl Scheme for BitPackingScheme {
406407
if bw as usize == stats.source().ptype().bit_width() {
407408
return Ok(stats.source().clone().into_array());
408409
}
409-
let packed = bitpack_encode(stats.source(), bw)?;
410+
let mut packed = bitpack_encode(stats.source(), bw)?;
411+
412+
let patches = packed.patches().map(compress_patches).transpose()?;
413+
packed.replace_patches(patches);
414+
410415
Ok(packed.into_array())
411416
}
412417
}

vortex-btrblocks/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ pub use crate::temporal::compress_temporal;
1717
mod downscale;
1818
mod float;
1919
pub mod integer;
20+
mod patches;
2021
mod sample;
2122
mod string;
2223
mod temporal;

vortex-btrblocks/src/patches.rs

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
use vortex_array::Array;
2+
use vortex_array::arrays::ConstantArray;
3+
use vortex_array::compute::scalar_at;
4+
use vortex_array::patches::Patches;
5+
use vortex_error::VortexResult;
6+
7+
use crate::downscale::downscale_integer_array;
8+
9+
/// Compresses the given patches by downscaling integers and checking for constant values.
10+
pub fn compress_patches(patches: &Patches) -> VortexResult<Patches> {
11+
// Downscale the patch indices.
12+
let indices = downscale_integer_array(patches.indices().clone())?;
13+
14+
// Check if the values are constant.
15+
let values = patches.values();
16+
let values = if values
17+
.statistics()
18+
.compute_is_constant()
19+
.unwrap_or_default()
20+
{
21+
ConstantArray::new(scalar_at(values, 0)?, values.len()).into_array()
22+
} else {
23+
values.clone()
24+
};
25+
26+
Ok(Patches::new(
27+
patches.array_len(),
28+
patches.offset(),
29+
indices,
30+
values,
31+
))
32+
}

vortex/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ workspace = true
2424
[dependencies]
2525
vortex-alp = { workspace = true }
2626
vortex-array = { workspace = true }
27+
vortex-btrblocks = { workspace = true }
2728
vortex-buffer = { workspace = true }
2829
vortex-bytebool = { workspace = true }
2930
vortex-datetime-dtype = { workspace = true }

vortex/src/lib.rs

+5-5
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22

33
pub use vortex_array::*;
44
pub use {
5-
vortex_buffer as buffer, vortex_datetime_dtype as datetime_dtype, vortex_dtype as dtype,
6-
vortex_error as error, vortex_expr as expr, vortex_file as file,
7-
vortex_flatbuffers as flatbuffers, vortex_io as io, vortex_ipc as ipc, vortex_layout as layout,
8-
vortex_mask as mask, vortex_proto as proto, vortex_sampling_compressor as sampling_compressor,
9-
vortex_scalar as scalar,
5+
vortex_btrblocks as compressor, vortex_buffer as buffer,
6+
vortex_datetime_dtype as datetime_dtype, vortex_dtype as dtype, vortex_error as error,
7+
vortex_expr as expr, vortex_file as file, vortex_flatbuffers as flatbuffers, vortex_io as io,
8+
vortex_ipc as ipc, vortex_layout as layout, vortex_mask as mask, vortex_proto as proto,
9+
vortex_sampling_compressor as sampling_compressor, vortex_scalar as scalar,
1010
};
1111

1212
pub mod encodings {

0 commit comments

Comments
 (0)