Some fixes to BtrBlocks compressor (#2500)

gatesn · web-flow · commit 4560a8728fad · 2025-02-25T17:32:33.000Z
* Downscale patch indices (TODO: do any encodings other than
bit-packing carry patches?)
* TODO: investigate why BtrBlocks makes the wrong choice between ALP and dict.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/docs/quickstart/python.rst b/docs/quickstart/python.rst
@@ -37,7 +37,7 @@ Use :func:`~vortex.compress` to compress the Vortex array and check the relative
 
    >>> cvtx = vx.compress(vtx)
    >>> cvtx.nbytes
-   14415
+   15166
    >>> cvtx.nbytes / vtx.nbytes
    0.10...
 
diff --git a/encodings/alp/src/alp_rd/array.rs b/encodings/alp/src/alp_rd/array.rs
@@ -117,6 +117,11 @@ impl ALPRDArray {
         &self.right_parts
     }
 
+    #[inline]
+    pub fn right_bit_width(&self) -> u8 {
+        self.right_bit_width
+    }
+
     /// Patches of left-most bits.
     pub fn left_parts_patches(&self) -> Option<&Patches> {
         self.left_parts_patches.as_ref()
@@ -128,9 +133,8 @@ impl ALPRDArray {
         &self.left_parts_dictionary
     }
 
-    #[inline]
-    pub(crate) fn right_bit_width(&self) -> u8 {
-        self.right_bit_width
+    pub fn replace_left_parts_patches(&mut self, patches: Option<Patches>) {
+        self.left_parts_patches = patches;
     }
 }
 
diff --git a/encodings/fastlanes/src/bitpacking/mod.rs b/encodings/fastlanes/src/bitpacking/mod.rs
@@ -181,6 +181,10 @@ impl BitPackedArray {
         self.patches.as_ref()
     }
 
+    pub fn replace_patches(&mut self, patches: Option<Patches>) {
+        self.patches = patches;
+    }
+
     #[inline]
     pub fn offset(&self) -> u16 {
         self.offset
diff --git a/pyproject.toml b/pyproject.toml
@@ -15,7 +15,7 @@ packages = ["dummy"] # Required for workspace project
 
 [tool.uv]
 managed = true
-required-version = ">=0.5.0"
+required-version = ">=0.6.0"
 # Currently, all dev dependencies live in the root since uv doesn't have transitive dev dependencies.
 #  See: https://github.com/astral-sh/uv/issues/7541
 dev-dependencies = [
diff --git a/pyvortex/src/compress.rs b/pyvortex/src/compress.rs
@@ -1,5 +1,5 @@
 use pyo3::prelude::*;
-use vortex::sampling_compressor::SamplingCompressor;
+use vortex::compressor::BtrBlocksCompressor;
 
 use crate::arrays::PyArray;
 use crate::install_module;
@@ -47,9 +47,6 @@ pub(crate) fn init(py: Python, parent: &Bound<PyModule>) -> PyResult<()> {
 ///    'vortex.alp(0x11)(f64?, len=1000)'
 #[pyfunction]
 pub fn compress<'py>(array: &'py Bound<'py, PyArray>) -> PyResult<Bound<'py, PyArray>> {
-    let compressor = SamplingCompressor::default();
-    let inner = compressor
-        .compress(array.borrow().as_ref(), None)?
-        .into_array();
-    PyArray::init(array.py(), inner)
+    let compressed = BtrBlocksCompressor.compress(array.borrow().as_ref())?;
+    PyArray::init(array.py(), compressed)
 }
diff --git a/uv.lock b/uv.lock
diff --git a/vortex-btrblocks/src/float.rs b/vortex-btrblocks/src/float.rs
@@ -14,6 +14,7 @@ use vortex_runend::compress::runend_encode;
 use self::stats::FloatStats;
 use crate::float::dictionary::dictionary_encode;
 use crate::integer::{IntCompressor, IntegerStats};
+use crate::patches::compress_patches;
 use crate::{
     Compressor, CompressorStats, GenerateStatsOptions, Scheme,
     estimate_compression_ratio_with_sampling, integer,
@@ -184,14 +185,6 @@ impl Scheme for ALPScheme {
             return Ok(0.0);
         }
 
-        // If Dict/RLE is feasible, we want to do that before ALP, and then only ALP encode
-        // the values.
-        if stats.average_run_length >= RUN_END_THRESHOLD
-            || stats.distinct_values_count < stats.value_count / 2
-        {
-            return Ok(0.0);
-        }
-
         estimate_compression_ratio_with_sampling(
             self,
             stats,
@@ -225,10 +218,9 @@ impl Scheme for ALPScheme {
         let compressed_alp_ints =
             IntCompressor::compress(&alp_ints, is_sample, allowed_cascading - 1, &int_excludes)?;
 
-        Ok(
-            ALPArray::try_new(compressed_alp_ints, alp.exponents(), alp.patches().cloned())?
-                .into_array(),
-        )
+        let patches = alp.patches().map(compress_patches).transpose()?;
+
+        Ok(ALPArray::try_new(compressed_alp_ints, alp.exponents(), patches)?.into_array())
     }
 }
 
@@ -273,7 +265,15 @@ impl Scheme for ALPRDScheme {
             ptype => vortex_panic!("cannot ALPRD compress ptype {ptype}"),
         };
 
-        Ok(encoder.encode(stats.source()).into_array())
+        let mut alp_rd = encoder.encode(stats.source());
+
+        let patches = alp_rd
+            .left_parts_patches()
+            .map(compress_patches)
+            .transpose()?;
+        alp_rd.replace_left_parts_patches(patches);
+
+        Ok(alp_rd.into_array())
     }
 }
 
diff --git a/vortex-btrblocks/src/integer.rs b/vortex-btrblocks/src/integer.rs
@@ -26,6 +26,7 @@ use vortex_zigzag::{ZigZagArray, zigzag_encode};
 
 use crate::downscale::downscale_integer_array;
 use crate::integer::dictionary::dictionary_encode;
+use crate::patches::compress_patches;
 use crate::{
     Compressor, CompressorStats, GenerateStatsOptions, Scheme,
     estimate_compression_ratio_with_sampling,
@@ -406,7 +407,11 @@ impl Scheme for BitPackingScheme {
         if bw as usize == stats.source().ptype().bit_width() {
             return Ok(stats.source().clone().into_array());
         }
-        let packed = bitpack_encode(stats.source(), bw)?;
+        let mut packed = bitpack_encode(stats.source(), bw)?;
+
+        let patches = packed.patches().map(compress_patches).transpose()?;
+        packed.replace_patches(patches);
+
         Ok(packed.into_array())
     }
 }
diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs
@@ -17,6 +17,7 @@ pub use crate::temporal::compress_temporal;
 mod downscale;
 mod float;
 pub mod integer;
+mod patches;
 mod sample;
 mod string;
 mod temporal;
diff --git a/vortex-btrblocks/src/patches.rs b/vortex-btrblocks/src/patches.rs
@@ -0,0 +1,32 @@
+use vortex_array::Array;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::compute::scalar_at;
+use vortex_array::patches::Patches;
+use vortex_error::VortexResult;
+
+use crate::downscale::downscale_integer_array;
+
+/// Rebuilds `patches` with downscaled indices and constant-encoded values when all are equal.
+pub fn compress_patches(patches: &Patches) -> VortexResult<Patches> {
+    // Pass a clone of the indices through `downscale_integer_array`; errors propagate via `?`.
+    let indices = downscale_integer_array(patches.indices().clone())?;
+
+    // Constant values collapse to a `ConstantArray` of the first scalar; otherwise clone as-is.
+    let values = patches.values();
+    let values = if values
+        .statistics()
+        .compute_is_constant()
+        .unwrap_or_default()
+    {
+        ConstantArray::new(scalar_at(values, 0)?, values.len()).into_array()
+    } else {
+        values.clone()
+    };
+
+    Ok(Patches::new(
+        patches.array_len(),
+        patches.offset(),
+        indices,
+        values,
+    ))
+}
diff --git a/vortex/Cargo.toml b/vortex/Cargo.toml
@@ -24,6 +24,7 @@ workspace = true
 [dependencies]
 vortex-alp = { workspace = true }
 vortex-array = { workspace = true }
+vortex-btrblocks = { workspace = true }
 vortex-buffer = { workspace = true }
 vortex-bytebool = { workspace = true }
 vortex-datetime-dtype = { workspace = true }
diff --git a/vortex/src/lib.rs b/vortex/src/lib.rs
@@ -2,11 +2,11 @@
 
 pub use vortex_array::*;
 pub use {
-    vortex_buffer as buffer, vortex_datetime_dtype as datetime_dtype, vortex_dtype as dtype,
-    vortex_error as error, vortex_expr as expr, vortex_file as file,
-    vortex_flatbuffers as flatbuffers, vortex_io as io, vortex_ipc as ipc, vortex_layout as layout,
-    vortex_mask as mask, vortex_proto as proto, vortex_sampling_compressor as sampling_compressor,
-    vortex_scalar as scalar,
+    vortex_btrblocks as compressor, vortex_buffer as buffer,
+    vortex_datetime_dtype as datetime_dtype, vortex_dtype as dtype, vortex_error as error,
+    vortex_expr as expr, vortex_file as file, vortex_flatbuffers as flatbuffers, vortex_io as io,
+    vortex_ipc as ipc, vortex_layout as layout, vortex_mask as mask, vortex_proto as proto,
+    vortex_sampling_compressor as sampling_compressor, vortex_scalar as scalar,
 };
 
 pub mod encodings {

Original file line number	Diff line number	Diff line change
`@@ -117,6 +117,11 @@ impl ALPRDArray {`
`117`	`117`	`&self.right_parts`
`118`	`118`	`}`
`119`	`119`
	`120`	`+ #[inline]`
	`121`	`+ pub fn right_bit_width(&self) -> u8 {`
	`122`	`+ self.right_bit_width`
	`123`	`+ }`
	`124`	`+`
`120`	`125`	`/// Patches of left-most bits.`
`121`	`126`	`pub fn left_parts_patches(&self) -> Option<&Patches> {`
`122`	`127`	`self.left_parts_patches.as_ref()`
`@@ -128,9 +133,8 @@ impl ALPRDArray {`
`128`	`133`	`&self.left_parts_dictionary`
`129`	`134`	`}`
`130`	`135`
`131`		`- #[inline]`
`132`		`- pub(crate) fn right_bit_width(&self) -> u8 {`
`133`		`- self.right_bit_width`
	`136`	`+ pub fn replace_left_parts_patches(&mut self, patches: Option<Patches>) {`
	`137`	`+ self.left_parts_patches = patches;`
`134`	`138`	`}`
`135`	`139`	`}`
`136`	`140`
Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,7 @@ use vortex_zigzag::{ZigZagArray, zigzag_encode};`
`26`	`26`
`27`	`27`	`use crate::downscale::downscale_integer_array;`
`28`	`28`	`use crate::integer::dictionary::dictionary_encode;`
	`29`	`+use crate::patches::compress_patches;`
`29`	`30`	`use crate::{`
`30`	`31`	`Compressor, CompressorStats, GenerateStatsOptions, Scheme,`
`31`	`32`	`estimate_compression_ratio_with_sampling,`
`@@ -406,7 +407,11 @@ impl Scheme for BitPackingScheme {`
`406`	`407`	`if bw as usize == stats.source().ptype().bit_width() {`
`407`	`408`	`return Ok(stats.source().clone().into_array());`
`408`	`409`	`}`
`409`		`- let packed = bitpack_encode(stats.source(), bw)?;`
	`410`	`+ let mut packed = bitpack_encode(stats.source(), bw)?;`
	`411`	`+`
	`412`	`+ let patches = packed.patches().map(compress_patches).transpose()?;`
	`413`	`+ packed.replace_patches(patches);`
	`414`	`+`
`410`	`415`	`Ok(packed.into_array())`
`411`	`416`	`}`
`412`	`417`	`}`