From 32b4d6049faf7c7dfec6c895ca080bdfd69b4861 Mon Sep 17 00:00:00 2001 From: lemolatoon <63438515+lemolatoon@users.noreply.github.com> Date: Tue, 20 Aug 2024 19:11:40 -0500 Subject: [PATCH 01/11] Impl RoaringBitmap::from_bitmap_bytes --- roaring/src/bitmap/serialization.rs | 177 +++++++++++++++++++++++++++- 1 file changed, 176 insertions(+), 1 deletion(-) diff --git a/roaring/src/bitmap/serialization.rs b/roaring/src/bitmap/serialization.rs index f04f5039..c0a8caa5 100644 --- a/roaring/src/bitmap/serialization.rs +++ b/roaring/src/bitmap/serialization.rs @@ -7,6 +7,7 @@ use std::io; use crate::bitmap::container::{Container, ARRAY_LIMIT}; use crate::bitmap::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH}; +use crate::bitmap::util; use crate::RoaringBitmap; pub const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; @@ -47,6 +48,143 @@ impl RoaringBitmap { 8 + container_sizes } + /// Creates a `RoaringBitmap` from a byte slice, interpreting the bytes as a bitmap with a specified offset. + /// + /// # Arguments + /// + /// - `offset: u32` - The starting position in the bitmap where the byte slice will be applied, specified in bits. + /// This means that if `offset` is `n`, the first byte in the slice will correspond to the `n`th bit(0-indexed) in the bitmap. + /// Must be a multiple of 8. + /// - `bytes: &[u8]` - The byte slice containing the bitmap data. The bytes are interpreted in little-endian order. + /// + /// # Interpretation of `bytes` + /// + /// The `bytes` slice is interpreted in little-endian order. Each byte is read from least significant bit (LSB) to most significant bit (MSB). + /// For example, the byte `0b00000101` represents the bits `1, 0, 1, 0, 0, 0, 0, 0` in that order (see Examples section). + /// + /// # Panics + /// + /// This function will panic if `offset` is not a multiple of 8. + /// + /// + /// # Examples + /// + /// ```rust + /// use roaring::RoaringBitmap; + /// + /// let bytes = [0b00000101, 0b00000010, 0b00000000, 0b10000000]; + /// // ^^^^^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^^^^^ + /// // 76543210 98 + /// let rb = RoaringBitmap::from_bitmap_bytes(0, &bytes); + /// assert!(rb.contains(0)); + /// assert!(!rb.contains(1)); + /// assert!(rb.contains(2)); + /// assert!(rb.contains(9)); + /// assert!(rb.contains(31)); + /// + /// let rb = RoaringBitmap::from_bitmap_bytes(8, &bytes); + /// assert!(rb.contains(8)); + /// assert!(!rb.contains(9)); + /// assert!(rb.contains(10)); + /// assert!(rb.contains(17)); + /// assert!(rb.contains(39)); + /// ``` + #[inline] + pub fn from_bitmap_bytes(offset: u32, bytes: &[u8]) -> RoaringBitmap { + #[inline(always)] + fn next_multiple_of_u32(n: u32, multiple: u32) -> u32 { + (n + multiple - 1) / multiple * multiple + } + #[inline(always)] + fn next_multiple_of_usize(n: usize, multiple: usize) -> usize { + (n + multiple - 1) / multiple * multiple + } + + const BITS_IN_ONE_CONTAINER: usize = u64::BITS as usize * BITMAP_LENGTH; + const BYTES_IN_ONE_CONTAINER: usize = BITS_IN_ONE_CONTAINER / 8; + assert_eq!(offset % 8, 0, "offset must be a multiple of 8"); + let byte_offset = offset as usize / 8; + let n_containers_needed = + (bytes.len() + (BYTES_IN_ONE_CONTAINER - 1)) / BYTES_IN_ONE_CONTAINER + 1; + let mut containers = Vec::with_capacity(n_containers_needed); + + let (offset, bytes) = if byte_offset % BYTES_IN_ONE_CONTAINER == 0 { + (offset, bytes) + } else { + let next_container_byte_offset = + next_multiple_of_usize(byte_offset, BYTES_IN_ONE_CONTAINER); + + let number_of_bytes_in_first_container = next_container_byte_offset - byte_offset; + let number_of_bytes_copied_to_first_container = + bytes.len().min(number_of_bytes_in_first_container); + + let (first_container_bytes, bytes_left) = + bytes.split_at(number_of_bytes_copied_to_first_container); + let (first_container_key, _) = util::split(offset); + + let len: u64 = first_container_bytes.iter().map(|&b| b.count_ones() as u64).sum(); + if len != 0 { + let mut bits: Box<[u64; BITMAP_LENGTH]> = Box::new([0; BITMAP_LENGTH]); + // Safety: + // * `first_container_bytes` is a slice of `bytes` and is guaranteed to be smaller than `BITMAP_LENGTH` u64s(= `BYTES_IN_ONE_CONTAINER` u8s) + // * `count` argument of `add` never equals to `BYTE_IN_ONE_CONTAINER` because at least one byte will be copied to the first container + // This is guaranteed by the condition `byte_offset % BYTES_IN_ONE_CONTAINER != 0` + let bits_ptr = unsafe { + bits.as_mut_ptr() + .cast::() + .add(BYTES_IN_ONE_CONTAINER - number_of_bytes_in_first_container) + }; + + debug_assert!(first_container_bytes.len() <= number_of_bytes_in_first_container); + // Safety: + // * `first_container_bytes` is a slice of `bytes` and is guaranteed to be smaller than or equal to `number_of_bytes_in_first_container` + unsafe { + core::ptr::copy_nonoverlapping( + first_container_bytes.as_ptr(), + bits_ptr, + first_container_bytes.len(), + ) + }; + + let store = BitmapStore::from_unchecked(len, bits); + let mut container = + Container { key: first_container_key, store: Store::Bitmap(store) }; + container.ensure_correct_store(); + + containers.push(container); + } + + (next_multiple_of_u32(offset, BITS_IN_ONE_CONTAINER as u32), bytes_left) + }; + + let bitmap_store_chunks = bytes.chunks(BYTES_IN_ONE_CONTAINER); + + let (offset_key, _) = util::split(offset); + for (i, chunk) in bitmap_store_chunks.enumerate() { + let len: u64 = chunk.iter().map(|&b| b.count_ones() as u64).sum(); + if len == 0 { + continue; + } + let mut bits: Box<[u64; BITMAP_LENGTH]> = Box::new([0; BITMAP_LENGTH]); + let bits_ptr = bits.as_mut_ptr().cast::(); + + debug_assert!(chunk.len() <= BITMAP_LENGTH * size_of::()); + // Safety: + // * `chunk` is a slice of `bytes` and is guaranteed to be smaller than `BITMAP_LENGTH` u64s + unsafe { core::ptr::copy_nonoverlapping(chunk.as_ptr(), bits_ptr, chunk.len()) }; + + let store = BitmapStore::from_unchecked(len, bits); + + let mut container = + Container { key: offset_key + i as u16, store: Store::Bitmap(store) }; + container.ensure_correct_store(); + + containers.push(container); + } + + RoaringBitmap { containers } + } + /// Serialize this bitmap into [the standard Roaring on-disk format][format]. /// This is compatible with the official C/C++, Java and Go implementations. /// @@ -256,7 +394,7 @@ impl RoaringBitmap { #[cfg(test)] mod test { - use crate::RoaringBitmap; + use crate::{bitmap::store::BITMAP_LENGTH, RoaringBitmap}; use proptest::prelude::*; proptest! { @@ -270,6 +408,43 @@ mod test { } } + #[test] + fn test_from_bitmap_bytes() { + const CONTAINER_OFFSET: u32 = u64::BITS * BITMAP_LENGTH as u32; + const CONTAINER_OFFSET_IN_BYTES: u32 = CONTAINER_OFFSET / 8; + let mut bytes = vec![0xff; CONTAINER_OFFSET_IN_BYTES as usize]; + bytes.extend(&[0x00; CONTAINER_OFFSET_IN_BYTES as usize]); + bytes.extend(&[0b00000001, 0b00000010, 0b00000011, 0b00000100]); + + let offset = 32; + let rb = RoaringBitmap::from_bitmap_bytes(offset, &bytes); + for i in 0..offset { + assert!(!rb.contains(i), "{i} should not be in the bitmap"); + } + for i in offset..offset + CONTAINER_OFFSET { + assert!(rb.contains(i), "{i} should be in the bitmap"); + } + for i in offset + CONTAINER_OFFSET..offset + CONTAINER_OFFSET * 2 { + assert!(!rb.contains(i), "{i} should not be in the bitmap"); + } + for bit in [0, 9, 16, 17, 26] { + let i = bit + offset + CONTAINER_OFFSET * 2; + assert!(rb.contains(i), "{i} should be in the bitmap"); + } + + assert_eq!(rb.len(), CONTAINER_OFFSET as u64 + 5); + + // Ensure the empty container is not created + let mut bytes = vec![0x00u8; CONTAINER_OFFSET_IN_BYTES as usize]; + bytes.extend(&[0xff]); + let rb = RoaringBitmap::from_bitmap_bytes(0, &bytes); + + assert_eq!(rb.min(), Some(CONTAINER_OFFSET)); + + let rb = RoaringBitmap::from_bitmap_bytes(8, &bytes); + assert_eq!(rb.min(), Some(CONTAINER_OFFSET + 8)); + } + #[test] fn test_deserialize_overflow_s_plus_len() { let data = vec![59, 48, 0, 0, 255, 130, 254, 59, 48, 2, 0, 41, 255, 255, 166, 197, 4, 0, 2]; From 9834c01a148583c5807eccc58d8c1a6b20343803 Mon Sep 17 00:00:00 2001 From: lemolatoon <63438515+lemolatoon@users.noreply.github.com> Date: Tue, 20 Aug 2024 19:27:20 -0500 Subject: [PATCH 02/11] Add benchmark for from_bitmap_bytes --- benchmarks/benches/lib.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/benchmarks/benches/lib.rs b/benchmarks/benches/lib.rs index b5c92882..c5f360a9 100644 --- a/benchmarks/benches/lib.rs +++ b/benchmarks/benches/lib.rs @@ -149,6 +149,27 @@ fn creation(c: &mut Criterion) { group.throughput(Throughput::Elements(dataset.bitmaps.iter().map(|rb| rb.len()).sum())); + group.bench_function(BenchmarkId::new("from_bitmap_bytes", &dataset.name), |b| { + let bitmap_bytes = dataset_numbers + .iter() + .map(|bitmap_numbers| { + let max_number = *bitmap_numbers.iter().max().unwrap() as usize; + let mut buf = vec![0u8; max_number / 8 + 1]; + for n in bitmap_numbers { + let byte = (n / 8) as usize; + let bit = n % 8; + buf[byte] |= 1 << bit; + } + buf + }) + .collect::>(); + b.iter(|| { + for bitmap_bytes in &bitmap_bytes { + black_box(RoaringBitmap::from_bitmap_bytes(0, bitmap_bytes)); + } + }) + }); + group.bench_function(BenchmarkId::new("from_sorted_iter", &dataset.name), |b| { b.iter(|| { for bitmap_numbers in &dataset_numbers { From 4c4a2ddabea8f12fbb60b24369a0355cf26c1647 Mon Sep 17 00:00:00 2001 From: lemolatoon <63438515+lemolatoon@users.noreply.github.com> Date: Fri, 6 Sep 2024 07:24:12 +0900 Subject: [PATCH 03/11] Handle big endian system in from_bitmap_bytes --- roaring/src/bitmap/serialization.rs | 77 +++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 20 deletions(-) diff --git a/roaring/src/bitmap/serialization.rs b/roaring/src/bitmap/serialization.rs index c0a8caa5..3065ed67 100644 --- a/roaring/src/bitmap/serialization.rs +++ b/roaring/src/bitmap/serialization.rs @@ -99,6 +99,56 @@ impl RoaringBitmap { fn next_multiple_of_usize(n: usize, multiple: usize) -> usize { (n + multiple - 1) / multiple * multiple } + /// Copies bits from `src` to `dst` at `bits_offset` bits offset. + /// Safety: `src` must be smaller than or equal to `BYTES_IN_ONE_CONTAINER` u8s, + /// considering `byte_offset`. + #[inline(always)] + #[cfg(target_endian = "little")] + unsafe fn copy_bits(src: &[u8], dst: &mut [u64; BITMAP_LENGTH], byte_offset: usize) { + debug_assert!(src.len() + byte_offset <= BYTES_IN_ONE_CONTAINER); + + // Safety: + // * `byte_offset` is within the bounds of `dst`, guaranteed by the caller. + let bits_ptr = unsafe { dst.as_mut_ptr().cast::().add(byte_offset) }; + // Safety: + // * `src` is a slice of `bytes` and is guaranteed to be smaller than or equal to `BYTES_IN_ONE_CONTAINER` u8s considering `byte_offset`, + // guaranteed by the caller. + unsafe { + core::ptr::copy_nonoverlapping(src.as_ptr(), bits_ptr, src.len()); + } + } + /// Copies bits from `src` to `dst` at `bits_offset` bits offset. + /// Safety: `src` must be smaller than or equal to `BYTES_IN_ONE_CONTAINER` u8s, + /// considering `byte_offset`. + #[inline(always)] + #[cfg(target_endian = "big")] + unsafe fn copy_bits(src: &[u8], dst: &mut [u64; BITMAP_LENGTH], byte_offset: usize) { + debug_assert!(src.len() + byte_offset <= BYTES_IN_ONE_CONTAINER); + + if byte_offset % 8 != 0 { + let mut bytes = [0u8; 8]; + + let src_range = 0..(8 - byte_offset % 8).min(src.len()); + bytes[8 - src_range.len()..8].copy_from_slice(&src[src_range]); + dst[byte_offset / 8] = u64::from_le_bytes(bytes); + } + + let aligned_u64_offset = (byte_offset + 7) / 8; + + // Iterate over the src data and copy it to dst as little-endian u64 values + for i in aligned_u64_offset..((byte_offset + src.len() + 7) / 8) { + let mut bytes = [0u8; 8]; + + let src_range = + (i - aligned_u64_offset) * 8..((i - aligned_u64_offset + 1) * 8).min(src.len()); + // println!("src_range: {:?}", src_range); + bytes[0..src_range.len()].copy_from_slice(&src[src_range]); + // println!("bytes: {:x?}", &bytes); + + // Write the updated u64 value back to dst + dst[i] = u64::from_le_bytes(bytes); + } + } const BITS_IN_ONE_CONTAINER: usize = u64::BITS as usize * BITMAP_LENGTH; const BYTES_IN_ONE_CONTAINER: usize = BITS_IN_ONE_CONTAINER / 8; @@ -126,23 +176,12 @@ impl RoaringBitmap { if len != 0 { let mut bits: Box<[u64; BITMAP_LENGTH]> = Box::new([0; BITMAP_LENGTH]); // Safety: - // * `first_container_bytes` is a slice of `bytes` and is guaranteed to be smaller than `BITMAP_LENGTH` u64s(= `BYTES_IN_ONE_CONTAINER` u8s) - // * `count` argument of `add` never equals to `BYTE_IN_ONE_CONTAINER` because at least one byte will be copied to the first container - // This is guaranteed by the condition `byte_offset % BYTES_IN_ONE_CONTAINER != 0` - let bits_ptr = unsafe { - bits.as_mut_ptr() - .cast::() - .add(BYTES_IN_ONE_CONTAINER - number_of_bytes_in_first_container) - }; - - debug_assert!(first_container_bytes.len() <= number_of_bytes_in_first_container); - // Safety: // * `first_container_bytes` is a slice of `bytes` and is guaranteed to be smaller than or equal to `number_of_bytes_in_first_container` unsafe { - core::ptr::copy_nonoverlapping( - first_container_bytes.as_ptr(), - bits_ptr, - first_container_bytes.len(), + copy_bits( + first_container_bytes, + bits.as_mut(), + BYTES_IN_ONE_CONTAINER - number_of_bytes_in_first_container, ) }; @@ -166,13 +205,11 @@ impl RoaringBitmap { continue; } let mut bits: Box<[u64; BITMAP_LENGTH]> = Box::new([0; BITMAP_LENGTH]); - let bits_ptr = bits.as_mut_ptr().cast::(); - - debug_assert!(chunk.len() <= BITMAP_LENGTH * size_of::()); // Safety: // * `chunk` is a slice of `bytes` and is guaranteed to be smaller than `BITMAP_LENGTH` u64s - unsafe { core::ptr::copy_nonoverlapping(chunk.as_ptr(), bits_ptr, chunk.len()) }; - + unsafe { + copy_bits(chunk, bits.as_mut(), 0); + } let store = BitmapStore::from_unchecked(len, bits); let mut container = From b15c11b26a6a9427361c90c83ceabb21f59ccde3 Mon Sep 17 00:00:00 2001 From: lemolatoon <63438515+lemolatoon@users.noreply.github.com> Date: Fri, 6 Sep 2024 07:33:11 +0900 Subject: [PATCH 04/11] Add big endian test to CI --- .github/workflows/test.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 73958355..74c8b0ee 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -70,6 +70,20 @@ jobs: command: test args: --features serde + - name: miri setup + if: matrix.rust == 'nightly' + uses: actions-rs/cargo@v1 + with: + command: miri + args: setup + + - name: Test bit endian + if: matrix.rust == 'nightly' + uses: actions-rs/cargo@v1 + with: + command: miri + args: test --target s390x-unknown-linux-gnu --package roaring --lib -- bitmap::serialization::test::test_from_bitmap_bytes + - name: Test no default features uses: actions-rs/cargo@v1 with: From 0758f8af272432115c414dd4fcdcbbb6f1b1af03 Mon Sep 17 00:00:00 2001 From: lemolatoon <63438515+lemolatoon@users.noreply.github.com> Date: Fri, 6 Sep 2024 07:38:47 +0900 Subject: [PATCH 05/11] Update description of bit order in from_bitmap_bytes --- roaring/src/bitmap/serialization.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/roaring/src/bitmap/serialization.rs b/roaring/src/bitmap/serialization.rs index 3065ed67..7fb15f3a 100644 --- a/roaring/src/bitmap/serialization.rs +++ b/roaring/src/bitmap/serialization.rs @@ -55,13 +55,14 @@ impl RoaringBitmap { /// - `offset: u32` - The starting position in the bitmap where the byte slice will be applied, specified in bits. /// This means that if `offset` is `n`, the first byte in the slice will correspond to the `n`th bit(0-indexed) in the bitmap. /// Must be a multiple of 8. - /// - `bytes: &[u8]` - The byte slice containing the bitmap data. The bytes are interpreted in little-endian order. + /// - `bytes: &[u8]` - The byte slice containing the bitmap data. The bytes are interpreted in "Least-Significant-First" bit order. /// /// # Interpretation of `bytes` /// - /// The `bytes` slice is interpreted in little-endian order. Each byte is read from least significant bit (LSB) to most significant bit (MSB). + /// The `bytes` slice is interpreted in "Least-Significant-First" bit order. Each byte is read from least significant bit (LSB) to most significant bit (MSB). /// For example, the byte `0b00000101` represents the bits `1, 0, 1, 0, 0, 0, 0, 0` in that order (see Examples section). /// + /// /// # Panics /// /// This function will panic if `offset` is not a multiple of 8. From e9a215c1410911711ef04db0af755dfe305a9fd3 Mon Sep 17 00:00:00 2001 From: Zachary Dremann Date: Fri, 6 Sep 2024 11:06:14 -0400 Subject: [PATCH 06/11] Speed up from_bitmap_bytes, and use less unsafe * Directly create an array/bitmap store based on the count of bits * Use u64 words to count bits and enumerate bits * Store in little endian, then just fixup any touched words * Only use unsafe to reinterpret a whole bitmap container as bytes, everything else is safe * Allow adding bytes up to the last possible offset --- roaring/src/bitmap/container.rs | 7 + roaring/src/bitmap/serialization.rs | 197 ++++++++------------ roaring/src/bitmap/store/array_store/mod.rs | 28 +++ roaring/src/bitmap/store/bitmap_store.rs | 28 +++ roaring/src/bitmap/store/mod.rs | 29 +++ 5 files changed, 166 insertions(+), 123 deletions(-) diff --git a/roaring/src/bitmap/container.rs b/roaring/src/bitmap/container.rs index 9b238866..dc8eeda6 100644 --- a/roaring/src/bitmap/container.rs +++ b/roaring/src/bitmap/container.rs @@ -30,6 +30,13 @@ impl Container { pub fn full(key: u16) -> Container { Container { key, store: Store::full() } } + + pub fn from_lsb0_bytes(key: u16, bytes: &[u8], byte_offset: usize) -> Option { + Some(Container { + key, + store: Store::from_lsb0_bytes(bytes, byte_offset)?, + }) + } } impl Container { diff --git a/roaring/src/bitmap/serialization.rs b/roaring/src/bitmap/serialization.rs index 7fb15f3a..f38f89f6 100644 --- a/roaring/src/bitmap/serialization.rs +++ b/roaring/src/bitmap/serialization.rs @@ -1,15 +1,14 @@ +use crate::bitmap::container::{Container, ARRAY_LIMIT}; +use crate::bitmap::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH}; +use crate::RoaringBitmap; use bytemuck::cast_slice_mut; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use core::convert::Infallible; use core::ops::RangeInclusive; +use core::mem::size_of; use std::error::Error; use std::io; -use crate::bitmap::container::{Container, ARRAY_LIMIT}; -use crate::bitmap::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH}; -use crate::bitmap::util; -use crate::RoaringBitmap; - pub const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; pub const SERIAL_COOKIE: u16 = 12347; pub const NO_OFFSET_THRESHOLD: usize = 4; @@ -65,7 +64,7 @@ impl RoaringBitmap { /// /// # Panics /// - /// This function will panic if `offset` is not a multiple of 8. + /// This function will panic if `offset` is not a multiple of 8, or if `bytes.len() + offset` is greater than 2^32. /// /// /// # Examples @@ -90,134 +89,66 @@ impl RoaringBitmap { /// assert!(rb.contains(17)); /// assert!(rb.contains(39)); /// ``` - #[inline] - pub fn from_bitmap_bytes(offset: u32, bytes: &[u8]) -> RoaringBitmap { - #[inline(always)] - fn next_multiple_of_u32(n: u32, multiple: u32) -> u32 { - (n + multiple - 1) / multiple * multiple - } - #[inline(always)] - fn next_multiple_of_usize(n: usize, multiple: usize) -> usize { - (n + multiple - 1) / multiple * multiple - } - /// Copies bits from `src` to `dst` at `bits_offset` bits offset. - /// Safety: `src` must be smaller than or equal to `BYTES_IN_ONE_CONTAINER` u8s, - /// considering `byte_offset`. - #[inline(always)] - #[cfg(target_endian = "little")] - unsafe fn copy_bits(src: &[u8], dst: &mut [u64; BITMAP_LENGTH], byte_offset: usize) { - debug_assert!(src.len() + byte_offset <= BYTES_IN_ONE_CONTAINER); - - // Safety: - // * `byte_offset` is within the bounds of `dst`, guaranteed by the caller. - let bits_ptr = unsafe { dst.as_mut_ptr().cast::().add(byte_offset) }; - // Safety: - // * `src` is a slice of `bytes` and is guaranteed to be smaller than or equal to `BYTES_IN_ONE_CONTAINER` u8s considering `byte_offset`, - // guaranteed by the caller. - unsafe { - core::ptr::copy_nonoverlapping(src.as_ptr(), bits_ptr, src.len()); - } - } - /// Copies bits from `src` to `dst` at `bits_offset` bits offset. - /// Safety: `src` must be smaller than or equal to `BYTES_IN_ONE_CONTAINER` u8s, - /// considering `byte_offset`. - #[inline(always)] - #[cfg(target_endian = "big")] - unsafe fn copy_bits(src: &[u8], dst: &mut [u64; BITMAP_LENGTH], byte_offset: usize) { - debug_assert!(src.len() + byte_offset <= BYTES_IN_ONE_CONTAINER); - - if byte_offset % 8 != 0 { - let mut bytes = [0u8; 8]; - - let src_range = 0..(8 - byte_offset % 8).min(src.len()); - bytes[8 - src_range.len()..8].copy_from_slice(&src[src_range]); - dst[byte_offset / 8] = u64::from_le_bytes(bytes); - } - - let aligned_u64_offset = (byte_offset + 7) / 8; - - // Iterate over the src data and copy it to dst as little-endian u64 values - for i in aligned_u64_offset..((byte_offset + src.len() + 7) / 8) { - let mut bytes = [0u8; 8]; - - let src_range = - (i - aligned_u64_offset) * 8..((i - aligned_u64_offset + 1) * 8).min(src.len()); - // println!("src_range: {:?}", src_range); - bytes[0..src_range.len()].copy_from_slice(&src[src_range]); - // println!("bytes: {:x?}", &bytes); + pub fn from_bitmap_bytes(offset: u32, mut bytes: &[u8]) -> RoaringBitmap { + assert_eq!(offset % 8, 0, "offset must be a multiple of 8"); - // Write the updated u64 value back to dst - dst[i] = u64::from_le_bytes(bytes); - } + if bytes.is_empty() { + return RoaringBitmap::new(); } - const BITS_IN_ONE_CONTAINER: usize = u64::BITS as usize * BITMAP_LENGTH; - const BYTES_IN_ONE_CONTAINER: usize = BITS_IN_ONE_CONTAINER / 8; - assert_eq!(offset % 8, 0, "offset must be a multiple of 8"); - let byte_offset = offset as usize / 8; - let n_containers_needed = - (bytes.len() + (BYTES_IN_ONE_CONTAINER - 1)) / BYTES_IN_ONE_CONTAINER + 1; + // Using inclusive range avoids overflow: the max exclusive value is 2^32 (u32::MAX + 1). + let end_bit_inc = u32::try_from(bytes.len()) + .ok() + .and_then(|len_bytes| len_bytes.checked_mul(8)) + // `bytes` is non-empty, so len_bits is > 0 + .and_then(|len_bits| offset.checked_add(len_bits - 1)) + .expect("offset + bytes.len() must be <= 2^32"); + + // offsets are in bytes + let (mut start_container, start_offset) = + (offset as usize >> 16, (offset as usize % 0x1_0000) / 8); + let (end_container_inc, end_offset) = + (end_bit_inc as usize >> 16, (end_bit_inc as usize % 0x1_0000 + 1) / 8); + + let n_containers_needed = end_container_inc + 1 - start_container; let mut containers = Vec::with_capacity(n_containers_needed); - let (offset, bytes) = if byte_offset % BYTES_IN_ONE_CONTAINER == 0 { - (offset, bytes) - } else { - let next_container_byte_offset = - next_multiple_of_usize(byte_offset, BYTES_IN_ONE_CONTAINER); - - let number_of_bytes_in_first_container = next_container_byte_offset - byte_offset; - let number_of_bytes_copied_to_first_container = - bytes.len().min(number_of_bytes_in_first_container); - - let (first_container_bytes, bytes_left) = - bytes.split_at(number_of_bytes_copied_to_first_container); - let (first_container_key, _) = util::split(offset); - - let len: u64 = first_container_bytes.iter().map(|&b| b.count_ones() as u64).sum(); - if len != 0 { - let mut bits: Box<[u64; BITMAP_LENGTH]> = Box::new([0; BITMAP_LENGTH]); - // Safety: - // * `first_container_bytes` is a slice of `bytes` and is guaranteed to be smaller than or equal to `number_of_bytes_in_first_container` - unsafe { - copy_bits( - first_container_bytes, - bits.as_mut(), - BYTES_IN_ONE_CONTAINER - number_of_bytes_in_first_container, - ) - }; - - let store = BitmapStore::from_unchecked(len, bits); - let mut container = - Container { key: first_container_key, store: Store::Bitmap(store) }; - container.ensure_correct_store(); + // Handle a partial first container + if start_offset != 0 { + let end_byte = if end_container_inc == start_container { + end_offset + } else { + BITMAP_LENGTH * size_of::() + }; + let (src, rest) = bytes.split_at(end_byte - start_offset); + bytes = rest; + + if let Some(container) = + Container::from_lsb0_bytes(start_container as u16, src, start_offset) + { containers.push(container); } - (next_multiple_of_u32(offset, BITS_IN_ONE_CONTAINER as u32), bytes_left) - }; - - let bitmap_store_chunks = bytes.chunks(BYTES_IN_ONE_CONTAINER); + start_container += 1; + } + + // Handle all full containers + for full_container_key in start_container..end_container_inc { + let (src, rest) = bytes.split_at(BITMAP_LENGTH * size_of::()); + bytes = rest; - let (offset_key, _) = util::split(offset); - for (i, chunk) in bitmap_store_chunks.enumerate() { - let len: u64 = chunk.iter().map(|&b| b.count_ones() as u64).sum(); - if len == 0 { - continue; - } - let mut bits: Box<[u64; BITMAP_LENGTH]> = Box::new([0; BITMAP_LENGTH]); - // Safety: - // * `chunk` is a slice of `bytes` and is guaranteed to be smaller than `BITMAP_LENGTH` u64s - unsafe { - copy_bits(chunk, bits.as_mut(), 0); + if let Some(container) = Container::from_lsb0_bytes(full_container_key as u16, src, 0) { + containers.push(container); } - let store = BitmapStore::from_unchecked(len, bits); - - let mut container = - Container { key: offset_key + i as u16, store: Store::Bitmap(store) }; - container.ensure_correct_store(); + } - containers.push(container); + // Handle a last container + if !bytes.is_empty() { + if let Some(container) = Container::from_lsb0_bytes(end_container_inc as u16, bytes, 0) + { + containers.push(container); + } } RoaringBitmap { containers } @@ -476,11 +407,31 @@ mod test { let mut bytes = vec![0x00u8; CONTAINER_OFFSET_IN_BYTES as usize]; bytes.extend(&[0xff]); let rb = RoaringBitmap::from_bitmap_bytes(0, &bytes); - assert_eq!(rb.min(), Some(CONTAINER_OFFSET)); let rb = RoaringBitmap::from_bitmap_bytes(8, &bytes); assert_eq!(rb.min(), Some(CONTAINER_OFFSET + 8)); + + + // Ensure we can set the last byte + let bytes = [0x80]; + let rb = RoaringBitmap::from_bitmap_bytes(0xFFFFFFF8, &bytes); + assert_eq!(rb.len(), 1); + assert!(rb.contains(u32::MAX)); + } + + #[test] + #[should_panic(expected = "multiple of 8")] + fn test_from_bitmap_bytes_invalid_offset() { + let bytes = [0x01]; + RoaringBitmap::from_bitmap_bytes(1, &bytes); + } + + #[test] + #[should_panic(expected = "<= 2^32")] + fn test_from_bitmap_bytes_overflow() { + let bytes = [0x01, 0x01]; + RoaringBitmap::from_bitmap_bytes(u32::MAX - 7, &bytes); } #[test] diff --git a/roaring/src/bitmap/store/array_store/mod.rs b/roaring/src/bitmap/store/array_store/mod.rs index 883db31f..f3a713bd 100644 --- a/roaring/src/bitmap/store/array_store/mod.rs +++ b/roaring/src/bitmap/store/array_store/mod.rs @@ -7,6 +7,7 @@ use core::cmp::Ordering; use core::cmp::Ordering::*; use core::fmt::{Display, Formatter}; use core::ops::{BitAnd, BitAndAssign, BitOr, BitXor, RangeInclusive, Sub, SubAssign}; +use core::mem::size_of; #[cfg(not(feature = "std"))] use alloc::vec::Vec; @@ -47,6 +48,33 @@ impl ArrayStore { } } + pub fn from_lsb0_bytes(bytes: &[u8], byte_offset: usize, bits_set: u64) -> Self { + type Word = u64; + + let mut vec = Vec::with_capacity(bits_set as usize); + + let chunks = bytes.chunks_exact(size_of::()); + let remainder = chunks.remainder(); + for (index, chunk) in chunks.enumerate() { + let bit_index = (byte_offset + index * size_of::()) * 8; + let mut word = Word::from_le_bytes(chunk.try_into().unwrap()); + + while word != 0 { + vec.push((word.trailing_zeros() + bit_index as u32) as u16); + word &= word - 1; + } + } + for (index, mut byte) in remainder.iter().copied().enumerate() { + let bit_index = (byte_offset + (bytes.len() - remainder.len()) + index) * 8; + while byte != 0 { + vec.push((byte.trailing_zeros() + bit_index as u32) as u16); + byte &= byte - 1; + } + } + + Self::from_vec_unchecked(vec) + } + pub fn insert(&mut self, index: u16) -> bool { self.vec.binary_search(&index).map_err(|loc| self.vec.insert(loc, index)).is_err() } diff --git a/roaring/src/bitmap/store/bitmap_store.rs b/roaring/src/bitmap/store/bitmap_store.rs index 4b89e0e0..daff3776 100644 --- a/roaring/src/bitmap/store/bitmap_store.rs +++ b/roaring/src/bitmap/store/bitmap_store.rs @@ -1,5 +1,6 @@ use core::borrow::Borrow; use core::fmt::{Display, Formatter}; +use core::mem::size_of; use core::ops::{BitAndAssign, BitOrAssign, BitXorAssign, RangeInclusive, SubAssign}; use super::ArrayStore; @@ -35,6 +36,33 @@ impl BitmapStore { } } + pub fn from_lsb0_bytes_unchecked(bytes: &[u8], byte_offset: usize, bits_set: u64) -> Self { + assert!(byte_offset + bytes.len() <= BITMAP_LENGTH * size_of::()); + + let mut bits = Box::new([0u64; BITMAP_LENGTH]); + // Safety: It's safe to reinterpret u64s as u8s because u8 has less alignment requirements, + // and has no padding/uninitialized data. + let dst = unsafe { + std::slice::from_raw_parts_mut( + bits.as_mut_ptr().cast::(), + BITMAP_LENGTH * size_of::(), + ) + }; + let dst = &mut dst[byte_offset..][..bytes.len()]; + dst.copy_from_slice(bytes); + + let start_word = byte_offset / size_of::(); + let end_word = (byte_offset + bytes.len() + (size_of::() - 1)) / size_of::(); + + // The 0th byte is the least significant byte, so we've written the bytes in little-endian + // order, convert to native endian. Expect this to get optimized away for little-endian. + for word in &mut bits[start_word..end_word] { + *word = u64::from_le(*word); + } + + Self::from_unchecked(bits_set, bits) + } + /// /// Create a new BitmapStore from a given len and bits array /// It is up to the caller to ensure len == cardinality of bits diff --git a/roaring/src/bitmap/store/mod.rs b/roaring/src/bitmap/store/mod.rs index 625b8137..ce7acde9 100644 --- a/roaring/src/bitmap/store/mod.rs +++ b/roaring/src/bitmap/store/mod.rs @@ -49,6 +49,35 @@ impl Store { Store::Bitmap(BitmapStore::full()) } + pub fn from_lsb0_bytes(bytes: &[u8], byte_offset: usize) -> Option { + assert!(byte_offset + bytes.len() <= BITMAP_LENGTH * mem::size_of::()); + + // It seems to be pretty considerably faster to count the bits + // using u64s than for each byte + let bits_set = { + let mut bits_set = 0; + let chunks = bytes.chunks_exact(mem::size_of::()); + let remainder = chunks.remainder(); + for chunk in chunks { + let chunk = u64::from_ne_bytes(chunk.try_into().unwrap()); + bits_set += u64::from(chunk.count_ones()); + } + for byte in remainder { + bits_set += u64::from(byte.count_ones()); + } + bits_set + }; + if bits_set == 0 { + return None; + } + + Some(if bits_set < ARRAY_LIMIT { + Array(ArrayStore::from_lsb0_bytes(bytes, byte_offset, bits_set)) + } else { + Bitmap(BitmapStore::from_lsb0_bytes_unchecked(bytes, byte_offset, bits_set)) + }) + } + pub fn insert(&mut self, index: u16) -> bool { match self { Array(vec) => vec.insert(index), From 09e9d6dd2cac460f96475315c28ff2059fb1763f Mon Sep 17 00:00:00 2001 From: Zachary Dremann Date: Sat, 7 Sep 2024 10:55:40 -0400 Subject: [PATCH 07/11] add special case for creating a full bitmap container we can setting an initial value in that case --- roaring/src/bitmap/serialization.rs | 8 +++- roaring/src/bitmap/store/bitmap_store.rs | 50 +++++++++++++++--------- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/roaring/src/bitmap/serialization.rs b/roaring/src/bitmap/serialization.rs index f38f89f6..b618866a 100644 --- a/roaring/src/bitmap/serialization.rs +++ b/roaring/src/bitmap/serialization.rs @@ -413,11 +413,17 @@ mod test { assert_eq!(rb.min(), Some(CONTAINER_OFFSET + 8)); - // Ensure we can set the last byte + // Ensure we can set the last byte in an array container let bytes = [0x80]; let rb = RoaringBitmap::from_bitmap_bytes(0xFFFFFFF8, &bytes); assert_eq!(rb.len(), 1); assert!(rb.contains(u32::MAX)); + + // Ensure we can set the last byte in a bitmap container + let bytes = vec![0xFF; 0x1_0000 / 8]; + let rb = RoaringBitmap::from_bitmap_bytes(0xFFFF0000, &bytes); + assert_eq!(rb.len(), 0x1_0000); + assert!(rb.contains(u32::MAX)); } #[test] diff --git a/roaring/src/bitmap/store/bitmap_store.rs b/roaring/src/bitmap/store/bitmap_store.rs index daff3776..1a4c4013 100644 --- a/roaring/src/bitmap/store/bitmap_store.rs +++ b/roaring/src/bitmap/store/bitmap_store.rs @@ -37,27 +37,41 @@ impl BitmapStore { } pub fn from_lsb0_bytes_unchecked(bytes: &[u8], byte_offset: usize, bits_set: u64) -> Self { - assert!(byte_offset + bytes.len() <= BITMAP_LENGTH * size_of::()); - - let mut bits = Box::new([0u64; BITMAP_LENGTH]); - // Safety: It's safe to reinterpret u64s as u8s because u8 has less alignment requirements, - // and has no padding/uninitialized data. - let dst = unsafe { - std::slice::from_raw_parts_mut( - bits.as_mut_ptr().cast::(), - BITMAP_LENGTH * size_of::(), - ) + const BITMAP_BYTES: usize = BITMAP_LENGTH * size_of::(); + assert!(byte_offset.checked_add(bytes.len()).map_or(false, |sum| sum <= BITMAP_BYTES)); + + // If we know we're writing the full bitmap, we can avoid the initial memset to 0 + let mut bits = if bytes.len() == BITMAP_BYTES { + debug_assert_eq!(byte_offset, 0); // Must be true from the above assert + + // Safety: We've checked that the length is correct, and we use an unaligned load in case + // the bytes are not 8 byte aligned. + // The optimizer can see through this, and avoid the double copy to copy directly into + // the allocated box from bytes with memcpy + let bytes_as_words = + unsafe { bytes.as_ptr().cast::<[u64; BITMAP_LENGTH]>().read_unaligned() }; + Box::new(bytes_as_words) + } else { + let mut bits = Box::new([0u64; BITMAP_LENGTH]); + // Safety: It's safe to reinterpret u64s as u8s because u8 has less alignment requirements, + // and has no padding/uninitialized data. + let dst = unsafe { + std::slice::from_raw_parts_mut(bits.as_mut_ptr().cast::(), BITMAP_BYTES) + }; + let dst = &mut dst[byte_offset..][..bytes.len()]; + dst.copy_from_slice(bytes); + bits }; - let dst = &mut dst[byte_offset..][..bytes.len()]; - dst.copy_from_slice(bytes); - let start_word = byte_offset / size_of::(); - let end_word = (byte_offset + bytes.len() + (size_of::() - 1)) / size_of::(); + if !cfg!(target_endian = "little") { + // Convert all words we touched (even partially) to little-endian + let start_word = byte_offset / size_of::(); + let end_word = (byte_offset + bytes.len() + (size_of::() - 1)) / size_of::(); - // The 0th byte is the least significant byte, so we've written the bytes in little-endian - // order, convert to native endian. Expect this to get optimized away for little-endian. - for word in &mut bits[start_word..end_word] { - *word = u64::from_le(*word); + // The 0th byte is the least significant byte, so we've written the bytes in little-endian + for word in &mut bits[start_word..end_word] { + *word = u64::from_le(*word); + } } Self::from_unchecked(bits_set, bits) From f0dbb7288828b4780f5de8940a3e8de4027aba71 Mon Sep 17 00:00:00 2001 From: lemolatoon <63438515+lemolatoon@users.noreply.github.com> Date: Tue, 1 Oct 2024 13:38:18 +0900 Subject: [PATCH 08/11] Apply cargo fmt --- roaring/src/bitmap/container.rs | 7 ++----- roaring/src/bitmap/serialization.rs | 13 ++++++------- roaring/src/bitmap/store/array_store/mod.rs | 6 +++--- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/roaring/src/bitmap/container.rs b/roaring/src/bitmap/container.rs index dc8eeda6..64c6c561 100644 --- a/roaring/src/bitmap/container.rs +++ b/roaring/src/bitmap/container.rs @@ -30,12 +30,9 @@ impl Container { pub fn full(key: u16) -> Container { Container { key, store: Store::full() } } - + pub fn from_lsb0_bytes(key: u16, bytes: &[u8], byte_offset: usize) -> Option { - Some(Container { - key, - store: Store::from_lsb0_bytes(bytes, byte_offset)?, - }) + Some(Container { key, store: Store::from_lsb0_bytes(bytes, byte_offset)? }) } } diff --git a/roaring/src/bitmap/serialization.rs b/roaring/src/bitmap/serialization.rs index b618866a..ba02d4c2 100644 --- a/roaring/src/bitmap/serialization.rs +++ b/roaring/src/bitmap/serialization.rs @@ -4,8 +4,8 @@ use crate::RoaringBitmap; use bytemuck::cast_slice_mut; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use core::convert::Infallible; -use core::ops::RangeInclusive; use core::mem::size_of; +use core::ops::RangeInclusive; use std::error::Error; use std::io; @@ -132,7 +132,7 @@ impl RoaringBitmap { start_container += 1; } - + // Handle all full containers for full_container_key in start_container..end_container_inc { let (src, rest) = bytes.split_at(BITMAP_LENGTH * size_of::()); @@ -411,28 +411,27 @@ mod test { let rb = RoaringBitmap::from_bitmap_bytes(8, &bytes); assert_eq!(rb.min(), Some(CONTAINER_OFFSET + 8)); - - + // Ensure we can set the last byte in an array container let bytes = [0x80]; let rb = RoaringBitmap::from_bitmap_bytes(0xFFFFFFF8, &bytes); assert_eq!(rb.len(), 1); assert!(rb.contains(u32::MAX)); - + // Ensure we can set the last byte in a bitmap container let bytes = vec![0xFF; 0x1_0000 / 8]; let rb = RoaringBitmap::from_bitmap_bytes(0xFFFF0000, &bytes); assert_eq!(rb.len(), 0x1_0000); assert!(rb.contains(u32::MAX)); } - + #[test] #[should_panic(expected = "multiple of 8")] fn test_from_bitmap_bytes_invalid_offset() { let bytes = [0x01]; RoaringBitmap::from_bitmap_bytes(1, &bytes); } - + #[test] #[should_panic(expected = "<= 2^32")] fn test_from_bitmap_bytes_overflow() { diff --git a/roaring/src/bitmap/store/array_store/mod.rs b/roaring/src/bitmap/store/array_store/mod.rs index f3a713bd..44e1b461 100644 --- a/roaring/src/bitmap/store/array_store/mod.rs +++ b/roaring/src/bitmap/store/array_store/mod.rs @@ -6,8 +6,8 @@ use crate::bitmap::store::array_store::visitor::{CardinalityCounter, VecWriter}; use core::cmp::Ordering; use core::cmp::Ordering::*; use core::fmt::{Display, Formatter}; -use core::ops::{BitAnd, BitAndAssign, BitOr, BitXor, RangeInclusive, Sub, SubAssign}; use core::mem::size_of; +use core::ops::{BitAnd, BitAndAssign, BitOr, BitXor, RangeInclusive, Sub, SubAssign}; #[cfg(not(feature = "std"))] use alloc::vec::Vec; @@ -50,7 +50,7 @@ impl ArrayStore { pub fn from_lsb0_bytes(bytes: &[u8], byte_offset: usize, bits_set: u64) -> Self { type Word = u64; - + let mut vec = Vec::with_capacity(bits_set as usize); let chunks = bytes.chunks_exact(size_of::()); @@ -71,7 +71,7 @@ impl ArrayStore { byte &= byte - 1; } } - + Self::from_vec_unchecked(vec) } From 915e58228ca2abc8053bf82d0856c277872e79ed Mon Sep 17 00:00:00 2001 From: lemolatoon <63438515+lemolatoon@users.noreply.github.com> Date: Tue, 1 Oct 2024 13:48:16 +0900 Subject: [PATCH 09/11] Rename from_bitmap_bytes to from_lsb0_bytes --- .github/workflows/test.yml | 2 +- benchmarks/benches/lib.rs | 4 ++-- roaring/src/bitmap/serialization.rs | 26 +++++++++++++------------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 74c8b0ee..1c7b592f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -82,7 +82,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: miri - args: test --target s390x-unknown-linux-gnu --package roaring --lib -- bitmap::serialization::test::test_from_bitmap_bytes + args: test --target s390x-unknown-linux-gnu --package roaring --lib -- bitmap::serialization::test::test_from_lsb0_bytes - name: Test no default features uses: actions-rs/cargo@v1 diff --git a/benchmarks/benches/lib.rs b/benchmarks/benches/lib.rs index c5f360a9..e2413f18 100644 --- a/benchmarks/benches/lib.rs +++ b/benchmarks/benches/lib.rs @@ -149,7 +149,7 @@ fn creation(c: &mut Criterion) { group.throughput(Throughput::Elements(dataset.bitmaps.iter().map(|rb| rb.len()).sum())); - group.bench_function(BenchmarkId::new("from_bitmap_bytes", &dataset.name), |b| { + group.bench_function(BenchmarkId::new("from_lsb0_bytes", &dataset.name), |b| { let bitmap_bytes = dataset_numbers .iter() .map(|bitmap_numbers| { @@ -165,7 +165,7 @@ fn creation(c: &mut Criterion) { .collect::>(); b.iter(|| { for bitmap_bytes in &bitmap_bytes { - black_box(RoaringBitmap::from_bitmap_bytes(0, bitmap_bytes)); + black_box(RoaringBitmap::from_lsb0_bytes(0, bitmap_bytes)); } }) }); diff --git a/roaring/src/bitmap/serialization.rs b/roaring/src/bitmap/serialization.rs index ba02d4c2..8dabf8d1 100644 --- a/roaring/src/bitmap/serialization.rs +++ b/roaring/src/bitmap/serialization.rs @@ -75,21 +75,21 @@ impl RoaringBitmap { /// let bytes = [0b00000101, 0b00000010, 0b00000000, 0b10000000]; /// // ^^^^^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^^^^^ /// // 76543210 98 - /// let rb = RoaringBitmap::from_bitmap_bytes(0, &bytes); + /// let rb = RoaringBitmap::from_lsb0_bytes(0, &bytes); /// assert!(rb.contains(0)); /// assert!(!rb.contains(1)); /// assert!(rb.contains(2)); /// assert!(rb.contains(9)); /// assert!(rb.contains(31)); /// - /// let rb = RoaringBitmap::from_bitmap_bytes(8, &bytes); + /// let rb = RoaringBitmap::from_lsb0_bytes(8, &bytes); /// assert!(rb.contains(8)); /// assert!(!rb.contains(9)); /// assert!(rb.contains(10)); /// assert!(rb.contains(17)); /// assert!(rb.contains(39)); /// ``` - pub fn from_bitmap_bytes(offset: u32, mut bytes: &[u8]) -> RoaringBitmap { + pub fn from_lsb0_bytes(offset: u32, mut bytes: &[u8]) -> RoaringBitmap { assert_eq!(offset % 8, 0, "offset must be a multiple of 8"); if bytes.is_empty() { @@ -378,7 +378,7 @@ mod test { } #[test] - fn test_from_bitmap_bytes() { + fn test_from_lsb0_bytes() { const CONTAINER_OFFSET: u32 = u64::BITS * BITMAP_LENGTH as u32; const CONTAINER_OFFSET_IN_BYTES: u32 = CONTAINER_OFFSET / 8; let mut bytes = vec![0xff; CONTAINER_OFFSET_IN_BYTES as usize]; @@ -386,7 +386,7 @@ mod test { bytes.extend(&[0b00000001, 0b00000010, 0b00000011, 0b00000100]); let offset = 32; - let rb = RoaringBitmap::from_bitmap_bytes(offset, &bytes); + let rb = RoaringBitmap::from_lsb0_bytes(offset, &bytes); for i in 0..offset { assert!(!rb.contains(i), "{i} should not be in the bitmap"); } @@ -406,37 +406,37 @@ mod test { // Ensure the empty container is not created let mut bytes = vec![0x00u8; CONTAINER_OFFSET_IN_BYTES as usize]; bytes.extend(&[0xff]); - let rb = RoaringBitmap::from_bitmap_bytes(0, &bytes); + let rb = RoaringBitmap::from_lsb0_bytes(0, &bytes); assert_eq!(rb.min(), Some(CONTAINER_OFFSET)); - let rb = RoaringBitmap::from_bitmap_bytes(8, &bytes); + let rb = RoaringBitmap::from_lsb0_bytes(8, &bytes); assert_eq!(rb.min(), Some(CONTAINER_OFFSET + 8)); // Ensure we can set the last byte in an array container let bytes = [0x80]; - let rb = RoaringBitmap::from_bitmap_bytes(0xFFFFFFF8, &bytes); + let rb = RoaringBitmap::from_lsb0_bytes(0xFFFFFFF8, &bytes); assert_eq!(rb.len(), 1); assert!(rb.contains(u32::MAX)); // Ensure we can set the last byte in a bitmap container let bytes = vec![0xFF; 0x1_0000 / 8]; - let rb = RoaringBitmap::from_bitmap_bytes(0xFFFF0000, &bytes); + let rb = RoaringBitmap::from_lsb0_bytes(0xFFFF0000, &bytes); assert_eq!(rb.len(), 0x1_0000); assert!(rb.contains(u32::MAX)); } #[test] #[should_panic(expected = "multiple of 8")] - fn test_from_bitmap_bytes_invalid_offset() { + fn test_from_lsb0_bytes_invalid_offset() { let bytes = [0x01]; - RoaringBitmap::from_bitmap_bytes(1, &bytes); + RoaringBitmap::from_lsb0_bytes(1, &bytes); } #[test] #[should_panic(expected = "<= 2^32")] - fn test_from_bitmap_bytes_overflow() { + fn test_from_lsb0_bytes_overflow() { let bytes = [0x01, 0x01]; - RoaringBitmap::from_bitmap_bytes(u32::MAX - 7, &bytes); + RoaringBitmap::from_lsb0_bytes(u32::MAX - 7, &bytes); } #[test] From 53ec34b20ee0a7f5c89cb1d7c273e0b4fc2ce612 Mon Sep 17 00:00:00 2001 From: lemolatoon <63438515+lemolatoon@users.noreply.github.com> Date: Thu, 31 Oct 2024 16:46:40 +0900 Subject: [PATCH 10/11] Fix CI error --- .github/workflows/test.yml | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1c7b592f..3f06c302 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,6 +41,22 @@ jobs: override: true components: rustfmt, clippy + - name: Install miri + uses: actions-rs/toolchain@v1 + if: matrix.rust == 'nightly' + with: + profile: minimal + toolchain: nightly + override: true + components: miri + + - name: miri setup + uses: actions-rs/cargo@v1 + if: matrix.rust == 'nightly' + with: + command: miri + args: setup + - name: Fetch uses: actions-rs/cargo@v1 with: @@ -70,16 +86,9 @@ jobs: command: test args: --features serde - - name: miri setup - if: matrix.rust == 'nightly' - uses: actions-rs/cargo@v1 - with: - command: miri - args: setup - - name: Test bit endian - if: matrix.rust == 'nightly' uses: actions-rs/cargo@v1 + if: matrix.rust == 'nightly' with: command: miri args: test --target s390x-unknown-linux-gnu --package roaring --lib -- bitmap::serialization::test::test_from_lsb0_bytes From 2a9a0c2f2cad4181164ca4c9e12a9a391d26eb3f Mon Sep 17 00:00:00 2001 From: lemolatoon <63438515+lemolatoon@users.noreply.github.com> Date: Fri, 1 Nov 2024 13:59:39 +0900 Subject: [PATCH 11/11] Fix cargo clippy warning --- roaring/src/bitmap/serialization.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/roaring/src/bitmap/serialization.rs b/roaring/src/bitmap/serialization.rs index 8dabf8d1..3ac12638 100644 --- a/roaring/src/bitmap/serialization.rs +++ b/roaring/src/bitmap/serialization.rs @@ -382,8 +382,8 @@ mod test { const CONTAINER_OFFSET: u32 = u64::BITS * BITMAP_LENGTH as u32; const CONTAINER_OFFSET_IN_BYTES: u32 = CONTAINER_OFFSET / 8; let mut bytes = vec![0xff; CONTAINER_OFFSET_IN_BYTES as usize]; - bytes.extend(&[0x00; CONTAINER_OFFSET_IN_BYTES as usize]); - bytes.extend(&[0b00000001, 0b00000010, 0b00000011, 0b00000100]); + bytes.extend([0x00; CONTAINER_OFFSET_IN_BYTES as usize]); + bytes.extend([0b00000001, 0b00000010, 0b00000011, 0b00000100]); let offset = 32; let rb = RoaringBitmap::from_lsb0_bytes(offset, &bytes); @@ -405,7 +405,7 @@ mod test { // Ensure the empty container is not created let mut bytes = vec![0x00u8; CONTAINER_OFFSET_IN_BYTES as usize]; - bytes.extend(&[0xff]); + bytes.extend([0xff]); let rb = RoaringBitmap::from_lsb0_bytes(0, &bytes); assert_eq!(rb.min(), Some(CONTAINER_OFFSET));