|
| 1 | +// Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +// or more contributor license agreements. See the NOTICE file |
| 3 | +// distributed with this work for additional information |
| 4 | +// regarding copyright ownership. The ASF licenses this file |
| 5 | +// to you under the Apache License, Version 2.0 (the |
| 6 | +// "License"); you may not use this file except in compliance |
| 7 | +// with the License. You may obtain a copy of the License at |
| 8 | +// |
| 9 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +// |
| 11 | +// Unless required by applicable law or agreed to in writing, |
| 12 | +// software distributed under the License is distributed on an |
| 13 | +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +// KIND, either express or implied. See the License for the |
| 15 | +// specific language governing permissions and limitations |
| 16 | +// under the License. |
| 17 | + |
| 18 | +use std::ops::Range; |
| 19 | + |
| 20 | +use arrow_array::{Array, BooleanArray}; |
| 21 | +use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder, MutableBuffer}; |
| 22 | +use arrow_data::bit_iterator::BitIndexIterator; |
| 23 | + |
| 24 | +use super::{RowSelection, RowSelector}; |
| 25 | + |
| 26 | +/// A selection of rows, similar to [`RowSelection`], but based on a boolean array |
| 27 | +#[derive(Debug, Clone, PartialEq, Eq)] |
| 28 | +pub struct BooleanRowSelection { |
| 29 | + selector: BooleanBuffer, |
| 30 | +} |
| 31 | + |
| 32 | +impl BooleanRowSelection { |
| 33 | + /// Create a new [`BooleanRowSelection] from a list of [`BooleanArray`]. |
| 34 | + pub fn from_filters(filters: &[BooleanArray]) -> Self { |
| 35 | + let arrays: Vec<&dyn Array> = filters.iter().map(|x| x as &dyn Array).collect(); |
| 36 | + let result = arrow_select::concat::concat(&arrays).unwrap().into_data(); |
| 37 | + let (boolean_array, _null) = BooleanArray::from(result).into_parts(); |
| 38 | + BooleanRowSelection { |
| 39 | + selector: boolean_array, |
| 40 | + } |
| 41 | + } |
| 42 | + |
| 43 | + /// Create a new [`BooleanRowSelection`] with all rows unselected |
| 44 | + pub fn new_unselected(row_count: usize) -> Self { |
| 45 | + let buffer = BooleanBuffer::new_unset(row_count); |
| 46 | + |
| 47 | + BooleanRowSelection { selector: buffer } |
| 48 | + } |
| 49 | + |
| 50 | + /// Create a new [`BooleanRowSelection`] with all rows selected |
| 51 | + pub fn new_selected(row_count: usize) -> Self { |
| 52 | + let buffer = BooleanBuffer::new_set(row_count); |
| 53 | + |
| 54 | + BooleanRowSelection { selector: buffer } |
| 55 | + } |
| 56 | + |
| 57 | + /// Returns a new [`BooleanRowSelection`] that selects the inverse of this [`BooleanRowSelection`]. |
| 58 | + pub fn as_inverted(&self) -> Self { |
| 59 | + let buffer = !&self.selector; |
| 60 | + BooleanRowSelection { selector: buffer } |
| 61 | + } |
| 62 | + |
| 63 | + /// Returns the number of rows in this [`BooleanRowSelection`]. |
| 64 | + pub fn len(&self) -> usize { |
| 65 | + self.selector.len() |
| 66 | + } |
| 67 | + |
| 68 | + /// Returns the number of rows selected by this [`BooleanRowSelection`]. |
| 69 | + pub fn row_count(&self) -> usize { |
| 70 | + self.selector.count_set_bits() |
| 71 | + } |
| 72 | + |
| 73 | + /// Create a new [`BooleanRowSelection`] from a list of consecutive ranges. |
| 74 | + pub fn from_consecutive_ranges( |
| 75 | + ranges: impl Iterator<Item = Range<usize>>, |
| 76 | + total_rows: usize, |
| 77 | + ) -> Self { |
| 78 | + let mut buffer = BooleanBufferBuilder::new(total_rows); |
| 79 | + let mut last_end = 0; |
| 80 | + |
| 81 | + for range in ranges { |
| 82 | + let len = range.end - range.start; |
| 83 | + if len == 0 { |
| 84 | + continue; |
| 85 | + } |
| 86 | + |
| 87 | + if range.start > last_end { |
| 88 | + buffer.append_n(range.start - last_end, false); |
| 89 | + } |
| 90 | + buffer.append_n(len, true); |
| 91 | + last_end = range.end; |
| 92 | + } |
| 93 | + |
| 94 | + if last_end != total_rows { |
| 95 | + buffer.append_n(total_rows - last_end, false); |
| 96 | + } |
| 97 | + |
| 98 | + BooleanRowSelection { |
| 99 | + selector: buffer.finish(), |
| 100 | + } |
| 101 | + } |
| 102 | + |
| 103 | + /// Compute the union of two [`BooleanRowSelection`] |
| 104 | + /// For example: |
| 105 | + /// self: NNYYYYNNYYNYN |
| 106 | + /// other: NYNNNNNNN |
| 107 | + /// |
| 108 | + /// returned: NYYYYYNNYYNYN |
| 109 | + #[must_use] |
| 110 | + pub fn union(&self, other: &Self) -> Self { |
| 111 | + // use arrow::compute::kernels::boolean::or; |
| 112 | + |
| 113 | + let union_selectors = &self.selector | &other.selector; |
| 114 | + |
| 115 | + BooleanRowSelection { |
| 116 | + selector: union_selectors, |
| 117 | + } |
| 118 | + } |
| 119 | + |
| 120 | + /// Compute the intersection of two [`BooleanRowSelection`] |
| 121 | + /// For example: |
| 122 | + /// self: NNYYYYNNYYNYN |
| 123 | + /// other: NYNNNNNNY |
| 124 | + /// |
| 125 | + /// returned: NNNNNNNNYYNYN |
| 126 | + #[must_use] |
| 127 | + pub fn intersection(&self, other: &Self) -> Self { |
| 128 | + let intersection_selectors = &self.selector & &other.selector; |
| 129 | + |
| 130 | + BooleanRowSelection { |
| 131 | + selector: intersection_selectors, |
| 132 | + } |
| 133 | + } |
| 134 | + |
| 135 | + /// Combines this [`BooleanRowSelection`] with another using logical AND on the selected bits. |
| 136 | + /// |
| 137 | + /// Unlike [`intersection`], the `other` [`BooleanRowSelection`] must have exactly as many set bits as `self`. |
| 138 | + /// This method will keep only the bits in `self` that are also set in `other` |
| 139 | + /// at the positions corresponding to `self`'s set bits. |
| 140 | + pub fn and_then(&self, other: &Self) -> Self { |
| 141 | + // Ensure that 'other' has exactly as many set bits as 'self' |
| 142 | + debug_assert_eq!( |
| 143 | + self.row_count(), |
| 144 | + other.len(), |
| 145 | + "The 'other' selection must have exactly as many set bits as 'self'." |
| 146 | + ); |
| 147 | + |
| 148 | + if self.len() == other.len() { |
| 149 | + // fast path if the two selections are the same length |
| 150 | + // common if this is the first predicate |
| 151 | + debug_assert_eq!(self.row_count(), self.len()); |
| 152 | + return self.intersection(other); |
| 153 | + } |
| 154 | + |
| 155 | + let mut buffer = MutableBuffer::from_len_zeroed(self.len()); |
| 156 | + buffer.copy_from_slice(self.selector.values()); |
| 157 | + let mut builder = BooleanBufferBuilder::new_from_buffer(buffer, self.len()); |
| 158 | + |
| 159 | + // Create iterators for 'self' and 'other' bits |
| 160 | + let mut other_bits = other.selector.iter(); |
| 161 | + |
| 162 | + for bit_idx in self.true_iter() { |
| 163 | + let predicate = other_bits |
| 164 | + .next() |
| 165 | + .expect("Mismatch in set bits between self and other"); |
| 166 | + if !predicate { |
| 167 | + builder.set_bit(bit_idx, false); |
| 168 | + } |
| 169 | + } |
| 170 | + |
| 171 | + BooleanRowSelection { |
| 172 | + selector: builder.finish(), |
| 173 | + } |
| 174 | + } |
| 175 | + |
| 176 | + /// Returns an iterator over the indices of the set bits in this [`BooleanRowSelection`] |
| 177 | + pub fn true_iter(&self) -> BitIndexIterator<'_> { |
| 178 | + self.selector.set_indices() |
| 179 | + } |
| 180 | + |
| 181 | + /// Returns `true` if this [`BooleanRowSelection`] selects any rows |
| 182 | + pub fn selects_any(&self) -> bool { |
| 183 | + self.true_iter().next().is_some() |
| 184 | + } |
| 185 | + |
| 186 | + /// Returns a new [`BooleanRowSelection`] that selects the rows in this [`BooleanRowSelection`] from `offset` to `offset + len` |
| 187 | + pub fn slice(&self, offset: usize, len: usize) -> BooleanArray { |
| 188 | + BooleanArray::new(self.selector.slice(offset, len), None) |
| 189 | + } |
| 190 | +} |
| 191 | + |
| 192 | +impl From<Vec<RowSelector>> for BooleanRowSelection { |
| 193 | + fn from(selection: Vec<RowSelector>) -> Self { |
| 194 | + let selection = RowSelection::from(selection); |
| 195 | + RowSelection::into(selection) |
| 196 | + } |
| 197 | +} |
| 198 | + |
| 199 | +impl From<RowSelection> for BooleanRowSelection { |
| 200 | + fn from(selection: RowSelection) -> Self { |
| 201 | + let total_rows = selection.row_count(); |
| 202 | + let mut builder = BooleanBufferBuilder::new(total_rows); |
| 203 | + |
| 204 | + for selector in selection.iter() { |
| 205 | + if selector.skip { |
| 206 | + builder.append_n(selector.row_count, false); |
| 207 | + } else { |
| 208 | + builder.append_n(selector.row_count, true); |
| 209 | + } |
| 210 | + } |
| 211 | + |
| 212 | + BooleanRowSelection { |
| 213 | + selector: builder.finish(), |
| 214 | + } |
| 215 | + } |
| 216 | +} |
| 217 | + |
| 218 | +impl From<&BooleanRowSelection> for RowSelection { |
| 219 | + fn from(selection: &BooleanRowSelection) -> Self { |
| 220 | + let array = BooleanArray::new(selection.selector.clone(), None); |
| 221 | + RowSelection::from_filters(&[array]) |
| 222 | + } |
| 223 | +} |
| 224 | + |
#[cfg(test)]
mod tests {
    use rand::Rng;

    use super::*;

    /// Builds a null-free filter of `total_rows` bits where each bit is set
    /// with probability `selection_ratio`.
    fn generate_random_row_selection(total_rows: usize, selection_ratio: f64) -> BooleanArray {
        let mut rng = rand::thread_rng();
        let bools: Vec<bool> = (0..total_rows)
            .map(|_| rng.gen_bool(selection_ratio))
            .collect();
        BooleanArray::from(bools)
    }

    /// Boolean -> run-length -> boolean must reproduce the original bitmap.
    #[test]
    fn test_boolean_row_selection_round_trip() {
        let total_rows = 1_000;
        for &selection_ratio in &[0.0, 0.1, 0.5, 0.9, 1.0] {
            let selection = generate_random_row_selection(total_rows, selection_ratio);
            let boolean_selection = BooleanRowSelection::from_filters(&[selection]);
            let row_selection = RowSelection::from(&boolean_selection);
            let boolean_selection_again = row_selection.into();
            assert_eq!(boolean_selection, boolean_selection_again);
        }
    }

    /// `union` / `intersection` must agree with the [`RowSelection`] versions.
    #[test]
    fn test_boolean_union_intersection() {
        let total_rows = 1_000;

        let base_boolean_selection =
            BooleanRowSelection::from_filters(&[generate_random_row_selection(total_rows, 0.1)]);
        let base_row_selection = RowSelection::from(&base_boolean_selection);
        for &selection_ratio in &[0.0, 0.1, 0.5, 0.9, 1.0] {
            let boolean_selection =
                BooleanRowSelection::from_filters(&[generate_random_row_selection(
                    total_rows,
                    selection_ratio,
                )]);
            let row_selection = RowSelection::from(&boolean_selection);

            let boolean_union = boolean_selection.union(&base_boolean_selection);
            let row_union = row_selection.union(&base_row_selection);
            assert_eq!(boolean_union, BooleanRowSelection::from(row_union));

            let boolean_intersection = boolean_selection.intersection(&base_boolean_selection);
            let row_intersection = row_selection.intersection(&base_row_selection);
            assert_eq!(
                boolean_intersection,
                BooleanRowSelection::from(row_intersection)
            );
        }
    }

    #[test]
    fn test_boolean_selection_and_then() {
        // Initial mask: 001011010101
        let self_filters = vec![BooleanArray::from(vec![
            false, false, true, false, true, true, false, true, false, true, false, true,
        ])];
        let self_selection = BooleanRowSelection::from_filters(&self_filters);

        // Predicate mask (only for selected bits): 001101
        let other_filters = vec![BooleanArray::from(vec![
            false, false, true, true, false, true,
        ])];
        let other_selection = BooleanRowSelection::from_filters(&other_filters);

        let result = self_selection.and_then(&other_selection);

        // Expected result: 000001010001
        let expected_filters = vec![BooleanArray::from(vec![
            false, false, false, false, false, true, false, true, false, false, false, true,
        ])];
        let expected_selection = BooleanRowSelection::from_filters(&expected_filters);

        assert_eq!(result, expected_selection);
    }

    // The length check in `and_then` is a `debug_assert_eq!`, so this panic
    // only exists when debug assertions are compiled in.
    #[cfg(debug_assertions)]
    #[test]
    #[should_panic(
        expected = "The 'other' selection must have exactly as many set bits as 'self'."
    )]
    fn test_and_then_mismatched_set_bits() {
        let self_filters = vec![BooleanArray::from(vec![true, true, false])];
        let self_selection = BooleanRowSelection::from_filters(&self_filters);

        // 'other' covers three rows, but 'self' selects only two
        let other_filters = vec![BooleanArray::from(vec![true, false, false])];
        let other_selection = BooleanRowSelection::from_filters(&other_filters);

        // This should panic
        let _ = self_selection.and_then(&other_selection);
    }
}
0 commit comments