|
17 | 17 |
|
18 | 18 | //! Common utilities for implementing string functions
|
19 | 19 |
|
20 |
| -use std::collections::HashMap; |
21 | 20 | use std::fmt::{Display, Formatter};
|
22 | 21 | use std::sync::Arc;
|
23 | 22 |
|
24 | 23 | use arrow::array::{
|
25 | 24 | new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef,
|
26 |
| - BooleanArray, GenericStringArray, GenericStringBuilder, OffsetSizeTrait, StringArray, |
| 25 | + GenericStringArray, GenericStringBuilder, OffsetSizeTrait, StringArray, |
27 | 26 | StringBuilder, StringViewArray,
|
28 | 27 | };
|
29 | 28 | use arrow::buffer::{Buffer, MutableBuffer, NullBuffer};
|
30 | 29 | use arrow::datatypes::DataType;
|
31 |
| -use arrow_buffer::BooleanBufferBuilder; |
32 | 30 | use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
|
| 31 | +use datafusion_common::Result; |
33 | 32 | use datafusion_common::{exec_err, ScalarValue};
|
34 |
| -use datafusion_common::{DataFusionError, Result}; |
35 | 33 | use datafusion_expr::ColumnarValue;
|
36 |
| -use regex::Regex; |
37 | 34 |
|
38 | 35 | pub(crate) enum TrimType {
|
39 | 36 | Left,
|
@@ -481,96 +478,3 @@ where
|
481 | 478 | GenericStringArray::<O>::new_unchecked(offsets, values, nulls)
|
482 | 479 | }))
|
483 | 480 | }
|
484 |
| - |
485 |
| -#[cfg(doc)] |
486 |
| -use arrow::array::LargeStringArray; |
487 |
| -/// Perform SQL `array ~ regex_array` operation on |
488 |
| -/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`]. |
489 |
| -/// |
490 |
| -/// If `regex_array` element has an empty value, the corresponding result value is always true. |
491 |
| -/// |
492 |
| -/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] flag, |
493 |
| -/// which allow special search modes, such as case-insensitive and multi-line mode. |
494 |
| -/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags) |
495 |
| -/// for more information. |
496 |
| -/// |
497 |
| -/// It is inspired / copied from `regexp_is_match_utf8` [arrow-rs]. |
498 |
| -/// |
499 |
| -/// [arrow-rs]: https://github.com/apache/arrow-rs/blob/8c956a9f9ab26c14072740cce64c2b99cb039b13/arrow-string/src/regexp.rs#L31-L37 |
500 |
| -pub fn regexp_is_match<'a, S1, S2, S3>( |
501 |
| - array: &'a S1, |
502 |
| - regex_array: &'a S2, |
503 |
| - flags_array: Option<&'a S3>, |
504 |
| -) -> Result<BooleanArray, DataFusionError> |
505 |
| -where |
506 |
| - &'a S1: StringArrayType<'a>, |
507 |
| - &'a S2: StringArrayType<'a>, |
508 |
| - &'a S3: StringArrayType<'a>, |
509 |
| -{ |
510 |
| - if array.len() != regex_array.len() { |
511 |
| - return Err(DataFusionError::Execution( |
512 |
| - "Cannot perform comparison operation on arrays of different length" |
513 |
| - .to_string(), |
514 |
| - )); |
515 |
| - } |
516 |
| - |
517 |
| - let nulls = NullBuffer::union(array.nulls(), regex_array.nulls()); |
518 |
| - |
519 |
| - let mut patterns: HashMap<String, Regex> = HashMap::new(); |
520 |
| - let mut result = BooleanBufferBuilder::new(array.len()); |
521 |
| - |
522 |
| - let complete_pattern = match flags_array { |
523 |
| - Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( |
524 |
| - |(pattern, flags)| { |
525 |
| - pattern.map(|pattern| match flags { |
526 |
| - Some(flag) => format!("(?{flag}){pattern}"), |
527 |
| - None => pattern.to_string(), |
528 |
| - }) |
529 |
| - }, |
530 |
| - )) as Box<dyn Iterator<Item = Option<String>>>, |
531 |
| - None => Box::new( |
532 |
| - regex_array |
533 |
| - .iter() |
534 |
| - .map(|pattern| pattern.map(|pattern| pattern.to_string())), |
535 |
| - ), |
536 |
| - }; |
537 |
| - |
538 |
| - array |
539 |
| - .iter() |
540 |
| - .zip(complete_pattern) |
541 |
| - .map(|(value, pattern)| { |
542 |
| - match (value, pattern) { |
543 |
| - (Some(_), Some(pattern)) if pattern == *"" => { |
544 |
| - result.append(true); |
545 |
| - } |
546 |
| - (Some(value), Some(pattern)) => { |
547 |
| - let existing_pattern = patterns.get(&pattern); |
548 |
| - let re = match existing_pattern { |
549 |
| - Some(re) => re, |
550 |
| - None => { |
551 |
| - let re = Regex::new(pattern.as_str()).map_err(|e| { |
552 |
| - DataFusionError::Execution(format!( |
553 |
| - "Regular expression did not compile: {e:?}" |
554 |
| - )) |
555 |
| - })?; |
556 |
| - patterns.entry(pattern).or_insert(re) |
557 |
| - } |
558 |
| - }; |
559 |
| - result.append(re.is_match(value)); |
560 |
| - } |
561 |
| - _ => result.append(false), |
562 |
| - } |
563 |
| - Ok(()) |
564 |
| - }) |
565 |
| - .collect::<Result<Vec<()>, DataFusionError>>()?; |
566 |
| - |
567 |
| - let data = unsafe { |
568 |
| - ArrayDataBuilder::new(DataType::Boolean) |
569 |
| - .len(array.len()) |
570 |
| - .buffers(vec![result.into()]) |
571 |
| - .nulls(nulls) |
572 |
| - .build_unchecked() |
573 |
| - }; |
574 |
| - |
575 |
| - Ok(BooleanArray::from(data)) |
576 |
| -} |
0 commit comments