Skip to content

Commit 0f56e66

Browse files
Dimchikkk and Dima
authored and committed
improve performance of regexp_count (apache#13364)
* improve performance of regexp_count
* fix clippy
* collect with Int64Array to eliminate one temp Vec

---------

Co-authored-by: Dima <[email protected]>
1 parent de28a30 commit 0f56e66

File tree

1 file changed

+39
-43
lines changed

1 file changed

+39
-43
lines changed

datafusion/functions/src/regex/regexpcount.rs

Lines changed: 39 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,6 @@ use datafusion_expr::{
3030
};
3131
use itertools::izip;
3232
use regex::Regex;
33-
use std::collections::hash_map::Entry;
3433
use std::collections::HashMap;
3534
use std::sync::{Arc, OnceLock};
3635

@@ -312,12 +311,12 @@ where
312311

313312
let pattern = compile_regex(regex, flags_scalar)?;
314313

315-
Ok(Arc::new(Int64Array::from_iter_values(
314+
Ok(Arc::new(
316315
values
317316
.iter()
318317
.map(|value| count_matches(value, &pattern, start_scalar))
319-
.collect::<Result<Vec<i64>, ArrowError>>()?,
320-
)))
318+
.collect::<Result<Int64Array, ArrowError>>()?,
319+
))
321320
}
322321
(true, true, false) => {
323322
let regex = match regex_scalar {
@@ -336,17 +335,17 @@ where
336335
)));
337336
}
338337

339-
Ok(Arc::new(Int64Array::from_iter_values(
338+
Ok(Arc::new(
340339
values
341340
.iter()
342341
.zip(flags_array.iter())
343342
.map(|(value, flags)| {
344343
let pattern =
345344
compile_and_cache_regex(regex, flags, &mut regex_cache)?;
346-
count_matches(value, &pattern, start_scalar)
345+
count_matches(value, pattern, start_scalar)
347346
})
348-
.collect::<Result<Vec<i64>, ArrowError>>()?,
349-
)))
347+
.collect::<Result<Int64Array, ArrowError>>()?,
348+
))
350349
}
351350
(true, false, true) => {
352351
let regex = match regex_scalar {
@@ -360,13 +359,13 @@ where
360359

361360
let start_array = start_array.unwrap();
362361

363-
Ok(Arc::new(Int64Array::from_iter_values(
362+
Ok(Arc::new(
364363
values
365364
.iter()
366365
.zip(start_array.iter())
367366
.map(|(value, start)| count_matches(value, &pattern, start))
368-
.collect::<Result<Vec<i64>, ArrowError>>()?,
369-
)))
367+
.collect::<Result<Int64Array, ArrowError>>()?,
368+
))
370369
}
371370
(true, false, false) => {
372371
let regex = match regex_scalar {
@@ -385,7 +384,7 @@ where
385384
)));
386385
}
387386

388-
Ok(Arc::new(Int64Array::from_iter_values(
387+
Ok(Arc::new(
389388
izip!(
390389
values.iter(),
391390
start_array.unwrap().iter(),
@@ -395,10 +394,10 @@ where
395394
let pattern =
396395
compile_and_cache_regex(regex, flags, &mut regex_cache)?;
397396

398-
count_matches(value, &pattern, start)
397+
count_matches(value, pattern, start)
399398
})
400-
.collect::<Result<Vec<i64>, ArrowError>>()?,
401-
)))
399+
.collect::<Result<Int64Array, ArrowError>>()?,
400+
))
402401
}
403402
(false, true, true) => {
404403
if values.len() != regex_array.len() {
@@ -409,7 +408,7 @@ where
409408
)));
410409
}
411410

412-
Ok(Arc::new(Int64Array::from_iter_values(
411+
Ok(Arc::new(
413412
values
414413
.iter()
415414
.zip(regex_array.iter())
@@ -424,10 +423,10 @@ where
424423
flags_scalar,
425424
&mut regex_cache,
426425
)?;
427-
count_matches(value, &pattern, start_scalar)
426+
count_matches(value, pattern, start_scalar)
428427
})
429-
.collect::<Result<Vec<i64>, ArrowError>>()?,
430-
)))
428+
.collect::<Result<Int64Array, ArrowError>>()?,
429+
))
431430
}
432431
(false, true, false) => {
433432
if values.len() != regex_array.len() {
@@ -447,7 +446,7 @@ where
447446
)));
448447
}
449448

450-
Ok(Arc::new(Int64Array::from_iter_values(
449+
Ok(Arc::new(
451450
izip!(values.iter(), regex_array.iter(), flags_array.iter())
452451
.map(|(value, regex, flags)| {
453452
let regex = match regex {
@@ -458,10 +457,10 @@ where
458457
let pattern =
459458
compile_and_cache_regex(regex, flags, &mut regex_cache)?;
460459

461-
count_matches(value, &pattern, start_scalar)
460+
count_matches(value, pattern, start_scalar)
462461
})
463-
.collect::<Result<Vec<i64>, ArrowError>>()?,
464-
)))
462+
.collect::<Result<Int64Array, ArrowError>>()?,
463+
))
465464
}
466465
(false, false, true) => {
467466
if values.len() != regex_array.len() {
@@ -481,7 +480,7 @@ where
481480
)));
482481
}
483482

484-
Ok(Arc::new(Int64Array::from_iter_values(
483+
Ok(Arc::new(
485484
izip!(values.iter(), regex_array.iter(), start_array.iter())
486485
.map(|(value, regex, start)| {
487486
let regex = match regex {
@@ -494,10 +493,10 @@ where
494493
flags_scalar,
495494
&mut regex_cache,
496495
)?;
497-
count_matches(value, &pattern, start)
496+
count_matches(value, pattern, start)
498497
})
499-
.collect::<Result<Vec<i64>, ArrowError>>()?,
500-
)))
498+
.collect::<Result<Int64Array, ArrowError>>()?,
499+
))
501500
}
502501
(false, false, false) => {
503502
if values.len() != regex_array.len() {
@@ -526,7 +525,7 @@ where
526525
)));
527526
}
528527

529-
Ok(Arc::new(Int64Array::from_iter_values(
528+
Ok(Arc::new(
530529
izip!(
531530
values.iter(),
532531
regex_array.iter(),
@@ -541,27 +540,24 @@ where
541540

542541
let pattern =
543542
compile_and_cache_regex(regex, flags, &mut regex_cache)?;
544-
count_matches(value, &pattern, start)
543+
count_matches(value, pattern, start)
545544
})
546-
.collect::<Result<Vec<i64>, ArrowError>>()?,
547-
)))
545+
.collect::<Result<Int64Array, ArrowError>>()?,
546+
))
548547
}
549548
}
550549
}
551550

552-
fn compile_and_cache_regex(
553-
regex: &str,
554-
flags: Option<&str>,
555-
regex_cache: &mut HashMap<String, Regex>,
556-
) -> Result<Regex, ArrowError> {
557-
match regex_cache.entry(regex.to_string()) {
558-
Entry::Vacant(entry) => {
559-
let compiled = compile_regex(regex, flags)?;
560-
entry.insert(compiled.clone());
561-
Ok(compiled)
562-
}
563-
Entry::Occupied(entry) => Ok(entry.get().to_owned()),
551+
fn compile_and_cache_regex<'a>(
552+
regex: &'a str,
553+
flags: Option<&'a str>,
554+
regex_cache: &'a mut HashMap<String, Regex>,
555+
) -> Result<&'a Regex, ArrowError> {
556+
if !regex_cache.contains_key(regex) {
557+
let compiled = compile_regex(regex, flags)?;
558+
regex_cache.insert(regex.to_string(), compiled);
564559
}
560+
Ok(regex_cache.get(regex).unwrap())
565561
}
566562

567563
fn compile_regex(regex: &str, flags: Option<&str>) -> Result<Regex, ArrowError> {

0 commit comments

Comments
 (0)