From 25b1adb0fce7b718bbdc7f3457f40bf207bde3e2 Mon Sep 17 00:00:00 2001 From: billyoyo Date: Tue, 10 Oct 2023 17:42:52 +0100 Subject: [PATCH 1/5] add builtin regex function --- Cargo.lock | 21 ++++++++++++++------- Cargo.toml | 1 + src/primitive/defs.rs | 5 +++++ src/primitive/mod.rs | 12 ++++++++++++ tests/units.ua | 2 ++ 5 files changed, 34 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f14091b8d..cbefa5cbb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2378,25 +2378,25 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.6" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebee201405406dbf528b8b672104ae6d6d63e6d118cb10e4d51abbc7b58044ff" +checksum = "d119d7c7ca818f8a53c300863d4f87566aac09943aef5b355bb83969dae75d87" dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax", + "regex-syntax 0.8.0", ] [[package]] name = "regex-automata" -version = "0.3.9" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9" +checksum = "465c6fc0621e4abc4187a2bda0937bfd4f722c2730b29562e19689ea796c9a4b" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.0", ] [[package]] @@ -2405,6 +2405,12 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +[[package]] +name = "regex-syntax" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3cbb081b9784b07cceb8824c8583f86db4814d172ab043f3c23f7dc600bf83d" + [[package]] name = "reqwest" version = "0.11.22" @@ -2904,7 +2910,7 @@ dependencies = [ "once_cell", "onig", "plist", - "regex-syntax", + "regex-syntax 0.7.5", "serde", "serde_json", "thiserror", @@ -3258,6 +3264,7 @@ dependencies = [ "paste", "rand", "rayon", + "regex", "rustls", "serde", 
"serde_yaml", diff --git a/Cargo.toml b/Cargo.toml index 6a9a8f34b..78fabc7a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,6 +47,7 @@ webpki-roots = { version = "0.25.0", optional = true } viuer = { version = "0.7.1", optional = true } num_cpus = "1.16.0" rayon = "1.8.0" +regex = "1.10.0" [features] audio = ["hodaun", "crossbeam-channel", "lockfree"] diff --git a/src/primitive/defs.rs b/src/primitive/defs.rs index aaf54c070..02d4dbcca 100644 --- a/src/primitive/defs.rs +++ b/src/primitive/defs.rs @@ -1496,6 +1496,11 @@ primitive!( /// [under][now] can be used to time a function. /// ex: ⍜now(5&sl1) (0, Now, Misc, "now"), + /// Match a regex pattern against a string + /// + /// Returns an array of boxed strings, with one string per match + /// ex: regex "([a-z]+)" "hello world" + (2, Regex, Misc, "regex"), /// The number of radians in a quarter circle /// /// Equivalent to `divide``2``pi` or `divide``4``tau` diff --git a/src/primitive/mod.rs b/src/primitive/mod.rs index 69c9d1fca..8bede3074 100644 --- a/src/primitive/mod.rs +++ b/src/primitive/mod.rs @@ -22,6 +22,7 @@ use std::{ use enum_iterator::{all, Sequence}; use once_cell::sync::Lazy; use rand::prelude::*; +use regex::Regex; use crate::{ algorithm::{fork, loops}, @@ -596,6 +597,17 @@ impl Primitive { Primitive::InvTrace => trace(env, true)?, Primitive::Dump => dump(env)?, Primitive::Sys(io) => io.run(env)?, + Primitive::Regex => { + let pattern = env.pop(1)?.as_string(env, "Pattern must be a string")?; + let matching = env.pop(1)?.as_string(env, "Matching target must be a string")?; + + let re = Regex::new(pattern.as_str()).unwrap(); + let matches = re.find_iter(matching.as_str()) + .map(|m| Function::constant(m.as_str()).into()) + .reduce(|a, b| Value::join(a, b, env).unwrap()).unwrap(); + + env.push(matches); + } } Ok(()) } diff --git a/tests/units.ua b/tests/units.ua index 62e3af34a..ec97526bc 100644 --- a/tests/units.ua +++ b/tests/units.ua @@ -132,3 +132,5 @@ ParseOrZero ← ⍣parse⋅⋅0 ⍤.=92 -@\0 @\\ 
⍤.=97 -@\0 @a ⍤.=1114111 -@\0 @\_ + +⍤. ↧⊙(≅ "hello" ⊔⊡0) ≅ "world" ⊔⊡1 . regex "([a-z]+)" "hello world" From e2fd68eecd1e645ebb12954f42e86ec3c507b850 Mon Sep 17 00:00:00 2001 From: billyoyo Date: Tue, 10 Oct 2023 18:02:57 +0100 Subject: [PATCH 2/5] add regex cache to uiua environment --- src/primitive/mod.rs | 3 +-- src/run.rs | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/primitive/mod.rs b/src/primitive/mod.rs index 8bede3074..cf39d3654 100644 --- a/src/primitive/mod.rs +++ b/src/primitive/mod.rs @@ -22,7 +22,6 @@ use std::{ use enum_iterator::{all, Sequence}; use once_cell::sync::Lazy; use rand::prelude::*; -use regex::Regex; use crate::{ algorithm::{fork, loops}, @@ -601,7 +600,7 @@ impl Primitive { let pattern = env.pop(1)?.as_string(env, "Pattern must be a string")?; let matching = env.pop(1)?.as_string(env, "Matching target must be a string")?; - let re = Regex::new(pattern.as_str()).unwrap(); + let re = env.parse_regex_pattern(pattern); let matches = re.find_iter(matching.as_str()) .map(|m| Function::constant(m.as_str()).into()) .reduce(|a, b| Value::join(a, b, env).unwrap()).unwrap(); diff --git a/src/run.rs b/src/run.rs index 4eab4b733..4dcb362fe 100644 --- a/src/run.rs +++ b/src/run.rs @@ -11,6 +11,7 @@ use std::{ use instant::Duration; use parking_lot::Mutex; +use regex::Regex; use crate::{ array::Array, @@ -62,6 +63,7 @@ pub struct Uiua { cli_file_path: PathBuf, /// The system backend pub(crate) backend: Arc, + regex_cache: Arc>>, } #[derive(Clone)] @@ -178,6 +180,7 @@ impl Uiua { cli_file_path: PathBuf::new(), execution_limit: None, execution_start: 0.0, + regex_cache: Arc::new(Mutex::new(HashMap::new())) } } /// Create a new Uiua runtime with a custom IO backend @@ -738,6 +741,17 @@ code: pub(crate) fn func_fill(&self) -> Option> { self.scope.fills.functions.last().cloned() } + pub(crate) fn parse_regex_pattern(&self, pattern: String) -> Regex { + let mut binding = self.regex_cache.lock(); + let 
cached_pattern = binding.get(&pattern); + if cached_pattern.is_none() { + let regex = Regex::new(&pattern).unwrap(); + binding.insert(pattern, regex.clone()); + regex + } else { + cached_pattern.unwrap().clone() + } + } /// Do something with the fill context set pub(crate) fn with_fill( &mut self, @@ -826,6 +840,7 @@ code: backend: self.backend.clone(), execution_limit: self.execution_limit, execution_start: self.execution_start, + regex_cache: self.regex_cache.clone() }; self.backend .spawn(env, Box::new(f)) From 3669fe4a1048b57a079f663d29b8a45435e6a6f2 Mon Sep 17 00:00:00 2001 From: billyoyo Date: Tue, 10 Oct 2023 18:04:30 +0100 Subject: [PATCH 3/5] add cache test for regex function --- tests/units.ua | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/units.ua b/tests/units.ua index ec97526bc..4b25966a3 100644 --- a/tests/units.ua +++ b/tests/units.ua @@ -134,3 +134,4 @@ ParseOrZero ← ⍣parse⋅⋅0 ⍤.=1114111 -@\0 @\_ ⍤. ↧⊙(≅ "hello" ⊔⊡0) ≅ "world" ⊔⊡1 . regex "([a-z]+)" "hello world" +⍤. ↧⊙(≅ "hello" ⊔⊡0) ≅ "world" ⊔⊡1 . regex "([a-z]+)" "hello world" From 78282468a1e6f2dca8aa5558aaff882773c8ecf2 Mon Sep 17 00:00:00 2001 From: billyoyo Date: Tue, 10 Oct 2023 23:25:59 +0100 Subject: [PATCH 4/5] create thread local cache and remove regex cache from env --- src/primitive/mod.rs | 36 +++++++++++++++++++++++++++++++----- src/run.rs | 15 --------------- tests/units.ua | 2 ++ 3 files changed, 33 insertions(+), 20 deletions(-) diff --git a/src/primitive/mod.rs b/src/primitive/mod.rs index cf39d3654..7d10b6cb5 100644 --- a/src/primitive/mod.rs +++ b/src/primitive/mod.rs @@ -16,12 +16,15 @@ use std::{ sync::{ atomic::{self, AtomicUsize}, OnceLock, + Arc }, + collections::HashMap, }; use enum_iterator::{all, Sequence}; use once_cell::sync::Lazy; use rand::prelude::*; +use regex::Regex; use crate::{ algorithm::{fork, loops}, @@ -35,6 +38,10 @@ use crate::{ Uiua, UiuaError, UiuaResult, }; +thread_local! 
{ + pub static REGEX_CACHE: RefCell> = RefCell::new(HashMap::new()); +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Sequence)] pub enum PrimClass { Stack, @@ -600,12 +607,31 @@ impl Primitive { let pattern = env.pop(1)?.as_string(env, "Pattern must be a string")?; let matching = env.pop(1)?.as_string(env, "Matching target must be a string")?; - let re = env.parse_regex_pattern(pattern); - let matches = re.find_iter(matching.as_str()) - .map(|m| Function::constant(m.as_str()).into()) - .reduce(|a, b| Value::join(a, b, env).unwrap()).unwrap(); + let re = REGEX_CACHE.with_borrow_mut(|cache| { + let cached_pattern = cache.get(&pattern); + if cached_pattern.is_none() { + let regex = Regex::new(&pattern); + if regex.is_ok() { + cache.insert(pattern.clone(), regex.clone().unwrap()); + } + regex + } else { + Ok(cached_pattern.unwrap().clone()) + } + }); + + if re.is_ok() { + let matches = re.unwrap().find_iter(matching.as_str()) + .map(|m| Function::constant(m.as_str()).into()) + .reduce(|a, b| Value::join(a, b, env).unwrap()); - env.push(matches); + env.push(matches.unwrap_or(Array::>::default().into())); + } else { + return Err(env.error(format!( + "Invalid pattern: {}", + pattern + ))) + } } } Ok(()) diff --git a/src/run.rs b/src/run.rs index 4dcb362fe..4eab4b733 100644 --- a/src/run.rs +++ b/src/run.rs @@ -11,7 +11,6 @@ use std::{ use instant::Duration; use parking_lot::Mutex; -use regex::Regex; use crate::{ array::Array, @@ -63,7 +62,6 @@ pub struct Uiua { cli_file_path: PathBuf, /// The system backend pub(crate) backend: Arc, - regex_cache: Arc>>, } #[derive(Clone)] @@ -180,7 +178,6 @@ impl Uiua { cli_file_path: PathBuf::new(), execution_limit: None, execution_start: 0.0, - regex_cache: Arc::new(Mutex::new(HashMap::new())) } } /// Create a new Uiua runtime with a custom IO backend @@ -741,17 +738,6 @@ code: pub(crate) fn func_fill(&self) -> Option> { self.scope.fills.functions.last().cloned() } - pub(crate) fn parse_regex_pattern(&self, pattern: String) -> 
Regex { - let mut binding = self.regex_cache.lock(); - let cached_pattern = binding.get(&pattern); - if cached_pattern.is_none() { - let regex = Regex::new(&pattern).unwrap(); - binding.insert(pattern, regex.clone()); - regex - } else { - cached_pattern.unwrap().clone() - } - } /// Do something with the fill context set pub(crate) fn with_fill( &mut self, @@ -840,7 +826,6 @@ code: backend: self.backend.clone(), execution_limit: self.execution_limit, execution_start: self.execution_start, - regex_cache: self.regex_cache.clone() }; self.backend .spawn(env, Box::new(f)) diff --git a/tests/units.ua b/tests/units.ua index 4b25966a3..f32da4223 100644 --- a/tests/units.ua +++ b/tests/units.ua @@ -135,3 +135,5 @@ ParseOrZero ← ⍣parse⋅⋅0 ⍤. ↧⊙(≅ "hello" ⊔⊡0) ≅ "world" ⊔⊡1 . regex "([a-z]+)" "hello world" ⍤. ↧⊙(≅ "hello" ⊔⊡0) ≅ "world" ⊔⊡1 . regex "([a-z]+)" "hello world" +⍤. ≅ {} regex "([0-9]+)" "hello world" +⍤. ⍣(regex "([a-z]" "hello world") (1;) From c92e67579bbf9aab6b4c4f9b7dbcb64839777483 Mon Sep 17 00:00:00 2001 From: billyoyo Date: Wed, 11 Oct 2023 19:30:20 +0100 Subject: [PATCH 5/5] change with_borrow_mut to with --- src/primitive/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/primitive/mod.rs b/src/primitive/mod.rs index 7d10b6cb5..8646313e2 100644 --- a/src/primitive/mod.rs +++ b/src/primitive/mod.rs @@ -607,7 +607,8 @@ impl Primitive { let pattern = env.pop(1)?.as_string(env, "Pattern must be a string")?; let matching = env.pop(1)?.as_string(env, "Matching target must be a string")?; - let re = REGEX_CACHE.with_borrow_mut(|cache| { + let re = REGEX_CACHE.with(|cache_ref| { + let mut cache = cache_ref.borrow_mut(); let cached_pattern = cache.get(&pattern); if cached_pattern.is_none() { let regex = Regex::new(&pattern);