Skip to content

Commit

Permalink
Merge branch 'neon-mmd:rolling' into 532
Browse files Browse the repository at this point in the history
  • Loading branch information
KekmaTime authored Sep 12, 2024
2 parents c6b9340 + 193b4e3 commit 85a4ba1
Show file tree
Hide file tree
Showing 10 changed files with 710 additions and 640 deletions.
1,148 changes: 593 additions & 555 deletions Cargo.lock

Large diffs are not rendered by default.

18 changes: 7 additions & 11 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "websurfx"
version = "1.17.20"
version = "1.17.22"
edition = "2021"
description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
repository = "https://github.com/neon-mmd/websurfx"
Expand All @@ -27,6 +27,7 @@ tokio = { version = "1.32.0", features = [
], default-features = false }
serde = { version = "1.0.209", default-features = false, features = ["derive"] }
serde_json = { version = "1.0.122", default-features = false }
bincode = {version="1.3.3", default-features=false}
maud = { version = "0.26.0", default-features = false, features = [
"actix-web",
] }
Expand All @@ -48,24 +49,21 @@ mlua = { version = "0.9.9", features = [
redis = { version = "0.25.4", features = [
"tokio-comp",
"connection-manager",
"tcp_nodelay"
], default-features = false, optional = true }
blake3 = { version = "1.5.4", default-features = false }
error-stack = { version = "0.4.0", default-features = false, features = [
"std",
] }
async-trait = { version = "0.1.80", default-features = false }
regex = { version = "1.9.4", features = ["perf"], default-features = false }
smallvec = { version = "1.13.1", features = [
"union",
"serde",
], default-features = false }
futures = { version = "0.3.30", default-features = false, features = ["alloc"] }
dhat = { version = "0.3.2", optional = true, default-features = false }
mimalloc = { version = "0.1.43", default-features = false }
async-once-cell = { version = "0.5.3", default-features = false }
actix-governor = { version = "0.5.0", default-features = false }
mini-moka = { version = "0.10", optional = true, default-features = false, features = [
"sync",
moka = { version = "0.12.8", optional = true, default-features = false, features = [
"future",
] }
async-compression = { version = "0.4.12", default-features = false, features = [
"brotli",
Expand All @@ -82,8 +80,8 @@ base64 = { version = "0.21.5", default-features = false, features = [
cfg-if = { version = "1.0.0", default-features = false, optional = true }
keyword_extraction = { version = "1.4.3", default-features = false, features = [
"tf_idf",
"rayon",
] }

stop-words = { version = "0.8.0", default-features = false, features = ["iso"] }
thesaurus = { version = "0.5.2", default-features = false, optional = true, features = [
"moby",
Expand All @@ -104,8 +102,6 @@ lightningcss = { version = "1.0.0-alpha.57", default-features = false, features
# Temporary fork with fix
minify-js = { git = "https://github.com/RuairidhWilliamson/minify-js", branch = "master", version = "0.6.0", default-features = false}



[profile.dev]
opt-level = 0
debug = true
Expand Down Expand Up @@ -180,7 +176,7 @@ opt-level = "z"
use-synonyms-search = ["thesaurus/static"]
default = ["memory-cache"]
dhat-heap = ["dep:dhat"]
memory-cache = ["dep:mini-moka"]
memory-cache = ["dep:moka"]
redis-cache = ["dep:redis", "dep:base64"]
compress-cache-results = ["dep:async-compression", "dep:cfg-if"]
encrypt-cache-results = ["dep:chacha20poly1305", "dep:chacha20"]
Expand Down
6 changes: 3 additions & 3 deletions flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
haskellPackages.hadolint
nodejs
nodePackages_latest.cspell
nodePackages_latest.eslint
eslint
nodePackages_latest.markdownlint-cli2
nodePackages_latest.stylelint
redis
Expand Down
41 changes: 28 additions & 13 deletions src/cache/cacher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
//! from the upstream search engines in a json format.

use error_stack::Report;
use futures::future::join_all;
#[cfg(feature = "memory-cache")]
use mini_moka::sync::Cache as MokaCache;
#[cfg(feature = "memory-cache")]
use mini_moka::sync::ConcurrentCacheExt;
use moka::future::Cache as MokaCache;

#[cfg(feature = "memory-cache")]
use std::time::Duration;
Expand Down Expand Up @@ -376,29 +375,38 @@ impl Cacher for RedisCache {
}
}
/// TryInto implementation for SearchResults from Vec<u8>
use std::convert::TryInto;
use std::{convert::TryInto, sync::Arc};

impl TryInto<SearchResults> for Vec<u8> {
type Error = CacheError;

fn try_into(self) -> Result<SearchResults, Self::Error> {
serde_json::from_slice(&self).map_err(|_| CacheError::SerializationError)
bincode::deserialize_from(self.as_slice()).map_err(|_| CacheError::SerializationError)
}
}

impl TryInto<Vec<u8>> for &SearchResults {
type Error = CacheError;

fn try_into(self) -> Result<Vec<u8>, Self::Error> {
serde_json::to_vec(self).map_err(|_| CacheError::SerializationError)
bincode::serialize(self).map_err(|_| CacheError::SerializationError)
}
}

/// Memory based cache backend.
#[cfg(feature = "memory-cache")]
pub struct InMemoryCache {
/// The backend cache which stores data.
cache: MokaCache<String, Vec<u8>>,
cache: Arc<MokaCache<String, Vec<u8>>>,
}

#[cfg(feature = "memory-cache")]
impl Clone for InMemoryCache {
fn clone(&self) -> Self {
Self {
cache: self.cache.clone(),
}
}
}

#[cfg(feature = "memory-cache")]
Expand All @@ -408,15 +416,17 @@ impl Cacher for InMemoryCache {
log::info!("Initialising in-memory cache");

InMemoryCache {
cache: MokaCache::builder()
.time_to_live(Duration::from_secs(config.cache_expiry_time.into()))
.build(),
cache: Arc::new(
MokaCache::builder()
.time_to_live(Duration::from_secs(config.cache_expiry_time.into()))
.build(),
),
}
}

async fn cached_results(&mut self, url: &str) -> Result<SearchResults, Report<CacheError>> {
let hashed_url_string = self.hash_url(url);
match self.cache.get(&hashed_url_string) {
match self.cache.get(&hashed_url_string).await {
Some(res) => self.post_process_search_results(res).await,
None => Err(Report::new(CacheError::MissingValue)),
}
Expand All @@ -427,13 +437,18 @@ impl Cacher for InMemoryCache {
search_results: &[SearchResults],
urls: &[String],
) -> Result<(), Report<CacheError>> {
let mut tasks: Vec<_> = Vec::with_capacity(urls.len());
for (url, search_result) in urls.iter().zip(search_results.iter()) {
let hashed_url_string = self.hash_url(url);
let bytes = self.pre_process_search_results(search_result).await?;
self.cache.insert(hashed_url_string, bytes);
let new_self = self.clone();
tasks.push(tokio::spawn(async move {
new_self.cache.insert(hashed_url_string, bytes).await
}));
}

self.cache.sync();
join_all(tasks).await;

Ok(())
}
}
Expand Down
6 changes: 3 additions & 3 deletions src/cache/redis_cacher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ const REDIS_PIPELINE_SIZE: usize = 3;
/// connect to.
pub struct RedisCache {
/// It stores a pool of connections ready to be used.
connection_pool: Vec<ConnectionManager>,
connection_pool: Box<[ConnectionManager]>,
/// It stores the size of the connection pool (in other words the number of
/// connections that should be stored in the pool).
pool_size: u8,
Expand Down Expand Up @@ -58,13 +58,13 @@ impl RedisCache {
}));
}

let mut outputs = Vec::new();
let mut outputs = Vec::with_capacity(tasks.len());
for task in tasks {
outputs.push(task.await??);
}

let redis_cache = RedisCache {
connection_pool: outputs,
connection_pool: outputs.into_boxed_slice(),
pool_size,
current_connection: Default::default(),
cache_ttl,
Expand Down
74 changes: 48 additions & 26 deletions src/models/aggregation_models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

use super::engine_models::EngineError;
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
Expand All @@ -23,7 +22,7 @@ pub struct SearchResult {
/// The description of the search result.
pub description: String,
/// The names of the upstream engines from which this results were provided.
pub engine: SmallVec<[String; 0]>,
pub engine: Vec<String>,
/// The tf-idf score of the result with regard to the title, url and description and the user's query
pub relevance_score: f32,
}
Expand Down Expand Up @@ -153,10 +152,10 @@ impl EngineErrorInfo {
#[serde(rename_all = "camelCase")]
pub struct SearchResults {
/// Stores the individual serializable `SearchResult` struct into a vector of
pub results: Vec<SearchResult>,
pub results: Box<[SearchResult]>,
/// Stores the information on which engines failed with their engine name
/// and the type of error that caused it.
pub engine_errors_info: Vec<EngineErrorInfo>,
pub engine_errors_info: Box<[EngineErrorInfo]>,
/// Stores the flag option which holds the check value that the following
/// search query was disallowed when the safe search level set to 4 and it
/// was present in the `Blocklist` file.
Expand All @@ -183,10 +182,10 @@ impl SearchResults {
/// the search url.

Check warning on line 182 in src/models/aggregation_models.rs

View workflow job for this annotation

GitHub Actions / Rust project

doc list item without indentation
/// * `engine_errors_info` - Takes an array of structs which contains information regarding
/// which engines failed with their names, reason and their severity color name.
pub fn new(results: Vec<SearchResult>, engine_errors_info: &[EngineErrorInfo]) -> Self {
pub fn new(results: Box<[SearchResult]>, engine_errors_info: Box<[EngineErrorInfo]>) -> Self {
Self {
results,
engine_errors_info: engine_errors_info.to_owned(),
engine_errors_info,
disallowed: Default::default(),
filtered: Default::default(),
safe_search_level: Default::default(),
Expand All @@ -205,11 +204,11 @@ impl SearchResults {
}

/// A getter function that gets the value of `engine_errors_info`.
pub fn engine_errors_info(&mut self) -> Vec<EngineErrorInfo> {
pub fn engine_errors_info(&mut self) -> Box<[EngineErrorInfo]> {
std::mem::take(&mut self.engine_errors_info)
}
/// A getter function that gets the value of `results`.
pub fn results(&mut self) -> Vec<SearchResult> {
pub fn results(&mut self) -> Box<[SearchResult]> {
self.results.clone()
}

Expand Down Expand Up @@ -254,27 +253,50 @@ fn calculate_tf_idf(
let tf_idf = TfIdf::new(params);
let tokener = Tokenizer::new(query, stop_words, Some(punctuation));
let query_tokens = tokener.split_into_words();
let mut search_tokens = vec![];

for token in query_tokens {
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
{
// find some synonyms and add them to the search (from wordnet or moby if feature is enabled)
let synonyms = synonyms(&token);
search_tokens.extend(synonyms)
}
search_tokens.push(token);
}
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
let mut extra_tokens = vec![];

let mut total_score = 0.0f32;
for token in search_tokens.iter() {
total_score += tf_idf.get_score(token);
}
let total_score: f32 = query_tokens
.iter()
.map(|token| {
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
{
// find some synonyms and add them to the search (from wordnet or moby if feature is enabled)
extra_tokens.extend(synonyms(token))
}

tf_idf.get_score(token)
})
.sum();

#[cfg(not(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
)))]
let result = total_score / (query_tokens.len() as f32);

#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
let extra_total_score: f32 = extra_tokens
.iter()
.map(|token| tf_idf.get_score(token))
.sum();

let result = total_score / (search_tokens.len() as f32);
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
let result =
(extra_total_score + total_score) / ((query_tokens.len() + extra_tokens.len()) as f32);

f32::from(!result.is_nan()) * result
}
4 changes: 2 additions & 2 deletions src/models/server_models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use super::parser_models::Style;
pub struct SearchParams {
/// It stores the search parameter option `q` (or query in simple words)
/// of the search url.
pub q: Option<String>,
pub q: Option<Cow<'static, str>>,
/// It stores the search parameter `page` (or pageno in simple words)
/// of the search url.
pub page: Option<u32>,
Expand All @@ -29,7 +29,7 @@ pub struct Cookie<'a> {
/// It stores the colorscheme name used for the website theme.
pub colorscheme: Cow<'a, str>,
/// It stores the user selected upstream search engines selected from the UI.
pub engines: Cow<'a, Vec<Cow<'a, str>>>,
pub engines: Cow<'a, [Cow<'a, str>]>,
/// It stores the user selected safe search level from the UI.
pub safe_search_level: u8,
}
Expand Down
Loading

0 comments on commit 85a4ba1

Please sign in to comment.