From 81b2bbf7f8b17571dccfa3b8748db450714b4786 Mon Sep 17 00:00:00 2001 From: taoky Date: Fri, 29 Mar 2024 03:47:09 +0800 Subject: [PATCH] Size DB serialize/deserialize, and miss TTL --- Cargo.lock | 19 ++++++++++++ Cargo.toml | 5 +++- src/main.rs | 84 ++++++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 90 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d3ad35b..0124340 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -125,6 +125,15 @@ version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -181,6 +190,7 @@ dependencies = [ "iana-time-zone", "js-sys", "num-traits", + "serde", "wasm-bindgen", "windows-targets 0.52.4", ] @@ -549,6 +559,12 @@ dependencies = [ "libm", ] +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + [[package]] name = "hyper" version = "1.2.0" @@ -2002,15 +2018,18 @@ name = "yukina" version = "0.1.0" dependencies = [ "anyhow", + "bincode", "chrono", "clap", "console", "humansize", + "humantime", "indicatif", "ipnetwork", "parse-size", "regex", "reqwest", + "serde", "shadow-rs", "sled", "test-log", diff --git a/Cargo.toml b/Cargo.toml index 6f92f1c..139c4e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ description = "YUKI-based Next-generation Async-cache" [dependencies] anyhow = { version = "1.0.81", features = ["backtrace"] } -chrono = "0.4.35" +chrono = { version = "0.4.35", features = ["serde"] } clap = { version = "4.5.3", features = ["derive"] } console = { version = 
"0.15", default-features = false, features = ["ansi-parsing"] } humansize = "2.1.3" @@ -24,6 +24,9 @@ tracing-subscriber = { version = "0.3", features = ["env-filter"] } url = "2.5.0" walkdir = "2.5.0" shadow-rs = "0.26.1" +serde = { version = "1.0", features = ["derive"] } +bincode = "1.3.3" +humantime = "2.1.0" [dev-dependencies] test-log = { version = "0.2.14", default-features = false, features = ["trace"] } diff --git a/src/main.rs b/src/main.rs index 973ea72..bf83530 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,6 +4,7 @@ use clap::Parser; use ipnetwork::{IpNetwork, Ipv4Network, Ipv6Network}; use parse_size::parse_size; use regex::Regex; +use serde::{Deserialize, Serialize}; use std::{ collections::{HashMap, HashSet}, io::{BufRead, BufReader}, @@ -68,6 +69,10 @@ struct Cli { /// A kv database of file size to speed up stage3 in case yukina would run frequently #[clap(long)] size_database: Option, + + /// Size database Miss TTL + #[clap(long, default_value = "2d")] + size_database_ttl: humantime::Duration, } enum LogFileType { @@ -183,6 +188,12 @@ impl Ord for VoteValue { } } +#[derive(Debug, Serialize, Deserialize)] +struct SizeDBItem { + size: Option<u64>, + record_time: DateTime<Utc>, +} + /// Analyse nginx logs and get user votes fn stage1(args: &Cli) -> UserVote { let mut entries: Vec<_> = std::fs::read_dir(&args.log_path) @@ -360,6 +371,18 @@ fn stage2(args: &Cli) -> FileStats { FileStats::new(res) } +fn insert_db(db: Option<&sled::Db>, key: &str, size: Option<u64>) { + if let Some(db) = db { + let size_item = SizeDBItem { + size, + record_time: Utc::now(), + }; + if let Err(e) = db.insert(key, bincode::serialize(&size_item).unwrap()) { + tracing::warn!("Size db insert failed: {}", e); + } + } +} + /// Get size of non-existing files, and normalize vote value async fn stage3( args: &Cli, @@ -373,6 +396,7 @@ async fn stage3( // Stats counters let mut local_hit = 0; let mut sizedb_hit = 0; + let mut sizedb_nonexist = 0; let mut remote_hit = 0; let mut remote_miss = 0; 
@@ -386,22 +410,50 @@ async fn stage3( (*value, true, true) } else { // if size_db, require sled first - let mut size: Option<u64> = None; + let mut size_item: Option<SizeDBItem> = None; + let mut exceeded_miss_ttl = false; if let Some(db) = &size_db { if let Ok(Some(s)) = db.get(url_path) { - if let [b0, b1, b2, b3, b4, b5, b6, b7] = *s { - size = Some(u64::from_le_bytes([b0, b1, b2, b3, b4, b5, b6, b7])); + if let Ok(s) = bincode::deserialize::<SizeDBItem>(&s) { + if s.size.is_none() { + let ttl: std::time::Duration = args.size_database_ttl.into(); + let duration = Utc::now() + .signed_duration_since(s.record_time) + .to_std() + .unwrap_or(std::time::Duration::default()); + if duration > ttl { + exceeded_miss_ttl = true; + let _ = db.remove(url_path); + } + } + size_item = Some(s); } } } - if let Some(size) = size { - tracing::debug!( - "File does not exist locally: {} (sizedb {})", - url_path, - size - ); - sizedb_hit += 1; - (size, false, true) + + // a bit ugly but seems no better solution without nightly rust + let size_db_condition = size_item.is_some() && !exceeded_miss_ttl; + if size_db_condition { + let size_item = size_item.unwrap(); + match size_item.size { + Some(size) => { + if size == 0 { + tracing::warn!("Empty file: {}", url_path); + } + tracing::debug!( + "File does not exist locally: {} (sizedb {})", + url_path, + size + ); + sizedb_hit += 1; + (size, false, true) + } + None => { + tracing::info!("File not found at remote (from sizedb): {}", url_path); + sizedb_nonexist += 1; + (0, false, false) + } + } } else { let url = args .url @@ -428,15 +480,12 @@ async fn stage3( size ); remote_hit += 1; - if let Some(db) = &size_db { - if let Err(e) = db.insert(url_path, &size.to_le_bytes()) { - tracing::warn!("Size db insert failed: {}", e); - } - } + insert_db(size_db.as_ref(), url_path, Some(size)); (size, false, true) } Err(e) => { tracing::info!("Invalid file ({}): {}", e, url_path); + insert_db(size_db.as_ref(), url_path, None); remote_miss += 1; (0, false, false) } @@ -456,9 
+505,10 @@ async fn stage3( } progressbar.finish(); tracing::info!( - "Local hit: {}, SizeDB hit: {}, Remote hit: {}, Remote miss: {}", + "Local hit: {}, SizeDB hit: {}, SizeDB 404: {}, Remote hit: {}, Remote miss: {}", local_hit, sizedb_hit, + sizedb_nonexist, remote_hit, remote_miss );