From ac70409b55b27adb9c539c5123b908818ec5a57e Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 16 Jul 2024 09:14:31 -0400 Subject: [PATCH 01/14] Add logging around Zarr manifest cache interactions --- src/zarrman/mod.rs | 78 ++++++++++++++++++++++++++++++---------- src/zarrman/resources.rs | 2 +- 2 files changed, 61 insertions(+), 19 deletions(-) diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 769cff8..19e2064 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -20,9 +20,13 @@ pub(crate) use self::resources::*; use crate::dav::ErrorClass; use crate::httputil::{BuildClientError, Client, HttpError, HttpUrl}; use crate::paths::{Component, PureDirPath, PurePath}; -use moka::future::{Cache, CacheBuilder}; +use moka::{ + future::{Cache, CacheBuilder}, + ops::compute::{CompResult, Op}, +}; use serde::Deserialize; use std::sync::Arc; +use std::time::Duration; use thiserror::Error; /// The manifest root URL. @@ -77,6 +81,15 @@ impl ZarrManClient { let inner = Client::new()?; let manifests = CacheBuilder::new(MANIFEST_CACHE_SIZE) .name("zarr-manifests") + .time_to_idle(Duration::from_secs(300)) + .eviction_listener(|path, _, cause| { + tracing::debug!( + event = "manifest_cache_evict", + manifest = ?path, + ?cause, + "Zarr manifest evicted from cache", + ); + }) .build(); let manifest_root_url = MANIFEST_ROOT_URL .parse::() @@ -266,17 +279,52 @@ impl ZarrManClient { &self, path: &ManifestPath, ) -> Result, ZarrManError> { - self.manifests - .try_get_with_by_ref(path, async move { - self.inner - .get_json::( - path.under_manifest_root(&self.manifest_root_url), - ) - .await - .map(Arc::new) + let result = self + .manifests + .entry_by_ref(path) + .and_try_compute_with(|entry| async move { + if entry.is_none() { + tracing::debug!( + event = "manifest_cache_miss_pre", + manifest = ?path, + cache_len = self.manifests.entry_count(), + "Cache miss for Zarr manifest; about to fetch from repository", + ); + self.inner + .get_json::( + path.under_manifest_root(&self.manifest_root_url), + ) + .await + .map(|zman| Op::Put(Arc::new(zman))) + } else { + Ok(Op::Nop) + } }) - .await - .map_err(Into::into) + .await?; + let entry = match result { + CompResult::Inserted(entry) => { + tracing::debug!( + event = "manifest_cache_miss_post", + manifest = ?path, + cache_len = self.manifests.entry_count(), + "Fetched Zarr manifest from repository", + ); + entry + } + CompResult::Unchanged(entry) => { + tracing::debug!( + event = "manifest_cache_hit", + manifest = ?path, + cache_len = self.manifests.entry_count(), + "Fetched Zarr manifest from cache", + ); + entry + } + _ => unreachable!( + "Call to and_try_compute_with() should only ever return Inserted or Unchanged" + ), + }; + Ok(entry.into_value()) } /// Convert the [`manifest::ManifestEntry`] `entry` with path `entry_path` @@ -345,7 +393,7 @@ impl ZarrManClient { pub(crate) enum ZarrManError { /// An HTTP error occurred while interacting with the manifest tree #[error(transparent)] - Http(#[from] Arc), + Http(#[from] HttpError), /// The request path was invalid for the `/zarrs/` hierarchy #[error("invalid path requested: {path:?}")] @@ -371,12 +419,6 @@ impl ZarrManError { } } -impl From for ZarrManError { - fn from(e: HttpError) -> ZarrManError { - Arc::new(e).into() - } -} - /// A directory listing parsed from the response to a `GET` request to a /// directory in the manifest tree #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] diff --git a/src/zarrman/resources.rs b/src/zarrman/resources.rs index b793305..1681665 100644 --- a/src/zarrman/resources.rs +++ b/src/zarrman/resources.rs @@ -77,7 +77,7 @@ impl fmt::Debug for ManifestPath { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, - r#""{}/{}/{}/""#, + r#""{}{}/{}/""#, self.prefix.escape_debug(), self.zarr_id.escape_debug(), self.checksum.escape_debug() From 7a188a090c0670b92bbce3307f21b5d0b0c9f0c3 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 16 Jul 2024 09:24:27 -0400 Subject: [PATCH 02/14] Format logs as JSON --- Cargo.lock | 13 +++++++++++++ Cargo.toml | 2 +- src/main.rs | 4 ++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bd17f11..05cc90b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2897,6 +2897,16 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-serde" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +dependencies = [ + "serde", + "tracing-core", +] + [[package]] name = "tracing-subscriber" version = "0.3.18" @@ -2904,12 +2914,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" dependencies = [ "nu-ansi-term", + "serde", + "serde_json", "sharded-slab", "smallvec", "thread_local", "time", "tracing-core", "tracing-log", + "tracing-serde", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 20ce04b..e3bed8d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,7 +48,7 @@ tokio = { version = "1.39.3", features = ["macros", "net", "rt-multi-thread"] } tower = { version = "0.5.0", features = ["util"] } tower-http = { version = "0.5.2", features = ["set-header", "trace"] } tracing = "0.1.40" -tracing-subscriber = { version = "0.3.18", features = ["local-time", "time"] } +tracing-subscriber = { version = "0.3.18", features = ["json", "local-time", "time"] } url = { version = "2.5.2", features = ["serde"] } xml-rs = "0.8.21" diff --git a/src/main.rs b/src/main.rs index 19b5e68..b6bee20 100644 --- a/src/main.rs +++ b/src/main.rs @@ -29,7 +29,7 @@ use axum::{ }; use clap::Parser; use std::fmt; -use std::io::{stderr, IsTerminal}; +use std::io::stderr; use std::net::IpAddr; use std::sync::Arc; use tower::service_fn; @@ -77,8 +77,8 @@ fn main() -> anyhow::Result<()> { tracing_subscriber::registry() .with( tracing_subscriber::fmt::layer() + .json() .with_timer(timer) - .with_ansi(stderr().is_terminal()) .with_writer(stderr), ) .with( From 749c4f166d30511ff0a9191fb31aec9b6efb35dc Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 16 Jul 2024 09:31:38 -0400 Subject: [PATCH 03/14] Adjust display of Zarr manifest paths in logs --- src/zarrman/mod.rs | 8 ++++---- src/zarrman/resources.rs | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 19e2064..8269154 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -85,7 +85,7 @@ impl ZarrManClient { .eviction_listener(|path, _, cause| { tracing::debug!( event = "manifest_cache_evict", - manifest = ?path, + manifest = %path, ?cause, "Zarr manifest evicted from cache", ); @@ -286,7 +286,7 @@ impl ZarrManClient { if entry.is_none() { tracing::debug!( event = "manifest_cache_miss_pre", - manifest = ?path, + manifest = %path, cache_len = self.manifests.entry_count(), "Cache miss for Zarr manifest; about to fetch from repository", ); @@ -305,7 +305,7 @@ impl ZarrManClient { CompResult::Inserted(entry) => { tracing::debug!( event = "manifest_cache_miss_post", - manifest = ?path, + manifest = %path, cache_len = self.manifests.entry_count(), "Fetched Zarr manifest from repository", ); @@ -314,7 +314,7 @@ impl ZarrManClient { CompResult::Unchanged(entry) => { tracing::debug!( event = "manifest_cache_hit", - manifest = ?path, + manifest = %path, cache_len = self.manifests.entry_count(), "Fetched Zarr manifest from cache", ); diff --git a/src/zarrman/resources.rs b/src/zarrman/resources.rs index 1681665..b3322d9 100644 --- a/src/zarrman/resources.rs +++ b/src/zarrman/resources.rs @@ -73,15 +73,15 @@ impl ManifestPath { } } +impl fmt::Display for ManifestPath { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}{}/{}.json", self.prefix, self.zarr_id, self.checksum) + } +} + impl fmt::Debug for ManifestPath { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - r#""{}{}/{}/""#, - self.prefix.escape_debug(), - self.zarr_id.escape_debug(), - self.checksum.escape_debug() - ) + write!(f, "{:?}", self.to_string()) } } From e4e2c547b924c8e89efe2b04d4535d5b51a15683 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 16 Jul 2024 09:40:15 -0400 Subject: [PATCH 04/14] Rename `cache_len` to `approx_cache_len` --- src/zarrman/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 8269154..98a6c95 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -287,7 +287,7 @@ impl ZarrManClient { tracing::debug!( event = "manifest_cache_miss_pre", manifest = %path, - cache_len = self.manifests.entry_count(), + approx_cache_len = self.manifests.entry_count(), "Cache miss for Zarr manifest; about to fetch from repository", ); self.inner @@ -306,7 +306,7 @@ impl ZarrManClient { tracing::debug!( event = "manifest_cache_miss_post", manifest = %path, - cache_len = self.manifests.entry_count(), + approx_cache_len = self.manifests.entry_count(), "Fetched Zarr manifest from repository", ); entry @@ -315,7 +315,7 @@ impl ZarrManClient { tracing::debug!( event = "manifest_cache_hit", manifest = %path, - cache_len = self.manifests.entry_count(), + approx_cache_len = self.manifests.entry_count(), "Fetched Zarr manifest from cache", ); entry From 27ebcb4f1e15a41b59ea2d5d67213e4552c2632b Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 16 Jul 2024 09:56:28 -0400 Subject: [PATCH 05/14] Adjust cache log event fields In case we ever add logs with different types of events or for other caches --- src/zarrman/mod.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 98a6c95..59420bc 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -84,7 +84,8 @@ impl ZarrManClient { .time_to_idle(Duration::from_secs(300)) .eviction_listener(|path, _, cause| { tracing::debug!( - event = "manifest_cache_evict", + cache_event = "evict", + cache = "zarr-manifests", manifest = %path, ?cause, "Zarr manifest evicted from cache", @@ -285,7 +286,8 @@ impl ZarrManClient { .and_try_compute_with(|entry| async move { if entry.is_none() { tracing::debug!( - event = "manifest_cache_miss_pre", + cache_event = "miss_pre", + cache = "zarr-manifests", manifest = %path, approx_cache_len = self.manifests.entry_count(), "Cache miss for Zarr manifest; about to fetch from repository", @@ -304,7 +306,8 @@ impl ZarrManClient { let entry = match result { CompResult::Inserted(entry) => { tracing::debug!( - event = "manifest_cache_miss_post", + cache_event = "miss_post", + cache = "zarr-manifests", manifest = %path, approx_cache_len = self.manifests.entry_count(), "Fetched Zarr manifest from repository", @@ -313,7 +316,8 @@ impl ZarrManClient { } CompResult::Unchanged(entry) => { tracing::debug!( - event = "manifest_cache_hit", + cache_event = "hit", + cache = "zarr-manifests", manifest = %path, approx_cache_len = self.manifests.entry_count(), "Fetched Zarr manifest from cache", From 4b658f72940df6a22459665988041632606170e7 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 17 Jul 2024 09:29:07 -0400 Subject: [PATCH 06/14] Emit a span for `get_zarr_manifest()` --- Cargo.lock | 2 ++ Cargo.toml | 1 + src/main.rs | 3 +-- src/zarrman/mod.rs | 1 + 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 05cc90b..c910cef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -853,6 +853,7 @@ dependencies = [ "tracing", "tracing-subscriber", "url", + "uuid", "xml-rs", ] @@ -3057,6 +3058,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" dependencies = [ "getrandom", + "rand", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index e3bed8d..cbd2c9a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,6 +50,7 @@ tower-http = { version = "0.5.2", features = ["set-header", "trace"] } tracing = "0.1.40" tracing-subscriber = { version = "0.3.18", features = ["json", "local-time", "time"] } url = { version = "2.5.2", features = ["serde"] } +uuid = { version = "1.10.0", features = ["fast-rng", "v4"] } xml-rs = "0.8.21" [dev-dependencies] diff --git a/src/main.rs b/src/main.rs index b6bee20..83dd6d9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -29,7 +29,6 @@ use axum::{ }; use clap::Parser; use std::fmt; -use std::io::stderr; use std::net::IpAddr; use std::sync::Arc; use tower::service_fn; @@ -79,7 +78,7 @@ fn main() -> anyhow::Result<()> { tracing_subscriber::fmt::layer() .json() .with_timer(timer) - .with_writer(stderr), + .with_writer(std::io::stderr), ) .with( Targets::new() diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 59420bc..222ac19 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -276,6 +276,7 @@ impl ZarrManClient { /// Retrieve the Zarr manifest at the given [`ManifestPath`] in the /// manifest tree, either via an HTTP request or from a cache + #[tracing::instrument(skip_all, fields(id = %uuid::Uuid::new_v4(), manifest = %path))] async fn get_zarr_manifest( &self, path: &ManifestPath, From 29560ee2c7cfc4813802d7415e3238e228dfd6a6 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 26 Jul 2024 13:09:05 -0400 Subject: [PATCH 07/14] Script for analyzing caching logs --- tools/zarr-cache-stats.py | 297 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 297 insertions(+) create mode 100755 tools/zarr-cache-stats.py diff --git a/tools/zarr-cache-stats.py b/tools/zarr-cache-stats.py new file mode 100755 index 0000000..f8f7f01 --- /dev/null +++ b/tools/zarr-cache-stats.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.11" +# dependencies = [] +# /// + +""" +This script takes as arguments one or more `dandidav` Heroku logs (assumed to +be consecutive) and outputs various statistics related to the Zarr +manifest cache. +""" + +from __future__ import annotations +from collections import Counter +from collections.abc import Iterable, Iterator +from dataclasses import dataclass, field +from datetime import datetime +from enum import Enum +import json +import logging +from pathlib import Path +import re +import statistics +import sys +import textwrap + +log = logging.getLogger(__name__) + + +@dataclass +class HerokuLog: + timestamp: datetime + target: str + message: str + line: str + + +@dataclass +class Events: + misses: list[Miss] = field(default_factory=list) + killed_misses: list[KilledMiss] = field(default_factory=list) + hits: list[Hit] = field(default_factory=list) + evictions: list[Eviction] = field(default_factory=list) + + +@dataclass(frozen=True) +class MissKey: + span_id: str + manifest_path: str + + +@dataclass +class MissMoment: + timestamp: datetime + approx_cache_len: int + + +@dataclass +class Miss: + manifest_path: str + start: MissMoment + end: MissMoment + #: Number of other misses in progress as of the end of this miss + parallel: int + + +@dataclass +class KilledMiss: + manifest_path: str + start: MissMoment + killed_at: datetime + + +@dataclass +class Hit: + manifest_path: str + timestamp: datetime + # `last_access` is `None` if there was no previous access, presumably + # because such occurred prior to the start of the given logs. + last_access: datetime | None + approx_cache_len: int + #: The zero-based index of this cache entry among all entries when ordered + #: from most-recently accessed (hit or missed) to least-recently accessed + #: as of just before this hit, i.e., the number of other entries in the + #: cache that were accessed since the last time this entry was accessed + recency_index: int + + +@dataclass +class Eviction: + manifest_path: str + timestamp: datetime + cause: RemovalCause + + +class RemovalCause(Enum): + # + EXPIRED = "Expired" + EXPLICIT = "Explicit" + REPLACED = "Replaced" + SIZE = "Size" + + +@dataclass +class Statistics: + qty: int + mean: float + stddev: float + minimum: float + q1: float + median: float + q3: float + maximum: float + + @classmethod + def for_values(cls, values: Iterable[float]) -> Statistics: + vals = list(values) + qty = len(vals) + minimum = min(vals) + maximum = max(vals) + mean = statistics.fmean(vals) + stddev = statistics.stdev(vals, xbar=mean) + median = statistics.median(vals) + q1, _, q3 = statistics.quantiles(vals) + return cls( + qty=qty, + mean=mean, + stddev=stddev, + minimum=minimum, + q1=q1, + median=median, + q3=q3, + maximum=maximum, + ) + + def __str__(self) -> str: + return ( + f"Qty: {self.qty}\n" + f"Min: {self.minimum}\n" + f"Q1: {self.q1}\n" + f"Med: {self.median}\n" + f"Q3: {self.q3}\n" + f"Max: {self.maximum}\n" + f"Avg: {self.mean}\n" + f"Stddev: {self.stddev}" + ) + + +def main() -> None: + logging.basicConfig( + format="%(asctime)s [%(levelname)-8s] %(message)s", + datefmt="%H:%M:%S", + level=logging.DEBUG, + ) + events = process_logs(map(Path, sys.argv[1:])) + summarize(events) + + +def iterlogs(filepaths: Iterable[Path]) -> Iterator[HerokuLog]: + seen = set() + for p in filepaths: + log.info("Processing %s ...", p) + with p.open() as fp: + for line in fp: + # Sometimes the same log message ends up in more than one log + # file, so weed out duplicates: + if line in seen: + continue + seen.add(line) + m = re.match( + r"(?P\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d\.\d+[-+]\d\d:\d\d)" + r" (?P\S+): ", + line, + ) + if not m: + log.warning("Failed to parse Heroku log line: %s", line) + continue + timestamp = datetime.fromisoformat(m["timestamp"]) + target = m["target"] + message = line[m.end() :].strip() + yield HerokuLog( + timestamp=timestamp, target=target, message=message, line=line + ) + + +def process_logs(logfiles: Iterable[Path]) -> Events: + events = Events() + misses_in_progress = {} + last_accesses = {} + for lg in iterlogs(logfiles): + if lg.target == "app[web.1]": + entry = json.loads(lg.message) + timestamp = datetime.fromisoformat(entry["timestamp"]) + fields = entry.get("fields", {}) + if "cache_event" in fields and fields.get("cache") == "zarr-manifests": + span_id: str | None = entry.get("span", {}).get("id") + manifest_path: str = fields["manifest"] + match fields["cache_event"]: + case "miss_pre": + assert span_id is not None + misses_in_progress[ + MissKey(span_id=span_id, manifest_path=manifest_path) + ] = MissMoment( + timestamp=timestamp, + approx_cache_len=fields["approx_cache_len"], + ) + case "miss_post": + assert span_id is not None + key = MissKey(span_id=span_id, manifest_path=manifest_path) + if (start := misses_in_progress.pop(key, None)) is not None: + end = MissMoment( + timestamp=timestamp, + approx_cache_len=fields["approx_cache_len"], + ) + events.misses.append( + Miss( + manifest_path=manifest_path, + start=start, + end=end, + parallel=len(misses_in_progress), + ) + ) + last_accesses[manifest_path] = timestamp + case "hit": + last_access = last_accesses.get(manifest_path) + if last_access is not None: + recency_index = sum( + 1 for ts in last_accesses.values() if ts > last_access + ) + else: + recency_index = len(last_accesses) + events.hits.append( + Hit( + manifest_path=manifest_path, + timestamp=timestamp, + last_access=last_access, + approx_cache_len=fields["approx_cache_len"], + recency_index=recency_index, + ) + ) + last_accesses[manifest_path] = timestamp + case "evict": + events.evictions.append( + Eviction( + manifest_path=manifest_path, + timestamp=timestamp, + cause=RemovalCause(fields["cause"]), + ) + ) + last_accesses.pop(manifest_path, None) + case other: + log.warning( + "Invalid 'cache_event' field value %r: %s", other, lg.line + ) + elif lg.target == "heroku[web.1]" and lg.message in ( + "Stopping all processes with SIGTERM", + "Stopping process with SIGKILL", + ): + events.killed_misses.extend( + KilledMiss( + manifest_path=k.manifest_path, start=v, killed_at=lg.timestamp + ) + for k, v in misses_in_progress.items() + ) + misses_in_progress = {} + last_accesses = {} + return events + + +def summarize(events: Events) -> None: + miss_duration_stats = Statistics.for_values( + (m.end.timestamp - m.start.timestamp).total_seconds() for m in events.misses + ) + print("Miss durations:") + print(textwrap.indent(str(miss_duration_stats), " " * 4)) + print() + + miss_parallel_stats = Statistics.for_values(m.parallel for m in events.misses) + print("Miss parallels:") + print(textwrap.indent(str(miss_parallel_stats), " " * 4)) + print() + + hit_recencies = Counter(hit.recency_index for hit in events.hits) + print("Hit recencies:") + print(" Miss:", len(events.misses)) + for recency, qty in sorted(hit_recencies.items()): + print(f" {recency}: {qty}") + print() + + evict_stats = Counter(ev.cause for ev in events.evictions) + print("Eviction causes:") + for cause, qty in sorted(evict_stats.items()): + print(f" {cause}: {qty}") + + +if __name__ == "__main__": + main() From d1ab6a71b524541eee71639c9469ad157d3950a0 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 26 Jul 2024 13:14:37 -0400 Subject: [PATCH 08/14] Move the manifest cache time-to-idle to a const --- src/zarrman/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 222ac19..a02513d 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -50,6 +50,9 @@ static ENTRY_DOWNLOAD_PREFIX: &str = "https://dandiarchive.s3.amazonaws.com/zarr /// The maximum number of manifests cached at once const MANIFEST_CACHE_SIZE: u64 = 16; +/// Expire any manifest cache entries that haven't been accessed for this long +const MANIFEST_CACHE_IDLE_EXPIRY: Duration = Duration::from_secs(300); + /// A client for fetching data about Zarrs via Zarr manifest files #[derive(Clone, Debug)] pub(crate) struct ZarrManClient { @@ -81,7 +84,7 @@ impl ZarrManClient { let inner = Client::new()?; let manifests = CacheBuilder::new(MANIFEST_CACHE_SIZE) .name("zarr-manifests") - .time_to_idle(Duration::from_secs(300)) + .time_to_idle(MANIFEST_CACHE_IDLE_EXPIRY) .eviction_listener(|path, _, cause| { tracing::debug!( cache_event = "evict", From e920ecec84a4cb678a2a36a24d680fc9edf19b8d Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 5 Aug 2024 10:34:47 -0400 Subject: [PATCH 09/14] Log sizes of individual Zarr manifest cache entries --- Cargo.lock | 170 ++++++++++++++++++++++++++++++++++++---- Cargo.toml | 1 + src/paths/component.rs | 11 +++ src/zarrman/manifest.rs | 8 +- src/zarrman/mod.rs | 6 +- 5 files changed, 177 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c910cef..793d490 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -81,7 +81,35 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.75", +] + +[[package]] +name = "attribute-derive" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c124f12ade4e670107b132722d0ad1a5c9790bcbc1b265336369ea05626b4498" +dependencies = [ + "attribute-derive-macro", + "proc-macro2", + "quote", + "syn 2.0.75", +] + +[[package]] +name = "attribute-derive-macro" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b217a07446e0fb086f83401a98297e2d81492122f5874db5391bd270a185f88" +dependencies = [ + "collection_literals", + "interpolator", + "proc-macro-error", + "proc-macro-utils", + "proc-macro2", + "quote", + "quote-use", + "syn 2.0.75", ] [[package]] @@ -678,7 +706,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.75", ] [[package]] @@ -687,6 +715,12 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" +[[package]] +name = "collection_literals" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186dce98367766de751c42c4f03970fc60fc012296e706ccbb9d5df9b6c1e271" + [[package]] name = "concurrent-queue" version = "2.5.0" @@ -828,6 +862,7 @@ dependencies = [ "clap", "enum_dispatch", "futures-util", + "get-size", "humansize", "indoc", "itertools", @@ -877,6 +912,17 @@ dependencies = [ "serde", ] +[[package]] +name = "derive-where" +version = "1.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62d671cc41a825ebabc75757b62d3d168c577f9149b2d49ece1dad1f72119d25" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.75", +] + [[package]] name = "diff" version = "0.1.13" @@ -941,7 +987,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.75", ] [[package]] @@ -1068,7 +1114,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.75", ] [[package]] @@ -1111,6 +1157,26 @@ dependencies = [ "version_check", ] +[[package]] +name = "get-size" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b61e2dab7eedce93a83ab3468b919873ff16bac5a3e704011ff836d22b2120" +dependencies = [ + "get-size-derive", +] + +[[package]] +name = "get-size-derive" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13a1bcfb855c1f340d5913ab542e36f25a1c56f57de79022928297632435dec2" +dependencies = [ + "attribute-derive", + "quote", + "syn 2.0.75", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -1456,6 +1522,12 @@ dependencies = [ "web-sys", ] +[[package]] +name = "interpolator" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71dd52191aae121e8611f1e8dc3e324dd0dd1dee1e6dd91d10ee07a3cfb4d9d8" + [[package]] name = "ipnet" version = "2.9.0" @@ -1796,7 +1868,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn", + "syn 2.0.75", ] [[package]] @@ -1827,7 +1899,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.75", ] [[package]] @@ -1877,6 +1949,41 @@ dependencies = [ "yansi", ] +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro-utils" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f59e109e2f795a5070e69578c4dc101068139f74616778025ae1011d4cd41a8" +dependencies = [ + "proc-macro2", + "quote", + "smallvec", +] + [[package]] name = "proc-macro2" version = "1.0.86" @@ -1958,6 +2065,29 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "quote-use" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7b5abe3fe82fdeeb93f44d66a7b444dedf2e4827defb0a8e69c437b2de2ef94" +dependencies = [ + "quote", + "quote-use-macros", + "syn 2.0.75", +] + +[[package]] +name = "quote-use-macros" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97ea44c7e20f16017a76a245bb42188517e13d16dcb1aa18044bc406cdc3f4af" +dependencies = [ + "derive-where", + "proc-macro2", + "quote", + "syn 2.0.75", +] + [[package]] name = "rand" version = "0.8.5" @@ -2192,7 +2322,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn", + "syn 2.0.75", "unicode-ident", ] @@ -2433,7 +2563,7 @@ checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.75", ] [[package]] @@ -2600,6 +2730,16 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.75" @@ -2672,7 +2812,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.75", ] [[package]] @@ -2758,7 +2898,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.75", ] [[package]] @@ -2874,7 +3014,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.75", ] [[package]] @@ -3126,7 +3266,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.75", "wasm-bindgen-shared", ] @@ -3160,7 +3300,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.75", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3218,7 +3358,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -3412,7 +3552,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.75", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index cbd2c9a..31d0618 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,7 @@ bytes = "1.7.1" clap = { version = "4.5.16", default-features = false, features = ["derive", "error-context", "help", "std", "suggestions", "usage", "wrap_help"] } enum_dispatch = "0.3.13" futures-util = "0.3.30" +get-size = { version = "0.1.4", features = ["derive"] } humansize = "2.1.3" indoc = "2.0.5" itertools = "0.13.0" diff --git a/src/paths/component.rs b/src/paths/component.rs index 580c3fa..36dc694 100644 --- a/src/paths/component.rs +++ b/src/paths/component.rs @@ -1,3 +1,4 @@ +use get_size::GetSize; use smartstring::alias::CompactString; use thiserror::Error; @@ -36,6 +37,16 @@ impl Component { } } +impl GetSize for Component { + fn get_heap_size(&self) -> usize { + if self.0.is_inline() { + 0 + } else { + self.0.capacity() + } + } +} + #[derive(Clone, Copy, Debug, Eq, Error, PartialEq)] pub(crate) enum ParseComponentError { #[error("path components cannot be empty")] diff --git a/src/zarrman/manifest.rs b/src/zarrman/manifest.rs index 65fdcf3..721a82c 100644 --- a/src/zarrman/manifest.rs +++ b/src/zarrman/manifest.rs @@ -1,11 +1,12 @@ use crate::paths::{Component, PurePath}; +use get_size::GetSize; use itertools::{Itertools, Position}; use serde::Deserialize; use std::collections::BTreeMap; use time::OffsetDateTime; /// A parsed Zarr manifest -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +#[derive(Clone, Debug, Deserialize, Eq, GetSize, PartialEq)] pub(super) struct Manifest { /// A tree of the Zarr's entries pub(super) entries: ManifestFolder, @@ -39,7 +40,7 @@ pub(super) enum EntryRef<'a> { /// subdirectory names to the entries & subdirectories pub(super) type ManifestFolder = BTreeMap; -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +#[derive(Clone, Debug, Deserialize, Eq, GetSize, PartialEq)] #[serde(untagged)] pub(super) enum FolderEntry { Folder(ManifestFolder), @@ -48,7 +49,7 @@ pub(super) enum FolderEntry { /// Information on a Zarr entry in a manifest as of the point in time /// represented by the manifest -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +#[derive(Clone, Debug, Deserialize, Eq, GetSize, PartialEq)] pub(super) struct ManifestEntry { // IMPORTANT: Keep these fields in this order so that deserialization will // work properly! @@ -56,6 +57,7 @@ pub(super) struct ManifestEntry { pub(super) version_id: String, /// The entry's S3 object's modification time + #[get_size(size = 0)] // Nothing on the heap #[serde(with = "time::serde::rfc3339")] pub(super) modified: OffsetDateTime, diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index a02513d..0106f1a 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -20,6 +20,7 @@ pub(crate) use self::resources::*; use crate::dav::ErrorClass; use crate::httputil::{BuildClientError, Client, HttpError, HttpUrl}; use crate::paths::{Component, PureDirPath, PurePath}; +use get_size::GetSize; use moka::{ future::{Cache, CacheBuilder}, ops::compute::{CompResult, Op}, @@ -85,11 +86,12 @@ impl ZarrManClient { let manifests = CacheBuilder::new(MANIFEST_CACHE_SIZE) .name("zarr-manifests") .time_to_idle(MANIFEST_CACHE_IDLE_EXPIRY) - .eviction_listener(|path, _, cause| { + .eviction_listener(|path, manifest: Arc, cause| { tracing::debug!( cache_event = "evict", cache = "zarr-manifests", manifest = %path, + manifest_size = manifest.get_size(), ?cause, "Zarr manifest evicted from cache", ); @@ -313,6 +315,7 @@ impl ZarrManClient { cache_event = "miss_post", cache = "zarr-manifests", manifest = %path, + manifest_size = entry.value().get_size(), approx_cache_len = self.manifests.entry_count(), "Fetched Zarr manifest from repository", ); @@ -323,6 +326,7 @@ impl ZarrManClient { cache_event = "hit", cache = "zarr-manifests", manifest = %path, + manifest_size = entry.value().get_size(), approx_cache_len = self.manifests.entry_count(), "Fetched Zarr manifest from cache", ); From b7c13fbdaca1ef447f8d1cc67e8d117a56f34524 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 5 Aug 2024 10:44:11 -0400 Subject: [PATCH 10/14] Limit Zarr manifest cache by total bytes --- src/zarrman/mod.rs | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 0106f1a..2357b05 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -48,8 +48,9 @@ static MANIFEST_ROOT_URL: &str = /// `{ENTRY_DOWNLOAD_PREFIX}/{zarr_id}/{entry_path}`. static ENTRY_DOWNLOAD_PREFIX: &str = "https://dandiarchive.s3.amazonaws.com/zarr/"; -/// The maximum number of manifests cached at once -const MANIFEST_CACHE_SIZE: u64 = 16; +/// Limit the manifest cache to storing no more than this many bytes of parsed +/// manifests at once +const MANIFEST_CACHE_TOTAL_BYTES: u64 = 100 * 1024 * 1024; // 100 MiB /// Expire any manifest cache entries that haven't been accessed for this long const MANIFEST_CACHE_IDLE_EXPIRY: Duration = Duration::from_secs(300); @@ -83,20 +84,24 @@ impl ZarrManClient { /// Returns an error if construction of the inner `reqwest::Client` fails pub(crate) fn new() -> Result { let inner = Client::new()?; - let manifests = CacheBuilder::new(MANIFEST_CACHE_SIZE) - .name("zarr-manifests") - .time_to_idle(MANIFEST_CACHE_IDLE_EXPIRY) - .eviction_listener(|path, manifest: Arc, cause| { - tracing::debug!( - cache_event = "evict", - cache = "zarr-manifests", - manifest = %path, - manifest_size = manifest.get_size(), - ?cause, - "Zarr manifest evicted from cache", - ); - }) - .build(); + let manifests: Cache> = + CacheBuilder::new(MANIFEST_CACHE_TOTAL_BYTES) + .name("zarr-manifests") + .weigher(|_, manifest: &Arc| { + u32::try_from(manifest.get_size()).unwrap_or(u32::MAX) + }) + .time_to_idle(MANIFEST_CACHE_IDLE_EXPIRY) + .eviction_listener(|path, manifest, cause| { + tracing::debug!( + cache_event = "evict", + cache = "zarr-manifests", + manifest = %path, + manifest_size = manifest.get_size(), + ?cause, + "Zarr manifest evicted from cache", + ); + }) + .build(); let manifest_root_url = MANIFEST_ROOT_URL .parse::() .expect("MANIFEST_ROOT_URL should be a valid HTTP URL"); @@ -296,6 +301,7 @@ impl ZarrManClient { cache = "zarr-manifests", manifest = %path, approx_cache_len = self.manifests.entry_count(), + approx_cache_size = self.manifests.weighted_size(), "Cache miss for Zarr manifest; about to fetch from repository", ); self.inner @@ -317,6 +323,7 @@ impl ZarrManClient { manifest = %path, manifest_size = entry.value().get_size(), approx_cache_len = self.manifests.entry_count(), + approx_cache_size = self.manifests.weighted_size(), "Fetched Zarr manifest from repository", ); entry @@ -328,6 +335,7 @@ impl ZarrManClient { manifest = %path, manifest_size = entry.value().get_size(), approx_cache_len = self.manifests.entry_count(), + approx_cache_size = self.manifests.weighted_size(), "Fetched Zarr manifest from cache", ); entry From 7f72bab1f15a3154e6a7a8ef7012d5bd0f679bce Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 5 Aug 2024 11:22:05 -0400 Subject: [PATCH 11/14] Abstract/encapsulate Zarr manifest tree interactions --- src/main.rs | 5 +- src/zarrman/consts.rs | 27 +++++ src/zarrman/fetcher.rs | 137 ++++++++++++++++++++++++++ src/zarrman/mod.rs | 217 +++++------------------------------------ src/zarrman/util.rs | 48 +++++++++ 5 files changed, 237 insertions(+), 197 deletions(-) create mode 100644 src/zarrman/consts.rs create mode 100644 src/zarrman/fetcher.rs create mode 100644 src/zarrman/util.rs diff --git a/src/main.rs b/src/main.rs index 83dd6d9..81c095c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -13,7 +13,7 @@ use crate::consts::{CSS_CONTENT_TYPE, DEFAULT_API_URL, SERVER_VALUE}; use crate::dandi::DandiClient; use crate::dav::{DandiDav, Templater}; use crate::httputil::HttpUrl; -use crate::zarrman::ZarrManClient; +use crate::zarrman::{ManifestFetcher, ZarrManClient}; use anyhow::Context; use axum::{ body::Body, @@ -97,7 +97,8 @@ fn main() -> anyhow::Result<()> { async fn run() -> anyhow::Result<()> { let args = Arguments::parse(); let dandi = DandiClient::new(args.api_url)?; - let zarrman = ZarrManClient::new()?; + let zarrfetcher = ManifestFetcher::new()?; + let zarrman = ZarrManClient::new(zarrfetcher); let templater = Templater::new(args.title)?; let dav = Arc::new(DandiDav { dandi, diff --git a/src/zarrman/consts.rs b/src/zarrman/consts.rs new file mode 100644 index 0000000..110e034 --- /dev/null +++ b/src/zarrman/consts.rs @@ -0,0 +1,27 @@ +//! Constants and compile-time configuration for the `/zarrs/` hierarchy +use std::time::Duration; + +/// The manifest root URL. +/// +/// This is the base URL of the manifest tree (a URL hierarchy containing Zarr +/// manifests). +/// +/// The current value is a subdirectory of a mirror of +/// . +pub(super) static MANIFEST_ROOT_URL: &str = + "https://datasets.datalad.org/dandi/zarr-manifests/zarr-manifests-v2-sorted/"; + +/// The URL beneath which Zarr entries listed in the Zarr manifests should be +/// available for download. +/// +/// Given a Zarr with Zarr ID `zarr_id` and an entry therein at path +/// `entry_path`, the download URL for the entry is expected to be +/// `{ENTRY_DOWNLOAD_PREFIX}/{zarr_id}/{entry_path}`. +pub(super) static ENTRY_DOWNLOAD_PREFIX: &str = "https://dandiarchive.s3.amazonaws.com/zarr/"; + +/// Limit the manifest cache to storing no more than this many bytes of parsed +/// manifests at once +pub(super) const MANIFEST_CACHE_TOTAL_BYTES: u64 = 100 * 1024 * 1024; // 100 MiB + +/// Expire any manifest cache entries that haven't been accessed for this long +pub(super) const MANIFEST_CACHE_IDLE_EXPIRY: Duration = Duration::from_secs(300); diff --git a/src/zarrman/fetcher.rs b/src/zarrman/fetcher.rs new file mode 100644 index 0000000..8de7eb7 --- /dev/null +++ b/src/zarrman/fetcher.rs @@ -0,0 +1,137 @@ +use super::consts::{MANIFEST_CACHE_IDLE_EXPIRY, MANIFEST_CACHE_TOTAL_BYTES, MANIFEST_ROOT_URL}; +use super::manifest::Manifest; +use super::resources::ManifestPath; +use super::util::{Index, ZarrManError}; +use crate::httputil::{BuildClientError, Client, HttpError, HttpUrl}; +use crate::paths::PureDirPath; +use get_size::GetSize; +use moka::{ + future::{Cache, CacheBuilder}, + ops::compute::{CompResult, Op}, +}; +use std::sync::Arc; + +/// A client for fetching & caching data from the manifest tree +#[derive(Clone, Debug)] +pub(crate) struct ManifestFetcher { + /// The HTTP client used for making requests to the manifest tree + inner: Client, + + /// A cache of parsed manifest files, keyed by their path under + /// `MANIFEST_ROOT_URL` + cache: Cache>, + + /// [`MANIFEST_ROOT_URL`], parsed into an [`HttpUrl`] + manifest_root_url: HttpUrl, +} + +impl ManifestFetcher { + /// Construct a new client instance + /// + /// # Errors + /// + /// Returns an error if construction of the inner `reqwest::Client` fails + pub(crate) fn new() -> Result { + let inner = Client::new()?; + let cache: Cache> = + CacheBuilder::new(MANIFEST_CACHE_TOTAL_BYTES) + .name("zarr-manifests") + .weigher(|_, manifest: &Arc| { + u32::try_from(manifest.get_size()).unwrap_or(u32::MAX) + }) + .time_to_idle(MANIFEST_CACHE_IDLE_EXPIRY) + .eviction_listener(|path, manifest, cause| { + tracing::debug!( + cache_event = "evict", + cache = "zarr-manifests", + manifest = %path, + manifest_size = manifest.get_size(), + ?cause, + "Zarr manifest evicted from cache", + ); + }) + .build(); + let manifest_root_url = MANIFEST_ROOT_URL + .parse::() + .expect("MANIFEST_ROOT_URL should be a valid HTTP URL"); + Ok(ManifestFetcher { + inner, + cache, + manifest_root_url, + }) + } + + /// Retrieve the manifest index in the given directory of the manifest + /// tree. + /// + /// `path` must be relative to the manifest root. A `path` of `None` + /// denotes the manifest root itself. + pub(super) async fn fetch_index(&self, path: Option<&PureDirPath>) -> Result { + let mut url = self.manifest_root_url.clone(); + if let Some(p) = path { + url.extend(p.component_strs()).ensure_dirpath(); + } + self.inner.get_json::(url).await + } + + /// Retrieve the Zarr manifest at the given [`ManifestPath`] in the + /// manifest tree, either via an HTTP request or from a cache + #[tracing::instrument(skip_all, fields(id = %uuid::Uuid::new_v4(), manifest = %path))] + pub(super) async fn fetch_manifest( + &self, + path: &ManifestPath, + ) -> Result, ZarrManError> { + let result = self + .cache + .entry_by_ref(path) + .and_try_compute_with(|entry| async move { + if entry.is_none() { + tracing::debug!( + cache_event = "miss_pre", + cache = "zarr-manifests", + manifest = %path, + approx_cache_len = self.cache.entry_count(), + approx_cache_size = self.cache.weighted_size(), + "Cache miss for Zarr manifest; about to fetch from repository", + ); + self.inner + .get_json::(path.under_manifest_root(&self.manifest_root_url)) + .await + .map(|zman| Op::Put(Arc::new(zman))) + } else { + Ok(Op::Nop) + } + }) + .await?; + let entry = match result { + CompResult::Inserted(entry) => { + tracing::debug!( + cache_event = "miss_post", + cache = "zarr-manifests", + manifest = %path, + manifest_size = entry.value().get_size(), + approx_cache_len = self.cache.entry_count(), + approx_cache_size = self.cache.weighted_size(), + "Fetched Zarr manifest from repository", + ); + entry + } + CompResult::Unchanged(entry) => { + tracing::debug!( + cache_event = "hit", + cache = "zarr-manifests", + manifest = %path, + manifest_size = entry.value().get_size(), + approx_cache_len = self.cache.entry_count(), + approx_cache_size = self.cache.weighted_size(), + "Fetched Zarr manifest from cache", + ); + entry + } + _ => unreachable!( + "Call to and_try_compute_with() should only ever return Inserted or Unchanged" + ), + }; + Ok(entry.into_value()) + } +} diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 2357b05..9d25532 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -12,61 +12,25 @@ //! `.json` extension changed to `.zarr`) containing the respective Zarrs' //! entry hierarchies. +mod consts; +mod fetcher; mod manifest; mod path; mod resources; +mod util; +use self::consts::ENTRY_DOWNLOAD_PREFIX; +pub(crate) use self::fetcher::ManifestFetcher; use self::path::ReqPath; pub(crate) use self::resources::*; -use crate::dav::ErrorClass; -use crate::httputil::{BuildClientError, Client, HttpError, HttpUrl}; -use crate::paths::{Component, PureDirPath, PurePath}; -use get_size::GetSize; -use moka::{ - future::{Cache, CacheBuilder}, - ops::compute::{CompResult, Op}, -}; -use serde::Deserialize; -use std::sync::Arc; -use std::time::Duration; -use thiserror::Error; - -/// The manifest root URL. -/// -/// This is the base URL of the manifest tree (a URL hierarchy containing Zarr -/// manifests). -/// -/// The current value is a subdirectory of a mirror of -/// . -static MANIFEST_ROOT_URL: &str = - "https://datasets.datalad.org/dandi/zarr-manifests/zarr-manifests-v2-sorted/"; - -/// The URL beneath which Zarr entries listed in the Zarr manifests should be -/// available for download. -/// -/// Given a Zarr with Zarr ID `zarr_id` and an entry therein at path -/// `entry_path`, the download URL for the entry is expected to be -/// `{ENTRY_DOWNLOAD_PREFIX}/{zarr_id}/{entry_path}`. -static ENTRY_DOWNLOAD_PREFIX: &str = "https://dandiarchive.s3.amazonaws.com/zarr/"; - -/// Limit the manifest cache to storing no more than this many bytes of parsed -/// manifests at once -const MANIFEST_CACHE_TOTAL_BYTES: u64 = 100 * 1024 * 1024; // 100 MiB - -/// Expire any manifest cache entries that haven't been accessed for this long -const MANIFEST_CACHE_IDLE_EXPIRY: Duration = Duration::from_secs(300); +pub(crate) use self::util::ZarrManError; +use crate::httputil::HttpUrl; +use crate::paths::{PureDirPath, PurePath}; /// A client for fetching data about Zarrs via Zarr manifest files #[derive(Clone, Debug)] pub(crate) struct ZarrManClient { - /// The HTTP client used for making requests to the manifest tree - inner: Client, - - /// A cache of parsed manifest files, keyed by their path under - /// `MANIFEST_ROOT_URL` - manifests: Cache>, - - /// [`MANIFEST_ROOT_URL`], parsed into an [`HttpUrl`] - manifest_root_url: HttpUrl, + /// The actual client for fetching & caching Zarr manifests + fetcher: ManifestFetcher, /// [`ENTRY_DOWNLOAD_PREFIX`], parsed into an [`HttpUrl`] entry_download_prefix: HttpUrl, @@ -78,46 +42,18 @@ pub(crate) struct ZarrManClient { impl ZarrManClient { /// Construct a new client instance - /// - /// # Errors - /// - /// Returns an error if construction of the inner `reqwest::Client` fails - pub(crate) fn new() -> Result { - let inner = Client::new()?; - let manifests: Cache> = - CacheBuilder::new(MANIFEST_CACHE_TOTAL_BYTES) - .name("zarr-manifests") - .weigher(|_, manifest: &Arc| { - u32::try_from(manifest.get_size()).unwrap_or(u32::MAX) - }) - .time_to_idle(MANIFEST_CACHE_IDLE_EXPIRY) - .eviction_listener(|path, manifest, cause| { - tracing::debug!( - cache_event = "evict", - cache = "zarr-manifests", - manifest = %path, - manifest_size = manifest.get_size(), - ?cause, - "Zarr manifest evicted from cache", - ); - }) - .build(); - let manifest_root_url = MANIFEST_ROOT_URL - .parse::() - .expect("MANIFEST_ROOT_URL should be a valid HTTP URL"); + pub(crate) fn new(fetcher: ManifestFetcher) -> Self { let entry_download_prefix = ENTRY_DOWNLOAD_PREFIX .parse::() .expect("ENTRY_DOWNLOAD_PREFIX should be a valid HTTP URL"); let web_path_prefix = "zarrs/" .parse::() .expect(r#""zarrs/" should be a valid directory path"#); - Ok(ZarrManClient { - inner, - manifests, - manifest_root_url, + ZarrManClient { + fetcher, entry_download_prefix, web_path_prefix, - }) + } } /// Retrieve the resources at the top level of `/zarrs/`, i.e., those @@ -148,14 +84,14 @@ impl ZarrManClient { } ReqPath::Manifest(path) => { // Make a request to confirm that manifest exists - let _ = self.get_zarr_manifest(&path).await?; + let _ = self.fetcher.fetch_manifest(&path).await?; Ok(ZarrManResource::Manifest(Manifest { path })) } ReqPath::InManifest { manifest_path, entry_path, } => { - let man = self.get_zarr_manifest(&manifest_path).await?; + let man = self.fetcher.fetch_manifest(&manifest_path).await?; match man.get(&entry_path) { Some(manifest::EntryRef::Folder(_)) => { let web_path = manifest_path @@ -197,7 +133,7 @@ impl ZarrManClient { Ok(ZarrManResourceWithChildren::WebFolder { folder, children }) } ReqPath::Manifest(path) => { - let man = self.get_zarr_manifest(&path).await?; + let man = self.fetcher.fetch_manifest(&path).await?; let children = self.convert_manifest_folder_children(&path, None, &man.entries); let folder = Manifest { path }; Ok(ZarrManResourceWithChildren::Manifest { folder, children }) @@ -206,7 +142,7 @@ impl ZarrManClient { manifest_path, entry_path, } => { - let man = self.get_zarr_manifest(&manifest_path).await?; + let man = self.fetcher.fetch_manifest(&manifest_path).await?; match man.get(&entry_path) { Some(manifest::EntryRef::Folder(folref)) => { let web_path = manifest_path @@ -236,18 +172,14 @@ impl ZarrManClient { /// Retrieve the resources in the given directory of the manifest tree. /// - /// `path` must be relative to the manifest root. Unlike the - /// `get_resource*()` methods, Zarr manifests are not transparently - /// converted to collections. + /// `path` must be relative to the manifest root. A `path` of `None` + /// denotes the manifest root itself. Unlike the `get_resource*()` + /// methods, Zarr manifests are not transparently converted to collections. async fn get_index_entries( &self, path: Option<&PureDirPath>, ) -> Result, ZarrManError> { - let mut url = self.manifest_root_url.clone(); - if let Some(p) = path { - url.extend(p.component_strs()).ensure_dirpath(); - } - let index = self.inner.get_json::(url).await?; + let index = self.fetcher.fetch_index(path).await?; let mut entries = Vec::with_capacity(index.files.len().saturating_add(index.directories.len())); if let Some(path) = path { @@ -284,69 +216,6 @@ impl ZarrManClient { Ok(entries) } - /// Retrieve the Zarr manifest at the given [`ManifestPath`] in the - /// manifest tree, either via an HTTP request or from a cache - #[tracing::instrument(skip_all, fields(id = %uuid::Uuid::new_v4(), manifest = %path))] - async fn get_zarr_manifest( - &self, - path: &ManifestPath, - ) -> Result, ZarrManError> { - let result = self - .manifests - .entry_by_ref(path) - .and_try_compute_with(|entry| async move { - if entry.is_none() { - tracing::debug!( - cache_event = "miss_pre", - cache = "zarr-manifests", - manifest = %path, - approx_cache_len = self.manifests.entry_count(), - approx_cache_size = self.manifests.weighted_size(), - "Cache miss for Zarr manifest; about to fetch from repository", - ); - self.inner - .get_json::( - path.under_manifest_root(&self.manifest_root_url), - ) - .await - .map(|zman| Op::Put(Arc::new(zman))) - } else { - Ok(Op::Nop) - } - }) - .await?; - let entry = match result { - CompResult::Inserted(entry) => { - tracing::debug!( - cache_event = "miss_post", - cache = "zarr-manifests", - manifest = %path, - manifest_size = entry.value().get_size(), - approx_cache_len = self.manifests.entry_count(), - approx_cache_size = self.manifests.weighted_size(), - "Fetched Zarr manifest from repository", - ); - entry - } - CompResult::Unchanged(entry) => { - tracing::debug!( - cache_event = "hit", - cache = "zarr-manifests", - manifest = %path, - manifest_size = entry.value().get_size(), - approx_cache_len = self.manifests.entry_count(), - approx_cache_size = self.manifests.weighted_size(), - "Fetched Zarr manifest from cache", - ); - entry - } - _ => unreachable!( - "Call to and_try_compute_with() should only ever return Inserted or Unchanged" - ), - }; - Ok(entry.into_value()) - } - /// Convert the [`manifest::ManifestEntry`] `entry` with path `entry_path` /// in the manifest at `manifest_path` to a [`ManifestEntry`]. /// @@ -408,45 +277,3 @@ impl ZarrManClient { children } } - -#[derive(Debug, Error)] -pub(crate) enum ZarrManError { - /// An HTTP error occurred while interacting with the manifest tree - #[error(transparent)] - Http(#[from] HttpError), - - /// The request path was invalid for the `/zarrs/` hierarchy - #[error("invalid path requested: {path:?}")] - InvalidPath { path: PurePath }, - - /// An request was made for a nonexistent path inside an extant Zarr - #[error("path {entry_path:?} inside manifest at {manifest_path:?} does not exist")] - ManifestPathNotFound { - manifest_path: ManifestPath, - entry_path: PurePath, - }, -} - -impl ZarrManError { - /// Classify the general type of error - pub(crate) fn class(&self) -> ErrorClass { - match self { - ZarrManError::Http(source) => source.class(), - ZarrManError::InvalidPath { .. } | ZarrManError::ManifestPathNotFound { .. } => { - ErrorClass::NotFound - } - } - } -} - -/// A directory listing parsed from the response to a `GET` request to a -/// directory in the manifest tree -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] -struct Index { - // Returned by the manifest tree API but not used by dandidav: - //path: String, - /// The names of the files in the directory - files: Vec, - /// The names of the subdirectories of the directory - directories: Vec, -} diff --git a/src/zarrman/util.rs b/src/zarrman/util.rs new file mode 100644 index 0000000..5e99fae --- /dev/null +++ b/src/zarrman/util.rs @@ -0,0 +1,48 @@ +use super::resources::ManifestPath; +use crate::dav::ErrorClass; +use crate::httputil::HttpError; +use crate::paths::{Component, PurePath}; +use serde::Deserialize; +use thiserror::Error; + +#[derive(Debug, Error)] +pub(crate) enum ZarrManError { + /// An HTTP error occurred while interacting with the manifest tree + #[error(transparent)] + Http(#[from] HttpError), + + /// The request path was invalid for the `/zarrs/` hierarchy + #[error("invalid path requested: {path:?}")] + InvalidPath { path: PurePath }, + + /// An request was made for a nonexistent path inside an extant Zarr + #[error("path {entry_path:?} inside manifest at {manifest_path:?} does not exist")] + ManifestPathNotFound { + manifest_path: ManifestPath, + entry_path: PurePath, + }, +} + +impl ZarrManError { + /// Classify the general type of error + pub(crate) fn class(&self) -> ErrorClass { + match self { + ZarrManError::Http(source) => source.class(), + ZarrManError::InvalidPath { .. } | ZarrManError::ManifestPathNotFound { .. } => { + ErrorClass::NotFound + } + } + } +} + +/// A directory listing parsed from the response to a `GET` request to a +/// directory in the manifest tree +#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +pub(super) struct Index { + // Returned by the manifest tree API but not used by dandidav: + //pub(super) path: String, + /// The names of the files in the directory + pub(super) files: Vec, + /// The names of the subdirectories of the directory + pub(super) directories: Vec, +} From 824bfb7983b725dd8cb6cc450fb1b4fc0b7e5ae4 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 5 Aug 2024 11:51:47 -0400 Subject: [PATCH 12/14] Log cache contents every hour --- Cargo.toml | 4 +-- src/consts.rs | 4 +++ src/main.rs | 5 +++- src/zarrman/fetcher.rs | 52 +++++++++++++++++++++++++++++++++++++++ tools/zarr-cache-stats.py | 2 ++ 5 files changed, 64 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 31d0618..be33bf0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,7 +45,7 @@ smartstring = "1.0.1" tera = { version = "1.20.0", default-features = false } thiserror = "1.0.63" time = { version = "0.3.36", features = ["formatting", "macros", "parsing", "serde"] } -tokio = { version = "1.39.3", features = ["macros", "net", "rt-multi-thread"] } +tokio = { version = "1.39.3", features = ["macros", "net", "rt-multi-thread", "time"] } tower = { version = "0.5.0", features = ["util"] } tower-http = { version = "0.5.2", features = ["set-header", "trace"] } tracing = "0.1.40" @@ -199,7 +199,7 @@ ignored_unit_patterns = "deny" impl_trait_in_params = "deny" implicit_clone = "deny" imprecise_flops = "deny" -infinite_loop = "deny" +#infinite_loop = "deny" index_refutable_slice = "deny" invalid_upcast_comparisons = "deny" items_after_statements = "deny" diff --git a/src/consts.rs b/src/consts.rs index 48bbd93..65761df 100644 --- a/src/consts.rs +++ b/src/consts.rs @@ -1,4 +1,5 @@ //! Constants and program-wide compile-time configuration +use std::time::Duration; use time::{format_description::FormatItem, macros::format_description}; /// The "User-Agent" value sent in outgoing HTTP requests @@ -57,6 +58,9 @@ pub(crate) static HTML_TIMESTAMP_FORMAT: &[FormatItem<'_>] = /// This list must be kept in sorted order; this is enforced by a test below. pub(crate) static FAST_NOT_EXIST: &[&str] = &[".bzr", ".git", ".nols", ".svn"]; +/// Interval between periodic logging of the Zarr manifest cache's contents +pub(crate) const ZARR_MANIFEST_CACHE_DUMP_PERIOD: Duration = Duration::from_secs(3600); + #[cfg(test)] mod tests { use super::*; diff --git a/src/main.rs b/src/main.rs index 81c095c..4a14b91 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,7 +9,9 @@ mod paths; mod s3; mod streamutil; mod zarrman; -use crate::consts::{CSS_CONTENT_TYPE, DEFAULT_API_URL, SERVER_VALUE}; +use crate::consts::{ + CSS_CONTENT_TYPE, DEFAULT_API_URL, SERVER_VALUE, ZARR_MANIFEST_CACHE_DUMP_PERIOD, +}; use crate::dandi::DandiClient; use crate::dav::{DandiDav, Templater}; use crate::httputil::HttpUrl; @@ -98,6 +100,7 @@ async fn run() -> anyhow::Result<()> { let args = Arguments::parse(); let dandi = DandiClient::new(args.api_url)?; let zarrfetcher = ManifestFetcher::new()?; + zarrfetcher.install_periodic_dump(ZARR_MANIFEST_CACHE_DUMP_PERIOD); let zarrman = ZarrManClient::new(zarrfetcher); let templater = Templater::new(args.title)?; let dav = Arc::new(DandiDav { diff --git a/src/zarrman/fetcher.rs b/src/zarrman/fetcher.rs index 8de7eb7..2943e33 100644 --- a/src/zarrman/fetcher.rs +++ b/src/zarrman/fetcher.rs @@ -9,7 +9,9 @@ use moka::{ future::{Cache, CacheBuilder}, ops::compute::{CompResult, Op}, }; +use serde::Serialize; use std::sync::Arc; +use std::time::Duration; /// A client for fetching & caching data from the manifest tree #[derive(Clone, Debug)] @@ -134,4 +136,54 @@ impl ManifestFetcher { }; Ok(entry.into_value()) } + + pub(crate) fn install_periodic_dump(&self, period: Duration) { + let this = self.clone(); + let mut schedule = tokio::time::interval(period); + schedule.reset(); // Don't tick immediately + schedule.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + tokio::spawn({ + async move { + loop { + schedule.tick().await; + this.log_cache(); + } + } + }); + } + + pub(crate) fn log_cache(&self) { + let entries = self + .cache + .iter() + .map(|(path, manifest)| EntryStat { + manifest_path: path.to_string(), + size: manifest.get_size(), + }) + .collect::>(); + match serde_json::to_string(&entries) { + Ok(entries_json) => { + tracing::debug!( + cache_event = "dump", + cache = "zarr-manifests", + %entries_json, + "Dumping cached manifests and their sizes", + ); + } + Err(e) => { + tracing::warn!( + cache_event = "dump-error", + cache = "zarr-manifests", + error = %e, + "Failed to serialize cache contents as JSON", + ); + } + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq, Serialize)] +struct EntryStat { + manifest_path: String, + size: usize, } diff --git a/tools/zarr-cache-stats.py b/tools/zarr-cache-stats.py index f8f7f01..6d31a84 100755 --- a/tools/zarr-cache-stats.py +++ b/tools/zarr-cache-stats.py @@ -248,6 +248,8 @@ def process_logs(logfiles: Iterable[Path]) -> Events: ) ) last_accesses.pop(manifest_path, None) + case "dump" | "dump-error": + pass case other: log.warning( "Invalid 'cache_event' field value %r: %s", other, lg.line From bc6ed8e793e00d62e22d00e75bd376c2712e82c8 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 6 Aug 2024 08:47:49 -0400 Subject: [PATCH 13/14] `--zarrman-cache-bytes` --- CHANGELOG.md | 6 ++++++ README.md | 4 ++++ src/consts.rs | 4 ++++ src/main.rs | 11 +++++++---- src/zarrman/consts.rs | 4 ---- src/zarrman/fetcher.rs | 39 +++++++++++++++++++-------------------- 6 files changed, 40 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 039b0f7..9a888c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ In Development - Return 502 status when a backend returns an invalid response - Require `--api-url` (and other URLs retrieved from APIs) to be HTTP(S) - Added various developer-facing documents to the repository +- Format all log lines as JSON +- Add logging of Zarr manifest cache events +- Limit Zarr manifest cache by total size of entries + - Add a `-Z`/`--zarrman-cache-bytes` option for setting the cache size +- Expire idle Zarr manifest cache entries +- Log Zarr manifest cache entries every hour v0.4.0 (2024-07-09) ------------------- diff --git a/README.md b/README.md index e60ac5b..70c34c4 100644 --- a/README.md +++ b/README.md @@ -119,3 +119,7 @@ Options - `-T `, `--title <TITLE>` — Specify the site name to use in HTML/web views of collections (used inside `<title>`'s and as the root breadcrumb text) [default: dandidav] + +- `-Z <INT>`, `--zarrman-cache-bytes <INT>` — Specify the maximum number of + bytes of parsed Zarr manifest files to store in the Zarr manifest cache at + once [default: 100 MiB] diff --git a/src/consts.rs b/src/consts.rs index 65761df..96fcb39 100644 --- a/src/consts.rs +++ b/src/consts.rs @@ -61,6 +61,10 @@ pub(crate) static FAST_NOT_EXIST: &[&str] = &[".bzr", ".git", ".nols", ".svn"]; /// Interval between periodic logging of the Zarr manifest cache's contents pub(crate) const ZARR_MANIFEST_CACHE_DUMP_PERIOD: Duration = Duration::from_secs(3600); +/// Default size of the Zarr manifest cache; the cache is limited to storing no +/// more than this many bytes of parsed manifests at once +pub(crate) const ZARR_MANIFEST_CACHE_TOTAL_BYTES: u64 = 100 * 1024 * 1024; // 100 MiB + #[cfg(test)] mod tests { use super::*; diff --git a/src/main.rs b/src/main.rs index 4a14b91..21930db 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,9 +9,7 @@ mod paths; mod s3; mod streamutil; mod zarrman; -use crate::consts::{ - CSS_CONTENT_TYPE, DEFAULT_API_URL, SERVER_VALUE, ZARR_MANIFEST_CACHE_DUMP_PERIOD, -}; +use crate::consts::*; use crate::dandi::DandiClient; use crate::dav::{DandiDav, Templater}; use crate::httputil::HttpUrl; @@ -67,6 +65,11 @@ struct Arguments { /// Site name to use in HTML collection pages #[arg(short = 'T', long, default_value = env!("CARGO_PKG_NAME"))] title: String, + + /// Limit the Zarr manifest cache to storing no more than this many bytes + /// of parsed manifests at once + #[arg(short = 'Z', long, default_value_t = ZARR_MANIFEST_CACHE_TOTAL_BYTES, value_name = "INT")] + zarrman_cache_bytes: u64, } // See @@ -99,7 +102,7 @@ fn main() -> anyhow::Result<()> { async fn run() -> anyhow::Result<()> { let args = Arguments::parse(); let dandi = DandiClient::new(args.api_url)?; - let zarrfetcher = ManifestFetcher::new()?; + let zarrfetcher = ManifestFetcher::new(args.zarrman_cache_bytes)?; zarrfetcher.install_periodic_dump(ZARR_MANIFEST_CACHE_DUMP_PERIOD); let zarrman = ZarrManClient::new(zarrfetcher); let templater = Templater::new(args.title)?; diff --git a/src/zarrman/consts.rs b/src/zarrman/consts.rs index 110e034..32bee51 100644 --- a/src/zarrman/consts.rs +++ b/src/zarrman/consts.rs @@ -19,9 +19,5 @@ pub(super) static MANIFEST_ROOT_URL: &str = /// `{ENTRY_DOWNLOAD_PREFIX}/{zarr_id}/{entry_path}`. pub(super) static ENTRY_DOWNLOAD_PREFIX: &str = "https://dandiarchive.s3.amazonaws.com/zarr/"; -/// Limit the manifest cache to storing no more than this many bytes of parsed -/// manifests at once -pub(super) const MANIFEST_CACHE_TOTAL_BYTES: u64 = 100 * 1024 * 1024; // 100 MiB - /// Expire any manifest cache entries that haven't been accessed for this long pub(super) const MANIFEST_CACHE_IDLE_EXPIRY: Duration = Duration::from_secs(300); diff --git a/src/zarrman/fetcher.rs b/src/zarrman/fetcher.rs index 2943e33..0845906 100644 --- a/src/zarrman/fetcher.rs +++ b/src/zarrman/fetcher.rs @@ -1,4 +1,4 @@ -use super::consts::{MANIFEST_CACHE_IDLE_EXPIRY, MANIFEST_CACHE_TOTAL_BYTES, MANIFEST_ROOT_URL}; +use super::consts::{MANIFEST_CACHE_IDLE_EXPIRY, MANIFEST_ROOT_URL}; use super::manifest::Manifest; use super::resources::ManifestPath; use super::util::{Index, ZarrManError}; @@ -33,26 +33,25 @@ impl ManifestFetcher { /// # Errors /// /// Returns an error if construction of the inner `reqwest::Client` fails - pub(crate) fn new() -> Result<Self, BuildClientError> { + pub(crate) fn new(cache_size: u64) -> Result<Self, BuildClientError> { let inner = Client::new()?; - let cache: Cache<ManifestPath, Arc<Manifest>> = - CacheBuilder::new(MANIFEST_CACHE_TOTAL_BYTES) - .name("zarr-manifests") - .weigher(|_, manifest: &Arc<Manifest>| { - u32::try_from(manifest.get_size()).unwrap_or(u32::MAX) - }) - .time_to_idle(MANIFEST_CACHE_IDLE_EXPIRY) - .eviction_listener(|path, manifest, cause| { - tracing::debug!( - cache_event = "evict", - cache = "zarr-manifests", - manifest = %path, - manifest_size = manifest.get_size(), - ?cause, - "Zarr manifest evicted from cache", - ); - }) - .build(); + let cache: Cache<ManifestPath, Arc<Manifest>> = CacheBuilder::new(cache_size) + .name("zarr-manifests") + .weigher(|_, manifest: &Arc<Manifest>| { + u32::try_from(manifest.get_size()).unwrap_or(u32::MAX) + }) + .time_to_idle(MANIFEST_CACHE_IDLE_EXPIRY) + .eviction_listener(|path, manifest, cause| { + tracing::debug!( + cache_event = "evict", + cache = "zarr-manifests", + manifest = %path, + manifest_size = manifest.get_size(), + ?cause, + "Zarr manifest evicted from cache", + ); + }) + .build(); let manifest_root_url = MANIFEST_ROOT_URL .parse::<HttpUrl>() .expect("MANIFEST_ROOT_URL should be a valid HTTP URL"); From 36a059c2bdb08ba33cff13edc8e138666fce50a8 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" <git@varonathe.org> Date: Thu, 15 Aug 2024 08:15:25 -0400 Subject: [PATCH 14/14] Change `--zarrman-cache-bytes` to `--zarrman-cache-mb` --- CHANGELOG.md | 2 +- README.md | 6 +++--- src/consts.rs | 4 ---- src/main.rs | 10 +++++----- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a888c4..fcc2a32 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ In Development - Format all log lines as JSON - Add logging of Zarr manifest cache events - Limit Zarr manifest cache by total size of entries - - Add a `-Z`/`--zarrman-cache-bytes` option for setting the cache size + - Add a `-Z`/`--zarrman-cache-mb` option for setting the cache size - Expire idle Zarr manifest cache entries - Log Zarr manifest cache entries every hour diff --git a/README.md b/README.md index 70c34c4..dfeb852 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,6 @@ Options views of collections (used inside `<title>`'s and as the root breadcrumb text) [default: dandidav] -- `-Z <INT>`, `--zarrman-cache-bytes <INT>` — Specify the maximum number of - bytes of parsed Zarr manifest files to store in the Zarr manifest cache at - once [default: 100 MiB] +- `-Z <INT>`, `--zarrman-cache-mb <INT>` — Specify the maximum number of + megabytes (1,000,000 bytes) of parsed Zarr manifest files to store in the + Zarr manifest cache at once [default: 100] diff --git a/src/consts.rs b/src/consts.rs index 96fcb39..65761df 100644 --- a/src/consts.rs +++ b/src/consts.rs @@ -61,10 +61,6 @@ pub(crate) static FAST_NOT_EXIST: &[&str] = &[".bzr", ".git", ".nols", ".svn"]; /// Interval between periodic logging of the Zarr manifest cache's contents pub(crate) const ZARR_MANIFEST_CACHE_DUMP_PERIOD: Duration = Duration::from_secs(3600); -/// Default size of the Zarr manifest cache; the cache is limited to storing no -/// more than this many bytes of parsed manifests at once -pub(crate) const ZARR_MANIFEST_CACHE_TOTAL_BYTES: u64 = 100 * 1024 * 1024; // 100 MiB - #[cfg(test)] mod tests { use super::*; diff --git a/src/main.rs b/src/main.rs index 21930db..f78d257 100644 --- a/src/main.rs +++ b/src/main.rs @@ -66,10 +66,10 @@ struct Arguments { #[arg(short = 'T', long, default_value = env!("CARGO_PKG_NAME"))] title: String, - /// Limit the Zarr manifest cache to storing no more than this many bytes - /// of parsed manifests at once - #[arg(short = 'Z', long, default_value_t = ZARR_MANIFEST_CACHE_TOTAL_BYTES, value_name = "INT")] - zarrman_cache_bytes: u64, + /// Limit the Zarr manifest cache to storing no more than this many + /// megabytes of parsed manifests at once + #[arg(short = 'Z', long, default_value_t = 100, value_name = "INT")] + zarrman_cache_mb: u64, } // See @@ -102,7 +102,7 @@ fn main() -> anyhow::Result<()> { async fn run() -> anyhow::Result<()> { let args = Arguments::parse(); let dandi = DandiClient::new(args.api_url)?; - let zarrfetcher = ManifestFetcher::new(args.zarrman_cache_bytes)?; + let zarrfetcher = ManifestFetcher::new(args.zarrman_cache_mb * 1_000_000)?; zarrfetcher.install_periodic_dump(ZARR_MANIFEST_CACHE_DUMP_PERIOD); let zarrman = ZarrManClient::new(zarrfetcher); let templater = Templater::new(args.title)?;