From ee97591e5d236ebabe8b038b67359c8e709b9730 Mon Sep 17 00:00:00 2001 From: Monthon Klongklaew Date: Fri, 27 Sep 2024 09:31:33 +0000 Subject: [PATCH 1/9] Mem limiter prototype Signed-off-by: Monthon Klongklaew --- Cargo.lock | 10 ++ mountpoint-s3-client/src/failure_client.rs | 8 +- mountpoint-s3-client/src/mock_client.rs | 10 +- .../src/mock_client/throughput_client.rs | 6 +- mountpoint-s3-client/src/object_client.rs | 14 ++ mountpoint-s3-client/src/s3_crt_client.rs | 7 + mountpoint-s3/Cargo.toml | 1 + mountpoint-s3/examples/prefetch_benchmark.rs | 12 +- mountpoint-s3/src/cli.rs | 22 ++- mountpoint-s3/src/fs.rs | 20 ++- mountpoint-s3/src/lib.rs | 3 +- mountpoint-s3/src/mem_limiter.rs | 103 +++++++++++++ mountpoint-s3/src/prefetch.rs | 142 ++++++++++++++---- .../src/prefetch/backpressure_controller.rs | 95 ++++++++++-- mountpoint-s3/src/prefetch/caching_stream.rs | 39 +++-- mountpoint-s3/src/prefetch/part_queue.rs | 34 +++-- mountpoint-s3/src/prefetch/part_stream.rs | 28 +++- mountpoint-s3/src/prefetch/task.rs | 13 +- 18 files changed, 477 insertions(+), 90 deletions(-) create mode 100644 mountpoint-s3/src/mem_limiter.rs diff --git a/Cargo.lock b/Cargo.lock index 97abdc5db..eb1142658 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2057,6 +2057,15 @@ dependencies = [ "url", ] +[[package]] +name = "humansize" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6cb51c9a029ddc91b07a787f1d86b53ccfa49b0e86688c946ebe8d3555685dd7" +dependencies = [ + "libm", +] + [[package]] name = "humantime" version = "2.1.0" @@ -2482,6 +2491,7 @@ dependencies = [ "hdrhistogram", "hex", "httpmock", + "humansize", "lazy_static", "libc", "linked-hash-map", diff --git a/mountpoint-s3-client/src/failure_client.rs b/mountpoint-s3-client/src/failure_client.rs index b7d78a25d..bbc5cf254 100644 --- a/mountpoint-s3-client/src/failure_client.rs +++ b/mountpoint-s3-client/src/failure_client.rs @@ -16,8 +16,8 @@ use pin_project::pin_project; use crate::object_client::{ DeleteObjectError, DeleteObjectResult, ETag, GetBodyPart, GetObjectAttributesError, GetObjectAttributesResult, GetObjectError, GetObjectRequest, HeadObjectError, HeadObjectResult, ListObjectsError, ListObjectsResult, - ObjectAttribute, ObjectClientError, ObjectClientResult, PutObjectError, PutObjectParams, PutObjectRequest, - PutObjectResult, UploadReview, + MemoryUsageStats, ObjectAttribute, ObjectClientError, ObjectClientResult, PutObjectError, PutObjectParams, + PutObjectRequest, PutObjectResult, UploadReview, }; use crate::ObjectClient; @@ -85,6 +85,10 @@ where self.client.initial_read_window_size() } + fn mem_usage_stats(&self) -> Option { + self.client.mem_usage_stats() + } + async fn delete_object( &self, bucket: &str, diff --git a/mountpoint-s3-client/src/mock_client.rs b/mountpoint-s3-client/src/mock_client.rs index b663a9463..75d8d47e3 100644 --- a/mountpoint-s3-client/src/mock_client.rs +++ b/mountpoint-s3-client/src/mock_client.rs @@ -26,9 +26,9 @@ use crate::error_metadata::{ClientErrorMetadata, ProvideErrorMetadata}; use crate::object_client::{ Checksum, ChecksumAlgorithm, DeleteObjectError, DeleteObjectResult, ETag, GetBodyPart, GetObjectAttributesError, GetObjectAttributesParts, GetObjectAttributesResult, GetObjectError, GetObjectRequest, HeadObjectError, - HeadObjectResult, ListObjectsError, ListObjectsResult, ObjectAttribute, ObjectClient, ObjectClientError, - ObjectClientResult, ObjectInfo, ObjectPart, PutObjectError, PutObjectParams, PutObjectRequest, PutObjectResult, - PutObjectTrailingChecksums, 
RestoreStatus, UploadReview, UploadReviewPart,
+    HeadObjectResult, ListObjectsError, ListObjectsResult, MemoryUsageStats, ObjectAttribute, ObjectClient,
+    ObjectClientError, ObjectClientResult, ObjectInfo, ObjectPart, PutObjectError, PutObjectParams, PutObjectRequest,
+    PutObjectResult, PutObjectTrailingChecksums, RestoreStatus, UploadReview, UploadReviewPart,
 };
 
 mod leaky_bucket;
@@ -572,6 +572,10 @@ impl ObjectClient for MockClient {
         }
     }
 
+    fn mem_usage_stats(&self) -> Option<MemoryUsageStats> {
+        None
+    }
+
     async fn delete_object(
         &self,
         bucket: &str,
diff --git a/mountpoint-s3-client/src/mock_client/throughput_client.rs b/mountpoint-s3-client/src/mock_client/throughput_client.rs
index 065791069..610f81ef4 100644
--- a/mountpoint-s3-client/src/mock_client/throughput_client.rs
+++ b/mountpoint-s3-client/src/mock_client/throughput_client.rs
@@ -13,7 +13,7 @@ use crate::mock_client::{MockClient, MockClientConfig, MockClientError, MockObje
 use crate::object_client::{
     DeleteObjectError, DeleteObjectResult, GetBodyPart, GetObjectAttributesError, GetObjectAttributesResult,
     GetObjectError, GetObjectRequest, HeadObjectError, HeadObjectResult, ListObjectsError, ListObjectsResult,
-    ObjectAttribute, ObjectClient, ObjectClientResult, PutObjectError, PutObjectParams,
+    MemoryUsageStats, ObjectAttribute, ObjectClient, ObjectClientResult, PutObjectError, PutObjectParams,
 };
 use crate::types::ETag;
 
@@ -113,6 +113,10 @@ impl ObjectClient for ThroughputMockClient {
         self.inner.initial_read_window_size()
     }
 
+    fn mem_usage_stats(&self) -> Option<MemoryUsageStats> {
+        self.inner.mem_usage_stats()
+    }
+
     async fn delete_object(
         &self,
         bucket: &str,
diff --git a/mountpoint-s3-client/src/object_client.rs b/mountpoint-s3-client/src/object_client.rs
index b92bf796c..22b10d9f0 100644
--- a/mountpoint-s3-client/src/object_client.rs
+++ b/mountpoint-s3-client/src/object_client.rs
@@ -63,6 +63,16 @@ impl FromStr for ETag {
     }
 }
 
+/// Memory usage stats for the client
+pub struct MemoryUsageStats {
+    /// Reserved memory for the client. For [S3CrtClient], this value is a sum of primary storage
+    /// and secondary storage reserved memory.
+    pub mem_reserved: u64,
+    /// Actual used memory for the client. For [S3CrtClient], this value is a sum of primary
+    /// storage and secondary storage used memory.
+    pub mem_used: u64,
+}
+
 /// A generic interface to S3-like object storage services.
 ///
 /// This trait defines the common methods that all object services implement.
@@ -89,6 +99,10 @@ pub trait ObjectClient {
     /// This can be `None` if backpressure is disabled.
     fn initial_read_window_size(&self) -> Option<usize>;
 
+    /// Query current memory usage stats for the client. This can be `None` if the client
+    /// does not record the stats.
+    fn mem_usage_stats(&self) -> Option<MemoryUsageStats>;
+
     /// Delete a single object from the object store.
     ///
     /// DeleteObject will succeed even if the object within the bucket does not exist.
diff --git a/mountpoint-s3-client/src/s3_crt_client.rs b/mountpoint-s3-client/src/s3_crt_client.rs index caf6bab45..caa3a7e87 100644 --- a/mountpoint-s3-client/src/s3_crt_client.rs +++ b/mountpoint-s3-client/src/s3_crt_client.rs @@ -1208,6 +1208,13 @@ impl ObjectClient for S3CrtClient { } } + fn mem_usage_stats(&self) -> Option { + let crt_buffer_pool_stats = self.inner.s3_client.poll_buffer_pool_usage_stats(); + let mem_reserved = crt_buffer_pool_stats.primary_reserved + crt_buffer_pool_stats.secondary_reserved; + let mem_used = crt_buffer_pool_stats.primary_used + crt_buffer_pool_stats.secondary_used; + Some(MemoryUsageStats { mem_reserved, mem_used }) + } + async fn delete_object( &self, bucket: &str, diff --git a/mountpoint-s3/Cargo.toml b/mountpoint-s3/Cargo.toml index 9e28aaca5..a6155487e 100644 --- a/mountpoint-s3/Cargo.toml +++ b/mountpoint-s3/Cargo.toml @@ -44,6 +44,7 @@ tracing = { version = "0.1.35", features = ["log"] } tracing-log = "0.2.0" tracing-subscriber = { version = "0.3.14", features = ["env-filter"] } async-stream = "0.3.5" +humansize = "2.1.3" [target.'cfg(target_os = "linux")'.dependencies] procfs = { version = "0.16.0", default-features = false } diff --git a/mountpoint-s3/examples/prefetch_benchmark.rs b/mountpoint-s3/examples/prefetch_benchmark.rs index fb1b93c2b..7532edde4 100644 --- a/mountpoint-s3/examples/prefetch_benchmark.rs +++ b/mountpoint-s3/examples/prefetch_benchmark.rs @@ -6,6 +6,8 @@ use std::time::Instant; use clap::{Arg, Command}; use futures::executor::block_on; +use mountpoint_s3::mem_limiter::MemoryLimiter; +use mountpoint_s3::object::ObjectId; use mountpoint_s3::prefetch::{default_prefetch, Prefetch, PrefetchResult}; use mountpoint_s3_client::config::{EndpointConfig, S3ClientConfig}; use mountpoint_s3_client::types::ETag; @@ -91,6 +93,7 @@ fn main() { config = config.part_size(part_size); } let client = Arc::new(S3CrtClient::new(config).expect("couldn't create client")); + let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), 512 * 1024 * 1024)); let head_object_result = block_on(client.head_object(bucket, key)).expect("HeadObject failed"); let size = head_object_result.object.size; @@ -103,10 +106,17 @@ fn main() { let start = Instant::now(); + let object_id = ObjectId::new(key.clone(), etag.clone()); thread::scope(|scope| { for _ in 0..downloads { let received_bytes = received_bytes.clone(); - let mut request = manager.prefetch(client.clone(), bucket, key, size, etag.clone()); + let mut request = manager.prefetch( + client.clone(), + mem_limiter.clone(), + bucket.clone(), + object_id.clone(), + size, + ); scope.spawn(|| { futures::executor::block_on(async move { diff --git a/mountpoint-s3/src/cli.rs b/mountpoint-s3/src/cli.rs index 79132536e..a247f539a 100644 --- a/mountpoint-s3/src/cli.rs +++ b/mountpoint-s3/src/cli.rs @@ -24,6 +24,7 @@ use mountpoint_s3_crt::io::event_loop::EventLoopGroup; use nix::sys::signal::Signal; use nix::unistd::ForkResult; use regex::Regex; +use sysinfo::{RefreshKind, System}; use crate::build_info; use crate::data_cache::{CacheLimit, DiskDataCache, DiskDataCacheConfig, ExpressDataCache, ManagedCacheDir}; @@ -156,6 +157,15 @@ pub struct CliArgs { )] pub max_threads: u64, + #[clap( + long, + help = "Maximum memory usage target [default: 95% of total system memory with a minimum of 512 MiB]", + value_name = "MiB", + value_parser = value_parser!(u64).range(512..), + help_heading = CLIENT_OPTIONS_HEADER + )] + pub max_memory_target: Option, + #[clap( long, help = "Part size for multi-part GET and PUT in bytes", 
@@ -432,12 +442,14 @@ impl CliArgs { let mut filter = if self.debug { String::from("debug") } else { - String::from("warn") + String::from("info") }; let crt_verbosity = if self.debug_crt { "debug" } else { "off" }; filter.push_str(&format!(",{}={}", AWSCRT_LOG_TARGET, crt_verbosity)); if self.log_metrics { filter.push_str(&format!(",{}=info", metrics::TARGET_NAME)); + } else { + filter.push_str(&format!(",{}=off", metrics::TARGET_NAME)); } filter }; @@ -774,6 +786,14 @@ where filesystem_config.allow_overwrite = args.allow_overwrite; filesystem_config.s3_personality = s3_personality; filesystem_config.server_side_encryption = ServerSideEncryption::new(args.sse.clone(), args.sse_kms_key_id.clone()); + filesystem_config.mem_limit = if let Some(max_mem_target) = args.max_memory_target { + max_mem_target * 1024 * 1024 + } else { + const MINIMUM_MEM_LIMIT: u64 = 512 * 1024 * 1024; + let sys = System::new_with_specifics(RefreshKind::everything()); + let default_mem_target = (sys.total_memory() as f64 * 0.95) as u64; + default_mem_target.max(MINIMUM_MEM_LIMIT) + }; // Written in this awkward way to force us to update it if we add new checksum types filesystem_config.use_upload_checksums = match args.upload_checksums { diff --git a/mountpoint-s3/src/fs.rs b/mountpoint-s3/src/fs.rs index b65762a3c..4e2af1a78 100644 --- a/mountpoint-s3/src/fs.rs +++ b/mountpoint-s3/src/fs.rs @@ -21,6 +21,8 @@ use crate::inode::{ Inode, InodeError, InodeKind, LookedUp, ReadHandle, ReaddirHandle, Superblock, SuperblockConfig, WriteHandle, }; use crate::logging; +use crate::mem_limiter::MemoryLimiter; +use crate::object::ObjectId; use crate::prefetch::{Prefetch, PrefetchResult}; use crate::prefix::Prefix; use crate::s3::S3Personality; @@ -151,9 +153,14 @@ where None => return Err(err!(libc::EBADF, "no E-Tag for inode {}", lookup.inode.ino())), Some(etag) => ETag::from_str(etag).expect("E-Tag should be set"), }; - let request = fs - .prefetcher - .prefetch(fs.client.clone(), &fs.bucket, &full_key, object_size, etag.clone()); + let object_id = ObjectId::new(full_key, etag); + let request = fs.prefetcher.prefetch( + fs.client.clone(), + fs.mem_limiter.clone(), + fs.bucket.clone(), + object_id, + object_size, + ); let handle = FileHandleState::Read { handle, request }; metrics::gauge!("fs.current_handles", "type" => "read").increment(1.0); Ok(handle) @@ -393,6 +400,8 @@ pub struct S3FilesystemConfig { pub server_side_encryption: ServerSideEncryption, /// Use additional checksums for uploads pub use_upload_checksums: bool, + /// Memory limit + pub mem_limit: u64, } impl Default for S3FilesystemConfig { @@ -413,6 +422,7 @@ impl Default for S3FilesystemConfig { s3_personality: S3Personality::default(), server_side_encryption: Default::default(), use_upload_checksums: true, + mem_limit: 512 * 1024 * 1024, } } } @@ -526,6 +536,7 @@ where { config: S3FilesystemConfig, client: Client, + mem_limiter: Arc>, superblock: Superblock, prefetcher: Prefetcher, uploader: Uploader, @@ -556,7 +567,7 @@ where s3_personality: config.s3_personality, }; let superblock = Superblock::new(bucket, prefix, superblock_config); - + let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), config.mem_limit)); let uploader = Uploader::new( client.clone(), config.storage_class.to_owned(), @@ -567,6 +578,7 @@ where Self { config, client, + mem_limiter, superblock, prefetcher, uploader, diff --git a/mountpoint-s3/src/lib.rs b/mountpoint-s3/src/lib.rs index 084cd3cc9..e0475a0eb 100644 --- a/mountpoint-s3/src/lib.rs +++ b/mountpoint-s3/src/lib.rs @@ -7,8 
+7,9 @@ pub mod fs; pub mod fuse; mod inode; pub mod logging; +pub mod mem_limiter; pub mod metrics; -mod object; +pub mod object; pub mod prefetch; pub mod prefix; pub mod s3; diff --git a/mountpoint-s3/src/mem_limiter.rs b/mountpoint-s3/src/mem_limiter.rs new file mode 100644 index 000000000..ae366b621 --- /dev/null +++ b/mountpoint-s3/src/mem_limiter.rs @@ -0,0 +1,103 @@ +use std::sync::atomic::Ordering; + +use humansize::make_format; +use metrics::atomics::AtomicU64; +use tracing::{debug, info}; + +use mountpoint_s3_client::ObjectClient; + +#[derive(Debug)] +pub struct MemoryLimiter { + client: Client, + mem_limit: u64, + /// Actual allocated memory for data in the part queue + prefetcher_mem_used: AtomicU64, + /// Reserved memory for data we have requested via the request task but may not + /// arrives yet. + prefetcher_mem_reserved: AtomicU64, + /// Additional reserved memory for other non-buffer usage like storing metadata + additional_mem_reserved: u64, +} + +impl MemoryLimiter { + pub fn new(client: Client, mem_limit: u64) -> Self { + let min_reserved = 128 * 1024 * 1024; + let reserved_mem = (mem_limit / 8).max(min_reserved); + let formatter = make_format(humansize::BINARY); + debug!( + "target memory usage is {} with {} reserved memory", + formatter(mem_limit), + formatter(reserved_mem) + ); + Self { + client, + mem_limit, + prefetcher_mem_used: AtomicU64::new(0), + prefetcher_mem_reserved: AtomicU64::new(0), + additional_mem_reserved: reserved_mem, + } + } + + /// Commit the actual memory used. We only record data from the prefetcher for now. + pub fn allocate(&self, size: u64) { + self.prefetcher_mem_used.fetch_add(size, Ordering::SeqCst); + metrics::gauge!("prefetch.bytes_in_queue").increment(size as f64); + } + + /// Free the actual memory used. + pub fn free(&self, size: u64) { + self.prefetcher_mem_used.fetch_sub(size, Ordering::SeqCst); + metrics::gauge!("prefetch.bytes_in_queue").decrement(size as f64); + } + + /// Reserve the memory for future uses. + pub fn reserve(&self, size: u64) { + self.prefetcher_mem_reserved.fetch_add(size, Ordering::SeqCst); + metrics::gauge!("prefetch.bytes_reserved").increment(size as f64); + } + + /// Release the reserved memory. 
+ pub fn release(&self, size: u64) { + self.prefetcher_mem_reserved.fetch_sub(size, Ordering::SeqCst); + metrics::gauge!("prefetch.bytes_reserved").decrement(size as f64); + } + + pub fn available_mem(&self) -> u64 { + let fs_mem_usage = self + .prefetcher_mem_used + .load(Ordering::SeqCst) + .max(self.prefetcher_mem_reserved.load(Ordering::SeqCst)); + let mut available_mem = self + .mem_limit + .saturating_sub(fs_mem_usage) + .saturating_sub(self.additional_mem_reserved); + if let Some(client_stats) = self.client.mem_usage_stats() { + let client_mem_usage = client_stats.mem_used.max(client_stats.mem_reserved); + available_mem = available_mem.saturating_sub(client_mem_usage); + } + available_mem + } + + pub fn print_total_usage(&self) { + let formatter = make_format(humansize::BINARY); + let prefetcher_mem_used = self.prefetcher_mem_used.load(Ordering::SeqCst); + let prefetcher_mem_reserved = self.prefetcher_mem_reserved.load(Ordering::SeqCst); + + let effective_mem_used = prefetcher_mem_used.max(prefetcher_mem_reserved); + let mut total_usage = effective_mem_used.saturating_add(self.additional_mem_reserved); + if let Some(client_stats) = self.client.mem_usage_stats() { + let effective_client_mem_usage = client_stats.mem_used.max(client_stats.mem_reserved); + total_usage = total_usage.saturating_add(effective_client_mem_usage); + + info!( + total_usage = formatter(total_usage), + client_mem_used = formatter(client_stats.mem_used), + client_mem_reserved = formatter(client_stats.mem_reserved), + prefetcher_mem_used = formatter(prefetcher_mem_used), + prefetcher_mem_reserved = formatter(prefetcher_mem_reserved), + additional_mem_reserved = formatter(self.additional_mem_reserved), + "total memory usage" + ); + } + } +} diff --git a/mountpoint-s3/src/prefetch.rs b/mountpoint-s3/src/prefetch.rs index 1dea5df9b..19f4a41f7 100644 --- a/mountpoint-s3/src/prefetch.rs +++ b/mountpoint-s3/src/prefetch.rs @@ -46,7 +46,6 @@ use async_trait::async_trait; use futures::task::Spawn; use metrics::{counter, histogram}; use mountpoint_s3_client::error::{GetObjectError, ObjectClientError}; -use mountpoint_s3_client::types::ETag; use mountpoint_s3_client::ObjectClient; use part::PartOperationError; use part_stream::RequestTaskConfig; @@ -55,6 +54,7 @@ use tracing::trace; use crate::checksums::{ChecksummedBytes, IntegrityError}; use crate::data_cache::DataCache; +use crate::mem_limiter::MemoryLimiter; use crate::object::ObjectId; use crate::prefetch::caching_stream::CachingPartStream; use crate::prefetch::part_stream::{ClientPartStream, ObjectPartStream, RequestRange}; @@ -70,10 +70,10 @@ pub trait Prefetch { fn prefetch( &self, client: Client, - bucket: &str, - key: &str, + mem_limiter: Arc>, + bucket: String, + object_id: ObjectId, size: u64, - etag: ETag, ) -> Self::PrefetchResult where Client: ObjectClient + Clone + Send + Sync + 'static; @@ -164,7 +164,7 @@ impl Default for PrefetcherConfig { fn default() -> Self { Self { max_read_window_size: determine_max_read_size(), - sequential_prefetch_multiplier: 8, + sequential_prefetch_multiplier: 2, read_timeout: Duration::from_secs(60), // We want these large enough to tolerate a single out-of-order Linux readahead, which // is at most 256KiB backwards and then 512KiB forwards. 
For forwards seeks, we're also @@ -239,15 +239,23 @@ where fn prefetch( &self, client: Client, - bucket: &str, - key: &str, + mem_limiter: Arc>, + bucket: String, + object_id: ObjectId, size: u64, - etag: ETag, ) -> Self::PrefetchResult where Client: ObjectClient + Clone + Send + Sync + 'static, { - PrefetchGetObject::new(client, self.part_stream.clone(), self.config, bucket, key, size, etag) + PrefetchGetObject::new( + client, + self.part_stream.clone(), + mem_limiter, + self.config, + bucket, + object_id, + size, + ) } } @@ -257,8 +265,9 @@ where pub struct PrefetchGetObject { client: Client, part_stream: Arc, + mem_limiter: Arc>, config: PrefetcherConfig, - backpressure_task: Option>, + backpressure_task: Option>, // Invariant: the offset of the last byte in this window is always // self.next_sequential_read_offset - 1. backward_seek_window: SeekWindow, @@ -310,15 +319,16 @@ where fn new( client: Client, part_stream: Arc, + mem_limiter: Arc>, config: PrefetcherConfig, - bucket: &str, - key: &str, + bucket: String, + object_id: ObjectId, size: u64, - etag: ETag, ) -> Self { PrefetchGetObject { client, part_stream, + mem_limiter, config, backpressure_task: None, backward_seek_window: SeekWindow::new(config.max_backward_seek_distance as usize), @@ -326,8 +336,8 @@ where sequential_read_start_offset: 0, next_sequential_read_offset: 0, next_request_offset: 0, - bucket: bucket.to_owned(), - object_id: ObjectId::new(key.to_owned(), etag), + bucket, + object_id, size, } } @@ -410,9 +420,10 @@ where /// We will be using flow-control window to control how much data we want to download into the prefetcher. fn spawn_read_backpressure_request( &mut self, - ) -> Result, PrefetchReadError> { + ) -> Result, PrefetchReadError> { let start = self.next_sequential_read_offset; let object_size = self.size as usize; + let read_part_size = self.client.read_part_size().unwrap_or(8 * 1024 * 1024); let range = RequestRange::new(object_size, start, object_size); // The prefetcher now relies on backpressure mechanism so it must be enabled @@ -431,16 +442,20 @@ where bucket: self.bucket.clone(), object_id: self.object_id.clone(), range, + read_part_size, preferred_part_size: self.preferred_part_size, initial_read_window_size, max_read_window_size: self.config.max_read_window_size, read_window_size_multiplier: self.config.sequential_prefetch_multiplier, }; - Ok(self.part_stream.spawn_get_object_request(&self.client, config)) + Ok(self + .part_stream + .spawn_get_object_request(&self.client, config, self.mem_limiter.clone())) } /// Reset this prefetch request to a new offset, clearing any existing tasks queued. 
fn reset_prefetch_to_offset(&mut self, offset: u64) { + tracing::warn!("resetting prefetch"); self.backpressure_task = None; self.backward_seek_window.clear(); self.sequential_read_start_offset = offset; @@ -533,6 +548,7 @@ impl PrefetchGetObject Drop for PrefetchGetObject { fn drop(&mut self) { self.record_contiguous_read_metric(); + self.mem_limiter.print_total_usage(); } } @@ -549,6 +565,7 @@ mod tests { use mountpoint_s3_client::error::GetObjectError; use mountpoint_s3_client::failure_client::{countdown_failure_client, RequestFailureMap}; use mountpoint_s3_client::mock_client::{ramp_bytes, MockClient, MockClientConfig, MockClientError, MockObject}; + use mountpoint_s3_client::types::ETag; use proptest::proptest; use proptest::strategy::{Just, Strategy}; use proptest_derive::Arbitrary; @@ -600,6 +617,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); + let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); let object = MockObject::ramp(0xaa, size as usize, ETag::for_tests()); let etag = object.etag(); @@ -614,7 +632,8 @@ mod tests { }; let prefetcher = Prefetcher::new(part_stream, prefetcher_config); - let mut request = prefetcher.prefetch(client, "test-bucket", "hello", size, etag); + let object_id = ObjectId::new("hello".to_owned(), etag); + let mut request = prefetcher.prefetch(client, mem_limiter.into(), "test-bucket".to_owned(), object_id, size); let mut next_offset = 0; loop { @@ -692,7 +711,8 @@ mod tests { ) where Stream: ObjectPartStream + Send + Sync + 'static, { - let client = MockClient::new(client_config); + let client = Arc::new(MockClient::new(client_config)); + let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); let read_size = 1 * MB; let object_size = 8 * MB; let object = MockObject::ramp(0xaa, object_size, ETag::for_tests()); @@ -705,7 +725,14 @@ mod tests { }; let prefetcher = Prefetcher::new(part_stream, prefetcher_config); - let mut request = prefetcher.prefetch(Arc::new(client), "test-bucket", "hello", object_size as u64, etag); + let object_id = ObjectId::new("hello".to_owned(), etag); + let mut request = prefetcher.prefetch( + client, + mem_limiter.into(), + "test-bucket".to_owned(), + object_id, + object_size as u64, + ); let result = block_on(request.read(0, read_size)); assert!(matches!(result, Err(PrefetchReadError::BackpressurePreconditionFailed))); } @@ -785,7 +812,14 @@ mod tests { client.add_object("hello", object); - let client = countdown_failure_client(client, get_failures, HashMap::new(), HashMap::new(), HashMap::new()); + let client = Arc::new(countdown_failure_client( + client, + get_failures, + HashMap::new(), + HashMap::new(), + HashMap::new(), + )); + let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); let prefetcher_config = PrefetcherConfig { max_read_window_size: test_config.max_read_window_size, @@ -794,7 +828,8 @@ mod tests { }; let prefetcher = Prefetcher::new(part_stream, prefetcher_config); - let mut request = prefetcher.prefetch(Arc::new(client), "test-bucket", "hello", size, etag); + let object_id = ObjectId::new("hello".to_owned(), etag); + let mut request = prefetcher.prefetch(client, mem_limiter.into(), "test-bucket".to_owned(), object_id, size); let mut next_offset = 0; loop { @@ -909,6 +944,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); + let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); let object = MockObject::ramp(0xaa, object_size as usize, ETag::for_tests()); let 
etag = object.etag(); @@ -923,7 +959,14 @@ mod tests { }; let prefetcher = Prefetcher::new(part_stream, prefetcher_config); - let mut request = prefetcher.prefetch(client, "test-bucket", "hello", object_size, etag); + let object_id = ObjectId::new("hello".to_owned(), etag); + let mut request = prefetcher.prefetch( + client, + mem_limiter.into(), + "test-bucket".to_owned(), + object_id, + object_size, + ); for (offset, length) in reads { assert!(offset < object_size); @@ -1085,10 +1128,19 @@ mod tests { HashMap::new(), HashMap::new(), )); + let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); let prefetcher = Prefetcher::new(default_stream(), Default::default()); + let mem_limiter = Arc::new(mem_limiter); block_on(async { - let mut request = prefetcher.prefetch(client, "test-bucket", "hello", OBJECT_SIZE as u64, etag.clone()); + let object_id = ObjectId::new("hello".to_owned(), etag.clone()); + let mut request = prefetcher.prefetch( + client, + mem_limiter, + "test-bucket".to_owned(), + object_id, + OBJECT_SIZE as u64, + ); // The first read should trigger the prefetcher to try and get the whole object (in 2 parts). _ = request.read(0, 1).await.expect("first read should succeed"); @@ -1129,6 +1181,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); + let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), 512 * 1024 * 1024)); let object = MockObject::ramp(0xaa, OBJECT_SIZE, ETag::for_tests()); let etag = object.etag(); @@ -1138,8 +1191,14 @@ mod tests { // Try every possible seek from first_read_size for offset in first_read_size + 1..OBJECT_SIZE { - let mut request = - prefetcher.prefetch(client.clone(), "test-bucket", "hello", OBJECT_SIZE as u64, etag.clone()); + let object_id = ObjectId::new("hello".to_owned(), etag.clone()); + let mut request = prefetcher.prefetch( + client.clone(), + mem_limiter.clone(), + "test-bucket".to_owned(), + object_id, + OBJECT_SIZE as u64, + ); if first_read_size > 0 { let _first_read = block_on(request.read(0, first_read_size)).unwrap(); } @@ -1164,6 +1223,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); + let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), 512 * 1024 * 1024)); let object = MockObject::ramp(0xaa, OBJECT_SIZE, ETag::for_tests()); let etag = object.etag(); @@ -1173,8 +1233,14 @@ mod tests { // Try every possible seek from first_read_size for offset in 0..first_read_size { - let mut request = - prefetcher.prefetch(client.clone(), "test-bucket", "hello", OBJECT_SIZE as u64, etag.clone()); + let object_id = ObjectId::new("hello".to_owned(), etag.clone()); + let mut request = prefetcher.prefetch( + client.clone(), + mem_limiter.clone(), + "test-bucket".to_owned(), + object_id, + OBJECT_SIZE as u64, + ); if first_read_size > 0 { let _first_read = block_on(request.read(0, first_read_size)).unwrap(); } @@ -1219,6 +1285,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); + let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); let object = MockObject::ramp(0xaa, object_size as usize, ETag::for_tests()); let file_etag = object.etag(); @@ -1233,7 +1300,14 @@ mod tests { }; let prefetcher = Prefetcher::new(ClientPartStream::new(ShuttleRuntime), prefetcher_config); - let mut request = prefetcher.prefetch(client, "test-bucket", "hello", object_size, file_etag); + let object_id = ObjectId::new("hello".to_owned(), file_etag); + let mut request = prefetcher.prefetch( + client, + 
mem_limiter.into(), + "test-bucket".to_owned(), + object_id, + object_size, + ); let mut next_offset = 0; loop { @@ -1277,6 +1351,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); + let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); let object = MockObject::ramp(0xaa, object_size as usize, ETag::for_tests()); let file_etag = object.etag(); @@ -1291,7 +1366,14 @@ mod tests { }; let prefetcher = Prefetcher::new(ClientPartStream::new(ShuttleRuntime), prefetcher_config); - let mut request = prefetcher.prefetch(client, "test-bucket", "hello", object_size, file_etag); + let object_id = ObjectId::new("hello".to_owned(), file_etag); + let mut request = prefetcher.prefetch( + client, + mem_limiter.into(), + "test-bucket".to_owned(), + object_id, + object_size, + ); let num_reads = rng.gen_range(10usize..50); for _ in 0..num_reads { diff --git a/mountpoint-s3/src/prefetch/backpressure_controller.rs b/mountpoint-s3/src/prefetch/backpressure_controller.rs index 269bf459d..f5495fe1b 100644 --- a/mountpoint-s3/src/prefetch/backpressure_controller.rs +++ b/mountpoint-s3/src/prefetch/backpressure_controller.rs @@ -1,8 +1,13 @@ use std::ops::Range; +use std::sync::Arc; use async_channel::{unbounded, Receiver, Sender}; +use humansize::make_format; +use mountpoint_s3_client::ObjectClient; use tracing::trace; +use crate::mem_limiter::MemoryLimiter; + use super::PrefetchReadError; #[derive(Debug)] @@ -16,18 +21,22 @@ pub enum BackpressureFeedbackEvent { pub struct BackpressureConfig { /// Backpressure's initial read window size pub initial_read_window_size: usize, + /// Minimum read window size that the backpressure controller is allowed to scale down to + pub min_read_window_size: usize, /// Maximum read window size that the backpressure controller is allowed to scale up to pub max_read_window_size: usize, /// Factor to increase the read window size by when the part queue is stalled pub read_window_size_multiplier: usize, /// Request range to apply backpressure pub request_range: Range, + pub read_part_size: usize, } #[derive(Debug)] -pub struct BackpressureController { +pub struct BackpressureController { read_window_updater: Sender, preferred_read_window_size: usize, + min_read_window_size: usize, max_read_window_size: usize, read_window_size_multiplier: usize, /// Upper bound of the current read window. The request can return data up to this @@ -36,6 +45,8 @@ pub struct BackpressureController { /// End offset for the request we want to apply backpressure. The request can return /// data up to this offset *exclusively*. request_end_offset: u64, + read_part_size: usize, + mem_limiter: Arc>, } #[derive(Debug)] @@ -61,16 +72,23 @@ pub struct BackpressureLimiter { /// [BackpressureController] will be given to the consumer side of the object stream. /// It can be used anywhere to set preferred read window size for the stream and tell the /// producer when its read window should be increased. 
-pub fn new_backpressure_controller(config: BackpressureConfig) -> (BackpressureController, BackpressureLimiter) { +pub fn new_backpressure_controller( + config: BackpressureConfig, + mem_limiter: Arc>, +) -> (BackpressureController, BackpressureLimiter) { let read_window_end_offset = config.request_range.start + config.initial_read_window_size as u64; let (read_window_updater, read_window_incrementing_queue) = unbounded(); + mem_limiter.reserve(config.initial_read_window_size as u64); let controller = BackpressureController { read_window_updater, preferred_read_window_size: config.initial_read_window_size, + min_read_window_size: config.min_read_window_size, max_read_window_size: config.max_read_window_size, read_window_size_multiplier: config.read_window_size_multiplier, read_window_end_offset, request_end_offset: config.request_range.end, + read_part_size: config.read_part_size, + mem_limiter, }; let limiter = BackpressureLimiter { read_window_incrementing_queue, @@ -80,7 +98,7 @@ pub fn new_backpressure_controller(config: BackpressureConfig) -> (BackpressureC (controller, limiter) } -impl BackpressureController { +impl BackpressureController { pub fn read_window_end_offset(&self) -> u64 { self.read_window_end_offset } @@ -94,9 +112,18 @@ impl BackpressureController { let next_read_offset = offset + length as u64; let remaining_window = self.read_window_end_offset.saturating_sub(next_read_offset) as usize; // Increment the read window only if the remaining window reaches some threshold i.e. half of it left. - if remaining_window < (self.preferred_read_window_size / 2) + while remaining_window < (self.preferred_read_window_size / 2) && self.read_window_end_offset < self.request_end_offset { + let available_mem = self.mem_limiter.available_mem(); + // If the preferred read window size is still large and available memory is getting low we will try to scale it down. + if self.preferred_read_window_size > self.min_read_window_size + && available_mem < self.preferred_read_window_size as u64 + { + self.try_scaling_down(); + continue; + } + let new_read_window_end_offset = next_read_offset .saturating_add(self.preferred_read_window_size as u64) .min(self.request_end_offset); @@ -131,18 +158,53 @@ impl BackpressureController { // Try scaling up preferred read window size with a multiplier configured at initialization. 
fn try_scaling_up(&mut self) { if self.preferred_read_window_size < self.max_read_window_size { + let new_read_window_size = self.preferred_read_window_size * self.read_window_size_multiplier; + // Also align the new read window size to the client part size let new_read_window_size = - (self.preferred_read_window_size * self.read_window_size_multiplier).min(self.max_read_window_size); - trace!( - current_size = self.preferred_read_window_size, - new_size = new_read_window_size, - "scaling up preferred read window" + align(new_read_window_size, self.read_part_size, false).min(self.max_read_window_size); + + // Only scale up when there is enough memory + let to_increase = (new_read_window_size - self.preferred_read_window_size) as u64; + if to_increase <= self.mem_limiter.available_mem() { + let formatter = make_format(humansize::BINARY); + tracing::info!( + current_size = formatter(self.preferred_read_window_size), + new_size = formatter(new_read_window_size), + "scaling up preferred read window" + ); + self.mem_limiter.release(self.preferred_read_window_size as u64); + self.mem_limiter.reserve(new_read_window_size as u64); + self.preferred_read_window_size = new_read_window_size; + } + } + } + + pub fn try_scaling_down(&mut self) { + if self.preferred_read_window_size > self.min_read_window_size { + let new_read_window_size = self.preferred_read_window_size / self.read_window_size_multiplier; + // Also align the new read window size to the client part size + let new_read_window_size = + align(new_read_window_size, self.read_part_size, false).max(self.min_read_window_size); + + let formatter = make_format(humansize::BINARY); + tracing::info!( + current_size = formatter(self.preferred_read_window_size), + new_size = formatter(new_read_window_size), + "scaling down read window" ); + self.mem_limiter.release(self.preferred_read_window_size as u64); + self.mem_limiter.reserve(new_read_window_size as u64); self.preferred_read_window_size = new_read_window_size; } } } +impl Drop for BackpressureController { + fn drop(&mut self) { + self.mem_limiter.release(self.preferred_read_window_size as u64); + } +} + impl BackpressureLimiter { pub fn read_window_end_offset(&self) -> u64 { self.read_window_end_offset @@ -184,3 +246,18 @@ impl BackpressureLimiter { Ok(Some(self.read_window_end_offset)) } } + +/// Try to align the given read window size to the part boundaries. +/// The `trim_only` flags controls whether the range is only trimmed down to +/// part boundaries or is allowed to grow wider. 
+fn align(read_window_size: usize, part_size: usize, trim_only: bool) -> usize { + let part_alignment = part_size; + let remainder = read_window_size % part_alignment; + if trim_only || remainder == 0 { + // trim it to the previous part boundary + read_window_size - remainder + } else { + // extend it to the next part boundary + read_window_size + (part_alignment - remainder) + } +} diff --git a/mountpoint-s3/src/prefetch/caching_stream.rs b/mountpoint-s3/src/prefetch/caching_stream.rs index 38caf3c4a..dcdee2e18 100644 --- a/mountpoint-s3/src/prefetch/caching_stream.rs +++ b/mountpoint-s3/src/prefetch/caching_stream.rs @@ -9,6 +9,7 @@ use tracing::{debug_span, trace, warn, Instrument}; use crate::checksums::ChecksummedBytes; use crate::data_cache::{BlockIndex, DataCache}; +use crate::mem_limiter::MemoryLimiter; use crate::object::ObjectId; use crate::prefetch::backpressure_controller::{new_backpressure_controller, BackpressureConfig, BackpressureLimiter}; use crate::prefetch::part::Part; @@ -45,7 +46,8 @@ where &self, client: &Client, config: RequestTaskConfig, - ) -> RequestTask<::ClientError> + mem_limiter: Arc>, + ) -> RequestTask<::ClientError, Client> where Client: ObjectClient + Clone + Send + Sync + 'static, { @@ -53,12 +55,15 @@ where let backpressure_config = BackpressureConfig { initial_read_window_size: config.initial_read_window_size, + min_read_window_size: config.read_part_size, max_read_window_size: config.max_read_window_size, read_window_size_multiplier: config.read_window_size_multiplier, request_range: range.into(), + read_part_size: config.read_part_size, }; - let (backpressure_controller, backpressure_limiter) = new_backpressure_controller(backpressure_config); - let (part_queue, part_queue_producer) = unbounded_part_queue(); + let (backpressure_controller, backpressure_limiter) = + new_backpressure_controller(backpressure_config, mem_limiter.clone()); + let (part_queue, part_queue_producer) = unbounded_part_queue(mem_limiter); trace!(?range, "spawning request"); let request_task = { @@ -125,7 +130,7 @@ where async fn get_from_cache( mut self, range: RequestRange, - part_queue_producer: PartQueueProducer, + part_queue_producer: PartQueueProducer, ) { let cache_key = &self.config.object_id; let block_size = self.cache.block_size(); @@ -184,7 +189,7 @@ where &mut self, range: RequestRange, block_range: Range, - part_queue_producer: PartQueueProducer, + part_queue_producer: PartQueueProducer, ) { let bucket = &self.config.bucket; let cache_key = &self.config.object_id; @@ -238,8 +243,8 @@ where } } -struct CachingPartComposer { - part_queue_producer: PartQueueProducer, +struct CachingPartComposer { + part_queue_producer: PartQueueProducer, cache_key: ObjectId, original_range: RequestRange, block_index: u64, @@ -248,11 +253,12 @@ struct CachingPartComposer { runtime: Runtime, } -impl CachingPartComposer +impl CachingPartComposer where E: std::error::Error + Send + Sync, Cache: DataCache + Send + Sync + 'static, Runtime: Spawn, + Client: ObjectClient + Send + Sync + 'static, { async fn try_compose_parts(&mut self, request_stream: impl Stream>) { if let Err(e) = self.compose_parts(request_stream).await { @@ -387,7 +393,7 @@ mod tests { }; use test_case::test_case; - use crate::{data_cache::InMemoryDataCache, object::ObjectId}; + use crate::{data_cache::InMemoryDataCache, mem_limiter::MemoryLimiter, object::ObjectId}; use super::*; @@ -428,6 +434,7 @@ mod tests { ..Default::default() }; let mock_client = Arc::new(MockClient::new(config)); + let mem_limiter = 
Arc::new(MemoryLimiter::new(mock_client.clone(), 512 * 1024 * 1024)); mock_client.add_object(key, object.clone()); let runtime = ThreadPool::builder().pool_size(1).create().unwrap(); @@ -443,12 +450,13 @@ mod tests { bucket: bucket.to_owned(), object_id: id.clone(), range, + read_part_size: client_part_size, preferred_part_size: 256 * KB, initial_read_window_size, max_read_window_size, read_window_size_multiplier, }; - let request_task = stream.spawn_get_object_request(&mock_client, config); + let request_task = stream.spawn_get_object_request(&mock_client, config, mem_limiter.clone()); compare_read(&id, &object, request_task); get_object_counter.count() }; @@ -468,12 +476,13 @@ mod tests { bucket: bucket.to_owned(), object_id: id.clone(), range, + read_part_size: client_part_size, preferred_part_size: 256 * KB, initial_read_window_size, max_read_window_size, read_window_size_multiplier, }; - let request_task = stream.spawn_get_object_request(&mock_client, config); + let request_task = stream.spawn_get_object_request(&mock_client, config, mem_limiter.clone()); compare_read(&id, &object, request_task); get_object_counter.count() }; @@ -506,6 +515,7 @@ mod tests { ..Default::default() }; let mock_client = Arc::new(MockClient::new(config)); + let mem_limiter = Arc::new(MemoryLimiter::new(mock_client.clone(), 512 * 1024 * 1024)); mock_client.add_object(key, object.clone()); let runtime = ThreadPool::builder().pool_size(1).create().unwrap(); @@ -517,21 +527,22 @@ mod tests { bucket: bucket.to_owned(), object_id: id.clone(), range: RequestRange::new(object_size, offset as u64, preferred_size), + read_part_size: client_part_size, preferred_part_size: 256 * KB, initial_read_window_size, max_read_window_size, read_window_size_multiplier, }; - let request_task = stream.spawn_get_object_request(&mock_client, config); + let request_task = stream.spawn_get_object_request(&mock_client, config, mem_limiter.clone()); compare_read(&id, &object, request_task); } } } - fn compare_read( + fn compare_read( id: &ObjectId, object: &MockObject, - mut request_task: RequestTask, + mut request_task: RequestTask, ) { let mut offset = request_task.start_offset(); let mut remaining = request_task.total_size(); diff --git a/mountpoint-s3/src/prefetch/part_queue.rs b/mountpoint-s3/src/prefetch/part_queue.rs index 08778666a..16cf99b07 100644 --- a/mountpoint-s3/src/prefetch/part_queue.rs +++ b/mountpoint-s3/src/prefetch/part_queue.rs @@ -1,7 +1,9 @@ +use mountpoint_s3_client::ObjectClient; use std::time::Instant; use tracing::trace; +use crate::mem_limiter::MemoryLimiter; use crate::prefetch::part::Part; use crate::prefetch::PrefetchReadError; use crate::sync::async_channel::{unbounded, Receiver, RecvError, Sender}; @@ -11,7 +13,7 @@ use crate::sync::Arc; /// A queue of [Part]s where the first part can be partially read from if the reader doesn't want /// the entire part in one shot. #[derive(Debug)] -pub struct PartQueue { +pub struct PartQueue { /// The auxiliary queue that supports pushing parts to the front of the part queue in order to /// allow partial reads and backwards seeks. front_queue: Vec, @@ -20,18 +22,22 @@ pub struct PartQueue { failed: bool, /// The total number of bytes sent to the underlying queue of `self.receiver` bytes_received: Arc, + mem_limiter: Arc>, } /// Producer side of the queue of [Part]s. 
#[derive(Debug)] -pub struct PartQueueProducer { +pub struct PartQueueProducer { sender: Sender>>, /// The total number of bytes sent to `self.sender` bytes_sent: Arc, + mem_limiter: Arc>, } /// Creates an unbounded [PartQueue] and its related [PartQueueProducer]. -pub fn unbounded_part_queue() -> (PartQueue, PartQueueProducer) { +pub fn unbounded_part_queue( + mem_limiter: Arc>, +) -> (PartQueue, PartQueueProducer) { let (sender, receiver) = unbounded(); let bytes_counter = Arc::new(AtomicUsize::new(0)); let part_queue = PartQueue { @@ -39,15 +45,17 @@ pub fn unbounded_part_queue() -> (PartQueue, PartQueueP receiver, failed: false, bytes_received: Arc::clone(&bytes_counter), + mem_limiter: mem_limiter.clone(), }; let part_queue_producer = PartQueueProducer { sender, bytes_sent: bytes_counter, + mem_limiter, }; (part_queue, part_queue_producer) } -impl PartQueue { +impl PartQueue { /// Read up to `length` bytes from the queue at the current offset. This function always returns /// a contiguous [Bytes], and so may return fewer than `length` bytes if it would need to copy /// or reallocate to make the return value contiguous. This function blocks only if the queue is @@ -87,7 +95,7 @@ impl PartQueue { let tail = part.split_off(length); self.front_queue.push(tail); } - metrics::gauge!("prefetch.bytes_in_queue").decrement(part.len() as f64); + self.mem_limiter.free(part.len() as u64); Ok(part) } @@ -95,8 +103,9 @@ impl PartQueue { pub async fn push_front(&mut self, part: Part) -> Result<(), PrefetchReadError> { assert!(!self.failed, "cannot use a PartQueue after failure"); - metrics::gauge!("prefetch.bytes_in_queue").increment(part.len() as f64); + let part_len = part.len() as u64; self.front_queue.push(part); + self.mem_limiter.allocate(part_len); Ok(()) } @@ -105,7 +114,7 @@ impl PartQueue { } } -impl PartQueueProducer { +impl PartQueueProducer { /// Push a new [Part] onto the back of the queue pub fn push(&self, part: Result>) { let part_len = part.as_ref().map_or(0, |part| part.len()); @@ -116,12 +125,12 @@ impl PartQueueProducer { trace!("closed channel"); } else { self.bytes_sent.fetch_add(part_len, Ordering::SeqCst); - metrics::gauge!("prefetch.bytes_in_queue").increment(part_len as f64); + self.mem_limiter.allocate(part_len as u64); } } } -impl Drop for PartQueue { +impl Drop for PartQueue { fn drop(&mut self) { // close the channel and drain remaining parts from the main queue self.receiver.close(); @@ -135,7 +144,7 @@ impl Drop for PartQueue { for part in &self.front_queue { queue_size += part.len() } - metrics::gauge!("prefetch.bytes_in_queue").decrement(queue_size as f64); + self.mem_limiter.free(queue_size as u64); } } @@ -148,6 +157,7 @@ mod tests { use bytes::Bytes; use futures::executor::block_on; + use mountpoint_s3_client::mock_client::MockClient; use mountpoint_s3_client::types::ETag; use proptest::proptest; use proptest_derive::Arbitrary; @@ -164,8 +174,10 @@ mod tests { enum DummyError {} async fn run_test(ops: Vec) { + let client = MockClient::new(Default::default()); + let mem_limiter = MemoryLimiter::new(client, 512 * 1024 * 1024); let part_id = ObjectId::new("key".to_owned(), ETag::for_tests()); - let (mut part_queue, part_queue_producer) = unbounded_part_queue::(); + let (mut part_queue, part_queue_producer) = unbounded_part_queue::(mem_limiter.into()); let mut current_offset = 0; let mut current_length = 0; for op in ops { diff --git a/mountpoint-s3/src/prefetch/part_stream.rs b/mountpoint-s3/src/prefetch/part_stream.rs index a01b03668..9be0d4be1 100644 --- 
a/mountpoint-s3/src/prefetch/part_stream.rs +++ b/mountpoint-s3/src/prefetch/part_stream.rs @@ -4,10 +4,12 @@ use futures::task::{Spawn, SpawnExt}; use futures::{pin_mut, Stream, StreamExt}; use mountpoint_s3_client::{types::GetObjectRequest, ObjectClient}; use std::marker::{Send, Sync}; +use std::sync::Arc; use std::{fmt::Debug, ops::Range}; use tracing::{debug_span, error, trace, Instrument}; use crate::checksums::ChecksummedBytes; +use crate::mem_limiter::MemoryLimiter; use crate::object::ObjectId; use crate::prefetch::backpressure_controller::{new_backpressure_controller, BackpressureConfig}; use crate::prefetch::part::Part; @@ -26,7 +28,8 @@ pub trait ObjectPartStream { &self, client: &Client, config: RequestTaskConfig, - ) -> RequestTask + mem_limiter: Arc>, + ) -> RequestTask where Client: ObjectClient + Clone + Send + Sync + 'static; } @@ -37,6 +40,7 @@ pub struct RequestTaskConfig { pub bucket: String, pub object_id: ObjectId, pub range: RequestRange, + pub read_part_size: usize, pub preferred_part_size: usize, pub initial_read_window_size: usize, pub max_read_window_size: usize, @@ -181,7 +185,8 @@ where &self, client: &Client, config: RequestTaskConfig, - ) -> RequestTask + mem_limiter: Arc>, + ) -> RequestTask where Client: ObjectClient + Clone + Send + Sync + 'static, { @@ -191,12 +196,17 @@ where let backpressure_config = BackpressureConfig { initial_read_window_size: config.initial_read_window_size, + // We don't want to completely block the stream so let's use + // the read part size as minimum read window. + min_read_window_size: config.read_part_size, max_read_window_size: config.max_read_window_size, read_window_size_multiplier: config.read_window_size_multiplier, request_range: range.into(), + read_part_size: config.read_part_size, }; - let (backpressure_controller, mut backpressure_limiter) = new_backpressure_controller(backpressure_config); - let (part_queue, part_queue_producer) = unbounded_part_queue(); + let (backpressure_controller, mut backpressure_limiter) = + new_backpressure_controller(backpressure_config, mem_limiter.clone()); + let (part_queue, part_queue_producer) = unbounded_part_queue(mem_limiter); trace!(?range, "spawning request"); let span = debug_span!("prefetch", ?range); @@ -230,13 +240,17 @@ where } } -struct ClientPartComposer { - part_queue_producer: PartQueueProducer, +struct ClientPartComposer { + part_queue_producer: PartQueueProducer, object_id: ObjectId, preferred_part_size: usize, } -impl ClientPartComposer { +impl ClientPartComposer +where + E: std::error::Error + Send + Sync, + Client: ObjectClient + Send + Sync + 'static, +{ async fn try_compose_parts(&self, request_stream: impl Stream>) { if let Err(e) = self.compose_parts(request_stream).await { trace!(error=?e, "part stream task failed"); diff --git a/mountpoint-s3/src/prefetch/task.rs b/mountpoint-s3/src/prefetch/task.rs index cd858d7bc..bf47190fe 100644 --- a/mountpoint-s3/src/prefetch/task.rs +++ b/mountpoint-s3/src/prefetch/task.rs @@ -1,4 +1,5 @@ use futures::future::RemoteHandle; +use mountpoint_s3_client::ObjectClient; use crate::prefetch::backpressure_controller::BackpressureFeedbackEvent::{DataRead, PartQueueStall}; use crate::prefetch::part::Part; @@ -10,22 +11,22 @@ use super::part_stream::RequestRange; /// A single GetObject request submitted to the S3 client #[derive(Debug)] -pub struct RequestTask { +pub struct RequestTask { /// Handle on the task/future. The future is cancelled when handle is dropped. 
This is None if /// the request is fake (created by seeking backwards in the stream) _task_handle: RemoteHandle<()>, remaining: usize, range: RequestRange, - part_queue: PartQueue, - backpressure_controller: BackpressureController, + part_queue: PartQueue, + backpressure_controller: BackpressureController, } -impl RequestTask { +impl RequestTask { pub fn from_handle( task_handle: RemoteHandle<()>, range: RequestRange, - part_queue: PartQueue, - backpressure_controller: BackpressureController, + part_queue: PartQueue, + backpressure_controller: BackpressureController, ) -> Self { Self { _task_handle: task_handle, From e2b68814433c4be2fc9687eaf4ec509d3a8f20c2 Mon Sep 17 00:00:00 2001 From: Vlad Volodkin Date: Fri, 27 Sep 2024 09:31:34 +0000 Subject: [PATCH 2/9] Clean up development logging Signed-off-by: Vlad Volodkin --- mountpoint-s3/src/mem_limiter.rs | 6 +++--- mountpoint-s3/src/prefetch.rs | 7 +++---- mountpoint-s3/src/prefetch/backpressure_controller.rs | 6 +++--- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/mountpoint-s3/src/mem_limiter.rs b/mountpoint-s3/src/mem_limiter.rs index ae366b621..445e21696 100644 --- a/mountpoint-s3/src/mem_limiter.rs +++ b/mountpoint-s3/src/mem_limiter.rs @@ -2,7 +2,7 @@ use std::sync::atomic::Ordering; use humansize::make_format; use metrics::atomics::AtomicU64; -use tracing::{debug, info}; +use tracing::debug; use mountpoint_s3_client::ObjectClient; @@ -78,7 +78,7 @@ impl MemoryLimiter { available_mem } - pub fn print_total_usage(&self) { + pub fn log_total_usage(&self) { let formatter = make_format(humansize::BINARY); let prefetcher_mem_used = self.prefetcher_mem_used.load(Ordering::SeqCst); let prefetcher_mem_reserved = self.prefetcher_mem_reserved.load(Ordering::SeqCst); @@ -89,7 +89,7 @@ impl MemoryLimiter { let effective_client_mem_usage = client_stats.mem_used.max(client_stats.mem_reserved); total_usage = total_usage.saturating_add(effective_client_mem_usage); - info!( + debug!( total_usage = formatter(total_usage), client_mem_used = formatter(client_stats.mem_used), client_mem_reserved = formatter(client_stats.mem_reserved), diff --git a/mountpoint-s3/src/prefetch.rs b/mountpoint-s3/src/prefetch.rs index 19f4a41f7..36bc54736 100644 --- a/mountpoint-s3/src/prefetch.rs +++ b/mountpoint-s3/src/prefetch.rs @@ -50,7 +50,7 @@ use mountpoint_s3_client::ObjectClient; use part::PartOperationError; use part_stream::RequestTaskConfig; use thiserror::Error; -use tracing::trace; +use tracing::{debug, trace}; use crate::checksums::{ChecksummedBytes, IntegrityError}; use crate::data_cache::DataCache; @@ -369,7 +369,7 @@ where if self.try_seek(offset).await? { trace!("seek succeeded"); } else { - trace!( + debug!( expected = self.next_sequential_read_offset, actual = offset, "out-of-order read, resetting prefetch" @@ -455,7 +455,6 @@ where /// Reset this prefetch request to a new offset, clearing any existing tasks queued. 
    fn reset_prefetch_to_offset(&mut self, offset: u64) {
-        tracing::warn!("resetting prefetch");
         self.backpressure_task = None;
         self.backward_seek_window.clear();
         self.sequential_read_start_offset = offset;
@@ -548,7 +547,7 @@ impl PrefetchGetObject Drop for PrefetchGetObject {
     fn drop(&mut self) {
         self.record_contiguous_read_metric();
-        self.mem_limiter.print_total_usage();
+        self.mem_limiter.log_total_usage();
     }
 }
 
diff --git a/mountpoint-s3/src/prefetch/backpressure_controller.rs b/mountpoint-s3/src/prefetch/backpressure_controller.rs
index f5495fe1b..98e507f46 100644
--- a/mountpoint-s3/src/prefetch/backpressure_controller.rs
+++ b/mountpoint-s3/src/prefetch/backpressure_controller.rs
@@ -4,7 +4,7 @@ use std::sync::Arc;
 use async_channel::{unbounded, Receiver, Sender};
 use humansize::make_format;
 use mountpoint_s3_client::ObjectClient;
-use tracing::trace;
+use tracing::{debug, trace};
 
 use crate::mem_limiter::MemoryLimiter;
 
@@ -167,7 +167,7 @@ impl BackpressureController {
             let to_increase = (new_read_window_size - self.preferred_read_window_size) as u64;
             if to_increase <= self.mem_limiter.available_mem() {
                 let formatter = make_format(humansize::BINARY);
-                tracing::info!(
+                debug!(
                     current_size = formatter(self.preferred_read_window_size),
                     new_size = formatter(new_read_window_size),
                     "scaling up preferred read window"
@@ -187,7 +187,7 @@ impl BackpressureController {
                 align(new_read_window_size, self.read_part_size, false).max(self.min_read_window_size);
 
             let formatter = make_format(humansize::BINARY);
-            tracing::info!(
+            debug!(
                 current_size = formatter(self.preferred_read_window_size),
                 new_size = formatter(new_read_window_size),
                 "scaling down read window"

From ddbc0ebe29c7fbaa4a9eb419a4f6e8dcc5f917f5 Mon Sep 17 00:00:00 2001
From: Vladislav Volodkin
Date: Fri, 27 Sep 2024 09:31:34 +0000
Subject: [PATCH 3/9] Scale up atomically, scale down after data was consumed

Signed-off-by: Vladislav Volodkin
---
 mountpoint-s3/src/mem_limiter.rs              |  71 ++++----
 .../src/prefetch/backpressure_controller.rs   | 171 +++++++++++++-----
 mountpoint-s3/src/prefetch/part_queue.rs      |  17 +-
 3 files changed, 175 insertions(+), 84 deletions(-)

diff --git a/mountpoint-s3/src/mem_limiter.rs b/mountpoint-s3/src/mem_limiter.rs
index 445e21696..d67f2427b 100644
--- a/mountpoint-s3/src/mem_limiter.rs
+++ b/mountpoint-s3/src/mem_limiter.rs
@@ -10,10 +10,8 @@ use mountpoint_s3_client::ObjectClient;
 pub struct MemoryLimiter {
     client: Client,
     mem_limit: u64,
-    /// Actual allocated memory for data in the part queue
-    prefetcher_mem_used: AtomicU64,
-    /// Reserved memory for data we have requested via the request task but may not
-    /// arrives yet.
+    /// Reserved memory for data we had requested via the request task but may not
+    /// have arrived yet.
     prefetcher_mem_reserved: AtomicU64,
     /// Additional reserved memory for other non-buffer usage like storing metadata
     additional_mem_reserved: u64,
@@ -32,30 +30,47 @@ impl MemoryLimiter {
         Self {
             client,
             mem_limit,
-            prefetcher_mem_used: AtomicU64::new(0),
             prefetcher_mem_reserved: AtomicU64::new(0),
             additional_mem_reserved: reserved_mem,
         }
     }
 
-    /// Commit the actual memory used. We only record data from the prefetcher for now.
-    pub fn allocate(&self, size: u64) {
-        self.prefetcher_mem_used.fetch_add(size, Ordering::SeqCst);
-        metrics::gauge!("prefetch.bytes_in_queue").increment(size as f64);
-    }
-
-    /// Free the actual memory used.
- pub fn free(&self, size: u64) { - self.prefetcher_mem_used.fetch_sub(size, Ordering::SeqCst); - metrics::gauge!("prefetch.bytes_in_queue").decrement(size as f64); - } - - /// Reserve the memory for future uses. + /// Reserve the memory for future uses. Always succeeds, even if it means going beyond + /// the configured memory limit. pub fn reserve(&self, size: u64) { self.prefetcher_mem_reserved.fetch_add(size, Ordering::SeqCst); metrics::gauge!("prefetch.bytes_reserved").increment(size as f64); } + /// Reserve the memory for future uses. If there is not enough memory returns `false`. + pub fn try_reserve(&self, size: u64, min_available: u64) -> bool { + loop { + let prefetcher_mem_reserved = self.prefetcher_mem_reserved.load(Ordering::SeqCst); + let new_prefetcher_mem_reserved = prefetcher_mem_reserved.saturating_add(size); + let total_mem_usage = prefetcher_mem_reserved.saturating_add(self.additional_mem_reserved); + let new_total_mem_usage = new_prefetcher_mem_reserved.saturating_add(self.additional_mem_reserved); + if new_total_mem_usage > self.mem_limit - min_available { + debug!( + "not enough memory to reserve, current usage: {}, new (if scaled up): {}, allowed diff: {}", + total_mem_usage, new_total_mem_usage, min_available, + ); + return false; + } + match self.prefetcher_mem_reserved.compare_exchange_weak( + prefetcher_mem_reserved, + new_prefetcher_mem_reserved, + Ordering::SeqCst, + Ordering::SeqCst, + ) { + Ok(_) => { + metrics::gauge!("prefetch.bytes_reserved").increment(size as f64); + return true; + } + Err(_) => continue, // another thread updated the atomic before us, trying again + } + } + } + /// Release the reserved memory. pub fn release(&self, size: u64) { self.prefetcher_mem_reserved.fetch_sub(size, Ordering::SeqCst); @@ -63,28 +78,17 @@ impl MemoryLimiter { } pub fn available_mem(&self) -> u64 { - let fs_mem_usage = self - .prefetcher_mem_used - .load(Ordering::SeqCst) - .max(self.prefetcher_mem_reserved.load(Ordering::SeqCst)); - let mut available_mem = self - .mem_limit + let fs_mem_usage = self.prefetcher_mem_reserved.load(Ordering::SeqCst); + self.mem_limit .saturating_sub(fs_mem_usage) - .saturating_sub(self.additional_mem_reserved); - if let Some(client_stats) = self.client.mem_usage_stats() { - let client_mem_usage = client_stats.mem_used.max(client_stats.mem_reserved); - available_mem = available_mem.saturating_sub(client_mem_usage); - } - available_mem + .saturating_sub(self.additional_mem_reserved) } pub fn log_total_usage(&self) { let formatter = make_format(humansize::BINARY); - let prefetcher_mem_used = self.prefetcher_mem_used.load(Ordering::SeqCst); let prefetcher_mem_reserved = self.prefetcher_mem_reserved.load(Ordering::SeqCst); - let effective_mem_used = prefetcher_mem_used.max(prefetcher_mem_reserved); - let mut total_usage = effective_mem_used.saturating_add(self.additional_mem_reserved); + let mut total_usage = prefetcher_mem_reserved.saturating_add(self.additional_mem_reserved); if let Some(client_stats) = self.client.mem_usage_stats() { let effective_client_mem_usage = client_stats.mem_used.max(client_stats.mem_reserved); total_usage = total_usage.saturating_add(effective_client_mem_usage); @@ -93,7 +97,6 @@ impl MemoryLimiter { total_usage = formatter(total_usage), client_mem_used = formatter(client_stats.mem_used), client_mem_reserved = formatter(client_stats.mem_reserved), - prefetcher_mem_used = formatter(prefetcher_mem_used), prefetcher_mem_reserved = formatter(prefetcher_mem_reserved), additional_mem_reserved = 
formatter(self.additional_mem_reserved), "total memory usage" diff --git a/mountpoint-s3/src/prefetch/backpressure_controller.rs b/mountpoint-s3/src/prefetch/backpressure_controller.rs index 98e507f46..d32c5c207 100644 --- a/mountpoint-s3/src/prefetch/backpressure_controller.rs +++ b/mountpoint-s3/src/prefetch/backpressure_controller.rs @@ -42,6 +42,9 @@ pub struct BackpressureController { /// Upper bound of the current read window. The request can return data up to this /// offset *exclusively*. This value must be advanced to continue fetching new data. read_window_end_offset: u64, + /// Next offset of the data to be read. It is used for tracking how many bytes of + /// data has been read out of the stream. + next_read_offset: u64, /// End offset for the request we want to apply backpressure. The request can return /// data up to this offset *exclusively*. request_end_offset: u64, @@ -86,6 +89,7 @@ pub fn new_backpressure_controller( max_read_window_size: config.max_read_window_size, read_window_size_multiplier: config.read_window_size_multiplier, read_window_end_offset, + next_read_offset: config.request_range.start, request_end_offset: config.request_range.end, read_part_size: config.read_part_size, mem_limiter, @@ -109,35 +113,72 @@ impl BackpressureController { match event { // Note, that this may come from a backwards seek, so offsets observed by this method are not necessarily ascending BackpressureFeedbackEvent::DataRead { offset, length } => { + // Step 2. of scale down, including the case when we're approaching the request end. See `self.scale_down` for the logic. let next_read_offset = offset + length as u64; + // We don't update `self.next_read_offset` if this feedback arrived from read after a backwards seek + if next_read_offset > self.next_read_offset { + self.next_read_offset = next_read_offset; + } + if self.next_read_offset >= self.request_end_offset { + self.next_read_offset = self.request_end_offset; + } let remaining_window = self.read_window_end_offset.saturating_sub(next_read_offset) as usize; + + let preffered_window_end_offset = self + .next_read_offset + .saturating_add(self.preferred_read_window_size as u64); + let over_reserved = self.read_window_end_offset.saturating_sub(preffered_window_end_offset); + if over_reserved > 0 { + self.mem_limiter.release((length as u64).min(over_reserved)); + } + if self.request_end_offset < preffered_window_end_offset { + // We won't need the full `preffered_window_end_offset` as we're approaching the request's end. + self.mem_limiter.release(length as u64); + } + // Increment the read window only if the remaining window reaches some threshold i.e. half of it left. - while remaining_window < (self.preferred_read_window_size / 2) + if remaining_window < (self.preferred_read_window_size / 2) && self.read_window_end_offset < self.request_end_offset { - let available_mem = self.mem_limiter.available_mem(); - // If the preferred read window size is still large and available memory is getting low we will try to scale it down. - if self.preferred_read_window_size > self.min_read_window_size - && available_mem < self.preferred_read_window_size as u64 - { - self.try_scaling_down(); - continue; + // If there is not enough available memory in the system, we'll try to reduce the read window of the current request. + // We define "not enough memory" as a situation where no new request with a minimum window may fit in the limit. 
+                        // +                        // Scaling down is best effort, meaning that there is no guarantee that after this action such a +                        // request will fit in memory. This may not be the case if during the scale down a new memory reservation was made by +                        // another request. +                        // +                        // We reduce the frequency of scale downs by only performing it when a sufficient amount of data (half of the read window) +                        // has been read. +                        let mut available_mem = self.mem_limiter.available_mem(); +                        let mut new_read_window_size = self.preferred_read_window_size; // new_preferred_read_window_size is just too wordy +                        while available_mem < self.min_read_window_size as u64 && self.read_window_size_multiplier > 1 { +                            let scaled_down = new_read_window_size / self.read_window_size_multiplier; +                            if scaled_down < self.min_read_window_size { +                                break; +                            } +                            available_mem += (new_read_window_size - scaled_down) as u64; +                            new_read_window_size = scaled_down; +                        } +                        if new_read_window_size != self.preferred_read_window_size { +                            self.scale_down(new_read_window_size); }                     let new_read_window_end_offset = next_read_offset                         .saturating_add(self.preferred_read_window_size as u64)                         .min(self.request_end_offset); -                    debug_assert!(self.read_window_end_offset < new_read_window_end_offset); -                    let to_increase = new_read_window_end_offset.saturating_sub(self.read_window_end_offset) as usize; -                    trace!( -                        preferred_read_window_size = self.preferred_read_window_size, -                        next_read_offset, -                        read_window_end_offset = self.read_window_end_offset, -                        to_increase, -                        "incrementing read window" -                    ); -                    self.increment_read_window(to_increase).await; -                    self.read_window_end_offset = new_read_window_end_offset; +  +                    if self.read_window_end_offset < new_read_window_end_offset { +                        let to_increase = +                            new_read_window_end_offset.saturating_sub(self.read_window_end_offset) as usize; +                        trace!( +                            preferred_read_window_size = self.preferred_read_window_size, +                            next_read_offset = self.next_read_offset, +                            read_window_end_offset = self.read_window_end_offset, +                            to_increase, +                            "incrementing read window" +                        ); +                        self.increment_read_window(to_increase).await; +                    }                 }             }             BackpressureFeedbackEvent::PartQueueStall => self.try_scaling_up(), @@ -146,16 +187,18 @@ impl BackpressureController {     }      // Send an increment read window request to the stream producer -    async fn increment_read_window(&self, len: usize) { +    async fn increment_read_window(&mut self, len: usize) {         // This should not block since the channel is unbounded         let _ = self             .read_window_updater             .send(len)             .await             .inspect_err(|_| trace!("read window incrementing queue is already closed")); +        self.read_window_end_offset += len as u64;     }      // Try scaling up preferred read window size with a multiplier configured at initialization. +    // Scaling up fails silently if there is not enough free memory to perform it.
fn try_scaling_up(&mut self) { if self.preferred_read_window_size < self.max_read_window_size { let new_read_window_size = self.preferred_read_window_size * self.read_window_size_multiplier; @@ -165,43 +208,89 @@ impl BackpressureController { // Only scale up when there is enough memory let to_increase = (new_read_window_size - self.preferred_read_window_size) as u64; - if to_increase <= self.mem_limiter.available_mem() { + if self + .mem_limiter + .try_reserve(to_increase, self.min_read_window_size as u64) + { let formatter = make_format(humansize::BINARY); debug!( current_size = formatter(self.preferred_read_window_size), new_size = formatter(new_read_window_size), - "scaling up preferred read window" + "scaled up preferred read window" ); - self.mem_limiter.release(self.preferred_read_window_size as u64); - self.mem_limiter.reserve(new_read_window_size as u64); self.preferred_read_window_size = new_read_window_size; + metrics::histogram!("prefetch.window_after_increase_mib") + .record((self.preferred_read_window_size / 1024 / 1024) as f64); } } } - pub fn try_scaling_down(&mut self) { - if self.preferred_read_window_size > self.min_read_window_size { - let new_read_window_size = self.preferred_read_window_size / self.read_window_size_multiplier; - // Also align the new read window size to the client part size - let new_read_window_size = - align(new_read_window_size, self.read_part_size, false).max(self.min_read_window_size); + pub fn scale_down(&mut self, new_read_window_size: usize) { + /* + Scaling down is performed in 2 steps, one in this method and another on read. Note that `window_end_offset` is the value + which is set in CRT and it may not be decreased. This function implements step 1. - let formatter = make_format(humansize::BINARY); - debug!( - current_size = formatter(self.preferred_read_window_size), - new_size = formatter(new_read_window_size), - "scaling down read window" - ); - self.mem_limiter.release(self.preferred_read_window_size as u64); - self.mem_limiter.reserve(new_read_window_size as u64); - self.preferred_read_window_size = new_read_window_size; - } + 0. Before scale down: + + read_until window_end_offset preferred_window_end_offset + │ │ │ + ────┼───────────────────────────────────────────────────────────┼───────────────┼─────────────────────────────────► + │ │ + └───────────────────────────────────────────────────────────────────────────┘ + preferred_read_window_size + + 1. Scaling down (`new_read_window_size` is applied): + + read_until preferred_window_end_offset window_end_offset preferred_window_end_offset_old + │ │ │ │ + ────┼──────────────────────────────────────┼────────────────────┼───────────────┼─────────────────────────────────► + │ ├───────────────┘ + └────────────────────┘ released immediatelly + over_reserved + + 2. Part read: + + read_until(old) read_until preferred_window_end_offset window_end_offset + │ │ │ │ + ────┼────────────┼─────────────────────────────────────┼────────┼─────────────────────────────────────────────────► + └────────────┘ └────────┘ + released on read: over_reserved (new) + 1. if over_reserved > 0 + 2. 
min(part.size(), over_reserved) is to deduct + */ + // Align the new read window size to the client part size + let new_read_window_size = + align(new_read_window_size, self.read_part_size, false).max(self.min_read_window_size); + + let formatter = make_format(humansize::BINARY); + debug!( + current_size = formatter(self.preferred_read_window_size), + new_size = formatter(new_read_window_size), + "scaling down read window" + ); + let preferred_window_end_offset_old = self + .next_read_offset + .saturating_add(self.preferred_read_window_size as u64); + let preferred_window_end_offset = self.next_read_offset.saturating_add(new_read_window_size as u64); + // In most cases we'll keep memory reserved for `self.read_window_end_offset`, but if the new + // `preferred_window_end_offset` is greater, we'll reserve for it instead. + let reserve_until_offset = self.read_window_end_offset.max(preferred_window_end_offset); + let to_release = preferred_window_end_offset_old.saturating_sub(reserve_until_offset); + self.mem_limiter.release(to_release); + self.preferred_read_window_size = new_read_window_size; + metrics::histogram!("prefetch.window_after_decrease_mib") + .record((self.preferred_read_window_size / 1024 / 1024) as f64); } } impl Drop for BackpressureController { fn drop(&mut self) { - self.mem_limiter.release(self.preferred_read_window_size as u64); + // When approaching request end we have less memory still reserved than `self.preferred_read_window_size`. + debug_assert!(self.request_end_offset >= self.next_read_offset); + let remaining_in_request = self.request_end_offset.saturating_sub(self.next_read_offset); + + self.mem_limiter + .release((self.preferred_read_window_size as u64).min(remaining_in_request)); } } diff --git a/mountpoint-s3/src/prefetch/part_queue.rs b/mountpoint-s3/src/prefetch/part_queue.rs index 16cf99b07..21adc8865 100644 --- a/mountpoint-s3/src/prefetch/part_queue.rs +++ b/mountpoint-s3/src/prefetch/part_queue.rs @@ -22,7 +22,7 @@ pub struct PartQueue { failed: bool, /// The total number of bytes sent to the underlying queue of `self.receiver` bytes_received: Arc, - mem_limiter: Arc>, + _mem_limiter: Arc>, // todo: remove } /// Producer side of the queue of [Part]s. @@ -31,7 +31,7 @@ pub struct PartQueueProducer { sender: Sender>>, /// The total number of bytes sent to `self.sender` bytes_sent: Arc, - mem_limiter: Arc>, + _mem_limiter: Arc>, } /// Creates an unbounded [PartQueue] and its related [PartQueueProducer]. 
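To make the release arithmetic above concrete: after a scale-down, memory is handed back only as reads consume data, at most `min(length, over_reserved)` bytes per read. A minimal, self-contained sketch of that calculation (illustrative only; `releasable_on_read` is a hypothetical helper with made-up values, not part of this patch):

```rust
/// How much of an over-sized reservation can be handed back when `length` bytes are read.
/// The names mirror the fields discussed in the scale-down comments above.
fn releasable_on_read(
    next_read_offset: u64,           // offset just past the data that was read
    read_window_end_offset: u64,     // window end already granted to the client
    preferred_read_window_size: u64, // preferred window size after the scale-down
    length: u64,                     // bytes consumed by this read
) -> u64 {
    let preferred_window_end_offset = next_read_offset.saturating_add(preferred_read_window_size);
    let over_reserved = read_window_end_offset.saturating_sub(preferred_window_end_offset);
    // Release at most what was just consumed; later reads return the rest.
    length.min(over_reserved)
}

#[test]
fn releases_follow_reads_after_scale_down() {
    // The window was already granted up to 16 MiB, but the preferred window shrank to 8 MiB.
    let released = releasable_on_read(1024 * 1024, 16 * 1024 * 1024, 8 * 1024 * 1024, 1024 * 1024);
    // Only the 1 MiB that was just read is released; the remaining 6 MiB of
    // over-reservation is returned by subsequent reads.
    assert_eq!(released, 1024 * 1024);
}
```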
@@ -45,12 +45,12 @@ pub fn unbounded_part_queue( receiver, failed: false, bytes_received: Arc::clone(&bytes_counter), - mem_limiter: mem_limiter.clone(), + _mem_limiter: mem_limiter.clone(), }; let part_queue_producer = PartQueueProducer { sender, bytes_sent: bytes_counter, - mem_limiter, + _mem_limiter: mem_limiter, }; (part_queue, part_queue_producer) } @@ -95,7 +95,7 @@ impl PartQueue PartQueue Result<(), PrefetchReadError> { assert!(!self.failed, "cannot use a PartQueue after failure"); - let part_len = part.len() as u64; + metrics::gauge!("prefetch.bytes_in_queue").increment(part.len() as f64); self.front_queue.push(part); - self.mem_limiter.allocate(part_len); Ok(()) } @@ -125,7 +124,7 @@ impl PartQueueProducer trace!("closed channel"); } else { self.bytes_sent.fetch_add(part_len, Ordering::SeqCst); - self.mem_limiter.allocate(part_len as u64); + metrics::gauge!("prefetch.bytes_in_queue").increment(part_len as f64); } } } @@ -144,7 +143,7 @@ impl Drop for PartQueue { for part in &self.front_queue { queue_size += part.len() } - self.mem_limiter.free(queue_size as u64); + metrics::gauge!("prefetch.bytes_in_queue").decrement(queue_size as f64); } } From 0204a720a997b6a707ec87fe678b321894168b2a Mon Sep 17 00:00:00 2001 From: Vladislav Volodkin Date: Fri, 27 Sep 2024 09:37:16 +0000 Subject: [PATCH 4/9] Remove Client from MemoryLimiter, document this structure Signed-off-by: Vladislav Volodkin --- mountpoint-s3-client/src/failure_client.rs | 8 +-- mountpoint-s3-client/src/mock_client.rs | 10 +--- .../src/mock_client/throughput_client.rs | 6 +- mountpoint-s3-client/src/object_client.rs | 14 ----- mountpoint-s3-client/src/s3_crt_client.rs | 7 --- mountpoint-s3/examples/prefetch_benchmark.rs | 2 +- mountpoint-s3/src/fs.rs | 4 +- mountpoint-s3/src/mem_limiter.rs | 57 ++++++++++--------- mountpoint-s3/src/prefetch.rs | 31 +++++----- .../src/prefetch/backpressure_controller.rs | 15 +++-- mountpoint-s3/src/prefetch/caching_stream.rs | 27 +++++---- mountpoint-s3/src/prefetch/part_queue.rs | 25 +++----- mountpoint-s3/src/prefetch/part_stream.rs | 17 +++--- mountpoint-s3/src/prefetch/task.rs | 13 ++--- 14 files changed, 95 insertions(+), 141 deletions(-) diff --git a/mountpoint-s3-client/src/failure_client.rs b/mountpoint-s3-client/src/failure_client.rs index bbc5cf254..b7d78a25d 100644 --- a/mountpoint-s3-client/src/failure_client.rs +++ b/mountpoint-s3-client/src/failure_client.rs @@ -16,8 +16,8 @@ use pin_project::pin_project; use crate::object_client::{ DeleteObjectError, DeleteObjectResult, ETag, GetBodyPart, GetObjectAttributesError, GetObjectAttributesResult, GetObjectError, GetObjectRequest, HeadObjectError, HeadObjectResult, ListObjectsError, ListObjectsResult, - MemoryUsageStats, ObjectAttribute, ObjectClientError, ObjectClientResult, PutObjectError, PutObjectParams, - PutObjectRequest, PutObjectResult, UploadReview, + ObjectAttribute, ObjectClientError, ObjectClientResult, PutObjectError, PutObjectParams, PutObjectRequest, + PutObjectResult, UploadReview, }; use crate::ObjectClient; @@ -85,10 +85,6 @@ where self.client.initial_read_window_size() } - fn mem_usage_stats(&self) -> Option { - self.client.mem_usage_stats() - } - async fn delete_object( &self, bucket: &str, diff --git a/mountpoint-s3-client/src/mock_client.rs b/mountpoint-s3-client/src/mock_client.rs index 75d8d47e3..b663a9463 100644 --- a/mountpoint-s3-client/src/mock_client.rs +++ b/mountpoint-s3-client/src/mock_client.rs @@ -26,9 +26,9 @@ use crate::error_metadata::{ClientErrorMetadata, ProvideErrorMetadata}; use 
crate::object_client::{ Checksum, ChecksumAlgorithm, DeleteObjectError, DeleteObjectResult, ETag, GetBodyPart, GetObjectAttributesError, GetObjectAttributesParts, GetObjectAttributesResult, GetObjectError, GetObjectRequest, HeadObjectError, - HeadObjectResult, ListObjectsError, ListObjectsResult, MemoryUsageStats, ObjectAttribute, ObjectClient, - ObjectClientError, ObjectClientResult, ObjectInfo, ObjectPart, PutObjectError, PutObjectParams, PutObjectRequest, - PutObjectResult, PutObjectTrailingChecksums, RestoreStatus, UploadReview, UploadReviewPart, + HeadObjectResult, ListObjectsError, ListObjectsResult, ObjectAttribute, ObjectClient, ObjectClientError, + ObjectClientResult, ObjectInfo, ObjectPart, PutObjectError, PutObjectParams, PutObjectRequest, PutObjectResult, + PutObjectTrailingChecksums, RestoreStatus, UploadReview, UploadReviewPart, }; mod leaky_bucket; @@ -572,10 +572,6 @@ impl ObjectClient for MockClient { } } - fn mem_usage_stats(&self) -> Option { - None - } - async fn delete_object( &self, bucket: &str, diff --git a/mountpoint-s3-client/src/mock_client/throughput_client.rs b/mountpoint-s3-client/src/mock_client/throughput_client.rs index 610f81ef4..065791069 100644 --- a/mountpoint-s3-client/src/mock_client/throughput_client.rs +++ b/mountpoint-s3-client/src/mock_client/throughput_client.rs @@ -13,7 +13,7 @@ use crate::mock_client::{MockClient, MockClientConfig, MockClientError, MockObje use crate::object_client::{ DeleteObjectError, DeleteObjectResult, GetBodyPart, GetObjectAttributesError, GetObjectAttributesResult, GetObjectError, GetObjectRequest, HeadObjectError, HeadObjectResult, ListObjectsError, ListObjectsResult, - MemoryUsageStats, ObjectAttribute, ObjectClient, ObjectClientResult, PutObjectError, PutObjectParams, + ObjectAttribute, ObjectClient, ObjectClientResult, PutObjectError, PutObjectParams, }; use crate::types::ETag; @@ -113,10 +113,6 @@ impl ObjectClient for ThroughputMockClient { self.inner.initial_read_window_size() } - fn mem_usage_stats(&self) -> Option { - self.inner.mem_usage_stats() - } - async fn delete_object( &self, bucket: &str, diff --git a/mountpoint-s3-client/src/object_client.rs b/mountpoint-s3-client/src/object_client.rs index 22b10d9f0..b92bf796c 100644 --- a/mountpoint-s3-client/src/object_client.rs +++ b/mountpoint-s3-client/src/object_client.rs @@ -63,16 +63,6 @@ impl FromStr for ETag { } } -/// Memory usage stats for the client -pub struct MemoryUsageStats { - /// Reserved memory for the client. For [S3CrtClient], this value is a sum of primary storage - /// and secondary storage reserved memory. - pub mem_reserved: u64, - /// Actual used memory for the client. For [S3CrtClient], this value is a sum of primanry - /// storage and secondary storage used memory. - pub mem_used: u64, -} - /// A generic interface to S3-like object storage services. /// /// This trait defines the common methods that all object services implement. @@ -99,10 +89,6 @@ pub trait ObjectClient { /// This can be `None` if backpressure is disabled. fn initial_read_window_size(&self) -> Option; - /// Query current memory usage stats for the client. This can be `None` if the client - /// does not record the stats. - fn mem_usage_stats(&self) -> Option; - /// Delete a single object from the object store. /// /// DeleteObject will succeed even if the object within the bucket does not exist. 
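With the client dependency removed, the limiter is constructed from just a byte limit and shared across file handles, as the mountpoint-s3 changes further down in this patch show. A minimal usage sketch of the API as it stands at this point in the series, assuming the module remains public as `mountpoint_s3::mem_limiter` and keeping the two-argument `try_reserve(size, min_available)` signature (paths and values here are illustrative, not taken from the patch):

```rust
use std::sync::Arc;

use mountpoint_s3::mem_limiter::MemoryLimiter;

fn mem_limiter_usage_sketch() {
    // One limiter instance is shared by all prefetchers (file handles).
    let limiter = Arc::new(MemoryLimiter::new(512 * 1024 * 1024));

    // The initial read window reservation always succeeds, even past the limit,
    // so every file handle can make progress.
    let initial_window: u64 = 1024 * 1024 + 128 * 1024;
    limiter.reserve(initial_window);

    // Growing the window is best effort: the reservation is refused unless at
    // least `min_available` bytes would remain under the limit afterwards.
    let window_increase: u64 = 2 * 1024 * 1024;
    if limiter.try_reserve(window_increase, initial_window) {
        // ... widen the backpressure read window ...
        limiter.release(window_increase);
    }

    limiter.release(initial_window);
}
```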
diff --git a/mountpoint-s3-client/src/s3_crt_client.rs b/mountpoint-s3-client/src/s3_crt_client.rs index caa3a7e87..caf6bab45 100644 --- a/mountpoint-s3-client/src/s3_crt_client.rs +++ b/mountpoint-s3-client/src/s3_crt_client.rs @@ -1208,13 +1208,6 @@ impl ObjectClient for S3CrtClient { } } - fn mem_usage_stats(&self) -> Option { - let crt_buffer_pool_stats = self.inner.s3_client.poll_buffer_pool_usage_stats(); - let mem_reserved = crt_buffer_pool_stats.primary_reserved + crt_buffer_pool_stats.secondary_reserved; - let mem_used = crt_buffer_pool_stats.primary_used + crt_buffer_pool_stats.secondary_used; - Some(MemoryUsageStats { mem_reserved, mem_used }) - } - async fn delete_object( &self, bucket: &str, diff --git a/mountpoint-s3/examples/prefetch_benchmark.rs b/mountpoint-s3/examples/prefetch_benchmark.rs index 7532edde4..5412f0e9e 100644 --- a/mountpoint-s3/examples/prefetch_benchmark.rs +++ b/mountpoint-s3/examples/prefetch_benchmark.rs @@ -93,7 +93,7 @@ fn main() { config = config.part_size(part_size); } let client = Arc::new(S3CrtClient::new(config).expect("couldn't create client")); - let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), 512 * 1024 * 1024)); + let mem_limiter = Arc::new(MemoryLimiter::new(512 * 1024 * 1024)); let head_object_result = block_on(client.head_object(bucket, key)).expect("HeadObject failed"); let size = head_object_result.object.size; diff --git a/mountpoint-s3/src/fs.rs b/mountpoint-s3/src/fs.rs index 4e2af1a78..63b9bce46 100644 --- a/mountpoint-s3/src/fs.rs +++ b/mountpoint-s3/src/fs.rs @@ -536,7 +536,7 @@ where { config: S3FilesystemConfig, client: Client, - mem_limiter: Arc>, + mem_limiter: Arc, superblock: Superblock, prefetcher: Prefetcher, uploader: Uploader, @@ -567,7 +567,7 @@ where s3_personality: config.s3_personality, }; let superblock = Superblock::new(bucket, prefix, superblock_config); - let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), config.mem_limit)); + let mem_limiter = Arc::new(MemoryLimiter::new(config.mem_limit)); let uploader = Uploader::new( client.clone(), config.storage_class.to_owned(), diff --git a/mountpoint-s3/src/mem_limiter.rs b/mountpoint-s3/src/mem_limiter.rs index d67f2427b..bd1810fcd 100644 --- a/mountpoint-s3/src/mem_limiter.rs +++ b/mountpoint-s3/src/mem_limiter.rs @@ -4,11 +4,35 @@ use humansize::make_format; use metrics::atomics::AtomicU64; use tracing::debug; -use mountpoint_s3_client::ObjectClient; - +/// `MemoryLimiter` tracks memory used by Mountpoint and makes decisions if a new memory reservation request can be accepted. +/// Currently the only metric which we take into account is the memory reserved by prefetcher instances for the data requested or +/// fetched from CRT client. Single instance of this struct is shared among all of the prefetchers (file handles). +/// +/// Each file handle upon creation makes an initial reservation request with a minimal read window size of `1MiB + 128KiB`. This +/// is accepted unconditionally since we want to allow any file handle to make progress even if that means going over the memory +/// limit. Additional reservations for a file handle arise when data is being read from fuse **faster** than it arrives from the +/// client (PartQueueStall). Those reservations may be rejected if there is no available memory. 
+/// +/// Release of the reserved memory happens on one of the following events: +/// 1) prefetcher is destroyed (`PartQueue` holding the data should be dropped and the CRT request cancelled before this release) +/// 2) prefetcher's read window is scaled down (we wait for the previously requested data to be consumed) +/// 3) prefetcher is approaching the end of the request, in which case we can be sure that reservation in full won't be needed. +/// +/// Following is the visualisation of a single prefetcher instance's data stream: +/// +/// backwards_seek_start next_read_offset in_part_queue window_end_offset preferred_window_end_offset +/// │ │ │ │ │ +/// ─┼────────────────────┼───────────────────────────┼───────────────────────────────┼────────────────────────────┼───────────-► +/// │ ├───────────────────────────┤ │ │ +/// └────────────────────┤ certainly used memory └───────────────────────────────┤ │ +/// memory not accounted │ in CRT buffer, or callback queue └────────────────────────────┤ +/// │ (usage may be less than reserved) will be used after the │ +/// │ window increase │ +/// └────────────────────────────────────────────────────────────────────────────────────────┘ +/// preferred_read_window_size (reserved in MemoryLimiter) +/// #[derive(Debug)] -pub struct MemoryLimiter { - client: Client, +pub struct MemoryLimiter { mem_limit: u64, /// Reserved memory for data we had requested via the request task but may not /// arrived yet. @@ -17,8 +41,8 @@ pub struct MemoryLimiter { additional_mem_reserved: u64, } -impl MemoryLimiter { - pub fn new(client: Client, mem_limit: u64) -> Self { +impl MemoryLimiter { + pub fn new(mem_limit: u64) -> Self { let min_reserved = 128 * 1024 * 1024; let reserved_mem = (mem_limit / 8).max(min_reserved); let formatter = make_format(humansize::BINARY); @@ -28,7 +52,6 @@ impl MemoryLimiter { formatter(reserved_mem) ); Self { - client, mem_limit, prefetcher_mem_reserved: AtomicU64::new(0), additional_mem_reserved: reserved_mem, @@ -83,24 +106,4 @@ impl MemoryLimiter { .saturating_sub(fs_mem_usage) .saturating_sub(self.additional_mem_reserved) } - - pub fn log_total_usage(&self) { - let formatter = make_format(humansize::BINARY); - let prefetcher_mem_reserved = self.prefetcher_mem_reserved.load(Ordering::SeqCst); - - let mut total_usage = prefetcher_mem_reserved.saturating_add(self.additional_mem_reserved); - if let Some(client_stats) = self.client.mem_usage_stats() { - let effective_client_mem_usage = client_stats.mem_used.max(client_stats.mem_reserved); - total_usage = total_usage.saturating_add(effective_client_mem_usage); - - debug!( - total_usage = formatter(total_usage), - client_mem_used = formatter(client_stats.mem_used), - client_mem_reserved = formatter(client_stats.mem_reserved), - prefetcher_mem_reserved = formatter(prefetcher_mem_reserved), - additional_mem_reserved = formatter(self.additional_mem_reserved), - "total memory usage" - ); - } - } } diff --git a/mountpoint-s3/src/prefetch.rs b/mountpoint-s3/src/prefetch.rs index 36bc54736..1312068db 100644 --- a/mountpoint-s3/src/prefetch.rs +++ b/mountpoint-s3/src/prefetch.rs @@ -70,7 +70,7 @@ pub trait Prefetch { fn prefetch( &self, client: Client, - mem_limiter: Arc>, + mem_limiter: Arc, bucket: String, object_id: ObjectId, size: u64, @@ -239,7 +239,7 @@ where fn prefetch( &self, client: Client, - mem_limiter: Arc>, + mem_limiter: Arc, bucket: String, object_id: ObjectId, size: u64, @@ -265,9 +265,9 @@ where pub struct PrefetchGetObject { client: Client, part_stream: Arc, - mem_limiter: Arc>, + 
mem_limiter: Arc, config: PrefetcherConfig, - backpressure_task: Option>, + backpressure_task: Option>, // Invariant: the offset of the last byte in this window is always // self.next_sequential_read_offset - 1. backward_seek_window: SeekWindow, @@ -319,7 +319,7 @@ where fn new( client: Client, part_stream: Arc, - mem_limiter: Arc>, + mem_limiter: Arc, config: PrefetcherConfig, bucket: String, object_id: ObjectId, @@ -420,7 +420,7 @@ where /// We will be using flow-control window to control how much data we want to download into the prefetcher. fn spawn_read_backpressure_request( &mut self, - ) -> Result, PrefetchReadError> { + ) -> Result, PrefetchReadError> { let start = self.next_sequential_read_offset; let object_size = self.size as usize; let read_part_size = self.client.read_part_size().unwrap_or(8 * 1024 * 1024); @@ -547,7 +547,6 @@ impl PrefetchGetObject Drop for PrefetchGetObject { fn drop(&mut self) { self.record_contiguous_read_metric(); - self.mem_limiter.log_total_usage(); } } @@ -616,7 +615,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(512 * 1024 * 1024); let object = MockObject::ramp(0xaa, size as usize, ETag::for_tests()); let etag = object.etag(); @@ -711,7 +710,7 @@ mod tests { Stream: ObjectPartStream + Send + Sync + 'static, { let client = Arc::new(MockClient::new(client_config)); - let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(512 * 1024 * 1024); let read_size = 1 * MB; let object_size = 8 * MB; let object = MockObject::ramp(0xaa, object_size, ETag::for_tests()); @@ -818,7 +817,7 @@ mod tests { HashMap::new(), HashMap::new(), )); - let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(512 * 1024 * 1024); let prefetcher_config = PrefetcherConfig { max_read_window_size: test_config.max_read_window_size, @@ -943,7 +942,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(512 * 1024 * 1024); let object = MockObject::ramp(0xaa, object_size as usize, ETag::for_tests()); let etag = object.etag(); @@ -1127,7 +1126,7 @@ mod tests { HashMap::new(), HashMap::new(), )); - let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(512 * 1024 * 1024); let prefetcher = Prefetcher::new(default_stream(), Default::default()); let mem_limiter = Arc::new(mem_limiter); @@ -1180,7 +1179,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), 512 * 1024 * 1024)); + let mem_limiter = Arc::new(MemoryLimiter::new(512 * 1024 * 1024)); let object = MockObject::ramp(0xaa, OBJECT_SIZE, ETag::for_tests()); let etag = object.etag(); @@ -1222,7 +1221,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), 512 * 1024 * 1024)); + let mem_limiter = Arc::new(MemoryLimiter::new(512 * 1024 * 1024)); let object = MockObject::ramp(0xaa, OBJECT_SIZE, ETag::for_tests()); let etag = object.etag(); @@ -1284,7 +1283,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = 
MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(512 * 1024 * 1024); let object = MockObject::ramp(0xaa, object_size as usize, ETag::for_tests()); let file_etag = object.etag(); @@ -1350,7 +1349,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(512 * 1024 * 1024); let object = MockObject::ramp(0xaa, object_size as usize, ETag::for_tests()); let file_etag = object.etag(); diff --git a/mountpoint-s3/src/prefetch/backpressure_controller.rs b/mountpoint-s3/src/prefetch/backpressure_controller.rs index d32c5c207..7fadacb99 100644 --- a/mountpoint-s3/src/prefetch/backpressure_controller.rs +++ b/mountpoint-s3/src/prefetch/backpressure_controller.rs @@ -3,7 +3,6 @@ use std::sync::Arc; use async_channel::{unbounded, Receiver, Sender}; use humansize::make_format; -use mountpoint_s3_client::ObjectClient; use tracing::{debug, trace}; use crate::mem_limiter::MemoryLimiter; @@ -33,7 +32,7 @@ pub struct BackpressureConfig { } #[derive(Debug)] -pub struct BackpressureController { +pub struct BackpressureController { read_window_updater: Sender, preferred_read_window_size: usize, min_read_window_size: usize, @@ -49,7 +48,7 @@ pub struct BackpressureController { /// data up to this offset *exclusively*. request_end_offset: u64, read_part_size: usize, - mem_limiter: Arc>, + mem_limiter: Arc, } #[derive(Debug)] @@ -75,10 +74,10 @@ pub struct BackpressureLimiter { /// [BackpressureController] will be given to the consumer side of the object stream. /// It can be used anywhere to set preferred read window size for the stream and tell the /// producer when its read window should be increased. -pub fn new_backpressure_controller( +pub fn new_backpressure_controller( config: BackpressureConfig, - mem_limiter: Arc>, -) -> (BackpressureController, BackpressureLimiter) { + mem_limiter: Arc, +) -> (BackpressureController, BackpressureLimiter) { let read_window_end_offset = config.request_range.start + config.initial_read_window_size as u64; let (read_window_updater, read_window_incrementing_queue) = unbounded(); mem_limiter.reserve(config.initial_read_window_size as u64); @@ -102,7 +101,7 @@ pub fn new_backpressure_controller( (controller, limiter) } -impl BackpressureController { +impl BackpressureController { pub fn read_window_end_offset(&self) -> u64 { self.read_window_end_offset } @@ -283,7 +282,7 @@ impl BackpressureController { } } -impl Drop for BackpressureController { +impl Drop for BackpressureController { fn drop(&mut self) { // When approaching request end we have less memory still reserved than `self.preferred_read_window_size`. 
debug_assert!(self.request_end_offset >= self.next_read_offset); diff --git a/mountpoint-s3/src/prefetch/caching_stream.rs b/mountpoint-s3/src/prefetch/caching_stream.rs index dcdee2e18..d5738c278 100644 --- a/mountpoint-s3/src/prefetch/caching_stream.rs +++ b/mountpoint-s3/src/prefetch/caching_stream.rs @@ -46,8 +46,8 @@ where &self, client: &Client, config: RequestTaskConfig, - mem_limiter: Arc>, - ) -> RequestTask<::ClientError, Client> + mem_limiter: Arc, + ) -> RequestTask<::ClientError> where Client: ObjectClient + Clone + Send + Sync + 'static, { @@ -63,7 +63,7 @@ where }; let (backpressure_controller, backpressure_limiter) = new_backpressure_controller(backpressure_config, mem_limiter.clone()); - let (part_queue, part_queue_producer) = unbounded_part_queue(mem_limiter); + let (part_queue, part_queue_producer) = unbounded_part_queue(); trace!(?range, "spawning request"); let request_task = { @@ -130,7 +130,7 @@ where async fn get_from_cache( mut self, range: RequestRange, - part_queue_producer: PartQueueProducer, + part_queue_producer: PartQueueProducer, ) { let cache_key = &self.config.object_id; let block_size = self.cache.block_size(); @@ -189,7 +189,7 @@ where &mut self, range: RequestRange, block_range: Range, - part_queue_producer: PartQueueProducer, + part_queue_producer: PartQueueProducer, ) { let bucket = &self.config.bucket; let cache_key = &self.config.object_id; @@ -243,8 +243,8 @@ where } } -struct CachingPartComposer { - part_queue_producer: PartQueueProducer, +struct CachingPartComposer { + part_queue_producer: PartQueueProducer, cache_key: ObjectId, original_range: RequestRange, block_index: u64, @@ -253,12 +253,11 @@ struct CachingPartComposer CachingPartComposer +impl CachingPartComposer where E: std::error::Error + Send + Sync, Cache: DataCache + Send + Sync + 'static, - Runtime: Spawn, - Client: ObjectClient + Send + Sync + 'static, + Runtime: Spawn { async fn try_compose_parts(&mut self, request_stream: impl Stream>) { if let Err(e) = self.compose_parts(request_stream).await { @@ -434,7 +433,7 @@ mod tests { ..Default::default() }; let mock_client = Arc::new(MockClient::new(config)); - let mem_limiter = Arc::new(MemoryLimiter::new(mock_client.clone(), 512 * 1024 * 1024)); + let mem_limiter = Arc::new(MemoryLimiter::new(512 * 1024 * 1024)); mock_client.add_object(key, object.clone()); let runtime = ThreadPool::builder().pool_size(1).create().unwrap(); @@ -515,7 +514,7 @@ mod tests { ..Default::default() }; let mock_client = Arc::new(MockClient::new(config)); - let mem_limiter = Arc::new(MemoryLimiter::new(mock_client.clone(), 512 * 1024 * 1024)); + let mem_limiter = Arc::new(MemoryLimiter::new(512 * 1024 * 1024)); mock_client.add_object(key, object.clone()); let runtime = ThreadPool::builder().pool_size(1).create().unwrap(); @@ -539,10 +538,10 @@ mod tests { } } - fn compare_read( + fn compare_read( id: &ObjectId, object: &MockObject, - mut request_task: RequestTask, + mut request_task: RequestTask, ) { let mut offset = request_task.start_offset(); let mut remaining = request_task.total_size(); diff --git a/mountpoint-s3/src/prefetch/part_queue.rs b/mountpoint-s3/src/prefetch/part_queue.rs index 21adc8865..08778666a 100644 --- a/mountpoint-s3/src/prefetch/part_queue.rs +++ b/mountpoint-s3/src/prefetch/part_queue.rs @@ -1,9 +1,7 @@ -use mountpoint_s3_client::ObjectClient; use std::time::Instant; use tracing::trace; -use crate::mem_limiter::MemoryLimiter; use crate::prefetch::part::Part; use crate::prefetch::PrefetchReadError; use 
crate::sync::async_channel::{unbounded, Receiver, RecvError, Sender}; @@ -13,7 +11,7 @@ use crate::sync::Arc; /// A queue of [Part]s where the first part can be partially read from if the reader doesn't want /// the entire part in one shot. #[derive(Debug)] -pub struct PartQueue { +pub struct PartQueue { /// The auxiliary queue that supports pushing parts to the front of the part queue in order to /// allow partial reads and backwards seeks. front_queue: Vec, @@ -22,22 +20,18 @@ pub struct PartQueue { failed: bool, /// The total number of bytes sent to the underlying queue of `self.receiver` bytes_received: Arc, - _mem_limiter: Arc>, // todo: remove } /// Producer side of the queue of [Part]s. #[derive(Debug)] -pub struct PartQueueProducer { +pub struct PartQueueProducer { sender: Sender>>, /// The total number of bytes sent to `self.sender` bytes_sent: Arc, - _mem_limiter: Arc>, } /// Creates an unbounded [PartQueue] and its related [PartQueueProducer]. -pub fn unbounded_part_queue( - mem_limiter: Arc>, -) -> (PartQueue, PartQueueProducer) { +pub fn unbounded_part_queue() -> (PartQueue, PartQueueProducer) { let (sender, receiver) = unbounded(); let bytes_counter = Arc::new(AtomicUsize::new(0)); let part_queue = PartQueue { @@ -45,17 +39,15 @@ pub fn unbounded_part_queue( receiver, failed: false, bytes_received: Arc::clone(&bytes_counter), - _mem_limiter: mem_limiter.clone(), }; let part_queue_producer = PartQueueProducer { sender, bytes_sent: bytes_counter, - _mem_limiter: mem_limiter, }; (part_queue, part_queue_producer) } -impl PartQueue { +impl PartQueue { /// Read up to `length` bytes from the queue at the current offset. This function always returns /// a contiguous [Bytes], and so may return fewer than `length` bytes if it would need to copy /// or reallocate to make the return value contiguous. 
This function blocks only if the queue is @@ -113,7 +105,7 @@ impl PartQueue PartQueueProducer { +impl PartQueueProducer { /// Push a new [Part] onto the back of the queue pub fn push(&self, part: Result>) { let part_len = part.as_ref().map_or(0, |part| part.len()); @@ -129,7 +121,7 @@ impl PartQueueProducer } } -impl Drop for PartQueue { +impl Drop for PartQueue { fn drop(&mut self) { // close the channel and drain remaining parts from the main queue self.receiver.close(); @@ -156,7 +148,6 @@ mod tests { use bytes::Bytes; use futures::executor::block_on; - use mountpoint_s3_client::mock_client::MockClient; use mountpoint_s3_client::types::ETag; use proptest::proptest; use proptest_derive::Arbitrary; @@ -173,10 +164,8 @@ mod tests { enum DummyError {} async fn run_test(ops: Vec) { - let client = MockClient::new(Default::default()); - let mem_limiter = MemoryLimiter::new(client, 512 * 1024 * 1024); let part_id = ObjectId::new("key".to_owned(), ETag::for_tests()); - let (mut part_queue, part_queue_producer) = unbounded_part_queue::(mem_limiter.into()); + let (mut part_queue, part_queue_producer) = unbounded_part_queue::(); let mut current_offset = 0; let mut current_length = 0; for op in ops { diff --git a/mountpoint-s3/src/prefetch/part_stream.rs b/mountpoint-s3/src/prefetch/part_stream.rs index 9be0d4be1..6df840c23 100644 --- a/mountpoint-s3/src/prefetch/part_stream.rs +++ b/mountpoint-s3/src/prefetch/part_stream.rs @@ -28,8 +28,8 @@ pub trait ObjectPartStream { &self, client: &Client, config: RequestTaskConfig, - mem_limiter: Arc>, - ) -> RequestTask + mem_limiter: Arc, + ) -> RequestTask where Client: ObjectClient + Clone + Send + Sync + 'static; } @@ -185,8 +185,8 @@ where &self, client: &Client, config: RequestTaskConfig, - mem_limiter: Arc>, - ) -> RequestTask + mem_limiter: Arc, + ) -> RequestTask where Client: ObjectClient + Clone + Send + Sync + 'static, { @@ -206,7 +206,7 @@ where }; let (backpressure_controller, mut backpressure_limiter) = new_backpressure_controller(backpressure_config, mem_limiter.clone()); - let (part_queue, part_queue_producer) = unbounded_part_queue(mem_limiter); + let (part_queue, part_queue_producer) = unbounded_part_queue(); trace!(?range, "spawning request"); let span = debug_span!("prefetch", ?range); @@ -240,16 +240,15 @@ where } } -struct ClientPartComposer { - part_queue_producer: PartQueueProducer, +struct ClientPartComposer { + part_queue_producer: PartQueueProducer, object_id: ObjectId, preferred_part_size: usize, } -impl ClientPartComposer +impl ClientPartComposer where E: std::error::Error + Send + Sync, - Client: ObjectClient + Send + Sync + 'static, { async fn try_compose_parts(&self, request_stream: impl Stream>) { if let Err(e) = self.compose_parts(request_stream).await { diff --git a/mountpoint-s3/src/prefetch/task.rs b/mountpoint-s3/src/prefetch/task.rs index bf47190fe..cd858d7bc 100644 --- a/mountpoint-s3/src/prefetch/task.rs +++ b/mountpoint-s3/src/prefetch/task.rs @@ -1,5 +1,4 @@ use futures::future::RemoteHandle; -use mountpoint_s3_client::ObjectClient; use crate::prefetch::backpressure_controller::BackpressureFeedbackEvent::{DataRead, PartQueueStall}; use crate::prefetch::part::Part; @@ -11,22 +10,22 @@ use super::part_stream::RequestRange; /// A single GetObject request submitted to the S3 client #[derive(Debug)] -pub struct RequestTask { +pub struct RequestTask { /// Handle on the task/future. The future is cancelled when handle is dropped. 
This is None if /// the request is fake (created by seeking backwards in the stream) _task_handle: RemoteHandle<()>, remaining: usize, range: RequestRange, - part_queue: PartQueue, - backpressure_controller: BackpressureController, + part_queue: PartQueue, + backpressure_controller: BackpressureController, } -impl RequestTask { +impl RequestTask { pub fn from_handle( task_handle: RemoteHandle<()>, range: RequestRange, - part_queue: PartQueue, - backpressure_controller: BackpressureController, + part_queue: PartQueue, + backpressure_controller: BackpressureController, ) -> Self { Self { _task_handle: task_handle, From f77d35aac84a14f7c4bf777170e5457261829c7b Mon Sep 17 00:00:00 2001 From: Monthon Klongklaew Date: Fri, 27 Sep 2024 09:43:01 +0000 Subject: [PATCH 5/9] Simplify the logic and include client metrics Signed-off-by: Monthon Klongklaew --- mountpoint-s3-client/src/failure_client.rs | 8 +- mountpoint-s3-client/src/mock_client.rs | 10 +- .../src/mock_client/throughput_client.rs | 6 +- mountpoint-s3-client/src/object_client.rs | 16 ++ mountpoint-s3-client/src/s3_crt_client.rs | 21 ++ mountpoint-s3/examples/prefetch_benchmark.rs | 2 +- mountpoint-s3/src/cli.rs | 4 +- mountpoint-s3/src/fs.rs | 4 +- mountpoint-s3/src/mem_limiter.rs | 79 ++++--- mountpoint-s3/src/prefetch.rs | 30 +-- .../src/prefetch/backpressure_controller.rs | 205 +++++++----------- mountpoint-s3/src/prefetch/caching_stream.rs | 16 +- mountpoint-s3/src/prefetch/part_queue.rs | 22 +- mountpoint-s3/src/prefetch/part_stream.rs | 10 +- mountpoint-s3/src/prefetch/task.rs | 13 +- 15 files changed, 231 insertions(+), 215 deletions(-) diff --git a/mountpoint-s3-client/src/failure_client.rs b/mountpoint-s3-client/src/failure_client.rs index b7d78a25d..bbc5cf254 100644 --- a/mountpoint-s3-client/src/failure_client.rs +++ b/mountpoint-s3-client/src/failure_client.rs @@ -16,8 +16,8 @@ use pin_project::pin_project; use crate::object_client::{ DeleteObjectError, DeleteObjectResult, ETag, GetBodyPart, GetObjectAttributesError, GetObjectAttributesResult, GetObjectError, GetObjectRequest, HeadObjectError, HeadObjectResult, ListObjectsError, ListObjectsResult, - ObjectAttribute, ObjectClientError, ObjectClientResult, PutObjectError, PutObjectParams, PutObjectRequest, - PutObjectResult, UploadReview, + MemoryUsageStats, ObjectAttribute, ObjectClientError, ObjectClientResult, PutObjectError, PutObjectParams, + PutObjectRequest, PutObjectResult, UploadReview, }; use crate::ObjectClient; @@ -85,6 +85,10 @@ where self.client.initial_read_window_size() } + fn mem_usage_stats(&self) -> Option { + self.client.mem_usage_stats() + } + async fn delete_object( &self, bucket: &str, diff --git a/mountpoint-s3-client/src/mock_client.rs b/mountpoint-s3-client/src/mock_client.rs index b663a9463..75d8d47e3 100644 --- a/mountpoint-s3-client/src/mock_client.rs +++ b/mountpoint-s3-client/src/mock_client.rs @@ -26,9 +26,9 @@ use crate::error_metadata::{ClientErrorMetadata, ProvideErrorMetadata}; use crate::object_client::{ Checksum, ChecksumAlgorithm, DeleteObjectError, DeleteObjectResult, ETag, GetBodyPart, GetObjectAttributesError, GetObjectAttributesParts, GetObjectAttributesResult, GetObjectError, GetObjectRequest, HeadObjectError, - HeadObjectResult, ListObjectsError, ListObjectsResult, ObjectAttribute, ObjectClient, ObjectClientError, - ObjectClientResult, ObjectInfo, ObjectPart, PutObjectError, PutObjectParams, PutObjectRequest, PutObjectResult, - PutObjectTrailingChecksums, RestoreStatus, UploadReview, UploadReviewPart, + HeadObjectResult, 
ListObjectsError, ListObjectsResult, MemoryUsageStats, ObjectAttribute, ObjectClient, +    ObjectClientError, ObjectClientResult, ObjectInfo, ObjectPart, PutObjectError, PutObjectParams, PutObjectRequest, +    PutObjectResult, PutObjectTrailingChecksums, RestoreStatus, UploadReview, UploadReviewPart, }; mod leaky_bucket; @@ -572,6 +572,10 @@ impl ObjectClient for MockClient { } } +    fn mem_usage_stats(&self) -> Option { +        None +    } +     async fn delete_object( &self, bucket: &str, diff --git a/mountpoint-s3-client/src/mock_client/throughput_client.rs b/mountpoint-s3-client/src/mock_client/throughput_client.rs index 065791069..610f81ef4 100644 --- a/mountpoint-s3-client/src/mock_client/throughput_client.rs +++ b/mountpoint-s3-client/src/mock_client/throughput_client.rs @@ -13,7 +13,7 @@ use crate::mock_client::{MockClient, MockClientConfig, MockClientError, MockObje use crate::object_client::{ DeleteObjectError, DeleteObjectResult, GetBodyPart, GetObjectAttributesError, GetObjectAttributesResult, GetObjectError, GetObjectRequest, HeadObjectError, HeadObjectResult, ListObjectsError, ListObjectsResult, -    ObjectAttribute, ObjectClient, ObjectClientResult, PutObjectError, PutObjectParams, +    MemoryUsageStats, ObjectAttribute, ObjectClient, ObjectClientResult, PutObjectError, PutObjectParams, }; use crate::types::ETag; @@ -113,6 +113,10 @@ impl ObjectClient for ThroughputMockClient { self.inner.initial_read_window_size() } +    fn mem_usage_stats(&self) -> Option { +        self.inner.mem_usage_stats() +    } +     async fn delete_object( &self, bucket: &str, diff --git a/mountpoint-s3-client/src/object_client.rs b/mountpoint-s3-client/src/object_client.rs index b92bf796c..f5c3d5b38 100644 --- a/mountpoint-s3-client/src/object_client.rs +++ b/mountpoint-s3-client/src/object_client.rs @@ -63,6 +63,18 @@ impl FromStr for ETag { } } +/// Memory usage stats for the client +pub struct MemoryUsageStats { +    /// Total allocated memory for the client. +    pub mem_allocated: u64, +    /// Reserved memory for the client. For [S3CrtClient], this value is a sum of primary storage +    /// and secondary storage reserved memory. +    pub mem_reserved: u64, +    /// Actual used memory for the client. For [S3CrtClient], this value is a sum of primary +    /// storage and secondary storage used memory. +    pub mem_used: u64, +} + /// A generic interface to S3-like object storage services. /// /// This trait defines the common methods that all object services implement. @@ -89,6 +101,10 @@ pub trait ObjectClient { /// This can be `None` if backpressure is disabled. fn initial_read_window_size(&self) -> Option; +    /// Query current memory usage stats for the client. This can be `None` if the client +    /// does not record the stats. +    fn mem_usage_stats(&self) -> Option; +  /// Delete a single object from the object store. /// /// DeleteObject will succeed even if the object within the bucket does not exist.
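The stats are optional, so callers have to tolerate clients that do not report them. A rough sketch of how the figures can be folded into a total-usage number, in the spirit of the accounting the limiter adopts in this patch (`total_memory_usage` is a hypothetical helper, not part of the change):

```rust
use mountpoint_s3_client::ObjectClient;

/// Combine Mountpoint's own reservations with whatever the client reports.
/// Clients that don't track memory (e.g. the mock client) return `None` and contribute nothing.
fn total_memory_usage<Client: ObjectClient>(
    client: &Client,
    prefetcher_mem_reserved: u64,
    additional_mem_reserved: u64,
) -> u64 {
    let client_mem_allocated = client.mem_usage_stats().map_or(0, |stats| stats.mem_allocated);
    prefetcher_mem_reserved
        .saturating_add(client_mem_allocated)
        .saturating_add(additional_mem_reserved)
}
```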
diff --git a/mountpoint-s3-client/src/s3_crt_client.rs b/mountpoint-s3-client/src/s3_crt_client.rs index caf6bab45..53b681e7a 100644 --- a/mountpoint-s3-client/src/s3_crt_client.rs +++ b/mountpoint-s3-client/src/s3_crt_client.rs @@ -1208,6 +1208,27 @@ impl ObjectClient for S3CrtClient { } } + fn mem_usage_stats(&self) -> Option { + let crt_buffer_pool_stats = self.inner.s3_client.poll_buffer_pool_usage_stats(); + let mem_allocated = crt_buffer_pool_stats + .primary_allocated + .saturating_add(crt_buffer_pool_stats.secondary_reserved) + .saturating_add(crt_buffer_pool_stats.secondary_used) + .saturating_add(crt_buffer_pool_stats.forced_used); + let mem_reserved = crt_buffer_pool_stats + .primary_reserved + .saturating_add(crt_buffer_pool_stats.secondary_reserved); + let mem_used = crt_buffer_pool_stats + .primary_used + .saturating_add(crt_buffer_pool_stats.secondary_used) + .saturating_add(crt_buffer_pool_stats.forced_used); + Some(MemoryUsageStats { + mem_allocated, + mem_reserved, + mem_used, + }) + } + async fn delete_object( &self, bucket: &str, diff --git a/mountpoint-s3/examples/prefetch_benchmark.rs b/mountpoint-s3/examples/prefetch_benchmark.rs index 5412f0e9e..7532edde4 100644 --- a/mountpoint-s3/examples/prefetch_benchmark.rs +++ b/mountpoint-s3/examples/prefetch_benchmark.rs @@ -93,7 +93,7 @@ fn main() { config = config.part_size(part_size); } let client = Arc::new(S3CrtClient::new(config).expect("couldn't create client")); - let mem_limiter = Arc::new(MemoryLimiter::new(512 * 1024 * 1024)); + let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), 512 * 1024 * 1024)); let head_object_result = block_on(client.head_object(bucket, key)).expect("HeadObject failed"); let size = head_object_result.object.size; diff --git a/mountpoint-s3/src/cli.rs b/mountpoint-s3/src/cli.rs index a247f539a..cd4230a46 100644 --- a/mountpoint-s3/src/cli.rs +++ b/mountpoint-s3/src/cli.rs @@ -442,14 +442,12 @@ impl CliArgs { let mut filter = if self.debug { String::from("debug") } else { - String::from("info") + String::from("warn") }; let crt_verbosity = if self.debug_crt { "debug" } else { "off" }; filter.push_str(&format!(",{}={}", AWSCRT_LOG_TARGET, crt_verbosity)); if self.log_metrics { filter.push_str(&format!(",{}=info", metrics::TARGET_NAME)); - } else { - filter.push_str(&format!(",{}=off", metrics::TARGET_NAME)); } filter }; diff --git a/mountpoint-s3/src/fs.rs b/mountpoint-s3/src/fs.rs index 63b9bce46..4e2af1a78 100644 --- a/mountpoint-s3/src/fs.rs +++ b/mountpoint-s3/src/fs.rs @@ -536,7 +536,7 @@ where { config: S3FilesystemConfig, client: Client, - mem_limiter: Arc, + mem_limiter: Arc>, superblock: Superblock, prefetcher: Prefetcher, uploader: Uploader, @@ -567,7 +567,7 @@ where s3_personality: config.s3_personality, }; let superblock = Superblock::new(bucket, prefix, superblock_config); - let mem_limiter = Arc::new(MemoryLimiter::new(config.mem_limit)); + let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), config.mem_limit)); let uploader = Uploader::new( client.clone(), config.storage_class.to_owned(), diff --git a/mountpoint-s3/src/mem_limiter.rs b/mountpoint-s3/src/mem_limiter.rs index bd1810fcd..b5a712b75 100644 --- a/mountpoint-s3/src/mem_limiter.rs +++ b/mountpoint-s3/src/mem_limiter.rs @@ -2,47 +2,54 @@ use std::sync::atomic::Ordering; use humansize::make_format; use metrics::atomics::AtomicU64; +use mountpoint_s3_client::ObjectClient; use tracing::debug; /// `MemoryLimiter` tracks memory used by Mountpoint and makes decisions if a new memory reservation request can 
be accepted. -/// Currently the only metric which we take into account is the memory reserved by prefetcher instances for the data requested or -/// fetched from CRT client. Single instance of this struct is shared among all of the prefetchers (file handles). +/// Currently, there are two metrics we take into account: +/// 1) the memory reserved by prefetcher instances for the data requested or fetched from CRT client. +/// 2) the memory reserved by S3 client if it can report. +/// +/// Single instance of this struct is shared among all of the prefetchers (file handles). /// /// Each file handle upon creation makes an initial reservation request with a minimal read window size of `1MiB + 128KiB`. This /// is accepted unconditionally since we want to allow any file handle to make progress even if that means going over the memory -/// limit. Additional reservations for a file handle arise when data is being read from fuse **faster** than it arrives from the -/// client (PartQueueStall). Those reservations may be rejected if there is no available memory. +/// limit. Additional reservations for a file handle arise when the backpressure read window is incremented to fetch more data +/// from underlying part streams. Those reservations may be rejected if there is no available memory. /// /// Release of the reserved memory happens on one of the following events: -/// 1) prefetcher is destroyed (`PartQueue` holding the data should be dropped and the CRT request cancelled before this release) -/// 2) prefetcher's read window is scaled down (we wait for the previously requested data to be consumed) -/// 3) prefetcher is approaching the end of the request, in which case we can be sure that reservation in full won't be needed. +/// 1) prefetcher is destroyed (`RequestTask` will be dropped and remaining data in the backpressure read window will be released). +/// 2) data is moved out of the part queue. /// /// Following is the visualisation of a single prefetcher instance's data stream: /// -/// backwards_seek_start next_read_offset in_part_queue window_end_offset preferred_window_end_offset -/// │ │ │ │ │ -/// ─┼────────────────────┼───────────────────────────┼───────────────────────────────┼────────────────────────────┼───────────-► -/// │ ├───────────────────────────┤ │ │ -/// └────────────────────┤ certainly used memory └───────────────────────────────┤ │ -/// memory not accounted │ in CRT buffer, or callback queue └────────────────────────────┤ -/// │ (usage may be less than reserved) will be used after the │ -/// │ window increase │ -/// └────────────────────────────────────────────────────────────────────────────────────────┘ -/// preferred_read_window_size (reserved in MemoryLimiter) +/// backwards_seek_start part_queue_start in_part_queue window_end_offset preferred_window_end_offset +/// │ │ │ │ │ +/// ─┼──────────────────────────────┼───────────────────────────┼───────────────────────────────┼────────────────────────────┼───────────-► +/// │ │ │ +/// └──────────────────────────────┤ │ +/// mem reserved by the part queue │ │ +/// └───────────────────────────────────────────────────────────┤ +/// mem reserved by the backpressure controller +/// (on `BackpressureFeedbackEvent`) /// #[derive(Debug)] -pub struct MemoryLimiter { +pub struct MemoryLimiter { mem_limit: u64, /// Reserved memory for data we had requested via the request task but may not /// arrived yet. 
prefetcher_mem_reserved: AtomicU64, /// Additional reserved memory for other non-buffer usage like storing metadata additional_mem_reserved: u64, +    // We also take the client's reserved memory into account: even though the +    // prefetcher takes control over the entire read path, we don't record or control +    // memory usage on the write path today, so we rely on the client's stats +    // for "other buffers" and adjust the prefetcher read window accordingly. +    client: Client, } -impl MemoryLimiter { -    pub fn new(mem_limit: u64) -> Self { +impl MemoryLimiter { +    pub fn new(client: Client, mem_limit: u64) -> Self { let min_reserved = 128 * 1024 * 1024; let reserved_mem = (mem_limit / 8).max(min_reserved); let formatter = make_format(humansize::BINARY); @@ -52,6 +59,7 @@ impl MemoryLimiter { formatter(reserved_mem) ); Self { +            client, mem_limit, prefetcher_mem_reserved: AtomicU64::new(0), additional_mem_reserved: reserved_mem, @@ -66,19 +74,19 @@ impl MemoryLimiter { } /// Reserve the memory for future uses. If there is not enough memory returns `false`. -    pub fn try_reserve(&self, size: u64, min_available: u64) -> bool { +    pub fn try_reserve(&self, size: u64) -> bool { +        let mut prefetcher_mem_reserved = self.prefetcher_mem_reserved.load(Ordering::SeqCst); loop { -            let prefetcher_mem_reserved = self.prefetcher_mem_reserved.load(Ordering::SeqCst); let new_prefetcher_mem_reserved = prefetcher_mem_reserved.saturating_add(size); -            let total_mem_usage = prefetcher_mem_reserved.saturating_add(self.additional_mem_reserved); -            let new_total_mem_usage = new_prefetcher_mem_reserved.saturating_add(self.additional_mem_reserved); -            if new_total_mem_usage > self.mem_limit - min_available { -                debug!( -                    "not enough memory to reserve, current usage: {}, new (if scaled up): {}, allowed diff: {}", -                    total_mem_usage, new_total_mem_usage, min_available, -                ); +            let client_mem_allocated = self.client_mem_allocated(); +            let new_total_mem_usage = new_prefetcher_mem_reserved +                .saturating_add(client_mem_allocated) +                .saturating_add(self.additional_mem_reserved); +            if new_total_mem_usage > self.mem_limit { +                debug!(new_total_mem_usage, "not enough memory to reserve"); return false; } +            // Check that the value we have read is still the same before updating it match self.prefetcher_mem_reserved.compare_exchange_weak( prefetcher_mem_reserved, new_prefetcher_mem_reserved, @@ -89,7 +97,7 @@ impl MemoryLimiter { metrics::gauge!("prefetch.bytes_reserved").increment(size as f64); return true; } -            Err(_) => continue, // another thread updated the atomic before us, trying again +            Err(current) => prefetcher_mem_reserved = current, // another thread updated the atomic before us, try again } } } @@ -100,10 +108,17 @@ impl MemoryLimiter { metrics::gauge!("prefetch.bytes_reserved").decrement(size as f64); } +    /// Query available memory tracked by the memory limiter.
pub fn available_mem(&self) -> u64 { - let fs_mem_usage = self.prefetcher_mem_reserved.load(Ordering::SeqCst); + let prefetcher_mem_reserved = self.prefetcher_mem_reserved.load(Ordering::SeqCst); + let client_mem_allocated = self.client_mem_allocated(); self.mem_limit - .saturating_sub(fs_mem_usage) + .saturating_sub(prefetcher_mem_reserved) + .saturating_sub(client_mem_allocated) .saturating_sub(self.additional_mem_reserved) } + + fn client_mem_allocated(&self) -> u64 { + self.client.mem_usage_stats().map_or(0, |stats| stats.mem_allocated) + } } diff --git a/mountpoint-s3/src/prefetch.rs b/mountpoint-s3/src/prefetch.rs index 1312068db..2e6917f6f 100644 --- a/mountpoint-s3/src/prefetch.rs +++ b/mountpoint-s3/src/prefetch.rs @@ -70,7 +70,7 @@ pub trait Prefetch { fn prefetch( &self, client: Client, - mem_limiter: Arc, + mem_limiter: Arc>, bucket: String, object_id: ObjectId, size: u64, @@ -239,7 +239,7 @@ where fn prefetch( &self, client: Client, - mem_limiter: Arc, + mem_limiter: Arc>, bucket: String, object_id: ObjectId, size: u64, @@ -265,9 +265,9 @@ where pub struct PrefetchGetObject { client: Client, part_stream: Arc, - mem_limiter: Arc, + mem_limiter: Arc>, config: PrefetcherConfig, - backpressure_task: Option>, + backpressure_task: Option>, // Invariant: the offset of the last byte in this window is always // self.next_sequential_read_offset - 1. backward_seek_window: SeekWindow, @@ -319,7 +319,7 @@ where fn new( client: Client, part_stream: Arc, - mem_limiter: Arc, + mem_limiter: Arc>, config: PrefetcherConfig, bucket: String, object_id: ObjectId, @@ -420,7 +420,7 @@ where /// We will be using flow-control window to control how much data we want to download into the prefetcher. fn spawn_read_backpressure_request( &mut self, - ) -> Result, PrefetchReadError> { + ) -> Result, PrefetchReadError> { let start = self.next_sequential_read_offset; let object_size = self.size as usize; let read_part_size = self.client.read_part_size().unwrap_or(8 * 1024 * 1024); @@ -615,7 +615,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = MemoryLimiter::new(512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); let object = MockObject::ramp(0xaa, size as usize, ETag::for_tests()); let etag = object.etag(); @@ -710,7 +710,7 @@ mod tests { Stream: ObjectPartStream + Send + Sync + 'static, { let client = Arc::new(MockClient::new(client_config)); - let mem_limiter = MemoryLimiter::new(512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); let read_size = 1 * MB; let object_size = 8 * MB; let object = MockObject::ramp(0xaa, object_size, ETag::for_tests()); @@ -817,7 +817,7 @@ mod tests { HashMap::new(), HashMap::new(), )); - let mem_limiter = MemoryLimiter::new(512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); let prefetcher_config = PrefetcherConfig { max_read_window_size: test_config.max_read_window_size, @@ -942,7 +942,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = MemoryLimiter::new(512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); let object = MockObject::ramp(0xaa, object_size as usize, ETag::for_tests()); let etag = object.etag(); @@ -1126,7 +1126,7 @@ mod tests { HashMap::new(), HashMap::new(), )); - let mem_limiter = MemoryLimiter::new(512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 
1024 * 1024); let prefetcher = Prefetcher::new(default_stream(), Default::default()); let mem_limiter = Arc::new(mem_limiter); @@ -1179,7 +1179,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = Arc::new(MemoryLimiter::new(512 * 1024 * 1024)); + let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), 512 * 1024 * 1024)); let object = MockObject::ramp(0xaa, OBJECT_SIZE, ETag::for_tests()); let etag = object.etag(); @@ -1221,7 +1221,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = Arc::new(MemoryLimiter::new(512 * 1024 * 1024)); + let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), 512 * 1024 * 1024)); let object = MockObject::ramp(0xaa, OBJECT_SIZE, ETag::for_tests()); let etag = object.etag(); @@ -1283,7 +1283,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = MemoryLimiter::new(512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); let object = MockObject::ramp(0xaa, object_size as usize, ETag::for_tests()); let file_etag = object.etag(); @@ -1349,7 +1349,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = MemoryLimiter::new(512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); let object = MockObject::ramp(0xaa, object_size as usize, ETag::for_tests()); let file_etag = object.etag(); diff --git a/mountpoint-s3/src/prefetch/backpressure_controller.rs b/mountpoint-s3/src/prefetch/backpressure_controller.rs index 7fadacb99..131e03d5f 100644 --- a/mountpoint-s3/src/prefetch/backpressure_controller.rs +++ b/mountpoint-s3/src/prefetch/backpressure_controller.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use async_channel::{unbounded, Receiver, Sender}; use humansize::make_format; +use mountpoint_s3_client::ObjectClient; use tracing::{debug, trace}; use crate::mem_limiter::MemoryLimiter; @@ -32,7 +33,7 @@ pub struct BackpressureConfig { } #[derive(Debug)] -pub struct BackpressureController { +pub struct BackpressureController { read_window_updater: Sender, preferred_read_window_size: usize, min_read_window_size: usize, @@ -48,7 +49,7 @@ pub struct BackpressureController { /// data up to this offset *exclusively*. request_end_offset: u64, read_part_size: usize, - mem_limiter: Arc, + mem_limiter: Arc>, } #[derive(Debug)] @@ -74,10 +75,12 @@ pub struct BackpressureLimiter { /// [BackpressureController] will be given to the consumer side of the object stream. /// It can be used anywhere to set preferred read window size for the stream and tell the /// producer when its read window should be increased. -pub fn new_backpressure_controller( +pub fn new_backpressure_controller( config: BackpressureConfig, - mem_limiter: Arc, -) -> (BackpressureController, BackpressureLimiter) { + mem_limiter: Arc>, +) -> (BackpressureController, BackpressureLimiter) { + // Minimum window size multiplier as the scaling up and down won't work if the multiplier is 1. 
+ const MIN_WINDOW_SIZE_MULTIPLIER: usize = 2; let read_window_end_offset = config.request_range.start + config.initial_read_window_size as u64; let (read_window_updater, read_window_incrementing_queue) = unbounded(); mem_limiter.reserve(config.initial_read_window_size as u64); @@ -86,7 +89,7 @@ pub fn new_backpressure_controller( preferred_read_window_size: config.initial_read_window_size, min_read_window_size: config.min_read_window_size, max_read_window_size: config.max_read_window_size, - read_window_size_multiplier: config.read_window_size_multiplier, + read_window_size_multiplier: config.read_window_size_multiplier.max(MIN_WINDOW_SIZE_MULTIPLIER), read_window_end_offset, next_read_offset: config.request_range.start, request_end_offset: config.request_range.end, @@ -101,7 +104,7 @@ pub fn new_backpressure_controller( (controller, limiter) } -impl BackpressureController { +impl BackpressureController { pub fn read_window_end_offset(&self) -> u64 { self.read_window_end_offset } @@ -112,81 +115,57 @@ impl BackpressureController { match event { // Note, that this may come from a backwards seek, so offsets observed by this method are not necessarily ascending BackpressureFeedbackEvent::DataRead { offset, length } => { - // Step 2. of scale down, including the case when we're approaching the request end. See `self.scale_down` for the logic. - let next_read_offset = offset + length as u64; - // We don't update `self.next_read_offset` if this feedback arrived from read after a backwards seek - if next_read_offset > self.next_read_offset { - self.next_read_offset = next_read_offset; - } - if self.next_read_offset >= self.request_end_offset { - self.next_read_offset = self.request_end_offset; - } - let remaining_window = self.read_window_end_offset.saturating_sub(next_read_offset) as usize; - - let preffered_window_end_offset = self - .next_read_offset - .saturating_add(self.preferred_read_window_size as u64); - let over_reserved = self.read_window_end_offset.saturating_sub(preffered_window_end_offset); - if over_reserved > 0 { - self.mem_limiter.release((length as u64).min(over_reserved)); - } - if self.request_end_offset < preffered_window_end_offset { - // We won't need the full `preffered_window_end_offset` as we're approaching the request's end. - self.mem_limiter.release(length as u64); - } + self.next_read_offset = offset + length as u64; + self.mem_limiter.release(length as u64); + let remaining_window = self.read_window_end_offset.saturating_sub(self.next_read_offset) as usize; // Increment the read window only if the remaining window reaches some threshold i.e. half of it left. - if remaining_window < (self.preferred_read_window_size / 2) + while remaining_window < (self.preferred_read_window_size / 2) && self.read_window_end_offset < self.request_end_offset { - // If there is not enough available memory in the system, we'll try to reduce the read window of the current request. - // We define "not enough memory" as a situation where no new request with a minimum window may fit in the limit. - // - // Scaling down is best effort, meaning that there is no guarantee that after this action such a - // request will fit in memory. This may not be the case if during the scale down a new memory reservation was made by - // another request. - // - // We reduce the frequency of scale downs by only performing it when sufficient amount of data (half of read_window) - // was read. 
- let mut available_mem = self.mem_limiter.available_mem(); - let mut new_read_window_size = self.preferred_read_window_size; // new_preferred_read_window_size is just too wordy - while available_mem < self.min_read_window_size as u64 && self.read_window_size_multiplier > 1 { - let scaled_down = new_read_window_size / self.read_window_size_multiplier; - if scaled_down < self.min_read_window_size { - break; - } - available_mem += (new_read_window_size - scaled_down) as u64; - new_read_window_size = scaled_down; - } - if new_read_window_size != self.preferred_read_window_size { - self.scale_down(new_read_window_size); - } - - let new_read_window_end_offset = next_read_offset + let new_read_window_end_offset = self + .next_read_offset .saturating_add(self.preferred_read_window_size as u64) .min(self.request_end_offset); + // We can skip if the new `read_window_end_offset` is less than or equal to the current one, this + // could happen after the read window is scaled down. + if new_read_window_end_offset <= self.read_window_end_offset { + break; + } + let to_increase = new_read_window_end_offset.saturating_sub(self.read_window_end_offset) as usize; + + // Force incrementing read window regardless of available memory when we are already at minimum + // read window size. + if self.preferred_read_window_size <= self.min_read_window_size { + self.mem_limiter.reserve(to_increase as u64); + self.increment_read_window(to_increase).await; + break; + } - if self.read_window_end_offset < new_read_window_end_offset { - let to_increase = - new_read_window_end_offset.saturating_sub(self.read_window_end_offset) as usize; - trace!( - preferred_read_window_size = self.preferred_read_window_size, - next_read_offset = self.next_read_offset, - read_window_end_offset = self.read_window_end_offset, - to_increase, - "incrementing read window" - ); + // Try to reserve the memory for the length we want to increase before sending the request, + // scale down the read window if it fails. + if self.mem_limiter.try_reserve(to_increase as u64) { self.increment_read_window(to_increase).await; + } else { + self.scale_down(); } } } - BackpressureFeedbackEvent::PartQueueStall => self.try_scaling_up(), + BackpressureFeedbackEvent::PartQueueStall => self.scale_up(), } Ok(()) } // Send an increment read window request to the stream producer async fn increment_read_window(&mut self, len: usize) { + trace!( + preferred_read_window_size = self.preferred_read_window_size, + next_read_offset = self.next_read_offset, + read_window_end_offset = self.read_window_end_offset, + len, + "incrementing read window" + ); + // This should not block since the channel is unbounded let _ = self .read_window_updater @@ -196,21 +175,21 @@ impl BackpressureController { self.read_window_end_offset += len as u64; } - // Try scaling up preferred read window size with a multiplier configured at initialization. + // Scale up preferred read window size with a multiplier configured at initialization. // Scaling up fails silently if there is no enough free memory to perform it. 
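A small worked example of the increment decision above, with assumed numbers that are not taken from the patch: the reader has consumed data up to 100 MiB, the current read window ends at 130 MiB, and the preferred window size is 64 MiB, so less than half of the window remains and the controller attempts to extend it.

    fn main() {
        let preferred_read_window_size: u64 = 64 * 1024 * 1024; // 64 MiB (assumed)
        let next_read_offset: u64 = 100 * 1024 * 1024;          // reader position (assumed)
        let read_window_end_offset: u64 = 130 * 1024 * 1024;    // current window end (assumed)

        // Less than half of the preferred window is left, so the window should grow.
        let remaining_window = read_window_end_offset - next_read_offset;
        assert!(remaining_window < preferred_read_window_size / 2); // 30 MiB < 32 MiB

        // Extend the window to next_read_offset + preferred_read_window_size (164 MiB),
        // which requires reserving the difference; if the reservation fails, the
        // preferred window is scaled down instead.
        let new_read_window_end_offset = next_read_offset + preferred_read_window_size;
        let to_increase = new_read_window_end_offset - read_window_end_offset;
        assert_eq!(to_increase, 34 * 1024 * 1024);
    }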
- fn try_scaling_up(&mut self) { + fn scale_up(&mut self) { if self.preferred_read_window_size < self.max_read_window_size { let new_read_window_size = self.preferred_read_window_size * self.read_window_size_multiplier; // Also align the new read window size to the client part size let new_read_window_size = align(new_read_window_size, self.read_part_size, false).min(self.max_read_window_size); - // Only scale up when there is enough memory + // Only scale up when there is enough memory. We don't have to reserve the memory here + // because only `preferred_read_window_size` is increased but the actual read window will + // be updated later on `DataRead` event (where we do reserve memory). let to_increase = (new_read_window_size - self.preferred_read_window_size) as u64; - if self - .mem_limiter - .try_reserve(to_increase, self.min_read_window_size as u64) - { + let available_mem = self.mem_limiter.available_mem(); + if available_mem >= to_increase { let formatter = make_format(humansize::BINARY); debug!( current_size = formatter(self.preferred_read_window_size), @@ -224,72 +203,34 @@ impl BackpressureController { } } - pub fn scale_down(&mut self, new_read_window_size: usize) { - /* - Scaling down is performed in 2 steps, one in this method and another on read. Note that `window_end_offset` is the value - which is set in CRT and it may not be decreased. This function implements step 1. - - 0. Before scale down: - - read_until window_end_offset preferred_window_end_offset - │ │ │ - ────┼───────────────────────────────────────────────────────────┼───────────────┼─────────────────────────────────► - │ │ - └───────────────────────────────────────────────────────────────────────────┘ - preferred_read_window_size - - 1. Scaling down (`new_read_window_size` is applied): - - read_until preferred_window_end_offset window_end_offset preferred_window_end_offset_old - │ │ │ │ - ────┼──────────────────────────────────────┼────────────────────┼───────────────┼─────────────────────────────────► - │ ├───────────────┘ - └────────────────────┘ released immediatelly - over_reserved - - 2. Part read: - - read_until(old) read_until preferred_window_end_offset window_end_offset - │ │ │ │ - ────┼────────────┼─────────────────────────────────────┼────────┼─────────────────────────────────────────────────► - └────────────┘ └────────┘ - released on read: over_reserved (new) - 1. if over_reserved > 0 - 2. min(part.size(), over_reserved) is to deduct - */ - // Align the new read window size to the client part size - let new_read_window_size = - align(new_read_window_size, self.read_part_size, false).max(self.min_read_window_size); + // Scale down preferred read window size by a multiplier configured at initialization. 
+ fn scale_down(&mut self) { + if self.preferred_read_window_size > self.min_read_window_size { + assert!(self.read_window_size_multiplier > 1); + let new_read_window_size = self.preferred_read_window_size / self.read_window_size_multiplier; + // Also align the new read window size to the client part size + let new_read_window_size = + align(new_read_window_size, self.read_part_size, false).max(self.min_read_window_size); - let formatter = make_format(humansize::BINARY); - debug!( - current_size = formatter(self.preferred_read_window_size), - new_size = formatter(new_read_window_size), - "scaling down read window" - ); - let preferred_window_end_offset_old = self - .next_read_offset - .saturating_add(self.preferred_read_window_size as u64); - let preferred_window_end_offset = self.next_read_offset.saturating_add(new_read_window_size as u64); - // In most cases we'll keep memory reserved for `self.read_window_end_offset`, but if the new - // `preferred_window_end_offset` is greater, we'll reserve for it instead. - let reserve_until_offset = self.read_window_end_offset.max(preferred_window_end_offset); - let to_release = preferred_window_end_offset_old.saturating_sub(reserve_until_offset); - self.mem_limiter.release(to_release); - self.preferred_read_window_size = new_read_window_size; - metrics::histogram!("prefetch.window_after_decrease_mib") - .record((self.preferred_read_window_size / 1024 / 1024) as f64); + let formatter = make_format(humansize::BINARY); + debug!( + current_size = formatter(self.preferred_read_window_size), + new_size = formatter(new_read_window_size), + "scaling down read window" + ); + self.preferred_read_window_size = new_read_window_size; + metrics::histogram!("prefetch.window_after_decrease_mib") + .record((self.preferred_read_window_size / 1024 / 1024) as f64); + } } } -impl Drop for BackpressureController { +impl Drop for BackpressureController { fn drop(&mut self) { - // When approaching request end we have less memory still reserved than `self.preferred_read_window_size`. + // Free up memory we have reserved for the read window. 
debug_assert!(self.request_end_offset >= self.next_read_offset); - let remaining_in_request = self.request_end_offset.saturating_sub(self.next_read_offset); - - self.mem_limiter - .release((self.preferred_read_window_size as u64).min(remaining_in_request)); + let remaining_window = self.read_window_end_offset.saturating_sub(self.next_read_offset); + self.mem_limiter.release(remaining_window); } } diff --git a/mountpoint-s3/src/prefetch/caching_stream.rs b/mountpoint-s3/src/prefetch/caching_stream.rs index d5738c278..9c909feab 100644 --- a/mountpoint-s3/src/prefetch/caching_stream.rs +++ b/mountpoint-s3/src/prefetch/caching_stream.rs @@ -46,8 +46,8 @@ where &self, client: &Client, config: RequestTaskConfig, - mem_limiter: Arc, - ) -> RequestTask<::ClientError> + mem_limiter: Arc>, + ) -> RequestTask<::ClientError, Client> where Client: ObjectClient + Clone + Send + Sync + 'static, { @@ -63,7 +63,7 @@ where }; let (backpressure_controller, backpressure_limiter) = new_backpressure_controller(backpressure_config, mem_limiter.clone()); - let (part_queue, part_queue_producer) = unbounded_part_queue(); + let (part_queue, part_queue_producer) = unbounded_part_queue(mem_limiter); trace!(?range, "spawning request"); let request_task = { @@ -257,7 +257,7 @@ impl CachingPartComposer where E: std::error::Error + Send + Sync, Cache: DataCache + Send + Sync + 'static, - Runtime: Spawn + Runtime: Spawn, { async fn try_compose_parts(&mut self, request_stream: impl Stream>) { if let Err(e) = self.compose_parts(request_stream).await { @@ -433,7 +433,7 @@ mod tests { ..Default::default() }; let mock_client = Arc::new(MockClient::new(config)); - let mem_limiter = Arc::new(MemoryLimiter::new(512 * 1024 * 1024)); + let mem_limiter = Arc::new(MemoryLimiter::new(mock_client.clone(), 512 * 1024 * 1024)); mock_client.add_object(key, object.clone()); let runtime = ThreadPool::builder().pool_size(1).create().unwrap(); @@ -514,7 +514,7 @@ mod tests { ..Default::default() }; let mock_client = Arc::new(MockClient::new(config)); - let mem_limiter = Arc::new(MemoryLimiter::new(512 * 1024 * 1024)); + let mem_limiter = Arc::new(MemoryLimiter::new(mock_client.clone(), 512 * 1024 * 1024)); mock_client.add_object(key, object.clone()); let runtime = ThreadPool::builder().pool_size(1).create().unwrap(); @@ -538,10 +538,10 @@ mod tests { } } - fn compare_read( + fn compare_read( id: &ObjectId, object: &MockObject, - mut request_task: RequestTask, + mut request_task: RequestTask, ) { let mut offset = request_task.start_offset(); let mut remaining = request_task.total_size(); diff --git a/mountpoint-s3/src/prefetch/part_queue.rs b/mountpoint-s3/src/prefetch/part_queue.rs index 08778666a..aa1a1d49f 100644 --- a/mountpoint-s3/src/prefetch/part_queue.rs +++ b/mountpoint-s3/src/prefetch/part_queue.rs @@ -1,7 +1,9 @@ use std::time::Instant; +use mountpoint_s3_client::ObjectClient; use tracing::trace; +use crate::mem_limiter::MemoryLimiter; use crate::prefetch::part::Part; use crate::prefetch::PrefetchReadError; use crate::sync::async_channel::{unbounded, Receiver, RecvError, Sender}; @@ -11,7 +13,7 @@ use crate::sync::Arc; /// A queue of [Part]s where the first part can be partially read from if the reader doesn't want /// the entire part in one shot. #[derive(Debug)] -pub struct PartQueue { +pub struct PartQueue { /// The auxiliary queue that supports pushing parts to the front of the part queue in order to /// allow partial reads and backwards seeks. 
front_queue: Vec, @@ -20,6 +22,7 @@ pub struct PartQueue { failed: bool, /// The total number of bytes sent to the underlying queue of `self.receiver` bytes_received: Arc, + mem_limiter: Arc>, } /// Producer side of the queue of [Part]s. @@ -31,7 +34,9 @@ pub struct PartQueueProducer { } /// Creates an unbounded [PartQueue] and its related [PartQueueProducer]. -pub fn unbounded_part_queue() -> (PartQueue, PartQueueProducer) { +pub fn unbounded_part_queue( + mem_limiter: Arc>, +) -> (PartQueue, PartQueueProducer) { let (sender, receiver) = unbounded(); let bytes_counter = Arc::new(AtomicUsize::new(0)); let part_queue = PartQueue { @@ -39,6 +44,7 @@ pub fn unbounded_part_queue() -> (PartQueue, PartQueueP receiver, failed: false, bytes_received: Arc::clone(&bytes_counter), + mem_limiter, }; let part_queue_producer = PartQueueProducer { sender, @@ -47,7 +53,7 @@ pub fn unbounded_part_queue() -> (PartQueue, PartQueueP (part_queue, part_queue_producer) } -impl PartQueue { +impl PartQueue { /// Read up to `length` bytes from the queue at the current offset. This function always returns /// a contiguous [Bytes], and so may return fewer than `length` bytes if it would need to copy /// or reallocate to make the return value contiguous. This function blocks only if the queue is @@ -96,6 +102,9 @@ impl PartQueue { assert!(!self.failed, "cannot use a PartQueue after failure"); metrics::gauge!("prefetch.bytes_in_queue").increment(part.len() as f64); + // The backpressure controller is not aware of the parts from backwards seek, + // so we have to reserve memory for them here. + self.mem_limiter.reserve(part.len() as u64); self.front_queue.push(part); Ok(()) } @@ -121,7 +130,7 @@ impl PartQueueProducer { } } -impl Drop for PartQueue { +impl Drop for PartQueue { fn drop(&mut self) { // close the channel and drain remaining parts from the main queue self.receiver.close(); @@ -148,6 +157,7 @@ mod tests { use bytes::Bytes; use futures::executor::block_on; + use mountpoint_s3_client::mock_client::MockClient; use mountpoint_s3_client::types::ETag; use proptest::proptest; use proptest_derive::Arbitrary; @@ -164,8 +174,10 @@ mod tests { enum DummyError {} async fn run_test(ops: Vec) { + let client = MockClient::new(Default::default()); + let mem_limiter = MemoryLimiter::new(client, 512 * 1024 * 1024); let part_id = ObjectId::new("key".to_owned(), ETag::for_tests()); - let (mut part_queue, part_queue_producer) = unbounded_part_queue::(); + let (mut part_queue, part_queue_producer) = unbounded_part_queue::(mem_limiter.into()); let mut current_offset = 0; let mut current_length = 0; for op in ops { diff --git a/mountpoint-s3/src/prefetch/part_stream.rs b/mountpoint-s3/src/prefetch/part_stream.rs index 6df840c23..5615378f5 100644 --- a/mountpoint-s3/src/prefetch/part_stream.rs +++ b/mountpoint-s3/src/prefetch/part_stream.rs @@ -28,8 +28,8 @@ pub trait ObjectPartStream { &self, client: &Client, config: RequestTaskConfig, - mem_limiter: Arc, - ) -> RequestTask + mem_limiter: Arc>, + ) -> RequestTask where Client: ObjectClient + Clone + Send + Sync + 'static; } @@ -185,8 +185,8 @@ where &self, client: &Client, config: RequestTaskConfig, - mem_limiter: Arc, - ) -> RequestTask + mem_limiter: Arc>, + ) -> RequestTask where Client: ObjectClient + Clone + Send + Sync + 'static, { @@ -206,7 +206,7 @@ where }; let (backpressure_controller, mut backpressure_limiter) = new_backpressure_controller(backpressure_config, mem_limiter.clone()); - let (part_queue, part_queue_producer) = unbounded_part_queue(); + let (part_queue, 
part_queue_producer) = unbounded_part_queue(mem_limiter); trace!(?range, "spawning request"); let span = debug_span!("prefetch", ?range); diff --git a/mountpoint-s3/src/prefetch/task.rs b/mountpoint-s3/src/prefetch/task.rs index cd858d7bc..bf47190fe 100644 --- a/mountpoint-s3/src/prefetch/task.rs +++ b/mountpoint-s3/src/prefetch/task.rs @@ -1,4 +1,5 @@ use futures::future::RemoteHandle; +use mountpoint_s3_client::ObjectClient; use crate::prefetch::backpressure_controller::BackpressureFeedbackEvent::{DataRead, PartQueueStall}; use crate::prefetch::part::Part; @@ -10,22 +11,22 @@ use super::part_stream::RequestRange; /// A single GetObject request submitted to the S3 client #[derive(Debug)] -pub struct RequestTask { +pub struct RequestTask { /// Handle on the task/future. The future is cancelled when handle is dropped. This is None if /// the request is fake (created by seeking backwards in the stream) _task_handle: RemoteHandle<()>, remaining: usize, range: RequestRange, - part_queue: PartQueue, - backpressure_controller: BackpressureController, + part_queue: PartQueue, + backpressure_controller: BackpressureController, } -impl RequestTask { +impl RequestTask { pub fn from_handle( task_handle: RemoteHandle<()>, range: RequestRange, - part_queue: PartQueue, - backpressure_controller: BackpressureController, + part_queue: PartQueue, + backpressure_controller: BackpressureController, ) -> Self { Self { _task_handle: task_handle, From 33050a86c648f435ed5b583cf585e73dae70e799 Mon Sep 17 00:00:00 2001 From: Monthon Klongklaew Date: Fri, 27 Sep 2024 09:43:01 +0000 Subject: [PATCH 6/9] Correct client mem usage stats Signed-off-by: Monthon Klongklaew --- mountpoint-s3-client/src/s3_crt_client.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mountpoint-s3-client/src/s3_crt_client.rs b/mountpoint-s3-client/src/s3_crt_client.rs index 53b681e7a..9674c9b15 100644 --- a/mountpoint-s3-client/src/s3_crt_client.rs +++ b/mountpoint-s3-client/src/s3_crt_client.rs @@ -1212,16 +1212,13 @@ impl ObjectClient for S3CrtClient { let crt_buffer_pool_stats = self.inner.s3_client.poll_buffer_pool_usage_stats(); let mem_allocated = crt_buffer_pool_stats .primary_allocated - .saturating_add(crt_buffer_pool_stats.secondary_reserved) - .saturating_add(crt_buffer_pool_stats.secondary_used) - .saturating_add(crt_buffer_pool_stats.forced_used); + .saturating_add(crt_buffer_pool_stats.secondary_used); let mem_reserved = crt_buffer_pool_stats .primary_reserved .saturating_add(crt_buffer_pool_stats.secondary_reserved); let mem_used = crt_buffer_pool_stats .primary_used - .saturating_add(crt_buffer_pool_stats.secondary_used) - .saturating_add(crt_buffer_pool_stats.forced_used); + .saturating_add(crt_buffer_pool_stats.secondary_used); Some(MemoryUsageStats { mem_allocated, mem_reserved, From 4db4acf46d44b924e066636c3cf11690d2c0f663 Mon Sep 17 00:00:00 2001 From: Monthon Klongklaew Date: Fri, 27 Sep 2024 09:44:49 +0000 Subject: [PATCH 7/9] PR comments Signed-off-by: Monthon Klongklaew --- mountpoint-s3-client/src/failure_client.rs | 7 +++-- mountpoint-s3-client/src/mock_client.rs | 9 +++--- .../src/mock_client/throughput_client.rs | 5 ++-- mountpoint-s3-client/src/object_client.rs | 15 ++-------- mountpoint-s3-client/src/s3_crt_client.rs | 25 +++++----------- mountpoint-s3/src/mem_limiter.rs | 14 +++++++-- mountpoint-s3/src/prefetch.rs | 4 +++ .../src/prefetch/backpressure_controller.rs | 29 ++----------------- mountpoint-s3/src/prefetch/caching_stream.rs | 1 - mountpoint-s3/src/prefetch/part_stream.rs | 8 
++++- 10 files changed, 48 insertions(+), 69 deletions(-) diff --git a/mountpoint-s3-client/src/failure_client.rs b/mountpoint-s3-client/src/failure_client.rs index bbc5cf254..bd1345a7f 100644 --- a/mountpoint-s3-client/src/failure_client.rs +++ b/mountpoint-s3-client/src/failure_client.rs @@ -11,13 +11,14 @@ use std::task::{Context, Poll}; use async_trait::async_trait; use futures::Stream; +use mountpoint_s3_crt::s3::client::BufferPoolUsageStats; use pin_project::pin_project; use crate::object_client::{ DeleteObjectError, DeleteObjectResult, ETag, GetBodyPart, GetObjectAttributesError, GetObjectAttributesResult, GetObjectError, GetObjectRequest, HeadObjectError, HeadObjectResult, ListObjectsError, ListObjectsResult, - MemoryUsageStats, ObjectAttribute, ObjectClientError, ObjectClientResult, PutObjectError, PutObjectParams, - PutObjectRequest, PutObjectResult, UploadReview, + ObjectAttribute, ObjectClientError, ObjectClientResult, PutObjectError, PutObjectParams, PutObjectRequest, + PutObjectResult, UploadReview, }; use crate::ObjectClient; @@ -85,7 +86,7 @@ where self.client.initial_read_window_size() } - fn mem_usage_stats(&self) -> Option { + fn mem_usage_stats(&self) -> Option { self.client.mem_usage_stats() } diff --git a/mountpoint-s3-client/src/mock_client.rs b/mountpoint-s3-client/src/mock_client.rs index 75d8d47e3..08e7c8be2 100644 --- a/mountpoint-s3-client/src/mock_client.rs +++ b/mountpoint-s3-client/src/mock_client.rs @@ -14,6 +14,7 @@ use async_trait::async_trait; use futures::{Stream, StreamExt}; use lazy_static::lazy_static; use mountpoint_s3_crt::checksums::crc32c; +use mountpoint_s3_crt::s3::client::BufferPoolUsageStats; use rand::seq::SliceRandom; use rand::SeedableRng; use rand_chacha::ChaCha20Rng; @@ -26,9 +27,9 @@ use crate::error_metadata::{ClientErrorMetadata, ProvideErrorMetadata}; use crate::object_client::{ Checksum, ChecksumAlgorithm, DeleteObjectError, DeleteObjectResult, ETag, GetBodyPart, GetObjectAttributesError, GetObjectAttributesParts, GetObjectAttributesResult, GetObjectError, GetObjectRequest, HeadObjectError, - HeadObjectResult, ListObjectsError, ListObjectsResult, MemoryUsageStats, ObjectAttribute, ObjectClient, - ObjectClientError, ObjectClientResult, ObjectInfo, ObjectPart, PutObjectError, PutObjectParams, PutObjectRequest, - PutObjectResult, PutObjectTrailingChecksums, RestoreStatus, UploadReview, UploadReviewPart, + HeadObjectResult, ListObjectsError, ListObjectsResult, ObjectAttribute, ObjectClient, ObjectClientError, + ObjectClientResult, ObjectInfo, ObjectPart, PutObjectError, PutObjectParams, PutObjectRequest, PutObjectResult, + PutObjectTrailingChecksums, RestoreStatus, UploadReview, UploadReviewPart, }; mod leaky_bucket; @@ -572,7 +573,7 @@ impl ObjectClient for MockClient { } } - fn mem_usage_stats(&self) -> Option { + fn mem_usage_stats(&self) -> Option { None } diff --git a/mountpoint-s3-client/src/mock_client/throughput_client.rs b/mountpoint-s3-client/src/mock_client/throughput_client.rs index 610f81ef4..fc2c74831 100644 --- a/mountpoint-s3-client/src/mock_client/throughput_client.rs +++ b/mountpoint-s3-client/src/mock_client/throughput_client.rs @@ -6,6 +6,7 @@ use std::time::Duration; use async_io::block_on; use async_trait::async_trait; use futures::Stream; +use mountpoint_s3_crt::s3::client::BufferPoolUsageStats; use pin_project::pin_project; use crate::mock_client::leaky_bucket::LeakyBucket; @@ -13,7 +14,7 @@ use crate::mock_client::{MockClient, MockClientConfig, MockClientError, MockObje use crate::object_client::{ 
DeleteObjectError, DeleteObjectResult, GetBodyPart, GetObjectAttributesError, GetObjectAttributesResult, GetObjectError, GetObjectRequest, HeadObjectError, HeadObjectResult, ListObjectsError, ListObjectsResult, - MemoryUsageStats, ObjectAttribute, ObjectClient, ObjectClientResult, PutObjectError, PutObjectParams, + ObjectAttribute, ObjectClient, ObjectClientResult, PutObjectError, PutObjectParams, }; use crate::types::ETag; @@ -113,7 +114,7 @@ impl ObjectClient for ThroughputMockClient { self.inner.initial_read_window_size() } - fn mem_usage_stats(&self) -> Option { + fn mem_usage_stats(&self) -> Option { self.inner.mem_usage_stats() } diff --git a/mountpoint-s3-client/src/object_client.rs b/mountpoint-s3-client/src/object_client.rs index f5c3d5b38..1b34bbdae 100644 --- a/mountpoint-s3-client/src/object_client.rs +++ b/mountpoint-s3-client/src/object_client.rs @@ -2,6 +2,7 @@ use crate::error_metadata::{ClientErrorMetadata, ProvideErrorMetadata}; use async_trait::async_trait; use auto_impl::auto_impl; use futures::Stream; +use mountpoint_s3_crt::s3::client::BufferPoolUsageStats; use std::pin::Pin; use std::str::FromStr; use std::time::SystemTime; @@ -63,18 +64,6 @@ impl FromStr for ETag { } } -/// Memory usage stats for the client -pub struct MemoryUsageStats { - /// Total allocated memory for the client. - pub mem_allocated: u64, - /// Reserved memory for the client. For [S3CrtClient], this value is a sum of primary storage - /// and secondary storage reserved memory. - pub mem_reserved: u64, - /// Actual used memory for the client. For [S3CrtClient], this value is a sum of primanry - /// storage and secondary storage used memory. - pub mem_used: u64, -} - /// A generic interface to S3-like object storage services. /// /// This trait defines the common methods that all object services implement. @@ -103,7 +92,7 @@ pub trait ObjectClient { /// Query current memory usage stats for the client. This can be `None` if the client /// does not record the stats. - fn mem_usage_stats(&self) -> Option; + fn mem_usage_stats(&self) -> Option; /// Delete a single object from the object store. 
/// diff --git a/mountpoint-s3-client/src/s3_crt_client.rs b/mountpoint-s3-client/src/s3_crt_client.rs index 9674c9b15..ec7bb74c5 100644 --- a/mountpoint-s3-client/src/s3_crt_client.rs +++ b/mountpoint-s3-client/src/s3_crt_client.rs @@ -25,8 +25,8 @@ use mountpoint_s3_crt::io::event_loop::EventLoopGroup; use mountpoint_s3_crt::io::host_resolver::{AddressKinds, HostResolver, HostResolverDefaultOptions}; use mountpoint_s3_crt::io::retry_strategy::{ExponentialBackoffJitterMode, RetryStrategy, StandardRetryOptions}; use mountpoint_s3_crt::s3::client::{ - init_signing_config, ChecksumConfig, Client, ClientConfig, MetaRequest, MetaRequestOptions, MetaRequestResult, - MetaRequestType, RequestMetrics, RequestType, + init_signing_config, BufferPoolUsageStats, ChecksumConfig, Client, ClientConfig, MetaRequest, MetaRequestOptions, + MetaRequestResult, MetaRequestType, RequestMetrics, RequestType, }; use async_trait::async_trait; @@ -767,7 +767,9 @@ impl S3CrtClientInner { .set(metrics.num_requests_streaming_response as f64); // Buffer pool metrics + let start = Instant::now(); let buffer_pool_stats = s3_client.poll_buffer_pool_usage_stats(); + metrics::histogram!("s3.client.buffer_pool.get_usage_latecy_us").record(start.elapsed().as_micros() as f64); metrics::gauge!("s3.client.buffer_pool.mem_limit").set(buffer_pool_stats.mem_limit as f64); metrics::gauge!("s3.client.buffer_pool.primary_cutoff").set(buffer_pool_stats.primary_cutoff as f64); metrics::gauge!("s3.client.buffer_pool.primary_used").set(buffer_pool_stats.primary_used as f64); @@ -1208,22 +1210,11 @@ impl ObjectClient for S3CrtClient { } } - fn mem_usage_stats(&self) -> Option { + fn mem_usage_stats(&self) -> Option { + let start = Instant::now(); let crt_buffer_pool_stats = self.inner.s3_client.poll_buffer_pool_usage_stats(); - let mem_allocated = crt_buffer_pool_stats - .primary_allocated - .saturating_add(crt_buffer_pool_stats.secondary_used); - let mem_reserved = crt_buffer_pool_stats - .primary_reserved - .saturating_add(crt_buffer_pool_stats.secondary_reserved); - let mem_used = crt_buffer_pool_stats - .primary_used - .saturating_add(crt_buffer_pool_stats.secondary_used); - Some(MemoryUsageStats { - mem_allocated, - mem_reserved, - mem_used, - }) + metrics::histogram!("s3.client.buffer_pool.get_usage_latecy_us").record(start.elapsed().as_micros() as f64); + Some(crt_buffer_pool_stats) } async fn delete_object( diff --git a/mountpoint-s3/src/mem_limiter.rs b/mountpoint-s3/src/mem_limiter.rs index b5a712b75..85ea93c5e 100644 --- a/mountpoint-s3/src/mem_limiter.rs +++ b/mountpoint-s3/src/mem_limiter.rs @@ -1,4 +1,4 @@ -use std::sync::atomic::Ordering; +use std::{sync::atomic::Ordering, time::Instant}; use humansize::make_format; use metrics::atomics::AtomicU64; @@ -75,6 +75,7 @@ impl MemoryLimiter { /// Reserve the memory for future uses. If there is not enough memory returns `false`. 
pub fn try_reserve(&self, size: u64) -> bool { + let start = Instant::now(); let mut prefetcher_mem_reserved = self.prefetcher_mem_reserved.load(Ordering::SeqCst); loop { let new_prefetcher_mem_reserved = prefetcher_mem_reserved.saturating_add(size); @@ -84,6 +85,7 @@ impl MemoryLimiter { .saturating_add(self.additional_mem_reserved); if new_total_mem_usage > self.mem_limit { debug!(new_total_mem_usage, "not enough memory to reserve"); + metrics::histogram!("prefetch.mem_reserve_latency_us").record(start.elapsed().as_micros() as f64); return false; } // Check that the value we have read is still the same before updating it @@ -95,6 +97,7 @@ impl MemoryLimiter { ) { Ok(_) => { metrics::gauge!("prefetch.bytes_reserved").increment(size as f64); + metrics::histogram!("prefetch.mem_reserve_latency_us").record(start.elapsed().as_micros() as f64); return true; } Err(current) => prefetcher_mem_reserved = current, // another thread updated the atomic before us, trying again @@ -118,7 +121,14 @@ impl MemoryLimiter { .saturating_sub(self.additional_mem_reserved) } + // Get allocated memory for the client. Currently, only the CRT client is able to report its buffer pool stats. + // The CRT allocates memory in two areas. The first one is primary storage where memory is allocated in blocks + // and we can get number of allocated bytes from `primary_allocated` stat. Another area is called secondary storage + // where memory is allocated exactly equal to the used memory. So total allocated memory for the CRT client would + // be `primary_allocated` + `secondary_used`. fn client_mem_allocated(&self) -> u64 { - self.client.mem_usage_stats().map_or(0, |stats| stats.mem_allocated) + self.client + .mem_usage_stats() + .map_or(0, |stats| stats.primary_allocated.saturating_add(stats.secondary_used)) } } diff --git a/mountpoint-s3/src/prefetch.rs b/mountpoint-s3/src/prefetch.rs index 2e6917f6f..a018b92df 100644 --- a/mountpoint-s3/src/prefetch.rs +++ b/mountpoint-s3/src/prefetch.rs @@ -528,6 +528,10 @@ where trace!("seek failed: not enough data in backwards seek window"); return Ok(false); }; + // This also increase `prefetcher_mem_reserved` value in memory limiter. + // At least one subsequent `RequestTask::read` is required for memory tracking to work correctly + // because `BackpressureController::drop` needs to know the start offset of the part queue to + // release the right amount of memory. task.push_front(parts).await?; self.next_sequential_read_offset = offset; Ok(true) diff --git a/mountpoint-s3/src/prefetch/backpressure_controller.rs b/mountpoint-s3/src/prefetch/backpressure_controller.rs index 131e03d5f..28dc8c893 100644 --- a/mountpoint-s3/src/prefetch/backpressure_controller.rs +++ b/mountpoint-s3/src/prefetch/backpressure_controller.rs @@ -29,7 +29,6 @@ pub struct BackpressureConfig { pub read_window_size_multiplier: usize, /// Request range to apply backpressure pub request_range: Range, - pub read_part_size: usize, } #[derive(Debug)] @@ -48,7 +47,6 @@ pub struct BackpressureController { /// End offset for the request we want to apply backpressure. The request can return /// data up to this offset *exclusively*. 
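The client-side accounting added to `client_mem_allocated` above, shown as a standalone helper for illustration. This is a sketch mirroring that method; it assumes only that the client's `mem_usage_stats` returns the CRT `BufferPoolUsageStats`, whose `primary_allocated` and `secondary_used` fields appear in the patch.

    use mountpoint_s3_client::ObjectClient;

    /// Bytes currently allocated by the client's buffer pool: allocated primary-storage
    /// blocks plus the exactly-sized secondary-storage allocations.
    fn client_allocated_bytes<Client: ObjectClient>(client: &Client) -> u64 {
        client
            .mem_usage_stats()
            .map_or(0, |stats| stats.primary_allocated.saturating_add(stats.secondary_used))
    }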
request_end_offset: u64, - read_part_size: usize, mem_limiter: Arc>, } @@ -93,7 +91,6 @@ pub fn new_backpressure_controller( read_window_end_offset, next_read_offset: config.request_range.start, request_end_offset: config.request_range.end, - read_part_size: config.read_part_size, mem_limiter, }; let limiter = BackpressureLimiter { @@ -120,6 +117,8 @@ impl BackpressureController { let remaining_window = self.read_window_end_offset.saturating_sub(self.next_read_offset) as usize; // Increment the read window only if the remaining window reaches some threshold i.e. half of it left. + // When memory is low the `preferred_read_window_size` will be scaled down so we have to keep trying + // until we have enough read window. while remaining_window < (self.preferred_read_window_size / 2) && self.read_window_end_offset < self.request_end_offset { @@ -146,6 +145,7 @@ impl BackpressureController { // scale down the read window if it fails. if self.mem_limiter.try_reserve(to_increase as u64) { self.increment_read_window(to_increase).await; + break; } else { self.scale_down(); } @@ -180,10 +180,6 @@ impl BackpressureController { fn scale_up(&mut self) { if self.preferred_read_window_size < self.max_read_window_size { let new_read_window_size = self.preferred_read_window_size * self.read_window_size_multiplier; - // Also align the new read window size to the client part size - let new_read_window_size = - align(new_read_window_size, self.read_part_size, false).min(self.max_read_window_size); - // Only scale up when there is enough memory. We don't have to reserve the memory here // because only `preferred_read_window_size` is increased but the actual read window will // be updated later on `DataRead` event (where we do reserve memory). @@ -208,10 +204,6 @@ impl BackpressureController { if self.preferred_read_window_size > self.min_read_window_size { assert!(self.read_window_size_multiplier > 1); let new_read_window_size = self.preferred_read_window_size / self.read_window_size_multiplier; - // Also align the new read window size to the client part size - let new_read_window_size = - align(new_read_window_size, self.read_part_size, false).max(self.min_read_window_size); - let formatter = make_format(humansize::BINARY); debug!( current_size = formatter(self.preferred_read_window_size), @@ -275,18 +267,3 @@ impl BackpressureLimiter { Ok(Some(self.read_window_end_offset)) } } - -/// Try to align the given read window size to the part boundaries. -/// The `trim_only` flags controls whether the range is only trimmed down to -/// part boundaries or is allowed to grow wider. 
-fn align(read_window_size: usize, part_size: usize, trim_only: bool) -> usize { - let part_alignment = part_size; - let remainder = read_window_size % part_alignment; - if trim_only || remainder == 0 { - // trim it to the previous part boundary - read_window_size - remainder - } else { - // extend it to the next part boundary - read_window_size + (part_alignment - remainder) - } -} diff --git a/mountpoint-s3/src/prefetch/caching_stream.rs b/mountpoint-s3/src/prefetch/caching_stream.rs index 9c909feab..e664d7b03 100644 --- a/mountpoint-s3/src/prefetch/caching_stream.rs +++ b/mountpoint-s3/src/prefetch/caching_stream.rs @@ -59,7 +59,6 @@ where max_read_window_size: config.max_read_window_size, read_window_size_multiplier: config.read_window_size_multiplier, request_range: range.into(), - read_part_size: config.read_part_size, }; let (backpressure_controller, backpressure_limiter) = new_backpressure_controller(backpressure_config, mem_limiter.clone()); diff --git a/mountpoint-s3/src/prefetch/part_stream.rs b/mountpoint-s3/src/prefetch/part_stream.rs index 5615378f5..7fad902f6 100644 --- a/mountpoint-s3/src/prefetch/part_stream.rs +++ b/mountpoint-s3/src/prefetch/part_stream.rs @@ -202,7 +202,6 @@ where max_read_window_size: config.max_read_window_size, read_window_size_multiplier: config.read_window_size_multiplier, request_range: range.into(), - read_part_size: config.read_part_size, }; let (backpressure_controller, mut backpressure_limiter) = new_backpressure_controller(backpressure_config, mem_limiter.clone()); @@ -387,6 +386,13 @@ fn read_from_request<'a, Client: ObjectClient + 'a>( if next_offset == request_range.end { break; } + + // The CRT could return data more than what we have requested in the read window + // which means unaccounted memory, so we want to record them here. + let excess_bytes = next_offset.saturating_sub(backpressure_limiter.read_window_end_offset()); + if excess_bytes > 0 { + metrics::histogram!("s3.client.read_window_excess_bytes").record(excess_bytes as f64); + } // Blocks if read window increment if it's not enough to read the next offset if let Some(next_read_window_offset) = backpressure_limiter.wait_for_read_window_increment(next_offset).await? 
{ let diff = next_read_window_offset.saturating_sub(request.as_ref().read_window_end_offset()) as usize; From 2cc1a53ecbd21e29ea44d120e04fe381159685a6 Mon Sep 17 00:00:00 2001 From: Monthon Klongklaew Date: Fri, 27 Sep 2024 09:45:31 +0000 Subject: [PATCH 8/9] Put the cli argument behind a feature flag Signed-off-by: Monthon Klongklaew --- mountpoint-s3/Cargo.toml | 1 + mountpoint-s3/src/cli.rs | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/mountpoint-s3/Cargo.toml b/mountpoint-s3/Cargo.toml index a6155487e..cf64f5e5f 100644 --- a/mountpoint-s3/Cargo.toml +++ b/mountpoint-s3/Cargo.toml @@ -84,6 +84,7 @@ built = { version = "0.7.1", features = ["git2"] } express_cache = ["block_size"] block_size = [] event_log = [] +mem_limiter = [] # Features for choosing tests fips_tests = [] fuse_tests = [] diff --git a/mountpoint-s3/src/cli.rs b/mountpoint-s3/src/cli.rs index cd4230a46..b2451d53a 100644 --- a/mountpoint-s3/src/cli.rs +++ b/mountpoint-s3/src/cli.rs @@ -157,6 +157,8 @@ pub struct CliArgs { )] pub max_threads: u64, + // This config is still unstable + #[cfg(feature = "mem_limiter")] #[clap( long, help = "Maximum memory usage target [default: 95% of total system memory with a minimum of 512 MiB]", @@ -784,14 +786,16 @@ where filesystem_config.allow_overwrite = args.allow_overwrite; filesystem_config.s3_personality = s3_personality; filesystem_config.server_side_encryption = ServerSideEncryption::new(args.sse.clone(), args.sse_kms_key_id.clone()); - filesystem_config.mem_limit = if let Some(max_mem_target) = args.max_memory_target { - max_mem_target * 1024 * 1024 - } else { - const MINIMUM_MEM_LIMIT: u64 = 512 * 1024 * 1024; - let sys = System::new_with_specifics(RefreshKind::everything()); - let default_mem_target = (sys.total_memory() as f64 * 0.95) as u64; - default_mem_target.max(MINIMUM_MEM_LIMIT) - }; + + const MINIMUM_MEM_LIMIT: u64 = 512 * 1024 * 1024; + let sys = System::new_with_specifics(RefreshKind::everything()); + let default_mem_target = (sys.total_memory() as f64 * 0.95) as u64; + filesystem_config.mem_limit = default_mem_target.max(MINIMUM_MEM_LIMIT); + + #[cfg(feature = "mem_limiter")] + if let Some(max_mem_target) = args.max_memory_target { + filesystem_config.mem_limit = max_mem_target * 1024 * 1024; + } // Written in this awkward way to force us to update it if we add new checksum types filesystem_config.use_upload_checksums = match args.upload_checksums { From 8fad2e50040b158d406d6d6e1b5484d0607045b5 Mon Sep 17 00:00:00 2001 From: Monthon Klongklaew Date: Fri, 27 Sep 2024 09:45:31 +0000 Subject: [PATCH 9/9] Fix scaling logic and address comments Signed-off-by: Monthon Klongklaew --- mountpoint-s3-client/src/s3_crt_client.rs | 4 +- mountpoint-s3/examples/prefetch_benchmark.rs | 19 +++- mountpoint-s3/src/cli.rs | 2 +- mountpoint-s3/src/fs.rs | 4 +- mountpoint-s3/src/mem_limiter.rs | 2 + mountpoint-s3/src/prefetch.rs | 23 ++--- .../src/prefetch/backpressure_controller.rs | 93 ++++++++++++++++++- mountpoint-s3/src/prefetch/caching_stream.rs | 18 ++-- mountpoint-s3/src/prefetch/part_queue.rs | 25 +++-- mountpoint-s3/src/prefetch/part_stream.rs | 4 +- mountpoint-s3/src/prefetch/task.rs | 12 +-- 11 files changed, 156 insertions(+), 50 deletions(-) diff --git a/mountpoint-s3-client/src/s3_crt_client.rs b/mountpoint-s3-client/src/s3_crt_client.rs index ec7bb74c5..d38742edf 100644 --- a/mountpoint-s3-client/src/s3_crt_client.rs +++ b/mountpoint-s3-client/src/s3_crt_client.rs @@ -769,7 +769,7 @@ impl S3CrtClientInner { // Buffer pool 
metrics let start = Instant::now(); let buffer_pool_stats = s3_client.poll_buffer_pool_usage_stats(); - metrics::histogram!("s3.client.buffer_pool.get_usage_latecy_us").record(start.elapsed().as_micros() as f64); + metrics::histogram!("s3.client.buffer_pool.get_usage_latency_us").record(start.elapsed().as_micros() as f64); metrics::gauge!("s3.client.buffer_pool.mem_limit").set(buffer_pool_stats.mem_limit as f64); metrics::gauge!("s3.client.buffer_pool.primary_cutoff").set(buffer_pool_stats.primary_cutoff as f64); metrics::gauge!("s3.client.buffer_pool.primary_used").set(buffer_pool_stats.primary_used as f64); @@ -1213,7 +1213,7 @@ impl ObjectClient for S3CrtClient { fn mem_usage_stats(&self) -> Option { let start = Instant::now(); let crt_buffer_pool_stats = self.inner.s3_client.poll_buffer_pool_usage_stats(); - metrics::histogram!("s3.client.buffer_pool.get_usage_latecy_us").record(start.elapsed().as_micros() as f64); + metrics::histogram!("s3.client.buffer_pool.get_usage_latency_us").record(start.elapsed().as_micros() as f64); Some(crt_buffer_pool_stats) } diff --git a/mountpoint-s3/examples/prefetch_benchmark.rs b/mountpoint-s3/examples/prefetch_benchmark.rs index 7532edde4..f9db56d36 100644 --- a/mountpoint-s3/examples/prefetch_benchmark.rs +++ b/mountpoint-s3/examples/prefetch_benchmark.rs @@ -13,6 +13,7 @@ use mountpoint_s3_client::config::{EndpointConfig, S3ClientConfig}; use mountpoint_s3_client::types::ETag; use mountpoint_s3_client::{ObjectClient, S3CrtClient}; use mountpoint_s3_crt::common::rust_log_adapter::RustLogAdapter; +use sysinfo::{RefreshKind, System}; use tracing_subscriber::fmt::Subscriber; use tracing_subscriber::util::SubscriberInitExt; use tracing_subscriber::EnvFilter; @@ -41,6 +42,11 @@ fn main() { .long("throughput-target-gbps") .help("Desired throughput in Gbps"), ) + .arg( + Arg::new("max-memory-target") + .long("max-memory-target") + .help("Maximum memory usage target in MiB"), + ) .arg( Arg::new("part-size") .long("part-size") @@ -65,6 +71,9 @@ fn main() { let throughput_target_gbps = matches .get_one::("throughput-target-gbps") .map(|s| s.parse::().expect("throughput target must be an f64")); + let max_memory_target = matches + .get_one::("max-memory-target") + .map(|s| s.parse::().expect("throughput target must be a u64")); let part_size = matches .get_one::("part-size") .map(|s| s.parse::().expect("part size must be a usize")); @@ -93,7 +102,15 @@ fn main() { config = config.part_size(part_size); } let client = Arc::new(S3CrtClient::new(config).expect("couldn't create client")); - let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), 512 * 1024 * 1024)); + + let max_memory_target = if let Some(target) = max_memory_target { + target * 1024 * 1024 + } else { + // Default to 95% of total system memory + let sys = System::new_with_specifics(RefreshKind::everything()); + (sys.total_memory() as f64 * 0.95) as u64 + }; + let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), max_memory_target)); let head_object_result = block_on(client.head_object(bucket, key)).expect("HeadObject failed"); let size = head_object_result.object.size; diff --git a/mountpoint-s3/src/cli.rs b/mountpoint-s3/src/cli.rs index b2451d53a..2df7f2c0c 100644 --- a/mountpoint-s3/src/cli.rs +++ b/mountpoint-s3/src/cli.rs @@ -32,6 +32,7 @@ use crate::fs::{CacheConfig, S3FilesystemConfig, ServerSideEncryption, TimeToLiv use crate::fuse::session::FuseSession; use crate::fuse::S3FuseFilesystem; use crate::logging::{init_logging, LoggingConfig}; +use 
crate::mem_limiter::MINIMUM_MEM_LIMIT; use crate::prefetch::{caching_prefetch, default_prefetch, Prefetch}; use crate::prefix::Prefix; use crate::s3::S3Personality; @@ -787,7 +788,6 @@ where filesystem_config.s3_personality = s3_personality; filesystem_config.server_side_encryption = ServerSideEncryption::new(args.sse.clone(), args.sse_kms_key_id.clone()); - const MINIMUM_MEM_LIMIT: u64 = 512 * 1024 * 1024; let sys = System::new_with_specifics(RefreshKind::everything()); let default_mem_target = (sys.total_memory() as f64 * 0.95) as u64; filesystem_config.mem_limit = default_mem_target.max(MINIMUM_MEM_LIMIT); diff --git a/mountpoint-s3/src/fs.rs b/mountpoint-s3/src/fs.rs index 4e2af1a78..5172e11ca 100644 --- a/mountpoint-s3/src/fs.rs +++ b/mountpoint-s3/src/fs.rs @@ -21,7 +21,7 @@ use crate::inode::{ Inode, InodeError, InodeKind, LookedUp, ReadHandle, ReaddirHandle, Superblock, SuperblockConfig, WriteHandle, }; use crate::logging; -use crate::mem_limiter::MemoryLimiter; +use crate::mem_limiter::{MemoryLimiter, MINIMUM_MEM_LIMIT}; use crate::object::ObjectId; use crate::prefetch::{Prefetch, PrefetchResult}; use crate::prefix::Prefix; @@ -422,7 +422,7 @@ impl Default for S3FilesystemConfig { s3_personality: S3Personality::default(), server_side_encryption: Default::default(), use_upload_checksums: true, - mem_limit: 512 * 1024 * 1024, + mem_limit: MINIMUM_MEM_LIMIT, } } } diff --git a/mountpoint-s3/src/mem_limiter.rs b/mountpoint-s3/src/mem_limiter.rs index 85ea93c5e..780071184 100644 --- a/mountpoint-s3/src/mem_limiter.rs +++ b/mountpoint-s3/src/mem_limiter.rs @@ -5,6 +5,8 @@ use metrics::atomics::AtomicU64; use mountpoint_s3_client::ObjectClient; use tracing::debug; +pub const MINIMUM_MEM_LIMIT: u64 = 512 * 1024 * 1024; + /// `MemoryLimiter` tracks memory used by Mountpoint and makes decisions if a new memory reservation request can be accepted. /// Currently, there are two metrics we take into account: /// 1) the memory reserved by prefetcher instances for the data requested or fetched from CRT client. diff --git a/mountpoint-s3/src/prefetch.rs b/mountpoint-s3/src/prefetch.rs index a018b92df..d6aac7fa7 100644 --- a/mountpoint-s3/src/prefetch.rs +++ b/mountpoint-s3/src/prefetch.rs @@ -267,7 +267,7 @@ pub struct PrefetchGetObject { part_stream: Arc, mem_limiter: Arc>, config: PrefetcherConfig, - backpressure_task: Option>, + backpressure_task: Option>, // Invariant: the offset of the last byte in this window is always // self.next_sequential_read_offset - 1. backward_seek_window: SeekWindow, @@ -420,7 +420,7 @@ where /// We will be using flow-control window to control how much data we want to download into the prefetcher. 
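For reference, the default memory-target selection that the CLI and the updated benchmark now share, shown standalone. This is an illustrative sketch using the same `sysinfo` calls as the patch; it is not additional patch code.

    use sysinfo::{RefreshKind, System};

    /// Default memory target: 95% of total system memory, never below 512 MiB.
    fn default_mem_limit() -> u64 {
        const MINIMUM_MEM_LIMIT: u64 = 512 * 1024 * 1024;
        let sys = System::new_with_specifics(RefreshKind::everything());
        let default_mem_target = (sys.total_memory() as f64 * 0.95) as u64;
        default_mem_target.max(MINIMUM_MEM_LIMIT)
    }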
fn spawn_read_backpressure_request( &mut self, - ) -> Result, PrefetchReadError> { + ) -> Result, PrefetchReadError> { let start = self.next_sequential_read_offset; let object_size = self.size as usize; let read_part_size = self.client.read_part_size().unwrap_or(8 * 1024 * 1024); @@ -560,6 +560,7 @@ mod tests { #![allow(clippy::identity_op)] use crate::data_cache::InMemoryDataCache; + use crate::mem_limiter::MINIMUM_MEM_LIMIT; use super::caching_stream::CachingPartStream; use super::*; @@ -619,7 +620,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(client.clone(), MINIMUM_MEM_LIMIT); let object = MockObject::ramp(0xaa, size as usize, ETag::for_tests()); let etag = object.etag(); @@ -714,7 +715,7 @@ mod tests { Stream: ObjectPartStream + Send + Sync + 'static, { let client = Arc::new(MockClient::new(client_config)); - let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(client.clone(), MINIMUM_MEM_LIMIT); let read_size = 1 * MB; let object_size = 8 * MB; let object = MockObject::ramp(0xaa, object_size, ETag::for_tests()); @@ -821,7 +822,7 @@ mod tests { HashMap::new(), HashMap::new(), )); - let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(client.clone(), MINIMUM_MEM_LIMIT); let prefetcher_config = PrefetcherConfig { max_read_window_size: test_config.max_read_window_size, @@ -946,7 +947,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(client.clone(), MINIMUM_MEM_LIMIT); let object = MockObject::ramp(0xaa, object_size as usize, ETag::for_tests()); let etag = object.etag(); @@ -1130,7 +1131,7 @@ mod tests { HashMap::new(), HashMap::new(), )); - let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(client.clone(), MINIMUM_MEM_LIMIT); let prefetcher = Prefetcher::new(default_stream(), Default::default()); let mem_limiter = Arc::new(mem_limiter); @@ -1183,7 +1184,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), 512 * 1024 * 1024)); + let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), MINIMUM_MEM_LIMIT)); let object = MockObject::ramp(0xaa, OBJECT_SIZE, ETag::for_tests()); let etag = object.etag(); @@ -1225,7 +1226,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), 512 * 1024 * 1024)); + let mem_limiter = Arc::new(MemoryLimiter::new(client.clone(), MINIMUM_MEM_LIMIT)); let object = MockObject::ramp(0xaa, OBJECT_SIZE, ETag::for_tests()); let etag = object.etag(); @@ -1287,7 +1288,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); + let mem_limiter = MemoryLimiter::new(client.clone(), MINIMUM_MEM_LIMIT); let object = MockObject::ramp(0xaa, object_size as usize, ETag::for_tests()); let file_etag = object.etag(); @@ -1353,7 +1354,7 @@ mod tests { ..Default::default() }; let client = Arc::new(MockClient::new(config)); - let mem_limiter = MemoryLimiter::new(client.clone(), 512 * 1024 * 1024); + let 
+        let mem_limiter = MemoryLimiter::new(client.clone(), MINIMUM_MEM_LIMIT);
 
         let object = MockObject::ramp(0xaa, object_size as usize, ETag::for_tests());
         let file_etag = object.etag();
diff --git a/mountpoint-s3/src/prefetch/backpressure_controller.rs b/mountpoint-s3/src/prefetch/backpressure_controller.rs
index 28dc8c893..86db54031 100644
--- a/mountpoint-s3/src/prefetch/backpressure_controller.rs
+++ b/mountpoint-s3/src/prefetch/backpressure_controller.rs
@@ -179,7 +179,9 @@ impl BackpressureController {
     // Scaling up fails silently if there is no enough free memory to perform it.
     fn scale_up(&mut self) {
         if self.preferred_read_window_size < self.max_read_window_size {
-            let new_read_window_size = self.preferred_read_window_size * self.read_window_size_multiplier;
+            let new_read_window_size = (self.preferred_read_window_size * self.read_window_size_multiplier)
+                .max(self.min_read_window_size)
+                .min(self.max_read_window_size);
             // Only scale up when there is enough memory. We don't have to reserve the memory here
             // because only `preferred_read_window_size` is increased but the actual read window will
             // be updated later on `DataRead` event (where we do reserve memory).
@@ -203,7 +205,9 @@ impl BackpressureController {
     fn scale_down(&mut self) {
         if self.preferred_read_window_size > self.min_read_window_size {
             assert!(self.read_window_size_multiplier > 1);
-            let new_read_window_size = self.preferred_read_window_size / self.read_window_size_multiplier;
+            let new_read_window_size = (self.preferred_read_window_size / self.read_window_size_multiplier)
+                .max(self.min_read_window_size)
+                .min(self.max_read_window_size);
             let formatter = make_format(humansize::BINARY);
             debug!(
                 current_size = formatter(self.preferred_read_window_size),
@@ -267,3 +271,88 @@ impl BackpressureLimiter {
         Ok(Some(self.read_window_end_offset))
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use mountpoint_s3_client::mock_client::{MockClient, MockClientConfig};
+    use test_case::test_case;
+
+    use crate::mem_limiter::MemoryLimiter;
+
+    use super::{new_backpressure_controller, BackpressureConfig, BackpressureController, BackpressureLimiter};
+
+    #[test_case(1024 * 1024 + 128 * 1024, 2)] // real config
+    #[test_case(3 * 1024 * 1024, 4)]
+    #[test_case(8 * 1024 * 1024, 8)]
+    #[test_case(2 * 1024 * 1024 * 1024, 2)]
+    fn test_read_window_scale_up(initial_read_window_size: usize, read_window_size_multiplier: usize) {
+        let request_range = 0..(5 * 1024 * 1024 * 1024);
+        let backpressure_config = BackpressureConfig {
+            initial_read_window_size,
+            min_read_window_size: 8 * 1024 * 1024,
+            max_read_window_size: 2 * 1024 * 1024 * 1024,
+            read_window_size_multiplier,
+            request_range,
+        };
+
+        let (mut backpressure_controller, _backpressure_limiter) =
+            new_backpressure_controller_for_test(backpressure_config);
+        while backpressure_controller.preferred_read_window_size < backpressure_controller.max_read_window_size {
+            backpressure_controller.scale_up();
+            assert!(backpressure_controller.preferred_read_window_size >= backpressure_controller.min_read_window_size);
+            assert!(backpressure_controller.preferred_read_window_size <= backpressure_controller.max_read_window_size);
+        }
+        assert_eq!(
+            backpressure_controller.preferred_read_window_size, backpressure_controller.max_read_window_size,
+            "should have scaled up to max read window size"
+        );
+    }
+
+    #[test_case(2 * 1024 * 1024 * 1024, 2)]
+    #[test_case(15 * 1024 * 1024 * 1024, 2)]
+    #[test_case(2 * 1024 * 1024 * 1024, 8)]
+    #[test_case(8 * 1024 * 1024, 8)]
+    fn test_read_window_scale_down(initial_read_window_size: usize, read_window_size_multiplier: usize) {
+        let request_range = 0..(5 * 1024 * 1024 * 1024);
+        let backpressure_config = BackpressureConfig {
+            initial_read_window_size,
+            min_read_window_size: 8 * 1024 * 1024,
+            max_read_window_size: 2 * 1024 * 1024 * 1024,
+            read_window_size_multiplier,
+            request_range,
+        };
+
+        let (mut backpressure_controller, _backpressure_limiter) =
+            new_backpressure_controller_for_test(backpressure_config);
+        while backpressure_controller.preferred_read_window_size > backpressure_controller.min_read_window_size {
+            backpressure_controller.scale_down();
+            assert!(backpressure_controller.preferred_read_window_size <= backpressure_controller.max_read_window_size);
+            assert!(backpressure_controller.preferred_read_window_size >= backpressure_controller.min_read_window_size);
+        }
+        assert_eq!(
+            backpressure_controller.preferred_read_window_size, backpressure_controller.min_read_window_size,
+            "should have scaled down to min read window size"
+        );
+    }
+
+    fn new_backpressure_controller_for_test(
+        backpressure_config: BackpressureConfig,
+    ) -> (BackpressureController, BackpressureLimiter) {
+        let config = MockClientConfig {
+            bucket: "test-bucket".to_string(),
+            part_size: 8 * 1024 * 1024,
+            enable_backpressure: true,
+            initial_read_window_size: backpressure_config.initial_read_window_size,
+            ..Default::default()
+        };
+
+        let client = MockClient::new(config);
+        let mem_limiter = Arc::new(MemoryLimiter::new(
+            client,
+            backpressure_config.max_read_window_size as u64,
+        ));
+        new_backpressure_controller(backpressure_config, mem_limiter.clone())
+    }
+}
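
As a quick sanity check of the clamping arithmetic that scale_up/scale_down now apply, the
expression can be pulled out into a standalone sketch using the same bounds as the tests above
(8 MiB min, 2 GiB max). Function names here are illustrative; the real controller additionally
checks available memory before committing to a larger window.

    const MIB: usize = 1024 * 1024;

    // Grow or shrink by the multiplier, then clamp into [min, max].
    fn scaled_up(current: usize, multiplier: usize, min: usize, max: usize) -> usize {
        (current * multiplier).max(min).min(max)
    }

    fn scaled_down(current: usize, multiplier: usize, min: usize, max: usize) -> usize {
        (current / multiplier).max(min).min(max)
    }

    fn main() {
        let (min, max) = (8 * MIB, 2048 * MIB);
        // 1 MiB + 128 KiB doubled is still below 8 MiB, so the result is clamped up to min.
        assert_eq!(scaled_up(MIB + 128 * 1024, 2, min, max), min);
        // 1.5 GiB doubled would exceed 2 GiB, so the result is clamped down to max.
        assert_eq!(scaled_up(1536 * MIB, 2, min, max), max);
        // Scaling 8 MiB down by a factor of 8 never drops below min.
        assert_eq!(scaled_down(8 * MIB, 8, min, max), min);
        println!("clamping behaves as expected");
    }
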
diff --git a/mountpoint-s3/src/prefetch/caching_stream.rs b/mountpoint-s3/src/prefetch/caching_stream.rs
index e664d7b03..7d7d044c8 100644
--- a/mountpoint-s3/src/prefetch/caching_stream.rs
+++ b/mountpoint-s3/src/prefetch/caching_stream.rs
@@ -47,7 +47,7 @@ where
         client: &Client,
         config: RequestTaskConfig,
         mem_limiter: Arc>,
-    ) -> RequestTask<::ClientError, Client>
+    ) -> RequestTask
     where
         Client: ObjectClient + Clone + Send + Sync + 'static,
     {
@@ -391,7 +391,11 @@ mod tests {
     };
     use test_case::test_case;
 
-    use crate::{data_cache::InMemoryDataCache, mem_limiter::MemoryLimiter, object::ObjectId};
+    use crate::{
+        data_cache::InMemoryDataCache,
+        mem_limiter::{MemoryLimiter, MINIMUM_MEM_LIMIT},
+        object::ObjectId,
+    };
 
     use super::*;
 
@@ -432,7 +436,7 @@ mod tests {
             ..Default::default()
         };
         let mock_client = Arc::new(MockClient::new(config));
-        let mem_limiter = Arc::new(MemoryLimiter::new(mock_client.clone(), 512 * 1024 * 1024));
+        let mem_limiter = Arc::new(MemoryLimiter::new(mock_client.clone(), MINIMUM_MEM_LIMIT));
         mock_client.add_object(key, object.clone());
 
         let runtime = ThreadPool::builder().pool_size(1).create().unwrap();
@@ -513,7 +517,7 @@ mod tests {
             ..Default::default()
         };
         let mock_client = Arc::new(MockClient::new(config));
-        let mem_limiter = Arc::new(MemoryLimiter::new(mock_client.clone(), 512 * 1024 * 1024));
+        let mem_limiter = Arc::new(MemoryLimiter::new(mock_client.clone(), MINIMUM_MEM_LIMIT));
         mock_client.add_object(key, object.clone());
 
         let runtime = ThreadPool::builder().pool_size(1).create().unwrap();
@@ -537,11 +541,7 @@ mod tests {
         }
     }
 
-    fn compare_read(
-        id: &ObjectId,
-        object: &MockObject,
-        mut request_task: RequestTask,
-    ) {
+    fn compare_read(id: &ObjectId, object: &MockObject, mut request_task: RequestTask) {
         let mut offset = request_task.start_offset();
         let mut remaining = request_task.total_size();
         while remaining > 0 {
diff --git a/mountpoint-s3/src/prefetch/part_queue.rs b/mountpoint-s3/src/prefetch/part_queue.rs
index aa1a1d49f..3f8b2e16d 100644
--- a/mountpoint-s3/src/prefetch/part_queue.rs
+++ b/mountpoint-s3/src/prefetch/part_queue.rs
@@ -13,12 +13,12 @@ use crate::sync::Arc;
 /// A queue of [Part]s where the first part can be partially read from if the reader doesn't want
 /// the entire part in one shot.
 #[derive(Debug)]
-pub struct PartQueue {
+pub struct PartQueue {
     /// The auxiliary queue that supports pushing parts to the front of the part queue in order to
     /// allow partial reads and backwards seeks.
     front_queue: Vec,
     /// The main queue that receives parts from [super::ObjectPartStream]
-    receiver: Receiver>>,
+    receiver: Receiver>>,
     failed: bool,
     /// The total number of bytes sent to the underlying queue of `self.receiver`
     bytes_received: Arc,
@@ -34,9 +34,9 @@ pub struct PartQueueProducer {
 }
 
 /// Creates an unbounded [PartQueue] and its related [PartQueueProducer].
-pub fn unbounded_part_queue(
+pub fn unbounded_part_queue(
     mem_limiter: Arc>,
-) -> (PartQueue, PartQueueProducer) {
+) -> (PartQueue, PartQueueProducer) {
     let (sender, receiver) = unbounded();
     let bytes_counter = Arc::new(AtomicUsize::new(0));
     let part_queue = PartQueue {
@@ -53,14 +53,14 @@ pub fn unbounded_part_queue(
     (part_queue, part_queue_producer)
 }
 
-impl PartQueue {
+impl PartQueue {
     /// Read up to `length` bytes from the queue at the current offset. This function always returns
     /// a contiguous [Bytes], and so may return fewer than `length` bytes if it would need to copy
     /// or reallocate to make the return value contiguous. This function blocks only if the queue is
     /// empty.
     ///
     /// If this method returns an Err, the PartQueue must never be accessed again.
-    pub async fn read(&mut self, length: usize) -> Result> {
+    pub async fn read(&mut self, length: usize) -> Result> {
         assert!(!self.failed, "cannot use a PartQueue after failure");
 
         // Read from the auxiliary queue first if it's not empty
@@ -98,7 +98,7 @@ impl PartQueue
-    pub async fn push_front(&mut self, part: Part) -> Result<(), PrefetchReadError> {
+    pub async fn push_front(&mut self, part: Part) -> Result<(), PrefetchReadError> {
         assert!(!self.failed, "cannot use a PartQueue after failure");
 
         metrics::gauge!("prefetch.bytes_in_queue").increment(part.len() as f64);
@@ -130,7 +130,7 @@ impl PartQueueProducer {
     }
 }
 
-impl Drop for PartQueue {
+impl Drop for PartQueue {
     fn drop(&mut self) {
         // close the channel and drain remaining parts from the main queue
         self.receiver.close();
@@ -151,6 +151,7 @@ impl Drop for PartQueue {
 #[cfg(test)]
 mod tests {
     use crate::checksums::ChecksummedBytes;
+    use crate::mem_limiter::MINIMUM_MEM_LIMIT;
     use crate::object::ObjectId;
 
     use super::*;
@@ -161,7 +162,6 @@ mod tests {
     use mountpoint_s3_client::types::ETag;
     use proptest::proptest;
     use proptest_derive::Arbitrary;
-    use thiserror::Error;
 
     #[derive(Debug, Arbitrary)]
     enum Op {
@@ -170,14 +170,11 @@ mod tests {
         Push(#[proptest(strategy = "1usize..8192")] usize),
     }
 
-    #[derive(Debug, Error)]
-    enum DummyError {}
-
     async fn run_test(ops: Vec) {
         let client = MockClient::new(Default::default());
-        let mem_limiter = MemoryLimiter::new(client, 512 * 1024 * 1024);
+        let mem_limiter = MemoryLimiter::new(client, MINIMUM_MEM_LIMIT);
         let part_id = ObjectId::new("key".to_owned(), ETag::for_tests());
-        let (mut part_queue, part_queue_producer) = unbounded_part_queue::(mem_limiter.into());
+        let (mut part_queue, part_queue_producer) = unbounded_part_queue::(mem_limiter.into());
         let mut current_offset = 0;
         let mut current_length = 0;
         for op in ops {
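
The partial-read contract described in the PartQueue doc comment above (return a contiguous
chunk, possibly shorter than requested, and keep the unread remainder at the front) can be
illustrated with a plain VecDeque of byte buffers. The names below are made up for this sketch
and are not the types used in the patch.

    use std::collections::VecDeque;

    /// Illustration only: read up to `length` bytes, returning a contiguous chunk and
    /// pushing any unread remainder back to the front for the next read.
    fn read_up_to(queue: &mut VecDeque<Vec<u8>>, length: usize) -> Vec<u8> {
        let Some(mut part) = queue.pop_front() else {
            return Vec::new();
        };
        if part.len() > length {
            let rest = part.split_off(length);
            queue.push_front(rest);
        }
        part
    }

    fn main() {
        let mut queue = VecDeque::from([vec![1u8, 2, 3, 4], vec![5u8, 6]]);
        assert_eq!(read_up_to(&mut queue, 3), vec![1, 2, 3]); // partial read of the first part
        assert_eq!(read_up_to(&mut queue, 3), vec![4]);       // remainder, never crosses parts
        assert_eq!(read_up_to(&mut queue, 3), vec![5, 6]);
    }
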
diff --git a/mountpoint-s3/src/prefetch/part_stream.rs b/mountpoint-s3/src/prefetch/part_stream.rs
index 7fad902f6..b5eb32e01 100644
--- a/mountpoint-s3/src/prefetch/part_stream.rs
+++ b/mountpoint-s3/src/prefetch/part_stream.rs
@@ -29,7 +29,7 @@ pub trait ObjectPartStream {
         client: &Client,
         config: RequestTaskConfig,
         mem_limiter: Arc>,
-    ) -> RequestTask
+    ) -> RequestTask
     where
         Client: ObjectClient + Clone + Send + Sync + 'static;
 }
@@ -186,7 +186,7 @@ where
         client: &Client,
         config: RequestTaskConfig,
         mem_limiter: Arc>,
-    ) -> RequestTask
+    ) -> RequestTask
     where
         Client: ObjectClient + Clone + Send + Sync + 'static,
     {
diff --git a/mountpoint-s3/src/prefetch/task.rs b/mountpoint-s3/src/prefetch/task.rs
index bf47190fe..9ff2718ee 100644
--- a/mountpoint-s3/src/prefetch/task.rs
+++ b/mountpoint-s3/src/prefetch/task.rs
@@ -11,21 +11,21 @@ use super::part_stream::RequestRange;
 
 /// A single GetObject request submitted to the S3 client
 #[derive(Debug)]
-pub struct RequestTask {
+pub struct RequestTask {
     /// Handle on the task/future. The future is cancelled when handle is dropped. This is None if
     /// the request is fake (created by seeking backwards in the stream)
     _task_handle: RemoteHandle<()>,
     remaining: usize,
     range: RequestRange,
-    part_queue: PartQueue,
+    part_queue: PartQueue,
     backpressure_controller: BackpressureController,
 }
 
-impl RequestTask {
+impl RequestTask {
     pub fn from_handle(
         task_handle: RemoteHandle<()>,
         range: RequestRange,
-        part_queue: PartQueue,
+        part_queue: PartQueue,
         backpressure_controller: BackpressureController,
     ) -> Self {
         Self {
@@ -38,7 +38,7 @@ impl RequestTask
-    pub async fn push_front(&mut self, parts: Vec) -> Result<(), PrefetchReadError> {
+    pub async fn push_front(&mut self, parts: Vec) -> Result<(), PrefetchReadError> {
         // Iterate backwards to push each part to the front of the part queue
         for part in parts.into_iter().rev() {
             self.remaining += part.len();
@@ -47,7 +47,7 @@ impl RequestTask
-    pub async fn read(&mut self, length: usize) -> Result> {
+    pub async fn read(&mut self, length: usize) -> Result> {
         let part = self.part_queue.read(length).await?;
         debug_assert!(part.len() <= self.remaining);
         self.remaining -= part.len();