Avoid extending part on backward seek (#1005)

* Avoid extending part on backward seek Currently, we combine parts from the seek window to `current_part` in the part queue whenever we seek backward which mean we also have to re-compute checksums for this combined part. It particularly affect read throughput in some use cases where backward seek rate is high. This change should improve the throughput for those use cases. Signed-off-by: Monthon Klongklaew <[email protected]> * PR comments Signed-off-by: Monthon Klongklaew <[email protected]> --------- Signed-off-by: Monthon Klongklaew <[email protected]>
awslabs · Sep 9, 2024 · cdb8ccd · cdb8ccd · github-actions · Sep 9, 2024
1 parent 6d49885
commit cdb8ccd
Show file tree

Hide file tree

Showing 2 changed files with 39 additions and 57 deletions.
diff --git a/mountpoint-s3/src/prefetch/part_queue.rs b/mountpoint-s3/src/prefetch/part_queue.rs
@@ -5,16 +5,19 @@ use tracing::trace;
 use crate::prefetch::part::Part;
 use crate::prefetch::PrefetchReadError;
 use crate::sync::async_channel::{unbounded, Receiver, RecvError, Sender};
-use crate::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-use crate::sync::{Arc, AsyncMutex};
+use crate::sync::atomic::{AtomicUsize, Ordering};
+use crate::sync::Arc;
 
 /// A queue of [Part]s where the first part can be partially read from if the reader doesn't want
 /// the entire part in one shot.
 #[derive(Debug)]
 pub struct PartQueue<E: std::error::Error> {
-    current_part: AsyncMutex<Option<Part>>,
+    /// The auxiliary queue that supports pushing parts to the front of the part queue in order to
+    /// allow partial reads and backwards seeks.
+    front_queue: Vec<Part>,
+    /// The main queue that receives parts from [super::ObjectPartStream]
     receiver: Receiver<Result<Part, PrefetchReadError<E>>>,
-    failed: AtomicBool,
+    failed: bool,
     /// The total number of bytes sent to the underlying queue of `self.receiver`
     bytes_received: Arc<AtomicUsize>,
 }
@@ -32,9 +35,9 @@ pub fn unbounded_part_queue<E: std::error::Error>() -> (PartQueue<E>, PartQueueP
     let (sender, receiver) = unbounded();
     let bytes_counter = Arc::new(AtomicUsize::new(0));
     let part_queue = PartQueue {
-        current_part: AsyncMutex::new(None),
+        front_queue: Vec::new(),
         receiver,
-        failed: AtomicBool::new(false),
+        failed: false,
         bytes_received: Arc::clone(&bytes_counter),
     };
     let part_queue_producer = PartQueueProducer {
@@ -52,65 +55,48 @@ impl<E: std::error::Error + Send + Sync> PartQueue<E> {
     ///
     /// If this method returns an Err, the PartQueue must never be accessed again.
     pub async fn read(&mut self, length: usize) -> Result<Part, PrefetchReadError<E>> {
-        let mut current_part = self.current_part.lock().await;
-
-        assert!(
-            !self.failed.load(Ordering::SeqCst),
-            "cannot use a PartQueue after failure"
-        );
-
-        let part = if let Some(current_part) = current_part.take() {
-            Ok(current_part)
+        assert!(!self.failed, "cannot use a PartQueue after failure");
+
+        // Read from the auxiliary queue first if it's not empty
+        let part = if let Some(part) = self.front_queue.pop() {
+            Ok(part)
+        // Then do `try_recv` from the main queue so we can track whether the read is starved or not
+        } else if let Ok(part) = self.receiver.try_recv() {
+            part
         } else {
-            // Do `try_recv` first so we can track whether the read is starved or not
-            if let Ok(part) = self.receiver.try_recv() {
-                part
-            } else {
-                let start = Instant::now();
-                let part = self.receiver.recv().await;
-                metrics::histogram!("prefetch.part_queue_starved_us").record(start.elapsed().as_micros() as f64);
-                match part {
-                    Err(RecvError) => Err(PrefetchReadError::GetRequestTerminatedUnexpectedly),
-                    Ok(part) => part,
-                }
+            let start = Instant::now();
+            let part = self.receiver.recv().await;
+            metrics::histogram!("prefetch.part_queue_starved_us").record(start.elapsed().as_micros() as f64);
+            match part {
+                Err(RecvError) => Err(PrefetchReadError::GetRequestTerminatedUnexpectedly),
+                Ok(part) => part,
             }
         };
 
         let mut part = match part {
             Err(e) => {
-                self.failed.store(true, Ordering::SeqCst);
+                self.failed = true;
                 return Err(e);
             }
             Ok(part) => part,
         };
         debug_assert!(!part.is_empty(), "parts must not be empty");
 
+        // Split the part and push the remaining to the front of the queue if it's a partial read
         if length < part.len() {
             let tail = part.split_off(length);
-            *current_part = Some(tail);
+            self.front_queue.push(tail);
         }
         metrics::gauge!("prefetch.bytes_in_queue").decrement(part.len() as f64);
         Ok(part)
     }
 
     /// Push a new [Part] onto the front of the queue
-    /// which actually just concatenate it with the current part
-    pub async fn push_front(&self, mut part: Part) -> Result<(), PrefetchReadError<E>> {
-        let part_len = part.len();
-        let mut current_part = self.current_part.lock().await;
-
-        assert!(
-            !self.failed.load(Ordering::SeqCst),
-            "cannot use a PartQueue after failure"
-        );
-
-        if let Some(current_part) = current_part.as_mut() {
-            part.extend(current_part)?;
-            *current_part = part;
-        } else {
-            *current_part = Some(part);
-        }
-        metrics::gauge!("prefetch.bytes_in_queue").increment(part_len as f64);
+    pub async fn push_front(&mut self, part: Part) -> Result<(), PrefetchReadError<E>> {
+        assert!(!self.failed, "cannot use a PartQueue after failure");
+
+        metrics::gauge!("prefetch.bytes_in_queue").increment(part.len() as f64);
+        self.front_queue.push(part);
         Ok(())
     }
 
@@ -137,21 +123,19 @@ impl<E: std::error::Error + Send + Sync> PartQueueProducer<E> {
 
 impl<E: std::error::Error> Drop for PartQueue<E> {
     fn drop(&mut self) {
-        let current_part = self.current_part.lock_blocking();
-        let current_size = match current_part.as_ref() {
-            Some(part) => part.len(),
-            None => 0,
-        };
-        // close the channel and drain remaining parts
+        // close the channel and drain remaining parts from the main queue
         self.receiver.close();
         let mut queue_size = 0;
         while let Ok(part) = self.receiver.try_recv() {
             if let Ok(part) = part {
                 queue_size += part.len();
             }
         }
-        let remaining = current_size + queue_size;
-        metrics::gauge!("prefetch.bytes_in_queue").decrement(remaining as f64);
+        // count remaining bytes in the auxiliary queue
+        for part in &self.front_queue {
+            queue_size += part.len()
+        }
+        metrics::gauge!("prefetch.bytes_in_queue").decrement(queue_size as f64);
     }
 }
 
@@ -199,7 +183,7 @@ mod tests {
                     current_length -= bytes.len();
                 }
                 Op::ReadAligned => {
-                    let first_part_length = part_queue.current_part.lock().await.as_ref().map(|p| p.len());
+                    let first_part_length = part_queue.front_queue.last().map(|p| p.len());
                     if let Some(n) = first_part_length {
                         let part = part_queue.read(n).await.unwrap();
                         let checksummed_bytes = part.into_bytes(&part_id, current_offset).unwrap();

diff --git a/mountpoint-s3/src/prefetch/task.rs b/mountpoint-s3/src/prefetch/task.rs
@@ -38,9 +38,7 @@ impl<E: std::error::Error + Send + Sync> RequestTask<E> {
 
     // Push a given list of parts in front of the part queue
     pub async fn push_front(&mut self, parts: Vec<Part>) -> Result<(), PrefetchReadError<E>> {
-        // Merge all parts into one single part by pushing them to the front of the part queue.
-        // This could result in a really big part, but we normally use this only for backward seek
-        // so its size should not be bigger than the prefetcher's `max_backward_seek_distance`.
+        // Iterate backwards to push each part to the front of the part queue
         for part in parts.into_iter().rev() {
             self.remaining += part.len();
             self.part_queue.push_front(part).await?;