diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs
index 168b36843b..8eb46f0aa9 100644
--- a/wgpu-core/src/device/queue.rs
+++ b/wgpu-core/src/device/queue.rs
@@ -1499,7 +1499,7 @@ impl Global {
                 .raw
                 .as_ref()
                 .unwrap()
-                .submit(&refs, &submit_surface_textures, Some((fence, submit_index)))
+                .submit(&refs, &submit_surface_textures, (fence, submit_index))
                 .map_err(DeviceError::from)?;
         }
diff --git a/wgpu-core/src/present.rs b/wgpu-core/src/present.rs
index 053f7fdb24..7f5939feb0 100644
--- a/wgpu-core/src/present.rs
+++ b/wgpu-core/src/present.rs
@@ -154,17 +154,20 @@ impl Global {
                 parent_id: surface_id,
             });
         }
-        #[cfg(not(feature = "trace"))]
-        let _ = device;
+
+        let fence_guard = device.fence.read();
+        let fence = fence_guard.as_ref().unwrap();
 
         let suf = A::surface_as_hal(surface.as_ref());
         let (texture_id, status) = match unsafe {
-            suf.unwrap()
-                .acquire_texture(Some(std::time::Duration::from_millis(
-                    FRAME_TIMEOUT_MS as u64,
-                )))
+            suf.unwrap().acquire_texture(
+                Some(std::time::Duration::from_millis(FRAME_TIMEOUT_MS as u64)),
+                fence,
+            )
         } {
             Ok(Some(ast)) => {
+                drop(fence_guard);
+
                 let texture_desc = wgt::TextureDescriptor {
                     label: (),
                     size: wgt::Extent3d {
diff --git a/wgpu-hal/examples/halmark/main.rs b/wgpu-hal/examples/halmark/main.rs
index ee59fa2590..36c286429a 100644
--- a/wgpu-hal/examples/halmark/main.rs
+++ b/wgpu-hal/examples/halmark/main.rs
@@ -22,7 +22,6 @@ const MAX_BUNNIES: usize = 1 << 20;
 const BUNNY_SIZE: f32 = 0.15 * 256.0;
 const GRAVITY: f32 = -9.8 * 100.0;
 const MAX_VELOCITY: f32 = 750.0;
-const COMMAND_BUFFER_PER_CONTEXT: usize = 100;
 const DESIRED_MAX_LATENCY: u32 = 2;
 
 #[repr(C)]
@@ -496,7 +495,7 @@ impl Example {
         let mut fence = device.create_fence().unwrap();
         let init_cmd = cmd_encoder.end_encoding().unwrap();
         queue
-            .submit(&[&init_cmd], &[], Some((&mut fence, init_fence_value)))
+            .submit(&[&init_cmd], &[], (&mut fence, init_fence_value))
             .unwrap();
         device.wait(&fence, init_fence_value, !0).unwrap();
         device.destroy_buffer(staging_buffer);
@@ -548,7 +547,7 @@ impl Example {
         {
             let ctx = &mut self.contexts[self.context_index];
             self.queue
-                .submit(&[], &[], Some((&mut ctx.fence, ctx.fence_value)))
+                .submit(&[], &[], (&mut ctx.fence, ctx.fence_value))
                 .unwrap();
         }
 
@@ -648,7 +647,13 @@ impl Example {
         let ctx = &mut self.contexts[self.context_index];
 
-        let surface_tex = unsafe { self.surface.acquire_texture(None).unwrap().unwrap().texture };
+        let surface_tex = unsafe {
+            self.surface
+                .acquire_texture(None, &ctx.fence)
+                .unwrap()
+                .unwrap()
+                .texture
+        };
 
         let target_barrier0 = hal::TextureBarrier {
             texture: surface_tex.borrow(),
@@ -716,7 +721,6 @@ impl Example {
         }
 
         ctx.frames_recorded += 1;
-        let do_fence = ctx.frames_recorded > COMMAND_BUFFER_PER_CONTEXT;
 
         let target_barrier1 = hal::TextureBarrier {
             texture: surface_tex.borrow(),
@@ -730,45 +734,42 @@ impl Example {
 
         unsafe {
             let cmd_buf = ctx.encoder.end_encoding().unwrap();
-            let fence_param = if do_fence {
-                Some((&mut ctx.fence, ctx.fence_value))
-            } else {
-                None
-            };
             self.queue
-                .submit(&[&cmd_buf], &[&surface_tex], fence_param)
+                .submit(
+                    &[&cmd_buf],
+                    &[&surface_tex],
+                    (&mut ctx.fence, ctx.fence_value),
+                )
                 .unwrap();
             self.queue.present(&self.surface, surface_tex).unwrap();
             ctx.used_cmd_bufs.push(cmd_buf);
             ctx.used_views.push(surface_tex_view);
         };
 
-        if do_fence {
-            log::debug!("Context switch from {}", self.context_index);
-            let old_fence_value = ctx.fence_value;
-            if self.contexts.len() == 1 {
-                let hal_desc = hal::CommandEncoderDescriptor {
-                    label: None,
-                    queue: &self.queue,
-                };
-                self.contexts.push(unsafe {
-                    ExecutionContext {
-                        encoder: self.device.create_command_encoder(&hal_desc).unwrap(),
-                        fence: self.device.create_fence().unwrap(),
-                        fence_value: 0,
-                        used_views: Vec::new(),
-                        used_cmd_bufs: Vec::new(),
-                        frames_recorded: 0,
-                    }
-                });
-            }
-            self.context_index = (self.context_index + 1) % self.contexts.len();
-            let next = &mut self.contexts[self.context_index];
-            unsafe {
-                next.wait_and_clear(&self.device);
-            }
-            next.fence_value = old_fence_value + 1;
+        log::debug!("Context switch from {}", self.context_index);
+        let old_fence_value = ctx.fence_value;
+        if self.contexts.len() == 1 {
+            let hal_desc = hal::CommandEncoderDescriptor {
+                label: None,
+                queue: &self.queue,
+            };
+            self.contexts.push(unsafe {
+                ExecutionContext {
+                    encoder: self.device.create_command_encoder(&hal_desc).unwrap(),
+                    fence: self.device.create_fence().unwrap(),
+                    fence_value: 0,
+                    used_views: Vec::new(),
+                    used_cmd_bufs: Vec::new(),
+                    frames_recorded: 0,
+                }
+            });
+        }
+        self.context_index = (self.context_index + 1) % self.contexts.len();
+        let next = &mut self.contexts[self.context_index];
+        unsafe {
+            next.wait_and_clear(&self.device);
         }
+        next.fence_value = old_fence_value + 1;
     }
 }
diff --git a/wgpu-hal/examples/raw-gles.rs b/wgpu-hal/examples/raw-gles.rs
index 342100e1cb..675a518694 100644
--- a/wgpu-hal/examples/raw-gles.rs
+++ b/wgpu-hal/examples/raw-gles.rs
@@ -156,6 +156,7 @@ fn fill_screen(exposed: &hal::ExposedAdapter<hal::api::Gles>, width: u32, height
         })
         .unwrap()
     };
+    let mut fence = unsafe { od.device.create_fence().unwrap() };
     let rp_desc = hal::RenderPassDescriptor {
         label: None,
         extent: wgt::Extent3d {
@@ -183,6 +184,6 @@ fn fill_screen(exposed: &hal::ExposedAdapter<hal::api::Gles>, width: u32, height
         encoder.begin_render_pass(&rp_desc);
         encoder.end_render_pass();
         let cmd_buf = encoder.end_encoding().unwrap();
-        od.queue.submit(&[&cmd_buf], &[], None).unwrap();
+        od.queue.submit(&[&cmd_buf], &[], (&mut fence, 0)).unwrap();
     }
 }
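Taken together, the example changes above show the shape of the new API: `Queue::submit` now always takes a `(fence, value)` pair, so even a one-shot submission like the one in `raw-gles.rs` must create a fence first. A minimal sketch of that pattern, generic over any backend; `submit_and_wait` is an illustrative helper, not part of this diff:

```rust
use wgpu_hal as hal;

/// Submit one command buffer and block until the GPU signals `value` on `fence`.
unsafe fn submit_and_wait<A: hal::Api>(
    device: &A::Device,
    queue: &A::Queue,
    cmd_buf: &A::CommandBuffer,
    fence: &mut A::Fence,
    value: hal::FenceValue,
) -> Result<(), hal::DeviceError> {
    use hal::{Device as _, Queue as _};
    // Every submission now names the (fence, value) pair it will signal.
    queue.submit(&[cmd_buf], &[], (fence, value))?;
    // `!0` means "no timeout": wait until the fence reaches `value`.
    let _signaled = device.wait(fence, value, !0)?;
    Ok(())
}
```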
diff --git a/wgpu-hal/examples/ray-traced-triangle/main.rs b/wgpu-hal/examples/ray-traced-triangle/main.rs
index 8f404dc4d2..4d995f7576 100644
--- a/wgpu-hal/examples/ray-traced-triangle/main.rs
+++ b/wgpu-hal/examples/ray-traced-triangle/main.rs
@@ -13,7 +13,6 @@ use std::{
 };
 use winit::window::WindowButtons;
 
-const COMMAND_BUFFER_PER_CONTEXT: usize = 100;
 const DESIRED_MAX_LATENCY: u32 = 2;
 
 /// [D3D12_RAYTRACING_INSTANCE_DESC](https://microsoft.github.io/DirectX-Specs/d3d/Raytracing.html#d3d12_raytracing_instance_desc)
@@ -758,7 +757,7 @@ impl Example {
         let mut fence = device.create_fence().unwrap();
         let init_cmd = cmd_encoder.end_encoding().unwrap();
         queue
-            .submit(&[&init_cmd], &[], Some((&mut fence, init_fence_value)))
+            .submit(&[&init_cmd], &[], (&mut fence, init_fence_value))
             .unwrap();
         device.wait(&fence, init_fence_value, !0).unwrap();
         cmd_encoder.reset_all(iter::once(init_cmd));
@@ -807,7 +806,13 @@ impl Example {
     fn render(&mut self) {
         let ctx = &mut self.contexts[self.context_index];
 
-        let surface_tex = unsafe { self.surface.acquire_texture(None).unwrap().unwrap().texture };
+        let surface_tex = unsafe {
+            self.surface
+                .acquire_texture(None, &ctx.fence)
+                .unwrap()
+                .unwrap()
+                .texture
+        };
 
         let target_barrier0 = hal::TextureBarrier {
             texture: surface_tex.borrow(),
@@ -908,7 +913,6 @@ impl Example {
         }
 
         ctx.frames_recorded += 1;
-        let do_fence = ctx.frames_recorded > COMMAND_BUFFER_PER_CONTEXT;
 
         let target_barrier1 = hal::TextureBarrier {
             texture: surface_tex.borrow(),
@@ -958,45 +962,42 @@ impl Example {
 
         unsafe {
             let cmd_buf = ctx.encoder.end_encoding().unwrap();
-            let fence_param = if do_fence {
-                Some((&mut ctx.fence, ctx.fence_value))
-            } else {
-                None
-            };
             self.queue
-                .submit(&[&cmd_buf], &[&surface_tex], fence_param)
+                .submit(
+                    &[&cmd_buf],
+                    &[&surface_tex],
+                    (&mut ctx.fence, ctx.fence_value),
+                )
                 .unwrap();
             self.queue.present(&self.surface, surface_tex).unwrap();
             ctx.used_cmd_bufs.push(cmd_buf);
             ctx.used_views.push(surface_tex_view);
         };
 
-        if do_fence {
-            log::info!("Context switch from {}", self.context_index);
-            let old_fence_value = ctx.fence_value;
-            if self.contexts.len() == 1 {
-                let hal_desc = hal::CommandEncoderDescriptor {
-                    label: None,
-                    queue: &self.queue,
-                };
-                self.contexts.push(unsafe {
-                    ExecutionContext {
-                        encoder: self.device.create_command_encoder(&hal_desc).unwrap(),
-                        fence: self.device.create_fence().unwrap(),
-                        fence_value: 0,
-                        used_views: Vec::new(),
-                        used_cmd_bufs: Vec::new(),
-                        frames_recorded: 0,
-                    }
-                });
-            }
-            self.context_index = (self.context_index + 1) % self.contexts.len();
-            let next = &mut self.contexts[self.context_index];
-            unsafe {
-                next.wait_and_clear(&self.device);
-            }
-            next.fence_value = old_fence_value + 1;
+        log::info!("Context switch from {}", self.context_index);
+        let old_fence_value = ctx.fence_value;
+        if self.contexts.len() == 1 {
+            let hal_desc = hal::CommandEncoderDescriptor {
+                label: None,
+                queue: &self.queue,
+            };
+            self.contexts.push(unsafe {
+                ExecutionContext {
+                    encoder: self.device.create_command_encoder(&hal_desc).unwrap(),
+                    fence: self.device.create_fence().unwrap(),
+                    fence_value: 0,
+                    used_views: Vec::new(),
+                    used_cmd_bufs: Vec::new(),
+                    frames_recorded: 0,
+                }
+            });
+        }
+        self.context_index = (self.context_index + 1) % self.contexts.len();
+        let next = &mut self.contexts[self.context_index];
+        unsafe {
+            next.wait_and_clear(&self.device);
         }
+        next.fence_value = old_fence_value + 1;
     }
 
     fn exit(mut self) {
@@ -1004,7 +1005,7 @@ impl Example {
         {
             let ctx = &mut self.contexts[self.context_index];
             self.queue
-                .submit(&[], &[], Some((&mut ctx.fence, ctx.fence_value)))
+                .submit(&[], &[], (&mut ctx.fence, ctx.fence_value))
                 .unwrap();
         }
diff --git a/wgpu-hal/src/dx12/mod.rs b/wgpu-hal/src/dx12/mod.rs
index 99800e87c9..9d5f62f915 100644
--- a/wgpu-hal/src/dx12/mod.rs
+++ b/wgpu-hal/src/dx12/mod.rs
@@ -857,6 +857,7 @@ impl crate::Surface for Surface {
     unsafe fn acquire_texture(
         &self,
         timeout: Option<std::time::Duration>,
+        _fence: &Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<Api>>, crate::SurfaceError> {
         let mut swapchain = self.swap_chain.write();
         let sc = swapchain.as_mut().unwrap();
@@ -895,7 +896,7 @@ impl crate::Queue for Queue {
         &self,
         command_buffers: &[&CommandBuffer],
         _surface_textures: &[&Texture],
-        signal_fence: Option<(&mut Fence, crate::FenceValue)>,
+        (signal_fence, signal_value): (&mut Fence, crate::FenceValue),
     ) -> Result<(), crate::DeviceError> {
         let mut temp_lists = self.temp_lists.lock();
         temp_lists.clear();
@@ -908,11 +909,9 @@ impl crate::Queue for Queue {
             self.raw.execute_command_lists(&temp_lists);
         }
 
-        if let Some((fence, value)) = signal_fence {
-            self.raw
-                .signal(&fence.raw, value)
-                .into_device_result("Signal fence")?;
-        }
+        self.raw
+            .signal(&signal_fence.raw, signal_value)
+            .into_device_result("Signal fence")?;
 
         // Note the lack of synchronization here between the main Direct queue
         // and the dedicated presentation queue. This is automatically handled
diff --git a/wgpu-hal/src/empty.rs b/wgpu-hal/src/empty.rs
index f1986f7705..8cba9d063f 100644
--- a/wgpu-hal/src/empty.rs
+++ b/wgpu-hal/src/empty.rs
@@ -75,6 +75,7 @@ impl crate::Surface for Context {
     unsafe fn acquire_texture(
         &self,
         timeout: Option<std::time::Duration>,
+        fence: &Resource,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<Api>>, crate::SurfaceError> {
         Ok(None)
     }
@@ -114,7 +115,7 @@ impl crate::Queue for Context {
         &self,
         command_buffers: &[&Resource],
         surface_textures: &[&Resource],
-        signal_fence: Option<(&mut Resource, crate::FenceValue)>,
+        signal_fence: (&mut Resource, crate::FenceValue),
     ) -> DeviceResult<()> {
         Ok(())
     }
diff --git a/wgpu-hal/src/gles/egl.rs b/wgpu-hal/src/gles/egl.rs
index 00ef70ba88..da844322a2 100644
--- a/wgpu-hal/src/gles/egl.rs
+++ b/wgpu-hal/src/gles/egl.rs
@@ -1434,6 +1434,7 @@ impl crate::Surface for Surface {
     unsafe fn acquire_texture(
         &self,
         _timeout_ms: Option<std::time::Duration>, //TODO
+        _fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let swapchain = self.swapchain.read();
         let sc = swapchain.as_ref().unwrap();
diff --git a/wgpu-hal/src/gles/queue.rs b/wgpu-hal/src/gles/queue.rs
index f6b55a449a..95eff36d57 100644
--- a/wgpu-hal/src/gles/queue.rs
+++ b/wgpu-hal/src/gles/queue.rs
@@ -1740,7 +1740,7 @@ impl crate::Queue for super::Queue {
         &self,
         command_buffers: &[&super::CommandBuffer],
         _surface_textures: &[&super::Texture],
-        signal_fence: Option<(&mut super::Fence, crate::FenceValue)>,
+        (signal_fence, signal_value): (&mut super::Fence, crate::FenceValue),
     ) -> Result<(), crate::DeviceError> {
         let shared = Arc::clone(&self.shared);
         let gl = &shared.context.lock();
@@ -1774,12 +1774,10 @@ impl crate::Queue for super::Queue {
             }
         }
 
-        if let Some((fence, value)) = signal_fence {
-            fence.maintain(gl);
-            let sync = unsafe { gl.fence_sync(glow::SYNC_GPU_COMMANDS_COMPLETE, 0) }
-                .map_err(|_| crate::DeviceError::OutOfMemory)?;
-            fence.pending.push((value, sync));
-        }
+        signal_fence.maintain(gl);
+        let sync = unsafe { gl.fence_sync(glow::SYNC_GPU_COMMANDS_COMPLETE, 0) }
+            .map_err(|_| crate::DeviceError::OutOfMemory)?;
+        signal_fence.pending.push((signal_value, sync));
 
         Ok(())
     }
diff --git a/wgpu-hal/src/gles/web.rs b/wgpu-hal/src/gles/web.rs
index ab2ccef8b6..081f7da5d1 100644
--- a/wgpu-hal/src/gles/web.rs
+++ b/wgpu-hal/src/gles/web.rs
@@ -427,6 +427,7 @@ impl crate::Surface for Surface {
     unsafe fn acquire_texture(
         &self,
         _timeout_ms: Option<std::time::Duration>, //TODO
+        _fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let swapchain = self.swapchain.read();
         let sc = swapchain.as_ref().unwrap();
diff --git a/wgpu-hal/src/gles/wgl.rs b/wgpu-hal/src/gles/wgl.rs
index aae70478b4..1111d98f83 100644
--- a/wgpu-hal/src/gles/wgl.rs
+++ b/wgpu-hal/src/gles/wgl.rs
@@ -798,6 +798,7 @@ impl crate::Surface for Surface {
     unsafe fn acquire_texture(
         &self,
         _timeout_ms: Option<std::time::Duration>,
+        _fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let swapchain = self.swapchain.read();
         let sc = swapchain.as_ref().unwrap();
diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs
index 35b9ea0d0a..c6bc4e166c 100644
--- a/wgpu-hal/src/lib.rs
+++ b/wgpu-hal/src/lib.rs
@@ -459,44 +459,101 @@ pub trait Instance: Sized + WasmNotSendSync {
 pub trait Surface: WasmNotSendSync {
     type A: Api;
 
-    /// Configures the surface to use the given device.
+    /// Configure `self` to use `device`.
     ///
     /// # Safety
    ///
-    /// - All gpu work that uses the surface must have been completed.
+    /// - All GPU work using `self` must have been completed.
     /// - All [`AcquiredSurfaceTexture`]s must have been destroyed.
     /// - All [`Api::TextureView`]s derived from the [`AcquiredSurfaceTexture`]s must have been destroyed.
-    /// - All surfaces created using other devices must have been unconfigured before this call.
+    /// - The surface `self` must not currently be configured to use any other [`Device`].
     unsafe fn configure(
         &self,
         device: &<Self::A as Api>::Device,
         config: &SurfaceConfiguration,
     ) -> Result<(), SurfaceError>;
 
-    /// Unconfigures the surface on the given device.
+    /// Unconfigure `self` on `device`.
     ///
     /// # Safety
     ///
-    /// - All gpu work that uses the surface must have been completed.
+    /// - All GPU work using `self` must have been completed.
     /// - All [`AcquiredSurfaceTexture`]s must have been destroyed.
     /// - All [`Api::TextureView`]s derived from the [`AcquiredSurfaceTexture`]s must have been destroyed.
-    /// - The surface must have been configured on the given device.
+    /// - The surface `self` must have been configured on `device`.
     unsafe fn unconfigure(&self, device: &<Self::A as Api>::Device);
 
-    /// Returns the next texture to be presented by the swapchain for drawing
+    /// Return the next texture to be presented by `self`, for the caller to draw on.
     ///
-    /// A `timeout` of `None` means to wait indefinitely, with no timeout.
+    /// On success, return an [`AcquiredSurfaceTexture`] representing the
+    /// texture into which the caller should draw the image to be displayed on
+    /// `self`.
+    ///
+    /// If `timeout` elapses before `self` has a texture ready to be acquired,
+    /// return `Ok(None)`. If `timeout` is `None`, wait indefinitely, with no
+    /// timeout.
+    ///
+    /// # Using an [`AcquiredSurfaceTexture`]
+    ///
+    /// On success, this function returns an [`AcquiredSurfaceTexture`] whose
+    /// [`texture`] field is a [`SurfaceTexture`] from which the caller can
+    /// [`borrow`] a [`Texture`] to draw on. The [`AcquiredSurfaceTexture`] also
+    /// carries some metadata about that [`SurfaceTexture`].
+    ///
+    /// All calls to [`Queue::submit`] that draw on that [`Texture`] must also
+    /// include the [`SurfaceTexture`] in the `surface_textures` argument.
+    ///
+    /// When you are done drawing on the texture, you can display it on `self`
+    /// by passing the [`SurfaceTexture`] and `self` to [`Queue::present`].
+    ///
+    /// If you do not wish to display the texture, you must pass the
+    /// [`SurfaceTexture`] to [`self.discard_texture`], so that it can be reused
+    /// by future acquisitions.
     ///
     /// # Portability
     ///
-    /// Some backends can't support a timeout when acquiring a texture and
-    /// the timeout will be ignored.
+    /// Some backends can't support a timeout when acquiring a texture. On these
+    /// backends, `timeout` is ignored.
     ///
-    /// Returns `None` on timing out.
+    /// # Safety
+    ///
+    /// - The surface `self` must currently be configured on some [`Device`].
+    ///
+    /// - The `fence` argument must be the same [`Fence`] passed to all calls to
+    ///   [`Queue::submit`] that used [`Texture`]s acquired from this surface.
+    ///
+    /// - You may only have one texture acquired from `self` at a time. When
+    ///   `acquire_texture` returns `Ok(Some(ast))`, you must pass the returned
+    ///   [`SurfaceTexture`] `ast.texture` to either [`Queue::present`] or
+    ///   [`Surface::discard_texture`] before calling `acquire_texture` again.
+    ///
+    /// [`texture`]: AcquiredSurfaceTexture::texture
+    /// [`SurfaceTexture`]: Api::SurfaceTexture
+    /// [`borrow`]: std::borrow::Borrow::borrow
+    /// [`Texture`]: Api::Texture
+    /// [`Fence`]: Api::Fence
+    /// [`self.discard_texture`]: Surface::discard_texture
     unsafe fn acquire_texture(
         &self,
         timeout: Option<std::time::Duration>,
+        fence: &<Self::A as Api>::Fence,
     ) -> Result<Option<AcquiredSurfaceTexture<Self::A>>, SurfaceError>;
+
+    /// Relinquish an acquired texture without presenting it.
+    ///
+    /// After this call, the texture underlying [`SurfaceTexture`] may be
+    /// returned by subsequent calls to [`self.acquire_texture`].
+    ///
+    /// # Safety
+    ///
+    /// - The surface `self` must currently be configured on some [`Device`].
+    ///
+    /// - `texture` must be a [`SurfaceTexture`] returned by a call to
+    ///   [`self.acquire_texture`] that has not yet been passed to
+    ///   [`Queue::present`].
+    ///
+    /// [`SurfaceTexture`]: Api::SurfaceTexture
+    /// [`self.acquire_texture`]: Surface::acquire_texture
     unsafe fn discard_texture(&self, texture: <Self::A as Api>::SurfaceTexture);
 }
@@ -762,19 +819,23 @@ pub trait Queue: WasmNotSendSync {
     /// Submit `command_buffers` for execution on GPU.
     ///
-    /// If `signal_fence` is `Some(fence, value)`, update `fence` to `value`
-    /// when the operation is complete. See [`Fence`] for details.
+    /// Update `fence` to `value` when the operation is complete. See
+    /// [`Fence`] for details.
+    ///
+    /// A `wgpu_hal` queue is "single threaded": all command buffers are
+    /// executed in the order they're submitted, with each buffer able to see
+    /// previous buffers' results. Specifically:
     ///
-    /// If two calls to `submit` on a single `Queue` occur in a particular order
-    /// (that is, they happen on the same thread, or on two threads that have
-    /// synchronized to establish an ordering), then the first submission's
-    /// commands all complete execution before any of the second submission's
-    /// commands begin. All results produced by one submission are visible to
-    /// the next.
+    /// - If two calls to `submit` on a single `Queue` occur in a particular
+    ///   order (that is, they happen on the same thread, or on two threads that
+    ///   have synchronized to establish an ordering), then the first
+    ///   submission's commands all complete execution before any of the second
+    ///   submission's commands begin. All results produced by one submission
+    ///   are visible to the next.
     ///
-    /// Within a submission, command buffers execute in the order in which they
-    /// appear in `command_buffers`. All results produced by one buffer are
-    /// visible to the next.
+    /// - Within a submission, command buffers execute in the order in which they
+    ///   appear in `command_buffers`. All results produced by one buffer are
+    ///   visible to the next.
     ///
     /// If two calls to `submit` on a single `Queue` from different threads are
     /// not synchronized to occur in a particular order, they must pass distinct
@@ -803,10 +864,16 @@ pub trait Queue: WasmNotSendSync {
     /// - Every [`SurfaceTexture`][st] that any command in `command_buffers`
     ///   writes to must appear in the `surface_textures` argument.
     ///
+    /// - No [`SurfaceTexture`][st] may appear in the `surface_textures`
+    ///   argument more than once.
+    ///
     /// - Each [`SurfaceTexture`][st] in `surface_textures` must be configured
     ///   for use with the [`Device`][d] associated with this [`Queue`],
     ///   typically by calling [`Surface::configure`].
     ///
+    /// - All calls to this function that include a given [`SurfaceTexture`][st]
+    ///   in `surface_textures` must use the same [`Fence`].
+    ///
     /// [`Fence`]: Api::Fence
     /// [cb]: Api::CommandBuffer
     /// [ce]: Api::CommandEncoder
@@ -819,7 +886,7 @@ pub trait Queue: WasmNotSendSync {
         &self,
         command_buffers: &[&<Self::A as Api>::CommandBuffer],
         surface_textures: &[&<Self::A as Api>::SurfaceTexture],
-        signal_fence: Option<(&mut <Self::A as Api>::Fence, FenceValue)>,
+        signal_fence: (&mut <Self::A as Api>::Fence, FenceValue),
     ) -> Result<(), DeviceError>;
     unsafe fn present(
         &self,
diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs
index a5ea63b035..b157cdce26 100644
--- a/wgpu-hal/src/metal/mod.rs
+++ b/wgpu-hal/src/metal/mod.rs
@@ -377,38 +377,37 @@ impl crate::Queue for Queue {
         &self,
         command_buffers: &[&CommandBuffer],
         _surface_textures: &[&SurfaceTexture],
-        signal_fence: Option<(&mut Fence, crate::FenceValue)>,
+        (signal_fence, signal_value): (&mut Fence, crate::FenceValue),
     ) -> Result<(), crate::DeviceError> {
         objc::rc::autoreleasepool(|| {
-            let extra_command_buffer = match signal_fence {
-                Some((fence, value)) => {
-                    let completed_value = Arc::clone(&fence.completed_value);
-                    let block = block::ConcreteBlock::new(move |_cmd_buf| {
-                        completed_value.store(value, atomic::Ordering::Release);
-                    })
-                    .copy();
-
-                    let raw = match command_buffers.last() {
-                        Some(&cmd_buf) => cmd_buf.raw.to_owned(),
-                        None => {
-                            let queue = self.raw.lock();
-                            queue
-                                .new_command_buffer_with_unretained_references()
-                                .to_owned()
-                        }
-                    };
-                    raw.set_label("(wgpu internal) Signal");
-                    raw.add_completed_handler(&block);
-
-                    fence.maintain();
-                    fence.pending_command_buffers.push((value, raw.to_owned()));
-                    // only return an extra one if it's extra
-                    match command_buffers.last() {
-                        Some(_) => None,
-                        None => Some(raw),
+            let extra_command_buffer = {
+                let completed_value = Arc::clone(&signal_fence.completed_value);
+                let block = block::ConcreteBlock::new(move |_cmd_buf| {
+                    completed_value.store(signal_value, atomic::Ordering::Release);
+                })
+                .copy();
+
+                let raw = match command_buffers.last() {
+                    Some(&cmd_buf) => cmd_buf.raw.to_owned(),
+                    None => {
+                        let queue = self.raw.lock();
+                        queue
+                            .new_command_buffer_with_unretained_references()
+                            .to_owned()
                     }
+                };
+                raw.set_label("(wgpu internal) Signal");
+                raw.add_completed_handler(&block);
+
+                signal_fence.maintain();
+                signal_fence
+                    .pending_command_buffers
+                    .push((signal_value, raw.to_owned()));
+                // only return an extra one if it's extra
+                match command_buffers.last() {
+                    Some(_) => None,
+                    None => Some(raw),
                 }
-                None => None,
             };
 
             for cmd_buffer in command_buffers {
diff --git a/wgpu-hal/src/metal/surface.rs b/wgpu-hal/src/metal/surface.rs
index 889e319493..890d296614 100644
--- a/wgpu-hal/src/metal/surface.rs
+++ b/wgpu-hal/src/metal/surface.rs
@@ -242,6 +242,7 @@ impl crate::Surface for super::Surface {
     unsafe fn acquire_texture(
         &self,
         _timeout_ms: Option<std::time::Duration>, //TODO
+        _fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let render_layer = self.render_layer.lock();
         let (drawable, texture) = match autoreleasepool(|| {
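The `lib.rs` documentation above pins down the new surface contract: one fence per surface, passed to `acquire_texture` and signaled by every `submit` that draws on the acquired texture, which is then handed to `present` or `discard_texture`. A sketch of a single frame under that contract, with illustrative names and simplified error handling; this is not code from the diff:

```rust
use wgpu_hal as hal;

/// One frame of the acquire/draw/present cycle under the new contract.
/// `fence_value` is the value this frame's submission will signal.
unsafe fn frame<A: hal::Api>(
    surface: &A::Surface,
    queue: &A::Queue,
    cmd_buf: &A::CommandBuffer,
    fence: &mut A::Fence,
    fence_value: hal::FenceValue,
) -> Result<(), hal::SurfaceError> {
    use hal::{Queue as _, Surface as _};
    // The same `fence` must be passed to every `submit` that draws on a
    // texture acquired from this surface; on Vulkan, that is what lets
    // `acquire_texture` throttle the CPU behind the presentation engine.
    let Some(ast) = surface.acquire_texture(None, fence)? else {
        return Ok(()); // timed out; skip this frame
    };
    let surface_tex = ast.texture;
    // Any submission that writes to the texture must also list it in the
    // `surface_textures` argument.
    queue.submit(&[cmd_buf], &[&surface_tex], (fence, fence_value))?;
    queue.present(surface, surface_tex)?;
    Ok(())
}
```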
diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs
index 6641a24a75..d407e39539 100644
--- a/wgpu-hal/src/vulkan/adapter.rs
+++ b/wgpu-hal/src/vulkan/adapter.rs
@@ -3,11 +3,7 @@ use super::conv;
 
 use ash::{amd, ext, khr, vk};
 use parking_lot::Mutex;
-use std::{
-    collections::BTreeMap,
-    ffi::CStr,
-    sync::{atomic::AtomicIsize, Arc},
-};
+use std::{collections::BTreeMap, ffi::CStr, sync::Arc};
 
 fn depth_stencil_required_flags() -> vk::FormatFeatureFlags {
     vk::FormatFeatureFlags::SAMPLED_IMAGE | vk::FormatFeatureFlags::DEPTH_STENCIL_ATTACHMENT
@@ -1783,21 +1779,15 @@ impl super::Adapter {
             render_passes: Mutex::new(Default::default()),
             framebuffers: Mutex::new(Default::default()),
         });
-        let mut relay_semaphores = [vk::Semaphore::null(); 2];
-        for sem in relay_semaphores.iter_mut() {
-            unsafe {
-                *sem = shared
-                    .raw
-                    .create_semaphore(&vk::SemaphoreCreateInfo::default(), None)?
-            };
-        }
+
+        let relay_semaphores = super::RelaySemaphores::new(&shared)?;
+
         let queue = super::Queue {
             raw: raw_queue,
             swapchain_fn,
             device: Arc::clone(&shared),
             family_index,
-            relay_semaphores,
-            relay_index: AtomicIsize::new(-1),
+            relay_semaphores: Mutex::new(relay_semaphores),
         };
 
         let mem_allocator = {
diff --git a/wgpu-hal/src/vulkan/device.rs b/wgpu-hal/src/vulkan/device.rs
index 1ea627897f..867b7efb23 100644
--- a/wgpu-hal/src/vulkan/device.rs
+++ b/wgpu-hal/src/vulkan/device.rs
@@ -612,17 +612,16 @@ impl super::Device {
         let images =
             unsafe { functor.get_swapchain_images(raw) }.map_err(crate::DeviceError::from)?;
 
-        // NOTE: It's important that we define at least images.len() + 1 wait
+        // NOTE: It's important that we define at least images.len() wait
         // semaphores, since we prospectively need to provide the call to
         // acquire the next image with an unsignaled semaphore.
-        let surface_semaphores = (0..images.len() + 1)
-            .map(|_| unsafe {
-                self.shared
-                    .raw
-                    .create_semaphore(&vk::SemaphoreCreateInfo::default(), None)
+        let surface_semaphores = (0..=images.len())
+            .map(|_| {
+                super::SwapchainImageSemaphores::new(&self.shared)
+                    .map(Mutex::new)
+                    .map(Arc::new)
             })
-            .collect::<Result<Vec<_>, _>>()
-            .map_err(crate::DeviceError::from)?;
+            .collect::<Result<Vec<_>, _>>()?;
 
         Ok(super::Swapchain {
             raw,
@@ -633,7 +632,7 @@ impl super::Device {
             config: config.clone(),
             view_formats: wgt_view_formats,
             surface_semaphores,
-            next_surface_index: 0,
+            next_semaphore_index: 0,
         })
     }
 
@@ -836,9 +835,12 @@ impl crate::Device for super::Device {
     unsafe fn exit(self, queue: super::Queue) {
         unsafe { self.mem_allocator.into_inner().cleanup(&*self.shared) };
         unsafe { self.desc_allocator.into_inner().cleanup(&*self.shared) };
-        for &sem in queue.relay_semaphores.iter() {
-            unsafe { self.shared.raw.destroy_semaphore(sem, None) };
-        }
+        unsafe {
+            queue
+                .relay_semaphores
+                .into_inner()
+                .destroy(&self.shared.raw)
+        };
         unsafe { self.shared.free_resources() };
     }
 
@@ -2055,54 +2057,7 @@ impl crate::Device for super::Device {
         timeout_ms: u32,
     ) -> Result<bool, crate::DeviceError> {
         let timeout_ns = timeout_ms as u64 * super::MILLIS_TO_NANOS;
-        match *fence {
-            super::Fence::TimelineSemaphore(raw) => {
-                let semaphores = [raw];
-                let values = [wait_value];
-                let vk_info = vk::SemaphoreWaitInfo::default()
-                    .semaphores(&semaphores)
-                    .values(&values);
-                let result = match self.shared.extension_fns.timeline_semaphore {
-                    Some(super::ExtensionFn::Extension(ref ext)) => unsafe {
-                        ext.wait_semaphores(&vk_info, timeout_ns)
-                    },
-                    Some(super::ExtensionFn::Promoted) => unsafe {
-                        self.shared.raw.wait_semaphores(&vk_info, timeout_ns)
-                    },
-                    None => unreachable!(),
-                };
-                match result {
-                    Ok(()) => Ok(true),
-                    Err(vk::Result::TIMEOUT) => Ok(false),
-                    Err(other) => Err(other.into()),
-                }
-            }
-            super::Fence::FencePool {
-                last_completed,
-                ref active,
-                free: _,
-            } => {
-                if wait_value <= last_completed {
-                    Ok(true)
-                } else {
-                    match active.iter().find(|&&(value, _)| value >= wait_value) {
-                        Some(&(_, raw)) => {
-                            match unsafe {
-                                self.shared.raw.wait_for_fences(&[raw], true, timeout_ns)
-                            } {
-                                Ok(()) => Ok(true),
-                                Err(vk::Result::TIMEOUT) => Ok(false),
-                                Err(other) => Err(other.into()),
-                            }
-                        }
-                        None => {
-                            log::error!("No signals reached value {}", wait_value);
-                            Err(crate::DeviceError::Lost)
-                        }
-                    }
-                }
-            }
-        }
+        self.shared.wait_for_fence(fence, wait_value, timeout_ns)
     }
 
     unsafe fn start_capture(&self) -> bool {
@@ -2364,6 +2319,71 @@ impl crate::Device for super::Device {
     }
 }
 
+impl super::DeviceShared {
+    pub(super) fn new_binary_semaphore(&self) -> Result<vk::Semaphore, crate::DeviceError> {
+        unsafe {
+            self.raw
+                .create_semaphore(&vk::SemaphoreCreateInfo::default(), None)
+                .map_err(crate::DeviceError::from)
+        }
+    }
+
+    pub(super) fn wait_for_fence(
+        &self,
+        fence: &super::Fence,
+        wait_value: crate::FenceValue,
+        timeout_ns: u64,
+    ) -> Result<bool, crate::DeviceError> {
+        profiling::scope!("Device::wait");
+        match *fence {
+            super::Fence::TimelineSemaphore(raw) => {
+                let semaphores = [raw];
+                let values = [wait_value];
+                let vk_info = vk::SemaphoreWaitInfo::default()
+                    .semaphores(&semaphores)
+                    .values(&values);
+                let result = match self.extension_fns.timeline_semaphore {
+                    Some(super::ExtensionFn::Extension(ref ext)) => unsafe {
+                        ext.wait_semaphores(&vk_info, timeout_ns)
+                    },
+                    Some(super::ExtensionFn::Promoted) => unsafe {
+                        self.raw.wait_semaphores(&vk_info, timeout_ns)
+                    },
+                    None => unreachable!(),
+                };
+                match result {
+                    Ok(()) => Ok(true),
+                    Err(vk::Result::TIMEOUT) => Ok(false),
+                    Err(other) => Err(other.into()),
+                }
+            }
+            super::Fence::FencePool {
+                last_completed,
+                ref active,
+                free: _,
+            } => {
+                if wait_value <= last_completed {
+                    Ok(true)
+                } else {
+                    match active.iter().find(|&&(value, _)| value >= wait_value) {
+                        Some(&(_, raw)) => {
+                            match unsafe { self.raw.wait_for_fences(&[raw], true, timeout_ns) } {
+                                Ok(()) => Ok(true),
+                                Err(vk::Result::TIMEOUT) => Ok(false),
+                                Err(other) => Err(other.into()),
+                            }
+                        }
+                        None => {
+                            log::error!("No signals reached value {}", wait_value);
+                            Err(crate::DeviceError::Lost)
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
 impl From<gpu_alloc::AllocationError> for crate::DeviceError {
     fn from(error: gpu_alloc::AllocationError) -> Self {
         use gpu_alloc::AllocationError as Ae;
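`wait_for_fence` moves onto `DeviceShared` above so that, in the next file, `Surface::acquire_texture` can call it too. A hypothetical helper, internal to the Vulkan backend and not part of this diff, spelling out what the two `Fence` representations track:

```rust
// Hypothetical illustration of the two Vulkan fence representations that
// `DeviceShared::wait_for_fence` handles.
fn is_trivially_signaled(fence: &super::Fence, value: crate::FenceValue) -> bool {
    match *fence {
        // A timeline semaphore keeps the counter on the GPU object itself, so
        // answering requires a driver call (a semaphore query or wait).
        super::Fence::TimelineSemaphore(_) => false,
        // The fence pool caches the highest value known to have completed;
        // anything at or below it needs no driver call at all.
        super::Fence::FencePool { last_completed, .. } => value <= last_completed,
    }
}
```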
diff --git a/wgpu-hal/src/vulkan/instance.rs b/wgpu-hal/src/vulkan/instance.rs
index 6f471f8905..18acaeabb9 100644
--- a/wgpu-hal/src/vulkan/instance.rs
+++ b/wgpu-hal/src/vulkan/instance.rs
@@ -164,10 +164,14 @@ impl super::Swapchain {
             let _ = unsafe { device.device_wait_idle() };
         };
 
+        // We cannot take this by value, as the function returns `self`.
         for semaphore in self.surface_semaphores.drain(..) {
-            unsafe {
-                device.destroy_semaphore(semaphore, None);
-            }
+            let arc_removed = Arc::into_inner(semaphore).expect(
+                "Trying to destroy a SurfaceSemaphores that is still in use by a SurfaceTexture",
+            );
+            let mutex_removed = arc_removed.into_inner();
+
+            unsafe { mutex_removed.destroy(device) };
         }
 
         self
@@ -966,9 +970,10 @@ impl crate::Surface for super::Surface {
     unsafe fn acquire_texture(
         &self,
         timeout: Option<std::time::Duration>,
+        fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let mut swapchain = self.swapchain.write();
-        let sc = swapchain.as_mut().unwrap();
+        let swapchain = swapchain.as_mut().unwrap();
 
         let mut timeout_ns = match timeout {
             Some(duration) => duration.as_nanos() as u64,
@@ -988,12 +993,40 @@ impl crate::Surface for super::Surface {
             timeout_ns = u64::MAX;
         }
 
-        let wait_semaphore = sc.surface_semaphores[sc.next_surface_index];
+        let swapchain_semaphores_arc = swapchain.get_surface_semaphores();
+        // Nothing should be using this, so we don't block, but panic if we fail to lock.
+        let locked_swapchain_semaphores = swapchain_semaphores_arc
+            .try_lock()
+            .expect("Failed to lock a SwapchainSemaphores.");
+
+        // Wait for all commands writing to the previously acquired image to
+        // complete.
+        //
+        // Almost all the steps in the usual acquire-draw-present flow are
+        // asynchronous: they get something started on the presentation engine
+        // or the GPU, but on the CPU, control returns immediately. Without some
+        // sort of intervention, the CPU could crank out frames much faster than
+        // the presentation engine can display them.
+        //
+        // This is the intervention: if any submissions drew on this image, and
+        // thus waited for `locked_swapchain_semaphores.acquire`, wait for all
+        // of them to finish, thus ensuring that it's okay to pass `acquire` to
+        // `vkAcquireNextImageKHR` again.
+        swapchain.device.wait_for_fence(
+            fence,
+            locked_swapchain_semaphores.previously_used_submission_index,
+            timeout_ns,
+        )?;
 
         // will block if no image is available
         let (index, suboptimal) = match unsafe {
-            sc.functor
-                .acquire_next_image(sc.raw, timeout_ns, wait_semaphore, vk::Fence::null())
+            profiling::scope!("vkAcquireNextImageKHR");
+            swapchain.functor.acquire_next_image(
+                swapchain.raw,
+                timeout_ns,
+                locked_swapchain_semaphores.acquire,
+                vk::Fence::null(),
+            )
         } {
             // We treat `VK_SUBOPTIMAL_KHR` as `VK_SUCCESS` on Android.
             // See the comment in `Queue::present`.
@@ -1013,16 +1046,18 @@ impl crate::Surface for super::Surface {
             }
         };
 
-        sc.next_surface_index += 1;
-        sc.next_surface_index %= sc.surface_semaphores.len();
+        drop(locked_swapchain_semaphores);
+        // We only advance the surface semaphores if we successfully acquired an image, otherwise
+        // we should try to re-acquire using the same semaphores.
+        swapchain.advance_surface_semaphores();
 
         // special case for Intel Vulkan returning bizarre values (ugh)
-        if sc.device.vendor_id == crate::auxil::db::intel::VENDOR && index > 0x100 {
+        if swapchain.device.vendor_id == crate::auxil::db::intel::VENDOR && index > 0x100 {
             return Err(crate::SurfaceError::Outdated);
         }
 
         // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-framebuffer-03209
-        let raw_flags = if sc
+        let raw_flags = if swapchain
            .raw_flags
            .contains(vk::SwapchainCreateFlagsKHR::MUTABLE_FORMAT)
         {
@@ -1034,20 +1069,20 @@ impl crate::Surface for super::Surface {
         let texture = super::SurfaceTexture {
             index,
             texture: super::Texture {
-                raw: sc.images[index as usize],
+                raw: swapchain.images[index as usize],
                 drop_guard: None,
                 block: None,
-                usage: sc.config.usage,
-                format: sc.config.format,
+                usage: swapchain.config.usage,
+                format: swapchain.config.format,
                 raw_flags,
                 copy_size: crate::CopyExtent {
-                    width: sc.config.extent.width,
-                    height: sc.config.extent.height,
+                    width: swapchain.config.extent.width,
+                    height: swapchain.config.extent.height,
                     depth: 1,
                 },
-                view_formats: sc.view_formats.clone(),
+                view_formats: swapchain.view_formats.clone(),
             },
-            wait_semaphore,
+            surface_semaphores: swapchain_semaphores_arc,
         };
         Ok(Some(crate::AcquiredSurfaceTexture {
             texture,
diff --git a/wgpu-hal/src/vulkan/mod.rs b/wgpu-hal/src/vulkan/mod.rs
index 1716ee9206..40e7a2cb42 100644
--- a/wgpu-hal/src/vulkan/mod.rs
+++ b/wgpu-hal/src/vulkan/mod.rs
@@ -33,13 +33,11 @@ mod instance;
 
 use std::{
     borrow::Borrow,
+    collections::HashSet,
     ffi::{CStr, CString},
-    fmt,
+    fmt, mem,
     num::NonZeroU32,
-    sync::{
-        atomic::{AtomicIsize, Ordering},
-        Arc,
-    },
+    sync::Arc,
 };
 
 use arrayvec::ArrayVec;
@@ -147,6 +145,173 @@ pub struct Instance {
     shared: Arc<InstanceShared>,
 }
 
+/// The semaphores needed to use one image in a swapchain.
+#[derive(Debug)]
+struct SwapchainImageSemaphores {
+    /// A semaphore that is signaled when this image is safe for us to modify.
+    ///
+    /// When [`vkAcquireNextImageKHR`] returns the index of the next swapchain
+    /// image that we should use, that image may actually still be in use by the
+    /// presentation engine, and is not yet safe to modify. However, that
+    /// function does accept a semaphore that it will signal when the image is
+    /// indeed safe to begin messing with.
+    ///
+    /// This semaphore is:
+    ///
+    /// - waited for by the first queue submission to operate on this image
+    ///   since it was acquired, and
+    ///
+    /// - signaled by [`vkAcquireNextImageKHR`] when the acquired image is ready
+    ///   for us to use.
+    ///
+    /// [`vkAcquireNextImageKHR`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkAcquireNextImageKHR
+    acquire: vk::Semaphore,
+
+    /// True if the next command submission operating on this image should wait
+    /// for [`acquire`].
+    ///
+    /// We must wait for `acquire` before drawing to this swapchain image, but
+    /// because `wgpu-hal` queue submissions are always strongly ordered, only
+    /// the first submission that works with a swapchain image actually needs to
+    /// wait. We set this flag when this image is acquired, and clear it the
+    /// first time it's passed to [`Queue::submit`] as a surface texture.
+    ///
+    /// [`acquire`]: SwapchainImageSemaphores::acquire
+    /// [`Queue::submit`]: crate::Queue::submit
+    should_wait_for_acquire: bool,
+
+    /// A pool of semaphores for ordering presentation after drawing.
+    ///
+    /// The first [`present_index`] semaphores in this vector are:
+    ///
+    /// - all waited on by the call to [`vkQueuePresentKHR`] that presents this
+    ///   image, and
+    ///
+    /// - each signaled by some [`vkQueueSubmit`] queue submission that draws to
+    ///   this image, when the submission finishes execution.
+    ///
+    /// This vector accumulates one semaphore per submission that writes to this
+    /// image. This is awkward, but hard to avoid: [`vkQueuePresentKHR`]
+    /// requires a semaphore to order it with respect to drawing commands, and
+    /// we can't attach new completion semaphores to a command submission after
+    /// it's been submitted. This means that, at submission time, we must create
+    /// the semaphore we might need if the caller's next action is to enqueue a
+    /// presentation of this image.
+    ///
+    /// An alternative strategy would be for presentation to enqueue an empty
+    /// submit, ordered relative to other submits in the usual way, and
+    /// signaling a single presentation semaphore. But we suspect that submits
+    /// are usually expensive enough, and semaphores usually cheap enough, that
+    /// performance-sensitive users will avoid making many submits, so that the
+    /// cost of accumulated semaphores will usually be less than the cost of an
+    /// additional submit.
+    ///
+    /// Only the first [`present_index`] semaphores in the vector are actually
+    /// going to be signalled by submitted commands, and need to be waited for
+    /// by the next present call. Any semaphores beyond that index were created
+    /// for prior presents and are simply being retained for recycling.
+    ///
+    /// [`present_index`]: SwapchainImageSemaphores::present_index
+    /// [`vkQueuePresentKHR`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueuePresentKHR
+    /// [`vkQueueSubmit`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueueSubmit
+    present: Vec<vk::Semaphore>,
+
+    /// The number of semaphores in [`present`] to be signalled for this submission.
+    ///
+    /// [`present`]: SwapchainImageSemaphores::present
+    present_index: usize,
+
+    /// The fence value of the last command submission that wrote to this image.
+    ///
+    /// The next time we try to acquire this image, we'll block until
+    /// this submission finishes, proving that [`acquire`] is ready to
+    /// pass to `vkAcquireNextImageKHR` again.
+    ///
+    /// [`acquire`]: SwapchainImageSemaphores::acquire
+    previously_used_submission_index: crate::FenceValue,
+}
+
+impl SwapchainImageSemaphores {
+    fn new(device: &DeviceShared) -> Result<Self, crate::DeviceError> {
+        Ok(Self {
+            acquire: device.new_binary_semaphore()?,
+            should_wait_for_acquire: true,
+            present: Vec::new(),
+            present_index: 0,
+            previously_used_submission_index: 0,
+        })
+    }
+
+    fn set_used_fence_value(&mut self, value: crate::FenceValue) {
+        self.previously_used_submission_index = value;
+    }
+
+    /// Return the semaphore that commands drawing to this image should wait for, if any.
+    ///
+    /// This only returns `Some` once per acquisition; see
+    /// [`SwapchainImageSemaphores::should_wait_for_acquire`] for details.
+    fn get_acquire_wait_semaphore(&mut self) -> Option<vk::Semaphore> {
+        if self.should_wait_for_acquire {
+            self.should_wait_for_acquire = false;
+            Some(self.acquire)
+        } else {
+            None
+        }
+    }
+
+    /// Return a semaphore that a submission that writes to this image should
+    /// signal when it's done.
+    ///
+    /// See [`SwapchainImageSemaphores::present`] for details.
+    fn get_submit_signal_semaphore(
+        &mut self,
+        device: &DeviceShared,
+    ) -> Result<vk::Semaphore, crate::DeviceError> {
+        // Try to recycle a semaphore we created for a previous presentation.
+        let sem = match self.present.get(self.present_index) {
+            Some(sem) => *sem,
+            None => {
+                let sem = device.new_binary_semaphore()?;
+                self.present.push(sem);
+                sem
+            }
+        };
+
+        self.present_index += 1;
+
+        Ok(sem)
+    }
+
+    /// Return the semaphores that a presentation of this image should wait on.
+    ///
+    /// Return a slice of semaphores that the call to [`vkQueuePresentKHR`] that
+    /// ends this image's acquisition should wait for. See
+    /// [`SwapchainImageSemaphores::present`] for details.
+    ///
+    /// Reset `self` to be ready for the next acquisition cycle.
+    ///
+    /// [`vkQueuePresentKHR`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueuePresentKHR
+    fn get_present_wait_semaphores(&mut self) -> &[vk::Semaphore] {
+        let old_index = self.present_index;
+
+        // Since this marks the end of this acquire/draw/present cycle, take the
+        // opportunity to reset `self` in preparation for the next acquisition.
+        self.present_index = 0;
+        self.should_wait_for_acquire = true;
+
+        &self.present[0..old_index]
+    }
+
+    unsafe fn destroy(&self, device: &ash::Device) {
+        unsafe {
+            device.destroy_semaphore(self.acquire, None);
+            for sem in &self.present {
+                device.destroy_semaphore(*sem, None);
+            }
+        }
+    }
+}
+
 struct Swapchain {
     raw: vk::SwapchainKHR,
     raw_flags: vk::SwapchainCreateFlagsKHR,
@@ -157,9 +322,25 @@ struct Swapchain {
     view_formats: Vec<wgt::TextureFormat>,
     /// One wait semaphore per swapchain image. This will be associated with the
     /// surface texture, and later collected during submission.
-    surface_semaphores: Vec<vk::Semaphore>,
-    /// Current semaphore index to use when acquiring a surface.
-    next_surface_index: usize,
+    ///
+    /// We need this to be `Arc<Mutex<>>` because we need to be able to pass this
+    /// data into the surface texture, so submit/present can use it.
+    surface_semaphores: Vec<Arc<Mutex<SwapchainImageSemaphores>>>,
+    /// The index of the next semaphore to use.
+    /// Ideally we would use the same index as the image index, but we must
+    /// pass the semaphore into `acquire_next_image`, and it is that call that
+    /// tells us which image to use.
+    next_semaphore_index: usize,
+}
+
+impl Swapchain {
+    fn advance_surface_semaphores(&mut self) {
+        let semaphore_count = self.surface_semaphores.len();
+        self.next_semaphore_index = (self.next_semaphore_index + 1) % semaphore_count;
+    }
+
+    fn get_surface_semaphores(&self) -> Arc<Mutex<SwapchainImageSemaphores>> {
+        self.surface_semaphores[self.next_semaphore_index].clone()
+    }
 }
 
 pub struct Surface {
     raw: vk::SurfaceKHR,
@@ -173,7 +354,7 @@ pub struct Surface {
 pub struct SurfaceTexture {
     index: u32,
     texture: Texture,
-    wait_semaphore: vk::Semaphore,
+    surface_semaphores: Arc<Mutex<SwapchainImageSemaphores>>,
 }
 
 impl Borrow<Texture> for SurfaceTexture {
@@ -359,18 +540,87 @@ pub struct Device {
     render_doc: crate::auxil::renderdoc::RenderDoc,
 }
 
+/// Semaphores for forcing queue submissions to run in order.
+///
+/// The [`wgpu_hal::Queue`] trait promises that if two calls to [`submit`] are
+/// ordered, then the first submission will finish on the GPU before the second
+/// submission begins. To get this behavior on Vulkan we need to pass semaphores
+/// to [`vkQueueSubmit`] for the commands to wait on before beginning execution,
+/// and to signal when their execution is done.
+///
+/// Normally this can be done with a single semaphore, waited on and then
+/// signalled for each submission. At any given time there's exactly one
+/// submission that would signal the semaphore, and exactly one waiting on it,
+/// as Vulkan requires.
+///
+/// However, as of Oct 2021, bug [#5508] in the Mesa ANV drivers caused them to
+/// hang if we use a single semaphore. The workaround is to alternate between
+/// two semaphores. The bug has been fixed in Mesa, but we should probably keep
+/// the workaround until, say, Oct 2026.
+///
+/// [`wgpu_hal::Queue`]: crate::Queue
+/// [`submit`]: crate::Queue::submit
+/// [`vkQueueSubmit`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueueSubmit
+/// [#5508]: https://gitlab.freedesktop.org/mesa/mesa/-/issues/5508
+#[derive(Clone)]
+struct RelaySemaphores {
+    /// The semaphore the next submission should wait on before beginning
+    /// execution on the GPU. This is `None` for the first submission, which
+    /// should not wait on anything at all.
+    wait: Option<vk::Semaphore>,
+
+    /// The semaphore the next submission should signal when it has finished
+    /// execution on the GPU.
+    signal: vk::Semaphore,
+}
+
+impl RelaySemaphores {
+    fn new(device: &DeviceShared) -> Result<Self, crate::DeviceError> {
+        Ok(Self {
+            wait: None,
+            signal: device.new_binary_semaphore()?,
+        })
+    }
+
+    /// Advances the semaphores, returning the semaphores that should be used for a submission.
+    fn advance(&mut self, device: &DeviceShared) -> Result<Self, crate::DeviceError> {
+        let old = self.clone();
+
+        // Build the state for the next submission.
+        match self.wait {
+            None => {
+                // The `old` values describe the first submission to this queue.
+                // The second submission should wait on `old.signal`, and then
+                // signal a new semaphore which we'll create now.
+                self.wait = Some(old.signal);
+                self.signal = device.new_binary_semaphore()?;
+            }
+            Some(ref mut wait) => {
+                // What this submission signals, the next should wait.
+                mem::swap(wait, &mut self.signal);
+            }
+        };
+
+        Ok(old)
+    }
+
+    /// Destroys the semaphores.
+    unsafe fn destroy(&self, device: &ash::Device) {
+        unsafe {
+            if let Some(wait) = self.wait {
+                device.destroy_semaphore(wait, None);
+            }
+            device.destroy_semaphore(self.signal, None);
+        }
+    }
+}
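Unrolling a few calls to `advance` makes the relay chain visible. A hypothetical module-internal check, not part of this diff; semaphore cleanup is omitted for brevity:

```rust
// Each call to advance() returns the state for one submission, whose
// `signal` becomes the next submission's `wait`.
fn relay_chain_unrolled(device: &DeviceShared) -> Result<(), crate::DeviceError> {
    let mut relay = RelaySemaphores::new(device)?;

    let first = relay.advance(device)?; // state for submission #1
    assert!(first.wait.is_none()); // the first submission waits on nothing

    let second = relay.advance(device)?; // state for submission #2
    assert_eq!(second.wait, Some(first.signal)); // ordered after #1

    let third = relay.advance(device)?; // state for submission #3
    assert_eq!(third.wait, Some(second.signal)); // ordered after #2

    Ok(())
}
```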
 
 pub struct Queue {
     raw: vk::Queue,
     swapchain_fn: khr::swapchain::Device,
     device: Arc<DeviceShared>,
     family_index: u32,
-    /// We use a redundant chain of semaphores to pass on the signal
-    /// from submissions to the last present, since it's required by the
-    /// specification.
-    /// It would be correct to use a single semaphore there, but
-    /// [Intel hangs in `anv_queue_finish`](https://gitlab.freedesktop.org/mesa/mesa/-/issues/5508).
-    relay_semaphores: [vk::Semaphore; 2],
-    relay_index: AtomicIsize,
+    relay_semaphores: Mutex<RelaySemaphores>,
 }
 
 #[derive(Debug)]
@@ -702,58 +952,89 @@ impl crate::Queue for Queue {
         &self,
         command_buffers: &[&CommandBuffer],
         surface_textures: &[&SurfaceTexture],
-        signal_fence: Option<(&mut Fence, crate::FenceValue)>,
+        (signal_fence, signal_value): (&mut Fence, crate::FenceValue),
     ) -> Result<(), crate::DeviceError> {
         let mut fence_raw = vk::Fence::null();
 
         let mut wait_stage_masks = Vec::new();
         let mut wait_semaphores = Vec::new();
-        let mut signal_semaphores = ArrayVec::<_, 2>::new();
-        let mut signal_values = ArrayVec::<_, 2>::new();
+        let mut signal_semaphores = Vec::new();
+        let mut signal_values = Vec::new();
+
+        // Double check that the same swapchain image isn't being given to us multiple times,
+        // as that will deadlock when we try to lock them all.
+        debug_assert!(
+            {
+                let mut check = HashSet::with_capacity(surface_textures.len());
+                // We compare the Arcs by pointer, as Eq isn't well defined for SurfaceSemaphores.
+                for st in surface_textures {
+                    check.insert(Arc::as_ptr(&st.surface_semaphores));
+                }
+                check.len() == surface_textures.len()
+            },
+            "More than one surface texture is being used from the same swapchain. This will cause a deadlock in release."
+        );
 
-        for &surface_texture in surface_textures {
-            wait_stage_masks.push(vk::PipelineStageFlags::TOP_OF_PIPE);
-            wait_semaphores.push(surface_texture.wait_semaphore);
+        let locked_swapchain_semaphores = surface_textures
+            .iter()
+            .map(|st| {
+                st.surface_semaphores
+                    .try_lock()
+                    .expect("Failed to lock surface semaphore.")
+            })
+            .collect::<Vec<_>>();
+
+        for mut swapchain_semaphore in locked_swapchain_semaphores {
+            swapchain_semaphore.set_used_fence_value(signal_value);
+
+            // If we're the first submission to operate on this image, wait on
+            // its acquire semaphore, to make sure the presentation engine is
+            // done with it.
+            if let Some(sem) = swapchain_semaphore.get_acquire_wait_semaphore() {
+                wait_stage_masks.push(vk::PipelineStageFlags::TOP_OF_PIPE);
+                wait_semaphores.push(sem);
+            }
+
+            // Get a semaphore to signal when we're done writing to this surface
+            // image. Presentation of this image will wait for this.
+            let signal_semaphore = swapchain_semaphore.get_submit_signal_semaphore(&self.device)?;
+            signal_semaphores.push(signal_semaphore);
+            signal_values.push(!0);
         }
 
-        let old_index = self.relay_index.load(Ordering::Relaxed);
+        // In order for submissions to be strictly ordered, we encode a dependency between
+        // consecutive submissions using a pair of semaphores. This adds a wait if one is
+        // needed, and signals the next semaphore.
+        let semaphore_state = self.relay_semaphores.lock().advance(&self.device)?;
 
-        let sem_index = if old_index >= 0 {
+        if let Some(sem) = semaphore_state.wait {
             wait_stage_masks.push(vk::PipelineStageFlags::TOP_OF_PIPE);
-            wait_semaphores.push(self.relay_semaphores[old_index as usize]);
-            (old_index as usize + 1) % self.relay_semaphores.len()
-        } else {
-            0
-        };
-
-        signal_semaphores.push(self.relay_semaphores[sem_index]);
+            wait_semaphores.push(sem);
+        }
 
-        self.relay_index
-            .store(sem_index as isize, Ordering::Relaxed);
+        signal_semaphores.push(semaphore_state.signal);
+        signal_values.push(!0);
 
-        if let Some((fence, value)) = signal_fence {
-            fence.maintain(&self.device.raw)?;
-            match *fence {
-                Fence::TimelineSemaphore(raw) => {
-                    signal_semaphores.push(raw);
-                    signal_values.push(!0);
-                    signal_values.push(value);
-                }
-                Fence::FencePool {
-                    ref mut active,
-                    ref mut free,
-                    ..
-                } => {
-                    fence_raw = match free.pop() {
-                        Some(raw) => raw,
-                        None => unsafe {
-                            self.device
-                                .raw
-                                .create_fence(&vk::FenceCreateInfo::default(), None)?
-                        },
-                    };
-                    active.push((value, fence_raw));
-                }
+        // We always need to signal the wgpu fence; this adds it to the signal list.
+        signal_fence.maintain(&self.device.raw)?;
+        match *signal_fence {
+            Fence::TimelineSemaphore(raw) => {
+                signal_semaphores.push(raw);
+                signal_values.push(signal_value);
+            }
+            Fence::FencePool {
+                ref mut active,
+                ref mut free,
+                ..
+            } => {
+                fence_raw = match free.pop() {
+                    Some(raw) => raw,
+                    None => unsafe {
+                        self.device
+                            .raw
+                            .create_fence(&vk::FenceCreateInfo::default(), None)?
+                    },
+                };
+                active.push((signal_value, fence_raw));
             }
         }
@@ -771,7 +1052,7 @@ impl crate::Queue for Queue {
 
         let mut vk_timeline_info;
 
-        if !signal_values.is_empty() {
+        if self.device.private_caps.timeline_semaphores {
             vk_timeline_info =
                 vk::TimelineSemaphoreSubmitInfo::default().signal_semaphore_values(&signal_values);
             vk_info = vk_info.push_next(&mut vk_timeline_info);
@@ -793,19 +1074,14 @@ impl crate::Queue for Queue {
     ) -> Result<(), crate::SurfaceError> {
         let mut swapchain = surface.swapchain.write();
         let ssc = swapchain.as_mut().unwrap();
+        let mut swapchain_semaphores = texture.surface_semaphores.lock();
 
         let swapchains = [ssc.raw];
         let image_indices = [texture.index];
-        let mut vk_info = vk::PresentInfoKHR::default()
+        let vk_info = vk::PresentInfoKHR::default()
             .swapchains(&swapchains)
-            .image_indices(&image_indices);
-
-        let old_index = self.relay_index.swap(-1, Ordering::Relaxed);
-        if old_index >= 0 {
-            vk_info = vk_info.wait_semaphores(
-                &self.relay_semaphores[old_index as usize..old_index as usize + 1],
-            );
-        }
+            .image_indices(&image_indices)
+            .wait_semaphores(swapchain_semaphores.get_present_wait_semaphores());
 
         let suboptimal = {
             profiling::scope!("vkQueuePresentKHR");