From 729b15ed67c56b7f829dd8fde323c0ac0a01a630 Mon Sep 17 00:00:00 2001
From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com>
Date: Wed, 21 Feb 2024 10:24:33 +0000
Subject: [PATCH 1/7] Initialise shaders in parallel

---
 crates/tests/src/lib.rs        |   1 +
 examples/headless/src/main.rs  |   1 +
 examples/with_bevy/src/main.rs |   1 +
 examples/with_winit/src/lib.rs |   1 +
 src/lib.rs                     |  12 +++
 src/wgpu_engine.rs             | 153 +++++++++++++++++++++++++++------
 6 files changed, 145 insertions(+), 24 deletions(-)
diff --git a/crates/tests/src/lib.rs b/crates/tests/src/lib.rs
index faf7b5149..740330afd 100644
--- a/crates/tests/src/lib.rs
+++ b/crates/tests/src/lib.rs
@@ -62,6 +62,7 @@ pub async fn render(scene: Scene, params: &TestParams) -> Result<Image> {
         RendererOptions {
             surface_format: None,
             use_cpu: params.use_cpu,
+            initialise_in_parallel: false,
             antialiasing_support: vello::AaSupport::area_only(),
         },
     )
diff --git a/examples/headless/src/main.rs b/examples/headless/src/main.rs
index 65587c4df..d8dfa6a1a 100644
--- a/examples/headless/src/main.rs
+++ b/examples/headless/src/main.rs
@@ -90,6 +90,7 @@ async fn render(mut scenes: SceneSet, index: usize, args: &Args) -> Result<()> {
         RendererOptions {
             surface_format: None,
             use_cpu: args.use_cpu,
+            initialise_in_parallel: false,
             antialiasing_support: vello::AaSupport::area_only(),
         },
     )
diff --git a/examples/with_bevy/src/main.rs b/examples/with_bevy/src/main.rs
index b4760e9dd..4dfec082a 100644
--- a/examples/with_bevy/src/main.rs
+++ b/examples/with_bevy/src/main.rs
@@ -29,6 +29,7 @@ impl FromWorld for VelloRenderer {
                 device.wgpu_device(),
                 RendererOptions {
                     surface_format: None,
+                    initialise_in_parallel: false,
                     antialiasing_support: vello::AaSupport::area_only(),
                     use_cpu: false,
                 },
diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs
index 00868de7e..e23ec3122 100644
--- a/examples/with_winit/src/lib.rs
+++ b/examples/with_winit/src/lib.rs
@@ -538,6 +538,7 @@ fn run(
                                     surface_format: Some(render_state.surface.format),
                                     use_cpu,
                                     antialiasing_support: vello::AaSupport::all(),
+                                    initialise_in_parallel: cfg!(all(not(target_arch="wasm32"), not(target_os="mac")))
                                 },
                             )
                             .expect("Could create renderer")
diff --git a/src/lib.rs b/src/lib.rs
index 4cde7fb52..28487f8b8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -25,6 +25,8 @@ mod shaders;
 #[cfg(feature = "wgpu")]
 mod wgpu_engine;
 
+use std::time::Instant;
+
 /// Styling and composition primitives.
 pub use peniko;
 /// 2D geometry, with a focus on curves.
@@ -140,6 +142,9 @@ pub struct RendererOptions {
     /// Represents the enabled set of AA configurations. This will be used to determine which
     /// pipeline permutations should be compiled at startup.
     pub antialiasing_support: AaSupport,
+
+    /// Whether to initialise shaders in parallel
+    pub initialise_in_parallel: bool,
 }
 
 #[cfg(feature = "wgpu")]
@@ -147,7 +152,13 @@ impl Renderer {
     /// Creates a new renderer for the specified device.
     pub fn new(device: &Device, options: RendererOptions) -> Result<Self> {
         let mut engine = WgpuEngine::new(options.use_cpu);
+        if options.initialise_in_parallel {
+            engine.use_parallel_initialisation();
+        }
+        let start = Instant::now();
         let shaders = shaders::full_shaders(device, &mut engine, &options)?;
+        engine.build_shaders_if_needed(device);
+        eprintln!("Building shaders took {:?}", start.elapsed());
         let blit = options
             .surface_format
             .map(|surface_format| BlitPipeline::new(device, surface_format));
@@ -272,6 +283,7 @@ impl Renderer {
     pub async fn reload_shaders(&mut self, device: &Device) -> Result<()> {
         device.push_error_scope(wgpu::ErrorFilter::Validation);
         let mut engine = WgpuEngine::new(self.options.use_cpu);
+        // We choose not to initialise these shaders in parallel, to ensure the error scope works correctly
         let shaders = shaders::full_shaders(device, &mut engine, &self.options)?;
         let error = device.pop_error_scope().await;
         if let Some(error) = error {
diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs
index 8a39a76e7..b28ac1e5b 100644
--- a/src/wgpu_engine.rs
+++ b/src/wgpu_engine.rs
@@ -5,6 +5,8 @@ use std::{
     borrow::Cow,
     cell::RefCell,
     collections::{hash_map::Entry, HashMap, HashSet},
+    sync::{mpsc::RecvError, Mutex},
+    thread::available_parallelism,
 };
 
 use wgpu::{
@@ -19,12 +21,20 @@ use crate::{
     BufProxy, Command, Id, ImageProxy, Recording, ResourceProxy, ShaderId,
 };
 
+struct UninitialisedShader {
+    wgsl: Cow<'static, str>,
+    label: &'static str,
+    entries: Vec<wgpu::BindGroupLayoutEntry>,
+    shader_id: ShaderId,
+}
+
 #[derive(Default)]
 pub struct WgpuEngine {
     shaders: Vec<Shader>,
     pool: ResourcePool,
     bind_map: BindMap,
     downloads: HashMap<Id, Buffer>,
+    shaders_to_initialise: Option<Vec<UninitialisedShader>>,
     pub(crate) use_cpu: bool,
 }
 
@@ -62,7 +72,7 @@ impl Shader {
         } else if let Some(wgpu) = self.wgpu.as_ref() {
             ShaderKind::Wgpu(wgpu)
         } else {
-            panic!("no available shader")
+            panic!("no available shader for {}", self.label)
         }
     }
 }
@@ -130,6 +140,77 @@ impl WgpuEngine {
         }
     }
 
+    /// Enable creating any remaining shaders in parallel
+    pub fn use_parallel_initialisation(&mut self) {
+        if self.shaders_to_initialise.is_some() {
+            return;
+        }
+        self.shaders_to_initialise = Some(Vec::new());
+    }
+
+    /// Initialise (in parallel) any shaders which are yet to be created
+    pub fn build_shaders_if_needed(&mut self, device: &Device) {
+        if let Some(mut new_shaders) = self.shaders_to_initialise.take() {
+            // Use half of available threads if available, or three otherwise
+            // (This choice is arbitrary, and could be tuned, although a proper work stealing system should be used instead)
+            let threads_to_use = available_parallelism().map_or(3, |it| it.get() / 2);
+            let remainder =
+                new_shaders.split_off(new_shaders.len().max(threads_to_use) - threads_to_use);
+            let (tx, rx) = std::sync::mpsc::channel::<(ShaderId, WgpuShader)>();
+
+            let work_queue = Mutex::new(remainder.into_iter());
+            let work_queue = &work_queue;
+            std::thread::scope(|scope| {
+                let tx = tx;
+                new_shaders
+                    .into_iter()
+                    .map(|it| {
+                        let tx = tx.clone();
+                        scope.spawn(move || {
+                            let shader = Self::create_compute_pipeline(
+                                device, it.label, it.wgsl, it.entries,
+                            );
+                            // We know the rx can only be closed if all the tx references are dropped
+                            tx.send((it.shader_id, shader)).unwrap();
+                            loop {
+                                if let Ok(mut guard) = work_queue.lock() {
+                                    if let Some(value) = guard.next() {
+                                        drop(guard);
+                                        let shader = Self::create_compute_pipeline(
+                                            device,
+                                            value.label,
+                                            value.wgsl,
+                                            value.entries,
+                                        );
+                                        tx.send((value.shader_id, shader)).unwrap();
+                                    } else {
+                                        break;
+                                    }
+                                } else {
+                                    // Another thread panicked, we ignore that here and finish our processing
+                                    break;
+                                }
+                            }
+                            drop(tx);
+                        });
+                    })
+                    .for_each(drop);
+                // Drop the initial sender, to mean that there will be no more senders if and only if all other threads have finished
+                drop(tx);
+                loop {
+                    match rx.recv() {
+                        Ok((id, value)) => {
+                            self.shaders[id.0].wgpu = Some(value);
+                        }
+                        // The channel has finished, we can finish this work
+                        // If any worker paniced, the `scope` will panic
+                        Err(RecvError) => break,
+                    }
+                }
+            });
+        }
+    }
+
     /// Add a shader.
     ///
     /// This function is somewhat limited, it doesn't apply a label, only allows one bind group,
@@ -173,10 +254,6 @@ impl WgpuEngine {
             }
         }
 
-        let shader_module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
-            label: Some(label),
-            source: wgpu::ShaderSource::Wgsl(wgsl),
-        });
         let entries = layout
             .iter()
             .enumerate()
@@ -225,27 +302,23 @@ impl WgpuEngine {
                 }
             })
             .collect::<Vec<_>>();
-        let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
-            label: None,
-            entries: &entries,
-        });
-        let compute_pipeline_layout =
-            device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
-                label: None,
-                bind_group_layouts: &[&bind_group_layout],
-                push_constant_ranges: &[],
+        if let Some(uninit) = self.shaders_to_initialise.as_mut() {
+            let id = add(Shader {
+                label,
+                wgpu: None,
+                cpu: None,
+            })?;
+            uninit.push(UninitialisedShader {
+                wgsl,
+                label,
+                entries,
+                shader_id: id,
             });
-        let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
-            label: Some(label),
-            layout: Some(&compute_pipeline_layout),
-            module: &shader_module,
-            entry_point: "main",
-        });
+            return Ok(id);
+        }
+        let wgpu = Self::create_compute_pipeline(device, label, wgsl, entries);
         add(Shader {
-            wgpu: Some(WgpuShader {
-                pipeline,
-                bind_group_layout,
-            }),
+            wgpu: Some(wgpu),
             cpu: None,
             label,
         })
@@ -532,6 +605,38 @@ impl WgpuEngine {
     pub fn free_download(&mut self, buf: BufProxy) {
         self.downloads.remove(&buf.id);
     }
+
+    fn create_compute_pipeline(
+        device: &Device,
+        label: &str,
+        wgsl: Cow<'_, str>,
+        entries: Vec<wgpu::BindGroupLayoutEntry>,
+    ) -> WgpuShader {
+        let shader_module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: Some(label),
+            source: wgpu::ShaderSource::Wgsl(wgsl),
+        });
+        let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+            label: None,
+            entries: &entries,
+        });
+        let compute_pipeline_layout =
+            device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+                label: None,
+                bind_group_layouts: &[&bind_group_layout],
+                push_constant_ranges: &[],
+            });
+        let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+            label: Some(label),
+            layout: Some(&compute_pipeline_layout),
+            module: &shader_module,
+            entry_point: "main",
+        });
+        WgpuShader {
+            pipeline,
+            bind_group_layout,
+        }
+    }
 }
 
 impl BindMap {

From bcf76647bea5a2bb85ba3805c2a0a31679fffffd Mon Sep 17 00:00:00 2001
From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com>
Date: Wed, 21 Feb 2024 10:51:53 +0000
Subject: [PATCH 2/7] Add control and visibility over initialisation

---
 examples/with_winit/src/lib.rs |  8 +++++-
 src/wgpu_engine.rs             | 50 ++++++++++++++++------------------
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs
index e23ec3122..ec7a84370 100644
--- a/examples/with_winit/src/lib.rs
+++ b/examples/with_winit/src/lib.rs
@@ -51,6 +51,10 @@ struct Args {
     #[arg(long)]
     /// Whether to use CPU shaders
     use_cpu: bool,
+    /// Whether to force initialising the shaders serially (rather than spawning threads)
+    /// This has no effect on wasm, and on macOS for performance reasons
+    #[arg(long, action = clap::ArgAction::SetFalse)]
+    serial_initialisation: bool,
 }
 
 struct RenderState<'s> {
@@ -538,7 +542,9 @@ fn run(
                                     surface_format: Some(render_state.surface.format),
                                     use_cpu,
                                     antialiasing_support: vello::AaSupport::all(),
-                                    initialise_in_parallel: cfg!(all(not(target_arch="wasm32"), not(target_os="mac")))
+                                    // We exclude macOS because it (supposedly) makes compilation slower
+                                    // see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004
+                                    initialise_in_parallel: args.serial_initialisation && cfg!(all(not(target_arch="wasm32"), not(target_os="mac")))
                                 },
                             )
                             .expect("Could create renderer")
diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs
index b28ac1e5b..af6781f0c 100644
--- a/src/wgpu_engine.rs
+++ b/src/wgpu_engine.rs
@@ -5,7 +5,7 @@ use std::{
     borrow::Cow,
     cell::RefCell,
     collections::{hash_map::Entry, HashMap, HashSet},
-    sync::{mpsc::RecvError, Mutex},
+    sync::Mutex,
     thread::available_parallelism,
 };
 
@@ -151,13 +151,15 @@ impl WgpuEngine {
     /// Initialise (in parallel) any shaders which are yet to be created
     pub fn build_shaders_if_needed(&mut self, device: &Device) {
         if let Some(mut new_shaders) = self.shaders_to_initialise.take() {
-            // Use half of available threads if available, or three otherwise
-            // (This choice is arbitrary, and could be tuned, although a proper work stealing system should be used instead)
-            let threads_to_use = available_parallelism().map_or(3, |it| it.get() / 2);
+            // Try and not to use all threads
+            // (This choice is arbitrary, and could be tuned, although a 'proper' work stealing system should be used instead)
+            let threads_to_use = available_parallelism().map_or(2, |it| it.get().max(4) - 2);
+            eprintln!("Initialising in parallel using {threads_to_use} threads");
             let remainder =
                 new_shaders.split_off(new_shaders.len().max(threads_to_use) - threads_to_use);
             let (tx, rx) = std::sync::mpsc::channel::<(ShaderId, WgpuShader)>();
 
+            // We expect each initialisation to take much longer than acquiring a lock, so we just use a mutex for our work queue
             let work_queue = Mutex::new(remainder.into_iter());
             let work_queue = &work_queue;
             std::thread::scope(|scope| {
@@ -166,14 +168,15 @@ impl WgpuEngine {
                     .into_iter()
                     .map(|it| {
                         let tx = tx.clone();
-                        scope.spawn(move || {
-                            let shader = Self::create_compute_pipeline(
-                                device, it.label, it.wgsl, it.entries,
-                            );
-                            // We know the rx can only be closed if all the tx references are dropped
-                            tx.send((it.shader_id, shader)).unwrap();
-                            loop {
-                                if let Ok(mut guard) = work_queue.lock() {
+                        std::thread::Builder::new()
+                            .name("Vello shader initialisation worker thread".into())
+                            .spawn_scoped(scope, move || {
+                                let shader = Self::create_compute_pipeline(
+                                    device, it.label, it.wgsl, it.entries,
+                                );
+                                // We know the rx can only be closed if all the tx references are dropped
+                                tx.send((it.shader_id, shader)).unwrap();
+                                while let Ok(mut guard) = work_queue.lock() {
                                     if let Some(value) = guard.next() {
                                         drop(guard);
                                         let shader = Self::create_compute_pipeline(
@@ -186,26 +189,19 @@ impl WgpuEngine {
                                     } else {
                                         break;
                                     }
-                                } else {
-                                    // Another thread panicked, we ignore that here and finish our processing
-                                    break;
                                 }
-                            }
-                            drop(tx);
-                        });
+                                // Another thread panicked or we finished.
+                                // If another thread panicked, we ignore that here and finish our processing
+                                drop(tx);
+                            })
+                            .expect("failed to spawn thread");
                     })
                     .for_each(drop);
                 // Drop the initial sender, to mean that there will be no more senders if and only if all other threads have finished
                 drop(tx);
-                loop {
-                    match rx.recv() {
-                        Ok((id, value)) => {
-                            self.shaders[id.0].wgpu = Some(value);
-                        }
-                        // The channel has finished, we can finish this work
-                        // If any worker paniced, the `scope` will panic
-                        Err(RecvError) => break,
-                    }
+
+                while let Ok((id, value)) = rx.recv() {
+                    self.shaders[id.0].wgpu = Some(value);
                 }
             });
         }

From 5f68cb346bb78706509d32894cf9cec69d051e97 Mon Sep 17 00:00:00 2001
From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com>
Date: Wed, 21 Feb 2024 11:03:42 +0000
Subject: [PATCH 3/7] Clarify comment

---
 examples/with_winit/src/lib.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs
index ec7a84370..6090eb44d 100644
--- a/examples/with_winit/src/lib.rs
+++ b/examples/with_winit/src/lib.rs
@@ -542,8 +542,9 @@ fn run(
                                     surface_format: Some(render_state.surface.format),
                                     use_cpu,
                                     antialiasing_support: vello::AaSupport::all(),
-                                    // We exclude macOS because it (supposedly) makes compilation slower
+                                    // We exclude macOS because it (supposedly) makes pipeline compilation slower
                                     // see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004
+                                    // In theory, we should only exclude metal adapters, but the difference is very minor
                                     initialise_in_parallel: args.serial_initialisation && cfg!(all(not(target_arch="wasm32"), not(target_os="mac")))
                                 },
                             )

From 603421e3b8468b428a6c782e3e4ec2487e14f025 Mon Sep 17 00:00:00 2001
From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com>
Date: Wed, 21 Feb 2024 11:20:22 +0000
Subject: [PATCH 4/7] Fix wasm compilation

---
 examples/with_winit/src/lib.rs |  3 ++-
 src/lib.rs                     |  4 ++++
 src/wgpu_engine.rs             | 12 ++++++++----
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs
index 6090eb44d..647a092bd 100644
--- a/examples/with_winit/src/lib.rs
+++ b/examples/with_winit/src/lib.rs
@@ -545,7 +545,8 @@ fn run(
                                     // We exclude macOS because it (supposedly) makes pipeline compilation slower
                                     // see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004
                                     // In theory, we should only exclude metal adapters, but the difference is very minor
-                                    initialise_in_parallel: args.serial_initialisation && cfg!(all(not(target_arch="wasm32"), not(target_os="mac")))
+                                    // wasm isn't supported
+                                    initialise_in_parallel: args.serial_initialisation && cfg!(not(target_os="mac"))
                                 },
                             )
                             .expect("Could create renderer")
diff --git a/src/lib.rs b/src/lib.rs
index 28487f8b8..6e13d4e81 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -144,6 +144,8 @@ pub struct RendererOptions {
     pub antialiasing_support: AaSupport,
 
     /// Whether to initialise shaders in parallel
+    ///
+    /// Has no effect on WebAssembly
     pub initialise_in_parallel: bool,
 }
 
@@ -153,10 +155,12 @@ impl Renderer {
     pub fn new(device: &Device, options: RendererOptions) -> Result<Self> {
         let mut engine = WgpuEngine::new(options.use_cpu);
         if options.initialise_in_parallel {
+            #[cfg(not(target_arch = "wasm32"))]
             engine.use_parallel_initialisation();
         }
         let start = Instant::now();
         let shaders = shaders::full_shaders(device, &mut engine, &options)?;
+        #[cfg(not(target_arch = "wasm32"))]
         engine.build_shaders_if_needed(device);
         eprintln!("Building shaders took {:?}", start.elapsed());
         let blit = options
diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs
index af6781f0c..87ac7c8f6 100644
--- a/src/wgpu_engine.rs
+++ b/src/wgpu_engine.rs
@@ -5,8 +5,6 @@ use std::{
     borrow::Cow,
     cell::RefCell,
     collections::{hash_map::Entry, HashMap, HashSet},
-    sync::Mutex,
-    thread::available_parallelism,
 };
 
 use wgpu::{
@@ -21,6 +19,7 @@ use crate::{
     BufProxy, Command, Id, ImageProxy, Recording, ResourceProxy, ShaderId,
 };
 
+#[cfg(not(target_arch = "wasm32"))]
 struct UninitialisedShader {
     wgsl: Cow<'static, str>,
     label: &'static str,
@@ -34,6 +33,7 @@ pub struct WgpuEngine {
     pool: ResourcePool,
     bind_map: BindMap,
     downloads: HashMap<Id, Buffer>,
+    #[cfg(not(target_arch = "wasm32"))]
     shaders_to_initialise: Option<Vec<UninitialisedShader>>,
     pub(crate) use_cpu: bool,
 }
@@ -141,6 +141,7 @@ impl WgpuEngine {
     }
 
     /// Enable creating any remaining shaders in parallel
+    #[cfg(not(target_arch = "wasm32"))]
     pub fn use_parallel_initialisation(&mut self) {
         if self.shaders_to_initialise.is_some() {
             return;
@@ -148,19 +149,21 @@ impl WgpuEngine {
         self.shaders_to_initialise = Some(Vec::new());
     }
 
+    #[cfg(not(target_arch = "wasm32"))]
     /// Initialise (in parallel) any shaders which are yet to be created
     pub fn build_shaders_if_needed(&mut self, device: &Device) {
         if let Some(mut new_shaders) = self.shaders_to_initialise.take() {
             // Try and not to use all threads
             // (This choice is arbitrary, and could be tuned, although a 'proper' work stealing system should be used instead)
-            let threads_to_use = available_parallelism().map_or(2, |it| it.get().max(4) - 2);
+            let threads_to_use =
+                std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2);
             eprintln!("Initialising in parallel using {threads_to_use} threads");
             let remainder =
                 new_shaders.split_off(new_shaders.len().max(threads_to_use) - threads_to_use);
             let (tx, rx) = std::sync::mpsc::channel::<(ShaderId, WgpuShader)>();
 
             // We expect each initialisation to take much longer than acquiring a lock, so we just use a mutex for our work queue
-            let work_queue = Mutex::new(remainder.into_iter());
+            let work_queue = std::sync::Mutex::new(remainder.into_iter());
             let work_queue = &work_queue;
             std::thread::scope(|scope| {
                 let tx = tx;
@@ -298,6 +301,7 @@ impl WgpuEngine {
                 }
             })
             .collect::<Vec<_>>();
+        #[cfg(not(target_arch = "wasm32"))]
         if let Some(uninit) = self.shaders_to_initialise.as_mut() {
             let id = add(Shader {
                 label,

From d029d717bcbb4040060b49c3025e9245f9f327b7 Mon Sep 17 00:00:00 2001
From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com>
Date: Wed, 21 Feb 2024 11:45:49 +0000
Subject: [PATCH 5/7] Fix inverted name

---
 examples/with_winit/src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs
index 647a092bd..ecc71675b 100644
--- a/examples/with_winit/src/lib.rs
+++ b/examples/with_winit/src/lib.rs
@@ -53,7 +53,7 @@ struct Args {
     use_cpu: bool,
     /// Whether to force initialising the shaders serially (rather than spawning threads)
     /// This has no effect on wasm, and on macOS for performance reasons
-    #[arg(long, action = clap::ArgAction::SetFalse)]
+    #[arg(long)]
     serial_initialisation: bool,
 }
 
@@ -546,7 +546,7 @@ fn run(
                                     // see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004
                                     // In theory, we should only exclude metal adapters, but the difference is very minor
                                     // wasm isn't supported
-                                    initialise_in_parallel: args.serial_initialisation && cfg!(not(target_os="mac"))
+                                    initialise_in_parallel: !args.serial_initialisation && cfg!(not(target_os="mac"))
                                 },
                             )
                             .expect("Could create renderer")

From aa0819eb191ac96ff25ce26fa09b940ca2a50bcd Mon Sep 17 00:00:00 2001
From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com>
Date: Wed, 21 Feb 2024 14:47:39 +0000
Subject: [PATCH 6/7] Allow configuring the number of threads directly

---
 crates/tests/src/lib.rs        |  4 ++--
 examples/headless/src/main.rs  |  3 ++-
 examples/scenes/src/svg.rs     |  3 ++-
 examples/with_bevy/src/main.rs |  5 ++++-
 examples/with_winit/src/lib.rs | 24 ++++++++++++++++--------
 src/lib.rs                     | 16 +++++++++++-----
 src/wgpu_engine.rs             | 25 +++++++++++++++++--------
 7 files changed, 54 insertions(+), 26 deletions(-)

diff --git a/crates/tests/src/lib.rs b/crates/tests/src/lib.rs
index 740330afd..3cceefcc6 100644
--- a/crates/tests/src/lib.rs
+++ b/crates/tests/src/lib.rs
@@ -1,4 +1,4 @@
-use std::{env, fs::File, path::Path, sync::Arc};
+use std::{env, fs::File, num::NonZeroUsize, path::Path, sync::Arc};
 
 use anyhow::{anyhow, bail, Result};
 use vello::{
@@ -62,7 +62,7 @@ pub async fn render(scene: Scene, params: &TestParams) -> Result<Image> {
         RendererOptions {
             surface_format: None,
             use_cpu: params.use_cpu,
-            initialise_in_parallel: false,
+            num_init_threads: NonZeroUsize::new(1),
             antialiasing_support: vello::AaSupport::area_only(),
         },
     )
diff --git a/examples/headless/src/main.rs b/examples/headless/src/main.rs
index d8dfa6a1a..add53ba83 100644
--- a/examples/headless/src/main.rs
+++ b/examples/headless/src/main.rs
@@ -1,5 +1,6 @@
 use std::{
     fs::File,
+    num::NonZeroUsize,
     path::{Path, PathBuf},
 };
 
@@ -90,7 +91,7 @@ async fn render(mut scenes: SceneSet, index: usize, args: &Args) -> Result<()> {
         RendererOptions {
             surface_format: None,
             use_cpu: args.use_cpu,
-            initialise_in_parallel: false,
+            num_init_threads: NonZeroUsize::new(1),
             antialiasing_support: vello::AaSupport::area_only(),
         },
     )
diff --git a/examples/scenes/src/svg.rs b/examples/scenes/src/svg.rs
index b4ca6c7a6..3155bc9d8 100644
--- a/examples/scenes/src/svg.rs
+++ b/examples/scenes/src/svg.rs
@@ -76,7 +76,8 @@ fn example_scene_of(file: PathBuf) -> ExampleScene {
         .unwrap_or_else(|| "unknown".to_string());
     ExampleScene {
         function: Box::new(svg_function_of(name.clone(), move || {
-            std::fs::read_to_string(file).expect("failed to read svg file")
+            std::fs::read_to_string(&file)
+                .unwrap_or_else(|e| panic!("failed to read svg file {file:?}: {e}"))
         })),
         config: crate::SceneConfig {
             animated: false,
diff --git a/examples/with_bevy/src/main.rs b/examples/with_bevy/src/main.rs
index 4dfec082a..b5c310e46 100644
--- a/examples/with_bevy/src/main.rs
+++ b/examples/with_bevy/src/main.rs
@@ -1,3 +1,5 @@
+use std::num::NonZeroUsize;
+
 use bevy::render::{Render, RenderSet};
 use bevy::utils::synccell::SyncCell;
 use vello::kurbo::{Affine, Point, Rect, Stroke};
@@ -29,7 +31,8 @@ impl FromWorld for VelloRenderer {
                 device.wgpu_device(),
                 RendererOptions {
                     surface_format: None,
-                    initialise_in_parallel: false,
+                    // TODO: We should ideally use the Bevy threadpool here
+                    num_init_threads: NonZeroUsize::new(1),
                     antialiasing_support: vello::AaSupport::area_only(),
                     use_cpu: false,
                 },
diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs
index ecc71675b..2fc4dd731 100644
--- a/examples/with_winit/src/lib.rs
+++ b/examples/with_winit/src/lib.rs
@@ -15,6 +15,7 @@
 // Also licensed under MIT license, at your choice.
 
 use instant::{Duration, Instant};
+use std::num::NonZeroUsize;
 use std::{collections::HashSet, sync::Arc};
 
 use anyhow::Result;
@@ -52,9 +53,20 @@ struct Args {
     /// Whether to use CPU shaders
     use_cpu: bool,
     /// Whether to force initialising the shaders serially (rather than spawning threads)
-    /// This has no effect on wasm, and on macOS for performance reasons
-    #[arg(long)]
-    serial_initialisation: bool,
+    /// This has no effect on wasm, and defaults to 1 on macOS for performance reasons
+    ///
+    /// Use `0` for an automatic choice
+    #[arg(long, default_value_t=default_threads())]
+    num_init_threads: usize,
+}
+
+fn default_threads() -> usize {
+    #![allow(unreachable_code)]
+    #[cfg(target_os = "mac")]
+    {
+        return 1;
+    }
+    0
 }
 
 struct RenderState<'s> {
@@ -542,11 +554,7 @@ fn run(
                                     surface_format: Some(render_state.surface.format),
                                     use_cpu,
                                     antialiasing_support: vello::AaSupport::all(),
-                                    // We exclude macOS because it (supposedly) makes pipeline compilation slower
-                                    // see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004
-                                    // In theory, we should only exclude metal adapters, but the difference is very minor
-                                    // wasm isn't supported
-                                    initialise_in_parallel: !args.serial_initialisation && cfg!(not(target_os="mac"))
+                                    num_init_threads: NonZeroUsize::new(args.num_init_threads)
                                 },
                             )
                             .expect("Could create renderer")
diff --git a/src/lib.rs b/src/lib.rs
index 6e13d4e81..e9676cd32 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -25,7 +25,7 @@ mod shaders;
 #[cfg(feature = "wgpu")]
 mod wgpu_engine;
 
-use std::time::Instant;
+use std::{num::NonZeroUsize, time::Instant};
 
 /// Styling and composition primitives.
 pub use peniko;
@@ -143,10 +143,15 @@ pub struct RendererOptions {
     /// pipeline permutations should be compiled at startup.
     pub antialiasing_support: AaSupport,
 
-    /// Whether to initialise shaders in parallel
+    /// How many threads to use for initialisation of shaders.
+    ///
+    /// Use `Some(1)` to use a single thread. This is recommended when on macOS
+    /// (see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004)
+    ///
+    /// Set to `None` to use a heuristic which will use many but not all threads
     ///
     /// Has no effect on WebAssembly
-    pub initialise_in_parallel: bool,
+    pub num_init_threads: Option<NonZeroUsize>,
 }
 
 #[cfg(feature = "wgpu")]
@@ -154,14 +159,15 @@ impl Renderer {
     /// Creates a new renderer for the specified device.
     pub fn new(device: &Device, options: RendererOptions) -> Result<Self> {
         let mut engine = WgpuEngine::new(options.use_cpu);
-        if options.initialise_in_parallel {
+        // If we are running in parallel (i.e. the number of threads is not 1)
+        if options.num_init_threads != NonZeroUsize::new(1) {
             #[cfg(not(target_arch = "wasm32"))]
             engine.use_parallel_initialisation();
         }
         let start = Instant::now();
         let shaders = shaders::full_shaders(device, &mut engine, &options)?;
         #[cfg(not(target_arch = "wasm32"))]
-        engine.build_shaders_if_needed(device);
+        engine.build_shaders_if_needed(device, options.num_init_threads);
         eprintln!("Building shaders took {:?}", start.elapsed());
         let blit = options
             .surface_format
diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs
index 87ac7c8f6..3e05813e9 100644
--- a/src/wgpu_engine.rs
+++ b/src/wgpu_engine.rs
@@ -151,15 +151,24 @@ impl WgpuEngine {
 
     #[cfg(not(target_arch = "wasm32"))]
     /// Initialise (in parallel) any shaders which are yet to be created
-    pub fn build_shaders_if_needed(&mut self, device: &Device) {
+    pub fn build_shaders_if_needed(
+        &mut self,
+        device: &Device,
+        num_threads: Option<std::num::NonZeroUsize>,
+    ) {
+        use std::num::NonZeroUsize;
+
         if let Some(mut new_shaders) = self.shaders_to_initialise.take() {
-            // Try and not to use all threads
-            // (This choice is arbitrary, and could be tuned, although a 'proper' work stealing system should be used instead)
-            let threads_to_use =
-                std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2);
-            eprintln!("Initialising in parallel using {threads_to_use} threads");
-            let remainder =
-                new_shaders.split_off(new_shaders.len().max(threads_to_use) - threads_to_use);
+            let num_threads = num_threads.map(NonZeroUsize::get).unwrap_or_else(|| {
+                // Fallback onto a heuristic. This tries to not to use all threads.
+                // We keep the main thread blocked and not doing much whilst this is running,
+                // so we broadly leave two cores unused at the point of maximum parallelism
+                // (This choice is arbitrary, and could be tuned, although a 'proper' threadpool
+                // should probably be used instead)
+                std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2)
+            });
+            eprintln!("Initialising in parallel using {num_threads} threads");
+            let remainder = new_shaders.split_off(new_shaders.len().max(num_threads) - num_threads);
             let (tx, rx) = std::sync::mpsc::channel::<(ShaderId, WgpuShader)>();
 
             // We expect each initialisation to take much longer than acquiring a lock, so we just use a mutex for our work queue

From 00d939bcdfd2307820352a5ef417652d98242e16 Mon Sep 17 00:00:00 2001
From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com>
Date: Wed, 21 Feb 2024 17:55:39 +0000
Subject: [PATCH 7/7] Fix dodgy maths

---
 src/wgpu_engine.rs | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs
index 3e05813e9..bb2df60bf 100644
--- a/src/wgpu_engine.rs
+++ b/src/wgpu_engine.rs
@@ -159,16 +159,19 @@ impl WgpuEngine {
         use std::num::NonZeroUsize;
 
         if let Some(mut new_shaders) = self.shaders_to_initialise.take() {
-            let num_threads = num_threads.map(NonZeroUsize::get).unwrap_or_else(|| {
-                // Fallback onto a heuristic. This tries to not to use all threads.
-                // We keep the main thread blocked and not doing much whilst this is running,
-                // so we broadly leave two cores unused at the point of maximum parallelism
-                // (This choice is arbitrary, and could be tuned, although a 'proper' threadpool
-                // should probably be used instead)
-                std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2)
-            });
+            let num_threads = num_threads
+                .map(NonZeroUsize::get)
+                .unwrap_or_else(|| {
+                    // Fallback onto a heuristic. This tries to not to use all threads.
+                    // We keep the main thread blocked and not doing much whilst this is running,
+                    // so we broadly leave two cores unused at the point of maximum parallelism
+                    // (This choice is arbitrary, and could be tuned, although a 'proper' threadpool
+                    // should probably be used instead)
+                    std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2)
+                })
+                .min(new_shaders.len());
             eprintln!("Initialising in parallel using {num_threads} threads");
-            let remainder = new_shaders.split_off(new_shaders.len().max(num_threads) - num_threads);
+            let remainder = new_shaders.split_off(num_threads);
             let (tx, rx) = std::sync::mpsc::channel::<(ShaderId, WgpuShader)>();
 
             // We expect each initialisation to take much longer than acquiring a lock, so we just use a mutex for our work queue