From 729b15ed67c56b7f829dd8fde323c0ac0a01a630 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 21 Feb 2024 10:24:33 +0000 Subject: [PATCH 1/7] Initialise shaders in parallel --- crates/tests/src/lib.rs | 1 + examples/headless/src/main.rs | 1 + examples/with_bevy/src/main.rs | 1 + examples/with_winit/src/lib.rs | 1 + src/lib.rs | 12 +++ src/wgpu_engine.rs | 153 +++++++++++++++++++++++++++------ 6 files changed, 145 insertions(+), 24 deletions(-) diff --git a/crates/tests/src/lib.rs b/crates/tests/src/lib.rs index faf7b5149..740330afd 100644 --- a/crates/tests/src/lib.rs +++ b/crates/tests/src/lib.rs @@ -62,6 +62,7 @@ pub async fn render(scene: Scene, params: &TestParams) -> Result { RendererOptions { surface_format: None, use_cpu: params.use_cpu, + initialise_in_parallel: false, antialiasing_support: vello::AaSupport::area_only(), }, ) diff --git a/examples/headless/src/main.rs b/examples/headless/src/main.rs index 65587c4df..d8dfa6a1a 100644 --- a/examples/headless/src/main.rs +++ b/examples/headless/src/main.rs @@ -90,6 +90,7 @@ async fn render(mut scenes: SceneSet, index: usize, args: &Args) -> Result<()> { RendererOptions { surface_format: None, use_cpu: args.use_cpu, + initialise_in_parallel: false, antialiasing_support: vello::AaSupport::area_only(), }, ) diff --git a/examples/with_bevy/src/main.rs b/examples/with_bevy/src/main.rs index b4760e9dd..4dfec082a 100644 --- a/examples/with_bevy/src/main.rs +++ b/examples/with_bevy/src/main.rs @@ -29,6 +29,7 @@ impl FromWorld for VelloRenderer { device.wgpu_device(), RendererOptions { surface_format: None, + initialise_in_parallel: false, antialiasing_support: vello::AaSupport::area_only(), use_cpu: false, }, diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs index 00868de7e..e23ec3122 100644 --- a/examples/with_winit/src/lib.rs +++ b/examples/with_winit/src/lib.rs @@ -538,6 +538,7 @@ fn run( surface_format: Some(render_state.surface.format), use_cpu, antialiasing_support: vello::AaSupport::all(), + initialise_in_parallel: cfg!(all(not(target_arch="wasm32"), not(target_os="mac"))) }, ) .expect("Could create renderer") diff --git a/src/lib.rs b/src/lib.rs index 4cde7fb52..28487f8b8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,6 +25,8 @@ mod shaders; #[cfg(feature = "wgpu")] mod wgpu_engine; +use std::time::Instant; + /// Styling and composition primitives. pub use peniko; /// 2D geometry, with a focus on curves. @@ -140,6 +142,9 @@ pub struct RendererOptions { /// Represents the enabled set of AA configurations. This will be used to determine which /// pipeline permutations should be compiled at startup. pub antialiasing_support: AaSupport, + + /// Whether to initialise shaders in parallel + pub initialise_in_parallel: bool, } #[cfg(feature = "wgpu")] @@ -147,7 +152,13 @@ impl Renderer { /// Creates a new renderer for the specified device. pub fn new(device: &Device, options: RendererOptions) -> Result { let mut engine = WgpuEngine::new(options.use_cpu); + if options.initialise_in_parallel { + engine.use_parallel_initialisation(); + } + let start = Instant::now(); let shaders = shaders::full_shaders(device, &mut engine, &options)?; + engine.build_shaders_if_needed(device); + eprintln!("Building shaders took {:?}", start.elapsed()); let blit = options .surface_format .map(|surface_format| BlitPipeline::new(device, surface_format)); @@ -272,6 +283,7 @@ impl Renderer { pub async fn reload_shaders(&mut self, device: &Device) -> Result<()> { device.push_error_scope(wgpu::ErrorFilter::Validation); let mut engine = WgpuEngine::new(self.options.use_cpu); + // We choose not to initialise these shaders in parallel, to ensure the error scope works correctly let shaders = shaders::full_shaders(device, &mut engine, &self.options)?; let error = device.pop_error_scope().await; if let Some(error) = error { diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs index 8a39a76e7..b28ac1e5b 100644 --- a/src/wgpu_engine.rs +++ b/src/wgpu_engine.rs @@ -5,6 +5,8 @@ use std::{ borrow::Cow, cell::RefCell, collections::{hash_map::Entry, HashMap, HashSet}, + sync::{mpsc::RecvError, Mutex}, + thread::available_parallelism, }; use wgpu::{ @@ -19,12 +21,20 @@ use crate::{ BufProxy, Command, Id, ImageProxy, Recording, ResourceProxy, ShaderId, }; +struct UninitialisedShader { + wgsl: Cow<'static, str>, + label: &'static str, + entries: Vec, + shader_id: ShaderId, +} + #[derive(Default)] pub struct WgpuEngine { shaders: Vec, pool: ResourcePool, bind_map: BindMap, downloads: HashMap, + shaders_to_initialise: Option>, pub(crate) use_cpu: bool, } @@ -62,7 +72,7 @@ impl Shader { } else if let Some(wgpu) = self.wgpu.as_ref() { ShaderKind::Wgpu(wgpu) } else { - panic!("no available shader") + panic!("no available shader for {}", self.label) } } } @@ -130,6 +140,77 @@ impl WgpuEngine { } } + /// Enable creating any remaining shaders in parallel + pub fn use_parallel_initialisation(&mut self) { + if self.shaders_to_initialise.is_some() { + return; + } + self.shaders_to_initialise = Some(Vec::new()); + } + + /// Initialise (in parallel) any shaders which are yet to be created + pub fn build_shaders_if_needed(&mut self, device: &Device) { + if let Some(mut new_shaders) = self.shaders_to_initialise.take() { + // Use half of available threads if available, or three otherwise + // (This choice is arbitrary, and could be tuned, although a proper work stealing system should be used instead) + let threads_to_use = available_parallelism().map_or(3, |it| it.get() / 2); + let remainder = + new_shaders.split_off(new_shaders.len().max(threads_to_use) - threads_to_use); + let (tx, rx) = std::sync::mpsc::channel::<(ShaderId, WgpuShader)>(); + + let work_queue = Mutex::new(remainder.into_iter()); + let work_queue = &work_queue; + std::thread::scope(|scope| { + let tx = tx; + new_shaders + .into_iter() + .map(|it| { + let tx = tx.clone(); + scope.spawn(move || { + let shader = Self::create_compute_pipeline( + device, it.label, it.wgsl, it.entries, + ); + // We know the rx can only be closed if all the tx references are dropped + tx.send((it.shader_id, shader)).unwrap(); + loop { + if let Ok(mut guard) = work_queue.lock() { + if let Some(value) = guard.next() { + drop(guard); + let shader = Self::create_compute_pipeline( + device, + value.label, + value.wgsl, + value.entries, + ); + tx.send((value.shader_id, shader)).unwrap(); + } else { + break; + } + } else { + // Another thread panicked, we ignore that here and finish our processing + break; + } + } + drop(tx); + }); + }) + .for_each(drop); + // Drop the initial sender, to mean that there will be no more senders if and only if all other threads have finished + drop(tx); + loop { + match rx.recv() { + Ok((id, value)) => { + self.shaders[id.0].wgpu = Some(value); + } + // The channel has finished, we can finish this work + // If any worker paniced, the `scope` will panic + Err(RecvError) => break, + } + } + }); + } + } + /// Add a shader. /// /// This function is somewhat limited, it doesn't apply a label, only allows one bind group, @@ -173,10 +254,6 @@ impl WgpuEngine { } } - let shader_module = device.create_shader_module(wgpu::ShaderModuleDescriptor { - label: Some(label), - source: wgpu::ShaderSource::Wgsl(wgsl), - }); let entries = layout .iter() .enumerate() @@ -225,27 +302,23 @@ impl WgpuEngine { } }) .collect::>(); - let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { - label: None, - entries: &entries, - }); - let compute_pipeline_layout = - device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { - label: None, - bind_group_layouts: &[&bind_group_layout], - push_constant_ranges: &[], + if let Some(uninit) = self.shaders_to_initialise.as_mut() { + let id = add(Shader { + label, + wgpu: None, + cpu: None, + })?; + uninit.push(UninitialisedShader { + wgsl, + label, + entries, + shader_id: id, }); - let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { - label: Some(label), - layout: Some(&compute_pipeline_layout), - module: &shader_module, - entry_point: "main", - }); + return Ok(id); + } + let wgpu = Self::create_compute_pipeline(device, label, wgsl, entries); add(Shader { - wgpu: Some(WgpuShader { - pipeline, - bind_group_layout, - }), + wgpu: Some(wgpu), cpu: None, label, }) @@ -532,6 +605,38 @@ impl WgpuEngine { pub fn free_download(&mut self, buf: BufProxy) { self.downloads.remove(&buf.id); } + + fn create_compute_pipeline( + device: &Device, + label: &str, + wgsl: Cow<'_, str>, + entries: Vec, + ) -> WgpuShader { + let shader_module = device.create_shader_module(wgpu::ShaderModuleDescriptor { + label: Some(label), + source: wgpu::ShaderSource::Wgsl(wgsl), + }); + let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: None, + entries: &entries, + }); + let compute_pipeline_layout = + device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: None, + bind_group_layouts: &[&bind_group_layout], + push_constant_ranges: &[], + }); + let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: Some(label), + layout: Some(&compute_pipeline_layout), + module: &shader_module, + entry_point: "main", + }); + WgpuShader { + pipeline, + bind_group_layout, + } + } } impl BindMap { From bcf76647bea5a2bb85ba3805c2a0a31679fffffd Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 21 Feb 2024 10:51:53 +0000 Subject: [PATCH 2/7] Add control and visibility over initialisation --- examples/with_winit/src/lib.rs | 8 +++++- src/wgpu_engine.rs | 50 ++++++++++++++++------------------ 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs index e23ec3122..ec7a84370 100644 --- a/examples/with_winit/src/lib.rs +++ b/examples/with_winit/src/lib.rs @@ -51,6 +51,10 @@ struct Args { #[arg(long)] /// Whether to use CPU shaders use_cpu: bool, + /// Whether to force initialising the shaders serially (rather than spawning threads) + /// This has no effect on wasm, and on macOS for performance reasons + #[arg(long, action = clap::ArgAction::SetFalse)] + serial_initialisation: bool, } struct RenderState<'s> { @@ -538,7 +542,9 @@ fn run( surface_format: Some(render_state.surface.format), use_cpu, antialiasing_support: vello::AaSupport::all(), - initialise_in_parallel: cfg!(all(not(target_arch="wasm32"), not(target_os="mac"))) + // We exclude macOS because it (supposedly) makes compilation slower + // see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004 + initialise_in_parallel: args.serial_initialisation && cfg!(all(not(target_arch="wasm32"), not(target_os="mac"))) }, ) .expect("Could create renderer") diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs index b28ac1e5b..af6781f0c 100644 --- a/src/wgpu_engine.rs +++ b/src/wgpu_engine.rs @@ -5,7 +5,7 @@ use std::{ borrow::Cow, cell::RefCell, collections::{hash_map::Entry, HashMap, HashSet}, - sync::{mpsc::RecvError, Mutex}, + sync::Mutex, thread::available_parallelism, }; @@ -151,13 +151,15 @@ impl WgpuEngine { /// Initialise (in parallel) any shaders which are yet to be created pub fn build_shaders_if_needed(&mut self, device: &Device) { if let Some(mut new_shaders) = self.shaders_to_initialise.take() { - // Use half of available threads if available, or three otherwise - // (This choice is arbitrary, and could be tuned, although a proper work stealing system should be used instead) - let threads_to_use = available_parallelism().map_or(3, |it| it.get() / 2); + // Try and not to use all threads + // (This choice is arbitrary, and could be tuned, although a 'proper' work stealing system should be used instead) + let threads_to_use = available_parallelism().map_or(2, |it| it.get().max(4) - 2); + eprintln!("Initialising in parallel using {threads_to_use} threads"); let remainder = new_shaders.split_off(new_shaders.len().max(threads_to_use) - threads_to_use); let (tx, rx) = std::sync::mpsc::channel::<(ShaderId, WgpuShader)>(); + // We expect each initialisation to take much longer than acquiring a lock, so we just use a mutex for our work queue let work_queue = Mutex::new(remainder.into_iter()); let work_queue = &work_queue; std::thread::scope(|scope| { @@ -166,14 +168,15 @@ impl WgpuEngine { .into_iter() .map(|it| { let tx = tx.clone(); - scope.spawn(move || { - let shader = Self::create_compute_pipeline( - device, it.label, it.wgsl, it.entries, - ); - // We know the rx can only be closed if all the tx references are dropped - tx.send((it.shader_id, shader)).unwrap(); - loop { - if let Ok(mut guard) = work_queue.lock() { + std::thread::Builder::new() + .name("Vello shader initialisation worker thread".into()) + .spawn_scoped(scope, move || { + let shader = Self::create_compute_pipeline( + device, it.label, it.wgsl, it.entries, + ); + // We know the rx can only be closed if all the tx references are dropped + tx.send((it.shader_id, shader)).unwrap(); + while let Ok(mut guard) = work_queue.lock() { if let Some(value) = guard.next() { drop(guard); let shader = Self::create_compute_pipeline( @@ -186,26 +189,19 @@ impl WgpuEngine { } else { break; } - } else { - // Another thread panicked, we ignore that here and finish our processing - break; } - } - drop(tx); - }); + // Another thread panicked or we finished. + // If another thread panicked, we ignore that here and finish our processing + drop(tx); + }) + .expect("failed to spawn thread"); }) .for_each(drop); // Drop the initial sender, to mean that there will be no more senders if and only if all other threads have finished drop(tx); - loop { - match rx.recv() { - Ok((id, value)) => { - self.shaders[id.0].wgpu = Some(value); - } - // The channel has finished, we can finish this work - // If any worker paniced, the `scope` will panic - Err(RecvError) => break, - } + + while let Ok((id, value)) = rx.recv() { + self.shaders[id.0].wgpu = Some(value); } }); } From 5f68cb346bb78706509d32894cf9cec69d051e97 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 21 Feb 2024 11:03:42 +0000 Subject: [PATCH 3/7] Clarify comment --- examples/with_winit/src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs index ec7a84370..6090eb44d 100644 --- a/examples/with_winit/src/lib.rs +++ b/examples/with_winit/src/lib.rs @@ -542,8 +542,9 @@ fn run( surface_format: Some(render_state.surface.format), use_cpu, antialiasing_support: vello::AaSupport::all(), - // We exclude macOS because it (supposedly) makes compilation slower + // We exclude macOS because it (supposedly) makes pipeline compilation slower // see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004 + // In theory, we should only exclude metal adapters, but the difference is very minor initialise_in_parallel: args.serial_initialisation && cfg!(all(not(target_arch="wasm32"), not(target_os="mac"))) }, ) From 603421e3b8468b428a6c782e3e4ec2487e14f025 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 21 Feb 2024 11:20:22 +0000 Subject: [PATCH 4/7] Fix wasm compilation --- examples/with_winit/src/lib.rs | 3 ++- src/lib.rs | 4 ++++ src/wgpu_engine.rs | 12 ++++++++---- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs index 6090eb44d..647a092bd 100644 --- a/examples/with_winit/src/lib.rs +++ b/examples/with_winit/src/lib.rs @@ -545,7 +545,8 @@ fn run( // We exclude macOS because it (supposedly) makes pipeline compilation slower // see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004 // In theory, we should only exclude metal adapters, but the difference is very minor - initialise_in_parallel: args.serial_initialisation && cfg!(all(not(target_arch="wasm32"), not(target_os="mac"))) + // wasm isn't supported + initialise_in_parallel: args.serial_initialisation && cfg!(not(target_os="mac")) }, ) .expect("Could create renderer") diff --git a/src/lib.rs b/src/lib.rs index 28487f8b8..6e13d4e81 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -144,6 +144,8 @@ pub struct RendererOptions { pub antialiasing_support: AaSupport, /// Whether to initialise shaders in parallel + /// + /// Has no effect on WebAssembly pub initialise_in_parallel: bool, } @@ -153,10 +155,12 @@ impl Renderer { pub fn new(device: &Device, options: RendererOptions) -> Result { let mut engine = WgpuEngine::new(options.use_cpu); if options.initialise_in_parallel { + #[cfg(not(target_arch = "wasm32"))] engine.use_parallel_initialisation(); } let start = Instant::now(); let shaders = shaders::full_shaders(device, &mut engine, &options)?; + #[cfg(not(target_arch = "wasm32"))] engine.build_shaders_if_needed(device); eprintln!("Building shaders took {:?}", start.elapsed()); let blit = options diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs index af6781f0c..87ac7c8f6 100644 --- a/src/wgpu_engine.rs +++ b/src/wgpu_engine.rs @@ -5,8 +5,6 @@ use std::{ borrow::Cow, cell::RefCell, collections::{hash_map::Entry, HashMap, HashSet}, - sync::Mutex, - thread::available_parallelism, }; use wgpu::{ @@ -21,6 +19,7 @@ use crate::{ BufProxy, Command, Id, ImageProxy, Recording, ResourceProxy, ShaderId, }; +#[cfg(not(target_arch = "wasm32"))] struct UninitialisedShader { wgsl: Cow<'static, str>, label: &'static str, @@ -34,6 +33,7 @@ pub struct WgpuEngine { pool: ResourcePool, bind_map: BindMap, downloads: HashMap, + #[cfg(not(target_arch = "wasm32"))] shaders_to_initialise: Option>, pub(crate) use_cpu: bool, } @@ -141,6 +141,7 @@ impl WgpuEngine { } /// Enable creating any remaining shaders in parallel + #[cfg(not(target_arch = "wasm32"))] pub fn use_parallel_initialisation(&mut self) { if self.shaders_to_initialise.is_some() { return; @@ -148,19 +149,21 @@ impl WgpuEngine { self.shaders_to_initialise = Some(Vec::new()); } + #[cfg(not(target_arch = "wasm32"))] /// Initialise (in parallel) any shaders which are yet to be created pub fn build_shaders_if_needed(&mut self, device: &Device) { if let Some(mut new_shaders) = self.shaders_to_initialise.take() { // Try and not to use all threads // (This choice is arbitrary, and could be tuned, although a 'proper' work stealing system should be used instead) - let threads_to_use = available_parallelism().map_or(2, |it| it.get().max(4) - 2); + let threads_to_use = + std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2); eprintln!("Initialising in parallel using {threads_to_use} threads"); let remainder = new_shaders.split_off(new_shaders.len().max(threads_to_use) - threads_to_use); let (tx, rx) = std::sync::mpsc::channel::<(ShaderId, WgpuShader)>(); // We expect each initialisation to take much longer than acquiring a lock, so we just use a mutex for our work queue - let work_queue = Mutex::new(remainder.into_iter()); + let work_queue = std::sync::Mutex::new(remainder.into_iter()); let work_queue = &work_queue; std::thread::scope(|scope| { let tx = tx; @@ -298,6 +301,7 @@ impl WgpuEngine { } }) .collect::>(); + #[cfg(not(target_arch = "wasm32"))] if let Some(uninit) = self.shaders_to_initialise.as_mut() { let id = add(Shader { label, From d029d717bcbb4040060b49c3025e9245f9f327b7 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 21 Feb 2024 11:45:49 +0000 Subject: [PATCH 5/7] Fix inverted name --- examples/with_winit/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs index 647a092bd..ecc71675b 100644 --- a/examples/with_winit/src/lib.rs +++ b/examples/with_winit/src/lib.rs @@ -53,7 +53,7 @@ struct Args { use_cpu: bool, /// Whether to force initialising the shaders serially (rather than spawning threads) /// This has no effect on wasm, and on macOS for performance reasons - #[arg(long, action = clap::ArgAction::SetFalse)] + #[arg(long)] serial_initialisation: bool, } @@ -546,7 +546,7 @@ fn run( // see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004 // In theory, we should only exclude metal adapters, but the difference is very minor // wasm isn't supported - initialise_in_parallel: args.serial_initialisation && cfg!(not(target_os="mac")) + initialise_in_parallel: !args.serial_initialisation && cfg!(not(target_os="mac")) }, ) .expect("Could create renderer") From aa0819eb191ac96ff25ce26fa09b940ca2a50bcd Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 21 Feb 2024 14:47:39 +0000 Subject: [PATCH 6/7] Allow configuring the number of threads directly --- crates/tests/src/lib.rs | 4 ++-- examples/headless/src/main.rs | 3 ++- examples/scenes/src/svg.rs | 3 ++- examples/with_bevy/src/main.rs | 5 ++++- examples/with_winit/src/lib.rs | 24 ++++++++++++++++-------- src/lib.rs | 16 +++++++++++----- src/wgpu_engine.rs | 25 +++++++++++++++++-------- 7 files changed, 54 insertions(+), 26 deletions(-) diff --git a/crates/tests/src/lib.rs b/crates/tests/src/lib.rs index 740330afd..3cceefcc6 100644 --- a/crates/tests/src/lib.rs +++ b/crates/tests/src/lib.rs @@ -1,4 +1,4 @@ -use std::{env, fs::File, path::Path, sync::Arc}; +use std::{env, fs::File, num::NonZeroUsize, path::Path, sync::Arc}; use anyhow::{anyhow, bail, Result}; use vello::{ @@ -62,7 +62,7 @@ pub async fn render(scene: Scene, params: &TestParams) -> Result { RendererOptions { surface_format: None, use_cpu: params.use_cpu, - initialise_in_parallel: false, + num_init_threads: NonZeroUsize::new(1), antialiasing_support: vello::AaSupport::area_only(), }, ) diff --git a/examples/headless/src/main.rs b/examples/headless/src/main.rs index d8dfa6a1a..add53ba83 100644 --- a/examples/headless/src/main.rs +++ b/examples/headless/src/main.rs @@ -1,5 +1,6 @@ use std::{ fs::File, + num::NonZeroUsize, path::{Path, PathBuf}, }; @@ -90,7 +91,7 @@ async fn render(mut scenes: SceneSet, index: usize, args: &Args) -> Result<()> { RendererOptions { surface_format: None, use_cpu: args.use_cpu, - initialise_in_parallel: false, + num_init_threads: NonZeroUsize::new(1), antialiasing_support: vello::AaSupport::area_only(), }, ) diff --git a/examples/scenes/src/svg.rs b/examples/scenes/src/svg.rs index b4ca6c7a6..3155bc9d8 100644 --- a/examples/scenes/src/svg.rs +++ b/examples/scenes/src/svg.rs @@ -76,7 +76,8 @@ fn example_scene_of(file: PathBuf) -> ExampleScene { .unwrap_or_else(|| "unknown".to_string()); ExampleScene { function: Box::new(svg_function_of(name.clone(), move || { - std::fs::read_to_string(file).expect("failed to read svg file") + std::fs::read_to_string(&file) + .unwrap_or_else(|e| panic!("failed to read svg file {file:?}: {e}")) })), config: crate::SceneConfig { animated: false, diff --git a/examples/with_bevy/src/main.rs b/examples/with_bevy/src/main.rs index 4dfec082a..b5c310e46 100644 --- a/examples/with_bevy/src/main.rs +++ b/examples/with_bevy/src/main.rs @@ -1,3 +1,5 @@ +use std::num::NonZeroUsize; + use bevy::render::{Render, RenderSet}; use bevy::utils::synccell::SyncCell; use vello::kurbo::{Affine, Point, Rect, Stroke}; @@ -29,7 +31,8 @@ impl FromWorld for VelloRenderer { device.wgpu_device(), RendererOptions { surface_format: None, - initialise_in_parallel: false, + // TODO: We should ideally use the Bevy threadpool here + num_init_threads: NonZeroUsize::new(1), antialiasing_support: vello::AaSupport::area_only(), use_cpu: false, }, diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs index ecc71675b..2fc4dd731 100644 --- a/examples/with_winit/src/lib.rs +++ b/examples/with_winit/src/lib.rs @@ -15,6 +15,7 @@ // Also licensed under MIT license, at your choice. use instant::{Duration, Instant}; +use std::num::NonZeroUsize; use std::{collections::HashSet, sync::Arc}; use anyhow::Result; @@ -52,9 +53,20 @@ struct Args { /// Whether to use CPU shaders use_cpu: bool, /// Whether to force initialising the shaders serially (rather than spawning threads) - /// This has no effect on wasm, and on macOS for performance reasons - #[arg(long)] - serial_initialisation: bool, + /// This has no effect on wasm, and defaults to 1 on macOS for performance reasons + /// + /// Use `0` for an automatic choice + #[arg(long, default_value_t=default_threads())] + num_init_threads: usize, +} + +fn default_threads() -> usize { + #![allow(unreachable_code)] + #[cfg(target_os = "mac")] + { + return 1; + } + 0 } struct RenderState<'s> { @@ -542,11 +554,7 @@ fn run( surface_format: Some(render_state.surface.format), use_cpu, antialiasing_support: vello::AaSupport::all(), - // We exclude macOS because it (supposedly) makes pipeline compilation slower - // see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004 - // In theory, we should only exclude metal adapters, but the difference is very minor - // wasm isn't supported - initialise_in_parallel: !args.serial_initialisation && cfg!(not(target_os="mac")) + num_init_threads: NonZeroUsize::new(args.num_init_threads) }, ) .expect("Could create renderer") diff --git a/src/lib.rs b/src/lib.rs index 6e13d4e81..e9676cd32 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,7 +25,7 @@ mod shaders; #[cfg(feature = "wgpu")] mod wgpu_engine; -use std::time::Instant; +use std::{num::NonZeroUsize, time::Instant}; /// Styling and composition primitives. pub use peniko; @@ -143,10 +143,15 @@ pub struct RendererOptions { /// pipeline permutations should be compiled at startup. pub antialiasing_support: AaSupport, - /// Whether to initialise shaders in parallel + /// How many threads to use for initialisation of shaders. + /// + /// Use `Some(1)` to use a single thread. This is recommended when on macOS + /// (see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004) + /// + /// Set to `None` to use a heuristic which will use many but not all threads /// /// Has no effect on WebAssembly - pub initialise_in_parallel: bool, + pub num_init_threads: Option, } #[cfg(feature = "wgpu")] @@ -154,14 +159,15 @@ impl Renderer { /// Creates a new renderer for the specified device. pub fn new(device: &Device, options: RendererOptions) -> Result { let mut engine = WgpuEngine::new(options.use_cpu); - if options.initialise_in_parallel { + // If we are running in parallel (i.e. the number of threads is not 1) + if options.num_init_threads != NonZeroUsize::new(1) { #[cfg(not(target_arch = "wasm32"))] engine.use_parallel_initialisation(); } let start = Instant::now(); let shaders = shaders::full_shaders(device, &mut engine, &options)?; #[cfg(not(target_arch = "wasm32"))] - engine.build_shaders_if_needed(device); + engine.build_shaders_if_needed(device, options.num_init_threads); eprintln!("Building shaders took {:?}", start.elapsed()); let blit = options .surface_format diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs index 87ac7c8f6..3e05813e9 100644 --- a/src/wgpu_engine.rs +++ b/src/wgpu_engine.rs @@ -151,15 +151,24 @@ impl WgpuEngine { #[cfg(not(target_arch = "wasm32"))] /// Initialise (in parallel) any shaders which are yet to be created - pub fn build_shaders_if_needed(&mut self, device: &Device) { + pub fn build_shaders_if_needed( + &mut self, + device: &Device, + num_threads: Option, + ) { + use std::num::NonZeroUsize; + if let Some(mut new_shaders) = self.shaders_to_initialise.take() { - // Try and not to use all threads - // (This choice is arbitrary, and could be tuned, although a 'proper' work stealing system should be used instead) - let threads_to_use = - std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2); - eprintln!("Initialising in parallel using {threads_to_use} threads"); - let remainder = - new_shaders.split_off(new_shaders.len().max(threads_to_use) - threads_to_use); + let num_threads = num_threads.map(NonZeroUsize::get).unwrap_or_else(|| { + // Fallback onto a heuristic. This tries to not to use all threads. + // We keep the main thread blocked and not doing much whilst this is running, + // so we broadly leave two cores unused at the point of maximum parallelism + // (This choice is arbitrary, and could be tuned, although a 'proper' threadpool + // should probably be used instead) + std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2) + }); + eprintln!("Initialising in parallel using {num_threads} threads"); + let remainder = new_shaders.split_off(new_shaders.len().max(num_threads) - num_threads); let (tx, rx) = std::sync::mpsc::channel::<(ShaderId, WgpuShader)>(); // We expect each initialisation to take much longer than acquiring a lock, so we just use a mutex for our work queue From 00d939bcdfd2307820352a5ef417652d98242e16 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 21 Feb 2024 17:55:39 +0000 Subject: [PATCH 7/7] Fix dodgy maths --- src/wgpu_engine.rs | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs index 3e05813e9..bb2df60bf 100644 --- a/src/wgpu_engine.rs +++ b/src/wgpu_engine.rs @@ -159,16 +159,19 @@ impl WgpuEngine { use std::num::NonZeroUsize; if let Some(mut new_shaders) = self.shaders_to_initialise.take() { - let num_threads = num_threads.map(NonZeroUsize::get).unwrap_or_else(|| { - // Fallback onto a heuristic. This tries to not to use all threads. - // We keep the main thread blocked and not doing much whilst this is running, - // so we broadly leave two cores unused at the point of maximum parallelism - // (This choice is arbitrary, and could be tuned, although a 'proper' threadpool - // should probably be used instead) - std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2) - }); + let num_threads = num_threads + .map(NonZeroUsize::get) + .unwrap_or_else(|| { + // Fallback onto a heuristic. This tries to not to use all threads. + // We keep the main thread blocked and not doing much whilst this is running, + // so we broadly leave two cores unused at the point of maximum parallelism + // (This choice is arbitrary, and could be tuned, although a 'proper' threadpool + // should probably be used instead) + std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2) + }) + .min(new_shaders.len()); eprintln!("Initialising in parallel using {num_threads} threads"); - let remainder = new_shaders.split_off(new_shaders.len().max(num_threads) - num_threads); + let remainder = new_shaders.split_off(num_threads); let (tx, rx) = std::sync::mpsc::channel::<(ShaderId, WgpuShader)>(); // We expect each initialisation to take much longer than acquiring a lock, so we just use a mutex for our work queue