diff --git a/brian2cuda/cuda_prefs.py b/brian2cuda/cuda_prefs.py
index ce4aa2c3..34fe7bba 100644
--- a/brian2cuda/cuda_prefs.py
+++ b/brian2cuda/cuda_prefs.py
@@ -173,7 +173,15 @@ def validate_bundle_size_expression(string):
         application. Since this avoids race conditions, effect application
         can be parallelised.''',
         validator=lambda v: isinstance(v, bool),
-        default=True)
+        default=True),
+
+    default_threads_per_block=BrianPreference(
+        docs='''If set, this overwrites the threads per block chosen by
+        `cudaOccupancyMaxPotentialBlockSize`, which appears to not always choose the
+        optimal threads per block. This needs fixing, see #266.''',
+        validator=lambda v: isinstance(v, int) or v is None,
+        default=None
+    )
 )
 
 prefs.register_preferences(
diff --git a/brian2cuda/device.py b/brian2cuda/device.py
index 32e752f4..5a28dfdf 100644
--- a/brian2cuda/device.py
+++ b/brian2cuda/device.py
@@ -254,6 +254,7 @@ def code_object(self, owner, name, abstract_code, variables, template_name,
             template_kwds = dict(template_kwds)
         template_kwds['profiled'] = self.enable_profiling
         template_kwds['bundle_mode'] = prefs["devices.cuda_standalone.push_synapse_bundles"]
+        template_kwds['default_threads_per_block'] = prefs["devices.cuda_standalone.default_threads_per_block"]
         no_or_const_delay_mode = False
         if isinstance(owner, (SynapticPathway, Synapses)) and "delay" in owner.variables and owner.variables["delay"].scalar:
             # catches Synapses(..., delay=...) syntax, does not catch the case when no delay is specified at all
diff --git a/brian2cuda/templates/common_group.cu b/brian2cuda/templates/common_group.cu
index ac9543e5..c2faeed2 100644
--- a/brian2cuda/templates/common_group.cu
+++ b/brian2cuda/templates/common_group.cu
@@ -187,14 +187,18 @@ void _run_{{codeobj_name}}()
     {% block prepare_kernel_inner %}
     // get number of blocks and threads
     {% if calc_occupancy %}
+
+    {% if default_threads_per_block %}
+    num_threads = {{default_threads_per_block}};
+    {% else %}
 
     int min_num_blocks; // The minimum grid size needed to achieve the
-                        // maximum occupancy for a full device launch
+                        // maximum occupancy for a full device launch
     CUDA_SAFE_CALL(
             cudaOccupancyMaxPotentialBlockSize(&min_num_blocks, &num_threads,
                 _run_kernel_{{codeobj_name}}, 0, 0)  // last args: dynamicSMemSize, blockSizeLimit
             );
-
+    {% endif %}
 
 
     // Round up according to array size
diff --git a/brian2cuda/templates/spatialstateupdate.cu b/brian2cuda/templates/spatialstateupdate.cu
index e86fbfa5..e3b3159b 100644
--- a/brian2cuda/templates/spatialstateupdate.cu
+++ b/brian2cuda/templates/spatialstateupdate.cu
@@ -492,13 +492,17 @@ __global__ void _currents_kernel_{{codeobj_name}}(
     // calculate number of threads that maximize occupancy
     // and also the corresponding number of blocks
    // the code below is adapted from common_group.cu
+    {% if default_threads_per_block %}
+    num_threads_currents = {{default_threads_per_block}};
+    {% else %}
     int min_num_blocks_currents; // The minimum grid size needed to achieve the
-                                 // maximum occupancy for a full device launch
+                                 // maximum occupancy for a full device launch
     CUDA_SAFE_CALL(
             cudaOccupancyMaxPotentialBlockSize(&min_num_blocks_currents, &num_threads_currents,
                 _currents_kernel_{{codeobj_name}}, 0, 0)  // last args: dynamicSMemSize, blockSizeLimit
             );
 
+    {% endif %}
 
     // Round up according to array size
     num_blocks_currents = (_N + num_threads_currents - 1) / num_threads_currents;
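
Usage sketch (illustrative, not part of the patch): assuming the preference is registered under the `devices.cuda_standalone` category as above, a simulation script could override the occupancy-derived block size roughly as follows. The neuron model and the value 128 are arbitrary placeholders, not taken from the patch.

# Minimal sketch, assuming brian2 and brian2cuda are installed and the
# preference is exposed as "devices.cuda_standalone.default_threads_per_block".
from brian2 import NeuronGroup, run, prefs, set_device, ms
import brian2cuda  # importing brian2cuda registers the "cuda_standalone" device

set_device("cuda_standalone")

# Override the threads-per-block value that cudaOccupancyMaxPotentialBlockSize
# would otherwise choose. 128 is an arbitrary example; any positive multiple
# of the warp size (32) is a typical choice.
prefs["devices.cuda_standalone.default_threads_per_block"] = 128

group = NeuronGroup(1000, "dv/dt = -v / (10*ms) : 1", method="exact")
run(10 * ms)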