diff --git a/brian2cuda/cuda_prefs.py b/brian2cuda/cuda_prefs.py
index ce4aa2c3..34fe7bba 100644
--- a/brian2cuda/cuda_prefs.py
+++ b/brian2cuda/cuda_prefs.py
@@ -173,7 +173,15 @@ def validate_bundle_size_expression(string):
         application. Since this avoids race conditions, effect application
         can be parallelised.''',
         validator=lambda v: isinstance(v, bool),
-        default=True)
+        default=True),
+
+    default_threads_per_block=BrianPreference(
+        docs='''If set, this overwrites the threads per block chosen by
+        `cudaOccupancyMaxPotentialBlockSize`, which appears to not always choose the
+        optimal threads per block. This needs fixing, see #266.''',
+        validator=lambda v: isinstance(v, int) or v is None,
+        default=None
+    )
 )
 
 prefs.register_preferences(
diff --git a/brian2cuda/device.py b/brian2cuda/device.py
index 32e752f4..5a28dfdf 100644
--- a/brian2cuda/device.py
+++ b/brian2cuda/device.py
@@ -254,6 +254,7 @@ def code_object(self, owner, name, abstract_code, variables, template_name,
             template_kwds = dict(template_kwds)
         template_kwds['profiled'] = self.enable_profiling
         template_kwds['bundle_mode'] = prefs["devices.cuda_standalone.push_synapse_bundles"]
+        template_kwds['default_threads_per_block'] = prefs["devices.cuda_standalone.default_threads_per_block"]
         no_or_const_delay_mode = False
         if isinstance(owner, (SynapticPathway, Synapses)) and "delay" in owner.variables and owner.variables["delay"].scalar:
             # catches Synapses(..., delay=...) syntax, does not catch the case when no delay is specified at all
diff --git a/brian2cuda/templates/common_group.cu b/brian2cuda/templates/common_group.cu
index ac9543e5..c2faeed2 100644
--- a/brian2cuda/templates/common_group.cu
+++ b/brian2cuda/templates/common_group.cu
@@ -187,14 +187,18 @@ void _run_{{codeobj_name}}()
     {% block prepare_kernel_inner %}
     // get number of blocks and threads
     {% if calc_occupancy %}
+
+    {% if default_threads_per_block %}
+    num_threads = {{default_threads_per_block}};
+    {% else %}
 
     int min_num_blocks; // The minimum grid size needed to achieve the
-                        // maximum occupancy for a full device launch
+                        // maximum occupancy for a full device launch
     CUDA_SAFE_CALL(
             cudaOccupancyMaxPotentialBlockSize(&min_num_blocks, &num_threads,
                 _run_kernel_{{codeobj_name}}, 0, 0)  // last args: dynamicSMemSize, blockSizeLimit
             );
-
+    {% endif %}
 
 
     // Round up according to array size
diff --git a/brian2cuda/templates/spatialstateupdate.cu b/brian2cuda/templates/spatialstateupdate.cu
index e86fbfa5..e3b3159b 100644
--- a/brian2cuda/templates/spatialstateupdate.cu
+++ b/brian2cuda/templates/spatialstateupdate.cu
@@ -492,13 +492,17 @@ __global__ void _currents_kernel_{{codeobj_name}}(
     // calculate number of threads that maximize occupancy
     // and also the corresponding number of blocks
    // the code below is adapted from common_group.cu
+    {% if default_threads_per_block %}
+    num_threads_currents = {{default_threads_per_block}};
+    {% else %}
     int min_num_blocks_currents; // The minimum grid size needed to achieve the
-                                 // maximum occupancy for a full device launch
+                                 // maximum occupancy for a full device launch
     CUDA_SAFE_CALL(
             cudaOccupancyMaxPotentialBlockSize(&min_num_blocks_currents, &num_threads_currents,
                 _currents_kernel_{{codeobj_name}}, 0, 0)  // last args: dynamicSMemSize, blockSizeLimit
             );
 
+    {% endif %}
 
     // Round up according to array size
     num_blocks_currents = (_N + num_threads_currents - 1) / num_threads_currents;
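
Usage sketch (illustrative, not part of the patch): assuming the preference is registered under the `devices.cuda_standalone` category as above, a simulation script could override the occupancy-derived block size roughly as follows. The neuron model and the value 128 are arbitrary placeholders, not taken from the patch.

# Minimal sketch, assuming brian2 and brian2cuda are installed and the
# preference is exposed as "devices.cuda_standalone.default_threads_per_block".
from brian2 import NeuronGroup, run, prefs, set_device, ms
import brian2cuda  # importing brian2cuda registers the "cuda_standalone" device

set_device("cuda_standalone")

# Override the threads-per-block value that cudaOccupancyMaxPotentialBlockSize
# would otherwise choose. 128 is an arbitrary example; any positive multiple
# of the warp size (32) is a typical choice.
prefs["devices.cuda_standalone.default_threads_per_block"] = 128

group = NeuronGroup(1000, "dv/dt = -v / (10*ms) : 1", method="exact")
run(10 * ms)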