diff --git a/brian2cuda/device.py b/brian2cuda/device.py
index 41b88a09..32209d7d 100644
--- a/brian2cuda/device.py
+++ b/brian2cuda/device.py
@@ -78,11 +78,6 @@ def __init__(self):
         self.include_dirs.remove('brianlib/randomkit')
         self.library_dirs.remove('brianlib/randomkit')
 
-        # Add code line slots used in our benchmarks
-        # TODO: Add to brian2 and remove here
-        self.code_lines.update({'before_network_run': [],
-                                'after_network_run': []})
-
         ### Attributes specific to CUDAStandaloneDevice:
         # only true during first run call (relevant for synaptic pre/post ID deletion)
         self.first_run = True
@@ -94,6 +89,8 @@ def __init__(self):
         # list of pre/post ID arrays that are not needed in device memory
         self.delete_synaptic_pre = {}
         self.delete_synaptic_post = {}
+        # dictionary to store parallalelization information
+        self.stream_info = {}
         # The following nested dictionary collects all codeobjects that use random
         # number generation (RNG).
         self.codeobjects_with_rng = {
@@ -351,18 +348,6 @@ def code_object(self, owner, name, abstract_code, variables, template_name,
                     f"Using atomics in synaptic effect application of Synapses object "
                     f"{name}"
                 )
-            threads_expr = prefs.devices.cuda_standalone.threads_per_synapse_bundle
-            pathway_name = template_kwds['pathway'].name
-            replace_expr = {
-                '{mean}': f'{pathway_name}_bundle_size_mean',
-                '{std}': f'{pathway_name}_bundle_size_std',
-                '{min}': f'{pathway_name}_bundle_size_min',
-                '{max}': f'{pathway_name}_bundle_size_max',
-            }
-            for old, new in replace_expr.items():
-                threads_expr = threads_expr.replace(old, new)
-            template_kwds["threads_per_synapse_bundle"] = threads_expr
-            template_kwds["bundle_threads_warp_multiple"] = prefs.devices.cuda_standalone.bundle_threads_warp_multiple
         if template_name in ["synapses_create_generator", "synapses_create_array"]:
             if owner.multisynaptic_index is not None:
                 template_kwds["multisynaptic_idx_var"] = owner.variables[owner.multisynaptic_index]
@@ -376,6 +361,7 @@ def code_object(self, owner, name, abstract_code, variables, template_name,
         template_kwds["sm_multiplier"] = prefs["devices.cuda_standalone.SM_multiplier"]
         template_kwds["syn_launch_bounds"] = prefs["devices.cuda_standalone.syn_launch_bounds"]
         template_kwds["calc_occupancy"] = prefs["devices.cuda_standalone.calc_occupancy"]
+        template_kwds["stream_info"] = self.stream_info
         if template_name in ["threshold", "spikegenerator"]:
             template_kwds["extra_threshold_kernel"] = prefs["devices.cuda_standalone.extra_threshold_kernel"]
         codeobj = super(CUDAStandaloneDevice, self).code_object(owner, name, abstract_code, variables,
@@ -391,7 +377,7 @@ def check_openmp_compatible(self, nb_threads):
         if nb_threads > 0:
             raise NotImplementedError("Using OpenMP in a CUDA standalone project is not supported")
 
-    def generate_objects_source(self, writer, arange_arrays, synapses, static_array_specs, networks):
+    def generate_objects_source(self, writer, arange_arrays, synapses, static_array_specs, networks, stream_info):
         sm_multiplier = prefs.devices.cuda_standalone.SM_multiplier
         num_parallel_blocks = prefs.devices.cuda_standalone.parallel_blocks
         curand_generator_type = prefs.devices.cuda_standalone.random_number_generator_type
@@ -410,6 +396,9 @@ def generate_objects_source(self, writer, arange_arrays, synapses, static_array_
         for syn in synapses:
             if syn.multisynaptic_index is not None:
                 multisyn_vars.append(syn.variables[syn.multisynaptic_index])
+        # get number of unique streams
+
+        num_stream = max(Counter(stream_info).values())
         arr_tmp = self.code_object_class().templater.objects(
                         None, None,
                         array_specs=self.arrays,
@@ -433,7 +422,8 @@ def generate_objects_source(self, writer, arange_arrays, synapses, static_array_
                         spikegenerator_eventspaces=self.spikegenerator_eventspaces,
                         multisynaptic_idx_vars=multisyn_vars,
                         profiled_codeobjects=self.profiled_codeobjects,
-                        profile_statemonitor_copy_to_host=prefs.devices.cuda_standalone.profile_statemonitor_copy_to_host)
+                        parallelize=True,
+                        stream_size=num_stream)
         # Reinsert deleted entries, in case we use self.arrays later? maybe unnecassary...
         self.arrays.update(self.eventspace_arrays)
         writer.write('objects.*', arr_tmp)
@@ -463,7 +453,8 @@ def generate_main_source(self, writer):
                         # For codeobjects run every tick, this happens in the init() of
                         # the random number buffer called at first clock cycle of the network
                         main_lines.append('random_number_buffer.ensure_enough_curand_states();')
-                main_lines.append(f'_run_{codeobj.name}();')
+                # add stream - default
+                main_lines.append(f'_run_{codeobj.name}(0);')
             elif func == 'after_run_code_object':
                 codeobj, = args
                 main_lines.append(f'_after_run_{codeobj.name}();')
@@ -944,13 +935,13 @@ def _replace_constants_and_parameters(code):
                 sub = 't - lastupdate'
                 if sub in code:
                     code = code.replace(sub, f'float({sub})')
-                    logger.debug(f"Replaced {sub} with float({sub}) in {codeobj.name}")
+                    logger.debug(f"Replaced {sub} with float({sub}) in {codeobj}")
                 # replace double-precision floating-point literals with their
                 # single-precision version (e.g. `1.0` -> `1.0f`)
                 code = replace_floating_point_literals(code)
                 logger.debug(
                     f"Replaced floating point literals by single precision version "
-                    f"(appending `f`) in {codeobj.name}."
+                    f"(appending `f`) in {codeobj}."
                 )
 
             writer.write('code_objects/'+codeobj.name+'.cu', code)
@@ -1004,10 +995,14 @@ def generate_network_source(self, writer):
         maximum_run_time = self._maximum_run_time
         if maximum_run_time is not None:
             maximum_run_time = float(maximum_run_time)
+        num_stream = max(Counter(self.stream_info).values())
         network_tmp = self.code_object_class().templater.network(None, None,
                                                                  maximum_run_time=maximum_run_time,
                                                                  eventspace_arrays=self.eventspace_arrays,
-                                                                 spikegenerator_eventspaces=self.spikegenerator_eventspaces)
+                                                                 spikegenerator_eventspaces=self.spikegenerator_eventspaces,
+                                                                 parallelize = True,
+                                                                 stream_info = self.stream_info,
+                                                                 num_stream= num_stream)
         writer.write('network.*', network_tmp)
 
     def generate_synapses_classes_source(self, writer):
@@ -1328,7 +1323,7 @@ def build(self, directory='output',
 
         self.generate_objects_source(self.writer, self.arange_arrays,
                                      net_synapses, self.static_array_specs,
-                                     self.networks)
+                                     self.networks, self.stream_info)
         self.generate_network_source(self.writer)
         self.generate_synapses_classes_source(self.writer)
         self.generate_run_source(self.writer)
@@ -1355,6 +1350,11 @@ def build(self, directory='output',
             if "devices.cuda_standalone" in pref_name:
                 logger.info(f"\t{pref_name} = {prefs[pref_name]}")
 
+        logger.debug("Using the following brian preferences:")
+        for pref_name in prefs:
+            if pref_name not in prefs:
+                logger.debug(f"\t{pref_name} = {prefs[pref_name]}")
+
         if compile:
             self.compile_source(directory, cpp_compiler, debug, clean)
             if run:
@@ -1395,6 +1395,25 @@ def network_run(self, net, duration, report=None, report_period=10*second,
         self.clocks.update(net._clocks)
         net.t_ = float(t_end)
 
+
+        # Create dictionary for parallelisation with stream
+        streams_organization = defaultdict(list)
+        for obj in net.sorted_objects:
+            streams_organization[(obj.when, obj.order)].append(obj)
+
+        # associate each code object with a particular stream
+        streams_details = defaultdict(list)
+        count = 1
+        for key in streams_organization:
+            for object in streams_organization[key]:
+                streams_details[object.name] = count
+            count +=1
+
+        self.stream_info = streams_details
+        self.stream_info['default'] = 0
+
+
+
         # TODO: remove this horrible hack
         for clock in self.clocks:
             if clock.name=='clock':
@@ -1529,11 +1548,21 @@ def network_run(self, net, duration, report=None, report_period=10*second,
 
         # create all random numbers needed for the next clock cycle
         for clock in net._clocks:
-            run_lines.append(f'{net.name}.add(&{clock.name}, _run_random_number_buffer);')
+            run_lines.append(f'{net.name}.add(&{clock.name}, _run_random_number_buffer, {self.stream_info["default"]});')
 
         all_clocks = set()
+        # TODO add for every code object ->  add where in the list are there.
+        # TODO create new dic (code object, position in list)
         for clock, codeobj in code_objects:
-            run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name});')
+            # add this position as additional number here
+            # check if codeobj.name has _codeobject in it
+            name = codeobj.name
+            if "_codeobject" in codeobj.name:
+                name = codeobj.name[:-11]
+            if name in self.stream_info.keys():
+                run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name}, {self.stream_info[name]});')
+            else:
+                run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name}, {self.stream_info["default"]});')
             all_clocks.add(clock)
 
         # Under some rare circumstances (e.g. a NeuronGroup only defining a
@@ -1558,12 +1587,12 @@ def network_run(self, net, duration, report=None, report_period=10*second,
         # release version? (TODO: add via insert_code mechanism)
         run_lines.append('CUDA_SAFE_CALL(cudaProfilerStart());')
 
-        run_lines.extend(self.code_lines['before_network_run'])
+        run_lines.extend(self.code_lines['before_run'])
         # run everything that is run on a clock
         run_lines.append(
             f'{net.name}.run({float(duration)!r}, {report_call}, {float(report_period)!r});'
         )
-        run_lines.extend(self.code_lines['after_network_run'])
+        run_lines.extend(self.code_lines['after_run'])
         # for multiple runs, the random number buffer needs to be reset
         run_lines.append('random_number_buffer.run_finished();')
         # nvprof stuff
@@ -1840,4 +1869,4 @@ def prepare_codeobj_code_for_rng(codeobj):
 
 cuda_standalone_device = CUDAStandaloneDevice()
 
-all_devices['cuda_standalone'] = cuda_standalone_device
+all_devices['cuda_standalone'] = cuda_standalone_device
\ No newline at end of file
diff --git a/brian2cuda/templates/common_group.cu b/brian2cuda/templates/common_group.cu
index 7c069d2b..25d11c68 100644
--- a/brian2cuda/templates/common_group.cu
+++ b/brian2cuda/templates/common_group.cu
@@ -63,8 +63,8 @@ namespace {
        (e.g. _host_rand used in _poisson), but we can't put support_code_lines lines
        after block random_functions since random_functions can use functions defined in
        support_code_lines (e.g. _rand) #}
-    randomNumber_t _host_rand(const int _vectorisation_idx);
-    randomNumber_t _host_randn(const int _vectorisation_idx);
+    double _host_rand(const int _vectorisation_idx);
+    double _host_randn(const int _vectorisation_idx);
     int32_t _host_poisson(double _lambda, const int _vectorisation_idx);
 
     ///// block extra_device_helper /////
@@ -77,13 +77,13 @@ namespace {
     {% block random_functions %}
     // Implement dummy functions such that the host compiled code of binomial
     // functions works. Hacky, hacky ...
-    randomNumber_t _host_rand(const int _vectorisation_idx)
+    double _host_rand(const int _vectorisation_idx)
     {
         printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__,
                 __LINE__);
         exit(EXIT_FAILURE);
     }
-    randomNumber_t _host_randn(const int _vectorisation_idx)
+    double _host_randn(const int _vectorisation_idx)
     {
         printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__,
                 __LINE__);
@@ -109,8 +109,6 @@ __launch_bounds__(1024, {{sm_multiplier}})
 _run_kernel_{{codeobj_name}}(
     int _N,
     int THREADS_PER_BLOCK,
-    {% block extra_kernel_parameters %}
-    {% endblock %}
     ///// KERNEL_PARAMETERS /////
     %KERNEL_PARAMETERS%
     )
@@ -119,11 +117,8 @@ _run_kernel_{{codeobj_name}}(
 
     int tid = threadIdx.x;
     int bid = blockIdx.x;
-
-    {% block indices %}
     int _idx = bid * THREADS_PER_BLOCK + tid;
     int _vectorisation_idx = _idx;
-    {% endblock %}
 
     ///// KERNEL_CONSTANTS /////
     %KERNEL_CONSTANTS%
@@ -136,10 +131,12 @@ _run_kernel_{{codeobj_name}}(
     {% block additional_variables %}
     {% endblock %}
 
-    if(_vectorisation_idx >= _N)
+    {% block num_thread_check %}
+    if(_idx >= _N)
     {
         return;
     }
+    {% endblock %}
 
     {% block kernel_maincode %}
 
@@ -158,7 +155,7 @@ _run_kernel_{{codeobj_name}}(
 {% endblock kernel %}
 
 
-void _run_{{codeobj_name}}()
+void _run_{{codeobj_name}}(cudaStream_t stream)
 {
     using namespace brian;
 
@@ -182,9 +179,7 @@ void _run_{{codeobj_name}}()
     {% endblock %}
 
     {% block prepare_kernel %}
-    {% block static_kernel_dimensions %}
     static int num_threads, num_blocks;
-    {% endblock %}
     static size_t needed_shared_memory = 0;
     static bool first_run = true;
     if (first_run)
@@ -273,11 +268,9 @@ void _run_{{codeobj_name}}()
         else
         {
             printf("INFO _run_kernel_{{codeobj_name}}\n"
-                   {% block kernel_info_num_blocks_str %}
                    "\t%u blocks\n"
-                   {% endblock %}
                    "\t%u threads\n"
-                   "\t%i registers per thread\n"
+                   "\t%i registers per block\n"
                    "\t%i bytes statically-allocated shared memory per block\n"
                    "\t%i bytes local memory per thread\n"
                    "\t%i bytes user-allocated constant memory\n"
@@ -286,10 +279,7 @@ void _run_{{codeobj_name}}()
                    {% else %}
                    "",
                    {% endif %}
-                   {% block kernel_info_num_blocks_var %}
-                   num_blocks,
-                   {% endblock %}
-                   num_threads, funcAttrib.numRegs,
+                   num_blocks, num_threads, funcAttrib.numRegs,
                    funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes,
                    funcAttrib.constSizeBytes{% if calc_occupancy %}, occupancy{% endif %});
         }
@@ -302,11 +292,9 @@ void _run_{{codeobj_name}}()
     {% endblock %}
 
     {% block kernel_call %}
-    _run_kernel_{{codeobj_name}}<<<num_blocks, num_threads>>>(
+    _run_kernel_{{codeobj_name}}<<<num_blocks, num_threads, 0, stream>>>(
             _N,
             num_threads,
-            {% block extra_host_parameters %}
-            {% endblock %}
             ///// HOST_PARAMETERS /////
             %HOST_PARAMETERS%
         );
@@ -338,7 +326,7 @@ void _run_{{codeobj_name}}()
 #ifndef _INCLUDED_{{codeobj_name}}
 #define _INCLUDED_{{codeobj_name}}
 
-void _run_{{codeobj_name}}();
+void _run_{{codeobj_name}}(cudaStream_t);
 
 {% block extra_functions_h %}
 {% endblock %}
@@ -374,7 +362,7 @@ void _after_run_{{codeobj_name}}()
 }
 {% endmacro %}
 
-
+// {{codeobj_name}}
 {% macro after_run_h_file() %}
 #ifndef _INCLUDED_{{codeobj_name}}_after
 #define _INCLUDED_{{codeobj_name}}_affer
@@ -382,4 +370,4 @@ void _after_run_{{codeobj_name}}()
 void _after_run_{{codeobj_name}}();
 
 #endif
-{% endmacro %}
+{% endmacro %}
\ No newline at end of file
diff --git a/brian2cuda/templates/makefile b/brian2cuda/templates/makefile
index 6bcf06a1..7b3bc76f 100644
--- a/brian2cuda/templates/makefile
+++ b/brian2cuda/templates/makefile
@@ -7,7 +7,7 @@ OBJS := ${OBJS:.cpp=.o}
 OBJS := ${OBJS:.c=.o}
 NVCC = @{{ nvcc_path }} -ccbin $(CXX)
 NVCCFLAGS = -I. -std=c++11 {{gpu_arch_flags}} {{nvcc_compiler_flags}} {{compiler_debug_flags}} -Xcompiler "{{cpp_compiler_flags}}"
-LFLAGS = -lcurand -I. {{gpu_arch_flags}} {{cpp_linker_flags}} {{linker_debug_flags}}
+LFLAGS = -lcurand -lcudart -I. {{gpu_arch_flags}} {{cpp_linker_flags}} {{linker_debug_flags}}
 
 all: $(PROGRAM)
 
@@ -20,4 +20,4 @@ clean:
 	{{rm_cmd}}
 
 %.o : %.cu
-	$(NVCC) $(NVCCFLAGS) -dc $< -o $@
+	$(NVCC) $(NVCCFLAGS) -dc $< -o $@
\ No newline at end of file
diff --git a/brian2cuda/templates/network.cu b/brian2cuda/templates/network.cu
index 51723178..45a129f7 100644
--- a/brian2cuda/templates/network.cu
+++ b/brian2cuda/templates/network.cu
@@ -14,10 +14,18 @@
 
 double Network::_last_run_time = 0.0;
 double Network::_last_run_completed_fraction = 0.0;
+{% if parallelize %}
+cudaStream_t custom_stream[{{num_stream}}];
+{% endif %}
 
 Network::Network()
 {
     t = 0.0;
+    {% if parallelize %}
+    for(int i=0;i<{{num_stream}};i++){
+        CUDA_SAFE_CALL(cudaStreamCreate(&(custom_stream[i])));
+    }
+    {% endif %}
 }
 
 void Network::clear()
@@ -25,12 +33,14 @@ void Network::clear()
     objects.clear();
 }
 
-void Network::add(Clock *clock, codeobj_func func)
+// TODO have to makr change in objects - make it a tuple
+// make decision which bject has which stream
+void Network::add(Clock *clock, codeobj_func func, int group_num)
 {
 #if defined(_MSC_VER) && (_MSC_VER>=1700)
-    objects.push_back(std::make_pair(std::move(clock), std::move(func)));
+    objects.push_back(std::make_tuple(std::move(clock), std::move(func), std::move(group_num)));
 #else
-    objects.push_back(std::make_pair(clock, func));
+    objects.push_back(std::make_tuple(clock, func, group_num));
 #endif
 }
 
@@ -56,7 +66,7 @@ void Network::run(const double duration, void (*report_func)(const double, const
     Clock* clock = next_clocks();
     double elapsed_realtime;
     bool did_break_early = false;
-
+    //TODO here
     while(clock && clock->running())
     {
         t = clock->t[0];
@@ -73,17 +83,42 @@ void Network::run(const double duration, void (*report_func)(const double, const
                     next_report_time += report_period;
                 }
             }
-            Clock *obj_clock = objects[i].first;
+            // TODO tuple of clock and function
+            //Clock *obj_clock = objects[i].first;
+            Clock *obj_clock = std::get<0>(objects[i]);
+            int group_int = std::get<2>(objects[i]);
             // Only execute the object if it uses the right clock for this step
             if (curclocks.find(obj_clock) != curclocks.end())
             {
-                codeobj_func func = objects[i].second;
+               // function -> whixh is in templates like common_group.cu
+               // sort the code object - waiting mechanism between groups
+               // cudaEvent or cudaSynchronise
+                //codeobj_func func = objects[i].second;
+                codeobj_func func = std::get<1>(objects[i]);
+                int func_group_int = std::get<2>(objects[i]);
                 if (func)  // code objects can be NULL in cases where we store just the clock
                 {
-                    func();
+                      func_groups[func_group_int].push_back(func);
+                      //func_groups.push_back(std::make_pair(func_group_int,func));
+                    //func();
+                    // [[func1,func2,func3],[func4...]]
                 }
             }
         }
+
+        // get maximum in objects.cu array
+
+        // go through each list of func group - 2 loops
+        for(int i=0; i<func_groups.size(); i++){
+            for(int j=0; j<func_groups[i].size(); j++){
+                codeobj_func func = func_groups[i][j];
+                func(custom_stream[j]);
+            }
+            // reset the func group for that sub stream
+            cudaDeviceSynchronize();
+            func_groups[i].resize(0);
+        }
+
         for(std::set<Clock*>::iterator i=curclocks.begin(); i!=curclocks.end(); i++)
             (*i)->tick();
         clock = next_clocks();
@@ -129,7 +164,8 @@ void Network::compute_clocks()
     clocks.clear();
     for(int i=0; i<objects.size(); i++)
     {
-        Clock *clock = objects[i].first;
+        Clock *clock = std::get<0>(objects[i]);
+        // Clock *clock = std::get<0>()objects[i].first;
         clocks.insert(clock);
     }
 }
@@ -174,7 +210,7 @@ Clock* Network::next_clocks()
 #include <ctime>
 #include "brianlib/clocks.h"
 
-typedef void (*codeobj_func)();
+typedef void (*codeobj_func)(cudaStream_t);
 
 class Network
 {
@@ -182,17 +218,25 @@ class Network
     void compute_clocks();
     Clock* next_clocks();
 public:
-    std::vector< std::pair< Clock*, codeobj_func > > objects;
+// TODO vectory of tuples having clock , codeobj_func and stread integer
+    std::vector< std::tuple< Clock*, codeobj_func, int > > objects;
+    //std::vector< std::pair< Clock*, codeobj_func > > objects;
+    std::vector<std::vector<codeobj_func >> func_groups = std::vector<std::vector<codeobj_func >>({{num_stream}});
+    //std::vector<std::pair< int, codeobj_func >> func_groups;
     double t;
     static double _last_run_time;
     static double _last_run_completed_fraction;
+    int num_streams;
+    {% if parallelize %}
+    cudaStream_t custom_stream[{{num_stream}}];
+    {% endif %}
 
     Network();
     void clear();
-    void add(Clock *clock, codeobj_func func);
+    void add(Clock *clock, codeobj_func func, int num_streams);
     void run(const double duration, void (*report_func)(const double, const double, const double, const double), const double report_period);
 };
 
 #endif
 
-{% endmacro %}
+{% endmacro %}
\ No newline at end of file
diff --git a/brian2cuda/templates/objects.cu b/brian2cuda/templates/objects.cu
index ea8fe158..c69e00b1 100644
--- a/brian2cuda/templates/objects.cu
+++ b/brian2cuda/templates/objects.cu
@@ -40,6 +40,12 @@ const int brian::_num_{{varname}} = {{var.size}};
 {% endif %}
 {% endfor %}
 
+
+///////////////// array of streams for parallelization //////////////////////////
+// {% if parallelize %}
+// cudaStream_t brian::custom_stream[{{stream_size}}];
+// {% endif %}
+
 //////////////// eventspaces ///////////////
 // we dynamically create multiple eventspaces in no_or_const_delay_mode
 // for initiating the first spikespace, we need a host pointer
@@ -90,10 +96,8 @@ __device__ int* brian::{{path.name}}_num_synapses_by_bundle;
 __device__ int* brian::{{path.name}}_unique_delays;
 __device__ int* brian::{{path.name}}_synapses_offset_by_bundle;
 __device__ int* brian::{{path.name}}_global_bundle_id_start_by_pre;
-int brian::{{path.name}}_bundle_size_max = 0;
-int brian::{{path.name}}_bundle_size_min = 0;
-double brian::{{path.name}}_bundle_size_mean = 0;
-double brian::{{path.name}}_bundle_size_std = 0;
+int brian::{{path.name}}_max_bundle_size = 0;
+int brian::{{path.name}}_mean_bundle_size = 0;
 int brian::{{path.name}}_max_size = 0;
 __device__ int* brian::{{path.name}}_num_unique_delays_by_pre;
 int brian::{{path.name}}_max_num_unique_delays = 0;
@@ -199,6 +203,7 @@ void _init_arrays()
     {% else %}
     num_parallel_blocks = props.multiProcessorCount * {{sm_multiplier}};
     {% endif %}
+    printf("objects cu num par blocks %d\n", num_parallel_blocks);
     max_threads_per_block = props.maxThreadsPerBlock;
     max_threads_per_sm = props.maxThreadsPerMultiProcessor;
     max_shared_mem_size = props.sharedMemPerBlock;
@@ -227,6 +232,14 @@ void _init_arrays()
             );
     {% endif %}
 
+//     {% if parallelize %}
+//     for(int i=0;i<{{stream_size}};i++){
+//         CUDA_SAFE_CALL(cudaStreamCreate(&(custom_stream[i])));
+//     }
+//     {% endif %}
+
+
+
     // this sets seed for host and device api RNG
     random_number_buffer.set_seed(seed);
 
@@ -352,16 +365,10 @@ void _write_arrays()
     using namespace brian;
 
     {% for var, varname in array_specs | dictsort(by='value') %}
-    {% if not (var in dynamic_array_specs
-                or var in dynamic_array_2d_specs
-                or var in static_array_specs
-              ) %}
-    {# Don't copy StateMonitor's N variables, which are modified on host only #}
-    {% if not (var.owner.__class__.__name__ == 'StateMonitor' and var.name == 'N') %}
+    {% if not (var in dynamic_array_specs or var in dynamic_array_2d_specs or var in static_array_specs) %}
     CUDA_SAFE_CALL(
             cudaMemcpy({{varname}}, dev{{varname}}, sizeof({{c_data_type(var.dtype)}})*_num_{{varname}}, cudaMemcpyDeviceToHost)
             );
-    {% endif %}
     ofstream outfile_{{varname}};
     outfile_{{varname}}.open("{{get_array_filename(var) | replace('\\', '\\\\')}}", ios::binary | ios::out);
     if(outfile_{{varname}}.is_open())
@@ -376,11 +383,7 @@ void _write_arrays()
     {% endfor %}
 
     {% for var, varname in dynamic_array_specs | dictsort(by='value') %}
-    {# TODO: pass isinstance to Jinja template to make it available here #}
-    {% if not (var in multisynaptic_idx_vars
-                or var.name in ['delay', '_synaptic_pre', '_synaptic_post']
-                or (var.owner.__class__.__name__ == 'StateMonitor' and var.name == 't')
-              ) %}
+    {% if not var in multisynaptic_idx_vars and not var.name in ['delay', '_synaptic_pre', '_synaptic_post'] %}
     {{varname}} = dev{{varname}};
     {% endif %}
     ofstream outfile_{{varname}};
@@ -396,28 +399,15 @@ void _write_arrays()
     {% endfor %}
 
     {% for var, varname in dynamic_array_2d_specs | dictsort(by='value') %}
-        {% if profile_statemonitor_copy_to_host and var.owner.__class__.__name__ == 'StateMonitor' and var.name == profile_statemonitor_copy_to_host %}
-        {# Record copying statemonitor variable from device to host for benchmarking #}
-        std::clock_t before_copy_statemon;
-        string profile_statemonitor_copy_to_host_varname = "{{var.owner.name}}_copy_to_host_{{profile_statemonitor_copy_to_host}}";
-        double copy_time_statemon;
-        {% endif %}
         ofstream outfile_{{varname}};
         outfile_{{varname}}.open("{{get_array_filename(var) | replace('\\', '\\\\')}}", ios::binary | ios::out);
         if(outfile_{{varname}}.is_open())
         {
-            {% if profile_statemonitor_copy_to_host and var.owner.__class__.__name__ == 'StateMonitor' and var.name == profile_statemonitor_copy_to_host %}
-            before_copy_statemon = std::clock();
-            {% endif %}
             thrust::host_vector<{{c_data_type(var.dtype)}}>* temp_array{{varname}} = new thrust::host_vector<{{c_data_type(var.dtype)}}>[_num__array_{{var.owner.name}}__indices];
             for (int n=0; n<_num__array_{{var.owner.name}}__indices; n++)
             {
                 temp_array{{varname}}[n] = {{varname}}[n];
             }
-            {% if profile_statemonitor_copy_to_host and var.owner.__class__.__name__ == 'StateMonitor' and var.name == profile_statemonitor_copy_to_host %}
-            string profile_statemonitor_copy_to_host_varname = "{{varname}}_copy_to_host";
-            copy_time_statemon += (double)(std::clock() - before_copy_statemon) / CLOCKS_PER_SEC;
-            {% endif %}
             for(int j = 0; j < temp_array{{varname}}[0].size(); j++)
             {
                 for(int i = 0; i < _num__array_{{var.owner.name}}__indices; i++)
@@ -449,9 +439,6 @@ void _write_arrays()
     {% endif %}
     #}
     outfile_profiling_info << "{{codeobj}}\t" << {{codeobj}}_profiling_info << std::endl;
-    {% if profile_statemonitor_copy_to_host %}
-    outfile_profiling_info << profile_statemonitor_copy_to_host_varname << "\t" << copy_time_statemon << std::endl;
-    {% endif %}
     {% endfor %}
     outfile_profiling_info.close();
     } else
@@ -573,8 +560,8 @@ typedef {{curand_float_type}} randomNumber_t;  // random number type
 #include "network.h"
 #include "rand.h"
 
-#include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
 #include <curand.h>
 #include <curand_kernel.h>
 
@@ -625,6 +612,12 @@ extern thrust::device_vector<{{c_data_type(var.dtype)}}*> addresses_monitor_{{va
 extern thrust::device_vector<{{c_data_type(var.dtype)}}>* {{varname}};
 {% endfor %}
 
+//////////////// stream ////////////
+// {% if parallelize %}
+// extern cudaStream_t custom_stream[{{stream_size}}];
+// {% endif %}
+
+
 /////////////// static arrays /////////////
 {% for (name, dtype_spec, N, filename) in static_array_specs | sort %}
 {# arrays that are initialized from static data are already declared #}
@@ -646,10 +639,8 @@ extern __device__ int* {{path.name}}_num_synapses_by_bundle;
 extern __device__ int* {{path.name}}_unique_delays;
 extern __device__ int* {{path.name}}_synapses_offset_by_bundle;
 extern __device__ int* {{path.name}}_global_bundle_id_start_by_pre;
-extern int {{path.name}}_bundle_size_max;
-extern int {{path.name}}_bundle_size_min;
-extern double {{path.name}}_bundle_size_mean;
-extern double {{path.name}}_bundle_size_std;
+extern int {{path.name}}_max_bundle_size;
+extern int {{path.name}}_mean_bundle_size;
 extern int {{path.name}}_max_size;
 extern __device__ int* {{path.name}}_num_unique_delays_by_pre;
 extern int {{path.name}}_max_num_unique_delays;
@@ -723,4 +714,4 @@ void _dealloc_arrays();
 #endif
 
 
-{% endmacro %}
+{% endmacro %}
\ No newline at end of file
diff --git a/brian2cuda/templates/rand.cu b/brian2cuda/templates/rand.cu
index 08208d5a..06e7eae2 100644
--- a/brian2cuda/templates/rand.cu
+++ b/brian2cuda/templates/rand.cu
@@ -44,8 +44,9 @@ namespace {
 
 
 // need a function pointer for Network::add(), can't pass a pointer to a class
-// method, which is of different type
-void _run_random_number_buffer()
+// method, which is of different type. Random number buffer runs in default
+// stream always, the `stream` parameter is not used.
+void _run_random_number_buffer(cudaStream_t stream)
 {
     // random_number_buffer is a RandomNumberBuffer instance, declared in objects.cu
     random_number_buffer.next_time_step();
@@ -472,7 +473,7 @@ void RandomNumberBuffer::next_time_step()
 
 #include <curand.h>
 
-void _run_random_number_buffer();
+void _run_random_number_buffer(cudaStream_t);
 
 class RandomNumberBuffer
 {
@@ -562,4 +563,4 @@ public:
 
 #endif
 
-{% endmacro %}
+{% endmacro %}
\ No newline at end of file
diff --git a/brian2cuda/templates/synapses.cu b/brian2cuda/templates/synapses.cu
index fed9ef78..31a91903 100644
--- a/brian2cuda/templates/synapses.cu
+++ b/brian2cuda/templates/synapses.cu
@@ -25,7 +25,7 @@ _run_kernel_{{codeobj_name}}(
     int threads_per_bundle,
     {% endif %}
     int32_t* eventspace,
-    {% if uses_atomics or synaptic_effects == "synapse" %}
+    {% if uses_atomics %}
     int num_spiking_neurons,
     {% else %}
     int neurongroup_size,
@@ -72,7 +72,7 @@ _run_kernel_{{codeobj_name}}(
             {
                 // `spiking_neuron_idx` runs through the eventspace
                 // `post_block_idx` runs through the post neuron blocks of the connectivity matrix
-                {% if uses_atomics or synaptic_effects == "synapse" %}
+                {% if uses_atomics %}
                 int spiking_neuron_idx = bid / num_parallel_blocks;
                 int post_block_idx = bid % num_parallel_blocks;
                 {% else %}
@@ -87,7 +87,7 @@ _run_kernel_{{codeobj_name}}(
                     // spiking_neuron is index in NeuronGroup
                     int32_t spiking_neuron = eventspace[spiking_neuron_idx];
 
-                    {% if uses_atomics or synaptic_effects == "synapse" %}
+                    {% if uses_atomics %}
                     assert(spiking_neuron != -1);
                     {% else %}
                     if(spiking_neuron == -1) // end of spiking neurons
@@ -172,36 +172,6 @@ static int num_threads_per_bundle;
 static int num_loops;
 {% endblock %}
 
-{% block extra_device_helper %}
-int getThreadsPerBundle(){
-    {# Allow using std functions (ceil, floor...) in
-       prefs.device.cuda_standalone.threads_per_synapse_bundle #}
-    using namespace std;
-    using namespace brian;
-    int threads_per_bundle = static_cast<int>({{threads_per_synapse_bundle}});
-    {% if bundle_threads_warp_multiple %}
-    int multiple = threads_per_bundle / num_threads_per_warp;
-    {% if bundle_threads_warp_multiple == 'up' %}
-    int remainder = threads_per_bundle % num_threads_per_warp;
-    if (remainder != 0){
-        // if remainder is 0, just use thread_per_bundle as is
-        // round up to next multiple of warp size
-        threads_per_bundle = (multiple + 1) * num_threads_per_warp;
-    }
-    {% elif bundle_threads_warp_multiple == 'down' %}
-    // ignore remainder, round down to next muptiple of warp size
-    threads_per_bundle = multiple * num_threads_per_warp;
-    {% endif %}
-    {% endif %}
-
-    if (threads_per_bundle < 1){
-        threads_per_bundle = 1;
-    }
-    return threads_per_bundle;
-}
-{% endblock extra_device_helper %}
-
-
 {% block prepare_kernel_inner %}
 {#######################################################################}
 {% if uses_atomics or synaptic_effects == "synapse" %}
@@ -212,14 +182,13 @@ int getThreadsPerBundle(){
 {% endif %}
 num_blocks = num_parallel_blocks;
 num_threads = max_threads_per_block;
+// TODO: effect of mean instead of max?
 {% if bundle_mode %}
-//num_threads_per_bundle = {{pathway.name}}_bundle_size_max;
-num_threads_per_bundle = getThreadsPerBundle();
-printf("INFO _run_kernel_{{codeobj_name}}: Using %d threads per bundle\n", num_threads_per_bundle);
+num_threads_per_bundle = {{pathway.name}}_max_bundle_size;
 {% endif %}
 num_loops = 1;
 
-{% elif synaptic_effects == "target" %}{# not uses_atomics #}
+{% elif synaptic_effects == "target" %}
 // Synaptic effects modify target group variables but NO source group variables.
 num_blocks = num_parallel_blocks;
 num_loops = 1;
@@ -229,9 +198,7 @@ if (!{{owner.name}}_multiple_pre_post){
         num_threads = max_threads_per_block;
     {% if bundle_mode %}
     else  // heterogeneous delays
-        // Since we can only parallelize within each bundle, we use as many threads as
-        // the maximum bundle size
-        num_threads = {{pathway.name}}_bundle_size_max;
+        num_threads = {{pathway.name}}_max_bundle_size;
     {% endif %}
 }
 else {
@@ -279,7 +246,7 @@ else if ({{pathway.name}}_max_size <= 0)
 // only call kernel if we have synapses (otherwise we skipped the push kernel)
 if ({{pathway.name}}_max_size > 0)
 {
-    {% if uses_atomics or synaptic_effects == "synapse" %}
+    {% if uses_atomics %}
     int32_t num_spiking_neurons;
     // we only need the number of spiking neurons if we parallelise effect
     // application over spiking neurons in homogeneous delay mode
@@ -287,9 +254,10 @@ if ({{pathway.name}}_max_size > 0)
     {
         if (defaultclock.timestep[0] >= {{pathway.name}}_delay)
         {
-            cudaMemcpy(&num_spiking_neurons,
+            CUDA_SAFE_CALL(cudaMemcpyAsync(&num_spiking_neurons,
                     &dev{{_eventspace}}[{{pathway.name}}_eventspace_idx][_num_{{_eventspace}} - 1],
-                    sizeof(int32_t), cudaMemcpyDeviceToHost);
+                    sizeof(int32_t), cudaMemcpyDeviceToHost, stream));
+            CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
             num_blocks = num_parallel_blocks * num_spiking_neurons;
             //TODO collect info abt mean, std of num spiking neurons per time
             //step and print INFO at end of simulation
@@ -309,7 +277,7 @@ if ({{pathway.name}}_max_size > 0)
                 num_threads_per_bundle,
                 {% endif %}
                 dev{{_eventspace}}[{{pathway.name}}_eventspace_idx],
-                {% if uses_atomics or synaptic_effects == "synapse" %}
+                {% if uses_atomics %}
                 num_spiking_neurons,
                 {% else %}
                 _num_{{_eventspace}}-1,
@@ -318,7 +286,7 @@ if ({{pathway.name}}_max_size > 0)
                 %HOST_PARAMETERS%
             );
         }
-    {% if uses_atomics or synaptic_effects == "synapse" %}
+    {% if uses_atomics %}
     }
     {% endif %}
 
@@ -340,4 +308,4 @@ void _debugmsg_{{codeobj_name}}();
 
 {% macro main_finalise() %}
 _debugmsg_{{codeobj_name}}();
-{% endmacro %}
+{% endmacro %}
\ No newline at end of file
diff --git a/brian2cuda/templates/synapses_push_spikes.cu b/brian2cuda/templates/synapses_push_spikes.cu
index 16b408ce..655ec3a0 100644
--- a/brian2cuda/templates/synapses_push_spikes.cu
+++ b/brian2cuda/templates/synapses_push_spikes.cu
@@ -489,26 +489,13 @@ __global__ void _before_run_kernel_{{codeobj_name}}(
                 int synapses_start_idx = h_vec_unique_delay_start_idcs_by_pre[i][bundle_idx];
                 // find the number of synapses for this delay (bundle)
                 int num_synapses;
-                if (bundle_idx == num_unique_elements - 1){
+                if (bundle_idx == num_unique_elements - 1)
                     num_synapses = num_elements - synapses_start_idx;
-                }
-                else {
+                else
                     num_synapses = h_vec_unique_delay_start_idcs_by_pre[i][bundle_idx + 1] - synapses_start_idx;
-                }
                 h_num_synapses_by_bundle.push_back(num_synapses);
-
-                if (bundle_idx == 0){
-                    {{owner.name}}_bundle_size_min = num_synapses;
-                    {{owner.name}}_bundle_size_max = num_synapses;
-                }
-                else {
-                    if (num_synapses > {{owner.name}}_bundle_size_max){
-                        {{owner.name}}_bundle_size_max = num_synapses;
-                    }
-                    if (num_synapses < {{owner.name}}_bundle_size_min){
-                        {{owner.name}}_bundle_size_min = num_synapses;
-                    }
-                }
+                if (num_synapses > {{owner.name}}_max_bundle_size)
+                    {{owner.name}}_max_bundle_size = num_synapses;
 
                 // copy this bundle to device and store the device pointer
                 int32_t* d_this_bundle = d_ptr_synapse_ids + sum_bundle_sizes;
@@ -671,10 +658,8 @@ __global__ void _before_run_kernel_{{codeobj_name}}(
         // add num_bundle_ids as last entry
         h_global_bundle_id_start_by_pre[num_pre_post_blocks] = num_bundle_ids;
 
-        // floor(mean(h_num_synapses_by_bundle)) = sum_bundle_sizes / num_bundle_ids;
-        assert(std::floor(mean_bundle_sizes) == sum_bundle_sizes / num_bundle_ids);
-        {{owner.name}}_bundle_size_mean = mean_bundle_sizes;
-        {{owner.name}}_bundle_size_std = getStd(count_bundle_sizes, M2_bundle_sizes);
+        // floor(mean(h_num_synapses_by_bundle))
+        {{owner.name}}_mean_bundle_size = sum_bundle_sizes / num_bundle_ids;
 
         // pointer to start of synapse IDs array
         CUDA_SAFE_CALL(
@@ -851,7 +836,7 @@ __global__ void _before_run_kernel_{{codeobj_name}}(
         printf("INFO _before_run_kernel_{{codeobj_name}}\n"
                "\t%u blocks\n"
                "\t%u threads\n"
-               "\t%i registers per thread\n"
+               "\t%i registers per block\n"
                "\t%i bytes statically-allocated shared memory per block\n"
                "\t%i bytes local memory per thread\n"
                "\t%i bytes user-allocated constant memory\n"
@@ -1023,13 +1008,14 @@ void _run_{{codeobj_name}}()
         // get the number of spiking neurons
         int32_t num_spiking_neurons;
         CUDA_SAFE_CALL(
-                cudaMemcpy(&num_spiking_neurons,
+                cudaMemcpyAsync(&num_spiking_neurons,
                     dev{{_eventspace}}[current_idx{{_eventspace}}] + _num_{{owner.event}}space - 1,
                     sizeof(int32_t), cudaMemcpyDeviceToHost)
                 );
 
         // advance spike queues
-        _advance_kernel_{{codeobj_name}}<<<1, num_parallel_blocks>>>();
+        _advance_kernel_{{codeobj_name}}<<<1, num_parallel_blocks, 0, stream>>>();
+        CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
 
         CUDA_CHECK_ERROR("_advance_kernel_{{codeobj_name}}");
 
@@ -1037,11 +1023,6 @@ void _run_{{codeobj_name}}()
        block prepare_kernel and block kernel_call are executed in this else clause #}
 {% endblock host_maincode %}
 
-{% block kernel_info_num_blocks_str %}
-"\tvariable number of blocks (depends on number of spiking neurons)\n"
-{% endblock %}
-{% block kernel_info_num_blocks_var %}
-{% endblock %}
 
 {% block prepare_kernel_inner %}
     {% if not bundle_mode %}
@@ -1082,4 +1063,4 @@ void _run_{{codeobj_name}}()
             CUDA_CHECK_ERROR("_run_kernel_{{codeobj_name}}");
         }
     }  // end else if ({{owner.name}}_max_size > 0) {# from block host_maincode #}
-{% endblock kernel_call %}
+{% endblock kernel_call %}
\ No newline at end of file