diff --git a/brian2cuda/device.py b/brian2cuda/device.py index 41b88a09..32209d7d 100644 --- a/brian2cuda/device.py +++ b/brian2cuda/device.py @@ -78,11 +78,6 @@ def __init__(self): self.include_dirs.remove('brianlib/randomkit') self.library_dirs.remove('brianlib/randomkit') - # Add code line slots used in our benchmarks - # TODO: Add to brian2 and remove here - self.code_lines.update({'before_network_run': [], - 'after_network_run': []}) - ### Attributes specific to CUDAStandaloneDevice: # only true during first run call (relevant for synaptic pre/post ID deletion) self.first_run = True @@ -94,6 +89,8 @@ def __init__(self): # list of pre/post ID arrays that are not needed in device memory self.delete_synaptic_pre = {} self.delete_synaptic_post = {} + # dictionary to store parallalelization information + self.stream_info = {} # The following nested dictionary collects all codeobjects that use random # number generation (RNG). self.codeobjects_with_rng = { @@ -351,18 +348,6 @@ def code_object(self, owner, name, abstract_code, variables, template_name, f"Using atomics in synaptic effect application of Synapses object " f"{name}" ) - threads_expr = prefs.devices.cuda_standalone.threads_per_synapse_bundle - pathway_name = template_kwds['pathway'].name - replace_expr = { - '{mean}': f'{pathway_name}_bundle_size_mean', - '{std}': f'{pathway_name}_bundle_size_std', - '{min}': f'{pathway_name}_bundle_size_min', - '{max}': f'{pathway_name}_bundle_size_max', - } - for old, new in replace_expr.items(): - threads_expr = threads_expr.replace(old, new) - template_kwds["threads_per_synapse_bundle"] = threads_expr - template_kwds["bundle_threads_warp_multiple"] = prefs.devices.cuda_standalone.bundle_threads_warp_multiple if template_name in ["synapses_create_generator", "synapses_create_array"]: if owner.multisynaptic_index is not None: template_kwds["multisynaptic_idx_var"] = owner.variables[owner.multisynaptic_index] @@ -376,6 +361,7 @@ def code_object(self, owner, name, abstract_code, variables, template_name, template_kwds["sm_multiplier"] = prefs["devices.cuda_standalone.SM_multiplier"] template_kwds["syn_launch_bounds"] = prefs["devices.cuda_standalone.syn_launch_bounds"] template_kwds["calc_occupancy"] = prefs["devices.cuda_standalone.calc_occupancy"] + template_kwds["stream_info"] = self.stream_info if template_name in ["threshold", "spikegenerator"]: template_kwds["extra_threshold_kernel"] = prefs["devices.cuda_standalone.extra_threshold_kernel"] codeobj = super(CUDAStandaloneDevice, self).code_object(owner, name, abstract_code, variables, @@ -391,7 +377,7 @@ def check_openmp_compatible(self, nb_threads): if nb_threads > 0: raise NotImplementedError("Using OpenMP in a CUDA standalone project is not supported") - def generate_objects_source(self, writer, arange_arrays, synapses, static_array_specs, networks): + def generate_objects_source(self, writer, arange_arrays, synapses, static_array_specs, networks, stream_info): sm_multiplier = prefs.devices.cuda_standalone.SM_multiplier num_parallel_blocks = prefs.devices.cuda_standalone.parallel_blocks curand_generator_type = prefs.devices.cuda_standalone.random_number_generator_type @@ -410,6 +396,9 @@ def generate_objects_source(self, writer, arange_arrays, synapses, static_array_ for syn in synapses: if syn.multisynaptic_index is not None: multisyn_vars.append(syn.variables[syn.multisynaptic_index]) + # get number of unique streams + + num_stream = max(Counter(stream_info).values()) arr_tmp = self.code_object_class().templater.objects( None, None, array_specs=self.arrays, @@ -433,7 +422,8 @@ def generate_objects_source(self, writer, arange_arrays, synapses, static_array_ spikegenerator_eventspaces=self.spikegenerator_eventspaces, multisynaptic_idx_vars=multisyn_vars, profiled_codeobjects=self.profiled_codeobjects, - profile_statemonitor_copy_to_host=prefs.devices.cuda_standalone.profile_statemonitor_copy_to_host) + parallelize=True, + stream_size=num_stream) # Reinsert deleted entries, in case we use self.arrays later? maybe unnecassary... self.arrays.update(self.eventspace_arrays) writer.write('objects.*', arr_tmp) @@ -463,7 +453,8 @@ def generate_main_source(self, writer): # For codeobjects run every tick, this happens in the init() of # the random number buffer called at first clock cycle of the network main_lines.append('random_number_buffer.ensure_enough_curand_states();') - main_lines.append(f'_run_{codeobj.name}();') + # add stream - default + main_lines.append(f'_run_{codeobj.name}(0);') elif func == 'after_run_code_object': codeobj, = args main_lines.append(f'_after_run_{codeobj.name}();') @@ -944,13 +935,13 @@ def _replace_constants_and_parameters(code): sub = 't - lastupdate' if sub in code: code = code.replace(sub, f'float({sub})') - logger.debug(f"Replaced {sub} with float({sub}) in {codeobj.name}") + logger.debug(f"Replaced {sub} with float({sub}) in {codeobj}") # replace double-precision floating-point literals with their # single-precision version (e.g. `1.0` -> `1.0f`) code = replace_floating_point_literals(code) logger.debug( f"Replaced floating point literals by single precision version " - f"(appending `f`) in {codeobj.name}." + f"(appending `f`) in {codeobj}." ) writer.write('code_objects/'+codeobj.name+'.cu', code) @@ -1004,10 +995,14 @@ def generate_network_source(self, writer): maximum_run_time = self._maximum_run_time if maximum_run_time is not None: maximum_run_time = float(maximum_run_time) + num_stream = max(Counter(self.stream_info).values()) network_tmp = self.code_object_class().templater.network(None, None, maximum_run_time=maximum_run_time, eventspace_arrays=self.eventspace_arrays, - spikegenerator_eventspaces=self.spikegenerator_eventspaces) + spikegenerator_eventspaces=self.spikegenerator_eventspaces, + parallelize = True, + stream_info = self.stream_info, + num_stream= num_stream) writer.write('network.*', network_tmp) def generate_synapses_classes_source(self, writer): @@ -1328,7 +1323,7 @@ def build(self, directory='output', self.generate_objects_source(self.writer, self.arange_arrays, net_synapses, self.static_array_specs, - self.networks) + self.networks, self.stream_info) self.generate_network_source(self.writer) self.generate_synapses_classes_source(self.writer) self.generate_run_source(self.writer) @@ -1355,6 +1350,11 @@ def build(self, directory='output', if "devices.cuda_standalone" in pref_name: logger.info(f"\t{pref_name} = {prefs[pref_name]}") + logger.debug("Using the following brian preferences:") + for pref_name in prefs: + if pref_name not in prefs: + logger.debug(f"\t{pref_name} = {prefs[pref_name]}") + if compile: self.compile_source(directory, cpp_compiler, debug, clean) if run: @@ -1395,6 +1395,25 @@ def network_run(self, net, duration, report=None, report_period=10*second, self.clocks.update(net._clocks) net.t_ = float(t_end) + + # Create dictionary for parallelisation with stream + streams_organization = defaultdict(list) + for obj in net.sorted_objects: + streams_organization[(obj.when, obj.order)].append(obj) + + # associate each code object with a particular stream + streams_details = defaultdict(list) + count = 1 + for key in streams_organization: + for object in streams_organization[key]: + streams_details[object.name] = count + count +=1 + + self.stream_info = streams_details + self.stream_info['default'] = 0 + + + # TODO: remove this horrible hack for clock in self.clocks: if clock.name=='clock': @@ -1529,11 +1548,21 @@ def network_run(self, net, duration, report=None, report_period=10*second, # create all random numbers needed for the next clock cycle for clock in net._clocks: - run_lines.append(f'{net.name}.add(&{clock.name}, _run_random_number_buffer);') + run_lines.append(f'{net.name}.add(&{clock.name}, _run_random_number_buffer, {self.stream_info["default"]});') all_clocks = set() + # TODO add for every code object -> add where in the list are there. + # TODO create new dic (code object, position in list) for clock, codeobj in code_objects: - run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name});') + # add this position as additional number here + # check if codeobj.name has _codeobject in it + name = codeobj.name + if "_codeobject" in codeobj.name: + name = codeobj.name[:-11] + if name in self.stream_info.keys(): + run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name}, {self.stream_info[name]});') + else: + run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name}, {self.stream_info["default"]});') all_clocks.add(clock) # Under some rare circumstances (e.g. a NeuronGroup only defining a @@ -1558,12 +1587,12 @@ def network_run(self, net, duration, report=None, report_period=10*second, # release version? (TODO: add via insert_code mechanism) run_lines.append('CUDA_SAFE_CALL(cudaProfilerStart());') - run_lines.extend(self.code_lines['before_network_run']) + run_lines.extend(self.code_lines['before_run']) # run everything that is run on a clock run_lines.append( f'{net.name}.run({float(duration)!r}, {report_call}, {float(report_period)!r});' ) - run_lines.extend(self.code_lines['after_network_run']) + run_lines.extend(self.code_lines['after_run']) # for multiple runs, the random number buffer needs to be reset run_lines.append('random_number_buffer.run_finished();') # nvprof stuff @@ -1840,4 +1869,4 @@ def prepare_codeobj_code_for_rng(codeobj): cuda_standalone_device = CUDAStandaloneDevice() -all_devices['cuda_standalone'] = cuda_standalone_device +all_devices['cuda_standalone'] = cuda_standalone_device \ No newline at end of file diff --git a/brian2cuda/templates/common_group.cu b/brian2cuda/templates/common_group.cu index 7c069d2b..25d11c68 100644 --- a/brian2cuda/templates/common_group.cu +++ b/brian2cuda/templates/common_group.cu @@ -63,8 +63,8 @@ namespace { (e.g. _host_rand used in _poisson), but we can't put support_code_lines lines after block random_functions since random_functions can use functions defined in support_code_lines (e.g. _rand) #} - randomNumber_t _host_rand(const int _vectorisation_idx); - randomNumber_t _host_randn(const int _vectorisation_idx); + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); int32_t _host_poisson(double _lambda, const int _vectorisation_idx); ///// block extra_device_helper ///// @@ -77,13 +77,13 @@ namespace { {% block random_functions %} // Implement dummy functions such that the host compiled code of binomial // functions works. Hacky, hacky ... - randomNumber_t _host_rand(const int _vectorisation_idx) + double _host_rand(const int _vectorisation_idx) { printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } - randomNumber_t _host_randn(const int _vectorisation_idx) + double _host_randn(const int _vectorisation_idx) { printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, __LINE__); @@ -109,8 +109,6 @@ __launch_bounds__(1024, {{sm_multiplier}}) _run_kernel_{{codeobj_name}}( int _N, int THREADS_PER_BLOCK, - {% block extra_kernel_parameters %} - {% endblock %} ///// KERNEL_PARAMETERS ///// %KERNEL_PARAMETERS% ) @@ -119,11 +117,8 @@ _run_kernel_{{codeobj_name}}( int tid = threadIdx.x; int bid = blockIdx.x; - - {% block indices %} int _idx = bid * THREADS_PER_BLOCK + tid; int _vectorisation_idx = _idx; - {% endblock %} ///// KERNEL_CONSTANTS ///// %KERNEL_CONSTANTS% @@ -136,10 +131,12 @@ _run_kernel_{{codeobj_name}}( {% block additional_variables %} {% endblock %} - if(_vectorisation_idx >= _N) + {% block num_thread_check %} + if(_idx >= _N) { return; } + {% endblock %} {% block kernel_maincode %} @@ -158,7 +155,7 @@ _run_kernel_{{codeobj_name}}( {% endblock kernel %} -void _run_{{codeobj_name}}() +void _run_{{codeobj_name}}(cudaStream_t stream) { using namespace brian; @@ -182,9 +179,7 @@ void _run_{{codeobj_name}}() {% endblock %} {% block prepare_kernel %} - {% block static_kernel_dimensions %} static int num_threads, num_blocks; - {% endblock %} static size_t needed_shared_memory = 0; static bool first_run = true; if (first_run) @@ -273,11 +268,9 @@ void _run_{{codeobj_name}}() else { printf("INFO _run_kernel_{{codeobj_name}}\n" - {% block kernel_info_num_blocks_str %} "\t%u blocks\n" - {% endblock %} "\t%u threads\n" - "\t%i registers per thread\n" + "\t%i registers per block\n" "\t%i bytes statically-allocated shared memory per block\n" "\t%i bytes local memory per thread\n" "\t%i bytes user-allocated constant memory\n" @@ -286,10 +279,7 @@ void _run_{{codeobj_name}}() {% else %} "", {% endif %} - {% block kernel_info_num_blocks_var %} - num_blocks, - {% endblock %} - num_threads, funcAttrib.numRegs, + num_blocks, num_threads, funcAttrib.numRegs, funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, funcAttrib.constSizeBytes{% if calc_occupancy %}, occupancy{% endif %}); } @@ -302,11 +292,9 @@ void _run_{{codeobj_name}}() {% endblock %} {% block kernel_call %} - _run_kernel_{{codeobj_name}}<<>>( + _run_kernel_{{codeobj_name}}<<>>( _N, num_threads, - {% block extra_host_parameters %} - {% endblock %} ///// HOST_PARAMETERS ///// %HOST_PARAMETERS% ); @@ -338,7 +326,7 @@ void _run_{{codeobj_name}}() #ifndef _INCLUDED_{{codeobj_name}} #define _INCLUDED_{{codeobj_name}} -void _run_{{codeobj_name}}(); +void _run_{{codeobj_name}}(cudaStream_t); {% block extra_functions_h %} {% endblock %} @@ -374,7 +362,7 @@ void _after_run_{{codeobj_name}}() } {% endmacro %} - +// {{codeobj_name}} {% macro after_run_h_file() %} #ifndef _INCLUDED_{{codeobj_name}}_after #define _INCLUDED_{{codeobj_name}}_affer @@ -382,4 +370,4 @@ void _after_run_{{codeobj_name}}() void _after_run_{{codeobj_name}}(); #endif -{% endmacro %} +{% endmacro %} \ No newline at end of file diff --git a/brian2cuda/templates/makefile b/brian2cuda/templates/makefile index 6bcf06a1..7b3bc76f 100644 --- a/brian2cuda/templates/makefile +++ b/brian2cuda/templates/makefile @@ -7,7 +7,7 @@ OBJS := ${OBJS:.cpp=.o} OBJS := ${OBJS:.c=.o} NVCC = @{{ nvcc_path }} -ccbin $(CXX) NVCCFLAGS = -I. -std=c++11 {{gpu_arch_flags}} {{nvcc_compiler_flags}} {{compiler_debug_flags}} -Xcompiler "{{cpp_compiler_flags}}" -LFLAGS = -lcurand -I. {{gpu_arch_flags}} {{cpp_linker_flags}} {{linker_debug_flags}} +LFLAGS = -lcurand -lcudart -I. {{gpu_arch_flags}} {{cpp_linker_flags}} {{linker_debug_flags}} all: $(PROGRAM) @@ -20,4 +20,4 @@ clean: {{rm_cmd}} %.o : %.cu - $(NVCC) $(NVCCFLAGS) -dc $< -o $@ + $(NVCC) $(NVCCFLAGS) -dc $< -o $@ \ No newline at end of file diff --git a/brian2cuda/templates/network.cu b/brian2cuda/templates/network.cu index 51723178..45a129f7 100644 --- a/brian2cuda/templates/network.cu +++ b/brian2cuda/templates/network.cu @@ -14,10 +14,18 @@ double Network::_last_run_time = 0.0; double Network::_last_run_completed_fraction = 0.0; +{% if parallelize %} +cudaStream_t custom_stream[{{num_stream}}]; +{% endif %} Network::Network() { t = 0.0; + {% if parallelize %} + for(int i=0;i<{{num_stream}};i++){ + CUDA_SAFE_CALL(cudaStreamCreate(&(custom_stream[i]))); + } + {% endif %} } void Network::clear() @@ -25,12 +33,14 @@ void Network::clear() objects.clear(); } -void Network::add(Clock *clock, codeobj_func func) +// TODO have to makr change in objects - make it a tuple +// make decision which bject has which stream +void Network::add(Clock *clock, codeobj_func func, int group_num) { #if defined(_MSC_VER) && (_MSC_VER>=1700) - objects.push_back(std::make_pair(std::move(clock), std::move(func))); + objects.push_back(std::make_tuple(std::move(clock), std::move(func), std::move(group_num))); #else - objects.push_back(std::make_pair(clock, func)); + objects.push_back(std::make_tuple(clock, func, group_num)); #endif } @@ -56,7 +66,7 @@ void Network::run(const double duration, void (*report_func)(const double, const Clock* clock = next_clocks(); double elapsed_realtime; bool did_break_early = false; - + //TODO here while(clock && clock->running()) { t = clock->t[0]; @@ -73,17 +83,42 @@ void Network::run(const double duration, void (*report_func)(const double, const next_report_time += report_period; } } - Clock *obj_clock = objects[i].first; + // TODO tuple of clock and function + //Clock *obj_clock = objects[i].first; + Clock *obj_clock = std::get<0>(objects[i]); + int group_int = std::get<2>(objects[i]); // Only execute the object if it uses the right clock for this step if (curclocks.find(obj_clock) != curclocks.end()) { - codeobj_func func = objects[i].second; + // function -> whixh is in templates like common_group.cu + // sort the code object - waiting mechanism between groups + // cudaEvent or cudaSynchronise + //codeobj_func func = objects[i].second; + codeobj_func func = std::get<1>(objects[i]); + int func_group_int = std::get<2>(objects[i]); if (func) // code objects can be NULL in cases where we store just the clock { - func(); + func_groups[func_group_int].push_back(func); + //func_groups.push_back(std::make_pair(func_group_int,func)); + //func(); + // [[func1,func2,func3],[func4...]] } } } + + // get maximum in objects.cu array + + // go through each list of func group - 2 loops + for(int i=0; i::iterator i=curclocks.begin(); i!=curclocks.end(); i++) (*i)->tick(); clock = next_clocks(); @@ -129,7 +164,8 @@ void Network::compute_clocks() clocks.clear(); for(int i=0; i(objects[i]); + // Clock *clock = std::get<0>()objects[i].first; clocks.insert(clock); } } @@ -174,7 +210,7 @@ Clock* Network::next_clocks() #include #include "brianlib/clocks.h" -typedef void (*codeobj_func)(); +typedef void (*codeobj_func)(cudaStream_t); class Network { @@ -182,17 +218,25 @@ class Network void compute_clocks(); Clock* next_clocks(); public: - std::vector< std::pair< Clock*, codeobj_func > > objects; +// TODO vectory of tuples having clock , codeobj_func and stread integer + std::vector< std::tuple< Clock*, codeobj_func, int > > objects; + //std::vector< std::pair< Clock*, codeobj_func > > objects; + std::vector> func_groups = std::vector>({{num_stream}}); + //std::vector> func_groups; double t; static double _last_run_time; static double _last_run_completed_fraction; + int num_streams; + {% if parallelize %} + cudaStream_t custom_stream[{{num_stream}}]; + {% endif %} Network(); void clear(); - void add(Clock *clock, codeobj_func func); + void add(Clock *clock, codeobj_func func, int num_streams); void run(const double duration, void (*report_func)(const double, const double, const double, const double), const double report_period); }; #endif -{% endmacro %} +{% endmacro %} \ No newline at end of file diff --git a/brian2cuda/templates/objects.cu b/brian2cuda/templates/objects.cu index ea8fe158..c69e00b1 100644 --- a/brian2cuda/templates/objects.cu +++ b/brian2cuda/templates/objects.cu @@ -40,6 +40,12 @@ const int brian::_num_{{varname}} = {{var.size}}; {% endif %} {% endfor %} + +///////////////// array of streams for parallelization ////////////////////////// +// {% if parallelize %} +// cudaStream_t brian::custom_stream[{{stream_size}}]; +// {% endif %} + //////////////// eventspaces /////////////// // we dynamically create multiple eventspaces in no_or_const_delay_mode // for initiating the first spikespace, we need a host pointer @@ -90,10 +96,8 @@ __device__ int* brian::{{path.name}}_num_synapses_by_bundle; __device__ int* brian::{{path.name}}_unique_delays; __device__ int* brian::{{path.name}}_synapses_offset_by_bundle; __device__ int* brian::{{path.name}}_global_bundle_id_start_by_pre; -int brian::{{path.name}}_bundle_size_max = 0; -int brian::{{path.name}}_bundle_size_min = 0; -double brian::{{path.name}}_bundle_size_mean = 0; -double brian::{{path.name}}_bundle_size_std = 0; +int brian::{{path.name}}_max_bundle_size = 0; +int brian::{{path.name}}_mean_bundle_size = 0; int brian::{{path.name}}_max_size = 0; __device__ int* brian::{{path.name}}_num_unique_delays_by_pre; int brian::{{path.name}}_max_num_unique_delays = 0; @@ -199,6 +203,7 @@ void _init_arrays() {% else %} num_parallel_blocks = props.multiProcessorCount * {{sm_multiplier}}; {% endif %} + printf("objects cu num par blocks %d\n", num_parallel_blocks); max_threads_per_block = props.maxThreadsPerBlock; max_threads_per_sm = props.maxThreadsPerMultiProcessor; max_shared_mem_size = props.sharedMemPerBlock; @@ -227,6 +232,14 @@ void _init_arrays() ); {% endif %} +// {% if parallelize %} +// for(int i=0;i<{{stream_size}};i++){ +// CUDA_SAFE_CALL(cudaStreamCreate(&(custom_stream[i]))); +// } +// {% endif %} + + + // this sets seed for host and device api RNG random_number_buffer.set_seed(seed); @@ -352,16 +365,10 @@ void _write_arrays() using namespace brian; {% for var, varname in array_specs | dictsort(by='value') %} - {% if not (var in dynamic_array_specs - or var in dynamic_array_2d_specs - or var in static_array_specs - ) %} - {# Don't copy StateMonitor's N variables, which are modified on host only #} - {% if not (var.owner.__class__.__name__ == 'StateMonitor' and var.name == 'N') %} + {% if not (var in dynamic_array_specs or var in dynamic_array_2d_specs or var in static_array_specs) %} CUDA_SAFE_CALL( cudaMemcpy({{varname}}, dev{{varname}}, sizeof({{c_data_type(var.dtype)}})*_num_{{varname}}, cudaMemcpyDeviceToHost) ); - {% endif %} ofstream outfile_{{varname}}; outfile_{{varname}}.open("{{get_array_filename(var) | replace('\\', '\\\\')}}", ios::binary | ios::out); if(outfile_{{varname}}.is_open()) @@ -376,11 +383,7 @@ void _write_arrays() {% endfor %} {% for var, varname in dynamic_array_specs | dictsort(by='value') %} - {# TODO: pass isinstance to Jinja template to make it available here #} - {% if not (var in multisynaptic_idx_vars - or var.name in ['delay', '_synaptic_pre', '_synaptic_post'] - or (var.owner.__class__.__name__ == 'StateMonitor' and var.name == 't') - ) %} + {% if not var in multisynaptic_idx_vars and not var.name in ['delay', '_synaptic_pre', '_synaptic_post'] %} {{varname}} = dev{{varname}}; {% endif %} ofstream outfile_{{varname}}; @@ -396,28 +399,15 @@ void _write_arrays() {% endfor %} {% for var, varname in dynamic_array_2d_specs | dictsort(by='value') %} - {% if profile_statemonitor_copy_to_host and var.owner.__class__.__name__ == 'StateMonitor' and var.name == profile_statemonitor_copy_to_host %} - {# Record copying statemonitor variable from device to host for benchmarking #} - std::clock_t before_copy_statemon; - string profile_statemonitor_copy_to_host_varname = "{{var.owner.name}}_copy_to_host_{{profile_statemonitor_copy_to_host}}"; - double copy_time_statemon; - {% endif %} ofstream outfile_{{varname}}; outfile_{{varname}}.open("{{get_array_filename(var) | replace('\\', '\\\\')}}", ios::binary | ios::out); if(outfile_{{varname}}.is_open()) { - {% if profile_statemonitor_copy_to_host and var.owner.__class__.__name__ == 'StateMonitor' and var.name == profile_statemonitor_copy_to_host %} - before_copy_statemon = std::clock(); - {% endif %} thrust::host_vector<{{c_data_type(var.dtype)}}>* temp_array{{varname}} = new thrust::host_vector<{{c_data_type(var.dtype)}}>[_num__array_{{var.owner.name}}__indices]; for (int n=0; n<_num__array_{{var.owner.name}}__indices; n++) { temp_array{{varname}}[n] = {{varname}}[n]; } - {% if profile_statemonitor_copy_to_host and var.owner.__class__.__name__ == 'StateMonitor' and var.name == profile_statemonitor_copy_to_host %} - string profile_statemonitor_copy_to_host_varname = "{{varname}}_copy_to_host"; - copy_time_statemon += (double)(std::clock() - before_copy_statemon) / CLOCKS_PER_SEC; - {% endif %} for(int j = 0; j < temp_array{{varname}}[0].size(); j++) { for(int i = 0; i < _num__array_{{var.owner.name}}__indices; i++) @@ -449,9 +439,6 @@ void _write_arrays() {% endif %} #} outfile_profiling_info << "{{codeobj}}\t" << {{codeobj}}_profiling_info << std::endl; - {% if profile_statemonitor_copy_to_host %} - outfile_profiling_info << profile_statemonitor_copy_to_host_varname << "\t" << copy_time_statemon << std::endl; - {% endif %} {% endfor %} outfile_profiling_info.close(); } else @@ -573,8 +560,8 @@ typedef {{curand_float_type}} randomNumber_t; // random number type #include "network.h" #include "rand.h" -#include #include +#include #include #include @@ -625,6 +612,12 @@ extern thrust::device_vector<{{c_data_type(var.dtype)}}*> addresses_monitor_{{va extern thrust::device_vector<{{c_data_type(var.dtype)}}>* {{varname}}; {% endfor %} +//////////////// stream //////////// +// {% if parallelize %} +// extern cudaStream_t custom_stream[{{stream_size}}]; +// {% endif %} + + /////////////// static arrays ///////////// {% for (name, dtype_spec, N, filename) in static_array_specs | sort %} {# arrays that are initialized from static data are already declared #} @@ -646,10 +639,8 @@ extern __device__ int* {{path.name}}_num_synapses_by_bundle; extern __device__ int* {{path.name}}_unique_delays; extern __device__ int* {{path.name}}_synapses_offset_by_bundle; extern __device__ int* {{path.name}}_global_bundle_id_start_by_pre; -extern int {{path.name}}_bundle_size_max; -extern int {{path.name}}_bundle_size_min; -extern double {{path.name}}_bundle_size_mean; -extern double {{path.name}}_bundle_size_std; +extern int {{path.name}}_max_bundle_size; +extern int {{path.name}}_mean_bundle_size; extern int {{path.name}}_max_size; extern __device__ int* {{path.name}}_num_unique_delays_by_pre; extern int {{path.name}}_max_num_unique_delays; @@ -723,4 +714,4 @@ void _dealloc_arrays(); #endif -{% endmacro %} +{% endmacro %} \ No newline at end of file diff --git a/brian2cuda/templates/rand.cu b/brian2cuda/templates/rand.cu index 08208d5a..06e7eae2 100644 --- a/brian2cuda/templates/rand.cu +++ b/brian2cuda/templates/rand.cu @@ -44,8 +44,9 @@ namespace { // need a function pointer for Network::add(), can't pass a pointer to a class -// method, which is of different type -void _run_random_number_buffer() +// method, which is of different type. Random number buffer runs in default +// stream always, the `stream` parameter is not used. +void _run_random_number_buffer(cudaStream_t stream) { // random_number_buffer is a RandomNumberBuffer instance, declared in objects.cu random_number_buffer.next_time_step(); @@ -472,7 +473,7 @@ void RandomNumberBuffer::next_time_step() #include -void _run_random_number_buffer(); +void _run_random_number_buffer(cudaStream_t); class RandomNumberBuffer { @@ -562,4 +563,4 @@ public: #endif -{% endmacro %} +{% endmacro %} \ No newline at end of file diff --git a/brian2cuda/templates/synapses.cu b/brian2cuda/templates/synapses.cu index fed9ef78..31a91903 100644 --- a/brian2cuda/templates/synapses.cu +++ b/brian2cuda/templates/synapses.cu @@ -25,7 +25,7 @@ _run_kernel_{{codeobj_name}}( int threads_per_bundle, {% endif %} int32_t* eventspace, - {% if uses_atomics or synaptic_effects == "synapse" %} + {% if uses_atomics %} int num_spiking_neurons, {% else %} int neurongroup_size, @@ -72,7 +72,7 @@ _run_kernel_{{codeobj_name}}( { // `spiking_neuron_idx` runs through the eventspace // `post_block_idx` runs through the post neuron blocks of the connectivity matrix - {% if uses_atomics or synaptic_effects == "synapse" %} + {% if uses_atomics %} int spiking_neuron_idx = bid / num_parallel_blocks; int post_block_idx = bid % num_parallel_blocks; {% else %} @@ -87,7 +87,7 @@ _run_kernel_{{codeobj_name}}( // spiking_neuron is index in NeuronGroup int32_t spiking_neuron = eventspace[spiking_neuron_idx]; - {% if uses_atomics or synaptic_effects == "synapse" %} + {% if uses_atomics %} assert(spiking_neuron != -1); {% else %} if(spiking_neuron == -1) // end of spiking neurons @@ -172,36 +172,6 @@ static int num_threads_per_bundle; static int num_loops; {% endblock %} -{% block extra_device_helper %} -int getThreadsPerBundle(){ - {# Allow using std functions (ceil, floor...) in - prefs.device.cuda_standalone.threads_per_synapse_bundle #} - using namespace std; - using namespace brian; - int threads_per_bundle = static_cast({{threads_per_synapse_bundle}}); - {% if bundle_threads_warp_multiple %} - int multiple = threads_per_bundle / num_threads_per_warp; - {% if bundle_threads_warp_multiple == 'up' %} - int remainder = threads_per_bundle % num_threads_per_warp; - if (remainder != 0){ - // if remainder is 0, just use thread_per_bundle as is - // round up to next multiple of warp size - threads_per_bundle = (multiple + 1) * num_threads_per_warp; - } - {% elif bundle_threads_warp_multiple == 'down' %} - // ignore remainder, round down to next muptiple of warp size - threads_per_bundle = multiple * num_threads_per_warp; - {% endif %} - {% endif %} - - if (threads_per_bundle < 1){ - threads_per_bundle = 1; - } - return threads_per_bundle; -} -{% endblock extra_device_helper %} - - {% block prepare_kernel_inner %} {#######################################################################} {% if uses_atomics or synaptic_effects == "synapse" %} @@ -212,14 +182,13 @@ int getThreadsPerBundle(){ {% endif %} num_blocks = num_parallel_blocks; num_threads = max_threads_per_block; +// TODO: effect of mean instead of max? {% if bundle_mode %} -//num_threads_per_bundle = {{pathway.name}}_bundle_size_max; -num_threads_per_bundle = getThreadsPerBundle(); -printf("INFO _run_kernel_{{codeobj_name}}: Using %d threads per bundle\n", num_threads_per_bundle); +num_threads_per_bundle = {{pathway.name}}_max_bundle_size; {% endif %} num_loops = 1; -{% elif synaptic_effects == "target" %}{# not uses_atomics #} +{% elif synaptic_effects == "target" %} // Synaptic effects modify target group variables but NO source group variables. num_blocks = num_parallel_blocks; num_loops = 1; @@ -229,9 +198,7 @@ if (!{{owner.name}}_multiple_pre_post){ num_threads = max_threads_per_block; {% if bundle_mode %} else // heterogeneous delays - // Since we can only parallelize within each bundle, we use as many threads as - // the maximum bundle size - num_threads = {{pathway.name}}_bundle_size_max; + num_threads = {{pathway.name}}_max_bundle_size; {% endif %} } else { @@ -279,7 +246,7 @@ else if ({{pathway.name}}_max_size <= 0) // only call kernel if we have synapses (otherwise we skipped the push kernel) if ({{pathway.name}}_max_size > 0) { - {% if uses_atomics or synaptic_effects == "synapse" %} + {% if uses_atomics %} int32_t num_spiking_neurons; // we only need the number of spiking neurons if we parallelise effect // application over spiking neurons in homogeneous delay mode @@ -287,9 +254,10 @@ if ({{pathway.name}}_max_size > 0) { if (defaultclock.timestep[0] >= {{pathway.name}}_delay) { - cudaMemcpy(&num_spiking_neurons, + CUDA_SAFE_CALL(cudaMemcpyAsync(&num_spiking_neurons, &dev{{_eventspace}}[{{pathway.name}}_eventspace_idx][_num_{{_eventspace}} - 1], - sizeof(int32_t), cudaMemcpyDeviceToHost); + sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_SAFE_CALL(cudaStreamSynchronize(stream)); num_blocks = num_parallel_blocks * num_spiking_neurons; //TODO collect info abt mean, std of num spiking neurons per time //step and print INFO at end of simulation @@ -309,7 +277,7 @@ if ({{pathway.name}}_max_size > 0) num_threads_per_bundle, {% endif %} dev{{_eventspace}}[{{pathway.name}}_eventspace_idx], - {% if uses_atomics or synaptic_effects == "synapse" %} + {% if uses_atomics %} num_spiking_neurons, {% else %} _num_{{_eventspace}}-1, @@ -318,7 +286,7 @@ if ({{pathway.name}}_max_size > 0) %HOST_PARAMETERS% ); } - {% if uses_atomics or synaptic_effects == "synapse" %} + {% if uses_atomics %} } {% endif %} @@ -340,4 +308,4 @@ void _debugmsg_{{codeobj_name}}(); {% macro main_finalise() %} _debugmsg_{{codeobj_name}}(); -{% endmacro %} +{% endmacro %} \ No newline at end of file diff --git a/brian2cuda/templates/synapses_push_spikes.cu b/brian2cuda/templates/synapses_push_spikes.cu index 16b408ce..655ec3a0 100644 --- a/brian2cuda/templates/synapses_push_spikes.cu +++ b/brian2cuda/templates/synapses_push_spikes.cu @@ -489,26 +489,13 @@ __global__ void _before_run_kernel_{{codeobj_name}}( int synapses_start_idx = h_vec_unique_delay_start_idcs_by_pre[i][bundle_idx]; // find the number of synapses for this delay (bundle) int num_synapses; - if (bundle_idx == num_unique_elements - 1){ + if (bundle_idx == num_unique_elements - 1) num_synapses = num_elements - synapses_start_idx; - } - else { + else num_synapses = h_vec_unique_delay_start_idcs_by_pre[i][bundle_idx + 1] - synapses_start_idx; - } h_num_synapses_by_bundle.push_back(num_synapses); - - if (bundle_idx == 0){ - {{owner.name}}_bundle_size_min = num_synapses; - {{owner.name}}_bundle_size_max = num_synapses; - } - else { - if (num_synapses > {{owner.name}}_bundle_size_max){ - {{owner.name}}_bundle_size_max = num_synapses; - } - if (num_synapses < {{owner.name}}_bundle_size_min){ - {{owner.name}}_bundle_size_min = num_synapses; - } - } + if (num_synapses > {{owner.name}}_max_bundle_size) + {{owner.name}}_max_bundle_size = num_synapses; // copy this bundle to device and store the device pointer int32_t* d_this_bundle = d_ptr_synapse_ids + sum_bundle_sizes; @@ -671,10 +658,8 @@ __global__ void _before_run_kernel_{{codeobj_name}}( // add num_bundle_ids as last entry h_global_bundle_id_start_by_pre[num_pre_post_blocks] = num_bundle_ids; - // floor(mean(h_num_synapses_by_bundle)) = sum_bundle_sizes / num_bundle_ids; - assert(std::floor(mean_bundle_sizes) == sum_bundle_sizes / num_bundle_ids); - {{owner.name}}_bundle_size_mean = mean_bundle_sizes; - {{owner.name}}_bundle_size_std = getStd(count_bundle_sizes, M2_bundle_sizes); + // floor(mean(h_num_synapses_by_bundle)) + {{owner.name}}_mean_bundle_size = sum_bundle_sizes / num_bundle_ids; // pointer to start of synapse IDs array CUDA_SAFE_CALL( @@ -851,7 +836,7 @@ __global__ void _before_run_kernel_{{codeobj_name}}( printf("INFO _before_run_kernel_{{codeobj_name}}\n" "\t%u blocks\n" "\t%u threads\n" - "\t%i registers per thread\n" + "\t%i registers per block\n" "\t%i bytes statically-allocated shared memory per block\n" "\t%i bytes local memory per thread\n" "\t%i bytes user-allocated constant memory\n" @@ -1023,13 +1008,14 @@ void _run_{{codeobj_name}}() // get the number of spiking neurons int32_t num_spiking_neurons; CUDA_SAFE_CALL( - cudaMemcpy(&num_spiking_neurons, + cudaMemcpyAsync(&num_spiking_neurons, dev{{_eventspace}}[current_idx{{_eventspace}}] + _num_{{owner.event}}space - 1, sizeof(int32_t), cudaMemcpyDeviceToHost) ); // advance spike queues - _advance_kernel_{{codeobj_name}}<<<1, num_parallel_blocks>>>(); + _advance_kernel_{{codeobj_name}}<<<1, num_parallel_blocks, 0, stream>>>(); + CUDA_SAFE_CALL(cudaStreamSynchronize(stream)); CUDA_CHECK_ERROR("_advance_kernel_{{codeobj_name}}"); @@ -1037,11 +1023,6 @@ void _run_{{codeobj_name}}() block prepare_kernel and block kernel_call are executed in this else clause #} {% endblock host_maincode %} -{% block kernel_info_num_blocks_str %} -"\tvariable number of blocks (depends on number of spiking neurons)\n" -{% endblock %} -{% block kernel_info_num_blocks_var %} -{% endblock %} {% block prepare_kernel_inner %} {% if not bundle_mode %} @@ -1082,4 +1063,4 @@ void _run_{{codeobj_name}}() CUDA_CHECK_ERROR("_run_kernel_{{codeobj_name}}"); } } // end else if ({{owner.name}}_max_size > 0) {# from block host_maincode #} -{% endblock kernel_call %} +{% endblock kernel_call %} \ No newline at end of file