From 2fd63b624dec23a1b0b11f2749a4b1a98d30dbbb Mon Sep 17 00:00:00 2001 From: issac Date: Mon, 14 Dec 2015 17:16:44 +0100 Subject: [PATCH] Integrated Claudia's cleanup code --- cmake/info.cmake | 6 +- .../model/observation/gpu/cuda_filter.hpp | 292 ++-- .../kinect_image_observation_model_gpu.hpp | 890 +++++++--- .../observation/gpu/object_rasterizer.hpp | 147 +- include/dbot/util/traits.hpp | 1 + src/dbot/model/observation/gpu/cuda_filter.cu | 1469 ++++------------- .../observation/gpu/object_rasterizer.cpp | 292 ++-- src/dbot/model/observation/gpu/shader.cpp | 1 - 8 files changed, 1334 insertions(+), 1764 deletions(-) diff --git a/cmake/info.cmake b/cmake/info.cmake index 30608c6..0013f3e 100644 --- a/cmake/info.cmake +++ b/cmake/info.cmake @@ -31,9 +31,9 @@ function(info_end) endfunction(info_end) function(info_project project_name project_version) -message(STATUS "${COLOR_BORDER}== ${COLOR_CLEAR} ${COLOR_HEADER}${project_name}${COLOR_CLEAR}") -message(STATUS "${COLOR_BORDER}== ${COLOR_CLEAR} Version: ${COLOR_BOLD}${project_version}${COLOR_CLEAR}") -endfunction(info_package) + message(STATUS "${COLOR_BORDER}== ${COLOR_CLEAR} ${COLOR_HEADER}${project_name}${COLOR_CLEAR}") + message(STATUS "${COLOR_BORDER}== ${COLOR_CLEAR} Version: ${COLOR_BOLD}${project_version}${COLOR_CLEAR}") +endfunction(info_project) function(info_header list_header) message(STATUS "${COLOR_BORDER}== ${COLOR_CLEAR} ") diff --git a/include/dbot/model/observation/gpu/cuda_filter.hpp b/include/dbot/model/observation/gpu/cuda_filter.hpp index 49b6568..5b214a6 100644 --- a/include/dbot/model/observation/gpu/cuda_filter.hpp +++ b/include/dbot/model/observation/gpu/cuda_filter.hpp @@ -1,168 +1,254 @@ +/// @author Claudia Pfreundt + #ifndef POSE_TRACKING_MODELS_OBSERVATION_MODELS_CUDA_FILTER_HPP #define POSE_TRACKING_MODELS_OBSERVATION_MODELS_CUDA_FILTER_HPP -#include "boost/shared_ptr.hpp" #include -#include "GL/glut.h" - #include -#include -#include -#include - namespace fil { +/// This class provides a parallel implementation of the weighting step on the +/// GPU. +/** After initializing the class and setting the execution parameters like the + * number of poses + * and the resolution, you can weigh poses with the weigh_poses() function. + * Make sure to + * always render the poses first with opengl, then map the texture into CUDA, + * update the observation image with set_observations() and update the + * occlusion indices (after resampling) + * before you call the weigh_poses() function. + */ class CudaFilter { - public: - CudaFilter(); +public: + /// constructor which takes the resolution of the camera image + /** + * \param [in] nr_rows the number of rows in each camera image + * \param [in] nr_cols the number of columns in each camera image + */ + CudaFilter(const int nr_rows = 60, const int nr_cols = 80); + + /// destructor which frees the memory used on the GPU ~CudaFilter(); - void init(std::vector > com_models, - float angle_sigma, - float trans_sigma, - float p_visible_init, - float c, - float log_c, - float p_visible_occluded, - float tail_weight, - float model_sigma, - float sigma_factor, - float max_depth, - float exponential_rate); - - // filter functions - void propagate(const float& current_time, - std::vector >& states); // not used - void propagate_multiple( - const float& current_time, - std::vector > >& states); // not used - void compare(float observation_time, - bool constant_occlusion, - std::vector& log_likelihoods); // not used - void compare_multiple(bool update, std::vector& log_likelihoods); - void resample(std::vector resampling_indices); // not used - void resample_multiple(std::vector resampling_indices); // not used + /// copies constants to GPU memory and initializes some memory-related + /// values. + /** + * This function has to be called once in the beginning, before calling the + * weigh_poses() function. + * \param [in] initial_occlusion_prob the initial probability for each pixel + * of being occluded, meaning + * that something occludes the object in this pixel + * \param [in] p_occluded_occluded the probability of a pixel staying + * occluded from one frame to the next + * \param [in] p_occluded_visible the probability of a pixel changing from + * being occluded to being visible + * from one frame to the next + * \param [in] tail_weight the probability of a faulty measurement + * \param [in] model_sigma the uncertainty in the 3D model of the object + * \param [in] sigma_factor the standard deviation of the measurement noise + * at a distance of 1m to the camera + * \param [in] max_depth maximum value which can be measured by the depth + * sensor + * \param [in] exponential_rate the rate of the exponential distribution + * that models the probability of a measurement coming from an unknown + * object + */ + void init(const float initial_occlusion_prob, + const float p_occluded_occluded, + const float p_occluded_visible, + const float tail_weight, + const float model_sigma, + const float sigma_factor, + const float max_depth, + const float exponential_rate); + + /// weights the different poses that were previously rendered with OpenGL + /** + * \param [in] update whether or not to update the occlusion probabilities + * during this weighting + * \param [out] log_likelihoods the computed likelihoods for each pose + */ + void weigh_poses(const bool update_occlusions, + std::vector& log_likelihoods); // setters - void set_states(std::vector >& states, - int seed); // not needed if propagation not on GPU - void set_states_multiple(int n_objects, - int n_features, - int seed); // not needed if propagation not on GPU + /// sets the number of threads used for the CUDA weighting kernel to the + /// desired number. + /// A default of 128 is used if nothing is specified here. + /** + * \param [in] nr_threads the desired number of threads + */ + void set_nr_threads(const int nr_threads); + + /// copies the observation image from the camera to the GPU for comparison + /** + * \param [in] observations a pointer to the observation values + * \param [in] observation_time the time at which this observation was + * captured + */ void set_observations(const float* observations, const float observation_time); - void set_observations(const float* observations); // not used outside, can - // be integrated into - // above - void set_visibility_probabilities(const float* visibility_probabilities); - void set_prev_sample_indices(const int* prev_sample_indices); + + /// sets the indices to the occlusion array for every state + /** + * \param [in] occlusion_indices [state_nr] = {index}. For each state, this + * gives the index + * into the occlusion array. + */ + void set_occlusion_indices(const int* occlusion_indices); + + /// sets the resolution for the images to be compared + /** This function might downgrade the number of poses or change the + * arrangement of the + * poses in the grid due to the resolution change. + * \param [in] n_rows the number of rows in an image + * \param [in] n_cols the number of columns in an image + * \param [out] nr_poses the number of poses that will be weighted + * \param [out] nr_poses_per_row the number of poses per row that will be + * weighted + * \param [out] nr_poses_per_column the number of poses per column that will + * be weighted + * \param [in] adapt_to_constraints whether to automatically adapt to GPU + * constraints or quit the program if constraints are not met + */ void set_resolution(const int n_rows, const int n_cols, int& nr_poses, int& nr_poses_per_row, - int& nr_poses_per_column); + int& nr_poses_per_column, + bool adapt_to_constraints = false); + + /// sets the occlusion probabilities for all pixels for all states + /** + * \param [in] occlusion_probabilities a 1D-array of occlusion probabilities + * which should contain + * nr_rows * nr_cols * nr_poses values. + */ + void set_occlusion_probabilities(const float* occlusion_probabilities); + + /// maps the texture array to an actual texture reference + /** + * \param [in] texture_array the cudaArray retrieved from OpenGL + */ + void map_texture_to_texture_array(const cudaArray_t texture_array); + + /// allocates the maximum amount of memory that will ever be needed by CUDA + /// during runtime + /** + * \param [in][out] allocated_poses the maximum number of poses that will + * ever be evaluated in one weighting step. + * This number might be lowered if GPU contraints do now allow this number + * of poses. + * \param [out] allocated_poses_per_row the maximum number of poses per row + * \param [out] allocated_poses_per_column the maximum number of poses per + * column + * \param [in] adapt_to_constraints whether to automatically adapt to GPU + * constraints or quit the program if constraints are not met + */ void allocate_memory_for_max_poses(int& allocated_poses, int& allocated_poses_per_row, - int& allocated_poses_per_column); + int& allocated_poses_per_column, + bool adapt_to_constraints = false); + + /// sets the number of poses to be weighted in the next weighting step + /** + * \param [in][out] nr_poses the desired number of poses. Might be changed + * due to GPU constraints. + * \param [out] nr_poses_per_row the number of poses per row + * \param [out] nr_poses_per_column the number of poses per column + * \param [in] adapt_to_constraints whether to automatically adapt to GPU + * constraints or quit the program if constraints are not met + */ void set_number_of_poses(int& nr_poses, int& nr_poses_per_row, - int& nr_poses_per_column); - void set_texture_array(cudaArray_t texture_array); + int& nr_poses_per_column, + bool adapt_to_constraints = false); // getters - std::vector get_visibility_probabilities(int state_id); - std::vector > get_visibility_probabilities(); // returns - // all of - // them. - // Ask - // Manuel - // if they - // could - // need - // that. - - void map_texture(); - void destroy_context(); - - private: - // resolution values if not specified - static const int WINDOW_WIDTH = 80; - static const int WINDOW_HEIGHT = 60; - static const int DEFAULT_NR_THREADS = 128; + /// gets the maximum number of threads that can be handled with this GPU + /** + * \return the maximum number of threads that can be scheduled per block on + * the GPU + */ + int get_max_nr_threads(); - // time observation - static const int COUNT = 500; - int count_; - double compare_kernel_time_; - double copy_likelihoods_time_; + /// gets the warp size of this GPU + /** + * \return the warp size = the number of threads that are executed + * concurrently on a CUDA streaming multiprocessor + */ + int get_warp_size(); + + /// gets the occlusion probabilities of a particular state + /** + * \param [in] state_id the index into the state array + * \return a 1D array containing the occlusion probability for each pixel + */ + std::vector get_occlusion_probabilities(int state_id); + +private: + static const int DEFAULT_NR_THREADS = 128; // device pointers to arrays stored in global memory on the GPU - float* d_states_; // not needed if propagation not on GPU - float* d_states_copy_; // not needed if propagation not on GPU - float* d_visibility_probs_; - float* d_visibility_probs_copy_; + float* d_occlusion_probs_; + float* d_occlusion_probs_copy_; float* d_observations_; float* d_log_likelihoods_; - int* d_prev_sample_indices_; - int* d_resampling_indices_; // not needed if resampling not on GPU - curandStateMRG32k3a* d_mrg_states_; + int* d_occlusion_indices_; // this contains, for each pose, the index into + // the occlusion probabilities array, which + // contains the occlusion probabilities for that + // particular pose. // for OpenGL interop cudaArray_t d_texture_array_; // resolution - int n_cols_; - int n_rows_; + int nr_cols_; + int nr_rows_; // maximum number of poses and their arrangement in the OpenGL texture int nr_max_poses_; int nr_max_poses_per_row_; int nr_max_poses_per_column_; - // number of poses and their arrangement in the OpenGL texture + // actual number of poses and their arrangement in the OpenGL texture + // (current frame) int nr_poses_; int nr_poses_per_row_; int nr_poses_per_column_; - // number of features in a state vector - int n_features_; - // block and grid arrangement of the CUDA kernels int nr_threads_, n_blocks_; dim3 grid_dimension_; - // system properties - int warp_size_; - int n_mps_; - - // visibility prob default - float visibility_prob_default_; + // occlusion probability default value + float occlusion_prob_default_; - // time values to compute the time deltas when calling propagate() or - // evaluate() + // time values to compute the time deltas when calling the weighting + // function float occlusion_time_; float observation_time_; - // float delta_time_; - float last_propagation_time_; // not needed if propagation not on GPU - - // booleans to describe the state of the cuda filter, to avoid wrong usage - // of the class - bool n_poses_set_; - // CUDA device properties cudaDeviceProp cuda_device_properties_; + int warp_size_; + int n_mps_; + + // bool to ensure correct usage of public functions + bool observations_set_, occlusion_indices_set_, + occlusion_probabilities_set_, memory_allocated_; + bool number_of_poses_set_, constants_initialized_; void set_default_kernel_config(int& nr_poses_, int& nr_poses_per_row, int& nr_poses_per_column, - bool& nr_poses_changed); - + bool& nr_poses_changed, + bool adapt_to_constraints); // helper functions template - void allocate(T*& pointer, size_t size, std::string name); + void allocate(T*& pointer, size_t size); void check_cuda_error(const char* msg); }; } diff --git a/include/dbot/model/observation/gpu/kinect_image_observation_model_gpu.hpp b/include/dbot/model/observation/gpu/kinect_image_observation_model_gpu.hpp index 353adc9..d2579f5 100644 --- a/include/dbot/model/observation/gpu/kinect_image_observation_model_gpu.hpp +++ b/include/dbot/model/observation/gpu/kinect_image_observation_model_gpu.hpp @@ -1,6 +1,11 @@ +/// @author Claudia Pfreundt + #ifndef POSE_TRACKING_MODELS_OBSERVATION_MODELS_KINECT_IMAGE_OBSERVATION_MODEL_GPU_HPP #define POSE_TRACKING_MODELS_OBSERVATION_MODELS_KINECT_IMAGE_OBSERVATION_MODEL_GPU_HPP +#define PROFILING_ACTIVE +//#define OPTIMIZE_NR_THREADS + #include #include "boost/shared_ptr.hpp" #include "boost/filesystem.hpp" @@ -10,7 +15,6 @@ #include #include -#include #include #include @@ -20,34 +24,34 @@ #include #include - -//#include +#include #include namespace dbot { - // Forward declarations -template class KinectImageObservationModelGPU; +template +class KinectImageObservationModelGPU; namespace internal { /** - * ImageObservationModelCPU distribution traits specialization + * ImageObservationModelGPU distribution traits specialization * \internal */ template -struct Traits > +struct Traits> { - typedef fl::Real Scalar; + typedef double Scalar; + typedef Eigen::Matrix Observation; typedef RbObservationModel Base; - typedef typename Base::Observation Observation; typedef typename Eigen::Matrix CameraMatrix; }; } + /** * \class ImageObservationModelGPU * @@ -55,15 +59,15 @@ struct Traits > * \ingroup observation_models */ template -class KinectImageObservationModelGPU: - public internal::Traits >::Base +class KinectImageObservationModelGPU + : public internal::Traits>::Base { public: - typedef internal::Traits > Traits; + typedef internal::Traits> Traits; - typedef typename Traits::Scalar Scalar; - typedef typename Traits::Observation Observation; - typedef typename Traits::CameraMatrix CameraMatrix; + typedef typename Traits::Scalar Scalar; + typedef typename Traits::Observation Observation; + typedef typename Traits::CameraMatrix CameraMatrix; typedef typename Traits::Base::StateArray StateArray; typedef typename Traits::Base::RealArray RealArray; @@ -71,45 +75,89 @@ class KinectImageObservationModelGPU: typedef typename Eigen::Transform Affine; - - - // TODO: ALL THIS SHOULD SWITCH FROM USING VISIBILITY TO OCCLUSION - KinectImageObservationModelGPU(const CameraMatrix& camera_matrix, - const size_t& n_rows, - const size_t& n_cols, - const size_t& max_sample_count, - const std::vector > vertices_double, - const std::vector > > indices, - const std::string vertex_shader_path, - const std::string fragment_shader_path, - const Scalar& initial_occlusion_prob = 0.1d, - const double& delta_time = 0.033d, - const float p_occluded_visible = 0.1f, - const float p_occluded_occluded = 0.7f, - const float tail_weight = 0.01f, - const float model_sigma = 0.003f, - const float sigma_factor = 0.0014247f, - const float max_depth = 6.0f, - const float exponential_rate = -log(0.5f)): - camera_matrix_(camera_matrix), - n_rows_(n_rows), - n_cols_(n_cols), - nr_max_poses_(max_sample_count), - indices_(indices), - initial_visibility_prob_(1 - initial_occlusion_prob), - p_visible_visible_(1.0 - p_occluded_visible), - p_visible_occluded_(1.0 - p_occluded_occluded), - tail_weight_(tail_weight), - model_sigma_(model_sigma), - sigma_factor_(sigma_factor), - max_depth_(max_depth), - exponential_rate_(exponential_rate), - nr_poses_(max_sample_count), - observations_set_(false), - resource_registered_(false), - nr_calls_set_observation_(0), - observation_time_(0), - Traits::Base(delta_time) + /// constructor which takes relevant constants and initializes the graphics + /// pipeline with them + /** + * \param [in] camera_matrix + * matrix of the intrinsic parameters of the camera + * \param [in] nr_rows + * the number of rows in one sensor image (vertical resolution) + * \param [in] nr_cols + * the number of columns in one sensor image (horizontal + * resolution) + * \param [in] max_sample_count + * the maximum number of poses that will be rendered per object + * in one frame. + * This is needed to allocate the necessary memory on the GPU. + * \param [in] vertices + * [object_nr][vertex_nr] = {x, y, z}. This list should + * contain, for each object, a list of 3-dimensional vectors + * that specify the corners of the triangles of the object mesh + * \param [in] indices [object_nr][index_nr][0 - 2] = {index}. This list + * should contain the indices + * that index the vertices list and tell us which vertices to connect to + * a + * triangle (every group of 3). + * For each object, the indices should be in the range of [0, nr_vertices + * - 1]. + * \param [in] vertex_shader_path path to the vertex shader + * \param [in] fragment_shader_path path to the fragment shader + * \param [in] initial_occlusion_prob the initial probability for each pixel + * of being occluded, meaning + * that something occludes the object in this pixel + * \param [in] delta_time the time between two frames in seconds. This + * should + * correspond to the rate at which the + * sensor provides new images of the scene. + * \param [in] p_occluded_visible the probability of a pixel changing from + * being occluded to being visible + * from one frame to the next + * \param [in] p_occluded_occluded the probability of a pixel staying + * occluded from one frame to the next + * \param [in] tail_weight the probability of a faulty measurement + * \param [in] model_sigma the uncertainty in the 3D model of the object + * \param [in] sigma_factor the standard deviation of the measurement noise + * at a distance of 1m to the camera + * \param [in] max_depth maximum value which can be measured by the depth + * sensor + * \param [in] exponential_rate the rate of the exponential distribution + * that + * models the probability of a measurement coming from an unknown object + */ + KinectImageObservationModelGPU( + const CameraMatrix& camera_matrix, + const size_t& n_rows, + const size_t& n_cols, + const size_t& max_sample_count, + const std::vector> vertices_double, + const std::vector>> indices, + const std::string vertex_shader_path, + const std::string fragment_shader_path, + const Scalar& initial_occlusion_prob = 0.1d, + const double& delta_time = 0.033d, + const float p_occluded_visible = 0.1f, + const float p_occluded_occluded = 0.7f, + const float tail_weight = 0.01f, + const float model_sigma = 0.003f, + const float sigma_factor = 0.0014247f, + const float max_depth = 6.0f, + const float exponential_rate = -log(0.5f)) + : camera_matrix_(camera_matrix), + n_rows_(n_rows), + n_cols_(n_cols), + nr_max_poses_(max_sample_count), + indices_(indices), + initial_occlusion_prob_(initial_occlusion_prob), + tail_weight_(tail_weight), + model_sigma_(model_sigma), + sigma_factor_(sigma_factor), + max_depth_(max_depth), + exponential_rate_(exponential_rate), + nr_poses_(max_sample_count), + observations_set_(false), + resource_registered_(false), + observation_time_(0), + Traits::Base(delta_time) { // set constants this->default_poses_.recount(vertices_double.size()); @@ -117,393 +165,687 @@ class KinectImageObservationModelGPU: // convert doubles to floats vertices_.resize(vertices_double.size()); - for(size_t object_index = 0; object_index < vertices_.size(); object_index++) + for (size_t object_index = 0; object_index < vertices_.size(); + object_index++) { - vertices_[object_index].resize(vertices_double[object_index].size()); - for(size_t vertex_index = 0; vertex_index < vertices_[object_index].size(); vertex_index++) - vertices_[object_index][vertex_index] = vertices_double[object_index][vertex_index].cast(); + vertices_[object_index].resize( + vertices_double[object_index].size()); + for (size_t vertex_index = 0; + vertex_index < vertices_[object_index].size(); + vertex_index++) + vertices_[object_index][vertex_index] = + vertices_double[object_index][vertex_index].cast(); } // check for incorrect path names - if(!boost::filesystem::exists(vertex_shader_path)) + if (!boost::filesystem::exists(vertex_shader_path)) { std::cout << "vertex shader does not exist at: " - << vertex_shader_path << std::endl; + << vertex_shader_path << std::endl; exit(-1); } - if(!boost::filesystem::exists(fragment_shader_path)) + if (!boost::filesystem::exists(fragment_shader_path)) { std::cout << "fragment_shader does not exist at: " - << fragment_shader_path << std::endl; + << fragment_shader_path << std::endl; exit(-1); } - vertex_shader_path_ = vertex_shader_path; + vertex_shader_path_ = vertex_shader_path; fragment_shader_path_ = fragment_shader_path; - // initialize opengl and cuda - opengl_ = boost::shared_ptr - (new ObjectRasterizer(vertices_, - indices_, - vertex_shader_path_, - fragment_shader_path_, - camera_matrix_.cast())); - cuda_ = boost::shared_ptr (new fil::CudaFilter()); - - - // sets the dimensions of how many poses will be rendered per row and per column in a texture - opengl_->allocate_textures_for_max_poses(nr_max_poses_, nr_poses_per_row_, nr_poses_per_column_); - - std::cout << "OpenGL: allocated " << nr_max_poses_ << " poses in the form (" - << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl; - - int tmp_max_nr_poses = nr_max_poses_; - int tmp_nr_poses_per_row = nr_poses_per_row_; - int tmp_nr_poses_per_column = nr_poses_per_column_; - - cuda_->allocate_memory_for_max_poses(nr_max_poses_, nr_poses_per_row_, nr_poses_per_column_); - - std::cout << "CUDA: allocated " << nr_max_poses_ << " poses in the form (" - << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl; - - - // if number of poses gets limited by cuda, we have to reallocate the textures in OpenGL again - if (tmp_max_nr_poses != nr_max_poses_ || - tmp_nr_poses_per_row != nr_poses_per_row_ || - tmp_nr_poses_per_column != nr_poses_per_column_) { - opengl_->allocate_textures_for_max_poses(nr_max_poses_, nr_poses_per_row_, nr_poses_per_column_); - - std::cout << "OpenGL adapts to CUDA restrictions: allocated " << nr_max_poses_ << " poses in the form (" - << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl; - } - + opengl_ = boost::shared_ptr( + new ObjectRasterizer(vertices_, + indices_, + vertex_shader_path_, + fragment_shader_path_, + camera_matrix_.cast())); - std:: cout << "set resolution in cuda..." << std::endl; + cuda_ = boost::shared_ptr(new fil::CudaFilter()); - opengl_->set_resolution(n_rows_, n_cols_, nr_max_poses_, nr_poses_per_row_, nr_poses_per_column_); + // allocates memory and sets the dimensions of how many poses will be + // rendered per row and per column in the texture + allocate_memory_for_max_poses(); - std::cout << "OpenGL: setting resolution changes allocation to " << nr_max_poses_ << " poses in the form (" - << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl; - - tmp_max_nr_poses = nr_max_poses_; - tmp_nr_poses_per_row = nr_poses_per_row_; - tmp_nr_poses_per_column = nr_poses_per_column_; - - cuda_->set_resolution(n_rows_, n_cols_, nr_max_poses_, nr_poses_per_row_, nr_poses_per_column_); - - std::cout << "CUDA: setting resolution changes allocation to " << nr_max_poses_ << " poses in the form (" - << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl; - - // if number of poses gets limited by cuda, we have to reallocate the textures in OpenGL again - if (tmp_max_nr_poses != nr_max_poses_ || - tmp_nr_poses_per_row != nr_poses_per_row_ || - tmp_nr_poses_per_column != nr_poses_per_column_) { - opengl_->set_resolution(n_rows_, n_cols_, nr_max_poses_, nr_poses_per_row_, nr_poses_per_column_); - - std::cout << "OpenGL adapts to CUDA restrictions: setting resolution changes allocation to " << nr_max_poses_ << " poses in the form (" - << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl; - } + // sets the resolution and reallocates the texture + set_resolution(); register_resource(); - std:: cout << "set occlusions..." << std::endl; - reset(); - float c = p_visible_visible_ - p_visible_occluded_; - float log_c = log(c); - - std::vector > dummy_com_models; - cuda_->init(dummy_com_models, 0.0f, 0.0f, - initial_visibility_prob_, c, log_c, p_visible_occluded_, - tail_weight_, model_sigma_, sigma_factor_, max_depth_, exponential_rate_); - + cuda_->init(initial_occlusion_prob_, + p_occluded_occluded, + p_occluded_visible, + tail_weight_, + model_sigma_, + sigma_factor_, + max_depth_, + exponential_rate_); count_ = 0; -// render_time_ = 0; - visibility_probs_.resize(n_rows_ * n_cols_); - } + occlusion_probs_.resize(n_rows_ * n_cols_); - ~KinectImageObservationModelGPU() noexcept { +#ifdef PROFILING_ACTIVE + for (int i = 0; i < NR_SUBTASKS_TO_MEASURE; i++) + { + time_[i] = 0; + } + strings_for_subtasks_[SET_OCCLUSION_INDICES] = + "Setting occlusion indices"; + strings_for_subtasks_[CONVERTING_STATE_FORMAT] = + "Converting state format"; + strings_for_subtasks_[RENDERING] = "Rendering step"; + strings_for_subtasks_[MAPPING] = + "Mapping the texture from OpenGL to CUDA"; + strings_for_subtasks_[WEIGHTING] = "Weighting step"; + strings_for_subtasks_[UNMAPPING] = + "Unmapping the texture and reconverting the likelihoods"; +#endif -// std::cout << "time for render: " << render_time_ / count_ << std::endl; + optimize_nr_threads_ = false; + adapt_to_constraints_ = false; - } +#ifdef OPTIMIZE_NR_THREADS + set_optimization_of_thread_nr(true); +#endif + if (optimize_nr_threads_) + { + max_nr_threads_ = cuda_->get_max_nr_threads(); + warp_size_ = cuda_->get_warp_size(); + nr_threads_ = warp_size_; + best_time_ = std::numeric_limits::infinity(); + best_nr_threads_ = nr_threads_; + average_time_ = 0; + } + optimization_runs_ = 0; + } + /// computes the loglikelihoods for the given states + /** Make sure the observation image was set previously, as it is used for + * comparison. + * \param [in] deltas the states which should be evaluated + * \param [in][out] occlusion_indices for each state, this should contain the + * index into the occlusion + * array where the corresponding occlusion probabilities per pixel are + * stored + * \param [in] update_occlusions whether or not the occlusions should be + * updated in this evaluation step + * \return the loglikelihoods for the given states + */ RealArray loglikes(const StateArray& deltas, - IntArray& occlusion_indices, - const bool& update_occlusions = false) + IntArray& occlusion_indices, + const bool& update_occlusions = false) { +#ifdef PROFILING_ACTIVE + if (!optimize_nr_threads_ && optimization_runs_ != count_) + { + time_before_ = dbot::hf::get_wall_time(); + } +#endif - if(!observations_set_) + if (!observations_set_) { - std:: cout << "GPU: observations not set" << std::endl; + std::cout << "GPU: observations not set" << std::endl; exit(-1); } nr_poses_ = deltas.size(); - std::vector flog_likelihoods (nr_poses_, 0); + std::vector flog_likelihoods(nr_poses_, 0); set_number_of_poses(nr_poses_); // transform occlusion indices from size_t to int - std::vector occlusion_indices_transformed (occlusion_indices.size(), 0); + std::vector occlusion_indices_transformed(occlusion_indices.size(), + 0); for (size_t i = 0; i < occlusion_indices.size(); i++) { - occlusion_indices_transformed[i] = (int) occlusion_indices[i]; + occlusion_indices_transformed[i] = (int)occlusion_indices[i]; } - INIT_PROFILING; // copy occlusion indices to GPU - cuda_->set_prev_sample_indices(occlusion_indices_transformed.data()); - MEASURE("gpu: setting occlusion indices"); - // convert to internal state format - std::vector > > poses( - nr_poses_, - std::vector >(vertices_.size(), - std::vector(7, 0))); + cuda_->set_occlusion_indices(occlusion_indices_transformed.data()); - MEASURE("gpu: creating state vectors"); +#ifdef PROFILING_ACTIVE + store_time(SET_OCCLUSION_INDICES); +#endif + std::vector> poses( + nr_poses_, std::vector(vertices_.size())); - for(size_t i_state = 0; i_state < size_t(nr_poses_); i_state++) + for (size_t i_state = 0; i_state < size_t(nr_poses_); i_state++) { - for(size_t i_obj = 0; i_obj < vertices_.size(); i_obj++) + for (size_t i_obj = 0; i_obj < vertices_.size(); i_obj++) { auto pose_0 = this->default_poses_.component(i_obj); auto delta = deltas[i_state].component(i_obj); osr::PoseVector pose; + pose.orientation() = delta.orientation() * pose_0.orientation(); pose.position() = delta.position() + pose_0.position(); - poses[i_state][i_obj][0] = pose.orientation().quaternion().w(); - poses[i_state][i_obj][1] = pose.orientation().quaternion().x(); - poses[i_state][i_obj][2] = pose.orientation().quaternion().y(); - poses[i_state][i_obj][3] = pose.orientation().quaternion().z(); - poses[i_state][i_obj][4] = pose.position()[0]; - poses[i_state][i_obj][5] = pose.position()[1]; - poses[i_state][i_obj][6] = pose.position()[2]; + poses[i_state][i_obj] = pose.homogeneous().cast(); } } - MEASURE("gpu: converting state format"); - - -// double before_render = dbot::hf::get_wall_time(); - +#ifdef PROFILING_ACTIVE + store_time(CONVERTING_STATE_FORMAT); +#endif opengl_->render(poses); -// double after_render = dbot::hf::get_wall_time(); -// render_time_ += after_render - before_render; - count_++; +#ifdef PROFILING_ACTIVE + store_time(RENDERING); +#endif + cudaGraphicsMapResources(1, &texture_resource_, 0); + cudaGraphicsSubResourceGetMappedArray( + &texture_array_, texture_resource_, 0, 0); + cuda_->map_texture_to_texture_array(texture_array_); - MEASURE("gpu: rendering"); +#ifdef PROFILING_ACTIVE + store_time(MAPPING); +#endif + if (optimize_nr_threads_) + { + if (nr_threads_ <= max_nr_threads_) + { + before_weighting_ = dbot::hf::get_wall_time(); + cuda_->set_nr_threads(nr_threads_); + optimization_runs_++; + } + else + { + nr_threads_ = best_nr_threads_; + optimize_nr_threads_ = false; + std::cout << std::endl + << "Best #threads: " << nr_threads_ << std::endl + << std::endl; + } + } - cudaGraphicsMapResources(1, &combined_texture_resource_, 0); - cudaGraphicsSubResourceGetMappedArray(&texture_array_, combined_texture_resource_, 0, 0); - cuda_->set_texture_array(texture_array_); - cuda_->map_texture(); - MEASURE("gpu: mapping texture"); + cuda_->weigh_poses(update_occlusions, flog_likelihoods); - cuda_->compare_multiple(update_occlusions, flog_likelihoods); - cudaGraphicsUnmapResources(1, &combined_texture_resource_, 0); + if (optimize_nr_threads_) + { + if (nr_threads_ <= max_nr_threads_) + { + after_weighting_ = dbot::hf::get_wall_time(); + double time = after_weighting_ - before_weighting_; + average_time_ += time; + + if (count_ % NR_ROUNDS_PER_SETTING_ == + NR_ROUNDS_PER_SETTING_ - 1) + { + average_time_ /= NR_ROUNDS_PER_SETTING_; + if (average_time_ < best_time_) + { + best_time_ = average_time_; + best_nr_threads_ = nr_threads_; + } + + nr_threads_ += warp_size_; + average_time_ = 0; + } + } + } - MEASURE("gpu: computing likelihoods"); +#ifdef PROFILING_ACTIVE + store_time(WEIGHTING); +#endif + cudaGraphicsUnmapResources(1, &texture_resource_, 0); - if(update_occlusions) + if (update_occlusions) { - for(size_t i_state = 0; i_state < occlusion_indices.size(); i_state++) + for (size_t i_state = 0; i_state < occlusion_indices.size(); + i_state++) occlusion_indices[i_state] = i_state; - - MEASURE("gpu: updating occlusions"); } - // convert RealArray log_likelihoods(flog_likelihoods.size()); - for(size_t i = 0; i < flog_likelihoods.size(); i++) + for (size_t i = 0; i < flog_likelihoods.size(); i++) log_likelihoods[i] = flog_likelihoods[i]; +#ifdef PROFILING_ACTIVE + store_time(UNMAPPING); +#endif + + count_++; return log_likelihoods; } - - void set_observation(const Observation& image){ + /// sets the observation image that should be used for comparison in the + /// next evaluation step + /** + * \param [in] image the image obtained from the camera + */ + void set_observation(const Observation& image) + { std::vector std_measurement(image.size()); - for(size_t row = 0; row < image.rows(); row++) - for(size_t col = 0; col < image.cols(); col++) - std_measurement[row*image.cols() + col] = image(row, col); + for (size_t row = 0; row < image.rows(); row++) + for (size_t col = 0; col < image.cols(); col++) + std_measurement[row * image.cols() + col] = image(row, col); + + observation_time_ += this->delta_time_; - set_observation(std_measurement, this->delta_time_); + cuda_->set_observations(std_measurement.data(), observation_time_); + observations_set_ = true; } + /// resets the occlusion probabilities and observation time virtual void reset() { - set_occlusions(); + float default_occlusion_probability = initial_occlusion_prob_; + + std::vector occlusion_probabilities( + n_rows_ * n_cols_ * nr_poses_, default_occlusion_probability); + cuda_->set_occlusion_probabilities(occlusion_probabilities.data()); + observation_time_ = 0; } + /// activates automatic optimization of the number of threads + void set_optimization_of_thread_nr(bool shouldOptimize) + { + optimize_nr_threads_ = shouldOptimize; + } - // TODO: this image should be in a different format BOTH OF THEM!! - const std::vector get_occlusions(size_t index) const + /// activates automatic downgrading of the number of poses, if hardware + /// limitations are reached + void set_adaptation_to_GPU_constraints(bool shouldAdapt) { - std::vector visibility_probs = cuda_->get_visibility_probabilities((int) index); - return visibility_probs; + adapt_to_constraints_ = shouldAdapt; } - void get_range_image(std::vector > &intersect_indices, - std::vector > &depth) + /// returns the occlusion probabilities for each pixel for a given state + /** + * \param [in] index the index into the state array of the state you are + * interested in + * \return an Eigen Matrix containing the occlusion probabilities for each + * pixel for the given state + */ + const Eigen::Map get_occlusions(size_t index) const { - opengl_->get_depth_values(intersect_indices, depth); + std::vector occlusion_probs = + cuda_->get_occlusion_probabilities((int)index); + + // convert values to doubles and put them into an Eigen Matrix + std::vector occlusion_probs_double(occlusion_probs.begin(), + occlusion_probs.end()); + Eigen::Map occlusion_probs_matrix( + &occlusion_probs_double[0], n_rows_, n_cols_); + + return occlusion_probs_matrix; } -private: - // TODO: this function should disappear, BOTH OF THEM - void set_observation(const std::vector& observations, const Scalar& delta_time) + /// returns the depth values of the rendered states + /** + * \return an Eigen Matrix containing the depth values per pixel, stored in + * a 1D array denoting the respective pose + */ + std::vector> get_range_image() { - observation_time_ += delta_time; + std::vector> depth_values_raw = + opengl_->get_depth_values(); - cuda_->set_observations(observations.data(), observation_time_); - observations_set_ = true; + // convert depth values to doubles + std::vector> depth_values_raw_double; + for (int i = 0; i < depth_values_raw.size(); i++) + { + depth_values_raw_double.push_back(std::vector( + depth_values_raw[i].begin(), depth_values_raw[i].end())); + } + + // map values into an Eigen Matrix + std::vector> depth_values; + + for (int i = 0; i < depth_values_raw.size(); i++) + { + Eigen::Map tmp( + &depth_values_raw_double[i][0], n_rows_, n_cols_); + depth_values.push_back(tmp); + } + return depth_values; } - void set_occlusions(const float& visibility_prob = -1) + /// The destructor. If profiling is activated, the time measurements are + /// processed and printed out here + ~KinectImageObservationModelGPU() noexcept { - float default_visibility_probability = visibility_prob; - if (visibility_prob == -1) default_visibility_probability = initial_visibility_prob_; +#ifdef PROFILING_ACTIVE - std::vector visibility_probabilities (n_rows_ * n_cols_ * nr_poses_, default_visibility_probability); - cuda_->set_visibility_probabilities(visibility_probabilities.data()); - // TODO set update times if you want to use them + count_ -= optimization_runs_; + if (count_ > 0) + { + std::cout << std::endl + << "Time measurements for the different steps of the " + "evaluation process averaged over " + << count_ << " evaluation calls:" << std::endl + << std::endl; + + double total_time_per_evaluation = 0; + for (int i = 0; i < NR_SUBTASKS_TO_MEASURE; i++) + { + total_time_per_evaluation += time_[i]; + } + total_time_per_evaluation /= count_; + + for (int i = 0; i < NR_SUBTASKS_TO_MEASURE; i++) + { + std::cout << strings_for_subtasks_[i] << ": \t" + << time_[i] / count_ << " s \t " + << std::setprecision(1) + << time_[i] / count_ * 100 / total_time_per_evaluation + << " %" << std::setprecision(9) << std::endl; + } + std::cout << "Total time per evaluation call: " + << total_time_per_evaluation << std::endl; + if (optimization_runs_ > 0) + { + std::cout << std::endl + << "The best number of threads for this setup was " + "estimated to be " + << nr_threads_ << std::endl; + } + } + else + { + std::cout << "No measurement for the different steps of the " + "evaluation was taken. " + << "Most likely, you need to deactivate the optimization " + "for #threads or" + << " let the program run for a longer period of time." + << std::endl; + + if (optimize_nr_threads_ == true) + { + std::cout << std::endl + << "The best #threads could not be computed, because " + "the runtime of the program was too short." + << " Please allow the program to run a bit longer to " + "find the best #threads." + << std::endl; + } + } +#endif } +private: const Eigen::Matrix3d camera_matrix_; const size_t n_rows_; const size_t n_cols_; - const float initial_visibility_prob_; int nr_max_poses_; - void set_number_of_poses(int nr_poses){ - - nr_poses_ = nr_poses; - opengl_->set_number_of_poses(nr_poses_, nr_poses_per_row_, nr_poses_per_column_); + void allocate_memory_for_max_poses() + { + // allocates memory and sets the dimensions of how many poses will be + // rendered per row and per column in the texture + opengl_->allocate_textures_for_max_poses(nr_max_poses_, + nr_poses_per_row_, + nr_poses_per_column_, + adapt_to_constraints_); - std::cout << "OpenGL: set number of poses to " << nr_poses_ << " poses in the form (" - << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl; + std::cout << "OpenGL: allocated " << nr_max_poses_ + << " poses in the form (" << nr_poses_per_row_ << ", " + << nr_poses_per_column_ << ")" << std::endl; - int tmp_nr_poses = nr_poses_; + int tmp_max_nr_poses = nr_max_poses_; int tmp_nr_poses_per_row = nr_poses_per_row_; int tmp_nr_poses_per_column = nr_poses_per_column_; - cuda_->set_number_of_poses(nr_poses_, nr_poses_per_row_, nr_poses_per_column_); + cuda_->allocate_memory_for_max_poses(nr_max_poses_, + nr_poses_per_row_, + nr_poses_per_column_, + adapt_to_constraints_); - std::cout << "CUDA: set number of poses to " << nr_poses_ << " poses in the form (" - << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl; + std::cout << "CUDA: allocated " << nr_max_poses_ + << " poses in the form (" << nr_poses_per_row_ << ", " + << nr_poses_per_column_ << ")" << std::endl; - // if number of poses gets limited by cuda, we have to tell OpenGL about it - if (tmp_nr_poses != nr_poses_ || + // if number of poses gets limited by cuda, we have to reallocate the + // textures in OpenGL again + if (tmp_max_nr_poses != nr_max_poses_ || + tmp_nr_poses_per_row != nr_poses_per_row_ || + tmp_nr_poses_per_column != nr_poses_per_column_) + { + opengl_->allocate_textures_for_max_poses(nr_max_poses_, + nr_poses_per_row_, + nr_poses_per_column_, + adapt_to_constraints_); + + std::cout << "OpenGL adapts to CUDA restrictions: allocated " + << nr_max_poses_ << " poses in the form (" + << nr_poses_per_row_ << ", " << nr_poses_per_column_ + << ")" << std::endl; + } + } + + void set_resolution() + { + int tmp_max_nr_poses = nr_max_poses_; + int tmp_nr_poses_per_row = nr_poses_per_row_; + int tmp_nr_poses_per_column = nr_poses_per_column_; + + opengl_->set_resolution(n_rows_, + n_cols_, + nr_max_poses_, + nr_poses_per_row_, + nr_poses_per_column_, + adapt_to_constraints_); + + if (tmp_max_nr_poses != nr_max_poses_ || tmp_nr_poses_per_row != nr_poses_per_row_ || - tmp_nr_poses_per_column != nr_poses_per_column_) { - opengl_->set_number_of_poses(nr_poses_, nr_poses_per_row_, nr_poses_per_column_); + tmp_nr_poses_per_column != nr_poses_per_column_) + { + std::cout << "OpenGL: setting resolution changes allocation to " + << nr_max_poses_ << " poses in the form (" + << nr_poses_per_row_ << ", " << nr_poses_per_column_ + << ")" << std::endl; + } - std::cout << "OpenGL adapts to CUDA restrictions: set number of poses to " << nr_poses_ << " poses in the form (" - << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl; + tmp_max_nr_poses = nr_max_poses_; + tmp_nr_poses_per_row = nr_poses_per_row_; + tmp_nr_poses_per_column = nr_poses_per_column_; + + cuda_->set_resolution(n_rows_, + n_cols_, + nr_max_poses_, + nr_poses_per_row_, + nr_poses_per_column_, + adapt_to_constraints_); + + if (tmp_max_nr_poses != nr_max_poses_ || + tmp_nr_poses_per_row != nr_poses_per_row_ || + tmp_nr_poses_per_column != nr_poses_per_column_) + { + std::cout << "CUDA: setting resolution changes allocation to " + << nr_max_poses_ << " poses in the form (" + << nr_poses_per_row_ << ", " << nr_poses_per_column_ + << ")" << std::endl; } + tmp_max_nr_poses = nr_max_poses_; + tmp_nr_poses_per_row = nr_poses_per_row_; + tmp_nr_poses_per_column = nr_poses_per_column_; + + // if number of poses gets limited by cuda, we have to reallocate the + // textures in OpenGL again + if (tmp_max_nr_poses != nr_max_poses_ || + tmp_nr_poses_per_row != nr_poses_per_row_ || + tmp_nr_poses_per_column != nr_poses_per_column_) + { + opengl_->set_resolution(n_rows_, + n_cols_, + nr_max_poses_, + nr_poses_per_row_, + nr_poses_per_column_, + adapt_to_constraints_); + + std::cout << "OpenGL adapts to CUDA restrictions: setting " + "resolution changes allocation to " + << nr_max_poses_ << " poses in the form (" + << nr_poses_per_row_ << ", " << nr_poses_per_column_ + << ")" << std::endl; + } } - void check_cuda_error(const char *msg) + void set_number_of_poses(int nr_poses) { - cudaError_t err = cudaGetLastError(); - if( cudaSuccess != err) + nr_poses_ = nr_poses; + opengl_->set_number_of_poses(nr_poses_, + nr_poses_per_row_, + nr_poses_per_column_, + adapt_to_constraints_); + + int tmp_nr_poses = nr_poses_; + int tmp_nr_poses_per_row = nr_poses_per_row_; + int tmp_nr_poses_per_column = nr_poses_per_column_; + + cuda_->set_number_of_poses(nr_poses_, + nr_poses_per_row_, + nr_poses_per_column_, + adapt_to_constraints_); + + // if number of poses gets limited by cuda, we have to tell OpenGL about + // it + if (tmp_nr_poses != nr_poses_ || + tmp_nr_poses_per_row != nr_poses_per_row_ || + tmp_nr_poses_per_column != nr_poses_per_column_) { - fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); - exit(EXIT_FAILURE); + opengl_->set_number_of_poses(nr_poses_, + nr_poses_per_row_, + nr_poses_per_column_, + adapt_to_constraints_); + } + + if (nr_poses_ < nr_poses) + { + std::cout << "Number of poses was reduced to " << nr_poses_ + << " poses in the form (" << nr_poses_per_row_ << ", " + << nr_poses_per_column_ << ")." << std::endl; } } + void store_time(int task) + { + if (!optimize_nr_threads_ && optimization_runs_ != count_) + { + time_after_ = dbot::hf::get_wall_time(); + time_[task] += time_after_ - time_before_; + time_before_ = time_after_; + } + } - void unregister_resource() + void check_cuda_error(const char* msg) { - if (resource_registered_) { - cudaGraphicsUnregisterResource(combined_texture_resource_); - check_cuda_error("cudaGraphicsUnregisterResource"); - resource_registered_ = false; + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + { + fprintf( + stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err)); + exit(EXIT_FAILURE); } } void register_resource() { - if (!resource_registered_) { - combined_texture_opengl_ = opengl_->get_framebuffer_texture(); - cudaGraphicsGLRegisterImage(&combined_texture_resource_, combined_texture_opengl_, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly); + if (!resource_registered_) + { + opengl_texture_ = opengl_->get_framebuffer_texture(); + cudaGraphicsGLRegisterImage(&texture_resource_, + opengl_texture_, + GL_TEXTURE_2D, + cudaGraphicsRegisterFlagsReadOnly); check_cuda_error("cudaGraphicsGLRegisterImage)"); resource_registered_ = true; } } + void unregister_resource() + { + if (resource_registered_) + { + cudaGraphicsUnregisterResource(texture_resource_); + check_cuda_error("cudaGraphicsUnregisterResource"); + resource_registered_ = false; + } + } + // OpenGL handle and input boost::shared_ptr opengl_; - boost::shared_ptr cuda_; - - // arrays for timings - std::vector > cpu_times_; - std::vector > cuda_times_; + std::vector> vertices_; + std::vector>> indices_; + std::string vertex_shader_path_; + std::string fragment_shader_path_; - // data - std::vector > vertices_; - std::vector > > indices_; - std::vector visibility_probs_; + // CUDA handle + boost::shared_ptr cuda_; - // constants for likelihood evaluation - float p_visible_visible_; - float p_visible_occluded_; + // constants for likelihood evaluation in CUDA kernel + double observation_time_; + float initial_occlusion_prob_; float tail_weight_; float model_sigma_; float sigma_factor_; float max_depth_; float exponential_rate_; + std::vector occlusion_probs_; - std::string vertex_shader_path_; - std::string fragment_shader_path_; - - -// double render_time_; - + // amount of poses and pose distribution in the OpenGL texture int nr_poses_; int nr_poses_per_row_; int nr_poses_per_column_; - int count_; - - // booleans to ensure correct usage of function calls - bool observations_set_, resource_registered_; - int nr_calls_set_observation_; - // Shared resource between OpenGL and CUDA - GLuint combined_texture_opengl_; - cudaGraphicsResource* combined_texture_resource_; + GLuint opengl_texture_; + cudaGraphicsResource* texture_resource_; cudaArray_t texture_array_; - double observation_time_; + // booleans to ensure correct usage of function calls + bool observations_set_, resource_registered_; // used for time observations - static const int TIME_MEASUREMENTS_COUNT = 8; - static const int COUNT = 500; - enum time_measurement {SEND_OBSERVATIONS, RENDER, MAP_RESOURCE, GET_MAPPED_ARRAY, SET_TEXTURE_ARRAY, - MAP_TEXTURE, COMPUTE_LIKELIHOODS, UNMAP_RESOURCE}; + static const int NR_SUBTASKS_TO_MEASURE = 6; + enum subtasks_to_measure + { + SET_OCCLUSION_INDICES, + CONVERTING_STATE_FORMAT, + RENDERING, + MAPPING, + WEIGHTING, + UNMAPPING + }; + double time_[NR_SUBTASKS_TO_MEASURE]; + std::string strings_for_subtasks_[NR_SUBTASKS_TO_MEASURE]; + double time_before_, time_after_; + int count_; + // variables for the optimization runs + int nr_threads_; + int max_nr_threads_; + int warp_size_; + double best_time_; + double before_weighting_; + double after_weighting_; + int best_nr_threads_; + double average_time_; + bool stop_optimizing_; + static const int NR_ROUNDS_PER_SETTING_ = 30; + int optimization_runs_; + + // optional flags for optimizing the #threads and for adapting to GPU + // constraints + bool optimize_nr_threads_; + bool adapt_to_constraints_; }; - } #endif diff --git a/include/dbot/model/observation/gpu/object_rasterizer.hpp b/include/dbot/model/observation/gpu/object_rasterizer.hpp index f3746fe..de684f8 100644 --- a/include/dbot/model/observation/gpu/object_rasterizer.hpp +++ b/include/dbot/model/observation/gpu/object_rasterizer.hpp @@ -1,61 +1,69 @@ +/// @author Claudia Pfreundt + #ifndef POSE_TRACKING_MODELS_OBSERVATION_MODELS_OBJECT_RASTERIZER_HPP #define POSE_TRACKING_MODELS_OBSERVATION_MODELS_OBJECT_RASTERIZER_HPP #include #include #include "GL/glew.h" -#include -/// renders the object using openGL rasterization +/// renders the objects using openGL rasterization /** The objects that should be rendered have to be passed in the constructor and can then be rendered - * in different poses with the Render() function. The resulting depth values are stored in a texture - * whose values can be obtained with get_depth_values(). Alternatively, get_combined_texture() returns - * the ID of the texture for mapping it into the CUDA context. + * in different poses with the render() function. The resulting depth values are stored in a texture + * whose values can be obtained with get_depth_values(). Alternatively, get_framebuffer_texture() returns + * the ID of the texture for mapping it into CUDA. */ class ObjectRasterizer { public: - /// constructor which takes the vertices and indices that describe the objects as input + /// constructor which takes the vertices and indices that describe the objects as input. The paths to the + /// shader files and the instrinsic camera matrix also have to be passed here. /** - * @param[in] vertices [object_nr][vertex_nr] = {x, y, z}. This list should contain 3-dimensional - * vectors that specify the corners of the triangles the object meshes consists of. + * @param[in] vertices [object_nr][vertex_nr] = {x, y, z}. This list should contain, for each object, + * a list of 3-dimensional vectors that specify the corners of the triangles of the object mesh. * @param[in] indices [object_nr][index_nr][0 - 2] = {index}. This list should contain the indices * that index the vertices list and tell us which vertices to connect to a triangle (every group of 3). * For each object, the indices should be in the range of [0, nr_vertices - 1]. * @param[in] vertex_shader_path path to the vertex shader * @param[in] fragment_shader_path path to the fragment shader * @param[in] camera_matrix matrix of the intrinsic parameters of the camera + * @param[in] near_plane everything closer than the near plane will not be rendered. This should + * be similar to the minimal distance up to which the sensor can see objects. + * @param[in] far_plane everything further away than the far plane will not be rendered. This should + * be similar to the maximum distance up to which the sensor can see objects. + * @param[in] nr_rows the number of rows in one sensor image (vertical resolution) + * @param[in] nr_cols the number of columns in one sensor image (horizontal resolution) */ ObjectRasterizer(const std::vector > vertices, const std::vector > > indices, const std::string vertex_shader_path, const std::string fragment_shader_path, - const Eigen::Matrix3f camera_matrix); - - /// constructor with no arguments, should not be used - ObjectRasterizer(); + const Eigen::Matrix3f camera_matrix, + const float near_plane = 0.4, + const float far_plane = 4, + const int nr_rows = 60, + const int nr_cols = 80); /// destructor which deletes the buffers and programs used by openGL ~ObjectRasterizer(); - /// render the objects in all given states and return the depth for all relevant pixels of each rendered object /** This function renders all poses (of all objects) into one large texture. Reading back the depth values * is a relatively slow process, so this function should mainly be used for debugging. If you are using - * CUDA to further process the depth values, please use the other Render() function. + * CUDA to further process the depth values, please use the other render() function. * @param[in] states [pose_nr][object_nr][0 - 6] = {qw, qx, qy, qz, tx, ty, tz}. This should contain the quaternion * and the translation for each object per pose. - * @param[in,out] intersect_indices [pose_nr][0 - nr_relevant_pixels] = {pixel_nr}. This list should be empty when passed - * to the function. Afterwards, it will contain the pixel numbers of all pixels that were rendered to, per pose. - * @param[in,out] depth [pose_nr][0 - nr_relevant_pixels] = {depth_value}. This list should be empty when passed to the function. - * Afterwards, it will contain the depth value of all pixels that were rendered to, per pose. + * @param[out] intersect_indices [pose_nr][0 - nr_relevant_pixels] = {pixel_nr}. This list should be empty when passed + * to the function. Afterwards, it will contain the pixel numbers of all pixels that were rendered to, per pose. Pixels + * that have a depth value of 0 will be ignored. + * @param[out] depth [pose_nr][0 - nr_relevant_pixels] = {depth_value}. This list should be empty when passed to the function. + * Afterwards, it will contain the depth value of all pixels that were rendered to, per pose. Pixels + * that have a depth value of 0 will be ignored. */ - void render(const std::vector > > states, - std::vector > &intersect_indices, - std::vector > &depth); - + void render(const std::vector > states, + std::vector > depth_values); /// render the objects in all given states into a texture that can then be accessed by CUDA /** This function renders all poses (of all objects) into one large texture, which can then be mapped into the CUDA @@ -63,77 +71,103 @@ class ObjectRasterizer * @param[in] states [pose_nr][object_nr][0 - 6] = {qw, qx, qy, qz, tx, ty, tz}. This should contain the quaternion * and the translation for each object per pose. */ - void render(const std::vector > > states); + void render(const std::vector > states); /// sets the objects that should be rendered. /** This function only needs to be called if any objects initially passed in the constructor should be left out when rendering. * @param[in] object_numbers [0 - nr_objects] = {object_nr}. This list should contain the indices of all objects that - * should be rendered when calling Render(). + * should be rendered when calling render(). For example, [0,1,4,5] will only render objects 0,1,4 and 5 (whose vertices + * were passed in the constructor). */ void set_objects(std::vector object_numbers); /// set a new resolution /** This function reallocates the framebuffer textures. This is expensive, so only do it if necessary. - * @param[in] n_rows the height of the image - * @param[in] n_cols the width of the image + * @param[in] n_rows the height of the image + * @param[in] n_cols the width of the image + * @param[out] nr_poses the new number of poses (only changed when adapting to constraints) + * @param[out] nr_poses_per_row the new number of poses per row + * @param[out] nr_poses_per_column the new number of poses per column + * @param[in] adapt_to_constraints whether to automatically adapt to GPU constraints or quit the program instead + */ + void set_resolution(const int n_rows, const int n_cols, int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column, const bool adapt_to_constraints = false); + + /// allocates memory on the GPU + /** Use this function to allocate memory for the maximum number of poses that you will need throughout the filtering. + * @param[in,out] allocated_poses number of poses for which space should be allocated. Might be changed by the function + * if there are space restrictions posed by OpenGL. + * @param[out] allocated_poses_per_row the number of poses that will be rendered per row of the texture + * @param[out] allocated_poses_per_column the number of poses that will be rendered per column of the texture + * @param[in] adapt_to_constraints whether to automatically adapt to GPU constraints or quit the program if constraints are not met */ - void set_resolution(const int n_rows, const int n_cols, int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column); - void allocate_textures_for_max_poses(int& allocated_poses, int& allocated_poses_per_row, - int& allocated_poses_per_column); - - void set_number_of_poses(const int nr_poses, int& nr_poses_per_row, int& nr_poses_per_column); - + int& allocated_poses_per_column, + const bool adapt_to_constraints = false); + + /// sets the number of poses that should be rendered in the next render call + /** Use this function previously to every render call if you need to change the amount of poses. + * @param[in/out] nr_poses amount of poses that should be rendered. Cannot exceed the maximum number of poses set with + * allocate_textures_for_max_poses(). Might be changed if adapt_to_constraints is activated. + * @param[out] nr_poses_per_row the number of poses that will be rendered per row of the texture + * @param[out] nr_poses_per_column the number of poses that will be rendered per column of the texture + * @param[in] adapt_to_constraints whether to automatically adapt to GPU constraints or quit the program instead + */ + void set_number_of_poses(int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column, const bool adapt_to_constraints = false); + /// returns the OpenGL framebuffer texture ID, which is needed for CUDA interoperation + /** Use this function to retrieve the texture ID and pass it to the cudaGraphicsGLRegisterImage call. + * @return The texture ID + */ GLuint get_framebuffer_texture(); - int get_nr_poses_per_row(); - void get_depth_values(std::vector > &intersect_indices, - std::vector > &depth); -private: - static constexpr float NEAR_PLANE = 0.4f; // Kinect does not see anything closer than 0.3 meters - static constexpr float FAR_PLANE = 4.0f; // Kinect does not see anything further away than 7 meters - static const int WINDOW_WIDTH = 80; // default values if not specified - static const int WINDOW_HEIGHT = 60; // default values if not specified + /// returns the rendered depth values of all poses + /** This function should only be used for debugging. It will be extremely slow. + * @return [pose_nr][0 - nr_pixels] = {depth value of that pixel} + */ + std::vector > get_depth_values(); +private: // GPU constraints GLint max_texture_size_; - // values initialized to WINDOW_WIDTH, WINDOW_HEIGHT in constructor. May be changed by user with set_resolution(). + // values initialized in constructor. May be changed by user with set_resolution(). int nr_rows_; int nr_cols_; - // number of poses to render + // values initialized in constructor. Cannot be changed afterwards. + float near_plane_; + float far_plane_; + + // actual number of poses to render (current frame) int nr_poses_; int nr_poses_per_row_; int nr_poses_per_column_; - // number of maximum poses + // maximum number of poses that can be rendered in one call int nr_max_poses_; int nr_max_poses_per_row_; int nr_max_poses_per_column_; - // needed for openGL time observation + // needed for OpenGL time measurement static const int NR_SUBROUTINES_TO_MEASURE = 4; GLuint time_query_[NR_SUBROUTINES_TO_MEASURE]; enum subroutines_to_measure { ATTACH_TEXTURE, CLEAR_SCREEN, RENDER, DETACH_TEXTURE}; std::vector strings_for_subroutines; - std::vector gpu_times_aggregate_; + std::vector time_measurement_; int nr_calls_; - bool initial_run_; - + bool initial_run_; // the first run should not count // the paths to the respective shaders std::string vertex_shader_path_; std::string fragment_shader_path_; - std::vector indices_list_; - // a list of all vertices of all objects + // lists of all vertices and indices of all objects std::vector vertices_list_; - std::vector vertices_per_object_; + std::vector indices_list_; std::vector indices_per_object_; std::vector start_position_; + // contains a list of object indices which should be rendered std::vector object_numbers_; @@ -149,12 +183,12 @@ class ObjectRasterizer // VAO, VBO and element arrays are needed to store the object meshes GLuint vertex_array_; // The vertex array contains the vertex and index buffers GLuint vertex_buffer_; // contains the vertices of the object meshes passed in the constructor - GLuint index_buffer_; // contains the indices of the object meshes passed in the constructor + GLuint index_buffer_; // contains the indices of the object meshes passed in the constructor - // PBO for copying results to CPU + // PBO for copying results to CPU for debugging GLuint result_buffer_; - // custom framebuffer and its textures for color and depth + // custom framebuffer and its textures for depth (for z-testing) and color (which also represents depth in our case) GLuint framebuffer_; GLuint framebuffer_texture_for_all_poses_; GLuint texture_for_z_testing; @@ -163,13 +197,11 @@ class ObjectRasterizer // ====================== PRIVATE FUNCTIONS ====================== // - void reallocate_buffers(); - // set up model-, view- and projection-matrix - Eigen::Matrix4f get_model_matrix(const std::vector state); + // set up view- and projection-matrix void setup_view_matrix(); - void setup_projection_matrix(const Eigen::Matrix3f camera_matrix); + void setup_projection_matrix(const Eigen::Matrix3f camera_matrix); Eigen::Matrix4f get_projection_matrix(float n, float f, float l, float r, float t, float b); // functions for time measurement @@ -179,7 +211,6 @@ class ObjectRasterizer // functions for error checking void check_GL_errors(const char *label); bool check_framebuffer_status(); - }; #endif // OBJECT_RASTERIZER_HPP diff --git a/include/dbot/util/traits.hpp b/include/dbot/util/traits.hpp index 7088faa..b084670 100644 --- a/include/dbot/util/traits.hpp +++ b/include/dbot/util/traits.hpp @@ -10,6 +10,7 @@ * License License (GNU GPL). A copy of the license can be found in the LICENSE * file distributed with this source code. */ + /** * @date 05/25/2014 * @author Jan Issac (jan.issac@gmail.com) diff --git a/src/dbot/model/observation/gpu/cuda_filter.cu b/src/dbot/model/observation/gpu/cuda_filter.cu index 01c2cdd..e55521e 100644 --- a/src/dbot/model/observation/gpu/cuda_filter.cu +++ b/src/dbot/model/observation/gpu/cuda_filter.cu @@ -1,8 +1,6 @@ /** @author Claudia Pfreundt */ -#define CHECK_ERRORS -//#define PROFILING_ACTIVE -//#define DEBUG_ON +#define DEBUG #define VECTOR_DIM 3 #define MATRIX_DIM 9 @@ -18,7 +16,6 @@ #include #include "cuda_gl_interop.h" -#include #include #include @@ -30,26 +27,25 @@ namespace fil // ====================== CUDA CONSTANT VALUES ======================= // -// used in propagate -// 1000 denotes the maximum number of objects -__constant__ float3 g_rot_center[1000]; -// sigmas.x == angle_sigma, sigmas.y == trans_sigma -__constant__ float2 g_sigmas; // used in propagateOcclusion -__constant__ float g_p_visible_occluded; -__constant__ float g_c; +__constant__ float g_p_occluded_occluded; +__constant__ float g_one_div_c_minus_one; __constant__ float g_log_c; + // used in prob -__constant__ float g_tail_weight; +__constant__ float g_one_minus_tail_weight; __constant__ float g_model_sigma; __constant__ float g_sigma_factor; -__constant__ float g_max_depth; +__constant__ float g_tail_weight_div_max_depth; __constant__ float g_exponential_rate; +__constant__ float g_one_div_sqrt_of_two; +__constant__ float g_one_div_sqrt_of_two_pi; + // used in compare -__constant__ float g_p_visible_init; +__constant__ float g_initial_occlusion_prob; // texture for OpenGL interop texture texture_reference; @@ -68,476 +64,66 @@ texture texture_reference; // ====================== DEVICE kernels - to be called by other kernels ======================= // // ============================================================================================= // -// ====================== MATRIX MANIPULATION FUNCTIONS ======================= // - -__device__ void multiply_matrices(float *A, float *B, float *C) { - float sum = 0; - for (int i = 0; i < VECTOR_DIM; i++) { // iterate through rows - for (int j = 0; j < VECTOR_DIM; j++) { // iterate through cols - for (int k = 0; k < VECTOR_DIM; k++) { - sum += A[i * VECTOR_DIM + k] * B[k * VECTOR_DIM + j]; - } - C[i * VECTOR_DIM + j] = sum; - sum = 0; - } - } -} - -__device__ float3 multiply_matrix_with_vector(float* M, float3 v) { - float result[3]; - float v_copy[3]; - v_copy[0] = v.x; v_copy[1] = v.y; v_copy[2] = v.z; - float sum = 0; - - for (int i = 0; i < VECTOR_DIM; i++) { - for (int j = 0; j < VECTOR_DIM; j++) { - sum += M[i * VECTOR_DIM + j] * v_copy[j]; - } - result[i] = sum; - sum = 0; - } - - return make_float3(result[0], result[1], result[2]); -} - -/* axis is defined as follows: 0 = x, 1 = y, 2 = z */ -__device__ void create_rotation_matrix(const float angle, const int axis, float *R) { - float cos_angle = cos(angle); - float sin_angle = sin(angle); - - if (axis == 0) { - R[0] = 1; R[1] = 0; R[2] = 0; - R[3] = 0; R[4] = cos_angle; R[5] = -sin_angle; - R[6] = 0; R[7] = sin_angle; R[8] = cos_angle; - } else if (axis == 1) { - R[0] = cos_angle; R[1] = 0; R[2] = sin_angle; - R[3] = 0; R[4] = 1; R[5] = 0; - R[6] = -sin_angle; R[7] = 0; R[8] = cos_angle; - } else if (axis == 2) { - R[0] = cos_angle; R[1] = -sin_angle; R[2] = 0; - R[3] = sin_angle; R[4] = cos_angle; R[5] = 0; - R[6] = 0; R[7] = 0; R[8] = 1; - } -} - -__device__ void transpose_matrix(float *A, float *T) { - T[0] = A[0]; - T[1] = A[3]; - T[2] = A[6]; - T[3] = A[1]; - T[4] = A[4]; - T[5] = A[7]; - T[6] = A[2]; - T[7] = A[5]; - T[8] = A[8]; -} - -// ====================== VECTOR MANIPULATION FUNCTIONS ======================= // - -__device__ float4 normalize(const float4 v) { - float4 v_n = v; - const float n = 1.0f/sqrt(v_n.x*v_n.x+v_n.y*v_n.y+v_n.z*v_n.z+v_n.w*v_n.w); - v_n.x *= n; - v_n.y *= n; - v_n.z *= n; - v_n.w *= n; - - return v_n; -} - -__device__ float3 operator+(const float3 &a, const float3 &b) { - - return make_float3(a.x+b.x, a.y+b.y, a.z+b.z); - -} - -__device__ float3 negate(const float3 &a) { - return make_float3(-a.x, -a.y, -a.z); -} - -// ======================= QUATERNION CONVERSIONS AND MANIPULATION FUNCTIONS ======================= // - -__device__ void quaternion_to_matrix(const float4 q_in, float *Q) { - float4 q = normalize(q_in); - Q[0] = 1.0f - 2.0f*q.y*q.y - 2.0f*q.z*q.z; Q[1] = 2.0f*q.x*q.y - 2.0f*q.z*q.w; Q[2] = 2.0f*q.x*q.z + 2.0f*q.y*q.w; - Q[3] = 2.0f*q.x*q.y + 2.0f*q.z*q.w; Q[4] = 1.0f - 2.0f*q.x*q.x - 2.0f*q.z*q.z; Q[5] = 2.0f*q.y*q.z - 2.0f*q.x*q.w; - Q[6] = 2.0f*q.x*q.z - 2.0f*q.y*q.w; Q[7] = 2.0f*q.y*q.z + 2.0f*q.x*q.w; Q[8] = 1.0f - 2.0f*q.x*q.x - 2.0f*q.y*q.y; -} - -__device__ float4 matrix_to_quaternion(float *Q) { - float4 q; - - q.w = sqrtf( fmaxf( 0, 1 + Q[0] + Q[4] + Q[8] ) ) / 2; - q.x = sqrtf( fmaxf( 0, 1 + Q[0] - Q[4] - Q[8] ) ) / 2; - q.y = sqrtf( fmaxf( 0, 1 - Q[0] + Q[4] - Q[8] ) ) / 2; - q.z = sqrtf( fmaxf( 0, 1 - Q[0] - Q[4] + Q[8] ) ) / 2; - if (( q.x * ( Q[7] - Q[5] ) ) < 0) { - q.x = -q.x; - } - if (( q.y * ( Q[2] - Q[6] ) ) < 0) { - q.y = -q.y; - } - if (( q.z * ( Q[3] - Q[1] ) ) < 0) { - q.z = -q.z; - } - - return q; -} - -__device__ float4 multiply_quaternions(float4 q1, float4 q2) { - float w = (q1.w * q2.w) - (q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z); - float x = (q1.w * q2.x) + (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y); - float y = (q1.w * q2.y) - (q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x); - float z = (q1.w * q2.z) + (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w); - - return make_float4(x, y, z, w); -} - - // ======================= helper functions for compare (observation model) ======================= // + __device__ float propagate_occlusion(float initial_p_source, float time) { if (isnan(time)) { return initial_p_source; } - float pow_c_time = exp(time*g_log_c); - return (float) pow_c_time*initial_p_source + g_p_visible_occluded*(pow_c_time-1.)/(g_c-1.); + float pow_c_time = __expf(time * g_log_c); + return 1 - (pow_c_time * (1 - initial_p_source) + (1. - g_p_occluded_occluded) * (pow_c_time - 1.) * g_one_div_c_minus_one); } -__device__ float prob(float observation, float prediction, bool visible) +__device__ float prob(float observation, float prediction, bool occluded) { - // todo: if the prediction is infinite, the prob should not depend on visibility. it does not matter + // todo: if the prediction is infinite, the prob should not depend on occlusion. it does not matter // for the algorithm right now, but it should be changed - float sigma = g_model_sigma + g_sigma_factor*observation*observation; - if(visible) + float sigma = g_model_sigma + g_sigma_factor * observation * observation; + float sigma_sq = sigma * sigma; + + if(!occluded) { if(isinf(prediction)) // if the prediction is infinite we return the limit - return g_tail_weight/g_max_depth; - else - return g_tail_weight/g_max_depth - + (1 - g_tail_weight)*expf(-(powf(prediction-observation,2)/(2*sigma*sigma))) - / (sqrtf(2*M_PI) *sigma); + return g_tail_weight_div_max_depth; + else { + float pred_minus_obs = prediction - observation; + return g_tail_weight_div_max_depth + + __fdividef(g_one_minus_tail_weight * __expf(- __fdividef(pred_minus_obs * pred_minus_obs, (2 * sigma_sq))) + * g_one_div_sqrt_of_two_pi, sigma); + } } else { if(isinf(prediction)) // if the prediction is infinite we return the limit - return g_tail_weight/g_max_depth + - (1-g_tail_weight)*g_exponential_rate* - expf(0.5*g_exponential_rate*(-2*observation + g_exponential_rate*sigma*sigma)); + return g_tail_weight_div_max_depth + + g_one_minus_tail_weight * g_exponential_rate * + __expf(0.5 * g_exponential_rate * (-2 * observation + g_exponential_rate * sigma_sq)); else - return g_tail_weight/g_max_depth + - (1-g_tail_weight)*g_exponential_rate* - expf(0.5*g_exponential_rate*(2*prediction-2*observation + g_exponential_rate*sigma*sigma)) - *(1+erff((prediction-observation+g_exponential_rate*sigma*sigma)/(sqrtf(2)*sigma))) - /(2*(expf(prediction*g_exponential_rate)-1)); + return g_tail_weight_div_max_depth + + g_one_minus_tail_weight * g_exponential_rate * + __expf(0.5 * g_exponential_rate * (2 * (prediction - observation) + g_exponential_rate * sigma_sq)) + * __fdividef((1 + erff(__fdividef((prediction - observation + g_exponential_rate * sigma_sq) * g_one_div_sqrt_of_two, sigma))), + (2 * (__expf(prediction * g_exponential_rate) - 1))); } } - - // ============================================================================================= // // ========================= GLOBAL kernels - to be called by CPU code ========================= // // ============================================================================================= // -__global__ void setup_number_generators_kernel(int current_time, curandStateMRG32k3a *mrg_state, int n_poses) -{ - int id = blockIdx.x * blockDim.x + threadIdx.x; - if (id < n_poses) { - /* Each thread gets same seed, a different sequence number, no offset */ - curand_init(current_time, id, 0, &mrg_state[id]); - } -} - - -__global__ void propagate_kernel(float *states, int n_states, int states_size, float delta_time, curandStateMRG32k3a *mrg_state) -{ - int id = blockIdx.x * blockDim.x + threadIdx.x; - if (id < n_states) { - - /* Copy sigmas from constant memory into local register */ - float2 local_sigmas = g_sigmas; - - /* Copy mrg_state from global memory into local register */ - curandStateMRG32k3a local_mrg_state = mrg_state[id]; - - for (int i = 0; i < states_size / 7; i++) { - - int states_index = id * states_size + i * 7; - - /* Copy rot_center from constant memory into local register */ - float3 local_rot_center = g_rot_center[i]; - - /* TODO coalesce accesses? Does it do it automatically or do I manually have to store them as - * float3 and float4 values? */ - /* quaternion stored as (w,x,y,z), but make_float4 takes (x,y,z,w) */ - float4 q_init_vector = make_float4(states[states_index + 1], states[states_index + 2], states[states_index + 3], states[states_index]); - float3 t_init = make_float3(states[states_index + 4], states[states_index + 5], states[states_index + 6]); - - float angle_x, angle_y, angle_z; - float trans_x, trans_y, trans_z; - - angle_x = curand_normal(&local_mrg_state) * delta_time * local_sigmas.x; - angle_y = curand_normal(&local_mrg_state) * delta_time * local_sigmas.x; - angle_z = curand_normal(&local_mrg_state) * delta_time * local_sigmas.x; - trans_x = curand_normal(&local_mrg_state) * delta_time * local_sigmas.y; - trans_y = curand_normal(&local_mrg_state) * delta_time * local_sigmas.y; - trans_z = curand_normal(&local_mrg_state) * delta_time * local_sigmas.y; - - - float q_rand_matrix[MATRIX_DIM]; - float q_init_matrix[MATRIX_DIM]; - - float rot_matrix_x[MATRIX_DIM]; - float rot_matrix_y[MATRIX_DIM]; - float rot_matrix_z[MATRIX_DIM]; - - float tmp_matrix[MATRIX_DIM]; - - - float3 t_rand = make_float3(trans_x, trans_y, trans_z); - - create_rotation_matrix(angle_x, 0, rot_matrix_x); - create_rotation_matrix(angle_y, 1, rot_matrix_y); - create_rotation_matrix(angle_z, 2, rot_matrix_z); - - multiply_matrices(rot_matrix_y, rot_matrix_z, tmp_matrix); - multiply_matrices(rot_matrix_x, tmp_matrix, q_rand_matrix); - - float4 q_rand_vector = matrix_to_quaternion(q_rand_matrix); - - quaternion_to_matrix(q_init_vector, q_init_matrix); - - float3 t = negate(multiply_matrix_with_vector(q_init_matrix, multiply_matrix_with_vector(q_rand_matrix, local_rot_center))) - + multiply_matrix_with_vector(q_init_matrix, local_rot_center) - + t_init - + t_rand; - - float4 q = multiply_quaternions(q_init_vector, q_rand_vector); - q = normalize(q); - - /* write state back into global memory */ - states[states_index] = q.w; - states[states_index + 1] = q.x; - states[states_index + 2] = q.y; - states[states_index + 3] = q.z; - - states[states_index + 4] = t.x; - states[states_index + 5] = t.y; - states[states_index + 6] = t.z; - } - - /* Copy mrg state back to global memory */ - mrg_state[id] = local_mrg_state; - } -} - - - - - - -__global__ void propagate_multiple_kernel(float *states, int n_states, int n_objects, int states_size, float delta_time, curandStateMRG32k3a *mrg_state) -{ - int id = blockIdx.x * blockDim.x + threadIdx.x; - if (id < n_states) { - - /* Copy sigmas from constant memory into local register */ - float2 local_sigmas = g_sigmas; - - /* Copy mrg_state from global memory into local register */ - curandStateMRG32k3a local_mrg_state = mrg_state[id]; - - for (int i = 0; i < n_objects; i++) { - - int states_index = id * n_objects * states_size + i * states_size; - - /* Copy rot_center from constant memory into local register */ - float3 local_rot_center = g_rot_center[i]; - - /* TODO coalesce accesses? Does it do it automatically or do I manually have to store them as - * float3 and float4 values? */ - /* quaternion stored as (w,x,y,z), but make_float4 takes (x,y,z,w) */ - float4 q_init_vector = make_float4(states[states_index + 1], states[states_index + 2], states[states_index + 3], states[states_index]); - float3 t_init = make_float3(states[states_index + 4], states[states_index + 5], states[states_index + 6]); - - float angle_x, angle_y, angle_z; - float trans_x, trans_y, trans_z; - - // WARNING: same random number for all states, because mrg_state is the same.. - angle_x = curand_normal(&local_mrg_state) * delta_time * local_sigmas.x; - angle_y = curand_normal(&local_mrg_state) * delta_time * local_sigmas.x; - angle_z = curand_normal(&local_mrg_state) * delta_time * local_sigmas.x; - trans_x = curand_normal(&local_mrg_state) * delta_time * local_sigmas.y; - trans_y = curand_normal(&local_mrg_state) * delta_time * local_sigmas.y; - trans_z = curand_normal(&local_mrg_state) * delta_time * local_sigmas.y; - - - float q_rand_matrix[MATRIX_DIM]; - float q_init_matrix[MATRIX_DIM]; - - float rot_matrix_x[MATRIX_DIM]; - float rot_matrix_y[MATRIX_DIM]; - float rot_matrix_z[MATRIX_DIM]; - - float tmp_matrix[MATRIX_DIM]; - - - float3 t_rand = make_float3(trans_x, trans_y, trans_z); - - create_rotation_matrix(angle_x, 0, rot_matrix_x); - create_rotation_matrix(angle_y, 1, rot_matrix_y); - create_rotation_matrix(angle_z, 2, rot_matrix_z); - - multiply_matrices(rot_matrix_y, rot_matrix_z, tmp_matrix); - multiply_matrices(rot_matrix_x, tmp_matrix, q_rand_matrix); - - float4 q_rand_vector = matrix_to_quaternion(q_rand_matrix); - - quaternion_to_matrix(q_init_vector, q_init_matrix); - - float3 t = negate(multiply_matrix_with_vector(q_init_matrix, multiply_matrix_with_vector(q_rand_matrix, local_rot_center))) - + multiply_matrix_with_vector(q_init_matrix, local_rot_center) - + t_init - + t_rand; - - float4 q = multiply_quaternions(q_init_vector, q_rand_vector); - q = normalize(q); - - /* write state back into global memory */ - states[states_index] = q.w; - states[states_index + 1] = q.x; - states[states_index + 2] = q.y; - states[states_index + 3] = q.z; - - states[states_index + 4] = t.x; - states[states_index + 5] = t.y; - states[states_index + 6] = t.z; - } - - /* Copy mrg state back to global memory */ - mrg_state[id] = local_mrg_state; - } -} - - - - - - - - - - - - -__global__ void compare_kernel(float *observations, float* visibility_probs, int n_pixels_per_pose, - bool constant_occlusion, float *d_log_likelihoods, float delta_time, int n_poses, int n_rows, int n_cols) { - int block_id = blockIdx.x + blockIdx.y * gridDim.x; - if (block_id < n_poses) { - - int pixel_nr = threadIdx.x; -// int pixel_nr = threadIdx.x * ceilf(n_pixels_per_pose / blockDim.x); - int global_index = block_id * n_pixels_per_pose + pixel_nr; - - // OpenGL contructs the texture so that the left lower edge is (0,0), but our observations texture - // has its (0,0) in the upper left corner, so we need to reverse the reads from the OpenGL texture. - float texture_array_index_x = blockIdx.x * n_cols + pixel_nr % n_cols; - float texture_array_index_y = gridDim.y * n_rows - (blockIdx.y * n_rows + pixel_nr / n_cols + 1); - - float depth; - float observed_depth; - float visibility_prob = g_p_visible_init; - float local_sum_of_likelihoods = 0; - float p_obsIpred_vis, p_obsIpred_occl, p_obsIinf; - - // TODO: uninitialized? - __shared__ float log_likelihoods; - - if (threadIdx.x == 0) { - log_likelihoods = 0; - } - - __syncthreads(); - - while (pixel_nr < n_pixels_per_pose ) { //&& pixel_nr < (threadIdx.x + 1) * ceilf(n_pixels_per_pose / blockDim.x)) { - - depth = tex2D(texture_reference, texture_array_index_x, texture_array_index_y); - observed_depth = observations[pixel_nr]; - - // TODO either this, or only write the values back for pixels with depth value == 1. - // Could save some data transfer time, but will cost more execution time, since all - // the threads in one warp will have to wait for the else-branch to finish - if (!constant_occlusion) { - visibility_prob = propagate_occlusion(visibility_probs[global_index], delta_time); - visibility_probs[global_index] = visibility_prob; - } -// if (!constant_occlusion) { -// visibility_prob = propagateOcclusion(visibility_probs[global_index], delta_time); -// } - - //TODO slow: 4800 threads have to go through this whole if instruction - if (depth != 0 && !isnan(observed_depth)) { - - // prob of observation given prediction, knowing that the object is visible - p_obsIpred_vis = prob(observed_depth, depth, true) * visibility_prob; - // prob of observation given prediction, knowing that the object is occluded - p_obsIpred_occl = prob(observed_depth, depth, false) * (1-visibility_prob); - // prob of observation given no intersection - p_obsIinf = prob(observed_depth, CUDART_INF_F, false); - - local_sum_of_likelihoods += logf((p_obsIpred_vis + p_obsIpred_occl)/p_obsIinf); - - if(!constant_occlusion) { // we check if we are tracking the visibilities - // we update the visibility (occlusion) with the observations - visibility_probs[global_index] = p_obsIpred_vis/(p_obsIpred_vis + p_obsIpred_occl); - } -// if (!constant_occlusion) { -// visibility_prob = p_obsIpred_vis/(p_obsIpred_vis + p_obsIpred_occl); -// } - } - -// if (!constant_occlusion) { -// visibility_probs[global_index] = visibility_prob; -// } - - pixel_nr += blockDim.x; -// pixel_nr += 1; - global_index = block_id * n_pixels_per_pose + pixel_nr; - texture_array_index_x = blockIdx.x * n_cols + pixel_nr % n_cols; - texture_array_index_y = gridDim.y * n_rows - (blockIdx.y * n_rows + pixel_nr / n_cols + 1); - } - - // TODO: will execute blockDim.x sequential writes to log_likelihoods - // instead could do a manual reduction after syncthreads - atomicAdd(&log_likelihoods, local_sum_of_likelihoods); - - __syncthreads(); - - if (threadIdx.x == 0) { - d_log_likelihoods[block_id] = log_likelihoods; - } - } else { - __syncthreads(); - } - -} - - - - - -__global__ void compare_multiple_kernel(float *observations, float* old_visibility_probs, float* new_visibility_probs, int* occlusion_image_indices, int nr_pixels, - float *d_log_likelihoods, float delta_time, int n_poses, int n_rows, int n_cols, bool update) { +__global__ void evaluate_kernel(float *observations, float* old_occlusion_probs, float* new_occlusion_probs, int* occlusion_image_indices, int nr_pixels, + float *d_log_likelihoods, float delta_time, int n_poses, int n_rows, int n_cols, bool update_occlusions) { int block_id = blockIdx.x + blockIdx.y * gridDim.x; if (block_id < n_poses) { @@ -547,11 +133,11 @@ __global__ void compare_multiple_kernel(float *observations, float* old_visibili // has its (0,0) in the upper left corner, so we need to reverse the reads from the OpenGL texture. float texture_array_index_x = blockIdx.x * n_cols + pixel_nr % n_cols; - float texture_array_index_y = gridDim.y * n_rows - 1 - (blockIdx.y * n_rows + pixel_nr / n_cols); + float texture_array_index_y = gridDim.y * n_rows - 1 - (blockIdx.y * n_rows + __fdividef(pixel_nr, n_cols)); float depth; float observed_depth; - float visibility_prob = g_p_visible_init; + float occlusion_prob = g_initial_occlusion_prob; float local_sum_of_likelihoods = 0; float p_obsIpred_vis, p_obsIpred_occl, p_obsIinf; @@ -565,21 +151,21 @@ __global__ void compare_multiple_kernel(float *observations, float* old_visibili __syncthreads(); - float* visibility_probs = old_visibility_probs; + float* occlusion_probs = old_occlusion_probs; int occlusion_pixel_index= occlusion_image_index * nr_pixels + pixel_nr; - if (update) { - // copy / duplicate visibility probabilities from the old particles - int index_from_visibility = occlusion_image_indices[block_id] * nr_pixels; - int index_to_visibility = block_id * nr_pixels; + if (update_occlusions) { + // copy occlusion probabilities from the old particles + int index_from_occlusion = occlusion_image_indices[block_id] * nr_pixels; + int index_to_occlusion = block_id * nr_pixels; while (pixel_nr < nr_pixels) { - new_visibility_probs[index_to_visibility + pixel_nr] = old_visibility_probs[index_from_visibility + pixel_nr]; + new_occlusion_probs[index_to_occlusion + pixel_nr] = old_occlusion_probs[index_from_occlusion + pixel_nr]; pixel_nr += blockDim.x; } - // change visibility prob array to the new one and change the global index - visibility_probs = new_visibility_probs; + // change occlusion prob array to the new one and change the global index + occlusion_probs = new_occlusion_probs; // reset pixel_nr for following loop pixel_nr = threadIdx.x; @@ -592,41 +178,34 @@ __global__ void compare_multiple_kernel(float *observations, float* old_visibili depth = tex2D(texture_reference, texture_array_index_x, texture_array_index_y); observed_depth = observations[pixel_nr]; - // TODO either this, or only write the values back for pixels with depth value == 1. - // Could save some data transfer time, but will cost more execution time, since all - // the threads in one warp will have to wait for the else-branch to finish - - visibility_prob = propagate_occlusion(visibility_probs[occlusion_pixel_index], delta_time); - if (update) visibility_probs[occlusion_pixel_index] = visibility_prob; + occlusion_prob = propagate_occlusion(occlusion_probs[occlusion_pixel_index], delta_time); + if (update_occlusions) occlusion_probs[occlusion_pixel_index] = occlusion_prob; if (depth != 0 && !isnan(observed_depth)) { - - // prob of observation given prediction, knowing that the object is visible - p_obsIpred_vis = prob(observed_depth, depth, true) * visibility_prob; + // prob of observation given prediction, knowing that the object is not occluded + p_obsIpred_vis = prob(observed_depth, depth, false) * (1 - occlusion_prob); // prob of observation given prediction, knowing that the object is occluded - p_obsIpred_occl = prob(observed_depth, depth, false) * (1-visibility_prob); + p_obsIpred_occl = prob(observed_depth, depth, true) * occlusion_prob; // prob of observation given no intersection - p_obsIinf = prob(observed_depth, CUDART_INF_F, false); + p_obsIinf = prob(observed_depth, CUDART_INF_F, true); - local_sum_of_likelihoods += logf((p_obsIpred_vis + p_obsIpred_occl)/p_obsIinf); + local_sum_of_likelihoods += __logf(__fdividef((p_obsIpred_vis + p_obsIpred_occl), p_obsIinf)); - if(update) { - // we update the visibility (occlusion) probability with the observations - visibility_probs[occlusion_pixel_index] = p_obsIpred_vis/(p_obsIpred_vis + p_obsIpred_occl); + if(update_occlusions) { + // we update the occlusion probability with the observations + occlusion_probs[occlusion_pixel_index] = 1 - __fdividef(p_obsIpred_vis, (p_obsIpred_vis + p_obsIpred_occl)); } } pixel_nr += blockDim.x; occlusion_pixel_index += blockDim.x; texture_array_index_x = blockIdx.x * n_cols + pixel_nr % n_cols; - texture_array_index_y = gridDim.y * n_rows - (blockIdx.y * n_rows + pixel_nr / n_cols + 1); + texture_array_index_y = gridDim.y * n_rows - (blockIdx.y * n_rows + (pixel_nr / n_cols) + 1); } - // TODO: will execute blockDim.x sequential writes to log_likelihoods - // instead could do a manual reduction after syncthreads atomicAdd(&log_likelihoods, local_sum_of_likelihoods); __syncthreads(); @@ -645,51 +224,6 @@ __global__ void compare_multiple_kernel(float *observations, float* old_visibili - -__global__ void resample_kernel(float *visibility_probs, - float *visibility_probs_copy, - float *states, - float *states_copy, - int *resampling_indices, - int nr_pixels, - int nr_features) { - - int pixel_nr = threadIdx.x; - int feature_nr = threadIdx.x; - int index_from_visibility = resampling_indices[blockIdx.x] * nr_pixels; - int index_to_visibility = blockIdx.x * nr_pixels; - int index_from_states = resampling_indices[blockIdx.x] * nr_features; - int index_to_states = blockIdx.x * nr_features; - - - while (pixel_nr < nr_pixels) { - visibility_probs_copy[index_to_visibility + pixel_nr] = visibility_probs[index_from_visibility + pixel_nr]; - pixel_nr += blockDim.x; - } - while (feature_nr < nr_features) { - states_copy[index_to_states + feature_nr] = states[index_from_states + feature_nr]; - feature_nr += blockDim.x; - } -} - - -__global__ void resample_multiple_kernel(float *visibility_probs, - float *visibility_probs_copy, - int *resampling_indices, - int nr_pixels) { - - int pixel_nr = threadIdx.x; - int index_from_visibility = resampling_indices[blockIdx.x] * nr_pixels; - int index_to_visibility = blockIdx.x * nr_pixels; - - while (pixel_nr < nr_pixels) { - visibility_probs_copy[index_to_visibility + pixel_nr] = visibility_probs[index_from_visibility + pixel_nr]; - pixel_nr += blockDim.x; - } -} - - - // ************************************************************************************** // // ************************************************************************************** // // ========================== CUDA_FILTER MEMBER FUNCTIONS ============================== // @@ -697,10 +231,11 @@ __global__ void resample_multiple_kernel(float *visibility_probs, // ************************************************************************************** // -CudaFilter::CudaFilter() : - n_cols_(WINDOW_WIDTH), - n_rows_(WINDOW_HEIGHT), - n_poses_set_(false) +CudaFilter::CudaFilter(const int nr_rows, + const int nr_cols) : + + nr_rows_(nr_rows), + nr_cols_(nr_cols) { cudaDeviceProp props; @@ -710,14 +245,18 @@ CudaFilter::CudaFilter() : props.major = 2; props.minor = 0; cudaChooseDevice( &device_number, &props ); - check_cuda_error("No device with compute capability > 2.0 found"); + #ifdef DEBUG + check_cuda_error("No device with compute capability > 2.0 found"); + #endif - /* tell CUDA which device we will be using for graphic interop - * requires that the CUDA device be specified by + /* tell CUDA which device we will be using for graphic interop. + * Requires that the CUDA device be specified by * cudaGLSetGLDevice() before any other runtime calls. */ cudaGLSetGLDevice( device_number ); - check_cuda_error("cudaGLsetGLDevice"); + #ifdef DEBUG + check_cuda_error("cudaGLsetGLDevice"); + #endif cudaGetDeviceProperties(&props, device_number); // we will run the program only on one graphics card, the first one we can find = 0 warp_size_ = props.warpSize; // equals 32 for all current graphics cards, but might change in the future @@ -725,606 +264,308 @@ CudaFilter::CudaFilter() : cuda_device_properties_ = props; - cout << "Your device has the following properties: " << endl - << "CUDA Version: " << props.major << "." << props.minor << endl - << "Number of multiprocessors: " << n_mps_ << endl - << "Warp size: " << warp_size_ << endl; + #ifdef DEBUG + cout << "Your device has the following properties: " << endl + << "CUDA Version: " << props.major << "." << props.minor << endl + << "Number of multiprocessors: " << n_mps_ << endl + << "Warp size: " << warp_size_ << endl; + #endif /* each multiprocessor has various KB of memory (64 KB for the GTX 560 Ti 448) which can be subdivided * into L1 cache or shared memory. If you don't need a lot of shared memory set this to prefer L1. */ cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); - d_states_ = NULL; - d_states_copy_ = NULL; - d_visibility_probs_ = NULL; - d_visibility_probs_copy_ = NULL; + d_occlusion_probs_ = NULL; + d_occlusion_probs_copy_ = NULL; d_observations_ = NULL; d_log_likelihoods_ = NULL; - d_mrg_states_ = NULL; - d_resampling_indices_ = NULL; - d_prev_sample_indices_ = NULL; + d_occlusion_indices_ = NULL; } -void CudaFilter::init(vector > com_models, float angle_sigma, float trans_sigma, - float p_visible_init, float c, float log_c, float p_visible_occluded, - float tail_weight, float model_sigma, float sigma_factor, float max_depth, float exponential_rate) { + + +void CudaFilter::init(const float initial_occlusion_prob, const float p_occluded_occluded, const float p_occluded_visible, + const float tail_weight, const float model_sigma, const float sigma_factor, const float max_depth, const float exponential_rate) { occlusion_time_ = 0; - last_propagation_time_ = 0; - count_ = 0; - compare_kernel_time_ = 0; - copy_likelihoods_time_ = 0; - visibility_prob_default_ = p_visible_init; + occlusion_prob_default_ = initial_occlusion_prob; - float2 local_sigmas = make_float2(angle_sigma, trans_sigma); + // precompute constants that are used in high-performance kernels later + float c = p_occluded_occluded - p_occluded_visible; + float tail_weight_div_max_depth = tail_weight / max_depth; + float one_minus_tail_weight = 1.0f - tail_weight; + float one_div_c_minus_one = 1.0f / (c - 1.0f); + float one_div_sqrt_of_two = 1.0f / sqrt(2); + float one_div_sqrt_of_two_pi = 1.0f / sqrt(2 * M_PI); + float log_c = log(c); - allocate(d_observations_, n_cols_ * n_rows_ * sizeof(float), "d_observations"); -// allocate(d_log_likelihoods_, sizeof(float) * n_poses_, "d_log_likelihoods"); - // TODO don't allocate here!! only when setting resolution! -// allocate(d_prev_sample_indices_, sizeof(int) * n_poses_, "d_prev_sample_indices"); -// allocate(d_resampling_indices_, sizeof(int) * n_poses_, "d_resampling_indices"); + allocate(d_observations_, nr_cols_ * nr_rows_ * sizeof(float)); + // initialize log likelihoods with 0 cudaMemset(d_log_likelihoods_, 0, sizeof(float) * nr_poses_); - #ifdef CHECK_ERRORS + #ifdef DEBUG check_cuda_error("cudaMemset d_log_likelihoods"); #endif - cudaMemcpyToSymbol(g_sigmas, &local_sigmas, sizeof(float2), 0, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpyToSymbol local_sigmas -> sigmas"); + // copy constants to GPU memory + cudaMemcpyToSymbol(g_initial_occlusion_prob, &initial_occlusion_prob, sizeof(float), 0, cudaMemcpyHostToDevice); + #ifdef DEBUG + check_cuda_error("cudaMemcpyToSymbol initial_occlusion_prob -> g_initial_occlusion_prob"); #endif - vector com_models_raw; - for (int i = 0; i < com_models.size(); i++) { - com_models_raw.push_back(make_float3(com_models[i][0], com_models[i][1], com_models[i][2])); - } - - cudaMemcpyToSymbol(g_rot_center, com_models_raw.data(), com_models_raw.size() * sizeof(float3), 0, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpyToSymbol com_model -> rot_center"); + cudaMemcpyToSymbol(g_one_div_c_minus_one, &one_div_c_minus_one, sizeof(float), 0, cudaMemcpyHostToDevice); + #ifdef DEBUG + check_cuda_error("cudaMemcpyToSymbol one_div_c_minus_one -> g_one_div_c_minus_one"); #endif - cudaMemcpyToSymbol(g_p_visible_init, &p_visible_init, sizeof(float), 0, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpyToSymbol p_visible_init -> g_p_visible_init"); + cudaMemcpyToSymbol(g_one_div_sqrt_of_two, &one_div_sqrt_of_two, sizeof(float), 0, cudaMemcpyHostToDevice); + #ifdef DEBUG + check_cuda_error("cudaMemcpyToSymbol one_div_sqrt_of_two -> g_one_div_sqrt_of_two"); #endif - cudaMemcpyToSymbol(g_c, &c, sizeof(float), 0, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpyToSymbol c -> g_c"); + cudaMemcpyToSymbol(g_one_div_sqrt_of_two_pi, &one_div_sqrt_of_two_pi, sizeof(float), 0, cudaMemcpyHostToDevice); + #ifdef DEBUG + check_cuda_error("cudaMemcpyToSymbol one_div_sqrt_of_two_pi -> g_one_div_sqrt_of_two_pi"); #endif cudaMemcpyToSymbol(g_log_c, &log_c, sizeof(float), 0, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS + #ifdef DEBUG check_cuda_error("cudaMemcpyToSymbol log_c -> g_log_c"); #endif - cudaMemcpyToSymbol(g_p_visible_occluded, &p_visible_occluded, sizeof(float), 0, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpyToSymbol p_visible_occluded -> g_p_visible_occluded"); + cudaMemcpyToSymbol(g_p_occluded_occluded, &p_occluded_occluded, sizeof(float), 0, cudaMemcpyHostToDevice); + #ifdef DEBUG + check_cuda_error("cudaMemcpyToSymbol p_occluded_occluded -> g_p_occluded_occluded"); #endif - cudaMemcpyToSymbol(g_tail_weight, &tail_weight, sizeof(float), 0, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpyToSymbol tail_weight -> g_tail_weight"); + cudaMemcpyToSymbol(g_one_minus_tail_weight, &one_minus_tail_weight, sizeof(float), 0, cudaMemcpyHostToDevice); + #ifdef DEBUG + check_cuda_error("cudaMemcpyToSymbol one_minus_tail_weight -> g_one_minus_tail_weight"); #endif cudaMemcpyToSymbol(g_model_sigma, &model_sigma, sizeof(float), 0, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS + #ifdef DEBUG check_cuda_error("cudaMemcpyToSymbol model_sigma -> g_model_sigma"); #endif cudaMemcpyToSymbol(g_sigma_factor, &sigma_factor, sizeof(float), 0, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS + #ifdef DEBUG check_cuda_error("cudaMemcpyToSymbol sigma_factor -> g_sigma_factor"); #endif - cudaMemcpyToSymbol(g_max_depth, &max_depth, sizeof(float), 0, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpyToSymbol max_depth -> g_max_depth"); + cudaMemcpyToSymbol(g_tail_weight_div_max_depth, &tail_weight_div_max_depth, sizeof(float), 0, cudaMemcpyHostToDevice); + #ifdef DEBUG + check_cuda_error("cudaMemcpyToSymbol tail_weight_div_max_depth -> g_tail_weight_div_max_depth"); #endif cudaMemcpyToSymbol(g_exponential_rate, &exponential_rate, sizeof(float), 0, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS + #ifdef DEBUG check_cuda_error("cudaMemcpyToSymbol exponential_rate -> g_exponential_rate"); #endif -} - - -void CudaFilter::propagate(const float ¤t_time, vector > &states) -{ - - - float delta_time = current_time - last_propagation_time_; - last_propagation_time_ = current_time; - - - propagate_kernel <<< n_blocks_, nr_threads_ >>> (d_states_, nr_poses_, n_features_, delta_time, d_mrg_states_); - #ifdef CHECK_ERRORS - check_cuda_error("propagate kernel call"); - #endif - - - - // TODO necessary? Remove for performance? - cudaDeviceSynchronize(); - #ifdef CHECK_ERRORS - check_cuda_error("cudaDeviceSynchronize propagate"); - #endif - - - - float *states_raw = (float*) malloc(nr_poses_ * n_features_ * sizeof(float)); - cudaMemcpy(states_raw, d_states_, nr_poses_ * n_features_ * sizeof(float), cudaMemcpyDeviceToHost); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpy d_states -> states"); - #endif - - for (int i = 0; i < nr_poses_; i++) { - for (int j = 0; j < n_features_; j++) { - states[i][j] = states_raw[i * n_features_ + j]; - } - } + constants_initialized_ = true; } -void CudaFilter::propagate_multiple(const float ¤t_time, vector > > &states) -{ - - float delta_time = current_time - last_propagation_time_; - last_propagation_time_ = current_time; - - int n_objects = states[0].size(); - - float *states_raw = (float*) malloc(nr_poses_ * n_objects * n_features_ * sizeof(float)); - for (int i = 0; i < nr_poses_; i++) { - for (int j = 0; j < n_objects; j++) { - for (int k = 0; k < n_features_; k++) { - states_raw[(i * n_objects *n_features_) + j * n_features_ + k] = states[i][j][k]; - } - } - } - - - cudaMemcpy(d_states_, states_raw, nr_poses_ * n_objects * n_features_ * sizeof(float), cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpy states -> d_states"); - #endif - - - propagate_multiple_kernel <<< n_blocks_, nr_threads_ >>> (d_states_, nr_poses_, n_objects, n_features_, delta_time, d_mrg_states_); - #ifdef CHECK_ERRORS - check_cuda_error("propagate kernel call"); - #endif - - - - // TODO necessary? Remove for performance? - cudaDeviceSynchronize(); - #ifdef CHECK_ERRORS - check_cuda_error("cudaDeviceSynchronize propagate"); - #endif +void CudaFilter::weigh_poses(const bool update_occlusions, vector &log_likelihoods) { + if (observations_set_ && occlusion_indices_set_ && occlusion_probabilities_set_ + && memory_allocated_ && number_of_poses_set_ && constants_initialized_) { + double delta_time = observation_time_ - occlusion_time_; + if(update_occlusions) occlusion_time_ = observation_time_; - cudaMemcpy(states_raw, d_states_, nr_poses_ * n_objects * n_features_ * sizeof(float), cudaMemcpyDeviceToHost); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpy d_states -> states"); - #endif + evaluate_kernel <<< grid_dimension_, nr_threads_ >>> (d_observations_, d_occlusion_probs_, d_occlusion_probs_copy_, d_occlusion_indices_, nr_cols_ * nr_rows_, + d_log_likelihoods_, delta_time, nr_poses_, nr_rows_, nr_cols_, update_occlusions); + #ifdef DEBUG + check_cuda_error("compare kernel call"); + #endif + cudaDeviceSynchronize(); + #ifdef DEBUG + check_cuda_error("cudaDeviceSynchronize compare_multiple"); + #endif - for (int i = 0; i < nr_poses_; i++) { - for (int j = 0; j < n_objects; j++) { - for (int k = 0; k < n_features_; k++) { - states[i][j][k] = states_raw[(i * n_objects *n_features_) + j * n_features_ + k]; - } + // switch to new / copied occlusion probabilities + if (update_occlusions) { + float *tmp_pointer; + tmp_pointer = d_occlusion_probs_; + d_occlusion_probs_ = d_occlusion_probs_copy_; + d_occlusion_probs_copy_ = tmp_pointer; } - } - - free(states_raw); -} - - - - -void CudaFilter::compare(float observation_time, bool constant_occlusion, vector &log_likelihoods) { - -#ifdef PROFILING_ACTIVE - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - float milliseconds; -#endif - - dim3 gridDim = dim3(nr_poses_per_row_, nr_poses_per_column_); - - // update observation time - float delta_time = observation_time - occlusion_time_; - occlusion_time_ = observation_time; - - - -#ifdef PROFILING_ACTIVE - cudaEventRecord(start); -#endif - compare_kernel <<< gridDim, 128 >>> (d_observations_, d_visibility_probs_, n_cols_ * n_rows_, - constant_occlusion, d_log_likelihoods_, delta_time, nr_poses_, n_rows_, n_cols_); - #ifdef CHECK_ERRORS - check_cuda_error("compare kernel call"); - #endif - cudaDeviceSynchronize(); - #ifdef CHECK_ERRORS - check_cuda_error("cudaDeviceSynchronize compare"); - #endif + cudaMemcpy(&log_likelihoods[0], d_log_likelihoods_, nr_poses_ * sizeof(float), cudaMemcpyDeviceToHost); + #ifdef DEBUG + check_cuda_error("cudaMemcpy d_log_likelihoods -> log_likelihoods"); + #endif -#ifdef PROFILING_ACTIVE - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - count_++; - compare_kernel_time_ += milliseconds; - if (count_ == COUNT) { - cout << "compare kernel: " << compare_kernel_time_ * 1e3 / count_ << " us" << endl; + cudaDeviceSynchronize(); + #ifdef DEBUG + check_cuda_error("cudaDeviceSynchronize compare"); + #endif + } else { + std::cout << "WARNING (CUDA): It seems you forgot to do one of the following: set observation image, set occlusion" + << " indices, set occlusion probabilities, set number of poses, allocate memory or inisitialize constants." << std::endl; } - cudaEventRecord(start); -#endif - - cudaMemcpy(&log_likelihoods[0], d_log_likelihoods_, nr_poses_ * sizeof(float), cudaMemcpyDeviceToHost); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpy d_log_likelihoods -> log_likelihoods"); - #endif - cudaDeviceSynchronize(); - #ifdef CHECK_ERRORS - check_cuda_error("cudaDeviceSynchronize compare"); - #endif - -#ifdef PROFILING_ACTIVE - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - copy_likelihoods_time_ += milliseconds; - if (count_ == COUNT) { - cout << "copy likelihoods: " << copy_likelihoods_time_ * 1e3 / count_ << " us" << endl; - } -#endif } -void CudaFilter::compare_multiple(bool update, vector &log_likelihoods) { - -#ifdef PROFILING_ACTIVE - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - float milliseconds; - cudaEventRecord(start); -#endif - - double delta_time = observation_time_ - occlusion_time_; - if(update) occlusion_time_ = observation_time_; -// cout << "delta time: " << delta_time << endl; - - - compare_multiple_kernel <<< grid_dimension_, nr_threads_ >>> (d_observations_, d_visibility_probs_, d_visibility_probs_copy_, d_prev_sample_indices_, n_cols_ * n_rows_, - d_log_likelihoods_, delta_time, nr_poses_, n_rows_, n_cols_, update); - #ifdef CHECK_ERRORS - check_cuda_error("compare kernel call"); - #endif - - cudaDeviceSynchronize(); - #ifdef CHECK_ERRORS - check_cuda_error("cudaDeviceSynchronize compare_multiple"); - #endif - - // switch to new / copied visibility probabilities - if (update) { - float *tmp_pointer; - tmp_pointer = d_visibility_probs_; - d_visibility_probs_ = d_visibility_probs_copy_; - d_visibility_probs_copy_ = tmp_pointer; - } - -#ifdef PROFILING_ACTIVE - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - count_++; - compare_kernel_time_ += milliseconds; - if (count_ == COUNT) { - cout << "compare kernel: " << compare_kernel_time_ * 1e3 / count_ << " us" << endl; - } - cudaEventRecord(start); -#endif - cudaMemcpy(&log_likelihoods[0], d_log_likelihoods_, nr_poses_ * sizeof(float), cudaMemcpyDeviceToHost); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpy d_log_likelihoods -> log_likelihoods"); - #endif - cudaDeviceSynchronize(); - #ifdef CHECK_ERRORS - check_cuda_error("cudaDeviceSynchronize compare"); - #endif +// ===================================================================================== // +// =============================== CUDA FILTER SETTERS ================================= // +// ===================================================================================== // -#ifdef PROFILING_ACTIVE - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - copy_likelihoods_time_ += milliseconds; - if (count_ == COUNT) { - cout << "copy likelihoods: " << copy_likelihoods_time_ * 1e3 / count_ << " us" << endl; - } -#endif +void CudaFilter::set_nr_threads(const int nr_threads) { + nr_threads_ = min(nr_threads, cuda_device_properties_.maxThreadsDim[0]); } +void CudaFilter::set_observations(const float* observations, const float observation_time) { + observation_time_ = observation_time; - - -void CudaFilter::resample(vector resampling_indices) { - -// cout << "resample <<< " << n_poses_ << ", " << 128 << " >>>" << endl; - - cudaMemcpy(d_resampling_indices_, &resampling_indices[0], sizeof(int) * nr_poses_, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpy resampling_indices -> d_resampling_indices_"); - #endif - -// int min = 100; -// int max = -1; -// for (int i = 0; i < resampling_indices.size(); i++) { -// int value = resampling_indices[i]; -// if (value > max) max = value; -// if (value < min) min = value; -// } -// cout << "resample min: " << min << ", max: " << max << endl; - - - int nr_pixels = n_rows_ * n_cols_; - - resample_kernel <<< nr_poses_, 128 >>> (d_visibility_probs_, d_visibility_probs_copy_, - d_states_, d_states_copy_, - d_resampling_indices_, nr_pixels, n_features_); - #ifdef CHECK_ERRORS - check_cuda_error("resample kernel call"); + cudaMemcpy(d_observations_, observations, nr_cols_ * nr_rows_ * sizeof(float), cudaMemcpyHostToDevice); + #ifdef DEBUG + check_cuda_error("cudaMemcpy observations -> d_observations_"); #endif - cudaDeviceSynchronize(); - #ifdef CHECK_ERRORS - check_cuda_error("cudaDeviceSynchronize resample"); + #ifdef DEBUG + check_cuda_error("cudaDeviceSynchronize set_observations"); #endif - - // switch the visibility probs pointers, so that the next Compare() call will access the resampled - // visibility probs. Same for the states. - float *tmp_pointer; - tmp_pointer = d_visibility_probs_; - d_visibility_probs_ = d_visibility_probs_copy_; - d_visibility_probs_copy_ = tmp_pointer; - tmp_pointer = d_states_; - d_states_ = d_states_copy_; - d_states_copy_ = tmp_pointer; - + observations_set_ = true; } +void CudaFilter::set_occlusion_indices(const int* occlusion_indices) { -void CudaFilter::resample_multiple(vector resampling_indices) { + cudaMemcpy(d_occlusion_indices_, occlusion_indices, nr_poses_ * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_resampling_indices_, &resampling_indices[0], sizeof(int) * nr_poses_, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpy resampling_indices -> d_resampling_indices_"); + #ifdef DEBUG + check_cuda_error("cudaMemcpy occlusion_indices -> d_occlusion_indices"); #endif - - int nr_pixels = n_rows_ * n_cols_; - - resample_multiple_kernel <<< nr_poses_, 128 >>> (d_visibility_probs_, d_visibility_probs_copy_, - d_resampling_indices_, nr_pixels); - #ifdef CHECK_ERRORS - check_cuda_error("resample kernel call"); - #endif - cudaDeviceSynchronize(); - #ifdef CHECK_ERRORS - check_cuda_error("cudaDeviceSynchronize resample"); + #ifdef DEBUG + check_cuda_error("cudaDeviceSynchronize set_occlusion_indices"); #endif - - // switch the visibility probs pointers, so that the next Compare() call will access the resampled - // visibility probs. - float *tmp_pointer; - tmp_pointer = d_visibility_probs_; - d_visibility_probs_ = d_visibility_probs_copy_; - d_visibility_probs_copy_ = tmp_pointer; + occlusion_indices_set_ = true; } +void CudaFilter::set_resolution(const int n_rows, const int n_cols, int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column, bool adapt_to_constraints) { + nr_rows_ = n_rows; + nr_cols_ = n_cols; - -// ===================================================================================== // -// =============================== CUDA FILTER SETTERS ================================= // -// ===================================================================================== // - -void CudaFilter::set_states(std::vector > &states, int seed) -{ - if (n_poses_set_) { - // copy states into linear array - /* TODO maybe padding can speed up the memory accesses later from the kernel, since - * right now, each MP needs 7 values out of d_states_. 8 would be a much better number. */ - n_features_ = states[0].size(); - - int states_size = nr_poses_ * n_features_ * sizeof(float); - float *states_raw = (float*) malloc(states_size); - - for (size_t i = 0; i < nr_poses_; i++) { - for (size_t j = 0; j < n_features_; j++) { - states_raw[i * n_features_ + j] = states[i][j]; - } - } - - allocate(d_states_, states_size, "d_states"); - allocate(d_states_copy_, states_size, "d_states_copy"); // placeholder for resampling purposes - - cudaMemcpy(d_states_, states_raw, states_size, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpy states_raw -> d_states_"); - #endif - - free(states_raw); - - // setup random number generators for each thread to be used in the propagate kernel - allocate(d_mrg_states_, nr_poses_ * sizeof(curandStateMRG32k3a), "d_mrg_states"); - - setup_number_generators_kernel <<< n_blocks_, nr_threads_ >>> (seed, d_mrg_states_, nr_poses_); - - cudaDeviceSynchronize(); - } else { - cout << "WARNING: set_states() was not executed, because n_poses_ has not been set previously"; - exit(-1); - } -} - - - - - - -void CudaFilter::set_states_multiple(int n_objects, int n_features, int seed) -{ - if (n_poses_set_) { - n_features_ = n_features; - - int states_size = nr_poses_ * n_objects * n_features_ * sizeof(float); - allocate(d_states_, states_size, "d_states"); - - - // setup random number generators for each thread to be used in the propagate kernel - allocate(d_mrg_states_, nr_poses_ * sizeof(curandStateMRG32k3a), "d_mrg_states"); - - setup_number_generators_kernel <<< n_blocks_, nr_threads_ >>> (seed, d_mrg_states_, nr_poses_); - - cudaDeviceSynchronize(); - } else { - cout << "WARNING: set_states_multiple() was not executed, because n_poses_ has not been set previously"; - exit(-1); - } + // reallocate buffers + allocate(d_observations_, nr_cols_ * nr_rows_ * sizeof(float)); + allocate_memory_for_max_poses(nr_poses, nr_poses_per_row, nr_poses_per_column, adapt_to_constraints); } +void CudaFilter::set_occlusion_probabilities(const float* occlusion_probabilities) { + cudaMemcpy(d_occlusion_probs_, occlusion_probabilities, nr_rows_ * nr_cols_ * nr_poses_ * sizeof(float), cudaMemcpyHostToDevice); - - - -void CudaFilter::set_observations(const float* observations, const float observation_time) { - -// delta_time_ = observation_time - last_observation_time_; - observation_time_ = observation_time; -// cout << "delta time: " << delta_time_ << ", last_observation_time: " << occlusion_time_ << endl; - set_observations(observations); -} - -void CudaFilter::set_observations(const float* observations) { - cudaMemcpy(d_observations_, observations, n_cols_ * n_rows_ * sizeof(float), cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpy observations -> d_observations_"); + #ifdef DEBUG + check_cuda_error("cudaMemcpy occlusion_probabilities -> d_occlusion_probs_"); #endif cudaDeviceSynchronize(); -} - - -void CudaFilter::set_prev_sample_indices(const int* prev_sample_indices) { - cudaMemcpy(d_prev_sample_indices_, prev_sample_indices, nr_poses_ * sizeof(int), cudaMemcpyHostToDevice); -// cout << "when setting prev_sample_indices: n_poses: " << n_poses_ << ", max poses: " << n_max_poses_ << endl; - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpy prev_sample_indices -> d_prev_sample_indices_"); + #ifdef DEBUG + check_cuda_error("cudaDeviceSynchronize set_occlusion_probabilities"); #endif - cudaDeviceSynchronize(); + + occlusion_probabilities_set_ = true; } -void CudaFilter::set_resolution(const int n_rows, const int n_cols, int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column) { - n_rows_ = n_rows; - n_cols_ = n_cols; - - // reallocate buffers - allocate(d_observations_, n_cols_ * n_rows_ * sizeof(float), "d_observations"); - allocate_memory_for_max_poses(nr_poses, nr_poses_per_row, nr_poses_per_column); -} +void CudaFilter::map_texture_to_texture_array(const cudaArray_t texture_array) { + d_texture_array_ = texture_array; + cudaBindTextureToArray(texture_reference, d_texture_array_); -void CudaFilter::set_visibility_probabilities(const float* visibility_probabilities) { - cudaMemcpy(d_visibility_probs_, visibility_probabilities, n_rows_ * n_cols_ * nr_poses_ * sizeof(float), cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpy visibility_probabilities -> d_visibility_probs_"); + #ifdef DEBUG + check_cuda_error("cudaBindTextureToArray"); #endif } - - void CudaFilter::allocate_memory_for_max_poses(int& allocated_poses, int& allocated_poses_per_row, - int& allocated_poses_per_column) { + int& allocated_poses_per_column, + bool adapt_to_constraints) { // check limitation by global memory size_t size_of_log_likelihoods = sizeof(float) * allocated_poses; size_t size_of_resampling_indices = sizeof(int) * allocated_poses; - size_t size_of_prev_sample_indices = sizeof(int) * allocated_poses; - size_t size_of_visibility_probs = n_rows_ * n_cols_ * allocated_poses * sizeof(float); - size_t size_of_opengl_textures = size_of_visibility_probs * 2; - size_t size_of_observations = n_cols_ * n_rows_ * sizeof(float); + size_t size_of_occlusion_indices = sizeof(int) * allocated_poses; + size_t size_of_occlusion_probs = nr_rows_ * nr_cols_ * allocated_poses * sizeof(float); + size_t size_of_opengl_textures = size_of_occlusion_probs * 2; + size_t size_of_observations = nr_cols_ * nr_rows_ * sizeof(float); - size_t total_size = size_of_log_likelihoods + size_of_resampling_indices + size_of_prev_sample_indices - + size_of_visibility_probs * 2 + size_of_opengl_textures + size_of_observations; + size_t total_size = size_of_log_likelihoods + size_of_resampling_indices + size_of_occlusion_indices + + size_of_occlusion_probs * 2 + size_of_opengl_textures + size_of_observations; if (total_size > cuda_device_properties_.totalGlobalMem) { - - std::cout << "The space (" << total_size << " B) for the number of maximum poses you requested (" << allocated_poses << ") cannot be allocated. " - << "The limit is global memory size (" << cuda_device_properties_.totalGlobalMem - << " B) retrieved from CUDA properties." << std::endl; - - size_t size_depending_on_nr_poses = (sizeof(float) + sizeof(int) * 2 + n_rows_ * n_cols_ * sizeof(float) * 4); - allocated_poses = min(allocated_poses, (int) floor((cuda_device_properties_.totalGlobalMem - size_of_observations) / size_depending_on_nr_poses)); - allocated_poses_per_column = ceil(allocated_poses / allocated_poses_per_row); - - std::cout << "Instead, space (" << allocated_poses * size_depending_on_nr_poses + size_of_observations << " B) for " << allocated_poses << " poses was allocated. " << std::endl; + if (adapt_to_constraints) { + std::cout << "WARNING (CUDA): The space (" << total_size << " B) for the number of maximum poses you requested (" << allocated_poses << ") cannot be allocated. " + << "The limit is global memory size (" << cuda_device_properties_.totalGlobalMem + << " B) retrieved from CUDA properties." << std::endl; + + size_t size_depending_on_nr_poses = (sizeof(float) + sizeof(int) * 2 + nr_rows_ * nr_cols_ * sizeof(float) * 4); + allocated_poses = min(allocated_poses, (int) floor((cuda_device_properties_.totalGlobalMem - size_of_observations) / size_depending_on_nr_poses)); + allocated_poses_per_column = ceil(allocated_poses / allocated_poses_per_row); + + std::cout << "Instead, space (" << allocated_poses * size_depending_on_nr_poses + size_of_observations << " B) for " << allocated_poses << " poses was allocated. " << std::endl; + } else { + std::cout << "ERROR (CUDA): The space (" << total_size << " B) for the number of maximum poses you requested (" << allocated_poses << ") cannot be allocated. " + << "The limit is global memory size (" << cuda_device_properties_.totalGlobalMem + << " B) retrieved from CUDA properties." << std::endl; + exit(-1); + } } // check limitation by texture size - if (cuda_device_properties_.maxTexture2D[0] <= allocated_poses_per_row * n_cols_) { + if (cuda_device_properties_.maxTexture2D[0] <= allocated_poses_per_row * nr_cols_) { + if (adapt_to_constraints) { - std::cout << "The max poses you requested (" << allocated_poses << ") could not be allocated." << std::endl; + std::cout << "WARNING (CUDA): The max poses you requested (" << allocated_poses << ") could not be allocated." << std::endl; - allocated_poses_per_row = cuda_device_properties_.maxTexture2D[0] / n_cols_; - allocated_poses_per_column = ceil(allocated_poses / allocated_poses_per_row); + allocated_poses_per_row = cuda_device_properties_.maxTexture2D[0] / nr_cols_; + allocated_poses_per_column = ceil(allocated_poses / allocated_poses_per_row); - if (cuda_device_properties_.maxTexture2D[1] <= allocated_poses_per_column * n_rows_) { - allocated_poses_per_column = cuda_device_properties_.maxTexture2D[1] / n_rows_; - } + if (cuda_device_properties_.maxTexture2D[1] <= allocated_poses_per_column * nr_rows_) { + allocated_poses_per_column = cuda_device_properties_.maxTexture2D[1] / nr_rows_; + } + + allocated_poses = min(allocated_poses, allocated_poses_per_row * allocated_poses_per_column); + + std::cout << "The limit is max texture size (" << cuda_device_properties_.maxTexture2D[0] + << ", " << cuda_device_properties_.maxTexture2D[1] << ") retrieved from CUDA properties. " + << "Number of poses was reduced to (" << allocated_poses_per_row << ", " + << allocated_poses_per_column << "), a total of " << allocated_poses << std::endl; - allocated_poses = min(allocated_poses, allocated_poses_per_row * allocated_poses_per_column); - std::cout << "The limit is max texture size (" << cuda_device_properties_.maxTexture2D[0] - << ", " << cuda_device_properties_.maxTexture2D[1] << ") retrieved from CUDA properties. " - << "Number of poses was reduced to (" << allocated_poses_per_row << ", " - << allocated_poses_per_column << "), a total of " << allocated_poses << std::endl; + } else { + std::cout << "ERROR (CUDA): The max poses you requested (" << allocated_poses << ") could not be allocated." + << "The limit is max texture size (" << cuda_device_properties_.maxTexture2D[0] + << ", " << cuda_device_properties_.maxTexture2D[1] << ") retrieved from CUDA properties. " << std::endl; + exit(-1); + } } nr_max_poses_ = allocated_poses; @@ -1332,23 +573,8 @@ void CudaFilter::allocate_memory_for_max_poses(int& allocated_poses, nr_max_poses_per_column_ = allocated_poses_per_column; -/* - nr_max_poses_ = n_poses; - nr_max_poses_per_row_ = n_poses_x; - - // determine n_max_poses_y_ - nr_max_poses_per_column_ = nr_max_poses_ / nr_max_poses_per_row_; - if (n_poses % nr_max_poses_per_row_ != 0) nr_max_poses_per_column_++; - - n_poses_ = nr_max_poses_; - n_poses_x_ = nr_max_poses_per_row_; - n_poses_y_ = nr_max_poses_per_column_; - -*/ - n_poses_set_ = true; - bool nr_poses_changed = false; - set_default_kernel_config(nr_max_poses_, nr_max_poses_per_row_, nr_max_poses_per_column_, nr_poses_changed); + set_default_kernel_config(nr_max_poses_, nr_max_poses_per_row_, nr_max_poses_per_column_, nr_poses_changed, adapt_to_constraints); allocated_poses = nr_max_poses_; allocated_poses_per_row = nr_max_poses_per_row_; @@ -1361,87 +587,78 @@ void CudaFilter::allocate_memory_for_max_poses(int& allocated_poses, if (nr_poses_changed) { size_of_log_likelihoods = sizeof(float) * nr_max_poses_; size_of_resampling_indices = sizeof(int) * nr_max_poses_; - size_of_prev_sample_indices = sizeof(int) * nr_max_poses_; - size_of_visibility_probs = n_rows_ * n_cols_ * nr_max_poses_ * sizeof(float); + size_of_occlusion_indices = sizeof(int) * nr_max_poses_; + size_of_occlusion_probs = nr_rows_ * nr_cols_ * nr_max_poses_ * sizeof(float); } // reallocate arrays - allocate(d_log_likelihoods_, size_of_log_likelihoods, "d_log_likelihoods"); - allocate(d_resampling_indices_, size_of_resampling_indices, "d_resampling_indices"); - allocate(d_prev_sample_indices_, size_of_prev_sample_indices, "d_prev_sample_indices"); - allocate(d_visibility_probs_, size_of_visibility_probs, "d_visibility_probs"); - allocate(d_visibility_probs_copy_, size_of_visibility_probs, "d_visibility_probs_copy"); - - // TODO maybe delete after set_visibility_probabilities is properly in use - vector initial_visibility_probs (n_rows_ * n_cols_ * nr_max_poses_, visibility_prob_default_); - cudaMemcpy(d_visibility_probs_, &initial_visibility_probs[0], size_of_visibility_probs, cudaMemcpyHostToDevice); - #ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpy visibility_prob_default_ -> d_visibility_probs_"); - #endif + allocate(d_log_likelihoods_, size_of_log_likelihoods); + allocate(d_occlusion_indices_, size_of_occlusion_indices); + allocate(d_occlusion_probs_, size_of_occlusion_probs); + allocate(d_occlusion_probs_copy_, size_of_occlusion_probs); + vector initial_occlusion_probs (nr_rows_ * nr_cols_ * nr_max_poses_, occlusion_prob_default_); + + cudaMemcpy(d_occlusion_probs_, &initial_occlusion_probs[0], size_of_occlusion_probs, cudaMemcpyHostToDevice); + #ifdef DEBUG + check_cuda_error("cudaMemcpy occlusion_prob_default_ -> d_occlusion_probs_"); + #endif cudaDeviceSynchronize(); - #ifdef CHECK_ERRORS + #ifdef DEBUG check_cuda_error("cudaDeviceSynchronize allocate_memory_for_max_poses"); #endif - + memory_allocated_ = true; } -void CudaFilter::set_number_of_poses(int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column) { - if (nr_poses <= nr_max_poses_) { - - if (nr_max_poses_per_row_ < nr_poses_per_row) { - nr_poses_per_row = nr_max_poses_per_row_; - nr_poses_per_column = ceil(nr_poses / nr_poses_per_row); - if (nr_max_poses_per_column_ < nr_poses_per_column) { - nr_poses_per_column = nr_max_poses_per_column_; - } +void CudaFilter::set_number_of_poses(int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column, bool adapt_to_constraints) { + if (nr_poses > nr_max_poses_) { + if (adapt_to_constraints) { + std::cout << "WARNING (CUDA): You tried to evaluate more poses (" << nr_poses << ") than specified by max_poses (" << nr_max_poses_ << ")." + << "The number of poses was automatically reduced to " << nr_max_poses_ << "." << std::endl; + nr_poses = nr_max_poses_; + } else { - std::cout << "Number of poses was reduced to (" << nr_poses_per_row << ", " - << nr_poses_per_column << ") because of the maximum number of poses set in the beginning." << std::endl; + cout << "ERROR (CUDA): You tried to evaluate more poses (" << nr_poses << ") than specified by max_poses (" << nr_max_poses_ << ")" << endl; + exit(-1); } + } - nr_poses = min(nr_poses, nr_poses_per_row * nr_poses_per_column); - - nr_poses_ = nr_poses; - nr_poses_per_row_ = nr_poses_per_row; - nr_poses_per_column_ = nr_poses_per_column; - - /* + if (nr_max_poses_per_row_ < nr_poses_per_row) { + nr_poses_per_row = nr_max_poses_per_row_; + nr_poses_per_column = ceil(nr_poses / nr_poses_per_row); + if (nr_max_poses_per_column_ < nr_poses_per_column) { + nr_poses_per_column = nr_max_poses_per_column_; + } + std::cout << "WARNING (CUDA): Number of poses was reduced to (" << nr_poses_per_row << ", " + << nr_poses_per_column << ") because of the maximum number of poses set in the beginning." << std::endl; + } - nr_poses_ = nr_poses; - nr_poses_per_row_ = nr_poses_per_row; + nr_poses = min(nr_poses, nr_poses_per_row * nr_poses_per_column); - // determine n_max_poses_y_ - nr_poses_per_column_ = nr_poses_ / nr_poses_per_row_; - if (nr_poses % nr_poses_per_row_ != 0) nr_poses_per_column_++; + nr_poses_ = nr_poses; + nr_poses_per_row_ = nr_poses_per_row; + nr_poses_per_column_ = nr_poses_per_column; - if (nr_poses_per_row_ > nr_max_poses_per_row_ || nr_poses_per_column_ > nr_max_poses_per_column_) { - cout << "WARNING: You tried to evaluate more poses in a row or in a column than was allocated in the beginning." - << endl << "Check set_number_of_poses() functions in object_rasterizer.cpp" << endl; - }*/ - bool nr_poses_changed = false; - set_default_kernel_config(nr_poses_, nr_poses_per_row_, nr_poses_per_column_, nr_poses_changed); + bool nr_poses_changed = false; + set_default_kernel_config(nr_poses_, nr_poses_per_row_, nr_poses_per_column_, nr_poses_changed, adapt_to_constraints); - nr_poses = nr_poses_; - nr_poses_per_row = nr_poses_per_row_; - nr_poses_per_column = nr_poses_per_column_; + nr_poses = nr_poses_; + nr_poses_per_row = nr_poses_per_row_; + nr_poses_per_column = nr_poses_per_column_; - } else { - cout << "ERROR (Cuda): You tried to evaluate more poses (" << nr_poses << ") than specified by max_poses (" << nr_max_poses_ << ")" << endl; - exit(-1); - } + number_of_poses_set_ = true; } void CudaFilter::set_default_kernel_config(int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column, - bool& nr_poses_changed) { + bool& nr_poses_changed, bool adapt_to_constraints) { nr_threads_ = min(DEFAULT_NR_THREADS, cuda_device_properties_.maxThreadsDim[0]); // check for grid dimension limitations @@ -1456,34 +673,25 @@ void CudaFilter::set_default_kernel_config(int& nr_poses, int& nr_poses_per_row, nr_poses_changed = true; - std::cout << "Number of poses was reduced to (" << nr_poses_per_row << ", " - << nr_poses_per_column << ") because of the maximum grid size (" - << cuda_device_properties_.maxGridSize[0] << ", " << cuda_device_properties_.maxGridSize[1] - << ") retrieved from CUDA properties." << std::endl; + if (adapt_to_constraints) { + std::cout << "WARNING (CUDA): Number of poses was reduced to (" << nr_poses_per_row << ", " + << nr_poses_per_column << ") because of the maximum grid size (" + << cuda_device_properties_.maxGridSize[0] << ", " << cuda_device_properties_.maxGridSize[1] + << ") retrieved from CUDA properties." << std::endl; + } else { + std::cout << "ERROR (CUDA): Number of poses exceeded maximum grid size specified by GPU: " + << cuda_device_properties_.maxGridSize[0] << ", " << cuda_device_properties_.maxGridSize[1] << "." << std::endl; + exit(-1); + } } grid_dimension_ = dim3(nr_poses_per_row, nr_poses_per_column); - /* - - // determine n_threads_ and n_blocks_ - // n_threads_ should lie between 32 (warp_size) and 128 and all microprocessors should be busy - nr_threads_ = ((nr_poses_ / n_mps_) / warp_size_) * warp_size_; - if (nr_threads_ == 0) nr_threads_ = warp_size_; - if (nr_threads_ > 4 * warp_size_) nr_threads_ = 4 * warp_size_; - - n_blocks_ = nr_poses_ / nr_threads_; - if (n_blocks_ % nr_poses_ != 0) n_blocks_++; - - */ } -void CudaFilter::set_texture_array(cudaArray_t texture_array) { - d_texture_array_ = texture_array; -} // ===================================================================================== // @@ -1491,41 +699,29 @@ void CudaFilter::set_texture_array(cudaArray_t texture_array) { // ===================================================================================== // +int CudaFilter::get_max_nr_threads() { + return cuda_device_properties_.maxThreadsDim[0]; +} - -vector CudaFilter::get_visibility_probabilities(int state_id) { -// cout << "n_rows_: " << n_rows_ << ", n_cols_: " << n_cols_ << endl; - float* visibility_probabilities = (float*) malloc(n_rows_ * n_cols_ * sizeof(float)); - int offset = state_id * n_rows_ * n_cols_; - cudaMemcpy(visibility_probabilities, d_visibility_probs_ + offset, n_rows_ * n_cols_ * sizeof(float), cudaMemcpyDeviceToHost); -#ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpy d_visibility_probabilities -> visibility_probabilities"); -#endif - vector visibility_probabilities_vector; - for (int i = 0; i < n_rows_ * n_cols_; i++) { - visibility_probabilities_vector.push_back(visibility_probabilities[i]); - } - free(visibility_probabilities); - return visibility_probabilities_vector; +int CudaFilter::get_warp_size() { + return cuda_device_properties_.warpSize; } +vector CudaFilter::get_occlusion_probabilities(int state_id) { + float* occlusion_probabilities = (float*) malloc(nr_rows_ * nr_cols_ * sizeof(float)); + int offset = state_id * nr_rows_ * nr_cols_; + cudaMemcpy(occlusion_probabilities, d_occlusion_probs_ + offset, nr_rows_ * nr_cols_ * sizeof(float), cudaMemcpyDeviceToHost); + #ifdef DEBUG + check_cuda_error("cudaMemcpy d_occlusion_probabilities -> occlusion_probabilities"); + #endif -vector > CudaFilter::get_visibility_probabilities() { - float* visibility_probabilities = (float*) malloc(nr_poses_ * n_rows_ * n_cols_ * sizeof(float)); - cudaMemcpy(visibility_probabilities, d_visibility_probs_, nr_poses_ * n_rows_ * n_cols_ * sizeof(float), cudaMemcpyDeviceToHost); -#ifdef CHECK_ERRORS - check_cuda_error("cudaMemcpy d_visibility_probabilities -> visibility_probabilities"); -#endif - vector > visibility_probabilities_vector; - vector tmp_vector (n_rows_ * n_cols_); - for (int i = 0; i < nr_poses_; i++) { - for (int j = 0; j < n_rows_ * n_cols_; j++) { - tmp_vector[j] = visibility_probabilities[i * n_rows_ * n_cols_ + j]; - } - visibility_probabilities_vector.push_back(tmp_vector); + vector occlusion_probabilities_vector; + for (int i = 0; i < nr_rows_ * nr_cols_; i++) { + occlusion_probabilities_vector.push_back(occlusion_probabilities[i]); } - return visibility_probabilities_vector; + free(occlusion_probabilities); + return occlusion_probabilities_vector; } @@ -1537,31 +733,16 @@ vector > CudaFilter::get_visibility_probabilities() { -template void CudaFilter::allocate(T * &pointer, size_t size, string name) { -#ifdef CHECK_ERRORS - size_t free_space_before, free_space_after, total_space; - cuMemGetInfo(&free_space_before, &total_space); -#endif +template void CudaFilter::allocate(T * &pointer, size_t size) { cudaFree(pointer); cudaMalloc((void **) &pointer, size); -#ifdef CHECK_ERRORS - cuMemGetInfo(&free_space_after, &total_space); - std::cout << "memory to allocate for " << name << ": " << size / 1e6 << " MB; free space: " << free_space_before / 1e6 - << "MB; --> allocated " << (free_space_before - free_space_after) / 1e6 << " MB, free space left: " << free_space_after / 1e6 << " MB" << std::endl; +#ifdef DEBUG check_cuda_error("cudaMalloc failed"); #endif } -void CudaFilter::map_texture() { - cudaBindTextureToArray(texture_reference, d_texture_array_); - check_cuda_error("cudaBindTextureToArray"); -} - - - - void CudaFilter::check_cuda_error(const char *msg) { cudaError_t err = cudaGetLastError(); @@ -1572,29 +753,19 @@ void CudaFilter::check_cuda_error(const char *msg) } } -CudaFilter::~CudaFilter() { - cudaFree(d_states_); - cudaFree(d_states_copy_); - cudaFree(d_visibility_probs_); - cudaFree(d_visibility_probs_copy_); - cudaFree(d_observations_); - cudaFree(d_log_likelihoods_); - cudaFree(d_mrg_states_); - cudaFree(d_resampling_indices_); +// ===================================================================================== // +// ============================ CUDA FILTER DESTRUCTOR ================================ // +// ===================================================================================== // + -} -void CudaFilter::destroy_context() { - cudaFree(d_states_); - cudaFree(d_states_copy_); - cudaFree(d_visibility_probs_); - cudaFree(d_visibility_probs_copy_); + +CudaFilter::~CudaFilter() { + cudaFree(d_occlusion_probs_); + cudaFree(d_occlusion_probs_copy_); cudaFree(d_observations_); cudaFree(d_log_likelihoods_); - cudaFree(d_mrg_states_); - cudaFree(d_resampling_indices_); - cudaDeviceReset(); + cudaFree(d_occlusion_indices_); } } - diff --git a/src/dbot/model/observation/gpu/object_rasterizer.cpp b/src/dbot/model/observation/gpu/object_rasterizer.cpp index 3fb7c05..3eca454 100644 --- a/src/dbot/model/observation/gpu/object_rasterizer.cpp +++ b/src/dbot/model/observation/gpu/object_rasterizer.cpp @@ -1,45 +1,42 @@ -/** @author Claudia Pfreundt */ +/// @author Claudia Pfreundt /* Note: Profiling slows down the rendering process. Use only to display the runtimes * of the different subroutines inside the render call. */ //#define PROFILING_ACTIVE +//#define DEBUG - -#include #include -#include -#include -#include -#include - #include #include - #include - -#include +#include +#include using namespace std; using namespace Eigen; -ObjectRasterizer::ObjectRasterizer() -{ -} ObjectRasterizer::ObjectRasterizer(const std::vector > vertices, const std::vector > > indices, const std::string vertex_shader_path, const std::string fragment_shader_path, - const Eigen::Matrix3f camera_matrix) : - nr_rows_(WINDOW_HEIGHT), - nr_cols_(WINDOW_WIDTH), + const Eigen::Matrix3f camera_matrix, + const float near_plane, + const float far_plane, + const int nr_rows, + const int nr_cols) : + near_plane_(near_plane), + far_plane_(far_plane), + nr_rows_(nr_rows), + nr_cols_(nr_cols), vertex_shader_path_(vertex_shader_path), fragment_shader_path_(fragment_shader_path) { // ========== CREATE WINDOWLESS OPENGL CONTEXT =========== // + typedef GLXContext (*glXCreateContextAttribsARBProc)(Display*, GLXFBConfig, GLXContext, Bool, const int*); typedef Bool (*glXMakeContextCurrentARBProc)(Display*, GLXDrawable, GLXDrawable, GLXContext); static glXCreateContextAttribsARBProc glXCreateContextAttribsARB = 0; @@ -68,7 +65,6 @@ ObjectRasterizer::ObjectRasterizer(const std::vector vertices_per_object; + for (size_t i = 0; i < vertices.size(); i++) { // each i equals one object object_numbers_.push_back(i); - // vertices_per_object_.push_back(vertices[i].size() * 3); ?? * 3 equals floats per object, how does gldrawelements index? - vertices_per_object_.push_back(vertices[i].size()); + vertices_per_object.push_back(vertices[i].size()); for (size_t j = 0; j < vertices[i].size(); j++) { // each j equals one vertex in that object for (int k = 0; k < vertices[i][j].size(); k++) { // each k equals one dimension of that vertex vertices_list_.push_back(vertices[i][j][k]); @@ -184,7 +180,7 @@ ObjectRasterizer::ObjectRasterizer(const std::vector (NR_SUBROUTINES_TO_MEASURE, 0); + time_measurement_ = vector (NR_SUBROUTINES_TO_MEASURE, 0); initial_run_ = true; strings_for_subroutines.push_back("ATTACH_TEXTURE"); @@ -283,31 +286,21 @@ ObjectRasterizer::ObjectRasterizer(const std::vector > > states, - std::vector > &intersect_indices, - std::vector > &depth) { +void ObjectRasterizer::render(const std::vector > states, + std::vector > depth_values) { render(states); - get_depth_values(intersect_indices, depth); + depth_values = get_depth_values(); } +void ObjectRasterizer::render(const std::vector > states) { -void ObjectRasterizer::render(const std::vector > > states) { #ifdef PROFILING_ACTIVE glBeginQuery(GL_TIME_ELAPSED, time_query_[ATTACH_TEXTURE]); @@ -319,7 +312,9 @@ void ObjectRasterizer::render(const std::vector > GL_TEXTURE_2D, // 3. tex target: GL_TEXTURE_2D framebuffer_texture_for_all_poses_, // 4. tex ID 0); - +#ifdef DEBUG + check_GL_errors("attaching texture to framebuffer"); +#endif #ifdef PROFILING_ACTIVE glFinish(); glEndQuery(GL_TIME_ELAPSED); @@ -328,6 +323,9 @@ void ObjectRasterizer::render(const std::vector > glClear(GL_DEPTH_BUFFER_BIT | GL_COLOR_BUFFER_BIT); +#ifdef DEBUG + check_GL_errors("clearing framebuffer"); +#endif #ifdef PROFILING_ACTIVE glFinish(); glEndQuery(GL_TIME_ELAPSED); @@ -342,14 +340,19 @@ void ObjectRasterizer::render(const std::vector > for (int j = 0; j < nr_poses_per_row_; j++) { glViewport(j * nr_cols_, (nr_poses_per_column_ - 1 - i) * nr_rows_, nr_cols_, nr_rows_); - + #ifdef DEBUG + check_GL_errors("setting the viewport"); + #endif for (size_t k = 0; k < object_numbers_.size(); k++) { int index = object_numbers_[k]; - model_view_matrix = view_matrix_ * get_model_matrix(states[nr_poses_per_row_ * i + j][index]); + model_view_matrix = view_matrix_ * states[nr_poses_per_row_ * i + j][index]; glUniformMatrix4fv(model_view_matrix_ID_, 1, GL_FALSE, model_view_matrix.data()); glDrawElements(GL_TRIANGLES, indices_per_object_[index], GL_UNSIGNED_INT, (void*) (start_position_[index] * sizeof(uint))); + #ifdef DEBUG + check_GL_errors("render call"); + #endif } } } @@ -358,19 +361,24 @@ void ObjectRasterizer::render(const std::vector > for (int j = 0; j < nr_poses_ - (nr_poses_per_row_ * (nr_poses_per_column_ - 1)); j++) { glViewport(j * nr_cols_, 0, nr_cols_, nr_rows_); + #ifdef DEBUG + check_GL_errors("setting the viewport"); + #endif for (size_t k = 0; k < object_numbers_.size(); k++) { int index = object_numbers_[k]; - model_view_matrix = view_matrix_ * get_model_matrix(states[nr_poses_per_row_ * (nr_poses_per_column_ - 1) + j][index]); + model_view_matrix = view_matrix_ * states[nr_poses_per_row_ * (nr_poses_per_column_ - 1) + j][index]; glUniformMatrix4fv(model_view_matrix_ID_, 1, GL_FALSE, model_view_matrix.data()); glDrawElements(GL_TRIANGLES, indices_per_object_[index], GL_UNSIGNED_INT, (void*) (start_position_[index] * sizeof(uint))); + #ifdef DEBUG + check_GL_errors("render call"); + #endif } } - #ifdef PROFILING_ACTIVE glFinish(); glEndQuery(GL_TIME_ELAPSED); @@ -383,28 +391,38 @@ void ObjectRasterizer::render(const std::vector > 0, // 4. tex ID 0); +#ifdef DEBUG + check_GL_errors("detaching texture from framebuffer"); +#endif + #ifdef PROFILING_ACTIVE glFinish(); glEndQuery(GL_TIME_ELAPSED); store_time_measurements(); #endif - /* TODO should be unnecessary when texture is previously detached from framebuffer.. - * would like to find evidence that this detaching really introduces a synchronization though*/ - glFinish(); - } void ObjectRasterizer::set_objects(vector object_numbers) { - // TODO does it copy the vector or set a reference? object_numbers_ = object_numbers; } + +void ObjectRasterizer::set_resolution(const int n_rows, const int n_cols, + int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column, const bool adapt_to_constraints) { + nr_rows_ = n_rows; + nr_cols_ = n_cols; + + // reallocate textures + allocate_textures_for_max_poses(nr_poses, nr_poses_per_row, nr_poses_per_column, adapt_to_constraints); +} + void ObjectRasterizer::allocate_textures_for_max_poses(int& allocated_poses, int& allocated_poses_per_row, - int& allocated_poses_per_column) { + int& allocated_poses_per_column, + const bool adapt_to_constraints) { int max_poses_per_row = floor(max_texture_size_ / nr_cols_); int max_poses_per_column = floor(max_texture_size_ / nr_rows_); @@ -412,14 +430,20 @@ void ObjectRasterizer::allocate_textures_for_max_poses(int& allocated_poses, allocated_poses_per_column = min(max_poses_per_column, (int) ceil(allocated_poses / (float) allocated_poses_per_row)); if (allocated_poses > allocated_poses_per_row * allocated_poses_per_column) { - std::cout << "The space for the number of maximum poses you requested (" << allocated_poses << ") cannot be allocated. " - << "The limit is OpenGL texture size: " << max_texture_size_ << ". Current resolution is (" << nr_cols_ << ", " - << nr_rows_ << ") , which means a maximum of (" << max_poses_per_row << ", " << max_poses_per_column << ") poses. " - << "As a result, space for " << allocated_poses_per_row * allocated_poses_per_column << " poses will be allocated " - << "in the form of (" << allocated_poses_per_row << ", " << allocated_poses_per_column << ")." << std::endl; + if (adapt_to_constraints) { + std::cout << "WARNING (OPENGL): The space for the number of maximum poses you requested (" << allocated_poses << ") cannot be allocated. " + << "The limit is OpenGL texture size: " << max_texture_size_ << ". Current resolution is (" << nr_cols_ << ", " + << nr_rows_ << ") , which means a maximum of (" << max_poses_per_row << ", " << max_poses_per_column << ") poses. " + << "As a result, space for " << allocated_poses_per_row * allocated_poses_per_column << " poses will be allocated " + << "in the form of (" << allocated_poses_per_row << ", " << allocated_poses_per_column << ")." << std::endl; + } else { + std::cout << "ERROR (OPENGL): The number of poses you requested cannot be rendered. The limit is the maximum OpenGL texture size: " + << max_texture_size_ << " x " << max_texture_size_ << ". You requested a resolution of " << nr_cols_ << " x " << nr_rows_ + << " and " << allocated_poses << " poses." << std::endl; + exit(-1); + } } - allocated_poses = allocated_poses_per_row * allocated_poses_per_column; nr_max_poses_ = allocated_poses; @@ -430,87 +454,37 @@ void ObjectRasterizer::allocate_textures_for_max_poses(int& allocated_poses, nr_poses_per_column_ = allocated_poses_per_column; reallocate_buffers(); - - - - /* -// if (nr_max_poses_ != nr_max_poses) { - nr_max_poses_ = nr_max_poses; - nr_poses_ = nr_max_poses; - -// // TODO max_dimension[0], [1], at the moment they are identical - float sqrt_poses = sqrt(nr_poses_); - // TODO this can be done smarter. I want to only increment sqrt_poses, if it is not an int, i.e. 10.344 instead of 10) - if (sqrt_poses * sqrt_poses != nr_poses_) sqrt_poses = (int) ceil(sqrt_poses); - // TODO max_dimension[0], [1], at the moment they are identical - nr_max_poses_per_row_ = min((int) sqrt_poses, (max_texture_size_ / nr_cols_)); - int y_poses = nr_max_poses_ / nr_max_poses_per_row_; - if (y_poses * nr_max_poses_per_row_ < nr_max_poses_) y_poses++; - nr_max_poses_per_column_ = min(y_poses, (max_texture_size_ / nr_rows_)); - - nr_poses_per_row_ = nr_max_poses_per_row_; - nr_poses_per_column_ = nr_max_poses_per_column_; - - reallocate_buffers(); -// }*/ } -void ObjectRasterizer::set_number_of_poses(const int nr_poses, int& nr_poses_per_row, int& nr_poses_per_column) { - if (nr_poses <= nr_max_poses_) { - nr_poses_per_row = min(nr_max_poses_per_row_, nr_poses); - nr_poses_per_column = min(nr_max_poses_per_column_, (int) ceil(nr_poses / (float) nr_poses_per_row)); - - nr_poses_ = nr_poses; - nr_poses_per_row_ = nr_poses_per_row; - nr_poses_per_column_ = nr_poses_per_column; - - - - /* - nr_poses_ = nr_poses; - +void ObjectRasterizer::set_number_of_poses(int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column, const bool adapt_to_constraints) { -// // TODO max_dimension[0], [1], at the moment they are identical - float sqrt_poses = sqrt(nr_poses_); - // TODO this can be done smarter. I want to only increment sqrt_poses, if it is not an int, i.e. 10.344 instead of 10) - if (sqrt_poses * sqrt_poses != nr_poses_) sqrt_poses = (int) ceil(sqrt_poses); - // TODO max_dimension[0], [1], at the moment they are identical - nr_poses_per_row_ = min((int) sqrt_poses, (max_texture_size_ / nr_cols_)); - int y_poses = nr_poses_ / nr_poses_per_row_; - if (y_poses * nr_poses_per_row_ < nr_poses_) y_poses++; - nr_poses_per_column_ = min(y_poses, (max_texture_size_ / nr_rows_)); - - if (nr_poses_per_row_ > nr_max_poses_per_row_ || nr_poses_per_column_ > nr_max_poses_per_column_) { - cout << "WARNING: You tried to evaluate more poses in a row or in a column than was allocated in the beginning." - << endl << "Check set_number_of_poses() functions in object_rasterizer.cpp" << endl; - }*/ - - } else { - cout << "ERROR (OpenGL): You tried to evaluate more poses (" << nr_poses << ") than specified by max_poses (" << nr_max_poses_ << ")" << endl; - exit(-1); + if (nr_poses > nr_max_poses_) { + if (adapt_to_constraints) { + std::cout << "WARNING (OPENGL): You tried to evaluate more poses (" << nr_poses << ") than specified by max_poses (" << nr_max_poses_ << ")." + << "The number of poses was automatically reduced to " << nr_max_poses_ << "." << std::endl; + nr_poses = nr_max_poses_; + } else { + cout << "ERROR (OPENGL): You tried to evaluate more poses (" << nr_poses << ") than specified by max_poses (" << nr_max_poses_ << ")" << endl; + exit(-1); + } } -} -void ObjectRasterizer::set_resolution(const int n_rows, const int n_cols, int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column) { - nr_rows_ = n_rows; - nr_cols_ = n_cols; + nr_poses_per_row = min(nr_max_poses_per_row_, nr_poses); + nr_poses_per_column = min(nr_max_poses_per_column_, (int) ceil(nr_poses / (float) nr_poses_per_row)); - // reallocate textures - allocate_textures_for_max_poses(nr_poses, nr_poses_per_row, nr_poses_per_column); + nr_poses_ = nr_poses; + nr_poses_per_row_ = nr_poses_per_row; + nr_poses_per_column_ = nr_poses_per_column; } + GLuint ObjectRasterizer::get_framebuffer_texture() { return framebuffer_texture_for_all_poses_; } -int ObjectRasterizer::get_nr_poses_per_row() { - return nr_poses_per_row_; -} - -void ObjectRasterizer::get_depth_values(std::vector > &intersect_indices, - std::vector > &depth) { +vector > ObjectRasterizer::get_depth_values() { // ===================== ATTACH TEXTURE TO FRAMEBUFFER ================ // @@ -559,22 +533,7 @@ void ObjectRasterizer::get_depth_values(std::vector > &intersec } } - // filling the respective values per pose into their indices and depth vectors - vector tmp_indices; - vector tmp_depth; - - for (int state = 0; state < nr_poses_; state++) { - for (int i = 0; i < nr_rows_ * nr_cols_; i++) { - if (depth_image_per_pose[state][i] != 0) { - tmp_indices.push_back(i); - tmp_depth.push_back(depth_image_per_pose[state][i]); - } - } - intersect_indices.push_back(tmp_indices); - depth.push_back(tmp_depth); - tmp_indices.resize(0); - tmp_depth.resize(0); - } + return depth_image_per_pose; } else { cout << "WARNING: Could not map Pixel Pack Buffer." << endl; @@ -589,12 +548,11 @@ void ObjectRasterizer::get_depth_values(std::vector > &intersec GL_TEXTURE_2D, // 3. tex target: GL_TEXTURE_2D 0, // 4. tex ID 0); -} - - - - + #ifdef DEBUG + check_GL_errors("copying depth values to CPU"); + #endif +} @@ -656,22 +614,6 @@ void ObjectRasterizer::reallocate_buffers() { } - -Matrix4f ObjectRasterizer::get_model_matrix(const vector state) { - // Model matrix equals the state of the object. Position and Quaternion just have to be expressed as a matrix. - // note: state = (q.w, q.x, q.y, q.z, v.x, v.y, v.z) - Matrix4f model_matrix = Matrix4f::Identity(); - Transform model_matrix_transform; - model_matrix_transform = Translation3f(state[4], state[5], state[6]); - model_matrix = model_matrix_transform.matrix(); - - Quaternionf qRotation = Quaternionf(state[0], state[1], state[2], state[3]); - model_matrix.topLeftCorner(3, 3) = qRotation.toRotationMatrix(); - - return model_matrix; -} - - void ObjectRasterizer::setup_view_matrix() { // =========================== VIEW MATRIX =========================== // @@ -691,8 +633,8 @@ void ObjectRasterizer::setup_projection_matrix(const Eigen::Matrix3f camera_matr Vector3f boundaries_min = camera_matrix_inverse * Vector3f(-0.5, -0.5, 1); Vector3f boundaries_max = camera_matrix_inverse * Vector3f(float(nr_cols_)-0.5, float(nr_rows_)-0.5, 1); - float near = NEAR_PLANE; - float far = FAR_PLANE; + float near = near_plane_; + float far = far_plane_; float left = near * boundaries_min(0); float right = near * boundaries_max(0); float top = -near * boundaries_min(1); @@ -733,7 +675,7 @@ void ObjectRasterizer::store_time_measurements() { } glGetQueryObjectuiv(time_query_[i], GL_QUERY_RESULT, &time_elapsed_ns); time_elapsed_s = time_elapsed_ns / (double) 1e9; - gpu_times_aggregate_[i] += time_elapsed_s; + time_measurement_[i] += time_elapsed_s; } @@ -741,7 +683,7 @@ void ObjectRasterizer::store_time_measurements() { if (initial_run_) { initial_run_ = false; for (int i = 0; i < NR_SUBROUTINES_TO_MEASURE; i++) { - gpu_times_aggregate_[i] = 0; + time_measurement_[i] = 0; } nr_calls_ = 0; @@ -803,7 +745,7 @@ bool ObjectRasterizer::check_framebuffer_status() { ObjectRasterizer::~ObjectRasterizer() { -#ifdef PROFILING_ON +#ifdef PROFILING_ACTIVE if (nr_calls_ != 0) { cout << endl << "Time measurements for the different steps of the rendering process averaged over " << nr_calls_ << " render calls:" << endl << endl; @@ -811,12 +753,12 @@ ObjectRasterizer::~ObjectRasterizer() double total_time_per_render = 0; for (int i = 0; i < NR_SUBROUTINES_TO_MEASURE; i++) { - total_time_per_render += gpu_times_aggregate_[i]; + total_time_per_render += time_measurement_[i]; } total_time_per_render /= nr_calls_; for (int i = 0; i < NR_SUBROUTINES_TO_MEASURE; i++) { - double time_per_subroutine = gpu_times_aggregate_[i] / nr_calls_; + double time_per_subroutine = time_measurement_[i] / nr_calls_; cout << get_text_for_enum(i) << ": " << "\t " << time_per_subroutine << " s \t " << setprecision(1) @@ -831,7 +773,6 @@ ObjectRasterizer::~ObjectRasterizer() glDeleteQueries(NR_SUBROUTINES_TO_MEASURE, time_query_); #endif - cout << "clean up OpenGL.." << endl; glDisableVertexAttribArray(0); glDeleteVertexArrays(1, &vertex_array_); @@ -848,4 +789,3 @@ ObjectRasterizer::~ObjectRasterizer() } - diff --git a/src/dbot/model/observation/gpu/shader.cpp b/src/dbot/model/observation/gpu/shader.cpp index f6b4f09..59f8376 100644 --- a/src/dbot/model/observation/gpu/shader.cpp +++ b/src/dbot/model/observation/gpu/shader.cpp @@ -4,7 +4,6 @@ #include #include #include -//using namespace std; #include #include