From 2fd63b624dec23a1b0b11f2749a4b1a98d30dbbb Mon Sep 17 00:00:00 2001
From: issac <jan.issac@gmail.com>
Date: Mon, 14 Dec 2015 17:16:44 +0100
Subject: [PATCH] Integrated Claudia's cleanup code

---
 cmake/info.cmake                              |    6 +-
 .../model/observation/gpu/cuda_filter.hpp     |  292 ++--
 .../kinect_image_observation_model_gpu.hpp    |  890 +++++++---
 .../observation/gpu/object_rasterizer.hpp     |  147 +-
 include/dbot/util/traits.hpp                  |    1 +
 src/dbot/model/observation/gpu/cuda_filter.cu | 1469 ++++-------------
 .../observation/gpu/object_rasterizer.cpp     |  292 ++--
 src/dbot/model/observation/gpu/shader.cpp     |    1 -
 8 files changed, 1334 insertions(+), 1764 deletions(-)
diff --git a/cmake/info.cmake b/cmake/info.cmake
index 30608c6..0013f3e 100644
--- a/cmake/info.cmake
+++ b/cmake/info.cmake
@@ -31,9 +31,9 @@ function(info_end)
 endfunction(info_end)
 
 function(info_project project_name project_version)
-message(STATUS "${COLOR_BORDER}== ${COLOR_CLEAR} ${COLOR_HEADER}${project_name}${COLOR_CLEAR}")
-message(STATUS "${COLOR_BORDER}== ${COLOR_CLEAR} Version: ${COLOR_BOLD}${project_version}${COLOR_CLEAR}")
-endfunction(info_package)
+  message(STATUS "${COLOR_BORDER}== ${COLOR_CLEAR} ${COLOR_HEADER}${project_name}${COLOR_CLEAR}")
+  message(STATUS "${COLOR_BORDER}== ${COLOR_CLEAR} Version: ${COLOR_BOLD}${project_version}${COLOR_CLEAR}")
+endfunction(info_project)
 
 function(info_header list_header)
   message(STATUS "${COLOR_BORDER}== ${COLOR_CLEAR} ")
diff --git a/include/dbot/model/observation/gpu/cuda_filter.hpp b/include/dbot/model/observation/gpu/cuda_filter.hpp
index 49b6568..5b214a6 100644
--- a/include/dbot/model/observation/gpu/cuda_filter.hpp
+++ b/include/dbot/model/observation/gpu/cuda_filter.hpp
@@ -1,168 +1,254 @@
+/// @author Claudia Pfreundt <claudilein@gmail.com>
+
 #ifndef POSE_TRACKING_MODELS_OBSERVATION_MODELS_CUDA_FILTER_HPP
 #define POSE_TRACKING_MODELS_OBSERVATION_MODELS_CUDA_FILTER_HPP
 
-#include "boost/shared_ptr.hpp"
 #include <curand_kernel.h>
-#include "GL/glut.h"
-
 #include <vector>
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <iostream>
-
 namespace fil
 {
+/// This class provides a parallel implementation of the weighting step on the
+/// GPU.
+/** After initializing the class and setting the execution parameters like the
+ * number of poses
+ *  and the resolution, you can weigh poses with the weigh_poses() function.
+ * Make sure to
+ *  always render the poses first with opengl, then map the texture into CUDA,
+ *  update the observation image with set_observations() and update the
+ * occlusion indices (after resampling)
+ *  before you call the weigh_poses() function.
+ */
 class CudaFilter
 {
-   public:
-    CudaFilter();
+public:
+    /// constructor which takes the resolution of the camera image
+    /**
+     * \param [in] nr_rows the number of rows in each camera image
+     * \param [in] nr_cols the number of columns in each camera image
+     */
+    CudaFilter(const int nr_rows = 60, const int nr_cols = 80);
+
+    /// destructor which frees the memory used on the GPU
     ~CudaFilter();
 
-    void init(std::vector<std::vector<float> > com_models,
-              float angle_sigma,
-              float trans_sigma,
-              float p_visible_init,
-              float c,
-              float log_c,
-              float p_visible_occluded,
-              float tail_weight,
-              float model_sigma,
-              float sigma_factor,
-              float max_depth,
-              float exponential_rate);
-
-    // filter functions
-    void propagate(const float& current_time,
-                   std::vector<std::vector<float> >& states);  // not used
-    void propagate_multiple(
-        const float& current_time,
-        std::vector<std::vector<std::vector<float> > >& states);  // not used
-    void compare(float observation_time,
-                 bool constant_occlusion,
-                 std::vector<float>& log_likelihoods);  // not used
-    void compare_multiple(bool update, std::vector<float>& log_likelihoods);
-    void resample(std::vector<int> resampling_indices);           // not used
-    void resample_multiple(std::vector<int> resampling_indices);  // not used
+    /// copies constants to GPU memory and initializes some memory-related
+    /// values.
+    /**
+     * This function has to be called once in the beginning, before calling the
+     * weigh_poses() function.
+     * \param [in] initial_occlusion_prob the initial probability for each pixel
+     * of being occluded, meaning
+      * that something occludes the object in this pixel
+     * \param [in] p_occluded_occluded the probability of a pixel staying
+     * occluded from one frame to the next
+     * \param [in] p_occluded_visible the probability of a pixel changing from
+     * being occluded to being visible
+     * from one frame to the next
+     * \param [in] tail_weight the probability of a faulty measurement
+     * \param [in] model_sigma the uncertainty in the 3D model of the object
+     * \param [in] sigma_factor the standard deviation of the measurement noise
+     * at a distance of 1m to the camera
+     * \param [in] max_depth maximum value which can be measured by the depth
+     * sensor
+     * \param [in] exponential_rate the rate of the exponential distribution
+     * that models the probability of a measurement coming from an unknown
+     * object
+     */
+    void init(const float initial_occlusion_prob,
+              const float p_occluded_occluded,
+              const float p_occluded_visible,
+              const float tail_weight,
+              const float model_sigma,
+              const float sigma_factor,
+              const float max_depth,
+              const float exponential_rate);
+
+    /// weights the different poses that were previously rendered with OpenGL
+    /**
+     * \param [in] update whether or not to update the occlusion probabilities
+     * during this weighting
+     * \param [out] log_likelihoods the computed likelihoods for each pose
+     */
+    void weigh_poses(const bool update_occlusions,
+                     std::vector<float>& log_likelihoods);
 
     // setters
-    void set_states(std::vector<std::vector<float> >& states,
-                    int seed);  // not needed if propagation not on GPU
-    void set_states_multiple(int n_objects,
-                             int n_features,
-                             int seed);  // not needed if propagation not on GPU
+    /// sets the number of threads used for the CUDA weighting kernel to the
+    /// desired number.
+    /// A default of 128 is used if nothing is specified here.
+    /**
+     * \param [in] nr_threads the desired number of threads
+     */
+    void set_nr_threads(const int nr_threads);
+
+    /// copies the observation image from the camera to the GPU for comparison
+    /**
+     * \param [in] observations a pointer to the observation values
+     * \param [in] observation_time the time at which this observation was
+     * captured
+     */
     void set_observations(const float* observations,
                           const float observation_time);
-    void set_observations(const float* observations);  // not used outside, can
-                                                       // be integrated into
-                                                       // above
-    void set_visibility_probabilities(const float* visibility_probabilities);
-    void set_prev_sample_indices(const int* prev_sample_indices);
+
+    /// sets the indices to the occlusion array for every state
+    /**
+     * \param [in] occlusion_indices [state_nr] = {index}. For each state, this
+     * gives the index
+     * into the occlusion array.
+     */
+    void set_occlusion_indices(const int* occlusion_indices);
+
+    /// sets the resolution for the images to be compared
+    /** This function might downgrade the number of poses or change the
+     * arrangement of the
+     *  poses in the grid due to the resolution change.
+     * \param [in] n_rows the number of rows in an image
+     * \param [in] n_cols the number of columns in an image
+     * \param [out] nr_poses the number of poses that will be weighted
+     * \param [out] nr_poses_per_row the number of poses per row that will be
+     * weighted
+     * \param [out] nr_poses_per_column the number of poses per column that will
+     * be weighted
+     * \param [in] adapt_to_constraints whether to automatically adapt to GPU
+     * constraints or quit the program if constraints are not met
+     */
     void set_resolution(const int n_rows,
                         const int n_cols,
                         int& nr_poses,
                         int& nr_poses_per_row,
-                        int& nr_poses_per_column);
+                        int& nr_poses_per_column,
+                        bool adapt_to_constraints = false);
+
+    /// sets the occlusion probabilities for all pixels for all states
+    /**
+    * \param [in] occlusion_probabilities a 1D-array of occlusion probabilities
+    * which should contain
+    * nr_rows * nr_cols * nr_poses values.
+    */
+    void set_occlusion_probabilities(const float* occlusion_probabilities);
+
+    /// maps the texture array to an actual texture reference
+    /**
+     * \param [in] texture_array the cudaArray retrieved from OpenGL
+     */
+    void map_texture_to_texture_array(const cudaArray_t texture_array);
+
+    /// allocates the maximum amount of memory that will ever be needed by CUDA
+    /// during runtime
+    /**
+     * \param [in][out] allocated_poses the maximum number of poses that will
+     * ever be evaluated in one weighting step.
+     * This number might be lowered if GPU contraints do now allow this number
+     * of poses.
+     * \param [out] allocated_poses_per_row the maximum number of poses per row
+     * \param [out] allocated_poses_per_column the maximum number of poses per
+     * column
+     * \param [in] adapt_to_constraints whether to automatically adapt to GPU
+     * constraints or quit the program if constraints are not met
+     */
     void allocate_memory_for_max_poses(int& allocated_poses,
                                        int& allocated_poses_per_row,
-                                       int& allocated_poses_per_column);
+                                       int& allocated_poses_per_column,
+                                       bool adapt_to_constraints = false);
+
+    /// sets the number of poses to be weighted in the next weighting step
+    /**
+     * \param [in][out] nr_poses the desired number of poses. Might be changed
+     * due to GPU constraints.
+     * \param [out] nr_poses_per_row the number of poses per row
+     * \param [out] nr_poses_per_column the number of poses per column
+     * \param [in] adapt_to_constraints whether to automatically adapt to GPU
+     * constraints or quit the program if constraints are not met
+     */
     void set_number_of_poses(int& nr_poses,
                              int& nr_poses_per_row,
-                             int& nr_poses_per_column);
-    void set_texture_array(cudaArray_t texture_array);
+                             int& nr_poses_per_column,
+                             bool adapt_to_constraints = false);
 
     // getters
-    std::vector<float> get_visibility_probabilities(int state_id);
-    std::vector<std::vector<float> > get_visibility_probabilities();  // returns
-                                                                      // all of
-                                                                      // them.
-                                                                      // Ask
-                                                                      // Manuel
-                                                                      // if they
-                                                                      // could
-                                                                      // need
-                                                                      // that.
-
-    void map_texture();
-    void destroy_context();
-
-   private:
-    // resolution values if not specified
-    static const int WINDOW_WIDTH = 80;
-    static const int WINDOW_HEIGHT = 60;
-    static const int DEFAULT_NR_THREADS = 128;
+    /// gets the maximum number of threads that can be handled with this GPU
+    /**
+     * \return the maximum number of threads that can be scheduled per block on
+     * the GPU
+     */
+    int get_max_nr_threads();
 
-    // time observation
-    static const int COUNT = 500;
-    int count_;
-    double compare_kernel_time_;
-    double copy_likelihoods_time_;
+    /// gets the warp size of this GPU
+    /**
+     * \return the warp size = the number of threads that are executed
+     * concurrently on a CUDA streaming multiprocessor
+     */
+    int get_warp_size();
+
+    /// gets the occlusion probabilities of a particular state
+    /**
+     * \param [in] state_id the index into the state array
+     * \return a 1D array containing the occlusion probability for each pixel
+     */
+    std::vector<float> get_occlusion_probabilities(int state_id);
+
+private:
+    static const int DEFAULT_NR_THREADS = 128;
 
     // device pointers to arrays stored in global memory on the GPU
-    float* d_states_;       // not needed if propagation not on GPU
-    float* d_states_copy_;  // not needed if propagation not on GPU
-    float* d_visibility_probs_;
-    float* d_visibility_probs_copy_;
+    float* d_occlusion_probs_;
+    float* d_occlusion_probs_copy_;
     float* d_observations_;
     float* d_log_likelihoods_;
-    int* d_prev_sample_indices_;
-    int* d_resampling_indices_;  // not needed if resampling not on GPU
-    curandStateMRG32k3a* d_mrg_states_;
+    int* d_occlusion_indices_;  // this contains, for each pose, the index into
+                                // the occlusion probabilities array, which
+                                // contains the occlusion probabilities for that
+                                // particular pose.
 
     // for OpenGL interop
     cudaArray_t d_texture_array_;
 
     // resolution
-    int n_cols_;
-    int n_rows_;
+    int nr_cols_;
+    int nr_rows_;
 
     // maximum number of poses and their arrangement in the OpenGL texture
     int nr_max_poses_;
     int nr_max_poses_per_row_;
     int nr_max_poses_per_column_;
 
-    // number of poses and their arrangement in the OpenGL texture
+    // actual number of poses and their arrangement in the OpenGL texture
+    // (current frame)
     int nr_poses_;
     int nr_poses_per_row_;
     int nr_poses_per_column_;
 
-    // number of features in a state vector
-    int n_features_;
-
     // block and grid arrangement of the CUDA kernels
     int nr_threads_, n_blocks_;
     dim3 grid_dimension_;
 
-    // system properties
-    int warp_size_;
-    int n_mps_;
-
-    // visibility prob default
-    float visibility_prob_default_;
+    // occlusion probability default value
+    float occlusion_prob_default_;
 
-    // time values to compute the time deltas when calling propagate() or
-    // evaluate()
+    // time values to compute the time deltas when calling the weighting
+    // function
     float occlusion_time_;
     float observation_time_;
 
-    // float delta_time_;
-    float last_propagation_time_;  // not needed if propagation not on GPU
-
-    // booleans to describe the state of the cuda filter, to avoid wrong usage
-    // of the class
-    bool n_poses_set_;
-
     // CUDA device properties
     cudaDeviceProp cuda_device_properties_;
+    int warp_size_;
+    int n_mps_;
+
+    // bool to ensure correct usage of public functions
+    bool observations_set_, occlusion_indices_set_,
+        occlusion_probabilities_set_, memory_allocated_;
+    bool number_of_poses_set_, constants_initialized_;
 
     void set_default_kernel_config(int& nr_poses_,
                                    int& nr_poses_per_row,
                                    int& nr_poses_per_column,
-                                   bool& nr_poses_changed);
-
+                                   bool& nr_poses_changed,
+                                   bool adapt_to_constraints);
     // helper functions
     template <typename T>
-    void allocate(T*& pointer, size_t size, std::string name);
+    void allocate(T*& pointer, size_t size);
     void check_cuda_error(const char* msg);
 };
 }
diff --git a/include/dbot/model/observation/gpu/kinect_image_observation_model_gpu.hpp b/include/dbot/model/observation/gpu/kinect_image_observation_model_gpu.hpp
index 353adc9..d2579f5 100644
--- a/include/dbot/model/observation/gpu/kinect_image_observation_model_gpu.hpp
+++ b/include/dbot/model/observation/gpu/kinect_image_observation_model_gpu.hpp
@@ -1,6 +1,11 @@
+/// @author Claudia Pfreundt <claudilein@gmail.com>
+
 #ifndef POSE_TRACKING_MODELS_OBSERVATION_MODELS_KINECT_IMAGE_OBSERVATION_MODEL_GPU_HPP
 #define POSE_TRACKING_MODELS_OBSERVATION_MODELS_KINECT_IMAGE_OBSERVATION_MODEL_GPU_HPP
 
+#define PROFILING_ACTIVE
+//#define OPTIMIZE_NR_THREADS
+
 #include <vector>
 #include "boost/shared_ptr.hpp"
 #include "boost/filesystem.hpp"
@@ -10,7 +15,6 @@
 #include <dbot/model/observation/rao_blackwell_observation_model.hpp>
 #include <osr/free_floating_rigid_bodies_state.hpp>
 
-#include <dbot/util/traits.hpp>
 #include <dbot/model/observation/gpu/object_rasterizer.hpp>
 #include <dbot/model/observation/gpu/cuda_filter.hpp>
 
@@ -20,34 +24,34 @@
 #include <iostream>
 #include <cuda_gl_interop.h>
 
-
-//#include <dbot/util/helper_functions.hpp>
+#include <dbot/util/helper_functions.hpp>
 #include <fl/util/profiling.hpp>
 
 namespace dbot
 {
-
 // Forward declarations
-template <typename State> class KinectImageObservationModelGPU;
+template <typename State>
+class KinectImageObservationModelGPU;
 
 namespace internal
 {
 /**
- * ImageObservationModelCPU distribution traits specialization
+ * ImageObservationModelGPU distribution traits specialization
  * \internal
  */
 template <typename State>
-struct Traits<KinectImageObservationModelGPU<State> >
+struct Traits<KinectImageObservationModelGPU<State>>
 {
-    typedef fl::Real Scalar;
+    typedef double Scalar;
+    typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic> Observation;
 
     typedef RbObservationModel<State> Base;
-    typedef typename Base::Observation Observation;
 
     typedef typename Eigen::Matrix<Scalar, 3, 3> CameraMatrix;
 };
 }
 
+
 /**
  * \class ImageObservationModelGPU
  *
@@ -55,15 +59,15 @@ struct Traits<KinectImageObservationModelGPU<State> >
  * \ingroup observation_models
  */
 template <typename State>
-class KinectImageObservationModelGPU:
-        public internal::Traits<KinectImageObservationModelGPU<State> >::Base
+class KinectImageObservationModelGPU
+    : public internal::Traits<KinectImageObservationModelGPU<State>>::Base
 {
 public:
-    typedef internal::Traits<KinectImageObservationModelGPU<State> > Traits;
+    typedef internal::Traits<KinectImageObservationModelGPU<State>> Traits;
 
-    typedef typename Traits::Scalar         Scalar;
-    typedef typename Traits::Observation    Observation;
-    typedef typename Traits::CameraMatrix   CameraMatrix;
+    typedef typename Traits::Scalar Scalar;
+    typedef typename Traits::Observation Observation;
+    typedef typename Traits::CameraMatrix CameraMatrix;
 
     typedef typename Traits::Base::StateArray StateArray;
     typedef typename Traits::Base::RealArray RealArray;
@@ -71,45 +75,89 @@ class KinectImageObservationModelGPU:
 
     typedef typename Eigen::Transform<fl::Real, 3, Eigen::Affine> Affine;
 
-
-
-    // TODO: ALL THIS SHOULD SWITCH FROM USING VISIBILITY TO OCCLUSION
-    KinectImageObservationModelGPU(const CameraMatrix& camera_matrix,
-                    const size_t& n_rows,
-                    const size_t& n_cols,
-                    const size_t& max_sample_count,
-                    const std::vector<std::vector<Eigen::Vector3d> > vertices_double,
-                    const std::vector<std::vector<std::vector<int> > > indices,
-                    const std::string vertex_shader_path,
-                    const std::string fragment_shader_path,
-                    const Scalar& initial_occlusion_prob = 0.1d,
-                    const double& delta_time = 0.033d,
-                    const float p_occluded_visible = 0.1f,
-                    const float p_occluded_occluded = 0.7f,
-                    const float tail_weight = 0.01f,
-                    const float model_sigma = 0.003f,
-                    const float sigma_factor = 0.0014247f,
-                    const float max_depth = 6.0f,
-                    const float exponential_rate = -log(0.5f)):
-            camera_matrix_(camera_matrix),
-            n_rows_(n_rows),
-            n_cols_(n_cols),
-            nr_max_poses_(max_sample_count),
-            indices_(indices),
-            initial_visibility_prob_(1 - initial_occlusion_prob),
-            p_visible_visible_(1.0 - p_occluded_visible),
-            p_visible_occluded_(1.0 - p_occluded_occluded),
-            tail_weight_(tail_weight),
-            model_sigma_(model_sigma),
-            sigma_factor_(sigma_factor),
-            max_depth_(max_depth),
-            exponential_rate_(exponential_rate),
-            nr_poses_(max_sample_count),
-            observations_set_(false),
-            resource_registered_(false),
-            nr_calls_set_observation_(0),
-            observation_time_(0),
-            Traits::Base(delta_time)
+    /// constructor which takes relevant constants and initializes the graphics
+    /// pipeline with them
+    /**
+     * \param [in] camera_matrix
+     * 		matrix of the intrinsic parameters of the camera
+     * \param [in] nr_rows
+     * 		the number of rows in one sensor image (vertical resolution)
+     * \param [in] nr_cols
+     * 		the number of columns in one sensor image (horizontal
+     * 		resolution)
+     * \param [in] max_sample_count
+     * 		the maximum number of poses that will be rendered per object
+     *      in one frame.
+     * This is needed to allocate the necessary memory on the GPU.
+     * \param [in] vertices
+     * 		[object_nr][vertex_nr] = {x, y, z}. This list should
+     * 		contain, for each object, a list of 3-dimensional vectors
+     * 		that specify the corners of the triangles of the object mesh
+     * \param [in] indices [object_nr][index_nr][0 - 2] = {index}. This list
+     * 		should contain the indices
+     * 		that index the vertices list and tell us which vertices to connect to
+     * a
+     * 		triangle (every group of 3).
+     * 		For each object, the indices should be in the range of [0, nr_vertices
+     * - 1].
+     * \param [in] vertex_shader_path path to the vertex shader
+     * \param [in] fragment_shader_path path to the fragment shader
+     * \param [in] initial_occlusion_prob the initial probability for each pixel
+     * of being occluded, meaning
+     * that something occludes the object in this pixel
+     * \param [in] delta_time the time between two frames in seconds. This
+     * should
+     * correspond to the rate at which the
+     * sensor provides new images of the scene.
+     * \param [in] p_occluded_visible the probability of a pixel changing from
+     * being occluded to being visible
+     * from one frame to the next
+     * \param [in] p_occluded_occluded the probability of a pixel staying
+     * occluded from one frame to the next
+     * \param [in] tail_weight the probability of a faulty measurement
+     * \param [in] model_sigma the uncertainty in the 3D model of the object
+     * \param [in] sigma_factor the standard deviation of the measurement noise
+     * at a distance of 1m to the camera
+     * \param [in] max_depth maximum value which can be measured by the depth
+     * sensor
+     * \param [in] exponential_rate the rate of the exponential distribution
+     * that
+     * models the probability of a measurement coming from an unknown object
+     */
+    KinectImageObservationModelGPU(
+        const CameraMatrix& camera_matrix,
+        const size_t& n_rows,
+        const size_t& n_cols,
+        const size_t& max_sample_count,
+        const std::vector<std::vector<Eigen::Vector3d>> vertices_double,
+        const std::vector<std::vector<std::vector<int>>> indices,
+        const std::string vertex_shader_path,
+        const std::string fragment_shader_path,
+        const Scalar& initial_occlusion_prob = 0.1d,
+        const double& delta_time = 0.033d,
+        const float p_occluded_visible = 0.1f,
+        const float p_occluded_occluded = 0.7f,
+        const float tail_weight = 0.01f,
+        const float model_sigma = 0.003f,
+        const float sigma_factor = 0.0014247f,
+        const float max_depth = 6.0f,
+        const float exponential_rate = -log(0.5f))
+        : camera_matrix_(camera_matrix),
+          n_rows_(n_rows),
+          n_cols_(n_cols),
+          nr_max_poses_(max_sample_count),
+          indices_(indices),
+          initial_occlusion_prob_(initial_occlusion_prob),
+          tail_weight_(tail_weight),
+          model_sigma_(model_sigma),
+          sigma_factor_(sigma_factor),
+          max_depth_(max_depth),
+          exponential_rate_(exponential_rate),
+          nr_poses_(max_sample_count),
+          observations_set_(false),
+          resource_registered_(false),
+          observation_time_(0),
+          Traits::Base(delta_time)
     {
         // set constants
         this->default_poses_.recount(vertices_double.size());
@@ -117,393 +165,687 @@ class KinectImageObservationModelGPU:
 
         // convert doubles to floats
         vertices_.resize(vertices_double.size());
-        for(size_t object_index = 0; object_index < vertices_.size(); object_index++)
+        for (size_t object_index = 0; object_index < vertices_.size();
+             object_index++)
         {
-            vertices_[object_index].resize(vertices_double[object_index].size());
-            for(size_t vertex_index = 0; vertex_index < vertices_[object_index].size(); vertex_index++)
-                vertices_[object_index][vertex_index] = vertices_double[object_index][vertex_index].cast<float>();
+            vertices_[object_index].resize(
+                vertices_double[object_index].size());
+            for (size_t vertex_index = 0;
+                 vertex_index < vertices_[object_index].size();
+                 vertex_index++)
+                vertices_[object_index][vertex_index] =
+                    vertices_double[object_index][vertex_index].cast<float>();
         }
 
         // check for incorrect path names
-        if(!boost::filesystem::exists(vertex_shader_path))
+        if (!boost::filesystem::exists(vertex_shader_path))
         {
             std::cout << "vertex shader does not exist at: "
-                 << vertex_shader_path << std::endl;
+                      << vertex_shader_path << std::endl;
             exit(-1);
         }
-        if(!boost::filesystem::exists(fragment_shader_path))
+        if (!boost::filesystem::exists(fragment_shader_path))
         {
             std::cout << "fragment_shader does not exist at: "
-                 << fragment_shader_path << std::endl;
+                      << fragment_shader_path << std::endl;
             exit(-1);
         }
 
-        vertex_shader_path_ =  vertex_shader_path;
+        vertex_shader_path_ = vertex_shader_path;
         fragment_shader_path_ = fragment_shader_path;
 
-
         // initialize opengl and cuda
-        opengl_ = boost::shared_ptr<ObjectRasterizer>
-                (new ObjectRasterizer(vertices_,
-                                      indices_,
-                                      vertex_shader_path_,
-                                      fragment_shader_path_,
-                                      camera_matrix_.cast<float>()));
-        cuda_ = boost::shared_ptr<fil::CudaFilter> (new fil::CudaFilter());
-
-
-        // sets the dimensions of how many poses will be rendered per row and per column in a texture
-        opengl_->allocate_textures_for_max_poses(nr_max_poses_, nr_poses_per_row_, nr_poses_per_column_);
-
-        std::cout << "OpenGL: allocated " << nr_max_poses_ << " poses in the form ("
-                  << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl;
-
-        int tmp_max_nr_poses = nr_max_poses_;
-        int tmp_nr_poses_per_row = nr_poses_per_row_;
-        int tmp_nr_poses_per_column = nr_poses_per_column_;
-
-        cuda_->allocate_memory_for_max_poses(nr_max_poses_, nr_poses_per_row_, nr_poses_per_column_);
-
-        std::cout << "CUDA: allocated " << nr_max_poses_ << " poses in the form ("
-                  << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl;
-
-
-        // if number of poses gets limited by cuda, we have to reallocate the textures in OpenGL again
-        if (tmp_max_nr_poses != nr_max_poses_ ||
-            tmp_nr_poses_per_row != nr_poses_per_row_ ||
-            tmp_nr_poses_per_column != nr_poses_per_column_) {
-            opengl_->allocate_textures_for_max_poses(nr_max_poses_, nr_poses_per_row_, nr_poses_per_column_);
-
-            std::cout << "OpenGL adapts to CUDA restrictions: allocated " << nr_max_poses_ << " poses in the form ("
-                      << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl;
-        }
-
+        opengl_ = boost::shared_ptr<ObjectRasterizer>(
+            new ObjectRasterizer(vertices_,
+                                 indices_,
+                                 vertex_shader_path_,
+                                 fragment_shader_path_,
+                                 camera_matrix_.cast<float>()));
 
-        std:: cout << "set resolution in cuda..." << std::endl;
+        cuda_ = boost::shared_ptr<fil::CudaFilter>(new fil::CudaFilter());
 
-        opengl_->set_resolution(n_rows_, n_cols_, nr_max_poses_, nr_poses_per_row_, nr_poses_per_column_);
+        // allocates memory and sets the dimensions of how many poses will be
+        // rendered per row and per column in the texture
+        allocate_memory_for_max_poses();
 
-        std::cout << "OpenGL: setting resolution changes allocation to " << nr_max_poses_ << " poses in the form ("
-                  << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl;
-
-        tmp_max_nr_poses = nr_max_poses_;
-        tmp_nr_poses_per_row = nr_poses_per_row_;
-        tmp_nr_poses_per_column = nr_poses_per_column_;
-
-        cuda_->set_resolution(n_rows_, n_cols_, nr_max_poses_, nr_poses_per_row_, nr_poses_per_column_);
-
-        std::cout << "CUDA: setting resolution changes allocation to " << nr_max_poses_ << " poses in the form ("
-                  << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl;
-
-        // if number of poses gets limited by cuda, we have to reallocate the textures in OpenGL again
-        if (tmp_max_nr_poses != nr_max_poses_ ||
-            tmp_nr_poses_per_row != nr_poses_per_row_ ||
-            tmp_nr_poses_per_column != nr_poses_per_column_) {
-            opengl_->set_resolution(n_rows_, n_cols_, nr_max_poses_, nr_poses_per_row_, nr_poses_per_column_);
-
-            std::cout << "OpenGL adapts to CUDA restrictions: setting resolution changes allocation to " << nr_max_poses_ << " poses in the form ("
-                      << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl;
-        }
+        // sets the resolution and reallocates the texture
+        set_resolution();
 
         register_resource();
 
-        std:: cout << "set occlusions..." << std::endl;
-
         reset();
 
-        float c = p_visible_visible_ - p_visible_occluded_;
-        float log_c = log(c);
-
-        std::vector<std::vector<float> > dummy_com_models;
-        cuda_->init(dummy_com_models, 0.0f, 0.0f,
-                    initial_visibility_prob_, c, log_c, p_visible_occluded_,
-                    tail_weight_, model_sigma_, sigma_factor_, max_depth_, exponential_rate_);
-
+        cuda_->init(initial_occlusion_prob_,
+                    p_occluded_occluded,
+                    p_occluded_visible,
+                    tail_weight_,
+                    model_sigma_,
+                    sigma_factor_,
+                    max_depth_,
+                    exponential_rate_);
 
         count_ = 0;
-//        render_time_ = 0;
 
-        visibility_probs_.resize(n_rows_ * n_cols_);
-    }
+        occlusion_probs_.resize(n_rows_ * n_cols_);
 
-    ~KinectImageObservationModelGPU() noexcept {
+#ifdef PROFILING_ACTIVE
+        for (int i = 0; i < NR_SUBTASKS_TO_MEASURE; i++)
+        {
+            time_[i] = 0;
+        }
+        strings_for_subtasks_[SET_OCCLUSION_INDICES] =
+            "Setting occlusion indices";
+        strings_for_subtasks_[CONVERTING_STATE_FORMAT] =
+            "Converting state format";
+        strings_for_subtasks_[RENDERING] = "Rendering step";
+        strings_for_subtasks_[MAPPING] =
+            "Mapping the texture from OpenGL to CUDA";
+        strings_for_subtasks_[WEIGHTING] = "Weighting step";
+        strings_for_subtasks_[UNMAPPING] =
+            "Unmapping the texture and reconverting the likelihoods";
+#endif
 
-//        std::cout << "time for render: " << render_time_ / count_ << std::endl;
+        optimize_nr_threads_ = false;
+        adapt_to_constraints_ = false;
 
-    }
+#ifdef OPTIMIZE_NR_THREADS
+        set_optimization_of_thread_nr(true);
+#endif
+        if (optimize_nr_threads_)
+        {
+            max_nr_threads_ = cuda_->get_max_nr_threads();
+            warp_size_ = cuda_->get_warp_size();
+            nr_threads_ = warp_size_;
+            best_time_ = std::numeric_limits<double>::infinity();
+            best_nr_threads_ = nr_threads_;
+            average_time_ = 0;
+        }
 
+        optimization_runs_ = 0;
+    }
 
+    /// computes the loglikelihoods for the given states
+    /** Make sure the observation image was set previously, as it is used for
+     * comparison.
+     * \param [in] deltas the states which should be evaluated
+     * \param [in][out] occlusion_indices for each state, this should contain the
+     * index into the occlusion
+     * array where the corresponding occlusion probabilities per pixel are
+     * stored
+     * \param [in] update_occlusions whether or not the occlusions should be
+     * updated in this evaluation step
+     * \return the loglikelihoods for the given states
+     */
     RealArray loglikes(const StateArray& deltas,
-                                  IntArray& occlusion_indices,
-                                  const bool& update_occlusions = false)
+                       IntArray& occlusion_indices,
+                       const bool& update_occlusions = false)
     {
+#ifdef PROFILING_ACTIVE
+        if (!optimize_nr_threads_ && optimization_runs_ != count_)
+        {
+            time_before_ = dbot::hf::get_wall_time();
+        }
+#endif
 
-        if(!observations_set_)
+        if (!observations_set_)
         {
-            std:: cout << "GPU: observations not set" << std::endl;
+            std::cout << "GPU: observations not set" << std::endl;
             exit(-1);
         }
 
         nr_poses_ = deltas.size();
-        std::vector<float> flog_likelihoods (nr_poses_, 0);
+        std::vector<float> flog_likelihoods(nr_poses_, 0);
         set_number_of_poses(nr_poses_);
 
         // transform occlusion indices from size_t to int
-        std::vector<int> occlusion_indices_transformed (occlusion_indices.size(), 0);
+        std::vector<int> occlusion_indices_transformed(occlusion_indices.size(),
+                                                       0);
         for (size_t i = 0; i < occlusion_indices.size(); i++)
         {
-            occlusion_indices_transformed[i] = (int) occlusion_indices[i];
+            occlusion_indices_transformed[i] = (int)occlusion_indices[i];
         }
 
-        INIT_PROFILING;
         // copy occlusion indices to GPU
-        cuda_->set_prev_sample_indices(occlusion_indices_transformed.data());
-        MEASURE("gpu: setting occlusion indices");
-        // convert to internal state format
-        std::vector<std::vector<std::vector<float> > > poses(
-                    nr_poses_,
-                    std::vector<std::vector<float> >(vertices_.size(),
-                    std::vector<float>(7, 0)));
+        cuda_->set_occlusion_indices(occlusion_indices_transformed.data());
 
-        MEASURE("gpu: creating state vectors");
+#ifdef PROFILING_ACTIVE
+        store_time(SET_OCCLUSION_INDICES);
+#endif
 
+        std::vector<std::vector<Eigen::Matrix4f>> poses(
+            nr_poses_, std::vector<Eigen::Matrix4f>(vertices_.size()));
 
-        for(size_t i_state = 0; i_state < size_t(nr_poses_); i_state++)
+        for (size_t i_state = 0; i_state < size_t(nr_poses_); i_state++)
         {
-            for(size_t i_obj = 0; i_obj < vertices_.size(); i_obj++)
+            for (size_t i_obj = 0; i_obj < vertices_.size(); i_obj++)
             {
                 auto pose_0 = this->default_poses_.component(i_obj);
                 auto delta = deltas[i_state].component(i_obj);
 
                 osr::PoseVector pose;
+
                 pose.orientation() = delta.orientation() * pose_0.orientation();
                 pose.position() = delta.position() + pose_0.position();
 
-                poses[i_state][i_obj][0] = pose.orientation().quaternion().w();
-                poses[i_state][i_obj][1] = pose.orientation().quaternion().x();
-                poses[i_state][i_obj][2] = pose.orientation().quaternion().y();
-                poses[i_state][i_obj][3] = pose.orientation().quaternion().z();
-                poses[i_state][i_obj][4] = pose.position()[0];
-                poses[i_state][i_obj][5] = pose.position()[1];
-                poses[i_state][i_obj][6] = pose.position()[2];
+                poses[i_state][i_obj] = pose.homogeneous().cast<float>();
             }
         }
 
-        MEASURE("gpu: converting state format");
-
-
-//        double before_render = dbot::hf::get_wall_time();
-
+#ifdef PROFILING_ACTIVE
+        store_time(CONVERTING_STATE_FORMAT);
+#endif
 
         opengl_->render(poses);
 
-//        double after_render = dbot::hf::get_wall_time();
-//        render_time_ += after_render - before_render;
-        count_++;
+#ifdef PROFILING_ACTIVE
+        store_time(RENDERING);
+#endif
 
+        cudaGraphicsMapResources(1, &texture_resource_, 0);
+        cudaGraphicsSubResourceGetMappedArray(
+            &texture_array_, texture_resource_, 0, 0);
+        cuda_->map_texture_to_texture_array(texture_array_);
 
-        MEASURE("gpu: rendering");
+#ifdef PROFILING_ACTIVE
+        store_time(MAPPING);
+#endif
 
+        if (optimize_nr_threads_)
+        {
+            if (nr_threads_ <= max_nr_threads_)
+            {
+                before_weighting_ = dbot::hf::get_wall_time();
+                cuda_->set_nr_threads(nr_threads_);
+                optimization_runs_++;
+            }
+            else
+            {
+                nr_threads_ = best_nr_threads_;
+                optimize_nr_threads_ = false;
+                std::cout << std::endl
+                          << "Best #threads: " << nr_threads_ << std::endl
+                          << std::endl;
+            }
+        }
 
-        cudaGraphicsMapResources(1, &combined_texture_resource_, 0);
-        cudaGraphicsSubResourceGetMappedArray(&texture_array_, combined_texture_resource_, 0, 0);
-        cuda_->set_texture_array(texture_array_);
-        cuda_->map_texture();
-        MEASURE("gpu: mapping texture");
+        cuda_->weigh_poses(update_occlusions, flog_likelihoods);
 
-        cuda_->compare_multiple(update_occlusions, flog_likelihoods);
-        cudaGraphicsUnmapResources(1, &combined_texture_resource_, 0);
+        if (optimize_nr_threads_)
+        {
+            if (nr_threads_ <= max_nr_threads_)
+            {
+                after_weighting_ = dbot::hf::get_wall_time();
+                double time = after_weighting_ - before_weighting_;
+                average_time_ += time;
+
+                if (count_ % NR_ROUNDS_PER_SETTING_ ==
+                    NR_ROUNDS_PER_SETTING_ - 1)
+                {
+                    average_time_ /= NR_ROUNDS_PER_SETTING_;
+                    if (average_time_ < best_time_)
+                    {
+                        best_time_ = average_time_;
+                        best_nr_threads_ = nr_threads_;
+                    }
+
+                    nr_threads_ += warp_size_;
+                    average_time_ = 0;
+                }
+            }
+        }
 
-        MEASURE("gpu: computing likelihoods");
+#ifdef PROFILING_ACTIVE
+        store_time(WEIGHTING);
+#endif
 
+        cudaGraphicsUnmapResources(1, &texture_resource_, 0);
 
-        if(update_occlusions)
+        if (update_occlusions)
         {
-            for(size_t i_state = 0; i_state < occlusion_indices.size(); i_state++)
+            for (size_t i_state = 0; i_state < occlusion_indices.size();
+                 i_state++)
                 occlusion_indices[i_state] = i_state;
-
-            MEASURE("gpu: updating occlusions");
         }
 
-
         // convert
         RealArray log_likelihoods(flog_likelihoods.size());
-        for(size_t i = 0; i < flog_likelihoods.size(); i++)
+        for (size_t i = 0; i < flog_likelihoods.size(); i++)
             log_likelihoods[i] = flog_likelihoods[i];
 
+#ifdef PROFILING_ACTIVE
+        store_time(UNMAPPING);
+#endif
+
+        count_++;
         return log_likelihoods;
     }
 
-
-    void set_observation(const Observation& image){
+    /// sets the observation image that should be used for comparison in the
+    /// next evaluation step
+    /**
+     * \param [in] image the image obtained from the camera
+     */
+    void set_observation(const Observation& image)
+    {
         std::vector<float> std_measurement(image.size());
 
-        for(size_t row = 0; row < image.rows(); row++)
-            for(size_t col = 0; col < image.cols(); col++)
-                std_measurement[row*image.cols() + col] = image(row, col);
+        for (size_t row = 0; row < image.rows(); row++)
+            for (size_t col = 0; col < image.cols(); col++)
+                std_measurement[row * image.cols() + col] = image(row, col);
+
+        observation_time_ += this->delta_time_;
 
-        set_observation(std_measurement, this->delta_time_);
+        cuda_->set_observations(std_measurement.data(), observation_time_);
+        observations_set_ = true;
     }
 
+    /// resets the occlusion probabilities and observation time
     virtual void reset()
     {
-        set_occlusions();
+        float default_occlusion_probability = initial_occlusion_prob_;
+
+        std::vector<float> occlusion_probabilities(
+            n_rows_ * n_cols_ * nr_poses_, default_occlusion_probability);
+        cuda_->set_occlusion_probabilities(occlusion_probabilities.data());
+
         observation_time_ = 0;
     }
 
+    /// activates automatic optimization of the number of threads
+    void set_optimization_of_thread_nr(bool shouldOptimize)
+    {
+        optimize_nr_threads_ = shouldOptimize;
+    }
 
-    // TODO: this image should be in a different format BOTH OF THEM!!
-    const std::vector<float> get_occlusions(size_t index) const
+    /// activates automatic downgrading of the number of poses, if hardware
+    /// limitations are reached
+    void set_adaptation_to_GPU_constraints(bool shouldAdapt)
     {
-        std::vector<float> visibility_probs = cuda_->get_visibility_probabilities((int) index);
-        return visibility_probs;
+        adapt_to_constraints_ = shouldAdapt;
     }
 
-    void get_range_image(std::vector<std::vector<int> > &intersect_indices,
-                    std::vector<std::vector<float> > &depth)
+    /// returns the occlusion probabilities for each pixel for a given state
+    /**
+     * \param [in] index the index into the state array of the state you are
+     * interested in
+     * \return an Eigen Matrix containing the occlusion probabilities for each
+     * pixel for the given state
+     */
+    const Eigen::Map<Observation> get_occlusions(size_t index) const
     {
-        opengl_->get_depth_values(intersect_indices, depth);
+        std::vector<float> occlusion_probs =
+            cuda_->get_occlusion_probabilities((int)index);
+
+        // convert values to doubles and put them into an Eigen Matrix
+        std::vector<double> occlusion_probs_double(occlusion_probs.begin(),
+                                                   occlusion_probs.end());
+        Eigen::Map<Observation> occlusion_probs_matrix(
+            &occlusion_probs_double[0], n_rows_, n_cols_);
+
+        return occlusion_probs_matrix;
     }
 
-private:
-    // TODO: this function should disappear, BOTH OF THEM
-    void set_observation(const std::vector<float>& observations, const Scalar& delta_time)
+    /// returns the depth values of the rendered states
+    /**
+     * \return an Eigen Matrix containing the depth values per pixel, stored in
+     * a 1D array denoting the respective pose
+      */
+    std::vector<Eigen::Map<Observation>> get_range_image()
     {
-        observation_time_ += delta_time;
+        std::vector<std::vector<float>> depth_values_raw =
+            opengl_->get_depth_values();
 
-        cuda_->set_observations(observations.data(), observation_time_);
-        observations_set_ = true;
+        // convert depth values to doubles
+        std::vector<std::vector<double>> depth_values_raw_double;
+        for (int i = 0; i < depth_values_raw.size(); i++)
+        {
+            depth_values_raw_double.push_back(std::vector<double>(
+                depth_values_raw[i].begin(), depth_values_raw[i].end()));
+        }
+
+        // map values into an Eigen Matrix
+        std::vector<Eigen::Map<Observation>> depth_values;
+
+        for (int i = 0; i < depth_values_raw.size(); i++)
+        {
+            Eigen::Map<Observation> tmp(
+                &depth_values_raw_double[i][0], n_rows_, n_cols_);
+            depth_values.push_back(tmp);
+        }
 
+        return depth_values;
     }
 
-    void set_occlusions(const float& visibility_prob = -1)
+    /// The destructor. If profiling is activated, the time measurements are
+    /// processed and printed out here
+    ~KinectImageObservationModelGPU() noexcept
     {
-        float default_visibility_probability = visibility_prob;
-        if (visibility_prob == -1) default_visibility_probability = initial_visibility_prob_;
+#ifdef PROFILING_ACTIVE
 
-        std::vector<float> visibility_probabilities (n_rows_ * n_cols_ * nr_poses_, default_visibility_probability);
-        cuda_->set_visibility_probabilities(visibility_probabilities.data());
-        // TODO set update times if you want to use them
+        count_ -= optimization_runs_;
+        if (count_ > 0)
+        {
+            std::cout << std::endl
+                      << "Time measurements for the different steps of the "
+                         "evaluation process averaged over "
+                      << count_ << " evaluation calls:" << std::endl
+                      << std::endl;
+
+            double total_time_per_evaluation = 0;
+            for (int i = 0; i < NR_SUBTASKS_TO_MEASURE; i++)
+            {
+                total_time_per_evaluation += time_[i];
+            }
+            total_time_per_evaluation /= count_;
+
+            for (int i = 0; i < NR_SUBTASKS_TO_MEASURE; i++)
+            {
+                std::cout << strings_for_subtasks_[i] << ": \t"
+                          << time_[i] / count_ << " s \t "
+                          << std::setprecision(1)
+                          << time_[i] / count_ * 100 / total_time_per_evaluation
+                          << " %" << std::setprecision(9) << std::endl;
+            }
+            std::cout << "Total time per evaluation call: "
+                      << total_time_per_evaluation << std::endl;
 
+            if (optimization_runs_ > 0)
+            {
+                std::cout << std::endl
+                          << "The best number of threads for this setup was "
+                             "estimated to be "
+                          << nr_threads_ << std::endl;
+            }
+        }
+        else
+        {
+            std::cout << "No measurement for the different steps of the "
+                         "evaluation was taken. "
+                      << "Most likely, you need to deactivate the optimization "
+                         "for #threads or"
+                      << " let the program run for a longer period of time."
+                      << std::endl;
+
+            if (optimize_nr_threads_ == true)
+            {
+                std::cout << std::endl
+                          << "The best #threads could not be computed, because "
+                             "the runtime of the program was too short."
+                          << " Please allow the program to run a bit longer to "
+                             "find the best #threads."
+                          << std::endl;
+            }
+        }
+#endif
     }
 
+private:
     const Eigen::Matrix3d camera_matrix_;
     const size_t n_rows_;
     const size_t n_cols_;
-    const float initial_visibility_prob_;
     int nr_max_poses_;
 
-    void set_number_of_poses(int nr_poses){
-
-        nr_poses_ = nr_poses;
-        opengl_->set_number_of_poses(nr_poses_, nr_poses_per_row_, nr_poses_per_column_);
+    void allocate_memory_for_max_poses()
+    {
+        // allocates memory and sets the dimensions of how many poses will be
+        // rendered per row and per column in the texture
+        opengl_->allocate_textures_for_max_poses(nr_max_poses_,
+                                                 nr_poses_per_row_,
+                                                 nr_poses_per_column_,
+                                                 adapt_to_constraints_);
 
-        std::cout << "OpenGL: set number of poses to " << nr_poses_ << " poses in the form ("
-                  << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl;
+        std::cout << "OpenGL: allocated " << nr_max_poses_
+                  << " poses in the form (" << nr_poses_per_row_ << ", "
+                  << nr_poses_per_column_ << ")" << std::endl;
 
-        int tmp_nr_poses = nr_poses_;
+        int tmp_max_nr_poses = nr_max_poses_;
         int tmp_nr_poses_per_row = nr_poses_per_row_;
         int tmp_nr_poses_per_column = nr_poses_per_column_;
 
-        cuda_->set_number_of_poses(nr_poses_, nr_poses_per_row_, nr_poses_per_column_);
+        cuda_->allocate_memory_for_max_poses(nr_max_poses_,
+                                             nr_poses_per_row_,
+                                             nr_poses_per_column_,
+                                             adapt_to_constraints_);
 
-        std::cout << "CUDA: set number of poses to " << nr_poses_ << " poses in the form ("
-                  << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl;
+        std::cout << "CUDA: allocated " << nr_max_poses_
+                  << " poses in the form (" << nr_poses_per_row_ << ", "
+                  << nr_poses_per_column_ << ")" << std::endl;
 
-        // if number of poses gets limited by cuda, we have to tell OpenGL about it
-        if (tmp_nr_poses != nr_poses_ ||
+        // if number of poses gets limited by cuda, we have to reallocate the
+        // textures in OpenGL again
+        if (tmp_max_nr_poses != nr_max_poses_ ||
+            tmp_nr_poses_per_row != nr_poses_per_row_ ||
+            tmp_nr_poses_per_column != nr_poses_per_column_)
+        {
+            opengl_->allocate_textures_for_max_poses(nr_max_poses_,
+                                                     nr_poses_per_row_,
+                                                     nr_poses_per_column_,
+                                                     adapt_to_constraints_);
+
+            std::cout << "OpenGL adapts to CUDA restrictions: allocated "
+                      << nr_max_poses_ << " poses in the form ("
+                      << nr_poses_per_row_ << ", " << nr_poses_per_column_
+                      << ")" << std::endl;
+        }
+    }
+
+    void set_resolution()
+    {
+        int tmp_max_nr_poses = nr_max_poses_;
+        int tmp_nr_poses_per_row = nr_poses_per_row_;
+        int tmp_nr_poses_per_column = nr_poses_per_column_;
+
+        opengl_->set_resolution(n_rows_,
+                                n_cols_,
+                                nr_max_poses_,
+                                nr_poses_per_row_,
+                                nr_poses_per_column_,
+                                adapt_to_constraints_);
+
+        if (tmp_max_nr_poses != nr_max_poses_ ||
             tmp_nr_poses_per_row != nr_poses_per_row_ ||
-            tmp_nr_poses_per_column != nr_poses_per_column_) {
-            opengl_->set_number_of_poses(nr_poses_, nr_poses_per_row_, nr_poses_per_column_);
+            tmp_nr_poses_per_column != nr_poses_per_column_)
+        {
+            std::cout << "OpenGL: setting resolution changes allocation to "
+                      << nr_max_poses_ << " poses in the form ("
+                      << nr_poses_per_row_ << ", " << nr_poses_per_column_
+                      << ")" << std::endl;
+        }
 
-            std::cout << "OpenGL adapts to CUDA restrictions: set number of poses to " << nr_poses_ << " poses in the form ("
-                      << nr_poses_per_row_ << ", " << nr_poses_per_column_ << ")" << std::endl;
+        tmp_max_nr_poses = nr_max_poses_;
+        tmp_nr_poses_per_row = nr_poses_per_row_;
+        tmp_nr_poses_per_column = nr_poses_per_column_;
+
+        cuda_->set_resolution(n_rows_,
+                              n_cols_,
+                              nr_max_poses_,
+                              nr_poses_per_row_,
+                              nr_poses_per_column_,
+                              adapt_to_constraints_);
+
+        if (tmp_max_nr_poses != nr_max_poses_ ||
+            tmp_nr_poses_per_row != nr_poses_per_row_ ||
+            tmp_nr_poses_per_column != nr_poses_per_column_)
+        {
+            std::cout << "CUDA: setting resolution changes allocation to "
+                      << nr_max_poses_ << " poses in the form ("
+                      << nr_poses_per_row_ << ", " << nr_poses_per_column_
+                      << ")" << std::endl;
         }
 
+        tmp_max_nr_poses = nr_max_poses_;
+        tmp_nr_poses_per_row = nr_poses_per_row_;
+        tmp_nr_poses_per_column = nr_poses_per_column_;
+
+        // if number of poses gets limited by cuda, we have to reallocate the
+        // textures in OpenGL again
+        if (tmp_max_nr_poses != nr_max_poses_ ||
+            tmp_nr_poses_per_row != nr_poses_per_row_ ||
+            tmp_nr_poses_per_column != nr_poses_per_column_)
+        {
+            opengl_->set_resolution(n_rows_,
+                                    n_cols_,
+                                    nr_max_poses_,
+                                    nr_poses_per_row_,
+                                    nr_poses_per_column_,
+                                    adapt_to_constraints_);
+
+            std::cout << "OpenGL adapts to CUDA restrictions: setting "
+                         "resolution changes allocation to "
+                      << nr_max_poses_ << " poses in the form ("
+                      << nr_poses_per_row_ << ", " << nr_poses_per_column_
+                      << ")" << std::endl;
+        }
     }
 
-    void check_cuda_error(const char *msg)
+    void set_number_of_poses(int nr_poses)
     {
-        cudaError_t err = cudaGetLastError();
-        if( cudaSuccess != err)
+        nr_poses_ = nr_poses;
+        opengl_->set_number_of_poses(nr_poses_,
+                                     nr_poses_per_row_,
+                                     nr_poses_per_column_,
+                                     adapt_to_constraints_);
+
+        int tmp_nr_poses = nr_poses_;
+        int tmp_nr_poses_per_row = nr_poses_per_row_;
+        int tmp_nr_poses_per_column = nr_poses_per_column_;
+
+        cuda_->set_number_of_poses(nr_poses_,
+                                   nr_poses_per_row_,
+                                   nr_poses_per_column_,
+                                   adapt_to_constraints_);
+
+        // if number of poses gets limited by cuda, we have to tell OpenGL about
+        // it
+        if (tmp_nr_poses != nr_poses_ ||
+            tmp_nr_poses_per_row != nr_poses_per_row_ ||
+            tmp_nr_poses_per_column != nr_poses_per_column_)
         {
-            fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
-            exit(EXIT_FAILURE);
+            opengl_->set_number_of_poses(nr_poses_,
+                                         nr_poses_per_row_,
+                                         nr_poses_per_column_,
+                                         adapt_to_constraints_);
+        }
+
+        if (nr_poses_ < nr_poses)
+        {
+            std::cout << "Number of poses was reduced to " << nr_poses_
+                      << " poses in the form (" << nr_poses_per_row_ << ", "
+                      << nr_poses_per_column_ << ")." << std::endl;
         }
     }
 
+    void store_time(int task)
+    {
+        if (!optimize_nr_threads_ && optimization_runs_ != count_)
+        {
+            time_after_ = dbot::hf::get_wall_time();
+            time_[task] += time_after_ - time_before_;
+            time_before_ = time_after_;
+        }
+    }
 
-    void unregister_resource()
+    void check_cuda_error(const char* msg)
     {
-        if (resource_registered_) {
-            cudaGraphicsUnregisterResource(combined_texture_resource_);
-            check_cuda_error("cudaGraphicsUnregisterResource");
-            resource_registered_ = false;
+        cudaError_t err = cudaGetLastError();
+        if (cudaSuccess != err)
+        {
+            fprintf(
+                stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
+            exit(EXIT_FAILURE);
         }
     }
 
     void register_resource()
     {
-        if (!resource_registered_) {
-            combined_texture_opengl_ = opengl_->get_framebuffer_texture();
-            cudaGraphicsGLRegisterImage(&combined_texture_resource_, combined_texture_opengl_, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
+        if (!resource_registered_)
+        {
+            opengl_texture_ = opengl_->get_framebuffer_texture();
+            cudaGraphicsGLRegisterImage(&texture_resource_,
+                                        opengl_texture_,
+                                        GL_TEXTURE_2D,
+                                        cudaGraphicsRegisterFlagsReadOnly);
             check_cuda_error("cudaGraphicsGLRegisterImage)");
             resource_registered_ = true;
         }
     }
 
+    void unregister_resource()
+    {
+        if (resource_registered_)
+        {
+            cudaGraphicsUnregisterResource(texture_resource_);
+            check_cuda_error("cudaGraphicsUnregisterResource");
+            resource_registered_ = false;
+        }
+    }
 
+    // OpenGL handle and input
     boost::shared_ptr<ObjectRasterizer> opengl_;
-    boost::shared_ptr<fil::CudaFilter> cuda_;
-
-    // arrays for timings
-    std::vector<std::vector<double> > cpu_times_;
-    std::vector<std::vector<float> > cuda_times_;
+    std::vector<std::vector<Eigen::Vector3f>> vertices_;
+    std::vector<std::vector<std::vector<int>>> indices_;
+    std::string vertex_shader_path_;
+    std::string fragment_shader_path_;
 
-    // data
-    std::vector<std::vector<Eigen::Vector3f> > vertices_;
-    std::vector<std::vector<std::vector<int> > > indices_;
-    std::vector<float> visibility_probs_;
+    // CUDA handle
+    boost::shared_ptr<fil::CudaFilter> cuda_;
 
-    // constants for likelihood evaluation
-    float p_visible_visible_;
-    float p_visible_occluded_;
+    // constants for likelihood evaluation in CUDA kernel
+    double observation_time_;
+    float initial_occlusion_prob_;
     float tail_weight_;
     float model_sigma_;
     float sigma_factor_;
     float max_depth_;
     float exponential_rate_;
+    std::vector<float> occlusion_probs_;
 
-    std::string vertex_shader_path_;
-    std::string fragment_shader_path_;
-
-
-//    double render_time_;
-
+    // amount of poses and pose distribution in the OpenGL texture
     int nr_poses_;
     int nr_poses_per_row_;
     int nr_poses_per_column_;
 
-    int count_;
-
-    // booleans to ensure correct usage of function calls
-    bool observations_set_, resource_registered_;
-    int nr_calls_set_observation_;
-
     // Shared resource between OpenGL and CUDA
-    GLuint combined_texture_opengl_;
-    cudaGraphicsResource* combined_texture_resource_;
+    GLuint opengl_texture_;
+    cudaGraphicsResource* texture_resource_;
     cudaArray_t texture_array_;
 
-    double observation_time_;
+    // booleans to ensure correct usage of function calls
+    bool observations_set_, resource_registered_;
 
     // used for time observations
-    static const int TIME_MEASUREMENTS_COUNT = 8;
-    static const int COUNT = 500;
-    enum time_measurement {SEND_OBSERVATIONS, RENDER, MAP_RESOURCE, GET_MAPPED_ARRAY, SET_TEXTURE_ARRAY,
-                           MAP_TEXTURE, COMPUTE_LIKELIHOODS, UNMAP_RESOURCE};
+    static const int NR_SUBTASKS_TO_MEASURE = 6;
+    enum subtasks_to_measure
+    {
+        SET_OCCLUSION_INDICES,
+        CONVERTING_STATE_FORMAT,
+        RENDERING,
+        MAPPING,
+        WEIGHTING,
+        UNMAPPING
+    };
+    double time_[NR_SUBTASKS_TO_MEASURE];
+    std::string strings_for_subtasks_[NR_SUBTASKS_TO_MEASURE];
+    double time_before_, time_after_;
+    int count_;
 
+    // variables for the optimization runs
+    int nr_threads_;
+    int max_nr_threads_;
+    int warp_size_;
+    double best_time_;
+    double before_weighting_;
+    double after_weighting_;
+    int best_nr_threads_;
+    double average_time_;
+    bool stop_optimizing_;
+    static const int NR_ROUNDS_PER_SETTING_ = 30;
+    int optimization_runs_;
+
+    // optional flags for optimizing the #threads and for adapting to GPU
+    // constraints
+    bool optimize_nr_threads_;
+    bool adapt_to_constraints_;
 };
-
 }
 #endif
diff --git a/include/dbot/model/observation/gpu/object_rasterizer.hpp b/include/dbot/model/observation/gpu/object_rasterizer.hpp
index f3746fe..de684f8 100644
--- a/include/dbot/model/observation/gpu/object_rasterizer.hpp
+++ b/include/dbot/model/observation/gpu/object_rasterizer.hpp
@@ -1,61 +1,69 @@
+/// @author Claudia Pfreundt <claudilein@gmail.com>
+
 #ifndef POSE_TRACKING_MODELS_OBSERVATION_MODELS_OBJECT_RASTERIZER_HPP
 #define POSE_TRACKING_MODELS_OBSERVATION_MODELS_OBJECT_RASTERIZER_HPP
 
 #include <vector>
 #include <Eigen/Dense>
 #include "GL/glew.h"
-#include <map>
 
 
-/// renders the object using openGL rasterization
+/// renders the objects using openGL rasterization
 /** The objects that should be rendered have to be passed in the constructor and can then be rendered
-  * in different poses with the Render() function. The resulting depth values are stored in a texture
-  * whose values can be obtained with get_depth_values(). Alternatively, get_combined_texture() returns
-  * the ID of the texture for mapping it into the CUDA context.
+  * in different poses with the render() function. The resulting depth values are stored in a texture
+  * whose values can be obtained with get_depth_values(). Alternatively, get_framebuffer_texture() returns
+  * the ID of the texture for mapping it into CUDA.
   */
 class ObjectRasterizer
 {
 public:
-    /// constructor which takes the vertices and indices that describe the objects as input
+    /// constructor which takes the vertices and indices that describe the objects as input. The paths to the
+    /// shader files and the instrinsic camera matrix also have to be passed here.
     /**
-      * @param[in] vertices [object_nr][vertex_nr] = {x, y, z}. This list should contain 3-dimensional
-      * vectors that specify the corners of the triangles the object meshes consists of.
+      * @param[in] vertices [object_nr][vertex_nr] = {x, y, z}. This list should contain, for each object,
+      * a list of 3-dimensional vectors that specify the corners of the triangles of the object mesh.
       * @param[in] indices [object_nr][index_nr][0 - 2] = {index}. This list should contain the indices
       * that index the vertices list and tell us which vertices to connect to a triangle (every group of 3).
       * For each object, the indices should be in the range of [0, nr_vertices - 1].
       * @param[in] vertex_shader_path path to the vertex shader
       * @param[in] fragment_shader_path path to the fragment shader
       * @param[in] camera_matrix matrix of the intrinsic parameters of the camera
+      * @param[in] near_plane everything closer than the near plane will not be rendered. This should
+      * be similar to the minimal distance up to which the sensor can see objects.
+      * @param[in] far_plane everything further away than the far plane will not be rendered. This should
+      * be similar to the maximum distance up to which the sensor can see objects.
+      * @param[in] nr_rows the number of rows in one sensor image (vertical resolution)
+      * @param[in] nr_cols the number of columns in one sensor image (horizontal resolution)
       */
     ObjectRasterizer(const std::vector<std::vector<Eigen::Vector3f> > vertices,
                      const std::vector<std::vector<std::vector<int> > > indices,
                      const std::string vertex_shader_path,
                      const std::string fragment_shader_path,
-                     const Eigen::Matrix3f camera_matrix);
-
-    /// constructor with no arguments, should not be used
-    ObjectRasterizer();
+                     const Eigen::Matrix3f camera_matrix,
+                     const float near_plane = 0.4,
+                     const float far_plane = 4,
+                     const int nr_rows = 60,
+                     const int nr_cols = 80);
 
     /// destructor which deletes the buffers and programs used by openGL
     ~ObjectRasterizer();
 
 
-
     /// render the objects in all given states and return the depth for all relevant pixels of each rendered object
     /** This function renders all poses (of all objects) into one large texture. Reading back the depth values
       * is a relatively slow process, so this function should mainly be used for debugging. If you are using
-      * CUDA to further process the depth values, please use the other Render() function.
+      * CUDA to further process the depth values, please use the other render() function.
       * @param[in] states [pose_nr][object_nr][0 - 6] = {qw, qx, qy, qz, tx, ty, tz}. This should contain the quaternion
       * and the translation for each object per pose.
-      * @param[in,out] intersect_indices [pose_nr][0 - nr_relevant_pixels] = {pixel_nr}. This list should be empty when passed
-      * to the function. Afterwards, it will contain the pixel numbers of all pixels that were rendered to, per pose.
-      * @param[in,out] depth [pose_nr][0 - nr_relevant_pixels] = {depth_value}. This list should be empty when passed to the function.
-      * Afterwards, it will contain the depth value of all pixels that were rendered to, per pose.
+      * @param[out] intersect_indices [pose_nr][0 - nr_relevant_pixels] = {pixel_nr}. This list should be empty when passed
+      * to the function. Afterwards, it will contain the pixel numbers of all pixels that were rendered to, per pose. Pixels
+      * that have a depth value of 0 will be ignored.
+      * @param[out] depth [pose_nr][0 - nr_relevant_pixels] = {depth_value}. This list should be empty when passed to the function.
+      * Afterwards, it will contain the depth value of all pixels that were rendered to, per pose. Pixels
+      * that have a depth value of 0 will be ignored.
       */
-    void render(const std::vector<std::vector<std::vector<float> > > states,
-                            std::vector<std::vector<int> > &intersect_indices,
-                            std::vector<std::vector<float> > &depth);
-
+    void render(const std::vector<std::vector<Eigen::Matrix4f> > states,
+                std::vector<std::vector<float> > depth_values);
 
     /// render the objects in all given states into a texture that can then be accessed by CUDA
     /** This function renders all poses (of all objects) into one large texture, which can then be mapped into the CUDA
@@ -63,77 +71,103 @@ class ObjectRasterizer
       * @param[in] states [pose_nr][object_nr][0 - 6] = {qw, qx, qy, qz, tx, ty, tz}. This should contain the quaternion
       * and the translation for each object per pose.
       */
-    void render(const std::vector<std::vector<std::vector<float> > > states);
+    void render(const std::vector<std::vector<Eigen::Matrix4f> > states);
 
     /// sets the objects that should be rendered.
     /** This function only needs to be called if any objects initially passed in the constructor should be left out when rendering.
       * @param[in] object_numbers [0 - nr_objects] = {object_nr}. This list should contain the indices of all objects that
-      * should be rendered when calling Render().
+      * should be rendered when calling render(). For example, [0,1,4,5] will only render objects 0,1,4 and 5 (whose vertices
+      * were passed in the constructor).
       */
     void set_objects(std::vector<int> object_numbers);
 
     /// set a new resolution
     /** This function reallocates the framebuffer textures. This is expensive, so only do it if necessary.
-      * @param[in] n_rows the height of the image
-      * @param[in] n_cols the width of the image
+     * @param[in] n_rows the height of the image
+     * @param[in] n_cols the width of the image
+     * @param[out] nr_poses the new number of poses (only changed when adapting to constraints)
+     * @param[out] nr_poses_per_row the new number of poses per row
+     * @param[out] nr_poses_per_column the new number of poses per column
+     * @param[in] adapt_to_constraints whether to automatically adapt to GPU constraints or quit the program instead
+     */
+    void set_resolution(const int n_rows, const int n_cols, int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column, const bool adapt_to_constraints = false);
+
+    /// allocates memory on the GPU
+    /** Use this function to allocate memory for the maximum number of poses that you will need throughout the filtering.
+      * @param[in,out] allocated_poses number of poses for which space should be allocated. Might be changed by the function
+      * if there are space restrictions posed by OpenGL.
+      * @param[out] allocated_poses_per_row the number of poses that will be rendered per row of the texture
+      * @param[out] allocated_poses_per_column the number of poses that will be rendered per column of the texture
+      * @param[in] adapt_to_constraints whether to automatically adapt to GPU constraints or quit the program if constraints are not met
       */
-    void set_resolution(const int n_rows, const int n_cols, int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column);
-
     void allocate_textures_for_max_poses(int& allocated_poses,
                                          int& allocated_poses_per_row,
-                                         int& allocated_poses_per_column);
-
-    void set_number_of_poses(const int nr_poses, int& nr_poses_per_row, int& nr_poses_per_column);
-
+                                         int& allocated_poses_per_column,
+                                         const bool adapt_to_constraints = false);
+
+    /// sets the number of poses that should be rendered in the next render call
+    /** Use this function previously to every render call if you need to change the amount of poses.
+      * @param[in/out] nr_poses amount of poses that should be rendered. Cannot exceed the maximum number of poses set with
+      * allocate_textures_for_max_poses(). Might be changed if adapt_to_constraints is activated.
+      * @param[out] nr_poses_per_row the number of poses that will be rendered per row of the texture
+      * @param[out] nr_poses_per_column the number of poses that will be rendered per column of the texture
+      * @param[in] adapt_to_constraints whether to automatically adapt to GPU constraints or quit the program instead
+      */
+    void set_number_of_poses(int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column, const bool adapt_to_constraints = false);
 
+    /// returns the OpenGL framebuffer texture ID, which is needed for CUDA interoperation
+    /** Use this function to retrieve the texture ID and pass it to the cudaGraphicsGLRegisterImage call.
+      * @return The texture ID
+      */
     GLuint get_framebuffer_texture();
-    int get_nr_poses_per_row();
-    void get_depth_values(std::vector<std::vector<int> > &intersect_indices,
-                          std::vector<std::vector<float> > &depth);
 
-private:
-    static constexpr float NEAR_PLANE = 0.4f; // Kinect does not see anything closer than 0.3 meters
-    static constexpr float FAR_PLANE = 4.0f; // Kinect does not see anything further away than 7 meters
-    static const int WINDOW_WIDTH = 80;  // default values if not specified
-    static const int WINDOW_HEIGHT = 60; // default values if not specified
+    /// returns the rendered depth values of all poses
+    /** This function should only be used for debugging. It will be extremely slow.
+      * @return [pose_nr][0 - nr_pixels] = {depth value of that pixel}
+      */
+    std::vector<std::vector<float> > get_depth_values();
 
+private:
     // GPU constraints
     GLint max_texture_size_;
 
-    // values initialized to WINDOW_WIDTH, WINDOW_HEIGHT in constructor. May be changed by user with set_resolution().
+    // values initialized in constructor. May be changed by user with set_resolution().
     int nr_rows_;
     int nr_cols_;
 
-    // number of poses to render
+    // values initialized in constructor. Cannot be changed afterwards.
+    float near_plane_;
+    float far_plane_;
+
+    // actual number of poses to render (current frame)
     int nr_poses_;
     int nr_poses_per_row_;
     int nr_poses_per_column_;
 
-    // number of maximum poses
+    // maximum number of poses that can be rendered in one call
     int nr_max_poses_;
     int nr_max_poses_per_row_;
     int nr_max_poses_per_column_;
 
-    // needed for openGL time observation
+    // needed for OpenGL time measurement
     static const int NR_SUBROUTINES_TO_MEASURE = 4;
     GLuint time_query_[NR_SUBROUTINES_TO_MEASURE];
     enum subroutines_to_measure { ATTACH_TEXTURE, CLEAR_SCREEN, RENDER, DETACH_TEXTURE};
     std::vector<std::string> strings_for_subroutines;
-    std::vector<double> gpu_times_aggregate_;
+    std::vector<double> time_measurement_;
     int nr_calls_;
-    bool initial_run_;
-
+    bool initial_run_;  // the first run should not count
 
     // the paths to the respective shaders
     std::string vertex_shader_path_;
     std::string fragment_shader_path_;
 
-    std::vector<uint> indices_list_;
-    // a list of all vertices of all objects
+    // lists of all vertices and indices of all objects
     std::vector<float> vertices_list_;
-    std::vector<int> vertices_per_object_;
+    std::vector<uint> indices_list_;
     std::vector<int> indices_per_object_;
     std::vector<int> start_position_;
+
     // contains a list of object indices which should be rendered
     std::vector<int> object_numbers_;
 
@@ -149,12 +183,12 @@ class ObjectRasterizer
     // VAO, VBO and element arrays are needed to store the object meshes
     GLuint vertex_array_;   // The vertex array contains the vertex and index buffers
     GLuint vertex_buffer_;    // contains the vertices of the object meshes passed in the constructor
-    GLuint index_buffer_;     //  contains the indices of the object meshes passed in the constructor
+    GLuint index_buffer_;     // contains the indices of the object meshes passed in the constructor
 
-    // PBO for copying results to CPU
+    // PBO for copying results to CPU for debugging
     GLuint result_buffer_;
 
-    // custom framebuffer and its textures for color and depth
+    // custom framebuffer and its textures for depth (for z-testing) and color (which also represents depth in our case)
     GLuint framebuffer_;
     GLuint framebuffer_texture_for_all_poses_;
     GLuint texture_for_z_testing;
@@ -163,13 +197,11 @@ class ObjectRasterizer
 
     // ====================== PRIVATE FUNCTIONS ====================== //
 
-
     void reallocate_buffers();
 
-    // set up model-, view- and projection-matrix
-    Eigen::Matrix4f get_model_matrix(const std::vector<float> state);
+    // set up view- and projection-matrix
     void setup_view_matrix();
-    void setup_projection_matrix(const Eigen::Matrix3f camera_matrix);    
+    void setup_projection_matrix(const Eigen::Matrix3f camera_matrix);
     Eigen::Matrix4f get_projection_matrix(float n, float f, float l, float r, float t, float b);
 
     // functions for time measurement
@@ -179,7 +211,6 @@ class ObjectRasterizer
     // functions for error checking
     void check_GL_errors(const char *label);
     bool check_framebuffer_status();
-
 };
 
 #endif // OBJECT_RASTERIZER_HPP
diff --git a/include/dbot/util/traits.hpp b/include/dbot/util/traits.hpp
index 7088faa..b084670 100644
--- a/include/dbot/util/traits.hpp
+++ b/include/dbot/util/traits.hpp
@@ -10,6 +10,7 @@
  * License License (GNU GPL). A copy of the license can be found in the LICENSE
  * file distributed with this source code.
  */
+
 /**
  * @date 05/25/2014
  * @author Jan Issac (jan.issac@gmail.com)
diff --git a/src/dbot/model/observation/gpu/cuda_filter.cu b/src/dbot/model/observation/gpu/cuda_filter.cu
index 01c2cdd..e55521e 100644
--- a/src/dbot/model/observation/gpu/cuda_filter.cu
+++ b/src/dbot/model/observation/gpu/cuda_filter.cu
@@ -1,8 +1,6 @@
 /** @author Claudia Pfreundt */
 
-#define CHECK_ERRORS
-//#define PROFILING_ACTIVE
-//#define DEBUG_ON
+#define DEBUG
 
 #define VECTOR_DIM 3
 #define MATRIX_DIM 9
@@ -18,7 +16,6 @@
 
 #include <cuda.h>
 #include "cuda_gl_interop.h"
-#include <curand_kernel.h>
 #include <math.h>
 #include <math_constants.h>
 
@@ -30,26 +27,25 @@ namespace fil
 
 // ====================== CUDA CONSTANT VALUES ======================= //
 
-// used in propagate
-// 1000 denotes the maximum number of objects
-__constant__ float3 g_rot_center[1000];
-// sigmas.x == angle_sigma, sigmas.y == trans_sigma
-__constant__ float2 g_sigmas;
 
 // used in propagateOcclusion
-__constant__ float g_p_visible_occluded;
-__constant__ float g_c;
+__constant__ float g_p_occluded_occluded;
+__constant__ float g_one_div_c_minus_one;
 __constant__ float g_log_c;
 
+
 // used in prob
-__constant__ float g_tail_weight;
+__constant__ float g_one_minus_tail_weight;
 __constant__ float g_model_sigma;
 __constant__ float g_sigma_factor;
-__constant__ float g_max_depth;
+__constant__ float g_tail_weight_div_max_depth;
 __constant__ float g_exponential_rate;
+__constant__ float g_one_div_sqrt_of_two;
+__constant__ float g_one_div_sqrt_of_two_pi;
+
 
 // used in compare
-__constant__ float g_p_visible_init;
+__constant__ float g_initial_occlusion_prob;
 
 // texture for OpenGL interop
 texture<float, cudaTextureType2D, cudaReadModeElementType> texture_reference;
@@ -68,476 +64,66 @@ texture<float, cudaTextureType2D, cudaReadModeElementType> texture_reference;
 // ====================== DEVICE kernels - to be called by other kernels ======================= //
 // ============================================================================================= //
 
-// ====================== MATRIX MANIPULATION FUNCTIONS ======================= //
-
-__device__ void multiply_matrices(float *A, float *B, float *C) {
-    float sum = 0;
-    for (int i = 0; i < VECTOR_DIM; i++) {        // iterate through rows
-        for (int j = 0; j < VECTOR_DIM; j++) {    // iterate through cols
-            for (int k = 0; k < VECTOR_DIM; k++) {
-                sum += A[i * VECTOR_DIM + k] * B[k * VECTOR_DIM + j];
-            }
-            C[i * VECTOR_DIM + j] = sum;
-            sum = 0;
-        }
-    }
-}
-
-__device__ float3 multiply_matrix_with_vector(float* M, float3 v) {
-    float result[3];
-    float v_copy[3];
-    v_copy[0] = v.x; v_copy[1] = v.y; v_copy[2] = v.z;
-    float sum = 0;
-
-    for (int i = 0; i < VECTOR_DIM; i++) {
-        for (int j = 0; j < VECTOR_DIM; j++) {
-            sum += M[i * VECTOR_DIM + j] * v_copy[j];
-        }
-        result[i] = sum;
-        sum = 0;
-    }
-
-    return make_float3(result[0], result[1], result[2]);
-}
-
-/* axis is defined as follows: 0 = x, 1 = y, 2 = z */
-__device__ void create_rotation_matrix(const float angle, const int axis, float *R) {
-    float cos_angle = cos(angle);
-    float sin_angle = sin(angle);
-
-    if (axis == 0) {
-        R[0] = 1;   R[1] = 0;           R[2] = 0;
-        R[3] = 0;   R[4] = cos_angle;   R[5] = -sin_angle;
-        R[6] = 0;   R[7] = sin_angle;   R[8] = cos_angle;
-    } else if (axis == 1) {
-        R[0] = cos_angle;   R[1] = 0;   R[2] = sin_angle;
-        R[3] = 0;           R[4] = 1;   R[5] = 0;
-        R[6] = -sin_angle;  R[7] = 0;   R[8] = cos_angle;
-    } else if (axis == 2) {
-        R[0] = cos_angle;   R[1] = -sin_angle;  R[2] = 0;
-        R[3] = sin_angle;   R[4] = cos_angle;   R[5] = 0;
-        R[6] = 0;           R[7] = 0;           R[8] = 1;
-    }
-}
-
-__device__ void transpose_matrix(float *A, float *T) {
-    T[0] = A[0];
-    T[1] = A[3];
-    T[2] = A[6];
-    T[3] = A[1];
-    T[4] = A[4];
-    T[5] = A[7];
-    T[6] = A[2];
-    T[7] = A[5];
-    T[8] = A[8];
-}
-
-// ====================== VECTOR MANIPULATION FUNCTIONS ======================= //
-
-__device__ float4 normalize(const float4 v) {
-    float4 v_n = v;
-    const float n = 1.0f/sqrt(v_n.x*v_n.x+v_n.y*v_n.y+v_n.z*v_n.z+v_n.w*v_n.w);
-    v_n.x *= n;
-    v_n.y *= n;
-    v_n.z *= n;
-    v_n.w *= n;
-
-    return v_n;
-}
-
-__device__ float3 operator+(const float3 &a, const float3 &b) {
-
-  return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
-
-}
-
-__device__ float3 negate(const float3 &a) {
-    return make_float3(-a.x, -a.y, -a.z);
-}
-
-// ======================= QUATERNION CONVERSIONS AND MANIPULATION FUNCTIONS ======================= //
-
-__device__ void quaternion_to_matrix(const float4 q_in, float *Q) {
-    float4 q = normalize(q_in);
-    Q[0] = 1.0f - 2.0f*q.y*q.y - 2.0f*q.z*q.z;  Q[1] = 2.0f*q.x*q.y - 2.0f*q.z*q.w;         Q[2] = 2.0f*q.x*q.z + 2.0f*q.y*q.w;
-    Q[3] = 2.0f*q.x*q.y + 2.0f*q.z*q.w;         Q[4] = 1.0f - 2.0f*q.x*q.x - 2.0f*q.z*q.z;  Q[5] = 2.0f*q.y*q.z - 2.0f*q.x*q.w;
-    Q[6] = 2.0f*q.x*q.z - 2.0f*q.y*q.w;         Q[7] = 2.0f*q.y*q.z + 2.0f*q.x*q.w;         Q[8] = 1.0f - 2.0f*q.x*q.x - 2.0f*q.y*q.y;
-}
-
-__device__ float4 matrix_to_quaternion(float *Q) {
-    float4 q;
-
-    q.w = sqrtf( fmaxf( 0, 1 + Q[0] + Q[4] + Q[8] ) ) / 2;
-    q.x = sqrtf( fmaxf( 0, 1 + Q[0] - Q[4] - Q[8] ) ) / 2;
-    q.y = sqrtf( fmaxf( 0, 1 - Q[0] + Q[4] - Q[8] ) ) / 2;
-    q.z = sqrtf( fmaxf( 0, 1 - Q[0] - Q[4] + Q[8] ) ) / 2;
-    if (( q.x * ( Q[7] - Q[5] ) ) < 0) {
-        q.x = -q.x;
-    }
-    if (( q.y * ( Q[2] - Q[6] ) ) < 0) {
-        q.y = -q.y;
-    }
-    if (( q.z * ( Q[3] - Q[1] ) ) < 0) {
-        q.z = -q.z;
-    }
-
-    return q;
-}
-
-__device__ float4 multiply_quaternions(float4 q1, float4 q2) {
-    float w = (q1.w * q2.w) - (q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z);
-    float x = (q1.w * q2.x) + (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y);
-    float y = (q1.w * q2.y) - (q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x);
-    float z = (q1.w * q2.z) + (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w);
-
-    return make_float4(x, y, z, w);
-}
-
-
 
 // ======================= helper functions for compare (observation model)  ======================= //
 
+
 __device__ float propagate_occlusion(float initial_p_source, float time) {
     if (isnan(time)) {
         return initial_p_source;
     }
-    float pow_c_time = exp(time*g_log_c);
-    return (float) pow_c_time*initial_p_source + g_p_visible_occluded*(pow_c_time-1.)/(g_c-1.);
+    float pow_c_time = __expf(time * g_log_c);
+    return 1 - (pow_c_time * (1 - initial_p_source) + (1. - g_p_occluded_occluded) * (pow_c_time - 1.) * g_one_div_c_minus_one);
 }
 
 
 
-__device__ float prob(float observation, float prediction, bool visible)
+__device__ float prob(float observation, float prediction, bool occluded)
 {
-    // todo: if the prediction is infinite, the prob should not depend on visibility. it does not matter
+    // todo: if the prediction is infinite, the prob should not depend on occlusion. it does not matter
     // for the algorithm right now, but it should be changed
 
-    float sigma = g_model_sigma + g_sigma_factor*observation*observation;
-    if(visible)
+    float sigma = g_model_sigma + g_sigma_factor * observation * observation;
+    float sigma_sq = sigma * sigma;
+
+    if(!occluded)
     {
         if(isinf(prediction)) // if the prediction is infinite we return the limit
-            return g_tail_weight/g_max_depth;
-        else
-            return g_tail_weight/g_max_depth
-                    + (1 - g_tail_weight)*expf(-(powf(prediction-observation,2)/(2*sigma*sigma)))
-                    / (sqrtf(2*M_PI) *sigma);
+            return g_tail_weight_div_max_depth;
+        else {
+            float pred_minus_obs = prediction - observation;
+            return g_tail_weight_div_max_depth
+                    + __fdividef(g_one_minus_tail_weight * __expf(- __fdividef(pred_minus_obs * pred_minus_obs, (2 * sigma_sq)))
+                    * g_one_div_sqrt_of_two_pi, sigma);
+        }
     }
     else
     {
         if(isinf(prediction)) // if the prediction is infinite we return the limit
-            return g_tail_weight/g_max_depth +
-                    (1-g_tail_weight)*g_exponential_rate*
-                    expf(0.5*g_exponential_rate*(-2*observation + g_exponential_rate*sigma*sigma));
+            return g_tail_weight_div_max_depth +
+                    g_one_minus_tail_weight * g_exponential_rate *
+                    __expf(0.5 * g_exponential_rate * (-2 * observation + g_exponential_rate * sigma_sq));
 
         else
-            return g_tail_weight/g_max_depth +
-                    (1-g_tail_weight)*g_exponential_rate*
-                    expf(0.5*g_exponential_rate*(2*prediction-2*observation + g_exponential_rate*sigma*sigma))
-        *(1+erff((prediction-observation+g_exponential_rate*sigma*sigma)/(sqrtf(2)*sigma)))
-        /(2*(expf(prediction*g_exponential_rate)-1));
+            return g_tail_weight_div_max_depth +
+                    g_one_minus_tail_weight * g_exponential_rate *
+                    __expf(0.5 * g_exponential_rate * (2 * (prediction - observation) + g_exponential_rate * sigma_sq))
+                    * __fdividef((1 + erff(__fdividef((prediction - observation + g_exponential_rate * sigma_sq) * g_one_div_sqrt_of_two, sigma))),
+                    (2 * (__expf(prediction * g_exponential_rate) - 1)));
     }
 }
 
 
 
 
-
-
 // ============================================================================================= //
 // ========================= GLOBAL kernels - to be called by CPU code ========================= //
 // ============================================================================================= //
 
 
 
-__global__ void setup_number_generators_kernel(int current_time, curandStateMRG32k3a *mrg_state, int n_poses)
-{
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
-    if (id < n_poses) {
-        /* Each thread gets same seed, a different sequence number, no offset */
-        curand_init(current_time, id, 0, &mrg_state[id]);
-    }
-}
-
-
-__global__ void propagate_kernel(float *states, int n_states, int states_size, float delta_time, curandStateMRG32k3a *mrg_state)
-{
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
-    if (id < n_states) {
-
-        /* Copy sigmas from constant memory into local register */
-        float2 local_sigmas = g_sigmas;
-
-        /* Copy mrg_state from global memory into local register */
-        curandStateMRG32k3a local_mrg_state = mrg_state[id];        
-
-        for (int i = 0; i < states_size / 7; i++) {
-
-            int states_index = id * states_size + i * 7;
-
-            /* Copy rot_center from constant memory into local register */
-            float3 local_rot_center = g_rot_center[i];
-
-            /* TODO coalesce accesses? Does it do it automatically or do I manually have to store them as
-            * float3 and float4 values? */
-            /* quaternion stored as (w,x,y,z), but make_float4 takes (x,y,z,w) */
-            float4 q_init_vector = make_float4(states[states_index + 1], states[states_index + 2], states[states_index + 3], states[states_index]);
-            float3 t_init = make_float3(states[states_index + 4], states[states_index + 5], states[states_index + 6]);
-
-            float angle_x, angle_y, angle_z;
-            float trans_x, trans_y, trans_z;
-
-            angle_x = curand_normal(&local_mrg_state) * delta_time * local_sigmas.x;
-            angle_y = curand_normal(&local_mrg_state) * delta_time * local_sigmas.x;
-            angle_z = curand_normal(&local_mrg_state) * delta_time * local_sigmas.x;
-            trans_x = curand_normal(&local_mrg_state) * delta_time * local_sigmas.y;
-            trans_y = curand_normal(&local_mrg_state) * delta_time * local_sigmas.y;
-            trans_z = curand_normal(&local_mrg_state) * delta_time * local_sigmas.y;
-
-
-            float q_rand_matrix[MATRIX_DIM];
-            float q_init_matrix[MATRIX_DIM];
-
-            float rot_matrix_x[MATRIX_DIM];
-            float rot_matrix_y[MATRIX_DIM];
-            float rot_matrix_z[MATRIX_DIM];
-
-            float tmp_matrix[MATRIX_DIM];
-
-
-            float3 t_rand = make_float3(trans_x, trans_y, trans_z);
-
-            create_rotation_matrix(angle_x, 0, rot_matrix_x);
-            create_rotation_matrix(angle_y, 1, rot_matrix_y);
-            create_rotation_matrix(angle_z, 2, rot_matrix_z);
-
-            multiply_matrices(rot_matrix_y, rot_matrix_z, tmp_matrix);
-            multiply_matrices(rot_matrix_x, tmp_matrix, q_rand_matrix);
-
-            float4 q_rand_vector = matrix_to_quaternion(q_rand_matrix);
-
-            quaternion_to_matrix(q_init_vector, q_init_matrix);
-
-            float3 t = negate(multiply_matrix_with_vector(q_init_matrix, multiply_matrix_with_vector(q_rand_matrix, local_rot_center)))
-                   + multiply_matrix_with_vector(q_init_matrix, local_rot_center)
-                   + t_init
-                   + t_rand;
-
-            float4 q = multiply_quaternions(q_init_vector, q_rand_vector);
-            q = normalize(q);
-
-            /* write state back into global memory */
-            states[states_index] = q.w;
-            states[states_index + 1] = q.x;
-            states[states_index + 2] = q.y;
-            states[states_index + 3] = q.z;
-
-            states[states_index + 4] = t.x;
-            states[states_index + 5] = t.y;
-            states[states_index + 6] = t.z;
-        }
-
-        /* Copy mrg state back to global memory */
-        mrg_state[id] = local_mrg_state;
-    }
-}
-
-
-
-
-
-
-__global__ void propagate_multiple_kernel(float *states, int n_states, int n_objects, int states_size, float delta_time, curandStateMRG32k3a *mrg_state)
-{
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
-    if (id < n_states) {
-
-        /* Copy sigmas from constant memory into local register */
-        float2 local_sigmas = g_sigmas;
-
-        /* Copy mrg_state from global memory into local register */
-        curandStateMRG32k3a local_mrg_state = mrg_state[id];
-
-        for (int i = 0; i < n_objects; i++) {
-
-            int states_index = id * n_objects * states_size + i * states_size;
-
-            /* Copy rot_center from constant memory into local register */
-            float3 local_rot_center = g_rot_center[i];
-
-            /* TODO coalesce accesses? Does it do it automatically or do I manually have to store them as
-            * float3 and float4 values? */
-            /* quaternion stored as (w,x,y,z), but make_float4 takes (x,y,z,w) */
-            float4 q_init_vector = make_float4(states[states_index + 1], states[states_index + 2], states[states_index + 3], states[states_index]);
-            float3 t_init = make_float3(states[states_index + 4], states[states_index + 5], states[states_index + 6]);
-
-            float angle_x, angle_y, angle_z;
-            float trans_x, trans_y, trans_z;
-
-            // WARNING: same random number for all states, because mrg_state is the same..
-            angle_x = curand_normal(&local_mrg_state) * delta_time * local_sigmas.x;
-            angle_y = curand_normal(&local_mrg_state) * delta_time * local_sigmas.x;
-            angle_z = curand_normal(&local_mrg_state) * delta_time * local_sigmas.x;
-            trans_x = curand_normal(&local_mrg_state) * delta_time * local_sigmas.y;
-            trans_y = curand_normal(&local_mrg_state) * delta_time * local_sigmas.y;
-            trans_z = curand_normal(&local_mrg_state) * delta_time * local_sigmas.y;
-
-
-            float q_rand_matrix[MATRIX_DIM];
-            float q_init_matrix[MATRIX_DIM];
-
-            float rot_matrix_x[MATRIX_DIM];
-            float rot_matrix_y[MATRIX_DIM];
-            float rot_matrix_z[MATRIX_DIM];
-
-            float tmp_matrix[MATRIX_DIM];
-
-
-            float3 t_rand = make_float3(trans_x, trans_y, trans_z);
-
-            create_rotation_matrix(angle_x, 0, rot_matrix_x);
-            create_rotation_matrix(angle_y, 1, rot_matrix_y);
-            create_rotation_matrix(angle_z, 2, rot_matrix_z);
-
-            multiply_matrices(rot_matrix_y, rot_matrix_z, tmp_matrix);
-            multiply_matrices(rot_matrix_x, tmp_matrix, q_rand_matrix);
-
-            float4 q_rand_vector = matrix_to_quaternion(q_rand_matrix);
-
-            quaternion_to_matrix(q_init_vector, q_init_matrix);
-
-            float3 t = negate(multiply_matrix_with_vector(q_init_matrix, multiply_matrix_with_vector(q_rand_matrix, local_rot_center)))
-                   + multiply_matrix_with_vector(q_init_matrix, local_rot_center)
-                   + t_init
-                   + t_rand;
-
-            float4 q = multiply_quaternions(q_init_vector, q_rand_vector);
-            q = normalize(q);
-
-            /* write state back into global memory */
-            states[states_index] = q.w;
-            states[states_index + 1] = q.x;
-            states[states_index + 2] = q.y;
-            states[states_index + 3] = q.z;
-
-            states[states_index + 4] = t.x;
-            states[states_index + 5] = t.y;
-            states[states_index + 6] = t.z;
-        }
-
-        /* Copy mrg state back to global memory */
-        mrg_state[id] = local_mrg_state;
-    }
-}
-
-
-
-
-
-
-
-
-
-
-
-
-__global__ void compare_kernel(float *observations, float* visibility_probs, int n_pixels_per_pose,
-                        bool constant_occlusion, float *d_log_likelihoods, float delta_time, int n_poses, int n_rows, int n_cols) {
-    int block_id = blockIdx.x + blockIdx.y * gridDim.x;
-    if (block_id < n_poses) {
-
-        int pixel_nr = threadIdx.x;
-//        int pixel_nr = threadIdx.x * ceilf(n_pixels_per_pose / blockDim.x);
-        int global_index = block_id * n_pixels_per_pose + pixel_nr;
-
-        // OpenGL contructs the texture so that the left lower edge is (0,0), but our observations texture
-        // has its (0,0) in the upper left corner, so we need to reverse the reads from the OpenGL texture.
-        float texture_array_index_x = blockIdx.x * n_cols + pixel_nr % n_cols;
-        float texture_array_index_y = gridDim.y * n_rows - (blockIdx.y * n_rows + pixel_nr / n_cols + 1);
-
-        float depth;
-        float observed_depth;
-        float visibility_prob = g_p_visible_init;
-        float local_sum_of_likelihoods = 0;
-        float p_obsIpred_vis, p_obsIpred_occl, p_obsIinf;
-
-        // TODO: uninitialized?
-        __shared__ float log_likelihoods;
-
-        if (threadIdx.x == 0) {
-            log_likelihoods = 0;
-        }
-
-        __syncthreads();
-
-        while (pixel_nr < n_pixels_per_pose ) { //&& pixel_nr < (threadIdx.x + 1) * ceilf(n_pixels_per_pose / blockDim.x)) {
-
-            depth = tex2D(texture_reference, texture_array_index_x, texture_array_index_y);
-            observed_depth = observations[pixel_nr];
-
-            // TODO either this, or only write the values back for pixels with depth value == 1.
-            // Could save some data transfer time, but will cost more execution time, since all
-            // the threads in one warp will have to wait for the else-branch to finish
-            if (!constant_occlusion) {
-                visibility_prob = propagate_occlusion(visibility_probs[global_index], delta_time);
-                visibility_probs[global_index] = visibility_prob;
-            }
-//            if (!constant_occlusion) {
-//                visibility_prob = propagateOcclusion(visibility_probs[global_index], delta_time);
-//            }
-
-            //TODO slow: 4800 threads have to go through this whole if instruction
-            if (depth != 0 && !isnan(observed_depth)) {
-
-                // prob of observation given prediction, knowing that the object is visible
-                p_obsIpred_vis = prob(observed_depth, depth, true) * visibility_prob;
-                // prob of observation given prediction, knowing that the object is occluded
-                p_obsIpred_occl = prob(observed_depth, depth, false) * (1-visibility_prob);
-                // prob of observation given no intersection
-                p_obsIinf = prob(observed_depth, CUDART_INF_F, false);
-
-                local_sum_of_likelihoods += logf((p_obsIpred_vis + p_obsIpred_occl)/p_obsIinf);
-
-                if(!constant_occlusion) { // we check if we are tracking the visibilities
-                    // we update the visibility (occlusion) with the observations
-                    visibility_probs[global_index] = p_obsIpred_vis/(p_obsIpred_vis + p_obsIpred_occl);
-                }
-//                if (!constant_occlusion) {
-//                    visibility_prob = p_obsIpred_vis/(p_obsIpred_vis + p_obsIpred_occl);
-//                }
-            }
-
-//            if (!constant_occlusion) {
-//                visibility_probs[global_index] = visibility_prob;
-//            }
-
-            pixel_nr += blockDim.x;
-//            pixel_nr += 1;
-            global_index = block_id * n_pixels_per_pose + pixel_nr;
-            texture_array_index_x = blockIdx.x * n_cols + pixel_nr % n_cols;
-            texture_array_index_y = gridDim.y * n_rows - (blockIdx.y * n_rows + pixel_nr / n_cols + 1);
-        }
-
-        // TODO: will execute blockDim.x sequential writes to log_likelihoods
-        // instead could do a manual reduction after syncthreads
-        atomicAdd(&log_likelihoods, local_sum_of_likelihoods);
-
-        __syncthreads();
-
-        if (threadIdx.x == 0) {
-            d_log_likelihoods[block_id] = log_likelihoods;
-        }
-    } else {
-        __syncthreads();
-    }
-
-}
-
-
-
-
-
-__global__ void compare_multiple_kernel(float *observations, float* old_visibility_probs, float* new_visibility_probs, int* occlusion_image_indices, int nr_pixels,
-                                 float *d_log_likelihoods, float delta_time, int n_poses, int n_rows, int n_cols, bool update) {
+__global__ void evaluate_kernel(float *observations, float* old_occlusion_probs, float* new_occlusion_probs, int* occlusion_image_indices, int nr_pixels,
+                                 float *d_log_likelihoods, float delta_time, int n_poses, int n_rows, int n_cols, bool update_occlusions) {
     int block_id = blockIdx.x + blockIdx.y * gridDim.x;
     if (block_id < n_poses) {
 
@@ -547,11 +133,11 @@ __global__ void compare_multiple_kernel(float *observations, float* old_visibili
         // has its (0,0) in the upper left corner, so we need to reverse the reads from the OpenGL texture.
         float texture_array_index_x = blockIdx.x * n_cols + pixel_nr % n_cols;
 
-        float texture_array_index_y = gridDim.y * n_rows - 1 - (blockIdx.y * n_rows + pixel_nr / n_cols);
+        float texture_array_index_y = gridDim.y * n_rows - 1 - (blockIdx.y * n_rows + __fdividef(pixel_nr, n_cols));
 
         float depth;
         float observed_depth;
-        float visibility_prob = g_p_visible_init;
+        float occlusion_prob = g_initial_occlusion_prob;
         float local_sum_of_likelihoods = 0;
         float p_obsIpred_vis, p_obsIpred_occl, p_obsIinf;
 
@@ -565,21 +151,21 @@ __global__ void compare_multiple_kernel(float *observations, float* old_visibili
 
         __syncthreads();
 
-        float* visibility_probs = old_visibility_probs;
+        float* occlusion_probs = old_occlusion_probs;
         int occlusion_pixel_index= occlusion_image_index * nr_pixels + pixel_nr;
 
-        if (update) {
-            // copy / duplicate visibility probabilities from the old particles
-            int index_from_visibility = occlusion_image_indices[block_id] * nr_pixels;
-            int index_to_visibility = block_id * nr_pixels;
+        if (update_occlusions) {
+            // copy occlusion probabilities from the old particles
+            int index_from_occlusion = occlusion_image_indices[block_id] * nr_pixels;
+            int index_to_occlusion = block_id * nr_pixels;
 
             while (pixel_nr < nr_pixels) {
-                new_visibility_probs[index_to_visibility + pixel_nr] = old_visibility_probs[index_from_visibility + pixel_nr];
+                new_occlusion_probs[index_to_occlusion + pixel_nr] = old_occlusion_probs[index_from_occlusion + pixel_nr];
                 pixel_nr += blockDim.x;
             }
 
-            // change visibility prob array to the new one and change the global index
-            visibility_probs = new_visibility_probs;
+            // change occlusion prob array to the new one and change the global index
+            occlusion_probs = new_occlusion_probs;
             // reset pixel_nr for following loop
             pixel_nr = threadIdx.x;
 
@@ -592,41 +178,34 @@ __global__ void compare_multiple_kernel(float *observations, float* old_visibili
             depth = tex2D(texture_reference, texture_array_index_x, texture_array_index_y);
             observed_depth = observations[pixel_nr];
 
-            // TODO either this, or only write the values back for pixels with depth value == 1.
-            // Could save some data transfer time, but will cost more execution time, since all
-            // the threads in one warp will have to wait for the else-branch to finish
-
-            visibility_prob = propagate_occlusion(visibility_probs[occlusion_pixel_index], delta_time);
-            if (update) visibility_probs[occlusion_pixel_index] = visibility_prob;
+            occlusion_prob = propagate_occlusion(occlusion_probs[occlusion_pixel_index], delta_time);
+            if (update_occlusions) occlusion_probs[occlusion_pixel_index] = occlusion_prob;
 
 
             if (depth != 0 && !isnan(observed_depth)) {
 
-
-                // prob of observation given prediction, knowing that the object is visible
-                p_obsIpred_vis = prob(observed_depth, depth, true) * visibility_prob;
+                // prob of observation given prediction, knowing that the object is not occluded
+                p_obsIpred_vis = prob(observed_depth, depth, false) * (1 - occlusion_prob);
                 // prob of observation given prediction, knowing that the object is occluded
-                p_obsIpred_occl = prob(observed_depth, depth, false) * (1-visibility_prob);
+                p_obsIpred_occl = prob(observed_depth, depth, true) * occlusion_prob;
                 // prob of observation given no intersection
-                p_obsIinf = prob(observed_depth, CUDART_INF_F, false);
+                p_obsIinf = prob(observed_depth, CUDART_INF_F, true);
 
-                local_sum_of_likelihoods += logf((p_obsIpred_vis + p_obsIpred_occl)/p_obsIinf);
+                local_sum_of_likelihoods += __logf(__fdividef((p_obsIpred_vis + p_obsIpred_occl), p_obsIinf));
 
 
-                if(update) {
-                    // we update the visibility (occlusion) probability with the observations
-                    visibility_probs[occlusion_pixel_index] = p_obsIpred_vis/(p_obsIpred_vis + p_obsIpred_occl);
+                if(update_occlusions) {
+                    // we update the occlusion probability with the observations
+                    occlusion_probs[occlusion_pixel_index] = 1 - __fdividef(p_obsIpred_vis, (p_obsIpred_vis + p_obsIpred_occl));
                 }
             }
 
             pixel_nr += blockDim.x;
             occlusion_pixel_index += blockDim.x;
             texture_array_index_x = blockIdx.x * n_cols + pixel_nr % n_cols;
-            texture_array_index_y = gridDim.y * n_rows - (blockIdx.y * n_rows + pixel_nr / n_cols + 1);
+            texture_array_index_y = gridDim.y * n_rows - (blockIdx.y * n_rows + (pixel_nr / n_cols) + 1);
         }
 
-        // TODO: will execute blockDim.x sequential writes to log_likelihoods
-        // instead could do a manual reduction after syncthreads
         atomicAdd(&log_likelihoods, local_sum_of_likelihoods);
 
         __syncthreads();
@@ -645,51 +224,6 @@ __global__ void compare_multiple_kernel(float *observations, float* old_visibili
 
 
 
-
-__global__ void resample_kernel(float *visibility_probs,
-                         float *visibility_probs_copy,
-                         float *states,
-                         float *states_copy,
-                         int *resampling_indices,
-                         int nr_pixels,
-                         int nr_features) {
-
-    int pixel_nr = threadIdx.x;
-    int feature_nr = threadIdx.x;
-    int index_from_visibility = resampling_indices[blockIdx.x] * nr_pixels;
-    int index_to_visibility = blockIdx.x * nr_pixels;
-    int index_from_states = resampling_indices[blockIdx.x] * nr_features;
-    int index_to_states = blockIdx.x * nr_features;
-
-
-    while (pixel_nr < nr_pixels) {
-        visibility_probs_copy[index_to_visibility + pixel_nr] = visibility_probs[index_from_visibility + pixel_nr];
-        pixel_nr += blockDim.x;
-    }
-    while (feature_nr < nr_features) {
-        states_copy[index_to_states + feature_nr] = states[index_from_states + feature_nr];
-        feature_nr += blockDim.x;
-    }
-}
-
-
-__global__ void resample_multiple_kernel(float *visibility_probs,
-                                  float *visibility_probs_copy,
-                                  int *resampling_indices,
-                                  int nr_pixels) {
-
-    int pixel_nr = threadIdx.x;
-    int index_from_visibility = resampling_indices[blockIdx.x] * nr_pixels;
-    int index_to_visibility = blockIdx.x * nr_pixels;
-
-    while (pixel_nr < nr_pixels) {
-        visibility_probs_copy[index_to_visibility + pixel_nr] = visibility_probs[index_from_visibility + pixel_nr];
-        pixel_nr += blockDim.x;
-    }
-}
-
-
-
 // ************************************************************************************** //
 // ************************************************************************************** //
 // ========================== CUDA_FILTER MEMBER FUNCTIONS ============================== //
@@ -697,10 +231,11 @@ __global__ void resample_multiple_kernel(float *visibility_probs,
 // ************************************************************************************** //
 
 
-CudaFilter::CudaFilter() :
-    n_cols_(WINDOW_WIDTH),
-    n_rows_(WINDOW_HEIGHT),
-    n_poses_set_(false)
+CudaFilter::CudaFilter(const int nr_rows,
+                       const int nr_cols) :
+
+    nr_rows_(nr_rows),
+    nr_cols_(nr_cols)
 {
 
     cudaDeviceProp  props;
@@ -710,14 +245,18 @@ CudaFilter::CudaFilter() :
     props.major = 2;
     props.minor = 0;
     cudaChooseDevice( &device_number, &props );
-    check_cuda_error("No device with compute capability > 2.0 found");
+    #ifdef DEBUG
+        check_cuda_error("No device with compute capability > 2.0 found");
+    #endif
 
-    /* tell CUDA which device we will be using for graphic interop
-     * requires that the CUDA device be specified by
+    /* tell CUDA which device we will be using for graphic interop.
+     * Requires that the CUDA device be specified by
      * cudaGLSetGLDevice() before any other runtime calls. */
 
     cudaGLSetGLDevice( device_number );
-    check_cuda_error("cudaGLsetGLDevice");
+    #ifdef DEBUG
+        check_cuda_error("cudaGLsetGLDevice");
+    #endif
 
     cudaGetDeviceProperties(&props, device_number);     // we will run the program only on one graphics card, the first one we can find = 0
     warp_size_ = props.warpSize;            // equals 32 for all current graphics cards, but might change in the future
@@ -725,606 +264,308 @@ CudaFilter::CudaFilter() :
 
     cuda_device_properties_ = props;
 
-    cout << "Your device has the following properties: " << endl
-         << "CUDA Version: " << props.major << "." << props.minor << endl
-         << "Number of multiprocessors: " << n_mps_ << endl
-         << "Warp size: " << warp_size_ << endl;
+    #ifdef DEBUG
+        cout << "Your device has the following properties: " << endl
+             << "CUDA Version: " << props.major << "." << props.minor << endl
+             << "Number of multiprocessors: " << n_mps_ << endl
+             << "Warp size: " << warp_size_ << endl;
+    #endif
 
     /* each multiprocessor has various KB of memory (64 KB for the GTX 560 Ti 448) which can be subdivided
      * into L1 cache or shared memory. If you don't need a lot of shared memory set this to prefer L1. */
     cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
 
 
-    d_states_ = NULL;
-    d_states_copy_ = NULL;
-    d_visibility_probs_ = NULL;
-    d_visibility_probs_copy_ = NULL;
+    d_occlusion_probs_ = NULL;
+    d_occlusion_probs_copy_ = NULL;
     d_observations_ = NULL;
     d_log_likelihoods_ = NULL;
-    d_mrg_states_ = NULL;
-    d_resampling_indices_ = NULL;
-    d_prev_sample_indices_ = NULL;
+    d_occlusion_indices_ = NULL;
 
 }
 
-void CudaFilter::init(vector<vector<float> > com_models, float angle_sigma, float trans_sigma,
-                      float p_visible_init, float c, float log_c, float p_visible_occluded,
-                      float tail_weight, float model_sigma, float sigma_factor, float max_depth, float exponential_rate) {
+
+
+void CudaFilter::init(const float initial_occlusion_prob, const float p_occluded_occluded, const float p_occluded_visible,
+                      const float tail_weight, const float model_sigma, const float sigma_factor, const float max_depth, const float exponential_rate) {
 
     occlusion_time_ = 0;
-    last_propagation_time_ = 0;
-    count_ = 0;
-    compare_kernel_time_ = 0;
-    copy_likelihoods_time_ = 0;
-    visibility_prob_default_ = p_visible_init;
+    occlusion_prob_default_ = initial_occlusion_prob;
 
-    float2 local_sigmas = make_float2(angle_sigma, trans_sigma);
+    // precompute constants that are used in high-performance kernels later
+    float c = p_occluded_occluded - p_occluded_visible;
+    float tail_weight_div_max_depth = tail_weight / max_depth;
+    float one_minus_tail_weight = 1.0f - tail_weight;
+    float one_div_c_minus_one = 1.0f / (c - 1.0f);
+    float one_div_sqrt_of_two = 1.0f / sqrt(2);
+    float one_div_sqrt_of_two_pi = 1.0f / sqrt(2 * M_PI);
+    float log_c = log(c);
 
-    allocate(d_observations_, n_cols_ * n_rows_ * sizeof(float), "d_observations");
-//    allocate(d_log_likelihoods_, sizeof(float) * n_poses_, "d_log_likelihoods");
-    // TODO don't allocate here!! only when setting resolution!
-//    allocate(d_prev_sample_indices_, sizeof(int) * n_poses_, "d_prev_sample_indices");
-//    allocate(d_resampling_indices_, sizeof(int) * n_poses_, "d_resampling_indices");
+    allocate(d_observations_, nr_cols_ * nr_rows_ * sizeof(float));
 
+    // initialize log likelihoods with 0
     cudaMemset(d_log_likelihoods_, 0, sizeof(float) * nr_poses_);
-    #ifdef CHECK_ERRORS
+    #ifdef DEBUG
         check_cuda_error("cudaMemset d_log_likelihoods");
     #endif
 
-    cudaMemcpyToSymbol(g_sigmas, &local_sigmas, sizeof(float2), 0, cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpyToSymbol local_sigmas -> sigmas");
+    // copy constants to GPU memory
+    cudaMemcpyToSymbol(g_initial_occlusion_prob, &initial_occlusion_prob, sizeof(float), 0, cudaMemcpyHostToDevice);
+    #ifdef DEBUG
+        check_cuda_error("cudaMemcpyToSymbol initial_occlusion_prob -> g_initial_occlusion_prob");
     #endif
 
-    vector<float3> com_models_raw;
-    for (int i = 0; i < com_models.size(); i++) {
-        com_models_raw.push_back(make_float3(com_models[i][0], com_models[i][1], com_models[i][2]));
-    }
-
-    cudaMemcpyToSymbol(g_rot_center, com_models_raw.data(), com_models_raw.size() * sizeof(float3), 0, cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpyToSymbol com_model -> rot_center");
+    cudaMemcpyToSymbol(g_one_div_c_minus_one, &one_div_c_minus_one, sizeof(float), 0, cudaMemcpyHostToDevice);
+    #ifdef DEBUG
+        check_cuda_error("cudaMemcpyToSymbol one_div_c_minus_one -> g_one_div_c_minus_one");
     #endif
 
-    cudaMemcpyToSymbol(g_p_visible_init, &p_visible_init, sizeof(float), 0, cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpyToSymbol p_visible_init -> g_p_visible_init");
+    cudaMemcpyToSymbol(g_one_div_sqrt_of_two, &one_div_sqrt_of_two, sizeof(float), 0, cudaMemcpyHostToDevice);
+    #ifdef DEBUG
+        check_cuda_error("cudaMemcpyToSymbol one_div_sqrt_of_two -> g_one_div_sqrt_of_two");
     #endif
 
-    cudaMemcpyToSymbol(g_c, &c, sizeof(float), 0, cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpyToSymbol c -> g_c");
+    cudaMemcpyToSymbol(g_one_div_sqrt_of_two_pi, &one_div_sqrt_of_two_pi, sizeof(float), 0, cudaMemcpyHostToDevice);
+    #ifdef DEBUG
+        check_cuda_error("cudaMemcpyToSymbol one_div_sqrt_of_two_pi -> g_one_div_sqrt_of_two_pi");
     #endif
 
     cudaMemcpyToSymbol(g_log_c, &log_c, sizeof(float), 0, cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
+    #ifdef DEBUG
         check_cuda_error("cudaMemcpyToSymbol log_c -> g_log_c");
     #endif
 
-    cudaMemcpyToSymbol(g_p_visible_occluded, &p_visible_occluded, sizeof(float), 0, cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpyToSymbol p_visible_occluded -> g_p_visible_occluded");
+    cudaMemcpyToSymbol(g_p_occluded_occluded, &p_occluded_occluded, sizeof(float), 0, cudaMemcpyHostToDevice);
+    #ifdef DEBUG
+        check_cuda_error("cudaMemcpyToSymbol p_occluded_occluded -> g_p_occluded_occluded");
     #endif
 
-    cudaMemcpyToSymbol(g_tail_weight, &tail_weight, sizeof(float), 0, cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpyToSymbol tail_weight -> g_tail_weight");
+    cudaMemcpyToSymbol(g_one_minus_tail_weight, &one_minus_tail_weight, sizeof(float), 0, cudaMemcpyHostToDevice);
+    #ifdef DEBUG
+        check_cuda_error("cudaMemcpyToSymbol one_minus_tail_weight -> g_one_minus_tail_weight");
     #endif
 
     cudaMemcpyToSymbol(g_model_sigma, &model_sigma, sizeof(float), 0, cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
+    #ifdef DEBUG
         check_cuda_error("cudaMemcpyToSymbol model_sigma -> g_model_sigma");
     #endif
 
     cudaMemcpyToSymbol(g_sigma_factor, &sigma_factor, sizeof(float), 0, cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
+    #ifdef DEBUG
         check_cuda_error("cudaMemcpyToSymbol sigma_factor -> g_sigma_factor");
     #endif
 
-    cudaMemcpyToSymbol(g_max_depth, &max_depth, sizeof(float), 0, cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpyToSymbol max_depth -> g_max_depth");
+    cudaMemcpyToSymbol(g_tail_weight_div_max_depth, &tail_weight_div_max_depth, sizeof(float), 0, cudaMemcpyHostToDevice);
+    #ifdef DEBUG
+        check_cuda_error("cudaMemcpyToSymbol tail_weight_div_max_depth -> g_tail_weight_div_max_depth");
     #endif
 
     cudaMemcpyToSymbol(g_exponential_rate, &exponential_rate, sizeof(float), 0, cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
+    #ifdef DEBUG
         check_cuda_error("cudaMemcpyToSymbol exponential_rate -> g_exponential_rate");
     #endif
-}
-
-
-void CudaFilter::propagate(const float &current_time, vector<vector<float> > &states)
-{
-
-
-    float delta_time = current_time - last_propagation_time_;
-    last_propagation_time_ = current_time;
-
-
-    propagate_kernel <<< n_blocks_, nr_threads_ >>> (d_states_, nr_poses_, n_features_, delta_time, d_mrg_states_);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("propagate kernel call");
-    #endif
-
-
-
-    // TODO necessary? Remove for performance?
-    cudaDeviceSynchronize();
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaDeviceSynchronize propagate");
-    #endif
-
-
-
-    float *states_raw = (float*) malloc(nr_poses_ * n_features_ * sizeof(float));
-    cudaMemcpy(states_raw, d_states_, nr_poses_ * n_features_ * sizeof(float), cudaMemcpyDeviceToHost);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpy d_states -> states");
-    #endif
-
 
-    for (int i = 0; i < nr_poses_; i++) {
-        for (int j = 0; j < n_features_; j++) {
-            states[i][j] = states_raw[i * n_features_ + j];
-        }
-    }
+    constants_initialized_ = true;
 }
 
 
 
 
-void CudaFilter::propagate_multiple(const float &current_time, vector<vector<vector<float> > > &states)
-{
-
-    float delta_time = current_time - last_propagation_time_;
-    last_propagation_time_ = current_time;
-
-    int n_objects = states[0].size();
-
-    float *states_raw = (float*) malloc(nr_poses_ * n_objects * n_features_ * sizeof(float));
-    for (int i = 0; i < nr_poses_; i++) {
-        for (int j = 0; j < n_objects; j++) {
-            for (int k = 0; k < n_features_; k++) {
-                states_raw[(i * n_objects *n_features_) + j * n_features_ + k] = states[i][j][k];
-            }
-        }
-    }
-
-
-    cudaMemcpy(d_states_, states_raw, nr_poses_ * n_objects * n_features_ * sizeof(float), cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpy states -> d_states");
-    #endif
-
-
-    propagate_multiple_kernel <<< n_blocks_, nr_threads_ >>> (d_states_, nr_poses_, n_objects, n_features_, delta_time, d_mrg_states_);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("propagate kernel call");
-    #endif
-
-
-
-    // TODO necessary? Remove for performance?
-    cudaDeviceSynchronize();
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaDeviceSynchronize propagate");
-    #endif
 
+void CudaFilter::weigh_poses(const bool update_occlusions, vector<float> &log_likelihoods) {
+    if (observations_set_ && occlusion_indices_set_ && occlusion_probabilities_set_
+            && memory_allocated_ && number_of_poses_set_ && constants_initialized_) {
 
+        double delta_time = observation_time_ - occlusion_time_;
+        if(update_occlusions) occlusion_time_ = observation_time_;
 
-    cudaMemcpy(states_raw, d_states_, nr_poses_ * n_objects * n_features_ * sizeof(float), cudaMemcpyDeviceToHost);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpy d_states -> states");
-    #endif
+        evaluate_kernel <<< grid_dimension_, nr_threads_ >>> (d_observations_, d_occlusion_probs_, d_occlusion_probs_copy_, d_occlusion_indices_, nr_cols_ * nr_rows_,
+                                               d_log_likelihoods_, delta_time, nr_poses_, nr_rows_, nr_cols_, update_occlusions);
+        #ifdef DEBUG
+            check_cuda_error("compare kernel call");
+        #endif
 
+        cudaDeviceSynchronize();
+        #ifdef DEBUG
+            check_cuda_error("cudaDeviceSynchronize compare_multiple");
+        #endif
 
-    for (int i = 0; i < nr_poses_; i++) {
-        for (int j = 0; j < n_objects; j++) {
-            for (int k = 0; k < n_features_; k++) {
-                states[i][j][k] = states_raw[(i * n_objects *n_features_) + j * n_features_ + k];
-            }
+        // switch to new / copied occlusion probabilities
+        if (update_occlusions) {
+            float *tmp_pointer;
+            tmp_pointer = d_occlusion_probs_;
+            d_occlusion_probs_ = d_occlusion_probs_copy_;
+            d_occlusion_probs_copy_ = tmp_pointer;
         }
-    }
-
-    free(states_raw);
-}
-
-
-
-
-void CudaFilter::compare(float observation_time, bool constant_occlusion, vector<float> &log_likelihoods) {
-
-#ifdef PROFILING_ACTIVE
-    cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-    float milliseconds;
-#endif
-
-    dim3 gridDim = dim3(nr_poses_per_row_, nr_poses_per_column_);
-
-    // update observation time
-    float delta_time = observation_time - occlusion_time_;
-    occlusion_time_ = observation_time;
-
-
-
-#ifdef PROFILING_ACTIVE
-    cudaEventRecord(start);
-#endif
 
-    compare_kernel <<< gridDim, 128 >>> (d_observations_, d_visibility_probs_, n_cols_ * n_rows_,
-            constant_occlusion, d_log_likelihoods_, delta_time, nr_poses_, n_rows_, n_cols_);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("compare kernel call");
-    #endif
 
-    cudaDeviceSynchronize();
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaDeviceSynchronize compare");
-    #endif
+        cudaMemcpy(&log_likelihoods[0], d_log_likelihoods_, nr_poses_ * sizeof(float), cudaMemcpyDeviceToHost);
+        #ifdef DEBUG
+            check_cuda_error("cudaMemcpy d_log_likelihoods -> log_likelihoods");
+        #endif
 
-#ifdef PROFILING_ACTIVE
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);    
-    count_++;
-    compare_kernel_time_ += milliseconds;
-    if (count_ == COUNT) {
-        cout << "compare kernel: " << compare_kernel_time_ * 1e3 / count_ << " us" << endl;
+        cudaDeviceSynchronize();
+        #ifdef DEBUG
+            check_cuda_error("cudaDeviceSynchronize compare");
+        #endif
+    } else {
+        std::cout << "WARNING (CUDA): It seems you forgot to do one of the following: set observation image, set occlusion"
+                  << " indices, set occlusion probabilities, set number of poses, allocate memory or inisitialize constants." << std::endl;
     }
-    cudaEventRecord(start);
-#endif
-
-    cudaMemcpy(&log_likelihoods[0], d_log_likelihoods_, nr_poses_ * sizeof(float), cudaMemcpyDeviceToHost);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpy d_log_likelihoods -> log_likelihoods");
-    #endif
 
-    cudaDeviceSynchronize();
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaDeviceSynchronize compare");
-    #endif
-
-#ifdef PROFILING_ACTIVE
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    copy_likelihoods_time_ += milliseconds;
-    if (count_ == COUNT) {
-        cout << "copy likelihoods: " << copy_likelihoods_time_ * 1e3 / count_ << " us" << endl;
-    }
-#endif
 }
 
 
 
-void CudaFilter::compare_multiple(bool update, vector<float> &log_likelihoods) {
-
-#ifdef PROFILING_ACTIVE
-    cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-    float milliseconds;
-    cudaEventRecord(start);
-#endif
-
-    double delta_time = observation_time_ - occlusion_time_;
-    if(update) occlusion_time_ = observation_time_;
-//    cout << "delta time: " << delta_time << endl;
-
-
-    compare_multiple_kernel <<< grid_dimension_, nr_threads_ >>> (d_observations_, d_visibility_probs_, d_visibility_probs_copy_, d_prev_sample_indices_, n_cols_ * n_rows_,
-                                           d_log_likelihoods_, delta_time, nr_poses_, n_rows_, n_cols_, update);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("compare kernel call");
-    #endif
-
-    cudaDeviceSynchronize();
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaDeviceSynchronize compare_multiple");
-    #endif
-
 
 
-    // switch to new / copied visibility probabilities
-    if (update) {
-        float *tmp_pointer;
-        tmp_pointer = d_visibility_probs_;
-        d_visibility_probs_ = d_visibility_probs_copy_;
-        d_visibility_probs_copy_ = tmp_pointer;
-    }
-
-#ifdef PROFILING_ACTIVE
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    count_++;
-    compare_kernel_time_ += milliseconds;
-    if (count_ == COUNT) {
-        cout << "compare kernel: " << compare_kernel_time_ * 1e3 / count_ << " us" << endl;
-    }
-    cudaEventRecord(start);
-#endif
 
-    cudaMemcpy(&log_likelihoods[0], d_log_likelihoods_, nr_poses_ * sizeof(float), cudaMemcpyDeviceToHost);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpy d_log_likelihoods -> log_likelihoods");
-    #endif
 
-    cudaDeviceSynchronize();
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaDeviceSynchronize compare");
-    #endif
+// ===================================================================================== //
+// =============================== CUDA FILTER SETTERS ================================= //
+// ===================================================================================== //
 
-#ifdef PROFILING_ACTIVE
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    copy_likelihoods_time_ += milliseconds;
-    if (count_ == COUNT) {
-        cout << "copy likelihoods: " << copy_likelihoods_time_ * 1e3 / count_ << " us" << endl;
-    }
-#endif
+void CudaFilter::set_nr_threads(const int nr_threads) {
+    nr_threads_ = min(nr_threads, cuda_device_properties_.maxThreadsDim[0]);
 }
 
 
 
+void CudaFilter::set_observations(const float* observations, const float observation_time) {
 
+    observation_time_ = observation_time;
 
-
-
-void CudaFilter::resample(vector<int> resampling_indices) {
-
-//    cout << "resample <<< " << n_poses_ << ", " << 128 << " >>>" << endl;
-
-    cudaMemcpy(d_resampling_indices_, &resampling_indices[0], sizeof(int) * nr_poses_, cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpy resampling_indices -> d_resampling_indices_");
-    #endif
-
-//        int min = 100;
-//        int max = -1;
-//        for (int i = 0; i < resampling_indices.size(); i++) {
-//            int value = resampling_indices[i];
-//            if (value > max) max = value;
-//            if (value < min) min = value;
-//        }
-//        cout << "resample min: " << min << ", max: " << max << endl;
-
-
-    int nr_pixels = n_rows_ * n_cols_;
-
-    resample_kernel <<< nr_poses_, 128 >>> (d_visibility_probs_, d_visibility_probs_copy_,
-                                    d_states_, d_states_copy_,
-                                    d_resampling_indices_, nr_pixels, n_features_);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("resample kernel call");
+    cudaMemcpy(d_observations_, observations, nr_cols_ * nr_rows_ * sizeof(float), cudaMemcpyHostToDevice);
+    #ifdef DEBUG
+        check_cuda_error("cudaMemcpy observations -> d_observations_");
     #endif
-
     cudaDeviceSynchronize();
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaDeviceSynchronize resample");
+    #ifdef DEBUG
+        check_cuda_error("cudaDeviceSynchronize set_observations");
     #endif
 
-
-    // switch the visibility probs pointers, so that the next Compare() call will access the resampled
-    // visibility probs. Same for the states.
-    float *tmp_pointer;
-    tmp_pointer = d_visibility_probs_;
-    d_visibility_probs_ = d_visibility_probs_copy_;
-    d_visibility_probs_copy_ = tmp_pointer;
-    tmp_pointer = d_states_;
-    d_states_ = d_states_copy_;
-    d_states_copy_ = tmp_pointer;
-
+    observations_set_ = true;
 }
 
 
 
+void CudaFilter::set_occlusion_indices(const int* occlusion_indices) {
 
-void CudaFilter::resample_multiple(vector<int> resampling_indices) {
+    cudaMemcpy(d_occlusion_indices_, occlusion_indices, nr_poses_ * sizeof(int), cudaMemcpyHostToDevice);
 
-    cudaMemcpy(d_resampling_indices_, &resampling_indices[0], sizeof(int) * nr_poses_, cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpy resampling_indices -> d_resampling_indices_");
+    #ifdef DEBUG
+        check_cuda_error("cudaMemcpy occlusion_indices -> d_occlusion_indices");
     #endif
-
-    int nr_pixels = n_rows_ * n_cols_;
-
-    resample_multiple_kernel <<< nr_poses_, 128 >>> (d_visibility_probs_, d_visibility_probs_copy_,
-                                             d_resampling_indices_, nr_pixels);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("resample kernel call");
-    #endif
-
     cudaDeviceSynchronize();
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaDeviceSynchronize resample");
+    #ifdef DEBUG
+        check_cuda_error("cudaDeviceSynchronize set_occlusion_indices");
     #endif
 
-
-    // switch the visibility probs pointers, so that the next Compare() call will access the resampled
-    // visibility probs.
-    float *tmp_pointer;
-    tmp_pointer = d_visibility_probs_;
-    d_visibility_probs_ = d_visibility_probs_copy_;
-    d_visibility_probs_copy_ = tmp_pointer;
+    occlusion_indices_set_ = true;
 }
 
 
+void CudaFilter::set_resolution(const int n_rows, const int n_cols, int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column, bool adapt_to_constraints) {
 
+    nr_rows_ = n_rows;
+    nr_cols_ = n_cols;
 
-
-// ===================================================================================== //
-// =============================== CUDA FILTER SETTERS ================================= //
-// ===================================================================================== //
-
-void CudaFilter::set_states(std::vector<std::vector<float> > &states, int seed)
-{
-    if (n_poses_set_) {
-        // copy states into linear array
-        /* TODO maybe padding can speed up the memory accesses later from the kernel, since
-         * right now, each MP needs 7 values out of d_states_. 8 would be a much better number. */
-        n_features_ = states[0].size();
-
-        int states_size = nr_poses_ * n_features_ * sizeof(float);
-        float *states_raw = (float*) malloc(states_size);
-
-        for (size_t i = 0; i < nr_poses_; i++) {
-            for (size_t j = 0; j < n_features_; j++) {
-                states_raw[i * n_features_ + j] = states[i][j];
-            }
-        }
-
-        allocate(d_states_, states_size, "d_states");
-        allocate(d_states_copy_, states_size, "d_states_copy");     // placeholder for resampling purposes
-
-        cudaMemcpy(d_states_, states_raw, states_size, cudaMemcpyHostToDevice);
-        #ifdef CHECK_ERRORS
-            check_cuda_error("cudaMemcpy states_raw -> d_states_");
-        #endif
-
-        free(states_raw);
-
-        // setup random number generators for each thread to be used in the propagate kernel
-        allocate(d_mrg_states_, nr_poses_ * sizeof(curandStateMRG32k3a), "d_mrg_states");
-
-        setup_number_generators_kernel <<< n_blocks_, nr_threads_ >>> (seed, d_mrg_states_, nr_poses_);
-
-        cudaDeviceSynchronize();
-    } else {
-        cout << "WARNING: set_states() was not executed, because n_poses_ has not been set previously";
-        exit(-1);
-    }
-}
-
-
-
-
-
-
-void CudaFilter::set_states_multiple(int n_objects, int n_features, int seed)
-{
-    if (n_poses_set_) {
-        n_features_ = n_features;
-
-        int states_size = nr_poses_ * n_objects * n_features_ * sizeof(float);
-        allocate(d_states_, states_size, "d_states");
-
-
-        // setup random number generators for each thread to be used in the propagate kernel
-        allocate(d_mrg_states_, nr_poses_ * sizeof(curandStateMRG32k3a), "d_mrg_states");
-
-        setup_number_generators_kernel <<< n_blocks_, nr_threads_ >>> (seed, d_mrg_states_, nr_poses_);
-
-        cudaDeviceSynchronize();
-    } else {
-        cout << "WARNING: set_states_multiple() was not executed, because n_poses_ has not been set previously";
-        exit(-1);
-    }
+    // reallocate buffers
+    allocate(d_observations_, nr_cols_ * nr_rows_ * sizeof(float));
+    allocate_memory_for_max_poses(nr_poses, nr_poses_per_row, nr_poses_per_column, adapt_to_constraints);
 }
 
 
+void CudaFilter::set_occlusion_probabilities(const float* occlusion_probabilities) {
 
+    cudaMemcpy(d_occlusion_probs_, occlusion_probabilities, nr_rows_ * nr_cols_ * nr_poses_ * sizeof(float), cudaMemcpyHostToDevice);
 
-
-
-
-void CudaFilter::set_observations(const float* observations, const float observation_time) {
-
-//    delta_time_ = observation_time - last_observation_time_;
-    observation_time_ = observation_time;
-//    cout << "delta time: " << delta_time_ << ", last_observation_time: " << occlusion_time_ << endl;
-    set_observations(observations);
-}
-
-void CudaFilter::set_observations(const float* observations) {
-    cudaMemcpy(d_observations_, observations, n_cols_ * n_rows_ * sizeof(float), cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpy observations -> d_observations_");
+    #ifdef DEBUG
+        check_cuda_error("cudaMemcpy occlusion_probabilities -> d_occlusion_probs_");
     #endif
     cudaDeviceSynchronize();
-}
-
-
-void CudaFilter::set_prev_sample_indices(const int* prev_sample_indices) {
-    cudaMemcpy(d_prev_sample_indices_, prev_sample_indices, nr_poses_ * sizeof(int), cudaMemcpyHostToDevice);
-//    cout << "when setting prev_sample_indices: n_poses: " << n_poses_ << ", max poses: " << n_max_poses_ << endl;
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpy prev_sample_indices -> d_prev_sample_indices_");
+    #ifdef DEBUG
+        check_cuda_error("cudaDeviceSynchronize set_occlusion_probabilities");
     #endif
-    cudaDeviceSynchronize();
+
+    occlusion_probabilities_set_ = true;
 }
 
 
-void CudaFilter::set_resolution(const int n_rows, const int n_cols, int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column) {
-    n_rows_ = n_rows;
-    n_cols_ = n_cols;
-
-    // reallocate buffers
-    allocate(d_observations_, n_cols_ * n_rows_ * sizeof(float), "d_observations");    
-    allocate_memory_for_max_poses(nr_poses, nr_poses_per_row, nr_poses_per_column);
-}
+void CudaFilter::map_texture_to_texture_array(const cudaArray_t texture_array) {
 
+    d_texture_array_ = texture_array;
+    cudaBindTextureToArray(texture_reference, d_texture_array_);
 
-void CudaFilter::set_visibility_probabilities(const float* visibility_probabilities) {
-    cudaMemcpy(d_visibility_probs_, visibility_probabilities, n_rows_ * n_cols_ * nr_poses_ * sizeof(float), cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpy visibility_probabilities -> d_visibility_probs_");
+    #ifdef DEBUG
+        check_cuda_error("cudaBindTextureToArray");
     #endif
 }
 
 
-
-
 void CudaFilter::allocate_memory_for_max_poses(int& allocated_poses,
                                                int& allocated_poses_per_row,
-                                               int& allocated_poses_per_column) {
+                                               int& allocated_poses_per_column,
+                                               bool adapt_to_constraints) {
 
     // check limitation by global memory
     size_t size_of_log_likelihoods = sizeof(float) * allocated_poses;
     size_t size_of_resampling_indices = sizeof(int) * allocated_poses;
-    size_t size_of_prev_sample_indices = sizeof(int) * allocated_poses;
-    size_t size_of_visibility_probs = n_rows_ * n_cols_ * allocated_poses * sizeof(float);
-    size_t size_of_opengl_textures = size_of_visibility_probs * 2;
-    size_t size_of_observations = n_cols_ * n_rows_ * sizeof(float);
+    size_t size_of_occlusion_indices = sizeof(int) * allocated_poses;
+    size_t size_of_occlusion_probs = nr_rows_ * nr_cols_ * allocated_poses * sizeof(float);
+    size_t size_of_opengl_textures = size_of_occlusion_probs * 2;
+    size_t size_of_observations = nr_cols_ * nr_rows_ * sizeof(float);
 
-    size_t total_size = size_of_log_likelihoods + size_of_resampling_indices + size_of_prev_sample_indices
-                      + size_of_visibility_probs * 2 + size_of_opengl_textures + size_of_observations;
+    size_t total_size = size_of_log_likelihoods + size_of_resampling_indices + size_of_occlusion_indices
+                      + size_of_occlusion_probs * 2 + size_of_opengl_textures + size_of_observations;
 
     if (total_size > cuda_device_properties_.totalGlobalMem) {
-
-        std::cout << "The space (" << total_size << " B) for the number of maximum poses you requested (" << allocated_poses << ") cannot be allocated. "
-                  << "The limit is global memory size (" << cuda_device_properties_.totalGlobalMem
-                  << " B) retrieved from CUDA properties." << std::endl;
-
-        size_t size_depending_on_nr_poses = (sizeof(float) + sizeof(int) * 2 + n_rows_ * n_cols_ * sizeof(float) * 4);
-        allocated_poses = min(allocated_poses, (int) floor((cuda_device_properties_.totalGlobalMem - size_of_observations) / size_depending_on_nr_poses));
-        allocated_poses_per_column = ceil(allocated_poses / allocated_poses_per_row);
-
-        std::cout << "Instead, space (" << allocated_poses * size_depending_on_nr_poses + size_of_observations << " B) for " << allocated_poses << " poses was allocated. " << std::endl;
+        if (adapt_to_constraints) {
+            std::cout << "WARNING (CUDA): The space (" << total_size << " B) for the number of maximum poses you requested (" << allocated_poses << ") cannot be allocated. "
+                      << "The limit is global memory size (" << cuda_device_properties_.totalGlobalMem
+                      << " B) retrieved from CUDA properties." << std::endl;
+
+            size_t size_depending_on_nr_poses = (sizeof(float) + sizeof(int) * 2 + nr_rows_ * nr_cols_ * sizeof(float) * 4);
+            allocated_poses = min(allocated_poses, (int) floor((cuda_device_properties_.totalGlobalMem - size_of_observations) / size_depending_on_nr_poses));
+            allocated_poses_per_column = ceil(allocated_poses / allocated_poses_per_row);
+
+            std::cout << "Instead, space (" << allocated_poses * size_depending_on_nr_poses + size_of_observations << " B) for " << allocated_poses << " poses was allocated. " << std::endl;
+        } else {
+            std::cout << "ERROR (CUDA): The space (" << total_size << " B) for the number of maximum poses you requested (" << allocated_poses << ") cannot be allocated. "
+                      << "The limit is global memory size (" << cuda_device_properties_.totalGlobalMem
+                      << " B) retrieved from CUDA properties." << std::endl;
+            exit(-1);
+        }
     }
 
 
     // check limitation by texture size
-    if (cuda_device_properties_.maxTexture2D[0] <= allocated_poses_per_row * n_cols_) {
+    if (cuda_device_properties_.maxTexture2D[0] <= allocated_poses_per_row * nr_cols_) {
+        if (adapt_to_constraints) {
 
-        std::cout << "The max poses you requested (" << allocated_poses << ") could not be allocated." << std::endl;
+            std::cout << "WARNING (CUDA): The max poses you requested (" << allocated_poses << ") could not be allocated." << std::endl;
 
-        allocated_poses_per_row = cuda_device_properties_.maxTexture2D[0] / n_cols_;
-        allocated_poses_per_column = ceil(allocated_poses / allocated_poses_per_row);
+            allocated_poses_per_row = cuda_device_properties_.maxTexture2D[0] / nr_cols_;
+            allocated_poses_per_column = ceil(allocated_poses / allocated_poses_per_row);
 
-        if (cuda_device_properties_.maxTexture2D[1] <= allocated_poses_per_column * n_rows_) {
-            allocated_poses_per_column = cuda_device_properties_.maxTexture2D[1] / n_rows_;
-        }
+            if (cuda_device_properties_.maxTexture2D[1] <= allocated_poses_per_column * nr_rows_) {
+                allocated_poses_per_column = cuda_device_properties_.maxTexture2D[1] / nr_rows_;
+            }
+
+            allocated_poses = min(allocated_poses, allocated_poses_per_row * allocated_poses_per_column);
+
+            std::cout << "The limit is max texture size (" << cuda_device_properties_.maxTexture2D[0]
+                      << ", " << cuda_device_properties_.maxTexture2D[1] << ") retrieved from CUDA properties. "
+                      << "Number of poses was reduced to (" << allocated_poses_per_row << ", "
+                      << allocated_poses_per_column << "), a total of " << allocated_poses << std::endl;
 
-        allocated_poses = min(allocated_poses, allocated_poses_per_row * allocated_poses_per_column);
 
-        std::cout << "The limit is max texture size (" << cuda_device_properties_.maxTexture2D[0]
-                  << ", " << cuda_device_properties_.maxTexture2D[1] << ") retrieved from CUDA properties. "
-                  << "Number of poses was reduced to (" << allocated_poses_per_row << ", "
-                  << allocated_poses_per_column << "), a total of " << allocated_poses << std::endl;
+        } else {
+            std::cout << "ERROR (CUDA): The max poses you requested (" << allocated_poses << ") could not be allocated."
+                      << "The limit is max texture size (" << cuda_device_properties_.maxTexture2D[0]
+                      << ", " << cuda_device_properties_.maxTexture2D[1] << ") retrieved from CUDA properties. " << std::endl;
+            exit(-1);
+        }
     }
 
     nr_max_poses_ = allocated_poses;
@@ -1332,23 +573,8 @@ void CudaFilter::allocate_memory_for_max_poses(int& allocated_poses,
     nr_max_poses_per_column_ = allocated_poses_per_column;
 
 
-/*
-    nr_max_poses_ = n_poses;
-    nr_max_poses_per_row_ = n_poses_x;
-
-    // determine n_max_poses_y_
-    nr_max_poses_per_column_ = nr_max_poses_ / nr_max_poses_per_row_;
-    if (n_poses % nr_max_poses_per_row_ != 0) nr_max_poses_per_column_++;
-
-    n_poses_ = nr_max_poses_;
-    n_poses_x_ = nr_max_poses_per_row_;
-    n_poses_y_ = nr_max_poses_per_column_;
-
-*/
-    n_poses_set_ = true;
-
     bool nr_poses_changed = false;
-    set_default_kernel_config(nr_max_poses_, nr_max_poses_per_row_, nr_max_poses_per_column_, nr_poses_changed);
+    set_default_kernel_config(nr_max_poses_, nr_max_poses_per_row_, nr_max_poses_per_column_, nr_poses_changed, adapt_to_constraints);
 
     allocated_poses = nr_max_poses_;
     allocated_poses_per_row = nr_max_poses_per_row_;
@@ -1361,87 +587,78 @@ void CudaFilter::allocate_memory_for_max_poses(int& allocated_poses,
     if (nr_poses_changed) {
         size_of_log_likelihoods = sizeof(float) * nr_max_poses_;
         size_of_resampling_indices = sizeof(int) * nr_max_poses_;
-        size_of_prev_sample_indices = sizeof(int) * nr_max_poses_;
-        size_of_visibility_probs = n_rows_ * n_cols_ * nr_max_poses_ * sizeof(float);
+        size_of_occlusion_indices = sizeof(int) * nr_max_poses_;
+        size_of_occlusion_probs = nr_rows_ * nr_cols_ * nr_max_poses_ * sizeof(float);
     }
 
 
     // reallocate arrays
-    allocate(d_log_likelihoods_, size_of_log_likelihoods, "d_log_likelihoods");
-    allocate(d_resampling_indices_, size_of_resampling_indices, "d_resampling_indices");
-    allocate(d_prev_sample_indices_, size_of_prev_sample_indices, "d_prev_sample_indices");
-    allocate(d_visibility_probs_, size_of_visibility_probs, "d_visibility_probs");
-    allocate(d_visibility_probs_copy_, size_of_visibility_probs, "d_visibility_probs_copy");
-
-    // TODO maybe delete after set_visibility_probabilities is properly in use
-    vector<float> initial_visibility_probs (n_rows_ * n_cols_ * nr_max_poses_, visibility_prob_default_);
-    cudaMemcpy(d_visibility_probs_, &initial_visibility_probs[0], size_of_visibility_probs, cudaMemcpyHostToDevice);
-    #ifdef CHECK_ERRORS
-        check_cuda_error("cudaMemcpy visibility_prob_default_ -> d_visibility_probs_");
-    #endif
+    allocate(d_log_likelihoods_, size_of_log_likelihoods);
+    allocate(d_occlusion_indices_, size_of_occlusion_indices);
+    allocate(d_occlusion_probs_, size_of_occlusion_probs);
+    allocate(d_occlusion_probs_copy_, size_of_occlusion_probs);
 
+    vector<float> initial_occlusion_probs (nr_rows_ * nr_cols_ * nr_max_poses_, occlusion_prob_default_);
+
+    cudaMemcpy(d_occlusion_probs_, &initial_occlusion_probs[0], size_of_occlusion_probs, cudaMemcpyHostToDevice);
+    #ifdef DEBUG
+        check_cuda_error("cudaMemcpy occlusion_prob_default_ -> d_occlusion_probs_");
+    #endif
 
     cudaDeviceSynchronize();
-    #ifdef CHECK_ERRORS
+    #ifdef DEBUG
         check_cuda_error("cudaDeviceSynchronize allocate_memory_for_max_poses");
     #endif
 
-
+    memory_allocated_ = true;
 }
 
 
-void CudaFilter::set_number_of_poses(int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column) {
-    if (nr_poses <= nr_max_poses_) {
-
-        if (nr_max_poses_per_row_ < nr_poses_per_row) {
-            nr_poses_per_row = nr_max_poses_per_row_;
-            nr_poses_per_column = ceil(nr_poses / nr_poses_per_row);
-            if (nr_max_poses_per_column_ < nr_poses_per_column) {
-                nr_poses_per_column = nr_max_poses_per_column_;
-            }
+void CudaFilter::set_number_of_poses(int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column, bool adapt_to_constraints) {
+    if (nr_poses > nr_max_poses_) {
+        if (adapt_to_constraints) {
+            std::cout << "WARNING (CUDA): You tried to evaluate more poses (" << nr_poses << ") than specified by max_poses (" << nr_max_poses_ << ")."
+                      << "The number of poses was automatically reduced to " << nr_max_poses_ << "." << std::endl;
+            nr_poses = nr_max_poses_;
+        } else {
 
-            std::cout << "Number of poses was reduced to (" << nr_poses_per_row << ", "
-                      << nr_poses_per_column << ") because of the maximum number of poses set in the beginning." << std::endl;
+            cout << "ERROR (CUDA): You tried to evaluate more poses (" << nr_poses << ") than specified by max_poses (" << nr_max_poses_ << ")" << endl;
+            exit(-1);
         }
+    }
 
-        nr_poses = min(nr_poses, nr_poses_per_row * nr_poses_per_column);
-
-        nr_poses_ = nr_poses;
-        nr_poses_per_row_ = nr_poses_per_row;
-        nr_poses_per_column_ = nr_poses_per_column;
-
-        /*
+    if (nr_max_poses_per_row_ < nr_poses_per_row) {
+        nr_poses_per_row = nr_max_poses_per_row_;
+        nr_poses_per_column = ceil(nr_poses / nr_poses_per_row);
+        if (nr_max_poses_per_column_ < nr_poses_per_column) {
+            nr_poses_per_column = nr_max_poses_per_column_;
+        }
 
+        std::cout << "WARNING (CUDA): Number of poses was reduced to (" << nr_poses_per_row << ", "
+                  << nr_poses_per_column << ") because of the maximum number of poses set in the beginning." << std::endl;
+    }
 
-        nr_poses_ = nr_poses;
-        nr_poses_per_row_ = nr_poses_per_row;
+    nr_poses = min(nr_poses, nr_poses_per_row * nr_poses_per_column);
 
-        // determine n_max_poses_y_
-        nr_poses_per_column_ = nr_poses_ / nr_poses_per_row_;
-        if (nr_poses % nr_poses_per_row_ != 0) nr_poses_per_column_++;
+    nr_poses_ = nr_poses;
+    nr_poses_per_row_ = nr_poses_per_row;
+    nr_poses_per_column_ = nr_poses_per_column;
 
-        if (nr_poses_per_row_ > nr_max_poses_per_row_ || nr_poses_per_column_ > nr_max_poses_per_column_) {
-            cout << "WARNING: You tried to evaluate more poses in a row or in a column than was allocated in the beginning."
-                 << endl << "Check set_number_of_poses() functions in object_rasterizer.cpp" << endl;
-        }*/
 
-        bool nr_poses_changed = false;
-        set_default_kernel_config(nr_poses_, nr_poses_per_row_, nr_poses_per_column_, nr_poses_changed);
+    bool nr_poses_changed = false;
+    set_default_kernel_config(nr_poses_, nr_poses_per_row_, nr_poses_per_column_, nr_poses_changed, adapt_to_constraints);
 
-        nr_poses = nr_poses_;
-        nr_poses_per_row = nr_poses_per_row_;
-        nr_poses_per_column = nr_poses_per_column_;
+    nr_poses = nr_poses_;
+    nr_poses_per_row = nr_poses_per_row_;
+    nr_poses_per_column = nr_poses_per_column_;
 
-    } else {
-        cout << "ERROR (Cuda): You tried to evaluate more poses (" << nr_poses << ") than specified by max_poses (" << nr_max_poses_ << ")" << endl;
-        exit(-1);
-    }
+    number_of_poses_set_ = true;
 }
 
 
 
 void CudaFilter::set_default_kernel_config(int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column,
-                                           bool& nr_poses_changed) {
+                                           bool& nr_poses_changed, bool adapt_to_constraints) {
     nr_threads_ = min(DEFAULT_NR_THREADS, cuda_device_properties_.maxThreadsDim[0]);
 
     // check for grid dimension limitations
@@ -1456,34 +673,25 @@ void CudaFilter::set_default_kernel_config(int& nr_poses, int& nr_poses_per_row,
 
         nr_poses_changed = true;
 
-        std::cout << "Number of poses was reduced to (" << nr_poses_per_row << ", "
-                  << nr_poses_per_column << ") because of the maximum grid size ("
-                  << cuda_device_properties_.maxGridSize[0] << ", " << cuda_device_properties_.maxGridSize[1]
-                  << ") retrieved from CUDA properties." << std::endl;
+        if (adapt_to_constraints) {
+            std::cout << "WARNING (CUDA): Number of poses was reduced to (" << nr_poses_per_row << ", "
+                      << nr_poses_per_column << ") because of the maximum grid size ("
+                      << cuda_device_properties_.maxGridSize[0] << ", " << cuda_device_properties_.maxGridSize[1]
+                      << ") retrieved from CUDA properties." << std::endl;
+        } else {
+            std::cout << "ERROR (CUDA): Number of poses exceeded maximum grid size specified by GPU: "
+                      << cuda_device_properties_.maxGridSize[0] << ", " << cuda_device_properties_.maxGridSize[1] << "." << std::endl;
+            exit(-1);
+        }
     }
 
 
     grid_dimension_ = dim3(nr_poses_per_row, nr_poses_per_column);
 
 
-    /*
-
-    // determine n_threads_ and n_blocks_
-    // n_threads_ should lie between 32 (warp_size) and 128 and all microprocessors should be busy
-    nr_threads_ = ((nr_poses_ / n_mps_) / warp_size_) * warp_size_;
-    if (nr_threads_ == 0) nr_threads_ = warp_size_;
-    if (nr_threads_ > 4 * warp_size_) nr_threads_ = 4 * warp_size_;
-
-    n_blocks_ = nr_poses_ / nr_threads_;
-    if (n_blocks_ % nr_poses_ != 0) n_blocks_++;
-
-    */
 }
 
 
-void CudaFilter::set_texture_array(cudaArray_t texture_array) {
-    d_texture_array_ = texture_array;
-}
 
 
 // ===================================================================================== //
@@ -1491,41 +699,29 @@ void CudaFilter::set_texture_array(cudaArray_t texture_array) {
 // ===================================================================================== //
 
 
+int CudaFilter::get_max_nr_threads() {
+    return cuda_device_properties_.maxThreadsDim[0];
+}
 
-
-vector<float> CudaFilter::get_visibility_probabilities(int state_id) {
-//    cout << "n_rows_: " << n_rows_ << ", n_cols_: " << n_cols_ << endl;
-    float* visibility_probabilities = (float*) malloc(n_rows_ * n_cols_ * sizeof(float));
-    int offset = state_id * n_rows_ * n_cols_;
-    cudaMemcpy(visibility_probabilities, d_visibility_probs_ + offset, n_rows_ * n_cols_ * sizeof(float), cudaMemcpyDeviceToHost);
-#ifdef CHECK_ERRORS
-    check_cuda_error("cudaMemcpy d_visibility_probabilities -> visibility_probabilities");
-#endif
-    vector<float> visibility_probabilities_vector;
-    for (int i = 0; i < n_rows_ * n_cols_; i++) {
-        visibility_probabilities_vector.push_back(visibility_probabilities[i]);
-    }
-    free(visibility_probabilities);
-    return visibility_probabilities_vector;
+int CudaFilter::get_warp_size() {
+    return cuda_device_properties_.warpSize;
 }
 
+vector<float> CudaFilter::get_occlusion_probabilities(int state_id) {
+    float* occlusion_probabilities = (float*) malloc(nr_rows_ * nr_cols_ * sizeof(float));
+    int offset = state_id * nr_rows_ * nr_cols_;
+    cudaMemcpy(occlusion_probabilities, d_occlusion_probs_ + offset, nr_rows_ * nr_cols_ * sizeof(float), cudaMemcpyDeviceToHost);
 
+    #ifdef DEBUG
+        check_cuda_error("cudaMemcpy d_occlusion_probabilities -> occlusion_probabilities");
+    #endif
 
-vector<vector<float> > CudaFilter::get_visibility_probabilities() {
-    float* visibility_probabilities = (float*) malloc(nr_poses_ * n_rows_ * n_cols_ * sizeof(float));
-    cudaMemcpy(visibility_probabilities, d_visibility_probs_, nr_poses_ * n_rows_ * n_cols_ * sizeof(float), cudaMemcpyDeviceToHost);
-#ifdef CHECK_ERRORS
-    check_cuda_error("cudaMemcpy d_visibility_probabilities -> visibility_probabilities");
-#endif
-    vector<vector<float> > visibility_probabilities_vector;
-    vector<float> tmp_vector (n_rows_ * n_cols_);
-    for (int i = 0; i < nr_poses_; i++) {
-        for (int j = 0; j < n_rows_ * n_cols_; j++) {
-            tmp_vector[j] = visibility_probabilities[i * n_rows_ * n_cols_ + j];
-        }
-        visibility_probabilities_vector.push_back(tmp_vector);
+    vector<float> occlusion_probabilities_vector;
+    for (int i = 0; i < nr_rows_ * nr_cols_; i++) {
+        occlusion_probabilities_vector.push_back(occlusion_probabilities[i]);
     }
-    return visibility_probabilities_vector;
+    free(occlusion_probabilities);
+    return occlusion_probabilities_vector;
 }
 
 
@@ -1537,31 +733,16 @@ vector<vector<float> > CudaFilter::get_visibility_probabilities() {
 
 
 
-template <typename T> void CudaFilter::allocate(T * &pointer, size_t size, string name) {
-#ifdef CHECK_ERRORS
-    size_t free_space_before, free_space_after, total_space;
-    cuMemGetInfo(&free_space_before, &total_space);
-#endif
+template <typename T> void CudaFilter::allocate(T * &pointer, size_t size) {
     cudaFree(pointer);
     cudaMalloc((void **) &pointer, size);
-#ifdef CHECK_ERRORS
-    cuMemGetInfo(&free_space_after, &total_space);
-    std::cout << "memory to allocate for " << name << ": " << size / 1e6 << " MB; free space: " << free_space_before / 1e6
-         << "MB; --> allocated " << (free_space_before - free_space_after) / 1e6 << " MB, free space left: " << free_space_after / 1e6 << " MB" << std::endl;
+#ifdef DEBUG
     check_cuda_error("cudaMalloc failed");
 #endif
 }
 
 
 
-void CudaFilter::map_texture() {
-    cudaBindTextureToArray(texture_reference, d_texture_array_);
-    check_cuda_error("cudaBindTextureToArray");
-}
-
-
-
-
 void CudaFilter::check_cuda_error(const char *msg)
 {
     cudaError_t err = cudaGetLastError();
@@ -1572,29 +753,19 @@ void CudaFilter::check_cuda_error(const char *msg)
     }
 }
 
-CudaFilter::~CudaFilter() {
-    cudaFree(d_states_);
-    cudaFree(d_states_copy_);
-    cudaFree(d_visibility_probs_);
-    cudaFree(d_visibility_probs_copy_);
-    cudaFree(d_observations_);
-    cudaFree(d_log_likelihoods_);
-    cudaFree(d_mrg_states_);
-    cudaFree(d_resampling_indices_);
+// ===================================================================================== //
+// ============================ CUDA FILTER DESTRUCTOR  ================================ //
+// ===================================================================================== //
+
 
-}
 
-void CudaFilter::destroy_context() {
-    cudaFree(d_states_);
-    cudaFree(d_states_copy_);
-    cudaFree(d_visibility_probs_);
-    cudaFree(d_visibility_probs_copy_);
+
+CudaFilter::~CudaFilter() {
+    cudaFree(d_occlusion_probs_);
+    cudaFree(d_occlusion_probs_copy_);
     cudaFree(d_observations_);
     cudaFree(d_log_likelihoods_);
-    cudaFree(d_mrg_states_);
-    cudaFree(d_resampling_indices_);
-    cudaDeviceReset();
+    cudaFree(d_occlusion_indices_);
 }
 
 }
-
diff --git a/src/dbot/model/observation/gpu/object_rasterizer.cpp b/src/dbot/model/observation/gpu/object_rasterizer.cpp
index 3fb7c05..3eca454 100644
--- a/src/dbot/model/observation/gpu/object_rasterizer.cpp
+++ b/src/dbot/model/observation/gpu/object_rasterizer.cpp
@@ -1,45 +1,42 @@
-/** @author Claudia Pfreundt */
+/// @author Claudia Pfreundt <claudilein@gmail.com>
 
 /* Note: Profiling slows down the rendering process. Use only to display the runtimes
  *       of the different subroutines inside the render call. */
 //#define PROFILING_ACTIVE
+//#define DEBUG
 
-
-#include <dbot/model/observation/gpu/shader.hpp>
 #include <dbot/model/observation/gpu/object_rasterizer.hpp>
 
-#include <iomanip>
-#include <cstdio>
-#include <cstdlib>
-#include <iostream>
-
 #include <GL/glew.h>
 #include <GL/glx.h>
-
 #include <Eigen/Geometry>
-
-#include <ros/package.h>
+#include <dbot/model/observation/gpu/shader.hpp>
+#include <dbot/util/helper_functions.hpp>
 
 using namespace std;
 using namespace Eigen;
 
-ObjectRasterizer::ObjectRasterizer()
-{
-}
 
 ObjectRasterizer::ObjectRasterizer(const std::vector<std::vector<Eigen::Vector3f> > vertices,
                                    const std::vector<std::vector<std::vector<int> > > indices,
                                    const std::string vertex_shader_path,
                                    const std::string fragment_shader_path,
-                                   const Eigen::Matrix3f camera_matrix) :
-    nr_rows_(WINDOW_HEIGHT),
-    nr_cols_(WINDOW_WIDTH),
+                                   const Eigen::Matrix3f camera_matrix,
+                                   const float near_plane,
+                                   const float far_plane,
+                                   const int nr_rows,
+                                   const int nr_cols) :
+    near_plane_(near_plane),
+    far_plane_(far_plane),
+    nr_rows_(nr_rows),
+    nr_cols_(nr_cols),
     vertex_shader_path_(vertex_shader_path),
     fragment_shader_path_(fragment_shader_path)
 {
 
     // ========== CREATE WINDOWLESS OPENGL CONTEXT =========== //
 
+
     typedef GLXContext (*glXCreateContextAttribsARBProc)(Display*, GLXFBConfig, GLXContext, Bool, const int*);
     typedef Bool (*glXMakeContextCurrentARBProc)(Display*, GLXDrawable, GLXDrawable, GLXContext);
     static glXCreateContextAttribsARBProc glXCreateContextAttribsARB = 0;
@@ -68,7 +65,6 @@ ObjectRasterizer::ObjectRasterizer(const std::vector<std::vector<Eigen::Vector3f
            exit(1);
     }
 
-
     /* get framebuffer configs, any is usable (might want to add proper attribs) */
     if ( !(fbc = glXChooseFBConfig(dpy, DefaultScreen(dpy), visual_attribs, &fbcount) ) ){
            fprintf(stderr, "Failed to get FBConfig\n");
@@ -84,6 +80,7 @@ ObjectRasterizer::ObjectRasterizer(const std::vector<std::vector<Eigen::Vector3f
            exit(1);
     }
 
+
     /* create a context using glXCreateContextAttribsARB */
     if ( !( ctx = glXCreateContextAttribsARB(dpy, fbc[0], 0, True, context_attribs)) ){
            fprintf(stderr, "Failed to create opengl context\n");
@@ -91,6 +88,7 @@ ObjectRasterizer::ObjectRasterizer(const std::vector<std::vector<Eigen::Vector3f
            exit(1);
     }
 
+
     /* create temporary pbuffer */
     int pbuffer_attribs[] = {
            GLX_PBUFFER_WIDTH, nr_cols_,
@@ -99,13 +97,12 @@ ObjectRasterizer::ObjectRasterizer(const std::vector<std::vector<Eigen::Vector3f
     };
     pbuf = glXCreatePbuffer(dpy, fbc[0], pbuffer_attribs);
 
-
     XFree(fbc);
     XSync(dpy, False);
 
     /* try to make it the current context */
     if ( !glXMakeContextCurrent(dpy, pbuf, pbuf, ctx) ){
-           /* some drivers does not support context without default framebuffer, so fallback on
+           /* some drivers do not support context without default framebuffer, so fallback on
             * using the default window.
             */
            if ( !glXMakeContextCurrent(dpy, DefaultRootWindow(dpy), DefaultRootWindow(dpy), ctx) ){
@@ -131,9 +128,6 @@ ObjectRasterizer::ObjectRasterizer(const std::vector<std::vector<Eigen::Vector3f
 
 
 
-
-
-
     // ======================== SET OPENGL OPTIONS ======================== //
 
 
@@ -163,10 +157,12 @@ ObjectRasterizer::ObjectRasterizer(const std::vector<std::vector<Eigen::Vector3f
 
     // ========== CHANGE VERTICES AND INDICES FORMATS SO THAT OPENGL CAN READ THEM =========== //
 
+
+    std::vector<int> vertices_per_object;
+
     for (size_t i = 0; i < vertices.size(); i++) {        // each i equals one object
         object_numbers_.push_back(i);
-        // vertices_per_object_.push_back(vertices[i].size() * 3); ?? * 3 equals floats per object, how does gldrawelements index?
-        vertices_per_object_.push_back(vertices[i].size());
+        vertices_per_object.push_back(vertices[i].size());
         for (size_t j = 0; j < vertices[i].size(); j++) { // each j equals one vertex in that object
             for (int k = 0; k < vertices[i][j].size(); k++) {  // each k equals one dimension of that vertex
                 vertices_list_.push_back(vertices[i][j][k]);
@@ -184,7 +180,7 @@ ObjectRasterizer::ObjectRasterizer(const std::vector<std::vector<Eigen::Vector3f
                 indices_list_.push_back(indices[i][j][k] + vertex_count);
             }
         }
-        vertex_count += vertices_per_object_[i];
+        vertex_count += vertices_per_object[i];
     }
 
 
@@ -232,7 +228,7 @@ ObjectRasterizer::ObjectRasterizer(const std::vector<std::vector<Eigen::Vector3f
     glGenFramebuffers(1, &framebuffer_);
     glBindFramebuffer(GL_FRAMEBUFFER, framebuffer_);
 
-    // create color texture that will contain the depth values after the render
+    // create color texture that will contain the depth values after the rendering
     glGenTextures(1, &framebuffer_texture_for_all_poses_);
     glBindTexture(GL_TEXTURE_2D, framebuffer_texture_for_all_poses_);
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
@@ -255,27 +251,34 @@ ObjectRasterizer::ObjectRasterizer(const std::vector<std::vector<Eigen::Vector3f
     shader_ID_ = LoadShaders(shader_list);
 
 
-    // Set up handles for uniforms in transformation saving shaders
+    // Set up handles for uniforms
     model_view_matrix_ID_ = glGetUniformLocation(shader_ID_, "MV");
     projection_matrix_ID_ = glGetUniformLocation(shader_ID_, "P");
 
     /* The view matrix is constant throughout this class since we are not changing the camera position.
        If you are looking to pass a different camera matrix for each render call, move this function
-       into that render call and change it to accept the camera position and rotation as parameter
+       into the render call and change it to accept the camera position and rotation as parameter
        and calculate the respective matrix. */
     setup_view_matrix();
 
     check_GL_errors("OpenGL initialization");
 
 
+    // ========== SETUP PROJECTION MATRIX AND SHADER TO USE =========== //
+
+    setup_projection_matrix(camera_matrix);
+    glUseProgram(shader_ID_);
+    check_GL_errors("setup projection matrix");
+
 
     // ========== INITIALIZE & SET DEFAULTS FOR MEASURING EXECUTION TIMES =========== //
 
+#ifdef PROFILING_ACTIVE
     // generate query objects needed for timing OpenGL commands
     glGenQueries(NR_SUBROUTINES_TO_MEASURE, time_query_);
 
     nr_calls_ = 0;
-    gpu_times_aggregate_ = vector<double> (NR_SUBROUTINES_TO_MEASURE, 0);
+    time_measurement_ = vector<double> (NR_SUBROUTINES_TO_MEASURE, 0);
     initial_run_ = true;
 
     strings_for_subroutines.push_back("ATTACH_TEXTURE");
@@ -283,31 +286,21 @@ ObjectRasterizer::ObjectRasterizer(const std::vector<std::vector<Eigen::Vector3f
     strings_for_subroutines.push_back("RENDER");
     strings_for_subroutines.push_back("DETACH_TEXTURE");
 
-    // ========== SETUP PROJECTION MATRIX AND SHADER TO USE =========== //
-
-    setup_projection_matrix(camera_matrix);
-    glUseProgram(shader_ID_);
-    check_GL_errors("setup projection matrix");
+    check_GL_errors("Generating time queries");
+#endif
 }
 
 
 
-
-
-
-
-
-
-void ObjectRasterizer::render(const std::vector<std::vector<std::vector<float> > > states,
-                                          std::vector<std::vector<int> > &intersect_indices,
-                                          std::vector<std::vector<float> > &depth) {
+void ObjectRasterizer::render(const std::vector<std::vector<Eigen::Matrix4f> > states,
+                              std::vector<std::vector<float> > depth_values) {
     render(states);
-    get_depth_values(intersect_indices, depth);
+    depth_values = get_depth_values();
 }
 
 
+void ObjectRasterizer::render(const std::vector<std::vector<Eigen::Matrix4f> > states) {
 
-void ObjectRasterizer::render(const std::vector<std::vector<std::vector<float> > > states) {
 
 #ifdef PROFILING_ACTIVE
     glBeginQuery(GL_TIME_ELAPSED, time_query_[ATTACH_TEXTURE]);
@@ -319,7 +312,9 @@ void ObjectRasterizer::render(const std::vector<std::vector<std::vector<float> >
                            GL_TEXTURE_2D,         // 3. tex target: GL_TEXTURE_2D
                            framebuffer_texture_for_all_poses_,     // 4. tex ID
                            0);
-
+#ifdef DEBUG
+    check_GL_errors("attaching texture to framebuffer");
+#endif
 #ifdef PROFILING_ACTIVE
     glFinish();
     glEndQuery(GL_TIME_ELAPSED);
@@ -328,6 +323,9 @@ void ObjectRasterizer::render(const std::vector<std::vector<std::vector<float> >
 
     glClear(GL_DEPTH_BUFFER_BIT | GL_COLOR_BUFFER_BIT);
 
+#ifdef DEBUG
+    check_GL_errors("clearing framebuffer");
+#endif
 #ifdef PROFILING_ACTIVE
     glFinish();
     glEndQuery(GL_TIME_ELAPSED);
@@ -342,14 +340,19 @@ void ObjectRasterizer::render(const std::vector<std::vector<std::vector<float> >
         for (int j = 0; j < nr_poses_per_row_; j++) {
 
             glViewport(j * nr_cols_, (nr_poses_per_column_ - 1 - i) * nr_rows_, nr_cols_, nr_rows_);
-
+            #ifdef DEBUG
+                check_GL_errors("setting the viewport");
+            #endif
             for (size_t k = 0; k < object_numbers_.size(); k++) {
                 int index = object_numbers_[k];
 
-                model_view_matrix = view_matrix_ * get_model_matrix(states[nr_poses_per_row_ * i + j][index]);
+                model_view_matrix = view_matrix_ * states[nr_poses_per_row_ * i + j][index];
                 glUniformMatrix4fv(model_view_matrix_ID_, 1, GL_FALSE, model_view_matrix.data());
 
                 glDrawElements(GL_TRIANGLES, indices_per_object_[index], GL_UNSIGNED_INT, (void*) (start_position_[index] * sizeof(uint)));
+                #ifdef DEBUG
+                    check_GL_errors("render call");
+                #endif
             }
         }
     }
@@ -358,19 +361,24 @@ void ObjectRasterizer::render(const std::vector<std::vector<std::vector<float> >
     for (int j = 0; j < nr_poses_ - (nr_poses_per_row_ * (nr_poses_per_column_ - 1)); j++) {
 
         glViewport(j * nr_cols_, 0, nr_cols_, nr_rows_);
+        #ifdef DEBUG
+            check_GL_errors("setting the viewport");
+        #endif
 
         for (size_t k = 0; k < object_numbers_.size(); k++) {
             int index = object_numbers_[k];
 
-            model_view_matrix = view_matrix_ * get_model_matrix(states[nr_poses_per_row_ * (nr_poses_per_column_ - 1) + j][index]);
+            model_view_matrix = view_matrix_ * states[nr_poses_per_row_ * (nr_poses_per_column_ - 1) + j][index];
             glUniformMatrix4fv(model_view_matrix_ID_, 1, GL_FALSE, model_view_matrix.data());
 
             glDrawElements(GL_TRIANGLES, indices_per_object_[index], GL_UNSIGNED_INT, (void*) (start_position_[index] * sizeof(uint)));
+            #ifdef DEBUG
+                check_GL_errors("render call");
+            #endif
         }
     }
 
 
-
 #ifdef PROFILING_ACTIVE
     glFinish();
     glEndQuery(GL_TIME_ELAPSED);
@@ -383,28 +391,38 @@ void ObjectRasterizer::render(const std::vector<std::vector<std::vector<float> >
                            0,     // 4. tex ID
                            0);
 
+#ifdef DEBUG
+    check_GL_errors("detaching texture from framebuffer");
+#endif
+
 #ifdef PROFILING_ACTIVE
     glFinish();
     glEndQuery(GL_TIME_ELAPSED);
     store_time_measurements();
 #endif
 
-    /* TODO should be unnecessary when texture is previously detached from framebuffer..
-     * would like to find evidence that this detaching really introduces a synchronization though*/
-    glFinish();
-
 
 }
 
 
 void ObjectRasterizer::set_objects(vector<int> object_numbers) {
-    // TODO does it copy the vector or set a reference?
     object_numbers_ = object_numbers;
 }
 
+
+void ObjectRasterizer::set_resolution(const int n_rows, const int n_cols,
+                                      int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column, const bool adapt_to_constraints) {
+        nr_rows_ = n_rows;
+        nr_cols_ = n_cols;
+
+        // reallocate textures
+        allocate_textures_for_max_poses(nr_poses, nr_poses_per_row, nr_poses_per_column, adapt_to_constraints);
+}
+
 void ObjectRasterizer::allocate_textures_for_max_poses(int& allocated_poses,
                                                        int& allocated_poses_per_row,
-                                                       int& allocated_poses_per_column) {
+                                                       int& allocated_poses_per_column,
+                                                       const bool adapt_to_constraints) {
     int max_poses_per_row = floor(max_texture_size_ / nr_cols_);
     int max_poses_per_column = floor(max_texture_size_ / nr_rows_);
 
@@ -412,14 +430,20 @@ void ObjectRasterizer::allocate_textures_for_max_poses(int& allocated_poses,
     allocated_poses_per_column = min(max_poses_per_column, (int) ceil(allocated_poses / (float) allocated_poses_per_row));
 
     if (allocated_poses > allocated_poses_per_row * allocated_poses_per_column) {
-        std::cout << "The space for the number of maximum poses you requested (" << allocated_poses << ") cannot be allocated. "
-                  << "The limit is OpenGL texture size: " << max_texture_size_ << ". Current resolution is (" << nr_cols_ << ", "
-                  << nr_rows_ << ") , which means a maximum of (" << max_poses_per_row << ", " << max_poses_per_column << ") poses. "
-                  << "As a result, space for " << allocated_poses_per_row * allocated_poses_per_column << " poses will be allocated "
-                  << "in the form of (" << allocated_poses_per_row << ", " << allocated_poses_per_column << ")." << std::endl;
+        if (adapt_to_constraints) {
+            std::cout << "WARNING (OPENGL): The space for the number of maximum poses you requested (" << allocated_poses << ") cannot be allocated. "
+                      << "The limit is OpenGL texture size: " << max_texture_size_ << ". Current resolution is (" << nr_cols_ << ", "
+                      << nr_rows_ << ") , which means a maximum of (" << max_poses_per_row << ", " << max_poses_per_column << ") poses. "
+                      << "As a result, space for " << allocated_poses_per_row * allocated_poses_per_column << " poses will be allocated "
+                      << "in the form of (" << allocated_poses_per_row << ", " << allocated_poses_per_column << ")." << std::endl;
+        } else {
+            std::cout << "ERROR (OPENGL): The number of poses you requested cannot be rendered. The limit is the maximum OpenGL texture size: "
+                      << max_texture_size_ << " x " << max_texture_size_ << ". You requested a resolution of " << nr_cols_  << " x " << nr_rows_
+                      << " and " << allocated_poses << " poses." << std::endl;
+            exit(-1);
+        }
     }
 
-
     allocated_poses = allocated_poses_per_row * allocated_poses_per_column;
 
     nr_max_poses_ = allocated_poses;
@@ -430,87 +454,37 @@ void ObjectRasterizer::allocate_textures_for_max_poses(int& allocated_poses,
     nr_poses_per_column_ = allocated_poses_per_column;
 
     reallocate_buffers();
-
-
-
-    /*
-//    if (nr_max_poses_ != nr_max_poses) {
-        nr_max_poses_ = nr_max_poses;
-        nr_poses_ = nr_max_poses;
-
-//        // TODO max_dimension[0], [1], at the moment they are identical
-        float sqrt_poses = sqrt(nr_poses_);
-        // TODO this can be done smarter. I want to only increment sqrt_poses, if it is not an int, i.e. 10.344 instead of 10)
-        if (sqrt_poses * sqrt_poses != nr_poses_) sqrt_poses = (int) ceil(sqrt_poses);
-        // TODO max_dimension[0], [1], at the moment they are identical
-        nr_max_poses_per_row_ = min((int) sqrt_poses, (max_texture_size_ / nr_cols_));
-        int y_poses = nr_max_poses_ / nr_max_poses_per_row_;
-        if (y_poses * nr_max_poses_per_row_ < nr_max_poses_) y_poses++;
-        nr_max_poses_per_column_ = min(y_poses, (max_texture_size_ / nr_rows_));
-
-        nr_poses_per_row_ = nr_max_poses_per_row_;
-        nr_poses_per_column_ = nr_max_poses_per_column_;
-
-        reallocate_buffers();
-//    }*/
 }
 
-void ObjectRasterizer::set_number_of_poses(const int nr_poses, int& nr_poses_per_row, int& nr_poses_per_column) {
-    if (nr_poses <= nr_max_poses_) {
-        nr_poses_per_row = min(nr_max_poses_per_row_, nr_poses);
-        nr_poses_per_column = min(nr_max_poses_per_column_, (int) ceil(nr_poses / (float) nr_poses_per_row));
-
-        nr_poses_ = nr_poses;
-        nr_poses_per_row_ = nr_poses_per_row;
-        nr_poses_per_column_ = nr_poses_per_column;
-
-
-
-        /*
-        nr_poses_ = nr_poses;
-
+void ObjectRasterizer::set_number_of_poses(int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column, const bool adapt_to_constraints) {
 
-//        // TODO max_dimension[0], [1], at the moment they are identical
-        float sqrt_poses = sqrt(nr_poses_);
-        // TODO this can be done smarter. I want to only increment sqrt_poses, if it is not an int, i.e. 10.344 instead of 10)
-        if (sqrt_poses * sqrt_poses != nr_poses_) sqrt_poses = (int) ceil(sqrt_poses);
-        // TODO max_dimension[0], [1], at the moment they are identical
-        nr_poses_per_row_ = min((int) sqrt_poses, (max_texture_size_ / nr_cols_));
-        int y_poses = nr_poses_ / nr_poses_per_row_;
-        if (y_poses * nr_poses_per_row_ < nr_poses_) y_poses++;
-        nr_poses_per_column_ = min(y_poses, (max_texture_size_ / nr_rows_));
-
-        if (nr_poses_per_row_ > nr_max_poses_per_row_ || nr_poses_per_column_ > nr_max_poses_per_column_) {
-            cout << "WARNING: You tried to evaluate more poses in a row or in a column than was allocated in the beginning."
-                 << endl << "Check set_number_of_poses() functions in object_rasterizer.cpp" << endl;
-        }*/
-
-    } else {
-        cout << "ERROR (OpenGL): You tried to evaluate more poses (" << nr_poses << ") than specified by max_poses (" << nr_max_poses_ << ")" << endl;
-        exit(-1);
+    if (nr_poses > nr_max_poses_) {
+        if (adapt_to_constraints) {
+            std::cout << "WARNING (OPENGL): You tried to evaluate more poses (" << nr_poses << ") than specified by max_poses (" << nr_max_poses_ << ")."
+                      << "The number of poses was automatically reduced to " << nr_max_poses_ << "." << std::endl;
+            nr_poses = nr_max_poses_;
+        } else {
+            cout << "ERROR (OPENGL): You tried to evaluate more poses (" << nr_poses << ") than specified by max_poses (" << nr_max_poses_ << ")" << endl;
+            exit(-1);
+        }
     }
-}
 
-void ObjectRasterizer::set_resolution(const int n_rows, const int n_cols, int& nr_poses, int& nr_poses_per_row, int& nr_poses_per_column) {
-        nr_rows_ = n_rows;
-        nr_cols_ = n_cols;
+    nr_poses_per_row = min(nr_max_poses_per_row_, nr_poses);
+    nr_poses_per_column = min(nr_max_poses_per_column_, (int) ceil(nr_poses / (float) nr_poses_per_row));
 
-        // reallocate textures
-        allocate_textures_for_max_poses(nr_poses, nr_poses_per_row, nr_poses_per_column);
+    nr_poses_ = nr_poses;
+    nr_poses_per_row_ = nr_poses_per_row;
+    nr_poses_per_column_ = nr_poses_per_column;
 }
 
+
 GLuint ObjectRasterizer::get_framebuffer_texture() {
     return framebuffer_texture_for_all_poses_;
 }
 
-int ObjectRasterizer::get_nr_poses_per_row() {
-    return nr_poses_per_row_;
-}
-
 
 
-void ObjectRasterizer::get_depth_values(std::vector<std::vector<int> > &intersect_indices,
-                                        std::vector<std::vector<float> > &depth) {
+vector<vector<float> > ObjectRasterizer::get_depth_values() {
 
     // ===================== ATTACH TEXTURE TO FRAMEBUFFER ================ //
 
@@ -559,22 +533,7 @@ void ObjectRasterizer::get_depth_values(std::vector<std::vector<int> > &intersec
             }
         }
 
-        // filling the respective values per pose into their indices and depth vectors
-        vector<int> tmp_indices;
-        vector<float> tmp_depth;
-
-        for (int state = 0; state < nr_poses_; state++) {
-            for (int i = 0; i < nr_rows_ * nr_cols_; i++) {
-                if (depth_image_per_pose[state][i] != 0) {
-                    tmp_indices.push_back(i);
-                    tmp_depth.push_back(depth_image_per_pose[state][i]);
-                }
-            }
-            intersect_indices.push_back(tmp_indices);
-            depth.push_back(tmp_depth);
-            tmp_indices.resize(0);
-            tmp_depth.resize(0);
-        }
+        return depth_image_per_pose;
 
     } else {
         cout << "WARNING: Could not map Pixel Pack Buffer." << endl;
@@ -589,12 +548,11 @@ void ObjectRasterizer::get_depth_values(std::vector<std::vector<int> > &intersec
                            GL_TEXTURE_2D,         // 3. tex target: GL_TEXTURE_2D
                            0,     // 4. tex ID
                            0);
-}
-
-
-
-
 
+    #ifdef DEBUG
+        check_GL_errors("copying depth values to CPU");
+    #endif
+}
 
 
 
@@ -656,22 +614,6 @@ void ObjectRasterizer::reallocate_buffers() {
 }
 
 
-
-Matrix4f ObjectRasterizer::get_model_matrix(const vector<float> state) {
-    // Model matrix equals the state of the object. Position and Quaternion just have to be expressed as a matrix.
-    // note: state = (q.w, q.x, q.y, q.z, v.x, v.y, v.z)
-    Matrix4f model_matrix = Matrix4f::Identity();
-    Transform<float, 3, Affine, ColMajor> model_matrix_transform;
-    model_matrix_transform = Translation3f(state[4], state[5], state[6]);
-    model_matrix = model_matrix_transform.matrix();
-
-    Quaternionf qRotation = Quaternionf(state[0], state[1], state[2], state[3]);
-    model_matrix.topLeftCorner(3, 3) = qRotation.toRotationMatrix();
-
-    return model_matrix;
-}
-
-
 void ObjectRasterizer::setup_view_matrix() {
     // =========================== VIEW MATRIX =========================== //
 
@@ -691,8 +633,8 @@ void ObjectRasterizer::setup_projection_matrix(const Eigen::Matrix3f camera_matr
     Vector3f boundaries_min = camera_matrix_inverse * Vector3f(-0.5, -0.5, 1);
     Vector3f boundaries_max = camera_matrix_inverse * Vector3f(float(nr_cols_)-0.5, float(nr_rows_)-0.5, 1);
 
-    float near = NEAR_PLANE;
-    float far = FAR_PLANE;
+    float near = near_plane_;
+    float far = far_plane_;
     float left = near * boundaries_min(0);
     float right = near * boundaries_max(0);
     float top = -near * boundaries_min(1);
@@ -733,7 +675,7 @@ void ObjectRasterizer::store_time_measurements() {
         }
         glGetQueryObjectuiv(time_query_[i], GL_QUERY_RESULT, &time_elapsed_ns);
         time_elapsed_s = time_elapsed_ns / (double) 1e9;
-        gpu_times_aggregate_[i] += time_elapsed_s;
+        time_measurement_[i] += time_elapsed_s;
 
     }
 
@@ -741,7 +683,7 @@ void ObjectRasterizer::store_time_measurements() {
     if (initial_run_) {
         initial_run_ = false;
         for (int i = 0; i < NR_SUBROUTINES_TO_MEASURE; i++) {
-            gpu_times_aggregate_[i] = 0;
+            time_measurement_[i] = 0;
         }
 
         nr_calls_ = 0;
@@ -803,7 +745,7 @@ bool ObjectRasterizer::check_framebuffer_status() {
 ObjectRasterizer::~ObjectRasterizer()
 {
 
-#ifdef PROFILING_ON
+#ifdef PROFILING_ACTIVE
 
     if (nr_calls_ != 0) {
         cout << endl << "Time measurements for the different steps of the rendering process averaged over " << nr_calls_ << " render calls:" << endl << endl;
@@ -811,12 +753,12 @@ ObjectRasterizer::~ObjectRasterizer()
         double total_time_per_render = 0;
 
         for (int i = 0; i < NR_SUBROUTINES_TO_MEASURE; i++) {
-            total_time_per_render += gpu_times_aggregate_[i];
+            total_time_per_render += time_measurement_[i];
         }
         total_time_per_render /= nr_calls_;
 
         for (int i = 0; i < NR_SUBROUTINES_TO_MEASURE; i++) {
-            double time_per_subroutine = gpu_times_aggregate_[i] / nr_calls_;
+            double time_per_subroutine = time_measurement_[i] / nr_calls_;
 
                 cout << get_text_for_enum(i) << ":     "
                      << "\t " << time_per_subroutine << " s \t " << setprecision(1)
@@ -831,7 +773,6 @@ ObjectRasterizer::~ObjectRasterizer()
     glDeleteQueries(NR_SUBROUTINES_TO_MEASURE, time_query_);
 #endif
 
-    cout << "clean up OpenGL.." << endl;
 
     glDisableVertexAttribArray(0);
     glDeleteVertexArrays(1, &vertex_array_);
@@ -848,4 +789,3 @@ ObjectRasterizer::~ObjectRasterizer()
 
 }
 
-
diff --git a/src/dbot/model/observation/gpu/shader.cpp b/src/dbot/model/observation/gpu/shader.cpp
index f6b4f09..59f8376 100644
--- a/src/dbot/model/observation/gpu/shader.cpp
+++ b/src/dbot/model/observation/gpu/shader.cpp
@@ -4,7 +4,6 @@
 #include <iostream>
 #include <fstream>
 #include <algorithm>
-//using namespace std;
 
 #include <stdlib.h>
 #include <string.h>