Skip to content

Commit

Permalink
Merging occupancy tuning changes from David Poliakoff.
Browse files Browse the repository at this point in the history
Note: This is a re-commit. The original branch somehow became polluted when
I rebased on develop, so I started over with the 5 changed files.

The old Kokkos fork/branch from:
davidp	git@github.com:DavidPoliakoff/kokkos.git (fetch)
was merged with current Kokkos develop, and tested with ArborX to
confirm that autotuning occupancy for the DBSCAN benchmark worked.
In tests on a system with V100, the original benchmark when iterated
600 times took 119.064 seconds to run. During the tuning process
(using simulated annealing), the runtime was 108.014 seconds.
When using cached results, the runtime was 109.058 seconds. The
converged occupancy value was 70. Here are the cached results
from APEX autotuning:

Input_1:
  name: kokkos.kernel_name
  id: 1
  info.type: string
  info.category: categorical
  info.valueQuantity: unbounded
  info.candidates: unbounded
  num_bins: 0
Input_2:
  name: kokkos.kernel_type
  id: 2
  info.type: string
  info.category: categorical
  info.valueQuantity: set
  info.candidates: [parallel_for,parallel_reduce,parallel_scan,parallel_copy]
Output_3:
  name: ArborX::Experimental::HalfTraversal
  id: 3
  info.type: int64
  info.category: ratio
  info.valueQuantity: range
  info.candidates:
    lower: 5
    upper: 100
    step: 5
    open upper: 0
    open lower: 0
Context_0:
  Name: "[2:parallel_for,1:ArborX::Experimental::HalfTraversal,tree_node:default]"
  Converged: true
  Results:
    NumVars: 1
    id: 3
    value: 70

In manual experiments, the ArborX team determined that the optimal
occupancy for this example was between 40 and 90, which gave a 10%
improvement over the baseline default of 100. See arborx/ArborX#815
for details.

One deviation from the branch that David had written - the occupancy
range is [5-100], with a step size of 5. The original implementation
in Kokkos used [1-100] with a step size of 1.
  • Loading branch information
khuck committed Mar 11, 2024
1 parent 35ad698 commit 8dba118
Show file tree
Hide file tree
Showing 5 changed files with 705 additions and 11 deletions.
12 changes: 8 additions & 4 deletions core/src/Kokkos_Parallel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,10 @@ inline void parallel_for(const std::string& str, const ExecPolicy& policy,
const FunctorType& functor) {
uint64_t kpID = 0;

ExecPolicy inner_policy = policy;
Kokkos::Tools::Impl::begin_parallel_for(inner_policy, functor, str, kpID);
/** Request a tuned policy from the tools subsystem */
const auto& response =
Kokkos::Tools::Impl::begin_parallel_for(policy, functor, str, kpID);
const auto& inner_policy = response.policy;

Kokkos::Impl::shared_allocation_tracking_disable();
Impl::ParallelFor<FunctorType, ExecPolicy> closure(functor, inner_policy);
Expand Down Expand Up @@ -349,8 +351,10 @@ template <class ExecutionPolicy, class FunctorType,
inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy,
const FunctorType& functor) {
uint64_t kpID = 0;
ExecutionPolicy inner_policy = policy;
Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID);
/** Request a tuned policy from the tools subsystem */
const auto& response =
Kokkos::Tools::Impl::begin_parallel_scan(policy, functor, str, kpID);
const auto& inner_policy = response.policy;

Kokkos::Impl::shared_allocation_tracking_disable();
Impl::ParallelScan<FunctorType, ExecutionPolicy> closure(functor,
Expand Down
8 changes: 5 additions & 3 deletions core/src/Kokkos_Parallel_Reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1492,9 +1492,11 @@ struct ParallelReduceAdaptor {
using PassedReducerType = typename return_value_adapter::reducer_type;
uint64_t kpID = 0;

PolicyType inner_policy = policy;
Kokkos::Tools::Impl::begin_parallel_reduce<PassedReducerType>(
inner_policy, functor, label, kpID);
/** Request a tuned policy from the tools subsystem */
auto response = Kokkos::Tools::Impl::begin_parallel_reduce<
typename return_value_adapter::reducer_type>(policy, functor, label,
kpID);
auto& inner_policy = response.policy;

using ReducerSelector =
Kokkos::Impl::if_c<std::is_same<InvalidType, PassedReducerType>::value,
Expand Down
123 changes: 120 additions & 3 deletions core/src/Kokkos_Tuners.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ VariableValue make_variable_value(size_t, int64_t);
// Forward declarations of tuning-interface helpers; their definitions are not
// part of this hunk.
// Overload of make_variable_value taking a double payload.
VariableValue make_variable_value(size_t, double);
// Candidate range described by floating-point bounds and step. openLower /
// openUpper presumably mark the corresponding bound as exclusive — TODO confirm.
SetOrRange make_candidate_range(double lower, double upper, double step,
bool openLower, bool openUpper);
// Same signature shape for 64-bit integer bounds and step.
SetOrRange make_candidate_range(int64_t lower, int64_t upper, int64_t step,
bool openLower, bool openUpper);
// Context handling: obtain a fresh context id, then open and close that
// context with begin_context / end_context.
size_t get_new_context_id();
void begin_context(size_t context_id);
void end_context(size_t context_id);
Expand Down Expand Up @@ -419,10 +421,11 @@ class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> {
template <typename ViableConfigurationCalculator, typename Functor,
typename TagType, typename... Properties>
TeamSizeTuner(const std::string& name,
Kokkos::TeamPolicy<Properties...>& policy,
const Kokkos::TeamPolicy<Properties...>& policy_in,
const Functor& functor, const TagType& tag,
ViableConfigurationCalculator calc) {
using PolicyType = Kokkos::TeamPolicy<Properties...>;
PolicyType policy(policy_in);
auto initial_vector_length = policy.impl_vector_length();
if (initial_vector_length < 1) {
policy.impl_set_vector_length(1);
Expand Down Expand Up @@ -504,7 +507,8 @@ class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> {
}

template <typename... Properties>
void tune(Kokkos::TeamPolicy<Properties...>& policy) {
auto tune(const Kokkos::TeamPolicy<Properties...>& policy_in) {
Kokkos::TeamPolicy<Properties...> policy(policy_in);
if (Kokkos::Tools::Experimental::have_tuning_tool()) {
auto configuration = tuner.begin();
auto team_size = std::get<1>(configuration);
Expand All @@ -514,6 +518,117 @@ class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> {
policy.impl_set_vector_length(vector_length);
}
}
return policy;
}
/// Forwards to the wrapped tuner's end() when a tuning tool is present;
/// otherwise does nothing.
void end() {
  if (!Kokkos::Tools::Experimental::have_tuning_tool()) {
    return;
  }
  tuner.end();
}

// Returns the wrapped tuner by value (the caller receives a copy).
TunerType get_tuner() const { return tuner; }
};
namespace Impl {
/// Maps a C++ bound type onto the tuning interface: `value` is the matching
/// ValueType tag and `get` reads the matching member out of a VariableValue.
/// Only the specializations below are defined.
template <class T>
struct tuning_type_for;

/// Specialization for double-valued tuning variables.
template <>
struct tuning_type_for<double> {
  static constexpr Kokkos::Tools::Experimental::ValueType value =
      Kokkos::Tools::Experimental::ValueType::kokkos_value_double;

  /// Extracts the double payload of `variable`.
  static auto get(const Kokkos::Tools::Experimental::VariableValue& variable)
      -> double {
    return variable.value.double_value;
  }
};

/// Specialization for int64_t-valued tuning variables.
template <>
struct tuning_type_for<int64_t> {
  static constexpr Kokkos::Tools::Experimental::ValueType value =
      Kokkos::Tools::Experimental::ValueType::kokkos_value_int64;

  /// Extracts the integer payload of `variable`.
  static auto get(const Kokkos::Tools::Experimental::VariableValue& variable)
      -> int64_t {
    return variable.value.int_value;
  }
};
}  // namespace Impl
/// Tuner for a single scalar output variable whose candidate values form a
/// range with a fixed step.
///
/// @tparam Bound  Value type of the tuned variable. It must have an
///                Impl::tuning_type_for specialization (double or int64_t).
template <class Bound>
class SingleDimensionalRangeTuner {
  // All members carry default initializers so that a default-constructed
  // tuner never holds indeterminate values.
  size_t id      = 0;
  size_t context = 0;
  using tuning_util = Impl::tuning_type_for<Bound>;

  Bound default_value{};

 public:
  SingleDimensionalRangeTuner() = default;

  /// Declares an output variable called `name` with the tuning interface.
  ///
  /// @param name         Name under which the variable is declared.
  /// @param category     Statistical category recorded for the variable.
  /// @param default_val  Value the variable is seeded with in begin().
  /// @param lower        Lower end of the candidate range.
  /// @param upper        Upper end of the candidate range.
  /// @param step         Step between candidates (defaults to 0).
  SingleDimensionalRangeTuner(
      const std::string& name,
      Kokkos::Tools::Experimental::StatisticalCategory category,
      Bound default_val, Bound lower, Bound upper,
      Bound step = static_cast<Bound>(0))
      : default_value(default_val) {
    Kokkos::Tools::Experimental::VariableInfo info;
    info.category = category;
    // openLower and openUpper are both passed as false.
    info.candidates = make_candidate_range(lower, upper, step, false, false);
    info.valueQuantity =
        Kokkos::Tools::Experimental::CandidateValueType::kokkos_value_range;
    info.type = tuning_util::value;
    id = Kokkos::Tools::Experimental::declare_output_type(name, info);
  }

  /// Opens a new context, requests a value for the declared variable (seeded
  /// with the default value) and returns whatever the request leaves in it.
  Bound begin() {
    context = Kokkos::Tools::Experimental::get_new_context_id();
    Kokkos::Tools::Experimental::begin_context(context);
    auto tuned_value =
        Kokkos::Tools::Experimental::make_variable_value(id, default_value);
    Kokkos::Tools::Experimental::request_output_values(context, 1,
                                                       &tuned_value);
    return tuning_util::get(tuned_value);
  }

  /// Ends the context opened by the most recent begin().
  void end() { Kokkos::Tools::Experimental::end_context(context); }

  /// Calls `func` with the value returned by begin(), then calls end().
  template <typename Functor>
  void with_tuned_value(Functor& func) {
    func(begin());
    end();
  }
};

class RangePolicyOccupancyTuner {
private:
using TunerType = SingleDimensionalRangeTuner<int64_t>;
TunerType tuner;

public:
// Default construction, copy/move construction and copy/move assignment are
// all explicitly defaulted (compiler-generated).
RangePolicyOccupancyTuner() = default;
RangePolicyOccupancyTuner& operator=(const RangePolicyOccupancyTuner& other) =
default;
RangePolicyOccupancyTuner(const RangePolicyOccupancyTuner& other) = default;
RangePolicyOccupancyTuner& operator=(RangePolicyOccupancyTuner&& other) =
default;
RangePolicyOccupancyTuner(RangePolicyOccupancyTuner&& other) = default;
/// Declares the occupancy variable under `name` as a ratio-category range:
/// default 100, candidates from 5 to 100 in steps of 5. The policy, functor,
/// tag and calculator arguments are accepted but not used.
template <typename ViableConfigurationCalculator, typename Functor,
          typename TagType, typename... Properties>
RangePolicyOccupancyTuner(const std::string& name,
                          const Kokkos::RangePolicy<Properties...>&,
                          const Functor&, const TagType&,
                          ViableConfigurationCalculator)
    : tuner(name,
            Kokkos::Tools::Experimental::StatisticalCategory::
                kokkos_value_ratio,
            /*default_val=*/100, /*lower=*/5, /*upper=*/100, /*step=*/5) {}

/// Returns a copy of `policy_in`. When a tuning tool is present, the copy's
/// desired occupancy is set to the value obtained from the tuner; otherwise
/// the copy is returned unchanged.
template <typename... Properties>
auto tune(const Kokkos::RangePolicy<Properties...>& policy_in) {
  using PolicyType = Kokkos::RangePolicy<Properties...>;
  PolicyType tuned_policy(policy_in);
  if (!Kokkos::Tools::Experimental::have_tuning_tool()) {
    return tuned_policy;
  }
  // The tuner yields an int64_t; DesiredOccupancy is built from an int.
  const auto requested = tuner.begin();
  tuned_policy.impl_set_desired_occupancy(
      Kokkos::Experimental::DesiredOccupancy{static_cast<int>(requested)});
  return tuned_policy;
}
void end() {
if (Kokkos::Tools::Experimental::have_tuning_tool()) {
Expand Down Expand Up @@ -577,11 +692,13 @@ struct MDRangeTuner : public ExtendableTunerMixin<MDRangeTuner<MDRangeRank>> {
policy.impl_change_tile_size({std::get<Indices>(tuple)...});
}
template <typename... Properties>
void tune(Kokkos::MDRangePolicy<Properties...>& policy) {
auto tune(const Kokkos::MDRangePolicy<Properties...>& policy_in) {
Kokkos::MDRangePolicy<Properties...> policy(policy_in);
if (Kokkos::Tools::Experimental::have_tuning_tool()) {
auto configuration = tuner.begin();
set_policy_tile(policy, configuration, std::make_index_sequence<rank>{});
}
return policy;
}
void end() {
if (Kokkos::Tools::Experimental::have_tuning_tool()) {
Expand Down
Loading

0 comments on commit 8dba118

Please sign in to comment.