Skip to content

Commit

Permalink
[STF] Document dot sections (#3506)
Browse files Browse the repository at this point in the history
* Start to document STF dot sections

* fix formatting

* Minor fixes in the doc

* Add missing file

* clang-format

* Remove dot_push_section and dot_pop_section and also fix a bazillion warnings

* Format

* More Format

* Add missing mv

* misc. C++ fixes and clang-format

* Update dot_section doc to reflect that we removed dot_push_section and dot_pop_section

* - Fix documentation error
- Use the dot_section doc example as a test
- do not use assert directly

* Review and a few more touches

* Improvement for docs/cudax/stf.rst

Co-authored-by: Bernhard Manfred Gruber <[email protected]>

---------

Co-authored-by: Andrei Alexandrescu <[email protected]>
Co-authored-by: Bernhard Manfred Gruber <[email protected]>
  • Loading branch information
3 people authored Jan 29, 2025
1 parent 2a03b6e commit 74c17c6
Show file tree
Hide file tree
Showing 22 changed files with 369 additions and 136 deletions.
8 changes: 4 additions & 4 deletions cudax/examples/stf/linear_algebra/07-cholesky.cu
Original file line number Diff line number Diff line change
Expand Up @@ -659,14 +659,14 @@ int main(int argc, char** argv)
return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row);
};

ctx.dot_push_section("fillA");
auto s = ctx.dot_section("fillA");
if (check_result)
{
Aref.fill(hilbert);
}

A.fill(hilbert);
ctx.dot_pop_section();
s.end();

/* Right-hand side */
matrix<double> B_potrs(N, 1, NB, 1, false, "B");
Expand All @@ -693,9 +693,9 @@ int main(int argc, char** argv)
cudaEvent_t startEvent_pdpotrf, stopEvent_pdpotrf;
float milliseconds_pdpotrf = 0;

// for (int row = 0; row < A.mt; row++)
// for (size_t row = 0; row < A.mt; row++)
// {
// for (int col = 0; col <= row; col++)
// for (size_t col = 0; col <= row; col++)
// {
// cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col)));
// NOOP(A, row, col);
Expand Down
8 changes: 4 additions & 4 deletions cudax/examples/stf/linear_algebra/07-potri.cu
Original file line number Diff line number Diff line change
Expand Up @@ -197,17 +197,17 @@ public:
void print()
{
// print blocks by blocks
for (int colb = 0; colb < nt; colb++)
for (size_t colb = 0; colb < nt; colb++)
{
int low_rowb = sym_matrix ? colb : 0;
for (int rowb = low_rowb; rowb < mt; rowb++)
for (size_t rowb = low_rowb; rowb < mt; rowb++)
{
// Each task fills a block
ctx.host_launch(get_handle(rowb, colb).read())->*[=](auto sA) {
for (int lcol = 0; lcol < sA.extent(1); lcol++)
for (size_t lcol = 0; lcol < sA.extent(1); lcol++)
{
size_t col = lcol + colb * sA.extent(1);
for (int lrow = 0; lrow < sA.extent(0); lrow++)
for (size_t lrow = 0; lrow < sA.extent(0); lrow++)
{
size_t row = lrow + rowb * sA.extent(0);

Expand Down
9 changes: 8 additions & 1 deletion cudax/examples/stf/linear_algebra/cg_csr.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public:
static void copy_vector(const vector& from, vector& to)
{
to.ctx.parallel_for(to.handle.shape(), to.handle.write(), from.handle.read()).set_symbol("copy_vector")
->*[] _CCCL_DEVICE(size_t i, slice<double> dto, slice<double> dfrom) {
->*[] _CCCL_DEVICE(size_t i, slice<double> dto, slice<const double> dfrom) {
dto(i) = dfrom(i);
};
}
Expand Down Expand Up @@ -116,6 +116,13 @@ public:
copy_scalar(a, *this);
}

scalar& operator=(scalar&& a)
{
handle = mv(a.handle);
ctx = mv(a.ctx);
return *this;
}

scalar operator/(scalar const& rhs) const
{
// Submit a task that computes this/rhs
Expand Down
10 changes: 0 additions & 10 deletions cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -955,16 +955,6 @@ public:
reserved::per_ctx_dot::set_parent_ctx(parent_ctx.get_dot(), get_dot());
}

void dot_push_section(::std::string symbol) const
{
reserved::dot::section::push(mv(symbol));
}

void dot_pop_section() const
{
reserved::dot::section::pop();
}

auto dot_section(::std::string symbol) const
{
return reserved::dot::section::guard(mv(symbol));
Expand Down
33 changes: 25 additions & 8 deletions cudax/include/cuda/experimental/__stf/internal/dot.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ public:
static int get_current_section_id();

template <typename task_type, typename data_type>
void add_vertex(task_type t)
void add_vertex(const task_type& t)
{
// Do this work outside the critical section
const auto remove_deps = getenv("CUDASTF_DOT_REMOVE_DATA_DEPS");
Expand Down Expand Up @@ -208,7 +208,7 @@ public:
}

template <typename task_type>
void add_vertex_timing(task_type t, float time_ms, int device = -1)
void add_vertex_timing(const task_type& t, float time_ms, int device = -1)
{
::std::lock_guard<::std::mutex> guard(mtx);

Expand Down Expand Up @@ -286,7 +286,7 @@ public:
::std::shared_ptr<per_ctx_dot> parent;
::std::vector<::std::shared_ptr<per_ctx_dot>> children;

const ::std::string get_ctx_symbol() const
const ::std::string& get_ctx_symbol() const
{
return ctx_symbol;
}
Expand Down Expand Up @@ -352,7 +352,10 @@ public:
// Constructor to initialize symbol and children
section(::std::string sym)
: symbol(mv(sym))
{}
{
static_assert(::std::is_move_constructible_v<section>, "section must be move constructible");
static_assert(::std::is_move_assignable_v<section>, "section must be move assignable");
}

class guard
{
Expand All @@ -362,10 +365,24 @@ public:
section::push(mv(symbol));
}

~guard()
void end()
{
_CCCL_ASSERT(active, "Attempting to end the same section twice.");
section::pop();
active = false;
}

~guard()
{
if (active)
{
section::pop();
}
}

private:
// Have we called end() ?
bool active = true;
};

static auto& current()
Expand All @@ -380,7 +397,7 @@ public:
auto sec = ::std::make_shared<section>(mv(symbol));
int id = sec->get_id();

int parent_id = current().size() == 0 ? 0 : current().top();
int parent_id = current().empty() ? 0 : current().top();
sec->parent_id = parent_id;

// Save the section in the map
Expand Down Expand Up @@ -416,7 +433,7 @@ public:
return 1 + int(id);
}

const ::std::string get_symbol() const
const ::std::string& get_symbol() const
{
return symbol;
}
Expand All @@ -431,7 +448,7 @@ public:
::std::vector<int> children_ids;

private:
int depth;
int depth = ::std::numeric_limits<int>::min();

::std::string symbol;

Expand Down
28 changes: 1 addition & 27 deletions cudax/include/cuda/experimental/stf.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -635,40 +635,14 @@ public:
payload);
}

/**
* @brief Start a new section in the DOT file identified by its symbol
*/
void dot_push_section(::std::string symbol) const
{
_CCCL_ASSERT(payload.index() != ::std::variant_npos, "Context is not initialized");
::std::visit(
[symbol = mv(symbol)](auto& self) {
self.dot_push_section(symbol);
},
payload);
}

/**
* @brief Ends current dot section
*/
void dot_pop_section() const
{
_CCCL_ASSERT(payload.index() != ::std::variant_npos, "Context is not initialized");
::std::visit(
[](auto& self) {
self.dot_pop_section();
},
payload);
}

/**
* @brief RAII-style description of a new section in the DOT file identified by its symbol
*/
auto dot_section(::std::string symbol) const
{
_CCCL_ASSERT(payload.index() != ::std::variant_npos, "Context is not initialized");
return ::std::visit(
[symbol = mv(symbol)](auto& self) {
[&symbol](auto& self) {
return self.dot_section(symbol);
},
payload);
Expand Down
1 change: 1 addition & 0 deletions cudax/test/stf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ set(stf_test_sources
dot/basic.cu
dot/graph_print_to_dot.cu
dot/sections.cu
dot/sections_2.cu
dot/with_events.cu
error_checks/ctx_mismatch.cu
error_checks/data_interface_mismatch.cu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ static __global__ void finalError(double* x, double* g_sum)
double JacobiMethodGpuCudaGraphExecKernelSetParams(
const float* A,
const double* b,
const float conv_threshold,
float conv_threshold,
const int max_iter,
double* x,
double* x_new,
Expand Down
49 changes: 49 additions & 0 deletions cudax/test/stf/dot/sections_2.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
* @file
* @brief This test makes sure we can generate a dot file with sections
*/

#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

int main()
{
// TODO (miscco): Make it work for windows
#if !_CCCL_COMPILER(MSVC)
context ctx;
auto lA = ctx.logical_token().set_symbol("A");
auto lB = ctx.logical_token().set_symbol("B");
auto lC = ctx.logical_token().set_symbol("C");

// Begin a top-level section named "foo"
auto s_foo = ctx.dot_section("foo");
for (size_t i = 0; i < 2; i++)
{
// Section named "bar" using RAII
auto s_bar = ctx.dot_section("bar");
ctx.task(lA.read(), lB.rw()).set_symbol("t1")->*[](cudaStream_t, auto, auto) {};
for (size_t j = 0; j < 2; j++)
{
// Section named "baz" using RAII
auto s_bar = ctx.dot_section("baz");
ctx.task(lA.read(), lC.rw()).set_symbol("t2")->*[](cudaStream_t, auto, auto) {};
ctx.task(lB.read(), lC.read(), lA.rw()).set_symbol("t3")->*[](cudaStream_t, auto, auto, auto) {};
// Implicit end of section "baz"
}
// Implicit end of section "bar"
}
s_foo.end(); // Explicit end of section "foo"
ctx.finalize();
#endif // !_CCCL_COMPILER(MSVC)
}
Loading

0 comments on commit 74c17c6

Please sign in to comment.