ENH: kernels for random.vonmises; part 2 #681

Draft · wants to merge 17 commits into base: master
135 changes: 80 additions & 55 deletions dpnp/backend/kernels/dpnp_krnl_random.cpp
@@ -1261,6 +1261,8 @@ void dpnp_rng_vonmises_large_kappa_c(void* result, const _DataType mu, const _Da
     _DataType s_minus_one, hpt, r_over_two_kappa_minus_one, rho_minus_one;
     _DataType* Uvec = nullptr;
     _DataType* Vvec = nullptr;
+    bool* result_ready = nullptr;
+    bool* result_mask = nullptr;
     const _DataType d_zero = 0.0, d_one = 1.0;
 
     assert(kappa > 1.0);
@@ -1276,49 +1278,59 @@
 
     Uvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType)));
     Vvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType)));
 
+    result_ready = reinterpret_cast<bool*>(dpnp_memory_alloc_c(1 * sizeof(bool)));
+    result_ready[0] = false;
+    result_mask = reinterpret_cast<bool*>(dpnp_memory_alloc_c(size * sizeof(bool)));
+    dpnp_full_c<bool>(result_ready, result_mask, size);
+
-    for (size_t n = 0; n < size;)
+    while(!result_ready[0])
     {
-        size_t diff_size = size - n;
         mkl_rng::uniform<_DataType> uniform_distribution_u(d_zero, 0.5 * M_PI);
-        auto event_out = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, diff_size, Uvec);
-        event_out.wait();
-        // TODO
-        // use deps case
+        auto uniform_distr_u_event = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, size, Uvec);
         mkl_rng::uniform<_DataType> uniform_distribution_v(d_zero, d_one);
-        event_out = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, diff_size, Vvec);
-        event_out.wait();
+        auto uniform_distr_v_event = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, size, Vvec);
 
-        // TODO
-        // kernel
-        for (size_t i = 0; i < diff_size; i++)
-        {
-            _DataType sn, cn, sn2, cn2;
-            _DataType neg_W_minus_one, V, Y;
+        cl::sycl::range<1> gws(size);
+        auto paral_kernel_some = [&](cl::sycl::handler& cgh) {
@shssf (Contributor) commented on May 13, 2021:

This is a kernel inside a loop that can have a large trip count. It would be more efficient to parallelize (make the kernel cover) the algorithm over the larger value size instead of size - n; that in turn requires a loop inside the kernel. It is an open question which will be more performant:

  1. a loop over a queue of kernels (data dependent), or
  2. a single kernel with an internal loop.

It is hard to predict with no perf measurements, but I would vote that parallelization with the bigger number of threads should be better.
+            cgh.depends_on({uniform_distr_u_event, uniform_distr_v_event});
+            cgh.parallel_for(gws, [=](cl::sycl::id<1> global_id) {
+                size_t i = global_id[0];
+                if (!result_mask[i]) {
+                    _DataType sn, cn, sn2, cn2;
+                    _DataType neg_W_minus_one, V, Y;
 
-            sn = sin(Uvec[i]);
-            cn = cos(Uvec[i]);
-            V = Vvec[i];
-            sn2 = sn * sn;
-            cn2 = cn * cn;
+                    sn = cl::sycl::sin(Uvec[i]);
+                    cn = cl::sycl::cos(Uvec[i]);
+                    V = Vvec[i];
+                    sn2 = sn * sn;
+                    cn2 = cn * cn;
 
-            neg_W_minus_one = s_minus_one * sn2 / (0.5 * s_minus_one + cn2);
-            Y = kappa * (s_minus_one + neg_W_minus_one);
+                    neg_W_minus_one = s_minus_one * sn2 / (0.5 * s_minus_one + cn2);
+                    Y = kappa * (s_minus_one + neg_W_minus_one);
 
-            if ((Y * (2 - Y) >= V) || (log(Y / V) + 1 >= Y))
-            {
-                Y = neg_W_minus_one * (2 - neg_W_minus_one);
-                if (Y < 0)
-                    Y = 0.0;
-                else if (Y > 1.0)
-                    Y = 1.0;
+                    if ((Y * (2 - Y) >= V) || (cl::sycl::log(Y / V) + 1 >= Y))
+                    {
+                        Y = neg_W_minus_one * (2 - neg_W_minus_one);
+                        if (Y < 0)
+                            Y = 0.0;
+                        else if (Y > 1.0)
+                            Y = 1.0;
 
+                        result1[i] = cl::sycl::asin(cl::sycl::sqrt(Y));
+                        result_mask[i] = true;
+                    }
+                }
+            });
+        };
+        auto some_event = DPNP_QUEUE.submit(paral_kernel_some);
+        some_event.wait();
 
-            result1[n++] = asin(sqrt(Y));
-        }
-    }
+        dpnp_all_c<bool, bool>(result_mask, result_ready, size);
     }
 
     dpnp_memory_free_c(Uvec);
+    dpnp_memory_free_c(result_ready);
+    dpnp_memory_free_c(result_mask);
 
     mkl_rng::uniform<_DataType> uniform_distribution(d_zero, d_one);
     auto uniform_distr_event = mkl_rng::generate(uniform_distribution, DPNP_RNG_ENGINE, size, Vvec);
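For comparison, below is a minimal sketch of option 2 from the review comment above: a single kernel whose work-items each loop until their own sample is accepted. It assumes oneMKL's device-side RNG engines (oneapi/mkl/rng/device.hpp), which this backend does not use today; the function and parameter names are illustrative only, and the mu-dependent post-processing that the real functions perform afterwards is omitted. Shown for the small-kappa sampler, whose body is simpler.

// Sketch only: one kernel, retry loop inside each work-item (the reviewer's
// option 2). Not part of this PR.
#include <CL/sycl.hpp>
#include <oneapi/mkl/rng/device.hpp>
#include <cmath>
#include <cstdint>

namespace rng_dev = oneapi::mkl::rng::device;

template <typename _DataType>
void vonmises_small_kappa_one_kernel(cl::sycl::queue& q,
                                     _DataType* result,
                                     const _DataType kappa,
                                     const _DataType s_kappa,
                                     const size_t size,
                                     const std::uint64_t seed)
{
    q.submit([&](cl::sycl::handler& cgh) {
         cgh.parallel_for(cl::sycl::range<1>(size), [=](cl::sycl::id<1> id) {
             const size_t i = id[0];
             // One engine per work-item; a wide offset keeps the streams disjoint.
             rng_dev::philox4x32x10<1> engine(seed, i * (std::uint64_t{1} << 32));
             rng_dev::uniform<_DataType> u_distr(_DataType(0), _DataType(M_PI));
             rng_dev::uniform<_DataType> v_distr(_DataType(0), _DataType(1));
             for (;;) // loop inside the kernel instead of around it
             {
                 const _DataType Z = cl::sycl::cos(rng_dev::generate(u_distr, engine));
                 const _DataType V = rng_dev::generate(v_distr, engine);
                 const _DataType W = (kappa + s_kappa * Z) / (s_kappa + kappa * Z);
                 const _DataType Y = s_kappa - kappa * W;
                 if ((Y * (2 - Y) >= V) || (cl::sycl::log(Y / V) + 1 >= Y))
                 {
                     result[i] = cl::sycl::acos(W);
                     break;
                 }
             }
         });
     }).wait();
}

This shape removes the host-side while loop, the result_mask buffer, and the dpnp_all_c reduction, at the cost of divergent retry counts across work-items; as the comment says, only profiling can decide between the two.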
@@ -1357,6 +1369,8 @@ void dpnp_rng_vonmises_small_kappa_c(void* result, const _DataType mu, const _Da
     _DataType rho_over_kappa, rho, r, s_kappa;
     _DataType* Uvec = nullptr;
     _DataType* Vvec = nullptr;
+    bool* result_ready = nullptr;
+    bool* result_mask = nullptr;
 
     const _DataType d_zero = 0.0, d_one = 1.0;
 
@@ -1372,35 +1386,46 @@
     Uvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType)));
     Vvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType)));
 
-    for (size_t n = 0; n < size;)
+    result_ready = reinterpret_cast<bool*>(dpnp_memory_alloc_c(1 * sizeof(bool)));
+    result_ready[0] = false;
+    result_mask = reinterpret_cast<bool*>(dpnp_memory_alloc_c(size * sizeof(bool)));
+    dpnp_full_c<bool>(result_ready, result_mask, size);
+
+    while (!result_ready[0])
     {
-        size_t diff_size = size - n;
         mkl_rng::uniform<_DataType> uniform_distribution_u(d_zero, M_PI);
-        auto event_out = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, diff_size, Uvec);
-        event_out.wait();
-        // TODO
-        // use deps case
+        auto uniform_distr_u_event = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, size, Uvec);
         mkl_rng::uniform<_DataType> uniform_distribution_v(d_zero, d_one);
-        event_out = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, diff_size, Vvec);
-        event_out.wait();
+        auto uniform_distr_v_event = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, size, Vvec);
 
-        // TODO
-        // kernel
-        for (size_t i = 0; i < diff_size; i++)
-        {
-            _DataType Z, W, Y, V;
-            Z = cos(Uvec[i]);
-            V = Vvec[i];
-            W = (kappa + s_kappa * Z) / (s_kappa + kappa * Z);
-            Y = s_kappa - kappa * W;
-            if ((Y * (2 - Y) >= V) || (log(Y / V) + 1 >= Y))
-            {
-                result1[n++] = acos(W);
-            }
-        }
-    }
+        cl::sycl::range<1> gws((size));
+
+        auto paral_kernel_some = [&](cl::sycl::handler& cgh) {
+            cgh.depends_on({uniform_distr_u_event, uniform_distr_v_event});
+            cgh.parallel_for(gws, [=](cl::sycl::id<1> global_id) {
+                size_t i = global_id[0];
+                if (!result_mask[i]) {
+                    _DataType Z, W, Y, V;
+                    Z = cl::sycl::cos(Uvec[i]);
+                    V = Vvec[i];
+                    W = (kappa + s_kappa * Z) / (s_kappa + kappa * Z);
+                    Y = s_kappa - kappa * W;
+                    if ((Y * (2 - Y) >= V) || (cl::sycl::log(Y / V) + 1 >= Y))
+                    {
+                        result1[i] = cl::sycl::acos(W);
+                        result_mask[i] = true;
+                    }
+                }
+            });
+        };
+        auto some_event = DPNP_QUEUE.submit(paral_kernel_some);
+        some_event.wait();
 
+        dpnp_all_c<bool, bool>(result_mask, result_ready, size);
+    }
     dpnp_memory_free_c(Uvec);
+    dpnp_memory_free_c(result_ready);
+    dpnp_memory_free_c(result_mask);
 
     mkl_rng::uniform<_DataType> uniform_distribution(d_zero, d_one);
     auto uniform_distr_event = mkl_rng::generate(uniform_distribution, DPNP_RNG_ENGINE, size, Vvec);
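Stepping back, both rewritten branches share the same host-side shape: draw a full batch of uniforms, let a data-parallel kernel fill in whichever elements pass the acceptance test, and loop until a readiness flag reports every slot done. The following is a condensed, self-contained sketch of that pattern using plain SYCL USM, with a toy acceptance test and host-side stand-ins for the dpnp allocators, mkl_rng::generate, and dpnp_all_c; it is illustrative, not the PR code.

// Minimal mask-and-reduce rejection loop (illustrative only).
#include <CL/sycl.hpp>
#include <algorithm>
#include <random>

int main()
{
    cl::sycl::queue q;
    const size_t size = 1024;

    float* uvec = cl::sycl::malloc_shared<float>(size, q);
    float* result = cl::sycl::malloc_shared<float>(size, q);
    bool* mask = cl::sycl::malloc_shared<bool>(size, q);
    std::fill(mask, mask + size, false);

    std::mt19937 host_rng(42);
    std::uniform_real_distribution<float> u01(0.0f, 1.0f);

    bool all_done = false;
    while (!all_done)
    {
        // Stand-in for the mkl_rng::generate batch draw.
        for (size_t i = 0; i < size; ++i)
            uvec[i] = u01(host_rng);

        q.parallel_for(cl::sycl::range<1>(size), [=](cl::sycl::id<1> id) {
             const size_t i = id[0];
             // Skip slots that already hold an accepted sample.
             if (!mask[i] && uvec[i] >= 0.5f) // toy acceptance test
             {
                 result[i] = uvec[i];
                 mask[i] = true;
             }
         }).wait();

        // Host reduction standing in for dpnp_all_c<bool, bool>.
        all_done = std::all_of(mask, mask + size, [](bool b) { return b; });
    }

    cl::sycl::free(uvec, q);
    cl::sycl::free(result, q);
    cl::sycl::free(mask, q);
    return 0;
}

Each pass wastes the draws for already-finished slots, which is the inefficiency the review comment above is probing; the upside is that every pass is one fully parallel kernel over all size elements.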
2 changes: 1 addition & 1 deletion tests/test_random.py
@@ -875,7 +875,7 @@ def test_invalid_args(self):
     @pytest.mark.parametrize("kappa", [5.0, 0.5], ids=['large_kappa', 'small_kappa'])
     def test_seed(self, kappa):
         seed = 28041990
-        size = 10
+        size = 1000
         mu = 2.
         dpnp.random.seed(seed)
         a1 = dpnp.asarray(dpnp.random.vonmises(mu, kappa, size=size))