Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data Parallel Harpocrates Cipher, using SYCL #3

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 61 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@ CXXFLAGS = -std=c++20 -Wall -Wextra -pedantic
OPTFLAGS = -O3
IFLAGS = -I ./include

DTARGET = -DTARGET_$(shell echo $(or $(TARGET),default) | tr a-z A-Z)

all: test_harpocrates

test/a.out: test/main.cpp include/*.hpp
test/harpocrates.out: test/main.cpp include/*.hpp
$(CXX) $(CXXFLAGS) $(OPTFLAGS) $(IFLAGS) $< -o $@

test_harpocrates: test/a.out
test_harpocrates: test/harpocrates.out
./$<

clean:
Expand All @@ -17,8 +19,63 @@ clean:
format:
find . -name '*.cpp' -o -name '*.hpp' | xargs clang-format -i --style=Mozilla

bench/a.out: bench/main.cpp include/*.hpp
# benchmarks Harpocrates minimal cipher variant on CPU
bench/harpocrates.out: bench/harpocrates.cpp include/*.hpp
$(CXX) $(CXXFLAGS) $(OPTFLAGS) $(IFLAGS) $< -lbenchmark -o $@

benchmark: bench/a.out
benchmark: bench/harpocrates.out
./$<

# tests data-parallel version of Harpocrates kernels
test/harpocrates_parallel.out: test/main.cpp include/*.hpp
dpcpp $(CXXFLAGS) -DTEST_SYCL -fsycl $(DTARGET) $(OPTFLAGS) $(IFLAGS) $< -o $@

test_harpocrates_parallel: test/harpocrates_parallel.out
./$<

# compiles Harpocrates kernels on-the-fly & benchmarks on selected ( or available )
# accelerator device
bench/harpocrates_parallel.out: bench/harpocrates_parallel.cpp include/*.hpp
dpcpp $(CXXFLAGS) -fsycl $(DTARGET) $(OPTFLAGS) $(IFLAGS) $< -o $@

benchmark_parallel: bench/harpocrates_parallel.out
./$<

# attempt to AOT compile both kernels, when targeting Intel CPUs,
# using AVX/ AVX2 /AVX512/ SSE4.2 ( when available )
bench/aot_cpu.out: bench/harpocrates_parallel.cpp include/*.hpp
@if lscpu | grep -q 'avx512'; then \
echo "Using avx512"; \
dpcpp $(CXXFLAGS) -fsycl -fsycl-targets=spir64_x86_64 -Xs "-march=avx512" -DTARGET_CPU $(OPTFLAGS) $(IFLAGS) $< -o $@; \
elif lscpu | grep -q 'avx2'; then \
echo "Using avx2"; \
dpcpp $(CXXFLAGS) -fsycl -fsycl-targets=spir64_x86_64 -Xs "-march=avx2" -DTARGET_CPU $(OPTFLAGS) $(IFLAGS) $< -o $@; \
elif lscpu | grep -q 'avx'; then \
echo "Using avx"; \
dpcpp $(CXXFLAGS) -fsycl -fsycl-targets=spir64_x86_64 -Xs "-march=avx" -DTARGET_CPU $(OPTFLAGS) $(IFLAGS) $< -o $@; \
elif lscpu | grep -q 'sse4.2'; then \
echo "Using sse4.2"; \
dpcpp $(CXXFLAGS) -fsycl -fsycl-targets=spir64_x86_64 -Xs "-march=sse4.2" -DTARGET_CPU $(OPTFLAGS) $(IFLAGS) $< -o $@; \
else \
echo "Can't AOT compile using avx, avx2, avx512 or sse4.2"; \
fi

aot_cpu: bench/aot_cpu.out
./$<

# attempt to AOT compile both kernels, when targeting Intel GPUs
bench/aot_gpu.out: bench/harpocrates_parallel.cpp include/*.hpp
# you may want to replace `device` identifier with `0x3e96` if you're targeting *Intel(R) UHD Graphics P630*
#
# otherwise, let it be what it's when you're targeting *Intel(R) Iris(R) Xe MAX Graphics*
dpcpp $(CXXFLAGS) -fsycl -fsycl-targets=spir64_gen -Xs "-device 0x4905" -DTARGET_GPU $(OPTFLAGS) $(IFLAGS) $< -o $@

aot_gpu: bench/aot_gpu.out
./$<

# target CUDA for benchmarking data-parallel Harpocrates cipher
bench/harpocrates_parallel.cuda.out: bench/harpocrates_parallel.cpp include/*.hpp
clang++ $(CXXFLAGS) -fsycl -fsycl-targets=nvptx64-nvidia-cuda -DTARGET_GPU $(OPTFLAGS) $(IFLAGS) $< -o $@

cuda: bench/harpocrates_parallel.cuda.out
./$<
8 changes: 8 additions & 0 deletions bench/harpocrates.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#include "bench_harpocrates.hpp"

// register function for benchmarking
BENCHMARK(harpocrates_encrypt);
BENCHMARK(harpocrates_decrypt);

// main function to make it executable
BENCHMARK_MAIN();
76 changes: 76 additions & 0 deletions bench/harpocrates_parallel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#include "bench_harpocrates_parallel.hpp"
#include "table.hpp"

int
main()
{
#if defined TARGET_CPU
sycl::cpu_selector s{};
#pragma message("Targeting default CPU at run-time")
#elif defined TARGET_GPU
sycl::gpu_selector s{};
#pragma message("Targeting default GPU at run-time")
#else
sycl::default_selector s{};
#pragma message("Targeting default Accelerator at run-time")
#endif

sycl::device d{ s };
sycl::context c{ d };
sycl::queue q{ c, d, sycl::property::queue::enable_profiling{} };

constexpr size_t min_wi_cnt = 1ul << 20;
constexpr size_t max_wi_cnt = 1ul << 26;
constexpr size_t wg_size = 32ul;

std::cout << "Running on " << d.get_info<sycl::info::device::name>()
<< std::endl
<< std::endl;

TextTable tbl('-', '|', '+');

tbl.add("# -of work-items");
tbl.add("kernel name");
tbl.add("input size ( bytes )");
tbl.add("output size ( bytes )");
tbl.add("host-to-device b/w");
tbl.add("kernel b/w");
tbl.add("device-to-host b/w");
tbl.endOfRow();

for (size_t wi = min_wi_cnt; wi <= max_wi_cnt; wi <<= 1) {
const auto ret = bench_harpocrates_parallel_encrypt_decrypt(q, wi, wg_size);

const auto enc_kernel = ret[0];
const auto dec_kernel = ret[1];

tbl.add(std::to_string(wi));
tbl.add("Harpocrates Encrypt");
tbl.add(to_readable_data_amount(enc_kernel.h2d_tx));
tbl.add(to_readable_data_amount(enc_kernel.d2h_tx));
tbl.add(to_readable_bandwidth(enc_kernel.h2d_tx, enc_kernel.h2d_tx_tm));
tbl.add(to_readable_bandwidth(enc_kernel.h2d_tx - 256, enc_kernel.exec_tm));
tbl.add(to_readable_bandwidth(enc_kernel.d2h_tx, enc_kernel.d2h_tx_tm));
tbl.endOfRow();

tbl.add(std::to_string(wi));
tbl.add("Harpocrates Decrypt");
tbl.add(to_readable_data_amount(dec_kernel.h2d_tx));
tbl.add(to_readable_data_amount(dec_kernel.d2h_tx));
tbl.add(to_readable_bandwidth(dec_kernel.h2d_tx, dec_kernel.h2d_tx_tm));
tbl.add(to_readable_bandwidth(dec_kernel.h2d_tx - 256, dec_kernel.exec_tm));
tbl.add(to_readable_bandwidth(dec_kernel.d2h_tx, dec_kernel.d2h_tx_tm));
tbl.endOfRow();
}

tbl.setAlignment(1, TextTable::Alignment::RIGHT);
tbl.setAlignment(2, TextTable::Alignment::RIGHT);
tbl.setAlignment(3, TextTable::Alignment::RIGHT);
tbl.setAlignment(4, TextTable::Alignment::RIGHT);
tbl.setAlignment(5, TextTable::Alignment::RIGHT);
tbl.setAlignment(6, TextTable::Alignment::RIGHT);

std::cout << tbl;

return EXIT_SUCCESS;
}
8 changes: 1 addition & 7 deletions bench/main.cpp → include/bench_harpocrates.hpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#pragma once
#include "harpocrates.hpp"
#include "utils.hpp"
#include <benchmark/benchmark.h>
Expand Down Expand Up @@ -91,10 +92,3 @@ harpocrates_decrypt(benchmark::State& state)
std::free(enc);
std::free(dec);
}

// register function for benchmarking
BENCHMARK(harpocrates_encrypt);
BENCHMARK(harpocrates_decrypt);

// main function to make it executable
BENCHMARK_MAIN();
181 changes: 181 additions & 0 deletions include/bench_harpocrates_parallel.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#pragma once
#include "harpocrates_parallel.hpp"
#include "utils.hpp"
#include <string.h>

constexpr double GB = 1073741824.; // 1 << 30 bytes
constexpr double MB = 1048576.; // 1 << 20 bytes
constexpr double KB = 1024.; // 1 << 10 bytes

// Data structure for holding
//
// i) host -> device input data transfer time
// ii) bytes of data transferred from host -> device
// iii) SYCL kernel execution time
// iv) device -> host output data transfer time
// v) bytes of data brought back to host from device
//
// where kernel in question can be Harpocrates encrypt/ decrypt routine,
// offloaded to accelerator device.
struct sycl_benchmark_t
{
uint64_t h2d_tx_tm; // host -> device input data tx time ( ns )
size_t h2d_tx; // bytes of data transferred from host -> device ( bytes )
uint64_t exec_tm; // SYCL kernel execution time ( ns )
uint64_t d2h_tx_tm; // device -> host output data tx time ( ns )
size_t d2h_tx; // bytes of data transferred from device -> host ( bytes )
};

// Time execution of SYCL command, whose submission resulted into supplied SYCL
// event
//
// Ensure where SYCL command was submitted, that queue has profiling enabled !
static inline uint64_t
time_event(sycl::event& e)
{
using command = sycl::info::event_profiling;

sycl::cl_ulong t0 = e.get_profiling_info<command::command_start>();
sycl::cl_ulong t1 = e.get_profiling_info<command::command_end>();

return static_cast<uint64_t>(t1 - t0);
}

// Convert how many bytes processed in how long timespan ( given in nanosecond
// level granularity ) to more human digestable format ( i.e. GB/ s or MB/ s or
// KB/ s or B/ s )
static inline const std::string
to_readable_bandwidth(const size_t bytes, // bytes
const uint64_t ts // nanoseconds
)
{
const double bytes_ = static_cast<double>(bytes);
const double ts_ = static_cast<double>(ts) * 1e-9; // seconds
const double bps = bytes_ / ts_; // bytes/ sec

return bps >= GB ? (std::to_string(bps / GB) + " GB/ s")
: bps >= MB ? (std::to_string(bps / MB) + " MB/ s")
: bps >= KB ? (std::to_string(bps / KB) + " KB/ s")
: (std::to_string(bps) + " B/ s");
}

// Convert given number of bytes to more readable form such as GB, MB, KB or B
static inline const std::string
to_readable_data_amount(const size_t bytes)
{
const double bytes_ = static_cast<double>(bytes);

return bytes_ >= GB ? (std::to_string(bytes_ / GB) + " GB")
: bytes_ >= MB ? (std::to_string(bytes_ / MB) + " MB")
: bytes_ >= KB ? (std::to_string(bytes_ / KB) + " KB")
: (std::to_string(bytes_) + " B");
}

// Benchmark execution of data-parallel Harpocrates encrypt/ decrypt kernels,
// along with that keep track of host -> device & device -> host data transfer
// amount, time
//
// Note, in this routine, amount of data is calculated in bytes, while time is
// calculated in nanoseconds
//
// This function returns a vector of two elements, where first element holds
// benchmarked data for Harpocrates encrypt kernel, while second entry holds
// benchmarked data for Harpocrates decrypt kernel
static const std::vector<sycl_benchmark_t>
bench_harpocrates_parallel_encrypt_decrypt(
sycl::queue& q, // profiling enabled SYCL queue
const size_t wi_cnt, // # -of work-items to dispatch
const size_t wg_size // # -of work-items to group together
)
{
// SYCL queue must have profiling enabled
assert(q.has_property<sycl::property::queue::enable_profiling>());

const size_t ct_len = wi_cnt << 4;

// allocate resources
uint8_t* lut_h = static_cast<uint8_t*>(sycl::malloc_host(256, q));
uint8_t* ilut_h = static_cast<uint8_t*>(sycl::malloc_host(256, q));
uint8_t* txt_h = static_cast<uint8_t*>(sycl::malloc_host(ct_len, q));
uint8_t* enc_h = static_cast<uint8_t*>(sycl::malloc_host(ct_len, q));
uint8_t* dec_h = static_cast<uint8_t*>(sycl::malloc_host(ct_len, q));

uint8_t* lut_d = static_cast<uint8_t*>(sycl::malloc_device(256, q));
uint8_t* ilut_d = static_cast<uint8_t*>(sycl::malloc_device(256, q));
uint8_t* txt_d = static_cast<uint8_t*>(sycl::malloc_device(ct_len, q));
uint8_t* enc_d = static_cast<uint8_t*>(sycl::malloc_device(ct_len, q));
uint8_t* dec_d = static_cast<uint8_t*>(sycl::malloc_device(ct_len, q));

harpocrates_utils::generate_lut(lut_h);
harpocrates_utils::generate_inv_lut(lut_h, ilut_h);

random_data(txt_h, ct_len);
memset(enc_h, 0, ct_len);
memset(dec_h, 0, ct_len);

using evt = sycl::event;
using evts = std::vector<evt>;
using namespace harpocrates_parallel;

// host -> device data tx
evt e0 = q.memcpy(lut_d, lut_h, 256);
evt e1 = q.memcpy(ilut_d, ilut_h, 256);
evt e2 = q.memcpy(txt_d, txt_h, ct_len);

evt e3 = q.memset(enc_d, 0, ct_len);
evt e4 = q.memset(dec_d, 0, ct_len);

// dispatch encryption kernel
evts e5{ e0, e2, e3 };
evt e6 = encrypt(q, lut_d, txt_d, enc_d, ct_len, wg_size, e5);

// dispatch decryption kernel
evts e7{ e1, e4, e6 };
evt e8 = decrypt(q, ilut_d, enc_d, dec_d, ct_len, wg_size, e7);

// device -> host data tx
evt e9 = q.submit([&](sycl::handler& h) {
h.depends_on(e6);
h.memcpy(enc_h, enc_d, ct_len);
});
evt e10 = q.submit([&](sycl::handler& h) {
h.depends_on(e8);
h.memcpy(dec_h, dec_d, ct_len);
});

evt e11 = q.ext_oneapi_submit_barrier({ e9, e10 });

// host sychronization
e11.wait();

// compare to check that encrypt -> decrypt on accelerator worked as expected
for (size_t i = 0; i < ct_len; i++) {
assert((txt_h[i] ^ dec_h[i]) == 0);
}

// release memory resources
sycl::free(lut_h, q);
sycl::free(ilut_h, q);
sycl::free(txt_h, q);
sycl::free(enc_h, q);
sycl::free(dec_h, q);

sycl::free(lut_d, q);
sycl::free(ilut_d, q);
sycl::free(txt_d, q);
sycl::free(enc_d, q);
sycl::free(dec_d, q);

return {
{ time_event(e0) + time_event(e2),
256ul + ct_len,
time_event(e6),
time_event(e9),
ct_len }, // for encrypt kernel
{ time_event(e1) + time_event(e2),
256ul + ct_len,
time_event(e8),
time_event(e10),
ct_len } // for decrypt kernel
};
}
Loading