diff --git a/Makefile b/Makefile index 21a0f15..b8a7d5f 100644 --- a/Makefile +++ b/Makefile @@ -3,12 +3,14 @@ CXXFLAGS = -std=c++20 -Wall -Wextra -pedantic OPTFLAGS = -O3 IFLAGS = -I ./include +DTARGET = -DTARGET_$(shell echo $(or $(TARGET),default) | tr a-z A-Z) + all: test_harpocrates -test/a.out: test/main.cpp include/*.hpp +test/harpocrates.out: test/main.cpp include/*.hpp $(CXX) $(CXXFLAGS) $(OPTFLAGS) $(IFLAGS) $< -o $@ -test_harpocrates: test/a.out +test_harpocrates: test/harpocrates.out ./$< clean: @@ -17,8 +19,63 @@ clean: format: find . -name '*.cpp' -o -name '*.hpp' | xargs clang-format -i --style=Mozilla -bench/a.out: bench/main.cpp include/*.hpp +# benchmarks Harpocrates minimal cipher variant on CPU +bench/harpocrates.out: bench/harpocrates.cpp include/*.hpp $(CXX) $(CXXFLAGS) $(OPTFLAGS) $(IFLAGS) $< -lbenchmark -o $@ -benchmark: bench/a.out +benchmark: bench/harpocrates.out + ./$< + +# tests data-parallel version of Harpocrates kernels +test/harpocrates_parallel.out: test/main.cpp include/*.hpp + dpcpp $(CXXFLAGS) -DTEST_SYCL -fsycl $(DTARGET) $(OPTFLAGS) $(IFLAGS) $< -o $@ + +test_harpocrates_parallel: test/harpocrates_parallel.out + ./$< + +# compiles Harpocrates kernels on-the-fly & benchmarks on selected ( or available ) +# accelerator device +bench/harpocrates_parallel.out: bench/harpocrates_parallel.cpp include/*.hpp + dpcpp $(CXXFLAGS) -fsycl $(DTARGET) $(OPTFLAGS) $(IFLAGS) $< -o $@ + +benchmark_parallel: bench/harpocrates_parallel.out + ./$< + +# attempt to AOT compile both kernels, when targeting Intel CPUs, +# using AVX/ AVX2 /AVX512/ SSE4.2 ( when available ) +bench/aot_cpu.out: bench/harpocrates_parallel.cpp include/*.hpp + @if lscpu | grep -q 'avx512'; then \ + echo "Using avx512"; \ + dpcpp $(CXXFLAGS) -fsycl -fsycl-targets=spir64_x86_64 -Xs "-march=avx512" -DTARGET_CPU $(OPTFLAGS) $(IFLAGS) $< -o $@; \ + elif lscpu | grep -q 'avx2'; then \ + echo "Using avx2"; \ + dpcpp $(CXXFLAGS) -fsycl -fsycl-targets=spir64_x86_64 -Xs "-march=avx2" -DTARGET_CPU $(OPTFLAGS) $(IFLAGS) $< -o $@; \ + elif lscpu | grep -q 'avx'; then \ + echo "Using avx"; \ + dpcpp $(CXXFLAGS) -fsycl -fsycl-targets=spir64_x86_64 -Xs "-march=avx" -DTARGET_CPU $(OPTFLAGS) $(IFLAGS) $< -o $@; \ + elif lscpu | grep -q 'sse4.2'; then \ + echo "Using sse4.2"; \ + dpcpp $(CXXFLAGS) -fsycl -fsycl-targets=spir64_x86_64 -Xs "-march=sse4.2" -DTARGET_CPU $(OPTFLAGS) $(IFLAGS) $< -o $@; \ + else \ + echo "Can't AOT compile using avx, avx2, avx512 or sse4.2"; \ + fi + +aot_cpu: bench/aot_cpu.out + ./$< + +# attempt to AOT compile both kernels, when targeting Intel GPUs +bench/aot_gpu.out: bench/harpocrates_parallel.cpp include/*.hpp + # you may want to replace `device` identifier with `0x3e96` if you're targeting *Intel(R) UHD Graphics P630* + # + # otherwise, let it be what it's when you're targeting *Intel(R) Iris(R) Xe MAX Graphics* + dpcpp $(CXXFLAGS) -fsycl -fsycl-targets=spir64_gen -Xs "-device 0x4905" -DTARGET_GPU $(OPTFLAGS) $(IFLAGS) $< -o $@ + +aot_gpu: bench/aot_gpu.out + ./$< + +# target CUDA for benchmarking data-parallel Harpocrates cipher +bench/harpocrates_parallel.cuda.out: bench/harpocrates_parallel.cpp include/*.hpp + clang++ $(CXXFLAGS) -fsycl -fsycl-targets=nvptx64-nvidia-cuda -DTARGET_GPU $(OPTFLAGS) $(IFLAGS) $< -o $@ + +cuda: bench/harpocrates_parallel.cuda.out ./$< diff --git a/bench/harpocrates.cpp b/bench/harpocrates.cpp new file mode 100644 index 0000000..bd78896 --- /dev/null +++ b/bench/harpocrates.cpp @@ -0,0 +1,8 @@ +#include "bench_harpocrates.hpp" + +// register function for benchmarking +BENCHMARK(harpocrates_encrypt); +BENCHMARK(harpocrates_decrypt); + +// main function to make it executable +BENCHMARK_MAIN(); diff --git a/bench/harpocrates_parallel.cpp b/bench/harpocrates_parallel.cpp new file mode 100644 index 0000000..f093913 --- /dev/null +++ b/bench/harpocrates_parallel.cpp @@ -0,0 +1,76 @@ +#include "bench_harpocrates_parallel.hpp" +#include "table.hpp" + +int +main() +{ +#if defined TARGET_CPU + sycl::cpu_selector s{}; +#pragma message("Targeting default CPU at run-time") +#elif defined TARGET_GPU + sycl::gpu_selector s{}; +#pragma message("Targeting default GPU at run-time") +#else + sycl::default_selector s{}; +#pragma message("Targeting default Accelerator at run-time") +#endif + + sycl::device d{ s }; + sycl::context c{ d }; + sycl::queue q{ c, d, sycl::property::queue::enable_profiling{} }; + + constexpr size_t min_wi_cnt = 1ul << 20; + constexpr size_t max_wi_cnt = 1ul << 26; + constexpr size_t wg_size = 32ul; + + std::cout << "Running on " << d.get_info() + << std::endl + << std::endl; + + TextTable tbl('-', '|', '+'); + + tbl.add("# -of work-items"); + tbl.add("kernel name"); + tbl.add("input size ( bytes )"); + tbl.add("output size ( bytes )"); + tbl.add("host-to-device b/w"); + tbl.add("kernel b/w"); + tbl.add("device-to-host b/w"); + tbl.endOfRow(); + + for (size_t wi = min_wi_cnt; wi <= max_wi_cnt; wi <<= 1) { + const auto ret = bench_harpocrates_parallel_encrypt_decrypt(q, wi, wg_size); + + const auto enc_kernel = ret[0]; + const auto dec_kernel = ret[1]; + + tbl.add(std::to_string(wi)); + tbl.add("Harpocrates Encrypt"); + tbl.add(to_readable_data_amount(enc_kernel.h2d_tx)); + tbl.add(to_readable_data_amount(enc_kernel.d2h_tx)); + tbl.add(to_readable_bandwidth(enc_kernel.h2d_tx, enc_kernel.h2d_tx_tm)); + tbl.add(to_readable_bandwidth(enc_kernel.h2d_tx - 256, enc_kernel.exec_tm)); + tbl.add(to_readable_bandwidth(enc_kernel.d2h_tx, enc_kernel.d2h_tx_tm)); + tbl.endOfRow(); + + tbl.add(std::to_string(wi)); + tbl.add("Harpocrates Decrypt"); + tbl.add(to_readable_data_amount(dec_kernel.h2d_tx)); + tbl.add(to_readable_data_amount(dec_kernel.d2h_tx)); + tbl.add(to_readable_bandwidth(dec_kernel.h2d_tx, dec_kernel.h2d_tx_tm)); + tbl.add(to_readable_bandwidth(dec_kernel.h2d_tx - 256, dec_kernel.exec_tm)); + tbl.add(to_readable_bandwidth(dec_kernel.d2h_tx, dec_kernel.d2h_tx_tm)); + tbl.endOfRow(); + } + + tbl.setAlignment(1, TextTable::Alignment::RIGHT); + tbl.setAlignment(2, TextTable::Alignment::RIGHT); + tbl.setAlignment(3, TextTable::Alignment::RIGHT); + tbl.setAlignment(4, TextTable::Alignment::RIGHT); + tbl.setAlignment(5, TextTable::Alignment::RIGHT); + tbl.setAlignment(6, TextTable::Alignment::RIGHT); + + std::cout << tbl; + + return EXIT_SUCCESS; +} diff --git a/bench/main.cpp b/include/bench_harpocrates.hpp similarity index 94% rename from bench/main.cpp rename to include/bench_harpocrates.hpp index 0e1d7a2..345e634 100644 --- a/bench/main.cpp +++ b/include/bench_harpocrates.hpp @@ -1,3 +1,4 @@ +#pragma once #include "harpocrates.hpp" #include "utils.hpp" #include @@ -91,10 +92,3 @@ harpocrates_decrypt(benchmark::State& state) std::free(enc); std::free(dec); } - -// register function for benchmarking -BENCHMARK(harpocrates_encrypt); -BENCHMARK(harpocrates_decrypt); - -// main function to make it executable -BENCHMARK_MAIN(); diff --git a/include/bench_harpocrates_parallel.hpp b/include/bench_harpocrates_parallel.hpp new file mode 100644 index 0000000..adba949 --- /dev/null +++ b/include/bench_harpocrates_parallel.hpp @@ -0,0 +1,181 @@ +#pragma once +#include "harpocrates_parallel.hpp" +#include "utils.hpp" +#include + +constexpr double GB = 1073741824.; // 1 << 30 bytes +constexpr double MB = 1048576.; // 1 << 20 bytes +constexpr double KB = 1024.; // 1 << 10 bytes + +// Data structure for holding +// +// i) host -> device input data transfer time +// ii) bytes of data transferred from host -> device +// iii) SYCL kernel execution time +// iv) device -> host output data transfer time +// v) bytes of data brought back to host from device +// +// where kernel in question can be Harpocrates encrypt/ decrypt routine, +// offloaded to accelerator device. +struct sycl_benchmark_t +{ + uint64_t h2d_tx_tm; // host -> device input data tx time ( ns ) + size_t h2d_tx; // bytes of data transferred from host -> device ( bytes ) + uint64_t exec_tm; // SYCL kernel execution time ( ns ) + uint64_t d2h_tx_tm; // device -> host output data tx time ( ns ) + size_t d2h_tx; // bytes of data transferred from device -> host ( bytes ) +}; + +// Time execution of SYCL command, whose submission resulted into supplied SYCL +// event +// +// Ensure where SYCL command was submitted, that queue has profiling enabled ! +static inline uint64_t +time_event(sycl::event& e) +{ + using command = sycl::info::event_profiling; + + sycl::cl_ulong t0 = e.get_profiling_info(); + sycl::cl_ulong t1 = e.get_profiling_info(); + + return static_cast(t1 - t0); +} + +// Convert how many bytes processed in how long timespan ( given in nanosecond +// level granularity ) to more human digestable format ( i.e. GB/ s or MB/ s or +// KB/ s or B/ s ) +static inline const std::string +to_readable_bandwidth(const size_t bytes, // bytes + const uint64_t ts // nanoseconds +) +{ + const double bytes_ = static_cast(bytes); + const double ts_ = static_cast(ts) * 1e-9; // seconds + const double bps = bytes_ / ts_; // bytes/ sec + + return bps >= GB ? (std::to_string(bps / GB) + " GB/ s") + : bps >= MB ? (std::to_string(bps / MB) + " MB/ s") + : bps >= KB ? (std::to_string(bps / KB) + " KB/ s") + : (std::to_string(bps) + " B/ s"); +} + +// Convert given number of bytes to more readable form such as GB, MB, KB or B +static inline const std::string +to_readable_data_amount(const size_t bytes) +{ + const double bytes_ = static_cast(bytes); + + return bytes_ >= GB ? (std::to_string(bytes_ / GB) + " GB") + : bytes_ >= MB ? (std::to_string(bytes_ / MB) + " MB") + : bytes_ >= KB ? (std::to_string(bytes_ / KB) + " KB") + : (std::to_string(bytes_) + " B"); +} + +// Benchmark execution of data-parallel Harpocrates encrypt/ decrypt kernels, +// along with that keep track of host -> device & device -> host data transfer +// amount, time +// +// Note, in this routine, amount of data is calculated in bytes, while time is +// calculated in nanoseconds +// +// This function returns a vector of two elements, where first element holds +// benchmarked data for Harpocrates encrypt kernel, while second entry holds +// benchmarked data for Harpocrates decrypt kernel +static const std::vector +bench_harpocrates_parallel_encrypt_decrypt( + sycl::queue& q, // profiling enabled SYCL queue + const size_t wi_cnt, // # -of work-items to dispatch + const size_t wg_size // # -of work-items to group together +) +{ + // SYCL queue must have profiling enabled + assert(q.has_property()); + + const size_t ct_len = wi_cnt << 4; + + // allocate resources + uint8_t* lut_h = static_cast(sycl::malloc_host(256, q)); + uint8_t* ilut_h = static_cast(sycl::malloc_host(256, q)); + uint8_t* txt_h = static_cast(sycl::malloc_host(ct_len, q)); + uint8_t* enc_h = static_cast(sycl::malloc_host(ct_len, q)); + uint8_t* dec_h = static_cast(sycl::malloc_host(ct_len, q)); + + uint8_t* lut_d = static_cast(sycl::malloc_device(256, q)); + uint8_t* ilut_d = static_cast(sycl::malloc_device(256, q)); + uint8_t* txt_d = static_cast(sycl::malloc_device(ct_len, q)); + uint8_t* enc_d = static_cast(sycl::malloc_device(ct_len, q)); + uint8_t* dec_d = static_cast(sycl::malloc_device(ct_len, q)); + + harpocrates_utils::generate_lut(lut_h); + harpocrates_utils::generate_inv_lut(lut_h, ilut_h); + + random_data(txt_h, ct_len); + memset(enc_h, 0, ct_len); + memset(dec_h, 0, ct_len); + + using evt = sycl::event; + using evts = std::vector; + using namespace harpocrates_parallel; + + // host -> device data tx + evt e0 = q.memcpy(lut_d, lut_h, 256); + evt e1 = q.memcpy(ilut_d, ilut_h, 256); + evt e2 = q.memcpy(txt_d, txt_h, ct_len); + + evt e3 = q.memset(enc_d, 0, ct_len); + evt e4 = q.memset(dec_d, 0, ct_len); + + // dispatch encryption kernel + evts e5{ e0, e2, e3 }; + evt e6 = encrypt(q, lut_d, txt_d, enc_d, ct_len, wg_size, e5); + + // dispatch decryption kernel + evts e7{ e1, e4, e6 }; + evt e8 = decrypt(q, ilut_d, enc_d, dec_d, ct_len, wg_size, e7); + + // device -> host data tx + evt e9 = q.submit([&](sycl::handler& h) { + h.depends_on(e6); + h.memcpy(enc_h, enc_d, ct_len); + }); + evt e10 = q.submit([&](sycl::handler& h) { + h.depends_on(e8); + h.memcpy(dec_h, dec_d, ct_len); + }); + + evt e11 = q.ext_oneapi_submit_barrier({ e9, e10 }); + + // host sychronization + e11.wait(); + + // compare to check that encrypt -> decrypt on accelerator worked as expected + for (size_t i = 0; i < ct_len; i++) { + assert((txt_h[i] ^ dec_h[i]) == 0); + } + + // release memory resources + sycl::free(lut_h, q); + sycl::free(ilut_h, q); + sycl::free(txt_h, q); + sycl::free(enc_h, q); + sycl::free(dec_h, q); + + sycl::free(lut_d, q); + sycl::free(ilut_d, q); + sycl::free(txt_d, q); + sycl::free(enc_d, q); + sycl::free(dec_d, q); + + return { + { time_event(e0) + time_event(e2), + 256ul + ct_len, + time_event(e6), + time_event(e9), + ct_len }, // for encrypt kernel + { time_event(e1) + time_event(e2), + 256ul + ct_len, + time_event(e8), + time_event(e10), + ct_len } // for decrypt kernel + }; +} diff --git a/include/harpocrates_parallel.hpp b/include/harpocrates_parallel.hpp new file mode 100644 index 0000000..5ca8645 --- /dev/null +++ b/include/harpocrates_parallel.hpp @@ -0,0 +1,133 @@ +#pragma once +#include "harpocrates.hpp" +#include +#include + +// Data Parallel Harpocrates - An Efficient Parallel Encryption Mechanism for +// Data-at-rest, offloading to CPU & GPU using SYCL +// +// Read more about SYCL here +// https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html +namespace harpocrates_parallel { + +class kernelHarpocratesEncrypt; +class kernelHarpocratesDecrypt; + +sycl::event +encrypt(sycl::queue&, + const uint8_t* const __restrict, + const uint8_t* const __restrict, + uint8_t* const __restrict, + const size_t, + const size_t, + std::vector); + +sycl::event +decrypt(sycl::queue&, + const uint8_t* const __restrict, + const uint8_t* const __restrict, + uint8_t* const __restrict, + const size_t, + const size_t, + std::vector); + +// Given 256 -bytes look up table & N -bytes plain text, this routine offloads +// computation of encrypted byte slices to available accelerator device ( +// encapsulated in SYCL queue object ), producing N -many encrypted bytes. +// +// For encrypting each 16 -bytes slice, this routine makes use of Harpocrates +// cipher. Which is exactly why N must be evenly divisible by 16. +// +// Also note, dispatched work group size must be evenly dividing `ct_len >> 4`, +// so that each work-group has same number of active work-items +// +// For creating SYCL dependency graph, make use of last parameter & return type +// of this routine. +sycl::event +encrypt(sycl::queue& q, // SYCL queue + const uint8_t* const __restrict lut, // 256 -bytes look up table + const uint8_t* const __restrict txt, // plain text | len(txt) = N + uint8_t* const __restrict enc, // encrypted text | len(enc) = N + const size_t ct_len, // len(txt) = len(enc) = N + const size_t wg_size, // (ct_len >> 4) % wg_size == 0 + std::vector evts // SYCL events to wait for +) +{ + // ensure input plain text is of non-zero length + assert(ct_len > 0ul); + // ensure input plain text length is evenly divisible by 16, + // so that we can dispatch `ct_len >> 4` -many work-items + assert((ct_len & 15ul) == 0ul); + + // these many work-items to be dispatched so that each work-item + // can encrypt its portion of plain text independently + const size_t wi_cnt = ct_len >> 4; + + // all work-groups must have same #- of active work-items + assert(wi_cnt % wg_size == 0); + + return q.submit([&](sycl::handler& h) { + // create dependency graph + h.depends_on(evts); + + const auto rng = sycl::nd_range<1>{ wi_cnt, wg_size }; + h.parallel_for(rng, [=](sycl::nd_item<1> it) { + const size_t idx = it.get_global_linear_id(); + const size_t b_off = idx << 4; + + harpocrates::encrypt(lut, txt + b_off, enc + b_off); + }); + }); +} + +// Given 256 -bytes inverse look up table & N -bytes cipher text, this routine +// offloads computation of decrypted byte slices to available accelerator device +// ( encapsulated in SYCL queue object ), producing N -many decrypted bytes. +// +// For decrypting each 16 -bytes slice, this routine makes use of Harpocrates +// cipher. Which is exactly why N must be evenly divisible by 16. +// +// Also note, dispatched work group size must be evenly dividing `ct_len >> 4`, +// so that each work-group has same number of active work-items +// +// For creating SYCL dependency graph, make use of last parameter & return type +// of this routine. +sycl::event +decrypt( + sycl::queue& q, // SYCL queue + const uint8_t* const __restrict inv_lut, // 256 -bytes inverse look up table + const uint8_t* const __restrict enc, // encrypted bytes | len(enc) = N + uint8_t* const __restrict dec, // decrypted bytes | len(dec) = N + const size_t ct_len, // len(enc) = len(dec) = N + const size_t wg_size, // (ct_len >> 4) % wg_size == 0 + std::vector evts // SYCL events to wait for +) +{ + // ensure input cipher text is of non-zero length + assert(ct_len > 0ul); + // ensure input cipher text length is evenly divisible by 16, + // so that we can dispatch `ct_len >> 4` -many work-items + assert((ct_len & 15ul) == 0ul); + + // these many work-items to be dispatched so that each work-item + // can decrypt its portion of cipher text independently + const size_t wi_cnt = ct_len >> 4; + + // all work-groups must have same #- of active work-items + assert(wi_cnt % wg_size == 0); + + return q.submit([&](sycl::handler& h) { + // create dependency graph + h.depends_on(evts); + + const auto rng = sycl::nd_range<1>{ wi_cnt, wg_size }; + h.parallel_for(rng, [=](sycl::nd_item<1> it) { + const size_t idx = it.get_global_linear_id(); + const size_t b_off = idx << 4; + + harpocrates::decrypt(inv_lut, enc + b_off, dec + b_off); + }); + }); +} + +} diff --git a/include/table.hpp b/include/table.hpp new file mode 100644 index 0000000..82f9fad --- /dev/null +++ b/include/table.hpp @@ -0,0 +1,230 @@ +#pragma once + +// This file is copied from +// https://github.com/haarcuba/cpp-text-table/tree/f217b3d; while fixing some +// bugs which popped up due to enabling `-Werror -Weverything` +// +// Thanks to https://github.com/haarcuba for creating this minimalistic +// header-only library for printing tabular data onto console ! + +#include +#include +#include +#include +#include + +#ifdef TEXTTABLE_ENCODE_MULTIBYTE_STRINGS +#include +#ifndef TEXTTABLE_USE_EN_US_UTF8 +#define TEXTTABLE_USE_EN_US_UTF8 +#endif +#endif + +class TextTable +{ +public: + enum class Alignment + { + LEFT, + RIGHT + }; + typedef std::vector Row; + TextTable() + : _horizontal('-') + , _vertical('|') + , _corner('+') + , _has_ruler(true) + { + } + + TextTable(char horizontal, char vertical, char corner) + : _horizontal(horizontal) + , _vertical(vertical) + , _corner(corner) + , _has_ruler(true) + { + } + + explicit TextTable(char vertical) + : _horizontal('\0') + , _vertical(vertical) + , _corner('\0') + , _has_ruler(false) + { + } + + void setAlignment(unsigned i, Alignment alignment) + { + _alignment[i] = alignment; + } + + Alignment alignment(unsigned i) const { return _alignment[i]; } + + char vertical() const { return _vertical; } + + char horizontal() const { return _horizontal; } + + void add(const std::string& content) { _current.push_back(content); } + + void endOfRow() + { + _rows.push_back(_current); + _current.assign(0, ""); + } + + template + void addRow(Iterator begin, Iterator end) + { + for (auto i = begin; i != end; ++i) { + add(*i); + } + endOfRow(); + } + + template + void addRow(const Container& container) + { + addRow(container.begin(), container.end()); + } + + const std::vector& rows() const { return _rows; } + + void setup() const + { + determineWidths(); + setupAlignment(); + } + + std::string ruler() const + { + std::string result; + result += _corner; + for (auto width = _width.begin(); width != _width.end(); ++width) { + result += repeat(*width, _horizontal); + result += _corner; + } + + return result; + } + + int width(unsigned i) const { return static_cast(_width[i]); } + + bool has_ruler() const { return _has_ruler; } + + int correctDistance(const std::string& string_to_correct) const + { + return static_cast(string_to_correct.size()) - + static_cast(glyphLength(string_to_correct)); + } + +private: + const char _horizontal; + const char _vertical; + const char _corner; + const bool _has_ruler; + Row _current; + std::vector _rows; + std::vector mutable _width; + std::vector mutable _utf8width; + std::map mutable _alignment; + + static std::string repeat(unsigned times, char c) + { + std::string result; + for (; times > 0; --times) + result += c; + + return result; + } + + unsigned columns() const { return static_cast(_rows[0].size()); } + + unsigned glyphLength(const std::string& s) const + { + unsigned int _byteLength = static_cast(s.length()); +#ifdef TEXTTABLE_ENCODE_MULTIBYTE_STRINGS +#ifdef TEXTTABLE_USE_EN_US_UTF8 + std::setlocale(LC_ALL, "en_US.utf8"); +#else +#error You need to specify the encoding if the TextTable library uses multybyte string encoding! +#endif + unsigned int u = 0; + const char* c_str = s.c_str(); + unsigned _glyphLength = 0; + while (u < _byteLength) { + u += std::mblen(&c_str[u], _byteLength - u); + _glyphLength += 1; + } + return _glyphLength; +#else + return _byteLength; +#endif + } + + void determineWidths() const + { + if (_rows.empty()) { + return; + } + _width.assign(columns(), 0); + _utf8width.assign(columns(), 0); + for (auto rowIterator = _rows.begin(); rowIterator != _rows.end(); + ++rowIterator) { + Row const& row = *rowIterator; + for (unsigned i = 0; i < row.size(); ++i) { + _width[i] = + _width[i] > glyphLength(row[i]) ? _width[i] : glyphLength(row[i]); + } + } + } + + void setupAlignment() const + { + if (_rows.empty()) { + return; + } + for (unsigned i = 0; i < columns(); ++i) { + if (_alignment.find(i) == _alignment.end()) { + _alignment[i] = Alignment::LEFT; + } + } + } +}; + +inline std::ostream& +operator<<(std::ostream& stream, const TextTable& table) +{ + if (table.rows().empty()) { + return stream; + } + table.setup(); + if (table.has_ruler()) { + stream << table.ruler() << "\n"; + } + for (auto rowIterator = table.rows().begin(); + rowIterator != table.rows().end(); + ++rowIterator) { + TextTable::Row const& row = *rowIterator; + stream << table.vertical(); + for (unsigned i = 0; i < row.size(); ++i) { + auto alignment = table.alignment(i) == TextTable::Alignment::LEFT + ? std::left + : std::right; + // std::setw( width ) works as follows: a string which goes in the stream + // with byte length (!) l is filled with n spaces so that l+n=width. For a + // utf8 encoded string the glyph length g might be smaller than l. We need + // n spaces so that g+n=width which is equivalent to g+n+l-l=width ==> l+n + // = width+l-g l-g (that means glyph length minus byte length) has to be + // added to the width argument. l-g is computed by correctDistance. + stream << std::setw(table.width(i) + table.correctDistance(row[i])) + << alignment << row[i]; + stream << table.vertical(); + } + stream << "\n"; + if (table.has_ruler()) { + stream << table.ruler() << "\n"; + } + } + + return stream; +} diff --git a/include/test_harpocrates.hpp b/include/test_harpocrates.hpp index da8a339..7a851e7 100644 --- a/include/test_harpocrates.hpp +++ b/include/test_harpocrates.hpp @@ -4,7 +4,7 @@ #include // Tests functional correctness of Harpocrates cipher implementation -static inline void +static void test_harpocrates() { constexpr size_t ct_len = harpocrates_common::BLOCK_LEN; diff --git a/include/test_harpocrates_parallel.hpp b/include/test_harpocrates_parallel.hpp new file mode 100644 index 0000000..8ac5818 --- /dev/null +++ b/include/test_harpocrates_parallel.hpp @@ -0,0 +1,52 @@ +#pragma once +#include "harpocrates_parallel.hpp" +#include "utils.hpp" + +// Test functional correctness of data-parallel Harpocrates cipher +// implementation +static void +test_harpocrates_parallel(sycl::queue& q, + const size_t wi_cnt, + const size_t wg_size) +{ + const size_t ct_len = wi_cnt << 4; + + // allocate resources + uint8_t* lut = static_cast(sycl::malloc_shared(256, q)); + uint8_t* ilut = static_cast(sycl::malloc_shared(256, q)); + uint8_t* txt = static_cast(sycl::malloc_shared(ct_len, q)); + uint8_t* enc = static_cast(sycl::malloc_shared(ct_len, q)); + uint8_t* dec = static_cast(sycl::malloc_shared(ct_len, q)); + + // creation of (inverse) look up table is one time ! + // + // lut -> secret key when encrypting + // ilut -> secret key when decrypting + harpocrates_utils::generate_lut(lut); + harpocrates_utils::generate_inv_lut(lut, ilut); + + random_data(txt, ct_len); + + using evt = sycl::event; + using namespace harpocrates_parallel; + + // data-parallel encryption of a large byte array + evt e0 = encrypt(q, lut, txt, enc, ct_len, wg_size, {}); + // data-parallel decryption of encrypted byte array + evt e1 = decrypt(q, ilut, enc, dec, ct_len, wg_size, { e0 }); + + // host synchronization + e1.wait(); + + // do byte-by-byte comparison of decrypted bytes against original input bytes + for (size_t i = 0; i < ct_len; i++) { + assert((txt[i] ^ dec[i]) == 0); + } + + // release memory resources + sycl::free(lut, q); + sycl::free(ilut, q); + sycl::free(txt, q); + sycl::free(enc, q); + sycl::free(dec, q); +} diff --git a/results/cpu/intel.md b/results/cpu/intel.md new file mode 100644 index 0000000..0078c17 --- /dev/null +++ b/results/cpu/intel.md @@ -0,0 +1,209 @@ +# Benchmarking Harpocrates Cipher on Intel(R) Xeon(R) E-2176G CPU @ 3.70GHz + +Build & offload computation + +```bash +make aot_cpu +``` + +```bash +$ lscpu | grep -i cpu\(s\) # number -of CPUs to offload computation to + +CPU(s): 12 +On-line CPU(s) list: 0-11 +NUMA node0 CPU(s): 0-11 +``` + +```bash +Running on Intel(R) Xeon(R) E-2176G CPU @ 3.70GHz + ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|# -of work-items| kernel name|input size ( bytes )|output size ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|1048576 |Harpocrates Encrypt| 16.000244 MB| 16.000000 MB| 5.581951 GB/ s|144.205437 MB/ s| 6.777904 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|1048576 |Harpocrates Decrypt| 16.000244 MB| 16.000000 MB| 5.582035 GB/ s|138.363850 MB/ s| 9.932491 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|2097152 |Harpocrates Encrypt| 32.000244 MB| 32.000000 MB| 5.837728 GB/ s|142.967339 MB/ s| 6.554243 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|2097152 |Harpocrates Decrypt| 32.000244 MB| 32.000000 MB| 5.837727 GB/ s|147.584833 MB/ s| 10.092610 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|4194304 |Harpocrates Encrypt| 64.000244 MB| 64.000000 MB| 5.969808 GB/ s|144.459242 MB/ s| 9.595604 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|4194304 |Harpocrates Decrypt| 64.000244 MB| 64.000000 MB| 5.955587 GB/ s|142.983975 MB/ s| 10.062077 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|8388608 |Harpocrates Encrypt| 128.000244 MB| 128.000000 MB| 5.841922 GB/ s|146.445741 MB/ s| 415.322551 MB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|8388608 |Harpocrates Decrypt| 128.000244 MB| 128.000000 MB| 5.843310 GB/ s|146.885674 MB/ s| 10.046577 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|16777216 |Harpocrates Encrypt| 256.000244 MB| 256.000000 MB| 5.868078 GB/ s|147.465584 MB/ s| 1.756786 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|16777216 |Harpocrates Decrypt| 256.000244 MB| 256.000000 MB| 5.866426 GB/ s|147.073815 MB/ s| 10.050817 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|33554432 |Harpocrates Encrypt| 512.000244 MB| 512.000000 MB| 5.833020 GB/ s|147.855008 MB/ s| 10.556802 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|33554432 |Harpocrates Decrypt| 512.000244 MB| 512.000000 MB| 5.833023 GB/ s|147.324569 MB/ s| 8.423482 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|67108864 |Harpocrates Encrypt| 1.000000 GB| 1.000000 GB| 9.717002 GB/ s|147.982074 MB/ s| 10.430648 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|67108864 |Harpocrates Decrypt| 1.000000 GB| 1.000000 GB| 9.717006 GB/ s|148.287912 MB/ s| 13.519199 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +``` + +# Benchmarking Harpocrates Cipher on Intel(R) Xeon(R) Gold 6128 CPU @ 3.40GHz + +Build & offload computation + +```bash +make aot_cpu +``` + +```bash +$ lscpu | grep -i cpu\(s\) # number -of CPUs to offload computation to + +CPU(s): 24 +On-line CPU(s) list: 0-23 +NUMA node0 CPU(s): 0-5,12-17 +NUMA node1 CPU(s): 6-11,18-23 +``` + +```bash +Running on Intel(R) Xeon(R) Gold 6128 CPU @ 3.40GHz + ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|# -of work-items| kernel name|input size ( bytes )|output size ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|1048576 |Harpocrates Encrypt| 16.000244 MB| 16.000000 MB| 2.685523 GB/ s|280.134188 MB/ s| 1.604813 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|1048576 |Harpocrates Decrypt| 16.000244 MB| 16.000000 MB| 2.686198 GB/ s|335.021444 MB/ s| 7.026786 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|2097152 |Harpocrates Encrypt| 32.000244 MB| 32.000000 MB| 2.457648 GB/ s|322.606821 MB/ s| 872.713531 MB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|2097152 |Harpocrates Decrypt| 32.000244 MB| 32.000000 MB| 2.457858 GB/ s|344.721595 MB/ s| 4.324565 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|4194304 |Harpocrates Encrypt| 64.000244 MB| 64.000000 MB| 2.800197 GB/ s|345.170706 MB/ s| 1.537947 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|4194304 |Harpocrates Decrypt| 64.000244 MB| 64.000000 MB| 2.801021 GB/ s|343.456311 MB/ s| 5.074526 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|8388608 |Harpocrates Encrypt| 128.000244 MB| 128.000000 MB| 5.619942 GB/ s|350.723255 MB/ s| 1.674531 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|8388608 |Harpocrates Decrypt| 128.000244 MB| 128.000000 MB| 5.622227 GB/ s|341.458389 MB/ s| 4.306626 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|16777216 |Harpocrates Encrypt| 256.000244 MB| 256.000000 MB| 3.847992 GB/ s|351.082332 MB/ s| 5.598815 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|16777216 |Harpocrates Decrypt| 256.000244 MB| 256.000000 MB| 3.848138 GB/ s|344.682509 MB/ s| 7.839837 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|33554432 |Harpocrates Encrypt| 512.000244 MB| 512.000000 MB| 3.098571 GB/ s|351.145309 MB/ s| 795.833972 MB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|33554432 |Harpocrates Decrypt| 512.000244 MB| 512.000000 MB| 3.098692 GB/ s|328.937738 MB/ s| 6.688548 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|67108864 |Harpocrates Encrypt| 1.000000 GB| 1.000000 GB| 3.587754 GB/ s|351.158693 MB/ s| 2.567868 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|67108864 |Harpocrates Decrypt| 1.000000 GB| 1.000000 GB| 3.587855 GB/ s|344.215876 MB/ s| 6.021415 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +``` + +# Benchmarking Harpocrates Cipher on Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz + +Build & offload computation + +```bash +make aot_cpu +``` + +```bash +$ lscpu | grep -i cpu\(s\) # number -of CPUs to offload computation to + +CPU(s): 128 +On-line CPU(s) list: 0-127 +NUMA node0 CPU(s): 0-31,64-95 +NUMA node1 CPU(s): 32-63,96-127 +``` + +```bash +Running on Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz + ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|# -of work-items| kernel name|input size ( bytes )|output size ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|1048576 |Harpocrates Encrypt| 16.000244 MB| 16.000000 MB| 2.126565 GB/ s|546.455497 MB/ s| 3.417184 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|1048576 |Harpocrates Decrypt| 16.000244 MB| 16.000000 MB| 2.124579 GB/ s| 1.067024 GB/ s| 42.766273 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|2097152 |Harpocrates Encrypt| 32.000244 MB| 32.000000 MB| 2.193323 GB/ s|936.283837 MB/ s| 9.254938 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|2097152 |Harpocrates Decrypt| 32.000244 MB| 32.000000 MB| 2.194508 GB/ s| 1.292593 GB/ s| 45.618240 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|4194304 |Harpocrates Encrypt| 64.000244 MB| 64.000000 MB| 2.528398 GB/ s| 1.221785 GB/ s| 2.974330 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|4194304 |Harpocrates Decrypt| 64.000244 MB| 64.000000 MB| 2.528379 GB/ s| 1.299065 GB/ s| 46.791104 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|8388608 |Harpocrates Encrypt| 128.000244 MB| 128.000000 MB| 2.007479 GB/ s| 1.325323 GB/ s| 8.727911 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|8388608 |Harpocrates Decrypt| 128.000244 MB| 128.000000 MB| 2.007347 GB/ s| 1.261161 GB/ s| 45.413838 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|16777216 |Harpocrates Encrypt| 256.000244 MB| 256.000000 MB| 3.257346 GB/ s| 1.332674 GB/ s| 5.114467 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|16777216 |Harpocrates Decrypt| 256.000244 MB| 256.000000 MB| 3.257870 GB/ s| 1.285063 GB/ s| 48.416781 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|33554432 |Harpocrates Encrypt| 512.000244 MB| 512.000000 MB| 5.194298 GB/ s| 1.329149 GB/ s| 4.907913 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|33554432 |Harpocrates Decrypt| 512.000244 MB| 512.000000 MB| 5.194548 GB/ s| 1.284927 GB/ s| 53.109520 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|67108864 |Harpocrates Encrypt| 1.000000 GB| 1.000000 GB| 5.916689 GB/ s| 1.336834 GB/ s| 3.461926 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|67108864 |Harpocrates Decrypt| 1.000000 GB| 1.000000 GB| 5.916769 GB/ s| 1.317116 GB/ s| 40.522305 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +``` + +# Benchmarking Harpocrates Cipher on Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz + +Build & offload computation + +```bash +make aot_cpu +``` + +```bash +$ lscpu | grep -i cpu\(s\) # number -of CPUs to offload computation to + +CPU(s): 4 +On-line CPU(s) list: 0-3 +NUMA node0 CPU(s): 0-3 +``` + +```bash +Running on Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz + ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|# -of work-items| kernel name|input size ( bytes )|output size ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|1048576 |Harpocrates Encrypt| 16.000244 MB| 16.000000 MB| 1.046195 GB/ s|35.569170 MB/ s| 3.073119 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|1048576 |Harpocrates Decrypt| 16.000244 MB| 16.000000 MB| 1.046195 GB/ s|36.635905 MB/ s| 7.246958 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|2097152 |Harpocrates Encrypt| 32.000244 MB| 32.000000 MB| 2.071121 GB/ s|35.372593 MB/ s| 3.027706 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|2097152 |Harpocrates Decrypt| 32.000244 MB| 32.000000 MB| 2.071165 GB/ s|36.536850 MB/ s| 7.001228 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|4194304 |Harpocrates Encrypt| 64.000244 MB| 64.000000 MB| 1.862519 GB/ s|35.321218 MB/ s| 4.061408 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|4194304 |Harpocrates Decrypt| 64.000244 MB| 64.000000 MB| 1.862345 GB/ s|36.545116 MB/ s| 7.350173 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|8388608 |Harpocrates Encrypt| 128.000244 MB| 128.000000 MB| 1.151175 GB/ s|35.422106 MB/ s| 3.453035 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|8388608 |Harpocrates Decrypt| 128.000244 MB| 128.000000 MB| 1.151171 GB/ s|36.374367 MB/ s| 7.365782 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|16777216 |Harpocrates Encrypt| 256.000244 MB| 256.000000 MB| 1.233114 GB/ s|35.502835 MB/ s| 7.205230 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|16777216 |Harpocrates Decrypt| 256.000244 MB| 256.000000 MB| 1.233124 GB/ s|36.568232 MB/ s| 7.564049 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|33554432 |Harpocrates Encrypt| 512.000244 MB| 512.000000 MB| 1.194326 GB/ s|35.459500 MB/ s| 3.726671 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|33554432 |Harpocrates Decrypt| 512.000244 MB| 512.000000 MB| 1.194326 GB/ s|36.711025 MB/ s| 7.564547 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|67108864 |Harpocrates Encrypt| 1.000000 GB| 1.000000 GB| 1.123413 GB/ s|35.567742 MB/ s| 7.193730 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|67108864 |Harpocrates Decrypt| 1.000000 GB| 1.000000 GB| 1.123413 GB/ s|36.564341 MB/ s| 7.562630 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +``` diff --git a/results/gpu/intel.md b/results/gpu/intel.md new file mode 100644 index 0000000..ee1f374 --- /dev/null +++ b/results/gpu/intel.md @@ -0,0 +1,43 @@ +# Benchmarking Harpocrates Cipher on Intel(R) UHD Graphics P630 + +Build & offload computation + +```bash +make aot_gpu # change device identifier argument to `0x3e96` +``` + +```bash +Running on Intel(R) UHD Graphics P630 [0x3e96] + ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|# -of work-items| kernel name|input size ( bytes )|output size ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|1048576 |Harpocrates Encrypt| 16.000244 MB| 16.000000 MB| 8.597729 GB/ s|267.765881 MB/ s| 16.256737 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|1048576 |Harpocrates Decrypt| 16.000244 MB| 16.000000 MB| 8.598515 GB/ s|268.110738 MB/ s| 16.415505 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|2097152 |Harpocrates Encrypt| 32.000244 MB| 32.000000 MB| 8.695757 GB/ s|268.338397 MB/ s| 15.930694 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|2097152 |Harpocrates Decrypt| 32.000244 MB| 32.000000 MB| 8.696962 GB/ s|268.594882 MB/ s| 16.897317 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|4194304 |Harpocrates Encrypt| 64.000244 MB| 64.000000 MB| 8.679487 GB/ s|269.241631 MB/ s| 16.855711 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|4194304 |Harpocrates Decrypt| 64.000244 MB| 64.000000 MB| 8.688099 GB/ s|269.201400 MB/ s| 16.876867 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|8388608 |Harpocrates Encrypt| 128.000244 MB| 128.000000 MB| 8.694904 GB/ s|269.614622 MB/ s| 16.771990 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|8388608 |Harpocrates Decrypt| 128.000244 MB| 128.000000 MB| 8.694703 GB/ s|269.745347 MB/ s| 16.876489 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|16777216 |Harpocrates Encrypt| 256.000244 MB| 256.000000 MB| 8.801711 GB/ s|269.672235 MB/ s| 17.005500 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|16777216 |Harpocrates Decrypt| 256.000244 MB| 256.000000 MB| 8.803615 GB/ s|269.444002 MB/ s| 16.860051 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|33554432 |Harpocrates Encrypt| 512.000244 MB| 512.000000 MB| 9.204012 GB/ s|269.588275 MB/ s| 16.980957 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|33554432 |Harpocrates Decrypt| 512.000244 MB| 512.000000 MB| 9.205249 GB/ s|269.092258 MB/ s| 16.951713 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|67108864 |Harpocrates Encrypt| 1.000000 GB| 1.000000 GB| 10.123275 GB/ s|269.862527 MB/ s| 17.038062 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|67108864 |Harpocrates Decrypt| 1.000000 GB| 1.000000 GB| 10.123922 GB/ s|269.396075 MB/ s| 15.748901 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +``` diff --git a/results/gpu/nvidia.md b/results/gpu/nvidia.md new file mode 100644 index 0000000..a41158e --- /dev/null +++ b/results/gpu/nvidia.md @@ -0,0 +1,43 @@ +# Benchmarking Harpocrates Cipher on Nvidia Tesla V100 + +Build & offload computation + +```bash +make cuda +``` + +```bash +Running on Tesla V100-SXM2-16GB + ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|# -of work-items| kernel name|input size ( bytes )|output size ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|1048576 |Harpocrates Encrypt| 16.000244 MB| 16.000000 MB| 9.426423 GB/ s|10.351960 GB/ s| 11.581023 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|1048576 |Harpocrates Decrypt| 16.000244 MB| 16.000000 MB| 9.426810 GB/ s|10.494332 GB/ s| 11.638742 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|2097152 |Harpocrates Encrypt| 32.000244 MB| 32.000000 MB| 10.215561 GB/ s|10.458480 GB/ s| 11.648543 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|2097152 |Harpocrates Decrypt| 32.000244 MB| 32.000000 MB| 10.226479 GB/ s|10.618554 GB/ s| 11.646025 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|4194304 |Harpocrates Encrypt| 64.000244 MB| 64.000000 MB| 10.291292 GB/ s|10.510542 GB/ s| 11.664863 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|4194304 |Harpocrates Decrypt| 64.000244 MB| 64.000000 MB| 10.296363 GB/ s|10.664890 GB/ s| 11.632136 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|8388608 |Harpocrates Encrypt| 128.000244 MB| 128.000000 MB| 10.312413 GB/ s|10.537798 GB/ s| 11.670979 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|8388608 |Harpocrates Decrypt| 128.000244 MB| 128.000000 MB| 10.314698 GB/ s|10.697646 GB/ s| 11.668983 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|16777216 |Harpocrates Encrypt| 256.000244 MB| 256.000000 MB| 10.328942 GB/ s|10.550175 GB/ s| 11.677900 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|16777216 |Harpocrates Decrypt| 256.000244 MB| 256.000000 MB| 10.334467 GB/ s|10.708384 GB/ s| 11.677101 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|33554432 |Harpocrates Encrypt| 512.000244 MB| 512.000000 MB| 10.326854 GB/ s|11.640994 GB/ s| 11.680031 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|33554432 |Harpocrates Decrypt| 512.000244 MB| 512.000000 MB| 10.329666 GB/ s|11.816157 GB/ s| 11.683763 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|67108864 |Harpocrates Encrypt| 1.000000 GB| 1.000000 GB| 10.332478 GB/ s|12.360134 GB/ s| 11.648277 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +|67108864 |Harpocrates Decrypt| 1.000000 GB| 1.000000 GB| 10.333625 GB/ s|12.549789 GB/ s| 11.680031 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ +``` diff --git a/test/main.cpp b/test/main.cpp index 9217639..7d6a786 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -3,6 +3,11 @@ #include #include +// For testing data-parallel Harpocrates cipher implementation, using SYCL +#if defined TEST_SYCL +#include "test_harpocrates_parallel.hpp" +#endif + int main() { @@ -19,5 +24,28 @@ main() std::cout << "[test] Harpocrates random encrypt -> decrypt works !" << std::endl; +// For testing data-parallel Harpocrates cipher implementation, using SYCL +#if defined TEST_SYCL + +#if defined TARGET_CPU + sycl::cpu_selector s{}; +#pragma message("Targeting default CPU at run-time") +#elif defined TARGET_GPU + sycl::gpu_selector s{}; +#pragma message("Targeting default GPU at run-time") +#else + sycl::default_selector s{}; +#pragma message("Targeting default Accelerator at run-time") +#endif + + sycl::device d{ s }; + sycl::context c{ d }; + sycl::queue q{ c, d }; + + test_harpocrates_parallel(q, 1024ul, 32ul); + std::cout << "[test] Data-parallel Harpocrates cipher works !" << std::endl; + +#endif + return EXIT_SUCCESS; }