Skip to content

Commit

Permalink
Merge pull request #27 from scylla-zpp-blas/benchmark
Browse files Browse the repository at this point in the history
- Adding identifier type (`scylla_blas::id_t`), as requested by @jbhayven 
- Renamed index_type to index_t
- BLOCK_SIZE is no longer global; it is now stored per matrix
- Refactored matrix and vector classes (field/method order, driver usage, added nonstatic resize, etc.)
- Fixed a bug in tests: they were calling abs instead of std::abs (abs works on integers)
- Worker concurrency is no longer global; it is now stored per scheduler. The same applies to the scheduler sleep time.
- Worker sleep time and max retries are now settable with command line arguments.
- Sleeps now have microsecond accuracy.
- Implemented a simple benchmark program
- Performance optimizations
- Many other changes introduced during benchmarking
  • Loading branch information
Lorak-mmk authored Jun 14, 2021
2 parents d18a702 + 7a648f4 commit 823fe40
Show file tree
Hide file tree
Showing 57 changed files with 2,595 additions and 875 deletions.
2 changes: 2 additions & 0 deletions .idea/cmake.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 15 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,24 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()

option(BUILD_EXAMPLES "Should examples be built" ON)
IF(CMAKE_BUILD_TYPE STREQUAL "Debug")
option(BUILD_BENCHMARK "Should benchmarks be built" ON)

if(CMAKE_BUILD_TYPE STREQUAL "Debug")
option(BUILD_TESTS "Should the tests be built" ON)
set(SCYLLA_BLAS_LOGLEVEL "DEBUG" CACHE STRING "Scylla blas loglevel. Possible values: TRACE DEBUG INFO WARN ERROR CRITICAL")
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE)
ELSE() # Release mode
else() # Release mode
option(BUILD_TESTS "Should the tests be built" OFF)
set(SCYLLA_BLAS_LOGLEVEL "INFO" CACHE STRING "Scylla blas loglevel. Possible values: TRACE DEBUG INFO WARN ERROR CRITICAL")
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
ENDIF()
endif()

message("Scylla blas: build type ${CMAKE_BUILD_TYPE}")
message("Scylla blas configuration:")
message("BUILD_EXAMPLES ${BUILD_EXAMPLES}")
message("BUILD_BENCHMARK ${BUILD_BENCHMARK}")
message("BUILD_TESTS ${BUILD_TESTS}")
message("SCYLLA_BLAS_LOGLEVEL ${SCYLLA_BLAS_LOGLEVEL}")
message("CMAKE_INTERPROCEDURAL_OPTIMIZATION ${CMAKE_INTERPROCEDURAL_OPTIMIZATION}")
Expand All @@ -37,15 +42,15 @@ message("CMAKE_POSITION_INDEPENDENT_CODE ${CMAKE_POSITION_INDEPENDENT_CODE}")

set(SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/scylla_blas)
set(BLAS_SRC
${SRC_DIR}/blas_generic.cc
${SRC_DIR}/blas_level_1.cc
${SRC_DIR}/blas_level_2.cc
${SRC_DIR}/blas_level_3.cc
${SRC_DIR}/blaslike.cc
${SRC_DIR}/matrix.cc
${SRC_DIR}/vector.cc
${SRC_DIR}/queue/scylla_queue.cc
${SRC_DIR}/queue/worker_proc.cc
)
)

set(INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include/scylla_blas)
set(BLAS_INCLUDE
Expand All @@ -68,7 +73,7 @@ set(BLAS_INCLUDE
${INCLUDE_DIR}/utils/utils.hh)

add_library(scylla_blas SHARED "${BLAS_SRC}" "${BLAS_INCLUDE}")
target_include_directories(scylla_blas PUBLIC ${CMAKE_SOURCE_DIR}/include)
target_include_directories(scylla_blas PUBLIC ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/matrix_generators)
target_link_libraries(scylla_blas PUBLIC scylla_modern_cpp_driver fmt::fmt)
target_compile_features(scylla_blas PUBLIC cxx_std_20)
target_compile_definitions(scylla_blas PUBLIC SCYLLA_BLAS_LOGLEVEL=${SCYLLA_BLAS_LOGLEVEL})
Expand All @@ -88,3 +93,7 @@ endif ()
if (BUILD_EXAMPLES)
add_subdirectory(examples)
endif ()

if (BUILD_BENCHMARK)
add_subdirectory(benchmark)
endif ()
9 changes: 9 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Source files (headers included) of the main benchmark driver.
set(BENCHMARK_SRC
    main.cc
    benchmark.hh
    benchmark.cc
    const.hh)

# Main benchmark executable: links against scylla_blas and Boost and also
# sees the headers under <source root>/matrix_generators.
add_executable(scylla_blas_benchmark "${BENCHMARK_SRC}")
target_include_directories(scylla_blas_benchmark
    PUBLIC "${Boost_INCLUDE_DIRS}")
target_link_libraries(scylla_blas_benchmark
    PUBLIC scylla_blas "${Boost_LIBRARIES}")
target_include_directories(scylla_blas_benchmark
    PRIVATE "${CMAKE_SOURCE_DIR}/matrix_generators")

# Separate single-source executable; only needs scylla_blas.
add_executable(insert_benchmark insert_benchmark.cc)
target_link_libraries(insert_benchmark PUBLIC scylla_blas)
181 changes: 181 additions & 0 deletions benchmark/benchmark.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#include <chrono>

#include <scylla_blas/matrix.hh>
#include <scylla_blas/vector.hh>

#include "const.hh"
#include "benchmark.hh"

// Matrix * Matrix

void benchmark_mm::init() {
scylla_blas::matrix<float>::init(session, l_matrix_id, 0, 0, true);
scylla_blas::matrix<float>::init(session, r_matrix_id, 0, 0, true);
scylla_blas::matrix<float>::init(session, w_matrix_id, 0, 0, true);
}

void benchmark_mm::setup(int64_t block_size, int64_t length) {
left_matrix = std::make_unique<scylla_blas::matrix<float>>(session, l_matrix_id);
right_matrix = std::make_unique<scylla_blas::matrix<float>>(session, r_matrix_id);
result_matrix = std::make_unique<scylla_blas::matrix<float>>(session, w_matrix_id);

left_matrix->resize(length, length);
right_matrix->resize(length, length);
result_matrix->resize(length, length);

left_matrix->set_block_size(block_size);
right_matrix->set_block_size(block_size);
result_matrix->set_block_size(block_size);

scheduler.srmgen(matrix_load, *right_matrix);
scheduler.srmgen(matrix_load, *left_matrix);
}

/// Measured step: a single scheduler.sgemm call on the prepared matrices,
/// with no transposition, the scalars 1.0 and 0.0, and result_matrix as the
/// last argument.
/// NOTE(review): presumably follows the BLAS SGEMM contract
/// (C = alpha * A * B + beta * C) -- confirm against routines.hh.
void benchmark_mm::proc() {
    auto &lhs = *left_matrix;
    auto &rhs = *right_matrix;
    auto &out = *result_matrix;

    scheduler.sgemm(scylla_blas::NoTrans, scylla_blas::NoTrans, 1.0, lhs, rhs, 0.0, out);
}

/// Calls clear_all() on the left, right and result matrices, in that order.
/// The handles must have been created by setup() beforehand.
void benchmark_mm::teardown() {
    const auto wipe = [](auto &handle) { handle->clear_all(); };

    wipe(left_matrix);
    wipe(right_matrix);
    wipe(result_matrix);
}

void benchmark_mm::destroy() {
scylla_blas::matrix<float>::drop(session, l_matrix_id);
scylla_blas::matrix<float>::drop(session, r_matrix_id);
scylla_blas::matrix<float>::drop(session, w_matrix_id);
}

// Matrix * Vector

void benchmark_mv::init() {
scylla_blas::matrix<float>::init(session, l_matrix_id, 0, 0, true);
scylla_blas::vector<float>::init(session, r_vector_id, 0, true);
scylla_blas::vector<float>::init(session, w_vector_id, 0, true);
}

void benchmark_mv::setup(int64_t block_size, int64_t length) {
left_matrix = std::make_unique<scylla_blas::matrix<float>>(session, l_matrix_id);
right_vector = std::make_unique<scylla_blas::vector<float>>(session, r_vector_id);
result_vector = std::make_unique<scylla_blas::vector<float>>(session, w_vector_id);

left_matrix->resize(length, length);
right_vector->resize(length);
result_vector->resize(length);

left_matrix->set_block_size(block_size);
right_vector->set_block_size(block_size);
result_vector->set_block_size(block_size);

scheduler.srmgen(matrix_load, *left_matrix);
fill_vector(*right_vector, length);
}

/// Measured step: a single scheduler.sgemv call with no transposition, the
/// scalars 1.0 and 0.0, and result_vector as the last argument.
/// NOTE(review): presumably follows the BLAS SGEMV contract
/// (y = alpha * A * x + beta * y) -- confirm against routines.hh.
void benchmark_mv::proc() {
    auto &mat = *left_matrix;
    auto &operand = *right_vector;
    auto &out = *result_vector;

    scheduler.sgemv(scylla_blas::NoTrans, 1.0, mat, operand, 0.0, out);
}

/// Calls clear_all() on the matrix, the operand vector and the result vector,
/// in that order. The handles must have been created by setup() beforehand.
void benchmark_mv::teardown() {
    const auto wipe = [](auto &handle) { handle->clear_all(); };

    wipe(left_matrix);
    wipe(right_vector);
    wipe(result_vector);
}

void benchmark_mv::destroy() {
scylla_blas::matrix<float>::drop(session, l_matrix_id);
scylla_blas::vector<float>::drop(session, r_vector_id);
scylla_blas::vector<float>::drop(session, w_vector_id);
}

// Vector * Vector

void benchmark_vv::init() {
scylla_blas::vector<float>::init(session, l_vector_id, 0, true);
scylla_blas::vector<float>::init(session, r_vector_id, 0, true);
}

void benchmark_vv::setup(int64_t block_size, int64_t length) {
left_vector = std::make_unique<scylla_blas::vector<float>>(session, l_vector_id);
right_vector = std::make_unique<scylla_blas::vector<float>>(session, r_vector_id);

left_vector->resize(length);
right_vector->resize(length);

left_vector->set_block_size(block_size);
right_vector->set_block_size(block_size);

fill_vector(*left_vector, length);
fill_vector(*right_vector, length);
}

/// Measured step: a single scheduler.sdot call on the two prepared vectors.
/// Whatever sdot returns is not used here; only the elapsed time matters to
/// the benchmark driver.
void benchmark_vv::proc() {
    auto &lhs = *left_vector;
    auto &rhs = *right_vector;

    scheduler.sdot(lhs, rhs);
}

/// Calls clear_all() on the left and then the right vector. The handles must
/// have been created by setup() beforehand.
void benchmark_vv::teardown() {
    const auto wipe = [](auto &handle) { handle->clear_all(); };

    wipe(left_vector);
    wipe(right_vector);
}

void benchmark_vv::destroy() {
scylla_blas::vector<float>::drop(session, l_vector_id);
scylla_blas::vector<float>::drop(session, r_vector_id);
}

/// Invokes callable(args...) once and reports how long the call took.
///
/// @param callable anything invocable with args...
/// @param args     arguments handed to the callable (taken by value)
/// @return elapsed wall time of the call, in milliseconds (fractional)
template<typename F, typename... Args>
double measure_time(F callable, Args... args) {
    // steady_clock is guaranteed monotonic, so the measured interval can never
    // be distorted (or turn negative) by system clock adjustments;
    // high_resolution_clock gives no such guarantee.
    using clock = std::chrono::steady_clock;

    const auto started = clock::now();
    callable(args...);
    const auto finished = clock::now();

    const std::chrono::duration<double, std::milli> elapsed = finished - started;
    return elapsed.count();
}

/// Drives a complete benchmark session for one tester.
///
/// init() is timed once; then, for every (block size, problem size) pair --
/// block sizes in the outer loop -- setup(), proc() and, when autoclean is
/// set, teardown() are timed and recorded. Finally destroy() is timed when
/// autoclean is set. Skipped phases are reported as 0.
///
/// @param tester        benchmark to run (owned by this function)
/// @param block_sizes   block sizes to iterate over
/// @param problem_sizes problem sizes to iterate over for each block size
/// @param autoclean     whether teardown() and destroy() are executed
/// @return all collected timings, in milliseconds
benchmark_result perform_benchmark(std::unique_ptr<base_benchmark> tester,
                                   const std::vector<int64_t> &block_sizes,
                                   const std::vector<int64_t> &problem_sizes,
                                   bool autoclean) {
    benchmark_result summary{};

    LogInfo("Starting initialization... ");
    summary.init_time = measure_time([&tester] { tester->init(); });
    LogInfo("Initialization took {}ms", summary.init_time);

    for (const int64_t block_size : block_sizes) {
        for (const int64_t problem_size : problem_sizes) {
            benchmark_result::result_t timing{};
            LogInfo("Block size: {}, problem size: {}", block_size, problem_size);

            LogInfo("\tStarting setup");
            timing.setup_time = measure_time([&] { tester->setup(block_size, problem_size); });
            LogInfo("\tSetup took {}ms", timing.setup_time);

            LogInfo("\tStarting procedure");
            timing.proc_time = measure_time([&tester] { tester->proc(); });
            LogInfo("\tProcedure took {}ms", timing.proc_time);

            if (!autoclean) {
                timing.teardown_time = 0;
                LogDebug("\tAutoclean off: skipping teardown");
            } else {
                LogInfo("\tStarting teardown");
                timing.teardown_time = measure_time([&tester] { tester->teardown(); });
                LogInfo("\tTeardown took {}ms", timing.teardown_time);
            }

            summary.tests.emplace_back(block_size, problem_size, timing);
        }
    }

    if (!autoclean) {
        summary.destroy_time = 0;
        LogDebug("\tAutoclean off: skipping destroy");
    } else {
        LogInfo("Starting destroy");
        summary.destroy_time = measure_time([&tester] { tester->destroy(); });
        LogInfo("Destroy took {}ms\n", summary.destroy_time);
    }

    return summary;
}
119 changes: 119 additions & 0 deletions benchmark/benchmark.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#pragma once
#include <map>

#include <scmd.hh>

#include "scylla_blas/matrix.hh"
#include "scylla_blas/vector.hh"
#include "scylla_blas/routines.hh"

#include "sparse_matrix_value_generator.hh"
#include "random_value_factory.hh"

#include "const.hh"

/// Fills `vector` with values drawn from `gen`, one segment at a time.
///
/// Exactly vector.get_length() values are pulled from the generator. They are
/// grouped into segments of at most vector.get_block_size() entries; segment
/// numbers and in-segment indices both start at 1. Each completed segment is
/// written with vector.insert_segment().
///
/// @param gen    source of values; next() is called once per element
/// @param vector destination; its length and block size must already be set
template<class T>
void load_vector_from_generator(value_factory<T> &gen, scylla_blas::vector<T> &vector) {
    // Read the (loop-invariant) dimensions once instead of on every iteration.
    // NOTE(review): assumes both getters return scylla_blas::index_t -- confirm.
    const scylla_blas::index_t length = vector.get_length();
    const scylla_blas::index_t block_size = vector.get_block_size();

    scylla_blas::vector_segment<T> next_segment;
    scylla_blas::index_t in_segment_index = 1;
    scylla_blas::index_t segment_number = 1;

    LogDebug("Filling vector with length {} and block size {}", length, block_size);
    for (scylla_blas::index_t i = 0; i < length; i++) {
        if (in_segment_index > block_size) {
            // Current segment is full: flush it and start the next one.
            vector.insert_segment(segment_number, next_segment);
            next_segment.clear();

            in_segment_index = 1;
            segment_number++;
        }
        T next_val = gen.next();
        next_segment.emplace_back(in_segment_index, next_val);
        in_segment_index++;
    }

    // Flush the trailing (possibly partial) segment, if anything was produced.
    if (in_segment_index != 1) {
        vector.insert_segment(segment_number, next_segment);
    }

    LogInfo("Loaded a vector {} from a generator", vector.get_id());
}

/// Fills `v` from a random_value_factory<T> constructed with (0, 9,
/// RANDOM_SEED) by delegating to load_vector_from_generator.
/// NOTE(review): 0 and 9 are presumably the bounds of the generated values --
/// confirm against random_value_factory.hh.
///
/// @param v      vector to fill
/// @param length not read by this function
template<typename T>
void fill_vector(scylla_blas::vector<T> &v, scylla_blas::index_t length) {
    random_value_factory<T> random_values(0, 9, RANDOM_SEED);
    value_factory<T> &generator = random_values;
    load_vector_from_generator(generator, v);
}

/// Timings collected by one benchmark session. All values are milliseconds.
struct benchmark_result {
    /// Timings of a single (block size, problem size) run.
    /// A named struct with zero defaults, so a default-initialized value never
    /// carries indeterminate doubles.
    struct result_t {
        double setup_time = 0.0;
        double proc_time = 0.0;
        double teardown_time = 0.0;
    };

    double init_time = 0.0;     ///< duration of the one-off init phase
    double destroy_time = 0.0;  ///< duration of the one-off destroy phase (0 if skipped)

    /// One entry per run: (block size, problem size, timings).
    std::vector<std::tuple<int64_t, int64_t, result_t>> tests;
};

/// Common interface and shared state of all benchmarks.
///
/// The phases are driven by perform_benchmark: init() once, then
/// setup()/proc()/teardown() per configuration, then destroy().
class base_benchmark {
protected:
    std::shared_ptr<scmd::session> session;
    scylla_blas::routine_scheduler scheduler;
    /// Value handed to scheduler.srmgen when generating matrices.
    /// Defaults to 0 so it is never read uninitialized; callers are expected
    /// to set the real value through set_matrix_load() before setup().
    double matrix_load = 0.0;

public:
    explicit base_benchmark(const std::shared_ptr<scmd::session> &session) : session(session), scheduler(session) {}

    /// Benchmarks are owned and destroyed through std::unique_ptr<base_benchmark>,
    /// which requires a virtual destructor for the derived parts to be released.
    virtual ~base_benchmark() = default;

    /// One-off preparation executed before any run.
    virtual void init() = 0;
    /// Per-run preparation for the given block size and problem size.
    virtual void setup(int64_t block_size, int64_t length) = 0;
    /// The operation being benchmarked.
    virtual void proc() = 0;
    /// Per-run cleanup.
    virtual void teardown() = 0;
    /// One-off cleanup executed after all runs.
    virtual void destroy() = 0;

    /// Forwards the limit to scheduler.set_max_used_workers().
    void set_max_workers(int64_t new_max_workers) {
        scheduler.set_max_used_workers(new_max_workers);
    }

    /// Stores the load value used by matrix-generating setups.
    void set_matrix_load(double load) {
        this->matrix_load = load;
    }
};

class benchmark_mm : public base_benchmark {
std::unique_ptr<scylla_blas::matrix<float>> left_matrix;
std::unique_ptr<scylla_blas::matrix<float>> right_matrix;
std::unique_ptr<scylla_blas::matrix<float>> result_matrix;
public:
explicit benchmark_mm(const std::shared_ptr<scmd::session> &session) : base_benchmark(session) {}
void init() override;
void setup(int64_t block_size, int64_t length) override;
void proc() override;
void teardown() override;
void destroy() override;
};

class benchmark_mv : public base_benchmark {
std::unique_ptr<scylla_blas::matrix<float>> left_matrix;
std::unique_ptr<scylla_blas::vector<float>> right_vector;
std::unique_ptr<scylla_blas::vector<float>> result_vector;
public:
explicit benchmark_mv(const std::shared_ptr<scmd::session> &session) : base_benchmark(session) {}
void init() override;
void setup(int64_t block_size, int64_t length) override;
void proc() override;
void teardown() override;
void destroy() override;
};

class benchmark_vv : public base_benchmark {
std::unique_ptr<scylla_blas::vector<float>> left_vector;
std::unique_ptr<scylla_blas::vector<float>> right_vector;
public:
explicit benchmark_vv(const std::shared_ptr<scmd::session> &session) : base_benchmark(session) {}
void init() override;
void setup(int64_t block_size, int64_t length) override;
void proc() override;
void teardown() override;
void destroy() override;
};

/// Runs `tester` over every (block size, problem size) combination and
/// returns the measured timings in milliseconds.
///
/// init() is timed once up front; for each combination setup() and proc() are
/// timed, and teardown() as well when `autoclean` is true. destroy() is timed
/// at the end only when `autoclean` is true; skipped phases are reported as 0.
///
/// @param tester        benchmark to run; ownership is taken by the function
/// @param block_sizes   block sizes to iterate over (outer loop)
/// @param problem_sizes problem sizes to iterate over (inner loop)
/// @param autoclean     whether teardown() and destroy() are executed
benchmark_result perform_benchmark(std::unique_ptr<base_benchmark> tester,
const std::vector<int64_t> &block_sizes,
const std::vector<int64_t> &problem_sizes,
bool autoclean);
Loading

0 comments on commit 823fe40

Please sign in to comment.