Skip to content

Commit

Permalink
compaction: introduce compaction_options parameter (WIP)
Browse files Browse the repository at this point in the history
  • Loading branch information
umegane committed Feb 18, 2025
1 parent 8a5de59 commit 73ea9e7
Show file tree
Hide file tree
Showing 8 changed files with 163 additions and 156 deletions.
184 changes: 112 additions & 72 deletions src/limestone/compaction_options.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,76 +14,116 @@
* limitations under the License.
*/

#pragma once
#pragma once

#include <optional>
#include <set>
#include <string>
#include <functional> // std::reference_wrapper
#include <boost/filesystem.hpp>
#include "blob_file_gc_snapshot.h"
#include "limestone/api/write_version_type.h"

namespace limestone::internal {

class compaction_options {
public:
// Constructor: No file set, GC disabled
compaction_options(
const boost::filesystem::path& from,
const boost::filesystem::path& to,
int workers
) : from_dir_(from), to_dir_(to), num_worker_(workers),
file_names_(), use_file_set_(false),
gc_snapshot_(std::nullopt) // GC is disabled
{}

// Constructor: File set available, GC disabled
compaction_options(
const boost::filesystem::path& from,
const boost::filesystem::path& to,
int workers,
std::set<std::string> file_names
) : from_dir_(from), to_dir_(to), num_worker_(workers),
file_names_(file_names), use_file_set_(true),
gc_snapshot_(std::nullopt) // GC is disabled
{}

// Constructor: File set available, GC enabled
// Note: The blob_file_gc_snapshot is provided externally by reference.
compaction_options(
const boost::filesystem::path& from,
const boost::filesystem::path& to,
int workers,
std::set<std::string> file_names,
blob_file_gc_snapshot& gc_snapshot
) : from_dir_(from), to_dir_(to), num_worker_(workers),
file_names_(file_names), use_file_set_(true),
gc_snapshot_(std::ref(gc_snapshot))
{}

// Getters
const boost::filesystem::path& get_from_dir() const { return from_dir_; }
const boost::filesystem::path& get_to_dir() const { return to_dir_; }
int get_num_worker() const { return num_worker_; }

const std::set<std::string>& get_file_names() const { return file_names_; }
bool is_using_file_set() const { return use_file_set_; }

bool is_gc_enabled() const { return gc_snapshot_.has_value(); }
const blob_file_gc_snapshot& get_gc_snapshot() const { return gc_snapshot_.value().get(); }

private:
// Basic compaction settings
boost::filesystem::path from_dir_;
boost::filesystem::path to_dir_;
int num_worker_;

std::set<std::string> file_names_;
bool use_file_set_;

// Garbage collection settings
std::optional<std::reference_wrapper<blob_file_gc_snapshot>> gc_snapshot_;
};

} // namespace limestone::internal
#include <set>
#include <string>
#include <functional> // std::reference_wrapper
#include <boost/filesystem.hpp>
#include "blob_file_gc_snapshot.h"
#include "limestone/api/write_version_type.h"

namespace limestone::internal {

class compaction_options {
public:
// Constructor: Pre-compaction phase where to_dir is not provided.
// File set is provided and GC is disabled.
// In this constructor, "/not_exists_dir" is set for to_dir to indicate an error if used.
compaction_options(
const boost::filesystem::path& from,

Check warning on line 34 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-pass-by-value

pass by value and use std::move
int workers,
std::set<std::string> file_names

Check warning on line 36 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-pass-by-value

pass by value and use std::move
)
: from_dir_(from),
to_dir_("/not_exists_dir"),
num_worker_(workers),
file_names_(file_names),

Check warning on line 41 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

performance-unnecessary-value-param

parameter 'file_names' is passed by value and only copied once; consider moving it to avoid unnecessary copies
has_file_set_(true),
gc_snapshot_(std::nullopt)
{}

// Constructor: to_dir provided, GC disabled, no file set.
compaction_options(
const boost::filesystem::path& from,

Check warning on line 48 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-pass-by-value

pass by value and use std::move
const boost::filesystem::path& to,

Check warning on line 49 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-pass-by-value

pass by value and use std::move
int workers
)
: from_dir_(from),
to_dir_(to),
num_worker_(workers),
file_names_(),

Check warning on line 55 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

readability-redundant-member-init

initializer for member 'file_names_' is redundant
has_file_set_(false),
gc_snapshot_(std::nullopt)
{}

// Constructor: to_dir provided, file set available, GC disabled.
compaction_options(
const boost::filesystem::path& from,

Check warning on line 62 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-pass-by-value

pass by value and use std::move
const boost::filesystem::path& to,

Check warning on line 63 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-pass-by-value

pass by value and use std::move
int workers,
std::set<std::string> file_names

Check warning on line 65 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-pass-by-value

pass by value and use std::move
)
: from_dir_(from),
to_dir_(to),
num_worker_(workers),
file_names_(file_names),

Check warning on line 70 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

performance-unnecessary-value-param

parameter 'file_names' is passed by value and only copied once; consider moving it to avoid unnecessary copies
has_file_set_(true),
gc_snapshot_(std::nullopt)
{}

// Constructor: to_dir provided, file set available, GC enabled.
// Note: The blob_file_gc_snapshot is provided externally by reference.
compaction_options(
const boost::filesystem::path& from,

Check warning on line 78 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-pass-by-value

pass by value and use std::move
const boost::filesystem::path& to,

Check warning on line 79 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-pass-by-value

pass by value and use std::move
int workers,
std::set<std::string> file_names,

Check warning on line 81 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-pass-by-value

pass by value and use std::move
blob_file_gc_snapshot& gc_snapshot
)
: from_dir_(from),
to_dir_(to),
num_worker_(workers),
file_names_(file_names),

Check warning on line 87 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

performance-unnecessary-value-param

parameter 'file_names' is passed by value and only copied once; consider moving it to avoid unnecessary copies
has_file_set_(true),
gc_snapshot_(std::ref(gc_snapshot))
{}

// Getter for from_dir
const boost::filesystem::path& get_from_dir() const { return from_dir_; }

Check warning on line 93 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-use-nodiscard

function 'get_from_dir' should be marked [[nodiscard]]

// Getter for to_dir.
const boost::filesystem::path& get_to_dir() const { return to_dir_; }

Check warning on line 96 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-use-nodiscard

function 'get_to_dir' should be marked [[nodiscard]]

// Getter for num_worker.
int get_num_worker() const { return num_worker_; }

Check warning on line 99 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-use-nodiscard

function 'get_num_worker' should be marked [[nodiscard]]

// Getter for file_names.
const std::set<std::string>& get_file_names() const { return file_names_; }

Check warning on line 102 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-use-nodiscard

function 'get_file_names' should be marked [[nodiscard]]

// Returns true if a file set is configured.
bool has_file_set() const { return has_file_set_; }

Check warning on line 105 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-use-nodiscard

function 'has_file_set' should be marked [[nodiscard]]

// Check if GC is enabled.
bool is_gc_enabled() const { return gc_snapshot_.has_value(); }

Check warning on line 108 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-use-nodiscard

function 'is_gc_enabled' should be marked [[nodiscard]]

// Getter for gc_snapshot.
// It is caller's responsibility to ensure GC is enabled before calling.
blob_file_gc_snapshot& get_gc_snapshot() const { return gc_snapshot_.value().get(); }

Check warning on line 112 in src/limestone/compaction_options.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-use-nodiscard

function 'get_gc_snapshot' should be marked [[nodiscard]]

private:
// Basic compaction settings.
boost::filesystem::path from_dir_;
boost::filesystem::path to_dir_;
int num_worker_;

// File set for compaction
std::set<std::string> file_names_;
bool has_file_set_;

// Garbage collection settings.
std::optional<std::reference_wrapper<blob_file_gc_snapshot>> gc_snapshot_;
};

} // namespace limestone::internal

6 changes: 3 additions & 3 deletions src/limestone/datastore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "log_entry.h"
#include "online_compaction.h"
#include "compaction_catalog.h"
#include "compaction_options.h"
#include "blob_file_resolver.h"
#include "blob_pool_impl.h"
#include "blob_file_garbage_collector.h"
Expand Down Expand Up @@ -692,9 +693,8 @@ void datastore::compact_with_online() {
boundary_version_copy = available_boundary_version_;
}
blob_file_gc_snapshot gc_snapshot(boundary_version_copy);

blob_id_type max_blob_id = create_compact_pwal_and_get_max_blob_id(location_, compaction_temp_dir, recover_max_parallelism_, need_compaction_filenames);

compaction_options options{location_, compaction_temp_dir, recover_max_parallelism_, need_compaction_filenames, gc_snapshot};
blob_id_type max_blob_id = create_compact_pwal_and_get_max_blob_id(options);


// handle existing compacted file
Expand Down
31 changes: 13 additions & 18 deletions src/limestone/datastore_snapshot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,17 +109,16 @@ static void insert_twisted_entry(sortdb_wrapper* sortdb, const log_entry& e) {
sortdb->put(db_key, db_value);
}

static std::pair<epoch_id_type, sorting_context> create_sorted_from_wals(
const boost::filesystem::path& from_dir,
int num_worker,
const std::set<std::string>& file_names = std::set<std::string>(),
std::optional<std::reference_wrapper<blob_file_gc_snapshot>> gc_snapshot_ref = std::nullopt) {
static std::pair<epoch_id_type, sorting_context> create_sorted_from_wals(compaction_options options) {
auto from_dir = options.get_from_dir();
auto file_names = options.get_file_names();
auto num_worker = options.get_num_worker();
#if defined SORT_METHOD_PUT_ONLY
sorting_context sctx{std::make_unique<sortdb_wrapper>(from_dir, comp_twisted_key)};
#else
sorting_context sctx{std::make_unique<sortdb_wrapper>(from_dir)};
#endif
dblog_scan logscan = file_names.empty() ? dblog_scan{from_dir} : dblog_scan{from_dir, file_names};
dblog_scan logscan = file_names.empty() ? dblog_scan{from_dir} : dblog_scan{from_dir, options};

epoch_id_type ld_epoch = logscan.last_durable_epoch_in_dir();

Expand All @@ -130,11 +129,11 @@ static std::pair<epoch_id_type, sorting_context> create_sorted_from_wals(
const auto add_entry_to_point = insert_entry_or_update_to_max;
bool works_with_multi_thread = false;
#endif
auto add_entry = [&sctx, &add_entry_to_point, &gc_snapshot_ref](const log_entry& e){
auto add_entry = [&sctx, &add_entry_to_point, &options](const log_entry& e){
switch (e.type()) {
case log_entry::entry_type::normal_with_blob:
if(gc_snapshot_ref) {
gc_snapshot_ref.value().get().sanitize_and_add_entry(e);
if (options.is_gc_enabled()) {
options.get_gc_snapshot().sanitize_and_add_entry(e);
}
add_entry_to_point(sctx.get_sortdb(), e);
break;
Expand Down Expand Up @@ -301,16 +300,11 @@ static void sortdb_foreach(
#endif
}

blob_id_type create_compact_pwal_and_get_max_blob_id(
const boost::filesystem::path& from_dir,
const boost::filesystem::path& to_dir,
int num_worker,
const std::set<std::string>& file_names,
std::optional<std::reference_wrapper<blob_file_gc_snapshot>> gc_snapshot_ref) {

auto [max_appeared_epoch, sctx] = create_sorted_from_wals(from_dir, num_worker, file_names, gc_snapshot_ref);
blob_id_type create_compact_pwal_and_get_max_blob_id(compaction_options options) {

Check warning on line 303 in src/limestone/datastore_snapshot.cpp

View workflow job for this annotation

GitHub Actions / Clang-Tidy

performance-unnecessary-value-param

the parameter 'options' is copied for each invocation but only used as a const reference; consider making it a const reference
auto [max_appeared_epoch, sctx] = create_sorted_from_wals(options);

boost::system::error_code error;
auto to_dir = options.get_to_dir();

Check warning on line 307 in src/limestone/datastore_snapshot.cpp

View workflow job for this annotation

GitHub Actions / Clang-Tidy

performance-unnecessary-copy-initialization

the variable 'to_dir' is copy-constructed from a const reference but is only used as const reference; consider making it a const reference
const bool result_check = boost::filesystem::exists(to_dir, error);
if (!result_check || error) {
const bool result_mkdir = boost::filesystem::create_directory(to_dir, error);
Expand Down Expand Up @@ -438,7 +432,8 @@ snapshot::~snapshot() = default;
blob_id_type datastore::create_snapshot_and_get_max_blob_id() {
const auto& from_dir = location_;
std::set<std::string> file_names = assemble_snapshot_input_filenames(compaction_catalog_, from_dir);
auto [max_appeared_epoch, sctx] = create_sorted_from_wals(from_dir, recover_max_parallelism_, file_names);
compaction_options options(from_dir, recover_max_parallelism_, file_names);
auto [max_appeared_epoch, sctx] = create_sorted_from_wals(options);
epoch_id_switched_.store(max_appeared_epoch);
epoch_id_informed_.store(max_appeared_epoch);

Expand Down
41 changes: 3 additions & 38 deletions src/limestone/dblog_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <limestone/api/datastore.h>
#include "internal.h"
#include "log_entry.h"
#include "compaction_options.h"


namespace limestone::internal {
Expand Down Expand Up @@ -84,33 +85,6 @@ class dblog_scan {

explicit dblog_scan(const boost::filesystem::path& logdir) : dblogdir_(logdir) { rescan_directory_paths(); }

Check warning on line 86 in src/limestone/dblog_scan.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-pass-by-value

pass by value and use std::move

/**
* @brief Constructor that initializes the dblog_scan with the specified log directory.
*
* This constructor uses the initial content of the `logdir` to determine the target files for processing.
* If the contents of `logdir` change during processing, you must call the `rescan_directory_paths()` method
* to update the target files accordingly.
*
* @param logdir The path to the directory containing the files to be processed.
*/
explicit dblog_scan(boost::filesystem::path&& logdir) : dblogdir_(std::move(logdir)) { rescan_directory_paths(); }

/**
* @brief Constructor that initializes the dblog_scan with the specified log directory and file names.
*
* This constructor processes only the files specified in the `file_names` set within the `logdir`.
* It is used when you do not want the target files to change even if the contents of `logdir` are modified after
* determining the target files.
*
* @param logdir The path to the directory containing the files to be processed.
* @param file_names The set of file names within `logdir` to be processed.
*/
explicit dblog_scan(const boost::filesystem::path& logdir, const std::set<std::string>& file_names) : dblogdir_(logdir) {
for (const auto& file_name : file_names) {
path_list_.emplace_back(dblogdir_ / file_name);
}
}

/**
* @brief Constructor that initializes the dblog_scan with the specified log directory and file names.
*
Expand All @@ -121,8 +95,8 @@ class dblog_scan {
* @param logdir The path to the directory containing the files to be processed.
* @param file_names The set of file names within `logdir` to be processed.
*/
explicit dblog_scan(boost::filesystem::path&& logdir, const std::set<std::string>& file_names) : dblogdir_(std::move(logdir)) {
for (const auto& file_name : file_names) {
explicit dblog_scan(const boost::filesystem::path& logdir, compaction_options options) : dblogdir_(logdir) {

Check warning on line 98 in src/limestone/dblog_scan.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

modernize-pass-by-value

pass by value and use std::move

Check warning on line 98 in src/limestone/dblog_scan.h

View workflow job for this annotation

GitHub Actions / Clang-Tidy

performance-unnecessary-value-param

the parameter 'options' is copied for each invocation but only used as a const reference; consider making it a const reference
for (const auto& file_name : options.get_file_names()) {
path_list_.emplace_back(dblogdir_ / file_name);
}
}
Expand Down Expand Up @@ -194,15 +168,6 @@ class dblog_scan {
*/
void rescan_directory_paths();

/**
* @brief Cleans up unnecessary epoch files in the log directory.
*
* This method identifies and removes unnecessary epoch files in the log directory.
* Only the main epoch file is retained, and all rotated epoch files are deleted.
* If the main epoch file does not exist among the identified files, an exception is thrown.
*/
void cleanup_rotated_epoch_files();

private:
boost::filesystem::path dblogdir_;
std::list<boost::filesystem::path> path_list_;
Expand Down
3 changes: 2 additions & 1 deletion src/limestone/dblogutil/dblogutil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,8 @@ void compaction(dblog_scan &ds, std::optional<epoch_id_type> epoch) {
setup_initial_logdir(tmp);

VLOG_LP(log_info) << "making compact pwal file to " << tmp;
create_compact_pwal_and_get_max_blob_id(from_dir, tmp, FLAGS_thread_num, std::set<std::string>{});
compaction_options options{from_dir, tmp, FLAGS_thread_num};
create_compact_pwal_and_get_max_blob_id(options);

// epoch file
VLOG_LP(log_info) << "making compact epoch file to " << tmp;
Expand Down
Loading

0 comments on commit 73ea9e7

Please sign in to comment.