Skip to content

Commit

Permalink
Direct IO capability for RocksDB
Browse files Browse the repository at this point in the history
Summary:
This patch adds direct IO capability to RocksDB Env.

The direct IO capability is required for the persistent cache, since NVM is best
accessed using 4K direct IO. SSDs can also leverage direct IO for reading.

Direct IO requires the offset and size to be sector-size aligned, and the memory
to be kernel-page aligned. Since neither the RocksDB nor the persistent read cache
data layout is aligned to sector size, the read path accommodates unaligned IO
sizes (or unaligned memory) at the cost of an alloc/copy.

The write code path expects the size and memory to be aligned.

Test Plan: Run RocksDB unit tests

Reviewers: sdong

Subscribers: andrewkr, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D57393
  • Loading branch information
krad committed May 23, 2016
1 parent 8f12145 commit f89caa1
Show file tree
Hide file tree
Showing 5 changed files with 612 additions and 254 deletions.
6 changes: 6 additions & 0 deletions include/rocksdb/env.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,12 @@ struct EnvOptions {
// If true, then use mmap to write data
bool use_mmap_writes = true;

// If true, then use O_DIRECT for reading data
bool use_direct_reads = false;

// If true, then use O_DIRECT for writing data
bool use_direct_writes = false;

// If false, fallocate() calls are bypassed
bool allow_fallocate = true;

Expand Down
38 changes: 38 additions & 0 deletions util/env_posix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,17 @@ class PosixEnv : public Env {
if (f == nullptr) {
*result = nullptr;
return IOError(fname, errno);
} else if (options.use_direct_reads && !options.use_mmap_writes) {
int flags = O_RDONLY | O_DIRECT;
TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags);
int fd = open(fname.c_str(), flags, 0644);
if (fd < 0) {
return IOError(fname, errno);
}
std::unique_ptr<PosixDirectIOSequentialFile> file(
new PosixDirectIOSequentialFile(fname, fd));
*result = std::move(file);
return Status::OK();
} else {
int fd = fileno(f);
SetFD_CLOEXEC(fd, &options);
Expand Down Expand Up @@ -189,6 +200,18 @@ class PosixEnv : public Env {
}
}
close(fd);
} else if (options.use_direct_reads) {
int flags = O_RDONLY | O_DIRECT;
TEST_SYNC_POINT_CALLBACK("NewRandomAccessFile:O_DIRECT", &flags);
fd = open(fname.c_str(), flags, 0644);
if (fd < 0) {
s = IOError(fname, errno);
} else {
std::unique_ptr<PosixDirectIORandomAccessFile> file(
new PosixDirectIORandomAccessFile(fname, fd));
*result = std::move(file);
s = Status::OK();
}
} else {
result->reset(new PosixRandomAccessFile(fname, fd, options));
}
Expand Down Expand Up @@ -221,6 +244,18 @@ class PosixEnv : public Env {
}
if (options.use_mmap_writes && !forceMmapOff) {
result->reset(new PosixMmapFile(fname, fd, page_size_, options));
} else if (options.use_direct_writes) {
int flags = O_WRONLY | O_APPEND | O_TRUNC | O_CREAT | O_DIRECT;
TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
fd = open(fname.c_str(), flags, 0644);
if (fd < 0) {
s = IOError(fname, errno);
} else {
std::unique_ptr<PosixDirectIOWritableFile> file(
new PosixDirectIOWritableFile(fname, fd));
*result = std::move(file);
s = Status::OK();
}
} else {
// disable mmap writes
EnvOptions no_mmap_writes_options = options;
Expand Down Expand Up @@ -763,6 +798,9 @@ std::string Env::GenerateUniqueId() {
return uuid2;
}

//
// Default Posix Env
//
Env* Env::Default() {
// The following function call initializes the singletons of ThreadLocalPtr
// right before the static default_env. This guarantees default_env will
Expand Down
Loading

0 comments on commit f89caa1

Please sign in to comment.